SpiderAbstract.php 8.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251
  1. <?php
  2. namespace console\models\spider;
  3. use common\models\Article;
  4. use common\models\ArticleData;
  5. use common\models\Category;
  6. use common\models\Gather;
  7. use common\models\Spider;
  8. use yii\base\Exception;
  9. use Goutte\Client;
  10. use yii\base\BaseObject;
  11. class SpiderAbstract extends BaseObject
  12. {
  /**
   * @var string Name used to look up the Spider record and to build the cache key
   *             "<spiderName>Config"; when left empty, setConfig() derives it from
   *             the lower-cased short class name.
   */
  13. public $spiderName = '';
  /** @var array|null Page URLs accumulated by getPages(). */
  14. private $_url;
  /** @var mixed Crawl configuration populated by setConfig() (from cache or the Spider record). */
  15. protected $config;
  /**
   * Initializes the spider by loading the crawl configuration of the target site.
   *
   * Overrides the Yii object initialization hook; the parent hook is invoked
   * first, as the Yii guide asks of every init() override.
   *
   * @throws Exception when setConfig() cannot find a matching Spider record
   */
  public function init()
  {
      parent::init();
      $this->setConfig();
  }
  /**
   * Returns the crawl configuration currently held by this spider.
   *
   * @return mixed the value stored in $this->config by setConfig()
   */
  public function getConfig()
  {
      $currentConfig = $this->config;

      return $currentConfig;
  }
  /**
   * Loads the crawl configuration for this spider into $this->config.
   *
   * When $spiderName is empty it is derived from the lower-cased class name with
   * the namespace stripped. The configuration is first looked up in the
   * application cache under "<spiderName>Config"; on a cache miss it is rebuilt
   * from the matching Spider record and written back to the cache.
   *
   * @throws Exception when no Spider record matches the spider name
   */
  protected function setConfig()
  {
      if (empty($this->spiderName)) {
          // Strip the namespace: keep only the last segment of the class name.
          $segments = explode('\\', strtolower(get_class($this)));
          $this->spiderName = end($segments);
      }

      $cacheKey = $this->spiderName.'Config';
      $this->config = \Yii::$app->cache->get($cacheKey);
      if ($this->config !== false) {
          // Cache hit: nothing to rebuild.
          return;
      }

      $spider = Spider::find()->where(['name' => $this->spiderName])->one();
      if (empty($spider)) {
          throw new Exception('不存在目标网站');
      }

      // target_category and target_category_url are parallel comma-separated lists:
      // the n-th category name is paired with the n-th URL.
      $names = explode(',', (string) $spider->target_category);
      $urls = explode(',', (string) $spider->target_category_url);
      $category = [];
      foreach ($names as $index => $name) {
          $name = trim($name);
          $url = isset($urls[$index]) ? trim($urls[$index]) : '';
          // Skip blank entries and categories without a matching URL instead of
          // raising an undefined-offset notice or registering an empty crawl target.
          if ($name === '' || $url === '') {
              continue;
          }
          $category[$name] = $url;
      }

      $config = ['category' => $category];
      $copiedAttributes = ['name', 'title', 'domain', 'page_dom', 'list_dom', 'title_dom', 'time_dom', 'content_dom'];
      foreach ($copiedAttributes as $attribute) {
          $config[$attribute] = $spider->$attribute;
      }

      $this->config = $config;
      \Yii::$app->cache->set($cacheKey, $this->config);
  }
  /**
   * Crawl entry point.
   *
   * For every configured category, getPages() resolves the list-page URLs and
   * urls() is then run on each of them to push the article links onto the queue.
   */
  public function process()
  {
      foreach ($this->config['category'] as $categoryName => $categoryUrl) {
          $pageList = $this->getPages($categoryUrl, $categoryName);
          if (!$pageList) {
              // Nothing to crawl for this category.
              continue;
          }

          foreach ($pageList as $pageUrl) {
              $this->urls($categoryName, $pageUrl);
          }
      }
  }
  /**
   * Tells whether an article URL has already been gathered successfully.
   *
   * Looks for a Gather record whose url column equals the MD5 of the trimmed
   * URL and whose res column is 1.
   *
   * @param string $url article URL
   *
   * @return int 1 when such a record exists, 0 otherwise
   */
  public function isGathered($url)
  {
      $urlHash = md5(trim($url));
      $record = Gather::find()
          ->where(['url' => $urlHash, 'res' => 1])
          ->one();

      if ($record) {
          return 1;
      }

      return 0;
  }
  /**
   * Pushes an article URL onto the 'article_spider' Resque queue.
   *
   * The job class is console\models\ArticleJob; all arguments are forwarded to
   * it as the job payload.
   *
   * @param string $category    category the article belongs to
   * @param string $url         article URL
   * @param string $cover       cover image value for the article
   * @param string $className   spider name forwarded to the job
   * @param string $publishTime publish time, empty when unknown
   */
  public function enqueue($category, $url, $cover, $className, $publishTime = '')
  {
      $jobArgs = [
          'category' => $category,
          'url' => $url,
          'className' => $className,
          'publishTime' => $publishTime,
          'cover' => $cover,
      ];

      \Resque::enqueue('article_spider', 'console\models\ArticleJob', $jobArgs);
  }
  /**
   * Collects the list-page URLs of one category of the target site.
   *
   * Without a configured 'page_dom' selector the category URL itself is the only
   * page. Otherwise the category page is fetched and every node matched by the
   * selector contributes one URL: hrefs starting with "/" are prefixed with the
   * configured domain, anything else is appended to the category URL. If the
   * selector matches nothing, the category URL is returned as the single page.
   *
   * @param string $pageUrl  category URL
   * @param string $category category name (only used for error logging)
   *
   * @return array unique page URLs of this category
   */
  protected function getPages($pageUrl, $category)
  {
      // Start from an empty list on every call; otherwise pages collected for a
      // previous category would be returned again under the current one.
      $this->_url = [];

      // No pagination selector: skip the HTTP request, the category URL is the only page.
      if (empty($this->config['page_dom'])) {
          $this->_url[] = $pageUrl;

          return $this->_url;
      }

      $client = new Client();
      $crawler = $client->request('GET', $pageUrl);
      $crawler->filter($this->config['page_dom'])->each(function ($node) use ($pageUrl, $category) {
          try {
              // attr() yields null when the node has no href; normalise to a string.
              $href = trim((string) $node->attr('href'));
              $this->_url[] = strpos($href, '/') === 0
                  ? $this->config['domain'].$href
                  : $pageUrl.$href;
          } catch (\Exception $e) {
              $this->addLog($pageUrl, $category, 0, $e->getMessage());
          }
      });

      // The selector matched nothing (single-page category or stale selector):
      // fall back to the category page rather than handing null to array_unique().
      if (empty($this->_url)) {
          $this->_url[] = $pageUrl;
      }

      return array_values(array_unique($this->_url));
  }
  /**
   * Extracts the article links from one list page and enqueues the new ones.
   *
   * Every node matched by the 'list_dom' selector contributes one link. Links
   * that do not start with "http" are prefixed with the configured domain. When
   * the concrete spider defines getCover(), it is asked for the cover of the
   * node. Links not yet gathered are pushed onto the queue; failures for a
   * single node are logged and do not stop the remaining nodes.
   *
   * @param string $category category name
   * @param string $url      list-page URL
   */
  protected function urls($category, $url)
  {
      $client = new Client();
      $crawler = $client->request('GET', $url);
      $crawler->filter($this->config['list_dom'])->each(function ($node) use ($category, $url) {
          try {
              // attr() yields null when the node has no href; such nodes carry no article link.
              $href = trim((string) $node->attr('href'));
              if ($href === '') {
                  return;
              }

              // Only a link that *starts* with "http" is absolute; a relative path that
              // merely contains "http" somewhere still needs the domain prefix.
              $articleUrl = stripos($href, 'http') === 0 ? $href : $this->config['domain'].$href;

              $cover = method_exists($this, 'getCover') ? $this->getCover($node) : '';

              // Query once and reuse the result for both the console output and the decision.
              $gathered = $this->isGathered($articleUrl);
              echo $gathered;
              if (!$gathered) {
                  $this->enqueue($category, $articleUrl, $cover, $this->config['name']);
              }
          } catch (\Exception $e) {
              $this->addLog($url, $category, 0, $e->getMessage());
          }
      });
  }
  /**
   * Fetches an article page and extracts its title, content and publish time.
   *
   * Title and content are mandatory: when their selectors match nothing the
   * crawler call throws, the error is logged through addLog() and an empty
   * string is returned. The publish time is optional: when the time selector is
   * not configured, matches nothing, or its text cannot be parsed, the current
   * time is used. When the concrete spider defines filterContent(), the
   * extracted HTML is passed through it.
   *
   * @param string $url      article URL
   * @param string $category category name (only used for error logging)
   *
   * @return string JSON object with keys 'title', 'content' and 'time', or '' on failure
   */
  public function getContent($url, $category)
  {
      $client = new Client();
      $crawler = $client->request('GET', $url);
      try {
          $title = trim($crawler->filter($this->config['title_dom'])->text());
          $content = $crawler->filter($this->config['content_dom'])->html();
          if (method_exists($this, 'filterContent')) {
              $content = $this->filterContent($content);
          }

          // A missing time node must not discard the article: fall back to "now".
          $timestamp = false;
          if (!empty($this->config['time_dom'])) {
              $timeNode = $crawler->filter($this->config['time_dom']);
              if ($timeNode->count() > 0) {
                  $timestamp = strtotime(trim($timeNode->text()));
              }
          }
          if ($timestamp === false || $timestamp <= 0) {
              $timestamp = time();
          }

          $encoded = json_encode(['title' => $title, 'content' => $content, 'time' => $timestamp]);
          if ($encoded === false) {
              // e.g. invalid UTF-8 in the page; keep the documented string return type.
              $this->addLog($url, $category, 0, 'json_encode failed: '.json_last_error_msg());

              return '';
          }

          return $encoded;
      } catch (\Exception $e) {
          $this->addLog($url, $category, 0, $e->getMessage());
      }

      return '';
  }
  /**
   * Stores a gathered article in the database.
   *
   * Creates the Article row and its ArticleData row (which shares the article's
   * id) inside one database transaction, so a failure of the second save does
   * not leave an article without content behind.
   *
   * NOTE(review): assumes Article is a yii\db\ActiveRecord whose connection
   * supports transactions — confirm against common\models\Article.
   *
   * @param string $title      article title
   * @param string $content    article body
   * @param int    $publish_at publish time, stored as created_at
   * @param string $category   category name
   * @param string $cover      cover image
   *
   * @return int 1 when both rows were saved, 0 otherwise
   *
   * @throws Exception when the category name cannot be resolved to an id
   */
  public function insert($title, $content, $publish_at, $category = '', $cover = '')
  {
      // Resolve the category (the tag the article was searched under).
      $categoryId = Category::getIdByName($category);
      if (!$categoryId) {
          throw new Exception('该分类不存在');
      }

      $transaction = Article::getDb()->beginTransaction();
      try {
          $article = new Article();
          $article->title = $title;
          $article->status = 1;
          $article->category = $category;
          $article->category_id = $categoryId;
          $article->source = $this->config['domain'];
          $article->cover = $cover;
          $article->created_at = $publish_at;
          $article->user_id = 1;
          $saved = $article->save(false);

          if ($saved) {
              $articleData = new ArticleData();
              $articleData->id = $article->id;
              $articleData->content = $content;
              $saved = $articleData->save(false);
          }

          if ($saved) {
              $transaction->commit();
          } else {
              // Undo the Article row when its content could not be stored.
              $transaction->rollBack();
          }
      } catch (\Exception $e) {
          $transaction->rollBack();
          throw $e;
      } catch (\Throwable $e) {
          $transaction->rollBack();
          throw $e;
      }

      return $saved ? 1 : 0;
  }
  /**
   * Writes a gather log record for a URL.
   *
   * The url column stores the MD5 of the trimmed URL — the same key that
   * isGathered() looks up — while url_org keeps the URL as it was passed in.
   *
   * @param string $url      URL that was processed
   * @param string $category category name
   * @param int    $res      result flag (isGathered() treats 1 as gathered)
   * @param string $result   result details, e.g. an error message
   */
  public function addLog($url, $category, $res, $result)
  {
      $gather = new Gather();
      $gather->name = $this->config['name'];
      $gather->category = $category;
      // Hash exactly like isGathered() does, so surrounding whitespace cannot
      // make a logged URL invisible to the lookup.
      $gather->url = md5(trim($url));
      $gather->url_org = $url;
      $gather->res = $res;
      $gather->result = $result;
      $gather->save(false);
  }
  237. }