123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251 |
- <?php
- namespace console\models\spider;
- use common\models\Article;
- use common\models\ArticleData;
- use common\models\Category;
- use common\models\Gather;
- use common\models\Spider;
- use yii\base\Exception;
- use Goutte\Client;
- use yii\base\BaseObject;
/**
 * Base class for site spiders.
 *
 * Loads the scraping configuration of one target site (from cache, falling
 * back to the Spider table) and provides the crawl pipeline built on it:
 * process() walks every configured category, getPages() collects the
 * pagination URLs, urls() pushes each not-yet-gathered article URL onto the
 * Resque queue, getContent() extracts a single article and insert() stores it.
 *
 * Subclasses may optionally define:
 *  - getCover($node): returns a cover value for a list node (used by urls());
 *  - filterContent($content): post-processes extracted HTML (used by getContent()).
 */
class SpiderAbstract extends BaseObject
{
    /**
     * @var string Name of the spider; when empty it defaults to the
     *             lower-cased short class name (see setConfig()).
     */
    public $spiderName = '';

    /**
     * @var string[] Page URLs collected by the most recent getPages() call.
     */
    private $_url = [];

    /**
     * @var array Site configuration with the keys: category, name, title,
     *            domain, page_dom, list_dom, title_dom, time_dom, content_dom.
     */
    protected $config;

    /**
     * Initializes the object by loading the target site's configuration.
     *
     * @throws Exception when no Spider row matches the spider name.
     */
    public function init()
    {
        parent::init();
        $this->setConfig();
    }

    /**
     * Returns the loaded site configuration.
     *
     * @return array
     */
    public function getConfig()
    {
        return $this->config;
    }

    /**
     * Loads the site configuration into $this->config.
     *
     * The cache entry "<spiderName>Config" is used when present; otherwise the
     * configuration is built from the matching Spider row and written back to
     * the cache.
     *
     * @throws Exception when no Spider row matches the spider name.
     */
    protected function setConfig()
    {
        if (empty($this->spiderName)) {
            // Default to the lower-cased class name with its namespace removed.
            $segments = explode('\\', strtolower(get_class($this)));
            $this->spiderName = end($segments);
        }

        $cacheKey = $this->spiderName.'Config';
        $this->config = \Yii::$app->cache->get($cacheKey);
        if ($this->config !== false) {
            return;
        }

        $spider = Spider::find()->where(['name' => $this->spiderName])->one();
        if (empty($spider)) {
            throw new Exception('不存在目标网站');
        }

        // target_category and target_category_url are parallel comma-separated
        // lists. Pair them by position and skip entries that have no name or
        // no URL, so mismatched or empty columns cannot produce undefined
        // offsets or an empty '' => '' category.
        $targetCategory = explode(',', (string) $spider->target_category);
        $targetCategoryUrl = explode(',', (string) $spider->target_category_url);
        $category = [];
        foreach ($targetCategory as $k => $cate) {
            if ($cate === '' || !isset($targetCategoryUrl[$k]) || $targetCategoryUrl[$k] === '') {
                continue;
            }
            $category[$cate] = $targetCategoryUrl[$k];
        }

        $this->config = [
            'category' => $category,
            'name' => $spider->name,
            'title' => $spider->title,
            'domain' => $spider->domain,
            'page_dom' => $spider->page_dom,
            'list_dom' => $spider->list_dom,
            'title_dom' => $spider->title_dom,
            'time_dom' => $spider->time_dom,
            'content_dom' => $spider->content_dom,
        ];
        \Yii::$app->cache->set($cacheKey, $this->config);
    }

    /**
     * Runs the crawl: for every configured category, collects its page URLs
     * via getPages() and hands each page to urls(), which enqueues the
     * article URLs found on it.
     */
    public function process()
    {
        foreach ($this->config['category'] as $category => $url) {
            $pages = $this->getPages($url, $category);
            if ($pages) {
                foreach ($pages as $p) {
                    $this->urls($category, $p);
                }
            }
        }
    }

    /**
     * Tells whether an article URL has already been gathered successfully,
     * i.e. a Gather row exists with the md5 of the trimmed URL and res = 1.
     *
     * @param string $url
     *
     * @return int 1 when already gathered, 0 otherwise
     */
    public function isGathered($url)
    {
        // exists() avoids hydrating a full row just to test for presence.
        $found = Gather::find()
            ->where(['url' => md5(trim($url)), 'res' => 1])
            ->exists();

        return $found ? 1 : 0;
    }

    /**
     * Pushes an article URL onto the "article_spider" Resque queue.
     *
     * @param string $category    category name
     * @param string $url         article URL
     * @param mixed  $cover       cover value for the article
     * @param string $className   value passed to the job as "className"
     * @param string $publishTime publish time, if known
     */
    public function enqueue($category, $url, $cover, $className, $publishTime = '')
    {
        \Resque::enqueue('article_spider', 'console\models\ArticleJob', [
            'category' => $category,
            'url' => $url,
            'className' => $className,
            'publishTime' => $publishTime,
            'cover' => $cover,
        ]);
    }

    /**
     * Collects the pagination URLs of one category.
     *
     * When no page_dom selector is configured the category URL itself is the
     * only page. Otherwise every node matched by page_dom contributes one URL:
     * hrefs starting with "/" are resolved against the configured domain, all
     * others are appended to $pageUrl.
     *
     * @param string $pageUrl  category entry URL
     * @param string $category category name (used for logging only)
     *
     * @return array unique page URLs of this category
     */
    protected function getPages($pageUrl, $category)
    {
        // Start from an empty list on every call: the collector is shared
        // instance state, and without the reset the pages of earlier
        // categories would leak into later ones.
        $this->_url = [];

        // No pagination configured: nothing to fetch, the entry URL is the page.
        if (empty($this->config['page_dom'])) {
            $this->_url[] = $pageUrl;

            return $this->_url;
        }

        $client = new Client();
        $crawler = $client->request('GET', $pageUrl);
        $crawler->filter($this->config['page_dom'])->each(function ($node) use ($pageUrl, $category) {
            if (!$node) {
                return;
            }
            try {
                // Trim once so the prefix test and the concatenation see the same value.
                $href = trim((string) $node->attr('href'));
                $this->_url[] = strpos($href, '/') === 0
                    ? $this->config['domain'].$href
                    : $pageUrl.$href;
            } catch (\Exception $e) {
                $this->addLog($pageUrl, $category, 0, $e->getMessage());
            }
        });

        return array_unique($this->_url);
    }

    /**
     * Enqueues the article URLs found on one list page.
     *
     * Every node matched by list_dom yields an article URL (hrefs that do not
     * start with "http" are prefixed with the configured domain). URLs that
     * are not yet gathered are pushed onto the queue. Failures on a single
     * node are logged through addLog() and do not stop the remaining nodes.
     *
     * @param string $category category name
     * @param string $url      list page URL
     */
    protected function urls($category, $url)
    {
        $client = new Client();
        $crawler = $client->request('GET', $url);
        $crawler->filter($this->config['list_dom'])->each(function ($node) use ($category, $url) {
            if (!$node) {
                return;
            }
            try {
                $href = trim((string) $node->attr('href'));
                // Only a leading "http" marks an absolute URL; a relative path
                // that merely contains "http" still needs the domain prefix.
                $u = strpos($href, 'http') === 0 ? $href : $this->config['domain'].$href;

                $cover = method_exists($this, 'getCover') ? $this->getCover($node) : '';

                // Query once and reuse the result for both the console output
                // and the enqueue decision.
                $gathered = $this->isGathered($u);
                echo $gathered;
                if (!$gathered) {
                    $this->enqueue($category, $u, $cover, $this->config['name']);
                }
            } catch (\Exception $e) {
                $this->addLog($url, $category, 0, $e->getMessage());
            }
        });
    }

    /**
     * Fetches an article page and extracts its title, content and publish time.
     *
     * The content HTML is passed through filterContent() when the subclass
     * defines it. The publish time falls back to the current time when the
     * extracted text cannot be parsed by strtotime(). Extraction errors are
     * logged through addLog().
     *
     * @param string $url      article URL
     * @param string $category category name (used for logging only)
     *
     * @return string JSON object with the keys title, content and time,
     *                or '' when extraction failed
     */
    public function getContent($url, $category)
    {
        $client = new Client();
        $crawler = $client->request('GET', $url);
        try {
            $titleNode = $crawler->filter($this->config['title_dom']);
            $timeNode = $crawler->filter($this->config['time_dom']);
            $contentNode = $crawler->filter($this->config['content_dom']);

            $title = trim($titleNode->text());
            $content = $contentNode->html();
            if (method_exists($this, 'filterContent')) {
                $content = $this->filterContent($content);
            }

            $parsed = strtotime($timeNode->text());
            $time = $parsed > 0 ? $parsed : time();

            return json_encode(['title' => $title, 'content' => $content, 'time' => $time]);
        } catch (\Exception $e) {
            $this->addLog($url, $category, 0, $e->getMessage());
        }

        return '';
    }

    /**
     * Stores an article: one Article row plus one ArticleData row that holds
     * the content under the same id.
     *
     * @param string $title      title
     * @param string $content    content
     * @param int    $publish_at publish time, stored as created_at
     * @param string $category   category name
     * @param string $cover      cover
     *
     * @return int 1 when both rows were saved, 0 otherwise
     *
     * @throws Exception when the category name cannot be resolved to an id.
     */
    public function insert($title, $content, $publish_at, $category = '', $cover = '')
    {
        // Resolve the category name to its id.
        $categoryId = Category::getIdByName($category);
        if (!$categoryId) {
            throw new Exception('该分类不存在');
        }

        $article = new Article();
        $article->title = $title;
        $article->status = 1;
        $article->category = $category;
        $article->category_id = $categoryId;
        $article->source = $this->config['domain'];
        $article->cover = $cover;
        $article->created_at = $publish_at;
        $article->user_id = 1;

        $res = $article->save(false);
        if ($res) {
            $articleData = new ArticleData();
            $articleData->id = $article->id;
            $articleData->content = $content;
            $res = $articleData->save(false);
        }

        return $res ? 1 : 0;
    }

    /**
     * Writes a gather log row for a URL.
     *
     * The url column stores the md5 of the trimmed URL, the same key that
     * isGathered() looks up; url_org keeps the URL as given.
     *
     * @param string $url      URL the entry is about
     * @param string $category category name
     * @param int    $res      value stored in the res column (isGathered() treats 1 as gathered)
     * @param string $result   message stored in the result column
     */
    public function addLog($url, $category, $res, $result)
    {
        $gather = new Gather();
        $gather->name = $this->config['name'];
        $gather->category = $category;
        $gather->url = md5(trim($url));
        $gather->url_org = $url;
        $gather->res = $res;
        $gather->result = $result;
        $gather->save(false);
    }
}
|