基于 pyspider 的 infoq.com 爬虫

目标

需要多了解一下世界 IT 领域发生了些什么事情,InfoQ 是最好的入口。因此定期爬取 InfoQ 英文版的内容,并展示在自己的网站上。

先贴爬虫效果:
(图 1:基于 pyspider 的 infoq.com 爬虫效果截图)

总体架构

pyspider 负责爬取 InfoQ 的内容(启用了 PhantomJS);爬取到的内容写入 MySQL;前端用 PHP 读取 MySQL 数据库来展示。

脚本(先是 MySQL 建表语句,然后是 pyspider 爬虫脚本)

-- Storage table for crawled articles. The crawler script inserts one row per
-- result into `news_latest`, using the keys of its result dict as column names
-- (url, title, brief, publish_date, content, author, source, tags).
-- NOTE(review): the statement has no terminating ';' -- add one before running
-- it together with other statements.
CREATE TABLE `news_latest` (
  -- Surrogate primary key.
  `id` int(22) NOT NULL AUTO_INCREMENT,
  -- Address of the crawled page.
  `url` varchar(1024) DEFAULT NULL,
  -- Text of the page's <title> element.
  `title` varchar(1024) DEFAULT NULL,
  -- Leading part of the article text (the crawler keeps the first 800 characters).
  `brief` varchar(5192) DEFAULT NULL,
  -- Date text exactly as scraped from the page; stored as a string, not a DATE.
  `publish_date` varchar(64) DEFAULT NULL,
  -- Full article text.
  -- NOTE(review): confirm TEXT is large enough for the longest article bodies.
  `content` text,
  -- Author text scraped from the page; the crawler skips results without it.
  `author` varchar(128) DEFAULT NULL,
  -- Name of the originating site (the crawler writes 'InfoQ.com').
  `source` varchar(32) DEFAULT '',
  -- Not written by the crawler; see the column COMMENT for its purpose.
  `log_id` varchar(32) DEFAULT '' COMMENT 'baidu summary service',
  -- Comma-joined topic labels scraped from the page.
  `tags` varchar(128) DEFAULT NULL,
  -- Row insertion time, filled in by MySQL.
  `gmt_create` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP,
  PRIMARY KEY (`id`)
-- AUTO_INCREMENT=2015 makes the next generated id start at 2015.
) ENGINE=InnoDB AUTO_INCREMENT=2015 DEFAULT CHARSET=utf8mb4
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2020-04-27 19:18:06
# Project: InfoQ

from pyspider.libs.base_handler import *


import MySQLdb
import re

class SQL():
    """Small helper around one MySQLdb connection used to store crawl results.

    Connecting never raises: on failure the error is printed and
    ``self.connection`` stays ``False``, in which case ``insert`` does nothing
    and returns ``False``.

    The code is written so that it runs unchanged on Python 2 and Python 3.
    """

    def __init__(self, hosts='127.0.0.1', username='crawl', password='mypass',
                 database='spiderdb', charsets='utf8mb4'):
        """Open the connection and a cursor.

        Args:
            hosts: MySQL server host.
            username: MySQL user name.
            password: MySQL password.
            database: Schema to use.
            charsets: Connection character set. The default is ``utf8mb4``
                because the ``news_latest`` table is created with
                ``CHARSET=utf8mb4``; MySQL's 3-byte ``utf8`` cannot carry
                4-byte characters such as emoji.

        NOTE(review): the default credentials are hard-coded; consider loading
        them from configuration instead.
        """
        self.connection = False
        try:
            self.conn = MySQLdb.connect(host=hosts, user=username,
                                        passwd=password, db=database,
                                        charset=charsets)
            self.cursor = self.conn.cursor()
            self.cursor.execute("set names " + charsets)
            self.connection = True
        except Exception as e:
            # Best effort by design: report the problem and stay disconnected.
            print("Cannot Connect To Mysql!\n%s" % (e,))

    def escape(self, string):
        """Return ``string`` formatted with ``%s``; no SQL escaping is applied."""
        return '%s' % string

    def close(self):
        """Close the cursor and the connection if they are open.

        Safe to call more than once and safe to call after a failed connect.
        """
        if not self.connection:
            return
        self.connection = False
        try:
            self.cursor.close()
            self.conn.close()
        except Exception as e:
            print("An Error Occured: %s" % (e,))

    def insert(self, tablename, **values):
        """Insert one row and commit.

        Args:
            tablename: Target table. It is interpolated into the statement as
                an identifier, so it must come from trusted code.
            **values: Column name -> value. Column names are interpolated as
                identifiers (trusted input only); the values themselves are
                passed to the driver as query parameters.

        Returns:
            ``True`` when the row was inserted and committed, ``False`` when
            there is no connection or the insert failed.
        """
        if not self.connection:
            return False

        tablename = self.escape(tablename)
        # Build columns and values from the same key order so they line up.
        columns = list(values)
        list_value = [values[name] for name in columns]
        params = ",".join(['%s'] * len(list_value))
        sql_query = "insert into %s(%s) values (%s)" % (
            tablename, ",".join(columns), params)

        print(list_value)
        print(sql_query)

        try:
            self.cursor.execute(sql_query, list_value)
            self.conn.commit()
            return True
        except Exception as e:
            print("An Error Occured: %s" % (e,))
            print(list_value)
            print(sql_query)
            return False

            
class Handler(BaseHandler):
    """pyspider handler that crawls infoq.com and stores articles in MySQL.

    Flow: ``on_start`` fetches the home page -> ``index_page`` follows every
    infoq.com link -> ``detail_page`` queues the article links found on each
    page and records the page itself when it is an article -> ``on_result``
    inserts each recorded article into the ``news_latest`` table.

    The code is written so that it runs unchanged on Python 2 and Python 3.
    """
    crawl_config = {

    }

    def __init__(self):
        """Set the site-specific selectors, URL patterns and limits."""
        self.siteUrl = 'https://www.infoq.com/'
        self.contentSelector = '.article__content'
        self.authorSelector = '.author__link:first'
        self.dateSelector = '.date:first'
        self.source = 'InfoQ.com'
        # The host is anchored to the start of the URL and its dots are
        # escaped, so only infoq.com and its sub-domains match. An unanchored
        # pattern also accepts foreign pages that merely carry an infoq.com
        # address in their query string (e.g. social "share" links).
        self.contentUrlRegx = r'https?://(?:[\w-]+\.)*infoq\.com/(news|podcasts|articles|minibooks)/(.+)'
        self.followUrlRegx = r'https?://(?:[\w-]+\.)*infoq\.com/(.*)'
        self.tagsSelector = '.related__topic'
        # Number of leading characters of the article text kept as "brief".
        self.briefLength = 800

    @every(minutes=3 * 60)
    def on_start(self):
        """Entry point, scheduled every three hours: fetch the home page."""
        self.crawl(self.siteUrl, fetch_type='js', callback=self.index_page,
                   age=3 * 60 * 60, auto_recrawl=True)

    @config(age=3 * 60 * 60)
    def index_page(self, response):
        """Queue every infoq.com link on the home page for ``detail_page``.

        Article URLs are a subset of what ``followUrlRegx`` accepts, so they
        take this path too; ``detail_page`` both records such a page and
        follows the links on it. A separate article-only branch after this
        check could never run and has been removed.
        """
        for each in response.doc('a[href^="http"]').items():
            href = each.attr.href
            if re.match(self.followUrlRegx, href) is None:
                continue
            self.crawl(href,
                       cookies=response.cookies,
                       fetch_type='js',
                       # Scroll to the bottom two seconds after the script runs.
                       js_script="""
                               setTimeout(function() {
                                   window.scrollTo(0,document.body.scrollHeight);
                               }, 2000);
                               """,
                       callback=self.detail_page, itag=href)

    @config(priority=2)
    def detail_page(self, response):
        """Queue article links found on the page, then record the page itself.

        Returns:
            The article dict produced by ``record`` or ``None`` when the page
            is not an article.
        """
        for each in response.doc('a[href^="http"]').items():
            url = self.strip_param(each.attr.href)
            if re.match(self.contentUrlRegx, url) is not None:
                self.crawl(url, cookies=response.cookies,
                           callback=self.record, itag=url)
            else:
                print("pass %s" % (each.attr.href,))
        return self.record(response)

    def on_result(self, result):
        """Insert a recorded article into ``news_latest``.

        Results that are empty or have no author are skipped.

        NOTE(review): overriding ``on_result`` presumably replaces pyspider's
        default result storage -- confirm that this is intended.
        """
        if not result or not result.get('author'):
            return
        sql = SQL()
        try:
            sql.insert('news_latest', **result)
        finally:
            # A connection is opened per result; release it so that a long
            # crawl does not pile up open MySQL connections.
            if sql.connection:
                sql.conn.close()

    def strip_param(self, u):
        """Return ``u`` without its query string (the first '?' and what follows)."""
        q = u.find('?')
        return u[:q] if q > 0 else u

    def record(self, response):
        """Extract the article fields from an article page.

        Returns:
            A dict whose keys are column names of ``news_latest``, or ``None``
            when ``response.url`` is not an article URL.
        """
        if re.match(self.contentUrlRegx, response.url) is None:
            print("url do not match record reg: %s" % (response.url,))
            return None
        # Evaluate the content selector once; it feeds both brief and content.
        content = response.doc(self.contentSelector).text()
        tags = [item.text() for item in response.doc(self.tagsSelector).items()]
        return {
            "url": response.url,
            "title": response.doc('title').text(),
            "brief": content[0:self.briefLength],
            "content": content,
            "publish_date": response.doc(self.dateSelector).text(),
            "author": response.doc(self.authorSelector).text(),
            "source": self.source,
            "tags": ",".join(tags),
        }
        

你可能感兴趣的:(脚本)