import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from wxapp.items import WxAppItem


class WxSpider(CrawlSpider):
    name = 'wx'
    allowed_domains = ['wxapp-union.com']
    start_urls = ['http://www.wxapp-union.com/portal.php?mod=list&catid=2&page=1']

    rules = (
        # Follow every paginated list page, but do not parse it directly.
        Rule(LinkExtractor(allow=r'.*mod=list&catid=2&page=\d+'), follow=True),
        # Parse each article detail page; no further links are followed from it.
        Rule(LinkExtractor(allow=r'.*article-.+\.html'), callback='parse_detail', follow=False),
    )

    def parse_detail(self, response):
        # article_url matches the field name declared in WxAppItem and used by the
        # pipeline (the original code called it detail_href, which the item class
        # does not declare, so assigning it would raise KeyError).
        article_url = response.url
        title = response.xpath('//h1[@class="ph"]/text()').get()
        content = response.xpath('//td[@id="article_content"]//text()').getall()
        content = ''.join(c.strip() for c in content).strip()
        pub_time = response.xpath('//p[@class="authors"]/span/text()').get()
        author = response.xpath('//p[@class="authors"]/a/text()').get()
        item = WxAppItem(
            title=title,
            content=content,
            article_url=article_url,
            pub_time=pub_time,
            author=author,
        )
        yield item
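A quick way to confirm that the two Rule patterns really separate list pages from article pages is to run them against sample URLs. The snippet below is only an illustration (the article URL is a made-up example of the article-*.html shape), not part of the project code:

import re

list_pattern = r'.*mod=list&catid=2&page=\d+'
detail_pattern = r'.*article-.+\.html'

# List pages are only followed for pagination.
assert re.match(list_pattern, 'http://www.wxapp-union.com/portal.php?mod=list&catid=2&page=3')
# Article pages are the ones handed to parse_detail.
assert re.match(detail_pattern, 'http://www.wxapp-union.com/article-1234-1.html')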
items:
import scrapy


class WxAppItem(scrapy.Item):
    title = scrapy.Field()
    pub_time = scrapy.Field()
    content = scrapy.Field()
    summary = scrapy.Field()
    article_url = scrapy.Field()
    read_count = scrapy.Field()
    author = scrapy.Field()  # declared so the author value extracted by the spider can be stored
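Scrapy items behave like dicts, which is how the pipeline below reads them; the catch is that only declared Fields may be assigned, so an undeclared key raises KeyError. A tiny illustration, not project code:

item = WxAppItem(title='demo', content='...')
item['read_count'] = 0
print(item.get('summary'))   # None: declared but never populated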
pipeline:
import pymysql
from pymysql import cursors
from twisted.enterprise import adbapi


class WxAppPipeline(object):
    def __init__(self):
        db_params = {
            'host': '127.0.0.1',
            'port': 3306,
            'user': 'root',
            'password': '',
            'database': 'wxapp',
            'charset': 'utf8',
            'cursorclass': cursors.DictCursor,  # use a dict cursor
        }
        # Twisted connection pool, so inserts run in a thread pool
        # instead of blocking the crawl.
        self.db_pool = adbapi.ConnectionPool('pymysql', **db_params)
        self._sql = None

    def process_item(self, item, spider):
        defer = self.db_pool.runInteraction(self.insert_item, item)
        defer.addErrback(self.handle_error, item, spider)
        return item

    def insert_item(self, cursor, item):
        # summary and read_count are not extracted by the spider above,
        # so fall back to None (NULL) rather than raising KeyError.
        cursor.execute(self.sql, (
            item['title'],
            item['content'],
            item.get('summary'),
            item.get('read_count'),
            item['pub_time'],
            item['article_url'],
        ))

    def handle_error(self, error, item, spider):
        print('=' * 10 + 'error' + '=' * 10)
        print(error)

    @property
    def sql(self):
        # Build the statement once and cache it.
        if not self._sql:
            self._sql = """
                INSERT INTO article(id, title, content, summary, read_count, pub_time, article_url)
                VALUES (null, %s, %s, %s, %s, %s, %s)
            """
        return self._sql
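The pipeline only runs if it is registered in the project settings. A minimal sketch of the settings.py entry, assuming the project package is named wxapp and the class lives in wxapp/pipelines.py (adjust the dotted path to your layout):

# settings.py
ITEM_PIPELINES = {
    'wxapp.pipelines.WxAppPipeline': 300,  # 300 is an arbitrary priority
}

The INSERT statement also assumes an article table already exists in the wxapp database with the columns named in the SQL above; otherwise every insert will fail and be reported through handle_error.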