[Study Notes] The scrapy crawler framework

Basic usage of scrapy

import os
import re
import scrapy


class QuoteSpider(scrapy.Spider):
    name = 'quote'
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        # Each quote on the page lives in a div with class "quote"
        quotes = response.xpath('//div[@class="quote"]')
        for quote in quotes:
            yield {
                # Strip the surrounding curly quotes from the quote text
                'text': re.findall('“(.+)”', quote.xpath('./span[@class="text"]/text()').extract_first())[0],
                'author': quote.xpath('./span/small[@class="author"]/text()').extract_first()
            }
        # Follow the "Next" pagination link, if any, with the same callback
        next_page = response.xpath('//li[@class="next"]/a/@href').extract_first()
        if next_page:
            yield response.follow(next_page, self.parse)


if __name__ == '__main__':
    print(os.system('scrapy runspider quotes_spider.py -o quote.csv -t csv'))
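scrapy runspider runs a single self-contained spider file without creating a full project; -o quote.csv together with -t csv exports every dict yielded by parse() as a row of quote.csv.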

Example 1

import os
import scrapy


class QianMuSpider(scrapy.Spider):
    name = 'usnews'
    # Domains the spider is allowed to crawl
    allowed_domains = ['www.qianmu.org']
    # Entry URL(s) where the crawl starts
    start_urls = ['http://www.qianmu.org/ranking/1528.htm']

    # Called by the framework after each URL in start_urls has been fetched successfully
    def parse(self, response):
        # Extract the detail-page links; extract() returns a list of strings
        links = response.xpath('//div[@class="rankItem"]//tr[position()>1]/td/a/@href').extract()
        for link in links:
            if not link.startswith('http://www.qianmu.org'):
                continue
            # Ask the framework to follow this link and call the given callback once the request succeeds
            yield response.follow(link, self.parse_university)

    def parse_university(self, response):
        data = dict()
        # When only one value is expected, use extract_first()
        data['name'] = response.xpath('//div[@id="wikiContent"]/h1/text()').extract_first()
        table = response.xpath('//div[@id="wikiContent"]/div[@class="infobox"]/table')
        if table:
            table = table[0]
            col1 = table.xpath('.//td[1]')
            col2 = table.xpath('.//td[2]')
            keys, values = [[self.space_filter_and_join(col.xpath('.//text()').extract())
                             for col in cols] for cols in (col1, col2)]
            if len(keys) == len(values):
                data.update(zip(keys, values))
            # The yielded data is handed to the framework for further processing; with no pipeline it is just logged
            yield data

    def not_empty(self, s):
        # Keep only strings that are non-empty after stripping whitespace
        return s and s.strip()

    def space_filter_and_join(self, li):
        # Drop whitespace-only text nodes and join the rest into one string
        return ''.join(filter(self.not_empty, li))


if __name__ == '__main__':
    print(os.system('scrapy runspider use_scrapy.py'))
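Run like this, the yielded dicts are only printed in the scrapy log; the Item Pipelines section below stores them in MySQL and Redis instead.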

scrapy debugging

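Selectors are easiest to test interactively with scrapy shell, which downloads a page and opens a Python session with the response object already bound. A minimal sketch of such a session, reusing the ranking URL from the example above:

scrapy shell http://www.qianmu.org/ranking/1528.htm

# Inside the shell, the XPath expressions from the spider can be tried line by line
response.xpath('//div[@class="rankItem"]//tr[position()>1]/td/a/@href').extract()[:5]

# fetch() downloads another URL and rebinds the response object to it
fetch('http://www.qianmu.org/ranking/1528.htm')

# view() opens the last downloaded page in a browser to compare it with the selectors
view(response)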

Item Pipelines: saving the data to a database

items.py

import scrapy

class UniversityItem(scrapy.Item):
    name = scrapy.Field()
    rank = scrapy.Field()
    country = scrapy.Field()
    state = scrapy.Field()
    city = scrapy.Field()
    undergraduate_num = scrapy.Field()
    postgraduate_num = scrapy.Field()
    website = scrapy.Field()
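
The spider in Example 1 yields plain dicts; to make use of these declared fields, parse_university would build a UniversityItem instead. A minimal sketch, in which every field assignment except name is an assumption (the Chinese infobox keys have to be mapped to the English field names by hand):

from scrapy_t.items import UniversityItem

class QianMuSpider(scrapy.Spider):
    # ... name, allowed_domains, start_urls and parse() as in Example 1 ...

    def parse_university(self, response):
        item = UniversityItem()
        item['name'] = response.xpath('//div[@id="wikiContent"]/h1/text()').extract_first()
        # Assumed mapping: fill the remaining declared fields (rank, country, state,
        # city, undergraduate_num, postgraduate_num, website) from the infobox rows
        yield item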

pipelines.py

import pymysql
import redis
from scrapy.exceptions import DropItem


class MysqlPipeline:
    def open_spider(self, spider):
        self.conn = pymysql.connect(host='localhost', user='root', password='123456',
                                    port=3306, charset='utf8', database='python')
        self.cur = self.conn.cursor()

    def close_spider(self, spider):
        self.cur.close()
        self.conn.close()

    def process_item(self, item, spider):
        # keys = item.keys()
        # values = [item[k] for k in keys]
        # Build the column list and one %s placeholder per value from the item itself
        keys, values = zip(*item.items())
        sql = "insert into university ({}) values ({})".format(
            ','.join(keys), ','.join(['%s'] * len(keys))
        )
        self.cur.execute(sql, values)
        self.conn.commit()
        return item


class RedisPipeline:
    def open_spider(self, spider):
        self.r = redis.Redis()

    def close_spider(self, spider):
        self.r.close()

    def process_item(self, item, spider):
        # sadd() returns 1 only if the name was not already in the set,
        # so items seen before are dropped instead of being passed on
        if self.r.sadd(spider.name, item['name']):
            return item
        raise DropItem(f'duplicate item: {item["name"]}')

Configure settings.py

ITEM_PIPELINES = {
   'scrapy_t.pipelines.RedisPipeline': 300,
   'scrapy_t.pipelines.MysqlPipeline': 301,
}
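
Lower numbers run earlier, so RedisPipeline (300) drops duplicate universities before MysqlPipeline (301) ever writes to MySQL. scrapy_t is the name of the scrapy project used in these notes.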

Middleware

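Downloader middleware sits between the engine and the downloader and can rewrite every outgoing request or incoming response. As a minimal sketch that is not part of the original notes, a middleware that attaches a random User-Agent to each request could look like this, assuming a custom USER_AGENTS list is defined in settings.py:

# middlewares.py
import random


class RandomUserAgentMiddleware:
    def __init__(self, user_agents):
        self.user_agents = user_agents

    @classmethod
    def from_crawler(cls, crawler):
        # USER_AGENTS is an assumed custom setting holding a list of UA strings
        return cls(crawler.settings.getlist('USER_AGENTS'))

    def process_request(self, request, spider):
        # Set the header and return None so the request keeps flowing through
        # the remaining middlewares and on to the downloader
        request.headers['User-Agent'] = random.choice(self.user_agents)
        return None

Enable it in settings.py the same way as the pipelines:

DOWNLOADER_MIDDLEWARES = {
    'scrapy_t.middlewares.RandomUserAgentMiddleware': 543,
}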
