Scrapy Crawler Usage Examples

一、Accessing Autohome

  1. Create the crawler project scrapy_carhome

scrapy startproject scrapy_carhome

  2. Create the spider carhome
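
A likely creation command, following the pattern of the later sections (the domain is inferred from allowed_domains in carhome.py below):

scrapy genspider carhome car.autohome.com.cn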

(1)settings.py

BOT_NAME = 'scrapy_carhome'
SPIDER_MODULES = ['scrapy_carhome.spiders']
NEWSPIDER_MODULE = 'scrapy_carhome.spiders'
ROBOTSTXT_OBEY = False
LOG_LEVEL = 'ERROR'

(2)items.py

import scrapy
class ScrapyCarhomeItem(scrapy.Item):
    # no fields defined: the carhome spider only prints results and does not yield items
    pass

(3)pipelines.py

from itemadapter import ItemAdapter
class ScrapyCarhomePipeline:
    # default pass-through pipeline from the project template (not enabled in ITEM_PIPELINES)
    def process_item(self, item, spider):
        return item

(4)carhome.py

import scrapy

class CarhomeSpider(scrapy.Spider):
    name = 'carhome'
    allowed_domains = ['car.autohome.com.cn']
    start_urls = ['https://car.autohome.com.cn/price/brand-15.html']
    
    def parse(self, response):
        print("===================================================")
        # text() selects the text content of the <a> tags
        names = response.xpath('//div[@class="main-class"]/a/text()')
        prices = response.xpath('//div[@class="main-lever"]//span/span/text()')
        for i in range(len(names)):
            name = names[i].extract()
            price = prices[i].extract()
            print(name, price)
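
Run the spider from the project root (the results are only printed, so no pipeline or feed export is needed):

scrapy crawl carhome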

二、Accessing Baidu Translate with a POST request

  1. Create the crawler project scrapy_post_fanyi

scrapy startproject scrapy_post_fanyi

  2. Create the spider fanyi_post

scrapy genspider fanyi_post fanyi.baidu.com

(1)settings.py

BOT_NAME = 'scrapy_post_fanyi'

SPIDER_MODULES = ['scrapy_post_fanyi.spiders']
NEWSPIDER_MODULE = 'scrapy_post_fanyi.spiders'
ROBOTSTXT_OBEY = False 
LOG_LEVEL = 'ERROR'

(2)pipelines.py

import pymongo
import json 
from itemadapter import ItemAdapter
from scrapy.exceptions import DropItem 

class ScrapyPostFanyiPipeline:
    def process_item(self, item, spider):
        return item

class MongoPipeline:
    collection_name = 'scrapy_items'

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    # @classmethod turns this method into a class method; without it, it would be an ordinary instance method
    @classmethod 
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri = crawler.settings.get('MONGO_URI'),
            mongo_db = crawler.settings.get('MONGO_DATABASE', 'items')
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        self.db[self.collection_name].insert_one(ItemAdapter(item).asdict())                
        return item 
    
class DuplicatesPipeline: 

    def __init__(self):
        self.ids_seen = set()

    def process_item(self, item, spider):
        adapter = ItemAdapter(item)
        if adapter['id'] in self.ids_seen:
            raise DropItem(f"Duplicate item found: {item!r}")
        else:
            self.ids_seen.add(adapter['id'])
            return item

class JsonWriterPipeline:
    def open_spider(self, spider):
        self.file = open('items.jl', 'w')

    def close_spider(self, spider):
        self.file.close()

    def process_item(self, item, spider):
        line = json.dumps(ItemAdapter(item).asdict()) + "\n"
        self.file.write(line)
        return item
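
The MongoPipeline above reads MONGO_URI and MONGO_DATABASE from the crawler settings, and none of these pipelines are registered in the settings.py shown in (1). A minimal sketch of the extra settings, assuming a local MongoDB instance (the priority values here are illustrative):

ITEM_PIPELINES = {
    'scrapy_post_fanyi.pipelines.DuplicatesPipeline': 200,
    'scrapy_post_fanyi.pipelines.MongoPipeline': 300,
    'scrapy_post_fanyi.pipelines.JsonWriterPipeline': 400,
}
MONGO_URI = 'mongodb://localhost:27017'
MONGO_DATABASE = 'items'

Note that fanyi_post below only prints the response and does not yield items, so these pipelines serve here as reference code.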

(3)fanyi_post.py

import scrapy
import json 

class FanyiPostSpider(scrapy.Spider):
    name = 'fanyi_post'
    def start_requests(self):
        url = 'http://fanyi.baidu.com/sug'
        data = {'kw': 'final'}
        yield scrapy.FormRequest(url=url, formdata=data, callback=self.parse_second)

    def parse_second(self, response):
        content = response.text 
        print(content)
        obj = json.loads(content)
        print(obj) 

    # a POST request without form data would be meaningless, so start_urls and the default parse() are unused
    # allowed_domains = ['fanyi.baidu.com']
    # start_urls = ['http://fanyi.baidu.com/']
    # def parse(self, response):
        # pass
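
scrapy.FormRequest sends the data dict as an application/x-www-form-urlencoded POST body. Run the spider with:

scrapy crawl fanyi_post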

三、Scraping book information from Dangdang

  1. Create the crawler project scrapy_dangdang

scrapy startproject scrapy_dangdang

  2. Create the spider dangdang

scrapy genspider dangdang category.dangdang.com

(1)settings.py

BOT_NAME = 'scrapy_dangdang'

SPIDER_MODULES = ['scrapy_dangdang.spiders']
NEWSPIDER_MODULE = 'scrapy_dangdang.spiders'
ROBOTSTXT_OBEY = False 

LOG_LEVEL = 'ERROR'
# enable the item pipelines
ITEM_PIPELINES = {
    # multiple pipelines may be registered; priorities range from 1 to 1000, and lower values run first
   'scrapy_dangdang.pipelines.ScrapyDangdangPipeline': 300,
   'scrapy_dangdang.pipelines.DangDangDownloadPipeline': 301
}

(2)pipelines.py

from urllib import request
from itemadapter import ItemAdapter
import uuid

# the pipelines must be enabled in settings.py via ITEM_PIPELINES
class ScrapyDangdangPipeline:
    # runs once before the spider starts
    def open_spider(self, spider):
        self.f = open('book.json', 'w', encoding='utf-8')

    # item is the book object yielded by the spider
    def process_item(self, item, spider):
        self.f.write(str(item))
        return item

    # runs once after the spider finishes
    def close_spider(self, spider):
        self.f.close()


# to enable multiple pipelines, define each pipeline class and register it in settings.py
class DangDangDownloadPipeline:
    def process_item(self, item, spider):
        url = 'https:' + item.get('img')
        # note: urlretrieve does not create directories, so ./books must already exist (see the sketch after this listing)
        filename = './books/' + str(uuid.uuid4()).replace('-', '') + '.jpg'
        request.urlretrieve(url=url, filename=filename)
        return item 
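
urlretrieve does not create missing directories, so the ./books folder must exist before the first image is saved. A minimal safeguard (a sketch, not part of the original listing) is an open_spider method on DangDangDownloadPipeline; only the added method is shown, process_item stays as above:

import os

class DangDangDownloadPipeline:
    def open_spider(self, spider):
        # runs once when the spider starts; creates the image folder if it is missing
        os.makedirs('./books', exist_ok=True)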

(3)items.py

import scrapy

class ScrapyDangdangItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # fields to scrape: image URL, book title, price
    img = scrapy.Field()
    name = scrapy.Field()
    price = scrapy.Field()

(4)dangdang.py

import scrapy
from scrapy_dangdang.items import ScrapyDangdangItem

class DangdangSpider(scrapy.Spider):
    name = 'dangdang'
    #allowed_domains = ['category.dangdang.com']
    base_url = 'http://category.dangdang.com/pg'
    page = 1
    start_urls = ['http://category.dangdang.com/cp01.01.02.00.00.00.html']

    def parse(self, response):
        # the item defines the data structure; the pipelines persist the data
        lis = response.xpath('//ul[@id="component_59"]/li')
        for li in lis:
            # prefer the lazy-load attribute data-original and fall back to src
            img = li.xpath('.//img/@data-original').extract_first()
            if not img:
                img = li.xpath('.//img/@src').extract_first()
                
            name = li.xpath('.//img/@alt').extract_first().strip()
            price = li.xpath('.//p[@class="price"]/span[1]/text()').extract_first()
            book = ScrapyDangdangItem(img=img, name=name, price=price)
            # hand the book item over to the pipelines
            yield book 

        # after the current page is parsed, request the next page
        if self.page < 2:
            self.page = self.page + 1
            url = self.base_url + str(self.page) + '-cp01.01.02.00.00.00.html'
            yield scrapy.Request(url=url, callback=self.parse)
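
Run the crawl from the project root; it writes book.json and saves the cover images under ./books:

scrapy crawl dangdang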
