Scrapy in detail
Yesterday's review:
1. The Scrapy framework
Five core components:
1. Engine: handles the data flow and the communication between all other components
2. Spider: defines the crawling behaviour and parses the data
3. Scheduler: schedules all outgoing requests
4. Downloader: sends the network requests and returns the response data
5. Pipeline: Item Pipeline --> Item: defines the fields to be persisted
Pipeline: interacts with the database and stores the data in it
Data flow through the Scrapy framework:
spider --> engine --> scheduler --> engine --> downloader --> engine --> spider --> engine --> pipeline --> database
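# A minimal sketch of that flow (the spider name and URL below are illustrative, not part of the lesson):
# the spider yields Requests, which travel engine -> scheduler -> downloader; responses come back to the
# spider's callback, and yielded items travel engine -> item pipeline.
import scrapy


class FlowDemoSpider(scrapy.Spider):
    name = 'flow_demo'  # hypothetical spider, only to illustrate the data flow
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        # yielding an item/dict: spider -> engine -> item pipeline
        yield {'title': response.xpath('//title/text()').extract_first()}
        # yielding a Request: spider -> engine -> scheduler -> engine -> downloader
        next_page = response.xpath('//li[@class="next"]/a/@href').extract_first()
        if next_page:
            yield response.follow(next_page, callback=self.parse)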
# Spider code: building on the previous spider, construct the URLs of the other listing pages and issue new requests with scrapy.Request; the callback is still parse:
    # class attributes on the spider
    page = 1
    base_url = 'http://www.xiaohuar.com/list-1-%s.html'

    # inside parse(), after handling the current page's data:
    if self.page < 4:
        page_url = self.base_url % self.page
        self.page += 1
        yield scrapy.Request(url=page_url, callback=self.parse)
# (the other files need no changes)
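# For context, a hedged sketch of how that snippet sits inside a complete spider (the class and spider names here are assumptions, not the original file):
import scrapy


class XiaohuaSpider(scrapy.Spider):  # assumed name; keep your existing spider
    name = 'xiaohua'
    start_urls = ['http://www.xiaohuar.com/list-1-0.html']
    page = 1
    base_url = 'http://www.xiaohuar.com/list-1-%s.html'

    def parse(self, response):
        # ... parse the current listing page and yield its items here ...
        # then queue the next listing page, reusing parse as the callback
        if self.page < 4:
            page_url = self.base_url % self.page
            self.page += 1
            yield scrapy.Request(url=page_url, callback=self.parse)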
# Requirement: crawl the joke titles and detail-page links, then follow each detail-page link and crawl the joke content
# Item code: define the fields to be persisted
import scrapy


class JokeItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    content = scrapy.Field()
# Spider code:
# -*- coding: utf-8 -*-
import scrapy
from ..items import JokeItem


class XhSpider(scrapy.Spider):
    name = 'xh'
    # allowed_domains = ['www.baidu.com']
    start_urls = ['http://www.jokeji.cn/list.htm']

    def parse(self, response):
        li_list = response.xpath('//div[@class="list_title"]/ul/li')
        for li in li_list:
            title = li.xpath('./b/a/text()').extract_first()
            link = 'http://www.jokeji.cn' + li.xpath('./b/a/@href').extract_first()
            # pass the title to the detail-page callback through request.meta
            yield scrapy.Request(url=link, callback=self.detail_parse, meta={"title": title})

    def detail_parse(self, response):
        joke_list = response.xpath('//span[@id="text110"]//text()').extract()
        title = response.meta["title"]
        content = ''.join(joke_list)
        item = JokeItem()
        item["title"] = title
        item["content"] = content
        yield item
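# Note: on Scrapy 1.7+ the title can also be passed with cb_kwargs instead of meta; a hedged sketch of that variant:
            yield scrapy.Request(url=link, callback=self.detail_parse, cb_kwargs={'title': title})

    def detail_parse(self, response, title):
        # the value from cb_kwargs arrives as a keyword argument
        content = ''.join(response.xpath('//span[@id="text110"]//text()').extract())
        yield JokeItem(title=title, content=content)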
# Pipeline code: the concrete persistence logic
import pymongo


class JokePipeline(object):
    conn = pymongo.MongoClient('localhost', 27017)
    db = conn.haha
    table = db.hahatable

    def process_item(self, item, spider):
        # insert_one replaces the deprecated Collection.insert
        self.table.insert_one(dict(item))
        return item

    def close_spider(self, spider):
        self.conn.close()
# Settings configuration:
UA spoofing (USER_AGENT)
robots.txt protocol (ROBOTSTXT_OBEY)
ITEM_PIPELINES (enable the pipeline)
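# A hedged sketch of the matching settings.py entries (the 'joke' project name is a placeholder):
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'  # UA spoofing
ROBOTSTXT_OBEY = False  # do not honour robots.txt
ITEM_PIPELINES = {
    'joke.pipelines.JokePipeline': 300,
}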
# POST request example: send form data with scrapy.FormRequest
import scrapy
import json


class FySpider(scrapy.Spider):
    name = 'fy'
    # allowed_domains = ['www.baidu.com']
    start_urls = ['https://fanyi.baidu.com/sug']

    def start_requests(self):
        data = {
            'kw': 'boy'
        }
        # FormRequest issues a POST with form-encoded data
        yield scrapy.FormRequest(url=self.start_urls[0], callback=self.parse, formdata=data)

    def parse(self, response):
        # print the raw and the decoded JSON response to verify the POST worked
        print(response.text)
        print(json.loads(response.text))
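# On Scrapy 2.2+ the same callback can use response.json() instead of json.loads(response.text); a hedged one-liner:
    def parse(self, response):
        print(response.json())  # equivalent to json.loads(response.text)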
# Middleware categories:
- Downloader middleware: DownloaderMiddleware
- Spider middleware: SpiderMiddleware
# What middleware is for:
- Downloader middleware: intercepts requests and responses and can modify them
- Spider middleware: intercepts requests, responses and pipeline items; can modify requests/responses and process items
# Main methods of a downloader middleware:
process_request
process_response
process_exception
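# A hedged skeleton of a downloader middleware showing those three hooks (the class name is a placeholder, not from the lesson):
class DemoDownloaderMiddleware(object):

    def process_request(self, request, spider):
        # called for every request before it reaches the downloader;
        # return None to continue, a Response to short-circuit, or a Request to reschedule
        return None

    def process_response(self, request, response, spider):
        # called for every response on its way back to the spider;
        # must return a Response (or a Request to retry)
        return response

    def process_exception(self, request, exception, spider):
        # called when the downloader or process_request raises an exception;
        # return None, a Response, or a new Request
        return None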
# Downloader middleware intercepting requests: proxy IP example
# Spider code:
import scrapy


class DlproxySpider(scrapy.Spider):
    name = 'dlproxy'
    # allowed_domains = ['www.baidu.com']
    start_urls = ['https://www.baidu.com/s?wd=ip']

    def parse(self, response):
        # save the response so we can check which IP Baidu saw
        with open('baiduproxy.html', 'w', encoding='utf-8') as f:
            f.write(response.text)
# DownloaderMiddleware code:
    # inside the project's downloader middleware class:
    def process_request(self, request, spider):
        # route the request through a proxy by setting request.meta['proxy']
        request.meta['proxy'] = 'http://111.231.90.122:8888'
        return None
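# For the proxy middleware to take effect it must be enabled in settings.py; a hedged sketch (the dotted path is a placeholder for your project's middleware class):
DOWNLOADER_MIDDLEWARES = {
    'proxydemo.middlewares.ProxydemoDownloaderMiddleware': 543,
}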
# Spider code (random User-Agent example):
class DlproxySpider(scrapy.Spider):
    name = 'dlproxy'
    # allowed_domains = ['www.baidu.com']
    # start_urls entries are requested with dont_filter=True, so five
    # identical URLs really do produce five requests
    start_urls = ['https://www.baidu.com/'] * 5

    def parse(self, response):
        pass
# Middleware code:
from scrapy import signals
from fake_useragent import UserAgent
import random

# build a pool of 100 random Chrome User-Agent strings up front
ua = UserAgent()
ua_list = []
for i in range(100):
    ua_list.append(ua.chrome)


class RandomUaDownloaderMiddleware(object):  # placeholder name; use your project's downloader middleware class
    ua_pool = ua_list

    def process_request(self, request, spider):
        # request.meta['proxy'] = 'http://111.231.90.122:8888'
        # attach a random User-Agent to every outgoing request
        request.headers['User-Agent'] = random.choice(self.ua_pool)
        return None

    def process_response(self, request, response, spider):
        # print the User-Agent that was actually sent, to verify the rotation works
        print(request.headers["User-Agent"])
        return response