Logging in to GitHub by carrying cookies
# -*- coding: utf-8 -*-
import scrapy


class Git1Spider(scrapy.Spider):
    name = 'git1'
    allowed_domains = ['github.com']
    start_urls = ['https://github.com/QIKAIDESHENG/']

    # Override start_requests so the login cookies are attached to the first request
    def start_requests(self):
        url = self.start_urls[0]
        temp = '_octo=GH1.1.343375846.1592104544; _ga=GA1.2.492559135.1592104764; _gat=1; tz=Asia%2FShanghai; _device_id=201c322d3b8b89b42c2e458c9890507a; has_recent_activity=1; user_session=EYDxg3kELetjNE-FnJybkEI1d53BhtyqyXuWF363HscapAYA; __Host-user_session_same_site=EYDxg3kELetjNE-FnJybkEI1d53BhtyqyXuWF363HscapAYA; logged_in=yes; dotcom_user=QIKAIDESHENG; _gh_sess=ArwonYQL%2FsGIXzmbFwJWBzyVZ3fE8USozF3N08YwZIawA6lbBy1yJ9Lt%2BwNPMZOjM5RCkGgypZJmAoIL3LdjsssNGH0d2SgtF1lBW6HLh5NZoCkX'
        # Turn the raw Cookie header string into a dict; split each pair only on the
        # first '=' and strip the leading space left by splitting on ';'
        cookies = {data.split('=', 1)[0].strip(): data.split('=', 1)[-1] for data in temp.split(';')}
        yield scrapy.Request(
            url=url,
            callback=self.parse,
            cookies=cookies
        )

    def parse(self, response):
        print(response.xpath('/html/head/title'))
A POST request can be sent with scrapy.Request() by specifying the method and body parameters; more commonly, scrapy.FormRequest() is used to send POST requests. A sketch of the scrapy.Request variant follows, then the FormRequest login example.
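Minimal sketch of the scrapy.Request variant; the endpoint and payload below are placeholders, not part of the GitHub example:

import json
import scrapy


class PostDemoSpider(scrapy.Spider):
    name = 'post_demo'
    start_urls = ['https://httpbin.org/post']  # placeholder endpoint for testing POSTs

    def start_requests(self):
        payload = {'key': 'value'}  # hypothetical request data
        yield scrapy.Request(
            url=self.start_urls[0],
            method='POST',                                 # default is GET, so set it explicitly
            headers={'Content-Type': 'application/json'},
            body=json.dumps(payload),                      # body must already be serialized
            callback=self.parse,
        )

    def parse(self, response):
        print(response.text)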
import scrapy


class Git2Spider(scrapy.Spider):
    name = 'git2'
    allowed_domains = ['github.com']
    start_urls = ['https://github.com/login']

    def parse(self, response):
        # Parse the POST data out of the login page response
        token = response.xpath("//input[@name='authenticity_token']/@value").extract_first()
        utf8 = response.xpath("//input[@name='utf8']/@value").extract_first()
        commit = response.xpath("//input[@name='commit']/@value").extract_first()
        post_data = {
            "authenticity_token": token,
            "utf8": utf8,
            "commit": commit,
            "login": "QIKAIDESHENG",
            "password": "***"
        }
        print(post_data)
        # Send the POST request to the login URL
        yield scrapy.FormRequest(
            url='https://github.com/session',
            callback=self.after_login,
            formdata=post_data
        )

    def after_login(self, response):
        yield scrapy.Request('https://github.com/QIKAIDESHENG/', callback=self.check_login)

    def check_login(self, response):
        print(response.xpath('/html/head/title').extract_first())
Setting COOKIES_DEBUG = True in settings.py shows how cookies are passed along in the terminal output.
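For example, in settings.py:

# Log every cookie sent in requests and received in responses
COOKIES_DEBUG = True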
import json
from pymongo import MongoClient


class WangyiPipeline:
    def open_spider(self, spider):
        if spider.name == 'job':
            self.file = open('wangyi.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        if spider.name == 'job':
            item = dict(item)
            str_data = json.dumps(item, ensure_ascii=False) + ',\n'
            self.file.write(str_data)
        # Always return the item so the later pipelines receive it
        return item

    def close_spider(self, spider):
        if spider.name == 'job':
            self.file.close()


class Wangyi2Pipeline:
    def open_spider(self, spider):
        if spider.name == 'job2':
            self.file = open('wangyi2.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        if spider.name == 'job2':
            item = dict(item)
            str_data = json.dumps(item, ensure_ascii=False) + ',\n'
            self.file.write(str_data)
        return item

    def close_spider(self, spider):
        if spider.name == 'job2':
            self.file.close()


class MongoPipeline:
    def open_spider(self, spider):
        self.client = MongoClient('127.0.0.1', 27017)  # instantiate MongoClient
        self.db = self.client['wahhh']
        self.col = self.db['wangyi']

    def process_item(self, item, spider):
        data = dict(item)
        self.col.insert_one(data)  # insert() is deprecated; insert_one() is the current API
        return item

    def close_spider(self, spider):
        self.client.close()  # close the MongoDB connection (there is no file handle here)
Enable the pipelines in settings.py:
ITEM_PIPELINES = {
    'wangyi.pipelines.WangyiPipeline': 300,
    'wangyi.pipelines.Wangyi2Pipeline': 301,
    'wangyi.pipelines.MongoPipeline': 302,  # give each pipeline its own priority value
}
1. Define the middleware class in middlewares.py
2. In the middleware class, override the request- or response-processing method (process_request / process_response)
3. Enable the middleware in the settings file

middlewares.py
import random

from Douban.settings import USER_AGENT_LIST


class RandomUserAgent:
    def process_request(self, request, spider):
        # print(request.headers)
        # Pick a random User-Agent for every outgoing request
        ua = random.choice(USER_AGENT_LIST)
        request.headers['User-Agent'] = ua
settings.py
USER_AGENT_LIST = [
    "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0",
]

DOWNLOADER_MIDDLEWARES = {
    'Douban.middlewares.RandomUserAgent': 543,
}
import base64
import random

from Douban.settings import PROXY_LIST


class RandomProxy(object):
    def process_request(self, request, spider):
        proxy = random.choice(PROXY_LIST)
        print(proxy)
        # Paid proxies require authentication
        if 'user_passwd' in proxy:
            # base64-encode the "user:password" string (b64encode expects bytes)
            b64_up = base64.b64encode(proxy['user_passwd'].encode())
            # Set the authentication header
            request.headers['Proxy-Authorization'] = 'Basic ' + b64_up.decode()
            # Set the proxy
            request.meta['proxy'] = proxy['ip_port']
        else:
            request.meta['proxy'] = proxy['ip_port']
settings.py
PROXY_LIST = [
    {"ip_port": "110.243.10.133:9999"},
    {"ip_port": "58.253.155.194:9999"},
    {"ip_port": "122.5.177.122:9999"},
    {"ip_port": "118.25.35.202:9999"}
]

# Obey robots.txt rules
# ROBOTSTXT_OBEY = True  # usually turned off when crawling

DOWNLOADER_MIDDLEWARES = {
    'Douban.middlewares.RandomProxy': 543,
}
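RandomProxy checks for a 'user_passwd' key, so an authenticated (paid) proxy entry would look like the first entry below; the address and credentials are placeholders:

PROXY_LIST = [
    {"ip_port": "1.2.3.4:8888", "user_passwd": "username:password"},  # hypothetical paid proxy with auth
    {"ip_port": "110.243.10.133:9999"},                               # free proxy, no auth
]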
The third URL requires waiting for the page to render and finish loading before its data can be scraped.
middlewares.py  # middleware for pages that need to render before scraping
import time

from selenium import webdriver
from scrapy.http import HtmlResponse


class SeleniumMiddleware(object):
    def process_request(self, request, spider):
        url = request.url
        if 'daydata' in url:
            driver = webdriver.Chrome()
            driver.get(url)
            time.sleep(3)              # wait for the page to render
            data = driver.page_source  # page source after rendering
            driver.quit()              # shut the browser down
            # Build a response object from the rendered source and return it,
            # so the request never reaches the default downloader
            res = HtmlResponse(url=url, body=data, encoding='utf-8', request=request)
            return res
settings.py
DOWNLOADER_MIDDLEWARES = {
    # 'AQI.middlewares.MyCustomDownloaderMiddleware': 543,
    'AQI.middlewares.SeleniumMiddleware': 543,
}
git clone https://github.com/rolando/scrapy-redis.git
1. Add the Redis address to settings.py so the program can use Redis
REDIS_URL = "redis://127.0.0.1:6379"
# or use the following form
# REDIS_HOST = "127.0.0.1"
# REDIS_PORT = 6379
2. After a run, three new keys appear in Redis: dmoz:requests (the scheduler's request queue), dmoz:dupefilter (the fingerprint set used for de-duplication), and dmoz:items (the items stored by RedisPipeline)
3. Kill the process and run the dmoz spider again
The program picks up where the previous run left off; the dmoz spider is an incremental crawler based on URL de-duplication.
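A quick way to inspect those keys from Python, assuming the redis package is installed and the spider is named dmoz (a sketch):

import redis

# Connect to the same Redis instance configured in settings.py
r = redis.from_url("redis://127.0.0.1:6379")

# The three keys created by scrapy_redis for the dmoz spider
for key in ("dmoz:requests", "dmoz:dupefilter", "dmoz:items"):
    print(key, r.type(key), r.exists(key))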
1. Write a normal spider
   create the project - define the target - create the spider - save the content
2. Convert it into a distributed spider
   2.1 Modify the spider (the changes are marked ----1 to ----5 in book.py below)
       2.1.1 Import the distributed spider class from scrapy_redis
       2.1.2 Inherit from that class
       2.1.3 Comment out start_urls and allowed_domains
       2.1.4 Set redis_key, from which the start URLs are read
       2.1.5 Define __init__ to receive the allowed domains
   2.2 Modify the configuration file
       copy the scrapy_redis configuration parameters
Case study: crawling JD books and rewriting the spider as a distributed one. Note: the price is served from a separate page, and the listing mixes single titles and box sets, both of which must be extracted. This case is from April 2019; by 2020 the site had been simplified so everything can be extracted from a single page, so the 2019 version is kept only to show the analysis approach.
book.py
import scrapy
from JD.items import JdItem
import json
# ----1 Import the distributed spider class
from scrapy_redis.spiders import RedisSpider


# ----2 Inherit from the distributed spider class
class BookSpider(RedisSpider):
    name = 'book'
    # ----3 Comment out start_urls & allowed_domains
    # # the allowed domains
    # allowed_domains = ['jd.com', 'p.3.cn']
    # # the start url
    # start_urls = ['https://book.jd.com/booksort.html']
    # ----4 Set the redis key
    redis_key = 'py21'

    # ----5 Define __init__
    def __init__(self, *args, **kwargs):
        domain = kwargs.pop('domain', '')
        self.allowed_domains = list(filter(None, domain.split(',')))
        super(BookSpider, self).__init__(*args, **kwargs)

    def parse(self, response):
        # Get the nodes of all top-level book categories
        big_node_list = response.xpath('//*[@id="booksort"]/div[2]/dl/dt/a')
        for big_node in big_node_list[:1]:
            big_category = big_node.xpath('./text()').extract_first()
            big_category_link = response.urljoin(big_node.xpath('./@href').extract_first())
            # Get the nodes of all sub-categories
            small_node_list = big_node.xpath('../following-sibling::dd[1]/em/a')
            for small_node in small_node_list[:1]:
                temp = {}
                temp['big_category'] = big_category
                temp['big_category_link'] = big_category_link
                temp['small_category'] = small_node.xpath('./text()').extract_first()
                temp['small_category_link'] = response.urljoin(small_node.xpath('./@href').extract_first())
                # Simulate clicking the sub-category link
                yield scrapy.Request(
                    url=temp['small_category_link'],
                    callback=self.parse_book_list,
                    meta={"py21": temp}
                )

    def parse_book_list(self, response):
        temp = response.meta['py21']
        book_list = response.xpath('//*[@id="plist"]/ul/li/div')
        # print(len(book_list))
        for book in book_list:
            item = JdItem()
            item['big_category'] = temp['big_category']
            item['big_category_link'] = temp['big_category_link']
            item['small_category'] = temp['small_category']
            item['small_category_link'] = temp['small_category_link']
            # Books come as single titles and as box sets; both layouts must be handled
            item['bookname'] = book.xpath('./div[3]/a/em/text()|./div/div[2]/div[2]/div[3]/a/em/text()').extract_first().strip()
            item['author'] = book.xpath('./div[4]/span[1]/span/a/text()|./div/div[2]/div[2]/div[4]/span[1]/span[1]/a/text()').extract_first().strip()
            item['link'] = book.xpath('./div[1]/a/@href|./div/div[2]/div[2]/div[1]/a/@href').extract_first()
            # The price lives on a separate page, so it needs special handling.
            # Get the book's SKU id
            skuid = book.xpath('.//@data-sku').extract_first()
            # skuid = book.xpath('./@data-sku').extract_first()
            # print("skuid:", skuid)
            # Build the price request URL from the SKU id
            pri_url = 'https://p.3.cn/prices/mgets?skuIds=J_' + skuid
            yield scrapy.Request(url=pri_url, callback=self.parse_price, meta={'meta_1': item})

    def parse_price(self, response):
        item = response.meta['meta_1']
        dict_data = json.loads(response.body)
        item['price'] = dict_data[0]['p']
        yield item
Copy the settings from the scrapy-redis example project and adapt them (replace "example" with this project's names):
SPIDER_MODULES = ['JD.spiders']
NEWSPIDER_MODULE = 'JD.spiders'

USER_AGENT = 'scrapy-redis (+https://github.com/rolando/scrapy-redis)'

# Use scrapy_redis's duplicate filter
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# Use scrapy_redis's scheduler, which knows how to talk to the Redis database
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Keep the de-duplication set and the request queue in Redis when the spider finishes
SCHEDULER_PERSIST = True
#SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderPriorityQueue"
#SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderQueue"
#SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderStack"

ITEM_PIPELINES = {
    # 'JD.pipelines.ExamplePipeline': 300,
    # When enabled, this pipeline stores the scraped items in the Redis database
    'scrapy_redis.pipelines.RedisPipeline': 400,
}

# Redis connection
REDIS_URL = "redis://172.16.123.223:6379"

# LOG_LEVEL = 'DEBUG'

# Introduce an artificial delay between requests
DOWNLOAD_DELAY = 1
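With redis_key = 'py21', every spider node blocks until a start URL appears on that Redis list. Assuming each node launches the spider with something like scrapy crawl book -a domain=jd.com,p.3.cn (the -a domain argument feeds the __init__ above), the crawl can be kicked off by pushing the start URL onto the list, e.g. with the redis package (a sketch):

import redis

# Connect to the same Redis instance configured in REDIS_URL
r = redis.from_url("redis://172.16.123.223:6379")

# Push the start URL onto the 'py21' list; idle spider nodes pop their work from it
r.lpush("py21", "https://book.jd.com/booksort.html")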
The 2020 version (everything can now be taken from the listing page, so no distributed rewrite is shown):
import scrapy
from JD.items import JdItem


class BookSpider(scrapy.Spider):
    name = 'book'
    allowed_domains = ['jd.com']
    start_urls = ['https://book.jd.com/booksort.html']

    def parse(self, response):
        # Get the nodes of all top-level book categories
        big_node_list = response.xpath('//*[@id="booksort"]/div[2]/dl/dt/a')
        for big_node in big_node_list[:1]:
            big_category = big_node.xpath('./text()').extract_first()
            big_category_link = response.urljoin(big_node.xpath('./@href').extract_first())
            # Get all sub-categories ----- sibling nodes of the top-level category
            small_node_list = big_node.xpath('../following-sibling::dd[1]/em/a')
            for small_node in small_node_list[:1]:
                temp = {}
                temp['big_category'] = big_category
                temp['big_category_link'] = big_category_link
                temp['small_category'] = small_node.xpath('./text()').extract_first()
                temp['small_category_link'] = response.urljoin(small_node.xpath('./@href').extract_first())
                # print(temp)
                # Simulate clicking the sub-category link
                yield scrapy.Request(
                    url=temp['small_category_link'],
                    callback=self.parse_book_list,
                    meta={"temp": temp}
                )

    def parse_book_list(self, response):
        temp = response.meta['temp']
        book_list = response.xpath('//*[@id="J_goodsList"]/ul/li/div')
        print(len(book_list))
        for book in book_list:
            item = JdItem()
            item['big_category'] = temp['big_category']
            item['big_category_link'] = temp['big_category_link']
            item['small_category'] = temp['small_category']
            item['small_category_link'] = temp['small_category_link']
            item['bookname'] = book.xpath('./div[3]/a/em/text()').extract_first()
            item['price'] = book.xpath('./div[2]/strong/i/text()').extract_first()
            item['link'] = book.xpath('./div[3]/a/@href').extract_first()
            yield item
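For reference, a JdItem definition consistent with the fields assigned above (the project's items.py is not shown, so this is a reconstruction):

import scrapy


class JdItem(scrapy.Item):
    big_category = scrapy.Field()
    big_category_link = scrapy.Field()
    small_category = scrapy.Field()
    small_category_link = scrapy.Field()
    bookname = scrapy.Field()
    author = scrapy.Field()  # only filled by the 2019 version
    price = scrapy.Field()
    link = scrapy.Field()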