pip install wheel
pip install scrapy
scrapy startproject <project_name>
scrapy startproject mySpider
scrapy genspider <spider_name> <domain_to_crawl>
cd mySpider
scrapy genspider example example.com
Directory structure:
Note: do not put Chinese comments in scrapy.cfg, and comments may only appear at the start of a line.
|—— mySpider
|   |—— mySpider
|   |   |—— spiders           "folder that holds the spiders"
|   |   |   |—— __init__.py
|   |   |   |—— itcast.py     "a spider created with genspider"
|   |   |—— __init__.py
|   |   |—— items.py          "define the models for the items you scrape here"
|   |   |—— middlewares.py    "custom middlewares"
|   |   |—— pipelines.py      "pipelines, used to save the data"
|   |   |—— settings.py       "settings file: User-Agent, enabling pipelines (see the sketch below)"
|   |—— scrapy.cfg            "project configuration file"
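A typical first edit to settings.py, matching the description above; this is a minimal sketch, and the User-Agent string is only an example:

# settings.py
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
ROBOTSTXT_OBEY = False  # whether to respect robots.txt; the project template defaults to True
# ITEM_PIPELINES is enabled further below in the pipelines section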
# -*- coding: utf-8 -*-
import scrapy


class ItcastSpider(scrapy.Spider):
    name = 'itcast'                        # spider name
    allowed_domains = ['XXX.cn']           # domains the spider is allowed to crawl
    start_urls = ['http://www.XXXX.html']  # the URLs requested first

    def parse(self, response):
        # handle the response for the start_urls
        # print(response.status)
        # ret = response.xpath('//div[@class="tea_con"]//h3/text()')
        # print(ret)

        # group the results
        li_list = response.xpath('//div[@class="tea_con"]//li')
        for li in li_list:
            item = {
                'name': li.xpath('.//h3/text()').extract_first(),
                'title': li.xpath('.//h4/text()').extract_first(),
            }
            # print(item)
            # parse() may only yield/return Request, BaseItem, dict or None
            yield item
from pymongo import MongoClient

client = MongoClient()
collection = client['mySpider']['itcast']


class MyspiderPipeline:
    def process_item(self, item, spider):
        item['hello'] = 'world'
        return item


class MyspiderPipeline1:
    def process_item(self, item, spider):
        print(item)
        collection.insert_one(dict(item))  # insert() is deprecated in pymongo; use insert_one()
        return item
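After running the crawl, the stored documents can be checked with a small pymongo script (same database and collection names as configured above; assumes MongoDB is running on the default localhost port):

from pymongo import MongoClient

client = MongoClient()  # localhost:27017 by default
for doc in client['mySpider']['itcast'].find().limit(3):
    print(doc)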
Run the crawl command from inside the project directory.
Syntax: scrapy crawl <spider_name>
scrapy crawl example
In settings.py:
LOG_LEVEL = 'WARNING'
LOG_FILE = './log.log'  # path where the log file is saved
The lower the number, the earlier the item passes through that pipeline in pipelines.py.
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'mySpider.pipelines.MyspiderPipeline': 300,   # items pass through 300 first, then 301
    'mySpider.pipelines.MyspiderPipeline1': 301,
}
import scrapy


class MyspiderItem(scrapy.Item):
    name = scrapy.Field()
    title = scrapy.Field()
    hello = scrapy.Field()
from mySpider.items import MyspiderItem

for li in li_list:
    item = MyspiderItem()
    item['name'] = li.xpath('.//h3/text()').extract_first()   # no trailing comma, or the value becomes a tuple
    item['title'] = li.xpath('.//h4/text()').extract_first()
    yield item
Set the log level and the log file path in settings.py.
In any other file, just import logging:
import logging

logger = logging.getLogger(__name__)  # logger named after the current module
logger.warning("*" * 10)              # emit a warning-level log message
import logging

# configure the log output format
logging.basicConfig(level=logging.INFO,
                    format='[%(asctime)s %(filename)s [line:%(lineno)d]] %(levelname)s %(message)s',
                    datefmt='%a, %d %b %Y %H:%M:%S',
                    filename='myapp.log',
                    filemode='w')
logger = logging.getLogger(__name__)  # logger named after the current module

if __name__ == '__main__':
    logger.info("this is info log ")
    logger.warning("this is warning log ")
    print(logger)
next_page_url = response.xpath('//a[text()="下一页"]/@href').extract_first()
if next_page_url:
    yield scrapy.Request(next_page_url, callback=self.parse)
Syntax:
scrapy.Request( url[, callback=None, method='GET', headers=None, body=None,cookies=None, meta=None, encoding='utf-8', priority=0,dont_filter=False, errback=None, flags=None, cb_kwargs=None])
callback: specifies which parse function the response for this URL is handed to
meta: passes data between different parse functions; meta also carries some information by default, such as the download delay and the request depth
dont_filter: tells Scrapy's deduplication not to filter this URL. Scrapy deduplicates URLs by default, so this matters for URLs that must be requested repeatedly (see the sketch below).
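A minimal sketch of dont_filter, assuming a page whose content changes between visits:

def parse(self, response):
    # re-queue the same page so it is fetched again on the next pass
    # (illustration only; without a stop condition this repeats indefinitely)
    yield scrapy.Request(
        url=response.url,
        callback=self.parse,
        dont_filter=True,   # bypass Scrapy's built-in URL deduplication
    )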
yield scrapy.Request(
    url=item['href'],
    callback=self.parse_detail,
    meta={'item': item}
)

def parse_detail(self, response):  # handle the detail page
    item = response.meta['item']
    item['comment'] = response.xpath('//div[@class="details-box"]/pre/text()').extract()
    item['content_img'] = response.xpath('//div[@class="clear details-img-list Picture-img"]/img/@src').extract()
    yield item
# -*- coding: utf-8 -*-
import scrapy
from yangguang.items import YangguangItem


class TousuSpider(scrapy.Spider):
    name = 'tousu'
    allowed_domains = ['XXX.com']  # if this is wrong, nothing will be crawled
    start_urls = ['http://XXXX']

    def parse(self, response):
        # print(response.text)
        li_list = response.css('li.clear')
        for li in li_list:
            item = YangguangItem()
            item['code'] = li.css('span.state1::text').extract_first()
            item['title'] = li.css('span.state3 a.color-hover::text').extract_first()
            item['publish_date'] = li.css('span.state5::text').extract_first()
            item['href'] = response.urljoin(li.css('span.state3 a.color-hover::attr(href)').extract_first())
            # print('detail page:', item['href'])
            yield scrapy.Request(
                url=item['href'],
                callback=self.parse_detail,
                meta={'item': item}
            )

        # pagination
        next_url = response.xpath('//a[@class="arrow-page prov_rota"]//@href').extract_first()
        if next_url:
            next_url = response.urljoin(next_url)  # build the absolute URL
            # print("next page:", next_url)
            yield scrapy.Request(
                url=next_url,
                callback=self.parse,
            )

    def parse_detail(self, response):  # handle the detail page
        item = response.meta['item']
        item['comment'] = response.xpath('//div[@class="details-box"]/pre/text()').extract()
        item['content_img'] = response.xpath('//div[@class="clear details-img-list Picture-img"]/img/@src').extract()
        yield item
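For the example above to run, yangguang/items.py must declare the fields the spider assigns; a minimal sketch based only on the fields used above:

import scrapy


class YangguangItem(scrapy.Item):
    code = scrapy.Field()
    title = scrapy.Field()
    publish_date = scrapy.Field()
    href = scrapy.Field()
    comment = scrapy.Field()
    content_img = scrapy.Field()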
CrawlSpider extracts URLs automatically and crawls both detail pages and next pages.
Syntax: scrapy genspider -t crawl <spider_name> <domain_to_crawl>
scrapy genspider -t crawl example example.com
Spider example:
# -*- coding: utf-8 -*-
import scrapy
import re
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class GgSpider(CrawlSpider):
    name = 'gg'
    allowed_domains = ['xxx.cn']
    start_urls = ['http://xxxx.html']

    rules = (
        # link_extractor is a LinkExtractor object defining how links are extracted from each
        # crawled page. Every extracted link is used to build a Request, which carries the link
        # text in its meta dict (under the link_text key). If omitted, a default LinkExtractor
        # created without arguments is used, which extracts all links.
        # callback is a callable, or a string naming a method on the spider, called for every
        # link extracted by the link extractor. It receives the Response as its first argument
        # and must return a single instance or an iterable of Item, dict and/or Request objects
        # (or any of their subclasses).
        # follow is a boolean specifying whether links should be followed from each response
        # extracted with this rule. If callback is None, follow defaults to True; otherwise it
        # defaults to False.
        Rule(LinkExtractor(allow=r'/\d+/t\d+_\d+\.html'), callback='parse_item'),
        Rule(LinkExtractor(allow=r'list_\d+\.html'), follow=True),
    )

    # do not define parse(): CrawlSpider implements it internally, and defining it here would
    # override that behaviour
    # def parse(self, response):
    #     print(response.text)

    def parse_item(self, response):
        item = {}
        item['title'] = response.xpath('//div[@class="dj-xl-tit"]/h2/text()').get()
        item['date'] = response.xpath('//div[@class="dysx"][4]/div[@class="tab_content"]/text()').extract_first()
        print(item)
        # return item
def start_requests(self):
    cookies = '。。。。。。'  # paste the raw Cookie header string here
    cookies = {i.split('=')[0]: i.split('=')[1] for i in cookies.split('; ')}
    yield scrapy.Request(
        self.start_urls[0],
        callback=self.parse,
        cookies=cookies
    )

def parse(self, response):
    yield scrapy.Request(
        'http://www.renren.com/xxxxxxx/profile',  # personal profile page
        callback=self.parse_detail
    )

def parse_detail(self, response):
    print(re.findall('人艰不拆', response.body.decode()))
def parse(self, response):
    authenticity_token = response.xpath('//input[@name="authenticity_token"]/@value').extract_first()
    ga_id = response.xpath('//input[@name="ga_id"]/@value').extract_first()
    webauthn_support = response.xpath('//input[@name="webauthn-support"]/@value').extract_first()  # supported
    webauthn_iuvpaa_support = response.xpath('//input[@name="webauthn-iuvpaa-support"]/@value').extract_first()
    # return_to:
    # required_field_ce3f:
    timestamp = response.xpath('//input[@name="timestamp"]/@value').extract_first()
    timestamp_secret = response.xpath('//input[@name="timestamp_secret"]/@value').extract_first()
    post_data = {
        'authenticity_token': authenticity_token,
        'ga_id': ga_id,
        'login': '[email protected]',
        'password': 'xxxxxx',
        'webauthn-support': webauthn_support,
        'webauthn-iuvpaa-support': webauthn_iuvpaa_support,
        'return_to': '',
        'required_field_ce3f': '',
        'timestamp': timestamp,
        'timestamp_secret': timestamp_secret
    }
    yield scrapy.FormRequest(
        'https://github.com/session',
        formdata=post_data,
        callback=self.after_login
    )

def after_login(self, response):
    print(re.findall('gelanyingqi', response.body.decode()))
scrapy.FormRequest.from_response
The formid argument of from_response lets you choose between multiple forms on the page.
Each input's name attribute becomes the formdata key, and its value attribute becomes the dictionary value.
def parse(self, response):
    yield scrapy.FormRequest.from_response(
        response,  # automatically finds the form action URL in the response
        formdata={
            'login': '[email protected]',
            'password': 'xxxxxx'
        },
        callback=self.after_login
    )

def after_login(self, response):
    print(re.findall('gelanyingqi', response.body.decode()))
In middlewares.py:
import random


class RandomUserAgentMiddleware:
    def process_request(self, request, spider):
        ua = random.choice(spider.settings.get('USER_AGENTS_LIST'))  # pick a random User-Agent
        request.headers['User-Agent'] = ua
        request.meta['proxy'] = 'http://1.85.5.66:8060'  # use a proxy IP


class CheckUserAgent:
    def process_response(self, request, response, spider):
        # print(dir(response))  # inspect the response attributes
        print('User-Agent used:', request.headers['User-Agent'])
        return response
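These middlewares only take effect once enabled in settings.py. A minimal sketch, assuming the mySpider project layout from earlier; the priority numbers and the two User-Agent strings are illustrative, and USER_AGENTS_LIST is the custom setting read by RandomUserAgentMiddleware:

# settings.py
DOWNLOADER_MIDDLEWARES = {
    'mySpider.middlewares.RandomUserAgentMiddleware': 543,  # illustrative priorities
    'mySpider.middlewares.CheckUserAgent': 544,
}

# custom setting read via spider.settings.get('USER_AGENTS_LIST')
USER_AGENTS_LIST = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1 Safari/605.1.15',
]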
dont_filter=True: when a request is built with dont_filter=True, its URL will be fetched repeatedly (use this when the content behind the URL keeps changing).
A request is normally only built when a brand-new URL is discovered.
URLs in start_urls, however, are always enqueued, whether or not they have been requested before,
because the requests built from start_urls are created with dont_filter=True.
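Scrapy's own Spider.start_requests does exactly that; paraphrased (the exact code varies between versions):

# scrapy/spiders/__init__.py (paraphrased)
def start_requests(self):
    for url in self.start_urls:
        yield Request(url, dont_filter=True)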
In the scrapy_redis scheduler.py source:
def enqueue_request(self, request):
    if not request.dont_filter and self.df.request_seen(request):
        # dont_filter=False and fingerprint already seen          -> not enqueued
        # dont_filter=False and fingerprint not seen (new URL)    -> enqueued
        # dont_filter=True                                        -> enqueued
        self.df.log(request, self.spider)
        return False
    if self.stats:
        self.stats.inc_value('scheduler/enqueued/redis', spider=self.spider)
    self.queue.push(request)  # enqueue the request
    return True
In the dupefilter.py source, request_seen delegates to request_fingerprint (scrapy/utils/request.py), which builds the fingerprint like this:
fp = hashlib.sha1()
fp.update(to_bytes(request.method))
fp.update(to_bytes(canonicalize_url(request.url, keep_fragments=keep_fragments)))
fp.update(request.body or b'')
if include_headers:
    for hdr in include_headers:
        if hdr in request.headers:
            fp.update(hdr)
            for v in request.headers.getlist(hdr):
                fp.update(v)
cache[cache_key] = fp.hexdigest()
return cache[cache_key]
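A quick way to see the effect, assuming the older scrapy.utils.request.request_fingerprint helper (deprecated in recent Scrapy releases in favour of request fingerprinter components): two requests whose URLs differ only in query-parameter order canonicalize to the same fingerprint.

from scrapy import Request
from scrapy.utils.request import request_fingerprint  # deprecated but still present in many versions

r1 = Request('http://example.com/page?a=1&b=2')
r2 = Request('http://example.com/page?b=2&a=1')  # same URL after canonicalize_url
print(request_fingerprint(r1) == request_fingerprint(r2))  # expected: True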
In the scrapy_redis dupefilter.py source:
fp = self.request_fingerprint(request)
# sadd returns the number of values actually added, i.e. zero if the fingerprint already existed
added = self.server.sadd(self.key, fp)
return added == 0
# enable the scrapy_redis dedup filter and scheduler in settings.py
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
SCHEDULER_PERSIST = True
REDIS_URL = 'redis://127.0.0.1:6379'
from scrapy_redis.spiders import RedisSpider


class MySpider(RedisSpider):
    """Spider that reads urls from redis queue (myspider:start_urls)."""
    name = 'myspider_redis'
    redis_key = 'myspider:start_urls'  # the start URLs are read from this Redis key
    allowed_domains = []
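The spider then idles until a start URL is pushed onto that Redis key. A minimal sketch using the redis-py client (the URL is a placeholder), equivalent to `lpush myspider:start_urls <url>` in redis-cli:

import redis

r = redis.Redis(host='127.0.0.1', port=6379)
r.lpush('myspider:start_urls', 'http://example.com/')  # placeholder start URL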
from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor
from scrapy_redis.spiders import RedisCrawlSpider


class MyCrawler(RedisCrawlSpider):
    """Spider that reads urls from redis queue (mycrawler:start_urls)."""
    name = 'mycrawler_redis'
    redis_key = 'mycrawler:start_urls'
    allowed_domains = []

    rules = (
        # follow all links
        Rule(LinkExtractor(), callback='parse_page', follow=True),
    )

    def parse_page(self, response):
        return {
            'name': response.css('title::text').extract_first(),
            'url': response.url,
        }