import scrapy


class QiubaiproItem(scrapy.Item):
    # define the fields for your item here like:
    author = scrapy.Field()
    content = scrapy.Field()
import scrapy
from qiubaiPro.items import QiubaiproItem


class QiubaiSpider(scrapy.Spider):
    name = 'qiubai'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://www.qiushibaike.com/text/']

    def parse(self, response, **kwargs):
        # print(response)
        ...
        item = QiubaiproItem()
        item['author'] = xxxx    # placeholder: author parsed from the response
        item['content'] = xxxx   # placeholder: content parsed from the response
        # hand the item over to the pipeline
        yield item

    # called once when the spider finishes
    def closed(self, reason):
        pass
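A minimal sketch of what the elided parse body could look like, assuming each post on the list page sits in its own div and exposes the author name and text via XPath; the selectors below are illustrative placeholders, not taken from the source:

    def parse(self, response, **kwargs):
        # NOTE: hypothetical selectors -- adjust them to the real page structure
        div_list = response.xpath('//div[@class="article"]')
        for div in div_list:
            item = QiubaiproItem()
            # extract_first() returns a plain string (or None if nothing matches)
            item['author'] = div.xpath('.//h2/text()').extract_first()
            # the post text may span several text nodes, so join them
            item['content'] = ''.join(div.xpath('.//div[@class="content"]//text()').extract())
            # hand each item over to the pipeline
            yield item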
class QiubaiproPipeline(object):
    # called when the spider starts; runs only once
    def open_spider(self, spider):
        print('Spider started!')

    # dedicated to handling item objects:
    # this method receives the item objects submitted by the spider,
    # and it is called once for every item it receives
    def process_item(self, item, spider):
        ...
        # the return value is passed on to the next pipeline class in line
        return item

    def close_spider(self, spider):
        print('Spider finished!')
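As one concrete possibility, the elided process_item body could persist every item to a local text file; a filled-in sketch, where the file name and the priority number are arbitrary placeholders:

class QiubaiproPipeline(object):
    fp = None

    def open_spider(self, spider):
        print('Spider started!')
        # open the output file once, when the spider starts
        self.fp = open('./qiubai.txt', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # one line per item, then pass the item on to the next pipeline class
        self.fp.write('{}:{}\n'.format(item['author'], item['content']))
        return item

    def close_spider(self, spider):
        print('Spider finished!')
        self.fp.close()


# settings.py -- the pipeline only runs once it is registered here
# ITEM_PIPELINES = {
#     'qiubaiPro.pipelines.QiubaiproPipeline': 300,   # 300 = priority, lower runs first
# }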
import scrapy
from scrapy.pipelines.images import ImagesPipeline


class imgsPileLine(ImagesPipeline):
    # issue a request for the image data based on the image URL stored in the item
    def get_media_requests(self, item, info):
        yield scrapy.Request(url=item['src'])

    # decide the file name the image is stored under
    def file_path(self, request, response=None, info=None, **kwargs):
        print('type of the request object:', type(request))
        imgName = request.url.split('/')[-1]
        return imgName

    def item_completed(self, results, item, info):
        return item  # pass the item on to the next pipeline class in line
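For the image pipeline to actually run, settings.py needs a storage directory and the pipeline registration; IMAGES_STORE is a standard Scrapy setting, while the project name imgsPro below is only a placeholder matching the class above:

# settings.py
IMAGES_STORE = './imgs'   # directory where ImagesPipeline saves the downloaded files

ITEM_PIPELINES = {
    # assumes the class above lives in the project's pipelines.py
    'imgsPro.pipelines.imgsPileLine': 300,
}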
import random

from scrapy.http import HtmlResponse  # needed to build the new response in process_response


class MiddleproDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.
    user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
        "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 "
        "(KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 "
        "(KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        # ... more user-agent strings omitted ...
    ]
    PROXY_http = [
        '153.180.102.104:80',
        '195.208.131.189:56055',
    ]
    PROXY_https = [
        '120.83.49.90:9000',
        '95.189.112.214:35508',
    ]
    # intercept requests
    # spider: the spider object (same below)
    def process_request(self, request, spider):
        # UA spoofing
        request.headers['User-Agent'] = random.choice(self.user_agent_list)
        # set a fixed proxy just to verify that the proxy mechanism works
        request.meta['proxy'] = 'http://183.146.213.198:80'
        return None

    # intercept every response
    def process_response(self, request, response, spider):
        ...
        # page_text is the (possibly re-rendered) page source obtained above
        new_response = HtmlResponse(
            url=request.url,    # the requested url
            body=page_text,     # the page source data
            encoding='utf-8',
            request=request     # the request object
        )
        return new_response

    # intercept requests that raised an exception
    def process_exception(self, request, exception, spider):
        if request.url.split(':')[0] == 'http':
            # proxy
            request.meta['proxy'] = 'http://' + random.choice(self.PROXY_http)
        else:
            request.meta['proxy'] = 'https://' + random.choice(self.PROXY_https)
        return request  # resend the corrected request
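The middleware only takes effect once it is enabled in settings.py; the project name middlePro is a placeholder, and 543 is the priority used by Scrapy's project template:

# settings.py
DOWNLOADER_MIDDLEWARES = {
    'middlePro.middlewares.MiddleproDownloaderMiddleware': 543,
}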
Note: when multiple link extractors and rule parsers are specified, request passing (sending data between callbacks via meta) is not possible in the callbacks, because scrapy.Request is never called explicitly.
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from sunPro.items import SunproItem, DetailItem


# Goal: crawl the sun site for the post number and news title on the list pages,
# plus the news content and number on the detail pages
class SunSpider(CrawlSpider):
    name = 'sun'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://wz.sun0769.com/index.php/question/questionType?type=4&page=']

    # link extractor: extracts the links that match the given rule (allow="regex")
    # the link extractor deduplicates: repeated links are filtered out automatically
    link = LinkExtractor(allow=正则)          # 正则 = regex for the list-page links
    link_detail = LinkExtractor(allow=正则)   # 正则 = regex for the detail-page links

    # rule parser: parses the links extracted by the link extractor with the given rule (callback)
    rules = (
        # follow=True: keep applying the link extractor to the pages behind the links it extracts
        Rule(link, callback='parse_item', follow=True),
        Rule(link_detail, callback='parse_detail'),
    )
    # parse the news number and the news title
    # request passing (meta) is not possible in these two parse methods (scrapy.Request is never called),
    # so the data they parse cannot be stored in the same item; two item classes are needed
    # (see the pipeline sketch after this spider)
    def parse_item(self, response):
        # note: the xpath expression must not contain the tbody tag
        tr_list = response.xpath('//*[@id="morelist"]/div/table[2]//tr/td/table//tr')
        for tr in tr_list:
            new_num = tr.xpath('./td[1]/text()').extract_first()
            new_title = tr.xpath('./td[2]/a[2]/@title').extract_first()
            item = SunproItem()
            item['title'] = new_title
            item['new_num'] = new_num
            yield item
    # parse the news content and the news number
    def parse_detail(self, response):
        new_id = response.xpath('/html/body/div[9]/table[1]//tr/td[2]/span[2]/text()').extract_first()
        new_content = response.xpath('/html/body/div[9]/table[2]//tr[1]//text()').extract()
        new_content = ''.join(new_content)
        # print(new_id, new_content)
        item = DetailItem()
        item['content'] = new_content
        item['new_id'] = new_id
        yield item
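Because the two callbacks yield two different item classes, the pipeline has to tell them apart before storing anything; a minimal sketch, assuming sunPro/items.py defines SunproItem (title, new_num) and DetailItem (content, new_id) as plain scrapy.Field()s:

# sunPro/pipelines.py -- distinguish the two item types by their class name
class SunproPipeline(object):
    def process_item(self, item, spider):
        if item.__class__.__name__ == 'DetailItem':
            # item yielded by parse_detail: news id + content
            print(item['new_id'], item['content'])
        else:
            # item yielded by parse_item: news number + title
            print(item['new_num'], item['title'])
        return item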