Note: if an extracted URL is incomplete, CrawlSpider automatically completes it before sending the request; do not override the parse method, because CrawlSpider uses it internally for its own logic; the responses for the URLs extracted by the link extractor are handed to the callback for processing.
Goal: crawl the data of every paginated page of the Phoenix Weekly (凤凰周刊) | News section.
Create the project: scrapy startproject sunPro
Create the CrawlSpider file: scrapy genspider -t crawl sun www.xxx.com
Configure the settings.py file:
LOG_LEVEL = 'ERROR'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/...
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
Use the browser developer tools to inspect the attributes of the pagination links.
Every page's pagination link has an href of the form '/list.php?lmid=5&page=num' (where num is the page number),
so the regular expression to write is: lmid=5&page=\d+
Knowledge point: what does the regular expression \d+|(\d+\.\d+) mean? It matches either an integer (\d+) or a decimal number (\d+\.\d+).
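For illustration, a quick check of both points with Python's re module (the sample hrefs are assumed values, not taken from the live page):

import re

# Assumed pagination hrefs of the form observed above
hrefs = ['/list.php?lmid=5&page=1', '/list.php?lmid=5&page=12']
print([h for h in hrefs if re.search(r'lmid=5&page=\d+', h)])  # both hrefs match

# The knowledge-point regex matches either an integer or a decimal number
print(re.fullmatch(r'\d+|(\d+\.\d+)', '42').group())    # '42'
print(re.fullmatch(r'\d+|(\d+\.\d+)', '3.14').group())  # '3.14'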
First, crawl all the paginated pages.
sun.py:
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class SunSpider(CrawlSpider):
    name = 'sun'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://www.ifengweekly.com/list.php?lmid=39&page=1']
    # Link extractor: extracts the links that match the rule given in allow (a regex)
    link = LinkExtractor(allow=r'lmid=5&page=\d+')
    # Rule: hands the links collected by the link extractor to the given callback for parsing
    # follow=True keeps following pagination links found on each crawled page, so all pages get crawled
    rules = (
        Rule(link, callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        print(response)
All pages were crawled successfully.
sun.py:
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class SunSpider(CrawlSpider):
    name = 'sun'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://www.cnblogs.com/sitehome/p/1']
    # Link extractor: extracts the links that match the rule given in allow (a regex)
    link = LinkExtractor(allow=r'/sitehome/p/\d+')
    # Rule: hands the links collected by the link extractor to the given callback for parsing
    rules = (
        Rule(link, callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        div_list = response.xpath('//*[@id="hot"]/div[@class="column"]')
        for div in div_list:
            title = div.xpath('./h1/a/text()').extract_first()
            time = div.xpath('./p/text()').extract_first()
            number = div.xpath('./p/a/text()').extract_first()
            print(title, time, number)
Locate the detail-page links, then build a function that parses the detail page and submit the response containing the data to be parsed to that function.
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class SunSpider(CrawlSpider):
    name = 'sun'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://www.ifengweekly.com/list.php?lmid=5&page=1']
    # Link extractor: extracts the links that match the rule given in allow (a regex)
    link = LinkExtractor(allow=r'lmid=5&page=\d+')
    # Link extractor for detail pages (not used here; detail requests are issued manually below)
    # detail_link = LinkExtractor(allow=r'')
    # Rule: hands the links collected by the link extractor to the given callback for parsing
    # follow=True keeps following pagination links, so all pages get crawled
    rules = (
        Rule(link, callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        div_list = response.xpath('//*[@id="hot"]/div[@class="column"]')
        for div in div_list:
            title = div.xpath('./h1/a/text()').extract_first()
            time = div.xpath('./p/text()').extract_first()
            number = div.xpath('./p/a/text()').extract_first()
            detail_url = 'http://www.ifengweekly.com/' + div.xpath('./h1/a/@href').extract_first()
            # print(detail_url)
            yield scrapy.Request(url=detail_url, callback=self.parse_detail)

    def parse_detail(self, response):
        print(response)
Write the detail-page parsing code.
sun.py:
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class SunSpider(CrawlSpider):
    name = 'sun'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://www.ifengweekly.com/list.php?lmid=39&page=1']
    # Link extractor: extracts the links that match the rule given in allow (a regex)
    link = LinkExtractor(allow=r'lmid=5&page=\d+')
    # Link extractor for detail pages (not used here; detail requests are issued manually below)
    # detail_link = LinkExtractor(allow=r'')
    # Rule: hands the links collected by the link extractor to the given callback for parsing
    # follow=True keeps following pagination links, so all pages get crawled
    rules = (
        Rule(link, callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        div_list = response.xpath('//*[@id="hot"]/div[@class="column"]')
        for div in div_list:
            title = div.xpath('./h1/a/text()').extract_first()
            time = div.xpath('./p/text()').extract_first()
            number = div.xpath('./p/a/text()').extract_first()
            detail_url = 'http://www.ifengweekly.com/' + div.xpath('./h1/a/@href').extract_first()
            # print(detail_url)
            yield scrapy.Request(url=detail_url, callback=self.parse_detail)

    def parse_detail(self, response):
        # Join the text nodes of the article body into a single string
        content = response.xpath('//*[@id="detil"]/div[3]/p//text()').extract()
        content = ''.join(content)
        print(content)
Enable the item pipeline by adding ITEM_PIPELINES to settings.py:
ITEM_PIPELINES = {
    'sunPro.pipelines.SunproPipeline': 300,
}
pipelines.py:
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
import os


class SunproPipeline:
    def open_spider(self, spider):
        print('Spider started......')
        # Create the output directory once, before any items are processed
        if not os.path.exists('./news'):
            os.mkdir('./news')

    def process_item(self, item, spider):
        title = item['title']
        time = item['time']
        number = item['number']
        content = item['content']
        file_name = './news/' + title + '.txt'
        # Write each article to its own file; "with" closes the file handle automatically,
        # instead of reassigning a single self.fp and leaking the previously opened file
        with open(file_name, 'w', encoding='utf-8') as fp:
            fp.write(title + time + number + '\n' + content)
        print(title)
        return item

    def close_spider(self, spider):
        print('Spider finished.')
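The items.py file is imported by sun.py below (from sunPro.items import SunproItem) but is not shown in the original; a minimal sketch consistent with the fields assigned in parse_item/parse_detail and read in the pipeline would be:

items.py:
import scrapy


class SunproItem(scrapy.Item):
    # Fields filled in by parse_item / parse_detail in sun.py
    title = scrapy.Field()
    time = scrapy.Field()
    number = scrapy.Field()
    content = scrapy.Field()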
sun.py:
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from sunPro.items import SunproItem
import re


class SunSpider(CrawlSpider):
    name = 'sun'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://www.ifengweekly.com/list.php?lmid=39&page=1']
    # Link extractor: extracts the links that match the rule given in allow (a regex)
    link = LinkExtractor(allow=r'lmid=39&page=\d+')
    # Link extractor for detail pages (not used here; detail requests are issued manually below)
    # detail_link = LinkExtractor(allow=r'')
    # Rule: hands the links collected by the link extractor to the given callback for parsing
    # follow=True keeps following pagination links, so all pages get crawled
    rules = (
        Rule(link, callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        div_list = response.xpath('//*[@id="hot"]/div[@class="column"]')
        for div in div_list:
            title = div.xpath('./h1/a/text()').extract_first()
            time = div.xpath('./p/text()').extract_first()
            number = div.xpath('./p/a/text()').extract_first()
            detail_url = 'http://www.ifengweekly.com/' + div.xpath('./h1/a/@href').extract_first()
            time = time.replace("\r", "")
            item = SunproItem()
            item['title'] = title
            item['time'] = time
            item['number'] = number
            # print(item)
            # print(detail_url)
            # Pass the partially filled item to the detail-page callback through meta
            yield scrapy.Request(url=detail_url, callback=self.parse_detail, meta={'item': item})

    def parse_detail(self, response):
        content = response.xpath('//*[@id="detil"]/div[@class="jfont"]/p//text()').extract()
        # Remove \r, \t and \n from the text via re.sub
        # content = [re.sub(r"\r|\t|\n", "", i) for i in content]
        # content = [i for i in content if len(i) > 0]
        content = ''.join(content)
        # This achieves the same effect as the re-based replacement above
        content = content.replace("\r", "").replace("\t", "").replace("\n", "")
        item = response.meta['item']
        item['content'] = content
        # print(item)
        yield item
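Finally, run the spider from the project root with scrapy crawl sun; each article should then be written to its own .txt file under the ./news directory created by the pipeline.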