This series is my own set of notes from learning web scraping; if you spot any mistakes, corrections are welcome.
Scrapy is an application framework written for crawling websites and extracting data. Put simply, it bundles the three steps of a crawler (fetching pages, parsing them, and storing the data) into a single framework.
Scrapy's main components are the Scrapy Engine, the Scheduler, the Downloader, the Spider, and the Item Pipeline, plus two middlewares: the Downloader Middlewares and the Spider Middlewares. They cooperate as follows (a minimal code sketch of this flow is given after the list):
(1) Engine: asks the Spider for the first URL to crawl.
(2) Spider: hands the starting URL to the Engine.
(3) Engine: receives the URL and passes it to the Scheduler to be queued.
(4) Scheduler: turns it into a Request and returns it to the Engine.
(5) Engine: receives the Request and sends it to the Downloader through the Downloader Middlewares.
(6) Downloader: downloads the page for that Request and returns a Response to the Engine.
(7) Engine: receives the Response and passes it to the Spider through the Spider Middlewares.
(8) Spider: processes the Response, extracts the data (here, blog titles), and returns the resulting Items to the Engine, along with any follow-up Requests.
(9) Engine: receives the Items and sends them to the Item Pipeline; new Requests go back to the Scheduler.
(10) Item Pipeline: stores the data.
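Of these components, only the Spider and the Item Pipeline are code you write yourself; the Engine, Scheduler and Downloader are provided by the framework. A minimal sketch of how the flow above maps onto your code (ExampleSpider, its URL and the pipeline are placeholders, not part of the projects below):

import scrapy


class ExampleSpider(scrapy.Spider):
    name = 'example'
    start_urls = ['https://example.com/']        # steps (1)-(4): turned into queued Requests

    def parse(self, response):                   # steps (5)-(7): the downloaded Response arrives here
        # step (8): extract data and hand Items back to the Engine
        yield {'title': response.css('title::text').get()}
        # follow-up Requests would also be yielded here and go back to the Scheduler, e.g.:
        # yield scrapy.Request(url=..., callback=self.parse)


class ExamplePipeline(object):
    # steps (9)-(10): items arrive here once the pipeline is enabled in ITEM_PIPELINES
    def process_item(self, item, spider):
        print(item)
        return item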
Install Scrapy with pip or with conda:
pip install Scrapy
conda install -c conda-forge scrapy
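The two commands are alternatives; use whichever package manager you prefer. Before any of the files below exist, the project itself has to be created. The project name blogSpider is assumed here because that is the package the later imports (from blogSpider.items import BlogspiderItem) point at:

scrapy startproject blogSpider

This generates roughly the following layout:

blogSpider/
    scrapy.cfg            # deployment configuration
    blogSpider/           # the project's Python package
        __init__.py
        items.py          # item definitions
        middlewares.py    # spider / downloader middlewares
        pipelines.py      # item pipelines
        settings.py       # project settings
        spiders/          # spider modules go here
            __init__.py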
Before crawling, define the target fields. In items.py, declare the fields you need:
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class BlogspiderItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    date = scrapy.Field()
    #content = scrapy.Field()
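A scrapy.Item behaves like a dictionary whose keys are restricted to the declared fields, which is why the spiders below assign item['title'] and item['date']. A quick interactive check (not part of the project files, values are made up):

from blogSpider.items import BlogspiderItem

item = BlogspiderItem()
item['title'] = 'some title'
item['date'] = '2021-02-03'
print(item['title'])    # read access works like a dict
print(dict(item))       # {'title': 'some title', 'date': '2021-02-03'}
# item['author'] = 'x'  # would raise KeyError: 'author' is not a declared field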
In the project directory, run
scrapy genspider csdn blog.csdn.net
This creates a file csdn.py under blogSpider/spiders; it defines the spider's name and the domains it is allowed to crawl:
# -*- coding: utf-8 -*-
import scrapy


class SantotangSpider(scrapy.Spider):
    name = 'csdn'
    allowed_domains = ['blog.csdn.net']
    start_urls = ['https://blog.csdn.net/weixin_51656605/article/details/113567112']

    def parse(self, response):
        pass
The above is the content generated by default; if you prefer not to use the command, creating the file and its contents by hand works just as well.
Now fill in the parse method:
# -*- coding: utf-8 -*-
import scrapy


class SantotangSpider(scrapy.Spider):
    name = 'csdn'
    allowed_domains = ['blog.csdn.net']
    start_urls = ['https://blog.csdn.net/weixin_51656605/article/details/113567112']

    def parse(self, response):
        print(response.text)
        with open('index.html', 'w', encoding='utf-8') as f:
            f.write(response.text)
Then run:
scrapy crawl csdn
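If the crawl logs show requests being filtered or forbidden, the usual suspects are robots.txt and the default User-Agent. Whether you need them depends on the target site, but these are standard Scrapy settings that can be adjusted in settings.py:

# settings.py (adjust only if the target site requires it)
ROBOTSTXT_OBEY = False      # the project template defaults this to True
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'  # send a browser-like UA
DOWNLOAD_DELAY = 1          # wait 1 second between requests to be polite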
Next, rewrite the spider to extract the data into Items so it can be written to a file:
# -*- coding: utf-8 -*-
import scrapy
from bs4 import BeautifulSoup
from blogSpider.items import BlogspiderItem


class CsdnSpider(scrapy.Spider):
    name = 'csdn'
    allowed_domains = ['blog.csdn.net']
    start_urls = ['https://blog.csdn.net/weixin_51656605/category_10700169.html']

    def parse(self, response):
        #print(response.text)
        #with open('index.html', 'w', encoding='utf-8') as f:
        #    f.write(response.text)
        soup = BeautifulSoup(response.text, 'lxml')
        # each <li> inside the column_article_list <ul> is one article
        article_list = soup.find('ul', class_='column_article_list').find_all('li')
        #print(article_list)
        article_info_list = []
        for article in article_list:
            title = article.find('h2', class_='title').text.strip().replace(' ', '')
            date = article.find('div', class_='column_article_data').span.text.strip().replace(' ', '')
            #article_info_list.append([title, date])
            item = BlogspiderItem()
            item['title'] = title
            item['date'] = date
            article_info_list.append(item)
            #print(article.text)
        return article_info_list
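BeautifulSoup is not strictly required here; Scrapy responses come with built-in CSS/XPath selectors. A rough equivalent of the parse method above using them (the CSS classes are the same ones assumed in the BeautifulSoup calls, so this is a sketch, not verified against the live page):

    def parse(self, response):
        for article in response.css('ul.column_article_list li'):
            item = BlogspiderItem()
            item['title'] = article.css('h2.title::text').get(default='').strip()
            item['date'] = article.css('div.column_article_data span::text').get(default='').strip()
            yield item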
Modify the pipeline:
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html


class BlogspiderPipeline(object):
    file_path = 'result.txt'

    def __init__(self):
        self.article = open(self.file_path, 'a+', encoding='utf-8')

    def process_item(self, item, spider):
        title = item['title']
        date = item['date']
        output = title + '\t' + date + '\t'
        self.article.write(output)
        return item
In settings.py, uncomment the ITEM_PIPELINES block:
ITEM_PIPELINES = {
    'blogSpider.pipelines.BlogspiderPipeline': 300,
}
Then run
scrapy crawl csdn
again and the scraped data will be saved locally in result.txt.
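Incidentally, if you only want the items dumped to a file, Scrapy's built-in feed exports can do that without writing a pipeline at all:

scrapy crawl csdn -o result.json
scrapy crawl csdn -o result.csv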
Complete code for crawling the article list and then each article's content:
csdn.py
# -*- coding: utf-8 -*-
import scrapy
from bs4 import BeautifulSoup
from blogSpider.items import BlogspiderItem


class SantotangSpider(scrapy.Spider):
    name = 'csdn'
    allowed_domains = ['blog.csdn.net']
    start_urls = ['https://blog.csdn.net/weixin_51656605/category_10690950.html']

    def parse(self, response):
        soup = BeautifulSoup(response.text, 'lxml')
        article_list = soup.find('ul', class_='column_article_list').find_all('li')
        for article in article_list:
            link = article.a['href']
            title = article.find('h2', class_='title').text.strip().replace(' ', '')
            date = article.find('div', class_='column_article_data').span.text.strip().replace(' ', '')
            # wrap the data in a BlogspiderItem (a dict-like object)
            item = BlogspiderItem()
            item['title'] = title
            item['date'] = date
            item['link'] = link
            # request the article page itself, passing the item along via meta
            yield scrapy.Request(url=link, meta={'item': item}, callback=self.parse2)

    def parse2(self, response):
        # retrieve the item passed from parse
        item = response.meta['item']
        soup = BeautifulSoup(response.text, 'lxml')
        content = soup.find('div', class_='article_content clearfix').text
        item['content'] = content
        # hand the finished item to the item pipeline
        yield item
Compared with the previous version, two things changed: yield is used, and a second callback parse2 was added. Scrapy's ability to fetch pages concurrently comes from yield: yielding a scrapy.Request issues a new request, with url set to the article link, meta used to pass the item along, and callback naming the function that will handle the response, here self.parse2.
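As a side note, newer Scrapy versions (1.7 and later) also support cb_kwargs, which passes data to the callback as ordinary keyword arguments instead of going through meta. A sketch of the same hand-off with cb_kwargs; only the two callbacks are shown, the rest of the spider stays the same:

    def parse(self, response):
        soup = BeautifulSoup(response.text, 'lxml')
        for article in soup.find('ul', class_='column_article_list').find_all('li'):
            item = BlogspiderItem()
            item['title'] = article.find('h2', class_='title').text.strip()
            item['date'] = article.find('div', class_='column_article_data').span.text.strip()
            item['link'] = article.a['href']
            # cb_kwargs delivers item directly as a parameter of parse2
            yield scrapy.Request(url=item['link'], callback=self.parse2,
                                 cb_kwargs={'item': item})

    def parse2(self, response, item):
        soup = BeautifulSoup(response.text, 'lxml')
        item['content'] = soup.find('div', class_='article_content clearfix').text
        yield item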
items.py
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class BlogspiderItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    date = scrapy.Field()
    link = scrapy.Field()
    content = scrapy.Field()
pipelines.py
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html


class BlogspiderPipeline(object):
    file_path = 'result.txt'

    def __init__(self):
        self.article = open(self.file_path, 'a+', encoding='utf-8')

    def process_item(self, item, spider):
        title = item['title']
        date = item['date']
        link = item['link']
        content = item['content']
        output = title + '\t' + date + '\t' + link + '\t' + content
        self.article.write(output)
        return item
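One detail worth noting: the pipeline above opens result.txt in __init__ and never closes it. Scrapy pipelines provide open_spider/close_spider hooks for exactly this; a variant of the same pipeline using them (a newline is also written after each record here, which the tab-only separator above omits):

class BlogspiderPipeline(object):
    file_path = 'result.txt'

    def open_spider(self, spider):
        # called once when the spider starts
        self.article = open(self.file_path, 'a+', encoding='utf-8')

    def process_item(self, item, spider):
        output = item['title'] + '\t' + item['date'] + '\t' + item['link'] + '\t' + item['content'] + '\n'
        self.article.write(output)
        return item

    def close_spider(self, spider):
        # called once when the spider finishes; flushes and closes the file
        self.article.close()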
settings.py
# This block is not new; just uncomment it in the generated settings.py
ITEM_PIPELINES = {
    'blogSpider.pipelines.BlogspiderPipeline': 300,
}
A second example: crawl the Eastmoney financial news list at
http://finance.eastmoney.com/news/cywjh.html
Create the project:
scrapy startproject financeSpider
Generate the spider template:
scrapy genspider finance finance.eastmoney.com
Define the fields in items.py:
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class FinancespiderItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    link = scrapy.Field()
    content = scrapy.Field()
    date = scrapy.Field()
    source = scrapy.Field()
    comment = scrapy.Field()
    involve = scrapy.Field()
Write the requests and the parsing logic in finance.py:
# -*- coding: utf-8 -*-
import scrapy
from bs4 import BeautifulSoup
from financeSpider.items import FinancespiderItem


class FinanceSpider(scrapy.Spider):
    name = 'finance'
    allowed_domains = ['finance.eastmoney.com']
    start_urls = ['http://finance.eastmoney.com/news/cywjh_1.html']
    url_head = 'http://finance.eastmoney.com/news/cywjh_'
    url_end = '.html'

    # start_requests is a built-in Scrapy hook; overriding it lets us
    # loop over the first three pages of the news list
    def start_requests(self):
        for i in range(1, 4):
            url = self.url_head + str(i) + self.url_end
            print('Current page: ' + url)
            # request each news-list page
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        soup = BeautifulSoup(response.text, 'lxml')
        title_list = soup.find_all('p', class_='title')
        for i in range(len(title_list)):
            item = FinancespiderItem()
            title = title_list[i].a.text.strip()
            link = title_list[i].a['href']
            item['title'] = title
            item['link'] = link
            # request each article page, passing the item along via meta
            yield scrapy.Request(url=link, meta={'item': item}, callback=self.parse2)

    def parse2(self, response):
        # retrieve the item passed from parse
        item = response.meta['item']
        # parse the article page
        soup = BeautifulSoup(response.text, 'lxml')
        date = soup.find('div', class_='time').text.strip()
        source = soup.find('div', class_='data-source')['data-source']
        content = soup.find('div', id='ContentBody').text.strip()
        content = content.replace('\n', ' ')
        comment = soup.find('span', class_='cNumShow num').text.strip()
        involve = soup.find('span', class_='num ml5').text.strip()
        item['content'] = content
        item['date'] = date
        item['source'] = source
        item['comment'] = comment
        item['involve'] = involve
        # hand the finished item to the item pipeline
        yield item
Modify pipelines.py to store the scraped content:
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html


class FinancespiderPipeline(object):
    file_path = 'finance.txt'

    def __init__(self):
        self.article = open(self.file_path, 'a+', encoding='utf-8')

    def process_item(self, item, spider):
        title = item['title']
        link = item['link']
        content = item['content']
        date = item['date']
        source = item['source']
        comment = item['comment']
        involve = item['involve']
        output = title + '\t' + link + '\t' + date + '\t' + source + '\t' + comment + '\t' + involve + '\t' + content + '\t'
        self.article.write(output)
        return item
settings.py
ITEM_PIPELINES = {
    'financeSpider.pipelines.FinancespiderPipeline': 300,
}
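Finally, run the spider from the project directory as before, and the results are appended to finance.txt:

scrapy crawl finance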