Create a directory
import os
if not os.path.exists('./tupians'):
    os.mkdir('./tupians')
Save an image
imgData = data  # binary image data
with open('./tupians.jpg', 'wb') as fp:
    fp.write(imgData)
for pageNum in range(1, 36):
    print(pageNum)
    # prints 1, 2, 3 ... 35
arr = [1, 1, 2]
for a in arr:
    print(a)
    # prints 1, 1, 2
import re
ex = '.*?'
imgSrc = re.findall(ex, response, re.S)
# .*? matches any characters, non-greedily
# re.S (DOTALL) makes . also match newline characters
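A minimal sketch of the same idea; the HTML snippet and the pattern below are made up for illustration, not taken from the original page:
import re

html = '<div class="pic"><img src="https://example.com/a.jpg" alt="a"></div>'
ex = '<img src="(.*?)" alt=.*?>'      # capture the src value non-greedily
imgSrc = re.findall(ex, html, re.S)   # re.S lets . cross line breaks in real pages
print(imgSrc)                         # ['https://example.com/a.jpg']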
requests
# Send a GET request and read the response as text; newUrl is the URL string, headers is the header dict
response = requests.get(url=newUrl,headers=headers).text
# Send a GET request and read the response as binary content; src is the URL string, headers2 is the header dict
imgData = requests.get(url=src,headers=headers2).content
# Send a POST request; data is the form data (dict)
response = requests.post(url=newUrl,headers=headers,data=data)
response.status_code  # HTTP status code; 200 means the request succeeded
# Use a Session so the Cookie field is added to subsequent request headers automatically
session = requests.Session()
session.post(url=url,headers=headers,data=data).text
# Proxy IP (usually written with scheme and port, e.g. 'https://ip:port')
response = requests.get(url=url,headers=headers,proxies={"https":"202.12.1.22"}).text
Proxy sources: www.goubanjia.com
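Putting the pieces together, a minimal sketch of downloading one image with requests; the URL and the User-Agent value are placeholders, not from the notes:
import os
import requests

headers = {'User-Agent': 'Mozilla/5.0'}                     # placeholder UA
src = 'https://example.com/pic/1.jpg'                       # placeholder image URL
if not os.path.exists('./tupians'):
    os.mkdir('./tupians')
imgData = requests.get(url=src, headers=headers).content    # binary response body
with open('./tupians/1.jpg', 'wb') as fp:
    fp.write(imgData)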
xpath
Attribute contains
import requests
from lxml import etree

res = requests.get(url=linkPath, headers=headers).text
tree = etree.HTML(res)
# contains() matches elements whose class attribute contains "ws_block"
links = tree.xpath('//div[contains(@class,"ws_block")]/a')
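A short follow-up sketch for reading the matched a elements with the usual lxml calls (what the hrefs contain depends on the actual page):
for a in links:
    href = a.xpath('./@href')     # list of href attribute values
    text = a.xpath('./text()')    # list of text nodes
    print(href, text)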
asyncio
import asyncio

async def request(url):
    print('begin')
    return 'over'

# a function defined with async returns a coroutine object when called
c = request('www.baidu.com')

# asyncio.get_event_loop() raises "There is no current event loop" here (see the note below),
# so create an event loop and register it explicitly:
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
#loop = asyncio.get_event_loop()
# register the coroutine object with the loop, then start the loop
#loop.run_until_complete(c)
### task: create a Task object
##task = loop.create_task(c)
##print(task)
##
##loop.run_until_complete(task)
##print(task)
### future: create a Future object
##task = asyncio.ensure_future(c)
##print(task)
##
##loop.run_until_complete(task)
##print(task)
# bind a callback
def callbackFn(task):
    print(task.result())

task = asyncio.ensure_future(c)
task.add_done_callback(callbackFn)
loop.run_until_complete(task)
print('all over')

Note on the error "There is no current event loop":
replace this line:
loop = asyncio.get_event_loop()
with these two lines:
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
Multiple tasks
import asyncio
import time

async def request(url):
    print('begin', url)
    await asyncio.sleep(2)   # simulate blocking I/O with an awaitable sleep
    print('end', url)

start = time.time()
urls = ['www.baidu.com', 'www.souhu.com', 'www.sogou.com']
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
stasks = []
for url in urls:
    c = request(url)
    task = asyncio.ensure_future(c)
    stasks.append(task)
#loop = asyncio.get_event_loop()
loop.run_until_complete(asyncio.wait(stasks))
stop = time.time()
print(stop - start)   # roughly 2 seconds rather than 6, because the tasks run concurrently
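On Python 3.7+ the same thing can be written more compactly with asyncio.run and gather; a sketch, not from the original notes:
import asyncio
import time

async def request(url):
    print('begin', url)
    await asyncio.sleep(2)
    print('end', url)

async def main():
    urls = ['www.baidu.com', 'www.souhu.com', 'www.sogou.com']
    await asyncio.gather(*(request(u) for u in urls))

start = time.time()
asyncio.run(main())            # creates and closes the event loop for you
print(time.time() - start)     # still about 2 seconds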
Test service with artificial delays (Flask)
from flask import Flask
import time

app = Flask(__name__)

@app.route('/walk')
def index_walk():
    time.sleep(2)
    return 'i walk'

@app.route('/say')
def index_say():
    time.sleep(2)
    return 'i say'

@app.route('/sing')
def index_sing():
    time.sleep(2)
    return 'i sing'

if __name__ == '__main__':
    app.run(debug=True)
Asynchronous requests with aiohttp
import time
import asyncio
import aiohttp

start = time.time()
urls = ['http://127.0.0.1:5000/walk', 'http://127.0.0.1:5000/sing', 'http://127.0.0.1:5000/say']

async def getPage(url):
    print('begin', url)
    async with aiohttp.ClientSession() as session:
        # use session.post() to send a POST request
        # headers=headers adds headers; params/data work the same way; proxy='http://ip:port'
        async with session.get(url) as response:
            pageText = await response.text()  # read() returns bytes; json() returns a JSON object
            print('over1', url)
    # response = requests.get(url=url)  # the blocking version this replaces
    print('over2', url)

pool = asyncio.new_event_loop()
asyncio.set_event_loop(pool)
tasks = []
for url in urls:
    c = getPage(url)
    task = asyncio.ensure_future(c)
    tasks.append(task)
pool.run_until_complete(asyncio.wait(tasks))
end = time.time()
print(end - start)
begin http://127.0.0.1:5000/walk
begin http://127.0.0.1:5000/sing
begin http://127.0.0.1:5000/say
over1 http://127.0.0.1:5000/say
over2 http://127.0.0.1:5000/say
over1 http://127.0.0.1:5000/walk
over1 http://127.0.0.1:5000/sing
over2 http://127.0.0.1:5000/walk
over2 http://127.0.0.1:5000/sing
2.1247029304504395
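If the page text needs to be collected rather than just printed, the coroutine can return it and asyncio.gather collects the results in order; a sketch against the same three test URLs, assuming the Flask service above is running:
import asyncio
import aiohttp

async def getPage(url):
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            return await response.text()

async def main(urls):
    # gather preserves the order of the input coroutines
    return await asyncio.gather(*(getPage(u) for u in urls))

urls = ['http://127.0.0.1:5000/walk', 'http://127.0.0.1:5000/sing', 'http://127.0.0.1:5000/say']
texts = asyncio.run(main(urls))
print(texts)   # expected: ['i walk', 'i sing', 'i say']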
selenium
Download the browser driver: http://chromedriver.storage.googleapis.com/index.html
Chrome/chromedriver version mapping: https://blog.csdn.net/huilan_same/article/details/51896672
Example
from selenium import webdriver
from time import sleep
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

s = Service('./chromedriver.exe')
bro = webdriver.Chrome(service=s)
bro.get('https://www.taobao.com/')
# locate the element
search_input = bro.find_element(By.ID, 'q')
# interact with the element
search_input.send_keys('IPhone')
# execute a JavaScript snippet
bro.execute_script('window.scrollTo(0,document.body.scrollHeight)')
sleep(2)
# click the search button
btn = bro.find_element(By.CSS_SELECTOR, '.btn-search')
btn.click()
bro.get('https://www.baidu.com')
sleep(2)
# go back
bro.back()
sleep(2)
# go forward
bro.forward()
sleep(2)
sleep(5)
bro.quit()
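Fixed sleep() calls are fragile; an explicit wait is usually more reliable. A sketch using selenium's WebDriverWait, with the same Taobao search box as the target:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

bro = webdriver.Chrome(service=Service('./chromedriver.exe'))
bro.get('https://www.taobao.com/')
# wait up to 10 seconds for the search box to appear instead of sleeping blindly
search_input = WebDriverWait(bro, 10).until(
    EC.presence_of_element_located((By.ID, 'q')))
search_input.send_keys('IPhone')
bro.quit()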
iframes and drag-and-drop
from selenium import webdriver
from selenium.webdriver import ActionChains
from time import sleep

s = webdriver.chrome.service.Service('./chromedriver.exe')
bro = webdriver.Chrome(service=s)
bro.get('https://www.runoob.com/try/try.php?filename=tryhtml5_draganddrop')
# switch the driver's scope into the iframe
bro.switch_to.frame('iframeResult')
div = bro.find_element('id', 'drag1')
print(div)
# action chain
action = ActionChains(bro)
# click and hold the element
#action.click_and_hold(div)
element = bro.find_element('id', 'div1')
print(element)
sleep(3)
action.drag_and_drop(div, element).perform()
##element = bro.find_element('id','div1')
##action.move_to_element(element).perform()
##for i in range(5):
##    # perform() executes the queued actions immediately
##    action.move_by_offset(0, -5).perform()
##    sleep(0.3)
# release the mouse
action.release().perform()
sleep(1)
bro.quit()
This did not work for me; the reason is unclear.
Headless mode (no visible browser window)
from selenium import webdriver
from time import sleep
from selenium.webdriver.chrome.options import Options
# evade automation detection
from selenium.webdriver import ChromeOptions

option = ChromeOptions()
option.add_experimental_option('excludeSwitches', ['enable-automation'])

chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')

s = webdriver.chrome.service.Service('./chromedriver.exe')
# pass options=option instead to use the detection-evasion options
bro = webdriver.Chrome(service=s, options=chrome_options)
# headless browser
bro.get('https://www.baidu.com')
print(bro.page_source)
sleep(2)
bro.quit()
Chaojiying (captcha recognition service):
Register an account, top up, create a software ID, and download their example client.
www.chaojiying.com/about.html
Screenshots
from selenium import webdriver
from time import sleep
from selenium.webdriver.common.by import By

s = webdriver.chrome.service.Service('./chromedriver.exe')
bro = webdriver.Chrome(service=s)
#bro.get('https://kyfw.12306.cn/otn/login/init')
bro.get('https://www.sina.com.cn/')
bro.save_screenshot('aa.png')   # full-page screenshot
#imgEle = bro.find_element(By.CLASS_NAME, 'hxjzg')
#imgEle = bro.find_element(By.XPATH, '/html/body/div[12]/div/a[2]')
#imgEle.screenshot('bb.png')    # element-level screenshot
sleep(4)
bro.quit()
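If the element screenshot is unreliable, another common approach is to crop the element's region out of the full-page screenshot with Pillow, using the element's location and size. A sketch, assuming Pillow is installed and imgEle was located as above:
from PIL import Image

# imgEle is the located WebElement; aa.png is the full-page screenshot saved above
location = imgEle.location        # {'x': ..., 'y': ...}
size = imgEle.size                # {'width': ..., 'height': ...}
box = (location['x'], location['y'],
       location['x'] + size['width'], location['y'] + size['height'])
# on high-DPI screens the coordinates may need scaling by the device pixel ratio
Image.open('aa.png').crop(box).save('bb.png')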
scrapy
Installation:
- pip install wheel
- Download Twisted from https://www.lfd.uci.edu/~gohlke/pythonlibs/#twisted
- Install Twisted: pip install Twisted-…whl
- pip install pywin32
- pip install scrapy
Create a project
If scrapy was not added to PATH during installation, run it from its install path, e.g. C:\Users\admin\AppData\Roaming\Python\Python310\Scripts
Create the project: scrapy startproject helloscrapy
(C:\Users\admin\AppData\Roaming\Python\Python310\Scripts\scrapy.exe startproject helloscrapy)
New project directory layout:
(screenshot omitted)
Add a spider file
cd into the project directory and run: scrapy genspider spiderName www.xxx.com
(C:\Users\admin\AppData\Roaming\Python\Python310\Scripts\scrapy.exe genspider spiderName www.xxx.com)
This creates a file named spiderName.py under spiders.
Here spiderName is replaced with first, and after running the command:
first.py contents
import scrapy

class FirstSpider(scrapy.Spider):
    # the spider's name: the unique identifier of this spider source file
    name = "first"
    # allowed domains: restricts which URLs in start_urls may actually be requested; can be commented out
    allowed_domains = ["www.xxx.com"]
    # starting URL list: Scrapy sends requests for these URLs automatically
    start_urls = ["http://www.xxx.com/"]

    # used for data parsing; response is the response object of a successful request,
    # and parse() is called once per URL in start_urls
    def parse(self, response):
        pass
Run the project: scrapy crawl spiderName
(spiderName is the name of the newly created spider file)
Before sending requests, change the robots.txt setting:
in settings.py, ROBOTSTXT_OBEY = True
must be changed to False.
Set the log output level:
add LOG_LEVEL = 'ERROR' to settings.py
Set UA spoofing by filling in a real User-Agent string:
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36'
For example:
import scrapy

class XiaohuaSpider(scrapy.Spider):
    name = "xiaohua"
    #allowed_domains = ["www.xxx.com"]
    start_urls = ["https://www.xiaohua.com/"]

    def parse(self, response):
        divList = response.xpath('//div[@class="content-left"]/div')
        for div in divList:
            user = div.xpath('./div[1]/div//i/text()')[0].extract()
            content = div.xpath('./p/a/text()').extract()
            content = ''.join(content)
            print(user, content)
            break

cd into the xiaohuaPro project directory and run: scrapy crawl xiaohua
Persistent storage:
Via the command line: this can only store parse()'s return value to a file, e.g. scrapy crawl first -o ./wenben.csv
The file extension must be one of: 'json', 'jsonlines', 'jsonl', 'jl', 'csv', 'xml', 'marshal', 'pickle'
For example:
import scrapy

class XiaohuaSpider(scrapy.Spider):
    name = "xiaohua"
    #allowed_domains = ["www.xxx.com"]
    start_urls = ["https://www.xiaohua.com/"]

    def parse(self, response):
        alldata = []
        divList = response.xpath('//div[@class="content-left"]/div')
        for div in divList:
            user = div.xpath('./div[1]/div//i/text()')[0].extract()
            content = div.xpath('./p/a/text()').extract()
            content = ''.join(content)
            #print(user, content)
            #break
            dic = {
                'author': user,
                'content': content
            }
            alldata.append(dic)
        return alldata
Pipeline-based persistence:
- Define the relevant fields in the item class
- Store the parsed data in an item object
- Submit the item object to the pipeline for persistence
- In the pipeline class's process_item(), persist the data carried by the received item
- Enable the pipeline in the settings file
Example:
items.py
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy

class XiaohuaproItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    author = scrapy.Field()
    content = scrapy.Field()
    #pass
xiaohua.py
import scrapy
from xiaohuaPro.items import XiaohuaproItem

class XiaohuaSpider(scrapy.Spider):
    name = "xiaohua"
    #allowed_domains = ["www.xxx.com"]
    start_urls = ["https://www.xiaohua.com/"]

    def parse(self, response):
        divList = response.xpath('//div[@class="content-left"]/div')
        for div in divList:
            user = div.xpath('./div[1]/div//i/text()').extract()
            user = ''.join(user)
            content = div.xpath('./p/a/text()').extract()
            content = ''.join(content)
            #print(user, content)
            item = XiaohuaproItem()
            item['author'] = user
            item['content'] = content
            yield item  # submit the item to the pipeline
settings.py
ITEM_PIPELINES = {
    "xiaohuaPro.pipelines.XiaohuaproPipeline": 300,
}
pipelines.py
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter

class XiaohuaproPipeline:
    fp = None

    # override the parent-class hook: runs once when the spider opens
    def open_spider(self, spider):
        print('begin')
        self.fp = open('./xiaohua.txt', 'w', encoding='utf-8')

    # handles item objects; called once for every item received
    def process_item(self, item, spider):
        author = item['author']
        content = item['content']
        self.fp.write(author + ':' + content + '\n')
        return item

    # runs once when the spider closes
    def close_spider(self, spider):
        print('end')
        self.fp.close()
To also store a copy of the data in MySQL, add the following to pipelines.py:
import pymysql

class XiaohuaproMysqlPipeline:
    conn = None
    cursor = None

    def open_spider(self, spider):
        self.conn = pymysql.Connect(host='127.0.0.1', port=3306, user='root', password='123', db='xiaohua', charset='utf8')

    def process_item(self, item, spider):
        self.cursor = self.conn.cursor()
        try:
            self.cursor.execute('insert into xiaohua values("%s","%s")' % (item['author'], item["content"]))
            self.conn.commit()
        except Exception as e:
            print(e)
            self.conn.rollback()
        return item  # passed on to the next pipeline class in the chain

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()

And update settings.py accordingly:
ITEM_PIPELINES = {
    "xiaohuaPro.pipelines.XiaohuaproPipeline": 300,
    "xiaohuaPro.pipelines.XiaohuaproMysqlPipeline": 301,
}
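The MySQL pipeline assumes a matching table already exists. A one-off creation sketch with pymysql; the column names and sizes are my assumption, chosen to match the INSERT above:
import pymysql

conn = pymysql.Connect(host='127.0.0.1', port=3306, user='root', password='123', db='xiaohua', charset='utf8')
with conn.cursor() as cursor:
    # assumed schema: two text columns in the order the INSERT writes them
    cursor.execute('CREATE TABLE IF NOT EXISTS xiaohua (author VARCHAR(255), content TEXT)')
conn.commit()
conn.close()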
Crawling paginated data
import scrapy

class YhdmSpider(scrapy.Spider):
    name = "yhdm"
    #allowed_domains = ["www.xxx.com"]
    start_urls = ["http://www.yhdmp.net/list/"]
    url = 'https://www.yhdmp.net/list/?pagesize=24&pageindex=%d'
    pageName = 1   # page counter

    def parse(self, response):
        lists = response.xpath('//div[@class="lpic"]/ul/li')
        for li in lists:
            name = li.xpath('./h2/a/text()').extract_first()
            print(name)
        if self.pageName <= 2:
            newUrl = format(self.url % self.pageName)
            self.pageName += 1
            # send the next request manually
            yield scrapy.Request(url=newUrl, callback=self.parse)
Passing data between requests (meta):
The overall idea is as follows; it has not been tested.
import scrapy
from itemPro.items import proItem

class targetSpider(scrapy.Spider):
    name = 'boss'
    start_urls = ['https://www.xxx.com']
    url = 'https://www.xxx.com/page=%d'
    pageNum = 2

    def parseDetail(self, response):
        # pick up the item passed along via meta
        item = response.meta['item']
        desc = response.xpath('//*[@id="myid"]/div//text()').extract()
        desc = ''.join(desc)
        item['desc'] = desc
        yield item

    def parse(self, response):
        lists = response.xpath('//*[@id="myidall"]/div/ul')
        for li in lists:
            item = proItem()
            name = li.xpath('.//div[@class=""]/a/div/text()').extract_first()
            item['name'] = name
            detailUrl = 'https://www.xxx.com' + li.xpath('.//div[@class=""]/a/@href').extract_first()
            # hand the half-filled item to the detail callback through meta
            yield scrapy.Request(detailUrl, callback=self.parseDetail, meta={'item': item})
        if self.pageNum < 3:
            newUrl = format(self.url % self.pageNum)
            self.pageNum += 1
            yield scrapy.Request(newUrl, callback=self.parse)
Images
scchina.py
import scrapy
from picSave.items import PicsaveItem

class ScchinaSpider(scrapy.Spider):
    name = "scchina"
    # allowed_domains = ["www.xxx.com"]
    start_urls = ["https://sc.chinaz.com/tupian/"]

    def parse(self, response):
        lists = response.xpath('//div[contains(@class,"tupian-list")]/div')
        #print(lists)
        for li in lists:
            pic = li.xpath('./img/@data-original').extract_first()
            print(pic)
            item = PicsaveItem()
            item['src'] = 'https:' + pic
            yield item
Override the pipeline class in pipelines.py
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
# from itemadapter import ItemAdapter
# class PicsavePipeline:
#     def process_item(self, item, spider):
#         return item
from scrapy.pipelines.images import ImagesPipeline
import scrapy

# ImagesPipeline: a pipeline class designed for downloading images
class imgsPileLine(ImagesPipeline):
    # request the image data from its URL
    def get_media_requests(self, item, info):
        yield scrapy.Request(item['src'])

    # specify the storage path for the image
    def file_path(self, request, response=None, info=None, *, item=None):
        imgName = request.url.split('/')[-1]
        return imgName

    def item_completed(self, results, item, info):
        return item  # passed on to the next pipeline class in the chain
settings.py
ROBOTSTXT_OBEY = False
LOG_LEVEL = 'ERROR'
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36"
ITEM_PIPELINES = {
    "picSave.pipelines.imgsPileLine": 300,
}
IMAGES_STORE = './imgs'  # folder where downloaded images are stored
items.py
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class PicsaveItem(scrapy.Item):
    # define the fields for your item here like:
    src = scrapy.Field()
    # pass
Downloader middleware
Position: between the engine and the downloader
Role: intercepts every request and response passing through the project
Intercepting requests: UA spoofing, proxy IPs
Intercepting responses: tampering with the response data
A UA pool can apparently be built with the fake-useragent library; a request-interception sketch follows below.
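A minimal sketch of request interception in process_request with a random UA and a proxy; the UA list and the proxy address are placeholders, and this middleware is separate from the wangyi example that follows:
import random

# placeholder pools; in practice these could come from fake-useragent or a proxy provider
USER_AGENT_POOL = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36',
]
PROXY_POOL = ['http://127.0.0.1:8888']   # placeholder proxy

class RandomUaProxyDownloaderMiddleware:
    def process_request(self, request, spider):
        # UA spoofing: rewrite the User-Agent header of every outgoing request
        request.headers['User-Agent'] = random.choice(USER_AGENT_POOL)
        # proxy: set request.meta['proxy'] so the downloader routes through it
        request.meta['proxy'] = random.choice(PROXY_POOL)
        return None   # None lets the request continue through the middleware chain
It would be enabled in DOWNLOADER_MIDDLEWARES the same way as the middleware in the example below.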
Example:
scrapy startproject wangyiPro
cd .\wangyiPro\
scrapy genspider wangyi www.xxx.com
wangyi.py
import scrapy
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from wangyiPro.items import WangyiproItem

class WangyiSpider(scrapy.Spider):
    name = "wangyi"
    #allowed_domains = ["www.xxx.com"]
    start_urls = ["https://news.163.com/"]
    block_urls = []  # URLs of the news section pages

    # instantiate a browser object shared by the whole spider
    def __init__(self, name=None, **kwargs):
        super().__init__(name, **kwargs)
        s = Service('./chromedriver.exe')
        self.bro = webdriver.Chrome(service=s)

    def parse(self, response):
        lists = response.xpath('//*[@id="index2016_wrap"]/div[3]/div[2]/div[2]/div[2]/div/ul/li')
        listindex = [2, 3, 5, 6, 7]
        for index in listindex:
            blockUrl = lists[index].xpath('./a/@href').extract_first()
            self.block_urls.append(blockUrl)
        # request each section page in turn
        for url in self.block_urls:
            #print(url)
            yield scrapy.Request(url=url, callback=self.parse_block)

    # parse the content of each section page; that content is loaded dynamically,
    # so the downloader middleware swaps in a selenium-rendered response
    def parse_block(self, response):
        divList = response.xpath('/html/body/div/div[3]/div[3]/div[1]/div[1]/div/ul/li/div/div')
        for div in divList:
            title = div.xpath('./div/div[1]//text()').extract()
            title = ''.join(title)
            detailUrl = div.xpath('./div/div[1]/h3/a/@href').extract_first()
            item = WangyiproItem()
            item['title'] = title
            # request the detail page
            #print(title, detailUrl)
            yield scrapy.Request(url=detailUrl, callback=self.parse_detail, meta={'item': item})

    def parse_detail(self, response):
        content = response.xpath('//*[@id="content"]/div[2]//text()').extract()
        content = ''.join(content)
        item = response.meta['item']
        item['content'] = content
        yield item

    # called when the spider closes
    def closed(self, reason):
        self.bro.quit()
settings.py
BOT_NAME = "wangyiPro"
SPIDER_MODULES = ["wangyiPro.spiders"]
NEWSPIDER_MODULE = "wangyiPro.spiders"
ROBOTSTXT_OBEY = False
LOG_LEVEL = 'ERROR'
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36"
DOWNLOADER_MIDDLEWARES = {
    "wangyiPro.middlewares.WangyiproDownloaderMiddleware": 543,
}
ITEM_PIPELINES = {
    "wangyiPro.pipelines.WangyiproPipeline": 300,
}
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"
middlewares.py
from scrapy import signals
import time
from scrapy.http import HtmlResponse
# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter

class WangyiproDownloaderMiddleware:
    def process_request(self, request, spider):
        return None

    def process_response(self, request, response, spider):
        bro = spider.bro  # the browser object defined in the spider class
        # pick out the responses that need tampering:
        # the url identifies the request, and the request identifies the response
        if request.url in spider.block_urls:
            bro.get(request.url)  # load the section URL in the real browser
            time.sleep(3)
            pageText = bro.page_source  # now includes the dynamically loaded news data
            # selenium makes it easy to obtain the dynamically loaded data;
            # build a new response from the rendered page
            newResponse = HtmlResponse(url=request.url, body=pageText, encoding='utf-8')
            return newResponse
        else:
            return response

    def process_exception(self, request, exception, spider):
        pass
pipelines.py
from itemadapter import ItemAdapter

class WangyiproPipeline:
    def process_item(self, item, spider):
        print(item)
        return item

items.py
import scrapy

class WangyiproItem(scrapy.Item):
    # define the fields for your item here like:
    title = scrapy.Field()
    content = scrapy.Field()
    #pass
Finally, run:
scrapy crawl wangyi
The browser driver (chromedriver.exe) must be placed in the same directory as scrapy.cfg.
crawlspider
Mainly used for whole-site crawling.
Create the spider with: scrapy genspider -t crawl name www.xxx.com
Rule(): a rule parser that processes the links the link extractor has extracted, according to the given rule
LinkExtractor: a link extractor that extracts links matching the given rule (allow)
follow=True: keeps applying the link extractor to the pages behind the links it has already extracted
The idea is as follows:
spider.py
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from sunpro.items import DetailItem, SunproItem

class SunSpider(CrawlSpider):
    name = "sun"
    #allowed_domains = ["www.xxx.com"]
    start_urls = ["http://wz.sun0769.com/"]
    rules = (
        Rule(LinkExtractor(allow=r"type=4&page=\d+"), callback="parse_item", follow=True),
        Rule(LinkExtractor(allow=r"question/\d+/\d+\.shtml"), callback="parse_detail", follow=False),
    )

    def parse_item(self, response):
        # the XPath expressions are left empty in the notes
        trlist = response.xpath('')
        for tr in trlist:
            num = tr.xpath('').extract_first()
            title = tr.xpath('').extract_first()
            item = SunproItem()
            item['title'] = title
            item['num'] = num
            yield item

    def parse_detail(self, response):
        id = response.xpath('').extract_first()
        content = response.xpath('').extract()
        content = ''.join(content)
        item = DetailItem()
        item['content'] = content
        item['id'] = id
        yield item
items.py
import scrapy

class SunproItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    num = scrapy.Field()

class DetailItem(scrapy.Item):
    id = scrapy.Field()
    content = scrapy.Field()
pipelines.py
from itemadapter import ItemAdapter

class SunproPipeline:
    def process_item(self, item, spider):
        # distinguish the two item types by class name
        if item.__class__.__name__ == 'DetailItem':
            print(item['id'], item['content'])
        else:
            print(item['num'], item['title'])
        return item
Distributed crawling
Requires installing scrapy-redis.
The overall workflow:
- Create a project
- Create a CrawlSpider-based spider file
- Modify the spider file:
  - import: from scrapy_redis.spiders import RedisCrawlSpider
  - comment out start_urls and allowed_domains
  - add a new attribute: redis_key = 'sun', the name of the shared scheduler queue
  - write the data-parsing logic
  - change the spider's parent class to RedisCrawlSpider
- Modify settings.py (see the consolidated sketch after this list):
  - specify the shared pipeline: ITEM_PIPELINES = {'scrapy_redis.pipelines.RedisPipeline': 400}
  - specify the scheduler:
    add the dedup-container class: DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
    use scrapy_redis's own scheduler: SCHEDULER = "scrapy_redis.scheduler.Scheduler"
    make the scheduler persistent, i.e. do not clear redis when the crawl ends: SCHEDULER_PERSIST = True
- Redis-related configuration:
  - edit the redis config file (on Windows: redis.windows.conf):
    remove the "bind 127.0.0.1" line
    turn off protected mode: change protected-mode yes to no
  - start the redis server with that config file: redis-server <config file>
  - start the client: redis-cli
- Run the spider:
  scrapy runspider xxx.py
- Push a starting URL into the scheduler's queue;
  in the redis client, run:
  lpush xxx www.xxx.com
- The scraped data is stored in redis under the proName:items key
If redis is not on the local machine, point to it in settings.py:
REDIS_HOST = '127.0.0.1'  # the redis server's IP
REDIS_PORT = 6379
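A consolidated settings.py sketch for the scrapy-redis pieces above; the setting names follow the steps in the list, and the redis host is a placeholder:
# settings.py additions for scrapy-redis (sketch)
ITEM_PIPELINES = {
    'scrapy_redis.pipelines.RedisPipeline': 400,   # shared pipeline writes items into redis
}
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"   # shared dedup container
SCHEDULER = "scrapy_redis.scheduler.Scheduler"               # shared scheduler
SCHEDULER_PERSIST = True    # keep the redis queue/fingerprints after the crawl ends

REDIS_HOST = '192.168.1.100'   # placeholder: the machine running redis
REDIS_PORT = 6379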
Incremental crawling
Concept: detect whether the site has new data and only crawl what has been added since the last run.
Approach: store the URLs (or records) that have already been requested, e.g. in a set or a redis set, and check against that store before each new request; skip anything already seen. See the sketch below.
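A minimal sketch of the idea using a redis set as the seen-URL store; the redis connection details and the detail_urls list are placeholders:
import redis
import requests

conn = redis.Redis(host='127.0.0.1', port=6379)    # placeholder connection
headers = {'User-Agent': 'Mozilla/5.0'}             # placeholder UA

detail_urls = ['https://example.com/detail/1', 'https://example.com/detail/2']  # placeholder list
for url in detail_urls:
    # sadd returns 1 if the url was new, 0 if it was already in the set
    if conn.sadd('seen_urls', url) == 1:
        page = requests.get(url=url, headers=headers).text
        print('new page crawled:', url)
    else:
        print('already crawled, skipping:', url)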
Packaging an exe
Use pyinstaller to build a standalone executable.
Install pyinstaller: pip install pyinstaller
Package the main script: pyinstaller -F main.py
Packaging can be slow; if the progress output stalls, pressing Enter sometimes helps.
When it finishes, double-click main.exe in the dist folder to run it.
Shortcut keys