# Learning aiohttp
# Step 1: Installation
#pip install aiohttp -i https://pypi.douban.com/simple
#pip install cchardet -i https://pypi.douban.com/simple
# Client
import aiohttp
import asyncio
async def fetch(session, url):
    async with session.get(url) as response:
        return await response.text()
# Basic usage
# async def main():
#     async with aiohttp.ClientSession() as session:
#         html = await fetch(session, "http://httpbin.org/headers")
#         print(html)
'''
{
"headers": {
"Accept": "*/*",
"Accept-Encoding": "gzip, deflate",
"Host": "httpbin.org",
"User-Agent": "Python/3.9 aiohttp/3.7.3",
"X-Amzn-Trace-Id": "Root=1-602b6a92-01f81ee520af7312137b8421"
}
}
'''
# Reading the page content
# async def main():
#     async with aiohttp.ClientSession() as session:
#         async with session.get('http://httpbin.org/get') as resp:
#             print(resp.status)  # status code
#             print(await resp.text(encoding='utf-8'))
'''
200
{
"args": {},
"headers": {
"Accept": "*/*",
"Accept-Encoding": "gzip, deflate",
"Host": "httpbin.org",
"User-Agent": "Python/3.9 aiohttp/3.7.3",
"X-Amzn-Trace-Id": "Root=1-602b6c17-66a4d09013f941f320068d7f"
},
"origin": "171.44.106.55",
"url": "http://httpbin.org/get"
}
'''
# For non-text (binary) content, simply replace the text() method with read().
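# A minimal sketch of that (the function name fetch_bytes is illustrative, not from the original):
# async def fetch_bytes(session, url):
#     async with session.get(url) as response:
#         return await response.read()  # returns bytes instead of str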
# Customizing requests
# Custom headers
# async def main():
#     async with aiohttp.ClientSession() as session:
#         url = "http://httpbin.org/"
#         headers = {
#             "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
#                           "AppleWebKit/537.36 (KHTML, like Gecko)"
#                           " Chrome/78.0.3904.108 Safari/537.36"
#         }
#         await session.post(url, headers=headers)
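# Headers can also be set once on the session itself, so every request made with it
# sends them automatically (aiohttp.ClientSession accepts a headers argument; this is
# the style used in the async Douban crawler further below):
# async with aiohttp.ClientSession(headers=headers) as session:
#     await session.get("http://httpbin.org/headers")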
# Custom cookies
async def main():
    url = 'http://httpbin.org/cookies'
    cookies = {'cookies_are': 'working'}
    async with aiohttp.ClientSession(cookies=cookies) as session:
        async with session.get(url) as resp:
            assert await resp.json() == {
                "cookies": {"cookies_are": "working"}}
asyncio.run(main())
# One function is used to issue the request, and another to download the page.
# Synchronous crawler
# Crawling Douban movies
from datetime import datetime
import requests
from bs4 import BeautifulSoup
from lxml import etree
# Request headers
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit"
"/537.36 (KHTML, like Gecko) "
"Chrome/72.0.3626.121 Safari/537.36"}
def get_movie_url():
    req_url = 'https://movie.douban.com/chart'
    response = requests.get(url=req_url, headers=headers)
    html = etree.HTML(response.text)
    movies_url = html.xpath('//*[@id="content"]/div/div[1]/div/div/table/tr/td/a/@href')
    return movies_url
def get_movie_content(movie_url):
    response = requests.get(movie_url, headers=headers)
    result = etree.HTML(response.text)
    movie = dict()
    name = result.xpath('//*[@id="content"]/h1/span[1]/text()')
    author = result.xpath('//*[@id="info"]/span[1]/span[2]//text()')
    movie["name"] = name
    movie["author"] = author
    return movie
if __name__ == '__main__':
    start = datetime.now()
    movie_url_list = get_movie_url()
    movies = dict()
    for url in movie_url_list:
        movies[url] = get_movie_content(url)
    print(movies)
    print("Synchronous version took: {}".format(datetime.now() - start))
'''
{'https://movie.douban.com/subject/30458949/': {'name': ['无依之地 Nomadland'], 'author': ['赵婷']}, 'https://movie.douban.com/subject/30257787/': {'name': ['一秒钟'], 'author': ['张艺谋']}, 'https://movie.douban.com/subject/30443686/': {'name': ['穷途鼠的奶酪梦 窮鼠はチーズの夢を見る'], 'author': ['行定勋']}, 'https://movie.douban.com/subject/34869387/': {'name': ['女人的碎片 Pieces of a Woman'], 'author': ['凯内尔·穆德卢佐']}, 'https://movie.douban.com/subject/33408026/': {'name': ['刻在你心底的名字'], 'author': ['柳广辉']}, 'https://movie.douban.com/subject/34894753/': {'name': ['沐浴之王'], 'author': ['易小星']}, 'https://movie.douban.com/subject/35211578/': {'name': ['逃避虽可耻但有用 新春特别篇 逃げるは恥だが役に立つ ガンバレ人類! 新春スペシャル!!'], 'author': ['金子文纪']}, 'https://movie.douban.com/subject/30450313/': {'name': ['前程似锦的女孩 Promising Young Woman'], 'author': ['埃默拉尔德·芬内尔']}, 'https://movie.douban.com/subject/35275115/': {'name': ['2020去死 Death to 2020'], 'author': ['阿尔·坎贝尔', ' / ', 'Alice Mathias']}, 'https://movie.douban.com/subject/34982759/': {'name': ["玫瑰岛的不可思议的历史 L'incredibile storia dell'isola delle rose"], 'author': ['希德尼·希比利亚']}}
Synchronous version took: 0:00:08.478348
'''
# Convert the synchronous code to async and compare the speed
import asyncio
from datetime import datetime
import aiohttp
from lxml import etree
# Request headers
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit"
"/537.36 (KHTML, like Gecko) "
"Chrome/72.0.3626.121 Safari/537.36"}
async def get_movie_url():
    req_url = 'https://movie.douban.com/chart'
    async with aiohttp.ClientSession(headers=headers) as session:
        async with session.get(url=req_url, headers=headers) as response:
            result = await response.text()
            result = etree.HTML(result)
            return result.xpath('//*[@id="content"]/div/div[1]/div/div/table/tr/td/a/@href')
async def get_movie_content(movie_url):
    async with aiohttp.ClientSession(headers=headers) as session:
        async with session.get(url=movie_url, headers=headers) as response:
            result = await response.text()
            result = etree.HTML(result)
            movie = dict()
            name = result.xpath('//*[@id="content"]/h1/span[1]//text()')
            author = result.xpath('//*[@id="info"]/span[1]/span[2]//text()')
            movie["name"] = name
            movie["author"] = author
            return movie
if __name__ == '__main__':
    start = datetime.now()
    loop = asyncio.get_event_loop()
    # run_until_complete drives the loop until the future finishes, then returns its result.
    movie_url_list = loop.run_until_complete(get_movie_url())
    tasks = [get_movie_content(url) for url in movie_url_list]
    movies = loop.run_until_complete(asyncio.gather(*tasks))
    print(movies)
    print("Asynchronous version took: {}".format(datetime.now() - start))
'''
[{'name': ['无依之地 Nomadland'], 'author': ['赵婷']}, {'name': ['一秒钟'], 'author': ['张艺谋']}, {'name': ['穷途鼠的奶酪梦 窮鼠はチーズの夢を見る'], 'author': ['行定勋']}, {'name': ['女人的碎片 Pieces of a Woman'], 'author': ['凯内尔·穆德卢佐']}, {'name': ['刻在你心底的名字'], 'author': ['柳广辉']}, {'name': ['沐浴之王'], 'author': ['易小星']}, {'name': ['逃避虽可耻但有用 新春特别篇 逃げるは恥だが役に立つ ガンバレ人類! 新春スペシャル!!'], 'author': ['金子文纪']}, {'name': ['前程似锦的女孩 Promising Young Woman'], 'author': ['埃默拉尔德·芬内尔']}, {'name': ['2020去死 Death to 2020'], 'author': ['阿尔·坎贝尔', ' / ', 'Alice Mathias']}, {'name': ["玫瑰岛的不可思议的历史 L'incredibile storia dell'isola delle rose"], 'author': ['希德尼·希比利亚']}]
Asynchronous version took: 0:00:01.431173
'''
# Understanding async
'''
Reference blog: https://www.cnblogs.com/xinghun85/p/9937741.html
'''
import asyncio
from datetime import datetime
import requests
# Example
async def test2(i):
    r = await other_test(i)
    print("1", i, r)
async def other_test(i):
    # requests.get is a blocking call; it is used here only to illustrate execution order
    r = requests.get(i)
    print("2", i)
    await asyncio.sleep(4)
    print("3", datetime.now() - start)
    return r
url = ["https://segmentfault.com/p/1210000013564725",
"https://www.jianshu.com/p/83badc8028bd",
"https://www.baidu.com/"]
loop = asyncio.get_event_loop()
task = [asyncio.ensure_future(test2(i)) for i in url]
start =datetime.now()
loop.run_until_complete(asyncio.wait(task))
endtime = datetime.now()-start
print(endtime)
loop.close()
# aiohttp crawler in practice
# Reference blog: https://blog.csdn.net/qq_36772866/article/details/105355445?ops_request_misc=%257B%2522request%255Fid%2522%253A%2522161346654316780271553327%2522%252C%2522scm%2522%253A%252220140713.130102334.pc%255Fblog.%2522%257D&request_id=161346654316780271553327&biz_id=0&utm_medium=distribute.pc_search_result.none-task-blog-2~blog~first_rank_v1~rank_blog_v1-3-105355445.pc_v1_rank_blog_v1&utm_term=%E7%88%AC%E8%99%AB&spm=1018.2226.3001.4450
# Step 1: Inspect the page and locate the div that holds the novel information
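# The selectors used in saveData below assume each book on the page sits in a block roughly
# like the following (structure inferred from the parsing code, not copied from the site):
# <div class="book-mid-info">
#     <h4><a>novel title</a></h4>
#     <p><a class="name">author</a> ...</p>
#     <p class="update">last update time</p>
# </div>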
# Imports
import asyncio
import csv
import aiohttp
from bs4 import BeautifulSoup
# Crawl the site content
async def getData(url, headers):
    # Create a session object
    async with aiohttp.ClientSession() as session:
        # Send a GET request with the given request headers
        async with session.get(url, headers=headers) as response:
            # Return the response body
            return await response.text()
# Save the data
def saveData(result):
    # Open the csv file once, instead of re-opening it in 'w' mode for every row
    # (which would overwrite the file each time and keep only the last record)
    csvFile = open(r'C:\Users\zxy\Desktop\data.csv', 'w', encoding='utf-8-sig', newline='')
    writer = csv.writer(csvFile)
    for i in result:
        soup = BeautifulSoup(i, 'lxml')
        find_div = soup.find_all('div', class_='book-mid-info')
        for d in find_div:
            # Novel title
            name = d.find('h4').getText()
            # Author
            author = d.find('a', class_='name').getText()
            # Last update time
            update = d.find('p', class_='update').getText()
            # Write one row to the csv
            writer.writerow([name, author, update])
    csvFile.close()
# Create the async tasks and save the data
def run():
    for i in range(25):
        # Build a different url for each page, pass it to getData, and let asyncio execute it
        task = asyncio.ensure_future(getData(url.format(i + 1), headers))
        # Append every request to the tasks list
        tasks.append(task)
    # Once all requests have finished, gather returns the responses
    result = loop.run_until_complete(asyncio.gather(*tasks))
    saveData(result)
    print(len(result))
if __name__ == '__main__':
    # Request headers
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36'
    }
    # Task list
    tasks = []
    # url template
    url = 'https://www.qidian.com/rank/hotsales?page={}'
    # Event loop object
    loop = asyncio.get_event_loop()
    # Call the run function
    run()
# Here the downloader middleware needs to be modified
import requests
# Now we create a middleware which uses requests to fetch a URL that takes 5 seconds to respond:
# class TestAiohttp:
#     def get_ip(self):
#         requests.get('http://httpbin.org/delay/5').json()
#
#     def process_request(self, request, spider):
#         print("Requesting a URL that takes 5 seconds to respond")
#         self.get_ip()
# Replace requests with aiohttp
import asyncio
import aiohttp
class TestAiohttp:
    async def get_ip(self):
        async with aiohttp.ClientSession() as client:
            resp = await client.get('http://httpbin.org/delay/5')
            result = await resp.json()
            print(result)
    async def process_request(self, request, spider):
        print('Start requesting a URL with a 5-second delay')
        await asyncio.create_task(self.get_ip())
# Spider code
from datetime import datetime
import scrapy
class ExerciseSpider(scrapy.Spider):
    name = 'exercise'
    # allowed_domains = ['v.qq.com']
    # Follow-up URL
    start_urls = ['http://exercise.com']
    def start_requests(self):
        for page in range(1, 10):
            url = f'http://exercise.kingname.info/exercise_middleware_ip/{page}'
            yield scrapy.Request(url)
    def parse(self, response):
        now = datetime.now()
        print('Current time:', now, response.text)
'''
Without the downloader middleware, each request takes only about one second.
Inside the downloader middleware we request a URL that takes five seconds to respond.
With plain requests, every request would be blocked for five seconds.
After switching to aiohttp, five concurrent requests start at once (the maximum
concurrency here is set to 5), so "Start requesting a URL with a 5-second delay" is
printed five times almost simultaneously. After roughly five seconds those five requests
finish at nearly the same time and their responses are printed together. After that,
the remaining requests complete at about one per second.
'''
# Settings (settings.py)
# Enable the custom downloader middleware
DOWNLOADER_MIDDLEWARES = {
    'aioTest.middlewares.TestAiohttp': 543
}
# Enable the asyncio reactor
TWISTED_REACTOR = 'twisted.internet.asyncioreactor.AsyncioSelectorReactor'
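# The walkthrough above assumes a maximum concurrency of 5; that value is not shown in
# the original settings, but in Scrapy it would be configured like this:
# CONCURRENT_REQUESTS = 5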
# Launcher script
import asyncio
from scrapy import cmdline
import sys
if sys.platform == 'win32':
    # Without this line I get an error on Windows
    asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
cmdline.execute('scrapy crawl exercise'.split())