asyncio + aiohttp async crawler examples

import urllib.request as request
from bs4 import BeautifulSoup as bs
import asyncio
import aiohttp
import re

async def getPage(url, res_list):
    # Fetch one listing page and append its HTML to res_list.
    print(url)
    headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
    # conn = aiohttp.ProxyConnector(proxy="http://127.0.0.1:8087")
    async with aiohttp.ClientSession() as session:
        async with session.get(url, headers=headers) as resp:
            assert resp.status == 200
            res_list.append(await resp.text())
 
async def getTitle(url, res_list):
    # Fetch one article page and print the contents of its <title> tag.
    print(url)
    headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
    # conn = aiohttp.ProxyConnector(proxy="http://127.0.0.1:8087")
    async with aiohttp.ClientSession() as session:
        async with session.get(url, headers=headers) as resp:
            assert resp.status == 200
            html = await resp.text()
            title = re.search("<title>(.*?)</title>", html, re.S).group(1)
            print(title)
            # with open('title.txt', 'a+') as f:
            #     print(title, url)
            #     f.write(title + "," + url + "\n")
            # print(type(await resp.text()))
            # res_list.append(await resp.text())
 
 
class parseListPage():
    # Context manager that parses a listing page and returns the article URLs on it.
    def __init__(self, page_str):
        self.page_str = page_str

    def __enter__(self):
        page = bs(self.page_str, 'lxml')
        # Collect the article links
        articles = page.select('.txtList30 li')
        art_urls = []
        for a in articles:
            art_urls.append(a.find('a')['href'])
        return art_urls

    def __exit__(self, exc_type, exc_val, exc_tb):
        pass
 
 
page_num = 100
page_url_base = 'http://news.artron.net/morenews/list728/p'
page_urls = [page_url_base + str(i + 1) for i in range(page_num)]

loop = asyncio.get_event_loop()

# Step 1: fetch all listing pages concurrently.
ret_list = []
tasks = [getPage(url, ret_list) for url in page_urls]
print(tasks)
loop.run_until_complete(asyncio.wait(tasks))

# Step 2: pull the article links out of every listing page.
articles_url = []
for ret in ret_list:
    with parseListPage(ret) as tmp:
        articles_url += tmp

# Step 3: fetch every article and print its title.
ret_list = []
tasks = [getTitle(url, ret_list) for url in articles_url]
loop.run_until_complete(asyncio.wait(tasks))
loop.close()
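
The crawler above opens a fresh ClientSession for every request and launches all of the page tasks at once. A common refinement is to share one session and bound the number of in-flight requests; the sketch below shows that pattern, assuming a concurrency limit of 10 and the helper names fetch_limited / crawl, none of which appear in the original script.

# Sketch only: shared session plus bounded concurrency (limit of 10 is an assumed value).
import asyncio
import aiohttp

async def fetch_limited(session, sem, url):
    # The semaphore caps how many requests are in flight at the same time.
    async with sem:
        async with session.get(url) as resp:
            resp.raise_for_status()  # raise on a bad status instead of using assert
            return await resp.text()

async def crawl(urls, limit=10):
    sem = asyncio.Semaphore(limit)
    async with aiohttp.ClientSession() as session:
        # One session is reused for every request instead of one per URL.
        return await asyncio.gather(*(fetch_limited(session, sem, u) for u in urls))

# pages = asyncio.get_event_loop().run_until_complete(crawl(page_urls))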
 
# Example 0

import asyncio
import aiohttp
import time

NUMBERS = range(12)
'''
1. Adding the async keyword turns a function into a coroutine function.
   Each thread has its own event loop; calling asyncio.get_event_loop() in the main
   thread creates it. You hand the asynchronous tasks to the loop's run_until_complete
   method, which schedules the coroutines and, as the name says, only returns once
   they have all finished. await asyncio.wait(blocking_tasks) runs those tasks
   cooperatively until they are done.
'''
URL = 'http://httpbin.org/get?a={}'

async def fetch_async(a):
    async with aiohttp.ClientSession() as session:
        async with session.get(URL.format(a)) as r:
            # await marks a point where the coroutine is willing to be suspended.
            # Here r.json() waits on network I/O, so the loop can switch to other
            # coroutines and resume this one once the response body has arrived.
            data = await r.json()
            return data['args']['a']

start = time.time()
event_loop = asyncio.get_event_loop()  # creates the event loop
tasks = [fetch_async(num) for num in NUMBERS]

results = event_loop.run_until_complete(asyncio.gather(*tasks))
for num, result in zip(NUMBERS, results):
    print('fetch({}) = {}'.format(num, result))
print('Use asyncio aiohttp : {}'.format(time.time() - start))
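
The commentary in Example 0 describes the classic get_event_loop() / run_until_complete() pairing. On Python 3.7+ the same timing experiment can be driven by asyncio.run(), which creates, runs and closes the loop in one call; the sketch below is that variant, not part of the original post.

# Sketch: the httpbin timing example rewritten around asyncio.run (Python 3.7+).
import asyncio
import aiohttp
import time

URL = 'http://httpbin.org/get?a={}'

async def fetch_async(a):
    async with aiohttp.ClientSession() as session:
        async with session.get(URL.format(a)) as r:
            data = await r.json()
            return data['args']['a']

async def main():
    # gather schedules all twelve requests concurrently and keeps their order.
    return await asyncio.gather(*(fetch_async(n) for n in range(12)))

start = time.time()
results = asyncio.run(main())  # no explicit loop management needed
print(results, 'in', time.time() - start, 'seconds')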

Reference: http://blog.csdn.net/u014595019/article/details/52295642



