import urllib.request as request
from bs4 import BeautifulSoup as bs
import asyncio
import aiohttp, re


async def getPage(url, res_list):
    print(url)
    headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
    # conn = aiohttp.ProxyConnector(proxy="http://127.0.0.1:8087")
    async with aiohttp.ClientSession() as session:
        async with session.get(url, headers=headers) as resp:
            assert resp.status == 200
            # collect the page source into the shared result list
            res_list.append(await resp.text())


async def getTitle(url, res_list):
    print(url)
    headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
    # conn = aiohttp.ProxyConnector(proxy="http://127.0.0.1:8087")
    async with aiohttp.ClientSession() as session:
        async with session.get(url, headers=headers) as resp:
            assert resp.status == 200
            html = await resp.text()
            # extract the page title from the article HTML
            match = re.search('<title>(.*?)</title>', html)
            title = match.group(1) if match else ''
            print(title)
            # with open('title.txt','a+') as f:
            #     print(title, url)
            #     f.write(title + "," + url + "\n")
            # print(type(await resp.text()))
            # res_list.append(await resp.text())


class parseListPage():
    def __init__(self, page_str):
        self.page_str = page_str

    def __enter__(self):
        page_str = self.page_str
        page = bs(page_str, 'lxml')
        # collect the article links from the list page
        articles = page.select('.txtList30 li')
        art_urls = []
        for a in articles:
            x = a.find('a')['href']
            art_urls.append(x)
        return art_urls

    def __exit__(self, exc_type, exc_val, exc_tb):
        pass


page_num = 100
page_url_base = 'http://news.artron.net/morenews/list728/p'
page_urls = [page_url_base + str(i + 1) for i in range(page_num)]
loop = asyncio.get_event_loop()

# stage 1: download all list pages concurrently
ret_list = []
tasks = [getPage(host, ret_list) for host in page_urls]
print(tasks)
loop.run_until_complete(asyncio.wait(tasks))

# parse every list page and collect the article URLs
articles_url = []
for ret in ret_list:
    with parseListPage(ret) as tmp:
        articles_url += tmp

# stage 2: download every article and print its title
ret_list = []
tasks = [getTitle(url, ret_list) for url in articles_url]
loop.run_until_complete(asyncio.wait(tasks))
loop.close()
# Example 0
import asyncio
import aiohttp, time

NUMBERS = range(12)

'''
1. Adding the async keyword to a function turns it into an asynchronous function (a coroutine).
Each thread has one event loop; calling asyncio.get_event_loop() in the main thread creates it.
You hand the asynchronous tasks to the loop's run_until_complete method, and the event loop schedules the coroutines.
As the method name says, it only returns once the asynchronous tasks have completed.
await asyncio.wait(blocking_tasks) runs those blocking tasks cooperatively until they are all done.
'''

URL = 'http://httpbin.org/get?a={}'


async def fetch_async(a):
    async with aiohttp.ClientSession() as session:
        async with session.get(URL.format(a)) as r:
            data = await r.json()
            # Wherever we want to allow a coroutine switch, we use the await keyword.
            # Here r.json() waits on I/O (the network request is still in flight),
            # so the loop can switch to other work and come back once the data arrives.
            return data['args']['a']


start = time.time()
event_loop = asyncio.get_event_loop()  # creates the event loop
tasks = [fetch_async(num) for num in NUMBERS]
results = event_loop.run_until_complete(asyncio.gather(*tasks))

for num, result in zip(NUMBERS, results):
    print('fetch({}) = {}'.format(num, result))
print('Use asyncio aiohttp : {}'.format(time.time() - start))
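The docstring above describes the older pattern of fetching the loop with asyncio.get_event_loop() and driving it with run_until_complete. On Python 3.7 and later the same example can be run with asyncio.run, which creates the loop, runs the coroutine, and closes the loop in one call. A minimal sketch reusing fetch_async and NUMBERS from the example above:

import asyncio

async def main():
    # schedule all fetches concurrently; gather returns the results in order
    results = await asyncio.gather(*(fetch_async(num) for num in NUMBERS))
    for num, result in zip(NUMBERS, results):
        print('fetch({}) = {}'.format(num, result))

asyncio.run(main())  # Python 3.7+: creates, runs, and closes the event loop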
Reference: http://blog.csdn.net/u014595019/article/details/52295642