, html, re.S | re.M)
    for username, level, laugh_count, comment_count, content in zip(
            usernames, levels, laugh_counts, comment_counts, contents):
        information = {
            "username": username.strip(),
            "level": level.strip(),
            "laugh_count": laugh_count.strip(),
            "comment_count": comment_count.strip(),
            "content": content.strip()
        }
        duanzi_list.append(information)
    time.sleep(1)
    return duanzi_list
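# A quick single-page smoke test for the parser above (a sketch; it assumes
# the imports noted at the end of the listing, and that the page layout still
# matches the regexes used in the earlier part of scrap_qiushi_info):
#
#   posts = scrap_qiushi_info('https://www.qiushibaike.com/text/page/1')
#   print(len(posts), posts[0]["username"] if posts else "no posts parsed")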
def normal_scapper(url_lists):
    '''
    Driver function: crawl the data with the plain, single-process scraper.
    '''
    begin_time = time.time()
    for url in url_lists:
        scrap_qiushi_info(url)
    end_time = time.time()
    print("Total time taken by the plain scraper: %f" % (end_time - begin_time))
def muti_process_scapper(url_lists, process_num=2):
    '''
    Multi-process driver: crawl the web data with the multiprocessing module.
    '''
    begin_time = time.time()
    pool = Pool(processes=process_num)
    pool.map(scrap_qiushi_info, url_lists)
    pool.close()  # no more tasks; let the worker processes exit
    pool.join()
    end_time = time.time()
    print("Time taken scraping with %d processes: %s" % (process_num, (end_time - begin_time)))
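# Note: Pool.map blocks until every worker has returned, so the elapsed time
# printed above covers the whole scrape. On platforms that spawn worker
# processes (Windows, and macOS on recent Pythons), multiprocessing
# re-imports this module, which is why the if __name__ == "__main__" guard
# at the bottom of the script is required for the multiprocessing run.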
def main():
    '''
    main(): program entry point. Build the url list with a list
    comprehension, then call both scraper drivers.
    '''
    url_lists = ['https://www.qiushibaike.com/text/page/{}'.format(i)
                 for i in range(1, 11)]
    normal_scapper(url_lists)
    muti_process_scapper(url_lists, process_num=2)

if __name__ == "__main__":
    main()
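# The imports this listing relies on live in the earlier, omitted part of the
# script; based on the calls above they must include at least the following
# (requests is an assumption -- any HTTP client that produced `html` would do):
#
#   import re
#   import time
#   import requests
#   from multiprocessing import Pool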