完成目标:
使用协程对网站进行抓取,加快执行效率,提取评论,制作词云。
《巾帼枭雄之义海豪情》是《巾帼枭雄》的姐妹篇。
该剧为TVB四十三周年台庆剧,也是2010年节目巡礼剧集之一。
故事以三十年代的广州为背景。郑九妹(邓萃雯 饰)是广州最大黑道势力的郑朗军(岳华 饰)的大女儿,打理家族鸦片生意。局势动荡,日军大举侵华,大佐向山铁也(金刚 饰)利用鸦片生意赚取大量军费及蚕食中国人身心,与九妹继续开设鸦片烟馆,九妹被视为汉奸,而她所做的一切背后另有目的……
九妹同父异母三弟郑少杰(张松枝 饰)向来与之不和,打理家族白鸽标公司。警察局的刑侦队长刘醒(黎耀祥 饰)嗜赌白鸽标,一日让新下属唐吉(敖嘉年 饰)买白鸽标,但钱被扒走,唐吉改动过期白鸽标希望瞒天过海;但假白鸽标竟中奖,刘醒领奖金不果,与少杰大吵起来,九妹指明刘醒作假。刘醒对九妹留下更深印象,从此两人便结下不解之缘……
日本侵华、广州沦陷,最坏的时代,往往亦是展现人性的最佳时机……
——摘自豆瓣简介
编辑器:pycharm
用到的库:aiohttp、lxml、wordcloud、jieba、asyncio
使用协程实现异步爬取
async def get_remark(url):
    """Download one comment page and append its comments to ``remark.txt``.

    The page at *url* is requested with browser-like headers, parsed as HTML,
    and every text node matching ``//*[@id="comments"]//p/span/text()`` is
    written to ``remark.txt`` (append mode, UTF-8), one comment per line with
    embedded newlines removed.

    Args:
        url: Address of the comment page to fetch.

    Returns:
        None. The only effect is the lines appended to ``remark.txt``.
    """
    # NOTE(review): the Cookie below looks like a personal logged-in session;
    # consider loading it from the environment instead of committing it.
    request_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36 Edg/93.0.961.44",
        "Cookie": 'bid=y5MzJD8qtho; ll="118161"; _vwo_uuid_v2=DF03C7A4E3BC2DADAF50824473DEC1B26|3961db6f04b97501d92456a91fee8865; ct=y; push_noty_num=0; push_doumail_num=0; douban-fav-remind=1; ps=y; __utmv=30149280.24514; dbcl2="245147785:ot0qsmPKgCM"; gr_user_id=6f6d3a67-18d3-41f9-a4f3-80f25dcf6b04; UM_distinctid=17bd4348a12141-0fef2abcc607ae-5734174f-144000-17bd4348a13fb; ck=p5Gs; ap_v=0,6.0; __utma=30149280.1481648560.1628930427.1631352817.1631368641.26; __utmc=30149280; __utmz=30149280.1631368641.26.10.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utma=223695111.60511358.1628930427.1630324138.1631368669.24; __utmb=223695111.0.10.1631368669; __utmc=223695111; __utmz=223695111.1631368669.24.7.utmcsr=douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/search; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1631368669%2C%22https%3A%2F%2Fwww.douban.com%2Fsearch%3Fsource%3Dsuggest%26q%3D%25E4%25B9%2589%25E6%25B5%25B7%25E8%25B1%25AA%25E6%2583%2585%22%5D; _pk_ses.100001.4cf6=*; __utmb=30149280.4.10.1631368641; _pk_id.100001.4cf6=4277457464aecd3e.1628930427.23.1631369077.1630324137.',
    }

    # Only the body download needs the network; parsing happens afterwards.
    async with aiohttp.ClientSession() as session:
        async with session.get(url, headers=request_headers) as response:
            html = await response.text()

    comments = etree.HTML(html).xpath('//*[@id="comments"]//p/span/text()')
    # Flatten each comment onto a single line so the file is one-per-line.
    lines = [comment.replace("\n", "") + "\n" for comment in comments]

    with open("remark.txt", "a", encoding="utf-8") as out:
        out.writelines(lines)
读取remark.txt文件并制作词云
def stopwordslist():
    """Load the stop-word list from ``Chinesestopword.txt``.

    The file is read as UTF-8 from the current working directory.

    Returns:
        list[str]: One entry per line of the file, in file order, with
        leading and trailing whitespace stripped. Blank lines yield ``""``.

    Raises:
        OSError: If ``Chinesestopword.txt`` cannot be opened.
    """
    # A context manager guarantees the handle is closed once reading is done.
    with open('Chinesestopword.txt', encoding='UTF-8') as fp:
        return [line.strip() for line in fp]
def get_pic():
    """Render the comments in ``remark.txt`` as a word cloud image.

    Steps:
      1. Read ``remark.txt`` (UTF-8) in full.
      2. Segment the text with ``jieba.lcut``.
      3. Drop every token present in the list returned by ``stopwordslist()``.
      4. Join the remaining tokens, each followed by a single space, and feed
         the result to ``wordcloud.WordCloud(...).generate``.
      5. Save the image as ``remark.jpg``.

    Returns:
        None. The only effect is the ``remark.jpg`` file written to disk.
    """
    with open("remark.txt", "r", encoding="utf-8") as fp:
        data = fp.read()

    # A set makes each membership test O(1); the stop-word list is checked
    # once per token, so a list lookup would scale with both sizes.
    stopwords = set(stopwordslist())

    # Build the text in one pass; every kept token is followed by a space
    # (including the last one), which is what the cloud generator receives.
    new_data_list = "".join(
        word + " " for word in jieba.lcut(data) if word not in stopwords
    )

    wcd = wordcloud.WordCloud(
        font_path="simkai.ttf",
        colormap="brg",
        width=800,
        height=400,
        max_words=200,
        background_color="white",
        scale=16,
    ).generate(new_data_list)
    wcd.to_file('remark.jpg')
if __name__ == '__main__':
    # Ten comment pages of 20 comments each (start = 0, 20, ..., 180).
    # The URL is kept on single-line string pieces so it parses on every
    # f-string-capable Python, not only 3.12+.
    urls = [
        "https://movie.douban.com/subject/4195676/comments"
        f"?start={i * 20}&limit=20&status=P&sort=new_score"
        for i in range(10)
    ]

    async def _crawl_all():
        """Fetch all pages concurrently; return one result or exception per URL."""
        return await asyncio.gather(
            *(get_remark(url) for url in urls),
            return_exceptions=True,
        )

    # asyncio.run creates, runs and closes the event loop itself, avoiding the
    # deprecated get_event_loop() call outside a running loop.
    outcomes = asyncio.run(_crawl_all())

    # Report failed pages instead of dropping their exceptions silently; the
    # word cloud is still built from whatever pages succeeded.
    for url, outcome in zip(urls, outcomes):
        if isinstance(outcome, BaseException):
            print(f"failed to fetch {url}: {outcome!r}")

    # Build and save the word cloud.
    get_pic()
在黎耀祥的微博上看到他写道,多謝大家,真的多謝大家陪劉醒過了這一生。
我想说,谢谢你们,让我看到九姑娘和刘醒的一生。