小福利:用 Python 多协程爬取电影列表,话不多说,直接上代码。有兴趣的可以加 QQ 群(纯学习交流):1098016198。
from bs4 import BeautifulSoup
from gevent import monkey
monkey.patch_all()
import gevent,time,requests
from gevent.queue import Queue
# Wall-clock reference point for the elapsed-time report printed by the script.
start = time.time()

# The first page is the bare directory URL; pages 2 through 10 use the
# index-N.html naming scheme.
url_list = ['http://www.mtime.com/top/tv/top100/']
url_list.extend(
    'http://www.mtime.com/top/tv/top100/index-{}.html'.format(page_no)
    for page_no in range(2, 11)
)
print(url_list)

# Request headers sent with every page fetch.
# NOTE(review): the Cookie value looks like a captured browser session and
# presumably expires -- confirm it is still accepted before relying on it.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:75.0) Gecko/20100101 Firefox/75.0',
    'Referer': 'http://www.mtime.com/top/tv/top100/',
    'Cookie': '_userCode_=202052221483601; _userIdentity_=202052221485994; DefaultCity-CookieKey=627; DefaultDistrict-CookieKey=0; _tt_=6A0D5B7802C889B92ADDF4B9DA5330DC; Hm_lvt_6dd1e3b818c756974fb222f0eae5512e=1588428109,1588460637; __utma=196937584.392826187.1588428110.1588463461.1588470412.4; __utmz=196937584.1588428110.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); waf_cookie=14c7c021-c3bd-4e2a998ff60044124738825863a8e64fcc61; Hm_lpvt_6dd1e3b818c756974fb222f0eae5512e=1588470414; __utmc=196937584; _ydclearance=e11e8c3d48a0aef51a6ece4e-dd11-4511-ae6f-98e6a44ceddd-1588477611; __utmb=196937584.4.10.1588470412; __utmt=1; __utmt_~1=1',
}

# Queue of page URLs that the worker greenlets pull from.
work = Queue()
for page_url in url_list:
    work.put_nowait(page_url)
def getdata():
    """Drain the shared ``work`` queue, scraping and printing one page per URL.

    While the module-level ``work`` queue is non-empty, take a URL from it,
    download the page using the module-level ``headers``, print the HTTP
    status code, and then, for every ``<li>`` inside ``<div class="top_list">``,
    print the ``<h2>`` text, the stripped text of the first two ``<p>`` tags
    and the text of ``<p class="mt3">`` (or ``None`` when that tag is absent).

    Failed requests, pages without the ``top_list`` container, and list items
    missing the ``<h2>`` or the first two ``<p>`` tags are reported or skipped
    instead of raising, so a single bad page does not terminate the worker.

    Returns:
        None. All output goes to stdout; nothing is written to disk.
    """
    while not work.empty():
        url = work.get_nowait()
        try:
            # Without a timeout a stalled connection would block this worker
            # indefinitely.
            res = requests.get(url, headers=headers, timeout=10)
        except requests.RequestException as exc:
            print(url, exc)
            continue
        print(res.status_code)

        soup = BeautifulSoup(res.text, 'html.parser')
        container = soup.find('div', class_='top_list')
        if container is None:
            # find() returns None when no tag matches, e.g. on an error page.
            print(url, 'top_list container not found')
            continue

        for film in container.find_all('li'):
            title = film.find('h2')
            paragraphs = film.find_all('p')
            if title is None or len(paragraphs) < 2:
                # Not a regular entry; the fields below would not exist.
                continue

            name = title.text
            # Presumably the director and lead-cast lines (the original
            # locals were named "daoyan" / "zhuyan") -- verify against the
            # page markup.
            director = paragraphs[0].text.strip()
            cast = paragraphs[1].text.strip()
            # Presumably the synopsis (originally "jianjie"); optional.
            synopsis = film.find('p', class_='mt3')
            synopsis_text = synopsis.text if synopsis is not None else None

            print(name, director, cast, synopsis_text)
            # NOTE: the original also assembled a text record for appending to
            # a file, but the write itself was commented out, so the record
            # was never used; only stdout output is produced.
# Start three worker greenlets running getdata and wait for all to finish.
task_list = [gevent.spawn(getdata) for _ in range(3)]
gevent.joinall(task_list)

# Report the elapsed wall-clock seconds since `start` was recorded.
end = time.time()
print(end - start)