小福利:用 Python 多协程爬取电影列表,效率杠杠的

小福利:用 Python 多协程爬取电影列表,话不多说,上代码。有兴趣可以加 QQ 群(纯学习交流):1098016198。

from bs4 import BeautifulSoup
from gevent import monkey
monkey.patch_all()
import gevent,time,requests
from gevent.queue import Queue
# Wall-clock start; the script prints the elapsed time once all workers finish.
start = time.time()

# Page 1 lives at the bare URL; pages 2-10 follow the index-N.html pattern.
url_list = ['http://www.mtime.com/top/tv/top100/'] + [
    'http://www.mtime.com/top/tv/top100/index-{}.html'.format(page)
    for page in range(2, 11)
]
print(url_list)


# Browser-like request headers passed to every requests.get() call below.
# NOTE(review): the Cookie looks like a captured browser session and has
# presumably expired - confirm / refresh before relying on it.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:75.0) Gecko/20100101 Firefox/75.0',
    'Referer': 'http://www.mtime.com/top/tv/top100/',
    'Cookie': '_userCode_=202052221483601; _userIdentity_=202052221485994; DefaultCity-CookieKey=627; DefaultDistrict-CookieKey=0; _tt_=6A0D5B7802C889B92ADDF4B9DA5330DC; Hm_lvt_6dd1e3b818c756974fb222f0eae5512e=1588428109,1588460637; __utma=196937584.392826187.1588428110.1588463461.1588470412.4; __utmz=196937584.1588428110.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); waf_cookie=14c7c021-c3bd-4e2a998ff60044124738825863a8e64fcc61; Hm_lpvt_6dd1e3b818c756974fb222f0eae5512e=1588470414; __utmc=196937584; _ydclearance=e11e8c3d48a0aef51a6ece4e-dd11-4511-ae6f-98e6a44ceddd-1588477611; __utmb=196937584.4.10.1588470412; __utmt=1; __utmt_~1=1',
}
# Shared work queue: every listing-page URL is enqueued up front, and the
# worker function pulls from it until it is empty.
work = Queue()
for page_url in url_list:
    work.put_nowait(page_url)
def getdata():
    """Drain the shared ``work`` queue, scraping one listing page per URL.

    Repeatedly takes a URL from the module-level ``work`` queue, downloads
    it with the module-level ``headers`` and prints the HTTP status code.
    For every ``<li>`` inside ``div.top_list`` it then prints the ``<h2>``
    text, the stripped text of the first two ``<p>`` elements (``daoyan``
    and ``zhuyan`` - presumably the director and cast lines; confirm
    against the live markup) and the text of ``p.mt3`` (``jianjie``), or
    ``None`` when that paragraph is absent.

    Pages that fail to download or lack the ``top_list`` container are
    reported and skipped, and ``<li>`` items without an ``<h2>`` or with
    fewer than two ``<p>`` children are ignored, so one bad page or entry
    does not kill the worker.

    Returns:
        None. All results are written to stdout.
    """
    while not work.empty():
        url = work.get_nowait()
        try:
            # Without a timeout a single stalled connection would pin this
            # worker forever and the final joinall() would never return.
            res = requests.get(url, headers=headers, timeout=10)
        except requests.RequestException as exc:
            print(url, exc)
            continue
        print(res.status_code)

        soup = BeautifulSoup(res.text, 'html.parser')
        container = soup.find('div', class_='top_list')
        if container is None:
            # find() returns None when the element is missing (e.g. an
            # error or anti-bot page); skip instead of raising.
            print(url, 'no top_list container found, skipping')
            continue

        for film in container.find_all('li'):
            title = film.find('h2')
            paragraphs = film.find_all('p')
            if title is None or len(paragraphs) < 2:
                # Not a complete entry; nothing meaningful to print.
                continue
            name = title.text
            daoyan = paragraphs[0].text.strip()
            zhuyan = paragraphs[1].text.strip()
            jianjie = film.find('p', class_='mt3')
            summary = jianjie.text if jianjie is not None else None
            print(name, daoyan, zhuyan, summary)

# Spawn three greenlets running getdata, wait for all of them to finish,
# then print the elapsed wall-clock time in seconds.
task_list = [gevent.spawn(getdata) for _ in range(3)]
gevent.joinall(task_list)
end = time.time()
print(end - start)




你可能感兴趣的:(python)