# Top100猫眼电影爬取案例 (Maoyan Top-100 movie scraping example)

#保存请求头的列表
ua_list=[
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 2.0.50727; SLCC2; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; Tablet PC 2.0; .NET4.0E)",

    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1",

    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0",
    "Opera/9.80 (Windows NT 6.1; U; zh-cn) Presto/2.9.168 Version/11.50",
    "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; GTB7.0)",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50"

]


from urllib import request
import re
import random
import time
import csv

class MaoyanSpider(object):
    def __init__(self):
        self.url="https://maoyan.com/board/4?offset={}"
        #添加计数变量
        self.i=0
    #请求
    def get_html(self,url):
        headers={"User-Agent":random.choice(ua_list)}#随机获取请求头
        req=request.Request(url=url,headers=headers)
        res=request.urlopen(req)
        html=res.read().decode()
        # 直接调用解析函数
        self.parse_html(html)
    #解析
    def parse_html(self,html):
        re_bds='
.*?title=' \ '"(.*?)".*?class="star">(.*?)

.*?class="releasetime">(.*?)

'
pattern=re.compile(re_bds,re.S) r_list=pattern.findall(html) #直接调用写入函数 self.write_html(r_list) #保存 def write_html(self,r_list): item={} #以a方式追加写入 with open("maoyan.csv","a",newline="",encoding="utf-8") as f: writer=csv.writer(f)#初始化对象 for r in r_list: item["name"] = r[0].strip() item["star"] = r[1].strip() item["time"] = r[2].strip()[5:15]#切取后面的时间段 print(item) L=[item["name"],item["star"],item["time"]] writer.writerow(L) self.i += 1 #主函数 def run(self): for offset in range(0,91,10):#从0到90以步长为10爬取10个页面 url=self.url.format(offset) self.get_html(url) print("爬取电影数量:",self.i) if __name__ == '__main__': start=time.time() spider=MaoyanSpider() spider.run() end=time.time() print("执行时间:%.2f"%(end-start))

# 你可能感兴趣的:(爬虫)  [blog footer residue, kept as a comment]