使用python爬取豆瓣电影排行榜

有时候,一部电影给人的启示比一本书更大。一部电影能够获得极高的评分,说明它得到了全球观众的认可;在人生遭遇迷惑的时候,一部高分电影或许可以帮人解困。为了防止大家在家剧荒,小编使用 Python 爬取了豆瓣电影 TOP250 排行榜。首先让我们打开网址:

https://movie.douban.com/top250?start=0&filter=

然后我们分析链接的规律:

https://movie.douban.com/top250?start=25&filter=

……

https://movie.douban.com/top250?start=50&filter=

……

https://movie.douban.com/top250?start=225&filter=

综上,start 的值依次为 0、25、50……225,可以看出步长为 25,最大值为 225,也就是每页 25 部电影,共 10 页。找到链接的规律后,再来看如何获取页面元素。同样按 F12 审查元素:

图片
由此可以看出,每部电影都被包裹在 ol 标签下面的 li 标签里,我们直接取出所有 li 标签即可。获取标签的代码如下:

# Take the first <ol> element of the parsed page; per the inspection above,
# it wraps the movie entries.
ol = soup.find('ol')
# Collect the <li> tags found under that <ol>.
li = ol.find_all('li')

接下来让我们进入编码环节:

import re
import requests
from bs4 import BeautifulSoup
from openpyxl import Workbook
from openpyxl.styles import Alignment

# Listing pages are addressed by start=0, 25, ..., 225.
_PAGE_STARTS = range(0, 226, 25)
_YEAR_RE = re.compile(r'\d{4}')
_HEAD = ['排名', '电影名称', '导演', '主演', '年份', '地区', '类型', '评分', '评价人数', '一句简介']
# Row / columns whose cells are centred in the resulting sheet.
_CENTERED_RANGES = ('1', 'A', 'B', 'E', 'H', 'I')


def _clean(text):
    """Return *text* without non-breaking spaces and surrounding whitespace."""
    return text.replace('\xa0', '').strip()


def _parse_movie(item):
    """Extract the spreadsheet fields of one movie from its ``<li>`` tag.

    Returns a tuple of strings:
    ``(name, director, cast, year, country, genre, rating, votes, quote)``.
    ``cast``, ``year`` and ``quote`` are empty strings when the page does not
    provide them.
    """
    name = item.find('span').text

    # The info paragraph carries two text lines separated by a <br/> tag:
    # "director / cast" first, then "year / country / genre".
    first_half, second_half = str(item.find('p')).split('<br/>', 1)
    people_line = first_half.split('>', 1)[1]    # drop the opening <p ...> tag
    facts_line = second_half.split('<', 1)[0]    # drop the closing </p> tag

    # Split on the single character '主' rather than the full '主演:' label,
    # presumably because the label itself can be cut short on the page.
    director = _clean(people_line.split('主', 1)[0].replace('导演:', ''))
    # partition() yields an empty tail when the cast label is missing.
    cast = people_line.partition('主演:')[2].strip()

    years = _YEAR_RE.findall(facts_line.split(' / ', 2)[0])
    year = years[-1] if years else ''

    facts = facts_line.split('\xa0/\xa0', 2)
    country = _clean(facts[1])
    genre = _clean(facts[2])

    spans = item.find('div', attrs={'class': "star"}).find_all('span')
    rating = spans[1].text
    votes = spans[3].text.split('评价', 1)[0]

    quote_tag = item.find('p', attrs={'class': "quote"})
    quote = '' if quote_tag is None else quote_tag.text.replace('\n', '')

    return name, director, cast, year, country, genre, rating, votes, quote


def _write_workbook(rows):
    """Write the header plus one ranked line per entry of *rows* and save the file."""
    wb = Workbook()
    ws = wb['Sheet']
    ws.append(_HEAD)
    # Rank is the 1-based position in scrape order, so it always matches the
    # number of movies actually collected.
    for rank, row in enumerate(rows, start=1):
        ws.append([rank, *row])
    print('写入excel完成!')

    centered = Alignment(horizontal='center', vertical='center')
    for key in _CENTERED_RANGES:
        for cell in ws[key]:
            cell.alignment = centered
    ws.column_dimensions['B'].width = 25
    ws.column_dimensions['I'].width = 13
    wb.save('豆瓣电影top250.xlsx')


def top250():
    """Scrape the Douban Top 250 listing and save it as ``豆瓣电影top250.xlsx``.

    Fetches the ten listing pages in order, parses every movie entry and
    writes the result to an Excel workbook in the current directory.
    Progress messages are printed along the way.

    If a page does not answer with HTTP 200, or contains no ``<ol>`` list,
    '失败!' is printed and the function returns without writing a file
    (instead of requesting the same page again forever).
    """
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36 Edg/81.0.416.68",
    }
    rows = []
    for page_no, start in enumerate(_PAGE_STARTS, start=1):
        url = 'https://movie.douban.com/top250?start=' + str(start) + '&filter='
        # A timeout keeps a stalled connection from hanging the whole run.
        with requests.get(url=url, headers=headers, timeout=10) as r:
            if r.status_code != 200:
                print('失败!')
                return
            r.encoding = r.apparent_encoding
            soup = BeautifulSoup(r.text, 'html.parser')

        ol = soup.find('ol')
        if ol is None:
            # A 200 response without the movie list cannot be parsed.
            print('失败!')
            return
        rows.extend(_parse_movie(item) for item in ol.find_all('li'))
        print('第{}页爬取完毕!'.format(page_no))

    print('爬取结束,开始写入excel。。。')
    _write_workbook(rows)


if __name__ == '__main__':
    print('开始爬取!')
    top250()

右击运行代码即可执行。程序运行结束后,当前文件夹内会生成一个名为"豆瓣电影top250.xlsx"的文件,其中保存了爬取到的所有电影信息。出现下图所示的结果,即代表程序运行成功。

图片
以上就是今天给大家分享的内容。

你可能感兴趣的:(python)