Python爬虫爬取香港电影评分网站各榜单

Python爬虫

香港电影评分网站


各大榜单


最新电影榜单

步骤

1、找到所有的榜单地址

base_url = "https://hkmovie6.com"
edit_url = "https://hkmovie6.com/collection" # Editors' picks
marvel_url = "https://hkmovie6.com/collection/0bb46a15-ceaf-4cf0-a9ed-14284b32c5c1" # Marvel universe
dc_url = "https://hkmovie6.com/collection/c3400e0e-78ac-4184-ac4f-972f46dd3b07" # DC universe
star_wars_url = "https://hkmovie6.com/collection/3eabc7b5-c674-42a3-8069-522a02bf8f1a" # Star Wars series
oscars_url = "https://hkmovie6.com/collection/ba07fa64-8297-49d7-ba4c-fb9bed210a65" # Academy Award winners list
new_url = "https://hkmovie6.com/watch/latest" # Latest

2、分析排行榜获取电影详情页地址

Python爬虫爬取香港电影评分网站各榜单_第1张图片

3、根据详情页获取电影信息

Python爬虫爬取香港电影评分网站各榜单_第2张图片
4、将爬取到的数据保存到Excel文件

Python爬虫爬取香港电影评分网站各榜单_第3张图片

代码

# -*- coding: utf-8 -*- 

import requests
from bs4 import BeautifulSoup
import xlwt

'''
https://hkmovie6.com
'''
# Site root; every listing URL below is derived from it.
base_url = "https://hkmovie6.com"
# Editors' picks (also the parent path of every themed collection).
edit_url = base_url + "/collection"
# Marvel universe
marvel_url = edit_url + "/0bb46a15-ceaf-4cf0-a9ed-14284b32c5c1"
# DC universe
dc_url = edit_url + "/c3400e0e-78ac-4184-ac4f-972f46dd3b07"
# Star Wars series
star_wars_url = edit_url + "/3eabc7b5-c674-42a3-8069-522a02bf8f1a"
# Academy Award winners list
oscars_url = edit_url + "/ba07fa64-8297-49d7-ba4c-fb9bed210a65"
# Latest movies
new_url = base_url + "/watch/latest"

def get_html(url, encoding, timeout=10):
    '''
    Download a page and return its body as text.

    :param url: address to request with HTTP GET.
    :param encoding: encoding applied to the response before decoding it.
    :param timeout: seconds to wait for the server before giving up; without
        a timeout a stalled connection would block the scraper indefinitely.
    :return: the decoded page text, or None when the request fails or the
        status code is not 200.
    '''
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Network-level failures follow the same "None on failure" contract
        # as a non-200 status, but are reported so they are not silent.
        print("request failed for {}: {}".format(url, exc))
        return None
    if response.status_code != 200:
        return None
    # Force the caller's encoding instead of the one guessed from headers.
    response.encoding = encoding
    return response.text

def _save_listing(html, savepath):
    '''
    Parse one listing page, scrape every movie it links to and save the rows.

    :param html: text of the listing page, or None when the download failed.
    :param savepath: path of the .xls file to write.
    :return: None. Nothing is written when the page is missing or contains
        no ``<div class="shows">`` container.
    '''
    from urllib.parse import urljoin

    if html is None:
        print("skip {}: listing page could not be fetched".format(savepath))
        return
    soup = BeautifulSoup(html, "html.parser")
    shows = soup.find("div", class_="shows")
    if shows is None:
        print("skip {}: no 'shows' container on the listing page".format(savepath))
        return
    datalist = []
    # href=True skips anchors without a link target, which would otherwise
    # raise KeyError on a['href'].
    for a in shows.find_all("a", href=True):
        mv_url = a['href']
        print(mv_url)
        # urljoin yields the same URL as base_url + mv_url for root-relative
        # links and also copes with absolute or non-rooted hrefs.
        datalist.append(get_detail_data(urljoin(base_url + "/", mv_url)))
    save_to_excel(savepath, datalist)


def get_new_data(savepath):
    '''
    Scrape the "latest movies" page and save the result.

    :param savepath: path of the .xls file to write.
    :return: None
    '''
    _save_listing(get_html(new_url, "utf-8"), savepath)


# NOTE(review): the collection getters below assume collection pages use the
# same 'shows' container markup as the latest page -- confirm against the site.

def get_marvel_data():
    '''
    Scrape the Marvel movie collection and save it to ./漫威.xls.

    :return: None
    '''
    html = get_html(marvel_url, "utf-8")
    _save_listing(html, "./漫威.xls")


def get_dc_data():
    '''
    Scrape the DC movie collection and save it to ./dc.xls.

    :return: None
    '''
    html = get_html(dc_url, "utf-8")
    _save_listing(html, "./dc.xls")


def get_sw_data():
    '''
    Scrape the Star Wars movie collection and save it to ./星球大战.xls.

    :return: None
    '''
    html = get_html(star_wars_url, "utf-8")
    _save_listing(html, "./星球大战.xls")


def get_oscars_data():
    '''
    Scrape the Academy Award winners collection and save it to
    ./奥斯卡金像奖榜单.xls.

    :return: None
    '''
    html = get_html(oscars_url, "utf-8")
    _save_listing(html, "./奥斯卡金像奖榜单.xls")

def save_to_excel(savepath,datalist):
    '''
    Write scraped movie records to an .xls workbook.

    The first row holds the column titles. Record number ``n`` (1-based) is
    written to row ``n``; a record with fewer than five fields is skipped,
    which leaves its row empty.

    :param savepath: path of the workbook file to create.
    :param datalist: sequence of records, each indexable with at least the
        five fields title, duration, release date, level and synopsis.
    :return: None
    '''
    headers = ("电影名", "时长", "上映日期", "级别","简介")
    width = 5
    workbook = xlwt.Workbook(encoding="utf-8", style_compression=0)
    worksheet = workbook.add_sheet('Top100', cell_overwrite_ok=True)

    # Header row.
    for column, title in enumerate(headers):
        worksheet.write(0, column, title)

    # Data rows start right below the header, hence start=1.
    for row, record in enumerate(datalist, start=1):
        print("第{}条".format(row))
        if len(record) < width:
            # Incomplete record: do not write a partial row.
            continue
        for column in range(width):
            worksheet.write(row, column, record[column])

    workbook.save(savepath)

def get_detail_data(url):
    '''
    Scrape one movie detail page.

    :param url: absolute URL of the detail page.
    :return: ``[title, duration, release date, level, synopsis]``, or an
        empty list when the page cannot be fetched or does not contain the
        expected elements. The duration is an empty string when the
        date/duration text has no "|" separator.
    '''
    print(url)
    html = get_html(url,"utf-8")
    if html is None:
        print("skip {}: page could not be fetched".format(url))
        return []
    soup = BeautifulSoup(html, "html.parser")
    try:
        mv_content = soup.find("div",class_="movieContent")
        mv_detail = mv_content.find("div",class_="movieMobileDetail")
        mv_title = mv_detail.find("div",class_="movieName").get_text().strip()
        # Text looks like "2021年2月24日 | 103 分鐘": release date, then duration.
        mv_tad = mv_detail.find("div",class_="text").get_text().strip().split("|")
        mv_level = mv_detail.find("div",class_="cat").get_text().strip()
        # The synopsis text is read from the aria-label of the nested span.
        mv_des = mv_content.find("div",class_="synopsis").span.span['aria-label'].strip()
    except (AttributeError, TypeError, KeyError) as exc:
        # A missing element shows up as a lookup on None (AttributeError /
        # TypeError) or as a missing attribute (KeyError).
        print("skip {}: unexpected page layout ({})".format(url, exc))
        return []
    mv_time = mv_tad[0].strip()
    # Tolerate pages that list only the date, without "| duration".
    mv_duration = mv_tad[1].strip() if len(mv_tad) > 1 else ""
    return [mv_title,mv_duration,mv_time,mv_level,mv_des]


def main():
    '''
    Run every collection scraper in turn: Marvel, DC, Star Wars, Oscars.

    :return: None
    '''
    scrapers = (get_marvel_data, get_dc_data, get_sw_data, get_oscars_data)
    for scrape in scrapers:
        scrape()


# Only start scraping when executed as a script, not when imported.
if __name__ == "__main__":
    main()

你可能感兴趣的:(爬虫,python,excel,大数据,数据挖掘)