香港电影评分网站
各大榜单
最新电影榜单
1、找到所有的榜单地址
base_url = "https://hkmovie6.com"
edit_url = "https://hkmovie6.com/collection" # 编辑精选
marvel_url = "https://hkmovie6.com/collection/0bb46a15-ceaf-4cf0-a9ed-14284b32c5c1" # 漫威宇宙
dc_url = "https://hkmovie6.com/collection/c3400e0e-78ac-4184-ac4f-972f46dd3b07" # dc宇宙
star_wars_url = "https://hkmovie6.com/collection/3eabc7b5-c674-42a3-8069-522a02bf8f1a" # 星球大战系列
oscars_url = "https://hkmovie6.com/collection/ba07fa64-8297-49d7-ba4c-fb9bed210a65" # 奥斯卡金像奖得奖名单
new_url = "https://hkmovie6.com/watch/latest" # 最新
2、分析排行榜获取电影详情页地址
3、根据详情页获取电影信息
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
import xlwt
'''
https://hkmovie6.com
'''
# Site root; every other address below is built on top of it.
base_url = "https://hkmovie6.com"
# Editors' picks: the index page that lists the curated collections.
edit_url = base_url + "/collection"
# Marvel universe collection.
marvel_url = edit_url + "/0bb46a15-ceaf-4cf0-a9ed-14284b32c5c1"
# DC universe collection.
dc_url = edit_url + "/c3400e0e-78ac-4184-ac4f-972f46dd3b07"
# Star Wars series collection.
star_wars_url = edit_url + "/3eabc7b5-c674-42a3-8069-522a02bf8f1a"
# Academy Awards (Oscars) winners collection.
oscars_url = edit_url + "/ba07fa64-8297-49d7-ba4c-fb9bed210a65"
# Latest releases listing.
new_url = base_url + "/watch/latest"
def get_html(url, encoding, timeout=10):
    """Download ``url`` and return its body decoded with ``encoding``.

    Args:
        url: Absolute URL to request with HTTP GET.
        encoding: Character encoding assigned to the response before its
            text is read, e.g. ``"utf-8"``.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` waits indefinitely, so one stalled
            connection would hang the whole scrape.

    Returns:
        The decoded page text when the server answers with status 200,
        otherwise ``None`` (non-200 status, timeout or connection error).
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Transport failures are reported the same way as a bad status
        # code so callers only have one failure value to check for.
        print("request failed: {} ({})".format(url, exc))
        return None
    if response.status_code != 200:
        return None
    # Force the decoding instead of relying on the guessed charset.
    response.encoding = encoding
    return response.text
def _export_listing(html, savepath):
    """Scrape every film linked from a listing page and save the rows.

    Args:
        html: Markup of a listing page, or ``None`` when the download
            failed (``get_html`` returns ``None`` in that case).
        savepath: Path of the ``.xls`` file handed to ``save_to_excel``.

    Nothing is written when the page is missing or contains no
    ``<div class="shows">`` container.
    """
    if html is None:
        print("listing page unavailable, nothing written to {}".format(savepath))
        return
    soup = BeautifulSoup(html, "html.parser")
    shows = soup.find("div", class_="shows")
    if shows is None:
        print("no 'shows' container found, nothing written to {}".format(savepath))
        return
    datalist = []
    for a in shows.find_all("a"):
        mv_url = a.get("href")
        if not mv_url:
            # An anchor without a target cannot lead to a detail page.
            continue
        print(mv_url)
        # The href is appended to the site root as-is, so it is expected
        # to be site-relative.
        datalist.append(get_detail_data(base_url + mv_url))
    save_to_excel(savepath, datalist)


def get_new_data(savepath):
    """Export the films on the "latest" listing (``new_url``) to ``savepath``.

    Args:
        savepath: Path of the ``.xls`` file to create.
    """
    _export_listing(get_html(new_url, "utf-8"), savepath)


# NOTE(review): the collection pages below are parsed with the same
# ``div.shows`` lookup as the "latest" page, which is what the original
# calls intended -- confirm that their markup really matches.
def get_marvel_data():
    """Export the Marvel universe collection to ``./漫威.xls``."""
    _export_listing(get_html(marvel_url, "utf-8"), "./漫威.xls")


def get_dc_data():
    """Export the DC universe collection to ``./dc.xls``."""
    _export_listing(get_html(dc_url, "utf-8"), "./dc.xls")


def get_sw_data():
    """Export the Star Wars series collection to ``./星球大战.xls``."""
    _export_listing(get_html(star_wars_url, "utf-8"), "./星球大战.xls")


def get_oscars_data():
    """Export the Academy Awards winners collection to ``./奥斯卡金像奖榜单.xls``."""
    _export_listing(get_html(oscars_url, "utf-8"), "./奥斯卡金像奖榜单.xls")
def save_to_excel(savepath, datalist):
    """Write the scraped film rows to an ``.xls`` workbook.

    Args:
        savepath: Destination path passed to ``Workbook.save``.
        datalist: Sequence of rows; the first five items of each row are
            written under the header columns (title, duration, release
            date, rating category, synopsis).

    Rows with fewer than five items are not written, but they still
    occupy their position, so the sheet keeps an empty row for them.
    """
    workbook = xlwt.Workbook(encoding="utf-8", style_compression=0)
    worksheet = workbook.add_sheet('Top100', cell_overwrite_ok=True)

    # Header row.
    headers = ("电影名", "时长", "上映日期", "级别", "简介")
    for column, title in enumerate(headers):
        worksheet.write(0, column, title)

    # Data rows start at sheet row 1, directly below the header.
    for row, record in enumerate(datalist, start=1):
        print("第{}条".format(row))
        if len(record) < 5:
            # Incomplete record: leave this sheet row empty.
            continue
        for column in range(5):
            worksheet.write(row, column, record[column])

    workbook.save(savepath)
def get_detail_data(url):
    """Scrape one film detail page.

    Args:
        url: Absolute URL of the film's detail page.

    Returns:
        ``[title, duration, release date, rating category, synopsis]``,
        all strings. An empty list is returned when the page cannot be
        downloaded or lacks any of the expected elements; the
        completeness check in ``save_to_excel`` skips such rows.
    """
    print(url)
    html = get_html(url, "utf-8")
    if html is None:
        # get_html signals a failed download with None.
        print("detail page unavailable: {}".format(url))
        return []
    soup = BeautifulSoup(html, "html.parser")
    try:
        mv_content = soup.find("div", class_="movieContent")
        mv_detail = mv_content.find("div", class_="movieMobileDetail")
        mv_title = mv_detail.find("div", class_="movieName").get_text().strip()
        # The info line reads "<release date> | <duration>".
        mv_tad = mv_detail.find("div", class_="text").get_text().strip().split("|")
        mv_time = mv_tad[0].strip()
        mv_duration = mv_tad[1].strip()
        mv_level = mv_detail.find("div", class_="cat").get_text().strip()
        # The synopsis text is read from the aria-label of the nested span.
        mv_des = mv_content.find("div", class_="synopsis").span.span['aria-label'].strip()
    except (AttributeError, TypeError, KeyError, IndexError) as exc:
        # A missing element surfaces as one of these lookup errors;
        # report it and let the caller skip the film instead of aborting
        # the whole export.
        print("unexpected detail page layout: {} ({!r})".format(url, exc))
        return []
    return [mv_title, mv_duration, mv_time, mv_level, mv_des]
def main():
    """Run the collection exports in order: Marvel, DC, Star Wars, Oscars."""
    exporters = (get_marvel_data, get_dc_data, get_sw_data, get_oscars_data)
    for export in exporters:
        export()


# Only start scraping when executed as a script, not when imported.
if __name__ == '__main__':
    main()