# 豆瓣top250 (Douban Top-250 movie scraper)

import requests

import bs4

import re

import openpyxl

def open_url(url):
    """Fetch *url* and return the ``requests`` Response object.

    A browser User-Agent header is sent so Douban does not reject the
    request as coming from a script.

    Args:
        url: Full URL of the page to fetch.

    Returns:
        requests.Response for the page.
    """
    # Optional proxy support, kept from the original but disabled:
    # proxies = {"http": "127.0.0.1:1080", "https": "127.0.0.1:1080"}
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36'}
    # res = requests.get(url, headers=headers, proxies=proxies)
    # A timeout keeps the scraper from hanging forever on a dead connection.
    res = requests.get(url, headers=headers, timeout=10)
    return res

def find_movies(res):
    """Parse one Top-250 listing page into rows of movie data.

    Args:
        res: Response object for one listing page (its ``.text`` is the HTML).

    Returns:
        A list of ``[title, rating, info]`` lists, one per movie on the page.
    """
    soup = bs4.BeautifulSoup(res.text, 'html.parser')

    # Titles: the first <span> inside the <a> of each <div class="hd">.
    movies = [each.a.span.text for each in soup.find_all("div", class_="hd")]

    # Ratings: text of each <span class="rating_num">.
    ranks = [each.text for each in soup.find_all("span", class_="rating_num")]

    # Details: join the director/cast line and the year/country/genre line.
    # Some <div class="bd"> elements lack the expected <p>/line layout; skip
    # those (AttributeError / IndexError) instead of the original bare
    # ``except:`` which silently swallowed every error.
    messages = []
    for each in soup.find_all("div", class_="bd"):
        try:
            lines = each.p.text.split('\n')
            messages.append(lines[1].strip() + lines[2].strip())
        except (AttributeError, IndexError):
            continue

    # zip stops at the shortest list, pairing each title with its rating
    # and details row.
    return [list(row) for row in zip(movies, ranks, messages)]

# Find out how many result pages there are in total.
def find_depth(res):
    """Return the total number of listing pages.

    The pager shows the last page number just before the 'next' link, so
    the element two siblings before <span class="next"> holds it.

    Args:
        res: Response object for the first listing page.

    Returns:
        The page count as an int.
    """
    soup = bs4.BeautifulSoup(res.text, 'html.parser')
    # Two .previous_sibling hops: the first skips the whitespace text node
    # between the tags, the second lands on the last page-number element.
    depth = soup.find('span', class_='next').previous_sibling.previous_sibling.text
    return int(depth)

def save_to_excel(result):
    """Write the scraped rows to ``豆瓣TOP250电影.xlsx``.

    Args:
        result: Iterable of ``[title, rating, info]`` rows to append
            below the header row.
    """
    wb = openpyxl.Workbook()
    ws = wb.active
    # Header row: movie title / rating / details.
    ws['A1'] = "电影名称"
    ws['B1'] = "评分"
    ws['C1'] = "资料"
    for each in result:
        ws.append(each)
    wb.save("豆瓣TOP250电影.xlsx")

def main():
    """Scrape every Top-250 page and save all movies to an Excel file."""
    host = "https://movie.douban.com/top250"
    res = open_url(host)
    depth = find_depth(res)

    result = []
    # Each page lists 25 movies, paged via ?start=0, 25, 50, ...
    for i in range(depth):
        url = host + '/?start=' + str(25 * i)
        res = open_url(url)
        result.extend(find_movies(res))

    save_to_excel(result)

if __name__ == "__main__":
    main()

# 你可能感兴趣的:(豆瓣top250)