from bs4 import BeautifulSoup
import urllib.request
# Address of the first listing page; the script below starts scraping from here.
url = 'https://movie.douban.com/top250'
def get_links(url):
    """Collect the URLs of the pages linked from the paginator of *url*.

    Downloads *url*, locates the ``<div class="paginator">`` element and
    resolves the ``href`` of every ``<a>`` inside it against *url*.

    Args:
        url: Address of the listing page whose paginator is read.

    Returns:
        A list of page URLs in document order with duplicates removed.
        The list is empty when the page has no paginator element.

    Raises:
        urllib.error.URLError: If the page cannot be downloaded.
    """
    # Imported here so the function does not rely on a new file-level import.
    from urllib.parse import urljoin

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.106 Safari/537.36'
    }
    request = urllib.request.Request(url, headers=headers)
    # The context manager closes the connection as soon as the body is read.
    with urllib.request.urlopen(request) as response:
        html = response.read()
    soup = BeautifulSoup(html, 'lxml')

    paginator = soup.find('div', class_='paginator')
    if paginator is None:
        # No pagination block on this page: there is nothing else to visit.
        return []

    linklist = []
    seen = set()
    for anchor in paginator.find_all('a'):
        href = anchor.get('href')
        if not href:
            # An anchor without a target cannot be followed.
            continue
        # urljoin copes with query-only, relative and absolute hrefs alike,
        # whereas plain concatenation is only right for the query-only form.
        link = urljoin(url, href)
        # NOTE(review): a paginator may point at the same page twice (for
        # example a numbered link plus a "next" link) - keep only the first
        # occurrence so no page is scraped twice.
        if link not in seen:
            seen.add(link)
            linklist.append(link)
    return linklist
def get_info(url, score_list):
    """Append the title and rating of every movie listed on *url*.

    Downloads *url* and, for each ``<div class="info">`` element, reads the
    text of its first ``<span class="title">`` and ``<span
    class="rating_num">``.

    Args:
        url: Address of the listing page to download.
        score_list: List that is extended in place with one
            ``{'title': ..., 'score': ...}`` dict per movie; both values
            are the text of the corresponding element.

    Returns:
        None. Results are delivered through *score_list*.

    Raises:
        urllib.error.URLError: If the page cannot be downloaded.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.106 Safari/537.36'
    }
    request = urllib.request.Request(url, headers=headers)
    # The context manager closes the connection as soon as the body is read.
    with urllib.request.urlopen(request) as response:
        html = response.read()
    soup = BeautifulSoup(html, 'lxml')

    for movie in soup.find_all('div', class_='info'):
        title_tag = movie.find('span', class_='title')
        rating_tag = movie.find('span', class_='rating_num')
        if title_tag is None or rating_tag is None:
            # Skip an entry that lacks either field rather than aborting
            # the whole scrape with an AttributeError.
            continue
        score_list.append({
            'title': title_tag.get_text(),
            'score': rating_tag.get_text(),
        })
# Gather the movie entries from the start page and from every page that
# its paginator links to, in that order.
scoreList = []
linklist = get_links(url)
for page_url in [url, *linklist]:
    get_info(page_url, scoreList)

from openpyxl import Workbook

# Write the collected entries to a spreadsheet: a header row followed by
# one numbered row per movie.
wb = Workbook()
sheet = wb.active
for column, heading in enumerate(('number', 'title', 'score'), start=1):
    sheet.cell(row=1, column=column).value = heading

# Data starts on row 2; the running number is therefore row - 1.
for row, entry in enumerate(scoreList, start=2):
    sheet.cell(row=row, column=1).value = row - 1
    sheet.cell(row=row, column=2).value = entry['title']
    sheet.cell(row=row, column=3).value = entry['score']

wb.save('douban_top_movie.xlsx')