"""
爬取豆瓣电影TOP250 - 完整示例代码
""
import codecs
import xlwings as xw
import requests
from bs4 import BeautifulSoup
# Entry page of the Top 250 listing; parse_html() appends the "next page"
# href to this address to build follow-up URLs.
DOWNLOAD_URL = 'http://movie.douban.com/top250/'

# Parallel accumulators filled by parse_html(): position i in each list
# belongs to the same movie (title, details paragraph, rating).
movie_name_list, director_name_list, score_list = [], [], []
def download_page(url, timeout=10):
    """Fetch *url* and return the raw response body as bytes.

    A desktop-browser ``User-Agent`` header is sent with the request.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` may wait indefinitely on a stalled
            connection.

    Returns:
        The undecoded response body (``Response.content``).

    Raises:
        requests.exceptions.RequestException: On connection problems,
            timeouts, or an HTTP error status (4xx/5xx).
    """
    headers = {
        'User-Agent': (
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/47.0.2526.80 Safari/537.36'
        ),
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail here instead of handing an error page to the HTML parser.
    response.raise_for_status()
    return response.content
def parse_html(html):
    """Extract the movies listed on one Top 250 page.

    For every ``<li>`` inside the ``ol.grid_view`` list, the title, the
    text of the first ``<p>`` in the ``bd`` div and the rating are appended
    to the module-level lists ``movie_name_list``, ``director_name_list``
    and ``score_list`` respectively.

    Args:
        html: Markup of one listing page, e.g. the value returned by
            ``download_page``.

    Returns:
        A ``(movie_name_list, next_url)`` tuple. ``next_url`` is
        ``DOWNLOAD_URL`` joined with the href of the "next" link, or
        ``None`` when the page has no such link.
    """
    # Name the parser explicitly so the result does not depend on which
    # optional parser libraries happen to be installed.
    soup = BeautifulSoup(html, 'html.parser')

    movie_list_soup = soup.find('ol', attrs={'class': 'grid_view'})
    # A page without the movie list contributes no entries instead of
    # failing with AttributeError on None.
    if movie_list_soup is not None:
        movie_items = movie_list_soup.find_all('li')
    else:
        movie_items = []

    for movie_li in movie_items:
        detail = movie_li.find('div', attrs={'class': 'hd'})
        director_detail = movie_li.find('div', attrs={'class': 'bd'})
        score_detail = movie_li.find('div', attrs={'class': 'star'})

        movie_name = detail.find('span', attrs={'class': 'title'}).getText()
        # Take the first <p> of the 'bd' div directly; a filter on an
        # attribute with an empty name does not describe any real attribute.
        director_name = director_detail.find('p').getText()
        score = score_detail.find('span', attrs={'class': 'rating_num'}).getText()

        # Append all three together so the lists stay index-aligned.
        director_name_list.append(director_name)
        movie_name_list.append(movie_name)
        score_list.append(score)

    # The "next" span may be absent, or present without a link inside it.
    next_span = soup.find('span', attrs={'class': 'next'})
    next_page = next_span.find('a') if next_span is not None else None
    if next_page is not None:
        return movie_name_list, DOWNLOAD_URL + next_page['href']
    return movie_name_list, None
def showExcel(filepath=r'D://Desktop/myexcel.xlsx'):
    """Write the collected movie data into an Excel workbook via xlwings.

    Starts a visible Excel instance, opens the workbook at *filepath* and
    fills the sheet named ``sheet1``: a header row in row 1, followed by one
    row per movie taken from the module-level lists ``movie_name_list``,
    ``director_name_list`` and ``score_list`` (columns A, B and C).

    The workbook is neither saved nor closed by this function.

    Args:
        filepath: Path of the workbook to open. Defaults to the location
            that was previously hard-coded.
    """
    app = xw.App(visible=True, add_book=False)
    app.display_alerts = False
    wb = app.books.open(filepath)
    sht = wb.sheets['sheet1']

    rows = [["电影名称", "详细信息", "豆瓣评分"]]
    rows.extend(
        [name, info, score]
        for name, info, score in zip(
            movie_name_list, director_name_list, score_list
        )
    )
    # Assigning a nested list to the top-left cell writes the whole table
    # in a single call instead of one round-trip to Excel per cell.
    sht.range('A1').value = rows
def main():
    """Crawl the listing page by page until no further page is reported.

    Starts at ``DOWNLOAD_URL``. Each page is downloaded and handed to
    ``parse_html``, whose second return value is the URL of the next page;
    the loop ends as soon as that value is falsy.
    """
    pending_url = DOWNLOAD_URL
    while pending_url:
        page_markup = download_page(pending_url)
        # Only the next-page URL is needed here; parse_html records the
        # movie data in the module-level lists.
        _, pending_url = parse_html(page_markup)
if __name__ == '__main__':
    # Crawl every page first so the module-level lists are filled, then
    # export them to Excel.
    main()
    showExcel()