Python 爬取豆瓣 TOP250 电影存到 Excel 中

#!/usr/bin/env python
# encoding=utf-8
"""
爬取豆瓣电影TOP250 - 完整示例代码
"""
import codecs
import xlwings as xw
import requests
from bs4 import BeautifulSoup

# First page of the ranking; parse_html() appends each "next page" href to it.
DOWNLOAD_URL = 'http://movie.douban.com/top250/'

# Module-level result lists filled by parse_html() and read by showExcel().
# They are appended to together, so index i of each list refers to the same
# movie: title, text of the director paragraph, rating.
movie_name_list, director_name_list, score_list = [], [], []
def download_page(url, timeout=10):
    """Download ``url`` and return the raw response body.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely.

    Returns:
        The undecoded response body.

    Raises:
        requests.exceptions.RequestException: On a connection failure, a
            timeout, or an HTTP error status (4xx/5xx).
    """
    headers = {
        # Present a desktop-browser User-Agent instead of the library default.
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.80 Safari/537.36'
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail here on an error status rather than handing an error page to the
    # parser, where it would surface as an unrelated AttributeError.
    response.raise_for_status()
    return response.content


def parse_html(html):
    """Collect the movies on one listing page and locate the next page.

    For every ``<li>`` inside ``<ol class="grid_view">`` the title, the text
    of the first ``<p>`` in the entry's ``div.bd`` and the rating are appended
    to the module-level ``movie_name_list``, ``director_name_list`` and
    ``score_list``.

    Args:
        html: Markup of one listing page.

    Returns:
        A ``(movie_name_list, next_url)`` tuple. ``movie_name_list`` is the
        shared module-level list (every title collected so far, not only
        this page's). ``next_url`` is ``DOWNLOAD_URL`` plus the ``href`` of
        the link inside ``<span class="next">``, or ``None`` when there is
        no such link.

    Raises:
        ValueError: If the page has no ``<ol class="grid_view">`` element.
    """
    # Name the parser explicitly: otherwise bs4 picks whichever parser happens
    # to be installed (and warns), so results can differ between machines.
    soup = BeautifulSoup(html, 'html.parser')
    movie_list_soup = soup.find('ol', attrs={'class': 'grid_view'})
    if movie_list_soup is None:
        raise ValueError('movie list <ol class="grid_view"> not found in page')

    # Walk over every entry of the list.
    for movie_li in movie_list_soup.find_all('li'):
        detail = movie_li.find('div', attrs={'class': 'hd'})
        director_detail = movie_li.find('div', attrs={'class': 'bd'})
        score_detail = movie_li.find('div', attrs={'class': 'star'})

        movie_name = detail.find('span', attrs={'class': 'title'}).getText()
        # First <p> of div.bd; a filter on a nameless attribute adds nothing.
        director_name = director_detail.find('p').getText()
        score = score_detail.find('span', attrs={'class': 'rating_num'}).getText()

        director_name_list.append(director_name)  # director paragraph text
        movie_name_list.append(movie_name)  # movie title
        score_list.append(score)  # rating

    # The pager span may be absent, or present without a link inside it.
    next_span = soup.find('span', attrs={'class': 'next'})
    next_page = next_span.find('a') if next_span is not None else None
    if next_page is not None:
        return movie_name_list, DOWNLOAD_URL + next_page['href']
    return movie_name_list, None
def showExcel(filepath=r'D://Desktop/myexcel.xlsx'):
    """Write the collected movie data into an existing Excel workbook.

    Opens the workbook at ``filepath`` in a new Excel instance, writes a
    header row plus one row per collected movie (title, director paragraph,
    rating) to the sheet named ``sheet1``, saves the workbook, then closes it
    and quits Excel.

    Args:
        filepath: Path of the workbook to open; it is opened, not created.
    """
    app = xw.App(visible=True, add_book=False)
    try:
        app.display_alerts = False
        wb = app.books.open(filepath)
        try:
            sht = wb.sheets['sheet1']
            # A flat list assigned to a single cell is written as one row.
            sht.range('A1').value = ["电影名称", "详细信息", "豆瓣评分"]
            rows = [
                list(row)
                for row in zip(movie_name_list, director_name_list, score_list)
            ]
            if rows:
                # One bulk write instead of three Excel round-trips per movie.
                sht.range('A2').value = rows
            wb.save()
        finally:
            wb.close()
    finally:
        # Always shut the Excel instance down so no process is left behind.
        app.quit()

def main():
    """Crawl the ranking page by page, starting at ``DOWNLOAD_URL``.

    Each page is downloaded and passed to ``parse_html``, which stores the
    results in the module-level lists and returns the next page's URL. The
    loop stops once that URL is falsy.
    """
    pending_url = DOWNLOAD_URL
    while pending_url:
        # Only the follow-up URL is needed here; the titles are already kept
        # in the module-level list.
        _titles, pending_url = parse_html(download_page(pending_url))

if __name__ == '__main__':
    # Scrape every page first so the module-level result lists are filled,
    # then export them to the workbook.
    main()
    showExcel()

你可能感兴趣的:(python)