豆瓣影评爬虫——导出 Excel

豆瓣影评爬虫——导出 Excel

#导入库
import requests
from bs4 import BeautifulSoup
import xlwt
import time

# Create the workbook with a single 'movie' sheet and fill in the header row
workbook = xlwt.Workbook(encoding='utf-8', style_compression=0)
sheet = workbook.add_sheet('movie', cell_overwrite_ok=True)
for col, heading in enumerate(('Users', 'Rating', 'Review')):
    sheet.write(0, col, heading)

# Build the page links
# Single comment page (start=20); nothing in the visible script reads this name.
url = "https://movie.douban.com/subject/30163509/comments?start=20&limit=20&sort=new_score&status=P"
# One URL per start offset: 0, 20, 40, 60 and 80.
urls = list(map(
    "https://movie.douban.com/subject/30163509/comments?start={}&limit=20&sort=new_score&status=P".format,
    range(0, 100, 20),
))

# Lookup table turning a rating label into a 1-5 numeric score
rate_map = {
    '力荐': 5,  # "highly recommended"
    '推荐': 4,  # "recommended"
    '还行': 3,  # "okay"
    '较差': 2,  # "poor"
    '很差': 1,  # "very poor"
}

# Main scraping routine
def get_reviews(url, index, timeout=10):
    """Scrape one comment page and append its rated reviews to ``sheet``.

    Sleeps two seconds (presumably to throttle consecutive requests),
    downloads ``url`` and writes one row per comment whose rating label is a
    key of ``rate_map``: user name in column 0, numeric rating in column 1
    and review text in column 2.

    Args:
        url: Address of the comment page to fetch.
        index: Row of the module-level ``sheet`` at which writing starts.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout a stalled connection would block forever.

    Returns:
        The row index following the last row written. Equal to ``index`` when
        nothing was written (no rated comments, or an HTTP error status).

    Raises:
        requests.exceptions.RequestException: If the request fails or times
            out.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
        # NOTE(review): hard-coded browser session cookie. It is likely stale
        # and should not live in source control; consider loading it from
        # configuration instead.
        'Cookie': 'bid=eW42c3smMr8; __yadk_uid=3raqDcGLrLfJEWbgxVrvRv8Ck1LF0oVI; ll="118282"; _vwo_uuid_v2=D2264B87A31A89AC8205792ADB5A0CAF8|eec96c21ac819e4bcddf5bfe38d57fee; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1556782088%2C%22https%3A%2F%2Fwww.google.com.hk%2F%22%5D; _pk_ses.100001.4cf6=*; __utma=30149280.932586090.1556681751.1556713635.1556782088.4; __utmb=30149280.0.10.1556782088; __utmc=30149280; __utmz=30149280.1556782088.4.3.utmcsr=google|utmccn=(organic)|utmcmd=organic|utmctr=(not%20provided); __utma=223695111.1165095227.1556681751.1556713635.1556782088.4; __utmb=223695111.0.10.1556782088; __utmc=223695111; __utmz=223695111.1556782088.4.3.utmcsr=google|utmccn=(organic)|utmcmd=organic|utmctr=(not%20provided); ap_v=0,6.0; _pk_id.100001.4cf6=7db113e9a9cc4d8b.1556681751.4.1556782104.1556716214.'
    }

    time.sleep(2)

    wb_data = requests.get(url, headers=headers, timeout=timeout)
    if not wb_data.ok:
        # An error page contains no comments; report it instead of silently
        # parsing it as an empty result.
        print('Skipping {}: HTTP {}'.format(url, wb_data.status_code))
        return index
    soup = BeautifulSoup(wb_data.text, 'lxml')

    # The three selections are paired positionally below, so they are relied
    # upon to line up one-to-one; zip() stops at the shortest of them.
    users = soup.select("span.comment-info > a")
    rates = soup.select("span.comment-info > span:nth-of-type(2)")
    reviews = soup.select("#comments > div > div.comment > p > span")

    for user, rate, review in zip(users, rates, reviews):
        # Spans whose title is not a known rating label are skipped.
        rating = rate_map.get(rate.get('title'))
        if rating is None:
            continue
        sheet.write(index, 0, user.get_text())
        sheet.write(index, 1, rating)
        sheet.write(index, 2, review.get_text())
        index += 1
    return index

# Scrape every page in turn; each call continues at the row where the
# previous page stopped.
index = 1  # row 0 holds the header

try:
    for single_url in urls:
        index = get_reviews(single_url, index)
        print(single_url)
finally:
    # Save as an Excel (.xls) file. Done in ``finally`` so rows already
    # scraped are still written out if a later page raises.
    workbook.save('E://test2.xls')

你可能感兴趣的:(爬虫实战)