# 2019-12-24

#! /usr/bin/python3

# -*- coding: UTF-8 -*-

import requests

from lxml import etree

# Scrape the Douban "Top 250" movie listing page by page and write one line
# per movie (title, info text, score, quote) to xiaoshuo.txt, echoing each
# line to stdout as it is written.

url = 'https://movie.douban.com/top250?'

headers = {
    # Browser-like User-Agent sent with every request.
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36",
}

# The listing is requested as PAGE_COUNT pages, each offset by PAGE_SIZE.
PAGE_COUNT = 10
PAGE_SIZE = 25

# Seconds to wait for the server before giving up on a request.
REQUEST_TIMEOUT = 10

# XPath selecting every movie entry (<li>) on a listing page.
ITEM_XPATH = '//*[@id="content"]/div/div[1]/ol/li'

# XPaths evaluated relative to a single movie entry.
TITLE_XPATH = './div/div[2]/div[1]/a/span[1]/text()'
INFO_XPATH = './div/div[2]/div[2]/p[1]/text()'
SCORE_XPATH = './div/div[2]/div[2]/div/span[2]/text()'
QUOTE_XPATH = './div/div[2]/div[2]/p[2]/span/text()'

# 'w' creates the file when it is missing and truncates any previous run's
# output; the context manager closes the file even if a request fails.
with open('xiaoshuo.txt', 'w', encoding="utf-8") as file:
    for page in range(PAGE_COUNT):
        new_url = url + 'start=' + str(page * PAGE_SIZE) + '&filter='

        response = requests.get(new_url, headers=headers, timeout=REQUEST_TIMEOUT)
        # Stop on an HTTP error instead of parsing an error page as results.
        response.raise_for_status()

        html = etree.HTML(response.text)

        # Query each field per entry rather than indexing page-wide lists by
        # position: an entry that lacks a quote node would otherwise shift
        # every later quote onto the wrong movie and end in an IndexError.
        for item in html.xpath(ITEM_XPATH):
            title_parts = item.xpath(TITLE_XPATH)
            if not title_parts:
                # Nothing usable to label the line with; skip this entry.
                continue

            info = ''.join(part.strip() for part in item.xpath(INFO_XPATH))
            # Take the first match if present, otherwise an empty string.
            score = ''.join(item.xpath(SCORE_XPATH)[:1])
            quote = ''.join(item.xpath(QUOTE_XPATH)[:1])

            str1 = title_parts[0] + ' ' + info + ' 评分为:' + score + ' "' + quote + '"\n'

            file.write(str1)
            print(str1)

print('电影爬取完毕')

# You may also be interested in: (2019-12-24)