Python 2-2(2019-10-29)中(优化豆瓣查询即将上映电影)

import os

import pandas as pd
import requests
from lxml import html
from xpinyin import Pinyin

# pip install xpinyin
def _first_or_default(nodes, default='没有查询到数据'):
    """Return the first XPath match in *nodes*, or *default* if there is none."""
    return nodes[0] if nodes else default


def spider(city):
    """Scrape Douban's "coming soon" listing for *city* and save the results.

    The city name is converted to pinyin to build the listing URL. Each movie
    card found under the ``showing-soon`` container is reduced to a dict with
    the keys ``movie_name``, ``date``, ``type``, ``country``, ``want_see`` and
    ``img_link``. The dicts are printed in ascending ``want_see`` order, each
    poster is downloaded into ``./douban_img/``, and the full table is written
    to ``<pinyin>douban_movie_info.csv`` (path relative to the working
    directory).

    Args:
        city: City name as typed by the user (Chinese characters are
            transliterated by ``xpinyin``).

    Raises:
        requests.RequestException: If the listing page or a poster cannot be
            fetched (including timeouts).
    """
    # `splitter` is the string placed between syllables; its default is '-',
    # so pass '' to get a single unbroken slug for the URL.
    city_pinyin = Pinyin().get_pinyin(city, splitter='')
    url = 'https://movie.douban.com/cinema/later/{}/'.format(city_pinyin)
    print('您要查询的目标站点是',url)
    print('爬虫进行中,请稍后......')

    # Send a browser-like User-Agent with every request.
    headers = {"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'}
    # A timeout keeps the script from hanging forever on a stalled connection.
    response = requests.get(url, headers=headers, timeout=10)

    selector = html.fromstring(response.text)
    div_list = selector.xpath('//div[@id="showing-soon"]/div')
    print('您好,{}市有{}几部电影即将上映'.format(city,len(div_list)))

    movie_info_list = []
    for div in div_list:
        # "Want to see" counter, e.g. '1234人想看'. Fall back to 0 when the
        # element is missing or not numeric so the sort below still works.
        want_see_text = _first_or_default(
            div.xpath('div[1]/ul/li[4]/span/text()'), '')
        digits = want_see_text.replace('人想看', '').strip()
        want_see = int(digits) if digits.isdigit() else 0

        movie_info_list.append({
            # Take the first match so the name is a string, not a list.
            "movie_name": _first_or_default(div.xpath('div[1]/h3/a/text()')),
            "date": _first_or_default(div.xpath('div[1]/ul/li[1]/text()')),
            "type": _first_or_default(div.xpath('div[1]/ul/li[2]/text()')),
            "country": _first_or_default(div.xpath('div[1]/ul/li[3]/text()')),
            "want_see": want_see,
            # Empty string marks "no poster"; the download loop skips it.
            "img_link": _first_or_default(div.xpath('a/img/@src'), ''),
        })

    movie_info_list.sort(key=lambda x: x['want_see'])

    # Make sure the poster directory exists before writing into it.
    img_dir = './douban_img'
    os.makedirs(img_dir, exist_ok=True)

    for movie in movie_info_list:
        print(movie)
        if not movie['img_link']:
            continue
        # Path separators in a title would otherwise be treated as directories.
        safe_name = movie['movie_name'].replace('/', '_').replace('\\', '_')
        # Fetch first so a failed download does not leave an empty file behind.
        image = requests.get(movie['img_link'], headers=headers, timeout=10)
        with open('{}/{}.jpg'.format(img_dir, safe_name), 'wb') as f:
            f.write(image.content)

    pd.DataFrame(movie_info_list).to_csv('{}douban_movie_info.csv'.format(city_pinyin))

# Prompt for the city whose upcoming releases should be listed. Surrounding
# whitespace is stripped so stray spaces typed by the user are not passed on
# to spider().
city = input('输入您要查看即将上映电影信息的城市').strip()
# Run the scraper for that city.
spider(city)

你可能感兴趣的:(Python 2-2(2019-10-29)中(优化豆瓣查询即将上映电影))