requests配合etree爬取豆瓣电影信息

背景

爬虫最常见的爬取方式就是 requests + etree 了，下面用豆瓣电影的信息来做一个简单案例。

url: https://movie.douban.com/cinema/later/beijing/

代码

import requests
from lxml import etree
import pandas as pd
class dangdang_home(object):
    """Scraper for the Douban "coming soon" movie listing configured in ``url``.

    NOTE: the class name is kept as-is for backward compatibility even though
    the page being scraped is on movie.douban.com.
    """

    def __init__(self):
        # Listing page downloaded by ``spider``.
        self.url = 'https://movie.douban.com/cinema/later/beijing/'

    def spider(self):
        """Download ``self.url`` and return it parsed as an lxml HTML tree.

        Raises:
            requests.RequestException: on connection failure, timeout, or a
                non-success HTTP status.
        """
        # Browser identification header sent with the request.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
        }
        # Without a timeout requests may wait forever on a stalled connection.
        resp = requests.get(self.url, headers=headers, timeout=10)
        # Fail loudly instead of silently parsing an HTTP error page.
        resp.raise_for_status()
        return etree.HTML(resp.text)

    @staticmethod
    def _wanted_count(text):
        """Return the leading run of ASCII digits in ``text`` (may be empty).

        The page text is presumably of the form "<number>人想看"; reading the
        digits directly avoids depending on the exact length of the suffix.
        """
        digits = []
        for ch in text.strip():
            if ch not in "0123456789":
                break
            digits.append(ch)
        return "".join(digits)

    def parse_data(self, data):
        """Extract one record per movie from the parsed listing page.

        Args:
            data: parsed HTML tree as returned by ``spider`` (any object with
                a compatible ``xpath`` method), or ``None``.

        Returns:
            A list of dicts with the string-valued keys ``title``, ``time``,
            ``type``, ``country`` and ``wanted``. ``wanted`` holds only the
            digits of the "want to see" count. Entries whose markup lacks any
            of the expected fields are skipped.
        """
        movie_info = []
        if data is None:
            return movie_info

        for movie in data.xpath('//*[@id="showing-soon"]/div/div'):
            titles = movie.xpath('h3/a/text()')
            info_list = movie.xpath('ul/li/text()')
            wanted_texts = movie.xpath('ul/li[4]/span/text()')

            # Skip incomplete entries rather than aborting the whole parse
            # with an IndexError.
            if not titles or len(info_list) < 3 or not wanted_texts:
                continue

            # The first three <li> texts are release time, genre and country.
            release_time, genre, country = info_list[:3]

            movie_info.append({
                "title": titles[0],
                "time": release_time,
                "type": genre,
                "country": country,
                "wanted": self._wanted_count(wanted_texts[0]),
            })

        return movie_info

def _wanted_as_int(movie):
    """Sort key: the record's "wanted" count as an int, or 0 if not numeric."""
    try:
        return int(movie['wanted'])
    except (KeyError, TypeError, ValueError):
        # A malformed count should not abort the run; sort it last instead.
        return 0


def main():
    """Scrape the listing, print it most-wanted first, and save movie.csv."""
    scraper = dangdang_home()
    movie_list = scraper.parse_data(scraper.spider())

    # Sort by "want to see" count, highest first.
    movie_list = sorted(movie_list, key=_wanted_as_int, reverse=True)

    for movie in movie_list:
        print(movie)

    # Store the records in a CSV file in the current working directory.
    df = pd.DataFrame(movie_list)
    df.to_csv("movie.csv")


if __name__ == "__main__":
    main()

 

你可能感兴趣的:(爬虫)