python学习第四天

今天的主要任务是独立完成一个项目:自己动手爬取猫眼电影 Top100。具体实现如下,仅供参考。

  • 猫眼电影top100
import requests
from lxml import etree
def parse():
    """Scrape the Maoyan top-100 board and print one record per movie.

    Requests the ten board pages (``offset`` = 0, 10, ..., 90), reads three
    text fields from every ``<dd>`` entry under ``div.main`` and prints each
    record as a dict with the keys ``'name'``, ``'role'`` and ``'time'``.

    Returns:
        None. The records are only printed.

    Raises:
        requests.RequestException: if a page cannot be fetched, including
            the case where the server does not answer within the timeout.
    """
    headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36"}

    def first_text(node, path):
        """Return the first result of XPath *path* on *node*, or '' if none."""
        found = node.xpath(path)
        return found[0] if found else ''

    movie_info_list = []
    for offset in range(0, 91, 10):
        url = "https://maoyan.com/board/4?offset={}".format(offset)
        # Without a timeout a stalled connection would block the script forever.
        resp = requests.get(url, headers=headers, timeout=10)
        resp.encoding = 'utf-8'
        html = etree.HTML(resp.text)
        if html is None:
            # The body contained nothing parseable; skip this page instead of
            # failing on the .xpath() call below.
            continue
        for movie in html.xpath("//div[@class='main']/dl/dd"):
            # Movie title
            name = first_text(movie, "./div/div/div[1]/p[1]/a/text()")
            # Cast line, with the label prefix and surrounding whitespace removed.
            # NOTE(review): the prefixes 'role:' / 'time:' are kept exactly as
            # in the original; confirm they match the labels on the live page.
            role = first_text(movie, "./div/div/div[1]/p[2]/text()")
            role = role.replace('role:', '').strip()
            # Release line, with the label prefix removed
            release = first_text(movie, "./div/div/div[1]/p[3]/text()")
            release = release.replace('time:', '')
            movie_info_list.append({
                'name': name,
                'role': role,
                'time': release
            })
    for movie_info in movie_info_list:
        print(movie_info)
# Run the Maoyan scraper; this call sits at module level, so it fires as soon
# as the script is executed.
parse()

图片和详情链接的爬取在这里并没有给出,下面我会给出包含这两部分的
豆瓣网爬取案例。

  • 豆瓣电影top250
import requests
from lxml import etree
import pandas as pd
def parse():
    """Scrape Douban's top-250 movie list, download posters and write a CSV.

    Steps:
      1. Fetch the ten list pages (``start`` = 0, 25, ..., 225).
      2. For every ``<li>`` entry collect the fields ``serial_number``,
         ``movie_name``, ``introduce``, ``star``, ``evalute``, ``describe``,
         ``detail_link`` and ``img_url`` (missing fields become ``''``).
      3. Print each record and, when the image request returns HTTP 200,
         save the poster under ``./imgs/`` using the zero-padded rank as
         file name (``0000001.jpg``, ``0000010.jpg``, ...).
      4. Write all records to ``douban_top250_info.csv`` with pandas.

    Returns:
        None.

    Raises:
        requests.RequestException: if a request fails or times out.
        OSError: if the image directory or an image file cannot be written.
    """
    import os  # needed only here, for creating the image directory

    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"}
    # Seconds to wait for the server; without it a stalled connection would
    # block the script forever.
    timeout = 10
    img_dir = './imgs'

    def first(node, path):
        """Return the first result of XPath *path* on *node*, or '' if none."""
        found = node.xpath(path)
        return found[0] if found else ''

    # One dict per movie: [{...}, {...}, ...]
    movie_info_list = []

    # Walk through every list page.
    for start in range(0, 226, 25):
        url = 'https://movie.douban.com/top250?start={}&filter='.format(start)

        # Parse the raw response bytes.
        resp = requests.get(url, headers=headers, timeout=timeout)
        html = etree.HTML(resp.content)
        if html is None:
            # Nothing parseable came back; skip the page instead of failing
            # on the .xpath() call below.
            continue

        for movie in html.xpath('//div[@id="content"]//ol/li'):
            # Intro line, with surrounding whitespace removed
            introduce = first(movie, './div[@class="item"]/div[@class="info"]/div[@class="bd"]/p[1]/text()').strip()
            # Vote text with the '人评价' suffix removed
            evalute = first(movie, './div[@class="item"]/div[@class="info"]/div[@class="bd"]/div[@class="star"]/span[4]/text()')
            evalute = evalute.replace('人评价', '')

            # Key order is preserved because it determines the CSV columns.
            movie_info_list.append({
                'serial_number': first(movie, './div[@class="item"]/div[@class="pic"]/em/text()'),
                'movie_name': first(movie, './div[@class="item"]/div[@class="info"]/div[@class="hd"]/a/span[1]/text()'),
                'introduce': introduce,
                'star': first(movie, './div[@class="item"]/div[@class="info"]/div[@class="bd"]/div[@class="star"]/span[2]/text()'),
                'evalute': evalute,
                'describe': first(movie, './div[@class="item"]/div[@class="info"]/div[@class="bd"]/p[@class="quote"]/span[@class="inq"]/text()'),
                'detail_link': first(movie, './div[@class="item"]/div[@class="pic"]/a/@href'),
                'img_url': first(movie, './div[@class="item"]/div[@class="pic"]/a/img/@src'),
            })

    # open() does not create missing directories, so make sure it exists.
    os.makedirs(img_dir, exist_ok=True)

    for movie_info in movie_info_list:
        print(movie_info)
        if not movie_info['img_url']:
            # No poster URL was found for this entry; nothing to download.
            continue
        resp = requests.get(movie_info['img_url'], headers=headers, timeout=timeout)
        if resp.status_code == 200:
            # Fixed-width, zero-padded file name (e.g. 0000001.jpg) so that
            # ranks 10 and above keep the same 7-digit width as ranks 1-9.
            img_name = '{}.jpg'.format(movie_info['serial_number'].zfill(7))
            with open(os.path.join(img_dir, img_name), 'wb') as f:
                f.write(resp.content)

    # Save everything as CSV.
    df = pd.DataFrame(movie_info_list)
    df.to_csv('douban_top250_info.csv')

# Run the Douban scraper; this call sits at module level, so it fires as soon
# as the script is executed.
parse()

csv文件是用逗号分隔开的一种文件。具体解释如下:
逗号分隔值(Comma-Separated Values,CSV,有时也称为字符分隔值,因为分隔字符也可以不是逗号),其文件以纯文本形式存储表格数据(数字和文本)。纯文本意味着该文件是一个字符序列,不含必须像二进制数字那样被解读的数据。CSV文件由任意数目的记录组成,记录间以某种换行符分隔;每条记录由字段组成,字段间的分隔符是其它字符或字符串,最常见的是逗号或制表符。通常,所有记录都有完全相同的字段序列。通常都是纯文本文件。建议使用WORDPAD或是记事本来开启,再则先另存新档后用EXCEL开启,也是方法之一。

CSV文件格式的通用标准并不存在,但是在RFC 4180中有基础性的描述。使用的字符编码同样没有被指定,但是7-bit ASCII是最基本的通用编码。

你可能感兴趣的:(python学习第四天)