Scraping the Maoyan Movies Top 100 board with requests + XPath

[Figure 1: screenshot of the Maoyan Top 100 board page]

import json

import requests
from requests.exceptions import RequestException
from multiprocessing import Pool
from lxml import etree


# Download one page of the board
def get_one_page(url):
    """Fetch *url* and return its body decoded as UTF-8, or None on any failure."""
    # Browser-like headers to get past the site's basic anti-crawler checks
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36',
        'Referer': 'https://www.baidu.com'
    }
    try:
        resp = requests.get(url, headers=request_headers)
    except RequestException:
        # Network-level error: signal failure the same way as a bad status
        return None
    if resp.status_code != 200:
        return None
    # Decode explicitly rather than trusting requests' charset guess
    return resp.content.decode('utf-8')

# Persist one record as a JSON line
def save_txt(record):
    """Append *record* to maoyan_top100.txt as a single JSON line."""
    serialized = json.dumps(record, ensure_ascii=False)  # ensure_ascii=False keeps Chinese readable
    with open('maoyan_top100.txt', 'a', encoding='utf-8') as out:
        out.write(serialized + '\n')

# Parse one board page and persist every movie on it
def parse_one_page(html):
    """Extract each movie on a board page and save it as a JSON line.

    Pulls the title, starring cast, release date and promo image URL
    out of *html* with XPath and writes one dict per movie via save_txt().
    """
    tree = etree.HTML(html)
    # Movie title
    names = tree.xpath('//dd/a[1]/@title')
    # Starring cast
    stars = tree.xpath('//p[@class="star"]/text()')
    # Release date
    release_times = tree.xpath('//p[@class="releasetime"]/text()')
    # Promo image URL — presumably the second <img> carries the lazy-loaded
    # data-src attribute; TODO confirm against the live page markup
    image_links = tree.xpath('//dd/a/img[2]/@data-src')

    # Bug fix: the original indexed four parallel lists with
    # range(len(names)), which raises IndexError whenever one XPath
    # matches fewer nodes than the others (e.g. a partially rendered
    # page). zip() stops at the shortest list instead.
    for name, star, release, link in zip(names, stars, release_times, image_links):
        save_txt({
            'name': name.strip(),
            'protagonist': star.strip(),
            'time': release.strip(),
            'image_links': link.strip(),
        })

# Crawl a single results page identified by its pagination offset
def main(offset):
    """Fetch and parse the board page at the given *offset*."""
    page_url = 'https://maoyan.com/board/4?offset=' + str(offset)
    # Download the page (may be None on network failure)
    page_html = get_one_page(page_url)
    # Extract and persist every movie found on it
    parse_one_page(page_html)

if __name__ == '__main__':
    # NOTE(review): the original comment says a larger pool loses data —
    # plausibly because workers append to one file without locking, and
    # because the pool was never shut down cleanly (fixed below).
    pool = Pool(3)
    # Offsets 0, 10, ..., 90 cover the ten pages of the Top 100 board
    pool.map(main, [i * 10 for i in range(10)])
    # Bug fix: close the pool and wait for the workers so the interpreter
    # does not exit while writes are still in flight
    pool.close()
    pool.join()

Related topics: (spider)