2019-03-06

猫眼电影top100爬取

我们要提取的信息是:电影名称、演员、排名、上映日期、图片链接。

由于这个页面是静态页面,页面分析就不多说了,直接附上源码。

# -*- coding: utf-8 -*-

"""

@author:Administrator

@file: seleniumStu_2.py

@time: 2019/02/{DAY}

"""

import logging
from urllib.parse import urljoin

import pymongo
import requests
from lxml import etree

# Purpose of this module: crawl the Maoyan Top 100 movie ranking.

LOG_FORMAT = "%(asctime)s - %(levelname)s - %(message)s"

# Configure the root logger at import time; DEBUG level so the per-page
# request messages emitted by this script are shown.
logging.basicConfig(level=logging.DEBUG, format=LOG_FORMAT)

def myrequest(url):
    """Request a page and return its body text.

    :param url: URL of the page to fetch
    :return: ``response.text`` when the server answers with HTTP 200;
        ``None`` for any other status code or when the request fails
    """
    header = {
        'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                       '(KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36')
    }
    logging.debug('开始请求页面 %s', url)
    try:
        response = requests.get(url, headers=header, timeout=3)
    except (requests.RequestException, TimeoutError) as exc:
        # requests raises its own exception hierarchy for timeouts and
        # connection errors; the built-in TimeoutError alone does not cover
        # them, so catch the library's base class as well.
        logging.warning('请求页面失败 %s: %s', url, exc)
        return None
    if response.status_code == 200:
        return response.text
    logging.warning('请求页面 %s 返回状态码 %s', url, response.status_code)
    return None

def parse(html):
    """Parse one ranking page and yield one record per movie.

    :param html: page markup; ``None`` or an empty string yields nothing
    :return: generator of dicts with the keys ``Level``, ``MoviceName``,
        ``Director``, ``ShowTime`` and ``ImageUrl``
    """
    if not html:
        # The fetch step returns None for a failed request; there is nothing
        # to parse in that case.
        return
    page = etree.HTML(html)
    if page is None:
        return

    levels = page.xpath('//dd/i/text()')
    names = page.xpath('//p[@class="name"]/a/text()')
    stars = page.xpath('//p[@class="star"]/text()')
    release_times = page.xpath('//p[@class="releasetime"]/text()')
    images = page.xpath('//*[@class="board-img"]')

    counts = (len(levels), len(names), len(stars), len(release_times), len(images))
    if len(set(counts)) > 1:
        # The fields are matched purely by position, so unequal counts mean
        # the records below may be truncated or misaligned.
        logging.warning('页面字段数量不一致 %s', counts)

    # zip stops at the shortest list instead of raising IndexError mid-page.
    for level, name, star, release_time, image in zip(
            levels, names, stars, release_times, images):
        yield {
            'Level': level,
            'MoviceName': name,
            'Director': _after_last_colon(
                star.replace('\t\n\n', '').replace('\n', '')).strip(),
            'ShowTime': _after_last_colon(release_time),
            # .get() gives None when the attribute is absent instead of
            # raising KeyError.
            'ImageUrl': image.get('data-src'),
        }


def _after_last_colon(text):
    """Return what follows the last colon in *text*, or *text* if it has none.

    Both the ASCII colon and the full-width colon (U+FF1A) are recognised.
    NOTE(review): the live page may use the full-width form as the label
    separator -- confirm against the actual markup.
    """
    return text.replace('\uff1a', ':').split(':')[-1]

def save_to_mongodb(item, db):
    """Insert one record into the ``maoyantop100`` collection.

    :param item: document (dict) to store
    :param db: database handle supporting ``db[name]`` collection lookup
    :return: None
    """
    # Collection.insert() was deprecated in PyMongo 3 and removed in
    # PyMongo 4; insert_one() is the supported single-document call.
    db['maoyantop100'].insert_one(item)

def closed(client):
    """Close the client connection, then log two completion messages.

    :param client: object exposing ``close()``
    :return: None
    """
    client.close()
    for message in (
        '页面爬取完成,信息导入mongodb... ...',
        'mongodb数据库连接已关闭... ...',
    ):
        logging.debug(message)

def main(client, db):
    """Crawl the ten ranking pages and store every movie record.

    :param client: database client; it is closed before this function
        returns, including when an error interrupts the crawl
    :param db: database handle passed through to ``save_to_mongodb``
    :return: None
    """
    url = 'https://maoyan.com/board/4'
    try:
        # Pages are addressed through an ``offset`` query parameter that
        # advances in steps of 10 (0, 10, ..., 90).
        for offset in range(0, 100, 10):
            full_url = urljoin(url, '?offset=' + str(offset))
            html = myrequest(full_url)
            if html is None:
                # myrequest returns None when the page could not be fetched;
                # skip it rather than handing None to the parser.
                logging.warning('页面 %s 请求失败,已跳过', full_url)
                continue
            for item in parse(html):
                save_to_mongodb(item, db)
    finally:
        # Release the connection even if a page fails part-way through.
        closed(client)

if __name__ == '__main__':
    # Connection settings for the local MongoDB instance.
    mongo_url = 'mongodb://localhost:27017'
    mongo_db = 'MaoYanTop100'

    client = pymongo.MongoClient(mongo_url)
    # Use the configured name instead of repeating the literal, so the
    # setting above is the single place to change it.
    db = client[mongo_db]
    main(client, db)

你可能感兴趣的:(2019-03-06)