猫眼电影top100爬取
我们要提取的信息是电影名称、演员、排名、上映日期和图片链接。
由于这个页面是静态页面,页面分析就不多说了,直接附上源码。
#-*- coding:utf8-*-
"""
@author:Administrator
@file: seleniumStu_2.py
@time: 2019/02/{DAY}
"""
import logging
from urllib.parse import urljoin

import pymongo
import requests
from lxml import etree

# Purpose of this module: scrape the Maoyan Top 100 movie ranking.

# Root logger configuration: timestamp, level and message, at DEBUG verbosity.
LOG_FORMAT = "%(asctime)s - %(levelname)s - %(message)s"
logging.basicConfig(level=logging.DEBUG, format=LOG_FORMAT)
def myrequest(url):
    """Download a page and return its body text.

    :param url: absolute URL of the page to fetch.
    :return: ``response.text`` when the server answers with HTTP 200;
        ``None`` when the request fails (timeout, connection error, ...)
        or when any other status code is returned.
    """
    header = {
        'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                       '(KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36')
    }
    logging.debug('开始请求页面 %s', url)
    try:
        response = requests.get(url, headers=header, timeout=3)
    except (requests.exceptions.RequestException, TimeoutError) as exc:
        # requests reports timeouts and connection problems through its own
        # exception hierarchy, so catching only the builtin TimeoutError
        # would let them escape. The builtin is kept for compatibility.
        logging.warning('request for %s failed: %s', url, exc)
        return None
    if response.status_code == 200:
        return response.text
    # Make the "not OK" outcome explicit instead of falling off the end.
    logging.warning('request for %s returned HTTP %s', url, response.status_code)
    return None
def _strip_label(text):
    """Return the part of *text* that follows the last colon.

    NOTE(review): labels on Chinese pages are often followed by a full-width
    colon (U+FF1A) rather than an ASCII one; both are accepted here so the
    label is removed either way - confirm against the live markup.
    """
    return text.replace('\uff1a', ':').split(':')[-1]


def parse(html):
    """Extract the movie entries from one ranking page.

    :param html: HTML text of a ranking page. ``None`` or an empty string
        (for example after a failed download) yields nothing.
    :return: generator of dicts with the keys ``Level``, ``MoviceName``,
        ``Director``, ``ShowTime`` and ``ImageUrl``.
    :raises KeyError: if an image element has no ``data-src`` attribute.
    """
    if not html:
        # Nothing to parse; end the generator instead of handing None to lxml.
        return
    page = etree.HTML(html)
    levels = page.xpath('//dd/i/text()')
    names = page.xpath('//p[@class="name"]/a/text()')
    stars = page.xpath('//p[@class="star"]/text()')
    show_times = page.xpath('//p[@class="releasetime"]/text()')
    images = page.xpath('//*[@class="board-img"]')

    columns = (levels, names, stars, show_times, images)
    if len({len(column) for column in columns}) > 1:
        # zip() below stops at the shortest list; report the mismatch rather
        # than failing with an IndexError half-way through the page.
        logging.warning('field counts differ on this page: %s',
                        [len(column) for column in columns])

    for level, name, star, show_time, image in zip(*columns):
        # The star text is wrapped in layout whitespace; drop it before
        # removing the label in front of the names.
        star_text = star.replace('\t\n\n', '').replace('\n', '')
        yield {
            'Level': level,
            'MoviceName': name,
            'Director': _strip_label(star_text).strip(),
            'ShowTime': _strip_label(show_time),
            'ImageUrl': image.attrib['data-src'],
        }
def save_to_mongodb(item, db):
    """Store one scraped record in the ``maoyantop100`` collection.

    :param item: dict describing a single movie.
    :param db: database handle that returns a collection when indexed by
        name (``db['maoyantop100']``).
    :return: None
    """
    # Collection.insert() was deprecated in PyMongo 3 and removed in
    # PyMongo 4; insert_one() is the supported call for a single document.
    db['maoyantop100'].insert_one(item)
def closed(client):
    """Close the client connection and log that the crawl has finished.

    :param client: object exposing a ``close()`` method (the MongoDB client).
    :return: None
    """
    client.close()
    # Two debug lines, emitted in this order after the connection is closed.
    for message in (
        '页面爬取完成,信息导入mongodb... ...',
        'mongodb数据库连接已关闭... ...',
    ):
        logging.debug(message)
def main(client, db):
    """Crawl the ten ranking pages and store every movie in MongoDB.

    :param client: MongoDB client; it is closed before this function returns
        or raises.
    :param db: database handle passed on to ``save_to_mongodb``.
    :return: None
    """
    url = 'https://maoyan.com/board/4'
    try:
        for page in range(10):
            # Each page lists ten movies; the offset selects the page.
            full_url = urljoin(url, '?offset=' + str(page * 10))
            html = myrequest(full_url)
            if html is None:
                # A failed download must not reach the parser.
                logging.warning('skipping %s: no page content', full_url)
                continue
            for item in parse(html):
                save_to_mongodb(item, db)
    except Exception:
        # Release the connection on failure, without the "crawl finished"
        # log lines that closed() emits, then let the error propagate.
        client.close()
        raise
    closed(client)
if __name__ == '__main__':
    # Connection settings for the local MongoDB instance.
    mongo_url = 'mongodb://localhost:27017'
    mongo_db = 'MaoYanTop100'
    client = pymongo.MongoClient(mongo_url)
    # Select the database through the configured name instead of repeating
    # the literal, so the setting above is the single source of truth.
    db = client[mongo_db]
    main(client, db)