爬取猫眼电影存入 MongoDB

爬取猫眼电影存入 MongoDB

from urllib import request
import time
import re
import pymongo

class MaoyanSpider(object):
    """Scrape the Maoyan board pages and store each film in MongoDB.

    Each board page is fetched over HTTP, the film title, cast line and
    release line are pulled out with a regular expression, and one document
    per film is inserted into the ``filmset`` collection of the ``maoyandb``
    database.
    """

    # Compiled once for all pages. re.S lets ``.`` match newlines, because the
    # three captured fields sit on different lines of the page markup.
    # NOTE(review): the markup anchors (`<div class="movie-item-info">` and the
    # closing `</p>` tags) assume the Maoyan board layout - confirm against
    # the live page before relying on the results.
    _FILM_PATTERN = re.compile(
        r'<div class="movie-item-info">.*?title="(.*?)"'
        r'.*?class="star">(.*?)</p>'
        r'.*?releasetime">(.*?)</p>',
        re.S,
    )

    def __init__(self):
        """Set up the request settings and open the MongoDB collection."""
        self.baseurl = 'https://maoyan.com/board/4?offset='
        self.headers = {
            'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36'
        }
        # Counter of pages crawled so far (1-based, used in progress output).
        self.page = 1
        self.conn = pymongo.MongoClient('localhost', 27017)
        self.db = self.conn['maoyandb']
        self.myset = self.db['filmset']

    def get_page(self, url):
        """Download ``url`` and pass the decoded HTML to ``parse_page``.

        Args:
            url: Full URL of the board page to fetch.
        """
        req = request.Request(url, headers=self.headers)
        # The context manager closes the HTTP response even if reading or
        # decoding fails.
        with request.urlopen(req) as res:
            html = res.read().decode('utf-8')
        # Hand the page straight to the parser.
        self.parse_page(html)

    def parse_page(self, html):
        """Extract the film records from ``html`` and store them.

        Args:
            html: Decoded HTML text of one board page.
        """
        # r_list looks like [(title, cast, release), ...]; it is empty when
        # nothing on the page matches.
        r_list = self._FILM_PATTERN.findall(html)
        self.write_mongo(r_list)

    def write_mongo(self, r_list):
        """Insert one MongoDB document per film record.

        Args:
            r_list: Iterable of ``(title, cast, release)`` string sequences;
                surrounding whitespace is stripped from every field.
        """
        for rt in r_list:
            film = {
                '名称': rt[0].strip(),
                '主演': rt[1].strip(),
                '时间': rt[2].strip(),
            }
            self.myset.insert_one(film)

    def main(self):
        """Crawl the board pages at offsets 0, 10, 20, 30 and 40."""
        # range() generates the successive values of the ``offset`` query
        # parameter.
        for offset in range(0, 41, 10):
            url = self.baseurl + str(offset)
            self.get_page(url)
            print('第%d页爬取成功' % self.page)
            self.page += 1
            # Pause between requests.
            time.sleep(1)


if __name__ == '__main__':
    spider = MaoyanSpider()
    spider.main()

你可能感兴趣的:(爬取猫眼电影存入 MongoDB)