爬取猫眼电影并存入 MySQL

爬取猫眼电影并存入 MySQL

from urllib import request
import re
import time
import random
import pymysql


class MaoyanSpider(object):
    """Scrape the Maoyan board pages and store each film in MySQL.

    Every board page is downloaded, the film records are extracted with a
    regular expression, and the resulting rows are inserted into the
    ``filmset`` table of the ``maoyandb`` database.
    """

    # NOTE(review): the HTML tags inside this pattern were lost in the copy
    # of the source this class was restored from; only the capture groups
    # survived.  The tags below are a reconstruction chosen so that the three
    # groups yield (title, stars, release time), which is what write_sql()
    # consumes.  Confirm against the live page markup before relying on it.
    FILM_PATTERN = re.compile(
        r'<div class="movie-item-info">.*?title="(.*?)".*?'
        r'<p class="star">(.*?)</p>.*?'
        r'<p class="releasetime">(.*?)</p>',
        re.S,
    )

    def __init__(self):
        """Set up the request settings and open the MySQL connection."""
        self.base_url = 'https://maoyan.com/board/4?offset={}'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36'
        }
        # 1-based counter used only for the progress message in main().
        self.page = 1
        # Keyword arguments are required: PyMySQL 1.0+ no longer accepts
        # positional connection parameters.
        self.db = pymysql.connect(
            host='localhost',
            user='root',
            password='123456',
            database='maoyandb',
            charset='utf8',
        )
        self.cursor = self.db.cursor()

    def get_pages(self, url):
        """Download ``url``, decode it as UTF-8 and hand it to parse_page()."""
        req = request.Request(url, headers=self.headers)
        # The context manager closes the HTTP response even if read() or
        # decode() raises.
        with request.urlopen(req) as res:
            html = res.read().decode('utf-8')
        self.parse_page(html)

    @classmethod
    def _extract_films(cls, html):
        """Return the raw (title, stars, release time) tuples found in ``html``."""
        return cls.FILM_PATTERN.findall(html)

    @staticmethod
    def _build_rows(results):
        """Turn raw regex tuples into stripped ``[title, stars, date]`` rows.

        The release-time field is sliced with ``[5:15]``, i.e. the five
        leading label characters are dropped and the next ten characters
        (a ``YYYY-MM-DD`` date) are kept.
        """
        return [
            [film[0].strip(), film[1].strip(), film[2].strip()[5:15]]
            for film in results
        ]

    def parse_page(self, html):
        """Extract every film record from ``html`` and store it via write_sql()."""
        results = self._extract_films(html)
        self.write_sql(results)

    def write_sql(self, results):
        """Insert the extracted film tuples into the ``filmset`` table.

        ``results`` is an iterable of 3-tuples as produced by the film regex.
        Nothing is sent to the database when there are no rows.
        """
        data_list = self._build_rows(results)
        if not data_list:
            return
        ins = 'insert into filmset values(%s,%s,%s)'
        self.cursor.executemany(ins, data_list)
        self.db.commit()

    def main(self):
        """Crawl offsets 0, 10, 20, 30 and 40, then release the DB resources."""
        try:
            # range() generates the values of the ``offset`` query parameter.
            for offset in range(0, 41, 10):
                url = self.base_url.format(str(offset))
                self.get_pages(url)
                print('第%d页爬取成功' % self.page)
                self.page += 1
                # Random pause between requests.
                time.sleep(random.randint(1, 2))
        finally:
            # Close the cursor and connection even when a page fails.
            self.cursor.close()
            self.db.close()


if __name__ == '__main__':
    spider = MaoyanSpider()
    spider.main()

你可能感兴趣的:(爬取猫眼电影存入mysql)