# 记录:爬取《猫眼电影》榜单的影名、主演、上映时间的代码

import time

from lxml import etree

import requests

import pymysql

class MaoYanSpider(object):
    """Scrape one page of a Maoyan movie board and store it in MySQL.

    Every board entry is reduced to a ``(name, star, time)`` tuple and
    inserted into the ``filmtab`` table of the ``maoyandb`` database.
    """

    # Seconds to wait for the HTTP server before giving up. Without a
    # timeout a stalled connection would block the script indefinitely.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        """Set up the URL template and request headers and connect to MySQL."""
        # ``{}`` is filled with the zero-based entry offset of the page.
        self.url = "https://www.maoyan.com/board/4?offset={}"
        # Anti-scraping measures may reject these headers; if that happens,
        # swap in a fresh set.
        self.headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            # 'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'no-cache',
            'Connection': 'keep-alive',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36',
            'referer': 'https://passport.meituan.com/',
            'Cookie': '__mta=42753434.1633656738499.1634781127005.1634781128998.34; uuid_n_v=v1; _lxsdk_cuid=17c5d879290c8-03443510ba6172-6373267-144000-17c5d879291c8; uuid=60ACEF00317A11ECAAC07D88ABE178B722CFA72214D742A2849B46660B8F79A8; _lxsdk=60ACEF00317A11ECAAC07D88ABE178B722CFA72214D742A2849B46660B8F79A8; _csrf=94b23e138a83e44c117736c59d0901983cb89b75a2c0de2587b8c273d115e639; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1634716251,1634716252,1634719353,1634779997; Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1634781129; _lxsdk_s=17ca07b2470-536-b73-84%7C%7C12'
        }
        # self.proxies = {'http':'http://121.41.122.215','https':'121.41.122.215'}
        # Connect to the database. The charset is spelled out so that the
        # Chinese titles and cast names do not depend on whatever default
        # the installed driver or server happens to use.
        self.db = pymysql.connect(
            host="127.0.0.1",
            user='root',
            password='',
            database='maoyandb',
            charset='utf8mb4',
        )
        # Cursor used to execute the SQL statements.
        self.cursor = self.db.cursor()

    @staticmethod
    def _first_text(texts, start=0, stop=None):
        """Return the first string of *texts*, stripped and then sliced.

        Args:
            texts: List of strings as returned by an XPath ``text()`` query.
            start: Slice start applied after stripping (skips a label prefix).
            stop: Slice end applied after stripping; ``None`` means "to the end".

        Returns:
            The cleaned text, or ``None`` when *texts* is empty so that a
            missing field does not raise ``IndexError``.
        """
        if not texts:
            return None
        return texts[0].strip()[start:stop]

    def get_html(self, url):
        """Download *url*, extract the board entries and save them.

        Args:
            url: Fully formatted board page URL.

        Raises:
            requests.RequestException: If the request fails, times out, or
                the server answers with an HTTP error status.
        """
        response = requests.get(
            url=url, headers=self.headers, timeout=self.REQUEST_TIMEOUT
        )
        # Fail loudly on 4xx/5xx instead of parsing an error page as data.
        response.raise_for_status()

        tree = etree.HTML(response.text)
        # The parser may hand back None for an empty body; treat that as
        # "no entries" rather than failing on ``None.xpath``.
        r_list = [] if tree is None else tree.xpath('//dl[@class="board-wrapper"]/dd')

        items = []
        for dd in r_list:
            name = self._first_text(dd.xpath('.//p[@class="name"]/a/text()'))
            if name is None:
                # An entry without a title carries no usable information.
                continue
            items.append((
                name,
                # [3:] drops the three-character label in front of the cast.
                self._first_text(dd.xpath('.//p[@class="star"]/text()'), 3),
                # [5:15] drops the five-character label and keeps the next
                # ten characters (the date part).
                self._first_text(dd.xpath('.//p[@class="releasetime"]/text()'), 5, 15),
            ))

        if not items:
            # Typically means the site served a verification page instead.
            print("未解析到任何影片数据,可能触发了反爬验证")
        self.save_html(items)

    def save_html(self, items):
        """Insert *items* into ``filmtab`` as one transaction.

        Args:
            items: Sequence of ``(name, star, time)`` tuples.

        Any error during the insert rolls the transaction back and is
        printed rather than raised, so a failed insert does not abort the
        run.
        """
        if not items:
            return
        sql = 'insert into filmtab(name,star,time)values(%s,%s,%s)'
        try:
            self.cursor.executemany(sql, items)
            self.db.commit()
        except Exception as e:
            self.db.rollback()
            print(str(e.args))

    def run(self):
        """Ask for a 1-based page number, scrape that page, then disconnect.

        The cursor and the connection are closed even when reading the
        page number, the request or the parsing raises.
        """
        try:
            offset = int(input("请输入页码:"))
            # Each page holds 10 entries; the site expects an entry offset.
            url = self.url.format((offset - 1) * 10)
            self.get_html(url)
        finally:
            # Release the database resources.
            try:
                self.cursor.close()
            finally:
                self.db.close()

if __name__ == '__main__':
    # perf_counter() is a monotonic clock intended for measuring intervals;
    # time.time() can jump if the system clock is adjusted mid-run.
    # The measured span includes the time spent waiting for the page-number
    # prompt inside run().
    start = time.perf_counter()
    spider = MaoYanSpider()
    spider.run()
    end = time.perf_counter()

    print("数据抓取完毕,总耗时:%.2f" % (end - start))

# 你可能感兴趣的:(python,爬虫)