# Maoyan Top-100 movie board scraper (爬虫 猫眼电影排行榜爬取代码)

from urllib import request, error, parse
import re, ssl
import pymysql
from fake_useragent import UserAgent


def maoyan_sipder():
    """Entry point: start crawling the Top-100 board from its first page."""
    first_page = 'https://maoyan.com/board/4?offset=0'
    start_request(first_page)


def start_request(url):
    """Fetch one board page, parse it, store the rows, and recurse to the next page.

    Pagination advances by bumping the ``offset`` query parameter by 10.
    Recursion stops when a page yields no movie rows; at that point the
    module-level DB handles (``cursor`` / ``mysql_client``) are closed.
    """
    headers = {
        # Rotate the User-Agent on every request to reduce blocking risk.
        "User-Agent": UserAgent().random
    }
    req = request.Request(url=url, headers=headers)
    # NOTE(review): disables TLS certificate verification via a private API —
    # acceptable for a throwaway scraper, not for production code.
    context = ssl._create_unverified_context()
    try:
        response = request.urlopen(req, context=context, timeout=10)
        if response.status == 200:
            print('请求成功')
            html_str = response.read().decode('utf-8')
            result = parse_data(html_str)
            if result:
                # Persist this page's rows before moving on.
                save_data_to_db(result)
                # Derive the next page from the current URL's offset parameter.
                current_url = response.url
                # Raw string: fixes the invalid '\d' escape in the original.
                pattern = re.compile(r'.*?offset=(\d+)')
                offset_result = re.findall(pattern, current_url)
                if offset_result:
                    offset = int(offset_result[0])
                    print('当前偏移量', offset)
                    next_url = 'https://maoyan.com/board/4?offset=' + str(offset + 10)
                    print(next_url)
                    start_request(next_url)
            else:
                # Empty page: we are past the last board page. Pause briefly,
                # then release the module-level DB resources and stop.
                import time
                time.sleep(5)
                cursor.close()
                mysql_client.close()
                print('没有数据了')
    except error.HTTPError as err:
        print(err.code, err.reason)
    except error.URLError as err:
        print(err.reason)


def parse_data(html_str):
    """Extract the movie rows from one board page of HTML.

    Returns a list of 7-tuples:
    (rank, cover image URL, title, actor line, release-time line,
     integer part of the score, fractional part of the score).

    NOTE(review): the original regex was destroyed by text extraction (its
    HTML tags were stripped); this pattern is reconstructed from the seven
    fields consumed by save_data_to_db and the Maoyan board page structure.
    """
    # One <dd> element per movie; re.S lets .*? span newlines.
    pattern = re.compile(
        r'<dd>.*?board-index.*?>(\d+)</i>'
        r'.*?data-src="(.*?)"'
        r'.*?name"><a.*?>(.*?)</a>'
        r'.*?star">(.*?)</p>'
        r'.*?releasetime">(.*?)</p>'
        r'.*?integer">(.*?)</i>'
        r'.*?fraction">(.*?)</i>'
        r'.*?</dd>', re.S)
    return re.findall(pattern, html_str)


def save_data_to_db(result):
    """Insert each parsed movie tuple into the ``maoyanmovie`` table.

    Uses the module-level ``mysql_client`` / ``cursor`` created under
    ``__main__``. Each row is committed individually; a failed insert is
    printed and rolled back without aborting the rest of the batch.
    """
    for movieInfo in result:
        info = {
            'rank': int(movieInfo[0]),
            'coverImage': movieInfo[1],
            'title': movieInfo[2],
            # The actor line is padded with whitespace/newlines in the HTML.
            'actor': movieInfo[3].strip(),
            'publishTime': movieInfo[4],
            # The score arrives split as e.g. '9.' + '5' — join, then parse.
            'score': float(movieInfo[5] + movieInfo[6]),
        }
        # Backtick-quote column names: `rank` is a reserved word in MySQL 8+.
        columns = ','.join('`%s`' % key for key in info)
        placeholders = ','.join(['%s'] * len(info))
        insert_sql = 'INSERT INTO maoyanmovie(%s) VALUES(%s)' % (columns, placeholders)
        try:
            # Values are passed as parameters — never interpolated into SQL.
            cursor.execute(insert_sql, list(info.values()))
            mysql_client.commit()
        except Exception as err:
            print(err)
            mysql_client.rollback()


if __name__ == "__main__":
    # Module-level DB handles shared by save_data_to_db / start_request.
    mysql_client = pymysql.Connect(host='localhost', port=3306, user='root',
                                   password='201314', database='maoyan',
                                   charset='utf8')
    cursor = mysql_client.cursor()
    maoyan_sipder()

# Source article: 爬虫 猫眼电影排行榜爬取代码