day 01 Scraping Movies with Regular Expressions

What it does: scrape the data, create the database, and store the results.

/maoyan.py (scraping code)

import os
import requests
import re
import json
from maoyan9_db_helper import *

# Module-level connection and cursor, shared by parse_page below
db = get_connection()
cursor = get_cursor(db)

# Fetch one page of the top-100 board (each page holds 10 movies)
def get_page(page):
    url = 'https://maoyan.com/board/4?offset=%d' % (page * 10)
    response = requests.get(url)
    if response.status_code == 200:
        return response.content.decode('utf-8')
    return None
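# Note: maoyan.com may answer with a verification page when the request
# carries no browser-like User-Agent, leaving get_page with nothing useful
# to parse. A variant with a header added (the UA string below is only an
# illustrative example, it is not from the original post):
def get_page_with_headers(page):
    url = 'https://maoyan.com/board/4?offset=%d' % (page * 10)
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        return response.content.decode('utf-8')
    return None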


# Parse one page of HTML with regular expressions
def parse_page(html):
    # Movie titles
    pattern = re.compile('movieId.*?>.*?<p class="name"><a.*?>(.*?)</a>', re.S)
    movie_names = re.findall(pattern, html)
    # print(movie_names)
    # Lead actors
    pattern = re.compile('<p class="star">(.*?)</p>', re.S)
    actors = re.findall(pattern, html)
    actors = [actor.strip() for actor in actors]
    # print(actors)
    # Release dates
    pattern = re.compile('<p class="releasetime">(.*?)</p>', re.S)
    releasetimes = re.findall(pattern, html)
    # print(releasetimes)
    # Cover image URLs
    pattern = re.compile('movieId.*?>.*?<img data-src="(.*?)"', re.S)
    cover_urls = re.findall(pattern, html)
    # print(cover_urls)
    # Scores: the integer and fraction parts sit in separate tags, so join them
    pattern = re.compile('<i class="integer">(.*?)</i><i class="fraction">(.*?)</i>', re.S)
    scores = re.findall(pattern, html)
    scores = [''.join(score) for score in scores]
    # print(scores)
    # Ranks
    pattern = re.compile('<i class="board-index.*?">(.*?)</i>', re.S)
    ranks = re.findall(pattern, html)
    # print(ranks)
    # Detail page links
    pattern = re.compile('<p class="name"><a href="(.*?)"', re.S)
    detail_urls = re.findall(pattern, html)
    # print(detail_urls)
    # Assemble one dict per movie, save its cover, and insert it into MySQL
    result_list = []
    for i in range(len(movie_names)):
        item_dict = {}
        item_dict['movie_name'] = movie_names[i]
        item_dict['actor'] = actors[i]
        item_dict['releasetime'] = releasetimes[i]
        item_dict['cover_url'] = cover_urls[i]
        save_image(item_dict['cover_url'])
        item_dict['score'] = scores[i]
        item_dict['rank'] = ranks[i]
        item_dict['detail_url'] = detail_urls[i]
        execute_sql(db, cursor, item_dict)
        result_list.append(item_dict)
    return result_list


# Write the scraped records to a JSON file
def write_json(result_list):
    json_str = json.dumps(result_list, ensure_ascii=False)
    with open('./movies9.json', 'w', encoding='utf-8') as f:
        f.write(json_str)


# Save a cover image into the local images9 directory
def save_image(cover_url):
    response = requests.get(cover_url)
    # file_name = re.findall(r'.*?movie/(.*?)@', cover_url)[0]
    # print(file_name)
    file_name = cover_url.split('/')[-1].split('@')[0]
    print(file_name)
    os.makedirs('./images9', exist_ok=True)  # make sure the output directory exists
    with open('./images9/%s' % file_name, 'wb') as f:
        f.write(response.content)


def main():
    result_list = []
    for page in range(10):
        print(page)
        html = get_page(page)
        # print(html)
        one_page_list = parse_page(html)
        result_list.extend(one_page_list)
    print(result_list)
    print(len(result_list))
    write_json(result_list)
    close_conn(db)


if __name__ == '__main__':
    main()
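Each field above is extracted with its own re.findall over the whole page, and every pattern is compiled with re.S so that . also matches the newlines inside a tag. A quick way to check a pattern is to run it against a small hand-written snippet first; a minimal sketch (the snippet only mimics the board markup, it is not saved page source):

import re

# Hand-written snippet mimicking one <dd> entry of the board (illustrative only)
sample = ('<dd><i class="board-index board-index-1">1</i>'
          '<p class="star">\n 主演:张国荣,张丰毅,巩俐 \n</p></dd>')

ranks = re.findall('<i class="board-index.*?">(.*?)</i>', sample, re.S)
stars = [s.strip() for s in re.findall('<p class="star">(.*?)</p>', sample, re.S)]
print(ranks)  # ['1']
print(stars)  # ['主演:张国荣,张丰毅,巩俐'] -- without re.S the newlines would break the match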

SQL/maoyan.sql (creates the database)

create database maoyan9 default character set=utf8;

use maoyan9;

create table movie(
    id integer auto_increment primary key,
    movie_name varchar(256),
    actor varchar(256),
    releasetime varchar(256),
    cover_url varchar(1024),
    score varchar(32),
    ranks varchar(256), -- "rank" is a reserved word in recent MySQL versions, hence "ranks"
    detail_url varchar(1024)
);

create unique index ux_movie_movie_name on movie(movie_name);

create index ix_movie_actor on movie(actor);

maoyan9_db_helper.py (stores the data)

import pymysql

def get_connection():
    conn = pymysql.connect(host='127.0.0.1', port=3306, database='maoyan9',
                           user='root', password='DENG5rong2hua0!', charset='utf8')
    return conn

def get_cursor(db):
    cursor = db.cursor()
    return cursor

def close_conn(db):
    db.close()

# Approach 1: build the SQL string with % formatting
# (fragile if a value contains quotes, and open to SQL injection)
def execute_sql(db, cursor, item_dict):
    sql = ('insert into movie (movie_name,actor,releasetime,cover_url,score,ranks,detail_url) '
           'values ("%s","%s","%s","%s","%s","%s","%s")') % (
        item_dict['movie_name'], item_dict['actor'], item_dict['releasetime'],
        item_dict['cover_url'], item_dict['score'], item_dict['rank'], item_dict['detail_url'])
    print(sql)
    cursor.execute(sql)
    db.commit()
# Approach 2: parameterized query (preferred; the driver escapes each value)
def execute_sql2(db, cursor, item_dict):
    sql = 'insert into movie (movie_name,actor,releasetime,cover_url,score,ranks,detail_url) values (%s,%s,%s,%s,%s,%s,%s)'
    params = (item_dict['movie_name'], item_dict['actor'], item_dict['releasetime'],
              item_dict['cover_url'], item_dict['score'], item_dict['rank'], item_dict['detail_url'])
    print(sql, params)
    cursor.execute(sql, params)  # pass the values separately so the driver escapes them
    db.commit()
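Both helpers insert one row per call, and a re-run of the crawl will hit the unique index ux_movie_movie_name on movies that are already stored. A third variant can handle that; a sketch (not in the original post) that batches the inserts with cursor.executemany and uses insert ignore so duplicates are skipped instead of raising an error:

# Sketch: batch insert that tolerates re-runs (execute_sql_many is a
# hypothetical name, not part of the original helper module)
def execute_sql_many(db, cursor, item_dicts):
    sql = ('insert ignore into movie '
           '(movie_name,actor,releasetime,cover_url,score,ranks,detail_url) '
           'values (%s,%s,%s,%s,%s,%s,%s)')
    rows = [(d['movie_name'], d['actor'], d['releasetime'], d['cover_url'],
             d['score'], d['rank'], d['detail_url']) for d in item_dicts]
    cursor.executemany(sql, rows)  # one round of parameter binding per row
    db.commit()

Called once per page with the list returned by parse_page, this replaces the per-row execute_sql call inside the parsing loop.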
