爬取猫眼电影TOP100电影信息并将其保存到数据库(电影名,主演,上映时间)

import  re
from urllib import request
from urllib.request import urlopen
from concurrent.futures import ThreadPoolExecutor
#猫眼电影Top100网址
import pymysql

# Entry page of the Maoyan TOP100 board; paginated URLs are derived from it.
url = 'http://maoyan.com/board/4'
# Module-level accumulator for the rows extracted from the scraped pages.
films = []
# Build the URL of every page of the TOP100 board.
def get_all_url(url, page):
    """Return the board URL followed by its paginated variants.

    The first entry is ``url`` itself. Each following entry is ``url`` with
    ``?offset=N`` appended, N growing in steps of 10, so that the list holds
    ``page`` entries in total. A ``page`` below 1 still yields ``[url]``.
    """
    offsets = range(10, page * 10, 10)
    return [url] + [url + '?offset=%d' % offset for offset in offsets]
# Download a page.
def get_info(url):
    """Fetch ``url`` and return the response body decoded as UTF-8 text.

    The request is sent with a custom ``User-agent`` header instead of
    urllib's default one.
    """
    browser_headers = {'User-agent': 'Google Chrome/67.0.3396.87'}
    page_request = request.Request(url, headers=browser_headers)
    with urlopen(page_request) as response:
        raw_body = response.read()
    return raw_body.decode('utf-8')
# get_info(url)
# Extract the wanted fields from a page with a regular expression.
#
# NOTE(review): the HTML tags that belonged to this pattern were stripped from
# the copy of the file this was restored from; only the fragments
# `.*<\/a><\/p>\s*`, `\s*(主演:.*)\s*<\/p>\s*` and `(上映时间:.*)<\/p>` survived.
# The tag names, the class attributes and the first (title) group below are a
# reconstruction, chosen so that each match yields the three values the
# `insert into movies` statement in main() expects -- TODO confirm against the
# live page markup before relying on it.
# The labels accept both the ASCII and the full-width colon because the
# surviving text may have had its punctuation normalised.
_FILM_PATTERN = re.compile(
    r'<p class="name"><a href="[^"]*" title="([^"]*)"[^>]*>.*<\/a><\/p>\s*'
    r'<p class="star">\s*(主演[::].*)\s*<\/p>\s*'
    r'<p class="releasetime">(上映时间[::].*)<\/p>'
)


def get_content(url):
    """Download one board page and add its films to the global ``films`` list.

    Each match contributes one ``(title, starring, release_time)`` tuple; the
    last two keep their on-page labels, e.g. a line starting with ``主演``
    (starring) and one starting with ``上映时间`` (release time).

    :param url: address of the board page to fetch via ``get_info``.
    :return: the shared module-level ``films`` list (not a per-page copy).
    """
    content = get_info(url)
    films.extend(_FILM_PATTERN.findall(content))
    return films


def main():
    """Scrape the ten board pages concurrently and store the rows in MySQL."""
    res = get_all_url(url, 10)

    # Fetch the pages on a small thread pool. Futures are kept so that a
    # failure in a worker is reported instead of being silently discarded.
    with ThreadPoolExecutor(max_workers=5) as pool:
        pending = [(page_url, pool.submit(get_content, page_url))
                   for page_url in res]
    for page_url, future in pending:
        try:
            future.result()
        except Exception as e:
            # Best effort: one bad page must not prevent storing the others.
            print('%s: %s' % (page_url, e))

    # Database connection.
    # NOTE(review): credentials are hard-coded; consider moving them to
    # configuration or environment variables.
    conn = pymysql.connect(host='localhost', user='root', passwd='axis0214',
                           db='films', charset='utf8')
    try:
        # Create the cursor.
        cur = conn.cursor()
        try:
            # Plain `create table` errors out when the table already exists;
            # that error is reported by the handler below.
            cur.execute('create table movies (movie varchar(40),action varchar(60),timeon varchar(120))')
            insert_sql = 'insert into movies values(%s,%s,%s)'
            cur.executemany(insert_sql, films)
            cur.execute('select * from movies')
            print(cur.fetchall())
        except Exception as e:
            # Drop any partially inserted rows before reporting the error.
            conn.rollback()
            print(e)
        else:
            # Commit first so the success message is only printed once the
            # rows are actually persisted.
            conn.commit()
            print('信息写入成功。。。')
        finally:
            cur.close()
    finally:
        conn.close()


if __name__ == '__main__':
    main()

爬取猫眼电影TOP100电影信息并将其保存到数据库(电影名,主演,上映时间)_第1张图片

你可能感兴趣的:(爬取猫眼电影TOP100电影信息并将其保存到数据库(电影名,主演,上映时间))