Python--爬取电影票房数据

爬取1995年至今的电影数据及其票房数据。

import urllib
import requests
import re
from bs4 import BeautifulSoup

R = requests.Session()
Url = 'http://58921.com'

# 模拟登录,手动登录网站,然后复制cookies。
def log_in():
    """Load browser cookies into the shared session and POST to the login URL.

    The cookie header below was copied from a manually logged-in browser
    session. It is split into name/value pairs, stored on the module-level
    session ``R`` so later requests made through ``R`` carry them, and then
    the login URL is requested once.

    Returns:
        The response object returned by ``R.post``.
    """
    url = 'http://58921.com/user/login'
    # NOTE(review): these are hard-coded session credentials; they expire and
    # should not live in source control. Consider loading them from the
    # environment or a local file instead.
    raw_cookies = 'DIDA642a4585eb3d6e32fdaa37b44468fb6c=5d6fpatsbd0irf8vcila883nk2; time=MTEzNTI2LjIxNjM0Mi4xMDI4MTYuMTA3MTAwLjExMTM4NC4yMDc3NzQuMTE5OTUyLjExMTM4NC4xMDQ5NTguMTEzNTI2LjExMTM4NC4xMDcxMDAuMTA5MjQyLjExMzUyNi4xMTM1MjYuMTEzNTI2LjExMTM4NC4xMTk5NTIuMA%3D%3D; Hm_lvt_e71d0b417f75981e161a94970becbb1b=1542275343,1542341828; Hm_lpvt_e71d0b417f75981e161a94970becbb1b=1542355547'

    cookies = {}
    for pair in raw_cookies.split(';'):
        # Pairs are separated by "; ", so strip the blank that would
        # otherwise become part of the cookie name.
        key, sep, word = pair.strip().partition('=')
        if not sep or not key:
            # Skip empty or malformed fragments instead of raising.
            continue
        cookies[key] = word

    # Store the cookies on the session itself; cookies passed to a single
    # request are not kept for later requests.
    R.cookies.update(cookies)
    return R.post(url)

def getHtml(url):
    """Fetch *url* and return the response body decoded as UTF-8.

    Args:
        url: The address to open; anything ``urllib.request.urlopen`` accepts.

    Returns:
        The response body as a ``str``.

    Raises:
        urllib.error.URLError: If the URL cannot be opened.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # Import the submodule explicitly: a bare ``import urllib`` does not
    # guarantee that ``urllib.request`` is loaded.
    import urllib.request

    # NOTE(review): this goes through urllib, not the ``requests`` session
    # used by log_in, so cookies held by that session are not sent here.
    # Confirm whether that is intended.
    with urllib.request.urlopen(url) as page:  # close the connection promptly
        return page.read().decode(encoding='utf-8')

def getUrls(itemStr):
    """Return the film ids linked from a listing page, in page order.

    The caller appends each returned value to ``http://58921.com/film/``,
    so this collects the numeric id from every ``/film/<id>`` link and drops
    repeated ids (a film may be linked more than once per row).

    NOTE(review): the original regular expression was truncated in the
    source (its literal was unterminated); this pattern is a reconstruction
    based on how the result is used. Confirm it against the live markup.

    Args:
        itemStr: HTML of a listing page; converted with ``str()``.

    Returns:
        A list of id strings. An empty list is returned on failure so the
        caller's ``for`` loop never iterates over an error message.
    """
    try:
        ids = re.findall(r'href="[^"]*?/film/(\d+)', str(itemStr))
    except Exception:
        return []
    # dict.fromkeys de-duplicates while preserving first-seen order.
    return list(dict.fromkeys(ids))

# Value returned by the extractors below when a lookup fails. It is a runtime
# string that the crawl prints, so it is kept exactly as in the original.
_FAILED = '获取失败'


def _find_all(pattern, itemStr):
    """Return every match of *pattern* in ``str(itemStr)``, or the failure marker."""
    try:
        return re.findall(pattern, str(itemStr))
    except Exception:
        return _FAILED


def getMovieName(itemStr):
    """Return the text that follows the "电影 › " breadcrumb on a film page.

    NOTE(review): the closing part of the original pattern (an HTML tag in a
    look-ahead) was lost from the source. This reconstruction stops at the
    next tag or line end; confirm against the live markup.
    """
    return _find_all(r'(?<=电影 › )[^<\r\n]*', itemStr)


def getMovieDescription(itemStr):
    """Return the description text of a film page.

    NOTE(review): both delimiters of the original pattern were lost from the
    source; only ``.*`` between a look-behind and a look-ahead survived.
    ``<p>``/``</p>`` is an assumption to be confirmed against the live markup.
    """
    return _find_all(r'(?<=<p>).*(?=</p>)', itemStr)


def getDirector(itemStr):
    """Return the ``title`` attribute value(s) following the "导演:" label."""
    return _find_all(r'>导演:<.+?title="(.+?)"', itemStr)


def getActor(itemStr):
    """Return the ``title`` attribute value(s) following the "主演:" label."""
    return _find_all(r'>主演:<.+?title="(.+?)"', itemStr)


def getReleaseTime(itemStr):
    """Return the text following the "上映时间:" label."""
    return _find_all(r'>上映时间:<.+?>(.+?)<', itemStr)


def getMovieTime(itemStr):
    """Return the text following the "片长:" label."""
    return _find_all(r'>片长:<.+?>(.+?)<', itemStr)


def getMovieArea(itemStr):
    """Return the ``title`` attribute value(s) following the "制作国家/地区:" label."""
    return _find_all(r'>制作国家/地区:<.+?title="(.+?)"', itemStr)


def getMovieType(itemStr):
    """Return the ``title`` attribute value(s) following the "类型:" label."""
    return _find_all(r'>类型:<.+?title="(.+?)"', itemStr)


def getMovieLanguage(itemStr):
    """Return the ``title`` attribute value(s) following the "语言:" label."""
    return _find_all(r'>语言:<.+?title="(.+?)"', itemStr)


def getTotalTicket(item):
    """Fetch the box-office page of film *item* and return the "最新票房" figure(s).

    Returns:
        A list of matched strings, or the failure marker if the page cannot
        be fetched or parsed (best effort: the crawl keeps going).
    """
    try:
        Html = getHtml("http://58921.com/film/" + str(item) + "/boxoffice")
        return re.findall(r'\(最新票房 (.+?)\)', str(Html))
    except Exception:
        return _FAILED


def getPage(Text):
    """Return the page index of the last page of a listing, or 1 on failure.

    NOTE(review): the original pattern was lost from the source. The crawl
    adds 1 to this value and requests ``?page=<n - 1>``, which suggests it is
    the zero-based index found in the pager links; this reconstruction takes
    the largest ``page=<n>`` value present. Confirm against the live markup.
    """
    try:
        numbers = re.findall(r'[?&;]page=(\d+)', Text)
        # max() of an empty sequence raises ValueError -> fall back to 1.
        return max(int(number) for number in numbers)
    except (TypeError, ValueError):
        return 1


def getCurrentPage(Text):
    """Return the page number shown as current in the pager, or 1 on failure.

    NOTE(review): the look-behind of the original pattern (a ``<li ...>`` tag)
    was lost from the source; matching an ``<li>`` whose attributes contain
    "current" is an assumption to be confirmed against the live markup.
    """
    try:
        number = re.findall(
            r'<li\b[^>]*current[^>]*>(?:\s*<[^>]*>)*\s*(\d+)', Text
        )
        return int(number[0])
    except (TypeError, ValueError, IndexError):
        return 1


def getDetailData(start_year=1995, end_year=2019):
    """Crawl every yearly listing and print the details of each film.

    For each year in ``range(start_year, end_year)`` the listing pages under
    ``/alltime/<year>`` are walked; every film found is fetched and its
    fields are printed one per line, followed by its box-office figure.

    Args:
        start_year: First year to crawl (inclusive).
        end_year: Year to stop at (exclusive).
    """
    log_in()
    # Printed in this order for every film, before the box-office figure.
    extractors = (
        getMovieName,
        getDirector,
        getActor,
        getReleaseTime,
        getMovieTime,
        getMovieArea,
        getMovieType,
        getMovieLanguage,
        getMovieDescription,
    )
    for yearnumber in range(start_year, end_year):
        listing_url = "http://58921.com/alltime/" + str(yearnumber)
        first_page = getHtml(listing_url)
        lastPage = getPage(first_page) + 1
        currentPage = getCurrentPage(first_page)
        while currentPage <= lastPage:
            if currentPage <= 1:
                # The first page is already in hand; no need to fetch it again.
                Html = first_page
            else:
                # The site's ``page`` query parameter is zero-based.
                Html = getHtml(listing_url + "?page=" + str(currentPage - 1))
            items = getUrls(Html)
            if isinstance(items, str):
                # A failure marker is a string; iterating it would request
                # one bogus film URL per character.
                items = []
            for item in items:
                film_html = getHtml("http://58921.com/film/" + str(item))
                for extract in extractors:
                    print(extract(film_html))
                print(getTotalTicket(item))
            currentPage += 1


# Run the crawl when this file is executed as a script.
if __name__ == "__main__":
    getDetailData()
  • 你可能感兴趣的:(Python,爬虫)