import re
import urllib
import urllib.request

import requests
from bs4 import BeautifulSoup
# Shared requests session; log_in() sends the login POST through it.
R = requests.Session()
# Base URL of the scraped site (not referenced by the visible functions,
# which spell the host out in each URL literal).
Url = 'http://58921.com'
# Simulated login: sign in to the site manually in a browser, then copy the
# resulting cookies into ``raw_cookies`` below.
def log_in():
    """Send the captured browser cookies to the login URL via session ``R``.

    The ``Cookie``-header-style string is split into name/value pairs, the
    pairs are stored on the shared session, and a POST is issued to the
    login URL.

    Returns:
        None.
    """
    url = 'http://58921.com/user/login'
    # NOTE(review): these look like hard-coded session cookies copied from a
    # browser; they will expire and should not live in source control.
    raw_cookies = 'DIDA642a4585eb3d6e32fdaa37b44468fb6c=5d6fpatsbd0irf8vcila883nk2; time=MTEzNTI2LjIxNjM0Mi4xMDI4MTYuMTA3MTAwLjExMTM4NC4yMDc3NzQuMTE5OTUyLjExMTM4NC4xMDQ5NTguMTEzNTI2LjExMTM4NC4xMDcxMDAuMTA5MjQyLjExMzUyNi4xMTM1MjYuMTEzNTI2LjExMTM4NC4xMTk5NTIuMA%3D%3D; Hm_lvt_e71d0b417f75981e161a94970becbb1b=1542275343,1542341828; Hm_lpvt_e71d0b417f75981e161a94970becbb1b=1542355547'
    cookies = {}
    for fragment in raw_cookies.split(';'):
        # Pairs are separated by "; ", so strip the padding; otherwise every
        # cookie name after the first starts with a space.
        key, sep, word = fragment.strip().partition('=')
        if sep:
            cookies[key] = word
    # Cookies passed to a single request are not remembered by the session,
    # so store them on the session itself for later requests made through R.
    R.cookies.update(cookies)
    R.post(url, cookies=cookies)
def getHtml(url):
    """Fetch *url* and return the response body decoded as UTF-8 text.

    Args:
        url: Address to open with ``urllib.request.urlopen``.

    Returns:
        The decoded body as ``str``.

    Raises:
        Whatever ``urlopen`` raises on connection/HTTP errors, and
        ``UnicodeDecodeError`` if the body is not valid UTF-8.
    """
    # NOTE(review): this goes through urllib rather than the requests session
    # ``R`` used by log_in(), so the login cookies are not sent with these
    # requests -- confirm whether that is intended.
    # The ``with`` block closes the response even if read/decode fails.
    with urllib.request.urlopen(url) as page:
        return page.read().decode(encoding='utf-8')
def getUrls(itemStr):
    """Collect the film identifiers linked from a listing page.

    NOTE(review): the original regex literal was truncated (unterminated
    string), so the file could not even be parsed. The pattern below is a
    reconstruction based on how callers use the result -- each item is
    appended to ``http://58921.com/film/`` -- confirm it against the live
    markup.

    Args:
        itemStr: Page markup; converted with ``str()`` before matching.

    Returns:
        A list of the digit runs following ``/film/``, in first-seen order
        with duplicates removed, or the string ``'获取失败'`` ("fetch
        failed") if converting or matching raises.
    """
    try:
        pattern = re.compile(r'/film/(\d+)')
        film_ids = pattern.findall(str(itemStr))
    except Exception:
        # Best-effort contract kept, but KeyboardInterrupt/SystemExit are no
        # longer swallowed as they were by the bare ``except``.
        return '获取失败'
    # The same film can be linked more than once on a page (e.g. a plain
    # link and a ``/boxoffice`` link); keep one entry per film.
    return list(dict.fromkeys(film_ids))
def getMovieName(itemStr):
    """Extract the text that follows the ``电影 › `` breadcrumb.

    NOTE(review): the trailing look-ahead ``(?=)`` is empty, so each match
    runs to the end of the line; its terminator appears to have been lost --
    confirm the intended pattern.

    Args:
        itemStr: Page markup; converted with ``str()`` before matching.

    Returns:
        A list of matches (empty when the breadcrumb is absent), or the
        string ``'获取失败'`` ("fetch failed") if converting or matching
        raises.
    """
    try:
        pattern = re.compile(r'(?<=电影 › ).*(?=)')
        return pattern.findall(str(itemStr))
    except Exception:
        # Best-effort contract kept, but KeyboardInterrupt/SystemExit are no
        # longer swallowed as they were by the bare ``except``.
        return '获取失败'
def getMovieDescription(itemStr):
    """Run the description pattern over the page markup.

    NOTE(review): both look-arounds in the pattern are empty, so it matches
    every line in full plus the empty string after it. The delimiters appear
    to have been lost -- restore them before relying on this output.

    Args:
        itemStr: Page markup; converted with ``str()`` before matching.

    Returns:
        The list produced by ``findall``, or the string ``'获取失败'``
        ("fetch failed") if converting or matching raises.
    """
    try:
        pattern = re.compile(r'(?<=).*(?=)')
        return pattern.findall(str(itemStr))
    except Exception:
        # Best-effort contract kept, but KeyboardInterrupt/SystemExit are no
        # longer swallowed as they were by the bare ``except``.
        return '获取失败'
def getDirector(itemStr):
    """Extract the director field from a film page.

    Finds each ``>导演:<`` ("director") label and captures the value of the
    first ``title="..."`` attribute that follows it on the same line.

    Args:
        itemStr: Page markup; converted with ``str()`` before matching.

    Returns:
        A list with one captured title per label occurrence (empty when the
        label is absent), or the string ``'获取失败'`` ("fetch failed") if
        converting or matching raises.
    """
    try:
        pattern = re.compile(r'>导演:<.+?title="(.+?)"')
        return pattern.findall(str(itemStr))
    except Exception:
        # Best-effort contract kept, but KeyboardInterrupt/SystemExit are no
        # longer swallowed as they were by the bare ``except``.
        return '获取失败'
def getActor(itemStr):
    """Extract the starring field from a film page.

    Finds each ``>主演:<`` ("starring") label and captures the value of the
    first ``title="..."`` attribute that follows it on the same line.

    Args:
        itemStr: Page markup; converted with ``str()`` before matching.

    Returns:
        A list with one captured title per label occurrence (empty when the
        label is absent), or the string ``'获取失败'`` ("fetch failed") if
        converting or matching raises.
    """
    try:
        pattern = re.compile(r'>主演:<.+?title="(.+?)"')
        return pattern.findall(str(itemStr))
    except Exception:
        # Best-effort contract kept, but KeyboardInterrupt/SystemExit are no
        # longer swallowed as they were by the bare ``except``.
        return '获取失败'
def getReleaseTime(itemStr):
    """Extract the release-date field from a film page.

    Finds each ``>上映时间:<`` ("release date") label, skips to the end of
    the next tag (``>``) and captures the text up to the following ``<`` on
    the same line.

    Args:
        itemStr: Page markup; converted with ``str()`` before matching.

    Returns:
        A list with one captured text per label occurrence (empty when the
        label is absent), or the string ``'获取失败'`` ("fetch failed") if
        converting or matching raises.
    """
    try:
        pattern = re.compile(r'>上映时间:<.+?>(.+?)<')
        return pattern.findall(str(itemStr))
    except Exception:
        # Best-effort contract kept, but KeyboardInterrupt/SystemExit are no
        # longer swallowed as they were by the bare ``except``.
        return '获取失败'
def getMovieTime(itemStr):
    """Extract the running-time field from a film page.

    Finds each ``>片长:<`` ("running time") label, skips to the end of the
    next tag (``>``) and captures the text up to the following ``<`` on the
    same line.

    Args:
        itemStr: Page markup; converted with ``str()`` before matching.

    Returns:
        A list with one captured text per label occurrence (empty when the
        label is absent), or the string ``'获取失败'`` ("fetch failed") if
        converting or matching raises.
    """
    try:
        pattern = re.compile(r'>片长:<.+?>(.+?)<')
        return pattern.findall(str(itemStr))
    except Exception:
        # Best-effort contract kept, but KeyboardInterrupt/SystemExit are no
        # longer swallowed as they were by the bare ``except``.
        return '获取失败'
def getMovieArea(itemStr):
    """Extract the production country/region field from a film page.

    Finds each ``>制作国家/地区:<`` ("production country/region") label and
    captures the value of the first ``title="..."`` attribute that follows
    it on the same line.

    Args:
        itemStr: Page markup; converted with ``str()`` before matching.

    Returns:
        A list with one captured title per label occurrence (empty when the
        label is absent), or the string ``'获取失败'`` ("fetch failed") if
        converting or matching raises.
    """
    try:
        pattern = re.compile(r'>制作国家/地区:<.+?title="(.+?)"')
        return pattern.findall(str(itemStr))
    except Exception:
        # Best-effort contract kept, but KeyboardInterrupt/SystemExit are no
        # longer swallowed as they were by the bare ``except``.
        return '获取失败'
def getMovieType(itemStr):
    """Extract the genre field from a film page.

    Finds each ``>类型:<`` ("type/genre") label and captures the value of
    the first ``title="..."`` attribute that follows it on the same line.

    Args:
        itemStr: Page markup; converted with ``str()`` before matching.

    Returns:
        A list with one captured title per label occurrence (empty when the
        label is absent), or the string ``'获取失败'`` ("fetch failed") if
        converting or matching raises.
    """
    try:
        pattern = re.compile(r'>类型:<.+?title="(.+?)"')
        return pattern.findall(str(itemStr))
    except Exception:
        # Best-effort contract kept, but KeyboardInterrupt/SystemExit are no
        # longer swallowed as they were by the bare ``except``.
        return '获取失败'
def getMovieLanguage(itemStr):
    """Extract the language field from a film page.

    Finds each ``>语言:<`` ("language") label and captures the value of the
    first ``title="..."`` attribute that follows it on the same line.

    Args:
        itemStr: Page markup; converted with ``str()`` before matching.

    Returns:
        A list with one captured title per label occurrence (empty when the
        label is absent), or the string ``'获取失败'`` ("fetch failed") if
        converting or matching raises.
    """
    try:
        pattern = re.compile(r'>语言:<.+?title="(.+?)"')
        return pattern.findall(str(itemStr))
    except Exception:
        # Best-effort contract kept, but KeyboardInterrupt/SystemExit are no
        # longer swallowed as they were by the bare ``except``.
        return '获取失败'
def getTotalTicket(item):
    """Fetch a film's box-office page and extract the latest total.

    Downloads ``http://58921.com/film/<item>/boxoffice`` with ``getHtml`` and
    captures the text inside each ``(最新票房 ...)`` ("latest box office")
    parenthesis.

    Args:
        item: Film identifier; converted with ``str()`` to build the URL.

    Returns:
        A list of captured texts (empty when the marker is absent), or the
        string ``'获取失败'`` ("fetch failed") if the download or matching
        raises.
    """
    try:
        page = getHtml("http://58921.com/film/" + str(item) + "/boxoffice")
        pattern = re.compile(r'\(最新票房 (.+?)\)')
        return pattern.findall(str(page))
    except Exception:
        # Network and decode errors are reported through the sentinel, but
        # KeyboardInterrupt/SystemExit are no longer swallowed as they were
        # by the bare ``except``.
        return '获取失败'
# Get the page number of the last page.
def getPage(Text):
    """Return the highest ``page=`` value linked from a listing page.

    NOTE(review): the original regex literal was truncated (unterminated
    string), so the file could not be parsed. This reconstruction takes the
    largest ``page=<n>`` query value found in the markup, inferred from the
    caller, which requests ``?page=`` values up to this number -- confirm
    against the live pagination markup.

    Args:
        Text: Page markup as ``str``.

    Returns:
        The largest page value as ``int``, or ``1`` when none is found or
        *Text* is not a string.
    """
    try:
        # ``;`` covers links written with an HTML-escaped ``&amp;page=``.
        pattern = re.compile(r'[?&;]page=(\d+)')
        numbers = re.findall(pattern, Text)
        return max(int(n) for n in numbers)
    except (TypeError, ValueError):
        # TypeError: Text is not a string; ValueError: no page links at all.
        return 1
# Get the current page number.
def getCurrentPage(Text):
    """Return the first integer matched by the current-page pattern.

    NOTE(review): the look-behind in the pattern is empty, so this simply
    returns the first run of digits anywhere in *Text*. Its anchor appears
    to have been lost -- restore it before trusting the value.

    Args:
        Text: Page markup as ``str``.

    Returns:
        The first matched number as ``int``, or ``1`` when nothing matches
        or *Text* is not a string.
    """
    try:
        pattern = re.compile(r'(?<=)\d+')
        number = re.findall(pattern, Text)
        return int(number[0])
    except (IndexError, TypeError, ValueError):
        # IndexError: no digits found; TypeError: Text is not a string.
        return 1
def _printFilmDetails(item):
    """Download one film page and print each extracted field on its own line."""
    film_html = getHtml("http://58921.com/film/" + str(item))
    print(getMovieName(film_html))
    print(getDirector(film_html))
    print(getActor(film_html))
    print(getReleaseTime(film_html))
    print(getMovieTime(film_html))
    print(getMovieArea(film_html))
    print(getMovieType(film_html))
    print(getMovieLanguage(film_html))
    print(getMovieDescription(film_html))
    # The box-office total lives on a separate page fetched by the helper.
    print(getTotalTicket(item))


def getDetailData(start_year=1995, end_year=2019):
    """Crawl the yearly listings and print the details of every film found.

    Calls ``log_in()`` once, then for each year walks the listing pages of
    ``http://58921.com/alltime/<year>`` and prints the extracted fields of
    each film returned by ``getUrls``.

    Args:
        start_year: First year to crawl (inclusive). Defaults to 1995.
        end_year: Year at which to stop (exclusive). Defaults to 2019.

    Returns:
        None; results are written to standard output.

    Raises:
        Whatever ``getHtml`` raises when a listing or film page cannot be
        downloaded.
    """
    log_in()
    for yearnumber in range(start_year, end_year):
        year_url = "http://58921.com/alltime/" + str(yearnumber)
        first_listing = getHtml(year_url)
        lastPage = getPage(first_listing) + 1
        firstPage = getCurrentPage(first_listing)
        for currentPage in range(firstPage, lastPage + 1):
            if currentPage <= 1:
                # Same URL as the page fetched above; reuse it instead of
                # downloading it a second time.
                listing = first_listing
            else:
                # The site's ``page`` parameter is one less than our counter.
                listing = getHtml(year_url + "?page=" + str(currentPage - 1))
            items = getUrls(listing)
            if isinstance(items, str):
                # getUrls returns a failure *string*; iterating it would
                # request one bogus film URL per character.
                continue
            for item in items:
                _printFilmDetails(item)
# Entry point: executing the statement below starts the whole crawl.
# Note that it also runs when this file is imported as a module.
getDetailData()