dang dang dang~
今天我们来做一件非常有意思的事~
不知道看什么电影?来看看最近上映的电影数据海报就OK
猫眼数据的爬取很简单,因为猫眼提供了一个静态的json文件,供第三方应用调用其数据,地址如下:
https://box.maoyan.com/promovie/api/box/second.json
接着,我们将这些数据保存,并结合猫眼详情页面的URL,进行更详细的查询
https://piaofang.maoyan.com/movie/{movieID}
其中的 movieID 参数可从第一个地址(second.json)返回的数据中拿到
def detail(self, item):
    """Fetch the Maoyan detail page for ``item`` and copy its fields onto it.

    Requests ``https://piaofang.maoyan.com/movie/<item.movieId>`` and, on an
    HTTP 200 response, parses the page and sets ``category``, ``tag`` (only
    when the page has an ``info-tag`` element), ``ratingNum``, ``scoreCount``,
    ``wishCount``, ``total_box``, ``day_box`` and ``forecast_box`` on ``item``.

    This is best-effort: any exception raised while fetching or parsing is
    printed together with its traceback and is not re-raised. Attributes
    assigned before the failure keep their new values.

    Args:
        item: Object exposing ``movieId``; it is mutated in place.

    Returns:
        None.
    """
    url = "https://piaofang.maoyan.com/movie/{}".format(item.movieId)
    print('猫眼详情获取 : ', url)
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36",
    }

    def content_text(col):
        # Each box-office column keeps its value in an "info-detail-content" node.
        return col.find(class_="info-detail-content").get_text().strip()

    try:
        # NOTE(review): verify=False disables TLS certificate verification —
        # confirm this is intentional.
        res = requests.get(url,
                           headers=headers,
                           timeout=3,
                           verify=False,
                           allow_redirects=False)
        if res.status_code != 200:
            print('猫眼详情获取失败')
            return

        # Parse the HTML.
        page = BeautifulSoup(res.text, "lxml")
        base_info = page.find(class_="info-base")
        item.category = base_info.find(class_="info-category").contents[0].strip()
        tag = base_info.find(class_="info-tag")
        if tag is not None:
            item.tag = tag.get_text().strip()

        # The score block is the sibling element right after the base info.
        block_info = base_info.find_next_sibling()
        score_detail = block_info.find(class_="score-detail")
        if score_detail is not None:
            stars = score_detail.find(class_="rating-stars")
            score_count = score_detail.find(class_="detail-score-count")
            wish_count = score_detail.find(class_="detail-wish-count")
            # The rating text lives in the element following the stars node.
            item.ratingNum = stars.find_next().get_text() if stars is not None else 0
            item.scoreCount = score_count.get_text() if score_count is not None else 0
            item.wishCount = wish_count.get_text() if wish_count is not None else 0
        else:
            item.ratingNum = 0
            item.scoreCount = 0
            item.wishCount = 0

        # The box-office block follows the score block; its first data row
        # comes right after the dividing line.
        box_info = block_info.find_next_sibling()
        row = box_info.find(class_="dividing-line").find_next_sibling()
        cols = row.find_all(class_="info-detail-col")
        # Cumulative box office.
        item.total_box = content_text(cols[0])
        # First-day box office.
        item.day_box = content_text(cols[1])
        # Forecast box office.
        # NOTE(review): index 3 skips column 2 — confirm against the page layout.
        item.forecast_box = content_text(cols[3])
    except Exception as e:
        print(e)
        # format_exc() returns the traceback text; print_exc() returns None,
        # which used to be printed here instead of the trace.
        print('traceback.print_exc():', traceback.format_exc())
接下来是豆瓣的评分信息获取,地址如下:
https://movie.douban.com
从该页面的热映列表(class 为 screening-bd 的区块)中,取出每部影片的 data-* 属性:片名、上映时间、时长、演员、地区、评分人数和评分
def index(self):
    """Scrape the Douban movie home page and pass the screening list on.

    Requests ``https://movie.douban.com`` and, on an HTTP 200 response, reads
    every ``li.ui-slide-item`` inside the ``screening-bd`` block. For each one
    it collects the ``data-title``, ``data-release``, ``data-duration``,
    ``data-actors``, ``data-region``, ``data-rater`` and ``data-rate``
    attributes into a dict, prints them, and finally calls
    ``self.finish(items)`` with the list of dicts.

    This is best-effort: a non-200 response only prints a failure message,
    and any exception is printed together with its traceback and is not
    re-raised. In both cases ``self.finish`` is not called.

    Returns:
        None.
    """
    url = "https://movie.douban.com"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36",
        "X-Requested-With": "XMLHttpRequest",
        'Upgrade-Insecure-Requests': '1',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
    }
    try:
        # NOTE(review): verify=False disables TLS certificate verification —
        # confirm this is intentional.
        res = requests.get(url,
                           headers=headers,
                           timeout=3,
                           verify=False,
                           allow_redirects=False)
        if res.status_code != 200:
            # Report the failure instead of returning silently.
            print('豆瓣数据获取失败')
            return

        print('豆瓣数据获取成功')
        # Parse the HTML.
        page = BeautifulSoup(res.text, "lxml")
        slide = page.find(class_="screening-bd").find("ul", class_="ui-slide-content")
        entries = slide.find_all("li", class_="ui-slide-item")

        items = []
        for li in entries:
            # Each movie's metadata is stored in data-* attributes of its <li>.
            item = {
                'title': li.get('data-title'),
                'release': li.get('data-release'),
                'duration': li.get('data-duration'),
                'actors': li.get('data-actors'),
                'region': li.get('data-region'),
                'rater': li.get('data-rater'),
                'rate': li.get('data-rate'),
            }
            print("--------------")
            print("名称:", item['title'])
            print("发布时间:", item['release'])
            print("影片时长:", item['duration'])
            print("演员:", item['actors'])
            print("地区:", item['region'])
            print("评分人数:", item['rater'])
            print("评分:", item['rate'])
            items.append(item)

        print(items)
        self.finish(items)
    except Exception as e:
        print(e)
        # format_exc() returns the traceback text; print_exc() returns None,
        # which used to be printed here instead of the trace.
        print('traceback.print_exc():', traceback.format_exc())
然后,就是展示成果的时候啦~
结合使用tornado框架,生成我们的控制台页面
点击开始爬取猫眼数据,等待爬取成功后,点击查看猫眼数据
查看条形统计图,分析票房信息
查看环形统计图
查看词图
查看评分
到这里,我们就拿到了一份好看的数据海报,还在等什么,赶快约起来~