# 定义一个函数获取猫眼电影的数据
import requests
def main():
    """Download the Maoyan board page at offset 0 and print its HTML.

    Returns:
        None. The response body is written to standard output.

    Raises:
        requests.exceptions.RequestException: if the request fails or does
            not complete within the timeout.
    """
    url = 'http://maoyan.com/board/4?offset=0'
    # Without a timeout a stalled connection would block the script forever.
    html = requests.get(url, timeout=10).text
    print(html)


if __name__ == '__main__':
    main()
# 利用正则匹配,获得我们想要的信息
""" < dd > < i class ="board-index board-index-10">10 < a href = "/films/2760" title = "魂断蓝桥" class ="image-link" data-act="boarditem-click" data-val="{movieId:2760}" > < img src = "//ms0.meituan.net/mywww/image/loading_2.e3d934bf.png" alt = "" class ="poster-default" / > < img data - src = "http://p0.meituan.net/movie/46c29a8b8d8424bdda7715e6fd779c66235684.jpg@160w_220h_1e_1c" alt = "魂断蓝桥" class ="board-img" / >< / a > < div class ="board-item-main" > < div class ="board-item-content" > < div class ="movie-item-info" > < p class ="name" > < a href="/films/2760" title="魂断蓝桥" data-act="boarditem-click" data-val="{movieId:2760}" > 魂断蓝桥 < / a > < / p > < p class ="star" >主演:费雯·丽, 罗伯特·泰勒, 露塞尔·沃特森< / p > < p class ="releasetime" > 上映时间:1940-05-17(美国) < / p > < / div > < div class ="movie-item-number score-num" > < p class ="score" > < i class ="integer" > 9. < / i > < i class ="fraction" > 2 < / i > < / p > < / div >< / div >< / div > < / dd > """
import re
# Pattern for one <dd>...</dd> board entry. The seven groups capture, in
# order: index, image URL, title, actors, release time, and the integer and
# fractional parts of the score. The closing tags (</i>, </p>, </dd>) bound
# each lazy group; without them every `(.*?)` would match the empty string.
# NOTE(review): `html` has to be bound at module level before this runs; the
# `main()` above only assigns it as a local -- confirm how this snippet is fed.
reg = re.compile(
    r'<dd>.*?>(.*?)</i>.*?data-src="(.*?)".*?title="(.*?)".*?主演:(.*?)</p>.*?'
    r'上映时间:(.*?)</p>.*?integer.*?>(.*?)</i>.*?fraction.*?>(.*?)</i>.*?</dd>',
    re.S,  # let `.` span the newlines inside an entry
)
items = reg.findall(html)
print(items)
# 循环遍历列表并且把列表转换为字典
# Turn every matched tuple into a dictionary and print it.
for item in items:
    # Positions 0-4 map one-to-one onto the first five fields.
    index, image, title, actor, time = (item[pos] for pos in range(5))
    # The score arrives in two pieces that are joined back together.
    score = item[5] + item[6]
    dict1 = dict(index=index, image=image, title=title,
                 actor=actor, time=time, score=score)
    print(dict1)
# 把获得的数据保存在文件中
import json
# Append the record as one JSON object per line. Without the trailing newline
# successive appends run together into a single unparseable string.
# ensure_ascii=False keeps non-ASCII text readable instead of \uXXXX escapes.
with open('result.txt', 'a', encoding='utf-8') as f:
    f.write(json.dumps(dict1, ensure_ascii=False) + '\n')
# 利用循环获取猫眼电影所有数据
def main():
    """Build the URL of each of the ten board pages (offsets 0, 10, ..., 90).

    The URL is only assembled here; nothing is requested or returned.
    """
    base = 'http://maoyan.com/board/4?offset='
    for offset in range(0, 100, 10):
        url = base + str(offset)
# 最后代码整理如下
import json
import re
from time import sleep
import requests
# Pattern for one <dd>...</dd> board entry, compiled once instead of once per
# page. The fragments sit inside the parentheses of the call so that adjacent
# literals are joined into a single pattern with all seven groups. The closing
# tags (</i>, </p>, </dd>) bound each lazy group; without them every `(.*?)`
# would match the empty string.
_ENTRY_PATTERN = re.compile(
    r'<dd>.*?>(.*?)</i>.*?data-src="(.*?)".*?title="(.*?)"'
    r'.*?主演:(.*?)</p>.*?上映时间:(.*?)</p>.*?integer.*?>'
    r'(.*?)</i>.*?fraction.*?>(.*?)</i>.*?</dd>',
    re.S,  # let `.` span the newlines inside an entry
)


def _parse_page(html):
    """Return a list with one field dictionary per entry matched in *html*."""
    records = []
    for groups in _ENTRY_PATTERN.findall(html):
        # Captured text can carry layout whitespace from the markup; trim it.
        index, image, title, actor, time, integer, fraction = (
            text.strip() for text in groups
        )
        records.append({
            'index': index, 'image': image, 'title': title,
            'actor': actor, 'time': time,
            # The score is captured as separate integer and fraction parts.
            'score': integer + fraction,
        })
    return records


def main():
    """Scrape the ten board pages and append every entry to ``result.txt``.

    Pages are requested at offsets 0, 10, ..., 90. Each entry is written as
    one JSON object per line, with a one-second pause before every write.

    Returns:
        None.

    Raises:
        requests.exceptions.RequestException: if a request fails or does not
            complete within the timeout.
    """
    for page in range(10):
        url = 'http://maoyan.com/board/4?offset=' + str(page * 10)
        # Without a timeout a stalled connection would block the run forever.
        html = requests.get(url, timeout=10).text
        records = _parse_page(html)
        with open('result.txt', 'a', encoding='utf-8') as f:
            for record in records:
                # One-second pause per entry throttles the overall run.
                sleep(1)
                # Newline-terminated so the file can be read back line by line.
                f.write(json.dumps(record, ensure_ascii=False) + '\n')


if __name__ == '__main__':
    main()