Python 爬虫:爬取猫眼电影数据

# 定义一个函数获取猫眼电影的数据

import requests

def main():
    """Download the Maoyan board page at offset 0 and print its raw HTML."""
    # Single assignment; the original's chained `url = url = ...` was redundant.
    url = 'http://maoyan.com/board/4?offset=0'
    html = requests.get(url).text
    print(html)



# Entry-point guard: call main() only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()

# 利用正则匹配,获得我们想要的信息

"""
<dd>
<i class="board-index board-index-10">10</i>
<a href="/films/2760" title="魂断蓝桥" class="image-link" data-act="boarditem-click"
data-val="{movieId:2760}">
<img src="//ms0.meituan.net/mywww/image/loading_2.e3d934bf.png" alt="" class="poster-default" />
<img data-src="http://p0.meituan.net/movie/46c29a8b8d8424bdda7715e6fd779c66235684.jpg@160w_220h_1e_1c"
alt="魂断蓝桥" class="board-img" /></a>
<div class="board-item-main">
<div class="board-item-content">
<div class="movie-item-info">
<p class="name"><a href="/films/2760" title="魂断蓝桥"
data-act="boarditem-click" data-val="{movieId:2760}">魂断蓝桥</a></p>
<p class="star">主演:费雯·丽, 罗伯特·泰勒, 露塞尔·沃特森</p>
<p class="releasetime">上映时间:1940-05-17(美国)</p></div>
<div class="movie-item-number score-num">
<p class="score"><i class="integer">9.</i><i class="fraction">2</i></p>
</div></div></div>
</dd>
"""
import re


# Pattern for one <dd> ranking entry. Every wildcard is non-greedy so each
# match stays inside a single entry. Capture groups, in order:
#   1 rank, 2 poster URL (data-src), 3 title, 4 cast (text after "主演:"),
#   5 release date (text after "上映时间:"), 6 integer part of the score,
#   7 fractional part of the score.
reg = (r'<dd>.*?>(.*?)</i>.*?data-src="(.*?)".*?title="(.*?)".*?主演:(.*?)</p>.*?'
       r'上映时间:(.*?)</p>.*?integer.*?>(.*?)</i>.*?fraction.*?>(.*?)</i>.*?</dd>')
# re.S lets "." match newlines, because one entry spans many lines of HTML.
reg = re.compile(reg, re.S)
# With several groups, findall returns a list of 7-tuples, one per entry.
items = re.findall(reg, html)
print(items)

# 循环遍历列表,并把其中的每个元组转换为字典

# Turn every 7-tuple produced by the regex into a dict with named fields.
for item in items:
    # Unpack positionally; the last two groups are the two halves of the score.
    index, image, title, actor, time, integer_part, fraction_part = item
    score = integer_part + fraction_part
    dict1 = dict(
        index=index,
        image=image,
        title=title,
        actor=actor,
        time=time,
        score=score,
    )
    print(dict1)

# 把获得的数据保存在文件中

import json


# Append the record to result.txt as one JSON object per line. The trailing
# newline keeps successive appends separable; without it the objects run
# together ("}{") and the file cannot be parsed back.
# ensure_ascii=False writes the Chinese text as-is instead of \uXXXX escapes.
with open('result.txt', 'a', encoding='utf-8') as f:
    f.write(json.dumps(dict1, ensure_ascii=False) + '\n')

# 利用循环获取猫眼电影所有数据

def main():
    """Build the URL of each of the ten board pages (offsets 0, 10, ..., 90).

    This step only constructs the URLs; nothing is fetched or returned.
    """
    for offset in range(0, 100, 10):
        url = 'http://maoyan.com/board/4?offset=' + str(offset)
        

# 最后代码整理如下

import json
import re
from time import sleep
import requests


# Pattern for one <dd> ranking entry, compiled once and reused for every page.
# Capture groups, in order: rank, poster URL (data-src), title, cast,
# release date, integer part of the score, fractional part of the score.
# re.S lets "." match newlines, because one entry spans many lines of HTML.
_MOVIE_PATTERN = re.compile(
    r'<dd>.*?>(.*?)</i>.*?data-src="(.*?)".*?title="(.*?)"'
    r'.*?主演:(.*?)</p>.*?上映时间:(.*?)</p>.*?integer.*?>'
    r'(.*?)</i>.*?fraction.*?>(.*?)</i>.*?</dd>',
    re.S,
)


def _parse_movies(html):
    """Yield one dict per movie entry matched in a board page's HTML."""
    for index, image, title, actor, release, integer, fraction in \
            _MOVIE_PATTERN.findall(html):
        yield {'index': index, 'image': image, 'title': title,
               'actor': actor, 'time': release, 'score': integer + fraction}


def main():
    """Scrape ten board pages and append every movie found to result.txt.

    Pages are requested at offsets 0, 10, ..., 90. Each movie is written as
    one JSON object per line (UTF-8, non-ASCII characters kept as-is), so the
    file can be read back line by line.
    """
    for i in range(10):
        url = 'http://maoyan.com/board/4?offset=' + str(i * 10)
        html = requests.get(url).text

        # Open the output once per page rather than once per record.
        with open('result.txt', 'a', encoding='utf-8') as f:
            for movie in _parse_movies(html):
                f.write(json.dumps(movie, ensure_ascii=False) + '\n')

        # Pause between page requests to avoid hammering the server.
        sleep(1)


if __name__ == '__main__':
    main()

 

你可能感兴趣的:(爬虫,python)