抓取猫眼电影 Code

import json, requests, re
from datetime import time

def get_one_page(url):
    """Fetch *url* and return the response body as text.

    Args:
        url: Address of the page to download.

    Returns:
        The decoded response body when the server answers with HTTP 200,
        otherwise ``None`` (non-200 status or a failed request).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36'
    }
    try:
        # A timeout keeps a stalled connection from blocking the crawl forever.
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException as exc:
        # Only network/HTTP failures are reported and swallowed here;
        # KeyboardInterrupt and SystemExit are allowed to propagate.
        print(exc)
        return None
    if response.status_code == 200:
        return response.text
    return None

# NOTE(review): the HTML tag fragments of these patterns were lost when the
# listing was published (only the attribute anchors survived). The tags are
# reconstructed here and should be confirmed against the live page markup.
_MOVIE_PATTERN = re.compile(
    r'<dd>.*?board-index.*?>(.*?)</i>'      # rank held in the <i> node
    r'.*?data-src="(.*?)"'                  # poster image URL
    r'.*?class="name".*?<a.*?>(.*?)</a>'    # movie title
    r'.*?class="star">(.*?)</p>'            # leading actors
    r'.*?class="releasetime">(.*?)</p>'     # release time
    r'.*?class="integer">(.*?)</i>'         # score, integer part
    r'.*?class="fraction">(.*?)</i>',       # score, fractional part
    re.S,
)


def _strip_label(text):
    """Return *text* without surrounding whitespace or a leading "label：" prefix.

    The original code removed the label with fixed-width slices ([3:] and
    [4:]); splitting on the full-width colon does not depend on the label
    length. Text without a full-width colon is returned unchanged (stripped).
    """
    text = text.strip()
    _, sep, tail = text.partition('：')
    return tail.strip() if sep else text


def parse_one_page(html):
    """Yield one dict per movie entry found in *html*.

    Args:
        html: Markup of one board page. ``None`` or an empty string yields
            nothing.

    Yields:
        Dicts with the keys ``index``, ``image``, ``title``, ``actor``,
        ``time`` and ``score``; every value is a string.
    """
    if not html:
        return
    for rank, image, title, actors, release, integer, fraction in \
            _MOVIE_PATTERN.findall(html):
        yield {
            'index': rank,
            'image': image,
            'title': title,
            'actor': _strip_label(actors),
            'time': _strip_label(release),
            # The score is split across two nodes; join them back together.
            'score': integer.strip() + fraction.strip(),
        }


def write_json(data):
    """Append *data* to ``movie.json`` as one JSON document per line."""
    with open('movie.json', 'a', encoding='utf-8') as w:
        # The file is opened as UTF-8, so keep non-ASCII text readable
        # instead of escaping it to \uXXXX sequences.
        json.dump(data, w, ensure_ascii=False)
        w.write('\n')


def main(offset):
    """Download the board page at *offset* and store every parsed entry.

    Args:
        offset: Value of the ``offset`` query parameter of the board URL.
    """
    url = 'Http://maoyan.com/board/4?offset=' + str(offset)
    html = get_one_page(url)
    if html is None:
        # get_one_page() returns None on failure; there is nothing to parse.
        return
    for item in parse_one_page(html):
        write_json(item)


if __name__ == '__main__':
    # Imported locally: the module-level name `time` is bound to
    # datetime.time, which has no sleep().
    from time import sleep

    for i in range(10):
        main(offset=i * 10)
        # Pause between requests.
        sleep(2)

你可能感兴趣的:(抓取猫眼电影 Code)