Python Scraping: Crawling Maoyan's "Most Anticipated" Board with requests + Regular Expressions

# Basic approach:
# 1. Open the board in a browser - Most Anticipated board: url = 'http://maoyan.com/board/6?offset=0' (the offset paging is sketched right after this list)
# 2. Fetch the data: movie name, starring actors, page URL, release date, wishes added this month, total wish count
# 3. Parse and process the data
# 4. Save the data
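The offset query parameter in step 1 is how the board pages through its entries; main() below steps it by 10 across four pages, which amounts to building these URLs (a minimal sketch):

base = 'http://maoyan.com/board/6?offset='
urls = [base + str(i * 10) for i in range(4)]
# -> offset = 0, 10, 20, 30: the four pages of the most-anticipated board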

import requests
from requests.exceptions import RequestException
import re
import json
from multiprocessing import Pool           # process pool, used to fetch the pages in parallel (processes, not threads)

def get_page(url):        # request a board page and return its HTML text
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        else:
            return None
    except RequestException:
        # Return None on network errors too, so callers only ever see HTML or None
        return None
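Maoyan may answer the default requests User-Agent with a verification page or an empty body, so if get_page() keeps returning None it is worth sending a browser-like header. A minimal sketch; the UA string is only an example, any recent browser string should do:

headers = {
    'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                   '(KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36')   # example UA
}
response = requests.get(url, headers=headers, timeout=10)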

def analysis(html):  # parse the fields of each movie out of the page HTML
    # The pattern follows the markup of Maoyan's board page (one <dd> block per movie).
    # It is a reconstruction of the page structure at the time of writing and may need
    # adjusting if the markup changes; the wish counts sit inside <span class="stonefont">
    # elements (Maoyan's obfuscated number font).
    pattern = re.compile(
        '<dd>.*?<p class="name"><a.*?>(.*?)</a></p>'
        '.*?class="star">(.*?)</p>'
        '.*?class="releasetime">(.*?)</p>'
        '.*?class="month-wish">.*?<span class="stonefont">(.*?)</span>'
        '.*?class="total-wish">.*?<span class="stonefont">(.*?)</span>',
        re.S)                                   # re.S lets "." also match newlines
    result = re.findall(pattern, html)
    for item in result:
        yield {
            'name': item[0],
            'actor': item[1].strip()[3:],       # drop the "主演:" prefix
            'time': item[2].strip()[5:],        # drop the "上映时间:" prefix
            'month_wish': item[3],
            'total_wish': item[4]
        }

def write_to_file(content):  # append one result per line to a local file
    with open('result.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')

def main(offset):
    url = 'http://maoyan.com/board/6?offset=' + str(offset)
    html = get_page(url)
    if html is None:        # skip pages that failed to download
        return
    for item in analysis(html):
        print(item)
        # write_to_file(item)

if __name__ == '__main__':
    # Sequential version:
    # for i in range(0, 4):
    #     main(i * 10)

    # Parallel version: one worker process per page
    pool = Pool()
    pool.map(main, [i * 10 for i in range(0, 4)])
    pool.close()
    pool.join()
