使用 requests 库爬取猫眼电影“最受期待榜”榜单 —— 网络爬虫

目标站点:https://maoyan.com/board/6

# coding:utf8
import requests, re, json
from requests.exceptions import RequestException


# from multiprocessing import Pool

# Fetch a page
def get_one_page(url, timeout=10):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection.

    Returns:
        The response text when the status code is ``requests.codes.ok``;
        ``None`` for any other status code or when the request raises
        ``RequestException`` (which includes timeouts).
    """
    try:
        resp = requests.get(url, timeout=timeout)
    except RequestException:
        # Best-effort fetch: callers treat None as "page unavailable".
        return None
    if resp.status_code == requests.codes.ok:
        return resp.text
    return None


# Page parsing
# NOTE(review): the HTML tags in this pattern (<dd>, </i>, <a ...>, </a>,
# </p>, </dd>) were lost when the snippet was pasted and have been
# reconstructed from the surviving fragments -- confirm against the live
# page markup before relying on it.
_ITEM_PATTERN = re.compile(
    r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)"'
    r'.*?name"><a.*?>(.*?)</a>.*?star">(.*?)</p>'
    r'.*?releasetime">(.*?)</p>'
    r'.*?</dd>',
    re.S,  # let "." span newlines so one entry can cover several lines
)


def parse_one_page(html):
    """Yield one dict per board entry found in *html*.

    Args:
        html: Page source as a string. ``None`` or an empty string yields
            nothing, so a failed download does not crash the caller.

    Yields:
        Dicts with the keys ``index``, ``img_url``, ``title``, ``stars`` and
        ``releasetime``; every value is a string captured by the pattern.
        ``stars`` has its first three characters removed (presumably a
        leading label such as "主演：" -- confirm against the page).
    """
    if not html:
        return
    # findall returns a list of 5-tuples, one per matched entry.
    for item in _ITEM_PATTERN.findall(html):
        yield {
            'index': item[0],
            'img_url': item[1],
            'title': item[2],
            'stars': item[3][3:],
            'releasetime': item[4],
        }


# Write the scraped content to a file
def write_file(content):
    """Append *content* to ``content.txt`` as a single JSON line.

    Args:
        content: A JSON-serialisable object (here, one parsed entry dict).
    """
    # The with-block closes the file; no explicit close() is needed.
    with open('content.txt', 'a', encoding='utf-8') as f:
        # ensure_ascii=False keeps non-ASCII text readable in the output.
        f.write(json.dumps(content, ensure_ascii=False) + '\n')


# Main routine
def main(offset):
    """Fetch one board page at *offset*, then save and print each entry.

    Args:
        offset: Value appended to the ``offset`` query parameter of the URL.
    """
    url = "https://maoyan.com/board/6/?offset=" + str(offset)
    html = get_one_page(url)
    if html is None:
        # get_one_page signals failure with None; report it and skip the page.
        print("Failed to fetch " + url)
        return
    for item in parse_one_page(html):
        write_file(item)
        print(item)


if __name__ == "__main__":
    # Request 5 pages, stepping the offset by 10 each time
    for i in range(5):
        main(i * 10)

requests库爬取猫眼电影“最受期待榜”榜单 --网络爬虫_第1张图片

转载于:https://www.cnblogs.com/qikeyishu/p/10758081.html

你可能感兴趣的:(requests库爬取猫眼电影“最受期待榜”榜单 --网络爬虫)