Python crawler: scraping the Maoyan Top 100 movie board (source code)

# Based on 崔大庆's video tutorial
import requests
import re
import json
from requests.exceptions import RequestException
from multiprocessing import Pool
from bs4 import BeautifulSoup

# Fetch one page and return its HTML text (or None on failure)
def getOnePage(url):
    try:
        headers = {
            'Host': 'maoyan.com',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
            'Accept': '*/*',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN'
        }
        response = requests.get(url, headers=headers)

        if response.status_code == 200:
            return response.text

        return None
    except RequestException:
        return None

# Use a regular expression to extract the needed fields and yield each movie as a dict
def parseOnePage(html):
    pattern = re.compile(
        r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a'
        r'.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
        r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>',
        re.S)
    items = re.findall(pattern, html)
    for item in items:
        yield {
            'index': item[0],
            'image': item[1],
            'title': item[2],
            'actor': item[3],
            'time': item[4],
            'score': item[5] + item[6]
        }

# Append each record to the result file as one JSON line
def writeToFile(content):
    with open("maoyan.txt", 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + "\n")

# Fetch one page of the board, parse it and write the results
def main(offset):
    url = "http://maoyan.com/board/4?offset=" + str(offset)
    html = getOnePage(url)
    if html is None:
        return
    # Debug output: BeautifulSoup is only used here to dump the fetched page
    soup = BeautifulSoup(html, 'html.parser')
    print(soup)
    for item in parseOnePage(html):
        # print(item)
        writeToFile(item)

# Entry point
if __name__ == '__main__':
    # main(0)
    # for i in range(10):
    #     print(i)
    #     main(i * 10)

    # Scrape the ten pages of the board in parallel with a process pool
    pool = Pool()
    pool.map(main, [i * 10 for i in range(10)])
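Note that BeautifulSoup is imported but only used to dump the fetched page for debugging; all field extraction is done by the regular expression. If you would rather avoid the long pattern, the same fields can be pulled out with BeautifulSoup instead. The sketch below is a hypothetical drop-in replacement for parseOnePage, not part of the original script: it assumes each movie on the board page sits in a <dd> element and that the class names the regex already relies on (board-index, name, star, releasetime, integer, fraction) and the data-src attribute are present in the markup.

# Minimal sketch (assumed markup): parse the board page with BeautifulSoup instead of a regex
def parseOnePageSoup(html):
    soup = BeautifulSoup(html, 'html.parser')
    for dd in soup.find_all('dd'):
        yield {
            'index': dd.find(class_='board-index').get_text(strip=True),
            # locate the poster <img> by its data-src attribute (assumed to exist, as in the regex)
            'image': dd.find('img', attrs={'data-src': True})['data-src'],
            'title': dd.find(class_='name').get_text(strip=True),
            'actor': dd.find(class_='star').get_text(strip=True),
            'time': dd.find(class_='releasetime').get_text(strip=True),
            'score': dd.find(class_='integer').get_text(strip=True)
                     + dd.find(class_='fraction').get_text(strip=True)
        }

To try it, call parseOnePageSoup(html) in place of parseOnePage(html) inside main. Either way, running the script appends one JSON object per movie to maoyan.txt with the keys index, image, title, actor, time and score.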

