爬虫练习 | 爬取猫眼电影Top100

#coding=utf-8
_date_ = '2018/12/9 16:18'
import requests
import re
import json
import time
def get_one_page(url):
    """Download *url* and return the response body as text.

    A desktop-browser User-Agent header is sent with the request.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded response text when the server answers with HTTP 200,
        otherwise ``None`` (non-200 status, timeout or connection error).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'
    }
    try:
        # Without a timeout a stalled connection would block the crawl forever.
        reg = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException:
        print('出错了')
        return None
    if reg.status_code == 200:
        return reg.text
    print('出错了')
    return None
def write_to_file(conments):
    """Append *conments* to ``page.text`` as a single JSON line.

    The file is opened in append mode with UTF-8 encoding, and non-ASCII
    characters are written as-is rather than being escaped.
    """
    with open('page.text', mode='a', encoding='utf-8') as out:
        serialized = json.dumps(conments, ensure_ascii=False)
        out.write(f'{serialized}\n')

# One <dd>...</dd> element per movie on the board page. The seven groups are:
# rank, poster URL, title, cast line, release line, integer part of the score
# and fractional part of the score. Compiled once at import time.
_MOVIE_PATTERN = re.compile(
    r'<dd>.*?board-index.*?>(.*?)</i>'
    r'.*?data-src="(.*?)"'
    r'.*?name.*?a.*?>(.*?)</a>'
    r'.*?star.*?>(.*?)</p>'
    r'.*?releasetime.*?>(.*?)</p>'
    r'.*?"integer.*?>(.*?)</i>'
    r'.*?"fraction.*?>(.*?)</i>'
    r'.*?</dd>',
    re.S,  # '.' must also match newlines: every entry spans several lines
)


def parse_one_page(html):
    """Yield one dict per movie entry found in *html*.

    Args:
        html: Markup of a board page, or ``None``/empty when the download
            failed.

    Yields:
        Dicts with the keys ``index``, ``image``, ``name``, ``star``,
        ``time`` and ``score``; all values are strings. ``star`` and
        ``time`` have surrounding whitespace removed, and ``score`` is the
        integer and fractional parts joined together.
    """
    if not html:
        # Nothing was downloaded, so there is nothing to parse.
        return
    for item in _MOVIE_PATTERN.findall(html):
        yield {
            'index': item[0],
            'image': item[1],
            'name': item[2],
            'star': item[3].strip(),
            'time': item[4].strip(),
            'score': item[5] + item[6],
        }


if __name__ == '__main__':
    # The board is paginated ten entries per page via the ``offset`` parameter.
    for page in range(0, 10):
        url = 'https://maoyan.com/board/4?offset={}'.format(page * 10)
        html = get_one_page(url)
        for movie in parse_one_page(html):
            write_to_file(movie)
        # Pause between pages to avoid hammering the server.
        time.sleep(1)

 

转载于:https://www.cnblogs.com/404NooFound/p/10092210.html

你可能感兴趣的:(爬虫练习 | 爬取猫眼电影Top100)