# 爬虫案例:使用re,requests,json,RequestException爬取猫眼电影Top100加保存文件
# (Crawler example: scrape the Maoyan movies Top-100 board with requests/re/json and save to a file)

#python3.7
#author:huangtao
#pycharm

import requests
from requests.exceptions import RequestException
import re
import json
#导入相关库文件
#定义get_url,通过主程序传递的url进行访问
def get_url(url, headers, timeout=10):
    """Fetch *url* and return the page source, or None on any failure.

    Parameters:
        url (str): address to request.
        headers (dict): HTTP headers; a browser User-Agent is passed in by
            the caller so the site does not reject the request as a bot.
        timeout (float): seconds to wait for the server before giving up.
            New, defaults to 10 — backward compatible; without it a dead
            server would hang the whole crawl forever.

    Returns:
        str or None: response body on HTTP 200, otherwise None
        (callers must check for None before parsing).
    """
    try:
        # requests.Timeout is a subclass of RequestException,
        # so a timed-out request is also caught below and yields None.
        response = requests.get(url, headers=headers, timeout=timeout)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None


def parse_html(html):
    """Parse one Maoyan Top-100 board page and append every movie found to
    the file ``movie`` in the current directory, one JSON object per line.

    NOTE(review): the original regex patterns were destroyed when this code
    was published (the blog stripped all HTML tags out of them). They are
    reconstructed here from the standard Maoyan board markup — confirm
    against the live page before relying on them.
    """
    # Each list below is position-aligned with the page's <dd> entries.
    ranks = re.findall(r'<dd>.*?board-index.*?>(.*?)</i>', html, re.DOTALL)
    titles = re.findall(r'class="name"><a.*?>(.*?)</a>', html, re.DOTALL)
    lianjies = re.findall(r'data-src="(.*?)"', html, re.DOTALL)
    authors_list = re.findall(r'<p class="star">(.*?)</p>', html, re.DOTALL)
    releasetimes = re.findall(r'<p class="releasetime">(.*?)</p>', html, re.DOTALL)
    scores = re.findall(r'<i class="integer">(.*?)</i><i class="fraction">(.*?)</i>',
                        html, re.DOTALL)
    for rank, title, lianjie, authors, releasetime, score in zip(
            ranks, titles, lianjies, authors_list, releasetimes, scores):
        movie = [{
            '排名': rank,
            '电影名': title,
            '海报链接': lianjie,
            '主演': authors.strip()[3:],       # drop the "主演:" prefix (3 chars)
            '上映时间/国家': releasetime[5:],  # drop the "上映时间:" prefix (5 chars)
            '分数': score[0] + score[1],       # integer part + fraction part, e.g. "9." + "6"
        }]
        print(movie)
        # Append one JSON line per movie; ensure_ascii=False keeps the
        # Chinese text readable.  ``with`` closes the file — no explicit
        # close() needed (the original had a redundant f.close() inside with).
        with open('movie', 'a', encoding='utf-8') as f:
            f.write(json.dumps(movie, ensure_ascii=False) + '\n')


def main(offset):
    """Fetch one board page (10 movies per page) at *offset* and parse it."""
    url = 'http://maoyan.com/board/4?offset=' + str(offset)
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) '
                             'AppleWebKit/537.36 (KHTML, like Gecko) '
                             'Chrome/70.0.3538.77 Safari/537.36'}
    html = get_url(url, headers)
    # get_url returns None on any failure; the original passed None straight
    # into parse_html, which would crash in re.findall — skip the page instead.
    if html is not None:
        parse_html(html)


if __name__ == '__main__':
    # 10 pages x 10 movies per page = Top 100.
    for i in range(10):
        main(i * 10)

 

# 你可能感兴趣的:(Python的爬虫使用案例,Python的常见库使用,一只网络上的虫(爬虫实例))