Scraping Douban Movie Data with a Python Web Crawler

I. Douban movie data is loaded dynamically, so we need to parse JSON

We take https://movie.douban.com/typerank?type_name=%E5%89%A7%E6%83%85&type=11&interval_id=100:90&action= as our example.
Open it in Firefox.

Completing step four gives you the request URL for the dynamic JSON data. While carrying out step three, you will notice that as you click different URLs, the parameters after action in the request URL shown under the message headers change: start is the index of the first movie to return, and limit is how many movies one page returns. To make the data easier to extract, the program below sets limit=1 and drives start through a loop; see the figure below for how large start needs to go.
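Before running the full crawl, you can confirm what start and limit do by requesting the endpoint directly with a few different values. This is a minimal sketch; it uses the same top_list request URL captured above, and requests URL-encodes the colon in interval_id for you:

#coding:utf-8
import requests

headers = {'user-agent': 'Mozilla/5.0'}
params = {
    'type': 11,              # 11 is the drama (剧情) category from the page URL
    'interval_id': '100:90',
    'action': '',
    'start': 0,              # index of the first movie to return
    'limit': 5,              # how many movies to return in one response
}
response = requests.get('https://movie.douban.com/j/chart/top_list',
                        headers=headers, params=params)
print(len(response.json()))  # expect 5: one JSON object per movie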

The data in the JSON file, once formatted, looks like the sample below; the information we need is extracted from these fields.
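(The original screenshot of the formatted JSON is not reproduced here; the snippet below is an illustrative reconstruction with placeholder values, limited to the fields the code extracts.)

[
    {
        "id": "1234567",
        "title": "Example Title",
        "regions": ["USA"],
        "types": ["Drama", "Crime"],
        "release_date": "1994-09-10",
        "score": "9.0",
        "cover_url": "https://img1.doubanio.com/view/photo/...",
        "url": "https://movie.douban.com/subject/1234567/",
        "actors": ["Actor One", "Actor Two"]
    },
    ...
]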

Source code:

#coding:utf-8
import json
from time import sleep


import jsonpath
import requests
import csv


# Parse the JSON data and store it in a CSV file
def pardeContent(url, headers):
    # Douban is served over HTTPS, so one small tweak is needed here:
    # verify=False tells requests not to validate the SSL certificate
    response = requests.get(url, headers=headers, verify=False)
    # Read the response body
    html = response.text
    # Convert the JSON string into a Python object
    html = json.loads(html)
    # Extract the fields and append them to the CSV file
    with open('movie_data.csv', 'a+', newline="", encoding='utf-8') as f:
        writer = csv.writer(f)
        # Pull each field out of the JSON with jsonpath ($.. searches recursively)
        id = jsonpath.jsonpath(html, '$..id')
        title = jsonpath.jsonpath(html, '$..title')
        regions = jsonpath.jsonpath(html, '$..regions')
        types = jsonpath.jsonpath(html, '$..types')
        release_date = jsonpath.jsonpath(html, '$..release_date')
        score = jsonpath.jsonpath(html, '$..score')
        cover_url = jsonpath.jsonpath(html, '$..cover_url')
        url = jsonpath.jsonpath(html, '$..url')
        actors = jsonpath.jsonpath(html, '$..actors')

        print(id, title, regions, types, release_date, score, cover_url, url, actors)
        # Write one row to the CSV file
        writer.writerow([id, title, regions, types, release_date, score, cover_url, url, actors])


if __name__ == "__main__":
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/77.0.3865.120 Safari/537.36'
    }
    with open('movie_data.csv','w+',encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        # Write the CSV header row (in the same order as the data rows)
        writer.writerow(['id', 'title', 'regions', 'types', 'release_date', 'score', 'cover_url', 'url', 'actors'])


    for start in range(0, 654):  # 654 is the total shown in the figure above
        url = "https://movie.douban.com/j/chart/top_list?type=11&interval_id=100%3A90&action=&start={}&limit=1".format(
            start)
        print(url)
        pardeContent(url, headers)
        sleep(1)  # brief pause between requests so we don't hammer the server
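A note on this version's output shape: jsonpath.jsonpath returns the list of every match in the document, so each CSV cell holds a Python list rendered as text. With limit=1 each list has exactly one element, and a data row looks roughly like this (placeholder values):

['1234567'],['Example Title'],[['USA']],[['Drama']],['1994-09-10'],['9.0'],...

This flattening of lists into strings is awkward to post-process, which is part of what the optimized version below cleans up.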
II. Shortcomings of the source code
1. If you want the information for all the movies on Douban, you have to enter each new page's URL by hand every time (a sketch that works around this follows below).
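One way to remove that manual step for the loop bound: the ranking page also fires a request that returns the total number of movies in the score interval, so the 654 ceiling can be fetched instead of read off a screenshot. A minimal sketch; the top_list_count endpoint and its total field are assumptions based on watching the page's network traffic, not a documented API:

#coding:utf-8
import requests

headers = {'user-agent': 'Mozilla/5.0'}
# ASSUMPTION: endpoint and field name observed in the browser's network panel
count_url = 'https://movie.douban.com/j/chart/top_list_count?type=11&interval_id=100%3A90'
total = requests.get(count_url, headers=headers).json()['total']
print(total)  # e.g. 654 for the drama category used in this article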

III. The optimized code

#coding:utf-8
import json
import requests
import csv


# Parse the JSON data and store it in a CSV file
def pardeContent(url, headers):
    # Douban is served over HTTPS, so verify=False tells requests
    # not to validate the SSL certificate
    response = requests.get(url, headers=headers, verify=False)
    # Read the response body
    html = response.text
    # Convert the JSON string into a Python object
    html = json.loads(html)
    # Extract the fields and append them to the CSV file
    with open('movie_data1.csv', 'a+', newline="", encoding='utf-8') as f:
        writer = csv.writer(f)
        # Iterate over the movie objects in the JSON array and read each
        # field directly, instead of querying with jsonpath as before
        for movie in html:
            id = movie["id"]
            title = movie["title"]
            regions = movie["regions"]
            types = movie["types"]
            release_date = movie["release_date"]
            score = movie["score"]
            cover_url = movie["cover_url"]
            url = movie["url"]
            actors = movie["actors"]

            print(id, title, regions, types, release_date, score, cover_url, url, actors)
            # Write one row per movie to the CSV file (inside the loop,
            # so every movie is written even when limit is greater than 1)
            writer.writerow([id, title, regions, types, release_date, score, cover_url, url, actors])


if __name__ == "__main__":
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/77.0.3865.120 Safari/537.36'
    }
    with open('movie_data1.csv','w+',encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        # Write the CSV header row (in the same order as the data rows)
        writer.writerow(['id', 'title', 'regions', 'types', 'release_date', 'score', 'cover_url', 'url', 'actors'])


    for start in range(0, 654):  # 654 movies in total in this category
        url = "https://movie.douban.com/j/chart/top_list?type=11&interval_id=100%3A90&action=&start={}&limit=1".format(
            start)
        print(url)
        pardeContent(url, headers)
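Because limit controls how many movies each response carries, and pardeContent above now writes one CSV row per movie in the response, a further refinement is to fetch in batches instead of one movie per request. A sketch; whether the endpoint honours a limit this large is an assumption worth verifying before relying on it:

total = 654       # total number of movies, still read off the page
page_size = 100   # ASSUMPTION: the endpoint may cap limit below this value
for start in range(0, total, page_size):
    url = ("https://movie.douban.com/j/chart/top_list?type=11"
           "&interval_id=100%3A90&action=&start={}&limit={}").format(start, page_size)
    pardeContent(url, headers)  # one CSV row per movie in the batch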
