网络爬虫爬取动态网页数据

目录

一、导学与指南

豆瓣单页分析

豆瓣多页输出

二、理论学习

1.抓取动态网页的技术

2.Selenium和WebDriver的安装与配置

3.Selenium的基本使用

三、小结


一、导学与指南

豆瓣单页分析

import json
 
import requests
 
# Base page URL. It is no longer useful for scraping: the movie list is not
# present in the HTML this page returns (see the commented-out check below).
url_base="https://movie.douban.com/typerank?type_name=%E5%89%A7%E6%83%85&type=11&interval_id=100:90&action="

# URL of the dynamic-content request, found by analysing the page's network
# traffic; this is the one that actually returns the data we want.
# The query string carries start=20 and limit=20.
url = "https://movie.douban.com/j/chart/top_list?type=11&interval_id=100%3A90&action=&start=20&limit=20"
# Request headers sent with every call.
# NOTE(review): the Cookie value looks like it was copied from a browser
# session and is presumably tied to that session -- confirm and replace it
# with your own before running.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
    'Cookie': 'bid=hCw6GK7T3ko; _pk_id.100001.4cf6=e05499d4844cbfde.1697382901.; __yadk_uid=Y0K7d13OW6bvDo7Rfg4GEhEopPLKv9Vk; ll="118303"; _vwo_uuid_v2=D116B2284E0415DE6F0E8E62C0F3F1B7C|dbd80bec580d442e73cbc806b51e709a; ct=y; douban-fav-remind=1; __utmc=30149280; __utmz=30149280.1698504570.8.7.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utmc=223695111; __utmz=223695111.1698504570.7.6.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; Hm_lvt_16a14f3002af32bf3a75dfe352478639=1698504600; Hm_lpvt_16a14f3002af32bf3a75dfe352478639=1698504600; ap_v=0,6.0; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1698558921%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DxUZ2pFPHLGI9UjAZ5BVOkGTqzr9mirz0hM9tSnQ4LgGBvkpYQRkEaveglj68M1Hs%26wd%3D%26eqid%3Dbd0c500a000673f800000006653d1f77%22%5D; __utma=30149280.1569954803.1697382901.1698557082.1698558922.11; __utma=223695111.1379090793.1697382901.1698557082.1698558922.10'
}
# Step 1 (kept for reference, disabled): fetch url_base and inspect the HTML.
# result_base = requests.get(url_base, headers=headers)
# print(result_base.apparent_encoding)  # utf-8
# print(result_base.encoding)  # utf-8
# print(result_base.text)  # searching it for the top-ranked film
#                          # ("The Shawshank Redemption") finds nothing


# Step 2: compare with the requests listed under F12 -> Network -> Fetch/XHR
# and call the JSON endpoint directly.
# The timeout keeps the script from hanging forever on a stalled connection.
result = requests.get(url, headers=headers, timeout=10)
# Fail fast on 4xx/5xx responses instead of trying to parse an error page.
result.raise_for_status()
# print(result.apparent_encoding)  # utf-8
# print(result.encoding)  # utf-8
# print(result.text)  # response looked correct; output disabled


# json.loads(text) is the alternative when all you have is a JSON string,
# which is common after extracting it with a regular expression.
# r = json.loads()
result_json = result.json()  # the response body is JSON, so parse it directly
# print(result_json)  # parsed successfully; output disabled
 
# Build one summary dict per record in the parsed JSON response.
movies = []
for i in result_json:
    movie = {
        "title": i["title"],
        "date": i["release_date"],
        "type": i["types"],
        "score": i["score"],
        "actors": i["actors"],
    }
    # Keep the record so that `movies` holds the results after the loop.
    movies.append(movie)
    

你可能感兴趣的:(爬虫)