Python 抓取纵*横中文网小说内容 实现小说内容AES解密还原

嗨喽~大家好呀,这里是魔王呐 ❤ ~!

python更多源码/资料/解答/教程等 点击此处跳转文末名片免费获取

知识点:

  • 爬虫基本流程

  • nodejs的使用

  • AES加密还原

开发环境:

  • 解释器: python 3.8

  • 编辑器: pycharm 2022.3

第三方模块:

  • crypto-js>>> npm install crypto-js

  • requests / pyexecjs >>> pip install requests pyexecjs

模块安装:

win + R 输入cmd 输入安装命令 pip install 模块名 (如果你觉得安装速度比较慢, 你可以切换国内镜像源)

爬虫作用:

采集数据 / 模拟用户行为

(可见即可爬, 爬虫不是破解)

爬虫原理:

模拟成 客户端 向 服务器 发送网络请求

批量采集数据 / 将重复性的操作自动化

爬虫基本流程:

一、思路分析

找到数据的来源

先分析单章的小说情况

https://read.zongheng.com/chapter/1215341/68208370.html

小说内容 都在网页源代码当中

二、实现代码

  1. 发送请求

  2. 获取数据

  3. 解析数据

  4. 保存数据

代码展示

'''
遇到问题没人解答?小编创建了一个Python学习交流QQ群:926207505
寻找有志同道合的小伙伴,互帮互助,群里还有不错的视频学习教程和PDF电子书!
'''
import re
import requests

# Cookie jar passed to requests through the `cookies=` keyword argument.
# NOTE(review): the `logon`, `__logon__`, `__zhs__`, `__zhc__` and `loginphone`
# entries look like values captured from a logged-in browser session (including
# a phone number) -- confirm, and consider loading them from the environment
# instead of committing them to source.
cookies = {
    'sensorsdata2015jssdkcross': '%7B%22distinct_id%22%3A%2218a55c160d8979-0721567b14502b-26031f51-1764000-18a55c160d91161%22%2C%22%24device_id%22%3A%2218a55c160d8979-0721567b14502b-26031f51-1764000-18a55c160d91161%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E8%87%AA%E7%84%B6%E6%90%9C%E7%B4%A2%E6%B5%81%E9%87%8F%22%2C%22%24latest_referrer%22%3A%22https%3A%2F%2Fwww.baidu.com%2Flink%22%2C%22%24latest_referrer_host%22%3A%22www.baidu.com%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC%22%7D%7D',
    'ZHID': '6B8B1AA3F580487ED41316E742908662',
    'zhffr': 'www.baidu.com',
    'logon': 'NTU1MjUwMDM%3D%7CMA%3D%3D%7C%7C5Lmm5Y%2BLNjE2NjMzMjY%3D%7CdHJ1ZQ%3D%3D%7CMjEzNjIwOTA5MA%3D%3D%7C8592118A0059F35F304A37C75A03EBFA',
    '__logon__': 'NTU1MjUwMDM%3D%7CMA%3D%3D%7C%7C5Lmm5Y%2BLNjE2NjMzMjY%3D%7CdHJ1ZQ%3D%3D%7CMjEzNjIwOTA5MA%3D%3D%7C8592118A0059F35F304A37C75A03EBFA',
    '__zhs__': '973b86c348fb9e61d9d2d4d0b547163185771221da00ef66e39fda1bb69f1cb3b9fdbf6a9d4951cace27c091188d5bf0523696eec79fda879fde21d1c7f7a04c4f74760316032fe19f4b22607ba71697fe11db20672a9dc00b547639aa6302516f2bed9e51e93c11fc9ca0c6fb9d6c501503ca3588543a4c2c8a03b4dd9d4433cd7a339653b26b8d9f2c612227f501f4e55fa5987245bdf4b8e61a05984c286f53e6f6850b1101338b1517b5c2de5aeea5dc0e993b7a36212a22c1933fa40c6f33033ad83c8d4bd3e357f6de6df90c8d9a3930b43efc853d650c1424e067262e9522284830db113629c22fc63847000f24a276b63eef37b895f70b22c78db2f8',
    '__zhc__': '30820122300d06092a864886f70d01010105000382010f003082010a0282010100d2dc7166074e33bbf091a23856b0e3e56888b6060c154a102a401b2a88f60f60610fb20df48d03c51c14441987f59edb3dd73dba0e2fbe0bf0074986ef38dd8f907f1312f06ba93ceef9ec18b9ae4abab4b439490062eb152fb01bbc331fd15fa2c4e1ce370ce555df528b71ca2e12d5cdcc138232b745bbccc3568e39802350d46b1f08925e14127bc0a67b50c74674ca67e42b0c396f92bbee6b38b550a022c32897b6369c2f5def7eaa01d667b3a70953ac2152d2777e91d67a31a4d8c159c8d4a0de1602f73492991c1a88e42157c7c513fde391dd31763664ef75d532262d5b8b72fc4bf38ce361bc17f53b940cd5ce4a37ab69f1a3c9549fd985db8a870203010001',
    'loginphone': '19973017649',
    'zh_visitTime': '1700136858520',
    'Hm_lvt_c202865d524849216eea846069349eb9': '1700136859',
    'PassportCaptchaId': '79cceff0bf14300835cd3bad4db019f1',
    'Hm_lpvt_c202865d524849216eea846069349eb9': '1700136972',
}
# HTTP request headers sent with every request, copied from a Chrome 119
# session on Windows (see the User-Agent and sec-ch-ua values below).
#
# The raw `Cookie` header is intentionally not listed here: the same values
# are supplied through the `cookies=` argument of requests. The previous
# commented-out copy of that header wrapped onto a second, uncommented line,
# which made the whole dict literal a syntax error, so it has been removed.
#
# NOTE(review): the 'Origin' and 'Referer' values are redacted placeholders
# from the published article -- replace them with the real site origin before
# relying on them.
headers = {
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    'Origin': '******',
    'Pragma': 'no-cache',
    'Referer': '*****/',
    'Sec-Fetch-Dest': 'empty',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Site': 'same-site',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
    'sec-ch-ua': '"Google Chrome";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
}
# Multi-chapter variant, kept commented out as in the original article.
# The API host was redacted there ('******'), so it cannot run as written.
# data = {
#     'bookId': '1215341',
# }
# resp = requests.post('******/api/chapter/getChapterList', cookies=cookies, headers=headers, data=data)
# json_data = resp.json()
# chapterViewList = json_data['result']['chapterList'][0]['chapterViewList']
# for chapterView in chapterViewList:
#     url = f'https://read.zongheng.com/chapter/{chapterView["bookId"]}/{chapterView["chapterId"]}.html'
#     save_chapter(url, chapterView['chapterName'])

# Regex primer for ".*?":
#   "." matches any single character, "*" repeats it, "?" makes it non-greedy.
#   Example: "aaaaa.*?bbbbb" applied to "aaaaacccccbbbbbeeeeebbbbb" stops at the
#   first "bbbbb" instead of the last one.
#
# NOTE(review): the HTML tags inside the original pattern and inside the
# .replace() calls were stripped when the article was extracted. The
# '<div class="content" ...>' wrapper and the '<p>' / '</p>' paragraph tags
# below are a reconstruction -- confirm them against the live page source.
CONTENT_PATTERN = re.compile(r'<div class="content"[^>]*>(.*?)</div>', re.S)
TITLE_PATTERN = re.compile(r'<title>(.*?)</title>', re.S)

# Seconds to wait for the server before giving up, so a stalled connection
# cannot hang the script forever.
REQUEST_TIMEOUT = 10


def extract_chapter_text(html_data, chapter_name=''):
    """Pull the chapter body out of a chapter page and format it for saving.

    Args:
        html_data: HTML source of one chapter page.
        chapter_name: Heading to put above the body. When empty, the text of
            the page's <title> element is used if there is one.

    Returns:
        The heading, a blank line, the body with paragraph tags turned into
        newlines, and a trailing blank line.

    Raises:
        ValueError: If no content block is found in ``html_data`` (instead of
            the bare IndexError that ``re.findall(...)[0]`` used to produce).
    """
    content_match = CONTENT_PATTERN.search(html_data)
    if content_match is None:
        raise ValueError('chapter content block not found in page HTML')

    if not chapter_name:
        # The single-chapter path has no chapter list entry to take the name
        # from, so fall back to the page title rather than an undefined name.
        title_match = TITLE_PATTERN.search(html_data)
        chapter_name = title_match.group(1).strip() if title_match else ''

    body = content_match.group(1).replace('<p>', '\n').replace('</p>', '\n')
    return chapter_name + '\n\n' + body + '\n\n'


def save_chapter(url, chapter_name='', path='1.txt'):
    """Download one chapter page, print its text and append it to a file.

    Args:
        url: Address of the chapter page.
        chapter_name: Optional heading; see ``extract_chapter_text``.
        path: File the chapter text is appended to (UTF-8).

    Returns:
        The formatted chapter text that was written.
    """
    # Send the request; fail loudly on HTTP errors instead of parsing an
    # error page.
    response = requests.get(url=url, headers=headers, cookies=cookies,
                            timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    # Get the data
    html_data = response.text
    # Parse the data
    text = extract_chapter_text(html_data, chapter_name)
    print(text)
    # Save the data
    with open(path, mode='a', encoding='utf-8') as f:
        f.write(text)
    return text


if __name__ == '__main__':
    # Host restored from the chapter URL shown elsewhere in this article; the
    # published snippet had it redacted as '****'.
    save_chapter('https://read.zongheng.com/chapter/1215341/68311496.html')

尾语

最后感谢你观看我的文章呐~本次航班到这里就结束啦

希望本篇文章能给你带来帮助,让你学到一点知识~

躲起来的星星也在努力发光,你也要努力加油(让我们一起努力叭)。

最后,宣传一下呀~更多源码、资料、素材、解答、交流皆点击下方名片获取呀

你可能感兴趣的:(python爬虫,python,开发语言,pycharm,学习,爬虫)