Backing Up WeChat Official Account Articles with a Crawler
1、Introduction and overview
For high-quality official accounts or web pages, most people simply bookmark the articles they like so they can revisit or share them later. But a favorite article can be deleted at any time, and losing one feels like losing a treasured possession. A crawler backup made in advance avoids this entirely.
The technical approach is: use a packet-capture tool to obtain the JSON payload that lists the articles; parse the article URLs out of that JSON; fetch each article's content over HTTP; parse out the image URLs in the article and download the images; rewrite the image references in the response to point at the local copies; save the page locally; and finally convert the saved page to PDF with wkhtmltopdf.
2、Preparing the tools
a、A packet-capture tool, used to obtain the JSON result set of official-account articles: https://www.charlesproxy.com/
b、wkhtmltopdf, which converts pages to PDF: https://wkhtmltopdf.org/
c、General-purpose tools such as PyCharm need no introduction.
3、Capturing the JSON payload
Open WeChat on your computer and enter the official account's page of historical posts.
Refresh and scroll down to load more of the history. In Charles, inspect the captured traffic and filter for the requests we need, those going to mp.weixin.qq.com/mp/profile_ext?action=getmsg.
This gives us the set of request URLs.
Paste one of these URLs into Chrome and you will see that the request returns a JSON string; the content_url fields inside it are the article links we want.
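For orientation: the top-level JSON carries a general_msg_list field, which is itself a JSON string that has to be decoded a second time. Decoded, it looks roughly like the skeleton below; the field names are taken from the parsing code in section 4, and the values are placeholders:
{
  "list": [
    {
      "comm_msg_info": {"datetime": 1624687200, "type": 49},
      "app_msg_ext_info": {
        "title": "...",
        "content_url": "https://mp.weixin.qq.com/s?...",
        "is_multi": 1,
        "multi_app_msg_item_list": [
          {"title": "...", "content_url": "..."}
        ]
      }
    }
  ]
}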
4、Extracting the article URLs
Copy the request log as text, then a small string-processing script can parse out the URLs; sending those requests yields the JSON, from which we extract content_url.
Parse the URLs out by string splitting:
def str_to_json():
    urls = read_file(filepath)  # filepath is the path constant defined below
    lines = urls.split('\n')
    data = []
    for line in lines:
        # each copied row looks roughly like "<url> 200 ...", so keep the part before the status code
        url = line.split('200')[0]
        data.append(url)
    return data

# read a file
def read_file(filepath):
    with open(filepath, 'r', encoding='utf-8') as f:
        all_the_text = f.read()
    return all_the_text
Save each response to a local file (supply your own cookie):
def get_json(data):
    header = b'''
    sec-ch-ua: " Not;A Brand";v="99", "Google Chrome";v="91", "Chromium";v="91"
    x-requested-with: XMLHttpRequest
    sec-ch-ua-mobile: ?0
    user-agent:
    sec-fetch-site: same-origin
    sec-fetch-mode: cors
    sec-fetch-dest: empty
    accept-encoding: gzip, deflate, br
    accept-language: zh-CN,zh;q=0.9
    cookie: XXXX
    '''
    # convert the raw header text into a dict
    headers = headers_raw_to_dict(header)
    pattern = r"general_msg_list"
    num = 0
    for url in data:
        response = requests.get(url=url, headers=headers, verify=False)
        html = response.text
        result = re.search(pattern, html)
        print(html)
        print(result)
        if result is not None:  # re.search returns a Match object or None, never the string 'None'
            num = num + 1
            save_file(json_filepath + str(num) + '.json', html)

# save a file
def save_file(fpath, fileContent):
    with open(fpath, 'w', encoding='utf-8') as f:
        f.write(fileContent)
Two path constants are needed:
filepath = '/Users/../file/str2json.txt'
json_filepath = '/Users/tmp/data/wechat/book/json/'
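A minimal sketch of how the two steps above chain together:
if __name__ == '__main__':
    urls = str_to_json()  # parse the request urls out of the copied Charles text
    get_json(urls)        # fetch each url and save the json responses locally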
Parse the article URLs out of the saved JSON:
# extract the article url and related info from the saved json files
def GetArticleList(jsondir):
    filelist = os.listdir(jsondir)
    ArtList = []
    pattern = r"unknown"
    for file in filelist:
        filepath = os.path.join(jsondir, file)
        if file == '.DS_Store':  # skip the macOS folder metadata file
            continue
        filetxt = read_file(filepath)
        jsbody = json.loads(filetxt)
        result = re.search(pattern, str(jsbody))
        if result is not None:  # skip responses containing the error marker
            continue
        try:
            general_msg_list = jsbody["general_msg_list"]
        except KeyError:
            print('error' + str(jsbody))
            general_msg_list = ''
        if general_msg_list == '':
            continue
        jsbd2 = json.loads(general_msg_list)  # general_msg_list is itself a json string
        msg_list = jsbd2["list"]
        for item in msg_list:  # one item may contain several articles
            artidx = 1  # note: this index only makes saving the html convenient; it does not reflect the article's real slot in the push (first, second, third...)
            comm_msg_info = item["comm_msg_info"]
            pubstamp = comm_msg_info["datetime"]
            pubdate = Timestamp2Datetime(pubstamp)
            if comm_msg_info["type"] == 49:  # 49 = ordinary image-and-text post; other types are ignored for now
                app_msg_ext_info = item["app_msg_ext_info"]
                url = app_msg_ext_info["content_url"]  # article link
                idx = artidx
                title = app_msg_ext_info["title"]
                art = Article(url, pubdate, idx, title)
                if len(url) > 3:  # skip incomplete urls
                    ArtList.append(art)
                    print(len(ArtList), pubdate, idx, title)
                if app_msg_ext_info["is_multi"] == 1:  # several articles pushed at once
                    multi_app_msg_item_list = app_msg_ext_info["multi_app_msg_item_list"]
                    for subArt in multi_app_msg_item_list:
                        artidx += 1  # increment per sub-article so the saved filenames do not collide
                        url = subArt["content_url"]
                        idx = artidx
                        title = subArt["title"]
                        art = Article(url, pubdate, idx, title)
                        if len(url) > 3:
                            ArtList.append(art)
                            print(len(ArtList), pubdate, idx, title)
    return ArtList
# Article class: url, publish date, index, title
class Article:
    def __init__(self, url, pubdate, idx, title):
        self.url = url
        self.pubdate = pubdate
        self.idx = idx
        self.title = title
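GetArticleList also calls Timestamp2Datetime, which the post never shows; a minimal sketch, assuming a filename-friendly yyyymmdd_HHMM format:
def Timestamp2Datetime(pubstamp):
    # turn a unix timestamp into a string safe to use in filenames, e.g. 20210626_0930
    return datetime.fromtimestamp(pubstamp).strftime('%Y%m%d_%H%M')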
If any of the requested URLs are empty or malformed, exceptions can occur, so add null checks or exception handling. The returned ArtList holds the parsed article URLs.
5、Fetching article content and downloading the images
Fetch the article's content via its URL, rewrite the external references it pulls in, and download the images locally.
# download the web page at url
def DownLoadHtml(url):
    # build the request headers
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Connection': 'keep-alive',
        'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3'
    }
    requests.packages.urllib3.disable_warnings()
    response = requests.get(url, headers=headers, proxies=None, verify=False)
    if response.status_code == 200:
        htmltxt = response.text  # the page body
        return htmltxt
    else:
        return None
# rewrite the img src attributes so the images display properly
# htmltxt: page text  saveimgdir: image save directory  htmlname: page name
def ChangeImgSrc(htmltxt, saveimgdir, htmlname):
    bs = BeautifulSoup(htmltxt, "lxml")  # build a BeautifulSoup object from the page source; the second argument is always lxml
    imgList = bs.findAll("img")
    imgindex = 0
    for img in imgList:
        imgindex += 1
        originalURL = ""  # the image's real url
        if "data-src" in img.attrs:  # WeChat lazy-loads images, so the real url sits in data-src
            originalURL = img.attrs["data-src"]
        if len(originalURL) > 0:
            print("\r down imgs " + "▇" * imgindex + " " + str(imgindex), end="")
            if "data-type" in img.attrs:
                imgtype = img.attrs["data-type"]
            else:
                imgtype = "png"
            imgname = htmlname + "_" + str(imgindex) + "." + imgtype  # image name of the form 1.png
            imgsavepath = os.path.join(saveimgdir, imgname)  # where the image is saved
            DownImg(originalURL, imgsavepath)
            img.attrs["src"] = "images/" + imgname  # relative path of the image inside the saved page
        else:
            img.attrs["src"] = ""
    ChangeCssSrc(bs)  # fix the link tags
    ChangeContent(bs)  # clear js_content's style so the body displays
    return str(bs)  # convert the BeautifulSoup object back to a string for saving
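ChangeImgSrc relies on DownImg, which is also not shown in the post; a minimal sketch:
def DownImg(url, savepath):
    # fetch a single image and write the raw bytes to savepath
    response = requests.get(url, verify=False)
    with open(savepath, 'wb') as f:
        f.write(response.content)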
def ChangeCssSrc(bs):
    linkList = bs.findAll("link")
    for link in linkList:
        href = link.attrs["href"]
        if href.startswith("//"):  # protocol-relative url; prepend a scheme
            newhref = "http:" + href
            link.attrs["href"] = newhref

def ChangeContent(bs):
    jscontent = bs.find(id="js_content")
    if jscontent:
        jscontent.attrs["style"] = ""  # clear the inline style so the body is visible
    else:
        print("----- the article may have been deleted -----")
# save the html locally
def SaveFile(fpath, fileContent):
    with open(fpath, 'w', encoding='utf-8') as f:
        f.write(fileContent)
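To tie section 5 together, a minimal sketch of the glue code; SaveArticles and the directory layout (an images/ subfolder next to the saved html, matching the relative path ChangeImgSrc writes) are my assumptions, not part of the original post:
def SaveArticles(artList, savedir):  # hypothetical helper, not from the original post
    imgdir = os.path.join(savedir, "images")  # must be named images/ to match ChangeImgSrc
    if not os.path.exists(imgdir):
        os.makedirs(imgdir)
    for art in artList:
        htmlname = art.pubdate + "_" + str(art.idx)
        htmltxt = DownLoadHtml(art.url)
        if htmltxt is None:  # request failed, possibly a deleted article
            continue
        htmltxt = ChangeImgSrc(htmltxt, imgdir, htmlname)
        SaveFile(os.path.join(savedir, htmlname + ".html"), htmltxt)
        sleep(3)  # pause between requests to avoid being throttled
# e.g. SaveArticles(GetArticleList(json_filepath), '/Users/tmp/data/wechat/book/html')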
6、Converting the HTML files to PDF
Use the third-party tool wkhtmltopdf for the conversion.
# convert every html file in a folder to pdf
def PDFDir(htmldir, pdfdir):
    if not os.path.exists(pdfdir):
        os.makedirs(pdfdir)
    flist = os.listdir(htmldir)
    for f in flist:
        if (not f[-5:] == ".html") or ("tmp" in f):  # skip non-html files and tmp files
            continue
        htmlpath = htmldir + "/" + f
        tmppath = htmlpath[:-5] + "_tmp.html"  # temporary file used for the pdf conversion
        htmlstr = read_file(htmlpath)
        bs = BeautifulSoup(htmlstr, "lxml")
        title = ""
        # the pdf filename includes the article title, but the conversion fails if the title contains characters not allowed in filenames
        titleTag = bs.find(id="activity-name")
        if titleTag is not None:
            # strip ordinary and non-breaking spaces plus newlines from the title
            title = "_" + titleTag.get_text().replace(" ", "").replace("\u00a0", "").replace("\n", "")
        ridx = htmlpath.rindex("/") + 1
        pdfname = htmlpath[ridx:-5] + title
        pdfpath = pdfdir + "/" + pdfname + ".pdf"
        # strip js etc. to reduce what has to load during the pdf conversion;
        # note that css (link) is removed here too; if the pdf layout comes out broken, try keeping the css
        [s.extract() for s in bs(["script", "iframe", "link"])]
        SaveFile(tmppath, str(bs))
        PDFOne(tmppath, pdfpath)
# convert a single html file to pdf
def PDFOne(htmlpath, pdfpath, skipExists=True, removehtml=True):
    if skipExists and os.path.exists(pdfpath):
        print("pdf exists", pdfpath)
        if removehtml:
            os.remove(htmlpath)
        return
    exepath = "wkhtmltopdf"
    cmdlist = []
    cmdlist.append(" --enable-local-file-access ")  # needed so wkhtmltopdf can read the local images
    # cmdlist.append(" --load-error-handling ignore ")
    # cmdlist.append(" --page-height 200 ")  # tune these numbers to taste, or leave both lines out
    # cmdlist.append(" --page-width 140 ")
    cmdlist.append(" " + htmlpath + " ")
    cmdlist.append(" " + pdfpath + " ")
    cmdstr = exepath + "".join(cmdlist)
    print(cmdstr)
    os.system(cmdstr)
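A hedged invocation; the html and pdf directories here are assumed to sit next to the json directory from earlier:
PDFDir('/Users/tmp/data/wechat/book/html', '/Users/tmp/data/wechat/book/pdf')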
7、Other notes
The packages imported are:
import os, sys
import requests
import json
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
from time import sleep
import re
from w3lib.http import headers_raw_to_dict  # used in get_json; provided by the w3lib package
I am on a Mac; on Windows you can use a different packet-capture tool, such as Fiddler.
References
https://github.com/LeLe86/vWeChatCrawl
Build an Article Crawler Step by Step (2): Downloading Pages
Build an Article Crawler Step by Step (3): Batch Downloading
2021/06/26, Chengdu