试了下许久不用的python, 主要是尝试了下网页转pdf的强大工具wkhtmltopdf
其他必备条件请参考脚本前注释
本文只是对单个专栏的文章进行下载,稍加改造,即可下载任一订阅
#!/usr/bin/env python3.6
# -*- coding: utf-8 -*-
# @Time : 2019/1/22 21:13
# @Author : ysj
"""
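Requires the wkhtmltopdf program, with its bin directory added to the PATH environment variable: https://wkhtmltopdf.org/downloads.html
Download the chromedriver.exe that matches your Chrome version and put it in any directory on PATH, e.g. the directory above or C:\\windows
pip install requests selenium pdfkit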
"""
import os
import re
import sys
import time

import pdfkit
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
class JikeDownloadPDF:
    """Save every article of one Geek Time column as a PDF, plus its audio.

    Instantiating the class runs the whole workflow: a Chrome window is
    opened and logged in, the browser cookies are copied into a ``requests``
    session, and :meth:`download` is called.
    """

    # Characters that are not allowed in Windows file names: \ / : * ? " < > |
    _ILLEGAL_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|]')
    # Attempts allowed for logging in and for loading a single article page.
    _MAX_RETRIES = 3

    def __init__(self, username, passwd, article_list_url):
        """
        :param username: account name typed into the login form
        :param passwd: account password
        :param article_list_url: column URL, e.g. https://time.geekbang.org/column/126
        """
        self._user = str(username).strip()
        self._passwd = str(passwd).strip()
        self.article_list_url = article_list_url.strip()
        self.driver = self._get_webdriver()
        self.session = self._get_session()
        self.download()

    def _column_id(self):
        """Return the last path segment of the column URL (the column id)."""
        # rstrip so that a URL written with a trailing slash still yields the id.
        return str(self.article_list_url).rstrip("/").rsplit("/", 1)[-1]

    @classmethod
    def _safe_filename(cls, title):
        """Return *title* with characters illegal in Windows file names removed."""
        return cls._ILLEGAL_FILENAME_CHARS.sub('', title)

    def _get_session(self):
        """
        Build a requests session that carries the browser's cookies, so the
        JSON endpoints (article list, article detail / audio) can be called
        without driving the browser.

        :return: the configured ``requests`` session
        """
        header = {
            'Referer': 'https://time.geekbang.org',
            'Content-Type': 'application/json',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36',
        }
        s = requests.session()
        # update() keeps the session's case-insensitive header mapping and its
        # defaults instead of replacing them with a plain dict.
        s.headers.update(header)
        cookies_dict = {c['name']: c['value'] for c in self.driver.get_cookies()}
        requests.utils.add_dict_to_cookiejar(s.cookies, cookies_dict)
        return s

    def _get_webdriver(self):
        """
        Open Chrome and log in, trying up to three times.

        :return: the logged-in webdriver
        :raises SystemExit: (exit code 1) when every login attempt fails
        """
        login_url = 'https://account.geekbang.org/signin'
        check_url = 'https://account.geekbang.org/dashboard/info'
        driver = webdriver.Chrome()
        # The implicit wait is a driver-wide element-lookup timeout, not a
        # sleep, so it is configured once before the page is loaded.
        driver.implicitly_wait(5)
        driver.get(login_url)
        for _ in range(self._MAX_RETRIES):
            # Clear both fields first: on a retry they still hold the text of
            # the previous attempt and send_keys would append to it.
            user_box = driver.find_element(By.CLASS_NAME, "nw-input")
            user_box.clear()
            user_box.send_keys(self._user)
            passwd_box = driver.find_element(By.CLASS_NAME, "input")
            passwd_box.clear()
            passwd_box.send_keys(self._passwd)
            driver.find_element(By.CLASS_NAME, "mybtn").click()
            # Give the login request time to finish and redirect.
            time.sleep(2)
            if driver.current_url == check_url:
                return driver
            print(driver.current_url)
            print("登录失败, 重新登录")
        print("登录失败次数3次,程序退出")
        # Do not leave the browser running behind after giving up.
        driver.quit()
        time.sleep(2)
        sys.exit(1)

    def _get_articles_id_list(self):
        """
        Collect the ids of the column's articles by paging through the
        article-list endpoint.

        :return: list of article ids in the order the endpoint returns them;
                 partial (possibly empty) if a request fails
        """
        url = 'https://time.geekbang.org/serv/v1/column/articles'
        data = {'cid': self._column_id(), 'size': 20, 'prev': 0, 'order': 'newest', 'sample': True}
        result = []
        while True:
            response = self.session.post(url=url, json=data)
            if response.status_code != 200:
                print("未登录成功或其他原因,访问文章列表失败")
                break
            res = response.json()
            if res['error']:
                print(res['error'])
                break
            cells = res['data']['list']
            result.extend(cell['id'] for cell in cells)
            # Stop on the last page; an empty page is also treated as the end
            # because there is no score to continue from.
            if not (cells and res['data']['page']['more']):
                break
            # The score of the last entry is the cursor for the next page.
            data['prev'] = cells[-1]['score']
        return result

    def _download_audio(self, article_id, path):
        """Save the article's audio to *path*, if the article has any."""
        audio_url = 'https://time.geekbang.org/serv/v1/article'
        data = {"id": str(article_id), "include_neighbors": True}
        response = self.session.post(audio_url, json=data)
        try:
            audio_downurl = response.json()['data']['audio_download_url']
        except (ValueError, KeyError, TypeError) as e:
            # Not JSON, or not the expected structure: report and move on.
            print(e)
            return
        if not audio_downurl:
            return
        audio = self.session.get(audio_downurl)
        if audio.status_code != 200:
            # Do not store an error page under an .mp3 name.
            print("{}音频下载失败,状态码{}".format(audio_downurl, audio.status_code))
            return
        with open(path, 'wb') as f:
            f.write(audio.content)

    def download(self, prefix=''):
        """
        Render every article of the column to ``<title>.pdf`` and save its
        audio, when present, as ``<title>.mp3``.

        :param prefix: output directory; defaults to the column id. It is
                       created when missing. If it names an existing file,
                       the files are written to the current directory.
        """
        prefix = prefix or self._column_id()
        if os.path.isfile(prefix):
            prefix = ''
            print("指定路径前缀为文件,自动跳过")
        elif prefix:
            os.makedirs(prefix, exist_ok=True)
        driver = self.driver
        url = 'https://time.geekbang.org/column/article/{}'
        # Loop-invariant wkhtmltopdf options.
        options = {
            'page-size': 'Letter',
            'encoding': "UTF-8",
            'custom-header': [
                ('Accept-Encoding', 'gzip')
            ]
        }
        for index, id_ in enumerate(self._get_articles_id_list(), 1):
            art_url = url.format(id_)
            for attempt in range(1, self._MAX_RETRIES + 1):
                try:
                    driver.get(art_url)
                    # The site changed its class names, so the title is
                    # located by tag instead.
                    title = driver.find_element(By.TAG_NAME, 'body').find_element(By.TAG_NAME, 'h1').text
                    page_source = driver.page_source
                except Exception as e:
                    # Best-effort boundary: any browser error counts as one
                    # failed attempt for this article.
                    print(e, "{}获取失败次数{}".format(art_url, attempt))
                    continue
                print("开始下载第{}篇文章,文章名字:{}".format(index, title))
                # Fall back to the article id when nothing is left of the
                # title after removing illegal characters.
                file_stem = self._safe_filename(title) or str(id_)
                # os.path.join copes with an empty prefix (current directory)
                # instead of producing a path under the filesystem root.
                raw_title = os.path.join(prefix, file_stem)
                try:
                    pdfkit.from_string(page_source, raw_title + '.pdf', options=options)
                except OSError as e:
                    # pdfkit reports wkhtmltopdf failures as IOError/OSError;
                    # one bad article must not abort the whole run.
                    print(e)
                self._download_audio(id_, raw_title + '.mp3')
                # Done with this article.
                break
            else:
                print("{}获取失败次数达到上限,跳过".format(art_url))
if __name__ == '__main__':
    # Replace the '###' placeholders with a real account name and password.
    # Constructing the object performs the login and the whole download.
    down = JikeDownloadPDF('###', '###', 'https://time.geekbang.org/column/126')
    # Close the Chrome window and stop chromedriver once everything is saved.
    down.driver.quit()