极客时间文章及音频下载

前提需要账号对文章有阅读权限

试了下许久不用的python, 主要是尝试了下网页转pdf的强大工具wkhtmltopdf
其他必备条件请参考脚本前注释

本文只是对单一专栏的文章进行下载,稍加改造,即可下载任一订阅

#!/usr/bin/env python3.6
# -*- coding: utf-8 -*-
# @Time    : 2019/1/22 21:13
# @Author  : ysj
"""
需安装 wkhtmltopdf 软件 并添加软件bin目录至环境变量 https://wkhtmltopdf.org/downloads.html
下载对应版本chromedriver.exe并放入任意环境变量目录 如上述目录 或 C:\\windows
pip install requests selenium pdfkit
"""
import os
import sys
import time
import requests
from selenium import webdriver
import pdfkit
import re


class JikeDownloadPDF:
    """Download every article of one Geek Time (time.geekbang.org) column.

    Each article page is rendered to a PDF through pdfkit/wkhtmltopdf and,
    when the article API returns an audio download URL, the audio is saved as
    an MP3 next to the PDF.  Constructing an instance logs in through a Chrome
    window and immediately runs the download.
    """

    # Characters Windows does not allow in file names (backslash included).
    _ILLEGAL_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|]')
    # How many times the login and each article page load are attempted.
    _MAX_ATTEMPTS = 3

    def __init__(self, username, passwd, article_list_url):
        """
        Log in, build the API session and download the whole column.

        :param username: login name typed into the sign-in form
        :param passwd: account password
        :param article_list_url: column URL, e.g. https://time.geekbang.org/column/126
        """
        self._user = str(username).strip()
        self._passwd = str(passwd).strip()
        self.article_list_url = article_list_url.strip()

        self.driver = self._get_webdriver()
        try:
            self.session = self._get_session()
            self.download()
        except BaseException:
            # When construction fails the caller never receives the instance,
            # so nobody else could close the browser that was just opened.
            self.driver.quit()
            raise

    def _column_id(self):
        """Return the last path segment of the column URL (``126`` in the example)."""
        # Drop trailing slashes first, otherwise the last segment is empty.
        return str(self.article_list_url).rstrip("/").rsplit("/", 1)[-1]

    @classmethod
    def _safe_title(cls, title):
        """Return *title* without the characters Windows forbids in file names."""
        return cls._ILLEGAL_FILENAME_CHARS.sub('', title).strip()

    def _get_session(self):
        """
        Build a requests session that carries the cookies of the logged-in browser.

        The session is used for the JSON API calls (article list, article
        detail) and for downloading the audio files.

        :return: the configured requests session
        """
        header = {'Referer': 'https://time.geekbang.org',
                  'Content-Type': 'application/json',
                  'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36'
                  }

        s = requests.session()
        # update() keeps the session's case-insensitive header mapping and its
        # defaults; assigning a plain dict would replace both.
        s.headers.update(header)
        cookies_dict = {cookie['name']: cookie['value'] for cookie in self.driver.get_cookies()}
        requests.utils.add_dict_to_cookiejar(s.cookies, cookies_dict)
        return s

    def _get_webdriver(self):
        """
        Open Chrome and sign in, retrying the login form up to three times.

        :return: the Chrome driver, positioned on the account dashboard
        :raises SystemExit: exit code 1 when every login attempt fails
        """
        # find_element(By...) works on Selenium 3 and 4; the find_element_by_*
        # shortcuts were removed in Selenium 4.3.
        from selenium.webdriver.common.by import By

        login_url = 'https://account.geekbang.org/signin'
        check_url = 'https://account.geekbang.org/dashboard/info'

        driver = webdriver.Chrome()
        try:
            driver.get(login_url)
            # Element lookups poll for up to 5 seconds before giving up.
            driver.implicitly_wait(5)
            for _ in range(self._MAX_ATTEMPTS):
                # Clear both fields first: after a failed attempt they still
                # hold the previous text and send_keys would append to it.
                user_box = driver.find_element(By.CLASS_NAME, "nw-input")
                user_box.clear()
                user_box.send_keys(self._user)
                passwd_box = driver.find_element(By.CLASS_NAME, "input")
                passwd_box.clear()
                passwd_box.send_keys(self._passwd)
                driver.find_element(By.CLASS_NAME, "mybtn").click()

                # Give the login request time to redirect to the dashboard.
                time.sleep(2)
                if driver.current_url == check_url:
                    return driver
                print(driver.current_url)
                print("登录失败, 重新登录")

            print("登录失败次数3次,程序退出")
            time.sleep(2)
            sys.exit(1)
        except BaseException:
            # Covers sys.exit above as well as lookup errors: never leave the
            # browser running when no driver is handed back.
            driver.quit()
            raise

    def _get_articles_id_list(self):
        """
        Collect the ids of all articles in the column, page by page.

        A failed request or an error reported by the API stops the paging;
        the ids gathered so far are still returned.

        :return: list of article ids in the order the API lists them
        """
        url = 'https://time.geekbang.org/serv/v1/column/articles'
        data = {'cid': self._column_id(), 'size': 20, 'prev': 0, 'order': 'newest', 'sample': True}
        result = []

        while True:
            response = self.session.post(url=url, json=data)
            if response.status_code != 200:
                print("未登录成功或其他原因,访问文章列表失败")
                break
            res = response.json()
            if res.get('error'):
                print(res['error'])
                break
            cells = res['data']['list']
            result.extend(cell['id'] for cell in cells)
            # An empty page has no cursor to continue from, so stop there even
            # if the API still reports more pages.
            if not cells or not res['data']['page']['more']:
                break
            # The score of the last article is the cursor for the next page.
            data['prev'] = cells[-1]['score']

        return result

    def download(self, prefix=''):
        """
        Save every article of the column as PDF, plus its audio when available.

        :param prefix: directory to write into; defaults to the column id.
            It is created when missing.  If it names an existing file, the
            output goes to the current directory instead.
        """
        from selenium.webdriver.common.by import By

        prefix = prefix or self._column_id()
        if os.path.isfile(prefix):
            prefix = ''
            print("指定路径前缀为文件,自动跳过")
        elif prefix:
            os.makedirs(prefix, exist_ok=True)

        driver = self.driver
        url = 'https://time.geekbang.org/column/article/{}'
        options = {
            'page-size': 'Letter',
            'encoding': "UTF-8",
            'custom-header': [
                ('Accept-Encoding', 'gzip')
            ]
        }
        for index, id_ in enumerate(self._get_articles_id_list(), 1):
            art_url = url.format(id_)
            for attempt in range(1, self._MAX_ATTEMPTS + 1):
                try:
                    driver.get(art_url)
                    # The title is looked up by tag rather than by a CSS class.
                    title = driver.find_element(By.TAG_NAME, 'body').find_element(By.TAG_NAME, 'h1').text
                except Exception as e:
                    # Best-effort retry: any failure while loading the page
                    # simply triggers another attempt.
                    print(e, "{}获取失败次数{}".format(art_url, attempt))
                    continue

                print("开始下载第{}篇文章,文章名字:{}".format(index, title))
                # Fall back to the article id when nothing usable is left of
                # the title; os.path.join keeps an empty prefix relative to
                # the current directory instead of pointing at "/".
                raw_title = os.path.join(prefix, self._safe_title(title) or str(id_))
                pdfkit.from_string(driver.page_source, raw_title + '.pdf', options=options)
                self._download_audio(id_, raw_title + '.mp3')
                # Done with this article.
                break
            else:
                print("{}获取失败次数达到上限,跳过".format(art_url))

    def _download_audio(self, id_, target):
        """Save the audio of article *id_* to *target* if the API provides a URL."""
        audio_url = 'https://time.geekbang.org/serv/v1/article'
        data = {"id": str(id_), "include_neighbors": True}
        response = self.session.post(audio_url, json=data)
        try:
            audio_downurl = response.json()['data']['audio_download_url']
        except (ValueError, KeyError, TypeError) as e:
            # Non-JSON body or an unexpected payload shape: skip the audio.
            print(e)
            return
        if not audio_downurl:
            return
        # Fetch before opening the file so a failed request leaves no empty mp3.
        content = self.session.get(audio_downurl).content
        with open(target, 'wb') as f:
            f.write(content)



if __name__ == '__main__':
    # Replace the '###' placeholders with real credentials and pick the column URL.
    # The constructor logs in and runs the whole download before returning.
    down = JikeDownloadPDF('###', '###', 'https://time.geekbang.org/column/126')
    # Close the Chrome session the constructor opened; otherwise the browser
    # is left running after the script finishes.
    down.driver.quit()

你可能感兴趣的:(python)