Python-获取CSDN网页内容并输出为pdf

因为老师要求实验报告中附带参考文献原文,所以编写了这个脚本。(作业果然是最佳动力)

Python的PDF工具还是很全的,本次使用的是pdfkit,它是wkhtmltopdf的Python接口(需要先安装wkhtmltopdf)。

因为基本不会报什么错,所以写成了命令行运行的形式:把文章的url作为第一个命令行参数传入,结果输出到当前目录下的data.pdf

一次只能处理一个url

代码是根据我自己的需要编写的,放到博客上只是给小伙伴们提供参考思路,请根据自己的需求修改~

#!/usr/bin/python
#@Author: zhongshsh

import requests
from bs4 import BeautifulSoup, NavigableString
import urllib
import pdfkit
import sys

def get_html(url):
    """Download the page at *url* and return its body as text.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded response body (``response.text``).

    Raises:
        requests.exceptions.HTTPError: If the server answers with a 4xx/5xx
            status.
        requests.exceptions.RequestException: On connection problems or when
            the request times out.
    """
    # Built from adjacent literals so that no source indentation leaks into
    # the header value (a backslash continuation inside a string literal
    # keeps the leading whitespace of the following line).
    headers = {
        'User-Agent': (
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/52.0.2743.116 Safari/537.36'
        ),
    }
    # requests never times out by default; bound the wait so a stalled
    # server cannot hang the script forever.
    response = requests.get(url, headers=headers, timeout=30)
    # Fail early on error pages instead of passing them on to the parser.
    response.raise_for_status()
    return response.text

def strip_tags(html, invalid_tags):
    """Remove the given tags from *html* while keeping what they contain.

    Each matching tag is unwrapped: the tag itself disappears and its
    children (text and nested elements alike) take its place. This is used
    to drop hyperlinks while keeping the link text and any nested markup.

    Args:
        html: HTML markup as a string.
        invalid_tags: Collection of tag names (e.g. ``['a']``) to unwrap.

    Returns:
        The parsed ``BeautifulSoup`` document with the tags unwrapped.
    """
    soup = BeautifulSoup(html, 'lxml')
    # find_all() returns a fully built list, so modifying the tree while
    # looping over it is safe.
    for tag in soup.find_all(True):
        if tag.name in invalid_tags:
            # unwrap() keeps child elements as real nodes. Replacing the tag
            # with a plain string would make any nested markup come out
            # HTML-escaped in the rendered document.
            tag.unwrap()
    return soup

def strip_ct(soup):
    """Detach unwanted page elements from *soup* and return it.

    Every element whose ``class`` matches one of the names below is removed
    from the tree in place.

    Args:
        soup: A BeautifulSoup document or tag to clean.

    Returns:
        The same *soup* object, after the matching elements were extracted.
    """
    unwanted_classes = (
        "article-bar-top",
        "href-article-edit slide-toggle",
        "person-messagebox",
    )
    for class_name in unwanted_classes:
        # Calling a soup/tag object is shorthand for searching below it.
        for node in soup(class_=class_name):
            node.extract()
    return soup

def get_main(html):
    """Extract the article body from a page and return it as an HTML string.

    The element with class ``blog-content-box`` is located, cleaned with
    ``strip_ct`` and then passed through ``strip_tags`` to unwrap ``<a>``
    tags.

    Args:
        html: Full HTML source of the page.

    Returns:
        The cleaned article markup as a string.

    Raises:
        ValueError: If the page contains no ``blog-content-box`` element.
    """
    soup = BeautifulSoup(html, 'lxml')
    content = soup.find(attrs={'class': "blog-content-box"})
    if content is None:
        # find() returns None when nothing matches; report that clearly
        # instead of failing later with "'NoneType' object is not callable".
        raise ValueError(
            "no element with class 'blog-content-box' found in the page"
        )
    cleaned = strip_tags(str(strip_ct(content)), ['a'])
    return str(cleaned)

def html_pdf(html, output_path='data.pdf', wkhtmltopdf_path=None):
    """Render an HTML string to a PDF file with pdfkit/wkhtmltopdf.

    Args:
        html: HTML markup to render.
        output_path: Destination PDF file. Defaults to ``'data.pdf'``.
        wkhtmltopdf_path: Path to the wkhtmltopdf executable. When omitted,
            the default Windows install location is used if it exists;
            otherwise pdfkit is left to locate the binary itself.

    Raises:
        OSError: Raised by pdfkit when the wkhtmltopdf executable cannot be
            found or the conversion fails.
    """
    import os

    default_windows_path = r'C:\Program Files\wkhtmltopdf\bin\wkhtmltopdf.exe'
    if wkhtmltopdf_path is None and os.path.isfile(default_windows_path):
        wkhtmltopdf_path = default_windows_path

    options = {
        'page-size': 'A4',
        'margin-top': '0.75in',
        'margin-right': '0.75in',
        'margin-bottom': '0.75in',
        'margin-left': '0.75in',
        'encoding': "UTF-8",
        'custom-header': [
            ('Accept-Encoding', 'gzip')
        ],
        # A value of None marks a flag that takes no argument.
        'no-outline': None,
    }

    if wkhtmltopdf_path:
        config = pdfkit.configuration(wkhtmltopdf=wkhtmltopdf_path)
    else:
        # No explicit path: pdfkit searches for the executable on its own.
        config = pdfkit.configuration()
    pdfkit.from_string(html, output_path, options=options, configuration=config)


if __name__ == '__main__':
    # Exactly one article URL is expected as the first argument; show a
    # usage hint instead of failing with an IndexError when it is missing.
    if len(sys.argv) < 2:
        sys.exit(
            'Usage: python {} <article-url>\n'
            'Example: python {} '
            'https://blog.csdn.net/u013803499/article/details/82877993'.format(
                sys.argv[0], sys.argv[0]
            )
        )
    url = sys.argv[1]
    # Empty any existing data.pdf before the new document is generated.
    with open('data.pdf', 'w') as f:
        f.write('')
    html_pdf(get_main(get_html(url)))

结果的部分截图

Python-获取CSDN网页内容并输出为pdf_第1张图片

你可能感兴趣的:(Python-获取CSDN网页内容并输出为pdf)