Python:将 html、docx 类型文件转换成 pdf

html 类型转换(依赖 wkhtmltopdf 程序和 pdfkit 库,需先安装 wkhtmltopdf 并执行 `pip install pdfkit`):

# Absolute path of the wkhtmltopdf executable handed to pdfkit.configuration()
# by the conversion functions below; adjust it to the local installation.
path_wkthmltopdf = r"D:\soft-tools\pdf-change\install\wkhtmltopdf\bin\wkhtmltopdf.exe"
def url_to_pdf(url, to_file):
    """Render the web page at ``url`` into a PDF file.

    Args:
        url: Address of the page to convert.
        to_file: Path of the PDF file to write.
    """
    # No import of pdfkit is visible in this script, so bring it into scope
    # here; without it the calls below fail with NameError.
    import pdfkit

    # Pass the absolute path of the wkhtmltopdf executable to pdfkit.
    config = pdfkit.configuration(wkhtmltopdf=path_wkthmltopdf)
    # Generate the PDF; to_file is the output file path.
    pdfkit.from_url(url, to_file, configuration=config)
    print('完成')

def html_to_pdf(html, to_file):
    """Convert an HTML file into a PDF file.

    Args:
        html: The HTML file to convert (forwarded to ``pdfkit.from_file``).
        to_file: Path of the PDF file to write.
    """
    # No import of pdfkit is visible in this script, so bring it into scope
    # here; without it the calls below fail with NameError.
    import pdfkit

    # Pass the absolute path of the wkhtmltopdf executable to pdfkit.
    config = pdfkit.configuration(wkhtmltopdf=path_wkthmltopdf)
    # Generate the PDF; to_file is the output file path.
    pdfkit.from_file(html, to_file, configuration=config)
    print('完成')

def str_to_pdf(string, to_file):
    """Render a string into a PDF file.

    Args:
        string: The text to convert (forwarded to ``pdfkit.from_string``).
        to_file: Path of the PDF file to write.
    """
    # No import of pdfkit is visible in this script, so bring it into scope
    # here; without it the calls below fail with NameError.
    import pdfkit

    # Pass the absolute path of the wkhtmltopdf executable to pdfkit.
    config = pdfkit.configuration(wkhtmltopdf=path_wkthmltopdf)
    # Generate the PDF; to_file is the output file path.
    pdfkit.from_string(string, to_file, configuration=config)
    print('完成')


if __name__ == '__main__':
    # Demo run: convert one web page to a PDF in the current directory.
    demo_url = r'http://www.csrc.gov.cn/csrc/c105891/c1806775/content.shtml'
    demo_output = 'out_1.pdf'
    url_to_pdf(demo_url, demo_output)
    # Alternative entry points, left disabled:
    # html_to_pdf('sample.html', 'out_2.pdf')
    # str_to_pdf('我是中国人!', 'out_3.pdf')

doc、docx 转换成 pdf(仅限 Windows,需要已安装 Microsoft Word 和 pywin32):

import  os
from win32com import client
# pip install pywin32  (provides the win32com package)
def doc2pdf(doc_name, pdf_name):
    """Convert a Word document to PDF by automating Microsoft Word over COM.

    Args:
        doc_name: Path of the Word (.doc/.docx) file to convert.
        pdf_name: Path of the PDF file to write; an existing file at that
            location is removed first.

    Returns:
        ``pdf_name`` on success, or ``1`` if any step failed (the error is
        printed instead of being raised).
    """
    word = None
    worddoc = None
    try:
        # Word resolves relative paths against its own working directory,
        # not this process's, so hand it absolute paths.
        doc_path = os.path.abspath(doc_name)
        pdf_path = os.path.abspath(pdf_name)

        word = client.DispatchEx("Word.Application")
        if os.path.exists(pdf_path):
            os.remove(pdf_path)
        worddoc = word.Documents.Open(doc_path, ReadOnly=1)
        # 17 is Word's wdFormatPDF save format.
        worddoc.SaveAs(pdf_path, FileFormat=17)
        return pdf_name
    except Exception as err:
        print(err)
        return 1
    finally:
        # Always release the document and the Word instance; otherwise every
        # call leaves a Word process running in the background.
        if worddoc is not None:
            try:
                worddoc.Close()
            except Exception as err:
                print(err)
        if word is not None:
            try:
                word.Quit()
            except Exception as err:
                print(err)
if __name__ == '__main__':
    # Raw strings keep the Windows backslashes literal; in a normal string,
    # sequences such as "\G", "\d", "\s" and "\8" are invalid escapes that
    # newer Python versions warn about.
    # doc_name = r"D:\GIT_CODE\data_extract_python\s3file\804e0fc28fbe11ec8af01866da996334.docx"
    doc_name = r"D:\GIT_CODE\data_extract_python\s3file\8001.doc"
    # Despite its name, ftp_name is the path of the PDF to produce.
    ftp_name = r"D:\GIT_CODE\data_extract_python\s3file\001.pdf"
    doc2pdf(doc_name, ftp_name)

你可能感兴趣的:(python爬虫,python,html,开发语言)