P12.Python提取PDF文字内容

P12.Python提取PDF文字内容.md

#pdfplumber提取文字
    pdfplumber.open(PDF路径)
    pdf.pages[页数]
    page.extract_text()
    例:
        import pdfplumber

        with pdfplumber.open('test.pdf') as pdf:
            for page in pdf.pages:
                print(page.extract_text())

#pdfplumber提取表格
    page.extract_table()
    例:
        with pdfplumber.open('test.pdf') as pdf:
            table_page = pdf.pages[0]
            table = table_page.extract_table()
            print(table)

    #提取多个简单表格
    page.extract_tables()
    例:
        with pdfplumber.open('test.pdf') as pdf:
            table_page = pdf.pages[1]
            table = table_page.extract_tables()
            print(table)

    #利用pdfplumber提取网易财报表格
    table_settings:提取表格时的设定
    例:
        with pdfplumber.open('Netease Q2 2019 Earnings Release-Final.pdf') as pdf:
            table_page = pdf.pages[9]
            table = table_page.extract_table(
                table_settings={
                    "vertical_strategy":"text",
                    "horizontal_strategy":"text",
                })
            print(table)

    #写入到Excel表格中
    例:
        with pdfplumber.open('Netease Q2 2019 Earnings Release-Final.pdf') as pdf:
            table_page = pdf.pages[9]
            table = table_page.extract_table(
                table_settings={
                    "vertical_strategy":"text",
                    "horizontal_strategy":"text",
                })
            workbook = Workbook()  # 需先导入: from openpyxl import Workbook
            sheet = workbook.active
            for row in table:
                sheet.append(row)
            workbook.save(filename='Netease_Q2_2019_Earnings.xlsx')
        #注意:存在空行和将单词切分到多个不同列的问题

        #去除空行
            简单判断,非空行的才加进来
            将列表中每个元素都连接成一个字符串,如果还是空字符串那么肯定就是空行
            new_table = []
            for row in table:
                if not ''.join([str(item) for item in row]) == '':

            #''.join(列表)用法:拼接字符串(列表元素必须是字符串),如print('~'.join(['1','2','3']))输出1~2~3
            #[str(item) for item in row]列表生成式,简单快速地生成一个python列表

        #合并单词
            分析结果可知前三列分布着第一列的单词
            将前三列非None的内容合并为一个字符串,然后再合到一个列表里
            new_row = []
            new_row.append(''.join([str(item) if item else '' for item in row[:3]]))
            new_row += row[3:]
            new_table.append(new_row)
            #行内条件判断 y = x*2 if x<10 else 20


        #提取网易财报表格完整代码
        import pdfplumber
        from openpyxl import Workbook
        
        with pdfplumber.open('Netease Q2 2019 Earnings Release-Final.pdf') as pdf:
            table_page = pdf.pages[9]
            table = table_page.extract_table(
                table_settings={
                    "vertical_strategy":"text",
                    "horizontal_strategy":"text",
                })
            new_table = []
            for row in table:
                if not ''.join([str(item) for item in row]) == '':
                    new_row = []
                    new_row.append(''.join([str(item) if item else '' for item in row[:3]]))
                    new_row += row[3:]
                    new_table.append(new_row)

            workbook = Workbook()
            sheet = workbook.active
            for row in new_table:
                sheet.append(row)
            workbook.save(filename='Netease_Q2_2019_Earnings.xlsx')

你可能感兴趣的:(python办公自动化)