【python】读取文件doc/ppt/xls

组件安装

docx==0.2.4

python-docx==0.8.1

python-pptx==0.6.21

xlrd==2.0.1

openpyxl==3.0.9


# -*- coding: utf-8 -*-

# @Date    : 2022/3/31 15:42

# version: Python 3.8.*

# @File : FileRead.py

import docx

import subprocess

from pptx import Presentation

from xlrd import open_workbook

import openpyxl

def ReadDocx(filepath):
    """Return the text of every paragraph in a .docx file, concatenated.

    The paragraph texts are joined back to back with no separator, so
    paragraph boundaries are not preserved in the result.

    Args:
        filepath: Location of the document, handed to ``docx.Document``.

    Returns:
        A single string containing all paragraph texts in document order.
    """
    document = docx.Document(filepath)
    return ''.join(paragraph.text for paragraph in document.paragraphs)

def ReadDoc(filepath):
    """Return the text of a legacy .doc file by running ``antiword`` on it.

    The ``antiword`` executable is invoked as a child process with
    ``filepath`` as its only argument, and its standard output is decoded
    as UTF-8.

    Args:
        filepath: Path passed to the ``antiword`` command.

    Returns:
        The decoded standard output of the command.
    """
    command = ['antiword', filepath]
    raw_bytes = subprocess.check_output(command)
    decoded = raw_bytes.decode("utf-8")
    return decoded

def ReadText(filepath):
    """Return the entire content of a UTF-8 text file as one string.

    The file is opened inside a ``with`` block so the handle is closed as
    soon as the read finishes, including when reading raises.

    Args:
        filepath: Path of the text file to read.

    Returns:
        The full file content; an empty string for an empty file.
    """
    with open(filepath, mode='r', encoding="utf-8") as f:
        return f.read()

def ReadText2(filepath):
    """Return the lines of a UTF-8 text file as a list of strings.

    Each element keeps its trailing line terminator when the file has one.

    Args:
        filepath: Path of the text file to read.

    Returns:
        A list with one string per line, in file order.
    """
    with open(filepath, mode='r', encoding="utf-8") as handle:
        lines = list(handle)
    return lines

def ReadPptx(filepath):
    """Return the text of every text-bearing shape in a .pptx file.

    Slides are visited in order, and within each slide every shape whose
    ``has_text_frame`` is true contributes its text frame's text. The
    pieces are concatenated with no separator between shapes or slides.

    Args:
        filepath: Location of the presentation, handed to ``Presentation``.

    Returns:
        A single string with all collected shape texts.
    """
    presentation = Presentation(filepath)
    frame_texts = (
        shape.text_frame.text
        for slide in presentation.slides
        for shape in slide.shapes
        if shape.has_text_frame
    )
    return ''.join(frame_texts)

def ReadXlsFile(filepath):
    """Return the sheet names and all cell values of a workbook as one string.

    For every sheet, the sheet name is emitted first, followed by each cell
    value in row-major order (row by row, left to right). Pieces are
    concatenated with no separator.

    Each cell value is converted with ``str()`` so that non-string cells
    (numbers, booleans, ...) do not raise ``TypeError`` during
    concatenation.

    Args:
        filepath: Path of the workbook, handed to ``xlrd.open_workbook``.

    Returns:
        The concatenated text; sheets without rows contribute only their
        name.
    """
    # open_workbook reads the file itself, so no separate open() is needed.
    wb = open_workbook(filepath)
    parts = []
    for sheet in wb.sheets():
        parts.append(sheet.name)
        # The column loop must be nested in the row loop so that every row
        # is read, not just the last one.
        for row in range(sheet.nrows):
            for col in range(sheet.ncols):
                parts.append(str(sheet.cell(row, col).value))
    return ''.join(parts)

def ReadXlsxFile(filepath):
    """Return the sheet titles and padded cell values of an .xlsx workbook.

    For every sheet, the title is emitted first. Then each row from 1 to
    ``max_row`` is rendered by passing every cell value through ``str()``
    (so a value of ``None`` appears as the text ``'None'``) and
    left-justifying it: to 17 characters in the first row, to 20 characters
    in all later rows. All pieces are concatenated without separators or
    line breaks.

    Args:
        filepath: Path of the workbook, handed to ``openpyxl.load_workbook``.

    Returns:
        The concatenated text for all sheets.
    """
    workbook = openpyxl.load_workbook(filepath)
    pieces = []
    for sheet_name in workbook.sheetnames:
        worksheet = workbook[sheet_name]
        pieces.append(worksheet.title)
        for row_no in range(1, worksheet.max_row + 1):
            # The first row uses a narrower column width than the rest.
            width = 17 if row_no == 1 else 20
            for col_no in range(1, worksheet.max_column + 1):
                cell_value = worksheet.cell(row=row_no, column=col_no).value
                pieces.append(str(cell_value).ljust(width))
    return ''.join(pieces)

你可能感兴趣的:(【python】读取文件doc/ppt/xls)