针对不同类型的文件,需要采取特定的访问与解析策略,才能有效获取其中蕴含的知识。下面将介绍从不同数据源获取数据的方式。
from docx import Document
# pip install python-docx
# python-docx == 0.8.11
# Print the text of every paragraph in a Word document.
filename = 'xxx.docx'
doc = Document(filename)
for para in doc.paragraphs:
    # The loop body must be indented; without it Python raises
    # IndentationError before anything runs.
    print(para.text)
from docx import Document
# Walk every table of a Word document in two different ways.
filename = r'sample.docx'
doc = Document(filename)
print("\n 便利文档中表格:")

# Method 1: iterate row by row and print each row's tuple of cell objects.
print("\n 方法一:")
for table in doc.tables:
    for table_row in table.rows:
        row = table_row.cells
        print(f"row : {row}")

# Method 2: address every cell by (row, column) index and print its text.
print("\n 方法二:")
for table in doc.tables:
    row_count = len(table.rows)
    col_count = len(table.columns)
    for i in range(row_count):
        for j in range(col_count):
            print(table.cell(i, j).text)
# Read the entire file into one string and print it.
# Mode 'r' is enough here: the file is only read, and 'r+' would also
# require write permission on it for no benefit.
with open('sample.txt', 'r', encoding='utf-8') as f:
    data = f.read()
print(data)
# Read only the first line of the file (trailing newline included) and
# print it.
# Mode 'r' is enough here: the file is only read, and 'r+' would also
# require write permission on it for no benefit.
with open('sample.txt', 'r', encoding='utf-8') as f:
    data = f.readline()
print(data)
# Read the file into a list with one string per line (each keeps its
# trailing newline) and print that list.
# Mode 'r' is enough here: the file is only read, and 'r+' would also
# require write permission on it for no benefit.
with open('sample.txt', 'r', encoding='utf-8') as f:
    data = f.readlines()
print(data)
# Print the file line by line with the newline character removed.
# Mode 'r' is enough here: the file is only read, and 'r+' would also
# require write permission on it for no benefit.
with open('sample.txt', 'r', encoding='utf-8') as f:
    # Iterating the file object yields the same lines as readlines() but
    # without first building the whole list in memory.
    for ann in f:
        ann = ann.strip('\n')  # remove the newline character from the line
        print(ann)
# Simpler version: default read mode, lines streamed straight from the file.
# The `with` block guarantees the handle is closed; a bare open(...) used
# inline in the `for` statement would leave it open until garbage collection.
with open(file='sample.txt', encoding='utf-8') as f:
    for i in f:
        ann = i.strip('\n')
        print(ann)
# pip install pdfplumber
import pdfplumber
# Extract the text of every page of a PDF and append it to a text file.
file_name = r'sample.pdf'  # PDF file to parse
output_file = 'sample.txt'  # file that receives the extracted text
# The output file is opened once and closed by the `with` statement.
# Re-opening it for every page without closing leaks one handle per page
# and leaves flushing of the written text to garbage collection.
with pdfplumber.open(file_name) as p, \
        open(output_file, 'a', encoding='utf-8') as data:
    for page in p.pages:
        # extract_text() may return None for a page without extractable
        # text (behaviour depends on the pdfplumber version); write() would
        # then raise TypeError, so fall back to an empty string.
        text_data = page.extract_text() or ''
        data.write(text_data)
import pdfplumber
from openpyxl import Workbook # 保存表格
# Copy the table found on the first page of a PDF into an Excel workbook.
file_name = r'sample.pdf'
output_file = 'sample.xlsx'
with pdfplumber.open(file_name) as pdf:
    page = pdf.pages[0]
    # extract_table() returns None when the page has no table; fall back to
    # an empty list so the loop below does not raise TypeError (an empty
    # sheet is saved in that case).
    table = page.extract_table() or []

# The extracted table is plain nested lists, so the PDF can be closed
# before the workbook is built.
workbook = Workbook()
sheet = workbook.active
for row in table:
    sheet.append(row)  # one worksheet row per table row
workbook.save(filename=output_file)
# extract_tables() approach: returns every table on the page, each as a
# list of rows.
with pdfplumber.open(r'exm.pdf') as pdf:  # open the PDF
    page_one = pdf.pages[0]
    page_one_table = page_one.extract_tables()  # all tables on the first page
    # Each element is one complete table, not a single row.
    for table in page_one_table:
        print('第一页的表格数据:', table)
# extract_table() approach: returns a single table from the page as a list
# of rows.
with pdfplumber.open(r'exm.pdf') as pdf_info:  # open the PDF
    page_one = pdf_info.pages[0]
    # extract_table() returns None when the page has no table; fall back to
    # an empty list so the loop does not raise TypeError.
    page_one_table = page_one.extract_table() or []
    for row in page_one_table:
        print(row)