Python 批量提取 Word 表格数据到 Excel

关注 https://space.bilibili.com/187492698 代码演示

# -*- coding: utf-8 -*-
# Version: Python 3.9.7
# Author: TRIX
# Date: 2022-09-08 20:40:11
# Use:
# intro: 批量把word表格数据提取到Excel
import re

from win32com.client import Dispatch#pip install pypiwin32
word=Dispatch('Word.Application')#obtain the Word application COM object
word.Visible=False#False keeps the Word window hidden while files are processed
# Splits the raw table text into the items that getInfo indexes by position.
# NOTE(review): with findall(), the lazy pattern '.+?' matches exactly one
# character at a time, yet getInfo indexes the result as if each item were a
# whole table cell. The pattern may have lost a cell-terminator suffix when
# this script was published - confirm against a real document before relying on it.
pat=re.compile(r'.+?',re.S)

def getInfo(f):
	'''Read the first table of one Word file and return its data as an infoC.

	f: path of the .doc/.docx file to open with the module-level Word instance.
	Returns the populated infoC; its detail text is also printed.

	The document is closed (without saving) as soon as the table text has been
	read, even if reading fails, so a batch run does not leave every processed
	file open inside the Word instance.
	'''
	doc=word.Documents.Open(f)
	try:
		tableinfo=doc.Tables(1).Range.Text
	finally:
		doc.Close(0)#0 = wdDoNotSaveChanges; the file is only read
	# Drop the '\r' and '\x07' characters from every item found by pat.
	infoL=[text.replace('\r','').replace('\x07','') for text in pat.findall(tableinfo)]
	# Fixed positions in infoL feed infoC's parameters, in order:
	# name, sex, dorm, nat, pid, tel, loc, ptels.
	# tel keeps only the text before 'QQ' and loses its '电话:' label.
	info=infoC(infoL[1],infoL[3],infoL[13],infoL[9],infoL[25],infoL[29].split('QQ')[0].replace('电话:',''),infoL[33],' '.join(infoL[45:48]))
	for n in range(5):#five further groups, spaced 6 items apart starting at index 51
		starti=51+n*6
		endi=starti+3
		if infoL[starti]!='':#only when the first item of the group is filled in
			info.ptels+='|'
			info.ptels+=' '.join(infoL[starti:endi])
	info.ptels=info.ptels.strip()
	print(info.getDetailInfo())
	return info

def getNids():
	'''Return all lines of 221.txt (read as UTF-8) as a list, line endings kept.'''
	with open('221.txt',mode='r',encoding='utf-8') as roster:
		return list(roster)
# Loaded once at import time; infoC.__init__ searches these lines to find a student ID.
nids=getNids()

class infoC(object):
	'''One person's record extracted from a Word form, plus the ID looked up in nids.'''
	def __init__(self,name,sex,dorm,nat,pid,tel,loc,ptels):
		'''Store the given fields and resolve nid from the module-level nids lines.

		nid becomes the text before the first tab of the first line in nids
		that contains name; it stays 'NoValue' when no line matches or when
		name is empty.
		'''
		self.name = name
		self.nid = 'NoValue'
		# An empty string is a substring of every line, so without this guard
		# a blank name would silently receive the ID of the first line.
		if name:
			for n in nids:#first line containing the name wins
				if name in n:
					self.nid = n.split('\t')[0]
					break
		self.sex = sex
		self.dorm = dorm
		self.nat = nat
		self.pid = pid
		self.tel = tel
		self.loc = loc
		self.ptels = ptels
	def getDetailInfo(self):
		'''Return all fields as a multi-line "label:value" string for display.'''
		return f'name:{self.name}\nnid:{self.nid}\nsex:{self.sex}\ndorm:{self.dorm}\nnative:{self.nat}\nid:{self.pid}\ntel:{self.tel}\nloc:{self.loc}\nptels:{self.ptels}'
	def getAttrL(self):
		'''Return the fields as a list in output-column order (name, nid, sex, ...).'''
		return [self.name,self.nid,self.sex,self.dorm,self.nat,self.pid,self.tel,self.loc,self.ptels]

from pandas import DataFrame,ExcelWriter
def saveTable(outXlsx,tables=None,sheet_names=None):
	'''Write each table in tables to its own sheet of the workbook outXlsx.

	outXlsx: path of the Excel file to write.
	tables: sequence of tables; each one is passed to DataFrame(), e.g. a list of rows.
	sheet_names: optional sheet names, one per table in the same order.
		When omitted or empty, sheets are named '1', '2', ... by position.
	Raises ValueError('no data') when tables is missing or empty.

	Tables are written without a header row and without an index column.
	'''
	if not tables:raise ValueError('no data')
	with ExcelWriter(outXlsx) as writer:
		for i,table in enumerate(tables):
			name=sheet_names[i] if sheet_names else str(i+1)
			DataFrame(table).to_excel(writer,sheet_name=name,header=False,index=False)

from guietta import QFileDialog
# Let the user pick the .doc/.docx files to process; element [0] of the dialog's
# result is kept and later iterated as the list of file paths (see saveXLsx).
# NOTE(review): the filter label reads "Images" although it selects Word files -
# it is user-facing text, so confirm before renaming it.
files = QFileDialog.getOpenFileNames(None, "Open File",r"","Images (*.doc *.docx)")[0]#files

def saveXLsx():
	'''Extract every selected file with getInfo and save all rows to outpath.

	The sheet starts with a header row; each following row is a 1-based
	sequence number followed by that file's infoC.getAttrL() values.
	'''
	header=['序号','姓名','学号','性别','寝室号','籍贯','身份证号','电话','家庭住址','父母联系方式']
	rows=[[str(seq),*getInfo(path).getAttrL()] for seq,path in enumerate(files,start=1)]
	saveTable(outpath,tables=[[header,*rows]])

# Output workbook, written to the current working directory.
outpath='res.xlsx'
try:
	saveXLsx()
finally:
	# The Word instance is hidden (Visible=False); without an explicit Quit
	# it would keep running in the background after the script ends.
	word.Quit(0)#0 = wdDoNotSaveChanges

你可能感兴趣的:(python,pandas,开发语言)