2019独角兽企业重金招聘Python工程师标准>>>
在 Linux 上用 Python 读取 Word 文档,其实是使用 antiword 程序把 Word 文档转换为 txt 文档,
再使用 Python 执行系统命令调用 antiword 而已。
具体操作如下:
1、安装antiword
官方站:http://www.winfield.demon.nl/
下载地:http://www.winfield.demon.nl/linux/antiword-0.37.tar.gz
下载完,解压,进入目录
# Unpack the downloaded source archive.
tar xvzf antiword-0.37.tar.gz
# Enter the extracted source directory.
cd antiword-0.37
# Build, and run the install step only if the build succeeds.
make && make install
即可完成安装。
安装时,自动安装到了/root/目录下,只有root才可执行该命令,我们需要改一下路径,COPY到/usr中方便调用。
# Copy the antiword binaries out of root's home so every user can run them.
cp /root/bin/*antiword /usr/local/bin/
# Create the shared resource directory (-p: no error if it already exists).
mkdir -p /usr/share/antiword
# Copy antiword's resource files from root's home into the shared directory.
cp -R /root/.antiword/* /usr/share/antiword/
# 755, not 777: everyone may execute the binary, only the owner may modify it.
chmod 755 /usr/local/bin/*antiword
chmod 755 /usr/share/antiword/*
以上操作以后即可在任意用户调用 antiword命令。
linux 平台转换txt:
#!/usr/bin/python
# coding:utf-8
import subprocess
import tempfile


def chang2txtbyantiword(fpath, tpath):
    """Convert a Word .doc file to UTF-8 plain text using antiword.

    Requires the ``antiword`` executable on PATH (Linux only so far; see
    http://www.winfield.demon.nl). Typical installation:

        wget http://www.winfield.demon.nl/linux/antiword-0.37.tar.gz
        tar -xvf antiword-0.37.tar.gz
        cd antiword-0.37
        make && make install

    @param fpath: path of the source .doc file
    @param tpath: path of the text file to write (created or overwritten)
    @return: True when antiword exits with status 0, otherwise False
             (including when antiword is missing or tpath cannot be opened)
    """
    try:
        # antiword's stdout goes straight into the target file; stderr is
        # collected in a throwaway temp file so it does not leak to the caller.
        with open(tpath, "wb") as text_out, tempfile.TemporaryFile() as err_sink:
            # Argument list + no shell: paths containing spaces or shell
            # metacharacters are passed to antiword verbatim.
            exit_code = subprocess.call(
                ["antiword", "-m", "UTF-8.txt", "-t", fpath],
                stdout=text_out,
                stderr=err_sink
            )
    except (EnvironmentError, ValueError, TypeError):
        # Missing executable, unwritable target, or an invalid path argument.
        return False
    return exit_code == 0


if __name__ == "__main__":
    a = "/data/appsystems/appSvr00/media/upload/doc/20170510/71d60eb8-3564-11e7-995c-000c29eb773f.doc"
    b = "/data/appsystems/appSvr00/media/upload/txt/20170510/71d60eb8-3564-11e7-995c-000c29eb773f.txt"
    chang2txtbyantiword(a, b)
直接读取doc文件:
# -*- encoding: utf8 -*-
import subprocess

# Run antiword on test.doc and keep whatever it writes to stdout;
# check_output raises CalledProcessError if antiword exits non-zero.
antiword_cmd = ["antiword", "test.doc"]
output = subprocess.check_output(antiword_cmd)
windows doc转docx:
#!/usr/bin/python
# coding:utf-8


def changedoc2docxbywin32(fpath, tpath):
    """Convert a .doc file to .docx by automating Microsoft Word.

    Relies on win32com, so it only works on Windows.

    @param fpath: absolute path of the source file; must not contain
                  Chinese characters
    @param tpath: absolute path to save the result to; must not contain
                  Chinese characters
    @return: True when the document was saved, False on any failure
             (including win32com not being available)
    """
    try:
        from win32com import client
        import pythoncom
    except ImportError:
        # pywin32 is not installed (e.g. not running on Windows).
        return False

    word = None
    doc = None
    com_initialized = False
    try:
        pythoncom.CoInitialize()
        com_initialized = True
        word = client.DispatchEx('Word.Application')  # separate Word process
        word.Visible = 0  # keep the Word window hidden
        word.DisplayAlerts = 0  # suppress dialogs
        doc = word.Documents.Open(fpath)
        doc.SaveAs(tpath, 12)  # file format 12 = docx
        return True
    except Exception:
        return False
    finally:
        # Best-effort cleanup: only release what was actually created, and
        # never let a cleanup failure mask the conversion result.
        if doc is not None:
            try:
                doc.Close()
            except Exception:
                pass
        if word is not None:
            try:
                word.Quit()
            except Exception:
                pass
        if com_initialized:
            # Balance the CoInitialize call above.
            pythoncom.CoUninitialize()
读取docx和txt
# coding:utf-8
# !/user/bin/python
import os

try:
    from docx import Document
except ImportError:
    # python-docx is only needed for .docx input; .txt still works without it.
    Document = None


def _join_paragraphs(paragraphs, separated_rows, addStr):
    """Join paragraphs with newlines, inserting addStr as a page separator.

    addStr is appended to every paragraph whose 0-based index is a non-zero
    multiple of separated_rows (the first paragraph never gets one).

    @return: (joined text, page count), where page count is the number of
             inserted separators plus one
    """
    if separated_rows == 0 or not addStr:
        # Nothing to insert, so the whole document is a single page.
        return '\n'.join(paragraphs), 1

    pieces = []
    pages = 1
    for index, paragraph in enumerate(paragraphs):
        if index and index % separated_rows == 0:
            paragraph += addStr
            pages += 1
        pieces.append(paragraph)
    return '\n'.join(pieces), pages


def readDocx(docName, separated_rows=0, addStr=''):
    """Return all text of a .docx or .txt document, ignoring formatting.

    The legacy .doc format is not supported. Any other file extension
    yields an empty text and a page count of 1.

    @param docName: path of the document; the extension selects the reader
    @param separated_rows: int, insert addStr every this many rows
                           (0 disables insertion)
    @param addStr: str, the text to insert (e.g. a page-break marker)
    @return: (document text, page count)
    @raise ImportError: for .docx input when python-docx is not installed
    """
    separated_rows = int(separated_rows)
    extension = os.path.splitext(docName)[1]  # file extension

    if extension == ".txt":
        with open(docName) as f:
            # Drop each line's own terminator so joining with '\n' does not
            # double every line break (matches the .docx branch).
            lines = (line.rstrip('\r\n') for line in f)
            return _join_paragraphs(lines, separated_rows, addStr)

    if extension == ".docx":
        if Document is None:
            raise ImportError("python-docx is required to read .docx files")
        paras = Document(docName)
        texts = (paragraph.text for paragraph in paras.paragraphs)
        return _join_paragraphs(texts, separated_rows, addStr)

    return "", 1