用python实现一个文档小工具(支持文档关键字筛选)

功能:根据输入的关键词,批量扫描源文件夹中的 doc、docx、pdf 文件,筛选出内容包含该关键词的文件,并复制到输出文件夹

下面开始上代码。本人不是专业的 Python 程序员,代码写得不好,勿喷,哈哈。

from PyQt5.QtWidgets import *
from PyQt5.QtGui import *
from PyQt5.QtCore import *
import sys, os
import docx
from docx import Document
import os
import shutil
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams
from pdfminer.pdfparser import PDFParser, PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice

class Window(QDialog):
    """Dialog that copies Word/PDF files containing a keyword to a folder.

    The user picks a source folder, an output folder and a keyword. Every
    ``.doc``/``.docx``/``.pdf`` file found under the source folder (searched
    recursively) whose text contains the keyword is copied into the output
    folder.

    NOTE: the scan runs synchronously inside the button's click handler, so
    the dialog does not repaint until the scan has finished.
    """

    def __init__(self, parent=None):
        super(Window, self).__init__(parent)
        # Initial text for both path fields.
        self.path = ''
        self.initUI()
        self.setWindowTitle("文件小助手")
        self.resize(240, 200)

    def initUI(self):
        """Build the form: two folder pickers, the keyword field, the run button."""
        grid = QGridLayout()

        # Row 0: source folder.
        grid.addWidget(QLabel("源路径:"), 0, 0)
        self.pathLineEdit = QLineEdit()
        self.pathLineEdit.setFixedWidth(200)
        self.pathLineEdit.setText(self.path)
        grid.addWidget(self.pathLineEdit, 0, 1)
        source_button = QPushButton("选择文件夹")
        grid.addWidget(source_button, 0, 3)
        source_button.clicked.connect(self.msg)

        # Row 1: output folder.
        grid.addWidget(QLabel("输出路径:"), 1, 0)
        self.pathLineEdit1 = QLineEdit()
        self.pathLineEdit1.setFixedWidth(200)
        self.pathLineEdit1.setText(self.path)
        grid.addWidget(self.pathLineEdit1, 1, 1)
        target_button = QPushButton("选择文件夹")
        grid.addWidget(target_button, 1, 3)
        target_button.clicked.connect(self.msg1)

        # Row 2: keyword. The grid layout owns the widget geometry, so no
        # manual move()/resize() is needed here.
        grid.addWidget(QLabel("关键字:"), 2, 0)
        self.textbox = QLineEdit(self)
        grid.addWidget(self.textbox, 2, 1)

        # Row 3: run button. The widgets (not their current text) are passed
        # so that the values are read at click time.
        self.button1 = QPushButton('点我开始干活儿', self)
        grid.addWidget(self.button1, 3, 1)
        self.setLayout(grid)
        self.button1.clicked.connect(
            lambda: self.working(self.pathLineEdit, self.pathLineEdit1, self.textbox))

    def msg(self):
        """Let the user choose the source folder and show it in the form."""
        chosen = QFileDialog.getExistingDirectory(self, "选取文件夹", "./")  # start directory
        # An empty result means the dialog was cancelled: keep the old value.
        if chosen:
            self.pathLineEdit.setText(chosen)
        print(chosen)

    def msg1(self):
        """Let the user choose the output folder and show it in the form."""
        chosen = QFileDialog.getExistingDirectory(self, "选取文件夹", "./")  # start directory
        # An empty result means the dialog was cancelled: keep the old value.
        if chosen:
            self.pathLineEdit1.setText(chosen)
        print(chosen)

    # Word parser
    def readDoc(self, root, path, target, key):
        """Copy the Word file at ``path`` to ``target`` if it contains ``key``.

        Paragraph text is searched first, then the text of every table cell.
        Files that python-docx cannot open (for example legacy binary
        ``.doc`` files) are skipped silently. The source file is never
        renamed or modified.

        Args:
            root: Directory containing the file. Unused; kept so existing
                callers keep working.
            path: Full path of the Word file.
            target: Output directory.
            key: Keyword to look for.
        """
        try:
            document = Document(path)
        except Exception:
            # Not a readable docx package: best-effort skip.
            return

        in_paragraphs = any(key in paragraph.text for paragraph in document.paragraphs)
        if in_paragraphs or self._tables_contain(document, key):
            # Copy exactly once, however many matches there are.
            self.copyFile(target, path)

    @staticmethod
    def _tables_contain(document, key):
        """Return True if any table cell of ``document`` contains ``key``."""
        for table in document.tables:
            for row in table.rows:
                for cell in row.cells:
                    if key in cell.text:
                        return True
        return False

    # PDF parser
    def readPdf(self, root, path, target, key):
        """Copy the PDF file at ``path`` to ``target`` if it contains ``key``.

        A PDF that cannot be opened or parsed is reported on stdout and
        skipped, mirroring how ``readDoc`` treats unreadable Word files.

        Args:
            root: Directory containing the file. Unused; kept so existing
                callers keep working.
            path: Full path of the PDF file.
            target: Output directory.
            key: Keyword to look for.
        """
        try:
            found = self._pdf_contains(path, key)
        except Exception as err:
            print('skipped unreadable pdf:', path, err)
            return
        if found:
            self.copyFile(target, path)

    @staticmethod
    def _pdf_contains(path, key):
        """Return True if any text object on any page of the PDF contains ``key``."""
        # The context manager closes the file even when parsing fails.
        with open(path, "rb") as fp:
            # Wire the parser and the document object together.
            parser = PDFParser(fp)
            doc = PDFDocument()
            parser.set_document(doc)
            doc.set_parser(parser)

            # Empty string: the document is opened without a password.
            doc.initialize("")

            resource = PDFResourceManager()
            device = PDFPageAggregator(resource, laparams=LAParams())
            interpreter = PDFPageInterpreter(resource, device)

            for page in doc.get_pages():
                interpreter.process_page(page)
                # The aggregator holds the layout of the page just processed.
                for item in device.get_result():
                    if hasattr(item, "get_text") and key in item.get_text():
                        # Stop at the first hit; later pages need not be parsed.
                        return True
        return False

    # Copy a file
    def copyFile(self, path, oldname):
        """Copy the file ``oldname`` into the directory ``path``.

        The directory (including missing parents) is created when needed.
        The copy keeps the base name of ``oldname``, so an existing file of
        the same name in ``path`` is overwritten.

        Args:
            path: Destination directory.
            oldname: Full path of the file to copy.
        """
        os.makedirs(path, exist_ok=True)
        destination = os.path.join(path, os.path.basename(oldname))
        # Happens when the output folder lies inside the source tree and the
        # scan reaches an earlier copy; copying a file onto itself would raise.
        if os.path.abspath(destination) == os.path.abspath(oldname):
            return
        shutil.copyfile(oldname, destination)

    # Start working
    @pyqtSlot()
    def working(self, pathLineEdit1, pathLineEdit2, textbox):
        """Validate the form, scan the source tree and report the outcome.

        Args:
            pathLineEdit1: Line edit holding the source folder.
            pathLineEdit2: Line edit holding the output folder.
            textbox: Line edit holding the keyword.
        """
        sourcedir = pathLineEdit1.text()
        targetdir = pathLineEdit2.text()
        key = textbox.text()

        # Reject the first blank field, in form order.
        required = (
            (sourcedir, '源路径不能为空'),
            (targetdir, '输出路径不能为空'),
            (key, '关键字不能为空'),
        )
        for value, complaint in required:
            if not value.strip():
                QMessageBox.warning(self, "Message", complaint,
                                    QMessageBox.Ok, QMessageBox.Ok)
                return

        # Process the files.
        found_supported = False
        for root, dirs, files in os.walk(sourcedir):
            for file in files:
                fullpath = os.path.join(root, file)
                # Compare the real extension, case-insensitively, so that
                # "notes.adoc" is not taken for Word and "A.PDF" is not missed.
                extension = os.path.splitext(file)[1].lower()
                if extension in ('.doc', '.docx'):
                    self.readDoc(root, fullpath, targetdir, key)
                    found_supported = True
                elif extension == '.pdf':
                    self.readPdf(root, fullpath, targetdir, key)
                    found_supported = True

        msg = '处理好了' if found_supported else '源路径中没有word和pdf文件'
        QMessageBox.information(self, "Message", msg,
                                QMessageBox.Ok, QMessageBox.Ok)

if __name__ == '__main__':
    # Script entry point: create the Qt application and run the dialog
    # modally. The dialog's result code is not used.
    app = QApplication(sys.argv)
    dialog = Window()
    dialog.exec_()

 

工具演示效果图如下

用python实现一个文档小工具(支持文档关键字筛选)_第1张图片

 

工具下载链接:  https://pan.baidu.com/s/1w7CQUAowSgR_d6V2h5OlwA  密码:kyuy


文末小福利免费视频资源网站:www.sousuohou.com

转载于:https://www.cnblogs.com/vicF/p/9803566.html

你可能感兴趣的:(用python实现一个文档小工具(支持文档关键字筛选))