# 有时我们在借鉴一篇文档之后,不希望与原文有太多重复,这时可以使用这个工具对两个 Word 文档进行比对。
import sys
from PyQt5.QtWidgets import QApplication, QMainWindow, QPushButton, QVBoxLayout, QWidget, QLabel, QFileDialog
from docx import Document
import re, datetime
class WordComparerApp(QMainWindow):
    """Small PyQt5 window that compares two .docx files for shared text.

    The user picks two Word files, then presses the compare button. Every
    paragraph of the first file is compared against every paragraph of the
    second one, and paragraphs that share enough text are reported on
    standard output.
    """

    def __init__(self):
        super().__init__()
        # Paths of the two documents; they stay None until the user picks a file.
        self.file1 = None
        self.file2 = None
        self.initUI()

    def initUI(self):
        """Build the window: one label and one button per file, plus a compare button."""
        self.setWindowTitle('Word 文档比较器')
        self.setGeometry(100, 100, 400, 200)
        self.centralWidget = QWidget(self)
        self.setCentralWidget(self.centralWidget)
        self.layout = QVBoxLayout()

        self.file1_label = QLabel('选择文件1:')
        self.layout.addWidget(self.file1_label)
        self.file1_button = QPushButton('选择文件1')
        self.file1_button.clicked.connect(self.openFile1)
        self.layout.addWidget(self.file1_button)

        self.file2_label = QLabel('选择文件2:')
        self.layout.addWidget(self.file2_label)
        self.file2_button = QPushButton('选择文件2')
        self.file2_button.clicked.connect(self.openFile2)
        self.layout.addWidget(self.file2_button)

        self.compare_button = QPushButton('开始比较')
        self.compare_button.clicked.connect(self.compareFiles)
        self.layout.addWidget(self.compare_button)

        self.centralWidget.setLayout(self.layout)

    def _chooseDocx(self, title):
        """Show an open-file dialog filtered to .docx and return the chosen path.

        Returns an empty string when the dialog is cancelled.
        """
        options = QFileDialog.Options()
        path, _ = QFileDialog.getOpenFileName(
            self, title, "", "Word Files (*.docx)", options=options)
        return path

    def openFile1(self):
        """Let the user choose the first document and remember its path."""
        file1 = self._chooseDocx("选择文件1")
        if file1:
            self.file1_label.setText(f'选择文件1: {file1}')
            self.file1 = file1

    def openFile2(self):
        """Let the user choose the second document and remember its path."""
        file2 = self._chooseDocx("选择文件2")
        if file2:
            self.file2_label.setText(f'选择文件2: {file2}')
            self.file2 = file2

    def compareFiles(self):
        """Load both selected documents and compare every pair of paragraphs.

        Progress and matches are printed to standard output. When either file
        has not been selected yet, a hint is printed and nothing is compared.
        """
        # getattr keeps this safe even if the attributes were never assigned.
        file1 = getattr(self, 'file1', None)
        file2 = getattr(self, 'file2', None)
        if not file1 or not file2:
            print('请先选择两个要比较的 Word 文件。')
            return

        doc1 = self.readDocx(file1)
        doc2 = self.readDocx(file2)
        print('开始比对...'.center(80, '*'))
        t1 = datetime.datetime.now()
        total1 = len(doc1)
        total2 = len(doc2)
        for i in range(total1):
            if i % 100 == 0:
                print('处理进行中,已处理段落 {0:>4d} (总数 {1:0>4d} ) '.format(i, total1))
            for j in range(total2):
                self.compareParagraph(doc1, i, doc2, j)
        t2 = datetime.datetime.now()
        print('\n比对完成,总用时: ', t2 - t1)

    def getText(self, wordname):
        """Return the text of every paragraph in the .docx file *wordname*."""
        d = Document(wordname)
        return [para.text for para in d.paragraphs]

    def msplit(self, s, separators=r',|\.|\?|,|。|?|!'):
        """Split *s* on the regular expression *separators*.

        The default pattern matches ASCII and full-width sentence punctuation.
        It is a raw string so the backslash escapes reach ``re`` unchanged.
        """
        return re.split(separators, s)

    def readDocx(self, docfile):
        """Load *docfile* and return its paragraphs as lists of short segments.

        Each paragraph is split with ``msplit``. Segments of two characters or
        fewer are dropped, spaces are removed from the remaining segments, and
        paragraphs left without any segment are skipped. Loading time and
        summary statistics are printed.
        """
        print('*' * 80)
        print('文件', docfile, '加载中……')
        t1 = datetime.datetime.now()
        paras = self.getText(docfile)
        segs = []
        for p in paras:
            # The length filter is applied before spaces are stripped.
            temp = [s.replace(' ', "") for s in self.msplit(p) if len(s) > 2]
            if temp:
                segs.append(temp)
        t2 = datetime.datetime.now()
        print('加载完成,用时: ', t2 - t1)
        self.showInfo(segs, docfile)
        return segs

    def showInfo(self, doc, filename='filename'):
        """Print paragraph, segment and character counts for *doc*.

        *doc* is a list of paragraphs, each a list of segment strings.
        *filename* is accepted for compatibility but is not used.
        """
        segs = sum(len(p) for p in doc)
        chars = sum(len(s) for p in doc for s in p)
        print('段落数: {0:>8d} 个。'.format(len(doc)))
        print('短句数: {0:>8d} 句。'.format(segs))
        print('字符数: {0:>8d} 个。'.format(chars))

    def compareParagraph(self, doc1, i, doc2, j, min_segment=5):
        """Compare paragraph *i* of *doc1* with paragraph *j* of *doc2*.

        A pair of segments matches when one is contained in the other; the
        contained one is recorded. Segments shorter than *min_segment* are
        ignored, and paragraphs with fewer than 10 characters in total are
        not compared at all. A report is printed when more than 10 characters
        match and they exceed 10% of the shorter paragraph.

        Returns the list of matched segments (empty when the paragraphs are
        too short). A segment matched several times appears several times, so
        the printed ratio can exceed 100%.
        """
        p1 = doc1[i]
        p2 = doc2[j]
        len1 = sum(len(s) for s in p1)
        len2 = sum(len(s) for s in p2)
        if len1 < 10 or len2 < 10:
            return []

        lst = []
        for s1 in p1:
            if len(s1) < min_segment:
                continue
            for s2 in p2:
                if len(s2) < min_segment:
                    continue
                if s2 in s1:
                    lst.append(s2)
                elif s1 in s2:
                    lst.append(s1)

        count = sum(len(s) for s in lst)
        # Both lengths are at least 10 here, so the division is safe.
        ratio = float(count) / min(len1, len2)
        if count > 10 and ratio > 0.1:
            print(' 发现相同内容 '.center(80, '*'))
            print('文件1第{0:0>4d}段内容:{1}'.format(i + 1, p1))
            print('文件2第{0:0>4d}段内容:{1}'.format(j + 1, p2))
            print('相同内容:', lst)
            print('相同字符比:{1:.2f}%\n相同字符数: {0}\n'.format(count, ratio * 100))
        return lst
def main():
    """Create the Qt application, show the comparer window and run the event loop.

    The process exits with the event loop's return code.
    """
    qt_app = QApplication(sys.argv)
    window = WordComparerApp()
    window.show()
    exit_code = qt_app.exec_()
    sys.exit(exit_code)


# Start the GUI only when this file is executed as a script.
if __name__ == '__main__':
    main()
# 文档查重器
# 如果觉得这篇文章对你有用,请点赞、关注 ->> 你的点赞对我很有帮助。
# 群内交流更多技术:
# 130856474 <-- 在这里