Python 调用 python-docx 模块实现 docx 文件内容交叉查重(升级版)

# !/usr/bin/env python
# -*-coding:utf-8 -*-
# File       : docx文件交叉查重.py
# Time       :2023/9/2 13:28
# Author     :QQ736592720
# pip install python-docx
# pip install pypiwin32
# pip install pyinstaller
# pyinstaller -F docx文件交叉查重.py
# pyinstaller -F -i 128.ico docx文件交叉查重.py

import glob
import os
import re
from docx import Document


# from docx.oxml.ns import qn  # 可用于设置中文格式


# from win32com import client as wc
# def doc_to_docx(oldfilename, newfilename):
#     word = wc.Dispatch("Word.Application")
#     doc = word.Documents.Open(oldfilename)
#     doc.SaveAs(newfilename, 12)
#     doc.Close()
#     word.Quit()


# def set_run(run, size, color):
#     '''
#     设置run的字体大小,是否加粗,字体颜色
#     '''
#     run.font.name = u"宋体"
#     run.element.rPr.rFonts.set(qn('w:eastAsia'), "宋体")
#     run.font.size = size  # 设置字体大小为12磅 相当于 小四
#     run.font.color.rgb = color  # 字体颜色 red
#     run.bold = False  # 是否加粗
#     run.italic = False  # 斜体
#     run.text = "111"  # 修改文本


def get_docx_dic(filenames):
    """
    Load the paragraph text of every given .docx file.

    The files are only read; nothing is written back to disk.

    :param filenames: iterable of paths to .docx files.
    :return: dict mapping each file's base name without its extension
             (e.g. "A1" for ".../A1.docx") to a tuple holding the text of
             every paragraph in ``doc.paragraphs``, in document order.
    """
    dic = {}
    for file in filenames:
        # Use the bare file name as key; splitext avoids assuming that the
        # extension is exactly five characters long.
        file_key = os.path.splitext(os.path.basename(file))[0]
        doc = Document(file)
        dic[file_key] = tuple(p.text for p in doc.paragraphs)
        # Deliberately no doc.save(file): loading reference text must not
        # rewrite the user's original documents.
    return dic


def find_in_dic(s, dic, weight):
    """
    Look for a sentence inside the paragraphs of the reference documents.

    The sentence is normalised before searching: surrounding whitespace is
    stripped, "。" and ";" are removed, and everything before the first
    character in the range U+4E00-U+9FA5 is dropped (when such a character
    exists). A sentence that ends up empty, or shorter than ``weight``
    characters, is not searched at all.

    Every hit is also reported on stdout.

    :param s: sentence to look up.
    :param dic: mapping of document key -> sequence of paragraph texts.
    :param weight: minimum length of the normalised sentence.
    :return: concatenation of "[ref:<key>P<n>]" markers, one per matching
             paragraph (n is the 1-based paragraph index), or "" when
             nothing matches.
    """
    s = s.strip().replace("。", "").replace(";", "")
    r = re.search("[\u4e00-\u9fa5]", s)  # first Chinese character, if any
    if r:
        s = s[r.start():]  # ignore numbering / latin prefixes before it
    # An empty needle is a substring of every paragraph, so with weight == 0
    # it would be flagged as a duplicate of everything. Treat it as no match.
    if not s or len(s) < weight:
        return ""
    refs = []
    for k, v in dic.items():
        for p_index, p in enumerate(v, start=1):
            if len(p.strip()) >= len(s) and s in p:
                print("Author:QQ736592720--- repf found in :  " + k + ".docx P" + str(p_index))
                refs.append("[ref:" + k + "P" + str(p_index) + "]")
    return "".join(refs)


def main(root, weight=0):
    """
    Cross-check every .docx file in ``root`` against all the other ones.

    For each document, every paragraph longer than ``weight`` characters is
    split into sentences on newline, "。" and ";". Each sentence is looked up
    in the other documents with ``find_in_dic``; sentences that are found get
    the returned reference markers appended in the paragraph text. Two files
    are written next to each input ``<name>.docx``:

    * ``<name>_result.docx`` - the document with the markers inserted;
    * ``<name>_result.txt``  - one annotated sentence per line (UTF-8).

    Result files left over from a previous run are deleted first.

    :param root: directory containing the .docx files to compare.
    :param weight: length threshold; paragraphs must be longer than this and
                   it is passed on to ``find_in_dic`` for each sentence.
    """
    # Remove the outputs of a previous run so they are not picked up as inputs.
    for pattern in ("*_result.docx", "*_result.txt"):
        for old_result in glob.glob(os.path.join(root, pattern)):
            os.remove(old_result)

    # Word keeps "~$name.docx" lock files next to documents that are open;
    # they match the glob but are not real documents, so leave them out.
    ls_dir0 = [
        f for f in glob.glob(os.path.join(root, "*.docx"))
        if not os.path.basename(f).startswith("~$")
    ]
    dic = get_docx_dic(ls_dir0)  # paragraph text of every document
    for file in ls_dir0:
        print("Author:QQ736592720---当前目标文件:" + file)
        stem = os.path.splitext(file)[0]  # path without the ".docx" suffix
        file_key = os.path.basename(stem)  # e.g. "A1"
        # Compare against every document except the one being checked.
        dic1 = {k: v for k, v in dic.items() if k != file_key}

        ls_txt = []  # annotated duplicate sentences for the text report
        doc = Document(file)
        for p in doc.paragraphs:
            p1 = p.text.strip()
            if len(p1) <= weight:
                continue
            annotated = set()  # sentences already marked in this paragraph
            for x in re.split(r"[\n。;]", p1):
                if not x:
                    continue
                r = find_in_dic(x, dic1, weight)
                if not r:
                    continue
                # str.replace marks every occurrence at once, so doing it
                # again for a repeated sentence would stack the markers.
                if x not in annotated:
                    # NOTE: assigning p.text collapses the paragraph into a
                    # single run, so character-level formatting is lost.
                    p.text = p.text.replace(x, x + r)
                    annotated.add(x)
                ls_txt.append(x + r)

        save_docx = stem + "_result.docx"
        doc.save(save_docx)
        print("Author:QQ736592720---文件保存成功:" + save_docx)
        save_txt = stem + "_result.txt"
        with open(save_txt, "w", encoding="utf-8") as f:
            f.write("\n".join(ls_txt))
        print("Author:QQ736592720---文件保存成功:" + save_txt)


if __name__ == '__main__':
    # Only sentences of at least this many characters are checked. Kept in
    # one place so the printed value always matches what main() receives.
    weight = 15
    print("Author:QQ736592720---程序开始......")
    print("Author:QQ736592720---当前文字起步权重设置 = " + str(weight))
    # Documents are taken from the current working directory.
    root = os.getcwd()
    # root = r"C:\Users\999\Desktop\查重"
    main(root, weight=weight)
    print("Author:QQ736592720---运行结束")

你可能感兴趣的:(python,c#,开发语言)