自制查重软件

首先需要导入所需的包,并指定待查重数据所在的目录

#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import division
import os
import re
from functools import reduce
from math import sqrt

# Directory that holds the submissions to check for duplication.
# TODO: replace "." with the real directory; the original article left the
# value blank, which is a syntax error.
path = "."
# Names of every file and sub-directory directly under `path`.
filelist = os.listdir(path)

比较相似度的核心代码部分

class Similarity(object):
    """Cosine similarity between the token-frequency vectors of two texts.

    A token is a maximal run of ASCII letters, digits and the characters
    ``_ . & %``.  Token counts of each text are min-max scaled over the
    union of both vocabularies before the cosine is computed.

    Args:
        target1: First text to compare.
        target2: Second text to compare.
    """

    # Same matches as the former '([a-zA-Z0-9_.&%]+)+': the inner greedy run
    # already consumes the whole token, so the outer repetition was redundant.
    _TOKEN_RE = re.compile(r'[a-zA-Z0-9_.&%]+')

    def __init__(self, target1, target2):
        self.target1 = target1
        self.target2 = target2

    @classmethod
    def _count_tokens(cls, text):
        """Return a dict mapping each token of *text* to its occurrence count."""
        counts = {}
        for token in cls._TOKEN_RE.findall(text):
            counts[token] = counts.get(token, 0) + 1
        return counts

    @staticmethod
    def _min_max_scale(vdict):
        """Scale the values of *vdict* in place to [0, 1] and return it.

        The dict is left untouched when it is empty or when all values are
        equal (the value range is zero, so there is nothing to scale by).
        """
        if not vdict:
            return vdict
        lowest = min(vdict.values())
        spread = max(vdict.values()) - lowest
        if spread != 0:
            for key in vdict:
                vdict[key] = (vdict[key] - lowest) / spread
        return vdict

    def vector(self):
        """Build ``self.vdict1`` / ``self.vdict2``, the token counts of each text."""
        self.vdict1 = self._count_tokens(self.target1)
        self.vdict2 = self._count_tokens(self.target2)

    def mix(self):
        """Align both count dicts on the same keys, then min-max scale each.

        Must be called after :meth:`vector`.  Tokens missing from one text
        are added to its dict with a count of 0.
        """
        for key in self.vdict1:
            self.vdict2.setdefault(key, 0)
        for key in self.vdict2:
            self.vdict1.setdefault(key, 0)
        self.vdict1 = self._min_max_scale(self.vdict1)
        self.vdict2 = self._min_max_scale(self.vdict2)

    def similar(self):
        """Return the cosine similarity of the two scaled token vectors.

        Returns:
            ``0`` when both vectors have zero length (including the case
            where neither text contains any token); otherwise the dot
            product divided by the product of the non-zero vector norms.
        """
        self.vector()
        self.mix()
        dot = sum(self.vdict1[key] * self.vdict2[key] for key in self.vdict1)
        norm1 = sqrt(sum(value * value for value in self.vdict1.values()))
        norm2 = sqrt(sum(value * value for value in self.vdict2.values()))
        if norm1 == 0 and norm2 == 0:
            return 0
        # A zero norm on one side only: divide by the other norm alone to
        # avoid a division by zero.
        if norm1 == 0:
            return dot / norm2
        if norm2 == 0:
            return dot / norm1
        return dot / (norm1 * norm2)
# Names of the student sub-directories (entries of `filelist`) to check.
# TODO: fill this in; the original article left the value blank, which is a
# syntax error.
studentNameList = []


def _read_source(file_path):
    """Return the text of *file_path*, skipping bytes that are not valid UTF-8."""
    with open(file_path, "r", encoding='UTF-8', errors="ignore") as handle:
        return handle.read()


for i in studentNameList:
    # Remove the student from the pool so they are not compared with
    # themselves (and are skipped when later students are processed).
    filelist.remove(i)
    thisStudent = os.path.join(path, i)
    javaListForThisStudent = get_all_files(thisStudent)
    textpath = 'E:\\result\\' + str(i) + '.txt'
    flag = False  # becomes True once any suspicious match has been recorded
    # Open the result file once per student: reopening it in 'w' mode for
    # every compared student would truncate the findings written so far.
    with open(textpath, 'w') as txtfilepath:
        for seed, f in enumerate(filelist, 1):
            print(seed)  # progress: index of the student being compared against
            compareStudent = os.path.join(path, f)
            # Read each file of the other student once instead of once per pair.
            compareTexts = [(n, _read_source(n))
                            for n in get_all_files(compareStudent)]

            score = 0.0
            max_score = 0.0
            num = 0
            for m in javaListForThisStudent:
                t1 = _read_source(m)
                for n, t2 in compareTexts:
                    # NOTE(review): str2float is defined outside this snippet;
                    # the thresholds below presumably expect a 0-100 scale --
                    # confirm against its implementation.
                    thiss = str2float(str(Similarity(t1, t2).similar()))
                    if max_score < thiss:
                        max_score = thiss
                    if thiss > 85:
                        # These two files are extremely similar.
                        txtfilepath.write(i + " " + m + " with " + f + " " + n
                                          + " with score" + str(thiss) + "\n")
                        flag = True
                    score = score + thiss
                    num = num + 1
            if num != 0:
                average = score / num
                if average > 80:
                    print(average)
                    # Record the average that triggered this line, not the
                    # score of whichever file pair happened to be compared last.
                    txtfilepath.write(i + " total with " + f + " "
                                      + str(average) + "\n")
                    flag = True
                print("avg" + str(average))
                print("max" + str(max_score))
        if not flag:
            # Nothing suspicious against any other student.
            txtfilepath.write("ok")

读取全部的结果文件,并进行输出

import os
# Directory holding the per-student result files.
# NOTE(review): the comparison script above writes its results under
# 'E:\\result\\' while this reads 'D:\\result' -- confirm which drive is intended.
path = "D:\\result"
filelist = os.listdir(path)  # names of everything inside the result directory
for i in filelist:
    textpath = os.path.join(path, i)
    # Use a context manager so each result file is closed after printing.
    with open(textpath, 'r') as txtfilepath:
        print(txtfilepath.read())

你可能感兴趣的:(python)