首先需要导入包和数据的目录
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import division
import os
import re
from functools import reduce
from math import sqrt
# TODO: set this to the directory you want to check for duplicates.
# "." (the current working directory) is only a placeholder default.
path = "."
filelist = os.listdir(path)  # names of every file and sub-directory directly under `path`
核心比较相似度的代码部分
class Similarity(object):
    """Cosine-style similarity between the token counts of two texts.

    A token is a maximal run of ASCII letters, digits and the characters
    ``_ . & %``.  Each text is turned into a token -> count mapping, both
    mappings are aligned on the same keys and min-max scaled, and the
    result is the dot product divided by the product of the two norms.

    Args:
        target1: first text to compare.
        target2: second text to compare.
    """

    # Compiled once and shared by both inputs.
    _TOKEN_PATTERN = re.compile('([a-zA-Z0-9_.&%]+)+')

    def __init__(self, target1, target2):
        self.target1 = target1
        self.target2 = target2

    @staticmethod
    def _count_tokens(text):
        """Return a dict mapping each token of ``text`` to its occurrence count."""
        counts = {}
        for token in Similarity._TOKEN_PATTERN.findall(text):
            counts[token] = counts.get(token, 0) + 1
        return counts

    def vector(self):
        """Build ``self.vdict1`` / ``self.vdict2`` from the two texts."""
        self.vdict1 = self._count_tokens(self.target1)
        self.vdict2 = self._count_tokens(self.target2)

    def mix(self):
        """Give both dicts the same key set, then min-max scale each one.

        Must be called after :meth:`vector`.  A key missing from one side is
        added there with a count of 0.  A dict whose values are all equal is
        left unscaled (the range would be zero).
        """
        def mapminmax(vdict):
            # min()/max() raise ValueError on an empty sequence, which
            # happens when neither text contains a single token.
            if not vdict:
                return vdict
            _min = min(vdict.values())
            _max = max(vdict.values())
            _mid = _max - _min
            if _mid != 0:
                for key in vdict:
                    vdict[key] = (vdict[key] - _min) / _mid
            return vdict

        for key in self.vdict1:
            self.vdict2.setdefault(key, 0)
        for key in self.vdict2:
            self.vdict1.setdefault(key, 0)
        self.vdict1 = mapminmax(self.vdict1)
        self.vdict2 = mapminmax(self.vdict2)

    def similar(self):
        """Return the similarity of the two texts.

        Returns:
            A number between 0 and 1 (up to floating-point rounding); 0 when
            at least one of the texts contains no token at all.
        """
        self.vector()
        self.mix()
        dot = sum(self.vdict1[key] * self.vdict2[key] for key in self.vdict1)
        norm1 = sqrt(sum(value * value for value in self.vdict1.values()))
        norm2 = sqrt(sum(value * value for value in self.vdict2.values()))
        if norm1 == 0 and norm2 == 0:
            return 0
        # A zero norm means that side is all zeros, so the dot product is 0;
        # dividing by the other norm only avoids a division by zero.
        if norm1 == 0:
            return dot / norm2
        if norm2 == 0:
            return dot / norm1
        return dot / (norm1 * norm2)
def _read_source(file_path):
    """Return the text of ``file_path``, silently dropping undecodable bytes."""
    # errors="ignore" keeps one badly encoded file from aborting the whole run.
    with open(file_path, "r", encoding='UTF-8', errors="ignore") as handle:
        return handle.read()


# TODO: list the entries of `filelist` (the folders under `path`) to check.
studentNameList = []
for i in studentNameList:
    # Take this entry out of the pool so it is not compared with itself.
    # list.remove raises ValueError when the name is not in `filelist`.
    filelist.remove(i)
    thisStudent = path + "\\" + i
    textpath = 'E:\\result\\' + str(i) + '.txt'
    # get_all_files is defined outside this section; presumably it yields the
    # paths of the source files under a folder (each entry is passed to open()).
    javaListForThisStudent = get_all_files(thisStudent)
    # Read every file of this student once instead of once per compared pair.
    thisStudentSources = [(m, _read_source(m)) for m in javaListForThisStudent]
    flag = False  # becomes True as soon as something suspicious is reported
    seed = 0
    # One report per student, opened once: opening it in 'w' mode for every
    # compared folder would truncate the findings of the previous folders.
    with open(textpath, 'w') as txtfilepath:
        for f in filelist:
            seed = seed + 1
            print(seed)  # index of the folder currently being compared
            compareStudent = path + "\\" + f
            javaListForCompareStudent = get_all_files(compareStudent)
            compareStudentSources = [(n, _read_source(n)) for n in javaListForCompareStudent]
            score = 0.0
            # Deliberately not named `max`: a module-level `max` would shadow
            # the builtin that Similarity.mix() calls.
            max_score = 0.0
            num = 0
            for m, t1 in thisStudentSources:
                for n, t2 in compareStudentSources:
                    s = Similarity(t1, t2)
                    # str2float is defined outside this section.
                    thiss = str2float(str(s.similar()))
                    if max_score < thiss:
                        max_score = thiss
                    # NOTE(review): Similarity.similar() returns at most 1, so the
                    # thresholds 85 / 80 only make sense if str2float rescales to
                    # a percentage - confirm.
                    if thiss > 85:
                        # These two files are extremely similar.
                        txtfilepath.write(i + " " + m + " with " + f + " " + n + " with score" + str(thiss) + "\n")
                        flag = True
                    score = score + thiss
                    num = num + 1
            if num != 0:
                average = score / num
                if average > 80:
                    print(average)
                    # The "total" line reports the average that triggered it.
                    txtfilepath.write(i + " total with " + f + " " + str(average) + "\n")
                    flag = True
                print("avg" + str(average))
                print("max" + str(max_score))
        # Nothing suspicious against any other folder: record "ok" in the report.
        if not flag:
            txtfilepath.write("ok")
读取全部的结果文件,并进行输出
import os
# Print the content of every report stored in the result directory.
# NOTE(review): the comparison step writes its reports under E:\result, while
# this step reads D:\result - confirm which drive is intended.
path = "D:\\result"
filelist = os.listdir(path)  # names of every file and sub-directory directly under `path`
for i in filelist:
    textpath = 'D:\\result\\' + i
    # The with-block closes each report as soon as it has been printed.
    with open(textpath, 'r') as txtfilepath:
        print(txtfilepath.read())