The underlying theory was covered in the companion article "Data Mining Notes: Finding Similar Articles in Java"; this post only records a Python implementation. Word segmentation uses the jieba Python package.
Code is hosted at: https://github.com/fighting-one-piece/repository-datamining.git
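For reference, the quantities computed below are the standard ones: for a word $w$ occurring $\mathrm{count}(w,d)$ times in a document $d$ of $|d|$ words, out of $N$ documents in total,

$$\mathrm{tfidf}(w,d) = \frac{\mathrm{count}(w,d)}{|d|} \cdot \log\frac{N}{\mathrm{df}(w)+1},$$

where $\mathrm{df}(w)$ is the number of documents containing $w$ (the code caps the denominator at $N$ so the idf never goes negative). Pairs of documents are then compared by the cosine of their word-count vectors:

$$\cos(v_1, v_2) = \frac{v_1 \cdot v_2}{\lVert v_1 \rVert \, \lVert v_2 \rVert}.$$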
import os
import math

class Doc:
    def __init__(self, name):
        self._name = name
    def setName(self, name):
        self._name = name
    def getName(self):
        return self._name
    def setCategory(self, category):
        self._category = category
    def getCategory(self):
        return self._category
    def setWords(self, words):
        self._words = words
    def getWords(self):
        return self._words
    def setTfidfWords(self, tfidfWords):
        self._tfidfWords = tfidfWords
    def getTfidfWords(self):
        return self._tfidfWords
    def getSortedTfidfWords(self):
        # sort (word, tfidf) pairs by descending tf-idf value
        return sorted(self._tfidfWords.items(), key=lambda i: i[1], reverse=True)
    def setCHIWords(self, chiWords):
        self._chiWords = chiWords
    def getCHIWords(self):
        return self._chiWords
    def setSimilarities(self, similarities):
        self._similarities = similarities
    def getSimilarities(self):
        return self._similarities
class DocSimilarity:
    def getName1(self):
        return self._name1
    def setName1(self, name1):
        self._name1 = name1
    def getName2(self):
        return self._name2
    def setName2(self, name2):
        self._name2 = name2
    def getVector1(self):
        return self._vector1
    def setVector1(self, vector1):
        self._vector1 = vector1
    def getVector2(self):
        return self._vector2
    def setVector2(self, vector2):
        self._vector2 = vector2
    def getCosine(self):
        return self._cosine
    def setCosine(self, cosine):
        self._cosine = cosine
class DocHelper:
    @staticmethod
    def genDocs(path):
        docs = []
        DocHelper.genDocsIterator(path, docs)
        return docs
    @staticmethod
    def genDocsIterator(path, docs):
        # walk the directory tree; every regular file becomes a Doc whose
        # category is the name of its parent directory
        if os.path.isdir(path):
            for subPathName in os.listdir(path):
                subPath = os.path.join(path, subPathName)
                DocHelper.genDocsIterator(subPath, docs)
        else:
            name = os.path.splitext(os.path.basename(path))[0]
            doc = Doc(name)
            doc.setCategory(os.path.basename(os.path.dirname(path)))
            doc.setWords(WordUtils.splitFile(path))
            docs.append(doc)
    @staticmethod
    def docHasWord(doc, word):
        return word in doc.getWords()
    @staticmethod
    def docWordsStatistics(doc):
        # count how many times each word occurs in the document
        counts = {}
        for word in doc.getWords():
            counts[word] = counts.get(word, 0) + 1
        return counts
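    # Note: collections.Counter(doc.getWords()) from the standard library
    # would produce the same mapping in one call; the explicit loop above
    # just mirrors the original post.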
    @staticmethod
    def docCategorySplit(docs):
        # group documents by category name
        docSplits = {}
        for doc in docs:
            category = doc.getCategory()
            if category in docSplits:
                docSplits[category].append(doc)
            else:
                docSplits[category] = [doc]
        return docSplits
    @staticmethod
    def docTopNWords(doc, n):
        # the n words with the highest tf-idf scores in this document
        sortedWords = DocHelper.sortWordValueMap(doc.getTfidfWords())
        return [item[0] for item in sortedWords[:n]]
    @staticmethod
    def docWordsVector(doc, words):
        # project the document onto the given vocabulary as a count vector
        docWords = DocHelper.docWordsStatistics(doc)
        return [docWords.get(word, 0) for word in words]
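    # Example: with words = ['a', 'b', 'c'] and a document containing
    # 'a' twice and 'c' once, docWordsVector returns [2, 0, 1].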
    @staticmethod
    def wordCategorySplit(category, docs):
        # split docs into those belonging to the category and the rest
        belongDocs = []
        nobelongDocs = []
        for doc in docs:
            if category == doc.getCategory():
                belongDocs.append(doc)
            else:
                nobelongDocs.append(doc)
        return belongDocs, nobelongDocs
    @staticmethod
    def wordInDocsStatistics(word, docs):
        # number of documents that contain the word
        count = 0
        for doc in docs:
            if DocHelper.docHasWord(doc, word):
                count += 1
        return count
    @staticmethod
    def wordNotInDocsStatistics(word, docs):
        # number of documents that do not contain the word
        count = 0
        for doc in docs:
            if not DocHelper.docHasWord(doc, word):
                count += 1
        return count
    @staticmethod
    def calculateTFIDF(docs):
        docTotalCount = float(len(docs))
        for doc in docs:
            wordTotalCount = len(doc.getWords())
            tfidfWords = {}
            docWords = DocHelper.docWordsStatistics(doc)
            for word, wordCount in docWords.items():
                tf = float(wordCount) / wordTotalCount
                # document frequency with +1 smoothing, capped at the total
                # document count so the idf never goes negative
                docCount = DocHelper.wordInDocsStatistics(word, docs) + 1
                if docCount > docTotalCount:
                    docCount = docTotalCount
                idf = math.log(docTotalCount / docCount)
                tfidfWords[word] = tf * idf
            doc.setTfidfWords(tfidfWords)
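    # Worked example: a word occurring 3 times in a 100-word document has
    # tf = 3/100 = 0.03; if it appears in 2 of 10 documents, the smoothed
    # docCount is 3, idf = log(10/3) ≈ 1.20, and tf-idf ≈ 0.036.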
    @staticmethod
    def calculateSimilar(docs):
        # compare every pair of documents over the union of their top-20
        # tf-idf words; note each document is also compared with itself,
        # which always yields cosine 1
        for doc in docs:
            topWords = DocHelper.docTopNWords(doc, 20)
            similarities = []
            for odoc in docs:
                otopWords = DocHelper.docTopNWords(odoc, 20)
                words = WordUtils.mergeAndRemoveRepeat(topWords, otopWords)
                v1 = DocHelper.docWordsVector(doc, words)
                v2 = DocHelper.docWordsVector(odoc, words)
                cosine = DistanceUtils.cosine(v1, v2)
                similarity = DocSimilarity()
                similarity.setName1(doc.getName())
                similarity.setName2(odoc.getName())
                similarity.setVector1(v1)
                similarity.setVector2(v2)
                similarity.setCosine(cosine)
                similarities.append(similarity)
            doc.setSimilarities(similarities)
    @staticmethod
    def sortWordValueMap(wordValueMap):
        # sort (word, value) pairs by descending value
        return sorted(wordValueMap.items(), key=lambda i: i[1], reverse=True)
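calculateSimilar also relies on a DistanceUtils class that the post does not show. A minimal sketch of the cosine computation it expects (the class and method names come from the call site; the body is my assumption, matching the cosine formula above) could be:

import math

class DistanceUtils:
    @staticmethod
    def cosine(v1, v2):
        # cosine of the angle between two equal-length count vectors
        dot = sum(a * b for a, b in zip(v1, v2))
        norm1 = math.sqrt(sum(a * a for a in v1))
        norm2 = math.sqrt(sum(b * b for b in v2))
        if norm1 == 0 or norm2 == 0:
            return 0.0
        return dot / (norm1 * norm2)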
import jieba as ws
class WordUtils:
    @staticmethod
    def split(text):
        # jieba accurate mode; cut() returns a generator of tokens
        return list(ws.cut(text, cut_all=False))
    @staticmethod
    def splitFile(path):
        # corpus files are assumed to be UTF-8 encoded
        words = []
        with open(path, encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if len(line) > 0:
                    words.extend(WordUtils.split(line))
        return WordUtils.removeStopWords(words)
    @staticmethod
    def removeStopWords(words):
        # load the stopword list once into a set for O(1) membership tests
        with open("stopwords.dic", encoding='utf-8') as f:
            stopwords = set(line.strip() for line in f if len(line.strip()) > 0)
        return [word for word in words
                if word not in stopwords and len(word.strip()) > 0]
    @staticmethod
    def mergeAndRemoveRepeat(w1, w2):
        # union of the two word lists with duplicates removed
        return list(set(w1) | set(w2))
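To sanity-check the segmentation step in isolation (the sample sentence is arbitrary; split simply returns jieba's tokens as a list):

print(WordUtils.split('寻找相似文章'))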
def testSimilarity():
    path = r'D:\resources\chinese'
    docs = DocHelper.genDocs(path)
    DocHelper.calculateTFIDF(docs)
    DocHelper.calculateSimilar(docs)
    for doc in docs:
        print('----------')
        for similarity in doc.getSimilarities():
            print('%s-%s-%s' % (similarity.getName1(),
                similarity.getName2(), similarity.getCosine()))
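Since genDocsIterator derives each document's category from its parent directory name, testSimilarity assumes a layout of category subfolders under path, one text file per document (the folder and file names here are only placeholders):

D:\resources\chinese\
    sports\
        doc1.txt
        doc2.txt
    finance\
        doc3.txt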