The underlying theory was already covered in the earlier article 数据挖掘笔记-寻找相似文章-Java (Data Mining Notes: Finding Similar Articles, Java); this post only records a Python implementation of the same approach. Word segmentation is done with the jieba (结巴) Python package.
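As a quick refresher of the weighting used below: tf is a word's count in a document divided by the document's total word count, and idf is log(total documents / documents containing the word), where the code below adds 1 to the containing-document count. A worked example with made-up numbers:

import math

# Hypothetical numbers for illustration only: a word occurring 3 times in a
# 100-word document, appearing in 4 of 10 documents; the code in this post
# adds 1 to the document count before taking the log.
tf = 3 / 100.0
idf = math.log(10.0 / (4 + 1))
print(tf * idf)  # about 0.0208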
The code is hosted at: https://github.com/fighting-one-piece/repository-datamining.git
import os
import math

class Doc:
    def __init__(self, name):
        self._name = name

    def setName(self, name):
        self._name = name

    def getName(self):
        return self._name

    def setCategory(self, category):
        self._category = category

    def getCategory(self):
        return self._category

    def setWords(self, words):
        self._words = words

    def getWords(self):
        return self._words

    def setTfidfWords(self, tfidfWords):
        self._tfidfWords = tfidfWords

    def getTfidfWords(self):
        return self._tfidfWords

    def getSortedTfidfWords(self):
        return sorted(self._tfidfWords.items(), key=lambda i: i[1], reverse=True)

    def setCHIWords(self, chiWords):
        self._chiWords = chiWords

    def getCHIWords(self):
        return self._chiWords

    def setSimilarities(self, similarities):
        self._similarities = similarities

    def getSimilarities(self):
        return self._similarities


class DocSimilarity:
    def getName1(self):
        return self._name1

    def setName1(self, name1):
        self._name1 = name1

    def getName2(self):
        return self._name2

    def setName2(self, name2):
        self._name2 = name2

    def getVector1(self):
        return self._vector1

    def setVector1(self, vector1):
        self._vector1 = vector1

    def getVector2(self):
        return self._vector2

    def setVector2(self, vector2):
        self._vector2 = vector2

    def getCosine(self):
        return self._cosine

    def setCosine(self, cosine):
        self._cosine = cosine


class DocHelper:
    @staticmethod
    def genDocs(path):
        docs = []
        DocHelper.genDocsIterator(path, docs)
        return docs

    @staticmethod
    def genDocsIterator(path, docs):
        # Walk the directory tree; each leaf file becomes a Doc whose
        # category is the name of its parent directory.
        if os.path.isdir(path):
            for subPathName in os.listdir(path):
                subPath = os.path.join(path, subPathName)
                DocHelper.genDocsIterator(subPath, docs)
        else:
            name = os.path.splitext(os.path.basename(path))[0]
            doc = Doc(name)
            doc.setCategory(os.path.basename(os.path.dirname(path)))
            doc.setWords(WordUtils.splitFile(path))
            docs.append(doc)

    @staticmethod
    def docHasWord(doc, word):
        return word in doc.getWords()

    @staticmethod
    def docWordsStatistics(doc):
        # Term frequency counts for a single document.
        counts = {}
        for word in doc.getWords():
            counts[word] = counts.get(word, 0) + 1
        return counts

    @staticmethod
    def docCategorySplit(docs):
        # Group documents by category.
        docSplits = {}
        for doc in docs:
            category = doc.getCategory()
            if category in docSplits:
                docSplits[category].append(doc)
            else:
                docSplits[category] = [doc]
        return docSplits

    @staticmethod
    def docTopNWords(doc, n):
        sortedWords = DocHelper.sortWordValueMap(doc.getTfidfWords())
        return [item[0] for item in sortedWords[0:n]]

    @staticmethod
    def docWordsVector(doc, words):
        # Build a term-count vector for the document over the given word list.
        vector = []
        docWords = DocHelper.docWordsStatistics(doc)
        for word in words:
            vector.append(docWords.get(word, 0))
        return vector

    @staticmethod
    def wordCategorySplit(category, docs):
        belongDocs = []
        nobelongDocs = []
        for doc in docs:
            if category == doc.getCategory():
                belongDocs.append(doc)
            else:
                nobelongDocs.append(doc)
        return belongDocs, nobelongDocs

    @staticmethod
    def wordInDocsStatistics(word, docs):
        # Number of documents that contain the word.
        return sum(1 for doc in docs if DocHelper.docHasWord(doc, word))

    @staticmethod
    def wordNotInDocsStatistics(word, docs):
        return sum(1 for doc in docs if not DocHelper.docHasWord(doc, word))

    @staticmethod
    def calculateTFIDF(docs):
        docTotalCount = float(len(docs))
        for doc in docs:
            wordTotalCount = len(doc.getWords())
            tfidfWords = {}
            docWords = DocHelper.docWordsStatistics(doc)
            for word, wordCount in docWords.items():
                tf = float(wordCount) / wordTotalCount
                # +1 smoothing on the document count; cap it at the total
                # document count so idf never goes negative.
                docCount = DocHelper.wordInDocsStatistics(word, docs) + 1
                if docCount > docTotalCount:
                    docCount = docTotalCount
                idf = math.log(docTotalCount / docCount)
                tfidfWords[word] = tf * idf
            doc.setTfidfWords(tfidfWords)

    @staticmethod
    def calculateSimilar(docs):
        # For each pair of documents, merge their top-20 TF-IDF words,
        # build count vectors over the merged vocabulary, and take the
        # cosine of the two vectors as the similarity.
        for doc in docs:
            topWords = DocHelper.docTopNWords(doc, 20)
            similarities = []
            for odoc in docs:
                otopWords = DocHelper.docTopNWords(odoc, 20)
                words = WordUtils.mergeAndRemoveRepeat(topWords, otopWords)
                v1 = DocHelper.docWordsVector(doc, words)
                v2 = DocHelper.docWordsVector(odoc, words)
                cosine = DistanceUtils.cosine(v1, v2)
                similarity = DocSimilarity()
                similarity.setName1(doc.getName())
                similarity.setName2(odoc.getName())
                similarity.setVector1(v1)
                similarity.setVector2(v2)
                similarity.setCosine(cosine)
                similarities.append(similarity)
            doc.setSimilarities(similarities)

    @staticmethod
    def sortWordValueMap(wordValueMap):
        return sorted(wordValueMap.items(), key=lambda i: i[1], reverse=True)
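Note that DistanceUtils is called above but its source is not listed in this post (it lives in the repository linked at the top). A minimal sketch consistent with how it is used here, i.e. the cosine of two equal-length count vectors, could look like this:

import math

class DistanceUtils:
    @staticmethod
    def cosine(v1, v2):
        # Cosine similarity: dot(v1, v2) / (|v1| * |v2|).
        # Sketch only; the repository version may differ in details.
        dot = sum(a * b for a, b in zip(v1, v2))
        norm1 = math.sqrt(sum(a * a for a in v1))
        norm2 = math.sqrt(sum(b * b for b in v2))
        if norm1 == 0 or norm2 == 0:
            return 0.0  # define similarity as 0 when a vector is all zeros
        return dot / (norm1 * norm2)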
import jieba as ws

class WordUtils:
    @staticmethod
    def split(text):
        # Precise-mode segmentation with jieba.
        return [word for word in ws.cut(text, cut_all=False)]

    @staticmethod
    def splitFile(path):
        words = []
        with open(path, encoding='utf-8') as file:
            for line in file:
                line = line.strip()
                if len(line) > 0:
                    words.extend(WordUtils.split(line))
        return WordUtils.removeStopWords(words)

    @staticmethod
    def removeStopWords(words):
        # Stop words are listed one per line in stopwords.dic.
        with open('stopwords.dic', encoding='utf-8') as file:
            stopwords = set(line.strip() for line in file if len(line.strip()) > 0)
        return [word for word in words
                if word not in stopwords and len(word.strip()) > 0]

    @staticmethod
    def mergeAndRemoveRepeat(w1, w2):
        # Union of the two word lists with duplicates removed.
        return list(set(w1) | set(w2))
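As a quick sanity check of the segmenter (this assumes jieba is installed; the exact token boundaries depend on jieba's dictionary version):

tokens = WordUtils.split('这里只是记录用Python语言实现')
print(tokens)  # token boundaries vary with the jieba dictionary

Note that splitFile additionally expects a stopwords.dic file in the working directory.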
def testSimilarity():
    path = r'D:\resources\chinese'
    docs = DocHelper.genDocs(path)
    DocHelper.calculateTFIDF(docs)
    DocHelper.calculateSimilar(docs)
    for doc in docs:
        print('----------')
        for similarity in doc.getSimilarities():
            print('%s-%s-%s' % (similarity.getName1(),
                                similarity.getName2(),
                                similarity.getCosine()))

if __name__ == '__main__':
    testSimilarity()