For background on feature selection, see the following notes:
Data Mining Notes - Feature Selection - Chi-square Test (CHI)
Data Mining Notes - Feature Selection - Information Gain
Data Mining Notes - Feature Selection - Expected Cross Entropy
Data Mining Notes - Feature Selection - Mutual Information
Data Mining Notes - Feature Selection - Genetic Algorithm
Data Mining Notes - Feature Selection - Overall Summary
The project source contains both Java and Python implementations; only the Python implementation is listed here:
Code repository: https://github.com/fighting-one-piece/repository-datamining.git
# Document container: holds a document's name, category, segmented words,
# TF-IDF weights, CHI values and similarity results
class Doc:
    def __init__(self, name):
        self._name = name
    def setName(self, name):
        self._name = name
    def getName(self):
        return self._name
    def setCategory(self, category):
        self._category = category
    def getCategory(self):
        return self._category
    def setWords(self, words):
        self._words = words
    def getWords(self):
        return self._words
    def setTfidfWords(self, tfidfWords):
        self._tfidfWords = tfidfWords
    def getTfidfWords(self):
        return self._tfidfWords
    # Words of the document sorted by TF-IDF in descending order
    def getSortedTfidfWords(self):
        return sorted(self._tfidfWords.items(), key=lambda i: i[1], reverse=True)
    def setCHIWords(self, chiWords):
        self._chiWords = chiWords
    def getCHIWords(self):
        return self._chiWords
    def setSimilarities(self, similarities):
        self._similarities = similarities
    def getSimilarities(self):
        return self._similarities
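As a quick orientation, here is a minimal, hypothetical sketch of how a Doc is populated; in the real pipeline the objects are created by DocHelper.genDocs below, with the parent folder name as the category and WordUtils.splitFile supplying the word list.

# Hypothetical example: building a Doc by hand (name, category and words are made up)
doc = Doc('news_001')                      # file name without extension
doc.setCategory('sports')                  # category = parent folder name
doc.setWords([u'比赛', u'球队', u'比赛'])   # segmented words, normally from WordUtils.splitFile
print('%s / %s / %d words' % (doc.getName(), doc.getCategory(), len(doc.getWords())))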
import os
import math

# Helper class for document operations
class DocHelper:
    # Collect all documents under a directory
    @staticmethod
    def genDocs(path):
        docs = []
        DocHelper.genDocsIterator(path, docs)
        return docs

    # Walk the directory recursively and build a Doc for every file
    @staticmethod
    def genDocsIterator(path, docs):
        if os.path.isdir(path):
            for subPathName in os.listdir(path):
                subPath = os.path.join(path, subPathName)
                DocHelper.genDocsIterator(subPath, docs)
        else:
            name = os.path.splitext(os.path.basename(path))[0]
            doc = Doc(name)
            doc.setCategory(os.path.basename(os.path.dirname(path)))
            doc.setWords(WordUtils.splitFile(path))
            docs.append(doc)

    # Whether the document contains the given word
    @staticmethod
    def docHasWord(doc, word):
        for dword in doc.getWords():
            if dword == word:
                return True
        return False

    # Word frequency statistics for one document
    @staticmethod
    def docWordsStatistics(doc):
        counts = {}
        for word in doc.getWords():
            count = counts.get(word)
            if count is None:
                count = 0
            counts[word] = count + 1
        return counts

    # Split the document set by category
    @staticmethod
    def docCategorySplit(docs):
        docSplits = {}
        for doc in docs:
            category = doc.getCategory()
            if category in docSplits:
                docSplits.get(category).append(doc)
            else:
                docSplits[category] = [doc]
        return docSplits

    # Top-N words of a document ranked by TF-IDF
    @staticmethod
    def docTopNWords(doc, n):
        sortedWords = DocHelper.sortWordValueMap(doc.getTfidfWords())
        words = []
        for item in sortedWords[0:n]:
            words.append(item[0])
        return words

    # Term-count vector of a document over the given word list
    @staticmethod
    def docWordsVector(doc, words):
        vector = []
        docWords = DocHelper.docWordsStatistics(doc)
        for word in words:
            count = docWords.get(word)
            if count is None:
                vector.append(0)
            else:
                vector.append(count)
        return vector

    # Split the set into documents of the given category and all others
    @staticmethod
    def categorySplit(category, docs):
        belongDocs = []
        nobelongDocs = []
        for doc in docs:
            if category == doc.getCategory():
                belongDocs.append(doc)
            else:
                nobelongDocs.append(doc)
        return belongDocs, nobelongDocs

    # Number of documents belonging to the given category
    @staticmethod
    def categoryStatistics(category, docs):
        count = 0
        for doc in docs:
            if category == doc.getCategory():
                count = count + 1
        return count

    # Number of documents of the given category that contain the word
    @staticmethod
    def categoryWordStatistics(category, word, docs):
        count = 0
        for doc in docs:
            if category == doc.getCategory() and \
                DocHelper.docHasWord(doc, word):
                count = count + 1
        return count

    # Split the set into documents that contain the word and those that do not
    @staticmethod
    def wordDocsSplit(word, docs):
        belongDocs = []
        nobelongDocs = []
        for doc in docs:
            flag = False
            for dword in doc.getWords():
                if word == dword:
                    flag = True
                    belongDocs.append(doc)
                    break
            if flag == False:
                nobelongDocs.append(doc)
        return belongDocs, nobelongDocs

    # Number of documents in the set that contain the word
    @staticmethod
    def wordInDocsStatistics(word, docs):
        count = 0
        for doc in docs:
            if DocHelper.docHasWord(doc, word):
                count += 1
        return count

    # Number of documents in the set that do not contain the word
    @staticmethod
    def wordNotInDocsStatistics(word, docs):
        count = 0
        for doc in docs:
            if not DocHelper.docHasWord(doc, word):
                count += 1
        return count

    # Among documents containing the word, the fraction that belong to the category
    @staticmethod
    def wordCategoryInDocsPercent(word, category, docs):
        sumWord = 0
        sumCategory = 0
        for doc in docs:
            if DocHelper.docHasWord(doc, word):
                sumWord += 1
                if category == doc.getCategory():
                    sumCategory += 1
        return float(sumCategory) / sumWord

    # Term frequency of each word in one document
    @staticmethod
    def calculateTF(doc):
        tf = {}
        for word in doc.getWords():
            if word in tf:
                tf[word] = tf.get(word) + 1
            else:
                tf[word] = 1
        return tf

    # Compute TF-IDF for every document in the set
    @staticmethod
    def calculateTFIDF(docs):
        docTotalCount = float(len(docs))
        for doc in docs:
            wordTotalCount = len(doc.getWords())
            tfidfWords = {}
            docWords = DocHelper.docWordsStatistics(doc)
            for word in docWords.keys():
                wordCount = docWords.get(word)
                tf = float(wordCount) / wordTotalCount
                docCount = DocHelper.wordInDocsStatistics(word, docs) + 1
                if docCount > docTotalCount:
                    docCount = docTotalCount
                idf = math.log(docTotalCount / docCount)
                tfidfWords[word] = tf * idf
            doc.setTfidfWords(tfidfWords)

    # Chi-square (CHI) feature selection: chi value of each document's words
    # against the document's category
    @staticmethod
    def calculateCHI(docs):
        for doc in docs:
            chiWords = {}
            words = doc.getWords()
            belongDocs, nobelongDocs = DocHelper.categorySplit(
                doc.getCategory(), docs)
            for word in words:
                a = DocHelper.wordInDocsStatistics(word, belongDocs)
                b = DocHelper.wordInDocsStatistics(word, nobelongDocs)
                c = DocHelper.wordNotInDocsStatistics(word, belongDocs)
                d = DocHelper.wordNotInDocsStatistics(word, nobelongDocs)
                chiWords[word] = float((a * d - b * c) ** 2) / ((a + b) * (c + d))
            doc.setCHIWords(chiWords)

    # Information gain (IG) of every word over the document set
    @staticmethod
    def calculateInformationGain(docs):
        docTotalCount = len(docs)
        splits = DocHelper.docCategorySplit(docs)
        categories = []
        pcSum = 0
        for item in splits.items():
            categories.append(item[0])
            categoryCount = float(len(item[1]))
            pc = categoryCount / docTotalCount
            pcSum += pc * (math.log(pc) / math.log(2))
        words = []
        for doc in docs:
            words += [i for i in doc.getWords()]
        # deduplicate so each word's information gain is computed only once
        words = set(words)
        wordDict = {}
        for word in words:
            belongDocs, nobelongDocs = DocHelper.wordDocsSplit(word, docs)
            wordInDocsCount = len(belongDocs)
            wordNotInDocsCount = len(nobelongDocs)
            pctSum = 0
            pcntSum = 0
            for category in categories:
                ctCount = len(DocHelper.categorySplit(category, belongDocs)[0])
                pct = float(ctCount) / wordInDocsCount
                if pct != 0:
                    pctSum += pct * (math.log(pct) / math.log(2))
                cntCount = len(DocHelper.categorySplit(category, nobelongDocs)[0])
                if cntCount != 0:
                    pcnt = float(cntCount) / wordNotInDocsCount
                    if pcnt != 0:
                        pcntSum += pcnt * (math.log(pcnt) / math.log(2))
            pt = float(wordInDocsCount) / docTotalCount
            pnt = float(wordNotInDocsCount) / docTotalCount
            ig = -pcSum + pt * pctSum + pnt * pcntSum
            wordDict[word] = ig
        return DocHelper.sortWordValueMap(wordDict)

    # Expected cross entropy (KL) of every word over the document set
    @staticmethod
    def calculateKL(docs):
        docTotalCount = len(docs)
        allWords = []
        categories = []
        cateToCount = {}
        wordToCount = {}
        for doc in docs:
            cate = doc.getCategory()
            categories.append(cate)
            cateCount = cateToCount.get(cate)
            if cateCount is None:
                cateToCount[cate] = 1
            else:
                cateToCount[cate] = cateCount + 1
            words = doc.getWords()
            for word in words:
                allWords.append(word)
                count = wordToCount.get(word)
                if count is None:
                    wordToCount[word] = 1
                else:
                    wordToCount[word] = count + 1
        allWords = set(allWords)
        categories = set(categories)
        wordDict = {}
        word_len = len(allWords)
        for word in allWords:
            pt = float(wordToCount.get(word)) / word_len
            klSum = 0
            cd = 0
            dd = 0
            nt = DocHelper.wordInDocsStatistics(word, docs)
            for category in categories:
                cateCount = cateToCount.get(category)
                pc = float(cateCount) / docTotalCount
                pct = DocHelper.wordCategoryInDocsPercent(word, category, docs)
                # skip zero terms: p * log(p / q) -> 0 as p -> 0
                if pct > 0:
                    klSum += pct * math.log(pct / pc)
                nct = DocHelper.categoryWordStatistics(category, word, docs)
                cd += float(nct) / nt
                dd += float(nct) / cateCount
            wordDict[word] = cd * dd * pt * klSum
        return DocHelper.sortWordValueMap(wordDict)

    # Pairwise cosine similarity between documents, using each document's
    # top-20 TF-IDF words; DocSimilarity and DistanceUtils are defined
    # elsewhere in the project source
    @staticmethod
    def calculateSimilar(docs):
        for doc in docs:
            topWords = DocHelper.docTopNWords(doc, 20)
            similarities = []
            for odoc in docs:
                otopWords = DocHelper.docTopNWords(odoc, 20)
                words = WordUtils.mergeAndRemoveRepeat(topWords, otopWords)
                v1 = DocHelper.docWordsVector(doc, words)
                v2 = DocHelper.docWordsVector(odoc, words)
                cosine = DistanceUtils.cosine(v1, v2)
                similarity = DocSimilarity()
                similarity.setName1(doc.getName())
                similarity.setName2(odoc.getName())
                similarity.setVector1(v1)
                similarity.setVector2(v2)
                similarity.setCosine(cosine)
                similarities.append(similarity)
            doc.setSimilarities(similarities)

    # Sort a {word: value} dict by value in descending order
    @staticmethod
    def sortWordValueMap(wordValueMap):
        return sorted(wordValueMap.items(), key=lambda i: i[1], reverse=True)
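For readers following the code above, the three selectors reduce to the quantities below, written out from calculateTFIDF, calculateCHI and calculateInformationGain. Here N is the number of documents, n_w the number of documents containing word w, and a, b, c, d the contingency counts: same-category documents with w, other-category documents with w, same-category documents without w, other-category documents without w. DistanceUtils.cosine, defined elsewhere in the project, is assumed to be the standard cosine.

\mathrm{tfidf}(w,d) = \frac{\mathrm{count}(w,d)}{|d|}\cdot \log\frac{N}{\min(n_w+1,\ N)}

\chi^2(w,c) \propto \frac{(ad-bc)^2}{(a+b)(c+d)}

\mathrm{IG}(w) = -\sum_{c} P(c)\log_2 P(c) + P(w)\sum_{c} P(c\mid w)\log_2 P(c\mid w) + P(\bar{w})\sum_{c} P(c\mid \bar{w})\log_2 P(c\mid \bar{w})

\cos(v_1, v_2) = \frac{v_1\cdot v_2}{\lVert v_1\rVert\,\lVert v_2\rVert}

The chi-square value in the code drops the N, (a+c) and (b+d) factors of the full statistic; they are constant for a fixed category, so the ranking of words within one category is unchanged.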
import jieba as ws

# Word segmentation utilities
class WordUtils:
    # Segment a piece of text into words
    @staticmethod
    def split(input):
        seg_list = ws.cut(input, cut_all=False)
        words = []
        for word in seg_list:
            words.append(word)
        return words

    # Segment a whole file into words
    @staticmethod
    def splitFile(path):
        file = open(path)
        words = []
        for line in file.readlines():
            line = line.strip()
            if len(line) > 0:
                for w in WordUtils.split(line):
                    words.append(w)
        file.close()
        return WordUtils.removeStopWords(words)

    # Remove stop words listed in the stop-word file
    @staticmethod
    def removeStopWords(words):
        file = open("stopwords.dic")
        stopwords = []
        for line in file.readlines():
            line = line.strip()
            if len(line) > 0:
                stopwords.append(line)
        file.close()
        rwords = []
        for word in words:
            flag = True
            for stopword in stopwords:
                #if word.encode('utf-8') == stopword.encode('utf-8'):
                if word == stopword:
                    flag = False
                    break
            if flag and len(word.strip()) > 0:
                rwords.append(word)
        return rwords

    # Merge two word lists and remove duplicates
    @staticmethod
    def mergeAndRemoveRepeat(w1, w2):
        all = [i1 for i1 in w1]
        all += [i2 for i2 in w2]
        return [i for i in set(all)]
        # The following also works:
        # all = set(w1) | set(w2)
        # return [i for i in all]
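A small, hypothetical sketch of the segmentation helpers (the sentence is invented; jieba.cut returns a generator of tokens, and removeStopWords expects a stopwords.dic file in the working directory):

# Hypothetical example: segment one sentence and drop stop words
sentence = u'数据挖掘中的特征选择方法有很多种'
tokens = WordUtils.split(sentence)            # jieba tokens, e.g. [u'数据', u'挖掘', ...]
print('/'.join(tokens))
cleaned = WordUtils.removeStopWords(tokens)   # removes tokens listed in stopwords.dic
print('/'.join(cleaned))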
def testTFIDF():
    path = r'D:\resources\test'
    docs = DocHelper.genDocs(path)
    DocHelper.calculateTFIDF(docs)
    for doc in docs:
        print('----------')
        tf = DocHelper.calculateTF(doc)
        tfidf = doc.getTfidfWords()
        for item in DocHelper.sortWordValueMap(tf)[0:20]:
            print('%s-%s-%s' % (item[0], item[1], tfidf.get(item[0])))

def testSimilarity():
    path = r'D:\resources\test'
    docs = DocHelper.genDocs(path)
    DocHelper.calculateTFIDF(docs)
    DocHelper.calculateSimilar(docs)
    for doc in docs:
        print('----------')
        for similarity in doc.getSimilarities():
            print('%s-%s-%s' % (similarity.getName1(),
                similarity.getName2(), similarity.getCosine()))

def testCHI():
    path = r'D:\resources\test'
    docs = DocHelper.genDocs(path)
    DocHelper.calculateCHI(docs)
    for doc in docs:
        print('----------')
        for item in DocHelper.sortWordValueMap(doc.getCHIWords())[0:10]:
            print('%s-%s' % (item[0], item[1]))

def testInformationGain():
    path = r'D:\resources\test'
    docs = DocHelper.genDocs(path)
    wordDict = DocHelper.calculateInformationGain(docs)
    for item in wordDict[0:30]:
        print('%s-%s' % (item[0], item[1]))

def testKL():
    path = r'D:\resources\test'
    docs = DocHelper.genDocs(path)
    wordDict = DocHelper.calculateKL(docs)
    for item in wordDict[0:30]:
        print('%s-%s' % (item[0], item[1]))

if __name__ == '__main__':
    print('-----TFIDF-----')
    testTFIDF()
    print('-----CHI-----')
    testCHI()
    print('-----IG-----')
    testInformationGain()
    print('-----KL-----')
    testKL()
    print('----------')
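The test entry points assume a corpus laid out as one sub-folder per category under D:\resources\test, with each text file being a single document; genDocs takes the parent folder name as the category and the file name (without extension) as the document name, and removeStopWords additionally needs a stopwords.dic file in the working directory. A hypothetical layout (folder and file names are made up) could be:

D:\resources\test
    sports\
        match_report.txt
        league_news.txt
    finance\
        stock_daily.txt
        market_review.txt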