Data Mining Notes - Feature Selection - Algorithm Implementation (1)

For background on feature selection, see the following posts:

Data Mining Notes - Feature Selection - Chi-Square Test

Data Mining Notes - Feature Selection - Information Gain

Data Mining Notes - Feature Selection - Expected Cross Entropy

Data Mining Notes - Feature Selection - Mutual Information

Data Mining Notes - Feature Selection - Genetic Algorithm

Data Mining Notes - Feature Selection - Overall Summary

The project source contains both Java and Python implementations; only the Python implementation is listed here:

Code repository: https://github.com/fighting-one-piece/repository-datamining.git

import os
import math

class Doc:
    
    def __init__(self, name):
        self._name = name
     
    def setName(self, name):
        self._name = name
    
    def getName(self):
        return self._name
    
    def setCategory(self, category):
        self._category = category
        
    def getCategory(self):
        return self._category
        
    def setWords(self, words):
        self._words = words
        
    def getWords(self):
        return self._words
    
    def setTfidfWords(self, tfidfWords):
        self._tfidfWords = tfidfWords
        
    def getTfidfWords(self):
        return self._tfidfWords
    
    def getSortedTfidfWords(self):
        return sorted(self._tfidfWords.items(), key=lambda i : i[1], reverse=True)
    
    def setCHIWords(self, chiWords):
        self._chiWords = chiWords
        
    def getCHIWords(self):
        return self._chiWords

    def setSimilarities(self, similarities):
        self._similarities = similarities
        
    def getSimilarities(self):
        return self._similarities

#Helper class for document operations
class DocHelper:
    
    #Load all documents under a directory
    @staticmethod
    def genDocs(path):
        docs = []
        DocHelper.genDocsIterator(path, docs)
        return docs
    
    #Recursively walk the directory tree and collect its documents
    @staticmethod
    def genDocsIterator(path, docs):
        if os.path.isdir(path):
            for subPathName in os.listdir(path):
                subPath = os.path.join(path, subPathName)
                DocHelper.genDocsIterator(subPath, docs)
        else:
            name = os.path.splitext(os.path.basename(path))[0]
            doc = Doc(name)
            #the category label is the name of the file's parent directory
            doc.setCategory(os.path.basename(os.path.dirname(path)))
            doc.setWords(WordUtils.splitFile(path))
            docs.append(doc)
    
    #Whether the document contains the given word
    @staticmethod
    def docHasWord(doc, word):
        for dword in doc.getWords():
            if dword == word:
                return True
        return False
    
    #Word frequency statistics for a document
    @staticmethod
    def docWordsStatistics(doc):
        counts = {}
        for word in doc.getWords():
            counts[word] = counts.get(word, 0) + 1
        return counts
    
    #Group the document set by document category
    @staticmethod
    def docCategorySplit(docs):
        docSplits = {}
        for doc in docs:
            category = doc.getCategory()
            if category in docSplits:
                cDocs = docSplits.get(category)
                cDocs.append(doc)
            else :
                cDocs = [doc]
                docSplits[category] = cDocs
        return docSplits
    
    #Top N words of a document ranked by TF-IDF
    @staticmethod
    def docTopNWords(doc, n):
        sortedWords = DocHelper.sortWordValueMap(doc.getTfidfWords())
        words = []
        for item in sortedWords[0:n]:
            words.append(item[0])
        return words
                    
    #Vectorize a document: term counts over the given word list
    @staticmethod
    def docWordsVector(doc, words):
        vector = []
        docWords = DocHelper.docWordsStatistics(doc)
        for word in words:
            count = docWords.get(word)
            if count is None:
                vector.append(0)
            else :
                vector.append(count)
        return vector
    
    #Split the document set into same-category and other-category documents
    @staticmethod
    def categorySplit(category, docs):
        belongDocs = []
        nobelongDocs = []
        for doc in docs:
            if category == doc.getCategory():
                belongDocs.append(doc)
            else:
                nobelongDocs.append(doc)
        return belongDocs, nobelongDocs
    
    #Count documents in the set belonging to the given category
    @staticmethod
    def categoryStatistics(category, docs):
        sum = 0
        for doc in docs:
            if category == doc.getCategory():
                sum = sum + 1
        return sum

    #Count documents of the given category that contain the word
    @staticmethod
    def categoryWordStatistics(category, word, docs):
        sum = 0
        for doc in docs:
            if category == doc.getCategory() and \
                DocHelper.docHasWord(doc, word):
                sum = sum + 1
        return sum
    
    #Split the document set into documents containing the word and documents not containing it
    @staticmethod
    def wordDocsSplit(word, docs):
        belongDocs = []
        nobelongDocs = []
        for doc in docs:
            flag = False
            for dword in doc.getWords():
                if word == dword:
                    flag = True
                    belongDocs.append(doc)
                    break
            if not flag:
                nobelongDocs.append(doc)
        return belongDocs, nobelongDocs
    
    #Count documents in the set that contain the word
    @staticmethod
    def wordInDocsStatistics(word, docs):
        sum = 0
        for doc in docs:
            if DocHelper.docHasWord(doc, word):
                sum += 1
        return sum

    #Count documents in the set that do not contain the word
    @staticmethod
    def wordNotInDocsStatistics(word, docs):
        sum = 0
        for doc in docs:
            if not DocHelper.docHasWord(doc, word):
                sum += 1
        return sum
    
    #Ratio of documents belonging to the category to documents containing the word
    @staticmethod
    def wordCategoryInDocsPercent(word, category, docs):
        sumWord = 0
        sumCategory = 0
        for doc in docs:
            if DocHelper.docHasWord(doc, word):
                sumWord += 1
            if category == doc.getCategory():
                sumCategory += 1
        return float(sumCategory) / sumWord
    
    #Raw term-frequency counts for a document
    @staticmethod
    def calculateTF(doc):
        tf = {}
        for word in doc.getWords():
            if word in tf:
                tf[word] = tf.get(word) + 1
            else:
                tf[word] = 1
        return tf
    
    #Compute TF-IDF weights for every document in the set
    @staticmethod
    def calculateTFIDF(docs):
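        #For each word in a document: tf = word count / total words in the document,
        #idf = log(N / df) where N is the number of documents and df is the number of
        #documents containing the word plus one (clamped to N), and tfidf = tf * idf.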
        docTotalCount = float(len(docs))
        for doc in docs:
            wordTotalCount = len(doc.getWords())
            tfidfWords = {}
            docWords = DocHelper.docWordsStatistics(doc)
            for word in docWords.keys():
                wordCount = docWords.get(word)
                tf = float(wordCount) / wordTotalCount
                docCount = DocHelper.wordInDocsStatistics(word, docs) + 1
                if docCount > docTotalCount:
                    docCount = docTotalCount
                idf = math.log(docTotalCount / docCount)
                tfidf = tf * idf
                tfidfWords[word] = tfidf
            doc.setTfidfWords(tfidfWords)
            
    #Chi-square (CHI) feature selection: compute the chi value between each word in a document and the document's category
    @staticmethod
    def calculateCHI(docs):
        docTotalCount = len(docs)
        for doc in docs:
            chiWords = {}
            words = doc.getWords()
            belongDocs,nobelongDocs = DocHelper.categorySplit(\
                doc.getCategory(), docs)
            for word in words:
                a = DocHelper.wordInDocsStatistics(word, belongDocs)
                b = DocHelper.wordInDocsStatistics(word, nobelongDocs)
                c = DocHelper.wordNotInDocsStatistics(word, belongDocs)
                d = DocHelper.wordNotInDocsStatistics(word, nobelongDocs)
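                #a: same-category docs containing the word, b: other-category docs containing it,
                #c: same-category docs lacking the word,    d: other-category docs lacking it.
                #Simplified chi-square statistic (a*d - b*c)^2 / ((a+b)*(c+d)): the total count N
                #and the marginals (a+c), (b+d) are constant for a fixed category, so they are
                #dropped since they do not affect the per-category ranking.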
                x = float((a*d-b*c)**2) / ((a+b)*(c+d))
                chiWords[word] = x
            doc.setCHIWords(chiWords)
            
    #Information gain feature selection: compute the information gain of each word over the document set
    @staticmethod
    def calculateInformationGain(docs):
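        #IG(t) = H(C) - H(C|t)
        #      = -sum_c P(c)*log2 P(c)
        #        + P(t)  * sum_c P(c|t)*log2 P(c|t)
        #        + P(~t) * sum_c P(c|~t)*log2 P(c|~t)
        #pcSum below accumulates sum_c P(c)*log2 P(c), so the final score per word is
        #ig = -pcSum + pt * pctSum + pnt * pcntSum.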
        docTotalCount = len(docs)
        splits = DocHelper.docCategorySplit(docs)
        categories = []
        pcSum = 0
        for item in splits.items():
            categories.append(item[0])
            categoryCount = float(len(item[1]))
            pc = categoryCount / docTotalCount
            pcSum += pc * (math.log(pc) / math.log(2))
        words = []
        for doc in docs:
            words += [i for i in doc.getWords()]
        wordDict = {}
        for word in words:
            belongDocs,nobelongDocs = DocHelper.wordDocsSplit(word, docs)
            wordInDocsCount = len(belongDocs)
            wordNotInDocsCount = len(nobelongDocs)
            pctSum = 0
            pcntSum = 0
            for category in categories:
                ctCount = len(DocHelper.categorySplit(category, belongDocs)[0])
                pct = float(ctCount) / wordInDocsCount
                if pct != 0:
                    pctSum += pct * (math.log(pct) / math.log(2))
                cntCount = len(DocHelper.categorySplit(category, nobelongDocs)[0])
                if cntCount != 0:
                    pcnt = float(cntCount) / wordNotInDocsCount
                    if pcnt != 0:
                        pcntSum += pcnt * (math.log(pcnt) / math.log(2))
            pt = float(wordInDocsCount) / docTotalCount
            pnt = float(wordNotInDocsCount) / docTotalCount
            ig = -pcSum + pt * pctSum + pnt * pcntSum
            wordDict[word] = ig
        return DocHelper.sortWordValueMap(wordDict)
    
    #Expected cross entropy feature selection: compute the score of each word over the document set
    @staticmethod
    def calculateKL(docs):
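        #Score for a word t: cd * dd * pt * sum_c pct * ln(pct / P(c)), where pt is the word's
        #frequency weight, pct comes from wordCategoryInDocsPercent, nct is the number of
        #documents of category c containing t, nt the number of documents containing t, and
        #cd, dd are concentration weights built from nct, nt and the category sizes.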
        docTotalCount = len(docs)
        allWords = []
        categories = []
        cateToCount = {}
        wordToCount = {}
        for doc in docs:
            cate = doc.getCategory()
            categories.append(cate)
            cateCount = cateToCount.get(cate)
            if cateCount is None:
                cateToCount[cate] = 1
            else:
                cateToCount[cate] = cateCount + 1 
            words = doc.getWords()
            for word in words:
                allWords.append(word)
                count = wordToCount.get(word)
                if count is None:
                    wordToCount[word] = 1
                else :
                    wordToCount[word] = count + 1
        allWords = set(allWords)
        categories = set(categories)
        wordDict = {}
        word_len = len(allWords)
        for word in allWords:
            pt = float(wordToCount.get(word)) / word_len
            sum = 0
            cd = 0
            dd = 0
            nt = DocHelper.wordInDocsStatistics(word, docs)
            for category in categories:
                cateCount = cateToCount.get(category)
                pc = float(cateCount) / docTotalCount
                pct = DocHelper.wordCategoryInDocsPercent(word, category, docs)
                sum += pct * math.log(pct / pc)
                nct = DocHelper.categoryWordStatistics(category, word, docs)
                cd += float(nct) / nt
                dd += float(nct) / cateCount
            wordDict[word] = cd * dd * pt * sum
        return DocHelper.sortWordValueMap(wordDict)    
                
            
    #Compute pairwise similarities between documents in the set
    @staticmethod
    def calculateSimilar(docs):
        for doc in docs:
            topWords = DocHelper.docTopNWords(doc, 20)
            similarities = []
            for odoc in docs:
                otopWords = DocHelper.docTopNWords(odoc, 20)
                words = WordUtils.mergeAndRemoveRepeat(topWords, otopWords)
                v1 = DocHelper.docWordsVector(doc, words)
                v2 = DocHelper.docWordsVector(odoc, words)
                cosine = DistanceUtils.cosine(v1,v2)
                similarity = DocSimilarity()
                similarity.setName1(doc.getName())
                similarity.setName2(odoc.getName())
                similarity.setVector1(v1)
                similarity.setVector2(v2)
                similarity.setCosine(cosine)
                similarities.append(similarity)
            doc.setSimilarities(similarities)
    
    #Sort a word->value dict by value in descending order
    @staticmethod
    def sortWordValueMap(wordValueMap):
        results = sorted(wordValueMap.items(), key=lambda i : i[1], reverse=True)
        return results


import jieba as ws

#Word segmentation helper class
class WordUtils:

    #Segment a piece of text into words
    @staticmethod
    def split(text):
        seg_list = ws.cut(text, cut_all=False)
        words = []
        for word in seg_list:
            words.append(word)
        return words
    
    #Segment a file into words
    @staticmethod
    def splitFile(path):
        f = open(path)
        words = []
        for line in f.readlines():
            line = line.strip()
            if len(line) > 0:
                for w in WordUtils.split(line):
                    words.append(w)
        f.close()
        return WordUtils.removeStopWords(words)
    
    #Remove stopwords (read from the stopword file) from a word list
    @staticmethod
    def removeStopWords(words):
        f = open("stopwords.dic")
        stopwords = []
        for line in f.readlines():
            line = line.strip()
            if len(line) > 0:
                stopwords.append(line)
        f.close()
        rwords = []
        for word in words:
            #comparison assumes the words and the stopword file use the same encoding
            if word not in stopwords and len(word.strip()) > 0:
                rwords.append(word)
        return rwords
    
    #Merge two word lists and remove duplicates
    @staticmethod
    def mergeAndRemoveRepeat(w1, w2):
        all = [i1 for i1 in w1]
        all += [i2 for i2 in w2]
        return [i for i in set(all)]
        #the following also works:
        #all = set(w1) | set(w2)
        #return [i for i in all]
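
The similarity code above also uses a DocSimilarity container and a DistanceUtils.cosine function that are defined elsewhere in the project (see the repository linked above); they are not part of this listing. As a rough idea of what calculateSimilar expects from them, a minimal stand-in consistent with the calls it makes could look like the sketch below; the actual project classes may differ.

import math

#Hypothetical stand-in: holds the similarity between a pair of documents
class DocSimilarity:

    def setName1(self, name1):
        self._name1 = name1

    def getName1(self):
        return self._name1

    def setName2(self, name2):
        self._name2 = name2

    def getName2(self):
        return self._name2

    def setVector1(self, vector1):
        self._vector1 = vector1

    def getVector1(self):
        return self._vector1

    def setVector2(self, vector2):
        self._vector2 = vector2

    def getVector2(self):
        return self._vector2

    def setCosine(self, cosine):
        self._cosine = cosine

    def getCosine(self):
        return self._cosine

#Hypothetical stand-in: distance helpers
class DistanceUtils:

    #Cosine similarity between two equal-length numeric vectors
    @staticmethod
    def cosine(v1, v2):
        dot = sum(a * b for a, b in zip(v1, v2))
        norm1 = math.sqrt(sum(a * a for a in v1))
        norm2 = math.sqrt(sum(b * b for b in v2))
        if norm1 == 0 or norm2 == 0:
            return 0.0
        return dot / (norm1 * norm2)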

Test cases:

def testTFIDF():
    path = r'D:\resources\test'
    docs = DocHelper.genDocs(path)
    DocHelper.calculateTFIDF(docs)
    for doc in docs:
        print '----------'
        tf = DocHelper.calculateTF(doc)
        tfidf = doc.getTfidfWords()
        for item in DocHelper.sortWordValueMap(tf)[0:20]:
            print '%s-%s-%s' %(item[0],item[1],tfidf.get(item[0]))

def testSimilarity(): 
    path = r'D:\resources\test'
    docs = DocHelper.genDocs(path)
    DocHelper.calculateTFIDF(docs)
    DocHelper.calculateSimilar(docs)
    for doc in docs:
        print '----------'
        for similarity in doc.getSimilarities():
            print '%s-%s-%s' %(similarity.getName1(),\
                    similarity.getName2(), similarity.getCosine())
            
def testCHI():
    path = r'D:\resources\test'
    docs = DocHelper.genDocs(path)
    DocHelper.calculateCHI(docs)
    for doc in docs:
        print '----------'
        for item in DocHelper.sortWordValueMap(doc.getCHIWords())[0:10]:
            print '%s-%s' %(item[0],item[1])
                
def testInformationGain():
    path = r'D:\resources\test'
    docs = DocHelper.genDocs(path)
    wordDict = DocHelper.calculateInformationGain(docs)
    for item in wordDict[0:30]:
        print '%s-%s' %(item[0],item[1])
    
def testKL():
    path = r'D:\resources\test'
    docs = DocHelper.genDocs(path)
    wordDict = DocHelper.calculateKL(docs)
    for item in wordDict[0:30]:
        print '%s-%s' %(item[0],item[1])

if __name__ == '__main__':
    print '-----TFIDF-----' 
    testTFIDF()
    print '-----CHI-----' 
    testCHI()
    print '-----IG-----' 
    testInformationGain()
    print '-----KL-----' 
    testKL()
    print '----------' 



