文本分类学习笔记(3)- 特征提取

特征提取TF-IDF值的两种方法,结果保存在.mat文件中:
1、人工分词(按空白切分),每类取词频最高的3000个词后合并

#人工分词,每类取3000合并
def load_files(directory,prefix=None,postfix=None):
    """Walk *directory* and build one word-count dict per selected file.

    Files are selected by *postfix* (checked first) or, failing that, by
    *prefix*; with neither given every file is selected.  Directories and
    files are visited in sorted order so that the position of each class
    directory (and therefore its numeric label) is reproducible across
    platforms.

    Tokens are obtained by whitespace splitting; purely numeric tokens are
    dropped and the rest are lower-cased.

    Returns a tuple ``(articallist, classlen, filelen)``:

    * ``articallist`` -- list of ``{word: count}`` dicts, one per file.
    * ``classlen``    -- number of selected files per visited directory, in
      visiting order (index 0 is *directory* itself).  It always has at
      least 11 entries and grows if more directories are found.
    * ``filelen``     -- number of counted (non-numeric) tokens per file.
    """
    files_list = []
    # 11 slots are pre-allocated because callers index classlen[1..10].
    classlen = [0 for _ in range(11)]
    for dir_index, (root, sub_dirs, files) in enumerate(os.walk(directory)):
        # In-place sort steers os.walk (top-down) into a deterministic order.
        sub_dirs.sort()
        selected = 0
        for special_file in sorted(files):
            if postfix:
                keep = special_file.endswith(postfix)
            elif prefix:
                keep = special_file.startswith(prefix)
            else:
                keep = True
            if keep:
                files_list.append(os.path.join(root, special_file))
                selected += 1
        # Count only the files that were really selected so that classlen
        # stays aligned with files_list when a filter is active.
        if dir_index < len(classlen):
            classlen[dir_index] = selected
        else:
            classlen.append(selected)

    # Scan every file and build its word-count dictionary.
    articallist = []
    filelen = []
    for eachfile in files_list:
        counts = {}
        token_total = 0
        with open(eachfile, 'r') as file_object:
            for line in file_object:
                for word in line.split():
                    # Skip purely numeric tokens.
                    if word.isdigit():
                        continue
                    token_total += 1
                    word = word.lower()
                    counts[word] = counts.get(word, 0) + 1
        articallist.append(counts)
        filelen.append(token_total)

    # Debug output kept from the original: vocabulary size of the second
    # document, printed only when such a document exists.
    if len(articallist) > 1:
        print(len(articallist[1]))
    return articallist, classlen, filelen

#Load the stop-word list
def load_stop_en(filename):
    """Return the lines of *filename* as a list of stripped strings.

    Every line yields one entry (blank lines yield ``''``).  The file is
    closed before returning, even if reading fails.
    """
    with open(filename, 'r') as file_object:
        return [line.strip() for line in file_object]

#Remove stop words
def delet_stopword_en(stop_en_set, en_dict):
    """Delete every key listed in *stop_en_set* from *en_dict* in place.

    Keys that are not present are ignored.  Returns ``None``.
    """
    for key in stop_en_set:
        # pop() with a default replaces the Python-2-only has_key()/del pair.
        en_dict.pop(key, None)

#Compute the TF-IDF of one word in every document
def get_TFIDF(articallist, word):
    """Return the TF-IDF weight of *word* for each document.

    *articallist* is a list of ``{word: count}`` dicts.  For a document
    containing the word, TF is ``count / number of distinct words in that
    document``; IDF is ``log(number of documents / documents containing
    the word)``.  Documents without the word get ``0``.  If no document
    contains the word the result is all zeros.
    """
    num = len(articallist)
    weights = [0] * num
    doc_freq = 0
    for i, eachdict in enumerate(articallist):
        if word in eachdict:
            weights[i] = eachdict[word] / float(len(eachdict))
            doc_freq += 1
    if doc_freq == 0:
        # Word is absent everywhere: nothing to scale, avoid dividing by 0.
        return weights
    idf = math.log(num / float(doc_freq))
    return [tf * idf for tf in weights]

def updatex(dict1,dict2):
    """Merge *dict2* into *dict1* in place, adding values of shared keys.

    Keys only present in *dict2* are copied over unchanged.  Returns
    ``None``.
    """
    for key, value in dict2.items():
        if key in dict1:
            dict1[key] += value
        else:
            dict1[key] = value

def get_Mat(trainfilepath='training',testfilepath='test',stop_enname='en.txt',matfilename=r'D:\Py\SetMat.mat'):
    """Build TF-IDF feature matrices for train and test sets and save them.

    The training directory is expected to contain ten class
    sub-directories; ``classlen[1]`` .. ``classlen[10]`` returned by
    ``load_files`` give the number of documents per class and the classes
    are labelled 1..10 in that order.  For every class the 3000 most
    frequent non-stop words are selected and the per-class selections are
    merged into the final feature vocabulary.

    The result is written with ``io.savemat`` to *matfilename* under the
    keys ``'trainSet'``, ``'labeled'`` and ``'testSet'`` (rows are
    documents, columns are features).
    """
    # Load the training documents (the token totals are not used because
    # get_TFIDF normalises by the number of distinct words instead).
    articallist, classlen, _ = load_files(trainfilepath)
    stop_en_set = load_stop_en(stop_enname)

    # Numeric class labels 1..10, one per class directory.
    classlabel = [i + 1 for i in range(10)]
    labeled_names = [0 for _ in range(len(articallist))]
    classr = 0
    finaldict = {}
    for i in range(10):
        classl = classr
        classr += classlen[i + 1]
        labeled_names[classl:classr] = [classlabel[i]] * classlen[i + 1]
        tempdict = {}
        for eachdict in articallist[classl:classr]:
            delet_stopword_en(stop_en_set, eachdict)
            updatex(tempdict, eachdict)
        # Keep the 3000 most frequent words of this class ...
        top_words = sorted(tempdict.items(), key=itemgetter(1), reverse=True)[:3000]
        # ... and merge them into the global vocabulary (this merge was
        # missing, which left the vocabulary permanently empty).
        updatex(finaldict, dict(top_words))
    print('vector: %d' % len(finaldict))

    # Freeze the column order once so train and test matrices agree.
    features = list(finaldict)

    # One TF-IDF row per feature, then transpose: rows=files, cols=features.
    vectormat = array([get_TFIDF(articallist, each) for each in features]).transpose()

    articallist1, _, _ = load_files(testfilepath)
    # Apply the same stop-word removal as for the training documents so
    # that the TF normalisation is comparable between the two sets.
    for eachdict in articallist1:
        delet_stopword_en(stop_en_set, eachdict)
    vectormat1 = array([get_TFIDF(articallist1, each) for each in features]).transpose()

    data = {}
    data['trainSet'] = vectormat
    data['labeled'] = labeled_names
    data['testSet'] = vectormat1
    io.savemat(matfilename, data)

# Script entry point: when the file is executed directly, run get_Mat()
# with its default arguments.
if __name__ == '__main__':
    get_Mat()

2、nltk包分词,只保留字母(词干化代码已注释,暂未启用),取词频最高的8000词

#nltk包分词,添加词干化、只保留字母,取8000词
def load_files(directory,stop_enname='en.txt'):
    """Tokenise every file under *directory* and build the vocabulary.

    Directories and files are visited in sorted order so that the position
    of each class directory is reproducible across platforms.  Each file
    is tokenised with ``ie_preprocess`` and counted with ``array2dict``,
    which also accumulates the corpus-wide counts.

    The corpus-wide counts then have the stop words listed in
    *stop_enname* and the empty token removed, and are cut down to the
    8000 most frequent words.

    Returns a tuple ``(worddict, classlen, articallist)``:

    * ``worddict``    -- ``{word: corpus count}`` for the kept vocabulary.
    * ``classlen``    -- number of files per visited directory, in visiting
      order (index 0 is *directory* itself); at least 11 entries.
    * ``articallist`` -- list of per-file ``{word: count}`` dicts.
    """
    files_list = []
    # 11 slots are pre-allocated because callers index classlen[1..10].
    classlen = [0 for _ in range(11)]
    for dir_index, (root, sub_dirs, files) in enumerate(os.walk(directory)):
        # In-place sort steers os.walk (top-down) into a deterministic order.
        sub_dirs.sort()
        if dir_index < len(classlen):
            classlen[dir_index] = len(files)
        else:
            classlen.append(len(files))
        for special_file in sorted(files):
            files_list.append(str(os.path.join(root, special_file)))

    # Scan the files and build the per-file and corpus-wide dictionaries.
    worddict = {}
    articallist = []
    for eachfile in files_list:
        with open(eachfile, 'r') as file_object:
            article = file_object.read()
        articallist.append(array2dict(ie_preprocess(article), worddict))

    # Remove stop words from the corpus-wide vocabulary.
    stop_en_set = load_stop_en(stop_enname)
    delet_stopword_en(stop_en_set, worddict)
    # Tokens without any letter are counted under ''; drop that entry if it
    # exists (a plain ``del`` raised KeyError when it did not).
    worddict.pop('', None)
    # Keep the 8000 most frequent words.
    top_words = sorted(worddict.items(), key=itemgetter(1), reverse=True)[:8000]
    worddict = dict(top_words)
    print('%d %d' % (len(articallist), len(worddict)))
    return worddict, classlen, articallist

#Sentence splitting followed by word tokenisation
def ie_preprocess(document):
    """Return *document* as a list of sentences, each a list of tokens.

    Sentences come from ``nltk.sent_tokenize`` and the tokens of each
    sentence from ``nltk.word_tokenize``.
    """
    return [nltk.word_tokenize(sentence)
            for sentence in nltk.sent_tokenize(document)]

#Keep letters only
def OnlyChar(s,oth=''):
    """Return *s* with every character other than a-z / A-Z removed.

    The case of the kept letters is preserved.  *oth* is accepted for
    backward compatibility and is not used.

    Each character is tested directly against the allowed set, so cased
    non-ASCII characters (for example ``'É'``) are removed as well, and
    the string is processed in a single pass.
    """
    lowercase = 'abcdefghijklmnopqrstuvwxyz'
    allowed = frozenset(lowercase + lowercase.upper())
    return ''.join(ch for ch in s if ch in allowed)

#array to dict
def array2dict(arr,worddict):
    """Count the words of one document and update the corpus counts.

    *arr* is a list of token lists (one per sentence).  Each token is
    reduced with ``OnlyChar`` and lower-cased; tokens that are purely
    numeric after that step are skipped.  A token that ends up empty is
    counted under the key ``''``.

    *worddict* (corpus-wide ``{word: count}``) is updated in place.
    Returns the ``{word: count}`` dict of this document.
    """
    temp = {}
    for sentence in arr:
        for token in sentence:
            # NOTE: stemming with WordNet's morphy was left disabled in the
            # original code; only the letter filter is applied.
            word = OnlyChar(token)
            if str(word).isdigit():
                continue
            word = str(word).lower()
            worddict[word] = worddict.get(word, 0) + 1
            temp[word] = temp.get(word, 0) + 1
    return temp

#Load the stop-word list
def load_stop_en(filename):
    """Return the lines of *filename* as a list of stripped strings.

    Every line yields one entry (blank lines yield ``''``).  The file is
    closed before returning, even if reading fails.
    """
    with open(filename, 'r') as file_object:
        return [line.strip() for line in file_object]

#Remove stop words
def delet_stopword_en(stop_en_set, en_dict):
    """Delete every key listed in *stop_en_set* from *en_dict* in place.

    Keys that are not present are ignored.  Returns ``None``.
    """
    for key in stop_en_set:
        # pop() with a default replaces the Python-2-only has_key()/del pair.
        en_dict.pop(key, None)

#Compute the TF-IDF of one word in every document
def get_TFIDF(articallist, word):
    """Return the TF-IDF weight of *word* for each document.

    *articallist* is a list of ``{word: count}`` dicts.  For a document
    containing the word, TF is ``count / number of distinct words in that
    document``; IDF is ``log(number of documents / documents containing
    the word)``.  Documents without the word get ``0``.  If no document
    contains the word the result is all zeros.
    """
    num = len(articallist)
    weights = [0] * num
    doc_freq = 0
    for i, eachdict in enumerate(articallist):
        if word in eachdict:
            weights[i] = eachdict[word] / float(len(eachdict))
            doc_freq += 1
    if doc_freq == 0:
        # Word is absent everywhere: nothing to scale, avoid dividing by 0.
        return weights
    idf = math.log(num / float(doc_freq))
    return [tf * idf for tf in weights]

def _class_labels(classlen, total, n_classes=10):
    """Return *total* labels: 1..n_classes laid out by ``classlen[1:]``."""
    labels = [0 for _ in range(total)]
    stop = 0
    for class_id in range(1, n_classes + 1):
        start = stop
        stop += classlen[class_id]
        labels[start:stop] = [class_id] * classlen[class_id]
    return labels


def get_Mat(trainfilepath='training',testfilepath='test',stop_enname='en.txt',matfilename=r'D:\Py\SetMat1.mat'):
    """Build TF-IDF feature matrices for train and test sets and save them.

    The vocabulary comes from the training set (``load_files``); the same
    vocabulary, in the same column order, is used for the test set.  Both
    directories are expected to contain ten class sub-directories, whose
    document counts ``classlen[1]`` .. ``classlen[10]`` are turned into
    the labels 1..10.

    The result is written with ``io.savemat`` to *matfilename* under the
    keys ``'trainSet'``, ``'train_labeled'``, ``'testSet'`` and
    ``'test_labeled'`` (rows are documents, columns are features).
    """
    # Load the training files; stop_enname is forwarded so the parameter
    # is honoured instead of always using load_files' default.
    worddict, classlen, articallist = load_files(trainfilepath, stop_enname)
    # Freeze the column order once so train and test matrices agree.
    features = list(worddict)

    # One TF-IDF row per feature, then transpose: rows=files, cols=features.
    vectormat = array([get_TFIDF(articallist, each) for each in features]).transpose()
    labeled_names = _class_labels(classlen, len(articallist))

    # Only the per-file dicts and class sizes of the test set are needed;
    # its own vocabulary is discarded.
    _, classlen1, articallist1 = load_files(testfilepath, stop_enname)
    vectormat1 = array([get_TFIDF(articallist1, each) for each in features]).transpose()
    labeled_names1 = _class_labels(classlen1, len(articallist1))

    data = {}
    data['trainSet'] = vectormat
    data['train_labeled'] = labeled_names
    data['testSet'] = vectormat1
    data['test_labeled'] = labeled_names1
    io.savemat(matfilename, data)

# Script entry point: when the file is executed directly, run get_Mat()
# with its default arguments.
if __name__ == '__main__':
    get_Mat()

附:sklearn包提取特征方法

#load datasets
# NOTE(review): load_files here is used as a loader whose result exposes a
# .data attribute (presumably sklearn.datasets.load_files) - confirm import.
doc_train = load_files('training')
doc_test = load_files('test')
#TF-IDF features
count_vec = TfidfVectorizer(min_df=1,decode_error='replace')
#Boolean (one-hot) features, as an alternative:
#count_vec = CountVectorizer(binary = True,decode_error='replace')
doc_train_bool = count_vec.fit_transform(doc_train.data)
#Note: use transform (not fit_transform) here so that the test matrix has
#the same dimensionality as the training matrix
doc_test_bool = count_vec.transform(doc_test.data)
train = doc_train_bool.toarray()
test = doc_test_bool.toarray()
print('load finished')

你可能感兴趣的:(文本分类)