Pipeline:
step1: import the files
step2: word segmentation
step3: remove stop words
step4: tf-idf filtering
step5: chi-square filtering
step6: training and prediction
step1, 2, 3
Importing the files is straightforward. The only thing to watch is a path that contains Chinese characters: on Windows it needs to be converted with unicode(path, 'utf8') first, as in the short sketch below.
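A minimal sketch of that path handling, assuming Python 2 on Windows and a source file saved as UTF-8 (the directory name is just the one used later in this post):

# -*- coding: utf-8 -*-
import os

rootpath = unicode("../转换后的文件", 'utf8')  # decode the byte-string literal to a unicode path
for name in os.listdir(rootpath):              # listdir on a unicode path yields unicode names
    print os.path.join(rootpath, name)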
The files contain long runs of consecutive spaces and line breaks, so a regular expression is used to replace each run with a single space.
Digits (assumed to carry no useful information here) and Chinese/English punctuation are useless for classification, so they are filtered out.
Alternatively, they could be added to the stop-word list and removed together with everything else in one pass.
jieba is used for segmentation; it also emits spaces as tokens, so after segmentation all space tokens are filtered out.
# -*- coding: utf-8 -*-
import jieba
import os
import re
import time
import string

rootpath = "../转换后的文件"
os.chdir(rootpath)

# containers for the corpus
words_list = []      # segmented text of every file
filename_list = []   # corresponding file names
category_list = []   # corresponding category names
all_words = {}       # full vocabulary {'key': value}
stopwords = {}.fromkeys([line.rstrip() for line in open('../stopwords.txt')])
category = os.listdir(rootpath)                     # list of category directories
delEStr = string.punctuation + ' ' + string.digits  # ASCII punctuation, space and digits to delete
identify = string.maketrans('', '')

#################################
#  segment text, build corpus   #
#################################
def fileWordProcess(contents):
    wordsList = []
    contents = re.sub(r'\s+', ' ', contents)  # collapse runs of whitespace into a single space
    contents = re.sub(r'\n', ' ', contents)   # newline -> space
    contents = re.sub(r'\t', ' ', contents)   # tab -> space
    contents = contents.translate(identify, delEStr)  # delete ASCII punctuation, spaces and digits
    for seg in jieba.cut(contents):
        seg = seg.encode('utf8')
        if seg not in stopwords:       # drop stop words
            if seg != ' ':             # drop spaces
                wordsList.append(seg)  # collect this file's words
    file_string = ' '.join(wordsList)
    return file_string

for categoryName in category:  # loop over category directories; on OS X the listing includes the .DS_Store system file
    if categoryName == '.DS_Store':
        continue
    categoryPath = os.path.join(rootpath, categoryName)  # path of this category
    filesList = os.listdir(categoryPath)                 # all files in this category
    # segment every file in this category
    for filename in filesList:
        if filename == '.DS_Store':
            continue
        starttime = time.clock()
        contents = open(os.path.join(categoryPath, filename)).read()
        wordProcessed = fileWordProcess(contents)  # segment the file contents
        # not done for now: filenameWordProcessed = fileWordProcess(filename)  # segment the file name as a separate feature
        # words_list.append((wordProcessed, categoryName, filename))  # training-set format: [(word list of this file, category, file name)]
        words_list.append(wordProcessed)
        filename_list.append(filename)
        category_list.append(categoryName)
        endtime = time.clock()
        print 'category: %s >>>> file: %s >>>> import time: %.3f' % (categoryName, filename, endtime - starttime)
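As a quick sanity check (illustrative only; the sample string is made up, and the exact segmentation depends on the jieba version and the stop-word list), the preprocessing can be run on a single byte string:

sample = u'2017年第1季度!!机密文件 (请勿外传)'.encode('utf8')
print fileWordProcess(sample)  # ASCII digits, punctuation and spaces are removed;
                               # the remaining tokens come back joined by single spaces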
Three lists hold the file contents:
words_list stores each file's space-joined word string after segmentation, filename_list stores the corresponding file names, and category_list stores the corresponding document categories (here the three classes 机密, 秘密 and 内部, i.e. confidential, secret and internal).
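The three lists are parallel, one entry per document, so a quick illustrative check is that their lengths match:

assert len(words_list) == len(filename_list) == len(category_list)
print 'imported %d documents' % len(words_list)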
step4
sklearn provides two very powerful classes, CountVectorizer and TfidfTransformer. The first builds the term-frequency matrix (turning every count greater than 1 into 1 gives the binary word-vector matrix); the second computes the tf-idf matrix from it, which is then used to drop the terms with low tf-idf values. A toy illustration comes first, followed by the actual pipeline code.
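A toy illustration (separate from the real pipeline; toy_docs, cv, counts and toy_tfidf are made-up names) of what the two classes return on a tiny corpus:

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

toy_docs = ['apple banana apple', 'banana cherry', 'apple cherry cherry']
cv = CountVectorizer()
counts = cv.fit_transform(toy_docs)                   # raw term-frequency matrix, 3 docs x 3 terms
toy_tfidf = TfidfTransformer().fit_transform(counts)  # tf-idf weighted matrix, same shape
print cv.get_feature_names()                          # ['apple', 'banana', 'cherry']
print counts.toarray()
print toy_tfidf.toarray()

Note that CountVectorizer(binary=True) would produce the 0/1 word-vector matrix directly, instead of binarizing the counts afterwards as the guiyi function does below.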
# Build the term-frequency matrix and the tf-idf matrix
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

freWord = CountVectorizer(stop_words='english')
transformer = TfidfTransformer()
fre_matrix = freWord.fit_transform(words_list)
tfidf = transformer.fit_transform(fre_matrix)

import pandas as pd
feature_names = freWord.get_feature_names()            # feature (term) names
freWordVector_df = pd.DataFrame(fre_matrix.toarray())  # term-frequency matrix over the full vocabulary
tfidf_df = pd.DataFrame(tfidf.toarray())               # tf-idf value matrix
# print freWordVector_df
tfidf_df.shape
# tf-idf filtering: keep the 10000 terms with the largest summed tf-idf
tfidf_sx_featuresindex = tfidf_df.sum(axis=0).sort_values(ascending=False)[:10000].index
print len(tfidf_sx_featuresindex)
freWord_tfsx_df = freWordVector_df.ix[:, tfidf_sx_featuresindex]  # term-frequency matrix after tf-idf filtering
df_columns = pd.Series(feature_names)[tfidf_sx_featuresindex]
print df_columns.shape

def guiyi(x):
    # binarize: any count greater than 1 becomes 1
    x[x > 1] = 1
    return x

import numpy as np
tfidf_df_1 = freWord_tfsx_df.apply(guiyi)
tfidf_df_1.columns = df_columns

from sklearn import preprocessing
le = preprocessing.LabelEncoder()
tfidf_df_1['label'] = le.fit_transform(category_list)
tfidf_df_1.index = filename_list
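One caveat on the pandas API: the .ix indexer used above has been removed in newer pandas versions. Since freWordVector_df keeps the default integer column labels, a positional selection should be equivalent (a sketch, not verified against every pandas version):

freWord_tfsx_df = freWordVector_df.iloc[:, tfidf_sx_featuresindex]  # same columns, positional indexing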
step5: chi-square filtering is even simpler. I had never managed to find the indices of the features selected by the chi-square test before, but now I finally know how to use get_support(indices=False): with False it does not return indices but a Boolean mask over all features, and with True it returns the indices of the selected features (see the sketch after the code below).
# Chi-square test: keep the 7000 best features
from sklearn.feature_selection import SelectKBest, chi2

ch2 = SelectKBest(chi2, k=7000)
nolabel_feature = [x for x in tfidf_df_1.columns if x not in ['label']]
ch2_sx_np = ch2.fit_transform(tfidf_df_1[nolabel_feature], tfidf_df_1['label'])
label_np = np.array(tfidf_df_1['label'])
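To actually see which terms survive the chi-square test (the get_support usage mentioned above), the mask or the indices can be mapped back to the column names; ch2_feature_names is just an illustrative name:

support_mask = ch2.get_support(indices=False)    # Boolean mask over all input features
selected_idx = ch2.get_support(indices=True)     # integer indices of the selected features
ch2_feature_names = np.array(nolabel_feature)[selected_idx]
print support_mask.sum(), len(ch2_feature_names)  # both should equal k=7000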
step6:
Here I started with the naive Bayes algorithm. Before training, the samples are split with stratified 10-fold cross-validation; the loop then runs ten times, each time training on nine folds and predicting on the held-out fold.
Finally the predictions are compared with the true labels; the best accuracy obtained is 84%.
# Naive Bayes with stratified 10-fold cross-validation
from sklearn.naive_bayes import MultinomialNB
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import StratifiedKFold
from sklearn.cross_validation import KFold
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import classification_report

# nolabel_feature = [x for x in tfidf_df_1.columns if x not in ['label']]
# x_train, x_test, y_train, y_test = train_test_split(ch2_sx_np, tfidf_df_1['label'], test_size=0.2)
X = ch2_sx_np
y = label_np
skf = StratifiedKFold(y, n_folds=10)
y_pre = y.copy()
for train_index, test_index in skf:
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    clf = MultinomialNB().fit(X_train, y_train)
    y_pre[test_index] = clf.predict(X_test)
print 'accuracy: %.6f' % (np.mean(y_pre == y))
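The loop above uses the old sklearn.cross_validation module, which was removed in later releases; under sklearn 0.18+ the equivalent stratified 10-fold loop would look roughly like this (a sketch, not run here):

from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=10)
y_pre = y.copy()
for train_index, test_index in skf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    clf = MultinomialNB().fit(X_train, y_train)
    y_pre[test_index] = clf.predict(X_test)
print 'accuracy: %.6f' % np.mean(y_pre == y)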
step7: check precision, recall, F1-score and the confusion matrix
# Precision, recall, F1-score
from sklearn.metrics import confusion_matrix, classification_report

print 'precision, recall, F1-score:'
print classification_report(y, y_pre)

# confusion matrix
import matplotlib.pyplot as plt
%matplotlib inline

def plot_confusion_matrix(cm, title='Confusion matrix', cmap=plt.cm.Blues):
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(category[1:]))     # skip the '.DS_Store' entry in the category listing
    category_english = ['neibu', 'jimi', 'mimi']  # romanized class names used as tick labels
    plt.xticks(tick_marks, category_english, rotation=45)
    plt.yticks(tick_marks, category_english)
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    for x in range(len(cm)):
        for y in range(len(cm)):
            # cm[x, y] is row x (true label), column y (predicted label),
            # which imshow draws at plot coordinates (y, x)
            plt.annotate(cm[x, y], xy=(y, x),
                         horizontalalignment='center',
                         verticalalignment='center')

print 'confusion matrix:'
cm = confusion_matrix(y, y_pre)
plt.figure()
plot_confusion_matrix(cm)
plt.show()
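One thing to double-check: confusion_matrix with the encoded integer labels orders rows and columns as 0, 1, 2, which map back to le.classes_ in the order the LabelEncoder assigned them, so the hard-coded tick labels neibu/jimi/mimi should be verified against that order:

print le.classes_  # the category order used on both axes of the confusion matrix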