Two ways to extract TF-IDF feature values; the results are saved to a .mat file:
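Both variants compute the same quantity: a word's frequency inside a document, weighted down by how many documents contain it. A minimal standalone sketch of the formula used below (the toy numbers are only illustrative):
import math

def tfidf(count, doc_len, num_docs, doc_freq):
    #TF = term count / document length; IDF = log(N / document frequency)
    return (count / float(doc_len)) * math.log(num_docs / float(doc_freq))

#e.g. a word appearing 3 times in a 100-token document, in 10 of 1,000 documents:
print tfidf(3, 100, 1000, 10)  #0.03 * log(100) = 0.138...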
1. Manual tokenization; take the top 3,000 words per class and merge
#Manual tokenization; take the top 3,000 words per class and merge
import os
import math
from collections import Counter
from numpy import array
from scipy import io
def load_files(directory, prefix=None, postfix=None):
    #collect every file under the directory tree
    files_list = []
    classlen = [0 for i in range(11)]  #slot 0 is the root directory itself
    i = 0
    for root, sub_dirs, files in os.walk(directory):
        classlen[i] = len(files)
        i += 1
        for special_file in files:
            if postfix:
                if special_file.endswith(postfix):
                    files_list.append(os.path.join(root, special_file))
            elif prefix:
                if special_file.startswith(prefix):
                    files_list.append(os.path.join(root, special_file))
            else:
                files_list.append(os.path.join(root, special_file))
    #scan the files and build one word-count dict per document
    articallist = [dict() for i in range(len(files_list))]
    filelen = [0 for l in range(len(files_list))]
    i = 0
    for eachfile in files_list:
        file_object = open(eachfile, 'r')
        t = 0
        for line in file_object:
            for word in line.split():
                #skip pure digits
                if not str(word).isdigit():
                    t += 1
                    #normalize case
                    word = str(word).lower()
                    if word in articallist[i]:
                        articallist[i][word] += 1
                    else:
                        articallist[i][word] = 1
        filelen[i] = t
        file_object.close()
        i += 1
    #print 'total files:', len(files_list)
    return articallist, classlen, filelen
#load the stop-word list
def load_stop_en(filename):
    word_list = []
    file_object = open(filename, 'r')
    for line in file_object:
        word_list.append(line.strip())
    file_object.close()
    return word_list
#remove stop words from a word-count dict
def delet_stopword_en(stop_en_set, en_dict):
    for key in stop_en_set:
        if key in en_dict:
            del en_dict[key]
#TF-IDF of one word across all documents
#TF = count in document / document length; IDF = log(N / document frequency)
def get_TFIDF(articallist, filelen, word):
    num = len(articallist)
    TFindex = [0 for i in range(num)]
    IDFindex = 0
    for i, eachdict in enumerate(articallist):
        if word in eachdict:
            TFindex[i] = eachdict[word] / float(filelen[i])
            IDFindex += 1
    for i in range(num):
        if IDFindex != 0:
            TFindex[i] = TFindex[i] * math.log(num / float(IDFindex))
    return TFindex
#merge dict2's counts into dict1
def updatex(dict1, dict2):
    for key in dict2.keys():
        if key in dict1:
            dict1[key] += dict2[key]
        else:
            dict1[key] = dict2[key]
def get_Mat(trainfilepath='training', testfilepath='test', stop_enname='en.txt', matfilename=r'D:\Py\SetMat.mat'):
    #load the training files
    articallist, classlen, filelen = load_files(trainfilepath)
    #stop-word list
    stop_en_set = load_stop_en(stop_enname)
    #label set for the classifier; the ten classes are
    #acq, corn, crude, earn, grain, interest, money-fx, ship, trade, wheat
    classlabel = [i + 1 for i in range(10)]
    labeled_names = [0 for i in range(len(articallist))]
    classr = 0
    finaldict = {}
    for i in range(10):
        classl = classr
        classr += classlen[i + 1]
        labeled_names[classl:classr] = [classlabel[i] for k in range(classlen[i + 1])]
        tempdict = {}
        for eachdict in articallist[classl:classr]:
            delet_stopword_en(stop_en_set, eachdict)
            updatex(tempdict, eachdict)
        #keep the top 3,000 words of this class and merge them into the global vocabulary
        tempdict = dict(Counter(tempdict).most_common(3000))
        updatex(finaldict, tempdict)
    print 'vector:', len(finaldict)
    #compute TF-IDF for every keyword to get the document feature matrix
    vectormat = [get_TFIDF(articallist, filelen, each) for each in finaldict]
    #transpose: rows are documents, columns are features
    vectormat = array(vectormat).transpose()
    #vectorize the test set with the same vocabulary
    articallist1, classlen1, filelen1 = load_files(testfilepath)
    vectormat1 = [get_TFIDF(articallist1, filelen1, each) for each in finaldict]
    vectormat1 = array(vectormat1).transpose()
    data = {}
    data['trainSet'] = vectormat
    data['labeled'] = labeled_names
    data['testSet'] = vectormat1
    io.savemat(matfilename, data)

if __name__ == '__main__':
    get_Mat()
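The saved file can be loaded back with scipy to sanity-check the shapes; a minimal sketch using the keys written by the script above:
from scipy import io

data = io.loadmat(r'D:\Py\SetMat.mat')
print data['trainSet'].shape  #(number of training documents, vocabulary size)
print data['testSet'].shape   #(number of test documents, vocabulary size)
print data['labeled']         #savemat stores the label list as a 2-D array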
2. NLTK tokenization, with stemming and letters-only filtering; keep the top 8,000 words
#NLTK tokenization, with stemming and letters-only filtering; keep the top 8,000 words
import os
import math
import nltk
from collections import Counter
from numpy import array
from scipy import io
#from nltk.corpus import wordnet as wn  #only needed if stemming is enabled in array2dict
def load_files(directory, stop_enname='en.txt'):
    #collect every file under the directory tree
    files_list = []
    classlen = [0 for i in range(11)]
    i = 0
    for root, sub_dirs, files in os.walk(directory):
        classlen[i] = len(files)
        for special_file in files:
            files_list.append(os.path.join(root, special_file))
        i += 1
    #scan the files: one global vocabulary plus one word-count dict per document
    worddict = {}
    articallist = [dict() for i in range(len(files_list))]
    filelen = [0 for l in range(len(files_list))]
    i = 0
    for eachfile in files_list:
        file_object = open(eachfile, 'r')
        article = file_object.read()
        filelen[i] = len(article)
        articallist[i] = array2dict(ie_preprocess(article), worddict)
        file_object.close()
        i += 1
    #remove stop words and the empty token that OnlyChar leaves behind
    stop_en_set = load_stop_en(stop_enname)
    delet_stopword_en(stop_en_set, worddict)
    if '' in worddict:
        del worddict['']
    #keep the 8,000 most frequent words
    worddict = dict(Counter(worddict).most_common(8000))
    print len(articallist), len(worddict)
    return worddict, classlen, articallist
#split into sentences, then into words
def ie_preprocess(document):
    sentences = nltk.sent_tokenize(document)
    sentences = [nltk.word_tokenize(sent) for sent in sentences]
    return sentences
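#illustrative note (assumed input): for a two-sentence string,
#ie_preprocess returns one token list per sentence, e.g.
#ie_preprocess('Oil prices rose. Wheat fell.')
#-> [['Oil', 'prices', 'rose', '.'], ['Wheat', 'fell', '.']]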
#keep letters only
def OnlyChar(s, oth=''):
    s2 = s.lower()
    letters = 'abcdefghijklmnopqrstuvwxyz'
    for c in s2:
        if c not in letters:
            s = s.replace(c, '')
    return s
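#e.g. OnlyChar('U.S.-based') -> 'USbased'; case is normalized later in array2dict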
#turn the nested token lists into word counts (per document and global)
def array2dict(arr, worddict):
    temp = {}
    for l in arr:
        for word in l:
            #optional stemming:
            #word = wn.morphy(OnlyChar(word))
            word = OnlyChar(word)
            if not str(word).isdigit():
                word = str(word).lower()
                if word in worddict:
                    worddict[word] += 1
                else:
                    worddict[word] = 1
                if word in temp:
                    temp[word] += 1
                else:
                    temp[word] = 1
    return temp
#load the stop-word list
def load_stop_en(filename):
    word_list = []
    file_object = open(filename, 'r')
    for line in file_object:
        word_list.append(line.strip())
    file_object.close()
    return word_list
#remove stop words from a word-count dict
def delet_stopword_en(stop_en_set, en_dict):
    for key in stop_en_set:
        if key in en_dict:
            del en_dict[key]
#TF-IDF of one word across all documents
#(TF here divides by the number of distinct words in the document)
def get_TFIDF(articallist, word):
    num = len(articallist)
    TFindex = [0 for i in range(num)]
    IDFindex = 0
    for i, eachdict in enumerate(articallist):
        if word in eachdict:
            TFindex[i] = eachdict[word] / float(len(eachdict))
            IDFindex += 1
    for i in range(num):
        if IDFindex != 0:
            TFindex[i] = TFindex[i] * math.log(num / float(IDFindex))
    return TFindex
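#worked example (toy numbers): with 2 documents where only document 0
#contains 'oil' 3 times among 10 distinct words, TF = 3/10 and
#IDF = log(2/1), so get_TFIDF returns [0.3 * log(2), 0]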
def get_Mat(trainfilepath='training', testfilepath='test', stop_enname='en.txt', matfilename=r'D:\Py\SetMat1.mat'):
    #load the training files
    worddict, classlen, articallist = load_files(trainfilepath)
    #compute TF-IDF for every keyword to get the document feature matrix
    vectormat = [get_TFIDF(articallist, each) for each in worddict]
    #transpose: rows are documents, columns are features
    vectormat = array(vectormat).transpose()
    #optional dimensionality reduction (requires sklearn.decomposition.PCA):
    #pca = PCA(n_components='mle')
    #vectormat = pca.fit_transform(vectormat)
    #label set for the classifier; the ten classes are
    #acq, corn, crude, earn, grain, interest, money-fx, ship, trade, wheat
    classlabel = [i + 1 for i in range(10)]
    labeled_names = [0 for i in range(len(articallist))]
    classr = 0
    for i in range(10):
        classl = classr
        classr += classlen[i + 1]
        labeled_names[classl:classr] = [classlabel[i] for k in range(classlen[i + 1])]
    #vectorize the test set with the same vocabulary
    worddict1, classlen1, articallist1 = load_files(testfilepath)
    vectormat1 = [get_TFIDF(articallist1, each) for each in worddict]
    vectormat1 = array(vectormat1).transpose()
    labeled_names1 = [0 for i in range(len(articallist1))]
    classr = 0
    for i in range(10):
        classl = classr
        classr += classlen1[i + 1]
        labeled_names1[classl:classr] = [classlabel[i] for k in range(classlen1[i + 1])]
    data = {}
    data['trainSet'] = vectormat
    data['train_labeled'] = labeled_names
    data['testSet'] = vectormat1
    data['test_labeled'] = labeled_names1
    io.savemat(matfilename, data)

if __name__ == '__main__':
    get_Mat()
Appendix: feature extraction with scikit-learn's vectorizers
#load datasets
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

doc_train = load_files('training')
doc_test = load_files('test')
#TF-IDF features (term-frequency based)
count_vec = TfidfVectorizer(min_df=1, decode_error='replace')
#Boolean (one-hot) features instead:
#count_vec = CountVectorizer(binary=True, decode_error='replace')
doc_train_bool = count_vec.fit_transform(doc_train.data)
#note: transform (not fit_transform) here, so the test set keeps the training set's dimensions
doc_test_bool = count_vec.transform(doc_test.data)
train = doc_train_bool.toarray()
test = doc_test_bool.toarray()
print 'load finished'
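For consistency with the two scripts above, these matrices can be saved to a .mat file in the same layout; a minimal sketch (the file name is an assumption, and load_files targets start at 0, so 1 is added to match the 1-based labels used earlier):
from scipy import io

data = {}
data['trainSet'] = train
data['train_labeled'] = doc_train.target + 1  #shift 0-based targets to 1..10
data['testSet'] = test
data['test_labeled'] = doc_test.target + 1
io.savemat(r'D:\Py\SetMat2.mat', data)  #hypothetical file name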