用LDA模型抽取文本特征,再用线性SVM分类,发现效果很差,F1=0.654。
Precision:0.680,Recall:0.649,F1:0.654
RandomForestClassifier的表现也比较差:
Precision:0.680,Recall:0.668,F1:0.670
而随便用一个深度学习模型(textCNN,LSTM+Attention)都能达到0.95+的F1,而且还不用处理特征、不用分词。
说下具体流程:提取LDA特征时,需要先用CountVectorizer对文本进行向量化,而向量化之前又需要先对文本进行分词。考虑到样本数量较多(搜狐新闻数据集,5个类别*3000条信息),这里使用多进程(用进程池ProcessPoolExecutor实现)来进行jieba分词。
import pandas as pd
import jieba
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import multiprocessing
from concurrent.futures import ProcessPoolExecutor,as_completed
from utils import log
from tqdm import tqdm
import time
import pickle as pk
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC,SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score,recall_score,f1_score
def transform_text(text, stopwords):
    """Segment ``text`` with jieba and return the kept tokens joined by commas.

    A token is dropped when it is empty/whitespace-only after stripping, or
    when it is contained in ``stopwords``.
    """
    kept = []
    for token in jieba.cut(text):
        if not token.strip():
            continue
        if token in stopwords:
            continue
        kept.append(token)
    return ','.join(kept)
def cut_texts(lock, texts, stopwords, processName, doc_list=None):
    """Segment ``texts`` and append the results to a shared list under a lock.

    Intended for the "process + lock" style of multi-process segmentation.

    Args:
        lock: Lock guarding ``doc_list``; must support the ``with`` statement.
        texts: Iterable of raw documents to segment.
        stopwords: Collection of tokens to drop (passed to ``transform_text``).
        processName: Identifier used only in the log message.
        doc_list: List that receives the segmented documents. When omitted a
            fresh list is created for this call (the previous ``[]`` default
            was a single list silently shared by every call).

    Returns:
        The list the segmented documents were appended to.
    """
    if doc_list is None:
        doc_list = []
    log('Process {} is cutting texts...'.format(processName))
    docs = [transform_text(text, stopwords) for text in tqdm(texts)]
    # ``with`` guarantees the lock is released even if ``extend`` raises.
    with lock:
        doc_list.extend(docs)
    return doc_list
def cut_texts_pool(texts, stopwords, processName):
    """Segment every document in ``texts`` and return the results as a list.

    Written as a process-pool task: the segmented documents are handed back
    through the return value. ``processName`` is only used in log messages.
    """
    log('Process {} is cutting texts...'.format(processName))
    segmented = [transform_text(raw, stopwords) for raw in tqdm(texts)]
    log('Process {} finished cutting.'.format(processName))
    return segmented
def hard_work(processName):
    """Test helper that simulates a slow task.

    Logs a start message, sleeps for two seconds, logs a finish message and
    returns ``processName`` unchanged.
    """
    started = 'Process {} is running...'.format(processName)
    log(started)
    time.sleep(2)
    finished = 'Process {} finished.'.format(processName)
    log(finished)
    return processName
def mp_pool_test(texts=None, res=None):
    """Smoke-test the process pool by running ``hard_work`` once per CPU.

    Submits ``cpu_count()`` tasks, gathers their return values in completion
    order and logs the resulting list.

    Args:
        texts: Not used by the body; kept for interface compatibility.
        res: Not used by the body; kept for interface compatibility.
    """
    n_process = multiprocessing.cpu_count()
    # The context manager shuts the pool down so worker processes are not
    # left behind once the test finishes.
    with ProcessPoolExecutor() as pool:
        fs = [pool.submit(hard_work, i) for i in range(n_process)]
        names = [f.result() for f in as_completed(fs)]
    log(names)
def partition(iterable_, n_parittion):
    """Split ``iterable_`` into ``n_parittion`` consecutive, roughly equal lists.

    The first ``n_parittion - 1`` chunks each hold ``len // n_parittion``
    items; the last chunk takes whatever remains.

    Args:
        iterable_: Any iterable; it is materialised into a list first.
        n_parittion: Number of chunks to produce, a positive ``int``.

    Returns:
        A list of ``n_parittion`` lists that concatenate back to the input.

    Raises:
        AssertionError: If ``n_parittion`` is not a positive int, or the
            iterable holds fewer items than ``n_parittion``.
    """
    # Raised explicitly (rather than via ``assert``) so the checks survive
    # ``python -O`` while callers still see the same exception type.
    if not (isinstance(n_parittion, int) and n_parittion > 0):
        raise AssertionError('Invalid value for "n_partition"')
    items = list(iterable_)
    total = len(items)
    if total < n_parittion:
        raise AssertionError('Size of iterable is less than "n_partition"')
    size = total // n_parittion
    chunks = [items[size * k:size * (k + 1)] for k in range(n_parittion - 1)]
    # The tail start is computed from n_parittion directly, so a single
    # partition (no loop iterations) works as well.
    chunks.append(items[size * (n_parittion - 1):])
    return chunks
def mp_cut_pool(texts):
    """Segment ``texts`` with a process pool, one task per CPU.

    Args:
        texts: Iterable of raw documents.

    Returns:
        A list of segmented documents in the same order as ``texts``, so the
        result can be paired row-by-row with labels or other columns.
    """
    texts = list(texts)
    n_process = multiprocessing.cpu_count()
    # With a single CPU, or no more documents than CPUs, there is nothing to
    # gain from splitting the work; run in-process instead.
    if n_process == 1 or len(texts) <= n_process:
        return cut_texts_pool(texts, [], 0)
    chunks = partition(texts, n_process)
    # One worker per chunk; the context manager shuts the pool down afterwards.
    with ProcessPoolExecutor(max_workers=n_process) as pool:
        # submit() starts the task: target callable first, then its arguments.
        fs = [
            pool.submit(cut_texts_pool, chunk, [], i)
            for i, chunk in enumerate(chunks)
        ]
        docs = []
        # Collect in submission order (not completion order) so that the
        # output stays aligned with the input documents.
        for f in fs:
            docs.extend(f.result())
    return docs
class LDA_Transformer:
    """Bag-of-words + LDA feature extractor.

    ``fit`` builds a ``CountVectorizer`` vocabulary and an LDA topic model;
    ``transform`` maps documents to their topic vectors. Documents passed to
    ``transform`` must be tokenised the same way as those passed to ``fit``,
    because they are matched against the vocabulary learned in ``fit``.
    """

    def __init__(self, n_features):
        # Number of LDA topics, i.e. the width of the vectors from transform().
        self.n_features = n_features

    def fit(self, texts):
        """Fit the vectorizer and the LDA model on ``texts``.

        Args:
            texts: A list of documents, or an object exposing ``.shape``.

        Returns:
            ``self``, so that calls can be chained.
        """
        log('Building CountVectorizer with texts...')
        ct = CountVectorizer()
        self.count_vectorizer = ct
        log(type(texts))
        if isinstance(texts, list):
            log('Len of texts:{}'.format(len(texts)))
        else:
            log('Shape of texts:{}'.format(texts.shape))
        ctv = ct.fit_transform(texts)
        log('Building LDA model with CountVectorizer..')
        # n_components is the number of LDA topics; it plays a role similar
        # to the dimensionality of a word embedding.
        lda = LatentDirichletAllocation(n_components=self.n_features)
        lda.fit(ctv)
        log('Done building LDA model.')
        self.lda_model = lda
        return self

    def transform(self, texts):
        """Return the LDA topic representation of ``texts``.

        Requires ``fit`` to have been called first.
        """
        count_vec = self.count_vectorizer.transform(texts)
        return self.lda_model.transform(count_vec)
def _cut_texts_in_order(texts):
    """Segment ``texts`` in worker processes, preserving the input order."""
    from itertools import repeat  # local import: only this helper needs it

    if not texts:
        return []
    n_workers = multiprocessing.cpu_count()
    # Batch documents per task to keep inter-process overhead low.
    chunksize = max(1, len(texts) // (n_workers * 4))
    with ProcessPoolExecutor(max_workers=n_workers) as pool:
        # Executor.map yields results in the order of its inputs, so
        # result[i] is the segmentation of texts[i].
        return list(
            pool.map(transform_text, texts, repeat([]), chunksize=chunksize)
        )


def build_data():
    """Build and persist the LDA model plus the training features and labels.

    Reads the news spreadsheet, segments every document, fits an
    ``LDA_Transformer`` with 64 topics, and writes:

    * ``output/lda_transformer.pkl`` - the fitted transformer
    * ``data/y_lda.pkl`` - integer topic labels (shuffled row order)
    * ``data/X_lda.pkl`` - LDA feature vectors in the same row order

    Features are computed from the *segmented* documents, i.e. the same
    representation the vectorizer was fitted on. Feeding the raw,
    unsegmented text to ``transform`` would leave the vectorizer with almost
    no in-vocabulary tokens.
    """
    df = pd.read_excel('data/souhu_news_400_500.xlsx')
    texts = list(df['content'])  # text column
    log(df.columns)
    # Segmentation must keep row order: docs[i] belongs to row i of df.
    docs = _cut_texts_in_order(texts)
    lda_transformer = LDA_Transformer(64)
    lda_transformer.fit(docs)
    # Persist the fitted LDA transformer.
    with open('output/lda_transformer.pkl', 'wb') as f:
        pk.dump(lda_transformer, f)
    indices = list(range(df.shape[0]))
    np.random.shuffle(indices)
    df = df.iloc[indices]
    dic = {topic: i for i, topic in enumerate(list(df['topic'].unique()))}
    y = [dic[topic] for topic in list(df['topic'])]
    with open('data/y_lda.pkl', 'wb') as f:
        pk.dump(y, f)
    # Apply the same shuffle to the segmented docs so X stays aligned with y.
    X = lda_transformer.transform([docs[i] for i in indices])
    with open('data/X_lda.pkl', 'wb') as f:
        pk.dump(X, f)
    log('Training data is saved.')
def load_train_data():
    """Load the pickled LDA features and labels and split them 80/20.

    Returns:
        A tuple ``(X_train, X_test, y_train, y_test)``.
    """
    def _unpickle(path):
        with open(path, 'rb') as fh:
            return pk.load(fh)

    features = _unpickle('data/X_lda.pkl')
    labels = _unpickle('data/y_lda.pkl')
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.2
    )
    return X_train, X_test, y_train, y_test
def main():
    """Run the full pipeline: build data, train a classifier, report metrics.

    Logs accuracy plus macro-averaged precision, recall and F1 on the
    held-out split.
    """
    log('Building training data...')
    build_data()
    log('Loading training data with LDA features...')
    X_train, X_test, y_train, y_test = load_train_data()
    # Swap in LinearSVC() here to reproduce the SVM baseline.
    model = RandomForestClassifier()
    # Log the class actually being trained; the message used to say
    # "LinearSVC" regardless of the model in use.
    log('Training {} model..'.format(type(model).__name__))
    model.fit(X_train, y_train)
    log('Evaluating model...')
    acc = model.score(X_test, y_test)
    log('Accuracy:{}'.format(acc))
    y_pred = model.predict(X_test)
    p = precision_score(y_test, y_pred, average='macro')
    r = recall_score(y_test, y_pred, average='macro')
    f1 = f1_score(y_test, y_pred, average='macro')
    log('Precision:{:.3f},Recall:{:.3f},F1:{:.3f}'.format(p, r, f1))
# Script entry point: run the pipeline only when this file is executed
# directly, not when it is imported as a module.
if __name__=='__main__':
    main()