Text Classification: Machine Learning Methods

Import the common packages

import random
import jieba
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

Load the text and the stopword list

# Load the stopword list
stop_words = 'stopwords.txt'
stopwords = pd.read_csv(stop_words, index_col=False, quoting=3, sep="\t", names=['stopword'], encoding='utf-8')
stopwords = stopwords['stopword'].values
laogong = 'beilaogongda.csv'
laopo = 'beilaopoda.csv'
erzi = 'beierzida.csv'
nver = 'beinverda.csv'
# Load the text data
laogong_df = pd.read_csv(laogong, encoding='utf-8', sep=',')
laopo_df = pd.read_csv(laopo, encoding='utf-8', sep=',')
erzi_df = pd.read_csv(erzi, encoding='utf-8', sep=',')
nver_df = pd.read_csv(nver, encoding='utf-8', sep=',')
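
Each input CSV is assumed to contain a segment column holding the raw text; the preprocessing below relies on that column name. A quick sanity check:

# Confirm that the expected 'segment' column is present
print(laogong_df.columns.tolist())
print(laogong_df.head())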

Preprocess the text

# Drop NaN rows from each DataFrame
laogong_df.dropna(inplace=True)
laopo_df.dropna(inplace=True)
erzi_df.dropna(inplace=True)
nver_df.dropna(inplace=True)
# Convert the 'segment' column of each DataFrame to a plain list
laogong = laogong_df.segment.values.tolist()
laopo = laopo_df.segment.values.tolist()
erzi = erzi_df.segment.values.tolist()
nver = nver_df.segment.values.tolist()
# Define the segmentation-and-labeling function preprocess_text:
# - content_lines is one of the lists converted above
# - sentences is an initially empty list that collects the labeled samples
# - category is the class label
def preprocess_text(content_lines, sentences, category):
    for line in content_lines:
        try:
            segs = jieba.lcut(line)
            segs = [v for v in segs if not str(v).isdigit()]         # drop pure digits
            segs = list(filter(lambda x: x.strip(), segs))           # drop empty/whitespace-only tokens
            segs = list(filter(lambda x: len(x) > 1, segs))          # drop single-character tokens
            segs = list(filter(lambda x: x not in stopwords, segs))  # drop stopwords
            sentences.append((" ".join(segs), category))             # attach the class label
        except Exception:
            print(line)
            continue
# Call the function to build the labeled dataset
sentences = []
preprocess_text(laogong, sentences, 'laogong')
preprocess_text(laopo, sentences, 'laopo')
preprocess_text(erzi, sentences, 'erzi')
preprocess_text(nver, sentences, 'nver')
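
Before splitting, the labeled samples can be shuffled (which is what the random import at the top is for); train_test_split shuffles by default, so this step is optional:

random.shuffle(sentences)  # optional: train_test_split below shuffles anyway
print(sentences[0])        # inspect one (text, label) pair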

# Split the data into a training set and a test set with scikit-learn

x, y = zip(*sentences)
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1234)
# Feature extraction: bag-of-words features over the segmented text
vec = CountVectorizer(
    analyzer='word',    # tokenize on the space-separated words produced above
    max_features=4000,  # keep only the 4000 most frequent tokens
)
vec.fit(x_train)
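
It is worth checking what the vectorizer actually learned; vocabulary_ maps each kept token to its column index (output depends on your data):

print(len(vec.vocabulary_))               # number of tokens kept (at most 4000)
print(list(vec.vocabulary_.items())[:5])  # a few (token, column index) pairs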

Classification with Naive Bayes

classifier = MultinomialNB()
classifier.fit(vec.transform(x_train), y_train)
# Score the classifier on the test set
print(classifier.score(vec.transform(x_test), y_test))

#0.9928400954653938
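
Overall accuracy can hide per-class differences if the four classes are unbalanced. scikit-learn's classification_report gives a per-label breakdown (a short addition using the objects defined above):

from sklearn.metrics import classification_report
y_pred = classifier.predict(vec.transform(x_test))
print(classification_report(y_test, y_pred))  # precision, recall, and F1 per label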

Testing with an SVM

from sklearn.svm import SVC
svm = SVC(kernel='linear')
svm.fit(vec.transform(x_train), y_train)
print(svm.score(vec.transform(x_test), y_test))

#0.9952267303102625
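
To classify a new sentence, it must go through the same pipeline as the training data: jieba segmentation, space-joining, then vectorization. A minimal sketch (the example sentence is made up for illustration):

query = " ".join(jieba.lcut("昨天晚上被老公打了"))  # segment and space-join, as in training
print(svm.predict(vec.transform([query])))          # expected to print something like ['laogong']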
