Text classification with sklearn + jieba word segmentation

jieba (结巴分词) is a Python module for Chinese word segmentation and text processing.

import jieba
luca = jieba.cut('遇见你真的是太好了')
print(list(luca))  # jieba.cut() returns a generator, not a list; convert it with list(), set(), ''.join(), etc., or use jieba.lcut() to get a list directly
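
The jieba.lcut() shortcut mentioned in the comment does the conversion for you:

print(jieba.lcut('遇见你真的是太好了'))  # returns the segmented tokens as a list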

import jieba.analyse
text = '遇见你真的是太好了'  # any document string
for x, w in jieba.analyse.extract_tags(text, withWeight=True):  # withWeight=True yields (keyword, weight) pairs; topK=... limits how many are returned
    print(x, w)  # prints each keyword with its TF-IDF weight

The keywords this jieba function returns were not very good in practice, so I did not end up using it; in fact I did not use jieba.analyse's stop-word mechanism either, and instead removed stop words by hand.
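
For reference, jieba.analyse can also load a custom stop-word file so that extract_tags skips those words; a minimal sketch, reusing the stop_words.txt path from the code below:

import jieba.analyse
jieba.analyse.set_stop_words(r'F:\jieba\stop_words.txt')  # words listed in this file are excluded from extract_tags results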

import re

def rm_char1(text1):
    text1 = re.sub('\u3000', '', text1)    # strip ideographic spaces
    return text1

def rm_char2(text2):
    text2 = re.sub('\xa0', '', text2)      # strip non-breaking spaces
    return text2

def get_stop_words():
    # stop_words.txt holds one stop word per line, separated by \n
    with open(r'F:\jieba\stop_words.txt', 'r', encoding='utf8') as f:
        file = f.read().split('\n')
    return set(file)

def rm_tokens(words):  # remove stop words and digits
    words_list = list(words)
    stop_words = get_stop_words()
    for i in range(len(words_list) - 1, -1, -1):  # iterate backwards so pop() does not shift pending indices
        if words_list[i] in stop_words:   # drop stop words
            words_list.pop(i)
        elif words_list[i].isdigit():     # drop pure digits
            words_list.pop(i)
    return words_list

def convert_text_to_wordlist(str_doc):
    # main segmentation routine
    sent_list = str_doc.split('\n')
    sent_list = map(rm_char1, sent_list)  # strip characters such as \u3000
    sent_list = map(rm_char2, sent_list)  # strip \xa0
    word_2dlist = [rm_tokens(jieba.cut(part)) for part in sent_list]  # segment each line and filter its tokens
    word_list = sum(word_2dlist, [])      # flatten the per-line lists into one token list
    return word_list

To use it, just call:

luca = convert_text_to_wordlist('遇见你真的是太好了')
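
To sanity-check the preprocessing, print the result; the exact tokens depend on jieba's dictionary and on the contents of your stop_words.txt, so treat the output as illustrative:

print(luca)  # a flat list of segmented tokens, with stop words and pure digits removed

The full training and evaluation script follows.
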
import jieba
import jieba.analyse
import csv
from sklearn import feature_extraction  

def get_dataset():
    data, targetdata = [], []
    with open(r'D:\datatrain.csv', 'r', encoding='gb18030') as file:
        f = csv.reader(file)
        for line in f:
            seglist = jieba.cut(line[2])   # column 3 holds the raw text
            words = ' '.join(seglist)      # space-joined tokens, as TfidfVectorizer expects
            data.append(words)
            targetdata.append(1 if 'T' in line[1] or 't' in line[1] else 0)  # column 2 holds the label; 'T'/'t' = positive class
    return data, targetdata

def get_testset():
    testdata, targettest = [], []
    with open(r'D:\datatest.csv', 'r', encoding='gb18030') as file:
        f = csv.reader(file)
        for line in f:
            seglist = jieba.cut(line[2])
            words = ' '.join(seglist)
            testdata.append(words)
            targettest.append(1 if 'T' in line[1] or 't' in line[1] else 0)
    return testdata, targettest
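
get_dataset and get_testset differ only in the file path; a small helper (a sketch, keeping the same assumptions: the second CSV column is the label with 'T'/'t' marking the positive class, the third column is the text) removes the duplication:

def load_csv(path):  # hypothetical helper, not in the original post
    data, target = [], []
    with open(path, 'r', encoding='gb18030') as file:
        for line in csv.reader(file):
            data.append(' '.join(jieba.cut(line[2])))          # segmented text, space-joined
            target.append(1 if 't' in line[1].lower() else 0)  # 'T'/'t' = positive class
    return data, target

get_dataset() then becomes load_csv(r'D:\datatrain.csv'), and get_testset() becomes load_csv(r'D:\datatest.csv').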

from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm

def data_pro():
    data_, target_train = get_dataset()
    testdata, target_test = get_testset()

    v = TfidfVectorizer()
    train_data = v.fit_transform(data_)   # learn vocabulary and IDF weights on the training set
    test_data = v.transform(testdata)     # reuse the fitted vectorizer on the test set

    return train_data, target_train, test_data, target_test

train_data, target_train, test_data, target_test = data_pro()
clf = MultinomialNB(alpha=0.01)
clf.fit(train_data, target_train)
pred = clf.predict(test_data)
'''# performed noticeably worse than Naive Bayes here
svc = svm.SVC(kernel='linear')
svc.fit(train_data, target_train)
pred = svc.predict(test_data)
'''
count = 0
for l, r in zip(pred, target_test):
    if l == r:
        count += 1
print(count / len(target_test))  # print the accuracy
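
sklearn can also compute the accuracy for you, and the vectorizer and classifier can be bundled into a Pipeline so the TF-IDF fitting and transforming stay in sync; a sketch built on the loaders above:

from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline

data_, target_train = get_dataset()
testdata, target_test = get_testset()

text_clf = Pipeline([('tfidf', TfidfVectorizer()),
                     ('nb', MultinomialNB(alpha=0.01))])
text_clf.fit(data_, target_train)     # fits the vectorizer and the classifier together
pred = text_clf.predict(testdata)     # raw space-joined strings go in; TF-IDF happens inside the pipeline
print(accuracy_score(target_test, pred))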
