Chinese Text Sentiment Analysis (word2vec)

gensim handles the word2vec text processing, and sklearn.svm handles the modeling.

# Assumes Python 3 with gensim >= 4.0, scikit-learn, joblib, pandas, numpy and jieba
# installed (reading the .xls files below also needs xlrd)
from sklearn.model_selection import train_test_split
from gensim.models.word2vec import Word2Vec
import numpy as np
import pandas as pd
import jieba
import joblib
from sklearn.svm import SVC

Load the data, preprocess it, and split it into training and test sets.

def load_file_and_preprocessing():
    # load positive and negative reviews (one review per row, no header)
    neg = pd.read_excel("data/neg.xls", header=None, index_col=None)
    pos = pd.read_excel("data/pos.xls", header=None, index_col=None)
    # tokenize each review with jieba
    cw = lambda x: list(jieba.cut(x))
    pos["words"] = pos[0].apply(cw)
    neg["words"] = neg[0].apply(cw)
    # label positive reviews as 1 and negative reviews as 0
    y = np.concatenate((np.ones(len(pos)), np.zeros(len(neg))))
    x_train, x_test, y_train, y_test = train_test_split(
        np.concatenate((pos["words"], neg["words"])), y, test_size=0.2)
    np.save("svm_data/y_train.npy", y_train)
    np.save("svm_data/y_test.npy", y_test)
    return x_train, x_test
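
For a sense of what the "words" column holds, here is a minimal sketch of jieba tokenization on a short, made-up review (the sample sentence is illustrative, not taken from the dataset):

sample = "电池充电很快,手机也很好用"
print(list(jieba.cut(sample)))  # prints the list of tokens that word2vec will see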

Average all the word vectors in a sentence to produce a single sentence vector.

def build_sentence_vector(text, size, imdb_w2v):
    # average the vectors of all in-vocabulary words in the sentence
    vec = np.zeros(size).reshape((1, size))
    count = 0
    for word in text:
        try:
            vec += imdb_w2v.wv[word].reshape((1, size))
            count += 1
        except KeyError:
            # skip words that are not in the word2vec vocabulary
            continue
    if count != 0:
        vec /= count
    return vec
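
As a quick sanity check of the averaging, here is a minimal sketch using a hypothetical two-sentence toy corpus (gensim 4.x style); the real model below uses the review data and 300 dimensions:

toy_corpus = [["手机", "很", "好"], ["电池", "太", "差"]]
toy_model = Word2Vec(toy_corpus, vector_size=4, min_count=1)  # tiny model, for illustration only
sent_vec = build_sentence_vector(["手机", "电池"], 4, toy_model)
print(sent_vec.shape)  # (1, 4): the mean of the two in-vocabulary word vectors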

Compute the word vectors.

def get_train_vecs(x_train, x_test):
    n_dim = 300
    imdb_w2v = Word2Vec(vector_size=n_dim, min_count=10)  # initialize the model (gensim >= 4.0 uses vector_size)
    imdb_w2v.build_vocab(x_train)  # build the vocabulary
    # train word2vec on the training reviews
    imdb_w2v.train(x_train, total_examples=imdb_w2v.corpus_count, epochs=imdb_w2v.epochs)
    train_vecs = np.concatenate([build_sentence_vector(z, n_dim, imdb_w2v) for z in x_train])
    np.save("svm_data/train_vecs.npy", train_vecs)
    # continue training on the test reviews
    imdb_w2v.train(x_test, total_examples=len(x_test), epochs=imdb_w2v.epochs)
    imdb_w2v.save("svm_data/w2v_model/w2v_model.pkl")
    test_vecs = np.concatenate([build_sentence_vector(z, n_dim, imdb_w2v) for z in x_test])
    np.save("svm_data/test_vecs.npy", test_vecs)

def get_data():
    train_vecs = np.load("svm_data/train_vecs.npy")
    y_train = np.load("svm_data/y_train.npy")
    test_vecs = np.load("svm_data/test_vecs.npy")
    y_test = np.load("svm_data/y_test.npy")
    return train_vecs, y_train, test_vecs, y_test

Train the SVM model.

def svm_train(train_vecs, y_train, test_vecs, y_test):
    clf = SVC(kernel="rbf", verbose=True)
    clf.fit(train_vecs, y_train)
    joblib.dump(clf, "svm_data/svm_model/model.pkl")
    print(clf.score(test_vecs, y_test))  # accuracy on the test set
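
The functions above can then be wired together end to end. This is a minimal driver sketch, assuming data/neg.xls and data/pos.xls exist; the os.makedirs calls are an added convenience to create the output directories used by the save paths:

import os

for d in ("svm_data/w2v_model", "svm_data/svm_model"):
    os.makedirs(d, exist_ok=True)  # create output directories for the saved models and arrays

x_train, x_test = load_file_and_preprocessing()       # tokenize the reviews and split them
get_train_vecs(x_train, x_test)                       # train word2vec and save sentence vectors
train_vecs, y_train, test_vecs, y_test = get_data()   # reload the saved vectors and labels
svm_train(train_vecs, y_train, test_vecs, y_test)     # fit the SVM and print test accuracy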

# Build the vector for the sentence to be predicted
def get_predict_vecs(words):
    n_dim = 300
    imdb_w2v = Word2Vec.load("svm_data/w2v_model/w2v_model.pkl")
    train_vecs = build_sentence_vector(words, n_dim, imdb_w2v)
    return train_vecs

# Predict the sentiment of a single sentence
def svm_predict(string):
    words = list(jieba.cut(string))
    words_vecs = get_predict_vecs(words)
    clf = joblib.load("svm_data/svm_model/model.pkl")
    result = clf.predict(words_vecs)
    if int(result[0]) == 1:
        print(string, "positive")
    else:
        print(string, "negative")


Predict the sentiment of a single sentence:

string = '电池充完了电连手机都打不开.简直烂的要命.真是金玉其外,败絮其中!连5号电池都不如'  # a strongly negative battery review
svm_predict(string)

