使用SVM和Word2Vec进行情感分类

得到句子分词后的结果,并把类别标签保存为y_train.npy,y_test.npy

# Load the review files, tokenize every review and split into train/test sets
def loadfile():
    """Read the labelled reviews, tokenize them and split into train/test.

    Negative and positive reviews are read from the Excel files at
    ``config.NEG_PATH`` and ``config.POS_PATH`` (no header row); column 0 of
    each sheet is tokenized with jieba.  Labels (1 = positive, 0 = negative)
    are split together with the token lists, 20% being held out for testing,
    and are written to ``./svm_data/y_train.npy`` and
    ``./svm_data/y_test.npy``.

    Returns:
        (x_train, x_test): arrays of token lists for the two splits.
    """
    # The former ``index=None`` argument was dropped: ``read_excel`` defines
    # no ``index`` parameter and current pandas raises TypeError on it.
    neg = pd.read_excel(config.NEG_PATH, header=None)
    pos = pd.read_excel(config.POS_PATH, header=None)

    # jieba.lcut(x) is the list-returning form of jieba.cut(x)
    pos['words'] = pos[0].apply(jieba.lcut)
    neg['words'] = neg[0].apply(jieba.lcut)

    # Use 1 for positive sentiment, 0 for negative; the order matches the
    # concatenation of the token lists below (positives first).
    y = np.concatenate((np.ones(len(pos)), np.zeros(len(neg))))
    x = np.concatenate((pos['words'], neg['words']))

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

    np.save('./svm_data/y_train.npy', y_train)
    np.save('./svm_data/y_test.npy', y_test)
    return x_train, x_test

计算词向量并保存为train_vecs.npy,test_vecs.npy
调用方式:get_train_vecs(x_train,x_test)

def get_train_vecs(x_train,x_test):
    """Train Word2Vec on the tokenized corpus and cache sentence vectors.

    A 300-dimensional Word2Vec model is built from ``x_train``, trained on
    it, and then trained further on ``x_test``.  Every sentence is turned
    into a vector by ``buildWordVector`` (defined outside this block) and the
    stacked results are written to ``./svm_data/train_vecs.npy`` and
    ``./svm_data/test_vecs.npy``.  The model itself is saved to
    ``./svm_data/w2v_model/w2v_model.pkl``.

    Args:
        x_train: iterable of token lists used to build the vocabulary.
        x_test: iterable of token lists for the held-out split.

    Returns:
        None.  Results are written to disk and the array shapes are printed.
    """
    n_dim = 300
    # Initialize the model and build the vocabulary from the training split.
    # NOTE(review): ``size=`` is the gensim 3.x keyword; gensim 4 renamed it
    # to ``vector_size`` -- adjust if the project upgrades gensim.
    imdb_w2v = Word2Vec(size=n_dim, min_count=10)
    imdb_w2v.build_vocab(x_train)

    # Train the model over the training sentences (this may take several minutes)
    imdb_w2v.train(x_train, total_examples=imdb_w2v.corpus_count, epochs=2)

    train_vecs = np.concatenate(
        [buildWordVector(z, n_dim, imdb_w2v) for z in x_train])
    np.save('./svm_data/train_vecs.npy', train_vecs)
    print(train_vecs.shape)

    # Continue training on the test sentences.  ``total_examples`` must
    # describe the corpus passed to *this* call; ``corpus_count`` is the size
    # of the training split recorded by build_vocab, so using it here gave
    # gensim a wrong example count for its progress/learning-rate schedule.
    imdb_w2v.train(x_test, total_examples=len(x_test), epochs=2)
    imdb_w2v.save('./svm_data/w2v_model/w2v_model.pkl')

    # Build the test sentence vectors with the updated model
    test_vecs = np.concatenate(
        [buildWordVector(z, n_dim, imdb_w2v) for z in x_test])
    np.save('./svm_data/test_vecs.npy', test_vecs)
    print(test_vecs.shape)

导入训练数据和测试数据,调用方式:train_vecs,y_train,test_vecs,y_test=get_data()

def get_data():
    """Load the cached feature matrices and labels from ``./svm_data``.

    Returns:
        A 4-tuple ``(train_vecs, y_train, test_vecs, y_test)`` of arrays read
        from the corresponding ``.npy`` files, in that order.
    """
    npy_paths = (
        './svm_data/train_vecs.npy',
        './svm_data/y_train.npy',
        './svm_data/test_vecs.npy',
        './svm_data/y_test.npy',
    )
    return tuple(np.load(npy_path) for npy_path in npy_paths)

训练svm并保存模型

from sklearn.svm import SVC
from sklearn.externals import joblib

def svm_train(train_vecs,y_train,test_vecs,y_test):
    """Fit an RBF-kernel SVC, persist it and print its test-set score.

    Args:
        train_vecs: feature matrix used for fitting.
        y_train: labels matching ``train_vecs``.
        test_vecs: feature matrix used for scoring.
        y_test: labels matching ``test_vecs``.

    The fitted classifier is dumped to ``svm_data/svm_model/model.pkl``.
    """
    # fit() returns the estimator itself, so construction and fitting chain
    classifier = SVC(kernel='rbf', verbose=True).fit(train_vecs, y_train)
    joblib.dump(classifier, 'svm_data/svm_model/model.pkl')

    test_score = classifier.score(test_vecs, y_test)
    print(test_score)

对输入句子情感进行判断

#### Classify the sentiment of a single sentence
def svm_predict(string):
    """Print ``string`` followed by its predicted sentiment label.

    The sentence is tokenized with jieba, converted to a feature vector by
    ``get_predict_vecs`` (defined outside this block) and classified with the
    model stored at ``./svm_data/svm_model/model.pkl``.  A prediction of 1 is
    reported as positive, anything else as negative.
    """
    tokens = jieba.lcut(string)
    features = get_predict_vecs(tokens)
    model = joblib.load('./svm_data/svm_model/model.pkl')

    predicted = model.predict(features)

    # Only the first prediction is inspected
    label = ' positive' if int(predicted[0]) == 1 else ' negative'
    print(string, label)

# Example: classify one product review.  The literal must use ASCII quotes;
# the typographic quotes it was pasted with are a SyntaxError in Python.
string='电池充完了电连手机都打不开.简直烂的要命.真是金玉其外,败絮其中!连5号电池都不如'
svm_predict(string)

输出:
电池充完了电连手机都打不开.简直烂的要命.真是金玉其外,败絮其中!连5号电池都不如 negative
__main__:48: DeprecationWarning: Call to deprecated `__getitem__` (Method will be removed in 4.0.0, use self.wv.__getitem__() instead).

你可能感兴趣的:(机器学习,机器学习实验,趣味应用)