得到句子分词后的结果,并把类别标签保存为y_train.npy,y_test.npy
# Load the spreadsheets, tokenize every sentence, and split into train/test.
def loadfile():
    """Load the labelled corpora, tokenize them, and split into train/test.

    Reads the negative and positive spreadsheets from ``config.NEG_PATH`` and
    ``config.POS_PATH`` (read without a header row; column 0 is tokenized),
    runs ``jieba.cut`` over every entry, and splits the combined data 80/20
    with ``train_test_split``.

    Side effects:
        Saves the label arrays to ``./svm_data/y_train.npy`` and
        ``./svm_data/y_test.npy``.

    Returns:
        tuple: ``(x_train, x_test)``, the tokenized sentences of each split.
    """
    # ``index=None`` was dropped from both calls: ``read_excel`` has no
    # ``index`` parameter and current pandas raises TypeError for it.
    neg = pd.read_excel(config.NEG_PATH, header=None)
    pos = pd.read_excel(config.POS_PATH, header=None)

    def tokenize(text):
        return list(jieba.cut(text))

    pos['words'] = pos[0].apply(tokenize)
    neg['words'] = neg[0].apply(tokenize)

    # Use 1 for positive sentiment, 0 for negative. Positives come first so
    # the labels line up with the concatenation order of the samples below.
    y = np.concatenate((np.ones(len(pos)), np.zeros(len(neg))))
    x = np.concatenate((pos['words'], neg['words']))

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
    np.save('./svm_data/y_train.npy', y_train)
    np.save('./svm_data/y_test.npy', y_test)
    return x_train, x_test
计算词向量并保存为train_vecs.npy,test_vecs.npy
get_train_vecs(x_train,x_test)
def get_train_vecs(x_train, x_test):
    """Train a Word2Vec model and save feature vectors for both splits.

    Builds the vocabulary from ``x_train``, trains the model, and converts
    every sentence to a vector with ``buildWordVector`` (defined elsewhere;
    presumably it returns one row per sentence -- confirm against its
    definition).

    Side effects:
        Saves ``./svm_data/train_vecs.npy`` and ``./svm_data/test_vecs.npy``,
        saves the model to ``./svm_data/w2v_model/w2v_model.pkl``, and prints
        the shape of each vector array.

    Args:
        x_train: tokenized training sentences.
        x_test: tokenized test sentences.
    """
    n_dim = 300

    # Initialize the model and build the vocabulary from the training split.
    # NOTE(review): ``size=`` is the gensim 3.x keyword; gensim 4 renamed it
    # to ``vector_size`` -- confirm the installed version before upgrading.
    imdb_w2v = Word2Vec(size=n_dim, min_count=10)
    imdb_w2v.build_vocab(x_train)

    # Train the model over the training sentences (may take several minutes).
    imdb_w2v.train(x_train, total_examples=imdb_w2v.corpus_count, epochs=2)

    train_vecs = np.concatenate(
        [buildWordVector(z, n_dim, imdb_w2v) for z in x_train]
    )
    np.save('./svm_data/train_vecs.npy', train_vecs)
    print(train_vecs.shape)

    # Continue training on the test sentences. ``total_examples`` must be the
    # number of sentences actually passed in, not the training-corpus count
    # that ``build_vocab`` recorded in ``corpus_count``.
    imdb_w2v.train(x_test, total_examples=len(x_test), epochs=2)
    imdb_w2v.save('./svm_data/w2v_model/w2v_model.pkl')

    # Build the test vectors with the updated model.
    test_vecs = np.concatenate(
        [buildWordVector(z, n_dim, imdb_w2v) for z in x_test]
    )
    np.save('./svm_data/test_vecs.npy', test_vecs)
    print(test_vecs.shape)
导入训练数据和测试数据:train_vecs, y_train, test_vecs, y_test = get_data()
def get_data(data_dir='./svm_data'):
    """Load the saved feature vectors and labels.

    Args:
        data_dir: directory holding ``train_vecs.npy``, ``y_train.npy``,
            ``test_vecs.npy`` and ``y_test.npy``. Defaults to ``./svm_data``,
            the location the other functions in this file write to.

    Returns:
        tuple: ``(train_vecs, y_train, test_vecs, y_test)`` as loaded by
        ``np.load``.
    """
    import os

    names = ('train_vecs', 'y_train', 'test_vecs', 'y_test')
    train_vecs, y_train, test_vecs, y_test = (
        np.load(os.path.join(data_dir, name + '.npy')) for name in names
    )
    return train_vecs, y_train, test_vecs, y_test
训练svm并保存模型
from sklearn.svm import SVC
from sklearn.externals import joblib
def svm_train(train_vecs, y_train, test_vecs, y_test):
    """Fit an RBF-kernel SVC, persist it, and print its score on the test set.

    Args:
        train_vecs: feature vectors used for fitting.
        y_train: labels matching ``train_vecs``.
        test_vecs: feature vectors used for scoring.
        y_test: labels matching ``test_vecs``.

    Side effects:
        Dumps the fitted classifier to ``svm_data/svm_model/model.pkl`` and
        prints the value returned by ``score``.
    """
    classifier = SVC(kernel='rbf', verbose=True)
    classifier.fit(train_vecs, y_train)

    # Persist before scoring so the model is saved even if scoring fails.
    joblib.dump(classifier, 'svm_data/svm_model/model.pkl')

    test_score = classifier.score(test_vecs, y_test)
    print(test_score)
对输入句子情感进行判断
# Classify the sentiment of a single sentence.
def svm_predict(string):
    """Print ``string`` followed by its predicted sentiment.

    The sentence is tokenized with ``jieba.lcut``, turned into features by
    ``get_predict_vecs`` (defined elsewhere), and classified by the model
    loaded from ``./svm_data/svm_model/model.pkl``.

    Args:
        string: the sentence to classify.
    """
    tokens = jieba.lcut(string)
    features = get_predict_vecs(tokens)

    model = joblib.load('./svm_data/svm_model/model.pkl')
    prediction = model.predict(features)

    # A first prediction of 1 is printed as positive; anything else as negative.
    label = ' positive' if int(prediction[0]) == 1 else ' negative'
    print(string, label)
# Example: classify one review. The literal needs ASCII quotes; the
# typographic quotes it was pasted with are a SyntaxError.
string = '电池充完了电连手机都打不开.简直烂的要命.真是金玉其外,败絮其中!连5号电池都不如'
svm_predict(string)
输出:
电池充完了电连手机都打不开.简直烂的要命.真是金玉其外,败絮其中!连5号电池都不如 negative
__main__:48: DeprecationWarning: Call to deprecated `__getitem__` (Method will be removed in 4.0.0, use self.wv.__getitem__() instead).