# word2vec  (scraped blog title; commented out — the bare name raised NameError at import)

import json
from gensim.models import Word2Vec, Doc2Vec
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from sklearn.model_selection import KFold
from xgboost import XGBClassifier
import sys


# Dimensionality of the word embeddings.
emb_size = 200

# Load the pre-tokenised corpora.  Each JSON file holds parallel lists:
# 'words' (token lists, one per sentence) and 'label' (per-sentence labels).
# Fix: the original called json.load(open(...)) and leaked the file handles;
# use context managers so they are closed deterministically.
with open('data/result.json', 'r', encoding='utf-8') as f:
    fenci = json.load(f)
with open('data/result_git.json', 'r', encoding='utf-8') as f:
    fenci_git = json.load(f)

X = fenci['words']
Y = fenci['label']
print(len(X), len(Y))
# Augment the corpus with the first 2000 samples from the second source.
X.extend(fenci_git['words'][:2000])
Y.extend(fenci_git['label'][:2000])


# 'test' mode reloads a previously trained embedding model; any other mode
# trains a fresh Word2Vec on the corpus and saves it under a run-specific
# name taken from the second CLI argument.
run_mode = sys.argv[1]
if run_mode != 'test':
    model = Word2Vec(X, size=emb_size, window=5, min_count=5, workers=4)
    model.save('car%s.model' % sys.argv[2])
else:
    model = Word2Vec.load('car.model')

def _sentence_vector(tokens):
    """Embed one sentence as the mean of its in-vocabulary word vectors.

    Tokens missing from the Word2Vec vocabulary are skipped; a sentence
    with no known tokens maps to the zero vector of size ``emb_size``.
    """
    known = [model.wv[t] for t in tokens if t in model.wv]
    if known:
        return np.mean(known, axis=0)
    return np.zeros(emb_size)


# Build the feature matrix (one averaged embedding per sentence) and the
# label vector as NumPy arrays for the classifiers below.
X_vec = np.array([_sentence_vector(sentence) for sentence in X])
Y = np.array(Y)

# Evaluate with 5-fold cross-validation (fixed seed for reproducibility),
# printing a per-fold classification report.
kf = KFold(n_splits=5, shuffle=True, random_state=2019)
# NOTE(review): gamma has no effect with a linear kernel — confirm intent.
clf = SVC(C=2, gamma=10.0, kernel='linear')
# clf = XGBClassifier(learning_rate=0.4, max_depth=10, n_estimators=50)

for fold_train, fold_test in kf.split(X_vec):
    clf.fit(X_vec[fold_train], Y[fold_train])
    predictions = clf.predict(X_vec[fold_test])
    print(classification_report(Y[fold_test], predictions))

# 你可能感兴趣的:(word2vec)  — blog footer residue ("You may also be interested in: word2vec"); commented out