from gensim.models import word2vec  # Result quality depends on the training corpus.
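# e.g. [['I', 'am', 'handsome'], ['Mu', 'wen', 'looks', 'cool'], ...]
# Read the sentences from the file with a generator.
# This suits large files, because the corpus never has to be loaded into memory at once.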
class MySentences(object):
    """Stream the sentences of a text file as lists of tokens.

    Every line of the file is treated as one sentence and is split on
    whitespace. The file is opened anew on each iteration, so the same
    object can be iterated more than once.
    """

    def __init__(self, fname):
        """Store the corpus path; the file is not opened until iteration.

        Args:
            fname: Path of the text file to read, one sentence per line.
        """
        self.fname = fname

    def __iter__(self):
        """Yield each line of the file as a list of whitespace-separated tokens.

        A blank line yields an empty list.
        """
        # The context manager closes the handle when iteration ends (or the
        # generator is closed), instead of leaking it as a bare open() did.
        with open(self.fname, 'r') as corpus:
            for line in corpus:
                yield line.split()
# Model training function
def w2vTrain(f_input, model_output):
    """Train a Word2Vec model on a corpus file and save it to disk.

    The hyper-parameters come from the module-level names ``MIN_COUNT``,
    ``CPU_NUM``, ``VEC_SIZE`` and ``CONTEXT_WINDOW``.

    Args:
        f_input: Corpus file name; appended to the module-level ``DataDir``.
        model_output: Model file name; appended to the module-level
            ``ModelDir``.
    """
    corpus = MySentences(DataDir + f_input)
    training_options = {
        'min_count': MIN_COUNT,
        'workers': CPU_NUM,
        'size': VEC_SIZE,
        'window': CONTEXT_WINDOW,
    }
    trained = word2vec.Word2Vec(corpus, **training_options)
    trained.save(ModelDir + model_output)
# Training
# These statements were collapsed into a single comment line, which left every
# name below undefined; they are restored here one statement per line.
DataDir = "./"
ModelDir = "./ipynb_garbage_files/"
MIN_COUNT = 4
CPU_NUM = 2  # Cython must be installed beforehand to support parallel training
VEC_SIZE = 20
CONTEXT_WINDOW = 5  # context reaches at most 5 words away from the target word
f_input = "bioCorpus_5000.txt"
model_output = "test_w2v_model"
w2vTrain(f_input, model_output)

# Reload the model that was just saved and look up the neighbours of 'body'.
# The result is only displayed in an interactive session; a script discards it.
w2v_model = word2vec.Word2Vec.load(ModelDir+model_output)
w2v_model.wv.most_similar('body')
import multiprocessing
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
# Train a second model directly from a text file with LineSentence, using one
# worker per CPU core, then save it.
vocab = 'vocab.txt'
model = Word2Vec(
    LineSentence(vocab),
    size=32,
    window=5,
    min_count=5,
    workers=multiprocessing.cpu_count(),
)
model.save('text/model')

# Inspect the learned vocabulary and the vector of one word ('男人': man).
print(model.wv.vocab)
print(model.wv['男人'])

# Similarity queries ('军事': military, '政治': politics, '经济': economy).
# All of them go through model.wv: calling most_similar / most_similar_cosmul
# on the model itself is deprecated and was removed in gensim 4.
# The results are only displayed in an interactive session; a script discards them.
model.wv.most_similar('军事')
model.wv.most_similar_cosmul('军事')
model.wv.most_similar(positive=['军事', '政治'], negative=['经济'])
model.wv.similarity('男人', '军事')
# Pick the odd one out of: China, USA, Syria, fruit.
model.wv.doesnt_match(['中国', '美国', '叙利亚', '水果'])
# Model tuning
# Stop words
# This section was collapsed into comment lines and never executed; it is
# restored here one statement per line.
from nltk.corpus import stopwords
StopWords = stopwords.words('english')


# Retrain
# Model training function
def w2vTrain_removeStopWords(f_input, model_output):
    """Train and save a Word2Vec model after removing stop words.

    Unlike streaming training, the filtered corpus is built fully in memory
    before it is handed to Word2Vec. The hyper-parameters come from the
    module-level names ``MIN_COUNT``, ``CPU_NUM`` and ``VEC_SIZE``; no
    ``window`` is passed, so gensim's default applies.

    Args:
        f_input: Corpus file name; appended to the module-level ``DataDir``.
        model_output: Model file name; appended to the module-level
            ``ModelDir``.
    """
    # A set gives constant-time membership tests for every token.
    stop_words = set(StopWords)
    sentences = [
        [w for w in sentence if w not in stop_words]
        for sentence in MySentences(DataDir+f_input)
    ]
    w2v_model = word2vec.Word2Vec(sentences, min_count=MIN_COUNT, workers=CPU_NUM, size=VEC_SIZE)
    w2v_model.save(ModelDir+model_output)


w2vTrain_removeStopWords(f_input, model_output)
w2v_model = word2vec.Word2Vec.load(ModelDir+model_output)
# Query through .wv; most_similar on the model itself is deprecated.
w2v_model.wv.most_similar('body')
# Save only the word vectors (without the full training state) and load them
# back memory-mapped in read-only mode.
from gensim.models import KeyedVectors
from gensim.test.utils import get_tmpfile

path = get_tmpfile("wordvectors.kv")

model.wv.save(path)
# Load from the path that was just written; the original snippet loaded the
# unrelated file "model.wv", which nothing here ever saves.
wv = KeyedVectors.load(path, mmap='r')
# NOTE(review): raises KeyError if 'computer' is not in the trained vocabulary.
vector = wv['computer']