word2vec

使用 `from gensim.models import word2vec` 导入。训练结果的好坏跟训练词库有关。

#[['I', 'am', 'hansome'], ['Mu', 'wen', 'looks', 'cool'], ...]

# 用生成器的方式读取文件里的句子,适合读取大容量文件,而不用加载到内存

# Read sentences lazily from a file, one line at a time, so large corpora
# never have to be loaded into memory at once.
class MySentences(object):
    """Iterable over a text file: each iteration yields one whitespace-tokenized line."""

    def __init__(self, fname):
        # Path of the corpus file to stream from.
        self.fname = fname

    def __iter__(self):
        # Re-open the file on every iteration so the object is re-iterable
        # (gensim iterates the corpus multiple times during training).
        with open(self.fname, 'r') as fh:  # 'with' guarantees the handle is closed
            for line in fh:
                # BUG FIX: the original had `yield` dedented outside the for-loop
                # body (an IndentationError); it belongs inside the loop.
                yield line.split()

# 模型训练函数

# Train a word2vec model from a corpus file and persist it to disk.
def w2vTrain(f_input, model_output):
    """Train word2vec on DataDir+f_input and save it as ModelDir+model_output.

    Relies on the module-level settings DataDir, ModelDir, MIN_COUNT,
    CPU_NUM, VEC_SIZE and CONTEXT_WINDOW.
    """
    corpus = MySentences(DataDir + f_input)
    model = word2vec.Word2Vec(
        corpus,
        min_count=MIN_COUNT,
        workers=CPU_NUM,
        size=VEC_SIZE,
        window=CONTEXT_WINDOW,
    )
    model.save(ModelDir + model_output)

# 训练DataDir = "./"ModelDir = "./ipynb_garbage_files/"MIN_COUNT = 4CPU_NUM = 2 # 需要预先安装 Cython 以支持并行VEC_SIZE = 20CONTEXT_WINDOW = 5 # 提取目标词上下文距离最长5个词f_input = "bioCorpus_5000.txt"model_output = "test_w2v_model"w2vTrain(f_input, model_output)

# Reload the persisted model from disk and sanity-check it with a query.
w2v_model = word2vec.Word2Vec.load(ModelDir+model_output)

# Nearest neighbours of 'body' in the trained embedding space.
w2v_model.wv.most_similar('body')

import multiprocessing
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
# Corpus file: one pre-tokenized sentence per line (LineSentence splits on whitespace).
vocab = 'vocab.txt'
# NOTE(review): `size=` is the gensim 3.x parameter name; gensim 4+ renamed it
# to `vector_size` — confirm which gensim version this targets.
model = Word2Vec(LineSentence(vocab), size=32, window=5, min_count=5,workers=multiprocessing.cpu_count())

model.save('text/model')
print(model.wv.vocab)
print(model.wv['男人'])
# Similarity queries on the (Chinese) corpus: nearest neighbours of '军事'.
model.most_similar('军事')
model.most_similar_cosmul('军事')
# Vector analogy: positive terms pull the result closer, negative push it away.
model.wv.most_similar(positive=['军事', '政治'], negative=['经济'])
model.wv.similarity('男人','军事')
# Odd-one-out: which word does not belong with the others?
model.wv.doesnt_match(['中国','美国','叙利亚','水果'])

模型调优

# 停止词from nltk.corpus import stopwords StopWords = stopwords.words('english')

# Retrain with English stop words filtered out of every sentence.
# (The original had the function definition and the three calls below fused
# onto a single line.)
def w2vTrain_removeStopWords(f_input, model_output):
    """Train word2vec on DataDir+f_input with stop words removed; save the model.

    Relies on the module-level MySentences, StopWords, DataDir, ModelDir,
    MIN_COUNT, CPU_NUM and VEC_SIZE.
    """
    sentences = list(MySentences(DataDir + f_input))
    stop_set = set(StopWords)  # set gives O(1) membership tests vs O(n) on a list
    for idx, sentence in enumerate(sentences):
        sentences[idx] = [w for w in sentence if w not in stop_set]
    w2v_model = word2vec.Word2Vec(sentences, min_count=MIN_COUNT,
                                  workers=CPU_NUM, size=VEC_SIZE)
    w2v_model.save(ModelDir + model_output)

w2vTrain_removeStopWords(f_input, model_output)
w2v_model = word2vec.Word2Vec.load(ModelDir + model_output)

# Compare neighbours of 'body' against the model trained with stop words kept.
w2v_model.most_similar('body')

>>> from gensim.models import KeyedVectors
>>>
>>> path = get_tmpfile("wordvectors.kv")
>>>
>>> model.wv.save(path)
>>> wv = KeyedVectors.load("model.wv", mmap='r')
>>> vector = wv['computer']  

你可能感兴趣的:(机器学习算法)