# load_word2vec_format takes the path of the vectors file and opens/closes it
# itself, so pass the path rather than an already-open text-mode handle (which
# would never be closed).
# NOTE(review): a "could not broadcast ... shape (0,)" error while parsing the
# text format likely means a row in the file has an empty/whitespace token
# (e.g. "\n" ended up in the vocabulary) -- check how the vectors were trained.
model = KeyedVectors.load_word2vec_format(word2vec_file, binary=False, unicode_errors='replace')
就这一行代码,总是报错:
    self.vectors[target_index] = vector
ValueError: could not broadcast input array from shape (0,) into shape (300,)
查阅了各种资料……搞了两三个小时还是没解决。
最后换成以二进制格式存储模型,世界和平了……
from gensim.models import Word2Vec
def train_gensim(dim):
    """Train a Word2Vec model on ``all_context.txt`` and save it to disk.

    Each line of the corpus is treated as one sentence and split on
    whitespace into a list of tokens, which is the input shape gensim
    expects.  Passing the raw line strings instead makes gensim iterate every
    string character by character, so spaces and the trailing newline end up
    as vocabulary entries.

    The corpus is assumed to be pre-tokenized (one sentence per line, tokens
    separated by whitespace); unsegmented text must be segmented first.

    Args:
        dim: Dimensionality of the word vectors (passed as ``vector_size``).

    The trained model is written to ``all_context_word2vec_s{dim}.wv`` in the
    current working directory.
    """
    import multiprocessing

    with open("all_context.txt") as f:
        # str.split() with no argument drops the newline and collapses runs of
        # whitespace; blank lines yield no tokens and are skipped.
        sentences = [tokens for tokens in map(str.split, f) if tokens]

    model = Word2Vec(
        sentences=sentences,
        vector_size=dim,
        window=5,
        min_count=1,  # keep every token, even ones seen only once
        workers=multiprocessing.cpu_count(),
    )
    model.save("all_context_word2vec_s{}.wv".format(dim))
# Train and save the model with 300-dimensional vectors.
train_gensim(dim=300)
from gensim.models import KeyedVectors
# The file was written by Word2Vec.save(), so it holds a full Word2Vec model;
# load it with the matching class (the vectors live under its .wv attribute).
w2vmodel = Word2Vec.load("all_context_word2vec_s300.wv")
print(w2vmodel)
# Number of rows in the embedding matrix, i.e. the vocabulary size.
print(w2vmodel.wv.vectors.shape[0])
>>> Word2Vec(vocab=1774, vector_size=300, alpha=0.025)
>>> 1774