最初使用以下代码将 word2vec 模型导出为文本文件时,运行报错:UnicodeDecodeError: 'utf-8' codec can't decode byte 0xa6 in position 0: invalid start byte
import gensim
import codecs
def main():
    """Convert the hard-coded model file into its text dump."""
    source_model, target_text = 'file.bin', 'file.txt'
    bin2txt(source_model, target_text)
def bin2txt(path_to_model, output_file, binary=True):
    """Dump a word2vec model to a UTF-8 text file, one word per line.

    Each output line has the form ``word<TAB>v1,v2,...,vn``.

    Args:
        path_to_model: Path of the word2vec-format model to load.
        output_file: Path of the text file to write; it is opened in ``'w'``
            mode, so existing content is replaced.
        binary: Forwarded to ``load_word2vec_format``. The default ``True``
            keeps the original behaviour. Pass ``False`` if the model file
            is stored in the plain-text word2vec format; loading such a file
            with ``binary=True`` may fail with ``UnicodeDecodeError``.
    """
    # Load first so a failed load does not leave behind a truncated output.
    model = gensim.models.KeyedVectors.load_word2vec_format(
        path_to_model, binary=binary)
    print('Done loading Word2Vec!')

    # gensim >= 4.0 exposes the vocabulary as ``key_to_index``; older
    # releases use ``vocab``. Iterating either yields the words.
    vocab = getattr(model, 'key_to_index', None)
    if vocab is None:
        vocab = model.vocab

    # The context manager closes the file even if writing raises.
    with codecs.open(output_file, 'w', 'utf-8') as output:
        for item in vocab:
            vector_str = ",".join(str(dimension) for dimension in model[item])
            output.write(item + "\t" + vector_str + "\n")
# Run the conversion only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
后来调试发现是这一句报错了:
model = gensim.models.KeyedVectors.load_word2vec_format(path_to_model, binary=True)
将这一句的 binary 参数改为 False 后成功运行(这说明该模型文件很可能是以文本格式而非二进制格式保存的):
model = gensim.models.KeyedVectors.load_word2vec_format(path_to_model, binary=False)