词向量存pkl格式

import os
import numpy as np

def load_cn_wiki_wordvec(s_word_vec_path):
    """Load a text-format word-embedding file into in-memory lookup tables.

    The file is read as UTF-8. Its first line is a header with two
    space-separated integers: the token count and the vector dimension.
    Every following line holds a token followed by its space-separated
    float components. Lines that are empty after stripping trailing
    whitespace are skipped.

    Args:
        s_word_vec_path: Path of the embedding text file.

    Returns:
        A dict with three entries:
            "word_vec": token -> numpy array of that token's components.
            "word_id":  token -> zero-based line index (file line number
                        minus one, i.e. not counting the header).
            "word_dim": dimension declared in the header (0 when the file
                        is empty and therefore has no header).

    Raises:
        ValueError: If a header field is not an integer or a vector
            component is not a float.
        IndexError: If the header has fewer than two fields.
    """
    n_vocab_num = 0
    n_word_dim = 0
    m_word_vec = {}
    m_word_id = {}
    with open(s_word_vec_path, "r", encoding="utf-8") as fp:
        for i, s_line in enumerate(fp):
            if i == 0:
                # Header line: "<token count> <vector dimension>".
                ls_header = s_line.strip().split(" ")
                n_vocab_num = int(ls_header[0])
                n_word_dim = int(ls_header[1])
                print("token num ", n_vocab_num)
                print("word dim ", n_word_dim)
                continue

            # Only trailing whitespace is stripped so that the token in the
            # first column is left exactly as written in the file.
            s_row = s_line.rstrip()
            if not s_row:
                # A blank line carries no token; do not register "" as a word.
                continue

            ls_fields = s_row.split(" ")
            s_token = ls_fields[0]
            m_word_vec[s_token] = np.array([float(e) for e in ls_fields[1:]])
            m_word_id[s_token] = i - 1

            # Progress report for large embedding files.
            if i % 10000 == 0:
                print("load %s %s" % (str(i), s_token))

    return {
        "word_vec": m_word_vec,
        "word_id": m_word_id,
        "word_dim": n_word_dim,
    }

if __name__ == '__main__':
    import pickle

    # Text embedding file to convert; the pickle is written next to it
    # under the same name with ".pkl" appended.
    s_src_path = 'tencent-ailab-embedding-zh-d200-v0.2.0-s.txt'
    s_dst_path = s_src_path + '.pkl'

    # Only the token -> vector mapping is persisted; the id table and the
    # dimension returned by the loader are not written out.
    m_vectors = load_cn_wiki_wordvec(s_src_path)['word_vec']

    with open(s_dst_path, 'wb') as fp_out:
        pickle.dump(m_vectors, fp_out)

你可能感兴趣的:(ML,nlp,深度学习)