Code for reading a GloVe file with gensim


import hashlib
import gensim

# Prepend one header line to the original file so it becomes a format gensim can read
def prepend_slow(infile, outfile, line):
    """
    Slower way to prepend the line by re-creating the input file.
    """
    with open(infile, 'r', encoding='utf-8') as fin:
        with open(outfile, 'w', encoding='utf-8') as fout:
            fout.write(line + "\n")
            for row in fin:
                fout.write(row)
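# A somewhat faster variant can copy the file body in one pass with shutil.copyfileobj
# instead of iterating line by line. A minimal sketch, not part of the original code:
import shutil

def prepend_fast(infile, outfile, line):
    """
    Copy the file body in bulk instead of line by line (sketch only).
    """
    with open(infile, 'r', encoding='utf-8') as fin:
        with open(outfile, 'w', encoding='utf-8') as fout:
            fout.write(line + "\n")
            shutil.copyfileobj(fin, fout)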


def checksum(filename):
    """
    This is to verify the file checksum is the same as the glove files we use to
    pre-computed the no. of lines in the glove file(s).
    """
    BLOCKSIZE = 65536
    hasher = hashlib.md5()
    with open(filename, 'rb') as afile:
        buf = afile.read(BLOCKSIZE)
        while len(buf) > 0:
            hasher.update(buf)
            buf = afile.read(BLOCKSIZE)
    return hasher.hexdigest()
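# To actually use the checksum, compare it against a known digest for the downloaded
# file. The value below is a placeholder for illustration, not the real MD5:
EXPECTED_MD5 = "0123456789abcdef0123456789abcdef"  # placeholder; substitute the published MD5
if checksum("glove.840B.300d.txt") != EXPECTED_MD5:
    raise ValueError("GloVe file checksum mismatch; the download may be corrupt.")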


# Pre-computed glove files values.
pretrain_num_lines = {"glove.840B.300d.txt": 2196017}

def check_num_lines_in_glove(filename, check_checksum=False):
    # check_checksum is accepted for compatibility but not used here.
    return pretrain_num_lines[filename]


# Input: GloVe Model File
# More models can be downloaded from http://nlp.stanford.edu/projects/glove/
glove_file = "glove.840B.300d.txt"
_, tokens, dimensions, _ = glove_file.split('.')
num_lines = check_num_lines_in_glove(glove_file)
dims = int(dimensions[:-1])

# Output: Gensim Model text format.
gensim_file = 'glove_model.txt'
gensim_first_line = "{} {}".format(num_lines, dims)
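# The word2vec text format gensim expects begins with a header line of the form
# "<vocab_size> <vector_size>", so for this file gensim_first_line is "2196017 300";
# every following line is a word plus its 300 space-separated float values.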

# Prepends the line.
prepend_slow(glove_file, gensim_file, gensim_first_line)
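# Alternative conversion (a sketch, assuming the gensim.scripts.glove2word2vec helper
# shipped with gensim 3.x is available):
#   from gensim.scripts.glove2word2vec import glove2word2vec
#   glove2word2vec(glove_file, gensim_file)
# With gensim >= 4.0 the conversion can be skipped and the GloVe file loaded directly:
#   model = gensim.models.KeyedVectors.load_word2vec_format(glove_file, binary=False, no_header=True)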

# Load model
model = gensim.models.KeyedVectors.load_word2vec_format('glove_model.txt')
model.syn0norm = model.syn0  # prevent recomputation of the normed vectors (gensim < 4.0 attribute names)

vector = model.word_vec('computer')  # obtain the word vector for 'computer'
print(model.most_similar(positive=['australia'], topn=10))
print(model.similarity('woman', 'man'))
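# most_similar also accepts negative examples for analogy-style queries, e.g. the
# classic king - man + woman lookup (results depend on the loaded vectors):
print(model.most_similar(positive=['woman', 'king'], negative=['man'], topn=3))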
