word2vec

import json
import jieba
from gensim.models import word2vec
from gensim.models import FastText
import logging

logging.basicConfig(format='%(asctime)s:%(levelname)s:%(message)s', level=logging.INFO)

def get_sentence(data_file):
    """Read a JSON-lines file and return the list of raw sentences."""
    sentences = []
    with open(data_file, 'r', encoding='utf-8') as f:
        for line in f:
            record = json.loads(line.strip())
            sentences.append(record['sentence'])
    return sentences

def cut_sentence(train_data):
    """Tokenize each sentence with jieba so gensim receives lists of tokens."""
    return [list(jieba.cut(sentence)) for sentence in train_data]

if __name__ == '__main__':
    train_sentence = get_sentence('G:\\nlpPro\\data\\train.json')
    test_sentence = get_sentence('G:\\nlpPro\\data\\test.json')
    dev_sentence = get_sentence('G:\\nlpPro\\data\\dev.json')
    train_data = train_sentence+test_sentence+dev_sentence
    train_data = cut_sentence(train_data)
    print(len(train_data))
    # print(train_data)
    # Train FastText embeddings on the tokenized corpus; vector_size=4 is tiny and
    # only suitable for a quick demo.
    model = FastText(train_data, vector_size=4, window=3, min_count=1)

    # skip-gram vs. CBOW: Word2Vec trains with skip-gram when sg=1 and CBOW when sg=0 (the default).

    # model1 = word2vec.Word2Vec(train_data,sg=1,workers=4,min_count=5,vector_size=200)
    # topn must be passed as a keyword; a second positional argument would be read as `negative`.
    print(model.wv.most_similar(['智能'], topn=10))
    # model1.save("path")
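
Following up on the skip-gram/CBOW comment above, here is a minimal sketch of training both Word2Vec variants on the same tokenized corpus and saving/reloading one of them. The hyperparameters and the file name word2vec_sg.model are illustrative choices, not values from the script; only train_data is assumed to be the output of cut_sentence.

from gensim.models import Word2Vec

# Illustrative hyperparameters; sg is the only switch between the two architectures.
sg_model = Word2Vec(train_data, sg=1, vector_size=200, window=5, min_count=5, workers=4)    # skip-gram
cbow_model = Word2Vec(train_data, sg=0, vector_size=200, window=5, min_count=5, workers=4)  # CBOW

sg_model.save("word2vec_sg.model")               # hypothetical file name
reloaded = Word2Vec.load("word2vec_sg.model")    # reload later for similarity queries
print(reloaded.wv.most_similar('智能', topn=10))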
