from gensim.models import Word2Vec
import pandas as pd
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
embedding_size = 32          # dimensionality of the embedding vectors
max_vocabulary_size = 50000  # maximum number of distinct items in the vocabulary
min_occurrence = 10          # discard items that appear fewer than this many times
skip_window = 3              # how many items to consider to the left and right of the target
num_skips = 2                # how many labels to generate per input (no direct gensim equivalent; unused below)
num_sampled = 64             # number of negative samples
# Read the ratings data
data_file = "C:/project/data/movielens-m1/ratings.dat"
orig_data = pd.read_csv(data_file, sep="::", engine="python",  # multi-char separators need the python engine
                        names=["user_id", "item_id", "score", "timestamp"],
                        dtype={"user_id": int, "item_id": str, "score": int, "timestamp": int})
# Group item_ids by user_id so each user's rated items form one sequence
grouped_data = orig_data.groupby("user_id")["item_id"].apply(",".join).reset_index()
grouped_data.columns = ["user_id", "item_ids"]
grouped_data["item_ids_array"] = grouped_data["item_ids"].apply(lambda s: s.split(","))
sentences = grouped_data["item_ids_array"]
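# Each "sentence" is one user's list of item ids; gensim's Word2Vec expects
# exactly this shape of input: an iterable of token lists. Optional peek:
logging.info("first user's sequence (truncated): %s", sentences.iloc[0][:10])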
# class gensim.models.word2vec.Word2Vec(sentences=None, corpus_file=None, size=100,
#     alpha=0.025, window=5, min_count=5, max_vocab_size=None, sample=0.001, seed=1,
#     workers=3, min_alpha=0.0001, sg=0, hs=0, negative=5, ns_exponent=0.75,
#     cbow_mean=1, hashfxn=<built-in function hash>, iter=5, null_word=0, trim_rule=None,
#     sorted_vocab=1, batch_words=10000, compute_loss=False, callbacks=(), max_final_vocab=None)
# Note: max_vocab_size only caps memory while the vocabulary is being built
# (pruning infrequent items along the way); max_final_vocab caps the final size.
model = Word2Vec(size=embedding_size, window=skip_window, negative=num_sampled,
                 min_count=min_occurrence, max_vocab_size=max_vocabulary_size)
# train(self, sentences=None, corpus_file=None, total_examples=None, total_words=None,
# epochs=None, start_alpha=None, end_alpha=None, word_count=0, queue_factor=2,
# report_delay=1.0, compute_loss=False, callbacks=())
model.build_vocab(sentences)
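# Log the vocabulary actually kept after min_count/max_vocab_size pruning
# (model.wv.vocab is the gensim 3.x attribute; gensim 4.x uses model.wv.key_to_index).
logging.info("vocabulary size after build_vocab: %d", len(model.wv.vocab))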
# total_examples (the sentence count) is required when training on an in-memory
# iterable; build_vocab() recorded it in model.corpus_count. Passing
# max_vocabulary_size as total_words would distort progress reporting and alpha decay.
model.train(sentences, total_examples=model.corpus_count, epochs=500,
            start_alpha=0.1, end_alpha=0.02, compute_loss=True)
# Persist the full model and the plain-text vectors (the "gensim" directory must exist).
model.save("gensim/item2vec.model")
# total_vec is omitted: gensim writes the actual vocabulary size in the header by
# default; hard-coding max_vocabulary_size would be wrong after pruning.
model.wv.save_word2vec_format("gensim/item2vec.txt")
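# Minimal usage sketch (assumption: "1" is an item_id that survived pruning;
# substitute any id from your own vocabulary).
loaded = Word2Vec.load("gensim/item2vec.model")
for item_id, similarity in loaded.wv.most_similar("1", topn=5):
    print(item_id, similarity)  # the 5 items embedded closest to item "1"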