利用Word2Vec可以计算电影所有标签词之间的关系程度,可用于计算电影之间的相似度
word2vec是google在2013年开源的一个NLP(Natural Language Processing自然语言处理) 工具,它的特点是将所有的词向量化,这样词与词之间就可以定量的去度量他们之间的关系,挖掘词之间的联系。
one-hot vector VS. word vector
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-F0oUMMMu-1583649631397)(img/word2vec1.png)]
one hot vector的问题
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-qdJZjGQ0-1583649631398)(img/word2vec2.png)]
有了用Dristributed representation表示的较短的词向量,就可以较容易的分析词之间的关系,比如将词的维度降维到2维,用下图的词向量表示我们的词时,发现: K i n g ⃗ − M a n ⃗ + W o m a n ⃗ = Q u e e n ⃗ \vec{King} - \vec{Man} + \vec{Woman} = \vec{Queen} King−Man+Woman=Queen
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-QESKPVEc-1583649631398)(img/word2vec3.png)]
什么是word vector(词向量)
animal | pet | |
dog | -0.4 | 0.02 |
lion | 0.2 | 0.35 |
animal那一列表示的就是左边的词与animal这个概念的”距离“
from gensim.models import TfidfModel
import pandas as pd
import numpy as np
def get_movie_dataset():
# 加载基于所有电影的标签
# all-tags.csv来自ml-latest数据集中
# 由于ml-latest-small中标签数据太多,因此借助其来扩充
_tags = pd.read_csv("datasets/ml-latest-small/all-tags.csv", usecols=range(1, 3)).dropna()
tags = _tags.groupby("movieId").agg(list)
# 加载电影列表数据集
movies = pd.read_csv("datasets/ml-latest-small/movies.csv", index_col="movieId")
# 将类别词分开
movies["genres"] = movies["genres"].apply(lambda x: x.split("|"))
# 为每部电影匹配对应的标签数据,如果没有将会是NAN
movies_index = set(movies.index) & set(tags.index)
new_tags = tags.loc[list(movies_index)]
ret = movies.join(new_tags)
# 构建电影数据集,包含电影Id、电影名称、类别、标签四个字段
# 如果电影没有标签数据,那么就替换为空列表
movie_dataset = pd.DataFrame(
map(
lambda x: (x[0], x[1], x[2], x[2]+x[3]) if x[3] is not np.nan else (x[0], x[1], x[2], []), ret.itertuples())
, columns=["movieId", "title", "genres","tags"]
)
movie_dataset.set_index("movieId", inplace=True)
return movie_dataset
def create_movie_profile(movie_dataset):
'''
使用tfidf,分析提取topn关键词
:param movie_dataset:
:return:
'''
dataset = movie_dataset["tags"].values
from gensim.corpora import Dictionary
dct = Dictionary(dataset)
corpus = [dct.doc2bow(line) for line in dataset]
model = TfidfModel(corpus)
_movie_profile = []
for i, data in enumerate(movie_dataset.itertuples()):
mid = data[0]
title = data[1]
genres = data[2]
vector = model[corpus[i]]
movie_tags = sorted(vector, key=lambda x: x[1], reverse=True)[:30]
topN_tags_weights = dict(map(lambda x: (dct[x[0]], x[1]), movie_tags))
# 将类别词的添加进去,并设置权重值为1.0
for g in genres:
topN_tags_weights[g] = 1.0
topN_tags = [i[0] for i in topN_tags_weights.items()]
_movie_profile.append((mid, title, topN_tags, topN_tags_weights))
movie_profile = pd.DataFrame(_movie_profile, columns=["movieId", "title", "profile", "weights"])
movie_profile.set_index("movieId", inplace=True)
return movie_profile
movie_dataset = get_movie_dataset()
movie_profile = create_movie_profile(movie_dataset)
import gensim, logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
sentences = list(movie_profile["profile"].values)
model = gensim.models.Word2Vec(sentences, window=3, min_count=1, iter=20)
while True:
words = input("words: ") # action
ret = model.wv.most_similar(positive=[words], topn=10)
print(ret)
Doc2Vec是建立在Word2Vec上的,用于直接计算以文档为单位的文档向量,这里我们将一部电影的所有标签词,作为整个文档,这样可以计算出每部电影的向量,通过计算向量之间的距离,来判断用于计算电影之间的相似程度。
这样可以解决物品冷启动问题
from gensim.models import TfidfModel
import pandas as pd
import numpy as np
from pprint import pprint
def get_movie_dataset():
# 加载基于所有电影的标签
# all-tags.csv来自ml-latest数据集中
# 由于ml-latest-small中标签数据太多,因此借助其来扩充
_tags = pd.read_csv("datasets/ml-latest-small/all-tags.csv", usecols=range(1, 3)).dropna()
tags = _tags.groupby("movieId").agg(list)
# 加载电影列表数据集
movies = pd.read_csv("datasets/ml-latest-small/movies.csv", index_col="movieId")
# 将类别词分开
movies["genres"] = movies["genres"].apply(lambda x: x.split("|"))
# 为每部电影匹配对应的标签数据,如果没有将会是NAN
movies_index = set(movies.index) & set(tags.index)
new_tags = tags.loc[list(movies_index)]
ret = movies.join(new_tags)
# 构建电影数据集,包含电影Id、电影名称、类别、标签四个字段
# 如果电影没有标签数据,那么就替换为空列表
movie_dataset = pd.DataFrame(
map(
lambda x: (x[0], x[1], x[2], x[2]+x[3]) if x[3] is not np.nan else (x[0], x[1], x[2], []), ret.itertuples())
, columns=["movieId", "title", "genres","tags"]
)
movie_dataset.set_index("movieId", inplace=True)
return movie_dataset
def create_movie_profile(movie_dataset):
'''
使用tfidf,分析提取topn关键词
:param movie_dataset:
:return:
'''
dataset = movie_dataset["tags"].values
from gensim.corpora import Dictionary
dct = Dictionary(dataset)
corpus = [dct.doc2bow(line) for line in dataset]
model = TfidfModel(corpus)
_movie_profile = []
for i, data in enumerate(movie_dataset.itertuples()):
mid = data[0]
title = data[1]
genres = data[2]
vector = model[corpus[i]]
movie_tags = sorted(vector, key=lambda x: x[1], reverse=True)[:30]
topN_tags_weights = dict(map(lambda x: (dct[x[0]], x[1]), movie_tags))
# 将类别词的添加进去,并设置权重值为1.0
for g in genres:
topN_tags_weights[g] = 1.0
topN_tags = [i[0] for i in topN_tags_weights.items()]
_movie_profile.append((mid, title, topN_tags, topN_tags_weights))
movie_profile = pd.DataFrame(_movie_profile, columns=["movieId", "title", "profile", "weights"])
movie_profile.set_index("movieId", inplace=True)
return movie_profile
movie_dataset = get_movie_dataset()
movie_profile = create_movie_profile(movie_dataset)
import gensim, logging
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
documents = [TaggedDocument(words, [movie_id]) for movie_id, words in movie_profile["profile"].iteritems()]
# 训练模型并保存
model = Doc2Vec(documents, vector_size=100, window=3, min_count=1, workers=4, epochs=20)
from gensim.test.utils import get_tmpfile
fname = get_tmpfile("my_doc2vec_model")
model.save(fname)
words = movie_profile["profile"].loc[6]
print(words)
inferred_vector = model.infer_vector(words)
sims = model.docvecs.most_similar([inferred_vector], topn=10)
print(sims)