Word2Vec expects its training corpus as a list of lists of tokens, so first fabricate some toy data:
from random import choice
ls_of_ls = [['cat', 'dog', 'fish'], ['car', 'plane', 'tank']]
ls_of_words = []  # list of tokenized sentences (as if produced by jieba.lcut)
for i in range(1500):
    ls = choice(ls_of_ls)  # pick one topic's word list
    ls_of_words.append([choice(ls) for _ in range(9, 15)])  # range(9, 15) has 6 elements, so 6 random tokens per sentence
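A quick shape check on the fabricated corpus (the exact tokens vary from run to run):

```python
# The corpus is a list of 1500 sentences, each a list of 6 random tokens.
print(len(ls_of_words))  # 1500
print(ls_of_words[0])    # one tokenized "sentence"
```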
from gensim.models import Word2Vec
model = Word2Vec(ls_of_words)
The main constructor parameters and their defaults (gensim 3.5.0):

Parameter | Description | Default |
---|---|---|
sentences | list of lists of tokens | None |
size | dimensionality of the word vectors | 100 |
window | maximum distance between the current and predicted word within a sentence | 5 |
min_count | ignore all words with total frequency lower than this | 5 |
workers | number of worker threads | 3 |
sg | 0: CBOW; 1: skip-gram | 0 |
hs | 1: hierarchical softmax; 0: negative sampling | 0 |
negative | number of negative samples | 5 |
ns_exponent | exponent used to shape the negative sampling distribution | 0.75 |
cbow_mean | 0: use the sum of the context word vectors; 1: use the mean | 1 |
alpha | initial learning rate | 0.025 |
min_alpha | minimum learning rate | 0.0001 |
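Spelled out with explicit keyword arguments, the call above is equivalent to the following sketch (gensim 3.5.0 signature; the values are just the defaults from the table):

```python
from gensim.models import Word2Vec

model = Word2Vec(
    sentences=ls_of_words,  # list of lists of tokens
    size=100,               # dimensionality of the word vectors
    window=5,               # max distance between current and predicted word
    min_count=5,            # drop words rarer than this
    workers=3,              # worker threads
    sg=0,                   # 0: CBOW; 1: skip-gram
    hs=0,                   # 0: negative sampling; 1: hierarchical softmax
    negative=5,             # number of negative samples
    ns_exponent=0.75,       # shape of the negative sampling distribution
    cbow_mean=1,            # 1: mean of context vectors; 0: sum
    alpha=0.025,            # initial learning rate
    min_alpha=0.0001,       # minimum learning rate
)
```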
print(model.wv.similar_by_word('car'))
print(model.wv.similarity('car', 'tank'))
[('plane', 0.9965242743492126),
('tank', 0.996138334274292),
('fish', 0.6187092661857605),
('cat', 0.6155150532722473),
('dog', 0.5961228609085083)]
0.9961384
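similarity is the cosine of the angle between the two word vectors, so it can be re-derived by hand; a minimal sketch (numpy is already a gensim dependency):

```python
import numpy as np

v1, v2 = model.wv['car'], model.wv['tank']
cosine = v1.dot(v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))
print(cosine)  # should match model.wv.similarity('car', 'tank')
```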
print(model.predict_output_word(['car']))
total = sum(i[1] for i in model.predict_output_word(['car']))
print('Sum of probabilities: %.2f' % total)
[('car', 0.2208322),
('tank', 0.20190619),
('plane', 0.19147001),
('cat', 0.1310298),
('dog', 0.1299157),
('fish', 0.124846116)]
Sum of probabilities: 1.00
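predict_output_word also accepts a multi-word context and a topn limit; a quick sketch with the same toy model:

```python
# Probability distribution over the output layer, given a two-word context;
# with topn=3 only the three most probable words are returned.
print(model.predict_output_word(['car', 'plane'], topn=3))
```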
from gensim.models import Word2Vec
ls_of_words = ['AaAaAa', 'BbBbBb'] * 9999  # each string iterates character by character, so the tokens are 'A', 'a', 'B', 'b'
model = Word2Vec(ls_of_words, size=6)
print('\033[033m%s\033[0m' % ' wv.index2word '.center(60, '-'))
print(model.wv.index2word)
print('\033[033m%s\033[0m' % ' wv.vectors '.center(60, '-'))
vectors = model.wv.vectors
print(vectors)
print(vectors.shape)
print('\033[033m%s\033[0m' % ' similar_by_vector '.center(60, '-'))
for i in range(4):
    print(model.wv.similar_by_vector(vectors[i]))
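Row i of wv.vectors is the embedding of wv.index2word[i], so the nearest vector to any row should be the word at the same index (cosine similarity 1.0). A small sketch that checks this alignment:

```python
# Sanity check: the top hit for each row vector is the corresponding word.
for word, vec in zip(model.wv.index2word, model.wv.vectors):
    assert model.wv.similar_by_vector(vec, topn=1)[0][0] == word
```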
from gensim.models import Word2Vec
from seaborn import heatmap
from matplotlib import pyplot
import numpy as np
np.random.seed(0)
n = 8
sentences = [
    ''.join(chr(i) for i in range(30, 40)) * n,
    ''.join(chr(i) for i in range(40, 50)) * n,
    ''.join(chr(i) for i in range(50, 60)) * n,
    ''.join(chr(i) for i in range(60, 70)) * n,
    ''.join(chr(i) for i in range(70, 80)) * n,
    ''.join(chr(i) for i in range(80, 90)) * n,
    ''.join(chr(i) for i in range(90, 100)) * n,
    ''.join(chr(i) for i in range(100, 110)) * n,
    ''.join(chr(i) for i in range(110, 120)) * n,
    ''.join(chr(i) for i in range(120, 130)) * n,
] * n * 10000  # strings iterate character by character, so the tokens are single characters
model = Word2Vec(sentences, size=10, window=15, sg=1, hs=1, sorted_vocab=0, cbow_mean=0)
vectors = model.wv.vectors
heatmap(vectors, center=np.max(vectors))
pyplot.show()
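To reuse the trained vectors outside this script, gensim can persist them in the standard word2vec text format (the filename below is arbitrary):

```python
# Save the word vectors; reload later with
# gensim.models.KeyedVectors.load_word2vec_format('vectors.txt').
model.wv.save_word2vec_format('vectors.txt')
```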
# Fabricate data
from random import choice
ls_of_ls = [['芝士', '酸奶', '蛋糕', '巧克力', '做', '吃'],
['文本', '数据', '挖掘', '分析', '做', '玩'],
['佛山', '广州', '南海', '天河', '吃', '玩']]
ls_of_words = []  # list of tokenized sentences (as if produced by jieba.lcut)
for i in range(2500):
    ls = choice(ls_of_ls)
    ls_of_words.append([choice(ls) for _ in range(9, 15)])  # 6 random tokens per sentence
# Build and train the model
from gensim.models import Word2Vec
model = Word2Vec(ls_of_words, size=3, window=7)
# Density-based clustering of the word vectors
from sklearn.cluster import DBSCAN
vectors = [model.wv[word] for word in model.wv.index2word]
labels = DBSCAN(eps=0.24, min_samples=3).fit(vectors).labels_  # label -1 marks noise points
# Visualize the word vectors in 3D
import matplotlib
from mpl_toolkits import mplot3d
import matplotlib.pyplot as mp
mp.rcParams['font.sans-serif'] = ['SimHei']  # font that can render the Chinese words
matplotlib.rcParams['axes.unicode_minus'] = False  # render minus signs correctly
fig = mp.figure()
ax = mplot3d.Axes3D(fig)  # create 3D axes
colors = ['red', 'blue', 'green', 'black']  # noise label -1 indexes 'black' from the end
for word, vector, label in zip(model.wv.index2word, vectors, labels):
    ax.scatter(vector[0], vector[1], vector[2], c=colors[label], s=500, alpha=0.4)
    ax.text(vector[0], vector[1], vector[2], word, ha='center', va='center')
mp.show()
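DBSCAN assigns every word a cluster index and uses -1 for noise points; a small sketch to inspect how the vocabulary was partitioned before reading the plot:

```python
from collections import Counter

print(Counter(labels))  # words per cluster; -1 = noise
for word, label in zip(model.wv.index2word, labels):
    print(word, label)
```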
from gensim.models import Word2Vec
from random import choice
import warnings
warnings.filterwarnings('ignore')  # suppress warnings
"""Configuration"""
path = '春节.txt'
window = 12
min_count = 30
size = 110
topn = 13
"""数据读取"""
with open(path, encoding='utf-8') as f:
ls_of_ls_of_c = [list(line.strip()) for line in f]
"""建模训练"""
model = Word2Vec(ls_of_ls_of_c, size, window=window, min_count=min_count)
chr_dict = model.wv.index2word
"""文本序列生成"""
def poem_generator(title, form):
filter = lambda lst: [t[0] for t in lst if t[0] not in [',', '。']]
# 标题补全
if len(title) < 4:
if not title:
title += choice(chr_dict)
for _ in range(4 - len(title)):
similar_chr = filter(model.similar_by_word(title[-1], topn // 2))
char = choice([c for c in similar_chr if c not in title])
title += char
# 文本生成
poem = list(title)
for i in range(form[0]):
for _ in range(form[1]):
predict_chr = model.predict_output_word(poem[-window:], max(topn, len(poem) + 1))
predict_chr = filter(predict_chr)
char = choice([c for c in predict_chr if c not in poem[len(title):]])
poem.append(char)
poem.append(',' if i % 2 == 0 else '。')
length = form[0] * (form[1] + 1)
return '《%s》' % ''.join(poem[:-length]) + '\n' + ''.join(poem[-length:])
"""诗歌生成"""
literary_form = {'五言绝句': (4, 5), '对联': (2, 7)}
while True:
title = input('输入标题:').strip()
poem5 = poem_generator(title, literary_form['五言绝句'])
print('\033[033m', poem5, '\033[0m', sep='')
poem7 = poem_generator(title, literary_form['对联'])
print('\033[036m', poem7, '\033[0m', sep='')
print()
Corpus download: https://blog.csdn.net/Yellow_python/article/details/86726619
gensim version: 3.5.0
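If you run this on a newer gensim, note that several of the names used above changed in gensim 4.0; a migration sketch (not needed on 3.5.0):

```python
# gensim >= 4.0 equivalents of the 3.x API used in this post:
# Word2Vec(..., size=100)   ->  Word2Vec(..., vector_size=100)
# model.wv.index2word       ->  model.wv.index_to_key
# model[word]               ->  model.wv[word]
```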
Further reading
jieba Chinese word segmentation
Redundant word vectors: one-hot representation
Co-occurrence matrices in Python
Clustering with sklearn
❤️ Writing poems with Python [1 minute]: classical poetry generation
GitHub
Glossary

EN | CN |
---|---|
CBOW | Continuous Bag-of-Words |
token | 表征;代币 |
hierarchical | 分层的 |
exponent | [数]指数;说明者 |
proportion | 比例 |
exactly | 精确地 |
alpha | 希腊字母的第1个字母:α |
appendix | 附录 |