import urllib.request  # download the data archive
import zipfile  # unzip the archive
import lxml.etree  # parse the XML document
import re  # regular expressions
from sklearn.feature_extraction.text import CountVectorizer  # word-frequency counting
from collections import Counter  # word-frequency counting
from bokeh.plotting import figure, show  # plotting
from bokeh.models import ColumnDataSource, LabelSet  # plotting
from gensim.models import Word2Vec, FastText  # word vectors
from gensim import corpora  # dictionary
from sklearn.manifold import TSNE  # t-SNE visualization
import numpy as np
import os
from random import shuffle
from sklearn.cluster import KMeans  # clustering
The archive is fairly large, so it is easier to paste the URL into a browser, download it manually, and put the file in the expected folder.
# If the file does not exist in the target folder, download it and name it ted_en-20160408.zip
if not os.path.isfile('../Data/ted_en-20160408.zip'):
    urllib.request.urlretrieve("https://wit3.fbk.eu/get.php?path=XML_releases/xml/ted_en-20160408.zip&filename=ted_en-20160408.zip", filename="../Data/ted_en-20160408.zip")
# It helps to unzip the archive and open one of the XML files yourself to see how to extract the useful information.
# Here we only need the text inside the content tags; the keywords tags would also be needed for text classification.
with zipfile.ZipFile('../Data/ted_en-20160408.zip', 'r') as z:
    doc = lxml.etree.parse(z.open('ted_en-20160408.xml', 'r'))
input_text = '\n'.join(doc.xpath('//content/text()'))  # read the text under the content tags
del doc  # delete the doc variable to free that memory
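The comment above mentions the keywords tags. As a minimal sketch, they could be read with the same xpath approach; this assumes each talk carries a keywords element, it would have to run before the `del doc` line, and the rest of this walkthrough never uses it, so it is left commented out:
# Sketch only (assumes each talk has a <keywords> element; run before `del doc`):
# input_keywords = doc.xpath('//keywords/text()')  # one keywords string per talk
# print(input_keywords[:2])                        # peek at the first two entries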
This part cleans the content text, splits it into sentences, and tokenizes it (the text is English, so tokenizing mostly means splitting on spaces; watch out for words like "what's").
First, remove parenthesized words such as "(Laughter)".
input_text_noparens = re.sub(r'\([^)]*\)', '', input_text)  # you can run i = input_text.find('some words') before and after to check that it worked
Next, remove speaker names and the following ':' in lines like "Hyowon Gweon: say something", and split into sentences at the same time.
This approach is imperfect: there is no guarantee that everything within the first 20 characters really is a speaker name, and real data is messier than this.
Since this experiment does not do text classification or much deeper analysis, the cleaning below also cuts some corners.
A fairly involved regular expression is used here: '^(?:(?P<precolon>[^:]{,20}):)?(?P<postcolon>.*)$'
Breaking it down:
^(?:(?P<precolon>[^:]{,20}):) matches, from the start of the line up to a ':', a run of at most 20 non-':' characters and names the captured group precolon
(characters include spaces)
(?P<name>exp) names a captured group
(?:exp) is a non-capturing group
[^:] matches any character except ':'
{,20} matches at most 20 characters; longer runs do not match
^ means negation inside a character class, or the start of the string outside one
Examples: "And here's the thing: it's not really a joke." matches "And here's the thing"
"And here is the thing: it's not really a joke." matches None (the part before ':' is 21 characters, one too many)
"Here's the thing: it's not really a joke." matches "Here's the thing"
? makes the preceding group optional
(?P<postcolon>.*)$ captures everything after the ':' and names the group postcolon
'.' matches any character
'*' means zero or more repetitions
.* is a greedy match that consumes everything remaining
$ matches the end of the string
m.groupdict()['postcolon'].split('.') takes the postcolon group from the match dictionary and splits it on '.'
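A quick check of the three examples above (a minimal sketch using the same pattern as the loop below):
pattern = r'^(?:(?P<precolon>[^:]{,20}):)?(?P<postcolon>.*)$'
for s in ["And here's the thing: it's not really a joke.",
          "And here is the thing: it's not really a joke.",
          "Here's the thing: it's not really a joke."]:
    m = re.match(pattern, s)
    print(m.groupdict()['precolon'], '|', m.groupdict()['postcolon'])
# The first and third lines print the prefix before ':'; the second prints None
# followed by the whole line, because its prefix is longer than 20 characters.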
print('splitting into sentences')
sentences_strings_ted = []
for line in input_text_noparens.split('\n'):
    m = re.match(r'^(?:(?P<precolon>[^:]{,20}):)?(?P<postcolon>.*)$', line)
    sentences_strings_ted.extend(sent for sent in m.groupdict()['postcolon'].split('.') if sent)
# let's look at the first few:
print(sentences_strings_ted[:5])
del input_text, input_text_noparens
Next, tokenize each sentence. The simplest possible method is used here: replace everything that is not a letter or digit with a space. This tokenization has obvious flaws, for example "what's" is split into "what" and "s".
Leftovers like these could be cleaned up with a stop-word list; I skip that here (see the sketch after the loop below for one alternative).
sentences_ted = []
for sent_str in sentences_strings_ted:
    tokens = re.sub(r"[^a-z0-9]+", " ", sent_str.lower()).split()
    sentences_ted.append(tokens)
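As one possible alternative (a sketch only; the rest of the walkthrough keeps using sentences_ted from the loop above), the apostrophe can be kept inside tokens so that contractions like "what's" survive as a single token:
sentences_ted_alt = []
for sent_str in sentences_strings_ted:
    # assumption: treating ' as part of a token is good enough for contractions
    alt_tokens = re.sub(r"[^a-z0-9']+", " ", sent_str.lower()).split()
    sentences_ted_alt.append(alt_tokens)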
Count the 1000 most frequent words.
Two ways to do the counting:
# Method 1: use Counter from collections; it needs the tokenized text as input
sentences_ted_all = [i for row in sentences_ted for i in row]  # flatten all tokens into one list
c = Counter(sentences_ted_all)  # count how often each word occurs
counts_ted_top1000_counter = c.most_common(1000)  # keep the top 1000
# counts_ted_top1000_counter[:5]  # print the first five to check
# Method 2: use CountVectorizer from sklearn.feature_extraction.text; it works directly on the sentence strings
# argsort() returns the indices that would sort x in ascending order; negating X gives descending order
vec = CountVectorizer()  # initialize
X = vec.fit_transform(sentences_strings_ted)  # fit and transform
counts_ted_top1000_index = np.array(np.argsort(-X.sum(axis=0))).squeeze()[:1000]  # indices of the top 1000, in descending count order
counts_ted_top1000_word = np.array(vec.get_feature_names())[counts_ted_top1000_index]  # top-1000 words by index (newer scikit-learn renamed this to get_feature_names_out())
counts_ted_top1000 = np.array(-1 * np.sort(-X.sum(axis=0))).squeeze()[:1000]  # top-1000 counts, in descending order
# In the same way, take the top 40 for a quick look
counts_ted_top40_index = np.array(np.argsort(-X.sum(axis=0))).squeeze()[:40]  # top-40 indices
counts_ted_top40_word = np.array(vec.get_feature_names())[counts_ted_top40_index]  # top-40 words
counts_ted_top40 = np.array(-1 * np.sort(-X.sum(axis=0))).squeeze()[:40]  # top-40 counts
print(counts_ted_top40_word)
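To make the argsort trick above concrete, here is a tiny worked example (not part of the pipeline, just an illustration):
x = np.array([3, 1, 2])
print(np.argsort(-x))  # [0 2 1]: index of the largest value first
print(-np.sort(-x))    # [3 2 1]: the values themselves in descending order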
Plot a histogram of the top-1000 counts.
print('plotting histogram')  # uses figure, show from bokeh.plotting; the plot opens in the browser and can be zoomed
hist, edges = np.histogram(counts_ted_top1000, density=True, bins=100)  # `normed` was removed from NumPy; density=True is enough
p = figure(tools="pan,wheel_zoom,reset,save",
toolbar_location="above",
title="Top-1000 words distribution")
p.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:], line_color="#555555")
show(p)
Word2Vec(sentences=sentences_ted, size=100, window=5, min_count=10, workers=4, sg=0)
FastText(sentences=sentences_ted, size=100, window=5, min_count=10, workers=4, sg=1)
Both take the same parameters:
· sentences: the list of tokenized sentences.
· size: the dimensionality of the embedding vectors (renamed vector_size in gensim >= 4.0).
· window: the number of context words considered around the current word.
· min_count: tells the model to ignore words whose total count is below this number.
· workers: the number of worker threads to use.
· sg: whether to use skip-gram (sg=1) or CBOW (sg=0).
from gensim.models import Word2Vec,FastText
from gensim import corpora
model_ted = Word2Vec(sentences=sentences_ted, size=100, window=5, min_count=10, workers=4, sg=0)
dictionary_ted = corpora.Dictionary(sentences_ted)
print(len(dictionary_ted), dictionary_ted)  # inspect the dictionary
model_fas = FastText(sentences=sentences_ted, size=100, window=5, min_count=10, workers=4, sg=1)
# word2vec
model_ted.wv.most_similar("man")
# Out[] :
# [('woman', 0.8375369906425476),
# ('guy', 0.8316255211830139),
# ('gentleman', 0.762170672416687),
# ('lady', 0.7582734227180481),
# ('boy', 0.7424250841140747),
# ('girl', 0.7392549514770508),
# ('kid', 0.7149066925048828),
# ('soldier', 0.7003762125968933),
# ('poet', 0.6889200210571289),
# ('david', 0.6748576164245605)]
# FastText
model_fas.wv.most_similar('man')
# Out[] :
# [('batman', 0.8018248677253723),
# ('woman', 0.785014271736145),
# ('shaman', 0.7629657983779907),
# ('foreman', 0.7365020513534546),
# ('guy', 0.7283130288124084),
# ('van', 0.7156551480293274),
# ('anderson', 0.711168110370636),
# ('soldier', 0.7072361707687378),
# ('jefferson', 0.7063894271850586),
# ('shepherd', 0.7054215669631958)]
# FastText can also find near neighbors for words that are not in the vocabulary, which word2vec cannot do
model_fas.wv.most_similar("Gastroenteritis")
# Out[] :
# [('arthritis', 0.7575085759162903),
# ('h1n1', 0.7479996681213379),
# ('disembodied', 0.7372597455978394),
# ('embroidery', 0.7301926612854004),
# ('epa', 0.726107656955719),
# ('respiratory', 0.7254592180252075),
# ('homicide', 0.7155354022979736),
# ('cardiovascular', 0.7088590860366821),
# ('pneumonia', 0.7087223529815674),
# ('embryonic', 0.7074483633041382)]
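As a small extra probe (a sketch, not part of the original outputs): gensim's most_similar also accepts positive and negative word lists, so the classic analogy query can be run on the trained model; the neighbors returned depend on the model, and the query assumes 'king' is frequent enough to survive min_count=10.
# Sketch: "man is to woman as king is to ?"; assumes 'king' is in the vocabulary
print(model_ted.wv.most_similar(positive=['woman', 'king'], negative=['man'], topn=5))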
# Extra exercise: compute the cosine similarity between two word vectors; values closer to 1 mean the words are more similar
def CosineDistance(vv1, vv2):  # despite the name, this returns the cosine of the angle, i.e. cosine similarity
    return np.dot(vv1, vv2) / (np.linalg.norm(vv1) * np.linalg.norm(vv2))
print('cosine similarity')
print(CosineDistance(model_ted.wv['man'], model_ted.wv['kid']))  # 0.68030196
print(CosineDistance(model_ted.wv['computer'], model_ted.wv['kid']))  # 0.30493858
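For comparison, gensim already provides the same computation as model.wv.similarity; it should agree with the hand-written function above up to floating-point noise:
print(model_ted.wv.similarity('man', 'kid'))       # same value as CosineDistance above
print(model_ted.wv.similarity('computer', 'kid'))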
words_top_ted = counts_ted_top1000_word  # the 1000 most frequent TED words computed above
words_top_vec_ted = model_ted.wv[words_top_ted]  # look up their vectors
tsne = TSNE(n_components=2, random_state=0)
words_top_ted_tsne = tsne.fit_transform(words_top_vec_ted)
p = figure(tools="pan,wheel_zoom,reset,save",
toolbar_location="above",
title="word2vec T-SNE for most common words")
source = ColumnDataSource(data=dict(x1=words_top_ted_tsne[:,0],
x2=words_top_ted_tsne[:,1],
names=words_top_ted))
p.scatter(x="x1", y="x2", size=8, source=source)
labels = LabelSet(x="x1", y="x2", text="names", y_offset=6,
text_font_size="8pt", text_color="#555555",
source=source, text_align='center')
p.add_layout(labels)
show(p)
The steps are the same as above, so I won't go through them in detail; here is the code.
print('----------- download the Wikipedia text data -----------')
if not os.path.isfile('../Data/wikitext-103-raw-v1.zip'):
    urllib.request.urlretrieve("https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-raw-v1.zip",
                               filename="../Data/wikitext-103-raw-v1.zip")
print('----------- unzip and read the data -----------')
with zipfile.ZipFile('../Data/wikitext-103-raw-v1.zip', 'r') as z:
    input_text = str(z.open('wikitext-103-raw/wiki.train.raw', 'r').read(), encoding='utf-8')
print('----------- split into sentences -----------')
sentences_wiki = []
for line in input_text.split('\n'):
    s = [x for x in line.split('.') if x and len(x.split()) >= 5]
    sentences_wiki.extend(s)
for s_i in range(len(sentences_wiki)):
    # remove parenthesized content first; otherwise the parentheses are already gone after the next step
    sentences_wiki[s_i] = re.sub(r'\([^)]*\)', '', sentences_wiki[s_i])
    sentences_wiki[s_i] = re.sub(r"[^a-z0-9]+", " ", sentences_wiki[s_i].lower())
del input_text
print('----------- take 1/5 of the data -----------')
# sample 1/5 of the data; shuffle randomizes the order first
shuffle(sentences_wiki)
print(len(sentences_wiki))
sentences_wiki = sentences_wiki[:int(len(sentences_wiki) / 5)]
print(len(sentences_wiki))
wiki_ted = []
for wiki_str in sentences_wiki:
    tokens = re.sub(r"[^a-z0-9]+", " ", wiki_str.lower()).split()
    # tokens = re.sub(r'[{}]+'.format('-!:,.;?"'), " ", wiki_str.lower()).split()
    wiki_ted.append(tokens)
model_wiki = Word2Vec(sentences=wiki_ted, size=100, window=5, min_count=10, workers=4, sg=0)
dictionary_wiki = corpora.Dictionary(wiki_ted)
print(len(dictionary_wiki), dictionary_wiki)
# t-SNE visualization
wiki = CountVectorizer()
Y = wiki.fit_transform(sentences_wiki)
wiki_ted_top1000_index = np.array(np.argsort(-Y.sum(axis=0))).squeeze()[:1000]
wiki_ted_top1000_word = np.array(wiki.get_feature_names())[wiki_ted_top1000_index]
wiki_ted_top1000 = np.array(-1 * np.sort(-Y.sum(axis=0))).squeeze()[:1000]
words_top_wiki = wiki_ted_top1000_word
# This assumes words_top_wiki is a list of strings, the top 1000 words
words_top_vec_wiki = model_wiki.wv[words_top_wiki]  # look up their vectors
tsne = TSNE(n_components=2, random_state=0)
words_top_wiki_tsne = tsne.fit_transform(words_top_vec_wiki)
p = figure(tools="pan,wheel_zoom,reset,save",
toolbar_location="above",
title="word2vec T-SNE for most common words")
source = ColumnDataSource(data=dict(x1=words_top_wiki_tsne[:, 0],
x2=words_top_wiki_tsne[:, 1],
names=words_top_wiki))
p.scatter(x="x1", y="x2", size=8, source=source)
labels = LabelSet(x="x1", y="x2", text="names", y_offset=6,
text_font_size="8pt", text_color="#555555",
source=source, text_align='center')
p.add_layout(labels)
show(p)
Finally, cluster the 1000 wiki word vectors with KMeans and color the t-SNE plot by cluster.
from sklearn.cluster import KMeans
clustering = KMeans(n_clusters=100)  # number of clusters
clustering.fit(words_top_vec_wiki)
clu_labels = clustering.labels_  # one label in 0-99 for each word
print(wiki_ted_top1000_word[clu_labels == 0])  # words in cluster 0
print(wiki_ted_top1000_word[clu_labels == 1])  # words in cluster 1
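Rather than printing clusters 0 and 1 by hand, a short loop (just a sketch) lists the first few words of each cluster:
# Sketch: peek at the first few words of the first ten clusters
for k in range(10):
    print(k, wiki_ted_top1000_word[clu_labels == k][:8])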
colors = ['#%02x%02x%02x' % (int(2.55 * r), int(1.5 * r), int(1.0 * r)) for r in clu_labels]
# one hex color per cluster label ('#%02x%02x%02x' formats each channel as two hex digits)
p = figure(tools="pan,wheel_zoom,reset,save",
toolbar_location="above",
title="word2vec T-SNE for most common words")
clustering_source = ColumnDataSource(data=dict(x1=words_top_wiki_tsne[:, 0],
x2=words_top_wiki_tsne[:, 1],
names=words_top_wiki,
colors=colors))
p.scatter(x="x1", y="x2", size=8, source=clustering_source, fill_color='colors')
labels = LabelSet(x="x1", y="x2", text="names", y_offset=6,
text_font_size="8pt", text_color="#555555",
source=clustering_source, text_align='center')
p.add_layout(labels)
show(p)
Full code: https://github.com/RayX-X/NLPLearning/tree/master/word2vec