Natural Language Processing
Word Meaning
How a thesaurus is used depends on the specific natural language processing application. In information retrieval, for example, if we know in advance that automobile and car are synonyms, the results retrieved for automobile can be merged into the results retrieved for car.
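As a minimal sketch of that idea, assuming a hand-built synonym dictionary and a hypothetical search(term) function returning a set of document IDs (neither appears in the text above):

synonyms = {'car': ['automobile', 'motorcar', 'auto']}  # toy synonym dictionary

def expanded_search(query, search):
    results = set(search(query))        # results for the query word itself
    for syn in synonyms.get(query, []):
        results |= set(search(syn))     # merge in the results for each synonym
    return results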
See Appendix B.
text = 'You say goodbye and I say hello.'  # a sample sentence used as our corpus
text = text.lower()  # lowercase everything so the word at the start of the sentence is treated like any other word
text = text.replace('.', ' .')  # put a space before the period so the sentence can be split on spaces later
print(text)  # you say goodbye and i say hello .
words = text.split(' ')  # split the sentence using the space as a delimiter
print(words)  # ['you', 'say', 'goodbye', 'and', 'i', 'say', 'hello', '.']
Alternatively, by importing the regular expression module re, the text can be tokenized with re.split('(\W+)?', text).
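A hedged sketch of such a regex-based tokenizer (it uses re.findall instead of the re.split call quoted above, simply to avoid having to filter out empty strings afterwards):

import re

text = 'You say goodbye and I say hello.'
# \w+ matches runs of word characters; [.,!?] keeps common punctuation as its own token,
# so the replace('.', ' .') trick is not needed here.
tokens = re.findall(r'\w+|[.,!?]', text.lower())
print(tokens)  # ['you', 'say', 'goodbye', 'and', 'i', 'say', 'hello', '.']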
>>> word_to_id = {}  # maps a word to its word ID
>>> id_to_word = {}  # maps a word ID back to the word
>>>
>>> for word in words:
...     if word not in word_to_id:
...         new_id = len(word_to_id)
...         word_to_id[word] = new_id
...         id_to_word[new_id] = word
>>> import numpy as np
>>> corpus = [word_to_id[w] for w in words]
>>> corpus = np.array(corpus)  # the corpus as a list of word IDs
>>> corpus
array([0, 1, 2, 3, 4, 1, 5, 6])
The steps above can be collected into a preprocess() function:

def preprocess(text):
    text = text.lower()
    text = text.replace('.', ' .')
    words = text.split(' ')

    word_to_id = {}
    id_to_word = {}
    for word in words:
        if word not in word_to_id:
            new_id = len(word_to_id)
            word_to_id[word] = new_id
            id_to_word[new_id] = word

    corpus = np.array([word_to_id[w] for w in words])

    return corpus, word_to_id, id_to_word
Depending on the situation, only the words to the left or only the words to the right may be used as the context, and a context that respects sentence boundaries is also possible. For simplicity, this book only uses contexts that ignore sentence boundaries and contain the same number of words on the left and on the right.
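For example, with window_size = 1 the context of 'goodbye' in the sample sentence is just its left and right neighbours; a tiny self-contained check (not code from the book):

words = ['you', 'say', 'goodbye', 'and', 'i', 'say', 'hello', '.']
idx = words.index('goodbye')   # position of the target word
window_size = 1
context = words[idx - window_size:idx] + words[idx + 1:idx + 1 + window_size]
print(context)  # ['say', 'and']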
text = 'You say goodbye and I say hello.'
corpus, word_to_id, id_to_word = preprocess(text)
print(corpus)
# [0 1 2 3 4 1 5 6]
print(id_to_word)
# {0: 'you', 1: 'say', 2: 'goodbye', 3: 'and', 4: 'i', 5: 'hello', 6: '.'}
# corpus is the list of word IDs
# vocab_size is the size of the vocabulary
# window_size is the size of the context window
def create_co_matrix(corpus, vocab_size, window_size=1):
    corpus_size = len(corpus)
    co_matrix = np.zeros((vocab_size, vocab_size), dtype=np.int32)

    for idx, word_id in enumerate(corpus):
        for i in range(1, window_size + 1):
            left_idx = idx - i
            right_idx = idx + i

            if left_idx >= 0:
                left_word_id = corpus[left_idx]
                co_matrix[word_id, left_word_id] += 1

            if right_idx < corpus_size:
                right_word_id = corpus[right_idx]
                co_matrix[word_id, right_word_id] += 1

    return co_matrix
def cos_similarity(x, y, eps=1e-8):
    nx = x / (np.sqrt(np.sum(x ** 2)) + eps)  # normalize x (eps prevents division by zero)
    ny = y / (np.sqrt(np.sum(y ** 2)) + eps)  # normalize y
    return np.dot(nx, ny)
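For reference, this implements the usual cosine similarity; the eps term only guards against an all-zero vector:

$$\mathrm{similarity}(\mathbf{x}, \mathbf{y}) = \frac{\mathbf{x} \cdot \mathbf{y}}{\lVert \mathbf{x} \rVert \, \lVert \mathbf{y} \rVert}$$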
text = 'You say goodbye and I say hello.'
corpus, word_to_id, id_to_word = preprocess(text)
vocab_size = len(word_to_id)
C = create_co_matrix(corpus, vocab_size)
c0 = C[word_to_id['you']]  # word vector for 'you'
c1 = C[word_to_id['i']]  # word vector for 'i'
print(cos_similarity(c0, c1))
# 0.7071067691154799
# Since cosine similarity ranges from -1 to 1, this value can be considered relatively high (the two words are similar)
"""
@ query: 查询词
@ word_to_id: 单词到单词ID 的字典
@ id_to_word 单词ID 到单词的字典
@ word_matrix 汇总了单词向量的矩阵,假定保存了与各行对应的单词向量
@ top 显示到前几位
"""
def most_similar(query, word_to_id, id_to_word, word_matrix, top=5):
    # Look up the query word
    if query not in word_to_id:
        print('%s is not found' % query)
        return

    print('\n[query] ' + query)
    query_id = word_to_id[query]
    query_vec = word_matrix[query_id]

    # Compute the cosine similarity to every word vector
    vocab_size = len(id_to_word)
    similarity = np.zeros(vocab_size)
    for i in range(vocab_size):
        similarity[i] = cos_similarity(word_matrix[i], query_vec)

    # Print the words in descending order of similarity
    count = 0
    for i in (-1 * similarity).argsort():  # argsort returns the indices that would sort ascending, hence the -1 *
        if id_to_word[i] == query:
            continue
        print(' %s: %s' % (id_to_word[i], similarity[i]))

        count += 1
        if count >= top:
            return
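As a quick usage check on the toy corpus, using the co-occurrence matrix itself as the matrix of word vectors (the exact similarity scores are omitted here):

text = 'You say goodbye and I say hello.'
corpus, word_to_id, id_to_word = preprocess(text)
vocab_size = len(word_to_id)
C = create_co_matrix(corpus, vocab_size)

most_similar('you', word_to_id, id_to_word, C, top=5)
# prints the five words whose co-occurrence vectors are most similar to that of 'you'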
Pointwise Mutual Information
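The ppmi() function below computes positive pointwise mutual information from the co-occurrence matrix C, where N is the total number of co-occurrences and C(x), C(y) are the individual word counts:

$$\mathrm{PMI}(x, y) = \log_2 \frac{P(x, y)}{P(x)\,P(y)} = \log_2 \frac{C(x, y)\cdot N}{C(x)\,C(y)}, \qquad \mathrm{PPMI}(x, y) = \max\bigl(0,\ \mathrm{PMI}(x, y)\bigr)$$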
def ppmi(C, verbose=False, eps=1e-8):
    M = np.zeros_like(C, dtype=np.float32)
    N = np.sum(C)          # total number of co-occurrences
    S = np.sum(C, axis=0)  # total count for each word (column sums of C)
    total = C.shape[0] * C.shape[1]
    cnt = 0

    for i in range(C.shape[0]):
        for j in range(C.shape[1]):
            pmi = np.log2(C[i, j] * N / (S[j] * S[i]) + eps)
            M[i, j] = max(0, pmi)

            if verbose:
                cnt += 1
                if cnt % (total // 100 + 1) == 0:
                    print('%.1f%% done' % (100 * cnt / total))
    return M
text = 'You say goodbye and I say hello.'
corpus, word_to_id, id_to_word = preprocess(text)
vocab_size = len(word_to_id)
C = create_co_matrix(corpus, vocab_size)
W = ppmi(C)
np.set_printoptions(precision=3)  # show 3 significant digits
print('covariance matrix')
print(C)
print('-'*50)
print('PPMI')
print(W)
output:
covariance matrix
[[0 1 0 0 0 0 0]
[1 0 1 0 1 1 0]
[0 1 0 1 0 0 0]
[0 0 1 0 1 0 0]
[0 1 0 1 0 0 0]
[0 1 0 0 0 0 1]
[0 0 0 0 0 1 0]]
--------------------------------------------------
PPMI
[[ 0. 1.807 0. 0. 0. 0. 0. ]
[ 1.807 0. 0.807 0. 0.807 0.807 0. ]
[ 0. 0.807 0. 1.807 0. 0. 0. ]
[ 0. 0. 1.807 0. 1.807 0. 0. ]
[ 0. 0.807 0. 1.807 0. 0. 0. ]
[ 0. 0.807 0. 0. 0. 0. 2.807]
[ 0. 0. 0. 0. 0. 2.807 0. ]]
text = 'You say goodbye and I say hello.'
corpus, word_to_id, id_to_word = preprocess(text)
vocab_size = len(id_to_word)
C = create_co_matrix(corpus, vocab_size, window_size=1)
W = ppmi(C)
# SVD
U, S, V = np.linalg.svd(W)
If the matrix is $N \times N$, the computational complexity of SVD is $O(N^3)$, so faster methods such as Truncated SVD are usually used in practice. Truncated SVD achieves the speed-up by discarding (truncating) the smaller singular values.
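As a small illustration of what truncation means, using the U, S, V just computed (a sanity check only, not part of the book's code): keeping only the k largest singular values yields a rank-k approximation of W.

k = 2
W_approx = np.dot(U[:, :k] * S[:k], V[:k, :])  # rank-k reconstruction from the top-k singular values
print(np.abs(W - W_approx).max())  # the residual shrinks toward 0 as k grows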
# The word vector of the word with ID 0, in each representation:
print(C[0])  # co-occurrence matrix
# [0 1 0 0 0 0 0]
print(W[0])  # PPMI matrix
# [ 0. 1.807 0. 0. 0. 0. 0. ]
print(U[0])  # SVD
# [ 3.409e-01 -1.110e-16 -1.205e-01 -4.441e-16 0.000e+00 -9.323e-01
# 2.226e-16]
The sparse vector W[0] has been transformed by SVD into the dense vector U[0]. To reduce the dimensionality of this dense vector, say down to two dimensions, we simply take its first two elements:

print(U[0, :2])
# [ 3.409e-01 -1.110e-16]
import matplotlib.pyplot as plt

for word, word_id in word_to_id.items():
    plt.annotate(word, (U[word_id, 0], U[word_id, 1]))  # draw each word's text at its (x, y) coordinates in the 2D plot

plt.scatter(U[:, 0], U[:, 1], alpha=0.5)
plt.show()
The Penn Treebank Corpus
from dataset import ptb
corpus, word_to_id, id_to_word = ptb.load_data('train')
print('corpus size:', len(corpus))
print('corpus[:30]:', corpus[:30])
print()
print('id_to_word[0]:', id_to_word[0])
print('id_to_word[1]:', id_to_word[1])
print('id_to_word[2]:', id_to_word[2])
print()
print("word_to_id['car']:", word_to_id['car'])
output:
corpus size: 929589
corpus[:30]: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29]
id_to_word[0]: aer
id_to_word[1]: banknote
id_to_word[2]: berlitz
word_to_id['car']: 3856
word_to_id['happy']: 4428
word_to_id['lexus']: 7426
window_size = 2
wordvec_size = 100
corpus, word_to_id, id_to_word = ptb.load_data('train')
vocab_size = len(word_to_id)
print('counting co-occurrence ...')
C = create_co_matrix(corpus, vocab_size, window_size)
print('calculating PPMI ...')
W = ppmi(C, verbose=True)
print('calculating SVD ...')
try:
# truncated SVD (fast!)
from sklearn.utils.extmath import randomized_svd
U, S, V = randomized_svd(W, n_components=wordvec_size, n_iter=5, random_state=None)
except ImportError:
# SVD (slow)
U, S, V = np.linalg.svd(W)
word_vecs = U[:, :wordvec_size]
querys = ['you', 'year', 'car', 'toyota']
for query in querys:
most_similar(query, word_to_id, id_to_word, word_vecs, top=5)
output:
[query] you
i: 0.702039909619
we: 0.699448543998
've: 0.554828709147
do: 0.534370693098
else: 0.512044146526
[query] year
month: 0.731561990308
quarter: 0.658233992457
last: 0.622425716735
earlier: 0.607752074689
next: 0.601592506413
[query] car
luxury: 0.620933665528
auto: 0.615559874277
cars: 0.569818364381
vehicle: 0.498166879744
corsica: 0.472616831915
[query] toyota
motor: 0.738666107068
nissan: 0.677577542584
motors: 0.647163210589
honda: 0.628862370943
lexus: 0.604740429865