在图像的处理中,我们无需对图像进行特殊的处理,因为图像本身就是由矩阵来表示的。而处理自然语言的时候,语言是由每一个字、词组成的。而字、词是通过编码存储在计算机当中的。所以我们可以通过将字、词编码成为向量的形式,如此就可以输入进深度学习网络模型中。
但是我们应该如何去编码呢?
像做图像分类那样,使用one-hot的形式编码?
但是这样会存在一个问题,在自然语言中,存在着词和词之间的关系,字和字之间的关系。例如近义词,反义词,同属性的词语。
例如:'beijing'、'china'、'tokyo'这三个词语之间的关系是如何的?
很显然,beijing是属于china的,相比于tokyo,beijing离china的关系应该是要更近的才行。
看一看pytorch已经训练好的词向量,来解决这样一个问题。
安装torchtext。
pip install torchtext
运行下列程序:
import torch
import torchtext
from torchtext import vocab

# Load the pretrained GloVe embeddings (trained on a 6B-token corpus;
# the default embedding dimension is 300).
gv = torchtext.vocab.GloVe(name='6B')

# Vocabulary size: one embedding row per word.
print(len(gv.vectors))
# Shape of the embedding matrix: (vocab_size, embedding_dim).
print(gv.vectors.shape)

# String -> row index lookup.
index = gv.stoi['tokyo']
print(index)

# Fetch the embedding vector stored at that row.
vector_tokyo = gv.vectors[index]
print(vector_tokyo)

# Row index -> string lookup (inverse of `stoi`).
print(gv.itos[index])
打印以下内容:
400000
torch.Size([400000, 300])
1363
tensor([ 0.0986, 0.2744, -0.1910, -0.4745, 0.3238, 0.0363, -0.1794, 0.0715,
-0.1600, -1.3565, 0.0544, 0.0586, 0.9632, 0.6459, 0.3183, -0.2209,
-0.3294, 0.2590, 0.0383, 0.0368, -0.0728, -0.8611, -0.1433, -0.0682,
0.1370, 0.3474, 0.6794, -0.3918, -0.1797, -0.3086, 0.0509, -0.6783,
0.2079, 0.0570, -0.0109, -0.4524, 0.0618, -0.4837, -0.4535, 0.3663,
0.1987, 0.5068, 0.0085, 0.2570, -0.5934, 0.5397, -0.0467, 0.1591,
-0.6192, 0.7122, 0.1573, 0.0781, 0.0654, 0.0127, -0.1181, 0.0454,
0.1191, 0.3135, -0.3338, -0.7205, 0.1635, 0.3757, 0.2461, -0.1986,
0.3314, -0.0696, 0.0323, -0.0119, 0.0461, -0.5783, 0.2841, -0.3058,
-0.1039, -0.5089, 0.2029, -0.5188, 0.1908, 0.9081, -0.1614, -0.5558,
-0.2981, 0.0113, -0.6955, 0.3706, -0.2457, 0.6815, -0.0221, -0.5354,
-0.2667, -0.4546, 0.5362, -0.4829, -0.0112, -0.4637, -0.0534, -0.1134,
-0.3340, 0.0190, 0.1398, -0.2753, -0.2229, -0.9672, -0.3900, 0.6600,
-0.1395, -0.2849, 0.4003, -0.4742, 0.1142, 0.5477, -0.5486, -0.4966,
-0.1614, -0.0464, 0.5475, 0.3730, 0.1716, 0.0252, -0.0163, -0.8848,
0.6577, -0.7852, 0.0250, -0.2150, -0.4689, 0.1600, -0.1755, -0.2799,
-0.3763, -0.1656, -0.3830, 1.1125, 0.7755, 0.2812, 0.5062, 0.1783,
0.1184, 0.0525, 0.5988, 0.6151, 0.1548, 0.0550, 0.3621, -0.3447,
0.2023, -0.3025, 0.0443, 0.2394, 0.1108, -0.1181, 0.9393, 0.2026,
0.2374, 0.8568, 0.6142, 0.5347, -0.8022, -0.3214, -0.0874, 0.0590,
0.1219, -0.0198, -0.2513, -0.5628, 0.6591, 0.0719, 0.3806, -0.0970,
0.2537, -0.1051, -0.3114, -1.2763, -0.5952, -0.1996, -0.4410, -0.1974,
0.4121, 0.5094, 0.0537, 0.9708, 0.1140, 0.2382, -1.1227, 0.2767,
0.1361, 0.7891, 0.1975, -0.0671, -0.2377, -0.2153, 0.5068, -0.3815,
-0.0401, -0.2702, -0.6019, -0.4694, 0.0836, -0.0187, -0.5859, 0.5743,
1.0775, 0.2871, -0.1479, 0.7543, -1.1862, 0.8951, -0.2454, -0.2608,
0.3586, -0.3043, -0.2555, -0.2138, -0.1634, -0.1754, -0.3832, 0.0035,
-0.1285, 0.0648, -0.4690, 0.2399, 0.8058, -0.2286, 0.0707, 0.0218,
-0.4434, 0.3778, -0.8856, 0.0924, -0.4961, 0.3073, 0.3822, -0.3354,
0.1413, 0.2973, 0.6780, 0.2839, 0.4161, 0.4181, 0.5403, 0.7092,
0.0378, -0.3367, -0.2768, 0.5240, -0.1976, -0.2343, -0.1787, -0.3622,
-0.1782, -0.2002, 0.4667, -0.2682, -0.0780, 0.8032, -0.7729, -0.4938,
-0.3711, 0.5108, 0.5503, -0.2175, -0.0640, -0.2579, -0.4843, 0.4356,
0.5931, -0.1293, -0.3471, 0.3159, 0.2683, -0.5112, -0.4244, 0.0833,
0.3387, -0.0699, -0.0656, 0.1321, -1.1871, -0.1551, 0.7677, -0.3515,
-0.4988, 0.3188, 0.1130, -1.1187, -0.6493, -0.2563, -0.3067, 0.6126,
-0.3617, -0.4735, 0.4456, 0.0256, -0.1027, -0.0352, 0.3227, 0.6737,
0.0972, 0.1478, -0.0172, -0.2390])
tokyo
接着,
运行以下代码,通过向量的余弦相似度,来寻找和它关系最近的词。
import torch
import torchtext
from torchtext import vocab
# Instantiate the pretrained word-embedding object:
# GloVe trained on the 6B-token corpus, with 50-dimensional vectors.
gv = vocab.GloVe(name='6B', dim=50)
def get_word_vector(word):
    """Return the pretrained embedding for a word.

    :param word: word to look up (string)
    :return: its GloVe embedding as a 1-D tensor
    """
    # stoi maps the string to its row index; vectors holds the matrix.
    return gv.vectors[gv.stoi[word]]
def sim_10(word, n=10, way='cosine_similarity'):
    """Return the ``n`` vocabulary entries closest to a query vector.

    Instead of looping over all ~400k embedding rows in Python (one
    ``torch.cosine_similarity``/``torch.dist`` call per row) and sorting
    the whole list, score the entire matrix in one vectorized call and
    pick the top ``n`` with ``torch.topk`` — same results, far faster.

    :param way: 'cosine_similarity' (higher is closer) or anything else
                for Euclidean distance (lower is closer)
    :param word: query embedding (a 1-D tensor, NOT a string)
    :param n: number of neighbours to return
    :return: list of ``(word, score)`` tuples, closest first
    """
    if way == 'cosine_similarity':
        # Similarity of the query against every row of the matrix at once.
        scores = torch.cosine_similarity(gv.vectors, word.unsqueeze(0), dim=1)
        largest = True   # higher similarity == closer
    else:
        # Row-wise Euclidean distance; equivalent to torch.dist per row.
        scores = torch.norm(gv.vectors - word, p=2, dim=1)
        largest = False  # smaller distance == closer
    values, indices = torch.topk(scores, k=n, largest=largest)
    return [(gv.itos[i], v) for i, v in zip(indices.tolist(), values)]
def answer(word1, word2, word3):
    """Solve the analogy ``word1 : word2 == word3 : ?``.

    Since ``word1 - word2`` should equal ``word3 - word4``, the unknown
    is ``word4 = word3 - word1 + word2``; return its nearest vocabulary
    words by Euclidean distance.
    """
    print(f'{word1}:{word2}=={word3}:?')
    target = get_word_vector(word3) - get_word_vector(word1) + get_word_vector(word2)
    return sim_10(target, way='dist')