# Word-level one-hot encoding
import numpy as np
# Tokenize the samples with the split method; in a real application you would
# also strip punctuation and special characters from the samples
samples = ['The cat sat on the mat.', 'The dog ate my homework.']
# Build an index of all tokens in the data
token_index = {}
for sample in samples:
    for word in sample.split():
        if word not in token_index:
            # Assign a unique index to each unique word
            # (note that index 0 is not assigned to any word)
            token_index[word] = len(token_index) + 1
# Vectorize the samples, considering only the first max_length words of each
max_length = 10
# Store the results in `results`
results = np.zeros(shape=(len(samples), max_length, max(token_index.values()) + 1))
print(token_index.values())
print(token_index)
print(results)
for i, sample in enumerate(samples):
    print(i, sample)
    for j, word in list(enumerate(sample.split()))[:max_length]:
        index = token_index.get(word)
        results[i, j, index] = 1.
results
dict_values([10, 2, 5, 4, 8, 3, 6, 9, 1, 7])
{'homework.': 10, 'cat': 2, 'the': 5, 'on': 4, 'ate': 8, 'sat': 3, 'mat.': 6, 'my': 9, 'The': 1, 'dog': 7}
[[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]]
0 The cat sat on the mat.
1 The dog ate my homework.
array([[[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
[0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
[0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
[0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],
[[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
[0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
[0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]]])
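# Sanity check (an addition, not part of the original listing): invert the
# token index so each one-hot row can be mapped back to its word;
# `reverse_index` is a helper name introduced here for illustration.
reverse_index = {index: word for word, index in token_index.items()}
for i, sample in enumerate(samples):
    decoded = [reverse_index[int(np.argmax(row))] for row in results[i] if row.any()]
    print(i, ' '.join(decoded))  # should reproduce each tokenized sample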
# Character-level one-hot encoding (toy example)
import string
import numpy as np
samples = ['The cat sat on the mat.', 'The dog ate my homework.']
# All printable ASCII characters
characters = string.printable
# Map each character to a unique index from 1 to 100 (index 0 is not used)
token_index = dict(zip(characters, range(1, len(characters) + 1)))
token_index
{'0': 1,
 '1': 2,
 '2': 3,
 ...
 'z': 36,
 'A': 37,
 ...
 'Z': 62,
 '!': 63,
 ...
 '~': 94,
 ' ': 95,
 '\t': 96,
 '\n': 97,
 '\r': 98,
 '\x0b': 99,
 '\x0c': 100}
characters
'0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\x0b\x0c'
max_length = 50
# Initialize an all-zero result tensor
results = np.zeros((len(samples), max_length, max(token_index.values()) + 1))
for i, sample in enumerate(samples):
    for j, character in enumerate(sample):
        index = token_index.get(character)
        results[i, j, index] = 1.
print(results)
[[[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]]
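# The same round-trip check works at the character level (again an addition,
# not part of the original listing; `reverse_char_index` is introduced here
# for illustration):
reverse_char_index = {index: char for char, index in token_index.items()}
decoded = ''.join(reverse_char_index[int(np.argmax(row))]
                  for row in results[0] if row.any())
print(decoded)  # should print: The cat sat on the mat.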
# Word-level one-hot encoding with Keras
from keras.preprocessing.text import Tokenizer
samples = ['The cat sat on the mat.', 'The dog ate my homework.']
# Create a tokenizer, configured to only take into account
# the 1,000 most common words
tokenizer = Tokenizer(num_words=1000)
tokenizer.fit_on_texts(samples)  # Build the word index
sequences = tokenizer.texts_to_sequences(samples)  # Turn strings into lists of integer indices
one_hot_results = tokenizer.texts_to_matrix(samples, mode='binary')
word_index = tokenizer.word_index  # Recover the word index that was computed
print('Found %s unique tokens.' % len(word_index))
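# For reference (an addition, not part of the original listing), the
# tokenizer's intermediate results can be inspected directly; the expected
# values below assume the Tokenizer defaults (lowercasing, punctuation filter):
print(word_index)   # should be {'the': 1, 'cat': 2, 'sat': 3, 'on': 4, 'mat': 5, 'dog': 6, 'ate': 7, 'my': 8, 'homework': 9}
print(sequences)    # should be [[1, 2, 3, 4, 1, 5], [1, 6, 7, 8, 9]]
print(one_hot_results.shape)  # (2, 1000): one 1,000-dim binary vector per sample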
# Word-level one-hot encoding using the hashing trick
samples = ['The cat sat on the mat.', 'The dog ate my homework.']
# Store the words as vectors of size 1,000. If you have close to 1,000 words
# (or more), you'll see many hash collisions, which will decrease the
# accuracy of this encoding method
dimensionality = 1000
max_length = 10
results = np.zeros((len(samples), max_length, dimensionality))
for i, sample in enumerate(samples):
    for j, word in list(enumerate(sample.split()))[:max_length]:
        # Hash the word into a "random" integer index between 0 and 1,000
        index = abs(hash(word)) % dimensionality
        results[i, j, index] = 1.
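# Caveat (an addition, not part of the original listing): Python 3 salts str
# hashes per process, so the indices produced above differ between runs unless
# PYTHONHASHSEED is fixed; for a reproducible encoding, a deterministic hash
# such as zlib.crc32(word.encode('utf-8')) % dimensionality can be used instead.
print(results.shape)  # (2, 10, 1000)
print(results.sum())  # 11.0: exactly one slot set per word position (6 + 5 words)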