# Reference dialogue, one utterance per line; the script below builds its
# vocabulary from this text.
txt1 = "\n".join((
    "Hi! This is Wang.",
    "Hello! It's Sun speaking.",
    "Do you have free time this evening?",
    "Uh... Let's me see. What's the matter?",
    "I hope you can see a new movie Super Hero with me. "
    "I've been wanting to see it for a long time.",
    "Sorry, I didn't catch that. Could you say the name again?",
    "Super Hero.",
    "Oh! Super Hero. I'm very interested in it. I haven't seen it. "
    "I can go with you. When the movie start?",
    "It begin at seven o'clock. Let's gather at the cinema at ten to seven.",
    "Ok, I'll arrive there on time. Goodbye!",
    "Bye.",
))
# Short message that the script below encodes against the txt1 vocabulary.
txt2 = (
    "Hi! Wang. I've arrive the cinema. "
    "Where are you?"
)
# Custom text-preprocessing function

# Maps each punctuation mark that should act as a word separator to a space.
_PUNCTUATION_TO_SPACE = str.maketrans("!,.?", "    ")

# Contractions that are split into two tokens, applied in this order.
# Any contraction not listed here (e.g. "let's", "didn't") stays one token.
_CONTRACTION_SPLITS = (
    ("it's", "it 's"),
    ("i've", "i 've"),
    ("don't", "do n't"),
    ("i'll", "i 'll"),
    ("i'd", "i 'd"),
)


def txtpre(text=None):
    """Lower-case text, blank out punctuation and split known contractions.

    The characters ``! , . ?`` are each replaced by a single space, then the
    contractions in ``_CONTRACTION_SPLITS`` are separated by a space so that
    a later ``str.split()`` yields them as two tokens.

    Args:
        text: The string to preprocess. When omitted (``None``), the
            module-level variable ``txt`` is read and overwritten with the
            result, which is how the rest of this script calls the function.

    Returns:
        The preprocessed string. When ``text`` is given explicitly the global
        ``txt`` is left untouched.

    Raises:
        NameError: If called without an argument before ``txt`` is defined.
    """
    global txt  # module-level working buffer used when no argument is given
    use_global = text is None
    if use_global:
        text = txt

    text = text.lower()
    # Turn the separator punctuation into spaces in a single pass.
    text = text.translate(_PUNCTUATION_TO_SPACE)
    # Split the supported contractions by inserting a space.
    for contraction, split_form in _CONTRACTION_SPLITS:
        text = text.replace(contraction, split_form)

    if use_global:
        txt = text
    return text
# Assign txt1 to txt and preprocess it (txtpre rewrites the global txt).
txt = txt1
txtpre()
list1 = txt.split()  # whitespace-separated tokens of the reference text

# Count word frequencies. The dict keeps each word once, in first-seen
# order, and that order defines the vocabulary numbering below.
dict1 = {}
for word in list1:
    dict1[word] = dict1.get(word, 0) + 1

# Vocabulary list: every distinct word, with "UNKNOWN" as the last entry.
key_lst = list(dict1)
key_lst.append("UNKNOWN")
char_to_int = {word: index for index, word in enumerate(key_lst)}  # word -> id
int_to_char = dict(enumerate(key_lst))  # id -> word
print(int_to_char)  # print the dictionary as "id: word" pairs

# Assign txt2 to txt and preprocess it the same way.
txt = txt2
txtpre()
list2 = txt.split()

# Integer-encode the tokens of txt2; words that are not in the vocabulary
# get the id of "UNKNOWN".
unknown_index = char_to_int["UNKNOWN"]
integer_encoded = [char_to_int.get(word, unknown_index) for word in list2]

# Encode: build one one-hot vector (length = vocabulary size) per token.
onehot_encoded = []
for value in integer_encoded:
    letter = [0] * len(key_lst)
    letter[value] = 1
    onehot_encoded.append(letter)
print(onehot_encoded)

# Decode: the position of the single 1 in each vector is the word id.
list_decode = [int_to_char[vector.index(1)] for vector in onehot_encoded]
print(list_decode)
# Program output:
{0: 'hi', 1: 'this', 2: 'is', 3: 'wang', 4: 'hello', 5: 'it', 6: "'s", 7: 'sun', 8: 'speaking', 9: 'do', 10: 'you', 11: 'have', 12: 'free', 13: 'time', 14: 'evening', 15: 'uh', 16: "let's", 17: 'me', 18: 'see', 19: "what's", 20: 'the', 21: 'matter', 22: 'i', 23: 'hope', 24: 'can', 25: 'a', 26: 'new', 27: 'movie', 28: 'super', 29: 'hero', 30: 'with', 31: "'ve", 32: 'been', 33: 'wanting', 34: 'to', 35: 'for', 36: 'long', 37: 'sorry', 38: "didn't", 39: 'catch', 40: 'that', 41: 'could', 42: 'say', 43: 'name', 44: 'again', 45: 'oh', 46: "i'm", 47: 'very', 48: 'interested', 49: 'in', 50: "haven't", 51: 'seen', 52: 'go', 53: 'when', 54: 'start', 55: 'begin', 56: 'at', 57: 'seven', 58: "o'clock", 59: 'gather', 60: 'cinema', 61: 'ten', 62: 'ok', 63: "'ll", 64: 'arrive', 65: 'there', 66: 'on', 67: 'goodbye', 68: 'bye', 69: 'UNKNOWN'}
[[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
['hi', 'wang', 'i', "'ve", 'arrive', 'the', 'cinema', 'UNKNOWN', 'UNKNOWN', 'you']