Raw data cutting converts question-and-answer utterances in the format shown below into a regular Q&A corpus: complete dialogs are extracted from the raw data, turned into question-answer pairs, and the question and answer data are finally saved to separate files.
Raw format ("E" separates dialogs, "M" marks an utterance):

E
M 呵呵
M 是王若猫的。
E
M 不是
M 那是什么?
E

After cutting, only the utterance text remains:

呵呵
是王若猫的。
不是
那是什么?
def process_cut(source_path, cut_path):
    '''Extract the complete dialogs from the raw corpus.

    :param source_path: path of the raw corpus
    :param cut_path: path for saving the cut data (not used yet)
    '''
    # All complete dialogs.
    convs = []
    with open(source_path, 'r', encoding='utf8') as f:
        # <_io.TextIOWrapper name='./data/source_data.conv' mode='r' encoding='utf8'>
        print("open context object: {}".format(f))
        # data = f.readlines()
        # ['E\n', 'M 呵呵\n', 'M 是王若猫的。\n']
        # print("data: {}".format(data))
        # One complete dialog, holding both the question and the answer.
        complete_dialog = []
        for line in f:
            # Strip the trailing newline: \n
            line = line.strip('\n')
            if line == "":
                continue
            if line[0] == "E":
                # An "E" line closes the previous dialog: push the collected
                # utterances into the corpus convs.
                if complete_dialog:
                    convs.append(complete_dialog)
                    # Reset the buffer so the next dialog starts empty.
                    # Note: a final dialog not followed by an "E" line is dropped.
                    complete_dialog = []
            if line[0] == 'M':
                # An "M" line carries dialog content: add it to the current dialog.
                complete_dialog.append(line[1:])
                # contain M: M 三鹿奶粉也假,不一样的卖啊
                # print("contain M: {}".format(line))
            # line data: E
            # line data: M 呵呵
            # print("line data: {}".format(line))
            # print("Complete dialog {}".format(complete_dialog))
    print("All complete dialog: {}".format(convs))
    return convs
if __name__ == "__main__":
    source_path = "./data/source_data.conv"
    process_cut(source_path, None)
[[' 呵呵', ' 是王若猫的。'], [' 不是', ' 那是什么?'], [' 怎么了', ' 我很难过,安慰我~'], [' 开心点哈,一切都会好起来', ' 嗯 会的'], [' 我还喜欢她,怎么办', ' 我帮你告诉她?发短信还是打电话?'], [' 短信', ' 嗯嗯。我也相信'], [' 你知道谁么', ' 肯定不是我,是阮德培'], [' 许兵是谁', ' 吴院四班小帅哥'], [' 这么假', ' 三鹿奶粉也假,不一样的卖啊'], [' 许兵是傻逼', ' 被你发现了。'], [' 许兵是谁', ' 是我善良可爱的主人的老公啊'], [' 许兵是谁', ' 是穆森的老婆啊'], [' 许兵是谁', ' 奇葩']]
def question_answer(convs):
    questions = []
    answers = []
    for conv in convs:
        if len(conv) == 1:
            continue
        if len(conv) % 2 != 0:
            # If the utterances do not pair up one-to-one, drop the last one
            # so the dialog keeps a clean Q&A structure.
            conv = conv[:-1]
        for i in range(len(conv)):
            if i % 2 == 0:
                questions.append(conv[i])
            else:
                answers.append(conv[i])
    print("questions: {} \n answers: {}".format(questions, answers))
    return questions, answers
if __name__ == "__main__":
    source_path = "./data/source_data.conv"
    convs = process_cut(source_path, None)
    questions, answers = question_answer(convs)
questions: [' 呵呵', ' 不是', ' 怎么了', ' 开心点哈,一切都会好起来', ' 我还喜欢她,怎么办', ' 短信', ' 你知道谁么', ' 许兵是谁', ' 这么假', ' 许兵是傻逼', ' 许兵是谁', ' 许兵是谁', ' 许兵是谁']
answers: [' 是王若猫的。', ' 那是什么?', ' 我很难过,安慰我~', ' 嗯 会的', ' 我帮你告诉她?发短信还是打电话?', ' 嗯嗯。我也相信', ' 肯定不是我,是阮德培', ' 吴院四班小帅哥', ' 三鹿奶粉也假,不一样的卖啊', ' 被你发现了。', ' 是我善良可爱的主人的老公啊', ' 是穆森的老婆啊', ' 奇葩']
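Because every kept dialog contributes the same number of questions and answers, the two lists should always line up; a quick check on the result above (the count of 13 matches the sample corpus):

assert len(questions) == len(answers), "questions and answers are out of sync"
print("number of Q&A pairs: {}".format(len(questions)))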
import random

def save_question_answer(questions, answers, test_size,
                         train_question_path, train_answer_path,
                         test_question_path, test_answer_path):
    '''Save the question and answer datasets.

    :param questions: list of questions
    :param answers: list of answers
    :param test_size: number of samples held out for testing
    :param train_question_path: question dataset used for training
    :param train_answer_path: answer dataset used for training
    :param test_question_path: question dataset used for testing
    :param test_answer_path: answer dataset used for testing
    '''
    # File handles for the training dataset.
    train_question_enc = open(train_question_path, "w")
    train_answer_dec = open(train_answer_path, "w")
    # File handles for the test dataset.
    test_question_enc = open(test_question_path, "w")
    test_answer_dec = open(test_answer_path, "w")
    # Randomly pick test_size indices for the test set.
    test_index = random.sample([i for i in range(len(questions))], test_size)
    for i in range(len(questions)):
        if i in test_index:
            # Test sample.
            test_question_enc.write(questions[i] + '\n')
            test_answer_dec.write(answers[i] + '\n')
        else:
            # Training sample.
            train_question_enc.write(questions[i] + '\n')
            train_answer_dec.write(answers[i] + '\n')
    train_question_enc.close()
    train_answer_dec.close()
    test_question_enc.close()
    test_answer_dec.close()
import os

if __name__ == "__main__":
    source_path = "./data/source_data.conv"
    convs = process_cut(source_path, None)
    print("convs: {}".format(convs))
    questions, answers = question_answer(convs)
    # print("questions: {} \n answers: {}".format(questions, answers))
    # Folders to create.
    folder_list = ["./data/train/", "./data/test/"]
    # Output files.
    file_list = ["./data/train/question.enc", "./data/train/answer.dec", "./data/test/question.enc", "./data/test/answer.dec"]
    for i in range(len(folder_list)):
        # Create the folder if it does not exist yet.
        if not os.path.exists(folder_list[i]):
            os.makedirs(folder_list[i])
    for i in range(len(file_list)):
        # Create the file if it does not exist yet.
        if not os.path.exists(file_list[i]):
            os.mknod(file_list[i])
    # Training dataset paths.
    train_question_path = file_list[0]
    train_answer_path = file_list[1]
    # Test dataset paths.
    test_question_path = file_list[2]
    test_answer_path = file_list[3]
    save_question_answer(questions, answers, 5,
                         train_question_path, train_answer_path,
                         test_question_path, test_answer_path)
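To sanity-check the split, the four files can be read back and their line counts compared; a minimal sketch over the same paths as file_list (with test_size=5 and the 13 pairs above, the train files should end up with 8 lines and the test files with 5):

check_files = ["./data/train/question.enc", "./data/train/answer.dec",
               "./data/test/question.enc", "./data/test/answer.dec"]
for path in check_files:
    with open(path, "r") as f:
        print("{}: {} lines".format(path, len(f.readlines())))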
def generate_vocabulary(datasets, vocabulary_data):
    PAD = "__PAD__"
    GO = "__GO__"
    EOS = "__EOS__"  # end of a dialog
    UNK = "__UNK__"  # marks characters missing from the vocabulary
    START_VOCABULARY = [PAD, GO, EOS, UNK]
    PAD_ID = 0
    GO_ID = 1
    EOS_ID = 2
    UNK_ID = 3
    vocabulary = {}
    new_vocabulary = []
    with open(datasets, "r") as f:
        counter = 0
        for line in f:
            counter += 1
            # Strip the newline at both ends of the sentence and count
            # every character in it.
            tokens = [word for word in line.strip()]
            for word in tokens:
                if word in vocabulary:
                    vocabulary[word] += 1
                else:
                    vocabulary[word] = 1
    vocabulary_list = START_VOCABULARY + sorted(vocabulary, key=vocabulary.get, reverse=True)
    print("vocabulary: {}".format(vocabulary_list))
    with open(vocabulary_data, "w") as f:
        for word in vocabulary_list:
            f.write(word + '\n')
if __name__ == "__main__":
    file_list = ["./data/train/question.enc", "./data/train/answer.dec", "./data/test/question.enc", "./data/test/answer.dec"]
    voc_list = ["./data/train/question_voc", "./data/train/answer_voc"]
    for i in range(len(voc_list)):
        # Create the vocabulary file if it does not exist yet.
        if not os.path.exists(voc_list[i]):
            os.mknod(voc_list[i])
        # Build and save the question and answer vocabularies.
        generate_vocabulary(file_list[i], voc_list[i])
vocabulary: ['__PAD__', '__GO__', '__EOS__', '__UNK__', '是', '谁', '许', '兵', '呵', '么', '不', '短', '信', '你', '知', '道', '这', '假', '傻', '逼']
vocabulary: ['__PAD__', '__GO__', '__EOS__', '__UNK__', '是', '。', '的', '嗯', '我', '也', '不', ',', '王', '若', '猫', '那', '什', '么', '?', '相', '信', '肯', '定', '阮', '德', '培', '吴', '院', '四', '班', '小', '帅', '哥', '三', '鹿', '奶', '粉', '假', '一', '样', '卖', '啊', '被', '你', '发', '现', '了', '奇', '葩']
def word_to_vector(dataset_qa, vocabulary, vector):
    UNK_ID = 3
    tmp_vocab = []
    with open(vocabulary, "r") as f:
        # Read the whole file and use extend so every line becomes its own
        # list element. Given [0, 1, 2] and [2, 3, 5], append would store
        # [[0, 1, 2], [2, 3, 5]], while extend stores [0, 1, 2, 2, 3, 5].
        tmp_vocab.extend(f.readlines())
    # Strip the trailing newlines: \n
    tmp_vocab = [line.strip() for line in tmp_vocab]
    # Map every word to its line number. enumerate yields (index, word) pairs
    # such as (0, "__PAD__"), which is the reverse of what we need, so swap
    # (y, x) to (x, y) and let dict turn the tuples into a mapping like
    # {'__PAD__': 0, '__GO__': 1, '__EOS__': 2, '__UNK__': 3, '是': 4, ...}
    vocab = dict([(x, y) for (y, x) in enumerate(tmp_vocab)])
    print("vocabulary dictionary: {}".format(vocab))
    with open(vector, "w") as f_vector:
        with open(dataset_qa, "r") as f_qa:
            for line in f_qa:
                line_vec = []
                for word in line.strip():
                    # Look up the id of each character; unknown characters map to UNK_ID.
                    line_vec.append(vocab.get(word, UNK_ID))
                print("line vector: {}".format(line_vec))
                f_vector.write(" ".join([str(num) for num in line_vec]) + '\n')
if __name__ == "__main__":
    word_to_vector("./data/train/question.enc", "./data/train/question_voc", "./data/train/question.voc")
vocabulary dictionary: {'__PAD__': 0, '__GO__': 1, '__EOS__': 2, '__UNK__': 3, '是': 4, '谁': 5, '许': 6, '兵': 7, '呵': 8, '么': 9, '不': 10, '短': 11, '信': 12, '你': 13, '知': 14, '道': 15, '这': 16, '假': 17, '傻': 18, '逼': 19}
line vector: [8, 8]
line vector: [10, 4]
line vector: [11, 12]
line vector: [13, 14, 15, 5, 9]
line vector: [6, 7, 4, 5]
line vector: [16, 9, 17]
line vector: [6, 7, 4, 18, 19]
line vector: [6, 7, 4, 5]
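The reverse lookup turns an id sequence back into text, which is a handy way to check the vectors above; a minimal sketch (the helper name vector_to_word is only for illustration):

def vector_to_word(vector_line, vocabulary_path):
    # The vocabulary file stores one word per line, so the line number is the id.
    with open(vocabulary_path, "r") as f:
        id_to_word = [w.strip() for w in f.readlines()]
    ids = [int(num) for num in vector_line.split()]
    # Ids outside the vocabulary fall back to the __UNK__ marker.
    return "".join(id_to_word[i] if i < len(id_to_word) else "__UNK__" for i in ids)

print(vector_to_word("6 7 4 5", "./data/train/question_voc"))  # -> 许兵是谁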
def extract_data(source_path, output_path, start, stop):
    '''Extract the lines between start and stop and save them.

    :param source_path: path of the raw data.
    :param output_path: path for saving the extracted data.
    :param start: first line number to keep.
    :param stop: last line number to keep.
    return:
        total number of lines in the source file.
    '''
    line_number = 0
    if not os.path.exists(output_path):
        os.mknod(output_path)
    with open(source_path, 'r') as fi:
        # Read the file once to count its lines.
        line_number = len(fi.readlines())
    if start < 0 or start > line_number or stop <= start or stop < 0:
        return "Invalid start/stop line numbers, please adjust them."
    with open(source_path, 'r') as fi, open(output_path, "a") as fo:
        counter = 0
        for line in fi:
            counter += 1
            if counter >= start:
                fo.write(line)
            if counter == stop:
                break
    return line_number
if __name__ == "__main__":
    counter_line = extract_data("./data/source_data.conv", "./data/extra_data.conv", 2, 14)
    print("data line numbers: {}".format(counter_line))
The complete script combines the steps above into a single process_data pipeline:

import random
import os
def process_cut(source_path, cut_path):
    '''Extract complete dialogs from the source data, dropping the E/M markers.

    :params source_path: source dialog path
    :params cut_path: path for saving the cut dialogs (not used yet)
    return:
        convs: complete dialogs.
    '''
    # All conversations found so far.
    convs = []
    with open(source_path, 'r', encoding='utf8') as f:
        # <_io.TextIOWrapper name='./data/source_data.conv' mode='r' encoding='utf8'>
        print("open context object: {}".format(f))
        # data = f.readlines()
        # ['E\n', 'M 呵呵\n', 'M 是王若猫的。\n']
        # print("data: {}".format(data))
        # one_conv = []
        # Complete dialog: contains both question and answer.
        complete_dialog = []
        for line in f:
            # Strip the trailing newline: \n
            line = line.strip('\n')
            if line == "":
                continue
            if line[0] == "E":
                if complete_dialog:
                    # Add the finished dialog to the conversations list.
                    convs.append(complete_dialog)
                    complete_dialog = []
            if line[0] == 'M':
                # An "M" line holds the question or answer text.
                complete_dialog.append(line[1:])
                # contain M: M 三鹿奶粉也假,不一样的卖啊
                # print("contain M: {}".format(line))
            # line data: E
            # line data: M 呵呵
            # print("line data: {}".format(line))
            # print("Complete dialog {}".format(complete_dialog))
    # print("All complete dialog: {}".format(convs))
    return convs
def question_answer(convs):
    '''Extract questions and answers from the dialogs.

    :params convs: dialogs.
    return:
        questions: questions
        answers: answers
    '''
    questions = []
    answers = []
    for conv in convs:
        if len(conv) == 1:
            continue
        if len(conv) % 2 != 0:
            # If the utterances do not pair up one-to-one, drop the last one
            # so the dialog keeps a clean Q&A structure.
            conv = conv[:-1]
        for i in range(len(conv)):
            if i % 2 == 0:
                # Even positions are questions.
                questions.append(conv[i])
            else:
                # Odd positions are answers.
                answers.append(conv[i])
    print("questions: {} \n answers: {}".format(questions, answers))
    return questions, answers
def save_question_answer(questions, answers, test_size,
                         train_question_path, train_answer_path,
                         test_question_path, test_answer_path):
    '''Save the question and answer datasets.

    :params questions: questions
    :params answers: answers
    :params test_size: number of samples held out for testing
    :params train_question_path: question dataset path for training
    :params train_answer_path: answer dataset path for training
    :params test_question_path: question dataset path for testing
    :params test_answer_path: answer dataset path for testing
    '''
    # Training dataset file handles.
    train_question_enc = open(train_question_path, "w")
    train_answer_dec = open(train_answer_path, "w")
    # Test dataset file handles.
    test_question_enc = open(test_question_path, "w")
    test_answer_dec = open(test_answer_path, "w")
    # Randomly pick test_size indices for the test set.
    test_index = random.sample([i for i in range(len(questions))], test_size)
    for i in range(len(questions)):
        if i in test_index:
            test_question_enc.write(questions[i] + '\n')
            test_answer_dec.write(answers[i] + '\n')
        else:
            train_question_enc.write(questions[i] + '\n')
            train_answer_dec.write(answers[i] + '\n')
    train_question_enc.close()
    train_answer_dec.close()
    test_question_enc.close()
    test_answer_dec.close()
def generate_vocabulary(datasets, vocabulary_data):
    PAD = "__PAD__"
    GO = "__GO__"
    EOS = "__EOS__"  # end of a dialog
    UNK = "__UNK__"  # marks characters missing from the vocabulary
    START_VOCABULARY = [PAD, GO, EOS, UNK]
    PAD_ID = 0
    GO_ID = 1
    EOS_ID = 2
    UNK_ID = 3
    vocabulary = {}
    new_vocabulary = []
    with open(datasets, "r") as f:
        counter = 0
        for line in f:
            counter += 1
            # Strip the newline: \n, then count every character in the sentence.
            tokens = [word for word in line.strip()]
            for word in tokens:
                if word in vocabulary:
                    vocabulary[word] += 1
                else:
                    vocabulary[word] = 1
    vocabulary_list = START_VOCABULARY + sorted(vocabulary, key=vocabulary.get, reverse=True)
    print("vocabulary: {}".format(vocabulary_list))
    with open(vocabulary_data, "w") as f:
        for word in vocabulary_list:
            f.write(word + '\n')
def word_to_vector(dataset_qa, vocabulary, vector):
    UNK_ID = 3
    tmp_vocab = []
    with open(vocabulary, "r") as f:
        # extend adds every line as its own element instead of nesting the
        # whole list as a single element (as append would).
        tmp_vocab.extend(f.readlines())
    # Strip the trailing newlines: \n
    tmp_vocab = [line.strip() for line in tmp_vocab]
    # Swap the (index, word) pairs from enumerate into (word, index) tuples and
    # turn them into a dict of the form {word: id}.
    vocab = dict([(x, y) for (y, x) in enumerate(tmp_vocab)])
    # vocabulary dictionary: {'__PAD__': 0, '__GO__': 1, '__EOS__': 2, '__UNK__': 3, '是': 4, '谁': 5, '许': 6, '兵': 7, '呵': 8, '么': 9, '不': 10, '短': 11, '信': 12, '你': 13, '知': 14, '道': 15, '这': 16, '假': 17, '傻': 18, '逼': 19}
    print("vocabulary dictionary: {}".format(vocab))
    with open(vector, "w") as f_vector:
        with open(dataset_qa, "r") as f_qa:
            for line in f_qa:
                line_vec = []
                for word in line.strip():
                    line_vec.append(vocab.get(word, UNK_ID))
                # print("line vector: {}".format(line_vec))
                f_vector.write(" ".join([str(num) for num in line_vec]) + '\n')
def process_data(dataset_qa, vocabulary, vector):
    '''Read the raw corpus and build all datasets.'''
    source_path = "./data/source_data.conv"
    convs = process_cut(source_path, None)
    print("convs: {}".format(convs))
    questions, answers = question_answer(convs)
    # print("questions: {} \n answers: {}".format(questions, answers))
    folder_list = ["./data/train/", "./data/test/"]
    file_list = ["./data/train/question.enc", "./data/train/answer.dec", "./data/test/question.enc", "./data/test/answer.dec"]
    for i in range(len(folder_list)):
        if not os.path.exists(folder_list[i]):
            os.makedirs(folder_list[i])
    for i in range(len(file_list)):
        if not os.path.exists(file_list[i]):
            os.mknod(file_list[i])
    # Training dataset paths.
    train_question_path = file_list[0]
    train_answer_path = file_list[1]
    # Test dataset paths.
    test_question_path = file_list[2]
    test_answer_path = file_list[3]
    save_question_answer(questions, answers, 5,
                         train_question_path, train_answer_path,
                         test_question_path, test_answer_path)
    PAD = "__PAD__"
    GO = "__GO__"
    EOS = "__EOS__"  # end of a dialog
    UNK = "__UNK__"  # marks characters missing from the vocabulary
    START_VOCABULARY = [PAD, GO, EOS, UNK]
    PAD_ID = 0
    GO_ID = 1
    EOS_ID = 2
    UNK_ID = 3
    voc_list = ["./data/train/question_voc", "./data/train/answer_voc"]
    for i in range(len(voc_list)):
        if not os.path.exists(voc_list[i]):
            os.mknod(voc_list[i])
        generate_vocabulary(file_list[i], voc_list[i])
    word_to_vector(dataset_qa, vocabulary, vector)
if __name__ == "__main__":
    process_data("./data/train/question.enc", "./data/train/question_voc", "./data/train/question.vec")
Text processing workflow:
import codecs
import collections
from operator import itemgetter

RAW_DATA = "vocabulary.txt"
VOCAB_OUTPUT = "ptb.vocab"

# Count word frequencies.
counter = collections.Counter()
with codecs.open(RAW_DATA, "r", "utf-8") as f:
    for line in f:
        for word in line.strip().split():
            counter[word] += 1
print(counter)

# Sort words by frequency, highest first.
sorted_word_to_cnt = sorted(counter.items(), key=itemgetter(1), reverse=True)
print(sorted_word_to_cnt)
sorted_words = [x[0] for x in sorted_word_to_cnt]
print(sorted_words)

# Add the sentence-end marker.
sorted_words = ["<eos>"] + sorted_words
print(sorted_words)
# Alternatively, also add the unknown-word and sentence-start markers
# (note: running both lines back to back leaves a duplicate <eos>).
sorted_words = ["<unk>", "<sos>", "<eos>"] + sorted_words
print(sorted_words)

with codecs.open(VOCAB_OUTPUT, 'w', 'utf-8') as file_output:
    for word in sorted_words:
        file_output.write(word + "\n")
import codecs
import sys

RAW_DATA = "vocabulary.txt"
VOCAB = "ptb.vocab"
OUTPUT_DATA = "ptb.train"

# Read the vocabulary.
with codecs.open(VOCAB, "r", "utf-8") as f_vocab:
    vocab = [w.strip() for w in f_vocab.readlines()]
# Build a dictionary mapping each word to its line number.
word_to_id = {k: v for (k, v) in zip(vocab, range(len(vocab)))}

# Look up the id of a word; unknown words map to <unk>.
def get_id(word):
    return word_to_id[word] if word in word_to_id else word_to_id["<unk>"]

fin = codecs.open(RAW_DATA, "r", "utf-8")
fout = codecs.open(OUTPUT_DATA, "w", "utf-8")
for line in fin:
    # Append the sentence-end marker and replace every word with its id.
    words = line.strip().split() + ["<eos>"]
    out_line = ' '.join([str(get_id(w)) for w in words]) + '\n'
    fout.write(out_line)
fin.close()
fout.close()
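To verify the conversion, the id file can be decoded back into words with the freshly written vocabulary; a minimal sketch:

import codecs

# Rebuild the id -> word table (line number == id) and decode the first converted line.
with codecs.open("ptb.vocab", "r", "utf-8") as f:
    id_to_word = [w.strip() for w in f.readlines()]
with codecs.open("ptb.train", "r", "utf-8") as f:
    first_line = f.readline().strip().split()
print(" ".join(id_to_word[int(i)] for i in first_line))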