Natural Language Processing: Dataset Preprocessing and Word Vector Embedding

1 Extracting and Saving the Q&A Dataset from Raw Data

Cutting the raw data means converting question-answer utterances in the format below into a normal Q&A corpus: complete dialogs are extracted from the raw data, processed into question-answer pairs, and finally the question data and the answer data are saved separately.

  • Raw data
E
M 呵呵
M 是王若猫的。
E
M 不是
M 那是什么?
E
  • Normalized Q&A corpus
呵呵
是王若猫的。
不是
那是什么?

1.1 Extracting Dialogs

def process_cut(source_path, cut_path):
	'''Extract the complete dialogs.
	Parameters:
	source_path: path to the raw corpus
	cut_path: path for saving the cut data
	'''
	'''All complete dialogs.'''
	convs = []
	with open(source_path, 'r', encoding='utf8') as f:
		'''<_io.TextIOWrapper name='./data/source_data.conv' mode='r' encoding='utf8'>'''
		print("open context object: {}".format(f))
		# data = f.readlines()
		'''['E\n', 'M 呵呵\n', 'M 是王若猫的。\n']'''
		# print("data: {}".format(data))
		'''One complete dialog, containing both the question and the answer.'''
		complete_dialog = []
		for line in f:
			'''Strip the line-feed character: \n'''
			line = line.strip('\n')

			if line == "":
				continue
			if line[0] == "E":
				if complete_dialog:
					'''On an E marker, append the collected Q&A utterances to the corpus convs.'''
					convs.append(complete_dialog)
					'''After one dialog has been stored, clear it so the next dialog can be collected.'''
					complete_dialog = []
			if line[0] == 'M':
				'''On an M marker, extract the utterance and store it in the current dialog complete_dialog.'''
				complete_dialog.append(line[1:])
				'''
				contain M: M 三鹿奶粉也假,不一样的卖啊

				'''
				# print("contain M: {}".format(line))
			'''
			line data: E
			
			line data: M 呵呵
			
			'''
			# print("line data: {}".format(line))
		# print("Complete dialog {}".format(complete_dialog))
	print("All complete dialog: {}".format(convs))
	return convs

if __name__ == "__main__":
	source_path = "./data/source_data.conv"
	process_cut(source_path, None)
  • Result
[[' 呵呵', ' 是王若猫的。'], [' 不是', ' 那是什么?'], [' 怎么了', ' 我很难过,安慰我~'], [' 开心点哈,一切都会好起来', ' 嗯 会的'], [' 我还喜欢她,怎么办', ' 我帮你告诉她?发短信还是打电话?'], [' 短信', ' 嗯嗯。我也相信'], [' 你知道谁么', ' 肯定不是我,是阮德培'], [' 许兵是谁', ' 吴院四班小帅哥'], [' 这么假', ' 三鹿奶粉也假,不一样的卖啊'], [' 许兵是傻逼', ' 被你发现了。'], [' 许兵是谁', ' 是我善良可爱的主人的老公啊'], [' 许兵是谁', ' 是穆森的老婆啊'], [' 许兵是谁', ' 奇葩']]
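
Note that process_cut accepts a cut_path argument but never writes to it; the __main__ block above simply passes None. Below is a minimal, hypothetical sketch of how the cut dialogs could also be persisted to cut_path (the output format, one utterance per line with a blank line between dialogs, is an assumption and not part of the original code).

def save_cut(convs, cut_path):
	'''Hypothetical helper: persist the extracted dialogs to cut_path.'''
	with open(cut_path, 'w', encoding='utf8') as f:
		for dialog in convs:
			for sentence in dialog:
				# One utterance per line, E/M markers already removed.
				f.write(sentence.strip() + '\n')
			# A blank line separates dialogs (assumed format).
			f.write('\n')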

1.2 Extracting Question-Answer Pairs

def question_answer(convs):
	questions = []
	answers = []
	for conv in convs:
		if len(conv) == 1:
			continue
		if len(conv) % 2 != 0:
			'''If the dialog does not alternate strictly, drop the last utterance so questions and answers stay paired.'''
			conv = conv[:-1]
		for i in range(len(conv)):
			if i % 2 == 0:
				questions.append(conv[i])
			else:
				answers.append(conv[i])
	print("questions: {} \n answers: {}".format(questions, answers))
	return questions, answers
if __name__ == "__main__":
	source_path = "./data/source_data.conv"
	convs = process_cut(source_path, None)
	questions, answers = question_answer(convs)
  • Result
questions: [' 呵呵', ' 不是', ' 怎么了', ' 开心点哈,一切都会好起来', ' 我还喜欢她,怎么办', ' 短信', ' 你知道谁么', ' 许兵是谁', ' 这么假', ' 许兵是傻逼', ' 许兵是谁', ' 许兵是谁', ' 许兵是谁'] 
 answers: [' 是王若猫的。', ' 那是什么?', ' 我很难过,安慰我~', ' 嗯 会的', ' 我帮你告诉她?发短信还是打电话?', ' 嗯嗯。我也相信', ' 肯定不是我,是阮德培', ' 吴院四班小帅哥', ' 三鹿奶粉也假,不一样的卖啊', ' 被你发现了。', ' 是我善良可爱的主人的老公啊', ' 是穆森的老婆啊', ' 奇葩']
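
For reference, the even/odd index loop in question_answer can also be written with list slicing. The sketch below is an equivalent alternative and not part of the original code: even positions become questions, odd positions become answers.

def question_answer_sliced(convs):
	'''Equivalent Q&A pairing using slicing.'''
	questions, answers = [], []
	for conv in convs:
		if len(conv) < 2:
			continue
		if len(conv) % 2 != 0:
			# Drop the trailing utterance so questions and answers stay one-to-one.
			conv = conv[:-1]
		questions.extend(conv[0::2])
		answers.extend(conv[1::2])
	return questions, answers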

1.3 Saving the Q&A Corpus

def save_question_answer(questions, answers, test_size,
							train_question_path, train_answer_path,
							test_question_path, test_answer_path):
	'''Save the question and answer datasets.

	:params questions: list of questions
	:params answers: list of answers
	:params test_size: number of test samples
	:params train_question_path: question dataset used for training
	:params train_answer_path: answer dataset used for training
	:params test_question_path: question dataset used for testing
	:params test_answer_path: answer dataset used for testing
	'''
	'''File objects for the training dataset.'''
	train_quesition_enc = open(train_question_path, "w")
	train_answer_dec = open(train_answer_path, "w")
	'''File objects for the test dataset.'''
	test_question_enc = open(test_question_path, "w")
	test_answer_dec = open(test_answer_path, "w")
	'''Randomly pick test samples according to the configured test set size.'''
	test_index = random.sample([i for i in range(len(questions))], test_size)

	for i in range(len(questions)):
		'''Write the test dataset.'''
		if i in test_index:
			test_question_enc.write(questions[i]+'\n')
			test_answer_dec.write(answers[i]+'\n')
		else:
			'''Write the training dataset.'''
			train_quesition_enc.write(questions[i]+'\n')
			train_answer_dec.write(answers[i]+'\n')
	train_quesition_enc.close()
	train_answer_dec.close()
	test_question_enc.close()
	test_answer_dec.close()
if __name__ == "__main__":
	source_path = "./data/source_data.conv"
	convs = process_cut(source_path, None)
	print("convs: {}".format(convs))
	questions, answers = question_answer(convs)
	# print("questions: {} \n answers: {}".format(questions, answers))
	'''Folder list.'''
	folder_list = ["./data/train/", "./data/test/"]
	'''File list.'''
	file_list = ["./data/train/question.enc", "./data/train/answer.dec", "./data/test/question.enc", "./data/test/answer.dec"]
	for i in range(len(folder_list)):
		'''Create the folder if it does not exist.'''
		if not os.path.exists(folder_list[i]):
			os.makedirs(folder_list[i])
	for i in range(len(file_list)):
		'''Create the file if it does not exist.'''
		if not os.path.exists(file_list[i]):
			os.mknod(file_list[i])
	'''Set the training dataset paths.'''
	train_question_path = file_list[0]
	train_answer_path = file_list[1]
	'''Set the test dataset paths.'''
	test_question_path = file_list[2]
	test_answer_path = file_list[3]
	save_question_answer(questions, answers, 5,
						train_question_path, train_answer_path,
						test_question_path, test_answer_path)
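
The split above draws test_size random indices with random.sample and routes each Q&A pair to either the train or the test files. If scikit-learn happens to be available, an in-memory split could be sketched as follows; this is only an illustrative alternative and is not used by the rest of the code.

from sklearn.model_selection import train_test_split

# test_size may be an absolute number of samples; random_state makes the split reproducible.
q_train, q_test, a_train, a_test = train_test_split(
	questions, answers, test_size=5, random_state=42)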

2 Converting the Q&A Dataset from Words to Vectors

2.1 Generating the Vocabulary

def generate_vocabulary(datasets, vocabulary_data):
	PAD = "__PAD__"
	GO = "__GO__"
	EOS = "__EOS__"  # 对话结束
	UNK = "__UNK__"  # 标记未出现在词汇表中的字符
	START_VOCABULART = [PAD, GO, EOS, UNK]
	PAD_ID = 0
	GO_ID = 1
	EOS_ID = 2
	UNK_ID = 3
	vocabulary = {}
	new_vocabulary = []
	with open(datasets, "r") as f:
		counter = 0
		for line in f:
			counter += 1
			'''Strip leading/trailing whitespace and split the sentence into characters for counting.'''
			tokens = [word for word in line.strip()]
			for word in tokens:
				if word in vocabulary:
					vocabulary[word] += 1
				else:
					vocabulary[word] = 1

	vocabulary_list = START_VOCABULART + sorted(vocabulary, key=vocabulary.get, reverse=True)
	print("vocabulary: {}".format(vocabulary_list))
	with open(vocabulary_data, "w") as f:
		for word in vocabulary_list:
			f.write(word+'\n')

if __name__ == "__main__":
	file_list = ["./data/train/question.enc", "./data/train/answer.dec", "./data/test/question.enc", "./data/test/answer.dec"]
	voc_list = ["./data/train/question_voc", "./data/train/answer_voc"]
	for i in range(len(voc_list)):
		'''Create the vocabulary file if it does not exist.'''
		if not os.path.exists(voc_list[i]):
			os.mknod(voc_list[i])
		'''Generate and save the vocabulary for the questions and the answers.'''
		generate_vocabulary(file_list[i], voc_list[i])
  • Result
vocabulary: ['__PAD__', '__GO__', '__EOS__', '__UNK__', '是', '谁', '许', '兵', '呵', '么', '不', '短', '信', '你', '知', '道', '这', '假', '傻', '逼']
vocabulary: ['__PAD__', '__GO__', '__EOS__', '__UNK__', '是', '。', '的', '嗯', '我', '也', '不', ',', '王', '若', '猫', '那', '什', '么', '?', '相', '信', '肯', '定', '阮', '德', '培', '吴', '院', '四', '班', '小', '帅', '哥', '三', '鹿', '奶', '粉', '假', '一', '样', '卖', '啊', '被', '你', '发', '现', '了', '奇', '葩']
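
generate_vocabulary counts character frequencies with a plain dict. collections.Counter, which the summary section below also relies on, produces the same frequency-ordered result more compactly; here is a minimal sketch of just the counting step (the function name is illustrative):

import collections

def count_characters(dataset_path):
	'''Character-frequency counting equivalent to the dict-based loop above.'''
	counter = collections.Counter()
	with open(dataset_path, "r") as f:
		for line in f:
			counter.update(line.strip())
	# most_common() returns characters sorted by descending frequency.
	return [ch for ch, _ in counter.most_common()]

The vocabulary list would then be START_VOCABULART plus the returned characters, exactly as in the function above.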

2.2 Words to Vectors

def word_to_vector(dataset_qa, vocabulary, vector):
	UNK_ID = 3
	tmp_vocab = []
	with open(vocabulary, "r") as f:
		'''
		Read all lines into memory. extend is used so that the lines become individual
		elements instead of one nested list: given [0, 1, 2] and [2, 3, 5],
		append would store [[0, 1, 2], [2, 3, 5]],
		while extend stores [0, 1, 2, 2, 3, 5].
		'''
		tmp_vocab.extend(f.readlines())
	'''Strip the line-feed character: \n'''
	tmp_vocab = [line.strip() for line in tmp_vocab]
	'''
	Map each character to its line number. enumerate yields (index, character) pairs such as (0, "__PAD__"),
	which is the wrong way round, so each pair (y, x) is swapped to (x, y).
	dict then turns the tuples into a dictionary of the form {'__PAD__': 0, '__GO__': 1, '__EOS__': 2, '__UNK__': 3, '是': 4, '谁': 5, '许': 6, '兵': 7, '呵': 8, '么': 9, '不': 10, '短': 11, '信': 12, '你': 13, '知': 14, '道': 15, '这': 16, '假': 17, '傻': 18, '逼': 19}
	'''
	vocab = dict([(x,y) for (y,x) in enumerate(tmp_vocab)])
	print("vocabulary dictionary: {}".format(vocab))
	with open(vector, "w") as f_vector:
		with open(dataset_qa, "r") as f_qa:
			for line in f_qa:
				line_vec = []
				for words in line.strip():
					'''Map each character to its id: look up its value in the dictionary, falling back to UNK_ID if it is missing.'''
					line_vec.append(vocab.get(words, UNK_ID))
				print("line vector: {}".format(line_vec))
				f_vector.write(" ".join([str(num) for num in line_vec]) + '\n')
if __name__ == "__main__":
	
	word_to_vector("./data/train/question.enc","./data/train/question_voc", "./data/train/question.voc")
  • Result
vocabulary dictionary: {'__PAD__': 0, '__GO__': 1, '__EOS__': 2, '__UNK__': 3, '是': 4, '谁': 5, '许': 6, '兵': 7, '呵': 8, '么': 9, '不': 10, '短': 11, '信': 12, '你': 13, '知': 14, '道': 15, '这': 16, '假': 17, '傻': 18, '逼': 19}
line vector: [8, 8]
line vector: [10, 4]
line vector: [11, 12]
line vector: [13, 14, 15, 5, 9]
line vector: [6, 7, 4, 5]
line vector: [16, 9, 17]
line vector: [6, 7, 4, 18, 19]
line vector: [6, 7, 4, 5]
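
To sanity-check the generated id file, the mapping can be inverted and each vector line decoded back into text. The following is a small verification sketch (the helper name is illustrative); ids that fall outside the vocabulary are rendered as __UNK__, mirroring word_to_vector.

def vector_to_words(vector_path, vocabulary_path):
	'''Decode id sequences back into characters using the saved vocabulary.'''
	with open(vocabulary_path, "r") as f:
		id_to_word = [line.strip() for line in f]
	with open(vector_path, "r") as f:
		for line in f:
			ids = [int(num) for num in line.split()]
			words = [id_to_word[i] if i < len(id_to_word) else "__UNK__" for i in ids]
			print("".join(words))

if __name__ == "__main__":
	vector_to_words("./data/train/question.voc", "./data/train/question_voc")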

2.3 Extracting Data by Line Range

def extract_data(source_path, output_path, start, stop):
	'''Extract the lines in the given range and save them.
	:params source_path: path of the raw data.
	:params output_path: path for the saved data.
	:params start: first line number.
	:params stop: last line number.
	return:
	the number of lines in the file.
	'''
	line_number = 0
	if not os.path.exists(output_path):
		os.mknod(output_path)
	with open(source_path, 'r') as fi:
		'''Read the file once to count its lines.'''
		line_number = len(fi.readlines())

	with open(source_path, 'r') as fi:
		if start < 0 or start > line_number or stop <= start or stop < 0:
			return "起始行设置错误,请重新设置."
		else: 
			counter = 0
			for line in fi:
				counter += 1
				if counter >= start:
					with open(output_path, "a") as fo:
						fo.write(line)
						if counter == stop:
							break	
			return line_number
if __name__ == "__main__":
	counter_line = extract_data("./data/source_data.conv", "./data/extra_data.conv", 2, 14)
	print("data lines nubmers: {}".format(counter_line))

3 Complete Code

import random
import os
def process_cut(source_path, cut_path):
	'''Process source data.
	Save the dialog content, excluding the E and M markers.

	:params source_path: source dialog path
	:params cut_path: save dialog path

	return:
	convs: complete dialogs.
	'''

	'''Saved all conversations.'''
	convs = []
	with open(source_path, 'r', encoding='utf8') as f:
		'''<_io.TextIOWrapper name='./data/source_data.conv' mode='r' encoding='utf8'>'''
		print("open context object: {}".format(f))
		# data = f.readlines()
		'''['E\n', 'M 呵呵\n', 'M 是王若猫的。\n']'''
		# print("data: {}".format(data))
		# one_conv = []
		'''Complete dialog: contains Question and Answer.'''
		complete_dialog = []
		for line in f:
			'''Delete line feed symbol: \n'''
			line = line.strip('\n')
			
			if line == "":
				continue
			if line[0] == "E":
				if complete_dialog:
					'''Add dialog to conversations list.'''
					convs.append(complete_dialog)
					complete_dialog = []
			if line[0] == 'M':
				'''Extract the question/answer utterance contained in the M line.'''
				complete_dialog.append(line[1:])
				'''
				contain M: M 三鹿奶粉也假,不一样的卖啊

				'''
				# print("contain M: {}".format(line))
			'''
			line data: E
			
			line data: M 呵呵
			
			'''
			# print("line data: {}".format(line))
		# print("Complete dialog {}".format(complete_dialog))
	# print("All complete dialog: {}".format(convs))
	return convs
def question_answer(convs):
	'''Extract questions and answers from dialog.
	:params convs: dialogs.

	return:
	questions: questions
	answers: answers
	'''
	questions = []
	answers = []
	for conv in convs:
		if len(conv) == 1:
			continue
		if len(conv) % 2 != 0:
			'''If the dialog does not alternate strictly, drop the last utterance so questions and answers stay paired.'''
			conv = conv[:-1]
		for i in range(len(conv)):
			'''Extract Question.'''
			if i % 2 == 0:
				questions.append(conv[i])
			else:
				'''Extract Answer.'''
				answers.append(conv[i])
	print("questions: {} \n answers: {}".format(questions, answers))
	return questions, answers


def save_question_answer(questions, answers, test_size,
							train_question_path, train_answer_path,
							test_question_path, test_answer_path):
	'''Save question and answer dataset.
	:params questions: question
	:params answers: answer
	:params test_size: number of samples to hold out for the test set
	:params train_question_path: question dataset path for train 
	:params train_answer_path: answer dataset path for train
	:params test_question_path: question dataset path for test
	:params test_answer_path: answer dataset path for test
	'''
	'''Train dataset.'''
	train_quesition_enc = open(train_question_path, "w")
	train_answer_dec = open(train_answer_path, "w")
	'''Test dataset.'''
	test_question_enc = open(test_question_path, "w")
	test_answer_dec = open(test_answer_path, "w")
	'''Randomly pick test_size samples for the test dataset.'''
	test_index = random.sample([i for i in range(len(questions))], test_size)

	for i in range(len(questions)):
		if i in test_index:
			test_question_enc.write(questions[i]+'\n')
			test_answer_dec.write(answers[i]+'\n')
		else:
			train_quesition_enc.write(questions[i]+'\n')
			train_answer_dec.write(answers[i]+'\n')
	train_quesition_enc.close()
	train_answer_dec.close()
	test_question_enc.close()
	test_answer_dec.close()

def generate_vocabulary(datasets, vocabulary_data):
	PAD = "__PAD__"
	GO = "__GO__"
	EOS = "__EOS__"  # 对话结束
	UNK = "__UNK__"  # 标记未出现在词汇表中的字符
	START_VOCABULART = [PAD, GO, EOS, UNK]
	PAD_ID = 0
	GO_ID = 1
	EOS_ID = 2
	UNK_ID = 3
	file_list = ["./data/train/question.enc", "./data/train/answer.dec", "./data/test/question.enc", "./data/test/answer.dec"]
	vocabulary = {}
	new_vocabulary = []
	with open(datasets, "r") as f:
		counter = 0
		for line in f:
			counter += 1
			'''Strip the line feed symbol \n and split the sentence into characters.'''
			tokens = [word for word in line.strip()]
			for word in tokens:
				if word in vocabulary:
					vocabulary[word] += 1
				else:
					vocabulary[word] = 1

	vocabulary_list = START_VOCABULART + sorted(vocabulary, key=vocabulary.get, reverse=True)
	print("vocabulary: {}".format(vocabulary_list))
	with open(vocabulary_data, "w") as f:
		for word in vocabulary_list:
			f.write(word+'\n')


def word_to_vector(dataset_qa, vocabulary, vector):
	UNK_ID = 3
	tmp_vocab = []
	with open(vocabulary, "r") as f:
		'''extend adds the lines one by one as separate elements instead of appending the whole list as a single element.'''
		tmp_vocab.extend(f.readlines())
	'''Delete line feed: \n'''
	tmp_vocab = [line.strip() for line in tmp_vocab]
	'''Turn tmp_vocab into (word, index) pairs and then convert them into a dict of the form {word: index}.'''
	vocab = dict([(x,y) for (y,x) in enumerate(tmp_vocab)])
	'''vocabulary dictionary: {'__PAD__': 0, '__GO__': 1, '__EOS__': 2, '__UNK__': 3, '是': 4, '谁': 5, '许': 6, '兵': 7, '呵': 8, '么': 9, '不': 10, '短': 11, '信': 12, '你': 13, '知': 14, '道': 15, '这': 16, '假': 17, '傻': 18, '逼': 19}
	'''
	print("vocabulary dictionary: {}".format(vocab))
	with open(vector, "w") as f_vector:
		with open(dataset_qa, "r") as f_qa:
			for line in f_qa:
				line_vec = []
				for words in line.strip():
					line_vec.append(vocab.get(words, UNK_ID))
				# print("line vector: {}".format(line_vec))
				f_vector.write(" ".join([str(num) for num in line_vec]) + '\n')


def process_data(dataset_qa, vocabulary, vector):
	'''Read and save dataset.'''
	source_path = "./data/source_data.conv"
	convs = process_cut(source_path, None)
	print("convs: {}".format(convs))
	questions, answers = question_answer(convs)
	# print("questions: {} \n answers: {}".format(questions, answers))
	folder_list = ["./data/train/", "./data/test/"]
	file_list = ["./data/train/question.enc", "./data/train/answer.dec", "./data/test/question.enc", "./data/test/answer.dec"]
	for i in range(len(folder_list)):
		if not os.path.exists(folder_list[i]):
			os.makedirs(folder_list[i])
	for i in range(len(file_list)):
		if not os.path.exists(file_list[i]):
			os.mknod(file_list[i])
	'''Set the training dataset paths.'''
	train_question_path = file_list[0]
	train_answer_path = file_list[1]
	'''Set the test dataset paths.'''
	test_question_path = file_list[2]
	test_answer_path = file_list[3]
	save_question_answer(questions, answers, 5,
						train_question_path, train_answer_path,
						test_question_path, test_answer_path)
	PAD = "__PAD__"
	GO = "__GO__"
	EOS = "__EOS__"  # 对话结束
	UNK = "__UNK__"  # 标记未出现在词汇表中的字符
	START_VOCABULART = [PAD, GO, EOS, UNK]
	PAD_ID = 0
	GO_ID = 1
	EOS_ID = 2
	UNK_ID = 3
	voc_list = ["./data/train/question_voc", "./data/train/answer_voc"]
	for i in range(len(voc_list)):
		if not os.path.exists(voc_list[i]):
			os.mknod(voc_list[i])
		generate_vocabulary(file_list[i], voc_list[i])
	word_to_vector(dataset_qa, vocabulary, vector)
if __name__ == "__main__":
	process_data("./data/train/question.enc","./data/train/question_voc", "./data/train/question.vec")
	

4 Summary

Text processing pipeline:

1. Clean the raw data and extract the Q&A dialogs into memory.
2. Split the Q&A corpus into questions and answers and save them (train & test).
3. Build the vocabulary from the Q&A corpus and save it.
4. Build the character-to-number mapping from the vocabulary in memory.
5. Generate the word vectors from the mapped data.

Numbering the text

Sort the words in the text by frequency.

import codecs
import collections
from operator import itemgetter

RAW_DATA = "vocabulary.txt"
VOCAB_OUTPUT = "ptb.vocab"

counter = collections.Counter()
with codecs.open(RAW_DATA, "r", "utf-8") as f:
	for line in f:
		for word in line.strip().split():
			counter[word] += 1
			print(counter)

sorted_word_to_cnt = sorted(counter.items(), key=itemgetter(1), reverse=True)
print(sorted_word_to_cnt)
sorted_words = [x[0] for x in sorted_word_to_cnt]
print(sorted_words)

sorted_words = [""] + sorted_words
print(sorted_words)

sorted_words = ["", "", ""] + sorted_words
print(sorted_words)

with codecs.open(VOCAB_OUTPUT, 'w', 'utf-8') as file_output:
	for word in sorted_words:
		file_output.write(word + "\n")

Assigning numbers to the text

import codecs
import sys

RAW_DATA = "vocabulary.txt"
VOCAB = "ptb.vocab"
OUTPUT_DATA = "ptb.train"

# Read the vocabulary
with codecs.open(VOCAB, "r", "utf-8") as f_vocab:
	vocab = [w.strip() for w in f_vocab.readlines()]
# Build a dictionary mapping word -> line number
word_to_id = {k: v for (k, v) in zip(vocab, range(len(vocab)))}
# Look up the line number of a word; unknown words fall back to the "<unk>" token (assumed, as above)
def get_id(word):
	return word_to_id[word] if word in word_to_id else word_to_id["<unk>"]

fin = codecs.open(RAW_DATA, "r", "utf-8")
fout = codecs.open(OUTPUT_DATA, "w", "utf-8")

for line in fin:
	words = line.strip().split() + ["<eos>"]  # append the sentence-end token (assumed)
	out_line = ' '.join([str(get_id(w)) for w in words]) + '\n'
	fout.write(out_line)

fin.close()
fout.close()
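
The id sequences written above still have different lengths. Before they are fed into an embedding layer they are usually padded (or truncated) to a fixed length with the PAD id, which is 0 in the vocabulary built earlier. The following is a minimal sketch of that final step; the maximum length of 10 and the function name are assumptions for illustration.

PAD_ID = 0

def load_padded_ids(vector_path, max_len=10):
	'''Read space-separated id sequences and pad or truncate each one to max_len.'''
	batch = []
	with open(vector_path, "r") as f:
		for line in f:
			ids = [int(num) for num in line.split()]
			ids = ids[:max_len] + [PAD_ID] * (max_len - len(ids))
			batch.append(ids)
	return batch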
