import json
import time
# Points (5)
def read_corpus(file_path):
    """
    Read the given corpus and collect the questions and answers into qlist
    and alist respectively. No string processing is done here (that happens
    in Part 2.3).
    qlist = ["question 1", "question 2", "question 3" ....]
    alist = ["answer 1", "answer 2", "answer 3" ....]
    Every question must line up with its answer (same index in both lists).
    """
    qlist = []
    alist = []
    with open(file_path, 'r') as f:
        json_data = json.load(f)
    for data_dict in json_data["data"]:
        for content_dict in data_dict.get("paragraphs", []):
            for q_a_dict in content_dict.get("qas", []):
                if len(q_a_dict["answers"]) > 0:
                    qlist.append(q_a_dict["question"])
                    alist.append(q_a_dict["answers"][0]["text"])
                # else:
                #     print("answers: no answer found")
    print("qlist len:" + str(len(qlist)))
    print("alist len:" + str(len(alist)))
    assert len(qlist) == len(alist)  # make sure the lengths match
    return qlist, alist
q_list, a_list = read_corpus("data/train-v2.0.json")
word_total_count = 0
word_total_list = []
word_dict = {}
start = time.process_time()
for line in q_list:
    word_list = line.split(" ")
    word_total_count += len(word_list)
    for word in word_list:
        word_dict[word] = word_dict.get(word, 0) + 1
print("word_total_count:" + str(word_total_count))
print("word_redupliction_list count:" + str(len(word_dict.keys())))
elapsed = (time.process_time() - start)
print("Time used:",elapsed)
import numpy as np
from matplotlib import pyplot as plt
q_list, a_list = read_corpus("data/train-v2.0.json")
start = time.process_time()
word_freq_dict = {}
for line in q_list:
    word_list = line.split(" ")
    for word in word_list:
        word_freq_dict[word] = word_freq_dict.get(word, 0) + 1
word_freq_dict = dict(sorted(word_freq_dict.items(), key=lambda x: x[1], reverse=True))
elapsed = (time.process_time() - start)
print("Time used:", elapsed)
print("total words:" + str(len(word_freq_dict)))
x = range(len(word_freq_dict))
y = list(word_freq_dict.values())
plt.figure()
plt.plot(x, y)
plt.show()
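# Word-frequency distributions like this one are typically Zipfian. As a quick
# follow-up sketch (an addition, reusing the sorted counts in y above), plotting
# rank against frequency on log-log axes should give a roughly straight line.
plt.figure()
plt.loglog(range(1, len(y) + 1), y)
plt.xlabel('rank')
plt.ylabel('frequency')
plt.show()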
import matplotlib.pyplot as plt
import numpy as np
x = np.arange(0.0, 1000, 0.1)
y = np.exp(-x)
fig = plt.figure()
ax1 = fig.add_subplot(111)
ax1.plot(x, y)
ax1.set_ylabel('Y values for exp(-x)')
ax1.set_title("exp(-x)")
plt.show()
def statistic_words(temp_list):
    # Count word frequencies over a list of sentences.
    temp_dict = {}
    for line in temp_list:
        word_list = line.split(" ")
        for word in word_list:
            temp_dict[word] = temp_dict.get(word, 0) + 1
    return temp_dict
qlist, alist = read_corpus("data/train-v2.0.json")
start = time.process_time()
new_qlist = statistic_words(qlist)
new_alist = statistic_words(alist)
elapsed = (time.process_time() - start)
new_qlist = dict(sorted(new_qlist.items(), key = lambda x:x[1], reverse = True))
new_alist = dict(sorted(new_alist.items(), key = lambda x:x[1], reverse = True))
print("qlist top 10:")
print([i + ":" + str(new_qlist.get(i)) for i in new_qlist.keys()][:10])
print("alist top 10:")
print([i + ":" + str(new_alist.get(i)) for i in new_qlist.keys()][:10])
print("Time used:",elapsed)
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
stop_words = set(stopwords.words('english'))  # requires the NLTK "stopwords" corpus (nltk.download('stopwords'))
pattern = re.compile('[{}]'.format(re.escape(string.punctuation)))
stemmer = PorterStemmer()
# Preprocessing: 1. strip useless characters 2. lowercase 3. remove stop words
# 4. special-case digits 5. stemming
def preprocessing(temp_list):
    word_list_list = []
    word_dict = {}
    for line in temp_list:
        temp_word_list = []
        sentence = pattern.sub("", line)  # 1. strip punctuation and other useless characters
        sentence = sentence.lower()       # 2. lowercase
        word_list = sentence.split()
        for word in word_list:
            if word not in stop_words:                        # 3. remove stop words
                word = "#number" if word.isdigit() else word  # 4. special-case digits
                word = stemmer.stem(word)                     # 5. stemming
                word_dict[word] = word_dict.get(word, 0) + 1
                temp_word_list.append(word)
        word_list_list.append(temp_word_list)
    return word_dict, word_list_list
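# Quick sanity check of preprocessing() on one question from the test section
# further below (a small illustrative addition): the stop words "when" and "did"
# are dropped and the remaining words are stemmed.
_, demo_lists = preprocessing(["when did Beyonce start becoming popular?"])
print(demo_lists)  # [['beyonc', 'start', 'becom', 'popular']]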
# Keep only the words whose corpus frequency falls inside [lower, upper]
def filter_words(in_list, in_dict, lower, upper):
    word_set = {key for key, val in in_dict.items() if lower <= val <= upper}
    new_list = []
    for line in in_list:
        words = [w for w in line if w in word_set]
        new_list.append(' '.join(words))
    return new_list
qlist, alist = read_corpus("data/train-v2.0.json")
start = time.process_time()
q_dict, q_list_list = preprocessing(qlist)
new_qlist = filter_words(q_list_list, q_dict, 2, 1000)
a_dict, a_list_list = preprocessing(alist)
new_alist = filter_words(a_list_list, a_dict, 2, 1000)
elapsed = (time.process_time() - start)
print("Time used:", elapsed)
q_dict_sorted = dict(sorted(q_dict.items(), key = lambda x:x[1], reverse = True))
y1 = [q_dict_sorted.get(key) for key in q_dict_sorted]
x1 = range(len(y1))
plt.figure()
plt.plot(x1, y1)
plt.title('qlist')
plt.show()
y2 = [v for v in q_dict_sorted.values() if 2 <= v <= 1000]
x2 = range(len(y2))
plt.figure()
plt.plot(x2, y2)
plt.title('qlist (frequency 2-1000)')
plt.show()
a_dict_sorted = dict(sorted(a_dict.items(), key = lambda x:x[1], reverse = True))
y1 = [a_dict_sorted.get(key) for key in a_dict_sorted]
x1 = range(len(y1))
plt.figure()
plt.plot(x1, y1)
plt.title('alist')
plt.show()
y2 = [v for v in a_dict_sorted.values() if 1 <= v <= 1000]
x2 = range(len(y2))
plt.figure()
plt.plot(x2, y2)
plt.title('alist (frequency 1-1000)')
plt.show()
# qlist, alist  # the updated versions
qlist = new_qlist
# alist = new_alist  # alist is still needed later in its raw form, so it is not updated here
print('new qlist len:' + str(len(qlist)))
print('new alist len:' + str(len(alist)))
from sklearn.feature_extraction.text import TfidfVectorizer
# Count the words in new_qlist, mainly to verify TfidfVectorizer
word_set = set()
start = time.process_time()
for line in qlist:
    word_list = line.split(' ')
    for word in word_list:
        word_set.add(word)
print('qlist len :' + str(len(qlist)))
print('dict total num:' + str(len(word_set)))
elapsed = (time.process_time() - start)
print("Time used:", elapsed)
start = time.process_time()
vectorizer = TfidfVectorizer()  # define a tf-idf vectorizer
X = vectorizer.fit_transform(qlist)  # the result is stored in the matrix X
elapsed = (time.process_time() - start)
print("Time used:", elapsed)
start = time.process_time()
x_mat = X.toarray()  # note: materializing the dense matrix is memory-hungry for a large corpus
n, m = x_mat.shape
num = 0
print(n)
print(m)
elapsed = (time.process_time() - start)
print("Time used:", elapsed)
start = time.process_time()
for i in range(n):
    for j in range(m):
        if x_mat[i][j] != 0:
            num += 1
sparsity = num / (n * m)
print(sparsity)  # print the sparsity
elapsed = (time.process_time() - start)
print("Time used:", elapsed)
from sklearn.metrics.pairwise import cosine_similarity
from queue import PriorityQueue as PQueue
def top5results(input_q):
    """
    Given the user's question input_q, return the most likely TOP 5 questions.
    This needs to:
    1. Run the same preprocessing on input_q, then convert it into a tf-idf
       vector (using the vectorizer above)
    2. Compute the similarity against every question in the corpus
    3. Return the answers of the top-5 most similar questions
    """
    # Preprocess the question
    pattern = re.compile('[{}]'.format(re.escape(string.punctuation)))
    sentence = pattern.sub("", input_q)
    sentence = sentence.lower()
    words = sentence.split()
    word_list = []
    for word in words:
        if word not in stop_words:
            word = "#number" if word.isdigit() else word
            word = stemmer.stem(word)
            word_list.append(word)
    # Compute the similarity
    input_seg = ' '.join(word_list)
    input_vec = vectorizer.transform([input_seg])
    res = cosine_similarity(input_vec, X)[0]
    # Get the top-5 indices
    pq = PQueue()
    for i, v in enumerate(res):
        pq.put((1.0 - v, i))
    top_idxs = []
    for i in range(5):
        top_idxs.append(pq.get()[1])
    print(top_idxs)  # top_idxs holds the indices (into qlist) of the most similar questions
    # hint: use a priority queue to find the top results. Why does this work?
    # Because the queue pops the smallest priority first, using 1.0 - similarity
    # as the priority makes the most similar questions come out first.
    # # Alternative: get the top-5 indices by sorting
    # score_idx = dict((i, v) for i, v in enumerate(res))
    # score_sorted = sorted(score_idx.items(), key=lambda k: k[1], reverse=True)
    # score_sorted = score_sorted[:5]
    # print(score_sorted)
    # top_idxs = [idx[0] for idx in score_sorted]
    # print(top_idxs)
    result = [alist[i] for i in top_idxs]
    return result  # return the answers of the most similar questions as the TOP 5 answers
Test and results
print (top5results("when did Beyonce start becoming popular?"))
print (top5results("what languge does the word of 'symbiosis' come from"))
[0, 60835, 39267, 23136, 693]
['in the late 1990s', 'mandolin-based guitar programs', 'Particularly since the 1950s, pro wrestling events have frequently been responsible for sellout crowds at large arenas', 'early DJs creating music in their own homes', 'Agnèz Deréon']
[7786, 8154, 27470, 41967, 7844]
['Greek', 'Persian and Sanskrit', '1570s', 'living together', 'the evolution of all eukaryotes']
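# On the priority-queue hint above: the top-5 selection can equivalently be done
# with the standard library's heapq (a minimal alternative sketch). nlargest keeps
# only a 5-element heap, so it runs in O(n log 5) rather than pushing all n scores.
import heapq
def top5_idxs(res):
    return [i for i, v in heapq.nlargest(5, enumerate(res), key=lambda t: t[1])]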
from functools import reduce
inverted_idx = {}  # define a simple inverted index
for i in range(len(qlist)):
    for word in qlist[i].split():
        if word in inverted_idx:
            inverted_idx[word].append(i)
        else:
            inverted_idx[word] = [i]
for key in inverted_idx:
    inverted_idx[key] = sorted(inverted_idx[key])  # indices are appended in ascending order, so this is just a safeguard
# Intersection of two sets
def intersections(set1, set2):
    return set1.intersection(set2)
def top5results_invidx(input_q):
    """
    Given the user's question input_q, return the most likely TOP 5 questions.
    This needs to:
    1. Use the inverted index to narrow down the candidates
    2. Run the same preprocessing on input_q, then convert it into a tf-idf
       vector (using the vectorizer above)
    3. Compute the similarity against each candidate question
    4. Return the answers of the top-5 most similar questions
    """
    # Preprocess the input string
    pattern = re.compile('[{}]'.format(re.escape(string.punctuation)))
    sentence = pattern.sub("", input_q)
    sentence = sentence.lower()
    word_list = sentence.split()
    result_list = []
    for word in word_list:
        if word not in stop_words:
            word = "#number" if word.isdigit() else word
            word = stemmer.stem(word)
            result_list.append(word)
    print(result_list)
    # Look up the inverted index to build the candidate set
    candidate_list = []
    for word in result_list:
        if word in inverted_idx:
            idx_list = inverted_idx[word]
            candidate_list.append(set(idx_list))
    if not candidate_list:  # no query term appears in the corpus
        return []
    # Candidate question indices
    # print(candidate_list)
    candidate_idx = list(reduce(intersections, candidate_list))
    input_seg = ' '.join(result_list)
    input_vec = vectorizer.transform([input_seg])
    # Compute the similarity for every candidate index
    similarity_list = []
    for i in candidate_idx:
        similarity = cosine_similarity(input_vec, X[i])[0]
        similarity_list.append((i, similarity[0]))
    res_sorted = sorted(similarity_list, key=lambda k: k[1], reverse=True)
    # Retrieve the top-5 answers by index
    answers = []
    for idx, score in res_sorted[:5]:
        answers.append(alist[idx])
    return answers
Test and results
print(top5results_invidx("when did Beyonce start becoming popular"))
print(top5results_invidx("what languge does the word of 'symbiosis' come from"))
['beyonc', 'start', 'becom', 'popular']
['in the late 1990s']
['langug', 'word', 'symbiosi', 'come']
['Greek']
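# Note on the candidate step above: intersecting the posting lists requires every
# query term to appear in a question, which can over-filter. A common relaxation
# (a hedged sketch, not part of the original assignment) is to take the union of
# the posting lists instead, so matching any single query term is enough.
def candidates_union(words):
    ids = set()
    for w in words:
        ids |= set(inverted_idx.get(w, []))
    return ids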
def load_glove(path):
    vocab = {}
    embedding = []
    vocab["UNK"] = 0
    embedding.append([0.0] * 100)  # zero vector for unknown words (glove.6B.100d vectors are 100-dimensional)
    with open(path, 'r', encoding='utf8') as f:
        i = 1
        for line in f:
            row = line.strip().split()
            vocab[row[0]] = i
            embedding.append([float(x) for x in row[1:]])  # parse the vector entries as floats
            i += 1
    return vocab, embedding
# Convert a word list into a sentence vector: the mean of the word vectors of
# all in-vocabulary words; out-of-vocabulary words are ignored.
def word_to_vec(words, vocab, emb):
    vecs = [emb[vocab[word]] for word in words if word in vocab]
    if not vecs:  # no known words: fall back to the UNK zero vector
        return np.asarray(emb[0], dtype=float).reshape(1, -1)
    return np.mean(vecs, axis=0).reshape(1, -1)
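# Toy sanity check of word_to_vec with a hypothetical 3-dimensional vocabulary
# (made-up data, not the real GloVe file): the sentence vector is the mean of
# the in-vocabulary word vectors, and the OOV word "xyzzy" is skipped.
toy_vocab = {"UNK": 0, "cat": 1, "dog": 2}
toy_emb = [[0.0, 0.0, 0.0], [1.0, 0.0, 1.0], [0.0, 2.0, 1.0]]
print(word_to_vec(["cat", "dog", "xyzzy"], toy_vocab, toy_emb))  # [[0.5 1. 1.]]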
path = './data/glove.6B/glove.6B.100d.txt'
# TODO
vocabs, emb = load_glove(path)
# emb holds the embedding of every word: a D*H matrix, where D is the size of the
# vocabulary and H is the word-vector dimension. Given per-word vectors, how do we
# represent a sentence? The simplest way: sentence vector = average of the word
# vectors appearing in the question; words not in the vocabulary are ignored.
def top5results_emb(input_q):
    """
    Given the user's question input_q, return the most likely TOP 5 questions.
    This needs to:
    1. Use the inverted index to narrow down the candidates
    2. Convert input_q into a sentence vector
    3. Compute the similarity against each candidate question
    4. Return the answers of the top-5 most similar questions
    """
    # Preprocess the question
    pattern = re.compile('[{}]'.format(re.escape(string.punctuation)))
    sentence = pattern.sub("", input_q)
    sentence = sentence.lower()
    word_list = sentence.split()
    result_list = []
    for word in word_list:
        if word not in stop_words:
            word = "#number" if word.isdigit() else word
            word = stemmer.stem(word)
            result_list.append(word)
    # Sentence vector of the input question
    input_q_vec = word_to_vec(result_list, vocabs, emb)
    # Use the inverted index to get the candidate question indices
    candidates = []
    for word in result_list:
        if word in inverted_idx:
            ids = inverted_idx[word]
            candidates.append(set(ids))
    if not candidates:  # no query term appears in the corpus
        return []
    candidate_idx = list(reduce(intersections, candidates))  # candidate question indices
    # Compute the similarity scores
    scores = []
    for i in candidate_idx:
        sentence = new_qlist[i].split()
        vec = word_to_vec(sentence, vocabs, emb)
        score = cosine_similarity(input_q_vec, vec)[0][0]
        scores.append((i, score))
    scores_sorted = sorted(scores, key=lambda k: k[1], reverse=True)
    # Retrieve the top-5 answers by index
    answers = []
    for idx, score in scores_sorted[:5]:
        answers.append(alist[idx])
    return answers
Test and results
print(top5results_emb("when did Beyonce start becoming popular"))
print(top5results_emb("what languge does the word of 'symbiosis' come from"))
print(top5results_emb("In her music, what are some?"))
['in the late 1990s']
['Greek']
['Philadelphia soul', 'love, relationships, and monogamy', 'The Fighting Temptations', "The UK's largest pipe organ", 'the foreign genres of hip hop and reggae']