A Simple Question-Answering System

1. Read the file and write its contents into two lists (one list for the questions, one for the answers)

import json
import time

def read_corpus(file_path):
    """
    Read the given corpus and write the questions and answers into qlist and alist.
    Do not do any string processing here (that happens in Part 2.3).
    qlist = ["question 1", "question 2", "question 3", ...]
    alist = ["answer 1", "answer 2", "answer 3", ...]
    Make sure every question lines up with its answer (same index in both lists).
    """
    qlist = []
    alist = []
    with open(file_path, 'r', encoding='utf8') as f:
        json_data = json.load(f)  # the SQuAD file is a single JSON object
        for data_dict in json_data["data"]:
            for content_dict in data_dict["paragraphs"]:
                for q_a_dict in content_dict["qas"]:
                    if len(q_a_dict["answers"]) > 0:  # skip unanswerable questions
                        qlist.append(q_a_dict["question"])
                        alist.append(q_a_dict["answers"][0]["text"])
    print("qlist len:" + str(len(qlist)))
    print("alist len:" + str(len(alist)))
    assert len(qlist) == len(alist)  # make sure the lengths match
    return qlist, alist

2. Understand the data (visualization / statistics)

q_list, a_list = read_corpus("data/train-v2.0.json")

word_total_count = 0
word_dict = {}

start = time.process_time()
for line in q_list:
    word_list = line.split(" ")
    word_total_count += len(word_list)
    for word in word_list:
        word_dict[word] = word_dict.get(word, 0) + 1

print("word_total_count:" + str(word_total_count))
print("unique word count:" + str(len(word_dict)))
elapsed = (time.process_time() - start)
print("Time used:", elapsed)
import numpy as np
from matplotlib import pyplot as plt

q_list, a_list = read_corpus("data/train-v2.0.json")


start = time.process_time()
word_freq_dict = {}
for line in q_list:
    word_list = line.split(" ")
    for word in word_list:
        word_freq_dict[word] = word_freq_dict.get(word, 0) + 1

word_freq_dict = dict(sorted(word_freq_dict.items(), key=lambda x: x[1], reverse=True))

elapsed = (time.process_time() - start)
print("Time used:", elapsed)

print("total words:" + str(len(word_freq_dict)))
x = range(len(word_freq_dict))
y = [word_freq_dict.get(key) for key in word_freq_dict]
plt.figure()
plt.plot(x, y)
plt.show()
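The frequency curve above drops off with a long tail, roughly as Zipf's law predicts. For comparison, the next cell plots a pure exponential decay: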
import matplotlib.pyplot as plt
import numpy as np

x = np.arange(0.0, 1000, 0.1)
y = np.exp(-x)

fig = plt.figure()
ax1 = fig.add_subplot(111)
ax1.plot(x, y)

ax1.set_ylabel('Y values for exp(-x)')
ax1.set_title("exp(-x)")

plt.show()
def statistic_words(temp_list):
    # count how often each word occurs across all lines
    temp_dict = {}
    for line in temp_list:
        word_list = line.split(" ")
        for word in word_list:
            temp_dict[word] = temp_dict.get(word, 0) + 1
    return temp_dict

qlist, alist = read_corpus("data/train-v2.0.json")

start = time.process_time()
new_qlist = statistic_words(qlist)
new_alist = statistic_words(alist)
elapsed = (time.process_time() - start)

new_qlist = dict(sorted(new_qlist.items(), key = lambda x:x[1], reverse = True))
new_alist = dict(sorted(new_alist.items(), key = lambda x:x[1], reverse = True))

print("qlist top 10:")
print([i + ":" + str(new_qlist.get(i)) for i in new_qlist.keys()][:10])

print("alist top 10:")
print([i + ":" + str(new_alist.get(i)) for i in new_qlist.keys()][:10])

print("Time used:",elapsed)

3. Text preprocessing

import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer

stop_words = set(stopwords.words('english'))
pattern = re.compile('[{}]'.format(re.escape(string.punctuation)))
stemmer = PorterStemmer()

# Preprocessing: 1. strip useless characters  2. lowercase  3. drop stop words
# 4. special-case digits  5. stemming
def preprocessing(temp_list):
    word_list_list = []
    word_dict = {}
    for line in temp_list:
        temp_word_list = []
        sentence = pattern.sub("", line)                      # 1. strip punctuation
        sentence = sentence.lower()                           # 2. lowercase
        word_list = sentence.split()
        for word in word_list:
            if word not in stop_words:                        # 3. drop stop words
                word = "#number" if word.isdigit() else word  # 4. collapse digits to a placeholder
                word = stemmer.stem(word)                     # 5. stemming
                word_dict[word] = word_dict.get(word, 0) + 1
                temp_word_list.append(word)
        word_list_list.append(temp_word_list)
    return word_dict, word_list_list
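A quick sanity check of the pipeline on a single question; the resulting stems match the tokens printed later in Part 6:

demo_dict, demo_lists = preprocessing(["When did Beyonce start becoming popular?"])
print(demo_lists)  # [['beyonc', 'start', 'becom', 'popular']]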

# Keep only the words whose corpus frequency lies in [lower, upper]
def filter_words(in_list, in_dict, lower=0, upper=0):
    # use a set for O(1) membership tests
    word_set = {key for key, val in in_dict.items() if lower <= val <= upper}

    new_list = []
    for line in in_list:
        words = [w for w in line if w in word_set]
        new_list.append(' '.join(words))

    return new_list
qlist, alist = read_corpus("data/train-v2.0.json")

start = time.process_time()

q_dict, q_list_list = preprocessing(qlist)
new_qlist = filter_words(q_list_list, q_dict, 2, 1000)
a_dict, a_list_list = preprocessing(alist)
new_alist = filter_words(a_list_list, a_dict, 2, 1000)

elapsed = (time.process_time() - start)
print("Time used:", elapsed)


q_dict_sorted = dict(sorted(q_dict.items(), key = lambda x:x[1], reverse = True))
y1 = [q_dict_sorted.get(key) for key in q_dict_sorted]
x1 = range(len(y1))
plt.figure()
plt.plot(x1, y1)
plt.title('qlist (all words)')
plt.show()

y2 = [q_dict_sorted.get(key) for key in q_dict_sorted if q_dict_sorted.get(key) >= 2 and q_dict_sorted.get(key) <= 1000]
x2 = range(len(y2))
plt.figure()
plt.plot(x2, y2)
plt.title('qlist (word frequency in [2, 1000])')
plt.show()


a_dict_sorted = dict(sorted(a_dict.items(), key = lambda x:x[1], reverse = True))
y1 = [a_dict_sorted.get(key) for key in a_dict_sorted]
x1 = range(len(y1))
plt.figure()
plt.plot(x1, y1)
plt.title('alist (all words)')
plt.show()

y2 = [a_dict_sorted.get(key) for key in a_dict_sorted if a_dict_sorted.get(key) >= 1 and a_dict_sorted.get(key) <= 1000]
x2 = range(len(y2))
plt.figure()
plt.plot(x2, y2)
plt.title('alist (word frequency in [1, 1000])')
plt.show()

# qlist, alist   # the updated versions
qlist = new_qlist
# alist = new_alist  # alist is still needed in raw form later, so it is left unchanged
print('new qlist len:' + str(len(qlist)))
print('new alist len:' + str(len(alist)))

4. Text representation

from sklearn.feature_extraction.text import TfidfVectorizer

# Count the words in new_qlist, mainly to sanity-check TfidfVectorizer's vocabulary
word_set = set()
start = time.process_time()
for line in qlist:
    word_list = line.split(' ')
    for word in word_list:
        word_set.add(word)

print('qlist len :' + str(len(qlist)))
print('dict total num:' + str(len(word_set)))
elapsed = (time.process_time() - start)
print("Time used:", elapsed)


start = time.process_time()
vectorizer = TfidfVectorizer()       # define a tf-idf vectorizer
X = vectorizer.fit_transform(qlist)  # the resulting document-term matrix is stored in X
elapsed = (time.process_time() - start)
print("Time used:", elapsed)
start = time.process_time()
x_mat = X.toarray()
n = len(x_mat)
m = len(x_mat[0])
num = 0
print(n)
print(m)
elapsed = (time.process_time() - start)
print("Time used:", elapsed)


start = time.process_time()
for i in range(n):
    for j in range(m):
        if x_mat[i][j] != 0:
            num += 1
sparsity = num / (n*m)
print(sparsity)  # print the sparsity (here: the fraction of non-zero entries)
elapsed = (time.process_time() - start)
print("Time used:", elapsed)

5. For the user's input question, find the TOP 5 most similar questions and return their 5 candidate answers

from sklearn.metrics.pairwise import cosine_similarity
from queue import PriorityQueue as PQueue
 
def top5results(input_q):
    """
    Given the user's question input_q, return the most likely TOP 5 answers. Steps:
    1. Run input_q through the same preprocessing, then turn it into a tf-idf vector
       (reusing the vectorizer fitted above)
    2. Compute the similarity against every question in the corpus
    3. Return the answers of the 5 most similar questions
    """
    # preprocess the question
    pattern = re.compile('[{}]'.format(re.escape(string.punctuation)))
    sentence = pattern.sub("", input_q)
    sentence = sentence.lower()
    words = sentence.split()
    word_list = []
    for word in words:
        if word not in stop_words:
            word = "#number" if word.isdigit() else word
            word = stemmer.stem(word)
            word_list.append(word)

    # compute the similarities
    input_seg = ' '.join(word_list)
    input_vec = vectorizer.transform([input_seg])
    res = cosine_similarity(input_vec, X)[0]

    # get the top-5 indices
    pq = PQueue()
    for i, v in enumerate(res):
        pq.put((1.0 - v, i))

    top_idxs = []
    for i in range(5):
        top_idxs.append(pq.get()[1])

    print(top_idxs)    # top_idxs holds the indices (into qlist) of the most similar questions
                       # hint: a priority queue works here because it pops the smallest priority
                       # first, so pushing 1.0 - similarity makes the most similar questions
                       # come out first

    result = [alist[i] for i in top_idxs]
    return result  # the answers of the 5 most similar questions
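The priority-queue block inside top5results can be written more compactly with heapq, using the same 1.0 - similarity trick (a sketch; res is the similarity row computed above):

import heapq
top_idxs = [i for _, i in heapq.nsmallest(5, ((1.0 - v, i) for i, v in enumerate(res)))]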

Test and results

print (top5results("when did Beyonce start becoming popular?"))
print (top5results("what languge does the word of 'symbiosis' come from"))
[0, 60835, 39267, 23136, 693]
['in the late 1990s', 'mandolin-based guitar programs', 'Particularly since the 1950s, pro wrestling events have frequently been responsible for sellout crowds at large arenas', 'early DJs creating music in their own homes', 'Agnèz Deréon']
[7786, 8154, 27470, 41967, 7844]
['Greek', 'Persian and Sanskrit', '1570s', 'living together', 'the evolution of all eukaryotes']

6. Optimization with an inverted index

from functools import reduce

inverted_idx = {}  # a simple inverted index: word -> list of question indices
for i in range(len(qlist)):
    for word in qlist[i].split():
        if word in inverted_idx:
            inverted_idx[word].append(i)
        else:
            inverted_idx[word] = [i]

for key in inverted_idx:
    inverted_idx[key] = sorted(inverted_idx[key])


# intersection of two sets
def intersections(set1, set2):
    return set1.intersection(set2)
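A quick look at one posting list before wiring the index into the search (the leading 0 is the Beyoncé question that the earlier tests matched):

print(inverted_idx['beyonc'][:5])  # e.g. [0, ...]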

def top5results_invidx(input_q):
    """
    Given the user's question input_q, return the most likely TOP 5 answers. Steps:
    1. Use the inverted index to narrow down the candidates
    2. Run input_q through the same preprocessing, then turn it into a tf-idf vector
       (reusing the vectorizer fitted above)
    3. Compute the similarity against each candidate question
    4. Return the answers of the 5 most similar questions
    """
    # preprocess the input string
    pattern = re.compile('[{}]'.format(re.escape(string.punctuation)))
    sentence = pattern.sub("", input_q)
    sentence = sentence.lower()
    word_list = sentence.split()
    result_list = []
    for word in word_list:
        if word not in stop_words:
            word = "#number" if word.isdigit() else word
            word = stemmer.stem(word)
            result_list.append(word)

    print(result_list)

    # collect the posting lists of the query words as the candidate pool
    candidate_list = []
    for word in result_list:
        if word in inverted_idx:
            candidate_list.append(set(inverted_idx[word]))
    # candidate question indices: only questions containing every query word survive;
    # guard against an empty pool, where reduce() would raise a TypeError
    candidate_idx = list(reduce(intersections, candidate_list)) if candidate_list else []

    input_seg = ' '.join(result_list)
    input_vec = vectorizer.transform([input_seg])

    # compute the similarity for every candidate
    similarity_list = []
    for i in candidate_idx:
        similarity = cosine_similarity(input_vec, X[i])[0]
        similarity_list.append((i, similarity[0]))
    res_sorted = sorted(similarity_list, key=lambda k: k[1], reverse=True)

    # look up the top 5 answers by index
    answers = [alist[idx] for idx, score in res_sorted[:5]]

    return answers
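Note the strict AND semantics of reduce(intersections, ...): only questions that contain every query term survive, which is why the tests below return fewer than five answers. Reducing with set.union instead would produce a larger, though noisier, candidate pool.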

Test and results

print(top5results_invidx("when did Beyonce start becoming popular"))
print(top5results_invidx("what languge does the word of 'symbiosis' come from"))
['beyonc', 'start', 'becom', 'popular']

['in the late 1990s']
['langug', 'word', 'symbiosi', 'come']

['Greek']

7. Text representation with word vectors

def load_glove(path):
    # vocab maps word -> row index; row 0 is reserved as a zero vector for unknown words (UNK)
    vocab = {}
    embedding = []
    vocab["UNK"] = 0
    embedding.append([0.0] * 100)
    with open(path, 'r', encoding='utf8') as f:
        i = 1
        for line in f:
            row = line.strip().split()
            vocab[row[0]] = i
            embedding.append([float(x) for x in row[1:]])  # parse the vector entries as floats
            i += 1

    return vocab, embedding
    
    
# Turn a list of tokens into a sentence vector: the average of the word vectors
# (as described below), with out-of-vocabulary words falling back to the UNK (zero) vector
def word_to_vec(words, vocab, emb):
    vecs = [emb[vocab.get(word, 0)] for word in words]  # index 0 is the UNK row
    if not vecs:                                        # empty input: fall back to UNK alone
        vecs = [emb[0]]
    return np.mean(np.array(vecs, dtype=float), axis=0).reshape(1, -1)


path = './data/glove.6B/glove.6B.100d.txt'

vocabs, emb = load_glove(path)
# emb is a D*H matrix: D is the size of the vocabulary, H the embedding dimension. Each word
# gets its own vector; a sentence is represented in the simplest possible way, as the average
# of the vectors of the words appearing in it, with words missing from the vocabulary falling
# back to the UNK vector (this is what word_to_vec above implements).

def top5results_emb(input_q):
    """
    Given the user's question input_q, return the most likely TOP 5 answers. Steps:
    1. Use the inverted index to narrow down the candidates
    2. Turn input_q into a sentence vector
    3. Compute the similarity against each candidate question
    4. Return the answers of the 5 most similar questions
    """
    # preprocess the question
    pattern = re.compile('[{}]'.format(re.escape(string.punctuation)))
    sentence = pattern.sub("", input_q)
    sentence = sentence.lower()
    word_list = sentence.split()
    result_list = []
    for word in word_list:
        if word not in stop_words:
            word = "#number" if word.isdigit() else word
            word = stemmer.stem(word)
            result_list.append(word)
    # sentence vector of the input question
    input_q_vec = word_to_vec(result_list, vocabs, emb)

    # use the inverted index to narrow down the candidate question indices
    candidates = []
    for word in result_list:
        if word in inverted_idx:
            candidates.append(set(inverted_idx[word]))

    candidate_idx = list(reduce(intersections, candidates)) if candidates else []  # candidate indices

    # compute the similarity scores
    scores = []
    for i in candidate_idx:
        sentence = new_qlist[i].split()
        vec = word_to_vec(sentence, vocabs, emb)
        score = cosine_similarity(input_q_vec, vec)[0][0]
        scores.append((i, score))
    scores_sorted = sorted(scores, key=lambda k: k[1], reverse=True)

    # look up the top 5 answers by index
    answers = [alist[idx] for idx, score in scores_sorted[:5]]

    return answers
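One caveat: the stored questions are stemmed ('beyonc', 'becom', ...), while glove.6B contains surface forms, so many stems fall back to the UNK zero vector and contribute nothing to the sentence average. Lemmatizing instead of stemming, or looking up the unstemmed tokens, would likely improve coverage.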

Test and results

print(top5results_emb("when did Beyonce start becoming popular"))
print(top5results_emb("what languge does the word of 'symbiosis' come from"))
print(top5results_emb("In her music, what are some?"))
['in the late 1990s']
['Greek']
['Philadelphia soul', 'love, relationships, and monogamy', 'The Fighting Temptations', "The UK's largest pipe organ", 'the foreign genres of hip hop and reggae']
