Language Model Smoothing Optimization <add one smoothing> & <add k smoothing> & <interpolation>

Data loading and preprocessing
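
The original post never shows its imports; the snippets below appear to assume at least the following (a minimal sketch, inferred from the code):

import os
import re
from os import listdir

import numpy as np
import pandas as pd
import torch
import jieba
from tqdm import tqdm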

def find_file(key_word, dir=os.getcwd()):
    # Return the first file under dir whose full path contains key_word
    file_paths = [os.path.join(dir, f) for f in listdir(dir)
                  if os.path.isfile(os.path.join(dir, f)) and key_word in os.path.join(dir, f)]
    return file_paths[0]

corpus = find_file("南方网 3.csv")
stop_word_path = find_file("chinese_symbols.txt","{}{}".format(os.getcwd().replace('NLP',""),'NLP数据集合/停词库/'))

# Temporarily mask characters in a string: replace every character in `symbols`
# with a placeholder token so they can be filtered out after tokenization
def del_element(strings, symbols, Replace_the_symbol=" @REPLACETHESYMBOL@ "):
    srcrep = {i: Replace_the_symbol for i in symbols}
    rep = dict((re.escape(k), v) for k, v in srcrep.items())
    pattern = re.compile("|".join(rep.keys()))
    return pattern.sub(lambda m: rep[re.escape(m.group(0))], strings)

# Replace every run of digits with a NUMBER token
def replace_number(strings, NUMBERTOKEN=' @NUMBERTOKEN@ '):
    return re.sub(r'\d+', NUMBERTOKEN, strings)
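
A quick illustrative check of the two helpers above (the sample string and symbol set are made up):

sample = "2020年,南方网发布了3篇报道。"
masked = del_element(sample, ',。')   # punctuation -> ' @REPLACETHESYMBOL@ '
print(replace_number(masked))          # digit runs  -> ' @NUMBERTOKEN@ '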

# Load the stop-word (symbol) list as a single string
stop_words = ''.join(open(stop_word_path, 'r', encoding='utf-8').read().split('\n') + ['\n'])

# Filter out stop words after jieba tokenization
def filter_stop_word(paper,stop_words):
    return np.array(list(filter(lambda x: x not in stop_words,jieba.cut(del_element(paper,'\n')))))

# Read the local news corpus (one article per row of the `text` column)
def read_txt(corpus):
    return np.array([re.sub('\n','',str(word)) for word in tqdm(pd.read_csv(corpus).text,desc='加载文章')])

# Keep Chinese characters only
def just_chinese(strings):
    regStr = ".*?([\u4E00-\u9FA5]+).*?"
    expr = ''.join(re.findall(regStr, strings))
    if expr:
        return expr
    return '\n'
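
For example, non-Chinese characters are simply dropped (toy input):

just_chinese("abc中文123,def测试")   # -> '中文测试'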

# Tokenize each article and remove stop words
def split_word(original, temp_del=stop_words, only_chinese=False):
    result = []
    for paper in tqdm(original, desc='分词文章'):
        if only_chinese:
            paper = just_chinese(paper)            # keep Chinese characters only
        paper_1 = del_element(paper, temp_del)     # mask stop symbols
        paper_2 = replace_number(paper_1)          # mask digit runs
        temp_split_words = filter_stop_word(paper_2, temp_del)
        result.append(temp_split_words)
    return np.array(result, dtype=object)          # ragged: one token array per article

# Sort a dictionary by key (descending) and rebuild it
def sort_dict(dict_items):
    sorted_tuple = np.array(sorted(dict_items.items(), key=lambda x: x[0], reverse=True))
    return dict(zip(sorted_tuple[:,0],sorted_tuple[:,1]))

# Word encoder: build encode/decode maps and unigram probabilities
def word_encoding(all_words):
    # np.unique returns (unique values, first-occurrence indices, counts) in that order
    unique, index_, counter = np.unique(all_words, return_counts=True, return_index=True)
    encode_dict = dict(zip(unique, index_))    # word -> integer code
    decode_dict = dict(zip(index_, unique))    # code -> word
    prob_of_word_dict = dict(zip(unique, counter / all_words.size))                  # unigram probability
    prob_of_word_dict_to_log = dict(zip(unique, -np.log(counter / all_words.size)))  # negative log probability
    return decode_dict, encode_dict, prob_of_word_dict, prob_of_word_dict_to_log
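
A toy illustration of the four return values (illustrative only):

toy = np.array(['a', 'b', 'a'])
decode_d, encode_d, p, neg_log_p = word_encoding(toy)
# encode_d -> {'a': 0, 'b': 1}        (code = index of the word's first occurrence)
# decode_d -> {0: 'a', 1: 'b'}
# p        -> {'a': 2/3, 'b': 1/3}    (unigram relative frequency)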

# Encode articles: convert each token list into a tensor of integer word codes
def encode_papers(split_paper,encode_dict):
    return [torch.tensor([encode_dict[word] for word in paper]) for paper in tqdm(split_paper,desc='词列表转编码')]

def to_numpy(data):
    if isinstance(data,np.ndarray):
        return data
    else:
        return np.array(data)

def INIT_Ngram(word_list, n):
    # For every position i, collect the last n tokens of the prefix word_list[:i]
    # (quadratic in corpus size, hence very slow on the full corpus; see the note below)
    m = word_list.size
    end = np.array([np.array([word_list[j] for j in range(i)][-n:]) for i in tqdm(range(m+1), desc='Ngram初始化')][1:], dtype=object)
    return list(zip(word_list, end))

'''Data preprocessing: read, tokenize, encode, and convert the corpus to tensors'''
def data_preprocessing_to_tensor(corpus, stop_words=stop_words, only_chinese=False):
    # Read the raw articles
    read_original = read_txt(corpus)
    # Tokenize each article
    split_paper = split_word(read_original, stop_words, only_chinese=only_chinese)
    # Flatten all tokens into one dimension
    all_words = np.array([j for i in tqdm(split_paper,desc='词列表降维') for j in i])
    # Concatenate all articles into one sequence, padded with NEWLINE boundary tokens
    one_row = [word for paper in tqdm(split_paper,desc="文章列表降维补码") for word in np.append(paper,'NEWLINE')]+['NEWLINE']
    one_row.insert(0,'NEWLINE')
    papers_to_one_dim = np.array(one_row)
    # Encode words
    decode_dict,encode_dict,prob_of_word_dict,prob_of_word_dict_to_log = word_encoding(all_words)
    # Convert each article to a tensor of word codes
    code_paper = encode_papers(split_paper,encode_dict)
    # Vocabulary of word codes as a tensor
    code_tensor = torch.unique(torch.tensor(list(decode_dict)))

    return decode_dict,encode_dict,prob_of_word_dict,prob_of_word_dict_to_log,all_words,papers_to_one_dim,read_original,split_paper,code_tensor,code_paper


'''TF-IDF with torch'''
def TF_Tensor(tensor_paper,code_tensor):
    # Term-frequency vector for one article, aligned with the vocabulary in code_tensor
    unique,counts = torch.unique(tensor_paper,return_counts=True)
    init_TF = torch.zeros(code_tensor.size())
    for e,c in zip(unique,counts):
        if e in code_tensor:
            index_ = np.where(code_tensor == e)[0][0]
            init_TF[index_] = c
    return init_TF

def IDF_Tensor(tensor_papers,code_tensor):
    # Inverse document frequency: idf(w) = log(N / (df(w) + 1))
    N = len(tensor_papers)
    NW = torch.zeros(code_tensor.size())
    step = -1
    for word_code in tqdm(code_tensor,desc="IDF词汇"):
        step += 1
        counter = sum((word_code in paper_code for paper_code in tensor_papers))
        NW[step] = counter
    return torch.log(N/(NW+1))

def TFIDFTensor(tensor_papers,code_tensor):
    # TF-IDF matrix: one row per article, one column per vocabulary word
    IDF = IDF_Tensor(tensor_papers,code_tensor)
    m = len(tensor_papers)
    n = int(code_tensor.size()[0])
    DF = torch.zeros(m,n)
    step = -1
    for paper in tqdm(tensor_papers,desc="TF矩阵"):
        step += 1
        DF[step] = TF_Tensor(paper,code_tensor)
    return DF*IDF

# Static setup: run the preprocessing pipeline once and keep the results as globals
decode_dict,encode_dict,prob_of_word_dict,prob_of_word_dict_to_log,all_words,papers_to_one_dim,read_original,split_paper,code_tensor,code_paper = data_preprocessing_to_tensor(corpus)

# Initialize the n-gram table (extremely slow and memory-hungry)
#init_Ngram = INIT_Ngram(papers_to_one_dim,2)
# Build the TF-IDF matrix with torch
#TFIDF = TFIDFTensor(code_paper,code_tensor)

Language model and smoothing

# Tokenize a raw sentence with jieba
def token(paper):
    return np.array(list(jieba.cut(paper)))

def Ngram(word_list, n):
    # For each position i, return the last n tokens of the prefix ending at i
    # (the first windows are therefore shorter than n)
    sentence = to_numpy(word_list)
    m = sentence.size
    end = np.array([np.array([sentence[j] for j in range(i)][-n:]) for i in range(m+1)][1:], dtype=object)
    return end
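
As a quick sanity check, for a toy token list the function yields progressively longer windows capped at n (illustrative only):

Ngram(['I', 'love', 'NLP'], 2)
# roughly: [['I'], ['I', 'love'], ['love', 'NLP']]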

# For each occurrence of `word` in the corpus, return the n-token window ending at it
def forward_check(word,n=2,word_docker=split_paper):
    start = -1
    result = []
    if word in stop_words:
        word = "REPLACETHESYMBOL"
    for paper in word_docker:
        start += 1
        if word in paper:
            ends = np.ravel(np.argwhere(paper==word))
            for end in ends:
                end_first = end-(n-1)
                if end_first < 0:
                    # too close to the start of the article for a full n-token window; skip
                    continue
                result.append(word_docker[start][end_first:end+1])
    return np.array(list(filter(lambda x : len(x) != 0 ,result)))

# For each occurrence of `word` in the corpus, return the n-token window starting at it
def backward_check(word,n=2,word_docker=split_paper):
    if word in list(stop_words):
        word = "REPLACETHESYMBOL"
    start = -1
    result = []
    for paper in word_docker:
        start += 1
        if word in paper:
            ends = np.ravel(np.argwhere(paper==word))
            for end in ends:
                if end+n <= paper.size:
                    result.append(word_docker[start][end:end+n])
                else:
                    # near the end of the article: pad the window with the word itself
                    start_arr = word_docker[start][end:end+1]
                    end_arr = [word]*(n-1)
                    result.append(np.append(start_arr,end_arr))
    return np.array(list(filter(lambda x : len(x) != 0 ,result)))

# Look up the unigram probability of a word (0 if unseen)
def to_dictionary(word,word_dict=prob_of_word_dict):
    return word_dict.get(word, 0)

# Smoothing components (the caller below passes probabilities rather than raw counts)
def Add_one_Smoothing(Wi,Wireduce1,V=None,word_dict=prob_of_word_dict):
    # add-one (Laplace): add 1 to the numerator and V (vocabulary size) to the denominator
    return (Wireduce1+1)/(Wi+V)

def Add_K_Smoothing(Wi,Wireduce1,k,V,word_dict=prob_of_word_dict):
    # add-k: add k to the numerator and k*V to the denominator
    return (Wireduce1+k)/(Wi+k*V)
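
For reference, the textbook form of these estimators is defined on counts; a minimal sketch with made-up toy numbers (bigram_count, prev_count, V and k below are hypothetical):

# P_add1(w_i | w_{i-1}) = (count(w_{i-1}, w_i) + 1) / (count(w_{i-1}) + V)
# P_addk(w_i | w_{i-1}) = (count(w_{i-1}, w_i) + k) / (count(w_{i-1}) + k * V)
bigram_count, prev_count, V, k = 3, 10, 5000, 0.5
p_add_one = (bigram_count + 1) / (prev_count + V)       # 4 / 5010
p_add_k   = (bigram_count + k) / (prev_count + k * V)   # 3.5 / 2510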


def interpolation(raw_text):
    # Average, over orders i = 1..len(raw_text), the relative frequency of the
    # last-i-token window among windows ending in the same word, each plus a 1/n term
    raw_text = to_numpy(raw_text)
    n = raw_text.size
    end = raw_text[-1]
    prob = [1/n + to_dictionary(end)]
    for i in range(2, raw_text.size+1):
        find = forward_check(end, i)
        w = raw_text[n-i:]
        if len(find) == 0:
            # the word never appears with an i-token context; fall back to the 1/n term
            prob.append(1/n)
            continue
        check = (find == w).sum(axis=1) == i
        counter = check.sum()
        prob.append(1/n + counter/check.size)
    return np.mean(prob)
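
The classical formulation of interpolation is a weighted sum of estimates from different n-gram orders rather than a flat average; a minimal sketch for a trigram model (the function name and lambda weights are made up and would normally be tuned on held-out data):

def interpolated_prob(p_tri, p_bi, p_uni, lambdas=(0.5, 0.3, 0.2)):
    # P(w_i | w_{i-2}, w_{i-1}) ≈ l1*P_tri + l2*P_bi + l3*P_uni, with l1 + l2 + l3 = 1
    l1, l2, l3 = lambdas
    return l1 * p_tri + l2 * p_bi + l3 * p_uni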

# Probability optimizer interface: dispatch to the selected smoothing method
def prob_optimize(raw_text,k=2,optimize='min'):
    raw_text = to_numpy(raw_text)
    n = raw_text.size
    AB = raw_text
    B = raw_text[-1]
    search_backward = backward_check(B,n,split_paper)
    if len(search_backward)==0:
        print('mini')
        return 10**(-8)
    check_backward = search_backward==AB

    backward_bool = check_backward.all(axis=1)
    counter = backward_bool.sum()
    PAB = counter/backward_bool.size
    PB = to_dictionary(B)
    V = papers_to_one_dim[papers_to_one_dim!='NEWLINE'].size
    if optimize == 'min':
        print('min')
        if PAB*PB == 0:
            return 10**(-8)
        return PAB/PB
    elif optimize == 'add one smoothing':
        print('add one smoothing')
        return Add_one_Smoothing(PB,PAB,V)
    elif optimize == 'add k smoothing':
        print('add k smoothing')
        return Add_K_Smoothing(PB,PAB,k,V)
    elif optimize == 'interpolation':
        print('interpolation')
        return interpolation(raw_text)
    else:
        return PAB/PB

# Probability interface
def get_prob(iterable,n,k=2,optimize='min'):
    start = iterable[0]
    m = len(iterable)
    if n == 1:
        return to_dictionary(start)
    else:
        if n > m:
            n = m-1
        return prob_optimize(iterable,k,optimize=optimize)

# Language model

def LanguageModel(raw_text,n=2,k=2,word_docker=split_paper,word_dict=prob_of_word_dict,V = papers_to_one_dim[papers_to_one_dim!='NEWLINE'].size,optimize_key ='add k smoothing'):
    expr , text , prob_list = 0 ,[] ,[]
    for gram in Ngram(raw_text,n):
        prob = get_prob(gram,gram.size,k,optimize_key)
        # accumulate the negative log probability (the 1e-8 "not found" floor is added as-is)
        expr += -np.log(prob) if prob != 10**(-8) else 10**(-8)
        prob_list.append((gram,prob,-np.log(prob)))
        text.append(gram[-1])
    return {expr:text},prob_list
   
raw_text = token('特朗普当局似乎有点着急这是好事儿')

LanguageModel(raw_text,n=2,optimize_key='interpolation')

