import numpy as np
import torch
import re
import jieba
import pandas as pd
from tqdm import tqdm
import os.path
from os import listdir
from IPython.display import display, Image
from itertools import permutations
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', 100)
def find_file(key_word, search_dir=os.getcwd()):
    """Return the first file under `search_dir` whose name contains `key_word`."""
    matches = [os.path.join(search_dir, f) for f in listdir(search_dir)
               if os.path.isfile(os.path.join(search_dir, f)) and key_word in f]
    if not matches:
        raise FileNotFoundError(f'no file matching {key_word!r} in {search_dir}')
    return matches[0]
corpus = find_file("南方网 3.csv")
stop_word_path = find_file("chinese_symbols.txt","/Users/manmanzhang/Library/Mobile Documents/com~apple~CloudDocs/MyProject/InferenceSystem/src/I5_algorithm/NLP数据集合/停词库/")
stop_word_path
def del_element(strings, symbols, replace_token=" @REPLACETHESYMBOL@ "):
    """Replace every occurrence of the characters in `symbols` with a placeholder token."""
    srcrep = {s: replace_token for s in symbols}
    rep = dict((re.escape(k), v) for k, v in srcrep.items())
    pattern = re.compile("|".join(rep.keys()))
    return pattern.sub(lambda m: rep[re.escape(m.group(0))], strings)
def replace_number(strings,NUMBERTOKEN = ' @NUMBERTOKEN@ '):
return re.sub(r'\d+',NUMBERTOKEN,strings)
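# A minimal sanity check for the two cleaners above (illustrative inputs; the
# expected strings assume the default placeholder tokens defined above):
assert del_element("a,b", ",") == "a @REPLACETHESYMBOL@ b"
assert replace_number("第12条") == "第 @NUMBERTOKEN@ 条"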
# Read the stop-symbol file into one string; membership tests below are
# substring tests, which is fine because the file holds single symbols.
with open(stop_word_path, 'r') as f:
    stop_words = ''.join(f.read().split('\n') + ['\n'])
def filter_stop_word(paper, stop_words):
    """Segment `paper` with jieba and drop tokens that occur in the stop-symbol string."""
    return np.array(list(filter(lambda x: x not in stop_words, jieba.cut(del_element(paper, '\n')))))
def read_txt(corpus):
    return np.array([re.sub('\n', '', str(word)) for word in tqdm(pd.read_csv(corpus).text, desc='loading articles')])
def just_chinese(strings):
    """Keep only the Chinese characters in `strings`; return '\n' if none remain."""
    expr = ''.join(re.findall(r"[\u4E00-\u9FA5]+", strings))
    return expr if expr else '\n'
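# Illustrative check of the filter above:
assert just_chinese("abc中文123") == "中文"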
def split_word(original, stop_words=stop_words, use_just_chinese=False):
    """Clean, segment, and stop-word-filter each article in `original`."""
    result = []
    for paper in tqdm(original, desc='segmenting articles'):
        if use_just_chinese:
            paper = just_chinese(paper)         # keep Chinese characters only
        paper = del_element(paper, stop_words)  # mask stop symbols
        paper = replace_number(paper)           # mask digit runs
        result.append(filter_stop_word(paper, stop_words))
    return np.array(result, dtype=object)
def sort_dict(dict_items):
    """Return a copy of `dict_items` sorted by key in descending order."""
    return dict(sorted(dict_items.items(), key=lambda x: x[0], reverse=True))
def word_encoding(all_words):
    """Build code books and (negative-log) unigram probabilities from the flat word list."""
    unique, index_, counter = np.unique(all_words, return_index=True, return_counts=True)
    encode_dict = dict(zip(unique, index_))  # word -> first-occurrence index (the code)
    decode_dict = dict(zip(index_, unique))  # code -> word
    prob_of_word_dict = dict(zip(unique, counter / all_words.size))
    prob_of_word_dict_to_log = dict(zip(unique, -np.log(counter / all_words.size)))
    return decode_dict, encode_dict, prob_of_word_dict, prob_of_word_dict_to_log
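# A minimal round-trip sketch for the code books (toy input only; real use goes
# through data_preprocessing_to_tensor below):
_dec, _enc, _p, _lp = word_encoding(np.array(['a', 'b', 'a']))
assert _dec[_enc['a']] == 'a'
assert abs(_p['a'] - 2 / 3) < 1e-9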
def encode_papers(split_paper, encode_dict):
    return [torch.tensor([encode_dict[word] for word in paper]) for paper in tqdm(split_paper, desc='encoding word lists')]
def to_numpy(data):
if isinstance(data,np.ndarray):
return data
else:
return np.array(data)
def INIT_Ngram(word_list, n):
    """Pair each word with the window of (up to) n words that ends at it."""
    m = word_list.size
    end = [word_list[max(0, i - n):i] for i in tqdm(range(1, m + 1), desc='initializing n-grams')]
    return list(zip(word_list, end))
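# Illustrative check (toy input): the last word is paired with the window
# of n words that ends at it.
_pairs = INIT_Ngram(np.array(['a', 'b', 'c']), 2)
assert _pairs[-1][0] == 'c' and list(_pairs[-1][1]) == ['b', 'c']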
'''Data preprocessing functions'''
def data_preprocessing_to_tensor(corpus, stop_words=stop_words, use_just_chinese=False):
    read_original = read_txt(corpus)
    split_paper = split_word(read_original, stop_words, use_just_chinese=use_just_chinese)
    all_words = np.array([j for i in tqdm(split_paper, desc='flattening word lists') for j in i])
    # Flatten all articles into one sequence, with a NEWLINE sentinel between articles.
    one_row = [word for paper in tqdm(split_paper, desc='flattening and padding articles') for word in np.append(paper, 'NEWLINE')] + ['NEWLINE']
    one_row.insert(0, 'NEWLINE')
    papers_to_one_dim = np.array(one_row)
    decode_dict, encode_dict, prob_of_word_dict, prob_of_word_dict_to_log = word_encoding(all_words)
    code_paper = encode_papers(split_paper, encode_dict)
    code_tensor = torch.unique(torch.tensor(list(decode_dict)))
    return decode_dict, encode_dict, prob_of_word_dict, prob_of_word_dict_to_log, all_words, papers_to_one_dim, read_original, split_paper, code_tensor, code_paper
'''TF-IDF torch'''
def TF_Tensor(tensor_paper, code_tensor):
    """Raw term-frequency vector of one encoded document over the full vocabulary."""
    unique, counts = torch.unique(tensor_paper, return_counts=True)
    init_TF = torch.zeros(code_tensor.size())
    for e, c in zip(unique, counts):
        if e in code_tensor:
            index_ = np.where(code_tensor == e)[0][0]
            init_TF[index_] = c
    return init_TF
def IDF_Tensor(tensor_papers, code_tensor):
    """Smoothed inverse document frequency: log(N / (document_frequency + 1))."""
    N = len(tensor_papers)
    NW = torch.zeros(code_tensor.size())
    for step, word_code in enumerate(tqdm(code_tensor, desc='IDF vocabulary')):
        NW[step] = sum((word_code in paper_code) for paper_code in tensor_papers)
    return torch.log(N / (NW + 1))
def TFIDFTensor(tensor_papers, code_tensor):
    IDF = IDF_Tensor(tensor_papers, code_tensor)
    m = len(tensor_papers)
    n = int(code_tensor.size()[0])
    TF = torch.zeros(m, n)  # one term-frequency row per document
    for step, paper in enumerate(tqdm(tensor_papers, desc='TF matrix')):
        TF[step] = TF_Tensor(paper, code_tensor)
    return TF * IDF
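# Toy check of the TF-IDF pipeline above (hypothetical 2-document corpus over
# the vocabulary codes {0, 5, 9}). Every word here has document frequency 1,
# so the smoothed IDF log(2/2) is zero everywhere; the shape is what matters:
_vocab = torch.tensor([0, 5, 9])
_docs = [torch.tensor([0, 0, 5]), torch.tensor([9])]
print(TFIDFTensor(_docs, _vocab).shape)  # torch.Size([2, 3])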
decode_dict,encode_dict,prob_of_word_dict,prob_of_word_dict_to_log,all_words,papers_to_one_dim,read_original,split_paper,code_tensor,code_paper = data_preprocessing_to_tensor(corpus)
def token(paper):
return np.array(list(jieba.cut(paper)))
def Ngram(word_list, n):
    """Return (context, next_word) pairs; the context is the (up to) n-1 preceding words."""
    sentence = to_numpy(word_list)
    m = sentence.size
    end = [sentence[max(0, i - n):i] for i in range(1, m + 1)]
    return [(e[:-1], e[-1]) for e in end]
raw_text = token('特朗普当局似乎有点着急,这是一个好事情')  # "The Trump administration seems a bit anxious; that's a good thing"
Ngram(raw_text[:10], 4), raw_text[:3]
def following_words(word, n=2, word_docker=split_paper):
    """Collect every length-n window that starts at `word` across the segmented corpus."""
    if word in list(stop_words):
        word = "REPLACETHESYMBOL"  # stop symbols were masked with this token upstream
    result = []
    for start, paper in enumerate(word_docker):
        if word in paper:
            ends = np.ravel(np.argwhere(paper == word))
            for end in ends:
                if end + n < paper.size:
                    result.append(word_docker[start][end:end + n])
                else:
                    # near the end of an article: pad the window with the word itself
                    start_arr = word_docker[start][end:end + 1]
                    end_arr = [word] * (n - 1)
                    result.append(np.append(start_arr, end_arr))
    return np.array(result)
following_words("特朗普",3)
def join_prob(iterable, n, word_docker=split_paper):
    """Negative log conditional frequency of the n-gram `iterable[:n]` in the corpus."""
    start = iterable[0]
    m = len(iterable)
    if n > m:
        n = m
    follow = following_words(start, n, word_docker)
    if follow.size == 0:
        return -np.log(10 ** (-8))  # unseen context: large penalty on the -log scale
    check = (follow == np.array(iterable[:n])).all(axis=1)
    counter = check.sum()
    if counter == 0:
        return -np.log(10 ** (-8))  # unseen n-gram: large penalty, consistent with -log scale
    return -np.log(counter / check.size)
def check_prob(word, code_dict=prob_of_word_dict_to_log):
    # Out-of-vocabulary words get a large -log penalty rather than a free pass.
    return code_dict.get(word, -np.log(10 ** (-8)))
join_prob(('特朗普', '进行', '对话'), 3)  # ("Trump", "hold", "dialogue")
def NGramLanguageModel(raw_text, n):
    """Score `raw_text` with corpus n-gram counts; returns {total -log probability: word list}."""
    prob, result = 0.0, []
    for start, end in Ngram(raw_text, n):
        if start.size == 0:
            prob += check_prob(end)  # no context yet: back off to the unigram probability
        else:
            arr = np.append(start, end)
            prob += join_prob(arr, arr.size)
        result.append(end)
    return {prob: result}
raw_text = token('特朗普当局似乎有点着急')  # "The Trump administration seems a bit anxious"
NGramLanguageModel(raw_text, 2)
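# A corpus-dependent sketch: scoring the same sentence with different context
# sizes n; the totals are summed -log scores, so larger means less probable:
for _n in (2, 3):
    print(_n, NGramLanguageModel(raw_text, _n))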