"""Error-correction module: pinyin-based recovery of misrecognised names."""
import jieba
from pypinyin import pinyin, lazy_pinyin

from common.basicInfo import BasicInfo


class ErrorRecovery(object):
    """Correct a sentence by matching the pinyin of its word 2-grams.

    The sentence is segmented with jieba, adjacent tokens are joined into
    2-grams, and the pinyin of every 2-gram is compared against the values of
    ``dataServer.dict_gupiao_pin``. On a match, the matching part of the
    sentence is replaced by ``entity_list[0]`` and the dictionary key is
    returned together with the rewritten token list.

    NOTE(review): ``dict_gupiao_pin`` presumably maps a stock identifier to
    the pinyin syllable list of its name -- confirm against the data server.
    """

    def __init__(self, dataSever):
        """Store the data server and cache its symbol and entity lists.

        Args:
            dataSever: object exposing ``symbol_list``, ``entity_list``,
                ``dict_gupiao_pin`` and ``new_gupiao``.
        """
        self.dataServer = dataSever
        self.symbol_list = self.dataServer.symbol_list
        self.entity_list = self.dataServer.entity_list

    # =============== The input sentence must include punctuation ===========
    def recoveryMain(self, sentence, entity_list=None):
        """Try to recover a known name from ``sentence``.

        Matching priority:
          1. exact pinyin match (returns immediately);
          2. fuzzy match against keys with at least 4 syllables;
          3. fuzzy match against keys with exactly 3 syllables.

        Args:
            sentence: raw user sentence.
            entity_list: unused; kept for backward compatibility.

        Returns:
            ``[key, word_list]`` as built by :meth:`restoreQue`, or ``None``
            when nothing matches.
        """
        word_list, word_gram_list = self.gram2Main(sentence)
        user_chines_list = list(word_gram_list)
        user_pin_list = [lazy_pinyin(gram) for gram in word_gram_list]

        # The gram index of each pinyin list is its first occurrence, exactly
        # as the original ``list.index`` lookup. Computed once here instead of
        # once per dictionary entry.
        indexed_pins = [(user_pin_list.index(pin), pin) for pin in user_pin_list]

        mohu_list = []   # fuzzy candidates for keys with >= 4 syllables
        mohu_list2 = []  # fuzzy candidates for keys with exactly 3 syllables
        for k, v in self.dataServer.dict_gupiao_pin.items():
            for indexs, user_pin in indexed_pins:
                if v == user_pin:
                    # Exact pinyin match: return straight away when enough
                    # characters are shared as well.
                    if self.common_Chinese(k, indexs, user_chines_list) >= 2:
                        accurate_list = [[k, user_pin]]
                        self.sensitiveWord(word_list)
                        return self.restoreQue(None, accurate_list, user_pin_list,
                                               word_gram_list, word_list)
                # Fuzzy match: must not return here because an exact match
                # found later has the highest priority.
                elif len(v) >= 4 and self.unionlen(user_pin, v) >= 3:
                    # 4-syllable keywords take priority over 3-syllable ones.
                    if self.common_Chinese(k, indexs, user_chines_list) >= 2:
                        mohu_list.append([k, user_pin])
                elif len(v) == 3 and self.unionlen(user_pin, v) >= 2:
                    if self.common_Chinese(k, indexs, user_chines_list) >= 1:
                        mohu_list2.append([k, user_pin])

        if mohu_list:
            self.sensitiveWord(word_list)
            return self.restoreQue(None, mohu_list, user_pin_list, word_gram_list, word_list)
        if mohu_list2:
            self.sensitiveWord(word_list)
            return self.restoreQue(None, mohu_list2, user_pin_list, word_gram_list, word_list)
        return None

    def common_Chinese(self, k, indexs, user_chines_list):
        """Count characters of the reference word that occur in the user gram.

        Args:
            k: key looked up in ``dataServer.new_gupiao`` via
                ``BasicInfo.get_value``.
            indexs: index of the user gram in ``user_chines_list``.
            user_chines_list: the user's 2-grams.

        Returns:
            Number of characters of the reference word found in the gram.
        """
        user_word = user_chines_list[indexs]
        orginal_word = BasicInfo.get_value(self.dataServer.new_gupiao, k)
        return sum(1 for ch in orginal_word if ch in user_word)

    def sensitiveWord(self, word_list):
        """Restore the first ``'englishA'`` placeholder to ``'好股'`` in place.

        This undoes the substitution made by :meth:`gram2Main`.
        """
        if 'englishA' in word_list:
            word_list[word_list.index('englishA')] = "好股"

    def restoreQue(self, id=None, accu_list=None, user_pin_list=None, word_gram_list=None, word_list=None):
        """Rebuild the question after a match has been found.

        The first candidate of ``accu_list`` is used. The first jieba token of
        its gram is replaced in ``word_list`` by ``entity_list[0]``, and the
        token following it (the second half of the gram) is dropped. Tokens
        containing a symbol from ``symbol_list`` are then removed.

        Args:
            id: reserved; every caller in this class passes ``None``.
            accu_list: list of ``[key, user_pin]`` candidates.
            user_pin_list: pinyin lists of all user grams.
            word_gram_list: the user grams, parallel to ``user_pin_list``.
            word_list: segmented sentence; modified in place.

        Returns:
            ``[key, word_list]``.
        """
        # Always use the first candidate. Previously ``y`` was only bound when
        # ``id`` was None, so any other value raised NameError.
        y = 0
        if id is not None:
            # TODO: selecting a candidate by ``id`` is not implemented.
            print("=======")

        k = accu_list[y][0]
        user_pin = accu_list[y][1]
        index = user_pin_list.index(user_pin)
        shortence = word_gram_list[index]
        short_list = list(jieba.cut(shortence, cut_all=False, HMM=True))

        ret = [k]
        if short_list and short_list[0] in word_list:
            # Work from the position that was actually replaced. Re-searching
            # for the entity could hit an earlier occurrence, and the replaced
            # token may be the last one.
            pos = word_list.index(short_list[0])
            word_list[pos] = self.entity_list[0]
            if pos + 1 < len(word_list):
                del word_list[pos + 1]
        else:
            print("========[log]errorRecovery.py====纠错error===========")

        # Filter in one pass with slice assignment so callers still see the
        # same list object. Removing items while iterating skipped elements
        # and could raise ValueError.
        word_list[:] = [
            word for word in word_list
            if not any(symbol in word or '\\ue' in word for symbol in self.symbol_list)
        ]
        ret.append(word_list)
        return ret

    def gram2Main(self, sentence):
        """Segment ``sentence`` and build its token 2-grams.

        All whitespace is stripped first. The first ``'好股'`` token is
        replaced by the placeholder ``'englishA'`` before the grams are built.

        Returns:
            ``(word_list, word_gram_list)`` as returned by :meth:`test2gram`.
        """
        sentence = ''.join(sentence.split())
        word_list = list(jieba.cut(sentence, cut_all=False, HMM=True))
        if '好股' in word_list:
            word_list[word_list.index('好股')] = "englishA"
        return self.test2gram(word_list)

    def test2gram(self, list2=None):
        """Return ``(list2, grams)`` where each gram joins two adjacent tokens.

        ``None`` is treated as an empty token list.
        """
        if list2 is None:
            list2 = []
        word_gram_list = [first + second for first, second in zip(list2, list2[1:])]
        return list2, word_gram_list

    def test3gram(self, list2=None):
        """Return ``(list2, grams)`` where each gram joins three adjacent tokens.

        ``None`` is treated as an empty token list.

        TODO: enable 3-grams as an optimisation -- try an exact 3-gram match
        first and fall back to fuzzy 2-gram matching when nothing is found.
        """
        if list2 is None:
            list2 = []
        word_gram3_list = [
            a + b + c for a, b, c in zip(list2, list2[1:], list2[2:])
        ]
        return list2, word_gram3_list

    # NOTE: the former findok() helper was retired; its thresholds are inlined
    # in recoveryMain (>= 4 syllables with >= 3 positional matches, or exactly
    # 3 syllables with >= 2 positional matches).

    def unionlen2(self, alist, blist):
        """Deprecated: count leading items of ``alist`` that occur in ``blist``.

        Only the first ``min(len(alist), len(blist))`` items of ``alist`` are
        examined; membership in ``blist`` is position-independent.
        """
        return sum(1 for item in alist[:len(blist)] if item in blist)

    def unionlen(self, alist, blist):
        """Count the positions at which ``alist`` and ``blist`` hold equal items.

        The comparison stops at the end of the shorter sequence.
        """
        return sum(1 for left, right in zip(alist, blist) if left == right)


# Example usage (kept for reference):
# if __name__ == '__main__':
#     dataServer = DataServer()
#     erRecovery = ErrorRecovery(dataServer)
#     entity_list = ['gegu', 'bankuai']
#     words = erRecovery.recoveryMain("有什么好股可?")
#     print(words)
#     # Other sample inputs:
#     #   "大脸有一,这只股票怎么样?"  "浙江现xian,怎么,样?"  "航茶集怎么样呢"  "爱第二怎么样"
#     # words = erRecovery.recoveryMain("大脸有一,这只股票,怎么样?")
#     # print(words)