# 智能纠错(N-gram、编辑距离、转化拼音)

from basicInfo import BasicInfo
import jieba
from pypinyin import pinyin, lazy_pinyin
from dataServer import DataServer




class ErrorRecovery(object):
    """Replace a misrecognised stock name in a question with its stock code.

    The sentence is segmented with jieba, adjacent tokens are joined into
    2-grams, every 2-gram is converted to pinyin, and the pinyin is compared
    (exactly, then fuzzily) against ``dict_gupiao_pin`` of the injected data
    server.  On a hit the two tokens forming the 2-gram are replaced by the
    matching dictionary key and punctuation tokens are dropped.

    NOTE: input sentences are expected to include their punctuation; it is
    removed only at the very end, in ``restoreQue``.
    """

    # Characters that mark a token as punctuation/noise.  Both the ASCII and
    # the full-width forms are listed because the input is Chinese text.
    # Full-width forms are written as escapes so they cannot be silently
    # normalised to ASCII by an editor or a copy/paste.
    _SYMBOL_CHARS = frozenset(
        ",。、.?;:=+-@#()!*%"
        "\uff0c\uff1f\uff1b\uff1a\uff08\uff09\uff01"
    )

    def __init__(self, dataSer):
        """Store the collaborators used for matching.

        Args:
            dataSer: object exposing ``dict_gupiao_pin``, a mapping whose
                values are compared against ``lazy_pinyin`` output
                (presumably lists of pinyin syllables -- verify against
                ``DataServer``).
        """
        self.basic = BasicInfo()
        # Use the injected instance rather than a module-level global so the
        # class also works when imported from another module.
        self.data = dataSer

    def recoveryMain(self, sentence):
        """Find the first dictionary entry matching a 2-gram of ``sentence``.

        Args:
            sentence: the raw user question, punctuation included.

        Returns:
            The token list with the matched 2-gram replaced by the dictionary
            key and punctuation tokens removed, or ``None`` when nothing in
            the dictionary matches.
        """
        word_list, word_gram_list = self.gram2Main(sentence)
        user_pin_list = [
            lazy_pinyin(gram, errors='ignore') for gram in word_gram_list
        ]

        # Dictionary order is the outer loop, so the first dictionary entry
        # that matches any 2-gram wins.
        for k, v in self.data.dict_gupiao_pin.items():
            for user_pin in user_pin_list:
                if v == user_pin:
                    print("识别代码===$$$$$$=="+str(k))
                elif self.findok(v, user_pin):
                    print("识别代码===%%%%%%====="+str(k))
                else:
                    continue
                return self.restoreQue(
                    user_pin, user_pin_list, word_gram_list, word_list, k)
        return None

    # After the erroneous span has been identified, rebuild the question.
    def restoreQue(self, user_pin, user_pin_list, word_gram_list, word_list, k):
        """Replace the matched 2-gram with ``k`` and strip punctuation tokens.

        Args:
            user_pin: pinyin of the matched 2-gram; must be an element of
                ``user_pin_list``.
            user_pin_list: pinyin of every 2-gram, parallel to
                ``word_gram_list``.
            word_gram_list: the 2-grams themselves.  Kept for interface
                compatibility; the position alone identifies the tokens.
            word_list: the segmented tokens; modified in place.
            k: dictionary key (stock code) that replaces the two tokens.

        Returns:
            ``word_list`` itself, after replacement and punctuation removal.

        Raises:
            ValueError: if ``user_pin`` is not in ``user_pin_list``.
        """
        index = user_pin_list.index(user_pin)
        # 2-gram ``index`` is word_list[index] + word_list[index + 1], so the
        # two tokens can be replaced by position.  Looking the token up by
        # value after re-segmenting the 2-gram is unreliable: the token may
        # occur earlier in the sentence or be segmented differently.
        word_list[index:index + 2] = [str(k)]

        # Build the filtered list in one pass and assign it back in place;
        # removing items from a list while iterating over it skips elements.
        word_list[:] = [
            word for word in word_list if not self._is_noise(word)
        ]
        return word_list

    def _is_noise(self, word):
        """Return True when ``word`` contains punctuation or a ``\\ue`` marker."""
        # The literal backslash-u-e check presumably targets escaped
        # private-use code points such as "\\ue40a" -- TODO confirm.
        if '\\ue' in word:
            return True
        return any(ch in self._SYMBOL_CHARS for ch in word)

    def gram2Main(self, sentence):
        """Segment ``sentence`` with jieba and build its 2-grams.

        All whitespace is removed first.  Punctuation is deliberately kept
        here; it is filtered out later by ``restoreQue``.

        Returns:
            ``(word_list, word_gram_list)`` as produced by ``test2gram``.
        """
        print("原句===="+sentence)
        sentence = ''.join(sentence.split())
        word_list = list(jieba.cut(sentence, cut_all=False, HMM=True))
        return self.test2gram(word_list)

    def test2gram(self, list2=None):
        """Join every pair of adjacent tokens into a 2-gram.

        Args:
            list2: list of string tokens, e.g.
                ``['请问', '这', '只', '股票']``.  ``None`` is treated as an
                empty list.

        Returns:
            ``(list2, grams)`` where ``grams[i] == list2[i] + list2[i + 1]``.
            The same ``list2`` object is returned, not a copy.
        """
        if list2 is None:
            list2 = []
        word_gram_list = [left + right for left, right in zip(list2, list2[1:])]
        return list2, word_gram_list

    def findok(self, v, user_pin):
        """Decide whether ``user_pin`` is a close-enough match for ``v``.

        A dictionary entry of 4 or more syllables needs at least 3 shared
        syllables; an entry of exactly 3 needs at least 2.  Shorter entries
        never match fuzzily.
        """
        if len(v) >= 4:
            return self.unionlen(user_pin, v) >= 3
        if len(v) == 3:
            return self.unionlen(user_pin, v) >= 2
        return False

    # ``blist`` is the dictionary-side pinyin list.
    def unionlen(self, alist, blist):
        """Count leading entries of ``alist`` that also occur in ``blist``.

        Only the first ``min(len(alist), len(blist))`` entries of ``alist``
        are examined, and repeated entries are counted each time.
        """
        return sum(1 for item in alist[:len(blist)] if item in blist)




if __name__ == '__main__':
    # Demo: run the recovery on two sample questions and print each result.
    dataServer = DataServer()
    recovery = ErrorRecovery(dataServer)
    # Other sample inputs that have been tried:
    #   大脸有一,这只股票怎么样?  浙江现xian,怎么,样?,航茶集怎么样呢  爱第二怎么样
    for question in ("请问老师,爱第二怎么样?", "大脸有一,这只股票,怎么样?"):
        print(recovery.recoveryMain(question))





# 你可能感兴趣的:(python技能)