# Obtain n-grams using accessor variety (利用accessor variety得到ngram)

import re
def read_tsv(file_path):
    """Read a character-per-line TSV file and split it into sentences.

    Each non-blank line carries one character in its first whitespace-separated
    field and that character's label in its last field.  A sentence ends at a
    blank line or at any of a fixed set of half-width punctuation marks (the
    punctuation character itself is kept as the last token of the sentence).
    A trailing sentence with no terminator is dropped, matching the original
    behaviour.

    Returns:
        (sentence_list, label_list): parallel lists; element i of each is the
        list of characters / labels of sentence i.
    """
    terminators = (',', '。', '?', '!', ':', ';', '(', ')', '、')
    sentence_list, label_list = [], []
    sentence, labels = [], []

    def flush():
        # Close out the current sentence if it holds anything; return fresh lists.
        if sentence:
            sentence_list.append(sentence)
            label_list.append(labels)
        return [], []

    with open(file_path, 'r', encoding='utf-8') as f:
        for raw in f:
            raw = raw.strip()
            if not raw:
                sentence, labels = flush()
                continue
            fields = re.split('\\s+', raw)  # runs of whitespace count as one separator
            sentence.append(fields[0])
            labels.append(fields[-1])
            if fields[0] in terminators:
                # Punctuation also terminates the sentence.
                sentence, labels = flush()
    return sentence_list, label_list


def av(train_path, eval_path, min_freq=2, av_threshold=5, max_ngram_len=5):
    """Extract n-grams from two corpora using the accessor-variety criterion.

    Sentences read from both files are joined into strings and split on any
    run of characters that is not CJK (\\u4e00-\\u9fa5), a digit, or an ASCII
    letter.  Every n-gram (1 <= n <= max_ngram_len) inside the resulting
    segments is counted, and for each n-gram the accessor variety is
    min(#distinct left-neighbour chars, #distinct right-neighbour chars).

    Args:
        train_path: path of the training TSV file (read via read_tsv).
        eval_path: path of the evaluation TSV file (read via read_tsv).
        min_freq: minimum occurrence count an n-gram must reach.
        av_threshold: minimum accessor variety an n-gram must reach.
        max_ngram_len: longest n-gram to consider (was hard-coded to 5).

    Returns:
        dict mapping each qualifying n-gram to its occurrence count.
    """
    train_sentences, _ = read_tsv(train_path)
    eval_sentences, _ = read_tsv(eval_path)

    # Join each token list into one string and keep only maximal runs of
    # CJK / alphanumeric characters.
    # BUG FIX: the original called re.split without the string to split
    # (TypeError) and started the CJK range at \u4eff instead of \u4e00.
    segments = []
    for sen in train_sentences + eval_sentences:
        for seg in re.split(u'[^\u4e00-\u9fa50-9a-zA-Z]+', ''.join(sen)):
            if seg:  # drop empty pieces produced at segment boundaries
                segments.append(seg)

    ngram_freq = {}  # ngram -> occurrence count
    ngram2av = {}    # ngram -> {'l': {left neighbours}, 'r': {right neighbours}}
    for segment in segments:
        seg_len = len(segment)
        for i in range(seg_len):
            for n in range(1, max_ngram_len + 1):
                if i + n > seg_len:
                    break
                n_gram = segment[i:i + n]
                # BUG FIX: the original incremented a key it had just found
                # missing (KeyError) and then stored the neighbour dicts into
                # the frequency dict; the neighbour dicts belong in ngram2av,
                # which was otherwise never populated.
                if n_gram not in ngram_freq:
                    ngram_freq[n_gram] = 1
                    ngram2av[n_gram] = {'l': {}, 'r': {}}
                else:
                    ngram_freq[n_gram] += 1
                # BUG FIX: use >= 0 so the character at index 0 counts as the
                # left neighbour of an n-gram starting at index 1.
                if i - 1 >= 0:
                    ngram2av[n_gram]['l'][segment[i - 1]] = 1
                if i + n < seg_len:
                    ngram2av[n_gram]['r'][segment[i + n]] = 1

    remaining_ngram = {}
    for ngram, av_dict in ngram2av.items():
        avl = len(av_dict['l'])  # number of distinct preceding characters
        avr = len(av_dict['r'])  # number of distinct following characters
        if min(avl, avr) >= av_threshold and ngram_freq[ngram] >= min_freq:
            remaining_ngram[ngram] = ngram_freq[ngram]
    return remaining_ngram

# 你可能感兴趣的:(python编程)  -- blog footer ("You may also be interested in: python programming"), not code