NLP之滑动窗口函数

import re


def compute_ngrams(word):
    """Return the distinct sliding-window n-grams of ``word``.

    ``word`` is first normalised by ``segword``, which lowercases it, drops
    spaces and replaces every run of ASCII letters/digits with a
    one-character tag. Windows are then taken over that tagged string, so an
    alphanumeric run such as ``"cad"`` counts as a single position and is
    never split in the middle.

    Args:
        word: The text to slice into n-grams.

    Returns:
        A list of unique n-grams covering every window length from 2 up to
        the length of the tagged string, in first-seen order (shorter
        windows first, left to right). Inside each n-gram a tag is expanded
        back to its alphanumeric run followed by one space, e.g.
        ``"cad模具设计"`` yields ``"cad 模"``, ``"模具"``, ...,
        ``"cad 模具设计"``. The list is empty when the tagged string has
        fewer than two characters.
    """
    extended_word, tag_dict = segword(word)
    min_n = 2
    total = len(extended_word)

    ngrams = []
    for ngram_length in range(min_n, total + 1):
        for start in range(total - ngram_length + 1):
            window = extended_word[start:start + ngram_length]
            # Expand only characters that really are tags; any other
            # character (including a stray ASCII letter) is kept verbatim
            # instead of raising KeyError.
            ngrams.append(''.join(
                tag_dict[c] + ' ' if c in tag_dict else c for c in window
            ))
    # dict.fromkeys removes duplicates while keeping a stable order, so the
    # result does not vary between runs the way iterating a set of str does.
    return list(dict.fromkeys(ngrams))


def segword(word):
    """Replace each ASCII alphanumeric run in ``word`` with a one-letter tag.

    The input is lowercased, every maximal run of ASCII letters/digits is
    swapped for the next unused uppercase letter (``'A'``, ``'B'``, ...) in
    order of appearance, and space characters are removed. Because the text
    is lowercased first, the uppercase tags cannot collide with its content.

    Args:
        word: The text to normalise.

    Returns:
        A ``(tagged_text, tag_word)`` tuple: ``tagged_text`` is the
        normalised string and ``tag_word`` maps each tag to the lowercased
        run it stands for, e.g. ``("A模具设计", {"A": "cad"})`` for
        ``"cad模具设计"``.

    Raises:
        IndexError: If ``word`` contains more than 26 alphanumeric runs,
            since only the letters ``A``-``Z`` are available as tags.
    """
    tag_alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    pattern = r'[a-zA-Z0-9]+'

    lowered = word.lower()
    runs = re.findall(pattern, lowered)
    if len(runs) > len(tag_alphabet):
        raise IndexError(
            f"segword supports at most {len(tag_alphabet)} alphanumeric "
            f"runs, got {len(runs)}"
        )

    # zip stops at the shorter input, so exactly one tag is paired per run.
    tag_word = dict(zip(tag_alphabet, runs))

    # Substitute in a single left-to-right pass so every run is replaced at
    # its own position. Chained str.replace calls would also rewrite the
    # same characters inside other runs (e.g. the "a" in "ab").
    pending_tags = iter(tag_word)
    tagged = re.sub(pattern, lambda _match: next(pending_tags), lowered)

    # Spaces are dropped only after tagging, so runs separated by a space
    # remain separate tags.
    new_word = tagged.replace(' ', '')
    return new_word, tag_word


# Demo: compute and print the n-grams of a sample string that mixes ASCII
# letters with Chinese characters.
r = compute_ngrams("cad模具设计")
print(r)

你可能感兴趣的:(NLP之滑动窗口函数)