预处理

# English stop-word list as returned by NLTK. This lookup needs the NLTK
# "stopwords" corpus to be installed locally (see nltk.download).
stopwords = nltk.corpus.stopwords.words("english")
# Set copy of the list above; clean_text uses it for its membership tests.
eng_stopwords = {*stopwords}
def clean_text(text: str) -> str:
    """Normalize raw (possibly HTML) text into space-separated lowercase words.

    The text is stripped of HTML markup, every character outside ASCII
    ``a-z`` / ``A-Z`` is replaced by a space, the result is lowercased and
    split on whitespace, and words found in ``eng_stopwords`` are dropped.

    Args:
        text: Raw input text; may contain HTML markup.

    Returns:
        The remaining words joined by single spaces, or an empty string
        when no word survives the filtering.
    """
    # Join text nodes with a space. The default get_text() concatenates them
    # directly, so "good<br />movie" would become the bogus token "goodmovie".
    text = BeautifulSoup(text, 'html.parser').get_text(separator=' ')
    # Digits, punctuation and non-ASCII characters all act as word separators.
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    words = [w for w in text.lower().split() if w not in eng_stopwords]
    return ' '.join(words)

你可能感兴趣的:(预处理)