我正在练习使用NLTK从原始tweets中删除某些特性,然后希望删除(对我来说)无关的tweets(例如空tweet或单字tweets)。不过,似乎有些单字微博并没有被删除。我还面临着一个问题,无法删除任何停止词,无论是在开始或结束的句子。
有什么建议吗?目前,我希望传递一个句子作为输出,而不是一个标记词列表。
欢迎对改进代码(处理时间、优雅度)的任何其他评论。import string
import numpy as np
import nltk
from nltk.corpus import stopwords
cache_english_stopwords=stopwords.words('english')
cache_en_tweet_stopwords=stopwords.words('english_tweet')
# For clarity, df is a pandas dataframe with a column['text'] together with other headers.
def tweet_clean(df):
temp_df = df.copy()
# Remove hyperlinks
temp_df.loc[:, "text"] = temp_df.loc[:, "text"].replace('https?:\/\/.*\/\w*', '', regex=True)
# Remove hashtags
# temp_df.loc[:,"text"]=temp_df.loc[:,"text"].replace('#\w*', '', regex=True)
temp_df.loc[:, "text"] = temp_df.loc[:, "text"].replace('#', ' ', regex=True)
# Remove citations
temp_df.loc[:, "text"] = temp_df.loc[:, "text"].replace('\@\w*', '', regex=True)
# Remove tickers
temp_df.loc[:, "text"] = temp_df.loc[:, "text"].replace('\$\w*', '', regex=True)
# Remove punctuation
temp_df.loc[:, "text"] = temp_df.loc[:, "text"].replace('[' + string.punctuation + ']+', '', regex=True)
# Remove stopwords
for tweet in temp_df.loc[:,"text"]:
tweet_tokenized=nltk.word_tokenize(tweet)
for w in tweet_tokenized:
if (w.lower() in cache_english_stopwords) | (w.lower() in cache_en_tweet_stopwords):
temp_df.loc[:, "text"] = temp_df.loc[:, "text"].replace('[\W*\s?\n?]'+w+'[\W*\s?]', ' ', regex=True)
#print("w in stopword")
# Remove quotes
temp_df.loc[:, "text"] = temp_df.loc[:, "text"].replace('\&*[amp]*\;|gt+', '', regex=True)
# Remove RT
temp_df.loc[:, "text"] = temp_df.loc[:, "text"].replace('\s+rt\s+', '', regex=True)
# Remove linebreak, tab, return
temp_df.loc[:, "text"] = temp_df.loc[:, "text"].replace('[\n\t\r]+', ' ', regex=True)
# Remove via with blank
temp_df.loc[:, "text"] = temp_df.loc[:, "text"].replace('via+\s', '', regex=True)
# Remove multiple whitespace
temp_df.loc[:, "text"] = temp_df.loc[:, "text"].replace('\s+\s+', ' ', regex=True)
# Remove single word sentence
for tweet_sw in temp_df.loc[:, "text"]:
tweet_sw_tokenized = nltk.word_tokenize(tweet_sw)
if len(tweet_sw_tokenized) <= 1:
temp_df.loc["text"] = np.nan
# Remove empty rows
temp_df.loc[(temp_df["text"] == '') | (temp_df['text'] == ' ')] = np.nan
temp_df = temp_df.dropna()
return temp_df