import pandas as pd
import re
import nltk
from nltk import word_tokenize, pos_tag
# Download the NLTK resources needed for tokenization and POS tagging
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
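# Note (assumption about the installed NLTK version): NLTK 3.9+ renamed these
# resources, so if tokenization or tagging raises a LookupError at runtime,
# the newer names may be needed as well:
# nltk.download('punkt_tab')
# nltk.download('averaged_perceptron_tagger_eng')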
# Load the stop-word list from a local file, one word per line
with open('en_stopwords.txt', 'r', encoding='utf-8') as f:
    stopwords_set = set(f.read().splitlines())
# Read the Excel file, keeping only the 'content' column
df = pd.read_excel('原始新闻合并.xlsx', usecols=['content'])
# Text-cleaning and POS-tagging function
def clean_and_tag(text):
    # Only process strings; empty cells arrive from pandas as NaN (float)
    if isinstance(text, str):
        # Strip HTML tags
        cleaned_text = re.sub(r'<.*?>', '', text)
        # Collapse runs of whitespace and newlines into single spaces
        cleaned_text = re.sub(r'\s+', ' ', cleaned_text)
        # Tokenize
        words = word_tokenize(cleaned_text)
        # Lowercase the tokens and drop stop words
        filtered_words = [word.lower() for word in words if word.lower() not in stopwords_set]
        # POS-tag the remaining tokens
        tagged_words = pos_tag(filtered_words)
        # Join each token with its tag as "word/TAG"
        tagged_text = ' '.join([f"{word}/{tag}" for word, tag in tagged_words])
        return tagged_text
    else:
        # Non-string input (e.g. NaN): fall back to an empty string
        return ''
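# Quick sanity check on a made-up snippet (illustrative only: it assumes 'the'
# appears in en_stopwords.txt, and the exact tags depend on the tagger model):
# >>> clean_and_tag('<p>The  markets\nrallied today.</p>')
# 'markets/NNS rallied/VBD today/NN ./.'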
# Clean and POS-tag every news item
df['cleaned_content'] = df['content'].apply(clean_and_tag)
# Save the cleaned text to an Excel file
df['cleaned_content'].to_excel('清洗后的文本.xlsx', index=False)
# Report where the cleaned text was written
print("Cleaned text saved to '清洗后的文本.xlsx'.")
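# For large corpora, a batched alternative is nltk.pos_tag_sents, which tags a
# list of token lists in one call instead of invoking the tagger row by row.
# A minimal sketch (not a drop-in replacement: it skips the HTML and stop-word
# cleaning steps above):
# tokenized = [word_tokenize(t) if isinstance(t, str) else [] for t in df['content']]
# tagged_docs = nltk.pos_tag_sents(tokenized)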