import pandas as pd
import nltk
from nltk import ne_chunk, word_tokenize, pos_tag
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from concurrent.futures import ThreadPoolExecutor
# Download the NLTK data needed for tokenization, POS tagging,
# named-entity chunking, and stopword filtering.
nltk.download('punkt')
nltk.download('maxent_ne_chunker')
nltk.download('words')
# BUG FIX: the stopwords corpus was never downloaded, so the
# nltk.corpus.stopwords.words('english') call below raised LookupError
# on a fresh environment.
nltk.download('stopwords')

# Global counter accumulating place-name frequencies across all documents.
locations = Counter()

# English stopword list used to filter out generic tokens mis-tagged as GPE.
stop_words = set(nltk.corpus.stopwords.words('english'))
def extract_locations(content):
    """Extract place names (GPE entities) from one news article.

    Tokenizes *content*, POS-tags the tokens, runs NLTK's named-entity
    chunker, and counts every chunk labeled 'GPE' whose lowercased text
    is not an English stopword.

    Returns a Counter mapping entity text -> occurrence count.
    """
    counts = Counter()
    chunked = ne_chunk(pos_tag(word_tokenize(content)))
    for node in chunked:
        # Named entities come back as nltk.Tree nodes; plain tagged
        # tokens are skipped.
        if not isinstance(node, nltk.Tree):
            continue
        if node.label() != 'GPE':
            continue
        entity = " ".join(token for token, _ in node.leaves())
        # Filter out generic words the chunker occasionally mislabels.
        if entity.lower() not in stop_words:
            counts[entity] += 1
    return counts
def main():
    """Read the news spreadsheet, count place names, and report results.

    Side effects: prints each place name with its frequency, writes
    '地名出现次数.xlsx', and displays a word cloud of the frequencies.
    """
    # Read the merged English news documents (replace with your own path).
    df = pd.read_excel('原始新闻合并.xlsx')

    # BUG FIX: the original `df['content'].fillna('', inplace=True)` used the
    # chained-inplace pattern that pandas deprecates and that may silently
    # fail to modify the frame; assign the result back instead.  Also cast
    # to str so numeric cells don't crash word_tokenize().
    df['content'] = df['content'].fillna('').astype(str)

    # Fan out per-article extraction across threads.
    # NOTE(review): NLTK chunking is CPU-bound pure Python, so the GIL
    # limits any speedup here; a ProcessPoolExecutor may be worth
    # benchmarking if throughput matters.
    NUM_THREADS = 4  # number of worker threads
    with ThreadPoolExecutor(max_workers=NUM_THREADS) as executor:
        results = list(executor.map(extract_locations, df['content']))

    # Merge the per-article counters into the global tally.
    for local_locations in results:
        locations.update(local_locations)

    # Report each extracted place name and its frequency.
    for location, count in locations.items():
        print("地名:", location, " 出现次数:", count)

    # Persist the counts to Excel.
    location_data = pd.DataFrame(list(locations.items()), columns=['地名', '出现次数'])
    location_data.to_excel('地名出现次数.xlsx', index=False)

    # Render a word cloud sized by place-name frequency.
    wordcloud = WordCloud(width=800, height=600, background_color='white').generate_from_frequencies(locations)
    plt.figure(figsize=(10, 8))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()
# Script entry point: run the pipeline only when executed directly,
# not when imported as a module.
if __name__ == "__main__":
    main()