# Extract place names from English news text, count their occurrences, and build a word cloud.

import pandas as pd
import nltk
from nltk import ne_chunk, word_tokenize, pos_tag
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from concurrent.futures import ThreadPoolExecutor

# Download the NLTK data needed by the tokenizer, POS tagger,
# named-entity chunker, and stopword list.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')  # required by pos_tag(); missing in original
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('stopwords')  # required by nltk.corpus.stopwords below; missing in original

# Global place-name -> frequency counter, merged from per-article results in main().
locations = Counter()

# English stopword set used to filter out generic tokens mis-tagged as place names.
stop_words = set(nltk.corpus.stopwords.words('english'))

# 提取地名并计算频次的函数
def extract_locations(content):
    """Extract GPE (geo-political entity) place names from one article.

    Args:
        content: Raw English text of a single news article.

    Returns:
        Counter mapping each place name found in this article to its
        number of occurrences.
    """
    counts = Counter()
    chunked = ne_chunk(pos_tag(word_tokenize(content)))

    for node in chunked:
        # Named entities come back as nltk.Tree nodes; plain tokens are tuples.
        if not isinstance(node, nltk.Tree):
            continue
        name = " ".join(token for token, _ in node.leaves())
        # 'GPE' marks geo-political entities; skip anything that is a stopword.
        if node.label() == 'GPE' and name.lower() not in stop_words:
            counts[name] += 1
    return counts

# 主程序
def main():
    """Read the news spreadsheet, extract place names, save the counts, and plot a word cloud.

    Side effects: reads '原始新闻合并.xlsx', writes '地名出现次数.xlsx',
    updates the module-level `locations` Counter, and shows a matplotlib window.
    """
    # Read the English news document (replace the path with your own file).
    df = pd.read_excel('原始新闻合并.xlsx')

    # Replace NaN contents with empty strings. Assign back instead of the
    # original chained `inplace=True` fillna, which raises a FutureWarning
    # and is silently ineffective under pandas copy-on-write.
    df['content'] = df['content'].fillna('')

    # Extract place names from each article in parallel.
    # NOTE(review): NLTK chunking is CPU-bound, so threads give limited
    # speedup under the GIL; a process pool would parallelize better.
    NUM_THREADS = 4  # number of worker threads
    with ThreadPoolExecutor(max_workers=NUM_THREADS) as executor:
        results = list(executor.map(extract_locations, df['content']))

    # Merge the per-article counters into the global counter.
    for local_locations in results:
        locations.update(local_locations)

    # Print each place name and its frequency, most frequent first.
    for location, count in locations.most_common():
        print("地名:", location, " 出现次数:", count)

    # Save the counts (sorted by frequency) to Excel.
    location_data = pd.DataFrame(locations.most_common(), columns=['地名', '出现次数'])
    location_data.to_excel('地名出现次数.xlsx', index=False)

    # Render the word cloud from the frequency mapping.
    wordcloud = WordCloud(width=800, height=600,
                          background_color='white').generate_from_frequencies(locations)

    plt.figure(figsize=(10, 8))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()


# Script entry point.
if __name__ == "__main__":
    main()
 

# (blog footer artifact) 你可能感兴趣的:(1024程序员节)