Python分单篇文章提取核心词汇

import pandas as pd
import re
import html
from transformers import BertTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# Load the BERT tokenizer used by the text-cleaning step.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Read the 'Indonesia news overall.xlsx' workbook.
indonesia_file_path = 'Indonesia news overall.xlsx'
df_indonesia = pd.read_excel(indonesia_file_path, header=0)  # first row holds the column names

# Merge title and body into a single text column.
# Missing cells are replaced with an empty string before the cast: astype(str)
# on its own turns NaN into the literal word "nan", which would then be merged
# into the article text and counted as a term by the TF-IDF step.
df_indonesia['合并文本'] = (
    df_indonesia['标题'].fillna('').astype(str)
    + ' '
    + df_indonesia['内容'].fillna('').astype(str)
)

# Matches an opening or closing HTML tag such as "<p>", "</div>" or
# '<a href="...">'.  Requiring a letter right after "<" (or "</") keeps plain
# comparisons like "3 < 5" from being mistaken for markup.
_HTML_TAG_RE = re.compile(r'</?[A-Za-z][^<>]*>')


# Text-cleaning function (normalises text by round-tripping it through the BERT tokenizer).
def clean_text_with_bert(text):
    """Return *text* with HTML entities decoded, HTML tags removed and BERT-normalised.

    Args:
        text: Raw article text as a ``str``; it may contain HTML entities
            and tags.

    Returns:
        The string obtained by encoding the stripped text with the
        module-level ``tokenizer`` and decoding the resulting ids again.
    """
    # Decode entities first so that entity-encoded markup ("&lt;p&gt;") is
    # turned into real tags and removed by the next step as well.
    text = html.unescape(text)

    # html.unescape() only decodes entities; the tags themselves have to be
    # stripped explicitly.  A space is substituted so that words separated
    # only by markup ("<p>foo</p><p>bar</p>") are not glued together.
    text = _HTML_TAG_RE.sub(' ', text)

    # Round-trip through the tokenizer: encode without [CLS]/[SEP], then decode
    # while dropping any special tokens.
    # NOTE(review): the file loads the 'bert-base-uncased' checkpoint, so this
    # presumably lower-cases the text as well -- confirm that is intended.
    input_ids = tokenizer.encode(text, add_special_tokens=False)
    cleaned_text = tokenizer.decode(input_ids, skip_special_tokens=True)

    return cleaned_text

# Clean the merged text of every Indonesia article.
df_indonesia['Cleaned_Content'] = df_indonesia['合并文本'].apply(clean_text_with_bert)

# Score the articles with TF-IDF.  max_features=10 restricts the vocabulary to
# the ten terms with the highest term frequency across the corpus (after
# English stop words are dropped), so every article is scored against the
# same ten terms.
tfidf_vectorizer = TfidfVectorizer(max_features=10, analyzer='word', stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(df_indonesia['Cleaned_Content'])
feature_names = tfidf_vectorizer.get_feature_names_out()

# Convert the sparse matrix to a dense array so columns can be sliced directly.
dense_tfidf_matrix = tfidf_matrix.toarray()

# Add one weight column per term.  The column names stay generic
# ("Core_Term_1", ...) so the CSV layout is unchanged, but the term behind each
# column is reported on stdout; without it the weights cannot be interpreted.
for i, feature_name in enumerate(feature_names):
    column_name = f'Core_Term_{i+1}'
    df_indonesia[column_name] = dense_tfidf_matrix[:, i]
    print(f"{column_name} = {feature_name}")

# Save the processed table as a new CSV file.
output_indonesia_file_path = 'Processed_Indonesia_News.csv'
df_indonesia.to_csv(output_indonesia_file_path, index=False, encoding='utf-8')

print("处理后的 'Indonesia news overall.xlsx' 数据已保存至", output_indonesia_file_path)

# Process the 'Laos news overall.xlsx' workbook with the same pipeline.
laos_file_path = 'Laos news overall.xlsx'
df_laos = pd.read_excel(laos_file_path, header=0)  # first row holds the column names

# Merge title and body into a single text column.
# Missing cells are replaced with an empty string before the cast: astype(str)
# on its own turns NaN into the literal word "nan", which would then be merged
# into the article text and scored as a term.
df_laos['合并文本'] = (
    df_laos['标题'].fillna('').astype(str)
    + ' '
    + df_laos['内容'].fillna('').astype(str)
)

# Clean the merged text of every Laos article.
df_laos['Cleaned_Content'] = df_laos['合并文本'].apply(clean_text_with_bert)

# Score the Laos articles with the already-fitted vectorizer.
# NOTE(review): transform() reuses the vocabulary and IDF weights learned from
# the Indonesia corpus, so these columns measure Indonesia's top terms in Laos
# articles.  If Laos-specific core terms are wanted, fit a separate
# TfidfVectorizer on df_laos instead -- confirm the intent.
tfidf_matrix_laos = tfidf_vectorizer.transform(df_laos['Cleaned_Content'])

# Convert the sparse matrix to a dense array so columns can be iterated.
dense_tfidf_matrix_laos = tfidf_matrix_laos.toarray()

# Add one weight column per term; column k of the matrix corresponds to
# feature_names[k], so the numbering matches the Indonesia output.
for position, weights in enumerate(dense_tfidf_matrix_laos.T, start=1):
    df_laos[f'Core_Term_{position}'] = weights

# Save the processed table as a new CSV file.
output_laos_file_path = 'Processed_Laos_News.csv'
df_laos.to_csv(output_laos_file_path, index=False, encoding='utf-8')

print("处理后的 'Laos news overall.xlsx' 数据已保存至", output_laos_file_path)

你可能感兴趣的:(python,人工智能,开发语言)