k均值与轮廓法则

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt

# Input workbook and the column that holds the pre-processed news text.
FILE_PATH = '处理好后文本.xlsx'
SHEET_NAME = 'Sheet1'  # change to the actual worksheet name
TEXT_COLUMN = 'cleaned_content'

# Inclusive range of cluster counts to evaluate, and the seed shared by the
# randomized estimators so that repeated runs give the same result.
K_MIN = 2
K_MAX = 10
RANDOM_STATE = 42


def clean_texts(column):
    """Return the values of a pandas Series as a list of strings.

    Missing values become empty strings and every remaining value is cast to
    ``str`` so that non-text cells (e.g. numbers) can be vectorized.

    The result of ``fillna`` is used directly instead of calling it with
    ``inplace=True`` on a column selection: that chained form is deprecated in
    pandas and does not update the parent frame under copy-on-write.
    """
    # Fill before casting; casting first would turn NaN into the text 'nan'.
    return column.fillna('').astype(str).tolist()


def candidate_cluster_counts(n_samples, k_min=K_MIN, k_max=K_MAX):
    """Return the range of cluster counts that can be scored for *n_samples*.

    The silhouette score is only defined for 2 <= n_labels <= n_samples - 1,
    so the upper bound is capped accordingly. The range is empty when there
    are too few samples for any candidate.
    """
    return range(k_min, min(k_max, n_samples - 1) + 1)


def select_clustering(matrix, k_values, random_state=RANDOM_STATE):
    """Fit KMeans for every candidate K and keep the best silhouette score.

    Args:
        matrix: Feature matrix with one row per document.
        k_values: Iterable of cluster counts to try.
        random_state: Seed passed to every KMeans instance.

    Returns:
        A ``(best_k, best_labels)`` tuple for the highest-scoring candidate;
        on ties the smallest K wins.

    Raises:
        ValueError: If *k_values* yields no candidate.
    """
    best_score = -1
    best_k = None
    best_labels = None

    for k in k_values:
        labels = KMeans(n_clusters=k, random_state=random_state).fit_predict(matrix)
        score = silhouette_score(matrix, labels)

        print(f"For K={k}, Silhouette Score: {score:.2f}")

        if best_labels is None or score > best_score:
            # Keep the labels of the winning fit so the model does not have to
            # be refitted afterwards with the same K and seed.
            best_score = score
            best_k = k
            best_labels = labels

    if best_labels is None:
        raise ValueError(
            "Not enough documents to evaluate any cluster count: "
            "silhouette analysis needs at least 3 samples."
        )
    return best_k, best_labels


def plot_clusters(matrix, labels, k, random_state=RANDOM_STATE):
    """Project *matrix* to two dimensions and show a scatter plot by cluster."""
    # Seed the SVD so the projection is the same on every run.
    svd = TruncatedSVD(n_components=2, random_state=random_state)
    points = svd.fit_transform(matrix)

    plt.figure(figsize=(10, 6))
    plt.scatter(points[:, 0], points[:, 1], c=labels, cmap='Set1', alpha=0.5)

    plt.title(f'KMeans Clustering with K={k}')
    plt.xlabel('Dimension 1')
    plt.ylabel('Dimension 2')

    plt.show()


def main():
    """Cluster the news texts from the workbook and plot the result."""
    # Read the Excel sheet and extract the cleaned text of every row.
    data = pd.read_excel(FILE_PATH, sheet_name=SHEET_NAME)
    all_texts = clean_texts(data[TEXT_COLUMN])

    # Turn the documents into TF-IDF feature vectors.
    tfidf_matrix = TfidfVectorizer().fit_transform(all_texts)

    # Choose K by silhouette score, limited to values valid for this corpus.
    k_values = candidate_cluster_counts(tfidf_matrix.shape[0])
    optimal_k, cluster_labels = select_clustering(tfidf_matrix, k_values)

    print(f"Optimal K based on Silhouette Score: {optimal_k}")

    plot_clusters(tfidf_matrix, cluster_labels, optimal_k)


if __name__ == "__main__":
    main()
 

你可能感兴趣的:(python,机器学习,开发语言)