# 散点图绘制

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import TruncatedSVD
import matplotlib.pyplot as plt

# Cluster news texts with K-means (K=2) on TF-IDF features and show the
# clusters as a 2-D scatter plot.

# Read only the 'content' column of the workbook; it is used as the text data.
file_path = 'nltk处理后新闻合并.xlsx'
sheet_name = 'Sheet1'
data = pd.read_excel(file_path, sheet_name=sheet_name, usecols=['content'])

# Replace missing cells with empty strings. The result is assigned back to the
# frame rather than calling fillna(inplace=True) on the column selection: the
# in-place form operates on an intermediate object and is not guaranteed to
# update `data` (pandas chained-assignment / Copy-on-Write behaviour).
# astype(str) runs after fillna so that NaN does not become the text 'nan',
# and so that any non-string cells are still accepted by the vectorizer.
data['content'] = data['content'].fillna('').astype(str)

# Collect every document as a plain Python list of strings.
all_texts = data['content'].tolist()

# Build the TF-IDF document-term matrix.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(all_texts)

# Run K-means with two clusters. n_init is pinned explicitly so the outcome
# does not depend on the default of the installed scikit-learn version.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=42)
kmeans.fit(tfidf_matrix)

# Project the TF-IDF matrix to two dimensions for plotting. The seed is fixed
# so the layout of the plot is reproducible between runs.
svd = TruncatedSVD(n_components=2, random_state=42)
tfidf_matrix_2d = svd.fit_transform(tfidf_matrix)

# Cluster label assigned to each document, in the same order as all_texts.
cluster_labels_2 = kmeans.labels_

# Draw the K=2 scatter plot, colouring each point by its cluster label.
plt.figure(figsize=(6, 6))
plt.scatter(
    tfidf_matrix_2d[:, 0],
    tfidf_matrix_2d[:, 1],
    c=cluster_labels_2,
    cmap='Set1',
    alpha=0.5,
)
plt.title('KMeans Clustering with K=2')
plt.xlabel('Dimension 1')
plt.ylabel('Dimension 2')

plt.show()
 

# 你可能感兴趣的:(1024程序员节)