数据分析之鸢尾花:KMeans、层次、DBSCAN 聚类简单实现,评价指标:调整兰德系数、轮廓系数

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Load the iris measurements from a local CSV file.
# The path is a raw string: in a normal string literal "\T" and "\i" are
# invalid escape sequences (SyntaxWarning on Python 3.12+). The resulting
# path is unchanged.
iris = pd.read_csv(r"D:\Test\iris1.csv")
# Summary statistics; the value is only displayed when this runs in an
# interactive session / notebook cell, otherwise it is discarded.
iris.describe()

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

# Encode the class column "style" as integer codes. LabelEncoder operates on a
# 1-D array of labels, so the column is passed as-is instead of being reshaped
# into an (n, 1) column vector (which triggers a data-conversion warning).
iris["style"] = LabelEncoder().fit_transform(iris["style"])

# Standardise the four feature columns. StandardScaler centres and scales each
# column independently, so one call over all columns gives the same values as
# fitting a separate scaler per column.
column_list = ["sl", "sw", "pl", "pw"]
iris[column_list] = StandardScaler().fit_transform(iris[column_list])

# x: standardised features used for clustering; y: encoded reference labels
# used only for external evaluation.
x = iris[column_list]
y = iris["style"]

 

# K-Means clustering

from sklearn import metrics
from sklearn.cluster import KMeans

# NOTE(review): 4 clusters are requested here while the hierarchical section
# of this script uses 3 -- confirm that 4 is intended.
# random_state is fixed so that the plot and the scores are reproducible.
kmeans = KMeans(n_clusters=4, random_state=0)
kmeans.fit(x)
pre_label = kmeans.labels_

# One (marker, colour) pair per cluster id, drawn on the "sl"/"sw" plane.
cluster_styles = [("*", "red"), (".", "blue"), ("+", "black"), ("o", "orange")]
for cluster_id, (marker, color) in enumerate(cluster_styles):
    members = x[pre_label == cluster_id]
    plt.scatter(members["sl"], members["sw"], marker=marker, c=color)
plt.xlabel("sl")
plt.ylabel("sw")
plt.show()

# The silhouette coefficient is an internal measure and must be computed from
# the predicted cluster labels; scoring the reference labels `y` would print
# the same number no matter which clustering was run.
print("轮廓系数:",metrics.silhouette_score(x,pre_label,metric='euclidean'))
# Adjusted Rand index: agreement between reference labels and the clustering.
print("兰德系数:",metrics.adjusted_rand_score(y,pre_label))

# Hierarchical (agglomerative) clustering

from sklearn import metrics
from sklearn.cluster import AgglomerativeClustering

agg = AgglomerativeClustering(n_clusters=3, linkage="average").fit(x)
pre_label = agg.labels_

# One (marker, colour) pair per cluster id, drawn on the "sl"/"sw" plane.
# Only three clusters are requested, so only three styles are needed (the
# label 3 never occurs).
cluster_styles = [("*", "red"), (".", "blue"), ("+", "black")]
for cluster_id, (marker, color) in enumerate(cluster_styles):
    members = x[pre_label == cluster_id]
    plt.scatter(members["sl"], members["sw"], marker=marker, c=color)
plt.xlabel("sl")
plt.ylabel("sw")
plt.show()

# The silhouette coefficient must be computed from the predicted cluster
# labels; scoring the reference labels `y` says nothing about this clustering.
print("轮廓系数:",metrics.silhouette_score(x,pre_label,metric='euclidean'))
# Adjusted Rand index: agreement between reference labels and the clustering.
print("兰德系数:",metrics.adjusted_rand_score(y,pre_label))


# DBSCAN clustering

from sklearn import metrics
from sklearn.cluster import DBSCAN

dbscan = DBSCAN(eps=0.5, min_samples=2)
dbscan.fit(x)
pre_label = dbscan.labels_

# DBSCAN decides the number of clusters itself and marks noise with label -1,
# so the plot iterates over the labels that actually occur instead of assuming
# exactly the ids 0..3.
markers = ["*", ".", "+", "o"]
colors = ["red", "blue", "black", "orange"]
# The first four styles match the other plots in this script; the remaining
# marker/colour pairings are appended so further clusters stay distinguishable.
base_styles = list(zip(markers, colors))
cluster_styles = base_styles + [
    (m, c) for m in markers for c in colors if (m, c) not in base_styles
]
for cluster_id in sorted(set(pre_label)):
    members = x[pre_label == cluster_id]
    if cluster_id == -1:
        # Noise points: drawn in a neutral style rather than dropped.
        plt.scatter(members["sl"], members["sw"], marker="x", c="gray")
        continue
    marker, color = cluster_styles[cluster_id % len(cluster_styles)]
    plt.scatter(members["sl"], members["sw"], marker=marker, c=color)
plt.xlabel("sl")
plt.ylabel("sw")
plt.show()

# The silhouette coefficient must be computed from the predicted labels, not
# from the reference labels `y`. It is only defined when the number of
# distinct labels is between 2 and n_samples - 1; outside that range
# silhouette_score raises ValueError, so NaN is reported instead.
# Noise (-1) is treated as its own label here.
n_labels = len(set(pre_label))
if 1 < n_labels < len(x):
    silhouette = metrics.silhouette_score(x, pre_label, metric='euclidean')
else:
    silhouette = float("nan")
print("轮廓系数:",silhouette)
# Adjusted Rand index: agreement between reference labels and the clustering.
print("兰德系数:",metrics.adjusted_rand_score(y,pre_label))

你可能感兴趣的:(菜鸟计划)