Python数据分析与机器学习-聚类实践

源码下载:

http://download.csdn.net/download/adam_zs/10195505

Python数据分析与机器学习-聚类实践_第1张图片

import pandas as pd

# Load the beer data set from a space-separated text file.
beer = pd.read_csv("data.txt", sep=" ")
# First rows of the loaded frame, for reference:
#                     name  calories  sodium  alcohol  cost
# 0              Budweiser       144      15      4.7  0.43
# 1                Schlitz       151      19      4.9  0.43
# 2              Lowenbrau       157      15      0.9  0.48
# 3            Kronenbourg       170       7      5.2  0.73
# 4               Heineken       152      11      5.0  0.77

# Features used for clustering; the textual "name" column is left out.
X = beer[["calories", "sodium", "alcohol", "cost"]]

# K-means clustering
from sklearn.cluster import KMeans

km = KMeans(n_clusters=3).fit(X)
beer["cluster_3"] = km.labels_  # cluster label assigned to each row (three clusters)
beer.sort_values("cluster_3", inplace=True)
cluster_centers = km.cluster_centers_  # centroids as reported by the fitted model
# Per-cluster feature means (the cluster centres). Only the feature columns are
# averaged: taking the mean of the string "name" column raises TypeError on
# recent pandas versions instead of being silently dropped.
centers = beer.groupby("cluster_3")[list(X.columns)].mean().reset_index()

import matplotlib.pyplot as plt
import numpy as np

# One colour per cluster label; indexing this array with the label column
# yields a colour for every row.
colors = np.array(['red', 'green', 'blue', 'yellow'])

# plt.scatter(beer["calories"], beer["alcohol"], c=colors[beer["cluster_3"]])  # data points
# plt.scatter(centers["calories"], centers["alcohol"], marker='+', s=300, c='black')  # cluster centres
# plt.xlabel("Calories")
# plt.ylabel("Alcohol")
# plt.show()

# scatter_matrix lives in pandas.plotting; the old pandas.tools.plotting
# module has been removed from pandas.
from pandas.plotting import scatter_matrix

# Pairwise scatter plots of all four features, coloured by K-means cluster.
scatter_matrix(beer[["calories", "sodium", "alcohol", "cost"]], s=100, alpha=1, c=colors[beer["cluster_3"]],
               figsize=(10, 10))
# plt.suptitle("With 3 centroids initialized")
# plt.show()

# Feature standardisation
from sklearn.preprocessing import StandardScaler

# Fit the scaler on X, then apply it to the same data.
X_scaled = StandardScaler().fit(X).transform(X)
# First rows of X_scaled, for reference:
#  [ 0.38791334  0.00779468  0.43380786 -0.45682969]
#  [ 0.6250656   0.63136906  0.62241997 -0.45682969]
#  [ 0.82833896  0.00779468 -3.14982226 -0.10269815]
#  [ 1.26876459 -1.23935408  0.90533814  1.66795955]
#  [ 0.65894449 -0.6157797   0.71672602  1.95126478]

# Cluster evaluation: the silhouette coefficient
#
# For every sample i:
#   * a(i) is the mean distance from i to the other samples of its own cluster
#     (intra-cluster dissimilarity). The smaller a(i), the better i fits its
#     cluster.
#   * b(i, j) is the mean distance from i to all samples of another cluster
#     C_j; the inter-cluster dissimilarity is b(i) = min{b(i, 1), ..., b(i, k)}.
#   * s(i) = (b(i) - a(i)) / max(a(i), b(i))
#
# s(i) close to  1: sample i is clustered sensibly.
# s(i) close to -1: sample i would fit better in another cluster.
# s(i) close to  0: sample i lies on the border between two clusters.

from sklearn import metrics

# Try several values of n_clusters and compare their silhouette scores.
# silhouette_score needs between 2 and n_samples - 1 distinct labels, so the
# upper bound is capped by the number of rows.
k_values = list(range(2, min(20, len(X))))
scores = []
for k in k_values:
    labels = KMeans(n_clusters=k).fit(X).labels_
    scores.append(metrics.silhouette_score(X, labels))

# Print the candidates from best to worst. The (k, score) pairs are ranked
# together so that every score stays attached to the k that produced it;
# `scores` itself is left in k order for the plot below.
for k, score in sorted(zip(k_values, scores), key=lambda pair: pair[1], reverse=True):
    print(k, "\t-", score)

plt.plot(k_values, scores)
plt.xlabel("Number of Clusters Initialized")
plt.ylabel("Silhouette Score")
plt.show()

import pandas as pd

# Reload the data; the frame loaded here carries no cluster columns yet.
beer = pd.read_csv("data.txt", sep=" ")
X = beer[["calories", "sodium", "alcohol", "cost"]]

from sklearn.cluster import DBSCAN

db = DBSCAN(eps=10, min_samples=2).fit(X)
beer["cluster_db"] = db.labels_
beer.sort_values("cluster_db", inplace=True)

# print(beer.groupby("cluster_db").mean())
import numpy as np

# DBSCAN marks noise samples with the label -1; indexing with -1 picks the
# last entry of this array, so noise is drawn in yellow.
# NOTE(review): a fourth cluster (label 3) would share that colour and a fifth
# would raise IndexError - extend the palette if eps/min_samples change.
colors = np.array(['red', 'green', 'blue', 'yellow'])

# scatter_matrix lives in pandas.plotting; the old pandas.tools.plotting
# module has been removed from pandas.
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt

# Pairwise scatter plots of all four features, coloured by DBSCAN label.
scatter_matrix(beer[["calories", "sodium", "alcohol", "cost"]], s=100, alpha=1, c=colors[beer["cluster_db"]],
               figsize=(10, 10))
# DBSCAN has no centroids, so the title describes the parameters actually used.
plt.suptitle("DBSCAN clusters (eps=10, min_samples=2)")
# plt.show()

# Sweep eps and min_samples for DBSCAN and report the silhouette score of
# every combination as "eps min_samples score".
scores = []
from sklearn import metrics

for eps in range(8, 15):
    for min_samples in range(2, 5):
        labels = DBSCAN(eps=eps, min_samples=min_samples).fit(X).labels_
        # silhouette_score raises ValueError unless there are between 2 and
        # n_samples - 1 distinct labels (e.g. everything is noise, or a single
        # cluster swallows all rows). Record NaN for those settings instead of
        # aborting the whole sweep.
        # NOTE(review): the noise label -1 is scored like an ordinary cluster.
        if 2 <= len(set(labels)) <= len(X) - 1:
            score = metrics.silhouette_score(X, labels)
        else:
            score = float("nan")
        scores.append(" ".join(str(value) for value in (eps, min_samples, score)))
for score in scores:
    print(score)


你可能感兴趣的:(python)