K-MEANS聚类分析银行数据分析记录
调用的包
import seaborn as sns
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestRegressor
# Read the data file. 'data.xlsx' is an Excel workbook, so it must be loaded
# with read_excel — read_csv cannot parse the binary xlsx format.
df = pd.read_excel('data.xlsx')
# Data preprocessing
# (1) Missing data: non-positive ages are invalid; replace them with the
# mean age computed over the valid (positive) entries.
a = df[df['年龄'] > 0]
b = a['年龄']
# Vectorized .loc assignment replaces the original per-row loop, which wrote
# through chained indexing (df['年龄'][i] = ...) — a SettingWithCopy hazard
# in pandas whose writes may silently not reach df.
df.loc[df['年龄'] <= 0, '年龄'] = b.mean()
# (2) Outlier handling: fill missing ages with the mean of the valid ages,
# and map missing or bogus occupation codes (NaN, 9999, '@') to 0.
df['年龄'] = df['年龄'].fillna(b.mean())
df['职业'] = df['职业'].fillna(0)
df['职业'] = df['职业'].replace([9999, '@'], 0)
# Feature correlation analysis: compute the pairwise correlation matrix of
# the twelve numeric features and visualize it as an annotated heatmap.
# NOTE(review): df1 appears to be the cleaned frame with English column
# names, defined elsewhere in the original script — confirm.
list2 = ['age', 'money_type', 'career', 'custo_type', 'balance', 'loan', 'pos', 'alipay', 'qqpay', 'jingdong', 'cloud', 'jin']
df2 = df1[list2]
data_corr = df2.corr()
# Plot the correlation matrix just computed (the original passed an
# unrelated variable `data` to heatmap, ignoring data_corr entirely).
sns.heatmap(data_corr, square=True, linewidth=0.5, annot=True)
plt.show()
# Box plot of a single feature, showing max, min, median and the upper and
# lower quartiles.
sns.boxplot(df1['age'])
plt.show()  # the original wrote `plt.show1`, which never renders the figure
# Box plots for all 12 features, laid out on a 3x4 subplot grid.
# The original never initialized the subplot counter `k` (NameError at
# runtime); a single counter loop also replaces the two nested index loops
# whose indices `i` and `j` were never used.
for k in range(1, 13):
    plt.subplot(3, 4, k)
    sns.boxplot(df1.iloc[:, k - 1], orient='vertical')
plt.show()
# 4. K-means clustering analysis, varying k.
def K_means(k, data):
    """Cluster `data` into `k` groups with K-means.

    Returns (centers, r): `centers` is the (k, n_features) array of cluster
    centroids and `r` is `data` with an extra '聚类类别' column holding each
    row's assigned cluster label.
    """
    # The n_jobs keyword was deprecated in scikit-learn 0.23 and removed in
    # 1.0; passing it now raises TypeError, so it is dropped here.
    kmodel = KMeans(n_clusters=k)
    kmodel.fit(data)
    # Append the labels as a new column, aligned on data's index.
    r = pd.concat([data, pd.Series(kmodel.labels_, index=data.index)], axis=1)
    r.columns = list(data.columns) + ['聚类类别']
    centers = kmodel.cluster_centers_
    return centers, r
# Sweep k from 2 to 7: cluster, fit a random forest on the cluster labels to
# gauge feature importance, and scatter-plot each clustering (feature 0 vs
# feature 4) with its centroids.
colors = ['r', 'c', 'b', 'g', 'm', 'gold', 'steelblue', 'crimson', 'navy', 'forestgreen']
plt.figure()
for k in range(2, 8):
    centers, r = K_means(k, data)
    X = r[list1]
    Y = r['聚类类别']
    # NOTE(review): the labels are categorical, so RandomForestClassifier
    # would be the more natural model; the regressor is kept to preserve
    # the original analysis.
    clf = RandomForestRegressor(oob_score=True)
    model1 = clf.fit(X, Y)
    print(k)  # k=5 looked most suitable in this run
    print(clf.oob_score_)
    for importance in model1.feature_importances_:
        print(importance)
    # One subplot per k: each cluster's points plus its centroid drawn as a
    # larger black-ringed marker.
    plt.subplot(2, 3, k - 1)
    for j in range(k):
        index_set = np.where(r['聚类类别'] == j)
        cluster = data.iloc[index_set]
        plt.scatter(cluster.iloc[:, 0], cluster.iloc[:, 4], c=colors[j], marker='.')
        plt.plot(centers[j][0], centers[j][4], 'o',
                 markerfacecolor=colors[j], markeredgecolor='k', markersize=8)
plt.show()
不同K值的聚类效果
5. 聚类后的特征重要性分析,使用随机森林
# 5. Post-clustering feature-importance analysis with a random forest.
# NOTE(review): this section repeats section 4 essentially verbatim —
# consider deleting one copy or factoring the sweep into a function.
colors = ['r', 'c', 'b', 'g', 'm', 'gold', 'steelblue', 'crimson', 'navy', 'forestgreen']
plt.figure()
for k in range(2, 8):
    centers, r = K_means(k, data)
    X = r[list1]
    Y = r['聚类类别']
    # NOTE(review): labels are categorical — RandomForestClassifier would be
    # the natural choice; the regressor is kept to preserve the original.
    clf = RandomForestRegressor(oob_score=True)
    model1 = clf.fit(X, Y)
    print(k)
    print(clf.oob_score_)
    for importance in model1.feature_importances_:
        print(importance)
    # One subplot per k: each cluster's points plus its centroid.
    plt.subplot(2, 3, k - 1)
    for j in range(k):
        index_set = np.where(r['聚类类别'] == j)
        cluster = data.iloc[index_set]
        plt.scatter(cluster.iloc[:, 0], cluster.iloc[:, 4], c=colors[j], marker='.')
        plt.plot(centers[j][0], centers[j][4], 'o',
                 markerfacecolor=colors[j], markeredgecolor='k', markersize=8)
plt.show()
通过上述过程可以知道最合适的k值是哪个。