聚类分析(银行客户画像)

# CDA 11 聚类分析(客户画像)
# 导入数据
customer<- read.csv("D:\\桃子的数据\\CDA\\11 聚类分析\\课件&代码-11.客户画像\\Data\\Age_Income3.csv",header=T,sep=',')
names(customer)
customer1<- customer[,c(2,4)]
names(customer1)

#### 11.2---银行客户 k均值/层次聚类----####
names(customer1)
#找到聚类数--pam算法
pamkmd<-pamk(customer1)
pamkmd$nc
layout(matrix(c(1,2),1,2))
plot(pamkmd$pamobject)
# 结果:2类

#找到聚类数--轮廓系数
silhouette()
library(cluster)

result <- list()
for (i in 2:5){
  kmd <- kmeans(customer1,centers = i)
  sil <- silhouette(kmd$cluster, dist(customer1))
  result[[paste('k=',i,sep='')]] <- mean(sil[,'sil_width'])
}
result
# K=2时轮廓系数最高,k=3时候轮廓系数最小,但是从图中看存在离群点,因此用聚类法5类来删掉离群点
#选择聚类5类,去掉离群值的类
#1.正规化后再聚类
library(clusterSim)
customer2<- data.Normalization(customer1,type= "n4")
kmd <- kmeans(customer2,centers = 5)
plot(customer2,col=kmd$cluster)
table(kmd$cluster)
#去掉含有2个异常值的2个类
customer3<- customer1[kmd$cluster!=3&kmd$cluster!=4,]
# #2.正规化后再聚类
# customer4<- data.Normalization(customer3,type= "n4")
# kmd <- kmeans(customer4,centers = 4)
# plot(customer4,col=kmd$cluster)
# table(kmd$cluster)
# #去掉第二类,因为样本有2个异常值
# customer5<- customer3[kmd$cluster!=2,]
#3.正规化后再次寻找最优聚类数
customer6<- data.Normalization(customer3,type= "n4")
write.csv(customer6,file = "D:\\桃子的数据\\CDA\\11 聚类分析\\课件&代码-11.客户画像\\Data\\customer6.csv")
#找到聚类数--pam算法
pamkmd<-pamk(customer6)
pamkmd$nc
layout(matrix(c(1,2),1,2))
plot(pamkmd$pamobject)
#聚类
kmd <- kmeans(customer6,centers = 3)
plot(customer6,col=kmd$cluster)
table(kmd$cluster)

#------ 找到三个类,特征描述------#
plot(customer6,col=kmd$cluster)
points(kmd$centers,col=1:3,pch=8,cex=2)
table(kmd$cluster)
#对原始数据(未正规化但已经去掉异常值的数据)进行解释
write.csv(customer3,file = "D:\\桃子的数据\\CDA\\11 聚类分析\\课件&代码-11.客户画像\\Data\\customer3.csv")
clust <- list()
for (i in 1:3){
  clust[[i]]<- customer3[kmd$cluster==i,]
  result<- data.frame()
}
for (i in 1:3){
  out<-data.frame(cluster=i,Income = mean(clust[[i]]$Income),Age= mean(clust[[i]]$Age))
  result<- rbind(result,out)
}
re1<-as.data.frame(result)
# re1$cluster<- factor(re1$cluster,levels = c(1,2,3),labels = c("年老多金", "中年收入不高", "富裕青年"))
#数据每个类的均值
kmd$cluster
#rechart作图
customer4<- cbind(customer3,cluster=kmd$cluster)
customer4$cluster<- factor(customer4$cluster,levels = c(1,2,3),labels = c("年老多金", "中年收入不高", "富裕青年"))
library(recharts)
png("D:\\桃子的数据\\CDA\\cluster.png")
ePoints(customer4[,c(2:4)],xvar =~Income,yvar = ~Age,series = ~cluster,
        xlab.name = "Income:千元",ylab.name = "Age:岁",xlab.namePosition = "end",ylab.namePosition = "end",theme = 2)
dev.off()

数据预览(前5行))

CID

Income

Sex

Age

Marriage

1

70

M

15

S

2

90

M

17

M

3

80

M

20

S

4

220

M

22

S

5

75

F

24

S

 

 

 

聚类分析(银行客户画像)_第1张图片

 

第一类:年老多金型,年龄均值为55岁,收入约9.6万每年,这类人群财富有所积累,但可能思想保守,可推荐保本型投资的理财产品。
第二类:中年收入不高型,年龄均值为38岁,收入约4.5万每年,这类人收入较低,可推荐保险。
第三类:富裕青年型,年龄均值为23岁,收入约8.3万每年,年轻人思想比较开放,具有冒险精神,因此可推荐高获利高风险的投资产品。

你可能感兴趣的:(R)