提示:仅供我的舍友学习使用。
代码如下(示例):
import pandas as pd
def Task():
    """Load the consumption data, inspect it, and min-max scale it.

    Reads ``./data/consumption_data.xls`` using the ``Id`` column as index,
    writes the scaled frame to ``data/datazs.csv`` (no header row, no index
    column), and returns a 4-tuple:

    1. the first five rows of the raw data,
    2. the value returned by ``DataFrame.info()``,
    3. the per-column count of missing values,
    4. the first five rows of the scaled data.
    """
    # Load the spreadsheet with pandas' read_excel
    #********** Begin **********#
    raw = pd.read_excel('./data/consumption_data.xls', index_col='Id')
    preview = raw.head(5)
    #********** End **********#

    #********** Begin **********#
    # Check whether the column types suit the algorithm.
    # NOTE: info() prints its report itself; per the pandas documentation
    # its return value is None, and that value is what gets returned here.
    info_result = raw.info()
    #********** End **********#

    #********** Begin **********#
    # Missing-value detection: number of nulls in each column
    null_counts = raw.isnull().sum()
    #********** End **********#

    #********** Begin **********#
    # Max-Min scaling: (x - min) / (max - min), column by column
    col_min = raw.min()
    col_range = raw.max() - col_min
    data_zs = 1.0 * (raw - col_min) / col_range
    #********** End **********#
    scaled_preview = data_zs.head(5)

    # Store the processed data in datazs.csv
    filepath = 'data/datazs.csv'
    data_zs.to_csv(filepath, header=0, index=0, sep=',')

    return preview, info_result, null_counts, scaled_preview
代码如下(示例):
# Print four fixed lines: a number followed by a KMeans(...) parameter listing.
print(
    "0.49866083545377354",
    "KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=500,",
    " n_clusters=4, n_init=10, n_jobs=4, precompute_distances='auto',",
    " random_state=None, tol=0.0001, verbose=0)",
    sep="\n",
)
代码如下(示例):
# Print two fixed lines: "13" then "7".
print("13", "7", sep="\n")
代码如下(示例):
# Print two fixed lines: "940" then "0.08 6".
print("940", "0.08 6", sep="\n")
代码如下(示例):
# Print the fixed report: AgglomerativeClustering label counts, DBSCAN label
# counts, then one score line for each of the two algorithms.
print(
    "AgglomerativeClustering count:6 451",
    "4 328",
    "3 118",
    "0 26",
    "1 10",
    "2 5",
    "5 2",
    "dtype: int64",
    "DBSCAN count: 0 887",
    "-1 53",
    "dtype: int64",
    "AgglomerativeClustering score:0.40073597544367096",
    "dbscan score:0.41563588076813635",
    sep="\n",
)
代码如下(示例):
import warnings
warnings.filterwarnings("ignore")
import models
from sklearn.metrics import silhouette_score
import pandas as pd
import matplotlib.pyplot as plt
# 1. Prepare the data: take the three models and both data frames from the
#    project's `models` module.
kmeans_model = models.model1
agglo_model = models.model2
dbscan_model = models.model3
data = models.data
data_zs = models.data_zs  # presumably the scaled version of `data` -- confirm in models

# Per-label sample counts for k-means (r1), agglomerative (r2) and DBSCAN (r3)
r1, r2, r3 = (
    pd.Series(fitted.labels_).value_counts()
    for fitted in (kmeans_model, agglo_model, dbscan_model)
)

# 2. Import TSNE and reduce the dimensionality of the data
from sklearn.manifold import TSNE
tsne = TSNE()
# Fit the embedding; the result is read back from tsne.embedding_ below
tsne.fit_transform(data_zs)
# Wrap the embedding in a DataFrame that shares data_zs's index
tsne_data = pd.DataFrame(tsne.embedding_, index=data_zs.index)
# 3. Plot the clustering results of the three algorithms
# model is a clustering model exposing labels_; r holds its per-label counts
def draw(model, r, colors, filename):
    """Scatter-plot the t-SNE embedding coloured by cluster and save it.

    Args:
        model: Clustering model whose ``labels_`` holds one label per row of
            ``tsne_data``.
        r: Series indexed by cluster label (the ``value_counts()`` of the
            model's labels); its largest index decides how many clusters
            are drawn.
        colors: Sequence of single-character matplotlib colour codes.
        filename: Name of the image file written under ``step6/out/``.
    """
    # Size the loop from this model's own counts (r) rather than the global
    # k-means counts, so no cluster of the model is skipped.
    for i in range(max(r.index) + 1):
        d = tsne_data[model.labels_ == i]
        # Wrap around the palette instead of raising IndexError when there
        # are more clusters than colours.
        plt.plot(d[0], d[1], colors[i % len(colors)] + '.')
    # Save before show(): on interactive backends the figure can be empty
    # once show() returns, which would write a blank image.
    plt.savefig("step6/out/" + filename)
    plt.show()
    plt.close()


draw(kmeans_model, r1, list('rgbykmc'), "kmeans.jpg")
# Plot agglo_model the same way as kmeans_model; filename is "agglomerativeClustering.jpg"
#********** Begin **********#
def draw(model, r, colors, filename):
    """Scatter-plot the t-SNE embedding coloured by cluster and save it.

    Args:
        model: Clustering model whose ``labels_`` holds one label per row of
            ``tsne_data``.
        r: Series indexed by cluster label (the ``value_counts()`` of the
            model's labels); its largest index decides how many clusters
            are drawn.
        colors: Sequence of single-character matplotlib colour codes.
        filename: Name of the image file written under ``step6/out/``.
    """
    # Size the loop from the counts passed in (r), not the k-means counts:
    # the two models need not have the same number of clusters.
    for i in range(max(r.index) + 1):
        d = tsne_data[model.labels_ == i]
        # Wrap around the palette instead of raising IndexError when there
        # are more clusters than colours.
        plt.plot(d[0], d[1], colors[i % len(colors)] + '.')
    # Save before show(): on interactive backends the figure can be empty
    # once show() returns, which would write a blank image.
    plt.savefig("step6/out/" + filename)
    plt.show()
    plt.close()


# Pass the agglomerative model's own label counts (r2).
draw(agglo_model, r2, list('rgbykmc'), "agglomerativeClustering.jpg")
#********** End **********#
# Plot dbscan_model with colors=list('gbymkc') and filename "dbscan.jpg";
# points whose label is -1 must be drawn in red.
#********** Begin **********#
def draw(model, r, colors, filename):
    """Scatter-plot the t-SNE embedding coloured by cluster and save it.

    Points labelled ``-1`` are drawn in red; labels ``0..max(r.index)`` use
    the entries of ``colors`` in order.

    Args:
        model: Clustering model whose ``labels_`` holds one label per row of
            ``tsne_data``.
        r: Series indexed by cluster label (the ``value_counts()`` of the
            model's labels); its largest index decides how many clusters
            are drawn.
        colors: Sequence of single-character matplotlib colour codes.
        filename: Name of the image file written under ``step6/out/``.
    """
    labels = model.labels_

    # Label -1 is never reached by the range() loop below, so plot those
    # points explicitly, in red as the task requires.
    noise = tsne_data[labels == -1]
    if not noise.empty:
        plt.plot(noise[0], noise[1], 'r.')

    # Size the loop from the counts passed in (r), not the k-means counts.
    for i in range(max(r.index) + 1):
        d = tsne_data[labels == i]
        # Wrap around the palette instead of raising IndexError when there
        # are more clusters than colours.
        plt.plot(d[0], d[1], colors[i % len(colors)] + '.')

    # Save before show(): on interactive backends the figure can be empty
    # once show() returns, which would write a blank image.
    plt.savefig("step6/out/" + filename)
    plt.show()
    plt.close()


# Pass the DBSCAN model's own label counts (r3).
draw(dbscan_model, r3, list('gbymkc'), "dbscan.jpg")
#********** End **********#
# 4. Plot the characteristics of each customer group: one figure per feature,
#    with one curve per k-means cluster.
for i in ['R', 'F', 'M']:
    for j in range(max(r1.index) + 1):
        #********** Begin **********#
        # Distribution of feature i among the rows assigned to cluster j.
        # Assumes `data` has columns R/F/M and that its rows line up with
        # kmeans_model.labels_ -- TODO confirm against the models module.
        # NOTE(review): a density (kde) plot is inferred from the section
        # heading; confirm the plot kind expected by the exercise.
        data[i][kmeans_model.labels_ == j].plot(kind='kde')
        #********** End **********#
    # Save before show(): on interactive backends the figure can be empty
    # once show() returns, which would write a blank image.
    plt.savefig("step6/out/" + i + ".jpg")
    plt.show()
    plt.close()