from __future__ import division, print_function
import numpy as np
from numpy.linalg import cholesky
import matplotlib.pyplot as plt
import skfuzzy as fuzz
import argparse
import sys
import random
# https://www.zhihu.com/question/39823283
def generateMultidimensionalNormalDistribution(sampleCounts, dimension):
    """Draw samples from a multivariate normal with a random covariance.

    Sampling follows the Cholesky approach described at
    https://www.zhihu.com/question/39823283: standard-normal draws are
    multiplied by a Cholesky factor of the covariance and shifted by the mean.

    NOTE: this module later defines another function with the same name
    (taking ``counts, dimension, size``). That later definition replaces this
    one when the module is loaded, so this version is only reachable if one
    of the two is renamed.

    Args:
        sampleCounts: number of samples (rows) to draw.
        dimension: number of features (columns) per sample.

    Returns:
        numpy array of shape ``(sampleCounts, dimension)``.
    """
    gap = 8.
    # Expected value: `dimension` means spaced 2 * gap apart and centred on 0.
    # Built from integer indices so the length is exactly `dimension`
    # (an arange with a float stop overshoots for large dimensions).
    mu = (2 * np.arange(dimension) - (dimension - 1)) * gap
    # Covariance matrix: start from a random square matrix and shift every
    # row by its own random offset.
    base = np.random.rand(dimension, dimension)
    for rowIndex in range(dimension):
        base[rowIndex] = base[rowIndex] + random.uniform(0, 4.84)
    # base . base^T is symmetric positive semi-definite; the small diagonal
    # term makes it strictly positive definite, which cholesky() requires.
    sigma = np.dot(base, base.T) + 1e-6 * np.eye(dimension)
    # cholesky() returns the lower-triangular factor R with sigma = R . R^T,
    # so z . R^T (not z . R) has covariance sigma.
    R = cholesky(sigma)
    return np.dot(np.random.randn(sampleCounts, dimension), R.T) + mu
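# Generate normally distributed data. counts: number of samples,
# dimension: dimensionality of each sample,
# size: a tuple with `dimension` entries (passed to np.random.normal as the mean)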
def generateMultidimensionalNormalDistribution(counts, dimension, size):
    """Generate `counts` unit-variance normal samples centred on `size`.

    Args:
        counts: number of samples (rows) to draw.
        dimension: unused; kept so existing callers keep working. The number
            of columns is taken from the length of `size`.
        size: per-feature mean of the distribution (a tuple with one entry
            per feature). Despite its name it is used as the mean, not as
            an output shape.

    Returns:
        numpy array of shape ``(counts, len(size))``. The result is always
        2-D, including for ``counts == 1`` (one row) and ``counts == 0``
        (no rows).
    """
    means = np.asarray(size, dtype=float).ravel()
    # One vectorised draw instead of growing the array row by row with
    # vstack, which copies the whole array on every iteration.
    return np.random.normal(loc=means, scale=1.0, size=(counts, means.size))
def generateMultidimensionalNormalDistribution1(categoryNumber, sampleNumberPerCategory, dimension, savedFile):
    """Generate clustered test data: one normal blob per category.

    Category ``i`` is centred at ``(i - (categoryNumber - 1) // 2) * 4.0`` in
    every feature, so neighbouring blobs are 4.0 apart along the diagonal.

    Args:
        categoryNumber: number of blobs to generate.
        sampleNumberPerCategory: number of samples drawn for each blob.
        dimension: number of features per sample.
        savedFile: unused; kept so existing callers keep working.

    Returns:
        numpy array with the samples of all categories stacked row-wise, in
        category order. When ``categoryNumber <= 0`` an empty array of shape
        ``(0, dimension)`` is returned.
    """
    gap = 4.
    di = (categoryNumber - 1) // 2
    blobs = []
    for i in range(categoryNumber):
        center = tuple([(i - di) * gap] * dimension)
        blobs.append(generateMultidimensionalNormalDistribution(sampleNumberPerCategory, dimension, center))
    if not blobs:
        # No categories requested: return "no rows" rather than failing on
        # an unassigned variable.
        return np.empty((0, dimension))
    # Stack once at the end instead of re-copying the result per category.
    return np.vstack(blobs)
def saveData(numpyArray, fileName):
    """Write `numpyArray` to `fileName` as text, 8 decimal places per value."""
    np.savetxt(fname=fileName, X=numpyArray, fmt='%0.8f')
def saveCfg(m, error, maxiter, cfg_file):
    """Placeholder for saving the training configuration; does nothing."""
    return None
def loadData(fileName):
    """Read a whitespace-separated numeric text file as a float32 array."""
    loaded = np.loadtxt(fname=fileName, dtype=np.float32)
    return loaded
def createData(sampleNumberPerCategory, categoryNumber, dimension, saveFile):
    """Generate clustered sample data and write it to `saveFile`.

    The samples are written one per row (not transposed).
    """
    saveData(
        generateMultidimensionalNormalDistribution1(
            categoryNumber, sampleNumberPerCategory, dimension, saveFile),
        saveFile,
    )
'''
def cmeans(data, c, m, error, maxiter, init=None, seed=None):
data : 2d array, size (S, N)
Data to be clustered. N is the number of data sets; S is the number
of features within each sample vector.
c : int
Desired number of clusters or classes.
m : float
Array exponentiation applied to the membership function u_old at each
iteration, where U_new = u_old ** m.
error : float
Stopping criterion; stop early if the norm of (u[p] - u[p-1]) < error.
maxiter : int
Maximum number of iterations allowed.
init : 2d array, size (c, N)
Initial fuzzy c-partitioned matrix. If none provided, algorithm is
randomly initialized.
seed : int
If provided, sets random seed of init. No effect if init is
provided. Mainly for debug/testing purposes.
-------------------------------------------------------------------------------------------
cntr, u_orig, u0, d, jm, p, fpc = cmeans(data, c, m, error, maxiter, init=None, seed=None)
Returns:
cntr : 2d array, size (c, S)
Cluster centers. Data for each center along each feature provided
for every cluster (of the `c` requested clusters).
u : 2d array, (c, N)
Final fuzzy c-partitioned matrix.
u0 : 2d array, (c, N)
Initial guess at fuzzy c-partitioned matrix (either provided init or
random guess used if init was not provided).
d : 2d array, (c, N)
Final Euclidean distance matrix.
jm : 1d array, length P
Objective function history.
p : int
Number of iterations run.
fpc : float
Final fuzzy partition coefficient.
'''
def train_cmeans(dataFile, modelFile, cfgFile, saveIntermediateResults,
                 categoryNumber, m, error, maxiter, init_, seed_,
                 u_file, u0_file, d_file, jm_file, p_file, fpc_file):
    """Run fuzzy c-means on the data in `dataFile` and save the result.

    Args:
        dataFile: text file with one sample per row.
        modelFile: output file for the cluster centers.
        cfgFile: output file for one row holding
            ``m, error, maxiter, <number of features>``.
        saveIntermediateResults: when non-zero, the remaining outputs of
            ``cmeans`` are also written to the ``*_file`` paths below.
        categoryNumber: number of clusters.
        m, error, maxiter: passed through to ``fuzz.cluster.cmeans``.
        init_: path of a file holding the initial fuzzy partition matrix, or
            None to let ``cmeans`` initialise it.
        seed_: random seed forwarded to ``cmeans``.
        u_file, u0_file, d_file, jm_file, p_file, fpc_file: output paths for
            the intermediate results.
    """
    # load train data
    trainingData = loadData(dataFile)
    # Pass the loaded matrix itself to cmeans; the path string in `init_`
    # is only the location of that matrix.
    initMatrix = None if init_ is None else loadData(init_)
    cntr, u_orig, u0, d, jm, p, fpc = fuzz.cluster.cmeans(
        trainingData.T, categoryNumber, m, error, maxiter,
        init=initMatrix, seed=seed_)
    # save model
    saveData(cntr, modelFile)
    # save cfg
    cfgArray = np.array([[m, error, maxiter, trainingData.shape[1]]], dtype=float)
    saveData(cfgArray, cfgFile)
    print("cntr: ", cntr)
    print()
    # The final u_orig is effectively a probability matrix; a softmax could
    # be applied on top of it to compute probabilities.
    print("u_orig: ", u_orig)
    print()
    print("u0: ", u0)
    print()
    print("d: ", d)
    print()
    print("jm: ", jm)
    print()
    print("p: ", p)
    print()
    print("fpc: ", fpc)
    if saveIntermediateResults != 0:
        saveData(u_orig, u_file)
        saveData(u0, u0_file)
        saveData(d, d_file)
        saveData(jm, jm_file)
        # p and fpc are scalars; wrap them as 1x1 arrays so they can be saved.
        saveData(np.array([[p]], dtype=float), p_file)
        saveData(np.array([[fpc]], dtype=float), fpc_file)
def valid_cmeans():
    """Placeholder for a validation phase; does nothing."""
    return None
def test_cmeas(dataFile, modelFile, cfgFile):
    """Assign test samples to trained clusters, save and plot the result.

    Args:
        dataFile: text file with one test sample per row.
        modelFile: text file with the trained cluster centers, one per row.
        cfgFile: text file with one row ``m, error, maxiter, <features>``.

    Side effects:
        Writes one file per cluster, named
        ``<maxiter>_<clusters>_<features>_class_<k>.data``, containing the
        samples assigned to cluster ``k`` (one per row, 8 decimal places).
        Shows a matplotlib figure: a 2-D scatter for 2 features, a 3-D
        scatter of the first three features for more than 2 features, and
        an empty figure for 1 feature.
    """
    testData = loadData(dataFile)
    cntr = loadData(modelFile)
    cfg = loadData(cfgFile)
    dataDimension = int(cfg[3])
    # np.loadtxt returns a 1-D array for a single-row or single-column file;
    # restore the (rows, features) layout the code below relies on.
    if testData.ndim == 1:
        testData = testData.reshape(-1, dataDimension)
    if cntr.ndim == 1:
        cntr = cntr.reshape(-1, dataDimension)
    categoryNumber = cntr.shape[0]
    print("testData.shape=", testData.shape)
    print("cntr.shape=", cntr.shape)
    print("cfg.shape=", cfg.shape)
    print("cntr:", cntr)
    print()
    print("m:", cfg[0])
    print("error:", cfg[1])
    print("maxiter:", cfg[2])
    print("dataDimension:", dataDimension)
    print()
    # Only the membership matrix u is needed; maxiter is an iteration count,
    # so it is passed as an int.
    u = fuzz.cluster.cmeans_predict(
        testData.T, cntr, float(cfg[0]), float(cfg[1]), int(cfg[2]))[0]
    # u has one row per cluster and one column per sample; each column holds
    # that sample's membership in every cluster. Taking the argmax down each
    # column gives the index of the best cluster per sample, e.g.
    # np.argmax([[1, 2, 3], [4, 1, 4], [2, 8, 9]], axis=0) == [1, 2, 2].
    cluster_membership = np.argmax(u, axis=0)  # Hardening for visualization
    print("len(resultList)=", categoryNumber)
    # Save the result: one file per cluster. Writing through saveData keeps
    # every sample on one line, whereas str(ndarray) wraps long rows.
    prefix = str(int(cfg[2])) + "_" + str(int(cntr.shape[0])) + "_" + str(int(cntr.shape[1])) + "_class_"
    for classIndex in range(categoryNumber):
        saveData(testData[cluster_membership == classIndex],
                 prefix + str(classIndex) + ".data")
    # Plot the predicted clusters.
    if dataDimension > 2:
        # Importing Axes3D registers the '3d' projection on older matplotlib.
        from mpl_toolkits.mplot3d import Axes3D  # noqa: F401
        fig3 = plt.figure()
        ax3 = fig3.add_subplot(111, projection='3d')
    else:
        fig3, ax3 = plt.subplots()
    ax3.set_title('points classifed according to known centers')
    for j in range(categoryNumber):
        members = testData[cluster_membership == j]
        if dataDimension == 2:
            ax3.plot(members[:, 0], members[:, 1], 'o',
                     label='class ' + str(j))
        elif dataDimension > 2:
            # Only the first three features are drawn.
            ax3.plot(members[:, 0], members[:, 1], members[:, 2], 'o',
                     label='class ' + str(j))
    if dataDimension > 1:
        ax3.legend()
    plt.show()
def cmeans():
    """Print the literal text ``cmeans.``; placeholder entry point."""
    message = "cmeans."
    print(message)
    return None
def _str2bool(value):
    """Parse a command-line boolean such as 1/0, true/false, yes/no."""
    text = str(value).strip().lower()
    if text in ('1', 'true', 't', 'yes', 'y', 'on'):
        return True
    if text in ('0', 'false', 'f', 'no', 'n', 'off'):
        return False
    raise argparse.ArgumentTypeError("expected a boolean value, got %r" % (value,))


def main(argv=None):
    """Command-line entry point: dispatch on ``--phase``.

    Phases:
        createData: generate clustered sample data into ``--saveDataFile``.
        train: run fuzzy c-means on ``--dataFile`` and write ``--modelFile``
            and ``--cfgFile``.
        valid: accepted but does nothing.
        test: classify ``--dataFile`` with a trained model and plot it.

    Args:
        argv: list of argument strings to parse; None (the default) parses
            the process command line.

    Exits with status 1 when the phase is unknown or a file argument needed
    by the chosen phase is missing.
    """
    parser = argparse.ArgumentParser(description='manual to this script')
    parser.add_argument('--phase', type=str, default=None)
    parser.add_argument('--dataFile', type=str, default=None)
    parser.add_argument('--modelFile', type=str, default=None)
    parser.add_argument('--cfgFile', type=str, default=None)
    parser.add_argument('--trainSampleNumberPerCategory', type=int, default=1000)
    parser.add_argument('--categoryNumber', type=int, default=3)
    parser.add_argument('--dataDimension', type=int, default=2)
    parser.add_argument('--m', type=float, default=2)
    parser.add_argument('--error', type=float, default=0.005)
    parser.add_argument('--maxiter', type=int, default=2000)
    parser.add_argument('--init', type=str, default=None)
    parser.add_argument('--seed', type=int, default=None)
    # type=bool would turn every non-empty string (including "0" and
    # "False") into True, so the value is parsed explicitly.
    parser.add_argument('--saveIntermediateResults', type=_str2bool, default=False)
    parser.add_argument('--saveDataFile', type=str, default='data.data')
    parser.add_argument('--u', type=str, default='u.data')
    parser.add_argument('--u0', type=str, default='u0.data')
    parser.add_argument('--d', type=str, default='d.data')
    parser.add_argument('--jm', type=str, default='jm.data')
    parser.add_argument('--p', type=str, default='p.data')
    parser.add_argument('--fpc', type=str, default='fpc.data')
    args = parser.parse_args(argv)

    if args.phase == 'createData':
        createData(args.trainSampleNumberPerCategory, args.categoryNumber, args.dataDimension, args.saveDataFile)
    elif args.phase == 'train':
        required = (
            (args.dataFile, "Please specify train data file."),
            (args.modelFile, "Please specify the model file to save the training model."),
            (args.cfgFile, "Please specify the cfg file to save the training cfg."),
        )
        for value, message in required:
            if value is None:
                print(message)
                sys.exit(1)
        if args.saveIntermediateResults:
            print("Inter mediate results will be saved in training phase.")
        else:
            print("Inter mediate results will not be saved in training phase.")
        train_cmeans(args.dataFile, args.modelFile, args.cfgFile, args.saveIntermediateResults,
                     args.categoryNumber, args.m, args.error, args.maxiter, args.init, args.seed,
                     args.u, args.u0, args.d, args.jm, args.p, args.fpc)
    elif args.phase == 'valid':
        pass
    elif args.phase == 'test':
        required = (
            (args.dataFile, "Please specify test data file."),
            (args.modelFile, "Please specify the model file for the testing phase."),
            (args.cfgFile, "Please specify the cfg file for the testing phase."),
        )
        for value, message in required:
            if value is None:
                print(message)
                sys.exit(1)
        test_cmeas(args.dataFile, args.modelFile, args.cfgFile)
    else:
        print("Unknown phase! phase should be createData or train or valid or test.")
        sys.exit(1)
if __name__ == "__main__":
    # Script entry point: run main() only when executed directly.
    # The commented commands below are example invocations, grouped by the
    # --phase value they exercise.
    #
    # create data
    # python _cmeans.py --phase=createData --saveDataFile=data_1000_3_2.data
    # python _cmeans.py --phase=createData --dataDimension=3 --saveDataFile=data_1000_3_3.data
    # python _cmeans.py --phase=createData --categoryNumber=4 --saveDataFile=data_1000_4_2.data
    # python _cmeans.py --phase=createData --categoryNumber=4 --dataDimension=3 --saveDataFile=data_1000_4_3.data
    # python _cmeans.py --phase=createData --categoryNumber=3 --dataDimension=3 --trainSampleNumberPerCategory=2000 --saveDataFile=data_2000_3_3.data
    # python _cmeans.py --phase=createData --categoryNumber=4 --dataDimension=5 --trainSampleNumberPerCategory=2000 --saveDataFile=data_2000_4_5.data
    # python _cmeans.py --phase=createData --categoryNumber=4 --dataDimension=2 --trainSampleNumberPerCategory=2000 --saveDataFile=data_2000_4_2.data
    # train
    # python _cmeans.py --phase=train --categoryNumber=3 --dataFile=data_2000_3_3.data --modelFile=2000_3_3.model --cfgFile=2000_3_3.cfg --seed=234 --saveIntermediateResults=1
    # python _cmeans.py --phase=train --categoryNumber=4 --dataFile=data_2000_4_5.data --modelFile=2000_4_5.model --cfgFile=2000_4_5.cfg
    # python _cmeans.py --phase=train --categoryNumber=4 --dataFile=data_1000_4_3.data --modelFile=1000_4_3.model --cfgFile=1000_4_3.cfg
    # python _cmeans.py --phase=train --categoryNumber=4 --dataFile=data_2000_4_5.data --modelFile=2000_4_5.model --cfgFile=2000_4_5.cfg --seed=234
    # python _cmeans.py --phase=train --categoryNumber=4 --dataFile=data_2000_4_5.data --modelFile=2000_4_5.model --cfgFile=2000_4_5.cfg --seed=234 --saveIntermediateResults=1
    # python _cmeans.py --phase=train --categoryNumber=4 --dataFile=data_2000_4_2.data --modelFile=2000_4_2.model --cfgFile=2000_4_2.cfg --seed=234 --saveIntermediateResults=1
    # test
    # python _cmeans.py --phase=test --dataFile=data_2000_3_3.data --modelFile=2000_3_3.model --cfgFile=2000_3_3.cfg
    # python _cmeans.py --phase=test --dataFile=data_2000_4_2.data --modelFile=2000_4_2.model --cfgFile=2000_4_2.cfg
    # python _cmeans.py --phase=test --dataFile=data_1000_4_3.data --modelFile=1000_4_3.model --cfgFile=1000_4_3.cfg
    # python _cmeans.py --phase=test --dataFile=data_2000_4_5.data --modelFile=2000_4_5.model --cfgFile=2000_4_5.cfg
    main()