# Fuzzy c-means

from __future__ import division, print_function
import numpy as np
from numpy.linalg import cholesky
import matplotlib.pyplot as plt
import skfuzzy as fuzz
import argparse
import sys
import random

# https://www.zhihu.com/question/39823283
def generateMultidimensionalNormalDistribution(sampleCounts, dimension):
    """Draw correlated Gaussian samples using a randomly generated covariance.

    NOTE: this file later defines another function with the same name that
    takes ``(counts, dimension, size)``. That later definition rebinds the
    name, so this two-argument version is only reachable if one of the two
    is renamed.

    Args:
        sampleCounts: number of sample rows to draw.
        dimension: number of features per sample.

    Returns:
        Array of shape ``(sampleCounts, dimension)``.
    """
    gap = 8.
    # Per-feature means: symmetric around zero and spaced 2 * gap apart,
    # e.g. [-8, 8] for two features and [-16, 0, 16] for three. Building the
    # vector from the index keeps its length equal to `dimension` for any size.
    mu = gap * (2 * np.arange(dimension) - (dimension - 1))
    print(mu)

    # Random square matrix with a random offset added to every row.
    base = np.random.rand(dimension, dimension)
    for rowIndex in range(dimension):
        base[rowIndex] += random.uniform(0, 4.84)

    # base . base^T is symmetric positive semi-definite; the small ridge makes
    # it strictly positive definite so the Cholesky factorisation cannot fail.
    sigma = np.dot(base, base.T) + 1e-6 * np.eye(dimension)
    print(sigma)

    # cholesky() returns the lower-triangular L with sigma = L . L^T, so
    # standard-normal rows multiplied by L^T have covariance sigma.
    R = cholesky(sigma)
    s = np.dot(np.random.randn(sampleCounts, dimension), R.T) + mu
    return s
    
def generateMultidimensionalNormalDistribution(counts, dimension, size):
    """Draw ``counts`` unit-variance Gaussian samples centred on ``size``.

    Despite its name, ``size`` is the per-feature mean vector (it is handed
    to ``np.random.normal`` as ``loc``), not an output shape.

    Args:
        counts: number of sample rows to draw; must not be negative.
        dimension: number of features per sample. Kept for interface
            compatibility; the feature count is taken from ``size``.
        size: sequence (or scalar) holding the mean of every feature.

    Returns:
        Array of shape ``(counts, number_of_means)``. The result is always
        two-dimensional, including for ``counts`` of 0 or 1.

    Raises:
        ValueError: if ``counts`` is negative.
    """
    if counts < 0:
        raise ValueError("counts must be non-negative, got %r" % (counts,))
    means = np.atleast_1d(np.asarray(size, dtype=float))
    # One vectorised draw instead of stacking one row at a time.
    return np.random.normal(loc=means, scale=1.0, size=(counts, means.size))
    
def generateMultidimensionalNormalDistribution1(categoryNumber, sampleNumberPerCategory, dimension, savedFile):
    """Generate several Gaussian clusters and stack them into one array.

    Cluster ``i`` is centred on the point whose every coordinate equals
    ``(i - (categoryNumber - 1) // 2) * 4``, so neighbouring clusters are four
    units apart along each axis.

    Args:
        categoryNumber: number of clusters to generate.
        sampleNumberPerCategory: number of samples drawn for each cluster.
        dimension: number of features per sample.
        savedFile: unused; kept so existing callers keep working.

    Returns:
        2-D array with the clusters stacked in order, one sample per row.
        When ``categoryNumber`` is zero or negative the result has shape
        ``(0, dimension)`` instead of failing on an unassigned variable.
    """
    gap = 4.
    middle = (categoryNumber - 1) // 2
    clusters = []
    for category in range(categoryNumber):
        # Same coordinate on every axis: the centres lie on the main diagonal.
        center = tuple([(category - middle) * gap] * dimension)
        clusters.append(
            generateMultidimensionalNormalDistribution(sampleNumberPerCategory, dimension, center))

    if not clusters:
        return np.empty((0, dimension))
    # Stack once at the end rather than re-copying the array for every cluster.
    return np.vstack(clusters)
    
    
def saveData(numpyArray, fileName):
    """Write ``numpyArray`` to ``fileName`` as text with eight decimals per value."""
    eight_decimals = '%0.8f'
    np.savetxt(fileName, numpyArray, fmt=eight_decimals)
    
def saveCfg(m, error, maxiter, cfg_file):
    """Placeholder for saving the training configuration; currently does nothing."""
    return None
    
    
def loadData(fileName):
    """Read the numeric text file ``fileName`` into a float32 NumPy array."""
    loaded = np.loadtxt(fileName, dtype=np.float32)
    return loaded
    
def createData(sampleNumberPerCategory, categoryNumber, dimension, saveFile):
    """Generate a synthetic clustered data set and write it to ``saveFile``.

    The samples come from ``generateMultidimensionalNormalDistribution1`` and
    are written with ``saveData``.
    """
    saveData(
        generateMultidimensionalNormalDistribution1(
            categoryNumber, sampleNumberPerCategory, dimension, saveFile),
        saveFile)

    # NOTE: the bare string below is a no-op expression statement. It holds
    # reference notes on the arguments and return values of the ``cmeans``
    # routine that ``train_cmeans`` calls.
    '''
    def cmeans(data, c, m, error, maxiter, init=None, seed=None):
    data : 2d array, size (S, N)
        Data to be clustered.  N is the number of data sets; S is the number
        of features within each sample vector.
        c : int
        Desired number of clusters or classes.
        
    m : float
        Array exponentiation applied to the membership function u_old at each
        iteration, where U_new = u_old ** m.
        
    error : float
        Stopping criterion; stop early if the norm of (u[p] - u[p-1]) < error.
        
    maxiter : int
        Maximum number of iterations allowed.
        
    init : 2d array, size (S, N)
        Initial fuzzy c-partitioned matrix. If none provided, algorithm is
        randomly initialized.
        
    seed : int
        If provided, sets random seed of init. No effect if init is
        provided. Mainly for debug/testing purposes.
    -------------------------------------------------------------------------------------------
    cntr, u_orig, u0, d, jm, p, fpc = cmeans(data, c, m, error, maxiter, init=None, seed=None)
    Returns:
    cntr : 2d array, size (S, c)
        Cluster centers.  Data for each center along each feature provided
        for every cluster (of the `c` requested clusters).
    u : 2d array, (S, N)
        Final fuzzy c-partitioned matrix.
    u0 : 2d array, (S, N)
        Initial guess at fuzzy c-partitioned matrix (either provided init or
        random guess used if init was not provided).
    d : 2d array, (S, N)
        Final Euclidian distance matrix.
    jm : 1d array, length P
        Objective function history.
    p : int
        Number of iterations run.
    fpc : float
        Final fuzzy partition coefficient.
    '''
def train_cmeans(dataFile, modelFile, cfgFile, saveIntermediateResults,
                 categoryNumber, m, error, maxiter, init_, seed_,
                 u_file, u0_file, d_file, jm_file, p_file, fpc_file):
    """Fit fuzzy c-means on the samples in ``dataFile`` and save the results.

    Args:
        dataFile: text file with the training samples, one sample per row.
        modelFile: destination for the learned cluster centres.
        cfgFile: destination for one row holding ``m``, ``error``,
            ``maxiter`` and the number of features of the training data.
        saveIntermediateResults: when truthy, the remaining outputs of the
            clustering call are written to the six ``*_file`` paths.
        categoryNumber: number of clusters to fit.
        m: fuzzifier exponent passed to the clustering routine.
        error: stopping tolerance passed to the clustering routine.
        maxiter: iteration limit passed to the clustering routine.
        init_: path of a text file holding the initial partition matrix, or
            ``None`` for the routine's own initialisation.
        seed_: random seed passed to the clustering routine, or ``None``.
        u_file, u0_file, d_file, jm_file, p_file, fpc_file: destinations for
            the final partition matrix, initial partition matrix, distance
            matrix, objective history, iteration count and partition
            coefficient respectively.
    """
    trainingData = loadData(dataFile)

    # ``init_`` is a file path; the clustering routine needs the matrix stored
    # in that file, not the path string itself.
    initialPartition = None if init_ is None else loadData(init_)

    # init/seed are passed by keyword so they cannot land in a different
    # parameter slot (some scikit-fuzzy releases accept a ``metric`` argument
    # between ``maxiter`` and ``init`` -- confirm against the installed version).
    cntr, u_orig, u0, d, jm, p, fpc = fuzz.cluster.cmeans(
        trainingData.T, categoryNumber, m, error, maxiter,
        init=initialPartition, seed=seed_)

    # Save the model (the cluster centres).
    saveData(cntr, modelFile)

    # Save the settings the test phase needs, including the feature count.
    cfgArray = np.array([[m, error, maxiter, trainingData.shape[1]]], dtype=float)
    saveData(cfgArray, cfgFile)

    # u_orig acts as a per-sample membership (probability-like) matrix.
    report = (
        ("cntr: ", cntr),
        ("u_orig: ", u_orig),
        ("u0: ", u0),
        ("d: ", d),
        ("jm: ", jm),
        ("p: ", p),
    )
    for label, value in report:
        print(label, value)
        print()
    print("fpc: ", fpc)

    if saveIntermediateResults:
        saveData(u_orig, u_file)
        saveData(u0, u0_file)
        saveData(d, d_file)
        saveData(jm, jm_file)
        # Scalars are wrapped in a 1x1 array so they can be written as text.
        saveData(np.array([[p]], dtype=float), p_file)
        saveData(np.array([[fpc]], dtype=float), fpc_file)
        
def valid_cmeans():
    """Placeholder for the validation phase; not implemented yet."""
    return None

def test_cmeas(dataFile, modelFile, cfgFile):
    """Assign test samples to previously trained cluster centres.

    Loads the samples, the centres and the saved settings (m, error, maxiter,
    feature count), runs ``fuzz.cluster.cmeans_predict``, writes the samples of
    every cluster to ``<maxiter>_<clusters>_<features>_class_<k>.data`` in the
    working directory, and plots the hardened assignment when the data has at
    least two features.

    Args:
        dataFile: text file with the samples to classify, one per row.
        modelFile: text file with the cluster centres, one centre per row.
        cfgFile: text file with one row: m, error, maxiter, feature count.
    """
    testData = loadData(dataFile)
    cntr = loadData(modelFile)
    cfg = loadData(cfgFile)
    dataDimension = int(cfg[3])

    # np.loadtxt squeezes single-row / single-column files to fewer than two
    # dimensions; restore the (rows, features) layout the code below indexes.
    if testData.ndim < 2:
        testData = testData.reshape(-1, dataDimension)
    if cntr.ndim < 2:
        cntr = cntr.reshape(-1, dataDimension)
    categoryNumber = cntr.shape[0]

    print("testData.shape=", testData.shape)
    print("cntr.shape=", cntr.shape)
    print("cfg.shape=", cfg.shape)
    print("cntr:", cntr)
    print()
    print("m:", cfg[0])
    print("error:", cfg[1])
    print("maxiter:", cfg[2])
    print("dataDimension:", dataDimension)
    print()

    # maxiter is an iteration count, so it is passed as an int.
    u, u0, d, jm, p, fpc = fuzz.cluster.cmeans_predict(
        testData.T, cntr, float(cfg[0]), float(cfg[1]), int(cfg[2]))

    # u has one row per cluster and one column per sample; every column sums
    # to 1, so it reads as per-sample membership probabilities. Taking the
    # arg-max down each column hardens it into one cluster label per sample,
    # e.g. [0 2 1 ... 2 2 2] for three clusters.
    cluster_membership = np.argmax(u, axis=0)

    print("len(resultList)=", categoryNumber)

    # Save the samples of each cluster to their own file. saveData is used so
    # the files share the format of the other data files and long rows are
    # never wrapped; an empty cluster still produces an (empty) file.
    prefix = str(int(cfg[2])) + "_" + str(int(categoryNumber)) + "_" + str(int(cntr.shape[1])) + "_class_"
    for classIndex in range(categoryNumber):
        saveData(testData[cluster_membership == classIndex],
                 prefix + str(classIndex) + ".data")

    # Nothing is drawn for one-dimensional data.
    if dataDimension <= 1:
        return

    fig3 = plt.figure()
    if dataDimension == 2:
        ax3 = fig3.add_subplot(111)
    else:
        # Importing Axes3D registers the '3d' projection on older matplotlib.
        from mpl_toolkits.mplot3d import Axes3D  # noqa: F401
        ax3 = fig3.add_subplot(111, projection='3d')
    ax3.set_title('points classifed according to known centers')

    # Plot every predicted cluster; with more than three features only the
    # first three are drawn.
    for j in range(categoryNumber):
        members = testData[cluster_membership == j]
        if dataDimension == 2:
            ax3.plot(members[:, 0], members[:, 1], 'o',
                     label='class ' + str(j))
        else:
            ax3.plot(members[:, 0], members[:, 1], members[:, 2], 'o',
                     label='class ' + str(j))

    ax3.legend()
    plt.show()
    
def cmeans():
    """Print a fixed marker line; the function has no other effect."""
    marker = "cmeans."
    print(marker)
    
    

def _parse_bool(text):
    """Convert a command-line word such as ``1``/``0`` or ``true``/``false`` to bool."""
    lowered = str(text).strip().lower()
    if lowered in ('1', 'true', 't', 'yes', 'y', 'on'):
        return True
    if lowered in ('0', 'false', 'f', 'no', 'n', 'off', ''):
        return False
    raise argparse.ArgumentTypeError("expected a boolean value, got %r" % (text,))


def _exit_if_missing(checks):
    """Print the message of the first ``None`` value in ``checks`` and exit with status 1."""
    for value, message in checks:
        if value is None:
            print(message)
            sys.exit(1)


def main(argv=None):
    """Parse the command line and run the requested phase.

    Args:
        argv: optional list of argument strings; when ``None`` the arguments
            are taken from ``sys.argv[1:]``.

    ``--phase`` selects ``createData``, ``train``, ``valid`` (currently a
    no-op) or ``test``. The ``train`` and ``test`` phases need ``--dataFile``,
    ``--modelFile`` and ``--cfgFile``. A missing file argument or an unknown
    phase prints a message and exits with status 1.
    """
    parser = argparse.ArgumentParser(description='manual to this script')
    parser.add_argument('--phase', type=str, default=None)
    parser.add_argument('--dataFile', type=str, default=None)
    parser.add_argument('--modelFile', type=str, default=None)
    parser.add_argument('--cfgFile', type=str, default=None)
    parser.add_argument('--trainSampleNumberPerCategory', type=int, default=1000)
    parser.add_argument('--categoryNumber', type=int, default=3)
    parser.add_argument('--dataDimension', type=int, default=2)
    parser.add_argument('--m', type=float, default=2)
    parser.add_argument('--error', type=float, default=0.005)
    parser.add_argument('--maxiter', type=int, default=2000)
    parser.add_argument('--init', type=str, default=None)
    parser.add_argument('--seed', type=int, default=None)

    # type=bool would turn every non-empty string (including "0" and "False")
    # into True, so the value is parsed explicitly. A bare flag means True.
    parser.add_argument('--saveIntermediateResults', type=_parse_bool,
                        nargs='?', const=True, default=False)
    parser.add_argument('--saveDataFile', type=str, default='data.data')
    parser.add_argument('--u', type=str, default='u.data')
    parser.add_argument('--u0', type=str, default='u0.data')
    parser.add_argument('--d', type=str, default='d.data')
    parser.add_argument('--jm', type=str, default='jm.data')
    parser.add_argument('--p', type=str, default='p.data')
    parser.add_argument('--fpc', type=str, default='fpc.data')

    args = parser.parse_args(argv)

    if args.phase == 'createData':
        createData(args.trainSampleNumberPerCategory, args.categoryNumber,
                   args.dataDimension, args.saveDataFile)
    elif args.phase == 'train':
        _exit_if_missing((
            (args.dataFile, "Please specify train data file."),
            (args.modelFile, "Please specify the model file to save the training model."),
            (args.cfgFile, "Please specify the cfg file to save the training cfg."),
        ))
        if args.saveIntermediateResults:
            print("Inter mediate results will be saved in training phase.")
        else:
            print("Inter mediate results will not be saved in training phase.")

        train_cmeans(args.dataFile, args.modelFile, args.cfgFile, args.saveIntermediateResults,
                     args.categoryNumber, args.m, args.error, args.maxiter, args.init, args.seed,
                     args.u, args.u0, args.d, args.jm, args.p, args.fpc)
    elif args.phase == 'valid':
        # The validation phase is not implemented.
        pass
    elif args.phase == 'test':
        _exit_if_missing((
            (args.dataFile, "Please specify test data file."),
            (args.modelFile, "Please specify the model file for the testing phase."),
            (args.cfgFile, "Please specify the cfg file for the testing phase."),
        ))
        test_cmeas(args.dataFile, args.modelFile, args.cfgFile)
    else:
        print("Unknown phase! phase should be  createData or train or valid or test.")
        sys.exit(1)
   

if __name__ == "__main__":
    # Example invocations (the script is saved as _cmeans.py in these examples).
    #
    # Create data:
    # python _cmeans.py --phase=createData --saveDataFile=data_1000_3_2.data
    # python _cmeans.py --phase=createData --dataDimension=3  --saveDataFile=data_1000_3_3.data
    # python _cmeans.py --phase=createData --categoryNumber=4 --saveDataFile=data_1000_4_2.data
    # python _cmeans.py --phase=createData --categoryNumber=4 --dataDimension=3  --saveDataFile=data_1000_4_3.data
    # python _cmeans.py --phase=createData --categoryNumber=3 --dataDimension=3 --trainSampleNumberPerCategory=2000 --saveDataFile=data_2000_3_3.data
    # python _cmeans.py --phase=createData --categoryNumber=4 --dataDimension=5 --trainSampleNumberPerCategory=2000 --saveDataFile=data_2000_4_5.data
    # python _cmeans.py --phase=createData --categoryNumber=4 --dataDimension=2 --trainSampleNumberPerCategory=2000 --saveDataFile=data_2000_4_2.data
    #
    # Train (writes the model file and the cfg file):
    # python _cmeans.py --phase=train --categoryNumber=3 --dataFile=data_2000_3_3.data --modelFile=2000_3_3.model --cfgFile=2000_3_3.cfg --seed=234 --saveIntermediateResults=1
    # python _cmeans.py --phase=train --categoryNumber=4 --dataFile=data_2000_4_5.data --modelFile=2000_4_5.model --cfgFile=2000_4_5.cfg
    # python _cmeans.py --phase=train --categoryNumber=4 --dataFile=data_1000_4_3.data --modelFile=1000_4_3.model --cfgFile=1000_4_3.cfg
    # python _cmeans.py --phase=train --categoryNumber=4 --dataFile=data_2000_4_5.data --modelFile=2000_4_5.model --cfgFile=2000_4_5.cfg --seed=234
    # python _cmeans.py --phase=train --categoryNumber=4 --dataFile=data_2000_4_5.data --modelFile=2000_4_5.model --cfgFile=2000_4_5.cfg --seed=234 --saveIntermediateResults=1
    # python _cmeans.py --phase=train --categoryNumber=4 --dataFile=data_2000_4_2.data --modelFile=2000_4_2.model --cfgFile=2000_4_2.cfg --seed=234 --saveIntermediateResults=1
    #
    # Test (reads the model file and the cfg file written by the train phase):
    # python _cmeans.py --phase=test --dataFile=data_2000_3_3.data --modelFile=2000_3_3.model --cfgFile=2000_3_3.cfg
    # python _cmeans.py --phase=test --dataFile=data_2000_4_2.data --modelFile=2000_4_2.model --cfgFile=2000_4_2.cfg
    # python _cmeans.py --phase=test --dataFile=data_1000_4_3.data --modelFile=1000_4_3.model --cfgFile=1000_4_3.cfg
    # python _cmeans.py --phase=test --dataFile=data_2000_4_5.data --modelFile=2000_4_5.model --cfgFile=2000_4_5.cfg
    main()

 

# You may also be interested in: (python)