weka[10] - SimpleKmeans

classification的算法还有一些,不过还是打算先进入clustering的阶段。后续再回去补。

这一篇主要看看kmeans。kmeans是最简单的一种聚类算法,很清晰的EM思路。他的主要缺陷是聚类个数无法确定(靠人为设定),受初始中心点影响较大。

下面直接看看代码:国际惯例,看看buildCluster

    getCapabilities().testWithFail(data);

    m_Iterations = 0;

    m_ReplaceMissingFilter = new ReplaceMissingValues();
    Instances instances = new Instances(data);
				
    instances.setClassIndex(-1);
    if (!m_dontReplaceMissing) {
      m_ReplaceMissingFilter.setInputFormat(instances);
      instances = Filter.useFilter(instances, m_ReplaceMissingFilter);
    }

    m_FullMissingCounts = new int[instances.numAttributes()];
    if (m_displayStdDevs) {
      m_FullStdDevs = new double[instances.numAttributes()];
    }
    m_FullNominalCounts = new int[instances.numAttributes()][0];
m_FullMissingCounts就是统计每个属性的缺失值

    m_FullMeansOrMediansOrModes = moveCentroid(0, instances, false);
    for (int i = 0; i < instances.numAttributes(); i++) {
      m_FullMissingCounts[i] = instances.attributeStats(i).missingCount;
      if (instances.attribute(i).isNumeric()) {
        if (m_displayStdDevs) {
          m_FullStdDevs[i] = Math.sqrt(instances.variance(i));
        }
        if (m_FullMissingCounts[i] == instances.numInstances()) {
          m_FullMeansOrMediansOrModes[i] = Double.NaN; // mark missing as mean
        }
      } else {
        m_FullNominalCounts[i] = instances.attributeStats(i).nominalCounts;
        if (m_FullMissingCounts[i] 
            > m_FullNominalCounts[i][Utils.maxIndex(m_FullNominalCounts[i])]) {
          m_FullMeansOrMediansOrModes[i] = -1; // mark missing as most common value
        }
      }
    }
moveCentroid就是更新每个簇的中心点(后面详细看)。下面在做的事情,就是计算每个特征的mean或者mode(出现次数最多的那个),他考虑了如果是连续的,那么计算下标准差。如果是离散的并且missing最多,那么modes就是-1。

    m_ClusterCentroids = new Instances(instances, m_NumClusters);
    int[] clusterAssignments = new int [instances.numInstances()];

    if(m_PreserveOrder)
      m_Assignments = clusterAssignments;
		
    m_DistanceFunction.setInstances(instances);
    
    Random RandomO = new Random(getSeed());
    int instIndex;
    HashMap initC = new HashMap();
    DecisionTableHashKey hk = null;

    Instances initInstances = null;
    if(m_PreserveOrder)
      initInstances = new Instances(instances);
    else
      initInstances = instances;
clusterAssignments就是每个样本的簇号。下面的RandomO就是为了初始化中心点。
    for (int j = initInstances.numInstances() - 1; j >= 0; j--) {
      instIndex = RandomO.nextInt(j+1);
      hk = new DecisionTableHashKey(initInstances.instance(instIndex),
                                    initInstances.numAttributes(), true);
      if (!initC.containsKey(hk)) {
        m_ClusterCentroids.add(initInstances.instance(instIndex));
	initC.put(hk, null);
      }
      initInstances.swap(j, instIndex);
      
      if (m_ClusterCentroids.numInstances() == m_NumClusters) {
	break;
      }
    }
可以看到,这里每次初始化一个instIndex,然后把对应的样本作为一个中心点,直到个数 =m_NumClusters。

这里还用到了一个HashMap,来判断随机得到的中心点之前是否已经被选择。到目前为止,只是为迭代做一些准备工作。

    m_NumClusters = m_ClusterCentroids.numInstances();
    
    //removing reference
    initInstances = null;
		
    int i;
    boolean converged = false;
    int emptyClusterCount;
    Instances [] tempI = new Instances[m_NumClusters];
    m_squaredErrors = new double [m_NumClusters];
    m_ClusterNominalCounts = new int [m_NumClusters][instances.numAttributes()][0];
    m_ClusterMissingCounts = new int[m_NumClusters][instances.numAttributes()];
    while (!converged) {
      emptyClusterCount = 0;
      m_Iterations++;
      converged = true;
      for (i = 0; i < instances.numInstances(); i++) {
	Instance toCluster = instances.instance(i);
	int newC = clusterProcessedInstance(toCluster, true);
	if (newC != clusterAssignments[i]) {
	  converged = false;
	}
	clusterAssignments[i] = newC;
      }
      
      // update centroids
      m_ClusterCentroids = new Instances(instances, m_NumClusters);
      for (i = 0; i < m_NumClusters; i++) {
	tempI[i] = new Instances(instances, 0);
      }
      for (i = 0; i < instances.numInstances(); i++) {
	tempI[clusterAssignments[i]].add(instances.instance(i));
      }
      for (i = 0; i < m_NumClusters; i++) {
	if (tempI[i].numInstances() == 0) {
	  // empty cluster
	  emptyClusterCount++;
	} else {
          moveCentroid( i, tempI[i], true  );					
	}
      }
      
      if (m_Iterations == m_MaxIterations)
        converged = true;

      if (emptyClusterCount > 0) {
	m_NumClusters -= emptyClusterCount;
        if (converged) {
          Instances[] t = new Instances[m_NumClusters];
          int index = 0;
          for (int k = 0; k < tempI.length; k++) {
            if (tempI[k].numInstances() > 0) {
              t[index++] = tempI[k];
            }
          }
          tempI = t;
        } else {
          tempI = new Instances[m_NumClusters];
        }
      }			
			
      if (!converged) {
	m_squaredErrors = new double [m_NumClusters];
	m_ClusterNominalCounts = new int [m_NumClusters][instances.numAttributes()][0];
      }
    }
这个while就是开始循环啦!

clusterProcessedInstance就是给样本找簇号,把这个点分到最近中心点那个簇里面去。收敛的标准是:所有样本的clusterAssignments都没有改变。

tempI存放的是每个簇(cluster)的数据,用在后面moveCentroid更新中心点集。如果出现空簇,emptyClusterCount就加一,并在本轮循环中从m_NumClusters里减掉,从而更新m_NumClusters。

最后看看moveCentroid

/**
 * Recomputes the centroid of one cluster from its member instances.
 *
 * For Euclidean distance the centroid of a numeric attribute is the mean;
 * for Manhattan distance it is the median; for nominal attributes it is
 * always the mode (via meanOrMode).
 *
 * @param centroidIndex     index of the cluster being updated (used to fill
 *                          the per-cluster statistics arrays)
 * @param members           the instances currently assigned to this cluster
 * @param updateClusterInfo if true, also update m_ClusterMissingCounts /
 *                          m_ClusterNominalCounts and append the new centroid
 *                          to m_ClusterCentroids
 * @return the centroid's attribute values
 */
protected double[] moveCentroid(int centroidIndex, Instances members, boolean updateClusterInfo){
    // One slot per attribute; filled with mean / median / mode below.
    double [] vals = new double[members.numAttributes()];
		
    //used only for Manhattan Distance
    Instances sortedMembers = null;
    int middle = 0;      // 0-based index of the lower median element
    boolean dataIsEven = false;  // even member count => median is an average of two values
		
    if(m_DistanceFunction instanceof ManhattanDistance){
      middle = (members.numInstances()-1)/2;
      dataIsEven = ((members.numInstances()%2)==0);
      // NOTE(review): a copy is taken here, presumably because
      // kthSmallestValue reorders the instances as a side effect — when
      // m_PreserveOrder is set, members (a per-cluster temp set) is used
      // directly; TODO confirm against Instances.kthSmallestValue.
      if(m_PreserveOrder){
        sortedMembers = members;
      }else{
        sortedMembers = new Instances(members);
      }
    }
		
    for (int j = 0; j < members.numAttributes(); j++) {						
			
      //in case of Euclidian distance the centroid is the mean point
      //in case of Manhattan distance the centroid is the median point
      //in both cases, if the attribute is nominal, the centroid is the mode
      if(m_DistanceFunction instanceof EuclideanDistance ||
         members.attribute(j).isNominal())
        {													
          vals[j] = members.meanOrMode(j);
        }else if(m_DistanceFunction instanceof ManhattanDistance){
        //singleton special case
        if(members.numInstances() == 1){
          vals[j] = members.instance(0).value(j);
        }else{
          // kthSmallestValue is 1-based, hence middle + 1 for the lower median.
          vals[j] = sortedMembers.kthSmallestValue(j, middle + 1);
          if (dataIsEven) {
            // Even count: median is the average of the two middle values.
            vals[j] = (vals[j] + sortedMembers.kthSmallestValue(j, middle + 2)) / 2;
          }
        }
      }	
			
      if(updateClusterInfo){
        // Per-cluster, per-attribute statistics used for reporting.
        m_ClusterMissingCounts[centroidIndex][j] = members.attributeStats(j).missingCount;
        m_ClusterNominalCounts[centroidIndex][j] = members.attributeStats(j).nominalCounts;
        if (members.attribute(j).isNominal()) {
          // If "missing" occurs more often than the most frequent real value,
          // the centroid value itself is marked missing.
          if (m_ClusterMissingCounts[centroidIndex][j] >  
              m_ClusterNominalCounts[centroidIndex][j][Utils.maxIndex(m_ClusterNominalCounts[centroidIndex][j])]) 
            {
              vals[j] = Instance.missingValue(); // mark mode as missing
            }
        } else {
          // Numeric attribute with every value missing: no meaningful mean.
          if (m_ClusterMissingCounts[centroidIndex][j] == members.numInstances()) {
            vals[j] = Instance.missingValue(); // mark mean as missing
          }
        }
      }
    }
    if(updateClusterInfo)
      m_ClusterCentroids.add(new Instance(1.0, vals));
    return vals;
  }
	
这个很好懂,就是给定一个簇,如何更新它的中心点。具体如何更新,看这三行注释:

  //in case of Euclidian distance the centroid is the mean point
  //in case of Manhattan distance the centroid is the median point
  //in both cases, if the attribute is nominal, the centroid is the mode
SimpleKmeans就到这里了!!



你可能感兴趣的:(weka)