读代码-RandomSeedGenerator

package org.apache.mahout.clustering.kmeans;
public final class RandomSeedGenerator
完成中心点随机取样的过程

HDFS 上的常见操作:先删除已有的输出路径,再新建输出文件
    // Obtain the file system that serves the output URI.
    FileSystem fs = FileSystem.get(output.toUri(), conf);
    // Clear the output path first so the seed file is created fresh
    // (presumably removes any previous contents; see HadoopUtil.delete).
    HadoopUtil.delete(conf, output);
    Path outFile = new Path(output, "part-randomSeed");
    // The result of createNewFile gates all further work: sampling below runs only when it is true.
    boolean newFile = fs.createNewFile(outFile);


遍历HDFS输入路径的代码框架
fs.globStatus(inputPathPattern, PathFilters.logsCRCFilter());
globStatus返回了匹配pattern的所有路径
logsCRCFilter过滤掉三类文件:以下划线(_)开头的日志文件、以点(.)开头的隐藏文件,以及.crc校验文件
循环时滤掉文件夹,只处理文件
    if (newFile) {
      Path inputPathPattern;

      if (fs.getFileStatus(input).isDir()) {
        inputPathPattern = new Path(input, "*");
      } else {
        inputPathPattern = input;
      }
      
      FileStatus[] inputFiles = fs.globStatus(inputPathPattern, PathFilters.logsCRCFilter());
      for (FileStatus fileStatus : inputFiles) {
        if (fileStatus.isDir()) {
          continue;
        }
        //process file
      }
      
    }



初始化writer
准备k个容量的list存储选出的值
      // Writer for the seed file at outFile, storing Text keys with Cluster values.
      SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, outFile, Text.class, Cluster.class);
      Random random = RandomUtils.getRandom();
      // Two lists, each pre-sized to k; entries are added and removed in lockstep so that
      // index i in one list corresponds to index i in the other.
      List<Text> chosenTexts = new ArrayList<Text>(k);
      List<Cluster> chosenClusters = new ArrayList<Cluster>(k);
      // Id handed to each new Cluster; advanced once per input record read.
      int nextClusterId = 0;



随机的核心--蓄水池算法
        // Reservoir sampling (Algorithm R): the first k records fill the reservoir; after that the
        // n-th record replaces a uniformly chosen slot with probability k/n, so every record ends
        // up in the final sample with equal probability.
        for (Pair<Writable,VectorWritable> record
             : new SequenceFileIterable<Writable,VectorWritable>(fileStatus.getPath(), true, conf)) {
          Writable key = record.getFirst();
          VectorWritable value = record.getSecond();
          Cluster newCluster = new Cluster(value.get(), nextClusterId++, measure);
          newCluster.observe(value.get(), 1);
          Text newText = new Text(key.toString());
          // nextClusterId starts at 0 and is incremented once per record, so at this point it
          // equals the number of records read so far, including the current one
          // (assumes nothing else advances nextClusterId -- confirm against the full method).
          int recordsSeen = nextClusterId;
          if (chosenTexts.size() < k) {
            chosenTexts.add(newText);
            chosenClusters.add(newCluster);
          } else {
            // slot is uniform over [0, recordsSeen); it lands in [0, k) with probability
            // k/recordsSeen, and when it does it is itself a uniform choice of slot to evict.
            // The previous test used 1/(k+1) regardless of how many records had been seen,
            // which biased the sample, and it threw on k == 0 via random.nextInt(0).
            int slot = random.nextInt(recordsSeen);
            if (slot < k) {
              chosenTexts.set(slot, newText);
              chosenClusters.set(slot, newCluster);
            }
          }
        }

你可能感兴趣的:(generator)