Mahout Recommendation Engine on Hadoop

Taste is Apache Mahout's efficient implementation of collaborative filtering: a scalable recommendation engine written in Java. Scalable here means the computation can run as Hadoop MapReduce jobs, which improves throughput on large datasets.

I recently started reading the source code, so these are my notes from working through it. RecommenderJob (org.apache.mahout.cf.taste.hadoop.item.RecommenderJob) is the main driver class for Mahout's Hadoop-based recommender; the analysis below walks through it.
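
For context, here is a minimal sketch (my own illustration, not part of the original post) of how the job can be launched programmatically; the HDFS paths and option values are placeholders:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.ToolRunner;
import org.apache.mahout.cf.taste.hadoop.item.RecommenderJob;

// Hypothetical driver: runs the distributed item-based recommender end to end.
// The input is a CSV of userID,itemID,preference triples on HDFS.
public class RunRecommenderJob {
  public static void main(String[] args) throws Exception {
    ToolRunner.run(new Configuration(), new RecommenderJob(), new String[] {
        "--input", "/user/hadoop/ratings.csv",       // placeholder path
        "--output", "/user/hadoop/recommendations",  // placeholder path
        "--similarityClassname", "SIMILARITY_COOCCURRENCE",
        "--numRecommendations", "10"
    });
  }
}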

The run() method is the entry point:

public final class RecommenderJob extends AbstractJob {

  public static final String BOOLEAN_DATA = "booleanData";

  private static final int DEFAULT_MAX_SIMILARITIES_PER_ITEM = 100;
  private static final int DEFAULT_MAX_PREFS_PER_USER = 1000;
  private static final int DEFAULT_MIN_PREFS_PER_USER = 1;

  @Override
  public int run(String[] args) throws Exception {
    //a big block of code originally here just parses the command-line options; it can be ignored

    //step 1: prepare the preference matrix - convert the raw input into a matrix, done by PreparePreferenceMatrixJob
    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
      ToolRunner.run(getConf(), new PreparePreferenceMatrixJob(), new String[]{
              "--input", getInputPath().toString(),
              "--output", prepPath.toString(),
              "--maxPrefsPerUser", String.valueOf(maxPrefsPerUserInItemSimilarity),
              "--minPrefsPerUser", String.valueOf(minPrefsPerUser),
              "--booleanData", String.valueOf(booleanData),
              "--tempDir", getTempPath().toString()});

      numberOfUsers = HadoopUtil.readInt(new Path(prepPath, PreparePreferenceMatrixJob.NUM_USERS), getConf());
    }
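
    /* At this point PreparePreferenceMatrixJob has written, under prepPath:
     * itemIDIndex (long item IDs mapped to int indices), userVectors (one
     * preference vector per user) and ratingMatrix (the transposed,
     * item-by-user matrix), plus the user count read above. */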

    //step 2: compute the co-occurrence (item-item similarity) matrix
    if (shouldRunNextPhase(parsedArgs, currentPhase)) {

      /* special behavior if phase 1 is skipped */
      if (numberOfUsers == -1) {
        numberOfUsers = (int) HadoopUtil.countRecords(new Path(prepPath, PreparePreferenceMatrixJob.USER_VECTORS),
                PathType.LIST, null, getConf());
      }

      /* Once DistributedRowMatrix uses the hadoop 0.20 API, we should refactor this call to something like
       * new DistributedRowMatrix(...).rowSimilarity(...) */
      //calculate the co-occurrence matrix
      ToolRunner.run(getConf(), new RowSimilarityJob(), new String[]{
              "--input", new Path(prepPath, PreparePreferenceMatrixJob.RATING_MATRIX).toString(),
              "--output", similarityMatrixPath.toString(),
              "--numberOfColumns", String.valueOf(numberOfUsers),
              "--similarityClassname", similarityClassname,
              "--maxSimilaritiesPerRow", String.valueOf(maxSimilaritiesPerItem),
              "--excludeSelfSimilarity", String.valueOf(Boolean.TRUE),
              "--threshold", String.valueOf(threshold),
              "--tempDir", getTempPath().toString()});
    }
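
    /* similarityMatrixPath now holds the item-item similarity matrix: one row
     * per item, keeping at most maxSimilaritiesPerItem neighbors whose
     * similarity passes the threshold. */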

    //start the multiplication of the co-occurrence matrix by the user vectors
    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
      Job prePartialMultiply1 = prepareJob(
              similarityMatrixPath, prePartialMultiplyPath1, SequenceFileInputFormat.class,
              SimilarityMatrixRowWrapperMapper.class, VarIntWritable.class, VectorOrPrefWritable.class,
              Reducer.class, VarIntWritable.class, VectorOrPrefWritable.class,
              SequenceFileOutputFormat.class);
      boolean succeeded = prePartialMultiply1.waitForCompletion(true);
      if (!succeeded) 
        return -1;
      //continue the multiplication
      Job prePartialMultiply2 = prepareJob(new Path(prepPath, PreparePreferenceMatrixJob.USER_VECTORS),
              prePartialMultiplyPath2, SequenceFileInputFormat.class, UserVectorSplitterMapper.class, VarIntWritable.class,
              VectorOrPrefWritable.class, Reducer.class, VarIntWritable.class, VectorOrPrefWritable.class,
              SequenceFileOutputFormat.class);
      if (usersFile != null) {
        prePartialMultiply2.getConfiguration().set(UserVectorSplitterMapper.USERS_FILE, usersFile);
      }
      prePartialMultiply2.getConfiguration().setInt(UserVectorSplitterMapper.MAX_PREFS_PER_USER_CONSIDERED,
              maxPrefsPerUser);
      succeeded = prePartialMultiply2.waitForCompletion(true);
      if (!succeeded) 
        return -1;
      //finish the job
      Job partialMultiply = prepareJob(
              new Path(prePartialMultiplyPath1 + "," + prePartialMultiplyPath2), partialMultiplyPath,
              SequenceFileInputFormat.class, Mapper.class, VarIntWritable.class, VectorOrPrefWritable.class,
              ToVectorAndPrefReducer.class, VarIntWritable.class, VectorAndPrefsWritable.class,
              SequenceFileOutputFormat.class);
      setS3SafeCombinedInputPath(partialMultiply, getTempPath(), prePartialMultiplyPath1, prePartialMultiplyPath2);
      succeeded = partialMultiply.waitForCompletion(true);
      if (!succeeded) 
        return -1;
    }
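
    /* partialMultiplyPath now pairs, for each item index, that item's column
     * of the similarity matrix with the IDs and preference values of the users
     * who rated the item (VectorAndPrefsWritable), ready for the final
     * aggregation phase. */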

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
      //filter out any users we don't care about
      /* convert the user/item pairs to filter if a filterfile has been specified */
      if (filterFile != null) {
        Job itemFiltering = prepareJob(new Path(filterFile), explicitFilterPath, TextInputFormat.class,
                ItemFilterMapper.class, VarLongWritable.class, VarLongWritable.class,
                ItemFilterAsVectorAndPrefsReducer.class, VarIntWritable.class, VectorAndPrefsWritable.class,
                SequenceFileOutputFormat.class);
        boolean succeeded = itemFiltering.waitForCompletion(true);
        if (!succeeded) 
          return -1;
      }

      String aggregateAndRecommendInput = partialMultiplyPath.toString();
      if (filterFile != null) {
        aggregateAndRecommendInput += "," + explicitFilterPath;
      }
      //extract out the recommendations
      Job aggregateAndRecommend = prepareJob(
              new Path(aggregateAndRecommendInput), outputPath, SequenceFileInputFormat.class,
              PartialMultiplyMapper.class, VarLongWritable.class, PrefAndSimilarityColumnWritable.class,
              AggregateAndRecommendReducer.class, VarLongWritable.class, RecommendedItemsWritable.class,
              TextOutputFormat.class);
      Configuration aggregateAndRecommendConf = aggregateAndRecommend.getConfiguration();
      if (itemsFile != null) {
        aggregateAndRecommendConf.set(AggregateAndRecommendReducer.ITEMS_FILE, itemsFile);
      }

      if (filterFile != null) {
        setS3SafeCombinedInputPath(aggregateAndRecommend, getTempPath(), partialMultiplyPath, explicitFilterPath);
      }
      setIOSort(aggregateAndRecommend);
      aggregateAndRecommendConf.set(AggregateAndRecommendReducer.ITEMID_INDEX_PATH,
              new Path(prepPath, PreparePreferenceMatrixJob.ITEMID_INDEX).toString());
      aggregateAndRecommendConf.setInt(AggregateAndRecommendReducer.NUM_RECOMMENDATIONS, numRecommendations);
      aggregateAndRecommendConf.setBoolean(BOOLEAN_DATA, booleanData);
      boolean succeeded = aggregateAndRecommend.waitForCompletion(true);
      if (!succeeded) 
        return -1;
    }

    return 0;
  }
}
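
Conceptually, the final phase estimates user u's preference for a candidate item i as a similarity-weighted average over the items j that u has rated: pref(u,i) = sum_j sim(i,j)*pref(u,j) / sum_j |sim(i,j)| (for boolean data the reducer simply sums the similarities). A standalone sketch of that formula, my own illustration rather than Mahout code:

// Hypothetical illustration of the weighted average that
// AggregateAndRecommendReducer applies for non-boolean data.
public final class WeightedAverage {
  static double estimatePreference(double[] similarities, double[] prefs) {
    double numerator = 0.0;
    double denominator = 0.0;
    for (int j = 0; j < prefs.length; j++) {
      numerator += similarities[j] * prefs[j];  // sim(i,j) * pref(u,j)
      denominator += Math.abs(similarities[j]); // normalize by total weight
    }
    return denominator == 0.0 ? Double.NaN : numerator / denominator;
  }
}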
