Spark FP-Growth算法实现

package test;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.mllib.fpm.AssociationRules;
import org.apache.spark.mllib.fpm.FPGrowth;
import org.apache.spark.mllib.fpm.FPGrowthModel;

import java.util.Arrays;
import java.util.List;

/**
 * @description: 测试spark-FpGrowth
 */
public class FpgDemo {
    public static void main(String[] args) {
        String pathOfData = "C:\\Users\\yuzisheng\\Desktop\\retail.txt";
        //最小支持度
        double minSup = 0.01;
        //最小置信度
        double minConf = 0.8;
        //生成的规则数
        int numPart = 10;

        //记录程序开始时间
        long startTime = System.currentTimeMillis();

        //本地模式
        SparkConf conf = new SparkConf().setAppName("FPDemo").setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);

        //读入数据集
        JavaRDD> transactions = sc.textFile(pathOfData).map((Function>) s -> {
            String[] parts = s.split(" ");
            return Arrays.asList(parts);
        });

        //建立算法实例
        FPGrowth fpGrowth = new FPGrowth().setMinSupport(minSup).setNumPartitions(numPart);
        FPGrowthModel model = fpGrowth.run(transactions);

        //查看所有频繁项集,并列出它出现的次数
        System.out.println("--all frequent item sets");
        for (FPGrowth.FreqItemset itemSet : model.freqItemsets().toJavaRDD().collect()) {
            System.out.println(itemSet.javaItems() + ", " + itemSet.freq());
        }

        //通过置信度筛选出强规则(满足一定置信度的规则),antecedent表示前项,consequent表示后项
        System.out.println("--all strong rules");
        for (AssociationRules.Rule rule : model.generateAssociationRules(minConf).toJavaRDD().collect()) {
            System.out.println(rule.javaAntecedent() + "=>" + rule.javaConsequent() + ", " + rule.confidence());
        }

        //记录程序结束时间
        long endTime = System.currentTimeMillis();
        System.out.println("Done by " + (endTime - startTime) + "ms");
    }
}

数据样例(retail.txt前十行):

0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 
30 31 32 
33 34 35 
36 37 38 39 40 41 42 43 44 45 46 
38 39 47 48 
38 39 48 49 50 51 52 53 54 55 56 57 58 
32 41 59 60 61 62 
3 39 48 
63 64 65 66 67 68 
32 69 

运行结果(节选,仅列出强关联规则及运行耗时):

[37]=>[38], 0.9739292364990689
[36]=>[38], 0.9502724795640327
[41, 38, 48]=>[39], 0.8386689132266217
[36, 48, 39]=>[38], 0.967741935483871
[225, 48]=>[39], 0.8064516129032258
[110]=>[38], 0.9753042233357194
[170, 39]=>[38], 0.9805730937348227
[170, 48, 39]=>[38], 0.9892205638474295
[110, 48]=>[38], 0.986231884057971
[286]=>[38], 0.9433643279797126
[110, 39]=>[38], 0.9891984081864695
[170]=>[38], 0.9780574378831881
[41, 48]=>[39], 0.8168108227988468
[110, 48, 39]=>[38], 0.9942140790742526
[36, 48]=>[38], 0.96045197740113
[36, 39]=>[38], 0.9548355424644085
[170, 48]=>[38], 0.9877970456005138
Done by 8097ms

你可能感兴趣的:(Spark)