spark MLlib BasicStatistics 统计学基础

一。jar 依赖与 JavaSparkContext(jsc)的创建

package ML.BasicStatistics;

import com.google.common.collect.Lists;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaDoubleRDD;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.DoubleFlatMapFunction;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;
import org.apache.spark.mllib.linalg.Matrices;
import org.apache.spark.mllib.linalg.Matrix;
import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.mllib.linalg.Vectors;
import org.apache.spark.mllib.regression.LabeledPoint;
import org.apache.spark.mllib.stat.KernelDensity;
import org.apache.spark.mllib.stat.MultivariateStatisticalSummary;
import org.apache.spark.mllib.stat.Statistics;
import org.apache.spark.mllib.stat.test.ChiSqTestResult;
import org.apache.spark.mllib.stat.test.KolmogorovSmirnovTestResult;
import org.apache.spark.mllib.util.MLUtils;
import org.apache.spark.rdd.RDD;
import scala.Tuple2;
import scala.runtime.Statics;
import static org.apache.spark.mllib.random.RandomRDDs.*;

import java.util.*;

/**
 * Demo driver that walks through Spark MLlib's basic-statistics APIs from a single
 * {@code main} method, using a Spark context created in local mode.
 *
 * @ClassName: BasicStatistics
 * @author: DingH
 * @since: 2019/4/3 16:11
 */
public class BasicStatistics {
    public static void main(String[] args) {
        // Hard-coded Windows Hadoop location for this machine.
        // NOTE(review): presumably needed so Hadoop can find its native helpers on Windows --
        // adjust or remove on other environments.
        System.setProperty("hadoop.home.dir","E:\\hadoop-2.6.5");
        // "local" master: the job runs inside this JVM rather than on a cluster.
        SparkConf conf = new SparkConf().setAppName("BasicStatistics").setMaster("local");
        // Shared context used by the statements that follow.
        JavaSparkContext jsc = new JavaSparkContext(conf);

 

二。Summary statistics

        /**
         * Summary statistics: Statistics.colStats returns a MultivariateStatisticalSummary
         * holding the column-wise max, min, mean, variance and number of non-zeros, plus the
         * total row count.
         */
        // Each dense vector is one observation (row); statistics are computed per column.
        JavaRDD<Vector> observations = jsc.parallelize(Arrays.asList(
                Vectors.dense(1.0, 0.0, 3.0),
                Vectors.dense(2.0, 0.0, 4.0),
                Vectors.dense(3.0, 0.0, 5.0)
        ));
        // colStats works on the underlying Scala RDD, hence the rdd() unwrap.
        MultivariateStatisticalSummary summary = Statistics.colStats(observations.rdd());
        System.out.println(summary.mean());        // mean of each column
        System.out.println(summary.variance());    // variance of each column
        System.out.println(summary.numNonzeros()); // number of non-zero entries in each column

 

三。Correlations:相关性

        /**
         * Correlations: a Pearson correlation between two double series, followed by a
         * Spearman correlation matrix over the columns of a vector RDD.
         */
        JavaRDD<Tuple2<String, String>> animalScores = jsc.parallelize(Lists.newArrayList(
                new Tuple2<String, String>("cat", "11"),
                new Tuple2<String, String>("dog", "22"),
                new Tuple2<String, String>("cat", "33"),
                new Tuple2<String, String>("pig", "44")
        ));

        // seriesX: the numeric value parsed from the second field of every tuple.
        // NOTE(review): returning Iterable<Double> matches the Spark 1.x DoubleFlatMapFunction
        // contract; Spark 2.x expects an Iterator<Double> instead -- confirm the target version.
        JavaDoubleRDD seriesX = animalScores.mapPartitionsToDouble(
                new DoubleFlatMapFunction<Iterator<Tuple2<String, String>>>() {
                    @Override
                    public Iterable<Double> call(Iterator<Tuple2<String, String>> partition) throws Exception {
                        List<Double> values = new ArrayList<Double>();
                        while (partition.hasNext()) {
                            values.add(Double.parseDouble(partition.next()._2));
                        }
                        return values;
                    }
                });

        // seriesY: the same values shifted by +1. Both series come from the same RDD, so they
        // have the same number of partitions and elements per partition.
        JavaDoubleRDD seriesY = animalScores.mapPartitionsToDouble(
                new DoubleFlatMapFunction<Iterator<Tuple2<String, String>>>() {
                    @Override
                    public Iterable<Double> call(Iterator<Tuple2<String, String>> partition) throws Exception {
                        List<Double> values = new ArrayList<Double>();
                        while (partition.hasNext()) {
                            values.add(Double.parseDouble(partition.next()._2) + 1);
                        }
                        return values;
                    }
                });

        // Compute the correlation using Pearson's method. Enter "spearman" for Spearman's method.
        // If a method is not specified, Pearson's method will be used by default.
        double correlation = Statistics.corr(seriesX.srdd(), seriesY.srdd(), "pearson");
        // The original computed this value but never showed it.
        System.out.println(correlation);

        // Note that each Vector is a row and not a column.
        JavaRDD<Vector> rowVectors = jsc.parallelize(Arrays.asList(
                Vectors.dense(1.0, 0.0, 3.0),
                Vectors.dense(2.0, 0.0, 4.0),
                Vectors.dense(3.0, 0.0, 5.0)
        ));
        // Pairwise Spearman correlation between the columns of the row vectors.
        Matrix correlation2 = Statistics.corr(rowVectors.rdd(), "spearman");
        System.out.println(correlation2);

 

三,Stratified sampling:分层抽样

        /**
         * Stratified sampling: sample a pair RDD key by key, with a separate sampling
         * fraction for every key (stratum).
         */
        JavaRDD<Tuple2<String, String>> keyedRecords = jsc.parallelize(Lists.newArrayList(
                new Tuple2<String, String>("cat", "11"),
                new Tuple2<String, String>("dog", "22"),
                new Tuple2<String, String>("cat", "33"),
                new Tuple2<String, String>("pig", "44")
        ));

        // Re-wrap the tuples as a JavaPairRDD so the by-key sampling operations are available.
        JavaPairRDD<String, String> keyedData = keyedRecords.mapToPair(
                new PairFunction<Tuple2<String, String>, String, String>() {
                    @Override
                    public Tuple2<String, String> call(Tuple2<String, String> pair) throws Exception {
                        return new Tuple2<String, String>(pair._1, pair._2);
                    }
                });

        // Fraction to draw from each key.
        // NOTE(review): typed as Map<String, Object> to match the Spark 1.x Java signature of
        // sampleByKey; on Spark 2.x declare it as Map<String, Double> -- confirm the version.
        Map<String, Object> fractions = new HashMap<String, Object>();
        fractions.put("cat", 0.5); // sampling probability for records with this key
        fractions.put("dog", 0.8);
        fractions.put("pig", 0.8);

        // Approximate sample: one pass, per-key sample sizes are only approximately honoured.
        JavaPairRDD<String, String> approxSample = keyedData.sampleByKey(false, fractions);
        // Exact sample from each stratum. It is only defined here; nothing below consumes it.
        JavaPairRDD<String, String> exactSample = keyedData.sampleByKeyExact(false, fractions);

        approxSample.foreach(new VoidFunction<Tuple2<String, String>>() {
            @Override
            public void call(Tuple2<String, String> sampled) throws Exception {
                System.out.println(sampled);
            }
        });

 

四。Hypothesis testing  假设检验

        /**
         * Hypothesis testing: chi-squared goodness-of-fit and independence tests, followed by
         * a one-sample Kolmogorov-Smirnov test.
         */

        Vector vec = Vectors.dense(1.0, 2.0, 3.0, 4.0); // a vector composed of the frequencies of events

        // Compute the goodness of fit. If a second vector to test against is not supplied as a
        // parameter, the test runs against a uniform distribution.
        ChiSqTestResult goodnessOfFitTestResult = Statistics.chiSqTest(vec);
        // Summary of the test including the p-value, degrees of freedom, test statistic, the
        // method used, and the null hypothesis.
        System.out.println(goodnessOfFitTestResult);

        // A 3x2 contingency matrix.
        Matrix mat = Matrices.dense(3, 2, new double[]{1, 2, 3, 4, 5, 6});

        // Conduct Pearson's independence test on the input contingency matrix.
        ChiSqTestResult independenceTestResult = Statistics.chiSqTest(mat);
        // Summary of the test including the p-value, degrees of freedom...
        System.out.println(independenceTestResult);

        // An RDD of labeled points; "/data..." is a placeholder path to a LIBSVM file.
        JavaRDD<LabeledPoint> obs = MLUtils.loadLibSVMFile(jsc.sc(), "/data...").toJavaRDD();

        // The contingency table is constructed from the raw (feature, label) pairs and used to
        // conduct the independence test. Returns an array containing the ChiSqTestResult for
        // every feature against the label.
        ChiSqTestResult[] featureTestResults = Statistics.chiSqTest(obs.rdd());
        int column = 1; // 1-based feature index, used only for the printed heading
        for (ChiSqTestResult result : featureTestResults) {
            System.out.println("Column " + column + ":");
            System.out.println(result); // summary of the test
            column++;
        }

        JavaDoubleRDD ksSample = jsc.parallelizeDoubles(Arrays.asList(0.2, 1.0, 0.3));
        // Test the sample against a normal distribution; mean 0 and standard deviation 1 are
        // passed explicitly instead of relying on the library's fallback for "norm".
        KolmogorovSmirnovTestResult testResult =
                Statistics.kolmogorovSmirnovTest(ksSample, "norm", 0.0, 1.0);
        // Summary of the test including the p-value, test statistic, and null hypothesis.
        // If the p-value indicates significance, the null hypothesis can be rejected.
        System.out.println(testResult);

 

五。Random data generation  

         /**
          * Random data generation: uniform, standard normal, or Poisson.
          */

        // 100 values drawn from the standard normal distribution, spread over 2 partitions.
        JavaDoubleRDD standardNormal = normalJavaRDD(jsc, 100, 2);
        // Apply a transform to get a random double RDD following `N(1, 4)`:
        // shifting by 1 sets the mean, scaling by 2 gives variance 2^2 = 4.
        JavaRDD<Double> shifted = standardNormal.map(new Function<Double, Double>() {
            @Override
            public Double call(Double sample) throws Exception {
                return 1.0 + 2.0 * sample;
            }
        });
        shifted.foreach(new VoidFunction<Double>() {
            @Override
            public void call(Double value) throws Exception {
                System.out.println(value);
            }
        });

 

六。Kernel density estimation

        /**
         * Kernel density estimation: estimate the probability density at given points from
         * a sample, using Gaussian kernels.
         */
        JavaRDD<Double> kdeSample = jsc.parallelize(Arrays.asList(1.0, 2.0, 3.0)); // an RDD of sample data

        // Construct the density estimator with the sample data and a standard deviation for
        // the Gaussian kernels.
        KernelDensity kd = new KernelDensity()
          .setSample(kdeSample)
          .setBandwidth(3.0);

        // Find density estimates for the given values and print one estimate per line.
        double[] densities = kd.estimate(new double[] {-1.0, 2.0, 5.0});
        for (double density : densities) {
            System.out.println(density);
        }

 

转载于:https://www.cnblogs.com/dhName/p/10655450.html

你可能感兴趣的:(spark MLlib BasicStatistics 统计学基础)