Implementing Spark's groupByKey and Related Operators in Java

The program below reads a CSV usage report, truncates the timestamp in column 9 to year-month, and then uses groupByKey three times over the same tab-joined key to compute a per-key record count and two per-key sums (columns 11 and 10), printing one of the results.

import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.sql.SparkSession;
import scala.Tuple2;

import java.util.List;

public class baobiao {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder().appName("dup").master("local[4]").getOrCreate();
        JavaRDD<String> input = spark.sparkContext()
                .textFile("/Users/yangyang/Desktop/textbook_exercise_use_report.csv", 1)
                .toJavaRDD()
                .map(new Function<String, String>() {
                    public String call(String s) throws Exception {
                        String[] tmp = s.split(",");
                        // Column 9 holds a timestamp like "yyyy-MM-dd HH:mm:ss";
                        // keep only the year and month, e.g. "201806".
                        String[] timestamp = tmp[9].split(" ")[0].split("-");
                        String time = "";
                        for (int i = 0; i < timestamp.length - 1; i++) {
                            time += timestamp[i];
                        }
                        tmp[9] = time;
                        // Rebuild the CSV line from the modified columns.
                        String res = "";
                        for (int i = 0; i < tmp.length; i++) {
                            res += tmp[i];
                            if (i < tmp.length - 1) {
                                res += ",";
                            }
                        }
                        return res;
                    }
                });
        // Count records per key (columns 1-4 plus column 6, joined by tabs).
        JavaPairRDD<String, Integer> all_people = input.mapToPair(new PairFunction<String, String, Integer>() {
            public Tuple2<String, Integer> call(String str) throws Exception {
                String[] tmp = str.split(",");
                String key = "";
                for (int i = 1; i <= 4; i++)
                    key += tmp[i] + "\t";
                key += tmp[6];
                return new Tuple2<String, Integer>(key, 1);
            }
        }).groupByKey().mapToPair(new PairFunction<Tuple2<String, Iterable<Integer>>, String, Integer>() {
            public Tuple2<String, Integer> call(Tuple2<String, Iterable<Integer>> pairs) throws Exception {
                String key = pairs._1();
                Iterable<Integer> iter = pairs._2();
                int sum = 0;
                for (Integer i : iter)
                    sum += i;
                return new Tuple2<String, Integer>(key, sum);
            }
        });
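        // Note (sketch, not in the original post): this groupByKey-then-sum
        // pattern can also be expressed as a single reduceByKey, which
        // pre-aggregates map-side; a full example follows after the class.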
        // Sum the values in column 11 per key.
        JavaPairRDD<String, Integer> all_times = input.mapToPair(new PairFunction<String, String, Integer>() {
            public Tuple2<String, Integer> call(String str) throws Exception {
                String[] tmp = str.split(",");
                String key = "";
                for (int i = 1; i <= 4; i++)
                    key += tmp[i] + "\t";
                key += tmp[6];
                return new Tuple2<String, Integer>(key, Integer.valueOf(tmp[11]));
            }
        }).groupByKey().mapToPair(new PairFunction<Tuple2<String, Iterable<Integer>>, String, Integer>() {
            public Tuple2<String, Integer> call(Tuple2<String, Iterable<Integer>> pairs) throws Exception {
                String key = pairs._1();
                Iterable<Integer> iter = pairs._2();
                int sum = 0;
                for (Integer i : iter)
                    sum += i;
                return new Tuple2<String, Integer>(key, sum);
            }
        });
        // Sum the values in column 10 per key.
        JavaPairRDD<String, Integer> all_learn_times = input.mapToPair(new PairFunction<String, String, Integer>() {
            public Tuple2<String, Integer> call(String str) throws Exception {
                String[] tmp = str.split(",");
                String key = "";
                for (int i = 1; i <= 4; i++)
                    key += tmp[i] + "\t";
                key += tmp[6];
                return new Tuple2<String, Integer>(key, Integer.valueOf(tmp[10]));
            }
        }).groupByKey().mapToPair(new PairFunction<Tuple2<String, Iterable<Integer>>, String, Integer>() {
            public Tuple2<String, Integer> call(Tuple2<String, Iterable<Integer>> pairs) throws Exception {
                String key = pairs._1();
                Iterable<Integer> iter = pairs._2();
                int sum = 0;
                for (Integer i : iter)
                    sum += i;
                return new Tuple2<String, Integer>(key, sum);
            }
        });
        List<Tuple2<String, Integer>> all_people_list = all_people.collect();
        List<Tuple2<String, Integer>> all_times_list = all_times.collect();
        List<Tuple2<String, Integer>> all_learn_times_list = all_learn_times.collect();

        // Only the learn-times totals are printed; the other two lists are
        // collected here but not otherwise used.
        for (Tuple2<String, Integer> people : all_learn_times_list)
            System.out.println(people._1() + "\t" + people._2());
        input.foreach(x -> System.out.println(x));
        spark.close();
    }
}
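
The three groupByKey stages above all follow the same pattern: emit a (key, value) pair, group, then sum the grouped values by hand. As a point of comparison, here is a minimal sketch (not part of the original program) of the all_times aggregation written with reduceByKey instead. reduceByKey merges values inside each partition before the shuffle, so only partial sums travel across the network, whereas groupByKey ships every individual value. The snippet reuses the input RDD and imports from the listing above and assumes Java 8+ for the lambdas.

// Sketch: all_times computed with reduceByKey (same key layout as above).
JavaPairRDD<String, Integer> allTimesReduced = input.mapToPair(
        (PairFunction<String, String, Integer>) str -> {
            String[] tmp = str.split(",");
            StringBuilder key = new StringBuilder();
            for (int i = 1; i <= 4; i++)
                key.append(tmp[i]).append("\t");
            key.append(tmp[6]);
            return new Tuple2<String, Integer>(key.toString(), Integer.valueOf(tmp[11]));
        })
        .reduceByKey((a, b) -> a + b); // equivalent to groupByKey + manual sum

Since all three metrics share the same key, they could also be computed in one pass by reducing a small array of partial sums per key, so the file is scanned and shuffled once instead of three times. Again a sketch under the same assumptions; the int[] layout {count, sum of column 11, sum of column 10} is an illustrative choice, not something from the original post.

// Sketch: one pass producing {record count, column-11 sum, column-10 sum} per key.
JavaPairRDD<String, int[]> combined = input.mapToPair(
        (PairFunction<String, String, int[]>) str -> {
            String[] tmp = str.split(",");
            StringBuilder key = new StringBuilder();
            for (int i = 1; i <= 4; i++)
                key.append(tmp[i]).append("\t");
            key.append(tmp[6]);
            return new Tuple2<String, int[]>(key.toString(),
                    new int[]{1, Integer.parseInt(tmp[11]), Integer.parseInt(tmp[10])});
        })
        .reduceByKey((a, b) -> new int[]{a[0] + b[0], a[1] + b[1], a[2] + b[2]});

Note that reduceByKey requires the merge function to be associative and commutative; plain addition satisfies both, which is what lets Spark apply it map-side before the shuffle.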

Note: when developing with IDEA on a Mac, Alt+/ can be used to view a method's return type (a personal keymap setting of mine, written down here so I don't forget).
