spark

1. 一个简单的气温 top-k 小例子。
package jspark;

import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;

import scala.Tuple2;

/**
* @author liyu
*
*/
public class HT {

public static void main(String[] args) {
// TODO Auto-generated method stub
if (args.length < 1) {
System.err.println("Usage: HT <file>");
System.exit(1);
}
for (String string : args) {
System.out.println("============="+string);
}
SparkConf sparkConf = new SparkConf().setAppName("ht1");
JavaSparkContext ctx = new JavaSparkContext(sparkConf);

JavaRDD<String> lines = ctx.textFile(args[0], 2);
JavaRDD<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
@Override
public Iterable<String> call(String s) {
return Arrays.asList(s);
}
});
System.out.println("----------------------------------------------");
JavaPairRDD<String, Integer> km = words.mapToPair(new PairFunction<String, String, Integer>() {
@Override
public Tuple2<String, Integer> call(String s) {
int  airTemperature = 0;
String year = s.substring(15,19);
if(s.charAt(87) == '+') {
airTemperature = Integer.parseInt(s.substring(88, 92));
} else {
airTemperature = Integer.parseInt(s.substring(87, 92));
}  
return new Tuple2<String, Integer>(year, airTemperature);
}
});

//km.saveAsTextFile(args[1]);
/*List<Tuple2<String, Integer>> output = km.collect();
for (Tuple2<?,?> tuple : output) {
System.out.println(tuple._1() + ": " + tuple._2());
}*/
System.out.println("reduce--------------------");
/*JavaRDD<Tuple2<Integer, String>>  jdd =  km.map(new Function<Tuple2<String,Integer>, Tuple2<Integer,String>>() {
public Tuple2<Integer,String> call(Tuple2<String, Integer> tuple) throws Exception {
// TODO Auto-generated method stub
return new Tuple2<Integer, String>(tuple._2, tuple._1);
}
});
jdd.sortBy(new Function<Tuple2<Integer,String>, Tuple2<Integer,String>>() {
@Override
public Tuple2<Integer,String> call(Tuple2<Integer, String> key) throws Exception {
// TODO Auto-generated method stub

return null;
}


} ,true, 3);*/

JavaPairRDD<String, Integer> jrd = km.groupByKey().mapValues(new Function<Iterable<Integer>, Integer>() {

@Override
public Integer call(Iterable<Integer> v) throws Exception {
// TODO Auto-generated method stub
Integer max = 0;
for(Integer x:v){
max = x>max?x:max;
}
return max;
}
});
jrd.sortByKey().saveAsTextFile(args[1]);
List<Tuple2<String, Integer>> output1 = jrd.sortByKey(true).collect();
for (Tuple2<?,?> tuple : output1) {
System.out.println(tuple._1() + ": " + tuple._2());
}
//采用reduceByKey
/*JavaPairRDD<String, Integer> counts = km.reduceByKey(new Function2<Integer, Integer, Integer>() {
@Override
public Integer call(Integer k1, Integer k2) throws Exception {
// TODO Auto-generated method stub
return k1+k2;
}

});
counts.saveAsTextFile(args[1]);
List<Tuple2<String, Integer>> output1 = counts.collect();
for (Tuple2<?,?> tuple : output1) {
System.out.println(tuple._1() + ": " + tuple._2());
}*/
ctx.stop();



}
}
下面是提交该气温处理作业的 spark-submit 命令。
./bin/spark-submit --master spark://192.168.1.26:7077 --class  jspark.HT  --name ht1  --executor-memory 400M --driver-memory 512M  --jars /opt/spark-1.3.0-bin-hadoop2.4/lib/spark-assembly-1.3.0-hadoop2.4.0.jar   /opt/a/jwc.jar  "/opt/a/190*" "/opt/spark-1.3.0-bin-hadoop2.4/test/dddd14"
此例子是处理 k=1 的最大值(最小值和最大值的处理方式一样,定义一个变量 min = x>min?min:x 即可)。
2. 若要求 top2 或者 topk,将返回值变成数组,里面的实现可以是各种排序。
JavaPairRDD<String, Integer[]> jrd = km.groupByKey().mapValues(new Function<Iterable<Integer>, Integer[]>() {
int arr[] = new int[2];
@Override
public Integer[] call(Iterable<Integer> v) throws Exception {
// TODO Auto-generated method stub
//冒泡排序
//快速排序
                                           //最小堆处理
return null;
}
});

你可能感兴趣的:(spark)