File->New->Project->Maven->Maven Project->quick start
https://mvnrepository.com/ 找依赖库
<dependency>
  <groupId>org.apache.spark</groupId>
  <artifactId>spark-core_2.10</artifactId>
  <version>1.2.0</version>
</dependency>
<dependency>
  <groupId>org.apache.hadoop</groupId>
  <artifactId>hadoop-client</artifactId>
  <version>2.6.0</version>
</dependency>
<dependency>
  <groupId>org.apache.spark</groupId>
  <artifactId>spark-mllib_2.10</artifactId>
  <version>1.0.0</version>
  <scope>provided</scope>
</dependency>
<dependency>
  <groupId>org.apache.spark</groupId>
  <artifactId>spark-sql_2.10</artifactId>
  <version>1.5.2</version>
</dependency>
3.1利用sqoop数据的导入到Hive HDFS
sqoop import
--connect jdbc:mysql://49.123.21.100:3306/bbk
--username root
--password root
--table C
--fields-terminated-by '\t' -m 1
3.2 编写spark
SparkConf sparkConf = new SparkConf().setAppName("TestKMeans");
JavaSparkContext sc = new JavaSparkContext(sparkConf);
String filePath = "hdfs:///user/kpnm/C";
JavaRDD<String> lines = sc.textFile(filePath);
3.2 函数式编程
类 实现方法 用途
Function
JavaRDD
class Containsdt implements Function<String, Boolean> {
    public Boolean call(String x) {
        return x.contains("东塘店");
    }
}
SparkConf sparkConf = new SparkConf().setAppName("sql").setMaster("local")
JavaSparkContext ctx = new JavaSparkContext(sparkConf);
JavaRDD
RDD=
1 2
2 3
3
4
5
6 7
idd.map(fun);
Function<Integer, Integer> fun = new Function<Integer, Integer>() {
    public Integer call(Integer x) {
        return x + 1;
    }
};
类 实现方法 用途
Function
List
JavaRDD
Integer aggregateRDD = javaRDD.aggregate(2, new Function2<Integer, Integer, Integer>() {
@Override
public Integer call(Integer v1, Integer v2) throws Exception {
return v1 + v2;
}
}, new Function2<Integer, Integer, Integer>() {
@Override
public Integer call(Integer v1, Integer v2) throws Exception {
return v1 + v2;
}
});
System.out.println("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + aggregateRDD);
//结果输出 27
List
JavaRDD
System.out.println(javaRDD.fold(1, new Function2<Integer, Integer, Integer>() {
public Integer call(Integer x,Integer y){
return x+y;
}
}));
//输出结果 23
类 实现方法 用途 FlatMapFunction
JavaRDD
sjdd.foreach(new VoidFunction<String>() {
public void call(String x){
System.out.println(x);
}
});
sjdd = sjdd.flatMap(new FlatMapFunction<String, String>() {
    public Iterable<String> call(String x) {
return Arrays.asList(x.split(" "));
}
});
System.out.println("处理之后的数据");
sjdd.foreach(new VoidFunction<String>() {
public void call(String x){
System.out.println(x);
}
});
之前结果:
"hello world how are you"
"i am fine"
"thanks for you"
之后结果
hello
World
How
Are
you
i
am
fine
thanks
For
You
PariRDD 或者DoubleRDD 相关的函数和类同样需要 一些的相关接口类
Maven 打包
Maven install 将所有的依赖包充入jar中
sudo spark-submit \
--class com.mycompany.app.yang.App \
--executor-memory 5G \
--total-executor-cores 5 \
--driver-class-path /home/kpnm/mysql-connector-java-5.1.41-bin.jar \
/home/kpnm/yxs/yang-0.0.1-SNAPSHOT.jar
其中 --class 与应用 jar 包路径为必须参数，其余为可选调优参数。
趋势分析:
package com.mycompany.app.yang;
import scala.Tuple2;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.regex.Pattern;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.DataFrameReader;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.hive.HiveContext;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
public class App
{
private static final Pattern SPACE = Pattern.compile(" ");
public static void main(String[] args) throws Exception {
SparkConf sparkConf = new SparkConf().setAppName("sql").setMaster("local").set("spark.kryoserializer.buffer.max","128");
JavaSparkContext ctx = new JavaSparkContext(sparkConf);
SQLContext sqlContext = new SQLContext(ctx);
HiveContext hiveCtx=new HiveContext(ctx);
DataFrame rdd;
rdd=hiveCtx.sql("select SHOP,CATE,ACCTURE,PERIOD,ACCOUNT,collect_set(VALUE) from C group by SHOP,CATE,ACCTURE,ACCOUNT,PERIOD,ACCOUNT");
JavaRDD
jdd =jdd.map(new linearegression()).collect());
jdd.saveAsTextFile("/Home/kpnm/yxs/result.txt");
ctx.stop();
}
};
class linearegression implements Function
public List
String str=s.get(5).toString();
String datas=str.substring(13, str.length()-1);
String[] data=datas.split(",");
List
double k;
for(int i=0;i<data.length-2;i++){
k=(Double.valueOf(data[i+2]) - Double.valueOf(data[i]))/2;
list.add(Double.valueOf(k));
}
return list;
}
}