Spark使用

                            Spark使用

  1. Maven、Eclipse 的安装 http://blog.csdn.net/qjyong/article/details/9098213
  2. eclipse 建立maven工程

File->New->Project->Maven->Maven Project->quick start

  1. 添加依赖

   https://mvnrepository.com/   找依赖库  

 

  <dependency>

      <groupId>org.apache.spark</groupId>

      <artifactId>spark-core_2.10</artifactId>

      <version>1.2.0</version>

  </dependency>

 

    <dependency>

      <groupId>org.apache.hadoop</groupId>

      <artifactId>hadoop-client</artifactId>

      <version>2.6.0</version>

    </dependency>

 

   <dependency>

       <groupId>org.apache.spark</groupId>

       <artifactId>spark-mllib_2.10</artifactId>

       <version>1.0.0</version>

       <scope>provided</scope>

   </dependency>

 

   <dependency>

      <groupId>org.apache.spark</groupId>

      <artifactId>spark-sql_2.10</artifactId>

      <version>1.5.2</version>

    </dependency>

 

  1. 编写程序

3.1利用sqoop数据的导入到Hive HDFS  

# Import the MySQL table C through JDBC as tab-separated text using a single
# mapper (-m 1). Every line except the last ends with a backslash so the
# shell treats the whole thing as one command.
# NOTE(review): the password is passed in clear text on the command line;
# consider -P (interactive prompt) or --password-file.
sqoop import \
  --connect jdbc:mysql://49.123.21.100:3306/bbk \
  --username root \
  --password root \
  --table C \
  --fields-terminated-by '\t' -m 1

 

3.2 编写spark

SparkConf sparkConf = new SparkConf().setAppName("TestKMeans");

 JavaSparkContext sc = new JavaSparkContext(sparkConf);

 

String filePath = "hdfs:/user/kpnm/C ";

 

JavaRDD lines = sc.textFile(filePath);

 

 

3.3 函数式编程

                      实现方法            用途

    Function<T, R>      R call(T)      map() filter()

 

例1 filter

JavaRDD dongtang = lines.filter(new Containsdt())

 

Class Containsdt implements Function(){

 

    Public Boolean call(String X){

return  X.contains(“东塘店”)

}

 

}

  

例2 map

SparkConf sparkConf = new SparkConf().setAppName("sql").setMaster("local")

JavaSparkContext ctx = new JavaSparkContext(sparkConf);

 

         

JavaRDD data= ctx.parallelize(Arrays.asList(1,2,3,4,5,6));

RDD=

1    2

2    3

3

4

5

6     7

 

 

idd.map(fun);

 

fun=new Function(){

 

             public Integer call(Integer x){

                return x+1;

             }

            

          }

 

                            实现方法            用途

    Function2<T1, T2, R>    R call(T1,T2)    aggregate() fold()

 

例3 aggregate

List<Integer> data = Arrays.asList(5, 1, 1, 4, 4, 2, 2);
// Spread the seven values over three partitions.
JavaRDD<Integer> javaRDD = ctx.parallelize(data, 3);

// aggregate(zeroValue, seqOp, combOp): seqOp folds the elements of one partition
// starting from zeroValue, combOp merges the per-partition results, again
// starting from zeroValue.
Integer aggregateRDD = javaRDD.aggregate(2, new Function2<Integer, Integer, Integer>() {
    @Override
    public Integer call(Integer v1, Integer v2) throws Exception {
        return v1 + v2;
    }
}, new Function2<Integer, Integer, Integer>() {
    @Override
    public Integer call(Integer v1, Integer v2) throws Exception {
        return v1 + v2;
    }
});
System.out.println("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + aggregateRDD);
// Prints 27: the elements sum to 19, the zero value 2 is added once in each of
// the 3 partitions (+6) and once more in the final merge (+2).

 

例4 fold

 

List<Integer> data = Arrays.asList(5, 1, 1, 4, 4, 2, 2);
// Spread the seven values over three partitions.
JavaRDD<Integer> javaRDD = ctx.parallelize(data, 3);

// fold(zeroValue, op) uses the same operator inside each partition and for the
// final merge of the partition results.
System.out.println(javaRDD.fold(1, new Function2<Integer, Integer, Integer>() {
    @Override
    public Integer call(Integer x, Integer y) {
        return x + y;
    }
}));
// Prints 23: the elements sum to 19, the zero value 1 is added once in each of
// the 3 partitions (+3) and once more in the final merge (+1).

 

 

                        实现方法                     用途

    FlatMapFunction<T, R>    Iterable<R> call(T)      flatMap()

 

例5 flatMap

JavaRDD<String> sjdd = ctx.parallelize(
        Arrays.asList("hello world  how are you", "i am fine", "thanks for you"));

// Print the sentences before the transformation.
sjdd.foreach(new VoidFunction<String>() {
    @Override
    public void call(String x) {
        System.out.println(x);
    }
});

// flatMap() turns every sentence into several elements (its words) and
// flattens them into one RDD.
// NOTE: the first sentence contains two consecutive spaces, so splitting on a
// single space also yields one empty string between "world" and "how".
sjdd = sjdd.flatMap(new FlatMapFunction<String, String>() {
    @Override
    public Iterable<String> call(String x) {
        return Arrays.asList(x.split(" "));
    }
});
System.out.println("处理之后的数据");

// Print the individual words after the transformation.
sjdd.foreach(new VoidFunction<String>() {
    @Override
    public void call(String x) {
        System.out.println(x);
    }
});

 

之前结果:

"hello world  how are you"

"i am fine"

"thanks for you"

 

之后结果

 hello

 world

 how

 are

 you

 i

 am

 fine

 thanks

 for

 you

 

PairRDD 或者 DoubleRDD 相关的函数同样需要使用对应的函数接口类(例如 PairFunction、DoubleFunction)。

5 提交作业

Maven 打包

 Maven install 将所有的依赖包打入 jar 中(需要在 pom.xml 中配置 maven-assembly-plugin 或 maven-shade-plugin)

 

# Submit the packaged job. Each line except the last ends with " \" and there
# are no blank lines in between, so the shell reads this as a single command.
# --driver-class-path puts the MySQL JDBC driver jar on the driver's classpath.
sudo spark-submit \
  --class com.mycompany.app.yang.App \
  --executor-memory 5G \
  --total-executor-cores 5 \
  --driver-class-path /home/kpnm/mysql-connector-java-5.1.41-bin.jar \
  /home/kpnm/yxs/yang-0.0.1-SNAPSHOT.jar

 

其中红色的表示必须。

 

趋势分析:

 

package com.mycompany.app.yang;

 

import scala.Tuple2;

import org.apache.spark.SparkConf;

import org.apache.spark.api.java.JavaPairRDD;

import org.apache.spark.api.java.JavaRDD;

import org.apache.spark.api.java.JavaSparkContext;

import org.apache.spark.api.java.function.FlatMapFunction;

import org.apache.spark.api.java.function.Function;

import org.apache.spark.api.java.function.Function2;

import org.apache.spark.api.java.function.PairFunction;

import org.apache.spark.api.java.function.VoidFunction;

 

import java.util.ArrayList;

import java.util.Arrays;

import java.util.Collection;

import java.util.HashMap;

import java.util.List;

import java.util.Map;

import java.util.Properties;

import java.util.regex.Pattern;

 

import org.apache.spark.sql.DataFrame;

import org.apache.spark.sql.DataFrameReader;

import org.apache.spark.sql.Row;

import org.apache.spark.sql.SQLContext;

import org.apache.spark.sql.hive.HiveContext;

import org.apache.spark.sql.types.DataTypes;

import org.apache.spark.sql.types.StructField;

import org.apache.spark.sql.types.StructType;

 

 

public class App

{

   private static final Pattern SPACE = Pattern.compile(" ");

     public static void main(String[] args) throws Exception {

    

SparkConf sparkConf = new SparkConf().setAppName("sql").setMaster("local").set("spark.kryoserializer.buffer.max","128");

 

JavaSparkContext ctx = new JavaSparkContext(sparkConf);

 

SQLContext sqlContext = new SQLContext(ctx);

  

HiveContext hiveCtx=new HiveContext(ctx);

      

DataFrame rdd;         

         

rdd=hiveCtx.sql("select SHOP,CATE,ACCTURE,PERIOD,ACCOUNT,collect_set(VALUE) from C group by SHOP,CATE,ACCTURE,ACCOUNT,PERIOD,ACCOUNT");

          

JavaRDD jdd=rdd.toJavaRDD();

jdd =jdd.map(new linearegression()).collect());

          

jdd.saveAsTextFile("/Home/kpnm/yxs/result.txt");

          

          

       ctx.stop();

      

     }

    

};

 

class linearegression implements Function>{

    public List call(Row s){

       String str=s.get(5).toString();

       String datas=str.substring(13, str.length()-1);

      

       String[] data=datas.split(",");

      

       List list= new ArrayList ();

       double k;

       for(int i=0;i<data.length-2;i++){

         

          k=(Double.valueOf(data[i+2]) - Double.valueOf(data[i]))/2;

         

          list.add(Double.valueOf(k));

         

       }

      

        return list;

       

    }

}

 

查看任务执行情况

 

你可能感兴趣的:(python)