11 Spark案例

0 搭建项目

  • pom参考
<dependencies>
        <dependency>
            <groupId>org.slf4jgroupId>
            <artifactId>slf4j-log4j12artifactId>
            <version>1.7.25version>
        dependency>
        <dependency>
            <groupId>log4jgroupId>
            <artifactId>log4jartifactId>
            <version>1.2.17version>
        dependency>
        
        <dependency>
            <groupId>org.apache.sparkgroupId>
            <artifactId>spark-core_2.10artifactId>
            <version>1.6.0version>
        dependency>
        <dependency>
            <groupId>org.apache.sparkgroupId>
            <artifactId>spark-mllib_2.10artifactId>
            <version>1.6.0version>
        dependency>
        
        <dependency>
            <groupId>org.apache.sparkgroupId>
            <artifactId>spark-sql_2.10artifactId>
            <version>1.6.0version>
        dependency>
        
        <dependency>
            <groupId>org.apache.sparkgroupId>
            <artifactId>spark-hive_2.10artifactId>
            <version>1.6.0version>
        dependency>
        
        <dependency>
            <groupId>org.apache.sparkgroupId>
            <artifactId>spark-streaming_2.10artifactId>
            <version>1.6.0version>
            
        dependency>
        
        <dependency>
            <groupId>org.apache.sparkgroupId>
            <artifactId>spark-streaming-kafka_2.10artifactId>
            <version>1.6.0version>
        dependency>
        <dependency>
            <groupId>org.scala-langgroupId>
            <artifactId>scala-libraryartifactId>
            <version>2.10.5version>
        dependency>
        <dependency>
            <groupId>org.scala-langgroupId>
            <artifactId>scala-compilerartifactId>
            <version>2.10.5version>
        dependency>
        <dependency>
            <groupId>org.scala-langgroupId>
            <artifactId>scala-reflectartifactId>
            <version>2.10.5version>
        dependency>
        <dependency>
            <groupId>org.apache.hadoopgroupId>
            <artifactId>hadoop-commonartifactId>
            <version>2.6.5version>
        dependency>
        <dependency>
            <groupId>org.apache.hadoopgroupId>
            <artifactId>hadoop-clientartifactId>
            <version>2.6.5version>
        dependency>
        <dependency>
            <groupId>org.apache.zookeepergroupId>
            <artifactId>zookeeperartifactId>
            <version>3.4.10version>
        dependency>
        <dependency>
            <groupId>org.apache.hivegroupId>
            <artifactId>hive-execartifactId>
            <version>1.2.2version>
        dependency>
        <dependency>
            <groupId>org.apache.hbasegroupId>
            <artifactId>hbase-itartifactId>
            <version>1.2.2version>
        dependency>

        <dependency>
            <groupId>com.lmaxgroupId>
            <artifactId>disruptorartifactId>
            <version>3.2.0version>
        dependency>
        <dependency>
            <groupId>org.apache.flumegroupId>
            <artifactId>flume-ng-coreartifactId>
            <version>1.6.0version>
        dependency>
        <dependency>
            <groupId>org.apache.kafkagroupId>
            <artifactId>kafka_2.11artifactId>
            <version>1.1.1version>
        dependency>
        
        <dependency>
            <groupId>redis.clientsgroupId>
            <artifactId>jedisartifactId>
            <version>2.6.1version>
        dependency>
        
        <dependency>
            <groupId>mysqlgroupId>
            <artifactId>mysql-connector-javaartifactId>
            <version>5.1.47version>
        dependency>
        <dependency>
            <groupId>com.google.collectionsgroupId>
            <artifactId>google-collectionsartifactId>
            <version>1.0version>
        dependency>
    dependencies>

    <repositories>
        <repository>
            <id>centralid>
            <name>Maven Repository Switchboardname>
            <layout>defaultlayout>
            <url>http://repo2.maven.org/maven2url>
            <snapshots>
                <enabled>falseenabled>
            snapshots>
        repository>
    repositories>

    <build>
        <sourceDirectory>src/main/javasourceDirectory>
        <testSourceDirectory>src/test/javatestSourceDirectory>


        <plugins>
            
            <plugin>
                <groupId>org.scala-toolsgroupId>
                <artifactId>maven-scala-pluginartifactId>
                <version>2.15.2version>
                <executions>
                    <execution>
                        <goals>
                            <goal>compilegoal>
                            <goal>testCompilegoal>
                        goals>
                    execution>
                executions>
            plugin>
            <plugin>
                
                <groupId>org.apache.maven.pluginsgroupId>
                <artifactId>maven-compiler-pluginartifactId>
                <version>3.3version>
                <configuration>
                    <source>1.8source>
                    <target>1.8target>
                    <encoding>UTF-8encoding>
                configuration>
            plugin>
            <plugin>
                <groupId>org.apache.maven.pluginsgroupId>
                <artifactId>maven-assembly-pluginartifactId>
                <version>3.0.0version>
                <configuration>
                    <descriptorRefs>
                        <descriptorRef>jar-with-dependenciesdescriptorRef>
                    descriptorRefs>
                configuration>
            plugin>

            <plugin>
                <groupId>org.apache.maven.pluginsgroupId>
                <artifactId>maven-surefire-pluginartifactId>
                <version>2.10version>
                <configuration>
                    <skip>trueskip>
                configuration>
            plugin>
        plugins>
    build>
View Code

11 Spark案例_第1张图片

1 demo1--WorldCount

  • 项目目录下新建data文件夹,再新建world.csv文件
hello,spark
hello,scala,hadoop
hello,hdfs
hello,spark,hadoop
hello
  • scala版本---SparkWC.scala
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
/**
  * spark wordcount
  */
object SparkWC {
  def main(args: Array[String]): Unit = {

    val conf = new SparkConf().setAppName("wordcount").setMaster("local")
    val sc = new SparkContext(conf)
    sc.textFile("./data/world.csv").flatMap( _.split(",")).map((_,1)).reduceByKey(_+_).foreach(println)
    sc.stop()
//  下面是每一步的分析
//    //conf 可以设置SparkApplication 的名称,设置Spark 运行的模式
//    val conf = new SparkConf()
//    conf.setAppName("wordcount")
//    conf.setMaster("local")
//    //SparkContext 是通往spark 集群的唯一通道
//    val sc = new SparkContext(conf)
//
//    val lines: RDD[String] = sc.textFile("./data/world.csv")
//    val words: RDD[String] = lines.flatMap(line => {
//      line.split(",")
//    })
//    val pairWords: RDD[(String, Int)] = words.map(word=>{new Tuple2(word,1)})
//    val result: RDD[(String, Int)] = pairWords.reduceByKey((v1:Int, v2:Int)=>{v1+v2})
//    result.foreach(one=>{
//      println(one)
//    })
  }
}
View Code
  • 测试
    11 Spark案例_第2张图片

  • java版本---SparkWC.scala
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;
import scala.Tuple2;

import java.util.Arrays;
import java.util.Iterator;

public class SparkWordCount {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf();
        conf.setMaster("local");
        conf.setAppName("wc");
        JavaSparkContext sc = new JavaSparkContext(conf);
        JavaRDD lines = sc.textFile("./data/world.csv");
        JavaRDD words = lines.flatMap(new FlatMapFunction() {
            @Override
            public Iterable call(String line) throws Exception {
                return Arrays.asList(line.split(","));
            }
        });
        JavaPairRDD pairWords = words.mapToPair(new PairFunction() {
            @Override
            public Tuple2 call(String s) throws Exception {
                return new Tuple2<>(s, 1);
            }
        });
        JavaPairRDD result = pairWords.reduceByKey(new Function2() {
            @Override
            public Integer call(Integer v1, Integer v2) throws Exception {
                return v1 + v2;
            }
        });
        result.foreach(new VoidFunction>() {
            @Override
            public void call(Tuple2 tp) throws Exception {
                System.out.println(tp);
            }
        });
        sc.stop();
    }
}
View Code
  • 测试
    11 Spark案例_第3张图片

2 demo2--join算子

  • 代码及测试
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
import scala.collection.mutable.ListBuffer

object Taiko extends App {
  val conf = new SparkConf().setMaster("local").setAppName("wc");
  val sc = new SparkContext(conf)

  //demo1-5 data start
  val nameRDD: RDD[(String, Int)] = sc.parallelize(List[(String, Int)](
    ("zhangsan", 18), ("lisi", 19), ("wangwu", 20), ("zhaoliu", 21)
  ))
  val sourceRDD: RDD[(String, Int)] = sc.parallelize(List[(String, Int)](
    ("zhangsan", 100), ("lisi", 200), ("wangwu", 300), ("tianqi", 400)
  ))
  //demo1-5 data end
  //demo1 join
  //val result: RDD[(String, (Int, Int))] = nameRDD.join(sourceRDD)
  //result.foreach(println)
  /** demo1结果
    * (zhangsan,(18,100))
    * (wangwu,(20,300))
    * (lisi,(19,200))
    */
  //demo2 leftOuterJoin
  //val result: RDD[(String, (Int, Option[Int]))] = nameRDD.leftOuterJoin(sourceRDD)
  //result.foreach(println)
  /** demo2结果
    * (zhangsan,(18,Some(100)))
    * (wangwu,(20,Some(300)))
    * (zhaoliu,(21,None))
    * (lisi,(19,Some(200)))
    */
  /* result.foreach(res => {
     val name = res._1
     val v1 = res._2._1
     val v2 = res._2._2.getOrElse("没有分数")
     println(s"name=$name,age=$v1,scoure=$v2")
   })*/
  /** demo2结果
    * name=zhangsan,age=18,scoure=100
    * name=wangwu,age=20,scoure=300
    * name=zhaoliu,age=21,scoure=没有分数
    * name=lisi,age=19,scoure=200
    */
  //demo3 rightOuterJoin
  //val result: RDD[(String, (Option[Int], Int))] = nameRDD.rightOuterJoin(sourceRDD)
  //result.foreach(println)
  /** demo3结果
    * (zhangsan,(Some(18),100))
    * (wangwu,(Some(20),300))
    * (tianqi,(None,400))
    * (lisi,(Some(19),200))
    */

  //demo4 fullOuterJoin
  //val result: RDD[(String, (Option[Int], Option[Int]))] = nameRDD.fullOuterJoin(sourceRDD)
  //result.foreach(println)
  /** demo4结果
    * (zhangsan,(Some(18),Some(100)))
    * (wangwu,(Some(20),Some(300)))
    * (zhaoliu,(Some(21),None))
    * (tianqi,(None,Some(400)))
    * (lisi,(Some(19),Some(200)))
    */
  //demo5 union
  //val result: RDD[(String, Int)] = nameRDD.union(sourceRDD)
  //result.foreach(println)
  /** demo5结果
    * (zhangsan,18)
    * (lisi,19)
    * (wangwu,20)
    * (zhaoliu,21)
    * (zhangsan,100)
    * (lisi,200)
    * (wangwu,300)
    * (tianqi,400)
    */

  //demo6 分区
  val nameRDD1: RDD[(String, Int)] = sc.parallelize(List[(String, Int)](
    ("zhangsan", 18), ("lisi", 19), ("wangwu", 20), ("zhaoliu", 21)
  ), 3)
  val sourceRDD1: RDD[(String, Int)] = sc.parallelize(List[(String, Int)](
    ("zhangsan", 100), ("lisi", 200), ("wangwu", 300), ("tianqi", 400)
  ), 4)
  val p1: Int = nameRDD1.getNumPartitions
  val p2: Int = sourceRDD1.getNumPartitions
  //val result: RDD[(String, (Int, Int))] = nameRDD1.join(sourceRDD1)
  //val p3: Int = result.getNumPartitions
  //println(s"p1:$p1,p2:$p2,p3:$p3")
  /** p1:3,p2:4,p3:4  和多的分区保持一致 */
  //val result: RDD[(String, Int)] = nameRDD1.union(sourceRDD1)
  //val p3: Int = result.getNumPartitions
  //println(s"p1:$p1,p2:$p2,p3:$p3")
  /** p1:3,p2:4,p3:7  数据其实没有移动,只是把分区加在了一起 */

  //demo7 intersection交集   subtract差集
  val rdd1: RDD[Int] = sc.parallelize(List[Int](1, 2, 3))
  val rdd2: RDD[Int] = sc.parallelize(List[Int](2, 3, 5))
  //rdd1.intersection(rdd2).foreach(println)
  /**
    * 3
    * 2
    */
  //rdd1.subtract(rdd2).foreach(println)
  /** 1  */
  //rdd2.subtract(rdd1).foreach(println)
  /** 5  */

  //demo8 优化频繁操作 mapPartitions分区数据处理
  val rdd: RDD[String] = sc.parallelize(List[String]("hello1", "hello2", "hello3", "hello4"), 2)
  /*  rdd.map(one => {
      println("建立数据库连接...")
      println(s"插入数据库数据:$one")
      println("关闭数据库连接...")
      one + "!"
    }).count()*/
  /** 频繁建立数据库连接!!!!!!!
    * 建立数据库连接...
    * 插入数据库数据:hello1
    * 关闭数据库连接...
    * 建立数据库连接...
    * 插入数据库数据:hello2
    * 关闭数据库连接...
    * 建立数据库连接...
    * 插入数据库数据:hello3
    * 关闭数据库连接...
    * 建立数据库连接...
    * 插入数据库数据:hello4
    * 关闭数据库连接...
    */
  rdd.mapPartitions(iter => {
    val list = new ListBuffer[String]
    println("建立数据库连接...")
    while (iter.hasNext) {
      val str = iter.next()
      println(s"插入数据库数据:$str")
      list.+=(str)
    }
    println("关闭数据库连接...")
    list.iterator
  }).count()
  /**
    * 建立数据库连接...
    * 插入数据库数据:hello1
    * 插入数据库数据:hello2
    * 关闭数据库连接...
    * 建立数据库连接...
    * 插入数据库数据:hello3
    * 插入数据库数据:hello4
    * 关闭数据库连接...
    */
}
View Code

3 demo3-- spark集群验证 yarn集群验证

  • user_item_score.txt
1 100001 5
1 100002 3 1 100003 4 3 100001 2 3 100002 5 2 100001 1 2 100002 2 2 100003 4 2 100004 5
  • userwatchlist
package com.test.scala.spark

import org.apache.spark.{SparkConf, SparkContext}

object userwatchlist {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local").setAppName("userwatchlist test")
    val sc = new SparkContext(conf)

    val input_path = sc.textFile("./data/user_item_score.txt")
    val output_path = "./data/userwatchlist_output"
    //过滤掉分数小于2的数据
    val data = input_path.filter(x => {
      val fields = x.split(" ")
      fields(2).toDouble > 2
    }).map(x => {
      /*
       原始数据
       user item score
       ->
       (user, (item1 score1))
       (user, (item2 score2))
       ->
       (user,((item1 score1) (item2 score2)))
       ->目标
       user -> item item  item
      */
      val fields = x.split(" ")
      (fields(0).toString, (fields(1).toString, fields(2).toString))
    }).groupByKey().map(x => {
      val userid = x._1
      val item_score_tuple_list = x._2
      //根据score进行排序
      val tmp_arr = item_score_tuple_list.toArray.sortWith(_._2 > _._2)
      var watchlen = tmp_arr.length
      //取前5个
      if (watchlen > 5) {
        watchlen = 5
      }
      val strbuf = new StringBuilder
      for (i <- 0 until watchlen) {
        strbuf ++= tmp_arr(i)._1
        strbuf.append(":")
        strbuf ++= tmp_arr(i)._2
        strbuf.append(" ")
      }
      userid + "\t" + strbuf
    })
    data.saveAsTextFile(output_path)
  }
}
View Code
  • 3.1本地验证结果
    11 Spark案例_第4张图片

  • 3.2通过spark集群验证
    • 修改scala类
      11 Spark案例_第5张图片
    • maven打包
    • 将文件和jar包上传到linux上
    • 在将文件上传到hdfs上
      11 Spark案例_第6张图片
    • 新建run.sh
    /usr/local/src/spark-1.6.0-bin-hadoop2.6/bin/spark-submit \
          --master spark://master:7077 \ --num-executors 2 \ --executor-memory 1g \ --executor-cores 1 \ --driver-memory 1g \ --class com.test.scala.spark.userwatchlist /root/test_spark/test-1.0-SNAPSHOT.jar
    • 运行bash run.sh
      11 Spark案例_第7张图片
      11 Spark案例_第8张图片
  • 3.3通过hadoop集群验证
    • 修改run.sh
      11 Spark案例_第9张图片
    /usr/local/src/spark-1.6.0-bin-hadoop2.6/bin/spark-submit \
          --master yarn-cluster \ --num-executors 2 \ --executor-memory 1g \ --executor-cores 1 \ --driver-memory 1g \ --class com.test.scala.spark.userwatchlist /root/test_spark/test-1.0-SNAPSHOT.jar
    • 删除刚刚的输出路径(如果没操作spark集群验证的则不用)hadoop fs -rmr- /userwatchlist_output
      11 Spark案例_第10张图片
      11 Spark案例_第11张图片

4 demo4-- cf算法

package com.test.scala.spark

import org.apache.spark.{SparkConf, SparkContext}

import scala.collection.mutable.ArrayBuffer
import scala.math._

object cf {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    conf.setMaster("local")
    conf.setAppName("CF")
    val sc = new SparkContext(conf)

    val input_path = args(0).toString
    val output_path = args(1).toString

    val lines = sc.textFile(input_path)
    val max_prefs_per_user = 20
    val topn = 5
    //step1
    val ui_rdd = lines.map { x =>
      val fileds = x.split(" ")
      (fileds(0).toString, (fileds(1).toString, fileds(2).toDouble))
    }.groupByKey().flatMap { x =>
      val user = x._1
      val item_score_list = x._2
      var is_arr = item_score_list.toArray
      var is_list_len = is_arr.length
      if (is_list_len > max_prefs_per_user) {
        is_list_len = max_prefs_per_user
      }
      //转置
      var i_us_arr = new ArrayBuffer[(String, (String, Double))]()
      for (i <- 0 until is_list_len) {
        i_us_arr += ((is_arr(i)._1,(user,is_arr(i)._2)))
      }
      i_us_arr
    }.groupByKey().flatMap{x=>
      //归一化
      val item = x._1
      val u_list = x._2
      val us_arr = u_list.toArray
      var sum:Double = 0.0
      for(i <- 0 until us_arr.length){
        sum += pow(us_arr(i)._2,2)
      }
      sum = sqrt(sum)

      var u_is_arr = new ArrayBuffer[(String, (String, Double))]()
      for(i <- 0 until us_arr.length){
        u_is_arr += ((us_arr(i)._1,(item,us_arr(i)._2 / sum)))
      }
      u_is_arr
      /*
      设置参数测试
      (2,CompactBuffer((100002,0.3244428422615251), (100003,0.7071067811865475), (100004,1.0), (100001,0.18257418583505536)))
      (3,CompactBuffer((100002,0.8111071056538127), (100001,0.3651483716701107)))
      (1,CompactBuffer((100002,0.48666426339228763), (100003,0.7071067811865475), (100001,0.9128709291752769)))
      */
    }.groupByKey()

    //step2
    val unpack_rdd = ui_rdd.flatMap{x=>
      val is_arr = x._2.toArray
      var ii_s_arr = new ArrayBuffer[((String,String),Double)]()
      for(i <- 0 until is_arr.length-1){
        for(j <- 0 until is_arr.length){
          ii_s_arr += (((is_arr(i)._1,is_arr(j)._1),is_arr(i)._2 * is_arr(j)._2))
          ii_s_arr += (((is_arr(j)._1,is_arr(i)._1),is_arr(i)._2 * is_arr(j)._2))
        }
      }
      ii_s_arr
    /*测试
      ((100002,100002),0.10526315789473685)
      ((100002,100002),0.10526315789473685)
      ((100002,100003),0.22941573387056174)
      ((100003,100002),0.22941573387056174)
      ((100002,100004),0.3244428422615251)
      ((100004,100002),0.3244428422615251)
      ((100002,100001),0.05923488777590923)
      ((100001,100002),0.05923488777590923)
      ((100003,100002),0.22941573387056174)
      ((100002,100003),0.22941573387056174)
      ((100003,100003),0.4999999999999999)
      ((100003,100003),0.4999999999999999)*/
    }
    //step3
    unpack_rdd.groupByKey().map{x=>
      val ii_pair = x._1
      val s_list = x._2
      val s_arr = s_list.toArray
      var score:Double = 0.0
      for(i <- 0 until s_arr.length){
        score += s_arr(i)
      }
      (ii_pair._1,(ii_pair._2,score))
     /*测试
      (100002,(100002,2.0))
      (100002,(100001,0.7996709849747747))
      (100001,(100003,0.7745966692414834))
      (100003,(100002,1.1470786693528088))
      (100001,(100004,0.18257418583505536))
      (100004,(100001,0.18257418583505536))
      (100004,(100002,0.6488856845230502))
      (100004,(100004,2.0))
      (100003,(100001,0.7745966692414834))
      (100003,(100003,1.9999999999999996))
      (100002,(100004,0.6488856845230502))
      (100001,(100002,0.7996709849747747))
      (100003,(100004,1.414213562373095))
      (100004,(100003,1.414213562373095))
      (100002,(100003,1.1470786693528088))*/
    }.groupByKey().map{x=>
      val item_a = x._1
      val item_list = x._2
      val bs_arr = item_list.toArray.sortWith(_._2 > _._2)

      var len = bs_arr.length
      if(len > topn){
        len=topn
      }
      val s = new StringBuilder
      for(i <- 0 until len){
        val item = bs_arr(i)._1
        val score = "%1.4f" format bs_arr(i)._2
        s.append(item+":"+score)
        if(i){
          s.append(",")
        }
      }
      item_a + "\t" + s
    }.saveAsTextFile(output_path)

  }
}
View Code
    • 设置参数测试
      11 Spark案例_第12张图片
    • 结果
      11 Spark案例_第13张图片

你可能感兴趣的:(11 Spark案例)