Spark: Spark Streaming with the Kafka 0.8 integration — consuming data from Kafka and passing the values to an HTTP interface

pom:

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.tzb.bigdata</groupId>
    <artifactId>spark-test</artifactId>
    <version>1.0</version>

    <properties>
        <!-- The property names were lost when this post was published; only the values 2.10.6 and 2.6.0 survived -->
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.11</artifactId>
            <version>2.1.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_2.11</artifactId>
            <version>2.1.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-hive_2.11</artifactId>
            <version>2.1.1</version>
        </dependency>
        <dependency>
            <groupId>com.typesafe.play</groupId>
            <artifactId>play-mailer_2.11</artifactId>
            <version>7.0.0</version>
        </dependency>
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>5.1.41</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming_2.11</artifactId>
            <version>2.1.1</version>
        </dependency>
        <!-- Kafka 0.8 integration for Spark Streaming -->
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming-kafka-0-8_2.11</artifactId>
            <version>2.1.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.kafka</groupId>
            <artifactId>kafka-clients</artifactId>
            <version>0.11.0.2</version>
        </dependency>
        <dependency>
            <groupId>net.sf.json-lib</groupId>
            <artifactId>json-lib</artifactId>
            <version>2.4</version>
            <classifier>jdk15</classifier>
        </dependency>
        <dependency>
            <groupId>org.neo4j.driver</groupId>
            <artifactId>neo4j-java-driver</artifactId>
            <version>4.0.0</version>
        </dependency>
        <dependency>
            <groupId>com.google.code.gson</groupId>
            <artifactId>gson</artifactId>
            <version>2.8.5</version>
        </dependency>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.12</version>
        </dependency>
        <dependency>
            <groupId>net.minidev</groupId>
            <artifactId>json-smart</artifactId>
            <version>2.3</version>
        </dependency>
        <dependency>
            <groupId>joda-time</groupId>
            <artifactId>joda-time</artifactId>
            <version>2.10.1</version>
        </dependency>
        <dependency>
            <groupId>com.huaban</groupId>
            <artifactId>jieba-analysis</artifactId>
            <version>1.0.2</version>
        </dependency>
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>1.2.68</version>
        </dependency>
        <dependency>
            <groupId>org.elasticsearch</groupId>
            <artifactId>elasticsearch-spark-20_2.11</artifactId>
            <version>6.2.4</version>
        </dependency>
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi</artifactId>
            <version>3.12</version>
        </dependency>
    </dependencies>

    <build>
        <finalName>spark-test</finalName>
        <plugins>
            <plugin>
                <groupId>net.alchim31.maven</groupId>
                <artifactId>scala-maven-plugin</artifactId>
                <version>3.2.2</version>
                <executions>
                    <execution>
                        <goals>
                            <goal>compile</goal>
                            <goal>testCompile</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-assembly-plugin</artifactId>
                <configuration>
                    <archive>
                        <manifest>
                            <mainClass>WordCount</mainClass>
                        </manifest>
                    </archive>
                    <descriptorRefs>
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                </configuration>
                <executions>
                    <execution>
                        <id>make-assembly</id>
                        <phase>package</phase>
                        <goals>
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <configuration>
                    <source>8</source>
                    <target>8</target>
                </configuration>
            </plugin>
        </plugins>
    </build>
</project>



SparkStreaming04_KafkaSource:
package com.tzb.sparkstreaming
import java.net.URLEncoder
import net.minidev.json.JSONObject
import net.minidev.json.parser.JSONParser
import org.apache.commons.httpclient.HttpClient
import org.apache.commons.httpclient.methods.GetMethod
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.{DStream, ReceiverInputDStream}
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}

/**
  * Spark Streaming with the Kafka 0.8 integration
  * Note: this program connects Spark Streaming to Kafka, consumes data from Kafka, and passes the values as parameters to an HTTP interface.
  * Spark Streaming consumes from Kafka with stateless aggregation (word counts from different batches are not merged).
  * 1) Declare a receiver (not needed here because one is already defined in SparkStreaming03_MyReceiver)
  * 2) Override the onStart and onStop methods
  *
  * Verified both locally in IDEA and on the 210 test machine:
  * open Kafka Tool and push some data to a topic,
  * run the main method to start consuming,
  * then take the values and pass them to the HTTP interface.
  * Packaged and submitted successfully:
  * spark-submit --master yarn-client --conf spark.driver.memory=2g --class com.tzb.sparkstreaming.SparkStreaming04_KafkaSource --executor-memory 8G --num-executors 5 --executor-cores 2 /var/lib/hadoop-hdfs/spride_sqoop_beijing/bi_table/test/spark-test-jar-with-dependencies.jar >> /var/lib/hadoop-hdfs/spride_sqoop_beijing/bi_table/test/sparkstreaming_datachange.log
  * To run in production, change the Kafka, ZooKeeper and other component IPs/hostnames in the code to the production ones, change spark-submit to spark-submit2 when submitting, and append & to the command so it runs in the background and the terminal can be closed.
  *
  * How to stop the job:
  * Run ps -ef | grep SparkStreaming04_KafkaSource and kill the resulting process ID.
  *
  */
object SparkStreaming04_KafkaSource {

  def main(args: Array[String]): Unit = {

    // Use Spark Streaming to do a WordCount

    // Spark configuration object
    val sparkConf = new SparkConf().setMaster("local[*]").setAppName("SparkStreaming01_WordCount")

    // Real-time analysis context
    // Batch interval: collect the incoming data at the given fixed interval
    val streamingContext = new StreamingContext(sparkConf,Seconds(5))

    // Consume data from Kafka (note: this requires the Kafka 0.8 integration package)
    // Home machine
//    val kafkaDStream: ReceiverInputDStream[(String,String)] = KafkaUtils.createStream(
//      streamingContext,
//      "sparkproject1:2181",
//      "testgroup", // consumer group
//      Map("testsparkstreaming" -> 3) // topic -> number of receiver threads
//    )
    // 210 test machine
    val kafkaDStream: ReceiverInputDStream[(String,String)] = KafkaUtils.createStream(
      streamingContext,
      "**.**.**.10:2181",  // ZooKeeper quorum
      "testgroup",         // consumer group
      Map("test" -> 3)     // topic -> number of receiver threads
    )

    // Home machine
    // List Kafka topics:   bin/kafka-topics.sh --zookeeper sparkproject1:2181 --list
    // Create the topic:    bin/kafka-topics.sh --zookeeper sparkproject1:2181 --create --topic testsparkstreaming --partitions 3 --replication-factor 2   // 3 partitions x 2 replicas = 6 replicas in total; partitions are declared before the replication factor
    // Produce test data:   bin/kafka-console-producer.sh --broker-list sparkproject1:9092 --topic testsparkstreaming
    // 210 test machine

    // Flatten the collected data; note that a Kafka message is really a (key, value) pair
    val wordDStream : DStream[String] =  kafkaDStream.flatMap(t => t._2.split(" "))

    // Reshape the data to make it easier to aggregate
    val mapDStream : DStream[(String,Int)] = wordDStream.map((_,1))

    // Aggregate the reshaped data
    val wordToSumDStream : DStream[(String,Int)] = mapDStream.reduceByKey(_+_)
    // Print the result
    wordToSumDStream.print()
    wordToSumDStream.repartition(1) // Has no effect: repartition returns a new DStream, and the returned value is never used
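    // If a single output partition is actually wanted (e.g. for the saveAsTextFiles calls below),
    // the repartitioned DStream has to be assigned and used, roughly:
    //   val singlePartition = wordToSumDStream.repartition(1)
    //   singlePartition.saveAsTextFiles(...)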

    // Save the DStream to files
//    wordToSumDStream.saveAsTextFiles("file:///D:\\workspace\\spark-test\\output\\sparkstreamingResult1") // Note: if the sparkstreamingResult1 folder was not created manually, the results end up in the output directory
//    wordToSumDStream.saveAsTextFiles("file:///D:/workspace/spark-test/output/sparkstreamingResult/sparkstreaming.txt")
//    wordToSumDStream.saveAsTextFiles("file:///output/sparkstreamingResult/sparkstreaming.txt") // Points at the root of the D: drive

    wordToSumDStream.foreachRDD(
      rdd => {
        val arr : Array[(String, Int)] = rdd.collect()
        if(arr != null && arr.length > 0){
          println("key:" + arr(0)._1 + " value:" + arr(0)._2)
          // Call the HTTP interface with the first key of the batch
          val result = requestHTTP(arr(0)._1)
          println("=======> HTTP interface call result: " + result)
        }
      }
    )
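    // Note: rdd.collect() above pulls the whole batch to the driver and only the first element is
    // used. If every record should be sent to the HTTP interface, the usual pattern is a
    // per-partition loop that runs on the executors (sketch only):
    //   wordToSumDStream.foreachRDD { rdd =>
    //     rdd.foreachPartition { iter =>
    //       iter.foreach { case (word, count) => requestHTTP(word) }
    //     }
    //   }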


    // Do not stop the streaming context here
    //streamingContext.stop
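    // Instead of killing the process hard, the job can also be allowed to finish its current batch
    // on shutdown by submitting it with --conf spark.streaming.stopGracefullyOnShutdown=true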

    // Start the receiver / streaming context
    streamingContext.start()
    // The driver waits for the streaming computation to terminate
    streamingContext.awaitTermination()
 //
  }

  /**
    * Call the HTTP interface
    * @param jobName
    * @return
    */
  def requestHTTP(jobName: String) = {
    var data = ""
    var jobName1 = "bbb"
    // Effectively gives you a browser: HTTP (Get | Post | Put | Delete)
    val httpClient = new HttpClient()
    // Assemble the query parameters
    val params = Map[String, String](
      "jobName" -> URLEncoder.encode(jobName, "UTF-8"),
      "jobName1" -> URLEncoder.encode(jobName1, "UTF-8")
    ).map(kv => kv._1 + "=" + kv._2).mkString("&")

    val getMethod = new GetMethod("http://10.21.4.197:7772/src/main/test/sparkHTTP?" + params) // See the "HTTP interface" section below for the server-side implementation
    getMethod.addRequestHeader("Content-Type", "application/json;charset=UTF-8")
    // Send the GET request
    val status = httpClient.executeMethod(getMethod)
    if (status == 200) {
      val responseBodyAsString = getMethod.getResponseBodyAsString
      val jsonParser = new JSONParser()
      val jsonObj: JSONObject = jsonParser.parse(responseBodyAsString).asInstanceOf[JSONObject]
      data = jsonObj.get("data").toString
      // Release the connection
      getMethod.releaseConnection()
    } else None
    data
  }

}

//Not needed here because it is already defined in SparkStreaming03_MyReceiver
//Declare a custom receiver
//1) Extend Receiver
//class MyReceiver(host:String,port:Int) extends Receiver[String](StorageLevel.MEMORY_ONLY){
//  //  val socket = _
//  var socket: java.net.Socket = null
//
//  def receive(): Unit = {
//    socket = new java.net.Socket(host,port)
//    val reader = new BufferedReader(new InputStreamReader(socket.getInputStream,"UTF-8"))
//    var line : String =null
//    while((line = reader.readLine()) != null){
//      //Store the collected data inside the receiver for later processing
//      if("END".equals(line)){  // "END" is the agreed termination marker
//        return
//      }else{ // Normal data
//        this.store(line)
//      }
//    }
//  }
//
//  override  def onStart(): Unit ={
//    //Start a worker thread
//    new Thread(
//      new Runnable {
//        override def run(): Unit = {
//          receive()
//        }
//      }
//
//    ).start()
//  }
//
//  override def onStop(): Unit = {
//    if(socket != null){
//      socket.close()
//      socket = null
//    }
//  }
//}
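
For reference, the same spark-streaming-kafka-0-8 dependency also provides a receiver-less direct stream. A minimal sketch (the broker address and topic name are placeholders, not taken from the setup above; it reuses the streamingContext from the main method):

// Direct (receiver-less) variant of the Kafka 0.8 integration -- sketch only
import kafka.serializer.StringDecoder
import org.apache.spark.streaming.kafka.KafkaUtils

val kafkaParams = Map[String, String](
  "metadata.broker.list" -> "broker-host:9092", // Kafka brokers (not ZooKeeper); placeholder address
  "group.id" -> "testgroup"
)
val directDStream = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
  streamingContext,
  kafkaParams,
  Set("test") // topic set; placeholder
)
// directDStream is a DStream[(String, String)] of (key, message) pairs, so the same
// flatMap / map / reduceByKey pipeline as above applies.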

HTTP interface:

package com.huayong.bi.web.controller;

import com.alibaba.fastjson.JSONObject;
import org.springframework.web.bind.annotation.*;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;

/**
 *  Test controller
 */
@RestController
@RequestMapping("/src/main/test")
public class TestController {
    
    /**
     * Called from Spark over HTTP
     * @param request
     * @param response
     */
    @CrossOrigin
    @RequestMapping(value = "/sparkHTTP", method={RequestMethod.GET})
    public String sparkHTTP(HttpServletRequest request, HttpServletResponse response) {
        JSONObject jo = null;
        try {
            String jobName = request.getParameter("jobName");
            String jobName1  = request.getParameter("jobName1");
            System.out.println(jobName + "===" + jobName1);

            jo = new JSONObject();
            jo.put("code", 200);
            jo.put("msg", "");
            jo.put("data", "成功");
        } catch (Exception e) {
            e.printStackTrace();
            // Return an error payload instead of leaving jo null (jo.toString() would otherwise throw an NPE)
            jo = new JSONObject();
            jo.put("code", 500);
            jo.put("msg", e.getMessage());
            jo.put("data", "");
        }
        return jo.toString();
    }

}
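
The endpoint can be smoke-tested outside of the streaming job by calling the requestHTTP helper from the Scala object directly (a minimal sketch; it assumes the controller above is running on 10.21.4.197:7772 as hard-coded in requestHTTP):

// Hypothetical standalone check of the HTTP interface, reusing requestHTTP defined above
object HttpSmokeTest {
  def main(args: Array[String]): Unit = {
    val result = SparkStreaming04_KafkaSource.requestHTTP("testJob")
    println("HTTP interface returned: " + result)
  }
}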
