pom (Maven coordinates, groupId:artifactId:version):
  modelVersion: 4.0.0
  project: com.tzb.bigdata:spark-test:1.0
  properties: 2.10.6, 2.6.0 (likely the Scala and Hadoop versions)
  dependencies:
    org.apache.spark:spark-core_2.11:2.1.1
    org.apache.spark:spark-sql_2.11:2.1.1
    org.apache.spark:spark-hive_2.11:2.1.1
    com.typesafe.play:play-mailer_2.11:7.0.0
    mysql:mysql-connector-java:5.1.41
    org.apache.spark:spark-streaming_2.11:2.1.1
    org.apache.spark:spark-streaming-kafka-0-8_2.11:2.1.1
    org.apache.kafka:kafka-clients:0.11.0.2
    net.sf.json-lib:json-lib:2.4 (classifier jdk15)
    org.neo4j.driver:neo4j-java-driver:4.0.0
    com.google.code.gson:gson:2.8.5
    junit:junit:4.12
    net.minidev:json-smart:2.3
    joda-time:joda-time:2.10.1
    com.huaban:jieba-analysis:1.0.2
    com.alibaba:fastjson:1.2.68
    org.elasticsearch:elasticsearch-spark-20_2.11:6.2.4
    org.apache.poi:poi:3.12
  build (finalName spark-test):
    net.alchim31.maven:scala-maven-plugin:3.2.2 (goals: compile, testCompile)
    org.apache.maven.plugins:maven-assembly-plugin (mainClass WordCount, descriptorRef jar-with-dependencies; execution make-assembly bound to the package phase, goal single)
    org.apache.maven.plugins:maven-compiler-plugin (source/target 8)
SparkStreaming04_KafkaSource:
package com.tzb.sparkstreaming
import java.net.URLEncoder
import net.minidev.json.JSONObject
import net.minidev.json.parser.JSONParser
import org.apache.commons.httpclient.HttpClient
import org.apache.commons.httpclient.methods.GetMethod
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.{DStream, ReceiverInputDStream}
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}
/**
* Spark Streaming with the Kafka 0.8 API
* Note: this program integrates Spark Streaming with Kafka: it consumes data from Kafka and passes the values as parameters to an HTTP interface.
* Spark Streaming consumes data from Kafka; the counting is stateless (word counts from different batches are not merged).
* 1) Declare a receiver (not needed here because one is already defined in SparkStreaming03_MyReceiver)
* 2) Override the onStart and onStop methods
*
* Verified both locally in IDEA and on the 210 test machine:
* open Kafka Tool and push data to a topic,
* run the main method to start consuming the data,
* and pass the extracted values to the HTTP interface.
* Packaging test (successful):
* spark-submit --master yarn-client --conf spark.driver.memory=2g --class com.tzb.sparkstreaming.SparkStreaming04_KafkaSource --executor-memory 8G --num-executors 5 --executor-cores 2 /var/lib/hadoop-hdfs/spride_sqoop_beijing/bi_table/test/spark-test-jar-with-dependencies.jar >> /var/lib/hadoop-hdfs/spride_sqoop_beijing/bi_table/test/sparkstreaming_datachange.log
* To run in production, change the IPs/domains of Kafka, ZooKeeper and the other components in the code to the production ones, change spark-submit to spark-submit2 when submitting, and append & to the command so the program runs in the background and the current terminal can be closed.
*
* How to stop the job:
* run ps -ef | grep SparkStreaming04_KafkaSource and kill the process ID it returns.
*
*/
object SparkStreaming04_KafkaSource {
def main(args: Array[String]): Unit = {
//Use Spark Streaming to implement WordCount
//Spark configuration object
val sparkConf = new SparkConf().setMaster("local[*]").setAppName("SparkStreaming04_KafkaSource")
//Real-time data analysis environment object (StreamingContext)
//Batch interval: real-time data is collected once per the specified period
val streamingContext = new StreamingContext(sparkConf,Seconds(5))
// Consume data from Kafka (note: this requires the Kafka 0.8 package, spark-streaming-kafka-0-8)
//Home machine
// val kafkaDStream: ReceiverInputDStream[(String,String)] = KafkaUtils.createStream(
// streamingContext,
// "sparkproject1:2181",
// "testgroup", //group
// Map("testsparkstreaming" -> 3) //topics
// )
// 210 test machine
val kafkaDStream: ReceiverInputDStream[(String,String)] = KafkaUtils.createStream(
streamingContext,
"**.**.**.10:2181",
"testgroup", //group
Map("test" -> 3) //topics
)
//Home machine
//bin/kafka-topics.sh --zookeeper sparkproject1:2181 --list //list Kafka topics
//Create a topic: bin/kafka-topics.sh --zookeeper sparkproject1:2181 --create --topic testsparkstreaming --partitions 3 --replication-factor 2 //3 partitions * replication factor 2 = 6 replicas in total; the partitions are defined first, then the replication factor
//Produce test data: bin/kafka-console-producer.sh --broker-list sparkproject1:9092 --topic testsparkstreaming
// 210 test machine
//Flatten the collected data; note that a Kafka message is really a (key, value) pair
val wordDStream : DStream[String] = kafkaDStream.flatMap(t => t._2.split(" "))
//Convert the data into (word, 1) pairs to ease aggregation
val mapDStream : DStream[(String,Int)] = wordDStream.map((_,1))
//Aggregate the restructured data by key
val wordToSumDStream : DStream[(String,Int)] = mapDStream.reduceByKey(_+_)
//Print the result
wordToSumDStream.print()
wordToSumDStream.repartition(1) //No effect: repartition returns a new DStream and the returned value is discarded here (see the sketch after this listing)
//Save the DStream to files
// wordToSumDStream.saveAsTextFiles("file:///D:\\workspace\\spark-test\\output\\sparkstreamingResult1") //Note: if the sparkstreamingResult1 folder has not been created manually, the results are written under the output directory
// wordToSumDStream.saveAsTextFiles("file:///D:/workspace/spark-test/output/sparkstreamingResult/sparkstreaming.txt")
// wordToSumDStream.saveAsTextFiles("file:///output/sparkstreamingResult/sparkstreaming.txt") // points to the root of the D: drive
wordToSumDStream.foreachRDD(
rdd => {
val arr : Array[(String, Int)] = rdd.collect()
if(arr!=null && arr.length>0){
println("key:"+ arr(0)._1+" value:" +arr(0)._2)
//Call the HTTP interface
val result = requestHTTP(arr(0)._1)
println("=======>HTTP接口调用结果:" + result)
}
}
)
//The streaming job must not be stopped here
//streamingContext.stop
//Start the receiver
streamingContext.start()
//The driver waits for the receiver to finish
streamingContext.awaitTermination()
//
}
/**
* Call the HTTP interface
* @param jobName
* @return
*/
def requestHTTP(jobName: String) = {
var data =""
var jobName1="bbb"
// Like getting hold of a browser; HTTP: (Get | Post | Put | Delete)
val httpClient = new HttpClient()
// Assemble the request parameters
val params = Map[String, String](
"jobName" -> URLEncoder.encode(jobName, "UTF-8"),
"jobName1" -> URLEncoder.encode(jobName1, "UTF-8")
).map(kv => kv._1 + "=" + kv._2).mkString("&")
val getMethod = new GetMethod("http://10.21.4.197:7772/src/main/test/sparkHTTP?" + params) //For an example implementation of this endpoint, see the "HTTP interface" controller at the end of this post
getMethod.addRequestHeader("Content-Type", "application/json;charset=UTF-8")
// Send the GET request
val status = httpClient.executeMethod(getMethod)
if (status == 200) {
val responseBodyAsString = getMethod.getResponseBodyAsString
val jsonParser = new JSONParser()
val jsonObj: JSONObject = jsonParser.parse(responseBodyAsString).asInstanceOf[JSONObject]
data = jsonObj.get("data").toString
}
// Release the connection whether or not the request succeeded
getMethod.releaseConnection()
data
}
}
//Not needed here because it is already defined in SparkStreaming03_MyReceiver
//Declare the receiver
//1) Extend Receiver
//class MyReceiver(host:String,port:Int) extends Receiver[String](StorageLevel.MEMORY_ONLY){
// // val socket = _
// var socket: java.net.Socket = null
//
// def receive(): Unit = {
// socket = new java.net.Socket(host,port)
// val reader = new BufferedReader(new InputStreamReader(socket.getInputStream,"UTF-8"))
// // Note: in Scala an assignment evaluates to Unit, so the Java-style
// // while((line = reader.readLine()) != null) idiom does not work; read first, then test.
// var line : String = reader.readLine()
// while(line != null){
// //Store the collected data inside the receiver
// if("END".equals(line)){ //end-of-stream marker
// return
// }else{ //normal data
// this.store(line)
// }
// line = reader.readLine()
// }
// }
//
// override def onStart(): Unit ={
// //Start a new thread that runs receive()
// new Thread(
// new Runnable {
// override def run(): Unit = {
// receive()
// }
// }
//
// ).start()
// }
//
// override def onStop(): Unit = {
// if(socket != null){
// socket.close()
// socket = null
// }
// }
//}
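A note on the repartition(1) call above that appeared to have no effect: DStream.repartition returns a new DStream rather than modifying the one it is called on, so the returned value has to be used. A minimal sketch of how those lines inside main could be rewritten (the output path is only an illustration, not taken from the original code):
//Coalesce to a single partition and use the returned DStream, e.g. when saving to files
val singlePartitionDStream: DStream[(String, Int)] = wordToSumDStream.repartition(1)
singlePartitionDStream.saveAsTextFiles("file:///D:/workspace/spark-test/output/sparkstreamingResult/part")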
HTTP interface:
package com.huayong.bi.web.controller;
import com.alibaba.fastjson.JSONObject;
import org.springframework.web.bind.annotation.*;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
/**
* Test controller
*/
@RestController
@RequestMapping("/src/main/test")
public class TestController {
/**
* HTTP endpoint called from Spark
* @param request
* @param response
*/
@CrossOrigin
@RequestMapping(value = "/sparkHTTP", method={RequestMethod.GET})
public String sparkHTTP(HttpServletRequest request, HttpServletResponse response) {
JSONObject jo = null;
try {
String jobName = request.getParameter("jobName");
String jobName1 = request.getParameter("jobName1");
System.out.println(jobName + "===" + jobName1);
jo = new JSONObject();
jo.put("code", 200);
jo.put("msg", "");
jo.put("data", "成功");
} catch (Exception e) {
e.printStackTrace();
}
// jo can still be null if an exception was thrown before it was initialized
return jo != null ? jo.toString() : "{}";
}
}
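For reference, the controller above returns a JSON body of the form {"code":200,"msg":"","data":"成功"}; the requestHTTP method in the Scala code parses that body with json-smart and returns only the "data" field, which is what the foreachRDD loop prints.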