Streaming Data Collection and Computation (15): Spark Streaming Usage Examples

-----Read a file from HDFS and print the word counts locally (plain batch RDD job)

val text = sc.textFile("hdfs://172.22.241.183:8020/user/spark/yzg_test.txt")

sc.textFile("hdfs://172.22.241.183:8020/user/spark/yzg_test.txt").flatMap(_.split(" ")).map((_, 1)).reduceByKey(_ + _).collect

--spark-shell: file stream

import org.apache.spark.streaming._
val ssc = new StreamingContext(sc, Seconds(5))
val lines = ssc.textFileStream("hdfs://172.22.241.183:8020/user/spark/yzg_test.txt")
val counts = lines.flatMap(_.split(" ")).map((_, 1)).reduceByKey(_ + _)
counts.saveAsTextFiles("hdfs://172.22.241.183:8020/user/spark/bendi-test")
ssc.start()
ssc.awaitTermination()
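
Note that textFileStream monitors a directory, not a single file, and only picks up files created in that directory after the stream starts, so pointing it at yzg_test.txt will mostly produce empty batches. A sketch that monitors a directory instead (the stream-in path is hypothetical; run it in a fresh spark-shell session and copy new files into the directory while the stream is running):

import org.apache.spark.streaming._
val ssc = new StreamingContext(sc, Seconds(5))
// each file dropped into the directory after start() is read as one micro-batch of lines
val lines = ssc.textFileStream("hdfs://172.22.241.183:8020/user/spark/stream-in")
val counts = lines.flatMap(_.split(" ")).map((_, 1)).reduceByKey(_ + _)
counts.print()
ssc.start()
ssc.awaitTermination()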

--spark-shell: socket stream, results saved to HDFS (tested OK)

import org.apache.spark._
import org.apache.spark.streaming._
import org.apache.spark.storage.StorageLevel

val ssc = new StreamingContext(sc, Seconds(5))
val lines = ssc.socketTextStream("172.22.241.184", 9990, StorageLevel.MEMORY_AND_DISK_SER)
val words = lines.flatMap(_.split(" "))
val wordCounts = words.map(x => (x, 1)).reduceByKey(_ + _)
wordCounts.saveAsTextFiles("hdfs://172.22.241.183:8020/user/spark/bendi-socket-now")
ssc.start()
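
These socket examples need something listening on 172.22.241.184:9990 that writes lines of text; assuming netcat is available on that host, a simple source is:

nc -lk 9990

Whatever is typed into that session is split into words and counted once per 5-second batch.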

--spark-shell: socket stream, results saved to a local file (tested OK)

import org.apache.spark._
import org.apache.spark.streaming._
import org.apache.spark.storage.StorageLevel

val ssc = new StreamingContext(sc, Seconds(5))
val lines = ssc.socketTextStream("172.22.241.184", 9990, StorageLevel.MEMORY_AND_DISK_SER)
val words = lines.flatMap(_.split(" "))
val wordCounts = words.map(x => (x, 1)).reduceByKey(_ + _)
wordCounts.saveAsTextFiles("file:/root/now")
ssc.start()
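
Note that saveAsTextFiles writes one output directory per batch, named prefix-<batch time in ms> (e.g. /root/now-1554810320000), so a new directory appears every 5 seconds rather than a single file being appended.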

--spark-shell: socket stream, results printed to the console (tested OK)

import org.apache.spark._
import org.apache.spark.streaming._
import org.apache.spark.storage.StorageLevel

val ssc = new StreamingContext(sc, Seconds(5))
val lines = ssc.socketTextStream("172.22.241.184", 9990, StorageLevel.MEMORY_AND_DISK_SER)
val words = lines.flatMap(_.split(" "))
val wordCounts = words.map(x => (x, 1)).reduceByKey(_ + _)
wordCounts.print
ssc.start()

Result:
(is,1)
(day,1)
(thsi,1)
(a,1)
(good,1)
(for,1)
------------spark-shell: integration of Spark SQL and Spark Streaming (tested OK)

import org.apache.spark._
import org.apache.spark.streaming._
import org.apache.spark.storage.StorageLevel
import org.apache.spark.sql.SparkSession

val ssc = new StreamingContext(sc, Seconds(5))
val lines = ssc.socketTextStream("172.22.241.184", 9990, StorageLevel.MEMORY_AND_DISK_SER)
val words = lines.flatMap(_.split(" "))
words.foreachRDD(rdd=>{
    val spark = SparkSession.builder.config(rdd.sparkContext.getConf).getOrCreate()
    import spark.implicits._
    val df1= rdd.toDF("word")
    df1.createOrReplaceTempView("_temp")
    spark.sql("select word,count(*) from _temp group by word").show()
})
ssc.start()
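
A small defensive variant (a sketch, not part of the tested snippet above): skip empty micro-batches so the SQL query is not executed when nothing arrived in the last 5 seconds:

words.foreachRDD(rdd => {
  if (!rdd.isEmpty()) {   // avoid running the query on empty batches
    val spark = SparkSession.builder.config(rdd.sparkContext.getConf).getOrCreate()
    import spark.implicits._
    rdd.toDF("word").createOrReplaceTempView("_temp")
    spark.sql("select word, count(*) as cnt from _temp group by word").show()
  }
})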

--------Programmatic API (standalone Scala application): read from a socket and save to HDFS (tested OK)
// this test case passes

package cmos.yzg.spark

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming._
import org.apache.spark.storage.StorageLevel
class SocketTest {
}
object SocketTest{
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("WordCount")
    val ssc = new StreamingContext(conf,Seconds(3))
    ssc.checkpoint("hdfs://172.22.241.183:8020/user/spark/ck")
    val lines = ssc.socketTextStream("172.22.241.184", 9990, StorageLevel.MEMORY_AND_DISK_SER)
    val wordCounts = lines.flatMap(_.split(" ")).map(x => (x, 1)).reduceByKey(_ + _)
    wordCounts.saveAsTextFiles("hdfs://172.22.241.183:8020/user/spark/bendi-socket-IDEA")
    ssc.start()
    ssc.awaitTermination()
  }
}
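
To run this class on the cluster it has to be packaged and submitted with spark-submit; a sketch, where the jar name is hypothetical and depends on the project build:

spark-submit --class cmos.yzg.spark.SocketTest /root/yzg-spark-streaming-1.0.jar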

--------Programmatic API: integration of Spark SQL and Spark Streaming (the SparkSession import fails to resolve)

package cmos.yzg.spark
import org.apache.spark._
import org.apache.spark.streaming._
import org.apache.spark.storage.StorageLevel
import org.apache.spark.sql.SparkSession

object Socket_Sql {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("WordCount")
    val ssc = new StreamingContext(conf,Seconds(3))
    ssc.checkpoint("hdfs://172.22.241.183:8020/user/spark/ck2")
    val lines = ssc.socketTextStream("172.22.241.184", 9990, StorageLevel.MEMORY_AND_DISK_SER)
    val words = lines.flatMap(_.split(" "))
    words.foreachRDD(rdd=>{
      val spark = SparkSession.builder.config(rdd.sparkContext.getConf).getOrCreate()
      import spark.implicits._
      val df1= rdd.toDF("word")
      df1.createOrReplaceTempView("_temp")
      spark.sql("select word,count(*) from _temp group by word").show()
    })
    ssc.start()
    ssc.awaitTermination()
  }

}
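
The unresolved SparkSession import is usually a missing spark-sql dependency in the project build rather than a problem in the code; a sketch of the sbt line, assuming a Spark 2.x cluster (2.1.0 is a placeholder, match the cluster's Spark version; the Maven equivalent is the spark-sql_<scala-version> artifact):

libraryDependencies += "org.apache.spark" %% "spark-sql" % "2.1.0" % "provided"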

---------Spark SQL + Spark Streaming integration copied from CSDN (not tested)

import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.{Seconds, StreamingContext}

/**
  * Created by Administrator on 2018/3/8.
  */
object SparkStreamingWordCountSparkSQLScala {
    def main(args: Array[String]): Unit = {
        val conf = new SparkConf()
        conf.setAppName("worldCount")
        conf.setMaster("local[2]")   // local[2]: one thread for the socket receiver, one for processing
        // batch interval of 2 seconds
        val ssc = new StreamingContext(conf, Seconds(2))
        ssc.checkpoint("file:///d:/java/chk")
        // create a socket text stream
        val lines = ssc.socketTextStream("s101", 8888)
        // flatten each line into a stream of words
        val words = lines.flatMap(_.split(" "))
        words.foreachRDD(rdd=>{
            val spark = SparkSession.builder.config(rdd.sparkContext.getConf).getOrCreate()
            import spark.implicits._
            val df1= rdd.toDF("word")
            df1.createOrReplaceTempView("_temp")
            spark.sql("select word,count(*) from _temp group by word").show()
        })
        // start the stream
        ssc.start()
        ssc.awaitTermination()
    }
}

--spark-shell: Kafka stream with a sliding-window operation (tested OK)

import org.apache.spark._
import org.apache.spark.streaming._
import org.apache.spark.streaming.kafka._
val ssc = new StreamingContext(sc, Seconds(5))
ssc.checkpoint("hdfs://172.22.241.183:8020/user/spark/cp-kafka")
val zkQuorum = "172.22.241.185:2181"
val group = "test-consumer-group"
val topics = "yzg_spark"
val numThreads = 1
val topicMap = topics.split(",").map((_, numThreads.toInt)).toMap
val lineMap = KafkaUtils.createStream(ssc, zkQuorum, group, topicMap)
val pair = lineMap.map(_._2).flatMap(_.split(" ")).map((_, 1))
val wordCounts = pair.reduceByKeyAndWindow(_ + _, _ - _, Minutes(2), Seconds(10), 2)
wordCounts.print
ssc.start
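
The (,4) entry in the output below comes from empty tokens (blank lines or repeated spaces). A sketch that filters them out and uses the simpler window form without an inverse reduce function (reusing the lineMap defined above; this form recomputes each window instead of updating it incrementally):

val cleaned = lineMap.map(_._2).flatMap(_.split(" ")).filter(_.nonEmpty).map((_, 1))
val windowed = cleaned.reduceByKeyAndWindow((a: Int, b: Int) => a + b, Minutes(2), Seconds(10))
windowed.print()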

-------Result:
Time: 1554810320000 ms
-------------------------------------------
(fdjslkfas,1)
(fjdsklfas,1)
(,4)
(fdsjlk,1)
(hgjk,1)
(jfkldaf,1)
(jkfdolowfr,1)
(jklaf,1)
(fdjsklagf,1)
(jkls,1)

-------spark-shell: Spark SQL over a Kafka stream, without a sliding window (tested OK)

import org.apache.spark._
import org.apache.spark.streaming._
import org.apache.spark.streaming.kafka._
import org.apache.spark.sql.SparkSession

val ssc = new StreamingContext(sc, Seconds(5))
ssc.checkpoint("hdfs://172.22.241.183:8020/user/spark/cp-kafka")
val zkQuorum = "172.22.241.185:2181"
val group = "test-consumer-group"
val topics = "yzg_spark"
val numThreads = 1
val topicMap = topics.split(",").map((_, numThreads.toInt)).toMap
val lineMap = KafkaUtils.createStream(ssc, zkQuorum, group, topicMap)
val pair = lineMap.map(_._2).flatMap(_.split(" "))
pair.foreachRDD(rdd => {
  val spark = SparkSession.builder.config(rdd.sparkContext.getConf).getOrCreate()
  import spark.implicits._
  val df1 = rdd.toDF("word")
  df1.createOrReplaceTempView("_temp")
  spark.sql("select word,count(*) from _temp group by word").show()
})
ssc.start

----------Test result:
+-----------------+--------+                                                    
|             word|count(1)|
+-----------------+--------+
|fdsjlfksajflkdsaf|       1|
|        jflalfsda|       1|
|        jfldaflsa|       1|
|   sdjfklsfjldsaf|       1|
|           fjdkll|       1|
|                 |      39|
|     sadjfklsdfas|       1|
|        fjdslkfaf|       1|
+-----------------+--------+

---------------------------------------------Which version of SparkSession is being used?
---------------------------------------------Which version of the kafka utils (spark-streaming-kafka) package is being used?
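
The first question can be answered directly from the running shell; the second depends on which connector jar is on the classpath: the receiver-based KafkaUtils.createStream used above comes from the 0.8-compatible connector (spark-streaming-kafka in Spark 1.x, spark-streaming-kafka-0-8 in Spark 2.x).

sc.version   // prints the Spark version of the running spark-shell, which is also the SparkSession version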

----------------Stateful transformations

updateStateByKey keeps a running total per key across batches: for each key, the update function receives the counts of the current batch (values) and the previously stored total (state), and returns the new total. Checkpointing must be enabled for this to work.

val updateFunc = (values: Seq[Int], state: Option[Int]) => {
  val currentCount = values.foldLeft(0)(_ + _)
  val previousCount = state.getOrElse(0)
  Some(currentCount + previousCount)
}
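
A quick sanity check of the update function, runnable in the same spark-shell session (the inputs are made-up values):

updateFunc(Seq(1, 1, 1), Some(2))   // Some(5): 3 occurrences in this batch + 2 carried over from state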

import org.apache.spark._
import org.apache.spark.streaming._
import org.apache.spark.storage.StorageLevel
val ssc = new StreamingContext(sc, Seconds(5))
ssc.checkpoint("hdfs://172.22.241.183:8020/user/spark/checkpoint")
val lines = ssc.socketTextStream("172.22.241.184",9999,StorageLevel.MEMORY_AND_DISK_SER)
val wordCounts = lines.flatMap(_.split(" ")).map((_,1)).updateStateByKey[Int](updateFunc)
wordCounts.saveAsTextFiles("hdfs://172.22.241.183:8020/user/spark/bendi-socket-new")
ssc.start()

-----------Submitting and patching the JAR

spark-submit --class "JSONRead" /usr/local/spark/mycode/json/target/scala-2.11/json-project_2.11-1.0.jar
spark-submit --class "SparkFromKafka" /root/Shit-1.0-SNAPSHOT-shaded.jar
# remove signature files pulled in from dependencies, otherwise the shaded jar fails with "Invalid signature file digest" at submit time
zip -d /root/Shit-1.0-SNAPSHOT-shaded.jar 'META-INF/*.SF' 'META-INF/*.RSA' 'META-INF/*.DSA'
