-----Read a file from HDFS and print the word counts
val text = sc.textFile("hdfs://172.22.241.183:8020/user/spark/yzg_test.txt")
sc.textFile("hdfs://172.22.241.183:8020/user/spark/yzg_test.txt").flatMap(_.split(" ")).map((_,1)).reduceByKey(_+_).collect
--spark-shell file stream
import org.apache.spark.streaming._
val ssc = new StreamingContext(sc, Seconds(5))
// textFileStream monitors a directory (not a single file) for newly created files;
// stream-in/ here is a stand-in for whatever directory the new files land in
val lines = ssc.textFileStream("hdfs://172.22.241.183:8020/user/spark/stream-in/")
val counts = lines.flatMap(_.split(" ")).map((_, 1)).reduceByKey(_ + _)
counts.saveAsTextFiles("hdfs://172.22.241.183:8020/user/spark/bendi-test")
ssc.start()
ssc.awaitTermination()
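--note: textFileStream only picks up files created after the stream starts, so nothing appears for a pre-existing file. To drive the example, drop a fresh copy into the monitored directory while the context is running (file name is illustrative):
hdfs dfs -put yzg_test.txt hdfs://172.22.241.183:8020/user/spark/stream-in/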
--spark-shell socket stream, results saved to HDFS (tested, works)
--note: saveAsTextFiles writes a new output directory per batch, named bendi-socket-now-<batch time in ms>
import org.apache.spark._
import org.apache.spark.streaming._
import org.apache.spark.storage.StorageLevel
val ssc = new StreamingContext(sc, Seconds(5))
val lines = ssc.socketTextStream("172.22.241.184", 9990, StorageLevel.MEMORY_AND_DISK_SER)
val words = lines.flatMap(_.split(" "))
val wordCounts = words.map(x => (x, 1)).reduceByKey(_ + _)
wordCounts.saveAsTextFiles("hdfs://172.22.241.183:8020/user/spark/bendi-socket-now")
ssc.start()
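--the socket examples need a text server listening on 172.22.241.184:9990 first; the usual quick way is netcat (this applies to every socket test in these notes):
nc -lk 9990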
--spark-shell socket stream, results saved to a local file (tested, works)
import org.apache.spark._
import org.apache.spark.streaming._
import org.apache.spark.storage.StorageLevel
val ssc = new StreamingContext(sc, Seconds(5))
val lines = ssc.socketTextStream("172.22.241.184", 9990, StorageLevel.MEMORY_AND_DISK_SER)
val words = lines.flatMap(_.split(" "))
val wordCounts = words.map(x => (x, 1)).reduceByKey(_ + _)
wordCounts.saveAsTextFiles("file:/root/now")
ssc.start()
--spark-shell socket stream, results printed to the console (tested, works)
import org.apache.spark._
import org.apache.spark.streaming._
import org.apache.spark.storage.StorageLevel
val ssc = new StreamingContext(sc, Seconds(5))
val lines = ssc.socketTextStream("172.22.241.184", 9990, StorageLevel.MEMORY_AND_DISK_SER)
val words = lines.flatMap(_.split(" "))
val wordCounts = words.map(x => (x, 1)).reduceByKey(_ + _)
wordCounts.print()
ssc.start()
Result:
(is,1)
(day,1)
(thsi,1)
(a,1)
(good,1)
(for,1)
------------spark-shell Spark SQL + Spark Streaming integration (tested, works)
import org.apache.spark._
import org.apache.spark.streaming._
import org.apache.spark.storage.StorageLevel
import org.apache.spark.sql.SparkSession
val ssc = new StreamingContext(sc, Seconds(5))
val lines = ssc.socketTextStream("172.22.241.184", 9990, StorageLevel.MEMORY_AND_DISK_SER)
val words = lines.flatMap(_.split(" "))
words.foreachRDD(rdd => {
  // get-or-create a singleton SparkSession so every micro-batch reuses the same session
  val spark = SparkSession.builder.config(rdd.sparkContext.getConf).getOrCreate()
  import spark.implicits._
  val df1 = rdd.toDF("word")
  df1.createOrReplaceTempView("_temp")
  spark.sql("select word,count(*) from _temp group by word").show()
})
ssc.start()
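--one refinement worth noting (my addition, not part of the tested snippet above): guard against empty micro-batches before converting to a DataFrame, otherwise show() prints an empty table every idle interval. A minimal sketch:
words.foreachRDD(rdd => {
  if (!rdd.isEmpty()) {  // skip batches that received no data
    val spark = SparkSession.builder.config(rdd.sparkContext.getConf).getOrCreate()
    import spark.implicits._
    rdd.toDF("word").createOrReplaceTempView("_temp")
    spark.sql("select word,count(*) from _temp group by word").show()
  }
})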
--------standalone Scala app (built in IDEA): read a socket stream and save to HDFS (tested, works)
// this test case passes
package cmos.yzg.spark

import org.apache.spark.SparkConf
import org.apache.spark.streaming._
import org.apache.spark.storage.StorageLevel

object SocketTest {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("WordCount")
    val ssc = new StreamingContext(conf, Seconds(3))
    ssc.checkpoint("hdfs://172.22.241.183:8020/user/spark/ck")
    val lines = ssc.socketTextStream("172.22.241.184", 9990, StorageLevel.MEMORY_AND_DISK_SER)
    val wordCounts = lines.flatMap(_.split(" ")).map(x => (x, 1)).reduceByKey(_ + _)
    wordCounts.saveAsTextFiles("hdfs://172.22.241.183:8020/user/spark/bendi-socket-IDEA")
    ssc.start()
    ssc.awaitTermination()
  }
}
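--packaged and submitted the same way as the jars at the end of these notes, e.g. (jar path is illustrative):
spark-submit --class cmos.yzg.spark.SocketTest /root/socket-test.jar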
--------standalone Scala app: Spark SQL + Spark Streaming integration (the SparkSession import failed to resolve)
package cmos.yzg.spark

import org.apache.spark._
import org.apache.spark.streaming._
import org.apache.spark.storage.StorageLevel
// SparkSession lives in the spark-sql module; this import fails unless spark-sql is on the classpath
import org.apache.spark.sql.SparkSession

object Socket_Sql {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("WordCount")
    val ssc = new StreamingContext(conf, Seconds(3))
    ssc.checkpoint("hdfs://172.22.241.183:8020/user/spark/ck2")
    val lines = ssc.socketTextStream("172.22.241.184", 9990, StorageLevel.MEMORY_AND_DISK_SER)
    val words = lines.flatMap(_.split(" "))
    words.foreachRDD(rdd => {
      val spark = SparkSession.builder.config(rdd.sparkContext.getConf).getOrCreate()
      import spark.implicits._
      val df1 = rdd.toDF("word")
      df1.createOrReplaceTempView("_temp")
      spark.sql("select word,count(*) from _temp group by word").show()
    })
    ssc.start()
    ssc.awaitTermination()
  }
}
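--the import error above is almost always a missing dependency: SparkSession is in the spark-sql module, not spark-core. A minimal sbt fix, assuming a Spark 2.x cluster (match the version and Scala suffix to the cluster):
libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-core" % "2.1.0" % "provided",
  "org.apache.spark" %% "spark-streaming" % "2.1.0" % "provided",
  "org.apache.spark" %% "spark-sql" % "2.1.0" % "provided"  // provides org.apache.spark.sql.SparkSession
)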
---------Scala code copied from CSDN: Spark SQL + Spark Streaming integration (untested)
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.{Seconds, StreamingContext}

/**
  * Created by Administrator on 2018/3/8.
  */
object SparkStreamingWordCountSparkSQLScala {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    conf.setAppName("wordCount")
    // local[2]: the socket receiver occupies one core, so at least two are needed
    conf.setMaster("local[2]")
    // batch interval of 2 seconds
    val ssc = new StreamingContext(conf, Seconds(2))
    ssc.checkpoint("file:///d:/java/chk")
    // create a socket text stream
    val lines = ssc.socketTextStream("s101", 8888)
    // flatten the lines into a stream of words
    val words = lines.flatMap(_.split(" "))
    words.foreachRDD(rdd => {
      val spark = SparkSession.builder.config(rdd.sparkContext.getConf).getOrCreate()
      import spark.implicits._
      val df1 = rdd.toDF("word")
      df1.createOrReplaceTempView("_temp")
      spark.sql("select word,count(*) from _temp group by word").show()
    })
    // start the stream
    ssc.start()
    ssc.awaitTermination()
  }
}
--spark-shell Kafka stream with a sliding-window operation (tested, works)
import org.apache.spark._
import org.apache.spark.streaming._
import org.apache.spark.streaming.kafka._
val ssc = new StreamingContext(sc, Seconds(5))
// the inverse-function form of reduceByKeyAndWindow requires checkpointing
ssc.checkpoint("hdfs://172.22.241.183:8020/user/spark/cp-kafka")
val zkQuorum = "172.22.241.185:2181"
val group = "test-consumer-group"
val topics = "yzg_spark"
val numThreads = 1
val topicMap = topics.split(",").map((_, numThreads.toInt)).toMap
val lineMap = KafkaUtils.createStream(ssc, zkQuorum, group, topicMap)
val pair = lineMap.map(_._2).flatMap(_.split(" ")).map((_, 1))
// 2-minute window sliding every 10 seconds; _ - _ subtracts the slice that left the window
// (window and slide must both be multiples of the 5-second batch interval)
val wordCounts = pair.reduceByKeyAndWindow(_ + _, _ - _, Minutes(2), Seconds(10), 2)
wordCounts.print()
ssc.start()
-------Result:
Time: 1554810320000 ms
-------------------------------------------
(fdjslkfas,1)
(fjdsklfas,1)
(,4)
(fdsjlk,1)
(hgjk,1)
(jfkldaf,1)
(jkfdolowfr,1)
(jklaf,1)
(fdjsklagf,1)
(jkls,1)
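--to feed the yzg_spark topic during these tests, the stock console producer works; the broker address below is an assumption (only the ZooKeeper quorum appears in these notes):
kafka-console-producer.sh --broker-list 172.22.241.185:9092 --topic yzg_spark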
-------spark-shell Spark SQL over a Kafka stream, no window operation (tested, works)
import org.apache.spark._
import org.apache.spark.streaming._
import org.apache.spark.streaming.kafka._
import org.apache.spark.sql.SparkSession
val ssc = new StreamingContext(sc, Seconds(5))
ssc.checkpoint("hdfs://172.22.241.183:8020/user/spark/cp-kafka")
val zkQuorum = "172.22.241.185:2181"
val group = "test-consumer-group"
val topics = "yzg_spark"
val numThreads = 1
val topicMap = topics.split(",").map((_, numThreads.toInt)).toMap
val lineMap = KafkaUtils.createStream(ssc, zkQuorum, group, topicMap)
val pair = lineMap.map(_._2).flatMap(_.split(" "))
pair.foreachRDD(rdd => {
  val spark = SparkSession.builder.config(rdd.sparkContext.getConf).getOrCreate()
  import spark.implicits._
  val df1 = rdd.toDF("word")
  df1.createOrReplaceTempView("_temp")
  spark.sql("select word,count(*) from _temp group by word").show()
})
ssc.start()
----------Test result:
+-----------------+--------+
| word|count(1)|
+-----------------+--------+
|fdsjlfksajflkdsaf| 1|
| jflalfsda| 1|
| jfldaflsa| 1|
| sdjfklsfjldsaf| 1|
| fjdkll| 1|
| | 39|
| sadjfklsdfas| 1|
| fjdslkfaf| 1|
+-----------------+--------+
---------------------------------------------open question: which version provides SparkSession? (it was added in Spark 2.0, in the spark-sql module)
---------------------------------------------open question: which artifact provides KafkaUtils? (the receiver-based createStream used above comes from spark-streaming-kafka for Spark 1.x, spark-streaming-kafka-0-8 for Spark 2.x)
----------------Stateful transformation (updateStateByKey)
// merge this batch's counts for a key (values) into the running total carried in state
val updateFunc = (values: Seq[Int], state: Option[Int]) => {
  val currentCount = values.foldLeft(0)(_ + _)
  val previousCount = state.getOrElse(0)
  Some(currentCount + previousCount)
}
import org.apache.spark._
import org.apache.spark.streaming._
import org.apache.spark.storage.StorageLevel
val ssc = new StreamingContext(sc, Seconds(5))
// updateStateByKey requires a checkpoint directory for the state
ssc.checkpoint("hdfs://172.22.241.183:8020/user/spark/checkpoint")
val lines = ssc.socketTextStream("172.22.241.184", 9999, StorageLevel.MEMORY_AND_DISK_SER)
val wordCounts = lines.flatMap(_.split(" ")).map((_, 1)).updateStateByKey[Int](updateFunc)
wordCounts.saveAsTextFiles("hdfs://172.22.241.183:8020/user/spark/bendi-socket-new")
ssc.start()
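--note: since these shell snippets call ssc.start() without awaitTermination(), the stream keeps running in the background; it can be stopped without killing the shell's SparkContext:
ssc.stop(stopSparkContext = false)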
-----------Submitting the jar, and fixing a signed shaded jar
spark-submit --class "JSONRead" /usr/local/spark/mycode/json/target/scala-2.11/json-project_2.11-1.0.jar
spark-submit --class "SparkFromKafka" /root/Shit-1.0-SNAPSHOT-shaded.jar
--the shaded jar carries stale signature files that make spark-submit fail with "Invalid signature file digest"; strip them (note the *. wildcards, which the original command was missing):
zip -d /root/Shit-1.0-SNAPSHOT-shaded.jar 'META-INF/*.SF' 'META-INF/*.RSA' 'META-INF/*.DSA'