The foreachRDD method is mainly used to send data to external systems.
foreachRDD iterates over the RDDs produced by a DStream (one per batch).
foreachPartition iterates over the partitions within an RDD.
foreach iterates over each record within a partition.
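How the three levels nest, and where each closure runs — a minimal sketch (dstream stands for any DStream; the send logic is a placeholder):
dstream.foreachRDD { rdd =>              // runs on the driver, once per batch
  rdd.foreachPartition { partition =>    // runs on an executor, once per partition
    partition.foreach { record =>        // runs on the executor, once per record
      // push the record to the external system here
    }
  }
}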
Pitfall: creating the connection on the driver. The connection object would have to be serialized and shipped to the workers, and connection objects are rarely serializable:
dstream.foreachRDD { rdd =>
  val connection = createNewConnection() // executed at the driver
  rdd.foreach { record =>
    connection.send(record) // executed at the worker
  }
}
Pitfall: creating a connection for every record. This works, but the connection setup/teardown cost is paid once per record:
dstream.foreachRDD { rdd =>
  rdd.foreach { record =>
    val connection = createNewConnection()
    connection.send(record)
    connection.close()
  }
}
Solution: create one connection per partition:
dstream.foreachRDD { rdd =>
  rdd.foreachPartition { partitionOfRecords =>
    val connection = createNewConnection()
    partitionOfRecords.foreach(record => connection.send(record))
    connection.close()
  }
}
Better solution: reuse connections across partitions and batches with a connection pool:
dstream.foreachRDD { rdd =>
  rdd.foreachPartition { partitionOfRecords =>
    // ConnectionPool is a static, lazily initialized pool of connections
    val connection = ConnectionPool.getConnection()
    partitionOfRecords.foreach(record => connection.send(record))
    ConnectionPool.returnConnection(connection) // return to the pool for future reuse
  }
}
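Spark does not ship a ConnectionPool; you supply your own. A minimal sketch, assuming a hypothetical Connection type and createNewConnection() factory from your client library:
import java.util.concurrent.ConcurrentLinkedQueue

// Connection and createNewConnection() are placeholders for your client library.
object ConnectionPool {
  // Lazily initialized queue of idle connections.
  private lazy val pool = new ConcurrentLinkedQueue[Connection]()

  def getConnection(): Connection =
    // Reuse an idle connection if one is available, otherwise open a new one.
    Option(pool.poll()).getOrElse(createNewConnection())

  def returnConnection(connection: Connection): Unit =
    pool.offer(connection)
}
Because a Scala object is initialized once per executor JVM, each executor keeps its own pool and the connections survive across batches.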
reduceByKey
Like the map-side combine (merge) in MapReduce: values are first merged within each map task, and only the partial results are shuffled.
groupByKey
Shuffles every record directly, with no map-side merge.
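A small RDD example contrasting the two (sc is assumed to be an existing SparkContext):
val pairs = sc.parallelize(Seq(("a", 1), ("a", 1), ("b", 1)))

// Partial sums are computed map-side, then shuffled.
val viaReduce = pairs.reduceByKey(_ + _)

// Every record is shuffled, then summed per key.
val viaGroup = pairs.groupByKey().mapValues(_.sum)

viaReduce.collect() // Array((a,2), (b,1)) -- order may vary
viaGroup.collect()  // same counts
Both return the same result; the difference is only how much data crosses the network.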
// All values are folded into a single value; returns just the value (a DStream[T])
// e.g. reduce(_ + _)
def reduce(reduceFunc: (T, T) => T): DStream[T] = ssc.withScope {
  this.map((null, _)).reduceByKey(reduceFunc, 1).map(_._2)
}
// Values with the same key are folded into one value; returns <k, v> pairs (a DStream[(K, V)])
def reduceByKey(reduceFunc: (V, V) => V): DStream[(K, V)] = ssc.withScope {
  reduceByKey(reduceFunc, defaultPartitioner())
}
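Typical usage on a stream — a sketch assuming lines is a DStream[String] (e.g. from socketTextStream):
val words = lines.flatMap(_.split(" "))

// One (word, count) pair per key, per batch.
val wordCounts = words.map((_, 1)).reduceByKey(_ + _)

// A single total per batch.
val totalPerBatch = words.map(_ => 1L).reduce(_ + _)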
Broadcast variables can be defined directly and then used:
val keyWorld = Map("a" -> "b", "b" -> "c")
val bro = ssc.sparkContext.broadcast(keyWorld)
bro.value.getOrElse("a", "c")
To refresh the broadcast value on every batch, rebuild it inside foreachRDD, which runs on the driver:
obj.foreachRDD(x => {
  // This block runs on the driver, so a new broadcast can be created per batch.
  var keyWorld: Map[String, Double] = Map()
  val li = x.filter(_._1 == "c").take(1)
  if (li.length == 1) {
    keyWorld += (li(0)._1 -> li(0)._2)
  }
  val bro = ssc.sparkContext.broadcast(keyWorld)
  x.filter(_._1 != "c").foreach(z => {
    println("key " + z._1)
    println("value " + z._2)
    println("c " + bro.value.getOrElse("c", 0.0))
  })
})
1. Run locally:
./bin/spark-submit \
--class com.demo.TestSparkStreaming \
--master local[2] \
/xx/spark-streaming-demo-1.0-SNAPSHOT.jar \
2
2. Deploy on a standalone cluster:
./bin/spark-submit \
--class com.demo.TestSparkStreaming \
--master spark://centos-6:7077 \
--executor-memory 2G \
/opt/soft/spark-2.3.0-bin-hadoop2.7/spark-streaming-demo-1.0-SNAPSHOT.jar \
2
3. Deploy on YARN, driver runs in the cluster (cluster mode):
./bin/spark-submit \
--class com.demo.SparkKafka \
--master yarn \
--deploy-mode cluster \
--executor-memory 2G \
/opt/soft/spark-2.3.0-bin-hadoop2.7/spark-streaming-demo-1.0-SNAPSHOT.jar
4. Deploy on YARN, driver runs locally (client mode, the default):
./bin/spark-submit \
--class com.demo.TestSparkStreaming \
--master yarn \
--executor-memory 2G \
/xxx/spark-streaming-demo-1.0-SNAPSHOT.jar
In YARN mode, pay attention to variable scope:
import java.util

val list: util.ArrayList[String] = new util.ArrayList[String]
obj.foreachRDD(x => {
  // The driver-side variable cannot be accessed here
  println("list size " + list.size())
  x.foreachPartition(z => {
    // A variable defined inside each partition works fine
    var featureMap: Map[String, Double] = Map()
    z.foreach(
      t => {
        println("key1:" + t._1)
        println("value1:" + t._2)
        featureMap += (t._1 -> t._2)
      }
    )
    println("map size " + featureMap.size)
  })
})
Spreading the data across partitions:
val data = Array(1, 2, 3, 4, 5)
val distData = sc.parallelize(data)
// repartition performs a full shuffle and returns a new RDD; keep the result
val repartitioned = distData.repartition(2)
println(repartitioned.getNumPartitions) // 2