Flink PV and UV Counting, and Bloom Filters
- PV (code)
- UV (code)
  - 1: Set-based dedup (user IDs are stored in a Set, which keeps no duplicates; all data sits in memory, so with a large dataset the memory demand is enormous and this is nearly infeasible at scale)
  - 2: UVWithBloom (a Bloom filter backed by a Redis bitmap)
PV (code): count every "pv" event in one-hour tumbling event-time windows.
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.windowing.time.Time

// Input record shared by all examples in this section; fields match the CSV columns.
case class UserBehavior(userId: Long, itemId: Long, categoryId: Int, behavior: String, timestamp: Long)

object PageView {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)
    env.setParallelism(1)

    val resource = getClass.getResource("/UserBehavior.csv")
    val dataStream = env.readTextFile(resource.getPath)
      .map(data => {
        val dataArray = data.split(",")
        UserBehavior(dataArray(0).trim.toLong, dataArray(1).trim.toLong, dataArray(2).trim.toInt, dataArray(3).trim, dataArray(4).trim.toLong)
      })
      // timestamps in the file are in seconds; Flink expects milliseconds
      .assignAscendingTimestamps(_.timestamp * 1000L)
      .filter(_.behavior == "pv")
      .map(data => ("pv", 1))    // every pv event contributes 1
      .keyBy(_._1)               // single constant key: a global count
      .timeWindow(Time.hours(1)) // one-hour tumbling window
      .sum(1)

    dataStream.print("pv count")
    env.execute("page view job")
  }
}
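The sum(1) above already aggregates incrementally, but its output carries no window information. As a minimal sketch (assuming the same stream as above; PVCountAgg and PVWindowResult are illustrative names, not part of the original), an AggregateFunction can be paired with a WindowFunction so each result also carries its window end, matching the UVCount-style output used in the UV examples below:

import org.apache.flink.api.common.functions.AggregateFunction
import org.apache.flink.streaming.api.scala.function.WindowFunction
import org.apache.flink.streaming.api.windowing.windows.TimeWindow
import org.apache.flink.util.Collector

// Incremental counter: the window state is a single Long, not a buffer of tuples.
class PVCountAgg extends AggregateFunction[(String, Int), Long, Long] {
  override def createAccumulator(): Long = 0L
  override def add(value: (String, Int), acc: Long): Long = acc + 1
  override def getResult(acc: Long): Long = acc
  override def merge(a: Long, b: Long): Long = a + b
}

// Attaches the window end to the pre-aggregated count.
class PVWindowResult extends WindowFunction[Long, (Long, Long), String, TimeWindow] {
  override def apply(key: String, window: TimeWindow, input: Iterable[Long], out: Collector[(Long, Long)]): Unit =
    out.collect((window.getEnd, input.head)) // (windowEnd, pv count)
}

Usage: replace .sum(1) with .aggregate(new PVCountAgg(), new PVWindowResult()).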
UV (code):
1: Set-based dedup. Store each userId in a Set, which keeps no duplicates; the UV count is the Set's size. All of the data sits in memory, so when the data volume is large the memory demand becomes enormous, and this approach is nearly infeasible at scale.
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.scala.function.AllWindowFunction
import org.apache.flink.streaming.api.windowing.time.Time
import org.apache.flink.streaming.api.windowing.windows.TimeWindow
import org.apache.flink.util.Collector

case class UVCount(windowEnd: Long, count: Long)

object UniqueVisitor {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)
    env.setParallelism(1)

    val resource = getClass.getResource("/UserBehavior.csv")
    val dataStream = env.readTextFile(resource.getPath)
      .map(data => {
        val dataArray = data.split(",")
        UserBehavior(dataArray(0).trim.toLong, dataArray(1).trim.toLong, dataArray(2).trim.toInt, dataArray(3).trim, dataArray(4).trim.toLong)
      })
      .assignAscendingTimestamps(_.timestamp * 1000L)
      .filter(_.behavior == "pv")
      // non-keyed window over the whole stream; the window buffers every record
      .timeWindowAll(Time.hours(1))
      .apply(new UVCountByWindow())

    dataStream.print("uv count")
    env.execute("unique visitor job")
  }
}

class UVCountByWindow() extends AllWindowFunction[UserBehavior, UVCount, TimeWindow] {
  override def apply(window: TimeWindow, input: Iterable[UserBehavior], out: Collector[UVCount]): Unit = {
    // the Set drops duplicate userIds; its size is the UV count for this window
    var idSet = Set[Long]()
    for (x <- input) {
      idSet += x.userId
    }
    out.collect(UVCount(window.getEnd, idSet.size))
  }
}
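A rough back-of-envelope estimate (assumed numbers, for illustration only): with 10^8 distinct users in one window, the raw Long IDs alone take 10^8 × 8 B = 800 MB, and a Set's per-entry object overhead multiplies that several times over, so a single window can demand multiple GB of heap. This is why the Set approach is called nearly infeasible above, and why the next version moves the dedup state out of the JVM into a Redis bitmap.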
2: UVWithBloom. Instead of buffering records, the custom trigger fires and purges on every element, so the window itself holds no state. Dedup is done probabilistically with a Bloom filter whose bits live in a Redis bitmap, and the running count per window is kept in a Redis hash.
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.scala.function.ProcessWindowFunction
import org.apache.flink.streaming.api.windowing.time.Time
import org.apache.flink.streaming.api.windowing.triggers.{Trigger, TriggerResult}
import org.apache.flink.streaming.api.windowing.windows.TimeWindow
import org.apache.flink.util.Collector
import redis.clients.jedis.Jedis

object UniqueVisitorBloom {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)
    env.setParallelism(1)

    val resource = getClass.getResource("/UserBehavior.csv")
    val dataStream = env.readTextFile(resource.getPath)
      .map(data => {
        val dataArray = data.split(",")
        UserBehavior(dataArray(0).trim.toLong, dataArray(1).trim.toLong, dataArray(2).trim.toInt, dataArray(3).trim, dataArray(4).trim.toLong)
      })
      .assignAscendingTimestamps(_.timestamp * 1000L)
      .filter(_.behavior == "pv")
      .map(x => ("uv", x.userId))
      .keyBy(_._1)
      .timeWindow(Time.hours(1))
      .trigger(new MyTrigger())        // fire and purge on every element
      .process(new UVCountWithBloom())

    dataStream.print()
    env.execute("uv with bloom job")
  }
}
// Fires and purges on every element, so the window never accumulates state;
// all bookkeeping lives in Redis instead.
class MyTrigger() extends Trigger[(String, Long), TimeWindow] {
  override def onElement(element: (String, Long), timestamp: Long, window: TimeWindow, ctx: Trigger.TriggerContext): TriggerResult =
    TriggerResult.FIRE_AND_PURGE
  override def onEventTime(time: Long, window: TimeWindow, ctx: Trigger.TriggerContext): TriggerResult = TriggerResult.CONTINUE
  override def onProcessingTime(time: Long, window: TimeWindow, ctx: Trigger.TriggerContext): TriggerResult = TriggerResult.CONTINUE
  override def clear(window: TimeWindow, ctx: Trigger.TriggerContext): Unit = {}
}
// A minimal Bloom-filter helper: maps a string to one bit offset in a bitmap of
// `cap` bits. cap must be a power of two so that `& (cap - 1)` works as a cheap modulo.
class Bloom(size: Long) extends Serializable {
  private val cap = if (size > 0) size else 1 << 29 // default 2^29 bits = 64 MB

  // simple polynomial rolling hash, parameterized by `seed`
  def hash(value: String, seed: Int): Long = {
    var result = 0L
    for (i <- 0 until value.length) {
      result = result * seed + value.charAt(i)
    }
    result & (cap - 1)
  }
}
class UVCountWithBloom() extends ProcessWindowFunction[(String, Long), UVCount, String, TimeWindow] {
  // lazy so these are created on the task manager, not serialized from the client
  lazy val jedis = new Jedis("node01", 6379)
  lazy val bloom = new Bloom(1 << 29)

  override def process(key: String, context: Context, elements: Iterable[(String, Long)], out: Collector[UVCount]): Unit = {
    // one Redis bitmap per window, keyed by the window's end timestamp
    val storeKey = context.window.getEnd.toString

    // current UV count for this window, stored in the Redis hash "count"
    var count = 0L
    if (jedis.hget("count", storeKey) != null) {
      count = jedis.hget("count", storeKey).toLong
    }

    // FIRE_AND_PURGE means elements holds exactly the latest record
    val userId = elements.last._2.toString
    val offset = bloom.hash(userId, 61)
    val isExist = jedis.getbit(storeKey, offset)
    if (!isExist) {
      // first time this (hashed) user is seen: set its bit and bump the count
      jedis.setbit(storeKey, offset, true)
      jedis.hset("count", storeKey, (count + 1).toString)
      out.collect(UVCount(storeKey.toLong, count + 1))
    } else {
      out.collect(UVCount(storeKey.toLong, count))
    }
  }
}
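One caveat: with a single hash function, two different userIds can collide on the same bit, and the second user is then silently excluded from the count (a Bloom-filter false positive undercounts UV). A standard mitigation is to probe k independent seeds and treat a user as seen only if all k bits are already set. A sketch, assuming the Bloom class above is reused (BloomProbe, seenBefore, and the seed values are illustrative, not part of the original):

import redis.clients.jedis.Jedis

object BloomProbe {
  private val seeds = Seq(61, 67, 71) // arbitrary illustrative seeds

  // Returns true if the user was already recorded; otherwise records it.
  def seenBefore(jedis: Jedis, bloom: Bloom, storeKey: String, userId: String): Boolean = {
    val offsets = seeds.map(seed => bloom.hash(userId, seed))
    val allSet = offsets.forall(offset => jedis.getbit(storeKey, offset))
    if (!allSet) offsets.foreach(offset => jedis.setbit(storeKey, offset, true))
    allSet
  }
}

In UVCountWithBloom, the single getbit/setbit pair would then be replaced by one seenBefore call; more seeds lower the false-positive rate at the cost of extra Redis round trips per element.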