3. After analysis by the Spark Streaming real-time computation program, the results are written to Redis, so user behavior data can be retrieved in real time and can also be exported for offline, comprehensive statistical analysis.
{ "uid": "068b746ed4620d25e26055a9f804385f", "event_time": "1430204612405", "os_type": "Android", "click_count": 6 }一个事件包含4个字段:
package com.iteblog.spark.streaming.utils

import java.util.Properties
import scala.util.Random

import org.codehaus.jettison.json.JSONObject

import kafka.javaapi.producer.Producer
import kafka.producer.KeyedMessage
import kafka.producer.ProducerConfig

object KafkaEventProducer {

  private val users = Array(
    "4A4D769EB9679C054DE81B973ED5D768", "8dfeb5aaafc027d89349ac9a20b3930f",
    "011BBF43B89BFBF266C865DF0397AA71", "f2a8474bf7bd94f0aabbd4cdd2c06dcf",
    "068b746ed4620d25e26055a9f804385f", "97edfc08311c70143401745a03a50706",
    "d7f141563005d1b5d0d3dd30138f3f62", "c8ee90aade1671a21336c721512b817a",
    "6b67c8c700427dee7552f81f3228c927", "a95f22eabc4fd4b580c011a3161a9d9d")

  private val random = new Random()

  private var pointer = -1

  // Cycle through the simulated user IDs
  def getUserID(): String = {
    pointer = pointer + 1
    if (pointer >= users.length) {
      pointer = 0
      users(pointer)
    } else {
      users(pointer)
    }
  }

  def click(): Double = {
    random.nextInt(10)
  }

  // bin/kafka-topics.sh --zookeeper zk1:2181,zk2:2181,zk3:2181/kafka --create --topic user_events --replication-factor 2 --partitions 2
  // bin/kafka-topics.sh --zookeeper zk1:2181,zk2:2181,zk3:2181/kafka --list
  // bin/kafka-topics.sh --zookeeper zk1:2181,zk2:2181,zk3:2181/kafka --describe user_events
  // bin/kafka-console-consumer.sh --zookeeper zk1:2181,zk2:2181,zk3:2181/kafka --topic test_json_basis_event --from-beginning

  def main(args: Array[String]): Unit = {
    val topic = "user_events"
    val brokers = "10.10.4.126:9092,10.10.4.127:9092"
    val props = new Properties()
    props.put("metadata.broker.list", brokers)
    props.put("serializer.class", "kafka.serializer.StringEncoder")

    val kafkaConfig = new ProducerConfig(props)
    val producer = new Producer[String, String](kafkaConfig)

    while (true) {
      // prepare event data
      val event = new JSONObject()
      event
        .put("uid", getUserID)
        .put("event_time", System.currentTimeMillis.toString)
        .put("os_type", "Android")
        .put("click_count", click)

      // produce event message
      producer.send(new KeyedMessage[String, String](topic, event.toString))
      println("Message sent: " + event)

      Thread.sleep(200)
    }
  }
}

The sleep interval at the end of the loop above controls how fast simulated events are written. Next we implement real-time counting of each user's clicks. The counts are simply accumulated per user, so the logic is straightforward; the key is to watch out for a few issues during implementation, such as object serialization. Let's look at the code first and then discuss the details:
object UserClickCountAnalytics {

  def main(args: Array[String]): Unit = {
    var masterUrl = "local[1]"
    if (args.length > 0) {
      masterUrl = args(0)
    }

    // Create a StreamingContext with the given master URL
    val conf = new SparkConf().setMaster(masterUrl).setAppName("UserClickCountStat")
    val ssc = new StreamingContext(conf, Seconds(5))

    // Kafka configurations
    val topics = Set("user_events")
    val brokers = "10.10.4.126:9092,10.10.4.127:9092"
    val kafkaParams = Map[String, String](
      "metadata.broker.list" -> brokers, "serializer.class" -> "kafka.serializer.StringEncoder")

    val dbIndex = 1
    val clickHashKey = "app::users::click"

    // Create a direct stream
    val kafkaStream = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, topics)

    val events = kafkaStream.flatMap(line => {
      val data = JSONObject.fromObject(line._2)
      Some(data)
    })

    // Compute user click times
    val userClicks = events.map(x => (x.getString("uid"), x.getInt("click_count"))).reduceByKey(_ + _)
    userClicks.foreachRDD(rdd => {
      rdd.foreachPartition(partitionOfRecords => {
        partitionOfRecords.foreach(pair => {
          val uid = pair._1
          val clickCount = pair._2
          val jedis = RedisClient.pool.getResource
          jedis.select(dbIndex)
          jedis.hincrBy(clickHashKey, uid, clickCount)
          RedisClient.pool.returnResource(jedis)
        })
      })
    })

    ssc.start()
    ssc.awaitTermination()
  }
}

The code above uses the Jedis client to operate on Redis, accumulating the per-user counts into Redis storage; any other system that needs this data in real time can simply read it from Redis. The RedisClient implementation is shown below:
object RedisClient extends Serializable {
  val redisHost = "10.10.4.130"
  val redisPort = 6379
  val redisTimeout = 30000
  lazy val pool = new JedisPool(new GenericObjectPoolConfig(), redisHost, redisPort, redisTimeout)

  lazy val hook = new Thread {
    override def run = {
      println("Execute hook thread: " + this)
      pool.destroy()
    }
  }
  sys.addShutdownHook(hook.run)
}

We have run the code above successfully in both local[K] mode and on a Spark Standalone cluster.
The key detail is that RedisClient is a serializable singleton whose pool is declared as a lazy val: the JedisPool itself is never shipped from the Driver, but is initialized on each Executor the first time RedisClient.pool is referenced inside the output operation:

// lazy pool reference
lazy val pool = new JedisPool(new GenericObjectPoolConfig(), redisHost, redisPort, redisTimeout)
...
partitionOfRecords.foreach(pair => {
  val uid = pair._1
  val clickCount = pair._2
  val jedis = RedisClient.pool.getResource
  jedis.select(dbIndex)
  jedis.hincrBy(clickHashKey, uid, clickCount)
  RedisClient.pool.returnResource(jedis)
})

Alternatively, we can modify the code so that the management of the Redis connection lives entirely inside the DStream output operation, since we know that code is initialized on a specific Executor, and use a singleton object to manage it, as shown below:
package org.shirdrn.spark.streaming

import org.apache.commons.pool2.impl.GenericObjectPoolConfig
import org.apache.spark.SparkConf
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions
import org.apache.spark.streaming.kafka.KafkaUtils

import kafka.serializer.StringDecoder
import net.sf.json.JSONObject
import redis.clients.jedis.JedisPool

object UserClickCountAnalytics {

  def main(args: Array[String]): Unit = {
    var masterUrl = "local[1]"
    if (args.length > 0) {
      masterUrl = args(0)
    }

    // Create a StreamingContext with the given master URL
    val conf = new SparkConf().setMaster(masterUrl).setAppName("UserClickCountStat")
    val ssc = new StreamingContext(conf, Seconds(5))

    // Kafka configurations
    val topics = Set("user_events")
    val brokers = "10.10.4.126:9092,10.10.4.127:9092"
    val kafkaParams = Map[String, String](
      "metadata.broker.list" -> brokers, "serializer.class" -> "kafka.serializer.StringEncoder")

    val dbIndex = 1
    val clickHashKey = "app::users::click"

    // Create a direct stream
    val kafkaStream = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, topics)

    val events = kafkaStream.flatMap(line => {
      val data = JSONObject.fromObject(line._2)
      Some(data)
    })

    // Compute user click times
    val userClicks = events.map(x => (x.getString("uid"), x.getInt("click_count"))).reduceByKey(_ + _)
    userClicks.foreachRDD(rdd => {
      rdd.foreachPartition(partitionOfRecords => {
        partitionOfRecords.foreach(pair => {

          /**
           * Internal Redis client for managing Redis connection {@link Jedis} based on {@link RedisPool}
           */
          object InternalRedisClient extends Serializable {

            @transient private var pool: JedisPool = null

            def makePool(redisHost: String, redisPort: Int, redisTimeout: Int,
                maxTotal: Int, maxIdle: Int, minIdle: Int): Unit = {
              makePool(redisHost, redisPort, redisTimeout, maxTotal, maxIdle, minIdle, true, false, 10000)
            }

            def makePool(redisHost: String, redisPort: Int, redisTimeout: Int,
                maxTotal: Int, maxIdle: Int, minIdle: Int, testOnBorrow: Boolean,
                testOnReturn: Boolean, maxWaitMillis: Long): Unit = {
              if (pool == null) {
                val poolConfig = new GenericObjectPoolConfig()
                poolConfig.setMaxTotal(maxTotal)
                poolConfig.setMaxIdle(maxIdle)
                poolConfig.setMinIdle(minIdle)
                poolConfig.setTestOnBorrow(testOnBorrow)
                poolConfig.setTestOnReturn(testOnReturn)
                poolConfig.setMaxWaitMillis(maxWaitMillis)
                pool = new JedisPool(poolConfig, redisHost, redisPort, redisTimeout)

                val hook = new Thread {
                  override def run = pool.destroy()
                }
                sys.addShutdownHook(hook.run)
              }
            }

            def getPool: JedisPool = {
              assert(pool != null)
              pool
            }
          }

          // Redis configurations
          val maxTotal = 10
          val maxIdle = 10
          val minIdle = 1
          val redisHost = "10.10.4.130"
          val redisPort = 6379
          val redisTimeout = 30000
          val dbIndex = 1
          InternalRedisClient.makePool(redisHost, redisPort, redisTimeout, maxTotal, maxIdle, minIdle)

          val uid = pair._1
          val clickCount = pair._2
          val jedis = InternalRedisClient.getPool.getResource
          jedis.select(dbIndex)
          jedis.hincrBy(clickHashKey, uid, clickCount)
          InternalRedisClient.getPool.returnResource(jedis)
        })
      })
    })

    ssc.start()
    ssc.awaitTermination()
  }
}

This implementation benefits from a Scala language feature: a class or object can be defined anywhere in the code. By placing the code that manages the Redis connection inside the specific operation, we avoid the problem of serializing transient objects across nodes. Doing it this way also requires an understanding of how Spark operates on RDD datasets internally; see the RDD and Spark documentation for more details.
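To make the serialization pitfall concrete, here is a hedged counter-example (not part of the original code) of what both patterns above avoid: if a Jedis connection is created on the Driver and captured by the closure, Spark must serialize it along with the task, and the job fails because Jedis is not serializable.

// Illustrative anti-pattern only -- do NOT do this.
val jedis = new Jedis("10.10.4.130", 6379)            // created on the Driver
userClicks.foreachRDD(rdd => {
  rdd.foreachPartition(partitionOfRecords => {
    partitionOfRecords.foreach { case (uid, clickCount) =>
      // The closure captures the Driver-side `jedis`, so Spark fails with
      // "Task not serializable" (java.io.NotSerializableException: redis.clients.jedis.Jedis).
      jedis.select(dbIndex)
      jedis.hincrBy(clickHashKey, uid, clickCount)
    }
  })
})

After building the project (see the Maven configuration at the end of this article), submit the job to the cluster: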
cd /usr/local/spark
./bin/spark-submit --class org.shirdrn.spark.streaming.UserClickCountAnalytics \
  --master spark://hadoop1:7077 \
  --executor-memory 1G \
  --total-executor-cores 2 \
  ~/spark-0.0.1-SNAPSHOT.jar spark://hadoop1:7077

You can then check the status of the computation tasks running on each Worker node in the cluster; this is also very convenient to do through the Web UI.
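The job can also be run locally for a quick test before going to the cluster; a sketch (assumptions on my part: the jar name matches the Maven build below, and the master URL is passed both to spark-submit and as the program's first argument, which main() reads as masterUrl):

./bin/spark-submit --class org.shirdrn.spark.streaming.UserClickCountAnalytics \
  --master "local[2]" \
  ~/spark-0.0.1-SNAPSHOT.jar "local[2]"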
127.0.0.1:6379[1]> HGETALL app::users::click 1) "4A4D769EB9679C054DE81B973ED5D768" 2) "7037" 3) "8dfeb5aaafc027d89349ac9a20b3930f" 4) "6992" 5) "011BBF43B89BFBF266C865DF0397AA71" 6) "7021" 7) "97edfc08311c70143401745a03a50706" 8) "6874" 9) "d7f141563005d1b5d0d3dd30138f3f62" 10) "7057" 11) "a95f22eabc4fd4b580c011a3161a9d9d" 12) "7092" 13) "6b67c8c700427dee7552f81f3228c927" 14) "7266" 15) "f2a8474bf7bd94f0aabbd4cdd2c06dcf" 16) "7188" 17) "c8ee90aade1671a21336c721512b817a" 18) "6950" 19) "068b746ed4620d25e26055a9f804385f"
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> <modelVersion>4.0.0</modelVersion> <groupId>org.shirdrn.spark</groupId> <artifactId>spark</artifactId> <version>0.0.1-SNAPSHOT</version> <dependencies> <dependency> <groupId>org.apache.spark</groupId> <artifactId>spark-core_2.10</artifactId> <version>1.3.0</version> </dependency> <dependency> <groupId>org.apache.spark</groupId> <artifactId>spark-streaming_2.10</artifactId> <version>1.3.0</version> </dependency> <dependency> <groupId>net.sf.json-lib</groupId> <artifactId>json-lib</artifactId> <version>2.3</version> </dependency> <dependency> <groupId>org.apache.spark</groupId> <artifactId>spark-streaming-kafka_2.10</artifactId> <version>1.3.0</version> </dependency> <dependency> <groupId>redis.clients</groupId> <artifactId>jedis</artifactId> <version>2.5.2</version> </dependency> <dependency> <groupId>org.apache.commons</groupId> <artifactId>commons-pool2</artifactId> <version>2.2</version> </dependency> </dependencies> <build> <sourceDirectory>${basedir}/src/main/scala</sourceDirectory> <testSourceDirectory>${basedir}/src/test/scala</testSourceDirectory> <resources> <resource> <directory>${basedir}/src/main/resources</directory> </resource> </resources> <testResources> <testResource> <directory>${basedir}/src/test/resources</directory> </testResource> </testResources> <plugins> <plugin> <artifactId>maven-compiler-plugin</artifactId> <version>3.1</version> <configuration> <source>1.6</source> <target>1.6</target> </configuration> </plugin> <plugin> <groupId>org.apache.maven.plugins</groupId> <artifactId>maven-shade-plugin</artifactId> <version>2.2</version> <configuration> <createDependencyReducedPom>true</createDependencyReducedPom> </configuration> <executions> <execution> <phase>package</phase> <goals> <goal>shade</goal> </goals> <configuration> <artifactSet> <includes> <include>*:*</include> </includes> </artifactSet> <filters> <filter> <artifact>*:*</artifact> <excludes> <exclude>META-INF/*.SF</exclude> <exclude>META-INF/*.DSA</exclude> <exclude>META-INF/*.RSA</exclude> </excludes> </filter> </filters> <transformers> <transformer implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer" /> <transformer implementation="org.apache.maven.plugins.shade.resource.AppendingTransformer"> <resource>reference.conf</resource> </transformer> <transformer implementation="org.apache.maven.plugins.shade.resource.DontIncludeResourceTransformer"> <resource>log4j.properties</resource> </transformer> </transformers> </configuration> </execution> </executions> </plugin> </plugins> </build> </project>