Spark Streaming + Kafka + Redis in Practice

Working with Redis from Java: http://blog.csdn.net/xyang81/article/details/51918129
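
The code below compiles against the Jedis client and Spark Streaming's Kafka 0.8 direct-stream connector (the imports use kafka.serializer.StringDecoder and org.apache.spark.streaming.kafka.KafkaUtils). The following sbt snippet is only an orientation sketch; the artifact versions are assumptions and should be matched to your own Spark and Kafka installation:

libraryDependencies ++= Seq(
  // Spark version is an assumption; align it with your cluster
  "org.apache.spark" %% "spark-core" % "2.1.0" % "provided",
  "org.apache.spark" %% "spark-streaming" % "2.1.0" % "provided",
  // Kafka 0.8 direct-stream connector (provides org.apache.spark.streaming.kafka.KafkaUtils
  // and pulls in the kafka/zkclient classes used for offset management)
  "org.apache.spark" %% "spark-streaming-kafka-0-8" % "2.1.0",
  // Jedis client used by JedisConnectionPool; incrByFloat requires Jedis 2.2 or later
  "redis.clients" % "jedis" % "2.9.0"
)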

Sample data, order.txt (each line holds an ID, the client IP, the product category, the item name, and the order amount, separated by spaces):

A 202.106.196.115 手机 iPhone8 8000
B 202.106.0.20 服装 布莱奥尼西服 199
C 202.102.152.3 家具 婴儿床 2000
D 202.96.96.68 家电 电饭锅 1000
F 202.98.0.68 化妆品 迪奥香水 200
H 202.96.75.68 食品 奶粉 600
J 202.97.229.133 图书 Hadoop编程指南 90
A 202.106.196.115 手机 手机壳 200
B 202.106.0.20 手机 iPhone8 8000
C 202.102.152.3 家具 婴儿车 2000
D 202.96.96.68 家具 婴儿车 1000
F 202.98.0.68 化妆品 迪奥香水 200
H 202.96.75.68 食品 婴儿床 600
J 202.97.229.133 图书 spark实战 80
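
OrderCount (shown further down) consumes these lines from a Kafka topic named "orders". Any producer can push them into that topic; the following is a minimal sketch (not part of the original post) that assumes the kafka-clients producer API is on the classpath, the topic already exists, and the brokers from the OrderCount configuration are reachable. The path to order.txt is a placeholder.

import java.util.Properties

import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord}

import scala.io.Source

object OrderProducer {

  def main(args: Array[String]): Unit = {
    val props = new Properties()
    // broker list copied from the OrderCount configuration (assumption)
    props.put("bootstrap.servers", "node-4:9092,node-5:9092,node-6:9092")
    props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer")
    props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer")

    val producer = new KafkaProducer[String, String](props)
    // send every line of order.txt to the "orders" topic
    Source.fromFile("/path/to/order.txt").getLines().foreach { line =>
      producer.send(new ProducerRecord[String, String]("orders", line))
    }
    producer.close()
  }
}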

The JedisConnectionPool class

import redis.clients.jedis.{Jedis, JedisPool, JedisPoolConfig}


object JedisConnectionPool{

  val config = new JedisPoolConfig()
  // maximum number of connections in the pool
  config.setMaxTotal(20)
  // maximum number of idle connections
  config.setMaxIdle(10)
  // validate the connection when it is borrowed from the pool
  config.setTestOnBorrow(true)
  // 10000 is the connection timeout in milliseconds (10 seconds); "123" is the Redis password
  val pool = new JedisPool(config, "192.168.1.207", 6379, 10000, "123")

  def getConnection(): Jedis = {
    pool.getResource
  }

  def main(args: Array[String]) {


    val conn = JedisConnectionPool.getConnection()
//    conn.set("income", "1000")
//
//    val r1 = conn.get("xiaoniu")
//
//    println(r1)
//
//    conn.incrBy("xiaoniu", -50)
//
//    val r2 = conn.get("xiaoniu")
//
//    println(r2)
//
//    conn.close()

    // list all keys currently stored in Redis and print their values
    val r = conn.keys("*")
    import scala.collection.JavaConversions._
    for (p <- r) {
      println(p + " : " + conn.get(p))
    }
  }

}

The OrderCount class

import kafka.common.TopicAndPartition
import kafka.message.MessageAndMetadata
import kafka.serializer.StringDecoder
import kafka.utils.{ZKGroupTopicDirs, ZkUtils}
import org.I0Itec.zkclient.ZkClient
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkConf
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka.{HasOffsetRanges, KafkaUtils, OffsetRange}
import org.apache.spark.streaming.{Duration, StreamingContext}


object OrderCount {

  def main(args: Array[String]): Unit = {

    // consumer group name
    val group = "g1"
    // create the SparkConf
    val conf = new SparkConf().setAppName("OrderCount").setMaster("local[4]")
    // create the StreamingContext with a 5-second batch interval
    val ssc = new StreamingContext(conf, Duration(5000))

    // load the IP rules from the local file and broadcast them to the Executors
    val broadcastRef = IPUtils.broadcastIpRules(ssc, "/Users/zx/Desktop/temp/spark-24/spark-4/ip/ip.txt")


    // the topic to consume
    val topic = "orders"
    // Kafka broker list (the Spark Streaming tasks connect directly to the Kafka partitions with the lower-level API, which is more efficient)
    val brokerList = "node-4:9092,node-5:9092,node-6:9092"

    // ZooKeeper quorum, used later to update the consumed offsets (offsets could also be stored in Redis or MySQL)
    val zkQuorum = "node-1:2181,node-2:2181,node-3:2181"
    // the set of topic names used when creating the stream; Spark Streaming can consume several topics at once
    val topics: Set[String] = Set(topic)

    // create a ZKGroupTopicDirs object, which defines the ZooKeeper directory where the offsets are stored
    val topicDirs = new ZKGroupTopicDirs(group, topic)
    // the ZooKeeper path for this group and topic, e.g. "/consumers/g1/offsets/orders"
    val zkTopicPath = s"${topicDirs.consumerOffsetDir}"

    // Kafka parameters
    val kafkaParams = Map(
      //"key.deserializer" -> classOf[StringDeserializer],
      //"value.deserializer" -> classOf[StringDeserializer],
      //"deserializer.encoding" -> "GB2312", // encoding used when reading data from Kafka
      "metadata.broker.list" -> brokerList,
      "group.id" -> group,
      // start reading from the earliest available offset
      "auto.offset.reset" -> kafka.api.OffsetRequest.SmallestTimeString
    )

    // create a ZooKeeper client from the quorum addresses;
    // it reads the saved offsets from ZooKeeper and updates them later
    val zkClient = new ZkClient(zkQuorum)

    // check whether the path has child nodes (child nodes exist only if we previously saved offsets for the individual partitions), e.g.
    // /consumers/g1/offsets/orders/0 -> 10001
    // /consumers/g1/offsets/orders/1 -> 30001
    // /consumers/g1/offsets/orders/2 -> 10001
    // zkTopicPath -> /consumers/g1/offsets/orders
    val children = zkClient.countChildren(zkTopicPath)

    var kafkaStream: InputDStream[(String, String)] = null

    // if offsets were saved in ZooKeeper, use them as the starting positions of the kafkaStream
    var fromOffsets: Map[TopicAndPartition, Long] = Map()

    // offsets have been saved before
    // note: the offsets are read on the Driver
    if (children > 0) {
      for (i <- 0 until children) {
        // e.g. /consumers/g1/offsets/orders/0 -> 10001

        // read the saved offset from e.g. /consumers/g1/offsets/orders/0
        val partitionOffset = zkClient.readData[String](s"$zkTopicPath/${i}")
        // the partition of this topic, e.g. orders/0
        val tp = TopicAndPartition(topic, i)
        // add the offset of each partition to fromOffsets
        // e.g. orders/0 -> 10001
        fromOffsets += (tp -> partitionOffset.toLong)
      }
      // key: the Kafka message key, value: the message body, e.g. "hello tom hello jerry"
      // this transforms every Kafka message into a (key, message) tuple
      val messageHandler = (mmd: MessageAndMetadata[String, String]) => (mmd.key(), mmd.message())

      // create a direct DStream with KafkaUtils (the fromOffsets argument makes consumption resume from the offsets computed above)
      //[String, String, StringDecoder, StringDecoder,     (String, String)]
      //  key    value    key decoder     value decoder    output type
      kafkaStream = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder, (String, String)](ssc, kafkaParams, fromOffsets, messageHandler)
    } else {
      // if no offsets were saved, start from the latest (largest) or earliest (smallest) offset according to kafkaParams
      kafkaStream = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, topics)
    }

    // the offset ranges of the current batch
    var offsetRanges = Array[OffsetRange]()

    // with the direct approach the offsets are only available on the RDDs of the KafkaDStream (KafkaRDD), so we cannot apply DStream transformations first
    // instead we call foreachRDD on kafkaStream, read the offsets from each RDD, and then process the RDD
    // this iterates over the KafkaRDDs contained in the KafkaDStream one by one
    // when accumulating results with the direct approach, the accumulation has to happen in an external store (a key-value in-memory NoSQL database such as Redis)
    // the code inside kafkaStream.foreachRDD runs on the Driver
    kafkaStream.foreachRDD { kafkaRDD =>
      // only process the RDD of the current batch if it actually contains data
      if(!kafkaRDD.isEmpty()) {
        // only a KafkaRDD can be cast to HasOffsetRanges to obtain the offsets
        offsetRanges = kafkaRDD.asInstanceOf[HasOffsetRanges].offsetRanges
        val lines: RDD[String] = kafkaRDD.map(_._2)

        // split each line into its fields
        val fields: RDD[Array[String]] = lines.map(_.split(" "))

        // compute the total transaction amount
        CalculateUtil.calculateIncome(fields)

        // compute the transaction amount per product category
        CalculateUtil.calculateItem(fields)

        // compute the transaction amount per region
        CalculateUtil.calculateZone(fields, broadcastRef)

        // the offsets are updated here, on the Driver
        for (o <- offsetRanges) {
          // e.g. /consumers/g1/offsets/orders/0
          val zkPath = s"${topicDirs.consumerOffsetDir}/${o.partition}"
          // save this partition's offset to ZooKeeper
          // e.g. /consumers/g1/offsets/orders/0 -> 20000
          ZkUtils.updatePersistentPath(zkClient, zkPath, o.untilOffset.toString)
        }
      }
    }

    ssc.start()
    ssc.awaitTermination()

  }


}
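
As noted in the comments above, the offsets could also be kept in Redis or MySQL instead of ZooKeeper. Purely as an illustration (not part of the original code), the Redis variant could look roughly like the fragment below, reusing the existing JedisConnectionPool; group, topic, and offsetRanges refer to the values already defined in main, and the key layout offset:<group>:<topic> is a made-up convention for this sketch.

    // --- hypothetical sketch: keeping offsets in Redis instead of ZooKeeper ---
    // a Redis hash mapping each partition id to the next offset to read
    val offsetKey = s"offset:$group:$topic"

    // on startup, rebuild fromOffsets from the hash instead of reading ZooKeeper
    val conn = JedisConnectionPool.getConnection()
    import scala.collection.JavaConversions._
    val fromOffsets: Map[TopicAndPartition, Long] = conn.hgetAll(offsetKey).map {
      case (partition, offset) => TopicAndPartition(topic, partition.toInt) -> offset.toLong
    }.toMap
    conn.close()

    // at the end of every batch (inside foreachRDD), persist the untilOffset of each partition
    val c = JedisConnectionPool.getConnection()
    for (o <- offsetRanges) {
      c.hset(offsetKey, o.partition.toString, o.untilOffset.toString)
    }
    c.close()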

The CalculateUtil class

import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD


object CalculateUtil {

  def calculateIncome(fields: RDD[Array[String]]) = {
    // compute the batch total and write it to Redis
    val priceRDD: RDD[Double] = fields.map(arr => {
      val price = arr(4).toDouble
      price
    })
    // reduce is an action, so the result is returned to the Driver
    // this is the total amount of the current batch
    val sum: Double = priceRDD.reduce(_+_)
    // obtain a Jedis connection
    val conn = JedisConnectionPool.getConnection()
    // add the current batch total to the previously accumulated value (incrByFloat is atomic in Redis)
    //conn.set(Constant.TOTAL_INCOME, sum.toString)
    conn.incrByFloat(Constant.TOTAL_INCOME, sum)
    // return the connection to the pool
    conn.close()
  }

  /**
    * Compute the transaction amount per product category
    * @param fields
    */
  def calculateItem(fields: RDD[Array[String]]) = {
    // on which side is the map call on fields invoked? On the Driver
    val itemAndPrice: RDD[(String, Double)] = fields.map(arr => {
      // category
      val item = arr(2)
      // amount
      val price = arr(4).toDouble
      (item, price)
    })
    // aggregate by product category
    val reduced: RDD[(String, Double)] = itemAndPrice.reduceByKey(_+_)
    // accumulate the current batch into Redis
    // foreachPartition is an action
    // if the connection were obtained here, it would be created on the Driver,
    // and obtaining a Jedis connection on the Driver is not what we want
    //val conn = JedisConnectionPool.getConnection()

    reduced.foreachPartition(part => {
      // obtain a Jedis connection
      // this connection is actually obtained inside an Executor
      // JedisConnectionPool is a singleton object, so there is one instance per Executor process
      val conn = JedisConnectionPool.getConnection()
      part.foreach(t => {
        // one connection is reused to update multiple records
        conn.incrByFloat(t._1, t._2)
      })
      // close the connection only after all records of the current partition have been updated
      conn.close()
    })
  }

  // compute the region (province) from the IP address
  def calculateZone(fields: RDD[Array[String]], broadcastRef: Broadcast[Array[(Long, Long, String)]]) = {

    val provinceAndPrice: RDD[(String, Double)] = fields.map(arr => {
      val ip = arr(1)
      val price = arr(4).toDouble
      val ipNum = MyUtils.ip2Long(ip)
      // fetch the complete broadcast rule set on the Executor
      val allRules: Array[(Long, Long, String)] = broadcastRef.value
      // binary search for the range containing this IP
      val index = MyUtils.binarySearch(allRules, ipNum)
      var province = "未知"
      if (index != -1) {
        province = allRules(index)._3
      }
      // (province, order amount)
      (province, price)
    })
    // aggregate by province
    val reduced: RDD[(String, Double)] = provinceAndPrice.reduceByKey(_+_)
    // update the accumulated values in Redis
    reduced.foreachPartition(part => {
      val conn = JedisConnectionPool.getConnection()
      part.foreach(t => {
        conn.incrByFloat(t._1, t._2)
      })
      conn.close()
    })

  }
}

The Constant class

object Constant {

  val TOTAL_INCOME = "TOTAL_INCOME"
}

The MyUtils class

import java.sql.{Connection, DriverManager, PreparedStatement}

import org.apache.spark.streaming.StreamingContext

import scala.io.{BufferedSource, Source}

object MyUtils {

  // convert a dotted-quad IPv4 string to a Long by shifting each octet into place
  def ip2Long(ip: String): Long = {
    val fragments = ip.split("[.]")
    var ipNum = 0L
    for (i <- 0 until fragments.length){
      ipNum =  fragments(i).toLong | ipNum << 8L
    }
    ipNum
  }

  def readRules(path: String): Array[(Long, Long, String)] = {
    // read the IP rules file
    val bf: BufferedSource = Source.fromFile(path)
    val lines: Iterator[String] = bf.getLines()
    // parse the IP rules and keep them in memory
    val rules: Array[(Long, Long, String)] = lines.map(line => {
      val fields = line.split("[|]")
      val startNum = fields(2).toLong
      val endNum = fields(3).toLong
      val province = fields(6)
      (startNum, endNum, province)
    }).toArray
    rules
  }

  // binary search over the rules (sorted by start IP); returns the index of the range containing ip, or -1 if no range matches
  def binarySearch(lines: Array[(Long, Long, String)], ip: Long) : Int = {
    var low = 0
    var high = lines.length - 1
    while (low <= high) {
      val middle = (low + high) / 2
      if ((ip >= lines(middle)._1) && (ip <= lines(middle)._2))
        return middle
      if (ip < lines(middle)._1)
        high = middle - 1
      else {
        low = middle + 1
      }
    }
    -1
  }

  def data2MySQL(it: Iterator[(String, Int)]): Unit = {
    // one iterator represents one partition, and a partition holds multiple records
    // first obtain a JDBC connection
    val conn: Connection = DriverManager.getConnection("jdbc:mysql://localhost:3306/bigdata?characterEncoding=UTF-8", "root", "123568")
    // write the data to the database through this connection
    val pstm: PreparedStatement = conn.prepareStatement("INSERT INTO access_log VALUES (?, ?)")
    // write the partition's records to MySQL one by one
    it.foreach(tp => {
      pstm.setString(1, tp._1)
      pstm.setInt(2, tp._2)
      pstm.executeUpdate()
    })
    // close the connection only after all records of the partition have been written
    if(pstm != null) {
      pstm.close()
    }
    if (conn != null) {
      conn.close()
    }
  }


  def main(args: Array[String]): Unit = {
    // the rules are held in memory on the Driver
    val rules: Array[(Long, Long, String)] = readRules("/Users/zx/Desktop/ip/ip.txt")
    // convert the IP address to its decimal form
    // (114.215.43.42 -> 114*2^24 + 215*2^16 + 43*2^8 + 42 = 1926703914)
    val ipNum = ip2Long("114.215.43.42")
    // binary search
    val index = binarySearch(rules, ipNum)
    // use the returned index to look up the matching rule
    val tp = rules(index)
    val province = tp._3
    println(province)

  }
}

The IPUtils class

import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.StreamingContext


object IPUtils {

  def broadcastIpRules(ssc: StreamingContext, ipRulesPath: String): Broadcast[Array[(Long, Long, String)]] = {
    // first obtain the SparkContext
    val sc = ssc.sparkContext
    val rulesLines:RDD[String] = sc.textFile(ipRulesPath)
    // parse the IP rule data
    val ipRulesRDD: RDD[(Long, Long, String)] = rulesLines.map(line => {
      val fields = line.split("[|]")
      val startNum = fields(2).toLong
      val endNum = fields(3).toLong
      val province = fields(6)
      (startNum, endNum, province)
    })

    // collect the partial IP rules scattered across the Executors back to the Driver
    val rulesInDriver: Array[(Long, Long, String)] = ipRulesRDD.collect()


    // broadcast the data from the Driver to the Executors
    // the returned broadcast reference still lives on the Driver
    sc.broadcast(rulesInDriver)
  }
}

 
