Operating Redis from Spark

Requirement: read from and write to Redis in real time from a Spark Streaming job.

Implementation: wrap a connection pool so that each machine (executor) establishes its own connections locally and performs the Redis operations there, rather than trying to ship a live connection from the driver.

1. Wrap pool creation in a lazy val

    import org.apache.commons.pool2.impl.GenericObjectPoolConfig
    import redis.clients.jedis.{JedisPool, Protocol}

    // Serializable wrapper: only the pool factory function is shipped to the
    // executors; the pool itself is a lazy val, so it is created on first
    // access inside each executor JVM rather than on the driver.
    class RedisSink(makeJedisPool: () => JedisPool) extends Serializable {
      lazy val pool: JedisPool = makeJedisPool()
    }

    object RedisSink {
      def apply(redisHost: String, redisPort: Int, password: String, database: Int): RedisSink = {
        val createJedisPoolFunc = () => {
          val poolConfig = new GenericObjectPoolConfig()
          val pool = new JedisPool(poolConfig, redisHost, redisPort, Protocol.DEFAULT_TIMEOUT, password, database)
          // Destroy the pool when the executor JVM shuts down.
          sys.addShutdownHook {
            pool.destroy()
          }
          pool
        }
        new RedisSink(createJedisPoolFunc)
      }
    }
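
The code above uses a default GenericObjectPoolConfig. If each executor should hold only a small number of connections, the pool can be capped before the JedisPool is built; a minimal sketch, with illustrative limits that are not from the original:

    val poolConfig = new GenericObjectPoolConfig()
    poolConfig.setMaxTotal(8)        // illustrative cap: at most 8 connections per executor JVM
    poolConfig.setMaxIdle(4)         // illustrative: keep at most 4 idle connections
    poolConfig.setMinIdle(1)         // illustrative: pre-warm one connection
    poolConfig.setTestOnBorrow(true) // validate a connection before handing it out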

2. At use time, broadcast the RedisSink to every executor as a broadcast variable, so the sink is shipped once per executor instead of being re-serialized with every task closure.

    import org.apache.spark.broadcast.Broadcast

    // Created once on the driver; each executor deserializes the sink a single time.
    val redisSink: Broadcast[RedisSink] =
      sc.broadcast(RedisSink(redisHost, redisPort, redisPassword, redisDatabase))
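
Broadcasting is an optimization rather than a requirement here: because RedisSink is Serializable and its pool is a lazy val, the sink can also be captured directly in a closure, at the cost of being serialized with every task instead of once per executor. A minimal sketch of that variant, assuming a hypothetical someRDD: RDD[(String, String)]:

    val sink = RedisSink(redisHost, redisPort, redisPassword, redisDatabase)
    someRDD.foreachPartition { part =>
      val jedis = sink.pool.getResource // pool is still created lazily on the executor
      try {
        part.foreach { case (k, v) => jedis.set(k, v) }
      } finally {
        jedis.close()
      }
    }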

    // DataUtils.getKafkaDataSource is a project-specific helper that returns a
    // DStream of Kafka records, so foreachRDD below runs once per micro-batch.
    val stream = DataUtils.getKafkaDataSource(ssc, topic, bootstrap, groupId)
    stream.foreachRDD { recordRDD =>
      if (!recordRDD.isEmpty()) {
        recordRDD.foreachPartition { part =>
          // One pooled connection per partition; the broadcast sink's lazy pool
          // is materialized on first access inside this executor.
          val jedis = redisSink.value.pool.getResource
          try {
            // Pipelining queues the commands client-side and flushes them in a
            // single round trip, far fewer network hops than one jedis.set per record.
            val pipeline = jedis.pipelined()
            part.foreach { x =>
              val kv = x.value().split(",")
              pipeline.set(kv(0), kv(1))
            }
            pipeline.sync()
          } finally {
            jedis.close() // returns the connection to the pool
          }
        }
      }
    }
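
The requirement also calls for reads, and the same broadcast pool serves lookups. A minimal read-side sketch, assuming a hypothetical keysStream: DStream[String] of keys to fetch; pipelined GETs return Response handles whose values become available after sync():

    keysStream.foreachRDD { rdd =>
      rdd.foreachPartition { part =>
        val jedis = redisSink.value.pool.getResource
        try {
          val pipeline = jedis.pipelined()
          // Queue all GETs first, flush once, then read the responses.
          val responses = part.toList.map(k => k -> pipeline.get(k))
          pipeline.sync()
          responses.foreach { case (k, r) =>
            val value = Option(r.get()) // null (None) when the key is absent
            println(s"$k -> $value")    // placeholder: replace with real downstream handling
          }
        } finally {
          jedis.close()
        }
      }
    }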
