A simple demo: Spark Streaming consumes data from Kafka and writes it to HBase

Overview

Data flow: Kafka --> Spark Streaming --> HBase
I have recently been doing this kind of data processing through the Java API, and now I want to do it with Spark instead. This post records a simple implementation; a production setup is certainly not this simple, and some points worth optimizing are discussed at the end.

Environment

  • JDK 1.8
  • CDH 5.16.1 (HBase 1.2.0-cdh5.16.1)
  • Scala 2.11.8
  • Kafka parcel: KAFKA-3.1.0-1.3.1.0
  • pom.xml
<properties>
  <scala.version>2.11.8</scala.version>
  <spark.version>2.2.0</spark.version>
  <hadoop.version>2.6.0-cdh5.7.0</hadoop.version>
</properties>

<repositories>
  <repository>
    <id>cloudera</id>
    <name>cloudera</name>
    <url>https://repository.cloudera.com/artifactory/cloudera-repos</url>
  </repository>
</repositories>

<pluginRepositories>
  <pluginRepository>
    <id>scala-tools.org</id>
    <name>Scala-Tools Maven2 Repository</name>
    <url>http://scala-tools.org/repo-releases</url>
  </pluginRepository>
</pluginRepositories>

<dependencies>
  <dependency>
    <groupId>org.apache.hbase</groupId>
    <artifactId>hbase-client</artifactId>
    <version>1.2.0-cdh5.16.1</version>
  </dependency>

  <dependency>
    <groupId>org.scala-lang</groupId>
    <artifactId>scala-library</artifactId>
    <version>${scala.version}</version>
  </dependency>

  <dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-sql_2.11</artifactId>
    <version>${spark.version}</version>
  </dependency>

  <dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-streaming_2.11</artifactId>
    <version>${spark.version}</version>
  </dependency>

  <dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-streaming-kafka-0-10_2.11</artifactId>
    <version>${spark.version}</version>
  </dependency>

  <dependency>
    <groupId>org.apache.hbase</groupId>
    <artifactId>hbase-server</artifactId>
    <version>1.2.0-cdh5.16.1</version>
  </dependency>

  <dependency>
    <groupId>org.apache.hbase</groupId>
    <artifactId>hbase</artifactId>
    <version>1.2.0-cdh5.16.1</version>
    <type>pom</type>
  </dependency>

  <dependency>
    <groupId>com.alibaba</groupId>
    <artifactId>fastjson</artifactId>
    <version>1.2.58</version>
  </dependency>
</dependencies>
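Note: the fragment above only covers the repositories and dependencies; to actually compile the Scala sources, the pom's <build> section also needs a Scala compiler plugin such as scala-maven-plugin.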

Demo
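The job subscribes to the test topic, parses each JSON record with the JsonUitls helper shown after the Scala code, and writes the extracted fields into HBase table aaabbb under column family cf1.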

package cn.zhangyu

import cn.zhangyu.utils.JsonUitls
import org.apache.hadoop.hbase.client.{ConnectionFactory, Put, Table}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkConf
import org.apache.spark.streaming.kafka010._
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.slf4j.LoggerFactory
import scala.collection.JavaConversions.mapAsScalaMap

object KafkaStreaming2Hbase {

  //org.apache.spark.internal.Logging is private[spark] in Spark 2.x, so use slf4j directly
  private val logger = LoggerFactory.getLogger(this.getClass)

  def main(args: Array[String]): Unit = {
    //create the StreamingContext with a 5 second batch interval
    val sparkConf = new SparkConf().setAppName("KafkaStreaming2Hbase").setMaster("local[2]")
    val ssc = new StreamingContext(sparkConf,Seconds(5))

    //Kafka consumer parameters
    val kafkaParams = Map[String,Object](
      "bootstrap.servers" -> "hadoop001:9092,hadoop002:9092,hadoop003:9092",
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> "use_a_separate_group_id_for_each_stream",
      "auto.offset.reset" -> "latest",
      "enable.auto.commit" -> (false: java.lang.Boolean)
    )

    //topics to subscribe to
    val topics = Array("test")
    //create the direct stream
    val stream = KafkaUtils.createDirectStream(
      ssc,
      LocationStrategies.PreferConsistent,
      ConsumerStrategies.Subscribe[String, String](topics, kafkaParams)
    )
    // column family
    val family = Bytes.toBytes("cf1")

    stream.foreachRDD(rdd => {
      rdd.foreachPartition(partitions =>{
        //create the HBase connection inside each partition; the table handle is not serializable, so creating it on the driver would cause a serialization exception
        val table = createTable()
        try {
          partitions.foreach(row => {
           
            print("----------------" + row.value())
            val map = JsonUitls.parseJson(row.value())
            print("map" + map)
            //convert the java.util.Map to a Scala map
            val javaMap = mapAsScalaMap(map)
            println("java Map----------" + javaMap)
            println("id=====" + javaMap.get("id").get.toString)
            javaMap.foreach(x => {
              //use the id field as the row key
              val put = new Put(Bytes.toBytes(javaMap.get("id").get.toString))
              x._1 match {
                case "flag" => (put.addImmutable(family, Bytes.toBytes("flag"), Bytes.toBytes(javaMap.get("flag").get.toString)), table.put(put))
                case "libid" => (put.addImmutable(family, Bytes.toBytes("libid"), Bytes.toBytes(javaMap.get("libid").get.toString)), table.put(put))
                case "idOpType" => (put.addImmutable(family, Bytes.toBytes("idOpType"), Bytes.toBytes(javaMap.get("idOpType").get.toString)), table.put(put))
                case "ATTR" => (put.addImmutable(family, Bytes.toBytes("ATTR"), Bytes.toBytes(javaMap.get("ATTR").get.toString)), table.put(put))
                case _ => println("x:" + x)
              }
            })
          })
        } catch {
          case e: Exception => logger.error("Failed to write to HBase: " + e.getMessage, e)
        } finally {
          //release the per-partition table handle (the connection opened in createTable is still left open here)
          table.close()
        }

      })
    })

    ssc.start()
    ssc.awaitTermination()
  }

  def createTable (): Table = {
    val hbaseConf = HBaseConfiguration.create ()
    hbaseConf.set ("hbase.zookeeper.quorum", "hadoop001,hadoop002,hadoop003")
    hbaseConf.set ("hbase.zookeeper.property.clientPort", "2181")
    hbaseConf.set ("hbase.defaults.for.version.skip", "true")
    val conn = ConnectionFactory.createConnection (hbaseConf)
    conn.getTable (TableName.valueOf ("aaabbb") )
  }

}
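Note that, despite its name, createTable() above only opens an existing table; the demo assumes that table aaabbb with column family cf1 has already been created in HBase. Below is a minimal sketch for pre-creating it (not part of the original demo; the object name CreateDemoTable is just for illustration), reusing the same ZooKeeper quorum:

package cn.zhangyu

import org.apache.hadoop.hbase.{HBaseConfiguration, HColumnDescriptor, HTableDescriptor, TableName}
import org.apache.hadoop.hbase.client.ConnectionFactory

object CreateDemoTable {
  def main(args: Array[String]): Unit = {
    val conf = HBaseConfiguration.create()
    conf.set("hbase.zookeeper.quorum", "hadoop001,hadoop002,hadoop003")
    conf.set("hbase.zookeeper.property.clientPort", "2181")
    val conn = ConnectionFactory.createConnection(conf)
    val admin = conn.getAdmin
    try {
      val tableName = TableName.valueOf("aaabbb")
      //only create the table if it does not exist yet
      if (!admin.tableExists(tableName)) {
        val desc = new HTableDescriptor(tableName)
        desc.addFamily(new HColumnDescriptor("cf1"))
        admin.createTable(desc)
      }
    } finally {
      admin.close()
      conn.close()
    }
  }
}

The JsonUitls helper that parses each Kafka record is shown next.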

package cn.zhangyu.utils;

import com.alibaba.fastjson.JSONObject;
import com.alibaba.fastjson.TypeReference;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.HashMap;
import java.util.Map;

public class JsonUitls {

    static Logger logger = LoggerFactory.getLogger(JsonUitls.class);

    public static Map<String,Object> parseJson(String s){
        //{"id":"test","age":18,"name":"lisi","sex":"gender","libid":"001","idOpType":"1","flag":"1"}
        Map<String,Object> map = new HashMap<>();
        Map<String,Object> attrMap = new HashMap<String, Object>();
        if (s != null && !s.isEmpty()){
            map = JSONObject.parseObject(s, new TypeReference<Map<String, Object>>() {
            });
        }
        //iterate over all fields parsed from the Kafka record
        for (Map.Entry<String, Object> entry : map.entrySet()) {
            //collect everything except the id, libid, flag and idOpType fields into the ATTR map
            if (!"id".equals(entry.getKey()) && !"libid".equals(entry.getKey())
                    && !"flag".equals(entry.getKey()) && !"idOpType".equals(entry.getKey())) {
                String key = entry.getKey();
                attrMap.put(key, entry.getValue());
            }
        }
        //serialize the remaining attributes to a JSON string
        String attrString = new JSONObject(attrMap).toJSONString();
        Map<String,Object> result = new HashMap<String, Object>();
        Object id = map.get("id");
        Object libid = map.get("libid");
        if (id == null || libid == null) {
            logger.error("id is null or libid is null");
            return null;
        }
        result.put("id",id.toString());
        result.put("libid",libid.toString());
        result.put("flag",map.get("flag") == null ? "0":map.get("flag"));
        result.put("idOpType",map.get("idOpType") == null ? "1":map.get("idOpType"));
        result.put("ATTR",attrString);
        return result;
    }
}
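For the sample record in the comment above, parseJson keeps id and libid, falls back to the defaults for flag and idOpType when they are absent, and folds all other fields into the ATTR JSON string. Roughly (a sketch; key order is not guaranteed because HashMap is unordered):

val in = """{"id":"test","age":18,"name":"lisi","sex":"gender","libid":"001","idOpType":"1","flag":"1"}"""
val out = JsonUitls.parseJson(in)
//out: {id=test, libid=001, flag=1, idOpType=1, ATTR={"age":18,"name":"lisi","sex":"gender"}}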

The example is simple. You can run it locally from the IDE, or submit it with spark-submit:

spark-submit \
--class cn.zhangyu.KafkaStreaming2Hbase \
--master local[2] \
/home/hadoop/lib/spark_streaming-1.0-SNAPSHOT.jar args0 args1 args2 ...
(application arguments come last, separated by spaces)

Further thoughts

This example actually has quite a few problems; here are just two of them:

  1. All of the parameters (Kafka brokers, topic, ZooKeeper quorum, table name, column family) are hard-coded; one way to externalize them is sketched after this list.
  2. Dirty data is not handled: JsonUitls.parseJson returns null for records missing id or libid, and the streaming code would then fail with a NullPointerException.

Think about how you would address these.
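As a starting point for point 1, here is a minimal sketch that loads the hard-coded values from a properties file on the classpath (the file name config.properties and the key names are assumptions for illustration):

import java.util.Properties

object AppConfig {
  private val props = new Properties()
  private val in = getClass.getClassLoader.getResourceAsStream("config.properties")
  require(in != null, "config.properties not found on classpath")
  props.load(in)
  in.close()

  //look up a required key, failing fast when it is missing
  def get(key: String): String = {
    val value = props.getProperty(key)
    require(value != null, s"missing config key: $key")
    value
  }
}

//usage in the demo, replacing the hard-coded literals, e.g.:
//  "bootstrap.servers" -> AppConfig.get("bootstrap.servers")
//  val topics = AppConfig.get("topics").split(",")
//  hbaseConf.set("hbase.zookeeper.quorum", AppConfig.get("hbase.zookeeper.quorum"))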
