Integrating Kafka with Spark Streaming (Java) and Writing Data to HBase in Real Time


Maven configuration (pom.xml)


<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
	<modelVersion>4.0.0</modelVersion>
	<groupId>spaek</groupId>
	<artifactId>spark</artifactId>
	<version>1</version>
	<packaging>jar</packaging>

	<properties>
		<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
	</properties>

	<dependencies>
		<dependency>
			<groupId>org.apache.spark</groupId>
			<artifactId>spark-streaming_2.10</artifactId>
			<version>1.2.0</version>
			<scope>provided</scope>
		</dependency>
		<dependency>
			<groupId>org.apache.spark</groupId>
			<artifactId>spark-streaming-kafka_2.10</artifactId>
			<version>1.2.0</version>
		</dependency>
		<dependency>
			<groupId>org.clojure</groupId>
			<artifactId>clojure</artifactId>
			<version>1.6.0</version>
		</dependency>
		<dependency>
			<groupId>com.google.guava</groupId>
			<artifactId>guava</artifactId>
			<version>11.0.2</version>
		</dependency>
		<dependency>
			<groupId>org.apache.hbase</groupId>
			<artifactId>hbase-client</artifactId>
			<version>0.98.4-hadoop2</version>
		</dependency>
		<dependency>
			<groupId>com.google.protobuf</groupId>
			<artifactId>protobuf-java</artifactId>
			<version>2.5.0</version>
		</dependency>
		<dependency>
			<groupId>io.netty</groupId>
			<artifactId>netty</artifactId>
			<version>3.6.6.Final</version>
		</dependency>
		<dependency>
			<groupId>org.apache.hbase</groupId>
			<artifactId>hbase-common</artifactId>
			<version>0.98.4-hadoop2</version>
		</dependency>
		<dependency>
			<groupId>org.apache.hbase</groupId>
			<artifactId>hbase-protocol</artifactId>
			<version>0.98.4-hadoop2</version>
		</dependency>
		<dependency>
			<groupId>org.apache.zookeeper</groupId>
			<artifactId>zookeeper</artifactId>
			<version>3.4.5</version>
		</dependency>
		<dependency>
			<groupId>org.cloudera.htrace</groupId>
			<artifactId>htrace-core</artifactId>
			<version>2.01</version>
		</dependency>
	</dependencies>

	<build>
		<plugins>
			<!-- bind the maven-assembly-plugin to the package phase -->
			<plugin>
				<artifactId>maven-assembly-plugin</artifactId>
				<configuration>
					<descriptorRefs>
						<descriptorRef>jar-with-dependencies</descriptorRef>
					</descriptorRefs>
					<archive>
						<manifest>
							<mainClass />
						</manifest>
					</archive>
				</configuration>
				<executions>
					<execution>
						<id>make-assembly</id>
						<phase>package</phase>
						<goals>
							<goal>single</goal>
						</goals>
					</execution>
				</executions>
			</plugin>

			<plugin>
				<groupId>com.theoryinpractise</groupId>
				<artifactId>clojure-maven-plugin</artifactId>
				<extensions>true</extensions>
				<configuration>
					<sourceDirectories>
						<sourceDirectory>src/clj</sourceDirectory>
					</sourceDirectories>
				</configuration>
				<executions>
					<execution>
						<id>compile</id>
						<phase>compile</phase>
						<goals>
							<goal>compile</goal>
						</goals>
					</execution>
				</executions>
			</plugin>

			<plugin>
				<groupId>org.codehaus.mojo</groupId>
				<artifactId>exec-maven-plugin</artifactId>
				<version>1.2.1</version>
				<executions>
					<execution>
						<goals>
							<goal>exec</goal>
						</goals>
					</execution>
				</executions>
				<configuration>
					<executable>java</executable>
					<includeProjectDependencies>true</includeProjectDependencies>
					<includePluginDependencies>false</includePluginDependencies>
					<classpathScope>compile</classpathScope>
					<mainClass>${storm.topology}</mainClass>
				</configuration>
			</plugin>
		</plugins>
	</build>
</project>

Java code: SparkStreamingFromFlumeToHBaseExample.java

package org.apache.spark.examples.streaming;

import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Pattern;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;
import org.apache.spark.broadcast.Broadcast;
import org.apache.spark.streaming.Duration;
import org.apache.spark.streaming.Time;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaPairDStream;
import org.apache.spark.streaming.api.java.JavaPairReceiverInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.kafka.KafkaUtils;

import scala.Tuple2;

import com.google.common.base.Optional;
import com.google.common.collect.Lists;

public class SparkStreamingFromFlumeToHBaseExample {

  private static final Pattern SPACE = Pattern.compile(" ");

  public static void main(String[] args) {
    if (args.length == 0) {
      System.err
          .println("Usage: SparkStreamingFromFlumeToHBaseExample {master} {host} {port} {table} {columnFamily} {windowInSeconds} {slideInSeconds}");
      System.exit(1);
    }

    // String master = args[0];
    // String host = args[1];
    // int port = Integer.parseInt(args[2]);
    String tableName = "test";// args[3];
    String columnFamily = "f";// args[4];
    // int windowInSeconds = 3;// Integer.parseInt(args[5]);
    // int slideInSeconds = 1;// Integer.parseInt(args[5]);

    String zkQuorum = "localhost";
    String group = "test-consumer-group";
    String topicss = "test";
    String numThread = "2";

    Duration batchInterval = new Duration(5000);
    // Duration windowInterval = new Duration(windowInSeconds * 1000);
    // Duration slideInterval = new Duration(slideInSeconds * 1000);

    SparkConf sparkConf = new SparkConf().setAppName("JavaKafkaWordCount");
    JavaStreamingContext jssc =
        new JavaStreamingContext(sparkConf, new Duration(2000));

    final Broadcast<String> broadcastTableName =
        jssc.sparkContext().broadcast(tableName);
    final Broadcast<String> broadcastColumnFamily =
        jssc.sparkContext().broadcast(columnFamily);

    // JavaDStream flumeStream = sc.flumeStream(host, port);

    int numThreads = Integer.parseInt(numThread);
    Map<String, Integer> topicMap = new HashMap<String, Integer>();
    String[] topics = topicss.split(",");
    for (String topic : topics) {
      topicMap.put(topic, numThreads);
    }

    JavaPairReceiverInputDStream<String, String> messages =
        KafkaUtils.createStream(jssc, zkQuorum, group, topicMap);

    JavaDStream<String> lines =
        messages.map(new Function<Tuple2<String, String>, String>() {
          @Override
          public String call(Tuple2<String, String> tuple2) {
            return tuple2._2();
          }
        });

    JavaDStream<String> words =
        lines.flatMap(new FlatMapFunction<String, String>() {
          @Override
          public Iterable<String> call(String x) {
            return Lists.newArrayList(SPACE.split(x));
          }
        });

    JavaPairDStream<String, Integer> lastCounts =
        messages.map(new Function<Tuple2<String, String>, String>() {
          @Override
          public String call(Tuple2<String, String> tuple2) {
            return tuple2._2();
          }
        }).flatMap(new FlatMapFunction<String, String>() {
          @Override
          public Iterable<String> call(String x) {
            return Lists.newArrayList(SPACE.split(x));
          }
        }).mapToPair(new PairFunction<String, String, Integer>() {
          @Override
          public Tuple2<String, Integer> call(String s) {
            return new Tuple2<String, Integer>(s, 1);
          }
        }).reduceByKey(new Function2<Integer, Integer, Integer>() {

          @Override
          public Integer call(Integer x, Integer y) throws Exception {
            return x.intValue() + y.intValue();
          }
        });

    lastCounts
        .foreach(new Function2<JavaPairRDD<String, Integer>, Time, Void>() {

          @Override
          public Void call(JavaPairRDD<String, Integer> values, Time time)
              throws Exception {

            values.foreach(new VoidFunction<Tuple2<String, Integer>>() {

              @Override
              public void call(Tuple2<String, Integer> tuple) throws Exception {
                HBaseCounterIncrementor incrementor =
                    HBaseCounterIncrementor.getInstance(
                        broadcastTableName.value(),
                        broadcastColumnFamily.value());
                incrementor.incerment("Counter", tuple._1(), tuple._2());
                System.out.println("Counter:" + tuple._1() + "," + tuple._2());

              }
            });

            return null;
          }
        });

    jssc.start();
    jssc.awaitTermination();
  }
}

Java code: CounterMap.java

package org.apache.spark.examples.streaming;

import java.util.HashMap;
import java.util.Map.Entry;
import java.util.Set;

public class CounterMap {
  HashMap<String, Counter> map = new HashMap<String, Counter>();
  
  public void increment(String key, long increment) {
    Counter count = map.get(key);
    if (count == null) {
      count = new Counter();
      map.put(key, count);
    } 
    count.value += increment;
  }
  
  
  public long getValue(String key) {
    Counter count = map.get(key);
    if (count != null) {
      return count.value;
    } else {
      return 0;
    }
  }
  
  public Set<Entry<String, Counter>> entrySet() {
    return map.entrySet();
  }
  
  public void clear() {
    map.clear();
  }
  
  public static class Counter {
    public long value;
  }
  
  
}
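
CounterMap is just an in-memory buffer of per-key counts. The following small usage sketch is not part of the original code; it only illustrates how counts accumulate and how FlushThread later iterates the entries (the class name and sample values are hypothetical):

package org.apache.spark.examples.streaming;

import java.util.Map.Entry;

public class CounterMapExample {
  public static void main(String[] args) {
    CounterMap counters = new CounterMap();
    counters.increment("hello", 1);
    counters.increment("hello", 2);
    counters.increment("world", 5);

    System.out.println(counters.getValue("hello"));   // 3
    System.out.println(counters.getValue("missing")); // 0, unknown keys default to zero

    // Iterate all buffered counters, as FlushThread does when building an HBase Increment
    for (Entry<String, CounterMap.Counter> entry : counters.entrySet()) {
      System.out.println(entry.getKey() + " => " + entry.getValue().value);
    }
  }
}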


Java code: HBaseCounterIncrementor.java

package org.apache.spark.examples.streaming;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map.Entry;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Increment;
import org.apache.hadoop.hbase.util.Bytes;

import org.apache.spark.examples.streaming.CounterMap;
import org.apache.spark.examples.streaming.CounterMap.Counter;

public class HBaseCounterIncrementor {

  static HBaseCounterIncrementor singleton;
  static String tableName;
  static String columnFamily;
  static HTable hTable;
  static long lastUsed;
  static long flushInterval = 5000; // interval (ms) between FlushThread writes to HBase
  static CloserThread closerThread;
  static FlushThread flushThread;
  static HashMap<String, CounterMap> rowKeyCounterMap =
      new HashMap<String, CounterMap>();
  static Object locker = new Object();

  private HBaseCounterIncrementor(String tableName, String columnFamily) {
    HBaseCounterIncrementor.tableName = tableName;
    HBaseCounterIncrementor.columnFamily = columnFamily;
  }

  public static HBaseCounterIncrementor getInstance(String tableName,
      String columnFamily) {

    if (singleton == null) {
      synchronized (locker) {
        if (singleton == null) {
          singleton = new HBaseCounterIncrementor(tableName, columnFamily);
          initialize();
        }
      }
    }
    return singleton;
  }

  private static void initialize() {
    if (hTable == null) {
      synchronized (locker) {
        if (hTable == null) {
          Configuration hConfig = HBaseConfiguration.create();
          try {
            hTable = new HTable(hConfig, tableName);
            updateLastUsed();

          } catch (IOException e) {
            throw new RuntimeException(e);
          }
          flushThread = new FlushThread(flushInterval);
          flushThread.start();
          closerThread = new CloserThread();
          closerThread.start();
        }
      }
    }
  }

  public void incerment(String rowKey, String key, int increment) {
    incerment(rowKey, key, (long) increment);
  }

  public void incerment(String rowKey, String key, long increment) {
    CounterMap counterMap = rowKeyCounterMap.get(rowKey);
    if (counterMap == null) {
      counterMap = new CounterMap();
      rowKeyCounterMap.put(rowKey, counterMap);
    }
    counterMap.increment(key, increment);

    initialize();
  }

  private static void updateLastUsed() {
    lastUsed = System.currentTimeMillis();
  }

  protected void close() {
    if (hTable != null) {
      synchronized (locker) {
        if (hTable != null) {
          if (hTable != null && System.currentTimeMillis() - lastUsed > 30000) {
            flushThread.stopLoop();
            flushThread = null;
            try {
              hTable.close();
            } catch (IOException e) {
              e.printStackTrace();
            }

            hTable = null;
          }
        }
      }
    }
  }

  public static class CloserThread extends Thread {

    boolean continueLoop = true;

    @Override
    public void run() {
      while (continueLoop) {

        if (System.currentTimeMillis() - lastUsed > 30000) {
          singleton.close();
          break;
        }

        try {
          Thread.sleep(60000);
        } catch (InterruptedException e) {
          e.printStackTrace();
        }
      }
    }

    public void stopLoop() {
      continueLoop = false;
    }
  }

  protected static class FlushThread extends Thread {
    long sleepTime;
    boolean continueLoop = true;

    public FlushThread(long sleepTime) {
      this.sleepTime = sleepTime;
    }

    @Override
    public void run() {
      while (continueLoop) {
        try {
          flushToHBase();
        } catch (IOException e) {
          e.printStackTrace();
          break;
        }

        try {
          Thread.sleep(sleepTime);
        } catch (InterruptedException e) {
          e.printStackTrace();
        }
      }
    }

    private void flushToHBase() throws IOException {
      // Synchronize on the shared lock object rather than on hTable itself,
      // which may still be null before the first initialize() call.
      synchronized (locker) {
        if (hTable == null) {
          initialize();
        }
        updateLastUsed();

        for (Entry<String, CounterMap> entry : rowKeyCounterMap.entrySet()) {
          CounterMap pastCounterMap = entry.getValue();
          rowKeyCounterMap.put(entry.getKey(), new CounterMap());

          Increment increment = new Increment(Bytes.toBytes(entry.getKey()));

          boolean hasColumns = false;
          for (Entry<String, Counter> entry2 : pastCounterMap.entrySet()) {
            increment.addColumn(Bytes.toBytes(columnFamily),
                Bytes.toBytes(entry2.getKey()), entry2.getValue().value);
            hasColumns = true;
          }
          if (hasColumns) {
            updateLastUsed();
            hTable.increment(increment);
          }
        }
        updateLastUsed();
      }
    }

    public void stopLoop() {
      continueLoop = false;
    }
  }

}
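
HBaseCounterIncrementor buffers counts in memory (rowKeyCounterMap) and relies on the background FlushThread to turn them into HBase Increment operations, while CloserThread closes the table after about 30 seconds of inactivity. The standalone driver below is a hypothetical sketch, not part of the original post; it assumes the 'test' table with column family 'f' already exists and that hbase-site.xml is on the classpath:

package org.apache.spark.examples.streaming;

public class IncrementorUsageExample {
  public static void main(String[] args) throws InterruptedException {
    HBaseCounterIncrementor incrementor =
        HBaseCounterIncrementor.getInstance("test", "f");

    // Counts are only buffered here; FlushThread writes them to HBase on its next pass.
    incrementor.incerment("Counter", "hello", 1L);
    incrementor.incerment("Counter", "world", 2L);

    // Give FlushThread time to issue the Increment; the background threads
    // shut themselves down roughly 30 seconds after the last increment.
    Thread.sleep(10000);
  }
}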


After packaging with mvn package, upload spark-1-jar-with-dependencies.jar to the Spark cluster and submit it:

/root/spark-1.2.0-bin-hadoop2.3/bin/spark-submit --class org.apache.spark.examples.streaming.SparkStreamingFromFlumeToHBaseExample --master local[8]   /root/spark-1-jar-with-dependencies.jar  100


Create the corresponding table in HBase

hbase(main):002:0> create 'test', 'f'
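
If you prefer to create the table from Java instead of the HBase shell, the sketch below uses the same HBase 0.98 client API as the code above. The class is hypothetical and not part of the original post:

package org.apache.spark.examples.streaming;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.HBaseAdmin;

public class CreateTestTable {
  public static void main(String[] args) throws Exception {
    Configuration conf = HBaseConfiguration.create();
    HBaseAdmin admin = new HBaseAdmin(conf);
    try {
      if (!admin.tableExists("test")) {
        // Same table name and column family used by the streaming job
        HTableDescriptor desc = new HTableDescriptor(TableName.valueOf("test"));
        desc.addFamily(new HColumnDescriptor("f"));
        admin.createTable(desc);
      }
    } finally {
      admin.close();
    }
  }
}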


Start the Kafka server, then start the console producer

[root@n1 kafka-0.8.1]# bin/kafka-server-start.sh config/server.properties
[root@n1 kafka-0.8.1]# bin/kafka-console-producer.sh --broker-list n1:9092 --topic test


After typing words into the producer console, the corresponding counter records are written to the 'test' table in HBase, and the job also prints each word and count to the driver console.
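
To check the result programmatically rather than in the HBase shell, the hypothetical snippet below (not part of the original post) reads back the counter for one word, assuming "hello" was typed into the producer and using the row key "Counter", table 'test', and family 'f' that the job writes to:

package org.apache.spark.examples.streaming;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.util.Bytes;

public class ReadCounterExample {
  public static void main(String[] args) throws Exception {
    Configuration conf = HBaseConfiguration.create();
    HTable table = new HTable(conf, "test");
    try {
      Get get = new Get(Bytes.toBytes("Counter"));
      get.addColumn(Bytes.toBytes("f"), Bytes.toBytes("hello"));
      Result result = table.get(get);
      byte[] value = result.getValue(Bytes.toBytes("f"), Bytes.toBytes("hello"));
      if (value != null) {
        // HBase Increment stores counters as 8-byte longs
        System.out.println("hello => " + Bytes.toLong(value));
      } else {
        System.out.println("no counter for 'hello' yet");
      }
    } finally {
      table.close();
    }
  }
}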





The source code can also be downloaded from http://download.csdn.net/detail/q79969786/8369971.
