Spark Streaming: reading data from Kafka and recording offsets

The pom.xml file is as follows:

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
	xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
	<modelVersion>4.0.0</modelVersion>

	<groupId>com.demo</groupId>
	<artifactId>spark-streaming-demo</artifactId>
	<version>0.0.1-SNAPSHOT</version>
	<packaging>jar</packaging>

	<name>spark-streaming-demo</name>
	<url>http://maven.apache.org</url>

	<properties>
		<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
		<spark.version>1.6.2</spark.version>
		<mysql-connector.version>5.1.35</mysql-connector.version>
	</properties>

	<dependencies>
		<dependency>
			<groupId>org.apache.spark</groupId>
			<artifactId>spark-streaming_2.10</artifactId>
			<version>${spark.version}</version>
		</dependency>
		<dependency>
			<groupId>org.apache.spark</groupId>
			<artifactId>spark-streaming-kafka_2.10</artifactId>
			<version>${spark.version}</version>
		</dependency>
		<dependency>
			<groupId>org.apache.spark</groupId>
			<artifactId>spark-core_2.10</artifactId>
			<version>${spark.version}</version>
		</dependency>
		<dependency>
			<groupId>org.apache.spark</groupId>
			<artifactId>spark-sql_2.10</artifactId>
			<version>${spark.version}</version>
		</dependency>

		<dependency>
			<groupId>mysql</groupId>
			<artifactId>mysql-connector-java</artifactId>
			<version>${mysql-connector.version}</version>
		</dependency>

		<dependency>
			<groupId>com.alibaba</groupId>
			<artifactId>druid</artifactId>
			<version>1.0.31</version>
		</dependency>
		<dependency>
			<groupId>junit</groupId>
			<artifactId>junit</artifactId>
			<version>3.8.1</version>
			<scope>test</scope>
		</dependency>
		<dependency>
			<groupId>com.stratio.datasource</groupId>
			<artifactId>spark-mongodb_2.11</artifactId>
			<version>0.12.0</version>
		</dependency>
	</dependencies>
</project>

The code is as follows. It loads previously saved offsets from MySQL, builds the Kafka direct stream either from those offsets or from scratch if none exist, captures each batch's OffsetRange array during the transform, and writes the ending offsets back to MySQL after the batch has been processed:

package com.fosun.spark_streaming_demo;

import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.Statement;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.atomic.AtomicReference;

import javax.sql.DataSource;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.VoidFunction;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaInputDStream;
import org.apache.spark.streaming.api.java.JavaPairInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.api.java.JavaStreamingContextFactory;
import org.apache.spark.streaming.kafka.HasOffsetRanges;
import org.apache.spark.streaming.kafka.KafkaUtils;
import org.apache.spark.streaming.kafka.OffsetRange;

import com.alibaba.druid.pool.DruidDataSourceFactory;

import kafka.common.TopicAndPartition;
import kafka.message.MessageAndMetadata;
import kafka.serializer.StringDecoder;
import scala.Tuple2;

public class SparkstreamingOnDirectKafka {
	public static JavaStreamingContext createContext() throws Exception {
		SparkConf conf = new SparkConf().setMaster("local[4]").setAppName("SparkStreamingOnKafkaDirect");
		JavaStreamingContext jsc = new JavaStreamingContext(conf, Durations.seconds(1));
		// jsc.checkpoint("/user/tenglq/checkpoint");

		Map<String, String> kafkaParams = new HashMap<String, String>();
		kafkaParams.put("metadata.broker.list", "fonova-hadoop1.fx01:9092,fonova-hadoop2.fx01:9092");
		kafkaParams.put("auto.offset.reset", "smallest");
		Set<String> topics = new HashSet<String>();
		topics.add("tlqtest3");

		final Map<String, String> params = new HashMap<String, String>();
		params.put("driverClassName", "com.mysql.jdbc.Driver");
		params.put("url", "jdbc:mysql://172.16.100.49:3306/test_sparkstreaming");
		params.put("username", "root");
		params.put("password", "root123456");

		Map<TopicAndPartition, Long> offsets = new HashMap<TopicAndPartition, Long>();
		DataSource ds = DruidDataSourceFactory.createDataSource(params);
		Connection conn = ds.getConnection();
		Statement stmt = conn.createStatement();
		ResultSet rs = stmt.executeQuery("SELECT topic,partition,offset FROM kafka_offsets WHERE topic = 'tlqtest3'");
		while (rs.next()) {
			TopicAndPartition topicAndPartition = new TopicAndPartition(rs.getString(1), rs.getInt(2));
			offsets.put(topicAndPartition, rs.getLong(3));
		}

		final AtomicReference<OffsetRange[]> offsetRanges = new AtomicReference<OffsetRange[]>();

		JavaDStream<String> lines = null;

		if (offsets.isEmpty()) {
			// No saved offsets yet: start from the position given by auto.offset.reset.
			JavaPairInputDStream<String, String> pairDstream = KafkaUtils.createDirectStream(jsc, String.class,
					String.class, StringDecoder.class, StringDecoder.class, kafkaParams, topics);
			lines = pairDstream
					.transformToPair(new Function<JavaPairRDD<String, String>, JavaPairRDD<String, String>>() {
						private static final long serialVersionUID = 1L;

						public JavaPairRDD<String, String> call(JavaPairRDD<String, String> rdd) throws Exception {
							// Remember this batch's offset ranges so they can be persisted later.
							OffsetRange[] offsets = ((HasOffsetRanges) rdd.rdd()).offsetRanges();
							offsetRanges.set(offsets);
							return rdd;
						}
					}).flatMap(new FlatMapFunction<Tuple2<String, String>, String>() {
						private static final long serialVersionUID = 1L;

						public Iterable<String> call(Tuple2<String, String> t) throws Exception {
							return Arrays.asList(t._2);
						}
					});
		} else {
			// Saved offsets found: resume the direct stream from exactly those positions.
			JavaInputDStream<String> dstream = KafkaUtils.createDirectStream(jsc, String.class, String.class,
					StringDecoder.class, StringDecoder.class, String.class, kafkaParams, offsets,
					new Function<MessageAndMetadata<String, String>, String>() {

						private static final long serialVersionUID = 1L;

						public String call(MessageAndMetadata<String, String> v1) throws Exception {
							return v1.message();
						}
					});
			lines = dstream.transform(new Function<JavaRDD<String>, JavaRDD<String>>() {
				private static final long serialVersionUID = 1L;

				public JavaRDD<String> call(JavaRDD<String> rdd) throws Exception {
					// Remember this batch's offset ranges so they can be persisted later.
					OffsetRange[] offsets = ((HasOffsetRanges) rdd.rdd()).offsetRanges();
					offsetRanges.set(offsets);
					return rdd;
				}
			});

		}

		lines.foreachRDD(new VoidFunction<JavaRDD<String>>() {
			private static final long serialVersionUID = 1L;

			public void call(JavaRDD<String> rdd) throws Exception {
				// Process the RDD: collect this batch, sort it and print the messages.
				List<String> map = rdd.collect();
				String[] array = new String[map.size()];
				System.arraycopy(map.toArray(new String[map.size()]), 0, array, 0, map.size());
				List<String> l = Arrays.asList(array);
				Collections.sort(l);
				for (String value : l) {
					System.out.println(value);
				}

				// Save the ending offset of each partition in this batch to MySQL.
				DataSource ds = DruidDataSourceFactory.createDataSource(params);
				Connection conn = ds.getConnection();
				Statement stmt = conn.createStatement();
				for (OffsetRange offsetRange : offsetRanges.get()) {
					ResultSet rs = stmt.executeQuery("select count(1) from kafka_offsets where topic='"
							+ offsetRange.topic() + "' and partition='" + offsetRange.partition() + "'");
					if (rs.next()) {
						int count = rs.getInt(1);
						if (count > 0) {
							stmt.executeUpdate("update kafka_offsets set offset ='" + offsetRange.untilOffset()
									+ "'  where topic='" + offsetRange.topic() + "' and partition='"
									+ offsetRange.partition() + "'");
						} else {
							stmt.execute("insert into kafka_offsets(topic,partition,offset) values('"
									+ offsetRange.topic() + "','" + offsetRange.partition() + "','"
									+ offsetRange.untilOffset() + "')");
						}
					}

					rs.close();
				}

				stmt.close();
				conn.close();
			}

		});

		return jsc;
	}

	public static void main(String[] args) {
		JavaStreamingContextFactory factory = new JavaStreamingContextFactory() {
			public JavaStreamingContext create() {
				try {
					return createContext();
				} catch (Exception e) {
					throw new RuntimeException(e);
				}
			}
		};

		// JavaStreamingContext jsc =
		// JavaStreamingContext.getOrCreate("/user/tenglq/checkpoint", factory);

		JavaStreamingContext jsc = factory.create();

		jsc.start();

		jsc.awaitTermination();
		jsc.close();

	}
}
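
Two notes on the offset store. The job expects a kafka_offsets table with topic, partition and offset columns in the test_sparkstreaming database; the original post does not show its DDL, so the one-time setup below is only a sketch with assumed column types. Also, because the offsets are written only after a batch has been processed and printed, a crash between processing and the write can replay that batch, so the pipeline is at-least-once rather than exactly-once.

package com.fosun.spark_streaming_demo;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.Statement;

// Hypothetical one-time setup for the offset table used by the streaming job above.
public class CreateOffsetTable {
	public static void main(String[] args) throws Exception {
		// Not strictly required with a JDBC 4 driver, but harmless.
		Class.forName("com.mysql.jdbc.Driver");
		// Same connection details as the streaming job.
		Connection conn = DriverManager.getConnection(
				"jdbc:mysql://172.16.100.49:3306/test_sparkstreaming", "root", "root123456");
		Statement stmt = conn.createStatement();
		// Column types are assumptions. Note that on MySQL 5.6+ the column name
		// "partition" is a reserved word and would need quoting here and in the job's SQL.
		stmt.execute("CREATE TABLE IF NOT EXISTS kafka_offsets ("
				+ "topic VARCHAR(255) NOT NULL, "
				+ "partition INT NOT NULL, "
				+ "offset BIGINT NOT NULL, "
				+ "PRIMARY KEY (topic, partition))");
		stmt.close();
		conn.close();
	}
}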
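
The checkpoint-related lines are commented out above. To let the driver recover after a restart, jsc.checkpoint("/user/tenglq/checkpoint") can be enabled inside createContext() and main() can go through JavaStreamingContext.getOrCreate() instead of factory.create(). A minimal sketch of such a main(), assuming the checkpoint path from the commented code is writable:

	public static void main(String[] args) {
		JavaStreamingContextFactory factory = new JavaStreamingContextFactory() {
			public JavaStreamingContext create() {
				try {
					return createContext();
				} catch (Exception e) {
					throw new RuntimeException(e);
				}
			}
		};

		// Re-create the context from the checkpoint if one exists, otherwise build a new one.
		JavaStreamingContext jsc = JavaStreamingContext.getOrCreate("/user/tenglq/checkpoint", factory);
		jsc.start();
		jsc.awaitTermination();
		jsc.close();
	}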

