Kafka Streams Data Cleaning Example (Personally Tested)

0) Requirement:
Process, in real time, messages whose words carry a ">>>" prefix, keeping only the content after the marker. For example, the input "atguigu>>>ximenqing" should be cleaned to "ximenqing".
1) Requirement analysis:
[Figure: topology diagram — source topic "first" → LogProcessor → sink topic "second"]

2) Hands-on implementation
(1) Required Maven dependencies:

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>com.sheng.hbase</groupId>
  <artifactId>HbaseMaven</artifactId>
  <version>0.0.1-SNAPSHOT</version>
  
 <dependencies>

		
			
			<!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-common -->
			<dependency>
				<groupId>org.apache.hadoop</groupId>
				<artifactId>hadoop-common</artifactId>
				<version>2.7.3</version>

			</dependency>

			<!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-hdfs -->
			<dependency>
				<groupId>org.apache.hadoop</groupId>
				<artifactId>hadoop-hdfs</artifactId>
				<version>2.7.3</version>
			</dependency>

			<!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-client -->
			<!-- Winodws下提交至Yarn上运行,改客户端是2.6.1s 
			<dependency>
				<groupId>org.apache.hadoop</groupId>
				<artifactId>hadoop-client</artifactId>
				<version>2.6.1</version>
			</dependency>
-->

			<dependency>
				<groupId>junit</groupId>
				<artifactId>junit</artifactId>
				<version>4.10</version>
			</dependency>



			<!-- https://mvnrepository.com/artifact/org.apache.mrunit/mrunit MRUnit测试 -->
			<dependency>
				<groupId>org.apache.mrunit</groupId>
				<artifactId>mrunit</artifactId>
				<version>0.9.0-incubating</version>
				<classifier>hadoop2</classifier>
				<scope>test</scope>
			</dependency>





			<dependency>
				<groupId>jdk.tools</groupId>
				<artifactId>jdk.tools</artifactId>
				<version>1.8</version>
				<scope>system</scope>
				<systemPath>${JAVA_HOME}/lib/tools.jar</systemPath>
			</dependency>



			<!-- https://mvnrepository.com/artifact/org.apache.hbase/hbase -->
			<dependency>
				<groupId>org.apache.hbase</groupId>
				<artifactId>hbase</artifactId>
				<version>1.3.2</version>
				<type>pom</type>
			</dependency>

			<!-- https://mvnrepository.com/artifact/org.apache.hbase/hbase-common -->
			<dependency>
				<groupId>org.apache.hbase</groupId>
				<artifactId>hbase-common</artifactId>
				<version>1.3.2</version>
			</dependency>
			<!-- https://mvnrepository.com/artifact/org.apache.hbase/hbase-server -->
			<dependency>
				<groupId>org.apache.hbase</groupId>
				<artifactId>hbase-server</artifactId>
				<version>1.3.2</version>
			</dependency>


			<!-- https://mvnrepository.com/artifact/org.apache.hbase/hbase-client -->
			<dependency>
				<groupId>org.apache.hbase</groupId>
				<artifactId>hbase-client</artifactId>
				<version>1.3.2</version>
			</dependency>

			<!-- https://mvnrepository.com/artifact/org.apache.zookeeper/zookeeper -->
			<dependency>
				<groupId>org.apache.zookeeper</groupId>
				<artifactId>zookeeper</artifactId>
				<version>3.4.6</version>
				<type>pom</type>
			</dependency>

			<!-- https://mvnrepository.com/artifact/org.glassfish.jersey.core/jersey-client -->
			<dependency>
				<groupId>org.glassfish.jersey.core</groupId>
				<artifactId>jersey-client</artifactId>
				<version>2.26</version>
			</dependency>
		<!-- https://mvnrepository.com/artifact/org.apache.kafka/kafka-clients -->
	<dependency>
	    <groupId>org.apache.kafka</groupId>
	    <artifactId>kafka-clients</artifactId>
	    <version>0.10.0.0</version>
	</dependency>
	<!-- https://mvnrepository.com/artifact/org.apache.kafka/kafka-streams -->
	<dependency>
            <groupId>org.apache.kafka</groupId>
            <artifactId>kafka-streams</artifactId>
            <version>0.10.0.0</version>
        </dependency>

	
		
	
			

	</dependencies>
  
</project>
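Strictly speaking, only the kafka-clients and kafka-streams artifacts are used by this example; the Hadoop, HBase, ZooKeeper, and test entries are carried over from the same project's other exercises and can be trimmed for a standalone build.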

(2) Create the main class

import java.util.Properties;
import org.apache.kafka.streams.KafkaStreams;
import org.apache.kafka.streams.StreamsConfig;
import org.apache.kafka.streams.processor.Processor;
import org.apache.kafka.streams.processor.ProcessorSupplier;
import org.apache.kafka.streams.processor.TopologyBuilder;

public class Application {

	public static void main(String[] args) {

		// Input topic
		String from = "first";
		// Output topic
		String to = "second";

		// Streams configuration
		Properties settings = new Properties();
		settings.put(StreamsConfig.APPLICATION_ID_CONFIG, "logFilter");
		settings.put(StreamsConfig.BOOTSTRAP_SERVERS_CONFIG, "hadoop102:9092");

		StreamsConfig config = new StreamsConfig(settings);

		// Build the topology: SOURCE -> PROCESS -> SINK
		TopologyBuilder builder = new TopologyBuilder();

		builder.addSource("SOURCE", from)
		       .addProcessor("PROCESS", new ProcessorSupplier<byte[], byte[]>() {

		           @Override
		           public Processor<byte[], byte[]> get() {
		               // The actual record-level processing
		               return new LogProcessor();
		           }
		       }, "SOURCE")
		       .addSink("SINK", to, "PROCESS");

		// Create and start the Kafka Streams instance
		KafkaStreams streams = new KafkaStreams(builder, config);
		streams.start();
	}
}
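For reference, the same one-record transformation can also be written with the Streams DSL that ships in the same 0.10.0.0 release. The sketch below is an alternative under stated assumptions, not part of the original tutorial: it reuses the same topics and broker, the class name DslApplication and application id logFilterDsl are made up, and it relies on the default byte-array serdes of 0.10.x.

import java.util.Properties;
import org.apache.kafka.streams.KafkaStreams;
import org.apache.kafka.streams.StreamsConfig;
import org.apache.kafka.streams.kstream.KStream;
import org.apache.kafka.streams.kstream.KStreamBuilder;
import org.apache.kafka.streams.kstream.ValueMapper;

public class DslApplication {

	public static void main(String[] args) {
		Properties settings = new Properties();
		// "logFilterDsl" is an illustrative application id
		settings.put(StreamsConfig.APPLICATION_ID_CONFIG, "logFilterDsl");
		settings.put(StreamsConfig.BOOTSTRAP_SERVERS_CONFIG, "hadoop102:9092");

		// KStreamBuilder is the DSL counterpart of TopologyBuilder in 0.10.x
		KStreamBuilder builder = new KStreamBuilder();

		// Default serdes are byte arrays, so no explicit Serde setup is needed here
		KStream<byte[], byte[]> source = builder.stream("first");
		source.mapValues(new ValueMapper<byte[], byte[]>() {
			@Override
			public byte[] apply(byte[] value) {
				String input = new String(value);
				// Same cleaning rule as LogProcessor: keep what follows ">>>"
				if (input.contains(">>>")) {
					input = input.split(">>>")[1].trim();
				}
				return input.getBytes();
			}
		}).to("second");

		new KafkaStreams(builder, new StreamsConfig(settings)).start();
	}
}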

(3) Business logic (the processor)

import org.apache.kafka.streams.processor.Processor;
import org.apache.kafka.streams.processor.ProcessorContext;

public class LogProcessor implements Processor<byte[], byte[]> {

	private ProcessorContext context;

	@Override
	public void init(ProcessorContext context) {
		this.context = context;
	}

	@Override
	public void process(byte[] key, byte[] value) {
		String input = new String(value);

		// If the input contains ">>>", keep only the content after the marker;
		// otherwise pass the input through unchanged
		if (input.contains(">>>")) {
			input = input.split(">>>")[1].trim();
		}
		// Forward to the downstream node (the sink topic)
		context.forward("logProcessor".getBytes(), input.getBytes());
	}

	@Override
	public void punctuate(long timestamp) {
	}

	@Override
	public void close() {
	}
}
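One caveat: Java's String.split drops trailing empty strings, so an input that ends with the marker (e.g. "abc>>>") yields a one-element array and split(">>>")[1] would throw ArrayIndexOutOfBoundsException; guard for that in real use. Below is a minimal JUnit 4 sanity check of the cleaning rule (JUnit is already in the pom); the clean helper is a hypothetical extraction that mirrors the logic inside process():

import static org.junit.Assert.assertEquals;

import org.junit.Test;

public class LogProcessorTest {

	// Hypothetical helper mirroring the cleaning rule in LogProcessor.process()
	private String clean(String input) {
		if (input.contains(">>>")) {
			return input.split(">>>")[1].trim();
		}
		return input;
	}

	@Test
	public void keepsOnlyContentAfterMarker() {
		assertEquals("ximenqing", clean("atguigu>>>ximenqing"));
		assertEquals("world", clean("hello>>>world"));
		// Inputs without the marker pass through unchanged
		assertEquals("hahaha", clean("hahaha"));
	}
}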

(4) Run the program.
(5) Start a producer on HadoopNode1 (note: produce to the input topic "first" defined in Application, not "test"):

kafka-console-producer.sh --broker-list hadoop102:9092 --topic first

>hello>>>world
>h>>>atguigu
>hahaha

(6) Start a consumer on HadoopNode3:

kafka-console-consumer.sh --zookeeper hadoop102:2181 --from-beginning --topic second

world
atguigu
hahaha
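The --zookeeper flag belongs to the old console consumer shipped with Kafka 0.10; on newer Kafka distributions you would point the console consumer at the brokers with --bootstrap-server hadoop102:9092 instead.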

Reference: 尚硅谷 (Atguigu) Big Data course materials
