logger server – Flume (reads the data) – Kafka – Flume – HDFS
Layer-1 Flume: taildir source – memory channel (or file channel) – kafka sink [traditional architecture]
               taildir source – kafka channel [using a Kafka channel] [√ used here]
Layer-2 Flume: kafka source – memory channel (or file channel) – hdfs sink [traditional architecture] [√ used here]: an interceptor has to be added, and without a source there is nowhere to attach one
               kafka channel – hdfs sink [using a Kafka channel]
Before implementing the layer-1 Flume, the data has to be cleaned: records that do not conform to the expected format are dropped as they are read.
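The cleaning rule is simply "a record must be a complete JSON string", which is exactly what the interceptor further down checks with fastjson. As a minimal standalone sketch (the class name and the sample lines are invented here; fastjson itself is pulled in by the pom that follows), the check behaves like this:

import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONException;

// Hypothetical demo class, not part of the project code
public class EtlRuleDemo {
    public static void main(String[] args) {
        String[] lines = {
                "{\"common\":{\"mid\":\"mid_001\"},\"ts\":1609459200000}",   // complete record
                "{\"common\":{\"mid\":\"mid_001\"},\"ts\":16094"             // line cut off mid-way
        };
        for (String line : lines) {
            try {
                JSON.parseObject(line);                // throws JSONException on malformed JSON
                System.out.println("keep: " + line);
            } catch (JSONException e) {
                System.out.println("drop: " + line);
            }
        }
    }
}

Anything that fails to parse, such as a truncated line, is dropped before it ever reaches Kafka.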
Create a Maven project and edit the dependency information (pom.xml):
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.hike.gmall</groupId>
    <artifactId>Colllect</artifactId>
    <version>1.0-SNAPSHOT</version>

    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>2.3.2</version>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                </configuration>
            </plugin>
            <plugin>
                <artifactId>maven-assembly-plugin</artifactId>
                <configuration>
                    <descriptorRefs>
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                </configuration>
                <executions>
                    <execution>
                        <id>make-assembly</id>
                        <phase>package</phase>
                        <goals>
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>

    <dependencies>
        <dependency>
            <groupId>org.apache.flume</groupId>
            <artifactId>flume-ng-core</artifactId>
            <version>1.9.0</version>
            <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>1.2.62</version>
            <scope>compile</scope>
        </dependency>
    </dependencies>
</project>
Write the ETL interceptor:
package com.hike.gmall.interceptor;

import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONException;
import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.interceptor.Interceptor;

import java.nio.charset.StandardCharsets;
import java.util.Iterator;
import java.util.List;

public class ETLLogInterceptor implements Interceptor {

    @Override
    public void initialize() {
    }

    @Override
    public Event intercept(Event event) {
        // 1 Take the body out of the event
        String body = new String(event.getBody(), StandardCharsets.UTF_8);
        // 2 Use fastjson to check whether the record is complete JSON
        try {
            JSON.parseObject(body);
        } catch (JSONException e) {
            // Malformed record: signal the caller to drop it
            return null;
        }
        return event;
    }

    @Override
    public List<Event> intercept(List<Event> list) {
        // An enhanced for loop cannot remove elements while iterating,
        // so an explicit iterator is used instead
        Iterator<Event> iterator = list.iterator();
        while (iterator.hasNext()) {
            Event event = iterator.next();
            if (intercept(event) == null) {
                iterator.remove();
            }
        }
        return list;
    }

    @Override
    public void close() {
    }

    public static class MyBuilder implements Builder {
        @Override
        public Interceptor build() {
            return new ETLLogInterceptor();
        }

        @Override
        public void configure(Context context) {
        }
    }
}
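Before packaging, the interceptor can be sanity-checked with a small throwaway driver (not part of the original project; it can be removed again before packaging). EventBuilder comes from flume-ng-core, and the two event bodies are invented:

package com.hike.gmall.interceptor;

import org.apache.flume.Event;
import org.apache.flume.event.EventBuilder;
import org.apache.flume.interceptor.Interceptor;

import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;

// Hypothetical test driver
public class ETLLogInterceptorDriver {
    public static void main(String[] args) {
        // Build the interceptor the same way Flume does, through its Builder
        Interceptor interceptor = new ETLLogInterceptor.MyBuilder().build();

        List<Event> events = new ArrayList<>();
        events.add(EventBuilder.withBody("{\"ts\":1609459200000}", StandardCharsets.UTF_8));  // valid JSON
        events.add(EventBuilder.withBody("not a json line", StandardCharsets.UTF_8));         // malformed

        // The malformed event should have been removed, leaving a single event
        System.out.println(interceptor.intercept(events).size());   // expected: 1
    }
}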
Package the project and pick the jar with dependencies, Colllect-1.0-SNAPSHOT-jar-with-dependencies.jar.
Upload it to /opt/module/flume-1.9.0/lib.
a1.sources = r1
a1.channels = c1

a1.sources.r1.type = TAILDIR
a1.sources.r1.filegroups = f1
a1.sources.r1.filegroups.f1 = /opt/module/applog/log/app.*
a1.sources.r1.positionFile = /opt/module/flume-1.9.0/jobs/position/position.json
a1.sources.r1.interceptors = i1
a1.sources.r1.interceptors.i1.type = com.hike.gmall.interceptor.ETLLogInterceptor$MyBuilder

a1.channels.c1.type = org.apache.flume.channel.kafka.KafkaChannel
a1.channels.c1.kafka.bootstrap.servers = hadoop101:9092,hadoop102:9092,hadoop103:9092
a1.channels.c1.kafka.topic = topic_log
# Write only the event body to Kafka, not a serialized Flume event with headers
a1.channels.c1.parseAsFlumeEvent = false

a1.sources.r1.channels = c1
cd /opt/module/flume-1.9.0/jobs/gmall
vim logserver-flume-kafka.conf

# Start a console consumer on the topic
kafka-console-consumer.sh --topic topic_log --bootstrap-server hadoop101:9092

# Start the Flume agent and check whether the consumer receives the collected data
flume-ng agent -c $FLUME_HOME/conf -f $FLUME_HOME/jobs/gmall/logserver-flume-kafka.conf -n a1 -Dflume.root.logger=INFO,console

# Optionally run the log-generation script again to produce more data and confirm it is collected
#!/bin/bash
if [ $# -lt 1 ]
then
    echo "USAGE: f1.sh {start|stop}"
    exit
fi

case $1 in
start)
    for i in hadoop101 hadoop102
    do
        ssh $i "nohup flume-ng agent -c $FLUME_HOME/conf -f $FLUME_HOME/jobs/gmall/logserver-flume-kafka.conf -n a1 -Dflume.root.logger=INFO,console 1>$FLUME_HOME/logs/flume.log 2>&1 &"
    done
;;
stop)
    for i in hadoop101 hadoop102
    do
        ssh $i "ps -ef | grep logserver-flume-kafka.conf | grep -v grep | awk '{print \$2}' | xargs -n1 kill -9"
    done
;;
*)
    echo "USAGE: f1.sh {start|stop}"
    exit
;;
esac
cd /opt/module/flume-1.9.0
mkdir logs

cd ~/bin
vim f1.sh
chmod u+x f1.sh

# f1.sh also starts an agent on hadoop102, so distribute Flume and the environment file there
scp -r /opt/module/flume-1.9.0/ hadoop102:/opt/module/
scp /etc/profile.d/my_env.sh root@hadoop102:/etc/profile.d/
Layer-2 Flume: kafka source – memory channel (or file channel) – hdfs sink [traditional architecture] [√ used here]: an interceptor has to be added, and without a source there is nowhere to attach one
               kafka channel – hdfs sink [using a Kafka channel]
The timestamp written with each event should be the time the log was generated, not the time it happens to be processed (transport through the pipeline also takes time). So a timestamp interceptor is added at the kafka source instead of relying on the local timestamp.
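A small illustrative sketch of why this matters (the timestamps are invented, and the formatting assumes an Asia/Shanghai, UTC+8, system timezone): an event generated just before midnight may only be written by the HDFS sink a few seconds after midnight. With the local clock it would fall into the next day's %Y-%m-%d directory; with the ts carried in the event it stays in the correct one.

import java.text.SimpleDateFormat;
import java.util.Date;

// Hypothetical demo class
public class DayPartitionDemo {
    public static void main(String[] args) {
        // Formats an epoch-millisecond timestamp the same way the sink's %Y-%m-%d path does
        SimpleDateFormat day = new SimpleDateFormat("yyyy-MM-dd");

        // Invented values, interpreted in an assumed UTC+8 system timezone
        long generatedTs = 1623945599000L;   // 2021-06-17 23:59:59 – ts inside the log line
        long writtenTs   = 1623945605000L;   // 2021-06-18 00:00:05 – when the sink finally writes it

        System.out.println("directory by event time: " + day.format(new Date(generatedTs))); // 2021-06-17
        System.out.println("directory by local time: " + day.format(new Date(writtenTs)));   // 2021-06-18
    }
}

The interceptor below reads ts out of the JSON body and puts it into the event header under the key timestamp, which is what the HDFS sink uses for the path when hdfs.useLocalTimeStamp is not enabled.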
package com.hike.gmall.interceptor;

import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONObject;
import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.interceptor.Interceptor;

import java.nio.charset.StandardCharsets;
import java.util.List;

public class TimeStampInterceptor implements Interceptor {

    @Override
    public void initialize() {
    }

    @Override
    public Event intercept(Event event) {
        // 1 Take the body out of the event
        String body = new String(event.getBody(), StandardCharsets.UTF_8);
        // 2 Parse the JSON body into an object
        JSONObject jsonObject = JSON.parseObject(body);
        // 3 Read the ts field from the object
        String ts = jsonObject.getString("ts");
        // 4 Put ts into the event header so the HDFS sink partitions by event time
        event.getHeaders().put("timestamp", ts);
        return event;
    }

    @Override
    public List<Event> intercept(List<Event> list) {
        for (Event event : list) {
            intercept(event);
        }
        return list;
    }

    @Override
    public void close() {
    }

    public static class MyBuilder implements Builder {
        @Override
        public Interceptor build() {
            return new TimeStampInterceptor();
        }

        @Override
        public void configure(Context context) {
        }
    }
}
Deploy the layer-2 Flume on hadoop103:
scp -r /opt/module/flume-1.9.0/ hadoop103:/opt/module/
scp /etc/profile.d/my_env.sh root@hadoop103:/etc/profile.d/

# Do the following on hadoop103
cd /opt/module/flume-1.9.0/lib
# Remove the copied-over jar, then repackage the project (it now also contains TimeStampInterceptor)
rm -rf Colllect-1.0-SNAPSHOT-jar-with-dependencies.jar
# Upload the new jar package

# Create the directories used by the file channel
cd /opt/module/flume-1.9.0/jobs
mkdir filechannel
mkdir checkpoint
a1.sources = r1
a1.channels = c1
a1.sinks = k1

a1.sources.r1.type = org.apache.flume.source.kafka.KafkaSource
a1.sources.r1.kafka.bootstrap.servers = hadoop101:9092,hadoop102:9092,hadoop103:9092
a1.sources.r1.kafka.topics = topic_log
a1.sources.r1.kafka.consumer.group.id = gmall
a1.sources.r1.batchDurationMillis = 2000
a1.sources.r1.interceptors = i1
a1.sources.r1.interceptors.i1.type = com.hike.gmall.interceptor.TimeStampInterceptor$MyBuilder

a1.channels.c1.type = file
# Directory where the file channel stores its event data on disk
a1.channels.c1.dataDirs = /opt/module/flume-1.9.0/jobs/filechannel
# Maximum number of events the channel can hold
a1.channels.c1.capacity = 1000000
# Directory where the channel checkpoint is persisted, so its state can be recovered after a crash
a1.channels.c1.checkpointDir = /opt/module/flume-1.9.0/jobs/checkpoint
#a1.channels.c1.useDualCheckpoints = true
#a1.channels.c1.backupCheckpointDir = /opt/module/flume-1.9.0/jobs/checkpoint-bk
a1.channels.c1.transactionCapacity = 10000
a1.channels.c1.maxFileSize = 2146435071
a1.channels.c1.keep-alive = 5

a1.sinks.k1.type = hdfs
a1.sinks.k1.hdfs.path = /origin_data/gmall/log/topic_log/%Y-%m-%d
a1.sinks.k1.hdfs.filePrefix = log-
a1.sinks.k1.hdfs.round = false
# Roll files every 10 seconds or at 128 MB, never by event count
a1.sinks.k1.hdfs.rollInterval = 10
a1.sinks.k1.hdfs.rollSize = 134217728
a1.sinks.k1.hdfs.rollCount = 0
# Write compressed output (lzop) instead of a plain data stream
a1.sinks.k1.hdfs.fileType = CompressedStream
a1.sinks.k1.hdfs.codeC = lzop

a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
cd /opt/module/flume-1.9.0/jobs/gmall
vim kafka-flume-hdfs.conf

flume-ng agent -c $FLUME_HOME/conf -f $FLUME_HOME/jobs/gmall/kafka-flume-hdfs.conf -n a1 -Dflume.root.logger=INFO,console
# Once this is running, the collected files can be seen on HDFS
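With hdfs.path = /origin_data/gmall/log/topic_log/%Y-%m-%d and the timestamp header set by the interceptor, each day's events land in their own directory. The layout looks roughly like the sketch below; the dates and the numeric part of the file names are invented, and the .lzo extension assumes the LZO codec is actually available on the Hadoop cluster:

/origin_data/gmall/log/topic_log/2021-06-17/log-.1623945xxxxx.lzo
/origin_data/gmall/log/topic_log/2021-06-18/log-.1624031xxxxx.lzo
/origin_data/gmall/log/topic_log/2021-06-18/log-.1624032xxxxx.lzo.tmp    (file still being written)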
#!/bin/bash
if [ $# -lt 1 ]
then
    echo "USAGE: f2.sh {start|stop}"
    exit
fi

case $1 in
start)
    for i in hadoop103
    do
        ssh $i "nohup flume-ng agent -c $FLUME_HOME/conf -f $FLUME_HOME/jobs/gmall/kafka-flume-hdfs.conf -n a1 -Dflume.root.logger=INFO,console 1>$FLUME_HOME/logs/flume.log 2>&1 &"
    done
;;
stop)
    for i in hadoop103
    do
        ssh $i "ps -ef | grep kafka-flume-hdfs.conf | grep -v grep | awk '{print \$2}' | xargs -n1 kill -9"
    done
;;
*)
    echo "USAGE: f2.sh {start|stop}"
    exit
;;
esac