Collecting nginx logs with Flume + Kafka and storing them in HDFS

1. Prepare a runnable jar of the web project and run it on hdp-3

[root@hdp-3 apps]# java -jar springbt-0.0.1-SNAPSHOT.jar
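Once the jar is running, a quick way to confirm the service is listening on port 8088 (the port the nginx upstream in step 2 points at) is to hit it from a second shell; the exact response depends on the application:

[root@hdp-3 apps]# curl http://hdp-3:8088/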

2. On hdp-1, configure nginx to proxy the project running on hdp-3, then start the nginx server

Edit /usr/local/nginx/conf/nginx.conf and add the following inside the http block:

    upstream frame-tomcat {
        server hdp-3:8088;       # host and port where the jar is running
    }
    server {
        listen       80;
        server_name  hdp-1;      # server name entered in the browser
 
        #charset koi8-r;
 
        access_log  logs/log.frame.access.log  main;
 
        location / {
            # root   html;
            # index  index.html index.htm;
            proxy_pass http://frame-tomcat;
        }
 
        error_page   500 502 503 504  /50x.html;
        location = /50x.html {
            root   html;
        }
    }  

Start nginx:

[root@hdp-1 sbin]# ./nginx
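To confirm the proxy works end to end, send a request through nginx on hdp-1 and check that a line is appended to the access log configured above (host names and log path taken from this setup):

[root@hdp-1 sbin]# curl http://hdp-1/
[root@hdp-1 sbin]# tail -1 /usr/local/nginx/logs/log.frame.access.log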

3. Start the Kafka cluster

[root@hdp-1 bin]# sh start-allkafka.sh        # start the whole cluster with one script
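
The Flume sink in the next step writes to a topic named xin, so create it first if it does not already exist. A minimal sketch, assuming a ZooKeeper-managed Kafka (an older kafka-topics.sh that accepts --zookeeper) and the hdp-1/hdp-2/hdp-3 ZooKeeper quorum used elsewhere in this setup; adjust partitions and replication as needed:

[root@hdp-1 bin]# sh kafka-topics.sh --create --zookeeper hdp-1:2181,hdp-2:2181,hdp-3:2181 --replication-factor 2 --partitions 3 --topic xin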

 

4. Configure Flume to collect the nginx log file and sink it to the Kafka cluster

       Flume must run on the same machine as nginx, since the exec source tails the local access log. Put the following agent configuration in flume_kafka.conf:

# Name the agent's core components
ag1.sources = r1
ag1.sinks = k1
ag1.channels = c1
# Source: tail the nginx access log
ag1.sources.r1.type = exec
ag1.sources.r1.command = tail -F /usr/local/nginx/logs/log.frame.access.log

# Describe the sink
ag1.sinks.k1.type = org.apache.flume.sink.kafka.KafkaSink
ag1.sinks.k1.kafka.topic=xin
ag1.sinks.k1.kafka.bootstrap.servers=hdp-1:9092,hdp-2:9092,hdp-3:9092
# Use a channel which buffers events in memory
ag1.channels.c1.type = memory
ag1.channels.c1.capacity = 20000
ag1.channels.c1.transactionCapacity = 10000
# Bind the source and sink to the channel
ag1.sources.r1.channels = c1
ag1.sinks.k1.channel = c1

     Start Flume:

[root@hdp-1 bin]# ./flume-ng agent -c ../conf/ -f ../flume_kafka.conf -n ag1 -Dflume.root.logger=INFO,console
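
Before writing the Java consumer, it is worth checking that log lines actually arrive in the topic. A quick check with Kafka's console consumer, assuming a Kafka version whose console consumer accepts --bootstrap-server, using the broker list from the sink configuration above:

[root@hdp-1 bin]# sh kafka-console-consumer.sh --bootstrap-server hdp-1:9092,hdp-2:9092,hdp-3:9092 --topic xin --from-beginning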

5. Implement a consumer with the Java API and store the data in HDFS

      Method 1: the Consumer reads string-typed records via ConsumerRecords, converts each record into a stream, and writes it to HDFS.

                    Drawback: to guarantee that the data from every click is stored in HDFS, each write creates a new file named with a timestamp plus a random number, so many small files accumulate.

        Consumer.java

package csdn;

import java.util.Collections;
import java.util.Properties;

import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;


public class Consumer {
    private static KafkaConsumer<String, String> consumer;
    private static Properties props;
    static {
        props = new Properties();
        // Kafka broker address
        props.put("bootstrap.servers", "hdp-2:9092");
        // zookeeper.connect is not used by the new consumer API (bootstrap.servers is enough)
        props.put("zookeeper.connect", "hdp-1:2181,hdp-2:2181,hdp-3:2181");
        // key/value deserializers
        props.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        props.put("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        // consumer group id
        props.put("group.id", "wang");
    }

    private static void ConsumerMessage() {
        HDFSWriter hdfsWriter = new HDFSWriter();
        // enable automatic offset commits
        props.put("enable.auto.commit", true);
        consumer = new KafkaConsumer<>(props);
        consumer.subscribe(Collections.singleton("xin"));

        // Poll for data in a loop. Kafka keeps messages until the configured retention period
        // expires; messages that were already consumed can be read again by seeking to an offset.
        try {
            while (true) {
                // fetch a batch of records
                ConsumerRecords<String, String> records = consumer.poll(100);
                for (ConsumerRecord<String, String> r : records) {
                    System.out.printf("topic = %s, offset = %s, key = %s, value = %s%n", r.topic(), r.offset(),
                            r.key(), r.value());
                    hdfsWriter.writer(r.toString());
                }
            }

        } finally {
            consumer.close();
        }
    }

    public static void main(String[] args) {
        ConsumerMessage();
    }

}

HDFSWriter.java

package csdn;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;

import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.net.URISyntaxException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Random;

public class HDFSWriter {
    public void writer(String str) {
        try {
            InputStream inputStream = new BufferedInputStream(
                    new ByteArrayInputStream(str.getBytes())); // wrap the string's bytes in a buffered input stream

            URI uri= null;
            FileSystem fs=null;
            try {
                uri = new URI("hdfs://hdp-1:9000");
                Configuration conf = new Configuration();
                conf.set("dfs.blocksize","64m");
                conf.set("dfs.replication", "1");
                String user="root";
                 fs = FileSystem.get(uri, conf, user);
            } catch (Exception e) {
                e.printStackTrace();
            }

            Date date = new Date();
            String strDateFormat = "yyyy-MM-dd-HH-mm-ss";
            SimpleDateFormat sdf = new SimpleDateFormat(strDateFormat);
            Random random = new Random();
            int end = random.nextInt(99);
            final String ends = String.format("%02d", end); // pad to two digits with a leading zero

            String name = sdf.format(date).toString()+ends;
            final Path path = new Path("/kafka/" + name + ".log");
            // create() makes a new file each time; to append to an existing file, use append() instead
            FSDataOutputStream fsDataOutputStream = fs.create(path);

//          write str to HDFS directly:
//          OutputStreamWriter out = new OutputStreamWriter(fsDataOutputStream);
//          out.write(str);
            // copy as a stream; the final "true" closes both the input and output streams
            IOUtils.copyBytes(inputStream, fsDataOutputStream, 1024, true);

            fs.close();
        } catch (IOException e) {

            e.printStackTrace();
        }
    }
}
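
After running the consumer (and clicking around the web app to generate traffic), the files written by HDFSWriter should appear under /kafka on HDFS. A quick check, assuming the HDFS client is available on hdp-1:

[root@hdp-1 ~]# hdfs dfs -ls /kafka
[root@hdp-1 ~]# hdfs dfs -cat "/kafka/*.log" | head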

 
