1. Flume configuration
Create a file and have Flume tail it; Flume then acts as the Kafka producer that feeds the downstream consumer. The agent configuration is as follows:
############### Name the source, sink and channel ###############
kafka_agent.sources = kafka_source
kafka_agent.sinks = kafka_sink
kafka_agent.channels = kafka_channel
############## Configure the source #####################
# Read from the log file
kafka_agent.sources.kafka_source.type = exec
kafka_agent.sources.kafka_source.command = tail -F /usr/local/flume1.8/test/MultilevelComputing.log
kafka_agent.sources.kafka_source.shell = /bin/bash -c
############### Configure the sink ######################
# Forward to Kafka
kafka_agent.sinks.kafka_sink.type = org.apache.flume.sink.kafka.KafkaSink
# Topic to publish to
kafka_agent.sinks.kafka_sink.kafka.topic = Multilevel
# Kafka broker address
kafka_agent.sinks.kafka_sink.kafka.bootstrap.servers = htkj101:9092
# Batch size: send after every single message
kafka_agent.sinks.kafka_sink.kafka.flumeBatchSize = 1
# Ack policy for confirming delivery (-1 = wait for all in-sync replicas)
kafka_agent.sinks.kafka_sink.kafka.producer.acks = -1
kafka_agent.sinks.kafka_sink.kafka.producer.linger.ms = 1
kafka_agent.sinks.kafka_sink.kafka.producer.compression.type = snappy
############################ Configure the channel ###################
# Use a file channel as the temporary buffer; it is safer than a memory channel
kafka_agent.channels.kafka_channel.type = file
kafka_agent.channels.kafka_channel.checkpointDir = /home/uplooking/data/flume/checkpoint
kafka_agent.channels.kafka_channel.dataDirs = /home/uplooking/data/flume/data
########################### Wire the three components together #########################
kafka_agent.sources.kafka_source.channels = kafka_channel
kafka_agent.sinks.kafka_sink.channel = kafka_channel
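Before adding Storm, it is worth checking the Flume → Kafka leg on its own. A minimal throwaway consumer can print whatever Flume pushes to the topic. This is only a sketch: it assumes the htkj101:9092 broker and the Multilevel topic from the configuration above, and the class and group id names are hypothetical.
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;
import java.util.Collections;
import java.util.Properties;
public class MultilevelTopicCheck {
    public static void main(String[] args) {
        Properties props = new Properties();
        //Broker from the Flume sink configuration above
        props.put("bootstrap.servers", "htkj101:9092");
        //Throwaway consumer group, used only for this check
        props.put("group.id", "multilevel_check");
        props.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        props.put("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        KafkaConsumer<String, String> consumer = new KafkaConsumer<>(props);
        consumer.subscribe(Collections.singletonList("Multilevel"));
        try {
            while (true) {
                //poll(long) is the 0.10.x API; each record value is one log line written by Flume
                ConsumerRecords<String, String> records = consumer.poll(1000);
                for (ConsumerRecord<String, String> record : records) {
                    System.out.println(record.offset() + " -> " + record.value());
                }
            }
        } finally {
            consumer.close();
        }
    }
}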
2. Storm consumes the Kafka data and performs real-time computation
2.1 pom.xml
The key dependencies are listed below; the SLF4J/Log4j bindings are excluded to avoid pulling conflicting logging implementations onto the classpath.
<dependencies>
    <dependency>
        <groupId>org.apache.storm</groupId>
        <artifactId>storm-core</artifactId>
        <version>1.1.0</version>
        <exclusions>
            <exclusion>
                <groupId>org.slf4j</groupId>
                <artifactId>slf4j-log4j12</artifactId>
            </exclusion>
            <exclusion>
                <groupId>org.slf4j</groupId>
                <artifactId>log4j-over-slf4j</artifactId>
            </exclusion>
        </exclusions>
    </dependency>
    <dependency>
        <groupId>org.apache.kafka</groupId>
        <artifactId>kafka-clients</artifactId>
        <version>0.10.0.1</version>
    </dependency>
    <dependency>
        <groupId>org.apache.kafka</groupId>
        <artifactId>kafka_2.11</artifactId>
        <version>0.10.0.1</version>
        <exclusions>
            <exclusion>
                <groupId>org.apache.zookeeper</groupId>
                <artifactId>zookeeper</artifactId>
            </exclusion>
            <exclusion>
                <groupId>org.slf4j</groupId>
                <artifactId>slf4j-log4j12</artifactId>
            </exclusion>
            <exclusion>
                <groupId>org.slf4j</groupId>
                <artifactId>slf4j-api</artifactId>
            </exclusion>
            <exclusion>
                <groupId>org.slf4j</groupId>
                <artifactId>log4j-over-slf4j</artifactId>
            </exclusion>
            <exclusion>
                <groupId>org.apache.logging.log4j</groupId>
                <artifactId>log4j-slf4j-impl</artifactId>
            </exclusion>
        </exclusions>
    </dependency>
    <dependency>
        <groupId>org.apache.storm</groupId>
        <artifactId>storm-kafka-client</artifactId>
        <version>1.1.0</version>
    </dependency>
    <dependency>
        <groupId>org.apache.storm</groupId>
        <artifactId>storm-jdbc</artifactId>
        <version>1.1.1</version>
    </dependency>
    <dependency>
        <groupId>mysql</groupId>
        <artifactId>mysql-connector-java</artifactId>
        <version>5.1.31</version>
    </dependency>
</dependencies>
2.2 KafkaSpout
The KafkaSpout pulls the data out of Kafka. It is configured in the topology class below, which also wires the spout and bolts together.
package com.htkj.multilevelcomputing.Storm;
import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.common.serialization.StringDeserializer;
import org.apache.storm.Config;
import org.apache.storm.LocalCluster;
import org.apache.storm.kafka.spout.KafkaSpout;
import org.apache.storm.kafka.spout.KafkaSpoutConfig;
import org.apache.storm.topology.TopologyBuilder;
import org.apache.storm.tuple.Fields;
import org.apache.storm.utils.Utils;
public class KafkaTopo {
public static void main(String[] args) {
//Create the TopologyBuilder
TopologyBuilder topologyBuilder = new TopologyBuilder();
KafkaSpoutConfig.Builder kafkaSpoutConfigBuilder;
//Kafka connection info
String bootstrapServers="htkj101:9092,htkj102:9093,htkj103:9094";
//Topic
String topic = "Multilevel";
/**
* Construct the KafkaSpoutConfig.Builder
*
* bootstrapServers: Kafka connection address, ip:port
* StringDeserializer: deserializer for the record key
* StringDeserializer: deserializer for the record value
* topic: topic name
*/
kafkaSpoutConfigBuilder = new KafkaSpoutConfig.Builder<>(
bootstrapServers,
StringDeserializer.class,
StringDeserializer.class,
topic);
//Use the builder to construct the KafkaSpoutConfig and set the relevant properties
KafkaSpoutConfig kafkaSpoutConfig = kafkaSpoutConfigBuilder
/**
* Set the consumer group id
*/
.setProp(ConsumerConfig.GROUP_ID_CONFIG, topic.toLowerCase() + "_storm_group")
/**
* Set the session timeout; the value must lie within
* [group.min.session.timeout.ms, group.max.session.timeout.ms], i.e. [6000, 300000]
* Default: 10000
*/
.setProp(ConsumerConfig.SESSION_TIMEOUT_MS_CONFIG, "100000")
/**
* Maximum amount of data fetched per partition
*/
.setProp(ConsumerConfig.MAX_PARTITION_FETCH_BYTES_CONFIG, "1048576")
/**
* Maximum time the client waits for a request response
* Default: 30000
*/
.setProp(ConsumerConfig.REQUEST_TIMEOUT_MS_CONFIG, "300000")
/**
* Expected interval between heartbeats to the consumer coordinator.
* Heartbeats keep the consumer session alive and drive rebalancing when consumers join or leave the group.
* Default: 3000 (usually set to no more than one third of session.timeout.ms)
*/
.setProp(ConsumerConfig.HEARTBEAT_INTERVAL_MS_CONFIG, "30000")
/**
* Commit offsets every 15 s (default 30 s)
*/
.setOffsetCommitPeriodMs(15000)
/**
* Maximum number of records per poll; they should be processable within the session timeout
*/
.setMaxPollRecords(20)
/**
* Strategy for the first poll offset
*/
.setFirstPollOffsetStrategy(KafkaSpoutConfig.FirstPollOffsetStrategy.LATEST)
/**
* Build the KafkaSpoutConfig
*/
.build();
//setSpout: receive data from Kafka
topologyBuilder.setSpout("kafkaSpout",new KafkaSpout(kafkaSpoutConfig));
//setBolt: process the data received from Kafka
topologyBuilder.setBolt("KafkaSpoutBolt", new KafkaBolt()).localOrShuffleGrouping("kafkaSpout");
//setBolt: process the data emitted by KafkaSpoutBolt
topologyBuilder.setBolt("MultiBolt",new MultiBolt()).fieldsGrouping("KafkaSpoutBolt",new Fields("orderSn","cateId","goodsAmount","parentId","CEOId"));
//setBolt: process the data emitted by MultiBolt
topologyBuilder.setBolt("ComputingBolt",new ComputingBolt()).fieldsGrouping("MultiBolt",new Fields("CEOId","parentId","goodsAmount"));
Config config = new Config();
/**
* Communication timeout between the supervisor and its workers (in seconds).
* If exceeded, the supervisor restarts the worker
*/
config.put("supervisor.worker.timeout.secs",600000);
/**
* Session timeout between Storm and ZooKeeper
*/
config.put("storm.zookeeper.session.timeout",1200000000);
/**
* Enable debug mode for more verbose log output
* Only enable this in local LocalCluster mode
*/
config.setDebug(true);
LocalCluster localCluster = new LocalCluster();
localCluster.submitTopology("KafKaTopo", config, topologyBuilder.createTopology());
Utils.sleep(Long.MAX_VALUE);
localCluster.shutdown();
}
}
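The topology above is submitted to a LocalCluster, which is convenient for development and debugging. On a real cluster the LocalCluster block at the end of main() would be replaced by a StormSubmitter call (and config.setDebug(true) removed, since the comment above notes it is only for local mode). The following is only a sketch; the worker count is a placeholder.
//Sketch: submit to a running Storm cluster instead of the LocalCluster above.
//Extra imports needed: org.apache.storm.StormSubmitter,
//org.apache.storm.generated.AlreadyAliveException,
//org.apache.storm.generated.InvalidTopologyException,
//org.apache.storm.generated.AuthorizationException
config.setNumWorkers(2);   //placeholder worker count
try {
    //"KafKaTopo" matches the topology name used with LocalCluster above
    StormSubmitter.submitTopology("KafKaTopo", config, topologyBuilder.createTopology());
} catch (AlreadyAliveException | InvalidTopologyException | AuthorizationException e) {
    throw new RuntimeException(e);
}
The packaged jar would then be launched with the storm jar command on the cluster.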
2.3 KafkaBolt
This bolt receives the tuples emitted by the KafkaSpout and splits each record into its fields.
package com.htkj.multilevelcomputing.Storm;
import org.apache.storm.topology.BasicOutputCollector;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.base.BaseBasicBolt;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Tuple;
import org.apache.storm.tuple.Values;
public class KafkaBolt extends BaseBasicBolt {
@Override
public void execute(Tuple tuple, BasicOutputCollector basicOutputCollector) {
String s = tuple.getString(4);
System.out.println("kafkabolt-----"+s);
//The fields in one record are separated by a single space
String[] split = s.split(" ");
Integer orderSn=Integer.valueOf(split[0]);
Integer cateId=Integer.valueOf(split[1]);
Integer goodsAmount=Integer.valueOf(split[2]);
Integer parentId=Integer.valueOf(split[3]);
Integer CEOId=Integer.valueOf(split[4]);
//Emit downstream
basicOutputCollector.emit(new Values(orderSn,cateId,goodsAmount,parentId,CEOId));
}
@Override
public void declareOutputFields(OutputFieldsDeclarer outputFieldsDeclarer) {
outputFieldsDeclarer.declare(new Fields("orderSn","cateId","goodsAmount","parentId","CEOId"));
}
}
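A note on tuple.getString(4): with storm-kafka-client's default record translator, the KafkaSpout declares the output fields topic, partition, offset, key and value, so index 4 is the message payload. Reading the field by name is less brittle; the following is a sketch of an equivalent KafkaBolt.execute body (same class, same imports), not a change to the original code.
//Sketch only: same logic as KafkaBolt.execute above, but reading the payload by field name.
//"value" is the field name declared by the spout's default record translator.
@Override
public void execute(Tuple tuple, BasicOutputCollector basicOutputCollector) {
    String s = tuple.getStringByField("value");
    String[] split = s.split(" ");
    basicOutputCollector.emit(new Values(
            Integer.valueOf(split[0]),   //orderSn
            Integer.valueOf(split[1]),   //cateId
            Integer.valueOf(split[2]),   //goodsAmount
            Integer.valueOf(split[3]),   //parentId
            Integer.valueOf(split[4]))); //CEOId
}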
2.4 MultiBolt
This bolt applies the business rules: some product categories must not be counted towards performance. A slightly more extensible variant of the filter is sketched after the code.
package com.htkj.multilevelcomputing.Storm;
import org.apache.storm.topology.BasicOutputCollector;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.base.BaseBasicBolt;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Tuple;
import org.apache.storm.tuple.Values;
public class MultiBolt extends BaseBasicBolt {
@Override
public void declareOutputFields(OutputFieldsDeclarer outputFieldsDeclarer) {
outputFieldsDeclarer.declare(new Fields("CEOId", "parentId", "goodsAmount"));
}
@Override
public void execute(Tuple tuple, BasicOutputCollector basicOutputCollector) {
//Read the incoming fields
Integer orderSn = tuple.getIntegerByField("orderSn");
Integer cateId = tuple.getIntegerByField("cateId");
Integer goodsAmount = tuple.getIntegerByField("goodsAmount");
Integer parentId = tuple.getIntegerByField("parentId");
Integer CEOId = tuple.getIntegerByField("CEOId");
System.out.println("orderSn:" + orderSn + "cateId:" + cateId + "goodsAmount:" + goodsAmount + "parentId:" + parentId + "CEOId:" + CEOId);
//物品不能为配件和面膜
if (cateId != 9 && cateId != 65) {
System.out.println("成功");
basicOutputCollector.emit(new Values(CEOId, parentId, goodsAmount));
}
}
}
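As mentioned above, a more extensible way to express the category filter is to keep the excluded IDs in a Set. This is only a hypothetical variant of MultiBolt.execute, using the same field names and the same IDs 9 and 65 as the original code.
//Hypothetical variant of the filter; needs java.util.Arrays, java.util.HashSet and java.util.Set.
private static final Set<Integer> EXCLUDED_CATEGORIES = new HashSet<>(Arrays.asList(9, 65));

@Override
public void execute(Tuple tuple, BasicOutputCollector basicOutputCollector) {
    Integer cateId = tuple.getIntegerByField("cateId");
    //Only orders outside the excluded categories count towards performance
    if (!EXCLUDED_CATEGORIES.contains(cateId)) {
        basicOutputCollector.emit(new Values(
                tuple.getIntegerByField("CEOId"),
                tuple.getIntegerByField("parentId"),
                tuple.getIntegerByField("goodsAmount")));
    }
}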
2.5 ComputingBolt
This bolt connects to the database and writes the accumulated order amounts for each agent into it.
package com.htkj.multilevelcomputing.Storm;
import com.google.common.collect.Maps;
import org.apache.storm.jdbc.common.Column;
import org.apache.storm.jdbc.common.ConnectionProvider;
import org.apache.storm.jdbc.common.HikariCPConnectionProvider;
import org.apache.storm.jdbc.common.JdbcClient;
import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.base.BaseRichBolt;
import org.apache.storm.tuple.Tuple;
import java.sql.Types;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
public class ComputingBolt extends BaseRichBolt {
private OutputCollector collector;
//Database connection provider
private ConnectionProvider connectionProvider;
//Database access object
private JdbcClient jdbcClient;
//Amount currently stored in the database for the first-level agent (parent)
private Integer parentAmount;
//New total amount for the first-level agent
private Integer parentAllAmount;
//Amount currently stored in the database for the CEO
private Integer CEOAmount;
//New total amount for the CEO
private Integer CEOAllAmount;
@Override
public void prepare(Map stormConf, TopologyContext topologyContext, OutputCollector collector) {
this.collector = collector;
//Connection properties for HikariCP
Map map = Maps.newHashMap();
//DataSource implementation class (MySQL)
map.put("dataSourceClassName","com.mysql.jdbc.jdbc2.optional.MysqlDataSource");
//JDBC URL (no host given, so localhost is used)
map.put("dataSource.url", "jdbc:mysql:///yazan");
//Username
map.put("dataSource.user","root");
//Password
map.put("dataSource.password","123456");
//Create the connection provider
connectionProvider = new HikariCPConnectionProvider(map);
//Initialize the connection pool
connectionProvider.prepare();
//Create the database client; the second argument is the query timeout in seconds
jdbcClient = new JdbcClient(connectionProvider,30);
}
@Override
public void execute(Tuple tuple) {
Integer goodsAmount=tuple.getIntegerByField("goodsAmount");
Integer parentId=tuple.getIntegerByField("parentId");
Integer CEOId=tuple.getIntegerByField("CEOId");
System.out.println("goodsAmount-->"+goodsAmount+"parentId-->"+parentId+"CEOId-->"+CEOId);
//创建集合,存储列的条件
List parent = new ArrayList();
//添加条件
parent.add(new Column("user_id", parentId, Types.INTEGER));
//查询user_id,条件是parentId
List> selectParentId = jdbcClient.select("SELECT user_id ,amount from test WHERE user_id = ?", parent);
//如果没有ParentId,执行增加
if (selectParentId.size()==0){
System.out.println("没有数据");
jdbcClient.executeSql("INSERT INTO test (user_id,amount,lid,lid_name) VALUES("+parentId+","+goodsAmount+",10,'总代理')");
}
//If the row exists, read the current amount and add the new order amount
else {
for (List<Column> columns : selectParentId) {
for (Column column : columns) {
String columnName= column.getColumnName();
if("amount".equalsIgnoreCase(columnName)){
//Current amount in the database
parentAmount= (Integer) column.getVal();
System.out.println("current amount "+parentAmount);
}
}
}
parentAllAmount=parentAmount+goodsAmount;
System.out.println("new total "+parentAllAmount);
jdbcClient.executeSql("UPDATE test SET amount = "+parentAllAmount+" WHERE user_id = '"+parentId+"'");
}
List<Column> CEO = new ArrayList<>();
CEO.add(new Column("user_id", CEOId, Types.INTEGER));
List<List<Column>> selectCEOId = jdbcClient.select("SELECT user_id ,amount from test WHERE user_id = ?", CEO);
//If there is no row for CEOId yet, insert one
if (selectCEOId.size()==0){
System.out.println("no existing row for CEOId");
jdbcClient.executeSql("INSERT INTO test (user_id,amount,lid,lid_name) VALUES("+CEOId+","+goodsAmount+",9,'CEO')");
}
//If the row exists, read the current amount and add the new order amount
else {
for (List<Column> columns : selectCEOId) {
for (Column column : columns) {
String columnName= column.getColumnName();
if("amount".equalsIgnoreCase(columnName)){
//Current amount in the database
CEOAmount= (Integer) column.getVal();
System.out.println("current amount "+CEOAmount);
}
}
}
CEOAllAmount=CEOAmount+goodsAmount;
System.out.println("new total "+CEOAllAmount);
jdbcClient.executeSql("UPDATE test SET amount = "+CEOAllAmount+" WHERE user_id = '"+CEOId+"'");
}
collector.ack(tuple);
}
@Override
public void declareOutputFields(OutputFieldsDeclarer outputFieldsDeclarer) {
}
}
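The INSERT and UPDATE statements above are built by string concatenation. JdbcClient can also bind values through a Column list; assuming storm-jdbc's executeInsertQuery(String, List<List<Column>>) method, the insert branch for the parent agent could be written roughly like this (a sketch only, placed inside execute(), with java.util.Collections imported).
//Sketch: parameterized version of the "no existing row" insert for the parent agent.
//Values are bound to the ? placeholders instead of being concatenated into the SQL string.
List<Column> row = new ArrayList<>();
row.add(new Column("user_id", parentId, Types.INTEGER));
row.add(new Column("amount", goodsAmount, Types.INTEGER));
row.add(new Column("lid", 10, Types.INTEGER));
row.add(new Column("lid_name", "总代理", Types.VARCHAR));
jdbcClient.executeInsertQuery(
        "INSERT INTO test (user_id, amount, lid, lid_name) VALUES (?, ?, ?, ?)",
        Collections.singletonList(row));
Binding the values keeps quoting correct and avoids SQL injection if a log line ever contains unexpected characters.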
3. Shell script
Write a shell script to generate test data:
#!/bin/bash
file='/usr/local/flume1.8/test/MultilevelComputing.log'
for((i=0;i<1000000;i++))
do
orderSn=$RANDOM
cateId=$(($RANDOM%100+1))
goodsAmount=$RANDOM
parentId=$(($RANDOM%50+51))
CEOId=$(($RANDOM%50+1))
echo $orderSn $cateId $goodsAmount $parentId $CEOId >> $file;
sleep 0.001;
done
4. Test results
5. Notes and caveats
- Duplicate consumption and computation with Kafka + Storm
At first I assumed the problem was on the Kafka side, but after some digging the real cause was that
ComputingBolt extends BaseRichBolt and never acked the tuples it processed.
Adding
collector.ack(tuple);
at the end of execute() solved it.
Note that a bolt extending BaseBasicBolt does not need to ack explicitly, because the framework already does it for you (see the sketch below).
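A minimal, hypothetical pair of bolts (not part of the project) showing the difference:
//Hypothetical example showing why BaseRichBolt needs an explicit ack while BaseBasicBolt does not.
import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.BasicOutputCollector;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.base.BaseBasicBolt;
import org.apache.storm.topology.base.BaseRichBolt;
import org.apache.storm.tuple.Tuple;
import java.util.Map;

//With BaseRichBolt every tuple must be acked (or failed) by hand; a missing ack means the
//spout eventually times the tuple out and replays it, which is what caused the duplicates.
class RichLoggingBolt extends BaseRichBolt {
    private OutputCollector collector;
    @Override
    public void prepare(Map stormConf, TopologyContext context, OutputCollector collector) {
        this.collector = collector;
    }
    @Override
    public void execute(Tuple tuple) {
        System.out.println(tuple);
        collector.ack(tuple); //without this line the tuple is replayed
    }
    @Override
    public void declareOutputFields(OutputFieldsDeclarer declarer) { }
}

//BaseBasicBolt acks automatically after execute() returns (and fails the tuple on an exception),
//so no explicit ack call is needed.
class BasicLoggingBolt extends BaseBasicBolt {
    @Override
    public void execute(Tuple tuple, BasicOutputCollector collector) {
        System.out.println(tuple);
    }
    @Override
    public void declareOutputFields(OutputFieldsDeclarer declarer) { }
}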
- A storm-jdbc pitfall
While writing this I ran into an error at this point.
The cause was that the SQL statement was originally written as select * from xxx,
whereas when storm-jdbc runs a query it works off the columns in the list we build,
and since I had only added user_id to that list, the other columns could not be retrieved. The sketch below shows the query pattern that works.
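For reference, the query pattern used in ComputingBolt, reduced to a minimal sketch: name every column you need in the SELECT instead of using *, and pass the WHERE parameter through the Column list. Here 66 is an arbitrary example user id and jdbcClient is the client built in prepare().
//Sketch: explicit column list plus a bound parameter, matching the working code above.
List<Column> params = new ArrayList<>();
params.add(new Column("user_id", 66, Types.INTEGER));
List<List<Column>> rows = jdbcClient.select(
        "SELECT user_id, amount FROM test WHERE user_id = ?", params);
for (List<Column> columns : rows) {
    for (Column column : columns) {
        if ("amount".equalsIgnoreCase(column.getColumnName())) {
            System.out.println("amount = " + column.getVal());
        }
    }
}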