简介
本文通过问题入手,介绍下RocketMQ的消息发送逻辑是怎么样的。消息发送的大体逻辑图如下:
问题
首先我们来思考几个问题:如果我们要实现一个消息发送的客户端,那么需要解决哪些问题?
- 消息发送如何保证负载均衡
- 如何保证高可用,当broker节点挂掉的时候
初始化
如下是一个常见的消息客户端的使用demo,接下来让我们看看在初始化的时候它做了哪些事情
DefaultMQProducer producer = new DefaultMQProducer("testGroup");
// Only the name server address is configured here; topic and broker
// information is resolved through the name server.
producer.setNamesrvAddr("127.0.0.1:9876");
producer.start();
// One shared generator instead of allocating a new Random for every message.
Random random = new Random();
try {
    for (int i = 0; i < 100000000; i++) {
        try {
            Message msg = new Message("testtrace" /* Topic */,
                "TagA" /* Tag */,
                String.valueOf(System.currentTimeMillis()).getBytes(RemotingHelper.DEFAULT_CHARSET) /* Message body */
            );
            // nextInt(18) already yields a value in [0, 17], so no Math.abs is
            // needed; the resulting delay level is in [1, 18].
            msg.setDelayTimeLevel(random.nextInt(18) + 1);
            producer.send(msg);
        } catch (Exception e) {
            e.printStackTrace();
            // Back off briefly before the next attempt after a failed send.
            Thread.sleep(1000);
        }
    }
} finally {
    // Release the producer even if the loop is aborted (e.g. the sleep is interrupted).
    producer.shutdown();
}
DefaultMQProducerImpl#start
/**
 * Starts this producer implementation.
 *
 * <p>Only a producer in the {@code CREATE_JUST} state can be started; in the
 * {@code RUNNING}, {@code START_FAILED} and {@code SHUTDOWN_ALREADY} states an
 * {@link MQClientException} is thrown. After the state handling, a heartbeat
 * is sent to all brokers.
 *
 * @param startFactory whether the underlying client factory should be started as well
 * @throws MQClientException if the state does not allow starting, or the
 *         producer group is already registered on the client instance
 */
public void start(final boolean startFactory) throws MQClientException {
    switch (this.serviceState) {
        case RUNNING:
        case START_FAILED:
        case SHUTDOWN_ALREADY:
            throw new MQClientException("The producer service state not OK, maybe started once, "
                + this.serviceState
                + FAQUrl.suggestTodo(FAQUrl.CLIENT_SERVICE_NOT_OK),
                null);
        case CREATE_JUST:
            // Pessimistically mark the start as failed; it is flipped to RUNNING at the end.
            this.serviceState = ServiceState.START_FAILED;

            // Validate the configuration (e.g. the producer group name).
            this.checkConfig();

            // For every group except the client-internal one, switch the instance name to the PID.
            final boolean innerGroup =
                this.defaultMQProducer.getProducerGroup().equals(MixAll.CLIENT_INNER_PRODUCER_GROUP);
            if (!innerGroup) {
                this.defaultMQProducer.changeInstanceNameToPID();
            }

            // Obtain (or create) the client instance that talks to the name server and the brokers.
            this.mQClientFactory =
                MQClientManager.getInstance().getAndCreateMQClientInstance(this.defaultMQProducer, rpcHook);

            // Register this producer under its group; registration fails when
            // the group has already been registered on that client instance.
            final boolean registered =
                mQClientFactory.registerProducer(this.defaultMQProducer.getProducerGroup(), this);
            if (!registered) {
                this.serviceState = ServiceState.CREATE_JUST;
                final String reason = "The producer group[" + this.defaultMQProducer.getProducerGroup()
                    + "] has been created before, specify another name please."
                    + FAQUrl.suggestTodo(FAQUrl.GROUP_NAME_DUPLICATE_URL);
                throw new MQClientException(reason, null);
            }

            this.topicPublishInfoTable.put(this.defaultMQProducer.getCreateTopicKey(), new TopicPublishInfo());

            // Starting the factory presumably sets up its scheduled tasks and
            // networking components (not shown in this snippet).
            if (startFactory) {
                mQClientFactory.start();
            }

            log.info("the producer [{}] start OK. sendMessageWithVIPChannel={}",
                this.defaultMQProducer.getProducerGroup(),
                this.defaultMQProducer.isSendMessageWithVIPChannel());
            this.serviceState = ServiceState.RUNNING;
            break;
        default:
            break;
    }

    // Send a heartbeat to all brokers.
    this.mQClientFactory.sendHeartbeatToAllBrokerWithLock();
}
消息发送
DefaultMQProducerImpl#sendDefaultImpl
/**
 * Core send routine: validates the message, looks up the topic's publish
 * info, then tries to send the message, selecting a queue for every attempt.
 *
 * <p>SYNC mode makes up to {@code 1 + getRetryTimesWhenSendFailed()} attempts;
 * the other modes make a single attempt and return {@code null} once the
 * kernel send call returns.
 *
 * @param msg               the message to send
 * @param communicationMode SYNC, ASYNC or ONEWAY
 * @param sendCallback      callback handed to the kernel send call
 * @param timeout           overall time budget in milliseconds, shared by all attempts
 * @return the send result for SYNC mode, {@code null} for ASYNC and ONEWAY
 * @throws MQClientException if every attempt failed, no name server address
 *         is configured, or no route info exists for the topic
 * @throws RemotingException includes RemotingTooMuchRequestException, thrown
 *         when the time budget was exhausted before an attempt
 * @throws MQBrokerException for broker response codes that are not retried
 * @throws InterruptedException if the sending thread is interrupted
 */
private SendResult sendDefaultImpl(
Message msg,
final CommunicationMode communicationMode,
final SendCallback sendCallback,
final long timeout
) throws MQClientException, RemotingException, MQBrokerException, InterruptedException {
this.makeSureStateOK();
// Validate the message.
Validators.checkMessage(msg, this.defaultMQProducer);
final long invokeID = random.nextLong();
long beginTimestampFirst = System.currentTimeMillis();
long beginTimestampPrev = beginTimestampFirst;
long endTimestamp = beginTimestampFirst;
// Look up the publish (route) info for the topic — presumably fetched from
// the name server when not cached locally; see tryToFindTopicPublishInfo.
TopicPublishInfo topicPublishInfo = this.tryToFindTopicPublishInfo(msg.getTopic());
if (topicPublishInfo != null && topicPublishInfo.ok()) {
boolean callTimeout = false;
MessageQueue mq = null;
Exception exception = null;
SendResult sendResult = null;
// SYNC: one initial attempt plus getRetryTimesWhenSendFailed() retries;
// ASYNC/ONEWAY: a single attempt in this loop.
int timesTotal = communicationMode == CommunicationMode.SYNC ? 1 + this.defaultMQProducer.getRetryTimesWhenSendFailed() : 1;
int times = 0;
String[] brokersSent = new String[timesTotal];
for (; times < timesTotal; times++) {
String lastBrokerName = null == mq ? null : mq.getBrokerName();
// Pick the queue to send to; lastBrokerName is the broker used by the
// previous attempt (null on the first attempt).
MessageQueue mqSelected = this.selectOneMessageQueue(topicPublishInfo, lastBrokerName);
if (mqSelected != null) {
mq = mqSelected;
brokersSent[times] = mq.getBrokerName();
try {
beginTimestampPrev = System.currentTimeMillis();
if (times > 0) {
//Reset topic with namespace during resend.
msg.setTopic(this.defaultMQProducer.withNamespace(msg.getTopic()));
}
long costTime = beginTimestampPrev - beginTimestampFirst;
// The time budget is already used up before this attempt: stop retrying.
if (timeout < costTime) {
callTimeout = true;
break;
}
// Send the message to the selected queue with the remaining time budget.
sendResult = this.sendKernelImpl(msg, mq, communicationMode, sendCallback, topicPublishInfo, timeout - costTime);
endTimestamp = System.currentTimeMillis();
// Report this attempt's elapsed time for the broker (third argument is
// false on this success path); used by the fault-tolerance strategy.
this.updateFaultItem(mq.getBrokerName(), endTimestamp - beginTimestampPrev, false);
switch (communicationMode) {
case ASYNC:
return null;
case ONEWAY:
return null;
case SYNC:
// A non-OK status is retried only when retryAnotherBrokerWhenNotStoreOK is enabled.
if (sendResult.getSendStatus() != SendStatus.SEND_OK) {
if (this.defaultMQProducer.isRetryAnotherBrokerWhenNotStoreOK()) {
continue;
}
}
return sendResult;
default:
break;
}
} catch (RemotingException e) {
endTimestamp = System.currentTimeMillis();
// Report the failed attempt for the broker (third argument is true on this failure path).
this.updateFaultItem(mq.getBrokerName(), endTimestamp - beginTimestampPrev, true);
log.warn(String.format("sendKernelImpl exception, resend at once, InvokeID: %s, RT: %sms, Broker: %s", invokeID, endTimestamp - beginTimestampPrev, mq), e);
log.warn(msg.toString());
exception = e;
continue;
} catch (MQClientException e) {
...//handling identical to the RemotingException branch above is omitted
continue;
} catch (MQBrokerException e) {
...//handling identical to the RemotingException branch above is omitted
// These broker response codes move on to the next attempt; any other
// code returns an existing result or rethrows.
switch (e.getResponseCode()) {
case ResponseCode.TOPIC_NOT_EXIST:
case ResponseCode.SERVICE_NOT_AVAILABLE:
case ResponseCode.SYSTEM_ERROR:
case ResponseCode.NO_PERMISSION:
case ResponseCode.NO_BUYER_ID:
case ResponseCode.NOT_IN_CURRENT_UNIT:
continue;
default:
if (sendResult != null) {
return sendResult;
}
throw e;
}
} catch (InterruptedException e) {
...//handling identical to the RemotingException branch above is omitted
throw e;
}
} else {
break;
}
}
// Reached when the loop ended without returning from inside it.
if (sendResult != null) {
return sendResult;
}
String info = String.format("Send [%d] times, still failed, cost [%d]ms, Topic: %s, BrokersSent: %s",
times,
System.currentTimeMillis() - beginTimestampFirst,
msg.getTopic(),
Arrays.toString(brokersSent));
info += FAQUrl.suggestTodo(FAQUrl.SEND_MSG_FAILED);
MQClientException mqClientException = new MQClientException(info, exception);
// An exhausted time budget takes precedence over the aggregated exception.
if (callTimeout) {
throw new RemotingTooMuchRequestException("sendDefaultImpl call timeout");
}
// Map the last recorded exception to a client-side response code.
if (exception instanceof MQBrokerException) {
mqClientException.setResponseCode(((MQBrokerException) exception).getResponseCode());
} else if (exception instanceof RemotingConnectException) {
mqClientException.setResponseCode(ClientErrorCode.CONNECT_BROKER_EXCEPTION);
} else if (exception instanceof RemotingTimeoutException) {
mqClientException.setResponseCode(ClientErrorCode.ACCESS_BROKER_TIMEOUT);
} else if (exception instanceof MQClientException) {
mqClientException.setResponseCode(ClientErrorCode.BROKER_NOT_EXIST_EXCEPTION);
}
throw mqClientException;
}
// Reached only when no usable publish info was found for the topic.
// NOTE(review): raw List — the element type was probably lost when the snippet was extracted.
List nsList = this.getmQClientFactory().getMQClientAPIImpl().getNameServerAddressList();
if (null == nsList || nsList.isEmpty()) {
throw new MQClientException(
"No name server address, please set it." + FAQUrl.suggestTodo(FAQUrl.NAME_SERVER_ADDR_NOT_EXIST_URL), null).setResponseCode(ClientErrorCode.NO_NAME_SERVER_EXCEPTION);
}
throw new MQClientException("No route info of this topic, " + msg.getTopic() + FAQUrl.suggestTodo(FAQUrl.NO_TOPIC_ROUTE_INFO),
null).setResponseCode(ClientErrorCode.NOT_FOUND_TOPIC_EXCEPTION);
}
这里需要补充1点:
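当异常是RemotingException和MQClientException时,消息会进行重试;当异常是MQBrokerException且响应码为TOPIC_NOT_EXIST、SERVICE_NOT_AVAILABLE、SYSTEM_ERROR、NO_PERMISSION、NO_BUYER_ID、NOT_IN_CURRENT_UNIT之一时,同样会重试;另外,同步发送返回的状态不是SEND_OK且开启了retryAnotherBrokerWhenNotStoreOK时也会重试。而其他异常会直接抛出去或者返回SendResult到客户端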
负载均衡
TopicPublishInfo#selectOneMessageQueue
/**
 * Selects a queue while trying to avoid the broker used by the previous attempt.
 *
 * @param lastBrokerName broker of the previous send attempt; {@code null} on
 *        the first attempt, non-null only when retrying after a failure
 * @return a queue on a different broker when one exists, otherwise the next
 *         queue in plain rotation
 */
public MessageQueue selectOneMessageQueue(final String lastBrokerName) {
    // First attempt: nothing to avoid, use plain rotation.
    if (lastBrokerName == null) {
        return selectOneMessageQueue();
    }

    // Retry: walk the queue list once, starting from the rotating index.
    int cursor = this.sendWhichQueue.getAndIncrement();
    for (int attempt = 0; attempt < this.messageQueueList.size(); attempt++, cursor++) {
        // Math.abs(Integer.MIN_VALUE) stays negative, hence the clamp to 0.
        final int slot = Math.max(Math.abs(cursor) % this.messageQueueList.size(), 0);
        final MessageQueue candidate = this.messageQueueList.get(slot);
        // Picking the broker that just failed would likely fail again, so skip it.
        if (!candidate.getBrokerName().equals(lastBrokerName)) {
            return candidate;
        }
    }

    // Every queue is on the failed broker: fall back to plain rotation.
    return selectOneMessageQueue();
}
/**
 * Returns the next queue in rotation.
 *
 * <p>The position comes from {@code sendWhichQueue}, which is advanced on
 * every call — presumably a thread-local counter with a random starting
 * value (its definition is not shown here).
 *
 * @return the queue at the current rotation position
 */
public MessageQueue selectOneMessageQueue() {
    final int slot = Math.abs(this.sendWhichQueue.getAndIncrement()) % this.messageQueueList.size();
    // Math.abs(Integer.MIN_VALUE) stays negative, so clamp to the first queue.
    return this.messageQueueList.get(slot < 0 ? 0 : slot);
}
简单来说,一般情况下,producer会轮询所有的queue,进行消息发送
Broker故障延迟机制
默认是不开启故障延迟机制的,开启的话,需要设置sendLatencyFaultEnable=true。上面在介绍消息发送时可以看到,每次发送结束后(无论成功还是出现异常)都会调用updateFaultItem方法来设置broker的延迟和可用时间。故障延迟机制就是根据这个时间来进行逻辑处理的。
MQFaultStrategy#selectOneMessageQueue
// Two parallel tables with seven entries each — presumably latency thresholds
// (ms) and the matching not-available durations (ms); the code that reads
// them is not shown in this snippet.
private long[] latencyMax = {50L, 100L, 550L, 1000L, 2000L, 3000L, 15000L};
private long[] notAvailableDuration = {0L, 0L, 30000L, 60000L, 120000L, 180000L, 600000L};
/**
 * Selects the queue for the next send attempt, optionally taking broker
 * availability into account.
 *
 * @param tpInfo         publish info holding the topic's queue list
 * @param lastBrokerName broker used by the previous attempt, or null
 * @return the selected queue
 */
public MessageQueue selectOneMessageQueue(final TopicPublishInfo tpInfo, final String lastBrokerName) {
// Is the latency fault-tolerance feature enabled?
if (this.sendLatencyFaultEnable) {
try {
// Walk the queue list once, starting from the rotating index.
int index = tpInfo.getSendWhichQueue().getAndIncrement();
for (int i = 0; i < tpInfo.getMessageQueueList().size(); i++) {
int pos = Math.abs(index++) % tpInfo.getMessageQueueList().size();
if (pos < 0)
pos = 0;
MessageQueue mq = tpInfo.getMessageQueueList().get(pos);
// Only consider queues whose broker is currently reported as available.
if (latencyFaultTolerance.isAvailable(mq.getBrokerName())) {
// NOTE(review): with a non-null lastBrokerName this only accepts a queue
// on that SAME broker, i.e. the one used by the previous attempt. Later
// upstream versions appear to have dropped this inner condition —
// confirm against the RocketMQ version being described.
if (null == lastBrokerName || mq.getBrokerName().equals(lastBrokerName))
return mq;
}
}
// Nothing was selected above: ask the fault-tolerance component for a
// fallback broker (pickOneAtLeast) and target one of its write queues.
final String notBestBroker = latencyFaultTolerance.pickOneAtLeast();
int writeQueueNums = tpInfo.getQueueIdByBroker(notBestBroker);
if (writeQueueNums > 0) {
final MessageQueue mq = tpInfo.selectOneMessageQueue();
if (notBestBroker != null) {
// NOTE(review): mq is an element of tpInfo's queue list, so these
// setters modify the shared instance in place — confirm this is intended.
mq.setBrokerName(notBestBroker);
mq.setQueueId(tpInfo.getSendWhichQueue().getAndIncrement() % writeQueueNums);
}
return mq;
} else {
// The fallback broker has no write queues for this topic: drop its fault record.
latencyFaultTolerance.remove(notBestBroker);
}
} catch (Exception e) {
log.error("Error occurred when selecting message queue", e);
}
// Fall back to plain rotation.
return tpInfo.selectOneMessageQueue();
}
// Feature disabled: rotation that tries to avoid lastBrokerName.
return tpInfo.selectOneMessageQueue(lastBrokerName);
}
LatencyFaultToleranceImpl#pickOneAtLeast
// Body of pickOneAtLeast(): picks a broker name from the recorded fault
// items, or returns null when none are recorded.
// The generic type parameters are required: with raw Enumeration/List the
// assignment to FaultItem and the getName() calls below do not compile.
final Enumeration<FaultItem> elements = this.faultItemTable.elements();
List<FaultItem> tmpList = new LinkedList<FaultItem>();
while (elements.hasMoreElements()) {
    final FaultItem faultItem = elements.nextElement();
    tmpList.add(faultItem);
}
if (!tmpList.isEmpty()) {
    // Shuffle first, then sort by FaultItem's natural ordering; the pick
    // below is limited to the first half of the sorted list.
    Collections.shuffle(tmpList);
    Collections.sort(tmpList);
    final int half = tmpList.size() / 2;
    if (half <= 0) {
        // Only one item recorded: return it.
        return tmpList.get(0).getName();
    } else {
        // Rotate through the first half on successive calls.
        final int i = this.whichItemWorst.getAndIncrement() % half;
        return tmpList.get(i).getName();
    }
}
return null;
FaultItem类实现了Comparable接口,根据available、latency和startTimestamp进行排序
故障延迟机制主要解决的问题应该是某个Broker单机的failover,或者是某个broker瞬时压力过大,导致接口超时,从而需要路由到别的broker进行消息发送。
总结
现在我们来看看最开始提出的2个问题的答案是什么
- 消息发送如何保证负载均衡
轮询所有的messagequeue进行消息发送
- 如何保证高可用,当broker节点挂掉的时候
- 故障延迟机制
- 定时扫描nameserver,获取最新的broker状态(代码比较简单就没有列出来)