nsqd is what we usually call the broker. Let's take a look at how it is designed. The Main method below is the entry point when nsqd starts:
func (n *NSQD) Main() error {
    ctx := &context{n}

    exitCh := make(chan error)
    var once sync.Once
    exitFunc := func(err error) {
        once.Do(func() {
            if err != nil {
                n.logf(LOG_FATAL, "%s", err)
            }
            exitCh <- err
        })
    }

    tcpServer := &tcpServer{ctx: ctx}
    n.waitGroup.Wrap(func() {
        exitFunc(protocol.TCPServer(n.tcpListener, tcpServer, n.logf))
    })
    httpServer := newHTTPServer(ctx, false, n.getOpts().TLSRequired == TLSRequired)
    n.waitGroup.Wrap(func() {
        exitFunc(http_api.Serve(n.httpListener, httpServer, "HTTP", n.logf))
    })
    if n.tlsConfig != nil && n.getOpts().HTTPSAddress != "" {
        httpsServer := newHTTPServer(ctx, true, true)
        n.waitGroup.Wrap(func() {
            exitFunc(http_api.Serve(n.httpsListener, httpsServer, "HTTPS", n.logf))
        })
    }

    n.waitGroup.Wrap(n.queueScanLoop)
    n.waitGroup.Wrap(n.lookupLoop)
    if n.getOpts().StatsdAddress != "" {
        n.waitGroup.Wrap(n.statsdLoop)
    }

    err := <-exitCh
    return err
}
This method concurrently starts five things, each launched via n.waitGroup.Wrap (sketched below):
1. the TCP server
2. the HTTP server (and optionally HTTPS)
3. the queueScanLoop method
4. the lookupLoop method
5. the statsdLoop method
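waitGroup here is nsq's small WaitGroupWrapper helper: Wrap runs the callback in its own goroutine while tying it into a sync.WaitGroup so that shutdown can wait for everything to finish. A minimal sketch of the idea (matching internal/util's WaitGroupWrapper):

package util

import "sync"

// WaitGroupWrapper embeds a sync.WaitGroup and adds a helper that runs a
// callback in its own goroutine while tracking it in the WaitGroup.
type WaitGroupWrapper struct {
    sync.WaitGroup
}

// Wrap launches cb in a goroutine; WaitGroup.Wait then waits for all
// wrapped callbacks to return.
func (w *WaitGroupWrapper) Wrap(cb func()) {
    w.Add(1)
    go func() {
        cb()
        w.Done()
    }()
}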
These are what we will focus on below. Let's start with lookupLoop. The method is fairly long, so we'll read it in pieces.
func (n *NSQD) lookupLoop() {
    var lookupPeers []*lookupPeer
    var lookupAddrs []string
    connect := true

    hostname, err := os.Hostname()
    if err != nil {
        n.logf(LOG_FATAL, "failed to get hostname - %s", err)
        os.Exit(1)
    }

    ticker := time.Tick(15 * time.Second)
    ...
}
First, two slices are created: one to hold the connections to the nsqlookupd instances and one to hold their addresses. Then the local hostname is fetched, and a ticker that fires every 15s is created.
for {
    if connect {
        for _, host := range n.getOpts().NSQLookupdTCPAddresses {
            if in(host, lookupAddrs) {
                continue
            }
            n.logf(LOG_INFO, "LOOKUP(%s): adding peer", host)
            lookupPeer := newLookupPeer(host, n.getOpts().MaxBodySize, n.logf,
                connectCallback(n, hostname))
            lookupPeer.Command(nil) // start the connection
            lookupPeers = append(lookupPeers, lookupPeer)
            lookupAddrs = append(lookupAddrs, host)
        }
        n.lookupPeers.Store(lookupPeers)
        connect = false
    }
    ...
}
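The in call used here is just a linear membership check over a string slice; for completeness, it amounts to this:

// in reports whether s is already present in lst.
func in(s string, lst []string) bool {
    for _, v := range lst {
        if s == v {
            return true
        }
    }
    return false
}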
What follows is an infinite loop. Since we passed in the TCP addresses of the NSQLookupd instances at startup, the first pass iterates over those addresses. An address that has already been handled is present in lookupAddrs and is skipped; otherwise a lookupPeer is created, the connection is started via lookupPeer.Command(nil), and the peer and its address are recorded. Notice that a callback is passed in when the lookupPeer is created; let's see what it does.
func connectCallback(n *NSQD, hostname string) func(*lookupPeer) {
    return func(lp *lookupPeer) {
        ci := make(map[string]interface{})
        ci["version"] = version.Binary
        ci["tcp_port"] = n.RealTCPAddr().Port
        ci["http_port"] = n.RealHTTPAddr().Port
        ci["hostname"] = hostname
        ci["broadcast_address"] = n.getOpts().BroadcastAddress

        cmd, err := nsq.Identify(ci)
        if err != nil {
            lp.Close()
            return
        }
        resp, err := lp.Command(cmd)
        if err != nil {
            n.logf(LOG_ERROR, "LOOKUPD(%s): %s - %s", lp, cmd, err)
            return
        } else if bytes.Equal(resp, []byte("E_INVALID")) {
            n.logf(LOG_INFO, "LOOKUPD(%s): lookupd returned %s", lp, resp)
            lp.Close()
            return
        } else {
            err = json.Unmarshal(resp, &lp.Info)
            if err != nil {
                n.logf(LOG_ERROR, "LOOKUPD(%s): parsing response - %s", lp, resp)
                lp.Close()
                return
            } else {
                n.logf(LOG_INFO, "LOOKUPD(%s): peer info %+v", lp, lp.Info)
                if lp.Info.BroadcastAddress == "" {
                    n.logf(LOG_ERROR, "LOOKUPD(%s): no broadcast address", lp)
                }
            }
        }

        // build all the commands first so we exit the lock(s) as fast as possible
        var commands []*nsq.Command
        n.RLock()
        for _, topic := range n.topicMap {
            topic.RLock()
            if len(topic.channelMap) == 0 {
                commands = append(commands, nsq.Register(topic.name, ""))
            } else {
                for _, channel := range topic.channelMap {
                    commands = append(commands, nsq.Register(channel.topicName, channel.name))
                }
            }
            topic.RUnlock()
        }
        n.RUnlock()

        for _, cmd := range commands {
            n.logf(LOG_INFO, "LOOKUPD(%s): %s", lp, cmd)
            _, err := lp.Command(cmd)
            if err != nil {
                n.logf(LOG_ERROR, "LOOKUPD(%s): %s - %s", lp, cmd, err)
                return
            }
        }
    }
}
The current nsqd's info (version, TCP/HTTP ports, hostname, broadcast address) is put into a map, which is used to build an IDENTIFY command. That command is sent to nsqlookupd, and the response, which contains the nsqlookupd's own info, is unmarshaled into the lookupPeer's Info field. The callback then iterates over its own topicMap, builds a REGISTER command for each topic (or for each channel within a topic), collects them in commands, and finally sends them one by one.
To summarize, this callback has two jobs:
1. send this nsqd's info to nsqlookupd and store the nsqlookupd info returned in the response
2. register all of this nsqd's topics and channels with nsqlookupd
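For a feel of what nsq.Identify actually puts on the wire: the lookupd TCP protocol frames a command as the command line, a 4-byte big-endian body length, and a JSON body (the connection itself opens with a version magic). A hand-rolled sketch of that frame, with placeholder values; nsq.Identify builds the equivalent from the ci map shown above:

package main

import (
    "bytes"
    "encoding/binary"
    "encoding/json"
    "fmt"
)

func main() {
    // Sketch of the IDENTIFY frame sent to nsqlookupd; values are placeholders.
    body, _ := json.Marshal(map[string]interface{}{
        "version":           "1.2.0", // placeholder version string
        "tcp_port":          4150,
        "http_port":         4151,
        "hostname":          "myhost",
        "broadcast_address": "myhost",
    })
    var buf bytes.Buffer
    buf.WriteString("IDENTIFY\n")                          // command line
    binary.Write(&buf, binary.BigEndian, int32(len(body))) // 4-byte big-endian body size
    buf.Write(body)                                        // JSON body
    fmt.Printf("% x\n", buf.Bytes())
}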
Back in lookupLoop: lookupPeer.Command(nil) establishes the connection, and once the connection is up, the callback above fires. The last part of lookupLoop is a select:
select {
case <-ticker:
    // send a heartbeat and read a response (read detects closed conns)
    for _, lookupPeer := range lookupPeers {
        n.logf(LOG_DEBUG, "LOOKUPD(%s): sending heartbeat", lookupPeer)
        cmd := nsq.Ping()
        _, err := lookupPeer.Command(cmd)
        if err != nil {
            n.logf(LOG_ERROR, "LOOKUPD(%s): %s - %s", lookupPeer, cmd, err)
        }
    }
case val := <-n.notifyChan:
    var cmd *nsq.Command
    var branch string

    switch val.(type) {
    case *Channel:
        // notify all nsqlookupds that a new channel exists, or that it's removed
        branch = "channel"
        channel := val.(*Channel)
        if channel.Exiting() == true {
            cmd = nsq.UnRegister(channel.topicName, channel.name)
        } else {
            cmd = nsq.Register(channel.topicName, channel.name)
        }
    case *Topic:
        // notify all nsqlookupds that a new topic exists, or that it's removed
        branch = "topic"
        topic := val.(*Topic)
        if topic.Exiting() == true {
            cmd = nsq.UnRegister(topic.name, "")
        } else {
            cmd = nsq.Register(topic.name, "")
        }
    }

    for _, lookupPeer := range lookupPeers {
        n.logf(LOG_INFO, "LOOKUPD(%s): %s %s", lookupPeer, branch, cmd)
        _, err := lookupPeer.Command(cmd)
        if err != nil {
            n.logf(LOG_ERROR, "LOOKUPD(%s): %s - %s", lookupPeer, cmd, err)
        }
    }
case <-n.optsNotificationChan:
    var tmpPeers []*lookupPeer
    var tmpAddrs []string
    for _, lp := range lookupPeers {
        if in(lp.addr, n.getOpts().NSQLookupdTCPAddresses) {
            tmpPeers = append(tmpPeers, lp)
            tmpAddrs = append(tmpAddrs, lp.addr)
            continue
        }
        n.logf(LOG_INFO, "LOOKUP(%s): removing peer", lp)
        lp.Close()
    }
    lookupPeers = tmpPeers
    lookupAddrs = tmpAddrs
    connect = true
case <-n.exitChan:
    goto exit
}
The first case is the ticker, which fires every 15s and sends a heartbeat (PING) to every nsqlookupd; the read on the response also detects closed connections.
notifyChan carries topic and channel information: whenever a topic or channel is created or removed, a value is put on notifyChan, and a REGISTER or UNREGISTER command is built based on the value's type and sent to all nsqlookupds.
optsNotificationChan carries changes to the nsqlookupd address list: peers no longer in the configuration are closed, and connect is set back to true so that new addresses get connected on the next pass.
exitChan carries the exit signal.
To sum up, lookupLoop's main responsibilities are:
1. connect to all nsqlookupd instances and register topics
2. send a heartbeat to every nsqlookupd every 15s
3. send REGISTER or UNREGISTER commands to nsqlookupd when topics or channels change
4. watch for changes to the nsqlookupd address list
5. watch for the exit signal
Next, let's look at queueScanLoop:
func (n *NSQD) queueScanLoop() {
    workCh := make(chan *Channel, n.getOpts().QueueScanSelectionCount)
    responseCh := make(chan bool, n.getOpts().QueueScanSelectionCount)
    closeCh := make(chan int)

    workTicker := time.NewTicker(n.getOpts().QueueScanInterval)
    refreshTicker := time.NewTicker(n.getOpts().QueueScanRefreshInterval)

    channels := n.channels()
    n.resizePool(len(channels), workCh, responseCh, closeCh)

    for {
        select {
        case <-workTicker.C:
            if len(channels) == 0 {
                continue
            }
        case <-refreshTicker.C:
            channels = n.channels()
            n.resizePool(len(channels), workCh, responseCh, closeCh)
            continue
        case <-n.exitChan:
            goto exit
        }

        num := n.getOpts().QueueScanSelectionCount
        if num > len(channels) {
            num = len(channels)
        }

    loop:
        for _, i := range util.UniqRands(num, len(channels)) {
            workCh <- channels[i]
        }

        numDirty := 0
        for i := 0; i < num; i++ {
            if <-responseCh {
                numDirty++
            }
        }

        if float64(numDirty)/float64(num) > n.getOpts().QueueScanDirtyPercent {
            goto loop
        }
    }

exit:
    n.logf(LOG_INFO, "QUEUESCAN: closing")
    close(closeCh)
    workTicker.Stop()
    refreshTicker.Stop()
}
workCh holds the channels the workers should process; responseCh carries the workers' dirty/clean results; closeCh carries the shutdown signal. Each time workTicker fires, the loop proceeds only if there are channels to scan; each time refreshTicker fires, the channel list is refreshed and resizePool is called (analyzed in a moment). Then num channels are picked at random via util.UniqRands and pushed into workCh, and the number of dirty channels is read back from responseCh; if the dirty ratio exceeds QueueScanDirtyPercent, the loop jumps back and scans again without waiting for the next tick.
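util.UniqRands picks num distinct random indices out of len(channels). A sketch of the idea, a partial Fisher-Yates shuffle (not necessarily nsq's exact implementation; assumes math/rand is imported):

// uniqRands returns n distinct random indices in [0, max).
func uniqRands(n, max int) []int {
    if n > max {
        n = max
    }
    idx := make([]int, max)
    for i := range idx {
        idx[i] = i
    }
    for i := 0; i < n; i++ {
        j := i + rand.Intn(max-i) // pick from the not-yet-chosen tail
        idx[i], idx[j] = idx[j], idx[i]
    }
    return idx[:n]
}

With that out of the way, here is resizePool: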
func (n *NSQD) resizePool(num int, workCh chan *Channel, responseCh chan bool, closeCh chan int) {
    idealPoolSize := int(float64(num) * 0.25)
    if idealPoolSize < 1 {
        idealPoolSize = 1
    } else if idealPoolSize > n.getOpts().QueueScanWorkerPoolMax {
        idealPoolSize = n.getOpts().QueueScanWorkerPoolMax
    }
    for {
        if idealPoolSize == n.poolSize {
            break
        } else if idealPoolSize < n.poolSize {
            // contract
            closeCh <- 1
            n.poolSize--
        } else {
            // expand
            n.waitGroup.Wrap(func() {
                n.queueScanWorker(workCh, responseCh, closeCh)
            })
            n.poolSize++
        }
    }
}
resizePool adjusts the number of workers. The ideal count, idealPoolSize, is 25% of the channel count, clamped to at least 1 and at most QueueScanWorkerPoolMax; for example, with 100 channels and a pool max of 4, idealPoolSize is min(25, 4) = 4. poolSize is the current number of workers in the pool: if it is below idealPoolSize, new workers are started via queueScanWorker; if it is above, a value is written to closeCh, and the worker that receives it shuts itself down.
func (n *NSQD) queueScanWorker(workCh chan *Channel, responseCh chan bool, closeCh chan int) {
    for {
        select {
        case c := <-workCh:
            now := time.Now().UnixNano()
            dirty := false
            if c.processInFlightQueue(now) {
                dirty = true
            }
            if c.processDeferredQueue(now) {
                dirty = true
            }
            responseCh <- dirty
        case <-closeCh:
            return
        }
    }
}
A worker keeps pulling channels to process from workCh until it receives the close signal from closeCh. For each channel it runs processInFlightQueue (and processDeferredQueue) and reports whether the channel was dirty:
func (c *Channel) processInFlightQueue(t int64) bool {
    c.exitMutex.RLock()
    defer c.exitMutex.RUnlock()

    if c.Exiting() {
        return false
    }

    dirty := false
    for {
        c.inFlightMutex.Lock()
        msg, _ := c.inFlightPQ.PeekAndShift(t)
        c.inFlightMutex.Unlock()

        if msg == nil {
            goto exit
        }
        dirty = true

        _, err := c.popInFlightMessage(msg.clientID, msg.ID)
        if err != nil {
            goto exit
        }
        atomic.AddUint64(&c.timeoutCount, 1)
        c.RLock()
        client, ok := c.clients[msg.clientID]
        c.RUnlock()
        if ok {
            client.TimedOutMessage()
        }
        c.put(msg)
    }

exit:
    return dirty
}
We pass the current time into this method. The for loop pops messages off the channel's in-flight priority queue whose timeout deadline is at or before the time we passed in, i.e. messages that have timed out; if there is no such message, PeekAndShift returns nil and we exit. When a timed-out message is found, dirty is set to true, the message is removed from inFlightMessages, the timeout counter is bumped, the owning client is notified that its message timed out, and finally the message is re-queued via c.put(msg) onto the channel's memoryMsgChan (or to disk). So far we have mainly seen messages move from one collection to another; the purpose will become clear as we go.
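PeekAndShift is the heart of this loop: it looks at the root of a min-heap (the message with the earliest deadline) and pops it only if that deadline is at or before the time we passed in. A self-contained sketch of its semantics, using container/heap and a stand-in Message type (not a verbatim copy of nsqd's in_flight_pqueue):

package main

import (
    "container/heap"
    "fmt"
)

// Message is a stand-in for nsqd's Message; pri is the timeout deadline (ns).
type Message struct {
    ID  string
    pri int64
}

// inFlightPqueue is a min-heap of messages keyed by pri.
type inFlightPqueue []*Message

func (pq inFlightPqueue) Len() int            { return len(pq) }
func (pq inFlightPqueue) Less(i, j int) bool  { return pq[i].pri < pq[j].pri }
func (pq inFlightPqueue) Swap(i, j int)       { pq[i], pq[j] = pq[j], pq[i] }
func (pq *inFlightPqueue) Push(x interface{}) { *pq = append(*pq, x.(*Message)) }
func (pq *inFlightPqueue) Pop() interface{} {
    old := *pq
    m := old[len(old)-1]
    *pq = old[:len(old)-1]
    return m
}

// PeekAndShift pops and returns the earliest-deadline message if that
// deadline is <= max; otherwise it returns nil plus the time until it is due.
func (pq *inFlightPqueue) PeekAndShift(max int64) (*Message, int64) {
    if pq.Len() == 0 {
        return nil, 0
    }
    msg := (*pq)[0] // heap root: smallest pri
    if msg.pri > max {
        return nil, msg.pri - max // not due yet
    }
    heap.Pop(pq)
    return msg, 0
}

func main() {
    pq := &inFlightPqueue{}
    heap.Push(pq, &Message{ID: "a", pri: 100})
    heap.Push(pq, &Message{ID: "b", pri: 50})
    if msg, _ := pq.PeekAndShift(60); msg != nil {
        fmt.Println("timed out:", msg.ID) // "b" is due at t=60, "a" is not
    }
}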
Moving on to the TCP side: what we care about there is the Handle function:
func (p *tcpServer) Handle(clientConn net.Conn) {
    p.ctx.nsqd.logf(LOG_INFO, "TCP: new client(%s)", clientConn.RemoteAddr())

    // The client should initialize itself by sending a 4 byte sequence indicating
    // the version of the protocol that it intends to communicate, this will allow us
    // to gracefully upgrade the protocol away from text/line oriented to whatever...
    buf := make([]byte, 4)
    _, err := io.ReadFull(clientConn, buf)
    if err != nil {
        p.ctx.nsqd.logf(LOG_ERROR, "failed to read protocol version - %s", err)
        return
    }
    protocolMagic := string(buf)

    p.ctx.nsqd.logf(LOG_INFO, "CLIENT(%s): desired protocol magic '%s'",
        clientConn.RemoteAddr(), protocolMagic)

    var prot protocol.Protocol
    switch protocolMagic {
    case "  V2":
        prot = &protocolV2{ctx: p.ctx}
    default:
        protocol.SendFramedResponse(clientConn, frameTypeError, []byte("E_BAD_PROTOCOL"))
        clientConn.Close()
        p.ctx.nsqd.logf(LOG_ERROR, "client(%s) bad protocol magic '%s'",
            clientConn.RemoteAddr(), protocolMagic)
        return
    }

    err = prot.IOLoop(clientConn)
    if err != nil {
        p.ctx.nsqd.logf(LOG_ERROR, "client(%s) - %s", clientConn.RemoteAddr(), err)
        return
    }
}
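So a client opens its session by writing the 4-byte magic "  V2" (two spaces, then V2) before sending any commands. A minimal sketch of that handshake, assuming a local nsqd on the default TCP port 4150:

package main

import (
    "log"
    "net"
)

func main() {
    // Dial a local nsqd; 127.0.0.1:4150 is the default TCP address
    // and just an assumption here.
    conn, err := net.Dial("tcp", "127.0.0.1:4150")
    if err != nil {
        log.Fatal(err)
    }
    defer conn.Close()

    // The 4-byte protocol magic that Handle reads above.
    if _, err := conn.Write([]byte("  V2")); err != nil {
        log.Fatal(err)
    }
    // From here on, the connection speaks the V2 protocol
    // (line-oriented commands such as SUB / PUB, handled by IOLoop).
}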
Handle reads the 4-byte version magic the client sends and creates a matching protocol instance; only V2 is supported here. It then calls IOLoop, which services the client's connection in a loop:
func (p *protocolV2) IOLoop(conn net.Conn) error {
    var err error
    var line []byte
    var zeroTime time.Time

    clientID := atomic.AddInt64(&p.ctx.nsqd.clientIDSequence, 1)
    client := newClientV2(clientID, conn, p.ctx)
    p.ctx.nsqd.AddClient(client.ID, client)

    // synchronize the startup of messagePump in order
    // to guarantee that it gets a chance to initialize
    // goroutine local state derived from client attributes
    // and avoid a potential race with IDENTIFY (where a client
    // could have changed or disabled said attributes)
    messagePumpStartedChan := make(chan bool)
    go p.messagePump(client, messagePumpStartedChan)
    <-messagePumpStartedChan

    for {
        if client.HeartbeatInterval > 0 {
            client.SetReadDeadline(time.Now().Add(client.HeartbeatInterval * 2))
        } else {
            client.SetReadDeadline(zeroTime)
        }

        // ReadSlice does not allocate new space for the data each request
        // ie. the returned slice is only valid until the next call to it
        line, err = client.Reader.ReadSlice('\n')
        if err != nil {
            if err == io.EOF {
                err = nil
            } else {
                err = fmt.Errorf("failed to read command - %s", err)
            }
            break
        }

        // trim the '\n'
        line = line[:len(line)-1]
        // optionally trim the '\r'
        if len(line) > 0 && line[len(line)-1] == '\r' {
            line = line[:len(line)-1]
        }
        params := bytes.Split(line, separatorBytes)

        p.ctx.nsqd.logf(LOG_DEBUG, "PROTOCOL(V2): [%s] %s", client, params)

        var response []byte
        response, err = p.Exec(client, params)
        if err != nil {
            ctx := ""
            if parentErr := err.(protocol.ChildErr).Parent(); parentErr != nil {
                ctx = " - " + parentErr.Error()
            }
            p.ctx.nsqd.logf(LOG_ERROR, "[%s] - %s%s", client, err, ctx)

            sendErr := p.Send(client, frameTypeError, []byte(err.Error()))
            if sendErr != nil {
                p.ctx.nsqd.logf(LOG_ERROR, "[%s] - %s%s", client, sendErr, ctx)
                break
            }

            // errors of type FatalClientErr should forceably close the connection
            if _, ok := err.(*protocol.FatalClientErr); ok {
                break
            }
            continue
        }

        if response != nil {
            err = p.Send(client, frameTypeResponse, response)
            if err != nil {
                err = fmt.Errorf("failed to send response - %s", err)
                break
            }
        }
    }

    p.ctx.nsqd.logf(LOG_INFO, "PROTOCOL(V2): [%s] exiting ioloop", client)
    conn.Close()
    close(client.ExitChan)
    if client.Channel != nil {
        client.Channel.RemoveClient(client.ID)
    }
    p.ctx.nsqd.RemoveClient(client.ID)

    return err
}
A new ID is generated for the freshly connected client, which is wrapped into a clientV2 and added to nsqd's client map. Note the trick that follows: a messagePumpStartedChan is created, passed to messagePump, and then IOLoop blocks receiving from it. Looking inside messagePump, nothing is ever sent on this channel; instead it is closed once the pump has initialized its local state, which unblocks the receive. This guarantees messagePump gets to initialize goroutine-local state derived from the client's attributes before IOLoop starts processing commands, avoiding a race with IDENTIFY (which can change those attributes). IOLoop's loop then reads a command from the client, parses it, executes it via Exec, and writes the response back. On the other side, messagePump fetches ready messages from memory and disk and sends them to the client.
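This close-to-signal handshake is a common Go pattern worth isolating; a minimal sketch of just the synchronization:

package main

import "fmt"

func main() {
    started := make(chan bool)
    go func() {
        // ... initialize goroutine-local state here ...
        close(started) // closing the channel broadcasts "ready"
        // ... main loop would run here ...
    }()
    <-started // unblocks as soon as the goroutine closes the channel
    fmt.Println("pump initialized; safe to process commands")
}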
Now suppose the client that connected is a consumer. After connecting, it sends a SUB command; let's see how that's handled:
func (p *protocolV2) SUB(client *clientV2, params [][]byte) ([]byte, error) {
    if atomic.LoadInt32(&client.State) != stateInit {
        return nil, protocol.NewFatalClientErr(nil, "E_INVALID", "cannot SUB in current state")
    }

    if client.HeartbeatInterval <= 0 {
        return nil, protocol.NewFatalClientErr(nil, "E_INVALID", "cannot SUB with heartbeats disabled")
    }

    if len(params) < 3 {
        return nil, protocol.NewFatalClientErr(nil, "E_INVALID", "SUB insufficient number of parameters")
    }

    topicName := string(params[1])
    if !protocol.IsValidTopicName(topicName) {
        return nil, protocol.NewFatalClientErr(nil, "E_BAD_TOPIC",
            fmt.Sprintf("SUB topic name %q is not valid", topicName))
    }

    channelName := string(params[2])
    if !protocol.IsValidChannelName(channelName) {
        return nil, protocol.NewFatalClientErr(nil, "E_BAD_CHANNEL",
            fmt.Sprintf("SUB channel name %q is not valid", channelName))
    }

    if err := p.CheckAuth(client, "SUB", topicName, channelName); err != nil {
        return nil, err
    }

    var channel *Channel
    for {
        topic := p.ctx.nsqd.GetTopic(topicName)
        channel = topic.GetChannel(channelName)
        if err := channel.AddClient(client.ID, client); err != nil {
            return nil, protocol.NewFatalClientErr(nil, "E_TOO_MANY_CHANNEL_CONSUMERS",
                fmt.Sprintf("channel consumers for %s:%s exceeds limit of %d",
                    topicName, channelName, p.ctx.nsqd.getOpts().MaxChannelConsumers))
        }

        if (channel.ephemeral && channel.Exiting()) || (topic.ephemeral && topic.Exiting()) {
            channel.RemoveClient(client.ID)
            time.Sleep(1 * time.Millisecond)
            continue
        }
        break
    }
    atomic.StoreInt32(&client.State, stateSubscribed)
    client.Channel = channel
    // update message pump
    client.SubEventChan <- channel

    return okBytes, nil
}
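For context, this handler is what a consumer reaches when it subscribes. With the official go-nsq client (github.com/nsqio/go-nsq), a subscription that ends up here looks roughly like this; the topic, channel, and address are placeholders:

package main

import (
    "log"

    "github.com/nsqio/go-nsq"
)

func main() {
    cfg := nsq.NewConfig()
    // NewConsumer issues SUB mytopic mychannel after connecting.
    consumer, err := nsq.NewConsumer("mytopic", "mychannel", cfg)
    if err != nil {
        log.Fatal(err)
    }
    consumer.AddHandler(nsq.HandlerFunc(func(m *nsq.Message) error {
        log.Printf("got message: %s", m.Body)
        return nil // returning nil FINs the message
    }))
    // Placeholder address of a local nsqd.
    if err := consumer.ConnectToNSQD("127.0.0.1:4150"); err != nil {
        log.Fatal(err)
    }
    select {} // block forever
}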
The topic and channel names the consumer wants are parsed from the request. GetTopic creates the topic and adds it to nsqd's map if it doesn't already exist; GetChannel likewise creates the channel inside the topic if needed. The current client is then added to the channel, marking it as a subscriber, the client's state is set to subscribed, and the channel is pushed into the client's SubEventChan. So when is SubEventChan actually consumed? Back in messagePump:
for {
    if subChannel == nil || !client.IsReadyForMessages() {
        // the client is not ready to receive messages...
        memoryMsgChan = nil
        backendMsgChan = nil
        flusherChan = nil
        // force flush
        client.writeLock.Lock()
        err = client.Flush()
        client.writeLock.Unlock()
        if err != nil {
            goto exit
        }
        flushed = true
    } else if flushed {
        // last iteration we flushed...
        // do not select on the flusher ticker channel
        memoryMsgChan = subChannel.memoryMsgChan
        backendMsgChan = subChannel.backend.ReadChan()
        flusherChan = nil
    } else {
        // we're buffered (if there isn't any more data we should flush)...
        // select on the flusher ticker channel, too
        memoryMsgChan = subChannel.memoryMsgChan
        backendMsgChan = subChannel.backend.ReadChan()
        flusherChan = outputBufferTicker.C
    }

    select {
    case <-flusherChan:
        // if this case wins, we're either starved
        // or we won the race between other channels...
        // in either case, force flush
        client.writeLock.Lock()
        err = client.Flush()
        client.writeLock.Unlock()
        if err != nil {
            goto exit
        }
        flushed = true
    case <-client.ReadyStateChan:
    case subChannel = <-subEventChan:
        // you can't SUB anymore
        subEventChan = nil
    case identifyData := <-identifyEventChan:
        // you can't IDENTIFY anymore
        identifyEventChan = nil

        outputBufferTicker.Stop()
        if identifyData.OutputBufferTimeout > 0 {
            outputBufferTicker = time.NewTicker(identifyData.OutputBufferTimeout)
        }

        heartbeatTicker.Stop()
        heartbeatChan = nil
        if identifyData.HeartbeatInterval > 0 {
            heartbeatTicker = time.NewTicker(identifyData.HeartbeatInterval)
            heartbeatChan = heartbeatTicker.C
        }

        if identifyData.SampleRate > 0 {
            sampleRate = identifyData.SampleRate
        }

        msgTimeout = identifyData.MsgTimeout
    case <-heartbeatChan:
        err = p.Send(client, frameTypeResponse, heartbeatBytes)
        if err != nil {
            goto exit
        }
    case b := <-backendMsgChan:
        if sampleRate > 0 && rand.Int31n(100) > sampleRate {
            continue
        }

        msg, err := decodeMessage(b)
        if err != nil {
            p.ctx.nsqd.logf(LOG_ERROR, "failed to decode message - %s", err)
            continue
        }
        msg.Attempts++

        subChannel.StartInFlightTimeout(msg, client.ID, msgTimeout)
        client.SendingMessage()
        err = p.SendMessage(client, msg)
        if err != nil {
            goto exit
        }
        flushed = false
    case msg := <-memoryMsgChan:
        if sampleRate > 0 && rand.Int31n(100) > sampleRate {
            continue
        }
        msg.Attempts++

        subChannel.StartInFlightTimeout(msg, client.ID, msgTimeout)
        client.SendingMessage()
        err = p.SendMessage(client, msg)
        if err != nil {
            goto exit
        }
        flushed = false
    case <-client.ExitChan:
        goto exit
    }
}
If subChannel is nil (or the client isn't ready), the client hasn't subscribed to anything yet, so memoryMsgChan, backendMsgChan, and flusherChan are all set to nil, the output buffer is force-flushed, and flushed is set to true. That way, as soon as subChannel becomes non-nil, the next iteration wires memoryMsgChan and backendMsgChan straight to the subscribed channel's queues. Now look at the select branch subChannel = <-subEventChan: when the client subscribes, the SUB handler pushes the subscribed channel into subEventChan, it is received here, and from then on messages pending on that channel can be delivered to the client. Nil-ing subEventChan afterwards ensures a client can only SUB once.
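The flusherChan juggling implements a classic buffered-writer pattern: writes go through a buffer, and a flush happens either eagerly when the client goes idle or when the ticker fires with data still pending. A self-contained sketch of that pattern (names and the 250ms timeout are placeholders; nsqd drives this with outputBufferTicker):

package main

import (
    "bufio"
    "bytes"
    "fmt"
    "time"
)

func main() {
    var sink bytes.Buffer // stands in for the client connection
    w := bufio.NewWriter(&sink)

    flusher := time.NewTicker(250 * time.Millisecond) // OutputBufferTimeout analogue
    defer flusher.Stop()

    msgs := make(chan string, 2)
    msgs <- "msg-1"
    msgs <- "msg-2"

    flushed := true
    for i := 0; i < 3; i++ {
        var flusherChan <-chan time.Time
        if !flushed {
            flusherChan = flusher.C // only arm the flusher when data is buffered
        }
        select {
        case m := <-msgs:
            w.WriteString(m + "\n") // cheap buffered write
            flushed = false
        case <-flusherChan:
            w.Flush() // idle: push buffered data out
            flushed = true
        }
    }
    w.Flush()
    fmt.Printf("wire bytes: %q\n", sink.String())
}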
That raises the next question: the messages to deliver come from subChannel's memoryMsgChan and backendMsgChan, so when were they put there? The answer is a goroutine started when the Topic is created, also called messagePump:
func (t *Topic) messagePump() {
    var msg *Message
    var buf []byte
    var err error
    var chans []*Channel
    var memoryMsgChan chan *Message
    var backendChan chan []byte

    // do not pass messages before Start(), but avoid blocking Pause() or GetChannel()
    for {
        select {
        case <-t.channelUpdateChan:
            continue
        case <-t.pauseChan:
            continue
        case <-t.exitChan:
            goto exit
        case <-t.startChan:
        }
        break
    }
    t.RLock()
    for _, c := range t.channelMap {
        chans = append(chans, c)
    }
    t.RUnlock()
    if len(chans) > 0 && !t.IsPaused() {
        memoryMsgChan = t.memoryMsgChan
        backendChan = t.backend.ReadChan()
    }

    // main message loop
    for {
        select {
        case msg = <-memoryMsgChan:
        case buf = <-backendChan:
            msg, err = decodeMessage(buf)
            if err != nil {
                t.ctx.nsqd.logf(LOG_ERROR, "failed to decode message - %s", err)
                continue
            }
        case <-t.channelUpdateChan:
            chans = chans[:0]
            t.RLock()
            for _, c := range t.channelMap {
                chans = append(chans, c)
            }
            t.RUnlock()
            if len(chans) == 0 || t.IsPaused() {
                memoryMsgChan = nil
                backendChan = nil
            } else {
                memoryMsgChan = t.memoryMsgChan
                backendChan = t.backend.ReadChan()
            }
            continue
        case <-t.pauseChan:
            if len(chans) == 0 || t.IsPaused() {
                memoryMsgChan = nil
                backendChan = nil
            } else {
                memoryMsgChan = t.memoryMsgChan
                backendChan = t.backend.ReadChan()
            }
            continue
        case <-t.exitChan:
            goto exit
        }

        for i, channel := range chans {
            chanMsg := msg
            // copy the message because each channel
            // needs a unique instance but...
            // fastpath to avoid copy if its the first channel
            // (the topic already created the first copy)
            if i > 0 {
                chanMsg = NewMessage(msg.ID, msg.Body)
                chanMsg.Timestamp = msg.Timestamp
                chanMsg.deferred = msg.deferred
            }
            if chanMsg.deferred != 0 {
                channel.PutMessageDeferred(chanMsg, chanMsg.deferred)
                continue
            }
            err := channel.PutMessage(chanMsg)
            if err != nil {
                t.ctx.nsqd.logf(LOG_ERROR,
                    "TOPIC(%s) ERROR: failed to put msg(%s) to channel(%s) - %s",
                    t.name, msg.ID, channel.name, err)
            }
        }
    }

exit:
    t.ctx.nsqd.logf(LOG_INFO, "TOPIC(%s): closing ... messagePump", t.name)
}
Once the topic is started, all of its channels are collected into chans, and the loop selects over memoryMsgChan, backendChan, channelUpdateChan, pauseChan, and exitChan. The last three are state-related; the first two deliver messages. When a message arrives, the loop iterates over the channel list and puts a copy of the message into every channel (each channel needs its own instance). Delivery is either immediate or deferred: immediate delivery puts the message straight onto the channel's memoryMsgChan or disk, while deferred delivery first puts it into the channel's deferred queue. How does a deferred message eventually get delivered? Recall queueScanLoop: the workers it starts check whether deferred messages have come due, and when one has, it is put into the channel right away.
The next question: where do the messages the Topic delivers come from? For that, we need the publish path:
func (p *protocolV2) PUB(client *clientV2, params [][]byte) ([]byte, error) {
    var err error

    if len(params) < 2 {
        return nil, protocol.NewFatalClientErr(nil, "E_INVALID", "PUB insufficient number of parameters")
    }

    topicName := string(params[1])
    if !protocol.IsValidTopicName(topicName) {
        return nil, protocol.NewFatalClientErr(nil, "E_BAD_TOPIC",
            fmt.Sprintf("PUB topic name %q is not valid", topicName))
    }

    bodyLen, err := readLen(client.Reader, client.lenSlice)
    if err != nil {
        return nil, protocol.NewFatalClientErr(err, "E_BAD_MESSAGE", "PUB failed to read message body size")
    }

    if bodyLen <= 0 {
        return nil, protocol.NewFatalClientErr(nil, "E_BAD_MESSAGE",
            fmt.Sprintf("PUB invalid message body size %d", bodyLen))
    }

    if int64(bodyLen) > p.ctx.nsqd.getOpts().MaxMsgSize {
        return nil, protocol.NewFatalClientErr(nil, "E_BAD_MESSAGE",
            fmt.Sprintf("PUB message too big %d > %d", bodyLen, p.ctx.nsqd.getOpts().MaxMsgSize))
    }

    messageBody := make([]byte, bodyLen)
    _, err = io.ReadFull(client.Reader, messageBody)
    if err != nil {
        return nil, protocol.NewFatalClientErr(err, "E_BAD_MESSAGE", "PUB failed to read message body")
    }

    if err := p.CheckAuth(client, "PUB", topicName, ""); err != nil {
        return nil, err
    }

    topic := p.ctx.nsqd.GetTopic(topicName)
    msg := NewMessage(topic.GenerateID(), messageBody)
    err = topic.PutMessage(msg)
    if err != nil {
        return nil, protocol.NewFatalClientErr(err, "E_PUB_FAILED", "PUB failed "+err.Error())
    }

    client.PublishedMessage(topicName, 1)

    return okBytes, nil
}
The handler parses the target topic and reads the message body, fetches the corresponding Topic from nsqd, generates an ID for the message, wraps the ID and body into a Message, and calls the topic's PutMessage method, which is the key step. Finally it calls the client's PublishedMessage method. Let's look at these one by one (PutMessage delegates to put):
func (t *Topic) put(m *Message) error {
    select {
    case t.memoryMsgChan <- m:
    default:
        b := bufferPoolGet()
        err := writeMessageToBackend(b, m, t.backend)
        bufferPoolPut(b)
        t.ctx.nsqd.SetHealth(err)
        if err != nil {
            t.ctx.nsqd.logf(LOG_ERROR,
                "TOPIC(%s) ERROR: failed to write message to backend - %s",
                t.name, err)
            return err
        }
    }
    return nil
}
Just like the channel's put, the message goes onto memoryMsgChan if there is room, and otherwise (the select's default case) it is written to the disk-backed queue.
func (c *clientV2) PublishedMessage(topic string, count uint64) {
    c.metaLock.Lock()
    c.pubCounts[topic] += count
    c.metaLock.Unlock()
}
This last method simply counts, per topic, how many messages the client has published.
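Putting the publish path together from the client's point of view: with the official go-nsq client, a publish that flows through the PUB handler above looks roughly like this (the topic and address are placeholders):

package main

import (
    "log"

    "github.com/nsqio/go-nsq"
)

func main() {
    // Placeholder address of a local nsqd's TCP port.
    producer, err := nsq.NewProducer("127.0.0.1:4150", nsq.NewConfig())
    if err != nil {
        log.Fatal(err)
    }
    defer producer.Stop()

    // Publish sends PUB mytopic plus the body; nsqd then runs the
    // PUB handler, topic.PutMessage, and client.PublishedMessage above.
    if err := producer.Publish("mytopic", []byte("hello nsq")); err != nil {
        log.Fatal(err)
    }
}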
Of the five important functions in nsqd's Main, we have now covered three; the remaining two will be covered later. Reading this far, you really have to marvel at how flexibly Go's channels and goroutines are used here. Keep a clear head while reading this code, or the values flowing between channels will quickly blur together. That's it for today.