The supervisor starts workers by calling the sync-processes function; for a detailed analysis of sync-processes, see "Storm Supervisor Startup Source Analysis (supervisor.clj)". A snippet of sync-processes follows:
;; sync-processes manages workers: it handles misbehaving or dead workers and creates new ones
;; supervisor is the supervisor's metadata
(defn sync-processes [supervisor]
  .
  .
  .
  ;; part of the function omitted
  .
  .
  .
  (wait-for-workers-launch
   conf
   (dofor [[port assignment] reassign-executors]
     (let [id (new-worker-ids port)]
       (log-message "Launching worker with assignment "
                    (pr-str assignment)
                    " for this supervisor "
                    (:supervisor-id supervisor)
                    " on port "
                    port
                    " with id "
                    id)
       ;; launch-worker starts the worker
       (launch-worker supervisor
                      (:storm-id assignment)
                      port
                      id)
       id))))
sync-processes calls the launch-worker function to start each worker. launch-worker is a multimethod, defined as follows:
The defmulti and defmethod macros are often used together to define a multimethod. defmulti takes a method name and a dispatch function; the dispatch function's return value selects which implementation is invoked. defmethod takes the method name, a dispatch value, a parameter list, and a body. The special dispatch value :default marks the fallback implementation, invoked when no other dispatch value matches. All defmethod implementations of the same name must take the same number of arguments. The arguments passed to the multimethod are passed on to the dispatch function. The effect is similar to method overloading in Java.
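To make the dispatch mechanism concrete, here is a minimal, self-contained sketch that mirrors how launch-worker dispatches on the cluster mode; the :mode key and the behaviors are hypothetical stand-ins for Storm's real config lookup:

(defmulti start-worker (fn [supervisor & _] (:mode (:conf supervisor))))

(defmethod start-worker :distributed [supervisor port]
  (str "fork a worker JVM on port " port))

(defmethod start-worker :local [supervisor port]
  (str "create an in-process worker on port " port))

(defmethod start-worker :default [supervisor port]
  (throw (RuntimeException. "unknown cluster mode")))

;; (start-worker {:conf {:mode :distributed}} 6700)
;; => "fork a worker JVM on port 6700"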
The launch-worker function:
(defmulti launch-worker
  (fn [supervisor & _] (cluster-mode (:conf supervisor))))
;; Executed when the dispatch function returns the keyword :distributed,
;; i.e. the Storm cluster runs in distributed mode
;; supervisor is the supervisor's metadata; storm-id identifies the topology the worker
;; belongs to; port is the port the worker occupies; worker-id is a UUID string (32 hex
;; characters) identifying the worker
(defmethod launch-worker
  :distributed [supervisor storm-id port worker-id]
  ;; conf is the cluster configuration
  (let [conf (:conf supervisor)
        ;; storm-home is the local Storm install path
        storm-home (System/getProperty "storm.home")
        ;; storm-log-dir is the log directory
        storm-log-dir (or (System/getProperty "storm.log.dir")
                          (str storm-home "/logs"))
        ;; stormroot is the supervisor-local path "{storm.local.dir}/supervisor/stormdist/{storm-id}"
        stormroot (supervisor-stormdist-root conf storm-id)
        ;; jlp is the search path for native libraries needed at run time;
        ;; the jlp function builds it, see its definition below
        jlp (jlp stormroot conf)
        ;; stormjar is the path of stormjar.jar:
        ;; "{storm.local.dir}/supervisor/stormdist/{storm-id}/stormjar.jar"
        stormjar (supervisor-stormjar-path stormroot)
        ;; storm-conf is the union of the cluster configuration and the topology's configuration
        storm-conf (read-supervisor-storm-conf conf storm-id)
        ;; topo-classpath holds the extra classpath entries configured for this topology
        topo-classpath (if-let [cp (storm-conf TOPOLOGY-CLASSPATH)]
                         [cp]
                         [])
        ;; add the paths of stormjar and topo-classpath to the JVM classpath
        classpath (-> (current-classpath)
                      (add-to-classpath [stormjar])
                      (add-to-classpath topo-classpath))
        ;; the default JVM options the supervisor passes to every worker, from the cluster configuration
        worker-childopts (when-let [s (conf WORKER-CHILDOPTS)]
                           (substitute-childopts s worker-id storm-id port))
        ;; the JVM options configured specifically for this topology's workers
        topo-worker-childopts (when-let [s (storm-conf TOPOLOGY-WORKER-CHILDOPTS)]
                                (substitute-childopts s worker-id storm-id port))
        ;; merge the topology-specific native-library path into jlp, so the resulting environment
        ;; map contains every library path the topology's worker needs
        topology-worker-environment (if-let [env (storm-conf TOPOLOGY-ENVIRONMENT)]
                                      (merge env {"LD_LIBRARY_PATH" jlp})
                                      {"LD_LIBRARY_PATH" jlp})
        ;; the worker's log file name, worker-{port}.log
        logfilename (str "worker-" port ".log")
        ;; command is a "java -server <jvm opts> -cp classpath classname arg_0 arg_1 ... arg_n"
        ;; command line, where <jvm opts> are the options passed to the java command
        command (concat
                  [(java-cmd) "-server"]
                  worker-childopts
                  topo-worker-childopts
                  [(str "-Djava.library.path=" jlp)
                   (str "-Dlogfile.name=" logfilename)
                   (str "-Dstorm.home=" storm-home)
                   (str "-Dstorm.log.dir=" storm-log-dir)
                   (str "-Dlogback.configurationFile=" storm-home "/logback/cluster.xml")
                   (str "-Dstorm.id=" storm-id)
                   (str "-Dworker.id=" worker-id)
                   (str "-Dworker.port=" port)
                   "-cp"
                   classpath
                   "backtype.storm.daemon.worker"
                   storm-id
                   (:assignment-id supervisor)
                   port
                   worker-id])
        ;; drop empty entries from the command
        command (->> command (map str) (filter (complement empty?)))
        ;; the single-string form of the command, for logging
        shell-cmd (->> command
                       (map #(str \' (clojure.string/escape % {\' "\\'"}) \'))
                       (clojure.string/join " "))]
    (log-message "Launching worker with command: " shell-cmd)
    ;; run the command through the ProcessBuilder class: a new JVM process executes the main
    ;; method of backtype.storm.daemon.worker, with storm-id, supervisor-id, port and worker-id
    ;; as arguments; see the definition of that main method below
    (launch-process command :environment topology-worker-environment)))
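Taken on its own, the command-assembly idiom above (build the argument seq, stringify, drop blanks, then single-quote each token for logging) can be replayed in isolation; the values below are made up:

(require '[clojure.string :as str])

(let [command (concat ["java" "-server"]
                      nil                 ;; e.g. worker-childopts not configured
                      ["-Dworker.port=6700" "-cp" "app.jar" "backtype.storm.daemon.worker"])
      command (->> command (map str) (filter (complement empty?)))
      shell-cmd (->> command
                     (map #(str \' (str/escape % {\' "\\'"}) \'))
                     (str/join " "))]
  shell-cmd)
;; => "'java' '-server' '-Dworker.port=6700' '-cp' 'app.jar' 'backtype.storm.daemon.worker'"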
;; Executed when the dispatch function returns the keyword :local,
;; i.e. the Storm cluster runs in local mode
(defmethod launch-worker
  :local [supervisor storm-id port worker-id]
  (let [conf (:conf supervisor)
        pid (uuid)
        worker (worker/mk-worker conf
                                 (:shared-context supervisor)
                                 storm-id
                                 (:assignment-id supervisor)
                                 port
                                 worker-id)]
    (psim/register-process pid worker)
    (swap! (:worker-thread-pids-atom supervisor) assoc worker-id pid)))
The jlp function is defined as follows:
;; stormroot is the supervisor-local path "{storm.local.dir}/supervisor/stormdist/{storm-id}";
;; conf is the cluster configuration
(defn jlp [stormroot conf]
  ;; resource-root is "{storm.local.dir}/supervisor/stormdist/{storm-id}/resources"
  (let [resource-root (str stormroot File/separator RESOURCES-SUBDIR)
        ;; the supervisor host's operating system name, with whitespace replaced by "_"
        os (clojure.string/replace (System/getProperty "os.name") #"\s+" "_")
        ;; the operating system architecture, e.g. "x86" or "i386"
        arch (System/getProperty "os.arch")
        ;; arch-resource-root is
        ;; "{storm.local.dir}/supervisor/stormdist/{storm-id}/resources/{os}-{arch}"
        arch-resource-root (str resource-root File/separator os "-" arch)]
    ;; returns "{storm.local.dir}/supervisor/stormdist/{storm-id}/resources/{os}-{arch}:
    ;; {storm.local.dir}/supervisor/stormdist/{storm-id}/resources:{java.library.path}"
    (str arch-resource-root File/pathSeparator
         resource-root File/pathSeparator
         (conf JAVA-LIBRARY-PATH))))
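As a worked example, here is a trimmed re-implementation of the same path construction, with the config lookup replaced by a plain argument; the input values are hypothetical:

(require '[clojure.string :as str])
(import 'java.io.File)

(defn jlp-sketch [stormroot java-library-path]
  (let [resource-root (str stormroot File/separator "resources")
        os   (str/replace (System/getProperty "os.name") #"\s+" "_")
        arch (System/getProperty "os.arch")]
    (str resource-root File/separator os "-" arch
         File/pathSeparator resource-root
         File/pathSeparator java-library-path)))

;; (jlp-sketch "/var/storm/supervisor/stormdist/topo-1" "/usr/local/lib")
;; => "/var/storm/supervisor/stormdist/topo-1/resources/Linux-amd64:
;;     /var/storm/supervisor/stormdist/topo-1/resources:/usr/local/lib"   (on Linux/amd64)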
The read-supervisor-storm-conf function is defined as follows:
;; reads the topology's run-time configuration from the supervisor-local file
;; "{storm.local.dir}/supervisor/stormdist/{storm-id}/stormconf.ser"
(defn read-supervisor-storm-conf [conf storm-id]
  ;; stormroot is the directory "{storm.local.dir}/supervisor/stormdist/{storm-id}"
  (let [stormroot (supervisor-stormdist-root conf storm-id)
        ;; conf-path is "{storm.local.dir}/supervisor/stormdist/{storm-id}/stormconf.ser"
        conf-path (supervisor-stormconf-path stormroot)
        ;; topology-path is "{storm.local.dir}/supervisor/stormdist/{storm-id}/stormcode.ser"
        topology-path (supervisor-stormcode-path stormroot)]
    ;; returns the cluster configuration merged with the topology configuration
    (merge conf (Utils/deserialize (FileUtils/readFileToByteArray (File. conf-path))))))
The backtype.storm.daemon.worker class is defined in worker.clj; :gen-class generates a Java class whose main method is as follows:
(defn -main [storm-id assignment-id port-str worker-id]
  ;; read the Storm cluster configuration
  (let [conf (read-storm-config)]
    ;; validate the configuration
    (validate-distributed-mode! conf)
    ;; call mk-worker; see its definition below
    (mk-worker conf nil storm-id assignment-id (Integer/parseInt port-str) worker-id)))
The mk-worker function:
;; conf is the cluster configuration; shared-mq-context is the shared messaging context;
;; storm-id identifies the topology; assignment-id identifies the supervisor
(defserverfn mk-worker [conf shared-mq-context storm-id assignment-id port worker-id]
  (log-message "Launching worker for " storm-id " on " assignment-id ":" port
               " with id " worker-id " and conf " conf)
  ;; when Storm is not running in local mode (i.e. it runs in distributed mode),
  ;; redirect stdout/stderr to slf4j
  (if-not (local-mode? conf)
    (redirect-stdio-to-slf4j!))
  ;; because in local mode, its not a separate
  ;; process. supervisor will register it in this case
  ;; In distributed mode, create the file "{storm.local.dir}/workers/{worker-id}/pids/{process-pid}"
  ;; on the supervisor host; the process-pid function returns the JVM process id.
  ;; Note that worker-id is an identifier we assigned to the process ourselves: a process id is
  ;; assigned by the operating system and cannot be chosen at process-creation time, so we look
  ;; up the real process id and associate it with our worker-id.
  (when (= :distributed (cluster-mode conf))
    (touch (worker-pid-path conf worker-id (process-pid))))
  ;; worker holds the process's "metadata", built by worker-data; see its definition below
  (let [worker (worker-data conf shared-mq-context storm-id assignment-id port worker-id)
        ;; heartbeat-fn is an anonymous function that writes the worker's local heartbeat;
        ;; see the definition of do-heartbeat below
        heartbeat-fn #(do-heartbeat worker)
        ;; do this here so that the worker process dies if this fails
        ;; it's important that worker heartbeat to supervisor ASAP when launching so that the supervisor knows it's running (and can move on)
        ;; call heartbeat-fn once to store the worker heartbeat in the local LocalState object
        _ (heartbeat-fn)
        ;; an atom that will hold the executors
        executors (atom nil)
        ;; launch heartbeat threads immediately so that slow-loading tasks don't cause the worker to timeout
        ;; to the supervisor
        ;; schedule heartbeat-fn on the heartbeat-timer: initial delay 0s, then every
        ;; WORKER-HEARTBEAT-FREQUENCY-SECS
        _ (schedule-recurring (:heartbeat-timer worker)
                              0
                              (conf WORKER-HEARTBEAT-FREQUENCY-SECS)
                              heartbeat-fn)
        ;; schedule #(do-executor-heartbeats worker :executors @executors) on the
        ;; executor-heartbeat-timer: initial delay 0s, then every TASK-HEARTBEAT-FREQUENCY-SECS.
        ;; This syncs the worker heartbeat to zookeeper so that nimbus learns immediately that
        ;; the worker process has started; see do-executor-heartbeats below
        _ (schedule-recurring (:executor-heartbeat-timer worker)
                              0
                              (conf TASK-HEARTBEAT-FREQUENCY-SECS)
                              #(do-executor-heartbeats worker :executors @executors))
        ;; refreshes the worker's outbound connections; see mk-refresh-connections below
        refresh-connections (mk-refresh-connections worker)
        ;; refresh this worker's connections once, without registering a callback with zookeeper
        _ (refresh-connections nil)
        ;; refresh the worker's cached view of whether its topology is active;
        ;; see refresh-storm-active below
        _ (refresh-storm-active worker nil)
        ;; create the executor objects and store them in executors; executor creation will be
        ;; analyzed in a later article
        _ (reset! executors (dofor [e (:executors worker)] (executor/mk-executor worker e)))
        ;; start the worker's dedicated receive thread, which keeps moving data from the worker's
        ;; listening port into the tasks' receive queues; receive-thread-shutdown is the function
        ;; that shuts the thread down. See launch-receive-thread below
        receive-thread-shutdown (launch-receive-thread worker)
        ;; the event handler that drains the transfer queue; message processing will be analyzed
        ;; in a later article
        transfer-tuples (mk-transfer-tuples-handler worker)
        ;; create the transfer thread; message processing will be analyzed in a later article
        transfer-thread (disruptor/consume-loop* (:transfer-queue worker) transfer-tuples)
        ;; the shutdown callback: invoked when the worker process shuts down, to release the
        ;; resources it holds
        shutdown* (fn []
                    (log-message "Shutting down worker " storm-id " " assignment-id " " port)
                    ;; close this worker's connections to other workers
                    (doseq [[_ socket] @(:cached-node+port->socket worker)]
                      ;; this will do best effort flushing since the linger period
                      ;; was set on creation
                      (.close socket))
                    (log-message "Shutting down receive thread")
                    ;; shut down the worker's receive thread
                    (receive-thread-shutdown)
                    (log-message "Shut down receive thread")
                    (log-message "Terminating messaging context")
                    (log-message "Shutting down executors")
                    ;; shut down the executors owned by this worker
                    (doseq [executor @executors] (.shutdown executor))
                    (log-message "Shut down executors")
                    ;;this is fine because the only time this is shared is when it's a local context,
                    ;;in which case it's a noop
                    ;; terminate the worker's backtype.storm.messaging.netty.Context instance
                    (.term ^IContext (:mq-context worker))
                    (log-message "Shutting down transfer thread")
                    ;; halt the transfer-queue
                    (disruptor/halt-with-interrupt! (:transfer-queue worker))
                    ;; interrupt the transfer thread
                    (.interrupt transfer-thread)
                    ;; wait for the transfer thread to finish
                    (.join transfer-thread)
                    (log-message "Shut down transfer thread")
                    ;; cancel-timer interrupts each timer thread
                    (cancel-timer (:heartbeat-timer worker))
                    (cancel-timer (:refresh-connections-timer worker))
                    (cancel-timer (:refresh-active-timer worker))
                    (cancel-timer (:executor-heartbeat-timer worker))
                    (cancel-timer (:user-timer worker))
                    ;; close the thread pools owned by this worker
                    (close-resources worker)
                    ;; TODO: here need to invoke the "shutdown" method of WorkerHook
                    ;; remove the worker heartbeat from zookeeper via the StormClusterState instance
                    (.remove-worker-heartbeat! (:storm-cluster-state worker)
                                               storm-id assignment-id port)
                    (log-message "Disconnecting from storm cluster state context")
                    ;; close the zookeeper connections
                    (.disconnect (:storm-cluster-state worker))
                    (.close (:cluster-state worker))
                    (log-message "Shut down worker " storm-id " " assignment-id " " port))
        ;; ret implements the Shutdownable and DaemonCommon protocols
        ret (reify
              Shutdownable
              (shutdown [this]
                (shutdown*))
              DaemonCommon
              (waiting? [this]
                (and
                  (timer-waiting? (:heartbeat-timer worker))
                  (timer-waiting? (:refresh-connections-timer worker))
                  (timer-waiting? (:refresh-active-timer worker))
                  (timer-waiting? (:executor-heartbeat-timer worker))
                  (timer-waiting? (:user-timer worker)))))]
    ;; schedule refresh-connections on the refresh-connections-timer, repeated every
    ;; TASK-REFRESH-POLL-SECS. The zero-argument version of refresh-connections calls the
    ;; one-argument version with a default callback that re-schedules the zero-argument version
    ;; on the same timer. As a result, whenever the assignment information in zookeeper changes,
    ;; the one-argument version runs again; the periodic execution here is a safety net in case
    ;; zookeeper's watcher mechanism fails.
    (schedule-recurring (:refresh-connections-timer worker)
                        0
                        (conf TASK-REFRESH-POLL-SECS)
                        refresh-connections)
    ;; schedule (partial refresh-storm-active worker) on the refresh-active-timer, repeated every
    ;; TASK-REFRESH-POLL-SECS; refresh-storm-active follows exactly the same pattern as
    ;; refresh-connections
    (schedule-recurring (:refresh-active-timer worker)
                        0
                        (conf TASK-REFRESH-POLL-SECS)
                        (partial refresh-storm-active worker))
    (log-message "Worker has topology config " (:storm-conf worker))
    (log-message "Worker " worker-id " for storm " storm-id " on "
                 assignment-id ":" port " has finished loading")
    ;; return ret, which implements Shutdownable and DaemonCommon; through ret we can shut the
    ;; worker process down
    ret))
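The heartbeat wiring above follows a common pattern: schedule the heartbeat with zero initial delay so the supervisor sees the worker as early as possible, then repeat at a fixed period. A minimal sketch of the same pattern using java.util.concurrent in place of Storm's timer (the function below is illustrative, not Storm's schedule-recurring):

(import '[java.util.concurrent Executors TimeUnit])

(defn schedule-recurring-sketch
  "Run f after delay-secs, then every period-secs, on a single-threaded scheduler."
  [f delay-secs period-secs]
  (doto (Executors/newSingleThreadScheduledExecutor)
    (.scheduleAtFixedRate f delay-secs period-secs TimeUnit/SECONDS)))

;; (schedule-recurring-sketch #(println "heartbeat") 0 3)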
The worker-data function:
;; worker-data builds the process's "metadata"
(defn worker-data [conf mq-context storm-id assignment-id port worker-id]
  ;; create a ClusterState instance for this process
  (let [cluster-state (cluster/mk-distributed-cluster-state conf)
        ;; create a StormClusterState instance, through which the process talks to zookeeper
        storm-cluster-state (cluster/mk-storm-cluster-state cluster-state)
        ;; read the configuration of storm-id; see read-supervisor-storm-conf above
        storm-conf (read-supervisor-storm-conf conf storm-id)
        ;; the ids of the executors assigned to this process, including the system executor's id
        executors (set (read-worker-executors storm-conf storm-cluster-state
                                              storm-id assignment-id port))
        ;; communication between executors inside a process goes through disruptor, so a disruptor
        ;; queue named "worker-transfer-queue" is created for this worker; disruptor will be
        ;; covered in detail later. Note that the transfer queue belongs to the worker, not to
        ;; any executor.
        transfer-queue (disruptor/disruptor-queue "worker-transfer-queue"
                                                  (storm-conf TOPOLOGY-TRANSFER-BUFFER-SIZE)
                                                  :wait-strategy (storm-conf TOPOLOGY-DISRUPTOR-WAIT-STRATEGY))
        ;; mk-receive-queue-map creates one disruptor queue named "receive-queue{executor-id}"
        ;; per executor; executor-receive-queue-map maps executor-id -> receive queue.
        ;; These queues belong to executors, not to the worker.
        executor-receive-queue-map (mk-receive-queue-map storm-conf executors)
        ;; an executor may run several tasks, and the tasks of one executor share its receive
        ;; queue; convert the executor-id -> queue map into a task-id -> queue map. E.g. if
        ;; executor-receive-queue-map = {[1 2] receive-queue[1 2], [3 4] receive-queue[3 4]}, then
        ;; receive-queue-map = {1 receive-queue[1 2], 2 receive-queue[1 2],
        ;;                      3 receive-queue[3 4], 4 receive-queue[3 4]}
        receive-queue-map (->> executor-receive-queue-map
                               (mapcat (fn [[e queue]]
                                         (for [t (executor-id->tasks e)]
                                           [t queue])))
                               (into {}))
        ;; read-supervisor-topology reads the serialized topology object from the supervisor-local
        ;; file "{storm.local.dir}/supervisor/stormdist/{storm-id}/stormcode.ser"
        topology (read-supervisor-topology conf storm-id)]
    ;; the recursive-map macro evaluates each value below and builds the worker "metadata" map
    ;; from the keys and the resulting values; see the definition of recursive-map
    (recursive-map
      ;; the cluster configuration
      :conf conf
      ;; the transport-layer instance used for inter-worker messaging. Storm's transport layer is
      ;; pluggable: implementing backtype.storm.messaging.IContext is enough to supply your own.
      ;; Storm 0.8.x defaulted to backtype.storm.messaging.zmq, but:
      ;; 1. ZeroMQ is a native messaging library that depends heavily on the OS environment, and
      ;;    it uses off-heap memory that JVM memory-monitoring tools cannot observe, so off-heap
      ;;    leaks are a risk
      ;; 2. it is cumbersome to install
      ;; 3. ZeroMQ's stability varies wildly between versions, and only version 2.1.7 worked well
      ;;    with Storm.
      ;; Since Storm 0.9 the default transport is backtype.storm.messaging.netty.Context. Netty's
      ;; advantages:
      ;; 1. platform independence: Netty is a pure-Java messaging layer, which gives Storm better
      ;;    cross-platform behavior and better control over messages, since Netty uses JVM heap
      ;;    memory rather than off-heap memory
      ;; 2. performance: Netty is roughly twice as fast as ZeroMQ
      ;; 3. security: it makes future authentication/authorization between worker processes possible
      :mq-context (if mq-context
                    mq-context
                    (TransportFactory/makeContext storm-conf))
      ;; the topology this worker belongs to
      :storm-id storm-id
      ;; the supervisor this worker belongs to
      :assignment-id assignment-id
      ;; the worker's port
      :port port
      ;; the worker-id we assigned to this process
      :worker-id worker-id
      ;; the ClusterState instance
      :cluster-state cluster-state
      ;; the StormClusterState instance, for talking to zookeeper
      :storm-cluster-state storm-cluster-state
      ;; the topology's active flag, initially false
      :storm-active-atom (atom false)
      ;; ids of the executors placed on this worker
      :executors executors
      ;; sorted ids of the tasks placed on this worker
      :task-ids (->> receive-queue-map keys (map int) sort)
      ;; the topology's configuration
      :storm-conf storm-conf
      ;; the topology instance
      :topology topology
      ;; the topology instance with ackers, system bolts and metric bolts added
      :system-topology (system-topology! storm-conf topology)
      ;; a timer named "heartbeat-timer"
      :heartbeat-timer (mk-halting-timer "heartbeat-timer")
      ;; a timer named "refresh-connections-timer"
      :refresh-connections-timer (mk-halting-timer "refresh-connections-timer")
      ;; a timer named "refresh-active-timer"
      :refresh-active-timer (mk-halting-timer "refresh-active-timer")
      ;; a timer named "executor-heartbeat-timer"
      :executor-heartbeat-timer (mk-halting-timer "executor-heartbeat-timer")
      ;; a timer named "user-timer"
      :user-timer (mk-halting-timer "user-timer")
      ;; map from task id to component name, e.g. {1 "boltA", 2 "boltA", 3 "boltA", 4 "boltA",
      ;; 5 "boltB", 6 "boltB"}; see storm-task-info below
      :task->component (HashMap. (storm-task-info topology storm-conf))
      ; for optimized access when used in tasks later on
      ;; map from component name to a map of stream_id -> output Fields object;
      ;; see component->stream->fields below
      :component->stream->fields (component->stream->fields (:system-topology <>))
      ;; map from component name to its sorted task ids, e.g. {"boltA" [1 2 3 4], "boltB" [5 6]}
      :component->sorted-tasks (->> (:task->component <>) reverse-map (map-val sort))
      ;; a ReentrantReadWriteLock
      :endpoint-socket-lock (mk-rw-lock)
      ;; an atom holding the node+port -> socket map
      :cached-node+port->socket (atom {})
      ;; an atom holding the task -> node+port map
      :cached-task->node+port (atom {})
      ;; the worker's transfer queue
      :transfer-queue transfer-queue
      ;; the executor receive-queue map
      :executor-receive-queue-map executor-receive-queue-map
      ;; map from an executor's "start task id" to its receive queue; e.g. if
      ;; executor-receive-queue-map = {[1 2] receive-queue[1 2], [3 4] receive-queue[3 4]}, then
      ;; short-executor-receive-queue-map = {1 receive-queue[1 2], 3 receive-queue[3 4]}
      :short-executor-receive-queue-map (map-key first executor-receive-queue-map)
      ;; map from task id to the "start task id" of its executor; e.g. if
      ;; executors = #{[1 2] [3 4] [5 6]}, then
      ;; task->short-executor = {1 1, 2 1, 3 3, 4 3, 5 5, 6 5}
      :task->short-executor (->> executors
                                 (mapcat (fn [e]
                                           (for [t (executor-id->tasks e)]
                                             [t (first e)])))
                                 (into {})
                                 (HashMap.))
      ;; a "suicide function" that can terminate this worker process
      :suicide-fn (mk-suicide-fn conf)
      ;; a function that reports how long the worker has been up
      :uptime (uptime-computer)
      ;; a thread pool shared by the worker
      :default-shared-resources (mk-default-resources <>)
      ;; mk-user-resources is currently an empty implementation
      :user-shared-resources (mk-user-resources <>)
      ;; a function that receives messages and delivers each one to the receive queue of its
      ;; target task; see mk-transfer-local-fn below
      :transfer-local-fn (mk-transfer-local-fn <>)
      ;; the number of dedicated receive threads per worker
      :receiver-thread-count (get storm-conf WORKER-RECEIVER-THREAD-COUNT)
      ;; a function that puts messages processed by executors onto the worker's transfer queue;
      ;; see mk-transfer-fn below
      :transfer-fn (mk-transfer-fn <>))))
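worker-data relies on the recursive-map macro, whose defining feature is that each value expression may refer to the entries built so far through the symbol <>. Here is a minimal sketch of such a macro, an illustration of the idea rather than Storm's exact definition:

(defmacro recursive-map-sketch [& kvs]
  ;; thread the partially built map through each (assoc <> k v) step,
  ;; binding it to <> so later values can look earlier keys up
  (reduce (fn [acc [k v]] `(as-> ~acc ~'<> (assoc ~'<> ~k ~v)))
          {}
          (partition 2 kvs)))

;; (recursive-map-sketch :a 1
;;                       :b (inc (:a <>)))
;; => {:a 1, :b 2}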
The read-worker-executors function:
;; read-worker-executors reads the executors assigned to this process
(defn read-worker-executors [storm-conf storm-cluster-state storm-id assignment-id port]
  ;; assignment is the executor->node+port map; the assignment-info function of the
  ;; StormClusterState instance reads the topology's AssignmentInfo from zookeeper.
  ;; Assignment is defined as:
  ;; (defrecord Assignment [master-code-dir node->host executor->node+port executor->start-time-secs])
  (let [assignment (:executor->node+port (.assignment-info storm-cluster-state storm-id nil))]
    ;; return the ids of the executors assigned to this process, including the system executor's id
    (doall
      ;; combine the system executor's id with the topology executors' ids
      (concat
        ;; the system executor's id, [-1 -1]
        [Constants/SYSTEM_EXECUTOR_ID]
        ;; pick the executors assigned to this process out of the assignment
        (mapcat (fn [[executor loc]]
                  (if (= loc [assignment-id port])
                    [executor]))
                assignment)))))
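A worked example of that filtering step, using a hypothetical assignment map:

(let [assignment {[1 2] ["node1" 6700]
                  [3 4] ["node2" 6700]
                  [5 6] ["node1" 6701]}
      assignment-id "node1"
      port 6700]
  (concat [[-1 -1]]                       ;; Constants/SYSTEM_EXECUTOR_ID
          (mapcat (fn [[executor loc]]
                    (if (= loc [assignment-id port])
                      [executor]))
                  assignment)))
;; => ([-1 -1] [1 2])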
The mk-receive-queue-map function:
;; mk-receive-queue-map creates one disruptor queue named "receive-queue{executor-id}" per
;; executor, e.g. "receive-queue[1 3]", and returns an executor-id -> receive-queue map
(defn- mk-receive-queue-map [storm-conf executors]
  ;; executors is the set of executor ids
  (->> executors
       ;; TODO: this depends on the type of executor
       ;; create one disruptor receive queue per executor id
       (map (fn [e]
              [e (disruptor/disruptor-queue (str "receive-queue" e)
                                            (storm-conf TOPOLOGY-EXECUTOR-RECEIVE-BUFFER-SIZE)
                                            :wait-strategy (storm-conf TOPOLOGY-DISRUPTOR-WAIT-STRATEGY))]))
       ;; return the executor-id -> receive-queue map
       (into {})))
The storm-task-info function:
(defn storm-task-info
  "Returns map from task -> component id"
  [^StormTopology user-topology storm-conf]
  (->> (system-topology! storm-conf user-topology)
       ;; the map of component name -> component object
       all-components
       ;; map of component name -> its task count, e.g. {"boltA" 4, "boltB" 2}
       (map-val (comp #(get % TOPOLOGY-TASKS) component-conf))
       ;; sort by component name, producing e.g. (["boltA" 4] ["boltB" 2])
       (sort-by first)
       ;; mapcat is equivalent to running concat on the result of (map (fn ...));
       ;; produces ("boltA" "boltA" "boltA" "boltA" "boltB" "boltB")
       (mapcat (fn [[c num-tasks]] (repeat num-tasks c)))
       ;; pair each component name with a task id counting up from 1, producing
       ;; {1 "boltA", 2 "boltA", 3 "boltA", 4 "boltA", 5 "boltB", 6 "boltB"}
       (map (fn [id comp] [id comp])
            (iterate (comp int inc) (int 1)))
       (into {})))
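The numbering pipeline can be replayed on a plain map of task counts; the component names below are made up:

(->> {"boltA" 4, "boltB" 2}                               ;; component -> task count
     (sort-by first)                                      ;; (["boltA" 4] ["boltB" 2])
     (mapcat (fn [[c num-tasks]] (repeat num-tasks c)))   ;; ("boltA" "boltA" "boltA" "boltA" "boltB" "boltB")
     (map (fn [id comp] [id comp]) (iterate inc 1))       ;; ([1 "boltA"] ... [6 "boltB"])
     (into {}))
;; => {1 "boltA", 2 "boltA", 3 "boltA", 4 "boltA", 5 "boltB", 6 "boltB"}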
The component->stream->fields function:
(defn component->stream->fields [^StormTopology topology]
  ;; ThriftTopologyUtils/getComponentIds returns the set of all component names in the topology,
  ;; e.g. #{"boltA" "boltB" "boltC"}
  (->> (ThriftTopologyUtils/getComponentIds topology)
       ;; for each component, build its stream_id -> StreamInfo-derived Fields map;
       ;; see stream->fields below
       (map (fn [c] [c (stream->fields topology c)]))
       ;; build the map of component name -> (stream_id -> output Fields object)
       (into {})
       ;; convert it to a java HashMap
       (HashMap.)))
The stream->fields function:
(defn- stream->fields [^StormTopology topology component]
  ;; get the ComponentCommon object of the named component
  (->> (ThriftTopologyUtils/getComponentCommon topology component)
       ;; get_streams returns the stream_id -> StreamInfo map; a component may have several
       ;; output streams
       .get_streams
       ;; s is the stream_id and info the StreamInfo object; get_output_fields returns the
       ;; output fields as a List<String>, from which a Fields object is built
       (map (fn [[s info]] [s (Fields. (.get_output_fields info))]))
       ;; build the stream_id -> Fields map
       (into {})
       ;; convert the clojure map into a java HashMap
       (HashMap.)))
The mk-transfer-local-fn function:
;; mk-transfer-local-fn returns an anonymous function that receives messages and delivers each
;; one to the receive queue of its target task
(defn mk-transfer-local-fn [worker]
  ;; short-executor-receive-queue-map maps an executor's "start task id" to its receive queue,
  ;; e.g. {1 receive-queue[1 2], 3 receive-queue[3 4]}
  (let [short-executor-receive-queue-map (:short-executor-receive-queue-map worker)
        ;; task->short-executor maps task id -> the executor's "start task id",
        ;; e.g. {1 1, 2 1, 3 3, 4 3}
        task->short-executor (:task->short-executor worker)
        ;; task-getter is a composed function built with comp
        task-getter (comp #(get task->short-executor %) fast-first)]
    ;; the returned function takes tuple-batch, an ArrayList whose elements are pairs
    ;; [task_id message]: task_id names the task that should process the message
    (fn [tuple-batch]
      ;; fast-group-by builds a map of "short executor id" -> list of messages for that executor
      (let [grouped (fast-group-by task-getter tuple-batch)]
        ;; the fast-map-iter macro iterates over the map; short-executor is the
        ;; "short executor id", pairs the [task_id message] messages
        (fast-map-iter [[short-executor pairs] grouped]
          ;; look up the executor's receive queue
          (let [q (short-executor-receive-queue-map short-executor)]
            ;; if q exists, publish the messages onto the disruptor queue
            (if q
              (disruptor/publish q pairs)
              (log-warn "Received invalid messages for unknown tasks. Dropping... "))))))))
The fast-group-by function:
;; fast-group-by builds the map of "short executor id" -> list of messages for that executor
(defn fast-group-by
  ;; afn is the task-getter function defined in mk-transfer-local-fn; alist is an ArrayList
  ;; whose elements are pairs [task_id message]
  [afn alist]
  ;; create the result HashMap
  (let [ret (HashMap.)]
    ;; fast-list-iter is a macro that iterates over a list; e is one [task_id message] pair
    (fast-list-iter [e alist]
      ;; apply the task-getter to find the "short executor id" this task belongs to
      (let [key (afn e)
            ;; fetch (or create) the ArrayList of messages for that executor
            ^List curr (get-with-default ret key (ArrayList.))]
        ;; append the [task_id message] pair to the list
        (.add curr e)))
    ;; return ret
    ret))
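Functionally, fast-group-by computes the same thing as Clojure's built-in group-by; it exists as a mutable-HashMap variant to cut allocation on the hot path. The equivalent in plain Clojure, with made-up data:

(let [task->short-executor {1 1, 2 1, 3 3, 4 3}
      task-getter (comp task->short-executor first)]
  (group-by task-getter [[1 "m1"] [2 "m2"] [3 "m3"]]))
;; => {1 [[1 "m1"] [2 "m2"]], 3 [[3 "m3"]]}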
The mk-transfer-fn function:
;; mk-transfer-fn puts messages processed by executors onto the worker's transfer queue
(defn mk-transfer-fn [worker]
  ;; local-tasks is the set of task ids placed on this worker
  (let [local-tasks (-> worker :task-ids set)
        ;; local-transfer is the anonymous function returned by mk-transfer-local-fn
        local-transfer (:transfer-local-fn worker)
        ;; the worker's transfer queue
        ^DisruptorQueue transfer-queue (:transfer-queue worker)
        ;; task->node+port is the task_id -> node+port map
        task->node+port (:cached-task->node+port worker)]
    ;; the returned function takes serializer, a Kryo serializer, and tuple-batch, an ArrayList
    ;; whose elements are pairs [task_id message]: task_id is the message's target task
    (fn [^KryoTupleSerializer serializer tuple-batch]
      ;; local is an ArrayList, remoteMap a HashMap
      (let [local (ArrayList.)
            remoteMap (HashMap.)]
        ;; iterate over tuple-batch
        (fast-list-iter [[task tuple :as pair] tuple-batch]
          ;; if the target task is local, i.e. also placed on this worker, append the pair to local
          (if (local-tasks task)
            (.add local pair)
            ;;Using java objects directly to avoid performance issues in java code
            ;; otherwise the target task lives in another worker process; node+port identifies
            ;; the node and port of the worker running that task
            (let [node+port (get @task->node+port task)]
              ;; add node+port to remoteMap if it is not there yet
              (when (not (.get remoteMap node+port))
                (.put remoteMap node+port (ArrayList.)))
              (let [remote (.get remoteMap node+port)]
                ;; build a TaskMessage from the task id and the serialized tuple, then append it
                ;; to the ArrayList
                (.add remote (TaskMessage. task (.serialize serializer tuple)))))))
        ;; deliver the messages destined for local tasks
        (local-transfer local)
        ;; publish remoteMap onto the worker's transfer queue; its keys are node+port values and
        ;; its values are ArrayLists of messages for the worker at that node+port
        (disruptor/publish transfer-queue remoteMap)))))
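The local/remote split above uses mutable Java collections for speed; restated functionally with hypothetical data (strings stand in for serialized tuples):

(let [local-tasks #{1 2}
      task->node+port {3 "node2/6700", 4 "node2/6700"}
      tuple-batch [[1 "t1"] [3 "t3"] [4 "t4"]]]
  (reduce (fn [[local remote] [task _ :as pair]]
            (if (local-tasks task)
              [(conj local pair) remote]
              [local (update remote (task->node+port task) (fnil conj []) pair)]))
          [[] {}]
          tuple-batch))
;; => [[[1 "t1"]] {"node2/6700" [[3 "t3"] [4 "t4"]]}]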
The do-heartbeat function:
(defn do-heartbeat [worker]
  ;; the cluster configuration
  (let [conf (:conf worker)
        ;; build the WorkerHeartbeat object
        hb (WorkerHeartbeat.
             (current-time-secs)    ;; time of this heartbeat
             (:storm-id worker)     ;; the topology-id this worker belongs to
             (:executors worker)    ;; ids of the executors placed on this worker
             (:port worker))        ;; the port the worker occupies
        ;; a LocalState object rooted at "{storm.local.dir}/workers/{worker-id}/heartbeats",
        ;; holding the worker's local heartbeats; a LocalState gives access to a map object
        ;; serialized to disk
        state (worker-state conf (:worker-id worker))]
    (log-debug "Doing heartbeat " (pr-str hb))
    ;; do the local-file-system heartbeat.
    ;; store the heartbeat through the LocalState object; the map key is
    ;; LS-WORKER-HEARTBEAT (the string "worker-heartbeat"), the value the heartbeat itself
    (.put state
          LS-WORKER-HEARTBEAT
          hb
          false)
    ;; cleanup keeps only the 60 most recent heartbeats
    (.cleanup state 60)
    ; this is just in case supervisor is down so that disk doesn't fill up.
    ; it shouldn't take supervisor 120 seconds between listing dir and reading it
    ))
The do-executor-heartbeats function:
;; do-executor-heartbeats writes the worker's heartbeat into the workerbeats node in zookeeper,
;; via the worker-heartbeat! function
(defnk do-executor-heartbeats [worker :executors nil]
  ;; stats is how we know what executors are assigned to this worker
  ;; stats maps executor -> executor statistics. On the first heartbeat, executors is nil and
  ;; the map looks like {executor_1 nil, executor_2 nil, ...}. On later heartbeats, each
  ;; executor object's get-executor-id and render-stats functions are called to build an
  ;; executor_id -> statistics map, so stats differs between the first heartbeat and subsequent
  ;; ones. Executor statistics will be analyzed in a later article.
  (let [stats (if-not executors
                (into {} (map (fn [e] {e nil}) (:executors worker)))
                (->> executors
                     (map (fn [e] {(executor/get-executor-id e) (executor/render-stats e)}))
                     (apply merge)))
        ;; build the worker heartbeat
        zk-hb {:storm-id (:storm-id worker)
               ;; the executor statistics
               :executor-stats stats
               ;; how long the worker has been up
               :uptime ((:uptime worker))
               ;; the heartbeat time
               :time-secs (current-time-secs)}]
    ;; do the zookeeper heartbeat
    ;; worker-heartbeat! syncs zk-hb to the zookeeper node
    ;; "/workerbeats/{topology-id}/{supervisorId-port}/"
    (.worker-heartbeat! (:storm-cluster-state worker)
                        (:storm-id worker)
                        (:assignment-id worker)
                        (:port worker)
                        zk-hb)))
The mk-refresh-connections function:
;; mk-refresh-connections returns a function named this. We saw the same way of defining a
;; function in mk-synchronize-supervisor in "Storm Supervisor Startup Source Analysis
;; (supervisor.clj)": the function needs to refer to itself inside its own body.
;; refresh-connections must run repeatedly: every time assignment-info changes it has to refresh
;; once, which is driven by zookeeper's watcher mechanism.
(defn mk-refresh-connections [worker]
  ;; outbound-tasks is the set of tasks that receive messages output by this worker;
  ;; see worker-outbound-tasks below
  (let [outbound-tasks (worker-outbound-tasks worker)
        ;; the worker configuration
        conf (:conf worker)
        ;; the StormClusterState instance
        storm-cluster-state (:storm-cluster-state worker)
        ;; the id of the topology this worker belongs to
        storm-id (:storm-id worker)]
    ;; return the function named this; it runs once every time assignment-info changes,
    ;; to refresh this worker's connections
    (fn this
      ;; The zero-argument version calls the one-argument version with a "default callback" that
      ;; schedules the zero-argument version itself on the worker's refresh-connections-timer.
      ;; When assignment-info changes, zookeeper's watcher mechanism runs the callback, and the
      ;; refresh-connections-timer thread then executes this. That guarantees that every
      ;; assignment change triggers a refresh-connections pass in the background.
      ([]
        (this (fn [& ignored] (schedule (:refresh-connections-timer worker) 0 this))))
      ;; the one-argument version
      ([callback]
        ;; assignment-version reads the current version of storm-id's assignment and registers
        ;; callback with zookeeper
        (let [version (.assignment-version storm-cluster-state storm-id callback)
              ;; if the locally cached assignment version equals the version read from zookeeper,
              ;; the assignment of storm-id has not changed, so read it from the local cache
              assignment (if (= version (:version (get @(:assignment-versions worker) storm-id)))
                           (:data (get @(:assignment-versions worker) storm-id))
                           ;; otherwise re-read the versioned assignment from the zookeeper node
                           ;; "/assignments/{storm-id}" via assignment-info-with-version and
                           ;; register the callback, so the worker notices when an existing
                           ;; assignment is reassigned
                           (let [new-assignment (.assignment-info-with-version storm-cluster-state
                                                                               storm-id
                                                                               callback)]
                             ;; cache the fresh assignment locally
                             (swap! (:assignment-versions worker) assoc storm-id new-assignment)
                             (:data new-assignment)))
              ;; my-assignment maps each task that receives this worker's output to [node port]
              my-assignment (-> assignment
                                ;; executor_id -> [node port], e.g.
                                ;; {[1 1] [node1 port1], [4 4] [node1 port1], [2 2] [node2 port1],
                                ;;  [5 5] [node2 port1], [3 3] [node3 port1], [6 6] [node3 port1]}
                                :executor->node+port
                                ;; task_id -> [node port], e.g.
                                ;; {1 [node1 port1], 4 [node1 port1], 2 [node2 port1],
                                ;;  5 [node2 port1], 3 [node3 port1], 6 [node3 port1]}
                                to-task->node+port
                                ;; keep only the keys contained in outbound-tasks; with
                                ;; outbound-tasks = #{4 5 6} this leaves
                                ;; {4 [node1 port1], 5 [node2 port1], 6 [node3 port1]}
                                (select-keys outbound-tasks)
                                ;; {4 "node1/port1", 5 "node2/port1", 6 "node3/port1"}
                                (#(map-val endpoint->string %)))
              ;; we dont need a connection for the local tasks anymore
              ;; drop the tasks placed on this worker, since tasks in the same process need no
              ;; socket. If this worker sits at node1:port1, then
              ;; needed-assignment = {5 "node2/port1", 6 "node3/port1"}
              needed-assignment (->> my-assignment
                                     (filter-key (complement (-> worker :task-ids set))))
              ;; the set of connections we need, e.g. #{"node2/port1" "node3/port1"}
              needed-connections (-> needed-assignment vals set)
              ;; the tasks we need connections to, e.g. (5 6)
              needed-tasks (-> needed-assignment keys)
              ;; the connections this worker has already established
              current-connections (set (keys @(:cached-node+port->socket worker)))
              ;; needed minus current = connections to create; with current-connections = #{},
              ;; new-connections = #{"node2/port1" "node3/port1"}
              new-connections (set/difference needed-connections current-connections)
              ;; current minus needed = connections to remove
              remove-connections (set/difference current-connections needed-connections)]
          ;; merge the newly created connections into cached-node+port->socket
          (swap! (:cached-node+port->socket worker)
                 #(HashMap. (merge (into {} %1) %2))
                 ;; build the endpoint-str -> connection map, i.e. open the new connections,
                 ;; e.g. {"node2/port1" connect1, "node3/port1" connect2}
                 (into {}
                       (dofor [endpoint-str new-connections
                               :let [[node port] (string->endpoint endpoint-str)]]
                         [endpoint-str
                          (.connect
                            ^IContext (:mq-context worker)
                            storm-id
                            ((:node->host assignment) node)
                            port)])))
          ;; store my-assignment in the worker-local cache cached-task->node+port
          (write-locked (:endpoint-socket-lock worker)
            (reset! (:cached-task->node+port worker)
                    (HashMap. my-assignment)))
          ;; close the connections to remove
          (doseq [endpoint remove-connections]
            (.close (get @(:cached-node+port->socket worker) endpoint)))
          ;; drop the removed connections from the worker-local cache cached-node+port->socket.
          ;; Together, cached-task->node+port and cached-node+port->socket give us the
          ;; correspondence between tasks and sockets
          (apply swap!
                 (:cached-node+port->socket worker)
                 #(HashMap. (apply dissoc (into {} %1) %&))
                 remove-connections)
          ;; find the tasks for which no connection exists
          (let [missing-tasks (->> needed-tasks
                                   (filter (complement my-assignment)))]
            ;; if any task is missing an assignment, log a warning
            (when-not (empty? missing-tasks)
              (log-warn "Missing assignment for following tasks: "
                        (pr-str missing-tasks)))))))))
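Stripped of the caching and socket management, the connection bookkeeping above reduces to two set differences:

(require '[clojure.set :as set])

(let [needed-connections  #{"node2/6701" "node3/6702"}
      current-connections #{"node1/6700" "node2/6701"}]
  {:new    (set/difference needed-connections current-connections)    ;; connections to open
   :remove (set/difference current-connections needed-connections)})  ;; connections to close
;; => {:new #{"node3/6702"}, :remove #{"node1/6700"}}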
The worker-outbound-tasks function:
;; worker-outbound-tasks returns the task ids of the components that receive messages
;; from this worker
(defn worker-outbound-tasks
  "Returns seq of task-ids that receive messages from this worker"
  [worker]
  ;; context is a backtype.storm.task.WorkerTopologyContext object; see worker-context below
  (let [context (worker-context worker)
        ;; apply the anonymous function (fn [task-id] ...) to each task placed on this worker
        ;; and concatenate the results; components is the collection of receiving component ids
        components (mapcat
                     (fn [task-id]
                       ;; getComponentId returns the name of the component (spout/bolt)
                       ;; the task belongs to
                       (->> (.getComponentId context (int task-id))
                            ;; getTargets returns which components receive output from that component
                            (.getTargets context)
                            vals
                            ;; collect the receiving component ids
                            (map keys)
                            (apply concat)))
                     ;; the ids of the tasks placed on this worker
                     (:task-ids worker))]
    (-> worker
        ;; the task id -> component name map,
        ;; e.g. {1 "boltA", 2 "boltA", 3 "boltA", 4 "boltA", 5 "boltB", 6 "boltB"}
        :task->component
        ;; inverted, e.g. {"boltA" [1 2 3 4], "boltB" [5 6]}
        reverse-map
        ;; keep only the keys contained in components
        (select-keys components)
        vals
        flatten
        ;; the set of all task ids of the receiving components
        set)))
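reverse-map is a Storm utility that inverts the task->component map into component->tasks; a plain-Clojure equivalent of that step, with made-up data:

(let [task->component {1 "boltA", 2 "boltA", 3 "boltA", 5 "boltB"}]
  (reduce (fn [m [t c]] (update m c (fnil conj []) t))
          {}
          task->component))
;; => {"boltA" [1 2 3], "boltB" [5]}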
The worker-context function:
(defn worker-context [worker]
  ;; returns a backtype.storm.task.WorkerTopologyContext object
  (WorkerTopologyContext. (:system-topology worker)
                          (:storm-conf worker)
                          (:task->component worker)
                          (:component->sorted-tasks worker)
                          (:component->stream->fields worker)
                          (:storm-id worker)
                          (supervisor-storm-resources-path
                            (supervisor-stormdist-root (:conf worker) (:storm-id worker)))
                          (worker-pids-root (:conf worker) (:worker-id worker))
                          (:port worker)
                          (:task-ids worker)
                          (:default-shared-resources worker)
                          (:user-shared-resources worker)))
The getTargets method:
// WorkerTopologyContext extends GeneralTopologyContext; getTargets is an instance method of
// GeneralTopologyContext that returns which components receive output from componentId.
// The return value is a map of stream_id -> {receive_component_id -> Grouping}, where
// receive_component_id is the id of a receiving component.
public Map<String, Map<String, Grouping>> getTargets(String componentId) {
    // the result map
    Map<String, Map<String, Grouping>> ret = new HashMap<String, Map<String, Grouping>>();
    // iterate over all component ids of the topology
    for(String otherComponentId: getComponentIds()) {
        // fetch the component's ComponentCommon object, then its input information
        Map<GlobalStreamId, Grouping> inputs = getComponentCommon(otherComponentId).get_inputs();
        // iterate over the inputs; a GlobalStreamId has two fields: the stream id and the id of
        // the component emitting that stream
        for(GlobalStreamId id: inputs.keySet()) {
            // if an input stream's component id equals componentId, the component receives
            // output from componentId, so record it in ret
            if(id.get_componentId().equals(componentId)) {
                Map<String, Grouping> curr = ret.get(id.get_streamId());
                if(curr==null) curr = new HashMap<String, Grouping>();
                curr.put(otherComponentId, inputs.get(id));
                ret.put(id.get_streamId(), curr);
            }
        }
    }
    return ret;
}
The refresh-storm-active function:
;; refresh-storm-active refreshes the worker's cached view of whether its topology is active
(defn refresh-storm-active
  ;; the no-callback version calls the callback version with a default callback that schedules
  ;; refresh-storm-active itself on the refresh-active-timer
  ([worker]
    (refresh-storm-active worker
                          (fn [& ignored]
                            (schedule (:refresh-active-timer worker)
                                      0
                                      (partial refresh-storm-active worker)))))
  ;; the callback version
  ([worker callback]
    ;; storm-base reads the topology's StormBase data from the zookeeper node
    ;; "/storms/{storm-id}" and registers callback on that node. When the node's data changes,
    ;; callback runs, i.e. refresh-storm-active is scheduled on the refresh-active-timer, whose
    ;; thread then executes it.
    (let [base (.storm-base (:storm-cluster-state worker) (:storm-id worker) callback)]
      ;; update the worker's cached topology active flag
      (reset! (:storm-active-atom worker)
              (= :active (-> base :status :type))))))
The launch-receive-thread function:
;; start the worker's dedicated receive threads
(defn launch-receive-thread [worker]
  (log-message "Launching receive-thread for " (:assignment-id worker) ":" (:port worker))
  ;; see launch-receive-thread! below
  (msg-loader/launch-receive-thread!
    ;; the connection context; netty is the default since 0.9,
    ;; i.e. a backtype.storm.messaging.netty.Context instance
    (:mq-context worker)
    (:storm-id worker)
    ;; the number of receive threads
    (:receiver-thread-count worker)
    (:port worker)
    ;; transfer-local-fn, which delivers messages to the queues of the tasks placed on this worker
    (:transfer-local-fn worker)
    ;; the size of the worker's input queue
    (-> worker :storm-conf (get TOPOLOGY-RECEIVER-BUFFER-SIZE))
    :kill-fn (fn [t] (exit-process! 11))))
The launch-receive-thread! function:
;; launch-receive-thread!, defined in loader.clj, starts the receive threads of a given
;; worker process
(defnk launch-receive-thread!
  [context storm-id receiver-thread-count port transfer-local-fn max-buffer-size
   :daemon true
   :kill-fn (fn [t] (System/exit 1))
   :priority Thread/NORM_PRIORITY]
  ;; max-buffer-size is the maximum size of the worker's input queue
  (let [max-buffer-size (int max-buffer-size)
        ;; bind creates a server-side connection through backtype.storm.messaging.netty.Context;
        ;; socket is a backtype.storm.messaging.netty.Server instance
        socket (.bind ^IContext context storm-id port)
        ;; the number of receive threads, 1 by default
        thread-count (if receiver-thread-count receiver-thread-count 1)
        ;; mk-receive-threads creates the receive threads; vthreads holds their SmartThread
        ;; instances, through which we can start, join and interrupt the threads;
        ;; see mk-receive-threads below
        vthreads (mk-receive-threads context storm-id port transfer-local-fn daemon kill-fn
                                     priority socket max-buffer-size thread-count)]
    ;; return a function that shuts the receive threads down by sending an empty message
    ;; addressed to task_id -1
    (fn []
      ;; connect to the local port
      (let [kill-socket (.connect ^IContext context storm-id "localhost" port)]
        (log-message "Shutting down receiving-thread: [" storm-id ", " port "]")
        ;; send an empty message to task_id -1; when a receive thread sees a message addressed
        ;; to task_id -1, it shuts itself down
        (.send ^IConnection kill-socket
               -1 (byte-array []))
        ;; close the connection
        (.close ^IConnection kill-socket)
        (log-message "Waiting for receiving-thread:[" storm-id ", " port "] to die")
        ;; wait for all receive threads to finish
        (for [thread-id (range thread-count)]
          (.join (vthreads thread-id)))
        (log-message "Shutdown receiving-thread: [" storm-id ", " port "]")))))
The mk-receive-threads function:
;; mk-receive-threads calls mk-receive-thread once per thread; see mk-receive-thread below
(defn- mk-receive-threads [context storm-id port transfer-local-fn daemon kill-fn priority
                           socket max-buffer-size thread-count]
  (into [] (for [thread-id (range thread-count)]
             (mk-receive-thread context storm-id port transfer-local-fn daemon kill-fn priority
                                socket max-buffer-size thread-id))))
The mk-receive-thread function:
(defn- mk-receive-thread [context storm-id port transfer-local-fn daemon kill-fn priority
                          socket max-buffer-size thread-id]
  ;; async-loop takes a "function" or "function factory" and produces a java thread whose run
  ;; method keeps executing that function (or the function the factory produced). It returns an
  ;; instance of the SmartThread protocol, through which we can start, join and interrupt the
  ;; receive thread.
  (async-loop
    ;; this argument is a "function factory": a function that returns a function
    (fn []
      (log-message "Starting receive-thread: [stormId: " storm-id ", port: " port
                   ", thread-id: " thread-id " ]")
      ;; the thread's run method keeps executing the function below in a loop
      (fn []
        ;; batched is an ArrayList
        (let [batched (ArrayList.)
              ;; the recv method of backtype.storm.messaging.netty.Server returns an
              ;; Iterator<TaskMessage> over an ArrayList<TaskMessage>; the message-processing
              ;; flow will be analyzed in a later article
              ^Iterator iter (.recv ^IConnection socket 0 thread-id)
              closed (atom false)]
          ;; when iter is non-nil, iterate over it
          (when iter
            (while (and (not @closed) (.hasNext iter))
              ;; packet is a TaskMessage with two fields: task, the id of the task that should
              ;; process the message, and message, the message's byte array
              (let [packet (.next iter)
                    ;; the id of the task receiving the message
                    task (if packet (.task ^TaskMessage packet))
                    ;; the message's byte array
                    message (if packet (.message ^TaskMessage packet))]
                ;; if task = -1, shut the receive thread down
                (if (= task -1)
                  (do (log-message "Receiving-thread:[" storm-id ", " port
                                   "] received shutdown notice")
                      (.close socket)
                      (reset! closed true))
                  ;; otherwise append the pair [task message] to batched
                  (when packet (.add batched [task message]))))))
          ;; if the closed flag is still false, hand the received batch to transfer-local-fn,
          ;; which delivers each message to the receive queue of its task
          (when (not @closed)
            (do
              (if (> (.size batched) 0)
                (transfer-local-fn batched))
              ;; returning 0 means the function does not sleep between iterations
              0)))))
    ;; the argument is a "function factory"
    :factory? true
    ;; daemon is true, so the receive thread is a daemon thread
    :daemon daemon
    ;; the kill function
    :kill-fn kill-fn
    ;; the java thread's priority
    :priority priority
    ;; the receive thread's name: "worker-receiver-thread-" + thread-id
    :thread-name (str "worker-receiver-thread-" thread-id)))
This concludes the source analysis of how the supervisor starts workers. The startup process touches executor-related code, which is not analyzed in detail here and will be covered in a later article; the message-queue-related parts will likewise be analyzed in detail later.