Harbor里面有很多job,譬如镜像同步,这些job有很多状态,应为job这执行过程中会有很多任务,在每种任务下面会有不同的任务状态,状态之前可以切换并触发动作,这个就是状态机,如果对Hadoop中的YARN调度熟悉可能知道,YARN里面就是通过状态机完成任务的管理和状态切换。下面介绍一下状态机
type SM struct {
JobID int64
CurrentState string
PreviousState string
//The states that don't have to exist in transition map, such as "Error", "Canceled"
ForcedStates map[string]struct{}
Transitions map[string]map[string]struct{}
Handlers map[string]StateHandler
desiredState string
Logger *log.Logger
Parms *RepJobParm
lock *sync.Mutex
}
那么这个状态机状态怎么转换呢?要看看Transitions他是一个二层的map。
func (sm *SM) AddTransition(from string, to string, h StateHandler) {
_, ok := sm.Transitions[from]
if !ok {
sm.Transitions[from] = make(map[string]struct{})
}
sm.Transitions[from][to] = struct{}{}
sm.Handlers[to] = h
}
AddTransition添加过渡,从from状态到to状态切换,执行StateHandler的动作。
func (sm *SM) RemoveTransition(from string, to string) {
_, ok := sm.Transitions[from]
if !ok {
return
}
delete(sm.Transitions[from], to)
}
删除简单就不介绍了。
当状态机启动的时候
func (sm *SM) Start(s string) {
n, err := sm.EnterState(s)
log.Debugf("Job id: %d, next state from handler: %s", sm.JobID, n)
for len(n) > 0 && err == nil {
if d := sm.getDesiredState(); len(d) > 0 {
log.Debugf("Job id: %d. Desired state: %s, will ignore the next state from handler", sm.JobID, d)
n = d
sm.setDesiredState("")
continue
}
if n == models.JobContinue && len(sm.Transitions[sm.CurrentState]) == 1 {
for n = range sm.Transitions[sm.CurrentState] {
break
}
log.Debugf("Job id: %d, Continue to state: %s", sm.JobID, n)
continue
}
if n == models.JobContinue && len(sm.Transitions[sm.CurrentState]) != 1 {
log.Errorf("Job id: %d, next state is continue but there are %d possible next states in transition table", sm.JobID, len(sm.Transitions[sm.CurrentState]))
err = fmt.Errorf("Unable to continue")
break
}
n, err = sm.EnterState(n)
log.Debugf("Job id: %d, next state from handler: %s", sm.JobID, n)
}
if err != nil {
log.Warningf("Job id: %d, the statemachin will enter error state due to error: %v", sm.JobID, err)
sm.EnterState(models.JobError)
}
}
Start传入的目标状态,上一篇启动镜像复制的时候就是传入JobRunning状态。Start先调用EnterState进入状态,
func (sm *SM) EnterState(s string) (string, error) {
log.Debugf("Job id: %d, transiting from State: %s, to State: %s", sm.JobID, sm.CurrentState, s)
targets, ok := sm.Transitions[sm.CurrentState]
_, exist := targets[s]
_, isForced := sm.ForcedStates[s]
if !exist && !isForced {
return "", fmt.Errorf("job id: %d, transition from %s to %s does not exist", sm.JobID, sm.CurrentState, s)
}
exitHandler, ok := sm.Handlers[sm.CurrentState]
if ok {
if err := exitHandler.Exit(); err != nil {
return "", err
}
} else {
log.Debugf("Job id: %d, no handler found for state:%s, skip", sm.JobID, sm.CurrentState)
}
enterHandler, ok := sm.Handlers[s]
var next = models.JobContinue
var err error
if ok {
if next, err = enterHandler.Enter(); err != nil {
return "", err
}
} else {
log.Debugf("Job id: %d, no handler found for state:%s, skip", sm.JobID, s)
}
sm.PreviousState = sm.CurrentState
sm.CurrentState = s
log.Debugf("Job id: %d, transition succeeded, current state: %s", sm.JobID, s)
return next, nil
}
这里先判断这个transition是否存在,如果不存在报错,然后判断Handler是否存在,如果存在就调用enterHandler.Enter(),调用完成返回下一个状态next,最后把当前状态修改成传入的目标状态。
接着说Start方法,返回next后,任务会接着从next往下走,再次调用EnterState往下,直到for循环结束或者是JobContinue状态将退出。
停止就简单了
func (sm *SM) Stop(id int64) {
log.Debugf("Trying to stop the job: %d", id)
sm.lock.Lock()
defer sm.lock.Unlock()
if id == sm.JobID {
sm.desiredState = models.JobStopped
log.Debugf("Desired state of job %d is set to stopped", id)
} else {
log.Debugf("State machine has switched to job %d, so the action to stop job %d will be ignored", sm.JobID, id)
}
}
直接将目标状态修改成JobStopped。这样就能结束任务,并把work放回到workerpool。
下面我就针对镜像同步为例分析一下
sm.AddTransition(models.JobPending, models.JobRunning, StatusUpdater{sm.JobID, models.JobRunning})
sm.AddTransition(models.JobRetrying, models.JobRunning, StatusUpdater{sm.JobID, models.JobRunning})
sm.AddTransition(models.JobRunning, replication.StateInitialize, &replication.Initializer{BaseHandler: base})
sm.AddTransition(replication.StateInitialize, replication.StateCheck, &replication.Checker{BaseHandler: base})
sm.AddTransition(replication.StateCheck, replication.StatePullManifest, &replication.ManifestPuller{BaseHandler: base})
sm.AddTransition(replication.StatePullManifest, replication.StateTransferBlob, &replication.BlobTransfer{BaseHandler: base})
sm.AddTransition(replication.StatePullManifest, models.JobFinished, &StatusUpdater{sm.JobID, models.JobFinished})
sm.AddTransition(replication.StateTransferBlob, replication.StatePushManifest, &replication.ManifestPusher{BaseHandler: base})
sm.AddTransition(replication.StatePushManifest, replication.StatePullManifest, &replication.ManifestPuller{BaseHandler: base})
上面截取过渡机,当把任务启动后,由于初始状态为JobPending将进入JobRunning
func (su StatusUpdater) Enter() (string, error) {
err := dao.UpdateRepJobStatus(su.JobID, su.State)
if err != nil {
log.Warningf("Failed to update state of job: %d, state: %s, error: %v", su.JobID, su.State, err)
}
var next = models.JobContinue
if su.State == models.JobStopped || su.State == models.JobError || su.State == models.JobFinished {
next = ""
}
return next, err
}