// main is the codis-config entry point: it parses the arguments against the
// usage text, loads the configuration, initializes the global environment and
// runs the requested sub-command.
// NOTE(review): the closing braces and the bodies of the error branches are
// not part of this excerpt.
func main() {
// Parse the arguments according to the usage text.
args, err := docopt.Parse(usage, nil, true, "codis config v0.1", true)
if err != nil {
// set config file
var configFile string
var config *cfg.Cfg
// An explicit -c option selects the config file; otherwise InitConfig is used.
if args["-c"] != nil {
configFile = args["-c"].(string)
config, err = utils.InitConfigFromFile(configFile)
if err != nil {
} else {
config, err = utils.InitConfig()
if err != nil {
// load global vars
globalEnv = env.LoadCodisEnv(config)
cmd := args["<command>"].(string)
cmdArgs := args["<args>"].([]string)
// Serve the default HTTP mux on :10086 in a background goroutine; the error
// returned by ListenAndServe is discarded.
go http.ListenAndServe(":10086", nil)
// Hand the sub-command and its arguments over to the dispatcher.
err = runCommand(cmd, cmdArgs)
func runCommand(cmd string, args []string) (err error) {
argv := make([]string, 1)
argv[0] = cmd
argv = append(argv, args...)
switch cmd {
case "action":
return errors.Trace(cmdAction(argv))
case "dashboard":
return errors.Trace(cmdDashboard(argv))
case "server":
return errors.Trace(cmdServer(argv))
case "proxy":
return errors.Trace(cmdProxy(argv))
case "slot":
return errors.Trace(cmdSlot(argv))
return errors.Errorf("%s is not a valid command. See 'codis-config -h'", cmd)
# Start the "dashboard" sub-command in the background under nohup, using
# config.ini, listening on :18087, with an HTTP request log; stdout and stderr
# of the process are discarded. (-L presumably selects the log file.)
nohup ../bin/codis-config -c config.ini -L ./log/dashboard.log dashboard --addr=:18087 --http-log=./log/requests.log &>/dev/null &
// runDashboard sets up the Martini web application of the dashboard: static
// assets, template rendering options, the REST routes, the dashboard node in
// ZooKeeper and the global migrate manager.
// NOTE(review): parts of the function (the renderer setup call, error
// branches, the actual server start and closing braces) are not part of this
// excerpt; httpLogFile is not used in the visible lines.
func runDashboard(addr string, httpLogFile string) {
log.Info("dashboard listening on addr: ", addr)
m := martini.Classic()
// Serve static files from <binRoot>/assets/statics.
m.Use(martini.Static(filepath.Join(binRoot, "assets/statics")))
// Renderer options: template directory, accepted extensions, charset and
// indented JSON output.
Directory: filepath.Join(binRoot, "assets/template"),
Extensions: []string{".tmpl", ".html"},
Charset: "UTF-8",
IndentJSON: true,
// REST routes (GET) and their handlers.
m.Get("/api/server_groups", apiGetServerGroupList)
m.Get("/api/overview", apiOverview)
m.Get("/api/redis/:addr/stat", apiRedisStat)
m.Get("/api/redis/:addr/:id/slotinfo", apiGetRedisSlotInfo)
m.Get("/api/redis/group/:group_id/:slot_id/slotinfo", apiGetRedisSlotInfoFromGroupId)
// create temp node in ZK
if err := createDashboardNode(); err != nil {
// Remove the dashboard node again when this function returns.
defer releaseDashboardNode()
// create long live migrate manager
conn := CreateZkConn()
defer conn.Close()
// The manager is stored globally and gets preMigrateCheck as its check hook.
globalMigrateManager = NewMigrateManager(conn, globalEnv.ProductName(), preMigrateCheck)
defer globalMigrateManager.removeNode()
值得注意的是,Dashboard使用的是Martini框架,非常容易就能暴露各种RESTful接口。这里不深入研究Martini框架的用法了,但提一个Java中的“山寨Martini”框架——SparkJava。Spark这个名字实在太火了,但这个框架跟分布式内存计算的那个Spark框架可没有一点关系。稍有些遗憾的是,使用SparkJava的前提是必须安装JDK 8,因为SparkJava大量使用了JDK 8中的新特性(例如下面示例中的Lambda表达式):
import static spark.Spark.*;
// Visit http://localhost:4567/hello
public class HelloWorld {
public static void main(String[] args) {
get("/hello", (req, res) -> "Hello World");
# Step 1: (re)initialize all slots; -f forces the initialization.
echo "slots initializing..."
../bin/codis-config -c config.ini slot init -f
echo "done"
# Step 2: assign slot ranges to server groups and bring them online:
# slots 0-511 go to group 1, slots 512-1023 go to group 2.
echo "set slot ranges to server groups..."
../bin/codis-config -c config.ini slot range-set 0 511 1 online
../bin/codis-config -c config.ini slot range-set 512 1023 2 online
echo "done"
声明:因为codis-config除了启动Dashboard外,其主要作用就是封装RESTful请求,代码比较简单,所以后面的源码剖析都跳过codis-config发送请求的过程,直接从Dashboard接到请求后的处理过程讲起,所有RESTful API的实现都在dashboard_apis.go中。
dashboard_apis.go中并没有直接实现初始化功能,而是调用了models包。实际上,Codis提取了一整套的Model类作为模型层,Dashboard和Proxy都引用了这套模型层。在Codis 2.0中Proxy的内部架构发生了不小的变化,然而模型层的代码相对很稳定,这也是DDD(领域驱动设计)的优势吧。
func apiInitSlots(r *http.Request) (int, string) {
isForce := false
val := r.FormValue("is_force")
if len(val) > 0 && (val == "1" || val == "true") {
isForce = true
conn := CreateZkConn()
defer conn.Close()
if err := models.InitSlotSet(conn, globalEnv.ProductName(), models.DEFAULT_SLOT_NUM); err != nil {
return 500, err.Error()
return jsonRetSucc()
func InitSlotSet(zkConn zkhelper.Conn, productName string, totalSlotNum int) error {
for i := 0; i < totalSlotNum; i++ {
slot := NewSlot(productName, i)
if err := slot.Update(zkConn); err != nil {
return errors.Trace(err)
return nil
// NewSlot returns a new Slot for the given product and slot id, with its
// LastOpTs initialised to "0".
// NOTE(review): the struct literal is cut off in this excerpt; the remaining
// initial field values (status, migrate status, group id) are not visible here.
func NewSlot(productName string, id int) *Slot {
return &Slot{
ProductName: productName,
Id: id,
State: SlotState{
LastOpTs: "0",
MigrateStatus: SlotMigrateStatus{
func (s *Slot) Update(zkConn zkhelper.Conn) error {
data, err := json.Marshal(s)
zkPath := GetSlotPath(s.ProductName, s.Id)
_, err = zkhelper.CreateOrUpdate(zkConn, zkPath, string(data), 0, zkhelper.DefaultFileACLs(), true)
同样地,设置Range也是在Slot模型层中实现的。在./initslot.sh中,我们将0~511 Slot分配给了Group 1,将512~1023分配给Group 2。SetSlotRange()将范围内的Slot的GroupId从-1改为新分配的GroupId,并更新到ZooKeeper中:
func SetSlotRange(zkConn zkhelper.Conn, productName string, fromSlot, toSlot, groupId int, status SlotStatus) error {
for i := fromSlot; i <= toSlot; i++ {
s, err := GetSlot(zkConn, productName, i)
if err != nil {
return errors.Trace(err)
s.GroupId = groupId
s.State.Status = status
data, err := json.Marshal(s)
if err != nil {
return errors.Trace(err)
zkPath := GetSlotPath(productName, i)
_, err = zkhelper.CreateOrUpdate(zkConn, zkPath, string(data), 0, zkhelper.DefaultFileACLs(), true)
if err != nil {
return errors.Trace(err)
Codis目前使用的算法比较简单,不要被Rebalance()中的三层嵌套循环“吓住”了。具体来说,假如之前我们有Group 1负责Slot 0~511,Group 2负责Slot 512~1023。现在新增一个Group 3,没有负责任何Slot,此时通过命令行或Web管理页面触发“Auto Rebalance”。
因为Group 1和2当前的Slot数大于目标Quota,所以遍历到它俩时,会不断把自己多出来的Slot迁移给尚未达到目标Quota的Group(这里就是Group 3),直到自身的Slot数降到目标Quota为止。而遍历到Group 3时,由于它的Slot数没有超过目标Quota,什么都不会做。
// experimental simple auto rebalance :)
//
// Rebalance moves slots from groups that own more slots than their target
// quota to groups that own fewer, one slot per migrate task. delay is copied
// into each task's Delay field.
// NOTE(review): error handling and closing braces are not part of this
// excerpt, so the exact failure behaviour cannot be read from here.
func Rebalance(zkConn zkhelper.Conn, delay int) error {
// Target number of slots per group id, and the current slot ownership.
targetQuota, err := getQuotaMap(zkConn)
livingNodes, err := getLivingNodeInfos(zkConn)
log.Info("start rebalance")
for _, node := range livingNodes {
// Keep giving slots away while this group is above its quota.
// NOTE(review): if no destination below quota exists, nothing in the visible
// lines shrinks node.CurSlots, so this loop looks like it would not
// terminate - confirm against the full source.
for len(node.CurSlots) > targetQuota[node.GroupId] {
for _, dest := range livingNodes {
// A destination is any other group that is still below its quota.
if dest.GroupId != node.GroupId && len(dest.CurSlots) < targetQuota[dest.GroupId] {
// Always hand over the last slot of the source group.
slot := node.CurSlots[len(node.CurSlots)-1]
// create a migration task
t := NewMigrateTask(MigrateTaskInfo{
Delay: delay,
FromSlot: slot,
ToSlot: slot,
NewGroupId: dest.GroupId,
CreateAt: strconv.FormatInt(time.Now().Unix(), 10),
// Each task gets a random (v4) UUID as its id.
u, err := uuid.NewV4()
t.Id = u.String()
if ok, err := preMigrateCheck(t); ok {
// do migrate
err := t.run()
// Mirror the migration in the in-memory bookkeeping.
node.CurSlots = node.CurSlots[0 : len(node.CurSlots)-1]
dest.CurSlots = append(dest.CurSlots, slot)
log.Info("rebalance finish")
return nil
func getQuotaMap(zkConn zkhelper.Conn) (map[int]int, error) {
nodes, err := getLivingNodeInfos(zkConn)
ret := make(map[int]int)
var totalMem int64
totalQuota := 0
for _, node := range nodes {
totalMem += node.MaxMemory
for _, node := range nodes {
quota := int(models.DEFAULT_SLOT_NUM * node.MaxMemory * 1.0 / totalMem)
ret[node.GroupId] = quota
totalQuota += quota
// round up
if totalQuota < models.DEFAULT_SLOT_NUM {
for k, _ := range ret {
ret[k] += models.DEFAULT_SLOT_NUM - totalQuota
return ret, nil
func getLivingNodeInfos(zkConn zkhelper.Conn) ([]*NodeInfo, error) {
groups, err := models.ServerGroups(zkConn, globalEnv.ProductName())
slots, err := models.Slots(zkConn, globalEnv.ProductName())
slotMap := make(map[int][]int)
for _, slot := range slots {
if slot.State.Status == models.SLOT_STATUS_ONLINE {
slotMap[slot.GroupId] = append(slotMap[slot.GroupId], slot.Id)
var ret []*NodeInfo
for _, g := range groups {
master, err := g.Master(zkConn)
out, err := utils.GetRedisConfig(master.Addr, "maxmemory")
maxMem, err := strconv.ParseInt(out, 10, 64)
node := &NodeInfo{
GroupId: g.Id,
CurSlots: slotMap[g.Id],
MaxMemory: maxMem,
ret = append(ret, node)
return ret, nil
// migrate multi slots
func (t *MigrateTask) run() error {
// create zk conn on demand
t.zkConn = CreateZkConn()
defer t.zkConn.Close()
to := t.NewGroupId
for slotId := t.FromSlot; slotId <= t.ToSlot; slotId++ {
err := t.migrateSingleSlot(slotId, to)
t.Percent = (slotId - t.FromSlot + 1) * 100 / (t.ToSlot - t.FromSlot + 1)
log.Info("total percent:", t.Percent)
log.Info("migration finished")
return nil
// migrateSingleSlot migrates one slot to group `to`: it marks the slot as
// migrating in ZooKeeper, lets the slot migrator move the data, and finally
// writes the slot back as online with its migrate status reset.
// NOTE(review): error checks, the progress callback body and closing braces
// are not part of this excerpt.
func (t *MigrateTask) migrateSingleSlot(slotId int, to int) error {
// set slot status
s, err := models.GetSlot(t.zkConn, t.productName, slotId)
// By default the data comes from the group that currently owns the slot.
from := s.GroupId
// If the slot is already marked as migrating, keep the source group that was
// recorded in its migrate status.
if s.State.Status == models.SLOT_STATUS_MIGRATE {
from = s.State.MigrateStatus.From
// modify slot status
if err := s.SetMigrateStatus(t.zkConn, from, to); err != nil {
return err
// Move the data; the callback receives progress reports from the migrator.
err = t.slotMigrator.Migrate(s, from, to, t, func(p SlotMigrateProgress) {
// on migrate slot progress
if p.Remain%500 == 0 {
// migrate done, change slot status back
s.State.Status = models.SLOT_STATUS_ONLINE
s.State.MigrateStatus.From = models.INVALID_ID
s.State.MigrateStatus.To = models.INVALID_ID
if err := s.Update(t.zkConn); err != nil {
return err
return nil
func (s *Slot) SetMigrateStatus(zkConn zkhelper.Conn, fromGroup, toGroup int) error {
// wait until all proxy confirmed
err := NewAction(zkConn, s.ProductName, ACTION_TYPE_SLOT_PREMIGRATE, s, "", true)
s.State.MigrateStatus.From = fromGroup
s.State.MigrateStatus.To = toGroup
s.GroupId = toGroup
return s.Update(zkConn)
// NewAction publishes an action of the given type for productName in
// ZooKeeper and, when needConfirm is true, waits until the online proxies
// have confirmed it.
//
// target and desc are stored in the action as-is; the current Unix time (in
// seconds, as a decimal string) is stored as its timestamp.
// NOTE(review): error checks and closing braces are not part of this
// excerpt; in the visible lines most returned errors are overwritten without
// being inspected.
func NewAction(zkConn zkhelper.Conn, productName string, actionType ActionType, target interface{}, desc string, needConfirm bool) error {
ts := strconv.FormatInt(time.Now().Unix(), 10)
action := &Action{
Type: actionType,
Desc: desc,
Target: target,
Ts: ts,
// set action receivers
// Only proxies whose state is PROXY_STATE_ONLINE are receivers.
proxies, err := ProxyList(zkConn, productName, func(p *ProxyInfo) bool {
return p.State == PROXY_STATE_ONLINE
// Each receiver is recorded as the JSON encoding of its proxy info.
for _, p := range proxies {
buf, err := json.Marshal(p)
if err != nil {
return errors.Trace(err)
action.Receivers = append(action.Receivers, string(buf))
// The marshalling error of the action itself is discarded here.
b, _ := json.Marshal(action)
prefix := GetWatchActionPath(productName)
//action root path
err = CreateActionRootPath(zkConn, prefix)
//response path
// The response root is a sibling of the action root named "ActionResponse".
respPath := path.Join(path.Dir(prefix), "ActionResponse")
err = CreateActionRootPath(zkConn, respPath)
//create response node, etcd do not support create in order directory
//get path first
// A sequence node is created only to obtain a unique, ordered name.
actionRespPath, err := zkConn.Create(respPath+"/", b, int32(zk.FlagSequence), zkhelper.DefaultFileACLs())
//remove file then create directory
// The result of Delete is not checked.
zkConn.Delete(actionRespPath, -1)
actionRespPath, err = zkConn.Create(actionRespPath, b, 0, zkhelper.DefaultDirACLs())
// The action node reuses the sequence name of its response node.
suffix := path.Base(actionRespPath)
// create action node
actionPath := path.Join(prefix, suffix)
_, err = zkConn.Create(actionPath, b, 0, zkhelper.DefaultFileACLs())
if needConfirm {
// Block until the receivers confirmed under the response node.
if err := WaitForReceiver(zkConn, productName, actionRespPath, proxies); err != nil {
return errors.Trace(err)
return nil
func WaitForReceiver(zkConn zkhelper.Conn, productName string, actionZkPath string, proxies []ProxyInfo) error {
times := 0
var proxyIds []string
var offlineProxyIds []string
for _, p := range proxies {
proxyIds = append(proxyIds, p.Id)
// check every 500ms
for times < 60 {
if times >= 6 && (times*500)%1000 == 0 {
log.Warning("abnormal waiting time for receivers", actionZkPath)
nodes, _, err := zkConn.Children(actionZkPath)
var confirmIds []string
for _, node := range nodes {
id := path.Base(node)
confirmIds = append(confirmIds, id)
if len(confirmIds) != 0 {
if utils.Strings(proxyIds).Eq(confirmIds) {
return nil
offlineProxyIds = proxyIds[len(confirmIds)-1:]
times += 1
time.Sleep(500 * time.Millisecond)
// set offline proxies
for _, id := range offlineProxyIds {
log.Errorf("mark proxy %s to PROXY_STATE_MARK_OFFLINE", id)
if err := SetProxyStatus(zkConn, productName, id, PROXY_STATE_MARK_OFFLINE); err != nil {
return err
return ErrReceiverTimeout
// Migrate moves the data of slot from the master of fromGroup to the master of
// toGroup by repeatedly sending the migrate command to the source master until
// it reports that nothing remains. A progress value is built from the slot id,
// both group ids and the remaining count after each round.
// NOTE(review): error checks, the call that delivers the progress value to
// onProgress and closing braces are not part of this excerpt.
func (m *CodisSlotMigrator) Migrate(slot *models.Slot, fromGroup, toGroup int, task *MigrateTask, onProgress func(SlotMigrateProgress)) (err error) {
// Resolve both groups and their masters through the task's ZooKeeper connection.
groupFrom, err := models.GetGroup(task.zkConn, task.productName, fromGroup)
groupTo, err := models.GetGroup(task.zkConn, task.productName, toGroup)
fromMaster, err := groupFrom.Master(task.zkConn)
toMaster, err := groupTo.Master(task.zkConn)
// The migrate command is sent to the SOURCE master, naming the target address.
c, err := redis.Dial("tcp", fromMaster.Addr)
// NOTE(review): in the visible lines the deferred Close is registered before
// the Dial error is inspected - confirm c cannot be nil here.
defer c.Close()
_, remain, err := sendRedisMigrateCmd(c, slot.Id, toMaster.Addr)
// Keep migrating while the source still reports remaining data.
for remain > 0 {
// Optional throttling between rounds; task.Delay is in milliseconds.
if task.Delay > 0 {
time.Sleep(time.Duration(task.Delay) * time.Millisecond)
_, remain, err = sendRedisMigrateCmd(c, slot.Id, toMaster.Addr)
// sendRedisMigrateCmd reports -1 on failure, so only valid counts are reported.
if remain >= 0 {
SlotId: slot.Id,
FromGroup: fromGroup,
ToGroup: toGroup,
Remain: remain,
return nil
func sendRedisMigrateCmd(c redis.Conn, slotId int, toAddr string) (int, int, error) {
addrParts := strings.Split(toAddr, ":")
if len(addrParts) != 2 {
return -1, -1, ErrInvalidAddr
reply, err := redis.Values(c.Do("SLOTSMGRTTAGSLOT", addrParts[0], addrParts[1], MIGRATE_TIMEOUT, slotId))
if err != nil {
return -1, -1, err
var succ, remain int
if _, err := redis.Scan(reply, &succ, &remain); err != nil {
return -1, -1, err
return succ, remain, nil
对于迁移中的Slot,如果恰好此时有客户端要访问该Slot中的某个Key该怎么办?Codis不是遇到这个问题的第一个中间件,像Taobao Tair中也有对此的解决方案:
发生迁移的时候data server如何对外提供服务?
当迁移发生的时候,我们举个例子,假设data server A要把桶3、4、5迁移给data server B。因为迁移完成前,客户端的路由表没有变化,客户端对桶3、4、5的访问请求都会路由到A。现在假设桶3还没迁移,桶4正在迁移中,桶5已经迁移完成。那么如果是对桶3的访问,则没什么特别,跟以前一样;如果是对桶5的访问,则A会把该请求转发给B,并且将B的返回结果返回给客户端;如果是对桶4的访问,则在A处理,同时如果是对桶4的修改操作,会记录修改log。当桶4迁移完成的时候,还要把log发送到B,在B上应用这些log。最终A、B上对于桶4来说数据完全一致,才是真正的迁移完成。
func (s *Server) dispatch(r *PipelineRequest) {
s.handleMigrateState(r.slotIdx, r.keys[0])
tr, ok := s.pipeConns[s.slots[r.slotIdx].dst.Master()]
if !ok {
//try recreate taskrunner
if err := s.createTaskRunner(s.slots[r.slotIdx]); err != nil {
r.backQ <- &PipelineResponse{ctx: r, resp: nil, err: err}
tr = s.pipeConns[s.slots[r.slotIdx].dst.Master()]
tr.in <- r
// handleMigrateState makes sure a key of a slot that is currently being
// migrated is moved before the request is served: for a slot in migrate
// status it sends a single-key migrate command to the master the slot is
// migrating from, naming the slot's destination master as the target. Slots
// in any other status are left alone.
// NOTE(review): error checks, the handling of the reply read through
// redisReader and closing braces are not part of this excerpt.
func (s *Server) handleMigrateState(slotIndex int, key []byte) error {
shd := s.slots[slotIndex]
// Nothing to do unless the slot is being migrated.
if shd.slotInfo.State.Status != models.SLOT_STATUS_MIGRATE {
return nil
// Borrow a pooled connection to the source master; it is released on return.
redisConn, err := s.pools.GetConn(shd.migrateFrom.Master())
defer s.pools.ReleaseConn(redisConn)
redisReader := redisConn.(*redispool.PooledConn).BufioReader()
// Ask the source to migrate this key to the destination master, 30s timeout.
err = WriteMigrateKeyCmd(redisConn.(*redispool.PooledConn), shd.dst.Master(), 30*1000, key)
return nil
func WriteMigrateKeyCmd(w io.Writer, addr string, timeoutMs int, key []byte) error {
hostPort := strings.Split(addr, ":")
if len(hostPort) != 2 {
return errors.Errorf("invalid address " + addr)
respW := respcoding.NewRESPWriter(w)
err := respW.WriteCommand("slotsmgrttagone", hostPort[0], hostPort[1],
strconv.Itoa(int(timeoutMs)), string(key))
return errors.Trace(err)
# ActiveRecord model bound explicitly to the "tb_user" table.
# NOTE(review): the closing `end` and any save calls are not part of this excerpt.
class User < ActiveRecord::Base
self.table_name = "tb_user"
# Build a new record and fill in its attributes.
user = User.new
user.name = "David"
user.occupation = "Code Artist"
# Look a record up by attribute and change it.
user = User.find_by(name: 'David')
user.name = 'Dave'
# Query by several attributes, newest first.
users = User.where(name: 'David', occupation: 'Code Artist').order('created_at DESC')
/**
 * Base class for models stored through a shared Curator client, in the spirit
 * of an active-record API: a model can save, update and load itself.
 * NOTE(review): the helpers used below (path(), marshal(), unmarshal(),
 * ephemeral()), the initialisation of the static fields and the closing braces
 * are not part of this excerpt.
 */
public abstract class Model {
// Client and mapper shared by all models.
private static CuratorFramework client;
private static ObjectMapper mapper;
/**
 * Creates the node for this model at path() with the marshalled model as
 * data; the node is EPHEMERAL when ephemeral() is true, PERSISTENT otherwise.
 * @return the value returned by the create call
 * @throws IllegalStateException wrapping any failure
 */
public String save() {
try {
return client.create()
.withMode(ephemeral() ? EPHEMERAL : PERSISTENT)
.forPath(path(), marshal());
} catch (Exception e) {
throw new IllegalStateException(e);
/**
 * Overwrites the data stored at path() with the marshalled model.
 * @throws IllegalStateException wrapping any failure
 */
public void update() {
try {
client.setData().forPath(path(), marshal());
} catch (Exception e) {
throw new IllegalStateException(e);
/**
 * Loads the data stored at path() and unmarshals it into an instance of this
 * object's runtime class. The cast to T is unchecked.
 * FIXME: Supposed to be static, yet cannot get class info in static context in Java
 * @throws IllegalStateException wrapping any failure
 */
public <T> T find() {
try {
return (T) unmarshal(client.getData().forPath(path()), getClass());
} catch (Exception e) {
throw new IllegalStateException(e);
/**
 * Slot model; inherits the persistence operations of {@link Model}.
 * NOTE(review): constructors and accessors are not part of this excerpt.
 */
public class Slot extends Model {
// Slot number.
private int id;
// State of the slot.
private SlotState state;
// Presumably the id of the group that owns this slot - confirm against usage.
private int groupId;
/** * Initialize all slots. * @param totalNum total number of slots */
public static void initSlotSet(int totalNum) {
try {
for (int i = 0; i < totalNum; i++) {
new Slot(i).save();
} catch (Exception e) {
throw new IllegalStateException("Error when init slots", e);
public static Slot find(int id) {
Slot slot = new Slot();
return slot.find();