influxdb集群中,client在node1上执行remove dataNode node3的命令:
influxd_ctl remove-data node3:8088
整体流程如下:
- node1收到CLI命令,向自己的8084发送GET /remove-data请求,request body: {"node":"node3:8088"};
- node1通过metaClient向集群发送DeleteDataNode();
node3收到/remove-data请求后:
- 先通知集群删除数据节点:metaClient.DeleteDataNode();
- 再删除该节点上的shards数据;
CLI命令处理
命令行解析的代码入口:
// cmd/influxd_ctl/cli/cli.go
func (c *CommandLine) Run() error {
switch cmd {
case "remove-data":
return do_remove_data(c)
}
}
向本机的集群管理端口8084,发送GET /remove-data:
func do_remove_data(c *CommandLine) error {
var node string
force := "false"
....
node = fs.Args()[len(fs.Args())-1]
if o.Force {
force = "true"
}
// 向自己的8084发送HTTP remove-data
url := c.getURL("remove-data", map[string]string{"node": node, "force": force})
resp, err := http.Get(url)
if err != nil {
return err
}
defer resp.Body.Close()
body, err := ioutil.ReadAll(resp.Body)
...
return nil
}
node1上/remove-data的处理:
// services/admin_cluster/handler.go
func (h *handler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
switch r.Method {
case "GET":
switch r.URL.Path {
case "/remove-data":
h.WrapHandler("remove-data", h.removeDataNode).ServeHTTP(w, r)
}
}
}
handler对/remove-data的处理:
- 向node3发送TCP消息:RemoveDataNode;
- 删除集群中该节点信息:metaClient.DeleteDataNode();
func (h *handler) removeDataNode(w http.ResponseWriter, r *http.Request) {
ps := r.URL.Query()
if len(ps["node"]) != 1 {
http.Error(w, "", http.StatusBadRequest)
}
var force bool
if len(ps["force"]) == 1 && ps["force"][0] == "true" {
force = true
}
node := ps["node"][0]
req := Request{RemoveDataNode, "", 0}
err := h.call(node, req)
if err != nil && force {
if nodeInfo, merr := h.MetaClient.DataNodeByTCPHost(node); merr != nil {
h.sendResponse(merr, w)
} else if merr = h.MetaClient.DeleteDataNode(nodeInfo.ID); merr != nil {
h.sendResponse(merr, w)
}
h.sendResponse(nil, w)
} else {
h.sendResponse(err, w)
}
}
metaClient.DeleteDataNode()会向Leader发送Command,然后Leader更新状态机fsm中的dataNodes信息。
node3处理TCP RemoveDataNode消息
TCPHandler:
// services/admin_cluster/tcphandler.go
func (h *TCPHandler) handleConn(conn net.Conn) {
r := &Request{}
err := json.NewDecoder(conn).Decode(r)
switch r.Type {
case RemoveDataNode:
h.handleRemoveDataNode(r, conn)
}
}
handleRemoveDataNode()的具体内容:
- 删除集群中该节点的信息:metaClient.DeleteDataNode();
- 删除该节点中shards信息:TSDB.DeleteDatabase();
func (h *TCPHandler) handleRemoveDataNode(r *Request, conn net.Conn) {
msg := ""
func() {
if h.TSDBStore == nil {
msg = fmt.Sprintf("Datanode is not enabled on %s", h.Server.TCPAddr())
return
}
if nodeInfo, err := h.MetaClient.DataNodeByTCPHost(h.Server.TCPAddr()); err != nil {
msg = fmt.Sprintf("Datanode %s is not in cluster", h.Server.TCPAddr())
} else {
// update meta first
if err = h.MetaClient.DeleteDataNode(nodeInfo.ID); err != nil {
msg = err.Error()
}
// delete data
dbs := h.TSDBStore.Databases()
for _, db := range dbs {
if err := h.TSDBStore.DeleteDatabase(db); err != nil {
msg = fmt.Sprintf("Remove data node meta successfully. But remove data directory failed. %v Please remove directory manully.", err)
return
}
}
}
}()
if err := json.NewEncoder(conn).Encode(Response{msg}); err != nil {
h.Logger.Info("Encode admin tcp resposne failed.", zap.Error(err))
}
}
删除节点中shards数据:删除shards文件以及WAL文件
// tsdb/store.go
func (s *Store) DeleteDatabase(name string) error {
s.mu.RLock()
if _, ok := s.databases[name]; !ok {
s.mu.RUnlock()
// no files locally, so nothing to do
return nil
}
// 找到负责的shard
shards := s.filterShards(func(sh *Shard) bool {
return sh.database == name
})
s.mu.RUnlock()
......
dbPath := filepath.Clean(filepath.Join(s.path, name))
//删除数据目录
if err := os.RemoveAll(dbPath); err != nil {
return err
}
//删除WAL
if err := os.RemoveAll(filepath.Join(s.EngineOptions.Config.WALDir, name)); err != nil {
return err
}
for _, sh := range shards {
delete(s.shards, sh.id)
}
// Remove database from store list of databases
delete(s.databases, name)
// Remove shared index for database if using inmem index.
delete(s.indexes, name)
return nil
}