etcd的member信息会持久化到磁盘上,因此数据丢失的节点不能直接重启加入集群,必须先从集群中移除旧member,再以新的member身份重新加入。请严格按照如下步骤操作:
[root@node01 ~]# etcdctl endpoint health -w table \
> --cacert=/etc/etcd/pki/etcd-ca.pem \
> --key=/etc/etcd/pki/etcd-server-key.pem \
> --cert=/etc/etcd/pki/etcd-server.pem \
> --endpoints https://10.0.2.10:2379,https://10.0.2.11:2379,https://10.0.2.12:2379
{"level":"warn","ts":"2023-01-19T15:46:51.582+0800","logger":"client","caller":"v3/retry_interceptor.go:62","msg":"retrying of unary invoker failed","target":"etcd-endpoints://0xc000324fc0/10.0.2.12:2379","attempt":0,"error":"rpc error: code = DeadlineExceeded desc = latest balancer error: last connection error: connection error: desc = \"transport: Error while dialing dial tcp 10.0.2.12:2379: connect: connection refused\""}
+------------------------+--------+--------------+---------------------------+
| ENDPOINT | HEALTH | TOOK | ERROR |
+------------------------+--------+--------------+---------------------------+
| https://10.0.2.10:2379 | true | 16.441802ms | |
| https://10.0.2.11:2379 | true | 19.920028ms | |
| https://10.0.2.12:2379 | false | 5.000833537s | context deadline exceeded |
+------------------------+--------+--------------+---------------------------+
[root@node01 ~]# etcdctl member list -w table \
> --cacert=/etc/etcd/pki/etcd-ca.pem \
> --key=/etc/etcd/pki/etcd-server-key.pem \
> --cert=/etc/etcd/pki/etcd-server.pem \
> --endpoints https://10.0.2.10:2379,https://10.0.2.11:2379,https://10.0.2.12:2379
+------------------+---------+-------+------------------------+------------------------+------------+
| ID | STATUS | NAME | PEER ADDRS | CLIENT ADDRS | IS LEARNER |
+------------------+---------+-------+------------------------+------------------------+------------+
| 4ef681e2655d2a35 | started | etcd1 | https://10.0.2.10:2380 | https://10.0.2.10:2379 | false |
| 6c9d6a0746c7546b | started | etcd2 | https://10.0.2.11:2380 | https://10.0.2.11:2379 | false |
| eb20e4406d78f7f7 | started | etcd3 | https://10.0.2.12:2380 | https://10.0.2.12:2379 | false |
+------------------+---------+-------+------------------------+------------------------+------------+
[root@node01 ~]# etcdctl member remove eb20e4406d78f7f7 \
> --cacert=/etc/etcd/pki/etcd-ca.pem \
> --key=/etc/etcd/pki/etcd-server-key.pem \
> --cert=/etc/etcd/pki/etcd-server.pem \
> --endpoints https://10.0.2.10:2379,https://10.0.2.11:2379,https://10.0.2.12:2379
Member eb20e4406d78f7f7 removed from cluster de54f873fa2bd441
[root@node01 ~]# etcdctl member add etcd3 --peer-urls=https://10.0.2.12:2380 \
> --cacert=/etc/etcd/pki/etcd-ca.pem \
> --key=/etc/etcd/pki/etcd-server-key.pem \
> --cert=/etc/etcd/pki/etcd-server.pem \
> --endpoints https://10.0.2.10:2379,https://10.0.2.11:2379,https://10.0.2.12:2379
Member ebe50cbfc9552823 added to cluster de54f873fa2bd441
ETCD_NAME="etcd3"
ETCD_INITIAL_CLUSTER="etcd1=https://10.0.2.10:2380,etcd2=https://10.0.2.11:2380,etcd3=https://10.0.2.12:2380"
ETCD_INITIAL_ADVERTISE_PEER_URLS="https://10.0.2.12:2380"
ETCD_INITIAL_CLUSTER_STATE="existing"
# 在故障节点(10.0.2.12)上确认etcd数据目录为空(如有残留的旧数据需先清空),将配置修改为 ETCD_INITIAL_CLUSTER_STATE="existing",然后启动 etcd
[root@node01 ~]# etcdctl endpoint status \
> --cacert=/etc/etcd/pki/etcd-ca.pem \
> --key=/etc/etcd/pki/etcd-server-key.pem \
> --cert=/etc/etcd/pki/etcd-server.pem \
> --endpoints https://10.0.2.10:2379,https://10.0.2.11:2379,https://10.0.2.12:2379
https://10.0.2.10:2379, 4ef681e2655d2a35, 3.5.4, 3.0 MB, false, false, 32, 375709, 375709,
https://10.0.2.11:2379, 6c9d6a0746c7546b, 3.5.4, 3.0 MB, true, false, 32, 375709, 375709,
https://10.0.2.12:2379, ebe50cbfc9552823, 3.5.4, 3.0 MB, false, false, 32, 375709, 375709,
参考文档(etcd v3.5 灾难恢复,快照备份与恢复): https://etcd.io/docs/v3.5/op-guide/recovery/
参考文档(etcd v3.5 升级说明): https://etcd.io/docs/v3.5/upgrades/upgrade_3_5/
[root@node01 ~]# etcdctl snapshot save backup/$(date +%Y-%m-%d-%H-%M).snapshot \
> --cacert=/etc/etcd/pki/etcd-ca.pem \
> --key=/etc/etcd/pki/etcd-server-key.pem \
> --cert=/etc/etcd/pki/etcd-server.pem \
> --endpoints https://10.0.2.10:2379
{"level":"info","ts":"2023-01-19T16:11:47.041+0800","caller":"snapshot/v3_snapshot.go:65","msg":"created temporary db file","path":"backup/2023-01-19-16-11.snapshot.part"}
{"level":"info","ts":"2023-01-19T16:11:47.051+0800","logger":"client","caller":"v3/maintenance.go:211","msg":"opened snapshot stream; downloading"}
{"level":"info","ts":"2023-01-19T16:11:47.051+0800","caller":"snapshot/v3_snapshot.go:73","msg":"fetching snapshot","endpoint":"https://10.0.2.10:2379"}
{"level":"info","ts":"2023-01-19T16:11:47.085+0800","logger":"client","caller":"v3/maintenance.go:219","msg":"completed snapshot read; closing"}
{"level":"info","ts":"2023-01-19T16:11:47.126+0800","caller":"snapshot/v3_snapshot.go:88","msg":"fetched snapshot","endpoint":"https://10.0.2.10:2379","size":"3.0 MB","took":"now"}
{"level":"info","ts":"2023-01-19T16:11:47.126+0800","caller":"snapshot/v3_snapshot.go:97","msg":"saved","path":"backup/2023-01-19-16-11.snapshot"}
Snapshot saved at backup/2023-01-19-16-11.snapshot
1、首先停止apiserver(确保没有任何程序对etcd进行写入操作)
2、停掉etcd集群,并清空各etcd节点的数据目录
3、将快照文件分发到各etcd节点
4、在各etcd节点进行快照恢复
[root@node01 ~]# etcdutl snapshot restore /tmp/backup/2023-01-19-16-11.snapshot \
--data-dir=/var/lib/etcd/ --initial-cluster-token="etcd-cluster" \
--name=etcd1 --initial-advertise-peer-urls=https://10.0.2.10:2380 \
--initial-cluster="etcd1=https://10.0.2.10:2380,etcd2=https://10.0.2.11:2380,etcd3=https://10.0.2.12:2380"
[root@node02 ~]# etcdutl snapshot restore /tmp/backup/2023-01-19-16-11.snapshot \
--data-dir=/var/lib/etcd/ --initial-cluster-token="etcd-cluster" \
--name=etcd2 --initial-advertise-peer-urls=https://10.0.2.11:2380 \
--initial-cluster="etcd1=https://10.0.2.10:2380,etcd2=https://10.0.2.11:2380,etcd3=https://10.0.2.12:2380"
[root@node03 ~]# etcdutl snapshot restore /tmp/backup/2023-01-19-16-11.snapshot \
--data-dir=/var/lib/etcd/ --initial-cluster-token="etcd-cluster" \
--name=etcd3 --initial-advertise-peer-urls=https://10.0.2.12:2380 \
--initial-cluster="etcd1=https://10.0.2.10:2380,etcd2=https://10.0.2.11:2380,etcd3=https://10.0.2.12:2380"
5、重启etcd集群
注意etcd数据目录权限:快照恢复是以root身份执行的,启动etcd前需先将恢复出的数据目录属主改回etcd用户
chown -R etcd:etcd /var/lib/etcd
systemctl restart etcd
6、启动apiserver