I. Initialize the Basic Environment
1. Host environment information
OS | Hostname | IP Address | Role / Configuration |
---|---|---|---|
AlmaLinux release 8.6 (Sky Tiger) | master01 | 192.168.3.31 | k8s control-plane node, VIP 192.168.3.30 |
AlmaLinux release 8.6 (Sky Tiger) | master02 | 192.168.3.32 | k8s control-plane node, VIP 192.168.3.30 |
AlmaLinux release 8.6 (Sky Tiger) | master03 | 192.168.3.33 | k8s control-plane node, VIP 192.168.3.30 |
AlmaLinux release 8.6 (Sky Tiger) | node01 | 192.168.3.41 | k8s worker node, system disk + 10G data disk 1 + 10G data disk 2 |
AlmaLinux release 8.6 (Sky Tiger) | node02 | 192.168.3.42 | k8s worker node, system disk + 10G data disk 1 + 10G data disk 2 |
AlmaLinux release 8.6 (Sky Tiger) | node03 | 192.168.3.43 | k8s worker node, system disk + 10G data disk 1 + 10G data disk 2 |
AlmaLinux release 8.6 (Sky Tiger) | node04 | 192.168.3.44 | k8s worker node, system disk + 10G data disk 1 + 10G data disk 2 |
AlmaLinux release 8.6 (Sky Tiger) | node05 | 192.168.3.45 | k8s worker node, system disk + 10G data disk 1 + 10G data disk 2 |
2. Deploy the k8s cluster
See: https://blog.csdn.net/lic95/article/details/125044136
II. Deploy Rook Ceph
1. Rook Ceph reference documentation
https://rook.github.io/docs/rook/latest/Getting-Started/quickstart/#deploy-the-rook-operator
2. Check the disks on the node hosts; in this deployment the 5 worker nodes provide 10 data disks in total
[root@node01 ~]# lsblk -f
NAME FSTYPE LABEL UUID MOUNTPOINT
nvme0n1
├─nvme0n1p1 xfs 4f22cfd0-c208-4a72-a2d5-82ee32d7f956 /boot
└─nvme0n1p2 LVM2_member 2o3Cz0-u0vm-D81w-hysk-LwSv-cLGg-5YyA5c
├─almalinux-root xfs 919eb2ea-14db-4105-b7fd-af85b1ec2dfd /
└─almalinux-swap swap d246b8f0-1ee4-425b-9a37-d8d9b2781403
nvme0n2
nvme0n3
[root@node01 ~]#
# If the FSTYPE field of a disk is not empty, Rook cannot use it for an OSD; wipe the disk manually first
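One way to clean a disk that still carries an old filesystem or partition signature is shown below. This is only a sketch: the device name /dev/nvme0n2 is an example, and both commands irreversibly destroy whatever is on the disk, so double-check the target device first.
# Example only: wipe leftover filesystem/partition signatures so Rook can claim the disk
wipefs --all /dev/nvme0n2
sgdisk --zap-all /dev/nvme0n2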
3. Deploy cert-manager
#Download the YAML manifest
wget https://github.com/cert-manager/cert-manager/releases/download/v1.8.0/cert-manager.yaml
#Create the cert-manager pods
kubectl apply -f cert-manager.yaml
#Check pod status
[root@master01 ~]# kubectl get pods -n cert-manager
NAME READY STATUS RESTARTS AGE
cert-manager-6868fddcb4-kcvpp 1/1 Running 0 42s
cert-manager-cainjector-6d6bbc7965-f5trt 1/1 Running 0 42s
cert-manager-webhook-59f66d6c7b-wsw6f 1/1 Running 0 42s
[root@master01 ~]#
4. Install the lvm2 package on every node; Ceph relies on LVM to prepare the OSD disks
yum install -y lvm2
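lvm2 has to be present on every node that will run an OSD. A small loop like the following can save some typing; it is only a sketch and assumes passwordless root SSH to the worker nodes listed in the table above.
# Sketch: install lvm2 on all worker nodes over SSH
for h in node01 node02 node03 node04 node05; do
  ssh root@${h} "yum install -y lvm2"
done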
5. Deploy Rook Ceph
#Clone the specified release branch on a master node
git clone --single-branch --branch v1.9.4 https://github.com/rook/rook.git
# Deploy the Rook operator
cd rook/deploy/examples
kubectl create -f crds.yaml -f common.yaml -f operator.yaml
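# Create the Ceph cluster (CephCluster CR) from the example cluster.yaml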
kubectl create -f cluster.yaml
# Before continuing, verify that rook-ceph-operator is in the "Running" state:
# Note: some images are hosted on k8s.gcr.io; you may need a proxy to reach it, or check the pod logs/events and pull the images manually onto each node
kubectl get pod -n rook-ceph
[root@master01 ~]# kubectl get pods -n rook-ceph
NAME READY STATUS RESTARTS AGE
csi-cephfsplugin-4bbbh 3/3 Running 0 55m
csi-cephfsplugin-9zsjn 3/3 Running 0 55m
csi-cephfsplugin-provisioner-5c6c4c7785-dlrfh 6/6 Running 0 55m
csi-cephfsplugin-provisioner-5c6c4c7785-fs6nz 6/6 Running 0 55m
csi-cephfsplugin-tvlxt 3/3 Running 0 55m
csi-cephfsplugin-vj7s9 3/3 Running 0 55m
csi-cephfsplugin-xg92l 3/3 Running 0 55m
csi-rbdplugin-9s64s 3/3 Running 0 55m
csi-rbdplugin-gvkbw 3/3 Running 0 55m
csi-rbdplugin-provisioner-7c756d9bd7-9b9sm 6/6 Running 0 55m
csi-rbdplugin-provisioner-7c756d9bd7-cdlfd 6/6 Running 0 55m
csi-rbdplugin-rdtxb 3/3 Running 0 55m
csi-rbdplugin-s9t2r 3/3 Running 0 55m
csi-rbdplugin-x2ldf 3/3 Running 0 55m
rook-ceph-crashcollector-node01-5c65c4845d-wtqgz 1/1 Running 0 51m
rook-ceph-crashcollector-node02-64fd8d97f7-w9mlv 1/1 Running 0 50m
rook-ceph-crashcollector-node03-675b749756-b9gjq 1/1 Running 0 49m
rook-ceph-crashcollector-node04-7dcb76b499-lc4td 1/1 Running 0 51m
rook-ceph-crashcollector-node05-79b4c99f86-sfvvf 1/1 Running 0 51m
rook-ceph-mgr-a-7dc64d847f-kzf26 2/2 Running 0 51m
rook-ceph-mgr-b-5dc59949ff-fwkl4 2/2 Running 0 51m
rook-ceph-mon-a-779dc5cd57-wlkhx 1/1 Running 0 55m
rook-ceph-mon-b-b9bdf6486-t48ks 1/1 Running 0 54m
rook-ceph-mon-c-776f7674b6-r29zr 1/1 Running 0 51m
rook-ceph-operator-74c6447d5b-gmlmx 1/1 Running 0 58m
rook-ceph-osd-0-7d746b7b59-7zn58 1/1 Running 0 51m
rook-ceph-osd-1-698b49669-5plgq 1/1 Running 0 51m
rook-ceph-osd-2-777bb8bfc9-4zm56 1/1 Running 0 51m
rook-ceph-osd-3-7568df5fd4-lgh25 1/1 Running 0 51m
rook-ceph-osd-4-6fd6747d6-bxtxx 1/1 Running 0 51m
rook-ceph-osd-5-868d874bc4-jpxjc 1/1 Running 0 51m
rook-ceph-osd-6-d7d46949-fgxb2 1/1 Running 0 50m
rook-ceph-osd-7-6bc688dcf6-t84g6 1/1 Running 0 50m
rook-ceph-osd-8-6fb5cdb988-fcnd5 1/1 Running 0 49m
rook-ceph-osd-9-7c595fd74d-khxdl 1/1 Running 0 49m
rook-ceph-osd-prepare-node01-s6z6d 0/1 Completed 0 49m
rook-ceph-osd-prepare-node02-z6s5z 0/1 Completed 0 49m
rook-ceph-osd-prepare-node03-9sjtl 0/1 Completed 0 49m
rook-ceph-osd-prepare-node04-7bglr 0/1 Completed 0 49m
rook-ceph-osd-prepare-node05-6rkgc 0/1 Completed 0 49m
rook-ceph-tools-68f89f79f9-jqcg8 1/1 Running 0 52m
[root@master01 ~]#
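If some pods get stuck in ImagePullBackOff because of the k8s.gcr.io images mentioned above, one way to find out which images are needed and pull them by hand is sketched below (crictl/containerd is assumed as the node runtime; adapt if the nodes run docker).
# Sketch: list every image the rook-ceph pods reference, then pull them manually on each node
kubectl -n rook-ceph get pods -o jsonpath='{.items[*].spec.containers[*].image}' | tr ' ' '\n' | sort -u
# on each node, for every image that failed to pull:
# crictl pull <image>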
6. Deploy the Rook Toolbox
# Start the rook-ceph-tools pod
kubectl create -f deploy/examples/toolbox.yaml
#Wait for the toolbox pod to pull its image and reach the Running state
[root@master01 ~]# kubectl -n rook-ceph rollout status deploy/rook-ceph-tools
deployment "rook-ceph-tools" successfully rolled out
[root@master01 ~]#
7. Check the cluster status
#Connect to the toolbox
kubectl -n rook-ceph exec -it deploy/rook-ceph-tools -- bash
#Check the cluster
[rook@rook-ceph-tools-68f89f79f9-jqcg8 /]$ ceph -s
cluster:
id: da0ab8bb-e42f-42b1-b134-eb1d58888828
health: HEALTH_OK
services:
mon: 3 daemons, quorum a,b,c (age 99m)
mgr: a(active, since 98m), standbys: b
osd: 10 osds: 10 up (since 97m), 10 in (since 97m)
data:
pools: 1 pools, 1 pgs
objects: 0 objects, 0 B
usage: 51 MiB used, 100 GiB / 100 GiB avail
pgs: 1 active+clean
[rook@rook-ceph-tools-68f89f79f9-jqcg8 /]$
[rook@rook-ceph-tools-68f89f79f9-jqcg8 /]$ ceph osd status
ID HOST USED AVAIL WR OPS WR DATA RD OPS RD DATA STATE
0 node01 5200k 9.99G 0 0 0 0 exists,up
1 node04 5260k 9.99G 0 0 0 0 exists,up
2 node05 5200k 9.99G 0 0 0 0 exists,up
3 node01 5264k 9.99G 0 0 0 0 exists,up
4 node04 5200k 9.99G 0 0 0 0 exists,up
5 node05 5264k 9.99G 0 0 0 0 exists,up
6 node02 5136k 9.99G 0 0 0 0 exists,up
7 node02 5136k 9.99G 0 0 0 0 exists,up
8 node03 5072k 9.99G 0 0 0 0 exists,up
9 node03 5072k 9.99G 0 0 0 0 exists,up
[rook@rook-ceph-tools-68f89f79f9-jqcg8 /]$
[rook@rook-ceph-tools-68f89f79f9-jqcg8 /]$ ceph df
--- RAW STORAGE ---
CLASS SIZE AVAIL USED RAW USED %RAW USED
ssd 100 GiB 100 GiB 51 MiB 51 MiB 0.05
TOTAL 100 GiB 100 GiB 51 MiB 51 MiB 0.05
--- POOLS ---
POOL ID PGS STORED OBJECTS USED %USED MAX AVAIL
device_health_metrics 1 1 0 B 0 0 B 0 32 GiB
[rook@rook-ceph-tools-68f89f79f9-jqcg8 /]$
[rook@rook-ceph-tools-68f89f79f9-jqcg8 /]$ rados df
POOL_NAME USED OBJECTS CLONES COPIES MISSING_ON_PRIMARY UNFOUND DEGRADED RD_OPS RD WR_OPS WR USED COMPR UNDER COMPR
device_health_metrics 0 B 0 0 0 0 0 0 0 0 B 0 0 B 0 B 0 B
total_objects 0
total_used 51 MiB
total_avail 100 GiB
total_space 100 GiB
[rook@rook-ceph-tools-68f89f79f9-jqcg8 /]$
8. Collect logs for debugging
kubectl create -f deploy/examples/toolbox-job.yaml
kubectl -n rook-ceph logs -l job-name=rook-ceph-toolbox-job
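Besides the toolbox job, the operator log is usually the first place to look when OSDs or CSI pods misbehave. A couple of commonly used commands (a sketch, not an exhaustive list; the prepare pod name is taken from the pod listing above):
# Sketch: inspect the Rook operator log and one OSD prepare pod
kubectl -n rook-ceph logs deploy/rook-ceph-operator
kubectl -n rook-ceph logs rook-ceph-osd-prepare-node01-s6z6d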
VI. Dashboard Configuration
#Expose the Ceph Dashboard through an external service
[root@master01 examples]# kubectl apply -f dashboard-external-https.yaml
service/rook-ceph-mgr-dashboard-external-https created
# Get the dashboard admin password
[root@master01 examples]# kubectl -n rook-ceph get secret rook-ceph-dashboard-password -o jsonpath="{['data']['password']}" | base64 -d
#The password is printed:
}=1:6:@C>:NP!KVGId;r
#Check the new NodePort
[root@master01 examples]# kubectl get svc -n rook-ceph
NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE
csi-cephfsplugin-metrics ClusterIP 172.18.4.87 8080/TCP,8081/TCP 78m
csi-rbdplugin-metrics ClusterIP 172.18.187.253 8080/TCP,8081/TCP 78m
rook-ceph-admission-controller ClusterIP 172.18.49.28 443/TCP 83m
rook-ceph-mgr ClusterIP 172.18.41.24 9283/TCP 80m
rook-ceph-mgr-dashboard ClusterIP 172.18.239.24 8443/TCP 80m
rook-ceph-mgr-dashboard-external-https NodePort 172.18.66.56 8443:30044/TCP 11m
rook-ceph-mon-a ClusterIP 172.18.26.25 6789/TCP,3300/TCP 82m
rook-ceph-mon-b ClusterIP 172.18.147.238 6789/TCP,3300/TCP 80m
rook-ceph-mon-c ClusterIP 172.18.244.12 6789/TCP,3300/TCP 80m
#Open the dashboard in a browser and log in as user admin with the password retrieved above:
https://192.168.3.41:30044
VII. Deploy RBD and CephFS Storage Support
# rbd:
#Create an RBD pool named replicapool and the rook-ceph-block StorageClass
[root@master01 examples]# kubectl apply -f csi/rbd/storageclass.yaml
cephblockpool.ceph.rook.io/replicapool created
storageclass.storage.k8s.io/rook-ceph-block created
#cephfs:
[root@master01 examples]# kubectl apply -f filesystem.yaml
cephfilesystem.ceph.rook.io/myfs created
[root@master01 examples]# kubectl apply -f csi/cephfs/storageclass.yaml
storageclass.storage.k8s.io/rook-cephfs created
[root@master01 examples]#
#Check the StorageClasses
[root@master01 examples]# kubectl get sc
NAME PROVISIONER RECLAIMPOLICY VOLUMEBINDINGMODE ALLOWVOLUMEEXPANSION AGE
rook-ceph-block rook-ceph.rbd.csi.ceph.com Delete Immediate true 109s
rook-cephfs rook-ceph.cephfs.csi.ceph.com Delete Immediate true 55s
[root@master01 examples]#
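The redis test below only exercises the rook-cephfs StorageClass, so a quick way to smoke-test rook-ceph-block as well is to create and delete a throwaway PVC. A sketch (the PVC name rbd-test-pvc is made up for this example):
# Sketch: verify that rook-ceph-block can dynamically provision a volume
cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: rbd-test-pvc
spec:
  accessModes:
  - ReadWriteOnce
  resources:
    requests:
      storage: 1Gi
  storageClassName: rook-ceph-block
EOF
# the PVC should reach the Bound state, then clean up
kubectl get pvc rbd-test-pvc
kubectl delete pvc rbd-test-pvc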
VIII. Test: Deploy a Redis Cluster Backed by Ceph Storage
1. Redis is a stateful application
When redis is deployed as pods in k8s, each pod caches different data and pod IPs change whenever a pod is recreated. Deploying redis-cluster with a plain Deployment and Service therefore causes many problems, so a StatefulSet plus a headless Service is used instead.
2. Data persistence
Although redis is an in-memory cache, it still relies on disk to persist data, so that the cached data can be recovered when the service restarts after a failure.
3. Headless Service
A headless Service is a Service with no Cluster IP assigned. Accordingly, in the k8s DNS the headless Service does not resolve to a Cluster IP; it resolves to the list of IPs of all the pods it selects.
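Once the redis-cluster headless Service defined below is created, this is easy to see from inside the cluster: resolving the Service name returns the individual pod IPs rather than a single virtual IP. A sketch using a temporary busybox pod (the image choice is just an example):
# Sketch: a headless Service resolves to the pod IPs, not to a ClusterIP
kubectl run dns-test --rm -it --image=busybox:1.35 --restart=Never -- \
  nslookup redis-cluster.redis-cluster.svc.cluster.local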
4. StatefulSet
A StatefulSet is the k8s resource designed specifically for deploying stateful applications. It can be seen as a variant of a Deployment/RC with the following properties: stable, unique network identities (each pod keeps its name and DNS record across rescheduling), stable persistent storage via volumeClaimTemplates (each replica gets its own PVC that is re-bound when the pod is recreated), and ordered, graceful deployment, scaling and rolling updates.
6. Generate the YAML configuration files
The redis configuration file is mounted as a ConfigMap. The chage-pod-ip.sh script handles the case where a redis cluster pod is recreated and its Pod IP changes: it replaces the old Pod IP with the new Pod IP in /data/nodes.conf, otherwise the cluster would break. Create the configuration files:
mkdir -p redis-cluster
cd redis-cluster
[root@master01 redis-cluster]# ll
total 8
-rw-r--r-- 1 root root 2374 Jun  2 20:48 redis-cluster-configmap.yaml
-rw-r--r-- 1 root root 1942 Jun  2 20:49 redis-cluster.yaml
[root@master01 redis-cluster]#
[root@master01 redis-cluster]# cat redis-cluster-configmap.yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: redis-cluster
  namespace: redis-cluster
data:
  chage-pod-ip.sh: |
    #!/bin/sh
    CLUSTER_CONFIG="/data/nodes.conf"
    if [ -f ${CLUSTER_CONFIG} ]; then
      if [ -z "${POD_IP}" ]; then
        echo "Unable to determine Pod IP address!"
        exit 1
      fi
      echo "Updating my IP to ${POD_IP} in ${CLUSTER_CONFIG}"
      sed -i.bak -e '/myself/ s/[0-9]\{1,3\}\.[0-9]\{1,3\}\.[0-9]\{1,3\}\.[0-9]\{1,3\}/'${POD_IP}'/' ${CLUSTER_CONFIG}
    fi
    exec "$@"
  redis.conf: |
    bind 0.0.0.0
    protected-mode yes
    port 6379
    tcp-backlog 2048
    timeout 0
    tcp-keepalive 300
    daemonize no
    supervised no
    pidfile /var/run/redis.pid
    loglevel notice
    logfile /data/redis.log
    databases 16
    always-show-logo yes
    stop-writes-on-bgsave-error yes
    rdbcompression yes
    rdbchecksum yes
    dbfilename dump.rdb
    dir /data
    masterauth demo@2022
    replica-serve-stale-data yes
    replica-read-only no
    repl-diskless-sync no
    repl-diskless-sync-delay 5
    repl-disable-tcp-nodelay no
    replica-priority 100
    requirepass demo@2022
    maxclients 32768
    maxmemory-policy allkeys-lru
    lazyfree-lazy-eviction no
    lazyfree-lazy-expire no
    lazyfree-lazy-server-del no
    replica-lazy-flush no
    appendonly yes
    appendfilename "appendonly.aof"
    appendfsync everysec
    no-appendfsync-on-rewrite no
    auto-aof-rewrite-percentage 100
    auto-aof-rewrite-min-size 64mb
    aof-load-truncated yes
    aof-use-rdb-preamble yes
    lua-time-limit 5000
    cluster-enabled yes
    cluster-config-file /data/nodes.conf
    cluster-node-timeout 15000
    slowlog-log-slower-than 10000
    slowlog-max-len 128
    latency-monitor-threshold 0
    notify-keyspace-events ""
    hash-max-ziplist-entries 512
    hash-max-ziplist-value 64
    list-max-ziplist-size -2
    list-compress-depth 0
    set-max-intset-entries 512
    zset-max-ziplist-entries 128
    zset-max-ziplist-value 64
    hll-sparse-max-bytes 3000
    stream-node-max-bytes 4096
    stream-node-max-entries 100
    activerehashing yes
    client-output-buffer-limit normal 0 0 0
    client-output-buffer-limit replica 256mb 64mb 60
    client-output-buffer-limit pubsub 32mb 8mb 60
    hz 10
    dynamic-hz yes
    aof-rewrite-incremental-fsync yes
    rdb-save-incremental-fsync yes
[root@master01 redis-cluster]# cat redis-cluster.yaml
---
apiVersion: v1
kind: Service
metadata:
  namespace: redis-cluster
  name: redis-cluster
spec:
  clusterIP: None
  ports:
  - port: 6379
    targetPort: 6379
    name: client
  - port: 16379
    targetPort: 16379
    name: gossip
  selector:
    app: redis-cluster
---
apiVersion: apps/v1
kind: StatefulSet
metadata:
  namespace: redis-cluster
  name: redis-cluster
spec:
  serviceName: redis-cluster
  replicas: 6
  selector:
    matchLabels:
      app: redis-cluster
  template:
    metadata:
      labels:
        app: redis-cluster
    spec:
      terminationGracePeriodSeconds: 20
      # pod anti-affinity: prefer spreading the redis pods across nodes
      affinity:
        podAntiAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
          - weight: 100
            podAffinityTerm:
              labelSelector:
                matchExpressions:
                - key: app
                  operator: In
                  values:
                  - redis-cluster
              topologyKey: kubernetes.io/hostname
      containers:
      - name: redis
        image: redis:5.0.13
        ports:
        - containerPort: 6379
          name: client
        - containerPort: 16379
          name: gossip
        command: ["/etc/redis/chage-pod-ip.sh", "redis-server", "/etc/redis/redis.conf"]
        env:
        - name: POD_IP
          valueFrom:
            fieldRef:
              fieldPath: status.podIP
        volumeMounts:
        - name: conf
          mountPath: /etc/redis/
          readOnly: false
        - name: data
          mountPath: /data
          readOnly: false
      volumes:
      - name: conf
        configMap:
          name: redis-cluster
          defaultMode: 0755
  # use the CephFS StorageClass to dynamically provision a PV per replica
  volumeClaimTemplates:
  - metadata:
      name: data
    spec:
      storageClassName: "rook-cephfs"
      accessModes:
      - ReadWriteMany
      resources:
        requests:
          storage: 10Gi
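Before applying, both manifests can be validated client-side without creating any objects; a quick sketch:
# Sketch: client-side validation of the manifests
kubectl apply --dry-run=client -f redis-cluster-configmap.yaml
kubectl apply --dry-run=client -f redis-cluster.yaml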
7. Deploy
[root@master01 redis-cluster]# kubectl create ns redis-cluster
namespace/redis-cluster created
[root@master01 redis-cluster]# kubectl apply -f redis-cluster-configmap.yaml
configmap/redis-cluster created
[root@master01 redis-cluster]# kubectl apply -f redis-cluster.yaml
service/redis-cluster created
statefulset.apps/redis-cluster created
[root@master01 redis-cluster]#
# Check deployment status
[root@master01 redis-cluster]# kubectl get pod -n redis-cluster -o wide
NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES
redis-cluster-0 1/1 Running 0 62s 10.244.140.92 node02
redis-cluster-1 1/1 Running 0 58s 10.244.196.149 node01
redis-cluster-2 1/1 Running 0 50s 10.244.114.21 node05
redis-cluster-3 1/1 Running 0 40s 10.244.186.215 node03
redis-cluster-4 1/1 Running 0 32s 10.244.248.215 node04
redis-cluster-5 1/1 Running 0 22s 10.244.140.93 node02
[root@master01 redis-cluster]# kubectl get svc -n redis-cluster
NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE
redis-cluster ClusterIP None 6379/TCP,16379/TCP 94s
# Check the PVCs and PVs
[root@master01 redis-cluster]# kubectl get pvc,pv -n redis-cluster
NAME STATUS VOLUME CAPACITY ACCESS MODES STORAGECLASS AGE
persistentvolumeclaim/data-redis-cluster-0 Bound pvc-78309c22-463c-48c9-8e9f-00ed32fec2e6 10Gi RWX rook-cephfs 16m
persistentvolumeclaim/data-redis-cluster-1 Bound pvc-6839daf7-53ed-42cf-961c-3a4aa403327f 10Gi RWX rook-cephfs 16m
persistentvolumeclaim/data-redis-cluster-2 Bound pvc-58e79d62-415e-4bc1-9e2f-0572f9144c12 10Gi RWX rook-cephfs 16m
persistentvolumeclaim/data-redis-cluster-3 Bound pvc-0dc7f552-4fd1-4f7a-831c-e30b2b11a27f 10Gi RWX rook-cephfs 16m
persistentvolumeclaim/data-redis-cluster-4 Bound pvc-12532ea4-2347-4f7f-b2f5-26b5dd949a86 10Gi RWX rook-cephfs 16m
persistentvolumeclaim/data-redis-cluster-5 Bound pvc-28fb6439-752e-461a-9600-13883a7bdd74 10Gi RWX rook-cephfs 75s
NAME CAPACITY ACCESS MODES RECLAIM POLICY STATUS CLAIM STORAGECLASS REASON AGE
persistentvolume/pvc-0dc7f552-4fd1-4f7a-831c-e30b2b11a27f 10Gi RWX Delete Bound redis-cluster/data-redis-cluster-3 rook-cephfs 16m
persistentvolume/pvc-12532ea4-2347-4f7f-b2f5-26b5dd949a86 10Gi RWX Delete Bound redis-cluster/data-redis-cluster-4 rook-cephfs 16m
persistentvolume/pvc-28fb6439-752e-461a-9600-13883a7bdd74 10Gi RWX Delete Bound redis-cluster/data-redis-cluster-5 rook-cephfs 75s
persistentvolume/pvc-58e79d62-415e-4bc1-9e2f-0572f9144c12 10Gi RWX Delete Bound redis-cluster/data-redis-cluster-2 rook-cephfs 16m
persistentvolume/pvc-6839daf7-53ed-42cf-961c-3a4aa403327f 10Gi RWX Delete Bound redis-cluster/data-redis-cluster-1 rook-cephfs 16m
persistentvolume/pvc-78309c22-463c-48c9-8e9f-00ed32fec2e6 10Gi RWX Delete Bound redis-cluster/data-redis-cluster-0 rook-cephfs 16m
8. Create the Redis cluster
#Get the pod IPs
[root@master01 redis-cluster]# kubectl get pod -n redis-cluster -o wide | awk '{print $6}'
IP
10.244.140.92
10.244.196.149
10.244.114.21
10.244.186.215
10.244.248.215
10.244.140.93
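The awk output above still includes the "IP" column header; an alternative that emits a ready-to-paste endpoint list is sketched below (jsonpath only, no header):
# Sketch: print "<pod-ip>:6379" for every redis pod
kubectl get pods -n redis-cluster -o jsonpath='{range .items[*]}{.status.podIP}:6379{"\n"}{end}'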
#Exec into a redis container
[root@master01 redis-cluster]# kubectl exec -it redis-cluster-0 -n redis-cluster -- bash
#Create the cluster; type "yes" at the prompt to finish the cluster creation
redis-cli -a demo@2022 --cluster create \
10.244.140.92:6379 \
10.244.196.149:6379 \
10.244.114.21:6379 \
10.244.186.215:6379 \
10.244.248.215:6379 \
10.244.140.93:6379 \
--cluster-replicas 1
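After the create step finishes, the slot assignment can be double-checked from inside the same pod; a sketch using one of the pod IPs from above:
# Sketch: verify slot coverage and replica assignment
redis-cli -a demo@2022 --cluster check 10.244.140.92:6379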
9. Verify the cluster
# The cluster state is healthy
[root@master01 redis-cluster]# kubectl exec -it redis-cluster-0 -n redis-cluster -- bash
root@redis-cluster-0:/data# redis-cli -c -h redis-cluster-2.redis-cluster.redis-cluster.svc.cluster.local -a 'demo@2022'
Warning: Using a password with '-a' or '-u' option on the command line interface may not be safe.
redis-cluster-2.redis-cluster.redis-cluster.svc.cluster.local:6379> cluster info
cluster_state:ok
cluster_slots_assigned:16384
cluster_slots_ok:16384
cluster_slots_pfail:0
cluster_slots_fail:0
cluster_known_nodes:6
cluster_size:3
cluster_current_epoch:6
cluster_my_epoch:3
cluster_stats_messages_ping_sent:111
cluster_stats_messages_pong_sent:102
cluster_stats_messages_meet_sent:1
cluster_stats_messages_sent:214
cluster_stats_messages_ping_received:102
cluster_stats_messages_pong_received:112
cluster_stats_messages_received:214
redis-cluster-2.redis-cluster.redis-cluster.svc.cluster.local:6379>
redis-cluster-2.redis-cluster.redis-cluster.svc.cluster.local:6379> cluster info
cluster_state:ok
cluster_slots_assigned:16384
cluster_slots_ok:16384
cluster_slots_pfail:0
cluster_slots_fail:0
cluster_known_nodes:6
cluster_size:3
cluster_current_epoch:6
cluster_my_epoch:3
cluster_stats_messages_ping_sent:170
cluster_stats_messages_pong_sent:161
cluster_stats_messages_meet_sent:1
cluster_stats_messages_sent:332
cluster_stats_messages_ping_received:161
cluster_stats_messages_pong_received:171
cluster_stats_messages_received:332
redis-cluster-2.redis-cluster.redis-cluster.svc.cluster.local:6379>
redis-cluster-2.redis-cluster.redis-cluster.svc.cluster.local:6379> cluster nodes
147ce6d4a6ece2a69c69ba62d9dcb0cc3fcd3252 10.244.114.21:6379@16379 myself,master - 0 1654177346000 3 connected 10923-16383
e805b85e338356615b7ad896f882d43e79281f47 10.244.186.215:6379@16379 slave 147ce6d4a6ece2a69c69ba62d9dcb0cc3fcd3252 0 1654177345000 4 connected
b98047a17cf7fcd144c94abac0e2576bafe9bb30 10.244.196.149:6379@16379 master - 0 1654177345674 2 connected 5461-10922
cebfdfbc97ef43d94d59cf5a87845c9b993d9954 10.244.140.92:6379@16379 master - 0 1654177343000 1 connected 0-5460
313081321f48ccae93f3a67bc43e2d6b0eae93a6 10.244.140.93:6379@16379 slave b98047a17cf7fcd144c94abac0e2576bafe9bb30 0 1654177346678 6 connected
94fbbe644f27519b348bfa6909d9bf44e680da20 10.244.248.215:6379@16379 slave cebfdfbc97ef43d94d59cf5a87845c9b993d9954 0 1654177345000 5 connected
redis-cluster-2.redis-cluster.redis-cluster.svc.cluster.local:6379>
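As a final sanity check it can help to write a key and read it back through the cluster; with -c the client follows the MOVED redirection to whichever master owns the slot. A sketch, run from inside any redis pod:
# Sketch: write a key and read it back through the cluster
redis-cli -c -h redis-cluster-0.redis-cluster.redis-cluster.svc.cluster.local -a 'demo@2022' set testkey hello
redis-cli -c -h redis-cluster-0.redis-cluster.redis-cluster.svc.cluster.local -a 'demo@2022' get testkey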
10. Failure test
Delete any pod (here the pod named redis-cluster-1). k8s automatically starts a new pod with the same name (redis-cluster-1), which re-binds the original PVC and PV, and the chage-pod-ip.sh script automatically rewrites the pod IP in /data/nodes.conf to the new pod's IP.
# List the pods
[root@master01 redis-cluster]# kubectl get pods -n redis-cluster -o wide
NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES
redis-cluster-0 1/1 Running 0 10m 10.244.140.92 node02
redis-cluster-1 1/1 Running 0 21s 10.244.196.151 node01
redis-cluster-2 1/1 Running 0 10m 10.244.114.21 node05
redis-cluster-3 1/1 Running 0 9m59s 10.244.186.215 node03
redis-cluster-4 1/1 Running 0 9m51s 10.244.248.215 node04
redis-cluster-5 1/1 Running 0 9m41s 10.244.140.93 node02
# Delete the redis-cluster-1 pod
[root@master01 redis-cluster]# kubectl delete pod redis-cluster-1 -n redis-cluster
pod "redis-cluster-1" deleted
# The pod is being recreated
[root@master01 redis-cluster]# kubectl get pods -n redis-cluster -o wide
NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES
redis-cluster-0 1/1 Running 0 10m 10.244.140.92 node02
redis-cluster-1 0/1 ContainerCreating 0 2s node01
redis-cluster-2 1/1 Running 0 10m 10.244.114.21 node05
redis-cluster-3 1/1 Running 0 10m 10.244.186.215 node03
redis-cluster-4 1/1 Running 0 10m 10.244.248.215 node04
redis-cluster-5 1/1 Running 0 9m54s 10.244.140.93 node02
# Pod recreation is complete: the IP changed from 10.244.196.151 to 10.244.196.152, and because of the pod anti-affinity rule the six redis pods are not all scheduled onto the same VM
[root@master01 redis-cluster]# kubectl get pods -n redis-cluster -o wide
NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES
redis-cluster-0 1/1 Running 0 10m 10.244.140.92 node02
redis-cluster-1 1/1 Running 0 4s 10.244.196.152 node01
redis-cluster-2 1/1 Running 0 10m 10.244.114.21 node05
redis-cluster-3 1/1 Running 0 10m 10.244.186.215 node03
redis-cluster-4 1/1 Running 0 10m 10.244.248.215 node04
redis-cluster-5 1/1 Running 0 9m56s 10.244.140.93 node02
[root@master01 redis-cluster]#
# Check the cluster configuration file
[root@master01 redis-cluster]# kubectl exec -it redis-cluster-0 -n redis-cluster -- ls
appendonly.aof dump.rdb nodes.conf nodes.conf.bak redis.log
[root@master01 redis-cluster]# kubectl exec -it redis-cluster-0 -n redis-cluster -- cat nodes.conf
e805b85e338356615b7ad896f882d43e79281f47 10.244.186.215:6379@16379 slave 147ce6d4a6ece2a69c69ba62d9dcb0cc3fcd3252 0 1654177555790 4 connected
94fbbe644f27519b348bfa6909d9bf44e680da20 10.244.248.215:6379@16379 slave cebfdfbc97ef43d94d59cf5a87845c9b993d9954 0 1654177556797 5 connected
cebfdfbc97ef43d94d59cf5a87845c9b993d9954 10.244.140.92:6379@16379 myself,master - 0 1654177554000 1 connected 0-5460
313081321f48ccae93f3a67bc43e2d6b0eae93a6 10.244.140.93:6379@16379 slave b98047a17cf7fcd144c94abac0e2576bafe9bb30 0 1654177555000 6 connected
147ce6d4a6ece2a69c69ba62d9dcb0cc3fcd3252 10.244.114.21:6379@16379 master - 0 1654177555000 3 connected 10923-16383
b98047a17cf7fcd144c94abac0e2576bafe9bb30 10.244.196.152:6379@16379 master - 1654177555088 1654177553000 2 disconnected 5461-10922
vars currentEpoch 6 lastVoteEpoch 0
[root@master01 redis-cluster]#
# Connect to the cluster again and check its status; it has returned to normal
[root@master01 redis-cluster]# kubectl exec -it redis-cluster-0 -n redis-cluster -- bash
root@redis-cluster-0:~# redis-cli -c -h redis-cluster-2.redis-cluster.redis-cluster.svc.cluster.local -a 'demo@2022'
Warning: Using a password with '-a' or '-u' option on the command line interface may not be safe.
redis-cluster-2.redis-cluster.redis-cluster.svc.cluster.local:6379>
redis-cluster-2.redis-cluster.redis-cluster.svc.cluster.local:6379>
redis-cluster-2.redis-cluster.redis-cluster.svc.cluster.local:6379> cluster info
cluster_state:ok
cluster_slots_assigned:16384
cluster_slots_ok:16384
cluster_slots_pfail:0
cluster_slots_fail:0
cluster_known_nodes:6
cluster_size:3
cluster_current_epoch:6
cluster_my_epoch:3
cluster_stats_messages_ping_sent:1039
cluster_stats_messages_pong_sent:986
cluster_stats_messages_meet_sent:1
cluster_stats_messages_sent:2026
cluster_stats_messages_ping_received:986
cluster_stats_messages_pong_received:1034
cluster_stats_messages_received:2020
redis-cluster-2.redis-cluster.redis-cluster.svc.cluster.local:6379>
IX. Summary
A Ceph cluster deployed with Rook on the existing k8s nodes is easy to set up, but because Rook automates the whole deployment and runs every Ceph service as a pod, the resulting cluster is harder to maintain later. I personally do not recommend using Rook to deploy Ceph in production; a standalone Ceph cluster is easier to operate. You can still reuse some of the k8s nodes to run a standalone Ceph cluster, which keeps maintenance simple and lets the two clusters be maintained independently.