ceph学习笔记

ceph

ceph osd lspools


rbd ls -p testpool


# 查看 ceph 集群中有多少个 pool,以及每个 pool 的容量及利用情况
rados df


ceph -s


ceph osd tree
ceph df

ceph versions

ceph osd pool ls

ceph osd crush rule dump

ceph auth print-key client.admin

ceph orch host ls

ceph crash ls

ceph osd pool stats

ceph df detail
ceph osd stat

ceph mon stat

查看 kube 存储池中的 rbd image:
rbd ls -p kube

ceph osd df
ceph osd pool autoscale-status


ceph: 
10.240.62.11/12/13
root:autelceph2  


用户名:autel
密码:Autonomy@Autel

13 Autel#3913

[root@ceph-admin ~]# ceph mgr services
{
    "dashboard": "https://10.250.53.152:8443/",
    "prometheus": "http://10.250.53.152:9283/"
}



kubectl logs -f qinzhao-cache-resunet-demo-pipeline-wbkkh-2890309351 -n qinzhao -c lustre-importer-preload

kubectl get nodes "-o=custom-columns=NAME:.metadata.name,GPU:.status.allocatable.nvidia\.com/gpu"

apps/jupyter/jupyter-web-app/upstream/base/configs/spawner_ui_config.yaml

 kustomize build apps/jupyter/jupyter-web-app/upstream/overlays/istio | kubectl apply -f -

kustomize build  apps/tensorboard/tensorboard-controller/upstream/overlays/kubeflow | kubectl apply -f -

kubectl get pods -n kubeflow -l kustomize.component=profiles

https://www.amazonaws.cn/ec2/instance-types/

kubectl taint node autel-poweredge-r750 nodetype=T4:NoExecute
kubectl taint node autelrobotics-gpu10 nodetype=RTX3090:NoExecute
autelrobotics-gpu10
kubectl taint node autel-poweredge-r750 nodetype:NoExecute-

 kubectl taint node autelrobotics-gpu09 nodetype:NoSchedule-
 
  kubectl taint node autelrobotics-gpu09 nodetype:NoExecute-
  
  nodegroup=gpu:NoSchedule
  
  kubectl taint nodes autelrobotics-gpu02 nodegroup=gpu:NoSchedule
  
  kubectl label node autelrobotics-gpu02 gputype=A40
  
  lsof -n -P -i:22
  strace

kubectl get csinode
查看指定进程(此处 PID 为 1)的线程数及各线程状态
top -H -p 1

kubectl create secret tls ai-tls \
    --namespace ai-test \
    --key tls.key \
    --cert tls.pem

https://github.com/NVIDIA/nvidia-docker/issues/1678
nvidia-container-cli -k -d /dev/tty info


ls -l /dev/char
cat /etc/nvidia-container-runtime/config.toml

stat -fc %T /sys/fs/cgroup/

sar -n TCP,ETCP 1

 fdisk -l
 ldd
 
 # 修改后,重新挂载生效
# mount -o remount /dev/shm

nstat
mpstat -P ALL 1
slabtop
pcstat

netstat -ant | awk '{print $6}' | sort | uniq -c | sort -n

dmesg -T
pmap -x  1649 | sort -k 3 -n -r
cat /proc/1649/smaps | grep 7f4250021000
# 以下命令需在 gdb 中执行(例如先用 gdb -p 1649 附加到进程),将指定地址区间的内存导出到文件
dump memory memory.dump 0x7f2340539000 0x7f235d553000

strings memory.dump

pidstat -p pid -r 1 1000
sudo ./stackcount ip_output

dmesg -Tw
perf
NetHogs

iftop -i eth0 -P -N
./opensnoop -Tn snmp-pass
slabtop

nfsstat -c

du -ah --max-depth=2 /var/log |sort -rh |head -10

./fileslower
ulimit -a


解决显存释放问题:
fuser -v /dev/nvidia*

 lsof -Pni
 
 netstat -n | awk '/^tcp/ {++S[$NF]} END {for(a in S) print a, S[a]}'
 

ceph学习笔记_第1张图片

NFS运维:

systemctl status rpcbind nfs-server
nfsiostat
dmesg | grep nfs
exportfs -v
mpstat -P ALL 1
ss -t -a |grep "IP"

nfsstat -c
iostat 

iostat -d -x -k 1

netstat -an | grep "IP:2049"

dstat
ps aux | grep /app

https://learnku.com/articles/39851
https://zhuanlan.zhihu.com/p/614314627

 fdisk -l
 blkid
 
 nfsiostat  1
 
 sar -b 1
 
 iostat -m -d /dev/md0 1
 
 strace -p pid  # 跟踪进程当前正在执行的系统调用,排查死循环或者卡顿时极为有用
strace -eopen /usr/local/kk-mail/service/dovecot/sbin/dovecot  # 启动该程序并跟踪其 open 系统调用,查看它打开了哪些文件(较新的系统多使用 openat,可改用 -e trace=open,openat)

cat /proc/715765/task/*/stack

/proc/12544/task/12873/stack
systemtap

cat /var/log/Xorg.0.log |grep -i "nvidia"

ceph学习笔记_第2张图片

ceph学习笔记_第3张图片

你可能感兴趣的:(ceph,学习,笔记)