公司发的电脑,ThinkPad X13,8核16G。通过VM搭建了一个Ubuntu 18.04的Linux环境,配置为8核16G+100G存储。
环境如下
root@ubuntu:~/kubeflow-manifests# uname -a
Linux ubuntu 4.15.0-180-generic #189-Ubuntu SMP Wed May 18 14:13:57 UTC 2022 x86_64 x86_64 x86_64 GNU/Linux
开启root ssh
sudo passwd root
vi /etc/ssh/sshd_config # 将 PermitRootLogin 设置为 yes
/etc/init.d/ssh restart
虚拟机配置如下
查看物理CPU个数
查看每个物理CPU中core的个数(即核数)
查看逻辑CPU的个数
查看线程数
root@ubuntu:~# cat /proc/cpuinfo| grep "physical id"| sort| uniq| wc -l
4
root@ubuntu:~# cat /proc/cpuinfo| grep "cpu cores"| uniq
cpu cores : 2
root@ubuntu:~# cat /proc/cpuinfo| grep "processor"| wc -l
8
root@ubuntu:~# grep 'processor' /proc/cpuinfo | sort -u | wc -l
8
内存
root@ubuntu:~# free -g
total used free shared buff/cache available
Mem: 15 0 14 0 0 15
Swap: 0 0 0
存储
root@ubuntu:~# df -h
Filesystem Size Used Avail Use% Mounted on
udev 7.8G 0 7.8G 0% /dev
tmpfs 1.6G 1.3M 1.6G 1% /run
/dev/mapper/ubuntu--vg-ubuntu--lv 97G 6.3G 86G 7% /
tmpfs 7.9G 0 7.9G 0% /dev/shm
tmpfs 5.0M 0 5.0M 0% /run/lock
tmpfs 7.9G 0 7.9G 0% /sys/fs/cgroup
/dev/sda2 976M 80M 830M 9% /boot
tmpfs 1.6G 0 1.6G 0% /run/user/0
按照之前的虚拟机安装文档,依次完成了时间同步和docker安装,docker信息如下
root@ubuntu:~# docker info
Client:
Context: default
Debug Mode: false
Plugins:
app: Docker App (Docker Inc., v0.9.1-beta3)
buildx: Docker Buildx (Docker Inc., v0.8.2-docker)
scan: Docker Scan (Docker Inc., v0.17.0)
Server:
Containers: 0
Running: 0
Paused: 0
Stopped: 0
Images: 0
Server Version: 20.10.16
Storage Driver: overlay2
Backing Filesystem: extfs
Supports d_type: true
Native Overlay Diff: true
userxattr: false
Logging Driver: json-file
Cgroup Driver: systemd
Cgroup Version: 1
Plugins:
Volume: local
Network: bridge host ipvlan macvlan null overlay
Log: awslogs fluentd gcplogs gelf journald json-file local logentries splunk syslog
Swarm: inactive
Runtimes: io.containerd.runc.v2 io.containerd.runtime.v1.linux runc
Default Runtime: runc
Init Binary: docker-init
containerd version: 212e8b6fa2f44b9c21b2798135fc6fb7c53efc16
runc version: v1.1.1-0-g52de29d
init version: de40ad0
Security Options:
apparmor
seccomp
Profile: default
Kernel Version: 4.15.0-180-generic
Operating System: Ubuntu 18.04.6 LTS
OSType: linux
Architecture: x86_64
CPUs: 8
Total Memory: 15.64GiB
Name: ubuntu
ID: QOFG:2C6N:RLFE:N4CR:TCYZ:WYS6:YHMC:BNX5:47LP:7M6L:LTBK:BCBT
Docker Root Dir: /var/lib/docker
Debug Mode: false
Registry: https://index.docker.io/v1/
Labels:
Experimental: false
Insecure Registries:
127.0.0.0/8
Registry Mirrors:
https://fzy3wxn0.mirror.aliyuncs.com/
Live Restore Enabled: false
WARNING: No swap limit support
curl -Lo ./kind https://kind.sigs.k8s.io/dl/v0.11.1/kind-linux-amd64
chmod +x ./kind
mv kind /usr/bin/
which kind
设置kind命令自动补全
source /usr/share/bash-completion/bash_completion
source <(kubectl completion bash)
echo 'source <(kind completion bash)' >>~/.bashrc
去Docker Hub下载kind节点镜像,开始
docker pull kindest/node:v1.19.1
下载kubeflow安装文件
git clone https://github.com/shikanon/kubeflow-manifests.git
安装k8s
cd kubeflow-manifests
kind create cluster --config=kind/kind-config.yaml --name=kubeflow --image=kindest/node:v1.19.1
安装kubectl
为了能够查看pods,还要安装kubectl工具
# 使得 apt 支持 ssl 传输
apt-get update && apt-get install -y apt-transport-https
curl https://mirrors.aliyun.com/kubernetes/apt/doc/apt-key.gpg | apt-key add -
cat <<EOF >/etc/apt/sources.list.d/kubernetes.list
deb https://mirrors.aliyun.com/kubernetes/apt/ kubernetes-xenial main
EOF
apt-get update
apt-cache madison kubectl kubeadm kubelet
apt-get install kubectl=1.19.1-00
结果
root@ubuntu:~/kubeflow-manifests# kubectl get pods -A
NAMESPACE NAME READY STATUS RESTARTS AGE
kube-system coredns-74ff55c5b-2fxs8 1/1 Running 0 23m
kube-system coredns-74ff55c5b-bp7c2 1/1 Running 0 23m
kube-system etcd-kubeflow-control-plane 1/1 Running 0 23m
kube-system kindnet-hhbgz 1/1 Running 0 23m
kube-system kube-apiserver-kubeflow-control-plane 1/1 Running 0 23m
kube-system kube-controller-manager-kubeflow-control-plane 1/1 Running 0 23m
kube-system kube-proxy-7jqgd 1/1 Running 0 23m
kube-system kube-scheduler-kubeflow-control-plane 1/1 Running 0 23m
local-path-storage local-path-provisioner-78776bfc44-bx4ws 1/1 Running 0 23m
注意
kind 会自动创建存储类(StorageClass);如果是普通的k8s集群模式,则需要先准备好StorageClass,没有的话提前创建一下。
kubectl create -f localpath/
python3 install.py
安装过程中,017-***.yaml 文件会报如下错误
error: unable to recognize "./manifest1.3/017-pipeline-env-platform-agnostic-multi-user.yaml": no matches for kind "CompositeController" in version "metacontroller.k8s.io/v1alpha1"
出现该错误时,重新apply一下该文件即可(推测是首次执行时 CompositeController 对应的CRD尚未创建完成,待确认)
kubectl apply -f manifest1.3/017-pipeline-env-platform-agnostic-multi-user.yaml
kubectl apply -f patch/
然后等待就可以了,一共用了一个小时
结果
root@ubuntu:~/kubeflow-manifests# kubectl get pods -A
NAMESPACE NAME READY STATUS RESTARTS AGE
auth dex-6d8cd4fccb-ktjdl 1/1 Running 0 27m
cert-manager cert-manager-649f8dfd4b-89cwl 1/1 Running 0 33m
cert-manager cert-manager-cainjector-75cd8bbf6d-wzmbb 1/1 Running 0 33m
cert-manager cert-manager-webhook-5b5cd9bd6f-4j4mn 1/1 Running 1 33m
istio-system authservice-0 1/1 Running 0 32m
istio-system cluster-local-gateway-74d9fd9586-wpc7p 1/1 Running 0 27m
istio-system istio-ingressgateway-8bf685655-8bf5z 1/1 Running 0 27m
istio-system istiod-756554b96b-hf697 1/1 Running 0 27m
knative-eventing broker-controller-cfb5ccb77-57nqr 1/1 Running 0 31m
knative-eventing eventing-controller-8657cd4b8-xw5qq 1/1 Running 0 31m
knative-eventing eventing-webhook-67f86f4d4d-grm27 1/1 Running 0 31m
knative-eventing imc-controller-68bd666784-927rr 1/1 Running 0 31m
knative-eventing imc-dispatcher-78ff9dd847-c4kjs 1/1 Running 0 31m
knative-serving activator-54b777546f-zjdlq 1/1 Running 1 31m
knative-serving autoscaler-79bbc84d47-g8dp4 1/1 Running 0 31m
knative-serving controller-dd65cb4b7-82ngq 1/1 Running 0 31m
knative-serving istio-webhook-5f545fc44b-bgthr 1/1 Running 0 31m
knative-serving networking-istio-6b6df495d6-wdfwq 1/1 Running 0 31m
knative-serving webhook-9ff656f95-qp89s 1/1 Running 0 31m
kube-system coredns-74ff55c5b-2fxs8 1/1 Running 0 58m
kube-system coredns-74ff55c5b-bp7c2 1/1 Running 0 58m
kube-system etcd-kubeflow-control-plane 1/1 Running 0 58m
kube-system kindnet-hhbgz 1/1 Running 0 58m
kube-system kube-apiserver-kubeflow-control-plane 1/1 Running 0 58m
kube-system kube-controller-manager-kubeflow-control-plane 1/1 Running 0 58m
kube-system kube-proxy-7jqgd 1/1 Running 0 58m
kube-system kube-scheduler-kubeflow-control-plane 1/1 Running 0 58m
kubeflow-user-example-com ml-pipeline-ui-artifact-6b9bb7f495-8s9kh 2/2 Running 0 4m34s
kubeflow-user-example-com ml-pipeline-visualizationserver-5c648f8448-7hlmt 2/2 Running 0 4m34s
kubeflow admission-webhook-deployment-5f5cc7968b-rzbsz 1/1 Running 0 29m
kubeflow cache-deployer-deployment-64598b6c87-7crx4 2/2 Running 1 30m
kubeflow cache-server-59d67c7584-r7wkb 2/2 Running 0 27m
kubeflow centraldashboard-7b6b6cc7fc-tk7mj 1/1 Running 0 29m
kubeflow jupyter-web-app-deployment-7c6974bb88-2bz7z 1/1 Running 0 27m
kubeflow katib-controller-7b784c44dd-lbsgm 1/1 Running 0 30m
kubeflow katib-db-manager-6c5757dc64-zdpdd 1/1 Running 3 30m
kubeflow katib-mysql-79d75c7444-fjzwq 1/1 Running 0 30m
kubeflow katib-ui-69f5b6795d-b7lx7 1/1 Running 0 30m
kubeflow kfserving-controller-manager-0 2/2 Running 0 30m
kubeflow kubeflow-pipelines-profile-controller-7699846fd7-sfw2d 1/1 Running 0 27m
kubeflow metacontroller-0 1/1 Running 0 30m
kubeflow metadata-envoy-deployment-56f745f7fb-j66bw 1/1 Running 0 30m
kubeflow metadata-grpc-deployment-6494577fdb-d57jf 2/2 Running 3 30m
kubeflow metadata-writer-b7ff9787-5fngh 2/2 Running 1 30m
kubeflow minio-cc8f7c6d-khbxw 2/2 Running 0 25m
kubeflow ml-pipeline-66bcb9d79d-jnfnc 2/2 Running 5 30m
kubeflow ml-pipeline-persistenceagent-7fb8f6dc68-nlhsp 2/2 Running 1 30m
kubeflow ml-pipeline-scheduledworkflow-64bcfd6596-8ncr7 2/2 Running 0 30m
kubeflow ml-pipeline-ui-8578f6685f-d4llg 2/2 Running 0 30m
kubeflow ml-pipeline-viewer-crd-565fb9b5c5-nlbdv 2/2 Running 1 30m
kubeflow ml-pipeline-visualizationserver-b7c7d49fb-h72g7 2/2 Running 0 30m
kubeflow mpi-operator-794849c566-jksfc 1/1 Running 0 28m
kubeflow mxnet-operator-6668d797d4-72f2d 1/1 Running 0 27m
kubeflow mysql-9dfc684cd-4dcs9 2/2 Running 0 30m
kubeflow notebook-controller-deployment-6795dd887b-w5dnn 1/1 Running 0 29m
kubeflow profiles-deployment-84bd4f9bc7-558bw 2/2 Running 0 29m
kubeflow pytorch-operator-6887749499-t2b5f 2/2 Running 0 28m
kubeflow tensorboard-controller-controller-manager-dd896c8df-9gqc5 3/3 Running 1 28m
kubeflow tensorboards-web-app-deployment-5969cd5b68-btxk4 1/1 Running 0 27m
kubeflow tf-job-operator-ccb48b77b-ptppw 1/1 Running 0 28m
kubeflow volumes-web-app-deployment-867dfb5b5c-qtfkz 1/1 Running 0 27m
kubeflow workflow-controller-6885c56f65-c45qw 2/2 Running 1 27m
kubeflow xgboost-operator-deployment-665cf9bf8d-5kzrd 2/2 Running 1 27m
local-path-storage local-path-provisioner-78776bfc44-bx4ws 1/1 Running 0 58m
登录用户名是一个邮箱地址(原文中的地址被网页的邮箱保护功能遮蔽;根据上面的命名空间 kubeflow-user-example-com 推测为 user@example.com,请以仓库README为准),密码是password
kind delete cluster --name kubeflow # 卸载集群
apt-get --purge remove kubectl # 卸载软件包
apt-get autoremove kubectl # 卸载相关依赖
在kubeflow官网,有很多的repo,找到manifests文件
todo
1. kind安装k8s集群
2. 玩转Kubeflow第一章: kubeflow 国内本地安装及案例介绍
3. 手把手教你搭建Kubeflow——基于K8s的机器学习平台
其中第3篇文章挺好的,会把安装的内容拆解开来讲。