超算中心 | 账号 | 超算资源 | 可用队列分区 |
---|---|---|---|
NC-N30 | [ “scz0au1” ] | 宁夏超算N30分区 - RTX3090, | gpu |
使用北京超级云计算中心平台,直通北京超级云计算中心、中国科学院超算中心等中国科技云资源,具有随需供应,高速稳定链路,开通快、登陆快、操作快、传输快、计算快、分析快等特点;无需使用VPN,一键点击登陆超算;
请点击链接 https://cloud.blsc.cn/ 下载客户端, 使用下列北京超级云计算中心账号登陆;
根据 mmclassification 的环境要求,需要用 anaconda、cuda、gcc 等基础环境模块。在 N30 分区可以使用 module avail 命令查看可用的模块信息。
加载 anaconda ,创建一个 python 3.8 的环境。
# 加载 anaconda/2021.05
module load anaconda/2021.05
# 创建 python=3.8 的环境
conda create --name openmmlab_mmclassification python=3.8
# 激活环境
source activate openmmlab_mmclassification
安装 torch,torch 参考需求。注意在 RTX3090 的GPU上,cuda 版本需要≥ 11.1 。如下安装的 torch是 1.10.0+cu111 。使用 pip 安装的torch 不包括 cuda,所以需要使用 module 加载 cuda/11.1 模块。
# 加载 cuda/11.1
module load cuda/11.1
# 安装 torch
pip install torch==1.10.0+cu111 torchvision==0.11.0+cu111 torchaudio==0.10.0 -f https://download.pytorch.org/whl/torch_stable.html
安装 mmcv-full 模块,mmcv-full 模块安装时候需要注意 torch 和 cuda 版本。参考。
pip install mmcv-full==1.7.0 -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.10/index.html
安装 openmmlab/mmclassification 模块,建议通过下载编译的方式进行安装;安装该模块需要 gcc ≥ 5,使用 module 加载一个 gcc ,例如 module load gcc/7.3 。
# 加载 gcc/7.3 模块
module load gcc/7.3
# git 下载 mmclassification 代码
git clone https://github.com/open-mmlab/mmclassification.git
# 编译安装
cd mmclassification
pip install -e .
划分代码 split_data.py 如下,执行:
python split_data.py 源数据集路径 目标数据集路径
# -*- coding: utf-8 -*-
import os
import sys
import shutil
import numpy as np
def load_data(data_path):
    """Scan *data_path* for class sub-directories and collect image paths.

    Each immediate sub-directory of ``data_path`` is treated as one class;
    non-directory entries at the top level are skipped, as are non-file
    entries inside a class directory.

    Returns a dict mapping class-directory name -> list of file paths.
    Prints a per-class count and the grand total as a side effect.
    """
    dataset = {}
    total = 0
    for class_name in os.listdir(data_path):
        class_dir = os.path.join(data_path, class_name)
        if not os.path.isdir(class_dir):
            continue  # skip stray files sitting next to the class folders
        files = [
            os.path.join(class_dir, entry)
            for entry in os.listdir(class_dir)
            if os.path.isfile(os.path.join(class_dir, entry))
        ]
        dataset[class_name] = files
        total += len(files)
        print("{} :{}".format(class_name, len(files)))
    print("total of image : {}".format(total))
    return dataset
def copy_dataset(src_img_list, data_index, target_path):
    """Copy the images selected by *data_index* into *target_path*.

    ``data_index`` is an iterable of indices into ``src_img_list``;
    ``target_path`` must be an existing directory.

    Returns the list of destination file paths, in index order.
    """
    copied = []
    for idx in data_index:
        source = src_img_list[idx]
        shutil.copy(source, target_path)
        basename = os.path.split(source)[-1]
        copied.append(os.path.join(target_path, basename))
    return copied
def write_file(data, file_name):
    """Write *data* to *file_name*, one entry per line.

    A dict is flattened into ``"<img> <label>"`` lines, where the label is
    the dict key and each value is a list of image paths; any other
    iterable of strings is written verbatim. Prints a confirmation message.
    """
    if isinstance(data, dict):
        lines = [
            "{} {}".format(img, lab)
            for lab, img_list in data.items()
            for img in img_list
        ]
    else:
        lines = data
    with open(file_name, "w") as f:
        f.writelines(line + "\n" for line in lines)
    print("{} write over!".format(file_name))
def split_data(src_data_path, target_data_path, train_rate=0.8):
    """Randomly split a class-per-folder dataset into train/val subsets.

    For every class directory under ``src_data_path`` the images are
    shuffled and copied to ``target_data_path/train/<class>`` and
    ``target_data_path/val/<class>``; ``train_rate`` is the fraction of
    each class kept for training. Also writes ``classes.txt``,
    ``train.txt`` and ``val.txt`` annotation files (labels are the
    integer class indices in iteration order).
    """
    src_data_dict = load_data(src_data_path)
    classes = []
    train_dataset, val_dataset = {}, {}
    train_count = val_count = 0
    for label, (cls_name, img_list) in enumerate(src_data_dict.items()):
        n_imgs = len(img_list)
        # Permute the indices so each split is a random sample of the class.
        shuffled = np.random.choice(n_imgs, n_imgs, replace=False)
        n_train = int(n_imgs * train_rate)
        train_dir = os.path.join(target_data_path, "train", cls_name)
        val_dir = os.path.join(target_data_path, "val", cls_name)
        os.makedirs(train_dir, exist_ok=True)
        os.makedirs(val_dir, exist_ok=True)
        classes.append(cls_name)
        train_dataset[label] = copy_dataset(img_list, shuffled[:n_train], train_dir)
        val_dataset[label] = copy_dataset(img_list, shuffled[n_train:], val_dir)
        print("target {} train:{}, val:{}".format(cls_name, len(train_dataset[label]), len(val_dataset[label])))
        train_count += len(train_dataset[label])
        val_count += len(val_dataset[label])
    print("train size:{}, val size:{}, total:{}".format(train_count, val_count, train_count + val_count))
    write_file(classes, os.path.join(target_data_path, "classes.txt"))
    write_file(train_dataset, os.path.join(target_data_path, "train.txt"))
    write_file(val_dataset, os.path.join(target_data_path, "val.txt"))
def main():
    """CLI entry point: ``python split_data.py <src_dir> <target_dir>``."""
    src_path, target_path = sys.argv[1], sys.argv[2]
    split_data(src_path, target_path, train_rate=0.8)


if __name__ == '__main__':
    main()
# 执行命令
# python split_data.py [ 源数据集路径 ] [ 目标数据集路径 ]
# python split_data.py /HOME/scz0au1/run/mmclassification/configs/resnet18/flower_dataset /HOME/scz0au1/run/mmclassification/configs/resnet18/data
如下内容可命名为 resnet18_b32_flower.py,在 mmclassification/configs 下创建 resnet18 目录,将该文件放到里面。
# MMClassification config: fine-tune ResNet-18 on the 5-class flower dataset.
# Inherit the model, the ImageNet bs32 data pipeline and the default runtime
# from the MMClassification _base_ configs.
_base_ = ['../_base_/models/resnet18.py', '../_base_/datasets/imagenet_bs32.py',
'../_base_/default_runtime.py']
# Override the classification head: 5 flower classes, report top-1 only.
model = dict(
head=dict(
num_classes=5,
topk = (1,)
))
# Dataset settings: batch size 32 per GPU, 2 data-loading workers,
# using the annotation files produced by split_data.py.
data = dict(
samples_per_gpu = 32,
workers_per_gpu = 2,
train = dict(
data_prefix = '/HOME/scz0au1/run/mmclassification/configs/resnet18/data/train',
ann_file = '/HOME/scz0au1/run/mmclassification/configs/resnet18/data/train.txt',
classes = '/HOME/scz0au1/run/mmclassification/configs/resnet18/data/classes.txt'
),
val = dict(
data_prefix = '/HOME/scz0au1/run/mmclassification/configs/resnet18/data/val',
ann_file = '/HOME/scz0au1/run/mmclassification/configs/resnet18/data/val.txt',
classes = '/HOME/scz0au1/run/mmclassification/configs/resnet18/data/classes.txt'
)
)
# SGD fine-tuning schedule: small learning rate since we start from a
# pretrained checkpoint; no gradient clipping.
optimizer = dict(type='SGD', lr=0.001, momentum=0.9, weight_decay=0.0001)
optimizer_config = dict(grad_clip=None)
# Step LR decay after epoch 1.
lr_config = dict(
policy='step',
step=[1])
runner = dict(type='EpochBasedRunner', max_epochs=100)
# Pretrained model checkpoint to fine-tune from.
load_from ='/HOME/scz0au1/run/mmclassification/checkpoints/resnet18_batch256_imagenet_20200708-34ab8f90.pth'
在环境、数据集、MMCls 配置文件准备完成之后就可以提交计算。在 N30 提交计算可以通过作业脚本的方式,操作步骤如下:
#!/bin/bash
# Slurm job script: train resnet18_b32_flower with MMClassification.
# Load the toolchain modules required by the conda environment.
module load anaconda/2021.05
module load cuda/11.1
module load gcc/7.3
# Activate the environment created earlier.
source activate openmmlab_mmclassification
# Flush Python log output immediately so slurm-*.out updates in real time.
export PYTHONUNBUFFERED=1
# Train the model. NOTE: the trailing backslash after the config path was
# missing in the original, which made --work-dir run as a separate command;
# both continuation backslashes are required to keep this one command.
python tools/train.py \
    configs/resnet18/resnet18_b32_flower.py \
    --work-dir work/resnet18_b32_flower
sbatch --gpus=1 run.sh
parajobs
# 288888为提交作业ID
tail -f slurm-288888.out
squeue
# 获得作业使用的节点g00066
ssh g00066
nvidia-smi