This is essentially CUDA computation: during training the GPU is already a scarce resource, and this processing needs the GPU too, so the two contend for it.
import cupy as cp
from torch.utils.dlpack import to_dlpack, from_dlpack

def to_cupy(tensor):
    """Convert a PyTorch CUDA tensor to a CuPy array (via DLPack)."""
    return cp.fromDlpack(to_dlpack(tensor))

def to_tensor(cp_array):
    """Convert a CuPy array to a PyTorch tensor (via DLPack)."""
    return from_dlpack(cp_array.toDlpack())
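A quick round trip through these helpers (a minimal sketch, assuming a CUDA-capable machine; the tensor has to live on the GPU before conversion):

import torch

t = torch.randn(4, 4).cuda()   # must already be a CUDA tensor
c = to_cupy(t)                 # CuPy view of the same GPU memory
t2 = to_tensor(c * 2)          # back to a PyTorch tensor
print(type(c), t2.shape)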
Concatenation:
cp_data = cp.hstack((cp_data, cp_data))
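A tiny illustration of what the hstack call does to the shape (a throwaway sketch with a made-up array):

a = cp.ones((2, 3), dtype=cp.float32)
b = cp.hstack((a, a))   # columns are concatenated: shape becomes (2, 6)
print(b.shape)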
Converting between NumPy and CuPy:
import cupy as cp
import numpy as np
x_cpu = np.random.randn(100, 100).astype(np.float32)
x_gpu = cp.asarray(x_cpu)
n_data = cp.asnumpy(x_gpu)
print(n_data)
%%timeit
x_cpu*x_cpu
2.41 µs ± 19.7 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
%%timeit
x_gpu*x_gpu
14.3 µs ± 53.4 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
import torch
x_tensor = torch.from_numpy(x_cpu)
%%timeit
x_tensor*x_tensor
3.01 µs ± 33.5 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
x_gpu_tensor = x_tensor.cuda()
%%timeit
x_gpu_tensor*x_gpu_tensor
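One caveat about the GPU numbers: CUDA kernels launch asynchronously, so timings taken without an explicit torch.cuda.synchronize() may not reflect the actual kernel execution time. A rough sketch of timing with synchronization, reusing x_gpu_tensor from above:

import time

torch.cuda.synchronize()             # make sure previously queued work is done
start = time.time()
for _ in range(1000):
    y = x_gpu_tensor * x_gpu_tensor
torch.cuda.synchronize()             # wait for the queued kernels to finish
print((time.time() - start) / 1000, 's per multiply')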
Converting between CuPy and PyTorch:
from torch.utils.dlpack import to_dlpack
from torch.utils.dlpack import from_dlpack
xgpu = torch.zeros(6, 3).cuda()
cp_data = cp.fromDlpack(to_dlpack(xgpu))
indexes = (slice(0, 6), slice(0, 3))
dataa = cp_data[indexes].toDlpack()
dataa = from_dlpack(dataa)
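Note that the DLPack round trip shares memory rather than copying: cp_data above is a view of xgpu, so a write through CuPy is visible from the torch side (a small check, assuming the objects defined just above):

cp_data[0, 0] = 5.0     # write through the CuPy view
print(xgpu[0, 0])       # the original torch tensor sees the change, e.g. tensor(5., device='cuda:0')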
The tensor also has to be on CUDA before it can be converted to CuPy, and that is a problem: with a large dataset there is not enough CUDA memory to hold everything.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import random
import time

from torch.utils.dlpack import to_dlpack
from torch.utils.dlpack import from_dlpack
import cv2
import torch
import numpy as np
import cupy as cp
import SpeedTorch

# Pre-allocated GPU buffers that the two demos copy samples into.
xgpu = torch.zeros(300, 416, 416, 3).cuda()
ygpu = torch.zeros(300, 416, 416, 3).cuda()

dir_path = r"D:\dataset\chumao_train\chumao_head\JPEGImages/"
data_file = 'data.npy'


def read_img(npy_file):
    # Read and resize 100 images, then dump them to a .npy file.
    imgs = []
    files = os.listdir(path=dir_path)
    for i in range(100):
        img = cv2.imread(dir_path + files[i])
        img = cv2.resize(img, (416, 416))
        imgs.append(img)
    imgs = np.asarray(imgs)
    np.save(npy_file, imgs)


def speedtorch_demo():
    # Load the .npy file into CPU pinned memory with SpeedTorch, then copy
    # shuffled samples one by one into the pre-allocated GPU buffer.
    read_img(data_file)
    start = time.time()
    cpu_pinned1 = SpeedTorch.DataGadget(data_file, CPUPinn=True)
    cpu_pinned1.gadgetInit()
    print('init time', time.time() - start)
    batch_size = 100
    for i in range(20):
        li = [i for i in range(100)]
        random.shuffle(li)
        for i_index, j in enumerate(li):
            j_data = cpu_pinned1.getData(indexes=(slice(j, j + 1), slice(0, 416)))  # transfer cpu_pinned to xgpu
            xgpu[i_index % batch_size] = j_data
            if i_index % batch_size == 0:
                start = time.time()
            if i_index % batch_size < batch_size - 1:
                continue
            # print(xgpu[0][0])
            if time.time() - start > 0.001:
                print("dat_time", time.time() - start)
    # print(xgpu[0][0])
    # print('time', time.time() - start)


def torch_demo():
    # Keep all images on the GPU in one tensor, view it as a CuPy array via
    # DLPack, and copy shuffled samples into the other GPU buffer.
    start = time.time()
    imgs = []
    files = os.listdir(path=dir_path)
    for i in range(300):
        img = cv2.imread(dir_path + files[i])
        img = cv2.resize(img, (416, 416))
        imgs.append(img)
    imgs = np.asarray(imgs)
    imgs = torch.from_numpy(imgs).cuda()
    print('init time', time.time() - start)
    cp_data = cp.fromDlpack(to_dlpack(imgs))
    indexes = (slice(0, 300), slice(0, 416))
    # dataa = cp_data[indexes].toDlpack()
    # dataaa = from_dlpack(dataa)
    batch_size = 300
    li = [i for i in range(900)]
    # for i_index, j in enumerate(li):
    #     j_data = cpu_pinned1.getData(indexes=(slice(j, j + 1), slice(0, 416)))  # transfer cpu_pinned to xgpu
    #     xgpu[i_index % batch_size] = j_data
    for i in range(20):
        li = [i for i in range(300)]
        random.shuffle(li)
        start = time.time()
        for i_index, j in enumerate(li):
            indexes = (slice(j, j + 1), slice(0, 416))
            dataa = cp_data[indexes].toDlpack()
            ygpu[i_index % batch_size] = from_dlpack(dataa).cuda()
            # if i_index % batch_size == 0:
            #     start = time.time()
            # if i_index % batch_size < batch_size - 1:
            #     continue
            # print(ygpu[0][0])
        if time.time() - start > 0.001:
            print("dat_time", time.time() - start)


if __name__ == '__main__':
    # speedtorch_demo()
    torch_demo()
    # xgpu = torch.zeros(6, 3).cuda()
    #
    # cp_data = cp.fromDlpack(to_dlpack(xgpu))
    # indexes = (slice(0, 6), slice(0, 3))
    # dataa = cp_data[indexes].toDlpack()
    # dataa = from_dlpack(dataa)
    #
    # for i in range(100):
    #     data = np.random.rand(6, 3)
    #     cpu_pinned1 = SpeedTorch.DataGadget(data, CPUPinn=True)
    #     cpu_pinned1.gadgetInit()
    #     xgpu[:] = cpu_pinned1.getData(indexes=(slice(0, 6), slice(0, 3)))  # transfer cpu_pinned to xgpu
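As noted above, the DLPack/CuPy route needs all the data resident on the GPU first, which breaks down once the dataset exceeds CUDA memory. One possible workaround, sketched below with a hypothetical pinned_batches helper (plain PyTorch, no SpeedTorch), is to keep the full dataset in pinned host memory and move only one batch at a time with a non-blocking copy:

import numpy as np
import torch

def pinned_batches(npy_file, batch_size=32):
    # Hypothetical helper: keep the whole dataset in page-locked (pinned)
    # host memory so that only one batch at a time lives on the GPU.
    data = torch.from_numpy(np.load(npy_file)).pin_memory()
    for s in range(0, data.shape[0], batch_size):
        yield data[s:s + batch_size].cuda(non_blocking=True)

# usage with the script above: for batch in pinned_batches(data_file): ...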