CuPy acceleration

At its core this is CUDA computation. During training the GPU is already a scarce resource, and CuPy needs the GPU too, so the two workloads contend for the same device.
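If the machine has more than one GPU, one way to ease the contention is to pin the CuPy work to a device other than the one used for training. A minimal sketch, assuming a second GPU (device 1) is present:

import cupy as cp
import torch

model_input = torch.zeros(4, 3, 416, 416, device='cuda:0')  # training stays on GPU 0

with cp.cuda.Device(1):                  # CuPy allocations/kernels below target GPU 1
    buf = cp.zeros((4, 3, 416, 416), dtype=cp.float32)
    buf *= 2.0                           # runs on GPU 1, away from the training GPU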

 

import cupy as cp
from torch.utils.dlpack import to_dlpack, from_dlpack


def to_cupy(tensor):
    """Convert a CUDA PyTorch tensor to a CuPy array (zero copy, via DLPack)."""
    return cp.fromDlpack(to_dlpack(tensor))


def to_tensor(cp_array):
    """Convert a CuPy array to a PyTorch tensor (zero copy, via DLPack)."""
    return from_dlpack(cp_array.toDlpack())
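DLPack hands over the underlying GPU buffer instead of copying it, so the tensor and the array alias the same memory. A quick check of the helpers above (a sketch, assuming a CUDA device is available):

import torch

x = torch.ones(2, 3, device='cuda')
a = to_cupy(x)
a *= 5                     # mutate through the CuPy view
print(x)                   # the tensor shows 5s: the memory is shared
x2 = to_tensor(a)          # round trip back to a torch CUDA tensor, still no copy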

Concatenation:

cp_data = cp.hstack((cp_data, cp_data))
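For 2-D arrays, cp.hstack concatenates along the columns, so the second dimension doubles; for example:

a = cp.ones((4, 3))
print(cp.hstack((a, a)).shape)   # (4, 6)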

Converting between NumPy and CuPy:

import cupy as cp
import numpy as np

x_cpu = np.random.randn(100, 100).astype(np.float32)
x_gpu = cp.asarray(x_cpu)    # host -> device copy

n_data = cp.asnumpy(x_gpu)   # device -> host copy
print(n_data)


Timing an elementwise multiply in IPython:

%%timeit
x_cpu * x_cpu
2.41 µs ± 19.7 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)

%%timeit
x_gpu * x_gpu
14.3 µs ± 53.4 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)

import torch
x_tensor = torch.from_numpy(x_cpu)

%%timeit
x_tensor * x_tensor
3.01 µs ± 33.5 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)

x_gpu_tensor = x_tensor.cuda()

%%timeit
x_gpu_tensor * x_gpu_tensor
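On this tiny 100 x 100 array the CPU wins: the arithmetic is so small that kernel-launch overhead dominates the GPU numbers, and GPU kernels run asynchronously, so un-synchronized %%timeit results mostly reflect launch cost rather than compute. A fairer sketch (assuming a CUDA device, a larger array, and explicit synchronization inside the timed region):

import time
import cupy as cp
import torch

x_gpu_big = cp.random.randn(4000, 4000, dtype=cp.float32)
cp.cuda.Stream.null.synchronize()
start = time.time()
for _ in range(100):
    y = x_gpu_big * x_gpu_big
cp.cuda.Stream.null.synchronize()   # wait for queued kernels before reading the clock
print('cupy ', (time.time() - start) / 100)

x_big_tensor = torch.randn(4000, 4000, device='cuda')
torch.cuda.synchronize()
start = time.time()
for _ in range(100):
    y = x_big_tensor * x_big_tensor
torch.cuda.synchronize()            # same rule for torch
print('torch', (time.time() - start) / 100)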

Converting between CuPy and torch:

import torch
from torch.utils.dlpack import to_dlpack
from torch.utils.dlpack import from_dlpack

xgpu = torch.zeros(6, 3).cuda()

cp_data = cp.fromDlpack(to_dlpack(xgpu))   # torch -> cupy, shares the GPU buffer
indexes = (slice(0, 6), slice(0, 3))
dataa = cp_data[indexes].toDlpack()        # slice the CuPy view, export via DLPack
dataa = from_dlpack(dataa)                 # back to a torch CUDA tensor

Note that the data must already live on CUDA before it can be handed to CuPy, and that is the problem here: with a large dataset, CUDA memory is not big enough to hold everything at once.
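One workaround, and essentially what SpeedTorch automates in the script below, is to keep the whole dataset in pinned (page-locked) host memory and stage only one batch at a time onto the GPU. A minimal sketch in plain PyTorch (shapes and batch size are illustrative):

import torch

# full dataset stays in pinned host memory, not on the GPU
dataset = torch.zeros(300, 416, 416, 3, pin_memory=True)
# one reusable staging buffer on the GPU, sized for a single batch
batch_gpu = torch.empty(32, 416, 416, 3, device='cuda')

for start in range(0, 288, 32):                                    # full batches only
    batch_gpu.copy_(dataset[start:start + 32], non_blocking=True)  # async H2D copy
    # ... train on batch_gpu ...

The full experiment script below compares SpeedTorch's pinned-memory gadget against keeping everything on the GPU via CuPy/DLPack: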

#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import random
import time
from torch.utils.dlpack import to_dlpack
from torch.utils.dlpack import from_dlpack
import cv2
import torch
import numpy as np
import cupy as cp
import SpeedTorch

xgpu = torch.zeros(300, 416, 416, 3).cuda()  # preallocated GPU target buffers,
ygpu = torch.zeros(300, 416, 416, 3).cuda()  # ~623 MB each at float32

dir_path = r"D:\dataset\chumao_train\chumao_head\JPEGImages/"
data_file = 'data.npy'


def read_img(npy_file):
	"""Read the first 100 images, resize them to 416x416, and cache them as one .npy file."""
	imgs = []
	files = os.listdir(path=dir_path)
	for i in range(100):
		img = cv2.imread(dir_path + files[i])
		img = cv2.resize(img, (416, 416))
		imgs.append(img)

	imgs = np.asarray(imgs)
	np.save(npy_file, imgs)

def speedtorch_demo():
	read_img(data_file)
	start = time.time()
	# map the cached .npy file into pinned (page-locked) CPU memory
	cpu_pinned1 = SpeedTorch.DataGadget(data_file, CPUPinn=True)
	cpu_pinned1.gadgetInit()
	print('init time', time.time() - start)

	batch_size = 100
	for i in range(20):
		li = list(range(100))
		random.shuffle(li)  # random access order, as in shuffled training
		for i_index, j in enumerate(li):
			# copy one image from pinned CPU memory into the preallocated GPU buffer
			j_data = cpu_pinned1.getData(indexes=(slice(j, j + 1), slice(0, 416)))
			xgpu[i_index % batch_size] = j_data
			if i_index % batch_size == 0:
				start = time.time()
			if i_index % batch_size < batch_size - 1:
				continue
			# a full batch has been transferred; report if it took more than 1 ms
			if time.time() - start > 0.001:
				print("dat_time", time.time() - start)


def torch_demo():
	start = time.time()

	imgs = []
	files = os.listdir(path=dir_path)
	for i in range(300):
		img = cv2.imread(dir_path + files[i])
		img = cv2.resize(img, (416, 416))
		imgs.append(img)

	imgs = np.asarray(imgs)
	imgs = torch.from_numpy(imgs).cuda()  # the whole dataset is pushed to the GPU up front
	print('init time', time.time() - start)

	cp_data = cp.fromDlpack(to_dlpack(imgs))  # torch -> cupy view of the same GPU memory
	batch_size = 300

	for i in range(20):
		li = list(range(300))
		random.shuffle(li)
		start = time.time()
		for i_index, j in enumerate(li):
			indexes = (slice(j, j + 1), slice(0, 416))
			dataa = cp_data[indexes].toDlpack()  # slice the CuPy view, re-export via DLPack
			# from_dlpack already yields a CUDA tensor; no extra .cuda() copy is needed
			ygpu[i_index % batch_size] = from_dlpack(dataa)
		# report if transferring all 300 slices took more than 1 ms
		if time.time() - start > 0.001:
			print("dat_time", time.time() - start)


if __name__ == '__main__':
	# speedtorch_demo()
	torch_demo()


 
