The following test code, adapted from the samplePlugin.cpp file shipped with TensorRT 2.1.2, adds a fully connected layer through IPlugin to recognize the handwritten digits 0-9. The contents of plugin.cpp are as follows:
#include <assert.h>
#include <string.h>
#include <stdio.h>
#include <string>
#include <vector>
#include <tuple>
#include <memory>
#include <cuda_runtime_api.h>
#include <cudnn.h>
#include <cublas_v2.h>
#include <NvInfer.h>
#include <NvCaffeParser.h>
#include <opencv2/opencv.hpp>
#include "common.hpp"
// reference: TensorRT-2.1.2/samples/samplePlugin/samplePlugin.cpp
// demonstrates how to add a custom layer to TensorRT. It replaces the final fully connected layer of the MNIST sample with a direct call to cuBLAS
namespace {
typedef std::tuple<int, int, int, std::string, std::string> DATA_INFO; // input width, input height, output size, input blob name, output blob name
int caffeToGIEModel(const std::string& deployFile, // name for caffe prototxt
const std::string& modelFile, // name for model
const std::vector<std::string>& outputs, // network outputs
unsigned int maxBatchSize, // batch size - NB must be at least as large as the batch we want to run with
nvcaffeparser1::IPluginFactory* pluginFactory, // factory for plugin layers
nvinfer1::IHostMemory *&gieModelStream, Logger logger) // output stream for the GIE model
{
// create the builder
nvinfer1::IBuilder* builder = nvinfer1::createInferBuilder(logger);
// parse the caffe model to populate the network, then set the outputs
nvinfer1::INetworkDefinition* network = builder->createNetwork();
nvcaffeparser1::ICaffeParser* parser = nvcaffeparser1::createCaffeParser();
parser->setPluginFactory(pluginFactory);
bool fp16 = builder->platformHasFastFp16();
const nvcaffeparser1::IBlobNameToTensor* blobNameToTensor = parser->parse(deployFile.c_str(),
modelFile.c_str(), *network, fp16 ? nvinfer1::DataType::kHALF : nvinfer1::DataType::kFLOAT);
// specify which tensors are outputs
for (auto& s : outputs)
network->markOutput(*blobNameToTensor->find(s.c_str()));
// Build the engine
builder->setMaxBatchSize(maxBatchSize);
builder->setMaxWorkspaceSize(1 << 20);
builder->setHalf2Mode(fp16);
nvinfer1::ICudaEngine* engine = builder->buildCudaEngine(*network);
CHECK(engine != nullptr);
// we don't need the network any more, and we can destroy the parser
network->destroy();
parser->destroy();
// serialize the engine, then close everything down
gieModelStream = engine->serialize();
engine->destroy();
builder->destroy();
nvcaffeparser1::shutdownProtobufLibrary();
return 0;
}
int doInference(nvinfer1::IExecutionContext& context, float* input, float* output, int batchSize, const DATA_INFO& info)
{
const nvinfer1::ICudaEngine& engine = context.getEngine();
// input and output buffer pointers that we pass to the engine - the engine requires exactly IEngine::getNbBindings()
// of these, but in this case we know that there is exactly one input and one output.
CHECK(engine.getNbBindings() == 2);
void* buffers[2];
// In order to bind the buffers, we need to know the names of the input and output tensors.
// note that indices are guaranteed to be less than IEngine::getNbBindings()
int inputIndex = engine.getBindingIndex(std::get<3>(info).c_str()),
outputIndex = engine.getBindingIndex(std::get<4>(info).c_str());
// create GPU buffers and a stream
cudaMalloc(&buffers[inputIndex], batchSize * std::get<1>(info) * std::get<0>(info) * sizeof(float));
cudaMalloc(&buffers[outputIndex], batchSize * std::get<2>(info) * sizeof(float));
cudaStream_t stream;
cudaStreamCreate(&stream);
// DMA the input to the GPU, execute the batch asynchronously, and DMA it back:
cudaMemcpyAsync(buffers[inputIndex], input, batchSize * std::get<1>(info) * std::get<0>(info) * sizeof(float), cudaMemcpyHostToDevice, stream);
context.enqueue(batchSize, buffers, stream, nullptr);
cudaMemcpyAsync(output, buffers[outputIndex], batchSize * std::get<2>(info)*sizeof(float), cudaMemcpyDeviceToHost, stream);
cudaStreamSynchronize(stream);
// release the stream and the buffers
cudaStreamDestroy(stream);
cudaFree(buffers[inputIndex]);
cudaFree(buffers[outputIndex]);
return 0;
}
class FCPlugin: public nvinfer1::IPlugin
{
public:
FCPlugin(const nvinfer1::Weights* weights, int nbWeights, int nbOutputChannels) : mNbOutputChannels(nbOutputChannels)
{
// since we want to deal with the case where there is no bias, we can't infer
// the number of channels from the bias weights.
assert(nbWeights == 2);
mKernelWeights = copyToDevice(weights[0].values, weights[0].count);
mBiasWeights = copyToDevice(weights[1].values, weights[1].count);
assert(mBiasWeights.count == 0 || mBiasWeights.count == nbOutputChannels);
mNbInputChannels = int(weights[0].count / nbOutputChannels);
}
// create the plugin at runtime from a byte stream
FCPlugin(const void* data, size_t length)
{
const char* d = reinterpret_cast<const char*>(data), *a = d;
mNbInputChannels = read<int>(d);
mNbOutputChannels = read<int>(d);
int biasCount = read<int>(d);
mKernelWeights = deserializeToDevice(d, mNbInputChannels * mNbOutputChannels);
mBiasWeights = deserializeToDevice(d, biasCount);
assert(d == a + length);
}
~FCPlugin()
{
cudaFree(const_cast<void*>(mKernelWeights.values));
cudaFree(const_cast<void*>(mBiasWeights.values));
}
int getNbOutputs() const override
{
return 1;
}
nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs, int nbInputDims) override
{
assert(index == 0 && nbInputDims == 1 && inputs[0].nbDims == 3);
assert(mNbInputChannels == inputs[0].d[0] * inputs[0].d[1] * inputs[0].d[2]);
return nvinfer1::DimsCHW(mNbOutputChannels, 1, 1);
}
void configure(const nvinfer1::Dims* inputDims, int nbInputs, const nvinfer1::Dims* outputDims, int nbOutputs, int maxBatchSize) override
{
}
int initialize() override
{
cudnnCreate(&mCudnn); // initialize cudnn and cublas
cublasCreate(&mCublas);
cudnnCreateTensorDescriptor(&mSrcDescriptor); // create cudnn tensor descriptors we need for bias addition
cudnnCreateTensorDescriptor(&mDstDescriptor);
return 0;
}
virtual void terminate() override
{
cublasDestroy(mCublas);
cudnnDestroy(mCudnn);
}
virtual size_t getWorkspaceSize(int maxBatchSize) const override
{
return 0;
}
virtual int enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream) override
{
float kONE = 1.0f, kZERO = 0.0f;
cublasSetStream(mCublas, stream);
cudnnSetStream(mCudnn, stream);
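// cuBLAS is column-major: this SGEMM computes output[mNbOutputChannels x batchSize] =
// kernel^T[mNbOutputChannels x mNbInputChannels] * input[mNbInputChannels x batchSize],
// i.e. the fully connected layer y = W * x for each sample in the batch (bias is added below via cuDNN)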
cublasSgemm(mCublas, CUBLAS_OP_T, CUBLAS_OP_N, mNbOutputChannels, batchSize, mNbInputChannels, &kONE,
reinterpret_cast<const float*>(mKernelWeights.values), mNbInputChannels,
reinterpret_cast<const float*>(inputs[0]), mNbInputChannels, &kZERO,
reinterpret_cast<float*>(outputs[0]), mNbOutputChannels);
if (mBiasWeights.count) {
cudnnSetTensor4dDescriptor(mSrcDescriptor, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, mNbOutputChannels, 1, 1);
cudnnSetTensor4dDescriptor(mDstDescriptor, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batchSize, mNbOutputChannels, 1, 1);
cudnnAddTensor(mCudnn, &kONE, mSrcDescriptor, mBiasWeights.values, &kONE, mDstDescriptor, outputs[0]);
}
return 0;
}
virtual size_t getSerializationSize() override
{
// 3 integers (number of input channels, number of output channels, bias size), and then the weights:
return sizeof(int)*3 + mKernelWeights.count*sizeof(float) + mBiasWeights.count*sizeof(float);
}
virtual void serialize(void* buffer) override
{
char* d = reinterpret_cast<char*>(buffer), *a = d;
write(d, mNbInputChannels);
write(d, mNbOutputChannels);
write(d, (int)mBiasWeights.count);
serializeFromDevice(d, mKernelWeights);
serializeFromDevice(d, mBiasWeights);
assert(d == a + getSerializationSize());
}
private:
template<typename T> void write(char*& buffer, const T& val)
{
*reinterpret_cast<T*>(buffer) = val;
buffer += sizeof(T);
}
template<typename T> T read(const char*& buffer)
{
T val = *reinterpret_cast<const T*>(buffer);
buffer += sizeof(T);
return val;
}
nvinfer1::Weights copyToDevice(const void* hostData, size_t count)
{
void* deviceData;
cudaMalloc(&deviceData, count * sizeof(float));
cudaMemcpy(deviceData, hostData, count * sizeof(float), cudaMemcpyHostToDevice);
return nvinfer1::Weights{ nvinfer1::DataType::kFLOAT, deviceData, int64_t(count) };
}
void serializeFromDevice(char*& hostBuffer, nvinfer1::Weights deviceWeights)
{
cudaMemcpy(hostBuffer, deviceWeights.values, deviceWeights.count * sizeof(float), cudaMemcpyDeviceToHost);
hostBuffer += deviceWeights.count * sizeof(float);
}
nvinfer1::Weights deserializeToDevice(const char*& hostBuffer, size_t count)
{
nvinfer1::Weights w = copyToDevice(hostBuffer, count);
hostBuffer += count * sizeof(float);
return w;
}
int mNbOutputChannels, mNbInputChannels;
cudnnHandle_t mCudnn;
cublasHandle_t mCublas;
nvinfer1::Weights mKernelWeights, mBiasWeights;
cudnnTensorDescriptor_t mSrcDescriptor, mDstDescriptor;
};
// integration for serialization
class PluginFactory : public nvinfer1::IPluginFactory, public nvcaffeparser1::IPluginFactory
{
public:
// caffe parser plugin implementation
bool isPlugin(const char* name) override
{
return !strcmp(name, "ip2");
}
virtual nvinfer1::IPlugin* createPlugin(const char* layerName, const nvinfer1::Weights* weights, int nbWeights) override
{
// there's no way to pass parameters through from the model definition, so we have to define it here explicitly
static const int NB_OUTPUT_CHANNELS = 10;
assert(isPlugin(layerName) && nbWeights == 2 && weights[0].type == nvinfer1::DataType::kFLOAT && weights[1].type == nvinfer1::DataType::kFLOAT);
assert(mPlugin.get() == nullptr);
mPlugin = std::unique_ptr<FCPlugin>(new FCPlugin(weights, nbWeights, NB_OUTPUT_CHANNELS));
return mPlugin.get();
}
// deserialization plugin implementation
nvinfer1::IPlugin* createPlugin(const char* layerName, const void* serialData, size_t serialLength) override
{
assert(isPlugin(layerName));
assert(mPlugin.get() == nullptr);
mPlugin = std::unique_ptr<FCPlugin>(new FCPlugin(serialData, serialLength));
return mPlugin.get();
}
// the application has to destroy the plugin when it knows it's safe to do so
void destroyPlugin()
{
mPlugin.release();
}
std::unique_ptr<FCPlugin> mPlugin{ nullptr };
};
} // namespace
int test_plugin()
{
// stuff we know about the network and the caffe input/output blobs
const DATA_INFO info(28, 28, 10, "data", "prob");
const std::string deploy_file {"models/mnist.prototxt"};
const std::string model_file {"models/mnist.caffemodel"};
const std::string mean_file {"models/mnist_mean.binaryproto"};
Logger logger; // multiple instances of IRuntime and/or IBuilder must all use the same logger
// create a GIE model from the caffe model and serialize it to a stream
PluginFactory pluginFactory;
nvinfer1::IHostMemory* gieModelStream{ nullptr };
caffeToGIEModel(deploy_file, model_file, std::vector<std::string>{std::get<4>(info)}, 1, &pluginFactory, gieModelStream, logger);
pluginFactory.destroyPlugin();
nvcaffeparser1::ICaffeParser* parser = nvcaffeparser1::createCaffeParser();
nvcaffeparser1::IBinaryProtoBlob* meanBlob = parser->parseBinaryProto(mean_file.c_str());
parser->destroy();
// deserialize the engine
nvinfer1::IRuntime* runtime = nvinfer1::createInferRuntime(logger);
nvinfer1::ICudaEngine* engine = runtime->deserializeCudaEngine(gieModelStream->data(), gieModelStream->size(), &pluginFactory);
nvinfer1::IExecutionContext *context = engine->createExecutionContext();
// parse the mean file and subtract it from the image
const float* meanData = reinterpret_cast<const float*>(meanBlob->getData());
const std::string image_path{ "images/digit/" };
for (int i = 0; i < 10; ++i) {
const std::string image_name = image_path + std::to_string(i) + ".png";
cv::Mat mat = cv::imread(image_name, 0);
if (!mat.data) {
fprintf(stderr, "read image fail: %s\n", image_name.c_str());
return -1;
}
cv::resize(mat, mat, cv::Size(std::get<0>(info), std::get<1>(info)));
mat.convertTo(mat, CV_32FC1);
float data[std::get<1>(info)*std::get<0>(info)];
const float* p = (float*)mat.data;
for (int j = 0; j < std::get<1>(info)*std::get<0>(info); ++j) {
data[j] = p[j] - meanData[j];
}
// run inference
float prob[std::get<2>(info)];
doInference(*context, data, prob, 1, info);
float val{-1.f};
int idx{-1};
for (int t = 0; t < std::get<2>(info); ++t) {
if (val < prob[t]) {
val = prob[t];
idx = t;
}
}
fprintf(stdout, "expected value: %d, actual value: %d, probability: %f\n", i, idx, val);
}
meanBlob->destroy();
if (gieModelStream) gieModelStream->destroy();
// destroy the engine
context->destroy();
engine->destroy();
runtime->destroy();
pluginFactory.destroyPlugin();
return 0;
}
The test images are shown below:
The execution results are as follows (consistent with the results in http://blog.csdn.net/fengbingchun/article/details/78552908):
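The listing above depends on common.hpp for two things that are not shown: the Logger passed to createInferBuilder/createInferRuntime (which must implement nvinfer1::ILogger) and the CHECK macro used for sanity checks. A minimal sketch of such a header follows; it is an assumption based on how these symbols are used here, not the blog's actual common.hpp:
// common.hpp - minimal sketch (assumed, not the original file)
#pragma once
#include <stdio.h>
#include <stdlib.h>
#include <NvInfer.h>
// abort when a condition does not hold; the real macro may behave differently
#define CHECK(condition) \
do { \
if (!(condition)) { \
fprintf(stderr, "CHECK failed: %s, %s:%d\n", #condition, __FILE__, __LINE__); \
exit(1); \
} \
} while (0)
// logger required by TensorRT's builder and runtime
class Logger : public nvinfer1::ILogger {
public:
void log(Severity severity, const char* msg) override
{
if (severity != Severity::kINFO) // drop informational messages, keep warnings/errors
fprintf(stderr, "%s\n", msg);
}
};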
Steps to build the test code in TensorRT_Test with CMake on Linux:
1. Change the terminal's working directory to CUDA_Test/prj/linux_tensorrt_cmake and run the following commands in order (a minimal driver for this test is sketched after this list):
$ mkdir build
$ cd build
$ cmake ..
$ make (produces the TensorRT_Test executable)
$ ln -s ../../../test_data/models ./ (symlink the models directory into the build directory)
$ ln -s ../../../test_data/images ./ (symlink the images directory into the build directory)
$ ./TensorRT_Test
2. For operations that read images through OpenCV, first change the image paths in the corresponding source files to a path format supported by Linux.
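For reference, the TensorRT_Test executable needs an entry point that calls test_plugin(). The real project bundles many tests; a minimal, hypothetical driver for just this one could look like the following (the forward declaration stands in for whatever shared header the project actually uses):
// main.cpp - hypothetical minimal driver for this single test
#include <stdio.h>
int test_plugin(); // defined in plugin.cpp; the real project declares its tests in a shared header
int main()
{
int ret = test_plugin();
if (ret == 0) fprintf(stdout, "========== test_plugin succeeded ==========\n");
else fprintf(stderr, "========== test_plugin failed: %d ==========\n", ret);
return ret;
}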