EfficientDet TensorRT
Environment
Operating System + Version: Ubuntu 16.04
TensorRT Version: 8.0.1.6
GPU Type: GeForce GTX1650,4GB
Nvidia Driver Version: 470.63.01
CUDA Version: 11.0.194
CUDNN Version: 8.0.5
Python Version (if applicable): 3.7.11
Anaconda Version: 4.10.3
gcc: 7.5.0
g++: 7.5.0
absl-py==0.13.0
appdirs==1.4.4
astunparse==1.6.3
attrs==21.2.0
cached-property==1.5.2
cachetools==4.2.2
certifi==2021.5.30
charset-normalizer==2.0.5
clang==5.0
cycler==0.10.0
Cython==0.29.24
dm-tree==0.1.6
flatbuffers==1.12
gast==0.4.0
google-auth==1.35.0
google-auth-oauthlib==0.4.6
google-pasta==0.2.0
graphsurgeon @ file:///.../TensorRT-8.0.1.6/graphsurgeon/graphsurgeon-0.4.5-py2.py3-none-any.whl
grpcio==1.34.1
h5py==3.1.0
idna==3.2
importlib-metadata==4.8.1
keras==2.6.0
keras-nightly==2.5.0.dev2021032900
Keras-Preprocessing==1.1.2
kiwisolver==1.3.2
lxml==4.6.3
Mako==1.1.5
Markdown==3.3.4
MarkupSafe==2.0.1
matplotlib==3.4.3
neural-structured-learning==1.3.1
numpy==1.19.5
oauthlib==3.1.1
onnx==1.8.1
onnx-graphsurgeon==0.3.11
onnxruntime==1.8.0
opt-einsum==3.3.0
Pillow==8.3.2
protobuf==3.18.0
pyasn1==0.4.8
pyasn1-modules==0.2.8
pycocotools==2.0
pycuda==2021.1
pyparsing==2.4.7
python-dateutil==2.8.2
pytools==2021.2.8
PyYAML==5.4.1
requests==2.26.0
requests-oauthlib==1.3.0
rsa==4.7.2
scipy==1.7.1
six==1.15.0
tensorboard==2.6.0
tensorboard-data-server==0.6.1
tensorboard-plugin-wit==1.8.0
tensorflow-addons==0.14.0
tensorflow-estimator==2.5.0
tensorflow-gpu @ file:///.../tensorflow_gpu-2.5.0-cp37-cp37m-manylinux2010_x86_64.whl
tensorflow-hub==0.12.0
tensorflow-model-optimization==0.6.0
tensorrt @ file:///.../TensorRT-8.0.1.6/python/tensorrt-8.0.1.6-cp37-none-linux_x86_64.whl
termcolor==1.1.0
tf2onnx==1.8.1
typeguard==2.12.1
typing-extensions==3.7.4.3
uff @ file:///.../TensorRT-8.0.1.6/uff/uff-0.6.9-py2.py3-none-any.whl
urllib3==1.26.6
Werkzeug==2.0.1
wrapt==1.12.1
zipp==3.5.0
name: tensorRT-efficientdet
channels:
- http://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/main
- http://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/r
- http://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/msys2
dependencies:
- _libgcc_mutex=0.1=main
- _openmp_mutex=4.5=1_gnu
- ca-certificates=2021.7.5=h06a4308_1
- certifi=2021.5.30=py37h06a4308_0
- ld_impl_linux-64=2.35.1=h7274673_9
- libffi=3.3=he6710b0_2
- libgcc-ng=9.3.0=h5101ec6_17
- libgomp=9.3.0=h5101ec6_17
- libstdcxx-ng=9.3.0=hd4cf53a_17
- ncurses=6.2=he6710b0_1
- openssl=1.1.1l=h7f8727e_0
- pip=21.0.1=py37h06a4308_0
- python=3.7.11=h12debd9_0
- readline=8.1=h27cfd23_0
- setuptools=58.0.4=py37h06a4308_0
- sqlite=3.36.0=hc218d9a_0
- tk=8.6.10=hbc83047_0
- wheel=0.37.0=pyhd3eb1b0_1
- xz=5.2.5=h7b6447c_0
- zlib=1.2.11=h7b6447c_3
- pip:
- absl-py==0.13.0
- appdirs==1.4.4
- astunparse==1.6.3
- attrs==21.2.0
- cached-property==1.5.2
- cachetools==4.2.2
- charset-normalizer==2.0.5
- clang==5.0
- cycler==0.10.0
- cython==0.29.24
- dm-tree==0.1.6
- flatbuffers==1.12
- gast==0.4.0
- google-auth==1.35.0
- google-auth-oauthlib==0.4.6
- google-pasta==0.2.0
- graphsurgeon==0.4.5
- grpcio==1.34.1
- h5py==3.1.0
- idna==3.2
- importlib-metadata==4.8.1
- keras==2.6.0
- keras-nightly==2.5.0.dev2021032900
- keras-preprocessing==1.1.2
- kiwisolver==1.3.2
- lxml==4.6.3
- mako==1.1.5
- markdown==3.3.4
- markupsafe==2.0.1
- matplotlib==3.4.3
- neural-structured-learning==1.3.1
- numpy==1.19.5
- oauthlib==3.1.1
- onnx==1.8.1
- onnx-graphsurgeon==0.3.11
- onnxruntime==1.8.0
- opt-einsum==3.3.0
- pillow==8.3.2
- protobuf==3.18.0
- pyasn1==0.4.8
- pyasn1-modules==0.2.8
- pycocotools==2.0
- pycuda==2021.1
- pyparsing==2.4.7
- python-dateutil==2.8.2
- pytools==2021.2.8
- pyyaml==5.4.1
- requests==2.26.0
- requests-oauthlib==1.3.0
- rsa==4.7.2
- scipy==1.7.1
- six==1.15.0
- tensorboard==2.6.0
- tensorboard-data-server==0.6.1
- tensorboard-plugin-wit==1.8.0
- tensorflow-addons==0.14.0
- tensorflow-estimator==2.5.0
- tensorflow-gpu==2.5.0
- tensorflow-hub==0.12.0
- tensorflow-model-optimization==0.6.0
- tensorrt==8.0.1.6
- termcolor==1.1.0
- tf2onnx==1.8.1
- typeguard==2.12.1
- typing-extensions==3.7.4.3
- uff==0.6.9
- urllib3==1.26.6
- werkzeug==2.0.1
- wrapt==1.12.1
- zipp==3.5.0
prefix: /home/yichao/miniconda3/envs/tensorRT-efficientdet
git clone https://github.com/google/automl.git
cd /home/yichao/MyDocuments/automl/efficientdet
pip install -r requirements.txt
pip install pyyaml
pip install tensorflow-model-optimization
pip install matplotlib
efficientdet-d0
# /home/yichao/Downloads/efficientdet-d0
├── efficientdet-d0
│ ├── checkpoint
│ ├── d0_coco_test-dev2017.txt
│ ├── d0_coco_val.txt
│ ├── model.data-00000-of-00001
│ ├── model.index
│ └── model.meta
cd /home/yichao/MyDocuments/automl/efficientdet
python model_inspect.py \
--runmode saved_model \
--model_name efficientdet-d0 \
--ckpt_path /home/yichao/Downloads/efficientdet-d0 \
--saved_model_dir /home/yichao/Downloads/saved_model
# /home/yichao/Downloads/saved_model
├── saved_model
│ ├── efficientdet-d0_frozen.pb
│ ├── saved_model.pb
│ └── variables
# saved_model.pb, 7.3 MB
git clone https://github.com/NVIDIA/TensorRT.git
cd /home/yichao/MyDocuments/TensorRT/samples/python/efficientdet
pip install -r requirements.txt
pip install onnx-graphsurgeon --index-url https://pypi.ngc.nvidia.com
cd /home/yichao/MyDocuments/TensorRT/samples/python/efficientdet
python create_onnx.py \
--input_shape '1,512,512,3' \
--saved_model /home/yichao/Downloads/saved_model \
--onnx /home/yichao/Downloads/saved_model_onnx/model.onnx
# /home/yichao/Downloads/saved_model_onnx
├── saved_model_onnx
│ └── model.onnx
# model.onnx, 16.5 MB
cd /home/yichao/MyDocuments/TensorRT/samples/python/efficientdet
python build_engine.py \
--onnx /home/yichao/Downloads/saved_model_onnx/model.onnx \
--engine /home/yichao/Downloads/saved_model_trt_fp32/engine.trt \
--precision fp32
(tensorRT-efficientdet) yichao@yichao:~/MyDocuments/TensorRT/samples/python/efficientdet$ time python build_engine.py \
> --onnx /home/yichao/Downloads/saved_model_onnx/model.onnx \
> --engine /home/yichao/Downloads/saved_model_trt_fp32/engine.trt \
> --precision fp32
[TensorRT] INFO: [MemUsageChange] Init CUDA: CPU +320, GPU +0, now: CPU 343, GPU 476 (MiB)
[TensorRT] WARNING: onnx2trt_utils.cpp:364: Your ONNX model has been generated with INT64 weights, while TensorRT does not natively support INT64. Attempting to cast down to INT32.
[TensorRT] INFO: No importer registered for op: EfficientNMS_TRT. Attempting to import as plugin.
[TensorRT] INFO: Searching for plugin: EfficientNMS_TRT, plugin_version: 1, plugin_namespace:
[TensorRT] INFO: Successfully created plugin: EfficientNMS_TRT
INFO:EngineBuilder:Network Description
INFO:EngineBuilder:Input 'image_arrays:0' with shape (1, 512, 512, 3) and dtype DataType.FLOAT
INFO:EngineBuilder:Output 'num_detections' with shape (1, 1) and dtype DataType.INT32
INFO:EngineBuilder:Output 'detection_boxes' with shape (1, 100, 4) and dtype DataType.FLOAT
INFO:EngineBuilder:Output 'detection_scores' with shape (1, 100) and dtype DataType.FLOAT
INFO:EngineBuilder:Output 'detection_classes' with shape (1, 100) and dtype DataType.INT32
INFO:EngineBuilder:Building fp32 Engine in /home/yichao/Downloads/saved_model_trt_fp32/engine.trt
build_engine.py:203: DeprecationWarning: Use build_serialized_network instead.
with self.builder.build_engine(self.network, self.config) as engine, open(engine_path, "wb") as f:
[TensorRT] INFO: [MemUsageSnapshot] Builder begin: CPU 361 MiB, GPU 476 MiB
[TensorRT] WARNING: TensorRT was linked against cuBLAS/cuBLAS LT 11.5.1 but loaded cuBLAS/cuBLAS LT 11.1.0
[TensorRT] INFO: [MemUsageChange] Init cuBLAS/cuBLASLt: CPU +357, GPU +148, now: CPU 721, GPU 624 (MiB)
[TensorRT] INFO: [MemUsageChange] Init cuDNN: CPU +118, GPU +148, now: CPU 839, GPU 772 (MiB)
[TensorRT] WARNING: TensorRT was linked against cuDNN 8.2.1 but loaded cuDNN 8.0.5
[TensorRT] WARNING: Detected invalid timing cache, setup a local cache instead
[TensorRT] INFO: Detected 1 inputs and 4 output network tensors.
[TensorRT] INFO: Total Host Persistent Memory: 321392
[TensorRT] INFO: Total Device Persistent Memory: 16182272
[TensorRT] INFO: Total Scratch Memory: 106066176
[TensorRT] INFO: [MemUsageStats] Peak memory usage of TRT CPU/GPU memory allocators: CPU 12 MiB, GPU 8 MiB
[TensorRT] WARNING: TensorRT was linked against cuBLAS/cuBLAS LT 11.5.1 but loaded cuBLAS/cuBLAS LT 11.1.0
[TensorRT] INFO: [MemUsageChange] Init cuBLAS/cuBLASLt: CPU +0, GPU +8, now: CPU 1267, GPU 970 (MiB)
[TensorRT] INFO: [MemUsageChange] Init cuDNN: CPU +1, GPU +8, now: CPU 1268, GPU 978 (MiB)
[TensorRT] WARNING: TensorRT was linked against cuDNN 8.2.1 but loaded cuDNN 8.0.5
[TensorRT] INFO: [MemUsageChange] Init cuBLAS/cuBLASLt: CPU +0, GPU +0, now: CPU 1268, GPU 962 (MiB)
[TensorRT] INFO: [MemUsageChange] Init cuBLAS/cuBLASLt: CPU +0, GPU +0, now: CPU 1267, GPU 944 (MiB)
[TensorRT] INFO: [MemUsageSnapshot] Builder end: CPU 1266 MiB, GPU 944 MiB
INFO:EngineBuilder:Serializing engine to file: /home/yichao/Downloads/saved_model_trt_fp32/engine.trt
real 1m51.712s
user 1m33.469s
sys 0m3.339s
# engine.trt, 29.7 MB
GPU memory usage (显存占用情况)
Sat Sep 18 11:39:08 2021
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.63.01 Driver Version: 470.63.01 CUDA Version: 11.4 |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|===============================+======================+======================|
| 0 NVIDIA GeForce ... Off | 00000000:01:00.0 On | N/A |
| 27% 42C P0 19W / 75W | 1750MiB / 3903MiB | 2% Default |
| | | N/A |
+-------------------------------+----------------------+----------------------+
+-----------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=============================================================================|
| 0 N/A N/A 1465 G /usr/lib/xorg/Xorg 285MiB |
| 0 N/A N/A 20266 C python 1461MiB |
+-----------------------------------------------------------------------------+
cd /home/yichao/MyDocuments/TensorRT/samples/python/efficientdet
python build_engine.py \
--onnx /home/yichao/Downloads/saved_model_onnx/model.onnx \
--engine /home/yichao/Downloads/saved_model_trt_fp16/engine.trt \
--precision fp16
(tensorRT-efficientdet) yichao@yichao:~/MyDocuments/TensorRT/samples/python/efficientdet$ time python build_engine.py \
> --onnx /home/yichao/Downloads/saved_model_onnx/model.onnx \
> --engine /home/yichao/Downloads/saved_model_trt_fp16/engine.trt \
> --precision fp16
[TensorRT] INFO: [MemUsageChange] Init CUDA: CPU +320, GPU +0, now: CPU 343, GPU 496 (MiB)
[TensorRT] WARNING: onnx2trt_utils.cpp:364: Your ONNX model has been generated with INT64 weights, while TensorRT does not natively support INT64. Attempting to cast down to INT32.
[TensorRT] INFO: No importer registered for op: EfficientNMS_TRT. Attempting to import as plugin.
[TensorRT] INFO: Searching for plugin: EfficientNMS_TRT, plugin_version: 1, plugin_namespace:
[TensorRT] INFO: Successfully created plugin: EfficientNMS_TRT
INFO:EngineBuilder:Network Description
INFO:EngineBuilder:Input 'image_arrays:0' with shape (1, 512, 512, 3) and dtype DataType.FLOAT
INFO:EngineBuilder:Output 'num_detections' with shape (1, 1) and dtype DataType.INT32
INFO:EngineBuilder:Output 'detection_boxes' with shape (1, 100, 4) and dtype DataType.FLOAT
INFO:EngineBuilder:Output 'detection_scores' with shape (1, 100) and dtype DataType.FLOAT
INFO:EngineBuilder:Output 'detection_classes' with shape (1, 100) and dtype DataType.INT32
INFO:EngineBuilder:Building fp16 Engine in /home/yichao/Downloads/saved_model_trt_fp16/engine.trt
build_engine.py:203: DeprecationWarning: Use build_serialized_network instead.
with self.builder.build_engine(self.network, self.config) as engine, open(engine_path, "wb") as f:
[TensorRT] INFO: [MemUsageSnapshot] Builder begin: CPU 361 MiB, GPU 496 MiB
[TensorRT] WARNING: TensorRT was linked against cuBLAS/cuBLAS LT 11.5.1 but loaded cuBLAS/cuBLAS LT 11.1.0
[TensorRT] INFO: [MemUsageChange] Init cuBLAS/cuBLASLt: CPU +357, GPU +148, now: CPU 721, GPU 644 (MiB)
[TensorRT] INFO: [MemUsageChange] Init cuDNN: CPU +118, GPU +148, now: CPU 839, GPU 792 (MiB)
[TensorRT] WARNING: TensorRT was linked against cuDNN 8.2.1 but loaded cuDNN 8.0.5
[TensorRT] WARNING: Detected invalid timing cache, setup a local cache instead
[TensorRT] INFO: Detected 1 inputs and 4 output network tensors.
[TensorRT] INFO: Total Host Persistent Memory: 295824
[TensorRT] INFO: Total Device Persistent Memory: 8877568
[TensorRT] INFO: Total Scratch Memory: 88388864
[TensorRT] INFO: [MemUsageStats] Peak memory usage of TRT CPU/GPU memory allocators: CPU 12 MiB, GPU 8 MiB
[TensorRT] WARNING: TensorRT was linked against cuBLAS/cuBLAS LT 11.5.1 but loaded cuBLAS/cuBLAS LT 11.1.0
[TensorRT] INFO: [MemUsageChange] Init cuBLAS/cuBLASLt: CPU +1, GPU +8, now: CPU 1294, GPU 984 (MiB)
[TensorRT] INFO: [MemUsageChange] Init cuDNN: CPU +0, GPU +8, now: CPU 1294, GPU 992 (MiB)
[TensorRT] WARNING: TensorRT was linked against cuDNN 8.2.1 but loaded cuDNN 8.0.5
[TensorRT] INFO: [MemUsageChange] Init cuBLAS/cuBLASLt: CPU +0, GPU +0, now: CPU 1294, GPU 976 (MiB)
[TensorRT] INFO: [MemUsageChange] Init cuBLAS/cuBLASLt: CPU +0, GPU +0, now: CPU 1293, GPU 958 (MiB)
[TensorRT] INFO: [MemUsageSnapshot] Builder end: CPU 1290 MiB, GPU 958 MiB
INFO:EngineBuilder:Serializing engine to file: /home/yichao/Downloads/saved_model_trt_fp16/engine.trt
real 12m14.915s
user 10m47.976s
sys 0m17.289s
# engine.trt, 23.4 MB
GPU memory usage (显存占用情况)
Sat Sep 18 11:20:07 2021
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.63.01 Driver Version: 470.63.01 CUDA Version: 11.4 |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|===============================+======================+======================|
| 0 NVIDIA GeForce ... Off | 00000000:01:00.0 On | N/A |
| 28% 42C P0 19W / 75W | 2640MiB / 3903MiB | 10% Default |
| | | N/A |
+-------------------------------+----------------------+----------------------+
+-----------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=============================================================================|
| 0 N/A N/A 1465 G /usr/lib/xorg/Xorg 305MiB |
| 0 N/A N/A 19261 C python 2331MiB |
+-----------------------------------------------------------------------------+
cd /home/yichao/MyDocuments/TensorRT/samples/python/efficientdet
python build_engine.py \
--onnx /home/yichao/Downloads/saved_model_onnx/model.onnx \
--engine /home/yichao/Downloads/saved_model_trt_int8/engine.trt \
--precision int8 \
--calib_input /home/yichao/Downloads/coco_calib \
--calib_cache /home/yichao/Downloads/calibration/calibration.cache
(tensorRT-efficientdet) yichao@yichao:~/MyDocuments/TensorRT/samples/python/efficientdet$ time python build_engine.py \
> --onnx /home/yichao/Downloads/saved_model_onnx/model.onnx \
> --engine /home/yichao/Downloads/saved_model_trt_int8/engine.trt \
> --precision int8 \
> --calib_input /home/yichao/Downloads/coco_calib \
> --calib_cache /home/yichao/Downloads/calibration/calibration.cache
[TensorRT] INFO: [MemUsageChange] Init CUDA: CPU +320, GPU +0, now: CPU 343, GPU 476 (MiB)
[TensorRT] WARNING: onnx2trt_utils.cpp:364: Your ONNX model has been generated with INT64 weights, while TensorRT does not natively support INT64. Attempting to cast down to INT32.
[TensorRT] INFO: No importer registered for op: EfficientNMS_TRT. Attempting to import as plugin.
[TensorRT] INFO: Searching for plugin: EfficientNMS_TRT, plugin_version: 1, plugin_namespace:
[TensorRT] INFO: Successfully created plugin: EfficientNMS_TRT
INFO:EngineBuilder:Network Description
INFO:EngineBuilder:Input 'image_arrays:0' with shape (1, 512, 512, 3) and dtype DataType.FLOAT
INFO:EngineBuilder:Output 'num_detections' with shape (1, 1) and dtype DataType.INT32
INFO:EngineBuilder:Output 'detection_boxes' with shape (1, 100, 4) and dtype DataType.FLOAT
INFO:EngineBuilder:Output 'detection_scores' with shape (1, 100) and dtype DataType.FLOAT
INFO:EngineBuilder:Output 'detection_classes' with shape (1, 100) and dtype DataType.INT32
INFO:EngineBuilder:Building int8 Engine in /home/yichao/Downloads/saved_model_trt/engine.trt
build_engine.py:203: DeprecationWarning: Use build_serialized_network instead.
with self.builder.build_engine(self.network, self.config) as engine, open(engine_path, "wb") as f:
[TensorRT] INFO: [MemUsageSnapshot] Builder begin: CPU 361 MiB, GPU 500 MiB
[TensorRT] WARNING: TensorRT was linked against cuBLAS/cuBLAS LT 11.5.1 but loaded cuBLAS/cuBLAS LT 11.1.0
[TensorRT] INFO: [MemUsageChange] Init cuBLAS/cuBLASLt: CPU +357, GPU +148, now: CPU 720, GPU 648 (MiB)
[TensorRT] INFO: [MemUsageChange] Init cuDNN: CPU +128, GPU +148, now: CPU 848, GPU 796 (MiB)
[TensorRT] WARNING: TensorRT was linked against cuDNN 8.2.1 but loaded cuDNN 8.0.5
[TensorRT] INFO: Detected 1 inputs and 4 output network tensors.
[TensorRT] INFO: Total Host Persistent Memory: 34896
[TensorRT] INFO: Total Device Persistent Memory: 0
[TensorRT] INFO: Total Scratch Memory: 106066176
[TensorRT] INFO: [MemUsageStats] Peak memory usage of TRT CPU/GPU memory allocators: CPU 0 MiB, GPU 8 MiB
[TensorRT] WARNING: TensorRT was linked against cuBLAS/cuBLAS LT 11.5.1 but loaded cuBLAS/cuBLAS LT 11.1.0
[TensorRT] INFO: [MemUsageChange] Init cuBLAS/cuBLASLt: CPU +0, GPU +8, now: CPU 1258, GPU 990 (MiB)
[TensorRT] INFO: [MemUsageChange] Init cuDNN: CPU +0, GPU +8, now: CPU 1258, GPU 998 (MiB)
[TensorRT] WARNING: TensorRT was linked against cuDNN 8.2.1 but loaded cuDNN 8.0.5
[TensorRT] INFO: [MemUsageChange] Init cuBLAS/cuBLASLt: CPU +0, GPU +0, now: CPU 1258, GPU 982 (MiB)
[TensorRT] INFO: [MemUsageChange] Init cuBLAS/cuBLASLt: CPU +0, GPU +0, now: CPU 1257, GPU 964 (MiB)
[TensorRT] INFO: [MemUsageSnapshot] ExecutionContext creation begin: CPU 1257 MiB, GPU 964 MiB
[TensorRT] WARNING: TensorRT was linked against cuBLAS/cuBLAS LT 11.5.1 but loaded cuBLAS/cuBLAS LT 11.1.0
[TensorRT] INFO: [MemUsageChange] Init cuBLAS/cuBLASLt: CPU +1, GPU +10, now: CPU 1258, GPU 974 (MiB)
[TensorRT] INFO: [MemUsageChange] Init cuDNN: CPU +0, GPU +8, now: CPU 1258, GPU 982 (MiB)
[TensorRT] WARNING: TensorRT was linked against cuDNN 8.2.1 but loaded cuDNN 8.0.5
[TensorRT] INFO: [MemUsageSnapshot] ExecutionContext creation end: CPU 1258 MiB, GPU 1138 MiB
[TensorRT] INFO: Starting Calibration.
INFO:EngineBuilder:Calibrating image 8 / 1000
[TensorRT] INFO: Calibrated batch 0 in 0.299519 seconds.
INFO:EngineBuilder:Calibrating image 16 / 1000
...
[TensorRT] INFO: Calibrated batch 123 in 0.275864 seconds.
INFO:EngineBuilder:Calibrating image 1000 / 1000
[TensorRT] INFO: Calibrated batch 124 in 0.289 seconds.
INFO:EngineBuilder:Finished calibration batches
[TensorRT] INFO: [MemUsageChange] Init cuBLAS/cuBLASLt: CPU +0, GPU +0, now: CPU 1349, GPU 1122 (MiB)
[TensorRT] INFO: Post Processing Calibration data in 81.7665 seconds.
[TensorRT] INFO: Calibration completed in 129.299 seconds.
[TensorRT] INFO: Writing Calibration Cache for calibrator: TRT-8001-EntropyCalibration2
INFO:EngineBuilder:Writing calibration cache data to: /home/yichao/Downloads/calibration/calibration.cache
[TensorRT] WARNING: Missing scale and zero-point for tensor (Unnamed Layer* 82) [Constant]_output, expect fall back to non-int8 implementation for any layer consuming or producing given tensor
[TensorRT] WARNING: Missing scale and zero-point for tensor (Unnamed Layer* 177) [Constant]_output, expect fall back to non-int8 implementation for any layer consuming or producing given tensor
[TensorRT] WARNING: Missing scale and zero-point for tensor (Unnamed Layer* 180) [Constant]_output, expect fall back to non-int8 implementation for any layer consuming or producing given tensor
[TensorRT] WARNING: Missing scale and zero-point for tensor (Unnamed Layer* 263) [Constant]_output, expect fall back to non-int8 implementation for any layer consuming or producing given tensor
[TensorRT] WARNING: Missing scale and zero-point for tensor (Unnamed Layer* 266) [Constant]_output, expect fall back to non-int8 implementation for any layer consuming or producing given tensor
[TensorRT] WARNING: Missing scale and zero-point for tensor (Unnamed Layer* 270) [Constant]_output, expect fall back to non-int8 implementation for any layer consuming or producing given tensor
[TensorRT] WARNING: Missing scale and zero-point for tensor (Unnamed Layer* 273) [Constant]_output, expect fall back to non-int8 implementation for any layer consuming or producing given tensor
[TensorRT] WARNING: Missing scale and zero-point for tensor (Unnamed Layer* 277) [Constant]_output, expect fall back to non-int8 implementation for any layer consuming or producing given tensor
[TensorRT] WARNING: Missing scale and zero-point for tensor (Unnamed Layer* 284) [Constant]_output, expect fall back to non-int8 implementation for any layer consuming or producing given tensor
[TensorRT] WARNING: Missing scale and zero-point for tensor (Unnamed Layer* 297) [Constant]_output, expect fall back to non-int8 implementation for any layer consuming or producing given tensor
[TensorRT] WARNING: Missing scale and zero-point for tensor (Unnamed Layer* 300) [Constant]_output, expect fall back to non-int8 implementation for any layer consuming or producing given tensor
[TensorRT] WARNING: Missing scale and zero-point for tensor (Unnamed Layer* 305) [Constant]_output, expect fall back to non-int8 implementation for any layer consuming or producing given tensor
[TensorRT] WARNING: Missing scale and zero-point for tensor (Unnamed Layer* 308) [Constant]_output, expect fall back to non-int8 implementation for any layer consuming or producing given tensor
[TensorRT] WARNING: Missing scale and zero-point for tensor (Unnamed Layer* 314) [Constant]_output, expect fall back to non-int8 implementation for any layer consuming or producing given tensor
[TensorRT] WARNING: Missing scale and zero-point for tensor (Unnamed Layer* 329) [Constant]_output, expect fall back to non-int8 implementation for any layer consuming or producing given tensor
[TensorRT] WARNING: Missing scale and zero-point for tensor (Unnamed Layer* 332) [Constant]_output, expect fall back to non-int8 implementation for any layer consuming or producing given tensor
[TensorRT] WARNING: Missing scale and zero-point for tensor (Unnamed Layer* 338) [Constant]_output, expect fall back to non-int8 implementation for any layer consuming or producing given tensor
[TensorRT] WARNING: Missing scale and zero-point for tensor (Unnamed Layer* 341) [Constant]_output, expect fall back to non-int8 implementation for any layer consuming or producing given tensor
[TensorRT] WARNING: Missing scale and zero-point for tensor (Unnamed Layer* 356) [Constant]_output, expect fall back to non-int8 implementation for any layer consuming or producing given tensor
[TensorRT] WARNING: Missing scale and zero-point for tensor (Unnamed Layer* 361) [Constant]_output, expect fall back to non-int8 implementation for any layer consuming or producing given tensor
[TensorRT] WARNING: Missing scale and zero-point for tensor (Unnamed Layer* 366) [Constant]_output, expect fall back to non-int8 implementation for any layer consuming or producing given tensor
[TensorRT] WARNING: Missing scale and zero-point for tensor (Unnamed Layer* 370) [Constant]_output, expect fall back to non-int8 implementation for any layer consuming or producing given tensor
[TensorRT] WARNING: Missing scale and zero-point for tensor (Unnamed Layer* 385) [Constant]_output, expect fall back to non-int8 implementation for any layer consuming or producing given tensor
[TensorRT] WARNING: Missing scale and zero-point for tensor (Unnamed Layer* 390) [Constant]_output, expect fall back to non-int8 implementation for any layer consuming or producing given tensor
[TensorRT] WARNING: Missing scale and zero-point for tensor (Unnamed Layer* 395) [Constant]_output, expect fall back to non-int8 implementation for any layer consuming or producing given tensor
[TensorRT] WARNING: Missing scale and zero-point for tensor (Unnamed Layer* 398) [Constant]_output, expect fall back to non-int8 implementation for any layer consuming or producing given tensor
[TensorRT] WARNING: Missing scale and zero-point for tensor (Unnamed Layer* 413) [Constant]_output, expect fall back to non-int8 implementation for any layer consuming or producing given tensor
[TensorRT] WARNING: Missing scale and zero-point for tensor (Unnamed Layer* 418) [Constant]_output, expect fall back to non-int8 implementation for any layer consuming or producing given tensor
[TensorRT] WARNING: Missing scale and zero-point for tensor (Unnamed Layer* 421) [Constant]_output, expect fall back to non-int8 implementation for any layer consuming or producing given tensor
[TensorRT] WARNING: Missing scale and zero-point for tensor (Unnamed Layer* 425) [Constant]_output, expect fall back to non-int8 implementation for any layer consuming or producing given tensor
[TensorRT] WARNING: Missing scale and zero-point for tensor (Unnamed Layer* 434) [Constant]_output, expect fall back to non-int8 implementation for any layer consuming or producing given tensor
[TensorRT] WARNING: Missing scale and zero-point for tensor (Unnamed Layer* 440) [Constant]_output, expect fall back to non-int8 implementation for any layer consuming or producing given tensor
[TensorRT] WARNING: Missing scale and zero-point for tensor (Unnamed Layer* 449) [Constant]_output, expect fall back to non-int8 implementation for any layer consuming or producing given tensor
[TensorRT] WARNING: Missing scale and zero-point for tensor (Unnamed Layer* 453) [Constant]_output, expect fall back to non-int8 implementation for any layer consuming or producing given tensor
[TensorRT] WARNING: Missing scale and zero-point for tensor (Unnamed Layer* 462) [Constant]_output, expect fall back to non-int8 implementation for any layer consuming or producing given tensor
[TensorRT] WARNING: Missing scale and zero-point for tensor (Unnamed Layer* 465) [Constant]_output, expect fall back to non-int8 implementation for any layer consuming or producing given tensor
[TensorRT] WARNING: Missing scale and zero-point for tensor (Unnamed Layer* 469) [Constant]_output, expect fall back to non-int8 implementation for any layer consuming or producing given tensor
[TensorRT] WARNING: Missing scale and zero-point for tensor (Unnamed Layer* 478) [Constant]_output, expect fall back to non-int8 implementation for any layer consuming or producing given tensor
[TensorRT] WARNING: Missing scale and zero-point for tensor (Unnamed Layer* 481) [Constant]_output, expect fall back to non-int8 implementation for any layer consuming or producing given tensor
[TensorRT] WARNING: Missing scale and zero-point for tensor (Unnamed Layer* 485) [Constant]_output, expect fall back to non-int8 implementation for any layer consuming or producing given tensor
[TensorRT] WARNING: Missing scale and zero-point for tensor (Unnamed Layer* 494) [Constant]_output, expect fall back to non-int8 implementation for any layer consuming or producing given tensor
[TensorRT] WARNING: Missing scale and zero-point for tensor (Unnamed Layer* 497) [Constant]_output, expect fall back to non-int8 implementation for any layer consuming or producing given tensor
[TensorRT] WARNING: Missing scale and zero-point for tensor (Unnamed Layer* 501) [Constant]_output, expect fall back to non-int8 implementation for any layer consuming or producing given tensor
[TensorRT] WARNING: Missing scale and zero-point for tensor (Unnamed Layer* 510) [Constant]_output, expect fall back to non-int8 implementation for any layer consuming or producing given tensor
[TensorRT] WARNING: Missing scale and zero-point for tensor (Unnamed Layer* 515) [Constant]_output, expect fall back to non-int8 implementation for any layer consuming or producing given tensor
[TensorRT] WARNING: Missing scale and zero-point for tensor (Unnamed Layer* 524) [Constant]_output, expect fall back to non-int8 implementation for any layer consuming or producing given tensor
[TensorRT] WARNING: Missing scale and zero-point for tensor (Unnamed Layer* 530) [Constant]_output, expect fall back to non-int8 implementation for any layer consuming or producing given tensor
[TensorRT] WARNING: Missing scale and zero-point for tensor (Unnamed Layer* 539) [Constant]_output, expect fall back to non-int8 implementation for any layer consuming or producing given tensor
[TensorRT] WARNING: Missing scale and zero-point for tensor (Unnamed Layer* 545) [Constant]_output, expect fall back to non-int8 implementation for any layer consuming or producing given tensor
[TensorRT] WARNING: Missing scale and zero-point for tensor (Unnamed Layer* 554) [Constant]_output, expect fall back to non-int8 implementation for any layer consuming or producing given tensor
[TensorRT] WARNING: Missing scale and zero-point for tensor (Unnamed Layer* 560) [Constant]_output, expect fall back to non-int8 implementation for any layer consuming or producing given tensor
[TensorRT] WARNING: Missing scale and zero-point for tensor (Unnamed Layer* 572) [Constant]_output, expect fall back to non-int8 implementation for any layer consuming or producing given tensor
[TensorRT] WARNING: Missing scale and zero-point for tensor (Unnamed Layer* 600) [Constant]_output, expect fall back to non-int8 implementation for any layer consuming or producing given tensor
[TensorRT] WARNING: Missing scale and zero-point for tensor (Unnamed Layer* 644) [Constant]_output, expect fall back to non-int8 implementation for any layer consuming or producing given tensor
[TensorRT] WARNING: Missing scale and zero-point for tensor (Unnamed Layer* 692) [Constant]_output, expect fall back to non-int8 implementation for any layer consuming or producing given tensor
[TensorRT] WARNING: TensorRT was linked against cuBLAS/cuBLAS LT 11.5.1 but loaded cuBLAS/cuBLAS LT 11.1.0
[TensorRT] INFO: [MemUsageChange] Init cuBLAS/cuBLASLt: CPU +0, GPU +10, now: CPU 1322, GPU 958 (MiB)
[TensorRT] INFO: [MemUsageChange] Init cuDNN: CPU +0, GPU +8, now: CPU 1322, GPU 966 (MiB)
[TensorRT] WARNING: TensorRT was linked against cuDNN 8.2.1 but loaded cuDNN 8.0.5
[TensorRT] WARNING: Detected invalid timing cache, setup a local cache instead
[TensorRT] INFO: Detected 1 inputs and 4 output network tensors.
[TensorRT] INFO: Total Host Persistent Memory: 306784
[TensorRT] INFO: Total Device Persistent Memory: 6846464
[TensorRT] INFO: Total Scratch Memory: 88388864
[TensorRT] INFO: [MemUsageStats] Peak memory usage of TRT CPU/GPU memory allocators: CPU 20 MiB, GPU 158 MiB
[TensorRT] WARNING: TensorRT was linked against cuBLAS/cuBLAS LT 11.5.1 but loaded cuBLAS/cuBLAS LT 11.1.0
[TensorRT] INFO: [MemUsageChange] Init cuBLAS/cuBLASLt: CPU +0, GPU +8, now: CPU 1369, GPU 986 (MiB)
[TensorRT] INFO: [MemUsageChange] Init cuDNN: CPU +0, GPU +8, now: CPU 1369, GPU 994 (MiB)
[TensorRT] WARNING: TensorRT was linked against cuDNN 8.2.1 but loaded cuDNN 8.0.5
[TensorRT] INFO: [MemUsageChange] Init cuBLAS/cuBLASLt: CPU +0, GPU +0, now: CPU 1369, GPU 978 (MiB)
[TensorRT] INFO: [MemUsageChange] Init cuBLAS/cuBLASLt: CPU +0, GPU +0, now: CPU 1368, GPU 962 (MiB)
[TensorRT] INFO: [MemUsageSnapshot] Builder end: CPU 1364 MiB, GPU 962 MiB
INFO:EngineBuilder:Serializing engine to file: /home/yichao/Downloads/saved_model_trt/engine.trt
real 25m27.581s
user 23m12.319s
sys 0m43.315s
# engine.trt, 22.2 MB
GPU memory usage (显存占用情况)
Sat Sep 18 09:48:55 2021
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.63.01 Driver Version: 470.63.01 CUDA Version: 11.4 |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|===============================+======================+======================|
| 0 NVIDIA GeForce ... Off | 00000000:01:00.0 On | N/A |
| 28% 42C P0 18W / 75W | 1784MiB / 3903MiB | 1% Default |
| | | N/A |
+-------------------------------+----------------------+----------------------+
+-----------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=============================================================================|
| 0 N/A N/A 1465 G /usr/lib/xorg/Xorg 285MiB |
| 0 N/A N/A 15585 C python 1495MiB |
+-----------------------------------------------------------------------------+
cd /home/yichao/MyDocuments/TensorRT/samples/python/efficientdet
python infer.py \
--engine /home/yichao/Downloads/saved_model_trt_fp32/engine.trt \
--input /home/yichao/Downloads/coco_calib \
--output /home/yichao/Downloads/infer_fp32
(tensorRT-efficientdet) yichao@yichao:/media/yichao/蚁巢文件/YOYOFile/YOYOFile/PyProjects/TensorRT/samples/python/efficientdet$ time python infer.py --engine /media/yichao/蚁巢文件/YOYOFile/EfficientDet/saved_model_trt_fp32/engine.trt --input /media/yichao/蚁巢文件/YOYOFile/EfficientDet/coco_calib --output /media/yichao/蚁巢文件/YOYOFile/EfficientDet/infer_fp32
len(batch_images): ['/media/yichao/蚁巢文件/YOYOFile/EfficientDet/coco_calib/COCO_train2014_000000000009.jpg']
time_1: 511 Image 1 / 1000
len(images): 1
detections: [[{'ymin': -10.355606079101562, 'xmin': 299.74897384643555, 'ymax': 240.54559707641602, 'xmax': 637.5265502929688, 'score': 0.6842542, 'class': 50}, {'ymin': 80.77005386352539, 'xmin': -1.2041282653808594, 'ymax': 436.90185546875, 'xmax': 459.16343688964844, 'score': 0.62642604, 'class': 50}, {'ymin': 188.0276870727539, 'xmin': 27.04761505126953, 'ymax': 471.3081741333008, 'xmax': 601.4738464355469, 'score': 0.57705814, 'class': 50}, {'ymin': 222.76105880737305, 'xmin': 249.59787368774414, 'ymax': 473.05328369140625, 'xmax': 562.2936248779297, 'score': 0.5430005, 'class': 55}, {'ymin': 69.5292615890503, 'xmin': 388.29090118408203, 'ymax': 141.84276580810547, 'xmax': 470.1576614379883, 'score': 0.45278752, 'class': 54}, {'ymin': 6.308698654174805, 'xmin': 19.28295135498047, 'ymax': 293.5408401489258, 'xmax': 427.21588134765625, 'score': 0.4251722, 'class': 50}]]
time_2: 159
len(batch_images): ['/media/yichao/蚁巢文件/YOYOFile/EfficientDet/coco_calib/COCO_train2014_000000000151.jpg']
time_1: 15 Image 2 / 1000
len(images): 1
detections: [[{'ymin': 326.95018768310547, 'xmin': 211.58906936645508, 'ymax': 365.53333282470703, 'xmax': 250.12731552124023, 'score': 0.84729725, 'class': 12}, {'ymin': 38.47867965698242, 'xmin': 200.59566497802734, 'ymax': 634.6967315673828, 'xmax': 473.6790084838867, 'score': 0.7619643, 'class': 6}]]
time_2: 108
...
...
len(batch_images): ['/media/yichao/蚁巢文件/YOYOFile/EfficientDet/coco_calib/COCO_train2014_000000580385.jpg']
time_1: 14 Image 998 / 1000
len(images): 1
detections: [[{'ymin': 47.34233856201172, 'xmin': 125.20095825195312, 'ymax': 359.3656539916992, 'xmax': 525.5948638916016, 'score': 0.9364282, 'class': 6}]]
time_2: 117
len(batch_images): ['/media/yichao/蚁巢文件/YOYOFile/EfficientDet/coco_calib/COCO_train2014_000000581218.jpg']
time_1: 15 Image 999 / 1000
len(images): 1
detections: [[{'ymin': 309.59455490112305, 'xmin': 241.9521713256836, 'ymax': 348.5172653198242, 'xmax': 271.9435501098633, 'score': 0.6911016, 'class': 4}, {'ymin': 147.70557403564453, 'xmin': 271.3910484313965, 'ymax': 188.7246322631836, 'xmax': 300.4056739807129, 'score': 0.6408937, 'class': 4}, {'ymin': 269.51610565185547, 'xmin': 260.4315185546875, 'ymax': 310.3202438354492, 'xmax': 289.44976806640625, 'score': 0.63147306, 'class': 4}, {'ymin': 234.6424102783203, 'xmin': 308.0194282531738, 'ymax': 274.24530029296875, 'xmax': 339.48490142822266, 'score': 0.52650064, 'class': 4}, {'ymin': 273.0849838256836, 'xmin': 298.9488983154297, 'ymax': 313.12862396240234, 'xmax': 325.66226959228516, 'score': 0.4886203, 'class': 4}, {'ymin': 233.1707763671875, 'xmin': 282.28904724121094, 'ymax': 268.44730377197266, 'xmax': 312.5209426879883, 'score': 0.41873103, 'class': 4}]]
time_2: 169
len(batch_images): ['/media/yichao/蚁巢文件/YOYOFile/EfficientDet/coco_calib/COCO_train2014_000000581766.jpg']
time_1: 15 Image 1000 / 1000
len(images): 1
detections: [[{'ymin': 148.56109023094177, 'xmin': 204.04024422168732, 'ymax': 283.86542201042175, 'xmax': 297.4337339401245, 'score': 0.8775655, 'class': 69}, {'ymin': 153.7458896636963, 'xmin': 17.941415309906006, 'ymax': 285.7228219509125, 'xmax': 127.41473317146301, 'score': 0.81447136, 'class': 69}, {'ymin': 146.68381214141846, 'xmin': 373.2069432735443, 'ymax': 278.8565158843994, 'xmax': 481.57575726509094, 'score': 0.75143397, 'class': 69}]]
time_2: 111
get_batch time: 158031
Finished Processing
real 2m40.346s
user 2m30.624s
sys 0m4.989s
结论:tensorRT FP 32平均耗时 15ms/张,即 66fps。
显存占用情况
Sat Sep 18 11:44:30 2021
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.63.01 Driver Version: 470.63.01 CUDA Version: 11.4 |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|===============================+======================+======================|
| 0 NVIDIA GeForce ... Off | 00000000:01:00.0 On | N/A |
| 27% 40C P0 20W / 75W | 1146MiB / 3903MiB | 14% Default |
| | | N/A |
+-------------------------------+----------------------+----------------------+
+-----------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=============================================================================|
| 0 N/A N/A 1465 G /usr/lib/xorg/Xorg 305MiB |
| 0 N/A N/A 20470 C python 837MiB |
+-----------------------------------------------------------------------------+
cd /home/yichao/MyDocuments/TensorRT/samples/python/efficientdet
python infer.py \
--engine /home/yichao/Downloads/saved_model_trt_fp16/engine.trt \
--input /home/yichao/Downloads/coco_calib \
--output /home/yichao/Downloads/infer_fp16
(tensorRT-efficientdet) yichao@yichao:/media/yichao/蚁巢文件/YOYOFile/YOYOFile/PyProjects/TensorRT/samples/python/efficientdet$ time python infer.py \
> --engine /media/yichao/蚁巢文件/YOYOFile/EfficientDet/saved_model_trt_fp16/engine.trt \
> --input /media/yichao/蚁巢文件/YOYOFile/EfficientDet/coco_calib \
> --output /media/yichao/蚁巢文件/YOYOFile/EfficientDet/infer_fp16
len(batch_images): ['/media/yichao/蚁巢文件/YOYOFile/EfficientDet/coco_calib/COCO_train2014_000000000009.jpg']
time_1: 3582 Image 1 / 1000
len(images): 1
detections: [[{'ymin': -10.390625, 'xmin': 299.84375, 'ymax': 240.625, 'xmax': 637.5, 'score': 0.6845703, 'class': 50}, {'ymin': 80.9375, 'xmin': -1.25, 'ymax': 436.875, 'xmax': 459.0625, 'score': 0.625, 'class': 50}, {'ymin': 188.125, 'xmin': 27.1875, 'ymax': 471.25, 'xmax': 601.25, 'score': 0.57666016, 'class': 50}, {'ymin': 222.8125, 'xmin': 249.6875, 'ymax': 472.8125, 'xmax': 562.1875, 'score': 0.54296875, 'class': 55}, {'ymin': 69.53125, 'xmin': 388.125, 'ymax': 141.875, 'xmax': 470.0, 'score': 0.453125, 'class': 54}, {'ymin': 6.328125, 'xmin': 19.375, 'ymax': 293.4375, 'xmax': 427.1875, 'score': 0.42456055, 'class': 50}]]
time_2: 230
len(batch_images): ['/media/yichao/蚁巢文件/YOYOFile/EfficientDet/coco_calib/COCO_train2014_000000000151.jpg']
time_1: 10 Image 2 / 1000
len(images): 1
detections: [[{'ymin': 326.875, 'xmin': 211.5625, 'ymax': 365.625, 'xmax': 250.0, 'score': 0.84716797, 'class': 12}, {'ymin': 38.28125, 'xmin': 200.625, 'ymax': 635.0, 'xmax': 473.75, 'score': 0.7636719, 'class': 6}]]
time_2: 100
...
...
len(batch_images): ['/media/yichao/蚁巢文件/YOYOFile/EfficientDet/coco_calib/COCO_train2014_000000580385.jpg']
time_1: 11 Image 998 / 1000
len(images): 1
detections: [[{'ymin': 47.265625, 'xmin': 125.15625, 'ymax': 359.375, 'xmax': 525.625, 'score': 0.9370117, 'class': 6}]]
time_2: 114
len(batch_images): ['/media/yichao/蚁巢文件/YOYOFile/EfficientDet/coco_calib/COCO_train2014_000000581218.jpg']
time_1: 11 Image 999 / 1000
len(images): 1
detections: [[{'ymin': 309.6875, 'xmin': 241.875, 'ymax': 348.4375, 'xmax': 271.875, 'score': 0.6899414, 'class': 4}, {'ymin': 147.734375, 'xmin': 271.40625, 'ymax': 188.75, 'xmax': 300.46875, 'score': 0.640625, 'class': 4}, {'ymin': 269.375, 'xmin': 260.46875, 'ymax': 310.3125, 'xmax': 289.53125, 'score': 0.6303711, 'class': 4}, {'ymin': 234.53125, 'xmin': 308.125, 'ymax': 274.21875, 'xmax': 339.375, 'score': 0.52734375, 'class': 4}, {'ymin': 273.125, 'xmin': 299.0625, 'ymax': 313.125, 'xmax': 325.625, 'score': 0.48632812, 'class': 4}, {'ymin': 233.125, 'xmin': 282.1875, 'ymax': 268.4375, 'xmax': 312.5, 'score': 0.41870117, 'class': 4}]]
time_2: 170
len(batch_images): ['/media/yichao/蚁巢文件/YOYOFile/EfficientDet/coco_calib/COCO_train2014_000000581766.jpg']
time_1: 10 Image 1000 / 1000
len(images): 1
detections: [[{'ymin': 148.5595703125, 'xmin': 204.1015625, 'ymax': 283.69140625, 'xmax': 297.36328125, 'score': 0.8774414, 'class': 69}, {'ymin': 153.80859375, 'xmin': 17.9443359375, 'ymax': 285.64453125, 'xmax': 127.44140625, 'score': 0.81396484, 'class': 69}, {'ymin': 146.728515625, 'xmin': 373.291015625, 'ymax': 278.80859375, 'xmax': 481.689453125, 'score': 0.7519531, 'class': 69}]]
time_2: 113
get_batch time: 158293
Finished Processing
real 3m5.879s
user 2m28.488s
sys 0m5.079s
结论:tensorRT FP 16平均耗时 11ms/张,即 90fps。
显存占用情况
Sat Sep 18 11:22:24 2021
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.63.01 Driver Version: 470.63.01 CUDA Version: 11.4 |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|===============================+======================+======================|
| 0 NVIDIA GeForce ... Off | 00000000:01:00.0 On | N/A |
| 27% 40C P0 20W / 75W | 1094MiB / 3903MiB | 11% Default |
| | | N/A |
+-------------------------------+----------------------+----------------------+
+-----------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=============================================================================|
| 0 N/A N/A 1465 G /usr/lib/xorg/Xorg 305MiB |
| 0 N/A N/A 19579 C python 785MiB |
+-----------------------------------------------------------------------------+
cd /home/yichao/MyDocuments/TensorRT/samples/python/efficientdet
python infer.py \
--engine /home/yichao/Downloads/saved_model_trt_int8/engine.trt \
--input /home/yichao/Downloads/coco_calib \
--output /home/yichao/Downloads/infer_int8
(tensorRT-efficientdet) yichao@yichao:/media/yichao/蚁巢文件/YOYOFile/YOYOFile/PyProjects/TensorRT/samples/python/efficientdet$ time python infer.py \
> --engine /media/yichao/蚁巢文件/YOYOFile/EfficientDet/saved_model_trt_int8/engine.trt \
> --input /media/yichao/蚁巢文件/YOYOFile/EfficientDet/coco_calib \
> --output /media/yichao/蚁巢文件/YOYOFile/EfficientDet/infer_int8
len(batch_images): ['/media/yichao/蚁巢文件/YOYOFile/EfficientDet/coco_calib/COCO_train2014_000000000009.jpg']
time_1: 499 Image 1 / 1000
len(images): 1
detections: [[{'ymin': 224.6875, 'xmin': 251.40625, 'ymax': 476.5625, 'xmax': 561.875, 'score': 0.57666016, 'class': 55}, {'ymin': 1.09375, 'xmin': 311.5625, 'ymax': 238.75, 'xmax': 630.3125, 'score': 0.50146484, 'class': 50}, {'ymin': 68.125, 'xmin': -3.4375, 'ymax': 458.125, 'xmax': 527.5, 'score': 0.49902344, 'class': 50}]]
time_2: 165
len(batch_images): ['/media/yichao/蚁巢文件/YOYOFile/EfficientDet/coco_calib/COCO_train2014_000000000151.jpg']
time_1: 6 Image 2 / 1000
len(images): 1
detections: [[{'ymin': 18.28125, 'xmin': 204.6875, 'ymax': 639.375, 'xmax': 505.9375, 'score': 0.5888672, 'class': 6}]]
time_2: 104
...
...
len(batch_images): ['/media/yichao/蚁巢文件/YOYOFile/EfficientDet/coco_calib/COCO_train2014_000000580385.jpg']
time_1: 6 Image 998 / 1000
len(images): 1
detections: [[{'ymin': 45.0, 'xmin': 122.03125, 'ymax': 361.875, 'xmax': 525.625, 'score': 0.9316406, 'class': 6}]]
time_2: 120
len(batch_images): ['/media/yichao/蚁巢文件/YOYOFile/EfficientDet/coco_calib/COCO_train2014_000000581218.jpg']
time_1: 6 Image 999 / 1000
len(images): 1
detections: [[]]
time_2: 177
len(batch_images): ['/media/yichao/蚁巢文件/YOYOFile/EfficientDet/coco_calib/COCO_train2014_000000581766.jpg']
time_1: 6 Image 1000 / 1000
len(images): 1
detections: [[{'ymin': 147.705078125, 'xmin': 204.1015625, 'ymax': 284.423828125, 'xmax': 297.36328125, 'score': 0.7817383, 'class': 69}, {'ymin': 151.3671875, 'xmin': 17.63916015625, 'ymax': 281.982421875, 'xmax': 128.173828125, 'score': 0.7714844, 'class': 69}, {'ymin': 145.99609375, 'xmin': 373.779296875, 'ymax': 277.34375, 'xmax': 482.666015625, 'score': 0.7338867, 'class': 69}]]
time_2: 108
get_batch time: 153597
Finished Processing
real 2m37.937s
user 2m25.907s
sys 0m2.906s
结论:tensorRT INT8平均耗时 7ms/张,即 142fps。
显存占用情况
Tue Sep 28 18:35:05 2021
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.63.01 Driver Version: 470.63.01 CUDA Version: 11.4 |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|===============================+======================+======================|
| 0 NVIDIA GeForce ... Off | 00000000:01:00.0 On | N/A |
| 28% 43C P0 18W / 75W | 935MiB / 3903MiB | 10% Default |
| | | N/A |
+-------------------------------+----------------------+----------------------+
+-----------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=============================================================================|
| 0 N/A N/A 1553 G /usr/lib/xorg/Xorg 156MiB |
| 0 N/A N/A 7282 C python 775MiB |
+-----------------------------------------------------------------------------+
cd /home/yichao/MyDocuments/TensorRT/samples/python/efficientdet
python eval_coco.py \
--engine /home/yichao/Downloads/saved_model_trt_fp32/engine.trt \
--input /home/yichao/Downloads/COCO/val2017 \
--annotations /home/yichao/Downloads/COCO/annotations/instances_val2017.json \
--automl_path /home/yichao/MyDocuments/automl
(tensorRT-efficientdet) yichao@yichao:~/MyDocuments/TensorRT/samples/python/efficientdet$ time python eval_coco.py \
> --engine /home/yichao/Downloads/saved_model_trt_fp32/engine.trt \
> --input /home/yichao/Downloads/COCO/val2017 \
> --annotations /home/yichao/Downloads/COCO/annotations/instances_val2017.json \
> --automl_path /home/yichao/MyDocuments/automl
2021-09-18 11:47:43.730693: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
Processing Image 5000 / 5000
loading annotations into memory...
Done (t=0.60s)
creating index...
index created!
Loading and preparing results...
Converting ndarray to lists...
(18145, 7)
0/18145
DONE (t=0.13s)
creating index...
index created!
Running per image evaluation...
Evaluate annotation type *bbox*
DONE (t=8.57s).
Accumulating evaluation results...
DONE (t=1.46s).
Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.282
Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.397
Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.315
Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.053
Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.319
Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.494
Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.242
Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.315
Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.316
Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.052
Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.349
Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.562
real 3m11.921s
user 2m3.908s
sys 0m17.163s
显存占用情况
Sat Sep 18 11:47:57 2021
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.63.01 Driver Version: 470.63.01 CUDA Version: 11.4 |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|===============================+======================+======================|
| 0 NVIDIA GeForce ... Off | 00000000:01:00.0 On | N/A |
| 27% 45C P0 38W / 75W | 1146MiB / 3903MiB | 34% Default |
| | | N/A |
+-------------------------------+----------------------+----------------------+
+-----------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=============================================================================|
| 0 N/A N/A 1465 G /usr/lib/xorg/Xorg 305MiB |
| 0 N/A N/A 20581 C python 837MiB |
+-----------------------------------------------------------------------------+
cd /home/yichao/MyDocuments/TensorRT/samples/python/efficientdet
python eval_coco.py \
--engine /home/yichao/Downloads/saved_model_trt_fp16/engine.trt \
--input /home/yichao/Downloads/COCO/val2017 \
--annotations /home/yichao/Downloads/COCO/annotations/instances_val2017.json \
--automl_path /home/yichao/MyDocuments/automl
(tensorRT-efficientdet) yichao@yichao:~/MyDocuments/TensorRT/samples/python/efficientdet$ python eval_coco.py \
> --engine /home/yichao/Downloads/saved_model_trt_fp16/engine.trt \
> --input /home/yichao/Downloads/COCO/val2017 \
> --annotations /home/yichao/Downloads/COCO/annotations/instances_val2017.json \
> --automl_path /home/yichao/MyDocuments/automl
2021-09-17 18:20:02.568124: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
Processing Image 5000 / 5000
loading annotations into memory...
Done (t=0.63s)
creating index...
index created!
Loading and preparing results...
Converting ndarray to lists...
(18146, 7)
0/18146
DONE (t=0.13s)
creating index...
index created!
Running per image evaluation...
Evaluate annotation type *bbox*
DONE (t=8.83s).
Accumulating evaluation results...
DONE (t=1.46s).
Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.282
Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.397
Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.315
Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.053
Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.319
Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.494
Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.242
Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.316
Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.316
Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.052
Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.349
Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.562
显存占用情况
Sat Sep 18 11:27:42 2021
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.63.01 Driver Version: 470.63.01 CUDA Version: 11.4 |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|===============================+======================+======================|
| 0 NVIDIA GeForce ... Off | 00000000:01:00.0 On | N/A |
| 27% 43C P0 28W / 75W | 1094MiB / 3903MiB | 36% Default |
| | | N/A |
+-------------------------------+----------------------+----------------------+
+-----------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=============================================================================|
| 0 N/A N/A 1465 G /usr/lib/xorg/Xorg 305MiB |
| 0 N/A N/A 19780 C python 785MiB |
+-----------------------------------------------------------------------------+
cd /home/yichao/MyDocuments/TensorRT/samples/python/efficientdet
python eval_coco.py \
--engine /home/yichao/Downloads/saved_model_trt_int8/engine.trt \
--input /home/yichao/Downloads/COCO/val2017 \
--annotations /home/yichao/Downloads/COCO/annotations/instances_val2017.json \
--automl_path /home/yichao/MyDocuments/automl
(tensorRT-efficientdet) yichao@yichao:~/MyDocuments/TensorRT/samples/python/efficientdet$ time python eval_coco.py \
> --engine /home/yichao/Downloads/saved_model_trt_int8/engine.trt \
> --input /home/yichao/Downloads/COCO/val2017 \
> --annotations /home/yichao/Downloads/COCO/annotations/instances_val2017.json \
> --automl_path /home/yichao/MyDocuments/automl
2021-09-18 10:20:40.389173: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
Processing Image 5000 / 5000
loading annotations into memory...
Done (t=0.65s)
creating index...
index created!
Loading and preparing results...
Converting ndarray to lists...
(14482, 7)
0/14482
DONE (t=0.13s)
creating index...
index created!
Running per image evaluation...
Evaluate annotation type *bbox*
DONE (t=7.95s).
Accumulating evaluation results...
DONE (t=1.38s).
Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.227
Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.318
Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.254
Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.024
Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.231
Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.430
Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.199
Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.254
Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.254
Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.022
Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.245
Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.492
real 2m40.845s
user 1m45.329s
sys 0m7.975s
显存占用情况
Sat Sep 18 10:23:19 2021
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.63.01 Driver Version: 470.63.01 CUDA Version: 11.4 |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|===============================+======================+======================|
| 0 NVIDIA GeForce ... Off | 00000000:01:00.0 On | N/A |
| 27% 42C P0 18W / 75W | 1279MiB / 3903MiB | 3% Default |
| | | N/A |
+-------------------------------+----------------------+----------------------+
+-----------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=============================================================================|
| 0 N/A N/A 1465 G /usr/lib/xorg/Xorg 305MiB |
| 0 N/A N/A 16664 C python 972MiB |
+-----------------------------------------------------------------------------+
cd /home/yichao/MyDocuments/TensorRT/samples/python/efficientdet
python compare_tf.py \
--engine /home/yichao/Downloads/saved_model_trt_fp32/engine.trt \
--saved_model /home/yichao/Downloads/saved_model \
--input /home/yichao/Downloads/coco_calib \
--output /home/yichao/Downloads/output_fp32
(tensorRT-efficientdet) yichao@yichao:~/MyDocuments/TensorRT/samples/python/efficientdet$ time python compare_tf.py \
> --engine /home/yichao/Downloads/saved_model_trt_fp32/engine.trt \
> --saved_model /home/yichao/Downloads/saved_model \
> --input /home/yichao/Downloads/coco_calib \
> --output /home/yichao/Downloads/output_fp32
2021-09-18 11:51:59.627768: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
2021-09-18 11:52:05.397166: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1
2021-09-18 11:52:05.397267: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-09-18 11:52:05.397505: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties:
pciBusID: 0000:01:00.0 name: NVIDIA GeForce GTX 1650 computeCapability: 7.5
coreClock: 1.68GHz coreCount: 14 deviceMemorySize: 3.81GiB deviceMemoryBandwidth: 119.24GiB/s
2021-09-18 11:52:05.397524: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
2021-09-18 11:52:05.397555: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublas.so.11
2021-09-18 11:52:05.397577: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublasLt.so.11
2021-09-18 11:52:05.466217: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcufft.so.10
2021-09-18 11:52:05.466545: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcurand.so.10
2021-09-18 11:52:05.466862: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcusolver.so.11'; dlerror: libcusolver.so.11: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: :/usr/local/cuda/lib64:/home/yichao/360Downloads/TensorRT-8.0.1.6/lib:/home/yichao/miniconda3/envs/tensorRT-yolov5/lib/libmkl_intel_lp64.so
2021-09-18 11:52:05.495741: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcusparse.so.11
2021-09-18 11:52:05.495966: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudnn.so.8
2021-09-18 11:52:05.496021: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1766] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...
2021-09-18 11:52:05.803052: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2021-09-18 11:52:05.803585: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:
2021-09-18 11:52:05.803601: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264]
2021-09-18 11:52:12.462233: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)
2021-09-18 11:52:12.505942: I tensorflow/core/platform/profile_utils/cpu_utils.cc:114] CPU Frequency: 2899885000 Hz
Processing 100 / 100 images (TensorFlow)
Processing 100 / 100 images (TensorRT)
Processing 100 / 100 images (Visualization)
real 0m52.613s
user 1m23.590s
sys 0m3.248s
显存占用情况
Sat Sep 18 11:52:37 2021
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.63.01 Driver Version: 470.63.01 CUDA Version: 11.4 |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|===============================+======================+======================|
| 0 NVIDIA GeForce ... Off | 00000000:01:00.0 On | N/A |
| 33% 44C P0 18W / 75W | 1146MiB / 3903MiB | 2% Default |
| | | N/A |
+-------------------------------+----------------------+----------------------+
+-----------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=============================================================================|
| 0 N/A N/A 1465 G /usr/lib/xorg/Xorg 305MiB |
| 0 N/A N/A 20720 C python 837MiB |
+-----------------------------------------------------------------------------+
cd /home/yichao/MyDocuments/TensorRT/samples/python/efficientdet
python compare_tf.py \
--engine /home/yichao/Downloads/saved_model_trt_fp16/engine.trt \
--saved_model /home/yichao/Downloads/saved_model \
--input /home/yichao/Downloads/coco_calib \
--output /home/yichao/Downloads/output_fp16
(tensorRT-efficientdet) yichao@yichao:/media/yichao/蚁巢文件/YOYOFile/YOYOFile/PyProjects/TensorRT/samples/python/efficientdet$ time python compare_tf.py --engine /media/yichao/蚁巢文件/YOYOFile/EfficientDet/saved_model_trt_fp16/engine.trt --saved_model /media/yichao/蚁巢文件/YOYOFile/EfficientDet/saved_model --input /media/yichao/蚁巢文件/YOYOFile/EfficientDet/coco_calib --output /media/yichao/蚁巢文件/YOYOFile/EfficientDet/output_fp16
2021-10-22 16:23:31.638538: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
2021-10-22 16:23:32.897147: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1
2021-10-22 16:23:32.897287: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-10-22 16:23:32.897550: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties:
pciBusID: 0000:01:00.0 name: NVIDIA GeForce GTX 1650 computeCapability: 7.5
coreClock: 1.68GHz coreCount: 14 deviceMemorySize: 3.81GiB deviceMemoryBandwidth: 119.24GiB/s
2021-10-22 16:23:32.897571: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
2021-10-22 16:23:32.897604: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublas.so.11
2021-10-22 16:23:32.897625: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublasLt.so.11
2021-10-22 16:23:32.898508: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcufft.so.10
2021-10-22 16:23:32.898545: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcurand.so.10
2021-10-22 16:23:32.898692: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcusolver.so.11'; dlerror: libcusolver.so.11: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: :/usr/local/cuda/lib64:/usr/local/cuda-10.2/lib64:/home/yichao/MyDocuments/PyProjects/software/TensorRT-7.0.0.11/lib:/home/yichao/miniconda3/envs/tensorRT-yolov5/lib/libmkl_intel_lp64.so
2021-10-22 16:23:32.899198: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcusparse.so.11
2021-10-22 16:23:32.899227: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudnn.so.8
2021-10-22 16:23:32.899236: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1766] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...
2021-10-22 16:23:33.134158: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2021-10-22 16:23:33.134560: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:
2021-10-22 16:23:33.134572: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264]
2021-10-22 16:23:40.393064: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)
2021-10-22 16:23:40.438255: I tensorflow/core/platform/profile_utils/cpu_utils.cc:114] CPU Frequency: 2899885000 Hz
len(batch_images): ['/media/yichao/蚁巢文件/YOYOFile/EfficientDet/coco_calib/COCO_train2014_000000000009.jpg']
...
...
len(batch_images): ['/media/yichao/蚁巢文件/YOYOFile/EfficientDet/coco_calib/COCO_train2014_000000061328.jpg']
len(batch_images): ['/media/yichao/蚁巢文件/YOYOFile/EfficientDet/coco_calib/COCO_train2014_000000061399.jpg']
Processing 100 / 100 images (TensorFlow)
get_batch time: 15944
time_1: 15944
len(batch_images): ['/media/yichao/蚁巢文件/YOYOFile/EfficientDet/coco_calib/COCO_train2014_000000000009.jpg']
...
...
len(batch_images): ['/media/yichao/蚁巢文件/YOYOFile/EfficientDet/coco_calib/COCO_train2014_000000061328.jpg']
len(batch_images): ['/media/yichao/蚁巢文件/YOYOFile/EfficientDet/coco_calib/COCO_train2014_000000061399.jpg']
Processing 100 / 100 images (TensorRT)
get_batch time: 2519
time_2: 2519
Processing 100 / 100 images (Visualization)
real 0m35.517s
user 1m5.616s
sys 0m3.689s
数据一:
测试数据 | 分辨率 | TensorFlow耗时/ms | TensorRT耗时/ms | 加速比 |
---|---|---|---|---|
COCO,100张 | 640x480 | 16658 | 2577 | 6.5 |
COCO,100张 | 640x480 | 15944 | 2519 | 6.3 |
COCO,100张 | 640x480 | 16982 | 2559 | 6.6 |
COCO,100张 | 640x480 | 16026 | 2533 | 6.3 |
数据二:
测试数据 | 分辨率 | TensorFlow耗时/ms | TensorRT耗时/ms | 加速比 |
---|---|---|---|---|
COCO,100张 | 640x480 | 16089 | 5268 | 3 |
COCO,100张 | 640x480 | 11888 | 2509 | 4.7 |
COCO,100张 | 640x480 | 12012 | 3008 | 4 |
COCO,100张 | 640x480 | 11803 | 3183 | 3.7 |
COCO,100张 | 640x480 | 11776 | 3140 | 3.7 |
测试数据 | 分辨率 | TensorFlow耗时/ms | TensorRT耗时/ms | 加速比 |
---|---|---|---|---|
person_horse,100张 | 1280x720 | 12891 | 3452 | 3.7 |
person_horse,100张 | 1280x720 | 14989 | 3480 | 4.3 |
person_horse,100张 | 1280x720 | 15565 | 3515 | 4.4 |
person_horse,100张 | 1280x720 | 12883 | 3456 | 3.7 |
注意:$\text{加速比}=\frac{\text{TensorFlow耗时}}{\text{TensorRT耗时}}$
显存占用情况
Sat Sep 18 11:33:21 2021
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.63.01 Driver Version: 470.63.01 CUDA Version: 11.4 |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|===============================+======================+======================|
| 0 NVIDIA GeForce ... Off | 00000000:01:00.0 On | N/A |
| 30% 41C P0 18W / 75W | 1094MiB / 3903MiB | 2% Default |
| | | N/A |
+-------------------------------+----------------------+----------------------+
+-----------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=============================================================================|
| 0 N/A N/A 1465 G /usr/lib/xorg/Xorg 305MiB |
| 0 N/A N/A 20023 C python 785MiB |
+-----------------------------------------------------------------------------+
cd /home/yichao/MyDocuments/TensorRT/samples/python/efficientdet
python compare_tf.py \
--engine /home/yichao/Downloads/saved_model_trt_int8/engine.trt \
--saved_model /home/yichao/Downloads/saved_model \
--input /home/yichao/Downloads/coco_calib \
--output /home/yichao/Downloads/output_int8
(tensorRT-efficientdet) yichao@yichao:~/MyDocuments/TensorRT/samples/python/efficientdet$ time python compare_tf.py \
> --engine /home/yichao/Downloads/saved_model_trt_int8/engine.trt \
> --saved_model /home/yichao/Downloads/saved_model \
> --input /home/yichao/Downloads/coco_calib \
> --output /home/yichao/Downloads/output_int8
2021-09-18 09:25:28.721580: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
2021-09-18 09:25:34.855431: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1
2021-09-18 09:25:34.855675: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-09-18 09:25:34.856186: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties:
pciBusID: 0000:01:00.0 name: NVIDIA GeForce GTX 1650 computeCapability: 7.5
coreClock: 1.68GHz coreCount: 14 deviceMemorySize: 3.81GiB deviceMemoryBandwidth: 119.24GiB/s
2021-09-18 09:25:34.856231: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
2021-09-18 09:25:34.856312: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublas.so.11
2021-09-18 09:25:34.856366: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublasLt.so.11
2021-09-18 09:25:34.938908: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcufft.so.10
2021-09-18 09:25:34.939165: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcurand.so.10
2021-09-18 09:25:34.939584: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcusolver.so.11'; dlerror: libcusolver.so.11: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: :/usr/local/cuda/lib64:/home/yichao/360Downloads/TensorRT-8.0.1.6/lib:/home/yichao/miniconda3/envs/tensorRT-yolov5/lib/libmkl_intel_lp64.so
2021-09-18 09:25:34.974029: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcusparse.so.11
2021-09-18 09:25:34.974272: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudnn.so.8
2021-09-18 09:25:34.974324: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1766] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...
2021-09-18 09:25:35.443757: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2021-09-18 09:25:35.445486: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:
2021-09-18 09:25:35.445558: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264]
2021-09-18 09:25:42.378910: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)
2021-09-18 09:25:42.434208: I tensorflow/core/platform/profile_utils/cpu_utils.cc:114] CPU Frequency: 2899885000 Hz
Processing 100 / 100 images (TensorFlow)
Processing 100 / 100 images (TensorRT)
Processing 100 / 100 images (Visualization)
real 0m53.631s
user 1m19.881s
sys 0m3.897s
显存占用情况
Sat Sep 18 10:27:43 2021
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.63.01 Driver Version: 470.63.01 CUDA Version: 11.4 |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|===============================+======================+======================|
| 0 NVIDIA GeForce ... Off | 00000000:01:00.0 On | N/A |
| 27% 40C P0 18W / 75W | 1094MiB / 3903MiB | 4% Default |
| | | N/A |
+-------------------------------+----------------------+----------------------+
+-----------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=============================================================================|
| 0 N/A N/A 1465 G /usr/lib/xorg/Xorg 305MiB |
| 0 N/A N/A 16827 C python 785MiB |
+-----------------------------------------------------------------------------+