简介:本文详细介绍如何使用TensorRT在Python环境中实现高效的深度学习模型推理,包括环境配置、模型转换、推理代码实现及性能优化技巧,帮助开发者提升模型推理速度。
TensorRT是NVIDIA推出的高性能深度学习推理优化器和运行时引擎,专为生产环境设计。其核心优势在于通过模型优化(如层融合、精度校准)和硬件感知调度,显著提升GPU上的推理速度。相比原生框架(如PyTorch/TensorFlow),TensorRT可实现3-10倍的加速效果,尤其适用于实时性要求高的场景(如自动驾驶、视频分析)。
在Python生态中,TensorRT通过官方的tensorrt包提供完整支持,也可以借助onnxruntime-gpu的TensorRT执行后端间接使用。开发者可将训练好的模型转换为TensorRT引擎文件(.engine),后续通过Python API加载执行推理。这种“训练-优化-部署”的分离模式,既保持了训练阶段的灵活性,又确保了部署阶段的高性能。
安装CUDA/cuDNN:
# Example for CUDA 11.8 on Ubuntu 22.04 (x86_64).
# Each command must be on its own line; fused onto one line they all become part of this comment.
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-ubuntu2204.pin
sudo mv cuda-ubuntu2204.pin /etc/apt/preferences.d/cuda-repository-pin-600
sudo apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/3bf863cc.pub
sudo add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/ /"
sudo apt-get update
sudo apt-get -y install cuda-11-8
安装TensorRT:
# Install via pip (recommended); the TensorRT version must match the installed CUDA version.
pip install tensorrt==8.6.1.6
# Alternatively, download the .deb package from the NVIDIA website and install it.
验证安装:
import tensorrt as trt
import pycuda.driver as cuda

# The tensorrt module exposes no `cuda()` helper, so CUDA details are queried
# through PyCUDA instead. `cuda.init()` must run before any device query.
cuda.init()

print(f"TensorRT版本: {trt.__version__}")
# get_version() reports the CUDA version PyCUDA was compiled against as (major, minor, revision).
print(f"CUDA版本: {'.'.join(str(part) for part in cuda.get_version())}")
print(f"GPU数量: {cuda.Device.count()}")
导出ONNX模型:
# PyTorch example: export a pretrained ResNet-18 to ONNX.
import torch

dummy_input = torch.randn(1, 3, 224, 224)
model = torch.hub.load('pytorch/vision:v0.10.0', 'resnet18', pretrained=True)
# Switch to inference mode so layers such as BatchNorm are exported with
# their inference-time behavior.
model.eval()

torch.onnx.export(
    model,
    dummy_input,
    "resnet18.onnx",
    input_names=["input"],
    output_names=["output"],
    # Mark dimension 0 as dynamic so the exported graph accepts any batch size.
    dynamic_axes={"input": {0: "batch"}, "output": {0: "batch"}},
    opset_version=13,
)
使用trtexec工具转换(命令行):
# Convert resnet18.onnx with trtexec, enabling FP16, and save the result as resnet18.engine.
trtexec --onnx=resnet18.onnx --saveEngine=resnet18.engine --fp16
Python API转换(更灵活):
import tensorrt as trt


def build_engine(onnx_path, engine_path):
    """Build an FP16 TensorRT engine from an ONNX file and save it to disk.

    Args:
        onnx_path: Path of the ONNX model to parse.
        engine_path: Path the serialized engine is written to.

    Returns:
        The deserialized engine, or None when ONNX parsing or the engine
        build fails (the reason is printed).
    """
    logger = trt.Logger(trt.Logger.INFO)
    builder = trt.Builder(logger)
    network = builder.create_network(
        1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
    )
    parser = trt.OnnxParser(network, logger)

    with open(onnx_path, "rb") as f:
        if not parser.parse(f.read()):
            for index in range(parser.num_errors):
                print(parser.get_error(index))
            return None

    config = builder.create_builder_config()
    config.set_flag(trt.BuilderFlag.FP16)  # enable FP16

    # The profile declares the min / opt / max shapes accepted for the
    # tensor named "input".
    profile = builder.create_optimization_profile()
    profile.set_shape(
        "input",
        min=(1, 3, 224, 224),
        opt=(8, 3, 224, 224),
        max=(32, 3, 224, 224),
    )
    config.add_optimization_profile(profile)

    # build_serialized_network replaces the deprecated build_engine and
    # returns None on failure; check it instead of crashing on None.serialize().
    serialized = builder.build_serialized_network(network, config)
    if serialized is None:
        print("引擎构建失败")
        return None

    with open(engine_path, "wb") as f:
        f.write(serialized)

    # Callers expect an engine object, so deserialize the plan just written.
    return trt.Runtime(logger).deserialize_cuda_engine(serialized)
精度模式:
FP32:默认模式,精度最高但速度最慢
FP16:速度提升2-3倍,需GPU支持(如V100/A100)
INT8:速度最快(比FP32快4倍),但需要校准数据集
动态形状:
通过optimization_profile设置输入的最小/最优/最大形状,支持变长输入
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit
import numpy as np


class TensorRTInfer:
    """Load a serialized TensorRT engine and run inference on it.

    Buffers are allocated once in the constructor. For bindings with dynamic
    dimensions they are sized for the maximum shape of optimization profile 0,
    so any input shape inside that profile can be served.
    """

    def __init__(self, engine_path):
        """Deserialize the engine at ``engine_path`` and allocate I/O buffers.

        Raises:
            RuntimeError: If the engine file cannot be deserialized.
        """
        self.logger = trt.Logger(trt.Logger.INFO)
        with open(engine_path, "rb") as f:
            runtime = trt.Runtime(self.logger)
            self.engine = runtime.deserialize_cuda_engine(f.read())
        if self.engine is None:
            raise RuntimeError(f"failed to deserialize TensorRT engine: {engine_path}")
        self.context = self.engine.create_execution_context()
        # execute_async_v2 needs a real stream handle; None is not accepted.
        self.stream = cuda.Stream()

        # Resolve dynamic (-1) input dimensions to the profile maximum first,
        # so the context can report concrete shapes for every binding.
        for index in range(self.engine.num_bindings):
            if not self.engine.binding_is_input(index):
                continue
            if -1 in tuple(self.engine.get_binding_shape(index)):
                max_shape = self.engine.get_profile_shape(0, index)[2]
                self.context.set_binding_shape(index, max_shape)

        # Allocate input/output buffers.
        self.inputs, self.outputs, self.bindings = [], [], []
        for index in range(self.engine.num_bindings):
            size = trt.volume(self.context.get_binding_shape(index))
            dtype = trt.nptype(self.engine.get_binding_dtype(index))
            host_mem = cuda.pagelocked_empty(size, dtype)
            device_mem = cuda.mem_alloc(host_mem.nbytes)
            self.bindings.append(int(device_mem))
            buffer = {
                "host": host_mem,
                "device": device_mem,
                "index": index,
                "dynamic": -1 in tuple(self.engine.get_binding_shape(index)),
            }
            if self.engine.binding_is_input(index):
                self.inputs.append(buffer)
            else:
                self.outputs.append(buffer)

    def infer(self, input_data):
        """Run one inference pass on ``input_data`` (fed to the first input).

        Returns:
            A list with one numpy array per engine output, each shaped like
            the corresponding output binding for this call.

        Raises:
            ValueError: If the input does not fit the engine's input binding.
            RuntimeError: If TensorRT reports that execution failed.
        """
        target = self.inputs[0]
        data = np.asarray(input_data)

        if target["dynamic"]:
            fits = data.size <= target["host"].size
            if not fits or not self.context.set_binding_shape(target["index"], data.shape):
                raise ValueError(f"input shape {data.shape} is outside the engine's profile")
        elif data.size != target["host"].size:
            raise ValueError(
                f"input has {data.size} elements, engine expects {target['host'].size}"
            )

        # Prepare input data; only the leading part of a max-sized buffer is used.
        np.copyto(target["host"][:data.size], data.ravel())

        # Transfer inputs to the device.
        for inp in self.inputs:
            cuda.memcpy_htod_async(inp["device"], inp["host"], self.stream)

        # Run inference.
        launched = self.context.execute_async_v2(
            bindings=self.bindings, stream_handle=self.stream.handle
        )
        if not launched:
            raise RuntimeError("TensorRT execution failed")

        # Transfer results back to the host and wait for the stream to drain.
        for out in self.outputs:
            cuda.memcpy_dtoh_async(out["host"], out["device"], self.stream)
        self.stream.synchronize()

        results = []
        for out in self.outputs:
            shape = tuple(self.context.get_binding_shape(out["index"]))
            # Copy so the result is not overwritten by the next call, which
            # reuses the same page-locked host buffer.
            results.append(out["host"][:trt.volume(shape)].reshape(shape).copy())
        return results


# Usage example
if __name__ == "__main__":
    infer = TensorRTInfer("resnet18.engine")
    dummy_input = np.random.randn(1, 3, 224, 224).astype(np.float32)
    output = infer.infer(dummy_input)
    print("推理结果形状:", output[0].shape)
def build_dynamic_engine(onnx_path, engine_path):
    """Build a TensorRT engine with a dynamic batch dimension and save it.

    Args:
        onnx_path: Path of the ONNX model to parse.
        engine_path: Path the serialized engine is written to.

    Returns:
        The deserialized engine, or None when ONNX parsing or the engine
        build fails (the reason is printed).
    """
    logger = trt.Logger(trt.Logger.INFO)
    builder = trt.Builder(logger)
    network = builder.create_network(
        1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
    )
    parser = trt.OnnxParser(network, logger)

    with open(onnx_path, "rb") as f:
        # A failed parse leaves the network incomplete; report and stop
        # instead of silently building from it.
        if not parser.parse(f.read()):
            for index in range(parser.num_errors):
                print(parser.get_error(index))
            return None

    config = builder.create_builder_config()

    # The profile declares the min / opt / max shapes accepted for the
    # tensor named "input".
    profile = builder.create_optimization_profile()
    profile.set_shape(
        "input",
        min=(1, 3, 224, 224),
        opt=(8, 3, 224, 224),
        max=(32, 3, 224, 224),
    )
    config.add_optimization_profile(profile)

    # build_serialized_network replaces the deprecated build_engine and
    # returns None on failure; check it instead of crashing on None.serialize().
    serialized = builder.build_serialized_network(network, config)
    if serialized is None:
        print("引擎构建失败")
        return None

    with open(engine_path, "wb") as f:
        f.write(serialized)

    # Callers expect an engine object, so deserialize the plan just written.
    return trt.Runtime(logger).deserialize_cuda_engine(serialized)
def build_int8_engine(onnx_path, engine_path, calibration_data):
    """Build an INT8 TensorRT engine, calibrating with ``calibration_data``.

    Args:
        onnx_path: Path of the ONNX model to parse.
        engine_path: Path the serialized engine is written to.
        calibration_data: Iterable yielding one calibration batch per item.
            NOTE(review): each batch is presumably an array already in the
            network's input dtype and layout - confirm against the caller.

    Returns:
        The deserialized engine, or None when ONNX parsing or the engine
        build fails (the reason is printed).
    """
    # Imported locally so this function does not depend on module-level
    # imports; autoinit creates the CUDA context the device copies need.
    import pycuda.autoinit  # noqa: F401
    import pycuda.driver as cuda

    logger = trt.Logger(trt.Logger.INFO)
    builder = trt.Builder(logger)
    network = builder.create_network(
        1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
    )
    parser = trt.OnnxParser(network, logger)

    with open(onnx_path, "rb") as f:
        # A failed parse leaves the network incomplete; report and stop.
        if not parser.parse(f.read()):
            for index in range(parser.num_errors):
                print(parser.get_error(index))
            return None

    config = builder.create_builder_config()
    config.set_flag(trt.BuilderFlag.INT8)

    class MyCalibrator(trt.IInt8EntropyCalibrator2):
        """Entropy calibrator that feeds batches from an iterator to TensorRT."""

        def __init__(self, data_loader, batch_size=32):
            super().__init__()
            # iter() lets lists and other iterables work as well as generators.
            self.data_loader = iter(data_loader)
            self.batch_size = batch_size
            self.cache_file = "calibrator.cache"
            self.device_input = None
            self.device_nbytes = 0

        def get_batch_size(self):
            return self.batch_size

        def get_batch(self, names):
            try:
                batch = next(self.data_loader)
            except StopIteration:
                # Returning None signals that calibration data is exhausted.
                return None
            host_batch = np.ascontiguousarray(batch)
            # TensorRT expects device pointers, not host arrays, so the batch
            # is copied into a device buffer that is reused between calls.
            if self.device_input is None or host_batch.nbytes > self.device_nbytes:
                self.device_input = cuda.mem_alloc(host_batch.nbytes)
                self.device_nbytes = host_batch.nbytes
            cuda.memcpy_htod(self.device_input, host_batch)
            return [int(self.device_input)]

        def read_calibration_cache(self):
            # TensorRT calls this hook without arguments. Returning None
            # forces a fresh calibration on every build.
            return None

        def write_calibration_cache(self, cache):
            with open(self.cache_file, "wb") as f:
                f.write(cache)

    calibrator = MyCalibrator(calibration_data)
    config.int8_calibrator = calibrator

    # NOTE(review): a model with dynamic input dimensions would presumably
    # also need an optimization profile and a calibration profile here -
    # confirm against the ONNX model being built.
    serialized = builder.build_serialized_network(network, config)
    if serialized is None:
        print("引擎构建失败")
        return None

    with open(engine_path, "wb") as f:
        f.write(serialized)

    # Callers expect an engine object, so deserialize the plan just written.
    return trt.Runtime(logger).deserialize_cuda_engine(serialized)
stream = cuda.Stream()
# Pass the stream's handle when launching inference (fragment from inside the inference class).
self.context.execute_async_v2(bindings=self.bindings, stream_handle=stream.handle)
builder.build_engine()时的自动选择最佳实现
self.inputs/self.outputs
cuda.register_buffer()减少内存拷贝
CUDA version mismatch
Invalid shape for input X
optimization_profile设置是否覆盖实际输入范围
builder_config进行优化
TensorRT通过模型优化和硬件感知调度,为Python开发者提供了高效的深度学习推理解决方案。本文详细介绍了从环境配置、模型转换到推理代码实现的完整流程,并提供了动态形状支持、INT8量化等高级功能的实现方法。实际测试表明,在ResNet50模型上,TensorRT相比PyTorch原生推理可实现:
未来发展方向包括更自动化的量化校准流程、支持更多新型网络结构(如Transformer)的优化,以及与ONNX Runtime等推理框架的深度集成。开发者应持续关注NVIDIA官方更新,以充分利用最新的优化技术。