Summary: This article details how to run model inference with TensorRT in a Python environment, covering environment setup, model conversion, inference code, and performance optimization, helping developers achieve efficient AI deployment.
TensorRT is NVIDIA's high-performance deep learning inference engine, designed specifically for optimizing model deployment in production. Compared with native frameworks such as PyTorch and TensorFlow, TensorRT applies layer fusion, precision calibration, and automatic kernel tuning to speed up inference by roughly 3-10x, and its FP16 and INT8 low-precision modes significantly reduce GPU memory usage.
In the Python ecosystem, TensorRT is exposed through two core libraries, tensorrt and onnx-tensorrt, which support converting models directly from the ONNX format and integrate deeply with CUDA and cuDNN. The Python package can be installed in either of two ways:
```bash
# Method 1: install the prebuilt package via pip (recommended)
pip install nvidia-tensorrt

# Method 2: download the .whl file from the NVIDIA website and install it
# Choose the package matching your CUDA version, e.g.:
# pip install tensorrt-8.6.1.6-cp38-none-linux_x86_64.whl

# Verify the installation
python -c "import tensorrt as trt; print(trt.__version__)"
```
Handling common issues:
- Run `conda list` to check the installed CUDA/cuDNN versions and make sure they are compatible with TensorRT (a quick check sketch follows this list)
- On permission errors, install with the `--user` flag or use a virtual environment
- Install `uff-converter-tf` when converting TensorFlow models
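A quick way to confirm what the environment actually provides is to print the versions from Python. A minimal sketch, assuming PyTorch is also installed (the CUDA/cuDNN versions it reports are the ones it was built against):

```python
# Minimal environment check; assumes PyTorch is installed alongside TensorRT
import tensorrt as trt
import torch

print("TensorRT:", trt.__version__)
print("CUDA (PyTorch build):", torch.version.cuda)
print("cuDNN:", torch.backends.cudnn.version())
print("GPU available:", torch.cuda.is_available())
```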
```python
import torch
import torchvision.models as models

# Load a pretrained model
model = models.resnet50(pretrained=True)
model.eval()

# Create an example input
dummy_input = torch.randn(1, 3, 224, 224)

# Export the ONNX model
torch.onnx.export(
    model,
    dummy_input,
    "resnet50.onnx",
    input_names=["input"],
    output_names=["output"],
    dynamic_axes={"input": {0: "batch_size"}, "output": {0: "batch_size"}},
    opset_version=13,
)
```
Key parameters:
- `dynamic_axes`: enables a dynamic batch dimension
- `opset_version`: 11 or higher is recommended for the best compatibility
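Before handing the file to TensorRT, it is worth validating the export. A short sketch using the `onnx` package (assumed to be installed separately):

```python
# Validate the exported graph; check_model raises if it is malformed
import onnx

onnx_model = onnx.load("resnet50.onnx")
onnx.checker.check_model(onnx_model)
print([i.name for i in onnx_model.graph.input],
      "->", [o.name for o in onnx_model.graph.output])
```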
```python
import tensorrt as trt

# Initialize the logger
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

# Create the builder and the network
builder = trt.Builder(TRT_LOGGER)
network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))

# Create the ONNX parser
parser = trt.OnnxParser(network, TRT_LOGGER)
with open("resnet50.onnx", "rb") as model_file:
    if not parser.parse(model_file.read()):
        for error in range(parser.num_errors):
            print(parser.get_error(error))
        raise RuntimeError("Failed to parse ONNX model")

# Configure the engine
config = builder.create_builder_config()
config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 1 << 30)  # 1 GB workspace

# The model was exported with a dynamic batch dimension,
# so an optimization profile is required to build the engine
profile = builder.create_optimization_profile()
profile.set_shape("input", (1, 3, 224, 224), (8, 3, 224, 224), (32, 3, 224, 224))
config.add_optimization_profile(profile)

# Build the engine (FP16 mode)
if builder.platform_has_fast_fp16:
    config.set_flag(trt.BuilderFlag.FP16)
engine = builder.build_engine(network, config)

# Serialize the engine
with open("resnet50.engine", "wb") as f:
    f.write(engine.serialize())
```
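Note that `builder.build_engine` is deprecated in TensorRT 8.x and removed in later releases. An equivalent sketch using `build_serialized_network`, which returns the serialized plan directly:

```python
# Alternative for TensorRT 8.0+: build and serialize in one step
serialized_engine = builder.build_serialized_network(network, config)
with open("resnet50.engine", "wb") as f:
    f.write(serialized_engine)
```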
Optimization tips:
- INT8 quantization: enable it with `config.set_flag(trt.BuilderFlag.INT8)`; a calibrator is also required (a sketch follows this list)
- Use `IExecutionContext` for asynchronous inference
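INT8 mode needs a calibrator that feeds representative data during the build so TensorRT can choose scaling factors. A minimal entropy-calibrator sketch; `calibration_batches` is a hypothetical NumPy array of preprocessed inputs shaped (N, 3, 224, 224):

```python
import numpy as np
import pycuda.driver as cuda
import pycuda.autoinit  # a CUDA context is needed for the device buffer
import tensorrt as trt

class EntropyCalibrator(trt.IInt8EntropyCalibrator2):
    def __init__(self, calibration_batches, batch_size=8, cache_file="calib.cache"):
        super().__init__()
        self.data = np.ascontiguousarray(calibration_batches, dtype=np.float32)
        self.batch_size = batch_size
        self.cache_file = cache_file
        self.index = 0
        # Device buffer large enough for one batch
        self.device_input = cuda.mem_alloc(self.data[0].nbytes * batch_size)

    def get_batch_size(self):
        return self.batch_size

    def get_batch(self, names):
        if self.index + self.batch_size > len(self.data):
            return None  # signals that calibration data is exhausted
        batch = np.ascontiguousarray(self.data[self.index:self.index + self.batch_size])
        cuda.memcpy_htod(self.device_input, batch)
        self.index += self.batch_size
        return [int(self.device_input)]

    def read_calibration_cache(self):
        # Reuse a previous calibration run if a cache file exists
        try:
            with open(self.cache_file, "rb") as f:
                return f.read()
        except FileNotFoundError:
            return None

    def write_calibration_cache(self, cache):
        with open(self.cache_file, "wb") as f:
            f.write(cache)

# Attach it to the builder config before building:
# config.set_flag(trt.BuilderFlag.INT8)
# config.int8_calibrator = EntropyCalibrator(calibration_batches)
```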
```python
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit  # creates and activates a CUDA context
import numpy as np

class TensorRTInfer:
    def __init__(self, engine_path, input_shape=(1, 3, 224, 224)):
        # Initialize the logger
        self.logger = trt.Logger(trt.Logger.INFO)
        # Deserialize the engine
        with open(engine_path, "rb") as f:
            runtime = trt.Runtime(self.logger)
            self.engine = runtime.deserialize_cuda_engine(f.read())
        # Create the execution context
        self.context = self.engine.create_execution_context()
        # The engine was built with a dynamic batch dimension, so the
        # concrete input shape must be fixed before sizing the buffers
        self.context.set_binding_shape(0, input_shape)
        # One CUDA stream reused for all async copies and execution
        self.stream = cuda.Stream()
        # Allocate input/output buffers
        self.inputs, self.outputs, self.bindings = [], [], []
        for i in range(self.engine.num_bindings):
            size = trt.volume(self.context.get_binding_shape(i))
            dtype = trt.nptype(self.engine.get_binding_dtype(i))
            host_mem = cuda.pagelocked_empty(size, dtype)
            device_mem = cuda.mem_alloc(host_mem.nbytes)
            self.bindings.append(int(device_mem))
            if self.engine.binding_is_input(i):
                self.inputs.append({"host": host_mem, "device": device_mem})
            else:
                self.outputs.append({"host": host_mem, "device": device_mem})

    def infer(self, input_data):
        # Copy the input data to the device
        np.copyto(self.inputs[0]["host"], input_data.ravel())
        cuda.memcpy_htod_async(self.inputs[0]["device"], self.inputs[0]["host"], self.stream)
        # Run inference on the same stream
        self.context.execute_async_v2(bindings=self.bindings, stream_handle=self.stream.handle)
        # Copy the output back to the host on the same stream, then wait
        cuda.memcpy_dtoh_async(self.outputs[0]["host"], self.outputs[0]["device"], self.stream)
        self.stream.synchronize()
        # Return the result (reshape according to the model's output)
        return self.outputs[0]["host"].reshape(self.context.get_binding_shape(1))
```
```python
# Initialize the inference wrapper
infer = TensorRTInfer("resnet50.engine")

# Prepare input data (random data as an example)
input_data = np.random.rand(1, 3, 224, 224).astype(np.float32)

# Run inference
output = infer.infer(input_data)

# Print the results (real applications need post-processing)
print("Output shape:", output.shape)
print("Top-5 classes:", np.argsort(output[0])[-5:][::-1])
```
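To feed a real image instead of random data, the input must be resized and normalized the same way the model was trained. A preprocessing sketch assuming Pillow is installed; `cat.jpg` is a hypothetical local file, and the mean/std values are the standard torchvision ImageNet constants:

```python
from PIL import Image

def preprocess(path):
    # Resize to 224x224 and apply ImageNet normalization
    img = Image.open(path).convert("RGB").resize((224, 224))
    x = np.asarray(img, dtype=np.float32) / 255.0  # HWC in [0, 1]
    mean = np.array([0.485, 0.456, 0.406], dtype=np.float32)
    std = np.array([0.229, 0.224, 0.225], dtype=np.float32)
    x = (x - mean) / std
    x = x.transpose(2, 0, 1)[None]  # HWC -> NCHW with a batch dim
    return np.ascontiguousarray(x, dtype=np.float32)

output = infer.infer(preprocess("cat.jpg"))  # "cat.jpg" is a placeholder
```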
The `trtexec` tool: command-line benchmarking
```bash
trtexec --onnx=resnet50.onnx --fp16 --shapes=input:16x3x224x224 --verbose
```

(The `--batch` flag only applies to implicit-batch engines; explicit-batch ONNX models take `--shapes` instead.)
Export with `dynamic_axes` to support a variable batch size.
```python
logger = trt.Logger(trt.Logger.VERBOSE)  # emit verbose logs
```
```python
# Inspect the parsed network layer by layer
for layer in network:
    print(f"Layer {layer.name}: {layer.type}")
```
```python
# Specify dynamic dimensions when building the engine
profile = builder.create_optimization_profile()
profile.set_shape("input", min=(1, 3, 224, 224), opt=(8, 3, 224, 224), max=(32, 3, 224, 224))
config.add_optimization_profile(profile)
```
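At runtime the execution context must then be told which concrete shape within the profile to use before enqueueing work. A minimal sketch, assuming a `context` like the one created in `TensorRTInfer`:

```python
# Pick a concrete batch size within the profile's [min, max] range
context.set_binding_shape(0, (8, 3, 224, 224))
assert context.all_binding_shapes_specified  # every dynamic dim is now fixed
```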
```python
class Pipeline:
    """Chain several engines: each stage's output feeds the next stage."""

    def __init__(self, engines):
        self.engines = [TensorRTInfer(e) for e in engines]

    def run(self, inputs):
        intermediate = inputs
        for engine in self.engines:
            intermediate = engine.infer(intermediate)
        return intermediate
```
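A hypothetical usage, assuming `backbone.engine` and `head.engine` are two engine files (placeholder names) whose output and input shapes match:

```python
pipeline = Pipeline(["backbone.engine", "head.engine"])  # placeholder files
result = pipeline.run(input_data)
```

Note that this simple pipeline round-trips each intermediate result through host memory; for latency-critical paths the intermediate buffers could instead stay on the GPU.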
- Allocate host buffers with `pagelocked_empty` (pinned memory) to reduce host-device copy overhead
- Re-run `trtexec` periodically to catch performance regressions

With a systematic TensorRT inference setup, developers can fully exploit the compute power of NVIDIA GPUs from Python and ship stable, efficient production-grade AI deployments. In real projects, it pays to optimize for the specific workload, for example tuning the memory access pattern of embedding layers in a recommendation system, or implementing zero-copy frame passing in video stream processing.