简介:本文深入解析TensorRT推理的核心原理,结合Python代码示例,从模型转换到优化部署全流程讲解,帮助开发者快速掌握高性能推理实现方法。
TensorRT是NVIDIA推出的高性能深度学习推理优化器,通过量化、层融合、内核自动调优等技术,将预训练模型转换为高度优化的推理引擎。相较于原生框架推理,TensorRT可实现3-10倍的性能提升,特别适合边缘计算、实时视频分析等对延迟敏感的场景。
# Recommended environment setup
conda create -n trt_env python=3.8
conda activate trt_env
pip install nvidia-pyindex  # must be installed first
pip install onnx-graphsurgeon tensorrt
关键依赖说明:
支持输入的模型格式:
预处理要求:
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit  # noqa: F401 -- imported for its side effect: creates the CUDA context
import numpy as np


class HostDeviceMem:
    """Pairs a page-locked host buffer with its matching device allocation."""

    def __init__(self, host_mem, device_mem):
        self.host = host_mem
        self.device = device_mem

    def __str__(self):
        return f"Host:\n{self.host}\nDevice:\n{self.device}"


def allocate_buffers(engine):
    """Allocate host/device buffers plus a CUDA stream for every engine binding.

    Returns:
        (inputs, outputs, bindings, stream) -- inputs/outputs are lists of
        HostDeviceMem; bindings is the list of device pointers in binding
        order, as required by execute_async_v2.
    """
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()
    for binding in engine:
        # NOTE(review): get_binding_shape/get_binding_dtype/max_batch_size are
        # the legacy binding API (deprecated in TensorRT 8.5+ in favor of the
        # tensor-name API); kept here for compatibility with older engines.
        size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Pinned (page-locked) host memory enables faster async H2D/D2H copies.
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        bindings.append(int(device_mem))
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream


def do_inference(context, bindings, inputs, outputs, stream, batch_size=1):
    """Run one async inference pass and block until results are on the host.

    Copies inputs to the device, launches the engine, copies outputs back,
    then synchronizes the stream. Returns the list of host output arrays.
    """
    # Plain loops instead of side-effect list comprehensions (idiom fix:
    # comprehensions are for building values, not for their side effects).
    for inp in inputs:
        cuda.memcpy_htod_async(inp.device, inp.host, stream)
    context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
    for out in outputs:
        cuda.memcpy_dtoh_async(out.host, out.device, stream)
    stream.synchronize()
    return [out.host for out in outputs]
class TensorRTInfer:
    """Deserializes a TensorRT engine file and runs synchronous inference."""

    def __init__(self, engine_path):
        self.logger = trt.Logger(trt.Logger.INFO)
        self.runtime = trt.Runtime(self.logger)
        with open(engine_path, "rb") as f:
            serialized_engine = f.read()
        self.engine = self.runtime.deserialize_cuda_engine(serialized_engine)
        self.context = self.engine.create_execution_context()
        self.inputs, self.outputs, self.bindings, self.stream = allocate_buffers(
            self.engine
        )

    def infer(self, input_data):
        # Preprocessing: flatten the input into the pinned host buffer.
        np.copyto(self.inputs[0].host, input_data.ravel())
        # Run the engine.
        raw_outputs = do_inference(
            self.context,
            self.bindings,
            self.inputs,
            self.outputs,
            self.stream,
        )
        # Postprocessing: reshape to the model's output layout
        # (placeholder -- fill in the concrete shape for your model).
        output_data = raw_outputs[0].reshape(...)
        return output_data
批处理策略:
# Dynamic batching example
def batch_inference(self, input_batch):
    """Run inference on a whole batch.

    Args:
        input_batch: 4-D array laid out as batch * channel * height * width.
    Returns:
        The list of host output arrays from do_inference.
    """
    assert len(input_batch.shape) == 4
    batch_size = input_batch.shape[0]
    # Set the context's dynamic shape. BUG FIX: the original passed
    # input_batch.shape[1:], which drops the batch axis -- an explicit-batch
    # context expects the FULL (N, C, H, W) shape for the binding.
    self.context.set_binding_shape(0, input_batch.shape)
    # Copy the whole batch into the pinned input buffer and run inference
    # (the original left the call as a `do_inference(...)` placeholder).
    np.copyto(self.inputs[0].host, input_batch.ravel())
    return do_inference(
        self.context,
        self.bindings,
        self.inputs,
        self.outputs,
        self.stream,
        batch_size=batch_size,
    )
内存优化:
使用cuda.pagelocked_empty替代numpy.zeros,减少内存拷贝。
多线程实现:
import threading
from threading import Thread


class AsyncInfer:
    """Queues (input, callback) pairs for a worker thread to process.

    Only the enqueue side is shown; a real deployment pairs this with a
    worker Thread that drains self.queue and invokes each callback.
    """

    def __init__(self, engine_path):
        self.infer = TensorRTInfer(engine_path)
        self.queue = []
        # BUG FIX: the original imported only `Thread` from threading but
        # then called `threading.Lock()`, which raises NameError. Import the
        # module itself so the qualified call resolves.
        self.lock = threading.Lock()

    def async_infer(self, input_data, callback):
        """Enqueue a request; the lock guards the shared queue."""
        with self.lock:
            self.queue.append((input_data, callback))
        # In a real project a worker thread consumes self.queue.
诊断流程:
使用trtexec工具验证各层输出精度。
代码修复示例:
# Force specific layers to run in FP32 while the rest of the network uses FP16.
config = builder.create_builder_config()
profile = builder.create_optimization_profile()
config.set_flag(trt.BuilderFlag.FP16)  # global FP16
# BUG FIX: without this flag, per-layer `precision` is only a hint that the
# builder is free to ignore; OBEY_PRECISION_CONSTRAINTS makes it binding.
config.set_flag(trt.BuilderFlag.OBEY_PRECISION_CONSTRAINTS)
# Exception for one specific layer
layer = network.get_layer(idx)
layer.precision = trt.float32
典型错误:
[TRT] [E] Parameter check failed at: engine.cpp::setBindingDimensions::305,Conditions: (mEngine != nullptr), Error: Invalid engine
解决方案:
# Correctly configure dynamic dimensions: register min/opt/max shapes
# for the "input" binding and attach the profile to the builder config.
profile = builder.create_optimization_profile()
profile.set_shape(
    "input",
    min=(1, 3, 224, 224),
    opt=(8, 3, 224, 224),
    max=(32, 3, 224, 224),
)
config.add_optimization_profile(profile)
from concurrent.futures import ThreadPoolExecutor


class PipelineInfer:
    """Two-stage detect-then-classify pipeline built from TensorRT engines.

    BUG FIX: ThreadPoolExecutor was used without being imported anywhere in
    the file, raising NameError at the first process() call.
    """

    def __init__(self, detectors, classifiers):
        # Each entry is a path to a serialized engine file.
        self.detectors = [TensorRTInfer(p) for p in detectors]
        self.classifiers = [TensorRTInfer(p) for p in classifiers]

    def process(self, frame):
        """Run all detectors on the frame, then classify results in parallel."""
        # Detection stage: sequential.
        det_results = []
        for det in self.detectors:
            det_results.append(det.infer(frame))
        # Classification stage: parallel. NOTE(review): zip pairs classifier i
        # with detector i's result one-to-one -- confirm this matches the
        # intended ROI routing.
        with ThreadPoolExecutor() as executor:
            futures = [
                executor.submit(cls.infer, roi)
                for cls, roi in zip(self.classifiers, det_results)
            ]
            class_results = [f.result() for f in futures]
        return class_results
# Using TensorRT's quantization calibration tooling
class EntropyCalibrator(trt.IInt8EntropyCalibrator2):
    """Skeleton INT8 entropy calibrator."""

    def __init__(self, cache_file, batch_size=1):
        trt.IInt8EntropyCalibrator2.__init__(self)
        self.cache_file = cache_file
        self.batch_size = batch_size

    # Implement get_batch and the other required methods here...


# Enable INT8 in the builder config and attach the calibrator.
config.set_flag(trt.BuilderFlag.INT8)
calibrator = EntropyCalibrator("calibration.cache")
config.int8_calibrator = calibrator
import time


def benchmark(infer, input_data, num_runs=100):
    """Measure inference latency and throughput.

    Args:
        infer: object exposing an `infer(input_data)` method.
        input_data: input passed unchanged to every call.
        num_runs: number of timed iterations.
    Returns:
        dict with mean/std latency in milliseconds and overall FPS.
    """
    warmup = 10
    # Warm-up: first calls pay one-time costs (lazy init, caching) -- exclude them.
    for _ in range(warmup):
        infer.infer(input_data)
    times = []
    for _ in range(num_runs):
        # FIX: perf_counter() is monotonic and high-resolution, unlike
        # time.time() which tracks wall-clock time and can jump.
        start = time.perf_counter()
        infer.infer(input_data)
        times.append(time.perf_counter() - start)
    return {
        "mean": np.mean(times) * 1000,  # ms
        "std": np.std(times) * 1000,
        "fps": num_runs / sum(times),
    }
| 指标 | 理想值 | 优化方向 |
|---|---|---|
| 延迟 | <10ms | 量化/层融合/批处理 |
| 吞吐量 | >100FPS | 多流/异步执行 |
| 内存占用 | <1GB | 动态内存/张量重用 |
| 精度损失 | <1% mAP | 量化校准/混合精度 |
本文通过完整的代码示例和深入的技术解析,系统阐述了TensorRT在Python环境下的推理实现方法。开发者可按照文中提供的优化策略,根据具体业务场景调整参数配置,实现性能与精度的最佳平衡。实际部署时建议结合trtexec工具进行初步性能分析,再通过Python接口实现定制化功能。