TensorRT 8.x/10.x Migration Guide#
This section documents API changes between TensorRT 8.x and TensorRT 10.x safety runtimes. TensorRT 10.x safety runtime support will be available in an upcoming DriveOS 7.2 release.
If you are unfamiliar with these changes, refer to our sample code for clarification.
Python#
Python API Changes#
Note
These Python migrations are not applicable on QNX, where the Python API is not supported.
Allocating Buffers and Using a Name-Based Engine API
1def allocate_buffers(self, engine):
2'''
3Allocates all buffers required for an engine, i.e. host/device inputs/outputs.
4'''
5inputs = []
6outputs = []
7bindings = []
8stream = cuda.Stream()
9
10# binding is the name of input/output
11for binding in engine:
12 size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
13 dtype = trt.nptype(engine.get_binding_dtype(binding))
14
15 # Allocate host and device buffers
16 host_mem = cuda.pagelocked_empty(size, dtype) # page-locked memory buffer (won't be swapped to disk)
17 device_mem = cuda.mem_alloc(host_mem.nbytes)
18
19 # Append the device buffer address to device bindings.
20 # When cast to int, it's a linear index into the context's memory (like memory address).
21 bindings.append(int(device_mem))
22
23 # Append to the appropriate input/output list.
24 if engine.binding_is_input(binding):
25 inputs.append(self.HostDeviceMem(host_mem, device_mem))
26 else:
27 outputs.append(self.HostDeviceMem(host_mem, device_mem))
28
29return inputs, outputs, bindings, stream
1def allocate_buffers(self, engine):
2'''
3Allocates all buffers required for an engine, i.e. host/device inputs/outputs.
4'''
5inputs = []
6outputs = []
7bindings = []
8stream = cuda.Stream()
9
10for i in range(engine.num_io_tensors):
11 tensor_name = engine.get_tensor_name(i)
12 size = trt.volume(engine.get_tensor_shape(tensor_name))
13 dtype = trt.nptype(engine.get_tensor_dtype(tensor_name))
14
15 # Allocate host and device buffers
16 host_mem = cuda.pagelocked_empty(size, dtype) # page-locked memory buffer (won't be swapped to disk)
17 device_mem = cuda.mem_alloc(host_mem.nbytes)
18
19 # Append the device buffer address to device bindings.
20 # When cast to int, it's a linear index into the context's memory (like memory address).
21 bindings.append(int(device_mem))
22
23 # Append to the appropriate input/output list.
24 if engine.get_tensor_mode(tensor_name) == trt.TensorIOMode.INPUT:
25 inputs.append(self.HostDeviceMem(host_mem, device_mem))
26 else:
27 outputs.append(self.HostDeviceMem(host_mem, device_mem))
28
29return inputs, outputs, bindings, stream
Transition from enqueueV2 to enqueueV3 for Python
1# Allocate device memory for inputs.
2d_inputs = [cuda.mem_alloc(input_nbytes) for binding in range(input_num)]
3
4# Allocate device memory for outputs.
5h_output = cuda.pagelocked_empty(output_nbytes, dtype=np.float32)
6d_output = cuda.mem_alloc(h_output.nbytes)
7
8# Transfer data from host to device.
9cuda.memcpy_htod_async(d_inputs[0], input_a, stream)
10cuda.memcpy_htod_async(d_inputs[1], input_b, stream)
11cuda.memcpy_htod_async(d_inputs[2], input_c, stream)
12
13# Run inference
14context.execute_async_v2(bindings=[int(d_inp) for d_inp in d_inputs] + [int(d_output)], stream_handle=stream.handle)
15
16# Synchronize the stream
17stream.synchronize()
1# Allocate device memory for inputs.
2d_inputs = [cuda.mem_alloc(input_nbytes) for binding in range(input_num)]
3
4# Allocate device memory for outputs.
5h_output = cuda.pagelocked_empty(output_nbytes, dtype=np.float32)
6d_output = cuda.mem_alloc(h_output.nbytes)
7
8# Transfer data from host to device.
9cuda.memcpy_htod_async(d_inputs[0], input_a, stream)
10cuda.memcpy_htod_async(d_inputs[1], input_b, stream)
11cuda.memcpy_htod_async(d_inputs[2], input_c, stream)
12
13# Setup tensor address
14bindings = [int(d_inputs[i]) for i in range(3)] + [int(d_output)]
15
16for i in range(engine.num_io_tensors):
17 context.set_tensor_address(engine.get_tensor_name(i), bindings[i])
18
19# Run inference
20context.execute_async_v3(stream_handle=stream.handle)
21
22# Synchronize the stream
23stream.synchronize()
Engine Building, use only build_serialized_network
1engine_bytes = None
2try:
3 engine_bytes = self.builder.build_serialized_network(self.network, self.config)
4except AttributeError:
5 engine = self.builder.build_engine(self.network, self.config)
6 engine_bytes = engine.serialize()
7 del engine
8assert engine_bytes
1engine_bytes = self.builder.build_serialized_network(self.network, self.config)
2if engine_bytes is None:
3 log.error("Failed to create engine")
4 sys.exit(1)
Added Python APIs#
Types
APILanguageExecutionContextAllocationStrategyIGpuAsyncAllocatorInterfaceInfoIPluginResourceIPluginV3IStreamReaderIVersionedInterface
Methods and Properties
ICudaEngine.is_debug_tensor()ICudaEngine.minimum_weight_streaming_budgetICudaEngine.streamable_weights_sizeICudaEngine.weight_streaming_budgetIExecutionContext.get_debug_listener()IExecutionContext.get_debug_state()IExecutionContext.set_all_tensors_debug_state()IExecutionContext.set_debug_listener()IExecutionContext.set_tensor_debug_state()IExecutionContext.update_device_memory_size_for_shapes()IGpuAllocator.allocate_async()IGpuAllocator.deallocate_async()INetworkDefinition.add_plugin_v3()INetworkDefinition.is_debug_tensor()INetworkDefinition.mark_debug()INetworkDefinition.unmark_debug()IPluginRegistry.acquire_plugin_resource()IPluginRegistry.all_creatorsIPluginRegistry.deregister_creator()IPluginRegistry.get_creator()IPluginRegistry.register_creator()IPluginRegistry.release_plugin_resource()
Removed Python APIs#
The following removed Python APIs are listed next to their superseded API.
BuilderFlag.ENABLE_TACTIC_HEURISTIC> Builder optimization level 2BuilderFlag.STRICT_TYPES> Use all three flags:BuilderFlag.DIRECT_IO,BuilderFlag.PREFER_PRECISION_CONSTRAINTS, andBuilderFlag.REJECT_EMPTY_ALGORITHMSEngineCapability.DEFAULT>EngineCapability.STANDARDEngineCapability.kSAFE_DLA>EngineCapability.DLA_STANDALONEEngineCapability.SAFE_GPU>EngineCapability.SAFETYIAlgorithmIOInfo.tensor_format> The strides, data type, and vectorization information are sufficient to identify tensor formats uniquely.IBuilder.max_batch_size> Implicit batch support was removedIBuilderConfig.max_workspace_size>IBuilderConfig.set_memory_pool_limit()withMemoryPoolType.WORKSPACEorIBuilderConfig.get_memory_pool_limit()withMemoryPoolType.WORKSPACEIBuilderConfig.min_timing_iterations>IBuilderConfig.avg_timing_iterationsICudaEngine.binding_is_input()>ICudaEngine.get_tensor_mode()ICudaEngine.get_binding_bytes_per_component()>ICudaEngine.get_tensor_bytes_per_component()ICudaEngine.get_binding_components_per_element()>ICudaEngine.get_tensor_components_per_element()ICudaEngine.get_binding_dtype()>ICudaEngine.get_tensor_dtype()ICudaEngine.get_binding_format()>ICudaEngine.get_tensor_format()ICudaEngine.get_binding_format_desc()>ICudaEngine.get_tensor_format_desc()ICudaEngine.get_binding_index()> No name-based equivalent replacementICudaEngine.get_binding_name()> No name-based equivalent replacementICudaEngine.get_binding_shape()>ICudaEngine.get_tensor_shape()ICudaEngine.get_binding_vectorized_dim()>ICudaEngine.get_tensor_vectorized_dim()ICudaEngine.get_location()>ITensor.locationICudaEngine.get_profile_shape()>ICudaEngine.get_tensor_profile_shape()ICudaEngine.get_profile_shape_input()>ICudaEngine.get_tensor_profile_values()ICudaEngine.has_implicit_batch_dimension()> Implicit batch is no longer supportedICudaEngine.is_execution_binding()> No name-based equivalent replacementICudaEngine.is_shape_binding()>ICudaEngine.is_shape_inference_io()ICudaEngine.max_batch_size()> Implicit batch is no longer 
supportedICudaEngine.num_bindings()>ICudaEngine.num_io_tensors()IExecutionContext.get_binding_shape()>IExecutionContext.get_tensor_shape()IExecutionContext.get_strides()>IExecutionContext.get_tensor_strides()IExecutionContext.set_binding_shape()>IExecutionContext.set_input_shape()IFullyConnectedLayer>IMatrixMultiplyLayerINetworkDefinition.add_convolution()>INetworkDefinition.add_convolution_nd()INetworkDefinition.add_deconvolution()>INetworkDefinition.add_deconvolution_nd()INetworkDefinition.add_fully_connected()>INetworkDefinition.add_matrix_multiply()INetworkDefinition.add_padding()>INetworkDefinition.add_padding_nd()INetworkDefinition.add_pooling()>INetworkDefinition.add_pooling_nd()INetworkDefinition.add_rnn_v2()>INetworkDefinition.add_loop()INetworkDefinition.has_explicit_precision> Explicit precision support was removed in 10.0INetworkDefinition.has_implicit_batch_dimension> - Implicit batch support was removedIRNNv2Layer>ILoopNetworkDefinitionCreationFlag.EXPLICIT_BATCH> Support was removed in 10.0NetworkDefinitionCreationFlag.EXPLICIT_PRECISION> Support was removed in 10.0PaddingMode.CAFFE_ROUND_DOWN> Caffe support was removedPaddingMode.CAFFE_ROUND_UP> Caffe support was removedPreviewFeature.DISABLE_EXTERNAL_TACTIC_SOURCES_FOR_CORE_0805> External tactics are always disabled for core codePreviewFeature.FASTER_DYNAMIC_SHAPES_0805> This flag is on by defaultProfilingVerbosity.DEFAULT>ProfilingVerbosity.LAYER_NAMES_ONLYProfilingVerbosity.VERBOSE>ProfilingVerbosity.DETAILEDResizeMode> UseInterpolationMode. Alias was removed.SampleMode.DEFAULT>SampleMode.STRICT_BOUNDSSliceMode> UseSampleMode. Alias was removed.
C++#
C++ API Changes#
Transition from enqueueV2 to enqueueV3 for C++
1// Create RAII buffer manager object.
2samplesCommon::BufferManager buffers(mEngine);
3
4auto context = SampleUniquePtr<nvinfer1::IExecutionContext>(mEngine->createExecutionContext());
5if (!context)
6{
7 return false;
8}
9
10// Pick a random digit to try to infer.
11srand(time(NULL));
12int32_t const digit = rand() % 10;
13
14// Read the input data into the managed buffers.
15// There should be just 1 input tensor.
16ASSERT(mParams.inputTensorNames.size() == 1);
17
18if (!processInput(buffers, mParams.inputTensorNames[0], digit))
19{
20 return false;
21}
22// Create a CUDA stream to execute this inference.
23cudaStream_t stream;
24CHECK(cudaStreamCreate(&stream));
25
26// Asynchronously copy data from host input buffers to device input
27buffers.copyInputToDeviceAsync(stream);
28
29// Asynchronously enqueue the inference work
30if (!context->enqueueV2(buffers.getDeviceBindings().data(), stream, nullptr))
31{
32 return false;
33}
34// Asynchronously copy data from device output buffers to host output buffers.
35buffers.copyOutputToHostAsync(stream);
36
37// Wait for the work in the stream to complete.
38CHECK(cudaStreamSynchronize(stream));
39
40// Release stream.
41CHECK(cudaStreamDestroy(stream));
1// Create RAII buffer manager object.
2samplesCommon::BufferManager buffers(mEngine);
3
4auto context = SampleUniquePtr<nvinfer1::IExecutionContext>(mEngine->createExecutionContext());
5if (!context)
6{
7 return false;
8}
9
10for (int32_t i = 0, e = mEngine->getNbIOTensors(); i < e; i++)
11{
12 auto const name = mEngine->getIOTensorName(i);
13 context->setTensorAddress(name, buffers.getDeviceBuffer(name));
14}
15
16// Pick a random digit to try to infer.
17srand(time(NULL));
18int32_t const digit = rand() % 10;
19
20// Read the input data into the managed buffers.
21// There should be just 1 input tensor.
22ASSERT(mParams.inputTensorNames.size() == 1);
23
24if (!processInput(buffers, mParams.inputTensorNames[0], digit))
25{
26 return false;
27}
28// Create a CUDA stream to execute this inference.
29cudaStream_t stream;
30CHECK(cudaStreamCreate(&stream));
31
32// Asynchronously copy data from host input buffers to device input
33buffers.copyInputToDeviceAsync(stream);
34
35// Asynchronously enqueue the inference work
36if (!context->enqueueV3(stream))
37{
38 return false;
39}
40
41// Asynchronously copy data from device output buffers to host output buffers.
42buffers.copyOutputToHostAsync(stream);
43
44// Wait for the work in the stream to complete.
45CHECK(cudaStreamSynchronize(stream));
46
47// Release stream.
48CHECK(cudaStreamDestroy(stream));
64-Bit Dimension Changes#
The dimensions held by Dims changed from int32_t to int64_t. However, in TensorRT 10.x, TensorRT will generally reject networks that use dimensions exceeding the range of int32_t. The tensor type returned by IShapeLayer is now DataType::kINT64. Use ICastLayer to cast the result to the tensor of type DataType::kINT32 if 32-bit dimensions are required.
Inspect code that bitwise copies to and from Dims to ensure it is correct for int64_t dimensions.
Added C++ APIs#
Enums
ActivationType::kGELU_ERFActivationType::kGELU_TANHBuilderFlag::kREFIT_IDENTICALBuilderFlag::kSTRIP_PLANBuilderFlag::kWEIGHT_STREAMINGBuilderFlag::kSTRICT_NANSDataType::kINT4LayerType::kPLUGIN_V3
Types
APILanguageDims64ExecutionContextAllocationStrategyIGpuAsyncAllocatorInterfaceInfoIPluginResourceIPluginV3IStreamReaderIVersionedInterface
Methods and Properties
getInferLibBuildVersiongetInferLibMajorVersiongetInferLibMinorVersiongetInferLibPatchVersionIBuilderConfig::setMaxNbTacticsIBuilderConfig::getMaxNbTacticsICudaEngine::createRefitterICudaEngine::getMinimumWeightStreamingBudgetICudaEngine::getStreamableWeightsSizeICudaEngine::getWeightStreamingBudgetICudaEngine::isDebugTensorICudaEngine::setWeightStreamingBudgetIExecutionContext::getDebugListenerIExecutionContext::getTensorDebugStateIExecutionContext::setAllTensorsDebugStateIExecutionContext::setDebugListenerIExecutionContext::setOutputTensorAddressIExecutionContext::setTensorDebugStateIExecutionContext::updateDeviceMemorySizeForShapesIGpuAllocator::allocateAsyncIGpuAllocator::deallocateAsyncINetworkDefinition::addPluginV3INetworkDefinition::isDebugTensorINetworkDefinition::markDebugINetworkDefinition::unmarkDebugIPluginRegistry::acquirePluginResourceIPluginRegistry::deregisterCreatorIPluginRegistry::getAllCreatorsIPluginRegistry::getCreatorIPluginRegistry::registerCreatorIPluginRegistry::releasePluginResource
Removed C++ APIs#
The following removed C++ APIs are listed next to their superseded API.
BuilderFlag::kENABLE_TACTIC_HEURISTIC> Builder optimization level 2BuilderFlag::kSTRICT_TYPES[1] > Use all three flags:kREJECT_EMPTY_ALGORITHMS,kDIRECT_IO, andkPREFER_PRECISION_CONSTRAINTSEngineCapability::kDEFAULT>EngineCapability::kSTANDARDEngineCapability::kSAFE_DLA>EngineCapability::kDLA_STANDALONEEngineCapability::kSAFE_GPU>EngineCapability::kSAFETYIAlgorithm::getAlgorithmIOInfo()>IAlgorithm::getAlgorithmIOInfoByIndex()IAlgorithmIOInfo::getTensorFormat()> The strides, data type, and vectorization information are sufficient to identify tensor formats uniquely.IBuilder::buildEngineWithConfig()>IBuilder::buildSerializedNetwork()IBuilder::destroy()>delete ObjectNameIBuilder::getMaxBatchSize()> Implicit batch support was removedIBuilder::setMaxBatchSize()> Implicit batch support was removedIBuilderConfig::destroy()>delete ObjectNameIBuilderConfig::getMaxWorkspaceSize()>IBuilderConfig::getMemoryPoolLimit()withMemoryPoolType::kWORKSPACEIBuilderConfig::getMinTimingIterations()>IBuilderConfig::getAvgTimingIterations()IBuilderConfig::setMaxWorkspaceSize()>IBuilderConfig::setMemoryPoolLimit()withMemoryPoolType::kWORKSPACEIBuilderConfig::setMinTimingIterations()>IBuilderConfig::setAvgTimingIterations()IConvolutionLayer::getDilation()>IConvolutionLayer::getDilationNd()IConvolutionLayer::getKernelSize()>IConvolutionLayer::getKernelSizeNd()IConvolutionLayer::getPadding()>IConvolutionLayer::getPaddingNd()IConvolutionLayer::getStride()>IConvolutionLayer::getStrideNd()IConvolutionLayer::setDilation()>IConvolutionLayer::setDilationNd()IConvolutionLayer::setKernelSize()>IConvolutionLayer::setKernelSizeNd()IConvolutionLayer::setPadding()>IConvolutionLayer::setPaddingNd()IConvolutionLayer::setStride()>IConvolutionLayer::setStrideNd()ICudaEngine::bindingIsInput()>ICudaEngine::getTensorIOMode()ICudaEngine::destroy()>delete 
ObjectNameICudaEngine::getBindingBytesPerComponent()>ICudaEngine::getTensorBytesPerComponent()ICudaEngine::getBindingComponentsPerElement()>ICudaEngine::getTensorComponentsPerElement()ICudaEngine::getBindingDataType()>ICudaEngine::getTensorDataType()ICudaEngine::getBindingDimensions()>ICudaEngine::getTensorShape()ICudaEngine::getBindingFormat()>ICudaEngine::getTensorFormat()ICudaEngine::getBindingFormatDesc()>ICudaEngine::getTensorFormatDesc()ICudaEngine::getBindingIndex()> Name-based methodsICudaEngine::getBindingName()> Name-based methodsICudaEngine::getBindingVectorizedDim()>ICudaEngine::getTensorVectorizedDim()ICudaEngine::getLocation()>ITensor::getLocation()ICudaEngine::getMaxBatchSize()> Implicit batch support was removedICudaEngine::getNbBindings()>ICudaEngine::getNbIOTensors()ICudaEngine::getProfileDimensions()>ICudaEngine::getProfileShape()ICudaEngine::getProfileShapeValues()>ICudaEngine::getShapeValues()ICudaEngine::hasImplicitBatchDimension()> Implicit batch support was removedICudaEngine::isExecutionBinding()> No name-based equivalent replacementICudaEngine::isShapeBinding()>ICudaEngine::isShapeInferenceIO()IDeconvolutionLayer::getKernelSize()>IDeconvolutionLayer::getKernelSizeNd()IDeconvolutionLayer::getPadding()>IDeconvolutionLayer::getPaddingNd()IDeconvolutionLayer::getStride()>IDeconvolutionLayer::getStrideNd()IDeconvolutionLayer::setKernelSize()>IDeconvolutionLayer::setKernelSizeNd()IDeconvolutionLayer::setPadding()>IDeconvolutionLayer::setPaddingNd()IDeconvolutionLayer::setStride()>IDeconvolutionLayer::setStrideNd()IExecutionContext::destroy()>delete 
ObjectNameIExecutionContext::enqueue()>IExecutionContext::enqueueV3()IExecutionContext::enqueueV2()>IExecutionContext::enqueueV3()IExecutionContext::execute()>IExecutionContext::executeV2()IExecutionContext::getBindingDimensions()>IExecutionContext::getTensorShape()IExecutionContext::getShapeBinding()>IExecutionContext::getTensorAddress()orgetOutputTensorAddress()IExecutionContext::getStrides()>IExecutionContext::getTensorStrides()IExecutionContext::setBindingDimensions()>IExecutionContext::setInputShape()IExecutionContext::setInputShapeBinding()>IExecutionContext::setInputTensorAddress()orsetTensorAddress()IExecutionContext::setOptimizationProfile()>IExecutionContext::setOptimizationProfileAsync()IFullyConnectedLayer>IMatrixMultiplyLayerIGpuAllocator::free()>IGpuAllocator::deallocate()IHostMemory::destroy()>delete ObjectNameINetworkDefinition::addConvolution()>INetworkDefinition::addConvolutionNd()INetworkDefinition::addDeconvolution()>INetworkDefinition::addDeconvolutionNd()INetworkDefinition::addFullyConnected()>INetworkDefinition::addMatrixMultiply()INetworkDefinition::addPadding()>INetworkDefinition::addPaddingNd()INetworkDefinition::addPooling()>INetworkDefinition::addPoolingNd()INetworkDefinition::addRNNv2()>INetworkDefinition::addLoop()INetworkDefinition::destroy()>delete ObjectNameINetworkDefinition::hasExplicitPrecision()> Explicit precision support was removed in 10.0INetworkDefinition::hasImplicitBatchDimension()> Implicit batch support was removedIOnnxConfig::destroy()>delete 
ObjectNameIPaddingLayer::getPostPadding()>IPaddingLayer::getPostPaddingNd()IPaddingLayer::getPrePadding()>IPaddingLayer::getPrePaddingNd()IPaddingLayer::setPostPadding()>IPaddingLayer::setPostPaddingNd()IPaddingLayer::setPrePadding()>IPaddingLayer::setPrePaddingNd()IPoolingLayer::getPadding()>IPoolingLayer::getPaddingNd()IPoolingLayer::getStride()>IPoolingLayer::getStrideNd()IPoolingLayer::getWindowSize()>IPoolingLayer::getWindowSizeNd()IPoolingLayer::setPadding()>IPoolingLayer::setPaddingNd()IPoolingLayer::setStride()>IPoolingLayer::setStrideNd()IPoolingLayer::setWindowSize()>IPoolingLayer::setWindowSizeNd()IRefitter::destroy()>delete ObjectNameIResizeLayer::getAlignCorners()>IResizeLayer::getAlignCornersNd()IResizeLayer::setAlignCorners()>IResizeLayer::setAlignCornersNd()IRuntime::deserializeCudaEngine(void const* blob, std::size_t size, IPluginFactory* pluginFactory)> UsedeserializeCudaEnginewith two parametersIRuntime::destroy()>delete ObjectNameIRNNv2Layer>ILoopkNV_TENSORRT_VERSION_IMPL[2] >define NV_TENSORRT_VERSION_INT(major, minor, patch) ((major) *10000L + (minor) *100L + (patch) *1L)NetworkDefinitionCreationFlag::kEXPLICIT_BATCH> Support was removed in 10.0NetworkDefinitionCreationFlag::kEXPLICIT_PRECISION> Support was removed in 10.0NV_TENSORRT_SONAME_MAJOR>NV_TENSORRT_MAJORNV_TENSORRT_SONAME_MINOR>NV_TENSORRT_MINORNV_TENSORRT_SONAME_PATCH>NV_TENSORRT_PATCHnvinfer1::safe::IPluginRegistry* getBuilderSafePluginRegistry(nvinfer1::EngineCapability capability)> API will not be implementedPaddingMode::kCAFFE_ROUND_DOWN> Caffe support was removedPaddingMode::kCAFFE_ROUND_UP> Caffe support was removedPreviewFeature::kDISABLE_EXTERNAL_TACTIC_SOURCES_FOR_CORE_0805> External tactics are always disabled for core codePreviewFeature::kFASTER_DYNAMIC_SHAPES_0805> This flag is on by defaultProfilingVerbosity::kDEFAULT>ProfilingVerbosity::kLAYER_NAMES_ONLYProfilingVerbosity::kVERBOSE>ProfilingVerbosity::kDETAILEDResizeMode> UseInterpolationMode. 
Alias was removed.RNNDirection> RNN-related data structures were removedRNNGateType> RNN-related data structures were removedRNNInputMode> RNN-related data structures were removedRNNOperation> RNN-related data structures were removedSampleMode::kDEFAULT>SampleMode::kSTRICT_BOUNDSSliceMode> UseSampleMode. Alias was removed.
Removed C++ Plugins#
The following removed C++ plugins are listed next to their superseded plugin.
createAnchorGeneratorPlugin()>GridAnchorPluginCreator::createPlugin()createBatchedNMSPlugin()>BatchedNMSPluginCreator::createPlugin()createInstanceNormalizationPlugin()>InstanceNormalizationPluginCreator::createPlugin()createNMSPlugin()>NMSPluginCreator::createPlugin()createNormalizePlugin()>NormalizePluginCreator::createPlugin()createPriorBoxPlugin()>PriorBoxPluginCreator::createPlugin()createRegionPlugin()>RegionPluginCreator::createPlugin()createReorgPlugin()>ReorgPluginCreator::createPlugin()createRPNROIPlugin()>RPROIPluginCreator::createPlugin()createSplitPlugin()>INetworkDefinition::addSlice()struct Quadruple> Related plugins were removed
trtexec#
trtexec Flag Changes#
Changes to flag workspace and minTiming.
1trtexec \
2 --onnx=/path/to/model.onnx \
3 --saveEngine=/path/to/engine.trt \
4 --optShapes=input:$INPUT_SHAPE \
5
6 --workspace=1024 \
7 --minTiming=1
1trtexec \
2 --onnx=/path/to/model.onnx \
3 --saveEngine=/path/to/engine.trt \
4 --optShapes=input:$INPUT_SHAPE \
5
6 --memPoolSize=workspace:1024
Removed trtexec Flags#
The following removed trtexec flags are listed next to their superseded flag.
--deploy> TensorRT 10.x does not support Caffe input, UFF input, and implicit batch dimension mode.--output> TensorRT 10.x does not support Caffe input, UFF input, and implicit batch dimension mode.--model> TensorRT 10.x does not support Caffe input, UFF input, and implicit batch dimension mode.--uff> TensorRT 10.x does not support Caffe input, UFF input, and implicit batch dimension mode.--uffInput> TensorRT 10.x does not support Caffe input, UFF input, and implicit batch dimension mode.--uffNHWC> TensorRT 10.x does not support Caffe input, UFF input, and implicit batch dimension mode.--batch> TensorRT 10.x does not support Caffe input, UFF input, and implicit batch dimension mode.--maxBatch> TensorRT 10.x does not support Caffe input, UFF input, and implicit batch dimension mode.--minTiming>--avgTiming--preview=features>disableExternalTacticSourcesForCore0805orfasterDynamicShapes0805--workspace=N>--memPoolSize=poolspec--explicitPrecision> Removed--nativeInstanceNorm> Removed--heuristic>--builderOptimizationLevel=<N>(where<N>can be0,1, or2)--buildOnly>--skipInference--nvtxMode>--profilingVerbosity
Deprecated trtexec Flags#
The following deprecated trtexec flags are listed next to their superseded flag.
--sparsity=force> Usepolygraphy surgeon pruneto rewrite the weights to a sparsity pattern and then run--sparsity=enable.--plugins>--staticPlugins--preview=profileSharing0806> Enabled by default and has no effect.--profilingVerbosity=default>--profilingVerbosity=layer_names_only--profilingVerbosity=verbose>--profilingVerbosity=detailed--streams>--infStreams--weightless>--stripWeights
Safety Runtime#
Transition from TensorRT 8.x safety runtime to TensorRT 10.x safety runtime.
Load the engine from the user’s local file system into a memory buffer.
Create an
InferRuntimeobject with which to deserialize the CUDA engine. Use the InferRuntime’sdeserializeCudaEnginemethod to retrieve theICudaEngineobject from the memory buffer.1using namespace nvinfer1::safe; 2auto infer = createInferRuntime(logger); 3infer->setErrorRecorder(recorder.get()); 4auto engine = infer->deserializeCudaEngine(enginebuffer.data(), enginebufferSize); 5 6// The inferRuntime object acts as the primary C entry point for TensorRT. It is in charge of deserializing a serialized engine buffer into an ICudaEngine object.
1using namespace nvinfer2::safe; 2ITRTGraph *graph = nullptr; 3ErrorCode code = createTRTGraph(/* ITRTGraph *& */graph, 4/*Engine buffer*/ buffer.data(), 5/*Engine buffer size*/ buffer.size(), 6/*ISafeRecorder& */ recorder, 7/*trtManagedScratch*/ true, 8/*ISafeMemAllocator* */ allocator); 9 10// ITRTGraph is an abstraction of a neural network graph. The engine data buffer is passed directly to createTRTGraph to construct a graph object that represents the user’s serialized neural network engine file. 11// The parameter “recorder” is a reference to an ISafeRecorder object (sample provided) which is a derived class from IErrorRecorder. 12// The parameter “trtManagedScratch” defaults to true to indicate that the memory allocator (either user-supplied or default) will be used to allocate scratch memory (equivalent to createExecutionContext in TensorRT 8.x). If set to false (equivalent to createExecutionContextWithoutScratch), then the user has to supply scratch memory that they manage themselves. 13// The parameter “allocator” (default nullptr) is a custom memory allocator implementing ISafeMemAllocator interface. A default allocator is used if left unspecified.
Get the input/output tensor attributes from the engine.
1int32_t nb = engine->getNbIOTensors(); 2char const* inputName = engine->getIOTensorName(inputIndex); 3char const* outputName = engine->getIOTensorName(outputIndex); 4Dims inputDims = engine->getTensorShape(inputName); 5Dims outputDims = engine->getTensorShape(outputName); 6DataType inputType = engine->getTensorDataType(inputName); 7DataType outputType = engine->getTensorDataType(outputName); 8 9// Here the user might also need to use getTensorVectorizedDim, getTensorBytesPerComponent, and getTensorComponentsPerElement to calculate the tensor volume.
1int64_t nb; 2ErrorCode code = graph->getNbIOTensors(nb); 3char const* inputName; 4code = graph->getIOTensorName(inputName, inputIndex); 5char const*outputName; 6code = graph->getIOTensorName(outputName, outputIndex); 7TensorDescriptor inputDescriptor; 8code = graph->getIOTensorDescriptor(inputDescriptor, inputName); 9TensorDescriptor outputDescriptor; 10code = graph->getIOTensorDescriptor(outputDescriptor, outputName); 11 12 13// In TensorRT 10.x, TensorRT provides a convenience class, TensorDescriptor, to the user. This class contains all the information that users need to construct an IOTensor outside of TensorRT and pass in its address to TensorRT later. 14 15struct TensorDescriptor 16{ 17char const* const tensorName; 18Dims const shape; // Extent of the tensor. 19Dims const stride; // Strides of the tensor 20DataType const dataType; // The type of the data in the buffer 21int64_t const bytesPerComponent; // The size of the tensor data type in bytes (4 for float and int32, 2 for half, 1 for int8) 22int64_t const componentsPerVector; // The vector length (in scalars) for a vectorized tensor, 1 if the tensor is scalar. 23int64_t const vectorizedDim; // The dimension index along which the buffer is vectorized, -1 if the tensor is scalar. 24uint64_t const sizeInBytes; // The sizeInBytes of this tensor 25TensorIOMode const ioMode; // Whether the tensor is an input / output 26IOTensorProperty const ioProperty; // i/o tensor property for async / sync tensor 27}; 28 29// The tensor volume may be calculated using these properties.
Allocate device/host buffers for the input/output tensors of the engine.
Create an
ExecutionContextfrom the engine with or without scratch memory. If without scratch memory, the user also needs to allocate and set memory for the context.1auto context = engine->createExecutionContext(); 2 3// If without scratchMemory, 4 5auto context = engine->createExecutionContextWithoutDeviceMemory(); 6size_t size = engine->getDeviceMemorySize(); 7void* mem; 8cudaError_t code = cudaMalloc(&mem, size); 9context->setDeviceMemory(mem);
1// There is no need to call createExecutionContext. The TensorRT graph carries its own context. 2 3// If without scratchMemory (trtManagedScratch = False when creating the graph) 4 5size_t size; 6code = graph->getScratchMemorySize(size); 7void* mem; 8cudaError_t code = cudaMalloc(&mem, size); 9code = graph->setScratchMemory(mem);
Preprocess the input data with the host buffer and copy it to the device buffer.
Set the input/output tensor addresses of the context with the device buffer addresses accordingly.
1bool result = context->setInputTensorAddress(inputName,inDataPtr); 2result = context->setOutputTensorAddress(outputName, outDataPtr);
1TypedArray inputData(inDataPtr, inSize); 2code = graph->setInputTensorAddress(inputName, inputData); 3TypedArray outputData(outDataPtr, outSize); 4graph->setOutputTensorAddress(outputName, outputData); 5 6// In TensorRT 10.x, the IO tensor memory must be strongly-typed and wrapped with a TypedArray object.
Call enqueue to start inference.
1result = context->enqueueV3(stream); 2cudaStreamSynchronize(stream);
1code = graph->executeAsync(stream); 2code = graph->sync(); 3 4// In TensorRT 10.x, the CUDA stream synchronization is wrapped into an API sync() which does additional finalization.
Postprocess the output device buffer to retrieve the inference output.
Removed Safety C++ APIs#
The following removed Safety C++ APIs are listed next to their superseded API.
safe::ICudaEngine::bindingIsInput()>safe::ICudaEngine::tensorIOMode()safe::ICudaEngine::getBindingBytesPerComponent()>safe::ICudaEngine::getTensorBytesPerComponent()safe::ICudaEngine::getBindingComponentsPerElement()>safe::ICudaEngine::getTensorComponentsPerElement()safe::ICudaEngine::getBindingDataType()>safe::ICudaEngine::getTensorDataType()safe::ICudaEngine::getBindingDimensions()>safe::ICudaEngine::getTensorShape()safe::ICudaEngine::getBindingIndex()>safe::name-based methodssafe::ICudaEngine::getBindingName()>safe::name-based methodssafe::ICudaEngine::getBindingVectorizedDim()>safe::ICudaEngine::getTensorVectorizedDim()safe::ICudaEngine::getNbBindings()>safe::ICudaEngine::getNbIOTensors()safe::ICudaEngine::getBindingFormat()>safe::ICudaEngine::getTensorFormat()safe::IExecutionContext::enqueueV2()>safe::IExecutionContext::enqueueV3()safe::IExecutionContext::getStrides()>safe::IExecutionContext::getTensorStrides()
Footnotes