Source code for polygraphy.tools.args.backend.trt.config

#
# SPDX-FileCopyrightText: Copyright (c) 1993-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import copy
import os

from polygraphy import mod, util
from polygraphy.common import TensorMetadata
from polygraphy.logger import G_LOGGER, LogMode
from polygraphy.tools.args import util as args_util
from polygraphy.tools.args.base import BaseArgs
from polygraphy.tools.args.comparator.data_loader import DataLoaderArgs
from polygraphy.tools.args.model import ModelArgs
from polygraphy.tools.script import assert_identifier, inline, make_invocable, make_invocable_if_nondefault, safe


def parse_profile_shapes(default_shapes, min_args, opt_args, max_args):
    """
    Builds TensorRT optimization profile shapes from command-line shape arguments.

    One profile is produced per index, up to the length of the longest of
    ``min_args``, ``opt_args`` and ``max_args``. When none of them has any entries
    but ``default_shapes`` is non-empty, a single profile is still produced.

    Note that the working set of shapes is carried forward between lookups:
    shapes parsed for one of min/opt/max remain in effect for the lookups that
    follow (including those of later profiles) unless they are overridden again.

    Args:
        default_shapes (TensorMetadata): The inference input shapes.
        min_args (Sequence): Per-profile minimum shape arguments.
        opt_args (Sequence): Per-profile optimum shape arguments.
        max_args (Sequence): Per-profile maximum shape arguments.

    Returns:
        List[Dict[str, Tuple[Shape]]]:
            A list of profiles where each profile is a dictionary that maps
            input names to a tuple of (min, opt, max) shapes.
    """
    # Shapes currently in effect; rebound (never mutated in place) by `resolve_shapes`.
    working_shapes = default_shapes

    def resolve_shapes(shape_args, profile_idx):
        # Work on a copy so that the caller's metadata is left untouched, then
        # layer any shapes supplied for this profile on top of it.
        nonlocal working_shapes
        working_shapes = copy.copy(working_shapes)
        if profile_idx < len(shape_args):
            parsed = args_util.parse_meta(shape_args[profile_idx], includes_dtype=False)
            working_shapes.update(parsed)

        # The dtype is irrelevant here; dynamic dimensions must be replaced by concrete ones.
        resolved = {}
        for tensor_name, (_, declared_shape) in working_shapes.items():
            resolved[tensor_name] = util.override_dynamic_shape(declared_shape)

        for tensor_name, concrete_shape in resolved.items():
            declared_shape = working_shapes[tensor_name].shape
            if tuple(declared_shape) != tuple(concrete_shape):
                G_LOGGER.warning(
                    f"Input tensor: {tensor_name} | For TensorRT profile, overriding dynamic shape: {declared_shape} to: {concrete_shape}",
                    mode=LogMode.ONCE,
                )

        return resolved

    profile_count = max(len(min_args), len(opt_args), len(max_args))

    # Input shapes alone are enough to require a profile, even without explicit min/opt/max.
    if profile_count == 0 and default_shapes:
        profile_count = 1

    profiles = []
    for profile_idx in range(profile_count):
        lower = resolve_shapes(min_args, profile_idx)
        optimum = resolve_shapes(opt_args, profile_idx)
        upper = resolve_shapes(max_args, profile_idx)

        lower_names = list(lower)
        optimum_names = list(optimum)
        upper_names = list(upper)

        if sorted(lower_names) != sorted(optimum_names):
            G_LOGGER.critical(
                f"Mismatch in input names between minimum shapes ({lower_names}) and optimum shapes ({optimum_names})"
            )
        elif sorted(optimum_names) != sorted(upper_names):
            G_LOGGER.critical(
                f"Mismatch in input names between optimum shapes ({optimum_names}) and maximum shapes ({upper_names})"
            )

        profiles.append({name: (lower[name], optimum[name], upper[name]) for name in lower})
    return profiles


@mod.export()
class TrtConfigArgs(BaseArgs):
    """
    TensorRT Builder Configuration: creating the TensorRT BuilderConfig.

    Depends on:

        - ModelArgs: if allow_custom_input_shapes == True
        - DataLoaderArgs: if allow_calibration == True
    """

    def __init__(
        self,
        precision_constraints_default: str = None,
        allow_random_data_calib_warning: bool = None,
        allow_custom_input_shapes: bool = None,
        allow_calibration: bool = None,
    ):
        """
        Args:
            precision_constraints_default (str):
                    The default value to use for the precision constraints option.
                    Defaults to "none".
            allow_random_data_calib_warning (bool):
                    Whether to issue a warning when randomly generated data is being used for calibration.
                    Defaults to True.
            allow_custom_input_shapes (bool):
                    Whether to allow custom input shapes when randomly generating data.
                    Defaults to True.
            allow_calibration (bool):
                    Whether to allow INT8 calibration.
                    Defaults to True.
        """
        super().__init__()
        self._precision_constraints_default = util.default(precision_constraints_default, "none")
        self._allow_random_data_calib_warning = util.default(allow_random_data_calib_warning, True)
        self._allow_custom_input_shapes = util.default(allow_custom_input_shapes, True)
        self._allow_calibration = util.default(allow_calibration, True)

    def add_parser_args_impl(self):
        """
        Registers the TensorRT builder configuration options on this argument group.
        """
        self.group.add_argument(
            "--trt-min-shapes",
            action="append",
            help="The minimum shapes the optimization profile(s) will support. "
            "Specify this option once for each profile. If not provided, inference-time input shapes are used. "
            "Format: --trt-min-shapes <input0>:[D0,D1,..,DN] .. <inputN>:[D0,D1,..,DN]",
            nargs="+",
            default=[],
        )
        self.group.add_argument(
            "--trt-opt-shapes",
            action="append",
            help="The shapes for which the optimization profile(s) will be most performant. "
            "Specify this option once for each profile. If not provided, inference-time input shapes are used. "
            "Format: --trt-opt-shapes <input0>:[D0,D1,..,DN] .. <inputN>:[D0,D1,..,DN]",
            nargs="+",
            default=[],
        )
        self.group.add_argument(
            "--trt-max-shapes",
            action="append",
            help="The maximum shapes the optimization profile(s) will support. "
            "Specify this option once for each profile. If not provided, inference-time input shapes are used. "
            "Format: --trt-max-shapes <input0>:[D0,D1,..,DN] .. <inputN>:[D0,D1,..,DN]",
            nargs="+",
            default=[],
        )

        self.group.add_argument("--tf32", help="Enable tf32 precision in TensorRT", action="store_true", default=None)
        self.group.add_argument("--fp16", help="Enable fp16 precision in TensorRT", action="store_true", default=None)
        self.group.add_argument(
            "--int8",
            help="Enable int8 precision in TensorRT. "
            "If calibration is required but no calibration cache is provided, this option will cause TensorRT to run "
            "int8 calibration using the Polygraphy data loader to provide calibration data. ",
            action="store_true",
            default=None,
        )

        # The new-style option and the deprecated flags share one mutually exclusive group.
        precision_constraints_group = self.group.add_mutually_exclusive_group()
        precision_constraints_group.add_argument(
            "--precision-constraints",
            help="If set to `prefer`, TensorRT will restrict available tactics to layer precisions specified in the network "
            "unless no implementation exists with the preferred layer constraints, in which case it will issue a warning "
            "and use the fastest available implementation. If set to `obey`, TensorRT will instead fail to build the network "
            "if no implementation exists with the preferred layer constraints. "
            f"Defaults to `{self._precision_constraints_default}`",
            choices=("prefer", "obey", "none"),
            default=self._precision_constraints_default,
        )
        # Which deprecated flag is exposed depends on the default: when constraints are obeyed
        # by default, only an opt-out flag makes sense, and vice versa.
        if self._precision_constraints_default == "obey":
            precision_constraints_group.add_argument(
                "--no-obey-precision-constraints",
                help="[DEPRECATED - use --precision-constraints] Disables enforcing precision constraints in TensorRT, allowing it to choose tactics outside the "
                "layer precision set.",
                action="store_false",
                default=True,
                dest="obey_precision_constraints",
            )
        else:
            precision_constraints_group.add_argument(
                "--obey-precision-constraints",
                help="[DEPRECATED - use --precision-constraints] Enable enforcing precision constraints in TensorRT, forcing it to use tactics based on the "
                "layer precision set, even if another precision is faster. Build fails if such an engine cannot be built.",
                action="store_true",
                default=None,
                dest="obey_precision_constraints",
            )
        precision_constraints_group.add_argument(
            "--strict-types",
            help="[DEPRECATED - use --precision-constraints] Enable preference for precision constraints and avoidance of I/O reformatting in TensorRT, "
            "and fall back to ignoring the request if such an engine cannot be built.",
            action="store_true",
            default=None,
            dest="strict_types",
        )

        self.group.add_argument(
            "--sparse-weights",
            help="Enable optimizations for sparse weights in TensorRT",
            action="store_true",
            default=None,
        )
        self.group.add_argument(
            "--workspace",
            metavar="BYTES",
            help="[DEPRECATED - use --pool-limit] Amount of memory, in bytes, to allocate for the TensorRT builder's workspace. "
            "Optionally, use a `K`, `M`, or `G` suffix to indicate KiB, MiB, or GiB respectively. "
            "For example, `--workspace=16M` is equivalent to `--workspace=16777216`. ",
            default=None,
        )
        self.group.add_argument(
            "--calibration-cache",
            help="Path to load/save a calibration cache. "
            "Used to store calibration scales to speed up the process of int8 calibration. "
            "If the provided path does not yet exist, int8 calibration scales will be calculated and written to it during engine building. "
            "If the provided path does exist, it will be read and int8 calibration will be skipped during engine building. ",
            default=None,
        )
        self.group.add_argument(
            "--calib-base-cls",
            "--calibration-base-class",
            dest="calibration_base_class",
            help="The name of the calibration base class to use. For example, 'IInt8MinMaxCalibrator'. ",
            default=None,
        )
        self.group.add_argument(
            "--quantile",
            type=float,
            help="The quantile to use for IInt8LegacyCalibrator. Has no effect for other calibrator types.",
            default=None,
        )
        self.group.add_argument(
            "--regression-cutoff",
            type=float,
            help="The regression cutoff to use for IInt8LegacyCalibrator. Has no effect for other calibrator types.",
            default=None,
        )

        self.group.add_argument(
            "--timing-cache",
            help="[DEPRECATED - use --load-timing-cache/--save-timing-cache] Path to load/save tactic timing cache. "
            "Used to cache tactic timing information to speed up the engine building process. "
            "Existing caches will be appended to with any new timing information gathered. ",
            default=None,
        )
        self.group.add_argument(
            "--load-timing-cache",
            help="Path to load tactic timing cache. "
            "Used to cache tactic timing information to speed up the engine building process. ",
            default=None,
        )

        # Recording and replaying tactics at the same time is not allowed.
        replay_group = self.group.add_mutually_exclusive_group()
        replay_group.add_argument(
            "--save-tactics",
            help="Path to save a Polygraphy tactic replay file. "
            "Details about tactics selected by TensorRT will be recorded and stored at this location as a JSON file. ",
            default=None,
        )
        replay_group.add_argument(
            "--load-tactics",
            help="Path to load a Polygraphy tactic replay file, such as one created by --save-tactics. "
            "The tactics specified in the file will be used to override TensorRT's default selections. ",
            default=None,
        )

        self.group.add_argument(
            "--tactic-sources",
            help="Tactic sources to enable. This controls which libraries "
            "(e.g. cudnn, cublas, etc.) TensorRT is allowed to load tactics from. "
            "Values come from the names of the values in the trt.TacticSource enum and are case-insensitive. "
            "If no arguments are provided, e.g. '--tactic-sources', then all tactic sources are disabled.",
            nargs="*",
            default=None,
        )

        self.group.add_argument(
            "--trt-config-script",
            help="Path to a Python script that defines a function that creates a "
            "TensorRT IBuilderConfig. The function should take a builder and network as parameters and return a "
            "TensorRT builder configuration. When this option is specified, all other config arguments are ignored. ",
            default=None,
        )
        self.group.add_argument(
            "--trt-config-func-name",
            help="When using a trt-config-script, this specifies the name of the function "
            "that creates the config. Defaults to `load_config`. ",
            default="load_config",
        )
        self.group.add_argument(
            "--trt-safety-restricted",
            help="Enable safety scope checking in TensorRT",
            action="store_true",
            default=None,
            dest="restricted",
        )
        self.group.add_argument(
            "--use-dla",
            help="[EXPERIMENTAL] Use DLA as the default device type",
            action="store_true",
            default=None,
        )
        self.group.add_argument(
            "--allow-gpu-fallback",
            help="[EXPERIMENTAL] Allow layers unsupported on the DLA to fall back to GPU. Has no effect if --use-dla is not set.",
            action="store_true",
            default=None,
        )
        self.group.add_argument(
            "--pool-limit",
            "--memory-pool-limit",
            dest="memory_pool_limit",
            help="Set memory pool limits. Memory pool names come from the names of values in the trt.MemoryPoolType enum and are case-insensitive. "
            "Format: `--pool-limit <pool_name>:<pool_limit> ...`. For example, `--pool-limit dla_local_dram:1e9 workspace:16777216`. "
            "Optionally, use a `K`, `M`, or `G` suffix to indicate KiB, MiB, or GiB respectively. "
            "For example, `--pool-limit workspace:16M` is equivalent to `--pool-limit workspace:16777216`. ",
            nargs="*",
            default=None,
        )

    def parse_impl(self, args):
        """
        Parses command-line arguments and populates the following attributes:

        Attributes:
            profile_dicts (List[OrderedDict[str, Tuple[Shape]]]):
                    A list of profiles where each profile is a dictionary that maps
                    input names to a tuple of (min, opt, max) shapes.
            workspace (int): [DEPRECATED] The workspace size in bytes, from `--workspace`.
            tf32 (bool): Whether to enable TF32.
            fp16 (bool): Whether to enable FP16.
            int8 (bool): Whether to enable INT8.
            precision_constraints (str): The precision constraints to apply.
            strict_types (bool): [DEPRECATED] Whether `--strict-types` was specified.
            restricted (bool): Whether to enable safety scope checking in the builder.
            calibration_cache (str): Path to the calibration cache.
            calibration_base_class (str): The name of the base class to use for the calibrator.
            quantile (float): The quantile to use for IInt8LegacyCalibrator.
            regression_cutoff (float): The regression cutoff to use for IInt8LegacyCalibrator.
            sparse_weights (bool): Whether to enable sparse weights.
            load_timing_cache (str): Path from which to load a timing cache.
            timing_cache (str): [DEPRECATED] The path provided via `--timing-cache`.
            load_tactics (str): Path from which to load a tactic replay file.
            save_tactics (str): Path at which to save a tactic replay file.
            tactic_sources (List[str]): Names of the tactic sources to enable.
            trt_config_script (str): Path to a custom TensorRT config script.
            trt_config_func_name (str): Name of the function in the custom config script that creates the config.
            use_dla (bool): Whether to enable DLA.
            allow_gpu_fallback (bool): Whether to allow GPU fallback when DLA is enabled.
            memory_pool_limits (Dict[str, int]): Mapping of memory pool names to memory limits in bytes.
        """
        trt_min_shapes = args_util.get(args, "trt_min_shapes", default=[])
        trt_max_shapes = args_util.get(args, "trt_max_shapes", default=[])
        trt_opt_shapes = args_util.get(args, "trt_opt_shapes", default=[])

        default_shapes = TensorMetadata()
        if self._allow_custom_input_shapes:
            if not hasattr(self.arg_groups[ModelArgs], "input_shapes"):
                G_LOGGER.internal_error("ModelArgs must be parsed before TrtConfigArgs!")
            default_shapes = self.arg_groups[ModelArgs].input_shapes

        self.profile_dicts = parse_profile_shapes(default_shapes, trt_min_shapes, trt_opt_shapes, trt_max_shapes)

        self.workspace = args_util.parse_num_bytes(args_util.get(args, "workspace"))
        if self.workspace is not None:
            mod.warn_deprecated(
                "--workspace",
                use_instead=f"--pool-limit workspace:{args_util.get(args, 'workspace')}",
                remove_in="0.45.0",
                always_show_warning=True,
            )

        self.tf32 = args_util.get(args, "tf32")
        self.fp16 = args_util.get(args, "fp16")
        self.int8 = args_util.get(args, "int8")

        # The literal "none" choice is represented as `None` from here on.
        self.precision_constraints = args_util.get(args, "precision_constraints")
        if self.precision_constraints == "none":
            self.precision_constraints = None

        # XXX: Although --precision-constraints and --obey-precision-constraints are mutually exclusive options
        # they may still both be set in args (due to the default being "obey" e.g. in the debug precision subtool).
        # In that case, let the newer --precision-constraints flag take precedence. This should go away once
        # --obey-precision-constraints is removed.
        if self.precision_constraints is None and args_util.get(args, "obey_precision_constraints"):
            self.precision_constraints = "obey"

        self.strict_types = args_util.get(args, "strict_types")
        if self.strict_types is not None:
            mod.warn_deprecated(
                "--strict-types",
                use_instead="--precision-constraints=obey",
                remove_in="0.45.0",
                always_show_warning=True,
            )

        self.restricted = args_util.get(args, "restricted")

        self.calibration_cache = args_util.get(args, "calibration_cache")
        calib_base = args_util.get(args, "calibration_base_class")
        self.calibration_base_class = None
        if calib_base is not None:
            # Validate the name as an identifier before it is embedded into generated code.
            calib_base = safe(assert_identifier(calib_base))
            self.calibration_base_class = inline(safe("trt.{:}", inline(calib_base)))

        self.quantile = args_util.get(args, "quantile")
        self.regression_cutoff = args_util.get(args, "regression_cutoff")

        self.sparse_weights = args_util.get(args, "sparse_weights")

        self.load_timing_cache = args_util.get(args, "load_timing_cache")
        self.timing_cache = args_util.get(args, "timing_cache")
        if self.timing_cache:
            mod.warn_deprecated(
                "--timing-cache",
                use_instead="--load-timing-cache/--save-timing-cache",
                remove_in="0.45.0",
                always_show_warning=True,
            )

            # The deprecated option doubles as a load path, but only if the file already exists.
            if os.path.exists(self.timing_cache):
                self.load_timing_cache = self.timing_cache

        self.load_tactics = args_util.get(args, "load_tactics")
        self.save_tactics = args_util.get(args, "save_tactics")

        # `None` means the option was not provided; an empty list means it was provided without values.
        tactic_sources = args_util.get(args, "tactic_sources")
        self.tactic_sources = None
        if tactic_sources is not None:
            self.tactic_sources = []
            for source in tactic_sources:
                source = safe(assert_identifier(source.upper()))
                source_str = safe("trt.TacticSource.{:}", inline(source))
                self.tactic_sources.append(inline(source_str))

        self.trt_config_script = args_util.get(args, "trt_config_script")
        self.trt_config_func_name = args_util.get(args, "trt_config_func_name")
        self.use_dla = args_util.get(args, "use_dla")
        self.allow_gpu_fallback = args_util.get(args, "allow_gpu_fallback")

        memory_pool_limits = args_util.parse_dict_with_default(
            args_util.get(args, "memory_pool_limit"), cast_to=args_util.parse_num_bytes, allow_empty_key=False
        )
        self.memory_pool_limits = None
        if memory_pool_limits is not None:
            self.memory_pool_limits = {}
            for pool_type, pool_size in memory_pool_limits.items():
                pool_type = safe(assert_identifier(pool_type.upper()))
                pool_type_str = safe("trt.MemoryPoolType.{:}", inline(pool_type))
                self.memory_pool_limits[inline(pool_type_str)] = pool_size

    def add_to_script_impl(self, script):
        """
        Adds a TensorRT config loader to the script based on the parsed arguments.

        Args:
            script: The script to which imports and loaders are added.

        Returns:
            The name of the config loader added to the script, or None if no loader was added
            (i.e. no custom config script was given and no option differs from its default).
        """
        profiles = []
        for profile_dict in self.profile_dicts:
            profile_str = "Profile()"
            for name, shapes in profile_dict.items():
                profile_str += safe(".add({:}, min={:}, opt={:}, max={:})", name, *shapes).unwrap()
            profiles.append(profile_str)

        if profiles:
            script.add_import(imports=["Profile"], frm="polygraphy.backend.trt")
            profiles = safe("[\n\t{:}\n]", inline(safe(",\n\t".join(profiles))))
            profile_name = script.add_loader(profiles, "profiles")
        else:
            profile_name = None

        calibrator = None
        if any(arg is not None for arg in [self.calibration_cache, self.calibration_base_class]) and not self.int8:
            G_LOGGER.warning(
                "Some int8 calibrator options were set, but int8 precision is not enabled. "
                "Calibration options will be ignored. Please set --int8 to enable calibration. "
            )

        if self.int8 and self._allow_calibration:
            script.add_import(imports=["Calibrator"], frm="polygraphy.backend.trt")
            script.add_import(imports=["DataLoader"], frm="polygraphy.comparator")
            data_loader_name = self.arg_groups[DataLoaderArgs].add_to_script(script)
            if self.calibration_base_class:
                # The base class is emitted as `trt.<name>`, so the script needs the `trt` alias.
                script.add_import(imports=["tensorrt as trt"])

            # Random data only matters if calibration will actually run, i.e. there is no existing cache to read.
            if (
                self.arg_groups[DataLoaderArgs].is_using_random_data()
                and (not self.calibration_cache or not os.path.exists(self.calibration_cache))
                and self._allow_random_data_calib_warning
            ):
                G_LOGGER.warning(
                    "Int8 Calibration is using randomly generated input data.\n"
                    "This could negatively impact accuracy if the inference-time input data is dissimilar "
                    "to the randomly generated calibration data.\n"
                    "You may want to consider providing real data via the --data-loader-script option."
                )

            calibrator = make_invocable(
                "Calibrator",
                data_loader=data_loader_name if data_loader_name else inline(safe("DataLoader()")),
                cache=self.calibration_cache,
                BaseClass=self.calibration_base_class,
                quantile=self.quantile,
                regression_cutoff=self.regression_cutoff,
            )

        algo_selector = None
        if self.load_tactics is not None:
            script.add_import(imports=["TacticReplayer"], frm="polygraphy.backend.trt")
            algo_selector = make_invocable("TacticReplayer", replay=self.load_tactics)
        elif self.save_tactics is not None:
            script.add_import(imports=["TacticRecorder"], frm="polygraphy.backend.trt")
            algo_selector = make_invocable("TacticRecorder", record=self.save_tactics)

        # Tactic sources and memory pool types are emitted as `trt.<Enum>.<NAME>` expressions.
        if self.tactic_sources is not None or self.memory_pool_limits is not None:
            script.add_import(imports=["tensorrt as trt"])

        if self.trt_config_script is not None:
            script.add_import(imports=["InvokeFromScript"], frm="polygraphy.backend.common")
            config_loader_str = make_invocable(
                "InvokeFromScript", self.trt_config_script, name=self.trt_config_func_name
            )
        else:
            config_loader_str = make_invocable_if_nondefault(
                "CreateTrtConfig",
                max_workspace_size=self.workspace,
                tf32=self.tf32,
                fp16=self.fp16,
                int8=self.int8,
                precision_constraints=self.precision_constraints,
                strict_types=self.strict_types,
                restricted=self.restricted,
                profiles=profile_name,
                calibrator=calibrator,
                load_timing_cache=self.load_timing_cache,
                algorithm_selector=algo_selector,
                sparse_weights=self.sparse_weights,
                tactic_sources=self.tactic_sources,
                use_dla=self.use_dla,
                allow_gpu_fallback=self.allow_gpu_fallback,
                memory_pool_limits=self.memory_pool_limits,
            )
            if config_loader_str is not None:
                script.add_import(imports=["CreateConfig as CreateTrtConfig"], frm="polygraphy.backend.trt")

        if config_loader_str is not None:
            config_loader_name = script.add_loader(config_loader_str, "create_trt_config")
        else:
            config_loader_name = None
        return config_loader_name

    def create_config(self, builder, network):
        """
        Creates a TensorRT BuilderConfig according to arguments provided on the command-line.

        Args:
            builder (trt.Builder):
                    The TensorRT builder to use to create the configuration.
            network (trt.INetworkDefinition):
                    The TensorRT network for which to create the config. The network is used to
                    automatically create a default optimization profile if none are provided.

        Returns:
            trt.IBuilderConfig: The TensorRT builder configuration.
        """
        # Imported lazily so that this module can be loaded without importing the TensorRT backend.
        from polygraphy.backend.trt import CreateConfig

        # Fall back to a default `CreateConfig` when the script adds no loader.
        loader = util.default(args_util.run_script(self.add_to_script), CreateConfig())
        return loader(builder, network)