Performance Tuning#
Several performance tuning techniques are available in cuTile:
architecture-specific configuration values, using
ByTarget; load/store hints such as
latency and allow_tma.
Architecture-specific configuration#
- class cuda.tile.ByTarget(*, default=UNSPECIFIED, **value_by_target)#
Type used to specify a value that depends on the target GPU architecture.
- Parameters:
default – The fallback value to use when the target GPU architecture is not explicitly listed in value_by_target.
value_by_target – Mapping from GPU architecture name to value. Keys must be strings of the form
"sm_<major><minor>", such as "sm_100" or "sm_120".
Examples
Use one num_ctas value for all architectures:

from cuda.tile import ByTarget

@ct.kernel(num_ctas=8)
def kernel_fn(x):
    ...
import cuda.tile as ct
import torch

torch.cuda.init()
stream = torch.cuda.current_stream()

from cuda.tile import ByTarget

@ct.kernel(num_ctas=8)
def kernel_fn(x):
    ...

torch.cuda.synchronize()
Use different num_ctas values for specific architectures, and a fallback value for all others:

from cuda.tile import ByTarget

@ct.kernel(num_ctas=ByTarget(sm_100=8, sm_120=4, default=2))
def kernel_fn(x):
    ...
import cuda.tile as ct
import torch

torch.cuda.init()
stream = torch.cuda.current_stream()

from cuda.tile import ByTarget

@ct.kernel(num_ctas=ByTarget(sm_100=8, sm_120=4, default=2))
def kernel_fn(x):
    ...

torch.cuda.synchronize()
See Tile Kernels for the full description of kernel configuration
parameters such as num_ctas, occupancy and opt_level. Any of
these options may be given as a ByTarget value to specialize them
for different GPU architectures.
Load/store performance hints#
The load() and store() operations accept optional keyword
arguments that can influence how memory traffic is scheduled and lowered:
latency (int or None) – A hint indicating how heavy the DRAM traffic will be for this operation. It shall be an integer between 1 (low) and 10 (high). A large value typically fits the cases when DRAM traffic is high, and will likely result in a larger prefetch depth of the memory operation.
allow_tma (bool or None) – If True, the load or store may be lowered to use TMA (Tensor Memory Accelerator) when the target architecture supports it. If False, TMA will not be used for this operation. By default, TMA is allowed.
These hints are optional: kernels will compile and run without specifying them, but providing them can help the compiler make better code-generation decisions for a particular memory-access pattern.
Example#
import cuda.tile as ct

TILE_SIZE = 16

@ct.kernel
def load_store_with_hints_kernel(x, y):
    bid = ct.bid(0)
    tx = ct.load(
        x,
        index=(bid,),
        shape=(TILE_SIZE,),
        latency=8, # high-latency DRAM load
    )
    ct.store(
        y,
        index=(bid,),
        tile=tx,
        latency=2, # cheaper write
        allow_tma=False, # disallow TMA
    )
Autotuning#
tune.exhaustive_search() provides a convenient way to measure kernel performance
on a finite space of configurations and return the best configuration.
- cuda.tile.tune.exhaustive_search(search_space, stream, grid_fn, kernel, args_fn, hints_fn=None, *, quiet=False)
Searches the entire search space and returns the best configuration.
- Parameters:
search_space (Sequence[T]) – Sequence of configs to evaluate.
stream – The CUDA stream to execute the kernel on.
grid_fn (Callable[[T], tuple[int, ...]]) – Maps a config to grid dimensions.
kernel – The kernel to tune.
args_fn (Callable[[T], tuple[Any, ...]]) – Maps a config to kernel arguments for timing.
hints_fn (Callable[[T], dict[str, Any]] | None) – Maps a config to compiler hints. Default: no hints.
quiet (bool) – If true, avoid printing any progress or result.
- Returns:
TuningResult with the best config and its time in microseconds.
- Return type:
TuningResult[T]
Examples:
# Define the kernel
@ct.kernel
def matmul(X, Y, Out, tm: ct.Constant[int], tn: ct.Constant[int], tk: ct.Constant[int]):
    i, j = ct.bid(0), ct.bid(1)
    x_view = X.tiled_view((tm, tk), padding_mode=ct.PaddingMode.ZERO)
    y_view = Y.tiled_view((tk, tn), padding_mode=ct.PaddingMode.ZERO)
    acc = ct.zeros((tm, tn), ct.float32)
    for k in range(x_view.num_tiles(1)):
        tx = x_view.load((i, k))
        ty = y_view.load((k, j))
        acc = ct.mma(tx, ty, acc)
    ct.store(Out, (i, j), acc.astype(Out.dtype))

# Tune the kernel
from itertools import product
from cuda.tile import ByTarget

def tune(x, y, out) -> ct.tune.TuningResult:
    keys = ("tm", "tn", "tk", "num_ctas")
    search_space = [dict(zip(keys, vals)) for vals in product(
        (64, 128), (64, 128), (32, 64), (1, 2))]
    grid = lambda cfg: (ct.cdiv(M, cfg['tm']), ct.cdiv(N, cfg['tn']))
    args = lambda cfg: (x, y, out.clone(), cfg['tm'], cfg['tn'], cfg['tk'])
    hints = lambda cfg: {'num_ctas': ByTarget(sm_100=cfg['num_ctas'])}
    stream = torch.cuda.current_stream()
    tuning_result = ct.tune.exhaustive_search(search_space, stream, grid, matmul, args, hints)
    return tuning_result

M, N, K = 1024, 256, 512
x = torch.rand((M, K), dtype=torch.float16, device='cuda')
y = torch.rand((K, N), dtype=torch.float16, device='cuda')
out = torch.zeros((M, N), dtype=torch.float16, device='cuda')
result = tune(x, y, out)
print(f"Best config: {result.best.config} ({result.best.mean_us:.1f}us)")

# Launch the kernel with tuned result
tm, tn, tk, num_ctas = result.best.config.values()
kernel = matmul.replace_hints(num_ctas=ByTarget(sm_100=num_ctas))
ct.launch(torch.cuda.current_stream(), (ct.cdiv(M, tm), ct.cdiv(N, tn)), kernel, (x, y, out, tm, tn, tk))
torch.testing.assert_close(out, x @ y)
import cuda.tile as ct
import torch

torch.cuda.init()
stream = torch.cuda.current_stream()

# Define the kernel
@ct.kernel
def matmul(X, Y, Out, tm: ct.Constant[int], tn: ct.Constant[int], tk: ct.Constant[int]):
    i, j = ct.bid(0), ct.bid(1)
    x_view = X.tiled_view((tm, tk), padding_mode=ct.PaddingMode.ZERO)
    y_view = Y.tiled_view((tk, tn), padding_mode=ct.PaddingMode.ZERO)
    acc = ct.zeros((tm, tn), ct.float32)
    for k in range(x_view.num_tiles(1)):
        tx = x_view.load((i, k))
        ty = y_view.load((k, j))
        acc = ct.mma(tx, ty, acc)
    ct.store(Out, (i, j), acc.astype(Out.dtype))

# Tune the kernel
from itertools import product
from cuda.tile import ByTarget

def tune(x, y, out) -> ct.tune.TuningResult:
    keys = ("tm", "tn", "tk", "num_ctas")
    search_space = [dict(zip(keys, vals)) for vals in product(
        (64, 128), (64, 128), (32, 64), (1, 2))]
    grid = lambda cfg: (ct.cdiv(M, cfg['tm']), ct.cdiv(N, cfg['tn']))
    args = lambda cfg: (x, y, out.clone(), cfg['tm'], cfg['tn'], cfg['tk'])
    hints = lambda cfg: {'num_ctas': ByTarget(sm_100=cfg['num_ctas'])}
    stream = torch.cuda.current_stream()
    tuning_result = ct.tune.exhaustive_search(search_space, stream, grid, matmul, args, hints)
    return tuning_result

M, N, K = 1024, 256, 512
x = torch.rand((M, K), dtype=torch.float16, device='cuda')
y = torch.rand((K, N), dtype=torch.float16, device='cuda')
out = torch.zeros((M, N), dtype=torch.float16, device='cuda')
result = tune(x, y, out)
print(f"Best config: {result.best.config} ({result.best.mean_us:.1f}us)")

# Launch the kernel with tuned result
tm, tn, tk, num_ctas = result.best.config.values()
kernel = matmul.replace_hints(num_ctas=ByTarget(sm_100=num_ctas))
ct.launch(torch.cuda.current_stream(), (ct.cdiv(M, tm), ct.cdiv(N, tn)), kernel, (x, y, out, tm, tn, tk))
torch.testing.assert_close(out, x @ y)

torch.cuda.synchronize()
Output
16 succeeded, 0 failed ... Best config: {'tm': ..., 'tn': ..., 'tk': ..., 'num_ctas': ...} (...us)
To achieve consistent results when tuning, it is best to fix the GPU clock and memory clock.
Enable persistent mode:
nvidia-smi -i <GPU_ID> -pm 1
Query supported clocks:
nvidia-smi -i <GPU_ID> --query-supported-clocks=graphics,memory --format=csv | head
Fix graphics and memory clocks:
nvidia-smi -i <GPU_ID> -lgc <MIN_CLOCK>,<MAX_CLOCK>
nvidia-smi -i <GPU_ID> -lmc <MIN_CLOCK>,<MAX_CLOCK>