Other Methods
Get Memory Layout
BLAS::get_layout_<gmem/smem>_<a/b/c>() fetches the global memory or shared memory CuTe layout for matrix A, B, or C, determined by the matrix sizes, arrangement, and leading dimensions. For shared memory layouts, the leading dimensions, if not specified explicitly through a parameter, are inferred from the LeadingDimension operator. For global memory layouts, custom leading dimensions must be passed either as a static or a dynamic integral type; otherwise they are inferred from the matrix sizes. A minimal sketch follows the declarations below.
__forceinline__ __host__ __device__ constexpr static auto get_layout_gmem_a();
__forceinline__ __host__ __device__ constexpr static auto get_layout_gmem_b();
__forceinline__ __host__ __device__ constexpr static auto get_layout_gmem_c();
__forceinline__ __host__ __device__ constexpr static auto get_layout_smem_a();
__forceinline__ __host__ __device__ constexpr static auto get_layout_smem_b();
__forceinline__ __host__ __device__ constexpr static auto get_layout_smem_c();
// Overloads for specifying the leading dimensions statically at compilation time.
// integral_type can be either a signed or an unsigned integer type, and integral_value
// must follow this specification.
__forceinline__ __host__ __device__ constexpr static auto get_layout_gmem_a(const std::integral_constant<integral_type, lda>);
__forceinline__ __host__ __device__ constexpr static auto get_layout_gmem_b(const std::integral_constant<integral_type, ldb>);
__forceinline__ __host__ __device__ constexpr static auto get_layout_gmem_c(const std::integral_constant<integral_type, ldc>);
// Overloads for specifying the leading dimensions at execution time.
__forceinline__ __host__ __device__ constexpr static auto get_layout_gmem_a(const unsigned int lda);
__forceinline__ __host__ __device__ constexpr static auto get_layout_gmem_b(const unsigned int ldb);
__forceinline__ __host__ __device__ constexpr static auto get_layout_gmem_c(const unsigned int ldc);
__forceinline__ __host__ __device__ constexpr static auto get_layout_smem_a(const unsigned int lda);
__forceinline__ __host__ __device__ constexpr static auto get_layout_smem_b(const unsigned int ldb);
__forceinline__ __host__ __device__ constexpr static auto get_layout_smem_c(const unsigned int ldc);
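As a brief sketch of how the matrix sizes, arrangement, and leading dimensions determine the returned layouts, consider a hypothetical descriptor; the concrete operator values below are illustrative only and not taken from this section:

#include <cublasdx.hpp>

// Hypothetical descriptor: 32x32x32 real single-precision GEMM with a row-major
// A matrix and column-major B and C matrices, executed by a thread block.
using BLAS = decltype(cublasdx::Size<32, 32, 32>()
                      + cublasdx::Precision<float>()
                      + cublasdx::Type<cublasdx::type::real>()
                      + cublasdx::Function<cublasdx::function::MM>()
                      + cublasdx::Arrangement<cublasdx::row_major, cublasdx::col_major, cublasdx::col_major>()
                      + cublasdx::Block()
                      + cublasdx::SM<800>());

// No leading dimensions are passed, so they are inferred: from the matrix sizes for
// the global memory layouts, and from the LeadingDimension operator (absent here,
// so the matrix sizes are used) for the shared memory layouts.
auto layout_gmem_a = BLAS::get_layout_gmem_a(); // follows row-major order (Arrangement of A)
auto layout_smem_b = BLAS::get_layout_smem_b(); // follows column-major order (Arrangement of B)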
BLAS::get_layout_<gmem/smem>_<a/b/c>() returns a combination of a memory tag (global or shared) and the layout (cute::Layout) for matrix A, B, or C, which can be passed directly to cublasdx::make_tensor to create a tensor. The returned layout corresponds to the order set via the Arrangement operator. For example, if the order for the A matrix was set to cublasdx::row_major, the returned layout follows row-major order.
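Because these functions are constexpr and marked __host__ __device__ (see the declarations above), the layouts can also be constructed in host code, for example when preparing global memory tensors before a kernel launch. A minimal, assumed host-side sketch:

// Host code: build the layout for A; with all-static sizes and no dynamic leading
// dimension argument, the result is known at compile time.
auto host_layout_a = BLAS::get_layout_gmem_a();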
If dynamic leading dimensions are provided by the user at execution time, the function accepts the leading dimension as an argument; see the example below.
Example
using BLAS = decltype(...);
extern __shared__ __align__(16) char smem[];
// a, b, c are pointers to global memory of input matrices A and B and output matrix C
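// With leading dimensions inferred from the BLAS description (no explicit values passed)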
auto a_global_tensor = cublasdx::make_tensor(a, BLAS::get_layout_gmem_a());
auto b_global_tensor = cublasdx::make_tensor(b, BLAS::get_layout_gmem_b());
auto c_global_tensor = cublasdx::make_tensor(c, BLAS::get_layout_gmem_c());
auto [smem_a, smem_b, smem_c] = BLAS::slice_shared_memory(smem);
auto a_shared_tensor = cublasdx::make_tensor(smem_a, BLAS::get_layout_smem_a());
auto b_shared_tensor = cublasdx::make_tensor(smem_b, BLAS::get_layout_smem_b());
auto c_shared_tensor = cublasdx::make_tensor(smem_c, BLAS::get_layout_smem_c());
// With leading dimensions specified at compilation time
auto a_global_tensor = cublasdx::make_tensor(a, BLAS::get_layout_gmem_a(std::integral_constant<int, lda>{}));
auto b_global_tensor = cublasdx::make_tensor(b, BLAS::get_layout_gmem_b(std::integral_constant<int, ldb>{}));
auto c_global_tensor = cublasdx::make_tensor(c, BLAS::get_layout_gmem_c(std::integral_constant<int, ldc>{}));
// With leading dimensions specified at execution time
auto a_global_tensor = cublasdx::make_tensor(a, BLAS::get_layout_gmem_a(lda));
auto b_global_tensor = cublasdx::make_tensor(b, BLAS::get_layout_gmem_b(ldb));
auto c_global_tensor = cublasdx::make_tensor(c, BLAS::get_layout_gmem_c(ldc));
auto [smem_a, smem_b, smem_c] = BLAS::slice_shared_memory(smem, lda, ldb, ldc);
auto a_shared_tensor = cublasdx::make_tensor(smem_a, BLAS::get_layout_smem_a(lda));
auto b_shared_tensor = cublasdx::make_tensor(smem_b, BLAS::get_layout_smem_b(ldb));
auto c_shared_tensor = cublasdx::make_tensor(smem_c, BLAS::get_layout_smem_c(ldc));
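The created tensors are typically consumed by the rest of the cuBLASDx device API. The following is a hedged sketch of the usual next steps, assuming the tensor-based cublasdx::copy, cublasdx::copy_wait, and execute(alpha, a, b, beta, c) interfaces together with caller-provided alpha and beta scalars:

// Stage A, B, and C from global to shared memory using the per-matrix alignments.
using alignment = cublasdx::alignment_of<BLAS>;
cublasdx::copy<BLAS, alignment::a>(a_global_tensor, a_shared_tensor);
cublasdx::copy<BLAS, alignment::b>(b_global_tensor, b_shared_tensor);
cublasdx::copy<BLAS, alignment::c>(c_global_tensor, c_shared_tensor);
cublasdx::copy_wait();

// Run the GEMM on the shared memory tensors: C = alpha * A * B + beta * C.
BLAS().execute(alpha, a_shared_tensor, b_shared_tensor, beta, c_shared_tensor);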