↰ Return to documentation for file (morpheus/_lib/src/utilities/matx_util.cu)
#include "morpheus/utilities/matx_util.hpp"
#include "morpheus/utilities/type_util.hpp"
#include "morpheus/utilities/tensor_util.hpp"// for TensorUtils::get_element_stride
#include <cudf/utilities/type_dispatcher.hpp>
#include <matx.h>
#include <mrc/cuda/sync.hpp>
#include <memory>
namespace morpheus {
// Component-private classes.
// ************ MatxUtil__MatxCast**************//
struct MatxUtil__MatxCast { // NOLINT
size_t element_count;
rmm::cuda_stream_view stream;
template<typename InputT,
typename OutputT,
std::enable_if_t<!cudf::is_numeric<InputT>() || !cudf::is_numeric<OutputT>()> * = nullptr>
void operator()(void *input_data, void *output_data) {
throw std::invalid_argument("Unsupported conversion");
}
template<typename InputT,
typename OutputT,
std::enable_if_t<cudf::is_numeric<InputT>() && cudf::is_numeric<OutputT>()> * = nullptr>
void operator()(void *input_data, void *output_data) {
matx::tensorShape_t<1> shape({static_cast<matx::index_t>(element_count)});
matx::tensor_t<InputT, 1> input_tensor(static_cast<InputT *>(input_data), shape);
matx::tensor_t<OutputT, 1> output_tensor(static_cast<OutputT *>(output_data), shape);
(output_tensor = input_tensor).run(stream.value());
}
};
// ************ MatxUtil__MatxCreateSegIds**************//
/**
 * @brief Functor for `cudf::type_dispatcher` that fills an `element_count x 3` buffer of `OutputT`.
 *
 * After the enqueued work completes, each row `i` holds `{i, 0, fea_len - 1}`.
 *
 * The data members are public and declared in the order `{element_count, fea_len, stream}`; callers rely on
 * that order when brace-initializing the functor.
 */
struct MatxUtil__MatxCreateSegIds {
    // Number of rows in the output.
    size_t element_count;
    // Feature length; `fea_len - 1` is written to the last column. The subtraction is unsigned, so a value of
    // 0 wraps around.
    size_t fea_len;
    // Stream on which the fills are launched.
    rmm::cuda_stream_view stream;

    /**
     * @brief Overload selected for non-integral output types.
     * @throws std::invalid_argument always.
     */
    template <typename OutputT, std::enable_if_t<!std::is_integral_v<OutputT>> * = nullptr>
    void operator()(void *output_data)
    {
        throw std::invalid_argument("Unsupported conversion");
    }

    /**
     * @brief Overload selected for integral output types.
     *
     * @param output_data Start of the output buffer, interpreted as `element_count * 3` values of `OutputT`.
     */
    template <typename OutputT, std::enable_if_t<std::is_integral_v<OutputT>> * = nullptr>
    void operator()(void *output_data)
    {
        matx::tensorShape_t<2> shape({static_cast<matx::index_t>(element_count), 3});
        matx::tensor_t<OutputT, 2> output_tensor(static_cast<OutputT *>(output_data), shape);

        // Rank-1 views over each of the three columns.
        auto col0 = output_tensor.template Slice<1>({0, 0}, {matx::matxEnd, matx::matxDropDim});
        auto col1 = output_tensor.template Slice<1>({0, 1}, {matx::matxEnd, matx::matxDropDim});
        auto col2 = output_tensor.template Slice<1>({0, 2}, {matx::matxEnd, matx::matxDropDim});

        auto range_col =
            matx::range_x<OutputT>(matx::tensorShape_t<1>({static_cast<matx::index_t>(element_count)}), 0, 1);

        // Column 0: 0, 1, 2, ...
        (col0 = range_col).run(stream.value());
        // Column 1: explicitly zeroed. Nothing else writes this column, so without this fill it would keep
        // whatever bytes the freshly allocated buffer happened to contain.
        (col1 = static_cast<OutputT>(0)).run(stream.value());
        // Column 2: constant fea_len - 1.
        (col2 = fea_len - 1).run(stream.value());
    }
};  // NOLINT
// ************ MatxUtil__MatxLogits**************//
struct MatxUtil__MatxLogits { // NOLINT
size_t element_count;
rmm::cuda_stream_view stream;
template<typename InputT, std::enable_if_t<!cudf::is_floating_point<InputT>()> * = nullptr>
void operator()(void *input_data, void *output_data) {
throw std::invalid_argument("Unsupported conversion");
}
template<typename InputT, std::enable_if_t<cudf::is_floating_point<InputT>()> * = nullptr>
void operator()(void *input_data, void *output_data) {
matx::tensorShape_t<1> shape({static_cast<matx::index_t>(element_count)});
matx::tensor_t<InputT, 1> input_tensor(static_cast<InputT *>(input_data), shape);
matx::tensor_t<InputT, 1> output_tensor(static_cast<InputT *>(output_data), shape);
(output_tensor = (InputT) 1 / ((InputT) 1 + matx::exp((InputT) -1 * input_tensor))).run(stream.value());
}
}; // NOLINT
// ************ MatxUtil__MatxTranspose**************//
struct MatxUtil__MatxTranspose { // NOLINT
size_t element_count;
rmm::cuda_stream_view stream;
size_t rows;
size_t cols;
template<typename InputT, std::enable_if_t<!cudf::is_numeric<InputT>()> * = nullptr>
void operator()(void *input_data, void *output_data) {
throw std::invalid_argument("Unsupported conversion");
}
template<typename InputT, std::enable_if_t<cudf::is_numeric<InputT>()> * = nullptr>
void operator()(void *input_data, void *output_data) {
matx::tensorShape_t<2> input_shape({static_cast<matx::index_t>(rows), static_cast<matx::index_t>(cols)});
matx::tensorShape_t<2> output_shape({static_cast<matx::index_t>(cols), static_cast<matx::index_t>(rows)});
matx::tensor_t<InputT, 2> input_tensor(static_cast<InputT *>(input_data), input_shape);
matx::tensor_t<InputT, 2> output_tensor(static_cast<InputT *>(output_data), output_shape);
(output_tensor = input_tensor.Permute({1, 0})).run(stream.value());
}
};
// ************ MatxUtil__MatxThreshold**************//
struct MatxUtil__MatxThreshold { // NOLINT
size_t rows;
size_t cols;
bool by_row;
rmm::cuda_stream_view stream;
template<typename InputT, std::enable_if_t<!cudf::is_floating_point<InputT>()> * = nullptr>
void
operator()(void *input_data, void *output_data, double threshold, const std::vector<std::size_t>& stride) {
throw std::invalid_argument("Unsupported conversion");
}
template<typename InputT, std::enable_if_t<cudf::is_floating_point<InputT>()> * = nullptr>
void
operator()(void *input_data, void *output_data, double threshold, const std::vector<std::size_t>& stride) {
if (by_row) {
this->threshold_by_row<InputT>(input_data, output_data, threshold, stride);
} else {
this->threshold<InputT>(input_data, output_data, threshold, stride);
}
}
private:
template<typename InputT>
void threshold_by_row(void *input_data, void *output_data, double threshold,
const std::vector<std::size_t>& stride) {
matx::tensorShape_t<2> input_shape({static_cast<matx::index_t>(rows), static_cast<matx::index_t>(cols)});
// Output is always 1 column
matx::tensorShape_t<1> output_shape({static_cast<matx::index_t>(rows)});
// Specify the stride here since the data comes in column major order.
matx::tensor_t<InputT, 2> input_tensor(static_cast<InputT *>(input_data),
input_shape,
{static_cast<matx::index_t>(stride[0]),
static_cast<matx::index_t>(stride[1])});
// Tmp array to hold max value
matx::tensor_t<InputT, 1> max_tensor(output_shape);
// row-wise reduction
matx::rmax(max_tensor, input_tensor, stream.value());
matx::tensor_t<bool, 1> output_tensor(static_cast<bool *>(output_data), output_shape);
// Convert max value to bool
(output_tensor = max_tensor > (InputT) threshold).run(stream.value());
}
template<typename InputT>
void
threshold(void *input_data, void *output_data, double threshold, const std::vector<std::size_t>& stride) {
matx::tensorShape_t<2> shape({static_cast<matx::index_t>(rows), static_cast<matx::index_t>(cols)});
matx::index_t matx_stride[2] = {static_cast<matx::index_t>(stride[0]),
static_cast<matx::index_t>(stride[1])};
matx::tensor_t<InputT, 2> input_tensor(static_cast<InputT *>(input_data), shape, matx_stride);
matx::tensor_t<bool, 2> output_tensor(static_cast<bool *>(output_data), shape, matx_stride);
// Convert max value to bool
(output_tensor = input_tensor > (InputT) threshold).run(stream.value());
}
};
struct MatxUtil__MatxReduceMax {
matx::index_t num_input_rows;
matx::index_t num_cols;
std::vector<matx::index_t> input_stride;
matx::index_t num_output_rows;
void *input_data;
void *output_data;
rmm::cuda_stream_view stream;
template<typename InputT, std::enable_if_t<!cudf::is_floating_point<InputT>()> * = nullptr>
void operator()(std::size_t start, std::size_t stop, int32_t output_idx) {
throw std::invalid_argument("Unsupported conversion");
}
template<typename InputT, std::enable_if_t<cudf::is_floating_point<InputT>()> * = nullptr>
void operator()(std::size_t start, std::size_t stop, int32_t output_idx) {
auto input_count = stop - start;
matx::tensorShape_t<2> input_shape({static_cast<matx::index_t>(input_count), num_cols});
matx::tensorShape_t<1> output_shape({num_cols});
matx::index_t output_stride[2] = {input_stride[0], input_stride[1]};
if (output_stride[0] == 1)
{
output_stride[1] = num_output_rows;
}
auto input_ptr = static_cast<InputT *>(input_data) + (start * input_stride[0]);
auto output_ptr = static_cast<InputT *>(output_data) + (output_idx * output_stride[0]);
matx::tensor_t<InputT, 2> input_tensor(input_ptr, input_shape, {input_stride[0], input_stride[1]});
matx::tensor_t<InputT, 1> output_tensor(output_ptr, output_shape, {output_stride[1]});
// We need to transpose the input such that rmax will reduce the rows
// Matx performs reductions over the innermost dimensions.
// see https://nvidia.github.io/MatX/api/reduce.html
matx::rmax(output_tensor, input_tensor.Permute({1, 0}), stream.value());
}
};
// Component public implementations
// ************ MatxUtil************************* //
std::shared_ptr<rmm::device_buffer> MatxUtil::cast(const DevMemInfo &input, TypeId output_type) {
auto output_dtype = DType(output_type);
// Create the output
auto output = input.make_new_buffer(output_dtype.item_size() * input.count());
cudf::double_type_dispatcher(cudf::data_type{input.dtype().cudf_type_id()},
cudf::data_type{output_dtype.cudf_type_id()},
MatxUtil__MatxCast{input.count(), output->stream()},
input.data(),
output->data());
mrc::enqueue_stream_sync_event(output->stream()).get();
return output;
}
std::shared_ptr<rmm::device_buffer>
MatxUtil::create_seg_ids(size_t row_count, size_t fea_len, TypeId output_type) {
auto output_dtype = DType(output_type);
// Now create the output
auto output =
std::make_shared<rmm::device_buffer>(output_dtype.item_size() * row_count * 3,
rmm::cuda_stream_per_thread);
cudf::type_dispatcher(cudf::data_type{output_dtype.cudf_type_id()},
MatxUtil__MatxCreateSegIds{row_count, fea_len, output->stream()},
output->data());
return output;
}
std::shared_ptr<rmm::device_buffer> MatxUtil::logits(const DevMemInfo &input) {
// Create the output
auto output = input.make_new_buffer(input.bytes());
cudf::type_dispatcher(cudf::data_type{input.dtype().cudf_type_id()},
MatxUtil__MatxLogits{input.count(), output->stream()},
input.data(),
output->data());
return output;
}
std::shared_ptr<rmm::device_buffer> MatxUtil::transpose(const DevMemInfo &input) {
// Now create the output
auto output = input.make_new_buffer(input.bytes());
cudf::type_dispatcher(cudf::data_type{input.dtype().cudf_type_id()},
MatxUtil__MatxTranspose{input.count(), output->stream(), input.shape(0), input.shape(1)},
input.data(),
output->data());
return output;
}
std::shared_ptr<rmm::device_buffer>
MatxUtil::threshold(const DevMemInfo &input,
double thresh_val, bool by_row) {
const auto rows = input.shape(0);
const auto cols = input.shape(1);
std::size_t output_size = sizeof(bool) * rows;
if (!by_row) {
output_size *= cols;
}
// Now create the output array of bools
auto output = input.make_new_buffer(output_size);
cudf::type_dispatcher(cudf::data_type{input.dtype().cudf_type_id()},
MatxUtil__MatxThreshold{rows, cols, by_row, output->stream()},
input.data(),
output->data(),
thresh_val,
input.stride());
mrc::enqueue_stream_sync_event(output->stream()).get();
return output;
}
/**
 * @brief Reduces runs of consecutive input rows that share a sequence id to one output row each, taking the
 * maximum of every column.
 *
 * Input row `i` is associated with `seq_ids[seq_id_offset + i]`. Every maximal run of consecutive rows with
 * the same id is reduced by `MatxUtil__MatxReduceMax` into output row `id - seq_ids[seq_id_offset]`. The
 * function blocks until the work enqueued on the output buffer's stream has completed.
 *
 * @param input Description of the 2-D source data; `shape(0)` and `shape(1)` give its rows and columns.
 * @param seq_ids Sequence id of each row; must hold at least `seq_id_offset + input.shape(0)` entries.
 * @param seq_id_offset Index into `seq_ids` of the id belonging to the first input row.
 * @param output_shape Rows and columns of the output; must hold two non-negative entries.
 * @return Buffer obtained from `input.make_new_buffer`, sized for `output_shape[0] * output_shape[1]`
 *         elements of the input dtype. When the input has no rows nothing is written to it.
 * @throws std::invalid_argument if `output_shape` or `seq_ids` violates the requirements above, or if the
 *         input dtype is not a floating-point type.
 */
std::shared_ptr<rmm::device_buffer> MatxUtil::reduce_max(const DevMemInfo &input,
                                                         const std::vector<int32_t> &seq_ids,
                                                         size_t seq_id_offset,
                                                         const std::vector<int64_t> &output_shape)
{
    // output_shape[0] and output_shape[1] are read below; reject anything that would index out of bounds or
    // turn into a huge unsigned size.
    if (output_shape.size() < 2 || output_shape[0] < 0 || output_shape[1] < 0)
    {
        throw std::invalid_argument("output_shape must hold a non-negative row count and column count");
    }

    const auto &dtype   = input.dtype();
    auto cudf_type      = cudf::data_type{dtype.cudf_type_id()};
    auto num_input_rows = input.shape(0);
    auto num_input_cols = input.shape(1);

    // Every input row needs a sequence id. The check is written so it cannot overflow.
    if (seq_id_offset > seq_ids.size() || num_input_rows > seq_ids.size() - seq_id_offset)
    {
        throw std::invalid_argument("seq_ids does not provide a sequence id for every input row");
    }

    std::vector<matx::index_t> matx_stride{static_cast<matx::index_t>(input.stride(0)),
                                           static_cast<matx::index_t>(input.stride(1))};

    std::size_t output_element_count =
        static_cast<std::size_t>(output_shape[0]) * static_cast<std::size_t>(output_shape[1]);
    std::size_t output_buff_size = dtype.item_size() * output_element_count;

    DCHECK(output_element_count <= input.count()) << "Output buffer size should be less than or equal to the input";
    DCHECK(num_input_cols == static_cast<std::size_t>(output_shape[1]))
        << "Number of input and output columns must match";

    auto output = input.make_new_buffer(output_buff_size);

    // Nothing to reduce. Returning here avoids reading seq_ids[seq_id_offset] (which may not exist) and
    // launching a reduction that targets an output buffer that may be empty.
    if (num_input_rows == 0)
    {
        return output;
    }

    MatxUtil__MatxReduceMax matx_reduce_max{static_cast<matx::index_t>(num_input_rows),
                                            static_cast<matx::index_t>(num_input_cols),
                                            matx_stride,
                                            output_shape[0],
                                            input.data(),
                                            output->data(),
                                            output->stream()};

    // Output rows are numbered relative to the id of the first input row.
    const auto output_offset = seq_ids[seq_id_offset];

    // Reduces input rows [start, stop) into the output row that belongs to the run starting at `start`.
    auto reduce_rows = [&](std::size_t start, std::size_t stop) {
        cudf::type_dispatcher(
            cudf_type, matx_reduce_max, start, stop, seq_ids[start + seq_id_offset] - output_offset);
    };

    std::size_t run_start = 0;
    for (std::size_t i = 0; i < num_input_rows; ++i)
    {
        // A change of id closes the current run.
        if (seq_ids[i + seq_id_offset] != seq_ids[run_start + seq_id_offset])
        {
            reduce_rows(run_start, i);
            run_start = i;
        }
    }

    // The final run is still open when the loop ends.
    reduce_rows(run_start, num_input_rows);

    mrc::enqueue_stream_sync_event(output->stream()).get();

    return output;
}
}