NVIDIA Morpheus (24.06)
(Latest Version)

Program Listing for File preprocess_nlp.cpp

Return to documentation for file (morpheus/_lib/src/stages/preprocess_nlp.cpp)

Copy
Copied!
            

/* * SPDX-FileCopyrightText: Copyright (c) 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include "morpheus/stages/preprocess_nlp.hpp" #include "mrc/segment/object.hpp" // for Object #include "morpheus/messages/control.hpp" // for ControlMessage #include "morpheus/messages/memory/inference_memory.hpp" // for InferenceMemory #include "morpheus/messages/memory/tensor_memory.hpp" // for TensorMemory #include "morpheus/messages/meta.hpp" // for MessageMeta #include "morpheus/messages/multi.hpp" // for MultiMessage #include "morpheus/messages/multi_inference.hpp" // for MultiInferenceMessage #include "morpheus/objects/dev_mem_info.hpp" // for DevMemInfo #include "morpheus/objects/dtype.hpp" // for DType #include "morpheus/objects/table_info.hpp" // for TableInfo #include "morpheus/objects/tensor.hpp" // for Tensor #include "morpheus/types.hpp" // for TensorIndex #include "morpheus/utilities/matx_util.hpp" // for MatxUtil #include <cudf/column/column.hpp> // for column #include <cudf/column/column_factories.hpp> // for make_column_from_scalar #include <cudf/column/column_view.hpp> // for column_view #include <cudf/filling.hpp> // for sequence #include <cudf/reshape.hpp> // for interleave_columns #include <cudf/scalar/scalar.hpp> // for numeric_scalar #include <cudf/strings/strings_column_view.hpp> // for strings_column_view #include <cudf/table/table_view.hpp> // for table_view #include <cudf/types.hpp> // for type_id, data_type #include <cudf/unary.hpp> // for cast #include <mrc/segment/builder.hpp> // for Builder #include <nvtext/normalize.hpp> // for normalize_spaces #include <nvtext/subword_tokenize.hpp> // for tokenizer_result, load_vocabulary_file, subword_tok... #include <rmm/cuda_stream_view.hpp> // for cuda_stream_default #include <rmm/device_buffer.hpp> // for device_buffer #include <rmm/mr/device/per_device_resource.hpp> // for get_current_device_resource #include <cstdint> // for uint32_t, int32_t #include <memory> // for shared_ptr, unique_ptr, __shared_ptr_access, make_s... #include <type_traits> // for is_same_v #include <utility> // for move #include <vector> // for vector namespace morpheus { // Component public implementations // ************ PreprocessNLPStage ************************* // template <typename InputT, typename OutputT> PreprocessNLPStage<InputT, OutputT>::PreprocessNLPStage(std::string vocab_hash_file, uint32_t sequence_length, bool truncation, bool do_lower_case, bool add_special_token, int stride, std::string column) : base_t(rxcpp::operators::map([this](sink_type_t x) { return this->on_data(std::move(x)); })), m_vocab_hash_file(std::move(vocab_hash_file)), m_sequence_length(sequence_length), m_truncation(truncation), m_do_lower_case(do_lower_case), m_add_special_token(add_special_token), m_column(std::move(column)) { // Auto calc stride to be 75% of sequence length if (stride < 0) { stride = m_sequence_length / 2; stride = stride + stride / 2; } m_stride = stride; } template <typename InputT, typename OutputT> PreprocessNLPStage<InputT, OutputT>::source_type_t PreprocessNLPStage<InputT, OutputT>::on_data(sink_type_t x) { if constexpr (std::is_same_v<sink_type_t, std::shared_ptr<MultiMessage>>) { return this->on_multi_message(x); } else if constexpr (std::is_same_v<sink_type_t, std::shared_ptr<ControlMessage>>) { return this->on_control_message(x); } else { // sink_type_t not supported static_assert(!sizeof(sink_type_t), "PreProcessNLPStage receives unsupported input type"); } } template <> std::shared_ptr<MultiInferenceMessage> PreprocessNLPStage<MultiMessage, MultiInferenceMessage>::on_multi_message( std::shared_ptr<MultiMessage> x) { // Convert to string view auto meta = x->get_meta(this->m_column); auto col = meta.get_column(0); auto string_col = cudf::strings_column_view{col}; auto token_results = subword_tokenize(this->m_vocab_hash_file, this->m_sequence_length, this->m_do_lower_case, this->m_truncation, string_col, this->m_stride, rmm::mr::get_current_device_resource()); // Build the results auto memory = std::make_shared<InferenceMemory>(token_results.nrows_tensor); TensorIndex length = token_results.tensor_token_ids->size() / token_results.sequence_length; auto input_ids_released = cudf::cast(token_results.tensor_token_ids->view(), cudf::data_type(cudf::type_id::INT32))->release(); memory->set_tensor("input_ids", Tensor::create(std::move(input_ids_released.data), DType::create<int32_t>(), {length, static_cast<TensorIndex>(token_results.sequence_length)}, {}, 0)); length = token_results.tensor_attention_mask->size() / token_results.sequence_length; auto input_mask_released = cudf::cast(token_results.tensor_attention_mask->view(), cudf::data_type(cudf::type_id::INT32))->release(); memory->set_tensor("input_mask", Tensor::create(std::move(input_mask_released.data), DType::create<int32_t>(), {length, static_cast<TensorIndex>(token_results.sequence_length)}, {}, 0)); auto tensor_index_dtype = DType::create<TensorIndex>(); length = token_results.tensor_metadata->size() / 3; auto seq_ids_released = cudf::cast(token_results.tensor_metadata->view(), cudf::data_type(tensor_index_dtype.cudf_type_id())) ->release(); std::shared_ptr<rmm::device_buffer> seq_ids_data = std::move(seq_ids_released.data); if (x->mess_offset > 0) { // Add an offset to the seq_ids so the message IDs line up MatxUtil::offset_seq_ids(DevMemInfo{seq_ids_data, tensor_index_dtype.type_id(), {length, 3}, {1, 3}}, x->mess_offset); } memory->set_tensor("seq_ids", Tensor::create(seq_ids_data, tensor_index_dtype, {length, 3}, {}, 0)); auto next = std::make_shared<MultiInferenceMessage>( x->meta, x->mess_offset, x->mess_count, std::move(memory), 0, memory->count); return std::move(next); } template <> std::shared_ptr<ControlMessage> PreprocessNLPStage<ControlMessage, ControlMessage>::on_control_message( std::shared_ptr<ControlMessage> x) { // Convert to string view auto meta = x->payload()->get_info(this->m_column); auto col = meta.get_column(0); auto string_col = cudf::strings_column_view{col}; auto token_results = subword_tokenize(this->m_vocab_hash_file, this->m_sequence_length, this->m_do_lower_case, this->m_truncation, string_col, this->m_stride, rmm::mr::get_current_device_resource()); // Build the results auto memory = std::make_shared<TensorMemory>(token_results.nrows_tensor); TensorIndex length = token_results.tensor_token_ids->size() / token_results.sequence_length; auto input_ids_released = cudf::cast(token_results.tensor_token_ids->view(), cudf::data_type(cudf::type_id::INT32))->release(); memory->set_tensor("input_ids", Tensor::create(std::move(input_ids_released.data), DType::create<int32_t>(), {length, static_cast<TensorIndex>(token_results.sequence_length)}, {}, 0)); length = token_results.tensor_attention_mask->size() / token_results.sequence_length; auto input_mask_released = cudf::cast(token_results.tensor_attention_mask->view(), cudf::data_type(cudf::type_id::INT32))->release(); memory->set_tensor("input_mask", Tensor::create(std::move(input_mask_released.data), DType::create<int32_t>(), {length, static_cast<TensorIndex>(token_results.sequence_length)}, {}, 0)); auto tensor_index_dtype = DType::create<TensorIndex>(); length = token_results.tensor_metadata->size() / 3; auto seq_ids_released = cudf::cast(token_results.tensor_metadata->view(), cudf::data_type(tensor_index_dtype.cudf_type_id())) ->release(); std::shared_ptr<rmm::device_buffer> seq_ids_data = std::move(seq_ids_released.data); memory->set_tensor("seq_ids", Tensor::create(seq_ids_data, tensor_index_dtype, {length, 3}, {}, 0)); auto next = x; next->tensors(memory); return std::move(next); } template <typename InputT, typename OutputT> nvtext::tokenizer_result PreprocessNLPStage<InputT, OutputT>::subword_tokenize( const std::string& vocab_hash_file, uint32_t sequence_length, bool do_lower_case, bool truncation, cudf::strings_column_view const& string_col, int stride, rmm::mr::device_memory_resource* mr) { // Create the hashed vocab thread_local std::unique_ptr<nvtext::hashed_vocabulary> vocab = nvtext::load_vocabulary_file(vocab_hash_file); // remove leading and trailing whitespace auto normalized_col = nvtext::normalize_spaces(string_col); auto normalized_col_view = cudf::strings_column_view{normalized_col->view()}; // Perform the tokenizer nvtext::tokenizer_result token_results; if (normalized_col_view.chars_size(rmm::cuda_stream_default) > 0) { token_results = nvtext::subword_tokenize(normalized_col_view, *vocab, sequence_length, stride, do_lower_case, truncation, rmm::mr::get_current_device_resource()); } else { // workaround for a situation where the input strings contain either no characters or only // whitespace auto zero = cudf::numeric_scalar<uint32_t>(0, true, rmm::cuda_stream_default); auto ids = cudf::make_column_from_scalar(zero, sequence_length * normalized_col_view.size()); auto mask = cudf::make_column_from_scalar(zero, sequence_length * normalized_col_view.size()); auto metadata = [&]() { auto iota = cudf::sequence(normalized_col_view.size(), zero); auto zeroes = cudf::make_column_from_scalar(zero, normalized_col_view.size()); return cudf::interleave_columns( cudf::table_view{std::vector<cudf::column_view>{iota->view(), zeroes->view(), zeroes->view()}}); }(); token_results = nvtext::tokenizer_result{static_cast<uint32_t>(normalized_col_view.size()), sequence_length, std::move(ids), std::move(mask), std::move(metadata)}; } return token_results; } template class PreprocessNLPStage<MultiMessage, MultiInferenceMessage>; template class PreprocessNLPStage<ControlMessage, ControlMessage>; // ************ PreprocessNLPStageInterfaceProxy *********** // std::shared_ptr<mrc::segment::Object<PreprocessNLPStageMM>> PreprocessNLPStageInterfaceProxy::init_multi( mrc::segment::Builder& builder, const std::string& name, std::string vocab_hash_file, uint32_t sequence_length, bool truncation, bool do_lower_case, bool add_special_token, int stride, std::string column) { auto stage = builder.construct_object<PreprocessNLPStageMM>( name, vocab_hash_file, sequence_length, truncation, do_lower_case, add_special_token, stride, column); return stage; } std::shared_ptr<mrc::segment::Object<PreprocessNLPStageCM>> PreprocessNLPStageInterfaceProxy::init_cm( mrc::segment::Builder& builder, const std::string& name, std::string vocab_hash_file, uint32_t sequence_length, bool truncation, bool do_lower_case, bool add_special_token, int stride, std::string column) { auto stage = builder.construct_object<PreprocessNLPStageCM>( name, vocab_hash_file, sequence_length, truncation, do_lower_case, add_special_token, stride, column); return stage; } } // namespace morpheus

© Copyright 2024, NVIDIA. Last updated on Jul 8, 2024.