Program Listing for File preprocess_nlp.hpp
↰ Return to documentation for file (morpheus/_lib/include/morpheus/stages/preprocess_nlp.hpp)
/*
* SPDX-FileCopyrightText: Copyright (c) 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "morpheus/export.h" // for exporting symbols
#include "morpheus/messages/control.hpp" // for ControlMessage
#include "morpheus/messages/multi.hpp" // for MultiMessage
#include "morpheus/messages/multi_inference.hpp" // for MultiInferenceMessage
#include <boost/fiber/context.hpp> // for operator<<
#include <cudf/strings/strings_column_view.hpp> // for strings_column_view
#include <mrc/segment/builder.hpp> // for Builder
#include <mrc/segment/object.hpp> // for Object
#include <nvtext/subword_tokenize.hpp> // for tokenizer_result
#include <pymrc/node.hpp> // for PythonNode
#include <rmm/mr/device/device_memory_resource.hpp> // for device_memory_resource
#include <rxcpp/rx.hpp> // for observable_member, trace_activity, decay_t
#include <cstdint> // for uint32_t
#include <memory> // for shared_ptr, allocator
#include <string> // for string
#include <thread> // for operator<<
// IWYU pragma: no_include "rxcpp/sources/rx-iterate.hpp"
namespace morpheus {
/****** Component public implementations *******************/
/****** PreprocessNLPStage**********************************/
/**
 * @brief Pipeline node, built on `mrc::pymrc::PythonNode`, that receives `std::shared_ptr<InputT>` and emits
 * `std::shared_ptr<OutputT>`.
 *
 * Only declarations live in this header; the member definitions are in a separate translation unit.
 * NOTE(review): judging by the names and the `nvtext` tokenizer types used below, the stage presumably
 * tokenizes a string column into subword tokens for inference -- confirm against the implementation.
 *
 * @tparam InputT Message type the node receives (held by `std::shared_ptr`).
 * @tparam OutputT Message type the node emits (held by `std::shared_ptr`).
 */
template <typename InputT, typename OutputT>
class MORPHEUS_EXPORT PreprocessNLPStage
: public mrc::pymrc::PythonNode<std::shared_ptr<InputT>, std::shared_ptr<OutputT>>
{
public:
// Shorthand for the PythonNode base, followed by the base's sink/source/subscribe types pulled into this scope.
using base_t = mrc::pymrc::PythonNode<std::shared_ptr<InputT>, std::shared_ptr<OutputT>>;
using typename base_t::sink_type_t;
using typename base_t::source_type_t;
using typename base_t::subscribe_fn_t;
/**
 * @brief Construct the stage from its tokenizer settings.
 *
 * NOTE(review): the parameter descriptions below are inferred from the names and from the matching
 * `subword_tokenize` arguments; confirm them against the implementation.
 *
 * @param vocab_hash_file Presumably the path of the hashed vocabulary file used by the tokenizer.
 * @param sequence_length Presumably the token sequence length produced per row.
 * @param truncation Presumably whether over-long input is truncated rather than split.
 * @param do_lower_case Presumably whether text is lower-cased before tokenizing.
 * @param add_special_token Presumably whether special tokens are added to each sequence.
 * @param stride Defaults to -1; the meaning of the negative default is decided by the implementation.
 * @param column Name of the column to process; defaults to "data".
 */
PreprocessNLPStage(std::string vocab_hash_file,
uint32_t sequence_length,
bool truncation,
bool do_lower_case,
bool add_special_token,
int stride = -1,
std::string column = "data");
/**
 * @brief Convert one received value into one emitted value.
 *
 * @param x Value received by the node.
 * @return Value to emit downstream.
 */
source_type_t on_data(sink_type_t x);
private:
// Type-specific handlers: one maps a MultiMessage to a MultiInferenceMessage, the other maps a
// ControlMessage to a ControlMessage. Presumably `on_data` dispatches to these -- confirm in the implementation.
std::shared_ptr<MultiInferenceMessage> on_multi_message(std::shared_ptr<MultiMessage> x);
std::shared_ptr<ControlMessage> on_control_message(std::shared_ptr<ControlMessage> x);
/**
 * @brief Tokenize a string column and return the `nvtext::tokenizer_result`.
 *
 * Note the argument order: `do_lower_case` comes before `truncation` here, which is the reverse of the
 * constructor's order, so the two adjacent bools are easy to swap at a call site.
 *
 * @param vocab_hash_file Presumably the path of the hashed vocabulary file.
 * @param sequence_length Presumably the token sequence length produced per row.
 * @param do_lower_case Presumably whether text is lower-cased before tokenizing.
 * @param truncation Presumably whether over-long input is truncated rather than split.
 * @param string_col View of the strings column to tokenize.
 * @param stride Stride setting for the tokenizer; semantics are defined by the implementation.
 * @param mr Device memory resource; presumably used for the result's allocations -- confirm.
 * @return The tokenizer result.
 */
nvtext::tokenizer_result subword_tokenize(const std::string& vocab_hash_file,
uint32_t sequence_length,
bool do_lower_case,
bool truncation,
cudf::strings_column_view const& string_col,
int stride,
rmm::mr::device_memory_resource* mr);
// Stored settings; each name mirrors a constructor parameter. Only m_stride carries an in-class
// default (-1, the same value as the constructor's default argument).
std::string m_vocab_hash_file;
std::string m_column;
uint32_t m_sequence_length;
bool m_truncation;
bool m_do_lower_case;
bool m_add_special_token;
int m_stride{-1};
};
// Stage instantiation with MultiMessage as the input type and MultiInferenceMessage as the output type.
using PreprocessNLPStageMM = // NOLINT(readability-identifier-naming)
PreprocessNLPStage<MultiMessage, MultiInferenceMessage>;
// Stage instantiation with ControlMessage as both the input type and the output type.
using PreprocessNLPStageCM = // NOLINT(readability-identifier-naming)
PreprocessNLPStage<ControlMessage, ControlMessage>;
/****** PreprocessNLPStageInterfaceProxy********************/
/**
 * @brief Static factory functions that take a segment builder and return a `PreprocessNLPStage` wrapped in
 * an `mrc::segment::Object`.
 *
 * NOTE(review): presumably this is the entry point used by the Python bindings -- confirm at the binding site.
 */
struct MORPHEUS_EXPORT PreprocessNLPStageInterfaceProxy
{
/**
 * @brief Factory for the MultiMessage -> MultiInferenceMessage instantiation of the stage.
 *
 * The arguments after `name` have the same names, order and defaults as the `PreprocessNLPStage`
 * constructor parameters; presumably they are forwarded to it unchanged -- confirm in the implementation.
 *
 * @param builder Segment builder passed in by the caller; presumably the node is created through it.
 * @param name Name for the node.
 * @param vocab_hash_file See the `PreprocessNLPStage` constructor.
 * @param sequence_length See the `PreprocessNLPStage` constructor.
 * @param truncation See the `PreprocessNLPStage` constructor.
 * @param do_lower_case See the `PreprocessNLPStage` constructor.
 * @param add_special_token See the `PreprocessNLPStage` constructor.
 * @param stride Defaults to -1.
 * @param column Defaults to "data".
 * @return Shared pointer to the segment object wrapping the stage.
 */
static std::shared_ptr<mrc::segment::Object<PreprocessNLPStage<MultiMessage, MultiInferenceMessage>>> init_multi(
mrc::segment::Builder& builder,
const std::string& name,
std::string vocab_hash_file,
uint32_t sequence_length,
bool truncation,
bool do_lower_case,
bool add_special_token,
int stride = -1,
std::string column = "data");
/**
 * @brief Factory for the ControlMessage -> ControlMessage instantiation of the stage.
 *
 * Takes the same parameter list as `init_multi`; only the stage instantiation in the return type differs.
 *
 * @param builder Segment builder passed in by the caller; presumably the node is created through it.
 * @param name Name for the node.
 * @param vocab_hash_file See the `PreprocessNLPStage` constructor.
 * @param sequence_length See the `PreprocessNLPStage` constructor.
 * @param truncation See the `PreprocessNLPStage` constructor.
 * @param do_lower_case See the `PreprocessNLPStage` constructor.
 * @param add_special_token See the `PreprocessNLPStage` constructor.
 * @param stride Defaults to -1.
 * @param column Defaults to "data".
 * @return Shared pointer to the segment object wrapping the stage.
 */
static std::shared_ptr<mrc::segment::Object<PreprocessNLPStage<ControlMessage, ControlMessage>>> init_cm(
mrc::segment::Builder& builder,
const std::string& name,
std::string vocab_hash_file,
uint32_t sequence_length,
bool truncation,
bool do_lower_case,
bool add_special_token,
int stride = -1,
std::string column = "data");
}; // end of group
} // namespace morpheus