Program Listing for File preprocess_nlp.hpp

Return to documentation for file (morpheus/_lib/include/morpheus/stages/preprocess_nlp.hpp)

Copy
Copied!
            

/* * SPDX-FileCopyrightText: Copyright (c) 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #pragma once #include "morpheus/messages/control.hpp" // for ControlMessage #include "morpheus/messages/multi.hpp" // for MultiMessage #include "morpheus/messages/multi_inference.hpp" // for MultiInferenceMessage #include <boost/fiber/context.hpp> // for operator<< #include <cudf/strings/strings_column_view.hpp> // for strings_column_view #include <mrc/segment/builder.hpp> // for Builder #include <mrc/segment/object.hpp> // for Object #include <nvtext/subword_tokenize.hpp> // for tokenizer_result #include <pymrc/node.hpp> // for PythonNode #include <rmm/mr/device/device_memory_resource.hpp> // for device_memory_resource #include <rxcpp/rx.hpp> // for observable_member, trace_activity, decay_t #include <cstdint> // for uint32_t #include <memory> // for shared_ptr, allocator #include <string> // for string #include <thread> // for operator<< // IWYU pragma: no_include "rxcpp/sources/rx-iterate.hpp" namespace morpheus { /****** Component public implementations *******************/ /****** PreprocessNLPStage**********************************/ #pragma GCC visibility push(default) template <typename InputT, typename OutputT> class PreprocessNLPStage : public mrc::pymrc::PythonNode<std::shared_ptr<InputT>, std::shared_ptr<OutputT>> { public: using base_t = mrc::pymrc::PythonNode<std::shared_ptr<InputT>, std::shared_ptr<OutputT>>; using typename base_t::sink_type_t; using typename base_t::source_type_t; using typename base_t::subscribe_fn_t; PreprocessNLPStage(std::string vocab_hash_file, uint32_t sequence_length, bool truncation, bool do_lower_case, bool add_special_token, int stride = -1, std::string column = "data"); source_type_t on_data(sink_type_t x); private: std::shared_ptr<MultiInferenceMessage> on_multi_message(std::shared_ptr<MultiMessage> x); std::shared_ptr<ControlMessage> on_control_message(std::shared_ptr<ControlMessage> x); nvtext::tokenizer_result subword_tokenize(const std::string& vocab_hash_file, uint32_t sequence_length, bool do_lower_case, bool truncation, cudf::strings_column_view const& string_col, int stride, rmm::mr::device_memory_resource* mr); std::string m_vocab_hash_file; std::string m_column; uint32_t m_sequence_length; bool m_truncation; bool m_do_lower_case; bool m_add_special_token; int m_stride{-1}; }; using PreprocessNLPStageMM = // NOLINT(readability-identifier-naming) PreprocessNLPStage<MultiMessage, MultiInferenceMessage>; using PreprocessNLPStageCM = // NOLINT(readability-identifier-naming) PreprocessNLPStage<ControlMessage, ControlMessage>; /****** PreprocessNLPStageInferenceProxy********************/ struct PreprocessNLPStageInterfaceProxy { static std::shared_ptr<mrc::segment::Object<PreprocessNLPStage<MultiMessage, MultiInferenceMessage>>> init_multi( mrc::segment::Builder& builder, const std::string& name, std::string vocab_hash_file, uint32_t sequence_length, bool truncation, bool do_lower_case, bool add_special_token, int stride = -1, std::string column = "data"); static std::shared_ptr<mrc::segment::Object<PreprocessNLPStage<ControlMessage, ControlMessage>>> init_cm( mrc::segment::Builder& builder, const std::string& name, std::string vocab_hash_file, uint32_t sequence_length, bool truncation, bool do_lower_case, bool add_special_token, int stride = -1, std::string column = "data"); }; #pragma GCC visibility pop// end of group } // namespace morpheus

© Copyright 2024, NVIDIA. Last updated on Apr 11, 2024.