Program Listing for File preprocess_nlp.hpp

Return to documentation for file (python/morpheus/morpheus/_lib/include/morpheus/stages/preprocess_nlp.hpp)

/*
 * SPDX-FileCopyrightText: Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#include "morpheus/export.h"              // for MORPHEUS_EXPORT
#include "morpheus/messages/control.hpp"  // for ControlMessage

#include <boost/fiber/context.hpp>                   // for operator<<
#include <cudf/strings/strings_column_view.hpp>      // for strings_column_view
#include <mrc/segment/builder.hpp>                   // for Builder
#include <mrc/segment/object.hpp>                    // for Object
#include <nvtext/subword_tokenize.hpp>               // for tokenizer_result
#include <pymrc/node.hpp>                            // for PythonNode
#include <rmm/mr/device/device_memory_resource.hpp>  // for device_memory_resource
#include <rxcpp/rx.hpp>                              // for trace_activity, decay_t, from

#include <cstdint>  // for uint32_t
#include <memory>   // for shared_ptr, allocator
#include <string>   // for string
#include <thread>   // for operator<<

// IWYU pragma: no_include "rxcpp/sources/rx-iterate.hpp"

namespace morpheus {
/****** Component public implementations *******************/
/****** PreprocessNLPStage**********************************/

/**
 * @brief Pipeline node that consumes `std::shared_ptr<ControlMessage>` and produces
 * `std::shared_ptr<ControlMessage>`.
 *
 * Only declarations live in this header; the constructor, `on_data` and `subword_tokenize` are defined
 * out of line.
 *
 * NOTE(review): judging by the `nvtext::tokenizer_result` / `cudf::strings_column_view` types used by the
 * private helper, this stage presumably runs subword tokenization over a string column of the incoming
 * message -- confirm against the implementation file.
 */
class MORPHEUS_EXPORT PreprocessNLPStage
  : public mrc::pymrc::PythonNode<std::shared_ptr<ControlMessage>, std::shared_ptr<ControlMessage>>
{
  public:
    // Shorthand for the PythonNode base; the using-declarations below bring its sink, source and
    // subscribe-function type names into this class's scope.
    using base_t = mrc::pymrc::PythonNode<std::shared_ptr<ControlMessage>, std::shared_ptr<ControlMessage>>;
    using typename base_t::sink_type_t;
    using typename base_t::source_type_t;
    using typename base_t::subscribe_fn_t;

    /**
     * @brief Construct a new PreprocessNLPStage.
     *
     * The descriptions below are inferred from the parameter names only; the header does not show how
     * the values are used (TODO confirm each against the implementation).
     *
     * @param vocab_hash_file : Presumably the path of the vocabulary hash file used by the tokenizer.
     * @param sequence_length : Presumably the token sequence length requested from the tokenizer.
     * @param truncation : Presumably whether over-long inputs are truncated.
     * @param do_lower_case : Presumably whether text is lower-cased before tokenizing.
     * @param add_special_token : Presumably whether special tokens are added to each sequence.
     * @param stride : Defaults to -1. NOTE(review): the meaning of the -1 sentinel is not visible here.
     * @param column : Defaults to "data". Presumably the name of the string column to process.
     */
    PreprocessNLPStage(std::string vocab_hash_file,
                       uint32_t sequence_length,
                       bool truncation,
                       bool do_lower_case,
                       bool add_special_token,
                       int stride         = -1,
                       std::string column = "data");

    /**
     * @brief Takes one value of the node's sink type and returns one value of its source type.
     *
     * NOTE(review): presumably invoked once per incoming message -- confirm against the implementation.
     *
     * @param x : Input value of type `sink_type_t`.
     * @return source_type_t
     */
    source_type_t on_data(sink_type_t x);

  private:
    /**
     * @brief Private helper returning an `nvtext::tokenizer_result` for the given string column.
     *
     * Caution: the boolean parameters here are ordered `do_lower_case, truncation`, the reverse of the
     * constructor's `truncation, do_lower_case`. Both are `bool`, so swapped arguments compile silently.
     *
     * @param vocab_hash_file : Vocabulary hash file name/path, taken by const reference.
     * @param sequence_length : See the constructor parameter of the same name.
     * @param do_lower_case : See the constructor parameter of the same name.
     * @param truncation : See the constructor parameter of the same name.
     * @param string_col : View of the strings column to process.
     * @param stride : See the constructor parameter of the same name.
     * @param mr : Device memory resource pointer; presumably used for the result's allocations (TODO
     * confirm).
     * @return nvtext::tokenizer_result
     */
    nvtext::tokenizer_result subword_tokenize(const std::string& vocab_hash_file,
                                              uint32_t sequence_length,
                                              bool do_lower_case,
                                              bool truncation,
                                              cudf::strings_column_view const& string_col,
                                              int stride,
                                              rmm::mr::device_memory_resource* mr);
    // Stage configuration. The names mirror the constructor parameters; presumably each is set from the
    // matching argument (the constructor body is not visible here). Only m_stride has an in-class
    // default, so the others rely on the constructor for initialization.
    std::string m_vocab_hash_file;
    std::string m_column;
    uint32_t m_sequence_length;
    bool m_truncation;
    bool m_do_lower_case;
    bool m_add_special_token;
    int m_stride{-1};
};

/****** PreprocessNLPStageInterfaceProxy********************/
/**
 * @brief Holder for a static factory function that returns a `PreprocessNLPStage` wrapped in an
 * `mrc::segment::Object`.
 *
 * NOTE(review): presumably this is the entry point used by the Python bindings -- confirm against the
 * binding code, which is not visible in this header.
 */
struct MORPHEUS_EXPORT PreprocessNLPStageInterfaceProxy
{
    /**
     * @brief Create a `PreprocessNLPStage` segment object.
     *
     * After `builder` and `name`, the parameter list repeats the `PreprocessNLPStage` constructor's
     * parameters in the same order and with the same defaults. NOTE(review): presumably they are
     * forwarded to that constructor unchanged -- confirm against the implementation.
     *
     * @param builder : Segment builder, taken by non-const reference; presumably the stage is created
     * through it (TODO confirm).
     * @param name : Name passed for the new object; presumably the node name within the segment.
     * @param vocab_hash_file : See the `PreprocessNLPStage` constructor.
     * @param sequence_length : See the `PreprocessNLPStage` constructor.
     * @param truncation : See the `PreprocessNLPStage` constructor.
     * @param do_lower_case : See the `PreprocessNLPStage` constructor.
     * @param add_special_token : See the `PreprocessNLPStage` constructor.
     * @param stride : Defaults to -1.
     * @param column : Defaults to "data".
     * @return std::shared_ptr<mrc::segment::Object<PreprocessNLPStage>>
     */
    static std::shared_ptr<mrc::segment::Object<PreprocessNLPStage>> init(mrc::segment::Builder& builder,
                                                                          const std::string& name,
                                                                          std::string vocab_hash_file,
                                                                          uint32_t sequence_length,
                                                                          bool truncation,
                                                                          bool do_lower_case,
                                                                          bool add_special_token,
                                                                          int stride         = -1,
                                                                          std::string column = "data");
};  // end of group
}  // namespace morpheus