|
|
NVIDIA DeepStream SDK API Reference
|
9.0 Release
|
Go to the documentation of this file.
21 #ifndef __NVDSINFER_TRTIS_BACKEND_H__
22 #define __NVDSINFER_TRTIS_BACKEND_H__
26 #include "infer_post_datatypes.h"
31 class TrtServerAllocator;
33 class TrtServerRequest;
34 class TrtServerResponse;
39 class TrtISBackend :
public BaseBackend {
59 m_ClassifyParams.emplace_back(c); }
72 return m_ClassifyParams;}
73 const std::string&
model()
const {
return m_Model; }
74 int64_t
version()
const {
return m_ModelVersion; }
112 size_t& bytes = m_TensorMaxBytes[name];
113 bytes = std::max<size_t>(maxBytes, bytes);
150 m_ResponseAllocator = std::move(allocator);
200 const std::string& tensor,
size_t bytes,
InferMemType memType, int64_t devId);
224 using PoolKey = std::tuple<std::string, int64_t, InferMemType>;
255 std::shared_ptr<TrtServerRequest> request,
256 std::unique_ptr<TrtServerResponse> uniqResponse,
318 int64_t m_ModelVersion = -1;
327 bool m_NeedUnload =
false;
331 std::vector<TritonClassParams> m_ClassifyParams;
343 int64_t m_OutputDevId = -1;
347 int m_PerPoolSize = 2;
351 std::map<PoolKey, PoolValue> m_ResponsePool;
355 using SharedMutex = std::shared_timed_mutex;
359 SharedMutex m_ResponseMutex;
363 std::unordered_map<std::string, size_t> m_TensorMaxBytes;
368 using ReorderThread = QueueThread<std::vector<ReorderItemPtr>>;
372 std::unique_ptr<ReorderThread> m_ReorderThread;
This is a header file for pre-processing cuda kernels with normalization and mean subtraction require...
SharedBufPool< UniqSysMem > PoolValue
The buffer pool for the specified tensor, GPU and memory type combination.
PoolValue findResponsePool(PoolKey &key)
Find the buffer pool for the given key.
#define INFER_MEM_ALIGNMENT
std::unique_ptr< TrtServerAllocator > UniqTritonAllocator
NvDsInferStatus initialize() override
Check that the server and the model are ready, get the information of layers, set up the reorder thread and out...
const std::string & model() const
Header file of the common declarations for the nvinferserver library.
virtual NvDsInferStatus ensureModelReady()
Check that the model is ready; load the model if it is not.
std::vector< InputShapeTuple > InputShapes
Header file containing utility functions and classes used by the nvinferserver low level library.
std::function< void(SharedBatchArray)> InputsConsumed
Function wrapper called after the input buffer is consumed.
SharedBatchArray outputs
Array of output batch buffers.
NvDsInferStatus specifyInputDims(const InputShapes &shapes) override
Specify the input layers for the backend.
std::shared_ptr< BaseBatchArray > SharedBatchArray
@ NVDSINFER_SUCCESS
NvDsInferContext operation succeeded.
void setAllocator(UniqTritonAllocator allocator)
Set the output tensor allocator.
InferMemType outputMemType() const
PoolValue createResponsePool(PoolKey &key, size_t bytes)
Create a new buffer pool for the key.
std::function< void(NvDsInferStatus, SharedBatchArray)> AsyncDone
Asynchronous inference done function: AsyncDone(Status, outputs).
bool debatchingOutput(SharedBatchArray &outputs, SharedBatchArray &inputs)
Separate the batch dimension from the output buffer descriptors.
std::function< void(NvDsInferStatus, SharedBatchArray)> InferenceDone
Function wrapper for post inference processing.
NvDsInferStatus status
Status of processing.
int64_t outputDevId() const
NvDsInferStatus ensureInputs(SharedBatchArray &inputs)
Ensure that the input buffers in the array are those expected by the model, and reshape the input buffers if req...
std::shared_ptr< CudaStream > SharedCuStream
Cuda based pointers.
std::shared_ptr< TrtServerAllocator > ShrTritonAllocator
NvDsInferStatus enqueue(SharedBatchArray inputs, SharedCuStream stream, InputsConsumed bufConsumed, InferenceDone inferenceDone) override
Enqueue an input for an inference request by calling Run() and adding the corresponding task to the reorder ...
void setTensorMaxBytes(const std::string &name, size_t maxBytes)
Set the maximum size for the tensor; the maximum of the existing size and the new input size is used.
void setOutputPoolSize(int size)
Set the size of the output buffer pool.
InferenceDone inferenceDone
Inference done callback function.
virtual NvDsInferStatus ensureServerReady()
Check that the Triton inference server is live.
virtual void requestTritonOutputNames(std::set< std::string > &outNames)
Get the list of output tensor names.
virtual NvDsInferStatus Run(SharedBatchArray inputs, InputsConsumed bufConsumed, AsyncDone asyncDone)
Create an inference request and trigger asynchronous inference.
std::shared_ptr< SysMem > SharedSysMem
std::shared_ptr< TrtISServer > TrtServerPtr
#define INFER_ROUND_UP(value, align)
SharedSysMem allocateResponseBuf(const std::string &tensor, size_t bytes, InferMemType memType, int64_t devId)
Acquire a buffer from the output buffer pool associated with the device ID and memory type.
std::vector< TritonClassParams > getClassifyParams()
InferMemType
The memory types of inference buffers.
std::tuple< std::string, int64_t, InferMemType > PoolKey
Tuple holding tensor name, GPU ID, memory type.
void setOutputMemType(InferMemType memType)
void releaseResponseBuf(const std::string &tensor, SharedSysMem mem)
Release the output tensor buffer.
void setOutputDevId(int64_t devId)
bool inferenceDoneReorderLoop(ReorderItemPtr item)
Add input buffers to the output buffer list if required.
~TrtISBackend() override
Destructor.
int outputPoolSize() const
void serverInferCompleted(std::shared_ptr< TrtServerRequest > request, std::unique_ptr< TrtServerResponse > uniqResponse, InputsConsumed inputsConsumed, AsyncDone asyncDone)
Call the inputs consumed function and parse the inference response to form the array of output batch ...
NvDsInferStatus fixateDims(const SharedBatchArray &bufs)
Extend the dimensions to include batch size for the buffers in input array.
NvDsInferStatus setupReorderThread()
Create a loop thread that calls inferenceDoneReorderLoop on the queued items.
SharedBatchArray inputs
Array of input batch buffers.
TrtServerPtr & server()
Get the Triton server handle.
std::promise< void > promise
Synchronization objects.
std::future< void > future
TrtISBackend(const std::string &name, int64_t version, TrtServerPtr ptr=nullptr)
Constructor.
virtual NvDsInferStatus setupLayersInfo()
Get the model configuration from the server and populate layer information.
std::shared_ptr< ReorderItem > ReorderItemPtr
void addClassifyParams(const TritonClassParams &c)
Add Triton Classification parameters to the list.
Header file for inference processing backend base class.
NvDsInferStatus
Enum for the status codes returned by NvDsInferContext.