NVIDIA DeepStream SDK API Reference

6.4 Release
infer_trtis_server.h
1 /*
2  * SPDX-FileCopyrightText: Copyright (c) 2020-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3  * SPDX-License-Identifier: LicenseRef-NvidiaProprietary
4  *
5  * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
6  * property and proprietary rights in and to this material, related
7  * documentation and any modifications thereto. Any use, reproduction,
8  * disclosure or distribution of this material and related documentation
9  * without an express license agreement from NVIDIA CORPORATION or
10  * its affiliates is strictly prohibited.
11  */
12 
24 #ifndef __NVDSINFER_TRTIS_SERVER_H__
25 #define __NVDSINFER_TRTIS_SERVER_H__
26 
27 #include <string>
28 #include <thread>
29 #include <vector>
30 #include <memory>
31 
32 #include "infer_batch_buffer.h"
33 #include "infer_post_datatypes.h"
34 #include "infer_proto_utils.h"
35 #include "infer_trtis_utils.h"
36 
37 #include "nvds_version.h"
38 
39 #ifdef IS_TEGRA
40 #define TRITON_DEFAULT_MINIMUM_COMPUTE_CAPABILITY 5.3
41 #define TRITON_DEFAULT_PINNED_MEMORY_BYTES (1 << 26)
42 #else
43 #define TRITON_DEFAULT_MINIMUM_COMPUTE_CAPABILITY 6.0
44 #define TRITON_DEFAULT_PINNED_MEMORY_BYTES (1 << 28)
45 #endif
46 #define TRITON_DEFAULT_BACKEND_DIR "/opt/tritonserver/backends"
47 
48 struct TRITONSERVER_Server;
49 
50 namespace nvdsinferserver {
51 
52 namespace ni = inference;
53 
54 class TrtISServer;
55 class TrtServerRequest;
56 class TrtServerResponse;
57 class TrtServerAllocator;
58 
59 using SharedRequest = std::shared_ptr<TrtServerRequest>;
60 using UniqResponse = std::unique_ptr<TrtServerResponse>;
61 using SharedResponse = std::shared_ptr<TrtServerResponse>;
62 
63 using TritonInferAsyncDone = std::function<void(SharedRequest, UniqResponse)>;
64 
68 class TrtServerRequest {
69 protected:
70  friend class TrtISServer;
77 
89  NvDsInferStatus init(
90  const std::string& model, int64_t version, SharedBatchArray& inputs,
91  const std::vector<std::string>& outputs, uint64_t reqId,
92  const std::vector<TritonClassParams>& clasList);
93 
100  NvDsInferStatus setRequestComplete(
101  TRITONSERVER_InferenceRequestReleaseFn_t requestCompleteCb,
102  void* userPtr);
103 
111  NvDsInferStatus setResponseComplete(
112  ShrTritonAllocator& allocator,
113  TRITONSERVER_InferenceResponseCompleteFn_t responseCompleteCb,
114  void* responseUserPtr);
115 
122  static void RequestOnRelease(
123  TRITONSERVER_InferenceRequest* request, const uint32_t flags,
124  void* userp);
125 
126 public:
131 
135  TRITONSERVER_InferenceRequest* ptr() { return m_ReqPtr.get(); }
139  const std::string& model() const { return m_Model; }
143  uint64_t id() const { return m_ReqId; }
147  uint64_t bufId() const { return m_BufId; }
152  SharedBatchArray releaseInputs() { return std::move(m_Inputs); }
156  const std::vector<std::string>& outputs() const { return m_Outputs; }
161  const std::map<std::string, TritonClassParams>& classParams() const {
162  return m_ClasList; }
163 
164 private:
171  NvDsInferStatus setInputs(SharedBatchArray& inputs);
176  NvDsInferStatus setOption(const IOptions* option);
177 
179 
180 private:
188  std::weak_ptr<TrtISServer> m_Server;
192  SharedBatchArray m_Inputs;
196  std::string m_Model;
200  uint64_t m_ReqId = UINT64_C(0);
204  uint64_t m_BufId = UINT64_C(0);
205 
209  std::vector<std::string> m_Outputs;
213  std::map<std::string, TritonClassParams> m_ClasList;
214 };
215 
219 class TrtServerResponse {
220  friend class TrtISServer;
221 
222 protected:
229  TrtServerResponse(
230  TrtServerPtr server, UniqTritonT<TRITONSERVER_InferenceResponse> data,
231  uint64_t id);
232 
233 public:
248  const std::string& model() const { return m_Model; }
252  std::vector<SharedBatchBuf>& mutableOutputs() { return m_BufOutputs; }
256  NvDsInferStatus getStatus() const { return m_Status; }
260  SharedOptions takeoverOptions() { return std::move(m_Options); }
261 
262 private:
266  NvDsInferStatus parseParams();
267 
280  NvDsInferStatus parseOutputData(const TrtServerRequest* req);
281 
292  NvDsInferStatus addClass(
293  const TritonClassParams& classP, const InferBufferDescription& desc, uint32_t batchSize,
294  uint32_t tensorIdx, const void* base);
295 
306  NvDsInferStatus topKClass(
307  InferClassificationOutput& ret, const TritonClassParams& classP,
308  const InferBufferDescription& desc, uint32_t tensorIdx, const void* base);
309 
311 
315  uint64_t m_ResponseId = UINT64_C(0);
323  std::weak_ptr<TrtISServer> m_Server;
327  std::string m_Model;
331  int64_t m_ModelVersion = UINT64_C(1);
336 
341  std::vector<SharedBatchBuf> m_BufOutputs;
345  SharedOptions m_Options;
346 };
347 
351 class TrtServerAllocator
352  : public std::enable_shared_from_this<TrtServerAllocator> {
353 public:
354  using AllocFn = std::function<SharedSysMem(const std::string&, size_t, InferMemType, int64_t)>;
355  using FreeFn = std::function<void(const std::string&, SharedSysMem)>;
356 
364  TrtServerAllocator(AllocFn alloc, FreeFn release);
365 
369  virtual ~TrtServerAllocator() = default;
370 
374  TRITONSERVER_ResponseAllocator* ptr() { return m_Allocator.get(); }
375 
376 private:
392  static TRITONSERVER_Error* ResponseAlloc(
393  TRITONSERVER_ResponseAllocator* allocator, const char* tensorName,
394  size_t bytes, TRITONSERVER_MemoryType preferredMemType,
395  int64_t preferredDevId, void* userP, void** buffer, void** bufferUserP,
396  TRITONSERVER_MemoryType* actualMemType, int64_t* actualMemTypeId);
397 
408  static TRITONSERVER_Error* ResponseRelease(
409  TRITONSERVER_ResponseAllocator* allocator, void* buffer,
410  void* bufferUserP, size_t bytes, TRITONSERVER_MemoryType memType,
411  int64_t devId);
412 
413 private:
414  DISABLE_CLASS_COPY(TrtServerAllocator);
415 
423  AllocFn m_allocFn;
427  FreeFn m_releaseFn;
428 };
429 
430 namespace triton {
431 
435 struct BackendConfig {
439  std::string backend;
443  std::string key;
447  std::string value;
448 };
449 
453 struct RepoSettings {
457  std::set<std::string> roots;
461  uint32_t logLevel = 0;
465  bool tfAllowSoftPlacement = true;
469  float tfGpuMemoryFraction = 0;
473  bool strictModelConfig = true;
477  double minComputeCapacity = TRITON_DEFAULT_MINIMUM_COMPUTE_CAPABILITY;
481  uint64_t pinnedMemBytes = TRITON_DEFAULT_PINNED_MEMORY_BYTES;
485  std::string backendDirectory = TRITON_DEFAULT_BACKEND_DIR;
489  int32_t controlMode = TRITONSERVER_MODEL_CONTROL_EXPLICIT;
494  std::map<uint32_t, uint64_t> cudaDevMemMap;
498  std::vector<BackendConfig> backendConfigs;
499 
503  std::string debugStr;
504 
512  bool initFrom(
513  const ic::TritonModelRepo& repo, const std::vector<int>& devIds);
514 
521  bool operator==(const RepoSettings& other) const;
522  bool operator!=(const RepoSettings& other) const
523  {
524  return !this->operator==(other);
525  }
527 };
528 } // namespace triton
529 
533 class TrtISServer : public std::enable_shared_from_this<TrtISServer> {
534  friend class TrtServerRequest;
535  friend class TrtServerResponse;
536 
537 protected:
542  TrtISServer(const triton::RepoSettings& repo);
543 
554 
558  const triton::RepoSettings& getRepoSettings() { return m_RepoSettings; }
559 
560 public:
564  ~TrtISServer();
565 
580  static TrtServerPtr getInstance(const triton::RepoSettings* repo);
581 
585  bool isServerReady();
586 
590  bool isServerLive();
591 
598  bool isModelReady(const std::string& model, int64_t version);
599 
605  NvDsInferStatus loadModel(const std::string& modelName);
606 
612  NvDsInferStatus unloadModel(const std::string& modelName);
613 
621  NvDsInferStatus getModelConfig(
622  const std::string& model, int64_t version, ni::ModelConfig& config);
623 
634  SharedRequest createRequest(
635  const std::string& model, int64_t version, SharedBatchArray& inputs,
636  const std::vector<std::string>& outputs, const std::vector<TritonClassParams>& clasList);
637 
650  NvDsInferStatus inferAsync(
651  SharedRequest request, WeakTritonAllocator allocator,
652  TritonInferAsyncDone done);
653 
654 private:
665  UniqResponse createResponse(
666  UniqTritonT<TRITONSERVER_InferenceResponse>&& data, uint64_t id);
667 
668  using InferUserData =
669  std::tuple<SharedRequest, TritonInferAsyncDone, TrtISServer*>;
679  static void InferComplete(
680  TRITONSERVER_InferenceResponse* response, const uint32_t flags,
681  void* userp);
682 
686  TRITONSERVER_Server* serverPtr() const { return m_Impl.get(); }
687 
689 
693  static std::weak_ptr<TrtISServer> sTrtServerInstance;
697  static std::mutex sTrtServerMutex;
698 
699 private:
703  UniqTritonT<TRITONSERVER_Server> m_Impl;
707  std::atomic<uint64_t> m_LastRequestId{UINT64_C(0)};
711  triton::RepoSettings m_RepoSettings;
712 };
713 
714 } // namespace nvdsinferserver
715 
716 #endif
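Taken together, the classes in this header implement one flow: a TrtISServer is obtained from triton::RepoSettings, a TrtServerRequest is built per inference call, and the parsed result comes back as a TrtServerResponse through the TritonInferAsyncDone callback. The sketch below illustrates that flow; it is not part of the SDK headers, error handling is reduced to early returns, and the repository path, model name and output tensor name are placeholders. The input batch and the output allocator are assumed to be prepared elsewhere.

// Illustrative usage sketch (not part of this header). Assumes the input
// batch ("inputs") and an output allocator are created elsewhere, and that
// "/opt/models", "my_model" and "output_tensor" are placeholders.
#include "infer_trtis_server.h"

using namespace nvdsinferserver;

NvDsInferStatus runSingleInference(
    SharedBatchArray inputs, ShrTritonAllocator allocator)
{
    // Describe the model repository and acquire the shared server instance.
    triton::RepoSettings repo;
    repo.roots.insert("/opt/models");   // hypothetical repository path
    repo.logLevel = 1;

    TrtServerPtr server = TrtISServer::getInstance(&repo);
    if (!server || !server->isServerLive() || !server->isServerReady())
        return NVDSINFER_UNKNOWN_ERROR;

    // Explicit model control: load the model before it can be used.
    if (server->loadModel("my_model") != NVDSINFER_SUCCESS ||
        !server->isModelReady("my_model", 1))
        return NVDSINFER_UNKNOWN_ERROR;

    // One request per inference call: model, version, inputs, requested
    // output tensors, and (empty) Triton classification parameters.
    SharedRequest request = server->createRequest(
        "my_model", 1, inputs, {"output_tensor"}, {});
    if (!request)
        return NVDSINFER_UNKNOWN_ERROR;

    // Submit asynchronously; the callback runs once Triton has produced the
    // response, delivering the parsed output buffers via TrtServerResponse.
    return server->inferAsync(
        request, allocator,
        [](SharedRequest req, UniqResponse response) {
            if (response && response->getStatus() == NVDSINFER_SUCCESS) {
                for (auto& buf : response->mutableOutputs()) {
                    (void)buf;  // consume output batch buffers here
                }
            }
        });
}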
nvdsinferserver::triton::RepoSettings::minComputeCapacity
double minComputeCapacity
The minimum supported compute capability for the Triton server.
Definition: infer_trtis_server.h:477
nvdsinferserver
Copyright (c) 2021, NVIDIA CORPORATION.
Definition: infer_custom_process.h:28
nvdsinferserver::triton::RepoSettings::initFrom
bool initFrom(const ic::TritonModelRepo &repo, const std::vector< int > &devIds)
Populate the RepoSettings instance with the values from the TritonModelRepo protobuf message.
nvdsinferserver::TrtServerRequest::outputs
const std::vector< std::string > & outputs() const
Get the list of requested output layer names.
Definition: infer_trtis_server.h:156
nvdsinferserver::UniqResponse
std::unique_ptr< TrtServerResponse > UniqResponse
Definition: infer_trtis_server.h:60
TRITON_DEFAULT_MINIMUM_COMPUTE_CAPABILITY
#define TRITON_DEFAULT_MINIMUM_COMPUTE_CAPABILITY
Definition: infer_trtis_server.h:43
nvdsinferserver::TrtISServer::isModelReady
bool isModelReady(const std::string &model, int64_t version)
Check if the server is ready for inference using the specified model.
nvdsinferserver::TrtServerRequest::setRequestComplete
NvDsInferStatus setRequestComplete(TRITONSERVER_InferenceRequestReleaseFn_t requestCompleteCb, void *userPtr)
Set the release callback function for the request.
TritonClassParams
Definition: infer_post_datatypes.h:96
TRITON_DEFAULT_BACKEND_DIR
#define TRITON_DEFAULT_BACKEND_DIR
Definition: infer_trtis_server.h:46
nvdsinferserver::TrtISServer
Wrapper class for creating the Triton Inference Server instance.
Definition: infer_trtis_server.h:533
nvdsinferserver::triton::RepoSettings::roots
std::set< std::string > roots
Set of model repository directories.
Definition: infer_trtis_server.h:457
nvdsinferserver::TrtServerRequest::id
uint64_t id() const
Get the request ID.
Definition: infer_trtis_server.h:143
nvdsinferserver::TrtServerRequest::classParams
const std::map< std::string, TritonClassParams > & classParams() const
Get the Triton classification parameters list (tensor name : classification parameters).
Definition: infer_trtis_server.h:161
nvdsinferserver::triton::RepoSettings::debugStr
std::string debugStr
Debug string of the TritonModelRepo protobuf message.
Definition: infer_trtis_server.h:503
nvdsinferserver::SharedSysMem
std::shared_ptr< SysMem > SharedSysMem
Definition: infer_common.h:88
nvdsinferserver::TrtServerResponse
Wrapper class for Triton output parsing.
Definition: infer_trtis_server.h:219
nvdsinferserver::TrtServerResponse::getStatus
NvDsInferStatus getStatus() const
Check if the response could be parsed correctly.
Definition: infer_trtis_server.h:256
nvdsinferserver::ShrTritonT
std::shared_ptr< T > ShrTritonT
Definition: infer_common.h:117
nvdsinferserver::TrtISServer::getInstance
static TrtServerPtr getInstance(const triton::RepoSettings *repo)
Get a new or existing instance of the Triton Inference Server.
nvdsinferserver::TrtISServer::createRequest
SharedRequest createRequest(const std::string &model, int64_t version, SharedBatchArray &inputs, const std::vector< std::string > &outputs, const std::vector< TritonClassParams > &clasList)
Create and initialize a new inference request.
TRITONSERVER_MemoryType
enum TRITONSERVER_memorytype_enum TRITONSERVER_MemoryType
TRITONSERVER_MemoryType.
nvdsinferserver::triton::RepoSettings::pinnedMemBytes
uint64_t pinnedMemBytes
Pre-allocated pinned memory on host for Triton runtime.
Definition: infer_trtis_server.h:481
nvdsinferserver::triton::RepoSettings::strictModelConfig
bool strictModelConfig
Flag to enable/disable Triton strict model configuration.
Definition: infer_trtis_server.h:473
nvds_version.h
TRITONSERVER_InferenceRequestReleaseFn_t
void(* TRITONSERVER_InferenceRequestReleaseFn_t)(TRITONSERVER_InferenceRequest *request, const uint32_t flags, void *userp)
Type for inference request release callback function.
Definition: tritonserver.h:909
nvdsinferserver::WeakTritonAllocator
std::weak_ptr< TrtServerAllocator > WeakTritonAllocator
Definition: infer_common.h:125
NVDSINFER_SUCCESS
@ NVDSINFER_SUCCESS
NvDsInferContext operation succeeded.
Definition: nvdsinfer.h:220
infer_batch_buffer.h
Header file of batch buffer related class declarations.
nvdsinferserver::TrtServerAllocator
Wrapper class for Triton server output memory allocator.
Definition: infer_trtis_server.h:351
nvdsinferserver::TrtServerResponse::mutableOutputs
std::vector< SharedBatchBuf > & mutableOutputs()
Get the list of output batch buffers.
Definition: infer_trtis_server.h:252
nvdsinferserver::TrtServerRequest::bufId
uint64_t bufId() const
Get the input buffer ID associated with the request.
Definition: infer_trtis_server.h:147
nvdsinferserver::ShrTritonAllocator
std::shared_ptr< TrtServerAllocator > ShrTritonAllocator
Definition: infer_common.h:124
nvdsinferserver::TrtISServer::isServerReady
bool isServerReady()
Check if the server is ready.
TRITON_DEFAULT_PINNED_MEMORY_BYTES
#define TRITON_DEFAULT_PINNED_MEMORY_BYTES
Definition: infer_trtis_server.h:44
nvdsinferserver::triton::RepoSettings::logLevel
uint32_t logLevel
Level of the Triton log output.
Definition: infer_trtis_server.h:461
TRITONSERVER_InferenceResponseCompleteFn_t
void(* TRITONSERVER_InferenceResponseCompleteFn_t)(TRITONSERVER_InferenceResponse *response, const uint32_t flags, void *userp)
Type for callback function indicating that an inference response has completed.
Definition: tritonserver.h:929
nvdsinferserver::TrtServerPtr
std::shared_ptr< TrtISServer > TrtServerPtr
Definition: infer_common.h:121
nvdsinferserver::TrtISServer::initialize
NvDsInferStatus initialize()
Create a new instance of the Triton Inference Server.
nvdsinferserver::triton::BackendConfig::backend
std::string backend
Name of the backend.
Definition: infer_trtis_server.h:439
nvdsinferserver::TrtISServer::getModelConfig
NvDsInferStatus getModelConfig(const std::string &model, int64_t version, ni::ModelConfig &config)
Get the model configuration for the specified model.
nvdsinferserver::triton::RepoSettings
Model repository settings for the Triton Inference Server.
Definition: infer_trtis_server.h:453
nvdsinferserver::TrtServerAllocator::ptr
TRITONSERVER_ResponseAllocator * ptr()
Get the pointer to the TRITONSERVER_ResponseAllocator instance.
Definition: infer_trtis_server.h:374
nvdsinferserver::triton::BackendConfig::key
std::string key
Name of the setting.
Definition: infer_trtis_server.h:443
nvdsinferserver::TrtServerRequest::ptr
TRITONSERVER_InferenceRequest * ptr()
Get the pointer to the Triton inference request object.
Definition: infer_trtis_server.h:135
nvdsinferserver::SharedResponse
std::shared_ptr< TrtServerResponse > SharedResponse
Definition: infer_trtis_server.h:61
nvdsinferserver::TrtISServer::TrtISServer
TrtISServer(const triton::RepoSettings &repo)
Constructor.
nvdsinferserver::UniqTritonT
std::unique_ptr< T, std::function< void(T *)> > UniqTritonT
Definition: infer_common.h:114
nvdsinferserver::triton::RepoSettings::tfGpuMemoryFraction
float tfGpuMemoryFraction
TensorFlow GPU memory fraction per process.
Definition: infer_trtis_server.h:469
nvdsinferserver::triton::RepoSettings::controlMode
int32_t controlMode
Triton model control mode.
Definition: infer_trtis_server.h:489
nvdsinferserver::TritonInferAsyncDone
std::function< void(SharedRequest, UniqResponse)> TritonInferAsyncDone
Definition: infer_trtis_server.h:63
nvdsinferserver::TrtServerResponse::takeoverOptions
SharedOptions takeoverOptions()
Get and own the options list.
Definition: infer_trtis_server.h:260
nvdsinferserver::triton::BackendConfig::value
std::string value
Value of the setting.
Definition: infer_trtis_server.h:447
TRITONSERVER_MODEL_CONTROL_EXPLICIT
@ TRITONSERVER_MODEL_CONTROL_EXPLICIT
Definition: tritonserver.h:1539
nvdsinferserver::TrtServerRequest::TrtServerRequest
TrtServerRequest(TrtServerPtr server)
Constructor.
nvdsinferserver::SharedOptions
std::shared_ptr< IOptions > SharedOptions
Definition: infer_common.h:73
infer_proto_utils.h
nvdsinferserver::TrtServerRequest::init
NvDsInferStatus init(const std::string &model, int64_t version, SharedBatchArray &inputs, const std::vector< std::string > &outputs, uint64_t reqId, const std::vector< TritonClassParams > &clasList)
Create a new Triton inference request with the specified inputs and parameters.
nvdsinferserver::TrtServerRequest::RequestOnRelease
static void RequestOnRelease(TRITONSERVER_InferenceRequest *request, const uint32_t flags, void *userp)
The callback function to release the request instance.
nvdsinferserver::TrtServerResponse::model
const std::string & model() const
Get the model name parsed from the Triton response.
Definition: infer_trtis_server.h:248
nvdsinferserver::TrtServerAllocator::~TrtServerAllocator
virtual ~TrtServerAllocator()=default
Destructor.
nvdsinferserver::triton::RepoSettings::tfAllowSoftPlacement
bool tfAllowSoftPlacement
Flag to enable/disable soft placement of TF operators.
Definition: infer_trtis_server.h:465
nvdsinferserver::TrtServerAllocator::AllocFn
std::function< SharedSysMem(const std::string &, size_t, InferMemType, int64_t)> AllocFn
Definition: infer_trtis_server.h:354
nvdsinferserver::TrtServerRequest::~TrtServerRequest
~TrtServerRequest()
Destructor.
nvdsinferserver::triton::BackendConfig
The backend configuration settings.
Definition: infer_trtis_server.h:435
nvdsinferserver::TrtServerAllocator::FreeFn
std::function< void(const std::string &, SharedSysMem)> FreeFn
Definition: infer_trtis_server.h:355
infer_post_datatypes.h
nvdsinferserver::TrtISServer::isServerLive
bool isServerLive()
Check if the server is live.
nvdsinferserver::triton::RepoSettings::backendDirectory
std::string backendDirectory
The path to the Triton backends directory.
Definition: infer_trtis_server.h:485
nvdsinferserver::TrtServerResponse::parse
NvDsInferStatus parse(const TrtServerRequest *req)
Check for error and parse the inference output.
nvdsinferserver::TrtServerRequest::setResponseComplete
NvDsInferStatus setResponseComplete(ShrTritonAllocator &allocator, TRITONSERVER_InferenceResponseCompleteFn_t responseCompleteCb, void *responseUserPtr)
Set the allocator and response callback for the request.
nvdsinferserver::triton::RepoSettings::backendConfigs
std::vector< BackendConfig > backendConfigs
Array of backend configuration settings.
Definition: infer_trtis_server.h:498
nvdsinferserver::IOptions
Definition: infer_ioptions.h:57
nvdsinferserver::triton::RepoSettings::operator!=
bool operator!=(const RepoSettings &other) const
Definition: infer_trtis_server.h:522
nvdsinferserver::InferMemType
InferMemType
The memory types of inference buffers.
Definition: infer_datatypes.h:56
nvdsinferserver::TrtISServer::loadModel
NvDsInferStatus loadModel(const std::string &modelName)
Load or reload the specified model.
nvdsinferserver::triton::RepoSettings::operator==
bool operator==(const RepoSettings &other) const
Comparison operators.
nvdsinferserver::TrtISServer::~TrtISServer
~TrtISServer()
Destructor.
nvdsinferserver::TrtServerRequest::model
const std::string & model() const
Get the model name.
Definition: infer_trtis_server.h:139
nvdsinferserver::TrtServerResponse::TrtServerResponse
TrtServerResponse(TrtServerPtr server, UniqTritonT< TRITONSERVER_InferenceResponse > data, uint64_t id)
Constructor.
nvdsinferserver::TrtServerRequest
Wrapper class for Triton inference request.
Definition: infer_trtis_server.h:68
DISABLE_CLASS_COPY
#define DISABLE_CLASS_COPY(NoCopyClass)
Copyright (c) 2020, NVIDIA CORPORATION.
Definition: infer_defines.h:30
nvdsinferserver::SharedRequest
std::shared_ptr< TrtServerRequest > SharedRequest
Definition: infer_trtis_server.h:59
nvdsinferserver::SharedBatchArray
std::shared_ptr< BaseBatchArray > SharedBatchArray
Definition: infer_common.h:75
nvdsinferserver::TrtISServer::unloadModel
NvDsInferStatus unloadModel(const std::string &modelName)
Unload the specified model.
infer_trtis_utils.h
Triton Inference Server utilities header file.
nvdsinferserver::TrtISServer::inferAsync
NvDsInferStatus inferAsync(SharedRequest request, WeakTritonAllocator allocator, TritonInferAsyncDone done)
Submit a request for asynchronous inference.
nvdsinferserver::TrtISServer::getRepoSettings
const triton::RepoSettings & getRepoSettings()
Get the model repository settings.
Definition: infer_trtis_server.h:558
nvdsinferserver::TrtServerRequest::releaseInputs
SharedBatchArray releaseInputs()
Release ownership of the input batch buffer array.
Definition: infer_trtis_server.h:152
nvdsinferserver::TrtServerAllocator::TrtServerAllocator
TrtServerAllocator(AllocFn alloc, FreeFn release)
Constructor, create an instance of the type TRITONSERVER_ResponseAllocator which calls provided alloc...
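For reference, an output allocator is built from two callables matching AllocFn and FreeFn. The fragment below is an illustrative sketch only (written as if inside namespace nvdsinferserver); acquireHostBuffer and returnHostBuffer are hypothetical stand-ins for whatever SysMem pool the application actually provides.

// Illustrative sketch of allocator callbacks (not SDK code).
SharedSysMem acquireHostBuffer(size_t bytes);   // hypothetical helper
void returnHostBuffer(SharedSysMem mem);        // hypothetical helper

auto outputAllocator = std::make_shared<TrtServerAllocator>(
    // AllocFn: called per output tensor when Triton needs a buffer. This
    // simplified version ignores the preferred memory type and device.
    [](const std::string& /*tensor*/, size_t bytes, InferMemType /*memType*/,
        int64_t /*devId*/) -> SharedSysMem { return acquireHostBuffer(bytes); },
    // FreeFn: called when Triton releases the output buffer.
    [](const std::string& /*tensor*/, SharedSysMem mem) {
        returnHostBuffer(std::move(mem));
    });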
NvDsInferStatus
NvDsInferStatus
Enum for the status codes returned by NvDsInferContext.
Definition: nvdsinfer.h:218
nvdsinferserver::InferBufferDescription
Holds the information about an inference buffer.
Definition: infer_datatypes.h:168
nvdsinferserver::triton::RepoSettings::cudaDevMemMap
std::map< uint32_t, uint64_t > cudaDevMemMap
Map of device IDs to the corresponding size of the CUDA memory pool to be allocated.
Definition: infer_trtis_server.h:494