NVIDIA DeepStream SDK API Reference

6.2 Release
infer_trtis_server.h
1 /*
2  * SPDX-FileCopyrightText: Copyright (c) 2020-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3  * SPDX-License-Identifier: LicenseRef-NvidiaProprietary
4  *
5  * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
6  * property and proprietary rights in and to this material, related
7  * documentation and any modifications thereto. Any use, reproduction,
8  * disclosure or distribution of this material and related documentation
9  * without an express license agreement from NVIDIA CORPORATION or
10  * its affiliates is strictly prohibited.
11  */
12 
24 #ifndef __NVDSINFER_TRTIS_SERVER_H__
25 #define __NVDSINFER_TRTIS_SERVER_H__
26 
27 #include <string>
28 #include <thread>
29 #include <vector>
30 #include <memory>
31 
32 #include "infer_batch_buffer.h"
33 #include "infer_post_datatypes.h"
34 #include "infer_proto_utils.h"
35 #include "infer_trtis_utils.h"
36 
37 #include "nvds_version.h"
38 
39 #ifdef IS_TEGRA
40 #define TRITON_DEFAULT_MINIMUM_COMPUTE_CAPABILITY 5.3
41 #define TRITON_DEFAULT_PINNED_MEMORY_BYTES (1 << 26)
42 #define TRITON_DEFAULT_BACKEND_DIR GetTritonBackendDir()
43 #else
44 #define TRITON_DEFAULT_MINIMUM_COMPUTE_CAPABILITY 6.0
45 #define TRITON_DEFAULT_PINNED_MEMORY_BYTES (1 << 28)
46 #define TRITON_DEFAULT_BACKEND_DIR "/opt/tritonserver/backends"
47 #endif
48 
49 struct TRITONSERVER_Server;
50 
51 namespace nvdsinferserver {
52 
53 namespace ni = inference;
54 
55 class TrtISServer;
56 class TrtServerRequest;
57 class TrtServerResponse;
58 class TrtServerAllocator;
59 
60 using SharedRequest = std::shared_ptr<TrtServerRequest>;
61 using UniqResponse = std::unique_ptr<TrtServerResponse>;
62 using SharedResponse = std::shared_ptr<TrtServerResponse>;
63 
64 using TritonInferAsyncDone = std::function<void(SharedRequest, UniqResponse)>;
65 
69 class TrtServerRequest {
70 protected:
71  friend class TrtISServer;
78 
90  NvDsInferStatus init(
91  const std::string& model, int64_t version, SharedBatchArray& inputs,
92  const std::vector<std::string>& outputs, uint64_t reqId,
93  const std::vector<TritonClassParams>& clasList);
94 
101  NvDsInferStatus setRequestComplete(
102  TRITONSERVER_InferenceRequestReleaseFn_t requestCompleteCb,
103  void* userPtr);
104 
112  NvDsInferStatus setResponseComplete(
113  ShrTritonAllocator& allocator,
114  TRITONSERVER_InferenceResponseCompleteFn_t responseCompleteCb,
115  void* responseUserPtr);
116 
123  static void RequestOnRelease(
124  TRITONSERVER_InferenceRequest* request, const uint32_t flags,
125  void* userp);
126 
127 public:
132 
136  TRITONSERVER_InferenceRequest* ptr() { return m_ReqPtr.get(); }
140  const std::string& model() const { return m_Model; }
144  uint64_t id() const { return m_ReqId; }
148  uint64_t bufId() const { return m_BufId; }
153  SharedBatchArray releaseInputs() { return std::move(m_Inputs); }
157  const std::vector<std::string>& outputs() const { return m_Outputs; }
162  const std::map<std::string, TritonClassParams>& classParams() const {
163  return m_ClasList; }
164 
165 private:
172  NvDsInferStatus setInputs(SharedBatchArray& inputs);
177  NvDsInferStatus setOption(const IOptions* option);
178 
180 
181 private:
189  std::weak_ptr<TrtISServer> m_Server;
193  SharedBatchArray m_Inputs;
197  std::string m_Model;
201  uint64_t m_ReqId = UINT64_C(0);
205  uint64_t m_BufId = UINT64_C(0);
206 
210  std::vector<std::string> m_Outputs;
214  std::map<std::string, TritonClassParams> m_ClasList;
215 };
216 
220 class TrtServerResponse {
221  friend class TrtISServer;
222 
223 protected:
230  TrtServerResponse(
231  TrtServerPtr server, UniqTritonT<TRITONSERVER_InferenceResponse> data,
232  uint64_t id);
233 
234 public:
249  const std::string& model() const { return m_Model; }
253  std::vector<SharedBatchBuf>& mutableOutputs() { return m_BufOutputs; }
257  NvDsInferStatus getStatus() const { return m_Status; }
261  SharedOptions takeoverOptions() { return std::move(m_Options); }
262 
263 private:
267  NvDsInferStatus parseParams();
268 
281  NvDsInferStatus parseOutputData(const TrtServerRequest* req);
282 
293  NvDsInferStatus addClass(
294  const TritonClassParams& classP, const InferBufferDescription& desc, uint32_t batchSize,
295  uint32_t tensorIdx, const void* base);
296 
307  NvDsInferStatus topKClass(
308  InferClassificationOutput& ret, const TritonClassParams& classP,
309  const InferBufferDescription& desc, uint32_t tensorIdx, const void* base);
310 
312 
316  uint64_t m_ResponseId = UINT64_C(0);
324  std::weak_ptr<TrtISServer> m_Server;
328  std::string m_Model;
332  int64_t m_ModelVersion = UINT64_C(1);
337 
342  std::vector<SharedBatchBuf> m_BufOutputs;
346  SharedOptions m_Options;
347 };
348 
352 class TrtServerAllocator
353  : public std::enable_shared_from_this<TrtServerAllocator> {
354 public:
355  using AllocFn = std::function<SharedSysMem(const std::string&, size_t, InferMemType, int64_t)>;
356  using FreeFn = std::function<void(const std::string&, SharedSysMem)>;
357 
365  TrtServerAllocator(AllocFn alloc, FreeFn release);
366 
370  virtual ~TrtServerAllocator() = default;
371 
375  TRITONSERVER_ResponseAllocator* ptr() { return m_Allocator.get(); }
376 
377 private:
393  static TRITONSERVER_Error* ResponseAlloc(
394  TRITONSERVER_ResponseAllocator* allocator, const char* tensorName,
395  size_t bytes, TRITONSERVER_MemoryType preferredMemType,
396  int64_t preferredDevId, void* userP, void** buffer, void** bufferUserP,
397  TRITONSERVER_MemoryType* actualMemType, int64_t* actualMemTypeId);
398 
409  static TRITONSERVER_Error* ResponseRelease(
410  TRITONSERVER_ResponseAllocator* allocator, void* buffer,
411  void* bufferUserP, size_t bytes, TRITONSERVER_MemoryType memType,
412  int64_t devId);
413 
414 private:
415  DISABLE_CLASS_COPY(TrtServerAllocator);
416 
424  AllocFn m_allocFn;
428  FreeFn m_releaseFn;
429 };
430 
431 namespace triton {
432 
433 #ifdef IS_TEGRA
434 static inline const char*
435 GetTritonBackendDir()
436 {
437  static char dirBuf[256];
438  snprintf(
439  dirBuf, sizeof(dirBuf),
440  "/opt/nvidia/deepstream/deepstream-%d.%d/lib/triton_backends",
441  NVDS_VERSION_MAJOR, NVDS_VERSION_MINOR);
442  return dirBuf;
443 }
444 #endif
445 
449 struct BackendConfig {
453  std::string backend;
457  std::string key;
461  std::string value;
462 };
463 
467 struct RepoSettings {
471  std::set<std::string> roots;
475  uint32_t logLevel = 0;
479  bool tfAllowSoftPlacement = true;
483  float tfGpuMemoryFraction = 0;
487  bool strictModelConfig = true;
491  double minComputeCapacity = TRITON_DEFAULT_MINIMUM_COMPUTE_CAPABILITY;
495  uint64_t pinnedMemBytes = TRITON_DEFAULT_PINNED_MEMORY_BYTES;
499  std::string backendDirectory = TRITON_DEFAULT_BACKEND_DIR;
503  int32_t controlMode = (int32_t)TRITONSERVER_MODEL_CONTROL_EXPLICIT;
508  std::map<uint32_t, uint64_t> cudaDevMemMap;
512  std::vector<BackendConfig> backendConfigs;
513 
517  std::string debugStr;
518 
526  bool initFrom(
527  const ic::TritonModelRepo& repo, const std::vector<int>& devIds);
528 
535  bool operator==(const RepoSettings& other) const;
536  bool operator!=(const RepoSettings& other) const
537  {
538  return !this->operator==(other);
539  }
541 };
542 } // namespace triton
543 
547 class TrtISServer : public std::enable_shared_from_this<TrtISServer> {
548  friend class TrtServerRequest;
549  friend class TrtServerResponse;
550 
551 protected:
556  TrtISServer(const triton::RepoSettings& repo);
557 
568 
572  const triton::RepoSettings& getRepoSettings() { return m_RepoSettings; }
573 
574 public:
578  ~TrtISServer();
579 
594  static TrtServerPtr getInstance(const triton::RepoSettings* repo);
595 
599  bool isServerReady();
600 
604  bool isServerLive();
605 
612  bool isModelReady(const std::string& model, int64_t version);
613 
619  NvDsInferStatus loadModel(const std::string& modelName);
620 
626  NvDsInferStatus unloadModel(const std::string& modelName);
627 
635  NvDsInferStatus getModelConfig(
636  const std::string& model, int64_t version, ni::ModelConfig& config);
637 
648  SharedRequest createRequest(
649  const std::string& model, int64_t version, SharedBatchArray& inputs,
650  const std::vector<std::string>& outputs, const std::vector<TritonClassParams>& clasList);
651 
664  NvDsInferStatus inferAsync(
665  SharedRequest request, WeakTritonAllocator allocator,
666  TritonInferAsyncDone done);
667 
668 private:
679  UniqResponse createResponse(
680  UniqTritonT<TRITONSERVER_InferenceResponse>&& data, uint64_t id);
681 
682  using InferUserData =
683  std::tuple<SharedRequest, TritonInferAsyncDone, TrtISServer*>;
693  static void InferComplete(
694  TRITONSERVER_InferenceResponse* response, const uint32_t flags,
695  void* userp);
696 
700  TRITONSERVER_Server* serverPtr() const { return m_Impl.get(); }
701 
703 
707  static std::weak_ptr<TrtISServer> sTrtServerInstance;
711  static std::mutex sTrtServerMutex;
712 
713 private:
717  UniqTritonT<TRITONSERVER_Server> m_Impl;
721  std::atomic<uint64_t> m_LastRequestId{UINT64_C(0)};
725  triton::RepoSettings m_RepoSettings;
726 };
727 
728 } // namespace nvdsinferserver
729 
730 #endif
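
The classes above are typically used together: the application obtains the TrtISServer singleton from a triton::RepoSettings instance, builds a request per inference call, and receives results through a TritonInferAsyncDone functor. Below is a minimal, hedged sketch of that flow, assuming infer_trtis_server.h and its dependencies are on the include path; the input batch, the response allocator, the model name and the output tensor name are placeholders supplied by the surrounding application.

    // Hedged usage sketch of the wrappers declared in infer_trtis_server.h.
    #include "infer_trtis_server.h"

    using namespace nvdsinferserver;

    // "inputs" and "allocator" are assumed to be prepared elsewhere;
    // "my_model" and "output" are placeholder names.
    NvDsInferStatus runOnce(SharedBatchArray inputs, ShrTritonAllocator allocator)
    {
        triton::RepoSettings settings;
        settings.roots.insert("/opt/models");    // hypothetical model repository
        settings.logLevel = 1;

        TrtServerPtr server = TrtISServer::getInstance(&settings);
        if (!server || !server->isServerLive() || !server->isServerReady())
            return NVDSINFER_UNKNOWN_ERROR;      // placeholder error code

        const std::string model = "my_model";
        if (server->loadModel(model) != NVDSINFER_SUCCESS ||
            !server->isModelReady(model, 1 /*version*/))
            return NVDSINFER_UNKNOWN_ERROR;

        SharedRequest req = server->createRequest(
            model, 1, inputs, {"output"}, {} /* no classification params */);
        if (!req)
            return NVDSINFER_UNKNOWN_ERROR;

        return server->inferAsync(req, allocator,
            [](SharedRequest r, UniqResponse resp) {
                // Completion functor; see the TritonInferAsyncDone sketch
                // further down for output handling.
                (void)r; (void)resp;
            });
    }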
nvdsinferserver::triton::RepoSettings::minComputeCapacity
double minComputeCapacity
The minimum supported compute capability for the Triton server.
Definition: infer_trtis_server.h:491
nvdsinferserver
Copyright (c) 2021, NVIDIA CORPORATION.
Definition: infer_custom_process.h:23
nvdsinferserver::triton::RepoSettings::initFrom
bool initFrom(const ic::TritonModelRepo &repo, const std::vector< int > &devIds)
Populate the RepoSettings instance with the values from the TritonModelRepo protobuf message.
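When the TritonModelRepo protobuf message is not at hand, the same settings can also be assigned directly through the fields declared in this header. A small hedged sketch, assuming the Triton headers are included so that TRITONSERVER_MODEL_CONTROL_EXPLICIT is visible; the paths, sizes and the backend key/value pair are illustrative only.

    // Hedged sketch: fill RepoSettings by hand instead of via initFrom().
    nvdsinferserver::triton::RepoSettings makeSettings()
    {
        nvdsinferserver::triton::RepoSettings settings;
        settings.roots = {"/opt/models/repo_a", "/opt/models/repo_b"};  // illustrative paths
        settings.logLevel = 2;                          // Triton log verbosity
        settings.strictModelConfig = false;             // let Triton auto-complete configs
        settings.controlMode = (int32_t)TRITONSERVER_MODEL_CONTROL_EXPLICIT;
        settings.cudaDevMemMap[0] = 64ULL << 20;        // 64 MB CUDA pool on GPU 0
        settings.backendConfigs.push_back({"tensorflow", "version", "2"});  // illustrative
        return settings;
    }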
nvdsinferserver::TrtServerRequest::outputs
const std::vector< std::string > & outputs() const
Get the list of requested output layer names.
Definition: infer_trtis_server.h:157
nvdsinferserver::UniqResponse
std::unique_ptr< TrtServerResponse > UniqResponse
Definition: infer_trtis_server.h:61
TRITON_DEFAULT_MINIMUM_COMPUTE_CAPABILITY
#define TRITON_DEFAULT_MINIMUM_COMPUTE_CAPABILITY
Definition: infer_trtis_server.h:44
nvdsinferserver::TrtISServer::isModelReady
bool isModelReady(const std::string &model, int64_t version)
Check if the server is ready for inference using the specified model.
nvdsinferserver::TrtServerRequest::setRequestComplete
NvDsInferStatus setRequestComplete(TRITONSERVER_InferenceRequestReleaseFn_t requestCompleteCb, void *userPtr)
Set the release callback function for the request.
TritonClassParams
Definition: infer_post_datatypes.h:90
TRITON_DEFAULT_BACKEND_DIR
#define TRITON_DEFAULT_BACKEND_DIR
Definition: infer_trtis_server.h:46
nvdsinferserver::TrtISServer
Wrapper class for creating a Triton Inference Server instance.
Definition: infer_trtis_server.h:547
nvdsinferserver::triton::RepoSettings::roots
std::set< std::string > roots
Set of model repository directories.
Definition: infer_trtis_server.h:471
nvdsinferserver::TrtServerRequest::id
uint64_t id() const
Get the request ID.
Definition: infer_trtis_server.h:144
nvdsinferserver::TrtServerRequest::classParams
const std::map< std::string, TritonClassParams > & classParams() const
Get the Triton classification parameters list (tensor name : classification parameters).
Definition: infer_trtis_server.h:162
nvdsinferserver::triton::RepoSettings::debugStr
std::string debugStr
Debug string of the TritonModelRepo protobuf message.
Definition: infer_trtis_server.h:517
nvdsinferserver::SharedSysMem
std::shared_ptr< SysMem > SharedSysMem
Definition: infer_common.h:88
nvdsinferserver::TrtServerResponse
Wrapper class for Triton output parsing.
Definition: infer_trtis_server.h:220
nvdsinferserver::TrtServerResponse::getStatus
NvDsInferStatus getStatus() const
Check if the response could be parsed correctly.
Definition: infer_trtis_server.h:257
nvdsinferserver::ShrTritonT
std::shared_ptr< T > ShrTritonT
Definition: infer_common.h:117
nvdsinferserver::TrtISServer::getInstance
static TrtServerPtr getInstance(const triton::RepoSettings *repo)
Get a new or existing instance of the Triton Inference Server.
nvdsinferserver::TrtISServer::createRequest
SharedRequest createRequest(const std::string &model, int64_t version, SharedBatchArray &inputs, const std::vector< std::string > &outputs, const std::vector< TritonClassParams > &clasList)
Create and initialize a new inference request.
nvdsinferserver::triton::RepoSettings::pinnedMemBytes
uint64_t pinnedMemBytes
Pre-allocated pinned memory on host for Triton runtime.
Definition: infer_trtis_server.h:495
nvdsinferserver::triton::RepoSettings::strictModelConfig
bool strictModelConfig
Flag to enable/disable Triton strict model configuration.
Definition: infer_trtis_server.h:487
nvds_version.h
NVDS_VERSION_MAJOR
#define NVDS_VERSION_MAJOR
Definition: nvds_version.h:32
nvdsinferserver::WeakTritonAllocator
std::weak_ptr< TrtServerAllocator > WeakTritonAllocator
Definition: infer_common.h:125
infer_batch_buffer.h
Header file of batch buffer related class declarations.
nvdsinferserver::TrtServerAllocator
Wrapper class for Triton server output memory allocator.
Definition: infer_trtis_server.h:352
nvdsinferserver::TrtServerResponse::mutableOutputs
std::vector< SharedBatchBuf > & mutableOutputs()
Get the list of output batch buffers.
Definition: infer_trtis_server.h:253
nvdsinferserver::TrtServerRequest::bufId
uint64_t bufId() const
Get the input buffer ID associated with the request.
Definition: infer_trtis_server.h:148
nvdsinferserver::ShrTritonAllocator
std::shared_ptr< TrtServerAllocator > ShrTritonAllocator
Definition: infer_common.h:124
NVDS_VERSION_MINOR
#define NVDS_VERSION_MINOR
Definition: nvds_version.h:33
nvdsinferserver::TrtISServer::isServerReady
bool isServerReady()
Check if the server is ready.
TRITON_DEFAULT_PINNED_MEMORY_BYTES
#define TRITON_DEFAULT_PINNED_MEMORY_BYTES
Definition: infer_trtis_server.h:45
nvdsinferserver::triton::RepoSettings::logLevel
uint32_t logLevel
Level of the Triton log output.
Definition: infer_trtis_server.h:475
nvdsinferserver::TrtServerPtr
std::shared_ptr< TrtISServer > TrtServerPtr
Definition: infer_common.h:121
nvdsinferserver::TrtISServer::initialize
NvDsInferStatus initialize()
Create a new instance of the Triton Inference Server.
nvdsinferserver::triton::BackendConfig::backend
std::string backend
Name of the backend.
Definition: infer_trtis_server.h:453
nvdsinferserver::TrtISServer::getModelConfig
NvDsInferStatus getModelConfig(const std::string &model, int64_t version, ni::ModelConfig &config)
Get the model configuration for the specified model.
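For reference, a hedged sketch of querying a loaded model's configuration; ni is the protobuf namespace alias declared in this header, and max_batch_size() is a standard field of Triton's ModelConfig message.

    // Hedged sketch: read the Triton model configuration of a loaded model.
    void queryConfig(nvdsinferserver::TrtServerPtr server)
    {
        nvdsinferserver::ni::ModelConfig config;
        if (server->getModelConfig("my_model" /*placeholder*/, 1, config) ==
            NVDSINFER_SUCCESS) {
            int maxBatch = config.max_batch_size();  // from Triton's model_config proto
            (void)maxBatch;
        }
    }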
nvdsinferserver::triton::RepoSettings
Model repository settings for the Triton Inference Server.
Definition: infer_trtis_server.h:467
nvdsinferserver::TrtServerAllocator::ptr
TRITONSERVER_ResponseAllocator * ptr()
Get the pointer to the TRITONSERVER_ResponseAllocator instance.
Definition: infer_trtis_server.h:375
nvdsinferserver::triton::BackendConfig::key
std::string key
Name of the setting.
Definition: infer_trtis_server.h:457
nvdsinferserver::TrtServerRequest::ptr
TRITONSERVER_InferenceRequest * ptr()
Get the pointer to the Triton inference request object.
Definition: infer_trtis_server.h:136
nvdsinferserver::SharedResponse
std::shared_ptr< TrtServerResponse > SharedResponse
Definition: infer_trtis_server.h:62
nvdsinferserver::TrtISServer::TrtISServer
TrtISServer(const triton::RepoSettings &repo)
Constructor.
nvdsinferserver::UniqTritonT
std::unique_ptr< T, std::function< void(T *)> > UniqTritonT
Definition: infer_common.h:114
NVDSINFER_SUCCESS
@ NVDSINFER_SUCCESS
NvDsInferContext operation succeeded.
Definition: nvdsinfer.h:219
nvdsinferserver::triton::RepoSettings::tfGpuMemoryFraction
float tfGpuMemoryFraction
TensorFlow GPU memory fraction per process.
Definition: infer_trtis_server.h:483
nvdsinferserver::triton::RepoSettings::controlMode
int32_t controlMode
Triton model control mode.
Definition: infer_trtis_server.h:503
nvdsinferserver::TritonInferAsyncDone
std::function< void(SharedRequest, UniqResponse)> TritonInferAsyncDone
Definition: infer_trtis_server.h:64
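A hedged sketch of a completion functor matching this signature; whether parse() still needs to be invoked on the response at this point is an assumption about the surrounding backend code, so it is done explicitly here.

    // Hedged sketch of a TritonInferAsyncDone callback.
    using namespace nvdsinferserver;

    TritonInferAsyncDone done = [](SharedRequest req, UniqResponse resp) {
        if (!resp || resp->getStatus() != NVDSINFER_SUCCESS)
            return;
        // Assumption: the response has not been parsed against the request yet.
        if (resp->parse(req.get()) != NVDSINFER_SUCCESS)
            return;
        for (auto& buf : resp->mutableOutputs()) {
            (void)buf;  // each entry is a SharedBatchBuf holding one output tensor batch
        }
    };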
nvdsinferserver::TrtServerResponse::takeoverOptions
SharedOptions takeoverOptions()
Take over ownership of the options list.
Definition: infer_trtis_server.h:261
nvdsinferserver::triton::BackendConfig::value
std::string value
Value of the setting.
Definition: infer_trtis_server.h:461
nvdsinferserver::TrtServerRequest::TrtServerRequest
TrtServerRequest(TrtServerPtr server)
Constructor.
nvdsinferserver::SharedOptions
std::shared_ptr< IOptions > SharedOptions
Definition: infer_common.h:73
infer_proto_utils.h
nvdsinferserver::TrtServerRequest::init
NvDsInferStatus init(const std::string &model, int64_t version, SharedBatchArray &inputs, const std::vector< std::string > &outputs, uint64_t reqId, const std::vector< TritonClassParams > &clasList)
Create a new Triton inference request with the specified inputs and parameters.
nvdsinferserver::TrtServerRequest::RequestOnRelease
static void RequestOnRelease(TRITONSERVER_InferenceRequest *request, const uint32_t flags, void *userp)
The callback function to release the request instance.
nvdsinferserver::TrtServerResponse::model
const std::string & model() const
Get the model name parsed from the Triton response.
Definition: infer_trtis_server.h:249
nvdsinferserver::TrtServerAllocator::~TrtServerAllocator
virtual ~TrtServerAllocator()=default
Destructor.
nvdsinferserver::triton::RepoSettings::tfAllowSoftPlacement
bool tfAllowSoftPlacement
Flag to enable/disable soft placement of TF operators.
Definition: infer_trtis_server.h:479
nvdsinferserver::TrtServerAllocator::AllocFn
std::function< SharedSysMem(const std::string &, size_t, InferMemType, int64_t)> AllocFn
Definition: infer_trtis_server.h:355
nvdsinferserver::TrtServerRequest::~TrtServerRequest
~TrtServerRequest()
Destructor.
nvdsinferserver::triton::BackendConfig
The backend configuration settings.
Definition: infer_trtis_server.h:449
nvdsinferserver::TrtServerAllocator::FreeFn
std::function< void(const std::string &, SharedSysMem)> FreeFn
Definition: infer_trtis_server.h:356
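A hedged sketch of wiring output-buffer callbacks into TrtServerAllocator; the lambda bodies are stubs, since a real AllocFn would have to return host or CUDA memory matching the requested type, size and device.

    // Hedged sketch: build a response allocator from two callbacks.
    using namespace nvdsinferserver;

    TrtServerAllocator::AllocFn allocFn =
        [](const std::string& tensor, size_t bytes, InferMemType memType,
           int64_t devId) -> SharedSysMem {
            // A real callback would allocate `bytes` of memory of type memType
            // on device devId for this tensor; nullptr keeps the stub
            // self-contained.
            (void)tensor; (void)bytes; (void)memType; (void)devId;
            return nullptr;
        };
    TrtServerAllocator::FreeFn freeFn =
        [](const std::string& tensor, SharedSysMem mem) {
            (void)tensor;
            mem.reset();  // drop the reference; a real callback would recycle the buffer
        };
    ShrTritonAllocator allocator =
        std::make_shared<TrtServerAllocator>(allocFn, freeFn);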
infer_post_datatypes.h
nvdsinferserver::TrtISServer::isServerLive
bool isServerLive()
Check if the server is live.
nvdsinferserver::triton::RepoSettings::backendDirectory
std::string backendDirectory
The path to the Triton backends directory.
Definition: infer_trtis_server.h:499
nvdsinferserver::TrtServerResponse::parse
NvDsInferStatus parse(const TrtServerRequest *req)
Check for error and parse the inference output.
nvdsinferserver::TrtServerRequest::setResponseComplete
NvDsInferStatus setResponseComplete(ShrTritonAllocator &allocator, TRITONSERVER_InferenceResponseCompleteFn_t responseCompleteCb, void *responseUserPtr)
Set the allocator and response callback for the request.
nvdsinferserver::triton::RepoSettings::backendConfigs
std::vector< BackendConfig > backendConfigs
Array of backend configuration settings.
Definition: infer_trtis_server.h:512
nvdsinferserver::IOptions
Definition: infer_ioptions.h:53
nvdsinferserver::triton::RepoSettings::operator!=
bool operator!=(const RepoSettings &other) const
Definition: infer_trtis_server.h:536
nvdsinferserver::InferMemType
InferMemType
The memory types of inference buffers.
Definition: infer_datatypes.h:56
DISABLE_CLASS_COPY
#define DISABLE_CLASS_COPY(NoCopyClass)
Copyright (c) 2020, NVIDIA CORPORATION.
Definition: infer_defines.h:25
nvdsinferserver::TrtISServer::loadModel
NvDsInferStatus loadModel(const std::string &modelName)
Load or reload the specified model.
nvdsinferserver::triton::RepoSettings::operator==
bool operator==(const RepoSettings &other) const
Comparison operators.
nvdsinferserver::TrtISServer::~TrtISServer
~TrtISServer()
Destructor.
nvdsinferserver::TrtServerRequest::model
const std::string & model() const
Get the model name.
Definition: infer_trtis_server.h:140
nvdsinferserver::TrtServerResponse::TrtServerResponse
TrtServerResponse(TrtServerPtr server, UniqTritonT< TRITONSERVER_InferenceResponse > data, uint64_t id)
Constructor.
nvdsinferserver::TrtServerRequest
Wrapper class for Triton inference request.
Definition: infer_trtis_server.h:69
nvdsinferserver::SharedRequest
std::shared_ptr< TrtServerRequest > SharedRequest
Definition: infer_trtis_server.h:60
nvdsinferserver::SharedBatchArray
std::shared_ptr< BaseBatchArray > SharedBatchArray
Definition: infer_common.h:75
nvdsinferserver::TrtISServer::unloadModel
NvDsInferStatus unloadModel(const std::string &modelName)
Unload the specified model.
infer_trtis_utils.h
Triton Inference Server utilities header file.
nvdsinferserver::TrtISServer::inferAsync
NvDsInferStatus inferAsync(SharedRequest request, WeakTritonAllocator allocator, TritonInferAsyncDone done)
Submit a request for asynchronous inference.
nvdsinferserver::TrtISServer::getRepoSettings
const triton::RepoSettings & getRepoSettings()
Get the model repository settings.
Definition: infer_trtis_server.h:572
nvdsinferserver::TrtServerRequest::releaseInputs
SharedBatchArray releaseInputs()
Release ownership of the input batch buffer array.
Definition: infer_trtis_server.h:153
nvdsinferserver::TrtServerAllocator::TrtServerAllocator
TrtServerAllocator(AllocFn alloc, FreeFn release)
Constructor, create an instance of the type TRITONSERVER_ResponseAllocator which calls provided alloc...
NvDsInferStatus
NvDsInferStatus
Enum for the status codes returned by NvDsInferContext.
Definition: nvdsinfer.h:217
nvdsinferserver::InferBufferDescription
Holds the information about an inference buffer.
Definition: infer_datatypes.h:168
nvdsinferserver::triton::RepoSettings::cudaDevMemMap
std::map< uint32_t, uint64_t > cudaDevMemMap
Map of device IDs to the size of the CUDA memory pool to be allocated on each device.
Definition: infer_trtis_server.h:508