NVIDIA DeepStream SDK API Reference
6.2 Release
#ifndef __NVDSINFER_TRTIS_SERVER_H__
#define __NVDSINFER_TRTIS_SERVER_H__

/* Default Triton settings for Jetson (iGPU) builds. */
#define TRITON_DEFAULT_MINIMUM_COMPUTE_CAPABILITY 5.3
#define TRITON_DEFAULT_PINNED_MEMORY_BYTES (1 << 26)
#define TRITON_DEFAULT_BACKEND_DIR GetTritonBackendDir()

/* Default Triton settings for x86 (dGPU) builds. */
#define TRITON_DEFAULT_MINIMUM_COMPUTE_CAPABILITY 6.0
#define TRITON_DEFAULT_PINNED_MEMORY_BYTES (1 << 28)
#define TRITON_DEFAULT_BACKEND_DIR "/opt/tritonserver/backends"
struct TRITONSERVER_Server;

namespace ni = inference;

class TrtServerRequest;
class TrtServerResponse;
class TrtServerAllocator;
/* class TrtServerRequest (wrapper for a Triton inference request) */
NvDsInferStatus init(
    const std::string& model, int64_t version, SharedBatchArray& inputs,
    const std::vector<std::string>& outputs, uint64_t reqId,
    const std::vector<TritonClassParams>& clasList);

NvDsInferStatus setRequestComplete(
    TRITONSERVER_InferenceRequestReleaseFn_t requestCompleteCb, void* userPtr);

NvDsInferStatus setResponseComplete(
    ShrTritonAllocator& allocator,
    TRITONSERVER_InferenceResponseCompleteFn_t responseCompleteCb,
    void* responseUserPtr);

static void RequestOnRelease(
    TRITONSERVER_InferenceRequest* request, const uint32_t flags, void* userp);
TRITONSERVER_InferenceRequest* ptr() { return m_ReqPtr.get(); }

const std::string& model() const { return m_Model; }

uint64_t id() const { return m_ReqId; }

uint64_t bufId() const { return m_BufId; }

const std::vector<std::string>& outputs() const { return m_Outputs; }
const std::map<std::string, TritonClassParams>& classParams() const { return m_ClasList; }
std::weak_ptr<TrtISServer> m_Server;
uint64_t m_ReqId = UINT64_C(0);
uint64_t m_BufId = UINT64_C(0);
std::vector<std::string> m_Outputs;
std::map<std::string, TritonClassParams> m_ClasList;
/* class TrtServerResponse (wrapper for Triton output parsing) */
const std::string& model() const { return m_Model; }

uint32_t tensorIdx, const void* base);

uint64_t m_ResponseId = UINT64_C(0);
std::weak_ptr<TrtISServer> m_Server;
int64_t m_ModelVersion = UINT64_C(1);
std::vector<SharedBatchBuf> m_BufOutputs;
class TrtServerAllocator : public std::enable_shared_from_this<TrtServerAllocator> {

TRITONSERVER_ResponseAllocator* ptr() { return m_Allocator.get(); }
static TRITONSERVER_Error* ResponseAlloc(
    TRITONSERVER_ResponseAllocator* allocator, const char* tensorName,
    size_t bytes, TRITONSERVER_MemoryType preferredMemType,
    int64_t preferredDevId, void* userP, void** buffer, void** bufferUserP,
    TRITONSERVER_MemoryType* actualMemType, int64_t* actualMemTypeId);
static TRITONSERVER_Error* ResponseRelease(
    TRITONSERVER_ResponseAllocator* allocator, void* buffer,
    void* bufferUserP, size_t bytes, TRITONSERVER_MemoryType memType,
    int64_t memTypeId);
static inline const char*
GetTritonBackendDir()
{
    static char dirBuf[256];
    snprintf(
        dirBuf, sizeof(dirBuf),
        "/opt/nvidia/deepstream/deepstream-%d.%d/lib/triton_backends",
        NVDS_VERSION_MAJOR, NVDS_VERSION_MINOR);
    return dirBuf;
}
/* struct RepoSettings (model repository settings) */
int32_t controlMode = (int32_t)TRITONSERVER_MODEL_CONTROL_EXPLICIT;

bool initFrom(const ic::TritonModelRepo& repo, const std::vector<int>& devIds);
class TrtISServer : public std::enable_shared_from_this<TrtISServer> {

bool isModelReady(const std::string& model, int64_t version);

NvDsInferStatus getModelConfig(
    const std::string& model, int64_t version, ni::ModelConfig& config);

SharedRequest createRequest(
    const std::string& model, int64_t version, SharedBatchArray& inputs,
    const std::vector<std::string>& outputs,
    const std::vector<TritonClassParams>& clasList);

using InferUserData =
    std::tuple<SharedRequest, TritonInferAsyncDone, TrtISServer*>;

static void InferComplete(
    TRITONSERVER_InferenceResponse* response, const uint32_t flags, void* userp);

TRITONSERVER_Server* serverPtr() const { return m_Impl.get(); }

static std::weak_ptr<TrtISServer> sTrtServerInstance;
static std::mutex sTrtServerMutex;
UniqTritonT<TRITONSERVER_Server> m_Impl;
std::atomic<uint64_t> m_LastRequestId{UINT64_C(0)};
triton::RepoSettings m_RepoSettings;
Member documentation (briefs):

double minComputeCapacity
The minimum supported compute capability for the Triton server.
bool initFrom(const ic::TritonModelRepo &repo, const std::vector< int > &devIds)
Populate the RepoSettings instance with the values from the TritonModelRepo protobuf message.
const std::vector< std::string > & outputs() const
Get the list of requested output layer names.
std::unique_ptr< TrtServerResponse > UniqResponse
#define TRITON_DEFAULT_MINIMUM_COMPUTE_CAPABILITY
bool isModelReady(const std::string &model, int64_t version)
Check if the server is ready for inference using specified model.
NvDsInferStatus setRequestComplete(TRITONSERVER_InferenceRequestReleaseFn_t requestCompleteCb, void *userPtr)
Set the release callback function for the request.
#define TRITON_DEFAULT_BACKEND_DIR
Wrapper class for creating a Triton Inference Server instance.
std::set< std::string > roots
Set of model repository directories.
uint64_t id() const
Get the request ID.
const std::map< std::string, TritonClassParams > & classParams() const
Get the Triton classification parameters list (tensor name : classification parameters).
std::string debugStr
Debug string of the TritonModelRepo protobuf message.
std::shared_ptr< SysMem > SharedSysMem
Wrapper class for Triton output parsing.
NvDsInferStatus getStatus() const
Check if the response could be parsed correctly.
std::shared_ptr< T > ShrTritonT
static TrtServerPtr getInstance(const triton::RepoSettings *repo)
Get a new or existing instance of the Triton Inference Server.
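A minimal usage sketch (not part of this header), assuming the types on this page are in scope under the nvdsinferserver namespace and that getInstance() performs any required initialization internally; the helper name startServer is illustrative.

// Illustrative sketch only; assumes the DeepStream nvdsinferserver headers
// and namespace are available. startServer() is a hypothetical helper name.
using namespace nvdsinferserver;  // assumption: types on this page live here

TrtServerPtr startServer(const triton::RepoSettings& settings)
{
    TrtServerPtr server = TrtISServer::getInstance(&settings);
    if (!server) {
        return nullptr;  // instance creation failed
    }
    // Verify the underlying TRITONSERVER_Server is usable before inference.
    if (!server->isServerLive() || !server->isServerReady()) {
        return nullptr;
    }
    return server;
}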
SharedRequest createRequest(const std::string &model, int64_t version, SharedBatchArray &inputs, const std::vector< std::string > &outputs, const std::vector< TritonClassParams > &clasList)
Create and initialize a new inference request.
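A short sketch of building a request with createRequest(); the model name, version value, and output tensor name are placeholders, and the input batch array is assumed to be filled elsewhere.

// Illustrative sketch only. "resnet50", version 1 and "prob" are placeholder
// values; filling SharedBatchArray is outside the scope of this header.
SharedRequest buildRequest(TrtServerPtr server, SharedBatchArray inputs)
{
    std::vector<std::string> outputs{"prob"};   // requested output tensors
    std::vector<TritonClassParams> clasList;    // empty: no Triton classification
    return server->createRequest(
        "resnet50", 1 /* model version */, inputs, outputs, clasList);
}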
uint64_t pinnedMemBytes
Size in bytes of pinned host memory pre-allocated for the Triton runtime.
bool strictModelConfig
Flag to enable/disable Triton strict model configuration.
#define NVDS_VERSION_MAJOR
std::weak_ptr< TrtServerAllocator > WeakTritonAllocator
Header file of batch buffer related class declarations.
Wrapper class for Triton server output memory allocator.
std::vector< SharedBatchBuf > & mutableOutputs()
Get the list of output batch buffers.
uint64_t bufId() const
Get the input buffer ID associated with the request.
std::shared_ptr< TrtServerAllocator > ShrTritonAllocator
#define NVDS_VERSION_MINOR
bool isServerReady()
Check if the server is ready.
#define TRITON_DEFAULT_PINNED_MEMORY_BYTES
uint32_t logLevel
Level of the Triton log output.
std::shared_ptr< TrtISServer > TrtServerPtr
NvDsInferStatus initialize()
Create a new instance of the Triton Inference Server.
std::string backend
Name of the backend.
NvDsInferStatus getModelConfig(const std::string &model, int64_t version, ni::ModelConfig &config)
Get the model configuration for the specified model.
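A sketch combining isModelReady() and getModelConfig(); it assumes ni::ModelConfig is Triton's model_config protobuf message, so the usual protobuf accessors (name(), max_batch_size()) apply, and that the types from this header are in scope.

// Illustrative sketch only: query readiness, then read a couple of fields
// from the fetched model configuration.
#include <iostream>

bool printModelInfo(TrtServerPtr server, const std::string& model, int64_t version)
{
    if (!server->isModelReady(model, version)) {
        return false;  // model not loaded or not ready on this server
    }
    ni::ModelConfig config;
    if (server->getModelConfig(model, version, config) != NVDSINFER_SUCCESS) {
        return false;
    }
    std::cout << "model: " << config.name()
              << ", max_batch_size: " << config.max_batch_size() << std::endl;
    return true;
}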
Model repository settings for the Triton Inference Server.
TRITONSERVER_ResponseAllocator * ptr()
Get the pointer to the TRITONSERVER_ResponseAllocator instance.
std::string key
Name of the setting.
TRITONSERVER_InferenceRequest * ptr()
Get the pointer to the Triton inference request object.
std::shared_ptr< TrtServerResponse > SharedResponse
TrtISServer(const triton::RepoSettings &repo)
Constructor.
std::unique_ptr< T, std::function< void(T *)> > UniqTritonT
@ NVDSINFER_SUCCESS
NvDsInferContext operation succeeded.
float tfGpuMemoryFraction
TensorFlow GPU memory fraction per process.
int32_t controlMode
Triton model control mode.
std::function< void(SharedRequest, UniqResponse)> TritonInferAsyncDone
SharedOptions takeoverOptions()
Get and own the options list.
std::string value
Value of the setting.
TrtServerRequest(TrtServerPtr server)
Constructor.
std::shared_ptr< IOptions > SharedOptions
NvDsInferStatus init(const std::string &model, int64_t version, SharedBatchArray &inputs, const std::vector< std::string > &outputs, uint64_t reqId, const std::vector< TritonClassParams > &clasList)
Create a new Triton inference request with the specified inputs and parameters.
static void RequestOnRelease(TRITONSERVER_InferenceRequest *request, const uint32_t flags, void *userp)
The callback function to release the request instance.
const std::string & model() const
Get the model name parsed from the Triton response.
virtual ~TrtServerAllocator()=default
Destructor.
bool tfAllowSoftPlacement
Flag to enable/disable soft placement of TF operators.
std::function< SharedSysMem(const std::string &, size_t, InferMemType, int64_t)> AllocFn
~TrtServerRequest()
Destructor.
The backend configuration settings.
std::function< void(const std::string &, SharedSysMem)> FreeFn
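The AllocFn/FreeFn pair above is what the TrtServerAllocator constructor takes; the sketch below wires them up with lambdas. allocateHostMem() and releaseHostMem() are hypothetical application-side helpers, since SysMem construction is defined outside this header.

// Illustrative sketch only. allocateHostMem()/releaseHostMem() are hypothetical
// stand-ins for whatever SysMem factory the application provides.
ShrTritonAllocator makeAllocator()
{
    AllocFn alloc = [](const std::string& tensor, size_t bytes,
                        InferMemType memType, int64_t devId) -> SharedSysMem {
        // Return a memory block large enough to hold `bytes` of output `tensor`.
        return allocateHostMem(tensor, bytes, memType, devId);
    };
    FreeFn release = [](const std::string& tensor, SharedSysMem mem) {
        releaseHostMem(tensor, std::move(mem));
    };
    return std::make_shared<TrtServerAllocator>(alloc, release);
}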
bool isServerLive()
Check if the server is live.
std::string backendDirectory
The path to the Triton backends directory.
NvDsInferStatus parse(const TrtServerRequest *req)
Check for errors and parse the inference output.
NvDsInferStatus setResponseComplete(ShrTritonAllocator &allocator, TRITONSERVER_InferenceResponseCompleteFn_t responseCompleteCb, void *responseUserPtr)
Set the allocator and response callback for the request.
std::vector< BackendConfig > backendConfigs
Array of backend configurations settings.
bool operator!=(const RepoSettings &other) const
InferMemType
The memory types of inference buffers.
#define DISABLE_CLASS_COPY(NoCopyClass)
NvDsInferStatus loadModel(const std::string &modelName)
Load or reload the specified model.
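With controlMode left at the in-class default shown in the source above (TRITONSERVER_MODEL_CONTROL_EXPLICIT), models are loaded on demand; a minimal sketch, with the version number chosen only for illustration:

// Illustrative sketch only: load (or reload) a model and confirm readiness.
NvDsInferStatus ensureModelLoaded(TrtServerPtr server, const std::string& name)
{
    NvDsInferStatus status = server->loadModel(name);
    if (status != NVDSINFER_SUCCESS) {
        return status;
    }
    // Version 1 is assumed here purely for illustration.
    return server->isModelReady(name, 1) ? NVDSINFER_SUCCESS : NVDSINFER_UNKNOWN_ERROR;
}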
bool operator==(const RepoSettings &other) const
Comparison operators.
~TrtISServer()
Destructor.
const std::string & model() const
Get the model name.
TrtServerResponse(TrtServerPtr server, UniqTritonT< TRITONSERVER_InferenceResponse > data, uint64_t id)
Constructor.
Wrapper class for Triton inference request.
std::shared_ptr< TrtServerRequest > SharedRequest
std::shared_ptr< BaseBatchArray > SharedBatchArray
NvDsInferStatus unloadModel(const std::string &modelName)
Unload the specified model.
Triton Inference Server utilities header file.
NvDsInferStatus inferAsync(SharedRequest request, WeakTritonAllocator allocator, TritonInferAsyncDone done)
Submit a request for asynchronous inference.
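A sketch of the asynchronous path: submit a previously built request together with an allocator, and parse the response in the TritonInferAsyncDone callback. Error handling is reduced to early returns, and the types from this header are assumed to be in scope.

// Illustrative sketch only: the completion lambda parses the response against
// the originating request and hands the output buffers to post-processing.
NvDsInferStatus runInference(
    TrtServerPtr server, SharedRequest request, ShrTritonAllocator allocator)
{
    TritonInferAsyncDone done = [](SharedRequest req, UniqResponse response) {
        if (!response || response->parse(req.get()) != NVDSINFER_SUCCESS ||
            response->getStatus() != NVDSINFER_SUCCESS) {
            return;  // inference or output parsing failed
        }
        std::vector<SharedBatchBuf>& outBufs = response->mutableOutputs();
        SharedOptions options = response->takeoverOptions();
        // ... hand outBufs / options over to application post-processing ...
        (void)outBufs;
        (void)options;
    };
    // A shared allocator converts implicitly to the WeakTritonAllocator parameter.
    return server->inferAsync(request, allocator, std::move(done));
}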
const triton::RepoSettings & getRepoSettings()
Get the model repository settings.
SharedBatchArray releaseInputs()
Release ownership of the input batch buffer array.
TrtServerAllocator(AllocFn alloc, FreeFn release)
Constructor, creates an instance of type TRITONSERVER_ResponseAllocator which calls the provided alloc...
NvDsInferStatus
Enum for the status codes returned by NvDsInferContext.
Holds the information about an inference buffer.
std::map< uint32_t, uint64_t > cudaDevMemMap
Map of device IDs to the size of the CUDA memory pool to be allocated on each device.
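Pulling the RepoSettings fields documented on this page together, a hand-filled configuration might look like the sketch below. The values are examples only, and direct field assignment is assumed to be possible because RepoSettings is documented as a plain settings struct.

// Illustrative sketch only: example values, not recommended defaults.
triton::RepoSettings makeExampleRepoSettings()
{
    triton::RepoSettings settings;
    settings.roots.insert("/opt/models/triton_model_repo");  // model repository directory
    settings.logLevel = 1;                                   // Triton log verbosity
    settings.strictModelConfig = true;                       // require full config.pbtxt
    settings.tfGpuMemoryFraction = 0.35f;                    // TF per-process GPU fraction
    settings.tfAllowSoftPlacement = true;
    settings.minComputeCapacity = TRITON_DEFAULT_MINIMUM_COMPUTE_CAPABILITY;
    settings.pinnedMemBytes = TRITON_DEFAULT_PINNED_MEMORY_BYTES;
    settings.backendDirectory = TRITON_DEFAULT_BACKEND_DIR;
    settings.controlMode = (int32_t)TRITONSERVER_MODEL_CONTROL_EXPLICIT;
    settings.cudaDevMemMap[0] = UINT64_C(64) << 20;          // 64 MB CUDA pool on GPU 0
    return settings;
}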