Program Listing for File gpu_resource_monitor.hpp

↰ Return to documentation for file (include/holoscan/core/system/gpu_resource_monitor.hpp)

Copy
Copied!

            
            /*
* SPDX-FileCopyrightText: Copyright (c) 2023-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef HOLOSCAN_CORE_SYSTEM_GPU_RESOURCE_MONITOR_HPP
#define HOLOSCAN_CORE_SYSTEM_GPU_RESOURCE_MONITOR_HPP

#include <memory>
#include <vector>

#include "cuda_runtime_wrapper.h"
#include "gpu_info.hpp"
#include "nvml_wrapper.h"

namespace holoscan {

/// Metric flags a GPUResourceMonitor is constructed with when the caller supplies none.
constexpr uint64_t kDefaultGpuMetrics{GPUMetricFlag::GPU_DEVICE_ID};

class GPUResourceMonitor {
 public:
  explicit GPUResourceMonitor(uint64_t metric_flags = kDefaultGpuMetrics);
  virtual ~GPUResourceMonitor();

  void init();

  void close();

  uint64_t metric_flags() const;

  void metric_flags(uint64_t metric_flags);

  GPUInfo update(uint32_t index, uint64_t metric_flags = GPUMetricFlag::DEFAULT);

  std::vector<GPUInfo> update(uint64_t metric_flags = GPUMetricFlag::DEFAULT);

  GPUInfo& update(uint32_t index, GPUInfo& gpu_info,
                  uint64_t metric_flags = GPUMetricFlag::DEFAULT);

  GPUInfo gpu_info(uint32_t index, uint64_t metric_flags = GPUMetricFlag::DEFAULT);

  std::vector<GPUInfo> gpu_info(uint64_t metric_flags = GPUMetricFlag::DEFAULT);

  uint32_t num_gpus() const;

  bool is_integrated_gpu(uint32_t index);

 protected:
  bool bind_nvml_methods();
  bool bind_cuda_runtime_methods();

  bool init_nvml();
  bool init_cuda_runtime();

  void shutdown_nvml() noexcept;
  void shutdown_cuda_runtime() noexcept;

  void* handle_ = nullptr;
  void* cuda_handle_ = nullptr;

  // NVML function pointers

  nvml::nvmlErrorString_t nvmlErrorString = nullptr;
  nvml::nvmlInit_t nvmlInit = nullptr;
  nvml::nvmlDeviceGetCount_t nvmlDeviceGetCount = nullptr;
  nvml::nvmlDeviceGetHandleByIndex_t nvmlDeviceGetHandleByIndex = nullptr;
  nvml::nvmlDeviceGetHandleByPciBusId_t nvmlDeviceGetHandleByPciBusId = nullptr;
  nvml::nvmlDeviceGetHandleBySerial_t nvmlDeviceGetHandleBySerial = nullptr;
  nvml::nvmlDeviceGetHandleByUUID_t nvmlDeviceGetHandleByUUID = nullptr;
  nvml::nvmlDeviceGetName_t nvmlDeviceGetName = nullptr;
  nvml::nvmlDeviceGetIndex_t nvmlDeviceGetIndex = nullptr;
  nvml::nvmlDeviceGetPciInfo_t nvmlDeviceGetPciInfo = nullptr;
  nvml::nvmlDeviceGetSerial_t nvmlDeviceGetSerial = nullptr;
  nvml::nvmlDeviceGetUUID_t nvmlDeviceGetUUID = nullptr;
  nvml::nvmlDeviceGetMemoryInfo_t nvmlDeviceGetMemoryInfo = nullptr;
  nvml::nvmlDeviceGetUtilizationRates_t nvmlDeviceGetUtilizationRates = nullptr;
  nvml::nvmlDeviceGetPowerManagementLimit_t nvmlDeviceGetPowerManagementLimit = nullptr;
  nvml::nvmlDeviceGetPowerUsage_t nvmlDeviceGetPowerUsage = nullptr;
  nvml::nvmlDeviceGetTemperature_t nvmlDeviceGetTemperature = nullptr;
  nvml::nvmlShutdown_t nvmlShutdown = nullptr;

  // CUDA Runtime function pointers

  cuda::cudaGetErrorString_t cudaGetErrorString = nullptr;
  cuda::cudaGetDeviceCount_t cudaGetDeviceCount = nullptr;
  cuda::cudaGetDeviceProperties_t cudaGetDeviceProperties = nullptr;
  cuda::cudaDeviceGetPCIBusId_t cudaDeviceGetPCIBusId = nullptr;
  cuda::cudaMemGetInfo_t cudaMemGetInfo = nullptr;

  uint64_t metric_flags_ = kDefaultGpuMetrics;
  bool is_cached_ = false;
  uint32_t gpu_count_ = 0;
  std::vector<GPUInfo> gpu_info_;
  std::vector<nvml::nvmlDevice_t> nvml_devices_;
};
}  // namespace holoscan

#endif/* HOLOSCAN_CORE_SYSTEM_GPU_RESOURCE_MONITOR_HPP */