# Source code for nemo_rl.models.policy.utils
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import importlib
import os
from transformers import AutoConfig
def import_class_from_path(name: str):
    """Import a class from a string path (e.g. 'torch.optim.AdamW').

    Args:
        name: Full path to the class, including the module path and class name.

    Returns:
        The imported class object.
    """
    module_name, cls_name = name.rsplit(".", 1)
    cls_instance = getattr(importlib.import_module(module_name), cls_name)
    return cls_instance
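# Example usage (a minimal sketch; the optimizer class and learning rate below
# are only illustrative, not taken from the original source):
#   optimizer_cls = import_class_from_path("torch.optim.AdamW")
#   optimizer = optimizer_cls(model.parameters(), lr=1e-4)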
def get_gpu_info(model):
    """Return information about the GPU being used by this worker."""
    import torch

    # Get distributed training info
    rank = torch.distributed.get_rank()
    world_size = torch.distributed.get_world_size()
    local_rank = int(os.environ.get("LOCAL_RANK", 0))

    # Get device info from CUDA
    device = torch.cuda.current_device()
    device_name = torch.cuda.get_device_name(device)
    device_count = torch.cuda.device_count()
    memory_allocated = torch.cuda.memory_allocated(device) / (1024**2)  # in MB
    memory_reserved = torch.cuda.memory_reserved(device) / (1024**2)  # in MB
    peak_memory = torch.cuda.max_memory_allocated() / (1024**2)  # in MB
    peak_reserved = torch.cuda.max_memory_reserved() / (1024**2)  # in MB

    # Try to get the real global device ID (not the local one).
    # In distributed training, each process only sees its assigned GPU as device 0.
    local_device_id = device
    global_device_id = local_device_id
    if "CUDA_VISIBLE_DEVICES" in os.environ:
        cuda_visible_devices = os.environ["CUDA_VISIBLE_DEVICES"].split(",")
        if local_rank < len(cuda_visible_devices):
            global_device_id = int(cuda_visible_devices[local_rank])
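    # For example, CUDA_VISIBLE_DEVICES="2,3" with local_rank=1 maps this
    # process's local device 0 to physical GPU 3 (an illustrative scenario,
    # not taken from the original source).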
    # Get a parameter from the model to verify CUDA device placement.
    # This confirms tensors are actually on the appropriate device.
    param_info = {}
    for module_name, module in model.named_modules():
        for param_name, param in module.named_parameters(recurse=False):
            if param is not None and param.requires_grad:
                full_name = f"{module_name}.{param_name}"
                param_info[full_name] = {
                    "device": str(param.device),
                    "shape": list(param.shape),
                    "dtype": str(param.dtype),
                }
                # Just grab one parameter for verification
                break
        if param_info:
            break

    return {
        "rank": rank,
        "world_size": world_size,
        "local_rank": local_rank,
        "local_device_id": local_device_id,
        "global_device_id": global_device_id,
        "device_count": device_count,
        "device_name": device_name,
        "memory_allocated_mb": memory_allocated,
        "memory_reserved_mb": memory_reserved,
        "peak_memory_allocated_mb": peak_memory,
        "peak_memory_reserved_mb": peak_reserved,
        "parameter_sample": param_info,
        "env_vars": {
            k: v
            for k, v in os.environ.items()
            if k.startswith("CUDA") or k in ["LOCAL_RANK", "RANK", "WORLD_SIZE"]
        },
    }
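# Example usage (a minimal sketch; assumes torch.distributed has already been
# initialized, e.g. via torch.distributed.init_process_group("nccl"), and that
# `model` is a torch.nn.Module placed on this worker's GPU):
#   info = get_gpu_info(model)
#   print(f"rank {info['rank']}/{info['world_size']} on {info['device_name']}: "
#         f"{info['memory_allocated_mb']:.1f} MB allocated")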
def sliding_window_overwrite(model_name: str) -> dict:
    """Return configuration overrides to handle sliding window settings based on model rules.

    Args:
        model_name: The HuggingFace model name or path to load the configuration from

    Returns:
        dict: Dictionary with overwrite values, or an empty dict if no overwrites are needed
    """
    hf_config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
    overwrite_dict = {}

    # Override the sliding_window setting to address a HF mismatch relevant to use_sliding_window
    # TODO(@zhiyul): remove this once the bug is fixed https://github.com/huggingface/transformers/issues/38002
    if (
        hasattr(hf_config, "use_sliding_window")
        and hf_config.use_sliding_window is False
    ):
        assert hasattr(hf_config, "sliding_window")
        overwrite_dict = {
            "sliding_window": None,
        }
        print(
            f"use_sliding_window=False in config - overriding sliding_window parameter to None: {overwrite_dict}"
        )

    return overwrite_dict
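# Example usage (a minimal sketch; the model name is only illustrative, and
# AutoModelForCausalLM would need to be imported from transformers; keyword
# overrides passed to from_pretrained are forwarded to the model config):
#   overrides = sliding_window_overwrite("Qwen/Qwen2.5-7B")
#   model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-7B", **overrides)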