`nemo_rl.models.value.lm_value`

Module Contents

Value function model for PPO using distributed training with Ray workers.

class nemo_rl.models.value.lm_value.Value(cluster: nemo_rl.distributed.virtual_cluster.RayVirtualCluster, config: nemo_rl.models.value.config.ValueConfig, tokenizer: transformers.tokenization_utils_base.PreTrainedTokenizerBase, name_prefix: str = 'lm_value', workers_per_node: Optional[Union[int, list[int]]] = None, init_optimizer: bool = True, weights_path: Optional[nemo_rl.models.value.lm_value.PathLike] = None, optimizer_path: Optional[nemo_rl.models.value.lm_value.PathLike] = None)

Value function model for PPO using distributed training with Ray workers.

Initialization

Initialize the Value model.

Parameters:

Get value predictions for a batch of data.

Parameters:

Returns:

BatchedDataDict containing value predictions [batch_size, sequence_length]

train(data: nemo_rl.distributed.batched_data_dict.BatchedDataDict, loss_fn: nemo_rl.algorithms.loss.interfaces.LossFunction, eval_mode: bool = False, *, gbs: Optional[int] = None, mbs: Optional[int] = None, timer: Optional[nemo_rl.utils.timer.Timer] = None) → dict[str, Any]

Train the value function on a batch of data with a given loss function.

Parameters:

Returns:

Dictionary containing training metrics (loss, grad_norm, etc.)

prepare_for_training() → None

Prepare the value model for training (load to GPU).

prepare_for_inference() → None

Prepare the value model for inference (offload gradients, set eval mode).

save_checkpoint(weights_path: str, optimizer_path: Optional[str] = None, tokenizer_path: Optional[str] = None) → None

Save a checkpoint of the value model.

__del__() → None

Shut down the worker groups when the object is deleted or garbage collected.