# Source code for nemo_automodel.components.launcher.nemo_run.launcher

# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
import os
import sys
import time as _time
from pathlib import Path
from typing import Any, Dict, List, Optional

import yaml

from nemo_automodel.components.launcher.base import Launcher
from nemo_automodel.components.launcher.nemo_run.config import NemoRunConfig
from nemo_automodel.components.launcher.nemo_run.utils import (
    apply_overrides,
    load_executor_from_file,
    submit_nemo_run_job,
)

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Config filename and its path inside the container (/nemo_run/code/).
# ``_CONFIG_FILENAME`` is the name under which ``NemoRunLauncher.launch``
# writes the serialized training config into the local job directory.
_CONFIG_FILENAME = "automodel_config.yaml"
# ``_REMOTE_CONFIG_PATH`` is the value passed to the recipe module via ``-c``.
_REMOTE_CONFIG_PATH = f"/nemo_run/code/{_CONFIG_FILENAME}"


class NemoRunLauncher(Launcher):
    """Launch a recipe via NeMo-Run's executor API.

    Supports loading pre-configured executors from
    ``$NEMORUN_HOME/executors.py`` (or a custom path) and submitting jobs as
    ``nemo_run.Script`` objects.  Works with any NeMo-Run executor backend
    (Slurm, Kubernetes, Docker, local).

    Uses NeMo-Run's native ``Torchrun`` launcher so that distributed training
    arguments (rendezvous, node rank, nproc-per-node) are managed
    automatically.

    The training config YAML is packaged via ``PatternPackager`` so it is
    available at ``/nemo_run/code/automodel_config.yaml`` inside the
    container.
    """

    def _resolve_executor(self, nr_config: NemoRunConfig) -> Any:
        """Load a named executor or build a local one.

        Args:
            nr_config: Parsed launcher configuration.  ``executor`` selects
                either the literal ``"local"`` or a named entry in
                ``executors_file``; ``overrides`` are applied to the result
                in both cases.

        Returns:
            The executor object with overrides applied.

        Exits the process with status 1 when ``nemo_run`` cannot be imported.
        """
        try:
            import nemo_run as run
        except ImportError:
            logger.error("nemo-run is not installed. Install with: pip install nemo-run")
            sys.exit(1)

        if nr_config.executor == "local":
            executor = run.LocalExecutor()
        else:
            # Named executor from executors file.
            executor = load_executor_from_file(nr_config.executor, nr_config.executors_file)

        apply_overrides(executor, nr_config.overrides)
        return executor

    @staticmethod
    def _configure_torchrun(executor: Any, devices: int) -> None:
        """Enable the native NeMo-Run Torchrun launcher on *executor*.

        Sets ``executor.launcher = "torchrun"`` and, when the executor exposes
        it, ``torchrun_nproc_per_node`` so NeMo-Run generates the correct
        ``torchrun --nproc-per-node=<N>`` invocation in the sbatch script.

        Args:
            executor: Executor to mutate in place.
            devices: Number of processes to start per node.
        """
        executor.launcher = "torchrun"
        # Only set the attribute on executors that already define it.
        if hasattr(executor, "torchrun_nproc_per_node"):
            executor.torchrun_nproc_per_node = devices

    @staticmethod
    def _setup_packager(executor: Any, config_path: str) -> None:
        """Configure a ``PatternPackager`` that ships the config YAML.

        The packager tars the config file and NeMo-Run extracts it into
        ``{job_dir}/code/``, which is mounted at ``/nemo_run/code/`` inside
        the container.

        Args:
            executor: Executor whose ``packager`` attribute is replaced.
            config_path: Local path of the config file to ship; its parent
                directory is used as the packager's ``relative_path``.

        This is best-effort: when ``nemo_run`` cannot be imported the executor
        is left untouched, and a warning is logged so the omission is visible.
        """
        try:
            import nemo_run as run
        except ImportError:
            logger.warning(
                "nemo-run is not installed; packager not configured for %s",
                config_path,
            )
            return

        executor.packager = run.PatternPackager(
            include_pattern=config_path,
            relative_path=os.path.dirname(config_path),
        )

    def launch(
        self,
        config: Dict[str, Any],
        config_path: Path,
        recipe_target: str,
        launcher_config: Dict[str, Any],
        extra_args: Optional[List[str]] = None,
    ) -> int:
        """Serialize *config*, package it, and submit the recipe as a job.

        Args:
            config: Training configuration; dumped to YAML in the job
                directory and shipped to the remote side.
            config_path: Not read by this launcher; the config is
                re-serialized from *config* instead.
            recipe_target: Dotted path of the recipe.  Everything before the
                last dot is run as ``python -m <module>``; the last component
                is the default job name.
            launcher_config: Raw launcher settings, parsed with
                ``NemoRunConfig.from_dict``.
            extra_args: Optional extra command-line arguments appended after
                ``-c <remote config path>``.

        Returns:
            The value returned by ``submit_nemo_run_job``.

        Exits the process with status 1 when ``nemo_run`` cannot be imported.
        """
        try:
            import nemo_run as run
        except ImportError:
            logger.error("nemo-run is not installed. Install with: pip install nemo-run")
            sys.exit(1)

        nr_config = NemoRunConfig.from_dict(launcher_config)
        executor = self._resolve_executor(nr_config)

        # Determine devices (GPUs per node) via the executor's standard
        # nproc_per_node() method (defined on the base Executor class and
        # implemented by every backend).
        try:
            devices = executor.nproc_per_node()
        except (NotImplementedError, AttributeError):
            devices = 1

        # Enable native Torchrun launcher (must be set *before* experiment.run
        # because NeMo-Run reads it during the packaging phase).
        self._configure_torchrun(executor, devices)

        # -- Write the training config for both local record and packaging. --
        # The sub-directory name is a second-resolution timestamp, so launches
        # within the same second share a directory (``exist_ok=True``).
        job_root = nr_config.job_dir or os.path.join(os.getcwd(), "nemo_run_jobs")
        job_dir = os.path.join(job_root, str(int(_time.time())))
        os.makedirs(job_dir, exist_ok=True)

        config_yaml = yaml.dump(config, default_flow_style=False, sort_keys=False)

        # Local record.
        local_config_path = os.path.join(job_dir, _CONFIG_FILENAME)
        with open(local_config_path, "w", encoding="utf-8") as fp:
            fp.write(config_yaml)

        logger.info("NeMo-Run job artifacts in: %s", job_dir)

        # Set up PatternPackager so the config is shipped to the remote.
        self._setup_packager(executor, local_config_path)

        # Build the Script: use ``python -m <module>`` so the recipe is
        # resolved from the installed package, not a relative file path.
        module_path = recipe_target.rsplit(".", 1)[0]
        args = ["-c", _REMOTE_CONFIG_PATH]
        if extra_args:
            args.extend(extra_args)

        script = run.Script(
            path=module_path,
            m=True,
            entrypoint="python",
            args=args,
        )

        job_name = nr_config.job_name or recipe_target.rsplit(".", 1)[-1]

        return submit_nemo_run_job(
            script=script,
            executor=executor,
            job_name=job_name,
            detach=nr_config.detach,
            tail_logs=nr_config.tail_logs,
        )