Systems
This chapter lists all the systems supported by CloudAI. The attributes shown for each system can be set in TOML configuration files.
| System | Scheduler Value |
|---|---|
| Slurm | slurm |
| Kubernetes | kubernetes |
| RunAI | runai |
| LSF | lsf |
| Standalone | standalone |
- pydantic model cloudai.systems.slurm.slurm_system.SlurmSystem[source]
Represents a Slurm system.
- field default_partition: str [Required]
- field partitions: List[SlurmPartition] [Required]
- field account: str | None = None
- field distribution: str | None = None
- field mpi: str = 'pmix'
- field gpus_per_node: int | None = None
- field ntasks_per_node: int | None = None
- field cache_docker_images_locally: bool = False
- field scheduler: str = 'slurm'
- field monitor_interval: int = 60
- field extra_srun_args: str | None = None
- field extra_sbatch_args: list[str] [Optional]
- field container_mount_home: bool = False
- field data_repository: DataRepositoryConfig | None = None
- field reports: dict[str, ReportConfig] | None = None
- field name: str [Required]
- field install_path: Path [Required]
- field output_path: Path [Required]
- field hf_home_path: Path [Optional]
- field global_env_vars: dict[str, Any] [Optional]
- pydantic model cloudai.systems.slurm.slurm_system.SlurmPartition[source]
Represents a partition within a Slurm system.
- field name: str [Required]
- field groups: List[SlurmGroup] = []
- pydantic model cloudai.systems.slurm.slurm_system.SlurmGroup[source]
Represents a group of nodes within a partition.
- field name: str [Required]
- field nodes: List[str] [Required]
- pydantic model cloudai.systems.slurm.slurm_system.DataRepositoryConfig[source]
Configuration for a data repository.
- field endpoint: str [Required]
- field verify_certs: bool = True
- pydantic model cloudai.systems.kubernetes.kubernetes_system.KubernetesSystem[source]
Represents a Kubernetes system.
- field kube_config_path: Path [Required]
- field default_namespace: str [Required]
- field scheduler: str = 'kubernetes'
- field monitor_interval: int = 1
- field gpus_per_node: int = 1
- field use_host_network: bool | None = None
- field name: str [Required]
- field install_path: Path [Required]
- field output_path: Path [Required]
- field hf_home_path: Path [Optional]
- field global_env_vars: dict[str, Any] [Optional]
- pydantic model cloudai.systems.runai.runai_system.RunAISystem[source]
RunAISystem integrates with the RunAI platform to manage and monitor jobs and nodes.
- field scheduler: str = 'runai'
- field monitor_interval: int = 60
- field base_url: str [Required]
- field user_email: str [Required]
- field app_id: str [Required]
- field app_secret: str [Required]
- field project_id: str [Required]
- field cluster_id: str [Required]
- field name: str [Required]
- field install_path: Path [Required]
- field output_path: Path [Required]
- field hf_home_path: Path [Optional]
- field global_env_vars: dict[str, Any] [Optional]
- pydantic model cloudai.systems.runai.runai_node.RunAINode[source]
Represents a node in the RunAI cluster.
- field status: NodeStatus = NodeStatus.UNKNOWN
- field conditions: List[Dict[str, Any]] [Optional]
- field taints: List[Dict[str, Any]] [Optional]
- field node_pool: str = '' (alias 'nodePool')
- field created_at: str = '' (alias 'createdAt')
- field gpu_type: str | None = None (alias 'gpuType')
- field gpu_count: int | None = None (alias 'gpuCount')
- field nvlink_domain_uid: str | None = None (alias 'nvLinkDomainUid')
- field nvlink_clique_id: str | None = None (alias 'nvLinkCliqueId')
- field name: str = ''
- field id: str = ''
- field cluster_uuid: str = '' (alias 'clusterUuid')
- field updated_at: str = '' (alias 'updatedAt')
- pydantic model cloudai.systems.lsf.lsf_system.LSFSystem[source]
Represents an LSF system.
- field queues: List[LSFQueue] [Optional]
A list of queues in the LSF system; it is filled in automatically.
- field account: str | None = None
- field scheduler: str = 'lsf'
- field project_name: str | None = None
- field default_queue: str | None = None
- field monitor_interval: int = 60
- field app: str | None = None
- field os_version: str | None = None
- field name: str [Required]
- field install_path: Path [Required]
- field output_path: Path [Required]
- field hf_home_path: Path [Optional]
- field global_env_vars: dict[str, Any] [Optional]
- pydantic model cloudai.systems.lsf.lsf_system.LSFQueue[source]
Represents a queue within the LSF system.
- field name: str [Required]
- field groups: List[LSFGroup] = []
- pydantic model cloudai.systems.lsf.lsf_system.LSFGroup[source]
Represents a group of nodes within a queue.
- field name: str [Required]
- field nodes: List[str] [Required]
- pydantic model cloudai.systems.standalone.standalone_system.StandaloneSystem[source]
Class representing a Standalone system.
This class is used for systems that execute commands directly without a job scheduler.
- field scheduler: str = 'standalone'
- field monitor_interval: int = 1
- field name: str [Required]
- field install_path: Path [Required]
- field output_path: Path [Required]
- field hf_home_path: Path [Optional]
- field global_env_vars: dict[str, Any] [Optional]