Get Job Status
Get detailed execution status for a customization job, including step-by-step progress and real-time training metrics.
This endpoint provides granular execution details including:
- Step-level status: model-and-dataset-download → customization-training-job → model-upload → model-entity-creation
- Training metrics: step, epoch, loss, lr (learning rate), grad_norm, val_loss
- Progress tracking: downloaded_files, uploaded_bytes, progress_pct
To list jobs or get job definitions (model entity, hyperparameters, spec), use List Active Jobs instead.
Prerequisites
Before you can get the status of a customization job, make sure that you have:
- Obtained the base URL of your NeMo Platform.
- Set the NMP_BASE_URL environment variable to your NeMo Platform endpoint:
$ export NMP_BASE_URL="https://your-nmp-base-url"
To Get the Status of a Customization Job
A submitted customization job runs on the platform’s Jobs service, so you poll its status through that service using the job name returned at submission (for example, automodel-a1b2c3d4e5f6). This works the same way for both the automodel and unsloth backends.
Use the SDK to get detailed job status:
1 import os 2 from nemo_platform import NeMoPlatform 3 4 # Initialize the client 5 client = NeMoPlatform( 6 base_url=os.environ.get("NMP_BASE_URL", "http://localhost:8080"), 7 workspace="default", 8 ) 9 10 # Get job status (use the job name returned by jobs.create) 11 job_name = "automodel-a1b2c3d4e5f6" 12 status = client.jobs.get_status(name=job_name, workspace="default") 13 14 print(f"Job: {status.name}") 15 print(f"Status: {status.status}") 16 17 # Check step-level status and training progress 18 for step in status.steps or []: 19 print(f" Step '{step.name}': {step.status}") 20 if step.name == "customization-training-job": 21 for task in step.tasks or []: 22 task_details = task.status_details or {} 23 current_step = task_details.get("step") 24 max_steps = task_details.get("max_steps") 25 if current_step and max_steps: 26 print(f" Progress: {current_step}/{max_steps}")
Example Response
Active Job (Training in Progress)
1 { 2 "id": "platform-job-2k8i3i1HqJHHPVB5M6Bk9Z", 3 "error_details": null, 4 "name": "my-sft-job", 5 "status": "active", 6 "status_details": {}, 7 "steps": [ 8 { 9 "id": "platform-job-step-4RJTW5wSy4gHJ539EfTUif", 10 "error_details": {}, 11 "name": "model-and-dataset-download", 12 "status": "completed", 13 "status_details": { 14 "message": "Job completed successfully with exit code 0" 15 }, 16 "tasks": [ 17 { 18 "id": "platform-job-task-MBC2vcDEyv6Jw7RtFs5JDd", 19 "error_details": {}, 20 "error_stack": "", 21 "name": "task-5969b01a4b5a4bb181ba530ff51dafa6", 22 "status": "completed", 23 "status_details": { 24 "message": "Job completed successfully with exit code 0", 25 "phase": "completed", 26 "total_filesets": 2, 27 "completed_filesets": 1, 28 "current_fileset": "default/sft-dataset", 29 "fileset": "default/sft-dataset", 30 "total_files": 2, 31 "total_size": 2984632, 32 "downloaded_files": 2, 33 "downloaded_bytes": 2984632, 34 "current_file": "validation.jsonl", 35 "progress_pct": 100 36 } 37 } 38 ] 39 }, 40 { 41 "id": "platform-job-step-9kme8ibxDGES4t9TZvLp4X", 42 "error_details": {}, 43 "name": "customization-training-job", 44 "status": "active", 45 "status_details": { 46 "message": "Job is running" 47 }, 48 "tasks": [ 49 { 50 "id": "platform-job-task-SawMB6ssVZd7NCiruFVJn8", 51 "error_details": {}, 52 "error_stack": "", 53 "name": "task-2249da790a574388ba02fcabd06cd338", 54 "status": "active", 55 "status_details": { 56 "message": "Job is running", 57 "phase": "training", 58 "backend": "automodel", 59 "max_steps": 94, 60 "num_epochs": 2, 61 "step": 8, 62 "epoch": 1, 63 "loss": 2.8918895721435547, 64 "lr": 4.9101714686276044e-05, 65 "grad_norm": 26.0 66 } 67 } 68 ] 69 } 70 ] 71 }
Completed Job (All Steps Finished)
1 { 2 "id": "platform-job-2k8i3i1HqJHHPVB5M6Bk9Z", 3 "error_details": null, 4 "name": "my-sft-job", 5 "status": "completed", 6 "status_details": {}, 7 "steps": [ 8 { 9 "id": "platform-job-step-4RJTW5wSy4gHJ539EfTUif", 10 "error_details": {}, 11 "name": "model-and-dataset-download", 12 "status": "completed", 13 "status_details": { 14 "message": "Job completed successfully with exit code 0" 15 }, 16 "tasks": [ 17 { 18 "id": "platform-job-task-MBC2vcDEyv6Jw7RtFs5JDd", 19 "error_details": {}, 20 "error_stack": "", 21 "name": "task-5969b01a4b5a4bb181ba530ff51dafa6", 22 "status": "completed", 23 "status_details": { 24 "message": "Job completed successfully with exit code 0", 25 "phase": "completed", 26 "total_filesets": 2, 27 "completed_filesets": 1, 28 "current_fileset": "default/sft-dataset", 29 "fileset": "default/sft-dataset", 30 "total_files": 2, 31 "total_size": 2984632, 32 "downloaded_files": 2, 33 "downloaded_bytes": 2984632, 34 "current_file": "validation.jsonl", 35 "progress_pct": 100 36 } 37 } 38 ] 39 }, 40 { 41 "id": "platform-job-step-9kme8ibxDGES4t9TZvLp4X", 42 "error_details": {}, 43 "name": "customization-training-job", 44 "status": "completed", 45 "status_details": { 46 "message": "Job completed successfully with exit code 0" 47 }, 48 "tasks": [ 49 { 50 "id": "platform-job-task-SawMB6ssVZd7NCiruFVJn8", 51 "error_details": {}, 52 "error_stack": "", 53 "name": "task-2249da790a574388ba02fcabd06cd338", 54 "status": "completed", 55 "status_details": { 56 "message": "Job completed successfully with exit code 0", 57 "phase": "processing_checkpoint", 58 "backend": "automodel", 59 "max_steps": 94, 60 "num_epochs": 2, 61 "step": 94, 62 "epoch": 2, 63 "loss": 0.3437718152999878, 64 "lr": 5.000000000000001e-07, 65 "grad_norm": 20.125, 66 "val_loss": 0.5527229905128479, 67 "checkpoint_path": "/var/run/scratch/job/training/checkpoints" 68 } 69 } 70 ] 71 }, 72 { 73 "id": "platform-job-step-6gkKgfT5AwyamBWNtQFA9t", 74 "error_details": {}, 75 "name": "model-upload", 76 
"status": "completed", 77 "status_details": { 78 "message": "Job completed successfully with exit code 0" 79 }, 80 "tasks": [ 81 { 82 "id": "platform-job-task-J2wt8G8X3vkUHZJCEhzYTP", 83 "error_details": {}, 84 "error_stack": "", 85 "name": "task-9647899028aa44c493ce1d65a4ec8f7b", 86 "status": "completed", 87 "status_details": { 88 "message": "Job completed successfully with exit code 0", 89 "phase": "completed", 90 "total_filesets": 1, 91 "completed_filesets": 0, 92 "current_fileset": "default/customization-b5b20520fe4f", 93 "fileset": "default/customization-b5b20520fe4f", 94 "total_files": 7, 95 "total_size": 2488928480, 96 "uploaded_files": 7, 97 "uploaded_bytes": 2488928480, 98 "current_file": "model-00001-of-00001.safetensors", 99 "progress_pct": 100 100 } 101 } 102 ] 103 }, 104 { 105 "id": "platform-job-step-9NmC8mrS3VtthjUSo52EiB", 106 "error_details": {}, 107 "name": "model-entity-creation", 108 "status": "completed", 109 "status_details": { 110 "message": "Job completed successfully with exit code 0" 111 }, 112 "tasks": [ 113 { 114 "id": "platform-job-task-8Mbbz6AnZP5fQLu61aVRb", 115 "error_details": {}, 116 "error_stack": "", 117 "name": "task-ab80f766085f4203b72a612f38a2c1b0", 118 "status": "completed", 119 "status_details": { 120 "message": "Job completed successfully with exit code 0" 121 } 122 } 123 ] 124 } 125 ] 126 }