Get Job Status

View as Markdown

Get detailed execution status for a customization job, including step-by-step progress and real-time training metrics.

This endpoint provides granular execution details including:

  • Step-level status: model-and-dataset-download, customization-training-job, model-upload, model-entity-creation
  • Training metrics: step, epoch, loss, lr (learning rate), grad_norm, val_loss
  • Progress tracking: downloaded_files, uploaded_bytes, progress_pct

To list jobs or get job definitions (model entity, hyperparameters, spec), use List Active Jobs instead.

Prerequisites

Before you can get the status of a customization job, make sure that you have:

  • Obtained the base URL of your NeMo Platform.
  • Set the NMP_BASE_URL environment variable to your NeMo Platform endpoint:
$ export NMP_BASE_URL="https://your-nmp-base-url"

To Get the Status of a Customization Job

A submitted customization job runs on the platform’s Jobs service, so you poll its status through that service using the job name returned at submission (for example, automodel-a1b2c3d4e5f6). This works the same way for both the automodel and unsloth backends.

Use the SDK to get detailed job status:

import os
from nemo_platform import NeMoPlatform

# Initialize the client; falls back to a local endpoint when NMP_BASE_URL is unset
client = NeMoPlatform(
    base_url=os.environ.get("NMP_BASE_URL", "http://localhost:8080"),
    workspace="default",
)

# Get job status (use the job name returned by jobs.create)
job_name = "automodel-a1b2c3d4e5f6"
status = client.jobs.get_status(name=job_name, workspace="default")

print(f"Job: {status.name}")
print(f"Status: {status.status}")

# Check step-level status and training progress
for step in status.steps or []:
    print(f" Step '{step.name}': {step.status}")
    if step.name == "customization-training-job":
        for task in step.tasks or []:
            # status_details may be missing or empty; treat it as an empty mapping
            task_details = task.status_details or {}
            current_step = task_details.get("step")
            max_steps = task_details.get("max_steps")
            # Compare against None (not truthiness) so a reported step of 0
            # still prints a progress line
            if current_step is not None and max_steps is not None:
                print(f" Progress: {current_step}/{max_steps}")

Active Job (Training in Progress)

1{
2 "id": "platform-job-2k8i3i1HqJHHPVB5M6Bk9Z",
3 "error_details": null,
4 "name": "my-sft-job",
5 "status": "active",
6 "status_details": {},
7 "steps": [
8 {
9 "id": "platform-job-step-4RJTW5wSy4gHJ539EfTUif",
10 "error_details": {},
11 "name": "model-and-dataset-download",
12 "status": "completed",
13 "status_details": {
14 "message": "Job completed successfully with exit code 0"
15 },
16 "tasks": [
17 {
18 "id": "platform-job-task-MBC2vcDEyv6Jw7RtFs5JDd",
19 "error_details": {},
20 "error_stack": "",
21 "name": "task-5969b01a4b5a4bb181ba530ff51dafa6",
22 "status": "completed",
23 "status_details": {
24 "message": "Job completed successfully with exit code 0",
25 "phase": "completed",
26 "total_filesets": 2,
27 "completed_filesets": 1,
28 "current_fileset": "default/sft-dataset",
29 "fileset": "default/sft-dataset",
30 "total_files": 2,
31 "total_size": 2984632,
32 "downloaded_files": 2,
33 "downloaded_bytes": 2984632,
34 "current_file": "validation.jsonl",
35 "progress_pct": 100
36 }
37 }
38 ]
39 },
40 {
41 "id": "platform-job-step-9kme8ibxDGES4t9TZvLp4X",
42 "error_details": {},
43 "name": "customization-training-job",
44 "status": "active",
45 "status_details": {
46 "message": "Job is running"
47 },
48 "tasks": [
49 {
50 "id": "platform-job-task-SawMB6ssVZd7NCiruFVJn8",
51 "error_details": {},
52 "error_stack": "",
53 "name": "task-2249da790a574388ba02fcabd06cd338",
54 "status": "active",
55 "status_details": {
56 "message": "Job is running",
57 "phase": "training",
58 "backend": "automodel",
59 "max_steps": 94,
60 "num_epochs": 2,
61 "step": 8,
62 "epoch": 1,
63 "loss": 2.8918895721435547,
64 "lr": 4.9101714686276044e-05,
65 "grad_norm": 26.0
66 }
67 }
68 ]
69 }
70 ]
71}

Completed Job (All Steps Finished)

1{
2 "id": "platform-job-2k8i3i1HqJHHPVB5M6Bk9Z",
3 "error_details": null,
4 "name": "my-sft-job",
5 "status": "completed",
6 "status_details": {},
7 "steps": [
8 {
9 "id": "platform-job-step-4RJTW5wSy4gHJ539EfTUif",
10 "error_details": {},
11 "name": "model-and-dataset-download",
12 "status": "completed",
13 "status_details": {
14 "message": "Job completed successfully with exit code 0"
15 },
16 "tasks": [
17 {
18 "id": "platform-job-task-MBC2vcDEyv6Jw7RtFs5JDd",
19 "error_details": {},
20 "error_stack": "",
21 "name": "task-5969b01a4b5a4bb181ba530ff51dafa6",
22 "status": "completed",
23 "status_details": {
24 "message": "Job completed successfully with exit code 0",
25 "phase": "completed",
26 "total_filesets": 2,
27 "completed_filesets": 1,
28 "current_fileset": "default/sft-dataset",
29 "fileset": "default/sft-dataset",
30 "total_files": 2,
31 "total_size": 2984632,
32 "downloaded_files": 2,
33 "downloaded_bytes": 2984632,
34 "current_file": "validation.jsonl",
35 "progress_pct": 100
36 }
37 }
38 ]
39 },
40 {
41 "id": "platform-job-step-9kme8ibxDGES4t9TZvLp4X",
42 "error_details": {},
43 "name": "customization-training-job",
44 "status": "completed",
45 "status_details": {
46 "message": "Job completed successfully with exit code 0"
47 },
48 "tasks": [
49 {
50 "id": "platform-job-task-SawMB6ssVZd7NCiruFVJn8",
51 "error_details": {},
52 "error_stack": "",
53 "name": "task-2249da790a574388ba02fcabd06cd338",
54 "status": "completed",
55 "status_details": {
56 "message": "Job completed successfully with exit code 0",
57 "phase": "processing_checkpoint",
58 "backend": "automodel",
59 "max_steps": 94,
60 "num_epochs": 2,
61 "step": 94,
62 "epoch": 2,
63 "loss": 0.3437718152999878,
64 "lr": 5.000000000000001e-07,
65 "grad_norm": 20.125,
66 "val_loss": 0.5527229905128479,
67 "checkpoint_path": "/var/run/scratch/job/training/checkpoints"
68 }
69 }
70 ]
71 },
72 {
73 "id": "platform-job-step-6gkKgfT5AwyamBWNtQFA9t",
74 "error_details": {},
75 "name": "model-upload",
76 "status": "completed",
77 "status_details": {
78 "message": "Job completed successfully with exit code 0"
79 },
80 "tasks": [
81 {
82 "id": "platform-job-task-J2wt8G8X3vkUHZJCEhzYTP",
83 "error_details": {},
84 "error_stack": "",
85 "name": "task-9647899028aa44c493ce1d65a4ec8f7b",
86 "status": "completed",
87 "status_details": {
88 "message": "Job completed successfully with exit code 0",
89 "phase": "completed",
90 "total_filesets": 1,
91 "completed_filesets": 0,
92 "current_fileset": "default/customization-b5b20520fe4f",
93 "fileset": "default/customization-b5b20520fe4f",
94 "total_files": 7,
95 "total_size": 2488928480,
96 "uploaded_files": 7,
97 "uploaded_bytes": 2488928480,
98 "current_file": "model-00001-of-00001.safetensors",
99 "progress_pct": 100
100 }
101 }
102 ]
103 },
104 {
105 "id": "platform-job-step-9NmC8mrS3VtthjUSo52EiB",
106 "error_details": {},
107 "name": "model-entity-creation",
108 "status": "completed",
109 "status_details": {
110 "message": "Job completed successfully with exit code 0"
111 },
112 "tasks": [
113 {
114 "id": "platform-job-task-8Mbbz6AnZP5fQLu61aVRb",
115 "error_details": {},
116 "error_stack": "",
117 "name": "task-ab80f766085f4203b72a612f38a2c1b0",
118 "status": "completed",
119 "status_details": {
120 "message": "Job completed successfully with exit code 0"
121 }
122 }
123 ]
124 }
125 ]
126}