# Source code for nemo_gym.server_status
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from time import time
from typing import List
import requests
from devtools import pprint
from nemo_gym.server_utils import ServerClient, ServerInstanceDisplayConfig, ServerStatus
# [docs]
class StatusCommand:
    """Main class to check server status.

    Asks the head server for its registered server instances, probes each
    instance over HTTP and prints a human-readable summary.
    """

    def check_health(self, server_info: "ServerInstanceDisplayConfig") -> "ServerStatus":
        """Check if server is responding.

        Args:
            server_info: Instance to probe. Only its ``url`` attribute is read.

        Returns:
            ``"success"`` when the GET request on ``url`` completes (the HTTP
            status code is not inspected), ``"timeout"`` when the request times
            out, ``"connection_error"`` when the connection fails, and
            ``"unknown_error"`` when ``url`` is empty or anything else goes wrong.
        """
        if not server_info.url:
            return "unknown_error"

        try:
            requests.get(server_info.url, timeout=2)
        except requests.exceptions.Timeout:
            # ConnectTimeout subclasses both ConnectionError and Timeout, so
            # Timeout has to be tested first or connect timeouts would be
            # reported as plain connection errors.
            return "timeout"
        except requests.exceptions.ConnectionError:
            return "connection_error"
        except Exception:
            # Deliberate best-effort: a failed probe must never abort the listing.
            return "unknown_error"
        return "success"

    def discover_servers(self) -> List["ServerInstanceDisplayConfig"]:
        """Find all running NeMo Gym server processes.

        Queries ``/server_instances`` on the head server and health-checks
        every instance it reports.

        Returns:
            One ``ServerInstanceDisplayConfig`` per reported instance with its
            ``status`` filled in by ``check_health``. An empty list (after
            printing a hint) when the head server cannot be queried.
        """
        try:
            head_server_config = ServerClient.load_head_server_config()
            head_url = f"http://{head_server_config.host}:{head_server_config.port}"
            response = requests.get(f"{head_url}/server_instances", timeout=5)
            response.raise_for_status()
            instances = response.json()
        except (requests.RequestException, ConnectionError) as e:
            print(f"""
Could not connect to head server: {e}
Is the head server running? Start it with: `ng_run`
""")
            return []

        servers = []
        current_time = time()
        for inst in instances:
            # A missing or null start time is shown as zero uptime instead of
            # failing on ``float - None``.
            start_time = inst.get("start_time")
            uptime = current_time - start_time if start_time is not None else 0.0

            server_info = ServerInstanceDisplayConfig(
                process_name=inst["process_name"],
                server_type=inst["server_type"],
                name=inst["name"],
                host=inst.get("host"),
                port=inst.get("port"),
                url=inst.get("url"),
                entrypoint=inst.get("entrypoint"),
                pid=inst.get("pid"),
                uptime_seconds=uptime,
                # Placeholder until the probe below has run.
                status="unknown_error",
            )
            server_info.status = self.check_health(server_info)
            servers.append(server_info)

        return servers

    @staticmethod
    def _format_uptime(uptime_seconds: float) -> str:
        """Format uptime in a human readable format.

        Args:
            uptime_seconds: Elapsed time in seconds. Negative values (for
                example from clock skew) are shown as zero.

        Returns:
            A string such as ``"1d 1h 1m 1.5s"``.
        """
        # Round to whole tenths *before* splitting into units so the seconds
        # field can never be rendered as "60.0s" (e.g. 59.96 -> "1m 0.0s").
        tenths = max(0, round(uptime_seconds * 10))
        total_seconds, tenth = divmod(tenths, 10)
        minutes, seconds = divmod(total_seconds, 60)
        hours, minutes = divmod(minutes, 60)
        days, hours = divmod(hours, 24)
        return f"{days}d {hours}h {minutes}m {seconds}.{tenth}s"

    def display_status(self, servers: List["ServerInstanceDisplayConfig"]) -> None:
        """Show server info in a table.

        Args:
            servers: Instances to display, typically the result of
                ``discover_servers``. An empty list prints a short notice.
        """
        if not servers:
            print("No NeMo Gym servers found running.")
            return

        print("\nNeMo Gym Server Status:\n")

        for i, server in enumerate(servers, 1):
            status_icon = "✓" if server.status == "success" else "✗"
            print(f"[{i}] {status_icon} {server.process_name} ({server.server_type}/{server.name})")

            display_dict = {
                "server_type": server.server_type,
                "name": server.name,
                "port": server.port,
                "pid": server.pid,
                "uptime_seconds": self._format_uptime(server.uptime_seconds),
            }
            pprint(display_dict)

        healthy_count = sum(1 for s in servers if s.status == "success")
        print(f"""
{len(servers)} servers found ({healthy_count} healthy, {len(servers) - healthy_count} unhealthy)
""")