nemo_rl.models.generation.sglang.config#

Module Contents#

Classes#

SglangSpecificArgs

SGLang-specific configuration arguments.

SGLangConfig

Configuration for the SGLang runtime.

API#

class nemo_rl.models.generation.sglang.config.SglangSpecificArgs#

Bases: typing.TypedDict

SGLang-specific configuration arguments.

Most fields below map directly to SGLang’s ServerArgs (see: https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/server_args.py).

Initialization

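No custom initializer is documented. Because this class is a typing.TypedDict, it is populated like a plain dict, and every field listed below is optional (NotRequired).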

model_path: NotRequired[str]#

None

gpus_per_server: NotRequired[int]#

None

random_seed: NotRequired[int]#

None

skip_tokenizer_init: NotRequired[bool]#

None

disable_cuda_graph: NotRequired[bool]#

None

disable_radix_cache: NotRequired[bool]#

None

disable_cuda_graph_padding: NotRequired[bool]#

None

enable_nccl_nvls: NotRequired[bool]#

None

disable_outlines_disk_cache: NotRequired[bool]#

None

disable_custom_all_reduce: NotRequired[bool]#

None

disable_overlap_schedule: NotRequired[bool]#

None

enable_mixed_chunk: NotRequired[bool]#

None

enable_dp_attention: NotRequired[bool]#

None

enable_ep_moe: NotRequired[bool]#

None

enable_torch_compile: NotRequired[bool]#

None

torch_compile_max_bs: NotRequired[int]#

None

cuda_graph_max_bs: NotRequired[int | None]#

None

cuda_graph_bs: NotRequired[list[int] | None]#

None

torchao_config: NotRequired[str]#

None

enable_nan_detection: NotRequired[bool]#

None

enable_p2p_check: NotRequired[bool]#

None

triton_attention_reduce_in_fp32: NotRequired[bool]#

None

triton_attention_num_kv_splits: NotRequired[int]#

None

num_continuous_decode_steps: NotRequired[int]#

None

enable_memory_saver: NotRequired[bool]#

None

allow_auto_truncate: NotRequired[bool]#

None

attention_backend: NotRequired[str | None]#

None

enable_multimodal: NotRequired[bool]#

None

sampling_backend: NotRequired[str | None]#

None

context_length: NotRequired[int | None]#

None

mem_fraction_static: NotRequired[float | None]#

None

max_running_requests: NotRequired[int | None]#

None

chunked_prefill_size: NotRequired[int | None]#

None

max_prefill_tokens: NotRequired[int]#

None

schedule_policy: NotRequired[str]#

None

schedule_conservativeness: NotRequired[float]#

None

cpu_offload_gb: NotRequired[int]#

None

dtype: NotRequired[str]#

None

kv_cache_dtype: NotRequired[str]#

None

dp_size: NotRequired[int]#

None

pp_size: NotRequired[int]#

None

ep_size: NotRequired[int]#

None

enable_lora: NotRequired[bool | None]#

None

max_lora_rank: NotRequired[int | None]#

None

lora_target_modules: NotRequired[list[str] | None]#

None

lora_paths: NotRequired[list[str] | None]#

None

max_loaded_loras: NotRequired[int]#

None

max_loras_per_batch: NotRequired[int]#

None

lora_backend: NotRequired[str]#

None

log_level: NotRequired[str]#

None

log_level_http: NotRequired[str | None]#

None

log_requests: NotRequired[bool]#

None

log_requests_level: NotRequired[int]#

None

show_time_cost: NotRequired[bool]#

None

enable_metrics: NotRequired[bool]#

None

decode_log_interval: NotRequired[int]#

None

enable_multithread_load: NotRequired[bool]#

None

enable_fast_load: NotRequired[bool]#

None

skip_server_warmup: NotRequired[bool]#

None

class nemo_rl.models.generation.sglang.config.SGLangConfig#

Bases: nemo_rl.models.generation.interfaces.GenerationConfig

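Configuration for the SGLang runtime. Extends GenerationConfig with the SGLang-specific arguments (sglang_cfg) and an optional dictionary of additional keyword arguments (sglang_kwargs).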

Initialization

No custom initializer is documented; construction follows the base class, nemo_rl.models.generation.interfaces.GenerationConfig.

sglang_cfg: nemo_rl.models.generation.sglang.config.SglangSpecificArgs#

None

sglang_kwargs: NotRequired[dict[str, Any]]#

None