# nemo_skills
This page contains all evaluation tasks for the nemo_skills harness.
| Task | Description |
|---|---|
| AA-LCR | |
| AIME2024 | |
| AIME2025 | |
| BFCLv3 | |
| BFCLv4 | |
| GPQA Diamond | |
| HumanityLastExam | |
| HumanityLastExam aligned with AA | |
| HMMT February 2025 (MathArena/hmmt_feb_2025) | |
| IFBench - Instruction Following Benchmark | |
| IFEval - Instruction-Following Evaluation for Large Language Models | |
| LiveCodeBench v6 | |
| LiveCodeBench with AA custom prompt format (315 problems from July 2024 to Dec 2024, release_v5) | |
| LiveCodeBench v5 | |
| MMLU | |
| MMLU-PRO | |
| MMLU-ProX | |
| RULER - Long Context Understanding | |
| SciCode | |
| WMT24++ | |
## ns_aa_lcr
AA-LCR
Harness: nemo_skills
Container:
nvcr.io/nvidia/eval-factory/nemo-skills:26.01
Container Digest:
sha256:43e2c4d6e197744f7fd0a874d06c5600a8b46b54e16d333c0ebf057b6d54635a
Container Arch: multiarch
Task Type: ns_aa_lcr
cd /nemo_run/code && {% if config.params.extra.use_sandbox %}python -m nemo_skills.code_execution.local_sandbox.local_sandbox_server > {{config.output_dir}}/sandbox.log 2>&1 & SANDBOX_PID=$! && sleep 3 && {% endif %}{% if not config.params.task.startswith('ruler') %} ns prepare_data {{config.params.task}} {% else %} mkdir -p {{config.params.extra.ruler.data_dir}} && ln -sf {{config.params.extra.ruler.data_dir}} /nemo_run/code/ruler_data && ns prepare_data ruler --data_dir={{config.params.extra.ruler.data_dir}} --cluster={{config.params.extra.ruler.cluster}} --setup={{config.params.extra.ruler.setup}} --max_seq_length={{config.params.extra.ruler.max_seq_length}} --tokenizer_path={{config.params.extra.ruler.tokenizer_path}} {% if config.params.extra.ruler.template_tokens is not none %}--template_tokens={{config.params.extra.ruler.template_tokens}}{% endif %} {% if config.params.extra.ruler.num_samples is not none %}--num_samples={{config.params.extra.ruler.num_samples}}{% elif config.params.limit_samples is not none %}--num_samples={{config.params.limit_samples}}{% endif %} {% if config.params.extra.ruler.tasks is not none %}--tasks {% for task in config.params.extra.ruler.tasks %}{{task}}{% if not loop.last %} {% endif %}{% endfor %}{% endif %} {% endif %} && ns eval --server_type=openai --model={{target.api_endpoint.model_id}} --server_address={{target.api_endpoint.url}} --benchmarks={{config.params.task}}{% if config.params.extra.num_repeats is not none and config.params.extra.num_repeats > 1 %}:{{config.params.extra.num_repeats}}{% endif %} --output_dir={{config.output_dir}} {% if config.params.extra.dataset_split is not none %}--split={{config.params.extra.dataset_split}}{% endif %} {% if config.params.extra.ruler.data_dir is not none %}--data_dir={{config.params.extra.ruler.data_dir}}{% endif %} ++server.api_key_env_var={% if target.api_endpoint.api_key_name is not none %}{{target.api_endpoint.api_key_name}}{% else %}DUMMY_API_KEY{% endif %} {% if 
config.params.max_new_tokens is not none %}++inference.tokens_to_generate={{config.params.max_new_tokens}}{% endif %} {% if config.params.extra.system_message is not none %} ++system_message='{{config.params.extra.system_message}}' {% endif %} {% if config.params.limit_samples is not none %}++max_samples={{config.params.limit_samples}}{% endif %} {% if config.params.parallelism is not none %}{% if config.params.extra.num_repeats is not none and config.params.extra.num_repeats > 1 %}++max_concurrent_requests={{[(config.params.parallelism / config.params.extra.num_repeats) | int, 1] | max}}{% else %}++max_concurrent_requests={{config.params.parallelism | int}}{% endif %}{% endif %} {% if config.params.temperature is not none %}++inference.temperature={{config.params.temperature}}{% endif %} {% if config.params.top_p is not none %}++inference.top_p={{config.params.top_p}}{% endif %} {% if config.params.extra.prompt_config is not none %}++prompt_config={{config.params.extra.prompt_config}}{% endif %} {% if config.params.extra.ruler.tokenizer_path is not none %}++tokenizer={{config.params.extra.ruler.tokenizer_path}}{% endif %} {% if config.params.extra.args is not none %} {{config.params.extra.args}} {% endif %} {% if config.params.extra.judge_support and config.params.extra.judge.url is not none %} --judge_model={{config.params.extra.judge.model_id}} --judge_server_address={{config.params.extra.judge.url}} --judge_server_type=openai {% if config.params.extra.judge.generation_type is not none %} --judge_generation_type={{config.params.extra.judge.generation_type}} {% endif %} --extra_judge_args="++server.api_key_env_var={% if config.params.extra.judge.api_key is not none %}{{config.params.extra.judge.api_key}}{% else %}DUMMY_API_KEY{% endif %} {%- if config.params.extra.judge.temperature is not none %} ++inference.temperature={{config.params.extra.judge.temperature}}{% endif %} {%- if config.params.extra.judge.top_p is not none %} 
++inference.top_p={{config.params.extra.judge.top_p}}{% endif %} {%- if config.params.extra.judge.max_new_tokens is not none %} ++inference.tokens_to_generate={{config.params.extra.judge.max_new_tokens}}{% endif %} {%- if config.params.extra.judge.parallelism is not none %} ++max_concurrent_requests={{config.params.extra.judge.parallelism}}{% endif %} {%- if config.params.extra.judge.args is not none %} {{config.params.extra.judge.args}}{% endif %}" {% endif %} {% if config.params.extra.use_sandbox %} ; EXIT_CODE=$? ; kill $SANDBOX_PID 2>/dev/null || true ; exit $EXIT_CODE{% endif %}
framework_name: nemo_skills
pkg_name: nemo_skills
config:
params:
parallelism: 16
task: aalcr
extra:
use_sandbox: false
num_repeats: null
prompt_config: null
args: null
system_message: null
dataset_split: null
judge_support: true
judge:
url: null
model_id: null
api_key: null
generation_type: null
random_seed: 1234
temperature: 0.0
top_p: 1.0
max_new_tokens: 4096
args: null
parallelism: null
ruler:
data_dir: null
cluster: null
setup: null
max_seq_length: null
tokenizer_path: null
template_tokens: null
num_samples: null
tasks: null
supported_endpoint_types:
- chat
type: ns_aa_lcr
target: {}
## ns_aime2024
AIME2024
Harness: nemo_skills
Container:
nvcr.io/nvidia/eval-factory/nemo-skills:26.01
Container Digest:
sha256:43e2c4d6e197744f7fd0a874d06c5600a8b46b54e16d333c0ebf057b6d54635a
Container Arch: multiarch
Task Type: ns_aime2024
cd /nemo_run/code && {% if config.params.extra.use_sandbox %}python -m nemo_skills.code_execution.local_sandbox.local_sandbox_server > {{config.output_dir}}/sandbox.log 2>&1 & SANDBOX_PID=$! && sleep 3 && {% endif %}{% if not config.params.task.startswith('ruler') %} ns prepare_data {{config.params.task}} {% else %} mkdir -p {{config.params.extra.ruler.data_dir}} && ln -sf {{config.params.extra.ruler.data_dir}} /nemo_run/code/ruler_data && ns prepare_data ruler --data_dir={{config.params.extra.ruler.data_dir}} --cluster={{config.params.extra.ruler.cluster}} --setup={{config.params.extra.ruler.setup}} --max_seq_length={{config.params.extra.ruler.max_seq_length}} --tokenizer_path={{config.params.extra.ruler.tokenizer_path}} {% if config.params.extra.ruler.template_tokens is not none %}--template_tokens={{config.params.extra.ruler.template_tokens}}{% endif %} {% if config.params.extra.ruler.num_samples is not none %}--num_samples={{config.params.extra.ruler.num_samples}}{% elif config.params.limit_samples is not none %}--num_samples={{config.params.limit_samples}}{% endif %} {% if config.params.extra.ruler.tasks is not none %}--tasks {% for task in config.params.extra.ruler.tasks %}{{task}}{% if not loop.last %} {% endif %}{% endfor %}{% endif %} {% endif %} && ns eval --server_type=openai --model={{target.api_endpoint.model_id}} --server_address={{target.api_endpoint.url}} --benchmarks={{config.params.task}}{% if config.params.extra.num_repeats is not none and config.params.extra.num_repeats > 1 %}:{{config.params.extra.num_repeats}}{% endif %} --output_dir={{config.output_dir}} {% if config.params.extra.dataset_split is not none %}--split={{config.params.extra.dataset_split}}{% endif %} {% if config.params.extra.ruler.data_dir is not none %}--data_dir={{config.params.extra.ruler.data_dir}}{% endif %} ++server.api_key_env_var={% if target.api_endpoint.api_key_name is not none %}{{target.api_endpoint.api_key_name}}{% else %}DUMMY_API_KEY{% endif %} {% if 
config.params.max_new_tokens is not none %}++inference.tokens_to_generate={{config.params.max_new_tokens}}{% endif %} {% if config.params.extra.system_message is not none %} ++system_message='{{config.params.extra.system_message}}' {% endif %} {% if config.params.limit_samples is not none %}++max_samples={{config.params.limit_samples}}{% endif %} {% if config.params.parallelism is not none %}{% if config.params.extra.num_repeats is not none and config.params.extra.num_repeats > 1 %}++max_concurrent_requests={{[(config.params.parallelism / config.params.extra.num_repeats) | int, 1] | max}}{% else %}++max_concurrent_requests={{config.params.parallelism | int}}{% endif %}{% endif %} {% if config.params.temperature is not none %}++inference.temperature={{config.params.temperature}}{% endif %} {% if config.params.top_p is not none %}++inference.top_p={{config.params.top_p}}{% endif %} {% if config.params.extra.prompt_config is not none %}++prompt_config={{config.params.extra.prompt_config}}{% endif %} {% if config.params.extra.ruler.tokenizer_path is not none %}++tokenizer={{config.params.extra.ruler.tokenizer_path}}{% endif %} {% if config.params.extra.args is not none %} {{config.params.extra.args}} {% endif %} {% if config.params.extra.judge_support and config.params.extra.judge.url is not none %} --judge_model={{config.params.extra.judge.model_id}} --judge_server_address={{config.params.extra.judge.url}} --judge_server_type=openai {% if config.params.extra.judge.generation_type is not none %} --judge_generation_type={{config.params.extra.judge.generation_type}} {% endif %} --extra_judge_args="++server.api_key_env_var={% if config.params.extra.judge.api_key is not none %}{{config.params.extra.judge.api_key}}{% else %}DUMMY_API_KEY{% endif %} {%- if config.params.extra.judge.temperature is not none %} ++inference.temperature={{config.params.extra.judge.temperature}}{% endif %} {%- if config.params.extra.judge.top_p is not none %} 
++inference.top_p={{config.params.extra.judge.top_p}}{% endif %} {%- if config.params.extra.judge.max_new_tokens is not none %} ++inference.tokens_to_generate={{config.params.extra.judge.max_new_tokens}}{% endif %} {%- if config.params.extra.judge.parallelism is not none %} ++max_concurrent_requests={{config.params.extra.judge.parallelism}}{% endif %} {%- if config.params.extra.judge.args is not none %} {{config.params.extra.judge.args}}{% endif %}" {% endif %} {% if config.params.extra.use_sandbox %} ; EXIT_CODE=$? ; kill $SANDBOX_PID 2>/dev/null || true ; exit $EXIT_CODE{% endif %}
framework_name: nemo_skills
pkg_name: nemo_skills
config:
params:
parallelism: 16
task: aime24
extra:
use_sandbox: false
num_repeats: null
prompt_config: null
args: null
system_message: null
dataset_split: null
judge_support: true
judge:
url: null
model_id: null
api_key: null
generation_type: math_judge
random_seed: 1234
temperature: null
top_p: null
max_new_tokens: null
args: null
parallelism: null
ruler:
data_dir: null
cluster: null
setup: null
max_seq_length: null
tokenizer_path: null
template_tokens: null
num_samples: null
tasks: null
supported_endpoint_types:
- chat
type: ns_aime2024
target: {}
## ns_aime2025
AIME2025
Harness: nemo_skills
Container:
nvcr.io/nvidia/eval-factory/nemo-skills:26.01
Container Digest:
sha256:43e2c4d6e197744f7fd0a874d06c5600a8b46b54e16d333c0ebf057b6d54635a
Container Arch: multiarch
Task Type: ns_aime2025
cd /nemo_run/code && {% if config.params.extra.use_sandbox %}python -m nemo_skills.code_execution.local_sandbox.local_sandbox_server > {{config.output_dir}}/sandbox.log 2>&1 & SANDBOX_PID=$! && sleep 3 && {% endif %}{% if not config.params.task.startswith('ruler') %} ns prepare_data {{config.params.task}} {% else %} mkdir -p {{config.params.extra.ruler.data_dir}} && ln -sf {{config.params.extra.ruler.data_dir}} /nemo_run/code/ruler_data && ns prepare_data ruler --data_dir={{config.params.extra.ruler.data_dir}} --cluster={{config.params.extra.ruler.cluster}} --setup={{config.params.extra.ruler.setup}} --max_seq_length={{config.params.extra.ruler.max_seq_length}} --tokenizer_path={{config.params.extra.ruler.tokenizer_path}} {% if config.params.extra.ruler.template_tokens is not none %}--template_tokens={{config.params.extra.ruler.template_tokens}}{% endif %} {% if config.params.extra.ruler.num_samples is not none %}--num_samples={{config.params.extra.ruler.num_samples}}{% elif config.params.limit_samples is not none %}--num_samples={{config.params.limit_samples}}{% endif %} {% if config.params.extra.ruler.tasks is not none %}--tasks {% for task in config.params.extra.ruler.tasks %}{{task}}{% if not loop.last %} {% endif %}{% endfor %}{% endif %} {% endif %} && ns eval --server_type=openai --model={{target.api_endpoint.model_id}} --server_address={{target.api_endpoint.url}} --benchmarks={{config.params.task}}{% if config.params.extra.num_repeats is not none and config.params.extra.num_repeats > 1 %}:{{config.params.extra.num_repeats}}{% endif %} --output_dir={{config.output_dir}} {% if config.params.extra.dataset_split is not none %}--split={{config.params.extra.dataset_split}}{% endif %} {% if config.params.extra.ruler.data_dir is not none %}--data_dir={{config.params.extra.ruler.data_dir}}{% endif %} ++server.api_key_env_var={% if target.api_endpoint.api_key_name is not none %}{{target.api_endpoint.api_key_name}}{% else %}DUMMY_API_KEY{% endif %} {% if 
config.params.max_new_tokens is not none %}++inference.tokens_to_generate={{config.params.max_new_tokens}}{% endif %} {% if config.params.extra.system_message is not none %} ++system_message='{{config.params.extra.system_message}}' {% endif %} {% if config.params.limit_samples is not none %}++max_samples={{config.params.limit_samples}}{% endif %} {% if config.params.parallelism is not none %}{% if config.params.extra.num_repeats is not none and config.params.extra.num_repeats > 1 %}++max_concurrent_requests={{[(config.params.parallelism / config.params.extra.num_repeats) | int, 1] | max}}{% else %}++max_concurrent_requests={{config.params.parallelism | int}}{% endif %}{% endif %} {% if config.params.temperature is not none %}++inference.temperature={{config.params.temperature}}{% endif %} {% if config.params.top_p is not none %}++inference.top_p={{config.params.top_p}}{% endif %} {% if config.params.extra.prompt_config is not none %}++prompt_config={{config.params.extra.prompt_config}}{% endif %} {% if config.params.extra.ruler.tokenizer_path is not none %}++tokenizer={{config.params.extra.ruler.tokenizer_path}}{% endif %} {% if config.params.extra.args is not none %} {{config.params.extra.args}} {% endif %} {% if config.params.extra.judge_support and config.params.extra.judge.url is not none %} --judge_model={{config.params.extra.judge.model_id}} --judge_server_address={{config.params.extra.judge.url}} --judge_server_type=openai {% if config.params.extra.judge.generation_type is not none %} --judge_generation_type={{config.params.extra.judge.generation_type}} {% endif %} --extra_judge_args="++server.api_key_env_var={% if config.params.extra.judge.api_key is not none %}{{config.params.extra.judge.api_key}}{% else %}DUMMY_API_KEY{% endif %} {%- if config.params.extra.judge.temperature is not none %} ++inference.temperature={{config.params.extra.judge.temperature}}{% endif %} {%- if config.params.extra.judge.top_p is not none %} 
++inference.top_p={{config.params.extra.judge.top_p}}{% endif %} {%- if config.params.extra.judge.max_new_tokens is not none %} ++inference.tokens_to_generate={{config.params.extra.judge.max_new_tokens}}{% endif %} {%- if config.params.extra.judge.parallelism is not none %} ++max_concurrent_requests={{config.params.extra.judge.parallelism}}{% endif %} {%- if config.params.extra.judge.args is not none %} {{config.params.extra.judge.args}}{% endif %}" {% endif %} {% if config.params.extra.use_sandbox %} ; EXIT_CODE=$? ; kill $SANDBOX_PID 2>/dev/null || true ; exit $EXIT_CODE{% endif %}
framework_name: nemo_skills
pkg_name: nemo_skills
config:
params:
parallelism: 16
task: aime25
extra:
use_sandbox: false
num_repeats: null
prompt_config: null
args: null
system_message: null
dataset_split: null
judge_support: true
judge:
url: null
model_id: null
api_key: null
generation_type: math_judge
random_seed: 1234
temperature: null
top_p: null
max_new_tokens: null
args: null
parallelism: null
ruler:
data_dir: null
cluster: null
setup: null
max_seq_length: null
tokenizer_path: null
template_tokens: null
num_samples: null
tasks: null
supported_endpoint_types:
- chat
type: ns_aime2025
target: {}
## ns_bfcl_v3
BFCLv3
Harness: nemo_skills
Container:
nvcr.io/nvidia/eval-factory/nemo-skills:26.01
Container Digest:
sha256:43e2c4d6e197744f7fd0a874d06c5600a8b46b54e16d333c0ebf057b6d54635a
Container Arch: multiarch
Task Type: ns_bfcl_v3
cd /nemo_run/code && {% if config.params.extra.use_sandbox %}python -m nemo_skills.code_execution.local_sandbox.local_sandbox_server > {{config.output_dir}}/sandbox.log 2>&1 & SANDBOX_PID=$! && sleep 3 && {% endif %}{% if not config.params.task.startswith('ruler') %} ns prepare_data {{config.params.task}} {% else %} mkdir -p {{config.params.extra.ruler.data_dir}} && ln -sf {{config.params.extra.ruler.data_dir}} /nemo_run/code/ruler_data && ns prepare_data ruler --data_dir={{config.params.extra.ruler.data_dir}} --cluster={{config.params.extra.ruler.cluster}} --setup={{config.params.extra.ruler.setup}} --max_seq_length={{config.params.extra.ruler.max_seq_length}} --tokenizer_path={{config.params.extra.ruler.tokenizer_path}} {% if config.params.extra.ruler.template_tokens is not none %}--template_tokens={{config.params.extra.ruler.template_tokens}}{% endif %} {% if config.params.extra.ruler.num_samples is not none %}--num_samples={{config.params.extra.ruler.num_samples}}{% elif config.params.limit_samples is not none %}--num_samples={{config.params.limit_samples}}{% endif %} {% if config.params.extra.ruler.tasks is not none %}--tasks {% for task in config.params.extra.ruler.tasks %}{{task}}{% if not loop.last %} {% endif %}{% endfor %}{% endif %} {% endif %} && ns eval --server_type=openai --model={{target.api_endpoint.model_id}} --server_address={{target.api_endpoint.url}} --benchmarks={{config.params.task}}{% if config.params.extra.num_repeats is not none and config.params.extra.num_repeats > 1 %}:{{config.params.extra.num_repeats}}{% endif %} --output_dir={{config.output_dir}} {% if config.params.extra.dataset_split is not none %}--split={{config.params.extra.dataset_split}}{% endif %} {% if config.params.extra.ruler.data_dir is not none %}--data_dir={{config.params.extra.ruler.data_dir}}{% endif %} ++server.api_key_env_var={% if target.api_endpoint.api_key_name is not none %}{{target.api_endpoint.api_key_name}}{% else %}DUMMY_API_KEY{% endif %} {% if 
config.params.max_new_tokens is not none %}++inference.tokens_to_generate={{config.params.max_new_tokens}}{% endif %} {% if config.params.extra.system_message is not none %} ++system_message='{{config.params.extra.system_message}}' {% endif %} {% if config.params.limit_samples is not none %}++max_samples={{config.params.limit_samples}}{% endif %} {% if config.params.parallelism is not none %}{% if config.params.extra.num_repeats is not none and config.params.extra.num_repeats > 1 %}++max_concurrent_requests={{[(config.params.parallelism / config.params.extra.num_repeats) | int, 1] | max}}{% else %}++max_concurrent_requests={{config.params.parallelism | int}}{% endif %}{% endif %} {% if config.params.temperature is not none %}++inference.temperature={{config.params.temperature}}{% endif %} {% if config.params.top_p is not none %}++inference.top_p={{config.params.top_p}}{% endif %} {% if config.params.extra.prompt_config is not none %}++prompt_config={{config.params.extra.prompt_config}}{% endif %} {% if config.params.extra.ruler.tokenizer_path is not none %}++tokenizer={{config.params.extra.ruler.tokenizer_path}}{% endif %} {% if config.params.extra.args is not none %} {{config.params.extra.args}} {% endif %} {% if config.params.extra.judge_support and config.params.extra.judge.url is not none %} --judge_model={{config.params.extra.judge.model_id}} --judge_server_address={{config.params.extra.judge.url}} --judge_server_type=openai {% if config.params.extra.judge.generation_type is not none %} --judge_generation_type={{config.params.extra.judge.generation_type}} {% endif %} --extra_judge_args="++server.api_key_env_var={% if config.params.extra.judge.api_key is not none %}{{config.params.extra.judge.api_key}}{% else %}DUMMY_API_KEY{% endif %} {%- if config.params.extra.judge.temperature is not none %} ++inference.temperature={{config.params.extra.judge.temperature}}{% endif %} {%- if config.params.extra.judge.top_p is not none %} 
++inference.top_p={{config.params.extra.judge.top_p}}{% endif %} {%- if config.params.extra.judge.max_new_tokens is not none %} ++inference.tokens_to_generate={{config.params.extra.judge.max_new_tokens}}{% endif %} {%- if config.params.extra.judge.parallelism is not none %} ++max_concurrent_requests={{config.params.extra.judge.parallelism}}{% endif %} {%- if config.params.extra.judge.args is not none %} {{config.params.extra.judge.args}}{% endif %}" {% endif %} {% if config.params.extra.use_sandbox %} ; EXIT_CODE=$? ; kill $SANDBOX_PID 2>/dev/null || true ; exit $EXIT_CODE{% endif %}
framework_name: nemo_skills
pkg_name: nemo_skills
config:
params:
parallelism: 16
task: bfcl_v3
extra:
use_sandbox: false
num_repeats: null
prompt_config: null
args: ++use_client_parsing=False
system_message: null
dataset_split: null
judge_support: false
judge:
url: null
model_id: null
api_key: null
generation_type: null
random_seed: 1234
temperature: null
top_p: null
max_new_tokens: null
args: null
parallelism: null
ruler:
data_dir: null
cluster: null
setup: null
max_seq_length: null
tokenizer_path: null
template_tokens: null
num_samples: null
tasks: null
supported_endpoint_types:
- chat
type: ns_bfcl_v3
target: {}
## ns_bfcl_v4
BFCLv4
Harness: nemo_skills
Container:
nvcr.io/nvidia/eval-factory/nemo-skills:26.01
Container Digest:
sha256:43e2c4d6e197744f7fd0a874d06c5600a8b46b54e16d333c0ebf057b6d54635a
Container Arch: multiarch
Task Type: ns_bfcl_v4
cd /nemo_run/code && {% if config.params.extra.use_sandbox %}python -m nemo_skills.code_execution.local_sandbox.local_sandbox_server > {{config.output_dir}}/sandbox.log 2>&1 & SANDBOX_PID=$! && sleep 3 && {% endif %}{% if not config.params.task.startswith('ruler') %} ns prepare_data {{config.params.task}} {% else %} mkdir -p {{config.params.extra.ruler.data_dir}} && ln -sf {{config.params.extra.ruler.data_dir}} /nemo_run/code/ruler_data && ns prepare_data ruler --data_dir={{config.params.extra.ruler.data_dir}} --cluster={{config.params.extra.ruler.cluster}} --setup={{config.params.extra.ruler.setup}} --max_seq_length={{config.params.extra.ruler.max_seq_length}} --tokenizer_path={{config.params.extra.ruler.tokenizer_path}} {% if config.params.extra.ruler.template_tokens is not none %}--template_tokens={{config.params.extra.ruler.template_tokens}}{% endif %} {% if config.params.extra.ruler.num_samples is not none %}--num_samples={{config.params.extra.ruler.num_samples}}{% elif config.params.limit_samples is not none %}--num_samples={{config.params.limit_samples}}{% endif %} {% if config.params.extra.ruler.tasks is not none %}--tasks {% for task in config.params.extra.ruler.tasks %}{{task}}{% if not loop.last %} {% endif %}{% endfor %}{% endif %} {% endif %} && ns eval --server_type=openai --model={{target.api_endpoint.model_id}} --server_address={{target.api_endpoint.url}} --benchmarks={{config.params.task}}{% if config.params.extra.num_repeats is not none and config.params.extra.num_repeats > 1 %}:{{config.params.extra.num_repeats}}{% endif %} --output_dir={{config.output_dir}} {% if config.params.extra.dataset_split is not none %}--split={{config.params.extra.dataset_split}}{% endif %} {% if config.params.extra.ruler.data_dir is not none %}--data_dir={{config.params.extra.ruler.data_dir}}{% endif %} ++server.api_key_env_var={% if target.api_endpoint.api_key_name is not none %}{{target.api_endpoint.api_key_name}}{% else %}DUMMY_API_KEY{% endif %} {% if 
config.params.max_new_tokens is not none %}++inference.tokens_to_generate={{config.params.max_new_tokens}}{% endif %} {% if config.params.extra.system_message is not none %} ++system_message='{{config.params.extra.system_message}}' {% endif %} {% if config.params.limit_samples is not none %}++max_samples={{config.params.limit_samples}}{% endif %} {% if config.params.parallelism is not none %}{% if config.params.extra.num_repeats is not none and config.params.extra.num_repeats > 1 %}++max_concurrent_requests={{[(config.params.parallelism / config.params.extra.num_repeats) | int, 1] | max}}{% else %}++max_concurrent_requests={{config.params.parallelism | int}}{% endif %}{% endif %} {% if config.params.temperature is not none %}++inference.temperature={{config.params.temperature}}{% endif %} {% if config.params.top_p is not none %}++inference.top_p={{config.params.top_p}}{% endif %} {% if config.params.extra.prompt_config is not none %}++prompt_config={{config.params.extra.prompt_config}}{% endif %} {% if config.params.extra.ruler.tokenizer_path is not none %}++tokenizer={{config.params.extra.ruler.tokenizer_path}}{% endif %} {% if config.params.extra.args is not none %} {{config.params.extra.args}} {% endif %} {% if config.params.extra.judge_support and config.params.extra.judge.url is not none %} --judge_model={{config.params.extra.judge.model_id}} --judge_server_address={{config.params.extra.judge.url}} --judge_server_type=openai {% if config.params.extra.judge.generation_type is not none %} --judge_generation_type={{config.params.extra.judge.generation_type}} {% endif %} --extra_judge_args="++server.api_key_env_var={% if config.params.extra.judge.api_key is not none %}{{config.params.extra.judge.api_key}}{% else %}DUMMY_API_KEY{% endif %} {%- if config.params.extra.judge.temperature is not none %} ++inference.temperature={{config.params.extra.judge.temperature}}{% endif %} {%- if config.params.extra.judge.top_p is not none %} 
++inference.top_p={{config.params.extra.judge.top_p}}{% endif %} {%- if config.params.extra.judge.max_new_tokens is not none %} ++inference.tokens_to_generate={{config.params.extra.judge.max_new_tokens}}{% endif %} {%- if config.params.extra.judge.parallelism is not none %} ++max_concurrent_requests={{config.params.extra.judge.parallelism}}{% endif %} {%- if config.params.extra.judge.args is not none %} {{config.params.extra.judge.args}}{% endif %}" {% endif %} {% if config.params.extra.use_sandbox %} ; EXIT_CODE=$? ; kill $SANDBOX_PID 2>/dev/null || true ; exit $EXIT_CODE{% endif %}
framework_name: nemo_skills
pkg_name: nemo_skills
config:
  params:
    parallelism: 16
    task: bfcl_v4
    extra:
      use_sandbox: false
      num_repeats: null
      prompt_config: null
      args: ++use_client_parsing=False
      system_message: null
      dataset_split: null
      judge_support: false
      judge:
        url: null
        model_id: null
        api_key: null
        generation_type: null
        random_seed: 1234
        temperature: null
        top_p: null
        max_new_tokens: null
        args: null
        parallelism: null
      ruler:
        data_dir: null
        cluster: null
        setup: null
        max_seq_length: null
        tokenizer_path: null
        template_tokens: null
        num_samples: null
        tasks: null
  supported_endpoint_types:
    - chat
  type: ns_bfcl_v4
target: {}
ns_gpqa#
GPQA Diamond
Harness: nemo_skills
Container:
nvcr.io/nvidia/eval-factory/nemo-skills:26.01
Container Digest:
sha256:43e2c4d6e197744f7fd0a874d06c5600a8b46b54e16d333c0ebf057b6d54635a
Container Arch: multiarch
Task Type: ns_gpqa
cd /nemo_run/code && {% if config.params.extra.use_sandbox %}python -m nemo_skills.code_execution.local_sandbox.local_sandbox_server > {{config.output_dir}}/sandbox.log 2>&1 & SANDBOX_PID=$! && sleep 3 && {% endif %}{% if not config.params.task.startswith('ruler') %} ns prepare_data {{config.params.task}} {% else %} mkdir -p {{config.params.extra.ruler.data_dir}} && ln -sf {{config.params.extra.ruler.data_dir}} /nemo_run/code/ruler_data && ns prepare_data ruler --data_dir={{config.params.extra.ruler.data_dir}} --cluster={{config.params.extra.ruler.cluster}} --setup={{config.params.extra.ruler.setup}} --max_seq_length={{config.params.extra.ruler.max_seq_length}} --tokenizer_path={{config.params.extra.ruler.tokenizer_path}} {% if config.params.extra.ruler.template_tokens is not none %}--template_tokens={{config.params.extra.ruler.template_tokens}}{% endif %} {% if config.params.extra.ruler.num_samples is not none %}--num_samples={{config.params.extra.ruler.num_samples}}{% elif config.params.limit_samples is not none %}--num_samples={{config.params.limit_samples}}{% endif %} {% if config.params.extra.ruler.tasks is not none %}--tasks {% for task in config.params.extra.ruler.tasks %}{{task}}{% if not loop.last %} {% endif %}{% endfor %}{% endif %} {% endif %} && ns eval --server_type=openai --model={{target.api_endpoint.model_id}} --server_address={{target.api_endpoint.url}} --benchmarks={{config.params.task}}{% if config.params.extra.num_repeats is not none and config.params.extra.num_repeats > 1 %}:{{config.params.extra.num_repeats}}{% endif %} --output_dir={{config.output_dir}} {% if config.params.extra.dataset_split is not none %}--split={{config.params.extra.dataset_split}}{% endif %} {% if config.params.extra.ruler.data_dir is not none %}--data_dir={{config.params.extra.ruler.data_dir}}{% endif %} ++server.api_key_env_var={% if target.api_endpoint.api_key_name is not none %}{{target.api_endpoint.api_key_name}}{% else %}DUMMY_API_KEY{% endif %} {% if 
config.params.max_new_tokens is not none %}++inference.tokens_to_generate={{config.params.max_new_tokens}}{% endif %} {% if config.params.extra.system_message is not none %} ++system_message='{{config.params.extra.system_message}}' {% endif %} {% if config.params.limit_samples is not none %}++max_samples={{config.params.limit_samples}}{% endif %} {% if config.params.parallelism is not none %}{% if config.params.extra.num_repeats is not none and config.params.extra.num_repeats > 1 %}++max_concurrent_requests={{[(config.params.parallelism / config.params.extra.num_repeats) | int, 1] | max}}{% else %}++max_concurrent_requests={{config.params.parallelism | int}}{% endif %}{% endif %} {% if config.params.temperature is not none %}++inference.temperature={{config.params.temperature}}{% endif %} {% if config.params.top_p is not none %}++inference.top_p={{config.params.top_p}}{% endif %} {% if config.params.extra.prompt_config is not none %}++prompt_config={{config.params.extra.prompt_config}}{% endif %} {% if config.params.extra.ruler.tokenizer_path is not none %}++tokenizer={{config.params.extra.ruler.tokenizer_path}}{% endif %} {% if config.params.extra.args is not none %} {{config.params.extra.args}} {% endif %} {% if config.params.extra.judge_support and config.params.extra.judge.url is not none %} --judge_model={{config.params.extra.judge.model_id}} --judge_server_address={{config.params.extra.judge.url}} --judge_server_type=openai {% if config.params.extra.judge.generation_type is not none %} --judge_generation_type={{config.params.extra.judge.generation_type}} {% endif %} --extra_judge_args="++server.api_key_env_var={% if config.params.extra.judge.api_key is not none %}{{config.params.extra.judge.api_key}}{% else %}DUMMY_API_KEY{% endif %} {%- if config.params.extra.judge.temperature is not none %} ++inference.temperature={{config.params.extra.judge.temperature}}{% endif %} {%- if config.params.extra.judge.top_p is not none %} 
++inference.top_p={{config.params.extra.judge.top_p}}{% endif %} {%- if config.params.extra.judge.max_new_tokens is not none %} ++inference.tokens_to_generate={{config.params.extra.judge.max_new_tokens}}{% endif %} {%- if config.params.extra.judge.parallelism is not none %} ++max_concurrent_requests={{config.params.extra.judge.parallelism}}{% endif %} {%- if config.params.extra.judge.args is not none %} {{config.params.extra.judge.args}}{% endif %}" {% endif %} {% if config.params.extra.use_sandbox %} ; EXIT_CODE=$? ; kill $SANDBOX_PID 2>/dev/null || true ; exit $EXIT_CODE{% endif %}
framework_name: nemo_skills
pkg_name: nemo_skills
config:
  params:
    parallelism: 16
    task: gpqa
    extra:
      use_sandbox: false
      num_repeats: null
      prompt_config: null
      args: null
      system_message: null
      dataset_split: null
      judge_support: false
      judge:
        url: null
        model_id: null
        api_key: null
        generation_type: null
        random_seed: 1234
        temperature: null
        top_p: null
        max_new_tokens: null
        args: null
        parallelism: null
      ruler:
        data_dir: null
        cluster: null
        setup: null
        max_seq_length: null
        tokenizer_path: null
        template_tokens: null
        num_samples: null
        tasks: null
  supported_endpoint_types:
    - chat
  type: ns_gpqa
target: {}
ns_hle#
HumanityLastExam
Harness: nemo_skills
Container:
nvcr.io/nvidia/eval-factory/nemo-skills:26.01
Container Digest:
sha256:43e2c4d6e197744f7fd0a874d06c5600a8b46b54e16d333c0ebf057b6d54635a
Container Arch: multiarch
Task Type: ns_hle
cd /nemo_run/code && {% if config.params.extra.use_sandbox %}python -m nemo_skills.code_execution.local_sandbox.local_sandbox_server > {{config.output_dir}}/sandbox.log 2>&1 & SANDBOX_PID=$! && sleep 3 && {% endif %}{% if not config.params.task.startswith('ruler') %} ns prepare_data {{config.params.task}} {% else %} mkdir -p {{config.params.extra.ruler.data_dir}} && ln -sf {{config.params.extra.ruler.data_dir}} /nemo_run/code/ruler_data && ns prepare_data ruler --data_dir={{config.params.extra.ruler.data_dir}} --cluster={{config.params.extra.ruler.cluster}} --setup={{config.params.extra.ruler.setup}} --max_seq_length={{config.params.extra.ruler.max_seq_length}} --tokenizer_path={{config.params.extra.ruler.tokenizer_path}} {% if config.params.extra.ruler.template_tokens is not none %}--template_tokens={{config.params.extra.ruler.template_tokens}}{% endif %} {% if config.params.extra.ruler.num_samples is not none %}--num_samples={{config.params.extra.ruler.num_samples}}{% elif config.params.limit_samples is not none %}--num_samples={{config.params.limit_samples}}{% endif %} {% if config.params.extra.ruler.tasks is not none %}--tasks {% for task in config.params.extra.ruler.tasks %}{{task}}{% if not loop.last %} {% endif %}{% endfor %}{% endif %} {% endif %} && ns eval --server_type=openai --model={{target.api_endpoint.model_id}} --server_address={{target.api_endpoint.url}} --benchmarks={{config.params.task}}{% if config.params.extra.num_repeats is not none and config.params.extra.num_repeats > 1 %}:{{config.params.extra.num_repeats}}{% endif %} --output_dir={{config.output_dir}} {% if config.params.extra.dataset_split is not none %}--split={{config.params.extra.dataset_split}}{% endif %} {% if config.params.extra.ruler.data_dir is not none %}--data_dir={{config.params.extra.ruler.data_dir}}{% endif %} ++server.api_key_env_var={% if target.api_endpoint.api_key_name is not none %}{{target.api_endpoint.api_key_name}}{% else %}DUMMY_API_KEY{% endif %} {% if 
config.params.max_new_tokens is not none %}++inference.tokens_to_generate={{config.params.max_new_tokens}}{% endif %} {% if config.params.extra.system_message is not none %} ++system_message='{{config.params.extra.system_message}}' {% endif %} {% if config.params.limit_samples is not none %}++max_samples={{config.params.limit_samples}}{% endif %} {% if config.params.parallelism is not none %}{% if config.params.extra.num_repeats is not none and config.params.extra.num_repeats > 1 %}++max_concurrent_requests={{[(config.params.parallelism / config.params.extra.num_repeats) | int, 1] | max}}{% else %}++max_concurrent_requests={{config.params.parallelism | int}}{% endif %}{% endif %} {% if config.params.temperature is not none %}++inference.temperature={{config.params.temperature}}{% endif %} {% if config.params.top_p is not none %}++inference.top_p={{config.params.top_p}}{% endif %} {% if config.params.extra.prompt_config is not none %}++prompt_config={{config.params.extra.prompt_config}}{% endif %} {% if config.params.extra.ruler.tokenizer_path is not none %}++tokenizer={{config.params.extra.ruler.tokenizer_path}}{% endif %} {% if config.params.extra.args is not none %} {{config.params.extra.args}} {% endif %} {% if config.params.extra.judge_support and config.params.extra.judge.url is not none %} --judge_model={{config.params.extra.judge.model_id}} --judge_server_address={{config.params.extra.judge.url}} --judge_server_type=openai {% if config.params.extra.judge.generation_type is not none %} --judge_generation_type={{config.params.extra.judge.generation_type}} {% endif %} --extra_judge_args="++server.api_key_env_var={% if config.params.extra.judge.api_key is not none %}{{config.params.extra.judge.api_key}}{% else %}DUMMY_API_KEY{% endif %} {%- if config.params.extra.judge.temperature is not none %} ++inference.temperature={{config.params.extra.judge.temperature}}{% endif %} {%- if config.params.extra.judge.top_p is not none %} 
++inference.top_p={{config.params.extra.judge.top_p}}{% endif %} {%- if config.params.extra.judge.max_new_tokens is not none %} ++inference.tokens_to_generate={{config.params.extra.judge.max_new_tokens}}{% endif %} {%- if config.params.extra.judge.parallelism is not none %} ++max_concurrent_requests={{config.params.extra.judge.parallelism}}{% endif %} {%- if config.params.extra.judge.args is not none %} {{config.params.extra.judge.args}}{% endif %}" {% endif %} {% if config.params.extra.use_sandbox %} ; EXIT_CODE=$? ; kill $SANDBOX_PID 2>/dev/null || true ; exit $EXIT_CODE{% endif %}
framework_name: nemo_skills
pkg_name: nemo_skills
config:
  params:
    parallelism: 16
    task: hle
    extra:
      use_sandbox: false
      num_repeats: null
      prompt_config: null
      args: null
      system_message: null
      dataset_split: null
      judge_support: false
      judge:
        url: null
        model_id: null
        api_key: null
        generation_type: null
        random_seed: 1234
        temperature: null
        top_p: null
        max_new_tokens: null
        args: null
        parallelism: null
      ruler:
        data_dir: null
        cluster: null
        setup: null
        max_seq_length: null
        tokenizer_path: null
        template_tokens: null
        num_samples: null
        tasks: null
  supported_endpoint_types:
    - chat
  type: ns_hle
target: {}
ns_hle_aa#
HumanityLastExam aligned with AA
Harness: nemo_skills
Container:
nvcr.io/nvidia/eval-factory/nemo-skills:26.01
Container Digest:
sha256:43e2c4d6e197744f7fd0a874d06c5600a8b46b54e16d333c0ebf057b6d54635a
Container Arch: multiarch
Task Type: ns_hle_aa
cd /nemo_run/code && {% if config.params.extra.use_sandbox %}python -m nemo_skills.code_execution.local_sandbox.local_sandbox_server > {{config.output_dir}}/sandbox.log 2>&1 & SANDBOX_PID=$! && sleep 3 && {% endif %}{% if not config.params.task.startswith('ruler') %} ns prepare_data {{config.params.task}} {% else %} mkdir -p {{config.params.extra.ruler.data_dir}} && ln -sf {{config.params.extra.ruler.data_dir}} /nemo_run/code/ruler_data && ns prepare_data ruler --data_dir={{config.params.extra.ruler.data_dir}} --cluster={{config.params.extra.ruler.cluster}} --setup={{config.params.extra.ruler.setup}} --max_seq_length={{config.params.extra.ruler.max_seq_length}} --tokenizer_path={{config.params.extra.ruler.tokenizer_path}} {% if config.params.extra.ruler.template_tokens is not none %}--template_tokens={{config.params.extra.ruler.template_tokens}}{% endif %} {% if config.params.extra.ruler.num_samples is not none %}--num_samples={{config.params.extra.ruler.num_samples}}{% elif config.params.limit_samples is not none %}--num_samples={{config.params.limit_samples}}{% endif %} {% if config.params.extra.ruler.tasks is not none %}--tasks {% for task in config.params.extra.ruler.tasks %}{{task}}{% if not loop.last %} {% endif %}{% endfor %}{% endif %} {% endif %} && ns eval --server_type=openai --model={{target.api_endpoint.model_id}} --server_address={{target.api_endpoint.url}} --benchmarks={{config.params.task}}{% if config.params.extra.num_repeats is not none and config.params.extra.num_repeats > 1 %}:{{config.params.extra.num_repeats}}{% endif %} --output_dir={{config.output_dir}} {% if config.params.extra.dataset_split is not none %}--split={{config.params.extra.dataset_split}}{% endif %} {% if config.params.extra.ruler.data_dir is not none %}--data_dir={{config.params.extra.ruler.data_dir}}{% endif %} ++server.api_key_env_var={% if target.api_endpoint.api_key_name is not none %}{{target.api_endpoint.api_key_name}}{% else %}DUMMY_API_KEY{% endif %} {% if 
config.params.max_new_tokens is not none %}++inference.tokens_to_generate={{config.params.max_new_tokens}}{% endif %} {% if config.params.extra.system_message is not none %} ++system_message='{{config.params.extra.system_message}}' {% endif %} {% if config.params.limit_samples is not none %}++max_samples={{config.params.limit_samples}}{% endif %} {% if config.params.parallelism is not none %}{% if config.params.extra.num_repeats is not none and config.params.extra.num_repeats > 1 %}++max_concurrent_requests={{[(config.params.parallelism / config.params.extra.num_repeats) | int, 1] | max}}{% else %}++max_concurrent_requests={{config.params.parallelism | int}}{% endif %}{% endif %} {% if config.params.temperature is not none %}++inference.temperature={{config.params.temperature}}{% endif %} {% if config.params.top_p is not none %}++inference.top_p={{config.params.top_p}}{% endif %} {% if config.params.extra.prompt_config is not none %}++prompt_config={{config.params.extra.prompt_config}}{% endif %} {% if config.params.extra.ruler.tokenizer_path is not none %}++tokenizer={{config.params.extra.ruler.tokenizer_path}}{% endif %} {% if config.params.extra.args is not none %} {{config.params.extra.args}} {% endif %} {% if config.params.extra.judge_support and config.params.extra.judge.url is not none %} --judge_model={{config.params.extra.judge.model_id}} --judge_server_address={{config.params.extra.judge.url}} --judge_server_type=openai {% if config.params.extra.judge.generation_type is not none %} --judge_generation_type={{config.params.extra.judge.generation_type}} {% endif %} --extra_judge_args="++server.api_key_env_var={% if config.params.extra.judge.api_key is not none %}{{config.params.extra.judge.api_key}}{% else %}DUMMY_API_KEY{% endif %} {%- if config.params.extra.judge.temperature is not none %} ++inference.temperature={{config.params.extra.judge.temperature}}{% endif %} {%- if config.params.extra.judge.top_p is not none %} 
++inference.top_p={{config.params.extra.judge.top_p}}{% endif %} {%- if config.params.extra.judge.max_new_tokens is not none %} ++inference.tokens_to_generate={{config.params.extra.judge.max_new_tokens}}{% endif %} {%- if config.params.extra.judge.parallelism is not none %} ++max_concurrent_requests={{config.params.extra.judge.parallelism}}{% endif %} {%- if config.params.extra.judge.args is not none %} {{config.params.extra.judge.args}}{% endif %}" {% endif %} {% if config.params.extra.use_sandbox %} ; EXIT_CODE=$? ; kill $SANDBOX_PID 2>/dev/null || true ; exit $EXIT_CODE{% endif %}
framework_name: nemo_skills
pkg_name: nemo_skills
config:
  params:
    parallelism: 16
    task: hle
    extra:
      use_sandbox: false
      num_repeats: 1
      prompt_config: null
      args: null
      system_message: null
      dataset_split: null
      judge_support: true
      judge:
        url: https://inference-api.nvidia.com/v1
        model_id: us/azure/openai/gpt-4.1
        api_key: null
        generation_type: null
        random_seed: 1234
        temperature: null
        top_p: null
        max_new_tokens: null
        args: null
        parallelism: null
      ruler:
        data_dir: null
        cluster: null
        setup: null
        max_seq_length: null
        tokenizer_path: null
        template_tokens: null
        num_samples: null
        tasks: null
  supported_endpoint_types:
    - chat
  type: ns_hle_aa
target: {}
ns_hmmt_feb2025#
HMMT February 2025 (MathArena/hmmt_feb_2025)
Harness: nemo_skills
Container:
nvcr.io/nvidia/eval-factory/nemo-skills:26.01
Container Digest:
sha256:43e2c4d6e197744f7fd0a874d06c5600a8b46b54e16d333c0ebf057b6d54635a
Container Arch: multiarch
Task Type: ns_hmmt_feb2025
cd /nemo_run/code && {% if config.params.extra.use_sandbox %}python -m nemo_skills.code_execution.local_sandbox.local_sandbox_server > {{config.output_dir}}/sandbox.log 2>&1 & SANDBOX_PID=$! && sleep 3 && {% endif %}{% if not config.params.task.startswith('ruler') %} ns prepare_data {{config.params.task}} {% else %} mkdir -p {{config.params.extra.ruler.data_dir}} && ln -sf {{config.params.extra.ruler.data_dir}} /nemo_run/code/ruler_data && ns prepare_data ruler --data_dir={{config.params.extra.ruler.data_dir}} --cluster={{config.params.extra.ruler.cluster}} --setup={{config.params.extra.ruler.setup}} --max_seq_length={{config.params.extra.ruler.max_seq_length}} --tokenizer_path={{config.params.extra.ruler.tokenizer_path}} {% if config.params.extra.ruler.template_tokens is not none %}--template_tokens={{config.params.extra.ruler.template_tokens}}{% endif %} {% if config.params.extra.ruler.num_samples is not none %}--num_samples={{config.params.extra.ruler.num_samples}}{% elif config.params.limit_samples is not none %}--num_samples={{config.params.limit_samples}}{% endif %} {% if config.params.extra.ruler.tasks is not none %}--tasks {% for task in config.params.extra.ruler.tasks %}{{task}}{% if not loop.last %} {% endif %}{% endfor %}{% endif %} {% endif %} && ns eval --server_type=openai --model={{target.api_endpoint.model_id}} --server_address={{target.api_endpoint.url}} --benchmarks={{config.params.task}}{% if config.params.extra.num_repeats is not none and config.params.extra.num_repeats > 1 %}:{{config.params.extra.num_repeats}}{% endif %} --output_dir={{config.output_dir}} {% if config.params.extra.dataset_split is not none %}--split={{config.params.extra.dataset_split}}{% endif %} {% if config.params.extra.ruler.data_dir is not none %}--data_dir={{config.params.extra.ruler.data_dir}}{% endif %} ++server.api_key_env_var={% if target.api_endpoint.api_key_name is not none %}{{target.api_endpoint.api_key_name}}{% else %}DUMMY_API_KEY{% endif %} {% if 
config.params.max_new_tokens is not none %}++inference.tokens_to_generate={{config.params.max_new_tokens}}{% endif %} {% if config.params.extra.system_message is not none %} ++system_message='{{config.params.extra.system_message}}' {% endif %} {% if config.params.limit_samples is not none %}++max_samples={{config.params.limit_samples}}{% endif %} {% if config.params.parallelism is not none %}{% if config.params.extra.num_repeats is not none and config.params.extra.num_repeats > 1 %}++max_concurrent_requests={{[(config.params.parallelism / config.params.extra.num_repeats) | int, 1] | max}}{% else %}++max_concurrent_requests={{config.params.parallelism | int}}{% endif %}{% endif %} {% if config.params.temperature is not none %}++inference.temperature={{config.params.temperature}}{% endif %} {% if config.params.top_p is not none %}++inference.top_p={{config.params.top_p}}{% endif %} {% if config.params.extra.prompt_config is not none %}++prompt_config={{config.params.extra.prompt_config}}{% endif %} {% if config.params.extra.ruler.tokenizer_path is not none %}++tokenizer={{config.params.extra.ruler.tokenizer_path}}{% endif %} {% if config.params.extra.args is not none %} {{config.params.extra.args}} {% endif %} {% if config.params.extra.judge_support and config.params.extra.judge.url is not none %} --judge_model={{config.params.extra.judge.model_id}} --judge_server_address={{config.params.extra.judge.url}} --judge_server_type=openai {% if config.params.extra.judge.generation_type is not none %} --judge_generation_type={{config.params.extra.judge.generation_type}} {% endif %} --extra_judge_args="++server.api_key_env_var={% if config.params.extra.judge.api_key is not none %}{{config.params.extra.judge.api_key}}{% else %}DUMMY_API_KEY{% endif %} {%- if config.params.extra.judge.temperature is not none %} ++inference.temperature={{config.params.extra.judge.temperature}}{% endif %} {%- if config.params.extra.judge.top_p is not none %} 
++inference.top_p={{config.params.extra.judge.top_p}}{% endif %} {%- if config.params.extra.judge.max_new_tokens is not none %} ++inference.tokens_to_generate={{config.params.extra.judge.max_new_tokens}}{% endif %} {%- if config.params.extra.judge.parallelism is not none %} ++max_concurrent_requests={{config.params.extra.judge.parallelism}}{% endif %} {%- if config.params.extra.judge.args is not none %} {{config.params.extra.judge.args}}{% endif %}" {% endif %} {% if config.params.extra.use_sandbox %} ; EXIT_CODE=$? ; kill $SANDBOX_PID 2>/dev/null || true ; exit $EXIT_CODE{% endif %}
framework_name: nemo_skills
pkg_name: nemo_skills
config:
  params:
    parallelism: 16
    task: hmmt_feb25
    extra:
      use_sandbox: false
      num_repeats: null
      prompt_config: null
      args: null
      system_message: null
      dataset_split: null
      judge_support: true
      judge:
        url: null
        model_id: null
        api_key: null
        generation_type: math_judge
        random_seed: 1234
        temperature: null
        top_p: null
        max_new_tokens: null
        args: null
        parallelism: null
      ruler:
        data_dir: null
        cluster: null
        setup: null
        max_seq_length: null
        tokenizer_path: null
        template_tokens: null
        num_samples: null
        tasks: null
  supported_endpoint_types:
    - chat
  type: ns_hmmt_feb2025
target: {}
ns_ifbench#
IFBench - Instruction Following Benchmark
Harness: nemo_skills
Container:
nvcr.io/nvidia/eval-factory/nemo-skills:26.01
Container Digest:
sha256:43e2c4d6e197744f7fd0a874d06c5600a8b46b54e16d333c0ebf057b6d54635a
Container Arch: multiarch
Task Type: ns_ifbench
cd /nemo_run/code && {% if config.params.extra.use_sandbox %}python -m nemo_skills.code_execution.local_sandbox.local_sandbox_server > {{config.output_dir}}/sandbox.log 2>&1 & SANDBOX_PID=$! && sleep 3 && {% endif %}{% if not config.params.task.startswith('ruler') %} ns prepare_data {{config.params.task}} {% else %} mkdir -p {{config.params.extra.ruler.data_dir}} && ln -sf {{config.params.extra.ruler.data_dir}} /nemo_run/code/ruler_data && ns prepare_data ruler --data_dir={{config.params.extra.ruler.data_dir}} --cluster={{config.params.extra.ruler.cluster}} --setup={{config.params.extra.ruler.setup}} --max_seq_length={{config.params.extra.ruler.max_seq_length}} --tokenizer_path={{config.params.extra.ruler.tokenizer_path}} {% if config.params.extra.ruler.template_tokens is not none %}--template_tokens={{config.params.extra.ruler.template_tokens}}{% endif %} {% if config.params.extra.ruler.num_samples is not none %}--num_samples={{config.params.extra.ruler.num_samples}}{% elif config.params.limit_samples is not none %}--num_samples={{config.params.limit_samples}}{% endif %} {% if config.params.extra.ruler.tasks is not none %}--tasks {% for task in config.params.extra.ruler.tasks %}{{task}}{% if not loop.last %} {% endif %}{% endfor %}{% endif %} {% endif %} && ns eval --server_type=openai --model={{target.api_endpoint.model_id}} --server_address={{target.api_endpoint.url}} --benchmarks={{config.params.task}}{% if config.params.extra.num_repeats is not none and config.params.extra.num_repeats > 1 %}:{{config.params.extra.num_repeats}}{% endif %} --output_dir={{config.output_dir}} {% if config.params.extra.dataset_split is not none %}--split={{config.params.extra.dataset_split}}{% endif %} {% if config.params.extra.ruler.data_dir is not none %}--data_dir={{config.params.extra.ruler.data_dir}}{% endif %} ++server.api_key_env_var={% if target.api_endpoint.api_key_name is not none %}{{target.api_endpoint.api_key_name}}{% else %}DUMMY_API_KEY{% endif %} {% if 
config.params.max_new_tokens is not none %}++inference.tokens_to_generate={{config.params.max_new_tokens}}{% endif %} {% if config.params.extra.system_message is not none %} ++system_message='{{config.params.extra.system_message}}' {% endif %} {% if config.params.limit_samples is not none %}++max_samples={{config.params.limit_samples}}{% endif %} {% if config.params.parallelism is not none %}{% if config.params.extra.num_repeats is not none and config.params.extra.num_repeats > 1 %}++max_concurrent_requests={{[(config.params.parallelism / config.params.extra.num_repeats) | int, 1] | max}}{% else %}++max_concurrent_requests={{config.params.parallelism | int}}{% endif %}{% endif %} {% if config.params.temperature is not none %}++inference.temperature={{config.params.temperature}}{% endif %} {% if config.params.top_p is not none %}++inference.top_p={{config.params.top_p}}{% endif %} {% if config.params.extra.prompt_config is not none %}++prompt_config={{config.params.extra.prompt_config}}{% endif %} {% if config.params.extra.ruler.tokenizer_path is not none %}++tokenizer={{config.params.extra.ruler.tokenizer_path}}{% endif %} {% if config.params.extra.args is not none %} {{config.params.extra.args}} {% endif %} {% if config.params.extra.judge_support and config.params.extra.judge.url is not none %} --judge_model={{config.params.extra.judge.model_id}} --judge_server_address={{config.params.extra.judge.url}} --judge_server_type=openai {% if config.params.extra.judge.generation_type is not none %} --judge_generation_type={{config.params.extra.judge.generation_type}} {% endif %} --extra_judge_args="++server.api_key_env_var={% if config.params.extra.judge.api_key is not none %}{{config.params.extra.judge.api_key}}{% else %}DUMMY_API_KEY{% endif %} {%- if config.params.extra.judge.temperature is not none %} ++inference.temperature={{config.params.extra.judge.temperature}}{% endif %} {%- if config.params.extra.judge.top_p is not none %} 
++inference.top_p={{config.params.extra.judge.top_p}}{% endif %} {%- if config.params.extra.judge.max_new_tokens is not none %} ++inference.tokens_to_generate={{config.params.extra.judge.max_new_tokens}}{% endif %} {%- if config.params.extra.judge.parallelism is not none %} ++max_concurrent_requests={{config.params.extra.judge.parallelism}}{% endif %} {%- if config.params.extra.judge.args is not none %} {{config.params.extra.judge.args}}{% endif %}" {% endif %} {% if config.params.extra.use_sandbox %} ; EXIT_CODE=$? ; kill $SANDBOX_PID 2>/dev/null || true ; exit $EXIT_CODE{% endif %}
framework_name: nemo_skills
pkg_name: nemo_skills
config:
  params:
    parallelism: 16
    task: ifbench
    extra:
      use_sandbox: false
      num_repeats: null
      prompt_config: null
      args: null
      system_message: null
      dataset_split: null
      judge_support: false
      judge:
        url: null
        model_id: null
        api_key: null
        generation_type: null
        random_seed: 1234
        temperature: null
        top_p: null
        max_new_tokens: null
        args: null
        parallelism: null
      ruler:
        data_dir: null
        cluster: null
        setup: null
        max_seq_length: null
        tokenizer_path: null
        template_tokens: null
        num_samples: null
        tasks: null
  supported_endpoint_types:
    - chat
  type: ns_ifbench
target: {}
ns_ifeval#
IFEval - Instruction-Following Evaluation for Large Language Models
Harness: nemo_skills
Container:
nvcr.io/nvidia/eval-factory/nemo-skills:26.01
Container Digest:
sha256:43e2c4d6e197744f7fd0a874d06c5600a8b46b54e16d333c0ebf057b6d54635a
Container Arch: multiarch
Task Type: ns_ifeval
cd /nemo_run/code && {% if config.params.extra.use_sandbox %}python -m nemo_skills.code_execution.local_sandbox.local_sandbox_server > {{config.output_dir}}/sandbox.log 2>&1 & SANDBOX_PID=$! && sleep 3 && {% endif %}{% if not config.params.task.startswith('ruler') %} ns prepare_data {{config.params.task}} {% else %} mkdir -p {{config.params.extra.ruler.data_dir}} && ln -sf {{config.params.extra.ruler.data_dir}} /nemo_run/code/ruler_data && ns prepare_data ruler --data_dir={{config.params.extra.ruler.data_dir}} --cluster={{config.params.extra.ruler.cluster}} --setup={{config.params.extra.ruler.setup}} --max_seq_length={{config.params.extra.ruler.max_seq_length}} --tokenizer_path={{config.params.extra.ruler.tokenizer_path}} {% if config.params.extra.ruler.template_tokens is not none %}--template_tokens={{config.params.extra.ruler.template_tokens}}{% endif %} {% if config.params.extra.ruler.num_samples is not none %}--num_samples={{config.params.extra.ruler.num_samples}}{% elif config.params.limit_samples is not none %}--num_samples={{config.params.limit_samples}}{% endif %} {% if config.params.extra.ruler.tasks is not none %}--tasks {% for task in config.params.extra.ruler.tasks %}{{task}}{% if not loop.last %} {% endif %}{% endfor %}{% endif %} {% endif %} && ns eval --server_type=openai --model={{target.api_endpoint.model_id}} --server_address={{target.api_endpoint.url}} --benchmarks={{config.params.task}}{% if config.params.extra.num_repeats is not none and config.params.extra.num_repeats > 1 %}:{{config.params.extra.num_repeats}}{% endif %} --output_dir={{config.output_dir}} {% if config.params.extra.dataset_split is not none %}--split={{config.params.extra.dataset_split}}{% endif %} {% if config.params.extra.ruler.data_dir is not none %}--data_dir={{config.params.extra.ruler.data_dir}}{% endif %} ++server.api_key_env_var={% if target.api_endpoint.api_key_name is not none %}{{target.api_endpoint.api_key_name}}{% else %}DUMMY_API_KEY{% endif %} {% if 
config.params.max_new_tokens is not none %}++inference.tokens_to_generate={{config.params.max_new_tokens}}{% endif %} {% if config.params.extra.system_message is not none %} ++system_message='{{config.params.extra.system_message}}' {% endif %} {% if config.params.limit_samples is not none %}++max_samples={{config.params.limit_samples}}{% endif %} {% if config.params.parallelism is not none %}{% if config.params.extra.num_repeats is not none and config.params.extra.num_repeats > 1 %}++max_concurrent_requests={{[(config.params.parallelism / config.params.extra.num_repeats) | int, 1] | max}}{% else %}++max_concurrent_requests={{config.params.parallelism | int}}{% endif %}{% endif %} {% if config.params.temperature is not none %}++inference.temperature={{config.params.temperature}}{% endif %} {% if config.params.top_p is not none %}++inference.top_p={{config.params.top_p}}{% endif %} {% if config.params.extra.prompt_config is not none %}++prompt_config={{config.params.extra.prompt_config}}{% endif %} {% if config.params.extra.ruler.tokenizer_path is not none %}++tokenizer={{config.params.extra.ruler.tokenizer_path}}{% endif %} {% if config.params.extra.args is not none %} {{config.params.extra.args}} {% endif %} {% if config.params.extra.judge_support and config.params.extra.judge.url is not none %} --judge_model={{config.params.extra.judge.model_id}} --judge_server_address={{config.params.extra.judge.url}} --judge_server_type=openai {% if config.params.extra.judge.generation_type is not none %} --judge_generation_type={{config.params.extra.judge.generation_type}} {% endif %} --extra_judge_args="++server.api_key_env_var={% if config.params.extra.judge.api_key is not none %}{{config.params.extra.judge.api_key}}{% else %}DUMMY_API_KEY{% endif %} {%- if config.params.extra.judge.temperature is not none %} ++inference.temperature={{config.params.extra.judge.temperature}}{% endif %} {%- if config.params.extra.judge.top_p is not none %} 
++inference.top_p={{config.params.extra.judge.top_p}}{% endif %} {%- if config.params.extra.judge.max_new_tokens is not none %} ++inference.tokens_to_generate={{config.params.extra.judge.max_new_tokens}}{% endif %} {%- if config.params.extra.judge.parallelism is not none %} ++max_concurrent_requests={{config.params.extra.judge.parallelism}}{% endif %} {%- if config.params.extra.judge.args is not none %} {{config.params.extra.judge.args}}{% endif %}" {% endif %} {% if config.params.extra.use_sandbox %} ; EXIT_CODE=$? ; kill $SANDBOX_PID 2>/dev/null || true ; exit $EXIT_CODE{% endif %}
framework_name: nemo_skills
pkg_name: nemo_skills
config:
params:
parallelism: 16
task: ifeval
extra:
use_sandbox: false
num_repeats: null
prompt_config: null
args: null
system_message: null
dataset_split: null
judge_support: false
judge:
url: null
model_id: null
api_key: null
generation_type: null
random_seed: 1234
temperature: null
top_p: null
max_new_tokens: null
args: null
parallelism: null
ruler:
data_dir: null
cluster: null
setup: null
max_seq_length: null
tokenizer_path: null
template_tokens: null
num_samples: null
tasks: null
supported_endpoint_types:
- chat
type: ns_ifeval
target: {}
ns_livecodebench#
LiveCodeBench v6
Harness: nemo_skills
Container:
nvcr.io/nvidia/eval-factory/nemo-skills:26.01
Container Digest:
sha256:43e2c4d6e197744f7fd0a874d06c5600a8b46b54e16d333c0ebf057b6d54635a
Container Arch: multiarch
Task Type: ns_livecodebench
cd /nemo_run/code && {% if config.params.extra.use_sandbox %}python -m nemo_skills.code_execution.local_sandbox.local_sandbox_server > {{config.output_dir}}/sandbox.log 2>&1 & SANDBOX_PID=$! && sleep 3 && {% endif %}{% if not config.params.task.startswith('ruler') %} ns prepare_data {{config.params.task}} {% else %} mkdir -p {{config.params.extra.ruler.data_dir}} && ln -sf {{config.params.extra.ruler.data_dir}} /nemo_run/code/ruler_data && ns prepare_data ruler --data_dir={{config.params.extra.ruler.data_dir}} --cluster={{config.params.extra.ruler.cluster}} --setup={{config.params.extra.ruler.setup}} --max_seq_length={{config.params.extra.ruler.max_seq_length}} --tokenizer_path={{config.params.extra.ruler.tokenizer_path}} {% if config.params.extra.ruler.template_tokens is not none %}--template_tokens={{config.params.extra.ruler.template_tokens}}{% endif %} {% if config.params.extra.ruler.num_samples is not none %}--num_samples={{config.params.extra.ruler.num_samples}}{% elif config.params.limit_samples is not none %}--num_samples={{config.params.limit_samples}}{% endif %} {% if config.params.extra.ruler.tasks is not none %}--tasks {% for task in config.params.extra.ruler.tasks %}{{task}}{% if not loop.last %} {% endif %}{% endfor %}{% endif %} {% endif %} && ns eval --server_type=openai --model={{target.api_endpoint.model_id}} --server_address={{target.api_endpoint.url}} --benchmarks={{config.params.task}}{% if config.params.extra.num_repeats is not none and config.params.extra.num_repeats > 1 %}:{{config.params.extra.num_repeats}}{% endif %} --output_dir={{config.output_dir}} {% if config.params.extra.dataset_split is not none %}--split={{config.params.extra.dataset_split}}{% endif %} {% if config.params.extra.ruler.data_dir is not none %}--data_dir={{config.params.extra.ruler.data_dir}}{% endif %} ++server.api_key_env_var={% if target.api_endpoint.api_key_name is not none %}{{target.api_endpoint.api_key_name}}{% else %}DUMMY_API_KEY{% endif %} {% if 
config.params.max_new_tokens is not none %}++inference.tokens_to_generate={{config.params.max_new_tokens}}{% endif %} {% if config.params.extra.system_message is not none %} ++system_message='{{config.params.extra.system_message}}' {% endif %} {% if config.params.limit_samples is not none %}++max_samples={{config.params.limit_samples}}{% endif %} {% if config.params.parallelism is not none %}{% if config.params.extra.num_repeats is not none and config.params.extra.num_repeats > 1 %}++max_concurrent_requests={{[(config.params.parallelism / config.params.extra.num_repeats) | int, 1] | max}}{% else %}++max_concurrent_requests={{config.params.parallelism | int}}{% endif %}{% endif %} {% if config.params.temperature is not none %}++inference.temperature={{config.params.temperature}}{% endif %} {% if config.params.top_p is not none %}++inference.top_p={{config.params.top_p}}{% endif %} {% if config.params.extra.prompt_config is not none %}++prompt_config={{config.params.extra.prompt_config}}{% endif %} {% if config.params.extra.ruler.tokenizer_path is not none %}++tokenizer={{config.params.extra.ruler.tokenizer_path}}{% endif %} {% if config.params.extra.args is not none %} {{config.params.extra.args}} {% endif %} {% if config.params.extra.judge_support and config.params.extra.judge.url is not none %} --judge_model={{config.params.extra.judge.model_id}} --judge_server_address={{config.params.extra.judge.url}} --judge_server_type=openai {% if config.params.extra.judge.generation_type is not none %} --judge_generation_type={{config.params.extra.judge.generation_type}} {% endif %} --extra_judge_args="++server.api_key_env_var={% if config.params.extra.judge.api_key is not none %}{{config.params.extra.judge.api_key}}{% else %}DUMMY_API_KEY{% endif %} {%- if config.params.extra.judge.temperature is not none %} ++inference.temperature={{config.params.extra.judge.temperature}}{% endif %} {%- if config.params.extra.judge.top_p is not none %} 
++inference.top_p={{config.params.extra.judge.top_p}}{% endif %} {%- if config.params.extra.judge.max_new_tokens is not none %} ++inference.tokens_to_generate={{config.params.extra.judge.max_new_tokens}}{% endif %} {%- if config.params.extra.judge.parallelism is not none %} ++max_concurrent_requests={{config.params.extra.judge.parallelism}}{% endif %} {%- if config.params.extra.judge.args is not none %} {{config.params.extra.judge.args}}{% endif %}" {% endif %} {% if config.params.extra.use_sandbox %} ; EXIT_CODE=$? ; kill $SANDBOX_PID 2>/dev/null || true ; exit $EXIT_CODE{% endif %}
framework_name: nemo_skills
pkg_name: nemo_skills
config:
params:
parallelism: 16
task: livecodebench
extra:
use_sandbox: false
num_repeats: null
prompt_config: null
args: null
system_message: null
dataset_split: test_v6_2408_2505
judge_support: false
judge:
url: null
model_id: null
api_key: null
generation_type: null
random_seed: 1234
temperature: null
top_p: null
max_new_tokens: null
args: null
parallelism: null
ruler:
data_dir: null
cluster: null
setup: null
max_seq_length: null
tokenizer_path: null
template_tokens: null
num_samples: null
tasks: null
supported_endpoint_types:
- chat
type: ns_livecodebench
target: {}
ns_livecodebench_aa#
LiveCodeBench with AA custom prompt format (315 problems from July 2024 to Dec 2024, release_v5)
Harness: nemo_skills
Container:
nvcr.io/nvidia/eval-factory/nemo-skills:26.01
Container Digest:
sha256:43e2c4d6e197744f7fd0a874d06c5600a8b46b54e16d333c0ebf057b6d54635a
Container Arch: multiarch
Task Type: ns_livecodebench_aa
cd /nemo_run/code && {% if config.params.extra.use_sandbox %}python -m nemo_skills.code_execution.local_sandbox.local_sandbox_server > {{config.output_dir}}/sandbox.log 2>&1 & SANDBOX_PID=$! && sleep 3 && {% endif %}{% if not config.params.task.startswith('ruler') %} ns prepare_data {{config.params.task}} {% else %} mkdir -p {{config.params.extra.ruler.data_dir}} && ln -sf {{config.params.extra.ruler.data_dir}} /nemo_run/code/ruler_data && ns prepare_data ruler --data_dir={{config.params.extra.ruler.data_dir}} --cluster={{config.params.extra.ruler.cluster}} --setup={{config.params.extra.ruler.setup}} --max_seq_length={{config.params.extra.ruler.max_seq_length}} --tokenizer_path={{config.params.extra.ruler.tokenizer_path}} {% if config.params.extra.ruler.template_tokens is not none %}--template_tokens={{config.params.extra.ruler.template_tokens}}{% endif %} {% if config.params.extra.ruler.num_samples is not none %}--num_samples={{config.params.extra.ruler.num_samples}}{% elif config.params.limit_samples is not none %}--num_samples={{config.params.limit_samples}}{% endif %} {% if config.params.extra.ruler.tasks is not none %}--tasks {% for task in config.params.extra.ruler.tasks %}{{task}}{% if not loop.last %} {% endif %}{% endfor %}{% endif %} {% endif %} && ns eval --server_type=openai --model={{target.api_endpoint.model_id}} --server_address={{target.api_endpoint.url}} --benchmarks={{config.params.task}}{% if config.params.extra.num_repeats is not none and config.params.extra.num_repeats > 1 %}:{{config.params.extra.num_repeats}}{% endif %} --output_dir={{config.output_dir}} {% if config.params.extra.dataset_split is not none %}--split={{config.params.extra.dataset_split}}{% endif %} {% if config.params.extra.ruler.data_dir is not none %}--data_dir={{config.params.extra.ruler.data_dir}}{% endif %} ++server.api_key_env_var={% if target.api_endpoint.api_key_name is not none %}{{target.api_endpoint.api_key_name}}{% else %}DUMMY_API_KEY{% endif %} {% if 
config.params.max_new_tokens is not none %}++inference.tokens_to_generate={{config.params.max_new_tokens}}{% endif %} {% if config.params.extra.system_message is not none %} ++system_message='{{config.params.extra.system_message}}' {% endif %} {% if config.params.limit_samples is not none %}++max_samples={{config.params.limit_samples}}{% endif %} {% if config.params.parallelism is not none %}{% if config.params.extra.num_repeats is not none and config.params.extra.num_repeats > 1 %}++max_concurrent_requests={{[(config.params.parallelism / config.params.extra.num_repeats) | int, 1] | max}}{% else %}++max_concurrent_requests={{config.params.parallelism | int}}{% endif %}{% endif %} {% if config.params.temperature is not none %}++inference.temperature={{config.params.temperature}}{% endif %} {% if config.params.top_p is not none %}++inference.top_p={{config.params.top_p}}{% endif %} {% if config.params.extra.prompt_config is not none %}++prompt_config={{config.params.extra.prompt_config}}{% endif %} {% if config.params.extra.ruler.tokenizer_path is not none %}++tokenizer={{config.params.extra.ruler.tokenizer_path}}{% endif %} {% if config.params.extra.args is not none %} {{config.params.extra.args}} {% endif %} {% if config.params.extra.judge_support and config.params.extra.judge.url is not none %} --judge_model={{config.params.extra.judge.model_id}} --judge_server_address={{config.params.extra.judge.url}} --judge_server_type=openai {% if config.params.extra.judge.generation_type is not none %} --judge_generation_type={{config.params.extra.judge.generation_type}} {% endif %} --extra_judge_args="++server.api_key_env_var={% if config.params.extra.judge.api_key is not none %}{{config.params.extra.judge.api_key}}{% else %}DUMMY_API_KEY{% endif %} {%- if config.params.extra.judge.temperature is not none %} ++inference.temperature={{config.params.extra.judge.temperature}}{% endif %} {%- if config.params.extra.judge.top_p is not none %} 
++inference.top_p={{config.params.extra.judge.top_p}}{% endif %} {%- if config.params.extra.judge.max_new_tokens is not none %} ++inference.tokens_to_generate={{config.params.extra.judge.max_new_tokens}}{% endif %} {%- if config.params.extra.judge.parallelism is not none %} ++max_concurrent_requests={{config.params.extra.judge.parallelism}}{% endif %} {%- if config.params.extra.judge.args is not none %} {{config.params.extra.judge.args}}{% endif %}" {% endif %} {% if config.params.extra.use_sandbox %} ; EXIT_CODE=$? ; kill $SANDBOX_PID 2>/dev/null || true ; exit $EXIT_CODE{% endif %}
framework_name: nemo_skills
pkg_name: nemo_skills
config:
params:
parallelism: 16
task: livecodebench
extra:
use_sandbox: false
num_repeats: 3
prompt_config: /nemo_run/code/eval_factory_prompts/livecodebench-aa.yaml
args: null
system_message: null
dataset_split: test_v5_2407_2412
judge_support: false
judge:
url: null
model_id: null
api_key: null
generation_type: null
random_seed: 1234
temperature: null
top_p: null
max_new_tokens: null
args: null
parallelism: null
ruler:
data_dir: null
cluster: null
setup: null
max_seq_length: null
tokenizer_path: null
template_tokens: null
num_samples: null
tasks: null
supported_endpoint_types:
- chat
type: ns_livecodebench_aa
target: {}
ns_livecodebench_v5#
LiveCodeBench v5
Harness: nemo_skills
Container:
nvcr.io/nvidia/eval-factory/nemo-skills:26.01
Container Digest:
sha256:43e2c4d6e197744f7fd0a874d06c5600a8b46b54e16d333c0ebf057b6d54635a
Container Arch: multiarch
Task Type: ns_livecodebench_v5
cd /nemo_run/code && {% if config.params.extra.use_sandbox %}python -m nemo_skills.code_execution.local_sandbox.local_sandbox_server > {{config.output_dir}}/sandbox.log 2>&1 & SANDBOX_PID=$! && sleep 3 && {% endif %}{% if not config.params.task.startswith('ruler') %} ns prepare_data {{config.params.task}} {% else %} mkdir -p {{config.params.extra.ruler.data_dir}} && ln -sf {{config.params.extra.ruler.data_dir}} /nemo_run/code/ruler_data && ns prepare_data ruler --data_dir={{config.params.extra.ruler.data_dir}} --cluster={{config.params.extra.ruler.cluster}} --setup={{config.params.extra.ruler.setup}} --max_seq_length={{config.params.extra.ruler.max_seq_length}} --tokenizer_path={{config.params.extra.ruler.tokenizer_path}} {% if config.params.extra.ruler.template_tokens is not none %}--template_tokens={{config.params.extra.ruler.template_tokens}}{% endif %} {% if config.params.extra.ruler.num_samples is not none %}--num_samples={{config.params.extra.ruler.num_samples}}{% elif config.params.limit_samples is not none %}--num_samples={{config.params.limit_samples}}{% endif %} {% if config.params.extra.ruler.tasks is not none %}--tasks {% for task in config.params.extra.ruler.tasks %}{{task}}{% if not loop.last %} {% endif %}{% endfor %}{% endif %} {% endif %} && ns eval --server_type=openai --model={{target.api_endpoint.model_id}} --server_address={{target.api_endpoint.url}} --benchmarks={{config.params.task}}{% if config.params.extra.num_repeats is not none and config.params.extra.num_repeats > 1 %}:{{config.params.extra.num_repeats}}{% endif %} --output_dir={{config.output_dir}} {% if config.params.extra.dataset_split is not none %}--split={{config.params.extra.dataset_split}}{% endif %} {% if config.params.extra.ruler.data_dir is not none %}--data_dir={{config.params.extra.ruler.data_dir}}{% endif %} ++server.api_key_env_var={% if target.api_endpoint.api_key_name is not none %}{{target.api_endpoint.api_key_name}}{% else %}DUMMY_API_KEY{% endif %} {% if 
config.params.max_new_tokens is not none %}++inference.tokens_to_generate={{config.params.max_new_tokens}}{% endif %} {% if config.params.extra.system_message is not none %} ++system_message='{{config.params.extra.system_message}}' {% endif %} {% if config.params.limit_samples is not none %}++max_samples={{config.params.limit_samples}}{% endif %} {% if config.params.parallelism is not none %}{% if config.params.extra.num_repeats is not none and config.params.extra.num_repeats > 1 %}++max_concurrent_requests={{[(config.params.parallelism / config.params.extra.num_repeats) | int, 1] | max}}{% else %}++max_concurrent_requests={{config.params.parallelism | int}}{% endif %}{% endif %} {% if config.params.temperature is not none %}++inference.temperature={{config.params.temperature}}{% endif %} {% if config.params.top_p is not none %}++inference.top_p={{config.params.top_p}}{% endif %} {% if config.params.extra.prompt_config is not none %}++prompt_config={{config.params.extra.prompt_config}}{% endif %} {% if config.params.extra.ruler.tokenizer_path is not none %}++tokenizer={{config.params.extra.ruler.tokenizer_path}}{% endif %} {% if config.params.extra.args is not none %} {{config.params.extra.args}} {% endif %} {% if config.params.extra.judge_support and config.params.extra.judge.url is not none %} --judge_model={{config.params.extra.judge.model_id}} --judge_server_address={{config.params.extra.judge.url}} --judge_server_type=openai {% if config.params.extra.judge.generation_type is not none %} --judge_generation_type={{config.params.extra.judge.generation_type}} {% endif %} --extra_judge_args="++server.api_key_env_var={% if config.params.extra.judge.api_key is not none %}{{config.params.extra.judge.api_key}}{% else %}DUMMY_API_KEY{% endif %} {%- if config.params.extra.judge.temperature is not none %} ++inference.temperature={{config.params.extra.judge.temperature}}{% endif %} {%- if config.params.extra.judge.top_p is not none %} 
++inference.top_p={{config.params.extra.judge.top_p}}{% endif %} {%- if config.params.extra.judge.max_new_tokens is not none %} ++inference.tokens_to_generate={{config.params.extra.judge.max_new_tokens}}{% endif %} {%- if config.params.extra.judge.parallelism is not none %} ++max_concurrent_requests={{config.params.extra.judge.parallelism}}{% endif %} {%- if config.params.extra.judge.args is not none %} {{config.params.extra.judge.args}}{% endif %}" {% endif %} {% if config.params.extra.use_sandbox %} ; EXIT_CODE=$? ; kill $SANDBOX_PID 2>/dev/null || true ; exit $EXIT_CODE{% endif %}
framework_name: nemo_skills
pkg_name: nemo_skills
config:
params:
parallelism: 16
task: livecodebench
extra:
use_sandbox: false
num_repeats: null
prompt_config: null
args: null
system_message: null
dataset_split: test_v5_2407_2412
judge_support: false
judge:
url: null
model_id: null
api_key: null
generation_type: null
random_seed: 1234
temperature: null
top_p: null
max_new_tokens: null
args: null
parallelism: null
ruler:
data_dir: null
cluster: null
setup: null
max_seq_length: null
tokenizer_path: null
template_tokens: null
num_samples: null
tasks: null
supported_endpoint_types:
- chat
type: ns_livecodebench_v5
target: {}
ns_mmlu#
MMLU
Harness: nemo_skills
Container:
nvcr.io/nvidia/eval-factory/nemo-skills:26.01
Container Digest:
sha256:43e2c4d6e197744f7fd0a874d06c5600a8b46b54e16d333c0ebf057b6d54635a
Container Arch: multiarch
Task Type: ns_mmlu
cd /nemo_run/code && {% if config.params.extra.use_sandbox %}python -m nemo_skills.code_execution.local_sandbox.local_sandbox_server > {{config.output_dir}}/sandbox.log 2>&1 & SANDBOX_PID=$! && sleep 3 && {% endif %}{% if not config.params.task.startswith('ruler') %} ns prepare_data {{config.params.task}} {% else %} mkdir -p {{config.params.extra.ruler.data_dir}} && ln -sf {{config.params.extra.ruler.data_dir}} /nemo_run/code/ruler_data && ns prepare_data ruler --data_dir={{config.params.extra.ruler.data_dir}} --cluster={{config.params.extra.ruler.cluster}} --setup={{config.params.extra.ruler.setup}} --max_seq_length={{config.params.extra.ruler.max_seq_length}} --tokenizer_path={{config.params.extra.ruler.tokenizer_path}} {% if config.params.extra.ruler.template_tokens is not none %}--template_tokens={{config.params.extra.ruler.template_tokens}}{% endif %} {% if config.params.extra.ruler.num_samples is not none %}--num_samples={{config.params.extra.ruler.num_samples}}{% elif config.params.limit_samples is not none %}--num_samples={{config.params.limit_samples}}{% endif %} {% if config.params.extra.ruler.tasks is not none %}--tasks {% for task in config.params.extra.ruler.tasks %}{{task}}{% if not loop.last %} {% endif %}{% endfor %}{% endif %} {% endif %} && ns eval --server_type=openai --model={{target.api_endpoint.model_id}} --server_address={{target.api_endpoint.url}} --benchmarks={{config.params.task}}{% if config.params.extra.num_repeats is not none and config.params.extra.num_repeats > 1 %}:{{config.params.extra.num_repeats}}{% endif %} --output_dir={{config.output_dir}} {% if config.params.extra.dataset_split is not none %}--split={{config.params.extra.dataset_split}}{% endif %} {% if config.params.extra.ruler.data_dir is not none %}--data_dir={{config.params.extra.ruler.data_dir}}{% endif %} ++server.api_key_env_var={% if target.api_endpoint.api_key_name is not none %}{{target.api_endpoint.api_key_name}}{% else %}DUMMY_API_KEY{% endif %} {% if 
config.params.max_new_tokens is not none %}++inference.tokens_to_generate={{config.params.max_new_tokens}}{% endif %} {% if config.params.extra.system_message is not none %} ++system_message='{{config.params.extra.system_message}}' {% endif %} {% if config.params.limit_samples is not none %}++max_samples={{config.params.limit_samples}}{% endif %} {% if config.params.parallelism is not none %}{% if config.params.extra.num_repeats is not none and config.params.extra.num_repeats > 1 %}++max_concurrent_requests={{[(config.params.parallelism / config.params.extra.num_repeats) | int, 1] | max}}{% else %}++max_concurrent_requests={{config.params.parallelism | int}}{% endif %}{% endif %} {% if config.params.temperature is not none %}++inference.temperature={{config.params.temperature}}{% endif %} {% if config.params.top_p is not none %}++inference.top_p={{config.params.top_p}}{% endif %} {% if config.params.extra.prompt_config is not none %}++prompt_config={{config.params.extra.prompt_config}}{% endif %} {% if config.params.extra.ruler.tokenizer_path is not none %}++tokenizer={{config.params.extra.ruler.tokenizer_path}}{% endif %} {% if config.params.extra.args is not none %} {{config.params.extra.args}} {% endif %} {% if config.params.extra.judge_support and config.params.extra.judge.url is not none %} --judge_model={{config.params.extra.judge.model_id}} --judge_server_address={{config.params.extra.judge.url}} --judge_server_type=openai {% if config.params.extra.judge.generation_type is not none %} --judge_generation_type={{config.params.extra.judge.generation_type}} {% endif %} --extra_judge_args="++server.api_key_env_var={% if config.params.extra.judge.api_key is not none %}{{config.params.extra.judge.api_key}}{% else %}DUMMY_API_KEY{% endif %} {%- if config.params.extra.judge.temperature is not none %} ++inference.temperature={{config.params.extra.judge.temperature}}{% endif %} {%- if config.params.extra.judge.top_p is not none %} 
++inference.top_p={{config.params.extra.judge.top_p}}{% endif %} {%- if config.params.extra.judge.max_new_tokens is not none %} ++inference.tokens_to_generate={{config.params.extra.judge.max_new_tokens}}{% endif %} {%- if config.params.extra.judge.parallelism is not none %} ++max_concurrent_requests={{config.params.extra.judge.parallelism}}{% endif %} {%- if config.params.extra.judge.args is not none %} {{config.params.extra.judge.args}}{% endif %}" {% endif %} {% if config.params.extra.use_sandbox %} ; EXIT_CODE=$? ; kill $SANDBOX_PID 2>/dev/null || true ; exit $EXIT_CODE{% endif %}
framework_name: nemo_skills
pkg_name: nemo_skills
config:
  params:
    parallelism: 16
    task: mmlu
    extra:
      use_sandbox: false
      num_repeats: null
      prompt_config: null
      args: null
      system_message: null
      dataset_split: null
      judge_support: false
      judge:
        url: null
        model_id: null
        api_key: null
        generation_type: null
        random_seed: 1234
        temperature: null
        top_p: null
        max_new_tokens: null
        args: null
        parallelism: null
      ruler:
        data_dir: null
        cluster: null
        setup: null
        max_seq_length: null
        tokenizer_path: null
        template_tokens: null
        num_samples: null
        tasks: null
  supported_endpoint_types:
    - chat
  type: ns_mmlu
target: {}
ns_mmlu_pro#
MMLU-PRO
Harness: nemo_skills
Container:
nvcr.io/nvidia/eval-factory/nemo-skills:26.01
Container Digest:
sha256:43e2c4d6e197744f7fd0a874d06c5600a8b46b54e16d333c0ebf057b6d54635a
Container Arch: multiarch
Task Type: ns_mmlu_pro
cd /nemo_run/code && {% if config.params.extra.use_sandbox %}python -m nemo_skills.code_execution.local_sandbox.local_sandbox_server > {{config.output_dir}}/sandbox.log 2>&1 & SANDBOX_PID=$! && sleep 3 && {% endif %}{% if not config.params.task.startswith('ruler') %} ns prepare_data {{config.params.task}} {% else %} mkdir -p {{config.params.extra.ruler.data_dir}} && ln -sf {{config.params.extra.ruler.data_dir}} /nemo_run/code/ruler_data && ns prepare_data ruler --data_dir={{config.params.extra.ruler.data_dir}} --cluster={{config.params.extra.ruler.cluster}} --setup={{config.params.extra.ruler.setup}} --max_seq_length={{config.params.extra.ruler.max_seq_length}} --tokenizer_path={{config.params.extra.ruler.tokenizer_path}} {% if config.params.extra.ruler.template_tokens is not none %}--template_tokens={{config.params.extra.ruler.template_tokens}}{% endif %} {% if config.params.extra.ruler.num_samples is not none %}--num_samples={{config.params.extra.ruler.num_samples}}{% elif config.params.limit_samples is not none %}--num_samples={{config.params.limit_samples}}{% endif %} {% if config.params.extra.ruler.tasks is not none %}--tasks {% for task in config.params.extra.ruler.tasks %}{{task}}{% if not loop.last %} {% endif %}{% endfor %}{% endif %} {% endif %} && ns eval --server_type=openai --model={{target.api_endpoint.model_id}} --server_address={{target.api_endpoint.url}} --benchmarks={{config.params.task}}{% if config.params.extra.num_repeats is not none and config.params.extra.num_repeats > 1 %}:{{config.params.extra.num_repeats}}{% endif %} --output_dir={{config.output_dir}} {% if config.params.extra.dataset_split is not none %}--split={{config.params.extra.dataset_split}}{% endif %} {% if config.params.extra.ruler.data_dir is not none %}--data_dir={{config.params.extra.ruler.data_dir}}{% endif %} ++server.api_key_env_var={% if target.api_endpoint.api_key_name is not none %}{{target.api_endpoint.api_key_name}}{% else %}DUMMY_API_KEY{% endif %} {% if 
config.params.max_new_tokens is not none %}++inference.tokens_to_generate={{config.params.max_new_tokens}}{% endif %} {% if config.params.extra.system_message is not none %} ++system_message='{{config.params.extra.system_message}}' {% endif %} {% if config.params.limit_samples is not none %}++max_samples={{config.params.limit_samples}}{% endif %} {% if config.params.parallelism is not none %}{% if config.params.extra.num_repeats is not none and config.params.extra.num_repeats > 1 %}++max_concurrent_requests={{[(config.params.parallelism / config.params.extra.num_repeats) | int, 1] | max}}{% else %}++max_concurrent_requests={{config.params.parallelism | int}}{% endif %}{% endif %} {% if config.params.temperature is not none %}++inference.temperature={{config.params.temperature}}{% endif %} {% if config.params.top_p is not none %}++inference.top_p={{config.params.top_p}}{% endif %} {% if config.params.extra.prompt_config is not none %}++prompt_config={{config.params.extra.prompt_config}}{% endif %} {% if config.params.extra.ruler.tokenizer_path is not none %}++tokenizer={{config.params.extra.ruler.tokenizer_path}}{% endif %} {% if config.params.extra.args is not none %} {{config.params.extra.args}} {% endif %} {% if config.params.extra.judge_support and config.params.extra.judge.url is not none %} --judge_model={{config.params.extra.judge.model_id}} --judge_server_address={{config.params.extra.judge.url}} --judge_server_type=openai {% if config.params.extra.judge.generation_type is not none %} --judge_generation_type={{config.params.extra.judge.generation_type}} {% endif %} --extra_judge_args="++server.api_key_env_var={% if config.params.extra.judge.api_key is not none %}{{config.params.extra.judge.api_key}}{% else %}DUMMY_API_KEY{% endif %} {%- if config.params.extra.judge.temperature is not none %} ++inference.temperature={{config.params.extra.judge.temperature}}{% endif %} {%- if config.params.extra.judge.top_p is not none %} 
++inference.top_p={{config.params.extra.judge.top_p}}{% endif %} {%- if config.params.extra.judge.max_new_tokens is not none %} ++inference.tokens_to_generate={{config.params.extra.judge.max_new_tokens}}{% endif %} {%- if config.params.extra.judge.parallelism is not none %} ++max_concurrent_requests={{config.params.extra.judge.parallelism}}{% endif %} {%- if config.params.extra.judge.args is not none %} {{config.params.extra.judge.args}}{% endif %}" {% endif %} {% if config.params.extra.use_sandbox %} ; EXIT_CODE=$? ; kill $SANDBOX_PID 2>/dev/null || true ; exit $EXIT_CODE{% endif %}
framework_name: nemo_skills
pkg_name: nemo_skills
config:
  params:
    parallelism: 16
    task: mmlu-pro
    extra:
      use_sandbox: false
      num_repeats: null
      prompt_config: null
      args: null
      system_message: null
      dataset_split: null
      judge_support: false
      judge:
        url: null
        model_id: null
        api_key: null
        generation_type: null
        random_seed: 1234
        temperature: null
        top_p: null
        max_new_tokens: null
        args: null
        parallelism: null
      ruler:
        data_dir: null
        cluster: null
        setup: null
        max_seq_length: null
        tokenizer_path: null
        template_tokens: null
        num_samples: null
        tasks: null
  supported_endpoint_types:
    - chat
  type: ns_mmlu_pro
target: {}
ns_mmlu_prox#
MMLU-ProX
Harness: nemo_skills
Container:
nvcr.io/nvidia/eval-factory/nemo-skills:26.01
Container Digest:
sha256:43e2c4d6e197744f7fd0a874d06c5600a8b46b54e16d333c0ebf057b6d54635a
Container Arch: multiarch
Task Type: ns_mmlu_prox
cd /nemo_run/code && {% if config.params.extra.use_sandbox %}python -m nemo_skills.code_execution.local_sandbox.local_sandbox_server > {{config.output_dir}}/sandbox.log 2>&1 & SANDBOX_PID=$! && sleep 3 && {% endif %}{% if not config.params.task.startswith('ruler') %} ns prepare_data {{config.params.task}} {% else %} mkdir -p {{config.params.extra.ruler.data_dir}} && ln -sf {{config.params.extra.ruler.data_dir}} /nemo_run/code/ruler_data && ns prepare_data ruler --data_dir={{config.params.extra.ruler.data_dir}} --cluster={{config.params.extra.ruler.cluster}} --setup={{config.params.extra.ruler.setup}} --max_seq_length={{config.params.extra.ruler.max_seq_length}} --tokenizer_path={{config.params.extra.ruler.tokenizer_path}} {% if config.params.extra.ruler.template_tokens is not none %}--template_tokens={{config.params.extra.ruler.template_tokens}}{% endif %} {% if config.params.extra.ruler.num_samples is not none %}--num_samples={{config.params.extra.ruler.num_samples}}{% elif config.params.limit_samples is not none %}--num_samples={{config.params.limit_samples}}{% endif %} {% if config.params.extra.ruler.tasks is not none %}--tasks {% for task in config.params.extra.ruler.tasks %}{{task}}{% if not loop.last %} {% endif %}{% endfor %}{% endif %} {% endif %} && ns eval --server_type=openai --model={{target.api_endpoint.model_id}} --server_address={{target.api_endpoint.url}} --benchmarks={{config.params.task}}{% if config.params.extra.num_repeats is not none and config.params.extra.num_repeats > 1 %}:{{config.params.extra.num_repeats}}{% endif %} --output_dir={{config.output_dir}} {% if config.params.extra.dataset_split is not none %}--split={{config.params.extra.dataset_split}}{% endif %} {% if config.params.extra.ruler.data_dir is not none %}--data_dir={{config.params.extra.ruler.data_dir}}{% endif %} ++server.api_key_env_var={% if target.api_endpoint.api_key_name is not none %}{{target.api_endpoint.api_key_name}}{% else %}DUMMY_API_KEY{% endif %} {% if 
config.params.max_new_tokens is not none %}++inference.tokens_to_generate={{config.params.max_new_tokens}}{% endif %} {% if config.params.extra.system_message is not none %} ++system_message='{{config.params.extra.system_message}}' {% endif %} {% if config.params.limit_samples is not none %}++max_samples={{config.params.limit_samples}}{% endif %} {% if config.params.parallelism is not none %}{% if config.params.extra.num_repeats is not none and config.params.extra.num_repeats > 1 %}++max_concurrent_requests={{[(config.params.parallelism / config.params.extra.num_repeats) | int, 1] | max}}{% else %}++max_concurrent_requests={{config.params.parallelism | int}}{% endif %}{% endif %} {% if config.params.temperature is not none %}++inference.temperature={{config.params.temperature}}{% endif %} {% if config.params.top_p is not none %}++inference.top_p={{config.params.top_p}}{% endif %} {% if config.params.extra.prompt_config is not none %}++prompt_config={{config.params.extra.prompt_config}}{% endif %} {% if config.params.extra.ruler.tokenizer_path is not none %}++tokenizer={{config.params.extra.ruler.tokenizer_path}}{% endif %} {% if config.params.extra.args is not none %} {{config.params.extra.args}} {% endif %} {% if config.params.extra.judge_support and config.params.extra.judge.url is not none %} --judge_model={{config.params.extra.judge.model_id}} --judge_server_address={{config.params.extra.judge.url}} --judge_server_type=openai {% if config.params.extra.judge.generation_type is not none %} --judge_generation_type={{config.params.extra.judge.generation_type}} {% endif %} --extra_judge_args="++server.api_key_env_var={% if config.params.extra.judge.api_key is not none %}{{config.params.extra.judge.api_key}}{% else %}DUMMY_API_KEY{% endif %} {%- if config.params.extra.judge.temperature is not none %} ++inference.temperature={{config.params.extra.judge.temperature}}{% endif %} {%- if config.params.extra.judge.top_p is not none %} 
++inference.top_p={{config.params.extra.judge.top_p}}{% endif %} {%- if config.params.extra.judge.max_new_tokens is not none %} ++inference.tokens_to_generate={{config.params.extra.judge.max_new_tokens}}{% endif %} {%- if config.params.extra.judge.parallelism is not none %} ++max_concurrent_requests={{config.params.extra.judge.parallelism}}{% endif %} {%- if config.params.extra.judge.args is not none %} {{config.params.extra.judge.args}}{% endif %}" {% endif %} {% if config.params.extra.use_sandbox %} ; EXIT_CODE=$? ; kill $SANDBOX_PID 2>/dev/null || true ; exit $EXIT_CODE{% endif %}
framework_name: nemo_skills
pkg_name: nemo_skills
config:
  params:
    parallelism: 16
    task: mmlu-prox
    extra:
      use_sandbox: false
      num_repeats: null
      prompt_config: null
      args: null
      system_message: null
      dataset_split: null
      judge_support: false
      judge:
        url: null
        model_id: null
        api_key: null
        generation_type: null
        random_seed: 1234
        temperature: null
        top_p: null
        max_new_tokens: null
        args: null
        parallelism: null
      ruler:
        data_dir: null
        cluster: null
        setup: null
        max_seq_length: null
        tokenizer_path: null
        template_tokens: null
        num_samples: null
        tasks: null
  supported_endpoint_types:
    - chat
  type: ns_mmlu_prox
target: {}
ns_ruler#
RULER - Long Context Understanding
Harness: nemo_skills
Container:
nvcr.io/nvidia/eval-factory/nemo-skills:26.01
Container Digest:
sha256:43e2c4d6e197744f7fd0a874d06c5600a8b46b54e16d333c0ebf057b6d54635a
Container Arch: multiarch
Task Type: ns_ruler
cd /nemo_run/code && {% if config.params.extra.use_sandbox %}python -m nemo_skills.code_execution.local_sandbox.local_sandbox_server > {{config.output_dir}}/sandbox.log 2>&1 & SANDBOX_PID=$! && sleep 3 && {% endif %}{% if not config.params.task.startswith('ruler') %} ns prepare_data {{config.params.task}} {% else %} mkdir -p {{config.params.extra.ruler.data_dir}} && ln -sf {{config.params.extra.ruler.data_dir}} /nemo_run/code/ruler_data && ns prepare_data ruler --data_dir={{config.params.extra.ruler.data_dir}} --cluster={{config.params.extra.ruler.cluster}} --setup={{config.params.extra.ruler.setup}} --max_seq_length={{config.params.extra.ruler.max_seq_length}} --tokenizer_path={{config.params.extra.ruler.tokenizer_path}} {% if config.params.extra.ruler.template_tokens is not none %}--template_tokens={{config.params.extra.ruler.template_tokens}}{% endif %} {% if config.params.extra.ruler.num_samples is not none %}--num_samples={{config.params.extra.ruler.num_samples}}{% elif config.params.limit_samples is not none %}--num_samples={{config.params.limit_samples}}{% endif %} {% if config.params.extra.ruler.tasks is not none %}--tasks {% for task in config.params.extra.ruler.tasks %}{{task}}{% if not loop.last %} {% endif %}{% endfor %}{% endif %} {% endif %} && ns eval --server_type=openai --model={{target.api_endpoint.model_id}} --server_address={{target.api_endpoint.url}} --benchmarks={{config.params.task}}{% if config.params.extra.num_repeats is not none and config.params.extra.num_repeats > 1 %}:{{config.params.extra.num_repeats}}{% endif %} --output_dir={{config.output_dir}} {% if config.params.extra.dataset_split is not none %}--split={{config.params.extra.dataset_split}}{% endif %} {% if config.params.extra.ruler.data_dir is not none %}--data_dir={{config.params.extra.ruler.data_dir}}{% endif %} ++server.api_key_env_var={% if target.api_endpoint.api_key_name is not none %}{{target.api_endpoint.api_key_name}}{% else %}DUMMY_API_KEY{% endif %} {% if 
config.params.max_new_tokens is not none %}++inference.tokens_to_generate={{config.params.max_new_tokens}}{% endif %} {% if config.params.extra.system_message is not none %} ++system_message='{{config.params.extra.system_message}}' {% endif %} {% if config.params.limit_samples is not none %}++max_samples={{config.params.limit_samples}}{% endif %} {% if config.params.parallelism is not none %}{% if config.params.extra.num_repeats is not none and config.params.extra.num_repeats > 1 %}++max_concurrent_requests={{[(config.params.parallelism / config.params.extra.num_repeats) | int, 1] | max}}{% else %}++max_concurrent_requests={{config.params.parallelism | int}}{% endif %}{% endif %} {% if config.params.temperature is not none %}++inference.temperature={{config.params.temperature}}{% endif %} {% if config.params.top_p is not none %}++inference.top_p={{config.params.top_p}}{% endif %} {% if config.params.extra.prompt_config is not none %}++prompt_config={{config.params.extra.prompt_config}}{% endif %} {% if config.params.extra.ruler.tokenizer_path is not none %}++tokenizer={{config.params.extra.ruler.tokenizer_path}}{% endif %} {% if config.params.extra.args is not none %} {{config.params.extra.args}} {% endif %} {% if config.params.extra.judge_support and config.params.extra.judge.url is not none %} --judge_model={{config.params.extra.judge.model_id}} --judge_server_address={{config.params.extra.judge.url}} --judge_server_type=openai {% if config.params.extra.judge.generation_type is not none %} --judge_generation_type={{config.params.extra.judge.generation_type}} {% endif %} --extra_judge_args="++server.api_key_env_var={% if config.params.extra.judge.api_key is not none %}{{config.params.extra.judge.api_key}}{% else %}DUMMY_API_KEY{% endif %} {%- if config.params.extra.judge.temperature is not none %} ++inference.temperature={{config.params.extra.judge.temperature}}{% endif %} {%- if config.params.extra.judge.top_p is not none %} 
++inference.top_p={{config.params.extra.judge.top_p}}{% endif %} {%- if config.params.extra.judge.max_new_tokens is not none %} ++inference.tokens_to_generate={{config.params.extra.judge.max_new_tokens}}{% endif %} {%- if config.params.extra.judge.parallelism is not none %} ++max_concurrent_requests={{config.params.extra.judge.parallelism}}{% endif %} {%- if config.params.extra.judge.args is not none %} {{config.params.extra.judge.args}}{% endif %}" {% endif %} {% if config.params.extra.use_sandbox %} ; EXIT_CODE=$? ; kill $SANDBOX_PID 2>/dev/null || true ; exit $EXIT_CODE{% endif %}
framework_name: nemo_skills
pkg_name: nemo_skills
config:
  params:
    parallelism: 16
    task: ruler.evaluation_128k
    extra:
      use_sandbox: false
      num_repeats: null
      prompt_config: null
      args: null
      system_message: null
      dataset_split: null
      judge_support: false
      judge:
        url: null
        model_id: null
        api_key: null
        generation_type: null
        random_seed: 1234
        temperature: null
        top_p: null
        max_new_tokens: null
        args: null
        parallelism: null
      ruler:
        data_dir: /workspace/ruler_data
        cluster: local
        setup: evaluation_128k
        max_seq_length: 131072
        tokenizer_path: null
        template_tokens: 50
        num_samples: null
        tasks: null
  supported_endpoint_types:
    - completions
  type: ns_ruler
target: {}
ns_scicode#
SciCode
Harness: nemo_skills
Container:
nvcr.io/nvidia/eval-factory/nemo-skills:26.01
Container Digest:
sha256:43e2c4d6e197744f7fd0a874d06c5600a8b46b54e16d333c0ebf057b6d54635a
Container Arch: multiarch
Task Type: ns_scicode
cd /nemo_run/code && {% if config.params.extra.use_sandbox %}python -m nemo_skills.code_execution.local_sandbox.local_sandbox_server > {{config.output_dir}}/sandbox.log 2>&1 & SANDBOX_PID=$! && sleep 3 && {% endif %}{% if not config.params.task.startswith('ruler') %} ns prepare_data {{config.params.task}} {% else %} mkdir -p {{config.params.extra.ruler.data_dir}} && ln -sf {{config.params.extra.ruler.data_dir}} /nemo_run/code/ruler_data && ns prepare_data ruler --data_dir={{config.params.extra.ruler.data_dir}} --cluster={{config.params.extra.ruler.cluster}} --setup={{config.params.extra.ruler.setup}} --max_seq_length={{config.params.extra.ruler.max_seq_length}} --tokenizer_path={{config.params.extra.ruler.tokenizer_path}} {% if config.params.extra.ruler.template_tokens is not none %}--template_tokens={{config.params.extra.ruler.template_tokens}}{% endif %} {% if config.params.extra.ruler.num_samples is not none %}--num_samples={{config.params.extra.ruler.num_samples}}{% elif config.params.limit_samples is not none %}--num_samples={{config.params.limit_samples}}{% endif %} {% if config.params.extra.ruler.tasks is not none %}--tasks {% for task in config.params.extra.ruler.tasks %}{{task}}{% if not loop.last %} {% endif %}{% endfor %}{% endif %} {% endif %} && ns eval --server_type=openai --model={{target.api_endpoint.model_id}} --server_address={{target.api_endpoint.url}} --benchmarks={{config.params.task}}{% if config.params.extra.num_repeats is not none and config.params.extra.num_repeats > 1 %}:{{config.params.extra.num_repeats}}{% endif %} --output_dir={{config.output_dir}} {% if config.params.extra.dataset_split is not none %}--split={{config.params.extra.dataset_split}}{% endif %} {% if config.params.extra.ruler.data_dir is not none %}--data_dir={{config.params.extra.ruler.data_dir}}{% endif %} ++server.api_key_env_var={% if target.api_endpoint.api_key_name is not none %}{{target.api_endpoint.api_key_name}}{% else %}DUMMY_API_KEY{% endif %} {% if 
config.params.max_new_tokens is not none %}++inference.tokens_to_generate={{config.params.max_new_tokens}}{% endif %} {% if config.params.extra.system_message is not none %} ++system_message='{{config.params.extra.system_message}}' {% endif %} {% if config.params.limit_samples is not none %}++max_samples={{config.params.limit_samples}}{% endif %} {% if config.params.parallelism is not none %}{% if config.params.extra.num_repeats is not none and config.params.extra.num_repeats > 1 %}++max_concurrent_requests={{[(config.params.parallelism / config.params.extra.num_repeats) | int, 1] | max}}{% else %}++max_concurrent_requests={{config.params.parallelism | int}}{% endif %}{% endif %} {% if config.params.temperature is not none %}++inference.temperature={{config.params.temperature}}{% endif %} {% if config.params.top_p is not none %}++inference.top_p={{config.params.top_p}}{% endif %} {% if config.params.extra.prompt_config is not none %}++prompt_config={{config.params.extra.prompt_config}}{% endif %} {% if config.params.extra.ruler.tokenizer_path is not none %}++tokenizer={{config.params.extra.ruler.tokenizer_path}}{% endif %} {% if config.params.extra.args is not none %} {{config.params.extra.args}} {% endif %} {% if config.params.extra.judge_support and config.params.extra.judge.url is not none %} --judge_model={{config.params.extra.judge.model_id}} --judge_server_address={{config.params.extra.judge.url}} --judge_server_type=openai {% if config.params.extra.judge.generation_type is not none %} --judge_generation_type={{config.params.extra.judge.generation_type}} {% endif %} --extra_judge_args="++server.api_key_env_var={% if config.params.extra.judge.api_key is not none %}{{config.params.extra.judge.api_key}}{% else %}DUMMY_API_KEY{% endif %} {%- if config.params.extra.judge.temperature is not none %} ++inference.temperature={{config.params.extra.judge.temperature}}{% endif %} {%- if config.params.extra.judge.top_p is not none %} 
++inference.top_p={{config.params.extra.judge.top_p}}{% endif %} {%- if config.params.extra.judge.max_new_tokens is not none %} ++inference.tokens_to_generate={{config.params.extra.judge.max_new_tokens}}{% endif %} {%- if config.params.extra.judge.parallelism is not none %} ++max_concurrent_requests={{config.params.extra.judge.parallelism}}{% endif %} {%- if config.params.extra.judge.args is not none %} {{config.params.extra.judge.args}}{% endif %}" {% endif %} {% if config.params.extra.use_sandbox %} ; EXIT_CODE=$? ; kill $SANDBOX_PID 2>/dev/null || true ; exit $EXIT_CODE{% endif %}
framework_name: nemo_skills
pkg_name: nemo_skills
config:
  params:
    parallelism: 16
    task: scicode
    extra:
      use_sandbox: true
      num_repeats: null
      prompt_config: null
      args: null
      system_message: null
      dataset_split: null
      judge_support: false
      judge:
        url: null
        model_id: null
        api_key: null
        generation_type: null
        random_seed: 1234
        temperature: null
        top_p: null
        max_new_tokens: null
        args: null
        parallelism: null
      ruler:
        data_dir: null
        cluster: null
        setup: null
        max_seq_length: null
        tokenizer_path: null
        template_tokens: null
        num_samples: null
        tasks: null
  supported_endpoint_types:
    - chat
  type: ns_scicode
target: {}
ns_wmt24pp#
WMT24++
Harness: nemo_skills
Container:
nvcr.io/nvidia/eval-factory/nemo-skills:26.01
Container Digest:
sha256:43e2c4d6e197744f7fd0a874d06c5600a8b46b54e16d333c0ebf057b6d54635a
Container Arch: multiarch
Task Type: ns_wmt24pp
cd /nemo_run/code && {% if config.params.extra.use_sandbox %}python -m nemo_skills.code_execution.local_sandbox.local_sandbox_server > {{config.output_dir}}/sandbox.log 2>&1 & SANDBOX_PID=$! && sleep 3 && {% endif %}{% if not config.params.task.startswith('ruler') %} ns prepare_data {{config.params.task}} {% else %} mkdir -p {{config.params.extra.ruler.data_dir}} && ln -sf {{config.params.extra.ruler.data_dir}} /nemo_run/code/ruler_data && ns prepare_data ruler --data_dir={{config.params.extra.ruler.data_dir}} --cluster={{config.params.extra.ruler.cluster}} --setup={{config.params.extra.ruler.setup}} --max_seq_length={{config.params.extra.ruler.max_seq_length}} --tokenizer_path={{config.params.extra.ruler.tokenizer_path}} {% if config.params.extra.ruler.template_tokens is not none %}--template_tokens={{config.params.extra.ruler.template_tokens}}{% endif %} {% if config.params.extra.ruler.num_samples is not none %}--num_samples={{config.params.extra.ruler.num_samples}}{% elif config.params.limit_samples is not none %}--num_samples={{config.params.limit_samples}}{% endif %} {% if config.params.extra.ruler.tasks is not none %}--tasks {% for task in config.params.extra.ruler.tasks %}{{task}}{% if not loop.last %} {% endif %}{% endfor %}{% endif %} {% endif %} && ns eval --server_type=openai --model={{target.api_endpoint.model_id}} --server_address={{target.api_endpoint.url}} --benchmarks={{config.params.task}}{% if config.params.extra.num_repeats is not none and config.params.extra.num_repeats > 1 %}:{{config.params.extra.num_repeats}}{% endif %} --output_dir={{config.output_dir}} {% if config.params.extra.dataset_split is not none %}--split={{config.params.extra.dataset_split}}{% endif %} {% if config.params.extra.ruler.data_dir is not none %}--data_dir={{config.params.extra.ruler.data_dir}}{% endif %} ++server.api_key_env_var={% if target.api_endpoint.api_key_name is not none %}{{target.api_endpoint.api_key_name}}{% else %}DUMMY_API_KEY{% endif %} {% if 
config.params.max_new_tokens is not none %}++inference.tokens_to_generate={{config.params.max_new_tokens}}{% endif %} {% if config.params.extra.system_message is not none %} ++system_message='{{config.params.extra.system_message}}' {% endif %} {% if config.params.limit_samples is not none %}++max_samples={{config.params.limit_samples}}{% endif %} {% if config.params.parallelism is not none %}{% if config.params.extra.num_repeats is not none and config.params.extra.num_repeats > 1 %}++max_concurrent_requests={{[(config.params.parallelism / config.params.extra.num_repeats) | int, 1] | max}}{% else %}++max_concurrent_requests={{config.params.parallelism | int}}{% endif %}{% endif %} {% if config.params.temperature is not none %}++inference.temperature={{config.params.temperature}}{% endif %} {% if config.params.top_p is not none %}++inference.top_p={{config.params.top_p}}{% endif %} {% if config.params.extra.prompt_config is not none %}++prompt_config={{config.params.extra.prompt_config}}{% endif %} {% if config.params.extra.ruler.tokenizer_path is not none %}++tokenizer={{config.params.extra.ruler.tokenizer_path}}{% endif %} {% if config.params.extra.args is not none %} {{config.params.extra.args}} {% endif %} {% if config.params.extra.judge_support and config.params.extra.judge.url is not none %} --judge_model={{config.params.extra.judge.model_id}} --judge_server_address={{config.params.extra.judge.url}} --judge_server_type=openai {% if config.params.extra.judge.generation_type is not none %} --judge_generation_type={{config.params.extra.judge.generation_type}} {% endif %} --extra_judge_args="++server.api_key_env_var={% if config.params.extra.judge.api_key is not none %}{{config.params.extra.judge.api_key}}{% else %}DUMMY_API_KEY{% endif %} {%- if config.params.extra.judge.temperature is not none %} ++inference.temperature={{config.params.extra.judge.temperature}}{% endif %} {%- if config.params.extra.judge.top_p is not none %} 
++inference.top_p={{config.params.extra.judge.top_p}}{% endif %} {%- if config.params.extra.judge.max_new_tokens is not none %} ++inference.tokens_to_generate={{config.params.extra.judge.max_new_tokens}}{% endif %} {%- if config.params.extra.judge.parallelism is not none %} ++max_concurrent_requests={{config.params.extra.judge.parallelism}}{% endif %} {%- if config.params.extra.judge.args is not none %} {{config.params.extra.judge.args}}{% endif %}" {% endif %} {% if config.params.extra.use_sandbox %} ; EXIT_CODE=$? ; kill $SANDBOX_PID 2>/dev/null || true ; exit $EXIT_CODE{% endif %}
framework_name: nemo_skills
pkg_name: nemo_skills
config:
  params:
    parallelism: 16
    task: wmt24pp
    extra:
      use_sandbox: false
      num_repeats: null
      prompt_config: null
      args: null
      system_message: null
      dataset_split: null
      judge_support: false
      judge:
        url: null
        model_id: null
        api_key: null
        generation_type: null
        random_seed: 1234
        temperature: null
        top_p: null
        max_new_tokens: null
        args: null
        parallelism: null
      ruler:
        data_dir: null
        cluster: null
        setup: null
        max_seq_length: null
        tokenizer_path: null
        template_tokens: null
        num_samples: null
        tasks: null
  supported_endpoint_types:
    - chat
  type: ns_wmt24pp
target: {}