# nemo_skills

This page contains all evaluation tasks for the nemo_skills harness.

| Task | Description |
|------|-------------|
| ns_aa_lcr | AA-LCR |
| ns_aime2024 | AIME2024 |
| ns_aime2025 | AIME2025 |
| ns_bfcl_v3 | BFCLv3 |
| ns_bfcl_v4 | BFCLv4 |
| ns_gpqa | GPQA Diamond |
| ns_hle | HumanityLastExam |
| ns_hle_aa | HumanityLastExam aligned with AA |
| ns_hmmt_feb2025 | HMMT February 2025 (MathArena/hmmt_feb_2025) |
| ns_ifbench | IFBench - Instruction Following Benchmark |
| ns_ifeval | IFEval - Instruction-Following Evaluation for Large Language Models |
| ns_livecodebench | LiveCodeBench v6 |
| ns_livecodebench_aa | LiveCodeBench with AA custom prompt format (315 problems from July 2024 to Dec 2024, release_v5) |
| ns_livecodebench_v5 | LiveCodeBench v5 |
| ns_mmlu | MMLU |
| ns_mmlu_pro | MMLU-PRO |
| ns_mmlu_prox | MMLU-ProX |
| ns_ruler | RULER - Long Context Understanding |
| ns_scicode | SciCode |
| ns_wmt24pp | WMT24++ |

## ns_aa_lcr

AA-LCR

Harness: nemo_skills

Container:

nvcr.io/nvidia/eval-factory/nemo-skills:26.01

Container Digest:

sha256:43e2c4d6e197744f7fd0a874d06c5600a8b46b54e16d333c0ebf057b6d54635a

Container Arch: multiarch

Task Type: ns_aa_lcr

cd /nemo_run/code && {% if config.params.extra.use_sandbox %}python -m nemo_skills.code_execution.local_sandbox.local_sandbox_server > {{config.output_dir}}/sandbox.log 2>&1 & SANDBOX_PID=$! && sleep 3 && {% endif %}{% if not config.params.task.startswith('ruler') %} ns prepare_data {{config.params.task}} {% else %} mkdir -p {{config.params.extra.ruler.data_dir}} && ln -sf {{config.params.extra.ruler.data_dir}} /nemo_run/code/ruler_data &&  ns prepare_data ruler --data_dir={{config.params.extra.ruler.data_dir}} --cluster={{config.params.extra.ruler.cluster}} --setup={{config.params.extra.ruler.setup}} --max_seq_length={{config.params.extra.ruler.max_seq_length}} --tokenizer_path={{config.params.extra.ruler.tokenizer_path}} {% if config.params.extra.ruler.template_tokens is not none %}--template_tokens={{config.params.extra.ruler.template_tokens}}{% endif %} {% if config.params.extra.ruler.num_samples is not none %}--num_samples={{config.params.extra.ruler.num_samples}}{% elif config.params.limit_samples is not none %}--num_samples={{config.params.limit_samples}}{% endif %} {% if config.params.extra.ruler.tasks is not none %}--tasks {% for task in config.params.extra.ruler.tasks %}{{task}}{% if not loop.last %} {% endif %}{% endfor %}{% endif %} {% endif %} && ns eval --server_type=openai --model={{target.api_endpoint.model_id}} --server_address={{target.api_endpoint.url}} --benchmarks={{config.params.task}}{% if config.params.extra.num_repeats is not none and config.params.extra.num_repeats > 1 %}:{{config.params.extra.num_repeats}}{% endif %} --output_dir={{config.output_dir}} {% if config.params.extra.dataset_split is not none %}--split={{config.params.extra.dataset_split}}{% endif %} {% if config.params.extra.ruler.data_dir is not none %}--data_dir={{config.params.extra.ruler.data_dir}}{% endif %} ++server.api_key_env_var={% if target.api_endpoint.api_key_name is not none %}{{target.api_endpoint.api_key_name}}{% else %}DUMMY_API_KEY{% endif %} {% if 
config.params.max_new_tokens is not none %}++inference.tokens_to_generate={{config.params.max_new_tokens}}{% endif %} {% if config.params.extra.system_message is not none %} ++system_message='{{config.params.extra.system_message}}' {% endif %} {% if config.params.limit_samples is not none %}++max_samples={{config.params.limit_samples}}{% endif %} {% if config.params.parallelism is not none %}{% if config.params.extra.num_repeats is not none and config.params.extra.num_repeats > 1 %}++max_concurrent_requests={{[(config.params.parallelism / config.params.extra.num_repeats) | int, 1] | max}}{% else %}++max_concurrent_requests={{config.params.parallelism | int}}{% endif %}{% endif %} {% if config.params.temperature is not none %}++inference.temperature={{config.params.temperature}}{% endif %} {% if config.params.top_p is not none %}++inference.top_p={{config.params.top_p}}{% endif %} {% if config.params.extra.prompt_config is not none %}++prompt_config={{config.params.extra.prompt_config}}{% endif %} {% if config.params.extra.ruler.tokenizer_path is not none %}++tokenizer={{config.params.extra.ruler.tokenizer_path}}{% endif %} {% if config.params.extra.args is not none %} {{config.params.extra.args}} {% endif %} {% if config.params.extra.judge_support and config.params.extra.judge.url is not none %} --judge_model={{config.params.extra.judge.model_id}} --judge_server_address={{config.params.extra.judge.url}} --judge_server_type=openai {% if config.params.extra.judge.generation_type is not none %} --judge_generation_type={{config.params.extra.judge.generation_type}} {% endif %} --extra_judge_args="++server.api_key_env_var={% if config.params.extra.judge.api_key is not none %}{{config.params.extra.judge.api_key}}{% else %}DUMMY_API_KEY{% endif %} {%- if config.params.extra.judge.temperature is not none %} ++inference.temperature={{config.params.extra.judge.temperature}}{% endif %} {%- if config.params.extra.judge.top_p is not none %} 
++inference.top_p={{config.params.extra.judge.top_p}}{% endif %} {%- if config.params.extra.judge.max_new_tokens is not none %} ++inference.tokens_to_generate={{config.params.extra.judge.max_new_tokens}}{% endif %} {%- if config.params.extra.judge.parallelism is not none %} ++max_concurrent_requests={{config.params.extra.judge.parallelism}}{% endif %} {%- if config.params.extra.judge.args is not none %} {{config.params.extra.judge.args}}{% endif %}" {% endif %} {% if config.params.extra.use_sandbox %} ; EXIT_CODE=$? ; kill $SANDBOX_PID 2>/dev/null || true ; exit $EXIT_CODE{% endif %}
framework_name: nemo_skills
pkg_name: nemo_skills
config:
  params:
    parallelism: 16
    task: aalcr
    extra:
      use_sandbox: false
      num_repeats: null
      prompt_config: null
      args: null
      system_message: null
      dataset_split: null
      judge_support: true
      judge:
        url: null
        model_id: null
        api_key: null
        generation_type: null
        random_seed: 1234
        temperature: 0.0
        top_p: 1.0
        max_new_tokens: 4096
        args: null
        parallelism: null
      ruler:
        data_dir: null
        cluster: null
        setup: null
        max_seq_length: null
        tokenizer_path: null
        template_tokens: null
        num_samples: null
        tasks: null
  supported_endpoint_types:
  - chat
  type: ns_aa_lcr
target: {}

## ns_aime2024

AIME2024

Harness: nemo_skills

Container:

nvcr.io/nvidia/eval-factory/nemo-skills:26.01

Container Digest:

sha256:43e2c4d6e197744f7fd0a874d06c5600a8b46b54e16d333c0ebf057b6d54635a

Container Arch: multiarch

Task Type: ns_aime2024

cd /nemo_run/code && {% if config.params.extra.use_sandbox %}python -m nemo_skills.code_execution.local_sandbox.local_sandbox_server > {{config.output_dir}}/sandbox.log 2>&1 & SANDBOX_PID=$! && sleep 3 && {% endif %}{% if not config.params.task.startswith('ruler') %} ns prepare_data {{config.params.task}} {% else %} mkdir -p {{config.params.extra.ruler.data_dir}} && ln -sf {{config.params.extra.ruler.data_dir}} /nemo_run/code/ruler_data &&  ns prepare_data ruler --data_dir={{config.params.extra.ruler.data_dir}} --cluster={{config.params.extra.ruler.cluster}} --setup={{config.params.extra.ruler.setup}} --max_seq_length={{config.params.extra.ruler.max_seq_length}} --tokenizer_path={{config.params.extra.ruler.tokenizer_path}} {% if config.params.extra.ruler.template_tokens is not none %}--template_tokens={{config.params.extra.ruler.template_tokens}}{% endif %} {% if config.params.extra.ruler.num_samples is not none %}--num_samples={{config.params.extra.ruler.num_samples}}{% elif config.params.limit_samples is not none %}--num_samples={{config.params.limit_samples}}{% endif %} {% if config.params.extra.ruler.tasks is not none %}--tasks {% for task in config.params.extra.ruler.tasks %}{{task}}{% if not loop.last %} {% endif %}{% endfor %}{% endif %} {% endif %} && ns eval --server_type=openai --model={{target.api_endpoint.model_id}} --server_address={{target.api_endpoint.url}} --benchmarks={{config.params.task}}{% if config.params.extra.num_repeats is not none and config.params.extra.num_repeats > 1 %}:{{config.params.extra.num_repeats}}{% endif %} --output_dir={{config.output_dir}} {% if config.params.extra.dataset_split is not none %}--split={{config.params.extra.dataset_split}}{% endif %} {% if config.params.extra.ruler.data_dir is not none %}--data_dir={{config.params.extra.ruler.data_dir}}{% endif %} ++server.api_key_env_var={% if target.api_endpoint.api_key_name is not none %}{{target.api_endpoint.api_key_name}}{% else %}DUMMY_API_KEY{% endif %} {% if 
config.params.max_new_tokens is not none %}++inference.tokens_to_generate={{config.params.max_new_tokens}}{% endif %} {% if config.params.extra.system_message is not none %} ++system_message='{{config.params.extra.system_message}}' {% endif %} {% if config.params.limit_samples is not none %}++max_samples={{config.params.limit_samples}}{% endif %} {% if config.params.parallelism is not none %}{% if config.params.extra.num_repeats is not none and config.params.extra.num_repeats > 1 %}++max_concurrent_requests={{[(config.params.parallelism / config.params.extra.num_repeats) | int, 1] | max}}{% else %}++max_concurrent_requests={{config.params.parallelism | int}}{% endif %}{% endif %} {% if config.params.temperature is not none %}++inference.temperature={{config.params.temperature}}{% endif %} {% if config.params.top_p is not none %}++inference.top_p={{config.params.top_p}}{% endif %} {% if config.params.extra.prompt_config is not none %}++prompt_config={{config.params.extra.prompt_config}}{% endif %} {% if config.params.extra.ruler.tokenizer_path is not none %}++tokenizer={{config.params.extra.ruler.tokenizer_path}}{% endif %} {% if config.params.extra.args is not none %} {{config.params.extra.args}} {% endif %} {% if config.params.extra.judge_support and config.params.extra.judge.url is not none %} --judge_model={{config.params.extra.judge.model_id}} --judge_server_address={{config.params.extra.judge.url}} --judge_server_type=openai {% if config.params.extra.judge.generation_type is not none %} --judge_generation_type={{config.params.extra.judge.generation_type}} {% endif %} --extra_judge_args="++server.api_key_env_var={% if config.params.extra.judge.api_key is not none %}{{config.params.extra.judge.api_key}}{% else %}DUMMY_API_KEY{% endif %} {%- if config.params.extra.judge.temperature is not none %} ++inference.temperature={{config.params.extra.judge.temperature}}{% endif %} {%- if config.params.extra.judge.top_p is not none %} 
++inference.top_p={{config.params.extra.judge.top_p}}{% endif %} {%- if config.params.extra.judge.max_new_tokens is not none %} ++inference.tokens_to_generate={{config.params.extra.judge.max_new_tokens}}{% endif %} {%- if config.params.extra.judge.parallelism is not none %} ++max_concurrent_requests={{config.params.extra.judge.parallelism}}{% endif %} {%- if config.params.extra.judge.args is not none %} {{config.params.extra.judge.args}}{% endif %}" {% endif %} {% if config.params.extra.use_sandbox %} ; EXIT_CODE=$? ; kill $SANDBOX_PID 2>/dev/null || true ; exit $EXIT_CODE{% endif %}
framework_name: nemo_skills
pkg_name: nemo_skills
config:
  params:
    parallelism: 16
    task: aime24
    extra:
      use_sandbox: false
      num_repeats: null
      prompt_config: null
      args: null
      system_message: null
      dataset_split: null
      judge_support: true
      judge:
        url: null
        model_id: null
        api_key: null
        generation_type: math_judge
        random_seed: 1234
        temperature: null
        top_p: null
        max_new_tokens: null
        args: null
        parallelism: null
      ruler:
        data_dir: null
        cluster: null
        setup: null
        max_seq_length: null
        tokenizer_path: null
        template_tokens: null
        num_samples: null
        tasks: null
  supported_endpoint_types:
  - chat
  type: ns_aime2024
target: {}

## ns_aime2025

AIME2025

Harness: nemo_skills

Container:

nvcr.io/nvidia/eval-factory/nemo-skills:26.01

Container Digest:

sha256:43e2c4d6e197744f7fd0a874d06c5600a8b46b54e16d333c0ebf057b6d54635a

Container Arch: multiarch

Task Type: ns_aime2025

cd /nemo_run/code && {% if config.params.extra.use_sandbox %}python -m nemo_skills.code_execution.local_sandbox.local_sandbox_server > {{config.output_dir}}/sandbox.log 2>&1 & SANDBOX_PID=$! && sleep 3 && {% endif %}{% if not config.params.task.startswith('ruler') %} ns prepare_data {{config.params.task}} {% else %} mkdir -p {{config.params.extra.ruler.data_dir}} && ln -sf {{config.params.extra.ruler.data_dir}} /nemo_run/code/ruler_data &&  ns prepare_data ruler --data_dir={{config.params.extra.ruler.data_dir}} --cluster={{config.params.extra.ruler.cluster}} --setup={{config.params.extra.ruler.setup}} --max_seq_length={{config.params.extra.ruler.max_seq_length}} --tokenizer_path={{config.params.extra.ruler.tokenizer_path}} {% if config.params.extra.ruler.template_tokens is not none %}--template_tokens={{config.params.extra.ruler.template_tokens}}{% endif %} {% if config.params.extra.ruler.num_samples is not none %}--num_samples={{config.params.extra.ruler.num_samples}}{% elif config.params.limit_samples is not none %}--num_samples={{config.params.limit_samples}}{% endif %} {% if config.params.extra.ruler.tasks is not none %}--tasks {% for task in config.params.extra.ruler.tasks %}{{task}}{% if not loop.last %} {% endif %}{% endfor %}{% endif %} {% endif %} && ns eval --server_type=openai --model={{target.api_endpoint.model_id}} --server_address={{target.api_endpoint.url}} --benchmarks={{config.params.task}}{% if config.params.extra.num_repeats is not none and config.params.extra.num_repeats > 1 %}:{{config.params.extra.num_repeats}}{% endif %} --output_dir={{config.output_dir}} {% if config.params.extra.dataset_split is not none %}--split={{config.params.extra.dataset_split}}{% endif %} {% if config.params.extra.ruler.data_dir is not none %}--data_dir={{config.params.extra.ruler.data_dir}}{% endif %} ++server.api_key_env_var={% if target.api_endpoint.api_key_name is not none %}{{target.api_endpoint.api_key_name}}{% else %}DUMMY_API_KEY{% endif %} {% if 
config.params.max_new_tokens is not none %}++inference.tokens_to_generate={{config.params.max_new_tokens}}{% endif %} {% if config.params.extra.system_message is not none %} ++system_message='{{config.params.extra.system_message}}' {% endif %} {% if config.params.limit_samples is not none %}++max_samples={{config.params.limit_samples}}{% endif %} {% if config.params.parallelism is not none %}{% if config.params.extra.num_repeats is not none and config.params.extra.num_repeats > 1 %}++max_concurrent_requests={{[(config.params.parallelism / config.params.extra.num_repeats) | int, 1] | max}}{% else %}++max_concurrent_requests={{config.params.parallelism | int}}{% endif %}{% endif %} {% if config.params.temperature is not none %}++inference.temperature={{config.params.temperature}}{% endif %} {% if config.params.top_p is not none %}++inference.top_p={{config.params.top_p}}{% endif %} {% if config.params.extra.prompt_config is not none %}++prompt_config={{config.params.extra.prompt_config}}{% endif %} {% if config.params.extra.ruler.tokenizer_path is not none %}++tokenizer={{config.params.extra.ruler.tokenizer_path}}{% endif %} {% if config.params.extra.args is not none %} {{config.params.extra.args}} {% endif %} {% if config.params.extra.judge_support and config.params.extra.judge.url is not none %} --judge_model={{config.params.extra.judge.model_id}} --judge_server_address={{config.params.extra.judge.url}} --judge_server_type=openai {% if config.params.extra.judge.generation_type is not none %} --judge_generation_type={{config.params.extra.judge.generation_type}} {% endif %} --extra_judge_args="++server.api_key_env_var={% if config.params.extra.judge.api_key is not none %}{{config.params.extra.judge.api_key}}{% else %}DUMMY_API_KEY{% endif %} {%- if config.params.extra.judge.temperature is not none %} ++inference.temperature={{config.params.extra.judge.temperature}}{% endif %} {%- if config.params.extra.judge.top_p is not none %} 
++inference.top_p={{config.params.extra.judge.top_p}}{% endif %} {%- if config.params.extra.judge.max_new_tokens is not none %} ++inference.tokens_to_generate={{config.params.extra.judge.max_new_tokens}}{% endif %} {%- if config.params.extra.judge.parallelism is not none %} ++max_concurrent_requests={{config.params.extra.judge.parallelism}}{% endif %} {%- if config.params.extra.judge.args is not none %} {{config.params.extra.judge.args}}{% endif %}" {% endif %} {% if config.params.extra.use_sandbox %} ; EXIT_CODE=$? ; kill $SANDBOX_PID 2>/dev/null || true ; exit $EXIT_CODE{% endif %}
framework_name: nemo_skills
pkg_name: nemo_skills
config:
  params:
    parallelism: 16
    task: aime25
    extra:
      use_sandbox: false
      num_repeats: null
      prompt_config: null
      args: null
      system_message: null
      dataset_split: null
      judge_support: true
      judge:
        url: null
        model_id: null
        api_key: null
        generation_type: math_judge
        random_seed: 1234
        temperature: null
        top_p: null
        max_new_tokens: null
        args: null
        parallelism: null
      ruler:
        data_dir: null
        cluster: null
        setup: null
        max_seq_length: null
        tokenizer_path: null
        template_tokens: null
        num_samples: null
        tasks: null
  supported_endpoint_types:
  - chat
  type: ns_aime2025
target: {}

## ns_bfcl_v3

BFCLv3

Harness: nemo_skills

Container:

nvcr.io/nvidia/eval-factory/nemo-skills:26.01

Container Digest:

sha256:43e2c4d6e197744f7fd0a874d06c5600a8b46b54e16d333c0ebf057b6d54635a

Container Arch: multiarch

Task Type: ns_bfcl_v3

cd /nemo_run/code && {% if config.params.extra.use_sandbox %}python -m nemo_skills.code_execution.local_sandbox.local_sandbox_server > {{config.output_dir}}/sandbox.log 2>&1 & SANDBOX_PID=$! && sleep 3 && {% endif %}{% if not config.params.task.startswith('ruler') %} ns prepare_data {{config.params.task}} {% else %} mkdir -p {{config.params.extra.ruler.data_dir}} && ln -sf {{config.params.extra.ruler.data_dir}} /nemo_run/code/ruler_data &&  ns prepare_data ruler --data_dir={{config.params.extra.ruler.data_dir}} --cluster={{config.params.extra.ruler.cluster}} --setup={{config.params.extra.ruler.setup}} --max_seq_length={{config.params.extra.ruler.max_seq_length}} --tokenizer_path={{config.params.extra.ruler.tokenizer_path}} {% if config.params.extra.ruler.template_tokens is not none %}--template_tokens={{config.params.extra.ruler.template_tokens}}{% endif %} {% if config.params.extra.ruler.num_samples is not none %}--num_samples={{config.params.extra.ruler.num_samples}}{% elif config.params.limit_samples is not none %}--num_samples={{config.params.limit_samples}}{% endif %} {% if config.params.extra.ruler.tasks is not none %}--tasks {% for task in config.params.extra.ruler.tasks %}{{task}}{% if not loop.last %} {% endif %}{% endfor %}{% endif %} {% endif %} && ns eval --server_type=openai --model={{target.api_endpoint.model_id}} --server_address={{target.api_endpoint.url}} --benchmarks={{config.params.task}}{% if config.params.extra.num_repeats is not none and config.params.extra.num_repeats > 1 %}:{{config.params.extra.num_repeats}}{% endif %} --output_dir={{config.output_dir}} {% if config.params.extra.dataset_split is not none %}--split={{config.params.extra.dataset_split}}{% endif %} {% if config.params.extra.ruler.data_dir is not none %}--data_dir={{config.params.extra.ruler.data_dir}}{% endif %} ++server.api_key_env_var={% if target.api_endpoint.api_key_name is not none %}{{target.api_endpoint.api_key_name}}{% else %}DUMMY_API_KEY{% endif %} {% if 
config.params.max_new_tokens is not none %}++inference.tokens_to_generate={{config.params.max_new_tokens}}{% endif %} {% if config.params.extra.system_message is not none %} ++system_message='{{config.params.extra.system_message}}' {% endif %} {% if config.params.limit_samples is not none %}++max_samples={{config.params.limit_samples}}{% endif %} {% if config.params.parallelism is not none %}{% if config.params.extra.num_repeats is not none and config.params.extra.num_repeats > 1 %}++max_concurrent_requests={{[(config.params.parallelism / config.params.extra.num_repeats) | int, 1] | max}}{% else %}++max_concurrent_requests={{config.params.parallelism | int}}{% endif %}{% endif %} {% if config.params.temperature is not none %}++inference.temperature={{config.params.temperature}}{% endif %} {% if config.params.top_p is not none %}++inference.top_p={{config.params.top_p}}{% endif %} {% if config.params.extra.prompt_config is not none %}++prompt_config={{config.params.extra.prompt_config}}{% endif %} {% if config.params.extra.ruler.tokenizer_path is not none %}++tokenizer={{config.params.extra.ruler.tokenizer_path}}{% endif %} {% if config.params.extra.args is not none %} {{config.params.extra.args}} {% endif %} {% if config.params.extra.judge_support and config.params.extra.judge.url is not none %} --judge_model={{config.params.extra.judge.model_id}} --judge_server_address={{config.params.extra.judge.url}} --judge_server_type=openai {% if config.params.extra.judge.generation_type is not none %} --judge_generation_type={{config.params.extra.judge.generation_type}} {% endif %} --extra_judge_args="++server.api_key_env_var={% if config.params.extra.judge.api_key is not none %}{{config.params.extra.judge.api_key}}{% else %}DUMMY_API_KEY{% endif %} {%- if config.params.extra.judge.temperature is not none %} ++inference.temperature={{config.params.extra.judge.temperature}}{% endif %} {%- if config.params.extra.judge.top_p is not none %} 
++inference.top_p={{config.params.extra.judge.top_p}}{% endif %} {%- if config.params.extra.judge.max_new_tokens is not none %} ++inference.tokens_to_generate={{config.params.extra.judge.max_new_tokens}}{% endif %} {%- if config.params.extra.judge.parallelism is not none %} ++max_concurrent_requests={{config.params.extra.judge.parallelism}}{% endif %} {%- if config.params.extra.judge.args is not none %} {{config.params.extra.judge.args}}{% endif %}" {% endif %} {% if config.params.extra.use_sandbox %} ; EXIT_CODE=$? ; kill $SANDBOX_PID 2>/dev/null || true ; exit $EXIT_CODE{% endif %}
framework_name: nemo_skills
pkg_name: nemo_skills
config:
  params:
    parallelism: 16
    task: bfcl_v3
    extra:
      use_sandbox: false
      num_repeats: null
      prompt_config: null
      args: ++use_client_parsing=False
      system_message: null
      dataset_split: null
      judge_support: false
      judge:
        url: null
        model_id: null
        api_key: null
        generation_type: null
        random_seed: 1234
        temperature: null
        top_p: null
        max_new_tokens: null
        args: null
        parallelism: null
      ruler:
        data_dir: null
        cluster: null
        setup: null
        max_seq_length: null
        tokenizer_path: null
        template_tokens: null
        num_samples: null
        tasks: null
  supported_endpoint_types:
  - chat
  type: ns_bfcl_v3
target: {}

## ns_bfcl_v4

BFCLv4

Harness: nemo_skills

Container:

nvcr.io/nvidia/eval-factory/nemo-skills:26.01

Container Digest:

sha256:43e2c4d6e197744f7fd0a874d06c5600a8b46b54e16d333c0ebf057b6d54635a

Container Arch: multiarch

Task Type: ns_bfcl_v4

cd /nemo_run/code && {% if config.params.extra.use_sandbox %}python -m nemo_skills.code_execution.local_sandbox.local_sandbox_server > {{config.output_dir}}/sandbox.log 2>&1 & SANDBOX_PID=$! && sleep 3 && {% endif %}{% if not config.params.task.startswith('ruler') %} ns prepare_data {{config.params.task}} {% else %} mkdir -p {{config.params.extra.ruler.data_dir}} && ln -sf {{config.params.extra.ruler.data_dir}} /nemo_run/code/ruler_data &&  ns prepare_data ruler --data_dir={{config.params.extra.ruler.data_dir}} --cluster={{config.params.extra.ruler.cluster}} --setup={{config.params.extra.ruler.setup}} --max_seq_length={{config.params.extra.ruler.max_seq_length}} --tokenizer_path={{config.params.extra.ruler.tokenizer_path}} {% if config.params.extra.ruler.template_tokens is not none %}--template_tokens={{config.params.extra.ruler.template_tokens}}{% endif %} {% if config.params.extra.ruler.num_samples is not none %}--num_samples={{config.params.extra.ruler.num_samples}}{% elif config.params.limit_samples is not none %}--num_samples={{config.params.limit_samples}}{% endif %} {% if config.params.extra.ruler.tasks is not none %}--tasks {% for task in config.params.extra.ruler.tasks %}{{task}}{% if not loop.last %} {% endif %}{% endfor %}{% endif %} {% endif %} && ns eval --server_type=openai --model={{target.api_endpoint.model_id}} --server_address={{target.api_endpoint.url}} --benchmarks={{config.params.task}}{% if config.params.extra.num_repeats is not none and config.params.extra.num_repeats > 1 %}:{{config.params.extra.num_repeats}}{% endif %} --output_dir={{config.output_dir}} {% if config.params.extra.dataset_split is not none %}--split={{config.params.extra.dataset_split}}{% endif %} {% if config.params.extra.ruler.data_dir is not none %}--data_dir={{config.params.extra.ruler.data_dir}}{% endif %} ++server.api_key_env_var={% if target.api_endpoint.api_key_name is not none %}{{target.api_endpoint.api_key_name}}{% else %}DUMMY_API_KEY{% endif %} {% if 
config.params.max_new_tokens is not none %}++inference.tokens_to_generate={{config.params.max_new_tokens}}{% endif %} {% if config.params.extra.system_message is not none %} ++system_message='{{config.params.extra.system_message}}' {% endif %} {% if config.params.limit_samples is not none %}++max_samples={{config.params.limit_samples}}{% endif %} {% if config.params.parallelism is not none %}{% if config.params.extra.num_repeats is not none and config.params.extra.num_repeats > 1 %}++max_concurrent_requests={{[(config.params.parallelism / config.params.extra.num_repeats) | int, 1] | max}}{% else %}++max_concurrent_requests={{config.params.parallelism | int}}{% endif %}{% endif %} {% if config.params.temperature is not none %}++inference.temperature={{config.params.temperature}}{% endif %} {% if config.params.top_p is not none %}++inference.top_p={{config.params.top_p}}{% endif %} {% if config.params.extra.prompt_config is not none %}++prompt_config={{config.params.extra.prompt_config}}{% endif %} {% if config.params.extra.ruler.tokenizer_path is not none %}++tokenizer={{config.params.extra.ruler.tokenizer_path}}{% endif %} {% if config.params.extra.args is not none %} {{config.params.extra.args}} {% endif %} {% if config.params.extra.judge_support and config.params.extra.judge.url is not none %} --judge_model={{config.params.extra.judge.model_id}} --judge_server_address={{config.params.extra.judge.url}} --judge_server_type=openai {% if config.params.extra.judge.generation_type is not none %} --judge_generation_type={{config.params.extra.judge.generation_type}} {% endif %} --extra_judge_args="++server.api_key_env_var={% if config.params.extra.judge.api_key is not none %}{{config.params.extra.judge.api_key}}{% else %}DUMMY_API_KEY{% endif %} {%- if config.params.extra.judge.temperature is not none %} ++inference.temperature={{config.params.extra.judge.temperature}}{% endif %} {%- if config.params.extra.judge.top_p is not none %} 
++inference.top_p={{config.params.extra.judge.top_p}}{% endif %} {%- if config.params.extra.judge.max_new_tokens is not none %} ++inference.tokens_to_generate={{config.params.extra.judge.max_new_tokens}}{% endif %} {%- if config.params.extra.judge.parallelism is not none %} ++max_concurrent_requests={{config.params.extra.judge.parallelism}}{% endif %} {%- if config.params.extra.judge.args is not none %} {{config.params.extra.judge.args}}{% endif %}" {% endif %} {% if config.params.extra.use_sandbox %} ; EXIT_CODE=$? ; kill $SANDBOX_PID 2>/dev/null || true ; exit $EXIT_CODE{% endif %}
framework_name: nemo_skills
pkg_name: nemo_skills
config:
  params:
    parallelism: 16
    task: bfcl_v4
    extra:
      use_sandbox: false
      num_repeats: null
      prompt_config: null
      args: ++use_client_parsing=False
      system_message: null
      dataset_split: null
      judge_support: false
      judge:
        url: null
        model_id: null
        api_key: null
        generation_type: null
        random_seed: 1234
        temperature: null
        top_p: null
        max_new_tokens: null
        args: null
        parallelism: null
      ruler:
        data_dir: null
        cluster: null
        setup: null
        max_seq_length: null
        tokenizer_path: null
        template_tokens: null
        num_samples: null
        tasks: null
  supported_endpoint_types:
  - chat
  type: ns_bfcl_v4
target: {}

## ns_gpqa

GPQA Diamond

Harness: nemo_skills

Container:

nvcr.io/nvidia/eval-factory/nemo-skills:26.01

Container Digest:

sha256:43e2c4d6e197744f7fd0a874d06c5600a8b46b54e16d333c0ebf057b6d54635a

Container Arch: multiarch

Task Type: ns_gpqa

cd /nemo_run/code && {% if config.params.extra.use_sandbox %}python -m nemo_skills.code_execution.local_sandbox.local_sandbox_server > {{config.output_dir}}/sandbox.log 2>&1 & SANDBOX_PID=$! && sleep 3 && {% endif %}{% if not config.params.task.startswith('ruler') %} ns prepare_data {{config.params.task}} {% else %} mkdir -p {{config.params.extra.ruler.data_dir}} && ln -sf {{config.params.extra.ruler.data_dir}} /nemo_run/code/ruler_data &&  ns prepare_data ruler --data_dir={{config.params.extra.ruler.data_dir}} --cluster={{config.params.extra.ruler.cluster}} --setup={{config.params.extra.ruler.setup}} --max_seq_length={{config.params.extra.ruler.max_seq_length}} --tokenizer_path={{config.params.extra.ruler.tokenizer_path}} {% if config.params.extra.ruler.template_tokens is not none %}--template_tokens={{config.params.extra.ruler.template_tokens}}{% endif %} {% if config.params.extra.ruler.num_samples is not none %}--num_samples={{config.params.extra.ruler.num_samples}}{% elif config.params.limit_samples is not none %}--num_samples={{config.params.limit_samples}}{% endif %} {% if config.params.extra.ruler.tasks is not none %}--tasks {% for task in config.params.extra.ruler.tasks %}{{task}}{% if not loop.last %} {% endif %}{% endfor %}{% endif %} {% endif %} && ns eval --server_type=openai --model={{target.api_endpoint.model_id}} --server_address={{target.api_endpoint.url}} --benchmarks={{config.params.task}}{% if config.params.extra.num_repeats is not none and config.params.extra.num_repeats > 1 %}:{{config.params.extra.num_repeats}}{% endif %} --output_dir={{config.output_dir}} {% if config.params.extra.dataset_split is not none %}--split={{config.params.extra.dataset_split}}{% endif %} {% if config.params.extra.ruler.data_dir is not none %}--data_dir={{config.params.extra.ruler.data_dir}}{% endif %} ++server.api_key_env_var={% if target.api_endpoint.api_key_name is not none %}{{target.api_endpoint.api_key_name}}{% else %}DUMMY_API_KEY{% endif %} {% if 
config.params.max_new_tokens is not none %}++inference.tokens_to_generate={{config.params.max_new_tokens}}{% endif %} {% if config.params.extra.system_message is not none %} ++system_message='{{config.params.extra.system_message}}' {% endif %} {% if config.params.limit_samples is not none %}++max_samples={{config.params.limit_samples}}{% endif %} {% if config.params.parallelism is not none %}{% if config.params.extra.num_repeats is not none and config.params.extra.num_repeats > 1 %}++max_concurrent_requests={{[(config.params.parallelism / config.params.extra.num_repeats) | int, 1] | max}}{% else %}++max_concurrent_requests={{config.params.parallelism | int}}{% endif %}{% endif %} {% if config.params.temperature is not none %}++inference.temperature={{config.params.temperature}}{% endif %} {% if config.params.top_p is not none %}++inference.top_p={{config.params.top_p}}{% endif %} {% if config.params.extra.prompt_config is not none %}++prompt_config={{config.params.extra.prompt_config}}{% endif %} {% if config.params.extra.ruler.tokenizer_path is not none %}++tokenizer={{config.params.extra.ruler.tokenizer_path}}{% endif %} {% if config.params.extra.args is not none %} {{config.params.extra.args}} {% endif %} {% if config.params.extra.judge_support and config.params.extra.judge.url is not none %} --judge_model={{config.params.extra.judge.model_id}} --judge_server_address={{config.params.extra.judge.url}} --judge_server_type=openai {% if config.params.extra.judge.generation_type is not none %} --judge_generation_type={{config.params.extra.judge.generation_type}} {% endif %} --extra_judge_args="++server.api_key_env_var={% if config.params.extra.judge.api_key is not none %}{{config.params.extra.judge.api_key}}{% else %}DUMMY_API_KEY{% endif %} {%- if config.params.extra.judge.temperature is not none %} ++inference.temperature={{config.params.extra.judge.temperature}}{% endif %} {%- if config.params.extra.judge.top_p is not none %} 
++inference.top_p={{config.params.extra.judge.top_p}}{% endif %} {%- if config.params.extra.judge.max_new_tokens is not none %} ++inference.tokens_to_generate={{config.params.extra.judge.max_new_tokens}}{% endif %} {%- if config.params.extra.judge.parallelism is not none %} ++max_concurrent_requests={{config.params.extra.judge.parallelism}}{% endif %} {%- if config.params.extra.judge.args is not none %} {{config.params.extra.judge.args}}{% endif %}" {% endif %} {% if config.params.extra.use_sandbox %} ; EXIT_CODE=$? ; kill $SANDBOX_PID 2>/dev/null || true ; exit $EXIT_CODE{% endif %}
framework_name: nemo_skills
pkg_name: nemo_skills
config:
  params:
    # Request concurrency; rendered as ++max_concurrent_requests in the
    # command above (divided by extra.num_repeats when num_repeats > 1).
    parallelism: 16
    # nemo_skills benchmark name passed to `ns prepare_data` and
    # `ns eval --benchmarks=...`.
    task: gpqa
    extra:
      # When true, a local code-execution sandbox server is started before
      # the eval and killed afterwards (see the sandbox branches above).
      use_sandbox: false
      # Repeated generations per sample; appended to --benchmarks as
      # `<task>:<n>` only when set and > 1.
      num_repeats: null
      # Forwarded as ++prompt_config when set.
      prompt_config: null
      # Free-form extra CLI arguments appended to the `ns eval` command.
      args: null
      # Forwarded as ++system_message when set.
      system_message: null
      # Forwarded as --split when set.
      dataset_split: null
      # Master switch for the judge flags below; the judge options are only
      # rendered when this is true AND judge.url is set.
      judge_support: false
      # LLM-as-judge settings, rendered into --judge_model,
      # --judge_server_address, and --extra_judge_args above.
      judge:
        url: null
        model_id: null
        # Name of the env var holding the judge API key; rendered into
        # ++server.api_key_env_var, falling back to DUMMY_API_KEY when null.
        api_key: null
        # Forwarded as --judge_generation_type when set.
        generation_type: null
        # NOTE(review): not referenced anywhere in the command template
        # above — confirm whether this seed is consumed elsewhere.
        random_seed: 1234
        temperature: null
        top_p: null
        max_new_tokens: null
        # Extra args appended inside --extra_judge_args when set.
        args: null
        parallelism: null
      # RULER data-preparation settings; only used when the task name
      # starts with 'ruler' (see the prepare_data branch above).
      ruler:
        data_dir: null
        cluster: null
        setup: null
        max_seq_length: null
        tokenizer_path: null
        template_tokens: null
        num_samples: null
        tasks: null
  supported_endpoint_types:
  - chat
  type: ns_gpqa
target: {}

ns_hle#

HumanityLastExam

Harness: nemo_skills

Container:

nvcr.io/nvidia/eval-factory/nemo-skills:26.01

Container Digest:

sha256:43e2c4d6e197744f7fd0a874d06c5600a8b46b54e16d333c0ebf057b6d54635a

Container Arch: multiarch

Task Type: ns_hle

cd /nemo_run/code && {% if config.params.extra.use_sandbox %}python -m nemo_skills.code_execution.local_sandbox.local_sandbox_server > {{config.output_dir}}/sandbox.log 2>&1 & SANDBOX_PID=$! && sleep 3 && {% endif %}{% if not config.params.task.startswith('ruler') %} ns prepare_data {{config.params.task}} {% else %} mkdir -p {{config.params.extra.ruler.data_dir}} && ln -sf {{config.params.extra.ruler.data_dir}} /nemo_run/code/ruler_data &&  ns prepare_data ruler --data_dir={{config.params.extra.ruler.data_dir}} --cluster={{config.params.extra.ruler.cluster}} --setup={{config.params.extra.ruler.setup}} --max_seq_length={{config.params.extra.ruler.max_seq_length}} --tokenizer_path={{config.params.extra.ruler.tokenizer_path}} {% if config.params.extra.ruler.template_tokens is not none %}--template_tokens={{config.params.extra.ruler.template_tokens}}{% endif %} {% if config.params.extra.ruler.num_samples is not none %}--num_samples={{config.params.extra.ruler.num_samples}}{% elif config.params.limit_samples is not none %}--num_samples={{config.params.limit_samples}}{% endif %} {% if config.params.extra.ruler.tasks is not none %}--tasks {% for task in config.params.extra.ruler.tasks %}{{task}}{% if not loop.last %} {% endif %}{% endfor %}{% endif %} {% endif %} && ns eval --server_type=openai --model={{target.api_endpoint.model_id}} --server_address={{target.api_endpoint.url}} --benchmarks={{config.params.task}}{% if config.params.extra.num_repeats is not none and config.params.extra.num_repeats > 1 %}:{{config.params.extra.num_repeats}}{% endif %} --output_dir={{config.output_dir}} {% if config.params.extra.dataset_split is not none %}--split={{config.params.extra.dataset_split}}{% endif %} {% if config.params.extra.ruler.data_dir is not none %}--data_dir={{config.params.extra.ruler.data_dir}}{% endif %} ++server.api_key_env_var={% if target.api_endpoint.api_key_name is not none %}{{target.api_endpoint.api_key_name}}{% else %}DUMMY_API_KEY{% endif %} {% if 
config.params.max_new_tokens is not none %}++inference.tokens_to_generate={{config.params.max_new_tokens}}{% endif %} {% if config.params.extra.system_message is not none %} ++system_message='{{config.params.extra.system_message}}' {% endif %} {% if config.params.limit_samples is not none %}++max_samples={{config.params.limit_samples}}{% endif %} {% if config.params.parallelism is not none %}{% if config.params.extra.num_repeats is not none and config.params.extra.num_repeats > 1 %}++max_concurrent_requests={{[(config.params.parallelism / config.params.extra.num_repeats) | int, 1] | max}}{% else %}++max_concurrent_requests={{config.params.parallelism | int}}{% endif %}{% endif %} {% if config.params.temperature is not none %}++inference.temperature={{config.params.temperature}}{% endif %} {% if config.params.top_p is not none %}++inference.top_p={{config.params.top_p}}{% endif %} {% if config.params.extra.prompt_config is not none %}++prompt_config={{config.params.extra.prompt_config}}{% endif %} {% if config.params.extra.ruler.tokenizer_path is not none %}++tokenizer={{config.params.extra.ruler.tokenizer_path}}{% endif %} {% if config.params.extra.args is not none %} {{config.params.extra.args}} {% endif %} {% if config.params.extra.judge_support and config.params.extra.judge.url is not none %} --judge_model={{config.params.extra.judge.model_id}} --judge_server_address={{config.params.extra.judge.url}} --judge_server_type=openai {% if config.params.extra.judge.generation_type is not none %} --judge_generation_type={{config.params.extra.judge.generation_type}} {% endif %} --extra_judge_args="++server.api_key_env_var={% if config.params.extra.judge.api_key is not none %}{{config.params.extra.judge.api_key}}{% else %}DUMMY_API_KEY{% endif %} {%- if config.params.extra.judge.temperature is not none %} ++inference.temperature={{config.params.extra.judge.temperature}}{% endif %} {%- if config.params.extra.judge.top_p is not none %} 
++inference.top_p={{config.params.extra.judge.top_p}}{% endif %} {%- if config.params.extra.judge.max_new_tokens is not none %} ++inference.tokens_to_generate={{config.params.extra.judge.max_new_tokens}}{% endif %} {%- if config.params.extra.judge.parallelism is not none %} ++max_concurrent_requests={{config.params.extra.judge.parallelism}}{% endif %} {%- if config.params.extra.judge.args is not none %} {{config.params.extra.judge.args}}{% endif %}" {% endif %} {% if config.params.extra.use_sandbox %} ; EXIT_CODE=$? ; kill $SANDBOX_PID 2>/dev/null || true ; exit $EXIT_CODE{% endif %}
framework_name: nemo_skills
pkg_name: nemo_skills
config:
  params:
    # Request concurrency; rendered as ++max_concurrent_requests in the
    # command above (divided by extra.num_repeats when num_repeats > 1).
    parallelism: 16
    # nemo_skills benchmark name passed to `ns prepare_data` and
    # `ns eval --benchmarks=...`.
    task: hle
    extra:
      # When true, a local code-execution sandbox server is started before
      # the eval and killed afterwards.
      use_sandbox: false
      # Repeated generations per sample; appended to --benchmarks as
      # `<task>:<n>` only when set and > 1.
      num_repeats: null
      # Forwarded as ++prompt_config when set.
      prompt_config: null
      # Free-form extra CLI arguments appended to the `ns eval` command.
      args: null
      # Forwarded as ++system_message when set.
      system_message: null
      # Forwarded as --split when set.
      dataset_split: null
      # Judge flags below are only rendered when this is true AND
      # judge.url is set; disabled by default for this task.
      judge_support: false
      # LLM-as-judge settings, rendered into --judge_model,
      # --judge_server_address, and --extra_judge_args above.
      judge:
        url: null
        model_id: null
        # Env var name for the judge API key (++server.api_key_env_var);
        # DUMMY_API_KEY is used when null.
        api_key: null
        generation_type: null
        # NOTE(review): not referenced in the command template above —
        # confirm whether this seed is consumed elsewhere.
        random_seed: 1234
        temperature: null
        top_p: null
        max_new_tokens: null
        args: null
        parallelism: null
      # RULER data-preparation settings; only used for tasks whose name
      # starts with 'ruler'.
      ruler:
        data_dir: null
        cluster: null
        setup: null
        max_seq_length: null
        tokenizer_path: null
        template_tokens: null
        num_samples: null
        tasks: null
  supported_endpoint_types:
  - chat
  type: ns_hle
target: {}

ns_hle_aa#

HumanityLastExam aligned with AA

Harness: nemo_skills

Container:

nvcr.io/nvidia/eval-factory/nemo-skills:26.01

Container Digest:

sha256:43e2c4d6e197744f7fd0a874d06c5600a8b46b54e16d333c0ebf057b6d54635a

Container Arch: multiarch

Task Type: ns_hle_aa

cd /nemo_run/code && {% if config.params.extra.use_sandbox %}python -m nemo_skills.code_execution.local_sandbox.local_sandbox_server > {{config.output_dir}}/sandbox.log 2>&1 & SANDBOX_PID=$! && sleep 3 && {% endif %}{% if not config.params.task.startswith('ruler') %} ns prepare_data {{config.params.task}} {% else %} mkdir -p {{config.params.extra.ruler.data_dir}} && ln -sf {{config.params.extra.ruler.data_dir}} /nemo_run/code/ruler_data &&  ns prepare_data ruler --data_dir={{config.params.extra.ruler.data_dir}} --cluster={{config.params.extra.ruler.cluster}} --setup={{config.params.extra.ruler.setup}} --max_seq_length={{config.params.extra.ruler.max_seq_length}} --tokenizer_path={{config.params.extra.ruler.tokenizer_path}} {% if config.params.extra.ruler.template_tokens is not none %}--template_tokens={{config.params.extra.ruler.template_tokens}}{% endif %} {% if config.params.extra.ruler.num_samples is not none %}--num_samples={{config.params.extra.ruler.num_samples}}{% elif config.params.limit_samples is not none %}--num_samples={{config.params.limit_samples}}{% endif %} {% if config.params.extra.ruler.tasks is not none %}--tasks {% for task in config.params.extra.ruler.tasks %}{{task}}{% if not loop.last %} {% endif %}{% endfor %}{% endif %} {% endif %} && ns eval --server_type=openai --model={{target.api_endpoint.model_id}} --server_address={{target.api_endpoint.url}} --benchmarks={{config.params.task}}{% if config.params.extra.num_repeats is not none and config.params.extra.num_repeats > 1 %}:{{config.params.extra.num_repeats}}{% endif %} --output_dir={{config.output_dir}} {% if config.params.extra.dataset_split is not none %}--split={{config.params.extra.dataset_split}}{% endif %} {% if config.params.extra.ruler.data_dir is not none %}--data_dir={{config.params.extra.ruler.data_dir}}{% endif %} ++server.api_key_env_var={% if target.api_endpoint.api_key_name is not none %}{{target.api_endpoint.api_key_name}}{% else %}DUMMY_API_KEY{% endif %} {% if 
config.params.max_new_tokens is not none %}++inference.tokens_to_generate={{config.params.max_new_tokens}}{% endif %} {% if config.params.extra.system_message is not none %} ++system_message='{{config.params.extra.system_message}}' {% endif %} {% if config.params.limit_samples is not none %}++max_samples={{config.params.limit_samples}}{% endif %} {% if config.params.parallelism is not none %}{% if config.params.extra.num_repeats is not none and config.params.extra.num_repeats > 1 %}++max_concurrent_requests={{[(config.params.parallelism / config.params.extra.num_repeats) | int, 1] | max}}{% else %}++max_concurrent_requests={{config.params.parallelism | int}}{% endif %}{% endif %} {% if config.params.temperature is not none %}++inference.temperature={{config.params.temperature}}{% endif %} {% if config.params.top_p is not none %}++inference.top_p={{config.params.top_p}}{% endif %} {% if config.params.extra.prompt_config is not none %}++prompt_config={{config.params.extra.prompt_config}}{% endif %} {% if config.params.extra.ruler.tokenizer_path is not none %}++tokenizer={{config.params.extra.ruler.tokenizer_path}}{% endif %} {% if config.params.extra.args is not none %} {{config.params.extra.args}} {% endif %} {% if config.params.extra.judge_support and config.params.extra.judge.url is not none %} --judge_model={{config.params.extra.judge.model_id}} --judge_server_address={{config.params.extra.judge.url}} --judge_server_type=openai {% if config.params.extra.judge.generation_type is not none %} --judge_generation_type={{config.params.extra.judge.generation_type}} {% endif %} --extra_judge_args="++server.api_key_env_var={% if config.params.extra.judge.api_key is not none %}{{config.params.extra.judge.api_key}}{% else %}DUMMY_API_KEY{% endif %} {%- if config.params.extra.judge.temperature is not none %} ++inference.temperature={{config.params.extra.judge.temperature}}{% endif %} {%- if config.params.extra.judge.top_p is not none %} 
++inference.top_p={{config.params.extra.judge.top_p}}{% endif %} {%- if config.params.extra.judge.max_new_tokens is not none %} ++inference.tokens_to_generate={{config.params.extra.judge.max_new_tokens}}{% endif %} {%- if config.params.extra.judge.parallelism is not none %} ++max_concurrent_requests={{config.params.extra.judge.parallelism}}{% endif %} {%- if config.params.extra.judge.args is not none %} {{config.params.extra.judge.args}}{% endif %}" {% endif %} {% if config.params.extra.use_sandbox %} ; EXIT_CODE=$? ; kill $SANDBOX_PID 2>/dev/null || true ; exit $EXIT_CODE{% endif %}
framework_name: nemo_skills
pkg_name: nemo_skills
config:
  params:
    # Request concurrency; rendered as ++max_concurrent_requests in the
    # command above (divided by extra.num_repeats when num_repeats > 1).
    parallelism: 16
    # Same underlying benchmark as ns_hle; this task variant differs only
    # in the defaults below (judge enabled, num_repeats pinned).
    task: hle
    extra:
      use_sandbox: false
      # Pinned to 1; the `<task>:<n>` suffix is only appended when > 1,
      # so this behaves like a single pass.
      num_repeats: 1
      prompt_config: null
      args: null
      system_message: null
      dataset_split: null
      # Judge is enabled by default for this AA-aligned variant; the
      # judge flags render because judge.url below is also set.
      judge_support: true
      # LLM-as-judge settings, rendered into --judge_model,
      # --judge_server_address, and --extra_judge_args above.
      judge:
        url: https://inference-api.nvidia.com/v1
        model_id: us/azure/openai/gpt-4.1
        # Env var name for the judge API key (++server.api_key_env_var);
        # DUMMY_API_KEY is used when null.
        api_key: null
        generation_type: null
        # NOTE(review): not referenced in the command template above —
        # confirm whether this seed is consumed elsewhere.
        random_seed: 1234
        temperature: null
        top_p: null
        max_new_tokens: null
        args: null
        parallelism: null
      # RULER data-preparation settings; unused for this task.
      ruler:
        data_dir: null
        cluster: null
        setup: null
        max_seq_length: null
        tokenizer_path: null
        template_tokens: null
        num_samples: null
        tasks: null
  supported_endpoint_types:
  - chat
  type: ns_hle_aa
target: {}

ns_hmmt_feb2025#

HMMT February 2025 (MathArena/hmmt_feb_2025)

Harness: nemo_skills

Container:

nvcr.io/nvidia/eval-factory/nemo-skills:26.01

Container Digest:

sha256:43e2c4d6e197744f7fd0a874d06c5600a8b46b54e16d333c0ebf057b6d54635a

Container Arch: multiarch

Task Type: ns_hmmt_feb2025

cd /nemo_run/code && {% if config.params.extra.use_sandbox %}python -m nemo_skills.code_execution.local_sandbox.local_sandbox_server > {{config.output_dir}}/sandbox.log 2>&1 & SANDBOX_PID=$! && sleep 3 && {% endif %}{% if not config.params.task.startswith('ruler') %} ns prepare_data {{config.params.task}} {% else %} mkdir -p {{config.params.extra.ruler.data_dir}} && ln -sf {{config.params.extra.ruler.data_dir}} /nemo_run/code/ruler_data &&  ns prepare_data ruler --data_dir={{config.params.extra.ruler.data_dir}} --cluster={{config.params.extra.ruler.cluster}} --setup={{config.params.extra.ruler.setup}} --max_seq_length={{config.params.extra.ruler.max_seq_length}} --tokenizer_path={{config.params.extra.ruler.tokenizer_path}} {% if config.params.extra.ruler.template_tokens is not none %}--template_tokens={{config.params.extra.ruler.template_tokens}}{% endif %} {% if config.params.extra.ruler.num_samples is not none %}--num_samples={{config.params.extra.ruler.num_samples}}{% elif config.params.limit_samples is not none %}--num_samples={{config.params.limit_samples}}{% endif %} {% if config.params.extra.ruler.tasks is not none %}--tasks {% for task in config.params.extra.ruler.tasks %}{{task}}{% if not loop.last %} {% endif %}{% endfor %}{% endif %} {% endif %} && ns eval --server_type=openai --model={{target.api_endpoint.model_id}} --server_address={{target.api_endpoint.url}} --benchmarks={{config.params.task}}{% if config.params.extra.num_repeats is not none and config.params.extra.num_repeats > 1 %}:{{config.params.extra.num_repeats}}{% endif %} --output_dir={{config.output_dir}} {% if config.params.extra.dataset_split is not none %}--split={{config.params.extra.dataset_split}}{% endif %} {% if config.params.extra.ruler.data_dir is not none %}--data_dir={{config.params.extra.ruler.data_dir}}{% endif %} ++server.api_key_env_var={% if target.api_endpoint.api_key_name is not none %}{{target.api_endpoint.api_key_name}}{% else %}DUMMY_API_KEY{% endif %} {% if 
config.params.max_new_tokens is not none %}++inference.tokens_to_generate={{config.params.max_new_tokens}}{% endif %} {% if config.params.extra.system_message is not none %} ++system_message='{{config.params.extra.system_message}}' {% endif %} {% if config.params.limit_samples is not none %}++max_samples={{config.params.limit_samples}}{% endif %} {% if config.params.parallelism is not none %}{% if config.params.extra.num_repeats is not none and config.params.extra.num_repeats > 1 %}++max_concurrent_requests={{[(config.params.parallelism / config.params.extra.num_repeats) | int, 1] | max}}{% else %}++max_concurrent_requests={{config.params.parallelism | int}}{% endif %}{% endif %} {% if config.params.temperature is not none %}++inference.temperature={{config.params.temperature}}{% endif %} {% if config.params.top_p is not none %}++inference.top_p={{config.params.top_p}}{% endif %} {% if config.params.extra.prompt_config is not none %}++prompt_config={{config.params.extra.prompt_config}}{% endif %} {% if config.params.extra.ruler.tokenizer_path is not none %}++tokenizer={{config.params.extra.ruler.tokenizer_path}}{% endif %} {% if config.params.extra.args is not none %} {{config.params.extra.args}} {% endif %} {% if config.params.extra.judge_support and config.params.extra.judge.url is not none %} --judge_model={{config.params.extra.judge.model_id}} --judge_server_address={{config.params.extra.judge.url}} --judge_server_type=openai {% if config.params.extra.judge.generation_type is not none %} --judge_generation_type={{config.params.extra.judge.generation_type}} {% endif %} --extra_judge_args="++server.api_key_env_var={% if config.params.extra.judge.api_key is not none %}{{config.params.extra.judge.api_key}}{% else %}DUMMY_API_KEY{% endif %} {%- if config.params.extra.judge.temperature is not none %} ++inference.temperature={{config.params.extra.judge.temperature}}{% endif %} {%- if config.params.extra.judge.top_p is not none %} 
++inference.top_p={{config.params.extra.judge.top_p}}{% endif %} {%- if config.params.extra.judge.max_new_tokens is not none %} ++inference.tokens_to_generate={{config.params.extra.judge.max_new_tokens}}{% endif %} {%- if config.params.extra.judge.parallelism is not none %} ++max_concurrent_requests={{config.params.extra.judge.parallelism}}{% endif %} {%- if config.params.extra.judge.args is not none %} {{config.params.extra.judge.args}}{% endif %}" {% endif %} {% if config.params.extra.use_sandbox %} ; EXIT_CODE=$? ; kill $SANDBOX_PID 2>/dev/null || true ; exit $EXIT_CODE{% endif %}
framework_name: nemo_skills
pkg_name: nemo_skills
config:
  params:
    # Request concurrency; rendered as ++max_concurrent_requests in the
    # command above (divided by extra.num_repeats when num_repeats > 1).
    parallelism: 16
    # nemo_skills benchmark name passed to `ns prepare_data` and
    # `ns eval --benchmarks=...`.
    task: hmmt_feb25
    extra:
      use_sandbox: false
      # Appended to --benchmarks as `<task>:<n>` only when set and > 1.
      num_repeats: null
      prompt_config: null
      args: null
      system_message: null
      dataset_split: null
      # judge_support is true, but the command template only renders the
      # judge flags when judge.url is also set — so a judge endpoint must
      # be supplied at runtime for judging to take effect.
      judge_support: true
      judge:
        url: null
        model_id: null
        # Env var name for the judge API key (++server.api_key_env_var);
        # DUMMY_API_KEY is used when null.
        api_key: null
        # Forwarded as --judge_generation_type; math answers are graded
        # with the math_judge generation type.
        generation_type: math_judge
        # NOTE(review): not referenced in the command template above —
        # confirm whether this seed is consumed elsewhere.
        random_seed: 1234
        temperature: null
        top_p: null
        max_new_tokens: null
        args: null
        parallelism: null
      # RULER data-preparation settings; unused for this task.
      ruler:
        data_dir: null
        cluster: null
        setup: null
        max_seq_length: null
        tokenizer_path: null
        template_tokens: null
        num_samples: null
        tasks: null
  supported_endpoint_types:
  - chat
  type: ns_hmmt_feb2025
target: {}

ns_ifbench#

IFBench - Instruction Following Benchmark

Harness: nemo_skills

Container:

nvcr.io/nvidia/eval-factory/nemo-skills:26.01

Container Digest:

sha256:43e2c4d6e197744f7fd0a874d06c5600a8b46b54e16d333c0ebf057b6d54635a

Container Arch: multiarch

Task Type: ns_ifbench

cd /nemo_run/code && {% if config.params.extra.use_sandbox %}python -m nemo_skills.code_execution.local_sandbox.local_sandbox_server > {{config.output_dir}}/sandbox.log 2>&1 & SANDBOX_PID=$! && sleep 3 && {% endif %}{% if not config.params.task.startswith('ruler') %} ns prepare_data {{config.params.task}} {% else %} mkdir -p {{config.params.extra.ruler.data_dir}} && ln -sf {{config.params.extra.ruler.data_dir}} /nemo_run/code/ruler_data &&  ns prepare_data ruler --data_dir={{config.params.extra.ruler.data_dir}} --cluster={{config.params.extra.ruler.cluster}} --setup={{config.params.extra.ruler.setup}} --max_seq_length={{config.params.extra.ruler.max_seq_length}} --tokenizer_path={{config.params.extra.ruler.tokenizer_path}} {% if config.params.extra.ruler.template_tokens is not none %}--template_tokens={{config.params.extra.ruler.template_tokens}}{% endif %} {% if config.params.extra.ruler.num_samples is not none %}--num_samples={{config.params.extra.ruler.num_samples}}{% elif config.params.limit_samples is not none %}--num_samples={{config.params.limit_samples}}{% endif %} {% if config.params.extra.ruler.tasks is not none %}--tasks {% for task in config.params.extra.ruler.tasks %}{{task}}{% if not loop.last %} {% endif %}{% endfor %}{% endif %} {% endif %} && ns eval --server_type=openai --model={{target.api_endpoint.model_id}} --server_address={{target.api_endpoint.url}} --benchmarks={{config.params.task}}{% if config.params.extra.num_repeats is not none and config.params.extra.num_repeats > 1 %}:{{config.params.extra.num_repeats}}{% endif %} --output_dir={{config.output_dir}} {% if config.params.extra.dataset_split is not none %}--split={{config.params.extra.dataset_split}}{% endif %} {% if config.params.extra.ruler.data_dir is not none %}--data_dir={{config.params.extra.ruler.data_dir}}{% endif %} ++server.api_key_env_var={% if target.api_endpoint.api_key_name is not none %}{{target.api_endpoint.api_key_name}}{% else %}DUMMY_API_KEY{% endif %} {% if 
config.params.max_new_tokens is not none %}++inference.tokens_to_generate={{config.params.max_new_tokens}}{% endif %} {% if config.params.extra.system_message is not none %} ++system_message='{{config.params.extra.system_message}}' {% endif %} {% if config.params.limit_samples is not none %}++max_samples={{config.params.limit_samples}}{% endif %} {% if config.params.parallelism is not none %}{% if config.params.extra.num_repeats is not none and config.params.extra.num_repeats > 1 %}++max_concurrent_requests={{[(config.params.parallelism / config.params.extra.num_repeats) | int, 1] | max}}{% else %}++max_concurrent_requests={{config.params.parallelism | int}}{% endif %}{% endif %} {% if config.params.temperature is not none %}++inference.temperature={{config.params.temperature}}{% endif %} {% if config.params.top_p is not none %}++inference.top_p={{config.params.top_p}}{% endif %} {% if config.params.extra.prompt_config is not none %}++prompt_config={{config.params.extra.prompt_config}}{% endif %} {% if config.params.extra.ruler.tokenizer_path is not none %}++tokenizer={{config.params.extra.ruler.tokenizer_path}}{% endif %} {% if config.params.extra.args is not none %} {{config.params.extra.args}} {% endif %} {% if config.params.extra.judge_support and config.params.extra.judge.url is not none %} --judge_model={{config.params.extra.judge.model_id}} --judge_server_address={{config.params.extra.judge.url}} --judge_server_type=openai {% if config.params.extra.judge.generation_type is not none %} --judge_generation_type={{config.params.extra.judge.generation_type}} {% endif %} --extra_judge_args="++server.api_key_env_var={% if config.params.extra.judge.api_key is not none %}{{config.params.extra.judge.api_key}}{% else %}DUMMY_API_KEY{% endif %} {%- if config.params.extra.judge.temperature is not none %} ++inference.temperature={{config.params.extra.judge.temperature}}{% endif %} {%- if config.params.extra.judge.top_p is not none %} 
++inference.top_p={{config.params.extra.judge.top_p}}{% endif %} {%- if config.params.extra.judge.max_new_tokens is not none %} ++inference.tokens_to_generate={{config.params.extra.judge.max_new_tokens}}{% endif %} {%- if config.params.extra.judge.parallelism is not none %} ++max_concurrent_requests={{config.params.extra.judge.parallelism}}{% endif %} {%- if config.params.extra.judge.args is not none %} {{config.params.extra.judge.args}}{% endif %}" {% endif %} {% if config.params.extra.use_sandbox %} ; EXIT_CODE=$? ; kill $SANDBOX_PID 2>/dev/null || true ; exit $EXIT_CODE{% endif %}
framework_name: nemo_skills
pkg_name: nemo_skills
config:
  params:
    # Request concurrency; rendered as ++max_concurrent_requests in the
    # command above (divided by extra.num_repeats when num_repeats > 1).
    parallelism: 16
    # nemo_skills benchmark name passed to `ns prepare_data` and
    # `ns eval --benchmarks=...`.
    task: ifbench
    extra:
      # When true, a local code-execution sandbox server is started before
      # the eval and killed afterwards.
      use_sandbox: false
      # Appended to --benchmarks as `<task>:<n>` only when set and > 1.
      num_repeats: null
      # Forwarded as ++prompt_config when set.
      prompt_config: null
      # Free-form extra CLI arguments appended to the `ns eval` command.
      args: null
      # Forwarded as ++system_message when set.
      system_message: null
      # Forwarded as --split when set.
      dataset_split: null
      # Judge flags below are only rendered when this is true AND
      # judge.url is set; no judge is used for this task by default.
      judge_support: false
      judge:
        url: null
        model_id: null
        # Env var name for the judge API key (++server.api_key_env_var);
        # DUMMY_API_KEY is used when null.
        api_key: null
        generation_type: null
        # NOTE(review): not referenced in the command template above —
        # confirm whether this seed is consumed elsewhere.
        random_seed: 1234
        temperature: null
        top_p: null
        max_new_tokens: null
        args: null
        parallelism: null
      # RULER data-preparation settings; only used for tasks whose name
      # starts with 'ruler'.
      ruler:
        data_dir: null
        cluster: null
        setup: null
        max_seq_length: null
        tokenizer_path: null
        template_tokens: null
        num_samples: null
        tasks: null
  supported_endpoint_types:
  - chat
  type: ns_ifbench
target: {}

ns_ifeval#

IFEval - Instruction-Following Evaluation for Large Language Models

Harness: nemo_skills

Container:

nvcr.io/nvidia/eval-factory/nemo-skills:26.01

Container Digest:

sha256:43e2c4d6e197744f7fd0a874d06c5600a8b46b54e16d333c0ebf057b6d54635a

Container Arch: multiarch

Task Type: ns_ifeval

cd /nemo_run/code && {% if config.params.extra.use_sandbox %}python -m nemo_skills.code_execution.local_sandbox.local_sandbox_server > {{config.output_dir}}/sandbox.log 2>&1 & SANDBOX_PID=$! && sleep 3 && {% endif %}{% if not config.params.task.startswith('ruler') %} ns prepare_data {{config.params.task}} {% else %} mkdir -p {{config.params.extra.ruler.data_dir}} && ln -sf {{config.params.extra.ruler.data_dir}} /nemo_run/code/ruler_data &&  ns prepare_data ruler --data_dir={{config.params.extra.ruler.data_dir}} --cluster={{config.params.extra.ruler.cluster}} --setup={{config.params.extra.ruler.setup}} --max_seq_length={{config.params.extra.ruler.max_seq_length}} --tokenizer_path={{config.params.extra.ruler.tokenizer_path}} {% if config.params.extra.ruler.template_tokens is not none %}--template_tokens={{config.params.extra.ruler.template_tokens}}{% endif %} {% if config.params.extra.ruler.num_samples is not none %}--num_samples={{config.params.extra.ruler.num_samples}}{% elif config.params.limit_samples is not none %}--num_samples={{config.params.limit_samples}}{% endif %} {% if config.params.extra.ruler.tasks is not none %}--tasks {% for task in config.params.extra.ruler.tasks %}{{task}}{% if not loop.last %} {% endif %}{% endfor %}{% endif %} {% endif %} && ns eval --server_type=openai --model={{target.api_endpoint.model_id}} --server_address={{target.api_endpoint.url}} --benchmarks={{config.params.task}}{% if config.params.extra.num_repeats is not none and config.params.extra.num_repeats > 1 %}:{{config.params.extra.num_repeats}}{% endif %} --output_dir={{config.output_dir}} {% if config.params.extra.dataset_split is not none %}--split={{config.params.extra.dataset_split}}{% endif %} {% if config.params.extra.ruler.data_dir is not none %}--data_dir={{config.params.extra.ruler.data_dir}}{% endif %} ++server.api_key_env_var={% if target.api_endpoint.api_key_name is not none %}{{target.api_endpoint.api_key_name}}{% else %}DUMMY_API_KEY{% endif %} {% if 
config.params.max_new_tokens is not none %}++inference.tokens_to_generate={{config.params.max_new_tokens}}{% endif %} {% if config.params.extra.system_message is not none %} ++system_message='{{config.params.extra.system_message}}' {% endif %} {% if config.params.limit_samples is not none %}++max_samples={{config.params.limit_samples}}{% endif %} {% if config.params.parallelism is not none %}{% if config.params.extra.num_repeats is not none and config.params.extra.num_repeats > 1 %}++max_concurrent_requests={{[(config.params.parallelism / config.params.extra.num_repeats) | int, 1] | max}}{% else %}++max_concurrent_requests={{config.params.parallelism | int}}{% endif %}{% endif %} {% if config.params.temperature is not none %}++inference.temperature={{config.params.temperature}}{% endif %} {% if config.params.top_p is not none %}++inference.top_p={{config.params.top_p}}{% endif %} {% if config.params.extra.prompt_config is not none %}++prompt_config={{config.params.extra.prompt_config}}{% endif %} {% if config.params.extra.ruler.tokenizer_path is not none %}++tokenizer={{config.params.extra.ruler.tokenizer_path}}{% endif %} {% if config.params.extra.args is not none %} {{config.params.extra.args}} {% endif %} {% if config.params.extra.judge_support and config.params.extra.judge.url is not none %} --judge_model={{config.params.extra.judge.model_id}} --judge_server_address={{config.params.extra.judge.url}} --judge_server_type=openai {% if config.params.extra.judge.generation_type is not none %} --judge_generation_type={{config.params.extra.judge.generation_type}} {% endif %} --extra_judge_args="++server.api_key_env_var={% if config.params.extra.judge.api_key is not none %}{{config.params.extra.judge.api_key}}{% else %}DUMMY_API_KEY{% endif %} {%- if config.params.extra.judge.temperature is not none %} ++inference.temperature={{config.params.extra.judge.temperature}}{% endif %} {%- if config.params.extra.judge.top_p is not none %} 
++inference.top_p={{config.params.extra.judge.top_p}}{% endif %} {%- if config.params.extra.judge.max_new_tokens is not none %} ++inference.tokens_to_generate={{config.params.extra.judge.max_new_tokens}}{% endif %} {%- if config.params.extra.judge.parallelism is not none %} ++max_concurrent_requests={{config.params.extra.judge.parallelism}}{% endif %} {%- if config.params.extra.judge.args is not none %} {{config.params.extra.judge.args}}{% endif %}" {% endif %} {% if config.params.extra.use_sandbox %} ; EXIT_CODE=$? ; kill $SANDBOX_PID 2>/dev/null || true ; exit $EXIT_CODE{% endif %}
framework_name: nemo_skills
pkg_name: nemo_skills
config:
  params:
    # Request concurrency; rendered as ++max_concurrent_requests in the
    # command above (divided by extra.num_repeats when num_repeats > 1).
    parallelism: 16
    # nemo_skills benchmark name passed to `ns prepare_data` and
    # `ns eval --benchmarks=...`.
    task: ifeval
    extra:
      # When true, a local code-execution sandbox server is started before
      # the eval and killed afterwards.
      use_sandbox: false
      # Appended to --benchmarks as `<task>:<n>` only when set and > 1.
      num_repeats: null
      # Forwarded as ++prompt_config when set.
      prompt_config: null
      # Free-form extra CLI arguments appended to the `ns eval` command.
      args: null
      # Forwarded as ++system_message when set.
      system_message: null
      # Forwarded as --split when set.
      dataset_split: null
      # Judge flags below are only rendered when this is true AND
      # judge.url is set; no judge is used for this task by default.
      judge_support: false
      judge:
        url: null
        model_id: null
        # Env var name for the judge API key (++server.api_key_env_var);
        # DUMMY_API_KEY is used when null.
        api_key: null
        generation_type: null
        # NOTE(review): not referenced in the command template above —
        # confirm whether this seed is consumed elsewhere.
        random_seed: 1234
        temperature: null
        top_p: null
        max_new_tokens: null
        args: null
        parallelism: null
      # RULER data-preparation settings; only used for tasks whose name
      # starts with 'ruler'.
      ruler:
        data_dir: null
        cluster: null
        setup: null
        max_seq_length: null
        tokenizer_path: null
        template_tokens: null
        num_samples: null
        tasks: null
  supported_endpoint_types:
  - chat
  type: ns_ifeval
target: {}

ns_livecodebench#

LiveCodeBench v6

Harness: nemo_skills

Container:

nvcr.io/nvidia/eval-factory/nemo-skills:26.01

Container Digest:

sha256:43e2c4d6e197744f7fd0a874d06c5600a8b46b54e16d333c0ebf057b6d54635a

Container Arch: multiarch

Task Type: ns_livecodebench

cd /nemo_run/code && {% if config.params.extra.use_sandbox %}python -m nemo_skills.code_execution.local_sandbox.local_sandbox_server > {{config.output_dir}}/sandbox.log 2>&1 & SANDBOX_PID=$! && sleep 3 && {% endif %}{% if not config.params.task.startswith('ruler') %} ns prepare_data {{config.params.task}} {% else %} mkdir -p {{config.params.extra.ruler.data_dir}} && ln -sf {{config.params.extra.ruler.data_dir}} /nemo_run/code/ruler_data &&  ns prepare_data ruler --data_dir={{config.params.extra.ruler.data_dir}} --cluster={{config.params.extra.ruler.cluster}} --setup={{config.params.extra.ruler.setup}} --max_seq_length={{config.params.extra.ruler.max_seq_length}} --tokenizer_path={{config.params.extra.ruler.tokenizer_path}} {% if config.params.extra.ruler.template_tokens is not none %}--template_tokens={{config.params.extra.ruler.template_tokens}}{% endif %} {% if config.params.extra.ruler.num_samples is not none %}--num_samples={{config.params.extra.ruler.num_samples}}{% elif config.params.limit_samples is not none %}--num_samples={{config.params.limit_samples}}{% endif %} {% if config.params.extra.ruler.tasks is not none %}--tasks {% for task in config.params.extra.ruler.tasks %}{{task}}{% if not loop.last %} {% endif %}{% endfor %}{% endif %} {% endif %} && ns eval --server_type=openai --model={{target.api_endpoint.model_id}} --server_address={{target.api_endpoint.url}} --benchmarks={{config.params.task}}{% if config.params.extra.num_repeats is not none and config.params.extra.num_repeats > 1 %}:{{config.params.extra.num_repeats}}{% endif %} --output_dir={{config.output_dir}} {% if config.params.extra.dataset_split is not none %}--split={{config.params.extra.dataset_split}}{% endif %} {% if config.params.extra.ruler.data_dir is not none %}--data_dir={{config.params.extra.ruler.data_dir}}{% endif %} ++server.api_key_env_var={% if target.api_endpoint.api_key_name is not none %}{{target.api_endpoint.api_key_name}}{% else %}DUMMY_API_KEY{% endif %} {% if 
config.params.max_new_tokens is not none %}++inference.tokens_to_generate={{config.params.max_new_tokens}}{% endif %} {% if config.params.extra.system_message is not none %} ++system_message='{{config.params.extra.system_message}}' {% endif %} {% if config.params.limit_samples is not none %}++max_samples={{config.params.limit_samples}}{% endif %} {% if config.params.parallelism is not none %}{% if config.params.extra.num_repeats is not none and config.params.extra.num_repeats > 1 %}++max_concurrent_requests={{[(config.params.parallelism / config.params.extra.num_repeats) | int, 1] | max}}{% else %}++max_concurrent_requests={{config.params.parallelism | int}}{% endif %}{% endif %} {% if config.params.temperature is not none %}++inference.temperature={{config.params.temperature}}{% endif %} {% if config.params.top_p is not none %}++inference.top_p={{config.params.top_p}}{% endif %} {% if config.params.extra.prompt_config is not none %}++prompt_config={{config.params.extra.prompt_config}}{% endif %} {% if config.params.extra.ruler.tokenizer_path is not none %}++tokenizer={{config.params.extra.ruler.tokenizer_path}}{% endif %} {% if config.params.extra.args is not none %} {{config.params.extra.args}} {% endif %} {% if config.params.extra.judge_support and config.params.extra.judge.url is not none %} --judge_model={{config.params.extra.judge.model_id}} --judge_server_address={{config.params.extra.judge.url}} --judge_server_type=openai {% if config.params.extra.judge.generation_type is not none %} --judge_generation_type={{config.params.extra.judge.generation_type}} {% endif %} --extra_judge_args="++server.api_key_env_var={% if config.params.extra.judge.api_key is not none %}{{config.params.extra.judge.api_key}}{% else %}DUMMY_API_KEY{% endif %} {%- if config.params.extra.judge.temperature is not none %} ++inference.temperature={{config.params.extra.judge.temperature}}{% endif %} {%- if config.params.extra.judge.top_p is not none %} 
++inference.top_p={{config.params.extra.judge.top_p}}{% endif %} {%- if config.params.extra.judge.max_new_tokens is not none %} ++inference.tokens_to_generate={{config.params.extra.judge.max_new_tokens}}{% endif %} {%- if config.params.extra.judge.parallelism is not none %} ++max_concurrent_requests={{config.params.extra.judge.parallelism}}{% endif %} {%- if config.params.extra.judge.args is not none %} {{config.params.extra.judge.args}}{% endif %}" {% endif %} {% if config.params.extra.use_sandbox %} ; EXIT_CODE=$? ; kill $SANDBOX_PID 2>/dev/null || true ; exit $EXIT_CODE{% endif %}
# Task definition for the nemo_skills harness (type: ns_livecodebench,
# LiveCodeBench v6). Values feed the Jinja command template above: `task`
# goes to `ns prepare_data` and `ns eval --benchmarks`; `extra.*` fields
# map to optional CLI flags (null means the flag is omitted).
framework_name: nemo_skills
pkg_name: nemo_skills
config:
  params:
    # Mapped to ++max_concurrent_requests (divided by num_repeats when
    # num_repeats > 1).
    parallelism: 16
    task: livecodebench
    extra:
      # When true, a local code-execution sandbox server is started in the
      # background before `ns eval` and killed afterwards.
      use_sandbox: false
      # When > 1, appended to --benchmarks as `<task>:<num_repeats>`.
      num_repeats: null
      # Passed as ++prompt_config when set.
      prompt_config: null
      # Extra CLI arguments appended verbatim to the `ns eval` call.
      args: null
      # Passed as ++system_message when set.
      system_message: null
      # Passed as --split when set.
      dataset_split: test_v6_2408_2505
      # Judge flags (--judge_model, --judge_server_address, ...) are only
      # emitted when judge_support is true AND judge.url is set.
      judge_support: false
      judge:
        url: null
        model_id: null
        # Name of the env var holding the judge API key; forwarded as
        # ++server.api_key_env_var inside --extra_judge_args.
        api_key: null
        generation_type: null
        # NOTE(review): random_seed is not referenced by the command template
        # shown on this page — presumably consumed elsewhere; confirm.
        random_seed: 1234
        temperature: null
        top_p: null
        max_new_tokens: null
        args: null
        parallelism: null
      # RULER-specific settings; only used when `task` starts with 'ruler'
      # (forwarded to `ns prepare_data ruler`). Unused for this task.
      ruler:
        data_dir: null
        cluster: null
        setup: null
        max_seq_length: null
        tokenizer_path: null
        template_tokens: null
        num_samples: null
        tasks: null
  supported_endpoint_types:
  - chat
  type: ns_livecodebench
# Filled at runtime with api_endpoint (model_id, url, api_key_name),
# which the command template reads.
target: {}

ns_livecodebench_aa#

LiveCodeBench with AA custom prompt format (315 problems from July 2024 to Dec 2024, release_v5)

Harness: nemo_skills

Container:

nvcr.io/nvidia/eval-factory/nemo-skills:26.01

Container Digest:

sha256:43e2c4d6e197744f7fd0a874d06c5600a8b46b54e16d333c0ebf057b6d54635a

Container Arch: multiarch

Task Type: ns_livecodebench_aa

cd /nemo_run/code && {% if config.params.extra.use_sandbox %}python -m nemo_skills.code_execution.local_sandbox.local_sandbox_server > {{config.output_dir}}/sandbox.log 2>&1 & SANDBOX_PID=$! && sleep 3 && {% endif %}{% if not config.params.task.startswith('ruler') %} ns prepare_data {{config.params.task}} {% else %} mkdir -p {{config.params.extra.ruler.data_dir}} && ln -sf {{config.params.extra.ruler.data_dir}} /nemo_run/code/ruler_data &&  ns prepare_data ruler --data_dir={{config.params.extra.ruler.data_dir}} --cluster={{config.params.extra.ruler.cluster}} --setup={{config.params.extra.ruler.setup}} --max_seq_length={{config.params.extra.ruler.max_seq_length}} --tokenizer_path={{config.params.extra.ruler.tokenizer_path}} {% if config.params.extra.ruler.template_tokens is not none %}--template_tokens={{config.params.extra.ruler.template_tokens}}{% endif %} {% if config.params.extra.ruler.num_samples is not none %}--num_samples={{config.params.extra.ruler.num_samples}}{% elif config.params.limit_samples is not none %}--num_samples={{config.params.limit_samples}}{% endif %} {% if config.params.extra.ruler.tasks is not none %}--tasks {% for task in config.params.extra.ruler.tasks %}{{task}}{% if not loop.last %} {% endif %}{% endfor %}{% endif %} {% endif %} && ns eval --server_type=openai --model={{target.api_endpoint.model_id}} --server_address={{target.api_endpoint.url}} --benchmarks={{config.params.task}}{% if config.params.extra.num_repeats is not none and config.params.extra.num_repeats > 1 %}:{{config.params.extra.num_repeats}}{% endif %} --output_dir={{config.output_dir}} {% if config.params.extra.dataset_split is not none %}--split={{config.params.extra.dataset_split}}{% endif %} {% if config.params.extra.ruler.data_dir is not none %}--data_dir={{config.params.extra.ruler.data_dir}}{% endif %} ++server.api_key_env_var={% if target.api_endpoint.api_key_name is not none %}{{target.api_endpoint.api_key_name}}{% else %}DUMMY_API_KEY{% endif %} {% if 
config.params.max_new_tokens is not none %}++inference.tokens_to_generate={{config.params.max_new_tokens}}{% endif %} {% if config.params.extra.system_message is not none %} ++system_message='{{config.params.extra.system_message}}' {% endif %} {% if config.params.limit_samples is not none %}++max_samples={{config.params.limit_samples}}{% endif %} {% if config.params.parallelism is not none %}{% if config.params.extra.num_repeats is not none and config.params.extra.num_repeats > 1 %}++max_concurrent_requests={{[(config.params.parallelism / config.params.extra.num_repeats) | int, 1] | max}}{% else %}++max_concurrent_requests={{config.params.parallelism | int}}{% endif %}{% endif %} {% if config.params.temperature is not none %}++inference.temperature={{config.params.temperature}}{% endif %} {% if config.params.top_p is not none %}++inference.top_p={{config.params.top_p}}{% endif %} {% if config.params.extra.prompt_config is not none %}++prompt_config={{config.params.extra.prompt_config}}{% endif %} {% if config.params.extra.ruler.tokenizer_path is not none %}++tokenizer={{config.params.extra.ruler.tokenizer_path}}{% endif %} {% if config.params.extra.args is not none %} {{config.params.extra.args}} {% endif %} {% if config.params.extra.judge_support and config.params.extra.judge.url is not none %} --judge_model={{config.params.extra.judge.model_id}} --judge_server_address={{config.params.extra.judge.url}} --judge_server_type=openai {% if config.params.extra.judge.generation_type is not none %} --judge_generation_type={{config.params.extra.judge.generation_type}} {% endif %} --extra_judge_args="++server.api_key_env_var={% if config.params.extra.judge.api_key is not none %}{{config.params.extra.judge.api_key}}{% else %}DUMMY_API_KEY{% endif %} {%- if config.params.extra.judge.temperature is not none %} ++inference.temperature={{config.params.extra.judge.temperature}}{% endif %} {%- if config.params.extra.judge.top_p is not none %} 
++inference.top_p={{config.params.extra.judge.top_p}}{% endif %} {%- if config.params.extra.judge.max_new_tokens is not none %} ++inference.tokens_to_generate={{config.params.extra.judge.max_new_tokens}}{% endif %} {%- if config.params.extra.judge.parallelism is not none %} ++max_concurrent_requests={{config.params.extra.judge.parallelism}}{% endif %} {%- if config.params.extra.judge.args is not none %} {{config.params.extra.judge.args}}{% endif %}" {% endif %} {% if config.params.extra.use_sandbox %} ; EXIT_CODE=$? ; kill $SANDBOX_PID 2>/dev/null || true ; exit $EXIT_CODE{% endif %}
# Task definition for the nemo_skills harness (type: ns_livecodebench_aa,
# LiveCodeBench release_v5 with the AA custom prompt format). Values feed
# the Jinja command template above: `task` goes to `ns prepare_data` and
# `ns eval --benchmarks`; `extra.*` fields map to optional CLI flags
# (null means the flag is omitted).
framework_name: nemo_skills
pkg_name: nemo_skills
config:
  params:
    # Mapped to ++max_concurrent_requests (divided by num_repeats when
    # num_repeats > 1 — here 16 / 3 -> 5 concurrent requests).
    parallelism: 16
    task: livecodebench
    extra:
      # When true, a local code-execution sandbox server is started in the
      # background before `ns eval` and killed afterwards.
      use_sandbox: false
      # Appended to --benchmarks as `livecodebench:3` (3 repeats).
      num_repeats: 3
      # Custom AA prompt, passed as ++prompt_config.
      prompt_config: /nemo_run/code/eval_factory_prompts/livecodebench-aa.yaml
      # Extra CLI arguments appended verbatim to the `ns eval` call.
      args: null
      # Passed as ++system_message when set.
      system_message: null
      # Passed as --split (July 2024 to Dec 2024 slice of release_v5).
      dataset_split: test_v5_2407_2412
      # Judge flags (--judge_model, --judge_server_address, ...) are only
      # emitted when judge_support is true AND judge.url is set.
      judge_support: false
      judge:
        url: null
        model_id: null
        # Name of the env var holding the judge API key; forwarded as
        # ++server.api_key_env_var inside --extra_judge_args.
        api_key: null
        generation_type: null
        # NOTE(review): random_seed is not referenced by the command template
        # shown on this page — presumably consumed elsewhere; confirm.
        random_seed: 1234
        temperature: null
        top_p: null
        max_new_tokens: null
        args: null
        parallelism: null
      # RULER-specific settings; only used when `task` starts with 'ruler'.
      # Unused for this task.
      ruler:
        data_dir: null
        cluster: null
        setup: null
        max_seq_length: null
        tokenizer_path: null
        template_tokens: null
        num_samples: null
        tasks: null
  supported_endpoint_types:
  - chat
  type: ns_livecodebench_aa
# Filled at runtime with api_endpoint (model_id, url, api_key_name),
# which the command template reads.
target: {}

ns_livecodebench_v5#

LiveCodeBench v5

Harness: nemo_skills

Container:

nvcr.io/nvidia/eval-factory/nemo-skills:26.01

Container Digest:

sha256:43e2c4d6e197744f7fd0a874d06c5600a8b46b54e16d333c0ebf057b6d54635a

Container Arch: multiarch

Task Type: ns_livecodebench_v5

cd /nemo_run/code && {% if config.params.extra.use_sandbox %}python -m nemo_skills.code_execution.local_sandbox.local_sandbox_server > {{config.output_dir}}/sandbox.log 2>&1 & SANDBOX_PID=$! && sleep 3 && {% endif %}{% if not config.params.task.startswith('ruler') %} ns prepare_data {{config.params.task}} {% else %} mkdir -p {{config.params.extra.ruler.data_dir}} && ln -sf {{config.params.extra.ruler.data_dir}} /nemo_run/code/ruler_data &&  ns prepare_data ruler --data_dir={{config.params.extra.ruler.data_dir}} --cluster={{config.params.extra.ruler.cluster}} --setup={{config.params.extra.ruler.setup}} --max_seq_length={{config.params.extra.ruler.max_seq_length}} --tokenizer_path={{config.params.extra.ruler.tokenizer_path}} {% if config.params.extra.ruler.template_tokens is not none %}--template_tokens={{config.params.extra.ruler.template_tokens}}{% endif %} {% if config.params.extra.ruler.num_samples is not none %}--num_samples={{config.params.extra.ruler.num_samples}}{% elif config.params.limit_samples is not none %}--num_samples={{config.params.limit_samples}}{% endif %} {% if config.params.extra.ruler.tasks is not none %}--tasks {% for task in config.params.extra.ruler.tasks %}{{task}}{% if not loop.last %} {% endif %}{% endfor %}{% endif %} {% endif %} && ns eval --server_type=openai --model={{target.api_endpoint.model_id}} --server_address={{target.api_endpoint.url}} --benchmarks={{config.params.task}}{% if config.params.extra.num_repeats is not none and config.params.extra.num_repeats > 1 %}:{{config.params.extra.num_repeats}}{% endif %} --output_dir={{config.output_dir}} {% if config.params.extra.dataset_split is not none %}--split={{config.params.extra.dataset_split}}{% endif %} {% if config.params.extra.ruler.data_dir is not none %}--data_dir={{config.params.extra.ruler.data_dir}}{% endif %} ++server.api_key_env_var={% if target.api_endpoint.api_key_name is not none %}{{target.api_endpoint.api_key_name}}{% else %}DUMMY_API_KEY{% endif %} {% if 
config.params.max_new_tokens is not none %}++inference.tokens_to_generate={{config.params.max_new_tokens}}{% endif %} {% if config.params.extra.system_message is not none %} ++system_message='{{config.params.extra.system_message}}' {% endif %} {% if config.params.limit_samples is not none %}++max_samples={{config.params.limit_samples}}{% endif %} {% if config.params.parallelism is not none %}{% if config.params.extra.num_repeats is not none and config.params.extra.num_repeats > 1 %}++max_concurrent_requests={{[(config.params.parallelism / config.params.extra.num_repeats) | int, 1] | max}}{% else %}++max_concurrent_requests={{config.params.parallelism | int}}{% endif %}{% endif %} {% if config.params.temperature is not none %}++inference.temperature={{config.params.temperature}}{% endif %} {% if config.params.top_p is not none %}++inference.top_p={{config.params.top_p}}{% endif %} {% if config.params.extra.prompt_config is not none %}++prompt_config={{config.params.extra.prompt_config}}{% endif %} {% if config.params.extra.ruler.tokenizer_path is not none %}++tokenizer={{config.params.extra.ruler.tokenizer_path}}{% endif %} {% if config.params.extra.args is not none %} {{config.params.extra.args}} {% endif %} {% if config.params.extra.judge_support and config.params.extra.judge.url is not none %} --judge_model={{config.params.extra.judge.model_id}} --judge_server_address={{config.params.extra.judge.url}} --judge_server_type=openai {% if config.params.extra.judge.generation_type is not none %} --judge_generation_type={{config.params.extra.judge.generation_type}} {% endif %} --extra_judge_args="++server.api_key_env_var={% if config.params.extra.judge.api_key is not none %}{{config.params.extra.judge.api_key}}{% else %}DUMMY_API_KEY{% endif %} {%- if config.params.extra.judge.temperature is not none %} ++inference.temperature={{config.params.extra.judge.temperature}}{% endif %} {%- if config.params.extra.judge.top_p is not none %} 
++inference.top_p={{config.params.extra.judge.top_p}}{% endif %} {%- if config.params.extra.judge.max_new_tokens is not none %} ++inference.tokens_to_generate={{config.params.extra.judge.max_new_tokens}}{% endif %} {%- if config.params.extra.judge.parallelism is not none %} ++max_concurrent_requests={{config.params.extra.judge.parallelism}}{% endif %} {%- if config.params.extra.judge.args is not none %} {{config.params.extra.judge.args}}{% endif %}" {% endif %} {% if config.params.extra.use_sandbox %} ; EXIT_CODE=$? ; kill $SANDBOX_PID 2>/dev/null || true ; exit $EXIT_CODE{% endif %}
# Task definition for the nemo_skills harness (type: ns_livecodebench_v5,
# LiveCodeBench v5). Identical to ns_livecodebench except for the dataset
# split. Values feed the Jinja command template above; null fields mean
# the corresponding CLI flag is omitted.
framework_name: nemo_skills
pkg_name: nemo_skills
config:
  params:
    # Mapped to ++max_concurrent_requests (divided by num_repeats when
    # num_repeats > 1).
    parallelism: 16
    task: livecodebench
    extra:
      # When true, a local code-execution sandbox server is started in the
      # background before `ns eval` and killed afterwards.
      use_sandbox: false
      # When > 1, appended to --benchmarks as `<task>:<num_repeats>`.
      num_repeats: null
      # Passed as ++prompt_config when set.
      prompt_config: null
      # Extra CLI arguments appended verbatim to the `ns eval` call.
      args: null
      # Passed as ++system_message when set.
      system_message: null
      # Passed as --split (July 2024 to Dec 2024 slice of release_v5).
      dataset_split: test_v5_2407_2412
      # Judge flags are only emitted when judge_support is true AND
      # judge.url is set.
      judge_support: false
      judge:
        url: null
        model_id: null
        # Env var name holding the judge API key
        # (++server.api_key_env_var inside --extra_judge_args).
        api_key: null
        generation_type: null
        # NOTE(review): random_seed is not referenced by the command template
        # shown on this page — presumably consumed elsewhere; confirm.
        random_seed: 1234
        temperature: null
        top_p: null
        max_new_tokens: null
        args: null
        parallelism: null
      # RULER-specific settings; only used when `task` starts with 'ruler'.
      # Unused for this task.
      ruler:
        data_dir: null
        cluster: null
        setup: null
        max_seq_length: null
        tokenizer_path: null
        template_tokens: null
        num_samples: null
        tasks: null
  supported_endpoint_types:
  - chat
  type: ns_livecodebench_v5
# Filled at runtime with api_endpoint (model_id, url, api_key_name).
target: {}

ns_mmlu#

MMLU

Harness: nemo_skills

Container:

nvcr.io/nvidia/eval-factory/nemo-skills:26.01

Container Digest:

sha256:43e2c4d6e197744f7fd0a874d06c5600a8b46b54e16d333c0ebf057b6d54635a

Container Arch: multiarch

Task Type: ns_mmlu

cd /nemo_run/code && {% if config.params.extra.use_sandbox %}python -m nemo_skills.code_execution.local_sandbox.local_sandbox_server > {{config.output_dir}}/sandbox.log 2>&1 & SANDBOX_PID=$! && sleep 3 && {% endif %}{% if not config.params.task.startswith('ruler') %} ns prepare_data {{config.params.task}} {% else %} mkdir -p {{config.params.extra.ruler.data_dir}} && ln -sf {{config.params.extra.ruler.data_dir}} /nemo_run/code/ruler_data &&  ns prepare_data ruler --data_dir={{config.params.extra.ruler.data_dir}} --cluster={{config.params.extra.ruler.cluster}} --setup={{config.params.extra.ruler.setup}} --max_seq_length={{config.params.extra.ruler.max_seq_length}} --tokenizer_path={{config.params.extra.ruler.tokenizer_path}} {% if config.params.extra.ruler.template_tokens is not none %}--template_tokens={{config.params.extra.ruler.template_tokens}}{% endif %} {% if config.params.extra.ruler.num_samples is not none %}--num_samples={{config.params.extra.ruler.num_samples}}{% elif config.params.limit_samples is not none %}--num_samples={{config.params.limit_samples}}{% endif %} {% if config.params.extra.ruler.tasks is not none %}--tasks {% for task in config.params.extra.ruler.tasks %}{{task}}{% if not loop.last %} {% endif %}{% endfor %}{% endif %} {% endif %} && ns eval --server_type=openai --model={{target.api_endpoint.model_id}} --server_address={{target.api_endpoint.url}} --benchmarks={{config.params.task}}{% if config.params.extra.num_repeats is not none and config.params.extra.num_repeats > 1 %}:{{config.params.extra.num_repeats}}{% endif %} --output_dir={{config.output_dir}} {% if config.params.extra.dataset_split is not none %}--split={{config.params.extra.dataset_split}}{% endif %} {% if config.params.extra.ruler.data_dir is not none %}--data_dir={{config.params.extra.ruler.data_dir}}{% endif %} ++server.api_key_env_var={% if target.api_endpoint.api_key_name is not none %}{{target.api_endpoint.api_key_name}}{% else %}DUMMY_API_KEY{% endif %} {% if 
config.params.max_new_tokens is not none %}++inference.tokens_to_generate={{config.params.max_new_tokens}}{% endif %} {% if config.params.extra.system_message is not none %} ++system_message='{{config.params.extra.system_message}}' {% endif %} {% if config.params.limit_samples is not none %}++max_samples={{config.params.limit_samples}}{% endif %} {% if config.params.parallelism is not none %}{% if config.params.extra.num_repeats is not none and config.params.extra.num_repeats > 1 %}++max_concurrent_requests={{[(config.params.parallelism / config.params.extra.num_repeats) | int, 1] | max}}{% else %}++max_concurrent_requests={{config.params.parallelism | int}}{% endif %}{% endif %} {% if config.params.temperature is not none %}++inference.temperature={{config.params.temperature}}{% endif %} {% if config.params.top_p is not none %}++inference.top_p={{config.params.top_p}}{% endif %} {% if config.params.extra.prompt_config is not none %}++prompt_config={{config.params.extra.prompt_config}}{% endif %} {% if config.params.extra.ruler.tokenizer_path is not none %}++tokenizer={{config.params.extra.ruler.tokenizer_path}}{% endif %} {% if config.params.extra.args is not none %} {{config.params.extra.args}} {% endif %} {% if config.params.extra.judge_support and config.params.extra.judge.url is not none %} --judge_model={{config.params.extra.judge.model_id}} --judge_server_address={{config.params.extra.judge.url}} --judge_server_type=openai {% if config.params.extra.judge.generation_type is not none %} --judge_generation_type={{config.params.extra.judge.generation_type}} {% endif %} --extra_judge_args="++server.api_key_env_var={% if config.params.extra.judge.api_key is not none %}{{config.params.extra.judge.api_key}}{% else %}DUMMY_API_KEY{% endif %} {%- if config.params.extra.judge.temperature is not none %} ++inference.temperature={{config.params.extra.judge.temperature}}{% endif %} {%- if config.params.extra.judge.top_p is not none %} 
++inference.top_p={{config.params.extra.judge.top_p}}{% endif %} {%- if config.params.extra.judge.max_new_tokens is not none %} ++inference.tokens_to_generate={{config.params.extra.judge.max_new_tokens}}{% endif %} {%- if config.params.extra.judge.parallelism is not none %} ++max_concurrent_requests={{config.params.extra.judge.parallelism}}{% endif %} {%- if config.params.extra.judge.args is not none %} {{config.params.extra.judge.args}}{% endif %}" {% endif %} {% if config.params.extra.use_sandbox %} ; EXIT_CODE=$? ; kill $SANDBOX_PID 2>/dev/null || true ; exit $EXIT_CODE{% endif %}
# Task definition for the nemo_skills harness (type: ns_mmlu, MMLU).
# Values feed the Jinja command template above: `task` goes to
# `ns prepare_data` and `ns eval --benchmarks`; `extra.*` fields map to
# optional CLI flags (null means the flag is omitted).
framework_name: nemo_skills
pkg_name: nemo_skills
config:
  params:
    # Mapped to ++max_concurrent_requests (divided by num_repeats when
    # num_repeats > 1).
    parallelism: 16
    task: mmlu
    extra:
      # When true, a local code-execution sandbox server is started in the
      # background before `ns eval` and killed afterwards.
      use_sandbox: false
      # When > 1, appended to --benchmarks as `<task>:<num_repeats>`.
      num_repeats: null
      # Passed as ++prompt_config when set.
      prompt_config: null
      # Extra CLI arguments appended verbatim to the `ns eval` call.
      args: null
      # Passed as ++system_message when set.
      system_message: null
      # null -> no --split flag; the benchmark default split is used.
      dataset_split: null
      # Judge flags are only emitted when judge_support is true AND
      # judge.url is set.
      judge_support: false
      judge:
        url: null
        model_id: null
        # Env var name holding the judge API key
        # (++server.api_key_env_var inside --extra_judge_args).
        api_key: null
        generation_type: null
        # NOTE(review): random_seed is not referenced by the command template
        # shown on this page — presumably consumed elsewhere; confirm.
        random_seed: 1234
        temperature: null
        top_p: null
        max_new_tokens: null
        args: null
        parallelism: null
      # RULER-specific settings; only used when `task` starts with 'ruler'.
      # Unused for this task.
      ruler:
        data_dir: null
        cluster: null
        setup: null
        max_seq_length: null
        tokenizer_path: null
        template_tokens: null
        num_samples: null
        tasks: null
  supported_endpoint_types:
  - chat
  type: ns_mmlu
# Filled at runtime with api_endpoint (model_id, url, api_key_name).
target: {}

ns_mmlu_pro#

MMLU-Pro

Harness: nemo_skills

Container:

nvcr.io/nvidia/eval-factory/nemo-skills:26.01

Container Digest:

sha256:43e2c4d6e197744f7fd0a874d06c5600a8b46b54e16d333c0ebf057b6d54635a

Container Arch: multiarch

Task Type: ns_mmlu_pro

cd /nemo_run/code && {% if config.params.extra.use_sandbox %}python -m nemo_skills.code_execution.local_sandbox.local_sandbox_server > {{config.output_dir}}/sandbox.log 2>&1 & SANDBOX_PID=$! && sleep 3 && {% endif %}{% if not config.params.task.startswith('ruler') %} ns prepare_data {{config.params.task}} {% else %} mkdir -p {{config.params.extra.ruler.data_dir}} && ln -sf {{config.params.extra.ruler.data_dir}} /nemo_run/code/ruler_data &&  ns prepare_data ruler --data_dir={{config.params.extra.ruler.data_dir}} --cluster={{config.params.extra.ruler.cluster}} --setup={{config.params.extra.ruler.setup}} --max_seq_length={{config.params.extra.ruler.max_seq_length}} --tokenizer_path={{config.params.extra.ruler.tokenizer_path}} {% if config.params.extra.ruler.template_tokens is not none %}--template_tokens={{config.params.extra.ruler.template_tokens}}{% endif %} {% if config.params.extra.ruler.num_samples is not none %}--num_samples={{config.params.extra.ruler.num_samples}}{% elif config.params.limit_samples is not none %}--num_samples={{config.params.limit_samples}}{% endif %} {% if config.params.extra.ruler.tasks is not none %}--tasks {% for task in config.params.extra.ruler.tasks %}{{task}}{% if not loop.last %} {% endif %}{% endfor %}{% endif %} {% endif %} && ns eval --server_type=openai --model={{target.api_endpoint.model_id}} --server_address={{target.api_endpoint.url}} --benchmarks={{config.params.task}}{% if config.params.extra.num_repeats is not none and config.params.extra.num_repeats > 1 %}:{{config.params.extra.num_repeats}}{% endif %} --output_dir={{config.output_dir}} {% if config.params.extra.dataset_split is not none %}--split={{config.params.extra.dataset_split}}{% endif %} {% if config.params.extra.ruler.data_dir is not none %}--data_dir={{config.params.extra.ruler.data_dir}}{% endif %} ++server.api_key_env_var={% if target.api_endpoint.api_key_name is not none %}{{target.api_endpoint.api_key_name}}{% else %}DUMMY_API_KEY{% endif %} {% if 
config.params.max_new_tokens is not none %}++inference.tokens_to_generate={{config.params.max_new_tokens}}{% endif %} {% if config.params.extra.system_message is not none %} ++system_message='{{config.params.extra.system_message}}' {% endif %} {% if config.params.limit_samples is not none %}++max_samples={{config.params.limit_samples}}{% endif %} {% if config.params.parallelism is not none %}{% if config.params.extra.num_repeats is not none and config.params.extra.num_repeats > 1 %}++max_concurrent_requests={{[(config.params.parallelism / config.params.extra.num_repeats) | int, 1] | max}}{% else %}++max_concurrent_requests={{config.params.parallelism | int}}{% endif %}{% endif %} {% if config.params.temperature is not none %}++inference.temperature={{config.params.temperature}}{% endif %} {% if config.params.top_p is not none %}++inference.top_p={{config.params.top_p}}{% endif %} {% if config.params.extra.prompt_config is not none %}++prompt_config={{config.params.extra.prompt_config}}{% endif %} {% if config.params.extra.ruler.tokenizer_path is not none %}++tokenizer={{config.params.extra.ruler.tokenizer_path}}{% endif %} {% if config.params.extra.args is not none %} {{config.params.extra.args}} {% endif %} {% if config.params.extra.judge_support and config.params.extra.judge.url is not none %} --judge_model={{config.params.extra.judge.model_id}} --judge_server_address={{config.params.extra.judge.url}} --judge_server_type=openai {% if config.params.extra.judge.generation_type is not none %} --judge_generation_type={{config.params.extra.judge.generation_type}} {% endif %} --extra_judge_args="++server.api_key_env_var={% if config.params.extra.judge.api_key is not none %}{{config.params.extra.judge.api_key}}{% else %}DUMMY_API_KEY{% endif %} {%- if config.params.extra.judge.temperature is not none %} ++inference.temperature={{config.params.extra.judge.temperature}}{% endif %} {%- if config.params.extra.judge.top_p is not none %} 
++inference.top_p={{config.params.extra.judge.top_p}}{% endif %} {%- if config.params.extra.judge.max_new_tokens is not none %} ++inference.tokens_to_generate={{config.params.extra.judge.max_new_tokens}}{% endif %} {%- if config.params.extra.judge.parallelism is not none %} ++max_concurrent_requests={{config.params.extra.judge.parallelism}}{% endif %} {%- if config.params.extra.judge.args is not none %} {{config.params.extra.judge.args}}{% endif %}" {% endif %} {% if config.params.extra.use_sandbox %} ; EXIT_CODE=$? ; kill $SANDBOX_PID 2>/dev/null || true ; exit $EXIT_CODE{% endif %}
# Task definition for the nemo_skills harness (type: ns_mmlu_pro, MMLU-Pro).
# Values feed the Jinja command template above: `task` goes to
# `ns prepare_data` and `ns eval --benchmarks`; `extra.*` fields map to
# optional CLI flags (null means the flag is omitted).
framework_name: nemo_skills
pkg_name: nemo_skills
config:
  params:
    # Mapped to ++max_concurrent_requests (divided by num_repeats when
    # num_repeats > 1).
    parallelism: 16
    task: mmlu-pro
    extra:
      # When true, a local code-execution sandbox server is started in the
      # background before `ns eval` and killed afterwards.
      use_sandbox: false
      # When > 1, appended to --benchmarks as `<task>:<num_repeats>`.
      num_repeats: null
      # Passed as ++prompt_config when set.
      prompt_config: null
      # Extra CLI arguments appended verbatim to the `ns eval` call.
      args: null
      # Passed as ++system_message when set.
      system_message: null
      # null -> no --split flag; the benchmark default split is used.
      dataset_split: null
      # Judge flags are only emitted when judge_support is true AND
      # judge.url is set.
      judge_support: false
      judge:
        url: null
        model_id: null
        # Env var name holding the judge API key
        # (++server.api_key_env_var inside --extra_judge_args).
        api_key: null
        generation_type: null
        # NOTE(review): random_seed is not referenced by the command template
        # shown on this page — presumably consumed elsewhere; confirm.
        random_seed: 1234
        temperature: null
        top_p: null
        max_new_tokens: null
        args: null
        parallelism: null
      # RULER-specific settings; only used when `task` starts with 'ruler'.
      # Unused for this task.
      ruler:
        data_dir: null
        cluster: null
        setup: null
        max_seq_length: null
        tokenizer_path: null
        template_tokens: null
        num_samples: null
        tasks: null
  supported_endpoint_types:
  - chat
  type: ns_mmlu_pro
# Filled at runtime with api_endpoint (model_id, url, api_key_name).
target: {}

ns_mmlu_prox#

MMLU-ProX

Harness: nemo_skills

Container:

nvcr.io/nvidia/eval-factory/nemo-skills:26.01

Container Digest:

sha256:43e2c4d6e197744f7fd0a874d06c5600a8b46b54e16d333c0ebf057b6d54635a

Container Arch: multiarch

Task Type: ns_mmlu_prox

cd /nemo_run/code && {% if config.params.extra.use_sandbox %}python -m nemo_skills.code_execution.local_sandbox.local_sandbox_server > {{config.output_dir}}/sandbox.log 2>&1 & SANDBOX_PID=$! && sleep 3 && {% endif %}{% if not config.params.task.startswith('ruler') %} ns prepare_data {{config.params.task}} {% else %} mkdir -p {{config.params.extra.ruler.data_dir}} && ln -sf {{config.params.extra.ruler.data_dir}} /nemo_run/code/ruler_data &&  ns prepare_data ruler --data_dir={{config.params.extra.ruler.data_dir}} --cluster={{config.params.extra.ruler.cluster}} --setup={{config.params.extra.ruler.setup}} --max_seq_length={{config.params.extra.ruler.max_seq_length}} --tokenizer_path={{config.params.extra.ruler.tokenizer_path}} {% if config.params.extra.ruler.template_tokens is not none %}--template_tokens={{config.params.extra.ruler.template_tokens}}{% endif %} {% if config.params.extra.ruler.num_samples is not none %}--num_samples={{config.params.extra.ruler.num_samples}}{% elif config.params.limit_samples is not none %}--num_samples={{config.params.limit_samples}}{% endif %} {% if config.params.extra.ruler.tasks is not none %}--tasks {% for task in config.params.extra.ruler.tasks %}{{task}}{% if not loop.last %} {% endif %}{% endfor %}{% endif %} {% endif %} && ns eval --server_type=openai --model={{target.api_endpoint.model_id}} --server_address={{target.api_endpoint.url}} --benchmarks={{config.params.task}}{% if config.params.extra.num_repeats is not none and config.params.extra.num_repeats > 1 %}:{{config.params.extra.num_repeats}}{% endif %} --output_dir={{config.output_dir}} {% if config.params.extra.dataset_split is not none %}--split={{config.params.extra.dataset_split}}{% endif %} {% if config.params.extra.ruler.data_dir is not none %}--data_dir={{config.params.extra.ruler.data_dir}}{% endif %} ++server.api_key_env_var={% if target.api_endpoint.api_key_name is not none %}{{target.api_endpoint.api_key_name}}{% else %}DUMMY_API_KEY{% endif %} {% if 
config.params.max_new_tokens is not none %}++inference.tokens_to_generate={{config.params.max_new_tokens}}{% endif %} {% if config.params.extra.system_message is not none %} ++system_message='{{config.params.extra.system_message}}' {% endif %} {% if config.params.limit_samples is not none %}++max_samples={{config.params.limit_samples}}{% endif %} {% if config.params.parallelism is not none %}{% if config.params.extra.num_repeats is not none and config.params.extra.num_repeats > 1 %}++max_concurrent_requests={{[(config.params.parallelism / config.params.extra.num_repeats) | int, 1] | max}}{% else %}++max_concurrent_requests={{config.params.parallelism | int}}{% endif %}{% endif %} {% if config.params.temperature is not none %}++inference.temperature={{config.params.temperature}}{% endif %} {% if config.params.top_p is not none %}++inference.top_p={{config.params.top_p}}{% endif %} {% if config.params.extra.prompt_config is not none %}++prompt_config={{config.params.extra.prompt_config}}{% endif %} {% if config.params.extra.ruler.tokenizer_path is not none %}++tokenizer={{config.params.extra.ruler.tokenizer_path}}{% endif %} {% if config.params.extra.args is not none %} {{config.params.extra.args}} {% endif %} {% if config.params.extra.judge_support and config.params.extra.judge.url is not none %} --judge_model={{config.params.extra.judge.model_id}} --judge_server_address={{config.params.extra.judge.url}} --judge_server_type=openai {% if config.params.extra.judge.generation_type is not none %} --judge_generation_type={{config.params.extra.judge.generation_type}} {% endif %} --extra_judge_args="++server.api_key_env_var={% if config.params.extra.judge.api_key is not none %}{{config.params.extra.judge.api_key}}{% else %}DUMMY_API_KEY{% endif %} {%- if config.params.extra.judge.temperature is not none %} ++inference.temperature={{config.params.extra.judge.temperature}}{% endif %} {%- if config.params.extra.judge.top_p is not none %} 
++inference.top_p={{config.params.extra.judge.top_p}}{% endif %} {%- if config.params.extra.judge.max_new_tokens is not none %} ++inference.tokens_to_generate={{config.params.extra.judge.max_new_tokens}}{% endif %} {%- if config.params.extra.judge.parallelism is not none %} ++max_concurrent_requests={{config.params.extra.judge.parallelism}}{% endif %} {%- if config.params.extra.judge.args is not none %} {{config.params.extra.judge.args}}{% endif %}" {% endif %} {% if config.params.extra.use_sandbox %} ; EXIT_CODE=$? ; kill $SANDBOX_PID 2>/dev/null || true ; exit $EXIT_CODE{% endif %}
# Task definition for the nemo_skills harness (type: ns_mmlu_prox,
# MMLU-ProX). Values feed the Jinja command template above: `task` goes to
# `ns prepare_data` and `ns eval --benchmarks`; `extra.*` fields map to
# optional CLI flags (null means the flag is omitted).
framework_name: nemo_skills
pkg_name: nemo_skills
config:
  params:
    # Mapped to ++max_concurrent_requests (divided by num_repeats when
    # num_repeats > 1).
    parallelism: 16
    task: mmlu-prox
    extra:
      # When true, a local code-execution sandbox server is started in the
      # background before `ns eval` and killed afterwards.
      use_sandbox: false
      # When > 1, appended to --benchmarks as `<task>:<num_repeats>`.
      num_repeats: null
      # Passed as ++prompt_config when set.
      prompt_config: null
      # Extra CLI arguments appended verbatim to the `ns eval` call.
      args: null
      # Passed as ++system_message when set.
      system_message: null
      # null -> no --split flag; the benchmark default split is used.
      dataset_split: null
      # Judge flags are only emitted when judge_support is true AND
      # judge.url is set.
      judge_support: false
      judge:
        url: null
        model_id: null
        # Env var name holding the judge API key
        # (++server.api_key_env_var inside --extra_judge_args).
        api_key: null
        generation_type: null
        # NOTE(review): random_seed is not referenced by the command template
        # shown on this page — presumably consumed elsewhere; confirm.
        random_seed: 1234
        temperature: null
        top_p: null
        max_new_tokens: null
        args: null
        parallelism: null
      # RULER-specific settings; only used when `task` starts with 'ruler'.
      # Unused for this task.
      ruler:
        data_dir: null
        cluster: null
        setup: null
        max_seq_length: null
        tokenizer_path: null
        template_tokens: null
        num_samples: null
        tasks: null
  supported_endpoint_types:
  - chat
  type: ns_mmlu_prox
# Filled at runtime with api_endpoint (model_id, url, api_key_name).
target: {}

ns_ruler#

RULER - Long Context Understanding

Harness: nemo_skills

Container:

nvcr.io/nvidia/eval-factory/nemo-skills:26.01

Container Digest:

sha256:43e2c4d6e197744f7fd0a874d06c5600a8b46b54e16d333c0ebf057b6d54635a

Container Arch: multiarch

Task Type: ns_ruler

# Jinja2-templated shell command for this task: optionally launches a local
# code-execution sandbox server (when extra.use_sandbox is true), prepares the
# benchmark data (RULER tasks take a dedicated `ns prepare_data ruler` branch
# driven by the extra.ruler.* options), then runs `ns eval` against the target
# endpoint, forwarding inference settings and — when judge_support is enabled
# and a judge URL is configured — LLM-judge settings via --extra_judge_args.
cd /nemo_run/code && {% if config.params.extra.use_sandbox %}python -m nemo_skills.code_execution.local_sandbox.local_sandbox_server > {{config.output_dir}}/sandbox.log 2>&1 & SANDBOX_PID=$! && sleep 3 && {% endif %}{% if not config.params.task.startswith('ruler') %} ns prepare_data {{config.params.task}} {% else %} mkdir -p {{config.params.extra.ruler.data_dir}} && ln -sf {{config.params.extra.ruler.data_dir}} /nemo_run/code/ruler_data &&  ns prepare_data ruler --data_dir={{config.params.extra.ruler.data_dir}} --cluster={{config.params.extra.ruler.cluster}} --setup={{config.params.extra.ruler.setup}} --max_seq_length={{config.params.extra.ruler.max_seq_length}} --tokenizer_path={{config.params.extra.ruler.tokenizer_path}} {% if config.params.extra.ruler.template_tokens is not none %}--template_tokens={{config.params.extra.ruler.template_tokens}}{% endif %} {% if config.params.extra.ruler.num_samples is not none %}--num_samples={{config.params.extra.ruler.num_samples}}{% elif config.params.limit_samples is not none %}--num_samples={{config.params.limit_samples}}{% endif %} {% if config.params.extra.ruler.tasks is not none %}--tasks {% for task in config.params.extra.ruler.tasks %}{{task}}{% if not loop.last %} {% endif %}{% endfor %}{% endif %} {% endif %} && ns eval --server_type=openai --model={{target.api_endpoint.model_id}} --server_address={{target.api_endpoint.url}} --benchmarks={{config.params.task}}{% if config.params.extra.num_repeats is not none and config.params.extra.num_repeats > 1 %}:{{config.params.extra.num_repeats}}{% endif %} --output_dir={{config.output_dir}} {% if config.params.extra.dataset_split is not none %}--split={{config.params.extra.dataset_split}}{% endif %} {% if config.params.extra.ruler.data_dir is not none %}--data_dir={{config.params.extra.ruler.data_dir}}{% endif %} ++server.api_key_env_var={% if target.api_endpoint.api_key_name is not none %}{{target.api_endpoint.api_key_name}}{% else %}DUMMY_API_KEY{% endif %} {% if 
config.params.max_new_tokens is not none %}++inference.tokens_to_generate={{config.params.max_new_tokens}}{% endif %} {% if config.params.extra.system_message is not none %} ++system_message='{{config.params.extra.system_message}}' {% endif %} {% if config.params.limit_samples is not none %}++max_samples={{config.params.limit_samples}}{% endif %} {% if config.params.parallelism is not none %}{% if config.params.extra.num_repeats is not none and config.params.extra.num_repeats > 1 %}++max_concurrent_requests={{[(config.params.parallelism / config.params.extra.num_repeats) | int, 1] | max}}{% else %}++max_concurrent_requests={{config.params.parallelism | int}}{% endif %}{% endif %} {% if config.params.temperature is not none %}++inference.temperature={{config.params.temperature}}{% endif %} {% if config.params.top_p is not none %}++inference.top_p={{config.params.top_p}}{% endif %} {% if config.params.extra.prompt_config is not none %}++prompt_config={{config.params.extra.prompt_config}}{% endif %} {% if config.params.extra.ruler.tokenizer_path is not none %}++tokenizer={{config.params.extra.ruler.tokenizer_path}}{% endif %} {% if config.params.extra.args is not none %} {{config.params.extra.args}} {% endif %} {% if config.params.extra.judge_support and config.params.extra.judge.url is not none %} --judge_model={{config.params.extra.judge.model_id}} --judge_server_address={{config.params.extra.judge.url}} --judge_server_type=openai {% if config.params.extra.judge.generation_type is not none %} --judge_generation_type={{config.params.extra.judge.generation_type}} {% endif %} --extra_judge_args="++server.api_key_env_var={% if config.params.extra.judge.api_key is not none %}{{config.params.extra.judge.api_key}}{% else %}DUMMY_API_KEY{% endif %} {%- if config.params.extra.judge.temperature is not none %} ++inference.temperature={{config.params.extra.judge.temperature}}{% endif %} {%- if config.params.extra.judge.top_p is not none %} 
++inference.top_p={{config.params.extra.judge.top_p}}{% endif %} {%- if config.params.extra.judge.max_new_tokens is not none %} ++inference.tokens_to_generate={{config.params.extra.judge.max_new_tokens}}{% endif %} {%- if config.params.extra.judge.parallelism is not none %} ++max_concurrent_requests={{config.params.extra.judge.parallelism}}{% endif %} {%- if config.params.extra.judge.args is not none %} {{config.params.extra.judge.args}}{% endif %}" {% endif %} {% if config.params.extra.use_sandbox %} ; EXIT_CODE=$? ; kill $SANDBOX_PID 2>/dev/null || true ; exit $EXIT_CODE{% endif %}
# Default configuration for the ns_ruler task; values below fill the template above.
framework_name: nemo_skills
pkg_name: nemo_skills
config:
  params:
    # Forwarded as ++max_concurrent_requests (divided by num_repeats when repeats > 1).
    parallelism: 16
    # Passed to `ns prepare_data` and `--benchmarks`; the `ruler.` prefix selects the
    # RULER-specific data-preparation branch of the command.
    task: ruler.evaluation_128k
    extra:
      # When true, the command starts a local code-execution sandbox server first.
      use_sandbox: false
      # When > 1, appended to --benchmarks as `:<num_repeats>` to repeat each sample.
      num_repeats: null
      # Forwarded as ++prompt_config when set.
      prompt_config: null
      # Free-form extra CLI arguments appended to the `ns eval` invocation.
      args: null
      # Forwarded as ++system_message when set.
      system_message: null
      # Forwarded as --split when set.
      dataset_split: null
      # Judge flags are only emitted when this is true AND judge.url is set.
      judge_support: false
      judge:
        url: null
        model_id: null
        # Name of the env var holding the judge API key (++server.api_key_env_var);
        # DUMMY_API_KEY is substituted when unset.
        api_key: null
        generation_type: null
        # NOTE(review): not referenced by the command template above — confirm whether
        # this is consumed elsewhere.
        random_seed: 1234
        temperature: null
        top_p: null
        max_new_tokens: null
        args: null
        parallelism: null
      # Options forwarded to `ns prepare_data ruler` (used because task starts with `ruler`).
      ruler:
        data_dir: /workspace/ruler_data
        cluster: local
        setup: evaluation_128k
        # Passed as --max_seq_length for RULER data preparation.
        max_seq_length: 131072
        tokenizer_path: null
        template_tokens: 50
        num_samples: null
        tasks: null
  supported_endpoint_types:
  - completions
  type: ns_ruler
target: {}

ns_scicode#

SciCode

Harness: nemo_skills

Container:

nvcr.io/nvidia/eval-factory/nemo-skills:26.01

Container Digest:

sha256:43e2c4d6e197744f7fd0a874d06c5600a8b46b54e16d333c0ebf057b6d54635a

Container Arch: multiarch

Task Type: ns_scicode

# Jinja2-templated shell command for this task: optionally launches a local
# code-execution sandbox server (when extra.use_sandbox is true — it is for
# SciCode), prepares the benchmark data, then runs `ns eval` against the target
# endpoint, forwarding inference settings and — when judge_support is enabled
# and a judge URL is configured — LLM-judge settings via --extra_judge_args.
cd /nemo_run/code && {% if config.params.extra.use_sandbox %}python -m nemo_skills.code_execution.local_sandbox.local_sandbox_server > {{config.output_dir}}/sandbox.log 2>&1 & SANDBOX_PID=$! && sleep 3 && {% endif %}{% if not config.params.task.startswith('ruler') %} ns prepare_data {{config.params.task}} {% else %} mkdir -p {{config.params.extra.ruler.data_dir}} && ln -sf {{config.params.extra.ruler.data_dir}} /nemo_run/code/ruler_data &&  ns prepare_data ruler --data_dir={{config.params.extra.ruler.data_dir}} --cluster={{config.params.extra.ruler.cluster}} --setup={{config.params.extra.ruler.setup}} --max_seq_length={{config.params.extra.ruler.max_seq_length}} --tokenizer_path={{config.params.extra.ruler.tokenizer_path}} {% if config.params.extra.ruler.template_tokens is not none %}--template_tokens={{config.params.extra.ruler.template_tokens}}{% endif %} {% if config.params.extra.ruler.num_samples is not none %}--num_samples={{config.params.extra.ruler.num_samples}}{% elif config.params.limit_samples is not none %}--num_samples={{config.params.limit_samples}}{% endif %} {% if config.params.extra.ruler.tasks is not none %}--tasks {% for task in config.params.extra.ruler.tasks %}{{task}}{% if not loop.last %} {% endif %}{% endfor %}{% endif %} {% endif %} && ns eval --server_type=openai --model={{target.api_endpoint.model_id}} --server_address={{target.api_endpoint.url}} --benchmarks={{config.params.task}}{% if config.params.extra.num_repeats is not none and config.params.extra.num_repeats > 1 %}:{{config.params.extra.num_repeats}}{% endif %} --output_dir={{config.output_dir}} {% if config.params.extra.dataset_split is not none %}--split={{config.params.extra.dataset_split}}{% endif %} {% if config.params.extra.ruler.data_dir is not none %}--data_dir={{config.params.extra.ruler.data_dir}}{% endif %} ++server.api_key_env_var={% if target.api_endpoint.api_key_name is not none %}{{target.api_endpoint.api_key_name}}{% else %}DUMMY_API_KEY{% endif %} {% if 
config.params.max_new_tokens is not none %}++inference.tokens_to_generate={{config.params.max_new_tokens}}{% endif %} {% if config.params.extra.system_message is not none %} ++system_message='{{config.params.extra.system_message}}' {% endif %} {% if config.params.limit_samples is not none %}++max_samples={{config.params.limit_samples}}{% endif %} {% if config.params.parallelism is not none %}{% if config.params.extra.num_repeats is not none and config.params.extra.num_repeats > 1 %}++max_concurrent_requests={{[(config.params.parallelism / config.params.extra.num_repeats) | int, 1] | max}}{% else %}++max_concurrent_requests={{config.params.parallelism | int}}{% endif %}{% endif %} {% if config.params.temperature is not none %}++inference.temperature={{config.params.temperature}}{% endif %} {% if config.params.top_p is not none %}++inference.top_p={{config.params.top_p}}{% endif %} {% if config.params.extra.prompt_config is not none %}++prompt_config={{config.params.extra.prompt_config}}{% endif %} {% if config.params.extra.ruler.tokenizer_path is not none %}++tokenizer={{config.params.extra.ruler.tokenizer_path}}{% endif %} {% if config.params.extra.args is not none %} {{config.params.extra.args}} {% endif %} {% if config.params.extra.judge_support and config.params.extra.judge.url is not none %} --judge_model={{config.params.extra.judge.model_id}} --judge_server_address={{config.params.extra.judge.url}} --judge_server_type=openai {% if config.params.extra.judge.generation_type is not none %} --judge_generation_type={{config.params.extra.judge.generation_type}} {% endif %} --extra_judge_args="++server.api_key_env_var={% if config.params.extra.judge.api_key is not none %}{{config.params.extra.judge.api_key}}{% else %}DUMMY_API_KEY{% endif %} {%- if config.params.extra.judge.temperature is not none %} ++inference.temperature={{config.params.extra.judge.temperature}}{% endif %} {%- if config.params.extra.judge.top_p is not none %} 
++inference.top_p={{config.params.extra.judge.top_p}}{% endif %} {%- if config.params.extra.judge.max_new_tokens is not none %} ++inference.tokens_to_generate={{config.params.extra.judge.max_new_tokens}}{% endif %} {%- if config.params.extra.judge.parallelism is not none %} ++max_concurrent_requests={{config.params.extra.judge.parallelism}}{% endif %} {%- if config.params.extra.judge.args is not none %} {{config.params.extra.judge.args}}{% endif %}" {% endif %} {% if config.params.extra.use_sandbox %} ; EXIT_CODE=$? ; kill $SANDBOX_PID 2>/dev/null || true ; exit $EXIT_CODE{% endif %}
# Default configuration for the ns_scicode task; values below fill the template above.
framework_name: nemo_skills
pkg_name: nemo_skills
config:
  params:
    # Forwarded as ++max_concurrent_requests (divided by num_repeats when repeats > 1).
    parallelism: 16
    # Passed to `ns prepare_data` and `--benchmarks`.
    task: scicode
    extra:
      # SciCode runs generated code, so the local code-execution sandbox is started.
      use_sandbox: true
      # When > 1, appended to --benchmarks as `:<num_repeats>` to repeat each sample.
      num_repeats: null
      # Forwarded as ++prompt_config when set.
      prompt_config: null
      # Free-form extra CLI arguments appended to the `ns eval` invocation.
      args: null
      # Forwarded as ++system_message when set.
      system_message: null
      # Forwarded as --split when set.
      dataset_split: null
      # Judge flags are only emitted when this is true AND judge.url is set.
      judge_support: false
      judge:
        url: null
        model_id: null
        # Name of the env var holding the judge API key (++server.api_key_env_var);
        # DUMMY_API_KEY is substituted when unset.
        api_key: null
        generation_type: null
        # NOTE(review): not referenced by the command template above — confirm whether
        # this is consumed elsewhere.
        random_seed: 1234
        temperature: null
        top_p: null
        max_new_tokens: null
        args: null
        parallelism: null
      # RULER-only data-preparation options; unused here (task is not `ruler.*`).
      ruler:
        data_dir: null
        cluster: null
        setup: null
        max_seq_length: null
        tokenizer_path: null
        template_tokens: null
        num_samples: null
        tasks: null
  supported_endpoint_types:
  - chat
  type: ns_scicode
target: {}

ns_wmt24pp#

WMT24++

Harness: nemo_skills

Container:

nvcr.io/nvidia/eval-factory/nemo-skills:26.01

Container Digest:

sha256:43e2c4d6e197744f7fd0a874d06c5600a8b46b54e16d333c0ebf057b6d54635a

Container Arch: multiarch

Task Type: ns_wmt24pp

# Jinja2-templated shell command for this task: optionally launches a local
# code-execution sandbox server (when extra.use_sandbox is true), prepares the
# benchmark data, then runs `ns eval` against the target endpoint, forwarding
# inference settings and — when judge_support is enabled and a judge URL is
# configured — LLM-judge settings via --extra_judge_args.
cd /nemo_run/code && {% if config.params.extra.use_sandbox %}python -m nemo_skills.code_execution.local_sandbox.local_sandbox_server > {{config.output_dir}}/sandbox.log 2>&1 & SANDBOX_PID=$! && sleep 3 && {% endif %}{% if not config.params.task.startswith('ruler') %} ns prepare_data {{config.params.task}} {% else %} mkdir -p {{config.params.extra.ruler.data_dir}} && ln -sf {{config.params.extra.ruler.data_dir}} /nemo_run/code/ruler_data &&  ns prepare_data ruler --data_dir={{config.params.extra.ruler.data_dir}} --cluster={{config.params.extra.ruler.cluster}} --setup={{config.params.extra.ruler.setup}} --max_seq_length={{config.params.extra.ruler.max_seq_length}} --tokenizer_path={{config.params.extra.ruler.tokenizer_path}} {% if config.params.extra.ruler.template_tokens is not none %}--template_tokens={{config.params.extra.ruler.template_tokens}}{% endif %} {% if config.params.extra.ruler.num_samples is not none %}--num_samples={{config.params.extra.ruler.num_samples}}{% elif config.params.limit_samples is not none %}--num_samples={{config.params.limit_samples}}{% endif %} {% if config.params.extra.ruler.tasks is not none %}--tasks {% for task in config.params.extra.ruler.tasks %}{{task}}{% if not loop.last %} {% endif %}{% endfor %}{% endif %} {% endif %} && ns eval --server_type=openai --model={{target.api_endpoint.model_id}} --server_address={{target.api_endpoint.url}} --benchmarks={{config.params.task}}{% if config.params.extra.num_repeats is not none and config.params.extra.num_repeats > 1 %}:{{config.params.extra.num_repeats}}{% endif %} --output_dir={{config.output_dir}} {% if config.params.extra.dataset_split is not none %}--split={{config.params.extra.dataset_split}}{% endif %} {% if config.params.extra.ruler.data_dir is not none %}--data_dir={{config.params.extra.ruler.data_dir}}{% endif %} ++server.api_key_env_var={% if target.api_endpoint.api_key_name is not none %}{{target.api_endpoint.api_key_name}}{% else %}DUMMY_API_KEY{% endif %} {% if 
config.params.max_new_tokens is not none %}++inference.tokens_to_generate={{config.params.max_new_tokens}}{% endif %} {% if config.params.extra.system_message is not none %} ++system_message='{{config.params.extra.system_message}}' {% endif %} {% if config.params.limit_samples is not none %}++max_samples={{config.params.limit_samples}}{% endif %} {% if config.params.parallelism is not none %}{% if config.params.extra.num_repeats is not none and config.params.extra.num_repeats > 1 %}++max_concurrent_requests={{[(config.params.parallelism / config.params.extra.num_repeats) | int, 1] | max}}{% else %}++max_concurrent_requests={{config.params.parallelism | int}}{% endif %}{% endif %} {% if config.params.temperature is not none %}++inference.temperature={{config.params.temperature}}{% endif %} {% if config.params.top_p is not none %}++inference.top_p={{config.params.top_p}}{% endif %} {% if config.params.extra.prompt_config is not none %}++prompt_config={{config.params.extra.prompt_config}}{% endif %} {% if config.params.extra.ruler.tokenizer_path is not none %}++tokenizer={{config.params.extra.ruler.tokenizer_path}}{% endif %} {% if config.params.extra.args is not none %} {{config.params.extra.args}} {% endif %} {% if config.params.extra.judge_support and config.params.extra.judge.url is not none %} --judge_model={{config.params.extra.judge.model_id}} --judge_server_address={{config.params.extra.judge.url}} --judge_server_type=openai {% if config.params.extra.judge.generation_type is not none %} --judge_generation_type={{config.params.extra.judge.generation_type}} {% endif %} --extra_judge_args="++server.api_key_env_var={% if config.params.extra.judge.api_key is not none %}{{config.params.extra.judge.api_key}}{% else %}DUMMY_API_KEY{% endif %} {%- if config.params.extra.judge.temperature is not none %} ++inference.temperature={{config.params.extra.judge.temperature}}{% endif %} {%- if config.params.extra.judge.top_p is not none %} 
++inference.top_p={{config.params.extra.judge.top_p}}{% endif %} {%- if config.params.extra.judge.max_new_tokens is not none %} ++inference.tokens_to_generate={{config.params.extra.judge.max_new_tokens}}{% endif %} {%- if config.params.extra.judge.parallelism is not none %} ++max_concurrent_requests={{config.params.extra.judge.parallelism}}{% endif %} {%- if config.params.extra.judge.args is not none %} {{config.params.extra.judge.args}}{% endif %}" {% endif %} {% if config.params.extra.use_sandbox %} ; EXIT_CODE=$? ; kill $SANDBOX_PID 2>/dev/null || true ; exit $EXIT_CODE{% endif %}
# Default configuration for the ns_wmt24pp task; values below fill the template above.
framework_name: nemo_skills
pkg_name: nemo_skills
config:
  params:
    # Forwarded as ++max_concurrent_requests (divided by num_repeats when repeats > 1).
    parallelism: 16
    # Passed to `ns prepare_data` and `--benchmarks`.
    task: wmt24pp
    extra:
      # No code execution needed for translation, so no sandbox is started.
      use_sandbox: false
      # When > 1, appended to --benchmarks as `:<num_repeats>` to repeat each sample.
      num_repeats: null
      # Forwarded as ++prompt_config when set.
      prompt_config: null
      # Free-form extra CLI arguments appended to the `ns eval` invocation.
      args: null
      # Forwarded as ++system_message when set.
      system_message: null
      # Forwarded as --split when set.
      dataset_split: null
      # Judge flags are only emitted when this is true AND judge.url is set.
      judge_support: false
      judge:
        url: null
        model_id: null
        # Name of the env var holding the judge API key (++server.api_key_env_var);
        # DUMMY_API_KEY is substituted when unset.
        api_key: null
        generation_type: null
        # NOTE(review): not referenced by the command template above — confirm whether
        # this is consumed elsewhere.
        random_seed: 1234
        temperature: null
        top_p: null
        max_new_tokens: null
        args: null
        parallelism: null
      # RULER-only data-preparation options; unused here (task is not `ruler.*`).
      ruler:
        data_dir: null
        cluster: null
        setup: null
        max_seq_length: null
        tokenizer_path: null
        template_tokens: null
        num_samples: null
        tasks: null
  supported_endpoint_types:
  - chat
  type: ns_wmt24pp
target: {}