# profbench

This page contains all evaluation tasks for the profbench harness.

| Task | Description |
| --- | --- |
| llm_judge | Run LLM judge on provided ProfBench reports and score them |
| report_generation | Generate professional reports and evaluate them (full pipeline) |

## llm_judge

Run LLM judge on provided ProfBench reports and score them

Harness: profbench

Container:

nvcr.io/nvidia/eval-factory/profbench:26.01

Container Digest:

sha256:7b2766affe4c2070ec803a893f7bf1ff2fc735df562aa520ec910c9ef58d3598

Container Arch: multiarch

Task Type: llm_judge

{% if target.api_endpoint.api_key_name is not none %}
  export API_KEY=${{target.api_endpoint.api_key_name}} && 
{% endif %} {% if config.params.extra.run_generation %}
  python -m profbench.run_report_generation \
    --model {{target.api_endpoint.model_id}} \
    --library {{config.params.extra.library}} \
    --timeout {{config.params.request_timeout}} \
    --parallel {{config.params.parallelism}} \
    --retry-attempts {{config.params.max_retries}} \
    --folder {{config.output_dir}}{% if target.api_endpoint.url is not none %} --base-url {{target.api_endpoint.url}}{% endif %}{% if config.params.extra.version is not none %} --version {{config.params.extra.version}}{% endif %}{% if config.params.extra.web_search %} --web-search{% endif %}{% if config.params.extra.reasoning %} --reasoning{% endif %}{% if config.params.extra.reasoning_effort is not none %} --reasoning-effort {{config.params.extra.reasoning_effort}}{% endif %}{% if config.params.limit_samples is not none %} --limit-samples {{config.params.limit_samples}}{% endif %}{% if config.params.temperature is not none %} --temperature {{config.params.temperature}}{% endif %}{% if config.params.top_p is not none %} --top-p {{config.params.top_p}}{% endif %}{% if config.params.max_new_tokens is not none %} --max-tokens {{config.params.max_new_tokens}}{% endif %} && 
  GENERATION_OUTPUT=$(ls -t {{config.output_dir}}/*.jsonl | head -1) && 
{% endif %} {% if config.params.extra.run_judge_generated %}
  python -m profbench.run_best_llm_judge_on_generated_reports \
    --filename $GENERATION_OUTPUT \
    --api-key $API_KEY \
    --model {{target.api_endpoint.model_id}} \
    --library {{config.params.extra.library}} \
    --timeout {{config.params.request_timeout}} \
    --parallel {{config.params.parallelism}} \
    --retry-attempts {{config.params.max_retries}} \
    --output-folder {{config.output_dir}}/judgements{% if target.api_endpoint.url is not none %} --base-url {{target.api_endpoint.url}}{% endif %}{% if config.params.limit_samples is not none %} --limit-samples {{config.params.limit_samples}}{% endif %}{% if config.params.temperature is not none %} --temperature {{config.params.temperature}}{% endif %}{% if config.params.top_p is not none %} --top-p {{config.params.top_p}}{% endif %}{% if config.params.max_new_tokens is not none %} --max-tokens {{config.params.max_new_tokens}}{% endif %} && 
  JUDGE_OUTPUT=$(ls -t {{config.output_dir}}/judgements/*.jsonl | head -1) && 
  python -m profbench.score_report_generation $JUDGE_OUTPUT
{% endif %} {% if config.params.extra.run_judge_provided %}
  python -m profbench.run_llm_judge_on_provided_reports \
    --model {{target.api_endpoint.model_id}} \
    --library {{config.params.extra.library}} \
    --timeout {{config.params.request_timeout}} \
    --parallel {{config.params.parallelism}} \
    --retry-attempts {{config.params.max_retries}} \
    --folder {{config.output_dir}}{% if target.api_endpoint.url is not none %} --base-url {{target.api_endpoint.url}}{% endif %}{% if config.params.extra.reasoning %} --reasoning{% endif %}{% if config.params.extra.reasoning_effort is not none %} --reasoning-effort {{config.params.extra.reasoning_effort}}{% endif %}{% if config.params.extra.debug %} --debug{% endif %}{% if config.params.limit_samples is not none %} --limit-samples {{config.params.limit_samples}}{% endif %}{% if config.params.temperature is not none %} --temperature {{config.params.temperature}}{% endif %}{% if config.params.top_p is not none %} --top-p {{config.params.top_p}}{% endif %}{% if config.params.max_new_tokens is not none %} --max-tokens {{config.params.max_new_tokens}}{% endif %} && 
  JUDGE_OUTPUT=$(ls -t {{config.output_dir}}/*.jsonl | head -1) && 
  python -m profbench.score_llm_judge $JUDGE_OUTPUT
{% endif %}
framework_name: profbench
pkg_name: profbench
config:
  params:
    max_new_tokens: 4096
    max_retries: 5
    parallelism: 10
    temperature: 0.0
    request_timeout: 600
    top_p: 1.0e-05
    extra:
      run_generation: false
      run_judge_generated: false
      run_judge_provided: true
      library: openai
      version: lite
      web_search: false
      reasoning: false
      reasoning_effort: null
      debug: false
  supported_endpoint_types:
  - chat
  type: llm_judge
target:
  api_endpoint: {}

## report_generation

Generate professional reports and evaluate them (full pipeline)

Harness: profbench

Container:

nvcr.io/nvidia/eval-factory/profbench:26.01

Container Digest:

sha256:7b2766affe4c2070ec803a893f7bf1ff2fc735df562aa520ec910c9ef58d3598

Container Arch: multiarch

Task Type: report_generation

{% if target.api_endpoint.api_key_name is not none %}
  export API_KEY=${{target.api_endpoint.api_key_name}} && 
{% endif %} {% if config.params.extra.run_generation %}
  python -m profbench.run_report_generation \
    --model {{target.api_endpoint.model_id}} \
    --library {{config.params.extra.library}} \
    --timeout {{config.params.request_timeout}} \
    --parallel {{config.params.parallelism}} \
    --retry-attempts {{config.params.max_retries}} \
    --folder {{config.output_dir}}{% if target.api_endpoint.url is not none %} --base-url {{target.api_endpoint.url}}{% endif %}{% if config.params.extra.version is not none %} --version {{config.params.extra.version}}{% endif %}{% if config.params.extra.web_search %} --web-search{% endif %}{% if config.params.extra.reasoning %} --reasoning{% endif %}{% if config.params.extra.reasoning_effort is not none %} --reasoning-effort {{config.params.extra.reasoning_effort}}{% endif %}{% if config.params.limit_samples is not none %} --limit-samples {{config.params.limit_samples}}{% endif %}{% if config.params.temperature is not none %} --temperature {{config.params.temperature}}{% endif %}{% if config.params.top_p is not none %} --top-p {{config.params.top_p}}{% endif %}{% if config.params.max_new_tokens is not none %} --max-tokens {{config.params.max_new_tokens}}{% endif %} && 
  GENERATION_OUTPUT=$(ls -t {{config.output_dir}}/*.jsonl | head -1) && 
{% endif %} {% if config.params.extra.run_judge_generated %}
  python -m profbench.run_best_llm_judge_on_generated_reports \
    --filename $GENERATION_OUTPUT \
    --api-key $API_KEY \
    --model {{target.api_endpoint.model_id}} \
    --library {{config.params.extra.library}} \
    --timeout {{config.params.request_timeout}} \
    --parallel {{config.params.parallelism}} \
    --retry-attempts {{config.params.max_retries}} \
    --output-folder {{config.output_dir}}/judgements{% if target.api_endpoint.url is not none %} --base-url {{target.api_endpoint.url}}{% endif %}{% if config.params.limit_samples is not none %} --limit-samples {{config.params.limit_samples}}{% endif %}{% if config.params.temperature is not none %} --temperature {{config.params.temperature}}{% endif %}{% if config.params.top_p is not none %} --top-p {{config.params.top_p}}{% endif %}{% if config.params.max_new_tokens is not none %} --max-tokens {{config.params.max_new_tokens}}{% endif %} && 
  JUDGE_OUTPUT=$(ls -t {{config.output_dir}}/judgements/*.jsonl | head -1) && 
  python -m profbench.score_report_generation $JUDGE_OUTPUT
{% endif %} {% if config.params.extra.run_judge_provided %}
  python -m profbench.run_llm_judge_on_provided_reports \
    --model {{target.api_endpoint.model_id}} \
    --library {{config.params.extra.library}} \
    --timeout {{config.params.request_timeout}} \
    --parallel {{config.params.parallelism}} \
    --retry-attempts {{config.params.max_retries}} \
    --folder {{config.output_dir}}{% if target.api_endpoint.url is not none %} --base-url {{target.api_endpoint.url}}{% endif %}{% if config.params.extra.reasoning %} --reasoning{% endif %}{% if config.params.extra.reasoning_effort is not none %} --reasoning-effort {{config.params.extra.reasoning_effort}}{% endif %}{% if config.params.extra.debug %} --debug{% endif %}{% if config.params.limit_samples is not none %} --limit-samples {{config.params.limit_samples}}{% endif %}{% if config.params.temperature is not none %} --temperature {{config.params.temperature}}{% endif %}{% if config.params.top_p is not none %} --top-p {{config.params.top_p}}{% endif %}{% if config.params.max_new_tokens is not none %} --max-tokens {{config.params.max_new_tokens}}{% endif %} && 
  JUDGE_OUTPUT=$(ls -t {{config.output_dir}}/*.jsonl | head -1) && 
  python -m profbench.score_llm_judge $JUDGE_OUTPUT
{% endif %}
framework_name: profbench
pkg_name: profbench
config:
  params:
    max_new_tokens: 4096
    max_retries: 5
    parallelism: 10
    temperature: 0.0
    request_timeout: 600
    top_p: 1.0e-05
    extra:
      run_generation: true
      run_judge_generated: true
      run_judge_provided: false
      library: openai
      version: lite
      web_search: false
      reasoning: false
      reasoning_effort: null
      debug: false
  supported_endpoint_types:
  - chat
  type: report_generation
target:
  api_endpoint: {}