profbench#
This page contains all evaluation tasks for the profbench harness.
| Task | Description |
|---|---|
| llm_judge | Run LLM judge on provided ProfBench reports and score them |
| report_generation | Generate professional reports and evaluate them (full pipeline) |
llm_judge#
Run LLM judge on provided ProfBench reports and score them
Harness: profbench
Container:
nvcr.io/nvidia/eval-factory/profbench:26.01
Container Digest:
sha256:7b2766affe4c2070ec803a893f7bf1ff2fc735df562aa520ec910c9ef58d3598
Container Arch: multiarch
Task Type: llm_judge
{% if target.api_endpoint.api_key_name is not none %}
export API_KEY=${{target.api_endpoint.api_key_name}} &&
{% endif %} {% if config.params.extra.run_generation %}
python -m profbench.run_report_generation \
--model {{target.api_endpoint.model_id}} \
--library {{config.params.extra.library}} \
--timeout {{config.params.request_timeout}} \
--parallel {{config.params.parallelism}} \
--retry-attempts {{config.params.max_retries}} \
--folder {{config.output_dir}}{% if target.api_endpoint.url is not none %} --base-url {{target.api_endpoint.url}}{% endif %}{% if config.params.extra.version is not none %} --version {{config.params.extra.version}}{% endif %}{% if config.params.extra.web_search %} --web-search{% endif %}{% if config.params.extra.reasoning %} --reasoning{% endif %}{% if config.params.extra.reasoning_effort is not none %} --reasoning-effort {{config.params.extra.reasoning_effort}}{% endif %}{% if config.params.limit_samples is not none %} --limit-samples {{config.params.limit_samples}}{% endif %}{% if config.params.temperature is not none %} --temperature {{config.params.temperature}}{% endif %}{% if config.params.top_p is not none %} --top-p {{config.params.top_p}}{% endif %}{% if config.params.max_new_tokens is not none %} --max-tokens {{config.params.max_new_tokens}}{% endif %} &&
GENERATION_OUTPUT=$(ls -t {{config.output_dir}}/*.jsonl | head -1) &&
{% endif %} {% if config.params.extra.run_judge_generated %}
python -m profbench.run_best_llm_judge_on_generated_reports \
--filename $GENERATION_OUTPUT \
--api-key $API_KEY \
--model {{target.api_endpoint.model_id}} \
--library {{config.params.extra.library}} \
--timeout {{config.params.request_timeout}} \
--parallel {{config.params.parallelism}} \
--retry-attempts {{config.params.max_retries}} \
--output-folder {{config.output_dir}}/judgements{% if target.api_endpoint.url is not none %} --base-url {{target.api_endpoint.url}}{% endif %}{% if config.params.limit_samples is not none %} --limit-samples {{config.params.limit_samples}}{% endif %}{% if config.params.temperature is not none %} --temperature {{config.params.temperature}}{% endif %}{% if config.params.top_p is not none %} --top-p {{config.params.top_p}}{% endif %}{% if config.params.max_new_tokens is not none %} --max-tokens {{config.params.max_new_tokens}}{% endif %} &&
JUDGE_OUTPUT=$(ls -t {{config.output_dir}}/judgements/*.jsonl | head -1) &&
python -m profbench.score_report_generation $JUDGE_OUTPUT
{% endif %} {% if config.params.extra.run_judge_provided %}
python -m profbench.run_llm_judge_on_provided_reports \
--model {{target.api_endpoint.model_id}} \
--library {{config.params.extra.library}} \
--timeout {{config.params.request_timeout}} \
--parallel {{config.params.parallelism}} \
--retry-attempts {{config.params.max_retries}} \
--folder {{config.output_dir}}{% if target.api_endpoint.url is not none %} --base-url {{target.api_endpoint.url}}{% endif %}{% if config.params.extra.reasoning %} --reasoning{% endif %}{% if config.params.extra.reasoning_effort is not none %} --reasoning-effort {{config.params.extra.reasoning_effort}}{% endif %}{% if config.params.extra.debug %} --debug{% endif %}{% if config.params.limit_samples is not none %} --limit-samples {{config.params.limit_samples}}{% endif %}{% if config.params.temperature is not none %} --temperature {{config.params.temperature}}{% endif %}{% if config.params.top_p is not none %} --top-p {{config.params.top_p}}{% endif %}{% if config.params.max_new_tokens is not none %} --max-tokens {{config.params.max_new_tokens}}{% endif %} &&
JUDGE_OUTPUT=$(ls -t {{config.output_dir}}/*.jsonl | head -1) &&
python -m profbench.score_llm_judge $JUDGE_OUTPUT
{% endif %}
framework_name: profbench
pkg_name: profbench
config:
  params:
    max_new_tokens: 4096
    max_retries: 5
    parallelism: 10
    temperature: 0.0
    request_timeout: 600
    top_p: 1.0e-05
    extra:
      run_generation: false
      run_judge_generated: false
      run_judge_provided: true
      library: openai
      version: lite
      web_search: false
      reasoning: false
      reasoning_effort: null
      debug: false
  supported_endpoint_types:
    - chat
  type: llm_judge
target:
  api_endpoint: {}
report_generation#
Generate professional reports and evaluate them (full pipeline)
Harness: profbench
Container:
nvcr.io/nvidia/eval-factory/profbench:26.01
Container Digest:
sha256:7b2766affe4c2070ec803a893f7bf1ff2fc735df562aa520ec910c9ef58d3598
Container Arch: multiarch
Task Type: report_generation
{% if target.api_endpoint.api_key_name is not none %}
export API_KEY=${{target.api_endpoint.api_key_name}} &&
{% endif %} {% if config.params.extra.run_generation %}
python -m profbench.run_report_generation \
--model {{target.api_endpoint.model_id}} \
--library {{config.params.extra.library}} \
--timeout {{config.params.request_timeout}} \
--parallel {{config.params.parallelism}} \
--retry-attempts {{config.params.max_retries}} \
--folder {{config.output_dir}}{% if target.api_endpoint.url is not none %} --base-url {{target.api_endpoint.url}}{% endif %}{% if config.params.extra.version is not none %} --version {{config.params.extra.version}}{% endif %}{% if config.params.extra.web_search %} --web-search{% endif %}{% if config.params.extra.reasoning %} --reasoning{% endif %}{% if config.params.extra.reasoning_effort is not none %} --reasoning-effort {{config.params.extra.reasoning_effort}}{% endif %}{% if config.params.limit_samples is not none %} --limit-samples {{config.params.limit_samples}}{% endif %}{% if config.params.temperature is not none %} --temperature {{config.params.temperature}}{% endif %}{% if config.params.top_p is not none %} --top-p {{config.params.top_p}}{% endif %}{% if config.params.max_new_tokens is not none %} --max-tokens {{config.params.max_new_tokens}}{% endif %} &&
GENERATION_OUTPUT=$(ls -t {{config.output_dir}}/*.jsonl | head -1) &&
{% endif %} {% if config.params.extra.run_judge_generated %}
python -m profbench.run_best_llm_judge_on_generated_reports \
--filename $GENERATION_OUTPUT \
--api-key $API_KEY \
--model {{target.api_endpoint.model_id}} \
--library {{config.params.extra.library}} \
--timeout {{config.params.request_timeout}} \
--parallel {{config.params.parallelism}} \
--retry-attempts {{config.params.max_retries}} \
--output-folder {{config.output_dir}}/judgements{% if target.api_endpoint.url is not none %} --base-url {{target.api_endpoint.url}}{% endif %}{% if config.params.limit_samples is not none %} --limit-samples {{config.params.limit_samples}}{% endif %}{% if config.params.temperature is not none %} --temperature {{config.params.temperature}}{% endif %}{% if config.params.top_p is not none %} --top-p {{config.params.top_p}}{% endif %}{% if config.params.max_new_tokens is not none %} --max-tokens {{config.params.max_new_tokens}}{% endif %} &&
JUDGE_OUTPUT=$(ls -t {{config.output_dir}}/judgements/*.jsonl | head -1) &&
python -m profbench.score_report_generation $JUDGE_OUTPUT
{% endif %} {% if config.params.extra.run_judge_provided %}
python -m profbench.run_llm_judge_on_provided_reports \
--model {{target.api_endpoint.model_id}} \
--library {{config.params.extra.library}} \
--timeout {{config.params.request_timeout}} \
--parallel {{config.params.parallelism}} \
--retry-attempts {{config.params.max_retries}} \
--folder {{config.output_dir}}{% if target.api_endpoint.url is not none %} --base-url {{target.api_endpoint.url}}{% endif %}{% if config.params.extra.reasoning %} --reasoning{% endif %}{% if config.params.extra.reasoning_effort is not none %} --reasoning-effort {{config.params.extra.reasoning_effort}}{% endif %}{% if config.params.extra.debug %} --debug{% endif %}{% if config.params.limit_samples is not none %} --limit-samples {{config.params.limit_samples}}{% endif %}{% if config.params.temperature is not none %} --temperature {{config.params.temperature}}{% endif %}{% if config.params.top_p is not none %} --top-p {{config.params.top_p}}{% endif %}{% if config.params.max_new_tokens is not none %} --max-tokens {{config.params.max_new_tokens}}{% endif %} &&
JUDGE_OUTPUT=$(ls -t {{config.output_dir}}/*.jsonl | head -1) &&
python -m profbench.score_llm_judge $JUDGE_OUTPUT
{% endif %}
framework_name: profbench
pkg_name: profbench
config:
  params:
    max_new_tokens: 4096
    max_retries: 5
    parallelism: 10
    temperature: 0.0
    request_timeout: 600
    top_p: 1.0e-05
    extra:
      run_generation: true
      run_judge_generated: true
      run_judge_provided: false
      library: openai
      version: lite
      web_search: false
      reasoning: false
      reasoning_effort: null
      debug: false
  supported_endpoint_types:
    - chat
  type: report_generation
target:
  api_endpoint: {}