# vlmevalkit

This page contains all evaluation tasks for the vlmevalkit harness.

| Task | Description |
| --- | --- |
| ai2d_judge | A benchmark for evaluating diagram understanding capabilities of large vision-language models. |
| chartqa | A Benchmark for Question Answering about Charts with Visual and Logical Reasoning |
| mathvista-mini | Evaluating Math Reasoning in Visual Contexts |
| mmmu_judge | A benchmark for evaluating multimodal models on massive multi-discipline tasks demanding college-level subject knowledge and deliberate reasoning. |
| ocr_reasoning | Comprehensive benchmark of 1,069 human-annotated examples designed to evaluate multimodal large language models on text-rich image reasoning tasks by assessing both final answers and the reasoning process across six core abilities and 18 practical tasks. |
| ocrbench | Comprehensive evaluation benchmark designed to assess the OCR capabilities of Large Multimodal Models |
| slidevqa | Evaluates the ability to answer questions about slide decks by selecting relevant slides from multiple images |

## ai2d_judge

A benchmark for evaluating diagram understanding capabilities of large vision-language models.

Harness: vlmevalkit

Container: nvcr.io/nvidia/eval-factory/vlmevalkit:26.01

Container Digest: sha256:24c650c547cfd666bcc5ec822c996eb90e89e4964a1d4ec29e4d01d8bd3a22dc

Container Arch: amd

Task Type: ai2d_judge

cat > {{config.output_dir}}/vlmeval_config.json << 'EOF'
{
  "model": {
    "{{target.api_endpoint.model_id.split('/')[-1]}}": {
      "class": "CustomOAIEndpoint",
      "model": "{{target.api_endpoint.model_id}}",
      "api_base": "{{target.api_endpoint.url}}",
      "api_key_var_name": "{{target.api_endpoint.api_key_name}}",
      "max_tokens": {{config.params.max_new_tokens}},
      "temperature": {{config.params.temperature}},{% if config.params.top_p is not none %}
      "top_p": {{config.params.top_p}},{% endif %}
      "retry": {{config.params.max_retries}},
      "timeout": {{config.params.request_timeout}}{% if config.params.extra.wait is defined %},
      "wait": {{config.params.extra.wait}}{% endif %}{% if config.params.extra.img_size is defined %},
      "img_size": {{config.params.extra.img_size}}{% endif %}{% if config.params.extra.img_detail is defined %},
      "img_detail": "{{config.params.extra.img_detail}}"{% endif %}{% if config.params.extra.system_prompt is defined %},
      "system_prompt": "{{config.params.extra.system_prompt}}"{% endif %}{% if config.params.extra.verbose is defined %},
      "verbose": {{config.params.extra.verbose}}{% endif %}
    }
  },
  "data": {
    "{{config.params.extra.dataset.name}}": {
      "class": "{{config.params.extra.dataset.class}}",
      "dataset": "{{config.params.extra.dataset.name}}",
      "model": "{{target.api_endpoint.model_id}}"
    }
  }
}
EOF
python -m vlmeval.run \
  --config {{config.output_dir}}/vlmeval_config.json \
  --work-dir {{config.output_dir}} \
  --api-nproc {{config.params.parallelism}} \
  {%- if config.params.extra.judge is defined %}
  --judge {{config.params.extra.judge.model}} \
  --judge-args '{{config.params.extra.judge.args}}' \
  {%- endif %}
  {% if config.params.limit_samples is not none %}--first-n {{config.params.limit_samples}}{% endif %}
framework_name: vlmevalkit
pkg_name: vlmevalkit
config:
  params:
    max_new_tokens: 2048
    max_retries: 5
    parallelism: 4
    temperature: 0.0
    request_timeout: 60
    extra:
      dataset:
        name: AI2D_TEST
        class: ImageMCQDataset
      judge:
        model: gpt-4o
        args: '{"use_azure": true}'
  supported_endpoint_types:
  - vlm
  type: ai2d_judge
target:
  api_endpoint: {}

## chartqa

A Benchmark for Question Answering about Charts with Visual and Logical Reasoning

Harness: vlmevalkit

Container: nvcr.io/nvidia/eval-factory/vlmevalkit:26.01

Container Digest: sha256:24c650c547cfd666bcc5ec822c996eb90e89e4964a1d4ec29e4d01d8bd3a22dc

Container Arch: amd

Task Type: chartqa

cat > {{config.output_dir}}/vlmeval_config.json << 'EOF'
{
  "model": {
    "{{target.api_endpoint.model_id.split('/')[-1]}}": {
      "class": "CustomOAIEndpoint",
      "model": "{{target.api_endpoint.model_id}}",
      "api_base": "{{target.api_endpoint.url}}",
      "api_key_var_name": "{{target.api_endpoint.api_key_name}}",
      "max_tokens": {{config.params.max_new_tokens}},
      "temperature": {{config.params.temperature}},{% if config.params.top_p is not none %}
      "top_p": {{config.params.top_p}},{% endif %}
      "retry": {{config.params.max_retries}},
      "timeout": {{config.params.request_timeout}}{% if config.params.extra.wait is defined %},
      "wait": {{config.params.extra.wait}}{% endif %}{% if config.params.extra.img_size is defined %},
      "img_size": {{config.params.extra.img_size}}{% endif %}{% if config.params.extra.img_detail is defined %},
      "img_detail": "{{config.params.extra.img_detail}}"{% endif %}{% if config.params.extra.system_prompt is defined %},
      "system_prompt": "{{config.params.extra.system_prompt}}"{% endif %}{% if config.params.extra.verbose is defined %},
      "verbose": {{config.params.extra.verbose}}{% endif %}
    }
  },
  "data": {
    "{{config.params.extra.dataset.name}}": {
      "class": "{{config.params.extra.dataset.class}}",
      "dataset": "{{config.params.extra.dataset.name}}",
      "model": "{{target.api_endpoint.model_id}}"
    }
  }
}
EOF
python -m vlmeval.run \
  --config {{config.output_dir}}/vlmeval_config.json \
  --work-dir {{config.output_dir}} \
  --api-nproc {{config.params.parallelism}} \
  {%- if config.params.extra.judge is defined %}
  --judge {{config.params.extra.judge.model}} \
  --judge-args '{{config.params.extra.judge.args}}' \
  {%- endif %}
  {% if config.params.limit_samples is not none %}--first-n {{config.params.limit_samples}}{% endif %}
framework_name: vlmevalkit
pkg_name: vlmevalkit
config:
  params:
    max_new_tokens: 2048
    max_retries: 5
    parallelism: 4
    temperature: 0.0
    request_timeout: 60
    extra:
      dataset:
        name: ChartQA_TEST
        class: ImageVQADataset
  supported_endpoint_types:
  - vlm
  type: chartqa
target:
  api_endpoint: {}

## mathvista-mini

Evaluating Math Reasoning in Visual Contexts

Harness: vlmevalkit

Container: nvcr.io/nvidia/eval-factory/vlmevalkit:26.01

Container Digest: sha256:24c650c547cfd666bcc5ec822c996eb90e89e4964a1d4ec29e4d01d8bd3a22dc

Container Arch: amd

Task Type: mathvista-mini

cat > {{config.output_dir}}/vlmeval_config.json << 'EOF'
{
  "model": {
    "{{target.api_endpoint.model_id.split('/')[-1]}}": {
      "class": "CustomOAIEndpoint",
      "model": "{{target.api_endpoint.model_id}}",
      "api_base": "{{target.api_endpoint.url}}",
      "api_key_var_name": "{{target.api_endpoint.api_key_name}}",
      "max_tokens": {{config.params.max_new_tokens}},
      "temperature": {{config.params.temperature}},{% if config.params.top_p is not none %}
      "top_p": {{config.params.top_p}},{% endif %}
      "retry": {{config.params.max_retries}},
      "timeout": {{config.params.request_timeout}}{% if config.params.extra.wait is defined %},
      "wait": {{config.params.extra.wait}}{% endif %}{% if config.params.extra.img_size is defined %},
      "img_size": {{config.params.extra.img_size}}{% endif %}{% if config.params.extra.img_detail is defined %},
      "img_detail": "{{config.params.extra.img_detail}}"{% endif %}{% if config.params.extra.system_prompt is defined %},
      "system_prompt": "{{config.params.extra.system_prompt}}"{% endif %}{% if config.params.extra.verbose is defined %},
      "verbose": {{config.params.extra.verbose}}{% endif %}
    }
  },
  "data": {
    "{{config.params.extra.dataset.name}}": {
      "class": "{{config.params.extra.dataset.class}}",
      "dataset": "{{config.params.extra.dataset.name}}",
      "model": "{{target.api_endpoint.model_id}}"
    }
  }
}
EOF
python -m vlmeval.run \
  --config {{config.output_dir}}/vlmeval_config.json \
  --work-dir {{config.output_dir}} \
  --api-nproc {{config.params.parallelism}} \
  {%- if config.params.extra.judge is defined %}
  --judge {{config.params.extra.judge.model}} \
  --judge-args '{{config.params.extra.judge.args}}' \
  {%- endif %}
  {% if config.params.limit_samples is not none %}--first-n {{config.params.limit_samples}}{% endif %}
framework_name: vlmevalkit
pkg_name: vlmevalkit
config:
  params:
    max_new_tokens: 2048
    max_retries: 5
    parallelism: 4
    temperature: 0.0
    request_timeout: 60
    extra:
      dataset:
        name: MathVista_MINI
        class: MathVista
      judge:
        model: gpt-4o
        args: '{"use_azure": true}'
  supported_endpoint_types:
  - vlm
  type: mathvista-mini
target:
  api_endpoint: {}

## mmmu_judge

A benchmark for evaluating multimodal models on massive multi-discipline tasks demanding college-level subject knowledge and deliberate reasoning.

Harness: vlmevalkit

Container: nvcr.io/nvidia/eval-factory/vlmevalkit:26.01

Container Digest: sha256:24c650c547cfd666bcc5ec822c996eb90e89e4964a1d4ec29e4d01d8bd3a22dc

Container Arch: amd

Task Type: mmmu_judge

cat > {{config.output_dir}}/vlmeval_config.json << 'EOF'
{
  "model": {
    "{{target.api_endpoint.model_id.split('/')[-1]}}": {
      "class": "CustomOAIEndpoint",
      "model": "{{target.api_endpoint.model_id}}",
      "api_base": "{{target.api_endpoint.url}}",
      "api_key_var_name": "{{target.api_endpoint.api_key_name}}",
      "max_tokens": {{config.params.max_new_tokens}},
      "temperature": {{config.params.temperature}},{% if config.params.top_p is not none %}
      "top_p": {{config.params.top_p}},{% endif %}
      "retry": {{config.params.max_retries}},
      "timeout": {{config.params.request_timeout}}{% if config.params.extra.wait is defined %},
      "wait": {{config.params.extra.wait}}{% endif %}{% if config.params.extra.img_size is defined %},
      "img_size": {{config.params.extra.img_size}}{% endif %}{% if config.params.extra.img_detail is defined %},
      "img_detail": "{{config.params.extra.img_detail}}"{% endif %}{% if config.params.extra.system_prompt is defined %},
      "system_prompt": "{{config.params.extra.system_prompt}}"{% endif %}{% if config.params.extra.verbose is defined %},
      "verbose": {{config.params.extra.verbose}}{% endif %}
    }
  },
  "data": {
    "{{config.params.extra.dataset.name}}": {
      "class": "{{config.params.extra.dataset.class}}",
      "dataset": "{{config.params.extra.dataset.name}}",
      "model": "{{target.api_endpoint.model_id}}"
    }
  }
}
EOF
python -m vlmeval.run \
  --config {{config.output_dir}}/vlmeval_config.json \
  --work-dir {{config.output_dir}} \
  --api-nproc {{config.params.parallelism}} \
  {%- if config.params.extra.judge is defined %}
  --judge {{config.params.extra.judge.model}} \
  --judge-args '{{config.params.extra.judge.args}}' \
  {%- endif %}
  {% if config.params.limit_samples is not none %}--first-n {{config.params.limit_samples}}{% endif %}
framework_name: vlmevalkit
pkg_name: vlmevalkit
config:
  params:
    max_new_tokens: 2048
    max_retries: 5
    parallelism: 4
    temperature: 0.0
    request_timeout: 60
    extra:
      dataset:
        name: MMMU_DEV_VAL
        class: MMMUDataset
      judge:
        model: gpt-4o
        args: '{"use_azure": true}'
  supported_endpoint_types:
  - vlm
  type: mmmu_judge
target:
  api_endpoint: {}

## ocr_reasoning

Comprehensive benchmark of 1,069 human-annotated examples designed to evaluate multimodal large language models on text-rich image reasoning tasks by assessing both final answers and the reasoning process across six core abilities and 18 practical tasks.

Harness: vlmevalkit

Container: nvcr.io/nvidia/eval-factory/vlmevalkit:26.01

Container Digest: sha256:24c650c547cfd666bcc5ec822c996eb90e89e4964a1d4ec29e4d01d8bd3a22dc

Container Arch: amd

Task Type: ocr_reasoning

cat > {{config.output_dir}}/vlmeval_config.json << 'EOF'
{
  "model": {
    "{{target.api_endpoint.model_id.split('/')[-1]}}": {
      "class": "CustomOAIEndpoint",
      "model": "{{target.api_endpoint.model_id}}",
      "api_base": "{{target.api_endpoint.url}}",
      "api_key_var_name": "{{target.api_endpoint.api_key_name}}",
      "max_tokens": {{config.params.max_new_tokens}},
      "temperature": {{config.params.temperature}},{% if config.params.top_p is not none %}
      "top_p": {{config.params.top_p}},{% endif %}
      "retry": {{config.params.max_retries}},
      "timeout": {{config.params.request_timeout}}{% if config.params.extra.wait is defined %},
      "wait": {{config.params.extra.wait}}{% endif %}{% if config.params.extra.img_size is defined %},
      "img_size": {{config.params.extra.img_size}}{% endif %}{% if config.params.extra.img_detail is defined %},
      "img_detail": "{{config.params.extra.img_detail}}"{% endif %}{% if config.params.extra.system_prompt is defined %},
      "system_prompt": "{{config.params.extra.system_prompt}}"{% endif %}{% if config.params.extra.verbose is defined %},
      "verbose": {{config.params.extra.verbose}}{% endif %}
    }
  },
  "data": {
    "{{config.params.extra.dataset.name}}": {
      "class": "{{config.params.extra.dataset.class}}",
      "dataset": "{{config.params.extra.dataset.name}}",
      "model": "{{target.api_endpoint.model_id}}"
    }
  }
}
EOF
python -m vlmeval.run \
  --config {{config.output_dir}}/vlmeval_config.json \
  --work-dir {{config.output_dir}} \
  --api-nproc {{config.params.parallelism}} \
  {%- if config.params.extra.judge is defined %}
  --judge {{config.params.extra.judge.model}} \
  --judge-args '{{config.params.extra.judge.args}}' \
  {%- endif %}
  {% if config.params.limit_samples is not none %}--first-n {{config.params.limit_samples}}{% endif %}
framework_name: vlmevalkit
pkg_name: vlmevalkit
config:
  params:
    max_new_tokens: 2048
    max_retries: 5
    parallelism: 4
    temperature: 0.0
    request_timeout: 60
    extra:
      dataset:
        name: OCR_Reasoning
        class: OCR_Reasoning
      judge:
        model: gpt-4o
        args: '{"use_azure": true}'
  supported_endpoint_types:
  - vlm
  type: ocr_reasoning
target:
  api_endpoint: {}

## ocrbench

Comprehensive evaluation benchmark designed to assess the OCR capabilities of Large Multimodal Models

Harness: vlmevalkit

Container: nvcr.io/nvidia/eval-factory/vlmevalkit:26.01

Container Digest: sha256:24c650c547cfd666bcc5ec822c996eb90e89e4964a1d4ec29e4d01d8bd3a22dc

Container Arch: amd

Task Type: ocrbench

cat > {{config.output_dir}}/vlmeval_config.json << 'EOF'
{
  "model": {
    "{{target.api_endpoint.model_id.split('/')[-1]}}": {
      "class": "CustomOAIEndpoint",
      "model": "{{target.api_endpoint.model_id}}",
      "api_base": "{{target.api_endpoint.url}}",
      "api_key_var_name": "{{target.api_endpoint.api_key_name}}",
      "max_tokens": {{config.params.max_new_tokens}},
      "temperature": {{config.params.temperature}},{% if config.params.top_p is not none %}
      "top_p": {{config.params.top_p}},{% endif %}
      "retry": {{config.params.max_retries}},
      "timeout": {{config.params.request_timeout}}{% if config.params.extra.wait is defined %},
      "wait": {{config.params.extra.wait}}{% endif %}{% if config.params.extra.img_size is defined %},
      "img_size": {{config.params.extra.img_size}}{% endif %}{% if config.params.extra.img_detail is defined %},
      "img_detail": "{{config.params.extra.img_detail}}"{% endif %}{% if config.params.extra.system_prompt is defined %},
      "system_prompt": "{{config.params.extra.system_prompt}}"{% endif %}{% if config.params.extra.verbose is defined %},
      "verbose": {{config.params.extra.verbose}}{% endif %}
    }
  },
  "data": {
    "{{config.params.extra.dataset.name}}": {
      "class": "{{config.params.extra.dataset.class}}",
      "dataset": "{{config.params.extra.dataset.name}}",
      "model": "{{target.api_endpoint.model_id}}"
    }
  }
}
EOF
python -m vlmeval.run \
  --config {{config.output_dir}}/vlmeval_config.json \
  --work-dir {{config.output_dir}} \
  --api-nproc {{config.params.parallelism}} \
  {%- if config.params.extra.judge is defined %}
  --judge {{config.params.extra.judge.model}} \
  --judge-args '{{config.params.extra.judge.args}}' \
  {%- endif %}
  {% if config.params.limit_samples is not none %}--first-n {{config.params.limit_samples}}{% endif %}
framework_name: vlmevalkit
pkg_name: vlmevalkit
config:
  params:
    max_new_tokens: 2048
    max_retries: 5
    parallelism: 4
    temperature: 0.0
    request_timeout: 60
    extra:
      dataset:
        name: OCRBench
        class: OCRBench
  supported_endpoint_types:
  - vlm
  type: ocrbench
target:
  api_endpoint: {}

## slidevqa

Evaluates the ability to answer questions about slide decks by selecting relevant slides from multiple images

Harness: vlmevalkit

Container: nvcr.io/nvidia/eval-factory/vlmevalkit:26.01

Container Digest: sha256:24c650c547cfd666bcc5ec822c996eb90e89e4964a1d4ec29e4d01d8bd3a22dc

Container Arch: amd

Task Type: slidevqa

cat > {{config.output_dir}}/vlmeval_config.json << 'EOF'
{
  "model": {
    "{{target.api_endpoint.model_id.split('/')[-1]}}": {
      "class": "CustomOAIEndpoint",
      "model": "{{target.api_endpoint.model_id}}",
      "api_base": "{{target.api_endpoint.url}}",
      "api_key_var_name": "{{target.api_endpoint.api_key_name}}",
      "max_tokens": {{config.params.max_new_tokens}},
      "temperature": {{config.params.temperature}},{% if config.params.top_p is not none %}
      "top_p": {{config.params.top_p}},{% endif %}
      "retry": {{config.params.max_retries}},
      "timeout": {{config.params.request_timeout}}{% if config.params.extra.wait is defined %},
      "wait": {{config.params.extra.wait}}{% endif %}{% if config.params.extra.img_size is defined %},
      "img_size": {{config.params.extra.img_size}}{% endif %}{% if config.params.extra.img_detail is defined %},
      "img_detail": "{{config.params.extra.img_detail}}"{% endif %}{% if config.params.extra.system_prompt is defined %},
      "system_prompt": "{{config.params.extra.system_prompt}}"{% endif %}{% if config.params.extra.verbose is defined %},
      "verbose": {{config.params.extra.verbose}}{% endif %}
    }
  },
  "data": {
    "{{config.params.extra.dataset.name}}": {
      "class": "{{config.params.extra.dataset.class}}",
      "dataset": "{{config.params.extra.dataset.name}}",
      "model": "{{target.api_endpoint.model_id}}"
    }
  }
}
EOF
python -m vlmeval.run \
  --config {{config.output_dir}}/vlmeval_config.json \
  --work-dir {{config.output_dir}} \
  --api-nproc {{config.params.parallelism}} \
  {%- if config.params.extra.judge is defined %}
  --judge {{config.params.extra.judge.model}} \
  --judge-args '{{config.params.extra.judge.args}}' \
  {%- endif %}
  {% if config.params.limit_samples is not none %}--first-n {{config.params.limit_samples}}{% endif %}
framework_name: vlmevalkit
pkg_name: vlmevalkit
config:
  params:
    max_new_tokens: 2048
    max_retries: 5
    parallelism: 4
    temperature: 0.0
    request_timeout: 60
    extra:
      dataset:
        name: SLIDEVQA
        class: SlideVQA
      judge:
        model: gpt-4o
        args: '{"use_azure": true}'
  supported_endpoint_types:
  - vlm
  type: slidevqa
target:
  api_endpoint: {}