# vlmevalkit

This page contains all evaluation tasks for the vlmevalkit harness.

| Task | Description |
| --- | --- |
| ai2d_judge | A benchmark for evaluating diagram understanding capabilities of large vision-language models. |
| chartqa | A Benchmark for Question Answering about Charts with Visual and Logical Reasoning |
| mathvista-mini | Evaluating Math Reasoning in Visual Contexts |
| mmmu_judge | A benchmark for evaluating multimodal models on massive multi-discipline tasks demanding college-level subject knowledge and deliberate reasoning. |
| ocr_reasoning | Comprehensive benchmark of 1,069 human-annotated examples designed to evaluate multimodal large language models on text-rich image reasoning tasks by assessing both final answers and the reasoning process across six core abilities and 18 practical tasks. |
| ocrbench | Comprehensive evaluation benchmark designed to assess the OCR capabilities of Large Multimodal Models |
| slidevqa | Evaluates the ability to answer questions about slide decks by selecting relevant slides from multiple images |

## ai2d_judge

A benchmark for evaluating diagram understanding capabilities of large vision-language models.

Harness: vlmevalkit

Container: nvcr.io/nvidia/eval-factory/vlmevalkit:26.01

Container Digest: sha256:24c650c547cfd666bcc5ec822c996eb90e89e4964a1d4ec29e4d01d8bd3a22dc

Container Arch: amd

Task Type: ai2d_judge

cat > {{config.output_dir}}/vlmeval_config.json << 'EOF'
{
  "model": {
    "{{target.api_endpoint.model_id.split('/')[-1]}}": {
      "class": "CustomOAIEndpoint",
      "model": "{{target.api_endpoint.model_id}}",
      "api_base": "{{target.api_endpoint.url}}",
      "api_key_var_name": "{{target.api_endpoint.api_key_name}}",
      "max_tokens": {{config.params.max_new_tokens}},
      "temperature": {{config.params.temperature}},{% if config.params.top_p is not none %}
      "top_p": {{config.params.top_p}},{% endif %}
      "retry": {{config.params.max_retries}},
      "timeout": {{config.params.request_timeout}}{% if config.params.extra.wait is defined %},
      "wait": {{config.params.extra.wait}}{% endif %}{% if config.params.extra.img_size is defined %},
      "img_size": {{config.params.extra.img_size}}{% endif %}{% if config.params.extra.img_detail is defined %},
      "img_detail": "{{config.params.extra.img_detail}}"{% endif %}{% if config.params.extra.system_prompt is defined %},
      "system_prompt": "{{config.params.extra.system_prompt}}"{% endif %}{% if config.params.extra.verbose is defined %},
      "verbose": {{config.params.extra.verbose}}{% endif %}
    }
  },
  "data": {
    "{{config.params.extra.dataset.name}}": {
      "class": "{{config.params.extra.dataset.class}}",
      "dataset": "{{config.params.extra.dataset.name}}",
      "model": "{{target.api_endpoint.model_id}}"
    }
  }
}
EOF
python -m vlmeval.run \
  --config {{config.output_dir}}/vlmeval_config.json \
  --work-dir {{config.output_dir}} \
  --api-nproc {{config.params.parallelism}} \
  {%- if config.params.extra.judge is defined %}
  --judge {{config.params.extra.judge.model}} \
  --judge-args '{{config.params.extra.judge.args}}' \
  {%- endif %}
  {% if config.params.limit_samples is not none %}--first-n {{config.params.limit_samples}}{% endif %}
framework_name: vlmevalkit
pkg_name: vlmevalkit
config:
  params:
    max_new_tokens: 2048
    max_retries: 5
    parallelism: 4
    temperature: 0.0
    request_timeout: 60
    extra:
      dataset:
        name: AI2D_TEST
        class: ImageMCQDataset
      judge:
        model: gpt-4o
        args: '{"use_azure": true}'
  supported_endpoint_types:
  - vlm
  type: ai2d_judge
target:
  api_endpoint: {}

## chartqa

A Benchmark for Question Answering about Charts with Visual and Logical Reasoning

Harness: vlmevalkit

Container: nvcr.io/nvidia/eval-factory/vlmevalkit:26.01

Container Digest: sha256:24c650c547cfd666bcc5ec822c996eb90e89e4964a1d4ec29e4d01d8bd3a22dc

Container Arch: amd

Task Type: chartqa

cat > {{config.output_dir}}/vlmeval_config.json << 'EOF'
{
  "model": {
    "{{target.api_endpoint.model_id.split('/')[-1]}}": {
      "class": "CustomOAIEndpoint",
      "model": "{{target.api_endpoint.model_id}}",
      "api_base": "{{target.api_endpoint.url}}",
      "api_key_var_name": "{{target.api_endpoint.api_key_name}}",
      "max_tokens": {{config.params.max_new_tokens}},
      "temperature": {{config.params.temperature}},{% if config.params.top_p is not none %}
      "top_p": {{config.params.top_p}},{% endif %}
      "retry": {{config.params.max_retries}},
      "timeout": {{config.params.request_timeout}}{% if config.params.extra.wait is defined %},
      "wait": {{config.params.extra.wait}}{% endif %}{% if config.params.extra.img_size is defined %},
      "img_size": {{config.params.extra.img_size}}{% endif %}{% if config.params.extra.img_detail is defined %},
      "img_detail": "{{config.params.extra.img_detail}}"{% endif %}{% if config.params.extra.system_prompt is defined %},
      "system_prompt": "{{config.params.extra.system_prompt}}"{% endif %}{% if config.params.extra.verbose is defined %},
      "verbose": {{config.params.extra.verbose}}{% endif %}
    }
  },
  "data": {
    "{{config.params.extra.dataset.name}}": {
      "class": "{{config.params.extra.dataset.class}}",
      "dataset": "{{config.params.extra.dataset.name}}",
      "model": "{{target.api_endpoint.model_id}}"
    }
  }
}
EOF
python -m vlmeval.run \
  --config {{config.output_dir}}/vlmeval_config.json \
  --work-dir {{config.output_dir}} \
  --api-nproc {{config.params.parallelism}} \
  {%- if config.params.extra.judge is defined %}
  --judge {{config.params.extra.judge.model}} \
  --judge-args '{{config.params.extra.judge.args}}' \
  {%- endif %}
  {% if config.params.limit_samples is not none %}--first-n {{config.params.limit_samples}}{% endif %}
framework_name: vlmevalkit
pkg_name: vlmevalkit
config:
  params:
    max_new_tokens: 2048
    max_retries: 5
    parallelism: 4
    temperature: 0.0
    request_timeout: 60
    extra:
      dataset:
        name: ChartQA_TEST
        class: ImageVQADataset
  supported_endpoint_types:
  - vlm
  type: chartqa
target:
  api_endpoint: {}

## mathvista-mini

Evaluating Math Reasoning in Visual Contexts

Harness: vlmevalkit

Container: nvcr.io/nvidia/eval-factory/vlmevalkit:26.01

Container Digest: sha256:24c650c547cfd666bcc5ec822c996eb90e89e4964a1d4ec29e4d01d8bd3a22dc

Container Arch: amd

Task Type: mathvista-mini

cat > {{config.output_dir}}/vlmeval_config.json << 'EOF'
{
  "model": {
    "{{target.api_endpoint.model_id.split('/')[-1]}}": {
      "class": "CustomOAIEndpoint",
      "model": "{{target.api_endpoint.model_id}}",
      "api_base": "{{target.api_endpoint.url}}",
      "api_key_var_name": "{{target.api_endpoint.api_key_name}}",
      "max_tokens": {{config.params.max_new_tokens}},
      "temperature": {{config.params.temperature}},{% if config.params.top_p is not none %}
      "top_p": {{config.params.top_p}},{% endif %}
      "retry": {{config.params.max_retries}},
      "timeout": {{config.params.request_timeout}}{% if config.params.extra.wait is defined %},
      "wait": {{config.params.extra.wait}}{% endif %}{% if config.params.extra.img_size is defined %},
      "img_size": {{config.params.extra.img_size}}{% endif %}{% if config.params.extra.img_detail is defined %},
      "img_detail": "{{config.params.extra.img_detail}}"{% endif %}{% if config.params.extra.system_prompt is defined %},
      "system_prompt": "{{config.params.extra.system_prompt}}"{% endif %}{% if config.params.extra.verbose is defined %},
      "verbose": {{config.params.extra.verbose}}{% endif %}
    }
  },
  "data": {
    "{{config.params.extra.dataset.name}}": {
      "class": "{{config.params.extra.dataset.class}}",
      "dataset": "{{config.params.extra.dataset.name}}",
      "model": "{{target.api_endpoint.model_id}}"
    }
  }
}
EOF
python -m vlmeval.run \
  --config {{config.output_dir}}/vlmeval_config.json \
  --work-dir {{config.output_dir}} \
  --api-nproc {{config.params.parallelism}} \
  {%- if config.params.extra.judge is defined %}
  --judge {{config.params.extra.judge.model}} \
  --judge-args '{{config.params.extra.judge.args}}' \
  {%- endif %}
  {% if config.params.limit_samples is not none %}--first-n {{config.params.limit_samples}}{% endif %}
framework_name: vlmevalkit
pkg_name: vlmevalkit
config:
  params:
    max_new_tokens: 2048
    max_retries: 5
    parallelism: 4
    temperature: 0.0
    request_timeout: 60
    extra:
      dataset:
        name: MathVista_MINI
        class: MathVista
      judge:
        model: gpt-4o
        args: '{"use_azure": true}'
  supported_endpoint_types:
  - vlm
  type: mathvista-mini
target:
  api_endpoint: {}

## mmmu_judge

A benchmark for evaluating multimodal models on massive multi-discipline tasks demanding college-level subject knowledge and deliberate reasoning.

Harness: vlmevalkit

Container: nvcr.io/nvidia/eval-factory/vlmevalkit:26.01

Container Digest: sha256:24c650c547cfd666bcc5ec822c996eb90e89e4964a1d4ec29e4d01d8bd3a22dc

Container Arch: amd

Task Type: mmmu_judge

cat > {{config.output_dir}}/vlmeval_config.json << 'EOF'
{
  "model": {
    "{{target.api_endpoint.model_id.split('/')[-1]}}": {
      "class": "CustomOAIEndpoint",
      "model": "{{target.api_endpoint.model_id}}",
      "api_base": "{{target.api_endpoint.url}}",
      "api_key_var_name": "{{target.api_endpoint.api_key_name}}",
      "max_tokens": {{config.params.max_new_tokens}},
      "temperature": {{config.params.temperature}},{% if config.params.top_p is not none %}
      "top_p": {{config.params.top_p}},{% endif %}
      "retry": {{config.params.max_retries}},
      "timeout": {{config.params.request_timeout}}{% if config.params.extra.wait is defined %},
      "wait": {{config.params.extra.wait}}{% endif %}{% if config.params.extra.img_size is defined %},
      "img_size": {{config.params.extra.img_size}}{% endif %}{% if config.params.extra.img_detail is defined %},
      "img_detail": "{{config.params.extra.img_detail}}"{% endif %}{% if config.params.extra.system_prompt is defined %},
      "system_prompt": "{{config.params.extra.system_prompt}}"{% endif %}{% if config.params.extra.verbose is defined %},
      "verbose": {{config.params.extra.verbose}}{% endif %}
    }
  },
  "data": {
    "{{config.params.extra.dataset.name}}": {
      "class": "{{config.params.extra.dataset.class}}",
      "dataset": "{{config.params.extra.dataset.name}}",
      "model": "{{target.api_endpoint.model_id}}"
    }
  }
}
EOF
python -m vlmeval.run \
  --config {{config.output_dir}}/vlmeval_config.json \
  --work-dir {{config.output_dir}} \
  --api-nproc {{config.params.parallelism}} \
  {%- if config.params.extra.judge is defined %}
  --judge {{config.params.extra.judge.model}} \
  --judge-args '{{config.params.extra.judge.args}}' \
  {%- endif %}
  {% if config.params.limit_samples is not none %}--first-n {{config.params.limit_samples}}{% endif %}
framework_name: vlmevalkit
pkg_name: vlmevalkit
config:
  params:
    max_new_tokens: 2048
    max_retries: 5
    parallelism: 4
    temperature: 0.0
    request_timeout: 60
    extra:
      dataset:
        name: MMMU_DEV_VAL
        class: MMMUDataset
      judge:
        model: gpt-4o
        args: '{"use_azure": true}'
  supported_endpoint_types:
  - vlm
  type: mmmu_judge
target:
  api_endpoint: {}

## ocr_reasoning

Comprehensive benchmark of 1,069 human-annotated examples designed to evaluate multimodal large language models on text-rich image reasoning tasks by assessing both final answers and the reasoning process across six core abilities and 18 practical tasks.

Harness: vlmevalkit

Container: nvcr.io/nvidia/eval-factory/vlmevalkit:26.01

Container Digest: sha256:24c650c547cfd666bcc5ec822c996eb90e89e4964a1d4ec29e4d01d8bd3a22dc

Container Arch: amd

Task Type: ocr_reasoning

cat > {{config.output_dir}}/vlmeval_config.json << 'EOF'
{
  "model": {
    "{{target.api_endpoint.model_id.split('/')[-1]}}": {
      "class": "CustomOAIEndpoint",
      "model": "{{target.api_endpoint.model_id}}",
      "api_base": "{{target.api_endpoint.url}}",
      "api_key_var_name": "{{target.api_endpoint.api_key_name}}",
      "max_tokens": {{config.params.max_new_tokens}},
      "temperature": {{config.params.temperature}},{% if config.params.top_p is not none %}
      "top_p": {{config.params.top_p}},{% endif %}
      "retry": {{config.params.max_retries}},
      "timeout": {{config.params.request_timeout}}{% if config.params.extra.wait is defined %},
      "wait": {{config.params.extra.wait}}{% endif %}{% if config.params.extra.img_size is defined %},
      "img_size": {{config.params.extra.img_size}}{% endif %}{% if config.params.extra.img_detail is defined %},
      "img_detail": "{{config.params.extra.img_detail}}"{% endif %}{% if config.params.extra.system_prompt is defined %},
      "system_prompt": "{{config.params.extra.system_prompt}}"{% endif %}{% if config.params.extra.verbose is defined %},
      "verbose": {{config.params.extra.verbose}}{% endif %}
    }
  },
  "data": {
    "{{config.params.extra.dataset.name}}": {
      "class": "{{config.params.extra.dataset.class}}",
      "dataset": "{{config.params.extra.dataset.name}}",
      "model": "{{target.api_endpoint.model_id}}"
    }
  }
}
EOF
python -m vlmeval.run \
  --config {{config.output_dir}}/vlmeval_config.json \
  --work-dir {{config.output_dir}} \
  --api-nproc {{config.params.parallelism}} \
  {%- if config.params.extra.judge is defined %}
  --judge {{config.params.extra.judge.model}} \
  --judge-args '{{config.params.extra.judge.args}}' \
  {%- endif %}
  {% if config.params.limit_samples is not none %}--first-n {{config.params.limit_samples}}{% endif %}
framework_name: vlmevalkit
pkg_name: vlmevalkit
config:
  params:
    max_new_tokens: 2048
    max_retries: 5
    parallelism: 4
    temperature: 0.0
    request_timeout: 60
    extra:
      dataset:
        name: OCR_Reasoning
        class: OCR_Reasoning
      judge:
        model: gpt-4o
        args: '{"use_azure": true}'
  supported_endpoint_types:
  - vlm
  type: ocr_reasoning
target:
  api_endpoint: {}

## ocrbench

Comprehensive evaluation benchmark designed to assess the OCR capabilities of Large Multimodal Models

Harness: vlmevalkit

Container: nvcr.io/nvidia/eval-factory/vlmevalkit:26.01

Container Digest: sha256:24c650c547cfd666bcc5ec822c996eb90e89e4964a1d4ec29e4d01d8bd3a22dc

Container Arch: amd

Task Type: ocrbench

cat > {{config.output_dir}}/vlmeval_config.json << 'EOF'
{
  "model": {
    "{{target.api_endpoint.model_id.split('/')[-1]}}": {
      "class": "CustomOAIEndpoint",
      "model": "{{target.api_endpoint.model_id}}",
      "api_base": "{{target.api_endpoint.url}}",
      "api_key_var_name": "{{target.api_endpoint.api_key_name}}",
      "max_tokens": {{config.params.max_new_tokens}},
      "temperature": {{config.params.temperature}},{% if config.params.top_p is not none %}
      "top_p": {{config.params.top_p}},{% endif %}
      "retry": {{config.params.max_retries}},
      "timeout": {{config.params.request_timeout}}{% if config.params.extra.wait is defined %},
      "wait": {{config.params.extra.wait}}{% endif %}{% if config.params.extra.img_size is defined %},
      "img_size": {{config.params.extra.img_size}}{% endif %}{% if config.params.extra.img_detail is defined %},
      "img_detail": "{{config.params.extra.img_detail}}"{% endif %}{% if config.params.extra.system_prompt is defined %},
      "system_prompt": "{{config.params.extra.system_prompt}}"{% endif %}{% if config.params.extra.verbose is defined %},
      "verbose": {{config.params.extra.verbose}}{% endif %}
    }
  },
  "data": {
    "{{config.params.extra.dataset.name}}": {
      "class": "{{config.params.extra.dataset.class}}",
      "dataset": "{{config.params.extra.dataset.name}}",
      "model": "{{target.api_endpoint.model_id}}"
    }
  }
}
EOF
python -m vlmeval.run \
  --config {{config.output_dir}}/vlmeval_config.json \
  --work-dir {{config.output_dir}} \
  --api-nproc {{config.params.parallelism}} \
  {%- if config.params.extra.judge is defined %}
  --judge {{config.params.extra.judge.model}} \
  --judge-args '{{config.params.extra.judge.args}}' \
  {%- endif %}
  {% if config.params.limit_samples is not none %}--first-n {{config.params.limit_samples}}{% endif %}
framework_name: vlmevalkit
pkg_name: vlmevalkit
config:
  params:
    max_new_tokens: 2048
    max_retries: 5
    parallelism: 4
    temperature: 0.0
    request_timeout: 60
    extra:
      dataset:
        name: OCRBench
        class: OCRBench
  supported_endpoint_types:
  - vlm
  type: ocrbench
target:
  api_endpoint: {}

## slidevqa

Evaluates the ability to answer questions about slide decks by selecting relevant slides from multiple images

Harness: vlmevalkit

Container: nvcr.io/nvidia/eval-factory/vlmevalkit:26.01

Container Digest: sha256:24c650c547cfd666bcc5ec822c996eb90e89e4964a1d4ec29e4d01d8bd3a22dc

Container Arch: amd

Task Type: slidevqa

cat > {{config.output_dir}}/vlmeval_config.json << 'EOF'
{
  "model": {
    "{{target.api_endpoint.model_id.split('/')[-1]}}": {
      "class": "CustomOAIEndpoint",
      "model": "{{target.api_endpoint.model_id}}",
      "api_base": "{{target.api_endpoint.url}}",
      "api_key_var_name": "{{target.api_endpoint.api_key_name}}",
      "max_tokens": {{config.params.max_new_tokens}},
      "temperature": {{config.params.temperature}},{% if config.params.top_p is not none %}
      "top_p": {{config.params.top_p}},{% endif %}
      "retry": {{config.params.max_retries}},
      "timeout": {{config.params.request_timeout}}{% if config.params.extra.wait is defined %},
      "wait": {{config.params.extra.wait}}{% endif %}{% if config.params.extra.img_size is defined %},
      "img_size": {{config.params.extra.img_size}}{% endif %}{% if config.params.extra.img_detail is defined %},
      "img_detail": "{{config.params.extra.img_detail}}"{% endif %}{% if config.params.extra.system_prompt is defined %},
      "system_prompt": "{{config.params.extra.system_prompt}}"{% endif %}{% if config.params.extra.verbose is defined %},
      "verbose": {{config.params.extra.verbose}}{% endif %}
    }
  },
  "data": {
    "{{config.params.extra.dataset.name}}": {
      "class": "{{config.params.extra.dataset.class}}",
      "dataset": "{{config.params.extra.dataset.name}}",
      "model": "{{target.api_endpoint.model_id}}"
    }
  }
}
EOF
python -m vlmeval.run \
  --config {{config.output_dir}}/vlmeval_config.json \
  --work-dir {{config.output_dir}} \
  --api-nproc {{config.params.parallelism}} \
  {%- if config.params.extra.judge is defined %}
  --judge {{config.params.extra.judge.model}} \
  --judge-args '{{config.params.extra.judge.args}}' \
  {%- endif %}
  {% if config.params.limit_samples is not none %}--first-n {{config.params.limit_samples}}{% endif %}
framework_name: vlmevalkit
pkg_name: vlmevalkit
config:
  params:
    max_new_tokens: 2048
    max_retries: 5
    parallelism: 4
    temperature: 0.0
    request_timeout: 60
    extra:
      dataset:
        name: SLIDEVQA
        class: SlideVQA
      judge:
        model: gpt-4o
        args: '{"use_azure": true}'
  supported_endpoint_types:
  - vlm
  type: slidevqa
target:
  api_endpoint: {}