For AI agents: a documentation index is available at the root level at /llms.txt and /llms-full.txt. Append /llms.txt to any URL for a page-level index, or .md for the markdown version of any page.
  • Getting Started
    • Welcome
    • Contributing
  • Concepts
    • Columns
    • Seed Datasets
    • Agent Rollout Ingestion
    • Custom Columns
    • Validators
    • Processors
    • Person Sampling
    • Traces
    • Architecture & Performance
    • Deployment Options
    • Security
  • Tutorials
    • Overview
    • The Basics
    • Structured Outputs, Jinja Expressions, and Conditional Generation
    • Seeding with an External Dataset
    • Providing Images as Context
    • Generating Images
    • Image-to-Image Editing
  • Recipes
    • Recipe Cards
      • Seed Dataset Preparation
      • Nemotron Parse OCR
      • Text QA from OCR Transcripts
      • Page Classification
      • Visual QA
      • Single-Page QA
      • Multi-Page Windowed QA
      • Whole-Document QA
      • Frontier Judge QA Filter
  • Plugins
    • Overview
    • Example Plugin
    • FileSystemSeedReader Plugins
    • Discover
  • Code Reference
    • Overview
  • Dev Notes
    • Overview
    • Have It Your Way
    • VLM Long Document Understanding
    • Push Datasets to Hugging Face Hub
    • Text-to-SQL for Nemotron Super
    • Async All the Way Down
    • Owning the Model Stack
NVIDIA
Developer-friendly docs for your API
Privacy Policy | Your Privacy Choices | Terms of Service | Accessibility | Corporate Policies | Product Security | Contact

Copyright © 2026, NVIDIA Corporation.

NeMo Data Designer
Recipes > VLM Long-Document Understanding

Text QA from OCR Transcripts

View as Markdown
Previous

Nemotron Parse OCR

Next

Page Classification

Download Recipe

Download the complete recipe script

# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# /// script
# requires-python = ">=3.10"
# dependencies = [
#     "data-designer>=0.5.6",
# ]
# ///
"""Long-Document Understanding Text Question-Answering Recipe

Generate question-answer pairs from OCR-transcribed document text using a
reasoning LLM. For each seed record the pipeline:

    1. Samples a question type (multiple choice, true/false, short answer, numerical)
    2. Generates a structured question + answer pair grounded in the transcribed text
    3. Evaluates question relevance against the source text
    4. Evaluates answer correctness against the source text

Prerequisites:
    - A seed parquet file containing a `transcribed_texts` column with the
      OCR-transcribed document text (e.g. output of 02-nemotron-parse-ocr-sdg.py).
    - A vLLM-compatible deployment of the reasoning LLM
      (default: openai/gpt-oss-120b).
      Recommended vLLM launch flags:
          --tensor-parallel-size 2
          --reasoning-parser openai_gptoss

      Example launch script for 2× H100:
          docker run --gpus all \
              -p 8000:8000 \
              vllm/vllm-openai:latest \
              --model openai/gpt-oss-120b \
              --tensor-parallel-size 2 \
              --reasoning-parser openai_gptoss \
              --gpu-memory-utilization 0.80 \
              --max-model-len 32768

Run:
    # Basic usage (seed-path should point to the output of 02-nemotron-parse-ocr-sdg.py)
    uv run 03-text-qa-sdg.py --vllm-endpoint http://localhost:8000/v1 --seed-path artifacts/nemotron_parse_ocr/parquet-files/*.parquet

    # Custom model and record count
    uv run 03-text-qa-sdg.py --vllm-endpoint http://localhost:8000/v1 --seed-path artifacts/nemotron_parse_ocr/parquet-files/*.parquet --num-records 100

    # For help message and available options
    uv run 03-text-qa-sdg.py --help
"""

from pathlib import Path
from typing import Literal

from pydantic import BaseModel, Field

import data_designer.config as dd
from data_designer.interface import DataDesigner, DatasetCreationResults

# Model identifier used when --model-id is not given on the command line.
DEFAULT_REASONER_MODEL = "openai/gpt-oss-120b"
# Single provider name shared by the ModelConfig (provider=) and the
# ModelProvider (name=) so the two refer to the same endpoint.
VLLM_PROVIDER_NAME = "vllm"

# =============================================================================
# Structured output schemas
# =============================================================================


# Structured output of the generation step: one question plus its answer.
# Documented with a comment rather than a class docstring because pydantic
# copies class docstrings into the generated JSON schema.
class QuestionAnswer(BaseModel):
    question: str = Field(..., description="The question to be answered.")
    answer: str = Field(..., description="The correct answer to the question.")


# Structured output of the relevance check: a two-valued label.
# Documented with a comment rather than a class docstring because pydantic
# copies class docstrings into the generated JSON schema.
class QuestionRelevance(BaseModel):
    is_relevant: Literal["Relevant", "Irrelevant"] = Field(
        ...,
        description="The relevance of the question to the document content provided.",
    )


# Structured output of the correctness check: a two-valued label.
# Documented with a comment rather than a class docstring because pydantic
# copies class docstrings into the generated JSON schema.
class AnswerCorrectness(BaseModel):
    is_correct: Literal["Correct", "Incorrect"] = Field(..., description="Whether the answer is correct.")


# =============================================================================
# Prompt templates
# =============================================================================

# Templates use {{ ... }} placeholders named after dataset columns
# (question_type, transcribed_texts, question_and_answer.*).
# A trailing backslash joins the next source line, so each prompt's long
# sentences stay on one line and no prompt ends with a newline.

# Generation prompt: question type + transcribed text -> question/answer pair.
PROMPT_QUESTION_ANSWER = """\
<question-type>
{{question_type}}
</question-type>

<context>
{{ transcribed_texts }}
</context>

You are an expert in creating challenging reasoning questions that require deep analysis \
and critical thinking. Your task is to examine the provided pages information and create a \
question that can only be answered by reviewing <context>.

Create a question & answer pair using <context> of type <question-type>.\
"""

# Relevance prompt: judges the generated question against the transcribed text.
PROMPT_QUESTION_RELEVANCE = """\
<context>
{{ transcribed_texts }}
</context>

<question>
{{ question_and_answer.question }}
</question>

Determine if the <question> is relevant to the <context>.\
"""

# Correctness prompt: judges the generated answer against the transcribed text.
PROMPT_ANSWER_CORRECTNESS = """\
<context>
{{ transcribed_texts }}
</context>

<question>
{{ question_and_answer.question }}
</question>

<answer>
{{ question_and_answer.answer }}
</answer>

Determine if the <answer> to <question> is correct given <context>.\
"""


# =============================================================================
# Pipeline configuration
# =============================================================================


def build_config(
    seed_path: str = "seed.parquet",
    model_alias: str = "reasoner",
    model_id: str = DEFAULT_REASONER_MODEL,
) -> dd.DataDesignerConfigBuilder:
    """Build the text question-answering pipeline configuration.

    The returned builder holds one model config, a local-file seed dataset
    and four columns, added in this order:

    1. ``question_type`` - category sampler over four question types.
    2. ``question_and_answer`` - structured LLM output (``QuestionAnswer``).
    3. ``question_relevance`` - structured LLM output (``QuestionRelevance``).
    4. ``answer_correctness`` - structured LLM output (``AnswerCorrectness``).

    Args:
        seed_path: Path handed to ``dd.LocalFileSeedSource``.
        model_alias: Alias given to the model config and referenced by every
            LLM column.
        model_id: Model identifier placed in the model config.

    Returns:
        The populated ``dd.DataDesignerConfigBuilder``.
    """
    model_configs = [
        dd.ModelConfig(
            alias=model_alias,
            model=model_id,
            provider=VLLM_PROVIDER_NAME,
            inference_parameters=dd.ChatCompletionInferenceParams(
                # NOTE(review): the module docstring's example launch script
                # uses --max-model-len 32768, the same value as max_tokens;
                # confirm the server accepts prompt + completion in that window.
                max_tokens=32768,
                timeout=1200,
                extra_body={"reasoning_effort": "high"},
                max_parallel_requests=32,
            ),
        ),
    ]

    config_builder = dd.DataDesignerConfigBuilder(model_configs=model_configs)

    config_builder.with_seed_dataset(
        dd.LocalFileSeedSource(path=seed_path),
        sampling_strategy=dd.SamplingStrategy.ORDERED,
    )

    config_builder.add_column(
        dd.SamplerColumnConfig(
            name="question_type",
            sampler_type=dd.SamplerType.CATEGORY,
            params=dd.CategorySamplerParams(
                values=[
                    "multiple choice",
                    "true or false",
                    "short answer",
                    "numerical question",
                ],
            ),
        )
    )

    config_builder.add_column(
        dd.LLMStructuredColumnConfig(
            name="question_and_answer",
            model_alias=model_alias,
            prompt=PROMPT_QUESTION_ANSWER,
            output_format=QuestionAnswer,
        )
    )

    # The two judge columns below read question_and_answer through their
    # prompt templates, so they are added after it.
    config_builder.add_column(
        dd.LLMStructuredColumnConfig(
            name="question_relevance",
            model_alias=model_alias,
            prompt=PROMPT_QUESTION_RELEVANCE,
            output_format=QuestionRelevance,
        )
    )

    config_builder.add_column(
        dd.LLMStructuredColumnConfig(
            name="answer_correctness",
            model_alias=model_alias,
            prompt=PROMPT_ANSWER_CORRECTNESS,
            output_format=AnswerCorrectness,
        )
    )

    return config_builder


def create_dataset(
    config_builder: dd.DataDesignerConfigBuilder,
    num_records: int,
    vllm_endpoint: str,
    artifact_path: Path | str | None = None,
) -> DatasetCreationResults:
    """Run the pipeline against a vLLM endpoint and return the results.

    Registers one model provider named ``VLLM_PROVIDER_NAME`` at
    ``vllm_endpoint``, applies a run config with the progress bar enabled and
    early shutdown disabled, then creates a dataset named ``"text_qa"``.

    Args:
        config_builder: Pipeline configuration to execute.
        num_records: Number of records requested from ``DataDesigner.create``.
        vllm_endpoint: Endpoint URL assigned to the provider.
        artifact_path: Forwarded unchanged to ``DataDesigner``.

    Returns:
        The ``DatasetCreationResults`` produced by ``DataDesigner.create``.
    """
    model_providers = [
        dd.ModelProvider(
            name=VLLM_PROVIDER_NAME,
            endpoint=vllm_endpoint,
        ),
    ]
    data_designer = DataDesigner(
        artifact_path=artifact_path,
        model_providers=model_providers,
    )
    data_designer.set_run_config(dd.RunConfig(progress_bar=True, disable_early_shutdown=True))
    return data_designer.create(config_builder, num_records=num_records, dataset_name="text_qa")


# Script entry point: parse CLI options, build the config, run the pipeline,
# then print the dataset location and render the analysis report.
if __name__ == "__main__":
    from argparse import ArgumentParser

    parser = ArgumentParser()
    parser.add_argument(
        "--vllm-endpoint",
        type=str,
        required=True,
        help="Base URL of the vLLM server hosting the reasoning LLM (e.g. http://localhost:8000/v1)",
    )
    parser.add_argument("--seed-path", type=str, required=True, help="Path to the seed parquet file")
    parser.add_argument(
        "--model-alias",
        type=str,
        default="reasoner",
        help="Alias used to reference the model inside the pipeline config",
    )
    parser.add_argument(
        "--model-id",
        type=str,
        default=DEFAULT_REASONER_MODEL,
        help="Model identifier to request from the vLLM server",
    )
    parser.add_argument("--num-records", type=int, default=5, help="Number of records to generate")
    parser.add_argument(
        "--artifact-path",
        type=str,
        default=None,
        help="Artifact path passed to DataDesigner",
    )
    args = parser.parse_args()

    config_builder = build_config(
        seed_path=args.seed_path,
        model_alias=args.model_alias,
        model_id=args.model_id,
    )
    results = create_dataset(
        config_builder,
        num_records=args.num_records,
        vllm_endpoint=args.vllm_endpoint,
        artifact_path=args.artifact_path,
    )

    print(f"Dataset saved to: {results.artifact_storage.final_dataset_path}")

    results.load_analysis().to_report()