Text QA from OCR Transcripts | NVIDIA NeMo Data Designer

Download Recipe

1 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2 # SPDX-License-Identifier: Apache-2.0
3 # /// script
4 # requires-python = ">=3.10"
5 # dependencies = [
6 #     "data-designer>=0.5.6",
7 # ]
8 # ///
9 """Long-Document Understanding Text Question-Answering Recipe
10 
11 Generate question-answer pairs from OCR-transcribed document text using a
12 reasoning LLM. For each seed record the pipeline:
13 
14   1. Samples a question type (multiple choice, true/false, short answer, numerical)
15   2. Generates a structured question + answer pair grounded in the transcribed text
16   3. Evaluates question relevance against the source text
17   4. Evaluates answer correctness against the source text
18 
19 Prerequisites:
20     - A seed parquet file containing a `transcribed_texts` column with the
21       OCR-transcribed document text (e.g. output of 02-nemotron-parse-ocr-sdg.py).
22     - A vLLM-compatible deployment of the reasoning LLM
23       (default: openai/gpt-oss-120b).
24       Recommended vLLM launch flags:
25         --tensor-parallel-size 2
26         --reasoning-parser openai_gptoss
27 
28       Example launch script for 2× H100:
29         docker run --gpus all \
30             -p 8000:8000 \
31             vllm/vllm-openai:latest \
32             --model openai/gpt-oss-120b \
33             --tensor-parallel-size 2 \
34             --reasoning-parser openai_gptoss \
35             --gpu-memory-utilization 0.80 \
36             --max-model-len 32768
37 
38 Run:
39     # Basic usage (seed-path should point to the output of 02-nemotron-parse-ocr-sdg.py; quote the glob so the shell passes it through unexpanded)
40     uv run 03-text-qa-sdg.py --vllm-endpoint http://localhost:8000/v1 --seed-path "artifacts/nemotron_parse_ocr/parquet-files/*.parquet"
41 
42     # Custom model and record count
43     uv run 03-text-qa-sdg.py --vllm-endpoint http://localhost:8000/v1 --seed-path "artifacts/nemotron_parse_ocr/parquet-files/*.parquet" --model-id openai/gpt-oss-20b --num-records 100
44 
45     # For help message and available options
46     uv run 03-text-qa-sdg.py --help
47 """
48 
49 from pathlib import Path
50 from typing import Literal
51 
52 from pydantic import BaseModel, Field
53 
54 import data_designer.config as dd
55 from data_designer.interface import DataDesigner, DatasetCreationResults
56 
# Model identifier used when neither build_config(model_id=...) nor --model-id overrides it.
DEFAULT_REASONER_MODEL = "openai/gpt-oss-120b"
# Provider name shared by the ModelConfig (build_config) and the ModelProvider
# (create_dataset); the same constant is used in both places.
VLLM_PROVIDER_NAME = "vllm"
59 
60 # =============================================================================
61 # Structured output schemas
62 # =============================================================================
63 
64 
class QuestionAnswer(BaseModel):
    # Structured output of the generation step: a single question with its answer.
    # Documented with `#` comments on purpose: a class docstring would be added to
    # the model's JSON schema and thereby change the structured-output schema.
    question: str = Field(
        ...,
        description="The question to be answered.",
    )
    answer: str = Field(
        ...,
        description="The correct answer to the question.",
    )
68 
69 
class QuestionRelevance(BaseModel):
    # Structured verdict of the relevance check; restricted to the two literal labels.
    # Documented with `#` comments on purpose: a class docstring would be added to
    # the model's JSON schema and thereby change the structured-output schema.
    is_relevant: Literal["Relevant", "Irrelevant"] = Field(
        ..., description="The relevance of the question to the document content provided."
    )
75 
76 
class AnswerCorrectness(BaseModel):
    # Structured verdict of the correctness check; restricted to the two literal labels.
    # Documented with `#` comments on purpose: a class docstring would be added to
    # the model's JSON schema and thereby change the structured-output schema.
    is_correct: Literal["Correct", "Incorrect"] = Field(
        ...,
        description="Whether the answer is correct.",
    )
79 
80 
81 # =============================================================================
82 # Prompt templates
83 # =============================================================================
84 
# Prompt for the question/answer generation column.
# Placeholders in double braces: `question_type` (the sampled question-type column)
# and `transcribed_texts` (the seed column holding the OCR-transcribed text).
# The backslash after the opening quotes and before the closing quotes keeps the
# string free of a leading/trailing newline; the two mid-paragraph backslashes join
# the wrapped source lines into a single line of prompt text.
PROMPT_QUESTION_ANSWER = """\
<question-type>
{{question_type}}
</question-type>

<context>
{{ transcribed_texts }}
</context>

You are an expert in creating challenging reasoning questions that require deep analysis \
and critical thinking. Your task is to examine the provided pages information and create a \
question that can only be answered by reviewing <context>.

Create a question & answer pair using <context> of type <question-type>.\
"""
100 
# Prompt for the question-relevance column.
# Placeholders in double braces: `transcribed_texts` (seed column) and
# `question_and_answer.question` (the `question` field of the generated
# `question_and_answer` column).
# The backslashes at the start and end keep the string free of a leading/trailing newline.
PROMPT_QUESTION_RELEVANCE = """\
<context>
{{ transcribed_texts }}
</context>

<question>
{{ question_and_answer.question }}
</question>

Determine if the <question> is relevant to the <context>.\
"""
112 
# Prompt for the answer-correctness column.
# Placeholders in double braces: `transcribed_texts` (seed column) plus the
# `question` and `answer` fields of the generated `question_and_answer` column.
# The backslashes at the start and end keep the string free of a leading/trailing newline.
PROMPT_ANSWER_CORRECTNESS = """\
<context>
{{ transcribed_texts }}
</context>

<question>
{{ question_and_answer.question }}
</question>

<answer>
{{ question_and_answer.answer }}
</answer>

Determine if the <answer> to <question> is correct given <context>.\
"""
128 
129 
130 # =============================================================================
131 # Pipeline configuration
132 # =============================================================================
133 
134 
def build_config(
    seed_path: str = "seed.parquet",
    model_alias: str = "reasoner",
    model_id: str = DEFAULT_REASONER_MODEL,
    *,
    max_tokens: int = 32768,
    timeout: int = 1200,
    max_parallel_requests: int = 32,
    reasoning_effort: str = "high",
) -> dd.DataDesignerConfigBuilder:
    """Assemble the text-QA pipeline configuration.

    The returned builder contains, in order: the seed dataset, a categorical
    ``question_type`` sampler, and three structured LLM columns
    (``question_and_answer``, ``question_relevance``, ``answer_correctness``)
    that all use the same model alias.

    Args:
        seed_path: Path handed to ``dd.LocalFileSeedSource``. The prompts read
            a ``transcribed_texts`` column from this seed data.
        model_alias: Alias under which the model is registered and by which
            every LLM column refers to it.
        model_id: Model identifier passed to ``dd.ModelConfig``.
        max_tokens: Per-request ``max_tokens`` inference parameter.
            NOTE(review): the default equals the ``--max-model-len 32768`` in
            the module docstring's example launch. vLLM is expected to count
            prompt and completion tokens against that one limit, so confirm
            the server's context window is larger, or pass a smaller value.
        timeout: Request timeout forwarded to the inference parameters
            (presumably seconds -- confirm against the library).
        max_parallel_requests: Upper bound on concurrent requests forwarded to
            the inference parameters.
        reasoning_effort: Value sent as ``extra_body["reasoning_effort"]``.

    Returns:
        A ``dd.DataDesignerConfigBuilder`` ready to be passed to
        ``create_dataset``.
    """
    inference_parameters = dd.ChatCompletionInferenceParams(
        max_tokens=max_tokens,
        timeout=timeout,
        extra_body={"reasoning_effort": reasoning_effort},
        max_parallel_requests=max_parallel_requests,
    )
    reasoner = dd.ModelConfig(
        alias=model_alias,
        model=model_id,
        provider=VLLM_PROVIDER_NAME,
        inference_parameters=inference_parameters,
    )
    config_builder = dd.DataDesignerConfigBuilder(model_configs=[reasoner])

    config_builder.with_seed_dataset(
        dd.LocalFileSeedSource(path=seed_path),
        sampling_strategy=dd.SamplingStrategy.ORDERED,
    )

    config_builder.add_column(
        dd.SamplerColumnConfig(
            name="question_type",
            sampler_type=dd.SamplerType.CATEGORY,
            params=dd.CategorySamplerParams(
                values=[
                    "multiple choice",
                    "true or false",
                    "short answer",
                    "numerical question",
                ],
            ),
        )
    )

    # Registration order is kept explicit: the two evaluation prompts reference
    # fields of the `question_and_answer` column, so it is added first.
    structured_columns = (
        ("question_and_answer", PROMPT_QUESTION_ANSWER, QuestionAnswer),
        ("question_relevance", PROMPT_QUESTION_RELEVANCE, QuestionRelevance),
        ("answer_correctness", PROMPT_ANSWER_CORRECTNESS, AnswerCorrectness),
    )
    for column_name, prompt, output_format in structured_columns:
        config_builder.add_column(
            dd.LLMStructuredColumnConfig(
                name=column_name,
                model_alias=model_alias,
                prompt=prompt,
                output_format=output_format,
            )
        )

    return config_builder
204 
205 
def create_dataset(
    config_builder: dd.DataDesignerConfigBuilder,
    num_records: int,
    vllm_endpoint: str,
    artifact_path: Path | str | None = None,
) -> DatasetCreationResults:
    """Run the configured pipeline against a vLLM endpoint.

    Args:
        config_builder: Pipeline configuration to execute.
        num_records: Number of records requested from ``DataDesigner.create``.
        vllm_endpoint: Endpoint URL registered for the ``VLLM_PROVIDER_NAME``
            provider.
        artifact_path: Artifact location forwarded to ``DataDesigner``;
            ``None`` is passed through unchanged.

    Returns:
        The ``DatasetCreationResults`` for the dataset named ``"text_qa"``.
    """
    vllm_provider = dd.ModelProvider(name=VLLM_PROVIDER_NAME, endpoint=vllm_endpoint)
    designer = DataDesigner(artifact_path=artifact_path, model_providers=[vllm_provider])

    # Show a progress bar and turn off the library's early-shutdown behaviour.
    run_config = dd.RunConfig(progress_bar=True, disable_early_shutdown=True)
    designer.set_run_config(run_config)

    return designer.create(config_builder, num_records=num_records, dataset_name="text_qa")
225 
226 
def _parse_args(argv: list[str] | None = None):
    """Parse the recipe's command-line options.

    Args:
        argv: Argument list to parse; ``None`` makes argparse read ``sys.argv``.

    Returns:
        The populated ``argparse.Namespace``.
    """
    from argparse import ArgumentParser

    parser = ArgumentParser(
        description="Generate question-answer pairs from OCR-transcribed document text.",
    )
    parser.add_argument(
        "--vllm-endpoint",
        type=str,
        required=True,
        help="Base URL of the vLLM server hosting the reasoning LLM (e.g. http://localhost:8000/v1)",
    )
    parser.add_argument("--seed-path", type=str, required=True, help="Path to the seed parquet file")
    parser.add_argument(
        "--model-alias",
        type=str,
        default="reasoner",
        help="Alias under which the model is registered in the pipeline config (default: %(default)s)",
    )
    parser.add_argument(
        "--model-id",
        type=str,
        default=DEFAULT_REASONER_MODEL,
        help="Model identifier to request from the endpoint (default: %(default)s)",
    )
    parser.add_argument(
        "--num-records",
        type=int,
        default=5,
        help="Number of records to generate (default: %(default)s)",
    )
    parser.add_argument(
        "--artifact-path",
        type=str,
        default=None,
        help="Artifact location passed to DataDesigner (left unset when omitted)",
    )
    return parser.parse_args(argv)


def main(argv: list[str] | None = None) -> None:
    """Build the pipeline from CLI options, run it, and report the results.

    Args:
        argv: Argument list to parse; ``None`` makes argparse read ``sys.argv``.
    """
    args = _parse_args(argv)

    config_builder = build_config(
        seed_path=args.seed_path,
        model_alias=args.model_alias,
        model_id=args.model_id,
    )
    results = create_dataset(
        config_builder,
        num_records=args.num_records,
        vllm_endpoint=args.vllm_endpoint,
        artifact_path=args.artifact_path,
    )

    print(f"Dataset saved to: {results.artifact_storage.final_dataset_path}")

    results.load_analysis().to_report()


if __name__ == "__main__":
    main()