| 1 | # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. |
| 2 | # SPDX-License-Identifier: Apache-2.0 |
| 3 | # /// script |
| 4 | # requires-python = ">=3.10" |
| 5 | # dependencies = [ |
| 6 | # "data-designer>=0.5.6", |
| 7 | # ] |
| 8 | # /// |
| 9 | """Long-Document Understanding Text Question-Answering Recipe |
| 10 | |
| 11 | Generate question-answer pairs from OCR-transcribed document text using a |
| 12 | reasoning LLM. For each seed record the pipeline: |
| 13 | |
| 14 | 1. Samples a question type (multiple choice, true/false, short answer, numerical) |
| 15 | 2. Generates a structured question + answer pair grounded in the transcribed text |
| 16 | 3. Evaluates question relevance against the source text |
| 17 | 4. Evaluates answer correctness against the source text |
| 18 | |
| 19 | Prerequisites: |
| 20 | - A seed parquet file containing a `transcribed_texts` column with the |
| 21 | OCR-transcribed document text (e.g. output of 02-nemotron-parse-ocr-sdg.py). |
| 22 | - A vLLM-compatible deployment of the reasoning LLM |
| 23 | (default: openai/gpt-oss-120b). |
| 24 | Recommended vLLM launch flags: |
| 25 | --tensor-parallel-size 2 |
| 26 | --reasoning-parser openai_gptoss |
| 27 | |
| 28 | Example launch script for 2× H100: |
| 29 | docker run --gpus all \ |
| 30 | -p 8000:8000 \ |
| 31 | vllm/vllm-openai:latest \ |
| 32 | --model openai/gpt-oss-120b \ |
| 33 | --tensor-parallel-size 2 \ |
| 34 | --reasoning-parser openai_gptoss \ |
| 35 | --gpu-memory-utilization 0.80 \ |
| 36 | --max-model-len 32768 |
| 37 | |
| 38 | Run: |
    # Basic usage (seed-path should point to the output of 02-nemotron-parse-ocr-sdg.py;
    # quote the glob so the shell passes the pattern through as a single argument)
    uv run 03-text-qa-sdg.py --vllm-endpoint http://localhost:8000/v1 --seed-path "artifacts/nemotron_parse_ocr/parquet-files/*.parquet"
| 41 | |
    # Custom record count (add --model-id <model> to override the default reasoning model)
    uv run 03-text-qa-sdg.py --vllm-endpoint http://localhost:8000/v1 --seed-path "artifacts/nemotron_parse_ocr/parquet-files/*.parquet" --num-records 100
| 44 | |
| 45 | # For help message and available options |
| 46 | uv run 03-text-qa-sdg.py --help |
| 47 | """ |
| 48 | |
| 49 | from pathlib import Path |
| 50 | from typing import Literal |
| 51 | |
| 52 | from pydantic import BaseModel, Field |
| 53 | |
| 54 | import data_designer.config as dd |
| 55 | from data_designer.interface import DataDesigner, DatasetCreationResults |
| 56 | |
# Model identifier used when no --model-id is supplied; also the default for build_config().
DEFAULT_REASONER_MODEL = "openai/gpt-oss-120b"
# Provider name shared by the ModelConfig (build_config) and the ModelProvider (create_dataset).
VLLM_PROVIDER_NAME = "vllm"
| 59 | |
| 60 | # ============================================================================= |
| 61 | # Structured output schemas |
| 62 | # ============================================================================= |
| 63 | |
| 64 | |
class QuestionAnswer(BaseModel):
    # Structured output of the generation step: one question and its answer.
    # Documented with comments rather than a docstring so the class's runtime
    # metadata (`__doc__`) stays exactly as it was.
    question: str = Field(
        default=...,
        description="The question to be answered.",
    )
    answer: str = Field(
        default=...,
        description="The correct answer to the question.",
    )
| 68 | |
| 69 | |
class QuestionRelevance(BaseModel):
    # Structured output of the relevance check: a two-valued label.
    # Documented with comments rather than a docstring so the class's runtime
    # metadata (`__doc__`) stays exactly as it was.
    is_relevant: Literal["Relevant", "Irrelevant"] = Field(
        default=...,
        description="The relevance of the question to the document content provided.",
    )
| 75 | |
| 76 | |
class AnswerCorrectness(BaseModel):
    # Structured output of the correctness check: a two-valued label.
    # Documented with comments rather than a docstring so the class's runtime
    # metadata (`__doc__`) stays exactly as it was.
    is_correct: Literal["Correct", "Incorrect"] = Field(
        default=...,
        description="Whether the answer is correct.",
    )
| 79 | |
| 80 | |
| 81 | # ============================================================================= |
| 82 | # Prompt templates |
| 83 | # ============================================================================= |
| 84 | |
# Prompt for the "question_and_answer" column. The template references the
# `question_type` and `transcribed_texts` columns via {{ ... }} placeholders.
# Trailing backslashes join the physical lines so the paragraph is sent as one line.
PROMPT_QUESTION_ANSWER = """\
<question-type>
{{question_type}}
</question-type>

<context>
{{ transcribed_texts }}
</context>

You are an expert in creating challenging reasoning questions that require deep analysis \
and critical thinking. Your task is to examine the provided pages information and create a \
question that can only be answered by reviewing <context>.

Create a question & answer pair using <context> of type <question-type>.\
"""
| 100 | |
# Prompt for the "question_relevance" column. The template references the
# `transcribed_texts` column and the `question` field of `question_and_answer`.
PROMPT_QUESTION_RELEVANCE = """\
<context>
{{ transcribed_texts }}
</context>

<question>
{{ question_and_answer.question }}
</question>

Determine if the <question> is relevant to the <context>.\
"""
| 112 | |
# Prompt for the "answer_correctness" column. The template references the
# `transcribed_texts` column and both fields of `question_and_answer`.
PROMPT_ANSWER_CORRECTNESS = """\
<context>
{{ transcribed_texts }}
</context>

<question>
{{ question_and_answer.question }}
</question>

<answer>
{{ question_and_answer.answer }}
</answer>

Determine if the <answer> to <question> is correct given <context>.\
"""
| 128 | |
| 129 | |
| 130 | # ============================================================================= |
| 131 | # Pipeline configuration |
| 132 | # ============================================================================= |
| 133 | |
| 134 | |
def build_config(
    seed_path: str = "seed.parquet",
    model_alias: str = "reasoner",
    model_id: str = DEFAULT_REASONER_MODEL,
    *,
    max_tokens: int = 32768,
    timeout: int = 1200,
    reasoning_effort: str = "high",
    max_parallel_requests: int = 32,
) -> dd.DataDesignerConfigBuilder:
    """Assemble the Data Designer configuration for the text QA pipeline.

    The returned builder holds one model config, a local-file seed dataset read
    with the ORDERED sampling strategy, a category-sampled ``question_type``
    column, and three structured LLM columns added in this order:
    ``question_and_answer``, ``question_relevance``, ``answer_correctness``.

    Args:
        seed_path: Path handed to ``dd.LocalFileSeedSource``.
        model_alias: Alias given to the model config and referenced by every
            LLM column.
        model_id: Model identifier placed in the model config.
        max_tokens: ``max_tokens`` inference parameter for the model.
        timeout: ``timeout`` inference parameter for the model.
        reasoning_effort: Value sent as ``reasoning_effort`` in the request's
            ``extra_body``.
        max_parallel_requests: ``max_parallel_requests`` inference parameter.

    Returns:
        The populated ``dd.DataDesignerConfigBuilder``.

    NOTE(review): the default ``max_tokens`` equals the ``--max-model-len``
    shown in the module docstring's example launch command. Confirm the server
    accepts prompt + completion within that window, or pass a smaller value.
    """
    inference_parameters = dd.ChatCompletionInferenceParams(
        max_tokens=max_tokens,
        timeout=timeout,
        extra_body={"reasoning_effort": reasoning_effort},
        max_parallel_requests=max_parallel_requests,
    )
    reasoner = dd.ModelConfig(
        alias=model_alias,
        model=model_id,
        provider=VLLM_PROVIDER_NAME,
        inference_parameters=inference_parameters,
    )

    config_builder = dd.DataDesignerConfigBuilder(model_configs=[reasoner])

    config_builder.with_seed_dataset(
        dd.LocalFileSeedSource(path=seed_path),
        sampling_strategy=dd.SamplingStrategy.ORDERED,
    )

    config_builder.add_column(
        dd.SamplerColumnConfig(
            name="question_type",
            sampler_type=dd.SamplerType.CATEGORY,
            params=dd.CategorySamplerParams(
                values=[
                    "multiple choice",
                    "true or false",
                    "short answer",
                    "numerical question",
                ],
            ),
        )
    )

    # Order matters for readability of the pipeline: the two evaluation prompts
    # reference fields of the `question_and_answer` column.
    structured_columns = (
        ("question_and_answer", PROMPT_QUESTION_ANSWER, QuestionAnswer),
        ("question_relevance", PROMPT_QUESTION_RELEVANCE, QuestionRelevance),
        ("answer_correctness", PROMPT_ANSWER_CORRECTNESS, AnswerCorrectness),
    )
    for column_name, prompt, output_format in structured_columns:
        config_builder.add_column(
            dd.LLMStructuredColumnConfig(
                name=column_name,
                model_alias=model_alias,
                prompt=prompt,
                output_format=output_format,
            )
        )

    return config_builder
| 204 | |
| 205 | |
def create_dataset(
    config_builder: dd.DataDesignerConfigBuilder,
    num_records: int,
    vllm_endpoint: str,
    artifact_path: Path | str | None = None,
) -> DatasetCreationResults:
    """Run the configured pipeline against a vLLM endpoint and return its results.

    Args:
        config_builder: Pipeline configuration to execute.
        num_records: Number of records requested from ``DataDesigner.create``.
        vllm_endpoint: Endpoint registered for the ``VLLM_PROVIDER_NAME`` provider.
        artifact_path: Forwarded unchanged to ``DataDesigner``; may be ``None``.

    Returns:
        The object returned by ``DataDesigner.create`` for the ``"text_qa"`` dataset.
    """
    vllm_provider = dd.ModelProvider(name=VLLM_PROVIDER_NAME, endpoint=vllm_endpoint)
    designer = DataDesigner(artifact_path=artifact_path, model_providers=[vllm_provider])

    # The run config is applied before creation starts.
    run_config = dd.RunConfig(progress_bar=True, disable_early_shutdown=True)
    designer.set_run_config(run_config)

    return designer.create(config_builder, num_records=num_records, dataset_name="text_qa")
| 225 | |
| 226 | |
if __name__ == "__main__":
    from argparse import ArgumentParser

    # CLI entry point: parse options, build the pipeline config, run it, then
    # print where the dataset was written and render the analysis report.
    parser = ArgumentParser(
        description="Generate question-answer pairs from OCR-transcribed document text.",
    )
    parser.add_argument(
        "--vllm-endpoint",
        type=str,
        required=True,
        help="Base URL of the vLLM server hosting the reasoning LLM (e.g. http://localhost:8000/v1)",
    )
    parser.add_argument("--seed-path", type=str, required=True, help="Path to the seed parquet file")
    parser.add_argument(
        "--model-alias",
        type=str,
        default="reasoner",
        help="Alias under which the model is registered in the pipeline config (default: %(default)s)",
    )
    parser.add_argument(
        "--model-id",
        type=str,
        default=DEFAULT_REASONER_MODEL,
        help="Model identifier placed in the model config (default: %(default)s)",
    )
    parser.add_argument(
        "--num-records",
        type=int,
        default=5,
        help="Number of records to generate (default: %(default)s)",
    )
    parser.add_argument(
        "--artifact-path",
        type=str,
        default=None,
        help="Artifact path handed to DataDesigner (optional)",
    )
    args = parser.parse_args()

    config_builder = build_config(
        seed_path=args.seed_path,
        model_alias=args.model_alias,
        model_id=args.model_id,
    )
    results = create_dataset(
        config_builder,
        num_records=args.num_records,
        vllm_endpoint=args.vllm_endpoint,
        artifact_path=args.artifact_path,
    )

    print(f"Dataset saved to: {results.artifact_storage.final_dataset_path}")

    results.load_analysis().to_report()