nemo_curator.stages.synthetic.nemotron_cc.nemotron_cc

View as Markdown

Module Contents

Classes

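| Name | Description |
| --- | --- |
| DistillStage | - |
| DiverseQAPostProcessingStage | Post-processing stage for DiverseQA outputs. It parses the raw generated QA list, normalizes bullets, optionally samples pairs based on input length/tokenizer, and concatenates the original document text with the selected QA pairs. |
| DiverseQAStage | - |
| ExtractKnowledgeStage | - |
| KnowledgeListPostProcessingStage | Post-processing stage that formats knowledge list outputs generated by the LLM. |
| KnowledgeListStage | - |
| WikipediaParaphrasingStage | - |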

API

class nemo_curator.stages.synthetic.nemotron_cc.nemotron_cc.DistillStage(
system_prompt: str = NEMOTRON_CC_DISTILL_SYSTEM_PROMPT,
prompt: str = DISTILL_PROMPT_TEMPLATE,
input_field: str = 'text',
output_field: str = 'distill',
client: nemo_curator.models.client.llm_client.AsyncLLMClient | nemo_curator.models.client.llm_client.LLMClient = None,
model_name: str = None,
generation_config: nemo_curator.models.client.llm_client.GenerationConfig | None = None,
name: str = 'NemotronCCBaseStage'
)
Dataclass

Bases: BaseSyntheticStage

input_field
str = 'text'
output_field
str = 'distill'
prompt
str = DISTILL_PROMPT_TEMPLATE
system_prompt
str = NEMOTRON_CC_DISTILL_SYSTEM_PROMPT
class nemo_curator.stages.synthetic.nemotron_cc.nemotron_cc.DiverseQAPostProcessingStage(
input_field: str = 'text',
qa_field: str = 'diverse_qa',
tokenizer: transformers.AutoTokenizer | None = None,
prefix: str = 'Here are the questions and...',
max_num_pairs: int = 10,
name: str = 'DiverseQAPostProcessing'
)
Dataclass

Bases: ProcessingStage[DocumentBatch, DocumentBatch]

Post-processing stage for DiverseQA outputs. It parses the raw generated QA list, normalizes bullets, optionally samples pairs based on input length/tokenizer, and concatenates the original document text with the selected QA pairs.

input_field
str = 'text'
max_num_pairs
int = 10
name
str = 'DiverseQAPostProcessing'
prefix
str
qa_field
str = 'diverse_qa'
tokenizer
AutoTokenizer | None = None
nemo_curator.stages.synthetic.nemotron_cc.nemotron_cc.DiverseQAPostProcessingStage.process(
batch: nemo_curator.tasks.DocumentBatch
) -> nemo_curator.tasks.DocumentBatch
class nemo_curator.stages.synthetic.nemotron_cc.nemotron_cc.DiverseQAStage(
system_prompt: str = NEMOTRON_CC_SYSTEM_PROMPT,
prompt: str = DIVERSE_QA_PROMPT_TEMPLATE,
input_field: str = 'text',
output_field: str = 'diverse_qa',
client: nemo_curator.models.client.llm_client.AsyncLLMClient | nemo_curator.models.client.llm_client.LLMClient = None,
model_name: str = None,
generation_config: nemo_curator.models.client.llm_client.GenerationConfig | None = None,
name: str = 'NemotronCCBaseStage',
tokenizer: transformers.AutoTokenizer = None,
prefix: str = 'Here are the questions and...',
max_num_pairs: int = 10
)
Dataclass

Bases: BaseSyntheticStage

input_field
str = 'text'
max_num_pairs
int = 10
output_field
str = 'diverse_qa'
prefix
str
prompt
str = DIVERSE_QA_PROMPT_TEMPLATE
system_prompt
str = NEMOTRON_CC_SYSTEM_PROMPT
tokenizer
AutoTokenizer = None
class nemo_curator.stages.synthetic.nemotron_cc.nemotron_cc.ExtractKnowledgeStage(
system_prompt: str = NEMOTRON_CC_SYSTEM_PROMPT,
prompt: str = EXTRACT_KNOWLEDGE_PROMPT_TEMPLATE,
input_field: str = 'text',
output_field: str = 'extract_knowledge',
client: nemo_curator.models.client.llm_client.AsyncLLMClient | nemo_curator.models.client.llm_client.LLMClient = None,
model_name: str = None,
generation_config: nemo_curator.models.client.llm_client.GenerationConfig | None = None,
name: str = 'NemotronCCBaseStage'
)
Dataclass

Bases: BaseSyntheticStage

input_field
str = 'text'
output_field
str = 'extract_knowledge'
prompt
str = EXTRACT_KNOWLEDGE_PROMPT_TEMPLATE
system_prompt
str = NEMOTRON_CC_SYSTEM_PROMPT
class nemo_curator.stages.synthetic.nemotron_cc.nemotron_cc.KnowledgeListPostProcessingStage(
input_field: str = 'knowledge_list',
name: str = 'KnowledgeListPostProcessing'
)
Dataclass

Bases: ProcessingStage[DocumentBatch, DocumentBatch]

Post-processing stage that formats knowledge list outputs generated by the LLM. It normalizes leading bullet markers and trims indentation, producing a clean newline-separated list.

input_field
str = 'knowledge_list'
name
str = 'KnowledgeListPostProcessing'
nemo_curator.stages.synthetic.nemotron_cc.nemotron_cc.KnowledgeListPostProcessingStage.process(
batch: nemo_curator.tasks.DocumentBatch
) -> nemo_curator.tasks.DocumentBatch
class nemo_curator.stages.synthetic.nemotron_cc.nemotron_cc.KnowledgeListStage(
system_prompt: str = NEMOTRON_CC_SYSTEM_PROMPT,
prompt: str = KNOWLEDGE_LIST_PROMPT_TEMPLATE,
input_field: str = 'text',
output_field: str = 'knowledge_list',
client: nemo_curator.models.client.llm_client.AsyncLLMClient | nemo_curator.models.client.llm_client.LLMClient = None,
model_name: str = None,
generation_config: nemo_curator.models.client.llm_client.GenerationConfig | None = None,
name: str = 'NemotronCCBaseStage'
)
Dataclass

Bases: BaseSyntheticStage

input_field
str = 'text'
output_field
str = 'knowledge_list'
prompt
str = KNOWLEDGE_LIST_PROMPT_TEMPLATE
system_prompt
str = NEMOTRON_CC_SYSTEM_PROMPT
class nemo_curator.stages.synthetic.nemotron_cc.nemotron_cc.WikipediaParaphrasingStage(
system_prompt: str = NEMOTRON_CC_SYSTEM_PROMPT,
prompt: str = WIKIPEDIA_REPHRASING_PROMPT_TEMPLATE,
input_field: str = 'text',
output_field: str = 'rephrased',
client: nemo_curator.models.client.llm_client.AsyncLLMClient | nemo_curator.models.client.llm_client.LLMClient = None,
model_name: str = None,
generation_config: nemo_curator.models.client.llm_client.GenerationConfig | None = None,
name: str = 'NemotronCCBaseStage'
)
Dataclass

Bases: BaseSyntheticStage

input_field
str = 'text'
output_field
str = 'rephrased'
prompt
str = WIKIPEDIA_REPHRASING_PROMPT_TEMPLATE
system_prompt
str = NEMOTRON_CC_SYSTEM_PROMPT