nemo_curator.stages.synthetic.nemotron_cc.nemo_data_designer.nemotron_cc

View as Markdown

NDD-backed NemotronCC synthetic data generation stages.

Drop-in replacements for the stages in nemo_curator.stages.synthetic.nemotron_cc.nemotron_cc. The stages in this module use NeMo Data Designer (NDD) instead of LLMClient/AsyncLLMClient.

Module Contents

Classes

API

class nemo_curator.stages.synthetic.nemotron_cc.nemo_data_designer.nemotron_cc.DistillStage(
config_builder: data_designer.config.DataDesignerConfigBuilder | None = None,
data_designer_config_file: str | None = None,
model_providers: list | None = None,
verbose: bool = False,
system_prompt: str = NEMOTRON_CC_DISTILL_SYSTEM_PROMPT,
prompt: str = DISTILL_PROMPT_TEMPLATE,
input_field: str = 'text',
output_field: str = 'distill',
model_alias: str | None = None,
model_configs: list | None = None
)
Dataclass

Bases: NDDBaseSyntheticStage

input_field
str = 'text'
output_field
str = 'distill'
prompt
str = DISTILL_PROMPT_TEMPLATE
system_prompt
str = NEMOTRON_CC_DISTILL_SYSTEM_PROMPT
class nemo_curator.stages.synthetic.nemotron_cc.nemo_data_designer.nemotron_cc.DiverseQAStage(
config_builder: data_designer.config.DataDesignerConfigBuilder | None = None,
data_designer_config_file: str | None = None,
model_providers: list | None = None,
verbose: bool = False,
system_prompt: str = NEMOTRON_CC_SYSTEM_PROMPT,
prompt: str = DIVERSE_QA_PROMPT_TEMPLATE,
input_field: str = 'text',
output_field: str = 'diverse_qa',
model_alias: str | None = None,
model_configs: list | None = None
)
Dataclass

Bases: NDDBaseSyntheticStage

input_field
str = 'text'
output_field
str = 'diverse_qa'
prompt
str = DIVERSE_QA_PROMPT_TEMPLATE
system_prompt
str = NEMOTRON_CC_SYSTEM_PROMPT
class nemo_curator.stages.synthetic.nemotron_cc.nemo_data_designer.nemotron_cc.ExtractKnowledgeStage(
config_builder: data_designer.config.DataDesignerConfigBuilder | None = None,
data_designer_config_file: str | None = None,
model_providers: list | None = None,
verbose: bool = False,
system_prompt: str = NEMOTRON_CC_SYSTEM_PROMPT,
prompt: str = EXTRACT_KNOWLEDGE_PROMPT_TEMPLATE,
input_field: str = 'text',
output_field: str = 'extract_knowledge',
model_alias: str | None = None,
model_configs: list | None = None
)
Dataclass

Bases: NDDBaseSyntheticStage

input_field
str = 'text'
output_field
str = 'extract_knowledge'
prompt
str = EXTRACT_KNOWLEDGE_PROMPT_TEMPLATE
system_prompt
str = NEMOTRON_CC_SYSTEM_PROMPT
class nemo_curator.stages.synthetic.nemotron_cc.nemo_data_designer.nemotron_cc.KnowledgeListStage(
config_builder: data_designer.config.DataDesignerConfigBuilder | None = None,
data_designer_config_file: str | None = None,
model_providers: list | None = None,
verbose: bool = False,
system_prompt: str = NEMOTRON_CC_SYSTEM_PROMPT,
prompt: str = KNOWLEDGE_LIST_PROMPT_TEMPLATE,
input_field: str = 'text',
output_field: str = 'knowledge_list',
model_alias: str | None = None,
model_configs: list | None = None
)
Dataclass

Bases: NDDBaseSyntheticStage

input_field
str = 'text'
output_field
str = 'knowledge_list'
prompt
str = KNOWLEDGE_LIST_PROMPT_TEMPLATE
system_prompt
str = NEMOTRON_CC_SYSTEM_PROMPT
class nemo_curator.stages.synthetic.nemotron_cc.nemo_data_designer.nemotron_cc.WikipediaParaphrasingStage(
config_builder: data_designer.config.DataDesignerConfigBuilder | None = None,
data_designer_config_file: str | None = None,
model_providers: list | None = None,
verbose: bool = False,
system_prompt: str = NEMOTRON_CC_SYSTEM_PROMPT,
prompt: str = WIKIPEDIA_REPHRASING_PROMPT_TEMPLATE,
input_field: str = 'text',
output_field: str = 'rephrased',
model_alias: str | None = None,
model_configs: list | None = None
)
Dataclass

Bases: NDDBaseSyntheticStage

input_field
str = 'text'
output_field
str = 'rephrased'
prompt
str = WIKIPEDIA_REPHRASING_PROMPT_TEMPLATE
system_prompt
str = NEMOTRON_CC_SYSTEM_PROMPT