synthetic.nemotron_cc

Module Contents

Classes

API

class synthetic.nemotron_cc.NemotronCCDiverseQAPostprocessor(
tokenizer: transformers.AutoTokenizer | None = None,
text_field: str = 'text',
response_field: str = 'response',
max_num_pairs: int = 1,
prefix: str = 'Here are the questions and answers based on the provided text:',
)

Bases: nemo_curator.BaseModule

Initialization

call(
dataset: nemo_curator.datasets.DocumentDataset,
) -> nemo_curator.datasets.DocumentDataset
class synthetic.nemotron_cc.NemotronCCGenerator(llm_client: nemo_curator.services.LLMClient)

Initialization

distill(
document: str,
model: str,
prompt_template: str = DISTILL_PROMPT_TEMPLATE,
system_prompt: str = NEMOTRON_CC_DISTILL_SYSTEM_PROMPT,
prompt_kwargs: dict | None = None,
model_kwargs: dict | None = None,
) -> list[str]
extract_knowledge(
document: str,
model: str,
prompt_template: str = EXTRACT_KNOWLEDGE_PROMPT_TEMPLATE,
system_prompt: str = NEMOTRON_CC_SYSTEM_PROMPT,
prompt_kwargs: dict | None = None,
model_kwargs: dict | None = None,
) -> list[str]
generate_diverse_qa(
document: str,
model: str,
prompt_template: str = DIVERSE_QA_PROMPT_TEMPLATE,
system_prompt: str = NEMOTRON_CC_SYSTEM_PROMPT,
prompt_kwargs: dict | None = None,
model_kwargs: dict | None = None,
) -> list[str]
generate_knowledge_list(
document: str,
model: str,
prompt_template: str = KNOWLEDGE_LIST_PROMPT_TEMPLATE,
system_prompt: str = NEMOTRON_CC_SYSTEM_PROMPT,
prompt_kwargs: dict | None = None,
model_kwargs: dict | None = None,
) -> list[str]
rewrite_to_wikipedia_style(
document: str,
model: str,
prompt_template: str = WIKIPEDIA_REPHRASING_PROMPT_TEMPLATE,
system_prompt: str = NEMOTRON_CC_SYSTEM_PROMPT,
prompt_kwargs: dict | None = None,
model_kwargs: dict | None = None,
) -> list[str]
class synthetic.nemotron_cc.NemotronCCKnowledgeListPostprocessor(text_field: str = 'text')

Bases: nemo_curator.BaseModule

Initialization

call(
dataset: nemo_curator.datasets.DocumentDataset,
) -> nemo_curator.datasets.DocumentDataset