NeMo-Curator
Table of Contents
About NeMo Curator
Get Started
Curate Text
Curate Images
Setup & Deployment
Reference
datasets
datasets.doc_dataset
datasets.image_text_pair_dataset
datasets.parallel_dataset
download
download.arxiv
download.commoncrawl
download.doc_builder
download.ja_stopwords
download.th_stopwords
download.wikipedia
download.zh_stopwords
filters
filters.models
filters.models.qe_models
filters.bitext_filter
filters.classifier_filter
filters.code
filters.doc_filter
filters.heuristic_filter
filters.synthetic
modifiers
modifiers.async_llm_pii_modifier
modifiers.c4
modifiers.doc_modifier
modifiers.fasttext
modifiers.line_remover
modifiers.llm_pii_modifier
modifiers.markdown_remover
modifiers.newline_normalizer
modifiers.pii_modifier
modifiers.quotation_remover
modifiers.slicer
modifiers.unicode_reformatter
modifiers.url_remover
modules
modules.fuzzy_dedup
modules.fuzzy_dedup.bucketstoedges
modules.fuzzy_dedup.connectedcomponents
modules.fuzzy_dedup.fuzzyduplicates
modules.fuzzy_dedup.jaccardsimilarity
modules.fuzzy_dedup.lsh
modules.fuzzy_dedup.minhash
modules.semantic_dedup
modules.semantic_dedup.clusteringmodel
modules.semantic_dedup.embeddings
modules.semantic_dedup.semanticclusterleveldedup
modules.semantic_dedup.semdedup
modules.add_id
modules.base
modules.config
modules.dataset_ops
modules.exact_dedup
modules.filter
modules.joiner
modules.meta
modules.modify
modules.splitter
modules.task
modules.to_backend
classifiers
classifiers.aegis
classifiers.base
classifiers.content_type
classifiers.domain
classifiers.fineweb_edu
classifiers.prompt_task_complexity
classifiers.quality
image
image.classifiers
image.classifiers.aesthetic
image.classifiers.base
image.classifiers.nsfw
image.embedders
image.embedders.base
image.embedders.timm
pii
pii.recognizers
pii.recognizers.address_recognizer
pii.algorithm
pii.constants
pii.custom_batch_analyzer_engine
pii.custom_nlp_engine
synthetic
synthetic.async_nemotron
synthetic.async_nemotron_cc
synthetic.error
synthetic.generator
synthetic.mixtral
synthetic.nemotron
synthetic.nemotron_cc
synthetic.no_format
synthetic.prompts
services
services.conversation_formatter
services.model_client
services.nemo_client
services.openai_client
nemo_run
nemo_run.slurm
tasks
tasks.downstream_task
tasks.metrics
utils
utils.fuzzy_dedup_utils
utils.fuzzy_dedup_utils.id_mapping
utils.fuzzy_dedup_utils.io_utils
utils.fuzzy_dedup_utils.merge_utils
utils.fuzzy_dedup_utils.output_map_utils
utils.fuzzy_dedup_utils.shuffle_utils
utils.image
utils.image.transforms
utils.aegis_utils
utils.config_utils
utils.constants
utils.decorators
utils.distributed_utils
utils.download_utils
utils.duplicates_removal
utils.file_utils
utils.gpu_utils
utils.import_utils
utils.llm_pii_utils
utils.module_utils
utils.script_utils
utils.semdedup_utils
utils.text_utils