Index A | B | C | D | E | F | G | H | I | J | K | L | M | N | O | P | Q | R | S | T | U | V | W | X | Y | Z A ACCESS_ERROR_MESSAGE (in module classifiers.aegis) add_arg_autocast() (utils.script_utils.ArgumentHelper method) add_arg_batch_size() (utils.script_utils.ArgumentHelper method) add_arg_device() (utils.script_utils.ArgumentHelper method) add_arg_enable_spilling() (utils.script_utils.ArgumentHelper method) add_arg_id_column() (utils.script_utils.ArgumentHelper method) add_arg_id_column_type() (utils.script_utils.ArgumentHelper method) add_arg_input_data_dir() (utils.script_utils.ArgumentHelper method) add_arg_input_file_extension() (utils.script_utils.ArgumentHelper method) add_arg_input_file_type() (utils.script_utils.ArgumentHelper method) add_arg_input_local_data_dir() (utils.script_utils.ArgumentHelper method) add_arg_input_meta() (utils.script_utils.ArgumentHelper method) add_arg_input_text_field() (utils.script_utils.ArgumentHelper method) add_arg_language() (utils.script_utils.ArgumentHelper method) add_arg_log_dir() (utils.script_utils.ArgumentHelper method) add_arg_max_chars() (utils.script_utils.ArgumentHelper method) add_arg_max_mem_gb_classifier() (utils.script_utils.ArgumentHelper method) add_arg_minhash_length() (utils.script_utils.ArgumentHelper method) add_arg_model_path() (utils.script_utils.ArgumentHelper method) add_arg_nvlink_only() (utils.script_utils.ArgumentHelper method) add_arg_output_data_dir() (utils.script_utils.ArgumentHelper method) add_arg_output_dir() (utils.script_utils.ArgumentHelper method) add_arg_output_file_type() (utils.script_utils.ArgumentHelper method) add_arg_output_train_file() (utils.script_utils.ArgumentHelper method) add_arg_protocol() (utils.script_utils.ArgumentHelper method) add_arg_rmm_pool_size() (utils.script_utils.ArgumentHelper method) add_arg_scheduler_address() (utils.script_utils.ArgumentHelper method) add_arg_scheduler_file() (utils.script_utils.ArgumentHelper method) add_arg_seed() (utils.script_utils.ArgumentHelper method) add_arg_set_torch_to_use_rmm() (utils.script_utils.ArgumentHelper method) add_arg_shuffle() (utils.script_utils.ArgumentHelper method) add_arg_text_ddf_blocksize() (utils.script_utils.ArgumentHelper method) add_custom_operator() (pii.algorithm.PiiDeidentifier method) add_custom_recognizer() (pii.algorithm.PiiDeidentifier method) add_distributed_args() (utils.script_utils.ArgumentHelper method) add_distributed_classifier_cluster_args() (utils.script_utils.ArgumentHelper method) add_instruction_data_guard (classifiers.aegis.AegisConfig attribute) add_l2_cosine_dist_to_centroid() (in module utils.semdedup_utils) AddId (class in modules.add_id) AddressRecognizer (class in pii.recognizers.address_recognizer) AEGIS_LABELS (in module classifiers.aegis) AegisClassifier (class in classifiers.aegis) AegisConfig (class in classifiers.aegis) AegisHFModel (class in classifiers.aegis) AegisModel (class in classifiers.aegis) AestheticClassifier (class in image.classifiers.aesthetic) aggregated_anchor_docs_with_bk_read() (in module utils.fuzzy_dedup_utils.io_utils) AlphaFilter (class in filters.code) analyze() (pii.recognizers.address_recognizer.AddressRecognizer method) analyze_batch() (pii.custom_batch_analyzer_engine.CustomBatchAnalyzerEngine method) analyze_dict() (pii.custom_batch_analyzer_engine.CustomBatchAnalyzerEngine method) analyze_iterator() (pii.custom_batch_analyzer_engine.CustomBatchAnalyzerEngine method) analyze_text() (pii.algorithm.PiiDeidentifier method) analyze_text_batch() (pii.algorithm.PiiDeidentifier method) ANLI (class in tasks.metrics) AnswerabilityFilter (class in filters.synthetic) apply_bk_mapping() (in module utils.fuzzy_dedup_utils.merge_utils) ArcChallenge (class in tasks.metrics) ArcEasy (class in tasks.metrics) ArgumentHelper (class in utils.script_utils) ArxivDownloader (class in download.arxiv) ArxivExtractor (class in download.arxiv) ArxivIterator (class in download.arxiv) AsyncLLMClient (class in services.model_client) AsyncLLMInference (class in modifiers.async_llm_pii_modifier) AsyncLLMPiiModifier (class in modifiers.async_llm_pii_modifier) AsyncNemotronCCGenerator (class in synthetic.async_nemotron_cc) AsyncNemotronGenerator (class in synthetic.async_nemotron) AsyncOpenAIClient (class in services.openai_client) attach_bool_arg() (utils.script_utils.ArgumentHelper static method) attach_version_arg() (utils.script_utils.ArgumentHelper method) B backend (filters.doc_filter.DocumentFilter property) (modifiers.doc_modifier.DocumentModifier property) base_model (classifiers.domain.DomainModelConfig attribute) (classifiers.prompt_task_complexity.PromptTaskComplexityConfig attribute) BaseConfig (class in modules.config) BaseDeduplicationModule (class in modules.base) BaseModule (class in modules.base) batch_download() (in module download.doc_builder) batch_redact() (modifiers.async_llm_pii_modifier.AsyncLLMPiiModifier method) batched() (in module utils.decorators) batched_cosine_similarity (modules.config.SemDedupConfig attribute) BigBenchHard (class in tasks.metrics) BigBenchLight (class in tasks.metrics) BIT_WIDTH_32 (in module modules.fuzzy_dedup.minhash) BIT_WIDTH_64 (in module modules.fuzzy_dedup.minhash) BitextFilter (class in filters.bitext_filter) blend_datasets() (in module modules.dataset_ops) blockwise_merge() (in module utils.fuzzy_dedup_utils.merge_utils) BoilerPlateStringFilter (class in filters.heuristic_filter) BoilerPlateStringModifier (class in modifiers.c4) BoolQ (class in tasks.metrics) bucket_id_to_int() (modules.fuzzy_dedup.lsh.LSH method) bucket_mapping_blocksize (modules.config.FuzzyDuplicatesConfig attribute) bucket_parts_per_worker (modules.config.FuzzyDuplicatesConfig attribute) buckets_per_shuffle (modules.config.FuzzyDuplicatesConfig attribute) buckets_to_edges() (modules.fuzzy_dedup.bucketstoedges.BucketsToEdges method) BucketsToEdges (class in modules.fuzzy_dedup.bucketstoedges) (in module modules) build_downloader() (in module utils.config_utils) build_filter() (in module utils.config_utils) build_filter_pipeline() (in module utils.config_utils) build_partition() (in module utils.fuzzy_dedup_utils.output_map_utils) bullet_list (in module utils.constants) BulletsFilter (class in filters.heuristic_filter) C cache_dir (modules.config.FuzzyDuplicatesConfig attribute) (modules.config.SemDedupConfig attribute) call() (classifiers.base.DistributedDataClassifier method) (modules.add_id.AddId method) (modules.base.BaseDeduplicationModule method) (modules.base.BaseModule method) (modules.dataset_ops.Shuffle method) (modules.filter.Filter method) (modules.filter.ParallelScoreFilter method) (modules.filter.Score method) (modules.filter.ScoreFilter method) (modules.joiner.DocumentJoiner method) (modules.modify.Modify method) (modules.splitter.DocumentSplitter method) (modules.task.TaskDecontamination method) (modules.to_backend.ToBackend method) (synthetic.nemotron_cc.NemotronCCDiverseQAPostprocessor method) (synthetic.nemotron_cc.NemotronCCKnowledgeListPostprocessor method) call_inferer() (modifiers.async_llm_pii_modifier.AsyncLLMPiiModifier method) CAT_ALIASES (in module download.wikipedia) CB (class in tasks.metrics) cc_workflow() (modules.fuzzy_dedup.connectedcomponents.ConnectedComponents method) char_ngrams (modules.config.FuzzyDuplicatesConfig attribute) check_dask_cwd() (in module utils.distributed_utils) check_empty_buckets() (in module utils.fuzzy_dedup_utils.io_utils) chunk_files() (in module utils.fuzzy_dedup_utils.io_utils) classifiers module classifiers.aegis module classifiers.base module classifiers.content_type module classifiers.domain module classifiers.fineweb_edu module classifiers.prompt_task_complexity module classifiers.quality module classify_math_entity() (synthetic.async_nemotron.AsyncNemotronGenerator method) (synthetic.nemotron.NemotronGenerator method) classify_python_entity() (synthetic.async_nemotron.AsyncNemotronGenerator method) (synthetic.nemotron.NemotronGenerator method) clustering_input_partition_size (modules.config.SemDedupConfig attribute) clustering_save_loc (modules.config.SemDedupConfig attribute) ClusteringModel (class in modules.semantic_dedup.clusteringmodel) (in module modules) comet (in module filters.models.qe_models) COMET_IMPORT_MSG (in module filters.models.qe_models) COMETQEModel (class in filters.models.qe_models) common_english_words (in module utils.constants) CommonCrawlWARCDownloader (class in download.commoncrawl) CommonCrawlWARCDownloaderExtractOnly (class in download.commoncrawl) CommonCrawlWARCExtractor (class in download.commoncrawl) CommonCrawlWARCIterator (class in download.commoncrawl) CommonEnglishWordsFilter (class in filters.heuristic_filter) compute_filter_mask() (modules.filter.Filter method) (modules.filter.ScoreFilter method) compute_results() (classifiers.prompt_task_complexity.CustomHFDeberta method) compute_semantic_match_dfs() (modules.semantic_dedup.semanticclusterleveldedup.SemanticClusterLevelDedup method) configure_forward() (classifiers.fineweb_edu.FinewebEduModel static method) ConnectedComponents (class in modules.fuzzy_dedup.connectedcomponents) (in module modules) container_entrypoint (nemo_run.slurm.SlurmJobConfig attribute) CONTENT_TYPE_IDENTIFIER (in module classifiers.content_type) ContentTypeClassifier (class in classifiers.content_type) ContentTypeModel (class in classifiers.content_type) ContentTypeModelConfig (class in classifiers.content_type) ConversationFormatter (class in services.conversation_formatter) convert_response_to_yaml_list() (synthetic.async_nemotron.AsyncNemotronGenerator method) (synthetic.nemotron.NemotronGenerator method) convert_str_id_to_int() (in module utils.fuzzy_dedup_utils.id_mapping) convert_transforms_to_dali() (in module utils.image.transforms) Copa (class in tasks.metrics) COQA (class in tasks.metrics) COSINE_DIST_TO_CENT_COL (in module utils.semdedup_utils) count_digits() (in module utils.module_utils) cpu_worker_memory_limit (nemo_run.slurm.SlurmJobConfig attribute) create_client() (in module filters.synthetic) create_embeddings() (modules.semantic_dedup.embeddings.EmbeddingCreator method) cudf_spill (nemo_run.slurm.SlurmJobConfig attribute) CustomBatchAnalyzerEngine (class in pii.custom_batch_analyzer_engine) CustomHFDeberta (class in classifiers.prompt_task_complexity) CustomNlpEngine (class in pii.custom_nlp_engine) D dask_cuda_version (in module utils.fuzzy_dedup_utils.shuffle_utils) dask_cudf (in module utils.fuzzy_dedup_utils.output_map_utils) datasets module datasets.doc_dataset module datasets.image_text_pair_dataset module datasets.parallel_dataset module decode_html() (in module download.commoncrawl) deduplicate_groups() (in module utils.duplicates_removal) DEFAULT_BATCH_SIZE (in module modifiers.pii_modifier) DEFAULT_CLOSED_QA_PROMPT_TEMPLATE (in module synthetic.prompts) default_filename() (in module modules.dataset_ops) DEFAULT_LANGUAGE (in module pii.constants) DEFAULT_MACRO_TOPICS_PROMPT_TEMPLATE (in module synthetic.prompts) DEFAULT_MATH_CLASSIFICATION_PROMPT_TEMPLATE (in module synthetic.prompts) DEFAULT_MATH_MACRO_TOPICS_PROMPT_TEMPLATE (in module synthetic.prompts) DEFAULT_MATH_SUBTOPICS_PROMPT_TEMPLATE (in module synthetic.prompts) DEFAULT_MAX_DOC_SIZE (in module pii.constants) DEFAULT_OPEN_QA_FROM_TOPICS_PROMPT_TEMPLATE (in module synthetic.prompts) DEFAULT_PYTHON_CLASSIFICATION_PROMPT_TEMPLATE (in module synthetic.prompts) DEFAULT_PYTHON_MACRO_TOPICS_PROMPT_TEMPLATE (in module synthetic.prompts) DEFAULT_PYTHON_SUBTOPICS_PROMPT_TEMPLATE (in module synthetic.prompts) DEFAULT_REVISE_OPEN_QA_PROMPT_TEMPLATE (in module synthetic.prompts) DEFAULT_REVISE_WRITING_TASK_PROMPT_TEMPLATE (in module synthetic.prompts) DEFAULT_SUBTOPICS_PROMPT_TEMPLATE (in module synthetic.prompts) DEFAULT_WRITING_TASK_PROMPT_TEMPLATE (in module synthetic.prompts) DEFAULT_YAML_CONVERSION_PROMPT_TEMPLATE (in module synthetic.prompts) deidentify_text() (pii.algorithm.PiiDeidentifier method) deidentify_text_batch() (pii.algorithm.PiiDeidentifier method) device (nemo_run.slurm.SlurmJobConfig attribute) DIALOGUE_COMPLEX_USER_TURN_PROMPT_TEMPLATE (in module synthetic.prompts) DIALOGUE_CONCISE_USER_TURN_PROMPT_TEMPLATE (in module synthetic.prompts) DIALOGUE_NORMAL_USER_TURN_PROMPT_TEMPLATE (in module synthetic.prompts) distill() (synthetic.async_nemotron_cc.AsyncNemotronCCGenerator method) (synthetic.nemotron_cc.NemotronCCGenerator method) DISTILL_PROMPT_TEMPLATE (in module synthetic.prompts) DistributedDataClassifier (class in classifiers.base) DIVERSE_QA_PROMPT_TEMPLATE (in module synthetic.prompts) DocumentDataset (class in datasets.doc_dataset) DocumentDownloader (class in download.doc_builder) DocumentExtractor (class in download.doc_builder) DocumentFilter (class in filters.doc_filter) DocumentIterator (class in download.doc_builder) DocumentJoiner (class in modules.joiner) DocumentModifier (class in modifiers.doc_modifier) DocumentSplitter (class in modules.splitter) DOMAIN_BASE_MODEL (in module classifiers.domain) DOMAIN_IDENTIFIER (in module classifiers.domain) DomainClassifier (class in classifiers.domain) DomainModel (class in classifiers.domain) DomainModelConfig (class in classifiers.domain) download module download() (download.arxiv.ArxivDownloader method) (download.commoncrawl.CommonCrawlWARCDownloader method) (download.commoncrawl.CommonCrawlWARCDownloaderExtractOnly method) (download.doc_builder.DocumentDownloader method) (download.wikipedia.WikipediaDownloader method) download.arxiv module download.commoncrawl module download.doc_builder module download.ja_stopwords module download.th_stopwords module download.wikipedia module download.zh_stopwords module download_and_extract() (in module download.doc_builder) download_arxiv() (in module download.arxiv) download_common_crawl() (in module download.commoncrawl) download_wikipedia() (in module download.wikipedia) DownstreamTask (class in tasks.downstream_task) Drop (class in tasks.metrics) dtype (classifiers.aegis.AegisConfig attribute) E EasinessFilter (class in filters.synthetic) ellipsis_marks (in module utils.constants) EllipsisFilter (class in filters.heuristic_filter) embedding_batch_size (modules.config.SemDedupConfig attribute) embedding_column (modules.config.SemDedupConfig attribute) embedding_max_mem_gb (modules.config.SemDedupConfig attribute) embedding_model_name_or_path (modules.config.SemDedupConfig attribute) embedding_pooling_strategy (modules.config.SemDedupConfig attribute) EmbeddingConfig (class in modules.semantic_dedup.embeddings) EmbeddingCreator (class in modules.semantic_dedup.embeddings) (in module modules) EmbeddingCrossFitModel (class in modules.semantic_dedup.embeddings) EmbeddingPytorchModel (class in modules.semantic_dedup.embeddings) embeddings_save_loc (modules.config.SemDedupConfig attribute) end_marks (in module utils.constants) end_position (utils.llm_pii_utils.EntitySpan attribute) entity_type (utils.llm_pii_utils.EntitySpan attribute) EntitySpan (class in utils.llm_pii_utils) eps_to_extract (modules.config.SemDedupConfig attribute) ERROR_MESSAGE (in module utils.image.transforms) ExactDuplicates (class in modules.exact_dedup) expand_outdir_and_mkdir() (in module utils.file_utils) extract() (download.arxiv.ArxivExtractor method) (download.commoncrawl.CommonCrawlWARCExtractor method) (download.doc_builder.DocumentExtractor method) (download.wikipedia.WikipediaExtractor method) extract_dedup_data() (modules.semantic_dedup.semanticclusterleveldedup.SemanticClusterLevelDedup method) extract_knowledge() (synthetic.async_nemotron_cc.AsyncNemotronCCGenerator method) (synthetic.nemotron_cc.NemotronCCGenerator method) EXTRACT_KNOWLEDGE_PROMPT_TEMPLATE (in module synthetic.prompts) extract_partitioning_index() (in module utils.fuzzy_dedup_utils.merge_utils) extract_text() (download.commoncrawl.HTMLExtractorAlgorithm method) (download.commoncrawl.JusTextExtractor method) (download.commoncrawl.ResiliparseExtractor method) (download.commoncrawl.TrafilaturaExtractor method) F false_positive_check (modules.config.FuzzyDuplicatesConfig attribute) FastTextLabelModifier (class in modifiers.fasttext) FastTextLangId (class in filters.classifier_filter) FastTextQualityFilter (class in filters.classifier_filter) fc_dropout (classifiers.content_type.ContentTypeModelConfig attribute) (classifiers.domain.DomainModelConfig attribute) (classifiers.quality.QualityModelConfig attribute) feature() (modules.semantic_dedup.embeddings.EmbeddingPytorchModel method) Filter (class in modules.filter) filter_files_by_extension() (in module utils.file_utils) filter_text_rows_by_bucket_batch() (in module utils.fuzzy_dedup_utils.merge_utils) filters module filters.bitext_filter module filters.classifier_filter module filters.code module filters.doc_filter module filters.heuristic_filter module filters.models module filters.models.qe_models module filters.synthetic module find_entity_spans() (in module utils.llm_pii_utils) find_matching_ngrams() (modules.task.TaskDecontamination method) FINEWEB_EDU_IDENTIFIER (in module classifiers.fineweb_edu) FINEWEB_MIXTRAL_IDENTIFIER (in module classifiers.fineweb_edu) FINEWEB_NEMOTRON_IDENTIFIER (in module classifiers.fineweb_edu) FineWebEduClassifier (class in classifiers.fineweb_edu) FinewebEduModel (class in classifiers.fineweb_edu) FineWebMixtralEduClassifier (class in classifiers.fineweb_edu) FineWebNemotronEduClassifier (class in classifiers.fineweb_edu) fix_overlaps() (in module utils.llm_pii_utils) format_aegis() (in module utils.aegis_utils) format_conversation() (services.conversation_formatter.ConversationFormatter method) (synthetic.mixtral.Mixtral8x7BFormatter static method) (synthetic.nemotron.NemotronFormatter static method) (synthetic.no_format.NoFormat method) forward() (classifiers.aegis.AegisModel method) (classifiers.aegis.InstructionDataGuardNet method) (classifiers.base.HFDeberta method) (classifiers.prompt_task_complexity.CustomHFDeberta method) (classifiers.prompt_task_complexity.MeanPooling method) (classifiers.prompt_task_complexity.MulticlassHead method) (image.classifiers.aesthetic.MLP method) (image.classifiers.nsfw.Normalization method) (image.classifiers.nsfw.NSFWModel method) (modules.semantic_dedup.embeddings.EmbeddingPytorchModel method) from_config() (pii.algorithm.PiiDeidentifier static method) from_default_config() (pii.algorithm.PiiDeidentifier static method) from_pandas() (datasets.doc_dataset.DocumentDataset class method) from_webdataset() (datasets.image_text_pair_dataset.ImageTextPairDataset class method) from_yaml() (modules.config.BaseConfig class method) from_yaml_file() (pii.algorithm.PiiDeidentifier static method) FuzzyDuplicates (class in modules.fuzzy_dedup.fuzzyduplicates) (in module modules) FuzzyDuplicatesConfig (class in modules.config) G GeneralCommentToCodeFilter (class in filters.code) generate() (synthetic.generator.SyntheticDataGenerator method) generate_closed_qa_instructions() (synthetic.async_nemotron.AsyncNemotronGenerator method) (synthetic.nemotron.NemotronGenerator method) generate_dialogue() (synthetic.async_nemotron.AsyncNemotronGenerator method) (synthetic.nemotron.NemotronGenerator method) generate_diverse_qa() (synthetic.async_nemotron_cc.AsyncNemotronCCGenerator method) (synthetic.nemotron_cc.NemotronCCGenerator method) generate_hash_permutation_seeds() (modules.fuzzy_dedup.minhash.MinHash method) generate_knowledge_list() (synthetic.async_nemotron_cc.AsyncNemotronCCGenerator method) (synthetic.nemotron_cc.NemotronCCGenerator method) generate_macro_topics() (synthetic.async_nemotron.AsyncNemotronGenerator method) (synthetic.nemotron.NemotronGenerator method) generate_math_macro_topics() (synthetic.async_nemotron.AsyncNemotronGenerator method) (synthetic.nemotron.NemotronGenerator method) generate_math_problem() (synthetic.async_nemotron.AsyncNemotronGenerator method) (synthetic.nemotron.NemotronGenerator method) generate_math_subtopics() (synthetic.async_nemotron.AsyncNemotronGenerator method) (synthetic.nemotron.NemotronGenerator method) generate_ngrams() (tasks.downstream_task.DownstreamTask method) (tasks.metrics.ANLI method) (tasks.metrics.ArcChallenge method) (tasks.metrics.ArcEasy method) (tasks.metrics.BigBenchHard method) (tasks.metrics.BigBenchLight method) (tasks.metrics.BoolQ method) (tasks.metrics.CB method) (tasks.metrics.Copa method) (tasks.metrics.COQA method) (tasks.metrics.Drop method) (tasks.metrics.Lambada method) (tasks.metrics.MMLU method) (tasks.metrics.Multilingual method) (tasks.metrics.MultiRC method) (tasks.metrics.NumDasc method) (tasks.metrics.OpenBookQA method) (tasks.metrics.PIQA method) (tasks.metrics.Quac method) (tasks.metrics.Race method) (tasks.metrics.Record method) (tasks.metrics.RTE method) (tasks.metrics.Squad method) (tasks.metrics.StoryCloze method) (tasks.metrics.TriviaQA method) (tasks.metrics.WebQA method) (tasks.metrics.WiC method) (tasks.metrics.Winogrande method) (tasks.metrics.WSC method) generate_open_qa_from_topic() (synthetic.async_nemotron.AsyncNemotronGenerator method) (synthetic.nemotron.NemotronGenerator method) generate_python_macro_topics() (synthetic.async_nemotron.AsyncNemotronGenerator method) (synthetic.nemotron.NemotronGenerator method) generate_python_problem() (synthetic.async_nemotron.AsyncNemotronGenerator method) (synthetic.nemotron.NemotronGenerator method) generate_python_subtopics() (synthetic.async_nemotron.AsyncNemotronGenerator method) (synthetic.nemotron.NemotronGenerator method) generate_subtopics() (synthetic.async_nemotron.AsyncNemotronGenerator method) (synthetic.nemotron.NemotronGenerator method) generate_two_turn_prompt() (synthetic.async_nemotron.AsyncNemotronGenerator method) (synthetic.nemotron.NemotronGenerator method) generate_writing_tasks() (synthetic.async_nemotron.AsyncNemotronGenerator method) (synthetic.nemotron.NemotronGenerator method) get_agg_text_bytes_df() (in module utils.fuzzy_dedup_utils.output_map_utils) get_all_files_paths_under() (in module utils.file_utils) get_all_stop_words() (in module download.commoncrawl) get_array_from_df() (in module utils.semdedup_utils) get_arxiv_urls() (in module utils.download_utils) get_batched_files() (in module utils.file_utils) get_bucket_ddf_from_parquet_path() (in module utils.fuzzy_dedup_utils.io_utils) get_client() (in module utils.distributed_utils) get_comments() (in module utils.text_utils) get_comments_and_docstring() (in module utils.text_utils) get_common_crawl_snapshot_index() (in module utils.download_utils) get_common_crawl_urls() (in module utils.download_utils) get_current_client() (in module utils.distributed_utils) get_device_total_memory (in module utils.distributed_utils) get_docstrings() (in module utils.text_utils) get_file_size() (in module utils.fuzzy_dedup_utils.io_utils) get_filepath_without_extension() (in module utils.distributed_utils) get_frag_size() (in module utils.fuzzy_dedup_utils.io_utils) get_gpu_memory_info() (in module utils.distributed_utils) get_labels() (classifiers.base.DistributedDataClassifier method) (classifiers.prompt_task_complexity.PromptTaskComplexityClassifier method) get_main_warc_paths() (in module utils.download_utils) get_network_interfaces() (in module utils.distributed_utils) get_news_warc_paths() (in module utils.download_utils) get_ngrams() (in module utils.text_utils) get_num_workers() (in module utils.distributed_utils) get_paragraphs() (in module utils.text_utils) get_remaining_files() (in module utils.file_utils) get_restart_offsets() (in module utils.fuzzy_dedup_utils.io_utils) get_semantic_matches_per_cluster() (in module utils.semdedup_utils) get_sentences() (in module utils.text_utils) get_shuffle_part_ids_df() (in module utils.fuzzy_dedup_utils.shuffle_utils) get_stop_list_dict() (in module download.commoncrawl) get_system_prompt() (in module utils.llm_pii_utils) get_text_ddf_from_json_path_with_blocksize() (in module utils.fuzzy_dedup_utils.io_utils) get_wikipedia_urls() (in module utils.download_utils) get_word_splitter() (in module utils.text_utils) get_words() (in module utils.text_utils) GPU_INSTALL_STRING (in module utils.gpu_utils) H hash_documents() (modules.exact_dedup.ExactDuplicates method) hashes_per_bucket (modules.config.FuzzyDuplicatesConfig attribute) head() (datasets.doc_dataset.DocumentDataset method) HFDeberta (class in classifiers.base) HistogramFilter (class in filters.heuristic_filter) HTMLBoilerplateFilter (class in filters.code) HTMLExtractorAlgorithm (class in download.commoncrawl) I id_field (modules.config.FuzzyDuplicatesConfig attribute) identifier (classifiers.domain.DomainModelConfig attribute) identify_duplicates() (modules.base.BaseDeduplicationModule method) (modules.exact_dedup.ExactDuplicates method) (modules.fuzzy_dedup.fuzzyduplicates.FuzzyDuplicates method) (modules.semantic_dedup.semdedup.SemDedup method) image module image.classifiers module image.classifiers.aesthetic module image.classifiers.base module image.classifiers.nsfw module image.embedders module image.embedders.base module image.embedders.timm module IMAGE_INSTALL_STRING (in module utils.import_utils) image_only_import_from() (in module utils.import_utils) ImageClassifier (class in image.classifiers.base) ImageEmbedder (class in image.embedders.base) ImageTextPairDataset (class in datasets.image_text_pair_dataset) (in module datasets) import_downloader() (in module download.doc_builder) import_extractor() (in module download.doc_builder) import_filter() (in module filters.doc_filter) import_iterator() (in module download.doc_builder) import_task() (in module tasks.downstream_task) infer() (modifiers.async_llm_pii_modifier.AsyncLLMInference method) (modifiers.llm_pii_modifier.LLMInference method) instruction_data_guard_path (classifiers.aegis.AegisConfig attribute) InstructionDataGuardClassifier (class in classifiers.aegis) InstructionDataGuardNet (class in classifiers.aegis) int_ids_to_str() (in module utils.fuzzy_dedup_utils.id_mapping) interface (nemo_run.slurm.SlurmJobConfig attribute) is_batched() (in module utils.module_utils) is_cudf_type() (in module utils.gpu_utils) is_paragraph_indices_in_top_or_bottom_only() (in module utils.text_utils) is_unavailable() (in module utils.import_utils) iterate() (download.arxiv.ArxivIterator method) (download.commoncrawl.CommonCrawlWARCIterator method) (download.doc_builder.DocumentIterator method) (download.wikipedia.WikipediaIterator method) J ja_stopwords (in module download.ja_stopwords) jaccard_compute() (modules.fuzzy_dedup.jaccardsimilarity.JaccardSimilarity method) jaccard_threshold (modules.config.FuzzyDuplicatesConfig attribute) JaccardSimilarity (class in modules.fuzzy_dedup.jaccardsimilarity) (in module modules) job_dir (nemo_run.slurm.SlurmJobConfig attribute) JSON_SCHEMA (in module utils.llm_pii_utils) JusTextExtractor (class in download.commoncrawl) K keep_bitext() (filters.bitext_filter.BitextFilter method) (filters.classifier_filter.QualityEstimationFilter method) (filters.heuristic_filter.LengthRatioFilter method) keep_document() (filters.classifier_filter.FastTextLangId method) (filters.classifier_filter.FastTextQualityFilter method) (filters.code.AlphaFilter method) (filters.code.GeneralCommentToCodeFilter method) (filters.code.HTMLBoilerplateFilter method) (filters.code.NumberOfLinesOfCodeFilter method) (filters.code.PerExtensionFilter method) (filters.code.PythonCommentToCodeFilter method) (filters.code.TokenizerFertilityFilter method) (filters.code.XMLHeaderFilter method) (filters.doc_filter.DocumentFilter method) (filters.heuristic_filter.BoilerPlateStringFilter method) (filters.heuristic_filter.BulletsFilter method) (filters.heuristic_filter.CommonEnglishWordsFilter method) (filters.heuristic_filter.EllipsisFilter method) (filters.heuristic_filter.HistogramFilter method) (filters.heuristic_filter.LongWordFilter method) (filters.heuristic_filter.MeanWordLengthFilter method) (filters.heuristic_filter.NonAlphaNumericFilter method) (filters.heuristic_filter.NumbersFilter method) (filters.heuristic_filter.ParenthesesFilter method) (filters.heuristic_filter.PornographicUrlsFilter method) (filters.heuristic_filter.PunctuationFilter method) (filters.heuristic_filter.RepeatedLinesByCharFilter method) (filters.heuristic_filter.RepeatedLinesFilter method) (filters.heuristic_filter.RepeatedParagraphsByCharFilter method) (filters.heuristic_filter.RepeatedParagraphsFilter method) (filters.heuristic_filter.RepeatingDuplicateNGramsFilter method) (filters.heuristic_filter.RepeatingTopNGramsFilter method) (filters.heuristic_filter.SubstringFilter method) (filters.heuristic_filter.SymbolsToWordsFilter method) (filters.heuristic_filter.TokenCountFilter method) (filters.heuristic_filter.UrlsFilter method) (filters.heuristic_filter.WhiteSpaceFilter method) (filters.heuristic_filter.WordCountFilter method) (filters.heuristic_filter.WordsWithoutAlphabetsFilter method) (filters.synthetic.AnswerabilityFilter method) (filters.synthetic.EasinessFilter method) KNOWLEDGE_LIST_PROMPT_TEMPLATE (in module synthetic.prompts) L L2_DIST_TO_CENT_COL (in module utils.semdedup_utils) Lambada (class in tasks.metrics) lang_detect() (in module download.commoncrawl) left_anti_join() (in module utils.duplicates_removal) LengthRatioFilter (class in filters.heuristic_filter) libcudf_cufile_policy (nemo_run.slurm.SlurmJobConfig attribute) LineRemover (class in modifiers.line_remover) list_operators() (pii.algorithm.PiiDeidentifier method) list_supported_entities() (pii.algorithm.PiiDeidentifier method) LLMClient (class in services.model_client) LLMInference (class in modifiers.llm_pii_modifier) LLMPiiModifier (class in modifiers.llm_pii_modifier) load() (pii.custom_nlp_engine.CustomNlpEngine method) (pii.recognizers.address_recognizer.AddressRecognizer method) load_cfg() (classifiers.aegis.AegisHFModel method) load_config() (classifiers.aegis.AegisHFModel method) (classifiers.content_type.ContentTypeModel method) (classifiers.domain.DomainModel method) (classifiers.fineweb_edu.FinewebEduModel method) (classifiers.prompt_task_complexity.PromptTaskComplexityModel method) (classifiers.quality.QualityModel method) (modules.semantic_dedup.embeddings.EmbeddingCrossFitModel method) load_dataset_shard() (image.embedders.base.ImageEmbedder method) (image.embedders.timm.TimmImageEmbedder method) load_deidentifier() (modifiers.pii_modifier.PiiModifier method) load_embedding_model() (image.embedders.base.ImageEmbedder method) (image.embedders.timm.TimmImageEmbedder method) load_inferer() (modifiers.async_llm_pii_modifier.AsyncLLMPiiModifier method) (modifiers.llm_pii_modifier.LLMPiiModifier method) load_model() (classifiers.aegis.AegisHFModel method) (classifiers.content_type.ContentTypeModel method) (classifiers.domain.DomainModel method) (classifiers.fineweb_edu.FinewebEduModel method) (classifiers.prompt_task_complexity.PromptTaskComplexityModel method) (classifiers.quality.QualityModel method) (filters.models.qe_models.COMETQEModel class method) (filters.models.qe_models.PyMarianQEModel class method) (filters.models.qe_models.QEModel class method) (image.classifiers.aesthetic.AestheticClassifier method) (image.classifiers.base.ImageClassifier method) (image.classifiers.nsfw.NsfwClassifier method) (modules.semantic_dedup.embeddings.EmbeddingCrossFitModel method) load_object_on_worker() (in module utils.distributed_utils) load_tokenizer() (classifiers.aegis.AegisHFModel method) (classifiers.content_type.ContentTypeModel method) (classifiers.domain.DomainModel method) (classifiers.fineweb_edu.FinewebEduModel method) (classifiers.prompt_task_complexity.PromptTaskComplexityModel method) (classifiers.quality.QualityModel method) (modules.semantic_dedup.embeddings.EmbeddingCrossFitModel method) LocalCUDACluster (in module utils.distributed_utils) logger (in module pii.custom_batch_analyzer_engine) (in module pii.custom_nlp_engine) (in module utils.import_utils) LongWordFilter (class in filters.heuristic_filter) LSH (class in modules.fuzzy_dedup.lsh) (in module modules) lsh() (modules.fuzzy_dedup.lsh.LSH method) M MARIAN_CPU_ARGS (filters.models.qe_models.PyMarianQEModel attribute) MARIAN_GPU_ARGS (filters.models.qe_models.PyMarianQEModel attribute) MARKDOWN_BOLD_REGEX (in module modifiers.markdown_remover) MARKDOWN_ITALIC_REGEX (in module modifiers.markdown_remover) MARKDOWN_LINK_REGEX (in module modifiers.markdown_remover) MARKDOWN_UNDERLINE_REGEX (in module modifiers.markdown_remover) MarkdownRemover (class in modifiers.markdown_remover) MATH_PROBLEM_BEGINNER_PROMPT_TEMPLATE (in module synthetic.prompts) MATH_PROBLEM_GENERAL_PROMPT_TEMPLATE (in module synthetic.prompts) max_iter (modules.config.SemDedupConfig attribute) max_len (classifiers.content_type.ContentTypeModelConfig attribute) (classifiers.domain.DomainModelConfig attribute) (classifiers.prompt_task_complexity.PromptTaskComplexityConfig attribute) (classifiers.quality.QualityModelConfig attribute) max_length (classifiers.aegis.AegisConfig attribute) max_seq_length (modules.semantic_dedup.embeddings.EmbeddingConfig attribute) max_seq_length() (classifiers.aegis.AegisHFModel method) (modules.semantic_dedup.embeddings.EmbeddingCrossFitModel method) MeanPooling (class in classifiers.prompt_task_complexity) MeanWordLengthFilter (class in filters.heuristic_filter) MEDIA_ALIASES (in module download.wikipedia) merge_counts() (in module utils.file_utils) merge_left_to_shuffled_right() (in module utils.fuzzy_dedup_utils.merge_utils) MinHash (class in modules.fuzzy_dedup.minhash) (in module modules) minhash32() (modules.fuzzy_dedup.minhash.MinHash method) minhash64() (modules.fuzzy_dedup.minhash.MinHash method) minhash_to_buckets() (modules.fuzzy_dedup.lsh.LSH method) Mixtral8x7BFormatter (class in synthetic.mixtral) mkdir() (in module utils.file_utils) MLP (class in image.classifiers.aesthetic) MMLU (class in tasks.metrics) model (classifiers.content_type.ContentTypeModelConfig attribute) (classifiers.quality.QualityModelConfig attribute) model_name_or_path (modules.semantic_dedup.embeddings.EmbeddingConfig attribute) MODEL_NAME_TO_HF_PATH (filters.models.qe_models.COMETQEModel attribute) (filters.models.qe_models.PyMarianQEModel attribute) model_output_type (classifiers.prompt_task_complexity.PromptTaskComplexityConfig attribute) modifiers module modifiers.async_llm_pii_modifier module modifiers.c4 module modifiers.doc_modifier module modifiers.fasttext module modifiers.line_remover module modifiers.llm_pii_modifier module modifiers.markdown_remover module modifiers.newline_normalizer module modifiers.pii_modifier module modifiers.quotation_remover module modifiers.slicer module modifiers.unicode_reformatter module modifiers.url_remover module Modify (class in modules.modify) modify_document() (modifiers.async_llm_pii_modifier.AsyncLLMPiiModifier method) (modifiers.c4.BoilerPlateStringModifier method) (modifiers.doc_modifier.DocumentModifier method) (modifiers.fasttext.FastTextLabelModifier method) (modifiers.line_remover.LineRemover method) (modifiers.llm_pii_modifier.LLMPiiModifier method) (modifiers.markdown_remover.MarkdownRemover method) (modifiers.newline_normalizer.NewlineNormalizer method) (modifiers.pii_modifier.PiiModifier method) (modifiers.quotation_remover.QuotationRemover method) (modifiers.slicer.Slicer method) (modifiers.unicode_reformatter.UnicodeReformatter method) (modifiers.url_remover.UrlRemover method) module classifiers classifiers.aegis classifiers.base classifiers.content_type classifiers.domain classifiers.fineweb_edu classifiers.prompt_task_complexity classifiers.quality datasets datasets.doc_dataset datasets.image_text_pair_dataset datasets.parallel_dataset download download.arxiv download.commoncrawl download.doc_builder download.ja_stopwords download.th_stopwords download.wikipedia download.zh_stopwords filters filters.bitext_filter filters.classifier_filter filters.code filters.doc_filter filters.heuristic_filter filters.models filters.models.qe_models filters.synthetic image image.classifiers image.classifiers.aesthetic image.classifiers.base image.classifiers.nsfw image.embedders image.embedders.base image.embedders.timm modifiers modifiers.async_llm_pii_modifier modifiers.c4 modifiers.doc_modifier modifiers.fasttext modifiers.line_remover modifiers.llm_pii_modifier modifiers.markdown_remover modifiers.newline_normalizer modifiers.pii_modifier modifiers.quotation_remover modifiers.slicer modifiers.unicode_reformatter modifiers.url_remover modules modules.add_id modules.base modules.config modules.dataset_ops modules.exact_dedup modules.filter modules.fuzzy_dedup modules.fuzzy_dedup.bucketstoedges modules.fuzzy_dedup.connectedcomponents modules.fuzzy_dedup.fuzzyduplicates modules.fuzzy_dedup.jaccardsimilarity modules.fuzzy_dedup.lsh modules.fuzzy_dedup.minhash modules.joiner modules.meta modules.modify modules.semantic_dedup modules.semantic_dedup.clusteringmodel modules.semantic_dedup.embeddings modules.semantic_dedup.semanticclusterleveldedup modules.semantic_dedup.semdedup modules.splitter modules.task modules.to_backend nemo_run nemo_run.slurm pii pii.algorithm pii.constants pii.custom_batch_analyzer_engine pii.custom_nlp_engine pii.recognizers pii.recognizers.address_recognizer services services.conversation_formatter services.model_client services.nemo_client services.openai_client synthetic synthetic.async_nemotron synthetic.async_nemotron_cc synthetic.error synthetic.generator synthetic.mixtral synthetic.nemotron synthetic.nemotron_cc synthetic.no_format synthetic.prompts tasks tasks.downstream_task tasks.metrics utils utils.aegis_utils utils.config_utils utils.constants utils.decorators utils.distributed_utils utils.download_utils utils.duplicates_removal utils.file_utils utils.fuzzy_dedup_utils utils.fuzzy_dedup_utils.id_mapping utils.fuzzy_dedup_utils.io_utils utils.fuzzy_dedup_utils.merge_utils utils.fuzzy_dedup_utils.output_map_utils utils.fuzzy_dedup_utils.shuffle_utils utils.gpu_utils utils.image utils.image.transforms utils.import_utils utils.llm_pii_utils utils.module_utils utils.script_utils utils.semdedup_utils utils.text_utils modules module modules.add_id module modules.base module modules.config module modules.dataset_ops module modules.exact_dedup module modules.filter module modules.fuzzy_dedup module modules.fuzzy_dedup.bucketstoedges module modules.fuzzy_dedup.connectedcomponents module modules.fuzzy_dedup.fuzzyduplicates module modules.fuzzy_dedup.jaccardsimilarity module modules.fuzzy_dedup.lsh module modules.fuzzy_dedup.minhash module modules.joiner module modules.meta module modules.modify module modules.semantic_dedup module modules.semantic_dedup.clusteringmodel module modules.semantic_dedup.embeddings module modules.semantic_dedup.semanticclusterleveldedup module modules.semantic_dedup.semdedup module modules.splitter module modules.task module modules.to_backend module MulticlassHead (class in classifiers.prompt_task_complexity) Multilingual (class in tasks.metrics) MULTILINGUAL_DOMAIN_BASE_MODEL (in module classifiers.domain) MULTILINGUAL_DOMAIN_IDENTIFIER (in module classifiers.domain) MultilingualDomainClassifier (class in classifiers.domain) MultiRC (class in tasks.metrics) N n_clusters (modules.config.SemDedupConfig attribute) name (filters.doc_filter.DocumentFilter property) NEMO_CURATOR_HOME (in module utils.file_utils) nemo_run module nemo_run.slurm module NemoDeployClient (class in services.nemo_client) NEMOTRON_CC_DISTILL_SYSTEM_PROMPT (in module synthetic.prompts) NEMOTRON_CC_SYSTEM_PROMPT (in module synthetic.prompts) NemotronCCDiverseQAPostprocessor (class in synthetic.nemotron_cc) NemotronCCGenerator (class in synthetic.nemotron_cc) NemotronCCKnowledgeListPostprocessor (class in synthetic.nemotron_cc) NemotronFormatter (class in synthetic.nemotron) NemotronGenerator (class in synthetic.nemotron) NewlineNormalizer (class in modifiers.newline_normalizer) ngrams (filters.doc_filter.DocumentFilter property) (tasks.downstream_task.DownstreamTask property) NODE_TYPES (in module utils.text_utils) NoFormat (class in synthetic.no_format) NON_SPACED_LANGUAGES (in module download.commoncrawl) NonAlphaNumericFilter (class in filters.heuristic_filter) Normalization (class in image.classifiers.nsfw) normalize_embeddings_col_in_df() (in module utils.semdedup_utils) NoWorkerError NsfwClassifier (class in image.classifiers.nsfw) NSFWModel (class in image.classifiers.nsfw) null_decorator() (in module utils.import_utils) num_anchors (modules.config.FuzzyDuplicatesConfig attribute) num_buckets (modules.config.FuzzyDuplicatesConfig attribute) num_files (modules.config.SemDedupConfig attribute) NumberOfLinesOfCodeFilter (class in filters.code) NumbersFilter (class in filters.heuristic_filter) NumDasc (class in tasks.metrics) O offload_object_on_worker() (in module utils.distributed_utils) OpenAIClient (class in services.openai_client) OpenBookQA (class in tasks.metrics) P pairwise_cosine_similarity() (in module utils.semdedup_utils) pairwise_cosine_similarity_batched() (in module utils.semdedup_utils) paragraphs (filters.doc_filter.DocumentFilter property) ParallelDataset (class in datasets.parallel_dataset) ParallelScoreFilter (class in modules.filter) ParenthesesFilter (class in filters.heuristic_filter) parse_client_args() (utils.script_utils.ArgumentHelper static method) parse_distributed_classifier_args() (utils.script_utils.ArgumentHelper static method) parse_docstrings() (in module utils.text_utils) parse_gpu_dedup_args() (utils.script_utils.ArgumentHelper method) parse_response() (synthetic.generator.SyntheticDataGenerator method) parse_semdedup_args() (utils.script_utils.ArgumentHelper static method) parse_str_of_num_bytes() (in module utils.file_utils) parts_per_worker (modules.config.FuzzyDuplicatesConfig attribute) peft_model_name_or_path (classifiers.aegis.AegisConfig attribute) PerExtensionFilter (class in filters.code) perform_removal (modules.config.FuzzyDuplicatesConfig attribute) performance_report_if() (in module utils.distributed_utils) performance_report_if_with_ts_suffix() (in module utils.distributed_utils) persist() (datasets.doc_dataset.DocumentDataset method) (datasets.parallel_dataset.ParallelDataset method) pii module pii.algorithm module pii.constants module pii.custom_batch_analyzer_engine module pii.custom_nlp_engine module pii.recognizers module pii.recognizers.address_recognizer module PII_LABELS (in module utils.llm_pii_utils) PiiDeidentifier (class in pii.algorithm) PiiModifier (class in modifiers.pii_modifier) PIQA (class in tasks.metrics) policy_substrings (in module utils.constants) pooling_strategy (modules.semantic_dedup.embeddings.EmbeddingConfig attribute) PornographicUrlsFilter (class in filters.heuristic_filter) postprocess() (image.classifiers.aesthetic.AestheticClassifier method) (image.classifiers.base.ImageClassifier method) (image.classifiers.nsfw.NsfwClassifier method) predict() (filters.models.qe_models.COMETQEModel method) (filters.models.qe_models.PyMarianQEModel method) (filters.models.qe_models.QEModel method) prepare_task_ngram_count() (modules.task.TaskDecontamination method) pretrained_model_name_or_path (classifiers.aegis.AegisConfig attribute) process_all_batches() (in module utils.distributed_utils) process_batch() (in module utils.distributed_utils) (pii.custom_nlp_engine.CustomNlpEngine method) process_logits() (classifiers.prompt_task_complexity.CustomHFDeberta method) profile_dir (modules.config.FuzzyDuplicatesConfig attribute) (modules.config.SemDedupConfig attribute) PROMPT_PREFIX (synthetic.mixtral.Mixtral8x7BFormatter attribute) (synthetic.nemotron.NemotronFormatter attribute) PROMPT_TASK_COMPLEXITY_IDENTIFIER (in module classifiers.prompt_task_complexity) PromptTaskComplexityClassifier (class in classifiers.prompt_task_complexity) PromptTaskComplexityConfig (class in classifiers.prompt_task_complexity) PromptTaskComplexityModel (class in classifiers.prompt_task_complexity) protocol (nemo_run.slurm.SlurmJobConfig attribute) prune_single_cluster() (in module utils.semdedup_utils) PunctuationFilter (class in filters.heuristic_filter) pymarian (in module filters.models.qe_models) PYMARIAN_IMPORT_MSG (in module filters.models.qe_models) PyMarianQEModel (class in filters.models.qe_models) PYTHON_PROBLEM_ADVANCED_PROMPT_TEMPLATE (in module synthetic.prompts) PYTHON_PROBLEM_BEGINNER_PROMPT_TEMPLATE (in module synthetic.prompts) PYTHON_PROBLEM_INTERMEDIATE_PROMPT_TEMPLATE (in module synthetic.prompts) PythonCommentToCodeFilter (class in filters.code) Q QEModel (class in filters.models.qe_models) Quac (class in tasks.metrics) QUALITY_IDENTIFIER (in module classifiers.quality) QualityClassifier (class in classifiers.quality) QualityEstimationFilter (class in filters.classifier_filter) QualityModel (class in classifiers.quality) QualityModelConfig (class in classifiers.quality) query_model() (services.model_client.AsyncLLMClient method) (services.model_client.LLMClient method) (services.nemo_client.NemoDeployClient method) (services.openai_client.AsyncOpenAIClient method) (services.openai_client.OpenAIClient method) query_reward_model() (services.model_client.AsyncLLMClient method) (services.model_client.LLMClient method) (services.nemo_client.NemoDeployClient method) (services.openai_client.AsyncOpenAIClient method) (services.openai_client.OpenAIClient method) QuotationRemover (class in modifiers.quotation_remover) R Race (class in tasks.metrics) random_state (modules.config.SemDedupConfig attribute) rapids_no_initialize (nemo_run.slurm.SlurmJobConfig attribute) read_custom() (datasets.doc_dataset.DocumentDataset class method) read_data() (in module utils.distributed_utils) read_data_blocksize() (in module utils.distributed_utils) read_data_files_per_partition() (in module utils.distributed_utils) read_json() (datasets.doc_dataset.DocumentDataset class method) read_pandas_pickle() (in module utils.distributed_utils) read_parquet() (datasets.doc_dataset.DocumentDataset class method) read_pickle() (datasets.doc_dataset.DocumentDataset class method) read_simple_bitext() (datasets.parallel_dataset.ParallelDataset class method) read_single_partition() (in module utils.distributed_utils) read_single_simple_bitext_file_pair() (datasets.parallel_dataset.ParallelDataset static method) rearange_by_column_direct() (in module utils.fuzzy_dedup_utils.shuffle_utils) Record (class in tasks.metrics) redact() (in module utils.llm_pii_utils) regex_alpha (in module utils.constants) regex_alphanum (in module utils.constants) regex_digit (in module utils.constants) regex_hash (in module utils.constants) regex_paren (in module utils.constants) regex_url (in module utils.constants) remove() (modules.base.BaseDeduplicationModule method) (modules.exact_dedup.ExactDuplicates method) (modules.fuzzy_dedup.fuzzyduplicates.FuzzyDuplicates method) (modules.semantic_dedup.semdedup.SemDedup method) remove_duplicates() (in module utils.duplicates_removal) remove_matching_ngrams() (modules.task.TaskDecontamination method) remove_path_extension() (in module utils.file_utils) remove_punctuation() (in module utils.text_utils) repartition() (datasets.doc_dataset.DocumentDataset method) RepeatedLinesByCharFilter (class in filters.heuristic_filter) RepeatedLinesFilter (class in filters.heuristic_filter) RepeatedParagraphsByCharFilter (class in filters.heuristic_filter) RepeatedParagraphsFilter (class in filters.heuristic_filter) RepeatingDuplicateNGramsFilter (class in filters.heuristic_filter) RepeatingTopNGramsFilter (class in filters.heuristic_filter) reshard_jsonl() (in module utils.file_utils) ResiliparseExtractor (class in download.commoncrawl) revise_open_qa() (synthetic.async_nemotron.AsyncNemotronGenerator method) (synthetic.nemotron.NemotronGenerator method) revise_writing_tasks() (synthetic.async_nemotron.AsyncNemotronGenerator method) (synthetic.nemotron.NemotronGenerator method) rewrite_to_wikipedia_style() (synthetic.async_nemotron_cc.AsyncNemotronCCGenerator method) (synthetic.nemotron_cc.NemotronCCGenerator method) rmm_scheduler_pool_size (nemo_run.slurm.SlurmJobConfig attribute) rmm_worker_pool_size (nemo_run.slurm.SlurmJobConfig attribute) RTE (class in tasks.metrics) run (in module nemo_run.slurm) run_closed_qa_pipeline() (synthetic.async_nemotron.AsyncNemotronGenerator method) (synthetic.nemotron.NemotronGenerator method) run_math_pipeline() (synthetic.async_nemotron.AsyncNemotronGenerator method) (synthetic.nemotron.NemotronGenerator method) run_open_qa_pipeline() (synthetic.async_nemotron.AsyncNemotronGenerator method) (synthetic.nemotron.NemotronGenerator method) run_python_pipeline() (synthetic.async_nemotron.AsyncNemotronGenerator method) (synthetic.nemotron.NemotronGenerator method) run_writing_pipeline() (synthetic.async_nemotron.AsyncNemotronGenerator method) (synthetic.nemotron.NemotronGenerator method) S safe_import() (in module utils.import_utils) safe_import_from() (in module utils.import_utils) save_metadata() (datasets.image_text_pair_dataset.ImageTextPairDataset method) Score (class in modules.filter) score_bitext() (filters.bitext_filter.BitextFilter method) (filters.classifier_filter.QualityEstimationFilter method) (filters.heuristic_filter.LengthRatioFilter method) score_document() (filters.classifier_filter.FastTextLangId method) (filters.classifier_filter.FastTextQualityFilter method) (filters.code.AlphaFilter method) (filters.code.GeneralCommentToCodeFilter method) (filters.code.HTMLBoilerplateFilter method) (filters.code.NumberOfLinesOfCodeFilter method) (filters.code.PerExtensionFilter method) (filters.code.PythonCommentToCodeFilter method) (filters.code.TokenizerFertilityFilter method) (filters.code.XMLHeaderFilter method) (filters.doc_filter.DocumentFilter method) (filters.heuristic_filter.BoilerPlateStringFilter method) (filters.heuristic_filter.BulletsFilter method) (filters.heuristic_filter.CommonEnglishWordsFilter method) (filters.heuristic_filter.EllipsisFilter method) (filters.heuristic_filter.HistogramFilter method) (filters.heuristic_filter.LongWordFilter method) (filters.heuristic_filter.MeanWordLengthFilter method) (filters.heuristic_filter.NonAlphaNumericFilter method) (filters.heuristic_filter.NumbersFilter method) (filters.heuristic_filter.ParenthesesFilter method) (filters.heuristic_filter.PornographicUrlsFilter method) (filters.heuristic_filter.PunctuationFilter method) (filters.heuristic_filter.RepeatedLinesByCharFilter method) (filters.heuristic_filter.RepeatedLinesFilter method) (filters.heuristic_filter.RepeatedParagraphsByCharFilter method) (filters.heuristic_filter.RepeatedParagraphsFilter method) (filters.heuristic_filter.RepeatingDuplicateNGramsFilter method) (filters.heuristic_filter.RepeatingTopNGramsFilter method) (filters.heuristic_filter.SubstringFilter method) (filters.heuristic_filter.SymbolsToWordsFilter method) (filters.heuristic_filter.TokenCountFilter method) (filters.heuristic_filter.UrlsFilter method) (filters.heuristic_filter.WhiteSpaceFilter method) (filters.heuristic_filter.WordCountFilter method) (filters.heuristic_filter.WordsWithoutAlphabetsFilter method) (filters.synthetic.AnswerabilityFilter method) (filters.synthetic.EasinessFilter method) ScoreFilter (class in modules.filter) script_command (nemo_run.slurm.SlurmJobConfig attribute) seed (modules.config.FuzzyDuplicatesConfig attribute) seed_all() (in module utils.distributed_utils) select_columns() (in module utils.distributed_utils) SemanticClusterLevelDedup (class in modules.semantic_dedup.semanticclusterleveldedup) (in module modules) SemDedup (class in modules.semantic_dedup.semdedup) (in module modules) SemDedupConfig (class in modules.config) sentences (filters.doc_filter.DocumentFilter property) separate_by_metadata() (in module utils.file_utils) Sequential (class in modules.meta) services module services.conversation_formatter module services.model_client module services.nemo_client module services.openai_client module set_autocast() (classifiers.base.HFDeberta method) (classifiers.prompt_task_complexity.CustomHFDeberta method) set_default_n_workers() (utils.script_utils.ArgumentHelper method) SHARD_SIZE (filters.models.qe_models.PyMarianQEModel attribute) Shuffle (class in modules.dataset_ops) shuffle_deterministic() (modules.dataset_ops.Shuffle method) shuffle_nondeterministic() (modules.dataset_ops.Shuffle method) sim_metric (modules.config.SemDedupConfig attribute) single_partition_write_with_filename() (in module utils.distributed_utils) Slicer (class in modifiers.slicer) SlurmJobConfig (class in nemo_run.slurm) Squad (class in tasks.metrics) start_dask_cpu_local_cluster() (in module utils.distributed_utils) start_dask_gpu_local_cluster() (in module utils.distributed_utils) start_position (utils.llm_pii_utils.EntitySpan attribute) StoryCloze (class in tasks.metrics) strip_trailing_sep() (in module utils.fuzzy_dedup_utils.io_utils) SubstringFilter (class in filters.heuristic_filter) SUPPORTED_BACKENDS (modules.base.BaseModule attribute) SUPPORTED_ENTITIES (in module pii.constants) SUPPORTED_HASHES (modules.exact_dedup.ExactDuplicates attribute) SUPPORTED_INTERPOLATIONS (in module utils.image.transforms) SUPPORTED_JSONL_COMPRESSIONS (in module utils.distributed_utils) SUPPORTED_MODELS (filters.classifier_filter.QualityEstimationFilter attribute) SymbolsToWordsFilter (class in filters.heuristic_filter) synthetic module synthetic.async_nemotron module synthetic.async_nemotron_cc module synthetic.error module synthetic.generator module synthetic.mixtral module synthetic.nemotron module synthetic.nemotron_cc module synthetic.no_format module synthetic.prompts module SyntheticDataGenerator (class in synthetic.generator) T TaskDecontamination (class in modules.task) tasks module tasks.downstream_task module tasks.metrics module text_field (modules.config.FuzzyDuplicatesConfig attribute) th_stopwords (in module download.th_stopwords) THREE_OR_MORE_NEWLINES_REGEX (in module modifiers.newline_normalizer) THREE_OR_MORE_WINDOWS_NEWLINES_REGEX (in module modifiers.newline_normalizer) thresholding() (modules.fuzzy_dedup.connectedcomponents.ConnectedComponents static method) TimmImageEmbedder (class in image.embedders.timm) to_bitext() (datasets.parallel_dataset.ParallelDataset method) to_json() (datasets.doc_dataset.DocumentDataset method) to_pandas() (datasets.doc_dataset.DocumentDataset method) to_parquet() (datasets.doc_dataset.DocumentDataset method) to_pickle() (datasets.doc_dataset.DocumentDataset method) to_script() (nemo_run.slurm.SlurmJobConfig method) to_webdataset() (datasets.image_text_pair_dataset.ImageTextPairDataset method) ToBackend (class in modules.to_backend) token (classifiers.aegis.AegisConfig attribute) TokenCountFilter (class in filters.heuristic_filter) TokenizerFertilityFilter (class in filters.code) TrafilaturaExtractor (class in download.commoncrawl) TriviaQA (class in tasks.metrics) try_decode_with_detected_encoding() (in module download.commoncrawl) U UnavailableError UnavailableMeta (class in utils.import_utils) UnavailableNullContext (class in utils.import_utils) UnicodeReformatter (class in modifiers.unicode_reformatter) unsafe_categories (in module utils.aegis_utils) update_restart_offsets() (in module utils.fuzzy_dedup_utils.io_utils) URL_REGEX (in module modifiers.url_remover) UrlRemover (class in modifiers.url_remover) UrlsFilter (class in filters.heuristic_filter) use_64_bit_hash (modules.config.FuzzyDuplicatesConfig attribute) USE_EXCOMMS (in module utils.fuzzy_dedup_utils.shuffle_utils) utils module utils.aegis_utils module utils.config_utils module utils.constants module utils.decorators module utils.distributed_utils module utils.download_utils module utils.duplicates_removal module utils.file_utils module utils.fuzzy_dedup_utils module utils.fuzzy_dedup_utils.id_mapping module utils.fuzzy_dedup_utils.io_utils module utils.fuzzy_dedup_utils.merge_utils module utils.fuzzy_dedup_utils.output_map_utils module utils.fuzzy_dedup_utils.shuffle_utils module utils.gpu_utils module utils.image module utils.image.transforms module utils.import_utils module utils.llm_pii_utils module utils.module_utils module utils.script_utils module utils.semdedup_utils module utils.text_utils module V validate_entity() (in module utils.llm_pii_utils) validate_keys() (in module utils.llm_pii_utils) W WebQA (class in tasks.metrics) which_to_keep (modules.config.SemDedupConfig attribute) white_space_list (in module utils.constants) WhiteSpaceFilter (class in filters.heuristic_filter) WiC (class in tasks.metrics) WIKIPEDIA_REPHRASING_PROMPT_TEMPLATE (in module synthetic.prompts) WikipediaDownloader (class in download.wikipedia) WikipediaExtractor (class in download.wikipedia) WikipediaIterator (class in download.wikipedia) Winogrande (class in tasks.metrics) WordCountFilter (class in filters.heuristic_filter) WordsWithoutAlphabetsFilter (class in filters.heuristic_filter) wrap_qe_input() (filters.models.qe_models.COMETQEModel static method) (filters.models.qe_models.PyMarianQEModel static method) (filters.models.qe_models.QEModel static method) write_dataframe_by_meta() (in module utils.file_utils) write_embeddings_to_disk (modules.config.SemDedupConfig attribute) write_partitioned_file() (in module utils.fuzzy_dedup_utils.shuffle_utils) write_pruned_summary_file() (in module utils.semdedup_utils) write_record() (in module utils.file_utils) write_to_disk() (in module utils.distributed_utils) write_to_filename (modules.config.SemDedupConfig attribute) WSC (class in tasks.metrics) X XMLHeaderFilter (class in filters.code) Y YamlConversionError Z zh_stopwords (in module download.zh_stopwords)