Important
You are viewing the NeMo 2.0 documentation. This release introduces significant changes to the API and a new library, NeMo Run. We are currently porting all features from NeMo 1.0 to 2.0. For documentation on previous versions or features not yet available in 2.0, please refer to the NeMo 24.07 documentation.
Deduplication#
Exact#
- class nemo_curator.ExactDuplicates(
- logger: logging.LoggerAdapter | str = './',
- id_field: str = 'id',
- text_field: str = 'text',
- hash_method: str = 'md5',
- profile_dir: str | None = None,
- cache_dir: str | None = None,
Find exact duplicates in a document corpus
- hash_documents(
- df: cudf.Series | pd.Series,
Compute hashes for a Series containing documents
Fuzzy#
- class nemo_curator.FuzzyDuplicatesConfig(
- cache_dir: str,
- profile_dir: str | None = None,
- id_field: str = 'id',
- text_field: str = 'text',
- seed: int = 42,
- char_ngrams: int = 5,
- num_buckets: int = 20,
- hashes_per_bucket: int = 13,
- use_64_bit_hash: bool = False,
- buckets_per_shuffle: int = 1,
- false_positive_check: bool = True,
- num_anchors: int = 2,
- jaccard_threshold: float = 0.8,
- bucket_mapping_blocksize: int = 256,
- parts_per_worker: int = 1,
- bucket_parts_per_worker: int = 8,
Configuration for MinHash-based fuzzy duplicate detection.
seed (int) – Seed for minhash permutations.
char_ngrams (int) – Size of the character n-gram shingles used in the minhash computation.
num_buckets (int) – Number of bands or buckets to use during Locality Sensitive Hashing.
hashes_per_bucket (int) – Number of hashes per bucket/band.
use_64_bit_hash (bool) – Whether to use a 32-bit or 64-bit hash function for minhashing.
buckets_per_shuffle (int) – Larger values process larger batches by processing multiple bands, but might lead to memory pressure and related errors.
- Parameters:
id_field (str) – Column in the Dataset denoting document ID.
text_field (str) – Column in the Dataset denoting document content.
profile_dir (str, Default None) – If specified, the directory to write the Dask profile to.
cache_dir (str, Default None) – Location to store deduplication intermediates such as minhashes, buckets, etc.
false_positive_check (bool) – Whether to run a check to look for false positives within buckets. Note: This is a computationally expensive step.
num_anchors (int) – Number of documents per bucket to use as reference for computing jaccard pairs within that bucket to identify false positives.
jaccard_threshold (float) – The Jaccard similarity threshold used to consider a document a near duplicate during false positive evaluations.
- class nemo_curator.FuzzyDuplicates(
- config: FuzzyDuplicatesConfig,
- logger: LoggerAdapter | str = './',
- class nemo_curator.LSH(
- cache_dir: str,
- num_hashes: int,
- num_buckets: int,
- buckets_per_shuffle: int = 1,
- logger: LoggerAdapter | str = './',
- id_fields: str | list = 'id',
- minhash_field: str = '_minhash_signature',
- profile_dir: str | None = None,
Performs LSH on MinHash signatures
- bucket_id_to_int(
- bucket_ddf: dask_cudf.DataFrame,
- bucket_col_name: str = 'bucket_id',
- start_id: int = 0,
Maps bucket ids to a contiguous integer range starting from start_id.
- lsh(write_path: str, df: dask_cudf.DataFrame) None #
Computes buckets and writes them as parquet files to the write_path
- class nemo_curator.MinHash(
- seed: int = 42,
- num_hashes: int = 260,
- char_ngrams: int = 5,
- use_64bit_hash: bool = False,
- logger: LoggerAdapter | str = './',
- id_field: str = 'id',
- text_field: str = 'text',
- profile_dir: str | None = None,
- cache_dir: str | None = None,
Computes minhash signatures of a document corpus
- generate_seeds(
- n_seeds: int = 260,
- seed: int = 0,
Generate seeds for all minhash permutations based on the given seed.
- minhash32(
- ser: cudf.Series,
- seeds: numpy.ndarray,
- char_ngram: int,
Compute 32bit minhashes based on the MurmurHash3 algorithm
- minhash64(
- ser: cudf.Series,
- seeds: numpy.ndarray,
- char_ngram: int,
Compute 64bit minhashes based on the MurmurHash3 algorithm
Semantic#
- class nemo_curator.SemDedup(
- config: SemDedupConfig,
- input_column: str = 'text',
- id_column: str = 'id',
- id_column_type: str = 'int',
- logger: Logger | str = './',
- class nemo_curator.SemDedupConfig(
- cache_dir: str,
- profile_dir: str | None = None,
- num_files: int = -1,
- embeddings_save_loc: str = 'embeddings',
- embedding_model_name_or_path: str = 'sentence-transformers/all-MiniLM-L6-v2',
- embedding_batch_size: int = 128,
- clustering_save_loc: str = 'clustering_results',
- n_clusters: int = 1000,
- seed: int = 1234,
- max_iter: int = 100,
- kmeans_with_cos_dist: bool = False,
- which_to_keep: str = 'hard',
- largest_cluster_size_to_process: int = 100000,
- sim_metric: str = 'cosine',
- eps_thresholds: ~typing.List[float] = <factory>,
- eps_to_extract: float = 0.01,
Configuration for Semantic Deduplication.
- cache_dir#
Directory to store cache.
- Type:
str
- profile_dir#
If specified, the directory to write the Dask profile to. Default is None.
- Type:
Optional[str]
- cache_dir#
Directory to store cache.
- Type:
str
- num_files#
Number of files. Default is -1, meaning all files.
- Type:
int
- embeddings_save_loc#
Location to save embeddings.
- Type:
str
- embedding_model_name_or_path#
Model name or path for embeddings.
- Type:
str
- embedding_batch_size#
Initial batch size for processing embeddings.
- Type:
int
- clustering_save_loc#
Location to save clustering results.
- Type:
str
- n_clusters#
Number of clusters.
- Type:
int
- seed#
Seed for clustering.
- Type:
int
- max_iter#
Maximum iterations for clustering.
- Type:
int
- kmeans_with_cos_dist#
Use KMeans with cosine distance.
- Type:
bool
- which_to_keep#
Which duplicates to keep.
- Type:
str
- largest_cluster_size_to_process#
Largest cluster size to process.
- Type:
int
- sim_metric#
Similarity metric for deduplication.
- Type:
str
- eps_thresholds#
Epsilon thresholds to calculate if semantically similar or not.
- Type:
List[float]
- eps_to_extract#
Epsilon value to extract deduplicated data.
- Type:
float
- class nemo_curator.EmbeddingCreator(
- embedding_model_name_or_path: str,
- embedding_batch_size: int,
- embedding_output_dir: str,
- embedding_max_mem_gb: int | None = None,
- input_column: str = 'text',
- embedding_column: str = 'embeddings',
- write_embeddings_to_disk: bool = True,
- write_to_filename: bool = False,
- logger: Logger | str = './',
- profile_dir: str | None = None,
- class nemo_curator.ClusteringModel(
- id_column: str,
- max_iter: int,
- n_clusters: int,
- clustering_output_dir: str,
- embedding_col: str = 'embeddings',
- sim_metric: str = 'cosine',
- which_to_keep: str = 'hard',
- sort_clusters: bool = True,
- kmeans_with_cos_dist: bool = False,
- partition_size: str = '2gb',
- logger: Logger | str = './',
- profile_dir: str | None = None,
- class nemo_curator.SemanticClusterLevelDedup(
- n_clusters: int,
- emb_by_clust_dir: str,
- sorted_clusters_dir: str,
- id_column: str,
- id_column_type: str,
- which_to_keep: str,
- output_dir: str,
- embedding_col: str = 'embeddings',
- logger: Logger | str = './',
- profile_dir: str | None = None,
- compute_semantic_match_dfs(
- eps_list: List[float] | None = None,
Compute semantic match dataframes for clusters.
- Parameters:
eps_list (Optional[List[float]]) – List of epsilon values for clustering.
- extract_dedup_data(
- eps_to_extract: float,
Extract deduplicated data based on epsilon value.
- Parameters:
eps_to_extract (float) – Epsilon threshold for extracting deduplicated data.
- Returns:
Dataset containing deduplicated documents.
- Return type: