modules.filter
Module Contents
Classes
API
- class modules.filter.Filter(
- filter_fn: collections.abc.Callable | nemo_curator.filters.DocumentFilter,
- filter_field: str,
- invert: bool = False,
- )
Bases:
nemo_curator.modules.base.BaseModule
Initialization
- call(
- dataset: nemo_curator.datasets.DocumentDataset,
- )
- compute_filter_mask(
- dataset: nemo_curator.datasets.DocumentDataset,
- )
- class modules.filter.ParallelScoreFilter(
- src_filter_obj: nemo_curator.filters.DocumentFilter,
- tgt_filter_obj: nemo_curator.filters.DocumentFilter,
- src_field: str = 'src',
- tgt_field: str = 'tgt',
- src_score: str | None = None,
- tgt_score: str | None = None,
- score_type: str | None = None,
- invert: bool = False,
- )
Bases:
nemo_curator.modules.base.BaseModule
Initialization
- call(
- dataset: nemo_curator.datasets.parallel_dataset.ParallelDataset,
- )
- class modules.filter.Score(
- score_fn: collections.abc.Callable | nemo_curator.filters.DocumentFilter,
- score_field: str,
- text_field: str = 'text',
- score_type: type | str | None = None,
- )
Bases:
nemo_curator.modules.base.BaseModule
Initialization
- call(
- dataset: nemo_curator.datasets.DocumentDataset,
- )
- class modules.filter.ScoreFilter(
- filter_obj: nemo_curator.filters.DocumentFilter,
- text_field: str = 'text',
- score_field: str | None = None,
- score_type: type | str | None = None,
- invert: bool = False,
- )
Bases:
nemo_curator.modules.base.BaseModule
Initialization
- call(
- dataset: nemo_curator.datasets.DocumentDataset,
- )
- compute_filter_mask(
- dataset: nemo_curator.datasets.DocumentDataset,
- )