modules.filter

Module Contents

Classes

API

class modules.filter.Filter(
    filter_fn: collections.abc.Callable | nemo_curator.filters.DocumentFilter,
    filter_field: str,
    invert: bool = False,
)

Bases: nemo_curator.modules.base.BaseModule

Initialization

call(
    dataset: nemo_curator.datasets.DocumentDataset,
) -> nemo_curator.datasets.DocumentDataset

compute_filter_mask(
    dataset: nemo_curator.datasets.DocumentDataset,
) -> pandas.Series | pandas.DataFrame
class modules.filter.ParallelScoreFilter(
    src_filter_obj: nemo_curator.filters.DocumentFilter,
    tgt_filter_obj: nemo_curator.filters.DocumentFilter,
    src_field: str = 'src',
    tgt_field: str = 'tgt',
    src_score: str | None = None,
    tgt_score: str | None = None,
    score_type: str | None = None,
    invert: bool = False,
)

Bases: nemo_curator.modules.base.BaseModule

Initialization

call(
    dataset: nemo_curator.datasets.parallel_dataset.ParallelDataset,
) -> nemo_curator.datasets.parallel_dataset.ParallelDataset
class modules.filter.Score(
    score_fn: collections.abc.Callable | nemo_curator.filters.DocumentFilter,
    score_field: str,
    text_field: str = 'text',
    score_type: type | str | None = None,
)

Bases: nemo_curator.modules.base.BaseModule

Initialization

call(
    dataset: nemo_curator.datasets.DocumentDataset,
) -> nemo_curator.datasets.DocumentDataset
class modules.filter.ScoreFilter(
    filter_obj: nemo_curator.filters.DocumentFilter,
    text_field: str = 'text',
    score_field: str | None = None,
    score_type: type | str | None = None,
    invert: bool = False,
)

Bases: nemo_curator.modules.base.BaseModule

Initialization

call(
    dataset: nemo_curator.datasets.DocumentDataset,
) -> nemo_curator.datasets.DocumentDataset

compute_filter_mask(
    dataset: nemo_curator.datasets.DocumentDataset,
) -> pandas.Series | pandas.DataFrame