nemo_curator.stages.math.classifiers.finemath

View as Markdown

Module Contents

Classes

| Name | Description |
| --- | --- |
| CenterCropTextStage | Pre-tokenization stage that center-crops the text field to a fixed number of characters to keep central context. |
| FineMathClassifier | FineMath composite: TokenizerStage -> FineMathModelStage. |
| FineMathModelStage | Hugging Face sequence classification model stage for FineMath. |

Data

FINEMATH_MODEL_ID

MAX_SEQ_LENGTH

API

class nemo_curator.stages.math.classifiers.finemath.CenterCropTextStage(
text_field: str = 'text',
center_crop_chars: int = 10000
)

Bases: ProcessingStage[DocumentBatch, DocumentBatch]

Pre-tokenization stage that center-crops the text field to a fixed number of characters to keep central context.

center_crop_chars
= max(0, int(center_crop_chars))
name
nemo_curator.stages.math.classifiers.finemath.CenterCropTextStage._mid_slice(
s: str,
n: int
) -> str
staticmethod
nemo_curator.stages.math.classifiers.finemath.CenterCropTextStage.inputs() -> tuple[list[str], list[str]]
nemo_curator.stages.math.classifiers.finemath.CenterCropTextStage.outputs() -> tuple[list[str], list[str]]
nemo_curator.stages.math.classifiers.finemath.CenterCropTextStage.process(
batch: nemo_curator.tasks.DocumentBatch
) -> nemo_curator.tasks.DocumentBatch
class nemo_curator.stages.math.classifiers.finemath.FineMathClassifier(
cache_dir: str | None = None,
float_score_column: str = 'finemath_scores',
int_score_column: str = 'finemath_int_scores',
text_field: str = 'text',
max_chars: int | None = None,
max_seq_length: int = MAX_SEQ_LENGTH,
sort_by_length: bool = False,
model_inference_batch_size: int = 1024,
autocast: bool = True,
center_crop_chars: int | None = 10000
)
Dataclass

Bases: CompositeStage[DocumentBatch, DocumentBatch]

FineMath composite: TokenizerStage -> FineMathModelStage.

autocast
bool = True
cache_dir
str | None = None
center_crop_chars
int | None = 10000
float_score_column
str = 'finemath_scores'
int_score_column
str = 'finemath_int_scores'
max_chars
int | None = None
max_seq_length
int = MAX_SEQ_LENGTH
model_inference_batch_size
int = 1024
sort_by_length
bool = False
text_field
str = 'text'
nemo_curator.stages.math.classifiers.finemath.FineMathClassifier.__post_init__() -> None
nemo_curator.stages.math.classifiers.finemath.FineMathClassifier.decompose() -> list[nemo_curator.stages.base.ProcessingStage]
class nemo_curator.stages.math.classifiers.finemath.FineMathModelStage(
model_identifier: str,
cache_dir: str | None = None,
float_score_column: str = 'finemath_scores',
int_score_column: str = 'finemath_int_scores',
model_inference_batch_size: int = 256,
has_seq_order: bool = True,
autocast: bool = True
)

Bases: ModelStage

Hugging Face sequence classification model stage for FineMath.

Outputs columns:

  • finemath_scores (float list)
  • finemath_int_scores (int list)
nemo_curator.stages.math.classifiers.finemath.FineMathModelStage._configure_forward(
model: torch.nn.Module
) -> torch.nn.Module
staticmethod
nemo_curator.stages.math.classifiers.finemath.FineMathModelStage._setup(
local_files_only: bool = True
) -> None
nemo_curator.stages.math.classifiers.finemath.FineMathModelStage.create_output_dataframe(
df_cpu: pandas.DataFrame,
collected_output: dict[str, numpy.ndarray]
) -> pandas.DataFrame
nemo_curator.stages.math.classifiers.finemath.FineMathModelStage.outputs() -> tuple[list[str], list[str]]
nemo_curator.stages.math.classifiers.finemath.FineMathModelStage.process_model_output(
outputs: torch.Tensor,
_: dict[str, torch.Tensor] | None = None
) -> dict[str, numpy.ndarray]
nemo_curator.stages.math.classifiers.finemath.FINEMATH_MODEL_ID = 'HuggingFaceTB/finemath-classifier'
nemo_curator.stages.math.classifiers.finemath.MAX_SEQ_LENGTH = 512