classifiers.fineweb_edu#

Module Contents#

Classes#

Data#

API#

classifiers.fineweb_edu.FINEWEB_EDU_IDENTIFIER#

'HuggingFaceFW/fineweb-edu-classifier'

classifiers.fineweb_edu.FINEWEB_MIXTRAL_IDENTIFIER#

'nvidia/nemocurator-fineweb-mixtral-edu-classifier'

classifiers.fineweb_edu.FINEWEB_NEMOTRON_IDENTIFIER#

'nvidia/nemocurator-fineweb-nemotron-4-edu-classifier'

class classifiers.fineweb_edu.FineWebEduClassifier(
batch_size: int = 256,
text_field: str = 'text',
pred_column: str = 'fineweb-edu-score',
int_column: str = 'fineweb-edu-score-int',
max_chars: int = -1,
device_type: str = 'cuda',
autocast: bool = True,
max_mem_gb: int | None = None,
)#

Bases: classifiers.fineweb_edu._FineWebBaseClassifier

Initialization

class classifiers.fineweb_edu.FineWebMixtralEduClassifier(
batch_size: int = 1024,
text_field: str = 'text',
pred_column: str = 'fineweb-mixtral-edu-score',
int_column: str = 'fineweb-mixtral-edu-score-int',
quality_label_column: str = 'fineweb-mixtral-edu-score-label',
max_chars: int = -1,
device_type: str = 'cuda',
autocast: bool = True,
max_mem_gb: int | None = None,
)#

Bases: classifiers.fineweb_edu._FineWebBaseClassifier

Initialization

class classifiers.fineweb_edu.FineWebNemotronEduClassifier(
batch_size: int = 1024,
text_field: str = 'text',
pred_column: str = 'fineweb-nemotron-edu-score',
int_column: str = 'fineweb-nemotron-edu-score-int',
quality_label_column: str = 'fineweb-nemotron-edu-score-label',
max_chars: int = -1,
device_type: str = 'cuda',
autocast: bool = True,
max_mem_gb: int | None = None,
)#

Bases: classifiers.fineweb_edu._FineWebBaseClassifier

Initialization

class classifiers.fineweb_edu.FinewebEduModel(
path_or_name: str,
max_mem_gb: int | None = None,
autocast: bool = False,
)#

Bases: crossfit.backend.torch.hf.model.HFModel

Initialization

static configure_forward(
model: torch.nn.Module,
autocast: bool = True,
) -> torch.nn.Module#

load_config() -> transformers.AutoConfig#

load_model(device: str = 'cuda') -> torch.nn.Module#

load_tokenizer() -> transformers.AutoTokenizer#