classifiers.fineweb_edu#
Module Contents#
Classes#
Data#
API#
- classifiers.fineweb_edu.FINEWEB_EDU_IDENTIFIER#
'HuggingFaceFW/fineweb-edu-classifier'
- classifiers.fineweb_edu.FINEWEB_MIXTRAL_IDENTIFIER#
'nvidia/nemocurator-fineweb-mixtral-edu-classifier'
- classifiers.fineweb_edu.FINEWEB_NEMOTRON_IDENTIFIER#
'nvidia/nemocurator-fineweb-nemotron-4-edu-classifier'
- class classifiers.fineweb_edu.FineWebEduClassifier(
- batch_size: int = 256,
- text_field: str = 'text',
- pred_column: str = 'fineweb-edu-score',
- int_column: str = 'fineweb-edu-score-int',
- max_chars: int = -1,
- device_type: str = 'cuda',
- autocast: bool = True,
- max_mem_gb: int | None = None,
)
Bases: classifiers.fineweb_edu._FineWebBaseClassifier
Initialization
- class classifiers.fineweb_edu.FineWebMixtralEduClassifier(
- batch_size: int = 1024,
- text_field: str = 'text',
- pred_column: str = 'fineweb-mixtral-edu-score',
- int_column: str = 'fineweb-mixtral-edu-score-int',
- quality_label_column: str = 'fineweb-mixtral-edu-score-label',
- max_chars: int = -1,
- device_type: str = 'cuda',
- autocast: bool = True,
- max_mem_gb: int | None = None,
)
Bases: classifiers.fineweb_edu._FineWebBaseClassifier
Initialization
- class classifiers.fineweb_edu.FineWebNemotronEduClassifier(
- batch_size: int = 1024,
- text_field: str = 'text',
- pred_column: str = 'fineweb-nemotron-edu-score',
- int_column: str = 'fineweb-nemotron-edu-score-int',
- quality_label_column: str = 'fineweb-nemotron-edu-score-label',
- max_chars: int = -1,
- device_type: str = 'cuda',
- autocast: bool = True,
- max_mem_gb: int | None = None,
)
Bases: classifiers.fineweb_edu._FineWebBaseClassifier
Initialization
- class classifiers.fineweb_edu.FinewebEduModel(
- path_or_name: str,
- max_mem_gb: int | None = None,
- autocast: bool = False,
)
Bases: crossfit.backend.torch.hf.model.HFModel
Initialization
- static configure_forward(
- model: torch.nn.Module,
- autocast: bool = True,
)
- load_config() → transformers.AutoConfig#
- load_model(device: str = 'cuda') → torch.nn.Module#
- load_tokenizer() → transformers.AutoTokenizer#