core.tokenizers.text.libraries.megatron_hf_tokenizer#
Module Contents#
Classes#
Data#
API#
- core.tokenizers.text.libraries.megatron_hf_tokenizer.logger#
‘getLogger(…)’
- core.tokenizers.text.libraries.megatron_hf_tokenizer.torch_home#
‘_get_torch_home(…)’
- core.tokenizers.text.libraries.megatron_hf_tokenizer.MEGATRON_CACHE#
‘join(…)’
- core.tokenizers.text.libraries.megatron_hf_tokenizer.MEGATRON_CONFIG_MAP#
None
- class core.tokenizers.text.libraries.megatron_hf_tokenizer.MegatronHFTokenizer(
- tokenizer_path: str,
- vocab_file: Optional[str] = None,
- merges_file: Optional[str] = None,
- **kwargs,
Bases:
core.tokenizers.text.libraries.huggingface_tokenizer.HuggingFaceTokenizer
Initialization
- _get_vocab_file(tokenizer_name: str, vocab_file: str = None) → str#
Gets vocabulary file from cache or downloads it.
- Parameters:
tokenizer_name (str) – pretrained model name.
vocab_file (str) – path to the vocab file.
- Returns:
path to the vocab file
- Return type:
path
- _get_merges_file(tokenizer_name: str, merges_file: str = None) → str#
Gets the merges file from cache or downloads it.
- Parameters:
tokenizer_name (str) – pretrained model name.
merges_file (str) – path to the merges file.
- Returns:
path to the merges file.
- Return type:
path
- _get_available_models_list() → list#
Returns a list of available megatron tokenizers.
- _download(path: str, url: str)#
Gets a file from cache or downloads it.
- Parameters:
path – path to the file in cache
url – url to the file
- Returns:
path to the file in cache
- Return type:
path