core.tokenizers.text.libraries.megatron_hf_tokenizer#

Module Contents#

Classes#

Data#

API#

core.tokenizers.text.libraries.megatron_hf_tokenizer.logger#

‘getLogger(…)’

core.tokenizers.text.libraries.megatron_hf_tokenizer.torch_home#

‘_get_torch_home(…)’

core.tokenizers.text.libraries.megatron_hf_tokenizer.MEGATRON_CACHE#

‘join(…)’

core.tokenizers.text.libraries.megatron_hf_tokenizer.MEGATRON_CONFIG_MAP#

None

class core.tokenizers.text.libraries.megatron_hf_tokenizer.MegatronHFTokenizer(
tokenizer_path: str,
vocab_file: Optional[str] = None,
merges_file: Optional[str] = None,
**kwargs,
)#

Bases: core.tokenizers.text.libraries.huggingface_tokenizer.HuggingFaceTokenizer

Initialization

_get_vocab_file(tokenizer_name: str, vocab_file: str = None) → str#

Gets vocabulary file from cache or downloads it.

Parameters:
  • tokenizer_name (str) – pretrained model name.

  • vocab_file (str) – path to the vocab file.

Returns:

path to the vocab file

Return type:

path

_get_merges_file(tokenizer_name: str, merges_file: str = None) → str#

Gets the merges file from cache or downloads it.

Parameters:
  • tokenizer_name (str) – pretrained model name.

  • merges_file (str) – path to the merges file.

Returns:

path to the merges file.

Return type:

path

_get_available_models_list() → list#

Returns a list of available megatron tokenizers.

_download(path: str, url: str)#

Gets a file from cache or downloads it.

Parameters:
  • path – path to the file in cache

  • url – url to the file

Returns:

path to the file in cache

Return type:

path