core.tokenizers.text.libraries.megatron_hf_tokenizer#

Module Contents#

Classes#

MegatronHFTokenizer

Data#

logger

torch_home

MEGATRON_CACHE

MEGATRON_CONFIG_MAP

API#

core.tokenizers.text.libraries.megatron_hf_tokenizer.logger#

'getLogger(...)'

core.tokenizers.text.libraries.megatron_hf_tokenizer.torch_home#

'_get_torch_home(...)'

core.tokenizers.text.libraries.megatron_hf_tokenizer.MEGATRON_CACHE#

'join(...)'

core.tokenizers.text.libraries.megatron_hf_tokenizer.MEGATRON_CONFIG_MAP#

None
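
The module-level values above can be pictured with the sketch below. It is illustrative only: the import location of _get_torch_home and the "megatron" cache subdirectory name are assumptions, not taken from the source.

```python
# Sketch of the module-level setup implied by the data entries above; the
# _get_torch_home import path and the "megatron" subdirectory are assumptions.
import logging
import os

from torch.hub import _get_torch_home  # assumed origin of _get_torch_home

logger = logging.getLogger(__name__)

# Root of the torch cache, used to place downloaded tokenizer assets.
torch_home = _get_torch_home()
MEGATRON_CACHE = os.path.join(torch_home, "megatron")

# Filled in by the module with per-model config/vocab/merges metadata;
# shown here only as the documented default.
MEGATRON_CONFIG_MAP = None
```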

class core.tokenizers.text.libraries.megatron_hf_tokenizer.MegatronHFTokenizer(
tokenizer_path: str,
vocab_file: Optional[str] = None,
merges_file: Optional[str] = None,
**kwargs,
)#

Bases: core.tokenizers.text.libraries.huggingface_tokenizer.HuggingFaceTokenizer

Initialization

Parameters:
  • tokenizer_path – corresponds to the pretrained_model_name_or_path argument of HuggingFace's AutoTokenizer. For details, see https://huggingface.co/transformers/_modules/transformers/tokenization_auto.html#AutoTokenizer.from_pretrained.

  • vocab_file – path to the vocabulary file (one entry per line).

  • merges_file – path to the BPE merges file, if the tokenizer requires one.

  • mask_token – mask token

  • bos_token – the beginning of sequence token

  • eos_token – the end of sequence token. Usually equal to sep_token

  • pad_token – token to use for padding

  • sep_token – token used for separating sequences

  • cls_token – class token. Usually equal to bos_token

  • unk_token – token to use for unknown tokens

  • additional_special_tokens – list of other tokens beside standard special tokens (bos, eos, pad, etc.). For example, sentinel tokens for T5 (<extra_id_0>, <extra_id_1>, etc.)

  • use_fast – whether to use fast HuggingFace tokenizer

  • include_special_tokens – when True, converting text to ids will include special tokens / prompt tokens (if any), yielding self.tokenizer(text).input_ids
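
A minimal construction sketch follows. The model name and keyword overrides are illustrative assumptions; the final line only restates the behavior documented for include_special_tokens above.

```python
# Minimal usage sketch; the model name and keyword overrides are illustrative.
from core.tokenizers.text.libraries.megatron_hf_tokenizer import MegatronHFTokenizer

# tokenizer_path is forwarded to AutoTokenizer.from_pretrained, so either a
# HuggingFace Hub name or a local checkpoint directory works here.
tokenizer = MegatronHFTokenizer(
    tokenizer_path="bert-base-uncased",   # illustrative model name
    use_fast=True,                        # use the fast HuggingFace tokenizer
    include_special_tokens=True,          # keep special tokens when encoding
)

# Per the include_special_tokens note above, encoding is backed by the wrapped
# HuggingFace tokenizer: self.tokenizer(text).input_ids.
ids = tokenizer.tokenizer("Megatron wraps a HuggingFace tokenizer.").input_ids
```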

_get_vocab_file(tokenizer_name: str, vocab_file: str = None) → str#

Gets the vocabulary file from the cache or downloads it.

Parameters:
  • tokenizer_name (str) – pretrained model name.

  • vocab_file (str) – path to the vocab file.

Returns:

path to the vocab file

Return type:

str

_get_merges_file(tokenizer_name: str, merges_file: str = None) → str#

Gets the merges file from the cache or downloads it.

Parameters:
  • tokenizer_name (str) – pretrained model name.

  • merges_file (str) – path to the merges file.

Returns:

path to the merges file.

Return type:

str
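
Both helpers follow the same cache-or-download pattern. The sketch below is an illustrative reimplementation for the vocabulary case (the merges case is analogous); the MEGATRON_CONFIG_MAP layout, cache path, and URL are stand-ins, not the module's actual values.

```python
# Illustrative reimplementation of the cache-or-download resolution performed
# by _get_vocab_file (and, analogously, _get_merges_file). All values below
# the imports are stand-ins so the snippet runs on its own.
import os
import urllib.request

MEGATRON_CACHE = os.path.expanduser("~/.cache/megatron")  # stand-in cache dir
MEGATRON_CONFIG_MAP = {                                    # hypothetical entry
    "megatron-bert-345m-uncased": {
        "vocab_file": "https://example.com/megatron-bert-345m-uncased/vocab.txt",
    },
}

def get_vocab_file_sketch(tokenizer_name: str, vocab_file: str = None) -> str:
    """Return a user-supplied vocab file, or fetch the model's default into the cache."""
    if vocab_file is not None:
        return vocab_file
    url = MEGATRON_CONFIG_MAP[tokenizer_name]["vocab_file"]
    cached = os.path.join(MEGATRON_CACHE, tokenizer_name + "_vocab.txt")
    if not os.path.exists(cached):
        os.makedirs(MEGATRON_CACHE, exist_ok=True)
        urllib.request.urlretrieve(url, cached)
    return cached
```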

_get_available_models_list() → list#

Returns a list of available Megatron tokenizers.
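
A short sketch, under the assumption that the available names are simply the keys of MEGATRON_CONFIG_MAP:

```python
# Assumption: available tokenizer names are the MEGATRON_CONFIG_MAP keys
# (stubbed here so the snippet is self-contained).
MEGATRON_CONFIG_MAP = {"megatron-bert-345m-uncased": {}}  # hypothetical entry

def get_available_models_list_sketch() -> list:
    return list(MEGATRON_CONFIG_MAP.keys())
```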

_download(path: str, url: str)#

Gets a file from the cache or downloads it.

Parameters:
  • path – path to the file in cache

  • url – url to the file

Returns:

path to the file in cache

Return type:

path
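
A self-contained sketch of a helper with the same contract; the real method may add details such as progress reporting or distributed-rank guards, which are not shown here.

```python
# Sketch of a cache-or-download helper matching _download's contract; the
# actual implementation may differ in details (progress bars, retries, etc.).
import os
import urllib.request

def download_sketch(path: str, url: str) -> str:
    """Return `path` if it already exists in the cache, otherwise fetch `url` into it."""
    if not os.path.exists(path):
        os.makedirs(os.path.dirname(path) or ".", exist_ok=True)
        urllib.request.urlretrieve(url, path)
    return path
```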