core.tokenizers.text.libraries.megatron_hf_tokenizer#

Module Contents#

Classes#

MegatronHFTokenizer

Data#

logger

torch_home

MEGATRON_CACHE

MEGATRON_CONFIG_MAP

API#

core.tokenizers.text.libraries.megatron_hf_tokenizer.logger#

'getLogger(...)'

core.tokenizers.text.libraries.megatron_hf_tokenizer.torch_home#

'_get_torch_home(...)'

core.tokenizers.text.libraries.megatron_hf_tokenizer.MEGATRON_CACHE#

'join(...)'

core.tokenizers.text.libraries.megatron_hf_tokenizer.MEGATRON_CONFIG_MAP#

None
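
The module-level values above can be pictured with the sketch below. It is illustrative only: the import location of _get_torch_home and the "megatron" cache subdirectory name are assumptions, not taken from the source.

```python
# Sketch of the module-level setup implied by the data entries above; the
# _get_torch_home import path and the "megatron" subdirectory are assumptions.
import logging
import os

from torch.hub import _get_torch_home  # assumed origin of _get_torch_home

logger = logging.getLogger(__name__)

# Root of the torch cache, used to place downloaded tokenizer assets.
torch_home = _get_torch_home()
MEGATRON_CACHE = os.path.join(torch_home, "megatron")

# Filled in by the module with per-model config/vocab/merges metadata;
# shown here only as the documented default.
MEGATRON_CONFIG_MAP = None
```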

class core.tokenizers.text.libraries.megatron_hf_tokenizer.MegatronHFTokenizer(
tokenizer_path: str,
vocab_file: Optional[str] = None,
merges_file: Optional[str] = None,
**kwargs,
)#

Bases: core.tokenizers.text.libraries.huggingface_tokenizer.HuggingFaceTokenizer

Initialization

Parameters:
  • tokenizer_path – corresponds to the pretrained_model_name_or_path argument of HuggingFace's AutoTokenizer. For details, see https://huggingface.co/transformers/_modules/transformers/tokenization_auto.html#AutoTokenizer.from_pretrained.

  • vocab_file – path to the vocabulary file (one entry per line).

  • merges_file – path to the BPE merges file, if the tokenizer requires one.

  • mask_token – mask token

  • bos_token – the beginning of sequence token

  • eos_token – the end of sequence token. Usually equal to sep_token

  • pad_token – token to use for padding

  • sep_token – token used for separating sequences

  • cls_token – class token. Usually equal to bos_token

  • unk_token – token to use for unknown tokens

  • additional_special_tokens – list of other tokens beside standard special tokens (bos, eos, pad, etc.). For example, sentinel tokens for T5 (<extra_id_0>, <extra_id_1>, etc.)

  • use_fast – whether to use fast HuggingFace tokenizer

  • include_special_tokens – when True, converting text to ids will include special tokens / prompt tokens (if any), yielding self.tokenizer(text).input_ids
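
A minimal construction sketch follows. The model name and keyword overrides are illustrative assumptions; the final line only restates the behavior documented for include_special_tokens above.

```python
# Minimal usage sketch; the model name and keyword overrides are illustrative.
from core.tokenizers.text.libraries.megatron_hf_tokenizer import MegatronHFTokenizer

# tokenizer_path is forwarded to AutoTokenizer.from_pretrained, so either a
# HuggingFace Hub name or a local checkpoint directory works here.
tokenizer = MegatronHFTokenizer(
    tokenizer_path="bert-base-uncased",   # illustrative model name
    use_fast=True,                        # use the fast HuggingFace tokenizer
    include_special_tokens=True,          # keep special tokens when encoding
)

# Per the include_special_tokens note above, encoding is backed by the wrapped
# HuggingFace tokenizer: self.tokenizer(text).input_ids.
ids = tokenizer.tokenizer("Megatron wraps a HuggingFace tokenizer.").input_ids
```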

_get_vocab_file(tokenizer_name: str, vocab_file: str = None) → str#

Gets the vocabulary file from the cache or downloads it.

Parameters:
  • tokenizer_name (str) – pretrained model name.

  • vocab_file (str) – path to the vocab file.

Returns:

path to the vocab file

Return type:

str

_get_merges_file(tokenizer_name: str, merges_file: str = None) → str#

Gets the merges file from the cache or downloads it.

Parameters:
  • tokenizer_name (str) – pretrained model name.

  • merges_file (str) – path to the merges file.

Returns:

path to the merges file.

Return type:

str
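
Both helpers follow the same cache-or-download pattern. The sketch below is an illustrative reimplementation for the vocabulary case (the merges case is analogous); the MEGATRON_CONFIG_MAP layout, cache path, and URL are stand-ins, not the module's actual values.

```python
# Illustrative reimplementation of the cache-or-download resolution performed
# by _get_vocab_file (and, analogously, _get_merges_file). All values below
# the imports are stand-ins so the snippet runs on its own.
import os
import urllib.request

MEGATRON_CACHE = os.path.expanduser("~/.cache/megatron")  # stand-in cache dir
MEGATRON_CONFIG_MAP = {                                    # hypothetical entry
    "megatron-bert-345m-uncased": {
        "vocab_file": "https://example.com/megatron-bert-345m-uncased/vocab.txt",
    },
}

def get_vocab_file_sketch(tokenizer_name: str, vocab_file: str = None) -> str:
    """Return a user-supplied vocab file, or fetch the model's default into the cache."""
    if vocab_file is not None:
        return vocab_file
    url = MEGATRON_CONFIG_MAP[tokenizer_name]["vocab_file"]
    cached = os.path.join(MEGATRON_CACHE, tokenizer_name + "_vocab.txt")
    if not os.path.exists(cached):
        os.makedirs(MEGATRON_CACHE, exist_ok=True)
        urllib.request.urlretrieve(url, cached)
    return cached
```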

_get_available_models_list() → list#

Returns a list of available Megatron tokenizers.
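
A short sketch, under the assumption that the available names are simply the keys of MEGATRON_CONFIG_MAP:

```python
# Assumption: available tokenizer names are the MEGATRON_CONFIG_MAP keys
# (stubbed here so the snippet is self-contained).
MEGATRON_CONFIG_MAP = {"megatron-bert-345m-uncased": {}}  # hypothetical entry

def get_available_models_list_sketch() -> list:
    return list(MEGATRON_CONFIG_MAP.keys())
```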

_download(path: str, url: str)#

Gets a file from the cache or downloads it.

Parameters:
  • path – path to the file in cache

  • url – url to the file

Returns:

path to the file in cache

Return type:

path
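
A self-contained sketch of a helper with the same contract; the real method may add details such as progress reporting or distributed-rank guards, which are not shown here.

```python
# Sketch of a cache-or-download helper matching _download's contract; the
# actual implementation may differ in details (progress bars, retries, etc.).
import os
import urllib.request

def download_sketch(path: str, url: str) -> str:
    """Return `path` if it already exists in the cache, otherwise fetch `url` into it."""
    if not os.path.exists(path):
        os.makedirs(os.path.dirname(path) or ".", exist_ok=True)
        urllib.request.urlretrieve(url, path)
    return path
```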