core.tokenizers.text.libraries.null_tokenizer

Module Contents

Classes

NullTokenizer

Synthetic tokenizer for performance benchmarking and debugging.

API

class core.tokenizers.text.libraries.null_tokenizer.NullTokenizer(vocab_size)

Synthetic tokenizer for performance benchmarking and debugging.

Parameters:

vocab_size – Vocabulary size for the embedding table.

Initialization
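A minimal construction sketch; the vocabulary size shown is an arbitrary example, not a required value:

    from core.tokenizers.text.libraries.null_tokenizer import NullTokenizer

    # Synthetic tokenizer for benchmarking; no vocabulary files are needed.
    tokenizer = NullTokenizer(vocab_size=256000)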

text_to_ids(text)

Converts text to a list of token ids.

ids_to_text(ids)

Converts a list of token ids back to text.
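A round-trip sketch, assuming this class follows the usual null-tokenizer convention (as in Megatron-LM) of parsing text as whitespace-separated integer ids; verify against this implementation:

    ids = tokenizer.text_to_ids("10 42 7")   # assumed result: [10, 42, 7]
    text = tokenizer.ids_to_text(ids)        # assumed result: "10 42 7"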

tokens_to_ids(tokens)

Converts tokens to ids.

ids_to_tokens(ids)

Converts ids to tokens.
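For a null tokenizer, tokens are plausibly just the string forms of the ids, making these two methods simple casts; that behavior is an assumption, not something the signatures above confirm:

    tokens = tokenizer.ids_to_tokens([10, 42])    # assumed: ["10", "42"]
    ids = tokenizer.tokens_to_ids(["10", "42"])   # assumed: [10, 42]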

offsets(ids: list[int], text: str) → list[int]

Returns the character offsets of the given ids within text.
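A sketch of the expected contract, assuming offsets reports the starting character position of each id's token in the original string:

    # For the text "10 42 7", the tokens for ids 10, 42, and 7 start at
    # character positions 0, 3, and 6 (assumed behavior).
    positions = tokenizer.offsets([10, 42, 7], "10 42 7")   # assumed: [0, 3, 6]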

property unique_identifiers: collections.OrderedDict

Property required for use with megatron-core datasets.
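megatron-core datasets use this property to identify the tokenizer configuration. A hypothetical inspection; the exact keys are implementation-dependent:

    uid = tokenizer.unique_identifiers   # a collections.OrderedDict
    print(list(uid.keys()))              # e.g. class name and init args (assumed)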

property vocab_size

Returns the vocabulary size.
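Note that some null-tokenizer implementations (Megatron-LM's, for example) report vocab_size + 1 here so that an extra id can be reserved for eod; whether this class does the same is an assumption to verify:

    tokenizer = NullTokenizer(vocab_size=8)
    print(tokenizer.vocab_size)   # 8, or 9 if an eod id is appended (assumed)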

abstract property vocab

abstract property inv_vocab
property cls

Returns the cls token id.

property sep

Returns the sep token id.

property mask

Returns the mask token id.

property eod

Returns the eod (end-of-document) token id.

property additional_special_tokens_ids

Returns ids of any additional special tokens.
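In null tokenizers the special-token properties are commonly placeholders (for example -1), with only eod mapped to a real reserved id; the values below are modeled on Megatron-LM's null tokenizer and are assumptions, not confirmed for this class:

    tok = NullTokenizer(vocab_size=8)
    print(tok.cls, tok.sep, tok.mask)          # assumed placeholders, e.g. -1 -1 -1
    print(tok.eod)                             # assumed: a reserved id, e.g. 8
    print(tok.additional_special_tokens_ids)   # assumed: None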