core.tokenizers.text.libraries.abstract_tokenizer#

Module Contents#

Classes#

MegatronTokenizerTextAbstract

Abstract class for Megatron text tokenizers.

API#

class core.tokenizers.text.libraries.abstract_tokenizer.MegatronTokenizerTextAbstract#

Bases: abc.ABC

Abstract class for Megatron text tokenizers.

abstractmethod text_to_tokens(text: str) → List[str]#

Converts text to tokens.

Parameters:

text (str) – text to be tokenized.

Returns:

list of tokens.

Return type:

List[str]

abstractmethod tokens_to_text(tokens: List[str]) → str#

Converts tokens to text.

Parameters:

tokens (List[str]) – tokens to be detokenized.

Returns:

detokenized text.

Return type:

str

abstractmethod tokens_to_ids(tokens: List[str]) → List[int]#

Converts tokens to ids.

Parameters:

tokens (List[str]) – tokens to be converted.

Returns:

ids of tokens.

Return type:

List[int]

abstractmethod ids_to_tokens(ids: List[int]) → List[str]#

Converts ids to tokens.

Parameters:

ids (List[int]) – ids to be converted.

Returns:

list of tokens.

Return type:

List[str]

abstractmethod text_to_ids(text: str) → List[int]#

Converts text to ids.

Parameters:

text (str) – text to be tokenized.

Returns:

list of ids.

Return type:

List[int]

abstractmethod ids_to_text(ids: List[int]) → str#

Converts ids to text.

Parameters:

ids (List[int]) – ids to be detokenized.

Returns:

detokenized text.

Return type:

str

abstractmethod add_special_tokens()#

Adds special tokens to the tokenizer.

property cls_id: int#

Property alias to match MegatronTokenizer; returns cls_id if available.

property sep_id: int#

Property alias to match MegatronTokenizer; returns sep_id if available.

property pad_id: int#

Property alias to match MegatronTokenizer; returns pad_id if available.

property eod: int#

Property alias to match MegatronTokenizer; returns eod_id if available.

property bos_id: int#

Property alias to match MegatronTokenizer; returns bos_id if available.

property eos_id: int#

Property alias to match MegatronTokenizer; returns eos_id if available.

property mask_id: int#

Property alias to match MegatronTokenizer; returns mask_id if available.