core.tokenizers.text.libraries.sft_tokenizer#

Module Contents#

Classes#

PromptConfig

Config options for different prompt formats.

SFTTokenizer

SFT Tokenizer.

Data#

API#

core.tokenizers.text.libraries.sft_tokenizer.nemotron_h_aligned_custom_template = <Multiline-String>#
core.tokenizers.text.libraries.sft_tokenizer.nemotron_nano_v2_custom_template = <Multiline-String>#
core.tokenizers.text.libraries.sft_tokenizer.identity_template#

"{% for message in messages %}{{ message['content'] }}{% endfor %}"

core.tokenizers.text.libraries.sft_tokenizer.IGNORE_INDEX#

None

class core.tokenizers.text.libraries.sft_tokenizer.PromptConfig#

Config options for different prompt formats.

assistant_prefix_len: int#

None

pad_token_id: int#

None

custom_chat_template: str#

None

has_bos: bool#

None

has_system_role: bool#

None

force_system_message: bool#

False

system_default: dict#

None

class core.tokenizers.text.libraries.sft_tokenizer.SFTTokenizer(tokenizer_path: str, prompt_format: str)#

SFT Tokenizer.

Initialization

Note: Currently, only HuggingFaceTokenizer is supported as the underlying text tokenizer.

Parameters:
  • tokenizer_path (str) – Underlying tokenizer path.

  • prompt_format (str) – Prompt format for the tokenizer.

tokenize_conversation(
conversation: List[Dict],
return_target: bool,
add_generation_prompt: bool,
)#

Convert a conversation to tokens.

Parameters:
  • conversation (List[Dict]) – Sequence of system/user/assistant messages. Must be in the following format: [ {"role": "system", "content": "something"}, {"role": "user", "content": "something1"}, {"role": "assistant", "content": "something2"}, ]

  • return_target (bool) – Return target tokens with system and assistant masked.

  • add_generation_prompt (bool) – Add assistant prefix to the end.

text_to_ids(text: Union[str, List[Dict]])#

Tokenize conversation or string input.

tokens_to_ids(tokens: List[str])#

Convert tokens to IDs.

ids_to_text(tokens: List[int])#

Detokenize tokens.

abstractmethod ids_to_tokens()#

Converts ids to tokens.

abstractmethod text_to_tokens()#

Converts text to tokens.

abstractmethod tokens_to_text()#

Converts tokens to text.

get_special_tokens()#

Get special tokens.

abstractmethod add_special_tokens()#

Add special tokens.

property pad_id#

Pad token ID.

property bos_id#

Beginning of sequence token ID.

property eod#

End of sentence token ID.

property vocab#

Vocab.

property inv_vocab#

Inverse vocab.

property vocab_size#

Vocabulary size.