`core.tokenizers.vision.libraries.multimodal_tokenizer`#

Module Contents#

Classes#

MegatronMultimodalTokenizer

Multimodal Tokenizer.

Data#

`IMAGE_TAGS`
`mistral_custom_template`
`nvlm_yi_34b_template`
`qwen2p0_custom_template`
`llama3p1_chat_template`
`nemotron_custom_template`
`nemotron_aligned_custom_template`

API#

core.tokenizers.vision.libraries.multimodal_tokenizer.IMAGE_TAGS#: None

core.tokenizers.vision.libraries.multimodal_tokenizer.mistral_custom_template = <Multiline-String>#

core.tokenizers.vision.libraries.multimodal_tokenizer.nvlm_yi_34b_template = <Multiline-String>#

core.tokenizers.vision.libraries.multimodal_tokenizer.qwen2p0_custom_template = <Multiline-String>#

core.tokenizers.vision.libraries.multimodal_tokenizer.llama3p1_chat_template = <Multiline-String>#

core.tokenizers.vision.libraries.multimodal_tokenizer.nemotron_custom_template = <Multiline-String>#

core.tokenizers.vision.libraries.multimodal_tokenizer.nemotron_aligned_custom_template = <Multiline-String>#

class core.tokenizers.vision.libraries.multimodal_tokenizer.MegatronMultimodalTokenizer(

path: str,

prompt_format: str,

special_tokens: List[str],

image_tag_type: str,

force_system_message: bool = False,

**kwargs,

)#

Multimodal Tokenizer.

Initialization

Tokenizer with support for non-text inputs.

Note: Currently, only HuggingFaceTokenizer is supported as the underlying text tokenizer.

Parameters:

path (str) – Path to the underlying tokenizer.
prompt_format (str) – Prompt format for the tokenizer.
special_tokens (List[str]) – Non-text tokens.
image_tag_type (str) – Image tag to apply, if any. For example, `<img>` and `</img>`.

_apply_image_tag(text: Union[str, List[Dict]])#: Surround `<image>` with image tags such as `<img>` and `</img>`.

tokenize(text: Union[str, List[Dict]])#: Tokenize conversation or string input.

_encode(text: str)#: Tokenize text input.

tokenize_conversation( conversation: List[Dict], return_target: bool, add_generation_prompt: bool, )#

Convert a conversation to tokens.