Field value_type Description default_value valid_min valid_max valid_options automl_enabled

pretrained_backbone_path string [Optional] Path to a pretrained backbone file. FALSE

backbone string The backbone name of the model.

The TAO implementation of Grounding DINO supports Swin. swin_tiny_224_1k swin_tiny_224_1k,swin_base_224_22k,swin_base_384_22k,swin_large_224_22k,swin_large_384_22k FALSE

num_queries int The number of queries 900 1 inf TRUE

num_feature_levels int The number of feature levels to use in the model 4 1 5 FALSE

set_cost_class float The relative weight of the classification error in the matching cost. 1.0 0.0 inf FALSE

set_cost_bbox float The relative weight of the L1 error of the bounding box coordinates in the matching cost. 5.0 0.0 inf FALSE

set_cost_giou float The relative weight of the GIoU loss of the bounding box in the matching cost. 2.0 0.0 inf FALSE

cls_loss_coef float The relative weight of the classification error in the final loss. 2.0 0.0 inf FALSE

bbox_loss_coef float The relative weight of the L1 error of the bounding box coordinates in the final loss. 5.0 0.0 inf FALSE

giou_loss_coef float The relative weight of the GIoU loss of the bounding box in the final loss. 2.0 0.0 inf FALSE

num_select int The number of top-K predictions selected during post-process 300 1 TRUE

interm_loss_coef float 1.0 FALSE

no_interm_box_loss bool No intermediate bbox loss. False FALSE

pre_norm bool Flag to add layer norm in the encoder or not. False FALSE

two_stage_type string Type of two stage in DINO standard standard,no FALSE

decoder_sa_type string Type of decoder self attention. sa sa,ca_label,ca_content FALSE

embed_init_tgt bool Flag to add target embedding True FALSE

fix_refpoints_hw int If this value is -1, width and height are learned separately for each box.

If this value is -2, a shared width and height are learned.

A value greater than 0 specifies learning with a fixed number. -1 -2 inf FALSE

pe_temperatureH int The temperature applied to the height dimension of the positional sine embedding. 20 1 inf FALSE

pe_temperatureW int The temperature applied to the width dimension of the positional sine embedding. 20 1 inf FALSE

return_interm_indices list The index of feature levels to use in the model. The length must match num_feature_levels. [1, 2, 3, 4] FALSE

use_dn bool A flag specifying whether to enable contrastive de-noising training in DINO True FALSE

dn_number int The number of denoising queries in DINO. 0 0 inf FALSE

dn_box_noise_scale float The scale of noise applied to boxes during contrastive de-noising. If this value is 0, noise is not applied. 1.0 0.0 inf FALSE

dn_label_noise_ratio float The scale of the noise applied to labels during

contrastive denoising. If this value is 0, then noise is

not applied. 0.5 0.0 FALSE

focal_alpha float The alpha value in the focal loss. 0.25 FALSE

focal_gamma float The gamma value in the focal loss. 2.0 FALSE

clip_max_norm float 0.1 FALSE

nheads int Number of heads 8 FALSE

dropout_ratio float The probability to drop hidden units. 0.0 0.0 1.0 FALSE

hidden_dim int Dimension of the hidden units. 256 FALSE

enc_layers int Number of encoder layers in the transformer 6 1 TRUE

dec_layers int Number of decoder layers in the transformer 6 1 TRUE

dim_feedforward int Dimension of the feedforward network 2048 1 FALSE

dec_n_points int Number of reference points in the decoder. 4 1 FALSE

enc_n_points int Number of reference points in the encoder. 4 1 FALSE

aux_loss bool A flag specifying whether to use auxiliary

decoding losses (loss at each decoder layer) True FALSE

dilation bool A flag specifying whether to enable dilation in the backbone. False FALSE

train_backbone bool Flag to set backbone weights as trainable or frozen.

When set to False, the backbone weights will be frozen. True FALSE

text_encoder_type string BERT encoder type. If only the name of the type is provided,

the weight is downloaded from the Hugging Face Hub.

If a path is provided, then we load the weight from the local path. bert-base-uncased FALSE

max_text_len int Maximum text length of BERT. 256 1 FALSE

class_embed_bias bool Flag to set bias in the contrastive embedding. False FALSE

log_scale string [Optional] The initial value of a learnable parameter to multiply with the similarity

matrix to normalize the output. Defaults to None.

- If set to 'auto', the similarity matrix is normalized by

a fixed value sqrt(d_c) where d_c is the channel number.

- If set to 'none' or None, there is no normalization applied. none FALSE

loss_types list Losses to be used during training ['labels', 'boxes'] FALSE

backbone_names list Prefix of the tensor names corresponding to the backbone. ['backbone.0', 'bert'] FALSE

linear_proj_names list Linear projection layer names. ['reference_points', 'sampling_offsets'] FALSE

has_mask bool Flag to enable mask head in Grounding DINO True FALSE

mask_loss_coef float The relative weight of the mask error in the final loss. 2.0 FALSE