Speaker Diarization Configuration Files#
Hydra Configurations for Sortformer Diarizer Training#
Sortformer Diarizer is an end-to-end speaker diarization model that is solely based on Transformer-encoder type of architecture. Model name convention for Sortformer Diarizer: sortformer_diarizer_<loss_type>_<speaker count limit>-<version>.yaml
Example <NeMo_root>/examples/speaker_tasks/diarization/neural_diarizer/conf/sortformer_diarizer_hybrid_loss_4spk-v1.yaml.
name: "SortformerDiarizer"
num_workers: 18
batch_size: 8
model:
sample_rate: 16000
pil_weight: 0.5 # Weight for Permutation Invariant Loss (PIL) used in training the Sortformer diarizer model
ats_weight: 0.5 # Weight for Arrival Time Sort (ATS) loss in training the Sortformer diarizer model
max_num_of_spks: 4 # Maximum number of speakers per model; currently set to 4
model_defaults:
fc_d_model: 512 # Hidden dimension size of the Fast-conformer Encoder
tf_d_model: 192 # Hidden dimension size of the Transformer Encoder
train_ds:
manifest_filepath: ???
sample_rate: ${model.sample_rate}
num_spks: ${model.max_num_of_spks}
session_len_sec: 90 # Maximum session length in seconds
soft_label_thres: 0.5 # Threshold for binarizing target values; higher values make the model more conservative in predicting speaker activity.
soft_targets: False # If True, use continuous values as target values when calculating cross-entropy loss
labels: null
batch_size: ${batch_size}
shuffle: True
num_workers: ${num_workers}
validation_mode: False
# lhotse config
use_lhotse: False
use_bucketing: True
num_buckets: 10
bucket_duration_bins: [10, 20, 30, 40, 50, 60, 70, 80, 90]
pin_memory: True
min_duration: 10
max_duration: 90
batch_duration: 400
quadratic_duration: 1200
bucket_buffer_size: 20000
shuffle_buffer_size: 10000
window_stride: ${model.preprocessor.window_stride}
subsampling_factor: ${model.encoder.subsampling_factor}
validation_ds:
manifest_filepath: ???
is_tarred: False
tarred_audio_filepaths: null
sample_rate: ${model.sample_rate}
num_spks: ${model.max_num_of_spks}
session_len_sec: 90 # Maximum session length in seconds
soft_label_thres: 0.5 # A threshold value for setting up the binarized labels. The higher the more conservative the model becomes.
soft_targets: False
labels: null
batch_size: ${batch_size}
shuffle: False
num_workers: ${num_workers}
validation_mode: True
# lhotse config
use_lhotse: False
use_bucketing: False
drop_last: False
pin_memory: True
window_stride: ${model.preprocessor.window_stride}
subsampling_factor: ${model.encoder.subsampling_factor}
test_ds:
manifest_filepath: null
is_tarred: False
tarred_audio_filepaths: null
sample_rate: 16000
num_spks: ${model.max_num_of_spks}
session_len_sec: 90 # Maximum session length in seconds
soft_label_thres: 0.5
soft_targets: False
labels: null
batch_size: ${batch_size}
shuffle: False
seq_eval_mode: True
num_workers: ${num_workers}
validation_mode: True
# lhotse config
use_lhotse: False
use_bucketing: False
drop_last: False
pin_memory: True
window_stride: ${model.preprocessor.window_stride}
subsampling_factor: ${model.encoder.subsampling_factor}
preprocessor:
_target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor
normalize: "per_feature"
window_size: 0.025
sample_rate: ${model.sample_rate}
window_stride: 0.01
window: "hann"
features: 80
n_fft: 512
frame_splicing: 1
dither: 0.00001
sortformer_modules:
_target_: nemo.collections.asr.modules.sortformer_modules.SortformerModules
num_spks: ${model.max_num_of_spks} # Number of speakers per model. This is currently fixed at 4.
dropout_rate: 0.5 # Dropout rate
fc_d_model: ${model.model_defaults.fc_d_model}
tf_d_model: ${model.model_defaults.tf_d_model} # Hidden layer size for linear layers in Sortformer Diarizer module
encoder:
_target_: nemo.collections.asr.modules.ConformerEncoder
feat_in: ${model.preprocessor.features}
feat_out: -1
n_layers: 18
d_model: ${model.model_defaults.fc_d_model}
# Sub-sampling parameters
subsampling: dw_striding # vggnet, striding, stacking or stacking_norm, dw_striding
subsampling_factor: 8 # must be power of 2 for striding and vggnet
subsampling_conv_channels: 256 # set to -1 to make it equal to the d_model
causal_downsampling: false
# Feed forward module's params
ff_expansion_factor: 4
# Multi-headed Attention Module's params
self_attention_model: rel_pos # rel_pos or abs_pos
n_heads: 8 # may need to be lower for smaller d_models
# [left, right] specifies the number of steps to be seen from left and right of each step in self-attention
att_context_size: [-1, -1] # -1 means unlimited context
att_context_style: regular # regular or chunked_limited
xscaling: true # scales up the input embeddings by sqrt(d_model)
untie_biases: true # unties the biases of the TransformerXL layers
pos_emb_max_len: 5000
# Convolution module's params
conv_kernel_size: 9
conv_norm_type: 'batch_norm' # batch_norm or layer_norm or groupnormN (N specifies the number of groups)
conv_context_size: null
# Regularization
dropout: 0.1 # The dropout used in most of the Conformer Modules
dropout_pre_encoder: 0.1 # The dropout used before the encoder
dropout_emb: 0.0 # The dropout used for embeddings
dropout_att: 0.1 # The dropout for multi-headed attention modules
# Set to non-zero to enable stochastic depth
stochastic_depth_drop_prob: 0.0
stochastic_depth_mode: linear # linear or uniform
stochastic_depth_start_layer: 1
transformer_encoder:
_target_: nemo.collections.asr.modules.transformer.transformer_encoders.TransformerEncoder
num_layers: 18
hidden_size: ${model.model_defaults.tf_d_model} # Needs to be multiple of num_attention_heads
inner_size: 768
num_attention_heads: 8
attn_score_dropout: 0.5
attn_layer_dropout: 0.5
ffn_dropout: 0.5
hidden_act: relu
pre_ln: False
pre_ln_final_layer_norm: True
loss:
_target_: nemo.collections.asr.losses.bce_loss.BCELoss
weight: null # Weight for binary cross-entropy loss. Either `null` or list type input. (e.g. [0.5,0.5])
reduction: mean
lr: 0.0001
optim:
name: adamw
lr: ${model.lr}
# optimizer arguments
betas: [0.9, 0.98]
weight_decay: 1e-3
sched:
name: InverseSquareRootAnnealing
warmup_steps: 2500
warmup_ratio: null
min_lr: 1e-06
trainer:
devices: 1 # number of gpus (devices)
accelerator: gpu
max_epochs: 800
max_steps: -1 # computed at runtime if not set
num_nodes: 1
strategy: ddp_find_unused_parameters_true # Could be "ddp"
accumulate_grad_batches: 1
deterministic: True
enable_checkpointing: False
logger: False
log_every_n_steps: 1 # Interval of logging.
val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations
exp_manager:
use_datetime_version: False
exp_dir: null
name: ${name}
resume_if_exists: True
resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc.
resume_ignore_no_checkpoint: True
create_tensorboard_logger: True
create_checkpoint_callback: True
create_wandb_logger: False
checkpoint_callback_params:
monitor: "val_f1_acc"
mode: "max"
save_top_k: 9
every_n_epochs: 1
wandb_logger_kwargs:
resume: True
name: null
project: null
Hydra Configurations for Streaming Sortformer Diarizer Training#
Model name convention for Streaming Sortformer Diarizer: streaming_sortformer_diarizer_<speaker count limit>-<version>.yaml
Example <NeMo_root>/examples/speaker_tasks/diarization/neural_diarizer/conf/streaming_sortformer_diarizer_4spk-v2.yaml.
name: "StreamingSortformerDiarizer"
num_workers: 18
batch_size: 4
model:
sample_rate: 16000
pil_weight: 0.5 # Weight for Permutation Invariant Loss (PIL) used in training the Sortformer diarizer model
ats_weight: 0.5 # Weight for Arrival Time Sort (ATS) loss in training the Sortformer diarizer model
max_num_of_spks: 4 # Maximum number of speakers per model; currently set to 4
streaming_mode: True
model_defaults:
fc_d_model: 512 # Hidden dimension size of the Fast-conformer Encoder
tf_d_model: 192 # Hidden dimension size of the Transformer Encoder
train_ds:
manifest_filepath: ???
sample_rate: ${model.sample_rate}
num_spks: ${model.max_num_of_spks}
session_len_sec: 90 # Maximum session length in seconds
soft_label_thres: 0.5 # Threshold for binarizing target values; higher values make the model more conservative in predicting speaker activity.
soft_targets: False # If True, use continuous values as target values when calculating cross-entropy loss
labels: null
batch_size: ${batch_size}
shuffle: True
num_workers: ${num_workers}
validation_mode: False
# lhotse config
use_lhotse: False
use_bucketing: True
num_buckets: 10
bucket_duration_bins: [10, 20, 30, 40, 50, 60, 70, 80, 90]
pin_memory: True
min_duration: 10
max_duration: 90
batch_duration: 400
quadratic_duration: 1200
bucket_buffer_size: 20000
shuffle_buffer_size: 10000
window_stride: ${model.preprocessor.window_stride}
subsampling_factor: ${model.encoder.subsampling_factor}
validation_ds:
manifest_filepath: ???
is_tarred: False
tarred_audio_filepaths: null
sample_rate: ${model.sample_rate}
num_spks: ${model.max_num_of_spks}
session_len_sec: 90 # Maximum session length in seconds
soft_label_thres: 0.5 # A threshold value for setting up the binarized labels. The higher the more conservative the model becomes.
soft_targets: False
labels: null
batch_size: ${batch_size}
shuffle: False
num_workers: ${num_workers}
validation_mode: True
# lhotse config
use_lhotse: False
use_bucketing: False
drop_last: False
pin_memory: True
window_stride: ${model.preprocessor.window_stride}
subsampling_factor: ${model.encoder.subsampling_factor}
test_ds:
manifest_filepath: null
is_tarred: False
tarred_audio_filepaths: null
sample_rate: 16000
num_spks: ${model.max_num_of_spks}
session_len_sec: 90 # Maximum session length in seconds
soft_label_thres: 0.5
soft_targets: False
labels: null
batch_size: ${batch_size}
shuffle: False
seq_eval_mode: True
num_workers: ${num_workers}
validation_mode: True
# lhotse config
use_lhotse: False
use_bucketing: False
drop_last: False
pin_memory: True
window_stride: ${model.preprocessor.window_stride}
subsampling_factor: ${model.encoder.subsampling_factor}
preprocessor:
_target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor
normalize: "NA"
window_size: 0.025
sample_rate: ${model.sample_rate}
window_stride: 0.01
window: "hann"
features: 128
n_fft: 512
frame_splicing: 1
dither: 0.00001
sortformer_modules:
_target_: nemo.collections.asr.modules.sortformer_modules.SortformerModules
num_spks: ${model.max_num_of_spks} # Maximum number of speakers the model can handle
dropout_rate: 0.5 # Dropout rate
fc_d_model: ${model.model_defaults.fc_d_model} # Hidden dimension size for Fast Conformer encoder
tf_d_model: ${model.model_defaults.tf_d_model} # Hidden dimension size for Transformer encoder
# Streaming mode parameters
spkcache_len: 188 # Length of speaker cache buffer (total number of frames for all speakers)
fifo_len: 0 # Length of FIFO buffer for streaming processing (0 = disabled)
chunk_len: 188 # Number of frames processed in each streaming chunk
spkcache_update_period: 188 # Speaker cache update period in frames
chunk_left_context: 1 # Number of previous frames for each streaming chunk
chunk_right_context: 1 # Number of future frames for each streaming chunk
# Speaker cache update parameters
spkcache_sil_frames_per_spk: 3 # Number of silence frames allocated per speaker in the speaker cache
scores_add_rnd: 0 # Standard deviation of random noise added to scores in speaker cache update (training only)
pred_score_threshold: 0.25 # Probability threshold for internal scores processing in speaker cache update
max_index: 99999 # Maximum allowed index value for internal processing in speaker cache update
scores_boost_latest: 0.05 # Gain for scores for recently added frames in speaker cache update
sil_threshold: 0.2 # Threshold for determining silence frames to calculate average silence embedding
strong_boost_rate: 0.75 # Rate determining number of frames per speaker that receive strong score boosting
weak_boost_rate: 1.5 # Rate determining number of frames per speaker that receive weak score boosting
min_pos_scores_rate: 0.5 # Rate threshold for dropping overlapping frames when enough non-overlapping exist
# Self-attention parameters (training only)
causal_attn_rate: 0.5 # Proportion of batches that use self-attention with limited right context
causal_attn_rc: 7 # Right context size for self-attention with limited right context
encoder:
_target_: nemo.collections.asr.modules.ConformerEncoder
feat_in: ${model.preprocessor.features}
feat_out: -1
n_layers: 17
d_model: ${model.model_defaults.fc_d_model}
# Sub-sampling parameters
subsampling: dw_striding # vggnet, striding, stacking or stacking_norm, dw_striding
subsampling_factor: 8 # must be power of 2 for striding and vggnet
subsampling_conv_channels: 256 # set to -1 to make it equal to the d_model
causal_downsampling: false
# Feed forward module's params
ff_expansion_factor: 4
# Multi-headed Attention Module's params
self_attention_model: rel_pos # rel_pos or abs_pos
n_heads: 8 # may need to be lower for smaller d_models
# [left, right] specifies the number of steps to be seen from left and right of each step in self-attention
att_context_size: [-1, -1] # -1 means unlimited context
att_context_style: regular # regular or chunked_limited
xscaling: true # scales up the input embeddings by sqrt(d_model)
untie_biases: true # unties the biases of the TransformerXL layers
pos_emb_max_len: 5000
# Convolution module's params
conv_kernel_size: 9
conv_norm_type: 'batch_norm' # batch_norm or layer_norm or groupnormN (N specifies the number of groups)
conv_context_size: null
# Regularization
dropout: 0.1 # The dropout used in most of the Conformer Modules
dropout_pre_encoder: 0.1 # The dropout used before the encoder
dropout_emb: 0.0 # The dropout used for embeddings
dropout_att: 0.1 # The dropout for multi-headed attention modules
# Set to non-zero to enable stochastic depth
stochastic_depth_drop_prob: 0.0
stochastic_depth_mode: linear # linear or uniform
stochastic_depth_start_layer: 1
transformer_encoder:
_target_: nemo.collections.asr.modules.transformer.transformer_encoders.TransformerEncoder
num_layers: 18
hidden_size: ${model.model_defaults.tf_d_model} # Needs to be multiple of num_attention_heads
inner_size: 768
num_attention_heads: 8
attn_score_dropout: 0.5
attn_layer_dropout: 0.5
ffn_dropout: 0.5
hidden_act: relu
pre_ln: False
pre_ln_final_layer_norm: True
loss:
_target_: nemo.collections.asr.losses.bce_loss.BCELoss
weight: null # Weight for binary cross-entropy loss. Either `null` or list type input. (e.g. [0.5,0.5])
reduction: mean
lr: 0.0001
optim:
name: adamw
lr: ${model.lr}
# optimizer arguments
betas: [0.9, 0.98]
weight_decay: 1e-3
sched:
name: InverseSquareRootAnnealing
warmup_steps: 500
warmup_ratio: null
min_lr: 1e-06
trainer:
devices: 1 # number of gpus (devices)
accelerator: gpu
max_epochs: 800
max_steps: -1 # computed at runtime if not set
num_nodes: 1
strategy: ddp_find_unused_parameters_true # Could be "ddp"
accumulate_grad_batches: 1
deterministic: True
enable_checkpointing: False
logger: False
log_every_n_steps: 1 # Interval of logging.
val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations
exp_manager:
use_datetime_version: False
exp_dir: null
name: ${name}
resume_if_exists: True
resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc.
resume_ignore_no_checkpoint: True
create_tensorboard_logger: True
create_checkpoint_callback: True
create_wandb_logger: False
checkpoint_callback_params:
monitor: "val_f1_acc"
mode: "max"
save_top_k: 9
every_n_epochs: 1
wandb_logger_kwargs:
resume: True
name: null
project: null
Hydra Configurations for (Streaming) Sortformer Diarization Post-processing#
Post-processing converts the floating point number based Tensor output to time stamp output. While generating the speaker-homogeneous segments, onset and offset threshold, paddings can be considered to render the time stamps that can lead to the lowest diarization error rate (DER). This post-processing can be applied to both offline and streaming Sortformer diarizer.
By default, post-processing is bypassed, and only binarization is performed. If you want to reproduce DER scores reported on NeMo model cards, you need to apply post-processing steps. Use batch_size = 1 to have the longest inference window and the highest possible accuracy.
parameters:
onset: 0.64 # Onset threshold for detecting the beginning of a speech segment
offset: 0.74 # Offset threshold for detecting the end of a speech segment
pad_onset: 0.06 # Adds the specified duration at the beginning of each speech segment
pad_offset: 0.0 # Adds the specified duration at the end of each speech segment
min_duration_on: 0.1 # Removes short speech segments if the duration is less than the specified minimum duration
min_duration_off: 0.15 # Removes short silences if the duration is less than the specified minimum duration
Hydra Configurations for Diarization Inference#
Example configuration files for speaker diarization inference can be found in <NeMo_root>/examples/speaker_tasks/diarization/conf/inference/. Choose a yaml file that fits your targeted domain. For example, if you want to diarize audio recordings of telephonic speech, choose diar_infer_telephonic.yaml.
The configurations for all the components of diarization inference are included in a single file named diar_infer_<domain>.yaml. Each .yaml file has a few different sections for the following modules: VAD, Speaker Embedding, Clustering and ASR.
In speaker diarization inference, the datasets provided in manifest format denote the data that you would like to perform speaker diarization on.
Diarizer Configurations#
An example diarizer Hydra configuration could look like:
diarizer:
manifest_filepath: ???
out_dir: ???
oracle_vad: False # If True, uses RTTM files provided in manifest file to get speech activity (VAD) timestamps
collar: 0.25 # Collar value for scoring
ignore_overlap: True # Consider or ignore overlap segments while scoring
Under diarizer key, there are vad, speaker_embeddings, clustering and asr keys containing configurations for the inference of the corresponding modules.
Configurations for Voice Activity Detector#
Parameters for VAD model are provided as in the following Hydra config example.
vad:
model_path: null # .nemo local model path or pretrained model name or none
external_vad_manifest: null # This option is provided to use external vad and provide its speech activity labels for speaker embeddings extraction. Only one of model_path or external_vad_manifest should be set
parameters: # Tuned parameters for CH109 (using the 11 multi-speaker sessions as dev set)
window_length_in_sec: 0.15 # Window length in sec for VAD context input
shift_length_in_sec: 0.01 # Shift length in sec for generate frame level VAD prediction
smoothing: "median" # False or type of smoothing method (eg: median)
overlap: 0.875 # Overlap ratio for overlapped mean/median smoothing filter
onset: 0.4 # Onset threshold for detecting the beginning and end of a speech
offset: 0.7 # Offset threshold for detecting the end of a speech
pad_onset: 0.05 # Adding durations before each speech segment
pad_offset: -0.1 # Adding durations after each speech segment
min_duration_on: 0.2 # Threshold for short speech segment deletion
min_duration_off: 0.2 # Threshold for small non_speech deletion
filter_speech_first: True
Configurations for Speaker Embedding in Diarization#
Parameters for speaker embedding model are provided in the following Hydra config example. Note that multiscale parameters either accept list or single floating point number.
speaker_embeddings:
model_path: ??? # .nemo local model path or pretrained model name (titanet_large, ecapa_tdnn or speakerverification_speakernet)
parameters:
window_length_in_sec: 1.5 # Window length(s) in sec (floating-point number). Either a number or a list. Ex) 1.5 or [1.5,1.25,1.0,0.75,0.5]
shift_length_in_sec: 0.75 # Shift length(s) in sec (floating-point number). Either a number or a list. Ex) 0.75 or [0.75,0.625,0.5,0.375,0.25]
multiscale_weights: null # Weight for each scale. should be null (for single scale) or a list matched with window/shift scale count. Ex) [1,1,1,1,1]
save_embeddings: False # Save embeddings as pickle file for each audio input.
Configurations for Clustering in Diarization#
Parameters for clustering algorithm are provided in the following Hydra config example.
clustering:
parameters:
oracle_num_speakers: False # If True, use num of speakers value provided in the manifest file.
max_num_speakers: 20 # Max number of speakers for each recording. If oracle_num_speakers is passed, this value is ignored.
enhanced_count_thres: 80 # If the number of segments is lower than this number, enhanced speaker counting is activated.
max_rp_threshold: 0.25 # Determines the range of p-value search: 0 < p <= max_rp_threshold.
sparse_search_volume: 30 # The higher the number, the more values will be examined with more time.
Configurations for Diarization with ASR#
The following configuration needs to be appended under diarizer to run ASR with diarization to get a transcription with speaker labels.
asr:
model_path: ??? # Provide NGC cloud ASR model name. stt_en_conformer_ctc_* models are recommended for diarization purposes.
parameters:
asr_based_vad: False # if True, speech segmentation for diarization is based on word-timestamps from ASR inference.
asr_based_vad_threshold: 50 # threshold (multiple of 10ms) for ignoring the gap between two words when generating VAD timestamps using ASR based VAD.
asr_batch_size: null # Batch size can be dependent on each ASR model. Default batch sizes are applied if set to null.
lenient_overlap_WDER: True # If true, when a word falls into speaker-overlapped regions, consider the word as a correctly diarized word.
decoder_delay_in_sec: null # Native decoder delay. null is recommended to use the default values for each ASR model.
word_ts_anchor_offset: null # Offset to set a reference point from the start of the word. Recommended range of values is [-0.05 0.2].
word_ts_anchor_pos: "start" # Select which part of the word timestamp we want to use. The options are: 'start', 'end', 'mid'.
fix_word_ts_with_VAD: False # Fix the word timestamp using VAD output. You must provide a VAD model to use this feature.
colored_text: False # If True, use colored text to distinguish speakers in the output transcript.
print_time: True # If True, the start of the end time of each speaker turn is printed in the output transcript.
break_lines: False # If True, the output transcript breaks the line to fix the line width (default is 90 chars)
ctc_decoder_parameters: # Optional beam search decoder (pyctcdecode)
pretrained_language_model: null # KenLM model file: .arpa model file or .bin binary file.
beam_width: 32
alpha: 0.5
beta: 2.5
realigning_lm_parameters: # Experimental feature
arpa_language_model: null # Provide a KenLM language model in .arpa format.
min_number_of_words: 3 # Min number of words for the left context.
max_number_of_words: 10 # Max number of words for the right context.
logprob_diff_threshold: 1.2 # The threshold for the difference between two log probability values from two hypotheses.