Source code for nemo.collections.asr.data.audio_to_text_lhotse_speaker

# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import random
from typing import Dict, Optional, Tuple

import torch.utils.data
from lhotse.dataset import AudioSamples
from lhotse.dataset.collation import collate_vectors

from nemo.collections.asr.data.audio_to_text_lhotse import TokenizerWrapper
from nemo.collections.asr.parts.utils.asr_multispeaker_utils import speaker_to_target
from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec
from nemo.core.neural_types import AudioSignal, LabelsType, LengthsType, NeuralType


[docs] class LhotseSpeechToTextSpkBpeDataset(torch.utils.data.Dataset): """ This dataset is based on BPE datasets from audio_to_text.py. It has the same functionality of LhotseSpeechToTextBpeDataset but also yield speaker target tensor. Unlike native NeMo datasets, Lhotse dataset defines only the mapping from a CutSet (meta-data) to a mini-batch with PyTorch tensors. Specifically, it performs tokenization, I/O, augmentation, and feature extraction (if any). Managing data, sampling, de-duplication across workers/nodes etc. is all handled by Lhotse samplers instead. """ @property def output_types(self) -> Optional[Dict[str, NeuralType]]: return { 'audio_signal': NeuralType(('B', 'T'), AudioSignal()), 'a_sig_length': NeuralType(tuple('B'), LengthsType()), 'transcripts': NeuralType(('B', 'T'), LabelsType()), 'transcript_length': NeuralType(tuple('B'), LengthsType()), 'spk_targets': NeuralType(('B', 'T'), LabelsType()), 'bg_spk_targets': NeuralType(('B', 'T'), LabelsType()), } def __init__(self, cfg, tokenizer: TokenizerSpec): super().__init__() self.tokenizer = TokenizerWrapper(tokenizer) self.load_audio = AudioSamples(fault_tolerant=True) self.cfg = cfg self.num_speakers = self.cfg.get('num_speakers', 4) self.num_sample_per_mel_frame = self.cfg.get('num_sample_per_mel_frame', 160) self.num_mel_frame_per_asr_frame = self.cfg.get('num_mel_frame_per_asr_frame', 8) self.fixed_spk_id = self.cfg.get('fixed_spk_id', None) self.inference_mode = self.cfg.get('inference_mode', False) def __getitem__(self, cuts) -> Tuple[torch.Tensor, ...]: audio, audio_lens, cuts = self.load_audio(cuts) tokens = [] spk_targets = [] bg_spk_targets = [] if self.inference_mode: return audio, audio_lens, None, None, None, None for idx, cut in enumerate(cuts): speaker_targets, texts = speaker_to_target( a_cut=cut, num_speakers=self.num_speakers, num_sample_per_mel_frame=self.num_sample_per_mel_frame, num_mel_frame_per_asr_frame=self.num_mel_frame_per_asr_frame, return_text=True, ) speaker_targets = speaker_targets.transpose(0, 1)[: len(texts)] target_speaker_id = random.choice(range(len(texts))) non_target_speaker_ids = [i for i in range(len(texts)) if i != target_speaker_id] text = texts[target_speaker_id] speaker_target = speaker_targets[target_speaker_id] bg_speaker_target = speaker_targets[non_target_speaker_ids].sum(dim=0) > 0 tokens.append(torch.as_tensor(self.tokenizer(text, cut.supervisions[0].language))) spk_targets.append(speaker_target) bg_spk_targets.append(bg_speaker_target) token_lens = torch.tensor([t.size(0) for t in tokens], dtype=torch.long) tokens = collate_vectors(tokens, padding_value=0) spk_targets = collate_vectors(spk_targets, padding_value=0) bg_spk_targets = collate_vectors(bg_spk_targets, padding_value=0) return audio, audio_lens, tokens, token_lens, spk_targets, bg_spk_targets