Speech to Text Datasets

Speech to text datasets

Lhotse-Based Dataset

LibriSpeech DataModule

Usage

# Data module configured for the "mini_librispeech" parts, with paths under ../data/en.
dm = LibriSpeechDataModule(
    dataset_parts="mini_librispeech",
    target_dir="../data/en",
    output_dir="../data/en/LibriSpeech",
    num_jobs=1,
)
# Download step: it is slow, so skip it at export time.
dm.prepare_data()
# libri = prepare_librispeech("../data/en/LibriSpeech", dataset_parts='mini_librispeech')
#! rm ../data/en/LibriSpeech/*.gz
# dm.setup(stage='test')
# dm.cuts_test
# Load the recording and supervision manifests for the dev-clean-2 files.
recs = RecordingSet.from_file("../data/en/LibriSpeech/librispeech_recordings_dev-clean-2.jsonl.gz")
# Use the from_file loader here as well. Passing the path straight to the
# SupervisionSet constructor does not read the manifest: len() then came out as
# 68, which is the number of characters in the path string, not a segment count.
sup = SupervisionSet.from_file("../data/en/LibriSpeech/librispeech_supervisions_dev-clean-2.jsonl.gz")
print(len(recs), len(sup))
1089 68
# test_dl = dm.test_dataloader()
# b = next(iter(test_dl))
# print(b["feats_pad"].shape, b["tokens_pad"].shape, b["ilens"].shape)
# plt.imshow(b["feats_pad"][0].transpose(0,1), origin='lower')

# dm.tokenizer.idx2token(b["tokens_pad"][0])
# dm.tokenizer.inverse(b["tokens_pad"][0], b["ilens"][0])
# print(dm.cuts_test)
# cut = dm.cuts_test[0]
# # pprint(cut.to_dict())
# cut.plot_audio()

MiniLibriSpeech Dataset

class MiniLibriSpeechDataset(Dataset):
    """Map-style dataset of (features, label) pairs from a LibriSpeech-style tree.

    The constructor scans ``<root>/<subset>/<speaker>/<chapter>/`` for a
    ``<speaker>-<chapter>.trans.txt`` transcript file and keeps every
    utterance whose ``<utt_id>.flac`` file exists in the same directory.
    Audio is only decoded when an item is requested.

    Args:
        root: Directory that contains the subset folders.
        subset: Name of the subset folder under ``root``.
        audio_transform: Optional callable applied to the loaded waveform
            (for example a ``torchaudio.transforms.MFCC`` instance).
        text_transform: Optional callable applied to the lower-cased
            transcript (for example a tokenizer's ``encode``).

    Raises:
        OSError: if ``<root>/<subset>`` cannot be listed (e.g.
            ``FileNotFoundError`` when it does not exist).
    """

    def __init__(
        self,
        root: str,
        subset: str,
        audio_transform=None,
        text_transform=None,
    ):
        super().__init__()
        self.audio_transform = audio_transform
        self.text_transform = text_transform
        self.data = []  # (audio_path, transcript) pairs

        subset_dir = Path(root) / subset
        # Sorted so that the index -> utterance mapping does not depend on the
        # order in which the filesystem happens to list directories.
        for speaker_dir in sorted(subset_dir.iterdir()):
            if not speaker_dir.is_dir():
                continue
            for chapter_dir in sorted(speaker_dir.iterdir()):
                if not chapter_dir.is_dir():
                    continue
                trn_path = chapter_dir / f"{speaker_dir.name}-{chapter_dir.name}.trans.txt"
                if not trn_path.exists():
                    continue
                for utt_id, text in self._read_transcripts(trn_path).items():
                    audio_path = chapter_dir / f"{utt_id}.flac"
                    # Transcript entries without a matching audio file are dropped.
                    if audio_path.exists():
                        self.data.append((audio_path, text))

    @staticmethod
    def _read_transcripts(trn_path):
        """Parse a ``.trans.txt`` file into an ordered ``{utt_id: transcript}`` dict.

        Each non-blank line is ``<utt_id> <word> <word> ...``; runs of
        whitespace inside the transcript collapse to single spaces.
        """
        transcripts = {}
        with open(trn_path, "r", encoding="utf-8") as f:
            for line in f:
                tokens = line.split()
                if not tokens:
                    # Blank line (e.g. a trailing newline): nothing to index.
                    continue
                transcripts[tokens[0]] = " ".join(tokens[1:])
        return transcripts

    def __getitem__(self, idx: int):
        """Load utterance ``idx`` and return ``(features, label)``.

        ``features`` is the loaded waveform, or ``audio_transform(waveform)``
        when a transform was given, with a leading dimension of size 1
        squeezed away.  ``label`` is the lower-cased transcript, or
        ``text_transform`` applied to it when a transform was given.
        """
        audio_path, trn = self.data[idx]
        # The sample rate returned by the loader is not used here.
        waveform, _sample_rate = torchaudio.load(audio_path)

        # Compare against None rather than relying on truthiness: a transform
        # object that defines __len__/__bool__ could otherwise be skipped.
        feats = waveform if self.audio_transform is None else self.audio_transform(waveform)

        label = trn.lower()
        if self.text_transform is not None:
            label = self.text_transform(label)

        return feats.squeeze(0), label

    def __len__(self):
        """Return the number of (audio, transcript) pairs found at construction."""
        return len(self.data)

Audio & text transforms

# Without an audio_transform, each item is (waveform, lower-cased transcript).
ds = MiniLibriSpeechDataset('../data/en/LibriSpeech', 'dev-clean-2')
# Index once and unpack: every ds[0] access loads the audio file again.
waveform, transcript = ds[0]
print(len(ds), waveform.shape, transcript)

# Same data, with MFCC features computed from the waveform on access.
mfcc_transform = torchaudio.transforms.MFCC(sample_rate=16000, n_mfcc=13)
ds = MiniLibriSpeechDataset(
    "../data/en/LibriSpeech", "dev-clean-2",
    audio_transform=mfcc_transform,
)
feats, transcript = ds[0]
print(feats.shape, transcript)
1089 torch.Size([186560]) if the reader will excuse me i will say nothing of my antecedents nor of the circumstances which led me to leave my native country the narrative would be tedious to him and painful to myself
torch.Size([13, 933]) if the reader will excuse me i will say nothing of my antecedents nor of the circumstances which led me to leave my native country the narrative would be tedious to him and painful to myself
import string

# Character vocabulary: lowercase letters, digits, space and basic punctuation.
vocab = [*string.ascii_lowercase, *string.digits, ' ', '.', ',', '?', '!', '-', "'"]
tok = CharTokenizer(vocab)
# Encode/decode round trip on a short string.
encoded = tok.encode('hello!')
decoded = tok.decode(encoded)
print(decoded)

# MFCC features for the audio, tokenizer encoding for the transcript.
ds = MiniLibriSpeechDataset(
    "../data/en/LibriSpeech", "dev-clean-2",
    audio_transform=mfcc_transform,
    text_transform=tok.encode,
)
# Index once and unpack: every ds[0] access loads and transforms the audio again.
feats, encoded_text = ds[0]
print(feats.shape, tok.decode(encoded_text))
hello!
torch.Size([13, 933]) if the reader will excuse me i will say nothing of my antecedents nor of the circumstances which led me to leave my native country the narrative would be tedious to him and painful to myself