Audio TTS Datasets

TTS datasets

LibriTTS

Lhotse-based Base Class

https://github.com/Lightning-AI/lightning/issues/10358 https://colab.research.google.com/drive/1HKSYPsWx_HoCdrnLpaPdYj5zwlPsM3NH

# tok = TokenCollater()
# ds = LhotseTTSDataset(tok)

TTS Base Class

LibriTTS DataModule

# Quick sanity check of the raw dataset: load the 'test-clean' split from the
# local data folder, then inspect its first item.
# Item layout, as indexed below:
# (Waveform, Sample_rate, Original_text, Normalized_text, Speaker_ID, Chapter_ID, Utterance_ID)
ds = LIBRITTS("../data/en", 'test-clean')
# Print the first item in full.
print(ds[0])
# Plot the first item's field 0 (waveform) using its field 1 (sample rate),
# per the layout listed above.
plot_waveform(ds[0][0], ds[0][1])

Usage

# Usage example: build the datamodule, set it up for the test stage, pull one
# batch from the test dataloader and inspect its contents.
#
# Original note: num_jobs=0 turns parallel computing off within jupyter notebook. Else it fails.
# NOTE(review): the note above mentions num_jobs=0, but the call below passes
# num_jobs=1 -- confirm which value is intended and update the note or the code.
dm = LibriTTSDataModule(
    target_dir="../data/en", 
    dataset_parts="test-clean",
    output_dir="../data/en/LibriTTS/test-clean",
    num_jobs=1
)
# skip download and use local data folder
# dm.prepare_data()
# libri = prepare_libritts("../data/en/LibriTTS", dataset_parts="test-clean")
dm.setup(stage='test')
test_dl = dm.test_dataloader()
# Take only the first batch from the test dataloader.
batch = next(iter(test_dl))
# The batch is keyed by name; the keys read below are 'feats_pad',
# 'feats_lens', 'tokens_pad' and 'tokens_lens'.
print(batch.keys())
print(batch['feats_pad'].shape)
# Show the entry at index 3 of the padded features as an image, with its first
# two axes swapped. Presumably this puts time on the horizontal axis -- TODO
# confirm the feature layout. Assumes the batch holds at least 4 items.
plt.imshow(batch['feats_pad'][3].transpose(0,1))
print(batch['feats_lens'])
# Padded tokens and the token length for the same entry (index 3).
print(batch['tokens_pad'][3], batch['tokens_lens'][3])
# Run the whole batch of padded tokens and lengths through the tokenizer's
# inverse; presumably this recovers the input text -- verify against the
# tokenizer implementation.
original_sentences = dm.tokenizer.inverse(batch['tokens_pad'], batch['tokens_lens'])
print(original_sentences)