# Configure the LibriSpeech data module for the "mini_librispeech" parts.
# All paths are relative to the notebook's working directory.
# NOTE(review): presumably `target_dir` is the download location and
# `output_dir` is where the manifests are written - confirm against
# LibriSpeechDataModule.
dm = LibriSpeechDataModule(
    target_dir="../data/en",
    dataset_parts="mini_librispeech",
    output_dir="../data/en/LibriSpeech",
    num_jobs=1,
)

# Speech to Text Datasets
# Speech to text datasets
# Lhotse-Based Dataset
# LibriSpeech DataModule
# Usage
# skip this at export time to not waste time
# download
dm.prepare_data()

# libri = prepare_librispeech("../data/en/LibriSpeech", dataset_parts='mini_librispeech')

#! rm ../data/en/LibriSpeech/*.gz

# dm.setup(stage='test')
# dm.cuts_test

# Read the recording and supervision manifests of the dev-clean-2 split.
recs = RecordingSet.from_file("../data/en/LibriSpeech/librispeech_recordings_dev-clean-2.jsonl.gz")
# Load the supervisions with from_file as well: handing the path string to the
# SupervisionSet constructor does not read the manifest. The "68" in the
# recorded output below matches the length of the path string, not a number of
# supervisions.
sup = SupervisionSet.from_file("../data/en/LibriSpeech/librispeech_supervisions_dev-clean-2.jsonl.gz")
print(len(recs), len(sup))
# Recorded output (from before the from_file fix above):
# 1089 68
# test_dl = dm.test_dataloader()
# b = next(iter(test_dl))
# print(b["feats_pad"].shape, b["tokens_pad"].shape, b["ilens"].shape)
# plt.imshow(b["feats_pad"][0].transpose(0,1), origin='lower')
# dm.tokenizer.idx2token(b["tokens_pad"][0])
# dm.tokenizer.inverse(b["tokens_pad"][0], b["ilens"][0])

# print(dm.cuts_test)
# cut = dm.cuts_test[0]
# # pprint(cut.to_dict())
# cut.plot_audio()

# MiniLibriSpeech Dataset
class MiniLibriSpeechDataset(Dataset):
    """Map-style dataset over one LibriSpeech-layout subset directory.

    The constructor walks ``root/subset/<speaker>/<chapter>/`` and, for every
    chapter that has a ``<speaker>-<chapter>.trans.txt`` file, records one
    ``(audio_path, transcript)`` pair per transcript line whose
    ``<utterance-id>.flac`` file exists next to it.  Entries that are not
    directories, chapters without a transcript file, and utterances without
    audio are skipped silently.

    Args:
        root: Directory that contains the subset folders.
        subset: Name of the subset folder to index (e.g. ``"dev-clean-2"``).
        audio_transform: Optional callable applied to the loaded waveform,
            e.g. ``torchaudio.transforms.MFCC(...)``.  ``None`` returns the
            raw waveform.
        text_transform: Optional callable applied to the lower-cased
            transcript, e.g. a tokenizer's ``encode``.  ``None`` returns the
            lower-cased string.
    """

    def __init__(
        self,
        root: str,
        subset: str,
        # `torchaudio.transforms` is a module, not a type, so it cannot serve
        # as an annotation; like `text_transform`, any callable is accepted.
        audio_transform=None,
        text_transform=None,
    ):
        self.data = []  # (audio_path, transcript) pairs
        self.audio_transform = audio_transform
        self.text_transform = text_transform
        subset_dir = Path(root) / subset
        for speaker_dir in subset_dir.iterdir():
            if not speaker_dir.is_dir():
                continue
            for chapter_dir in speaker_dir.iterdir():
                if not chapter_dir.is_dir():
                    continue
                self.data.extend(
                    self._index_chapter(speaker_dir.name, chapter_dir)
                )

    @staticmethod
    def _index_chapter(speaker: str, chapter_dir: Path) -> list:
        """Return the ``(audio_path, transcript)`` pairs of one chapter directory."""
        trn_path = chapter_dir / f"{speaker}-{chapter_dir.name}.trans.txt"
        if not trn_path.exists():
            return []
        # Each transcript line is "<utterance-id> <word> <word> ...".
        transcripts = {}
        with open(trn_path, "r", encoding="utf-8") as f:
            for line in f:
                fields = line.split()
                if not fields:
                    # A blank line has no utterance id; skip it instead of
                    # failing with IndexError.
                    continue
                transcripts[fields[0]] = " ".join(fields[1:])
        pairs = []
        for utt_id, text in transcripts.items():
            audio_path = chapter_dir / f"{utt_id}.flac"
            if audio_path.exists():
                pairs.append((audio_path, text))
        return pairs

    def __getitem__(self, idx: int):
        """Load sample ``idx`` and return ``(features, label)``.

        The transcript is lower-cased before ``text_transform`` is applied.
        """
        audio_path, trn = self.data[idx]
        waveform, _sr = torchaudio.load(audio_path)
        # Compare against None rather than relying on truthiness, so a
        # transform object that happens to be falsy (e.g. an empty container
        # module) is still applied.
        if self.audio_transform is not None:
            feats = self.audio_transform(waveform)
        else:
            feats = waveform
        text = trn.lower()
        if self.text_transform is not None:
            label = self.text_transform(text)
        else:
            label = text
        # squeeze(0) drops the first dimension only when it has size 1.
        return feats.squeeze(0), label

    def __len__(self):
        """Return the number of indexed utterances."""
        return len(self.data)


# Audio & text transforms
# Without transforms the dataset yields the raw waveform and the lower-cased
# transcript string.
ds = MiniLibriSpeechDataset('../data/en/LibriSpeech', 'dev-clean-2')
feats, text = ds[0]  # index once so the audio file is loaded only once
print(len(ds), feats.shape, text)

# With an MFCC audio transform the first element becomes a feature matrix.
mfcc_transform = torchaudio.transforms.MFCC(sample_rate=16000, n_mfcc=13)
ds = MiniLibriSpeechDataset(
    "../data/en/LibriSpeech", "dev-clean-2",
    audio_transform=mfcc_transform,
)
feats, text = ds[0]
print(feats.shape, text)
# Recorded output:
# 1089 torch.Size([186560]) if the reader will excuse me i will say nothing of my antecedents nor of the circumstances which led me to leave my native country the narrative would be tedious to him and painful to myself
# torch.Size([13, 933]) if the reader will excuse me i will say nothing of my antecedents nor of the circumstances which led me to leave my native country the narrative would be tedious to him and painful to myself
import string

# Character vocabulary: lowercase letters, digits, space and basic punctuation.
vocab = list(string.ascii_lowercase + string.digits) + [' ', '.', ',', '?', '!', '-', "'"]
tok = CharTokenizer(vocab)

# Round-trip check: decoding the encoded text gives the original string back
# (see the first line of the recorded output).
encoded = tok.encode('hello!')
decoded = tok.decode(encoded)
print(decoded)

# Same dataset as before, now with transcripts encoded by the tokenizer.
ds = MiniLibriSpeechDataset(
    "../data/en/LibriSpeech", "dev-clean-2",
    audio_transform=mfcc_transform,
    text_transform=tok.encode,
)
# print(ds[0][0].shape, ds[0][1])
feats, tokens = ds[0]  # index once so the audio file is loaded only once
print(feats.shape, tok.decode(tokens))
# Recorded output:
# hello!
# torch.Size([13, 933]) if the reader will excuse me i will say nothing of my antecedents nor of the circumstances which led me to leave my native country the narrative would be tedious to him and painful to myself