TTS Lhotse

Lhotse support for datasets allows us to leverage the preliminary data prep from Lhotse recipes.

Usage

Download data and load into Lhotse cuts

# download_ljspeech('~/Data/en/')
# skip this step, already done
ljspeech = prepare_ljspeech('../data/en/LJSpeech-1.1', '../recipes/tts/ljspeech/data')
cut_set = CutSet.from_manifests(**ljspeech)
subset = cut_set.subset(first=3)
subset.to_file('../recipes/tts/ljspeech/data/first_3.jsonl.gz')
reload_subset = CutSet.from_file('../recipes/tts/ljspeech/data/first_3.jsonl.gz')
print(subset[1])
print(reload_subset[1])
print(len(subset))
MonoCut(id='LJ001-0002-1', start=0, duration=1.899546485260771, channel=0, supervisions=[SupervisionSegment(id='LJ001-0002', recording_id='LJ001-0002', start=0.0, duration=1.899546485260771, channel=0, text='in being comparatively modern.', language='English', speaker=None, gender='female', custom={'normalized_text': 'in being comparatively modern.'}, alignment=None)], features=None, recording=Recording(id='LJ001-0002', sources=[AudioSource(type='file', channels=[0], source='../data/en/LJSpeech-1.1/wavs/LJ001-0002.wav')], sampling_rate=22050, num_samples=41885, duration=1.899546485260771, channel_ids=[0], transforms=None), custom=None)
MonoCut(id='LJ001-0002-1', start=0, duration=1.899546485260771, channel=0, supervisions=[SupervisionSegment(id='LJ001-0002', recording_id='LJ001-0002', start=0.0, duration=1.899546485260771, channel=0, text='in being comparatively modern.', language='English', speaker=None, gender='female', custom={'normalized_text': 'in being comparatively modern.'}, alignment=None)], features=None, recording=Recording(id='LJ001-0002', sources=[AudioSource(type='file', channels=[0], source='../data/en/LJSpeech-1.1/wavs/LJ001-0002.wav')], sampling_rate=22050, num_samples=41885, duration=1.899546485260771, channel_ids=[0], transforms=None), custom=None)
3
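For reference, the code in this walkthrough assumes roughly the following imports. The Lhotse imports are standard; EncoDecExtractor, TTSTextNormalizer, Phonemizer and PhonemeCollater are project-local classes whose module paths are not shown in this page, so the commented lines are assumptions:

import json
from typing import Dict

import torch
from torch.utils.data import DataLoader, Dataset

from lhotse import CutSet
from lhotse.dataset import DynamicBucketingSampler
from lhotse.dataset.input_strategies import BatchIO, OnTheFlyFeatures, PrecomputedFeatures
from lhotse.recipes import download_ljspeech, prepare_ljspeech

# Project-local classes (module paths assumed):
# from recipes.tts.ljspeech import EncoDecExtractor, TTSTextNormalizer, Phonemizer, PhonemeCollater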
Encodec feature extractor
encodec_extractor = EncoDecExtractor()
# torch.set_num_threads(1)
# torch.set_num_interop_threads(1)
# TODO: fix bug for n_jobs > 1
cuts = subset.compute_and_store_features(
    extractor=encodec_extractor,
    storage_path="../recipes/tts/ljspeech/data/encodec",
    num_jobs=1,
    # storage_type=NumpyHdf5Writer,
)
Extracting and storing features: 100%|██████████| 3/3 [00:00<00:00, 4.57it/s]
print(cuts[0])
MonoCut(id='LJ001-0001-0', start=0, duration=9.65501133786848, channel=0, supervisions=[SupervisionSegment(id='LJ001-0001', recording_id='LJ001-0001', start=0.0, duration=9.65501133786848, channel=0, text='Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition', language='English', speaker=None, gender='female', custom={'normalized_text': 'Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition'}, alignment=None)], features=Features(type='encodec', num_frames=724, num_features=8, frame_shift=0.013333333333333334, sampling_rate=22050, start=0, duration=9.65501134, storage_type='lilcom_chunky', storage_path='../recipes/tts/ljspeech/data/encodec.lca', storage_key='0,8029,3610', recording_id='None', channels=0), recording=Recording(id='LJ001-0001', sources=[AudioSource(type='file', channels=[0], source='../data/en/LJSpeech-1.1/wavs/LJ001-0001.wav')], sampling_rate=22050, num_samples=212893, duration=9.65501133786848, channel_ids=[0], transforms=None), custom=None)
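EncoDecExtractor is not part of Lhotse itself. Custom extractors generally subclass lhotse.features.base.FeatureExtractor and implement extract, frame_shift and feature_dim; the metadata above (type='encodec', num_features=8, frame_shift≈1/75 s) is consistent with that pattern. A minimal sketch of the shape of such a class, with a placeholder where the real EnCodec encoder call would go (this is not the repo's actual implementation):

from dataclasses import dataclass

import numpy as np

from lhotse.features.base import FeatureExtractor, register_extractor
from lhotse.utils import Seconds, compute_num_frames


@dataclass
class EncodecLikeConfig:
    frame_shift: Seconds = 1 / 75  # matches the 0.0133... s frame shift above
    num_codebooks: int = 8         # matches num_features=8 above


@register_extractor
class EncodecLikeExtractor(FeatureExtractor):
    """Sketch only: a real implementation would run the EnCodec encoder in extract()."""
    name = "encodec_like"
    config_type = EncodecLikeConfig

    @property
    def frame_shift(self) -> Seconds:
        return self.config.frame_shift

    def feature_dim(self, sampling_rate: int) -> int:
        return self.config.num_codebooks

    def extract(self, samples: np.ndarray, sampling_rate: int) -> np.ndarray:
        num_frames = compute_num_frames(
            duration=samples.shape[-1] / sampling_rate,
            frame_shift=self.frame_shift,
            sampling_rate=sampling_rate,
        )
        # Placeholder: zeros instead of the real (num_frames, num_codebooks) codec codes.
        return np.zeros((num_frames, self.config.num_codebooks), dtype=np.float32)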
"../recipes/tts/ljspeech/data/first_3.encodec.jsonl.gz")
cuts.to_file(0]
cuts[= CutSet.from_file("../recipes/tts/ljspeech/data/first_3.encodec.jsonl.gz")
reload_cuts 0] reload_cuts[
MonoCut(id='LJ001-0001-0', start=0, duration=9.65501133786848, channel=0, supervisions=[SupervisionSegment(id='LJ001-0001', recording_id='LJ001-0001', start=0.0, duration=9.65501133786848, channel=0, text='Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition', language='English', speaker=None, gender='female', custom={'normalized_text': 'Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition'}, alignment=None)], features=Features(type='encodec', num_frames=724, num_features=8, frame_shift=0.013333333333333334, sampling_rate=22050, start=0, duration=9.65501134, storage_type='lilcom_chunky', storage_path='../recipes/tts/ljspeech/data/encodec.lca', storage_key='0,8029,3610', recording_id='None', channels=0), recording=Recording(id='LJ001-0001', sources=[AudioSource(type='file', channels=[0], source='../data/en/LJSpeech-1.1/wavs/LJ001-0001.wav')], sampling_rate=22050, num_samples=212893, duration=9.65501133786848, channel_ids=[0], transforms=None), custom=None)
# cuts[0].recording
!soxi '../data/en/LJSpeech-1.1/wavs/LJ001-0001.wav'
Input File : '../data/en/LJSpeech-1.1/wavs/LJ001-0001.wav'
Channels : 1
Sample Rate : 22050
Precision : 16-bit
Duration : 00:00:09.66 = 212893 samples ~ 724.126 CDDA sectors
File Size : 426k
Bit Rate : 353k
Sample Encoding: 16-bit Signed Integer PCM
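The soxi output lines up with the Features metadata above: 212893 samples at 22050 Hz give about 9.655 s of audio, which at a frame shift of 1/75 s (0.0133... s) corresponds to the 724 frames stored:

num_samples, sampling_rate, frame_shift = 212893, 22050, 1 / 75
duration = num_samples / sampling_rate  # ~9.655 s
print(round(duration / frame_shift))    # 724 frames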
strategy = PrecomputedFeatures()
feats, feats_len = strategy(cuts)
# print([(f"feat: {feat.shape}", f"len: {feat_len}") for feat, feat_len in zip(feats, feats_len)])
print([feat.shape for feat in feats])
print([int(feat_len) for feat_len in feats_len])
print(feats.shape, feats_len.shape)
# TODO: debug OnTheFlyFeature case
# strategy = OnTheFlyFeatures(extractor=encodec_extractor)
# feats, feats_len = strategy(cuts)
# print(feats, feats_len)
[torch.Size([725, 8]), torch.Size([725, 8]), torch.Size([725, 8])]
[724, 142, 725]
torch.Size([3, 725, 8]) torch.Size([3])
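Note how the batch is assembled: every feature matrix is padded to the longest cut in the batch (725 frames) while the true lengths are returned separately. Conceptually this is the same as torch.nn.utils.rnn.pad_sequence over the per-cut matrices (random tensors stand in for real features here):

import torch

seqs = [torch.randn(724, 8), torch.randn(142, 8), torch.randn(725, 8)]
feats = torch.nn.utils.rnn.pad_sequence(seqs, batch_first=True)  # (3, 725, 8)
feats_len = torch.tensor([s.shape[0] for s in seqs])             # [724, 142, 725]
print(feats.shape, feats_len)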
Text normalization, tokenization and numericalization
cleaner = TTSTextNormalizer()
tokenizer = Phonemizer()
cleaner("tutu. this is ture!")
'tutu. this is ture!'
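TTSTextNormalizer and Phonemizer are repo-specific classes; the outputs below show the cleaner lowercasing the text and the tokenizer emitting IPA, which looks like the phonemizer package with an espeak backend. A rough stand-in under that assumption (not the repo's actual class):

from phonemizer import phonemize

class SimplePhonemizer:
    # Illustrative stand-in; the backend and language choices are assumptions.
    def __call__(self, text: str) -> str:
        return phonemize(text, language='en-us', backend='espeak', strip=True)

print(SimplePhonemizer()('in being comparatively modern.'))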
n_jobs = 1
unique_phonemes = set()
with CutSet.open_writer('../recipes/tts/ljspeech/data/first_3.final.jsonl.gz', overwrite=True) as writer:
    for cut in cuts:
        text = cut.supervisions[0].text
        print(text)
        normalized = cleaner(text)
        print(normalized)
        phonemes = tokenizer(text)
        print(phonemes)
        cut.custom = {'normalized': normalized, 'phonemes': phonemes}
        writer.write(cut, flush=True)
        unique_phonemes.update(list(phonemes))
Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition
printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the exhibition
pɹɪntɪŋ, ɪnðɪ oʊnli sɛns wɪð wɪtʃ wiː ɑːɹ æt pɹɛzənt kənsɜːnd, dɪfɚz fɹʌm moʊst ɪf nɑːt fɹʌm ɔːl ðɪ ɑːɹts ænd kɹæfts ɹɛpɹɪzɛntᵻd ɪnðɪ ɛksɪbɪʃən
in being comparatively modern.
in being comparatively modern.
ɪn biːɪŋ kəmpæɹətɪvli mɑːdɚn.
For although the Chinese took impressions from wood blocks engraved in relief for centuries before the woodcutters of the Netherlands, by a similar process
for although the chinese took impressions from wood blocks engraved in relief for centuries before the woodcutters of the netherlands, by a similar process
fɔːɹ ɑːlðoʊ ðə tʃaɪniːz tʊk ɪmpɹɛʃənz fɹʌm wʊd blɑːks ɛnɡɹeɪvd ɪn ɹɪliːf fɔːɹ sɛntʃɚɹɪz bɪfoːɹ ðə wʊdkʌɾɚz ʌvðə nɛðɚləndz, baɪ ɐ sɪmɪlɚ pɹɑːsɛs
Export phoneme lexicon
cuts = CutSet.from_file("../data/en/LJSpeech-1.1/first_3.final.jsonl.gz")
print(cuts[0])
map = {}
unique_syms = set()
for cut in cuts:
    unique_syms.update(list(cut.custom['phonemes']))
for (i, v) in enumerate(sorted(list(unique_syms))):
    map[i] = v
map[len(map)] = "<eps>"
print(map, len(map))
json_map = json.dumps(map)
with open("../data/en/LJSpeech-1.1/map.json", "w") as f:
    f.write(json_map)
MonoCut(id='LJ001-0001-0', start=0, duration=9.65501133786848, channel=0, supervisions=[SupervisionSegment(id='LJ001-0001', recording_id='LJ001-0001', start=0.0, duration=9.65501133786848, channel=0, text='Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition', language='English', speaker=None, gender='female', custom=None, alignment=None)], features=Features(type='encodec', num_frames=724, num_features=8, frame_shift=0.013333333333333334, sampling_rate=22050, start=0, duration=9.65501134, storage_type='lilcom_chunky', storage_path='../data/en/LJSpeech-1.1/encodec.lca', storage_key='0,8029,3610', recording_id='None', channels=0), recording=Recording(id='LJ001-0001', sources=[AudioSource(type='file', channels=[0], source='/data/en/LJSpeech/LJSpeech-1.1/wavs/LJ001-0001.wav')], sampling_rate=22050, num_samples=212893, duration=9.65501133786848, channel_ids=[0], transforms=None), custom={'normalized': 'printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the exhibition', 'phonemes': 'pɹɪntɪŋ, ɪnðɪ oʊnli sɛns wɪð wɪtʃ wiː ɑːɹ æt pɹɛzənt kənsɜːnd, dɪfɚz fɹʌm moʊst ɪf nɑːt fɹʌm ɔːl ðɪ ɑːɹts ænd kɹæfts ɹɛpɹᵻzɛntᵻd ɪnðɪ ɛksɪbɪʃən'})
{0: ' ', 1: ',', 2: '.', 3: 'a', 4: 'b', 5: 'd', 6: 'e', 7: 'f', 8: 'i', 9: 'k', 10: 'l', 11: 'm', 12: 'n', 13: 'o', 14: 'p', 15: 's', 16: 't', 17: 'v', 18: 'w', 19: 'z', 20: 'æ', 21: 'ð', 22: 'ŋ', 23: 'ɐ', 24: 'ɑ', 25: 'ɔ', 26: 'ə', 27: 'ɚ', 28: 'ɛ', 29: 'ɜ', 30: 'ɡ', 31: 'ɪ', 32: 'ɹ', 33: 'ɾ', 34: 'ʃ', 35: 'ʊ', 36: 'ʌ', 37: 'ː', 38: 'ᵻ', 39: '<eps>'} 40
with open('../data/en/LJSpeech-1.1/map.json', 'r') as f:
    data = json.load(f)
print(data)
{'0': ' ', '1': ',', '2': '.', '3': 'a', '4': 'b', '5': 'd', '6': 'e', '7': 'f', '8': 'i', '9': 'k', '10': 'l', '11': 'm', '12': 'n', '13': 'o', '14': 'p', '15': 's', '16': 't', '17': 'v', '18': 'w', '19': 'z', '20': 'æ', '21': 'ð', '22': 'ŋ', '23': 'ɐ', '24': 'ɑ', '25': 'ɔ', '26': 'ə', '27': 'ɚ', '28': 'ɛ', '29': 'ɜ', '30': 'ɡ', '31': 'ɪ', '32': 'ɹ', '33': 'ɾ', '34': 'ʃ', '35': 'ʊ', '36': 'ʌ', '37': 'ː', '38': 'ᵻ', '39': '<eps>'}
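Note the round-trip asymmetry: JSON object keys are always strings, so the integer keys written by json.dumps come back as '0', '1', ...; convert them back if integer lookups are needed:

id2sym = {int(k): v for k, v in data.items()}
sym2id = {v: int(k) for k, v in data.items()}
print(id2sym[39])  # '<eps>'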
Collate
cuts[0]
MonoCut(id='LJ001-0001-0', start=0, duration=9.65501133786848, channel=0, supervisions=[SupervisionSegment(id='LJ001-0001', recording_id='LJ001-0001', start=0.0, duration=9.65501133786848, channel=0, text='Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition', language='English', speaker=None, gender='female', custom=None, alignment=None)], features=Features(type='encodec', num_frames=724, num_features=8, frame_shift=0.013333333333333334, sampling_rate=22050, start=0, duration=9.65501134, storage_type='lilcom_chunky', storage_path='../data/en/LJSpeech-1.1/encodec.lca', storage_key='0,8029,3610', recording_id='None', channels=0), recording=Recording(id='LJ001-0001', sources=[AudioSource(type='file', channels=[0], source='/data/en/LJSpeech/LJSpeech-1.1/wavs/LJ001-0001.wav')], sampling_rate=22050, num_samples=212893, duration=9.65501133786848, channel_ids=[0], transforms=None), custom={'normalized': 'printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the exhibition', 'phonemes': 'pɹɪntɪŋ, ɪnðɪ oʊnli sɛns wɪð wɪtʃ wiː ɑːɹ æt pɹɛzənt kənsɜːnd, dɪfɚz fɹʌm moʊst ɪf nɑːt fɹʌm ɔːl ðɪ ɑːɹts ænd kɹæfts ɹɛpɹᵻzɛntᵻd ɪnðɪ ɛksɪbɪʃən'})
pc = PhonemeCollater(cuts)
tokens, tokens_len = pc(cuts)
print(tokens, tokens_len)
print(pc.inverse(tokens, tokens_len))
tensor([[ 2, 18, 4, 36, 4, 35, 4, 16, 4, 20, 4, 35, 4, 26, 4, 5, 4, 4,
4, 35, 4, 16, 4, 25, 4, 35, 4, 4, 4, 17, 4, 39, 4, 16, 4, 14,
4, 12, 4, 4, 4, 19, 4, 32, 4, 16, 4, 19, 4, 4, 4, 22, 4, 35,
4, 25, 4, 4, 4, 22, 4, 35, 4, 20, 4, 38, 4, 4, 4, 22, 4, 12,
4, 41, 4, 4, 4, 28, 4, 41, 4, 36, 4, 4, 4, 24, 4, 20, 4, 4,
4, 18, 4, 36, 4, 32, 4, 23, 4, 30, 4, 16, 4, 20, 4, 4, 4, 13,
4, 30, 4, 16, 4, 19, 4, 33, 4, 41, 4, 16, 4, 9, 4, 5, 4, 4,
4, 9, 4, 35, 4, 11, 4, 31, 4, 23, 4, 4, 4, 11, 4, 36, 4, 40,
4, 15, 4, 4, 4, 15, 4, 17, 4, 39, 4, 19, 4, 20, 4, 4, 4, 35,
4, 11, 4, 4, 4, 16, 4, 28, 4, 41, 4, 20, 4, 4, 4, 11, 4, 36,
4, 40, 4, 15, 4, 4, 4, 29, 4, 41, 4, 14, 4, 4, 4, 25, 4, 35,
4, 4, 4, 28, 4, 41, 4, 36, 4, 20, 4, 19, 4, 4, 4, 24, 4, 16,
4, 9, 4, 4, 4, 13, 4, 36, 4, 24, 4, 11, 4, 20, 4, 19, 4, 4,
4, 36, 4, 32, 4, 18, 4, 36, 4, 42, 4, 23, 4, 32, 4, 16, 4, 20,
4, 42, 4, 9, 4, 4, 4, 35, 4, 16, 4, 25, 4, 35, 4, 4, 4, 32,
4, 13, 4, 19, 4, 35, 4, 8, 4, 35, 4, 38, 4, 30, 4, 16, 3, 0,
0],
[ 2, 35, 4, 16, 4, 4, 4, 8, 4, 12, 4, 41, 4, 35, 4, 26, 4, 4,
4, 13, 4, 30, 4, 15, 4, 18, 4, 24, 4, 36, 4, 30, 4, 20, 4, 35,
4, 21, 4, 14, 4, 12, 4, 4, 4, 15, 4, 28, 4, 41, 4, 9, 4, 31,
4, 16, 4, 6, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0],
[ 2, 11, 4, 29, 4, 41, 4, 36, 4, 4, 4, 29, 4, 41, 4, 14, 4, 25,
4, 17, 4, 39, 4, 4, 4, 25, 4, 30, 4, 4, 4, 20, 4, 38, 4, 7,
4, 35, 4, 16, 4, 12, 4, 41, 4, 23, 4, 4, 4, 20, 4, 39, 4, 13,
4, 4, 4, 35, 4, 15, 4, 18, 4, 36, 4, 32, 4, 38, 4, 30, 4, 16,
4, 23, 4, 4, 4, 11, 4, 36, 4, 40, 4, 15, 4, 4, 4, 22, 4, 39,
4, 9, 4, 4, 4, 8, 4, 14, 4, 28, 4, 41, 4, 13, 4, 19, 4, 4,
4, 32, 4, 26, 4, 34, 4, 36, 4, 10, 4, 35, 4, 21, 4, 9, 4, 4,
4, 35, 4, 16, 4, 4, 4, 36, 4, 42, 4, 14, 4, 12, 4, 41, 4, 11,
4, 4, 4, 11, 4, 29, 4, 41, 4, 36, 4, 4, 4, 19, 4, 32, 4, 16,
4, 20, 4, 38, 4, 31, 4, 36, 4, 12, 4, 23, 4, 4, 4, 8, 4, 42,
4, 11, 4, 17, 4, 41, 4, 36, 4, 4, 4, 25, 4, 30, 4, 4, 4, 22,
4, 39, 4, 9, 4, 13, 4, 40, 4, 37, 4, 31, 4, 23, 4, 4, 4, 40,
4, 21, 4, 25, 4, 30, 4, 4, 4, 16, 4, 32, 4, 25, 4, 33, 4, 41,
4, 14, 4, 30, 4, 16, 4, 9, 4, 23, 4, 5, 4, 4, 4, 8, 4, 7,
4, 35, 4, 4, 4, 27, 4, 4, 4, 19, 4, 35, 4, 15, 4, 35, 4, 14,
4, 31, 4, 4, 4, 18, 4, 36, 4, 28, 4, 41, 4, 19, 4, 32, 4, 19,
3]]) tensor([287, 59, 289], dtype=torch.int32)
['p ɹ ɪ n t ɪ ŋ , ɪ n ð ɪ o ʊ n l i s ɛ n s w ɪ ð w ɪ t ʃ w i ː ɑ ː ɹ æ t p ɹ ɛ z ə n t k ə n s ɜ ː n d , d ɪ f ɚ z f ɹ ʌ m m o ʊ s t ɪ f n ɑ ː t f ɹ ʌ m ɔ ː l ð ɪ ɑ ː ɹ t s æ n d k ɹ æ f t s ɹ ɛ p ɹ ᵻ z ɛ n t ᵻ d ɪ n ð ɪ ɛ k s ɪ b ɪ ʃ ə n', 'ɪ n b i ː ɪ ŋ k ə m p æ ɹ ə t ɪ v l i m ɑ ː d ɚ n .', 'f ɔ ː ɹ ɔ ː l ð o ʊ ð ə t ʃ a ɪ n i ː z t ʊ k ɪ m p ɹ ɛ ʃ ə n z f ɹ ʌ m w ʊ d b l ɑ ː k s ɛ ŋ ɡ ɹ e ɪ v d ɪ n ɹ ᵻ l i ː f f ɔ ː ɹ s ɛ n t ʃ ɚ ɹ i z b ᵻ f o ː ɹ ð ə w ʊ d k ʌ ɾ ɚ z ʌ v ð ə n ɛ ð ɜ ː l ə n d z , b a ɪ ɐ s ɪ m ɪ l ɚ p ɹ ɑ ː s ɛ s']
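PhonemeCollater is repo-specific, but the tensors above suggest the usual collation recipe: map each symbol to an integer id, wrap the sequence in BOS/EOS markers, and zero-pad to the longest sequence in the batch. A minimal sketch of that pattern (the id assignments are arbitrary, not the repo's vocabulary):

import torch
from torch.nn.utils.rnn import pad_sequence

class SimpleCollater:
    PAD, BOS, EOS = 0, 1, 2

    def __init__(self, symbols):
        # Reserve 0-2 for pad/bos/eos, then number the remaining symbols.
        self.sym2id = {s: i + 3 for i, s in enumerate(sorted(symbols))}
        self.id2sym = {i: s for s, i in self.sym2id.items()}

    def __call__(self, texts):
        seqs = [
            torch.tensor([self.BOS] + [self.sym2id[c] for c in t] + [self.EOS])
            for t in texts
        ]
        lens = torch.tensor([len(s) for s in seqs], dtype=torch.int32)
        return pad_sequence(seqs, batch_first=True, padding_value=self.PAD), lens

    def inverse(self, tokens, lens):
        # Strip BOS/EOS and padding, then map ids back to symbols.
        return [
            ' '.join(self.id2sym[int(t)] for t in seq[1 : int(n) - 1])
            for seq, n in zip(tokens, lens)
        ]

collater = SimpleCollater(set('ɪn biːɪŋ mɑːdɚn.'))
tokens, lens = collater(['ɪn biːɪŋ', 'mɑːdɚn.'])
print(collater.inverse(tokens, lens))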
class ValleDataset(Dataset):
    def __init__(
        self,
        cuts: CutSet,
        strategy: BatchIO = PrecomputedFeatures(),
    ):
        self.extractor = strategy
        self.tokenizer = PhonemeCollater(cuts)

    def __getitem__(self, cuts: CutSet) -> Dict[str, torch.Tensor]:
        # __getitem__ operates on a full CutSet, not on a single cut as is usual for PyTorch datasets
        cuts = cuts.sort_by_duration()
        feats, feat_lens = self.extractor(cuts)
        tokens, token_lens = self.tokenizer(cuts)
        return {"feats_pad": feats, "feats_lens": feat_lens, "tokens_pad": tokens, "tokens_lens": token_lens}
ds = ValleDataset(cuts)
# Dataset performs batching by itself, so we have to indicate that to the DataLoader with batch_size=None
# train_sampler = BucketingSampler(cuts, max_duration=300, shuffle=True, bucket_method="equal_duration")
train_sampler = DynamicBucketingSampler(cuts, max_duration=300, shuffle=True, num_buckets=2)
dl = DataLoader(ds, sampler=train_sampler, batch_size=None, num_workers=0)
print(next(iter(dl)))
{'feats_pad': tensor([[[ 160.0000, 909.0000, 956.0117, ..., 594.9853, 432.9870,
962.9949],
[ 438.0000, 876.0039, 486.0096, ..., 602.0046, 997.9940,
262.0071],
[ 935.0078, 927.9921, 956.0148, ..., 371.9996, 338.9874,
228.0006],
...,
[ 475.0099, 856.9933, 653.0055, ..., 95.9989, 853.0098,
467.0154],
[ 105.9963, 544.0138, 785.9864, ..., 938.9966, 627.9919,
899.0155],
[ 474.9892, 913.0139, 981.9944, ..., 40.9858, 771.9880,
1012.0151]]]), 'feats_lens': tensor([725], dtype=torch.int32), 'tokens_pad': tensor([[ 2, 11, 4, 29, 4, 41, 4, 36, 4, 4, 4, 29, 4, 41, 4, 14, 4, 25,
4, 17, 4, 39, 4, 4, 4, 25, 4, 30, 4, 4, 4, 20, 4, 38, 4, 7,
4, 35, 4, 16, 4, 12, 4, 41, 4, 23, 4, 4, 4, 20, 4, 39, 4, 13,
4, 4, 4, 35, 4, 15, 4, 18, 4, 36, 4, 32, 4, 38, 4, 30, 4, 16,
4, 23, 4, 4, 4, 11, 4, 36, 4, 40, 4, 15, 4, 4, 4, 22, 4, 39,
4, 9, 4, 4, 4, 8, 4, 14, 4, 28, 4, 41, 4, 13, 4, 19, 4, 4,
4, 32, 4, 26, 4, 34, 4, 36, 4, 10, 4, 35, 4, 21, 4, 9, 4, 4,
4, 35, 4, 16, 4, 4, 4, 36, 4, 42, 4, 14, 4, 12, 4, 41, 4, 11,
4, 4, 4, 11, 4, 29, 4, 41, 4, 36, 4, 4, 4, 19, 4, 32, 4, 16,
4, 20, 4, 38, 4, 31, 4, 36, 4, 12, 4, 23, 4, 4, 4, 8, 4, 42,
4, 11, 4, 17, 4, 41, 4, 36, 4, 4, 4, 25, 4, 30, 4, 4, 4, 22,
4, 39, 4, 9, 4, 13, 4, 40, 4, 37, 4, 31, 4, 23, 4, 4, 4, 40,
4, 21, 4, 25, 4, 30, 4, 4, 4, 16, 4, 32, 4, 25, 4, 33, 4, 41,
4, 14, 4, 30, 4, 16, 4, 9, 4, 23, 4, 5, 4, 4, 4, 8, 4, 7,
4, 35, 4, 4, 4, 27, 4, 4, 4, 19, 4, 35, 4, 15, 4, 35, 4, 14,
4, 31, 4, 4, 4, 18, 4, 36, 4, 28, 4, 41, 4, 19, 4, 32, 4, 19,
3]]), 'tokens_lens': tensor([289], dtype=torch.int32)}
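From here a training step just consumes these padded tensors; a minimal consumer loop (the model call is a placeholder, not part of this recipe):

for batch in dl:
    feats, feats_lens = batch['feats_pad'], batch['feats_lens']
    tokens, tokens_lens = batch['tokens_pad'], batch['tokens_lens']
    # loss = model(tokens, tokens_lens, feats, feats_lens)  # hypothetical model
    print(feats.shape, tokens.shape)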