TTS Lhotse

Lhotse support for datasets allows us to leverage the preliminary data prep from Lhotse recipes.

Usage

Download data and load into Lhotse cuts

# download_ljspeech('~/Data/en/')
# skip this step, already done
ljspeech = prepare_ljspeech('../data/en/LJSpeech-1.1', '../recipes/tts/ljspeech/data')
cut_set = CutSet.from_manifests(**ljspeech)
subset = cut_set.subset(first=3)
subset.to_file('../recipes/tts/ljspeech/data/first_3.jsonl.gz')
reload_subset = CutSet.from_file('../recipes/tts/ljspeech/data/first_3.jsonl.gz')
print(subset[1])
print(reload_subset[1])
print(len(subset))
MonoCut(id='LJ001-0002-1', start=0, duration=1.899546485260771, channel=0, supervisions=[SupervisionSegment(id='LJ001-0002', recording_id='LJ001-0002', start=0.0, duration=1.899546485260771, channel=0, text='in being comparatively modern.', language='English', speaker=None, gender='female', custom={'normalized_text': 'in being comparatively modern.'}, alignment=None)], features=None, recording=Recording(id='LJ001-0002', sources=[AudioSource(type='file', channels=[0], source='../data/en/LJSpeech-1.1/wavs/LJ001-0002.wav')], sampling_rate=22050, num_samples=41885, duration=1.899546485260771, channel_ids=[0], transforms=None), custom=None)
MonoCut(id='LJ001-0002-1', start=0, duration=1.899546485260771, channel=0, supervisions=[SupervisionSegment(id='LJ001-0002', recording_id='LJ001-0002', start=0.0, duration=1.899546485260771, channel=0, text='in being comparatively modern.', language='English', speaker=None, gender='female', custom={'normalized_text': 'in being comparatively modern.'}, alignment=None)], features=None, recording=Recording(id='LJ001-0002', sources=[AudioSource(type='file', channels=[0], source='../data/en/LJSpeech-1.1/wavs/LJ001-0002.wav')], sampling_rate=22050, num_samples=41885, duration=1.899546485260771, channel_ids=[0], transforms=None), custom=None)
3
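For reference, the code in this walkthrough assumes roughly the following imports. The Lhotse imports are standard; EncoDecExtractor, TTSTextNormalizer, Phonemizer and PhonemeCollater are project-local classes whose module paths are not shown in this page, so the commented lines are assumptions:

import json
from typing import Dict

import torch
from torch.utils.data import DataLoader, Dataset

from lhotse import CutSet
from lhotse.dataset import DynamicBucketingSampler
from lhotse.dataset.input_strategies import BatchIO, OnTheFlyFeatures, PrecomputedFeatures
from lhotse.recipes import download_ljspeech, prepare_ljspeech

# Project-local classes (module paths assumed):
# from recipes.tts.ljspeech import EncoDecExtractor, TTSTextNormalizer, Phonemizer, PhonemeCollater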
Encodec feature extractor
encodec_extractor = EncoDecExtractor()
# torch.set_num_threads(1)
# torch.set_num_interop_threads(1)
# TODO: fix bug for n_jobs > 1
cuts = subset.compute_and_store_features(
    extractor=encodec_extractor,
    storage_path="../recipes/tts/ljspeech/data/encodec",
    num_jobs=1,
    # storage_type=NumpyHdf5Writer,
)
Extracting and storing features: 100%|██████████| 3/3 [00:00<00:00, 4.57it/s]
print(cuts[0])
MonoCut(id='LJ001-0001-0', start=0, duration=9.65501133786848, channel=0, supervisions=[SupervisionSegment(id='LJ001-0001', recording_id='LJ001-0001', start=0.0, duration=9.65501133786848, channel=0, text='Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition', language='English', speaker=None, gender='female', custom={'normalized_text': 'Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition'}, alignment=None)], features=Features(type='encodec', num_frames=724, num_features=8, frame_shift=0.013333333333333334, sampling_rate=22050, start=0, duration=9.65501134, storage_type='lilcom_chunky', storage_path='../recipes/tts/ljspeech/data/encodec.lca', storage_key='0,8029,3610', recording_id='None', channels=0), recording=Recording(id='LJ001-0001', sources=[AudioSource(type='file', channels=[0], source='../data/en/LJSpeech-1.1/wavs/LJ001-0001.wav')], sampling_rate=22050, num_samples=212893, duration=9.65501133786848, channel_ids=[0], transforms=None), custom=None)
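EncoDecExtractor is not part of Lhotse itself. Custom extractors generally subclass lhotse.features.base.FeatureExtractor and implement extract, frame_shift and feature_dim; the metadata above (type='encodec', num_features=8, frame_shift≈1/75 s) is consistent with that pattern. A minimal sketch of the shape of such a class, with a placeholder where the real EnCodec encoder call would go (this is not the repo's actual implementation):

from dataclasses import dataclass

import numpy as np

from lhotse.features.base import FeatureExtractor, register_extractor
from lhotse.utils import Seconds, compute_num_frames


@dataclass
class EncodecLikeConfig:
    frame_shift: Seconds = 1 / 75  # matches the 0.0133... s frame shift above
    num_codebooks: int = 8         # matches num_features=8 above


@register_extractor
class EncodecLikeExtractor(FeatureExtractor):
    """Sketch only: a real implementation would run the EnCodec encoder in extract()."""
    name = "encodec_like"
    config_type = EncodecLikeConfig

    @property
    def frame_shift(self) -> Seconds:
        return self.config.frame_shift

    def feature_dim(self, sampling_rate: int) -> int:
        return self.config.num_codebooks

    def extract(self, samples: np.ndarray, sampling_rate: int) -> np.ndarray:
        num_frames = compute_num_frames(
            duration=samples.shape[-1] / sampling_rate,
            frame_shift=self.frame_shift,
            sampling_rate=sampling_rate,
        )
        # Placeholder: zeros instead of the real (num_frames, num_codebooks) codec codes.
        return np.zeros((num_frames, self.config.num_codebooks), dtype=np.float32)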
"../recipes/tts/ljspeech/data/first_3.encodec.jsonl.gz")
cuts.to_file(0]
cuts[= CutSet.from_file("../recipes/tts/ljspeech/data/first_3.encodec.jsonl.gz")
reload_cuts 0] reload_cuts[
MonoCut(id='LJ001-0001-0', start=0, duration=9.65501133786848, channel=0, supervisions=[SupervisionSegment(id='LJ001-0001', recording_id='LJ001-0001', start=0.0, duration=9.65501133786848, channel=0, text='Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition', language='English', speaker=None, gender='female', custom={'normalized_text': 'Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition'}, alignment=None)], features=Features(type='encodec', num_frames=724, num_features=8, frame_shift=0.013333333333333334, sampling_rate=22050, start=0, duration=9.65501134, storage_type='lilcom_chunky', storage_path='../recipes/tts/ljspeech/data/encodec.lca', storage_key='0,8029,3610', recording_id='None', channels=0), recording=Recording(id='LJ001-0001', sources=[AudioSource(type='file', channels=[0], source='../data/en/LJSpeech-1.1/wavs/LJ001-0001.wav')], sampling_rate=22050, num_samples=212893, duration=9.65501133786848, channel_ids=[0], transforms=None), custom=None)
# cuts[0].recording
!soxi '../data/en/LJSpeech-1.1/wavs/LJ001-0001.wav'
Input File : '../data/en/LJSpeech-1.1/wavs/LJ001-0001.wav'
Channels : 1
Sample Rate : 22050
Precision : 16-bit
Duration : 00:00:09.66 = 212893 samples ~ 724.126 CDDA sectors
File Size : 426k
Bit Rate : 353k
Sample Encoding: 16-bit Signed Integer PCM
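The soxi output lines up with the Features metadata above: 212893 samples at 22050 Hz give about 9.655 s of audio, which at a frame shift of 1/75 s (0.0133... s) corresponds to the 724 frames stored:

num_samples, sampling_rate, frame_shift = 212893, 22050, 1 / 75
duration = num_samples / sampling_rate  # ~9.655 s
print(round(duration / frame_shift))    # 724 frames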
strategy = PrecomputedFeatures()
feats, feats_len = strategy(cuts)
# print([(f"feat: {feat.shape}", f"len: {feat_len}") for feat, feat_len in zip(feats, feats_len)])
print([feat.shape for feat in feats])
print([int(feat_len) for feat_len in feats_len])
print(feats.shape, feats_len.shape)
# TODO: debug OnTheFlyFeature case
# strategy = OnTheFlyFeatures(extractor=encodec_extractor)
# feats, feats_len = strategy(cuts)
# print(feats, feats_len)
[torch.Size([725, 8]), torch.Size([725, 8]), torch.Size([725, 8])]
[724, 142, 725]
torch.Size([3, 725, 8]) torch.Size([3])
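Note how the batch is assembled: every feature matrix is padded to the longest cut in the batch (725 frames) while the true lengths are returned separately. Conceptually this is the same as torch.nn.utils.rnn.pad_sequence over the per-cut matrices (random tensors stand in for real features here):

import torch

seqs = [torch.randn(724, 8), torch.randn(142, 8), torch.randn(725, 8)]
feats = torch.nn.utils.rnn.pad_sequence(seqs, batch_first=True)  # (3, 725, 8)
feats_len = torch.tensor([s.shape[0] for s in seqs])             # [724, 142, 725]
print(feats.shape, feats_len)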
Text normalization, tokenization and numericalization
cleaner = TTSTextNormalizer()
tokenizer = Phonemizer()
cleaner("tutu. this is ture!")
'tutu. this is ture!'
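TTSTextNormalizer and Phonemizer are repo-specific classes; the outputs below show the cleaner lowercasing the text and the tokenizer emitting IPA, which looks like the phonemizer package with an espeak backend. A rough stand-in under that assumption (not the repo's actual class):

from phonemizer import phonemize

class SimplePhonemizer:
    # Illustrative stand-in; the backend and language choices are assumptions.
    def __call__(self, text: str) -> str:
        return phonemize(text, language='en-us', backend='espeak', strip=True)

print(SimplePhonemizer()('in being comparatively modern.'))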
n_jobs = 1
unique_phonemes = set()
with CutSet.open_writer('../recipes/tts/ljspeech/data/first_3.final.jsonl.gz', overwrite=True) as writer:
    for cut in cuts:
        text = cut.supervisions[0].text
        print(text)
        normalized = cleaner(text)
        print(normalized)
        phonemes = tokenizer(text)
        print(phonemes)
        cut.custom = {'normalized': normalized, 'phonemes': phonemes}
        writer.write(cut, flush=True)
        unique_phonemes.update(list(phonemes))
Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition
printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the exhibition
pɹɪntɪŋ, ɪnðɪ oʊnli sɛns wɪð wɪtʃ wiː ɑːɹ æt pɹɛzənt kənsɜːnd, dɪfɚz fɹʌm moʊst ɪf nɑːt fɹʌm ɔːl ðɪ ɑːɹts ænd kɹæfts ɹɛpɹɪzɛntᵻd ɪnðɪ ɛksɪbɪʃən
in being comparatively modern.
in being comparatively modern.
ɪn biːɪŋ kəmpæɹətɪvli mɑːdɚn.
For although the Chinese took impressions from wood blocks engraved in relief for centuries before the woodcutters of the Netherlands, by a similar process
for although the chinese took impressions from wood blocks engraved in relief for centuries before the woodcutters of the netherlands, by a similar process
fɔːɹ ɑːlðoʊ ðə tʃaɪniːz tʊk ɪmpɹɛʃənz fɹʌm wʊd blɑːks ɛnɡɹeɪvd ɪn ɹɪliːf fɔːɹ sɛntʃɚɹɪz bɪfoːɹ ðə wʊdkʌɾɚz ʌvðə nɛðɚləndz, baɪ ɐ sɪmɪlɚ pɹɑːsɛs
Export phoneme lexicon
cuts = CutSet.from_file("../data/en/LJSpeech-1.1/first_3.final.jsonl.gz")
print(cuts[0])
map = {}
unique_syms = set()
for cut in cuts:
    unique_syms.update(list(cut.custom['phonemes']))
for (i, v) in enumerate(sorted(list(unique_syms))):
    map[i] = v
map[len(map)] = "<eps>"
print(map, len(map))
json_map = json.dumps(map)
with open("../data/en/LJSpeech-1.1/map.json", "w") as f:
    f.write(json_map)
MonoCut(id='LJ001-0001-0', start=0, duration=9.65501133786848, channel=0, supervisions=[SupervisionSegment(id='LJ001-0001', recording_id='LJ001-0001', start=0.0, duration=9.65501133786848, channel=0, text='Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition', language='English', speaker=None, gender='female', custom=None, alignment=None)], features=Features(type='encodec', num_frames=724, num_features=8, frame_shift=0.013333333333333334, sampling_rate=22050, start=0, duration=9.65501134, storage_type='lilcom_chunky', storage_path='../data/en/LJSpeech-1.1/encodec.lca', storage_key='0,8029,3610', recording_id='None', channels=0), recording=Recording(id='LJ001-0001', sources=[AudioSource(type='file', channels=[0], source='/data/en/LJSpeech/LJSpeech-1.1/wavs/LJ001-0001.wav')], sampling_rate=22050, num_samples=212893, duration=9.65501133786848, channel_ids=[0], transforms=None), custom={'normalized': 'printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the exhibition', 'phonemes': 'pɹɪntɪŋ, ɪnðɪ oʊnli sɛns wɪð wɪtʃ wiː ɑːɹ æt pɹɛzənt kənsɜːnd, dɪfɚz fɹʌm moʊst ɪf nɑːt fɹʌm ɔːl ðɪ ɑːɹts ænd kɹæfts ɹɛpɹᵻzɛntᵻd ɪnðɪ ɛksɪbɪʃən'})
{0: ' ', 1: ',', 2: '.', 3: 'a', 4: 'b', 5: 'd', 6: 'e', 7: 'f', 8: 'i', 9: 'k', 10: 'l', 11: 'm', 12: 'n', 13: 'o', 14: 'p', 15: 's', 16: 't', 17: 'v', 18: 'w', 19: 'z', 20: 'æ', 21: 'ð', 22: 'ŋ', 23: 'ɐ', 24: 'ɑ', 25: 'ɔ', 26: 'ə', 27: 'ɚ', 28: 'ɛ', 29: 'ɜ', 30: 'ɡ', 31: 'ɪ', 32: 'ɹ', 33: 'ɾ', 34: 'ʃ', 35: 'ʊ', 36: 'ʌ', 37: 'ː', 38: 'ᵻ', 39: '<eps>'} 40
with open('../data/en/LJSpeech-1.1/map.json', 'r') as f:
    data = json.load(f)
print(data)
{'0': ' ', '1': ',', '2': '.', '3': 'a', '4': 'b', '5': 'd', '6': 'e', '7': 'f', '8': 'i', '9': 'k', '10': 'l', '11': 'm', '12': 'n', '13': 'o', '14': 'p', '15': 's', '16': 't', '17': 'v', '18': 'w', '19': 'z', '20': 'æ', '21': 'ð', '22': 'ŋ', '23': 'ɐ', '24': 'ɑ', '25': 'ɔ', '26': 'ə', '27': 'ɚ', '28': 'ɛ', '29': 'ɜ', '30': 'ɡ', '31': 'ɪ', '32': 'ɹ', '33': 'ɾ', '34': 'ʃ', '35': 'ʊ', '36': 'ʌ', '37': 'ː', '38': 'ᵻ', '39': '<eps>'}
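Note the round-trip asymmetry: JSON object keys are always strings, so the integer keys written by json.dumps come back as '0', '1', ...; convert them back if integer lookups are needed:

id2sym = {int(k): v for k, v in data.items()}
sym2id = {v: int(k) for k, v in data.items()}
print(id2sym[39])  # '<eps>'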
Collate
cuts[0]
MonoCut(id='LJ001-0001-0', start=0, duration=9.65501133786848, channel=0, supervisions=[SupervisionSegment(id='LJ001-0001', recording_id='LJ001-0001', start=0.0, duration=9.65501133786848, channel=0, text='Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition', language='English', speaker=None, gender='female', custom=None, alignment=None)], features=Features(type='encodec', num_frames=724, num_features=8, frame_shift=0.013333333333333334, sampling_rate=22050, start=0, duration=9.65501134, storage_type='lilcom_chunky', storage_path='../data/en/LJSpeech-1.1/encodec.lca', storage_key='0,8029,3610', recording_id='None', channels=0), recording=Recording(id='LJ001-0001', sources=[AudioSource(type='file', channels=[0], source='/data/en/LJSpeech/LJSpeech-1.1/wavs/LJ001-0001.wav')], sampling_rate=22050, num_samples=212893, duration=9.65501133786848, channel_ids=[0], transforms=None), custom={'normalized': 'printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the exhibition', 'phonemes': 'pɹɪntɪŋ, ɪnðɪ oʊnli sɛns wɪð wɪtʃ wiː ɑːɹ æt pɹɛzənt kənsɜːnd, dɪfɚz fɹʌm moʊst ɪf nɑːt fɹʌm ɔːl ðɪ ɑːɹts ænd kɹæfts ɹɛpɹᵻzɛntᵻd ɪnðɪ ɛksɪbɪʃən'})
pc = PhonemeCollater(cuts)
tokens, tokens_len = pc(cuts)
print(tokens, tokens_len)
print(pc.inverse(tokens, tokens_len))
tensor([[ 2, 18, 4, 36, 4, 35, 4, 16, 4, 20, 4, 35, 4, 26, 4, 5, 4, 4,
4, 35, 4, 16, 4, 25, 4, 35, 4, 4, 4, 17, 4, 39, 4, 16, 4, 14,
4, 12, 4, 4, 4, 19, 4, 32, 4, 16, 4, 19, 4, 4, 4, 22, 4, 35,
4, 25, 4, 4, 4, 22, 4, 35, 4, 20, 4, 38, 4, 4, 4, 22, 4, 12,
4, 41, 4, 4, 4, 28, 4, 41, 4, 36, 4, 4, 4, 24, 4, 20, 4, 4,
4, 18, 4, 36, 4, 32, 4, 23, 4, 30, 4, 16, 4, 20, 4, 4, 4, 13,
4, 30, 4, 16, 4, 19, 4, 33, 4, 41, 4, 16, 4, 9, 4, 5, 4, 4,
4, 9, 4, 35, 4, 11, 4, 31, 4, 23, 4, 4, 4, 11, 4, 36, 4, 40,
4, 15, 4, 4, 4, 15, 4, 17, 4, 39, 4, 19, 4, 20, 4, 4, 4, 35,
4, 11, 4, 4, 4, 16, 4, 28, 4, 41, 4, 20, 4, 4, 4, 11, 4, 36,
4, 40, 4, 15, 4, 4, 4, 29, 4, 41, 4, 14, 4, 4, 4, 25, 4, 35,
4, 4, 4, 28, 4, 41, 4, 36, 4, 20, 4, 19, 4, 4, 4, 24, 4, 16,
4, 9, 4, 4, 4, 13, 4, 36, 4, 24, 4, 11, 4, 20, 4, 19, 4, 4,
4, 36, 4, 32, 4, 18, 4, 36, 4, 42, 4, 23, 4, 32, 4, 16, 4, 20,
4, 42, 4, 9, 4, 4, 4, 35, 4, 16, 4, 25, 4, 35, 4, 4, 4, 32,
4, 13, 4, 19, 4, 35, 4, 8, 4, 35, 4, 38, 4, 30, 4, 16, 3, 0,
0],
[ 2, 35, 4, 16, 4, 4, 4, 8, 4, 12, 4, 41, 4, 35, 4, 26, 4, 4,
4, 13, 4, 30, 4, 15, 4, 18, 4, 24, 4, 36, 4, 30, 4, 20, 4, 35,
4, 21, 4, 14, 4, 12, 4, 4, 4, 15, 4, 28, 4, 41, 4, 9, 4, 31,
4, 16, 4, 6, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0],
[ 2, 11, 4, 29, 4, 41, 4, 36, 4, 4, 4, 29, 4, 41, 4, 14, 4, 25,
4, 17, 4, 39, 4, 4, 4, 25, 4, 30, 4, 4, 4, 20, 4, 38, 4, 7,
4, 35, 4, 16, 4, 12, 4, 41, 4, 23, 4, 4, 4, 20, 4, 39, 4, 13,
4, 4, 4, 35, 4, 15, 4, 18, 4, 36, 4, 32, 4, 38, 4, 30, 4, 16,
4, 23, 4, 4, 4, 11, 4, 36, 4, 40, 4, 15, 4, 4, 4, 22, 4, 39,
4, 9, 4, 4, 4, 8, 4, 14, 4, 28, 4, 41, 4, 13, 4, 19, 4, 4,
4, 32, 4, 26, 4, 34, 4, 36, 4, 10, 4, 35, 4, 21, 4, 9, 4, 4,
4, 35, 4, 16, 4, 4, 4, 36, 4, 42, 4, 14, 4, 12, 4, 41, 4, 11,
4, 4, 4, 11, 4, 29, 4, 41, 4, 36, 4, 4, 4, 19, 4, 32, 4, 16,
4, 20, 4, 38, 4, 31, 4, 36, 4, 12, 4, 23, 4, 4, 4, 8, 4, 42,
4, 11, 4, 17, 4, 41, 4, 36, 4, 4, 4, 25, 4, 30, 4, 4, 4, 22,
4, 39, 4, 9, 4, 13, 4, 40, 4, 37, 4, 31, 4, 23, 4, 4, 4, 40,
4, 21, 4, 25, 4, 30, 4, 4, 4, 16, 4, 32, 4, 25, 4, 33, 4, 41,
4, 14, 4, 30, 4, 16, 4, 9, 4, 23, 4, 5, 4, 4, 4, 8, 4, 7,
4, 35, 4, 4, 4, 27, 4, 4, 4, 19, 4, 35, 4, 15, 4, 35, 4, 14,
4, 31, 4, 4, 4, 18, 4, 36, 4, 28, 4, 41, 4, 19, 4, 32, 4, 19,
3]]) tensor([287, 59, 289], dtype=torch.int32)
['p ɹ ɪ n t ɪ ŋ , ɪ n ð ɪ o ʊ n l i s ɛ n s w ɪ ð w ɪ t ʃ w i ː ɑ ː ɹ æ t p ɹ ɛ z ə n t k ə n s ɜ ː n d , d ɪ f ɚ z f ɹ ʌ m m o ʊ s t ɪ f n ɑ ː t f ɹ ʌ m ɔ ː l ð ɪ ɑ ː ɹ t s æ n d k ɹ æ f t s ɹ ɛ p ɹ ᵻ z ɛ n t ᵻ d ɪ n ð ɪ ɛ k s ɪ b ɪ ʃ ə n', 'ɪ n b i ː ɪ ŋ k ə m p æ ɹ ə t ɪ v l i m ɑ ː d ɚ n .', 'f ɔ ː ɹ ɔ ː l ð o ʊ ð ə t ʃ a ɪ n i ː z t ʊ k ɪ m p ɹ ɛ ʃ ə n z f ɹ ʌ m w ʊ d b l ɑ ː k s ɛ ŋ ɡ ɹ e ɪ v d ɪ n ɹ ᵻ l i ː f f ɔ ː ɹ s ɛ n t ʃ ɚ ɹ i z b ᵻ f o ː ɹ ð ə w ʊ d k ʌ ɾ ɚ z ʌ v ð ə n ɛ ð ɜ ː l ə n d z , b a ɪ ɐ s ɪ m ɪ l ɚ p ɹ ɑ ː s ɛ s']
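PhonemeCollater is repo-specific, but the tensors above suggest the usual collation recipe: map each symbol to an integer id, wrap the sequence in BOS/EOS markers, and zero-pad to the longest sequence in the batch. A minimal sketch of that pattern (the id assignments are arbitrary, not the repo's vocabulary):

import torch
from torch.nn.utils.rnn import pad_sequence

class SimpleCollater:
    PAD, BOS, EOS = 0, 1, 2

    def __init__(self, symbols):
        # Reserve 0-2 for pad/bos/eos, then number the remaining symbols.
        self.sym2id = {s: i + 3 for i, s in enumerate(sorted(symbols))}
        self.id2sym = {i: s for s, i in self.sym2id.items()}

    def __call__(self, texts):
        seqs = [
            torch.tensor([self.BOS] + [self.sym2id[c] for c in t] + [self.EOS])
            for t in texts
        ]
        lens = torch.tensor([len(s) for s in seqs], dtype=torch.int32)
        return pad_sequence(seqs, batch_first=True, padding_value=self.PAD), lens

    def inverse(self, tokens, lens):
        # Strip BOS/EOS and padding, then map ids back to symbols.
        return [
            ' '.join(self.id2sym[int(t)] for t in seq[1 : int(n) - 1])
            for seq, n in zip(tokens, lens)
        ]

collater = SimpleCollater(set('ɪn biːɪŋ mɑːdɚn.'))
tokens, lens = collater(['ɪn biːɪŋ', 'mɑːdɚn.'])
print(collater.inverse(tokens, lens))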
class ValleDataset(Dataset):
    def __init__(
        self,
        cuts: CutSet,
        strategy: BatchIO = PrecomputedFeatures(),
    ):
        self.extractor = strategy
        self.tokenizer = PhonemeCollater(cuts)

    def __getitem__(self, cuts: CutSet) -> Dict[str, torch.Tensor]:
        # __getitem__ operates on a full CutSet, not on a single cut as is usual for PyTorch datasets
        cuts = cuts.sort_by_duration()
        feats, feat_lens = self.extractor(cuts)
        tokens, token_lens = self.tokenizer(cuts)
        return {"feats_pad": feats, "feats_lens": feat_lens, "tokens_pad": tokens, "tokens_lens": token_lens}
ds = ValleDataset(cuts)
# Dataset performs batching by itself, so we have to indicate that to the DataLoader with batch_size=None
# train_sampler = BucketingSampler(cuts, max_duration=300, shuffle=True, bucket_method="equal_duration")
train_sampler = DynamicBucketingSampler(cuts, max_duration=300, shuffle=True, num_buckets=2)
dl = DataLoader(ds, sampler=train_sampler, batch_size=None, num_workers=0)
print(next(iter(dl)))
{'feats_pad': tensor([[[ 160.0000, 909.0000, 956.0117, ..., 594.9853, 432.9870,
962.9949],
[ 438.0000, 876.0039, 486.0096, ..., 602.0046, 997.9940,
262.0071],
[ 935.0078, 927.9921, 956.0148, ..., 371.9996, 338.9874,
228.0006],
...,
[ 475.0099, 856.9933, 653.0055, ..., 95.9989, 853.0098,
467.0154],
[ 105.9963, 544.0138, 785.9864, ..., 938.9966, 627.9919,
899.0155],
[ 474.9892, 913.0139, 981.9944, ..., 40.9858, 771.9880,
1012.0151]]]), 'feats_lens': tensor([725], dtype=torch.int32), 'tokens_pad': tensor([[ 2, 11, 4, 29, 4, 41, 4, 36, 4, 4, 4, 29, 4, 41, 4, 14, 4, 25,
4, 17, 4, 39, 4, 4, 4, 25, 4, 30, 4, 4, 4, 20, 4, 38, 4, 7,
4, 35, 4, 16, 4, 12, 4, 41, 4, 23, 4, 4, 4, 20, 4, 39, 4, 13,
4, 4, 4, 35, 4, 15, 4, 18, 4, 36, 4, 32, 4, 38, 4, 30, 4, 16,
4, 23, 4, 4, 4, 11, 4, 36, 4, 40, 4, 15, 4, 4, 4, 22, 4, 39,
4, 9, 4, 4, 4, 8, 4, 14, 4, 28, 4, 41, 4, 13, 4, 19, 4, 4,
4, 32, 4, 26, 4, 34, 4, 36, 4, 10, 4, 35, 4, 21, 4, 9, 4, 4,
4, 35, 4, 16, 4, 4, 4, 36, 4, 42, 4, 14, 4, 12, 4, 41, 4, 11,
4, 4, 4, 11, 4, 29, 4, 41, 4, 36, 4, 4, 4, 19, 4, 32, 4, 16,
4, 20, 4, 38, 4, 31, 4, 36, 4, 12, 4, 23, 4, 4, 4, 8, 4, 42,
4, 11, 4, 17, 4, 41, 4, 36, 4, 4, 4, 25, 4, 30, 4, 4, 4, 22,
4, 39, 4, 9, 4, 13, 4, 40, 4, 37, 4, 31, 4, 23, 4, 4, 4, 40,
4, 21, 4, 25, 4, 30, 4, 4, 4, 16, 4, 32, 4, 25, 4, 33, 4, 41,
4, 14, 4, 30, 4, 16, 4, 9, 4, 23, 4, 5, 4, 4, 4, 8, 4, 7,
4, 35, 4, 4, 4, 27, 4, 4, 4, 19, 4, 35, 4, 15, 4, 35, 4, 14,
4, 31, 4, 4, 4, 18, 4, 36, 4, 28, 4, 41, 4, 19, 4, 32, 4, 19,
3]]), 'tokens_lens': tensor([289], dtype=torch.int32)}
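From here a training step just consumes these padded tensors; a minimal consumer loop (the model call is a placeholder, not part of this recipe):

for batch in dl:
    feats, feats_lens = batch['feats_pad'], batch['feats_lens']
    tokens, tokens_lens = batch['tokens_pad'], batch['tokens_lens']
    # loss = model(tokens, tokens_lens, feats, feats_lens)  # hypothetical model
    print(feats.shape, tokens.shape)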