Audio Embedders

TODO: figure out the encoder API in the Hugging Face library (see the Hugging Face Transformers section below)

EncoDec

# `encodec_model_24khz()` is a factory method on Meta's `encodec` package;
# the transformers EncodecModel has no such attribute and is loaded with
# from_pretrained instead (see the Hugging Face Transformers section below).
from encodec import EncodecModel

model = EncodecModel.encodec_model_24khz()
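
EnCodec quantizes a waveform into parallel streams of discrete codebook indices and can decode them back to audio. The Usage cell below relies on an `EncoDec` wrapper class that is not defined in this section; here is a minimal sketch of such a wrapper, assuming Meta's `encodec` package. `convert_audio` and the `(codes, scale)` frame format come from that package's documented API; the method bodies are otherwise guesses made for this sketch.

import torch
from encodec import EncodecModel
from encodec.utils import convert_audio

class EncoDec:
    """Thin wrapper around Meta's 24 kHz EnCodec neural codec."""

    def __init__(self, device: str = 'cpu'):
        self.model = EncodecModel.encodec_model_24khz()
        self._device = device
        self.model.to(self._device)

    @property
    def sample_rate(self) -> int:
        return self.model.sample_rate  # 24000 for this checkpoint

    def __call__(self, wav: torch.Tensor, sr: int) -> torch.Tensor:
        # Match the model's expected sample rate and channel count.
        wav = convert_audio(wav, sr, self.model.sample_rate, self.model.channels)
        wav = wav.unsqueeze(0).to(self._device)  # (batch, channels, time)
        with torch.no_grad():
            frames = self.model.encode(wav)
        # Each frame is a (codes, scale) pair; codes are (batch, n_q, time).
        return torch.cat([codes for codes, _ in frames], dim=-1)

    def decode(self, codes: torch.Tensor) -> torch.Tensor:
        # EnCodec decodes a list of (codes, scale) frames; scale is None
        # when the input was encoded as a single unnormalized chunk.
        with torch.no_grad():
            return self.model.decode([(codes.to(self._device), None)])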

Usage

wav, sr = torchaudio.load("../data/audio/obama.wav")
# wav, sr = torch.rand((1, 24000)), 24000
# wav, sr = np.random.random((1, 24000)), 24000

encodec = EncoDec(device='cpu')
codes = encodec(wav, sr)
print(f"wav: {wav.shape}, codes: {codes.shape}")

# Visualize the codes: one row per quantizer level, one column per frame.
plt.rcParams["figure.figsize"] = (5, 5)
plt.xlabel('frames')
plt.ylabel('quantization')
plt.imshow(codes.squeeze().cpu().numpy())

# Round-trip: decode the codes back to a waveform and plot it.
decoded = encodec.decode(codes)
plot_waveform(decoded.detach().cpu().squeeze(0), encodec.sample_rate)

# Inspect the first codebook stream (quantizer 0) of the first item.
plt.plot(codes[0][0])
print(codes[0][0].shape)

Lhotse-style Encodec feature extractor
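
`EncoDecExtractor` is instantiated below but not defined in this section. The sketch that follows shows one way to wrap the `EncoDec` class above as a Lhotse `FeatureExtractor`; the `name`, the config dataclass, and storing the integer codes as a float `(frames, codebooks)` array are choices made here, not taken from the notebook. The 75 Hz frame rate (hop of 320 samples at 24 kHz) and the 8 codebooks at the default 6 kbps bandwidth are properties of EnCodec itself.

from dataclasses import dataclass

import numpy as np
import torch
from lhotse.features.base import FeatureExtractor, register_extractor

@dataclass
class EncoDecConfig:
    device: str = 'cpu'

@register_extractor
class EncoDecExtractor(FeatureExtractor):
    name = 'encodec'
    config_type = EncoDecConfig

    def __init__(self, config=None):
        super().__init__(config=config)
        self.encodec = EncoDec(device=self.config.device)

    @property
    def frame_shift(self) -> float:
        # EnCodec 24 kHz produces 75 frames per second (hop of 320 samples).
        return 320 / 24000

    def feature_dim(self, sampling_rate: int) -> int:
        # One feature column per residual codebook (8 at 6 kbps).
        return 8

    def extract(self, samples: np.ndarray, sampling_rate: int) -> np.ndarray:
        wav = torch.as_tensor(samples, dtype=torch.float32).reshape(1, -1)
        codes = self.encodec(wav, sampling_rate)  # (1, n_q, frames)
        # Transpose to Lhotse's (frames, features) layout; the codes are
        # integer indices cast to float so they can be stored as features.
        return codes.squeeze(0).transpose(0, 1).cpu().numpy().astype(np.float32)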

encodec_extractor = EncoDecExtractor()
# cuts = CutSet.from_file("../recipes/tts/ljspeech/data/first_3.jsonl.gz")
cuts = CutSet.from_file("../data/en/LJSpeech-1.1/first_3.encodec.jsonl.gz")
print(cuts[0])
print(cuts[1])
# torch.set_num_threads(1)
# torch.set_num_interop_threads(1)
# feats = cuts.compute_and_store_features(extractor=Fbank(), storage_path="../recipes/tts/ljspeech/data/feats")
# storage_path = "../.data/en/LJSpeech-1.1"
# # storage_path = "../recipes/tts/ljspeech/data/feats"
# # TODO: make it work with num_jobs>1
# cuts = cuts.compute_and_store_features(
#     extractor=encodec_extractor,
#     storage_path=storage_path,
#     num_jobs=1,
# )
# cuts.to_file("../recipes/tts/ljspeech/data/cuts_encodec.jsonl.gz")
# print(cuts[0])
# cuts[0].plot_features()
# print(cuts)
files = "../data/en/LJSpeech-1.1/cuts_encodec.jsonl.gz"
# files = "../recipes/tts/ljspeech/data/cuts_encodec.jsonl.gz"
cuts = CutSet.from_file(files)
print(cuts)
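
Once a manifest carries EnCodec features, the stored codes can be loaded back per cut. `load_features()` is standard Lhotse API; the shape comment assumes the `(frames, codebooks)` layout from the extractor sketch above.

codes = cuts[0].load_features()
print(codes.shape)  # e.g. (frames, 8) under the layout above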
Hugging Face Transformers

# Dummy dataset; you can swap this for a dataset on the 🤗 Hub or bring your own.
librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")

# load the model + processor (for pre-processing the audio)
model = EncodecModel.from_pretrained("facebook/encodec_24khz")
processor = AutoProcessor.from_pretrained("facebook/encodec_24khz")
# Inspect the first example.
librispeech_dummy[0]
# cast the audio data to the correct sampling rate for the model
librispeech_dummy = librispeech_dummy.cast_column("audio", Audio(sampling_rate=processor.sampling_rate))
audio_sample = librispeech_dummy[0]["audio"]["array"]
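
Encoding and decoding with the transformers API then follows the pattern from the transformers EnCodec documentation; `audio_codes` and `audio_scales` are the documented output fields of `model.encode`.

inputs = processor(raw_audio=audio_sample, sampling_rate=processor.sampling_rate, return_tensors="pt")

# Encode to discrete codebook indices, then decode back to a waveform.
encoder_outputs = model.encode(inputs["input_values"], inputs["padding_mask"])
print(encoder_outputs.audio_codes.shape)  # (chunks, batch, codebooks, frames)
audio_values = model.decode(
    encoder_outputs.audio_codes,
    encoder_outputs.audio_scales,
    inputs["padding_mask"],
)[0]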

AudioLM

# TODO