Fix CLAP embedding extraction for transformers 5.x

In transformers 5.x, ClapModel.get_audio_features() returns a
BaseModelOutputWithPooling instead of a raw tensor. The 512-dim
embedding lives in .pooler_output[0] rather than being obtained by
indexing the return value directly. Added a backward-compatible
extraction path guarded by a hasattr check.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-22 10:55:38 -06:00
parent e9cf1e9b17
commit 6a2be93556

View File

def embed_audio(audio: np.ndarray, sample_rate: int = 48000) -> np.ndarray:
    """Return a unit-normalized CLAP audio embedding for *audio*.

    Args:
        audio: Raw waveform samples (presumably a 1-D mono signal —
            confirm against callers).
        sample_rate: Sampling rate of *audio* in Hz; defaults to 48000.

    Returns:
        A 1-D numpy array scaled to unit L2 norm (512-dim per the
        surrounding commit message).
    """
    inputs = _processor(audio=audio, sampling_rate=sample_rate, return_tensors="pt")
    with torch.no_grad():
        output = _model.get_audio_features(**inputs)
    # transformers 5.x wraps the result in a BaseModelOutputWithPooling;
    # older releases return the batched embedding tensor directly.
    if hasattr(output, "pooler_output"):
        emb = output.pooler_output[0].numpy()
    else:
        # NOTE(review): the pre-5.x path previously flattened here
        # (`embeddings[0].numpy().flatten()`); the flatten was dropped in
        # this change — verify output[0] is already 1-D on old versions.
        emb = output[0].numpy()
    # Scale to unit length before returning.
    emb = emb / np.linalg.norm(emb)
    return emb