Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions tools/mtmd/clip-model.h
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,13 @@ struct clip_hparams {
int32_t n_mel_bins = 0; // whisper preprocessor
int32_t proj_stack_factor = 0; // ultravox

// audio-to-mel preprocessor params
int32_t audio_chunk_len = -1; // in seconds
int32_t audio_sample_rate = -1;
int32_t audio_n_fft = -1;
int32_t audio_window_len = -1;
int32_t audio_hop_len = -1;

// legacy
bool has_llava_projector = false;
int minicpmv_version = 0;
Expand Down Expand Up @@ -277,3 +284,5 @@ struct clip_model {
|| proj_type == PROJECTOR_TYPE_VOXTRAL;
}
};

const clip_hparams * clip_get_hparams(const struct clip_ctx * ctx);
19 changes: 16 additions & 3 deletions tools/mtmd/clip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1141,11 +1141,15 @@ struct clip_model_loader {
bool require_stack = model.proj_type == PROJECTOR_TYPE_ULTRAVOX ||
model.proj_type == PROJECTOR_TYPE_VOXTRAL;
get_u32(KEY_A_PROJ_STACK_FACTOR, hparams.proj_stack_factor, require_stack);
if (hparams.n_mel_bins != 128) {
throw std::runtime_error(string_format("%s: only 128 mel bins are supported for ultravox\n", __func__));
}
hparams.ffn_op = FFN_GELU_ERF;
log_ffn_op = "gelu_erf"; // temporary solution for logging

// audio preprocessing params
hparams.audio_chunk_len = 30; // in seconds
hparams.audio_sample_rate = 16000;
hparams.audio_n_fft = 400;
hparams.audio_window_len = 400;
hparams.audio_hop_len = 160;
} break;
default:
break;
Expand Down Expand Up @@ -1183,6 +1187,11 @@ struct clip_model_loader {
LOG_INF("\n--- audio hparams ---\n");
LOG_INF("%s: n_mel_bins: %d\n", __func__, hparams.n_mel_bins);
LOG_INF("%s: proj_stack_factor: %d\n", __func__, hparams.proj_stack_factor);
LOG_INF("%s: audio_chunk_len: %d\n", __func__, hparams.audio_chunk_len);
LOG_INF("%s: audio_sample_rate: %d\n", __func__, hparams.audio_sample_rate);
LOG_INF("%s: audio_n_fft: %d\n", __func__, hparams.audio_n_fft);
LOG_INF("%s: audio_window_len: %d\n", __func__, hparams.audio_window_len);
LOG_INF("%s: audio_hop_len: %d\n", __func__, hparams.audio_hop_len);
}
LOG_INF("\n");
LOG_INF("%s: model size: %.2f MiB\n", __func__, model_size / 1024.0 / 1024.0);
Expand Down Expand Up @@ -3416,3 +3425,7 @@ void clip_image_f32_batch_add_mel(struct clip_image_f32_batch * batch, int n_mel
batch->entries.push_back(clip_image_f32_ptr(audio));
batch->is_audio = true;
}

const clip_hparams * clip_get_hparams(const struct clip_ctx * ctx) {
return &ctx->model.hparams;
}
Loading
Loading