Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 16 additions & 2 deletions tools/mtmd/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,25 @@ add_library(mtmd
mtmd.cpp
mtmd-audio.cpp
mtmd.h
mtmd-helper.cpp
mtmd-helper.h
clip.cpp
clip.h
clip-impl.h
mtmd-helper.cpp
mtmd-helper.h
clip-model.h
clip-graph.h
models/models.h
models/cogvlm.cpp
models/internvl.cpp
models/kimivl.cpp
models/llama4.cpp
models/llava.cpp
models/minicpmv.cpp
models/pixtral.cpp
models/qwen2vl.cpp
models/qwen3vl.cpp
models/siglip.cpp
models/whisper-enc.cpp
)

set_target_properties(mtmd PROPERTIES
Expand Down
115 changes: 115 additions & 0 deletions tools/mtmd/clip-graph.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
#pragma once

#include "ggml.h"
#include "ggml-cpp.h"
#include "clip.h"
#include "clip-impl.h"
#include "clip-model.h"

#include <vector>
#include <functional>

// Base class for building the ggml compute graph of a CLIP-style encoder.
// One instance is constructed per input image; concrete models (listed in
// models/models.h) derive from this and implement build().
struct clip_graph {
    const clip_model & model;     // model tensors/weights (owned elsewhere)
    const clip_hparams & hparams; // model hyperparameters
    projector_type proj_type;     // which multimodal projector variant is in use

    // we only support single image per batch
    const clip_image_f32 & img;

    // dimensions cached at construction time (const for the graph's lifetime)
    const int patch_size;         // side length of one image patch, in pixels
    const int n_patches_x;        // number of patches along x
    const int n_patches_y;        // number of patches along y
    const int n_patches;          // total patch count
    const int n_embd;             // encoder embedding dimension
    const int n_head;             // number of attention heads
    const int d_head;             // per-head dimension
    const int n_layer;            // number of transformer layers
    const int n_mmproj_embd;      // embedding dimension after the multimodal projector
    const float eps;              // epsilon for normalization layers
    const float kq_scale;         // attention score scale (presumably 1/sqrt(d_head) — set in ctor, confirm there)
    const clip_flash_attn_type flash_attn_type; // whether/which flash-attention path to use

    // for debugging: tensors registered via cb() can be collected into
    // debug_print_tensors when debug_graph is enabled
    const bool debug_graph;
    std::vector<ggml_tensor *> & debug_print_tensors;

    ggml_context_ptr ctx0_ptr;    // owning pointer for the graph-build context
    ggml_context * ctx0;          // raw view of ctx0_ptr for convenience
    ggml_cgraph * gf;             // the compute graph under construction

    // initializes all const fields above from ctx and img
    clip_graph(clip_ctx * ctx, const clip_image_f32 & img);

    virtual ~clip_graph() = default;
    // build and return the complete compute graph; implemented per model
    virtual ggml_cgraph * build() = 0;

    //
    // utility functions
    //

    // callback used while building: names tensor cur0 (il = layer index,
    // negative for tensors outside any layer — convention, confirm in impl)
    void cb(ggml_tensor * cur0, const char * name, int il) const;

    // siglip2 naflex
    ggml_tensor * resize_position_embeddings();

    // build vision transformer (ViT) cgraph
    // this function should cover most of the models
    // if your model has specific features, you should probably duplicate this function
    //   inp              : input patch embeddings
    //   n_pos            : number of positions (patches, plus any special tokens)
    //   norm_t           : normalization type for the transformer blocks
    //   ffn_t            : activation used in the feed-forward blocks
    //   learned_pos_embd : learned positional embedding (nullable? — confirm in impl)
    //   add_pos          : per-layer hook applying positional encoding (e.g. RoPE) to a tensor
    ggml_tensor * build_vit(
            ggml_tensor * inp,
            int64_t n_pos,
            norm_type norm_t,
            ffn_op_type ffn_t,
            ggml_tensor * learned_pos_embd,
            std::function<ggml_tensor *(ggml_tensor *, const clip_layer &)> add_pos);

    // build the input after conv2d (inp_raw --> patches)
    // returns tensor with shape [n_embd, n_patches]
    ggml_tensor * build_inp();

    // build the raw input tensor; channels defaults to 3 (RGB)
    ggml_tensor * build_inp_raw(int channels = 3);

    // apply normalization of the given type to cur, with weight mw and bias mb
    ggml_tensor * build_norm(
            ggml_tensor * cur,
            ggml_tensor * mw,
            ggml_tensor * mb,
            norm_type type,
            float norm_eps,
            int il) const;

    // feed-forward block: up projection (+ optional bias), optional gate
    // (+ bias), down projection (+ bias); activation selected by type_op
    ggml_tensor * build_ffn(
            ggml_tensor * cur,
            ggml_tensor * up,
            ggml_tensor * up_b,
            ggml_tensor * gate,
            ggml_tensor * gate_b,
            ggml_tensor * down,
            ggml_tensor * down_b,
            ffn_op_type type_op,
            int il) const;

    // multi-head attention over q/k/v, followed by output projection wo
    // (+ optional bias wo_b); kq_mask masks attention scores
    ggml_tensor * build_attn(
            ggml_tensor * wo,
            ggml_tensor * wo_b,
            ggml_tensor * q_cur,
            ggml_tensor * k_cur,
            ggml_tensor * v_cur,
            ggml_tensor * kq_mask,
            float kq_scale,
            int il) const;

    // implementation of the 2D RoPE without adding a new op in ggml
    // this is not efficient (use double the memory), but works on all backends
    // TODO: there was a more efficient which relies on ggml_view and ggml_rope_ext_inplace, but the rope inplace does not work well with non-contiguous tensors ; we should fix that and revert back to the original implementation in https://github.com/ggml-org/llama.cpp/pull/13065
    ggml_tensor * build_rope_2d(
            ggml_context * ctx0,
            ggml_tensor * cur,
            ggml_tensor * pos_a, // first half
            ggml_tensor * pos_b, // second half
            const float freq_base,
            const bool interleave_freq
    );

    // aka pixel_shuffle / pixel_unshuffle / patch_merger (Kimi-VL)
    // support dynamic resolution
    ggml_tensor * build_patch_merge_permute(ggml_tensor * cur, int scale_factor);
};
6 changes: 6 additions & 0 deletions tools/mtmd/clip-impl.h
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
#pragma once

#include "ggml.h"
#include "gguf.h"
#include "clip.h"
Expand Down Expand Up @@ -134,6 +136,10 @@
// align x to upper multiple of n (round up to the nearest multiple);
// assumes x >= 0 and n > 0 — TODO confirm no negative-value callers.
// NOTE: n is evaluated three times, so avoid side effects in the arguments.
#define CLIP_ALIGN(x, n) ((((x) + (n) - 1) / (n)) * (n))

// forward declaration
// TODO: improve this later
struct clip_ctx;

enum projector_type {
PROJECTOR_TYPE_MLP,
PROJECTOR_TYPE_MLP_NORM,
Expand Down
Loading
Loading