Commit e39a2ce

clip: move model cgraphs into their own files (#17965)
* clip: move model cgraphs into their own files
* more explicit enums
* fix linux build
* fix naming
* missing headers
* nits: add comments for contributors

1 parent a8c7f33 commit e39a2ce

File tree

18 files changed (+2386, -2188 lines)

tools/mtmd/CMakeLists.txt

Lines changed: 16 additions & 2 deletions

@@ -6,11 +6,25 @@ add_library(mtmd
     mtmd.cpp
     mtmd-audio.cpp
     mtmd.h
+    mtmd-helper.cpp
+    mtmd-helper.h
     clip.cpp
     clip.h
     clip-impl.h
-    mtmd-helper.cpp
-    mtmd-helper.h
+    clip-model.h
+    clip-graph.h
+    models/models.h
+    models/cogvlm.cpp
+    models/internvl.cpp
+    models/kimivl.cpp
+    models/llama4.cpp
+    models/llava.cpp
+    models/minicpmv.cpp
+    models/pixtral.cpp
+    models/qwen2vl.cpp
+    models/qwen3vl.cpp
+    models/siglip.cpp
+    models/whisper-enc.cpp
     )

 set_target_properties(mtmd PROPERTIES

tools/mtmd/clip-graph.h

Lines changed: 115 additions & 0 deletions

@@ -0,0 +1,115 @@
+#pragma once
+
+#include "ggml.h"
+#include "ggml-cpp.h"
+#include "clip.h"
+#include "clip-impl.h"
+#include "clip-model.h"
+
+#include <vector>
+#include <functional>
+
+struct clip_graph {
+    const clip_model & model;
+    const clip_hparams & hparams;
+    projector_type proj_type;
+
+    // we only support a single image per batch
+    const clip_image_f32 & img;
+
+    const int patch_size;
+    const int n_patches_x;
+    const int n_patches_y;
+    const int n_patches;
+    const int n_embd;
+    const int n_head;
+    const int d_head;
+    const int n_layer;
+    const int n_mmproj_embd;
+    const float eps;
+    const float kq_scale;
+    const clip_flash_attn_type flash_attn_type;
+
+    // for debugging
+    const bool debug_graph;
+    std::vector<ggml_tensor *> & debug_print_tensors;
+
+    ggml_context_ptr ctx0_ptr;
+    ggml_context * ctx0;
+    ggml_cgraph * gf;
+
+    clip_graph(clip_ctx * ctx, const clip_image_f32 & img);
+
+    virtual ~clip_graph() = default;
+    virtual ggml_cgraph * build() = 0;
+
+    //
+    // utility functions
+    //
+    void cb(ggml_tensor * cur0, const char * name, int il) const;
+
+    // siglip2 naflex
+    ggml_tensor * resize_position_embeddings();
+
+    // build the vision transformer (ViT) cgraph
+    // this function should cover most of the models
+    // if your model has specific features, you should probably duplicate this function
+    ggml_tensor * build_vit(
+            ggml_tensor * inp,
+            int64_t n_pos,
+            norm_type norm_t,
+            ffn_op_type ffn_t,
+            ggml_tensor * learned_pos_embd,
+            std::function<ggml_tensor *(ggml_tensor *, const clip_layer &)> add_pos);
+
+    // build the input after conv2d (inp_raw --> patches)
+    // returns tensor with shape [n_embd, n_patches]
+    ggml_tensor * build_inp();
+
+    ggml_tensor * build_inp_raw(int channels = 3);
+
+    ggml_tensor * build_norm(
+            ggml_tensor * cur,
+            ggml_tensor * mw,
+            ggml_tensor * mb,
+            norm_type type,
+            float norm_eps,
+            int il) const;
+
+    ggml_tensor * build_ffn(
+            ggml_tensor * cur,
+            ggml_tensor * up,
+            ggml_tensor * up_b,
+            ggml_tensor * gate,
+            ggml_tensor * gate_b,
+            ggml_tensor * down,
+            ggml_tensor * down_b,
+            ffn_op_type type_op,
+            int il) const;
+
+    ggml_tensor * build_attn(
+            ggml_tensor * wo,
+            ggml_tensor * wo_b,
+            ggml_tensor * q_cur,
+            ggml_tensor * k_cur,
+            ggml_tensor * v_cur,
+            ggml_tensor * kq_mask,
+            float kq_scale,
+            int il) const;
+
+    // implementation of 2D RoPE without adding a new op in ggml
+    // this is not efficient (it uses double the memory), but it works on all backends
+    // TODO: there was a more efficient implementation which relied on ggml_view and ggml_rope_ext_inplace, but the inplace rope does not work well with non-contiguous tensors; we should fix that and revert to the original implementation in https://github.com/ggml-org/llama.cpp/pull/13065
+    ggml_tensor * build_rope_2d(
+            ggml_context * ctx0,
+            ggml_tensor * cur,
+            ggml_tensor * pos_a, // first half
+            ggml_tensor * pos_b, // second half
+            const float freq_base,
+            const bool interleave_freq
+    );
+
+    // aka pixel_shuffle / pixel_unshuffle / patch_merger (Kimi-VL)
+    // supports dynamic resolution
+    ggml_tensor * build_patch_merge_permute(ggml_tensor * cur, int scale_factor);
+};
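
With this base struct in place, each translation unit under models/ can derive from clip_graph and implement build() for a single architecture. The sketch below only illustrates the pattern; clip_graph_mymodel is a made-up name, and the NORM_TYPE_NORMAL / FFN_GELU enum values and the model.position_embeddings field are assumptions based on the pre-existing clip.cpp code, not contents shown in this commit.

// models/mymodel.cpp -- hypothetical example of a per-model cgraph builder
#include "clip-graph.h"

struct clip_graph_mymodel : clip_graph {
    clip_graph_mymodel(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}

    ggml_cgraph * build() override {
        // patch embeddings after conv2d: [n_embd, n_patches]
        ggml_tensor * inp = build_inp();

        // generic ViT encoder with learned positional embeddings and no per-layer add_pos callback
        ggml_tensor * cur = build_vit(
                inp, n_patches,
                NORM_TYPE_NORMAL,          // assumed enum value
                FFN_GELU,                  // assumed enum value
                model.position_embeddings, // assumed field name
                nullptr);

        // a model-specific projector (e.g. an MLP built with build_ffn) would follow here

        ggml_build_forward_expand(gf, cur);
        return gf;
    }
};

This mirrors the contributor note in the header: build_vit() should cover most models, and a model with specific features duplicates that logic in its own file rather than adding branches to a shared graph.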

tools/mtmd/clip-impl.h

Lines changed: 6 additions & 0 deletions

@@ -1,3 +1,5 @@
+#pragma once
+
 #include "ggml.h"
 #include "gguf.h"
 #include "clip.h"
@@ -134,6 +136,10 @@
 // align x to upper multiple of n
 #define CLIP_ALIGN(x, n) ((((x) + (n) - 1) / (n)) * (n))

+// forward declaration
+// TODO: improve this later
+struct clip_ctx;
+
 enum projector_type {
     PROJECTOR_TYPE_MLP,
     PROJECTOR_TYPE_MLP_NORM,
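
The forward-declared clip_ctx together with the projector_type enum hints at how clip.cpp can select the derived graph for a given model. A minimal sketch of such a dispatch, assuming hypothetical names (graph_for, clip_graph_llava); the factory actually used in the commit may look different:

// clip.cpp (sketch) -- mapping a projector_type to a derived clip_graph
#include <memory>

static std::unique_ptr<clip_graph> graph_for(clip_ctx * ctx, const clip_image_f32 & img, projector_type proj) {
    switch (proj) {
        case PROJECTOR_TYPE_MLP:
        case PROJECTOR_TYPE_MLP_NORM:
            return std::make_unique<clip_graph_llava>(ctx, img); // assumed struct name
        // ... one case (or group of cases) per source file under models/
        default:
            GGML_ABORT("unsupported projector type");
    }
}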
