99 commits
43a130b
mtmd: llama.cpp DeepSeekOCR support
sfallah Nov 14, 2025
b6b9f02
loading sam tensors
sfallah Nov 14, 2025
85c7cda
mtmd: fix vision model processing
bluebread Nov 15, 2025
578c8d7
Merge pull request #1 from bluebread/sf/deepseek-ocr
sfallah Nov 15, 2025
2aab52e
deepseek-ocr clip-vit model impl
sfallah Nov 15, 2025
eab28ed
mtmd: add DeepSeek-OCR LM support with standard attention
bluebread Nov 15, 2025
7630587
mtmd: successfully runs DeepSeek-OCR LM in llama-cli
bluebread Nov 16, 2025
2de3436
mtmd: Fix RoPE type for DeepSeek-OCR LM.
bluebread Nov 17, 2025
e8b2610
Merge branch 'sf/deepseek-ocr' of github.com:sfallah/llama.cpp into s…
bluebread Nov 17, 2025
97e0907
loading LM
sfallah Nov 17, 2025
13dc6fb
Merge branch 'sf/deepseek-ocr' into sf/deepseek-ocr
sfallah Nov 17, 2025
b32bb5e
Merge pull request #2 from bluebread/sf/deepseek-ocr
sfallah Nov 17, 2025
790bbb9
sam warmup working
sfallah Nov 17, 2025
cec9a5c
sam erroneous return corrected
sfallah Nov 17, 2025
8b3d319
clip-vit: corrected cls_embd concat
sfallah Nov 17, 2025
1e08157
clip-vit: model convert qkv_proj split
sfallah Nov 17, 2025
331cea8
corrected combining of image encoders' results
sfallah Nov 18, 2025
6c0715b
fix: update callback for ffn_moe_weighted and add callback for attn_o…
bluebread Nov 18, 2025
a65ddf5
Merge branch 'sf/deepseek-ocr' of github.com:sfallah/llama.cpp into s…
bluebread Nov 18, 2025
63a042f
concat image_newline and image_seperator tokens
sfallah Nov 18, 2025
89afda8
visual_model warmup (technically) works
sfallah Nov 18, 2025
88032f4
window partitioning using standard ggml ops
sfallah Nov 20, 2025
1268dc3
Merge branch 'sf/deepseek-ocr' of github.com:sfallah/llama.cpp into s…
bluebread Nov 20, 2025
68b206b
sam implementation without using CPU only ops
sfallah Nov 21, 2025
8bce66d
clip: fixed warnings
bluebread Nov 21, 2025
5e6cf3c
Merge branch 'sf/deepseek-ocr' of github.com:sfallah/llama.cpp into s…
bluebread Nov 21, 2025
7e9fbec
mtmd: fix get_rel_pos
bluebread Nov 21, 2025
0f5587d
Merge branch 'sf/deepseek-ocr' of github.com:sfallah/llama.cpp into s…
bluebread Nov 21, 2025
7b8d735
mtmd: fixed the wrong scaler for get_rel_pos
bluebread Nov 21, 2025
86f111f
image encoding technically works but the output can't be checked sing…
sfallah Nov 21, 2025
effe669
mtmd: minor changed
bluebread Nov 22, 2025
f8f66a1
Merge branch 'sf/deepseek-ocr' of github.com:sfallah/llama.cpp into s…
bluebread Nov 22, 2025
3fcfc3a
Merge pull request #3 from bluebread/sf/deepseek-ocr
sfallah Nov 22, 2025
ee8a148
mtmd: add native resolution support
bluebread Nov 22, 2025
4cfa15f
- image encoding debugged
sfallah Nov 22, 2025
3f71188
mtmd: correct token order
bluebread Nov 23, 2025
a594990
Merge pull request #5 from bluebread/dsocr-debug
sfallah Nov 23, 2025
6dfda99
Merge branch 'sf/deepseek-ocr' into sf/deepseek-ocr
sfallah Nov 23, 2025
7941f5d
Merge pull request #4 from bluebread/sf/deepseek-ocr
sfallah Nov 23, 2025
206f8ab
- dynamic resizing
sfallah Nov 23, 2025
40e7e6e
mtmd: quick fix token order
bluebread Nov 24, 2025
81533e4
mtmd: fix danling pointer
bluebread Nov 24, 2025
8810940
Merge pull request #6 from bluebread/sf/deepseek-ocr
sfallah Nov 24, 2025
a488b49
mtmd: SAM numerically works
bluebread Nov 29, 2025
ccb2f23
mtmd: debug CLIP-L (vit_pre_ln)
bluebread Nov 29, 2025
841a4a8
mtmd: debug CLIP-L & first working DeepSeek-OCR model
bluebread Nov 29, 2025
ed3b7f1
Merge remote-tracking branch 'sfallah/master' into sf/deepseek-ocr
sfallah Nov 30, 2025
5543094
Merge branch 'sf/deepseek-ocr' of github.com:sfallah/llama.cpp into s…
bluebread Nov 30, 2025
c5f4c64
mtmd : add --dsocr-mode CLI argument for DeepSeek-OCR resolution cont…
bluebread Nov 30, 2025
95239f9
mtmd: simplify SAM patch embedding
bluebread Dec 1, 2025
6b0e7cd
Merge pull request #7 from bluebread/sf/deepseek-ocr
sfallah Dec 2, 2025
6634166
Merge branch 'master' into sf/deepseek-ocr
sfallah Dec 2, 2025
c914e05
mtmd: adapt Pillow image resizing function
bluebread Dec 3, 2025
e20857b
mtmd: simplify DeepSeek-OCR dynamic resolution preprocessing
bluebread Dec 3, 2025
43dfc0c
Merge branch 'sf/deepseek-ocr' of github.com:sfallah/llama.cpp into s…
bluebread Dec 3, 2025
b696c54
mtmd: remove --dsocr-mode argument
bluebread Dec 3, 2025
b26b507
mtmd: refactor code & remove unused helper functions
bluebread Dec 3, 2025
7451b84
mtmd: fix tensor names for image newlines and view separator
bluebread Dec 4, 2025
386ba47
clean up
sfallah Dec 4, 2025
c73748a
Merge branch 'sf/deepseek-ocr' into sf/deepseek-ocr-cleanup
sfallah Dec 4, 2025
a661c52
reverting automatically removed spaces
sfallah Dec 4, 2025
0399ddf
reverting automatically removed spaces
sfallah Dec 4, 2025
c89171c
mtmd: fixed bad ocr check in Deepseek2 (LM)
bluebread Dec 4, 2025
2dd9924
Merge branch 'sf/deepseek-ocr-cleanup' of github.com:sfallah/llama.cp…
bluebread Dec 4, 2025
fc3f625
mtmd: support combined QKV projection in buid_vit
bluebread Dec 4, 2025
4d7d994
Merge pull request #8 from sfallah/sf/deepseek-ocr-cleanup
sfallah Dec 4, 2025
5381b9c
using common build_attn in sam
sfallah Dec 4, 2025
076138a
corrected code-branch when flash-attn disabled
sfallah Dec 4, 2025
d0c08e3
mtmd: minor fix
bluebread Dec 5, 2025
f5bd310
minor formatting and style
sfallah Dec 5, 2025
6687b4e
Merge pull request #9 from sfallah/sf/deepseek-ocr-attn
sfallah Dec 5, 2025
5f2ee1a
Merge branch 'ggml-org:master' into sf/deepseek-ocr
sfallah Dec 5, 2025
1c88647
fixed flake8 lint issues
sfallah Dec 5, 2025
d981f19
minor editorconfig-check fixes
sfallah Dec 5, 2025
705394c
minor editorconfig-check fixes
sfallah Dec 5, 2025
15f2ada
mtmd: simplify get_rel_pos
bluebread Dec 6, 2025
2d918b3
mtmd: make sam hparams configurable
bluebread Dec 6, 2025
5dfcc5a
mtmd: add detailed comments for resize_bicubic_pillow
bluebread Dec 7, 2025
53273f8
mtmd: fixed wrong input setting
bluebread Dec 7, 2025
48c6cf2
mtmd: convert model in FP16
bluebread Dec 8, 2025
5174a1e
mtmd: minor fix
bluebread Dec 8, 2025
0161406
mtmd: remove tweak to llama-mtmd-cli & deepseek-ocr template
bluebread Dec 9, 2025
ed944cd
fix: test-1.jpg ORC issue with small (640) resolution
sfallah Dec 10, 2025
aaf2fd1
minor: editconfig-check fix
sfallah Dec 11, 2025
33fabf0
Merge branch 'master' into sf/deepseek-ocr-merge-test
sfallah Dec 11, 2025
d70f171
merge with changes from https://github.com/ggml-org/llama.cpp/pull/17909
sfallah Dec 11, 2025
4cbbe8a
minor: editconfig-check fix
sfallah Dec 11, 2025
47f0fee
testing deepseek-ocr
sfallah Dec 11, 2025
e0e69fd
Merge remote-tracking branch 'sfallah/master' into sf/deepseek-ocr-me…
sfallah Dec 13, 2025
f95a6fe
quick and (potential) dirty merge with https://github.com/ggml-org/ll…
sfallah Dec 13, 2025
f7736f2
refactoring, one single builder function and static helpers
sfallah Dec 13, 2025
fb3bb6a
added deepseek-ocr test to tests.sh
sfallah Dec 13, 2025
1b38ccf
Merge pull request #11 from sfallah/sf/deepseek-ocr-merge_#17965
sfallah Dec 13, 2025
6c36c03
minor formatting fixes
sfallah Dec 14, 2025
dc2066e
check with fixed expected resutls
sfallah Dec 14, 2025
3fc61d4
Merge pull request #10 from sfallah/sf/deepseek-ocr-test-script
sfallah Dec 14, 2025
7f8621c
minor formatting
sfallah Dec 14, 2025
b3bf8cb
Merge remote-tracking branch 'sfallah/master' into sf/deepseek-ocr
sfallah Dec 15, 2025
8ad98ee
editorconfig-check fix
sfallah Dec 15, 2025
113 changes: 99 additions & 14 deletions convert_hf_to_gguf.py
@@ -697,6 +697,9 @@ def load_hparams(dir_model: Path, is_mistral_format: bool):
if "thinker_config" in config:
# rename for Qwen2.5-Omni
config["text_config"] = config["thinker_config"]["text_config"]
if "language_config" in config:
# rename for DeepSeekOCR
config["text_config"] = config["language_config"]
return config

@classmethod
@@ -1531,7 +1534,7 @@ class MmprojModel(ModelBase):
preprocessor_config: dict[str, Any]
global_config: dict[str, Any]

n_block_keys = ["n_layers", "num_hidden_layers", "n_layer", "num_layers", "depth"]
n_block_keys = ["n_layers", "num_hidden_layers", "n_layer", "num_layers", "depth", "layers"]

has_vision_encoder: bool = True # by default
has_audio_encoder: bool = False
@@ -1576,7 +1579,7 @@ def __init__(self, *args, **kwargs):

# TODO @ngxson : this is a hack to support both vision and audio encoders
have_multiple_encoders = self.has_audio_encoder and self.has_vision_encoder
self.block_count = 128 if have_multiple_encoders else self.find_hparam(self.n_block_keys, True)
self.block_count = 128 if have_multiple_encoders else self.find_hparam(self.n_block_keys)
self.tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.MMPROJ, self.block_count)

# load preprocessor config
@@ -5990,6 +5993,68 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter

return [] # skip other tensors

@ModelBase.register("DeepseekOCRForCausalLM")
class DeepseekOCRVisionModel(MmprojModel):
def set_gguf_parameters(self):
super().set_gguf_parameters()
hparams = self.hparams
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.DEEPSEEKOCR)
# default values below are taken from HF transformers code
self.gguf_writer.add_vision_attention_layernorm_eps(hparams.get("layer_norm_eps", 1e-6))
self.gguf_writer.add_vision_use_gelu(True)
# calculate proj_scale_factor (used by tinygemma3 test model)
image_seq_length = self.preprocessor_config.get("image_seq_length", 256)
n_per_side = int(image_seq_length ** 0.5)
image_size = self.hparams["image_size"]
patch_size = self.hparams["patch_size"]
proj_scale_factor = (image_size // patch_size) // n_per_side
if proj_scale_factor > 0 and proj_scale_factor != 4:
# we only need to write this if it's not the default value
# in this case, we are converting a test model
self.gguf_writer.add_vision_projector_scale_factor(proj_scale_factor)

# SAM configuration
sam_hparams = hparams['sam']
self.gguf_writer.add_vision_sam_layers_count(sam_hparams['layers'])
self.gguf_writer.add_vision_sam_embedding_length(sam_hparams['width'])

def get_vision_config(self) -> dict[str, Any]:
vision_config: dict[str, Any] | None = self.global_config.get("vision_config")

if not vision_config:
raise ValueError("DeepseekOCR model requires 'vision_config' in the model configuration, but it was not found")

vision_config['sam'] = vision_config['width']['sam_vit_b']
vision_config.update(vision_config['width']['clip-l-14-224'])
vision_config['hidden_size'] = vision_config['width']
vision_config['num_heads'] = vision_config['heads']
vision_config['intermediate_size'] = vision_config['heads'] * 4

return vision_config


def tensor_force_quant(self, name, new_name, bid, n_dims):
# TODO: increase numerical stability. maybe delete later.
return gguf.GGMLQuantizationType.F32
# related to https://github.com/ggml-org/llama.cpp/issues/13025
# if "input_projection" in name:
# return gguf.GGMLQuantizationType.F16
# if ".embeddings." in name:
# return gguf.GGMLQuantizationType.F32
# return super().tensor_force_quant(name, new_name, bid, n_dims)

def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
# Only process vision-related tensors, skip language model tensors
# Vision components: sam_model, vision_model, projector, image_newline, view_seperator
# Language model components to skip: lm_head, embed_tokens, layers, norm
if name.startswith(("lm_head.", "model.embed_tokens.", "model.layers.", "model.norm.")):
return []

if ".attn.rel_pos_h" in name or ".attn.rel_pos_w" in name:
return [(self.map_tensor_name(name, try_suffixes=("",)), data_torch)]

return [(self.map_tensor_name(name), data_torch)]


@ModelBase.register("Gemma3nForConditionalGeneration")
class Gemma3NModel(Gemma3Model):
@@ -7164,6 +7229,15 @@ def prepare_tensors(self):
class DeepseekV2Model(TextModel):
model_arch = gguf.MODEL_ARCH.DEEPSEEK2

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
vision_config = self.hparams.get('vision_config', {}).get('width', {})

if 'clip-l-14-224' in vision_config and 'sam_vit_b' in vision_config:
self.model_arch = gguf.MODEL_ARCH.DEEPSEEK2OCR
self.gguf_writer.arch = gguf.MODEL_ARCH_NAMES[self.model_arch]
self.gguf_writer.add_architecture()

def set_vocab(self):
try:
self._set_vocab_gpt2()
@@ -7219,30 +7293,40 @@ def set_vocab(self):
raise NotImplementedError(f"Deepseek pre-tokenizer {tokpre!r} is not supported yet!")

def set_gguf_parameters(self):
is_ocr = (self.model_arch == gguf.MODEL_ARCH.DEEPSEEK2OCR)

# note: deepseek2 using MLA converts into MQA (ie: GQA with 1 group)
self.hparams["num_key_value_heads"] = 1
if is_ocr:
self.hparams['rope_theta'] = self.hparams.get('rope_theta', 10000.0)
self.hparams['rms_norm_eps'] = self.hparams.get('rms_norm_eps', 1e-6)
else:
# note: deepseek2 using MLA converts into MQA (ie: GQA with 1 group)
self.hparams["num_key_value_heads"] = 1

super().set_gguf_parameters()
hparams = self.hparams

kv_lora_rank = hparams["kv_lora_rank"] if hparams.get("kv_lora_rank") is not None else 512
routed_scaling_factor = hparams.get("routed_scaling_factor", 1.0)
norm_topk_prob = hparams.get("norm_topk_prob", False)
self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"])
self.gguf_writer.add_vocab_size(hparams["vocab_size"])
if "q_lora_rank" in hparams and hparams["q_lora_rank"] is not None:
self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"])
self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"])
if "kv_lora_rank" in hparams and hparams["kv_lora_rank"] is not None:
self.gguf_writer.add_kv_lora_rank(kv_lora_rank)

# note: deepseek2 using MLA converts into MQA with larger heads, then decompresses to MHA
self.gguf_writer.add_key_length(hparams["kv_lora_rank"] + hparams["qk_rope_head_dim"])
self.gguf_writer.add_value_length(hparams["kv_lora_rank"])
self.gguf_writer.add_key_length_mla(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"])
self.gguf_writer.add_value_length_mla(hparams["v_head_dim"])
if not is_ocr:
self.gguf_writer.add_key_length(kv_lora_rank + hparams["qk_rope_head_dim"])
self.gguf_writer.add_value_length(kv_lora_rank)
self.gguf_writer.add_key_length_mla(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"])
self.gguf_writer.add_value_length_mla(hparams["v_head_dim"])
self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])

self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
self.gguf_writer.add_expert_count(hparams["n_routed_experts"])
self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"])
self.gguf_writer.add_expert_weights_scale(hparams["routed_scaling_factor"])
self.gguf_writer.add_expert_weights_norm(hparams["norm_topk_prob"])
self.gguf_writer.add_expert_weights_scale(routed_scaling_factor)
self.gguf_writer.add_expert_weights_norm(norm_topk_prob)

self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])

@@ -7252,12 +7336,14 @@ def set_gguf_parameters(self):
self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1 * rope_scaling["mscale_all_dim"])
self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("rms_norm_eps", 1e-6))

_experts: list[dict[str, Tensor]] | None = None

def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
# skip vision tensors and remove "language_model." for Kimi-VL
if "vision_tower" in name or "multi_modal_projector" in name:
if "vision_" in name or "multi_modal_projector" in name \
or "image_newline" in name or "model.projector" in name or "sam_model" in name or "view_seperator" in name:
return []

if name.startswith("language_model."):
@@ -7337,7 +7423,6 @@ def prepare_tensors(self):
if len(experts) > 0:
raise ValueError(f"Unprocessed experts: {experts}")


@ModelBase.register("MiniMaxM2ForCausalLM")
class MiniMaxM2Model(TextModel):
model_arch = gguf.MODEL_ARCH.MINIMAXM2
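The conversion logic above hinges on the nested vision_config layout of the DeepSeek-OCR checkpoint. Below is a minimal sketch of the routing and flattening, assuming a config.json shaped the way get_vision_config expects it; the numeric values are illustrative SAM-ViT-B / CLIP-L figures, not taken from the released model.

```python
# Sketch only (not part of the diff): assumed DeepSeek-OCR config layout.
hparams = {
    "vision_config": {
        "width": {
            "sam_vit_b": {"layers": 12, "width": 768},
            "clip-l-14-224": {"layers": 24, "width": 1024, "heads": 16,
                              "image_size": 224, "patch_size": 14},
        },
        "layer_norm_eps": 1e-6,
    }
}

# Same detection used in DeepseekV2Model.__init__ to switch the text arch.
widths = hparams.get("vision_config", {}).get("width", {})
is_ocr = "clip-l-14-224" in widths and "sam_vit_b" in widths
print("text arch:", "deepseek2-ocr" if is_ocr else "deepseek2")

# Same flattening performed by DeepseekOCRVisionModel.get_vision_config.
vision = dict(hparams["vision_config"])
vision["sam"] = vision["width"]["sam_vit_b"]
vision.update(vision["width"]["clip-l-14-224"])   # pulls in CLIP-L keys, incl. "width"
vision["hidden_size"] = vision["width"]           # CLIP-L width after the update
vision["num_heads"] = vision["heads"]
print(vision["hidden_size"], vision["sam"]["layers"])
```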
2 changes: 2 additions & 0 deletions ggml/src/ggml-cuda/upscale.cu
@@ -289,5 +289,7 @@ void ggml_cuda_op_upscale(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
upscale_f32_bicubic_cuda(src0_d, dst_d, src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3],
src0->ne[0], src0->ne[1], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
sf0, sf1, sf2, sf3, pixel_offset, stream);
} else {
GGML_ABORT("fatal error");
}
}
1 change: 1 addition & 0 deletions ggml/src/ggml.c
@@ -5206,6 +5206,7 @@ struct ggml_tensor * ggml_flash_attn_ext(
GGML_ASSERT(q->ne[3] == v->ne[3]);

if (mask) {
GGML_ASSERT(mask->type == GGML_TYPE_F16);
GGML_ASSERT(ggml_is_contiguous(mask));
GGML_ASSERT(mask->ne[1] >= GGML_PAD(q->ne[1], GGML_KQ_MASK_PAD) &&
"the Flash-Attention kernel requires the mask to be padded to GGML_KQ_MASK_PAD and at least n_queries big");
92 changes: 92 additions & 0 deletions gguf-py/gguf/constants.py
@@ -302,6 +302,10 @@ class Attention:
class Projector:
SCALE_FACTOR = "clip.vision.projector.scale_factor"

class SAM:
BLOCK_COUNT = "clip.vision.sam.block_count"
EMBEDDING_LENGTH = "clip.vision.sam.embedding_length"

class ClipAudio:
NUM_MEL_BINS = "clip.audio.num_mel_bins"
EMBEDDING_LENGTH = "clip.audio.embedding_length"
@@ -404,6 +408,7 @@ class MODEL_ARCH(IntEnum):
ARCTIC = auto()
DEEPSEEK = auto()
DEEPSEEK2 = auto()
DEEPSEEK2OCR = auto()
CHATGLM = auto()
GLM4 = auto()
GLM4_MOE = auto()
@@ -685,6 +690,22 @@ class MODEL_TENSOR(IntEnum):
V_MM_GATE = auto() # cogvlm
V_TOK_BOI = auto() # cogvlm
V_TOK_EOI = auto() # cogvlm
V_SAM_POS_EMBD = auto() # Deepseek-OCR
V_SAM_PATCH_EMBD = auto() # Deepseek-OCR
V_SAM_PRE_NORM = auto() # Deepseek-OCR
V_SAM_POST_NORM = auto() # Deepseek-OCR
V_SAM_ATTN_POS_H = auto() # Deepseek-OCR
V_SAM_ATTN_POS_W = auto() # Deepseek-OCR
V_SAM_ATTN_QKV = auto() # Deepseek-OCR
V_SAM_ATTN_OUT = auto() # Deepseek-OCR
V_SAM_MLP_LIN_1 = auto() # Deepseek-OCR
V_SAM_MLP_LIN_2 = auto() # Deepseek-OCR
V_SAM_NECK = auto() # Deepseek-OCR
V_SAM_NET_2 = auto() # Deepseek-OCR
V_SAM_NET_3 = auto() # Deepseek-OCR
V_ENC_EMBD_IMGNL = auto() # Deepseek-OCR
V_ENC_EMBD_VSEP = auto() # Deepseek-OCR

# audio (mtmd)
A_ENC_EMBD_POS = auto()
A_ENC_CONV1D = auto()
@@ -777,6 +798,7 @@ class MODEL_TENSOR(IntEnum):
MODEL_ARCH.ARCTIC: "arctic",
MODEL_ARCH.DEEPSEEK: "deepseek",
MODEL_ARCH.DEEPSEEK2: "deepseek2",
MODEL_ARCH.DEEPSEEK2OCR: "deepseek2-ocr",
MODEL_ARCH.CHATGLM: "chatglm",
MODEL_ARCH.GLM4: "glm4",
MODEL_ARCH.GLM4_MOE: "glm4moe",
@@ -1057,6 +1079,22 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.V_MM_GATE: "mm.gate",
MODEL_TENSOR.V_TOK_BOI: "v.boi",
MODEL_TENSOR.V_TOK_EOI: "v.eoi",
# DeepSeek-OCR SAM
MODEL_TENSOR.V_SAM_POS_EMBD: "v.sam.pos_embd",
MODEL_TENSOR.V_SAM_PATCH_EMBD: "v.sam.patch_embd",
MODEL_TENSOR.V_SAM_PRE_NORM: "v.sam.blk.{bid}.pre_ln",
MODEL_TENSOR.V_SAM_POST_NORM: "v.sam.blk.{bid}.post_ln",
MODEL_TENSOR.V_SAM_ATTN_POS_H: "v.sam.blk.{bid}.attn.pos_h",
MODEL_TENSOR.V_SAM_ATTN_POS_W: "v.sam.blk.{bid}.attn.pos_w",
MODEL_TENSOR.V_SAM_ATTN_QKV: "v.sam.blk.{bid}.attn.qkv",
MODEL_TENSOR.V_SAM_ATTN_OUT: "v.sam.blk.{bid}.attn.out",
MODEL_TENSOR.V_SAM_MLP_LIN_1: "v.sam.blk.{bid}.mlp.lin1",
MODEL_TENSOR.V_SAM_MLP_LIN_2: "v.sam.blk.{bid}.mlp.lin2",
MODEL_TENSOR.V_SAM_NECK: "v.sam.neck.{bid}",
MODEL_TENSOR.V_SAM_NET_2: "v.sam.net_2",
MODEL_TENSOR.V_SAM_NET_3: "v.sam.net_3",
MODEL_TENSOR.V_ENC_EMBD_IMGNL: "v.image_newline", # Deepseek-OCR
MODEL_TENSOR.V_ENC_EMBD_VSEP: "v.view_seperator", # Deepseek-OCR
# audio (mtmd)
MODEL_TENSOR.A_ENC_EMBD_POS: "a.position_embd",
MODEL_TENSOR.A_ENC_CONV1D: "a.conv1d.{bid}",
@@ -1093,6 +1131,8 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.V_ENC_EMBD_CLS,
MODEL_TENSOR.V_ENC_EMBD_PATCH,
MODEL_TENSOR.V_ENC_EMBD_POS,
MODEL_TENSOR.V_ENC_EMBD_IMGNL,
MODEL_TENSOR.V_ENC_EMBD_VSEP,
MODEL_TENSOR.V_ENC_INPUT_NORM,
MODEL_TENSOR.V_ENC_ATTN_QKV,
MODEL_TENSOR.V_ENC_ATTN_Q,
@@ -1135,6 +1175,19 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.V_MM_GATE,
MODEL_TENSOR.V_TOK_BOI,
MODEL_TENSOR.V_TOK_EOI,
MODEL_TENSOR.V_SAM_POS_EMBD,
MODEL_TENSOR.V_SAM_PATCH_EMBD,
MODEL_TENSOR.V_SAM_PRE_NORM,
MODEL_TENSOR.V_SAM_POST_NORM,
MODEL_TENSOR.V_SAM_ATTN_POS_H,
MODEL_TENSOR.V_SAM_ATTN_POS_W,
MODEL_TENSOR.V_SAM_ATTN_QKV,
MODEL_TENSOR.V_SAM_ATTN_OUT,
MODEL_TENSOR.V_SAM_MLP_LIN_1,
MODEL_TENSOR.V_SAM_MLP_LIN_2,
MODEL_TENSOR.V_SAM_NECK,
MODEL_TENSOR.V_SAM_NET_2,
MODEL_TENSOR.V_SAM_NET_3,
# audio
MODEL_TENSOR.A_ENC_EMBD_POS,
MODEL_TENSOR.A_ENC_CONV1D,
@@ -2303,7 +2356,41 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.ATTN_Q_B,
MODEL_TENSOR.ATTN_KV_A_MQA,
MODEL_TENSOR.ATTN_KV_B,
MODEL_TENSOR.ATTN_K,
MODEL_TENSOR.ATTN_K_B,
MODEL_TENSOR.ATTN_V,
MODEL_TENSOR.ATTN_V_B,
MODEL_TENSOR.ATTN_Q_A_NORM,
MODEL_TENSOR.ATTN_KV_A_NORM,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.ATTN_ROT_EMBD,
MODEL_TENSOR.FFN_GATE_INP,
MODEL_TENSOR.FFN_NORM,
MODEL_TENSOR.FFN_GATE,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
MODEL_TENSOR.FFN_GATE_EXP,
MODEL_TENSOR.FFN_DOWN_EXP,
MODEL_TENSOR.FFN_UP_EXP,
MODEL_TENSOR.FFN_GATE_SHEXP,
MODEL_TENSOR.FFN_DOWN_SHEXP,
MODEL_TENSOR.FFN_UP_SHEXP,
MODEL_TENSOR.FFN_EXP_PROBS_B,
],
MODEL_ARCH.DEEPSEEK2OCR: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.OUTPUT,
MODEL_TENSOR.ROPE_FREQS,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_Q,
MODEL_TENSOR.ATTN_Q_A,
MODEL_TENSOR.ATTN_Q_B,
MODEL_TENSOR.ATTN_KV_A_MQA,
MODEL_TENSOR.ATTN_KV_B,
MODEL_TENSOR.ATTN_K,
MODEL_TENSOR.ATTN_K_B,
MODEL_TENSOR.ATTN_V,
MODEL_TENSOR.ATTN_V_B,
MODEL_TENSOR.ATTN_Q_A_NORM,
MODEL_TENSOR.ATTN_KV_A_NORM,
@@ -3139,6 +3226,10 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.ROPE_FREQS,
MODEL_TENSOR.ATTN_ROT_EMBD,
],
MODEL_ARCH.DEEPSEEK2OCR: [
MODEL_TENSOR.ROPE_FREQS,
MODEL_TENSOR.ATTN_ROT_EMBD,
],
MODEL_ARCH.CHATGLM: [
MODEL_TENSOR.ROPE_FREQS,
],
@@ -3327,6 +3418,7 @@ class VisionProjectorType:
LIGHTONOCR = "lightonocr"
COGVLM = "cogvlm"
JANUS_PRO = "janus_pro"
DEEPSEEKOCR = "deepseekocr"


# Items here are (block size, type size)
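The new SAM tensor enums reuse the existing {bid} naming template and are listed under the MMPROJ tensor set rather than under DEEPSEEK2OCR, since they belong to the vision projector file. A quick illustrative check against gguf-py (not part of the change set):

```python
# Illustrative check of the constants added above.
from gguf.constants import MODEL_ARCH, MODEL_TENSOR, MODEL_TENSORS, TENSOR_NAMES

print(TENSOR_NAMES[MODEL_TENSOR.V_SAM_ATTN_QKV].format(bid=0))  # v.sam.blk.0.attn.qkv
print(TENSOR_NAMES[MODEL_TENSOR.V_ENC_EMBD_IMGNL])              # v.image_newline

# SAM tensors are part of the MMPROJ tensor list, so the mmproj converter
# (which builds its map via gguf.get_tensor_name_map) can resolve them.
assert MODEL_TENSOR.V_SAM_NECK in MODEL_TENSORS[MODEL_ARCH.MMPROJ]
```

The HF-to-GGUF name mapping entries themselves live in gguf-py's tensor_mapping.py, which is outside this excerpt.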
6 changes: 6 additions & 0 deletions gguf-py/gguf/gguf_writer.py
@@ -1127,6 +1127,12 @@ def add_vision_n_wa_pattern(self, value: int) -> None:
def add_vision_is_deepstack_layers(self, layers: Sequence[bool]) -> None:
self.add_array(Keys.ClipVision.IS_DEEPSTACK_LAYERS, layers)

def add_vision_sam_layers_count(self, value: int) -> None:
self.add_uint32(Keys.ClipVision.SAM.BLOCK_COUNT, value)

def add_vision_sam_embedding_length(self, value: int) -> None:
self.add_uint32(Keys.ClipVision.SAM.EMBEDDING_LENGTH, value)

# audio models

def add_audio_projection_dim(self, value: int) -> None:
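For reference, a minimal usage sketch of the two new writer helpers. The file name, arch string, and values are illustrative assumptions; in practice convert_hf_to_gguf.py emits these keys as part of the mmproj conversion.

```python
# Illustrative only: writes the two clip.vision.sam.* keys declared in constants.py.
import gguf

writer = gguf.GGUFWriter("mmproj-deepseek-ocr-example.gguf", arch="clip")
writer.add_vision_sam_layers_count(12)        # clip.vision.sam.block_count (example: SAM ViT-B has 12 blocks)
writer.add_vision_sam_embedding_length(768)   # clip.vision.sam.embedding_length (example width)

writer.write_header_to_file()
writer.write_kv_data_to_file()
writer.write_tensors_to_file()
writer.close()
```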