Commit 00d2357

Merge remote-tracking branch 'sfallah/master' into sf/deepseek-ocr

# Conflicts:
#   src/llama-arch.cpp
2 parents 512b2c8 + ec98e20 commit 00d2357

Showing 13 changed files with 2,083 additions and 2,630 deletions.

common/arg.cpp

Lines changed: 38 additions & 12 deletions
@@ -835,6 +835,19 @@ bool common_arg_utils::is_autoy(const std::string & value) {
 }
 
 common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
+    // per-example default params
+    // we define here to make sure it's included in llama-gen-docs
+    if (ex == LLAMA_EXAMPLE_COMPLETION) {
+        params.use_jinja = false; // disable jinja by default
+
+    } else if (ex == LLAMA_EXAMPLE_MTMD) {
+        params.use_jinja = false; // disable jinja by default
+        params.sampling.temp = 0.2; // lower temp by default for better quality
+
+    } else if (ex == LLAMA_EXAMPLE_SERVER) {
+        params.n_parallel = -1; // auto by default
+    }
+
     params.use_color = tty_can_use_colors();
 
     // load dynamic backends
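
For context, these defaults are applied at the top of the parser init, before any command-line flags or environment variables are read, so each example binary starts from its own baseline and explicit settings can still override it. A minimal sketch of the effect, assuming the parser context is obtained directly (passing nullptr for print_usage and calling the init function in isolation are simplifications for illustration, not how the real tools wire this up):

```cpp
// Sketch only: shows the per-example defaults applied in the hunk above.
// Calling common_params_parser_init directly with a nullptr print_usage
// callback is a simplification for illustration purposes.
common_params params;
common_params_context ctx_arg = common_params_parser_init(params, LLAMA_EXAMPLE_MTMD, nullptr);

// Before any arguments are parsed:
//   params.use_jinja     == false  (jinja disabled by default for MTMD)
//   params.sampling.temp == 0.2    (lower default temperature)
// For LLAMA_EXAMPLE_SERVER the baseline is params.n_parallel == -1 (auto).
```
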
@@ -1107,28 +1120,27 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_env("LLAMA_ARG_SWA_FULL"));
     add_opt(common_arg(
         {"--ctx-checkpoints", "--swa-checkpoints"}, "N",
-        string_format("max number of context checkpoints to create per slot (default: %d)\n"
+        string_format("max number of context checkpoints to create per slot (default: %d)"
            "[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)", params.n_ctx_checkpoints),
         [](common_params & params, int value) {
             params.n_ctx_checkpoints = value;
         }
     ).set_env("LLAMA_ARG_CTX_CHECKPOINTS").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
     add_opt(common_arg(
         {"--cache-ram", "-cram"}, "N",
-        string_format("set the maximum cache size in MiB (default: %d, -1 - no limit, 0 - disable)\n"
+        string_format("set the maximum cache size in MiB (default: %d, -1 - no limit, 0 - disable)"
            "[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)", params.cache_ram_mib),
         [](common_params & params, int value) {
             params.cache_ram_mib = value;
         }
     ).set_env("LLAMA_ARG_CACHE_RAM").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
     add_opt(common_arg(
         {"--kv-unified", "-kvu"},
-        string_format("use single unified KV buffer for the KV cache of all sequences (default: %s)\n"
-            "[(more info)](https://github.com/ggml-org/llama.cpp/pull/14363)", params.kv_unified ? "true" : "false"),
+        "use single unified KV buffer shared across all sequences (default: enabled if number of slots is auto)",
         [](common_params & params) {
             params.kv_unified = true;
         }
-    ).set_env("LLAMA_ARG_KV_UNIFIED"));
+    ).set_env("LLAMA_ARG_KV_UNIFIED").set_examples({LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"--context-shift"},
         {"--no-context-shift"},
@@ -1888,13 +1900,27 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             LOG_WRN("DEPRECATED: --defrag-thold is deprecated and no longer necessary to specify\n");
         }
     ).set_env("LLAMA_ARG_DEFRAG_THOLD"));
-    add_opt(common_arg(
-        {"-np", "--parallel"}, "N",
-        string_format("number of parallel sequences to decode (default: %d)", params.n_parallel),
-        [](common_params & params, int value) {
-            params.n_parallel = value;
-        }
-    ).set_env("LLAMA_ARG_N_PARALLEL"));
+    if (ex == LLAMA_EXAMPLE_SERVER) {
+        // this is to make sure this option appears in the server-specific section of the help message
+        add_opt(common_arg(
+            {"-np", "--parallel"}, "N",
+            string_format("number of server slots (default: %d, -1 = auto)", params.n_parallel),
+            [](common_params & params, int value) {
+                if (value == 0) {
+                    throw std::invalid_argument("error: invalid value for n_parallel\n");
+                }
+                params.n_parallel = value;
+            }
+        ).set_env("LLAMA_ARG_N_PARALLEL").set_examples({LLAMA_EXAMPLE_SERVER}));
+    } else {
+        add_opt(common_arg(
+            {"-np", "--parallel"}, "N",
+            string_format("number of parallel sequences to decode (default: %d)", params.n_parallel),
+            [](common_params & params, int value) {
+                params.n_parallel = value;
+            }
+        ).set_env("LLAMA_ARG_N_PARALLEL"));
+    }
     add_opt(common_arg(
         {"-ns", "--sequences"}, "N",
         string_format("number of sequences to decode (default: %d)", params.n_sequences),
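
The net effect of this split is that the server now interprets `-np`/`--parallel` as a slot count where `-1` means auto and `0` is rejected, while every other example keeps the old "number of parallel sequences" meaning. A small standalone program illustrating that contract (plain C++ for illustration, not llama.cpp code):

```cpp
// Standalone illustration of the new server-side -np/--parallel contract:
// -1 = auto-select the number of slots, positive = explicit slot count, 0 = invalid.
// Mirrors the lambda in the hunk above, but is not part of llama.cpp.
#include <cstdio>
#include <stdexcept>

static int parse_n_parallel(int value) {
    if (value == 0) {
        throw std::invalid_argument("error: invalid value for n_parallel\n");
    }
    return value; // -1 (auto) and positive values pass through unchanged
}

int main() {
    std::printf("%d\n", parse_n_parallel(-1)); // auto
    std::printf("%d\n", parse_n_parallel(4));  // 4 server slots
    // parse_n_parallel(0) would throw std::invalid_argument
    return 0;
}
```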

docs/development/HOWTO-add-model.md

Lines changed: 1 addition & 1 deletion
@@ -97,7 +97,7 @@ The model params and tensors layout must be defined in `llama.cpp` source files:
 1. Define a new `llm_arch` enum value in `src/llama-arch.h`.
 2. In `src/llama-arch.cpp`:
     - Add the architecture name to the `LLM_ARCH_NAMES` map.
-    - Add the tensor mappings to the `LLM_TENSOR_NAMES` map.
+    - Add the list of model tensors to `llm_get_tensor_names` (you may also need to update `LLM_TENSOR_NAMES`)
 3. Add any non-standard metadata loading in the `llama_model_loader` constructor in `src/llama-model-loader.cpp`.
 4. If the model has a RoPE operation, add a case for the architecture in `llama_model_rope_type` function in `src/llama-model.cpp`.
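
As a rough illustration of step 2, the architecture name entry in `src/llama-arch.cpp` looks like the sketch below; `LLM_ARCH_NEWMODEL` and `"newmodel"` are hypothetical placeholders, and the exact container type and the shape of `llm_get_tensor_names` should be copied from the surrounding code rather than from this snippet:

```cpp
// Hypothetical entry for a new architecture named "newmodel".
// Mirror an existing entry in src/llama-arch.cpp; the map type shown here
// is an assumption and may differ from the actual declaration.
static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
    // ... existing architectures ...
    { LLM_ARCH_NEWMODEL, "newmodel" },
};

// The per-architecture tensor list (token embedding, output norm, attention
// and FFN tensors, ...) is then added to llm_get_tensor_names, typically by
// copying the block of the closest existing architecture and adjusting it.
```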

examples/gen-docs/gen-docs.cpp

Lines changed: 5 additions & 4 deletions
@@ -48,7 +48,7 @@ static void write_table(std::ofstream & file, std::vector<common_arg *> & opts)
     }
 }
 
-static void export_md(std::string fname, llama_example ex) {
+static void export_md(std::string fname, llama_example ex, std::string name) {
     std::ofstream file(fname, std::ofstream::out | std::ofstream::trunc);
 
     common_params params;
@@ -72,13 +72,14 @@ static void export_md(std::string fname, llama_example ex) {
     write_table(file, common_options);
     file << "\n\n**Sampling params**\n\n";
     write_table(file, sparam_options);
-    file << "\n\n**Example-specific params**\n\n";
+    file << "\n\n**" << name << "-specific params**\n\n";
     write_table(file, specific_options);
 }
 
 int main(int, char **) {
-    export_md("autogen-main.md", LLAMA_EXAMPLE_COMPLETION);
-    export_md("autogen-server.md", LLAMA_EXAMPLE_SERVER);
+    // TODO: add CLI
+    export_md("autogen-completion.md", LLAMA_EXAMPLE_COMPLETION, "Tool");
+    export_md("autogen-server.md", LLAMA_EXAMPLE_SERVER, "Server");
 
     return 0;
 }
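
The `// TODO: add CLI` presumably resolves to one more call with the same signature. A sketch, where the output filename and section title are assumptions but `LLAMA_EXAMPLE_CLI` is the same enum value already used elsewhere in this commit:

```cpp
// Sketch for the TODO above; "autogen-cli.md" and the "CLI" title are
// assumptions, not part of this commit.
export_md("autogen-cli.md", LLAMA_EXAMPLE_CLI, "CLI");
```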

examples/model-conversion/scripts/causal/modelcard.template

Lines changed: 1 addition & 1 deletion
@@ -7,7 +7,7 @@ base_model:
 Recommended way to run this model:
 
 ```sh
-llama-server -hf {namespace}/{model_name}-GGUF -c 0 -fa
+llama-server -hf {namespace}/{model_name}-GGUF -c 0
 ```
 
 Then, access http://localhost:8080
