Commit 00d2357

Merge remote-tracking branch 'sfallah/master' into sf/deepseek-ocr

# Conflicts:
#   src/llama-arch.cpp
2 parents 512b2c8 + ec98e20 commit 00d2357

Showing 13 changed files with 2,083 additions and 2,630 deletions.

common/arg.cpp

Lines changed: 38 additions & 12 deletions
@@ -835,6 +835,19 @@ bool common_arg_utils::is_autoy(const std::string & value) {
 }
 
 common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
+    // per-example default params
+    // we define here to make sure it's included in llama-gen-docs
+    if (ex == LLAMA_EXAMPLE_COMPLETION) {
+        params.use_jinja = false; // disable jinja by default
+
+    } else if (ex == LLAMA_EXAMPLE_MTMD) {
+        params.use_jinja = false; // disable jinja by default
+        params.sampling.temp = 0.2; // lower temp by default for better quality
+
+    } else if (ex == LLAMA_EXAMPLE_SERVER) {
+        params.n_parallel = -1; // auto by default
+    }
+
     params.use_color = tty_can_use_colors();
 
     // load dynamic backends
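
For context, these defaults are applied at the top of the parser init, before any command-line flags or environment variables are read, so each example binary starts from its own baseline and explicit settings can still override it. A minimal sketch of the effect, assuming the parser context is obtained directly (passing nullptr for print_usage and calling the init function in isolation are simplifications for illustration, not how the real tools wire this up):

```cpp
// Sketch only: shows the per-example defaults applied in the hunk above.
// Calling common_params_parser_init directly with a nullptr print_usage
// callback is a simplification for illustration purposes.
common_params params;
common_params_context ctx_arg = common_params_parser_init(params, LLAMA_EXAMPLE_MTMD, nullptr);

// Before any arguments are parsed:
//   params.use_jinja     == false  (jinja disabled by default for MTMD)
//   params.sampling.temp == 0.2    (lower default temperature)
// For LLAMA_EXAMPLE_SERVER the baseline is params.n_parallel == -1 (auto).
```
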
@@ -1107,28 +1120,27 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_env("LLAMA_ARG_SWA_FULL"));
     add_opt(common_arg(
         {"--ctx-checkpoints", "--swa-checkpoints"}, "N",
-        string_format("max number of context checkpoints to create per slot (default: %d)\n"
+        string_format("max number of context checkpoints to create per slot (default: %d)"
            "[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)", params.n_ctx_checkpoints),
         [](common_params & params, int value) {
             params.n_ctx_checkpoints = value;
         }
     ).set_env("LLAMA_ARG_CTX_CHECKPOINTS").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
     add_opt(common_arg(
         {"--cache-ram", "-cram"}, "N",
-        string_format("set the maximum cache size in MiB (default: %d, -1 - no limit, 0 - disable)\n"
+        string_format("set the maximum cache size in MiB (default: %d, -1 - no limit, 0 - disable)"
            "[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)", params.cache_ram_mib),
         [](common_params & params, int value) {
             params.cache_ram_mib = value;
         }
     ).set_env("LLAMA_ARG_CACHE_RAM").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
     add_opt(common_arg(
         {"--kv-unified", "-kvu"},
-        string_format("use single unified KV buffer for the KV cache of all sequences (default: %s)\n"
-            "[(more info)](https://github.com/ggml-org/llama.cpp/pull/14363)", params.kv_unified ? "true" : "false"),
+        "use single unified KV buffer shared across all sequences (default: enabled if number of slots is auto)",
         [](common_params & params) {
             params.kv_unified = true;
         }
-    ).set_env("LLAMA_ARG_KV_UNIFIED"));
+    ).set_env("LLAMA_ARG_KV_UNIFIED").set_examples({LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"--context-shift"},
         {"--no-context-shift"},
@@ -1888,13 +1900,27 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             LOG_WRN("DEPRECATED: --defrag-thold is deprecated and no longer necessary to specify\n");
         }
     ).set_env("LLAMA_ARG_DEFRAG_THOLD"));
-    add_opt(common_arg(
-        {"-np", "--parallel"}, "N",
-        string_format("number of parallel sequences to decode (default: %d)", params.n_parallel),
-        [](common_params & params, int value) {
-            params.n_parallel = value;
-        }
-    ).set_env("LLAMA_ARG_N_PARALLEL"));
+    if (ex == LLAMA_EXAMPLE_SERVER) {
+        // this is to make sure this option appears in the server-specific section of the help message
+        add_opt(common_arg(
+            {"-np", "--parallel"}, "N",
+            string_format("number of server slots (default: %d, -1 = auto)", params.n_parallel),
+            [](common_params & params, int value) {
+                if (value == 0) {
+                    throw std::invalid_argument("error: invalid value for n_parallel\n");
+                }
+                params.n_parallel = value;
+            }
+        ).set_env("LLAMA_ARG_N_PARALLEL").set_examples({LLAMA_EXAMPLE_SERVER}));
+    } else {
+        add_opt(common_arg(
+            {"-np", "--parallel"}, "N",
+            string_format("number of parallel sequences to decode (default: %d)", params.n_parallel),
+            [](common_params & params, int value) {
+                params.n_parallel = value;
+            }
+        ).set_env("LLAMA_ARG_N_PARALLEL"));
+    }
     add_opt(common_arg(
         {"-ns", "--sequences"}, "N",
         string_format("number of sequences to decode (default: %d)", params.n_sequences),
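
The net effect of this split is that the server now interprets `-np`/`--parallel` as a slot count where `-1` means auto and `0` is rejected, while every other example keeps the old "number of parallel sequences" meaning. A small standalone program illustrating that contract (plain C++ for illustration, not llama.cpp code):

```cpp
// Standalone illustration of the new server-side -np/--parallel contract:
// -1 = auto-select the number of slots, positive = explicit slot count, 0 = invalid.
// Mirrors the lambda in the hunk above, but is not part of llama.cpp.
#include <cstdio>
#include <stdexcept>

static int parse_n_parallel(int value) {
    if (value == 0) {
        throw std::invalid_argument("error: invalid value for n_parallel\n");
    }
    return value; // -1 (auto) and positive values pass through unchanged
}

int main() {
    std::printf("%d\n", parse_n_parallel(-1)); // auto
    std::printf("%d\n", parse_n_parallel(4));  // 4 server slots
    // parse_n_parallel(0) would throw std::invalid_argument
    return 0;
}
```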

docs/development/HOWTO-add-model.md

Lines changed: 1 addition & 1 deletion
@@ -97,7 +97,7 @@ The model params and tensors layout must be defined in `llama.cpp` source files:
 1. Define a new `llm_arch` enum value in `src/llama-arch.h`.
 2. In `src/llama-arch.cpp`:
     - Add the architecture name to the `LLM_ARCH_NAMES` map.
-    - Add the tensor mappings to the `LLM_TENSOR_NAMES` map.
+    - Add the list of model tensors to `llm_get_tensor_names` (you may also need to update `LLM_TENSOR_NAMES`)
 3. Add any non-standard metadata loading in the `llama_model_loader` constructor in `src/llama-model-loader.cpp`.
 4. If the model has a RoPE operation, add a case for the architecture in `llama_model_rope_type` function in `src/llama-model.cpp`.
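
As a rough illustration of step 2, the architecture name entry in `src/llama-arch.cpp` looks like the sketch below; `LLM_ARCH_NEWMODEL` and `"newmodel"` are hypothetical placeholders, and the exact container type and the shape of `llm_get_tensor_names` should be copied from the surrounding code rather than from this snippet:

```cpp
// Hypothetical entry for a new architecture named "newmodel".
// Mirror an existing entry in src/llama-arch.cpp; the map type shown here
// is an assumption and may differ from the actual declaration.
static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
    // ... existing architectures ...
    { LLM_ARCH_NEWMODEL, "newmodel" },
};

// The per-architecture tensor list (token embedding, output norm, attention
// and FFN tensors, ...) is then added to llm_get_tensor_names, typically by
// copying the block of the closest existing architecture and adjusting it.
```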

examples/gen-docs/gen-docs.cpp

Lines changed: 5 additions & 4 deletions
@@ -48,7 +48,7 @@ static void write_table(std::ofstream & file, std::vector<common_arg *> & opts)
     }
 }
 
-static void export_md(std::string fname, llama_example ex) {
+static void export_md(std::string fname, llama_example ex, std::string name) {
     std::ofstream file(fname, std::ofstream::out | std::ofstream::trunc);
 
     common_params params;
@@ -72,13 +72,14 @@ static void export_md(std::string fname, llama_example ex) {
     write_table(file, common_options);
     file << "\n\n**Sampling params**\n\n";
     write_table(file, sparam_options);
-    file << "\n\n**Example-specific params**\n\n";
+    file << "\n\n**" << name << "-specific params**\n\n";
     write_table(file, specific_options);
 }
 
 int main(int, char **) {
-    export_md("autogen-main.md", LLAMA_EXAMPLE_COMPLETION);
-    export_md("autogen-server.md", LLAMA_EXAMPLE_SERVER);
+    // TODO: add CLI
+    export_md("autogen-completion.md", LLAMA_EXAMPLE_COMPLETION, "Tool");
+    export_md("autogen-server.md", LLAMA_EXAMPLE_SERVER, "Server");
 
     return 0;
 }
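
The `// TODO: add CLI` presumably resolves to one more call with the same signature. A sketch, where the output filename and section title are assumptions but `LLAMA_EXAMPLE_CLI` is the same enum value already used elsewhere in this commit:

```cpp
// Sketch for the TODO above; "autogen-cli.md" and the "CLI" title are
// assumptions, not part of this commit.
export_md("autogen-cli.md", LLAMA_EXAMPLE_CLI, "CLI");
```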

examples/model-conversion/scripts/causal/modelcard.template

Lines changed: 1 addition & 1 deletion
@@ -7,7 +7,7 @@ base_model:
 Recommended way to run this model:
 
 ```sh
-llama-server -hf {namespace}/{model_name}-GGUF -c 0 -fa
+llama-server -hf {namespace}/{model_name}-GGUF -c 0
 ```
 
 Then, access http://localhost:8080
