@@ -835,6 +835,19 @@ bool common_arg_utils::is_autoy(const std::string & value) {
835835}
836836
837837common_params_context common_params_parser_init (common_params & params, llama_example ex, void (*print_usage)(int , char **)) {
838+ // per-example default params
839+ // we define here to make sure it's included in llama-gen-docs
840+ if (ex == LLAMA_EXAMPLE_COMPLETION) {
841+ params.use_jinja = false ; // disable jinja by default
842+
843+ } else if (ex == LLAMA_EXAMPLE_MTMD) {
844+ params.use_jinja = false ; // disable jinja by default
845+ params.sampling .temp = 0.2 ; // lower temp by default for better quality
846+
847+ } else if (ex == LLAMA_EXAMPLE_SERVER) {
848+ params.n_parallel = -1 ; // auto by default
849+ }
850+
838851 params.use_color = tty_can_use_colors ();
839852
840853 // load dynamic backends
@@ -1107,28 +1120,27 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
11071120 ).set_env (" LLAMA_ARG_SWA_FULL" ));
11081121 add_opt (common_arg (
11091122 {" --ctx-checkpoints" , " --swa-checkpoints" }, " N" ,
1110- string_format (" max number of context checkpoints to create per slot (default: %d)\n "
1123+ string_format (" max number of context checkpoints to create per slot (default: %d)"
11111124 " [(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)" , params.n_ctx_checkpoints ),
11121125 [](common_params & params, int value) {
11131126 params.n_ctx_checkpoints = value;
11141127 }
11151128 ).set_env (" LLAMA_ARG_CTX_CHECKPOINTS" ).set_examples ({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
11161129 add_opt (common_arg (
11171130 {" --cache-ram" , " -cram" }, " N" ,
1118- string_format (" set the maximum cache size in MiB (default: %d, -1 - no limit, 0 - disable)\n "
1131+ string_format (" set the maximum cache size in MiB (default: %d, -1 - no limit, 0 - disable)"
11191132 " [(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)" , params.cache_ram_mib ),
11201133 [](common_params & params, int value) {
11211134 params.cache_ram_mib = value;
11221135 }
11231136 ).set_env (" LLAMA_ARG_CACHE_RAM" ).set_examples ({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
11241137 add_opt (common_arg (
11251138 {" --kv-unified" , " -kvu" },
1126- string_format (" use single unified KV buffer for the KV cache of all sequences (default: %s)\n "
1127- " [(more info)](https://github.com/ggml-org/llama.cpp/pull/14363)" , params.kv_unified ? " true" : " false" ),
1139+ " use single unified KV buffer shared across all sequences (default: enabled if number of slots is auto)" ,
11281140 [](common_params & params) {
11291141 params.kv_unified = true ;
11301142 }
1131- ).set_env (" LLAMA_ARG_KV_UNIFIED" ));
1143+ ).set_env (" LLAMA_ARG_KV_UNIFIED" ). set_examples ({LLAMA_EXAMPLE_SERVER}) );
11321144 add_opt (common_arg (
11331145 {" --context-shift" },
11341146 {" --no-context-shift" },
@@ -1888,13 +1900,27 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
18881900 LOG_WRN (" DEPRECATED: --defrag-thold is deprecated and no longer necessary to specify\n " );
18891901 }
18901902 ).set_env (" LLAMA_ARG_DEFRAG_THOLD" ));
1891- add_opt (common_arg (
1892- {" -np" , " --parallel" }, " N" ,
1893- string_format (" number of parallel sequences to decode (default: %d)" , params.n_parallel ),
1894- [](common_params & params, int value) {
1895- params.n_parallel = value;
1896- }
1897- ).set_env (" LLAMA_ARG_N_PARALLEL" ));
1903+ if (ex == LLAMA_EXAMPLE_SERVER) {
1904+ // this is to make sure this option appears in the server-specific section of the help message
1905+ add_opt (common_arg (
1906+ {" -np" , " --parallel" }, " N" ,
1907+ string_format (" number of server slots (default: %d, -1 = auto)" , params.n_parallel ),
1908+ [](common_params & params, int value) {
1909+ if (value == 0 ) {
1910+ throw std::invalid_argument (" error: invalid value for n_parallel\n " );
1911+ }
1912+ params.n_parallel = value;
1913+ }
1914+ ).set_env (" LLAMA_ARG_N_PARALLEL" ).set_examples ({LLAMA_EXAMPLE_SERVER}));
1915+ } else {
1916+ add_opt (common_arg (
1917+ {" -np" , " --parallel" }, " N" ,
1918+ string_format (" number of parallel sequences to decode (default: %d)" , params.n_parallel ),
1919+ [](common_params & params, int value) {
1920+ params.n_parallel = value;
1921+ }
1922+ ).set_env (" LLAMA_ARG_N_PARALLEL" ));
1923+ }
18981924 add_opt (common_arg (
18991925 {" -ns" , " --sequences" }, " N" ,
19001926 string_format (" number of sequences to decode (default: %d)" , params.n_sequences ),
0 commit comments