ggml-org · Chrisischris · Dec 20, 2025 · Dec 21, 2025 · CISC · Dec 21, 2025
@@ -181,10 +181,11 @@ jobs:
             os: ubuntu-22.04
           - build: 'arm64'
             os: ubuntu-22.04-arm
-          - build: 's390x'
-            os: ubuntu-24.04-s390x
-          - build: 'ppc64le'
-            os: ubuntu-24.04-ppc64le
+          # Disabled - requires special GitHub runners not available on forks
+          # - build: 's390x'
+          #   os: ubuntu-24.04-s390x
+          # - build: 'ppc64le'
+          #   os: ubuntu-24.04-ppc64le
 
     runs-on: ${{ matrix.os }}
 

@@ -134,8 +134,9 @@ jobs:
         include:
           - build: 'x64'
             os: ubuntu-22.04
-          - build: 's390x'
-            os: ubuntu-24.04-s390x
+          # Disabled - requires special GitHub runners not available on forks
+          # - build: 's390x'
+          #   os: ubuntu-24.04-s390x
           # GGML_BACKEND_DL and GGML_CPU_ALL_VARIANTS are not currently supported on arm
           # - build: 'arm64'
           #   os: ubuntu-22.04-arm

@@ -2811,6 +2811,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.models_autoload = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_AUTOLOAD"));
+    add_opt(common_arg(
+        {"--models-allow-extra-args"},
+        {"--no-models-allow-extra-args"},
+        string_format("for router server, whether to allow extra_args in /models/load endpoint (default: %s)", params.models_allow_extra_args ? "enabled" : "disabled"),
+        [](common_params & params, bool value) {
+            params.models_allow_extra_args = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_ALLOW_EXTRA_ARGS"));
     add_opt(common_arg(
         {"--jinja"},
         {"--no-jinja"},

@@ -498,6 +498,7 @@ struct common_params {
     std::string models_preset = ""; // directory containing model presets for the router server
     int models_max = 4;             // maximum number of models to load simultaneously
     bool models_autoload = true;    // automatically load models when requested via the router server
+    bool models_allow_extra_args = false; // allow extra_args in /models/load endpoint
 
     bool log_json = false;
 

@@ -379,7 +379,7 @@ void server_models::unload_lru() {
     }
 }
 
-void server_models::load(const std::string & name) {
+void server_models::load(const std::string & name, const std::vector<std::string> & extra_args) {
     if (!has_model(name)) {
         throw std::runtime_error("model name=" + name + " is not found");
     }
@@ -411,6 +411,10 @@ void server_models::load(const std::string & name) {
         inst.meta.update_args(ctx_preset, bin_path); // render args
 
         std::vector<std::string> child_args = inst.meta.args; // copy
+        // append extra_args if provided; --models-allow-extra-args is NOT checked here, the caller must enforce it
+        if (!extra_args.empty()) {
+            child_args.insert(child_args.end(), extra_args.begin(), extra_args.end());
+        }
         std::vector<std::string> child_env  = base_env; // copy
         child_env.push_back("LLAMA_SERVER_ROUTER_PORT=" + std::to_string(base_params.port));
 
@@ -743,7 +747,34 @@ void server_models_routes::init_routes() {
             res_err(res, format_error_response("model is already loaded", ERROR_TYPE_INVALID_REQUEST));
             return res;
         }
-        models.load(name);
+        // parse extra_args if provided (an explicit null is treated the same as an absent field)
+        std::vector<std::string> extra_args;
+        if (body.contains("extra_args") && !body["extra_args"].is_null()) {
+            // reject any use of the field while the feature is disabled, regardless of the value's type
+            if (!params.models_allow_extra_args) {
+                res_err(res, format_error_response(
+                    "extra_args not allowed; start server with --models-allow-extra-args to enable",
+                    ERROR_TYPE_INVALID_REQUEST));
+                return res;
+            }
+            const auto & args = body["extra_args"];
+            // a malformed value must fail loudly: silently dropping entries would start the child with other args than requested
+            bool valid = args.is_array();
+            if (valid) {
+                for (const auto & arg : args) {
+                    if (!arg.is_string()) {
+                        valid = false;
+                        break;
+                    }
+                    extra_args.push_back(arg.get<std::string>());
+                }
+            }
+            if (!valid) {
+                res_err(res, format_error_response("extra_args must be an array of strings", ERROR_TYPE_INVALID_REQUEST));
+                return res;
+            }
+        }
+        models.load(name, extra_args);
         res_ok(res, {{"success", true}});
         return res;
     };

@@ -114,7 +114,7 @@ struct server_models {
 
     // load and unload model instances
     // these functions are thread-safe
-    void load(const std::string & name);
+    void load(const std::string & name, const std::vector<std::string> & extra_args = {});
     void unload(const std::string & name);
     void unload_all();