diff --git a/modelopt/onnx/quantization/autotune/__main__.py b/modelopt/onnx/quantization/autotune/__main__.py
new file mode 100644
index 000000000..877d1a017
--- /dev/null
+++ b/modelopt/onnx/quantization/autotune/__main__.py
@@ -0,0 +1,303 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Command-line interface for ONNX Q/DQ autotuning."""
+
+import argparse
+import sys
+import tempfile
+from pathlib import Path
+
+from modelopt.onnx.logging_config import logger
+from modelopt.onnx.quantization.autotune.workflows import (
+    init_benchmark_instance,
+    region_pattern_autotuning_workflow,
+)
+
+DEFAULT_OUTPUT_DIR = "./autotuner_output"  # default for --output_dir
+DEFAULT_NUM_SCHEMES = 30  # default for --schemes_per_region
+DEFAULT_QUANT_TYPE = "int8"  # default for --quant_type
+DEFAULT_DQ_DTYPE = "float32"  # default for --default_dq_dtype
+DEFAULT_TIMING_CACHE = str(Path(tempfile.gettempdir()) / "trtexec_timing.cache")  # --timing_cache
+DEFAULT_WARMUP_RUNS = 5  # default for --warmup_runs
+DEFAULT_TIMING_RUNS = 20  # default for --timing_runs
+
+
+def validate_file_path(path: str | None, description: str) -> Path | None:
+    """Resolve an optional CLI path argument, aborting if it is missing on disk.
+
+    Args:
+        path: Path string supplied by the user, or None when the option was omitted
+        description: Human-readable label used as the prefix of the error message
+
+    Returns:
+        ``Path(path)`` when the path exists; None when ``path`` is None
+
+    Raises:
+        SystemExit: With status 1 when ``path`` is given but does not exist
+    """
+    if path is not None:
+        candidate = Path(path)
+        if not candidate.exists():
+            # Report through the shared logger, then stop with a failure status.
+            logger.error(f"{description} not found: {candidate}")
+            sys.exit(1)
+        return candidate
+
+    return None
+
+
+def log_benchmark_config(args):
+    """Emit the TensorRT benchmark settings to the log before benchmarking starts.
+
+    Args:
+        args: Parsed command-line namespace. The timing cache and run counts are always
+            logged; plugin libraries and extra trtexec arguments only when they are set.
+    """
+    logger.info("Initializing TensorRT benchmark")
+    logger.info(f"  Timing cache: {args.timing_cache}")
+    logger.info(f"  Warmup runs: {args.warmup_runs}")
+    logger.info(f"  Timing runs: {args.timing_runs}")
+    plugins = args.plugin_libraries
+    if plugins:
+        logger.info(f"  Plugin libraries: {', '.join(plugins)}")
+    extra_args = getattr(args, "trtexec_benchmark_args", None)
+    if extra_args:
+        logger.info(f"  Trtexec args: {extra_args}")
+
+
+def run_autotune() -> int:
+    """Run the Q/DQ autotuning CLI end to end and report a process exit status.
+
+    The command line is parsed first. The model path and the optional QDQ baseline
+    path are validated (a missing file terminates the process with status 1), the
+    benchmark instance is created, and the region-pattern autotuning workflow is
+    run with the optional node filter patterns read from ``--node_filter_list``.
+
+    Returns:
+        Exit code:
+        - 0: Workflow finished without raising
+        - 1: Benchmark initialization failed or the workflow raised an exception
+        - 130: Interrupted by user (Ctrl+C)
+    """
+    args = _get_autotune_parser().parse_args()
+    model_path = validate_file_path(args.onnx_path, "Model file")
+    validate_file_path(args.qdq_baseline, "QDQ baseline model")
+    output_dir = Path(args.output_dir)
+
+    log_benchmark_config(args)
+    benchmark_instance = init_benchmark_instance(
+        use_trtexec=args.use_trtexec,
+        plugin_libraries=args.plugin_libraries,
+        timing_cache_file=args.timing_cache,
+        warmup_runs=args.warmup_runs,
+        timing_runs=args.timing_runs,
+        trtexec_args=getattr(args, "trtexec_benchmark_args", None),
+    )
+
+    if benchmark_instance is None:
+        logger.error("Failed to initialize TensorRT benchmark")
+        return 1
+
+    try:
+        patterns = None
+        if args.node_filter_list:
+            filter_file = validate_file_path(args.node_filter_list, "Node filter list file")
+            if filter_file:
+                # Keep stripped lines that are neither empty nor '#' comments.
+                with open(filter_file) as handle:
+                    stripped_lines = (raw.strip() for raw in handle)
+                    patterns = [s for s in stripped_lines if s and not s.startswith("#")]
+                logger.info(f"Loaded {len(patterns)} filter patterns from {filter_file}")
+
+        region_pattern_autotuning_workflow(
+            model_path=str(model_path),
+            output_dir=output_dir,
+            num_schemes_per_region=args.num_schemes,
+            pattern_cache_file=args.pattern_cache_file,
+            state_file=args.state_file,
+            quant_type=args.quant_type,
+            default_dq_dtype=args.default_dq_dtype,
+            qdq_baseline_model=args.qdq_baseline,
+            node_filter_list=patterns,
+            verbose=args.verbose,
+        )
+
+        # Reached only when the workflow returned normally.
+        banner = "=" * 70
+        logger.info("\n" + banner)
+        logger.info("✓ Autotuning completed successfully!")
+        logger.info(f"✓ Results: {output_dir}")
+        logger.info(banner)
+        return 0
+
+    except KeyboardInterrupt:
+        logger.warning("\nInterrupted by user")
+        state_file = args.state_file or output_dir / "autotuner_state.yaml"
+        logger.info(f"Progress saved to: {state_file}")
+        # Exit status documented above for a Ctrl+C interruption.
+        return 130
+
+    except Exception as e:
+        # The traceback is attached only when --verbose was given.
+        logger.error(f"\nAutotuning failed: {e}", exc_info=args.verbose)
+        return 1
+
+
+def _get_autotune_parser() -> argparse.ArgumentParser:
+    """Build the argparse parser that defines every option of the autotune CLI."""
+    cli = argparse.ArgumentParser(
+        description="ONNX Q/DQ Autotuning with TensorRT",
+        prog="modelopt.onnx.quantization.autotune",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+  # Basic usage
+  python -m modelopt.onnx.quantization.autotune --onnx_path model.onnx
+
+  # Import patterns from QDQ baseline model
+  python -m modelopt.onnx.quantization.autotune \\
+      --onnx_path model.onnx --qdq_baseline baseline.onnx
+
+  # Use pattern cache for warm-start
+  python -m modelopt.onnx.quantization.autotune --onnx_path model.onnx --pattern_cache cache.yaml
+
+  # Full example with all options
+  python -m modelopt.onnx.quantization.autotune \\
+      --onnx_path model.onnx --schemes_per_region 50 \\
+      --pattern_cache cache.yaml --qdq_baseline baseline.onnx \\
+      --quant_type int8 --verbose
+        """,
+    )
+
+    # --- Model and Output: input model and result location ---
+    model_io = cli.add_argument_group("Model and Output")
+    model_io.add_argument(
+        "--onnx_path", "-m", required=True, type=str, help="Path to ONNX model file"
+    )
+    model_io.add_argument(
+        "--output_dir",
+        "-o",
+        dest="output_dir",
+        type=str,
+        default=DEFAULT_OUTPUT_DIR,
+        help=f"Output directory for results (default: {DEFAULT_OUTPUT_DIR})",
+    )
+
+    # --- Autotuning Strategy: search budget, warm-start sources, resume, filtering ---
+    strategy = cli.add_argument_group("Autotuning Strategy")
+    strategy.add_argument(
+        "--schemes_per_region",
+        "-s",
+        dest="num_schemes",
+        type=int,
+        default=DEFAULT_NUM_SCHEMES,
+        help=f"Number of schemes to test per region (default: {DEFAULT_NUM_SCHEMES})",
+    )
+    strategy.add_argument(
+        "--pattern_cache",
+        dest="pattern_cache_file",
+        type=str,
+        default=None,
+        help="Path to pattern cache YAML for warm-start (optional)",
+    )
+    strategy.add_argument(
+        "--qdq_baseline",
+        default=None,
+        type=str,
+        help="Path to QDQ baseline ONNX model to import quantization patterns (optional)",
+    )
+    strategy.add_argument(
+        "--state_file",
+        default=None,
+        type=str,
+        help="State file path for resume capability (default: <output_dir>/autotuner_state.yaml)",
+    )
+    strategy.add_argument(
+        "--node_filter_list",
+        default=None,
+        type=str,
+        help="Path to a file containing wildcard patterns to filter ONNX nodes (one pattern per line). "
+        "Regions without any matching nodes are skipped during autotuning.",
+    )
+
+    # --- Quantization: data types used for inserted Q/DQ nodes ---
+    quantization = cli.add_argument_group("Quantization")
+    quantization.add_argument(
+        "--quant_type",
+        choices=["int8", "fp8"],
+        default=DEFAULT_QUANT_TYPE,
+        type=str,
+        help=f"Quantization data type (default: {DEFAULT_QUANT_TYPE})",
+    )
+    quantization.add_argument(
+        "--default_dq_dtype",
+        choices=["float16", "float32", "bfloat16"],
+        default=DEFAULT_DQ_DTYPE,
+        type=str,
+        help="Default DQ output dtype if cannot be deduced (optional)",
+    )
+
+    # --- TensorRT Benchmark: backend selection and measurement settings ---
+    benchmark = cli.add_argument_group("TensorRT Benchmark")
+    benchmark.add_argument(
+        "--use_trtexec",
+        action="store_true",
+        default=False,
+        help="Use trtexec for benchmarking (default: False)",
+    )
+    benchmark.add_argument(
+        "--timing_cache",
+        default=DEFAULT_TIMING_CACHE,
+        type=str,
+        help=f"TensorRT timing cache file (default: {DEFAULT_TIMING_CACHE})",
+    )
+    benchmark.add_argument(
+        "--warmup_runs",
+        default=DEFAULT_WARMUP_RUNS,
+        type=int,
+        help=f"Number of warmup runs (default: {DEFAULT_WARMUP_RUNS})",
+    )
+    benchmark.add_argument(
+        "--timing_runs",
+        default=DEFAULT_TIMING_RUNS,
+        type=int,
+        help=f"Number of timing runs (default: {DEFAULT_TIMING_RUNS})",
+    )
+    benchmark.add_argument(
+        "--plugin_libraries",
+        "--plugins",
+        dest="plugin_libraries",
+        nargs="+",
+        type=str,
+        default=None,
+        help="TensorRT plugin libraries (.so files) to load (optional, space-separated)",
+    )
+    benchmark.add_argument(
+        "--trtexec_benchmark_args",
+        default=None,
+        type=str,
+        help="Additional command-line arguments to pass to trtexec as a single quoted string. "
+        "Example: --trtexec_benchmark_args '--fp16 --workspace=4096 --verbose'",
+    )
+
+    # --- Logging: registered on the parser itself rather than on a named group ---
+    cli.add_argument("--verbose", "-v", action="store_true", help="Enable verbose DEBUG logging")
+
+    return cli
+
+
+if __name__ == "__main__":
+    raise SystemExit(run_autotune())
diff --git a/modelopt/onnx/quantization/autotune/common.py b/modelopt/onnx/quantization/autotune/common.py
index 01fa4aaf4..d3b3de272 100644
--- a/modelopt/onnx/quantization/autotune/common.py
+++ b/modelopt/onnx/quantization/autotune/common.py
@@ -447,8 +447,11 @@ def from_dict(
     def __str__(self) -> str:
         """String representation for debugging."""
         best_latency = self.best_scheme.latency_ms if self.best_scheme else 0.0
+        pattern_str = self.pattern_signature[:40] + (
+            "..." if len(self.pattern_signature) > 40 else ""
+        )
         return (
-            f"PatternSchemes(pattern='{self.pattern_signature[:40]}...', "
+            f"PatternSchemes(pattern='{pattern_str}', "
             f"schemes={self.num_schemes}, best_latency={best_latency:.3f}ms)"
         )
 
@@ -516,19 +519,22 @@ def add_pattern_schemes(self, pattern_schemes: PatternSchemes) -> None:
             for scheme in sorted_schemes:
                 # Check if this scheme is too similar to any already-filtered scheme
                 too_similar = False
+                existing_to_remove = None  # at most one; remove after inner loop
                 for existing_scheme in filtered_schemes:
                     distance = scheme.distance(existing_scheme)
                     if distance < self.minimum_distance:
                         # Schemes are too similar, keep the better one
                         if scheme.latency_ms < existing_scheme.latency_ms:
-                            # New scheme is better, remove existing and add new
-                            filtered_schemes.remove(existing_scheme)
+                            # New scheme is better; mark existing for removal
+                            existing_to_remove = existing_scheme
                             break
                         else:
                             # Existing scheme is better, skip new one
                             too_similar = True
                             break
 
+                if existing_to_remove is not None:
+                    filtered_schemes.remove(existing_to_remove)
                 if not too_similar:
                     filtered_schemes.append(scheme)
 
diff --git a/modelopt/onnx/quantization/autotune/workflows.py b/modelopt/onnx/quantization/autotune/workflows.py
new file mode 100644
index 000000000..025d9fac4
--- /dev/null
+++ b/modelopt/onnx/quantization/autotune/workflows.py
@@ -0,0 +1,376 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""ONNX Q/DQ Autotuning Workflows.
+
+This module provides high-level workflow functions for automated Q/DQ (Quantization/Dequantization)
+optimization of ONNX models using pattern-based region analysis and TensorRT performance measurement.
+"""
+
+import fnmatch
+from pathlib import Path
+
+import onnx
+
+from modelopt.onnx.logging_config import logger
+from modelopt.onnx.quantization.autotune.autotuner import QDQAutotuner
+from modelopt.onnx.quantization.autotune.benchmark import TensorRTPyBenchmark, TrtExecBenchmark
+from modelopt.onnx.quantization.autotune.common import Config, PatternCache
+from modelopt.onnx.quantization.qdq_utils import get_quantized_tensors
+
+_benchmark_instance = None  # set by init_benchmark_instance(); read by benchmark_onnx_model()
+
+
+def benchmark_onnx_model(
+    model_path: str | bytes, log_file: str | None = None, flush_timing_cache: bool = False
+) -> float:
+    """Measure the latency of an ONNX model with the module-level benchmark instance.
+
+    The instance is whichever backend ``init_benchmark_instance`` created; it must be
+    initialized before this function is called.
+
+    Args:
+        model_path: Path to an ONNX model file, or the serialized model as bytes
+        log_file: Optional log path forwarded to the benchmark's ``run`` method
+        flush_timing_cache: Flag forwarded to the benchmark's ``run`` method
+            (default: False)
+
+    Returns:
+        Latency in milliseconds as reported by the benchmark. ``float('inf')`` is
+        returned when the benchmark is not initialized, reports ``inf`` itself, or
+        raises an exception.
+
+    Note:
+        Exceptions from the benchmark are logged and never propagated to the caller.
+    """
+    failure = float("inf")
+    runner = _benchmark_instance
+    if runner is None:
+        logger.error("Benchmark instance not initialized")
+        return failure
+
+    try:
+        latency = runner.run(model_path, log_file=log_file, flush_timing_cache=flush_timing_cache)
+        if latency != failure:
+            logger.debug(f"Benchmark result: {latency:.2f} ms")
+            return latency
+
+        # The backend signalled failure; name the model only when it is a path.
+        if isinstance(model_path, bytes):
+            logger.warning("Benchmark failed for model bytes")
+        else:
+            logger.warning(f"Benchmark failed: {model_path}")
+        return failure
+    except Exception as e:
+        logger.error(f"Benchmark error: {e}", exc_info=True)
+        return failure
+
+
+def init_benchmark_instance(
+    use_trtexec: bool = False,
+    plugin_libraries: list[str] | None = None,
+    timing_cache_file: str | None = None,
+    warmup_runs: int = 5,
+    timing_runs: int = 20,
+    trtexec_args: list[str] | str | None = None,
+):
+    """Create the module-level benchmark instance; return it, or None on failure.
+
+    Args:
+        use_trtexec: Use ``TrtExecBenchmark`` if True, otherwise ``TensorRTPyBenchmark``.
+        plugin_libraries: TensorRT plugin library paths, forwarded to the benchmark.
+        timing_cache_file: Timing cache path, forwarded to the benchmark unchanged.
+        warmup_runs: Warmup iterations before measurement (default: 5).
+        timing_runs: Timed iterations used for the measurement (default: 20).
+        trtexec_args: Extra trtexec arguments (only used if use_trtexec=True). Either a
+            list, or one string such as '--fp16 --workspace=4096 --verbose', which is
+            split shell-style into a list; construction errors are logged, not raised.
+    """
+    global _benchmark_instance
+    try:
+        if use_trtexec:
+            if isinstance(trtexec_args, str):
+                import shlex
+                # The CLI supplies a single quoted string; turn it into an argument list.
+                trtexec_args = shlex.split(trtexec_args)
+            _benchmark_instance = TrtExecBenchmark(
+                timing_cache_file=timing_cache_file,
+                warmup_runs=warmup_runs,
+                timing_runs=timing_runs,
+                plugin_libraries=plugin_libraries,
+                trtexec_args=trtexec_args,
+            )
+            logger.info("Trtexec benchmark initialized")
+        else:
+            _benchmark_instance = TensorRTPyBenchmark(
+                timing_cache_file=timing_cache_file,
+                warmup_runs=warmup_runs,
+                timing_runs=timing_runs,
+                plugin_libraries=plugin_libraries,
+            )
+            logger.info("TensorRT Python API benchmark initialized")
+        logger.debug(
+            f"Settings: warmup={warmup_runs}, timing={timing_runs}, "
+            f"cache={timing_cache_file or 'trtexec_timing.cache'}, plugin_libraries={plugin_libraries}"
+        )
+        return _benchmark_instance
+    except Exception as e:
+        logger.error(f"TensorRT initialization failed: {e}", exc_info=True)
+        return None
+
+
+def _region_matches_filter(region, graph, filter_patterns: list[str]) -> bool:
+    """Tell whether a region should be kept under the node-name filter.
+
+    Args:
+        region: Region whose ``get_all_nodes_recursive()`` yields node indices
+        graph: Graph object whose ``nodes`` sequence is indexed by those indices
+        filter_patterns: ``fnmatch``-style wildcard patterns applied to node names
+
+    Returns:
+        True when ``filter_patterns`` is empty, or when some in-range node of the region
+        has a name matching at least one pattern; False otherwise
+    """
+    if not filter_patterns:
+        return True
+
+    for index in region.get_all_nodes_recursive():
+        nodes = graph.nodes
+        if index >= len(nodes):
+            # Indices beyond the graph's node list are ignored rather than raising.
+            continue
+        name = nodes[index].name
+        if any(fnmatch.fnmatch(name, wildcard) for wildcard in filter_patterns):
+            return True
+    return False
+
+
+def region_pattern_autotuning_workflow(
+    model_path: str,
+    output_dir: Path,
+    num_schemes_per_region: int = 30,
+    pattern_cache_file: str | None = None,
+    state_file: str | None = None,
+    quant_type: str = "int8",
+    default_dq_dtype: str = "float32",
+    qdq_baseline_model: str | None = None,
+    node_filter_list: list[str] | None = None,
+    verbose: bool = False,
+) -> QDQAutotuner:
+    """Run automated Q/DQ (Quantization/Dequantization) optimization on an ONNX model.
+
+    Regions of the model are profiled one at a time: for each region the autotuner
+    generates Q/DQ insertion schemes, every scheme is exported and benchmarked, and the
+    measured latency is submitted back to the autotuner. State is saved after every
+    region so an interrupted run can resume from the state file.
+
+    **Workflow Steps:**
+    1. Create output directories, load the model and the pattern cache (if provided)
+    2. Initialize the autotuner and resume from the state file if it exists
+    3. Import Q/DQ insertion points from the baseline model if provided
+    4. Measure baseline latency without Q/DQ (skipped when restored from checkpoint)
+    5. For each region that passes the node filter and is not already profiled:
+       a. Generate up to ``num_schemes_per_region`` schemes
+       b. Benchmark each scheme (flushing the timing cache on every 10th iteration)
+          and submit its latency to the autotuner
+       c. Export the region's best model and save a checkpoint
+    6. Export and benchmark the final optimized model, then save the final state
+
+    Speedups are only reported when the baseline latency is a positive finite number;
+    a failed baseline benchmark (``inf``) is never turned into a speedup figure.
+
+    Args:
+        model_path: Path to ONNX model file to optimize
+        output_dir: Directory for output files; created if missing. Receives baseline.onnx,
+                   optimized_final.onnx, logs/, region_models/ and the default state file.
+        num_schemes_per_region: Maximum number of Q/DQ insertion schemes to test per region
+                               (default: 30)
+        pattern_cache_file: Optional path to a pattern cache file loaded with
+                           ``PatternCache.load``; a missing file only logs a warning
+        state_file: Optional path to state file for checkpoint/resume. If None, automatically
+                   uses <output_dir>/autotuner_state.yaml (default: None)
+        quant_type: Quantization data type passed to ``Config`` as ``default_quant_type``
+                   (default: "int8")
+        default_dq_dtype: DQ output dtype passed to ``Config`` as ``default_dq_dtype``
+                         (default: "float32")
+        qdq_baseline_model: Optional path to a pre-quantized ONNX model whose quantized
+                           tensors are imported as insertion points; a missing file only
+                           logs a warning (default: None)
+        node_filter_list: Optional list of wildcard patterns to filter ONNX nodes. Regions
+                         without any matching nodes are skipped during autotuning (default: None)
+        verbose: Passed to ``Config`` as ``verbose`` (default: False)
+
+    Returns:
+        QDQAutotuner instance after autotuning
+    """
+    output_dir.mkdir(parents=True, exist_ok=True)
+    logs_dir = output_dir / "logs"
+    logs_dir.mkdir(exist_ok=True)
+    models_dir = output_dir / "region_models"
+    models_dir.mkdir(exist_ok=True)
+
+    if state_file is None:
+        state_file = str(output_dir / "autotuner_state.yaml")
+    state_path = Path(state_file)
+
+    logger.info(f"Loading model: {model_path}")
+    model = onnx.load(model_path)
+
+    pattern_cache = None
+    if pattern_cache_file:
+        pattern_cache_path = Path(pattern_cache_file)
+        if pattern_cache_path.exists():
+            pattern_cache = PatternCache.load(str(pattern_cache_path))
+            logger.info(
+                f"Loaded pattern cache: {pattern_cache.num_patterns} patterns, "
+                f"{pattern_cache.total_schemes} schemes"
+            )
+        else:
+            logger.warning(f"Pattern cache not found: {pattern_cache_file}")
+
+    logger.info(
+        f"Initializing autotuner (quant_type={quant_type}, default_dq_dtype={default_dq_dtype})"
+    )
+    config = Config(
+        default_quant_type=quant_type,
+        default_dq_dtype=default_dq_dtype,
+        verbose=verbose,
+    )
+
+    autotuner = QDQAutotuner(model)
+    autotuner.initialize(config, pattern_cache)
+
+    if state_path.exists():
+        logger.info(f"Resuming from checkpoint: {state_path}")
+        autotuner.load_state(str(state_path))
+    else:
+        logger.info("Starting new autotuning session")
+
+    if qdq_baseline_model:
+        qdq_baseline_path = Path(qdq_baseline_model)
+        if qdq_baseline_path.exists():
+            logger.info(f"Importing patterns from QDQ baseline: {qdq_baseline_model}")
+            qdq_model = onnx.load(str(qdq_baseline_path))
+            quantized_tensors = get_quantized_tensors(qdq_model)
+            logger.debug(f"Found {len(quantized_tensors)} quantized tensors in baseline")
+            autotuner.import_insertion_points(quantized_tensors)
+            logger.info("Pattern import complete")
+        else:
+            logger.warning(f"QDQ baseline not found: {qdq_baseline_model}")
+
+    regions = autotuner.regions
+    logger.info(f"Ready to profile {len(regions)} regions")
+
+    if autotuner.baseline_latency_ms is None:
+        logger.info("Measuring baseline (no Q/DQ)")
+        baseline_path = output_dir / "baseline.onnx"
+        autotuner.export_onnx(str(baseline_path), insert_qdq=False)
+        baseline_log = logs_dir / "baseline.log"
+        baseline_latency = benchmark_onnx_model(str(baseline_path), str(baseline_log))
+        autotuner.submit(baseline_latency)
+        logger.info(f"Baseline: {baseline_latency:.2f} ms")
+    else:
+        baseline_latency = autotuner.baseline_latency_ms
+        logger.info(f"Using baseline from checkpoint: {baseline_latency:.2f} ms")
+
+    # benchmark_onnx_model returns inf on failure; speedups need a positive finite baseline.
+    baseline_valid = 0 < baseline_latency < float("inf")
+    logger.info(f"Starting region profiling ({num_schemes_per_region} schemes per region)")
+    iteration_count = 0
+
+    for region_idx, region in enumerate(regions):
+        logger.info(
+            f"Region {region_idx + 1}/{len(regions)} (ID={region.id}, level={region.level})"
+        )
+
+        if node_filter_list and not _region_matches_filter(
+            region, autotuner.graph, node_filter_list
+        ):
+            logger.info("  Skipping (no nodes match filter patterns)")
+            continue
+
+        commit = region_idx > 0
+        autotuner.set_profile_region(region, commit=commit)
+        if autotuner.current_profile_pattern_schemes is None:
+            logger.info("  Skipping (already profiled)")
+            continue
+
+        schemes_tested = 0
+        for scheme_num in range(num_schemes_per_region):
+            iteration_count += 1
+            scheme_idx = autotuner.generate()
+            if scheme_idx == -1:
+                logger.debug(f"  Stopping at scheme {scheme_num + 1} (no more unique schemes)")
+                break
+
+            schemes_tested += 1
+            model_bytes = autotuner.export_onnx(None, insert_qdq=True)
+            test_log = logs_dir / f"region_{region.id}_scheme_{scheme_idx}.log"
+            flush_timing_cache = (iteration_count % 10) == 0
+            latency = benchmark_onnx_model(
+                model_bytes, str(test_log), flush_timing_cache=flush_timing_cache
+            )
+            autotuner.submit(latency, success=(latency != float("inf")))
+
+        ps = autotuner.current_profile_pattern_schemes
+        if ps and ps.schemes:
+            best_scheme = ps.best_scheme
+            if best_scheme and best_scheme.latency_ms < float("inf"):
+                summary = f"best {best_scheme.latency_ms:.2f} ms"
+                if baseline_valid:
+                    speedup = baseline_latency / best_scheme.latency_ms
+                    summary += f" ({speedup:.3f}x speedup)"
+            else:
+                summary = "no valid measurements"
+            logger.info(f"  Tested {schemes_tested} schemes: {summary}")
+        else:
+            logger.info(f"  Tested {schemes_tested} schemes")
+
+        region_model_path = models_dir / f"region_{region.id}_level_{region.level}.onnx"
+        autotuner.export_onnx(str(region_model_path), insert_qdq=True, best=True)
+        logger.debug(f"  Saved best model: {region_model_path.name}")
+
+        # Save state after each region (incremental, crash recovery)
+        autotuner.save_state(str(state_path))
+        logger.debug("  Checkpoint saved")
+
+    # Commit final region
+    autotuner.set_profile_region(None, commit=True)
+
+    logger.info("Exporting final optimized model")
+    final_model_path = output_dir / "optimized_final.onnx"
+    autotuner.export_onnx(str(final_model_path), insert_qdq=True)
+    final_log = logs_dir / "final.log"
+    final_latency = benchmark_onnx_model(str(final_model_path), str(final_log))
+
+    if final_latency > 0 and final_latency != float("inf") and baseline_valid:
+        speedup = baseline_latency / final_latency
+        logger.info(
+            f"Results: {baseline_latency:.2f} ms → {final_latency:.2f} ms ({speedup:.3f}x speedup)"
+        )
+    elif final_latency > 0 and final_latency != float("inf"):
+        logger.info(f"Results: baseline invalid → {final_latency:.2f} ms (no speedup computed)")
+    else:
+        logger.info(f"Results: {baseline_latency:.2f} ms → failed (invalid measurement)")
+
+    autotuner.save_state(str(state_path))
+
+    logger.info("Autotuning complete")
+    logger.info(f"  Final model: {final_model_path}")
+    logger.info(f"  State: {state_path}")
+    logger.debug(f"  Logs: {logs_dir}")
+    logger.debug(f"  Region models: {models_dir}")
+
+    return autotuner
diff --git a/tests/_test_utils/onnx/quantization/autotune/models.py b/tests/_test_utils/onnx/quantization/autotune/models.py
index 4090cfef3..db9652e56 100644
--- a/tests/_test_utils/onnx/quantization/autotune/models.py
+++ b/tests/_test_utils/onnx/quantization/autotune/models.py
@@ -25,9 +25,9 @@
 
 def _create_simple_conv_onnx_model():
     """Build ONNX model: Input -> Conv -> Relu -> Output (minimal for autotuner tests)."""
-    input_tensor = helper.make_tensor_value_info("input", onnx.TensorProto.FLOAT, [1, 3, 224, 224])
+    input_tensor = helper.make_tensor_value_info("input", onnx.TensorProto.FLOAT, [32, 3, 224, 224])
     output_tensor = helper.make_tensor_value_info(
-        "output", onnx.TensorProto.FLOAT, [1, 64, 224, 224]
+        "output", onnx.TensorProto.FLOAT, [32, 64, 224, 224]
     )
     conv_node = helper.make_node(
         "Conv", inputs=["input", "conv_weight"], outputs=["conv_out"], name="conv"
diff --git a/tests/gpu/onnx/quantization/autotune/test_workflow.py b/tests/gpu/onnx/quantization/autotune/test_workflow.py
new file mode 100644
index 000000000..8066766a9
--- /dev/null
+++ b/tests/gpu/onnx/quantization/autotune/test_workflow.py
@@ -0,0 +1,82 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import shutil
+import tempfile
+from pathlib import Path
+
+import onnx
+import pytest
+from _test_utils.import_helper import skip_if_no_tensorrt, skip_if_no_trtexec
+from _test_utils.onnx.quantization.autotune import models as _test_models
+
+from modelopt.onnx.quantization.autotune.workflows import (
+    init_benchmark_instance,
+    region_pattern_autotuning_workflow,
+)
+
+
+@pytest.fixture
+def simple_conv_model():
+    """Build the minimal Input -> Conv -> Relu -> Output ONNX model used by these tests."""
+    return _test_models._create_simple_conv_onnx_model()
+
+
+@pytest.mark.parametrize("use_trtexec", [True, False])
+def test_export_quantized_model(use_trtexec, simple_conv_model):
+    """Autotune a small Conv model and check that the export contains Q/DQ nodes.
+
+    Parametrized over ``use_trtexec``; each case is skipped when its requirement
+    (trtexec or TensorRT) is unavailable.
+    """
+    if use_trtexec:
+        skip_if_no_trtexec()
+    else:
+        skip_if_no_tensorrt()
+
+    # Reserve a unique path only; delete=False keeps the file after the handle closes.
+    with tempfile.NamedTemporaryFile(suffix=".onnx", delete=False) as f:
+        baseline_model_path = f.name
+
+    # Strip only the trailing extension; str.replace would hit every ".onnx" in the path.
+    output_dir = os.path.splitext(baseline_model_path)[0]
+    output_path = output_dir + ".quant.onnx"
+
+    try:
+        # Saved inside the try so the temp file is removed even if saving fails.
+        onnx.save(simple_conv_model, baseline_model_path)
+
+        init_benchmark_instance(use_trtexec=use_trtexec, timing_runs=100)
+        autotuner = region_pattern_autotuning_workflow(baseline_model_path, Path(output_dir))
+
+        # Export model with Q/DQ insertion and verify the file was created.
+        autotuner.export_onnx(output_path, insert_qdq=True, best=True)
+        assert os.path.exists(output_path)
+
+        # The export must load as an ONNX model and contain at least one Q/DQ node.
+        exported_model = onnx.load(output_path)
+        assert exported_model is not None
+        qdq_op_types = {"QuantizeLinear", "DequantizeLinear"}
+        assert any(n.op_type in qdq_op_types for n in exported_model.graph.node), (
+            "Q/DQ nodes not found in quantized model"
+        )
+    finally:
+        # Remove both model files and the directory passed to the workflow.
+        for file_path in (output_path, baseline_model_path):
+            if os.path.exists(file_path):
+                os.unlink(file_path)
+        if os.path.isdir(output_dir):
+            shutil.rmtree(output_dir)
diff --git a/tests/unit/onnx/quantization/autotune/test_config.py b/tests/unit/onnx/quantization/autotune/test_config.py
new file mode 100644
index 000000000..9ec99d65d
--- /dev/null
+++ b/tests/unit/onnx/quantization/autotune/test_config.py
@@ -0,0 +1,97 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Tests for the Config class in the autotuner.
+
+Tests configuration parameter validation and defaults.
+"""
+
+from modelopt.onnx.quantization.autotune.common import Config
+
+
+class TestConfig:
+    """Checks the defaults and constructor overrides of the autotuner Config."""
+
+    def test_default_values(self):
+        """A Config built with no arguments exposes the expected default values."""
+        defaults = Config()
+
+        # Logging: verbose must be falsy by default.
+        assert not defaults.verbose
+
+        # Remaining defaults, grouped by the area they configure.
+        expected = {
+            # Q/DQ defaults
+            "default_q_scale": 0.1,
+            "default_q_zero_point": 0,
+            "default_quant_type": "int8",
+            # Region builder settings
+            "maximum_sequence_region_size": 10,
+            "minimum_topdown_search_size": 10,
+            # Scheme generation parameters
+            "top_percent_to_mutate": 0.1,
+            "minimum_schemes_to_mutate": 10,
+            "maximum_mutations": 3,
+            "maximum_generation_attempts": 100,
+            # Pattern cache parameters
+            "pattern_cache_minimum_distance": 4,
+            "pattern_cache_max_entries_per_pattern": 32,
+        }
+        for field_name, value in expected.items():
+            assert getattr(defaults, field_name) == value
+
+    def test_custom_values(self):
+        """Values passed to the constructor can be read back from the instance."""
+        # Overrides compared by equality below.
+        overrides = {
+            "default_q_scale": 0.05,
+            "default_q_zero_point": 128,
+            "default_quant_type": "fp8",
+            "maximum_sequence_region_size": 20,
+        }
+        custom = Config(verbose=True, **overrides)
+
+        # verbose is only checked for truthiness, so it stays out of the table.
+        assert custom.verbose
+        for field_name, value in overrides.items():
+            assert getattr(custom, field_name) == value
+
+    def test_region_size_validation(self):
+        """Explicitly supplied region size parameters read back as positive values."""
+        sized = Config(maximum_sequence_region_size=50, minimum_topdown_search_size=5)
+        region_sizes = (sized.maximum_sequence_region_size, sized.minimum_topdown_search_size)
+        assert all(size > 0 for size in region_sizes)
+
+    def test_genetic_algorithm_params(self):
+        """Scheme-mutation settings passed to the constructor read back equal."""
+        mutation_settings = {
+            "top_percent_to_mutate": 0.2,
+            "minimum_schemes_to_mutate": 2,
+            "maximum_mutations": 5,
+            "maximum_generation_attempts": 50,
+        }
+        config = Config(**mutation_settings)
+
+        # Every override must come back from the instance with an equal value.
+        for field_name, value in mutation_settings.items():
+            assert getattr(config, field_name) == value
+
+    def test_pattern_cache_params(self):
+        """Pattern cache settings passed to the constructor read back equal."""
+        cfg = Config(pattern_cache_minimum_distance=3, pattern_cache_max_entries_per_pattern=10)
+
+        observed = (cfg.pattern_cache_minimum_distance, cfg.pattern_cache_max_entries_per_pattern)
+        assert observed == (3, 10)