diff --git a/modelopt/onnx/quantization/autotune/__main__.py b/modelopt/onnx/quantization/autotune/__main__.py new file mode 100644 index 000000000..877d1a017 --- /dev/null +++ b/modelopt/onnx/quantization/autotune/__main__.py @@ -0,0 +1,303 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Command-line interface for ONNX Q/DQ autotuning.""" + +import argparse +import sys +import tempfile +from pathlib import Path + +from modelopt.onnx.logging_config import logger +from modelopt.onnx.quantization.autotune.workflows import ( + init_benchmark_instance, + region_pattern_autotuning_workflow, +) + +DEFAULT_OUTPUT_DIR = "./autotuner_output" +DEFAULT_NUM_SCHEMES = 30 +DEFAULT_QUANT_TYPE = "int8" +DEFAULT_DQ_DTYPE = "float32" +DEFAULT_TIMING_CACHE = str(Path(tempfile.gettempdir()) / "trtexec_timing.cache") +DEFAULT_WARMUP_RUNS = 5 +DEFAULT_TIMING_RUNS = 20 + + +def validate_file_path(path: str | None, description: str) -> Path | None: + """Validate that a file path exists. 
+ + Args: + path: Path string to validate (can be None) + description: Description of the file for error messages + + Returns: + Path object if valid, None if path is None + + Raises: + SystemExit: If path is provided but doesn't exist + """ + if path is None: + return None + + path_obj = Path(path) + if not path_obj.exists(): + logger.error(f"{description} not found: {path_obj}") + sys.exit(1) + + return path_obj + + +def log_benchmark_config(args): + """Log TensorRT benchmark configuration for transparency. + + Logs timing cache path, warmup/timing run counts, and any custom + plugin libraries that will be loaded. + + Args: + args: Parsed command-line arguments with benchmark configuration + """ + logger.info("Initializing TensorRT benchmark") + logger.info(f" Timing cache: {args.timing_cache}") + logger.info(f" Warmup runs: {args.warmup_runs}") + logger.info(f" Timing runs: {args.timing_runs}") + if args.plugin_libraries: + logger.info(f" Plugin libraries: {', '.join(args.plugin_libraries)}") + if hasattr(args, "trtexec_benchmark_args") and args.trtexec_benchmark_args: + logger.info(f" Trtexec args: {args.trtexec_benchmark_args}") + + +def run_autotune() -> int: + """Execute the complete pattern-based Q/DQ autotuning workflow. + + Parses command-line arguments, then: + 1. Validates input paths (model, baseline, output directory) + 2. Initializes TensorRT benchmark instance + 3. Runs pattern-based region autotuning workflow + 4. 
Handles interruptions gracefully with state preservation + + Returns: + Exit code: + - 0: Success + - 1: Autotuning failed (exception occurred) + - 130: Interrupted by user (Ctrl+C) + """ + args = _get_autotune_parser().parse_args() + model_path = validate_file_path(args.onnx_path, "Model file") + validate_file_path(args.qdq_baseline, "QDQ baseline model") + output_dir = Path(args.output_dir) + + log_benchmark_config(args) + trtexec_args = getattr(args, "trtexec_benchmark_args", None) + benchmark_instance = init_benchmark_instance( + use_trtexec=args.use_trtexec, + plugin_libraries=args.plugin_libraries, + timing_cache_file=args.timing_cache, + warmup_runs=args.warmup_runs, + timing_runs=args.timing_runs, + trtexec_args=trtexec_args, + ) + + if benchmark_instance is None: + logger.error("Failed to initialize TensorRT benchmark") + return 1 + + try: + node_filter_list = None + if args.node_filter_list: + filter_file = validate_file_path(args.node_filter_list, "Node filter list file") + if filter_file: + with open(filter_file) as f: + node_filter_list = [ + line.strip() + for line in f + if line.strip() and not line.strip().startswith("#") + ] + logger.info(f"Loaded {len(node_filter_list)} filter patterns from {filter_file}") + + region_pattern_autotuning_workflow( + model_path=str(model_path), + output_dir=output_dir, + num_schemes_per_region=args.num_schemes, + pattern_cache_file=args.pattern_cache_file, + state_file=args.state_file, + quant_type=args.quant_type, + default_dq_dtype=args.default_dq_dtype, + qdq_baseline_model=args.qdq_baseline, + node_filter_list=node_filter_list, + verbose=args.verbose, + ) + + logger.info("\n" + "=" * 70) + logger.info("✓ Autotuning completed successfully!") + logger.info(f"✓ Results: {output_dir}") + logger.info("=" * 70) + return 0 + + except KeyboardInterrupt: + logger.warning("\nInterrupted by user") + state_file = args.state_file or output_dir / "autotuner_state.yaml" + logger.info(f"Progress saved to: {state_file}") + return 
130 + + except Exception as e: + logger.error(f"\nAutotuning failed: {e}", exc_info=args.verbose) + return 1 + + +def _get_autotune_parser() -> argparse.ArgumentParser: + """Create and configure the command-line argument parser.""" + parser = argparse.ArgumentParser( + prog="modelopt.onnx.quantization.autotune", + description="ONNX Q/DQ Autotuning with TensorRT", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Basic usage + python -m modelopt.onnx.quantization.autotune --onnx_path model.onnx + + # Import patterns from QDQ baseline model + python -m modelopt.onnx.quantization.autotune \\ + --onnx_path model.onnx --qdq_baseline baseline.onnx + + # Use pattern cache for warm-start + python -m modelopt.onnx.quantization.autotune --onnx_path model.onnx --pattern_cache cache.yaml + + # Full example with all options + python -m modelopt.onnx.quantization.autotune \\ + --onnx_path model.onnx --schemes_per_region 50 \\ + --pattern_cache cache.yaml --qdq_baseline baseline.onnx \\ + --quant_type int8 --verbose + """, + ) + + # Model and Output + io_group = parser.add_argument_group("Model and Output") + io_group.add_argument( + "--onnx_path", "-m", type=str, required=True, help="Path to ONNX model file" + ) + io_group.add_argument( + "--output_dir", + "-o", + type=str, + default=DEFAULT_OUTPUT_DIR, + dest="output_dir", + help=f"Output directory for results (default: {DEFAULT_OUTPUT_DIR})", + ) + + # Autotuning Strategy + strategy_group = parser.add_argument_group("Autotuning Strategy") + strategy_group.add_argument( + "--schemes_per_region", + "-s", + type=int, + default=DEFAULT_NUM_SCHEMES, + dest="num_schemes", + help=f"Number of schemes to test per region (default: {DEFAULT_NUM_SCHEMES})", + ) + strategy_group.add_argument( + "--pattern_cache", + type=str, + default=None, + dest="pattern_cache_file", + help="Path to pattern cache YAML for warm-start (optional)", + ) + strategy_group.add_argument( + "--qdq_baseline", + type=str, + 
default=None, + help="Path to QDQ baseline ONNX model to import quantization patterns (optional)", + ) + strategy_group.add_argument( + "--state_file", + type=str, + default=None, + help="State file path for resume capability (default: /autotuner_state.yaml)", + ) + strategy_group.add_argument( + "--node_filter_list", + type=str, + default=None, + help="Path to a file containing wildcard patterns to filter ONNX nodes (one pattern per line). " + "Regions without any matching nodes are skipped during autotuning.", + ) + + # Quantization + quant_group = parser.add_argument_group("Quantization") + quant_group.add_argument( + "--quant_type", + type=str, + default=DEFAULT_QUANT_TYPE, + choices=["int8", "fp8"], + help=f"Quantization data type (default: {DEFAULT_QUANT_TYPE})", + ) + quant_group.add_argument( + "--default_dq_dtype", + type=str, + default=DEFAULT_DQ_DTYPE, + choices=["float16", "float32", "bfloat16"], + help="Default DQ output dtype if cannot be deduced (optional)", + ) + + # TensorRT Benchmark + trt_group = parser.add_argument_group("TensorRT Benchmark") + trt_group.add_argument( + "--use_trtexec", + action="store_true", + help="Use trtexec for benchmarking (default: False)", + default=False, + ) + trt_group.add_argument( + "--timing_cache", + type=str, + default=DEFAULT_TIMING_CACHE, + help=f"TensorRT timing cache file (default: {DEFAULT_TIMING_CACHE})", + ) + trt_group.add_argument( + "--warmup_runs", + type=int, + default=DEFAULT_WARMUP_RUNS, + help=f"Number of warmup runs (default: {DEFAULT_WARMUP_RUNS})", + ) + trt_group.add_argument( + "--timing_runs", + type=int, + default=DEFAULT_TIMING_RUNS, + help=f"Number of timing runs (default: {DEFAULT_TIMING_RUNS})", + ) + trt_group.add_argument( + "--plugin_libraries", + "--plugins", + type=str, + nargs="+", + default=None, + dest="plugin_libraries", + help="TensorRT plugin libraries (.so files) to load (optional, space-separated)", + ) + trt_group.add_argument( + "--trtexec_benchmark_args", + type=str, + 
default=None, + help="Additional command-line arguments to pass to trtexec as a single quoted string. " + "Example: --trtexec_benchmark_args '--fp16 --workspace=4096 --verbose'", + ) + + # Logging + parser.add_argument("--verbose", "-v", action="store_true", help="Enable verbose DEBUG logging") + + return parser + + +if __name__ == "__main__": + sys.exit(run_autotune()) diff --git a/modelopt/onnx/quantization/autotune/common.py b/modelopt/onnx/quantization/autotune/common.py index 01fa4aaf4..d3b3de272 100644 --- a/modelopt/onnx/quantization/autotune/common.py +++ b/modelopt/onnx/quantization/autotune/common.py @@ -447,8 +447,11 @@ def from_dict( def __str__(self) -> str: """String representation for debugging.""" best_latency = self.best_scheme.latency_ms if self.best_scheme else 0.0 + pattern_str = self.pattern_signature[:40] + ( + "..." if len(self.pattern_signature) > 40 else "" + ) return ( - f"PatternSchemes(pattern='{self.pattern_signature[:40]}...', " + f"PatternSchemes(pattern='{pattern_str}', " f"schemes={self.num_schemes}, best_latency={best_latency:.3f}ms)" ) @@ -516,19 +519,22 @@ def add_pattern_schemes(self, pattern_schemes: PatternSchemes) -> None: for scheme in sorted_schemes: # Check if this scheme is too similar to any already-filtered scheme too_similar = False + existing_to_remove = None # at most one; remove after inner loop for existing_scheme in filtered_schemes: distance = scheme.distance(existing_scheme) if distance < self.minimum_distance: # Schemes are too similar, keep the better one if scheme.latency_ms < existing_scheme.latency_ms: - # New scheme is better, remove existing and add new - filtered_schemes.remove(existing_scheme) + # New scheme is better; mark existing for removal + existing_to_remove = existing_scheme break else: # Existing scheme is better, skip new one too_similar = True break + if existing_to_remove is not None: + filtered_schemes.remove(existing_to_remove) if not too_similar: filtered_schemes.append(scheme) diff --git 
a/modelopt/onnx/quantization/autotune/workflows.py b/modelopt/onnx/quantization/autotune/workflows.py new file mode 100644 index 000000000..025d9fac4 --- /dev/null +++ b/modelopt/onnx/quantization/autotune/workflows.py @@ -0,0 +1,376 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""ONNX Q/DQ Autotuning Workflows. + +This module provides high-level workflow functions for automated Q/DQ (Quantization/Dequantization) +optimization of ONNX models using pattern-based region analysis and TensorRT performance measurement. +""" + +import fnmatch +from pathlib import Path + +import onnx + +from modelopt.onnx.logging_config import logger +from modelopt.onnx.quantization.autotune.autotuner import QDQAutotuner +from modelopt.onnx.quantization.autotune.benchmark import TensorRTPyBenchmark, TrtExecBenchmark +from modelopt.onnx.quantization.autotune.common import Config, PatternCache +from modelopt.onnx.quantization.qdq_utils import get_quantized_tensors + +_benchmark_instance = None + + +def benchmark_onnx_model( + model_path: str | bytes, log_file: str | None = None, flush_timing_cache: bool = False +) -> float: + """Benchmark ONNX model inference latency using TensorRT Python API. 
+ + Args: + model_path: Path to ONNX model file, or bytes containing serialized model protobuf + log_file: Optional path to save detailed TensorRT build and benchmark logs + (default: None, no logging) + flush_timing_cache: If True, flushes TensorRT timing cache before building engine. + Useful for periodic cache refresh (default: False) + + Returns: + Measured median inference latency in milliseconds. + Returns float('inf') on failure (invalid model, build error, etc.) + + Raises: + No exceptions raised - errors are caught and logged, returning float('inf') + """ + global _benchmark_instance + + if _benchmark_instance is None: + logger.error("Benchmark instance not initialized") + return float("inf") + + try: + latency = _benchmark_instance.run( + model_path, log_file=log_file, flush_timing_cache=flush_timing_cache + ) + + if latency == float("inf"): + if isinstance(model_path, bytes): + logger.warning("Benchmark failed for model bytes") + else: + logger.warning(f"Benchmark failed: {model_path}") + return float("inf") + + logger.debug(f"Benchmark result: {latency:.2f} ms") + return latency + + except Exception as e: + logger.error(f"Benchmark error: {e}", exc_info=True) + return float("inf") + + +def init_benchmark_instance( + use_trtexec: bool = False, + plugin_libraries: list[str] | None = None, + timing_cache_file: str | None = None, + warmup_runs: int = 5, + timing_runs: int = 20, + trtexec_args: list[str] | None = None, +): + """Initialize global TensorRT benchmark instance for model performance measurement. + + Args: + use_trtexec: Whether to use trtexec for benchmarking. + plugin_libraries: List of paths to TensorRT plugin shared libraries (.so files). + These plugins will be loaded by trtexec or TensorRT Python API during engine building. + If None, no custom plugins are loaded. + timing_cache_file: Path to TensorRT timing cache file for faster engine builds. 
+ If None, uses default "trtexec_timing.cache" (default: None) + warmup_runs: Number of warmup inference iterations before measurement. + Allows GPU to reach stable performance state (default: 5) + timing_runs: Number of timed inference iterations for latency measurement. + Higher values give more stable median (default: 20) + trtexec_args: Additional command-line arguments to pass to trtexec as a string (only used if use_trtexec=True). + Example: '--fp16 --workspace=4096 --verbose' + """ + global _benchmark_instance + try: + if use_trtexec: + _benchmark_instance = TrtExecBenchmark( + timing_cache_file=timing_cache_file, + warmup_runs=warmup_runs, + timing_runs=timing_runs, + plugin_libraries=plugin_libraries, + trtexec_args=trtexec_args, + ) + logger.info("Trtexec benchmark initialized") + else: + _benchmark_instance = TensorRTPyBenchmark( + timing_cache_file=timing_cache_file, + warmup_runs=warmup_runs, + timing_runs=timing_runs, + plugin_libraries=plugin_libraries, + ) + logger.info("TensorRT Python API benchmark initialized") + logger.debug( + f"Settings: warmup={warmup_runs}, timing={timing_runs}, " + f"cache={timing_cache_file or 'trtexec_timing.cache'}, plugin_libraries={plugin_libraries}" + ) + return _benchmark_instance + except Exception as e: + logger.error(f"TensorRT initialization failed: {e}", exc_info=True) + return None + + +def _region_matches_filter(region, graph, filter_patterns: list[str]) -> bool: + """Check if any node in the region matches any of the filter patterns. 
+ + Args: + region: Region object to check + graph: ONNX graph (graphsurgeon) containing node information + filter_patterns: List of wildcard patterns to match against node names + + Returns: + True if at least one node in the region matches any pattern, False otherwise + """ + if not filter_patterns: + return True + + node_indices = region.get_all_nodes_recursive() + + for node_idx in node_indices: + if node_idx < len(graph.nodes): + node_name = graph.nodes[node_idx].name + for pattern in filter_patterns: + if fnmatch.fnmatch(node_name, pattern): + return True + + return False + + +def region_pattern_autotuning_workflow( + model_path: str, + output_dir: Path, + num_schemes_per_region: int = 30, + pattern_cache_file: str | None = None, + state_file: str | None = None, + quant_type: str = "int8", + default_dq_dtype: str = "float32", + qdq_baseline_model: str | None = None, + node_filter_list: list[str] | None = None, + verbose: bool = False, +) -> QDQAutotuner: + """Run automated Q/DQ (Quantization/Dequantization) optimization on an ONNX model. + + This workflow uses pattern-based region optimization to efficiently find optimal + Q/DQ insertion points. The key insight: regions with identical structural patterns + can share the same Q/DQ scheme. When a best scheme is found for a pattern, it + automatically applies to all regions matching that pattern, making optimization + both efficient and consistent. + + Automatically discovers regions, generates and tests Q/DQ insertion schemes, + and exports optimized model. Supports incremental state saving for crash recovery + and pattern cache-based warm-start. + + **Workflow Steps:** + 1. Load model and initialize autotuner with automatic hierarchical region discovery + 2. Resume from checkpoint if state file exists (crash recovery) + 3. Load pattern cache if provided (warm-start with known-good schemes) + 4. Import Q/DQ patterns from baseline model if provided (transfer learning) + 5. 
Measure baseline performance without Q/DQ insertions + 6. For each discovered region pattern: + a. Generate Q/DQ insertion schemes (pattern-relative) + b. Build TensorRT engine and measure latency for each scheme + c. Select best scheme for this pattern (applies to all matching regions) + d. Save checkpoint and intermediate model + 7. Export final optimized model with best Q/DQ scheme for each pattern + + Args: + model_path: Path to ONNX model file to optimize + output_dir: Directory for output files (state, logs, models). Created if it doesn't exist. + num_schemes_per_region: Number of Q/DQ insertion schemes to test per region pattern. + Higher values explore more configurations but take longer (default: 30) + pattern_cache_file: Optional path to pattern cache YAML file containing known-good schemes + from previous runs. Enables warm-start optimization (default: None) + state_file: Optional path to state file for checkpoint/resume. If None, automatically + uses autotuner_state.yaml inside output_dir (default: None) + quant_type: Quantization data type - "int8" for INT8 quantization (default), + "fp8" for FP8 quantization + qdq_baseline_model: Optional path to a pre-quantized ONNX model. If provided, + extracts Q/DQ insertion patterns and adds them to pattern cache + for warm-start (default: None) + node_filter_list: Optional list of wildcard patterns to filter ONNX nodes. 
Regions + without any matching nodes are skipped during autotuning (default: None) + verbose: Enable verbose logging in Config for detailed autotuner output (default: False) + + Returns: + QDQAutotuner instance after autotuning + """ + output_dir.mkdir(parents=True, exist_ok=True) + logs_dir = output_dir / "logs" + logs_dir.mkdir(exist_ok=True) + models_dir = output_dir / "region_models" + models_dir.mkdir(exist_ok=True) + + if state_file is None: + state_file = str(output_dir / "autotuner_state.yaml") + state_path = Path(state_file) + + logger.info(f"Loading model: {model_path}") + model = onnx.load(model_path) + + pattern_cache = None + if pattern_cache_file: + pattern_cache_path = Path(pattern_cache_file) + if pattern_cache_path.exists(): + pattern_cache = PatternCache.load(str(pattern_cache_path)) + logger.info( + f"Loaded pattern cache: {pattern_cache.num_patterns} patterns, " + f"{pattern_cache.total_schemes} schemes" + ) + else: + logger.warning(f"Pattern cache not found: {pattern_cache_file}") + + logger.info( + f"Initializing autotuner (quant_type={quant_type}, default_dq_dtype={default_dq_dtype})" + ) + config = Config( + default_quant_type=quant_type, + default_dq_dtype=default_dq_dtype, + verbose=verbose, + ) + + autotuner = QDQAutotuner(model) + autotuner.initialize(config, pattern_cache) + + if state_path.exists(): + logger.info(f"Resuming from checkpoint: {state_path}") + autotuner.load_state(str(state_path)) + else: + logger.info("Starting new autotuning session") + + if qdq_baseline_model: + qdq_baseline_path = Path(qdq_baseline_model) + if qdq_baseline_path.exists(): + logger.info(f"Importing patterns from QDQ baseline: {qdq_baseline_model}") + qdq_model = onnx.load(str(qdq_baseline_path)) + quantized_tensors = get_quantized_tensors(qdq_model) + logger.debug(f"Found {len(quantized_tensors)} quantized tensors in baseline") + autotuner.import_insertion_points(quantized_tensors) + logger.info("Pattern import complete") + else: + logger.warning(f"QDQ 
baseline not found: {qdq_baseline_model}") + + regions = autotuner.regions + logger.info(f"Ready to profile {len(regions)} regions") + + if autotuner.baseline_latency_ms is None: + logger.info("Measuring baseline (no Q/DQ)") + baseline_path = output_dir / "baseline.onnx" + autotuner.export_onnx(str(baseline_path), insert_qdq=False) + baseline_log = logs_dir / "baseline.log" + baseline_latency = benchmark_onnx_model(str(baseline_path), str(baseline_log)) + autotuner.submit(baseline_latency) + logger.info(f"Baseline: {baseline_latency:.2f} ms") + else: + baseline_latency = autotuner.baseline_latency_ms + logger.info(f"Using baseline from checkpoint: {baseline_latency:.2f} ms") + + logger.info(f"Starting region profiling ({num_schemes_per_region} schemes per region)") + + iteration_count = 0 + + for region_idx, region in enumerate(regions): + logger.info( + f"Region {region_idx + 1}/{len(regions)} (ID={region.id}, level={region.level})" + ) + + if node_filter_list and not _region_matches_filter( + region, autotuner.graph, node_filter_list + ): + logger.info(" Skipping (no nodes match filter patterns)") + continue + + commit = region_idx > 0 + autotuner.set_profile_region(region, commit=commit) + + if autotuner.current_profile_pattern_schemes is None: + logger.info(" Skipping (already profiled)") + continue + + schemes_tested = 0 + for scheme_num in range(num_schemes_per_region): + iteration_count += 1 + scheme_idx = autotuner.generate() + + if scheme_idx == -1: + logger.debug(f" Stopping at scheme {scheme_num + 1} (no more unique schemes)") + break + + schemes_tested += 1 + model_bytes = autotuner.export_onnx(None, insert_qdq=True) + test_log = logs_dir / f"region_{region.id}_scheme_{scheme_idx}.log" + flush_timing_cache = (iteration_count % 10) == 0 + latency = benchmark_onnx_model( + model_bytes, str(test_log), flush_timing_cache=flush_timing_cache + ) + + autotuner.submit(latency, success=(latency != float("inf"))) + + ps = autotuner.current_profile_pattern_schemes 
+ if ps and ps.schemes: + best_scheme = ps.best_scheme + if best_scheme and best_scheme.latency_ms < float("inf") and baseline_latency > 0: + speedup = baseline_latency / best_scheme.latency_ms + logger.info( + f" Tested {schemes_tested} schemes: " + f"best {best_scheme.latency_ms:.2f} ms ({speedup:.3f}x speedup)" + ) + else: + logger.info(f" Tested {schemes_tested} schemes: no valid measurements") + else: + logger.info(f" Tested {schemes_tested} schemes") + + region_model_path = models_dir / f"region_{region.id}_level_{region.level}.onnx" + autotuner.export_onnx(str(region_model_path), insert_qdq=True, best=True) + logger.debug(f" Saved best model: {region_model_path.name}") + + # Save state after each region (incremental, crash recovery) + autotuner.save_state(str(state_path)) + logger.debug(" Checkpoint saved") + + # Commit final region + autotuner.set_profile_region(None, commit=True) + + logger.info("Exporting final optimized model") + final_model_path = output_dir / "optimized_final.onnx" + autotuner.export_onnx(str(final_model_path), insert_qdq=True) + final_log = logs_dir / "final.log" + final_latency = benchmark_onnx_model(str(final_model_path), str(final_log)) + + if final_latency > 0 and final_latency != float("inf"): + speedup = baseline_latency / final_latency + logger.info( + f"Results: {baseline_latency:.2f} ms → {final_latency:.2f} ms ({speedup:.3f}x speedup)" + ) + else: + logger.info(f"Results: {baseline_latency:.2f} ms → failed (invalid measurement)") + + autotuner.save_state(str(state_path)) + + logger.info("Autotuning complete") + logger.info(f" Final model: {final_model_path}") + logger.info(f" State: {state_path}") + logger.debug(f" Logs: {logs_dir}") + logger.debug(f" Region models: {models_dir}") + + return autotuner diff --git a/tests/_test_utils/onnx/quantization/autotune/models.py b/tests/_test_utils/onnx/quantization/autotune/models.py index 4090cfef3..db9652e56 100644 --- a/tests/_test_utils/onnx/quantization/autotune/models.py +++ 
b/tests/_test_utils/onnx/quantization/autotune/models.py @@ -25,9 +25,9 @@ def _create_simple_conv_onnx_model(): """Build ONNX model: Input -> Conv -> Relu -> Output (minimal for autotuner tests).""" - input_tensor = helper.make_tensor_value_info("input", onnx.TensorProto.FLOAT, [1, 3, 224, 224]) + input_tensor = helper.make_tensor_value_info("input", onnx.TensorProto.FLOAT, [32, 3, 224, 224]) output_tensor = helper.make_tensor_value_info( - "output", onnx.TensorProto.FLOAT, [1, 64, 224, 224] + "output", onnx.TensorProto.FLOAT, [32, 64, 224, 224] ) conv_node = helper.make_node( "Conv", inputs=["input", "conv_weight"], outputs=["conv_out"], name="conv" diff --git a/tests/gpu/onnx/quantization/autotune/test_workflow.py b/tests/gpu/onnx/quantization/autotune/test_workflow.py new file mode 100644 index 000000000..8066766a9 --- /dev/null +++ b/tests/gpu/onnx/quantization/autotune/test_workflow.py @@ -0,0 +1,82 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import shutil +import tempfile +from pathlib import Path + +import onnx +import pytest +from _test_utils.import_helper import skip_if_no_tensorrt, skip_if_no_trtexec +from _test_utils.onnx.quantization.autotune import models as _test_models + +from modelopt.onnx.quantization.autotune.workflows import ( + init_benchmark_instance, + region_pattern_autotuning_workflow, +) + + +@pytest.fixture +def simple_conv_model(): + """Simple ONNX model: Input -> Conv -> Relu -> Output. Created via models.py.""" + return _test_models._create_simple_conv_onnx_model() + + +@pytest.mark.parametrize("use_trtexec", [True, False]) +def test_export_quantized_model(use_trtexec, simple_conv_model): + """Test exporting quantized model with Q/DQ.""" + if use_trtexec: + skip_if_no_trtexec() + else: + skip_if_no_tensorrt() + + with tempfile.NamedTemporaryFile(suffix=".onnx", delete=False) as f: + baseline_model_path = f.name + + # Save baseline model + onnx.save(simple_conv_model, baseline_model_path) + + output_dir = baseline_model_path.replace(".onnx", "") + output_path = output_dir + ".quant.onnx" + + try: + init_benchmark_instance(use_trtexec=use_trtexec, timing_runs=100) + autotuner = region_pattern_autotuning_workflow(baseline_model_path, Path(output_dir)) + + # Export model with Q/DQ insertion + autotuner.export_onnx(output_path, insert_qdq=True, best=True) + + # Verify file was created + assert os.path.exists(output_path) + + # Verify it's a valid ONNX model + exported_model = onnx.load(output_path) + assert exported_model is not None + + # Verify that it contains Q/DQ nodes + qdq_nodes = [ + n + for n in exported_model.graph.node + if n.op_type in ["QuantizeLinear", "DequantizeLinear"] + ] + assert qdq_nodes, "Q/DQ nodes not found in quantized model" + finally: + if os.path.exists(output_path): + os.unlink(output_path) + if os.path.exists(baseline_model_path): + os.unlink(baseline_model_path) + if os.path.isdir(output_dir): + shutil.rmtree(output_dir) diff --git 
a/tests/unit/onnx/quantization/autotune/test_config.py b/tests/unit/onnx/quantization/autotune/test_config.py new file mode 100644 index 000000000..9ec99d65d --- /dev/null +++ b/tests/unit/onnx/quantization/autotune/test_config.py @@ -0,0 +1,97 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Tests for the Config class in the autotuner. + +Tests configuration parameter validation and defaults. 
+""" + +from modelopt.onnx.quantization.autotune.common import Config + + +class TestConfig: + """Test Config class functionality.""" + + def test_default_values(self): + """Test that Config has correct default values.""" + config = Config() + + # Logging + assert not config.verbose + + # Performance thresholds + + # Q/DQ defaults + assert config.default_q_scale == 0.1 + assert config.default_q_zero_point == 0 + assert config.default_quant_type == "int8" + + # Region builder settings + assert config.maximum_sequence_region_size == 10 + assert config.minimum_topdown_search_size == 10 + + # Scheme generation parameters + assert config.top_percent_to_mutate == 0.1 + assert config.minimum_schemes_to_mutate == 10 + assert config.maximum_mutations == 3 + assert config.maximum_generation_attempts == 100 + + # Pattern cache parameters + assert config.pattern_cache_minimum_distance == 4 + assert config.pattern_cache_max_entries_per_pattern == 32 + + def test_custom_values(self): + """Test creating Config with custom values.""" + config = Config( + verbose=True, + default_q_scale=0.05, + default_q_zero_point=128, + default_quant_type="fp8", + maximum_sequence_region_size=20, + ) + + assert config.verbose + assert config.default_q_scale == 0.05 + assert config.default_q_zero_point == 128 + assert config.default_quant_type == "fp8" + assert config.maximum_sequence_region_size == 20 + + def test_region_size_validation(self): + """Test that region size parameters are positive.""" + config = Config(maximum_sequence_region_size=50, minimum_topdown_search_size=5) + assert config.maximum_sequence_region_size > 0 + assert config.minimum_topdown_search_size > 0 + + def test_genetic_algorithm_params(self): + """Test genetic algorithm parameters.""" + config = Config( + top_percent_to_mutate=0.2, + minimum_schemes_to_mutate=2, + maximum_mutations=5, + maximum_generation_attempts=50, + ) + + assert config.top_percent_to_mutate == 0.2 + assert config.minimum_schemes_to_mutate == 2 + 
assert config.maximum_mutations == 5 + assert config.maximum_generation_attempts == 50 + + def test_pattern_cache_params(self): + """Test pattern cache parameters.""" + config = Config(pattern_cache_minimum_distance=3, pattern_cache_max_entries_per_pattern=10) + + assert config.pattern_cache_minimum_distance == 3 + assert config.pattern_cache_max_entries_per_pattern == 10