diff --git a/CMakePresets.json b/CMakePresets.json index 4d8b70f08b2..c8fba2b6a41 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -309,6 +309,14 @@ "EXECUTORCH_BUILD_PRESET_FILE": "${sourceDir}/tools/cmake/preset/arm_ethosu_linux.cmake", "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/examples/arm/ethos-u-setup/aarch64-linux-musl-toolchain.cmake" } + }, + { + "name": "esp-baremetal", + "displayName": "Build ExecuTorch for ESP baremetal", + "inherits": ["common"], + "cacheVariables": { + "EXECUTORCH_BUILD_PRESET_FILE": "${sourceDir}/tools/cmake/preset/esp_baremetal.cmake" + } } ], "buildPresets": [ diff --git a/examples/espressif/README.md b/examples/espressif/README.md new file mode 100644 index 00000000000..5c345b4d98f --- /dev/null +++ b/examples/espressif/README.md @@ -0,0 +1,278 @@ +# ExecuTorch Executor Runner for Espressif ESP32/ESP32-S3 + +> **:warning: **This example is not tested in CI. Use at your own risk.**** + +This example demonstrates how to run an ExecuTorch model on Espressif ESP32 and +ESP32-S3 microcontrollers. It is based on the +[Arm Cortex-M executor runner](../arm/executor_runner/) and adapted for the +ESP-IDF build system and ESP32 memory architecture. + +## Supported Targets + +| Chip | CPU | Internal SRAM | PSRAM (optional) | +|----------|---------------|---------------|------------------| +| ESP32 | Xtensa LX6 (dual-core, 240MHz) | ~520KB | 4-8MB | +| ESP32-S3 | Xtensa LX7 (dual-core, 240MHz) | ~512KB | 2-32MB (Octal) | + +## Prerequisites + +1. **ESP-IDF v5.1+**: Install the ESP-IDF toolchain following the + [official guide](https://docs.espressif.com/projects/esp-idf/en/stable/esp32/get-started/). + +2. **ExecuTorch**: Clone and set up ExecuTorch: + ```bash + git clone https://github.com/pytorch/executorch.git + cd executorch + pip install -e . + ``` + +3. **Cross-compiled ExecuTorch libraries**: Build ExecuTorch for the ESP32 + target. See the [Cross-Compilation](#cross-compiling-executorch) section. + +4. 
**A .pte model file**: Export a PyTorch model to the ExecuTorch `.pte` + format. For small models suitable for ESP32, consider: + - A simple add/multiply model + - MobileNet V2 (quantized, with PSRAM) + - Custom small models + +## Project Structure + +``` +examples/espressif/ +├── README.md # This file +├── build.sh # Build helper script +├── executor_runner/ +│ ├── CMakeLists.txt # Component/standalone CMake build +│ ├── esp_executor_runner.cpp # Main executor runner +│ ├── esp_memory_allocator.h # Custom memory allocator +│ ├── esp_memory_allocator.cpp +│ ├── esp_perf_monitor.h # Performance monitoring +│ ├── esp_perf_monitor.cpp +│ └── pte_to_header.py # Convert .pte to C header +└── project/ + ├── CMakeLists.txt # ESP-IDF project file + ├── sdkconfig.defaults # Default ESP-IDF configuration + ├── sdkconfig.defaults.esp32s3 # ESP32-S3 specific config + ├── partitions.csv # Example partition table; adjust app partition size for your board and model + └── main/ + ├── CMakeLists.txt # Main component + └── main.cpp # Entry point +``` + +## Quick Start + +The following example has been tested only on an ESP32-S3 dev board with 8 MB of Octal PSRAM. You may need to adjust the `sdkconfig` file for your specific board. + +### 1. Export a simple model + +```python +import torch +from executorch.exir import to_edge + +class SimpleModel(torch.nn.Module): + def forward(self, x): + return x + x + +model = SimpleModel() +example_input = (torch.randn(1, 8),) + +# Export to ExecuTorch +exported = torch.export.export(model, example_input) +edge = to_edge(exported) +et_program = edge.to_executorch() + +with open("simple_add.pte", "wb") as f: + f.write(et_program.buffer) +``` + +### 2. Convert the model to a C header + +```bash +python3 examples/espressif/executor_runner/pte_to_header.py \ + --pte simple_add.pte \ + --outdir examples/espressif/project/ +``` + +### 3. Build with ESP-IDF + +```bash +# Source ESP-IDF environment +. 
$IDF_PATH/export.sh + +# Using the build script: +./examples/espressif/build.sh --target esp32s3 --pte simple_add.pte + +# Or manually: +cd examples/espressif/project +idf.py set-target esp32s3 +idf.py build +``` + +### 4. Flash and Monitor + +```bash +cd examples/espressif/project +idf.py -p /dev/ttyUSB0 flash monitor +``` + +You should see output like: +``` +Starting executorch runner ! +I [executorch:esp_executor_runner.cpp:237 et_pal_init()] ESP32 ExecuTorch runner initialized. Free heap: 6097812 bytes. +I [executorch:esp_executor_runner.cpp:242 et_pal_init()] PSRAM available. Free PSRAM: 5764716 bytes. +I [executorch:esp_executor_runner.cpp:1047 executor_runner_main()] PTE @ 0x3c05f9f0 [----ET12] +I [executorch:esp_executor_runner.cpp:568 runner_init()] PTE Model data loaded. Size: 952 bytes. +I [executorch:esp_executor_runner.cpp:583 runner_init()] Model buffer loaded, has 1 methods +I [executorch:esp_executor_runner.cpp:593 runner_init()] Running method forward +I [executorch:esp_executor_runner.cpp:604 runner_init()] Setup Method allocator pool. Size: 2097152 bytes. +I [executorch:esp_executor_runner.cpp:620 runner_init()] Setting up planned buffer 0, size 64. +I [executorch:esp_executor_runner.cpp:716 runner_init()] Method 'forward' loaded. +I [executorch:esp_executor_runner.cpp:718 runner_init()] Preparing inputs... +I [executorch:esp_executor_runner.cpp:780 runner_init()] Input prepared. +I [executorch:esp_executor_runner.cpp:979 run_model()] Starting running 1 inferences... 
+I [executorch:esp_perf_monitor.cpp:41 StopMeasurements()] Profiler report: +I [executorch:esp_perf_monitor.cpp:42 StopMeasurements()] Number of inferences: 1 +I [executorch:esp_perf_monitor.cpp:43 StopMeasurements()] Total CPU cycles: 49545 (49545.00 per inference) +I [executorch:esp_perf_monitor.cpp:48 StopMeasurements()] Total wall time: 205 us (205.00 us per inference) +I [executorch:esp_perf_monitor.cpp:53 StopMeasurements()] Average inference time: 0.205 ms +I [executorch:esp_perf_monitor.cpp:59 StopMeasurements()] Free heap: 6097576 bytes +I [executorch:esp_perf_monitor.cpp:63 StopMeasurements()] Min free heap ever: 6097576 bytes +I [executorch:esp_executor_runner.cpp:999 run_model()] 1 inferences finished +I [executorch:esp_executor_runner.cpp:867 print_outputs()] 1 outputs: +Output[0][0]: (float) 2.000000 +Output[0][1]: (float) 2.000000 +Output[0][2]: (float) 2.000000 +Output[0][3]: (float) 2.000000 +Output[0][4]: (float) 2.000000 +Output[0][5]: (float) 2.000000 +Output[0][6]: (float) 2.000000 +Output[0][7]: (float) 2.000000 + +``` + +## Cross-Compiling ExecuTorch + +ExecuTorch needs to be cross-compiled for the ESP32 target (Xtensa architecture). + +### Using the ESP-IDF toolchain + +```bash +# Set up the cross-compilation toolchain +export IDF_TARGET=esp32s3 # or esp32 + +# Configure ExecuTorch build for ESP32 +#Make sure to adjust the list of ops for your model or alter to use one of the selective build methods +cmake --preset esp-baremetal -B cmake-out-esp \ + -DCMAKE_TOOLCHAIN_FILE=$IDF_PATH/tools/cmake/toolchain-${IDF_TARGET}.cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -DEXECUTORCH_BUILD_DEVTOOLS=ON \ + -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=OFF \ + -DEXECUTORCH_SELECT_OPS_LIST="aten::add.out," \ + . 
+ +cmake --build cmake-out-esp -j$(nproc) +cmake --build cmake-out-esp --target install +``` + +## Memory Considerations + +### ESP32 (no PSRAM) +- Total available SRAM: ~520KB (shared between code and data) +- Recommended method allocator pool: 128-256KB +- Recommended scratch pool: 64-128KB +- **Only very small models will fit!** + +### ESP32 / ESP32-S3 with PSRAM +- Internal SRAM: ~512KB (used for code and fast data) +- PSRAM: 2-32MB (used for model data and large buffers) +- Recommended method allocator pool: 1-4MB +- Recommended scratch pool: 256KB-1MB + +### Configuring Memory Pools + +Memory pool sizes auto-adjust based on PSRAM availability. Override with: + +```cmake +# In your project CMakeLists.txt or via idf.py menuconfig +set(ET_ESP_METHOD_ALLOCATOR_POOL_SIZE "1048576") # 1MB +set(ET_ESP_SCRATCH_TEMP_ALLOCATOR_POOL_SIZE "524288") # 512KB +``` + +Or as compile definitions: +```bash +idf.py build -DET_ESP_METHOD_ALLOCATOR_POOL_SIZE=1048576 +``` + +## Loading Models + +### Compiled-in (default) +The model `.pte` file is converted to a C array and compiled into the firmware. +This is the simplest approach but increases firmware size. + +### Filesystem (SPIFFS/LittleFS) +For larger models, load from the filesystem at runtime: + +1. Add `-DFILESYSTEM_LOAD=ON` to your build +2. 
Create a SPIFFS partition with your model: + ```bash + # Add to partitions.csv: + # storage, data, spiffs, , 0x200000 + + # Create and flash SPIFFS image: + $IDF_PATH/components/spiffs/spiffsgen.py 0x200000 model_dir spiffs.bin + esptool.py write_flash 0x210000 spiffs.bin + ``` + +## Configuration Options + +| Option | Default | Description | +|--------|---------|-------------| +| `ET_NUM_INFERENCES` | 1 | Number of inference runs | +| `ET_LOG_DUMP_INPUT` | OFF | Log input tensor values | +| `ET_LOG_DUMP_OUTPUT` | ON | Log output tensor values | +| `ET_BUNDLE_IO` | OFF | Enable BundleIO test support | +| `ET_EVENT_TRACER_ENABLED` | OFF | Enable ETDump profiling | +| `FILESYSTEM_LOAD` | OFF | Load model from filesystem | +| `ET_ESP_METHOD_ALLOCATOR_POOL_SIZE` | Auto | Method allocator size | +| `ET_ESP_SCRATCH_TEMP_ALLOCATOR_POOL_SIZE` | Auto | Scratch allocator size | + +## Differences from the Arm Example + +| Feature | Arm (Cortex-M) | ESP32/ESP32-S3 | +|---------|----------------|----------------| +| Build system | Bare-metal CMake + Arm toolchain | ESP-IDF (FreeRTOS-based) | +| NPU | Ethos-U55/U65/U85 | None (CPU only) | +| Memory | ITCM/DTCM/SRAM/DDR via linker script | IRAM/DRAM/PSRAM via ESP-IDF | +| Performance monitor | ARM PMU + Ethos-U PMU | CPU cycle counter + esp_timer | +| Semihosting | FVP simulator filesystem access | SPIFFS/LittleFS/SD filesystem | +| Entry point | `main()` bare-metal | `app_main()` via FreeRTOS | +| Timing | ARM_PMU_Get_CCNTR() | esp_cpu_get_cycle_count() | + +## Troubleshooting + +### Model too large for flash +- Use filesystem loading (`FILESYSTEM_LOAD=ON`) with SPIFFS or SD card +- Quantize the model to reduce size +- Use a simpler/smaller model architecture + +### Out of memory during inference +- Enable PSRAM if your board has it (`CONFIG_SPIRAM=y`) +- Increase memory pool sizes +- Use a smaller model +- Check `log_mem_status()` output for memory usage details + +### Build errors with ExecuTorch libraries +- Ensure ExecuTorch 
was cross-compiled with the same ESP-IDF toolchain +- Check that `ET_BUILD_DIR_PATH` points to the correct build directory +- Verify the target architecture matches (Xtensa LX6 for ESP32, LX7 for ESP32-S3) + +### Watchdog timer resets +- Long inference times may trigger the task watchdog +- Disable with `CONFIG_ESP_TASK_WDT_EN=n` in sdkconfig +- Or increase the timeout: `CONFIG_ESP_TASK_WDT_TIMEOUT_S=30` + +## License + +This project is licensed under the BSD-style license found in the +[LICENSE](../../../LICENSE) file in the root directory of the ExecuTorch +source tree. diff --git a/examples/espressif/build.sh b/examples/espressif/build.sh new file mode 100755 index 00000000000..fd23aa0d7c2 --- /dev/null +++ b/examples/espressif/build.sh @@ -0,0 +1,110 @@ +#!/bin/bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# Build script for the ExecuTorch ESP32 executor runner example. +# +# Prerequisites: +# - ESP-IDF v5.1+ installed and sourced (. $IDF_PATH/export.sh) +# - ExecuTorch cross-compiled for the ESP32 target +# - Python 3.8+ +# +# Usage: +# ./build.sh [--target esp32|esp32s3] [--pte ] [--clean] + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +ET_ROOT="$(cd "${SCRIPT_DIR}/../.." 
&& pwd)" +PROJECT_DIR="${SCRIPT_DIR}/project" +TARGET="esp32s3" +PTE_FILE="" +CLEAN=false + +# Parse arguments +while [[ $# -gt 0 ]]; do + case "$1" in + --target) + TARGET="$2" + shift 2 + ;; + --pte) + PTE_FILE="$2" + shift 2 + ;; + --clean) + CLEAN=true + shift + ;; + --help|-h) + echo "Usage: $0 [--target esp32|esp32s3] [--pte ] [--clean]" + echo "" + echo "Options:" + echo " --target ESP32 target chip (default: esp32s3)" + echo " --pte Path to the .pte model file to embed" + echo " --clean Clean build directory before building" + exit 0 + ;; + *) + echo "Unknown option: $1" + exit 1 + ;; + esac +done + +# Validate environment +if [ -z "${IDF_PATH:-}" ]; then + echo "ERROR: IDF_PATH is not set. Please source ESP-IDF:" + echo " . \$IDF_PATH/export.sh" + exit 1 +fi + +echo "=== ExecuTorch ESP32 Executor Runner Build ===" +echo "Target: ${TARGET}" +echo "ExecuTorch root: ${ET_ROOT}" +echo "ESP-IDF: ${IDF_PATH}" + +# Convert PTE to header if provided +if [ -n "${PTE_FILE}" ]; then + if [ ! -f "${PTE_FILE}" ]; then + echo "ERROR: PTE file not found: ${PTE_FILE}" + exit 1 + fi + + echo "Converting PTE to header: ${PTE_FILE}" + HEADER_DIR="${PROJECT_DIR}" + mkdir -p "${HEADER_DIR}" + python3 "${SCRIPT_DIR}/executor_runner/pte_to_header.py" \ + --pte "${PTE_FILE}" \ + --outdir "${HEADER_DIR}" + echo "Model header generated: ${HEADER_DIR}/model_pte.h" +fi + +# Navigate to project directory +cd "${PROJECT_DIR}" + +# Clean if requested +if [ "${CLEAN}" = true ]; then + echo "Cleaning build directory..." + rm -rf build sdkconfig +fi +# Set target +echo "Setting target to ${TARGET}..." +idf.py set-target "${TARGET}" + +# Build +echo "Building..." 
+idf.py build + +echo "" +echo "=== Build complete ===" +echo "" +echo "To flash and monitor:" +echo " cd ${PROJECT_DIR}" +echo " idf.py -p /dev/ttyUSB0 flash monitor" +echo "" +echo "To just monitor:" +echo " idf.py -p /dev/ttyUSB0 monitor" diff --git a/examples/espressif/executor_runner/CMakeLists.txt b/examples/espressif/executor_runner/CMakeLists.txt new file mode 100644 index 00000000000..63d701d38f1 --- /dev/null +++ b/examples/espressif/executor_runner/CMakeLists.txt @@ -0,0 +1,305 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# ESP-IDF component CMakeLists.txt for the ExecuTorch executor runner. +# +# This file defines the executor_runner as an ESP-IDF component. +# It is designed to work with the ESP-IDF build system (idf.py build). +# +# Project structure expected: +# my_project/ +# ├── CMakeLists.txt (project-level, uses this as a component) +# ├── main/ +# │ └── CMakeLists.txt (main component, depends on executor_runner) +# └── components/ +# └── executor_runner/ (this component - symlink or copy) +# +# Or you can use this CMakeLists.txt directly as a standalone CMake build +# for cross-compilation testing. + +cmake_minimum_required(VERSION 3.16) + +# ─── Option: ESP-IDF component mode vs. standalone CMake mode ─── +if(ESP_PLATFORM) + # ═══════════════════════════════════════════════════════════════ + # ESP-IDF Component Build + # ═══════════════════════════════════════════════════════════════ + idf_component_register( + SRCS + "esp_executor_runner.cpp" + "esp_pal.cpp" + "esp_memory_allocator.cpp" + "esp_perf_monitor.cpp" + INCLUDE_DIRS + "." + REQUIRES + esp_timer + esp_system + spiffs + ) + + # ExecuTorch pre-built library paths + set(ET_DIR_PATH + "${CMAKE_CURRENT_SOURCE_DIR}/../../.." 
+ CACHE PATH "Path to ExecuTorch source dir" + ) + set(ET_BUILD_DIR_PATH + "${ET_DIR_PATH}/cmake-out-esp" + CACHE PATH "Path to ExecuTorch build/install dir for ESP target" + ) + set(ET_PTE_FILE_PATH + "" + CACHE PATH "Path to ExecuTorch model .pte file" + ) + set(PYTHON_EXECUTABLE + "python3" + CACHE PATH "Python executable" + ) + + set(ET_NUM_INFERENCES + "10" + CACHE STRING "Number of inferences to run" + ) + option(ET_LOG_DUMP_INPUT "Dump input in log" OFF) + option(ET_LOG_DUMP_OUTPUT "Dump output in log" ON) + option(ET_BUNDLE_IO "Set to compile in BundleIO support" OFF) + set(ET_ATOL "0.01" CACHE STRING "Absolute tolerance for BundleIO testing") + set(ET_RTOL "0.01" CACHE STRING "Relative tolerance for BundleIO testing") + option(ET_DUMP_OUTPUTS "Collect and print outputs as base64 in log" OFF) + option(ET_DUMP_INTERMEDIATE_OUTPUTS "Collect and print intermediate outputs" OFF) + set(ET_DEBUG_BUFFER_SIZE "65536" CACHE STRING "Size of ETDump debug buffer") + option(FILESYSTEM_LOAD "Load model from filesystem instead of compiled-in data" OFF) + + # Directory containing the generated model_pte.h header. + # By default this is the project source directory (where build.sh places it), + # but it can be overridden if you generate the header elsewhere. + set(ET_MODEL_HEADER_DIR + "${CMAKE_SOURCE_DIR}" + CACHE PATH "Directory containing the generated model_pte.h header" + ) + + # Memory pool sizes + set(ET_ESP_METHOD_ALLOCATOR_POOL_SIZE "" CACHE STRING + "Method allocator pool size (empty = auto based on PSRAM availability)") + set(ET_ESP_SCRATCH_TEMP_ALLOCATOR_POOL_SIZE "" CACHE STRING + "Scratch temp allocator pool size (empty = auto based on PSRAM availability)") + + # Find pre-built ExecuTorch libraries. + # TARGETS_GLOBAL is needed because ESP-IDF's project.cmake resolves link + # dependencies from the top-level project scope, but find_package runs + # inside this component's directory scope. 
Without GLOBAL, the imported + # targets (executorch, portable_kernels, etc.) are invisible at the + # project level and you get "No target executorch" errors. + set(CMAKE_FIND_PACKAGE_TARGETS_GLOBAL TRUE) + find_package( + executorch REQUIRED HINTS "${ET_BUILD_DIR_PATH}/lib/cmake/ExecuTorch" + ) + + # Convert pte to header if not using filesystem loading + if(NOT FILESYSTEM_LOAD AND ET_PTE_FILE_PATH) + add_custom_target( + gen_model_header DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/model_pte.h + ) + add_custom_command( + OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/model_pte.h + COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/pte_to_header.py + --pte ${ET_PTE_FILE_PATH} + --outdir ${CMAKE_CURRENT_BINARY_DIR} + DEPENDS ${ET_PTE_FILE_PATH} + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + ) + add_dependencies(${COMPONENT_LIB} gen_model_header) + endif() + + # Include directories + target_include_directories( + ${COMPONENT_LIB} + PRIVATE + ${ET_DIR_PATH}/.. + ${ET_DIR_PATH}/runtime/core/portable_type/c10 + ${CMAKE_CURRENT_BINARY_DIR} + ${ET_MODEL_HEADER_DIR} + ) + + # Link ExecuTorch libraries + set(esp_runner_libs) + list(APPEND esp_runner_libs + extension_runner_util + executorch + executorch_selected_kernels + ) + + if(TARGET xnnpack_backend) + list(APPEND esp_runner_libs xnnpack_backend) + endif() + + if(EXECUTORCH_ENABLE_EVENT_TRACER) + target_compile_definitions(${COMPONENT_LIB} PUBLIC ET_EVENT_TRACER_ENABLED) + list(APPEND esp_runner_libs etdump flatccrt) + endif() + + if(ET_BUNDLE_IO) + list(APPEND esp_runner_libs bundled_program) + endif() + + target_link_libraries(${COMPONENT_LIB} PUBLIC ${esp_runner_libs}) + + # Compile definitions + target_compile_definitions( + ${COMPONENT_LIB} PRIVATE C10_USING_CUSTOM_GENERATED_MACROS + ) + + if(ET_NUM_INFERENCES) + target_compile_definitions( + ${COMPONENT_LIB} PUBLIC ET_NUM_INFERENCES=${ET_NUM_INFERENCES} + ) + endif() + + if(ET_LOG_DUMP_INPUT) + target_compile_definitions(${COMPONENT_LIB} PUBLIC ET_LOG_DUMP_INPUT) + endif() 
+ + if(ET_LOG_DUMP_OUTPUT) + target_compile_definitions(${COMPONENT_LIB} PUBLIC ET_LOG_DUMP_OUTPUT) + endif() + + if(ET_BUNDLE_IO) + target_compile_definitions(${COMPONENT_LIB} PUBLIC ET_BUNDLE_IO) + endif() + + if(ET_ATOL) + target_compile_definitions(${COMPONENT_LIB} PUBLIC ET_ATOL=${ET_ATOL}) + endif() + + if(ET_RTOL) + target_compile_definitions(${COMPONENT_LIB} PUBLIC ET_RTOL=${ET_RTOL}) + endif() + + if(ET_DUMP_OUTPUTS) + target_compile_definitions(${COMPONENT_LIB} PUBLIC ET_DUMP_OUTPUTS) + endif() + + if(ET_DUMP_INTERMEDIATE_OUTPUTS) + target_compile_definitions( + ${COMPONENT_LIB} PUBLIC ET_DUMP_INTERMEDIATE_OUTPUTS + ) + endif() + + if(ET_DEBUG_BUFFER_SIZE) + target_compile_definitions( + ${COMPONENT_LIB} PUBLIC ET_DEBUG_BUFFER_SIZE=${ET_DEBUG_BUFFER_SIZE} + ) + endif() + + if(FILESYSTEM_LOAD) + target_compile_definitions(${COMPONENT_LIB} PUBLIC FILESYSTEM_LOAD) + endif() + + if(ET_ESP_METHOD_ALLOCATOR_POOL_SIZE) + target_compile_definitions( + ${COMPONENT_LIB} + PUBLIC ET_ESP_METHOD_ALLOCATOR_POOL_SIZE=${ET_ESP_METHOD_ALLOCATOR_POOL_SIZE} + ) + endif() + + if(ET_ESP_SCRATCH_TEMP_ALLOCATOR_POOL_SIZE) + target_compile_definitions( + ${COMPONENT_LIB} + PUBLIC ET_ESP_SCRATCH_TEMP_ALLOCATOR_POOL_SIZE=${ET_ESP_SCRATCH_TEMP_ALLOCATOR_POOL_SIZE} + ) + endif() + +else() + # ═══════════════════════════════════════════════════════════════ + # Standalone CMake Build (for host testing / cross-compilation) + # ═══════════════════════════════════════════════════════════════ + project(esp_executor_runner) + + set(ET_DIR_PATH + "${CMAKE_CURRENT_SOURCE_DIR}/../../.." + CACHE PATH "Path to ExecuTorch dir" + ) + include(${ET_DIR_PATH}/tools/cmake/Utils.cmake) + set(ET_BUILD_DIR_PATH + "${ET_DIR_PATH}/cmake-out" + CACHE PATH "Path to ExecuTorch build/install dir" + ) + set(ET_INCLUDE_PATH + "${ET_DIR_PATH}/.." 
+ CACHE PATH "Path to ExecuTorch headers" + ) + set(ET_PTE_FILE_PATH + "" + CACHE PATH "Path to ExecuTorch model pte" + ) + set(PYTHON_EXECUTABLE + "python3" + CACHE PATH "Python executable" + ) + + set(ET_NUM_INFERENCES "1" CACHE STRING "Number of inferences to run") + option(ET_LOG_DUMP_OUTPUT "Dump output in log" ON) + + if(NOT DEFINED ET_PTE_FILE_PATH OR ET_PTE_FILE_PATH STREQUAL "") + message(FATAL_ERROR "ET_PTE_FILE_PATH must be set to the .pte model file") + endif() + + find_package( + executorch REQUIRED HINTS "${ET_BUILD_DIR_PATH}/lib/cmake/ExecuTorch" + ) + + # Convert pte to header + add_custom_target( + gen_model_header DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/model_pte.h + ) + add_custom_command( + OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/model_pte.h + COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/pte_to_header.py --pte + ${ET_PTE_FILE_PATH} --outdir ${CMAKE_CURRENT_BINARY_DIR} + DEPENDS ${ET_PTE_FILE_PATH} + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + ) + + add_executable(esp_executor_runner) + target_sources( + esp_executor_runner PRIVATE + esp_executor_runner.cpp + esp_pal.cpp + esp_perf_monitor.cpp + esp_memory_allocator.cpp + ) + + target_link_libraries( + esp_executor_runner PUBLIC + extension_runner_util + executorch + portable_kernels + ) + + target_include_directories( + esp_executor_runner + PRIVATE + ${ET_INCLUDE_PATH} + ${ET_DIR_PATH}/runtime/core/portable_type/c10 + ${CMAKE_CURRENT_BINARY_DIR} + ) + + target_compile_definitions( + esp_executor_runner PRIVATE C10_USING_CUSTOM_GENERATED_MACROS + ) + + if(ET_NUM_INFERENCES) + target_compile_definitions( + esp_executor_runner PUBLIC ET_NUM_INFERENCES=${ET_NUM_INFERENCES} + ) + endif() + + if(ET_LOG_DUMP_OUTPUT) + target_compile_definitions(esp_executor_runner PUBLIC ET_LOG_DUMP_OUTPUT) + endif() + + add_dependencies(esp_executor_runner gen_model_header) +endif() diff --git a/examples/espressif/executor_runner/esp_executor_runner.cpp b/examples/espressif/executor_runner/esp_executor_runner.cpp new 
file mode 100644 index 00000000000..6b95e16b768 --- /dev/null +++ b/examples/espressif/executor_runner/esp_executor_runner.cpp @@ -0,0 +1,1240 @@ +/* Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +/* This is an example ExecuTorch runner for Espressif ESP32 and ESP32-S3 chips. + * It is inspired by the Arm Cortex-M example runner and adapted for the + * ESP-IDF build system and ESP32 memory architecture. + * + * Some defines used to configure the code: + * + * ET_ESP_METHOD_ALLOCATOR_POOL_SIZE - Size of memory area used when + * setting up the model. + * ET_ESP_SCRATCH_TEMP_ALLOCATOR_POOL_SIZE - Size of memory area used when + * running inferences (scratch). + * ET_NUM_INFERENCES - Number of times to run the inference. + * ET_LOG_DUMP_INPUT - Control if you want input to be dumped to the log. + * ET_LOG_DUMP_OUTPUT - Control if you want output to be dumped to the log. + * + * Devtool BundleIO: Use Bundle PTE with input and reference output included + * to check if it matches. + * + * ET_BUNDLE_IO - Build in Devtools BundleIO support. Makes it possible + * to use bpte with bundled input and output ref data. + * ET_ATOL - The atol used to compare output and ref data. + * ET_RTOL - The rtol used to compare output and ref data. + * + * Devtools ETDump: Speed and dumping output + * + * ET_EVENT_TRACER_ENABLED - Build in Devtools ETDump event trace code + * to generate cycle data. + * ET_DUMP_OUTPUTS - Collect and print outputs as a base64 + * buffer in the log. + * ET_DUMP_INTERMEDIATE_OUTPUTS - Collect and print intermediate outputs. + * ET_DEBUG_BUFFER_SIZE - Override size of memory area used by + * ET_DUMP_OUTPUTS / + * ET_DUMP_INTERMEDIATE_OUTPUTS. + * + * ESP32 Memory Notes: + * - ESP32 has ~520KB internal SRAM, optionally 4-8MB PSRAM. 
+ * - ESP32-S3 has ~512KB internal SRAM, optionally 2-32MB PSRAM (octal). + * - For larger models, PSRAM is required. Memory pools are placed in + * PSRAM when available using EXT_RAM_BSS_ATTR. + * - The model .pte data is converted to a C array and compiled in, + * or can be loaded from SPIFFS/LittleFS/SD card filesystem. + * + * FILESYSTEM_LOAD - When defined, the runner will load the .pte model + * from the filesystem (SPIFFS/LittleFS/SD) instead of + * compiled-in data. Useful for larger models that don't + * fit in flash as a C array. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "esp_executor_runner.h" +#include "esp_memory_allocator.h" +#include "esp_perf_monitor.h" + +#if defined(ESP_PLATFORM) +#include +#include +#include +#include +#include +#endif + +#if defined(ET_BUNDLE_IO) +#include +#endif + +#if defined(ET_EVENT_TRACER_ENABLED) +#include + +#if defined(ET_DUMP_INTERMEDIATE_OUTPUTS) || defined(ET_DUMP_OUTPUTS) +#include + +#if !defined(ET_DEBUG_BUFFER_SIZE) +#define ET_DEBUG_BUFFER_SIZE (64 * 1024) +#endif + +#endif // ET_DUMP_INTERMEDIATE_OUTPUTS || ET_DUMP_OUTPUTS + +#endif // ET_EVENT_TRACER_ENABLED + +#if defined(FILESYSTEM_LOAD) +#include +#if defined(ESP_PLATFORM) +#include +#endif +#else +/* When not loading from filesystem, include the model as a compiled-in + * C array. This header is generated by the build process from the .pte file + * specified in ET_PTE_FILE_PATH. 
*/ +#include "model_pte.h" +#endif + +using executorch::aten::ScalarType; +using executorch::aten::Tensor; +using executorch::extension::BufferDataLoader; +using executorch::runtime::Error; +using executorch::runtime::EValue; +using executorch::runtime::HierarchicalAllocator; +using executorch::runtime::MemoryAllocator; +using executorch::runtime::MemoryManager; +using executorch::runtime::Method; +using executorch::runtime::MethodMeta; +using executorch::runtime::Program; +using executorch::runtime::Result; +using executorch::runtime::Span; +using executorch::runtime::Tag; +using executorch::runtime::TensorInfo; +using executorch::runtime::toString; + +#if defined(ET_BUNDLE_IO) +using executorch::bundled_program::compute_method_output_error_stats; +using executorch::bundled_program::ErrorStats; +using executorch::bundled_program::verify_method_outputs; +#endif + +#if defined(ET_EVENT_TRACER_ENABLED) +using executorch::etdump::BufferDataSink; +using executorch::etdump::ETDumpGen; +using executorch::etdump::ETDumpResult; +using executorch::runtime::EventTracerDebugLogLevel; +using torch::executor::etdump_result; +#endif + +/** + * Memory pool sizes for the ExecuTorch runtime. + * + * ESP32: ~520KB internal SRAM total. With PSRAM: 4-8MB external. + * ESP32-S3: ~512KB internal SRAM total. With PSRAM: 2-32MB external. + * + * For models that fit in internal SRAM, use smaller pool sizes. + * For larger models, enable PSRAM and increase these values. + * + * Default: 256KB method allocator, 128KB scratch (suitable for small models). + * With PSRAM: These can be increased significantly. 
+ */ +#if !defined(ET_ESP_METHOD_ALLOCATOR_POOL_SIZE) +#if defined(CONFIG_SPIRAM) +/* With PSRAM available, use larger pools */ +#define ET_ESP_METHOD_ALLOCATOR_POOL_SIZE (2 * 1024 * 1024) +#else +/* Internal SRAM only - conservative defaults */ +#define ET_ESP_METHOD_ALLOCATOR_POOL_SIZE (256 * 1024) +#endif +#endif + +#if !defined(ET_ESP_SCRATCH_TEMP_ALLOCATOR_POOL_SIZE) +#if defined(CONFIG_SPIRAM) +#define ET_ESP_SCRATCH_TEMP_ALLOCATOR_POOL_SIZE (512 * 1024) +#else +#define ET_ESP_SCRATCH_TEMP_ALLOCATOR_POOL_SIZE (128 * 1024) +#endif +#endif + +/** + * Memory pool placement. + * On ESP32 with PSRAM, place large buffers in external RAM. + * EXT_RAM_BSS_ATTR places the buffer in PSRAM .bss section. + */ +#if defined(CONFIG_SPIRAM) && defined(ESP_PLATFORM) +#include +// Use PSRAM for large allocations +static const size_t method_allocation_pool_size = + ET_ESP_METHOD_ALLOCATOR_POOL_SIZE; +static uint8_t __attribute__((aligned(16))) +method_allocation_pool[ET_ESP_METHOD_ALLOCATOR_POOL_SIZE] EXT_RAM_BSS_ATTR; + +static const size_t temp_allocation_pool_size = + ET_ESP_SCRATCH_TEMP_ALLOCATOR_POOL_SIZE; +static uint8_t __attribute__((aligned(16))) +temp_allocation_pool[ET_ESP_SCRATCH_TEMP_ALLOCATOR_POOL_SIZE] EXT_RAM_BSS_ATTR; +#else +// Internal SRAM allocation +static const size_t method_allocation_pool_size = + ET_ESP_METHOD_ALLOCATOR_POOL_SIZE; +static uint8_t __attribute__(( + aligned(16))) method_allocation_pool[ET_ESP_METHOD_ALLOCATOR_POOL_SIZE]; + +static const size_t temp_allocation_pool_size = + ET_ESP_SCRATCH_TEMP_ALLOCATOR_POOL_SIZE; +static uint8_t __attribute__(( + aligned(16))) temp_allocation_pool[ET_ESP_SCRATCH_TEMP_ALLOCATOR_POOL_SIZE]; +#endif + +#if defined(FILESYSTEM_LOAD) +static char* model_pte = nullptr; +static size_t model_pte_size = 0; +#endif + +#if defined(ET_BUNDLE_IO) +static const size_t testset_idx = 0; + +#if defined(ET_ATOL) +static const float et_atol = ET_ATOL; +#else +static const float et_atol = 0.01; +#endif + +#if 
defined(ET_RTOL) +static const float et_rtol = ET_RTOL; +#else +static const float et_rtol = 0.01; +#endif +#endif // ET_BUNDLE_IO + +#if defined(ET_NUM_INFERENCES) +static const int num_inferences = ET_NUM_INFERENCES; +#else +static const int num_inferences = 10; +#endif + +namespace { + +/// Lightweight heapless container that constructs and stores a T in-place. +/// Useful when you want to avoid heap allocations but need to delay +/// construction. +template +class Box { + public: + Box() = default; + + ~Box() { + if (has_value) { + ptr()->~T(); + } + } + + Box(const Box&) = delete; + Box& operator=(const Box&) = delete; + + template + void reset(Args&&... args) { + if (has_value) { + reinterpret_cast(mem)->~T(); + } + new (mem) T(std::forward(args)...); + has_value = true; + } + + T& value() { + return *ptr(); + } + + const T& value() const { + return *ptr(); + } + + T* operator->() { + return ptr(); + } + + const T* operator->() const { + return ptr(); + } + + private: + alignas(T) uint8_t mem[sizeof(T)]; + bool has_value = false; + + T* ptr() { + return reinterpret_cast(mem); + } + + const T* ptr() const { + return reinterpret_cast(mem); + } +}; + +template +void fill_tensor_with_default_value(Tensor& tensor) { + ValueType fill_value{}; + if constexpr (std::is_same_v) { + fill_value = true; + } else { + fill_value = ValueType(1); + } + + ValueType* data_ptr = tensor.mutable_data_ptr(); + std::fill(data_ptr, data_ptr + tensor.numel(), fill_value); +} + +Error prepare_input_tensors(Method& method, MemoryAllocator& allocator) { + MethodMeta method_meta = method.method_meta(); + size_t num_inputs = method_meta.num_inputs(); + + EValue* input_evalues = allocator.allocateList(num_inputs); + ET_CHECK_OR_RETURN_ERROR( + input_evalues != nullptr, + MemoryAllocationFailed, + "Could not allocate memory for input evalues."); + + Error err = method.get_inputs(input_evalues, num_inputs); + ET_CHECK_OK_OR_RETURN_ERROR(err); + + for (size_t i = 0; i < num_inputs; i++) { + 
auto tag = method_meta.input_tag(i); + ET_CHECK_OK_OR_RETURN_ERROR(tag.error()); + + if (tag.get() != Tag::Tensor) { + ET_LOG( + Debug, + "Skipping non-tensor input %lu", + static_cast(i)); + continue; + } + + // Fill tensors with default values (1) when no input data is provided + if (input_evalues[i].isTensor()) { + Tensor& tensor = input_evalues[i].toTensor(); + switch (tensor.scalar_type()) { +#define HANDLE_SCALAR_TYPE(cpp_type, scalar_name) \ + case ScalarType::scalar_name: \ + fill_tensor_with_default_value(tensor); \ + break; + ET_FORALL_SCALAR_TYPES(HANDLE_SCALAR_TYPE) +#undef HANDLE_SCALAR_TYPE + default: + ET_LOG( + Error, "Unhandled ScalarType %s", toString(tensor.scalar_type())); + err = Error::InvalidArgument; + break; + } + } else { + printf("Input[%lu]: Not Tensor\n", static_cast(i)); + } + } + + return err; +} + +#if defined(FILESYSTEM_LOAD) +/** + * Load a binary file from the filesystem. + * Supports SPIFFS, LittleFS, or SD card mounted filesystems. + */ +std::pair load_file_from_fs( + const char* filepath, + MemoryAllocator& allocator) { + FILE* fp = fopen(filepath, "rb"); + if (!fp) { + ET_LOG(Fatal, "Could not open file %s (errno: %d)", filepath, errno); + return std::make_pair(nullptr, 0); + } + + if (fseek(fp, 0, SEEK_END) != 0) { + ET_LOG( + Fatal, "Failed to seek to end of file %s (errno: %d)", filepath, errno); + fclose(fp); + return std::make_pair(nullptr, 0); + } + auto file_size = ftell(fp); + if (file_size <= 0) { + ET_LOG( + Fatal, + "Failed to determine valid size for file %s (size: %ld, errno: %d)", + filepath, + static_cast(file_size), + errno); + fclose(fp); + return std::make_pair(nullptr, 0); + } + + if (fseek(fp, 0, SEEK_SET) != 0) { + ET_LOG( + Fatal, + "Failed to seek to beginning of file %s (errno: %d)", + filepath, + errno); + fclose(fp); + return std::make_pair(nullptr, 0); + } + const size_t size = static_cast(file_size); + char* buffer = static_cast(allocator.allocate(size)); + if (buffer == nullptr) { + ET_LOG( + 
Fatal, + "Failed to allocate %lu bytes for file %s", + static_cast(size), + filepath); + fclose(fp); + return std::make_pair(nullptr, 0); + } + + auto read_size = fread(buffer, 1, size, fp); + if (read_size != size) { + ET_LOG( + Fatal, + "Partial read of %s: got %lu of %lu bytes", + filepath, + static_cast(read_size), + static_cast(size)); + fclose(fp); + return std::make_pair(nullptr, 0); + } + fclose(fp); + return std::make_pair(buffer, read_size); +} + +#if defined(ESP_PLATFORM) +/** + * Initialize SPIFFS filesystem for loading model files. + */ +bool init_spiffs(const char* base_path, const char* partition_label) { + esp_vfs_spiffs_conf_t conf = { + .base_path = base_path, + .partition_label = partition_label, + .max_files = 5, + .format_if_mount_failed = false, + }; + + esp_err_t ret = esp_vfs_spiffs_register(&conf); + if (ret != ESP_OK) { + if (ret == ESP_FAIL) { + ET_LOG(Error, "Failed to mount SPIFFS filesystem"); + } else if (ret == ESP_ERR_NOT_FOUND) { + ET_LOG(Error, "SPIFFS partition not found"); + } else { + ET_LOG(Error, "SPIFFS init failed: %s", esp_err_to_name(ret)); + } + return false; + } + + size_t total = 0, used = 0; + ret = esp_spiffs_info(partition_label, &total, &used); + if (ret == ESP_OK) { + ET_LOG( + Info, + "SPIFFS: total=%lu, used=%lu", + static_cast(total), + static_cast(used)); + } + return true; +} +#endif // ESP_PLATFORM +#endif // FILESYSTEM_LOAD + +/// Holds all state needed for setup and run phases +struct RunnerContext { + RunnerContext() = default; + RunnerContext(const RunnerContext& ctx) = delete; + RunnerContext& operator=(const RunnerContext& ctx) = delete; + + const char* method_name = nullptr; + size_t planned_buffer_memsize = 0; + size_t method_loaded_memsize = 0; + size_t executor_membase = 0; + size_t program_data_len = 0; + size_t input_memsize = 0; + size_t pte_size = 0; + bool bundle_io = false; + Box loader; + Box program; + Box method_allocator; + Box temp_allocator; + std::vector> planned_spans; + Box 
planned_memory; + Box memory_manager; + Box> method; +#if defined(ET_EVENT_TRACER_ENABLED) + Box etdump_gen; +#if defined(ET_DUMP_INTERMEDIATE_OUTPUTS) || defined(ET_DUMP_OUTPUTS) + void* debug_buffer; +#endif +#endif +}; + +void runner_init(RunnerContext& ctx, size_t pte_size) { + const void* program_data = model_pte; + ctx.program_data_len = pte_size; + ctx.pte_size = pte_size; + +#if defined(ET_BUNDLE_IO) + ctx.bundle_io = executorch::bundled_program::is_bundled_program( + reinterpret_cast(model_pte), ctx.pte_size); + if (ctx.bundle_io) { + Error status = executorch::bundled_program::get_program_data( + reinterpret_cast(model_pte), + ctx.pte_size, + &program_data, + &ctx.program_data_len); + ET_CHECK_MSG( + status == Error::Ok, + "get_program_data() from bundle PTE failed: 0x%x", + (unsigned int)status); + } +#endif + + ctx.loader.reset(program_data, ctx.program_data_len); + auto& loader = ctx.loader.value(); + ET_LOG( + Info, + "PTE Model data loaded. Size: %lu bytes.", + static_cast(ctx.program_data_len)); + + // Parse the program file + Result program_result = Program::load(&loader); + ET_CHECK_MSG( + program_result.ok(), + "Program loading failed @ %p: 0x%" PRIx32, + program_data, + static_cast(program_result.error())); + ctx.program.reset(std::move(program_result.get())); + Program& program = ctx.program.value(); + + ET_LOG( + Info, + "Model buffer loaded, has %lu methods", + static_cast(program.num_methods())); + + { + const auto method_name_result = program.get_method_name(0); + ET_CHECK_MSG(method_name_result.ok(), "Program has no methods"); + ctx.method_name = *method_name_result; + } + ET_LOG(Info, "Running method %s", ctx.method_name); + + Result method_meta = program.method_meta(ctx.method_name); + ET_CHECK_MSG( + method_meta.ok(), + "Failed to get method_meta for %s: 0x%x", + ctx.method_name, + (unsigned int)method_meta.error()); + + ET_LOG( + Info, + "Setup Method allocator pool. 
Size: %lu bytes.", + static_cast(method_allocation_pool_size)); + + ctx.method_allocator.reset( + method_allocation_pool_size, method_allocation_pool); + + ctx.planned_spans.clear(); + size_t num_memory_planned_buffers = method_meta->num_memory_planned_buffers(); + ctx.planned_spans.reserve(num_memory_planned_buffers); + size_t planned_buffer_membase = ctx.method_allocator->used_size(); + + for (size_t id = 0; id < num_memory_planned_buffers; ++id) { + size_t buffer_size = + static_cast(method_meta->memory_planned_buffer_size(id).get()); + ET_LOG( + Info, + "Setting up planned buffer %lu, size %lu.", + static_cast(id), + static_cast(buffer_size)); + + uint8_t* buffer = reinterpret_cast( + ctx.method_allocator->allocate(buffer_size, 16UL)); + ET_CHECK_MSG( + buffer != nullptr, + "Could not allocate memory for memory planned buffer size %lu", + static_cast(buffer_size)); + ctx.planned_spans.push_back({buffer, buffer_size}); + } + + ctx.planned_buffer_memsize = + ctx.method_allocator->used_size() - planned_buffer_membase; + + Span> planned_memory_span; + if (!ctx.planned_spans.empty()) { + planned_memory_span = + Span>(ctx.planned_spans.data(), ctx.planned_spans.size()); + } + ctx.planned_memory.reset(planned_memory_span); + + ctx.temp_allocator.reset(temp_allocation_pool_size, temp_allocation_pool); + + ctx.memory_manager.reset( + &ctx.method_allocator.value(), + &ctx.planned_memory.value(), + &ctx.temp_allocator.value()); + + size_t method_loaded_membase = ctx.method_allocator->used_size(); + + executorch::runtime::EventTracer* event_tracer_ptr = nullptr; + +#if defined(ET_EVENT_TRACER_ENABLED) + ET_LOG(Info, "Setting up ETDump"); + ctx.etdump_gen.reset(); + event_tracer_ptr = &ctx.etdump_gen.value(); + +#if defined(ET_DUMP_INTERMEDIATE_OUTPUTS) || defined(ET_DUMP_OUTPUTS) + ctx.debug_buffer = ctx.method_allocator->allocate(ET_DEBUG_BUFFER_SIZE, 16); + if (ctx.debug_buffer != nullptr) { + Span debug_buffer_span( + (uint8_t*)ctx.debug_buffer, ET_DEBUG_BUFFER_SIZE); + 
+ Result result = + ctx.etdump_gen.value().set_debug_buffer(debug_buffer_span); + + if (result.ok()) { +#if defined(ET_DUMP_INTERMEDIATE_OUTPUTS) + ET_LOG( + Info, + "ETDump: Allocated intermediate output buffer size: %d at 0x%p", + ET_DEBUG_BUFFER_SIZE, + ctx.debug_buffer); + ctx.etdump_gen.value().set_event_tracer_debug_level( + EventTracerDebugLogLevel::kIntermediateOutputs); +#else + ET_LOG( + Info, + "ETDump: Allocated output buffer size: %d at 0x%p", + ET_DEBUG_BUFFER_SIZE, + ctx.debug_buffer); + ctx.etdump_gen.value().set_event_tracer_debug_level( + EventTracerDebugLogLevel::kProgramOutputs); +#endif + } else { + ctx.debug_buffer = nullptr; + ET_LOG( + Error, + "ETDump: Could not set_debug_buffer() error:0x%" PRIx32, + result.error()); + } + } else { + ET_LOG( + Error, + "ETDump: Could not allocate output buffer size %lu", + static_cast(ET_DEBUG_BUFFER_SIZE)); + } +#endif // ET_DUMP_INTERMEDIATE_OUTPUTS || ET_DUMP_OUTPUTS +#endif // ET_EVENT_TRACER_ENABLED + + ctx.method.reset(program.load_method( + ctx.method_name, &ctx.memory_manager.value(), event_tracer_ptr)); + + if (!ctx.method->ok()) { + ET_LOG( + Info, + "Loading of method %s failed with status 0x%" PRIx32, + ctx.method_name, + static_cast(ctx.method->error())); + } + ctx.method_loaded_memsize = + ctx.method_allocator->used_size() - method_loaded_membase; + ET_LOG(Info, "Method '%s' loaded.", ctx.method_name); + + ET_LOG(Info, "Preparing inputs..."); + size_t input_membase = ctx.method_allocator->used_size(); + +#if defined(ET_BUNDLE_IO) + if (ctx.bundle_io) { + ET_LOG(Info, "Input testset[%d] from bundled bpte", testset_idx); + Error status = executorch::bundled_program::load_bundled_input( + *ctx.method.value(), model_pte, testset_idx); + ET_CHECK_MSG( + status == Error::Ok, + "load_bundled_input failed with status 0x%" PRIx32, + status); + } else +#endif + { + Error status = ::prepare_input_tensors( + *ctx.method.value(), ctx.method_allocator.value()); + ET_CHECK_MSG( + status == Error::Ok, + 
"Failed to prepare inputs 0x%" PRIx32, + static_cast(status)); + } + +#if defined(ET_LOG_DUMP_INPUT) + { + std::vector inputs(ctx.method.value()->inputs_size()); + ET_LOG(Info, "%lu inputs: ", static_cast(inputs.size())); + Error status = ctx.method.value()->get_inputs(inputs.data(), inputs.size()); + ET_CHECK(status == Error::Ok); + + for (int i = 0; i < inputs.size(); ++i) { + if (inputs[i].isTensor()) { + Tensor tensor = inputs[i].toTensor(); + for (int j = 0; j < tensor.numel(); ++j) { + if (tensor.scalar_type() == ScalarType::Int) { + printf( + "Input[%d][%d]: (int) %d\n", + i, + j, + tensor.const_data_ptr()[j]); + } else if (tensor.scalar_type() == ScalarType::Float) { + printf( + "Input[%d][%d]: (float) %f\n", + i, + j, + tensor.const_data_ptr()[j]); + } else if (tensor.scalar_type() == ScalarType::Char) { + printf( + "Input[%d][%d]: (char) %d\n", + i, + j, + tensor.const_data_ptr()[j]); + } else if (tensor.scalar_type() == ScalarType::Bool) { + printf( + "Input[%d][%d]: (bool) %s (0x%x)\n", + i, + j, + tensor.const_data_ptr()[j] ? 
"true" : "false", + tensor.const_data_ptr()[j]); + } + } + } else { + printf("Input[%d]: Not Tensor\n", i); + } + } + } +#endif + + ctx.input_memsize = ctx.method_allocator->used_size() - input_membase; + ctx.executor_membase = ctx.method_allocator->used_size(); + + ET_LOG(Info, "Input prepared."); +} + +void log_mem_status(RunnerContext& ctx) { + size_t executor_memsize = + ctx.method_allocator->used_size() - ctx.executor_membase; + + ET_LOG( + Info, + "model_pte_program_size: %lu bytes.", + static_cast(ctx.program_data_len)); + ET_LOG( + Info, + "model_pte_loaded_size: %lu bytes.", + static_cast(ctx.pte_size)); + + if (ctx.method_allocator->size() != 0) { + size_t method_allocator_used = ctx.method_allocator->used_size(); + ET_LOG( + Info, + "method_allocator_used: %lu / %lu free: %lu ( used: %lu %% ) ", + static_cast(method_allocator_used), + static_cast(ctx.method_allocator->size()), + static_cast(ctx.method_allocator->free_size()), + static_cast( + 100 * method_allocator_used / ctx.method_allocator->size())); + ET_LOG( + Info, + "method_allocator_planned: %lu bytes", + static_cast(ctx.planned_buffer_memsize)); + ET_LOG( + Info, + "method_allocator_loaded: %lu bytes", + static_cast(ctx.method_loaded_memsize)); + ET_LOG( + Info, + "method_allocator_input: %lu bytes", + static_cast(ctx.input_memsize)); + ET_LOG( + Info, + "method_allocator_executor: %lu bytes", + static_cast(executor_memsize)); + } + if (ctx.temp_allocator->size() > 0) { + ET_LOG( + Info, + "temp_allocator: %lu", + static_cast(ctx.temp_allocator->size())); + } + +#if defined(ESP_PLATFORM) + ET_LOG( + Info, + "ESP free heap: %lu bytes", + static_cast(esp_get_free_heap_size())); + ET_LOG( + Info, + "ESP min free heap ever: %lu bytes", + static_cast(esp_get_minimum_free_heap_size())); +#if defined(CONFIG_SPIRAM) + ET_LOG( + Info, + "ESP free PSRAM: %lu bytes", + static_cast(heap_caps_get_free_size(MALLOC_CAP_SPIRAM))); +#endif +#endif + +#if defined(ET_EVENT_TRACER_ENABLED) +#if 
defined(ET_DUMP_INTERMEDIATE_OUTPUTS) || defined(ET_DUMP_OUTPUTS) + if (ctx.debug_buffer != nullptr) { + size_t outputdump_len = ctx.etdump_gen->get_data_sink()->get_used_bytes(); + ET_LOG( + Info, + "ETDump_outputs_buffer: %lu / %lu free: %lu ( used: %lu %% ) ", + static_cast(outputdump_len), + static_cast(ET_DEBUG_BUFFER_SIZE), + static_cast(ET_DEBUG_BUFFER_SIZE - outputdump_len), + static_cast( + 100 * outputdump_len / ET_DEBUG_BUFFER_SIZE)); + } +#endif +#endif +} + +void print_outputs(RunnerContext& ctx) { + std::vector outputs(ctx.method.value()->outputs_size()); + ET_LOG(Info, "%lu outputs: ", static_cast(outputs.size())); + Error status = + ctx.method.value()->get_outputs(outputs.data(), outputs.size()); + ET_CHECK(status == Error::Ok); + + for (int i = 0; i < outputs.size(); ++i) { + if (outputs[i].isTensor()) { + Tensor tensor = outputs[i].toTensor(); +#if defined(ET_LOG_DUMP_OUTPUT) + for (int j = 0; j < tensor.numel(); ++j) { + if (tensor.scalar_type() == ScalarType::Int) { + printf( + "Output[%d][%d]: (int) %d\n", + i, + j, + tensor.const_data_ptr()[j]); + } else if (tensor.scalar_type() == ScalarType::Float) { + printf( + "Output[%d][%d]: (float) %f\n", + i, + j, + tensor.const_data_ptr()[j]); + } else if (tensor.scalar_type() == ScalarType::Char) { + printf( + "Output[%d][%d]: (char) %d\n", + i, + j, + tensor.const_data_ptr()[j]); + } else if (tensor.scalar_type() == ScalarType::Bool) { + printf( + "Output[%d][%d]: (bool) %s (0x%x)\n", + i, + j, + tensor.const_data_ptr()[j] ? "true " : "false", + tensor.const_data_ptr()[j]); + } + } +#endif + } else { + printf("Output[%d]: Not Tensor\n", i); + } + } +} + +void write_etdump(RunnerContext& ctx) { +#if defined(ET_EVENT_TRACER_ENABLED) + ETDumpResult result = ctx.etdump_gen->get_etdump_data(); + if (result.buf != nullptr && result.size > 0) { + ET_LOG( + Info, + "ETDump data generated: %lu bytes", + static_cast(result.size)); + + // On ESP32, we could write to SPIFFS/SD or dump via serial. 
+ // For now, log the size. In a production setup, you would + // write this to a filesystem or transmit over a network interface. +#if defined(FILESYSTEM_LOAD) && defined(ESP_PLATFORM) + const char* etdump_filename = "/spiffs/etdump.bin"; + ET_LOG(Info, "Writing etdump to file: %s", etdump_filename); + FILE* f = fopen(etdump_filename, "wb"); + if (f) { + size_t bytes_written = fwrite((uint8_t*)result.buf, 1, result.size, f); + if (bytes_written != result.size) { + ET_LOG( + Error, + "Failed to write complete ETDump data to %s (wrote %lu of %lu bytes)", + etdump_filename, + static_cast(bytes_written), + static_cast(result.size)); + } + fclose(f); + } else { + ET_LOG(Error, "Could not open %s for writing", etdump_filename); + } +#endif + } +#endif +} + +bool verify_result(RunnerContext& ctx, const void* model_pte) { + bool model_ok = false; +#if defined(ET_BUNDLE_IO) + if (ctx.bundle_io) { + ErrorStats stats = compute_method_output_error_stats( + *ctx.method.value(), model_pte, testset_idx); + if (stats.status == Error::Ok) { + ET_LOG(Info, "=== Error stats for testset %d ===", testset_idx); + ET_LOG(Info, " mean_absolute_error: %f", stats.mean_abs_error); + ET_LOG(Info, " max_absolute_error: %f", stats.max_abs_error); + ET_LOG(Info, " mean_relative_error: %f", stats.mean_relative_error); + ET_LOG(Info, " max_relative_error: %f", stats.max_relative_error); + } else { + ET_LOG( + Info, + "=== Error calculating stats for testset %d ERROR:%d ===", + testset_idx, + stats.status); + } + + Error status = verify_method_outputs( + *ctx.method.value(), model_pte, testset_idx, et_rtol, et_atol); + if (status == Error::Ok) { + ET_LOG(Info, "Model output match expected BundleIO bpte ref data."); + ET_LOG(Info, "TEST: BundleIO index[%d] Test_result: PASS", testset_idx); + model_ok = true; + } else { + ET_LOG( + Error, + "Model output don't match expected BundleIO bpte ref data. 
rtol=%f atol=%f", + et_rtol, + et_atol); + ET_LOG(Error, "TEST: BundleIO index[%d] Test_result: FAIL", testset_idx); + model_ok = false; + } + } else { + model_ok = true; + } +#else + (void)ctx; + (void)model_pte; + model_ok = true; +#endif + return model_ok; +} + +bool run_model(RunnerContext& ctx, const void* model_pte) { + Error status = Error::Ok; + if (num_inferences <= 0) { + ET_LOG( + Info, + "num_inferences (%d) <= 0; skipping model execution.", + num_inferences); + // Nothing to run; treat as a no-op run. + return true; + } + ET_LOG(Info, "Starting running %d inferences...", num_inferences); + int successful_inferences = 0; + StartMeasurements(); + for (int n = 0; n < num_inferences; n++) { + ET_LOG(Debug, "Running inference number %d", n); + status = ctx.method.value()->execute(); + if (status != Error::Ok) { + break; + } + // Reset the temporary allocator between inferences + ctx.temp_allocator.reset(temp_allocation_pool_size, temp_allocation_pool); + successful_inferences++; + } + if (successful_inferences > 0) { + StopMeasurements(successful_inferences); + } + + ET_CHECK_MSG( + status == Error::Ok, + "Execution of method %s failed with status 0x%" PRIx32, + ctx.method_name, + static_cast(status)); + + ET_LOG(Info, "%d inferences finished", successful_inferences); + print_outputs(ctx); + bool model_ok = verify_result(ctx, model_pte); + ET_LOG(Info, "Model run: %d", model_ok); + + return model_ok; +} + +} // namespace + +// ===================================================================== +// Global runner state -- shared by the public et_runner_* API and by +// executor_runner_main() for its multi-inference demo loop. +// ===================================================================== + +static RunnerContext g_runner_ctx; +static bool g_runner_initialized = false; + +// Maximum number of input/output tensors handled in the public API. 
+static const size_t kMaxInputOutputs = 16; + +// ===================================================================== +// Public API +// ===================================================================== + +bool et_runner_init(void) { + executorch::runtime::runtime_init(); + + size_t pte_size; + +#if defined(FILESYSTEM_LOAD) +#if defined(ESP_PLATFORM) + if (!init_spiffs("/spiffs", "storage")) { + ET_LOG(Fatal, "Failed to initialize SPIFFS. Cannot load model."); + return false; + } +#endif + EspMemoryAllocator file_allocator( + method_allocation_pool_size, method_allocation_pool); + auto [buffer, buffer_size] = + load_file_from_fs("/spiffs/model.pte", file_allocator); + if (buffer == nullptr) { + ET_LOG(Fatal, "Failed to load model from filesystem."); + return false; + } + model_pte = buffer; + model_pte_size = buffer_size; + pte_size = buffer_size; +#else + pte_size = sizeof(model_pte); +#endif + + runner_init(g_runner_ctx, pte_size); + g_runner_initialized = g_runner_ctx.method->ok(); + return g_runner_initialized; +} + +bool et_runner_set_input(size_t input_idx, const void* data, size_t num_bytes) { + if (!g_runner_initialized) { + ET_LOG(Error, "Runner not initialized. Call et_runner_init() first."); + return false; + } + + Method& method = *g_runner_ctx.method.value(); + const size_t num_inputs = method.inputs_size(); + + if (input_idx >= num_inputs) { + ET_LOG( + Error, + "Input index %lu out of range (num_inputs=%lu).", + static_cast(input_idx), + static_cast(num_inputs)); + return false; + } + if (num_inputs > kMaxInputOutputs) { + ET_LOG( + Error, + "Model has too many inputs (%lu > %lu).", + static_cast(num_inputs), + static_cast(kMaxInputOutputs)); + return false; + } + + // get_inputs() returns shallow copies whose data pointers alias the + // method's internal tensor storage, allowing direct writes. 
+ EValue input_evalues[kMaxInputOutputs]; + Error status = method.get_inputs(input_evalues, num_inputs); + if (status != Error::Ok) { + ET_LOG( + Error, + "get_inputs() failed with status 0x%" PRIx32, + static_cast(status)); + return false; + } + + if (!input_evalues[input_idx].isTensor()) { + ET_LOG( + Error, + "Input %lu is not a Tensor.", + static_cast(input_idx)); + return false; + } + + Tensor& tensor = input_evalues[input_idx].toTensor(); + const size_t tensor_bytes = tensor.nbytes(); + if (num_bytes > tensor_bytes) { + ET_LOG( + Error, + "Input %lu: provided %lu bytes exceeds tensor capacity %lu bytes.", + static_cast(input_idx), + static_cast(num_bytes), + static_cast(tensor_bytes)); + return false; + } + // Treat zero-length input as a no-op. + if (num_bytes == 0) { + return true; + } + // For non-zero length, the input data pointer must be non-null. + if (data == nullptr) { + ET_LOG( + Error, + "Input %lu: data pointer is null for non-zero num_bytes (%lu).", + static_cast(input_idx), + static_cast(num_bytes)); + return false; + } + + memcpy(tensor.mutable_data_ptr(), data, num_bytes); + return true; +} + +bool et_runner_execute(void) { + if (!g_runner_initialized) { + ET_LOG(Error, "Runner not initialized. Call et_runner_init() first."); + return false; + } + + Method& method = *g_runner_ctx.method.value(); + Error status = method.execute(); + // Reset the temporary allocator so it is ready for the next inference. + g_runner_ctx.temp_allocator.reset( + temp_allocation_pool_size, temp_allocation_pool); + if (status != Error::Ok) { + ET_LOG( + Error, + "execute() failed with status 0x%" PRIx32, + static_cast(status)); + return false; + } + return true; +} + +bool et_runner_get_output( + size_t output_idx, + void* buffer, + size_t buffer_bytes, + size_t* out_num_elements) { + if (!g_runner_initialized) { + ET_LOG(Error, "Runner not initialized. 
Call et_runner_init() first."); + return false; + } + + Method& method = *g_runner_ctx.method.value(); + const size_t num_outputs = method.outputs_size(); + + if (output_idx >= num_outputs) { + ET_LOG( + Error, + "Output index %lu out of range (num_outputs=%lu).", + static_cast(output_idx), + static_cast(num_outputs)); + return false; + } + if (num_outputs > kMaxInputOutputs) { + ET_LOG( + Error, + "Model has too many outputs (%lu > %lu).", + static_cast(num_outputs), + static_cast(kMaxInputOutputs)); + return false; + } + + EValue output_evalues[kMaxInputOutputs]; + Error status = method.get_outputs(output_evalues, num_outputs); + if (status != Error::Ok) { + ET_LOG( + Error, + "get_outputs() failed with status 0x%" PRIx32, + static_cast(status)); + return false; + } + + if (!output_evalues[output_idx].isTensor()) { + ET_LOG( + Error, + "Output %lu is not a Tensor.", + static_cast(output_idx)); + return false; + } + + Tensor tensor = output_evalues[output_idx].toTensor(); + const size_t tensor_bytes = tensor.nbytes(); + if (buffer_bytes < tensor_bytes) { + ET_LOG( + Error, + "Output %lu: buffer too small (%lu bytes < %lu bytes required).", + static_cast(output_idx), + static_cast(buffer_bytes), + static_cast(tensor_bytes)); + return false; + } + + memcpy(buffer, tensor.const_data_ptr(), tensor_bytes); + if (out_num_elements != nullptr) { + *out_num_elements = static_cast(tensor.numel()); + } + return true; +} + +size_t et_runner_inputs_size(void) { + if (!g_runner_initialized) { + return 0; + } + return (*g_runner_ctx.method.value()).inputs_size(); +} + +size_t et_runner_outputs_size(void) { + if (!g_runner_initialized) { + return 0; + } + return (*g_runner_ctx.method.value()).outputs_size(); +} + +/** + * Main entry point for the ESP32 executor runner. + * + * On ESP-IDF, this is called from app_main() (see below). + * The function can also be compiled for host testing without ESP-IDF. 
+ */ +void executor_runner_main(void) { + if (!et_runner_init()) { + return; + } + + // Log the PTE magic bytes for quick sanity check + ET_LOG( + Info, + "PTE @ %p [----%c%c%c%c]", + model_pte, + model_pte[4], + model_pte[5], + model_pte[6], + model_pte[7]); + + bool model_ok = run_model(g_runner_ctx, model_pte); + ET_LOG(Info, "Model run: %d", model_ok); + + log_mem_status(g_runner_ctx); + write_etdump(g_runner_ctx); + + ET_CHECK_MSG(model_ok == true, "Problem running model"); + + ET_LOG(Info, "Program complete."); +} \ No newline at end of file diff --git a/examples/espressif/executor_runner/esp_executor_runner.h b/examples/espressif/executor_runner/esp_executor_runner.h new file mode 100644 index 00000000000..86672d8c0bf --- /dev/null +++ b/examples/espressif/executor_runner/esp_executor_runner.h @@ -0,0 +1,98 @@ +/* Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +/** + * Public API for the ESP32 ExecuTorch executor runner. + * + * Provides a simple interface to load a model once and run repeated inferences + * on dynamically generated input data: + * + * et_runner_init(); + * + * // For each inference: + * et_runner_set_input(0, my_input_data, my_input_bytes); + * et_runner_execute(); + * et_runner_get_output(0, out_buf, out_buf_bytes, &num_elements); + */ + +#pragma once + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Initialize the runner: load the model, allocate memory pools, and prepare + * the inference method. Must be called once before any other et_runner_* + * function. + * + * @returns true on success, false on failure. + */ +bool et_runner_init(void); + +/** + * Copy raw data into the input tensor at the given index. + * + * The runner must already be initialized with et_runner_init(). 
The data's + * layout (dtype and shape) must match the model's expected input tensor. + * + * @param input_idx Zero-based index of the input tensor to set. + * @param data Pointer to the source data in host memory. + * @param num_bytes Number of bytes to copy. Must not exceed the tensor's + * total byte size (element_size * num_elements). + * @returns true on success, false on failure. + */ +bool et_runner_set_input(size_t input_idx, const void* data, size_t num_bytes); + +/** + * Execute one forward pass of the model. + * + * Must be called after et_runner_init(). Call et_runner_set_input() before + * this if you want to provide custom input data. Results are available via + * et_runner_get_output() after this call returns successfully. + * + * @returns true on success, false on failure. + */ +bool et_runner_execute(void); + +/** + * Copy the output tensor data at the given index into a caller-provided buffer. + * + * Must be called after a successful et_runner_execute(). + * + * @param output_idx Zero-based index of the output tensor to read. + * @param buffer Caller-allocated destination buffer. + * @param buffer_bytes Size of the destination buffer in bytes. Must be + * >= the output tensor's total byte size. + * @param out_num_elements If non-NULL, set to the number of elements in the + * output tensor (not bytes). + * @returns true on success, false on failure. + */ +bool et_runner_get_output( + size_t output_idx, + void* buffer, + size_t buffer_bytes, + size_t* out_num_elements); + +/** + * Returns the number of input tensors expected by the loaded model. + * Returns 0 if the runner is not yet initialized. + */ +size_t et_runner_inputs_size(void); + +/** + * Returns the number of output tensors produced by the loaded model. + * Returns 0 if the runner is not yet initialized. 
+ */ +size_t et_runner_outputs_size(void); + +#ifdef __cplusplus +} // extern "C" +#endif diff --git a/examples/espressif/executor_runner/esp_memory_allocator.cpp b/examples/espressif/executor_runner/esp_memory_allocator.cpp new file mode 100644 index 00000000000..c68f94289df --- /dev/null +++ b/examples/espressif/executor_runner/esp_memory_allocator.cpp @@ -0,0 +1,36 @@ +/* Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include "esp_memory_allocator.h" + +EspMemoryAllocator::EspMemoryAllocator(uint32_t size, uint8_t* base_address) + : MemoryAllocator(size, base_address), used_(0) {} + +void* EspMemoryAllocator::allocate(size_t size, size_t alignment) { + void* ret = executorch::runtime::MemoryAllocator::allocate(size, alignment); + if (ret != nullptr) { + // Keep used_ in sync with the underlying MemoryAllocator by computing it + // from the returned pointer and requested size, which implicitly includes + // any padding/alignment the base allocator applied. + uint8_t* end_ptr = static_cast(ret) + size; + used_ = static_cast(end_ptr - base_address()); + } + return ret; +} + +size_t EspMemoryAllocator::used_size() const { + return used_; +} + +size_t EspMemoryAllocator::free_size() const { + return executorch::runtime::MemoryAllocator::size() - used_; +} + +void EspMemoryAllocator::reset() { + executorch::runtime::MemoryAllocator::reset(); + used_ = 0; +} diff --git a/examples/espressif/executor_runner/esp_memory_allocator.h b/examples/espressif/executor_runner/esp_memory_allocator.h new file mode 100644 index 00000000000..377f608fe88 --- /dev/null +++ b/examples/espressif/executor_runner/esp_memory_allocator.h @@ -0,0 +1,36 @@ +/* Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include + + +/** + * Custom allocator for Espressif ESP32/ESP32-S3 targets that tracks + * used and free memory. Extends the ExecuTorch MemoryAllocator with + * additional instrumentation useful for memory-constrained embedded + * environments. + */ +class EspMemoryAllocator : public executorch::runtime::MemoryAllocator { + public: + EspMemoryAllocator(uint32_t size, uint8_t* base_address); + + void* allocate(size_t size, size_t alignment = kDefaultAlignment) override; + + /// Returns the used size of the allocator's memory buffer. + size_t used_size() const; + + /// Returns the free size of the allocator's memory buffer. + size_t free_size() const; + + /// Resets the allocator to its initial state. + void reset(); + + private: + size_t used_; +}; diff --git a/examples/espressif/executor_runner/esp_pal.cpp b/examples/espressif/executor_runner/esp_pal.cpp new file mode 100644 index 00000000000..90c227d8f99 --- /dev/null +++ b/examples/espressif/executor_runner/esp_pal.cpp @@ -0,0 +1,91 @@ +/* Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include + +#include +#include + +#if defined(ESP_PLATFORM) +#include +#include +#include +#include +#endif + +extern "C" { + +void et_pal_init(void) { +#if defined(ESP_PLATFORM) + ET_LOG( + Info, + "ESP32 ExecuTorch runner initialized. Free heap: %lu bytes.", + static_cast(esp_get_free_heap_size())); +#if defined(CONFIG_SPIRAM) + ET_LOG( + Info, + "PSRAM available. 
Free PSRAM: %lu bytes.", + static_cast(heap_caps_get_free_size(MALLOC_CAP_SPIRAM))); +#endif +#endif +} + +ET_NORETURN void et_pal_abort(void) { +#if defined(ESP_PLATFORM) + esp_restart(); +#else + abort(); +#endif + while (1) { + } +} + +et_timestamp_t et_pal_current_ticks(void) { +#if defined(ESP_PLATFORM) + return (et_timestamp_t)esp_cpu_get_cycle_count(); +#else + return 0; +#endif +} + +et_tick_ratio_t et_pal_ticks_to_ns_multiplier(void) { +#if defined(ESP_PLATFORM) + uint32_t cpu_freq_hz; + if (esp_clk_tree_src_get_freq_hz(SOC_MOD_CLK_CPU, ESP_CLK_TREE_SRC_FREQ_PRECISION_CACHED, &cpu_freq_hz) == + ESP_OK) { + return {1000000000u, cpu_freq_hz}; + } +#endif + return {1000, 240}; // Default to 240 MHz if we can't get the actual frequency +} + +void et_pal_emit_log_message( + ET_UNUSED et_timestamp_t timestamp, + et_pal_log_level_t level, + const char* filename, + const char* function, + size_t line, + const char* message, + ET_UNUSED size_t length) { + printf( + "%c [executorch:%s:%lu %s()] %s\n", + level, + filename, + static_cast(line), + function, + message); + fflush(stdout); +} + +void* et_pal_allocate(ET_UNUSED size_t size) { + return nullptr; +} + +void et_pal_free(ET_UNUSED void* ptr) {} + +} // extern "C" \ No newline at end of file diff --git a/examples/espressif/executor_runner/esp_perf_monitor.cpp b/examples/espressif/executor_runner/esp_perf_monitor.cpp new file mode 100644 index 00000000000..1b1a70987b5 --- /dev/null +++ b/examples/espressif/executor_runner/esp_perf_monitor.cpp @@ -0,0 +1,100 @@ +/* Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +#include "esp_perf_monitor.h" + +#if defined(ESP_PLATFORM) + +#include +#include +#include +#include + +namespace { + +uint32_t start_cycle_count = 0; +int64_t start_time_us = 0; + +} // namespace + +void StartMeasurements() { + start_cycle_count = esp_cpu_get_cycle_count(); + start_time_us = esp_timer_get_time(); +} + +void StopMeasurements(int num_inferences) { + uint32_t end_cycle_count = esp_cpu_get_cycle_count(); + int64_t end_time_us = esp_timer_get_time(); + + uint32_t delta_cycles = end_cycle_count - start_cycle_count; + uint64_t total_cycles = static_cast(delta_cycles); + int64_t total_time_us = end_time_us - start_time_us; + + ET_LOG(Info, "Profiler report:"); + ET_LOG(Info, "Number of inferences: %d", num_inferences); + + // Guard against division by zero or invalid counts when computing + // per-inference metrics. + if (num_inferences <= 0) { + ET_LOG( + Info, + "Total CPU cycles: %" PRIu64 " (per-inference metrics not computed)", + total_cycles); + ET_LOG( + Info, + "Total wall time: %" PRId64 " us (per-inference metrics not computed)", + total_time_us); + // Log ESP32 system memory info + ET_LOG( + Info, + "Free heap: %lu bytes", + static_cast(esp_get_free_heap_size())); + ET_LOG( + Info, + "Min free heap ever: %lu bytes", + static_cast(esp_get_minimum_free_heap_size())); + return; + } + + ET_LOG( + Info, + "Total CPU cycles: %" PRIu64 " (%.2f per inference)", + total_cycles, + (double)total_cycles / num_inferences); + ET_LOG( + Info, + "Total wall time: %" PRId64 " us (%.2f us per inference)", + total_time_us, + (double)total_time_us / num_inferences); + ET_LOG( + Info, + "Average inference time: %.3f ms", + (double)total_time_us / num_inferences / 1000.0); + + // Log ESP32 system memory info + ET_LOG( + Info, + "Free heap: %lu bytes", + static_cast(esp_get_free_heap_size())); + ET_LOG( + Info, + "Min free heap ever: %lu bytes", + static_cast(esp_get_minimum_free_heap_size())); +} + +#else // !defined(ESP_PLATFORM) + +// Stub 
implementation for non-ESP builds (e.g. host testing)
+void StartMeasurements() {}
+
+void StopMeasurements(int num_inferences) {
+  (void)num_inferences;
+}
+
+#endif // defined(ESP_PLATFORM)
diff --git a/examples/espressif/executor_runner/esp_perf_monitor.h b/examples/espressif/executor_runner/esp_perf_monitor.h
new file mode 100644
index 00000000000..ccbdb07e331
--- /dev/null
+++ b/examples/espressif/executor_runner/esp_perf_monitor.h
@@ -0,0 +1,18 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+/**
+ * Performance monitoring helpers for Espressif ESP32/ESP32-S3.
+ *
+ * Uses the Xtensa/RISC-V CPU cycle counter (CCOUNT register on Xtensa,
+ * or esp_cpu_get_cycle_count() from ESP-IDF) for timing measurements.
+ */
+
+/// Opens a measurement window: snapshots the CPU cycle counter and the
+/// microsecond wall clock. Pair with StopMeasurements().
+void StartMeasurements();
+/// Closes the window opened by StartMeasurements() and logs total cycles,
+/// total wall time, and per-inference averages for `num_inferences` runs
+/// (per-inference metrics are skipped when num_inferences <= 0).
+/// No-op stubs are provided for non-ESP (host) builds.
+void StopMeasurements(int num_inferences);
diff --git a/examples/espressif/executor_runner/pte_to_header.py b/examples/espressif/executor_runner/pte_to_header.py
new file mode 100644
index 00000000000..0a8935b7a92
--- /dev/null
+++ b/examples/espressif/executor_runner/pte_to_header.py
@@ -0,0 +1,100 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+Converts an ExecuTorch .pte model file to a C header file containing
+the model data as a byte array. This is used to embed the model directly
+into the firmware binary for ESP32/ESP32-S3 targets.
+
+Usage:
+    python pte_to_header.py --pte model.pte [--outdir .]
[--outfile model_pte.h] +""" + +import binascii +import os +from argparse import ArgumentParser, ArgumentTypeError + +bytes_per_line = 32 +hex_digits_per_line = bytes_per_line * 2 + + +def input_file_path(path): + if os.path.exists(path): + return path + else: + raise ArgumentTypeError(f"input filepath:{path} does not exist") + + +parser = ArgumentParser(description="Convert .pte model to C header for ESP32") +parser.add_argument( + "-p", + "--pte", + help="ExecuTorch .pte model file", + type=input_file_path, + required=True, +) +parser.add_argument( + "-d", + "--outdir", + help="Output dir for model header", + type=str, + required=False, + default=".", +) +parser.add_argument( + "-o", + "--outfile", + help="Output filename for model header", + type=str, + required=False, + default="model_pte.h", +) +parser.add_argument( + "-s", + "--section", + help="Section attribute for the data array (use 'none' for no section attribute)", + type=str, + required=False, + default="none", +) + +if __name__ == "__main__": + args = parser.parse_args() + outfile = os.path.join(args.outdir, args.outfile) + + if args.section == "none": + # No section attribute - let the linker/compiler decide placement. + # On ESP32 with PSRAM, the compiler/linker or EXT_RAM_BSS_ATTR + # in the code handles placement. + attr = "__attribute__((aligned(16))) static const unsigned char " + else: + attr = f'__attribute__((section("{args.section}"), aligned(16))) static const unsigned char ' + if not os.path.exists(args.outdir): + os.makedirs(args.outdir) + with open(args.pte, "rb") as fr, open(outfile, "w") as fw: + data = fr.read() + hexstream = binascii.hexlify(data).decode("utf-8") + + fw.write( + "/* Auto-generated model header for ESP32 ExecuTorch runner. 
*/\n" + ) + fw.write(f"/* Source: {os.path.basename(args.pte)} ({len(data)} bytes) */\n\n") + fw.write("#pragma once\n\n") + fw.write(attr + "model_pte[] = {") + + for i in range(0, len(hexstream), 2): + if 0 == (i % hex_digits_per_line): + fw.write("\n") + fw.write("0x" + hexstream[i : i + 2] + ", ") + + fw.write("\n};\n") + fw.flush() + os.fsync(fw.fileno()) + + print( + f"Input: {args.pte} with {len(data)} bytes. " + f"Output: {outfile} with {os.path.getsize(outfile)} bytes." + ) diff --git a/examples/espressif/project/CMakeLists.txt b/examples/espressif/project/CMakeLists.txt new file mode 100644 index 00000000000..b467cb49baa --- /dev/null +++ b/examples/espressif/project/CMakeLists.txt @@ -0,0 +1,29 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# Example ESP-IDF project CMakeLists.txt +# +# This is a template project that uses the executor_runner component. +# Copy this to your own project directory and adjust paths as needed. +# +# Usage: +# cd examples/espressif/project +# idf.py set-target esp32s3 +# idf.py build +# idf.py flash monitor + +cmake_minimum_required(VERSION 3.16) + +# Set the path to ExecuTorch source +set(EXECUTORCH_ROOT "${CMAKE_CURRENT_SOURCE_DIR}/../../.." CACHE PATH "ExecuTorch root") + +# Add the executor_runner as an extra component +set(EXTRA_COMPONENT_DIRS + "${CMAKE_CURRENT_SOURCE_DIR}/../executor_runner" +) + +include($ENV{IDF_PATH}/tools/cmake/project.cmake) +project(executorch_esp_runner) diff --git a/examples/espressif/project/main/CMakeLists.txt b/examples/espressif/project/main/CMakeLists.txt new file mode 100644 index 00000000000..2b2cd9d135a --- /dev/null +++ b/examples/espressif/project/main/CMakeLists.txt @@ -0,0 +1,14 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. 
+# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# Main component CMakeLists.txt for the ESP-IDF project. +# This is a minimal main component that depends on the executor_runner. + +idf_component_register( + SRCS "main.cpp" + INCLUDE_DIRS "." + REQUIRES executor_runner +) diff --git a/examples/espressif/project/main/main.cpp b/examples/espressif/project/main/main.cpp new file mode 100644 index 00000000000..ac446d142f8 --- /dev/null +++ b/examples/espressif/project/main/main.cpp @@ -0,0 +1,37 @@ +/* Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +/* + * Example ESP-IDF main component. + * + * The app_main() defined below performs optional initialization and then + * calls executor_runner_main(). + * + * If you want to customize the runner behavior, you can modify the + * app_main() implementation here (e.g., add initialization or cleanup) + * while still delegating to executor_runner_main(). 
+ */ + + +#include +#include "sdkconfig.h" +#include "freertos/FreeRTOS.h" +#include "freertos/task.h" +#include "esp_system.h" + +extern void executor_runner_main(void); + +extern "C" void app_main(void) { + printf("Starting executorch runner !\n"); + fflush(stdout); + // Custom initialization here + executor_runner_main(); + for (int i = 5; i >= 0; i--) { + vTaskDelay(1000 / portTICK_PERIOD_MS); + } + esp_restart(); +} diff --git a/examples/espressif/project/partitions.csv b/examples/espressif/project/partitions.csv new file mode 100644 index 00000000000..e6d484d3f99 --- /dev/null +++ b/examples/espressif/project/partitions.csv @@ -0,0 +1,5 @@ +# ESP-IDF Partition Table +# Name, Type, SubType, Offset, Size, Flags +nvs,data,nvs,0x9000,24K, +phy_init,data,phy,0xf000,4K, +factory,app,factory,0x10000,2M, diff --git a/examples/espressif/project/sdkconfig.defaults b/examples/espressif/project/sdkconfig.defaults new file mode 100644 index 00000000000..08b09229148 --- /dev/null +++ b/examples/espressif/project/sdkconfig.defaults @@ -0,0 +1,50 @@ +# ESP-IDF sdkconfig defaults for ExecuTorch executor runner +# +# These settings are optimized for running ExecuTorch models on ESP32/ESP32-S3. +# Copy this file as sdkconfig.defaults in your project directory. 
+ +# ─── CPU Frequency ─── +# Run at maximum frequency for best inference performance +CONFIG_ESP_DEFAULT_CPU_FREQ_MHZ_240=y + +# ─── PSRAM (if available) ─── +# Enable PSRAM for larger model support +CONFIG_SPIRAM=y +CONFIG_SPIRAM_MODE_QUAD=y +CONFIG_SPIRAM_SPEED_80M=y +# Allow malloc to fall back to PSRAM when internal RAM is exhausted +CONFIG_SPIRAM_USE_CAPS_ALLOC=y +# Place BSS in PSRAM (for large static buffers) +CONFIG_SPIRAM_ALLOW_BSS_SEG_EXTERNAL_MEMORY=y + +# ─── Memory ─── +# Increase main task stack size for ExecuTorch +CONFIG_ESP_MAIN_TASK_STACK_SIZE=32768 + +# ─── Flash ─── +# Use QIO flash mode for faster flash reads (model data) +CONFIG_ESPTOOLPY_FLASHMODE_QIO=y +CONFIG_ESPTOOLPY_FLASHFREQ_80M=y +# Larger flash size for model data +CONFIG_ESPTOOLPY_FLASHSIZE_8MB=y + +# ─── Optimization ─── +# Optimize for performance +CONFIG_COMPILER_OPTIMIZATION_PERF=y + +# ─── FreeRTOS ─── +# Increase tick rate for finer timing granularity +CONFIG_FREERTOS_HZ=1000 + +# ─── Logging ─── +# Default log level (can be changed at runtime) +CONFIG_LOG_DEFAULT_LEVEL_INFO=y + +# ─── Watchdog ─── +# Disable task watchdog for long-running inference +CONFIG_ESP_TASK_WDT_EN=n + +# ─── Custom partition table to be adjusted for larger builds ─── +CONFIG_PARTITION_TABLE_CUSTOM=y +CONFIG_PARTITION_TABLE_CUSTOM_FILENAME="partitions.csv" +CONFIG_PARTITION_TABLE_FILENAME="partitions.csv" \ No newline at end of file diff --git a/examples/espressif/project/sdkconfig.defaults.esp32s3 b/examples/espressif/project/sdkconfig.defaults.esp32s3 new file mode 100644 index 00000000000..15f9c4eba30 --- /dev/null +++ b/examples/espressif/project/sdkconfig.defaults.esp32s3 @@ -0,0 +1,42 @@ +# ESP-IDF sdkconfig defaults for ESP32-S3 target +# +# ESP32-S3 specific optimizations: +# - Octal PSRAM support (up to 32MB) +# - Dual-core Xtensa LX7 at 240MHz +# - Vector extensions for faster computation + +# ─── CPU ─── +CONFIG_ESP_DEFAULT_CPU_FREQ_MHZ_240=y + +# ─── PSRAM (Octal PSRAM for ESP32-S3) ─── 
+CONFIG_SPIRAM=y +#CONFIG_SPIRAM_MODE_QUAD=y +CONFIG_SPIRAM_MODE_OCT=y +CONFIG_SPIRAM_SPEED_80M=y +CONFIG_SPIRAM_USE_CAPS_ALLOC=y +CONFIG_SPIRAM_ALLOW_BSS_SEG_EXTERNAL_MEMORY=y + +# ─── Memory ─── +CONFIG_ESP_MAIN_TASK_STACK_SIZE=32768 + +# ─── Flash ─── +CONFIG_ESPTOOLPY_FLASHMODE_QIO=y +CONFIG_ESPTOOLPY_FLASHFREQ_80M=y +CONFIG_ESPTOOLPY_FLASHSIZE_8MB=y + +# ─── Optimization ─── +CONFIG_COMPILER_OPTIMIZATION_PERF=y + +# ─── FreeRTOS ─── +CONFIG_FREERTOS_HZ=1000 + +# ─── Watchdog ─── +CONFIG_ESP_TASK_WDT_EN=n + +# ─── Logging ─── +CONFIG_LOG_DEFAULT_LEVEL_INFO=y + +# ─── Custom partition table to be adjusted for larger builds ─── +CONFIG_PARTITION_TABLE_CUSTOM=y +CONFIG_PARTITION_TABLE_CUSTOM_FILENAME="partitions.csv" +CONFIG_PARTITION_TABLE_FILENAME="partitions.csv" \ No newline at end of file diff --git a/extension/threadpool/threadpool.cpp b/extension/threadpool/threadpool.cpp index a15a2572669..1928892efe6 100644 --- a/extension/threadpool/threadpool.cpp +++ b/extension/threadpool/threadpool.cpp @@ -145,7 +145,7 @@ ThreadPool* get_threadpool() { * tricky to detect if we are running under tsan, for now capping the * default threadcount to the tsan limit unconditionally. */ - constexpr unsigned int tsan_thread_limit = 63; + constexpr decltype(result) tsan_thread_limit = 63; return std::min(result, tsan_thread_limit); })(); diff --git a/tools/cmake/preset/esp_baremetal.cmake b/tools/cmake/preset/esp_baremetal.cmake new file mode 100644 index 00000000000..cf86d5efc79 --- /dev/null +++ b/tools/cmake/preset/esp_baremetal.cmake @@ -0,0 +1,21 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+
+set(CMAKE_INSTALL_PREFIX "${CMAKE_BINARY_DIR}")
+
+# Slim down the core build for a bare-metal target: no host executor runner
+# and no file-backed loaders (the model is linked into the firmware image).
+set_overridable_option(EXECUTORCH_BUILD_EXECUTOR_RUNNER OFF)
+set_overridable_option(EXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR OFF)
+set_overridable_option(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER OFF)
+set_overridable_option(EXECUTORCH_BUILD_KERNELS_QUANTIZED ON)
+set_overridable_option(EXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL ON)
+set_overridable_option(EXECUTORCH_ENABLE_LOGGING ON)
+# NOTE(review): option name is Arm-specific but reused here to gate ETDump
+# support -- confirm whether an ESP-specific flag is intended.
+set_overridable_option(EXECUTORCH_BUILD_ARM_ETDUMP OFF)
+
+# Reference the variable directly: if("${VAR}") forces string re-evaluation
+# and is fragile under CMP0054 quoting rules; if(VAR) is the idiomatic form.
+if(EXECUTORCH_BUILD_ARM_ETDUMP)
+  # ETDump needs devtools + event tracing; flatcc emits warnings under -Werror.
+  set(EXECUTORCH_BUILD_DEVTOOLS ON)
+  set(EXECUTORCH_ENABLE_EVENT_TRACER ON)
+  set(FLATCC_ALLOW_WERROR OFF)
+else()
+  set(EXECUTORCH_ENABLE_EVENT_TRACER OFF)
+endif()