diff --git a/CMakePresets.json b/CMakePresets.json index 4d8b70f08b2..c8fba2b6a41 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -309,6 +309,14 @@ "EXECUTORCH_BUILD_PRESET_FILE": "${sourceDir}/tools/cmake/preset/arm_ethosu_linux.cmake", "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/examples/arm/ethos-u-setup/aarch64-linux-musl-toolchain.cmake" } + }, + { + "name": "esp-baremetal", + "displayName": "Build ExecuTorch for ESP baremetal", + "inherits": ["common"], + "cacheVariables": { + "EXECUTORCH_BUILD_PRESET_FILE": "${sourceDir}/tools/cmake/preset/esp_baremetal.cmake" + } } ], "buildPresets": [ diff --git a/examples/espressif/README.md b/examples/espressif/README.md new file mode 100644 index 00000000000..5c345b4d98f --- /dev/null +++ b/examples/espressif/README.md @@ -0,0 +1,278 @@ +# ExecuTorch Executor Runner for Espressif ESP32/ESP32-S3 + +> **:warning: **This example is not tested in CI. Use at your own risk.**** + +This example demonstrates how to run an ExecuTorch model on Espressif ESP32 and +ESP32-S3 microcontrollers. It is based on the +[Arm Cortex-M executor runner](../arm/executor_runner/) and adapted for the +ESP-IDF build system and ESP32 memory architecture. + +## Supported Targets + +| Chip | CPU | Internal SRAM | PSRAM (optional) | +|----------|---------------|---------------|------------------| +| ESP32 | Xtensa LX6 (dual-core, 240MHz) | ~520KB | 4-8MB | +| ESP32-S3 | Xtensa LX7 (dual-core, 240MHz) | ~512KB | 2-32MB (Octal) | + +## Prerequisites + +1. **ESP-IDF v5.1+**: Install the ESP-IDF toolchain following the + [official guide](https://docs.espressif.com/projects/esp-idf/en/stable/esp32/get-started/). + +2. **ExecuTorch**: Clone and set up ExecuTorch: + ```bash + git clone https://github.com/pytorch/executorch.git + cd executorch + pip install -e . + ``` + +3. **Cross-compiled ExecuTorch libraries**: Build ExecuTorch for the ESP32 + target. See the [Cross-Compilation](#cross-compiling-executorch) section. + +4. 
**A .pte model file**: Export a PyTorch model to the ExecuTorch `.pte` + format. For small models suitable for ESP32, consider: + - A simple add/multiply model + - MobileNet V2 (quantized, with PSRAM) + - Custom small models + +## Project Structure + +``` +examples/espressif/ +├── README.md # This file +├── build.sh # Build helper script +├── executor_runner/ +│ ├── CMakeLists.txt # Component/standalone CMake build +│ ├── esp_executor_runner.cpp # Main executor runner +│ ├── esp_memory_allocator.h # Custom memory allocator +│ ├── esp_memory_allocator.cpp +│ ├── esp_perf_monitor.h # Performance monitoring +│ ├── esp_perf_monitor.cpp +│ └── pte_to_header.py # Convert .pte to C header +└── project/ + ├── CMakeLists.txt # ESP-IDF project file + ├── sdkconfig.defaults # Default ESP-IDF configuration + ├── sdkconfig.defaults.esp32s3 # ESP32-S3 specific config + ├── partitions.csv # Example partition table; adjust app partition size for your board and model + └── main/ + ├── CMakeLists.txt # Main component + └── main.cpp # Entry point +``` + +## Quick Start + +The following example has been tested only on an ESP32-S3 dev board with 8 MB of Octal PSRAM. You may need to adjust the `sdkconfig` file for your specific board. + +### 1. Export a simple model + +```python +import torch +from executorch.exir import to_edge + +class SimpleModel(torch.nn.Module): + def forward(self, x): + return x + x + +model = SimpleModel() +example_input = (torch.randn(1, 8),) + +# Export to ExecuTorch +exported = torch.export.export(model, example_input) +edge = to_edge(exported) +et_program = edge.to_executorch() + +with open("simple_add.pte", "wb") as f: + f.write(et_program.buffer) +``` + +### 2. Convert the model to a C header + +```bash +python3 examples/espressif/executor_runner/pte_to_header.py \ + --pte simple_add.pte \ + --outdir examples/espressif/project/ +``` + +### 3. Build with ESP-IDF + +```bash +# Source ESP-IDF environment +. 
$IDF_PATH/export.sh + +# Using the build script: +./examples/espressif/build.sh --target esp32s3 --pte simple_add.pte + +# Or manually: +cd examples/espressif/project +idf.py set-target esp32s3 +idf.py build +``` + +### 4. Flash and Monitor + +```bash +cd examples/espressif/project +idf.py -p /dev/ttyUSB0 flash monitor +``` + +You should see output like: +``` +Starting executorch runner ! +I [executorch:esp_executor_runner.cpp:237 et_pal_init()] ESP32 ExecuTorch runner initialized. Free heap: 6097812 bytes. +I [executorch:esp_executor_runner.cpp:242 et_pal_init()] PSRAM available. Free PSRAM: 5764716 bytes. +I [executorch:esp_executor_runner.cpp:1047 executor_runner_main()] PTE @ 0x3c05f9f0 [----ET12] +I [executorch:esp_executor_runner.cpp:568 runner_init()] PTE Model data loaded. Size: 952 bytes. +I [executorch:esp_executor_runner.cpp:583 runner_init()] Model buffer loaded, has 1 methods +I [executorch:esp_executor_runner.cpp:593 runner_init()] Running method forward +I [executorch:esp_executor_runner.cpp:604 runner_init()] Setup Method allocator pool. Size: 2097152 bytes. +I [executorch:esp_executor_runner.cpp:620 runner_init()] Setting up planned buffer 0, size 64. +I [executorch:esp_executor_runner.cpp:716 runner_init()] Method 'forward' loaded. +I [executorch:esp_executor_runner.cpp:718 runner_init()] Preparing inputs... +I [executorch:esp_executor_runner.cpp:780 runner_init()] Input prepared. +I [executorch:esp_executor_runner.cpp:979 run_model()] Starting running 1 inferences... 
+I [executorch:esp_perf_monitor.cpp:41 StopMeasurements()] Profiler report: +I [executorch:esp_perf_monitor.cpp:42 StopMeasurements()] Number of inferences: 1 +I [executorch:esp_perf_monitor.cpp:43 StopMeasurements()] Total CPU cycles: 49545 (49545.00 per inference) +I [executorch:esp_perf_monitor.cpp:48 StopMeasurements()] Total wall time: 205 us (205.00 us per inference) +I [executorch:esp_perf_monitor.cpp:53 StopMeasurements()] Average inference time: 0.205 ms +I [executorch:esp_perf_monitor.cpp:59 StopMeasurements()] Free heap: 6097576 bytes +I [executorch:esp_perf_monitor.cpp:63 StopMeasurements()] Min free heap ever: 6097576 bytes +I [executorch:esp_executor_runner.cpp:999 run_model()] 1 inferences finished +I [executorch:esp_executor_runner.cpp:867 print_outputs()] 1 outputs: +Output[0][0]: (float) 2.000000 +Output[0][1]: (float) 2.000000 +Output[0][2]: (float) 2.000000 +Output[0][3]: (float) 2.000000 +Output[0][4]: (float) 2.000000 +Output[0][5]: (float) 2.000000 +Output[0][6]: (float) 2.000000 +Output[0][7]: (float) 2.000000 + +``` + +## Cross-Compiling ExecuTorch + +ExecuTorch needs to be cross-compiled for the ESP32 target (Xtensa architecture). + +### Using the ESP-IDF toolchain + +```bash +# Set up the cross-compilation toolchain +export IDF_TARGET=esp32s3 # or esp32 + +# Configure ExecuTorch build for ESP32 +#Make sure to adjust the list of ops for your model or alter to use one of the selective build methods +cmake --preset esp-baremetal -B cmake-out-esp \ + -DCMAKE_TOOLCHAIN_FILE=$IDF_PATH/tools/cmake/toolchain-${IDF_TARGET}.cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -DEXECUTORCH_BUILD_DEVTOOLS=ON \ + -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=OFF \ + -DEXECUTORCH_SELECT_OPS_LIST="aten::add.out," \ + . 
+ +cmake --build cmake-out-esp -j$(nproc) +cmake --build cmake-out-esp --target install +``` + +## Memory Considerations + +### ESP32 (no PSRAM) +- Total available SRAM: ~520KB (shared between code and data) +- Recommended method allocator pool: 128-256KB +- Recommended scratch pool: 64-128KB +- **Only very small models will fit!** + +### ESP32 / ESP32-S3 with PSRAM +- Internal SRAM: ~512KB (used for code and fast data) +- PSRAM: 2-32MB (used for model data and large buffers) +- Recommended method allocator pool: 1-4MB +- Recommended scratch pool: 256KB-1MB + +### Configuring Memory Pools + +Memory pool sizes auto-adjust based on PSRAM availability. Override with: + +```cmake +# In your project CMakeLists.txt or via idf.py menuconfig +set(ET_ESP_METHOD_ALLOCATOR_POOL_SIZE "1048576") # 1MB +set(ET_ESP_SCRATCH_TEMP_ALLOCATOR_POOL_SIZE "524288") # 512KB +``` + +Or as compile definitions: +```bash +idf.py build -DET_ESP_METHOD_ALLOCATOR_POOL_SIZE=1048576 +``` + +## Loading Models + +### Compiled-in (default) +The model `.pte` file is converted to a C array and compiled into the firmware. +This is the simplest approach but increases firmware size. + +### Filesystem (SPIFFS/LittleFS) +For larger models, load from the filesystem at runtime: + +1. Add `-DFILESYSTEM_LOAD=ON` to your build +2. 
Create a SPIFFS partition with your model: + ```bash + # Add to partitions.csv: + # storage, data, spiffs, , 0x200000 + + # Create and flash SPIFFS image: + $IDF_PATH/components/spiffs/spiffsgen.py 0x200000 model_dir spiffs.bin + esptool.py write_flash 0x210000 spiffs.bin + ``` + +## Configuration Options + +| Option | Default | Description | +|--------|---------|-------------| +| `ET_NUM_INFERENCES` | 1 | Number of inference runs | +| `ET_LOG_DUMP_INPUT` | OFF | Log input tensor values | +| `ET_LOG_DUMP_OUTPUT` | ON | Log output tensor values | +| `ET_BUNDLE_IO` | OFF | Enable BundleIO test support | +| `ET_EVENT_TRACER_ENABLED` | OFF | Enable ETDump profiling | +| `FILESYSTEM_LOAD` | OFF | Load model from filesystem | +| `ET_ESP_METHOD_ALLOCATOR_POOL_SIZE` | Auto | Method allocator size | +| `ET_ESP_SCRATCH_TEMP_ALLOCATOR_POOL_SIZE` | Auto | Scratch allocator size | + +## Differences from the Arm Example + +| Feature | Arm (Cortex-M) | ESP32/ESP32-S3 | +|---------|----------------|----------------| +| Build system | Bare-metal CMake + Arm toolchain | ESP-IDF (FreeRTOS-based) | +| NPU | Ethos-U55/U65/U85 | None (CPU only) | +| Memory | ITCM/DTCM/SRAM/DDR via linker script | IRAM/DRAM/PSRAM via ESP-IDF | +| Performance monitor | ARM PMU + Ethos-U PMU | CPU cycle counter + esp_timer | +| Semihosting | FVP simulator filesystem access | SPIFFS/LittleFS/SD filesystem | +| Entry point | `main()` bare-metal | `app_main()` via FreeRTOS | +| Timing | ARM_PMU_Get_CCNTR() | esp_cpu_get_cycle_count() | + +## Troubleshooting + +### Model too large for flash +- Use filesystem loading (`FILESYSTEM_LOAD=ON`) with SPIFFS or SD card +- Quantize the model to reduce size +- Use a simpler/smaller model architecture + +### Out of memory during inference +- Enable PSRAM if your board has it (`CONFIG_SPIRAM=y`) +- Increase memory pool sizes +- Use a smaller model +- Check `log_mem_status()` output for memory usage details + +### Build errors with ExecuTorch libraries +- Ensure ExecuTorch 
was cross-compiled with the same ESP-IDF toolchain +- Check that `ET_BUILD_DIR_PATH` points to the correct build directory +- Verify the target architecture matches (Xtensa LX6 for ESP32, LX7 for ESP32-S3) + +### Watchdog timer resets +- Long inference times may trigger the task watchdog +- Disable with `CONFIG_ESP_TASK_WDT_EN=n` in sdkconfig +- Or increase the timeout: `CONFIG_ESP_TASK_WDT_TIMEOUT_S=30` + +## License + +This project is licensed under the BSD-style license found in the +[LICENSE](../../../LICENSE) file in the root directory of the ExecuTorch +source tree. diff --git a/examples/espressif/build.sh b/examples/espressif/build.sh new file mode 100755 index 00000000000..fd23aa0d7c2 --- /dev/null +++ b/examples/espressif/build.sh @@ -0,0 +1,110 @@ +#!/bin/bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# Build script for the ExecuTorch ESP32 executor runner example. +# +# Prerequisites: +# - ESP-IDF v5.1+ installed and sourced (. $IDF_PATH/export.sh) +# - ExecuTorch cross-compiled for the ESP32 target +# - Python 3.8+ +# +# Usage: +# ./build.sh [--target esp32|esp32s3] [--pte ] [--clean] + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +ET_ROOT="$(cd "${SCRIPT_DIR}/../.." 
&& pwd)" +PROJECT_DIR="${SCRIPT_DIR}/project" +TARGET="esp32s3" +PTE_FILE="" +CLEAN=false + +# Parse arguments +while [[ $# -gt 0 ]]; do + case "$1" in + --target) + TARGET="$2" + shift 2 + ;; + --pte) + PTE_FILE="$2" + shift 2 + ;; + --clean) + CLEAN=true + shift + ;; + --help|-h) + echo "Usage: $0 [--target esp32|esp32s3] [--pte ] [--clean]" + echo "" + echo "Options:" + echo " --target ESP32 target chip (default: esp32s3)" + echo " --pte Path to the .pte model file to embed" + echo " --clean Clean build directory before building" + exit 0 + ;; + *) + echo "Unknown option: $1" + exit 1 + ;; + esac +done + +# Validate environment +if [ -z "${IDF_PATH:-}" ]; then + echo "ERROR: IDF_PATH is not set. Please source ESP-IDF:" + echo " . \$IDF_PATH/export.sh" + exit 1 +fi + +echo "=== ExecuTorch ESP32 Executor Runner Build ===" +echo "Target: ${TARGET}" +echo "ExecuTorch root: ${ET_ROOT}" +echo "ESP-IDF: ${IDF_PATH}" + +# Convert PTE to header if provided +if [ -n "${PTE_FILE}" ]; then + if [ ! -f "${PTE_FILE}" ]; then + echo "ERROR: PTE file not found: ${PTE_FILE}" + exit 1 + fi + + echo "Converting PTE to header: ${PTE_FILE}" + HEADER_DIR="${PROJECT_DIR}" + mkdir -p "${HEADER_DIR}" + python3 "${SCRIPT_DIR}/executor_runner/pte_to_header.py" \ + --pte "${PTE_FILE}" \ + --outdir "${HEADER_DIR}" + echo "Model header generated: ${HEADER_DIR}/model_pte.h" +fi + +# Navigate to project directory +cd "${PROJECT_DIR}" + +# Clean if requested +if [ "${CLEAN}" = true ]; then + echo "Cleaning build directory..." + rm -rf build sdkconfig +fi +# Set target +echo "Setting target to ${TARGET}..." +idf.py set-target "${TARGET}" + +# Build +echo "Building..." 
+idf.py build + +echo "" +echo "=== Build complete ===" +echo "" +echo "To flash and monitor:" +echo " cd ${PROJECT_DIR}" +echo " idf.py -p /dev/ttyUSB0 flash monitor" +echo "" +echo "To just monitor:" +echo " idf.py -p /dev/ttyUSB0 monitor" diff --git a/examples/espressif/executor_runner/CMakeLists.txt b/examples/espressif/executor_runner/CMakeLists.txt new file mode 100644 index 00000000000..63d701d38f1 --- /dev/null +++ b/examples/espressif/executor_runner/CMakeLists.txt @@ -0,0 +1,305 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# ESP-IDF component CMakeLists.txt for the ExecuTorch executor runner. +# +# This file defines the executor_runner as an ESP-IDF component. +# It is designed to work with the ESP-IDF build system (idf.py build). +# +# Project structure expected: +# my_project/ +# ├── CMakeLists.txt (project-level, uses this as a component) +# ├── main/ +# │ └── CMakeLists.txt (main component, depends on executor_runner) +# └── components/ +# └── executor_runner/ (this component - symlink or copy) +# +# Or you can use this CMakeLists.txt directly as a standalone CMake build +# for cross-compilation testing. + +cmake_minimum_required(VERSION 3.16) + +# ─── Option: ESP-IDF component mode vs. standalone CMake mode ─── +if(ESP_PLATFORM) + # ═══════════════════════════════════════════════════════════════ + # ESP-IDF Component Build + # ═══════════════════════════════════════════════════════════════ + idf_component_register( + SRCS + "esp_executor_runner.cpp" + "esp_pal.cpp" + "esp_memory_allocator.cpp" + "esp_perf_monitor.cpp" + INCLUDE_DIRS + "." + REQUIRES + esp_timer + esp_system + spiffs + ) + + # ExecuTorch pre-built library paths + set(ET_DIR_PATH + "${CMAKE_CURRENT_SOURCE_DIR}/../../.." 
+ CACHE PATH "Path to ExecuTorch source dir" + ) + set(ET_BUILD_DIR_PATH + "${ET_DIR_PATH}/cmake-out-esp" + CACHE PATH "Path to ExecuTorch build/install dir for ESP target" + ) + set(ET_PTE_FILE_PATH + "" + CACHE PATH "Path to ExecuTorch model .pte file" + ) + set(PYTHON_EXECUTABLE + "python3" + CACHE PATH "Python executable" + ) + + set(ET_NUM_INFERENCES + "10" + CACHE STRING "Number of inferences to run" + ) + option(ET_LOG_DUMP_INPUT "Dump input in log" OFF) + option(ET_LOG_DUMP_OUTPUT "Dump output in log" ON) + option(ET_BUNDLE_IO "Set to compile in BundleIO support" OFF) + set(ET_ATOL "0.01" CACHE STRING "Absolute tolerance for BundleIO testing") + set(ET_RTOL "0.01" CACHE STRING "Relative tolerance for BundleIO testing") + option(ET_DUMP_OUTPUTS "Collect and print outputs as base64 in log" OFF) + option(ET_DUMP_INTERMEDIATE_OUTPUTS "Collect and print intermediate outputs" OFF) + set(ET_DEBUG_BUFFER_SIZE "65536" CACHE STRING "Size of ETDump debug buffer") + option(FILESYSTEM_LOAD "Load model from filesystem instead of compiled-in data" OFF) + + # Directory containing the generated model_pte.h header. + # By default this is the project source directory (where build.sh places it), + # but it can be overridden if you generate the header elsewhere. + set(ET_MODEL_HEADER_DIR + "${CMAKE_SOURCE_DIR}" + CACHE PATH "Directory containing the generated model_pte.h header" + ) + + # Memory pool sizes + set(ET_ESP_METHOD_ALLOCATOR_POOL_SIZE "" CACHE STRING + "Method allocator pool size (empty = auto based on PSRAM availability)") + set(ET_ESP_SCRATCH_TEMP_ALLOCATOR_POOL_SIZE "" CACHE STRING + "Scratch temp allocator pool size (empty = auto based on PSRAM availability)") + + # Find pre-built ExecuTorch libraries. + # TARGETS_GLOBAL is needed because ESP-IDF's project.cmake resolves link + # dependencies from the top-level project scope, but find_package runs + # inside this component's directory scope. 
Without GLOBAL, the imported + # targets (executorch, portable_kernels, etc.) are invisible at the + # project level and you get "No target executorch" errors. + set(CMAKE_FIND_PACKAGE_TARGETS_GLOBAL TRUE) + find_package( + executorch REQUIRED HINTS "${ET_BUILD_DIR_PATH}/lib/cmake/ExecuTorch" + ) + + # Convert pte to header if not using filesystem loading + if(NOT FILESYSTEM_LOAD AND ET_PTE_FILE_PATH) + add_custom_target( + gen_model_header DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/model_pte.h + ) + add_custom_command( + OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/model_pte.h + COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/pte_to_header.py + --pte ${ET_PTE_FILE_PATH} + --outdir ${CMAKE_CURRENT_BINARY_DIR} + DEPENDS ${ET_PTE_FILE_PATH} + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + ) + add_dependencies(${COMPONENT_LIB} gen_model_header) + endif() + + # Include directories + target_include_directories( + ${COMPONENT_LIB} + PRIVATE + ${ET_DIR_PATH}/.. + ${ET_DIR_PATH}/runtime/core/portable_type/c10 + ${CMAKE_CURRENT_BINARY_DIR} + ${ET_MODEL_HEADER_DIR} + ) + + # Link ExecuTorch libraries + set(esp_runner_libs) + list(APPEND esp_runner_libs + extension_runner_util + executorch + executorch_selected_kernels + ) + + if(TARGET xnnpack_backend) + list(APPEND esp_runner_libs xnnpack_backend) + endif() + + if(EXECUTORCH_ENABLE_EVENT_TRACER) + target_compile_definitions(${COMPONENT_LIB} PUBLIC ET_EVENT_TRACER_ENABLED) + list(APPEND esp_runner_libs etdump flatccrt) + endif() + + if(ET_BUNDLE_IO) + list(APPEND esp_runner_libs bundled_program) + endif() + + target_link_libraries(${COMPONENT_LIB} PUBLIC ${esp_runner_libs}) + + # Compile definitions + target_compile_definitions( + ${COMPONENT_LIB} PRIVATE C10_USING_CUSTOM_GENERATED_MACROS + ) + + if(ET_NUM_INFERENCES) + target_compile_definitions( + ${COMPONENT_LIB} PUBLIC ET_NUM_INFERENCES=${ET_NUM_INFERENCES} + ) + endif() + + if(ET_LOG_DUMP_INPUT) + target_compile_definitions(${COMPONENT_LIB} PUBLIC ET_LOG_DUMP_INPUT) + endif() 
+ + if(ET_LOG_DUMP_OUTPUT) + target_compile_definitions(${COMPONENT_LIB} PUBLIC ET_LOG_DUMP_OUTPUT) + endif() + + if(ET_BUNDLE_IO) + target_compile_definitions(${COMPONENT_LIB} PUBLIC ET_BUNDLE_IO) + endif() + + if(ET_ATOL) + target_compile_definitions(${COMPONENT_LIB} PUBLIC ET_ATOL=${ET_ATOL}) + endif() + + if(ET_RTOL) + target_compile_definitions(${COMPONENT_LIB} PUBLIC ET_RTOL=${ET_RTOL}) + endif() + + if(ET_DUMP_OUTPUTS) + target_compile_definitions(${COMPONENT_LIB} PUBLIC ET_DUMP_OUTPUTS) + endif() + + if(ET_DUMP_INTERMEDIATE_OUTPUTS) + target_compile_definitions( + ${COMPONENT_LIB} PUBLIC ET_DUMP_INTERMEDIATE_OUTPUTS + ) + endif() + + if(ET_DEBUG_BUFFER_SIZE) + target_compile_definitions( + ${COMPONENT_LIB} PUBLIC ET_DEBUG_BUFFER_SIZE=${ET_DEBUG_BUFFER_SIZE} + ) + endif() + + if(FILESYSTEM_LOAD) + target_compile_definitions(${COMPONENT_LIB} PUBLIC FILESYSTEM_LOAD) + endif() + + if(ET_ESP_METHOD_ALLOCATOR_POOL_SIZE) + target_compile_definitions( + ${COMPONENT_LIB} + PUBLIC ET_ESP_METHOD_ALLOCATOR_POOL_SIZE=${ET_ESP_METHOD_ALLOCATOR_POOL_SIZE} + ) + endif() + + if(ET_ESP_SCRATCH_TEMP_ALLOCATOR_POOL_SIZE) + target_compile_definitions( + ${COMPONENT_LIB} + PUBLIC ET_ESP_SCRATCH_TEMP_ALLOCATOR_POOL_SIZE=${ET_ESP_SCRATCH_TEMP_ALLOCATOR_POOL_SIZE} + ) + endif() + +else() + # ═══════════════════════════════════════════════════════════════ + # Standalone CMake Build (for host testing / cross-compilation) + # ═══════════════════════════════════════════════════════════════ + project(esp_executor_runner) + + set(ET_DIR_PATH + "${CMAKE_CURRENT_SOURCE_DIR}/../../.." + CACHE PATH "Path to ExecuTorch dir" + ) + include(${ET_DIR_PATH}/tools/cmake/Utils.cmake) + set(ET_BUILD_DIR_PATH + "${ET_DIR_PATH}/cmake-out" + CACHE PATH "Path to ExecuTorch build/install dir" + ) + set(ET_INCLUDE_PATH + "${ET_DIR_PATH}/.." 
+ CACHE PATH "Path to ExecuTorch headers" + ) + set(ET_PTE_FILE_PATH + "" + CACHE PATH "Path to ExecuTorch model pte" + ) + set(PYTHON_EXECUTABLE + "python3" + CACHE PATH "Python executable" + ) + + set(ET_NUM_INFERENCES "1" CACHE STRING "Number of inferences to run") + option(ET_LOG_DUMP_OUTPUT "Dump output in log" ON) + + if(NOT DEFINED ET_PTE_FILE_PATH OR ET_PTE_FILE_PATH STREQUAL "") + message(FATAL_ERROR "ET_PTE_FILE_PATH must be set to the .pte model file") + endif() + + find_package( + executorch REQUIRED HINTS "${ET_BUILD_DIR_PATH}/lib/cmake/ExecuTorch" + ) + + # Convert pte to header + add_custom_target( + gen_model_header DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/model_pte.h + ) + add_custom_command( + OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/model_pte.h + COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/pte_to_header.py --pte + ${ET_PTE_FILE_PATH} --outdir ${CMAKE_CURRENT_BINARY_DIR} + DEPENDS ${ET_PTE_FILE_PATH} + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + ) + + add_executable(esp_executor_runner) + target_sources( + esp_executor_runner PRIVATE + esp_executor_runner.cpp + esp_pal.cpp + esp_perf_monitor.cpp + esp_memory_allocator.cpp + ) + + target_link_libraries( + esp_executor_runner PUBLIC + extension_runner_util + executorch + portable_kernels + ) + + target_include_directories( + esp_executor_runner + PRIVATE + ${ET_INCLUDE_PATH} + ${ET_DIR_PATH}/runtime/core/portable_type/c10 + ${CMAKE_CURRENT_BINARY_DIR} + ) + + target_compile_definitions( + esp_executor_runner PRIVATE C10_USING_CUSTOM_GENERATED_MACROS + ) + + if(ET_NUM_INFERENCES) + target_compile_definitions( + esp_executor_runner PUBLIC ET_NUM_INFERENCES=${ET_NUM_INFERENCES} + ) + endif() + + if(ET_LOG_DUMP_OUTPUT) + target_compile_definitions(esp_executor_runner PUBLIC ET_LOG_DUMP_OUTPUT) + endif() + + add_dependencies(esp_executor_runner gen_model_header) +endif() diff --git a/examples/espressif/executor_runner/esp_executor_runner.cpp b/examples/espressif/executor_runner/esp_executor_runner.cpp new 
file mode 100644 index 00000000000..6b95e16b768 --- /dev/null +++ b/examples/espressif/executor_runner/esp_executor_runner.cpp @@ -0,0 +1,1240 @@ +/* Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +/* This is an example ExecuTorch runner for Espressif ESP32 and ESP32-S3 chips. + * It is inspired by the Arm Cortex-M example runner and adapted for the + * ESP-IDF build system and ESP32 memory architecture. + * + * Some defines used to configure the code: + * + * ET_ESP_METHOD_ALLOCATOR_POOL_SIZE - Size of memory area used when + * setting up the model. + * ET_ESP_SCRATCH_TEMP_ALLOCATOR_POOL_SIZE - Size of memory area used when + * running inferences (scratch). + * ET_NUM_INFERENCES - Number of times to run the inference. + * ET_LOG_DUMP_INPUT - Control if you want input to be dumped to the log. + * ET_LOG_DUMP_OUTPUT - Control if you want output to be dumped to the log. + * + * Devtool BundleIO: Use Bundle PTE with input and reference output included + * to check if it matches. + * + * ET_BUNDLE_IO - Build in Devtools BundleIO support. Makes it possible + * to use bpte with bundled input and output ref data. + * ET_ATOL - The atol used to compare output and ref data. + * ET_RTOL - The rtol used to compare output and ref data. + * + * Devtools ETDump: Speed and dumping output + * + * ET_EVENT_TRACER_ENABLED - Build in Devtools ETDump event trace code + * to generate cycle data. + * ET_DUMP_OUTPUTS - Collect and print outputs as a base64 + * buffer in the log. + * ET_DUMP_INTERMEDIATE_OUTPUTS - Collect and print intermediate outputs. + * ET_DEBUG_BUFFER_SIZE - Override size of memory area used by + * ET_DUMP_OUTPUTS / + * ET_DUMP_INTERMEDIATE_OUTPUTS. + * + * ESP32 Memory Notes: + * - ESP32 has ~520KB internal SRAM, optionally 4-8MB PSRAM. 
+ * - ESP32-S3 has ~512KB internal SRAM, optionally 2-32MB PSRAM (octal). + * - For larger models, PSRAM is required. Memory pools are placed in + * PSRAM when available using EXT_RAM_BSS_ATTR. + * - The model .pte data is converted to a C array and compiled in, + * or can be loaded from SPIFFS/LittleFS/SD card filesystem. + * + * FILESYSTEM_LOAD - When defined, the runner will load the .pte model + * from the filesystem (SPIFFS/LittleFS/SD) instead of + * compiled-in data. Useful for larger models that don't + * fit in flash as a C array. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "esp_executor_runner.h" +#include "esp_memory_allocator.h" +#include "esp_perf_monitor.h" + +#if defined(ESP_PLATFORM) +#include +#include +#include +#include +#include +#endif + +#if defined(ET_BUNDLE_IO) +#include +#endif + +#if defined(ET_EVENT_TRACER_ENABLED) +#include + +#if defined(ET_DUMP_INTERMEDIATE_OUTPUTS) || defined(ET_DUMP_OUTPUTS) +#include + +#if !defined(ET_DEBUG_BUFFER_SIZE) +#define ET_DEBUG_BUFFER_SIZE (64 * 1024) +#endif + +#endif // ET_DUMP_INTERMEDIATE_OUTPUTS || ET_DUMP_OUTPUTS + +#endif // ET_EVENT_TRACER_ENABLED + +#if defined(FILESYSTEM_LOAD) +#include +#if defined(ESP_PLATFORM) +#include +#endif +#else +/* When not loading from filesystem, include the model as a compiled-in + * C array. This header is generated by the build process from the .pte file + * specified in ET_PTE_FILE_PATH. 
*/ +#include "model_pte.h" +#endif + +using executorch::aten::ScalarType; +using executorch::aten::Tensor; +using executorch::extension::BufferDataLoader; +using executorch::runtime::Error; +using executorch::runtime::EValue; +using executorch::runtime::HierarchicalAllocator; +using executorch::runtime::MemoryAllocator; +using executorch::runtime::MemoryManager; +using executorch::runtime::Method; +using executorch::runtime::MethodMeta; +using executorch::runtime::Program; +using executorch::runtime::Result; +using executorch::runtime::Span; +using executorch::runtime::Tag; +using executorch::runtime::TensorInfo; +using executorch::runtime::toString; + +#if defined(ET_BUNDLE_IO) +using executorch::bundled_program::compute_method_output_error_stats; +using executorch::bundled_program::ErrorStats; +using executorch::bundled_program::verify_method_outputs; +#endif + +#if defined(ET_EVENT_TRACER_ENABLED) +using executorch::etdump::BufferDataSink; +using executorch::etdump::ETDumpGen; +using executorch::etdump::ETDumpResult; +using executorch::runtime::EventTracerDebugLogLevel; +using torch::executor::etdump_result; +#endif + +/** + * Memory pool sizes for the ExecuTorch runtime. + * + * ESP32: ~520KB internal SRAM total. With PSRAM: 4-8MB external. + * ESP32-S3: ~512KB internal SRAM total. With PSRAM: 2-32MB external. + * + * For models that fit in internal SRAM, use smaller pool sizes. + * For larger models, enable PSRAM and increase these values. + * + * Default: 256KB method allocator, 128KB scratch (suitable for small models). + * With PSRAM: These can be increased significantly. 
+ */ +#if !defined(ET_ESP_METHOD_ALLOCATOR_POOL_SIZE) +#if defined(CONFIG_SPIRAM) +/* With PSRAM available, use larger pools */ +#define ET_ESP_METHOD_ALLOCATOR_POOL_SIZE (2 * 1024 * 1024) +#else +/* Internal SRAM only - conservative defaults */ +#define ET_ESP_METHOD_ALLOCATOR_POOL_SIZE (256 * 1024) +#endif +#endif + +#if !defined(ET_ESP_SCRATCH_TEMP_ALLOCATOR_POOL_SIZE) +#if defined(CONFIG_SPIRAM) +#define ET_ESP_SCRATCH_TEMP_ALLOCATOR_POOL_SIZE (512 * 1024) +#else +#define ET_ESP_SCRATCH_TEMP_ALLOCATOR_POOL_SIZE (128 * 1024) +#endif +#endif + +/** + * Memory pool placement. + * On ESP32 with PSRAM, place large buffers in external RAM. + * EXT_RAM_BSS_ATTR places the buffer in PSRAM .bss section. + */ +#if defined(CONFIG_SPIRAM) && defined(ESP_PLATFORM) +#include +// Use PSRAM for large allocations +static const size_t method_allocation_pool_size = + ET_ESP_METHOD_ALLOCATOR_POOL_SIZE; +static uint8_t __attribute__((aligned(16))) +method_allocation_pool[ET_ESP_METHOD_ALLOCATOR_POOL_SIZE] EXT_RAM_BSS_ATTR; + +static const size_t temp_allocation_pool_size = + ET_ESP_SCRATCH_TEMP_ALLOCATOR_POOL_SIZE; +static uint8_t __attribute__((aligned(16))) +temp_allocation_pool[ET_ESP_SCRATCH_TEMP_ALLOCATOR_POOL_SIZE] EXT_RAM_BSS_ATTR; +#else +// Internal SRAM allocation +static const size_t method_allocation_pool_size = + ET_ESP_METHOD_ALLOCATOR_POOL_SIZE; +static uint8_t __attribute__(( + aligned(16))) method_allocation_pool[ET_ESP_METHOD_ALLOCATOR_POOL_SIZE]; + +static const size_t temp_allocation_pool_size = + ET_ESP_SCRATCH_TEMP_ALLOCATOR_POOL_SIZE; +static uint8_t __attribute__(( + aligned(16))) temp_allocation_pool[ET_ESP_SCRATCH_TEMP_ALLOCATOR_POOL_SIZE]; +#endif + +#if defined(FILESYSTEM_LOAD) +static char* model_pte = nullptr; +static size_t model_pte_size = 0; +#endif + +#if defined(ET_BUNDLE_IO) +static const size_t testset_idx = 0; + +#if defined(ET_ATOL) +static const float et_atol = ET_ATOL; +#else +static const float et_atol = 0.01; +#endif + +#if 
defined(ET_RTOL) +static const float et_rtol = ET_RTOL; +#else +static const float et_rtol = 0.01; +#endif +#endif // ET_BUNDLE_IO + +#if defined(ET_NUM_INFERENCES) +static const int num_inferences = ET_NUM_INFERENCES; +#else +static const int num_inferences = 10; +#endif + +namespace { + +/// Lightweight heapless container that constructs and stores a T in-place. +/// Useful when you want to avoid heap allocations but need to delay +/// construction. +template +class Box { + public: + Box() = default; + + ~Box() { + if (has_value) { + ptr()->~T(); + } + } + + Box(const Box&) = delete; + Box& operator=(const Box&) = delete; + + template + void reset(Args&&... args) { + if (has_value) { + reinterpret_cast(mem)->~T(); + } + new (mem) T(std::forward(args)...); + has_value = true; + } + + T& value() { + return *ptr(); + } + + const T& value() const { + return *ptr(); + } + + T* operator->() { + return ptr(); + } + + const T* operator->() const { + return ptr(); + } + + private: + alignas(T) uint8_t mem[sizeof(T)]; + bool has_value = false; + + T* ptr() { + return reinterpret_cast(mem); + } + + const T* ptr() const { + return reinterpret_cast(mem); + } +}; + +template +void fill_tensor_with_default_value(Tensor& tensor) { + ValueType fill_value{}; + if constexpr (std::is_same_v) { + fill_value = true; + } else { + fill_value = ValueType(1); + } + + ValueType* data_ptr = tensor.mutable_data_ptr(); + std::fill(data_ptr, data_ptr + tensor.numel(), fill_value); +} + +Error prepare_input_tensors(Method& method, MemoryAllocator& allocator) { + MethodMeta method_meta = method.method_meta(); + size_t num_inputs = method_meta.num_inputs(); + + EValue* input_evalues = allocator.allocateList(num_inputs); + ET_CHECK_OR_RETURN_ERROR( + input_evalues != nullptr, + MemoryAllocationFailed, + "Could not allocate memory for input evalues."); + + Error err = method.get_inputs(input_evalues, num_inputs); + ET_CHECK_OK_OR_RETURN_ERROR(err); + + for (size_t i = 0; i < num_inputs; i++) { + 
auto tag = method_meta.input_tag(i); + ET_CHECK_OK_OR_RETURN_ERROR(tag.error()); + + if (tag.get() != Tag::Tensor) { + ET_LOG( + Debug, + "Skipping non-tensor input %lu", + static_cast(i)); + continue; + } + + // Fill tensors with default values (1) when no input data is provided + if (input_evalues[i].isTensor()) { + Tensor& tensor = input_evalues[i].toTensor(); + switch (tensor.scalar_type()) { +#define HANDLE_SCALAR_TYPE(cpp_type, scalar_name) \ + case ScalarType::scalar_name: \ + fill_tensor_with_default_value(tensor); \ + break; + ET_FORALL_SCALAR_TYPES(HANDLE_SCALAR_TYPE) +#undef HANDLE_SCALAR_TYPE + default: + ET_LOG( + Error, "Unhandled ScalarType %s", toString(tensor.scalar_type())); + err = Error::InvalidArgument; + break; + } + } else { + printf("Input[%lu]: Not Tensor\n", static_cast(i)); + } + } + + return err; +} + +#if defined(FILESYSTEM_LOAD) +/** + * Load a binary file from the filesystem. + * Supports SPIFFS, LittleFS, or SD card mounted filesystems. + */ +std::pair load_file_from_fs( + const char* filepath, + MemoryAllocator& allocator) { + FILE* fp = fopen(filepath, "rb"); + if (!fp) { + ET_LOG(Fatal, "Could not open file %s (errno: %d)", filepath, errno); + return std::make_pair(nullptr, 0); + } + + if (fseek(fp, 0, SEEK_END) != 0) { + ET_LOG( + Fatal, "Failed to seek to end of file %s (errno: %d)", filepath, errno); + fclose(fp); + return std::make_pair(nullptr, 0); + } + auto file_size = ftell(fp); + if (file_size <= 0) { + ET_LOG( + Fatal, + "Failed to determine valid size for file %s (size: %ld, errno: %d)", + filepath, + static_cast(file_size), + errno); + fclose(fp); + return std::make_pair(nullptr, 0); + } + + if (fseek(fp, 0, SEEK_SET) != 0) { + ET_LOG( + Fatal, + "Failed to seek to beginning of file %s (errno: %d)", + filepath, + errno); + fclose(fp); + return std::make_pair(nullptr, 0); + } + const size_t size = static_cast(file_size); + char* buffer = static_cast(allocator.allocate(size)); + if (buffer == nullptr) { + ET_LOG( + 
Fatal, + "Failed to allocate %lu bytes for file %s", + static_cast(size), + filepath); + fclose(fp); + return std::make_pair(nullptr, 0); + } + + auto read_size = fread(buffer, 1, size, fp); + if (read_size != size) { + ET_LOG( + Fatal, + "Partial read of %s: got %lu of %lu bytes", + filepath, + static_cast(read_size), + static_cast(size)); + fclose(fp); + return std::make_pair(nullptr, 0); + } + fclose(fp); + return std::make_pair(buffer, read_size); +} + +#if defined(ESP_PLATFORM) +/** + * Initialize SPIFFS filesystem for loading model files. + */ +bool init_spiffs(const char* base_path, const char* partition_label) { + esp_vfs_spiffs_conf_t conf = { + .base_path = base_path, + .partition_label = partition_label, + .max_files = 5, + .format_if_mount_failed = false, + }; + + esp_err_t ret = esp_vfs_spiffs_register(&conf); + if (ret != ESP_OK) { + if (ret == ESP_FAIL) { + ET_LOG(Error, "Failed to mount SPIFFS filesystem"); + } else if (ret == ESP_ERR_NOT_FOUND) { + ET_LOG(Error, "SPIFFS partition not found"); + } else { + ET_LOG(Error, "SPIFFS init failed: %s", esp_err_to_name(ret)); + } + return false; + } + + size_t total = 0, used = 0; + ret = esp_spiffs_info(partition_label, &total, &used); + if (ret == ESP_OK) { + ET_LOG( + Info, + "SPIFFS: total=%lu, used=%lu", + static_cast(total), + static_cast(used)); + } + return true; +} +#endif // ESP_PLATFORM +#endif // FILESYSTEM_LOAD + +/// Holds all state needed for setup and run phases +struct RunnerContext { + RunnerContext() = default; + RunnerContext(const RunnerContext& ctx) = delete; + RunnerContext& operator=(const RunnerContext& ctx) = delete; + + const char* method_name = nullptr; + size_t planned_buffer_memsize = 0; + size_t method_loaded_memsize = 0; + size_t executor_membase = 0; + size_t program_data_len = 0; + size_t input_memsize = 0; + size_t pte_size = 0; + bool bundle_io = false; + Box loader; + Box program; + Box method_allocator; + Box temp_allocator; + std::vector> planned_spans; + Box 
planned_memory; + Box memory_manager; + Box> method; +#if defined(ET_EVENT_TRACER_ENABLED) + Box etdump_gen; +#if defined(ET_DUMP_INTERMEDIATE_OUTPUTS) || defined(ET_DUMP_OUTPUTS) + void* debug_buffer; +#endif +#endif +}; + +void runner_init(RunnerContext& ctx, size_t pte_size) { + const void* program_data = model_pte; + ctx.program_data_len = pte_size; + ctx.pte_size = pte_size; + +#if defined(ET_BUNDLE_IO) + ctx.bundle_io = executorch::bundled_program::is_bundled_program( + reinterpret_cast(model_pte), ctx.pte_size); + if (ctx.bundle_io) { + Error status = executorch::bundled_program::get_program_data( + reinterpret_cast(model_pte), + ctx.pte_size, + &program_data, + &ctx.program_data_len); + ET_CHECK_MSG( + status == Error::Ok, + "get_program_data() from bundle PTE failed: 0x%x", + (unsigned int)status); + } +#endif + + ctx.loader.reset(program_data, ctx.program_data_len); + auto& loader = ctx.loader.value(); + ET_LOG( + Info, + "PTE Model data loaded. Size: %lu bytes.", + static_cast(ctx.program_data_len)); + + // Parse the program file + Result program_result = Program::load(&loader); + ET_CHECK_MSG( + program_result.ok(), + "Program loading failed @ %p: 0x%" PRIx32, + program_data, + static_cast(program_result.error())); + ctx.program.reset(std::move(program_result.get())); + Program& program = ctx.program.value(); + + ET_LOG( + Info, + "Model buffer loaded, has %lu methods", + static_cast(program.num_methods())); + + { + const auto method_name_result = program.get_method_name(0); + ET_CHECK_MSG(method_name_result.ok(), "Program has no methods"); + ctx.method_name = *method_name_result; + } + ET_LOG(Info, "Running method %s", ctx.method_name); + + Result method_meta = program.method_meta(ctx.method_name); + ET_CHECK_MSG( + method_meta.ok(), + "Failed to get method_meta for %s: 0x%x", + ctx.method_name, + (unsigned int)method_meta.error()); + + ET_LOG( + Info, + "Setup Method allocator pool. 
Size: %lu bytes.", + static_cast(method_allocation_pool_size)); + + ctx.method_allocator.reset( + method_allocation_pool_size, method_allocation_pool); + + ctx.planned_spans.clear(); + size_t num_memory_planned_buffers = method_meta->num_memory_planned_buffers(); + ctx.planned_spans.reserve(num_memory_planned_buffers); + size_t planned_buffer_membase = ctx.method_allocator->used_size(); + + for (size_t id = 0; id < num_memory_planned_buffers; ++id) { + size_t buffer_size = + static_cast(method_meta->memory_planned_buffer_size(id).get()); + ET_LOG( + Info, + "Setting up planned buffer %lu, size %lu.", + static_cast(id), + static_cast(buffer_size)); + + uint8_t* buffer = reinterpret_cast( + ctx.method_allocator->allocate(buffer_size, 16UL)); + ET_CHECK_MSG( + buffer != nullptr, + "Could not allocate memory for memory planned buffer size %lu", + static_cast(buffer_size)); + ctx.planned_spans.push_back({buffer, buffer_size}); + } + + ctx.planned_buffer_memsize = + ctx.method_allocator->used_size() - planned_buffer_membase; + + Span> planned_memory_span; + if (!ctx.planned_spans.empty()) { + planned_memory_span = + Span>(ctx.planned_spans.data(), ctx.planned_spans.size()); + } + ctx.planned_memory.reset(planned_memory_span); + + ctx.temp_allocator.reset(temp_allocation_pool_size, temp_allocation_pool); + + ctx.memory_manager.reset( + &ctx.method_allocator.value(), + &ctx.planned_memory.value(), + &ctx.temp_allocator.value()); + + size_t method_loaded_membase = ctx.method_allocator->used_size(); + + executorch::runtime::EventTracer* event_tracer_ptr = nullptr; + +#if defined(ET_EVENT_TRACER_ENABLED) + ET_LOG(Info, "Setting up ETDump"); + ctx.etdump_gen.reset(); + event_tracer_ptr = &ctx.etdump_gen.value(); + +#if defined(ET_DUMP_INTERMEDIATE_OUTPUTS) || defined(ET_DUMP_OUTPUTS) + ctx.debug_buffer = ctx.method_allocator->allocate(ET_DEBUG_BUFFER_SIZE, 16); + if (ctx.debug_buffer != nullptr) { + Span debug_buffer_span( + (uint8_t*)ctx.debug_buffer, ET_DEBUG_BUFFER_SIZE); + 
+ Result result = + ctx.etdump_gen.value().set_debug_buffer(debug_buffer_span); + + if (result.ok()) { +#if defined(ET_DUMP_INTERMEDIATE_OUTPUTS) + ET_LOG( + Info, + "ETDump: Allocated intermediate output buffer size: %d at 0x%p", + ET_DEBUG_BUFFER_SIZE, + ctx.debug_buffer); + ctx.etdump_gen.value().set_event_tracer_debug_level( + EventTracerDebugLogLevel::kIntermediateOutputs); +#else + ET_LOG( + Info, + "ETDump: Allocated output buffer size: %d at 0x%p", + ET_DEBUG_BUFFER_SIZE, + ctx.debug_buffer); + ctx.etdump_gen.value().set_event_tracer_debug_level( + EventTracerDebugLogLevel::kProgramOutputs); +#endif + } else { + ctx.debug_buffer = nullptr; + ET_LOG( + Error, + "ETDump: Could not set_debug_buffer() error:0x%" PRIx32, + result.error()); + } + } else { + ET_LOG( + Error, + "ETDump: Could not allocate output buffer size %lu", + static_cast(ET_DEBUG_BUFFER_SIZE)); + } +#endif // ET_DUMP_INTERMEDIATE_OUTPUTS || ET_DUMP_OUTPUTS +#endif // ET_EVENT_TRACER_ENABLED + + ctx.method.reset(program.load_method( + ctx.method_name, &ctx.memory_manager.value(), event_tracer_ptr)); + + if (!ctx.method->ok()) { + ET_LOG( + Info, + "Loading of method %s failed with status 0x%" PRIx32, + ctx.method_name, + static_cast(ctx.method->error())); + } + ctx.method_loaded_memsize = + ctx.method_allocator->used_size() - method_loaded_membase; + ET_LOG(Info, "Method '%s' loaded.", ctx.method_name); + + ET_LOG(Info, "Preparing inputs..."); + size_t input_membase = ctx.method_allocator->used_size(); + +#if defined(ET_BUNDLE_IO) + if (ctx.bundle_io) { + ET_LOG(Info, "Input testset[%d] from bundled bpte", testset_idx); + Error status = executorch::bundled_program::load_bundled_input( + *ctx.method.value(), model_pte, testset_idx); + ET_CHECK_MSG( + status == Error::Ok, + "load_bundled_input failed with status 0x%" PRIx32, + status); + } else +#endif + { + Error status = ::prepare_input_tensors( + *ctx.method.value(), ctx.method_allocator.value()); + ET_CHECK_MSG( + status == Error::Ok, + 
"Failed to prepare inputs 0x%" PRIx32, + static_cast(status)); + } + +#if defined(ET_LOG_DUMP_INPUT) + { + std::vector inputs(ctx.method.value()->inputs_size()); + ET_LOG(Info, "%lu inputs: ", static_cast(inputs.size())); + Error status = ctx.method.value()->get_inputs(inputs.data(), inputs.size()); + ET_CHECK(status == Error::Ok); + + for (int i = 0; i < inputs.size(); ++i) { + if (inputs[i].isTensor()) { + Tensor tensor = inputs[i].toTensor(); + for (int j = 0; j < tensor.numel(); ++j) { + if (tensor.scalar_type() == ScalarType::Int) { + printf( + "Input[%d][%d]: (int) %d\n", + i, + j, + tensor.const_data_ptr()[j]); + } else if (tensor.scalar_type() == ScalarType::Float) { + printf( + "Input[%d][%d]: (float) %f\n", + i, + j, + tensor.const_data_ptr()[j]); + } else if (tensor.scalar_type() == ScalarType::Char) { + printf( + "Input[%d][%d]: (char) %d\n", + i, + j, + tensor.const_data_ptr()[j]); + } else if (tensor.scalar_type() == ScalarType::Bool) { + printf( + "Input[%d][%d]: (bool) %s (0x%x)\n", + i, + j, + tensor.const_data_ptr()[j] ? 
"true" : "false", + tensor.const_data_ptr()[j]); + } + } + } else { + printf("Input[%d]: Not Tensor\n", i); + } + } + } +#endif + + ctx.input_memsize = ctx.method_allocator->used_size() - input_membase; + ctx.executor_membase = ctx.method_allocator->used_size(); + + ET_LOG(Info, "Input prepared."); +} + +void log_mem_status(RunnerContext& ctx) { + size_t executor_memsize = + ctx.method_allocator->used_size() - ctx.executor_membase; + + ET_LOG( + Info, + "model_pte_program_size: %lu bytes.", + static_cast(ctx.program_data_len)); + ET_LOG( + Info, + "model_pte_loaded_size: %lu bytes.", + static_cast(ctx.pte_size)); + + if (ctx.method_allocator->size() != 0) { + size_t method_allocator_used = ctx.method_allocator->used_size(); + ET_LOG( + Info, + "method_allocator_used: %lu / %lu free: %lu ( used: %lu %% ) ", + static_cast(method_allocator_used), + static_cast(ctx.method_allocator->size()), + static_cast(ctx.method_allocator->free_size()), + static_cast( + 100 * method_allocator_used / ctx.method_allocator->size())); + ET_LOG( + Info, + "method_allocator_planned: %lu bytes", + static_cast(ctx.planned_buffer_memsize)); + ET_LOG( + Info, + "method_allocator_loaded: %lu bytes", + static_cast(ctx.method_loaded_memsize)); + ET_LOG( + Info, + "method_allocator_input: %lu bytes", + static_cast(ctx.input_memsize)); + ET_LOG( + Info, + "method_allocator_executor: %lu bytes", + static_cast(executor_memsize)); + } + if (ctx.temp_allocator->size() > 0) { + ET_LOG( + Info, + "temp_allocator: %lu", + static_cast(ctx.temp_allocator->size())); + } + +#if defined(ESP_PLATFORM) + ET_LOG( + Info, + "ESP free heap: %lu bytes", + static_cast(esp_get_free_heap_size())); + ET_LOG( + Info, + "ESP min free heap ever: %lu bytes", + static_cast(esp_get_minimum_free_heap_size())); +#if defined(CONFIG_SPIRAM) + ET_LOG( + Info, + "ESP free PSRAM: %lu bytes", + static_cast(heap_caps_get_free_size(MALLOC_CAP_SPIRAM))); +#endif +#endif + +#if defined(ET_EVENT_TRACER_ENABLED) +#if 
defined(ET_DUMP_INTERMEDIATE_OUTPUTS) || defined(ET_DUMP_OUTPUTS) + if (ctx.debug_buffer != nullptr) { + size_t outputdump_len = ctx.etdump_gen->get_data_sink()->get_used_bytes(); + ET_LOG( + Info, + "ETDump_outputs_buffer: %lu / %lu free: %lu ( used: %lu %% ) ", + static_cast(outputdump_len), + static_cast(ET_DEBUG_BUFFER_SIZE), + static_cast(ET_DEBUG_BUFFER_SIZE - outputdump_len), + static_cast( + 100 * outputdump_len / ET_DEBUG_BUFFER_SIZE)); + } +#endif +#endif +} + +void print_outputs(RunnerContext& ctx) { + std::vector outputs(ctx.method.value()->outputs_size()); + ET_LOG(Info, "%lu outputs: ", static_cast(outputs.size())); + Error status = + ctx.method.value()->get_outputs(outputs.data(), outputs.size()); + ET_CHECK(status == Error::Ok); + + for (int i = 0; i < outputs.size(); ++i) { + if (outputs[i].isTensor()) { + Tensor tensor = outputs[i].toTensor(); +#if defined(ET_LOG_DUMP_OUTPUT) + for (int j = 0; j < tensor.numel(); ++j) { + if (tensor.scalar_type() == ScalarType::Int) { + printf( + "Output[%d][%d]: (int) %d\n", + i, + j, + tensor.const_data_ptr()[j]); + } else if (tensor.scalar_type() == ScalarType::Float) { + printf( + "Output[%d][%d]: (float) %f\n", + i, + j, + tensor.const_data_ptr()[j]); + } else if (tensor.scalar_type() == ScalarType::Char) { + printf( + "Output[%d][%d]: (char) %d\n", + i, + j, + tensor.const_data_ptr()[j]); + } else if (tensor.scalar_type() == ScalarType::Bool) { + printf( + "Output[%d][%d]: (bool) %s (0x%x)\n", + i, + j, + tensor.const_data_ptr()[j] ? "true " : "false", + tensor.const_data_ptr()[j]); + } + } +#endif + } else { + printf("Output[%d]: Not Tensor\n", i); + } + } +} + +void write_etdump(RunnerContext& ctx) { +#if defined(ET_EVENT_TRACER_ENABLED) + ETDumpResult result = ctx.etdump_gen->get_etdump_data(); + if (result.buf != nullptr && result.size > 0) { + ET_LOG( + Info, + "ETDump data generated: %lu bytes", + static_cast(result.size)); + + // On ESP32, we could write to SPIFFS/SD or dump via serial. 
+ // For now, log the size. In a production setup, you would + // write this to a filesystem or transmit over a network interface. +#if defined(FILESYSTEM_LOAD) && defined(ESP_PLATFORM) + const char* etdump_filename = "/spiffs/etdump.bin"; + ET_LOG(Info, "Writing etdump to file: %s", etdump_filename); + FILE* f = fopen(etdump_filename, "wb"); + if (f) { + size_t bytes_written = fwrite((uint8_t*)result.buf, 1, result.size, f); + if (bytes_written != result.size) { + ET_LOG( + Error, + "Failed to write complete ETDump data to %s (wrote %lu of %lu bytes)", + etdump_filename, + static_cast(bytes_written), + static_cast(result.size)); + } + fclose(f); + } else { + ET_LOG(Error, "Could not open %s for writing", etdump_filename); + } +#endif + } +#endif +} + +bool verify_result(RunnerContext& ctx, const void* model_pte) { + bool model_ok = false; +#if defined(ET_BUNDLE_IO) + if (ctx.bundle_io) { + ErrorStats stats = compute_method_output_error_stats( + *ctx.method.value(), model_pte, testset_idx); + if (stats.status == Error::Ok) { + ET_LOG(Info, "=== Error stats for testset %d ===", testset_idx); + ET_LOG(Info, " mean_absolute_error: %f", stats.mean_abs_error); + ET_LOG(Info, " max_absolute_error: %f", stats.max_abs_error); + ET_LOG(Info, " mean_relative_error: %f", stats.mean_relative_error); + ET_LOG(Info, " max_relative_error: %f", stats.max_relative_error); + } else { + ET_LOG( + Info, + "=== Error calculating stats for testset %d ERROR:%d ===", + testset_idx, + stats.status); + } + + Error status = verify_method_outputs( + *ctx.method.value(), model_pte, testset_idx, et_rtol, et_atol); + if (status == Error::Ok) { + ET_LOG(Info, "Model output match expected BundleIO bpte ref data."); + ET_LOG(Info, "TEST: BundleIO index[%d] Test_result: PASS", testset_idx); + model_ok = true; + } else { + ET_LOG( + Error, + "Model output don't match expected BundleIO bpte ref data. 
rtol=%f atol=%f", + et_rtol, + et_atol); + ET_LOG(Error, "TEST: BundleIO index[%d] Test_result: FAIL", testset_idx); + model_ok = false; + } + } else { + model_ok = true; + } +#else + (void)ctx; + (void)model_pte; + model_ok = true; +#endif + return model_ok; +} + +bool run_model(RunnerContext& ctx, const void* model_pte) { + Error status = Error::Ok; + if (num_inferences <= 0) { + ET_LOG( + Info, + "num_inferences (%d) <= 0; skipping model execution.", + num_inferences); + // Nothing to run; treat as a no-op run. + return true; + } + ET_LOG(Info, "Starting running %d inferences...", num_inferences); + int successful_inferences = 0; + StartMeasurements(); + for (int n = 0; n < num_inferences; n++) { + ET_LOG(Debug, "Running inference number %d", n); + status = ctx.method.value()->execute(); + if (status != Error::Ok) { + break; + } + // Reset the temporary allocator between inferences + ctx.temp_allocator.reset(temp_allocation_pool_size, temp_allocation_pool); + successful_inferences++; + } + if (successful_inferences > 0) { + StopMeasurements(successful_inferences); + } + + ET_CHECK_MSG( + status == Error::Ok, + "Execution of method %s failed with status 0x%" PRIx32, + ctx.method_name, + static_cast(status)); + + ET_LOG(Info, "%d inferences finished", successful_inferences); + print_outputs(ctx); + bool model_ok = verify_result(ctx, model_pte); + ET_LOG(Info, "Model run: %d", model_ok); + + return model_ok; +} + +} // namespace + +// ===================================================================== +// Global runner state -- shared by the public et_runner_* API and by +// executor_runner_main() for its multi-inference demo loop. +// ===================================================================== + +static RunnerContext g_runner_ctx; +static bool g_runner_initialized = false; + +// Maximum number of input/output tensors handled in the public API. 
+static const size_t kMaxInputOutputs = 16; + +// ===================================================================== +// Public API +// ===================================================================== + +bool et_runner_init(void) { + executorch::runtime::runtime_init(); + + size_t pte_size; + +#if defined(FILESYSTEM_LOAD) +#if defined(ESP_PLATFORM) + if (!init_spiffs("/spiffs", "storage")) { + ET_LOG(Fatal, "Failed to initialize SPIFFS. Cannot load model."); + return false; + } +#endif + EspMemoryAllocator file_allocator( + method_allocation_pool_size, method_allocation_pool); + auto [buffer, buffer_size] = + load_file_from_fs("/spiffs/model.pte", file_allocator); + if (buffer == nullptr) { + ET_LOG(Fatal, "Failed to load model from filesystem."); + return false; + } + model_pte = buffer; + model_pte_size = buffer_size; + pte_size = buffer_size; +#else + pte_size = sizeof(model_pte); +#endif + + runner_init(g_runner_ctx, pte_size); + g_runner_initialized = g_runner_ctx.method->ok(); + return g_runner_initialized; +} + +bool et_runner_set_input(size_t input_idx, const void* data, size_t num_bytes) { + if (!g_runner_initialized) { + ET_LOG(Error, "Runner not initialized. Call et_runner_init() first."); + return false; + } + + Method& method = *g_runner_ctx.method.value(); + const size_t num_inputs = method.inputs_size(); + + if (input_idx >= num_inputs) { + ET_LOG( + Error, + "Input index %lu out of range (num_inputs=%lu).", + static_cast(input_idx), + static_cast(num_inputs)); + return false; + } + if (num_inputs > kMaxInputOutputs) { + ET_LOG( + Error, + "Model has too many inputs (%lu > %lu).", + static_cast(num_inputs), + static_cast(kMaxInputOutputs)); + return false; + } + + // get_inputs() returns shallow copies whose data pointers alias the + // method's internal tensor storage, allowing direct writes. 
+ EValue input_evalues[kMaxInputOutputs]; + Error status = method.get_inputs(input_evalues, num_inputs); + if (status != Error::Ok) { + ET_LOG( + Error, + "get_inputs() failed with status 0x%" PRIx32, + static_cast(status)); + return false; + } + + if (!input_evalues[input_idx].isTensor()) { + ET_LOG( + Error, + "Input %lu is not a Tensor.", + static_cast(input_idx)); + return false; + } + + Tensor& tensor = input_evalues[input_idx].toTensor(); + const size_t tensor_bytes = tensor.nbytes(); + if (num_bytes > tensor_bytes) { + ET_LOG( + Error, + "Input %lu: provided %lu bytes exceeds tensor capacity %lu bytes.", + static_cast(input_idx), + static_cast(num_bytes), + static_cast(tensor_bytes)); + return false; + } + // Treat zero-length input as a no-op. + if (num_bytes == 0) { + return true; + } + // For non-zero length, the input data pointer must be non-null. + if (data == nullptr) { + ET_LOG( + Error, + "Input %lu: data pointer is null for non-zero num_bytes (%lu).", + static_cast(input_idx), + static_cast(num_bytes)); + return false; + } + + memcpy(tensor.mutable_data_ptr(), data, num_bytes); + return true; +} + +bool et_runner_execute(void) { + if (!g_runner_initialized) { + ET_LOG(Error, "Runner not initialized. Call et_runner_init() first."); + return false; + } + + Method& method = *g_runner_ctx.method.value(); + Error status = method.execute(); + // Reset the temporary allocator so it is ready for the next inference. + g_runner_ctx.temp_allocator.reset( + temp_allocation_pool_size, temp_allocation_pool); + if (status != Error::Ok) { + ET_LOG( + Error, + "execute() failed with status 0x%" PRIx32, + static_cast(status)); + return false; + } + return true; +} + +bool et_runner_get_output( + size_t output_idx, + void* buffer, + size_t buffer_bytes, + size_t* out_num_elements) { + if (!g_runner_initialized) { + ET_LOG(Error, "Runner not initialized. 
Call et_runner_init() first."); + return false; + } + + Method& method = *g_runner_ctx.method.value(); + const size_t num_outputs = method.outputs_size(); + + if (output_idx >= num_outputs) { + ET_LOG( + Error, + "Output index %lu out of range (num_outputs=%lu).", + static_cast(output_idx), + static_cast(num_outputs)); + return false; + } + if (num_outputs > kMaxInputOutputs) { + ET_LOG( + Error, + "Model has too many outputs (%lu > %lu).", + static_cast(num_outputs), + static_cast(kMaxInputOutputs)); + return false; + } + + EValue output_evalues[kMaxInputOutputs]; + Error status = method.get_outputs(output_evalues, num_outputs); + if (status != Error::Ok) { + ET_LOG( + Error, + "get_outputs() failed with status 0x%" PRIx32, + static_cast(status)); + return false; + } + + if (!output_evalues[output_idx].isTensor()) { + ET_LOG( + Error, + "Output %lu is not a Tensor.", + static_cast(output_idx)); + return false; + } + + Tensor tensor = output_evalues[output_idx].toTensor(); + const size_t tensor_bytes = tensor.nbytes(); + if (buffer_bytes < tensor_bytes) { + ET_LOG( + Error, + "Output %lu: buffer too small (%lu bytes < %lu bytes required).", + static_cast(output_idx), + static_cast(buffer_bytes), + static_cast(tensor_bytes)); + return false; + } + + memcpy(buffer, tensor.const_data_ptr(), tensor_bytes); + if (out_num_elements != nullptr) { + *out_num_elements = static_cast(tensor.numel()); + } + return true; +} + +size_t et_runner_inputs_size(void) { + if (!g_runner_initialized) { + return 0; + } + return (*g_runner_ctx.method.value()).inputs_size(); +} + +size_t et_runner_outputs_size(void) { + if (!g_runner_initialized) { + return 0; + } + return (*g_runner_ctx.method.value()).outputs_size(); +} + +/** + * Main entry point for the ESP32 executor runner. + * + * On ESP-IDF, this is called from app_main() (see below). + * The function can also be compiled for host testing without ESP-IDF. 
+ */ +void executor_runner_main(void) { + if (!et_runner_init()) { + return; + } + + // Log the PTE magic bytes for quick sanity check + ET_LOG( + Info, + "PTE @ %p [----%c%c%c%c]", + model_pte, + model_pte[4], + model_pte[5], + model_pte[6], + model_pte[7]); + + bool model_ok = run_model(g_runner_ctx, model_pte); + ET_LOG(Info, "Model run: %d", model_ok); + + log_mem_status(g_runner_ctx); + write_etdump(g_runner_ctx); + + ET_CHECK_MSG(model_ok == true, "Problem running model"); + + ET_LOG(Info, "Program complete."); +} \ No newline at end of file diff --git a/examples/espressif/executor_runner/esp_executor_runner.h b/examples/espressif/executor_runner/esp_executor_runner.h new file mode 100644 index 00000000000..86672d8c0bf --- /dev/null +++ b/examples/espressif/executor_runner/esp_executor_runner.h @@ -0,0 +1,98 @@ +/* Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +/** + * Public API for the ESP32 ExecuTorch executor runner. + * + * Provides a simple interface to load a model once and run repeated inferences + * on dynamically generated input data: + * + * et_runner_init(); + * + * // For each inference: + * et_runner_set_input(0, my_input_data, my_input_bytes); + * et_runner_execute(); + * et_runner_get_output(0, out_buf, out_buf_bytes, &num_elements); + */ + +#pragma once + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Initialize the runner: load the model, allocate memory pools, and prepare + * the inference method. Must be called once before any other et_runner_* + * function. + * + * @returns true on success, false on failure. + */ +bool et_runner_init(void); + +/** + * Copy raw data into the input tensor at the given index. + * + * The runner must already be initialized with et_runner_init(). 
The data's + * layout (dtype and shape) must match the model's expected input tensor. + * + * @param input_idx Zero-based index of the input tensor to set. + * @param data Pointer to the source data in host memory. + * @param num_bytes Number of bytes to copy. Must not exceed the tensor's + * total byte size (element_size * num_elements). + * @returns true on success, false on failure. + */ +bool et_runner_set_input(size_t input_idx, const void* data, size_t num_bytes); + +/** + * Execute one forward pass of the model. + * + * Must be called after et_runner_init(). Call et_runner_set_input() before + * this if you want to provide custom input data. Results are available via + * et_runner_get_output() after this call returns successfully. + * + * @returns true on success, false on failure. + */ +bool et_runner_execute(void); + +/** + * Copy the output tensor data at the given index into a caller-provided buffer. + * + * Must be called after a successful et_runner_execute(). + * + * @param output_idx Zero-based index of the output tensor to read. + * @param buffer Caller-allocated destination buffer. + * @param buffer_bytes Size of the destination buffer in bytes. Must be + * >= the output tensor's total byte size. + * @param out_num_elements If non-NULL, set to the number of elements in the + * output tensor (not bytes). + * @returns true on success, false on failure. + */ +bool et_runner_get_output( + size_t output_idx, + void* buffer, + size_t buffer_bytes, + size_t* out_num_elements); + +/** + * Returns the number of input tensors expected by the loaded model. + * Returns 0 if the runner is not yet initialized. + */ +size_t et_runner_inputs_size(void); + +/** + * Returns the number of output tensors produced by the loaded model. + * Returns 0 if the runner is not yet initialized. 
+ */ +size_t et_runner_outputs_size(void); + +#ifdef __cplusplus +} // extern "C" +#endif diff --git a/examples/espressif/executor_runner/esp_memory_allocator.cpp b/examples/espressif/executor_runner/esp_memory_allocator.cpp new file mode 100644 index 00000000000..c68f94289df --- /dev/null +++ b/examples/espressif/executor_runner/esp_memory_allocator.cpp @@ -0,0 +1,36 @@ +/* Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include "esp_memory_allocator.h" + +EspMemoryAllocator::EspMemoryAllocator(uint32_t size, uint8_t* base_address) + : MemoryAllocator(size, base_address), used_(0) {} + +void* EspMemoryAllocator::allocate(size_t size, size_t alignment) { + void* ret = executorch::runtime::MemoryAllocator::allocate(size, alignment); + if (ret != nullptr) { + // Keep used_ in sync with the underlying MemoryAllocator by computing it + // from the returned pointer and requested size, which implicitly includes + // any padding/alignment the base allocator applied. + uint8_t* end_ptr = static_cast(ret) + size; + used_ = static_cast(end_ptr - base_address()); + } + return ret; +} + +size_t EspMemoryAllocator::used_size() const { + return used_; +} + +size_t EspMemoryAllocator::free_size() const { + return executorch::runtime::MemoryAllocator::size() - used_; +} + +void EspMemoryAllocator::reset() { + executorch::runtime::MemoryAllocator::reset(); + used_ = 0; +} diff --git a/examples/espressif/executor_runner/esp_memory_allocator.h b/examples/espressif/executor_runner/esp_memory_allocator.h new file mode 100644 index 00000000000..377f608fe88 --- /dev/null +++ b/examples/espressif/executor_runner/esp_memory_allocator.h @@ -0,0 +1,36 @@ +/* Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include + + +/** + * Custom allocator for Espressif ESP32/ESP32-S3 targets that tracks + * used and free memory. Extends the ExecuTorch MemoryAllocator with + * additional instrumentation useful for memory-constrained embedded + * environments. + */ +class EspMemoryAllocator : public executorch::runtime::MemoryAllocator { + public: + EspMemoryAllocator(uint32_t size, uint8_t* base_address); + + void* allocate(size_t size, size_t alignment = kDefaultAlignment) override; + + /// Returns the used size of the allocator's memory buffer. + size_t used_size() const; + + /// Returns the free size of the allocator's memory buffer. + size_t free_size() const; + + /// Resets the allocator to its initial state. + void reset(); + + private: + size_t used_; +}; diff --git a/examples/espressif/executor_runner/esp_pal.cpp b/examples/espressif/executor_runner/esp_pal.cpp new file mode 100644 index 00000000000..90c227d8f99 --- /dev/null +++ b/examples/espressif/executor_runner/esp_pal.cpp @@ -0,0 +1,91 @@ +/* Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include + +#include +#include + +#if defined(ESP_PLATFORM) +#include +#include +#include +#include +#endif + +extern "C" { + +void et_pal_init(void) { +#if defined(ESP_PLATFORM) + ET_LOG( + Info, + "ESP32 ExecuTorch runner initialized. Free heap: %lu bytes.", + static_cast(esp_get_free_heap_size())); +#if defined(CONFIG_SPIRAM) + ET_LOG( + Info, + "PSRAM available. 
Free PSRAM: %lu bytes.", + static_cast(heap_caps_get_free_size(MALLOC_CAP_SPIRAM))); +#endif +#endif +} + +ET_NORETURN void et_pal_abort(void) { +#if defined(ESP_PLATFORM) + esp_restart(); +#else + abort(); +#endif + while (1) { + } +} + +et_timestamp_t et_pal_current_ticks(void) { +#if defined(ESP_PLATFORM) + return (et_timestamp_t)esp_cpu_get_cycle_count(); +#else + return 0; +#endif +} + +et_tick_ratio_t et_pal_ticks_to_ns_multiplier(void) { +#if defined(ESP_PLATFORM) + uint32_t cpu_freq_hz; + if (esp_clk_tree_src_get_freq_hz(SOC_MOD_CLK_CPU, ESP_CLK_TREE_SRC_FREQ_PRECISION_CACHED, &cpu_freq_hz) == + ESP_OK) { + return {1000000000u, cpu_freq_hz}; + } +#endif + return {1000, 240}; // Default to 240 MHz if we can't get the actual frequency +} + +void et_pal_emit_log_message( + ET_UNUSED et_timestamp_t timestamp, + et_pal_log_level_t level, + const char* filename, + const char* function, + size_t line, + const char* message, + ET_UNUSED size_t length) { + printf( + "%c [executorch:%s:%lu %s()] %s\n", + level, + filename, + static_cast(line), + function, + message); + fflush(stdout); +} + +void* et_pal_allocate(ET_UNUSED size_t size) { + return nullptr; +} + +void et_pal_free(ET_UNUSED void* ptr) {} + +} // extern "C" \ No newline at end of file diff --git a/examples/espressif/executor_runner/esp_perf_monitor.cpp b/examples/espressif/executor_runner/esp_perf_monitor.cpp new file mode 100644 index 00000000000..1b1a70987b5 --- /dev/null +++ b/examples/espressif/executor_runner/esp_perf_monitor.cpp @@ -0,0 +1,100 @@ +/* Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +#include "esp_perf_monitor.h" + +#if defined(ESP_PLATFORM) + +#include +#include +#include +#include + +namespace { + +uint32_t start_cycle_count = 0; +int64_t start_time_us = 0; + +} // namespace + +void StartMeasurements() { + start_cycle_count = esp_cpu_get_cycle_count(); + start_time_us = esp_timer_get_time(); +} + +void StopMeasurements(int num_inferences) { + uint32_t end_cycle_count = esp_cpu_get_cycle_count(); + int64_t end_time_us = esp_timer_get_time(); + + uint32_t delta_cycles = end_cycle_count - start_cycle_count; + uint64_t total_cycles = static_cast(delta_cycles); + int64_t total_time_us = end_time_us - start_time_us; + + ET_LOG(Info, "Profiler report:"); + ET_LOG(Info, "Number of inferences: %d", num_inferences); + + // Guard against division by zero or invalid counts when computing + // per-inference metrics. + if (num_inferences <= 0) { + ET_LOG( + Info, + "Total CPU cycles: %" PRIu64 " (per-inference metrics not computed)", + total_cycles); + ET_LOG( + Info, + "Total wall time: %" PRId64 " us (per-inference metrics not computed)", + total_time_us); + // Log ESP32 system memory info + ET_LOG( + Info, + "Free heap: %lu bytes", + static_cast(esp_get_free_heap_size())); + ET_LOG( + Info, + "Min free heap ever: %lu bytes", + static_cast(esp_get_minimum_free_heap_size())); + return; + } + + ET_LOG( + Info, + "Total CPU cycles: %" PRIu64 " (%.2f per inference)", + total_cycles, + (double)total_cycles / num_inferences); + ET_LOG( + Info, + "Total wall time: %" PRId64 " us (%.2f us per inference)", + total_time_us, + (double)total_time_us / num_inferences); + ET_LOG( + Info, + "Average inference time: %.3f ms", + (double)total_time_us / num_inferences / 1000.0); + + // Log ESP32 system memory info + ET_LOG( + Info, + "Free heap: %lu bytes", + static_cast(esp_get_free_heap_size())); + ET_LOG( + Info, + "Min free heap ever: %lu bytes", + static_cast(esp_get_minimum_free_heap_size())); +} + +#else // !defined(ESP_PLATFORM) + +// Stub 
implementation for non-ESP builds (e.g. host testing)
+void StartMeasurements() {}
+
+void StopMeasurements(int num_inferences) {
+  (void)num_inferences;
+}
+
+#endif // defined(ESP_PLATFORM)
diff --git a/examples/espressif/executor_runner/esp_perf_monitor.h b/examples/espressif/executor_runner/esp_perf_monitor.h
new file mode 100644
index 00000000000..ccbdb07e331
--- /dev/null
+++ b/examples/espressif/executor_runner/esp_perf_monitor.h
@@ -0,0 +1,18 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+/**
+ * Performance monitoring helpers for Espressif ESP32/ESP32-S3.
+ *
+ * Uses the Xtensa/RISC-V CPU cycle counter (CCOUNT register on Xtensa,
+ * or esp_cpu_get_cycle_count() from ESP-IDF) for timing measurements.
+ */
+
+/// Opens a measurement window: snapshots the CPU cycle counter and the
+/// microsecond wall clock. Pair with StopMeasurements().
+void StartMeasurements();
+/// Closes the window opened by StartMeasurements() and logs total cycles,
+/// total wall time, and per-inference averages for `num_inferences` runs
+/// (per-inference metrics are skipped when num_inferences <= 0).
+/// No-op stubs are provided for non-ESP (host) builds.
+void StopMeasurements(int num_inferences);
diff --git a/examples/espressif/executor_runner/pte_to_header.py b/examples/espressif/executor_runner/pte_to_header.py
new file mode 100644
index 00000000000..0a8935b7a92
--- /dev/null
+++ b/examples/espressif/executor_runner/pte_to_header.py
@@ -0,0 +1,100 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+Converts an ExecuTorch .pte model file to a C header file containing
+the model data as a byte array. This is used to embed the model directly
+into the firmware binary for ESP32/ESP32-S3 targets.
+
+Usage:
+    python pte_to_header.py --pte model.pte [--outdir .]
[--outfile model_pte.h] +""" + +import binascii +import os +from argparse import ArgumentParser, ArgumentTypeError + +bytes_per_line = 32 +hex_digits_per_line = bytes_per_line * 2 + + +def input_file_path(path): + if os.path.exists(path): + return path + else: + raise ArgumentTypeError(f"input filepath:{path} does not exist") + + +parser = ArgumentParser(description="Convert .pte model to C header for ESP32") +parser.add_argument( + "-p", + "--pte", + help="ExecuTorch .pte model file", + type=input_file_path, + required=True, +) +parser.add_argument( + "-d", + "--outdir", + help="Output dir for model header", + type=str, + required=False, + default=".", +) +parser.add_argument( + "-o", + "--outfile", + help="Output filename for model header", + type=str, + required=False, + default="model_pte.h", +) +parser.add_argument( + "-s", + "--section", + help="Section attribute for the data array (use 'none' for no section attribute)", + type=str, + required=False, + default="none", +) + +if __name__ == "__main__": + args = parser.parse_args() + outfile = os.path.join(args.outdir, args.outfile) + + if args.section == "none": + # No section attribute - let the linker/compiler decide placement. + # On ESP32 with PSRAM, the compiler/linker or EXT_RAM_BSS_ATTR + # in the code handles placement. + attr = "__attribute__((aligned(16))) static const unsigned char " + else: + attr = f'__attribute__((section("{args.section}"), aligned(16))) static const unsigned char ' + if not os.path.exists(args.outdir): + os.makedirs(args.outdir) + with open(args.pte, "rb") as fr, open(outfile, "w") as fw: + data = fr.read() + hexstream = binascii.hexlify(data).decode("utf-8") + + fw.write( + "/* Auto-generated model header for ESP32 ExecuTorch runner. 
*/\n" + ) + fw.write(f"/* Source: {os.path.basename(args.pte)} ({len(data)} bytes) */\n\n") + fw.write("#pragma once\n\n") + fw.write(attr + "model_pte[] = {") + + for i in range(0, len(hexstream), 2): + if 0 == (i % hex_digits_per_line): + fw.write("\n") + fw.write("0x" + hexstream[i : i + 2] + ", ") + + fw.write("\n};\n") + fw.flush() + os.fsync(fw.fileno()) + + print( + f"Input: {args.pte} with {len(data)} bytes. " + f"Output: {outfile} with {os.path.getsize(outfile)} bytes." + ) diff --git a/examples/espressif/project/CMakeLists.txt b/examples/espressif/project/CMakeLists.txt new file mode 100644 index 00000000000..b467cb49baa --- /dev/null +++ b/examples/espressif/project/CMakeLists.txt @@ -0,0 +1,29 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# Example ESP-IDF project CMakeLists.txt +# +# This is a template project that uses the executor_runner component. +# Copy this to your own project directory and adjust paths as needed. +# +# Usage: +# cd examples/espressif/project +# idf.py set-target esp32s3 +# idf.py build +# idf.py flash monitor + +cmake_minimum_required(VERSION 3.16) + +# Set the path to ExecuTorch source +set(EXECUTORCH_ROOT "${CMAKE_CURRENT_SOURCE_DIR}/../../.." CACHE PATH "ExecuTorch root") + +# Add the executor_runner as an extra component +set(EXTRA_COMPONENT_DIRS + "${CMAKE_CURRENT_SOURCE_DIR}/../executor_runner" +) + +include($ENV{IDF_PATH}/tools/cmake/project.cmake) +project(executorch_esp_runner) diff --git a/examples/espressif/project/main/CMakeLists.txt b/examples/espressif/project/main/CMakeLists.txt new file mode 100644 index 00000000000..2b2cd9d135a --- /dev/null +++ b/examples/espressif/project/main/CMakeLists.txt @@ -0,0 +1,14 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. 
+# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# Main component CMakeLists.txt for the ESP-IDF project. +# This is a minimal main component that depends on the executor_runner. + +idf_component_register( + SRCS "main.cpp" + INCLUDE_DIRS "." + REQUIRES executor_runner +) diff --git a/examples/espressif/project/main/main.cpp b/examples/espressif/project/main/main.cpp new file mode 100644 index 00000000000..ac446d142f8 --- /dev/null +++ b/examples/espressif/project/main/main.cpp @@ -0,0 +1,37 @@ +/* Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +/* + * Example ESP-IDF main component. + * + * The app_main() defined below performs optional initialization and then + * calls executor_runner_main(). + * + * If you want to customize the runner behavior, you can modify the + * app_main() implementation here (e.g., add initialization or cleanup) + * while still delegating to executor_runner_main(). 
+ */ + + +#include +#include "sdkconfig.h" +#include "freertos/FreeRTOS.h" +#include "freertos/task.h" +#include "esp_system.h" + +extern void executor_runner_main(void); + +extern "C" void app_main(void) { + printf("Starting executorch runner !\n"); + fflush(stdout); + // Custom initialization here + executor_runner_main(); + for (int i = 5; i >= 0; i--) { + vTaskDelay(1000 / portTICK_PERIOD_MS); + } + esp_restart(); +} diff --git a/examples/espressif/project/partitions.csv b/examples/espressif/project/partitions.csv new file mode 100644 index 00000000000..e6d484d3f99 --- /dev/null +++ b/examples/espressif/project/partitions.csv @@ -0,0 +1,5 @@ +# ESP-IDF Partition Table +# Name, Type, SubType, Offset, Size, Flags +nvs,data,nvs,0x9000,24K, +phy_init,data,phy,0xf000,4K, +factory,app,factory,0x10000,2M, diff --git a/examples/espressif/project/sdkconfig.defaults b/examples/espressif/project/sdkconfig.defaults new file mode 100644 index 00000000000..08b09229148 --- /dev/null +++ b/examples/espressif/project/sdkconfig.defaults @@ -0,0 +1,50 @@ +# ESP-IDF sdkconfig defaults for ExecuTorch executor runner +# +# These settings are optimized for running ExecuTorch models on ESP32/ESP32-S3. +# Copy this file as sdkconfig.defaults in your project directory. 
+ +# ─── CPU Frequency ─── +# Run at maximum frequency for best inference performance +CONFIG_ESP_DEFAULT_CPU_FREQ_MHZ_240=y + +# ─── PSRAM (if available) ─── +# Enable PSRAM for larger model support +CONFIG_SPIRAM=y +CONFIG_SPIRAM_MODE_QUAD=y +CONFIG_SPIRAM_SPEED_80M=y +# Allow malloc to fall back to PSRAM when internal RAM is exhausted +CONFIG_SPIRAM_USE_CAPS_ALLOC=y +# Place BSS in PSRAM (for large static buffers) +CONFIG_SPIRAM_ALLOW_BSS_SEG_EXTERNAL_MEMORY=y + +# ─── Memory ─── +# Increase main task stack size for ExecuTorch +CONFIG_ESP_MAIN_TASK_STACK_SIZE=32768 + +# ─── Flash ─── +# Use QIO flash mode for faster flash reads (model data) +CONFIG_ESPTOOLPY_FLASHMODE_QIO=y +CONFIG_ESPTOOLPY_FLASHFREQ_80M=y +# Larger flash size for model data +CONFIG_ESPTOOLPY_FLASHSIZE_8MB=y + +# ─── Optimization ─── +# Optimize for performance +CONFIG_COMPILER_OPTIMIZATION_PERF=y + +# ─── FreeRTOS ─── +# Increase tick rate for finer timing granularity +CONFIG_FREERTOS_HZ=1000 + +# ─── Logging ─── +# Default log level (can be changed at runtime) +CONFIG_LOG_DEFAULT_LEVEL_INFO=y + +# ─── Watchdog ─── +# Disable task watchdog for long-running inference +CONFIG_ESP_TASK_WDT_EN=n + +# ─── Custom partition table to be adjusted for larger builds ─── +CONFIG_PARTITION_TABLE_CUSTOM=y +CONFIG_PARTITION_TABLE_CUSTOM_FILENAME="partitions.csv" +CONFIG_PARTITION_TABLE_FILENAME="partitions.csv" \ No newline at end of file diff --git a/examples/espressif/project/sdkconfig.defaults.esp32s3 b/examples/espressif/project/sdkconfig.defaults.esp32s3 new file mode 100644 index 00000000000..15f9c4eba30 --- /dev/null +++ b/examples/espressif/project/sdkconfig.defaults.esp32s3 @@ -0,0 +1,42 @@ +# ESP-IDF sdkconfig defaults for ESP32-S3 target +# +# ESP32-S3 specific optimizations: +# - Octal PSRAM support (up to 32MB) +# - Dual-core Xtensa LX7 at 240MHz +# - Vector extensions for faster computation + +# ─── CPU ─── +CONFIG_ESP_DEFAULT_CPU_FREQ_MHZ_240=y + +# ─── PSRAM (Octal PSRAM for ESP32-S3) ─── 
+CONFIG_SPIRAM=y +#CONFIG_SPIRAM_MODE_QUAD=y +CONFIG_SPIRAM_MODE_OCT=y +CONFIG_SPIRAM_SPEED_80M=y +CONFIG_SPIRAM_USE_CAPS_ALLOC=y +CONFIG_SPIRAM_ALLOW_BSS_SEG_EXTERNAL_MEMORY=y + +# ─── Memory ─── +CONFIG_ESP_MAIN_TASK_STACK_SIZE=32768 + +# ─── Flash ─── +CONFIG_ESPTOOLPY_FLASHMODE_QIO=y +CONFIG_ESPTOOLPY_FLASHFREQ_80M=y +CONFIG_ESPTOOLPY_FLASHSIZE_8MB=y + +# ─── Optimization ─── +CONFIG_COMPILER_OPTIMIZATION_PERF=y + +# ─── FreeRTOS ─── +CONFIG_FREERTOS_HZ=1000 + +# ─── Watchdog ─── +CONFIG_ESP_TASK_WDT_EN=n + +# ─── Logging ─── +CONFIG_LOG_DEFAULT_LEVEL_INFO=y + +# ─── Custom partition table to be adjusted for larger builds ─── +CONFIG_PARTITION_TABLE_CUSTOM=y +CONFIG_PARTITION_TABLE_CUSTOM_FILENAME="partitions.csv" +CONFIG_PARTITION_TABLE_FILENAME="partitions.csv" \ No newline at end of file diff --git a/extension/threadpool/threadpool.cpp b/extension/threadpool/threadpool.cpp index a15a2572669..1928892efe6 100644 --- a/extension/threadpool/threadpool.cpp +++ b/extension/threadpool/threadpool.cpp @@ -145,7 +145,7 @@ ThreadPool* get_threadpool() { * tricky to detect if we are running under tsan, for now capping the * default threadcount to the tsan limit unconditionally. */ - constexpr unsigned int tsan_thread_limit = 63; + constexpr decltype(result) tsan_thread_limit = 63; return std::min(result, tsan_thread_limit); })(); diff --git a/tools/cmake/preset/esp_baremetal.cmake b/tools/cmake/preset/esp_baremetal.cmake new file mode 100644 index 00000000000..cf86d5efc79 --- /dev/null +++ b/tools/cmake/preset/esp_baremetal.cmake @@ -0,0 +1,21 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+
+set(CMAKE_INSTALL_PREFIX "${CMAKE_BINARY_DIR}")
+
+# Slim down the core build for a bare-metal target: no host executor runner
+# and no file-backed loaders (the model is linked into the firmware image).
+set_overridable_option(EXECUTORCH_BUILD_EXECUTOR_RUNNER OFF)
+set_overridable_option(EXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR OFF)
+set_overridable_option(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER OFF)
+set_overridable_option(EXECUTORCH_BUILD_KERNELS_QUANTIZED ON)
+set_overridable_option(EXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL ON)
+set_overridable_option(EXECUTORCH_ENABLE_LOGGING ON)
+# NOTE(review): option name is Arm-specific but reused here to gate ETDump
+# support -- confirm whether an ESP-specific flag is intended.
+set_overridable_option(EXECUTORCH_BUILD_ARM_ETDUMP OFF)
+
+# Reference the variable directly: if("${VAR}") forces string re-evaluation
+# and is fragile under CMP0054 quoting rules; if(VAR) is the idiomatic form.
+if(EXECUTORCH_BUILD_ARM_ETDUMP)
+  # ETDump needs devtools + event tracing; flatcc emits warnings under -Werror.
+  set(EXECUTORCH_BUILD_DEVTOOLS ON)
+  set(EXECUTORCH_ENABLE_EVENT_TRACER ON)
+  set(FLATCC_ALLOW_WERROR OFF)
+else()
+  set(EXECUTORCH_ENABLE_EVENT_TRACER OFF)
+endif()