From 225c11569248d80b6c92b3696a07d510ed928d7b Mon Sep 17 00:00:00 2001 From: Rob Elliott Date: Tue, 3 Oct 2023 22:07:42 +0000 Subject: [PATCH 01/25] Initial Ethos-U runtime backend - Basic runtime targeting Corstone-300 with U55 - cross compile support with a cmake toolchain (Arm baremetal build) - support for a few models AoT -> TOSA -> Vela -> U55 hardware - dependencies on the ethos-u core driver and cmsis (submodules) Signed-off-by: Rob Elliott --- .gitmodules | 6 + CMakeLists.txt | 7 + backends/arm/CMakeLists.txt | 25 ++ backends/arm/arm_backend.py | 69 ++++- backends/arm/cmake/Dependencies.cmake | 12 + backends/arm/cmake/arm-none-eabi-gcc.cmake | 90 +++++++ backends/arm/cmake/build.sh | 53 ++++ backends/arm/cmake/toolchain.sh | 12 + backends/arm/runtime/ArmBackendEthosU.cpp | 261 +++++++++++++++++++ backends/arm/third-party/cmsis | 1 + backends/arm/third-party/ethos-u-core-driver | 1 + 11 files changed, 531 insertions(+), 6 deletions(-) create mode 100644 backends/arm/CMakeLists.txt create mode 100644 backends/arm/cmake/Dependencies.cmake create mode 100644 backends/arm/cmake/arm-none-eabi-gcc.cmake create mode 100755 backends/arm/cmake/build.sh create mode 100755 backends/arm/cmake/toolchain.sh create mode 100644 backends/arm/runtime/ArmBackendEthosU.cpp create mode 160000 backends/arm/third-party/cmsis create mode 160000 backends/arm/third-party/ethos-u-core-driver diff --git a/.gitmodules b/.gitmodules index 05143134bcf..0687c0e8b3f 100644 --- a/.gitmodules +++ b/.gitmodules @@ -43,3 +43,9 @@ [submodule "examples/demo-apps/android/jni/third-party/fbjni"] path = examples/demo-apps/android/jni/third-party/fbjni url = https://github.com/facebookincubator/fbjni.git +[submodule "backends/arm/third-party/ethos-u-core-driver"] + path = backends/arm/third-party/ethos-u-core-driver + url = https://git.mlplatform.org/ml/ethos-u/ethos-u-core-driver.git +[submodule "backends/arm/third-party/cmsis"] + path = backends/arm/third-party/cmsis + url = 
https://github.com/ARM-software/CMSIS_5.git diff --git a/CMakeLists.txt b/CMakeLists.txt index f0281766aab..122d9006b20 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -334,6 +334,13 @@ if(EXECUTORCH_BUILD_QNN) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/examples/qualcomm) endif() +# Build Arm Baremetal backend +option(EXECUTORCH_BUILD_ARM_BAREMETAL + "Build the Arm Baremetal flow for Cortex-M and Ethos-U" OFF) +if(EXECUTORCH_BUILD_ARM_BAREMETAL) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/arm) +endif() + # Add selective build subdirectory if(BUILD_SELECTIVE_BUILD_TEST) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/examples/selective_build) diff --git a/backends/arm/CMakeLists.txt b/backends/arm/CMakeLists.txt new file mode 100644 index 00000000000..2cc5cf94740 --- /dev/null +++ b/backends/arm/CMakeLists.txt @@ -0,0 +1,25 @@ +# Copyright 2023 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +cmake_minimum_required(VERSION 3.19) + +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) + +# Source root directory for executorch. +if(NOT EXECUTORCH_ROOT) + set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../..) +endif() + +include(${EXECUTORCH_ROOT}/build/Utils.cmake) + +set(_common_include_directories ${EXECUTORCH_ROOT}/..) 
+set(_common_compile_options -Wno-deprecated-declarations) + +include(cmake/Dependencies.cmake) + +set(_arm_baremetal_sources backends/arm/runtime/ArmBackendEthosU.cpp) +list(TRANSFORM _arm_baremetal_sources PREPEND "${EXECUTORCH_ROOT}/") +add_library(ethos_u STATIC ${_arm_baremetal_sources}) +target_include_directories(ethos_u PUBLIC ${_common_include_directories}) +target_include_directories(ethos_u PUBLIC ${DRIVER_ETHOSU_INCLUDE_DIR}) diff --git a/backends/arm/arm_backend.py b/backends/arm/arm_backend.py index 6b08d94e3aa..4f08856affa 100644 --- a/backends/arm/arm_backend.py +++ b/backends/arm/arm_backend.py @@ -13,6 +13,7 @@ import operator import os import tempfile +import subprocess from typing import final, List import numpy as np @@ -144,6 +145,64 @@ def dbg_tosa_dump(tosa_fb, path): f.write(js) f.close() +# Output to Vela with current file-based compilation +# WARNING: if this changes, the runtime reader also needs to change +def vela_compile(tosa_fb): + with tempfile.TemporaryDirectory() as tmpdir: + print(f"compiling to Vela in {tmpdir}") + + tosaname = "out.tosa" + flatbuffer = tosa_fb.serialize() + f = open(os.path.join(tmpdir,tosaname), "wb") + f.write(flatbuffer) + f.close() + + # invoke vela + # TODO target ethos-u55-128 + vela_command = f"cd {tmpdir}; vela --accelerator-config ethos-u55-128 {tosaname}" + subprocess.run([vela_command], shell=True, check=True) + + np_path = os.path.join(tmpdir,"output","out_sg0_vela.npz") + blocks = b'' + with np.load(np_path, allow_pickle=False) as data: + # Emit the NPZ regions as: + # - 16 byte block name null terminated string (padded to 16 if name shorter) + # - 4 byes of int32 block length and 12 bytes of 0's + # - block data (padded to 16 byte alignment at end) + # Repeat for all blocks + for key in data.keys(): + block_name = bytes(key,"utf8")[:15] + block_name = block_name + b'\x00'*(16-len(block_name)) + block_data = data[key].tobytes() + # We need the acual unpadded block lengths for hw setup + 
block_length = len(block_data).to_bytes(16, 'little') + # pad block data to multiple of 16 bytes + block_data = block_data + b'\x00'*(15-(len(block_data)-1)%16) + + block = block_name + block_length + block_data + blocks = blocks + block + + # Add a block for scratch, inputs and outputs + # scratch shape is a 1 element array giving us size in bytes + block_name = bytes("scratch_data","utf8")[:15] + block_name = block_name + b'\x00'*(16-len(block_name)) + block_length = data["scratch_shape"][0].item() + print(f"scratch length = {block_length}") + block_length = block_length+(15-(block_length-1)%16) + block_data = b'\x00'*block_length + block_length = block_length.to_bytes(16, 'little') + print(f"lengths {len(block_name)} {len(block_length)} {len(block_data)}") + block = block_name + block_length + block_data + blocks = blocks + block + # TODO are these already in scratch shape? look to be + #input_shape * input_elem_size + #output_shape * output_elem_size + # input_offset and output_offset specify the location these arrays are written from base of scratch + + # return 16 byte VELA bin header + blocks + footer + header = bytes("vela_bin_stream","utf-8") + b'\x00' + footer = bytes("vela_end_stream","utf-8") + b'\x00' + return header + blocks + footer def dbg_fail(node, tosa_fb, path): dbg_tosa_dump(tosa_fb, path) @@ -242,10 +301,6 @@ def preprocess( # noqa: C901 path = spec.value.decode() debug_output = True - # in non debug builds we still pass files to vela - if path is None: - path = tempfile.mkdtemp(prefix="arm_tosa_") - # Converted output for this subgraph, serializer needs path early as it emits # const data directly. Path created and data written only in debug builds. 
tosa_fb = ts.TosaSerializer(path) @@ -891,5 +946,7 @@ def preprocess( # noqa: C901 dbg_tosa_dump(tosa_fb, path) # Serialize and return the tosa flatbuffer - fb = tosa_fb.serialize() - return PreprocessResult(processed_bytes=bytes(fb)) + # fb = bytes(tosa_fb.serialize()) + binary = vela_compile(tosa_fb) + + return PreprocessResult(processed_bytes=binary) diff --git a/backends/arm/cmake/Dependencies.cmake b/backends/arm/cmake/Dependencies.cmake new file mode 100644 index 00000000000..27a587176bb --- /dev/null +++ b/backends/arm/cmake/Dependencies.cmake @@ -0,0 +1,12 @@ +# Copyright 2023 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +set(THIRD_PARTY_ROOT "${CMAKE_CURRENT_SOURCE_DIR}/third-party") + +# Ethos-U driver +set(DRIVER_ETHOSU_SOURCE_DIR "${THIRD_PARTY_ROOT}/ethos-u-core-driver") +set(DRIVER_ETHOSU_INCLUDE_DIR "${THIRD_PARTY_ROOT}/ethos-u-core-driver/include") +add_subdirectory( ${DRIVER_ETHOSU_SOURCE_DIR} ) +include_directories( ${DRIVER_ETHOSU_INCLUDE_DIR} ) diff --git a/backends/arm/cmake/arm-none-eabi-gcc.cmake b/backends/arm/cmake/arm-none-eabi-gcc.cmake new file mode 100644 index 00000000000..d70f79361cd --- /dev/null +++ b/backends/arm/cmake/arm-none-eabi-gcc.cmake @@ -0,0 +1,90 @@ +# Copyright 2023 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +set(TARGET_CPU "cortex-m4" CACHE STRING "Target CPU") +string(TOLOWER ${TARGET_CPU} CMAKE_SYSTEM_PROCESSOR) + +set(CMAKE_SYSTEM_NAME Generic) +set(CMAKE_C_COMPILER "arm-none-eabi-gcc") +set(CMAKE_CXX_COMPILER "arm-none-eabi-g++") +set(CMAKE_ASM_COMPILER "arm-none-eabi-gcc") +set(CMAKE_LINKER "arm-none-eabi-ld") + +set(CMAKE_EXECUTABLE_SUFFIX ".elf") +set(CMAKE_TRY_COMPILE_TARGET_TYPE STATIC_LIBRARY) +set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) +set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) +set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) + +# Select C/C++ version +set(CMAKE_C_STANDARD 11) +set(CMAKE_CXX_STANDARD 14) + +set(GCC_CPU ${CMAKE_SYSTEM_PROCESSOR}) +string(REPLACE "cortex-m85" "cortex-m55" GCC_CPU ${GCC_CPU}) + +# Compile options +add_compile_options( + -mcpu=${GCC_CPU} + -mthumb + "$<$:-gdwarf-3>" + "$<$:-fno-unwind-tables;-fno-rtti;-fno-exceptions>" + -fdata-sections + -ffunction-sections) + +# Compile defines +add_compile_definitions( + "$<$>:NDEBUG>") + +# Link options +add_link_options( + -mcpu=${GCC_CPU} + -mthumb + --specs=nosys.specs) + +# Set floating point unit +if(CMAKE_SYSTEM_PROCESSOR MATCHES "\\+fp") + set(FLOAT hard) +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "\\+nofp") + set(FLOAT soft) +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "cortex-m33(\\+|$)" OR + CMAKE_SYSTEM_PROCESSOR MATCHES "cortex-m55(\\+|$)" OR + CMAKE_SYSTEM_PROCESSOR MATCHES "cortex-m85(\\+|$)") + set(FLOAT hard) +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "cortex-m4(\\+|$)" OR + CMAKE_SYSTEM_PROCESSOR MATCHES "cortex-m7(\\+|$)") + set(FLOAT hard) + set(FPU_CONFIG "fpv4-sp-d16") + add_compile_options(-mfpu=${FPU_CONFIG}) + add_link_options(-mfpu=${FPU_CONFIG}) +else() + set(FLOAT soft) +endif() + +if (FLOAT) + add_compile_options(-mfloat-abi=${FLOAT}) + add_link_options(-mfloat-abi=${FLOAT}) +endif() + +add_link_options(LINKER:--nmagic,--gc-sections) + +# Compilation warnings +add_compile_options( +# -Wall +# -Wextra + +# -Wcast-align +# -Wdouble-promotion +# -Wformat +# 
-Wmissing-field-initializers +# -Wnull-dereference +# -Wredundant-decls +# -Wshadow +# -Wswitch +# -Wswitch-default +# -Wunused + -Wno-redundant-decls + -Wno-psabi +) diff --git a/backends/arm/cmake/build.sh b/backends/arm/cmake/build.sh new file mode 100755 index 00000000000..0dbb8cf2177 --- /dev/null +++ b/backends/arm/cmake/build.sh @@ -0,0 +1,53 @@ +#!/bin/bash +# Copyright 2023 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +set -e + +# +# Setup toolchain +# +BASEDIR=`realpath $(dirname "$0")` +echo "building using build.sh in $BASEDIR" + +ARCH=$(uname -i) +GCCPATH=${BASEDIR}/arm-gnu-toolchain-12.3.rel1-${ARCH}-arm-none-eabi/bin/ + +echo $GCCPATH +if test -d "${GCCPATH}"; then + echo Using exising compiler ${GCCPATH} +else + pushd ${BASEDIR}/ + ./toolchain.sh + popd +fi +export PATH=${PATH}:${GCCPATH} + +echo building with `arm-none-eabi-gcc -v 2>&1 | grep "^gcc"` + + +# +# Prepare and run clean build +# +rm -rf buck-out/ build/lib/ cmake-out/ +rm -rf cmake-corstone +mkdir cmake-corstone +cd cmake-corstone + +#cmake -DBUCK2=buck2 .. + +#cmake --toolchain backends/arm/cmake/arm-none-eabi-gcc.cmake .. +cmake -DFLATC_EXECUTABLE=flatc \ + -DEXECUTORCH_BUILD_XNNPACK=OFF \ + -DEXECUTORCH_BUILD_HOST_TARGETS=OFF \ + -DEXECUTORCH_BUILD_ARM_BAREMETAL=ON \ + -DCMAKE_SYSTEM_PROCESSOR=cortex-m55+nodsp+nofp \ + -DETHOSU_TARGET_NPU_CONFIG=ethos-u55-128 \ + --toolchain backends/arm/cmake/arm-none-eabi-gcc.cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -DEXECUTORCH_ENABLE_LOGGING_RELEASE_MODE=ON \ + .. + +cd .. 
+cmake --build cmake-corstone -j9 --target ethos_u ethosu_core_driver executorch portable_ops_lib portable_kernels diff --git a/backends/arm/cmake/toolchain.sh b/backends/arm/cmake/toolchain.sh new file mode 100755 index 00000000000..92188ee982d --- /dev/null +++ b/backends/arm/cmake/toolchain.sh @@ -0,0 +1,12 @@ +#!/bin/bash +# Copyright 2023 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +set -e + +# Cross compiler for Arm baremetal (e.g. Corestone-300 FVP or silcon) +ARCH=$(uname -i) +curl -o gcc.tar.xz https://armkeil.blob.core.windows.net/developer/Files/downloads/gnu/12.3.rel1/binrel/arm-gnu-toolchain-12.3.rel1-${ARCH}-arm-none-eabi.tar.xz +tar xf gcc.tar.xz +export PATH=${PATH}:`(cd arm-gnu-toolchain-12.3.rel1-aarch64-arm-none-eabi/bin/; pwd)` diff --git a/backends/arm/runtime/ArmBackendEthosU.cpp b/backends/arm/runtime/ArmBackendEthosU.cpp new file mode 100644 index 00000000000..e5d68e81156 --- /dev/null +++ b/backends/arm/runtime/ArmBackendEthosU.cpp @@ -0,0 +1,261 @@ +/* + * Copyright 2023 Arm Limited and/or its affiliates. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +/* + * Arm backend for Ethos-U baremetal driver stack, this relies on the + * ethos-u-core-driver for hardware interaction. 
+ */ + +#include + +#include +#include +#include + +#include +#include + +namespace torch { +namespace executor { + +// TODO we should be in 0x31, not this lower 1MB sRAM +// SRAM (rwx) : ORIGIN = 0x31000000, LENGTH = 0x00200000 +#define CS300_SRAM_LOW ((void*)0x11000000) +#define CS300_SRAM_HIGH ((void*)0x110FFFFF) + +class ArmBackend final : public PyTorchBackendInterface { + +public: + ArmBackend() { + printf("Constructing ARM Backend\n"); + } + + ~ArmBackend() = default; + + virtual bool is_available() const override { + return 1; + } + + Result init( + BackendInitContext& context, + FreeableBuffer* processed, + ArrayRef compile_specs) const override { + + ET_LOG(Info, "ArmBackend::init %p", processed->data() ); + + char *data = (char*)processed->data(); + size_t size = processed->size(); + char *foot = data + size - 16; + + // Header and footer both 16 bit aligned suggest valid structure and we + // wont walk off the end of the chunks and segfault + if( !((int)data == next_mul_16((int)data)) ) + { + ET_LOG(Error, "ArmBackend::init header unaligned"); + return Error::InvalidProgram; + } + if( !((int)foot == next_mul_16((int)foot)) ) + { + ET_LOG(Error, "ArmBackend::init header unaligned"); + return Error::InvalidProgram; + } + if( !(0 == strncmp( data, "vela_bin_stream", 15 )) ) + { + ET_LOG(Error, "ArmBackend::init header unaligned"); + return Error::InvalidProgram; + } + if( !(0 == strncmp( foot, "vela_end_stream", 15 )) ) + { + ET_LOG(Error, "ArmBackend::init header unaligned"); + return Error::InvalidProgram; + } + // Verify address range is accessible current expectation is the program + // is wholly stored in SRAM + if( !(data > CS300_SRAM_LOW || foot < CS300_SRAM_HIGH) ); + + // Return the same buffer we were passed - this data will be + // executed directly + return processed; + } + + Error execute( + BackendExecutionContext& context, + DelegateHandle* input_handle, + EValue** args) const override { + + FreeableBuffer* processed = 
(FreeableBuffer*)input_handle; + + ET_LOG(Info, "ArmBackend::execute %p", processed->data() ); + + vela_handles handles = { 0, 0, 0, 0, 0, 0 }; + + // Command stream - we know at this point it's aligned + char *data = (char*)processed->data(); + + // Read key sections from the vela_bin_stream + if( !this->vela_read( data, &handles, processed->size() ) ) + { + ET_LOG(Error, "ArmBackend::vela_read: error, invalid binary layout" ); + return Error::InvalidProgram; + } + + ET_LOG(Debug, "ArmBackend::execute: Running program data:\n cmd %p %d\n weight %p %d\n scratch %p %d\n", + handles.cmd_data, handles.cmd_data_size, + handles.weight_data, handles.weight_data_size, + handles.scratch_data, handles.scratch_data_size ); + + // TMP emit scratch + printf("Scratch before:\n"); + for( int i=0; itoTensor(); + for(int j=0; j()[j] = output_address[j]; + } + + return Error::Ok; + } + + void destroy(DelegateHandle* handle) const override { + return; + } + +private: + typedef struct { + const char *cmd_data; size_t cmd_data_size; + const char *weight_data; size_t weight_data_size; + const char *scratch_data; size_t scratch_data_size; + size_t input_offset; size_t input_data_shape[3]; + size_t output_offset; size_t output_data_shape[3]; + } vela_handles; + + typedef struct { + char name[16]; + int size; char _pad[12]; + char data[]; + } vela_bin_block; + + static int next_mul_16( int n ) { + return ((n-1)|15)+1; + } + + int vela_read(char* data, vela_handles *h, int size ) const { + + // Read header string + if( strncmp( data, "vela_bin_stream", 15 ) ) + { + return 0; + } + data += 16; + + // Expect one or more 'vela_bin_block's + while( 1 ) + { + vela_bin_block *b = (vela_bin_block*)data; + data += 16 + 16 + next_mul_16(b->size); + + // Exit with success on finding end of stream + if( !strncmp( b->name, "vela_end_stream", 15 ) ) return 1; + + if( !strncmp( b->name, "cmd_data", strlen("cmd_data")) ) + { + // This magic header confirms a valid command stream in binary + if( strncmp( 
b->data, "COP1", 4 ) ) return 0; + h->cmd_data = b->data; + h->cmd_data_size = b->size; + } + if( !strncmp( b->name, "weight_data", strlen("weight_data")) ) + { + h->weight_data = b->data;; + h->weight_data_size = b->size; + } + if( !strncmp( b->name, "scratch_data", strlen("scratch_data")) ) + { + h->scratch_data = b->data; + h->scratch_data_size = b->size; + } + + // capture inputs and outputs + if( !strncmp( b->name, "scratch_data", strlen("scratch_data")) ) + { + h->scratch_data = b->data; + h->scratch_data_size = b->size; + } + if( !strncmp( b->name, "input_offset", strlen("input_offset")) ) + { + h->input_offset = ((int*)b->data)[0]; + } + if( !strncmp( b->name, "output_offset", strlen("output_offset")) ) + { + h->output_offset = ((int*)b->data)[0]; + } + if( !strncmp( b->name, "input_shape", strlen("input_shape")) ) + { + h->input_data_shape[0] = ((int*)b->data)[0]; + h->input_data_shape[0] = ((int*)b->data)[1]; + h->input_data_shape[0] = ((int*)b->data)[2]; + + } + if( !strncmp( b->name, "output_shape", strlen("output_shape")) ) + { + h->output_data_shape[0] = ((int*)b->data)[0]; + h->output_data_shape[0] = ((int*)b->data)[1]; + h->output_data_shape[0] = ((int*)b->data)[2]; + } + } + } + +}; + +namespace { +auto backend = ArmBackend(); +Backend backend_id{"ArmBackend", &backend}; +static auto registered = register_backend(backend_id); +} + +} // namespace executor +} // namespace torch diff --git a/backends/arm/third-party/cmsis b/backends/arm/third-party/cmsis new file mode 160000 index 00000000000..a75f01746df --- /dev/null +++ b/backends/arm/third-party/cmsis @@ -0,0 +1 @@ +Subproject commit a75f01746df18bb5b929dfb8dc6c9407fac3a0f3 diff --git a/backends/arm/third-party/ethos-u-core-driver b/backends/arm/third-party/ethos-u-core-driver new file mode 160000 index 00000000000..90f9df900ac --- /dev/null +++ b/backends/arm/third-party/ethos-u-core-driver @@ -0,0 +1 @@ +Subproject commit 90f9df900acdc0718ecd2dfdc53780664758dec5 From 
a7aa8489d90856fd016bf7b7f1c9bc5c4ecf2dae Mon Sep 17 00:00:00 2001 From: Rob Elliott Date: Wed, 4 Oct 2023 08:35:54 +0000 Subject: [PATCH 02/25] Fixed error messages on runtime init Signed-off-by: Rob Elliott --- backends/arm/runtime/ArmBackendEthosU.cpp | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/backends/arm/runtime/ArmBackendEthosU.cpp b/backends/arm/runtime/ArmBackendEthosU.cpp index e5d68e81156..eccacd7c1cc 100644 --- a/backends/arm/runtime/ArmBackendEthosU.cpp +++ b/backends/arm/runtime/ArmBackendEthosU.cpp @@ -55,27 +55,31 @@ class ArmBackend final : public PyTorchBackendInterface { // wont walk off the end of the chunks and segfault if( !((int)data == next_mul_16((int)data)) ) { - ET_LOG(Error, "ArmBackend::init header unaligned"); + ET_LOG(Error, "ArmBackend::init: Binary needs to be 16 byte unaligned"); return Error::InvalidProgram; } if( !((int)foot == next_mul_16((int)foot)) ) { - ET_LOG(Error, "ArmBackend::init header unaligned"); + ET_LOG(Error, "ArmBackend::init: Program unexpected size"); return Error::InvalidProgram; } if( !(0 == strncmp( data, "vela_bin_stream", 15 )) ) { - ET_LOG(Error, "ArmBackend::init header unaligned"); + ET_LOG(Error, "ArmBackend::init: Binary passed not a vela_bin_stream"); return Error::InvalidProgram; } if( !(0 == strncmp( foot, "vela_end_stream", 15 )) ) { - ET_LOG(Error, "ArmBackend::init header unaligned"); + ET_LOG(Error, "ArmBackend::init: Binary passed missing vela_end_stream"); return Error::InvalidProgram; } // Verify address range is accessible current expectation is the program // is wholly stored in SRAM - if( !(data > CS300_SRAM_LOW || foot < CS300_SRAM_HIGH) ); + if( !(data > CS300_SRAM_LOW || foot < CS300_SRAM_HIGH) ) + { + ET_LOG(Error, "ArmBackend::init: Expected program binary to be in SRAM"); + return Error::InvalidProgram; + } // Return the same buffer we were passed - this data will be // executed directly From f95feade8fb9e6b089120b68bc4f87f36265933c Mon Sep 17 
00:00:00 2001 From: Rob Elliott Date: Wed, 4 Oct 2023 08:49:42 +0000 Subject: [PATCH 03/25] lintrunner cleanup Signed-off-by: Rob Elliott --- backends/arm/arm_backend.py | 44 +- backends/arm/cmake/arm-none-eabi-gcc.cmake | 2 +- backends/arm/runtime/ArmBackendEthosU.cpp | 446 ++++++++++----------- 3 files changed, 246 insertions(+), 246 deletions(-) diff --git a/backends/arm/arm_backend.py b/backends/arm/arm_backend.py index 4f08856affa..8185718f45e 100644 --- a/backends/arm/arm_backend.py +++ b/backends/arm/arm_backend.py @@ -12,8 +12,8 @@ import logging import operator import os -import tempfile import subprocess +import tempfile from typing import final, List import numpy as np @@ -145,6 +145,7 @@ def dbg_tosa_dump(tosa_fb, path): f.write(js) f.close() + # Output to Vela with current file-based compilation # WARNING: if this changes, the runtime reader also needs to change def vela_compile(tosa_fb): @@ -153,17 +154,19 @@ def vela_compile(tosa_fb): tosaname = "out.tosa" flatbuffer = tosa_fb.serialize() - f = open(os.path.join(tmpdir,tosaname), "wb") + f = open(os.path.join(tmpdir, tosaname), "wb") f.write(flatbuffer) f.close() # invoke vela # TODO target ethos-u55-128 - vela_command = f"cd {tmpdir}; vela --accelerator-config ethos-u55-128 {tosaname}" + vela_command = ( + f"cd {tmpdir}; vela --accelerator-config ethos-u55-128 {tosaname}" + ) subprocess.run([vela_command], shell=True, check=True) - np_path = os.path.join(tmpdir,"output","out_sg0_vela.npz") - blocks = b'' + np_path = os.path.join(tmpdir, "output", "out_sg0_vela.npz") + blocks = b"" with np.load(np_path, allow_pickle=False) as data: # Emit the NPZ regions as: # - 16 byte block name null terminated string (padded to 16 if name shorter) @@ -171,39 +174,40 @@ def vela_compile(tosa_fb): # - block data (padded to 16 byte alignment at end) # Repeat for all blocks for key in data.keys(): - block_name = bytes(key,"utf8")[:15] - block_name = block_name + b'\x00'*(16-len(block_name)) - block_data = 
data[key].tobytes() + block_name = bytes(key, "utf8")[:15] + block_name = block_name + b"\x00" * (16 - len(block_name)) + block_data = data[key].tobytes() # We need the acual unpadded block lengths for hw setup - block_length = len(block_data).to_bytes(16, 'little') + block_length = len(block_data).to_bytes(16, "little") # pad block data to multiple of 16 bytes - block_data = block_data + b'\x00'*(15-(len(block_data)-1)%16) + block_data = block_data + b"\x00" * (15 - (len(block_data) - 1) % 16) block = block_name + block_length + block_data blocks = blocks + block # Add a block for scratch, inputs and outputs # scratch shape is a 1 element array giving us size in bytes - block_name = bytes("scratch_data","utf8")[:15] - block_name = block_name + b'\x00'*(16-len(block_name)) + block_name = bytes("scratch_data", "utf8")[:15] + block_name = block_name + b"\x00" * (16 - len(block_name)) block_length = data["scratch_shape"][0].item() print(f"scratch length = {block_length}") - block_length = block_length+(15-(block_length-1)%16) - block_data = b'\x00'*block_length - block_length = block_length.to_bytes(16, 'little') + block_length = block_length + (15 - (block_length - 1) % 16) + block_data = b"\x00" * block_length + block_length = block_length.to_bytes(16, "little") print(f"lengths {len(block_name)} {len(block_length)} {len(block_data)}") block = block_name + block_length + block_data blocks = blocks + block # TODO are these already in scratch shape? 
look to be - #input_shape * input_elem_size - #output_shape * output_elem_size + # input_shape * input_elem_size + # output_shape * output_elem_size # input_offset and output_offset specify the location these arrays are written from base of scratch # return 16 byte VELA bin header + blocks + footer - header = bytes("vela_bin_stream","utf-8") + b'\x00' - footer = bytes("vela_end_stream","utf-8") + b'\x00' + header = bytes("vela_bin_stream", "utf-8") + b"\x00" + footer = bytes("vela_end_stream", "utf-8") + b"\x00" return header + blocks + footer + def dbg_fail(node, tosa_fb, path): dbg_tosa_dump(tosa_fb, path) logger.warn("Internal error due to poorly handled node:") @@ -948,5 +952,5 @@ def preprocess( # noqa: C901 # Serialize and return the tosa flatbuffer # fb = bytes(tosa_fb.serialize()) binary = vela_compile(tosa_fb) - + return PreprocessResult(processed_bytes=binary) diff --git a/backends/arm/cmake/arm-none-eabi-gcc.cmake b/backends/arm/cmake/arm-none-eabi-gcc.cmake index d70f79361cd..10bc858ed46 100644 --- a/backends/arm/cmake/arm-none-eabi-gcc.cmake +++ b/backends/arm/cmake/arm-none-eabi-gcc.cmake @@ -63,7 +63,7 @@ else() set(FLOAT soft) endif() -if (FLOAT) +if(FLOAT) add_compile_options(-mfloat-abi=${FLOAT}) add_link_options(-mfloat-abi=${FLOAT}) endif() diff --git a/backends/arm/runtime/ArmBackendEthosU.cpp b/backends/arm/runtime/ArmBackendEthosU.cpp index eccacd7c1cc..4c052ea60c7 100644 --- a/backends/arm/runtime/ArmBackendEthosU.cpp +++ b/backends/arm/runtime/ArmBackendEthosU.cpp @@ -28,238 +28,234 @@ namespace executor { #define CS300_SRAM_HIGH ((void*)0x110FFFFF) class ArmBackend final : public PyTorchBackendInterface { - -public: - ArmBackend() { - printf("Constructing ARM Backend\n"); - } - - ~ArmBackend() = default; - - virtual bool is_available() const override { - return 1; - } - - Result init( - BackendInitContext& context, - FreeableBuffer* processed, - ArrayRef compile_specs) const override { - - ET_LOG(Info, "ArmBackend::init %p", 
processed->data() ); - - char *data = (char*)processed->data(); - size_t size = processed->size(); - char *foot = data + size - 16; - - // Header and footer both 16 bit aligned suggest valid structure and we - // wont walk off the end of the chunks and segfault - if( !((int)data == next_mul_16((int)data)) ) - { - ET_LOG(Error, "ArmBackend::init: Binary needs to be 16 byte unaligned"); - return Error::InvalidProgram; - } - if( !((int)foot == next_mul_16((int)foot)) ) - { - ET_LOG(Error, "ArmBackend::init: Program unexpected size"); - return Error::InvalidProgram; - } - if( !(0 == strncmp( data, "vela_bin_stream", 15 )) ) - { - ET_LOG(Error, "ArmBackend::init: Binary passed not a vela_bin_stream"); - return Error::InvalidProgram; - } - if( !(0 == strncmp( foot, "vela_end_stream", 15 )) ) - { - ET_LOG(Error, "ArmBackend::init: Binary passed missing vela_end_stream"); - return Error::InvalidProgram; - } - // Verify address range is accessible current expectation is the program - // is wholly stored in SRAM - if( !(data > CS300_SRAM_LOW || foot < CS300_SRAM_HIGH) ) - { - ET_LOG(Error, "ArmBackend::init: Expected program binary to be in SRAM"); - return Error::InvalidProgram; - } - - // Return the same buffer we were passed - this data will be - // executed directly - return processed; - } - - Error execute( - BackendExecutionContext& context, - DelegateHandle* input_handle, - EValue** args) const override { - - FreeableBuffer* processed = (FreeableBuffer*)input_handle; - - ET_LOG(Info, "ArmBackend::execute %p", processed->data() ); - - vela_handles handles = { 0, 0, 0, 0, 0, 0 }; - - // Command stream - we know at this point it's aligned - char *data = (char*)processed->data(); - - // Read key sections from the vela_bin_stream - if( !this->vela_read( data, &handles, processed->size() ) ) - { - ET_LOG(Error, "ArmBackend::vela_read: error, invalid binary layout" ); - return Error::InvalidProgram; - } - - ET_LOG(Debug, "ArmBackend::execute: Running program data:\n cmd %p 
%d\n weight %p %d\n scratch %p %d\n", - handles.cmd_data, handles.cmd_data_size, - handles.weight_data, handles.weight_data_size, - handles.scratch_data, handles.scratch_data_size ); - - // TMP emit scratch - printf("Scratch before:\n"); - for( int i=0; i init( + BackendInitContext& context, + FreeableBuffer* processed, + ArrayRef compile_specs) const override { + ET_LOG(Info, "ArmBackend::init %p", processed->data()); + + char* data = (char*)processed->data(); + size_t size = processed->size(); + char* foot = data + size - 16; + + // Header and footer both 16 bit aligned suggest valid structure and we + // wont walk off the end of the chunks and segfault + if (!((int)data == next_mul_16((int)data))) { + ET_LOG(Error, "ArmBackend::init: Binary needs to be 16 byte unaligned"); + return Error::InvalidProgram; + } + if (!((int)foot == next_mul_16((int)foot))) { + ET_LOG(Error, "ArmBackend::init: Program unexpected size"); + return Error::InvalidProgram; + } + if (!(0 == strncmp(data, "vela_bin_stream", 15))) { + ET_LOG(Error, "ArmBackend::init: Binary passed not a vela_bin_stream"); + return Error::InvalidProgram; + } + if (!(0 == strncmp(foot, "vela_end_stream", 15))) { + ET_LOG(Error, "ArmBackend::init: Binary passed missing vela_end_stream"); + return Error::InvalidProgram; + } + // Verify address range is accessible current expectation is the program + // is wholly stored in SRAM + if (!(data > CS300_SRAM_LOW || foot < CS300_SRAM_HIGH)) { + ET_LOG(Error, "ArmBackend::init: Expected program binary to be in SRAM"); + return Error::InvalidProgram; + } + + // Return the same buffer we were passed - this data will be + // executed directly + return processed; + } + + Error execute( + BackendExecutionContext& context, + DelegateHandle* input_handle, + EValue** args) const override { + FreeableBuffer* processed = (FreeableBuffer*)input_handle; + + ET_LOG(Info, "ArmBackend::execute %p", processed->data()); + + vela_handles handles = {0, 0, 0, 0, 0, 0}; + + // Command 
stream - we know at this point it's aligned + char* data = (char*)processed->data(); + + // Read key sections from the vela_bin_stream + if (!this->vela_read(data, &handles, processed->size())) { + ET_LOG(Error, "ArmBackend::vela_read: error, invalid binary layout"); + return Error::InvalidProgram; + } + + ET_LOG( + Debug, + "ArmBackend::execute: Running program data:\n cmd %p %d\n weight %p %d\n scratch %p %d\n", + handles.cmd_data, + handles.cmd_data_size, + handles.weight_data, + handles.weight_data_size, + handles.scratch_data, + handles.scratch_data_size); + + // TMP emit scratch + printf("Scratch before:\n"); + for (int i = 0; i < handles.scratch_data_size; i++) { + if (i % 4 == 0) + ((char*)handles.scratch_data)[i] = 1; + printf("%02x ", ((char*)handles.scratch_data)[i]); + if (!((i + 1) % 4)) printf("\n"); - - // Process results into EValue storage - // TODO: optimise into direct write for compatible layouts - // TODO: get num in/out and layout? - int *output_address = (int*)(handles.scratch_data + handles.output_offset); - auto tensor = args[1]->toTensor(); - for(int j=0; j()[j] = output_address[j]; - } - - return Error::Ok; - } - - void destroy(DelegateHandle* handle) const override { - return; - } - -private: - typedef struct { - const char *cmd_data; size_t cmd_data_size; - const char *weight_data; size_t weight_data_size; - const char *scratch_data; size_t scratch_data_size; - size_t input_offset; size_t input_data_shape[3]; - size_t output_offset; size_t output_data_shape[3]; - } vela_handles; - - typedef struct { - char name[16]; - int size; char _pad[12]; - char data[]; - } vela_bin_block; - - static int next_mul_16( int n ) { - return ((n-1)|15)+1; - } - - int vela_read(char* data, vela_handles *h, int size ) const { - - // Read header string - if( strncmp( data, "vela_bin_stream", 15 ) ) - { - return 0; - } - data += 16; - - // Expect one or more 'vela_bin_block's - while( 1 ) - { - vela_bin_block *b = (vela_bin_block*)data; - data += 16 + 16 + 
next_mul_16(b->size); - - // Exit with success on finding end of stream - if( !strncmp( b->name, "vela_end_stream", 15 ) ) return 1; - - if( !strncmp( b->name, "cmd_data", strlen("cmd_data")) ) - { - // This magic header confirms a valid command stream in binary - if( strncmp( b->data, "COP1", 4 ) ) return 0; - h->cmd_data = b->data; - h->cmd_data_size = b->size; - } - if( !strncmp( b->name, "weight_data", strlen("weight_data")) ) - { - h->weight_data = b->data;; - h->weight_data_size = b->size; - } - if( !strncmp( b->name, "scratch_data", strlen("scratch_data")) ) - { - h->scratch_data = b->data; - h->scratch_data_size = b->size; - } - - // capture inputs and outputs - if( !strncmp( b->name, "scratch_data", strlen("scratch_data")) ) - { - h->scratch_data = b->data; - h->scratch_data_size = b->size; - } - if( !strncmp( b->name, "input_offset", strlen("input_offset")) ) - { - h->input_offset = ((int*)b->data)[0]; - } - if( !strncmp( b->name, "output_offset", strlen("output_offset")) ) - { - h->output_offset = ((int*)b->data)[0]; - } - if( !strncmp( b->name, "input_shape", strlen("input_shape")) ) - { - h->input_data_shape[0] = ((int*)b->data)[0]; - h->input_data_shape[0] = ((int*)b->data)[1]; - h->input_data_shape[0] = ((int*)b->data)[2]; - - } - if( !strncmp( b->name, "output_shape", strlen("output_shape")) ) - { - h->output_data_shape[0] = ((int*)b->data)[0]; - h->output_data_shape[0] = ((int*)b->data)[1]; - h->output_data_shape[0] = ((int*)b->data)[2]; - } - } - } - + } + printf("\n"); + + // Allocate driver handle and synchronously invoke driver + ethosu_driver* drv = ethosu_reserve_driver(); + + uint64_t bases[2] = { + (uint64_t)handles.weight_data, (uint64_t)handles.scratch_data}; + size_t bases_size[2] = { + handles.weight_data_size, handles.scratch_data_size}; + int result = ethosu_invoke_v3( + drv, + (void*)handles.cmd_data, + handles.cmd_data_size, + bases, + bases_size, + 2, + nullptr); + + if (result != 0) { + ET_LOG( + Error, + "ArmBackend::execute: 
Ethos-U invocation failed error (%d)", + result); + return Error::InvalidProgram; + } + + // TMP emit scratch + printf("Scratch after:\n"); + for (int i = 0; i < handles.scratch_data_size; i++) { + printf("%02x ", ((char*)handles.scratch_data)[i]); + if (!((i + 1) % 4)) + printf("\n"); + } + printf("\n"); + + // Process results into EValue storage + // TODO: optimise into direct write for compatible layouts + // TODO: get num in/out and layout? + int* output_address = (int*)(handles.scratch_data + handles.output_offset); + auto tensor = args[1]->toTensor(); + for (int j = 0; j < tensor.numel(); j++) { + tensor.mutable_data_ptr()[j] = output_address[j]; + } + + return Error::Ok; + } + + void destroy(DelegateHandle* handle) const override { + return; + } + + private: + typedef struct { + const char* cmd_data; + size_t cmd_data_size; + const char* weight_data; + size_t weight_data_size; + const char* scratch_data; + size_t scratch_data_size; + size_t input_offset; + size_t input_data_shape[3]; + size_t output_offset; + size_t output_data_shape[3]; + } vela_handles; + + typedef struct { + char name[16]; + int size; + char _pad[12]; + char data[]; + } vela_bin_block; + + static int next_mul_16(int n) { + return ((n - 1) | 15) + 1; + } + + int vela_read(char* data, vela_handles* h, int size) const { + // Read header string + if (strncmp(data, "vela_bin_stream", 15)) { + return 0; + } + data += 16; + + // Expect one or more 'vela_bin_block's + while (1) { + vela_bin_block* b = (vela_bin_block*)data; + data += 16 + 16 + next_mul_16(b->size); + + // Exit with success on finding end of stream + if (!strncmp(b->name, "vela_end_stream", 15)) + return 1; + + if (!strncmp(b->name, "cmd_data", strlen("cmd_data"))) { + // This magic header confirms a valid command stream in binary + if (strncmp(b->data, "COP1", 4)) + return 0; + h->cmd_data = b->data; + h->cmd_data_size = b->size; + } + if (!strncmp(b->name, "weight_data", strlen("weight_data"))) { + h->weight_data = b->data; + ; 
+ h->weight_data_size = b->size; + } + if (!strncmp(b->name, "scratch_data", strlen("scratch_data"))) { + h->scratch_data = b->data; + h->scratch_data_size = b->size; + } + + // capture inputs and outputs + if (!strncmp(b->name, "scratch_data", strlen("scratch_data"))) { + h->scratch_data = b->data; + h->scratch_data_size = b->size; + } + if (!strncmp(b->name, "input_offset", strlen("input_offset"))) { + h->input_offset = ((int*)b->data)[0]; + } + if (!strncmp(b->name, "output_offset", strlen("output_offset"))) { + h->output_offset = ((int*)b->data)[0]; + } + if (!strncmp(b->name, "input_shape", strlen("input_shape"))) { + h->input_data_shape[0] = ((int*)b->data)[0]; + h->input_data_shape[0] = ((int*)b->data)[1]; + h->input_data_shape[0] = ((int*)b->data)[2]; + } + if (!strncmp(b->name, "output_shape", strlen("output_shape"))) { + h->output_data_shape[0] = ((int*)b->data)[0]; + h->output_data_shape[0] = ((int*)b->data)[1]; + h->output_data_shape[0] = ((int*)b->data)[2]; + } + } + } }; namespace { auto backend = ArmBackend(); Backend backend_id{"ArmBackend", &backend}; static auto registered = register_backend(backend_id); -} +} // namespace } // namespace executor } // namespace torch From 524afc6713690481ee33482e4f7b520939a003b5 Mon Sep 17 00:00:00 2001 From: Rob Elliott Date: Wed, 4 Oct 2023 12:16:20 +0000 Subject: [PATCH 04/25] Add ArmBackend to example scripts Signed-off-by: Rob Elliott --- examples/arm/run.sh | 1 + examples/arm/setup.sh | 69 +++++++++++++++++++++++++------------------ 2 files changed, 42 insertions(+), 28 deletions(-) diff --git a/examples/arm/run.sh b/examples/arm/run.sh index 828ac16bdc6..515240ef2ad 100755 --- a/examples/arm/run.sh +++ b/examples/arm/run.sh @@ -56,6 +56,7 @@ function build_executorch() { -DEXECUTORCH_BUILD_GFLAGS=OFF \ -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=OFF \ -DEXECUTORCH_BUILD_HOST_TARGETS=OFF \ + -DEXECUTORCH_BUILD_ARM_BAREMETAL=ON \ -DCMAKE_BUILD_TYPE=Release \ -DEXECUTORCH_ENABLE_LOGGING=ON \ 
-DEXECUTORCH_SELECT_OPS_LIST="aten::_softmax.out" \ diff --git a/examples/arm/setup.sh b/examples/arm/setup.sh index d6f6880e173..c3518d3b24f 100755 --- a/examples/arm/setup.sh +++ b/examples/arm/setup.sh @@ -13,31 +13,7 @@ if [[ "${1}" == "-h" ]]; then fi ######## -### Hardcoded constants -######## -script_dir=$(cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) - -# FVP -fvp_url="https://developer.arm.com/-/media/Arm%20Developer%20Community/Downloads/OSS/FVP/Corstone-300/FVP_Corstone_SSE-300_11.22_20_Linux64.tgz?rev=018659bd574f4e7b95fa647e7836ccf4&hash=22A79103C6FA5FFA7AFF3BE0447F3FF9" -fvp_model_dir="Linux64_GCC-9.3" -fvp_md5_checksum="98e93b949d0fbac977292d8668d34523" - -# toochain -toolchain_url="https://armkeil.blob.core.windows.net/developer/Files/downloads/gnu/12.3.rel1/binrel/arm-gnu-toolchain-12.3.rel1-x86_64-arm-none-eabi.tar.xz" -toolchain_dir="arm-gnu-toolchain-12.3.rel1-x86_64-arm-none-eabi" -toolchain_md5_checksum="00ebb1b70b1f88906c61206457eacb61" - -# ethos-u -ethos_u_repo_url="https://review.mlplatform.org/ml/ethos-u/ethos-u" -ethos_u_base_rev="0995223100e3da8011700f58e491f1bf59511e3c" - -######## -### Optional user args -######## -root_dir=${1:-"$(realpath ${script_dir}/ethos-u-scratch)"} - -######## -### Functions +### Helper functions ######## function get_os_name() { # Returns the name of the system i.e. 
Linux or Darwin @@ -62,6 +38,44 @@ function verify_md5() { fi } +######## +### Hardcoded constants +######## +script_dir=$(cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) + +if [[ $(get_cpu_arch) == "x86_64" ]]; then + # FVP + fvp_url="https://developer.arm.com/-/media/Arm%20Developer%20Community/Downloads/OSS/FVP/Corstone-300/FVP_Corstone_SSE-300_11.22_20_Linux64.tgz?rev=018659bd574f4e7b95fa647e7836ccf4&hash=22A79103C6FA5FFA7AFF3BE0447F3FF9" + fvp_model_dir="Linux64_GCC-9.3" + + # toochain + toolchain_url="https://armkeil.blob.core.windows.net/developer/Files/downloads/gnu/12.3.rel1/binrel/arm-gnu-toolchain-12.3.rel1-x86_64-arm-none-eabi.tar.xz" + toolchain_dir="arm-gnu-toolchain-12.3.rel1-x86_64-arm-none-eabi" +elif [[ $(get_cpu_arch) == "aarch64" ]]; then + # FVP + fvp_url="https://developer.arm.com/-/media/Arm%20Developer%20Community/Downloads/OSS/FVP/Corstone-300/FVP_Corstone_SSE-300_11.22_20_Linux64_armv8l.tgz?rev=9cc6e9a32bb947ca9b21fa162144cb01&hash=7657A4CF27D42E892E3F08D452AAB073" + fvp_model_dir="Linux64_armv8l_GCC-9.3" + + # toochain + toolchain_url="https://armkeil.blob.core.windows.net/developer/Files/downloads/gnu/12.3.rel1/binrel/arm-gnu-toolchain-12.3.rel1-aarch64-arm-none-eabi.tar.xz" + toolchain_dir="arm-gnu-toolchain-12.3.rel1-aarch64-arm-none-eabi" +else + echo "[main] Error: only x86-64 & aarch64 architecture is supported for now!"; exit 1; +fi + +# ethos-u +ethos_u_repo_url="https://review.mlplatform.org/ml/ethos-u/ethos-u" +ethos_u_base_rev="0995223100e3da8011700f58e491f1bf59511e3c" + +######## +### Optional user args +######## +root_dir=${1:-"$(realpath ${script_dir}/ethos-u-scratch)"} + +######## +### Functions +######## + function setup_fvp() { # Download and install the Corstone 300 FVP simulator platform cd "${root_dir}" @@ -137,9 +151,8 @@ function patch_repo() { ######## # do basic checks # Make sure we are on a supported platform -# Linux ARM64 is a supported platform - adding it here is a WIP -[[ "$(get_cpu_arch)" 
!= "x86_64" ]] \ - && { echo "[main] Error: only x86-64 architecture is supported for now!"; exit 1; } +[[ $(get_cpu_arch) != "x86_64" ]] && [[ $(get_cpu_arch) != "aarch64" ]] \ + && { echo "[main] Error: only x86-64 & aarch64 architecture is supported for now!"; exit 1; } # No OSx support for FVP [[ "$(get_os_name)" != "Linux" ]] \ From 2bfdb5fcb7024b40e7f703da0bcfce6774f9323e Mon Sep 17 00:00:00 2001 From: Rob Elliott Date: Wed, 4 Oct 2023 14:57:04 +0000 Subject: [PATCH 05/25] Add delegate test and FVP output * There is a toolchain/linking issue mixing hard and soft float ABI which is causing the test to fail, but the structure is there and the delegate is registered. Signed-off-by: Rob Elliott --- .../0007-Add-delegate-runner-test.patch | 260 ++++++++++++++++++ examples/arm/run.sh | 21 +- 2 files changed, 279 insertions(+), 2 deletions(-) create mode 100644 examples/arm/ethos-u-setup/core_platform/patches/0007-Add-delegate-runner-test.patch diff --git a/examples/arm/ethos-u-setup/core_platform/patches/0007-Add-delegate-runner-test.patch b/examples/arm/ethos-u-setup/core_platform/patches/0007-Add-delegate-runner-test.patch new file mode 100644 index 00000000000..b5c2ad68b9d --- /dev/null +++ b/examples/arm/ethos-u-setup/core_platform/patches/0007-Add-delegate-runner-test.patch @@ -0,0 +1,260 @@ +From b6348ace74c14dc5b46060b792836399205834a7 Mon Sep 17 00:00:00 2001 +From: Rob Elliott +Date: Wed, 4 Oct 2023 13:31:33 +0000 +Subject: [PATCH] Add delegate runner test + +Signed-off-by: Rob Elliott +--- + applications/executorch_tests/CMakeLists.txt | 11 ++ + applications/executorch_tests/add.pte.h | 70 ++++++++++ + .../executorch_tests/runner_delegate.cpp | 132 ++++++++++++++++++ + 3 files changed, 213 insertions(+) + create mode 100644 applications/executorch_tests/add.pte.h + create mode 100644 applications/executorch_tests/runner_delegate.cpp + +diff --git a/applications/executorch_tests/CMakeLists.txt b/applications/executorch_tests/CMakeLists.txt +index 
c95d53e..d0233bf 100644 +--- a/applications/executorch_tests/CMakeLists.txt ++++ b/applications/executorch_tests/CMakeLists.txt +@@ -44,6 +44,7 @@ message("**********************") + set(LIB_ET_RUNTIME "${ET_BUILD_DIR_PATH}/libexecutorch.a") + set(LIB_ET_OP_REGISTRATION "${ET_BUILD_DIR_PATH}/kernels/portable/libportable_ops_lib.a") + set(LIB_ET_OP_KERNELS "${ET_BUILD_DIR_PATH}/kernels/portable/libportable_kernels.a") ++set(LIB_ET_ETHOS "${ET_BUILD_DIR_PATH}/backends/arm/libethos_u.a") + + add_custom_target( + gen_model_header ALL +@@ -67,6 +68,16 @@ ethosu_add_executable_test(executor_runner PRIVATE + ${LIB_ET_OP_REGISTRATION} + ${LIB_ET_OP_KERNELS}) + ++ethosu_add_executable_test(executor_runner PRIVATE ++ WHOLE_ARCHIVE TRUE ++ SOURCES runner_delegate.cpp ++ LIBRARIES ++ ${LIB_ET_RUNTIME} ++ ${LIB_ET_OP_REGISTRATION} ++ ${LIB_ET_OP_KERNELS} ++ ${LIB_ET_ETHOS} ++ ) ++ + add_dependencies(executor_runner gen_model_header) + + target_include_directories(executor_runner PRIVATE +diff --git a/applications/executorch_tests/add.pte.h b/applications/executorch_tests/add.pte.h +new file mode 100644 +index 0000000..05bc0ec +--- /dev/null ++++ b/applications/executorch_tests/add.pte.h +@@ -0,0 +1,70 @@ ++__attribute__((section(".sram.data"), aligned(16))) char add_pte[] = { ++0x24, 0x00, 0x00, 0x00, 0x45, 0x54, 0x31, 0x32, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x08, 0x00, ++0x0c, 0x00, 0x10, 0x00, 0x0e, 0x00, 0x00, 0x00, 0xf4, 0x04, 0x00, 0x00, 0xdc, 0x04, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, ++0x04, 0x00, 0x00, 0x00, 0x36, 0xf9, 0xff, 0xff, 0x04, 0x00, 0x00, 0x00, 0xb0, 0x04, 0x00, 0x00, 0x76, 0x65, 0x6c, 0x61, 0x5f, 0x62, 0x69, 0x6e, 0x5f, 0x73, 0x74, 0x72, 0x65, 0x61, 0x6d, 0x00, ++0x63, 0x6d, 0x64, 0x5f, 0x64, 0x61, 0x74, 0x61, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x74, 0x01, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, ++0x43, 0x4f, 0x50, 0x31, 0x01, 0x00, 0x10, 0x00, 0x07, 0x18, 0x00, 0x00, 0x00, 0x00, 0x06, 0x10, 0x05, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x02, 0x00, 0x55, 0x00, ++0x25, 0x40, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x26, 0x40, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x24, 0x40, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0f, 0x01, 0x01, 0x00, 0x00, 0x40, 0x00, 0x00, ++0x00, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0b, 0x01, 0x00, 0x00, ++0x0c, 0x01, 0x00, 0x00, 0x0a, 0x01, 0x00, 0x00, 0x04, 0x01, 0x04, 0x00, 0x06, 0x40, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x05, 0x40, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x04, 0x40, 0x00, 0x00, ++0x14, 0x00, 0x00, 0x00, 0x09, 0x01, 0x00, 0x00, 0x05, 0x01, 0x09, 0x00, 0x07, 0x01, 0x00, 0x00, 0x1f, 0x01, 0x01, 0x00, 0x10, 0x40, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x11, 0x40, 0x00, 0x00, ++0x00, 0x00, 0x00, 0x00, 0x12, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x13, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1b, 0x01, 0x00, 0x00, 0x1c, 0x01, 0x00, 0x00, 0x1a, 0x01, 0x00, 0x00, ++0x12, 0x01, 0x00, 0x00, 0x11, 0x01, 0x00, 0x00, 0x13, 0x01, 0x04, 0x00, 0x16, 0x40, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x15, 0x40, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x14, 0x40, 0x00, 0x00, ++0x14, 0x00, 0x00, 0x00, 0x18, 0x01, 0x00, 0x00, 0x14, 0x01, 0x05, 0x01, 0x25, 0x01, 0x00, 0x00, 0x26, 0x01, 0x00, 0x80, 0x27, 0x01, 0xff, 0x7f, 0x16, 0x01, 0x00, 0x00, 0x15, 0x01, 0x01, 0x00, ++0x17, 0x01, 0x07, 0x00, 0x0d, 0x01, 0x16, 0x00, 0x2d, 0x01, 0x16, 0x00, 0x8d, 0x01, 0x0a, 0x00, 0x24, 0x01, 0x00, 0x00, 0x8f, 0x01, 0x01, 0x00, 0x80, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, ++0x81, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x82, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x83, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8b, 0x01, 0x00, 0x00, 0x8c, 0x01, 0x00, 0x00, 
++0x8a, 0x01, 0x00, 0x00, 0x86, 0x40, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x85, 0x40, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x84, 0x40, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x89, 0x01, 0x00, 0x00, ++0x85, 0x01, 0x09, 0x00, 0x80, 0x01, 0x00, 0x00, 0x2f, 0x01, 0x00, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, ++0x77, 0x65, 0x69, 0x67, 0x68, 0x74, 0x5f, 0x64, 0x61, 0x74, 0x61, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, ++0x77, 0x65, 0x69, 0x67, 0x68, 0x74, 0x5f, 0x72, 0x65, 0x67, 0x69, 0x6f, 0x6e, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, ++0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x73, 0x63, 0x72, 0x61, 0x74, 0x63, 0x68, 0x5f, 0x73, 0x68, 0x61, 0x70, 0x65, 0x00, 0x00, 0x00, ++0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, ++0x73, 0x63, 0x72, 0x61, 0x74, 0x63, 0x68, 0x5f, 0x72, 0x65, 0x67, 0x69, 0x6f, 0x6e, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, ++0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x73, 0x63, 0x72, 0x61, 0x74, 0x63, 0x68, 0x5f, 0x66, 0x61, 0x73, 0x74, 0x5f, 0x73, 0x68, 0x00, ++0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, ++0x73, 0x63, 0x72, 0x61, 0x74, 0x63, 0x68, 0x5f, 0x66, 0x61, 0x73, 0x74, 0x5f, 0x72, 0x65, 0x00, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, ++0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x5f, 0x73, 0x68, 0x61, 0x70, 0x65, 0x00, 0x00, 0x00, 0x00, 0x00, ++0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, ++0x69, 0x6e, 0x70, 0x75, 0x74, 0x5f, 0x65, 0x6c, 0x65, 0x6d, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x00, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, ++0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x5f, 0x72, 0x65, 0x67, 0x69, 0x6f, 0x6e, 0x00, 0x00, 0x00, 0x00, ++0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, ++0x69, 0x6e, 0x70, 0x75, 0x74, 0x5f, 0x6f, 0x66, 0x66, 0x73, 0x65, 0x74, 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, ++0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x6f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x5f, 0x73, 0x68, 0x61, 0x70, 0x65, 0x00, 0x00, 0x00, 0x00, ++0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, ++0x6f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x5f, 0x65, 0x6c, 0x65, 0x6d, 0x5f, 0x73, 0x69, 0x7a, 0x00, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, ++0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x6f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x5f, 0x72, 0x65, 0x67, 0x69, 0x6f, 0x6e, 0x00, 0x00, 0x00, ++0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, ++0x6f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x5f, 0x6f, 0x66, 0x66, 0x73, 0x65, 0x74, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, ++0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x73, 0x63, 0x72, 0x61, 0x74, 0x63, 0x68, 0x5f, 0x64, 0x61, 0x74, 0x61, 0x00, 0x00, 0x00, 0x00, ++0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, ++0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, ++0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x76, 0x65, 0x6c, 0x61, 0x5f, 0x65, 0x6e, 0x64, 0x5f, 0x73, 0x74, 0x72, 0x65, 0x61, 0x6d, 0x00, ++0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0xfe, 0xff, 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, ++0x1c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x16, 0x00, 0x28, 0x00, 0x04, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x10, 0x00, 0x14, 0x00, 0x18, 0x00, 0x1c, 0x00, 0x20, 0x00, 0x24, 0x00, 0x16, 0x00, 0x00, 0x00, ++0x44, 0x03, 0x00, 0x00, 0xf0, 0x01, 0x00, 0x00, 0x2c, 0x01, 0x00, 0x00, 0x20, 0x01, 0x00, 0x00, 0x14, 0x01, 0x00, 0x00, 0xac, 0x00, 0x00, 0x00, 0xa4, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, ++0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, ++0x76, 0xff, 0xff, 0xff, 0x68, 0x00, 0x00, 0x00, 0x60, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x6c, 0xfe, 0xff, 0xff, 0x34, 
0x00, 0x00, 0x00, ++0x04, 0x00, 0x00, 0x00, 0x27, 0x00, 0x00, 0x00, 0x2f, 0x74, 0x6d, 0x70, 0x2f, 0x61, 0x72, 0x6d, 0x5f, 0x74, 0x6f, 0x73, 0x61, 0x5f, 0x6d, 0x7a, 0x74, 0x66, 0x5f, 0x62, 0x76, 0x67, 0x2f, 0x73, ++0x69, 0x6d, 0x70, 0x6c, 0x65, 0x5f, 0x61, 0x64, 0x64, 0x2f, 0x74, 0x6f, 0x73, 0x61, 0x2f, 0x00, 0x0f, 0x00, 0x00, 0x00, 0x64, 0x65, 0x62, 0x75, 0x67, 0x5f, 0x74, 0x6f, 0x73, 0x61, 0x5f, 0x70, ++0x61, 0x74, 0x68, 0x00, 0x04, 0x00, 0x04, 0x00, 0x04, 0x00, 0x00, 0x00, 0x0a, 0x00, 0x00, 0x00, 0x41, 0x72, 0x6d, 0x42, 0x61, 0x63, 0x6b, 0x65, 0x6e, 0x64, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, ++0x01, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0a, 0x00, 0x10, 0x00, 0x04, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x0a, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, ++0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x07, 0x00, 0x08, 0x00, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x0c, 0x00, 0x00, 0x00, ++0x08, 0x00, 0x08, 0x00, 0x00, 0x00, 0x04, 0x00, 0x08, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, ++0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x60, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, ++0xb0, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x05, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x12, 0x00, 0x14, 0x00, 0x07, 0x00, 0x00, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x00, ++0x12, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x20, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x88, 0xff, 0xff, 0xff, 0x01, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, ++0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x08, 0x00, 0x0e, 0x00, 0x07, 0x00, 0x08, 0x00, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x05, ++0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x12, 0x00, 0x16, 0x00, 0x07, 0x00, 0x00, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x00, 0x12, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, ++0x24, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x06, 0x00, 0x08, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, ++0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x04, 0x00, 0x08, 0x00, 0x08, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, ++0x39, 0x00, 0x00, 0x00, 0x5b, 0x31, 0x2c, 0x20, 0x7b, 0x22, 0x74, 0x79, 0x70, 0x65, 0x22, 0x3a, 0x20, 0x6e, 0x75, 0x6c, 0x6c, 0x2c, 0x20, 0x22, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x22, ++0x3a, 0x20, 0x6e, 0x75, 0x6c, 0x6c, 0x2c, 0x20, 0x22, 0x63, 0x68, 0x69, 0x6c, 0x64, 0x72, 0x65, 0x6e, 0x5f, 0x73, 0x70, 0x65, 0x63, 0x22, 0x3a, 0x20, 0x5b, 0x5d, 0x7d, 0x5d, 0x00, 0x00, 0x00, ++0xfe, 0x00, 0x00, 0x00, 0x5b, 0x31, 0x2c, 0x20, 0x7b, 0x22, 0x74, 0x79, 0x70, 0x65, 0x22, 0x3a, 0x20, 0x22, 0x62, 0x75, 0x69, 0x6c, 0x74, 0x69, 0x6e, 0x73, 0x2e, 0x74, 0x75, 0x70, 0x6c, 0x65, ++0x22, 0x2c, 0x20, 0x22, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x22, 0x3a, 0x20, 0x22, 0x6e, 0x75, 0x6c, 0x6c, 0x22, 0x2c, 0x20, 0x22, 0x63, 0x68, 0x69, 0x6c, 0x64, 0x72, 0x65, 0x6e, 0x5f, ++0x73, 0x70, 0x65, 0x63, 0x22, 0x3a, 0x20, 0x5b, 0x7b, 0x22, 0x74, 0x79, 0x70, 0x65, 0x22, 0x3a, 0x20, 0x22, 0x62, 0x75, 0x69, 0x6c, 0x74, 0x69, 0x6e, 0x73, 0x2e, 0x74, 0x75, 0x70, 0x6c, 0x65, ++0x22, 0x2c, 0x20, 0x22, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x22, 0x3a, 0x20, 0x22, 0x6e, 0x75, 0x6c, 0x6c, 0x22, 0x2c, 0x20, 0x22, 0x63, 0x68, 0x69, 0x6c, 0x64, 0x72, 0x65, 0x6e, 0x5f, ++0x73, 0x70, 0x65, 0x63, 0x22, 0x3a, 0x20, 0x5b, 0x7b, 0x22, 0x74, 0x79, 0x70, 0x65, 0x22, 0x3a, 0x20, 0x6e, 0x75, 0x6c, 0x6c, 0x2c, 0x20, 0x22, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x22, ++0x3a, 0x20, 0x6e, 0x75, 0x6c, 0x6c, 0x2c, 0x20, 0x22, 0x63, 0x68, 0x69, 0x6c, 0x64, 0x72, 0x65, 
0x6e, 0x5f, 0x73, 0x70, 0x65, 0x63, 0x22, 0x3a, 0x20, 0x5b, 0x5d, 0x7d, 0x5d, 0x7d, 0x2c, 0x20, ++0x7b, 0x22, 0x74, 0x79, 0x70, 0x65, 0x22, 0x3a, 0x20, 0x22, 0x62, 0x75, 0x69, 0x6c, 0x74, 0x69, 0x6e, 0x73, 0x2e, 0x64, 0x69, 0x63, 0x74, 0x22, 0x2c, 0x20, 0x22, 0x63, 0x6f, 0x6e, 0x74, 0x65, ++0x78, 0x74, 0x22, 0x3a, 0x20, 0x22, 0x5b, 0x5d, 0x22, 0x2c, 0x20, 0x22, 0x63, 0x68, 0x69, 0x6c, 0x64, 0x72, 0x65, 0x6e, 0x5f, 0x73, 0x70, 0x65, 0x63, 0x22, 0x3a, 0x20, 0x5b, 0x5d, 0x7d, 0x5d, ++0x7d, 0x5d, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x66, 0x6f, 0x72, 0x77, 0x61, 0x72, 0x64, 0x00, }; +diff --git a/applications/executorch_tests/runner_delegate.cpp b/applications/executorch_tests/runner_delegate.cpp +new file mode 100644 +index 0000000..fe77d83 +--- /dev/null ++++ b/applications/executorch_tests/runner_delegate.cpp +@@ -0,0 +1,132 @@ ++/* ++ * SPDX-FileCopyrightText: Copyright 2021-2023 Arm Limited and/or its affiliates ++ * ++ * SPDX-License-Identifier: Apache-2.0 ++ * ++ * Licensed under the Apache License, Version 2.0 (the License); you may ++ * not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an AS IS BASIS, WITHOUT ++ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. 
++ */ ++ ++/**************************************************************************** ++ * Includes ++ ****************************************************************************/ ++ ++#include ++#include ++#include ++ ++using namespace std; ++ ++#include ++#include ++#include ++#include ++#include ++ ++/**************************************************************************** ++ * Data ++ ****************************************************************************/ ++ ++// Our .pte file generated from the AoT flow ++#include "add.pte.h" ++ ++// Storage for intermediate data in SRAM ++__attribute__((section(".sram.data"), aligned(16))) uint8_t method_allocator_pool[4 * 1024U]; ++ ++ ++int main() ++{ ++ printf("test test test NG ^2 22\n"); ++ printf("main: Initialising runtime\n"); ++ torch::executor::runtime_init(); ++ ++ using torch::executor::Result; ++ using torch::executor::Error; ++ ++ // Load pte from the global add_pte .pte file loaded into SRAM. ++ auto loader = torch::executor::util::BufferDataLoader(add_pte, sizeof(add_pte)); ++ Result program = torch::executor::Program::load(&loader); ++ if(!program.ok()) { ++ printf("main: Program loading failed @ 0x%p: 0x%x", add_pte, (int)program.error()); ++ } ++ printf("main: Model buffer loaded, has %u methods\n", program->num_methods()); ++ ++ // Find our entrypoint in the .pte program ++ const char* method_name = nullptr; ++ const auto method_name_result = program->get_method_name(0); ++ ET_CHECK_MSG(method_name_result.ok(), "Program has no methods"); ++ method_name = *method_name_result; ++ printf("main: Found (and will run) method '%s'\n", method_name); ++ ++ // Allocate necessary memories for this method ++ Result method_meta = program->method_meta(method_name); ++ if (!method_meta.ok()) { ++ printf("main: Failed to get method_meta for %s: 0x%x", ++ method_name, (unsigned int)method_meta.error()); ++ } ++ ++ torch::executor::MemoryAllocator method_allocator{ ++ 
torch::executor::MemoryAllocator(sizeof(method_allocator_pool), method_allocator_pool)}; ++ ++ std::vector> planned_buffers; // Owns the memory ++ std::vector> planned_spans; // Passed to the allocator ++ size_t num_memory_planned_buffers = method_meta->num_memory_planned_buffers(); ++ ++ for (size_t id = 0; id < num_memory_planned_buffers; ++id) { ++ size_t buffer_size = static_cast(method_meta->memory_planned_buffer_size(id).get()); ++ printf("main: Setting up planned buffer %zu, size %zu.\n", id, buffer_size); ++ ++ planned_buffers.push_back(std::make_unique(buffer_size)); ++ planned_spans.push_back({planned_buffers.back().get(), buffer_size}); ++ } ++ ++ torch::executor::HierarchicalAllocator planned_memory( ++ {planned_spans.data(), planned_spans.size()}); ++ ++ torch::executor::MemoryManager memory_manager(&method_allocator, &planned_memory); ++ ++ Result method = program->load_method(method_name, &memory_manager); ++ ++ if(!method.ok()) { ++ printf("main: Loading of method %s failed with status 0x%x\n", method_name, (int)method.error()); ++ } ++ printf("main: Loading of method '%s' succesful\n", method_name); ++ ++ printf("main: Preparing inputs...\n"); ++ auto inputs = torch::executor::util::PrepareInputTensors(*method); ++ ++ printf("main: Starting the model execution...\n"); ++ Error status = method->execute(); ++ if(status != Error::Ok){ ++ printf("main: Execution of method %s failed with status 0x%x\n", method_name, (int)status); ++ } else { ++ printf("main: Model executed successfully.\n"); ++ } ++ ++ // Print the outputs. 
++ std::vector outputs(method->outputs_size()); ++ printf("main: %d outputs - ", outputs.size()); ++ status = method->get_outputs(outputs.data(), outputs.size()); ++ ET_CHECK(status == Error::Ok); ++ for (size_t i = 0; i < outputs.size(); ++i) ++ { ++ printf("main: Output %d numel %d\n", i, outputs[i].toTensor().numel()); ++ for (size_t j = 0; j < outputs[i].toTensor().numel(); ++j) ++ { ++ printf("main: Output[%d]: %d\n", j, outputs[i].toTensor().const_data_ptr()[j]); ++ } ++ } ++ ++ return 0; ++} ++ ++ +-- +2.41.0 + diff --git a/examples/arm/run.sh b/examples/arm/run.sh index 515240ef2ad..ef0a6d560a3 100755 --- a/examples/arm/run.sh +++ b/examples/arm/run.sh @@ -87,7 +87,7 @@ function build_executorch_runner() { echo "[${FUNCNAME[0]}] Configured CMAKE" n=$(nproc) - cmake --build build -- -j"$((n - 5))" executor_runner VERBOSE=1 + cmake --build build -- -j"$((n - 5))" executor_runner executor_runner_delegate VERBOSE=1 echo "[${FUNCNAME[0]}] Generated baremetal elf file:" find . -name "executor_runner.elf" } @@ -102,7 +102,21 @@ function run_fvp() { -C mps3_board.telnetterminal0.start_telnet=0 \ -C mps3_board.uart0.out_file='-' \ -a "${elf}" \ - --timelimit 10 # seconds + --timelimit 5 || true # seconds + echo "[${FUNCNAME[0]} Simulation complete, $?" +} + +# Execute the executor_runner on FVP Simulator +function run_fvp_delegate() { + elf=$(find ${ethos_u_build_dir} -name "executor_runner_delegate.elf") + [[ ! -f $elf ]] && { echo "[${FUNCNAME[0]}]: Unable to find executor_runner_delegate elf: ${elf}"; exit 1; } + FVP_Corstone_SSE-300_Ethos-U55 \ + -C ethosu.num_macs=128 \ + -C mps3_board.visualisation.disable-visualisation=1 \ + -C mps3_board.telnetterminal0.start_telnet=0 \ + -C mps3_board.uart0.out_file='-' \ + -a "${elf}" \ + --timelimit 5 || true echo "[${FUNCNAME[0]} Simulation complete, $?" 
} @@ -143,4 +157,7 @@ build_executorch_runner "${pte}" # run the app run_fvp +# run the delegate app +run_fvp_delegate + exit 0 From 8267612aafe02b36cfb81c4206b3de5f16a2521c Mon Sep 17 00:00:00 2001 From: Rob Elliott Date: Wed, 4 Oct 2023 16:12:31 +0000 Subject: [PATCH 06/25] Fix delegate runner patch --- .../0007-Add-delegate-runner-test.patch | 30 +++++++++++++------ 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/examples/arm/ethos-u-setup/core_platform/patches/0007-Add-delegate-runner-test.patch b/examples/arm/ethos-u-setup/core_platform/patches/0007-Add-delegate-runner-test.patch index b5c2ad68b9d..8b686c4f23f 100644 --- a/examples/arm/ethos-u-setup/core_platform/patches/0007-Add-delegate-runner-test.patch +++ b/examples/arm/ethos-u-setup/core_platform/patches/0007-Add-delegate-runner-test.patch @@ -1,19 +1,19 @@ -From b6348ace74c14dc5b46060b792836399205834a7 Mon Sep 17 00:00:00 2001 +From 12e841a383069f0b3d0e9c51c793c2922a590ae0 Mon Sep 17 00:00:00 2001 From: Rob Elliott Date: Wed, 4 Oct 2023 13:31:33 +0000 Subject: [PATCH] Add delegate runner test Signed-off-by: Rob Elliott --- - applications/executorch_tests/CMakeLists.txt | 11 ++ + applications/executorch_tests/CMakeLists.txt | 21 ++- applications/executorch_tests/add.pte.h | 70 ++++++++++ .../executorch_tests/runner_delegate.cpp | 132 ++++++++++++++++++ - 3 files changed, 213 insertions(+) + 3 files changed, 221 insertions(+), 2 deletions(-) create mode 100644 applications/executorch_tests/add.pte.h create mode 100644 applications/executorch_tests/runner_delegate.cpp diff --git a/applications/executorch_tests/CMakeLists.txt b/applications/executorch_tests/CMakeLists.txt -index c95d53e..d0233bf 100644 +index c95d53e..195c8a3 100644 --- a/applications/executorch_tests/CMakeLists.txt +++ b/applications/executorch_tests/CMakeLists.txt @@ -44,6 +44,7 @@ message("**********************") @@ -24,11 +24,17 @@ index c95d53e..d0233bf 100644 add_custom_target( gen_model_header ALL -@@ -67,6 +68,16 @@ 
ethosu_add_executable_test(executor_runner PRIVATE +@@ -67,10 +68,26 @@ ethosu_add_executable_test(executor_runner PRIVATE ${LIB_ET_OP_REGISTRATION} ${LIB_ET_OP_KERNELS}) -+ethosu_add_executable_test(executor_runner PRIVATE +-add_dependencies(executor_runner gen_model_header) +- + target_include_directories(executor_runner PRIVATE + ${ET_INCLUDE_PATH} + ${CMAKE_CURRENT_BINARY_DIR}) + ++ethosu_add_executable_test(executor_runner_delegate PRIVATE + WHOLE_ARCHIVE TRUE + SOURCES runner_delegate.cpp + LIBRARIES @@ -38,9 +44,15 @@ index c95d53e..d0233bf 100644 + ${LIB_ET_ETHOS} + ) + - add_dependencies(executor_runner gen_model_header) - - target_include_directories(executor_runner PRIVATE ++target_include_directories(executor_runner_delegate PRIVATE ++${ET_INCLUDE_PATH} ++${CMAKE_CURRENT_BINARY_DIR}) ++ ++add_dependencies(executor_runner gen_model_header) ++ ++ ++ + # TODO Memory setup diff --git a/applications/executorch_tests/add.pte.h b/applications/executorch_tests/add.pte.h new file mode 100644 index 0000000..05bc0ec From 6b398fe5fd32f06b3cbe5bf03cb5d82bf6bf3fe2 Mon Sep 17 00:00:00 2001 From: Rob Elliott Date: Wed, 4 Oct 2023 20:09:50 +0000 Subject: [PATCH 07/25] cmake compiler and log behaviour fixing * Override the default CPU in cmake which was causing a mixture of FPU and ABI flags to be passed to different compilation stages. 
* Updated fallback logging implementation in delegate app to fix sporadic crash Signed-off-by: Rob Elliott --- backends/arm/cmake/arm-none-eabi-gcc.cmake | 2 +- backends/arm/runtime/ArmBackendEthosU.cpp | 5 +- .../0007-Add-delegate-runner-test.patch | 66 +++++++++++++++---- 3 files changed, 58 insertions(+), 15 deletions(-) diff --git a/backends/arm/cmake/arm-none-eabi-gcc.cmake b/backends/arm/cmake/arm-none-eabi-gcc.cmake index 10bc858ed46..0921a529037 100644 --- a/backends/arm/cmake/arm-none-eabi-gcc.cmake +++ b/backends/arm/cmake/arm-none-eabi-gcc.cmake @@ -3,7 +3,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -set(TARGET_CPU "cortex-m4" CACHE STRING "Target CPU") +set(TARGET_CPU "cortex-m55" CACHE STRING "Target CPU") string(TOLOWER ${TARGET_CPU} CMAKE_SYSTEM_PROCESSOR) set(CMAKE_SYSTEM_NAME Generic) diff --git a/backends/arm/runtime/ArmBackendEthosU.cpp b/backends/arm/runtime/ArmBackendEthosU.cpp index 4c052ea60c7..7810c8f33fe 100644 --- a/backends/arm/runtime/ArmBackendEthosU.cpp +++ b/backends/arm/runtime/ArmBackendEthosU.cpp @@ -29,9 +29,8 @@ namespace executor { class ArmBackend final : public PyTorchBackendInterface { public: - ArmBackend() { - printf("Constructing ARM Backend\n"); - } + + ArmBackend() {} ~ArmBackend() = default; diff --git a/examples/arm/ethos-u-setup/core_platform/patches/0007-Add-delegate-runner-test.patch b/examples/arm/ethos-u-setup/core_platform/patches/0007-Add-delegate-runner-test.patch index 8b686c4f23f..5e9e72b098c 100644 --- a/examples/arm/ethos-u-setup/core_platform/patches/0007-Add-delegate-runner-test.patch +++ b/examples/arm/ethos-u-setup/core_platform/patches/0007-Add-delegate-runner-test.patch @@ -1,19 +1,20 @@ -From 12e841a383069f0b3d0e9c51c793c2922a590ae0 Mon Sep 17 00:00:00 2001 +From 6bd0ad55504da811159c58abeb5305a7a1ab884a Mon Sep 17 00:00:00 2001 From: Rob Elliott Date: Wed, 4 Oct 2023 13:31:33 +0000 Subject: [PATCH] Add delegate 
runner test Signed-off-by: Rob Elliott --- - applications/executorch_tests/CMakeLists.txt | 21 ++- - applications/executorch_tests/add.pte.h | 70 ++++++++++ - .../executorch_tests/runner_delegate.cpp | 132 ++++++++++++++++++ - 3 files changed, 221 insertions(+), 2 deletions(-) + applications/executorch_tests/CMakeLists.txt | 19 +- + applications/executorch_tests/add.pte.h | 70 ++++++++ + .../executorch_tests/runner_delegate.cpp | 162 ++++++++++++++++++ + cmake/toolchain/arm-none-eabi-gcc.cmake | 6 +- + 4 files changed, 251 insertions(+), 6 deletions(-) create mode 100644 applications/executorch_tests/add.pte.h create mode 100644 applications/executorch_tests/runner_delegate.cpp diff --git a/applications/executorch_tests/CMakeLists.txt b/applications/executorch_tests/CMakeLists.txt -index c95d53e..195c8a3 100644 +index c95d53e..943ade5 100644 --- a/applications/executorch_tests/CMakeLists.txt +++ b/applications/executorch_tests/CMakeLists.txt @@ -44,6 +44,7 @@ message("**********************") @@ -24,7 +25,7 @@ index c95d53e..195c8a3 100644 add_custom_target( gen_model_header ALL -@@ -67,10 +68,26 @@ ethosu_add_executable_test(executor_runner PRIVATE +@@ -67,10 +68,24 @@ ethosu_add_executable_test(executor_runner PRIVATE ${LIB_ET_OP_REGISTRATION} ${LIB_ET_OP_KERNELS}) @@ -39,8 +40,6 @@ index c95d53e..195c8a3 100644 + SOURCES runner_delegate.cpp + LIBRARIES + ${LIB_ET_RUNTIME} -+ ${LIB_ET_OP_REGISTRATION} -+ ${LIB_ET_OP_KERNELS} + ${LIB_ET_ETHOS} + ) + @@ -131,10 +130,10 @@ index 0000000..05bc0ec +0x7d, 0x5d, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x66, 0x6f, 0x72, 0x77, 0x61, 0x72, 0x64, 0x00, }; diff --git a/applications/executorch_tests/runner_delegate.cpp b/applications/executorch_tests/runner_delegate.cpp new file mode 100644 -index 0000000..fe77d83 +index 0000000..a4024bb --- /dev/null +++ b/applications/executorch_tests/runner_delegate.cpp -@@ -0,0 +1,132 @@ +@@ -0,0 +1,162 @@ +/* + * SPDX-FileCopyrightText: Copyright 2021-2023 Arm Limited and/or its affiliates + 
* @@ -179,6 +178,36 @@ index 0000000..fe77d83 +// Storage for intermediate data in SRAM +__attribute__((section(".sram.data"), aligned(16))) uint8_t method_allocator_pool[4 * 1024U]; + ++void et_pal_init(void) {} ++ ++__ET_NORETURN void et_pal_abort(void) { ++ __builtin_trap(); ++} ++ ++et_timestamp_t et_pal_current_ticks(void) { ++ // libc.a - warning: _gettimeofday is not implemented and will always fail ++ return 11223344; ++} ++ ++/** ++ * Emit a log message via platform output (serial port, console, etc). ++ */ ++void et_pal_emit_log_message( ++ __ET_UNUSED et_timestamp_t timestamp, ++ et_pal_log_level_t level, ++ const char* filename, ++ __ET_UNUSED const char* function, ++ size_t line, ++ const char* message, ++ __ET_UNUSED size_t length) { ++ fprintf( ++ stderr, ++ "%c executorch:%s:%zu] %s\n", ++ level, ++ filename, ++ line, ++ message); ++} + +int main() +{ @@ -267,6 +296,21 @@ index 0000000..fe77d83 +} + + +diff --git a/cmake/toolchain/arm-none-eabi-gcc.cmake b/cmake/toolchain/arm-none-eabi-gcc.cmake +index 0e6a2ed..fdb0d7c 100644 +--- a/cmake/toolchain/arm-none-eabi-gcc.cmake ++++ b/cmake/toolchain/arm-none-eabi-gcc.cmake +@@ -98,8 +98,6 @@ add_compile_options( + # -Wswitch + # -Wswitch-default + # -Wunused +- +- # -Wno-redundant-decls +- +- # -Wno-psabi ++ -Wno-redundant-decls ++ -Wno-psabi + ) -- 2.41.0 From 6ca9b26edf0b6623cc020591ed4fedaa8141b219 Mon Sep 17 00:00:00 2001 From: Rob Elliott Date: Thu, 5 Oct 2023 08:19:14 +0000 Subject: [PATCH 08/25] Minimal example of AoT with ArmPartitioner+Vela * uses the simple_add model to run through the AoT flow and the various debug options for looking at export. 
* produces a .pte file for runtime delegation on the ArmBackend for Ethos-U55 platforms Signed-off-by: Rob Elliott --- backends/arm/test/test_models.py | 2 + examples/arm/arm_ethosu_minimal.py | 212 +++++++++++++++++++++++++++++ 2 files changed, 214 insertions(+) create mode 100644 examples/arm/arm_ethosu_minimal.py diff --git a/backends/arm/test/test_models.py b/backends/arm/test/test_models.py index 3400a7c8f7c..b33007f0c84 100644 --- a/backends/arm/test/test_models.py +++ b/backends/arm/test/test_models.py @@ -25,6 +25,7 @@ class TosaProfile(Enum): BI = 0 # Base Inference MI = 1 # Main Inference MT = 2 # Main Training + BI_INT = 3 # integer only BI subset tests (for test graphs) class TorchBuilder: @@ -67,6 +68,7 @@ class simple_add(torch.nn.Module): inputs = { TosaProfile.BI: (torch.ones(5),), TosaProfile.MI: (torch.ones(5),), + TosaProfile.BI_INT: (torch.ones(5,dtype=torch.int32),), } def __init__(self): diff --git a/examples/arm/arm_ethosu_minimal.py b/examples/arm/arm_ethosu_minimal.py new file mode 100644 index 00000000000..a41cbb42cd5 --- /dev/null +++ b/examples/arm/arm_ethosu_minimal.py @@ -0,0 +1,212 @@ +# Copyright 2023 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +import json +import os +import subprocess +import tempfile + +import executorch.exir as exir + +import numpy as np +from executorch.backends.arm.arm_backend import ArmPartitioner +from executorch.backends.arm.test.test_models import TestList, TosaProfile +from executorch.backends.arm.test.test_tosa import prepare_model_and_ref + +from executorch.exir.backend.backend_api import to_backend +from executorch.exir.backend.canonical_partitioners.duplicate_dequant_node_pass import ( + DuplicateDequantNodePass, +) +from executorch.exir.backend.compile_spec_schema import CompileSpec + +from executorch.exir.dialects._ops import ops as exir_ops + +# Assumes you have these two tools on your path +TOSA_REF_MODEL_PATH = "tosa_reference_model" +VELA_COMPILER_PATH = "vela" + +# Basic config for graph capture +_CAPTURE_CONFIG = exir.CaptureConfig(enable_aot=True) +_EDGE_COMPILE_CONFIG = exir.EdgeCompileConfig( + _check_ir_validity=False, +) + +EXAMPLE_TEST_LIST = ["simple_add"] + +# +# +# +# +def tosa_ref_capture_inputs( + model_edge, + inputs, + path, + input_quantization_scales, + input_quantization_zps, + profile=TosaProfile.MI, +): + # Emit TOSA test data from the model inputs - assumes whole graph lowered so we just have + # placeholders for the TOSA delegate. Emits data in tosa_ref_model expected layout. + # - Skips placeholders which are encoded as constants (i.e. 
are already captured weights) + # - Assumes argument order is fixed + argument_names = [] + for node in model_edge.exported_program.graph.nodes: + gs = model_edge.exported_program.graph_signature + if node.op == "placeholder": + if node.name in gs.inputs_to_parameters: + pass + elif node.name in gs.inputs_to_buffers: + pass + else: + argument_names.append(node.name) + else: + break + + for arg in zip(argument_names, inputs): + name = arg[0] + data = arg[1].detach().numpy() + file_path = path + "/" + name + ".npy" + + # Torch is doing Input[FP32]->Q[INT8]->DQ[FP32]->Operator[FP32]->Q[INT]->DQ[FP32]->[Output]FP32 + # Need to quantize the input to INT8 for TOSA comsumption + if profile is TosaProfile.BI: + data_quantized = ( + (data / input_quantization_scales[name]) - input_quantization_zps[name] + ).astype(np.int8) + np.save(file_path, data_quantized, allow_pickle=False) + else: + np.save(file_path, data, allow_pickle=False) + +# +# Minimal sequence to take a model through the ArmPartitioner and produce +# both TOSA intermediate output, and an Ethos-U55 command stream within +# the ExecuTorch .pte binary +# +def run_test(op, profile=TosaProfile.MI, output_path="./ethosout/"): + # + # Minimal sequence to take model through TosaPartitioner and emit + # tosaout/ debug directory containing the flatbuffer - assumes one and will only save last output + # tosaout is generated even for partial/broken subgraph capture to aid in debg + # delegated.pte containing the flatbuffer within the executorch flatbuffer binary + # + print(f"\n\033[96mProcessing:::{op}\033[0m") + print(f"\033[96mDebug output path for intermediates: {output_path}\033[0m") + + os.makedirs(output_path, exist_ok=True) + + # Debug output for TORCH + TORCH_OUT_PATH = os.path.join(output_path, op, "torch", "") + os.makedirs(TORCH_OUT_PATH, exist_ok=True) + + # Debug output for TOSA + TOSA_OUT_PATH = os.path.join(output_path, op, "tosa", "") + os.makedirs(TOSA_OUT_PATH, exist_ok=True) + + model, inputs, 
torch_output = prepare_model_and_ref(op, profile) + + if inputs is None: + print("\033[96m Skipping, model has no inputs for TOSA profile \033[0m") + return + + print(f" Model: {op}\n Inputs: {inputs}\n Outputs: {torch_output}") + + # Export model + model_capture = exir.capture(model, inputs, _CAPTURE_CONFIG) + model_edge = model_capture.to_edge(_EDGE_COMPILE_CONFIG) + + # Partition with ArmBackend + ArmPartitioner.compile_spec = [CompileSpec("debug_tosa_path", bytes(TOSA_OUT_PATH, "utf8"))] + model_edge.exported_program = to_backend( + model_edge.transform(DuplicateDequantNodePass()).exported_program, + ArmPartitioner, + ) + exec_prog = model_edge.to_executorch() + + # Save .pte including delegated Vela section + with open(TORCH_OUT_PATH + "/delegated.pte", "wb") as fh: + fh.write(exec_prog.buffer) + + # NOTE: + # Additional steps from here are optional but can be helpful with + # debug as they will capture the inputs and outputs as well as running + # the intermediate output on the tosa_reference_model. + # This can ensure the compilation flow is working correctly as part of + # a development loop, ahead of running the example on hardware. 
+ + # Save inputs for TOSA reference run + tosa_ref_capture_inputs(model_edge, inputs, TOSA_OUT_PATH, {}, {}, profile) + + # Save ground truth results to file + with open(TORCH_OUT_PATH + "/torch_output.npy", "wb") as f: + np.save(f, torch_output.detach().numpy()) + + # Convert TOSA Flatbuffer into JSON format for human debugging + cmd_flatc = ( + "flatc" + + " -o " + + TOSA_OUT_PATH + + " --raw-binary -t ./backends/arm/third-party/serialization_lib/schema/tosa.fbs -- " + + TOSA_OUT_PATH + + "/output.tosa" + ) + subprocess.run([cmd_flatc], shell=True, check=True) + + ### Run the TOSA flatbuffer through TOSA Ref_Model and print the results + DESC_FILE_NAME = "/desc.json" + DESC_FILE_PATH = TOSA_OUT_PATH + DESC_FILE_NAME + cmd_ref_model = TOSA_REF_MODEL_PATH + " --test_desc " + DESC_FILE_PATH + subprocess.run([cmd_ref_model], shell=True, check=True) + + ## Load in the JSON File, Read the tosa output + desc_file = open(DESC_FILE_PATH) + desc_json = json.load(desc_file) + tosa_out_filenames = desc_json["ofm_file"] + for tosa_out_fm_file_name in tosa_out_filenames: + f = open(TOSA_OUT_PATH + "/" + tosa_out_fm_file_name, "rb") + tosa_output = np.load(f) + + ## Read the Torch Output + torch_file = open(TORCH_OUT_PATH + "/torch_output.npy", "rb") + torch_output = np.load(torch_file) + + ## Compare Tosa and Torch Results + if np.allclose(tosa_output, torch_output, rtol=1e-1, atol=1e-1, equal_nan=True): + print( + "\033[92m" + + "Torch and Tosa Reference results are matching for operator: " + + op + + " from " + + str(str(profile)) + + "\033[0m" + ) + + else: + print("\033[91m" + "Sorry, Torch and Tosa Reference Results Do not Match!") + print("============================") + print("TOSA Output Shape is: " + str(tosa_output.shape)) + print("TOSA Output is: ") + print(tosa_output) + print("\033[93m") + print("============================") + print("Torch Output Shape is: " + str(torch_output.shape)) + print("Torch Output is: ") + print(torch_output) + print("\033[0m") + + if 
profile in ( TosaProfile.BI, TosaProfile.BI_INT ): + cmd_vela = "cd " + TOSA_OUT_PATH + "; " + VELA_COMPILER_PATH + " ./output.tosa" + try: + subprocess.run([cmd_vela], shell=True, check=True) + print("\033[92m" + "Vela compile worked for: " + op + "\033[0m") + except: + print("\033[91m" + "Vela compile failed for: " + op + "\033[0m") + else: + print("\033[96m" + "Skipping Vela test on non-BI profile." + "\033[0m") + + +# Temp systest mode for running all models against both inference profiles +if __name__ == "__main__": + for op in EXAMPLE_TEST_LIST: + run_test(op, profile=TosaProfile.BI_INT) From f40ab5f587e28ce399034677f656bbd935221582 Mon Sep 17 00:00:00 2001 From: Rob Elliott Date: Thu, 5 Oct 2023 09:57:52 +0000 Subject: [PATCH 09/25] Generate pte for delegate test on the fly Signed-off-by: Rob Elliott --- .../0007-Add-delegate-runner-test.patch | 166 ++++++++---------- examples/arm/run.sh | 17 +- 2 files changed, 92 insertions(+), 91 deletions(-) diff --git a/examples/arm/ethos-u-setup/core_platform/patches/0007-Add-delegate-runner-test.patch b/examples/arm/ethos-u-setup/core_platform/patches/0007-Add-delegate-runner-test.patch index 5e9e72b098c..1f6dd480897 100644 --- a/examples/arm/ethos-u-setup/core_platform/patches/0007-Add-delegate-runner-test.patch +++ b/examples/arm/ethos-u-setup/core_platform/patches/0007-Add-delegate-runner-test.patch @@ -1,23 +1,41 @@ -From 6bd0ad55504da811159c58abeb5305a7a1ab884a Mon Sep 17 00:00:00 2001 +From d9d89c7a1d45df7c7aab3142c47f1ff797e531fe Mon Sep 17 00:00:00 2001 From: Rob Elliott Date: Wed, 4 Oct 2023 13:31:33 +0000 Subject: [PATCH] Add delegate runner test Signed-off-by: Rob Elliott --- - applications/executorch_tests/CMakeLists.txt | 19 +- - applications/executorch_tests/add.pte.h | 70 ++++++++ + applications/executorch_tests/CMakeLists.txt | 27 ++- + .../executorch_tests/pte_to_header.py | 11 +- .../executorch_tests/runner_delegate.cpp | 162 ++++++++++++++++++ cmake/toolchain/arm-none-eabi-gcc.cmake | 6 +- - 4 files 
changed, 251 insertions(+), 6 deletions(-) - create mode 100644 applications/executorch_tests/add.pte.h + 4 files changed, 197 insertions(+), 9 deletions(-) create mode 100644 applications/executorch_tests/runner_delegate.cpp diff --git a/applications/executorch_tests/CMakeLists.txt b/applications/executorch_tests/CMakeLists.txt -index c95d53e..943ade5 100644 +index c95d53e..1118469 100644 --- a/applications/executorch_tests/CMakeLists.txt +++ b/applications/executorch_tests/CMakeLists.txt -@@ -44,6 +44,7 @@ message("**********************") +@@ -28,22 +28,26 @@ set(ET_DIR_PATH "<..>/executorch" CACHE PATH "Path to ExecuTorch dir") + set(ET_BUILD_DIR_PATH "${ET_DIR_PATH}/cmake-out" CACHE PATH "Path to ExecuTorch build dir") + set(ET_INCLUDE_PATH "${ET_DIR_PATH}/.." CACHE PATH "Path to ExecuTorch headers") + set(ET_PTE_FILE_PATH "${ET_PTE_FILE_PATH}" CACHE PATH "Path to ExecuTorch model pte") ++set(ET_PTE_DELEGATE_FILE_PATH "${ET_PTE_DELGATE__FILE_PATH}" CACHE PATH "Path to ExecuTorch delegate model pte") + + get_filename_component(ET_BUILD_DIR_PATH ${ET_BUILD_DIR_PATH} REALPATH) + get_filename_component(ET_DIR_PATH ${ET_DIR_PATH} REALPATH) + get_filename_component(ET_INCLUDE_PATH ${ET_INCLUDE_PATH} REALPATH) + get_filename_component(ET_PTE_FILE_PATH ${ET_PTE_FILE_PATH} REALPATH) ++get_filename_component(ET_PTE_DELEGATE_FILE_PATH ${ET_PTE_DELEGATE_FILE_PATH} REALPATH) + + message("**********************") + message("ExecuTorch dir (ET_DIR_PATH) : ${ET_DIR_PATH}") + message("ExecuTorch build dir(ET_BUILD_DIR_PATH) : ${ET_BUILD_DIR_PATH}") + message("ExecuTorch headers (ET_INCUDE_PATH) : ${ET_INCLUDE_PATH}") + message("ExecuTorch pte file (ET_PTE_FILE_PATH) : ${ET_PTE_FILE_PATH}") ++message("ExecuTorch pte delegate file (ET_PTE_DELEGATE_FILE_PATH) : ${ET_PTE_DELEGATE_FILE_PATH}") + message("**********************") + set(LIB_ET_RUNTIME "${ET_BUILD_DIR_PATH}/libexecutorch.a") set(LIB_ET_OP_REGISTRATION "${ET_BUILD_DIR_PATH}/kernels/portable/libportable_ops_lib.a") 
set(LIB_ET_OP_KERNELS "${ET_BUILD_DIR_PATH}/kernels/portable/libportable_kernels.a") @@ -25,7 +43,20 @@ index c95d53e..943ade5 100644 add_custom_target( gen_model_header ALL -@@ -67,10 +68,24 @@ ethosu_add_executable_test(executor_runner PRIVATE +@@ -54,8 +58,11 @@ add_custom_command( + OUTPUT + ${CMAKE_CURRENT_BINARY_DIR}/fake_dep + ${CMAKE_CURRENT_BINARY_DIR}/model_pte.h ++ ${CMAKE_CURRENT_BINARY_DIR}/model_delegate_pte.h + COMMAND ${PYTHON_EXECUTABLE} ./pte_to_header.py --pte ${ET_PTE_FILE_PATH} +- --out ${CMAKE_CURRENT_BINARY_DIR} ++ --outdir ${CMAKE_CURRENT_BINARY_DIR} ++ COMMAND ${PYTHON_EXECUTABLE} ./pte_to_header.py --pte ${ET_PTE_DELEGATE_FILE_PATH} ++ --outdir ${CMAKE_CURRENT_BINARY_DIR} --outfile model_delegate_pte.h + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + ) + +@@ -67,10 +74,24 @@ ethosu_add_executable_test(executor_runner PRIVATE ${LIB_ET_OP_REGISTRATION} ${LIB_ET_OP_KERNELS}) @@ -52,85 +83,42 @@ index c95d53e..943ade5 100644 + + # TODO Memory setup -diff --git a/applications/executorch_tests/add.pte.h b/applications/executorch_tests/add.pte.h -new file mode 100644 -index 0000000..05bc0ec ---- /dev/null -+++ b/applications/executorch_tests/add.pte.h -@@ -0,0 +1,70 @@ -+__attribute__((section(".sram.data"), aligned(16))) char add_pte[] = { -+0x24, 0x00, 0x00, 0x00, 0x45, 0x54, 0x31, 0x32, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x08, 0x00, -+0x0c, 0x00, 0x10, 0x00, 0x0e, 0x00, 0x00, 0x00, 0xf4, 0x04, 0x00, 0x00, 0xdc, 0x04, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, -+0x04, 0x00, 0x00, 0x00, 0x36, 0xf9, 0xff, 0xff, 0x04, 0x00, 0x00, 0x00, 0xb0, 0x04, 0x00, 0x00, 0x76, 0x65, 0x6c, 0x61, 0x5f, 0x62, 0x69, 0x6e, 0x5f, 0x73, 0x74, 0x72, 0x65, 0x61, 0x6d, 0x00, -+0x63, 0x6d, 0x64, 0x5f, 0x64, 0x61, 0x74, 0x61, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x74, 0x01, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, -+0x43, 0x4f, 0x50, 0x31, 0x01, 0x00, 0x10, 0x00, 0x07, 0x18, 0x00, 0x00, 0x00, 0x00, 0x06, 0x10, 0x05, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x02, 0x00, 0x55, 0x00, -+0x25, 0x40, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x26, 0x40, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x24, 0x40, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0f, 0x01, 0x01, 0x00, 0x00, 0x40, 0x00, 0x00, -+0x00, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0b, 0x01, 0x00, 0x00, -+0x0c, 0x01, 0x00, 0x00, 0x0a, 0x01, 0x00, 0x00, 0x04, 0x01, 0x04, 0x00, 0x06, 0x40, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x05, 0x40, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x04, 0x40, 0x00, 0x00, -+0x14, 0x00, 0x00, 0x00, 0x09, 0x01, 0x00, 0x00, 0x05, 0x01, 0x09, 0x00, 0x07, 0x01, 0x00, 0x00, 0x1f, 0x01, 0x01, 0x00, 0x10, 0x40, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x11, 0x40, 0x00, 0x00, -+0x00, 0x00, 0x00, 0x00, 0x12, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x13, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1b, 0x01, 0x00, 0x00, 0x1c, 0x01, 0x00, 0x00, 0x1a, 0x01, 0x00, 0x00, -+0x12, 0x01, 0x00, 0x00, 0x11, 0x01, 0x00, 0x00, 0x13, 0x01, 0x04, 0x00, 0x16, 0x40, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x15, 0x40, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x14, 0x40, 0x00, 0x00, -+0x14, 0x00, 0x00, 0x00, 0x18, 0x01, 0x00, 0x00, 0x14, 0x01, 0x05, 0x01, 0x25, 0x01, 0x00, 0x00, 0x26, 0x01, 0x00, 0x80, 0x27, 0x01, 0xff, 0x7f, 0x16, 0x01, 0x00, 0x00, 0x15, 0x01, 0x01, 0x00, -+0x17, 0x01, 0x07, 0x00, 0x0d, 0x01, 0x16, 0x00, 0x2d, 0x01, 0x16, 0x00, 0x8d, 0x01, 0x0a, 0x00, 0x24, 0x01, 0x00, 0x00, 0x8f, 0x01, 0x01, 0x00, 0x80, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, -+0x81, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x82, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x83, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8b, 0x01, 0x00, 0x00, 0x8c, 0x01, 0x00, 0x00, 
-+0x8a, 0x01, 0x00, 0x00, 0x86, 0x40, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x85, 0x40, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x84, 0x40, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x89, 0x01, 0x00, 0x00, -+0x85, 0x01, 0x09, 0x00, 0x80, 0x01, 0x00, 0x00, 0x2f, 0x01, 0x00, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, -+0x77, 0x65, 0x69, 0x67, 0x68, 0x74, 0x5f, 0x64, 0x61, 0x74, 0x61, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, -+0x77, 0x65, 0x69, 0x67, 0x68, 0x74, 0x5f, 0x72, 0x65, 0x67, 0x69, 0x6f, 0x6e, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, -+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x73, 0x63, 0x72, 0x61, 0x74, 0x63, 0x68, 0x5f, 0x73, 0x68, 0x61, 0x70, 0x65, 0x00, 0x00, 0x00, -+0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, -+0x73, 0x63, 0x72, 0x61, 0x74, 0x63, 0x68, 0x5f, 0x72, 0x65, 0x67, 0x69, 0x6f, 0x6e, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, -+0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x73, 0x63, 0x72, 0x61, 0x74, 0x63, 0x68, 0x5f, 0x66, 0x61, 0x73, 0x74, 0x5f, 0x73, 0x68, 0x00, -+0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, -+0x73, 0x63, 0x72, 0x61, 0x74, 0x63, 0x68, 0x5f, 0x66, 0x61, 0x73, 0x74, 0x5f, 0x72, 0x65, 0x00, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, -+0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x5f, 0x73, 0x68, 0x61, 0x70, 0x65, 0x00, 0x00, 0x00, 0x00, 0x00, -+0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, -+0x69, 0x6e, 0x70, 0x75, 0x74, 0x5f, 0x65, 0x6c, 0x65, 0x6d, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x00, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, -+0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x5f, 0x72, 0x65, 0x67, 0x69, 0x6f, 0x6e, 0x00, 0x00, 0x00, 0x00, -+0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, -+0x69, 0x6e, 0x70, 0x75, 0x74, 0x5f, 0x6f, 0x66, 0x66, 0x73, 0x65, 0x74, 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, -+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x6f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x5f, 0x73, 0x68, 0x61, 0x70, 0x65, 0x00, 0x00, 0x00, 0x00, -+0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, -+0x6f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x5f, 0x65, 0x6c, 0x65, 0x6d, 0x5f, 0x73, 0x69, 0x7a, 0x00, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, -+0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x6f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x5f, 0x72, 0x65, 0x67, 0x69, 0x6f, 0x6e, 0x00, 0x00, 0x00, -+0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, -+0x6f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x5f, 0x6f, 0x66, 0x66, 0x73, 0x65, 0x74, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, -+0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x73, 0x63, 0x72, 0x61, 0x74, 0x63, 0x68, 0x5f, 0x64, 0x61, 0x74, 0x61, 0x00, 0x00, 0x00, 0x00, -+0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, -+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, -+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x76, 0x65, 0x6c, 0x61, 0x5f, 0x65, 0x6e, 0x64, 0x5f, 0x73, 0x74, 0x72, 0x65, 0x61, 0x6d, 0x00, -+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0xfe, 0xff, 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, -+0x1c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x16, 0x00, 0x28, 0x00, 0x04, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x10, 0x00, 0x14, 0x00, 0x18, 0x00, 0x1c, 0x00, 0x20, 0x00, 0x24, 0x00, 0x16, 0x00, 0x00, 0x00, -+0x44, 0x03, 0x00, 0x00, 0xf0, 0x01, 0x00, 0x00, 0x2c, 0x01, 0x00, 0x00, 0x20, 0x01, 0x00, 0x00, 0x14, 0x01, 0x00, 0x00, 0xac, 0x00, 0x00, 0x00, 0xa4, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, -+0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, -+0x76, 0xff, 0xff, 0xff, 0x68, 0x00, 0x00, 0x00, 0x60, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x6c, 0xfe, 0xff, 0xff, 0x34, 
0x00, 0x00, 0x00, -+0x04, 0x00, 0x00, 0x00, 0x27, 0x00, 0x00, 0x00, 0x2f, 0x74, 0x6d, 0x70, 0x2f, 0x61, 0x72, 0x6d, 0x5f, 0x74, 0x6f, 0x73, 0x61, 0x5f, 0x6d, 0x7a, 0x74, 0x66, 0x5f, 0x62, 0x76, 0x67, 0x2f, 0x73, -+0x69, 0x6d, 0x70, 0x6c, 0x65, 0x5f, 0x61, 0x64, 0x64, 0x2f, 0x74, 0x6f, 0x73, 0x61, 0x2f, 0x00, 0x0f, 0x00, 0x00, 0x00, 0x64, 0x65, 0x62, 0x75, 0x67, 0x5f, 0x74, 0x6f, 0x73, 0x61, 0x5f, 0x70, -+0x61, 0x74, 0x68, 0x00, 0x04, 0x00, 0x04, 0x00, 0x04, 0x00, 0x00, 0x00, 0x0a, 0x00, 0x00, 0x00, 0x41, 0x72, 0x6d, 0x42, 0x61, 0x63, 0x6b, 0x65, 0x6e, 0x64, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, -+0x01, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0a, 0x00, 0x10, 0x00, 0x04, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x0a, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, -+0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x07, 0x00, 0x08, 0x00, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x0c, 0x00, 0x00, 0x00, -+0x08, 0x00, 0x08, 0x00, 0x00, 0x00, 0x04, 0x00, 0x08, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, -+0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x60, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, -+0xb0, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x05, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x12, 0x00, 0x14, 0x00, 0x07, 0x00, 0x00, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x00, -+0x12, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x20, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x88, 0xff, 0xff, 0xff, 0x01, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, -+0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x08, 0x00, 0x0e, 0x00, 0x07, 0x00, 0x08, 0x00, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x05, -+0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x12, 0x00, 0x16, 0x00, 0x07, 0x00, 0x00, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x00, 0x12, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, -+0x24, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x06, 0x00, 0x08, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, -+0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x04, 0x00, 0x08, 0x00, 0x08, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, -+0x39, 0x00, 0x00, 0x00, 0x5b, 0x31, 0x2c, 0x20, 0x7b, 0x22, 0x74, 0x79, 0x70, 0x65, 0x22, 0x3a, 0x20, 0x6e, 0x75, 0x6c, 0x6c, 0x2c, 0x20, 0x22, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x22, -+0x3a, 0x20, 0x6e, 0x75, 0x6c, 0x6c, 0x2c, 0x20, 0x22, 0x63, 0x68, 0x69, 0x6c, 0x64, 0x72, 0x65, 0x6e, 0x5f, 0x73, 0x70, 0x65, 0x63, 0x22, 0x3a, 0x20, 0x5b, 0x5d, 0x7d, 0x5d, 0x00, 0x00, 0x00, -+0xfe, 0x00, 0x00, 0x00, 0x5b, 0x31, 0x2c, 0x20, 0x7b, 0x22, 0x74, 0x79, 0x70, 0x65, 0x22, 0x3a, 0x20, 0x22, 0x62, 0x75, 0x69, 0x6c, 0x74, 0x69, 0x6e, 0x73, 0x2e, 0x74, 0x75, 0x70, 0x6c, 0x65, -+0x22, 0x2c, 0x20, 0x22, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x22, 0x3a, 0x20, 0x22, 0x6e, 0x75, 0x6c, 0x6c, 0x22, 0x2c, 0x20, 0x22, 0x63, 0x68, 0x69, 0x6c, 0x64, 0x72, 0x65, 0x6e, 0x5f, -+0x73, 0x70, 0x65, 0x63, 0x22, 0x3a, 0x20, 0x5b, 0x7b, 0x22, 0x74, 0x79, 0x70, 0x65, 0x22, 0x3a, 0x20, 0x22, 0x62, 0x75, 0x69, 0x6c, 0x74, 0x69, 0x6e, 0x73, 0x2e, 0x74, 0x75, 0x70, 0x6c, 0x65, -+0x22, 0x2c, 0x20, 0x22, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x22, 0x3a, 0x20, 0x22, 0x6e, 0x75, 0x6c, 0x6c, 0x22, 0x2c, 0x20, 0x22, 0x63, 0x68, 0x69, 0x6c, 0x64, 0x72, 0x65, 0x6e, 0x5f, -+0x73, 0x70, 0x65, 0x63, 0x22, 0x3a, 0x20, 0x5b, 0x7b, 0x22, 0x74, 0x79, 0x70, 0x65, 0x22, 0x3a, 0x20, 0x6e, 0x75, 0x6c, 0x6c, 0x2c, 0x20, 0x22, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x22, -+0x3a, 0x20, 0x6e, 0x75, 0x6c, 0x6c, 0x2c, 0x20, 0x22, 0x63, 0x68, 0x69, 0x6c, 0x64, 0x72, 0x65, 
0x6e, 0x5f, 0x73, 0x70, 0x65, 0x63, 0x22, 0x3a, 0x20, 0x5b, 0x5d, 0x7d, 0x5d, 0x7d, 0x2c, 0x20, -+0x7b, 0x22, 0x74, 0x79, 0x70, 0x65, 0x22, 0x3a, 0x20, 0x22, 0x62, 0x75, 0x69, 0x6c, 0x74, 0x69, 0x6e, 0x73, 0x2e, 0x64, 0x69, 0x63, 0x74, 0x22, 0x2c, 0x20, 0x22, 0x63, 0x6f, 0x6e, 0x74, 0x65, -+0x78, 0x74, 0x22, 0x3a, 0x20, 0x22, 0x5b, 0x5d, 0x22, 0x2c, 0x20, 0x22, 0x63, 0x68, 0x69, 0x6c, 0x64, 0x72, 0x65, 0x6e, 0x5f, 0x73, 0x70, 0x65, 0x63, 0x22, 0x3a, 0x20, 0x5b, 0x5d, 0x7d, 0x5d, -+0x7d, 0x5d, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x66, 0x6f, 0x72, 0x77, 0x61, 0x72, 0x64, 0x00, }; +diff --git a/applications/executorch_tests/pte_to_header.py b/applications/executorch_tests/pte_to_header.py +index 37d88aa..be3282d 100644 +--- a/applications/executorch_tests/pte_to_header.py ++++ b/applications/executorch_tests/pte_to_header.py +@@ -30,11 +30,18 @@ parser.add_argument( + ) + parser.add_argument( + "--outdir", +- help="Output dir for model_pte.h", ++ help="Output dir for model header", + type=str, + required=False, + default=".", + ) ++parser.add_argument( ++ "--outfile", ++ help="Output filename for model header", ++ type=str, ++ required=False, ++ default="model_pte.h", ++) + parser.add_argument( + "--section", + help="Section attribute for the data array", +@@ -43,7 +50,7 @@ parser.add_argument( + default=".sram.data", + ) + args = parser.parse_args() +-outfile = os.path.join(args.outdir, "model_pte.h") ++outfile = os.path.join(args.outdir, args.outfile) + attr = f'__attribute__((section("{args.section}"), aligned(16))) char ' + + with open(args.pte, "rb") as fr, open( diff --git a/applications/executorch_tests/runner_delegate.cpp b/applications/executorch_tests/runner_delegate.cpp new file mode 100644 -index 0000000..a4024bb +index 0000000..6af6a92 --- /dev/null +++ b/applications/executorch_tests/runner_delegate.cpp @@ -0,0 +1,162 @@ @@ -173,7 +161,7 @@ index 0000000..a4024bb + ****************************************************************************/ + +// Our 
.pte file generated from the AoT flow -+#include "add.pte.h" ++#include "model_delegate_pte.h" // contains model_pte + +// Storage for intermediate data in SRAM +__attribute__((section(".sram.data"), aligned(16))) uint8_t method_allocator_pool[4 * 1024U]; @@ -218,11 +206,11 @@ index 0000000..a4024bb + using torch::executor::Result; + using torch::executor::Error; + -+ // Load pte from the global add_pte .pte file loaded into SRAM. -+ auto loader = torch::executor::util::BufferDataLoader(add_pte, sizeof(add_pte)); ++ // Load pte from the global model_pte .pte file loaded into SRAM. ++ auto loader = torch::executor::util::BufferDataLoader(model_pte, sizeof(model_pte)); + Result program = torch::executor::Program::load(&loader); + if(!program.ok()) { -+ printf("main: Program loading failed @ 0x%p: 0x%x", add_pte, (int)program.error()); ++ printf("main: Program loading failed @ 0x%p: 0x%x", model_pte, (int)program.error()); + } + printf("main: Model buffer loaded, has %u methods\n", program->num_methods()); + diff --git a/examples/arm/run.sh b/examples/arm/run.sh index ef0a6d560a3..f4ef588c4a5 100755 --- a/examples/arm/run.sh +++ b/examples/arm/run.sh @@ -43,6 +43,16 @@ function generate_pte_file() { echo "${pte_file}" } +# Generate the ethos delegate PTE file +function generate_ethos_pte_file() { + cd $et_root_dir + python3 examples/backend/arm/arm_ethosu_minimal.py &> /dev/null + cd ./ethosout/simple_add/torch/ + local pte_file=$(readlink -f ./delegated.pte) + [[ -f ${pte_file} ]] || { echo "Failed to generate a pte file - ${pte_file}"; exit 1; } + echo "${pte_file}" +} + # build ExecuTorch Libraries function build_executorch() { [[ -d "${et_build_dir}" ]] \ @@ -74,8 +84,9 @@ function build_executorch() { # build Arm Baremetal executor_runner function build_executorch_runner() { - [[ $# -ne 1 ]] && { echo "[${FUNCNAME[0]}]" "Expecting pte file as an argument got, $*"; exit 1; } + [[ $# -ne 2 ]] && { echo "[${FUNCNAME[0]}]" "Expecting 2 pte files as arguments got, 
$*"; exit 1; } local pte=${1} + local pte_delegate=${2} cd "${ethos_u_root_dir}"/core_platform cmake \ -DCMAKE_TOOLCHAIN_FILE=${toolchain_cmake} \ @@ -83,6 +94,7 @@ function build_executorch_runner() { -DET_DIR_PATH:PATH=${et_root_dir} \ -DET_BUILD_DIR_PATH:PATH=${et_build_dir} \ -DET_PTE_FILE_PATH:PATH="${pte}" \ + -DET_PTE_DELEGATE_FILE_PATH:PATH="${pte_delegate}" \ -DPYTHON_EXECUTABLE=$(which python3) echo "[${FUNCNAME[0]}] Configured CMAKE" @@ -147,12 +159,13 @@ type ${buck2} 2>&1 > /dev/null \ # get the pte pte=$(generate_pte_file) +pte_delegate=$(generate_ethos_pte_file) # build et build_executorch # build the et baremetal app -build_executorch_runner "${pte}" +build_executorch_runner "${pte}" "${pte_delegate}" # run the app run_fvp From d10d620b45c0df03dd3a73fe771186350c8c8bae Mon Sep 17 00:00:00 2001 From: Rob Elliott Date: Thu, 5 Oct 2023 14:28:49 +0000 Subject: [PATCH 10/25] Added support for variable input output patterns Signed-off-by: Rob Elliott --- backends/arm/arm_backend.py | 17 +++- backends/arm/runtime/ArmBackendEthosU.cpp | 105 ++++++++++++++++++---- backends/arm/test/test_models.py | 12 +++ examples/arm/arm_ethosu_minimal.py | 2 +- 4 files changed, 113 insertions(+), 23 deletions(-) diff --git a/backends/arm/arm_backend.py b/backends/arm/arm_backend.py index 8185718f45e..2e5cf4d9645 100644 --- a/backends/arm/arm_backend.py +++ b/backends/arm/arm_backend.py @@ -14,6 +14,7 @@ import os import subprocess import tempfile +import struct from typing import final, List import numpy as np @@ -176,7 +177,19 @@ def vela_compile(tosa_fb): for key in data.keys(): block_name = bytes(key, "utf8")[:15] block_name = block_name + b"\x00" * (16 - len(block_name)) - block_data = data[key].tobytes() + + block_data = b'' + if key in ( "input_shape", "output_shape" ): + inputs = data[key] + # Encode a struct of int len; and one or more int x,y,z,w shape; + input_struct = struct.pack(" +#include #include #include @@ -19,6 +20,8 @@ #include #include +using namespace 
std; + namespace torch { namespace executor { @@ -29,7 +32,6 @@ namespace executor { class ArmBackend final : public PyTorchBackendInterface { public: - ArmBackend() {} ~ArmBackend() = default; @@ -86,7 +88,7 @@ class ArmBackend final : public PyTorchBackendInterface { ET_LOG(Info, "ArmBackend::execute %p", processed->data()); - vela_handles handles = {0, 0, 0, 0, 0, 0}; + vela_handles handles; // Command stream - we know at this point it's aligned char* data = (char*)processed->data(); @@ -107,16 +109,45 @@ class ArmBackend final : public PyTorchBackendInterface { handles.scratch_data, handles.scratch_data_size); + printf("Processed inputs %d\n", handles.input_shape.size()); + for (int i = 0; i < handles.input_shape.size(); i++) + printf( + " %d %d %d %d\n", + handles.input_shape[i][0], + handles.input_shape[i][1], + handles.input_shape[i][2], + handles.input_shape[i][3]); + + // Input data from EValue + const char* input_addr = handles.scratch_data + handles.input_offset; + printf( + "accessing ethos input data at %p, offset %d\n", + handles.scratch_data, + handles.input_offset); + // Inputs are in the index first + int input_index = + 0; // handles.input_shape.size(); TODO: loop this for multiple inputs + printf("writing input to EValue input index %d\n", input_index); + + // Process input EValue into scratch + // TODO: optimise into direct write for compatible layouts + // is this contiguous for a memcpy of e_size*numel? 
+ int* input_address = (int*)input_addr; + auto tensor_in = args[input_index]->toTensor(); + for (int j = 0; j < tensor_in.numel(); j++) { + // TODO: extend beyond 4 byte tensors + input_address[j] = tensor_in.mutable_data_ptr()[j]; + } + // TMP emit scratch - printf("Scratch before:\n"); + printf("Scratch after setup:\n"); for (int i = 0; i < handles.scratch_data_size; i++) { - if (i % 4 == 0) - ((char*)handles.scratch_data)[i] = 1; printf("%02x ", ((char*)handles.scratch_data)[i]); if (!((i + 1) % 4)) printf("\n"); } printf("\n"); + // END TMP emit scratch // Allocate driver handle and synchronously invoke driver ethosu_driver* drv = ethosu_reserve_driver(); @@ -151,13 +182,33 @@ class ArmBackend final : public PyTorchBackendInterface { } printf("\n"); + printf("Processed outputs %d\n", handles.output_shape.size()); + for (int i = 0; i < handles.output_shape.size(); i++) + printf( + " %d %d %d %d\n", + handles.output_shape[i][0], + handles.output_shape[i][1], + handles.output_shape[i][2], + handles.output_shape[i][3]); + + // output data from Ethos U + const char* output_addr = handles.scratch_data + handles.output_offset; + printf( + "accessing ethos output data at %p, offset %d\n", + handles.scratch_data, + handles.output_offset); + // Outputs are in the index immediately after inputs + int output_index = handles.input_shape.size(); + printf("writing output to EValue output index %d\n", output_index); + // Process results into EValue storage // TODO: optimise into direct write for compatible layouts - // TODO: get num in/out and layout? - int* output_address = (int*)(handles.scratch_data + handles.output_offset); - auto tensor = args[1]->toTensor(); - for (int j = 0; j < tensor.numel(); j++) { - tensor.mutable_data_ptr()[j] = output_address[j]; + // is this contiguous for a memcpy of e_size*numel? 
+ int* output_address = (int*)output_addr; + auto tensor_out = args[output_index]->toTensor(); + for (int j = 0; j < tensor_out.numel(); j++) { + // TODO: extend beyond 4 byte tensors + tensor_out.mutable_data_ptr()[j] = output_address[j]; } return Error::Ok; @@ -176,9 +227,9 @@ class ArmBackend final : public PyTorchBackendInterface { const char* scratch_data; size_t scratch_data_size; size_t input_offset; - size_t input_data_shape[3]; + vector> input_shape; size_t output_offset; - size_t output_data_shape[3]; + vector> output_shape; } vela_handles; typedef struct { @@ -188,6 +239,11 @@ class ArmBackend final : public PyTorchBackendInterface { char data[]; } vela_bin_block; + typedef struct { + int count; + int shape[][4]; + } vela_shapes; + static int next_mul_16(int n) { return ((n - 1) | 15) + 1; } @@ -217,7 +273,6 @@ class ArmBackend final : public PyTorchBackendInterface { } if (!strncmp(b->name, "weight_data", strlen("weight_data"))) { h->weight_data = b->data; - ; h->weight_data_size = b->size; } if (!strncmp(b->name, "scratch_data", strlen("scratch_data"))) { @@ -237,14 +292,26 @@ class ArmBackend final : public PyTorchBackendInterface { h->output_offset = ((int*)b->data)[0]; } if (!strncmp(b->name, "input_shape", strlen("input_shape"))) { - h->input_data_shape[0] = ((int*)b->data)[0]; - h->input_data_shape[0] = ((int*)b->data)[1]; - h->input_data_shape[0] = ((int*)b->data)[2]; + vela_shapes* shapes = (vela_shapes*)b->data; + for (int i = 0; i < shapes->count; i++) { + vector s = { + shapes->shape[i][0], + shapes->shape[i][1], + shapes->shape[i][2], + shapes->shape[i][3]}; + h->input_shape.push_back(s); + } } if (!strncmp(b->name, "output_shape", strlen("output_shape"))) { - h->output_data_shape[0] = ((int*)b->data)[0]; - h->output_data_shape[0] = ((int*)b->data)[1]; - h->output_data_shape[0] = ((int*)b->data)[2]; + vela_shapes* shapes = (vela_shapes*)b->data; + for (int i = 0; i < shapes->count; i++) { + vector s = { + shapes->shape[i][0], + 
shapes->shape[i][1], + shapes->shape[i][2], + shapes->shape[i][3]}; + h->output_shape.push_back(s); + } } } } diff --git a/backends/arm/test/test_models.py b/backends/arm/test/test_models.py index b33007f0c84..1773eb72bfb 100644 --- a/backends/arm/test/test_models.py +++ b/backends/arm/test/test_models.py @@ -77,6 +77,18 @@ def __init__(self): def forward(self, x): return x + x + @register_test + class simple_add_2(torch.nn.Module): + inputs = { + TosaProfile.BI_INT: (torch.ones(5,dtype=torch.int32),torch.ones(5,dtype=torch.int32),), + } + + def __init__(self): + super().__init__() + + def forward(self, x, y): + return x + y + @register_test class simple_add_broadcast(torch.nn.Module): inputs = { diff --git a/examples/arm/arm_ethosu_minimal.py b/examples/arm/arm_ethosu_minimal.py index a41cbb42cd5..62411ca24c3 100644 --- a/examples/arm/arm_ethosu_minimal.py +++ b/examples/arm/arm_ethosu_minimal.py @@ -33,7 +33,7 @@ _check_ir_validity=False, ) -EXAMPLE_TEST_LIST = ["simple_add"] +EXAMPLE_TEST_LIST = ["simple_add", "simple_add_2"] # # From b812898abdf2bc0c197afabd90719b159ccd98f9 Mon Sep 17 00:00:00 2001 From: Rob Elliott Date: Thu, 5 Oct 2023 16:00:59 +0000 Subject: [PATCH 11/25] Handle multiple delegate inputs with SRAM offsets * export the list of offset from AoT floq * appropriately copy inputs and ouputs to/from SRAM Signed-off-by: Rob Elliott --- backends/arm/arm_backend.py | 8 ++- backends/arm/runtime/ArmBackendEthosU.cpp | 84 ++++++++++------------- 2 files changed, 41 insertions(+), 51 deletions(-) diff --git a/backends/arm/arm_backend.py b/backends/arm/arm_backend.py index 2e5cf4d9645..200f5818be5 100644 --- a/backends/arm/arm_backend.py +++ b/backends/arm/arm_backend.py @@ -151,8 +151,6 @@ def dbg_tosa_dump(tosa_fb, path): # WARNING: if this changes, the runtime reader also needs to change def vela_compile(tosa_fb): with tempfile.TemporaryDirectory() as tmpdir: - print(f"compiling to Vela in {tmpdir}") - tosaname = "out.tosa" flatbuffer = 
tosa_fb.serialize() f = open(os.path.join(tmpdir, tosaname), "wb") @@ -188,6 +186,12 @@ def vela_compile(tosa_fb): inp_pad = inp.tolist() + [0] * (4 - len(inp)) input_struct = input_struct + struct.pack("toTensor(); - for (int j = 0; j < tensor_in.numel(); j++) { - // TODO: extend beyond 4 byte tensors - input_address[j] = tensor_in.mutable_data_ptr()[j]; + // Write inputs into SRAM scratch area defined by Vela + for (int i = 0; i < handles.input_shape.size(); i++) { + const char* input_addr = handles.scratch_data + handles.input_offset[i]; + // Process input EValue into scratch + // TODO: optimise into direct write for compatible, contig layout + int* input_address = (int*)input_addr; + auto tensor_in = args[i]->toTensor(); + for (int j = 0; j < tensor_in.numel(); j++) { + // TODO: extend beyond 4 byte tensors + input_address[j] = tensor_in.mutable_data_ptr()[j]; + } } +#if 0 // TMP emit scratch printf("Scratch after setup:\n"); for (int i = 0; i < handles.scratch_data_size; i++) { @@ -148,6 +132,7 @@ class ArmBackend final : public PyTorchBackendInterface { } printf("\n"); // END TMP emit scratch +#endif // Allocate driver handle and synchronously invoke driver ethosu_driver* drv = ethosu_reserve_driver(); @@ -173,6 +158,7 @@ class ArmBackend final : public PyTorchBackendInterface { return Error::InvalidProgram; } +#if 0 // TMP emit scratch printf("Scratch after:\n"); for (int i = 0; i < handles.scratch_data_size; i++) { @@ -181,29 +167,16 @@ class ArmBackend final : public PyTorchBackendInterface { printf("\n"); } printf("\n"); - - printf("Processed outputs %d\n", handles.output_shape.size()); - for (int i = 0; i < handles.output_shape.size(); i++) - printf( - " %d %d %d %d\n", - handles.output_shape[i][0], - handles.output_shape[i][1], - handles.output_shape[i][2], - handles.output_shape[i][3]); +#endif // output data from Ethos U - const char* output_addr = handles.scratch_data + handles.output_offset; - printf( - "accessing ethos output data at %p, offset 
%d\n", - handles.scratch_data, - handles.output_offset); + // We only handle one output at the moment + const char* output_addr = handles.scratch_data + handles.output_offset[0]; // Outputs are in the index immediately after inputs int output_index = handles.input_shape.size(); - printf("writing output to EValue output index %d\n", output_index); // Process results into EValue storage - // TODO: optimise into direct write for compatible layouts - // is this contiguous for a memcpy of e_size*numel? + // TODO: optimise into direct write for compatible, contig layout int* output_address = (int*)output_addr; auto tensor_out = args[output_index]->toTensor(); for (int j = 0; j < tensor_out.numel(); j++) { @@ -226,9 +199,9 @@ class ArmBackend final : public PyTorchBackendInterface { size_t weight_data_size; const char* scratch_data; size_t scratch_data_size; - size_t input_offset; + vector input_offset; vector> input_shape; - size_t output_offset; + vector output_offset; vector> output_shape; } vela_handles; @@ -244,6 +217,11 @@ class ArmBackend final : public PyTorchBackendInterface { int shape[][4]; } vela_shapes; + typedef struct { + int count; + int offsets[]; + } vela_offsets; + static int next_mul_16(int n) { return ((n - 1) | 15) + 1; } @@ -285,12 +263,20 @@ class ArmBackend final : public PyTorchBackendInterface { h->scratch_data = b->data; h->scratch_data_size = b->size; } + if (!strncmp(b->name, "input_offset", strlen("input_offset"))) { - h->input_offset = ((int*)b->data)[0]; + vela_offsets* offsets = (vela_offsets*)b->data; + for (int i = 0; i < offsets->count; i++) { + h->input_offset.push_back(offsets->offsets[i]); + } } if (!strncmp(b->name, "output_offset", strlen("output_offset"))) { - h->output_offset = ((int*)b->data)[0]; + vela_offsets* offsets = (vela_offsets*)b->data; + for (int i = 0; i < offsets->count; i++) { + h->output_offset.push_back(offsets->offsets[i]); + } } + if (!strncmp(b->name, "input_shape", strlen("input_shape"))) { vela_shapes* 
shapes = (vela_shapes*)b->data; for (int i = 0; i < shapes->count; i++) { From 44a46a14eec7fb973a19550b836c6c9bfe90fac6 Mon Sep 17 00:00:00 2001 From: Rob Elliott Date: Thu, 5 Oct 2023 17:03:13 +0000 Subject: [PATCH 12/25] Add TOSA ref model and Vela dependencies Signed-off-by: Rob Elliott --- examples/arm/setup.sh | 48 ++++++- ...001-Improve-rescale-codegen-for-TOSA.patch | 129 ++++++++++++++++++ 2 files changed, 176 insertions(+), 1 deletion(-) create mode 100644 examples/backend/arm/ethos-u-setup/ethos-u-vela/patches/0001-Improve-rescale-codegen-for-TOSA.patch diff --git a/examples/arm/setup.sh b/examples/arm/setup.sh index c3518d3b24f..dace442ea8b 100755 --- a/examples/arm/setup.sh +++ b/examples/arm/setup.sh @@ -146,6 +146,45 @@ function patch_repo() { echo -e "[${FUNCNAME[0]}] Patched ${name} @ $(git describe --all --long 2> /dev/null) in ${repo_dir} dir.\n" } +function setup_tosa_reference_model() { + # The debug flow on the host includes running on a reference implementation of TOSA + # This is useful primarily for debug of quantization accuracy, but also for internal + # errors for the early codebase + cd "${root_dir}" + if [[ ! -e reference_model ]]; then + git clone https://git.mlplatform.org/tosa/reference_model.git -b v0.80.0 + cd reference_model + git submodule update --init --recursive + cd .. + fi + cd reference_model + mkdir -p build + cd build + cmake .. + make + cd reference_model + tosa_bin_path=`pwd` + echo adding ${tosa_bin_path} to path + echo "export PATH=\${PATH}:${tosa_bin_path}" >> ${update_path_script} + cd ../.. + echo back at `pwd` +} + +function setup_vela() { + # + # Prepare the Vela compiler for AoT to Ethos-U compilation + # + cd "${root_dir}/ethos-u/" + if [[ ! -e ethos-u-vela ]]; then + git clone https://git.mlplatform.org/ml/ethos-u/ethos-u-vela.git + name="ethos-u-vela" + base_rev=00a15db3e1a188b25065d095152d701f4394cdc5 + patch_repo + fi + pip install . + cd .. 
+} + ######## ### main ######## @@ -182,6 +221,13 @@ name="core_platform" base_rev=204210b1074071532627da9dc69950d058a809f4 patch_repo +# Setup the tosa_reference_model +setup_tosa_reference_model + +# Setup vela and patch in codegen fixes +setup_vela + echo "[main] update path by doing 'source ${setup_path_script}'" -echo "[main] sucecss!" + +echo "[main] success!" exit 0 diff --git a/examples/backend/arm/ethos-u-setup/ethos-u-vela/patches/0001-Improve-rescale-codegen-for-TOSA.patch b/examples/backend/arm/ethos-u-setup/ethos-u-vela/patches/0001-Improve-rescale-codegen-for-TOSA.patch new file mode 100644 index 00000000000..e131ca76ee8 --- /dev/null +++ b/examples/backend/arm/ethos-u-setup/ethos-u-vela/patches/0001-Improve-rescale-codegen-for-TOSA.patch @@ -0,0 +1,129 @@ +From ef07230fbb15edbf27ecaf48994fb157430a5e7c Mon Sep 17 00:00:00 2001 +From: Rob Elliott +Date: Thu, 5 Oct 2023 16:45:42 +0000 +Subject: [PATCH] Improve rescale codegen for TOSA + +Signed-off-by: Rob Elliott +--- + ethosu/vela/tosa_graph_optimiser.py | 56 +++++++++++------------------ + ethosu/vela/tosa_mapping.py | 2 +- + 2 files changed, 22 insertions(+), 36 deletions(-) + +diff --git a/ethosu/vela/tosa_graph_optimiser.py b/ethosu/vela/tosa_graph_optimiser.py +index df6b575..b2e3697 100644 +--- a/ethosu/vela/tosa_graph_optimiser.py ++++ b/ethosu/vela/tosa_graph_optimiser.py +@@ -337,7 +337,8 @@ def rewrite_concat(op): + + def remove_memory_ops(op, arch): + if op.run_on_npu and op.type in (Op.Reshape, Op.Identity): +- bypass_memory_only_ops(op) ++ # TODO: is this ok - function doesn't use arch or nng ++ bypass_memory_only_ops(op, arch, None) + + + def rewrite_activation(op, arch, nng): +@@ -357,7 +358,6 @@ def rewrite_activation(op, arch, nng): + + return op + +- + def rewrite_rescale(op, arch, nng): + if op.type == Op.Rescale: + ifm = op.ifm +@@ -368,7 +368,7 @@ def rewrite_rescale(op, arch, nng): + prev_op = ifm.ops[0] + + # TODO currently not supported +- assert len(ifm.consumer_list) == 1 ++ 
#assert len(ifm.consumer_list) == 1 + + input_zp = op.attrs["input_zp"] + output_zp = op.attrs["output_zp"] +@@ -390,6 +390,9 @@ def rewrite_rescale(op, arch, nng): + assert False + ifm.quantization.zero_point = input_zp + ofm.quantization.zero_point = output_zp ++ ++ assert False == per_channel, "Don't like per_channel!" ++ + for s, m in zip(shift, multiplier): + # TODO these are the TOSA limitations + assert m >= 0 +@@ -403,45 +406,28 @@ def rewrite_rescale(op, arch, nng): + else: + rounding_mode = RoundingMode.HalfUp + +- if prev_op.type.is_depthwise_conv2d_op() or prev_op.type.is_conv2d_op() or prev_op.type == Op.FullyConnected: ++ fuse = len(ifm.ops) == 1 and prev_op.type.is_depthwise_conv2d_op() or prev_op.type.is_conv2d_op() ++ if fuse: ++ # TODO: ERROR: bias.values didn't exist for an op like Add - presumably not a capability of that op + assert len(multiplier) == len(shift) == len(prev_op.bias.values) +- +- if ifm.dtype == DataType.int32 and per_channel: +- prev_op.explicit_scaling = explicit_scaling +- prev_op.rounding_mode = rounding_mode +- +- # Bypass op +- prev_op.set_output_tensor(ofm) +- DebugDatabase.add_optimised(op, prev_op) +- return op +- else: +- print("Warning, unsupported fusing of TOSA Rescale previous operator is of type:", prev_op.type) +- assert False +- # TODO which are the cases we need to and can do standalone Rescale? +- # TODO should we try to identify a conversion uint8<->int8 accomplished by 2 RESCALE ops? +- # origin might be TFLite op QUANTIZE, should we look to see if they can be translated to QUANTIZE? 
+- # limited to these at the moment: +- elif ( +- (ifm.dtype == DataType.int8 and ofm.dtype == DataType.int8) +- or (ifm.dtype == DataType.uint8 and ofm.dtype == DataType.int8) +- or (ifm.dtype == DataType.int8 and ofm.dtype == DataType.uint8) +- ): +- # Create NOP performing the RESCALE ++ # TODO: generate replacement fusion code from below ++ assert False, "Fusion possible but i've not implemented it" ++ else: ++ # Generate Rescale behaviour attached to a compatible NOP ++ # TODO: I assume this attaches a new operator into the graph?? + avgpool_op = replace_rescale_with_avg_pool(op) + avgpool_op.rounding_mode = rounding_mode +- ++ + if per_channel: +- # TODO +- avgpool_op.explicit_scaling = explicit_scaling +- print("Warning, unsupported TOSA Rescale") +- assert False ++ assert False, "Assert above removed but still not implemented... :/" + else: + avgpool_op.explicit_scaling = explicit_scaling +- else: +- print("Warning, unsupported fusing of TOSA Rescale previous operator is of type:", prev_op.type) +- assert False +- return op + ++ #print( len(multiplier), len(shift), len(prev_op.get_bias_tensors()) ) ++ #print( ifm.dtype, "PC:", per_channel, op.type ) ++ #print( ifm.dtype, ofm.dtype ) ++ ++ return op + + def convert_pad_in_width(op): + """ +diff --git a/ethosu/vela/tosa_mapping.py b/ethosu/vela/tosa_mapping.py +index 2dafd81..ed5aa2e 100644 +--- a/ethosu/vela/tosa_mapping.py ++++ b/ethosu/vela/tosa_mapping.py +@@ -148,7 +148,7 @@ transpose_conv_attrs = AttrSerializer( + ) + transpose_attrs = AttrSerializer("TransposeAttribute", (("perms", is_vec),)) + axis_attrs = AttrSerializer("AxisAttribute", ("axis",)) +-reshape_attrs = AttrSerializer("ReshapeAttribute", (("shape", is_vec),)) ++reshape_attrs = AttrSerializer("ReshapeAttribute", (("newShape", is_vec),)) + slice_attrs = AttrSerializer("SliceAttribute", (("start", is_vec), ("size", is_vec))) + tile_attrs = AttrSerializer("TileAttribute", (("multiplies", is_vec),)) + resize_attrs = AttrSerializer( +-- +2.41.0 
+ From cb16e682da9965104a4e8650ff8e785756f48983 Mon Sep 17 00:00:00 2001 From: Rob Elliott Date: Thu, 5 Oct 2023 17:07:42 +0000 Subject: [PATCH 13/25] Cleanup from lintrunner and other bits of tidyup Signed-off-by: Rob Elliott --- backends/arm/arm_backend.py | 8 ++++---- backends/arm/test/test_models.py | 9 ++++++--- examples/arm/arm_ethosu_minimal.py | 22 +++++++++++----------- 3 files changed, 21 insertions(+), 18 deletions(-) diff --git a/backends/arm/arm_backend.py b/backends/arm/arm_backend.py index 200f5818be5..da1b70e57d6 100644 --- a/backends/arm/arm_backend.py +++ b/backends/arm/arm_backend.py @@ -12,9 +12,9 @@ import logging import operator import os +import struct import subprocess import tempfile -import struct from typing import final, List import numpy as np @@ -176,8 +176,8 @@ def vela_compile(tosa_fb): block_name = bytes(key, "utf8")[:15] block_name = block_name + b"\x00" * (16 - len(block_name)) - block_data = b'' - if key in ( "input_shape", "output_shape" ): + block_data = b"" + if key in ("input_shape", "output_shape"): inputs = data[key] # Encode a struct of int len; and one or more int x,y,z,w shape; input_struct = struct.pack(" Date: Thu, 5 Oct 2023 19:15:31 +0000 Subject: [PATCH 14/25] Removed ethos u driver build and cmsis dependency Signed-off-by: Rob Elliott --- .gitmodules | 3 --- backends/arm/cmake/Dependencies.cmake | 2 -- backends/arm/third-party/cmsis | 1 - 3 files changed, 6 deletions(-) delete mode 160000 backends/arm/third-party/cmsis diff --git a/.gitmodules b/.gitmodules index 0687c0e8b3f..8cb71f3a18e 100644 --- a/.gitmodules +++ b/.gitmodules @@ -46,6 +46,3 @@ [submodule "backends/arm/third-party/ethos-u-core-driver"] path = backends/arm/third-party/ethos-u-core-driver url = https://git.mlplatform.org/ml/ethos-u/ethos-u-core-driver.git -[submodule "backends/arm/third-party/cmsis"] - path = backends/arm/third-party/cmsis - url = https://github.com/ARM-software/CMSIS_5.git diff --git a/backends/arm/cmake/Dependencies.cmake 
b/backends/arm/cmake/Dependencies.cmake index 27a587176bb..fae39dd53b9 100644 --- a/backends/arm/cmake/Dependencies.cmake +++ b/backends/arm/cmake/Dependencies.cmake @@ -6,7 +6,5 @@ set(THIRD_PARTY_ROOT "${CMAKE_CURRENT_SOURCE_DIR}/third-party") # Ethos-U driver -set(DRIVER_ETHOSU_SOURCE_DIR "${THIRD_PARTY_ROOT}/ethos-u-core-driver") set(DRIVER_ETHOSU_INCLUDE_DIR "${THIRD_PARTY_ROOT}/ethos-u-core-driver/include") -add_subdirectory( ${DRIVER_ETHOSU_SOURCE_DIR} ) include_directories( ${DRIVER_ETHOSU_INCLUDE_DIR} ) diff --git a/backends/arm/third-party/cmsis b/backends/arm/third-party/cmsis deleted file mode 160000 index a75f01746df..00000000000 --- a/backends/arm/third-party/cmsis +++ /dev/null @@ -1 +0,0 @@ -Subproject commit a75f01746df18bb5b929dfb8dc6c9407fac3a0f3 From 3a5fd4f10cb29ba7a7f14d92e0b53edf72619403 Mon Sep 17 00:00:00 2001 From: Rob Elliott Date: Thu, 5 Oct 2023 19:55:29 +0000 Subject: [PATCH 15/25] renamed lib ethos_u to executorch_delegate_ethos_u Signed-off-by: Rob Elliott --- backends/arm/CMakeLists.txt | 6 +++--- .../patches/0007-Add-delegate-runner-test.patch | 10 ++++------ 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/backends/arm/CMakeLists.txt b/backends/arm/CMakeLists.txt index 2cc5cf94740..d7b61ce92ad 100644 --- a/backends/arm/CMakeLists.txt +++ b/backends/arm/CMakeLists.txt @@ -20,6 +20,6 @@ include(cmake/Dependencies.cmake) set(_arm_baremetal_sources backends/arm/runtime/ArmBackendEthosU.cpp) list(TRANSFORM _arm_baremetal_sources PREPEND "${EXECUTORCH_ROOT}/") -add_library(ethos_u STATIC ${_arm_baremetal_sources}) -target_include_directories(ethos_u PUBLIC ${_common_include_directories}) -target_include_directories(ethos_u PUBLIC ${DRIVER_ETHOSU_INCLUDE_DIR}) +add_library(executorch_delegate_ethos_u STATIC ${_arm_baremetal_sources}) +target_include_directories(executorch_delegate_ethos_u PUBLIC ${_common_include_directories}) +target_include_directories(executorch_delegate_ethos_u PUBLIC ${DRIVER_ETHOSU_INCLUDE_DIR}) diff 
--git a/examples/arm/ethos-u-setup/core_platform/patches/0007-Add-delegate-runner-test.patch b/examples/arm/ethos-u-setup/core_platform/patches/0007-Add-delegate-runner-test.patch index 1f6dd480897..e80da67153f 100644 --- a/examples/arm/ethos-u-setup/core_platform/patches/0007-Add-delegate-runner-test.patch +++ b/examples/arm/ethos-u-setup/core_platform/patches/0007-Add-delegate-runner-test.patch @@ -1,4 +1,4 @@ -From d9d89c7a1d45df7c7aab3142c47f1ff797e531fe Mon Sep 17 00:00:00 2001 +From 8201e36f90fed6e80fea7021ec4bad325d329bae Mon Sep 17 00:00:00 2001 From: Rob Elliott Date: Wed, 4 Oct 2023 13:31:33 +0000 Subject: [PATCH] Add delegate runner test @@ -13,10 +13,10 @@ Signed-off-by: Rob Elliott create mode 100644 applications/executorch_tests/runner_delegate.cpp diff --git a/applications/executorch_tests/CMakeLists.txt b/applications/executorch_tests/CMakeLists.txt -index c95d53e..1118469 100644 +index c95d53e..835f824 100644 --- a/applications/executorch_tests/CMakeLists.txt +++ b/applications/executorch_tests/CMakeLists.txt -@@ -28,22 +28,26 @@ set(ET_DIR_PATH "<..>/executorch" CACHE PATH "Path to ExecuTorch dir") +@@ -28,20 +28,24 @@ set(ET_DIR_PATH "<..>/executorch" CACHE PATH "Path to ExecuTorch dir") set(ET_BUILD_DIR_PATH "${ET_DIR_PATH}/cmake-out" CACHE PATH "Path to ExecuTorch build dir") set(ET_INCLUDE_PATH "${ET_DIR_PATH}/.." 
CACHE PATH "Path to ExecuTorch headers") set(ET_PTE_FILE_PATH "${ET_PTE_FILE_PATH}" CACHE PATH "Path to ExecuTorch model pte") @@ -37,12 +37,10 @@ index c95d53e..1118469 100644 message("**********************") set(LIB_ET_RUNTIME "${ET_BUILD_DIR_PATH}/libexecutorch.a") ++set(LIB_ET_ETHOS "${ET_BUILD_DIR_PATH}/backends/arm/libexecutorch_delegate_ethos_u.a") set(LIB_ET_OP_REGISTRATION "${ET_BUILD_DIR_PATH}/kernels/portable/libportable_ops_lib.a") set(LIB_ET_OP_KERNELS "${ET_BUILD_DIR_PATH}/kernels/portable/libportable_kernels.a") -+set(LIB_ET_ETHOS "${ET_BUILD_DIR_PATH}/backends/arm/libethos_u.a") - add_custom_target( - gen_model_header ALL @@ -54,8 +58,11 @@ add_custom_command( OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/fake_dep From e67662058dfd69e1bd2407dd3f69017d0c2ebe52 Mon Sep 17 00:00:00 2001 From: Rob Elliott Date: Thu, 5 Oct 2023 20:05:52 +0000 Subject: [PATCH 16/25] lintfix Signed-off-by: Rob Elliott --- backends/arm/CMakeLists.txt | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/backends/arm/CMakeLists.txt b/backends/arm/CMakeLists.txt index d7b61ce92ad..4dcf2ff0539 100644 --- a/backends/arm/CMakeLists.txt +++ b/backends/arm/CMakeLists.txt @@ -20,6 +20,18 @@ include(cmake/Dependencies.cmake) set(_arm_baremetal_sources backends/arm/runtime/ArmBackendEthosU.cpp) list(TRANSFORM _arm_baremetal_sources PREPEND "${EXECUTORCH_ROOT}/") -add_library(executorch_delegate_ethos_u STATIC ${_arm_baremetal_sources}) -target_include_directories(executorch_delegate_ethos_u PUBLIC ${_common_include_directories}) -target_include_directories(executorch_delegate_ethos_u PUBLIC ${DRIVER_ETHOSU_INCLUDE_DIR}) + +add_library( + executorch_delegate_ethos_u + STATIC ${_arm_baremetal_sources} +) +target_include_directories( + executorch_delegate_ethos_u + PUBLIC + ${_common_include_directories} +) +target_include_directories( + executorch_delegate_ethos_u + PUBLIC + ${DRIVER_ETHOSU_INCLUDE_DIR} +) From 4b1125eece425677a762e2b0c92d565cf0d34e84 Mon Sep 17 
00:00:00 2001 From: Rob Elliott Date: Thu, 5 Oct 2023 22:05:21 +0000 Subject: [PATCH 17/25] tidied delegate_runner output Signed-off-by: Rob Elliott --- .../0007-Add-delegate-runner-test.patch | 84 +++++++++---------- 1 file changed, 41 insertions(+), 43 deletions(-) diff --git a/examples/arm/ethos-u-setup/core_platform/patches/0007-Add-delegate-runner-test.patch b/examples/arm/ethos-u-setup/core_platform/patches/0007-Add-delegate-runner-test.patch index e80da67153f..c1270961510 100644 --- a/examples/arm/ethos-u-setup/core_platform/patches/0007-Add-delegate-runner-test.patch +++ b/examples/arm/ethos-u-setup/core_platform/patches/0007-Add-delegate-runner-test.patch @@ -1,4 +1,4 @@ -From 8201e36f90fed6e80fea7021ec4bad325d329bae Mon Sep 17 00:00:00 2001 +From 0fe8caba3068da05021232912c069124a81e0d94 Mon Sep 17 00:00:00 2001 From: Rob Elliott Date: Wed, 4 Oct 2023 13:31:33 +0000 Subject: [PATCH] Add delegate runner test @@ -7,9 +7,9 @@ Signed-off-by: Rob Elliott --- applications/executorch_tests/CMakeLists.txt | 27 ++- .../executorch_tests/pte_to_header.py | 11 +- - .../executorch_tests/runner_delegate.cpp | 162 ++++++++++++++++++ + .../executorch_tests/runner_delegate.cpp | 160 ++++++++++++++++++ cmake/toolchain/arm-none-eabi-gcc.cmake | 6 +- - 4 files changed, 197 insertions(+), 9 deletions(-) + 4 files changed, 195 insertions(+), 9 deletions(-) create mode 100644 applications/executorch_tests/runner_delegate.cpp diff --git a/applications/executorch_tests/CMakeLists.txt b/applications/executorch_tests/CMakeLists.txt @@ -116,10 +116,10 @@ index 37d88aa..be3282d 100644 with open(args.pte, "rb") as fr, open( diff --git a/applications/executorch_tests/runner_delegate.cpp b/applications/executorch_tests/runner_delegate.cpp new file mode 100644 -index 0000000..6af6a92 +index 0000000..ff40084 --- /dev/null +++ b/applications/executorch_tests/runner_delegate.cpp -@@ -0,0 +1,162 @@ +@@ -0,0 +1,160 @@ +/* + * SPDX-FileCopyrightText: Copyright 2021-2023 Arm Limited and/or its 
affiliates + * @@ -167,12 +167,12 @@ index 0000000..6af6a92 +void et_pal_init(void) {} + +__ET_NORETURN void et_pal_abort(void) { -+ __builtin_trap(); ++ __builtin_trap(); +} + +et_timestamp_t et_pal_current_ticks(void) { -+ // libc.a - warning: _gettimeofday is not implemented and will always fail -+ return 11223344; ++ // libc.a - warning: _gettimeofday is not implemented and will always fail ++ return 11223344; +} + +/** @@ -186,46 +186,45 @@ index 0000000..6af6a92 + size_t line, + const char* message, + __ET_UNUSED size_t length) { -+ fprintf( -+ stderr, -+ "%c executorch:%s:%zu] %s\n", -+ level, -+ filename, -+ line, -+ message); ++ fprintf( ++ stderr, ++ "%c executorch:%s:%zu] %s\n", ++ level, ++ filename, ++ line, ++ message); +} + +int main() +{ -+ printf("test test test NG ^2 22\n"); -+ printf("main: Initialising runtime\n"); ++ ET_LOG(Info, "Initialising runtime"); + torch::executor::runtime_init(); + + using torch::executor::Result; + using torch::executor::Error; + -+ // Load pte from the global model_pte .pte file loaded into SRAM. ++ // Load pte from the global model_pte .pte file loaded into SRAM. 
+ auto loader = torch::executor::util::BufferDataLoader(model_pte, sizeof(model_pte)); + Result program = torch::executor::Program::load(&loader); + if(!program.ok()) { -+ printf("main: Program loading failed @ 0x%p: 0x%x", model_pte, (int)program.error()); ++ ET_LOG(Info, "Program loading failed @ 0x%p: 0x%x", model_pte, (int)program.error()); + } -+ printf("main: Model buffer loaded, has %u methods\n", program->num_methods()); ++ ET_LOG(Info, "Model buffer loaded, has %u methods", program->num_methods()); + -+ // Find our entrypoint in the .pte program ++ // Find our entrypoint in the .pte program + const char* method_name = nullptr; -+ const auto method_name_result = program->get_method_name(0); -+ ET_CHECK_MSG(method_name_result.ok(), "Program has no methods"); -+ method_name = *method_name_result; -+ printf("main: Found (and will run) method '%s'\n", method_name); ++ const auto method_name_result = program->get_method_name(0); ++ ET_CHECK_MSG(method_name_result.ok(), "Program has no methods"); ++ method_name = *method_name_result; ++ ET_LOG(Info, "Found (and will run) method '%s'", method_name); + -+ // Allocate necessary memories for this method ++ // Allocate necessary memories for this method + Result method_meta = program->method_meta(method_name); + if (!method_meta.ok()) { -+ printf("main: Failed to get method_meta for %s: 0x%x", ++ ET_LOG(Info, "Failed to get method_meta for %s: 0x%x", + method_name, (unsigned int)method_meta.error()); + } -+ ++ + torch::executor::MemoryAllocator method_allocator{ + torch::executor::MemoryAllocator(sizeof(method_allocator_pool), method_allocator_pool)}; + @@ -235,7 +234,7 @@ index 0000000..6af6a92 + + for (size_t id = 0; id < num_memory_planned_buffers; ++id) { + size_t buffer_size = static_cast(method_meta->memory_planned_buffer_size(id).get()); -+ printf("main: Setting up planned buffer %zu, size %zu.\n", id, buffer_size); ++ ET_LOG(Info, "Setting up planned buffer %zu, size %zu.", id, buffer_size); + + 
+ planned_buffers.push_back(std::make_unique(buffer_size)); + planned_spans.push_back({planned_buffers.back().get(), buffer_size}); @@ -249,36 +248,35 @@ index 0000000..6af6a92 + Result method = program->load_method(method_name, &memory_manager); + + if(!method.ok()) { -+ printf("main: Loading of method %s failed with status 0x%x\n", method_name, (int)method.error()); ++ ET_LOG(Info, "Loading of method %s failed with status 0x%x", method_name, (int)method.error()); + } -+ printf("main: Loading of method '%s' succesful\n", method_name); ++ ET_LOG(Info, "Loading of method '%s' successful", method_name); + -+ printf("main: Preparing inputs...\n"); + auto inputs = torch::executor::util::PrepareInputTensors(*method); + -+ printf("main: Starting the model execution...\n"); ++ ET_LOG(Info, "Starting the model execution..."); + Error status = method->execute(); + if(status != Error::Ok){ -+ printf("main: Execution of method %s failed with status 0x%x\n", method_name, (int)status); ++ ET_LOG(Info, "Execution of method %s failed with status 0x%x", method_name, (int)status); + } else { -+ printf("main: Model executed successfully.\n"); ++ ET_LOG(Info, "Model executed successfully."); + } + + // Print the outputs.
+ std::vector outputs(method->outputs_size()); -+ printf("main: %d outputs - ", outputs.size()); ++ ET_LOG(Info, "%d outputs - ", outputs.size()); + status = method->get_outputs(outputs.data(), outputs.size()); + ET_CHECK(status == Error::Ok); + for (size_t i = 0; i < outputs.size(); ++i) -+ { -+ printf("main: Output %d numel %d\n", i, outputs[i].toTensor().numel()); -+ for (size_t j = 0; j < outputs[i].toTensor().numel(); ++j) -+ { -+ printf("main: Output[%d]: %d\n", j, outputs[i].toTensor().const_data_ptr()[j]); -+ } ++ { ++ ET_LOG(Info, "Output %d numel %d", i, outputs[i].toTensor().numel()); ++ for (size_t j = 0; j < outputs[i].toTensor().numel(); ++j) ++ { ++ ET_LOG(Info, " Output[%d]: %d", j, outputs[i].toTensor().const_data_ptr()[j]); ++ } + } + -+ return 0; ++ return 0; +} + + From 7afc5e4b79f37dece3c265567a2beda75201dbef Mon Sep 17 00:00:00 2001 From: Rob Elliott Date: Fri, 6 Oct 2023 09:31:15 +0000 Subject: [PATCH 18/25] Fixed some merge issues * File moved to new path in examples * fix arg handling and md5sum for setup scripts Signed-off-by: Rob Elliott --- .../0001-Improve-rescale-codegen-for-TOSA.patch | 0 examples/arm/run.sh | 5 +++-- examples/arm/setup.sh | 11 ++++++++--- 3 files changed, 11 insertions(+), 5 deletions(-) rename examples/{backend => }/arm/ethos-u-setup/ethos-u-vela/patches/0001-Improve-rescale-codegen-for-TOSA.patch (100%) diff --git a/examples/backend/arm/ethos-u-setup/ethos-u-vela/patches/0001-Improve-rescale-codegen-for-TOSA.patch b/examples/arm/ethos-u-setup/ethos-u-vela/patches/0001-Improve-rescale-codegen-for-TOSA.patch similarity index 100% rename from examples/backend/arm/ethos-u-setup/ethos-u-vela/patches/0001-Improve-rescale-codegen-for-TOSA.patch rename to examples/arm/ethos-u-setup/ethos-u-vela/patches/0001-Improve-rescale-codegen-for-TOSA.patch diff --git a/examples/arm/run.sh b/examples/arm/run.sh index f4ef588c4a5..2d255b6694e 100755 --- a/examples/arm/run.sh +++ b/examples/arm/run.sh @@ -7,7 +7,7 @@ set -eu -if [[ 
"${1}" == "-h" ]]; then +if [[ "${1:-"."}" == "-h" ]]; then echo "Usage: $(basename $0) [path-to-a-scratch-dir] [buck2 binary]" exit 0 fi @@ -18,7 +18,8 @@ fi script_dir=$(cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) # Ethos-u -root_dir=${1:-"$(realpath ${script_dir}/ethos-u-scratch)"} +root_dir=${1:-"${script_dir}/ethos-u-scratch"} +root_dir=$(realpath ${root_dir}) buck2=${2:-"/tmp/buck2"} ethos_u_root_dir="$(cd ${root_dir}/ethos-u && pwd)" ethos_u_build_dir=${ethos_u_root_dir}/core_platform/build diff --git a/examples/arm/setup.sh b/examples/arm/setup.sh index dace442ea8b..c4c644bef4f 100755 --- a/examples/arm/setup.sh +++ b/examples/arm/setup.sh @@ -7,7 +7,7 @@ set -eu -if [[ "${1}" == "-h" ]]; then +if [[ "${1:-'.'}" == "-h" ]]; then echo "Usage: $(basename $0) [path-to-a-scratch-dir]" exit 0 fi @@ -47,18 +47,22 @@ if [[ $(get_cpu_arch) == "x86_64" ]]; then # FVP fvp_url="https://developer.arm.com/-/media/Arm%20Developer%20Community/Downloads/OSS/FVP/Corstone-300/FVP_Corstone_SSE-300_11.22_20_Linux64.tgz?rev=018659bd574f4e7b95fa647e7836ccf4&hash=22A79103C6FA5FFA7AFF3BE0447F3FF9" fvp_model_dir="Linux64_GCC-9.3" + fvp_md5_checksum="98e93b949d0fbac977292d8668d34523" # toochain toolchain_url="https://armkeil.blob.core.windows.net/developer/Files/downloads/gnu/12.3.rel1/binrel/arm-gnu-toolchain-12.3.rel1-x86_64-arm-none-eabi.tar.xz" toolchain_dir="arm-gnu-toolchain-12.3.rel1-x86_64-arm-none-eabi" + toolchain_md5_checksum="00ebb1b70b1f88906c61206457eacb61" elif [[ $(get_cpu_arch) == "aarch64" ]]; then # FVP fvp_url="https://developer.arm.com/-/media/Arm%20Developer%20Community/Downloads/OSS/FVP/Corstone-300/FVP_Corstone_SSE-300_11.22_20_Linux64_armv8l.tgz?rev=9cc6e9a32bb947ca9b21fa162144cb01&hash=7657A4CF27D42E892E3F08D452AAB073" fvp_model_dir="Linux64_armv8l_GCC-9.3" + fvp_md5_checksum="cbbabbe39b07939cff7a3738e1492ef1" # toochain 
toolchain_url="https://armkeil.blob.core.windows.net/developer/Files/downloads/gnu/12.3.rel1/binrel/arm-gnu-toolchain-12.3.rel1-aarch64-arm-none-eabi.tar.xz" toolchain_dir="arm-gnu-toolchain-12.3.rel1-aarch64-arm-none-eabi" + toolchain_md5_checksum="02c9b0d3bb1110575877d8eee1f223f2" else echo "[main] Error: only x86-64 & aarch64 architecture is supported for now!"; exit 1; fi @@ -70,7 +74,8 @@ ethos_u_base_rev="0995223100e3da8011700f58e491f1bf59511e3c" ######## ### Optional user args ######## -root_dir=${1:-"$(realpath ${script_dir}/ethos-u-scratch)"} +root_dir=${1:-"${script_dir}/ethos-u-scratch"} +root_dir=$(realpath ${root_dir}) ######## ### Functions @@ -165,7 +170,7 @@ function setup_tosa_reference_model() { cd reference_model tosa_bin_path=`pwd` echo adding ${tosa_bin_path} to path - echo "export PATH=\${PATH}:${tosa_bin_path}" >> ${update_path_script} + echo "export PATH=\${PATH}:${tosa_bin_path}" >> "${setup_path_script}" cd ../.. echo back at `pwd` } From e340b5c82380e6f6f0c9b2fdf0de4a3bf8bfb545 Mon Sep 17 00:00:00 2001 From: Rob Elliott Date: Fri, 6 Oct 2023 13:11:58 +0000 Subject: [PATCH 19/25] Test fixes for compiler output choice * Introduction of BI_INT to have a small set of pure int tests * added either vela or tosa output from compilation * Fixed tosa e2e tests to use tosa output form * unit tests currently use tosa due to missing vela dependency * vela e2e testing runs with default compile flags and emits to .pte Signed-off-by: Rob Elliott --- backends/arm/arm_backend.py | 17 +++++++++++--- backends/arm/test/test_models.py | 14 +++++++----- backends/arm/test/test_tosa.py | 38 ++++++++++++++++++++++++-------- examples/arm/arm_tosa_e2e.py | 9 ++++++-- 4 files changed, 59 insertions(+), 19 deletions(-) diff --git a/backends/arm/arm_backend.py b/backends/arm/arm_backend.py index da1b70e57d6..ac40042bcd9 100644 --- a/backends/arm/arm_backend.py +++ b/backends/arm/arm_backend.py @@ -315,10 +315,13 @@ def preprocess( # noqa: C901 # if a debug/test build 
capture output files from TOSA stage path = None debug_output = False + output_format = "vela" for spec in compile_spec: if spec.key == "debug_tosa_path": path = spec.value.decode() debug_output = True + if spec.key == "output_format": + output_format = spec.value.decode() # Converted output for this subgraph, serializer needs path early as it emits # const data directly. Path created and data written only in debug builds. @@ -964,8 +967,16 @@ def preprocess( # noqa: C901 if debug_output is True: dbg_tosa_dump(tosa_fb, path) - # Serialize and return the tosa flatbuffer - # fb = bytes(tosa_fb.serialize()) - binary = vela_compile(tosa_fb) + # Serialize and return the program. While we have always produced TOSA + # output as an intermediate, some flows compile to device binaries in + # preprocess and some consume TOSA fb directly. + if output_format == "vela": + # Emit vela_bin_stream format + binary = vela_compile(tosa_fb) + elif output_format == "tosa": + # Emit TOSA flatbuffer + binary = bytes(tosa_fb.serialize()) + else: + raise RuntimeError(f"Unknown format {output_format}") return PreprocessResult(processed_bytes=binary) diff --git a/backends/arm/test/test_models.py b/backends/arm/test/test_models.py index 4172c8a97f2..46a57a601b8 100644 --- a/backends/arm/test/test_models.py +++ b/backends/arm/test/test_models.py @@ -95,6 +95,10 @@ def forward(self, x, y): @register_test class simple_add_broadcast(torch.nn.Module): inputs = { + TosaProfile.BI_INT: ( + torch.ones(10, 1, dtype=torch.int32), + torch.ones(10, 10, dtype=torch.int32), + ), TosaProfile.BI: ( torch.ones(10, 1), torch.ones(10, 10), @@ -127,7 +131,7 @@ def forward(self, x): x = self.fc(x) return x - @register_test + # @register_test class simple_conv2d(torch.nn.Module): inputs = { TosaProfile.BI: ( @@ -151,7 +155,7 @@ def forward(self, x): x = self.conv2d(x) return x - @register_test + # @register_test class block_two_conv2d(torch.nn.Module): inputs = { TosaProfile.BI: (torch.ones(1, 3, 256, 256),), @@ 
-172,7 +176,7 @@ def forward(self, x): x = self.conv2d_2(x) return x - @register_test + # @register_test class simple_depthwise_conv2d(torch.nn.Module): inputs = { TosaProfile.BI: ( @@ -276,7 +280,7 @@ def __init__(self): def forward(self, x): return self.softmax(x) - @register_test + # @register_test class block_conv_norm_activation(torch.nn.Module): inputs = { TosaProfile.BI: (torch.ones(1, 3, 256, 256),), @@ -298,7 +302,7 @@ def forward(self, x): x = self.relu6(x) return x - @register_test + # @register_test class block_bottleneck_residual(torch.nn.Module): # This is the essence of MobileNetV2 # Ref: https://arxiv.org/abs/1801.04381 diff --git a/backends/arm/test/test_tosa.py b/backends/arm/test/test_tosa.py index b3e59658641..9736503e626 100644 --- a/backends/arm/test/test_tosa.py +++ b/backends/arm/test/test_tosa.py @@ -17,6 +17,8 @@ from executorch.exir.backend.backend_api import to_backend +from executorch.exir.backend.compile_spec_schema import CompileSpec + # Config for Capturing the weights, will be moved in the future _CAPTURE_CONFIG = exir.CaptureConfig(enable_aot=True) _EDGE_COMPILE_CONFIG = exir.EdgeCompileConfig( @@ -37,9 +39,12 @@ def test_minimal_MI(self): for test_model in TestList: print(f"Running test {test_model}") model, inputs, outputs = prepare_model_and_ref(test_model, TosaProfile.MI) - - model_edge, exec_prog = export_model(model, inputs, []) - # TODO: check there is a tosa delegate blob in the output + if inputs is None: + print(" Skipping, no inputs for this profile") + continue + model_edge, exec_prog = export_model( + model, inputs, [CompileSpec("output_format", bytes("tosa", "utf8"))] + ) def test_minimal_BI(self): for test_model in TestList: @@ -48,14 +53,31 @@ def test_minimal_BI(self): if inputs is None: print(" Skipping, no inputs for this profile") continue - model_edge, exec_prog = export_model(model, inputs, []) - # TODO: check there is a tosa delegate blob in the output + model_edge, exec_prog = export_model( + model, inputs, 
[CompileSpec("output_format", bytes("tosa", "utf8"))] + ) + + def test_minimal_BI_INT(self): + for test_model in TestList: + print(f"Running test {test_model}") + model, inputs, outputs = prepare_model_and_ref( + test_model, TosaProfile.BI_INT + ) + if inputs is None: + print(" Skipping, no inputs for this profile") + continue + model_edge, exec_prog = export_model( + model, inputs, [CompileSpec("output_format", bytes("tosa", "utf8"))] + ) def prepare_model_and_ref(test_model, profile=TosaProfile.MI): model = TestList[test_model] model_inputs = model.inputs.get(profile) + if model_inputs is None: + return model, model_inputs, None + model.eval() if profile == TosaProfile.BI: # Quantize the model @@ -72,10 +94,8 @@ def prepare_model_and_ref(test_model, profile=TosaProfile.MI): prepared_model(*model.inputs[profile]) model = convert_pt2e(prepared_model) - if model_inputs is not None: - model_outputs = model.forward(*model_inputs) - return model, model_inputs, model_outputs - return model, model_inputs, None + model_outputs = model.forward(*model_inputs) + return model, model_inputs, model_outputs def export_model(model, inputs, compile_spec): diff --git a/examples/arm/arm_tosa_e2e.py b/examples/arm/arm_tosa_e2e.py index 0dba4fa9866..80f1e19a357 100644 --- a/examples/arm/arm_tosa_e2e.py +++ b/examples/arm/arm_tosa_e2e.py @@ -144,8 +144,13 @@ def tosa_run_test(op, profile=TosaProfile.MI): # noqa: C901 TOSA_OUT_PATH = os.path.join(DEBUG_OUTPUT_PATH, op, "tosa", "") os.makedirs(TOSA_OUT_PATH, exist_ok=True) - # Debug flag for compilers - compile_spec = [CompileSpec("debug_tosa_path", bytes(TOSA_OUT_PATH, "utf8"))] + # Debug flags for compilers + # - Emit some debug files into /tmp + # - output_format TOSA for this test (and pure tosa flows) + compile_spec = [ + CompileSpec("debug_tosa_path", bytes(TOSA_OUT_PATH, "utf8")), + CompileSpec("output_format", bytes("tosa", "utf8")), + ] model, inputs, torch_output = prepare_model_and_ref(op, profile) From 
6b7a18a7406a76158cf641bbdc725ad0982fec75 Mon Sep 17 00:00:00 2001 From: Rob Elliott Date: Fri, 6 Oct 2023 16:18:32 +0000 Subject: [PATCH 20/25] review fixes Signed-off-by: Rob Elliott --- backends/arm/CMakeLists.txt | 1 - backends/arm/arm_backend.py | 18 ++--- backends/arm/cmake/arm-none-eabi-gcc.cmake | 90 ---------------------- 3 files changed, 7 insertions(+), 102 deletions(-) delete mode 100644 backends/arm/cmake/arm-none-eabi-gcc.cmake diff --git a/backends/arm/CMakeLists.txt b/backends/arm/CMakeLists.txt index 4dcf2ff0539..2b40086091b 100644 --- a/backends/arm/CMakeLists.txt +++ b/backends/arm/CMakeLists.txt @@ -14,7 +14,6 @@ endif() include(${EXECUTORCH_ROOT}/build/Utils.cmake) set(_common_include_directories ${EXECUTORCH_ROOT}/..) -set(_common_compile_options -Wno-deprecated-declarations) include(cmake/Dependencies.cmake) diff --git a/backends/arm/arm_backend.py b/backends/arm/arm_backend.py index ac40042bcd9..407f233b02c 100644 --- a/backends/arm/arm_backend.py +++ b/backends/arm/arm_backend.py @@ -138,13 +138,11 @@ def dbg_tosa_dump(tosa_fb, path): fb = tosa_fb.serialize() js = tosa_fb.writeJson(filename) - f = open(path + filename, "wb") - f.write(fb) - f.close() + with open(path + filename, "wb") as f: + f.write(fb) - f = open(path + "desc.json", "w") - f.write(js) - f.close() + with open(path + "desc.json", "w") as f: + f.write(js) # Output to Vela with current file-based compilation @@ -153,12 +151,10 @@ def vela_compile(tosa_fb): with tempfile.TemporaryDirectory() as tmpdir: tosaname = "out.tosa" flatbuffer = tosa_fb.serialize() - f = open(os.path.join(tmpdir, tosaname), "wb") - f.write(flatbuffer) - f.close() + with open(os.path.join(tmpdir, tosaname), "wb") as f: + f.write(flatbuffer) # invoke vela - # TODO target ethos-u55-128 vela_command = ( f"cd {tmpdir}; vela --accelerator-config ethos-u55-128 {tosaname}" ) @@ -169,7 +165,7 @@ def vela_compile(tosa_fb): with np.load(np_path, allow_pickle=False) as data: # Emit the NPZ regions as: # - 16 byte 
block name null terminated string (padded to 16 if name shorter) - # - 4 byes of int32 block length and 12 bytes of 0's + # - 4 bytes of int32 block length and 12 bytes of 0's # - block data (padded to 16 byte alignment at end) # Repeat for all blocks for key in data.keys(): diff --git a/backends/arm/cmake/arm-none-eabi-gcc.cmake b/backends/arm/cmake/arm-none-eabi-gcc.cmake deleted file mode 100644 index 0921a529037..00000000000 --- a/backends/arm/cmake/arm-none-eabi-gcc.cmake +++ /dev/null @@ -1,90 +0,0 @@ -# Copyright 2023 Arm Limited and/or its affiliates. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -set(TARGET_CPU "cortex-m55" CACHE STRING "Target CPU") -string(TOLOWER ${TARGET_CPU} CMAKE_SYSTEM_PROCESSOR) - -set(CMAKE_SYSTEM_NAME Generic) -set(CMAKE_C_COMPILER "arm-none-eabi-gcc") -set(CMAKE_CXX_COMPILER "arm-none-eabi-g++") -set(CMAKE_ASM_COMPILER "arm-none-eabi-gcc") -set(CMAKE_LINKER "arm-none-eabi-ld") - -set(CMAKE_EXECUTABLE_SUFFIX ".elf") -set(CMAKE_TRY_COMPILE_TARGET_TYPE STATIC_LIBRARY) -set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) -set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) -set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) - -# Select C/C++ version -set(CMAKE_C_STANDARD 11) -set(CMAKE_CXX_STANDARD 14) - -set(GCC_CPU ${CMAKE_SYSTEM_PROCESSOR}) -string(REPLACE "cortex-m85" "cortex-m55" GCC_CPU ${GCC_CPU}) - -# Compile options -add_compile_options( - -mcpu=${GCC_CPU} - -mthumb - "$<$:-gdwarf-3>" - "$<$:-fno-unwind-tables;-fno-rtti;-fno-exceptions>" - -fdata-sections - -ffunction-sections) - -# Compile defines -add_compile_definitions( - "$<$>:NDEBUG>") - -# Link options -add_link_options( - -mcpu=${GCC_CPU} - -mthumb - --specs=nosys.specs) - -# Set floating point unit -if(CMAKE_SYSTEM_PROCESSOR MATCHES "\\+fp") - set(FLOAT hard) -elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "\\+nofp") - set(FLOAT soft) -elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "cortex-m33(\\+|$)" OR - 
CMAKE_SYSTEM_PROCESSOR MATCHES "cortex-m55(\\+|$)" OR - CMAKE_SYSTEM_PROCESSOR MATCHES "cortex-m85(\\+|$)") - set(FLOAT hard) -elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "cortex-m4(\\+|$)" OR - CMAKE_SYSTEM_PROCESSOR MATCHES "cortex-m7(\\+|$)") - set(FLOAT hard) - set(FPU_CONFIG "fpv4-sp-d16") - add_compile_options(-mfpu=${FPU_CONFIG}) - add_link_options(-mfpu=${FPU_CONFIG}) -else() - set(FLOAT soft) -endif() - -if(FLOAT) - add_compile_options(-mfloat-abi=${FLOAT}) - add_link_options(-mfloat-abi=${FLOAT}) -endif() - -add_link_options(LINKER:--nmagic,--gc-sections) - -# Compilation warnings -add_compile_options( -# -Wall -# -Wextra - -# -Wcast-align -# -Wdouble-promotion -# -Wformat -# -Wmissing-field-initializers -# -Wnull-dereference -# -Wredundant-decls -# -Wshadow -# -Wswitch -# -Wswitch-default -# -Wunused - -Wno-redundant-decls - -Wno-psabi -) From c453e4390bd64c07f726b9c06200e1fe529912fd Mon Sep 17 00:00:00 2001 From: Rob Elliott Date: Fri, 6 Oct 2023 18:52:06 +0000 Subject: [PATCH 21/25] review feedback/improvements Signed-off-by: Rob Elliott --- backends/arm/runtime/ArmBackendEthosU.cpp | 125 +++++++++++----------- 1 file changed, 60 insertions(+), 65 deletions(-) diff --git a/backends/arm/runtime/ArmBackendEthosU.cpp b/backends/arm/runtime/ArmBackendEthosU.cpp index f1da72b6396..85c10fed160 100644 --- a/backends/arm/runtime/ArmBackendEthosU.cpp +++ b/backends/arm/runtime/ArmBackendEthosU.cpp @@ -10,6 +10,7 @@ * ethos-u-core-driver for hardware interaction. */ +#include #include #include @@ -25,7 +26,9 @@ using namespace std; namespace torch { namespace executor { -// TODO we should be in 0x31, not this lower 1MB sRAM +// TODO: we should be in 0x31, to access a full 2MB SRAM +// region and enable maximum program performance up to +// 2MB, rather than 1. 
// SRAM (rwx) : ORIGIN = 0x31000000, LENGTH = 0x00200000 #define CS300_SRAM_LOW ((void*)0x11000000) #define CS300_SRAM_HIGH ((void*)0x110FFFFF) @@ -37,6 +40,7 @@ class ArmBackend final : public PyTorchBackendInterface { ~ArmBackend() = default; virtual bool is_available() const override { + // TODO: revise to use a register check/init function return 1; } @@ -52,16 +56,19 @@ class ArmBackend final : public PyTorchBackendInterface { // Header and footer both 16 bit aligned suggest valid structure and we // wont walk off the end of the chunks and segfault - if (!((int)data == next_mul_16((int)data))) { + if (!((int)data == next_mul_16((uintptr_t)data))) { ET_LOG(Error, "ArmBackend::init: Binary needs to be 16 byte unaligned"); return Error::InvalidProgram; } - if (!((int)foot == next_mul_16((int)foot))) { - ET_LOG(Error, "ArmBackend::init: Program unexpected size"); + if (!((int)foot == next_mul_16((uintptr_t)foot))) { + ET_LOG(Error, "ArmBackend::init: Footer expected to be 16 byte aligned"); + ET_LOG( + Error, + "ArmBackend::init: Program expected to be multiple of 16 bytes"); return Error::InvalidProgram; } if (!(0 == strncmp(data, "vela_bin_stream", 15))) { - ET_LOG(Error, "ArmBackend::init: Binary passed not a vela_bin_stream"); + ET_LOG(Error, "ArmBackend::init: Binary passed is not a vela_bin_stream"); return Error::InvalidProgram; } if (!(0 == strncmp(foot, "vela_end_stream", 15))) { @@ -70,8 +77,15 @@ class ArmBackend final : public PyTorchBackendInterface { } // Verify address range is accessible current expectation is the program // is wholly stored in SRAM + // TODO: expect to improve capabilities here by supporting DRAM storage + // and only moving required data into SRAM. 
if (!(data > CS300_SRAM_LOW || foot < CS300_SRAM_HIGH)) { ET_LOG(Error, "ArmBackend::init: Expected program binary to be in SRAM"); + ET_LOG( + Error, + "ArmBackend::init: program binary range %p:%p", + data, + foot + 16); return Error::InvalidProgram; } @@ -88,7 +102,7 @@ class ArmBackend final : public PyTorchBackendInterface { ET_LOG(Info, "ArmBackend::execute %p", processed->data()); - vela_handles handles; + VelaHandles handles; // Command stream - we know at this point it's aligned char* data = (char*)processed->data(); @@ -110,7 +124,7 @@ class ArmBackend final : public PyTorchBackendInterface { handles.scratch_data_size); // Write inputs into SRAM scratch area defined by Vela - for (int i = 0; i < handles.input_shape.size(); i++) { + for (int i = 0; i < handles.input_shapes.size(); i++) { const char* input_addr = handles.scratch_data + handles.input_offset[i]; // Process input EValue into scratch // TODO: optimise into direct write for compatible, contig layout @@ -122,21 +136,16 @@ class ArmBackend final : public PyTorchBackendInterface { } } -#if 0 - // TMP emit scratch - printf("Scratch after setup:\n"); - for (int i = 0; i < handles.scratch_data_size; i++) { - printf("%02x ", ((char*)handles.scratch_data)[i]); - if (!((i + 1) % 4)) - printf("\n"); - } - printf("\n"); - // END TMP emit scratch -#endif - // Allocate driver handle and synchronously invoke driver ethosu_driver* drv = ethosu_reserve_driver(); + if (drv == NULL) { + ET_LOG(Error, "ArmBackend::execute: ethosu_reserve_driver failed"); + return Error::InvalidState; + } + // Ethos-U low level driver expected order for Ethos U-55, we have + // constant weight data, then scratch (which contains input and output) + // scratch is written above in this function. 
uint64_t bases[2] = { (uint64_t)handles.weight_data, (uint64_t)handles.scratch_data}; size_t bases_size[2] = { @@ -147,7 +156,7 @@ class ArmBackend final : public PyTorchBackendInterface { handles.cmd_data_size, bases, bases_size, - 2, + 2, /* fixed array of pointers to binary interface*/ nullptr); if (result != 0) { @@ -158,22 +167,11 @@ class ArmBackend final : public PyTorchBackendInterface { return Error::InvalidProgram; } -#if 0 - // TMP emit scratch - printf("Scratch after:\n"); - for (int i = 0; i < handles.scratch_data_size; i++) { - printf("%02x ", ((char*)handles.scratch_data)[i]); - if (!((i + 1) % 4)) - printf("\n"); - } - printf("\n"); -#endif - // output data from Ethos U // We only handle one output at the moment const char* output_addr = handles.scratch_data + handles.output_offset[0]; // Outputs are in the index immediately after inputs - int output_index = handles.input_shape.size(); + int output_index = handles.input_shapes.size(); // Process results into EValue storage // TODO: optimise into direct write for compatible, contig layout @@ -200,103 +198,100 @@ class ArmBackend final : public PyTorchBackendInterface { const char* scratch_data; size_t scratch_data_size; vector input_offset; - vector> input_shape; + vector> input_shapes; vector output_offset; - vector> output_shape; - } vela_handles; + vector> output_shapes; + } VelaHandles; typedef struct { char name[16]; - int size; + uint32_t size; char _pad[12]; char data[]; - } vela_bin_block; + } VelaBinBlock; typedef struct { int count; int shape[][4]; - } vela_shapes; + } VelaShapes; typedef struct { int count; int offsets[]; - } vela_offsets; + } VelaOffsets; static int next_mul_16(int n) { return ((n - 1) | 15) + 1; } - int vela_read(char* data, vela_handles* h, int size) const { + int vela_read(char* data, VelaHandles* handles, int size) const { + constexpr const size_t header_size = 16; + // Read header string if (strncmp(data, "vela_bin_stream", 15)) { return 0; } - data += 16; + data += 
header_size; - // Expect one or more 'vela_bin_block's + // Expect one or more 'VelaBinBlock's while (1) { - vela_bin_block* b = (vela_bin_block*)data; - data += 16 + 16 + next_mul_16(b->size); + VelaBinBlock* b = (VelaBinBlock*)data; + data += sizeof(VelaBinBlock) + next_mul_16(b->size); // Exit with success on finding end of stream - if (!strncmp(b->name, "vela_end_stream", 15)) + if (!strncmp(b->name, "vela_end_stream", strlen("vela_end_stream"))) return 1; if (!strncmp(b->name, "cmd_data", strlen("cmd_data"))) { // This magic header confirms a valid command stream in binary - if (strncmp(b->data, "COP1", 4)) + if (strncmp(b->data, "COP1", strlen("COP1"))) return 0; - h->cmd_data = b->data; - h->cmd_data_size = b->size; + handles->cmd_data = b->data; + handles->cmd_data_size = b->size; } if (!strncmp(b->name, "weight_data", strlen("weight_data"))) { - h->weight_data = b->data; - h->weight_data_size = b->size; + handles->weight_data = b->data; + handles->weight_data_size = b->size; } if (!strncmp(b->name, "scratch_data", strlen("scratch_data"))) { - h->scratch_data = b->data; - h->scratch_data_size = b->size; + handles->scratch_data = b->data; + handles->scratch_data_size = b->size; } // capture inputs and outputs - if (!strncmp(b->name, "scratch_data", strlen("scratch_data"))) { - h->scratch_data = b->data; - h->scratch_data_size = b->size; - } - if (!strncmp(b->name, "input_offset", strlen("input_offset"))) { - vela_offsets* offsets = (vela_offsets*)b->data; + VelaOffsets* offsets = (VelaOffsets*)b->data; for (int i = 0; i < offsets->count; i++) { - h->input_offset.push_back(offsets->offsets[i]); + handles->input_offset.push_back(offsets->offsets[i]); } } if (!strncmp(b->name, "output_offset", strlen("output_offset"))) { - vela_offsets* offsets = (vela_offsets*)b->data; + VelaOffsets* offsets = (VelaOffsets*)b->data; for (int i = 0; i < offsets->count; i++) { - h->output_offset.push_back(offsets->offsets[i]); + 
handles->output_offset.push_back(offsets->offsets[i]); } } if (!strncmp(b->name, "input_shape", strlen("input_shape"))) { - vela_shapes* shapes = (vela_shapes*)b->data; + VelaShapes* shapes = (VelaShapes*)b->data; for (int i = 0; i < shapes->count; i++) { vector s = { shapes->shape[i][0], shapes->shape[i][1], shapes->shape[i][2], shapes->shape[i][3]}; - h->input_shape.push_back(s); + handles->input_shapes.push_back(s); } } if (!strncmp(b->name, "output_shape", strlen("output_shape"))) { - vela_shapes* shapes = (vela_shapes*)b->data; + VelaShapes* shapes = (VelaShapes*)b->data; for (int i = 0; i < shapes->count; i++) { vector s = { shapes->shape[i][0], shapes->shape[i][1], shapes->shape[i][2], shapes->shape[i][3]}; - h->output_shape.push_back(s); + handles->output_shapes.push_back(s); } } } From 1917b541c6fcc5034313fbf18d5b6cf9609bc5d7 Mon Sep 17 00:00:00 2001 From: Rob Elliott Date: Fri, 6 Oct 2023 19:43:01 +0000 Subject: [PATCH 22/25] tidy up example scripts Signed-off-by: Rob Elliott --- examples/arm/run.sh | 24 ++++++------------------ examples/arm/setup.sh | 4 ---- 2 files changed, 6 insertions(+), 22 deletions(-) diff --git a/examples/arm/run.sh b/examples/arm/run.sh index 2d255b6694e..10a296242b0 100755 --- a/examples/arm/run.sh +++ b/examples/arm/run.sh @@ -49,7 +49,7 @@ function generate_ethos_pte_file() { cd $et_root_dir python3 examples/backend/arm/arm_ethosu_minimal.py &> /dev/null cd ./ethosout/simple_add/torch/ - local pte_file=$(readlink -f ./delegated.pte) + local pte_file=$(realpath ./delegated.pte) [[ -f ${pte_file} ]] || { echo "Failed to generate a pte file - ${pte_file}"; exit 1; } echo "${pte_file}" } @@ -107,7 +107,9 @@ function build_executorch_runner() { # Execute the executor_runner on FVP Simulator function run_fvp() { - elf=$(find ${ethos_u_build_dir} -name "executor_runner.elf") + [[ $# -ne 1 ]] && { echo "[${FUNCNAME[0]}]" "Expexted elf binary name, got $*"; exit 1; } + local elf_name=${1} + elf=$(find ${ethos_u_build_dir} -name 
"${elf_name}") [[ ! -f $elf ]] && { echo "[${FUNCNAME[0]}]: Unable to find executor_runner elf: ${elf}"; exit 1; } FVP_Corstone_SSE-300_Ethos-U55 \ -C ethosu.num_macs=128 \ @@ -119,20 +121,6 @@ function run_fvp() { echo "[${FUNCNAME[0]} Simulation complete, $?" } -# Execute the executor_runner on FVP Simulator -function run_fvp_delegate() { - elf=$(find ${ethos_u_build_dir} -name "executor_runner_delegate.elf") - [[ ! -f $elf ]] && { echo "[${FUNCNAME[0]}]: Unable to find executor_runner_delegate elf: ${elf}"; exit 1; } - FVP_Corstone_SSE-300_Ethos-U55 \ - -C ethosu.num_macs=128 \ - -C mps3_board.visualisation.disable-visualisation=1 \ - -C mps3_board.telnetterminal0.start_telnet=0 \ - -C mps3_board.uart0.out_file='-' \ - -a "${elf}" \ - --timelimit 5 || true - echo "[${FUNCNAME[0]} Simulation complete, $?" -} - ####### ### Main ####### @@ -169,9 +157,9 @@ build_executorch build_executorch_runner "${pte}" "${pte_delegate}" # run the app -run_fvp +run_fvp executor_runner.elf # run the delegate app -run_fvp_delegate +run_fvp executor_runner_delegate.elf exit 0 diff --git a/examples/arm/setup.sh b/examples/arm/setup.sh index c4c644bef4f..34b20498cd7 100755 --- a/examples/arm/setup.sh +++ b/examples/arm/setup.sh @@ -169,10 +169,7 @@ function setup_tosa_reference_model() { make cd reference_model tosa_bin_path=`pwd` - echo adding ${tosa_bin_path} to path echo "export PATH=\${PATH}:${tosa_bin_path}" >> "${setup_path_script}" - cd ../.. - echo back at `pwd` } function setup_vela() { @@ -187,7 +184,6 @@ function setup_vela() { patch_repo fi pip install . - cd .. 
} ######## From 20bf2ebc3aae8df17645ee8d700466288de086d5 Mon Sep 17 00:00:00 2001 From: Rob Elliott Date: Fri, 6 Oct 2023 20:06:35 +0000 Subject: [PATCH 23/25] further review comments Signed-off-by: Rob Elliott --- backends/arm/arm_backend.py | 2 ++ backends/arm/runtime/ArmBackendEthosU.cpp | 13 ++++++++++--- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/backends/arm/arm_backend.py b/backends/arm/arm_backend.py index 407f233b02c..d51ae3b4a36 100644 --- a/backends/arm/arm_backend.py +++ b/backends/arm/arm_backend.py @@ -184,6 +184,8 @@ def vela_compile(tosa_fb): block_data = input_struct elif key in ("input_offset", "output_offset"): inputs = data[key] + if key == "output_offset" && len(inputs) > 1: + raise RuntimeError("Currently only support one output in Vela ArmBackend") offset_struct = struct.pack("toTensor(); for (int j = 0; j < tensor_in.numel(); j++) { - // TODO: extend beyond 4 byte tensors + // TODO: extend beyond tensors with 4 byte elements input_address[j] = tensor_in.mutable_data_ptr()[j]; } } @@ -173,12 +174,18 @@ class ArmBackend final : public PyTorchBackendInterface { // Outputs are in the index immediately after inputs int output_index = handles.input_shapes.size(); + if (handles.output_shapes.size() != 1) { + ET_LOG( + Error, + "ArmBackend::execute: currently only support one return tensor"); + return Error::InvalidProgram; + } // Process results into EValue storage // TODO: optimise into direct write for compatible, contig layout int* output_address = (int*)output_addr; auto tensor_out = args[output_index]->toTensor(); for (int j = 0; j < tensor_out.numel(); j++) { - // TODO: extend beyond 4 byte tensors + // TODO: extend beyond tensors with 4 byte elements tensor_out.mutable_data_ptr()[j] = output_address[j]; } From 468d6fc4edaa3784d3f5bdcd7b0c3958e8cf97f0 Mon Sep 17 00:00:00 2001 From: Rob Elliott Date: Fri, 6 Oct 2023 20:21:12 +0000 Subject: [PATCH 24/25] revised path for ethosu_minimal Signed-off-by: Rob Elliott --- 
examples/arm/run.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/arm/run.sh b/examples/arm/run.sh index 10a296242b0..3f9bd37d90c 100755 --- a/examples/arm/run.sh +++ b/examples/arm/run.sh @@ -47,7 +47,7 @@ function generate_pte_file() { # Generate the ethos delegate PTE file function generate_ethos_pte_file() { cd $et_root_dir - python3 examples/backend/arm/arm_ethosu_minimal.py &> /dev/null + python3 examples/arm/arm_ethosu_minimal.py &> /dev/null cd ./ethosout/simple_add/torch/ local pte_file=$(realpath ./delegated.pte) [[ -f ${pte_file} ]] || { echo "Failed to generate a pte file - ${pte_file}"; exit 1; } From 709a688c58fef40a748bc5b355761fcb969b5dae Mon Sep 17 00:00:00 2001 From: Rob Elliott Date: Fri, 6 Oct 2023 20:49:26 +0000 Subject: [PATCH 25/25] lintfix Signed-off-by: Rob Elliott --- backends/arm/arm_backend.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/backends/arm/arm_backend.py b/backends/arm/arm_backend.py index d51ae3b4a36..f0f285418c6 100644 --- a/backends/arm/arm_backend.py +++ b/backends/arm/arm_backend.py @@ -184,8 +184,10 @@ def vela_compile(tosa_fb): block_data = input_struct elif key in ("input_offset", "output_offset"): inputs = data[key] - if key == "output_offset" && len(inputs) > 1: - raise RuntimeError("Currently only support one output in Vela ArmBackend") + if key == "output_offset" and len(inputs) > 1: + raise RuntimeError( + "Currently only support one output in Vela ArmBackend" + ) offset_struct = struct.pack("