From 225c11569248d80b6c92b3696a07d510ed928d7b Mon Sep 17 00:00:00 2001 From: Rob Elliott Date: Tue, 3 Oct 2023 22:07:42 +0000 Subject: [PATCH 01/25] Initial Ethos-U runtime backend - Basic runtime targeting Corstone-300 with U55 - cross compile support with a cmake toolchain (Arm baremetal build) - support for a few models AoT -> TOSA -> Vela -> U55 hardware - dependencies on the ethos-u core driver and cmsis (submodules) Signed-off-by: Rob Elliott --- .gitmodules | 6 + CMakeLists.txt | 7 + backends/arm/CMakeLists.txt | 25 ++ backends/arm/arm_backend.py | 69 ++++- backends/arm/cmake/Dependencies.cmake | 12 + backends/arm/cmake/arm-none-eabi-gcc.cmake | 90 +++++++ backends/arm/cmake/build.sh | 53 ++++ backends/arm/cmake/toolchain.sh | 12 + backends/arm/runtime/ArmBackendEthosU.cpp | 261 +++++++++++++++++++ backends/arm/third-party/cmsis | 1 + backends/arm/third-party/ethos-u-core-driver | 1 + 11 files changed, 531 insertions(+), 6 deletions(-) create mode 100644 backends/arm/CMakeLists.txt create mode 100644 backends/arm/cmake/Dependencies.cmake create mode 100644 backends/arm/cmake/arm-none-eabi-gcc.cmake create mode 100755 backends/arm/cmake/build.sh create mode 100755 backends/arm/cmake/toolchain.sh create mode 100644 backends/arm/runtime/ArmBackendEthosU.cpp create mode 160000 backends/arm/third-party/cmsis create mode 160000 backends/arm/third-party/ethos-u-core-driver diff --git a/.gitmodules b/.gitmodules index 05143134bcf..0687c0e8b3f 100644 --- a/.gitmodules +++ b/.gitmodules @@ -43,3 +43,9 @@ [submodule "examples/demo-apps/android/jni/third-party/fbjni"] path = examples/demo-apps/android/jni/third-party/fbjni url = https://github.com/facebookincubator/fbjni.git +[submodule "backends/arm/third-party/ethos-u-core-driver"] + path = backends/arm/third-party/ethos-u-core-driver + url = https://git.mlplatform.org/ml/ethos-u/ethos-u-core-driver.git +[submodule "backends/arm/third-party/cmsis"] + path = backends/arm/third-party/cmsis + url = 
https://github.com/ARM-software/CMSIS_5.git diff --git a/CMakeLists.txt b/CMakeLists.txt index f0281766aab..122d9006b20 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -334,6 +334,13 @@ if(EXECUTORCH_BUILD_QNN) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/examples/qualcomm) endif() +# Build Arm Baremetal backend +option(EXECUTORCH_BUILD_ARM_BAREMETAL + "Build the Arm Baremetal flow for Cortex-M and Ethos-U" OFF) +if(EXECUTORCH_BUILD_ARM_BAREMETAL) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/arm) +endif() + # Add selective build subdirectory if(BUILD_SELECTIVE_BUILD_TEST) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/examples/selective_build) diff --git a/backends/arm/CMakeLists.txt b/backends/arm/CMakeLists.txt new file mode 100644 index 00000000000..2cc5cf94740 --- /dev/null +++ b/backends/arm/CMakeLists.txt @@ -0,0 +1,25 @@ +# Copyright 2023 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +cmake_minimum_required(VERSION 3.19) + +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) + +# Source root directory for executorch. +if(NOT EXECUTORCH_ROOT) + set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../..) +endif() + +include(${EXECUTORCH_ROOT}/build/Utils.cmake) + +set(_common_include_directories ${EXECUTORCH_ROOT}/..) 
+set(_common_compile_options -Wno-deprecated-declarations) + +include(cmake/Dependencies.cmake) + +set(_arm_baremetal_sources backends/arm/runtime/ArmBackendEthosU.cpp) +list(TRANSFORM _arm_baremetal_sources PREPEND "${EXECUTORCH_ROOT}/") +add_library(ethos_u STATIC ${_arm_baremetal_sources}) +target_include_directories(ethos_u PUBLIC ${_common_include_directories}) +target_include_directories(ethos_u PUBLIC ${DRIVER_ETHOSU_INCLUDE_DIR}) diff --git a/backends/arm/arm_backend.py b/backends/arm/arm_backend.py index 6b08d94e3aa..4f08856affa 100644 --- a/backends/arm/arm_backend.py +++ b/backends/arm/arm_backend.py @@ -13,6 +13,7 @@ import operator import os import tempfile +import subprocess from typing import final, List import numpy as np @@ -144,6 +145,64 @@ def dbg_tosa_dump(tosa_fb, path): f.write(js) f.close() +# Output to Vela with current file-based compilation +# WARNING: if this changes, the runtime reader also needs to change +def vela_compile(tosa_fb): + with tempfile.TemporaryDirectory() as tmpdir: + print(f"compiling to Vela in {tmpdir}") + + tosaname = "out.tosa" + flatbuffer = tosa_fb.serialize() + f = open(os.path.join(tmpdir,tosaname), "wb") + f.write(flatbuffer) + f.close() + + # invoke vela + # TODO target ethos-u55-128 + vela_command = f"cd {tmpdir}; vela --accelerator-config ethos-u55-128 {tosaname}" + subprocess.run([vela_command], shell=True, check=True) + + np_path = os.path.join(tmpdir,"output","out_sg0_vela.npz") + blocks = b'' + with np.load(np_path, allow_pickle=False) as data: + # Emit the NPZ regions as: + # - 16 byte block name null terminated string (padded to 16 if name shorter) + # - 4 byes of int32 block length and 12 bytes of 0's + # - block data (padded to 16 byte alignment at end) + # Repeat for all blocks + for key in data.keys(): + block_name = bytes(key,"utf8")[:15] + block_name = block_name + b'\x00'*(16-len(block_name)) + block_data = data[key].tobytes() + # We need the acual unpadded block lengths for hw setup + 
block_length = len(block_data).to_bytes(16, 'little') + # pad block data to multiple of 16 bytes + block_data = block_data + b'\x00'*(15-(len(block_data)-1)%16) + + block = block_name + block_length + block_data + blocks = blocks + block + + # Add a block for scratch, inputs and outputs + # scratch shape is a 1 element array giving us size in bytes + block_name = bytes("scratch_data","utf8")[:15] + block_name = block_name + b'\x00'*(16-len(block_name)) + block_length = data["scratch_shape"][0].item() + print(f"scratch length = {block_length}") + block_length = block_length+(15-(block_length-1)%16) + block_data = b'\x00'*block_length + block_length = block_length.to_bytes(16, 'little') + print(f"lengths {len(block_name)} {len(block_length)} {len(block_data)}") + block = block_name + block_length + block_data + blocks = blocks + block + # TODO are these already in scratch shape? look to be + #input_shape * input_elem_size + #output_shape * output_elem_size + # input_offset and output_offset specify the location these arrays are written from base of scratch + + # return 16 byte VELA bin header + blocks + footer + header = bytes("vela_bin_stream","utf-8") + b'\x00' + footer = bytes("vela_end_stream","utf-8") + b'\x00' + return header + blocks + footer def dbg_fail(node, tosa_fb, path): dbg_tosa_dump(tosa_fb, path) @@ -242,10 +301,6 @@ def preprocess( # noqa: C901 path = spec.value.decode() debug_output = True - # in non debug builds we still pass files to vela - if path is None: - path = tempfile.mkdtemp(prefix="arm_tosa_") - # Converted output for this subgraph, serializer needs path early as it emits # const data directly. Path created and data written only in debug builds. 
tosa_fb = ts.TosaSerializer(path) @@ -891,5 +946,7 @@ def preprocess( # noqa: C901 dbg_tosa_dump(tosa_fb, path) # Serialize and return the tosa flatbuffer - fb = tosa_fb.serialize() - return PreprocessResult(processed_bytes=bytes(fb)) + # fb = bytes(tosa_fb.serialize()) + binary = vela_compile(tosa_fb) + + return PreprocessResult(processed_bytes=binary) diff --git a/backends/arm/cmake/Dependencies.cmake b/backends/arm/cmake/Dependencies.cmake new file mode 100644 index 00000000000..27a587176bb --- /dev/null +++ b/backends/arm/cmake/Dependencies.cmake @@ -0,0 +1,12 @@ +# Copyright 2023 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +set(THIRD_PARTY_ROOT "${CMAKE_CURRENT_SOURCE_DIR}/third-party") + +# Ethos-U driver +set(DRIVER_ETHOSU_SOURCE_DIR "${THIRD_PARTY_ROOT}/ethos-u-core-driver") +set(DRIVER_ETHOSU_INCLUDE_DIR "${THIRD_PARTY_ROOT}/ethos-u-core-driver/include") +add_subdirectory( ${DRIVER_ETHOSU_SOURCE_DIR} ) +include_directories( ${DRIVER_ETHOSU_INCLUDE_DIR} ) diff --git a/backends/arm/cmake/arm-none-eabi-gcc.cmake b/backends/arm/cmake/arm-none-eabi-gcc.cmake new file mode 100644 index 00000000000..d70f79361cd --- /dev/null +++ b/backends/arm/cmake/arm-none-eabi-gcc.cmake @@ -0,0 +1,90 @@ +# Copyright 2023 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +set(TARGET_CPU "cortex-m4" CACHE STRING "Target CPU") +string(TOLOWER ${TARGET_CPU} CMAKE_SYSTEM_PROCESSOR) + +set(CMAKE_SYSTEM_NAME Generic) +set(CMAKE_C_COMPILER "arm-none-eabi-gcc") +set(CMAKE_CXX_COMPILER "arm-none-eabi-g++") +set(CMAKE_ASM_COMPILER "arm-none-eabi-gcc") +set(CMAKE_LINKER "arm-none-eabi-ld") + +set(CMAKE_EXECUTABLE_SUFFIX ".elf") +set(CMAKE_TRY_COMPILE_TARGET_TYPE STATIC_LIBRARY) +set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) +set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) +set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) + +# Select C/C++ version +set(CMAKE_C_STANDARD 11) +set(CMAKE_CXX_STANDARD 14) + +set(GCC_CPU ${CMAKE_SYSTEM_PROCESSOR}) +string(REPLACE "cortex-m85" "cortex-m55" GCC_CPU ${GCC_CPU}) + +# Compile options +add_compile_options( + -mcpu=${GCC_CPU} + -mthumb + "$<$:-gdwarf-3>" + "$<$:-fno-unwind-tables;-fno-rtti;-fno-exceptions>" + -fdata-sections + -ffunction-sections) + +# Compile defines +add_compile_definitions( + "$<$>:NDEBUG>") + +# Link options +add_link_options( + -mcpu=${GCC_CPU} + -mthumb + --specs=nosys.specs) + +# Set floating point unit +if(CMAKE_SYSTEM_PROCESSOR MATCHES "\\+fp") + set(FLOAT hard) +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "\\+nofp") + set(FLOAT soft) +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "cortex-m33(\\+|$)" OR + CMAKE_SYSTEM_PROCESSOR MATCHES "cortex-m55(\\+|$)" OR + CMAKE_SYSTEM_PROCESSOR MATCHES "cortex-m85(\\+|$)") + set(FLOAT hard) +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "cortex-m4(\\+|$)" OR + CMAKE_SYSTEM_PROCESSOR MATCHES "cortex-m7(\\+|$)") + set(FLOAT hard) + set(FPU_CONFIG "fpv4-sp-d16") + add_compile_options(-mfpu=${FPU_CONFIG}) + add_link_options(-mfpu=${FPU_CONFIG}) +else() + set(FLOAT soft) +endif() + +if (FLOAT) + add_compile_options(-mfloat-abi=${FLOAT}) + add_link_options(-mfloat-abi=${FLOAT}) +endif() + +add_link_options(LINKER:--nmagic,--gc-sections) + +# Compilation warnings +add_compile_options( +# -Wall +# -Wextra + +# -Wcast-align +# -Wdouble-promotion +# -Wformat +# 
-Wmissing-field-initializers +# -Wnull-dereference +# -Wredundant-decls +# -Wshadow +# -Wswitch +# -Wswitch-default +# -Wunused + -Wno-redundant-decls + -Wno-psabi +) diff --git a/backends/arm/cmake/build.sh b/backends/arm/cmake/build.sh new file mode 100755 index 00000000000..0dbb8cf2177 --- /dev/null +++ b/backends/arm/cmake/build.sh @@ -0,0 +1,53 @@ +#!/bin/bash +# Copyright 2023 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +set -e + +# +# Setup toolchain +# +BASEDIR=`realpath $(dirname "$0")` +echo "building using build.sh in $BASEDIR" + +ARCH=$(uname -i) +GCCPATH=${BASEDIR}/arm-gnu-toolchain-12.3.rel1-${ARCH}-arm-none-eabi/bin/ + +echo $GCCPATH +if test -d "${GCCPATH}"; then + echo Using exising compiler ${GCCPATH} +else + pushd ${BASEDIR}/ + ./toolchain.sh + popd +fi +export PATH=${PATH}:${GCCPATH} + +echo building with `arm-none-eabi-gcc -v 2>&1 | grep "^gcc"` + + +# +# Prepare and run clean build +# +rm -rf buck-out/ build/lib/ cmake-out/ +rm -rf cmake-corstone +mkdir cmake-corstone +cd cmake-corstone + +#cmake -DBUCK2=buck2 .. + +#cmake --toolchain backends/arm/cmake/arm-none-eabi-gcc.cmake .. +cmake -DFLATC_EXECUTABLE=flatc \ + -DEXECUTORCH_BUILD_XNNPACK=OFF \ + -DEXECUTORCH_BUILD_HOST_TARGETS=OFF \ + -DEXECUTORCH_BUILD_ARM_BAREMETAL=ON \ + -DCMAKE_SYSTEM_PROCESSOR=cortex-m55+nodsp+nofp \ + -DETHOSU_TARGET_NPU_CONFIG=ethos-u55-128 \ + --toolchain backends/arm/cmake/arm-none-eabi-gcc.cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -DEXECUTORCH_ENABLE_LOGGING_RELEASE_MODE=ON \ + .. + +cd .. 
+cmake --build cmake-corstone -j9 --target ethos_u ethosu_core_driver executorch portable_ops_lib portable_kernels diff --git a/backends/arm/cmake/toolchain.sh b/backends/arm/cmake/toolchain.sh new file mode 100755 index 00000000000..92188ee982d --- /dev/null +++ b/backends/arm/cmake/toolchain.sh @@ -0,0 +1,12 @@ +#!/bin/bash +# Copyright 2023 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +set -e + +# Cross compiler for Arm baremetal (e.g. Corestone-300 FVP or silcon) +ARCH=$(uname -i) +curl -o gcc.tar.xz https://armkeil.blob.core.windows.net/developer/Files/downloads/gnu/12.3.rel1/binrel/arm-gnu-toolchain-12.3.rel1-${ARCH}-arm-none-eabi.tar.xz +tar xf gcc.tar.xz +export PATH=${PATH}:`(cd arm-gnu-toolchain-12.3.rel1-aarch64-arm-none-eabi/bin/; pwd)` diff --git a/backends/arm/runtime/ArmBackendEthosU.cpp b/backends/arm/runtime/ArmBackendEthosU.cpp new file mode 100644 index 00000000000..e5d68e81156 --- /dev/null +++ b/backends/arm/runtime/ArmBackendEthosU.cpp @@ -0,0 +1,261 @@ +/* + * Copyright 2023 Arm Limited and/or its affiliates. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +/* + * Arm backend for Ethos-U baremetal driver stack, this relies on the + * ethos-u-core-driver for hardware interaction. 
+ */ + +#include + +#include +#include +#include + +#include +#include + +namespace torch { +namespace executor { + +// TODO we should be in 0x31, not this lower 1MB sRAM +// SRAM (rwx) : ORIGIN = 0x31000000, LENGTH = 0x00200000 +#define CS300_SRAM_LOW ((void*)0x11000000) +#define CS300_SRAM_HIGH ((void*)0x110FFFFF) + +class ArmBackend final : public PyTorchBackendInterface { + +public: + ArmBackend() { + printf("Constructing ARM Backend\n"); + } + + ~ArmBackend() = default; + + virtual bool is_available() const override { + return 1; + } + + Result init( + BackendInitContext& context, + FreeableBuffer* processed, + ArrayRef compile_specs) const override { + + ET_LOG(Info, "ArmBackend::init %p", processed->data() ); + + char *data = (char*)processed->data(); + size_t size = processed->size(); + char *foot = data + size - 16; + + // Header and footer both 16 bit aligned suggest valid structure and we + // wont walk off the end of the chunks and segfault + if( !((int)data == next_mul_16((int)data)) ) + { + ET_LOG(Error, "ArmBackend::init header unaligned"); + return Error::InvalidProgram; + } + if( !((int)foot == next_mul_16((int)foot)) ) + { + ET_LOG(Error, "ArmBackend::init header unaligned"); + return Error::InvalidProgram; + } + if( !(0 == strncmp( data, "vela_bin_stream", 15 )) ) + { + ET_LOG(Error, "ArmBackend::init header unaligned"); + return Error::InvalidProgram; + } + if( !(0 == strncmp( foot, "vela_end_stream", 15 )) ) + { + ET_LOG(Error, "ArmBackend::init header unaligned"); + return Error::InvalidProgram; + } + // Verify address range is accessible current expectation is the program + // is wholly stored in SRAM + if( !(data > CS300_SRAM_LOW || foot < CS300_SRAM_HIGH) ); + + // Return the same buffer we were passed - this data will be + // executed directly + return processed; + } + + Error execute( + BackendExecutionContext& context, + DelegateHandle* input_handle, + EValue** args) const override { + + FreeableBuffer* processed = 
(FreeableBuffer*)input_handle; + + ET_LOG(Info, "ArmBackend::execute %p", processed->data() ); + + vela_handles handles = { 0, 0, 0, 0, 0, 0 }; + + // Command stream - we know at this point it's aligned + char *data = (char*)processed->data(); + + // Read key sections from the vela_bin_stream + if( !this->vela_read( data, &handles, processed->size() ) ) + { + ET_LOG(Error, "ArmBackend::vela_read: error, invalid binary layout" ); + return Error::InvalidProgram; + } + + ET_LOG(Debug, "ArmBackend::execute: Running program data:\n cmd %p %d\n weight %p %d\n scratch %p %d\n", + handles.cmd_data, handles.cmd_data_size, + handles.weight_data, handles.weight_data_size, + handles.scratch_data, handles.scratch_data_size ); + + // TMP emit scratch + printf("Scratch before:\n"); + for( int i=0; itoTensor(); + for(int j=0; j()[j] = output_address[j]; + } + + return Error::Ok; + } + + void destroy(DelegateHandle* handle) const override { + return; + } + +private: + typedef struct { + const char *cmd_data; size_t cmd_data_size; + const char *weight_data; size_t weight_data_size; + const char *scratch_data; size_t scratch_data_size; + size_t input_offset; size_t input_data_shape[3]; + size_t output_offset; size_t output_data_shape[3]; + } vela_handles; + + typedef struct { + char name[16]; + int size; char _pad[12]; + char data[]; + } vela_bin_block; + + static int next_mul_16( int n ) { + return ((n-1)|15)+1; + } + + int vela_read(char* data, vela_handles *h, int size ) const { + + // Read header string + if( strncmp( data, "vela_bin_stream", 15 ) ) + { + return 0; + } + data += 16; + + // Expect one or more 'vela_bin_block's + while( 1 ) + { + vela_bin_block *b = (vela_bin_block*)data; + data += 16 + 16 + next_mul_16(b->size); + + // Exit with success on finding end of stream + if( !strncmp( b->name, "vela_end_stream", 15 ) ) return 1; + + if( !strncmp( b->name, "cmd_data", strlen("cmd_data")) ) + { + // This magic header confirms a valid command stream in binary + if( strncmp( 
b->data, "COP1", 4 ) ) return 0; + h->cmd_data = b->data; + h->cmd_data_size = b->size; + } + if( !strncmp( b->name, "weight_data", strlen("weight_data")) ) + { + h->weight_data = b->data;; + h->weight_data_size = b->size; + } + if( !strncmp( b->name, "scratch_data", strlen("scratch_data")) ) + { + h->scratch_data = b->data; + h->scratch_data_size = b->size; + } + + // capture inputs and outputs + if( !strncmp( b->name, "scratch_data", strlen("scratch_data")) ) + { + h->scratch_data = b->data; + h->scratch_data_size = b->size; + } + if( !strncmp( b->name, "input_offset", strlen("input_offset")) ) + { + h->input_offset = ((int*)b->data)[0]; + } + if( !strncmp( b->name, "output_offset", strlen("output_offset")) ) + { + h->output_offset = ((int*)b->data)[0]; + } + if( !strncmp( b->name, "input_shape", strlen("input_shape")) ) + { + h->input_data_shape[0] = ((int*)b->data)[0]; + h->input_data_shape[0] = ((int*)b->data)[1]; + h->input_data_shape[0] = ((int*)b->data)[2]; + + } + if( !strncmp( b->name, "output_shape", strlen("output_shape")) ) + { + h->output_data_shape[0] = ((int*)b->data)[0]; + h->output_data_shape[0] = ((int*)b->data)[1]; + h->output_data_shape[0] = ((int*)b->data)[2]; + } + } + } + +}; + +namespace { +auto backend = ArmBackend(); +Backend backend_id{"ArmBackend", &backend}; +static auto registered = register_backend(backend_id); +} + +} // namespace executor +} // namespace torch diff --git a/backends/arm/third-party/cmsis b/backends/arm/third-party/cmsis new file mode 160000 index 00000000000..a75f01746df --- /dev/null +++ b/backends/arm/third-party/cmsis @@ -0,0 +1 @@ +Subproject commit a75f01746df18bb5b929dfb8dc6c9407fac3a0f3 diff --git a/backends/arm/third-party/ethos-u-core-driver b/backends/arm/third-party/ethos-u-core-driver new file mode 160000 index 00000000000..90f9df900ac --- /dev/null +++ b/backends/arm/third-party/ethos-u-core-driver @@ -0,0 +1 @@ +Subproject commit 90f9df900acdc0718ecd2dfdc53780664758dec5 From 
a7aa8489d90856fd016bf7b7f1c9bc5c4ecf2dae Mon Sep 17 00:00:00 2001 From: Rob Elliott Date: Wed, 4 Oct 2023 08:35:54 +0000 Subject: [PATCH 02/25] Fixed error messages on runtime init Signed-off-by: Rob Elliott --- backends/arm/runtime/ArmBackendEthosU.cpp | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/backends/arm/runtime/ArmBackendEthosU.cpp b/backends/arm/runtime/ArmBackendEthosU.cpp index e5d68e81156..eccacd7c1cc 100644 --- a/backends/arm/runtime/ArmBackendEthosU.cpp +++ b/backends/arm/runtime/ArmBackendEthosU.cpp @@ -55,27 +55,31 @@ class ArmBackend final : public PyTorchBackendInterface { // wont walk off the end of the chunks and segfault if( !((int)data == next_mul_16((int)data)) ) { - ET_LOG(Error, "ArmBackend::init header unaligned"); + ET_LOG(Error, "ArmBackend::init: Binary needs to be 16 byte unaligned"); return Error::InvalidProgram; } if( !((int)foot == next_mul_16((int)foot)) ) { - ET_LOG(Error, "ArmBackend::init header unaligned"); + ET_LOG(Error, "ArmBackend::init: Program unexpected size"); return Error::InvalidProgram; } if( !(0 == strncmp( data, "vela_bin_stream", 15 )) ) { - ET_LOG(Error, "ArmBackend::init header unaligned"); + ET_LOG(Error, "ArmBackend::init: Binary passed not a vela_bin_stream"); return Error::InvalidProgram; } if( !(0 == strncmp( foot, "vela_end_stream", 15 )) ) { - ET_LOG(Error, "ArmBackend::init header unaligned"); + ET_LOG(Error, "ArmBackend::init: Binary passed missing vela_end_stream"); return Error::InvalidProgram; } // Verify address range is accessible current expectation is the program // is wholly stored in SRAM - if( !(data > CS300_SRAM_LOW || foot < CS300_SRAM_HIGH) ); + if( !(data > CS300_SRAM_LOW || foot < CS300_SRAM_HIGH) ) + { + ET_LOG(Error, "ArmBackend::init: Expected program binary to be in SRAM"); + return Error::InvalidProgram; + } // Return the same buffer we were passed - this data will be // executed directly From f95feade8fb9e6b089120b68bc4f87f36265933c Mon Sep 17 
00:00:00 2001 From: Rob Elliott Date: Wed, 4 Oct 2023 08:49:42 +0000 Subject: [PATCH 03/25] lintrunner cleanup Signed-off-by: Rob Elliott --- backends/arm/arm_backend.py | 44 +- backends/arm/cmake/arm-none-eabi-gcc.cmake | 2 +- backends/arm/runtime/ArmBackendEthosU.cpp | 446 ++++++++++----------- 3 files changed, 246 insertions(+), 246 deletions(-) diff --git a/backends/arm/arm_backend.py b/backends/arm/arm_backend.py index 4f08856affa..8185718f45e 100644 --- a/backends/arm/arm_backend.py +++ b/backends/arm/arm_backend.py @@ -12,8 +12,8 @@ import logging import operator import os -import tempfile import subprocess +import tempfile from typing import final, List import numpy as np @@ -145,6 +145,7 @@ def dbg_tosa_dump(tosa_fb, path): f.write(js) f.close() + # Output to Vela with current file-based compilation # WARNING: if this changes, the runtime reader also needs to change def vela_compile(tosa_fb): @@ -153,17 +154,19 @@ def vela_compile(tosa_fb): tosaname = "out.tosa" flatbuffer = tosa_fb.serialize() - f = open(os.path.join(tmpdir,tosaname), "wb") + f = open(os.path.join(tmpdir, tosaname), "wb") f.write(flatbuffer) f.close() # invoke vela # TODO target ethos-u55-128 - vela_command = f"cd {tmpdir}; vela --accelerator-config ethos-u55-128 {tosaname}" + vela_command = ( + f"cd {tmpdir}; vela --accelerator-config ethos-u55-128 {tosaname}" + ) subprocess.run([vela_command], shell=True, check=True) - np_path = os.path.join(tmpdir,"output","out_sg0_vela.npz") - blocks = b'' + np_path = os.path.join(tmpdir, "output", "out_sg0_vela.npz") + blocks = b"" with np.load(np_path, allow_pickle=False) as data: # Emit the NPZ regions as: # - 16 byte block name null terminated string (padded to 16 if name shorter) @@ -171,39 +174,40 @@ def vela_compile(tosa_fb): # - block data (padded to 16 byte alignment at end) # Repeat for all blocks for key in data.keys(): - block_name = bytes(key,"utf8")[:15] - block_name = block_name + b'\x00'*(16-len(block_name)) - block_data = 
data[key].tobytes() + block_name = bytes(key, "utf8")[:15] + block_name = block_name + b"\x00" * (16 - len(block_name)) + block_data = data[key].tobytes() # We need the acual unpadded block lengths for hw setup - block_length = len(block_data).to_bytes(16, 'little') + block_length = len(block_data).to_bytes(16, "little") # pad block data to multiple of 16 bytes - block_data = block_data + b'\x00'*(15-(len(block_data)-1)%16) + block_data = block_data + b"\x00" * (15 - (len(block_data) - 1) % 16) block = block_name + block_length + block_data blocks = blocks + block # Add a block for scratch, inputs and outputs # scratch shape is a 1 element array giving us size in bytes - block_name = bytes("scratch_data","utf8")[:15] - block_name = block_name + b'\x00'*(16-len(block_name)) + block_name = bytes("scratch_data", "utf8")[:15] + block_name = block_name + b"\x00" * (16 - len(block_name)) block_length = data["scratch_shape"][0].item() print(f"scratch length = {block_length}") - block_length = block_length+(15-(block_length-1)%16) - block_data = b'\x00'*block_length - block_length = block_length.to_bytes(16, 'little') + block_length = block_length + (15 - (block_length - 1) % 16) + block_data = b"\x00" * block_length + block_length = block_length.to_bytes(16, "little") print(f"lengths {len(block_name)} {len(block_length)} {len(block_data)}") block = block_name + block_length + block_data blocks = blocks + block # TODO are these already in scratch shape? 
look to be - #input_shape * input_elem_size - #output_shape * output_elem_size + # input_shape * input_elem_size + # output_shape * output_elem_size # input_offset and output_offset specify the location these arrays are written from base of scratch # return 16 byte VELA bin header + blocks + footer - header = bytes("vela_bin_stream","utf-8") + b'\x00' - footer = bytes("vela_end_stream","utf-8") + b'\x00' + header = bytes("vela_bin_stream", "utf-8") + b"\x00" + footer = bytes("vela_end_stream", "utf-8") + b"\x00" return header + blocks + footer + def dbg_fail(node, tosa_fb, path): dbg_tosa_dump(tosa_fb, path) logger.warn("Internal error due to poorly handled node:") @@ -948,5 +952,5 @@ def preprocess( # noqa: C901 # Serialize and return the tosa flatbuffer # fb = bytes(tosa_fb.serialize()) binary = vela_compile(tosa_fb) - + return PreprocessResult(processed_bytes=binary) diff --git a/backends/arm/cmake/arm-none-eabi-gcc.cmake b/backends/arm/cmake/arm-none-eabi-gcc.cmake index d70f79361cd..10bc858ed46 100644 --- a/backends/arm/cmake/arm-none-eabi-gcc.cmake +++ b/backends/arm/cmake/arm-none-eabi-gcc.cmake @@ -63,7 +63,7 @@ else() set(FLOAT soft) endif() -if (FLOAT) +if(FLOAT) add_compile_options(-mfloat-abi=${FLOAT}) add_link_options(-mfloat-abi=${FLOAT}) endif() diff --git a/backends/arm/runtime/ArmBackendEthosU.cpp b/backends/arm/runtime/ArmBackendEthosU.cpp index eccacd7c1cc..4c052ea60c7 100644 --- a/backends/arm/runtime/ArmBackendEthosU.cpp +++ b/backends/arm/runtime/ArmBackendEthosU.cpp @@ -28,238 +28,234 @@ namespace executor { #define CS300_SRAM_HIGH ((void*)0x110FFFFF) class ArmBackend final : public PyTorchBackendInterface { - -public: - ArmBackend() { - printf("Constructing ARM Backend\n"); - } - - ~ArmBackend() = default; - - virtual bool is_available() const override { - return 1; - } - - Result init( - BackendInitContext& context, - FreeableBuffer* processed, - ArrayRef compile_specs) const override { - - ET_LOG(Info, "ArmBackend::init %p", 
processed->data() ); - - char *data = (char*)processed->data(); - size_t size = processed->size(); - char *foot = data + size - 16; - - // Header and footer both 16 bit aligned suggest valid structure and we - // wont walk off the end of the chunks and segfault - if( !((int)data == next_mul_16((int)data)) ) - { - ET_LOG(Error, "ArmBackend::init: Binary needs to be 16 byte unaligned"); - return Error::InvalidProgram; - } - if( !((int)foot == next_mul_16((int)foot)) ) - { - ET_LOG(Error, "ArmBackend::init: Program unexpected size"); - return Error::InvalidProgram; - } - if( !(0 == strncmp( data, "vela_bin_stream", 15 )) ) - { - ET_LOG(Error, "ArmBackend::init: Binary passed not a vela_bin_stream"); - return Error::InvalidProgram; - } - if( !(0 == strncmp( foot, "vela_end_stream", 15 )) ) - { - ET_LOG(Error, "ArmBackend::init: Binary passed missing vela_end_stream"); - return Error::InvalidProgram; - } - // Verify address range is accessible current expectation is the program - // is wholly stored in SRAM - if( !(data > CS300_SRAM_LOW || foot < CS300_SRAM_HIGH) ) - { - ET_LOG(Error, "ArmBackend::init: Expected program binary to be in SRAM"); - return Error::InvalidProgram; - } - - // Return the same buffer we were passed - this data will be - // executed directly - return processed; - } - - Error execute( - BackendExecutionContext& context, - DelegateHandle* input_handle, - EValue** args) const override { - - FreeableBuffer* processed = (FreeableBuffer*)input_handle; - - ET_LOG(Info, "ArmBackend::execute %p", processed->data() ); - - vela_handles handles = { 0, 0, 0, 0, 0, 0 }; - - // Command stream - we know at this point it's aligned - char *data = (char*)processed->data(); - - // Read key sections from the vela_bin_stream - if( !this->vela_read( data, &handles, processed->size() ) ) - { - ET_LOG(Error, "ArmBackend::vela_read: error, invalid binary layout" ); - return Error::InvalidProgram; - } - - ET_LOG(Debug, "ArmBackend::execute: Running program data:\n cmd %p 
%d\n weight %p %d\n scratch %p %d\n", - handles.cmd_data, handles.cmd_data_size, - handles.weight_data, handles.weight_data_size, - handles.scratch_data, handles.scratch_data_size ); - - // TMP emit scratch - printf("Scratch before:\n"); - for( int i=0; i init( + BackendInitContext& context, + FreeableBuffer* processed, + ArrayRef compile_specs) const override { + ET_LOG(Info, "ArmBackend::init %p", processed->data()); + + char* data = (char*)processed->data(); + size_t size = processed->size(); + char* foot = data + size - 16; + + // Header and footer both 16 bit aligned suggest valid structure and we + // wont walk off the end of the chunks and segfault + if (!((int)data == next_mul_16((int)data))) { + ET_LOG(Error, "ArmBackend::init: Binary needs to be 16 byte unaligned"); + return Error::InvalidProgram; + } + if (!((int)foot == next_mul_16((int)foot))) { + ET_LOG(Error, "ArmBackend::init: Program unexpected size"); + return Error::InvalidProgram; + } + if (!(0 == strncmp(data, "vela_bin_stream", 15))) { + ET_LOG(Error, "ArmBackend::init: Binary passed not a vela_bin_stream"); + return Error::InvalidProgram; + } + if (!(0 == strncmp(foot, "vela_end_stream", 15))) { + ET_LOG(Error, "ArmBackend::init: Binary passed missing vela_end_stream"); + return Error::InvalidProgram; + } + // Verify address range is accessible current expectation is the program + // is wholly stored in SRAM + if (!(data > CS300_SRAM_LOW || foot < CS300_SRAM_HIGH)) { + ET_LOG(Error, "ArmBackend::init: Expected program binary to be in SRAM"); + return Error::InvalidProgram; + } + + // Return the same buffer we were passed - this data will be + // executed directly + return processed; + } + + Error execute( + BackendExecutionContext& context, + DelegateHandle* input_handle, + EValue** args) const override { + FreeableBuffer* processed = (FreeableBuffer*)input_handle; + + ET_LOG(Info, "ArmBackend::execute %p", processed->data()); + + vela_handles handles = {0, 0, 0, 0, 0, 0}; + + // Command 
stream - we know at this point it's aligned + char* data = (char*)processed->data(); + + // Read key sections from the vela_bin_stream + if (!this->vela_read(data, &handles, processed->size())) { + ET_LOG(Error, "ArmBackend::vela_read: error, invalid binary layout"); + return Error::InvalidProgram; + } + + ET_LOG( + Debug, + "ArmBackend::execute: Running program data:\n cmd %p %d\n weight %p %d\n scratch %p %d\n", + handles.cmd_data, + handles.cmd_data_size, + handles.weight_data, + handles.weight_data_size, + handles.scratch_data, + handles.scratch_data_size); + + // TMP emit scratch + printf("Scratch before:\n"); + for (int i = 0; i < handles.scratch_data_size; i++) { + if (i % 4 == 0) + ((char*)handles.scratch_data)[i] = 1; + printf("%02x ", ((char*)handles.scratch_data)[i]); + if (!((i + 1) % 4)) printf("\n"); - - // Process results into EValue storage - // TODO: optimise into direct write for compatible layouts - // TODO: get num in/out and layout? - int *output_address = (int*)(handles.scratch_data + handles.output_offset); - auto tensor = args[1]->toTensor(); - for(int j=0; j()[j] = output_address[j]; - } - - return Error::Ok; - } - - void destroy(DelegateHandle* handle) const override { - return; - } - -private: - typedef struct { - const char *cmd_data; size_t cmd_data_size; - const char *weight_data; size_t weight_data_size; - const char *scratch_data; size_t scratch_data_size; - size_t input_offset; size_t input_data_shape[3]; - size_t output_offset; size_t output_data_shape[3]; - } vela_handles; - - typedef struct { - char name[16]; - int size; char _pad[12]; - char data[]; - } vela_bin_block; - - static int next_mul_16( int n ) { - return ((n-1)|15)+1; - } - - int vela_read(char* data, vela_handles *h, int size ) const { - - // Read header string - if( strncmp( data, "vela_bin_stream", 15 ) ) - { - return 0; - } - data += 16; - - // Expect one or more 'vela_bin_block's - while( 1 ) - { - vela_bin_block *b = (vela_bin_block*)data; - data += 16 + 16 + 
next_mul_16(b->size); - - // Exit with success on finding end of stream - if( !strncmp( b->name, "vela_end_stream", 15 ) ) return 1; - - if( !strncmp( b->name, "cmd_data", strlen("cmd_data")) ) - { - // This magic header confirms a valid command stream in binary - if( strncmp( b->data, "COP1", 4 ) ) return 0; - h->cmd_data = b->data; - h->cmd_data_size = b->size; - } - if( !strncmp( b->name, "weight_data", strlen("weight_data")) ) - { - h->weight_data = b->data;; - h->weight_data_size = b->size; - } - if( !strncmp( b->name, "scratch_data", strlen("scratch_data")) ) - { - h->scratch_data = b->data; - h->scratch_data_size = b->size; - } - - // capture inputs and outputs - if( !strncmp( b->name, "scratch_data", strlen("scratch_data")) ) - { - h->scratch_data = b->data; - h->scratch_data_size = b->size; - } - if( !strncmp( b->name, "input_offset", strlen("input_offset")) ) - { - h->input_offset = ((int*)b->data)[0]; - } - if( !strncmp( b->name, "output_offset", strlen("output_offset")) ) - { - h->output_offset = ((int*)b->data)[0]; - } - if( !strncmp( b->name, "input_shape", strlen("input_shape")) ) - { - h->input_data_shape[0] = ((int*)b->data)[0]; - h->input_data_shape[0] = ((int*)b->data)[1]; - h->input_data_shape[0] = ((int*)b->data)[2]; - - } - if( !strncmp( b->name, "output_shape", strlen("output_shape")) ) - { - h->output_data_shape[0] = ((int*)b->data)[0]; - h->output_data_shape[0] = ((int*)b->data)[1]; - h->output_data_shape[0] = ((int*)b->data)[2]; - } - } - } - + } + printf("\n"); + + // Allocate driver handle and synchronously invoke driver + ethosu_driver* drv = ethosu_reserve_driver(); + + uint64_t bases[2] = { + (uint64_t)handles.weight_data, (uint64_t)handles.scratch_data}; + size_t bases_size[2] = { + handles.weight_data_size, handles.scratch_data_size}; + int result = ethosu_invoke_v3( + drv, + (void*)handles.cmd_data, + handles.cmd_data_size, + bases, + bases_size, + 2, + nullptr); + + if (result != 0) { + ET_LOG( + Error, + "ArmBackend::execute: 
Ethos-U invocation failed error (%d)", + result); + return Error::InvalidProgram; + } + + // TMP emit scratch + printf("Scratch after:\n"); + for (int i = 0; i < handles.scratch_data_size; i++) { + printf("%02x ", ((char*)handles.scratch_data)[i]); + if (!((i + 1) % 4)) + printf("\n"); + } + printf("\n"); + + // Process results into EValue storage + // TODO: optimise into direct write for compatible layouts + // TODO: get num in/out and layout? + int* output_address = (int*)(handles.scratch_data + handles.output_offset); + auto tensor = args[1]->toTensor(); + for (int j = 0; j < tensor.numel(); j++) { + tensor.mutable_data_ptr()[j] = output_address[j]; + } + + return Error::Ok; + } + + void destroy(DelegateHandle* handle) const override { + return; + } + + private: + typedef struct { + const char* cmd_data; + size_t cmd_data_size; + const char* weight_data; + size_t weight_data_size; + const char* scratch_data; + size_t scratch_data_size; + size_t input_offset; + size_t input_data_shape[3]; + size_t output_offset; + size_t output_data_shape[3]; + } vela_handles; + + typedef struct { + char name[16]; + int size; + char _pad[12]; + char data[]; + } vela_bin_block; + + static int next_mul_16(int n) { + return ((n - 1) | 15) + 1; + } + + int vela_read(char* data, vela_handles* h, int size) const { + // Read header string + if (strncmp(data, "vela_bin_stream", 15)) { + return 0; + } + data += 16; + + // Expect one or more 'vela_bin_block's + while (1) { + vela_bin_block* b = (vela_bin_block*)data; + data += 16 + 16 + next_mul_16(b->size); + + // Exit with success on finding end of stream + if (!strncmp(b->name, "vela_end_stream", 15)) + return 1; + + if (!strncmp(b->name, "cmd_data", strlen("cmd_data"))) { + // This magic header confirms a valid command stream in binary + if (strncmp(b->data, "COP1", 4)) + return 0; + h->cmd_data = b->data; + h->cmd_data_size = b->size; + } + if (!strncmp(b->name, "weight_data", strlen("weight_data"))) { + h->weight_data = b->data; + ; 
+ h->weight_data_size = b->size; + } + if (!strncmp(b->name, "scratch_data", strlen("scratch_data"))) { + h->scratch_data = b->data; + h->scratch_data_size = b->size; + } + + // capture inputs and outputs + if (!strncmp(b->name, "scratch_data", strlen("scratch_data"))) { + h->scratch_data = b->data; + h->scratch_data_size = b->size; + } + if (!strncmp(b->name, "input_offset", strlen("input_offset"))) { + h->input_offset = ((int*)b->data)[0]; + } + if (!strncmp(b->name, "output_offset", strlen("output_offset"))) { + h->output_offset = ((int*)b->data)[0]; + } + if (!strncmp(b->name, "input_shape", strlen("input_shape"))) { + h->input_data_shape[0] = ((int*)b->data)[0]; + h->input_data_shape[0] = ((int*)b->data)[1]; + h->input_data_shape[0] = ((int*)b->data)[2]; + } + if (!strncmp(b->name, "output_shape", strlen("output_shape"))) { + h->output_data_shape[0] = ((int*)b->data)[0]; + h->output_data_shape[0] = ((int*)b->data)[1]; + h->output_data_shape[0] = ((int*)b->data)[2]; + } + } + } }; namespace { auto backend = ArmBackend(); Backend backend_id{"ArmBackend", &backend}; static auto registered = register_backend(backend_id); -} +} // namespace } // namespace executor } // namespace torch From 524afc6713690481ee33482e4f7b520939a003b5 Mon Sep 17 00:00:00 2001 From: Rob Elliott Date: Wed, 4 Oct 2023 12:16:20 +0000 Subject: [PATCH 04/25] Add ArmBackend to example scripts Signed-off-by: Rob Elliott --- examples/arm/run.sh | 1 + examples/arm/setup.sh | 69 +++++++++++++++++++++++++------------------ 2 files changed, 42 insertions(+), 28 deletions(-) diff --git a/examples/arm/run.sh b/examples/arm/run.sh index 828ac16bdc6..515240ef2ad 100755 --- a/examples/arm/run.sh +++ b/examples/arm/run.sh @@ -56,6 +56,7 @@ function build_executorch() { -DEXECUTORCH_BUILD_GFLAGS=OFF \ -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=OFF \ -DEXECUTORCH_BUILD_HOST_TARGETS=OFF \ + -DEXECUTORCH_BUILD_ARM_BAREMETAL=ON \ -DCMAKE_BUILD_TYPE=Release \ -DEXECUTORCH_ENABLE_LOGGING=ON \ 
-DEXECUTORCH_SELECT_OPS_LIST="aten::_softmax.out" \ diff --git a/examples/arm/setup.sh b/examples/arm/setup.sh index d6f6880e173..c3518d3b24f 100755 --- a/examples/arm/setup.sh +++ b/examples/arm/setup.sh @@ -13,31 +13,7 @@ if [[ "${1}" == "-h" ]]; then fi ######## -### Hardcoded constants -######## -script_dir=$(cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) - -# FVP -fvp_url="https://developer.arm.com/-/media/Arm%20Developer%20Community/Downloads/OSS/FVP/Corstone-300/FVP_Corstone_SSE-300_11.22_20_Linux64.tgz?rev=018659bd574f4e7b95fa647e7836ccf4&hash=22A79103C6FA5FFA7AFF3BE0447F3FF9" -fvp_model_dir="Linux64_GCC-9.3" -fvp_md5_checksum="98e93b949d0fbac977292d8668d34523" - -# toochain -toolchain_url="https://armkeil.blob.core.windows.net/developer/Files/downloads/gnu/12.3.rel1/binrel/arm-gnu-toolchain-12.3.rel1-x86_64-arm-none-eabi.tar.xz" -toolchain_dir="arm-gnu-toolchain-12.3.rel1-x86_64-arm-none-eabi" -toolchain_md5_checksum="00ebb1b70b1f88906c61206457eacb61" - -# ethos-u -ethos_u_repo_url="https://review.mlplatform.org/ml/ethos-u/ethos-u" -ethos_u_base_rev="0995223100e3da8011700f58e491f1bf59511e3c" - -######## -### Optional user args -######## -root_dir=${1:-"$(realpath ${script_dir}/ethos-u-scratch)"} - -######## -### Functions +### Helper functions ######## function get_os_name() { # Returns the name of the system i.e. 
Linux or Darwin @@ -62,6 +38,44 @@ function verify_md5() { fi } +######## +### Hardcoded constants +######## +script_dir=$(cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) + +if [[ $(get_cpu_arch) == "x86_64" ]]; then + # FVP + fvp_url="https://developer.arm.com/-/media/Arm%20Developer%20Community/Downloads/OSS/FVP/Corstone-300/FVP_Corstone_SSE-300_11.22_20_Linux64.tgz?rev=018659bd574f4e7b95fa647e7836ccf4&hash=22A79103C6FA5FFA7AFF3BE0447F3FF9" + fvp_model_dir="Linux64_GCC-9.3" + + # toochain + toolchain_url="https://armkeil.blob.core.windows.net/developer/Files/downloads/gnu/12.3.rel1/binrel/arm-gnu-toolchain-12.3.rel1-x86_64-arm-none-eabi.tar.xz" + toolchain_dir="arm-gnu-toolchain-12.3.rel1-x86_64-arm-none-eabi" +elif [[ $(get_cpu_arch) == "aarch64" ]]; then + # FVP + fvp_url="https://developer.arm.com/-/media/Arm%20Developer%20Community/Downloads/OSS/FVP/Corstone-300/FVP_Corstone_SSE-300_11.22_20_Linux64_armv8l.tgz?rev=9cc6e9a32bb947ca9b21fa162144cb01&hash=7657A4CF27D42E892E3F08D452AAB073" + fvp_model_dir="Linux64_armv8l_GCC-9.3" + + # toochain + toolchain_url="https://armkeil.blob.core.windows.net/developer/Files/downloads/gnu/12.3.rel1/binrel/arm-gnu-toolchain-12.3.rel1-aarch64-arm-none-eabi.tar.xz" + toolchain_dir="arm-gnu-toolchain-12.3.rel1-aarch64-arm-none-eabi" +else + echo "[main] Error: only x86-64 & aarch64 architecture is supported for now!"; exit 1; +fi + +# ethos-u +ethos_u_repo_url="https://review.mlplatform.org/ml/ethos-u/ethos-u" +ethos_u_base_rev="0995223100e3da8011700f58e491f1bf59511e3c" + +######## +### Optional user args +######## +root_dir=${1:-"$(realpath ${script_dir}/ethos-u-scratch)"} + +######## +### Functions +######## + function setup_fvp() { # Download and install the Corstone 300 FVP simulator platform cd "${root_dir}" @@ -137,9 +151,8 @@ function patch_repo() { ######## # do basic checks # Make sure we are on a supported platform -# Linux ARM64 is a supported platform - adding it here is a WIP -[[ "$(get_cpu_arch)" 
!= "x86_64" ]] \ - && { echo "[main] Error: only x86-64 architecture is supported for now!"; exit 1; } +[[ $(get_cpu_arch) != "x86_64" ]] && [[ $(get_cpu_arch) != "aarch64" ]] \ + && { echo "[main] Error: only x86-64 & aarch64 architecture is supported for now!"; exit 1; } # No OSx support for FVP [[ "$(get_os_name)" != "Linux" ]] \ From 2bfdb5fcb7024b40e7f703da0bcfce6774f9323e Mon Sep 17 00:00:00 2001 From: Rob Elliott Date: Wed, 4 Oct 2023 14:57:04 +0000 Subject: [PATCH 05/25] Add delegate test and FVP output * There is a toolchain/linking issue mixing hard and soft float ABI which is causing the test to fail, but the structure is there and the delegate is registered. Signed-off-by: Rob Elliott --- .../0007-Add-delegate-runner-test.patch | 260 ++++++++++++++++++ examples/arm/run.sh | 21 +- 2 files changed, 279 insertions(+), 2 deletions(-) create mode 100644 examples/arm/ethos-u-setup/core_platform/patches/0007-Add-delegate-runner-test.patch diff --git a/examples/arm/ethos-u-setup/core_platform/patches/0007-Add-delegate-runner-test.patch b/examples/arm/ethos-u-setup/core_platform/patches/0007-Add-delegate-runner-test.patch new file mode 100644 index 00000000000..b5c2ad68b9d --- /dev/null +++ b/examples/arm/ethos-u-setup/core_platform/patches/0007-Add-delegate-runner-test.patch @@ -0,0 +1,260 @@ +From b6348ace74c14dc5b46060b792836399205834a7 Mon Sep 17 00:00:00 2001 +From: Rob Elliott +Date: Wed, 4 Oct 2023 13:31:33 +0000 +Subject: [PATCH] Add delegate runner test + +Signed-off-by: Rob Elliott +--- + applications/executorch_tests/CMakeLists.txt | 11 ++ + applications/executorch_tests/add.pte.h | 70 ++++++++++ + .../executorch_tests/runner_delegate.cpp | 132 ++++++++++++++++++ + 3 files changed, 213 insertions(+) + create mode 100644 applications/executorch_tests/add.pte.h + create mode 100644 applications/executorch_tests/runner_delegate.cpp + +diff --git a/applications/executorch_tests/CMakeLists.txt b/applications/executorch_tests/CMakeLists.txt +index 
c95d53e..d0233bf 100644 +--- a/applications/executorch_tests/CMakeLists.txt ++++ b/applications/executorch_tests/CMakeLists.txt +@@ -44,6 +44,7 @@ message("**********************") + set(LIB_ET_RUNTIME "${ET_BUILD_DIR_PATH}/libexecutorch.a") + set(LIB_ET_OP_REGISTRATION "${ET_BUILD_DIR_PATH}/kernels/portable/libportable_ops_lib.a") + set(LIB_ET_OP_KERNELS "${ET_BUILD_DIR_PATH}/kernels/portable/libportable_kernels.a") ++set(LIB_ET_ETHOS "${ET_BUILD_DIR_PATH}/backends/arm/libethos_u.a") + + add_custom_target( + gen_model_header ALL +@@ -67,6 +68,16 @@ ethosu_add_executable_test(executor_runner PRIVATE + ${LIB_ET_OP_REGISTRATION} + ${LIB_ET_OP_KERNELS}) + ++ethosu_add_executable_test(executor_runner PRIVATE ++ WHOLE_ARCHIVE TRUE ++ SOURCES runner_delegate.cpp ++ LIBRARIES ++ ${LIB_ET_RUNTIME} ++ ${LIB_ET_OP_REGISTRATION} ++ ${LIB_ET_OP_KERNELS} ++ ${LIB_ET_ETHOS} ++ ) ++ + add_dependencies(executor_runner gen_model_header) + + target_include_directories(executor_runner PRIVATE +diff --git a/applications/executorch_tests/add.pte.h b/applications/executorch_tests/add.pte.h +new file mode 100644 +index 0000000..05bc0ec +--- /dev/null ++++ b/applications/executorch_tests/add.pte.h +@@ -0,0 +1,70 @@ ++__attribute__((section(".sram.data"), aligned(16))) char add_pte[] = { ++0x24, 0x00, 0x00, 0x00, 0x45, 0x54, 0x31, 0x32, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x08, 0x00, ++0x0c, 0x00, 0x10, 0x00, 0x0e, 0x00, 0x00, 0x00, 0xf4, 0x04, 0x00, 0x00, 0xdc, 0x04, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, ++0x04, 0x00, 0x00, 0x00, 0x36, 0xf9, 0xff, 0xff, 0x04, 0x00, 0x00, 0x00, 0xb0, 0x04, 0x00, 0x00, 0x76, 0x65, 0x6c, 0x61, 0x5f, 0x62, 0x69, 0x6e, 0x5f, 0x73, 0x74, 0x72, 0x65, 0x61, 0x6d, 0x00, ++0x63, 0x6d, 0x64, 0x5f, 0x64, 0x61, 0x74, 0x61, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x74, 0x01, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, ++0x43, 0x4f, 0x50, 0x31, 0x01, 0x00, 0x10, 0x00, 0x07, 0x18, 0x00, 0x00, 0x00, 0x00, 0x06, 0x10, 0x05, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x02, 0x00, 0x55, 0x00, ++0x25, 0x40, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x26, 0x40, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x24, 0x40, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0f, 0x01, 0x01, 0x00, 0x00, 0x40, 0x00, 0x00, ++0x00, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0b, 0x01, 0x00, 0x00, ++0x0c, 0x01, 0x00, 0x00, 0x0a, 0x01, 0x00, 0x00, 0x04, 0x01, 0x04, 0x00, 0x06, 0x40, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x05, 0x40, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x04, 0x40, 0x00, 0x00, ++0x14, 0x00, 0x00, 0x00, 0x09, 0x01, 0x00, 0x00, 0x05, 0x01, 0x09, 0x00, 0x07, 0x01, 0x00, 0x00, 0x1f, 0x01, 0x01, 0x00, 0x10, 0x40, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x11, 0x40, 0x00, 0x00, ++0x00, 0x00, 0x00, 0x00, 0x12, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x13, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1b, 0x01, 0x00, 0x00, 0x1c, 0x01, 0x00, 0x00, 0x1a, 0x01, 0x00, 0x00, ++0x12, 0x01, 0x00, 0x00, 0x11, 0x01, 0x00, 0x00, 0x13, 0x01, 0x04, 0x00, 0x16, 0x40, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x15, 0x40, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x14, 0x40, 0x00, 0x00, ++0x14, 0x00, 0x00, 0x00, 0x18, 0x01, 0x00, 0x00, 0x14, 0x01, 0x05, 0x01, 0x25, 0x01, 0x00, 0x00, 0x26, 0x01, 0x00, 0x80, 0x27, 0x01, 0xff, 0x7f, 0x16, 0x01, 0x00, 0x00, 0x15, 0x01, 0x01, 0x00, ++0x17, 0x01, 0x07, 0x00, 0x0d, 0x01, 0x16, 0x00, 0x2d, 0x01, 0x16, 0x00, 0x8d, 0x01, 0x0a, 0x00, 0x24, 0x01, 0x00, 0x00, 0x8f, 0x01, 0x01, 0x00, 0x80, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, ++0x81, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x82, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x83, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8b, 0x01, 0x00, 0x00, 0x8c, 0x01, 0x00, 0x00, 
++0x8a, 0x01, 0x00, 0x00, 0x86, 0x40, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x85, 0x40, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x84, 0x40, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x89, 0x01, 0x00, 0x00, ++0x85, 0x01, 0x09, 0x00, 0x80, 0x01, 0x00, 0x00, 0x2f, 0x01, 0x00, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, ++0x77, 0x65, 0x69, 0x67, 0x68, 0x74, 0x5f, 0x64, 0x61, 0x74, 0x61, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, ++0x77, 0x65, 0x69, 0x67, 0x68, 0x74, 0x5f, 0x72, 0x65, 0x67, 0x69, 0x6f, 0x6e, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, ++0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x73, 0x63, 0x72, 0x61, 0x74, 0x63, 0x68, 0x5f, 0x73, 0x68, 0x61, 0x70, 0x65, 0x00, 0x00, 0x00, ++0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, ++0x73, 0x63, 0x72, 0x61, 0x74, 0x63, 0x68, 0x5f, 0x72, 0x65, 0x67, 0x69, 0x6f, 0x6e, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, ++0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x73, 0x63, 0x72, 0x61, 0x74, 0x63, 0x68, 0x5f, 0x66, 0x61, 0x73, 0x74, 0x5f, 0x73, 0x68, 0x00, ++0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, ++0x73, 0x63, 0x72, 0x61, 0x74, 0x63, 0x68, 0x5f, 0x66, 0x61, 0x73, 0x74, 0x5f, 0x72, 0x65, 0x00, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, ++0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x5f, 0x73, 0x68, 0x61, 0x70, 0x65, 0x00, 0x00, 0x00, 0x00, 0x00, ++0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, ++0x69, 0x6e, 0x70, 0x75, 0x74, 0x5f, 0x65, 0x6c, 0x65, 0x6d, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x00, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, ++0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x5f, 0x72, 0x65, 0x67, 0x69, 0x6f, 0x6e, 0x00, 0x00, 0x00, 0x00, ++0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, ++0x69, 0x6e, 0x70, 0x75, 0x74, 0x5f, 0x6f, 0x66, 0x66, 0x73, 0x65, 0x74, 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, ++0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x6f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x5f, 0x73, 0x68, 0x61, 0x70, 0x65, 0x00, 0x00, 0x00, 0x00, ++0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, ++0x6f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x5f, 0x65, 0x6c, 0x65, 0x6d, 0x5f, 0x73, 0x69, 0x7a, 0x00, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, ++0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x6f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x5f, 0x72, 0x65, 0x67, 0x69, 0x6f, 0x6e, 0x00, 0x00, 0x00, ++0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, ++0x6f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x5f, 0x6f, 0x66, 0x66, 0x73, 0x65, 0x74, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, ++0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x73, 0x63, 0x72, 0x61, 0x74, 0x63, 0x68, 0x5f, 0x64, 0x61, 0x74, 0x61, 0x00, 0x00, 0x00, 0x00, ++0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, ++0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, ++0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x76, 0x65, 0x6c, 0x61, 0x5f, 0x65, 0x6e, 0x64, 0x5f, 0x73, 0x74, 0x72, 0x65, 0x61, 0x6d, 0x00, ++0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0xfe, 0xff, 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, ++0x1c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x16, 0x00, 0x28, 0x00, 0x04, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x10, 0x00, 0x14, 0x00, 0x18, 0x00, 0x1c, 0x00, 0x20, 0x00, 0x24, 0x00, 0x16, 0x00, 0x00, 0x00, ++0x44, 0x03, 0x00, 0x00, 0xf0, 0x01, 0x00, 0x00, 0x2c, 0x01, 0x00, 0x00, 0x20, 0x01, 0x00, 0x00, 0x14, 0x01, 0x00, 0x00, 0xac, 0x00, 0x00, 0x00, 0xa4, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, ++0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, ++0x76, 0xff, 0xff, 0xff, 0x68, 0x00, 0x00, 0x00, 0x60, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x6c, 0xfe, 0xff, 0xff, 0x34, 
0x00, 0x00, 0x00, ++0x04, 0x00, 0x00, 0x00, 0x27, 0x00, 0x00, 0x00, 0x2f, 0x74, 0x6d, 0x70, 0x2f, 0x61, 0x72, 0x6d, 0x5f, 0x74, 0x6f, 0x73, 0x61, 0x5f, 0x6d, 0x7a, 0x74, 0x66, 0x5f, 0x62, 0x76, 0x67, 0x2f, 0x73, ++0x69, 0x6d, 0x70, 0x6c, 0x65, 0x5f, 0x61, 0x64, 0x64, 0x2f, 0x74, 0x6f, 0x73, 0x61, 0x2f, 0x00, 0x0f, 0x00, 0x00, 0x00, 0x64, 0x65, 0x62, 0x75, 0x67, 0x5f, 0x74, 0x6f, 0x73, 0x61, 0x5f, 0x70, ++0x61, 0x74, 0x68, 0x00, 0x04, 0x00, 0x04, 0x00, 0x04, 0x00, 0x00, 0x00, 0x0a, 0x00, 0x00, 0x00, 0x41, 0x72, 0x6d, 0x42, 0x61, 0x63, 0x6b, 0x65, 0x6e, 0x64, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, ++0x01, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0a, 0x00, 0x10, 0x00, 0x04, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x0a, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, ++0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x07, 0x00, 0x08, 0x00, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x0c, 0x00, 0x00, 0x00, ++0x08, 0x00, 0x08, 0x00, 0x00, 0x00, 0x04, 0x00, 0x08, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, ++0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x60, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, ++0xb0, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x05, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x12, 0x00, 0x14, 0x00, 0x07, 0x00, 0x00, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x00, ++0x12, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x20, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x88, 0xff, 0xff, 0xff, 0x01, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, ++0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x08, 0x00, 0x0e, 0x00, 0x07, 0x00, 0x08, 0x00, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x05, ++0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x12, 0x00, 0x16, 0x00, 0x07, 0x00, 0x00, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x00, 0x12, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, ++0x24, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x06, 0x00, 0x08, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, ++0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x04, 0x00, 0x08, 0x00, 0x08, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, ++0x39, 0x00, 0x00, 0x00, 0x5b, 0x31, 0x2c, 0x20, 0x7b, 0x22, 0x74, 0x79, 0x70, 0x65, 0x22, 0x3a, 0x20, 0x6e, 0x75, 0x6c, 0x6c, 0x2c, 0x20, 0x22, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x22, ++0x3a, 0x20, 0x6e, 0x75, 0x6c, 0x6c, 0x2c, 0x20, 0x22, 0x63, 0x68, 0x69, 0x6c, 0x64, 0x72, 0x65, 0x6e, 0x5f, 0x73, 0x70, 0x65, 0x63, 0x22, 0x3a, 0x20, 0x5b, 0x5d, 0x7d, 0x5d, 0x00, 0x00, 0x00, ++0xfe, 0x00, 0x00, 0x00, 0x5b, 0x31, 0x2c, 0x20, 0x7b, 0x22, 0x74, 0x79, 0x70, 0x65, 0x22, 0x3a, 0x20, 0x22, 0x62, 0x75, 0x69, 0x6c, 0x74, 0x69, 0x6e, 0x73, 0x2e, 0x74, 0x75, 0x70, 0x6c, 0x65, ++0x22, 0x2c, 0x20, 0x22, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x22, 0x3a, 0x20, 0x22, 0x6e, 0x75, 0x6c, 0x6c, 0x22, 0x2c, 0x20, 0x22, 0x63, 0x68, 0x69, 0x6c, 0x64, 0x72, 0x65, 0x6e, 0x5f, ++0x73, 0x70, 0x65, 0x63, 0x22, 0x3a, 0x20, 0x5b, 0x7b, 0x22, 0x74, 0x79, 0x70, 0x65, 0x22, 0x3a, 0x20, 0x22, 0x62, 0x75, 0x69, 0x6c, 0x74, 0x69, 0x6e, 0x73, 0x2e, 0x74, 0x75, 0x70, 0x6c, 0x65, ++0x22, 0x2c, 0x20, 0x22, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x22, 0x3a, 0x20, 0x22, 0x6e, 0x75, 0x6c, 0x6c, 0x22, 0x2c, 0x20, 0x22, 0x63, 0x68, 0x69, 0x6c, 0x64, 0x72, 0x65, 0x6e, 0x5f, ++0x73, 0x70, 0x65, 0x63, 0x22, 0x3a, 0x20, 0x5b, 0x7b, 0x22, 0x74, 0x79, 0x70, 0x65, 0x22, 0x3a, 0x20, 0x6e, 0x75, 0x6c, 0x6c, 0x2c, 0x20, 0x22, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x22, ++0x3a, 0x20, 0x6e, 0x75, 0x6c, 0x6c, 0x2c, 0x20, 0x22, 0x63, 0x68, 0x69, 0x6c, 0x64, 0x72, 0x65, 
0x6e, 0x5f, 0x73, 0x70, 0x65, 0x63, 0x22, 0x3a, 0x20, 0x5b, 0x5d, 0x7d, 0x5d, 0x7d, 0x2c, 0x20, ++0x7b, 0x22, 0x74, 0x79, 0x70, 0x65, 0x22, 0x3a, 0x20, 0x22, 0x62, 0x75, 0x69, 0x6c, 0x74, 0x69, 0x6e, 0x73, 0x2e, 0x64, 0x69, 0x63, 0x74, 0x22, 0x2c, 0x20, 0x22, 0x63, 0x6f, 0x6e, 0x74, 0x65, ++0x78, 0x74, 0x22, 0x3a, 0x20, 0x22, 0x5b, 0x5d, 0x22, 0x2c, 0x20, 0x22, 0x63, 0x68, 0x69, 0x6c, 0x64, 0x72, 0x65, 0x6e, 0x5f, 0x73, 0x70, 0x65, 0x63, 0x22, 0x3a, 0x20, 0x5b, 0x5d, 0x7d, 0x5d, ++0x7d, 0x5d, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x66, 0x6f, 0x72, 0x77, 0x61, 0x72, 0x64, 0x00, }; +diff --git a/applications/executorch_tests/runner_delegate.cpp b/applications/executorch_tests/runner_delegate.cpp +new file mode 100644 +index 0000000..fe77d83 +--- /dev/null ++++ b/applications/executorch_tests/runner_delegate.cpp +@@ -0,0 +1,132 @@ ++/* ++ * SPDX-FileCopyrightText: Copyright 2021-2023 Arm Limited and/or its affiliates ++ * ++ * SPDX-License-Identifier: Apache-2.0 ++ * ++ * Licensed under the Apache License, Version 2.0 (the License); you may ++ * not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an AS IS BASIS, WITHOUT ++ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. 
++ */ ++ ++/**************************************************************************** ++ * Includes ++ ****************************************************************************/ ++ ++#include ++#include ++#include ++ ++using namespace std; ++ ++#include ++#include ++#include ++#include ++#include ++ ++/**************************************************************************** ++ * Data ++ ****************************************************************************/ ++ ++// Our .pte file generated from the AoT flow ++#include "add.pte.h" ++ ++// Storage for intermediate data in SRAM ++__attribute__((section(".sram.data"), aligned(16))) uint8_t method_allocator_pool[4 * 1024U]; ++ ++ ++int main() ++{ ++ printf("test test test NG ^2 22\n"); ++ printf("main: Initialising runtime\n"); ++ torch::executor::runtime_init(); ++ ++ using torch::executor::Result; ++ using torch::executor::Error; ++ ++ // Load pte from the global add_pte .pte file loaded into SRAM. ++ auto loader = torch::executor::util::BufferDataLoader(add_pte, sizeof(add_pte)); ++ Result program = torch::executor::Program::load(&loader); ++ if(!program.ok()) { ++ printf("main: Program loading failed @ 0x%p: 0x%x", add_pte, (int)program.error()); ++ } ++ printf("main: Model buffer loaded, has %u methods\n", program->num_methods()); ++ ++ // Find our entrypoint in the .pte program ++ const char* method_name = nullptr; ++ const auto method_name_result = program->get_method_name(0); ++ ET_CHECK_MSG(method_name_result.ok(), "Program has no methods"); ++ method_name = *method_name_result; ++ printf("main: Found (and will run) method '%s'\n", method_name); ++ ++ // Allocate necessary memories for this method ++ Result method_meta = program->method_meta(method_name); ++ if (!method_meta.ok()) { ++ printf("main: Failed to get method_meta for %s: 0x%x", ++ method_name, (unsigned int)method_meta.error()); ++ } ++ ++ torch::executor::MemoryAllocator method_allocator{ ++ 
torch::executor::MemoryAllocator(sizeof(method_allocator_pool), method_allocator_pool)}; ++ ++ std::vector> planned_buffers; // Owns the memory ++ std::vector> planned_spans; // Passed to the allocator ++ size_t num_memory_planned_buffers = method_meta->num_memory_planned_buffers(); ++ ++ for (size_t id = 0; id < num_memory_planned_buffers; ++id) { ++ size_t buffer_size = static_cast(method_meta->memory_planned_buffer_size(id).get()); ++ printf("main: Setting up planned buffer %zu, size %zu.\n", id, buffer_size); ++ ++ planned_buffers.push_back(std::make_unique(buffer_size)); ++ planned_spans.push_back({planned_buffers.back().get(), buffer_size}); ++ } ++ ++ torch::executor::HierarchicalAllocator planned_memory( ++ {planned_spans.data(), planned_spans.size()}); ++ ++ torch::executor::MemoryManager memory_manager(&method_allocator, &planned_memory); ++ ++ Result method = program->load_method(method_name, &memory_manager); ++ ++ if(!method.ok()) { ++ printf("main: Loading of method %s failed with status 0x%x\n", method_name, (int)method.error()); ++ } ++ printf("main: Loading of method '%s' succesful\n", method_name); ++ ++ printf("main: Preparing inputs...\n"); ++ auto inputs = torch::executor::util::PrepareInputTensors(*method); ++ ++ printf("main: Starting the model execution...\n"); ++ Error status = method->execute(); ++ if(status != Error::Ok){ ++ printf("main: Execution of method %s failed with status 0x%x\n", method_name, (int)status); ++ } else { ++ printf("main: Model executed successfully.\n"); ++ } ++ ++ // Print the outputs. 
++ std::vector outputs(method->outputs_size()); ++ printf("main: %d outputs - ", outputs.size()); ++ status = method->get_outputs(outputs.data(), outputs.size()); ++ ET_CHECK(status == Error::Ok); ++ for (size_t i = 0; i < outputs.size(); ++i) ++ { ++ printf("main: Output %d numel %d\n", i, outputs[i].toTensor().numel()); ++ for (size_t j = 0; j < outputs[i].toTensor().numel(); ++j) ++ { ++ printf("main: Output[%d]: %d\n", j, outputs[i].toTensor().const_data_ptr()[j]); ++ } ++ } ++ ++ return 0; ++} ++ ++ +-- +2.41.0 + diff --git a/examples/arm/run.sh b/examples/arm/run.sh index 515240ef2ad..ef0a6d560a3 100755 --- a/examples/arm/run.sh +++ b/examples/arm/run.sh @@ -87,7 +87,7 @@ function build_executorch_runner() { echo "[${FUNCNAME[0]}] Configured CMAKE" n=$(nproc) - cmake --build build -- -j"$((n - 5))" executor_runner VERBOSE=1 + cmake --build build -- -j"$((n - 5))" executor_runner executor_runner_delegate VERBOSE=1 echo "[${FUNCNAME[0]}] Generated baremetal elf file:" find . -name "executor_runner.elf" } @@ -102,7 +102,21 @@ function run_fvp() { -C mps3_board.telnetterminal0.start_telnet=0 \ -C mps3_board.uart0.out_file='-' \ -a "${elf}" \ - --timelimit 10 # seconds + --timelimit 5 || true # seconds + echo "[${FUNCNAME[0]} Simulation complete, $?" +} + +# Execute the executor_runner on FVP Simulator +function run_fvp_delegate() { + elf=$(find ${ethos_u_build_dir} -name "executor_runner_delegate.elf") + [[ ! -f $elf ]] && { echo "[${FUNCNAME[0]}]: Unable to find executor_runner_delegate elf: ${elf}"; exit 1; } + FVP_Corstone_SSE-300_Ethos-U55 \ + -C ethosu.num_macs=128 \ + -C mps3_board.visualisation.disable-visualisation=1 \ + -C mps3_board.telnetterminal0.start_telnet=0 \ + -C mps3_board.uart0.out_file='-' \ + -a "${elf}" \ + --timelimit 5 || true echo "[${FUNCNAME[0]} Simulation complete, $?" 
} @@ -143,4 +157,7 @@ build_executorch_runner "${pte}" # run the app run_fvp +# run the delegate app +run_fvp_delegate + exit 0 From 8267612aafe02b36cfb81c4206b3de5f16a2521c Mon Sep 17 00:00:00 2001 From: Rob Elliott Date: Wed, 4 Oct 2023 16:12:31 +0000 Subject: [PATCH 06/25] Fix delegate runner patch --- .../0007-Add-delegate-runner-test.patch | 30 +++++++++++++------ 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/examples/arm/ethos-u-setup/core_platform/patches/0007-Add-delegate-runner-test.patch b/examples/arm/ethos-u-setup/core_platform/patches/0007-Add-delegate-runner-test.patch index b5c2ad68b9d..8b686c4f23f 100644 --- a/examples/arm/ethos-u-setup/core_platform/patches/0007-Add-delegate-runner-test.patch +++ b/examples/arm/ethos-u-setup/core_platform/patches/0007-Add-delegate-runner-test.patch @@ -1,19 +1,19 @@ -From b6348ace74c14dc5b46060b792836399205834a7 Mon Sep 17 00:00:00 2001 +From 12e841a383069f0b3d0e9c51c793c2922a590ae0 Mon Sep 17 00:00:00 2001 From: Rob Elliott Date: Wed, 4 Oct 2023 13:31:33 +0000 Subject: [PATCH] Add delegate runner test Signed-off-by: Rob Elliott --- - applications/executorch_tests/CMakeLists.txt | 11 ++ + applications/executorch_tests/CMakeLists.txt | 21 ++- applications/executorch_tests/add.pte.h | 70 ++++++++++ .../executorch_tests/runner_delegate.cpp | 132 ++++++++++++++++++ - 3 files changed, 213 insertions(+) + 3 files changed, 221 insertions(+), 2 deletions(-) create mode 100644 applications/executorch_tests/add.pte.h create mode 100644 applications/executorch_tests/runner_delegate.cpp diff --git a/applications/executorch_tests/CMakeLists.txt b/applications/executorch_tests/CMakeLists.txt -index c95d53e..d0233bf 100644 +index c95d53e..195c8a3 100644 --- a/applications/executorch_tests/CMakeLists.txt +++ b/applications/executorch_tests/CMakeLists.txt @@ -44,6 +44,7 @@ message("**********************") @@ -24,11 +24,17 @@ index c95d53e..d0233bf 100644 add_custom_target( gen_model_header ALL -@@ -67,6 +68,16 @@ 
ethosu_add_executable_test(executor_runner PRIVATE +@@ -67,10 +68,26 @@ ethosu_add_executable_test(executor_runner PRIVATE ${LIB_ET_OP_REGISTRATION} ${LIB_ET_OP_KERNELS}) -+ethosu_add_executable_test(executor_runner PRIVATE +-add_dependencies(executor_runner gen_model_header) +- + target_include_directories(executor_runner PRIVATE + ${ET_INCLUDE_PATH} + ${CMAKE_CURRENT_BINARY_DIR}) + ++ethosu_add_executable_test(executor_runner_delegate PRIVATE + WHOLE_ARCHIVE TRUE + SOURCES runner_delegate.cpp + LIBRARIES @@ -38,9 +44,15 @@ index c95d53e..d0233bf 100644 + ${LIB_ET_ETHOS} + ) + - add_dependencies(executor_runner gen_model_header) - - target_include_directories(executor_runner PRIVATE ++target_include_directories(executor_runner_delegate PRIVATE ++${ET_INCLUDE_PATH} ++${CMAKE_CURRENT_BINARY_DIR}) ++ ++add_dependencies(executor_runner gen_model_header) ++ ++ ++ + # TODO Memory setup diff --git a/applications/executorch_tests/add.pte.h b/applications/executorch_tests/add.pte.h new file mode 100644 index 0000000..05bc0ec From 6b398fe5fd32f06b3cbe5bf03cb5d82bf6bf3fe2 Mon Sep 17 00:00:00 2001 From: Rob Elliott Date: Wed, 4 Oct 2023 20:09:50 +0000 Subject: [PATCH 07/25] cmake compiler and log behaviour fixing * Override the default CPU in cmake which was causing a mixture of FPU and ABI flags to be passed to different compilation stages. 
* Updated fallback logging implementation in delegate app to fix sporadic crash Signed-off-by: Rob Elliott --- backends/arm/cmake/arm-none-eabi-gcc.cmake | 2 +- backends/arm/runtime/ArmBackendEthosU.cpp | 5 +- .../0007-Add-delegate-runner-test.patch | 66 +++++++++++++++---- 3 files changed, 58 insertions(+), 15 deletions(-) diff --git a/backends/arm/cmake/arm-none-eabi-gcc.cmake b/backends/arm/cmake/arm-none-eabi-gcc.cmake index 10bc858ed46..0921a529037 100644 --- a/backends/arm/cmake/arm-none-eabi-gcc.cmake +++ b/backends/arm/cmake/arm-none-eabi-gcc.cmake @@ -3,7 +3,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -set(TARGET_CPU "cortex-m4" CACHE STRING "Target CPU") +set(TARGET_CPU "cortex-m55" CACHE STRING "Target CPU") string(TOLOWER ${TARGET_CPU} CMAKE_SYSTEM_PROCESSOR) set(CMAKE_SYSTEM_NAME Generic) diff --git a/backends/arm/runtime/ArmBackendEthosU.cpp b/backends/arm/runtime/ArmBackendEthosU.cpp index 4c052ea60c7..7810c8f33fe 100644 --- a/backends/arm/runtime/ArmBackendEthosU.cpp +++ b/backends/arm/runtime/ArmBackendEthosU.cpp @@ -29,9 +29,8 @@ namespace executor { class ArmBackend final : public PyTorchBackendInterface { public: - ArmBackend() { - printf("Constructing ARM Backend\n"); - } + + ArmBackend() {} ~ArmBackend() = default; diff --git a/examples/arm/ethos-u-setup/core_platform/patches/0007-Add-delegate-runner-test.patch b/examples/arm/ethos-u-setup/core_platform/patches/0007-Add-delegate-runner-test.patch index 8b686c4f23f..5e9e72b098c 100644 --- a/examples/arm/ethos-u-setup/core_platform/patches/0007-Add-delegate-runner-test.patch +++ b/examples/arm/ethos-u-setup/core_platform/patches/0007-Add-delegate-runner-test.patch @@ -1,19 +1,20 @@ -From 12e841a383069f0b3d0e9c51c793c2922a590ae0 Mon Sep 17 00:00:00 2001 +From 6bd0ad55504da811159c58abeb5305a7a1ab884a Mon Sep 17 00:00:00 2001 From: Rob Elliott Date: Wed, 4 Oct 2023 13:31:33 +0000 Subject: [PATCH] Add delegate 
runner test Signed-off-by: Rob Elliott --- - applications/executorch_tests/CMakeLists.txt | 21 ++- - applications/executorch_tests/add.pte.h | 70 ++++++++++ - .../executorch_tests/runner_delegate.cpp | 132 ++++++++++++++++++ - 3 files changed, 221 insertions(+), 2 deletions(-) + applications/executorch_tests/CMakeLists.txt | 19 +- + applications/executorch_tests/add.pte.h | 70 ++++++++ + .../executorch_tests/runner_delegate.cpp | 162 ++++++++++++++++++ + cmake/toolchain/arm-none-eabi-gcc.cmake | 6 +- + 4 files changed, 251 insertions(+), 6 deletions(-) create mode 100644 applications/executorch_tests/add.pte.h create mode 100644 applications/executorch_tests/runner_delegate.cpp diff --git a/applications/executorch_tests/CMakeLists.txt b/applications/executorch_tests/CMakeLists.txt -index c95d53e..195c8a3 100644 +index c95d53e..943ade5 100644 --- a/applications/executorch_tests/CMakeLists.txt +++ b/applications/executorch_tests/CMakeLists.txt @@ -44,6 +44,7 @@ message("**********************") @@ -24,7 +25,7 @@ index c95d53e..195c8a3 100644 add_custom_target( gen_model_header ALL -@@ -67,10 +68,26 @@ ethosu_add_executable_test(executor_runner PRIVATE +@@ -67,10 +68,24 @@ ethosu_add_executable_test(executor_runner PRIVATE ${LIB_ET_OP_REGISTRATION} ${LIB_ET_OP_KERNELS}) @@ -39,8 +40,6 @@ index c95d53e..195c8a3 100644 + SOURCES runner_delegate.cpp + LIBRARIES + ${LIB_ET_RUNTIME} -+ ${LIB_ET_OP_REGISTRATION} -+ ${LIB_ET_OP_KERNELS} + ${LIB_ET_ETHOS} + ) + @@ -131,10 +130,10 @@ index 0000000..05bc0ec +0x7d, 0x5d, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x66, 0x6f, 0x72, 0x77, 0x61, 0x72, 0x64, 0x00, }; diff --git a/applications/executorch_tests/runner_delegate.cpp b/applications/executorch_tests/runner_delegate.cpp new file mode 100644 -index 0000000..fe77d83 +index 0000000..a4024bb --- /dev/null +++ b/applications/executorch_tests/runner_delegate.cpp -@@ -0,0 +1,132 @@ +@@ -0,0 +1,162 @@ +/* + * SPDX-FileCopyrightText: Copyright 2021-2023 Arm Limited and/or its affiliates + 
* @@ -179,6 +178,36 @@ index 0000000..fe77d83 +// Storage for intermediate data in SRAM +__attribute__((section(".sram.data"), aligned(16))) uint8_t method_allocator_pool[4 * 1024U]; + ++void et_pal_init(void) {} ++ ++__ET_NORETURN void et_pal_abort(void) { ++ __builtin_trap(); ++} ++ ++et_timestamp_t et_pal_current_ticks(void) { ++ // libc.a - warning: _gettimeofday is not implemented and will always fail ++ return 11223344; ++} ++ ++/** ++ * Emit a log message via platform output (serial port, console, etc). ++ */ ++void et_pal_emit_log_message( ++ __ET_UNUSED et_timestamp_t timestamp, ++ et_pal_log_level_t level, ++ const char* filename, ++ __ET_UNUSED const char* function, ++ size_t line, ++ const char* message, ++ __ET_UNUSED size_t length) { ++ fprintf( ++ stderr, ++ "%c executorch:%s:%zu] %s\n", ++ level, ++ filename, ++ line, ++ message); ++} + +int main() +{ @@ -267,6 +296,21 @@ index 0000000..fe77d83 +} + + +diff --git a/cmake/toolchain/arm-none-eabi-gcc.cmake b/cmake/toolchain/arm-none-eabi-gcc.cmake +index 0e6a2ed..fdb0d7c 100644 +--- a/cmake/toolchain/arm-none-eabi-gcc.cmake ++++ b/cmake/toolchain/arm-none-eabi-gcc.cmake +@@ -98,8 +98,6 @@ add_compile_options( + # -Wswitch + # -Wswitch-default + # -Wunused +- +- # -Wno-redundant-decls +- +- # -Wno-psabi ++ -Wno-redundant-decls ++ -Wno-psabi + ) -- 2.41.0 From 6ca9b26edf0b6623cc020591ed4fedaa8141b219 Mon Sep 17 00:00:00 2001 From: Rob Elliott Date: Thu, 5 Oct 2023 08:19:14 +0000 Subject: [PATCH 08/25] Minimal example of AoT with ArmPartitioner+Vela * uses the simple_add model to run through the AoT flow and the various debug options for looking at export. 
* produces a .pte file for runtime delegation on the ArmBackend for Ethos-U55 platforms Signed-off-by: Rob Elliott --- backends/arm/test/test_models.py | 2 + examples/arm/arm_ethosu_minimal.py | 212 +++++++++++++++++++++++++++++ 2 files changed, 214 insertions(+) create mode 100644 examples/arm/arm_ethosu_minimal.py diff --git a/backends/arm/test/test_models.py b/backends/arm/test/test_models.py index 3400a7c8f7c..b33007f0c84 100644 --- a/backends/arm/test/test_models.py +++ b/backends/arm/test/test_models.py @@ -25,6 +25,7 @@ class TosaProfile(Enum): BI = 0 # Base Inference MI = 1 # Main Inference MT = 2 # Main Training + BI_INT = 3 # integer only BI subset tests (for test graphs) class TorchBuilder: @@ -67,6 +68,7 @@ class simple_add(torch.nn.Module): inputs = { TosaProfile.BI: (torch.ones(5),), TosaProfile.MI: (torch.ones(5),), + TosaProfile.BI_INT: (torch.ones(5,dtype=torch.int32),), } def __init__(self): diff --git a/examples/arm/arm_ethosu_minimal.py b/examples/arm/arm_ethosu_minimal.py new file mode 100644 index 00000000000..a41cbb42cd5 --- /dev/null +++ b/examples/arm/arm_ethosu_minimal.py @@ -0,0 +1,212 @@ +# Copyright 2023 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +import json +import os +import subprocess +import tempfile + +import executorch.exir as exir + +import numpy as np +from executorch.backends.arm.arm_backend import ArmPartitioner +from executorch.backends.arm.test.test_models import TestList, TosaProfile +from executorch.backends.arm.test.test_tosa import prepare_model_and_ref + +from executorch.exir.backend.backend_api import to_backend +from executorch.exir.backend.canonical_partitioners.duplicate_dequant_node_pass import ( + DuplicateDequantNodePass, +) +from executorch.exir.backend.compile_spec_schema import CompileSpec + +from executorch.exir.dialects._ops import ops as exir_ops + +# Assumes you have these two tools on your path +TOSA_REF_MODEL_PATH = "tosa_reference_model" +VELA_COMPILER_PATH = "vela" + +# Basic config for graph capture +_CAPTURE_CONFIG = exir.CaptureConfig(enable_aot=True) +_EDGE_COMPILE_CONFIG = exir.EdgeCompileConfig( + _check_ir_validity=False, +) + +EXAMPLE_TEST_LIST = ["simple_add"] + +# +# +# +# +def tosa_ref_capture_inputs( + model_edge, + inputs, + path, + input_quantization_scales, + input_quantization_zps, + profile=TosaProfile.MI, +): + # Emit TOSA test data from the model inputs - assumes whole graph lowered so we just have + # placeholders for the TOSA delegate. Emits data in tosa_ref_model expected layout. + # - Skips placeholders which are encoded as constants (i.e. 
are already captured weights) + # - Assumes argument order is fixed + argument_names = [] + for node in model_edge.exported_program.graph.nodes: + gs = model_edge.exported_program.graph_signature + if node.op == "placeholder": + if node.name in gs.inputs_to_parameters: + pass + elif node.name in gs.inputs_to_buffers: + pass + else: + argument_names.append(node.name) + else: + break + + for arg in zip(argument_names, inputs): + name = arg[0] + data = arg[1].detach().numpy() + file_path = path + "/" + name + ".npy" + + # Torch is doing Input[FP32]->Q[INT8]->DQ[FP32]->Operator[FP32]->Q[INT]->DQ[FP32]->[Output]FP32 + # Need to quantize the input to INT8 for TOSA comsumption + if profile is TosaProfile.BI: + data_quantized = ( + (data / input_quantization_scales[name]) - input_quantization_zps[name] + ).astype(np.int8) + np.save(file_path, data_quantized, allow_pickle=False) + else: + np.save(file_path, data, allow_pickle=False) + +# +# Minimal sequence to take a model through the ArmPartitioner and produce +# both TOSA intermediate output, and an Ethos-U55 command stream within +# the ExecuTorch .pte binary +# +def run_test(op, profile=TosaProfile.MI, output_path="./ethosout/"): + # + # Minimal sequence to take model through TosaPartitioner and emit + # tosaout/ debug directory containing the flatbuffer - assumes one and will only save last output + # tosaout is generated even for partial/broken subgraph capture to aid in debg + # delegated.pte containing the flatbuffer within the executorch flatbuffer binary + # + print(f"\n\033[96mProcessing:::{op}\033[0m") + print(f"\033[96mDebug output path for intermediates: {output_path}\033[0m") + + os.makedirs(output_path, exist_ok=True) + + # Debug output for TORCH + TORCH_OUT_PATH = os.path.join(output_path, op, "torch", "") + os.makedirs(TORCH_OUT_PATH, exist_ok=True) + + # Debug output for TOSA + TOSA_OUT_PATH = os.path.join(output_path, op, "tosa", "") + os.makedirs(TOSA_OUT_PATH, exist_ok=True) + + model, inputs, 
torch_output = prepare_model_and_ref(op, profile) + + if inputs is None: + print("\033[96m Skipping, model has no inputs for TOSA profile \033[0m") + return + + print(f" Model: {op}\n Inputs: {inputs}\n Outputs: {torch_output}") + + # Export model + model_capture = exir.capture(model, inputs, _CAPTURE_CONFIG) + model_edge = model_capture.to_edge(_EDGE_COMPILE_CONFIG) + + # Partition with ArmBackend + ArmPartitioner.compile_spec = [CompileSpec("debug_tosa_path", bytes(TOSA_OUT_PATH, "utf8"))] + model_edge.exported_program = to_backend( + model_edge.transform(DuplicateDequantNodePass()).exported_program, + ArmPartitioner, + ) + exec_prog = model_edge.to_executorch() + + # Save .pte including delegated Vela section + with open(TORCH_OUT_PATH + "/delegated.pte", "wb") as fh: + fh.write(exec_prog.buffer) + + # NOTE: + # Additional steps from here are optional but can be helpful with + # debug as they will capture the inputs and outputs as well as running + # the intermediate output on the tosa_reference_model. + # This can ensure the compilation flow is working correctly as part of + # a development loop, ahead of running the example on hardware. 
+ + # Save inputs for TOSA reference run + tosa_ref_capture_inputs(model_edge, inputs, TOSA_OUT_PATH, {}, {}, profile) + + # Save ground truth results to file + with open(TORCH_OUT_PATH + "/torch_output.npy", "wb") as f: + np.save(f, torch_output.detach().numpy()) + + # Convert TOSA Flatbuffer into JSON format for human debugging + cmd_flatc = ( + "flatc" + + " -o " + + TOSA_OUT_PATH + + " --raw-binary -t ./backends/arm/third-party/serialization_lib/schema/tosa.fbs -- " + + TOSA_OUT_PATH + + "/output.tosa" + ) + subprocess.run([cmd_flatc], shell=True, check=True) + + ### Run the TOSA flatbuffer through TOSA Ref_Model and print the results + DESC_FILE_NAME = "/desc.json" + DESC_FILE_PATH = TOSA_OUT_PATH + DESC_FILE_NAME + cmd_ref_model = TOSA_REF_MODEL_PATH + " --test_desc " + DESC_FILE_PATH + subprocess.run([cmd_ref_model], shell=True, check=True) + + ## Load in the JSON File, Read the tosa output + desc_file = open(DESC_FILE_PATH) + desc_json = json.load(desc_file) + tosa_out_filenames = desc_json["ofm_file"] + for tosa_out_fm_file_name in tosa_out_filenames: + f = open(TOSA_OUT_PATH + "/" + tosa_out_fm_file_name, "rb") + tosa_output = np.load(f) + + ## Read the Torch Output + torch_file = open(TORCH_OUT_PATH + "/torch_output.npy", "rb") + torch_output = np.load(torch_file) + + ## Compare Tosa and Torch Results + if np.allclose(tosa_output, torch_output, rtol=1e-1, atol=1e-1, equal_nan=True): + print( + "\033[92m" + + "Torch and Tosa Reference results are matching for operator: " + + op + + " from " + + str(str(profile)) + + "\033[0m" + ) + + else: + print("\033[91m" + "Sorry, Torch and Tosa Reference Results Do not Match!") + print("============================") + print("TOSA Output Shape is: " + str(tosa_output.shape)) + print("TOSA Output is: ") + print(tosa_output) + print("\033[93m") + print("============================") + print("Torch Output Shape is: " + str(torch_output.shape)) + print("Torch Output is: ") + print(torch_output) + print("\033[0m") + + if 
profile in ( TosaProfile.BI, TosaProfile.BI_INT ): + cmd_vela = "cd " + TOSA_OUT_PATH + "; " + VELA_COMPILER_PATH + " ./output.tosa" + try: + subprocess.run([cmd_vela], shell=True, check=True) + print("\033[92m" + "Vela compile worked for: " + op + "\033[0m") + except: + print("\033[91m" + "Vela compile failed for: " + op + "\033[0m") + else: + print("\033[96m" + "Skipping Vela test on non-BI profile." + "\033[0m") + + +# Temp systest mode for running all models against both inference profiles +if __name__ == "__main__": + for op in EXAMPLE_TEST_LIST: + run_test(op, profile=TosaProfile.BI_INT) From f40ab5f587e28ce399034677f656bbd935221582 Mon Sep 17 00:00:00 2001 From: Rob Elliott Date: Thu, 5 Oct 2023 09:57:52 +0000 Subject: [PATCH 09/25] Generate pte for delegate test on the fly Signed-off-by: Rob Elliott --- .../0007-Add-delegate-runner-test.patch | 166 ++++++++---------- examples/arm/run.sh | 17 +- 2 files changed, 92 insertions(+), 91 deletions(-) diff --git a/examples/arm/ethos-u-setup/core_platform/patches/0007-Add-delegate-runner-test.patch b/examples/arm/ethos-u-setup/core_platform/patches/0007-Add-delegate-runner-test.patch index 5e9e72b098c..1f6dd480897 100644 --- a/examples/arm/ethos-u-setup/core_platform/patches/0007-Add-delegate-runner-test.patch +++ b/examples/arm/ethos-u-setup/core_platform/patches/0007-Add-delegate-runner-test.patch @@ -1,23 +1,41 @@ -From 6bd0ad55504da811159c58abeb5305a7a1ab884a Mon Sep 17 00:00:00 2001 +From d9d89c7a1d45df7c7aab3142c47f1ff797e531fe Mon Sep 17 00:00:00 2001 From: Rob Elliott Date: Wed, 4 Oct 2023 13:31:33 +0000 Subject: [PATCH] Add delegate runner test Signed-off-by: Rob Elliott --- - applications/executorch_tests/CMakeLists.txt | 19 +- - applications/executorch_tests/add.pte.h | 70 ++++++++ + applications/executorch_tests/CMakeLists.txt | 27 ++- + .../executorch_tests/pte_to_header.py | 11 +- .../executorch_tests/runner_delegate.cpp | 162 ++++++++++++++++++ cmake/toolchain/arm-none-eabi-gcc.cmake | 6 +- - 4 files 
changed, 251 insertions(+), 6 deletions(-) - create mode 100644 applications/executorch_tests/add.pte.h + 4 files changed, 197 insertions(+), 9 deletions(-) create mode 100644 applications/executorch_tests/runner_delegate.cpp diff --git a/applications/executorch_tests/CMakeLists.txt b/applications/executorch_tests/CMakeLists.txt -index c95d53e..943ade5 100644 +index c95d53e..1118469 100644 --- a/applications/executorch_tests/CMakeLists.txt +++ b/applications/executorch_tests/CMakeLists.txt -@@ -44,6 +44,7 @@ message("**********************") +@@ -28,22 +28,26 @@ set(ET_DIR_PATH "<..>/executorch" CACHE PATH "Path to ExecuTorch dir") + set(ET_BUILD_DIR_PATH "${ET_DIR_PATH}/cmake-out" CACHE PATH "Path to ExecuTorch build dir") + set(ET_INCLUDE_PATH "${ET_DIR_PATH}/.." CACHE PATH "Path to ExecuTorch headers") + set(ET_PTE_FILE_PATH "${ET_PTE_FILE_PATH}" CACHE PATH "Path to ExecuTorch model pte") ++set(ET_PTE_DELEGATE_FILE_PATH "${ET_PTE_DELGATE__FILE_PATH}" CACHE PATH "Path to ExecuTorch delegate model pte") + + get_filename_component(ET_BUILD_DIR_PATH ${ET_BUILD_DIR_PATH} REALPATH) + get_filename_component(ET_DIR_PATH ${ET_DIR_PATH} REALPATH) + get_filename_component(ET_INCLUDE_PATH ${ET_INCLUDE_PATH} REALPATH) + get_filename_component(ET_PTE_FILE_PATH ${ET_PTE_FILE_PATH} REALPATH) ++get_filename_component(ET_PTE_DELEGATE_FILE_PATH ${ET_PTE_DELEGATE_FILE_PATH} REALPATH) + + message("**********************") + message("ExecuTorch dir (ET_DIR_PATH) : ${ET_DIR_PATH}") + message("ExecuTorch build dir(ET_BUILD_DIR_PATH) : ${ET_BUILD_DIR_PATH}") + message("ExecuTorch headers (ET_INCUDE_PATH) : ${ET_INCLUDE_PATH}") + message("ExecuTorch pte file (ET_PTE_FILE_PATH) : ${ET_PTE_FILE_PATH}") ++message("ExecuTorch pte delegate file (ET_PTE_DELEGATE_FILE_PATH) : ${ET_PTE_DELEGATE_FILE_PATH}") + message("**********************") + set(LIB_ET_RUNTIME "${ET_BUILD_DIR_PATH}/libexecutorch.a") set(LIB_ET_OP_REGISTRATION "${ET_BUILD_DIR_PATH}/kernels/portable/libportable_ops_lib.a") 
set(LIB_ET_OP_KERNELS "${ET_BUILD_DIR_PATH}/kernels/portable/libportable_kernels.a") @@ -25,7 +43,20 @@ index c95d53e..943ade5 100644 add_custom_target( gen_model_header ALL -@@ -67,10 +68,24 @@ ethosu_add_executable_test(executor_runner PRIVATE +@@ -54,8 +58,11 @@ add_custom_command( + OUTPUT + ${CMAKE_CURRENT_BINARY_DIR}/fake_dep + ${CMAKE_CURRENT_BINARY_DIR}/model_pte.h ++ ${CMAKE_CURRENT_BINARY_DIR}/model_delegate_pte.h + COMMAND ${PYTHON_EXECUTABLE} ./pte_to_header.py --pte ${ET_PTE_FILE_PATH} +- --out ${CMAKE_CURRENT_BINARY_DIR} ++ --outdir ${CMAKE_CURRENT_BINARY_DIR} ++ COMMAND ${PYTHON_EXECUTABLE} ./pte_to_header.py --pte ${ET_PTE_DELEGATE_FILE_PATH} ++ --outdir ${CMAKE_CURRENT_BINARY_DIR} --outfile model_delegate_pte.h + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + ) + +@@ -67,10 +74,24 @@ ethosu_add_executable_test(executor_runner PRIVATE ${LIB_ET_OP_REGISTRATION} ${LIB_ET_OP_KERNELS}) @@ -52,85 +83,42 @@ index c95d53e..943ade5 100644 + + # TODO Memory setup -diff --git a/applications/executorch_tests/add.pte.h b/applications/executorch_tests/add.pte.h -new file mode 100644 -index 0000000..05bc0ec ---- /dev/null -+++ b/applications/executorch_tests/add.pte.h -@@ -0,0 +1,70 @@ -+__attribute__((section(".sram.data"), aligned(16))) char add_pte[] = { -+0x24, 0x00, 0x00, 0x00, 0x45, 0x54, 0x31, 0x32, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x08, 0x00, -+0x0c, 0x00, 0x10, 0x00, 0x0e, 0x00, 0x00, 0x00, 0xf4, 0x04, 0x00, 0x00, 0xdc, 0x04, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, -+0x04, 0x00, 0x00, 0x00, 0x36, 0xf9, 0xff, 0xff, 0x04, 0x00, 0x00, 0x00, 0xb0, 0x04, 0x00, 0x00, 0x76, 0x65, 0x6c, 0x61, 0x5f, 0x62, 0x69, 0x6e, 0x5f, 0x73, 0x74, 0x72, 0x65, 0x61, 0x6d, 0x00, -+0x63, 0x6d, 0x64, 0x5f, 0x64, 0x61, 0x74, 0x61, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x74, 0x01, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, -+0x43, 0x4f, 0x50, 0x31, 0x01, 0x00, 0x10, 0x00, 0x07, 0x18, 0x00, 0x00, 0x00, 0x00, 0x06, 0x10, 0x05, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x02, 0x00, 0x55, 0x00, -+0x25, 0x40, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x26, 0x40, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x24, 0x40, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0f, 0x01, 0x01, 0x00, 0x00, 0x40, 0x00, 0x00, -+0x00, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0b, 0x01, 0x00, 0x00, -+0x0c, 0x01, 0x00, 0x00, 0x0a, 0x01, 0x00, 0x00, 0x04, 0x01, 0x04, 0x00, 0x06, 0x40, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x05, 0x40, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x04, 0x40, 0x00, 0x00, -+0x14, 0x00, 0x00, 0x00, 0x09, 0x01, 0x00, 0x00, 0x05, 0x01, 0x09, 0x00, 0x07, 0x01, 0x00, 0x00, 0x1f, 0x01, 0x01, 0x00, 0x10, 0x40, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x11, 0x40, 0x00, 0x00, -+0x00, 0x00, 0x00, 0x00, 0x12, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x13, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1b, 0x01, 0x00, 0x00, 0x1c, 0x01, 0x00, 0x00, 0x1a, 0x01, 0x00, 0x00, -+0x12, 0x01, 0x00, 0x00, 0x11, 0x01, 0x00, 0x00, 0x13, 0x01, 0x04, 0x00, 0x16, 0x40, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x15, 0x40, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x14, 0x40, 0x00, 0x00, -+0x14, 0x00, 0x00, 0x00, 0x18, 0x01, 0x00, 0x00, 0x14, 0x01, 0x05, 0x01, 0x25, 0x01, 0x00, 0x00, 0x26, 0x01, 0x00, 0x80, 0x27, 0x01, 0xff, 0x7f, 0x16, 0x01, 0x00, 0x00, 0x15, 0x01, 0x01, 0x00, -+0x17, 0x01, 0x07, 0x00, 0x0d, 0x01, 0x16, 0x00, 0x2d, 0x01, 0x16, 0x00, 0x8d, 0x01, 0x0a, 0x00, 0x24, 0x01, 0x00, 0x00, 0x8f, 0x01, 0x01, 0x00, 0x80, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, -+0x81, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x82, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x83, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8b, 0x01, 0x00, 0x00, 0x8c, 0x01, 0x00, 0x00, 
-+0x8a, 0x01, 0x00, 0x00, 0x86, 0x40, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x85, 0x40, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x84, 0x40, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x89, 0x01, 0x00, 0x00, -+0x85, 0x01, 0x09, 0x00, 0x80, 0x01, 0x00, 0x00, 0x2f, 0x01, 0x00, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, -+0x77, 0x65, 0x69, 0x67, 0x68, 0x74, 0x5f, 0x64, 0x61, 0x74, 0x61, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, -+0x77, 0x65, 0x69, 0x67, 0x68, 0x74, 0x5f, 0x72, 0x65, 0x67, 0x69, 0x6f, 0x6e, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, -+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x73, 0x63, 0x72, 0x61, 0x74, 0x63, 0x68, 0x5f, 0x73, 0x68, 0x61, 0x70, 0x65, 0x00, 0x00, 0x00, -+0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, -+0x73, 0x63, 0x72, 0x61, 0x74, 0x63, 0x68, 0x5f, 0x72, 0x65, 0x67, 0x69, 0x6f, 0x6e, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, -+0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x73, 0x63, 0x72, 0x61, 0x74, 0x63, 0x68, 0x5f, 0x66, 0x61, 0x73, 0x74, 0x5f, 0x73, 0x68, 0x00, -+0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, -+0x73, 0x63, 0x72, 0x61, 0x74, 0x63, 0x68, 0x5f, 0x66, 0x61, 0x73, 0x74, 0x5f, 0x72, 0x65, 0x00, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, -+0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x5f, 0x73, 0x68, 0x61, 0x70, 0x65, 0x00, 0x00, 0x00, 0x00, 0x00, -+0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, -+0x69, 0x6e, 0x70, 0x75, 0x74, 0x5f, 0x65, 0x6c, 0x65, 0x6d, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x00, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, -+0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x5f, 0x72, 0x65, 0x67, 0x69, 0x6f, 0x6e, 0x00, 0x00, 0x00, 0x00, -+0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, -+0x69, 0x6e, 0x70, 0x75, 0x74, 0x5f, 0x6f, 0x66, 0x66, 0x73, 0x65, 0x74, 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, -+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x6f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x5f, 0x73, 0x68, 0x61, 0x70, 0x65, 0x00, 0x00, 0x00, 0x00, -+0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, -+0x6f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x5f, 0x65, 0x6c, 0x65, 0x6d, 0x5f, 0x73, 0x69, 0x7a, 0x00, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, -+0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x6f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x5f, 0x72, 0x65, 0x67, 0x69, 0x6f, 0x6e, 0x00, 0x00, 0x00, -+0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, -+0x6f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x5f, 0x6f, 0x66, 0x66, 0x73, 0x65, 0x74, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, -+0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x73, 0x63, 0x72, 0x61, 0x74, 0x63, 0x68, 0x5f, 0x64, 0x61, 0x74, 0x61, 0x00, 0x00, 0x00, 0x00, -+0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, -+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, -+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x76, 0x65, 0x6c, 0x61, 0x5f, 0x65, 0x6e, 0x64, 0x5f, 0x73, 0x74, 0x72, 0x65, 0x61, 0x6d, 0x00, -+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0xfe, 0xff, 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, -+0x1c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x16, 0x00, 0x28, 0x00, 0x04, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x10, 0x00, 0x14, 0x00, 0x18, 0x00, 0x1c, 0x00, 0x20, 0x00, 0x24, 0x00, 0x16, 0x00, 0x00, 0x00, -+0x44, 0x03, 0x00, 0x00, 0xf0, 0x01, 0x00, 0x00, 0x2c, 0x01, 0x00, 0x00, 0x20, 0x01, 0x00, 0x00, 0x14, 0x01, 0x00, 0x00, 0xac, 0x00, 0x00, 0x00, 0xa4, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, -+0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, -+0x76, 0xff, 0xff, 0xff, 0x68, 0x00, 0x00, 0x00, 0x60, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x6c, 0xfe, 0xff, 0xff, 0x34, 
0x00, 0x00, 0x00, -+0x04, 0x00, 0x00, 0x00, 0x27, 0x00, 0x00, 0x00, 0x2f, 0x74, 0x6d, 0x70, 0x2f, 0x61, 0x72, 0x6d, 0x5f, 0x74, 0x6f, 0x73, 0x61, 0x5f, 0x6d, 0x7a, 0x74, 0x66, 0x5f, 0x62, 0x76, 0x67, 0x2f, 0x73, -+0x69, 0x6d, 0x70, 0x6c, 0x65, 0x5f, 0x61, 0x64, 0x64, 0x2f, 0x74, 0x6f, 0x73, 0x61, 0x2f, 0x00, 0x0f, 0x00, 0x00, 0x00, 0x64, 0x65, 0x62, 0x75, 0x67, 0x5f, 0x74, 0x6f, 0x73, 0x61, 0x5f, 0x70, -+0x61, 0x74, 0x68, 0x00, 0x04, 0x00, 0x04, 0x00, 0x04, 0x00, 0x00, 0x00, 0x0a, 0x00, 0x00, 0x00, 0x41, 0x72, 0x6d, 0x42, 0x61, 0x63, 0x6b, 0x65, 0x6e, 0x64, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, -+0x01, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0a, 0x00, 0x10, 0x00, 0x04, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x0a, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, -+0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x07, 0x00, 0x08, 0x00, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x0c, 0x00, 0x00, 0x00, -+0x08, 0x00, 0x08, 0x00, 0x00, 0x00, 0x04, 0x00, 0x08, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, -+0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x60, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, -+0xb0, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x05, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x12, 0x00, 0x14, 0x00, 0x07, 0x00, 0x00, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x00, -+0x12, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x20, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x88, 0xff, 0xff, 0xff, 0x01, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, -+0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x08, 0x00, 0x0e, 0x00, 0x07, 0x00, 0x08, 0x00, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x05, -+0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x12, 0x00, 0x16, 0x00, 0x07, 0x00, 0x00, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x00, 0x12, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, -+0x24, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x06, 0x00, 0x08, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, -+0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x04, 0x00, 0x08, 0x00, 0x08, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, -+0x39, 0x00, 0x00, 0x00, 0x5b, 0x31, 0x2c, 0x20, 0x7b, 0x22, 0x74, 0x79, 0x70, 0x65, 0x22, 0x3a, 0x20, 0x6e, 0x75, 0x6c, 0x6c, 0x2c, 0x20, 0x22, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x22, -+0x3a, 0x20, 0x6e, 0x75, 0x6c, 0x6c, 0x2c, 0x20, 0x22, 0x63, 0x68, 0x69, 0x6c, 0x64, 0x72, 0x65, 0x6e, 0x5f, 0x73, 0x70, 0x65, 0x63, 0x22, 0x3a, 0x20, 0x5b, 0x5d, 0x7d, 0x5d, 0x00, 0x00, 0x00, -+0xfe, 0x00, 0x00, 0x00, 0x5b, 0x31, 0x2c, 0x20, 0x7b, 0x22, 0x74, 0x79, 0x70, 0x65, 0x22, 0x3a, 0x20, 0x22, 0x62, 0x75, 0x69, 0x6c, 0x74, 0x69, 0x6e, 0x73, 0x2e, 0x74, 0x75, 0x70, 0x6c, 0x65, -+0x22, 0x2c, 0x20, 0x22, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x22, 0x3a, 0x20, 0x22, 0x6e, 0x75, 0x6c, 0x6c, 0x22, 0x2c, 0x20, 0x22, 0x63, 0x68, 0x69, 0x6c, 0x64, 0x72, 0x65, 0x6e, 0x5f, -+0x73, 0x70, 0x65, 0x63, 0x22, 0x3a, 0x20, 0x5b, 0x7b, 0x22, 0x74, 0x79, 0x70, 0x65, 0x22, 0x3a, 0x20, 0x22, 0x62, 0x75, 0x69, 0x6c, 0x74, 0x69, 0x6e, 0x73, 0x2e, 0x74, 0x75, 0x70, 0x6c, 0x65, -+0x22, 0x2c, 0x20, 0x22, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x22, 0x3a, 0x20, 0x22, 0x6e, 0x75, 0x6c, 0x6c, 0x22, 0x2c, 0x20, 0x22, 0x63, 0x68, 0x69, 0x6c, 0x64, 0x72, 0x65, 0x6e, 0x5f, -+0x73, 0x70, 0x65, 0x63, 0x22, 0x3a, 0x20, 0x5b, 0x7b, 0x22, 0x74, 0x79, 0x70, 0x65, 0x22, 0x3a, 0x20, 0x6e, 0x75, 0x6c, 0x6c, 0x2c, 0x20, 0x22, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x22, -+0x3a, 0x20, 0x6e, 0x75, 0x6c, 0x6c, 0x2c, 0x20, 0x22, 0x63, 0x68, 0x69, 0x6c, 0x64, 0x72, 0x65, 
0x6e, 0x5f, 0x73, 0x70, 0x65, 0x63, 0x22, 0x3a, 0x20, 0x5b, 0x5d, 0x7d, 0x5d, 0x7d, 0x2c, 0x20, -+0x7b, 0x22, 0x74, 0x79, 0x70, 0x65, 0x22, 0x3a, 0x20, 0x22, 0x62, 0x75, 0x69, 0x6c, 0x74, 0x69, 0x6e, 0x73, 0x2e, 0x64, 0x69, 0x63, 0x74, 0x22, 0x2c, 0x20, 0x22, 0x63, 0x6f, 0x6e, 0x74, 0x65, -+0x78, 0x74, 0x22, 0x3a, 0x20, 0x22, 0x5b, 0x5d, 0x22, 0x2c, 0x20, 0x22, 0x63, 0x68, 0x69, 0x6c, 0x64, 0x72, 0x65, 0x6e, 0x5f, 0x73, 0x70, 0x65, 0x63, 0x22, 0x3a, 0x20, 0x5b, 0x5d, 0x7d, 0x5d, -+0x7d, 0x5d, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x66, 0x6f, 0x72, 0x77, 0x61, 0x72, 0x64, 0x00, }; +diff --git a/applications/executorch_tests/pte_to_header.py b/applications/executorch_tests/pte_to_header.py +index 37d88aa..be3282d 100644 +--- a/applications/executorch_tests/pte_to_header.py ++++ b/applications/executorch_tests/pte_to_header.py +@@ -30,11 +30,18 @@ parser.add_argument( + ) + parser.add_argument( + "--outdir", +- help="Output dir for model_pte.h", ++ help="Output dir for model header", + type=str, + required=False, + default=".", + ) ++parser.add_argument( ++ "--outfile", ++ help="Output filename for model header", ++ type=str, ++ required=False, ++ default="model_pte.h", ++) + parser.add_argument( + "--section", + help="Section attribute for the data array", +@@ -43,7 +50,7 @@ parser.add_argument( + default=".sram.data", + ) + args = parser.parse_args() +-outfile = os.path.join(args.outdir, "model_pte.h") ++outfile = os.path.join(args.outdir, args.outfile) + attr = f'__attribute__((section("{args.section}"), aligned(16))) char ' + + with open(args.pte, "rb") as fr, open( diff --git a/applications/executorch_tests/runner_delegate.cpp b/applications/executorch_tests/runner_delegate.cpp new file mode 100644 -index 0000000..a4024bb +index 0000000..6af6a92 --- /dev/null +++ b/applications/executorch_tests/runner_delegate.cpp @@ -0,0 +1,162 @@ @@ -173,7 +161,7 @@ index 0000000..a4024bb + ****************************************************************************/ + +// Our 
.pte file generated from the AoT flow -+#include "add.pte.h" ++#include "model_delegate_pte.h" // contains model_pte + +// Storage for intermediate data in SRAM +__attribute__((section(".sram.data"), aligned(16))) uint8_t method_allocator_pool[4 * 1024U]; @@ -218,11 +206,11 @@ index 0000000..a4024bb + using torch::executor::Result; + using torch::executor::Error; + -+ // Load pte from the global add_pte .pte file loaded into SRAM. -+ auto loader = torch::executor::util::BufferDataLoader(add_pte, sizeof(add_pte)); ++ // Load pte from the global model_pte .pte file loaded into SRAM. ++ auto loader = torch::executor::util::BufferDataLoader(model_pte, sizeof(model_pte)); + Result program = torch::executor::Program::load(&loader); + if(!program.ok()) { -+ printf("main: Program loading failed @ 0x%p: 0x%x", add_pte, (int)program.error()); ++ printf("main: Program loading failed @ 0x%p: 0x%x", model_pte, (int)program.error()); + } + printf("main: Model buffer loaded, has %u methods\n", program->num_methods()); + diff --git a/examples/arm/run.sh b/examples/arm/run.sh index ef0a6d560a3..f4ef588c4a5 100755 --- a/examples/arm/run.sh +++ b/examples/arm/run.sh @@ -43,6 +43,16 @@ function generate_pte_file() { echo "${pte_file}" } +# Generate the ethos delegate PTE file +function generate_ethos_pte_file() { + cd $et_root_dir + python3 examples/backend/arm/arm_ethosu_minimal.py &> /dev/null + cd ./ethosout/simple_add/torch/ + local pte_file=$(readlink -f ./delegated.pte) + [[ -f ${pte_file} ]] || { echo "Failed to generate a pte file - ${pte_file}"; exit 1; } + echo "${pte_file}" +} + # build ExecuTorch Libraries function build_executorch() { [[ -d "${et_build_dir}" ]] \ @@ -74,8 +84,9 @@ function build_executorch() { # build Arm Baremetal executor_runner function build_executorch_runner() { - [[ $# -ne 1 ]] && { echo "[${FUNCNAME[0]}]" "Expecting pte file as an argument got, $*"; exit 1; } + [[ $# -ne 2 ]] && { echo "[${FUNCNAME[0]}]" "Expecting 2 pte files as arguments got, 
$*"; exit 1; } local pte=${1} + local pte_delegate=${2} cd "${ethos_u_root_dir}"/core_platform cmake \ -DCMAKE_TOOLCHAIN_FILE=${toolchain_cmake} \ @@ -83,6 +94,7 @@ function build_executorch_runner() { -DET_DIR_PATH:PATH=${et_root_dir} \ -DET_BUILD_DIR_PATH:PATH=${et_build_dir} \ -DET_PTE_FILE_PATH:PATH="${pte}" \ + -DET_PTE_DELEGATE_FILE_PATH:PATH="${pte_delegate}" \ -DPYTHON_EXECUTABLE=$(which python3) echo "[${FUNCNAME[0]}] Configured CMAKE" @@ -147,12 +159,13 @@ type ${buck2} 2>&1 > /dev/null \ # get the pte pte=$(generate_pte_file) +pte_delegate=$(generate_ethos_pte_file) # build et build_executorch # build the et baremetal app -build_executorch_runner "${pte}" +build_executorch_runner "${pte}" "${pte_delegate}" # run the app run_fvp From d10d620b45c0df03dd3a73fe771186350c8c8bae Mon Sep 17 00:00:00 2001 From: Rob Elliott Date: Thu, 5 Oct 2023 14:28:49 +0000 Subject: [PATCH 10/25] Added support for variable input output patterns Signed-off-by: Rob Elliott --- backends/arm/arm_backend.py | 17 +++- backends/arm/runtime/ArmBackendEthosU.cpp | 105 ++++++++++++++++++---- backends/arm/test/test_models.py | 12 +++ examples/arm/arm_ethosu_minimal.py | 2 +- 4 files changed, 113 insertions(+), 23 deletions(-) diff --git a/backends/arm/arm_backend.py b/backends/arm/arm_backend.py index 8185718f45e..2e5cf4d9645 100644 --- a/backends/arm/arm_backend.py +++ b/backends/arm/arm_backend.py @@ -14,6 +14,7 @@ import os import subprocess import tempfile +import struct from typing import final, List import numpy as np @@ -176,7 +177,19 @@ def vela_compile(tosa_fb): for key in data.keys(): block_name = bytes(key, "utf8")[:15] block_name = block_name + b"\x00" * (16 - len(block_name)) - block_data = data[key].tobytes() + + block_data = b'' + if key in ( "input_shape", "output_shape" ): + inputs = data[key] + # Encode a struct of int len; and one or more int x,y,z,w shape; + input_struct = struct.pack(" +#include #include #include @@ -19,6 +20,8 @@ #include #include +using namespace 
std; + namespace torch { namespace executor { @@ -29,7 +32,6 @@ namespace executor { class ArmBackend final : public PyTorchBackendInterface { public: - ArmBackend() {} ~ArmBackend() = default; @@ -86,7 +88,7 @@ class ArmBackend final : public PyTorchBackendInterface { ET_LOG(Info, "ArmBackend::execute %p", processed->data()); - vela_handles handles = {0, 0, 0, 0, 0, 0}; + vela_handles handles; // Command stream - we know at this point it's aligned char* data = (char*)processed->data(); @@ -107,16 +109,45 @@ class ArmBackend final : public PyTorchBackendInterface { handles.scratch_data, handles.scratch_data_size); + printf("Processed inputs %d\n", handles.input_shape.size()); + for (int i = 0; i < handles.input_shape.size(); i++) + printf( + " %d %d %d %d\n", + handles.input_shape[i][0], + handles.input_shape[i][1], + handles.input_shape[i][2], + handles.input_shape[i][3]); + + // Input data from EValue + const char* input_addr = handles.scratch_data + handles.input_offset; + printf( + "accessing ethos input data at %p, offset %d\n", + handles.scratch_data, + handles.input_offset); + // Inputs are in the index first + int input_index = + 0; // handles.input_shape.size(); TODO: loop this for multiple inputs + printf("writing input to EValue input index %d\n", input_index); + + // Process input EValue into scratch + // TODO: optimise into direct write for compatible layouts + // is this contiguous for a memcpy of e_size*numel? 
+ int* input_address = (int*)input_addr; + auto tensor_in = args[input_index]->toTensor(); + for (int j = 0; j < tensor_in.numel(); j++) { + // TODO: extend beyond 4 byte tensors + input_address[j] = tensor_in.mutable_data_ptr()[j]; + } + // TMP emit scratch - printf("Scratch before:\n"); + printf("Scratch after setup:\n"); for (int i = 0; i < handles.scratch_data_size; i++) { - if (i % 4 == 0) - ((char*)handles.scratch_data)[i] = 1; printf("%02x ", ((char*)handles.scratch_data)[i]); if (!((i + 1) % 4)) printf("\n"); } printf("\n"); + // END TMP emit scratch // Allocate driver handle and synchronously invoke driver ethosu_driver* drv = ethosu_reserve_driver(); @@ -151,13 +182,33 @@ class ArmBackend final : public PyTorchBackendInterface { } printf("\n"); + printf("Processed outputs %d\n", handles.output_shape.size()); + for (int i = 0; i < handles.output_shape.size(); i++) + printf( + " %d %d %d %d\n", + handles.output_shape[i][0], + handles.output_shape[i][1], + handles.output_shape[i][2], + handles.output_shape[i][3]); + + // output data from Ethos U + const char* output_addr = handles.scratch_data + handles.output_offset; + printf( + "accessing ethos output data at %p, offset %d\n", + handles.scratch_data, + handles.output_offset); + // Outputs are in the index immediately after inputs + int output_index = handles.input_shape.size(); + printf("writing output to EValue output index %d\n", output_index); + // Process results into EValue storage // TODO: optimise into direct write for compatible layouts - // TODO: get num in/out and layout? - int* output_address = (int*)(handles.scratch_data + handles.output_offset); - auto tensor = args[1]->toTensor(); - for (int j = 0; j < tensor.numel(); j++) { - tensor.mutable_data_ptr()[j] = output_address[j]; + // is this contiguous for a memcpy of e_size*numel? 
+ int* output_address = (int*)output_addr; + auto tensor_out = args[output_index]->toTensor(); + for (int j = 0; j < tensor_out.numel(); j++) { + // TODO: extend beyond 4 byte tensors + tensor_out.mutable_data_ptr()[j] = output_address[j]; } return Error::Ok; @@ -176,9 +227,9 @@ class ArmBackend final : public PyTorchBackendInterface { const char* scratch_data; size_t scratch_data_size; size_t input_offset; - size_t input_data_shape[3]; + vector> input_shape; size_t output_offset; - size_t output_data_shape[3]; + vector> output_shape; } vela_handles; typedef struct { @@ -188,6 +239,11 @@ class ArmBackend final : public PyTorchBackendInterface { char data[]; } vela_bin_block; + typedef struct { + int count; + int shape[][4]; + } vela_shapes; + static int next_mul_16(int n) { return ((n - 1) | 15) + 1; } @@ -217,7 +273,6 @@ class ArmBackend final : public PyTorchBackendInterface { } if (!strncmp(b->name, "weight_data", strlen("weight_data"))) { h->weight_data = b->data; - ; h->weight_data_size = b->size; } if (!strncmp(b->name, "scratch_data", strlen("scratch_data"))) { @@ -237,14 +292,26 @@ class ArmBackend final : public PyTorchBackendInterface { h->output_offset = ((int*)b->data)[0]; } if (!strncmp(b->name, "input_shape", strlen("input_shape"))) { - h->input_data_shape[0] = ((int*)b->data)[0]; - h->input_data_shape[0] = ((int*)b->data)[1]; - h->input_data_shape[0] = ((int*)b->data)[2]; + vela_shapes* shapes = (vela_shapes*)b->data; + for (int i = 0; i < shapes->count; i++) { + vector s = { + shapes->shape[i][0], + shapes->shape[i][1], + shapes->shape[i][2], + shapes->shape[i][3]}; + h->input_shape.push_back(s); + } } if (!strncmp(b->name, "output_shape", strlen("output_shape"))) { - h->output_data_shape[0] = ((int*)b->data)[0]; - h->output_data_shape[0] = ((int*)b->data)[1]; - h->output_data_shape[0] = ((int*)b->data)[2]; + vela_shapes* shapes = (vela_shapes*)b->data; + for (int i = 0; i < shapes->count; i++) { + vector s = { + shapes->shape[i][0], + 
shapes->shape[i][1], + shapes->shape[i][2], + shapes->shape[i][3]}; + h->output_shape.push_back(s); + } } } } diff --git a/backends/arm/test/test_models.py b/backends/arm/test/test_models.py index b33007f0c84..1773eb72bfb 100644 --- a/backends/arm/test/test_models.py +++ b/backends/arm/test/test_models.py @@ -77,6 +77,18 @@ def __init__(self): def forward(self, x): return x + x + @register_test + class simple_add_2(torch.nn.Module): + inputs = { + TosaProfile.BI_INT: (torch.ones(5,dtype=torch.int32),torch.ones(5,dtype=torch.int32),), + } + + def __init__(self): + super().__init__() + + def forward(self, x, y): + return x + y + @register_test class simple_add_broadcast(torch.nn.Module): inputs = { diff --git a/examples/arm/arm_ethosu_minimal.py b/examples/arm/arm_ethosu_minimal.py index a41cbb42cd5..62411ca24c3 100644 --- a/examples/arm/arm_ethosu_minimal.py +++ b/examples/arm/arm_ethosu_minimal.py @@ -33,7 +33,7 @@ _check_ir_validity=False, ) -EXAMPLE_TEST_LIST = ["simple_add"] +EXAMPLE_TEST_LIST = ["simple_add", "simple_add_2"] # # From b812898abdf2bc0c197afabd90719b159ccd98f9 Mon Sep 17 00:00:00 2001 From: Rob Elliott Date: Thu, 5 Oct 2023 16:00:59 +0000 Subject: [PATCH 11/25] Handle multiple delegate inputs with SRAM offsets * export the list of offset from AoT floq * appropriately copy inputs and ouputs to/from SRAM Signed-off-by: Rob Elliott --- backends/arm/arm_backend.py | 8 ++- backends/arm/runtime/ArmBackendEthosU.cpp | 84 ++++++++++------------- 2 files changed, 41 insertions(+), 51 deletions(-) diff --git a/backends/arm/arm_backend.py b/backends/arm/arm_backend.py index 2e5cf4d9645..200f5818be5 100644 --- a/backends/arm/arm_backend.py +++ b/backends/arm/arm_backend.py @@ -151,8 +151,6 @@ def dbg_tosa_dump(tosa_fb, path): # WARNING: if this changes, the runtime reader also needs to change def vela_compile(tosa_fb): with tempfile.TemporaryDirectory() as tmpdir: - print(f"compiling to Vela in {tmpdir}") - tosaname = "out.tosa" flatbuffer = 
tosa_fb.serialize() f = open(os.path.join(tmpdir, tosaname), "wb") @@ -188,6 +186,12 @@ def vela_compile(tosa_fb): inp_pad = inp.tolist() + [0] * (4 - len(inp)) input_struct = input_struct + struct.pack("toTensor(); - for (int j = 0; j < tensor_in.numel(); j++) { - // TODO: extend beyond 4 byte tensors - input_address[j] = tensor_in.mutable_data_ptr()[j]; + // Write inputs into SRAM scratch area defined by Vela + for (int i = 0; i < handles.input_shape.size(); i++) { + const char* input_addr = handles.scratch_data + handles.input_offset[i]; + // Process input EValue into scratch + // TODO: optimise into direct write for compatible, contig layout + int* input_address = (int*)input_addr; + auto tensor_in = args[i]->toTensor(); + for (int j = 0; j < tensor_in.numel(); j++) { + // TODO: extend beyond 4 byte tensors + input_address[j] = tensor_in.mutable_data_ptr()[j]; + } } +#if 0 // TMP emit scratch printf("Scratch after setup:\n"); for (int i = 0; i < handles.scratch_data_size; i++) { @@ -148,6 +132,7 @@ class ArmBackend final : public PyTorchBackendInterface { } printf("\n"); // END TMP emit scratch +#endif // Allocate driver handle and synchronously invoke driver ethosu_driver* drv = ethosu_reserve_driver(); @@ -173,6 +158,7 @@ class ArmBackend final : public PyTorchBackendInterface { return Error::InvalidProgram; } +#if 0 // TMP emit scratch printf("Scratch after:\n"); for (int i = 0; i < handles.scratch_data_size; i++) { @@ -181,29 +167,16 @@ class ArmBackend final : public PyTorchBackendInterface { printf("\n"); } printf("\n"); - - printf("Processed outputs %d\n", handles.output_shape.size()); - for (int i = 0; i < handles.output_shape.size(); i++) - printf( - " %d %d %d %d\n", - handles.output_shape[i][0], - handles.output_shape[i][1], - handles.output_shape[i][2], - handles.output_shape[i][3]); +#endif // output data from Ethos U - const char* output_addr = handles.scratch_data + handles.output_offset; - printf( - "accessing ethos output data at %p, offset 
%d\n", - handles.scratch_data, - handles.output_offset); + // We only handle one output at the moment + const char* output_addr = handles.scratch_data + handles.output_offset[0]; // Outputs are in the index immediately after inputs int output_index = handles.input_shape.size(); - printf("writing output to EValue output index %d\n", output_index); // Process results into EValue storage - // TODO: optimise into direct write for compatible layouts - // is this contiguous for a memcpy of e_size*numel? + // TODO: optimise into direct write for compatible, contig layout int* output_address = (int*)output_addr; auto tensor_out = args[output_index]->toTensor(); for (int j = 0; j < tensor_out.numel(); j++) { @@ -226,9 +199,9 @@ class ArmBackend final : public PyTorchBackendInterface { size_t weight_data_size; const char* scratch_data; size_t scratch_data_size; - size_t input_offset; + vector input_offset; vector> input_shape; - size_t output_offset; + vector output_offset; vector> output_shape; } vela_handles; @@ -244,6 +217,11 @@ class ArmBackend final : public PyTorchBackendInterface { int shape[][4]; } vela_shapes; + typedef struct { + int count; + int offsets[]; + } vela_offsets; + static int next_mul_16(int n) { return ((n - 1) | 15) + 1; } @@ -285,12 +263,20 @@ class ArmBackend final : public PyTorchBackendInterface { h->scratch_data = b->data; h->scratch_data_size = b->size; } + if (!strncmp(b->name, "input_offset", strlen("input_offset"))) { - h->input_offset = ((int*)b->data)[0]; + vela_offsets* offsets = (vela_offsets*)b->data; + for (int i = 0; i < offsets->count; i++) { + h->input_offset.push_back(offsets->offsets[i]); + } } if (!strncmp(b->name, "output_offset", strlen("output_offset"))) { - h->output_offset = ((int*)b->data)[0]; + vela_offsets* offsets = (vela_offsets*)b->data; + for (int i = 0; i < offsets->count; i++) { + h->output_offset.push_back(offsets->offsets[i]); + } } + if (!strncmp(b->name, "input_shape", strlen("input_shape"))) { vela_shapes* 
shapes = (vela_shapes*)b->data; for (int i = 0; i < shapes->count; i++) { From 44a46a14eec7fb973a19550b836c6c9bfe90fac6 Mon Sep 17 00:00:00 2001 From: Rob Elliott Date: Thu, 5 Oct 2023 17:03:13 +0000 Subject: [PATCH 12/25] Add TOSA ref model and Vela dependencies Signed-off-by: Rob Elliott --- examples/arm/setup.sh | 48 ++++++- ...001-Improve-rescale-codegen-for-TOSA.patch | 129 ++++++++++++++++++ 2 files changed, 176 insertions(+), 1 deletion(-) create mode 100644 examples/backend/arm/ethos-u-setup/ethos-u-vela/patches/0001-Improve-rescale-codegen-for-TOSA.patch diff --git a/examples/arm/setup.sh b/examples/arm/setup.sh index c3518d3b24f..dace442ea8b 100755 --- a/examples/arm/setup.sh +++ b/examples/arm/setup.sh @@ -146,6 +146,45 @@ function patch_repo() { echo -e "[${FUNCNAME[0]}] Patched ${name} @ $(git describe --all --long 2> /dev/null) in ${repo_dir} dir.\n" } +function setup_tosa_reference_model() { + # The debug flow on the host includes running on a reference implementation of TOSA + # This is useful primarily for debug of quantization accuracy, but also for internal + # errors for the early codebase + cd "${root_dir}" + if [[ ! -e reference_model ]]; then + git clone https://git.mlplatform.org/tosa/reference_model.git -b v0.80.0 + cd reference_model + git submodule update --init --recursive + cd .. + fi + cd reference_model + mkdir -p build + cd build + cmake .. + make + cd reference_model + tosa_bin_path=`pwd` + echo adding ${tosa_bin_path} to path + echo "export PATH=\${PATH}:${tosa_bin_path}" >> ${update_path_script} + cd ../.. + echo back at `pwd` +} + +function setup_vela() { + # + # Prepare the Vela compiler for AoT to Ethos-U compilation + # + cd "${root_dir}/ethos-u/" + if [[ ! -e ethos-u-vela ]]; then + git clone https://git.mlplatform.org/ml/ethos-u/ethos-u-vela.git + name="ethos-u-vela" + base_rev=00a15db3e1a188b25065d095152d701f4394cdc5 + patch_repo + fi + pip install . + cd .. 
+} + ######## ### main ######## @@ -182,6 +221,13 @@ name="core_platform" base_rev=204210b1074071532627da9dc69950d058a809f4 patch_repo +# Setup the tosa_reference_model +setup_tosa_reference_model + +# Setup vela and patch in codegen fixes +setup_vela + echo "[main] update path by doing 'source ${setup_path_script}'" -echo "[main] sucecss!" + +echo "[main] success!" exit 0 diff --git a/examples/backend/arm/ethos-u-setup/ethos-u-vela/patches/0001-Improve-rescale-codegen-for-TOSA.patch b/examples/backend/arm/ethos-u-setup/ethos-u-vela/patches/0001-Improve-rescale-codegen-for-TOSA.patch new file mode 100644 index 00000000000..e131ca76ee8 --- /dev/null +++ b/examples/backend/arm/ethos-u-setup/ethos-u-vela/patches/0001-Improve-rescale-codegen-for-TOSA.patch @@ -0,0 +1,129 @@ +From ef07230fbb15edbf27ecaf48994fb157430a5e7c Mon Sep 17 00:00:00 2001 +From: Rob Elliott +Date: Thu, 5 Oct 2023 16:45:42 +0000 +Subject: [PATCH] Improve rescale codegen for TOSA + +Signed-off-by: Rob Elliott +--- + ethosu/vela/tosa_graph_optimiser.py | 56 +++++++++++------------------ + ethosu/vela/tosa_mapping.py | 2 +- + 2 files changed, 22 insertions(+), 36 deletions(-) + +diff --git a/ethosu/vela/tosa_graph_optimiser.py b/ethosu/vela/tosa_graph_optimiser.py +index df6b575..b2e3697 100644 +--- a/ethosu/vela/tosa_graph_optimiser.py ++++ b/ethosu/vela/tosa_graph_optimiser.py +@@ -337,7 +337,8 @@ def rewrite_concat(op): + + def remove_memory_ops(op, arch): + if op.run_on_npu and op.type in (Op.Reshape, Op.Identity): +- bypass_memory_only_ops(op) ++ # TODO: is this ok - function doesn't use arch or nng ++ bypass_memory_only_ops(op, arch, None) + + + def rewrite_activation(op, arch, nng): +@@ -357,7 +358,6 @@ def rewrite_activation(op, arch, nng): + + return op + +- + def rewrite_rescale(op, arch, nng): + if op.type == Op.Rescale: + ifm = op.ifm +@@ -368,7 +368,7 @@ def rewrite_rescale(op, arch, nng): + prev_op = ifm.ops[0] + + # TODO currently not supported +- assert len(ifm.consumer_list) == 1 ++ 
#assert len(ifm.consumer_list) == 1 + + input_zp = op.attrs["input_zp"] + output_zp = op.attrs["output_zp"] +@@ -390,6 +390,9 @@ def rewrite_rescale(op, arch, nng): + assert False + ifm.quantization.zero_point = input_zp + ofm.quantization.zero_point = output_zp ++ ++ assert False == per_channel, "Don't like per_channel!" ++ + for s, m in zip(shift, multiplier): + # TODO these are the TOSA limitations + assert m >= 0 +@@ -403,45 +406,28 @@ def rewrite_rescale(op, arch, nng): + else: + rounding_mode = RoundingMode.HalfUp + +- if prev_op.type.is_depthwise_conv2d_op() or prev_op.type.is_conv2d_op() or prev_op.type == Op.FullyConnected: ++ fuse = len(ifm.ops) == 1 and prev_op.type.is_depthwise_conv2d_op() or prev_op.type.is_conv2d_op() ++ if fuse: ++ # TODO: ERROR: bias.values didn't exist for an op like Add - presumably not a capability of that op + assert len(multiplier) == len(shift) == len(prev_op.bias.values) +- +- if ifm.dtype == DataType.int32 and per_channel: +- prev_op.explicit_scaling = explicit_scaling +- prev_op.rounding_mode = rounding_mode +- +- # Bypass op +- prev_op.set_output_tensor(ofm) +- DebugDatabase.add_optimised(op, prev_op) +- return op +- else: +- print("Warning, unsupported fusing of TOSA Rescale previous operator is of type:", prev_op.type) +- assert False +- # TODO which are the cases we need to and can do standalone Rescale? +- # TODO should we try to identify a conversion uint8<->int8 accomplished by 2 RESCALE ops? +- # origin might be TFLite op QUANTIZE, should we look to see if they can be translated to QUANTIZE? 
+- # limited to these at the moment: +- elif ( +- (ifm.dtype == DataType.int8 and ofm.dtype == DataType.int8) +- or (ifm.dtype == DataType.uint8 and ofm.dtype == DataType.int8) +- or (ifm.dtype == DataType.int8 and ofm.dtype == DataType.uint8) +- ): +- # Create NOP performing the RESCALE ++ # TODO: generate replacement fusion code from below ++ assert False, "Fusion possible but i've not implemented it" ++ else: ++ # Generate Rescale behaviour attached to a compatible NOP ++ # TODO: I assume this attaches a new operator into the graph?? + avgpool_op = replace_rescale_with_avg_pool(op) + avgpool_op.rounding_mode = rounding_mode +- ++ + if per_channel: +- # TODO +- avgpool_op.explicit_scaling = explicit_scaling +- print("Warning, unsupported TOSA Rescale") +- assert False ++ assert False, "Assert above removed but still not implemented... :/" + else: + avgpool_op.explicit_scaling = explicit_scaling +- else: +- print("Warning, unsupported fusing of TOSA Rescale previous operator is of type:", prev_op.type) +- assert False +- return op + ++ #print( len(multiplier), len(shift), len(prev_op.get_bias_tensors()) ) ++ #print( ifm.dtype, "PC:", per_channel, op.type ) ++ #print( ifm.dtype, ofm.dtype ) ++ ++ return op + + def convert_pad_in_width(op): + """ +diff --git a/ethosu/vela/tosa_mapping.py b/ethosu/vela/tosa_mapping.py +index 2dafd81..ed5aa2e 100644 +--- a/ethosu/vela/tosa_mapping.py ++++ b/ethosu/vela/tosa_mapping.py +@@ -148,7 +148,7 @@ transpose_conv_attrs = AttrSerializer( + ) + transpose_attrs = AttrSerializer("TransposeAttribute", (("perms", is_vec),)) + axis_attrs = AttrSerializer("AxisAttribute", ("axis",)) +-reshape_attrs = AttrSerializer("ReshapeAttribute", (("shape", is_vec),)) ++reshape_attrs = AttrSerializer("ReshapeAttribute", (("newShape", is_vec),)) + slice_attrs = AttrSerializer("SliceAttribute", (("start", is_vec), ("size", is_vec))) + tile_attrs = AttrSerializer("TileAttribute", (("multiplies", is_vec),)) + resize_attrs = AttrSerializer( +-- +2.41.0 
+ From cb16e682da9965104a4e8650ff8e785756f48983 Mon Sep 17 00:00:00 2001 From: Rob Elliott Date: Thu, 5 Oct 2023 17:07:42 +0000 Subject: [PATCH 13/25] Cleanup from lintrunner and other bits of tidyup Signed-off-by: Rob Elliott --- backends/arm/arm_backend.py | 8 ++++---- backends/arm/test/test_models.py | 9 ++++++--- examples/arm/arm_ethosu_minimal.py | 22 +++++++++++----------- 3 files changed, 21 insertions(+), 18 deletions(-) diff --git a/backends/arm/arm_backend.py b/backends/arm/arm_backend.py index 200f5818be5..da1b70e57d6 100644 --- a/backends/arm/arm_backend.py +++ b/backends/arm/arm_backend.py @@ -12,9 +12,9 @@ import logging import operator import os +import struct import subprocess import tempfile -import struct from typing import final, List import numpy as np @@ -176,8 +176,8 @@ def vela_compile(tosa_fb): block_name = bytes(key, "utf8")[:15] block_name = block_name + b"\x00" * (16 - len(block_name)) - block_data = b'' - if key in ( "input_shape", "output_shape" ): + block_data = b"" + if key in ("input_shape", "output_shape"): inputs = data[key] # Encode a struct of int len; and one or more int x,y,z,w shape; input_struct = struct.pack(" Date: Thu, 5 Oct 2023 19:15:31 +0000 Subject: [PATCH 14/25] Removed ethos u driver build and cmsis dependency Signed-off-by: Rob Elliott --- .gitmodules | 3 --- backends/arm/cmake/Dependencies.cmake | 2 -- backends/arm/third-party/cmsis | 1 - 3 files changed, 6 deletions(-) delete mode 160000 backends/arm/third-party/cmsis diff --git a/.gitmodules b/.gitmodules index 0687c0e8b3f..8cb71f3a18e 100644 --- a/.gitmodules +++ b/.gitmodules @@ -46,6 +46,3 @@ [submodule "backends/arm/third-party/ethos-u-core-driver"] path = backends/arm/third-party/ethos-u-core-driver url = https://git.mlplatform.org/ml/ethos-u/ethos-u-core-driver.git -[submodule "backends/arm/third-party/cmsis"] - path = backends/arm/third-party/cmsis - url = https://github.com/ARM-software/CMSIS_5.git diff --git a/backends/arm/cmake/Dependencies.cmake 
b/backends/arm/cmake/Dependencies.cmake index 27a587176bb..fae39dd53b9 100644 --- a/backends/arm/cmake/Dependencies.cmake +++ b/backends/arm/cmake/Dependencies.cmake @@ -6,7 +6,5 @@ set(THIRD_PARTY_ROOT "${CMAKE_CURRENT_SOURCE_DIR}/third-party") # Ethos-U driver -set(DRIVER_ETHOSU_SOURCE_DIR "${THIRD_PARTY_ROOT}/ethos-u-core-driver") set(DRIVER_ETHOSU_INCLUDE_DIR "${THIRD_PARTY_ROOT}/ethos-u-core-driver/include") -add_subdirectory( ${DRIVER_ETHOSU_SOURCE_DIR} ) include_directories( ${DRIVER_ETHOSU_INCLUDE_DIR} ) diff --git a/backends/arm/third-party/cmsis b/backends/arm/third-party/cmsis deleted file mode 160000 index a75f01746df..00000000000 --- a/backends/arm/third-party/cmsis +++ /dev/null @@ -1 +0,0 @@ -Subproject commit a75f01746df18bb5b929dfb8dc6c9407fac3a0f3 From 3a5fd4f10cb29ba7a7f14d92e0b53edf72619403 Mon Sep 17 00:00:00 2001 From: Rob Elliott Date: Thu, 5 Oct 2023 19:55:29 +0000 Subject: [PATCH 15/25] renamed lib ethos_u to executorch_delegate_ethos_u Signed-off-by: Rob Elliott --- backends/arm/CMakeLists.txt | 6 +++--- .../patches/0007-Add-delegate-runner-test.patch | 10 ++++------ 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/backends/arm/CMakeLists.txt b/backends/arm/CMakeLists.txt index 2cc5cf94740..d7b61ce92ad 100644 --- a/backends/arm/CMakeLists.txt +++ b/backends/arm/CMakeLists.txt @@ -20,6 +20,6 @@ include(cmake/Dependencies.cmake) set(_arm_baremetal_sources backends/arm/runtime/ArmBackendEthosU.cpp) list(TRANSFORM _arm_baremetal_sources PREPEND "${EXECUTORCH_ROOT}/") -add_library(ethos_u STATIC ${_arm_baremetal_sources}) -target_include_directories(ethos_u PUBLIC ${_common_include_directories}) -target_include_directories(ethos_u PUBLIC ${DRIVER_ETHOSU_INCLUDE_DIR}) +add_library(executorch_delegate_ethos_u STATIC ${_arm_baremetal_sources}) +target_include_directories(executorch_delegate_ethos_u PUBLIC ${_common_include_directories}) +target_include_directories(executorch_delegate_ethos_u PUBLIC ${DRIVER_ETHOSU_INCLUDE_DIR}) diff 
--git a/examples/arm/ethos-u-setup/core_platform/patches/0007-Add-delegate-runner-test.patch b/examples/arm/ethos-u-setup/core_platform/patches/0007-Add-delegate-runner-test.patch index 1f6dd480897..e80da67153f 100644 --- a/examples/arm/ethos-u-setup/core_platform/patches/0007-Add-delegate-runner-test.patch +++ b/examples/arm/ethos-u-setup/core_platform/patches/0007-Add-delegate-runner-test.patch @@ -1,4 +1,4 @@ -From d9d89c7a1d45df7c7aab3142c47f1ff797e531fe Mon Sep 17 00:00:00 2001 +From 8201e36f90fed6e80fea7021ec4bad325d329bae Mon Sep 17 00:00:00 2001 From: Rob Elliott Date: Wed, 4 Oct 2023 13:31:33 +0000 Subject: [PATCH] Add delegate runner test @@ -13,10 +13,10 @@ Signed-off-by: Rob Elliott create mode 100644 applications/executorch_tests/runner_delegate.cpp diff --git a/applications/executorch_tests/CMakeLists.txt b/applications/executorch_tests/CMakeLists.txt -index c95d53e..1118469 100644 +index c95d53e..835f824 100644 --- a/applications/executorch_tests/CMakeLists.txt +++ b/applications/executorch_tests/CMakeLists.txt -@@ -28,22 +28,26 @@ set(ET_DIR_PATH "<..>/executorch" CACHE PATH "Path to ExecuTorch dir") +@@ -28,20 +28,24 @@ set(ET_DIR_PATH "<..>/executorch" CACHE PATH "Path to ExecuTorch dir") set(ET_BUILD_DIR_PATH "${ET_DIR_PATH}/cmake-out" CACHE PATH "Path to ExecuTorch build dir") set(ET_INCLUDE_PATH "${ET_DIR_PATH}/.." 
CACHE PATH "Path to ExecuTorch headers") set(ET_PTE_FILE_PATH "${ET_PTE_FILE_PATH}" CACHE PATH "Path to ExecuTorch model pte") @@ -37,12 +37,10 @@ index c95d53e..1118469 100644 message("**********************") set(LIB_ET_RUNTIME "${ET_BUILD_DIR_PATH}/libexecutorch.a") ++set(LIB_ET_ETHOS "${ET_BUILD_DIR_PATH}/backends/arm/libexecutorch_delegate_ethos_u.a") set(LIB_ET_OP_REGISTRATION "${ET_BUILD_DIR_PATH}/kernels/portable/libportable_ops_lib.a") set(LIB_ET_OP_KERNELS "${ET_BUILD_DIR_PATH}/kernels/portable/libportable_kernels.a") -+set(LIB_ET_ETHOS "${ET_BUILD_DIR_PATH}/backends/arm/libethos_u.a") - add_custom_target( - gen_model_header ALL @@ -54,8 +58,11 @@ add_custom_command( OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/fake_dep From e67662058dfd69e1bd2407dd3f69017d0c2ebe52 Mon Sep 17 00:00:00 2001 From: Rob Elliott Date: Thu, 5 Oct 2023 20:05:52 +0000 Subject: [PATCH 16/25] lintfix Signed-off-by: Rob Elliott --- backends/arm/CMakeLists.txt | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/backends/arm/CMakeLists.txt b/backends/arm/CMakeLists.txt index d7b61ce92ad..4dcf2ff0539 100644 --- a/backends/arm/CMakeLists.txt +++ b/backends/arm/CMakeLists.txt @@ -20,6 +20,18 @@ include(cmake/Dependencies.cmake) set(_arm_baremetal_sources backends/arm/runtime/ArmBackendEthosU.cpp) list(TRANSFORM _arm_baremetal_sources PREPEND "${EXECUTORCH_ROOT}/") -add_library(executorch_delegate_ethos_u STATIC ${_arm_baremetal_sources}) -target_include_directories(executorch_delegate_ethos_u PUBLIC ${_common_include_directories}) -target_include_directories(executorch_delegate_ethos_u PUBLIC ${DRIVER_ETHOSU_INCLUDE_DIR}) + +add_library( + executorch_delegate_ethos_u + STATIC ${_arm_baremetal_sources} +) +target_include_directories( + executorch_delegate_ethos_u + PUBLIC + ${_common_include_directories} +) +target_include_directories( + executorch_delegate_ethos_u + PUBLIC + ${DRIVER_ETHOSU_INCLUDE_DIR} +) From 4b1125eece425677a762e2b0c92d565cf0d34e84 Mon Sep 17 
00:00:00 2001 From: Rob Elliott Date: Thu, 5 Oct 2023 22:05:21 +0000 Subject: [PATCH 17/25] tidied delegate_runner output Signed-off-by: Rob Elliott --- .../0007-Add-delegate-runner-test.patch | 84 +++++++++---------- 1 file changed, 41 insertions(+), 43 deletions(-) diff --git a/examples/arm/ethos-u-setup/core_platform/patches/0007-Add-delegate-runner-test.patch b/examples/arm/ethos-u-setup/core_platform/patches/0007-Add-delegate-runner-test.patch index e80da67153f..c1270961510 100644 --- a/examples/arm/ethos-u-setup/core_platform/patches/0007-Add-delegate-runner-test.patch +++ b/examples/arm/ethos-u-setup/core_platform/patches/0007-Add-delegate-runner-test.patch @@ -1,4 +1,4 @@ -From 8201e36f90fed6e80fea7021ec4bad325d329bae Mon Sep 17 00:00:00 2001 +From 0fe8caba3068da05021232912c069124a81e0d94 Mon Sep 17 00:00:00 2001 From: Rob Elliott Date: Wed, 4 Oct 2023 13:31:33 +0000 Subject: [PATCH] Add delegate runner test @@ -7,9 +7,9 @@ Signed-off-by: Rob Elliott --- applications/executorch_tests/CMakeLists.txt | 27 ++- .../executorch_tests/pte_to_header.py | 11 +- - .../executorch_tests/runner_delegate.cpp | 162 ++++++++++++++++++ + .../executorch_tests/runner_delegate.cpp | 160 ++++++++++++++++++ cmake/toolchain/arm-none-eabi-gcc.cmake | 6 +- - 4 files changed, 197 insertions(+), 9 deletions(-) + 4 files changed, 195 insertions(+), 9 deletions(-) create mode 100644 applications/executorch_tests/runner_delegate.cpp diff --git a/applications/executorch_tests/CMakeLists.txt b/applications/executorch_tests/CMakeLists.txt @@ -116,10 +116,10 @@ index 37d88aa..be3282d 100644 with open(args.pte, "rb") as fr, open( diff --git a/applications/executorch_tests/runner_delegate.cpp b/applications/executorch_tests/runner_delegate.cpp new file mode 100644 -index 0000000..6af6a92 +index 0000000..ff40084 --- /dev/null +++ b/applications/executorch_tests/runner_delegate.cpp -@@ -0,0 +1,162 @@ +@@ -0,0 +1,160 @@ +/* + * SPDX-FileCopyrightText: Copyright 2021-2023 Arm Limited and/or its 
affiliates + * @@ -167,12 +167,12 @@ index 0000000..6af6a92 +void et_pal_init(void) {} + +__ET_NORETURN void et_pal_abort(void) { -+ __builtin_trap(); ++ __builtin_trap(); +} + +et_timestamp_t et_pal_current_ticks(void) { -+ // libc.a - warning: _gettimeofday is not implemented and will always fail -+ return 11223344; ++ // libc.a - warning: _gettimeofday is not implemented and will always fail ++ return 11223344; +} + +/** @@ -186,46 +186,45 @@ index 0000000..6af6a92 + size_t line, + const char* message, + __ET_UNUSED size_t length) { -+ fprintf( -+ stderr, -+ "%c executorch:%s:%zu] %s\n", -+ level, -+ filename, -+ line, -+ message); ++ fprintf( ++ stderr, ++ "%c executorch:%s:%zu] %s\n", ++ level, ++ filename, ++ line, ++ message); +} + +int main() +{ -+ printf("test test test NG ^2 22\n"); -+ printf("main: Initialising runtime\n"); ++ ET_LOG(Info, "Initialising runtime"); + torch::executor::runtime_init(); + + using torch::executor::Result; + using torch::executor::Error; + -+ // Load pte from the global model_pte .pte file loaded into SRAM. ++ // Load pte from the global model_pte .pte file loaded into SRAM. 
+ auto loader = torch::executor::util::BufferDataLoader(model_pte, sizeof(model_pte)); + Result program = torch::executor::Program::load(&loader); + if(!program.ok()) { -+ printf("main: Program loading failed @ 0x%p: 0x%x", model_pte, (int)program.error()); ++ ET_LOG(Info, "Program loading failed @ 0x%p: 0x%x", model_pte, (int)program.error()); + } -+ printf("main: Model buffer loaded, has %u methods\n", program->num_methods()); ++ ET_LOG(Info, "Model buffer loaded, has %u methods", program->num_methods()); + -+ // Find our entrypoint in the .pte program ++ // Find our entrypoint in the .pte program + const char* method_name = nullptr; -+ const auto method_name_result = program->get_method_name(0); -+ ET_CHECK_MSG(method_name_result.ok(), "Program has no methods"); -+ method_name = *method_name_result; -+ printf("main: Found (and will run) method '%s'\n", method_name); ++ const auto method_name_result = program->get_method_name(0); ++ ET_CHECK_MSG(method_name_result.ok(), "Program has no methods"); ++ method_name = *method_name_result; ++ ET_LOG(Info, "Found (and will run) method '%s'", method_name); + -+ // Allocate necessary memories for this method ++ // Allocate necessary memories for this method + Result method_meta = program->method_meta(method_name); + if (!method_meta.ok()) { -+ printf("main: Failed to get method_meta for %s: 0x%x", ++ ET_LOG(Info, "Failed to get method_meta for %s: 0x%x", + method_name, (unsigned int)method_meta.error()); + } -+ ++ + torch::executor::MemoryAllocator method_allocator{ + torch::executor::MemoryAllocator(sizeof(method_allocator_pool), method_allocator_pool)}; + @@ -235,7 +234,7 @@ index 0000000..6af6a92 + + for (size_t id = 0; id < num_memory_planned_buffers; ++id) { + size_t buffer_size = static_cast(method_meta->memory_planned_buffer_size(id).get()); -+ printf("main: Setting up planned buffer %zu, size %zu.\n", id, buffer_size); ++ ET_LOG(Info, "Setting up planned buffer %zu, size %zu.", id, buffer_size); + + 
+ planned_buffers.push_back(std::make_unique(buffer_size)); + planned_spans.push_back({planned_buffers.back().get(), buffer_size}); @@ -249,36 +248,35 @@ index 0000000..6af6a92 + Result method = program->load_method(method_name, &memory_manager); + + if(!method.ok()) { -+ printf("main: Loading of method %s failed with status 0x%x\n", method_name, (int)method.error()); ++ ET_LOG(Info, "Loading of method %s failed with status 0x%x", method_name, (int)method.error()); + } -+ printf("main: Loading of method '%s' succesful\n", method_name); ++ ET_LOG(Info, "Loading of method '%s' successful", method_name); + -+ printf("main: Preparing inputs...\n"); + auto inputs = torch::executor::util::PrepareInputTensors(*method); + -+ printf("main: Starting the model execution...\n"); ++ ET_LOG(Info, "Starting the model execution..."); + Error status = method->execute(); + if(status != Error::Ok){ -+ printf("main: Execution of method %s failed with status 0x%x\n", method_name, (int)status); ++ ET_LOG(Info, "Execution of method %s failed with status 0x%x", method_name, (int)status); + } else { -+ printf("main: Model executed successfully.\n"); ++ ET_LOG(Info, "Model executed successfully."); + } + + // Print the outputs.
+ std::vector outputs(method->outputs_size()); -+ printf("main: %d outputs - ", outputs.size()); ++ ET_LOG(Info, "%d outputs - ", outputs.size()); + status = method->get_outputs(outputs.data(), outputs.size()); + ET_CHECK(status == Error::Ok); + for (size_t i = 0; i < outputs.size(); ++i) -+ { -+ printf("main: Output %d numel %d\n", i, outputs[i].toTensor().numel()); -+ for (size_t j = 0; j < outputs[i].toTensor().numel(); ++j) -+ { -+ printf("main: Output[%d]: %d\n", j, outputs[i].toTensor().const_data_ptr()[j]); -+ } ++ { ++ ET_LOG(Info, "Output %d numel %d", i, outputs[i].toTensor().numel()); ++ for (size_t j = 0; j < outputs[i].toTensor().numel(); ++j) ++ { ++ ET_LOG(Info, " Output[%d]: %d", j, outputs[i].toTensor().const_data_ptr()[j]); ++ } + } + -+ return 0; ++ return 0; +} + + From 7afc5e4b79f37dece3c265567a2beda75201dbef Mon Sep 17 00:00:00 2001 From: Rob Elliott Date: Fri, 6 Oct 2023 09:31:15 +0000 Subject: [PATCH 18/25] Fixed some merge issues * File moved to new path in examples * fix arg handling and md5sum for setup scripts Signed-off-by: Rob Elliott --- .../0001-Improve-rescale-codegen-for-TOSA.patch | 0 examples/arm/run.sh | 5 +++-- examples/arm/setup.sh | 11 ++++++++--- 3 files changed, 11 insertions(+), 5 deletions(-) rename examples/{backend => }/arm/ethos-u-setup/ethos-u-vela/patches/0001-Improve-rescale-codegen-for-TOSA.patch (100%) diff --git a/examples/backend/arm/ethos-u-setup/ethos-u-vela/patches/0001-Improve-rescale-codegen-for-TOSA.patch b/examples/arm/ethos-u-setup/ethos-u-vela/patches/0001-Improve-rescale-codegen-for-TOSA.patch similarity index 100% rename from examples/backend/arm/ethos-u-setup/ethos-u-vela/patches/0001-Improve-rescale-codegen-for-TOSA.patch rename to examples/arm/ethos-u-setup/ethos-u-vela/patches/0001-Improve-rescale-codegen-for-TOSA.patch diff --git a/examples/arm/run.sh b/examples/arm/run.sh index f4ef588c4a5..2d255b6694e 100755 --- a/examples/arm/run.sh +++ b/examples/arm/run.sh @@ -7,7 +7,7 @@ set -eu -if [[ 
"${1}" == "-h" ]]; then +if [[ "${1:-"."}" == "-h" ]]; then echo "Usage: $(basename $0) [path-to-a-scratch-dir] [buck2 binary]" exit 0 fi @@ -18,7 +18,8 @@ fi script_dir=$(cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) # Ethos-u -root_dir=${1:-"$(realpath ${script_dir}/ethos-u-scratch)"} +root_dir=${1:-"${script_dir}/ethos-u-scratch"} +root_dir=$(realpath ${root_dir}) buck2=${2:-"/tmp/buck2"} ethos_u_root_dir="$(cd ${root_dir}/ethos-u && pwd)" ethos_u_build_dir=${ethos_u_root_dir}/core_platform/build diff --git a/examples/arm/setup.sh b/examples/arm/setup.sh index dace442ea8b..c4c644bef4f 100755 --- a/examples/arm/setup.sh +++ b/examples/arm/setup.sh @@ -7,7 +7,7 @@ set -eu -if [[ "${1}" == "-h" ]]; then +if [[ "${1:-'.'}" == "-h" ]]; then echo "Usage: $(basename $0) [path-to-a-scratch-dir]" exit 0 fi @@ -47,18 +47,22 @@ if [[ $(get_cpu_arch) == "x86_64" ]]; then # FVP fvp_url="https://developer.arm.com/-/media/Arm%20Developer%20Community/Downloads/OSS/FVP/Corstone-300/FVP_Corstone_SSE-300_11.22_20_Linux64.tgz?rev=018659bd574f4e7b95fa647e7836ccf4&hash=22A79103C6FA5FFA7AFF3BE0447F3FF9" fvp_model_dir="Linux64_GCC-9.3" + fvp_md5_checksum="98e93b949d0fbac977292d8668d34523" # toochain toolchain_url="https://armkeil.blob.core.windows.net/developer/Files/downloads/gnu/12.3.rel1/binrel/arm-gnu-toolchain-12.3.rel1-x86_64-arm-none-eabi.tar.xz" toolchain_dir="arm-gnu-toolchain-12.3.rel1-x86_64-arm-none-eabi" + toolchain_md5_checksum="00ebb1b70b1f88906c61206457eacb61" elif [[ $(get_cpu_arch) == "aarch64" ]]; then # FVP fvp_url="https://developer.arm.com/-/media/Arm%20Developer%20Community/Downloads/OSS/FVP/Corstone-300/FVP_Corstone_SSE-300_11.22_20_Linux64_armv8l.tgz?rev=9cc6e9a32bb947ca9b21fa162144cb01&hash=7657A4CF27D42E892E3F08D452AAB073" fvp_model_dir="Linux64_armv8l_GCC-9.3" + fvp_md5_checksum="cbbabbe39b07939cff7a3738e1492ef1" # toochain 
toolchain_url="https://armkeil.blob.core.windows.net/developer/Files/downloads/gnu/12.3.rel1/binrel/arm-gnu-toolchain-12.3.rel1-aarch64-arm-none-eabi.tar.xz" toolchain_dir="arm-gnu-toolchain-12.3.rel1-aarch64-arm-none-eabi" + toolchain_md5_checksum="02c9b0d3bb1110575877d8eee1f223f2" else echo "[main] Error: only x86-64 & aarch64 architecture is supported for now!"; exit 1; fi @@ -70,7 +74,8 @@ ethos_u_base_rev="0995223100e3da8011700f58e491f1bf59511e3c" ######## ### Optional user args ######## -root_dir=${1:-"$(realpath ${script_dir}/ethos-u-scratch)"} +root_dir=${1:-"${script_dir}/ethos-u-scratch"} +root_dir=$(realpath ${root_dir}) ######## ### Functions @@ -165,7 +170,7 @@ function setup_tosa_reference_model() { cd reference_model tosa_bin_path=`pwd` echo adding ${tosa_bin_path} to path - echo "export PATH=\${PATH}:${tosa_bin_path}" >> ${update_path_script} + echo "export PATH=\${PATH}:${tosa_bin_path}" >> "${setup_path_script}" cd ../.. echo back at `pwd` } From e340b5c82380e6f6f0c9b2fdf0de4a3bf8bfb545 Mon Sep 17 00:00:00 2001 From: Rob Elliott Date: Fri, 6 Oct 2023 13:11:58 +0000 Subject: [PATCH 19/25] Test fixes for compiler output choice * Introduction of BI_INT to have a small set of pure int tests * added either vela or tosa output from compilation * Fixed tosa e2e tests to use tosa output form * unit tests currently use tosa due to missing vela dependency * vela e2e testing runs with default compile flags and emits to .pte Signed-off-by: Rob Elliott --- backends/arm/arm_backend.py | 17 +++++++++++--- backends/arm/test/test_models.py | 14 +++++++----- backends/arm/test/test_tosa.py | 38 ++++++++++++++++++++++++-------- examples/arm/arm_tosa_e2e.py | 9 ++++++-- 4 files changed, 59 insertions(+), 19 deletions(-) diff --git a/backends/arm/arm_backend.py b/backends/arm/arm_backend.py index da1b70e57d6..ac40042bcd9 100644 --- a/backends/arm/arm_backend.py +++ b/backends/arm/arm_backend.py @@ -315,10 +315,13 @@ def preprocess( # noqa: C901 # if a debug/test build 
capture output files from TOSA stage path = None debug_output = False + output_format = "vela" for spec in compile_spec: if spec.key == "debug_tosa_path": path = spec.value.decode() debug_output = True + if spec.key == "output_format": + output_format = spec.value.decode() # Converted output for this subgraph, serializer needs path early as it emits # const data directly. Path created and data written only in debug builds. @@ -964,8 +967,16 @@ def preprocess( # noqa: C901 if debug_output is True: dbg_tosa_dump(tosa_fb, path) - # Serialize and return the tosa flatbuffer - # fb = bytes(tosa_fb.serialize()) - binary = vela_compile(tosa_fb) + # Serialize and return the program. While we have always produced TOSA + # output as an intermediate, some flows compile to device binaries in + # preprocess and some consume TOSA fb directly. + if output_format == "vela": + # Emit vela_bin_stream format + binary = vela_compile(tosa_fb) + elif output_format == "tosa": + # Emit TOSA flatbuffer + binary = bytes(tosa_fb.serialize()) + else: + raise RuntimeError(f"Unknown format {output_format}") return PreprocessResult(processed_bytes=binary) diff --git a/backends/arm/test/test_models.py b/backends/arm/test/test_models.py index 4172c8a97f2..46a57a601b8 100644 --- a/backends/arm/test/test_models.py +++ b/backends/arm/test/test_models.py @@ -95,6 +95,10 @@ def forward(self, x, y): @register_test class simple_add_broadcast(torch.nn.Module): inputs = { + TosaProfile.BI_INT: ( + torch.ones(10, 1, dtype=torch.int32), + torch.ones(10, 10, dtype=torch.int32), + ), TosaProfile.BI: ( torch.ones(10, 1), torch.ones(10, 10), @@ -127,7 +131,7 @@ def forward(self, x): x = self.fc(x) return x - @register_test + # @register_test class simple_conv2d(torch.nn.Module): inputs = { TosaProfile.BI: ( @@ -151,7 +155,7 @@ def forward(self, x): x = self.conv2d(x) return x - @register_test + # @register_test class block_two_conv2d(torch.nn.Module): inputs = { TosaProfile.BI: (torch.ones(1, 3, 256, 256),), @@ 
-172,7 +176,7 @@ def forward(self, x): x = self.conv2d_2(x) return x - @register_test + # @register_test class simple_depthwise_conv2d(torch.nn.Module): inputs = { TosaProfile.BI: ( @@ -276,7 +280,7 @@ def __init__(self): def forward(self, x): return self.softmax(x) - @register_test + # @register_test class block_conv_norm_activation(torch.nn.Module): inputs = { TosaProfile.BI: (torch.ones(1, 3, 256, 256),), @@ -298,7 +302,7 @@ def forward(self, x): x = self.relu6(x) return x - @register_test + # @register_test class block_bottleneck_residual(torch.nn.Module): # This is the essence of MobileNetV2 # Ref: https://arxiv.org/abs/1801.04381 diff --git a/backends/arm/test/test_tosa.py b/backends/arm/test/test_tosa.py index b3e59658641..9736503e626 100644 --- a/backends/arm/test/test_tosa.py +++ b/backends/arm/test/test_tosa.py @@ -17,6 +17,8 @@ from executorch.exir.backend.backend_api import to_backend +from executorch.exir.backend.compile_spec_schema import CompileSpec + # Config for Capturing the weights, will be moved in the future _CAPTURE_CONFIG = exir.CaptureConfig(enable_aot=True) _EDGE_COMPILE_CONFIG = exir.EdgeCompileConfig( @@ -37,9 +39,12 @@ def test_minimal_MI(self): for test_model in TestList: print(f"Running test {test_model}") model, inputs, outputs = prepare_model_and_ref(test_model, TosaProfile.MI) - - model_edge, exec_prog = export_model(model, inputs, []) - # TODO: check there is a tosa delegate blob in the output + if inputs is None: + print(" Skipping, no inputs for this profile") + continue + model_edge, exec_prog = export_model( + model, inputs, [CompileSpec("output_format", bytes("tosa", "utf8"))] + ) def test_minimal_BI(self): for test_model in TestList: @@ -48,14 +53,31 @@ def test_minimal_BI(self): if inputs is None: print(" Skipping, no inputs for this profile") continue - model_edge, exec_prog = export_model(model, inputs, []) - # TODO: check there is a tosa delegate blob in the output + model_edge, exec_prog = export_model( + model, inputs, 
[CompileSpec("output_format", bytes("tosa", "utf8"))] + ) + + def test_minimal_BI_INT(self): + for test_model in TestList: + print(f"Running test {test_model}") + model, inputs, outputs = prepare_model_and_ref( + test_model, TosaProfile.BI_INT + ) + if inputs is None: + print(" Skipping, no inputs for this profile") + continue + model_edge, exec_prog = export_model( + model, inputs, [CompileSpec("output_format", bytes("tosa", "utf8"))] + ) def prepare_model_and_ref(test_model, profile=TosaProfile.MI): model = TestList[test_model] model_inputs = model.inputs.get(profile) + if model_inputs is None: + return model, model_inputs, None + model.eval() if profile == TosaProfile.BI: # Quantize the model @@ -72,10 +94,8 @@ def prepare_model_and_ref(test_model, profile=TosaProfile.MI): prepared_model(*model.inputs[profile]) model = convert_pt2e(prepared_model) - if model_inputs is not None: - model_outputs = model.forward(*model_inputs) - return model, model_inputs, model_outputs - return model, model_inputs, None + model_outputs = model.forward(*model_inputs) + return model, model_inputs, model_outputs def export_model(model, inputs, compile_spec): diff --git a/examples/arm/arm_tosa_e2e.py b/examples/arm/arm_tosa_e2e.py index 0dba4fa9866..80f1e19a357 100644 --- a/examples/arm/arm_tosa_e2e.py +++ b/examples/arm/arm_tosa_e2e.py @@ -144,8 +144,13 @@ def tosa_run_test(op, profile=TosaProfile.MI): # noqa: C901 TOSA_OUT_PATH = os.path.join(DEBUG_OUTPUT_PATH, op, "tosa", "") os.makedirs(TOSA_OUT_PATH, exist_ok=True) - # Debug flag for compilers - compile_spec = [CompileSpec("debug_tosa_path", bytes(TOSA_OUT_PATH, "utf8"))] + # Debug flags for compilers + # - Emit some debug files into /tmp + # - output_format TOSA for this test (and pure tosa flows) + compile_spec = [ + CompileSpec("debug_tosa_path", bytes(TOSA_OUT_PATH, "utf8")), + CompileSpec("output_format", bytes("tosa", "utf8")), + ] model, inputs, torch_output = prepare_model_and_ref(op, profile) From 
6b7a18a7406a76158cf641bbdc725ad0982fec75 Mon Sep 17 00:00:00 2001 From: Rob Elliott Date: Fri, 6 Oct 2023 16:18:32 +0000 Subject: [PATCH 20/25] review fixes Signed-off-by: Rob Elliott --- backends/arm/CMakeLists.txt | 1 - backends/arm/arm_backend.py | 18 ++--- backends/arm/cmake/arm-none-eabi-gcc.cmake | 90 ---------------------- 3 files changed, 7 insertions(+), 102 deletions(-) delete mode 100644 backends/arm/cmake/arm-none-eabi-gcc.cmake diff --git a/backends/arm/CMakeLists.txt b/backends/arm/CMakeLists.txt index 4dcf2ff0539..2b40086091b 100644 --- a/backends/arm/CMakeLists.txt +++ b/backends/arm/CMakeLists.txt @@ -14,7 +14,6 @@ endif() include(${EXECUTORCH_ROOT}/build/Utils.cmake) set(_common_include_directories ${EXECUTORCH_ROOT}/..) -set(_common_compile_options -Wno-deprecated-declarations) include(cmake/Dependencies.cmake) diff --git a/backends/arm/arm_backend.py b/backends/arm/arm_backend.py index ac40042bcd9..407f233b02c 100644 --- a/backends/arm/arm_backend.py +++ b/backends/arm/arm_backend.py @@ -138,13 +138,11 @@ def dbg_tosa_dump(tosa_fb, path): fb = tosa_fb.serialize() js = tosa_fb.writeJson(filename) - f = open(path + filename, "wb") - f.write(fb) - f.close() + with open(path + filename, "wb") as f: + f.write(fb) - f = open(path + "desc.json", "w") - f.write(js) - f.close() + with open(path + "desc.json", "w") as f: + f.write(js) # Output to Vela with current file-based compilation @@ -153,12 +151,10 @@ def vela_compile(tosa_fb): with tempfile.TemporaryDirectory() as tmpdir: tosaname = "out.tosa" flatbuffer = tosa_fb.serialize() - f = open(os.path.join(tmpdir, tosaname), "wb") - f.write(flatbuffer) - f.close() + with open(os.path.join(tmpdir, tosaname), "wb") as f: + f.write(flatbuffer) # invoke vela - # TODO target ethos-u55-128 vela_command = ( f"cd {tmpdir}; vela --accelerator-config ethos-u55-128 {tosaname}" ) @@ -169,7 +165,7 @@ def vela_compile(tosa_fb): with np.load(np_path, allow_pickle=False) as data: # Emit the NPZ regions as: # - 16 byte 
block name null terminated string (padded to 16 if name shorter) - # - 4 byes of int32 block length and 12 bytes of 0's + # - 4 bytes of int32 block length and 12 bytes of 0's # - block data (padded to 16 byte alignment at end) # Repeat for all blocks for key in data.keys(): diff --git a/backends/arm/cmake/arm-none-eabi-gcc.cmake b/backends/arm/cmake/arm-none-eabi-gcc.cmake deleted file mode 100644 index 0921a529037..00000000000 --- a/backends/arm/cmake/arm-none-eabi-gcc.cmake +++ /dev/null @@ -1,90 +0,0 @@ -# Copyright 2023 Arm Limited and/or its affiliates. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -set(TARGET_CPU "cortex-m55" CACHE STRING "Target CPU") -string(TOLOWER ${TARGET_CPU} CMAKE_SYSTEM_PROCESSOR) - -set(CMAKE_SYSTEM_NAME Generic) -set(CMAKE_C_COMPILER "arm-none-eabi-gcc") -set(CMAKE_CXX_COMPILER "arm-none-eabi-g++") -set(CMAKE_ASM_COMPILER "arm-none-eabi-gcc") -set(CMAKE_LINKER "arm-none-eabi-ld") - -set(CMAKE_EXECUTABLE_SUFFIX ".elf") -set(CMAKE_TRY_COMPILE_TARGET_TYPE STATIC_LIBRARY) -set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) -set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) -set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) - -# Select C/C++ version -set(CMAKE_C_STANDARD 11) -set(CMAKE_CXX_STANDARD 14) - -set(GCC_CPU ${CMAKE_SYSTEM_PROCESSOR}) -string(REPLACE "cortex-m85" "cortex-m55" GCC_CPU ${GCC_CPU}) - -# Compile options -add_compile_options( - -mcpu=${GCC_CPU} - -mthumb - "$<$:-gdwarf-3>" - "$<$:-fno-unwind-tables;-fno-rtti;-fno-exceptions>" - -fdata-sections - -ffunction-sections) - -# Compile defines -add_compile_definitions( - "$<$>:NDEBUG>") - -# Link options -add_link_options( - -mcpu=${GCC_CPU} - -mthumb - --specs=nosys.specs) - -# Set floating point unit -if(CMAKE_SYSTEM_PROCESSOR MATCHES "\\+fp") - set(FLOAT hard) -elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "\\+nofp") - set(FLOAT soft) -elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "cortex-m33(\\+|$)" OR - 
CMAKE_SYSTEM_PROCESSOR MATCHES "cortex-m55(\\+|$)" OR - CMAKE_SYSTEM_PROCESSOR MATCHES "cortex-m85(\\+|$)") - set(FLOAT hard) -elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "cortex-m4(\\+|$)" OR - CMAKE_SYSTEM_PROCESSOR MATCHES "cortex-m7(\\+|$)") - set(FLOAT hard) - set(FPU_CONFIG "fpv4-sp-d16") - add_compile_options(-mfpu=${FPU_CONFIG}) - add_link_options(-mfpu=${FPU_CONFIG}) -else() - set(FLOAT soft) -endif() - -if(FLOAT) - add_compile_options(-mfloat-abi=${FLOAT}) - add_link_options(-mfloat-abi=${FLOAT}) -endif() - -add_link_options(LINKER:--nmagic,--gc-sections) - -# Compilation warnings -add_compile_options( -# -Wall -# -Wextra - -# -Wcast-align -# -Wdouble-promotion -# -Wformat -# -Wmissing-field-initializers -# -Wnull-dereference -# -Wredundant-decls -# -Wshadow -# -Wswitch -# -Wswitch-default -# -Wunused - -Wno-redundant-decls - -Wno-psabi -) From c453e4390bd64c07f726b9c06200e1fe529912fd Mon Sep 17 00:00:00 2001 From: Rob Elliott Date: Fri, 6 Oct 2023 18:52:06 +0000 Subject: [PATCH 21/25] review feedback/improvements Signed-off-by: Rob Elliott --- backends/arm/runtime/ArmBackendEthosU.cpp | 125 +++++++++++----------- 1 file changed, 60 insertions(+), 65 deletions(-) diff --git a/backends/arm/runtime/ArmBackendEthosU.cpp b/backends/arm/runtime/ArmBackendEthosU.cpp index f1da72b6396..85c10fed160 100644 --- a/backends/arm/runtime/ArmBackendEthosU.cpp +++ b/backends/arm/runtime/ArmBackendEthosU.cpp @@ -10,6 +10,7 @@ * ethos-u-core-driver for hardware interaction. */ +#include #include #include @@ -25,7 +26,9 @@ using namespace std; namespace torch { namespace executor { -// TODO we should be in 0x31, not this lower 1MB sRAM +// TODO: we should be in 0x31, to access a full 2MB SRAM +// region and enable maximum program performance up to +// 2MB, rather than 1. 
// SRAM (rwx) : ORIGIN = 0x31000000, LENGTH = 0x00200000 #define CS300_SRAM_LOW ((void*)0x11000000) #define CS300_SRAM_HIGH ((void*)0x110FFFFF) @@ -37,6 +40,7 @@ class ArmBackend final : public PyTorchBackendInterface { ~ArmBackend() = default; virtual bool is_available() const override { + // TODO: revise to use a register check/init function return 1; } @@ -52,16 +56,19 @@ class ArmBackend final : public PyTorchBackendInterface { // Header and footer both 16 bit aligned suggest valid structure and we // wont walk off the end of the chunks and segfault - if (!((int)data == next_mul_16((int)data))) { + if (!((int)data == next_mul_16((uintptr_t)data))) { ET_LOG(Error, "ArmBackend::init: Binary needs to be 16 byte unaligned"); return Error::InvalidProgram; } - if (!((int)foot == next_mul_16((int)foot))) { - ET_LOG(Error, "ArmBackend::init: Program unexpected size"); + if (!((int)foot == next_mul_16((uintptr_t)foot))) { + ET_LOG(Error, "ArmBackend::init: Footer expected to be 16 byte aligned"); + ET_LOG( + Error, + "ArmBackend::init: Program expected to be multiple of 16 bytes"); return Error::InvalidProgram; } if (!(0 == strncmp(data, "vela_bin_stream", 15))) { - ET_LOG(Error, "ArmBackend::init: Binary passed not a vela_bin_stream"); + ET_LOG(Error, "ArmBackend::init: Binary passed is not a vela_bin_stream"); return Error::InvalidProgram; } if (!(0 == strncmp(foot, "vela_end_stream", 15))) { @@ -70,8 +77,15 @@ class ArmBackend final : public PyTorchBackendInterface { } // Verify address range is accessible current expectation is the program // is wholly stored in SRAM + // TODO: expect to improve capabilities here by supporting DRAM storage + // and only moving required data into SRAM. 
if (!(data > CS300_SRAM_LOW || foot < CS300_SRAM_HIGH)) { ET_LOG(Error, "ArmBackend::init: Expected program binary to be in SRAM"); + ET_LOG( + Error, + "ArmBackend::init: program binary range %p:%p", + data, + foot + 16); return Error::InvalidProgram; } @@ -88,7 +102,7 @@ class ArmBackend final : public PyTorchBackendInterface { ET_LOG(Info, "ArmBackend::execute %p", processed->data()); - vela_handles handles; + VelaHandles handles; // Command stream - we know at this point it's aligned char* data = (char*)processed->data(); @@ -110,7 +124,7 @@ class ArmBackend final : public PyTorchBackendInterface { handles.scratch_data_size); // Write inputs into SRAM scratch area defined by Vela - for (int i = 0; i < handles.input_shape.size(); i++) { + for (int i = 0; i < handles.input_shapes.size(); i++) { const char* input_addr = handles.scratch_data + handles.input_offset[i]; // Process input EValue into scratch // TODO: optimise into direct write for compatible, contig layout @@ -122,21 +136,16 @@ class ArmBackend final : public PyTorchBackendInterface { } } -#if 0 - // TMP emit scratch - printf("Scratch after setup:\n"); - for (int i = 0; i < handles.scratch_data_size; i++) { - printf("%02x ", ((char*)handles.scratch_data)[i]); - if (!((i + 1) % 4)) - printf("\n"); - } - printf("\n"); - // END TMP emit scratch -#endif - // Allocate driver handle and synchronously invoke driver ethosu_driver* drv = ethosu_reserve_driver(); + if (drv == NULL) { + ET_LOG(Error, "ArmBackend::execute: ethosu_reserve_driver failed"); + return Error::InvalidState; + } + // Ethos-U low level driver expected order for Ethos U-55, we have + // constant weight data, then scratch (which contains input and output) + // scratch is written above in this function. 
uint64_t bases[2] = { (uint64_t)handles.weight_data, (uint64_t)handles.scratch_data}; size_t bases_size[2] = { @@ -147,7 +156,7 @@ class ArmBackend final : public PyTorchBackendInterface { handles.cmd_data_size, bases, bases_size, - 2, + 2, /* fixed array of pointers to binary interface*/ nullptr); if (result != 0) { @@ -158,22 +167,11 @@ class ArmBackend final : public PyTorchBackendInterface { return Error::InvalidProgram; } -#if 0 - // TMP emit scratch - printf("Scratch after:\n"); - for (int i = 0; i < handles.scratch_data_size; i++) { - printf("%02x ", ((char*)handles.scratch_data)[i]); - if (!((i + 1) % 4)) - printf("\n"); - } - printf("\n"); -#endif - // output data from Ethos U // We only handle one output at the moment const char* output_addr = handles.scratch_data + handles.output_offset[0]; // Outputs are in the index immediately after inputs - int output_index = handles.input_shape.size(); + int output_index = handles.input_shapes.size(); // Process results into EValue storage // TODO: optimise into direct write for compatible, contig layout @@ -200,103 +198,100 @@ class ArmBackend final : public PyTorchBackendInterface { const char* scratch_data; size_t scratch_data_size; vector input_offset; - vector> input_shape; + vector> input_shapes; vector output_offset; - vector> output_shape; - } vela_handles; + vector> output_shapes; + } VelaHandles; typedef struct { char name[16]; - int size; + uint32_t size; char _pad[12]; char data[]; - } vela_bin_block; + } VelaBinBlock; typedef struct { int count; int shape[][4]; - } vela_shapes; + } VelaShapes; typedef struct { int count; int offsets[]; - } vela_offsets; + } VelaOffsets; static int next_mul_16(int n) { return ((n - 1) | 15) + 1; } - int vela_read(char* data, vela_handles* h, int size) const { + int vela_read(char* data, VelaHandles* handles, int size) const { + constexpr const size_t header_size = 16; + // Read header string if (strncmp(data, "vela_bin_stream", 15)) { return 0; } - data += 16; + data += 
header_size; - // Expect one or more 'vela_bin_block's + // Expect one or more 'VelaBinBlock's while (1) { - vela_bin_block* b = (vela_bin_block*)data; - data += 16 + 16 + next_mul_16(b->size); + VelaBinBlock* b = (VelaBinBlock*)data; + data += sizeof(VelaBinBlock) + next_mul_16(b->size); // Exit with success on finding end of stream - if (!strncmp(b->name, "vela_end_stream", 15)) + if (!strncmp(b->name, "vela_end_stream", strlen("vela_end_stream"))) return 1; if (!strncmp(b->name, "cmd_data", strlen("cmd_data"))) { // This magic header confirms a valid command stream in binary - if (strncmp(b->data, "COP1", 4)) + if (strncmp(b->data, "COP1", strlen("COP1"))) return 0; - h->cmd_data = b->data; - h->cmd_data_size = b->size; + handles->cmd_data = b->data; + handles->cmd_data_size = b->size; } if (!strncmp(b->name, "weight_data", strlen("weight_data"))) { - h->weight_data = b->data; - h->weight_data_size = b->size; + handles->weight_data = b->data; + handles->weight_data_size = b->size; } if (!strncmp(b->name, "scratch_data", strlen("scratch_data"))) { - h->scratch_data = b->data; - h->scratch_data_size = b->size; + handles->scratch_data = b->data; + handles->scratch_data_size = b->size; } // capture inputs and outputs - if (!strncmp(b->name, "scratch_data", strlen("scratch_data"))) { - h->scratch_data = b->data; - h->scratch_data_size = b->size; - } - if (!strncmp(b->name, "input_offset", strlen("input_offset"))) { - vela_offsets* offsets = (vela_offsets*)b->data; + VelaOffsets* offsets = (VelaOffsets*)b->data; for (int i = 0; i < offsets->count; i++) { - h->input_offset.push_back(offsets->offsets[i]); + handles->input_offset.push_back(offsets->offsets[i]); } } if (!strncmp(b->name, "output_offset", strlen("output_offset"))) { - vela_offsets* offsets = (vela_offsets*)b->data; + VelaOffsets* offsets = (VelaOffsets*)b->data; for (int i = 0; i < offsets->count; i++) { - h->output_offset.push_back(offsets->offsets[i]); + 
handles->output_offset.push_back(offsets->offsets[i]); } } if (!strncmp(b->name, "input_shape", strlen("input_shape"))) { - vela_shapes* shapes = (vela_shapes*)b->data; + VelaShapes* shapes = (VelaShapes*)b->data; for (int i = 0; i < shapes->count; i++) { vector s = { shapes->shape[i][0], shapes->shape[i][1], shapes->shape[i][2], shapes->shape[i][3]}; - h->input_shape.push_back(s); + handles->input_shapes.push_back(s); } } if (!strncmp(b->name, "output_shape", strlen("output_shape"))) { - vela_shapes* shapes = (vela_shapes*)b->data; + VelaShapes* shapes = (VelaShapes*)b->data; for (int i = 0; i < shapes->count; i++) { vector s = { shapes->shape[i][0], shapes->shape[i][1], shapes->shape[i][2], shapes->shape[i][3]}; - h->output_shape.push_back(s); + handles->output_shapes.push_back(s); } } } From 1917b541c6fcc5034313fbf18d5b6cf9609bc5d7 Mon Sep 17 00:00:00 2001 From: Rob Elliott Date: Fri, 6 Oct 2023 19:43:01 +0000 Subject: [PATCH 22/25] tidy up example scripts Signed-off-by: Rob Elliott --- examples/arm/run.sh | 24 ++++++------------------ examples/arm/setup.sh | 4 ---- 2 files changed, 6 insertions(+), 22 deletions(-) diff --git a/examples/arm/run.sh b/examples/arm/run.sh index 2d255b6694e..10a296242b0 100755 --- a/examples/arm/run.sh +++ b/examples/arm/run.sh @@ -49,7 +49,7 @@ function generate_ethos_pte_file() { cd $et_root_dir python3 examples/backend/arm/arm_ethosu_minimal.py &> /dev/null cd ./ethosout/simple_add/torch/ - local pte_file=$(readlink -f ./delegated.pte) + local pte_file=$(realpath ./delegated.pte) [[ -f ${pte_file} ]] || { echo "Failed to generate a pte file - ${pte_file}"; exit 1; } echo "${pte_file}" } @@ -107,7 +107,9 @@ function build_executorch_runner() { # Execute the executor_runner on FVP Simulator function run_fvp() { - elf=$(find ${ethos_u_build_dir} -name "executor_runner.elf") + [[ $# -ne 1 ]] && { echo "[${FUNCNAME[0]}]" "Expexted elf binary name, got $*"; exit 1; } + local elf_name=${1} + elf=$(find ${ethos_u_build_dir} -name 
"${elf_name}") [[ ! -f $elf ]] && { echo "[${FUNCNAME[0]}]: Unable to find executor_runner elf: ${elf}"; exit 1; } FVP_Corstone_SSE-300_Ethos-U55 \ -C ethosu.num_macs=128 \ @@ -119,20 +121,6 @@ function run_fvp() { echo "[${FUNCNAME[0]} Simulation complete, $?" } -# Execute the executor_runner on FVP Simulator -function run_fvp_delegate() { - elf=$(find ${ethos_u_build_dir} -name "executor_runner_delegate.elf") - [[ ! -f $elf ]] && { echo "[${FUNCNAME[0]}]: Unable to find executor_runner_delegate elf: ${elf}"; exit 1; } - FVP_Corstone_SSE-300_Ethos-U55 \ - -C ethosu.num_macs=128 \ - -C mps3_board.visualisation.disable-visualisation=1 \ - -C mps3_board.telnetterminal0.start_telnet=0 \ - -C mps3_board.uart0.out_file='-' \ - -a "${elf}" \ - --timelimit 5 || true - echo "[${FUNCNAME[0]} Simulation complete, $?" -} - ####### ### Main ####### @@ -169,9 +157,9 @@ build_executorch build_executorch_runner "${pte}" "${pte_delegate}" # run the app -run_fvp +run_fvp executor_runner.elf # run the delegate app -run_fvp_delegate +run_fvp executor_runner_delegate.elf exit 0 diff --git a/examples/arm/setup.sh b/examples/arm/setup.sh index c4c644bef4f..34b20498cd7 100755 --- a/examples/arm/setup.sh +++ b/examples/arm/setup.sh @@ -169,10 +169,7 @@ function setup_tosa_reference_model() { make cd reference_model tosa_bin_path=`pwd` - echo adding ${tosa_bin_path} to path echo "export PATH=\${PATH}:${tosa_bin_path}" >> "${setup_path_script}" - cd ../.. - echo back at `pwd` } function setup_vela() { @@ -187,7 +184,6 @@ function setup_vela() { patch_repo fi pip install . - cd .. 
} ######## From 20bf2ebc3aae8df17645ee8d700466288de086d5 Mon Sep 17 00:00:00 2001 From: Rob Elliott Date: Fri, 6 Oct 2023 20:06:35 +0000 Subject: [PATCH 23/25] further review comments Signed-off-by: Rob Elliott --- backends/arm/arm_backend.py | 2 ++ backends/arm/runtime/ArmBackendEthosU.cpp | 13 ++++++++++--- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/backends/arm/arm_backend.py b/backends/arm/arm_backend.py index 407f233b02c..d51ae3b4a36 100644 --- a/backends/arm/arm_backend.py +++ b/backends/arm/arm_backend.py @@ -184,6 +184,8 @@ def vela_compile(tosa_fb): block_data = input_struct elif key in ("input_offset", "output_offset"): inputs = data[key] + if key == "output_offset" && len(inputs) > 1: + raise RuntimeError("Currently only support one output in Vela ArmBackend") offset_struct = struct.pack("toTensor(); for (int j = 0; j < tensor_in.numel(); j++) { - // TODO: extend beyond 4 byte tensors + // TODO: extend beyond tensors with 4 byte elements input_address[j] = tensor_in.mutable_data_ptr()[j]; } } @@ -173,12 +174,18 @@ class ArmBackend final : public PyTorchBackendInterface { // Outputs are in the index immediately after inputs int output_index = handles.input_shapes.size(); + if (handles.output_shapes.size() != 1) { + ET_LOG( + Error, + "ArmBackend::execute: currently only support one return tensor"); + return Error::InvalidProgram; + } // Process results into EValue storage // TODO: optimise into direct write for compatible, contig layout int* output_address = (int*)output_addr; auto tensor_out = args[output_index]->toTensor(); for (int j = 0; j < tensor_out.numel(); j++) { - // TODO: extend beyond 4 byte tensors + // TODO: extend beyond tensors with 4 byte elements tensor_out.mutable_data_ptr()[j] = output_address[j]; } From 468d6fc4edaa3784d3f5bdcd7b0c3958e8cf97f0 Mon Sep 17 00:00:00 2001 From: Rob Elliott Date: Fri, 6 Oct 2023 20:21:12 +0000 Subject: [PATCH 24/25] revised path for ethosu_minimal Signed-off-by: Rob Elliott --- 
examples/arm/run.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/arm/run.sh b/examples/arm/run.sh index 10a296242b0..3f9bd37d90c 100755 --- a/examples/arm/run.sh +++ b/examples/arm/run.sh @@ -47,7 +47,7 @@ function generate_pte_file() { # Generate the ethos delegate PTE file function generate_ethos_pte_file() { cd $et_root_dir - python3 examples/backend/arm/arm_ethosu_minimal.py &> /dev/null + python3 examples/arm/arm_ethosu_minimal.py &> /dev/null cd ./ethosout/simple_add/torch/ local pte_file=$(realpath ./delegated.pte) [[ -f ${pte_file} ]] || { echo "Failed to generate a pte file - ${pte_file}"; exit 1; } From 709a688c58fef40a748bc5b355761fcb969b5dae Mon Sep 17 00:00:00 2001 From: Rob Elliott Date: Fri, 6 Oct 2023 20:49:26 +0000 Subject: [PATCH 25/25] lintfix Signed-off-by: Rob Elliott --- backends/arm/arm_backend.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/backends/arm/arm_backend.py b/backends/arm/arm_backend.py index d51ae3b4a36..f0f285418c6 100644 --- a/backends/arm/arm_backend.py +++ b/backends/arm/arm_backend.py @@ -184,8 +184,10 @@ def vela_compile(tosa_fb): block_data = input_struct elif key in ("input_offset", "output_offset"): inputs = data[key] - if key == "output_offset" && len(inputs) > 1: - raise RuntimeError("Currently only support one output in Vela ArmBackend") + if key == "output_offset" and len(inputs) > 1: + raise RuntimeError( + "Currently only support one output in Vela ArmBackend" + ) offset_struct = struct.pack("