From 028e38e71aa508a78307018f3c63726e16386222 Mon Sep 17 00:00:00 2001 From: Dillon Sharlet Date: Thu, 13 Nov 2025 17:52:03 -0800 Subject: [PATCH] Add Hexagon transpose kernels to YNNPACK These transpose kernels also cover all the kernels needed to implement packing for dots. This is the first Hexagon kernel. This also adds some build logic necessary for Hexagon. However, it is currently just a placeholder, it doesn't work with bazel yet. PiperOrigin-RevId: 832079166 --- ynnpack/BUILD | 18 +++++ ynnpack/base/arch.cc | 3 + ynnpack/base/arch.h | 3 + ynnpack/base/build_config.h | 4 + ynnpack/base/hexagon/BUILD | 14 ++++ ynnpack/base/hexagon/test_main.cc | 17 ++++ ynnpack/build_defs.bzl | 17 +++- ynnpack/kernels/transpose/BUILD | 4 + ynnpack/kernels/transpose/hvx.cc | 99 ++++++++++++++++++++++++ ynnpack/kernels/transpose/hvx.h | 61 +++++++++++++++ ynnpack/kernels/transpose/interleave.inc | 9 +++ ynnpack/kernels/transpose/transpose.inc | 9 +++ 12 files changed, 255 insertions(+), 3 deletions(-) create mode 100644 ynnpack/base/hexagon/BUILD create mode 100644 ynnpack/base/hexagon/test_main.cc create mode 100644 ynnpack/kernels/transpose/hvx.cc create mode 100644 ynnpack/kernels/transpose/hvx.h diff --git a/ynnpack/BUILD b/ynnpack/BUILD index a3a77860d8d..8d138ef748c 100644 --- a/ynnpack/BUILD +++ b/ynnpack/BUILD @@ -68,6 +68,19 @@ selects.config_setting_group( ], ) +# TODO: This doesn't actually do anything yet, it's just a placeholder for a toolchain configuration. 
+config_setting( + name = "hexagon-clang", + define_values = {"ynn_hexagon_clang": "true"}, +) + +selects.config_setting_group( + name = "hexagon", + match_any = [ + ":hexagon-clang", + ], +) + selects.config_setting_group( name = "apple_clang", match_any = [ @@ -252,3 +265,8 @@ define_build_option( name = "ynn_enable_x86_amxint8", default_all = [":ynn_enable_x86_amx"], ) + +define_build_option( + name = "ynn_enable_hvx", + default_all = [":hexagon"], +) diff --git a/ynnpack/base/arch.cc b/ynnpack/base/arch.cc index 1bacc88d8cd..f59a8b885cf 100644 --- a/ynnpack/base/arch.cc +++ b/ynnpack/base/arch.cc @@ -78,6 +78,9 @@ uint64_t get_supported_arch_flags() { if (cpuinfo_has_arm_sme2()) result |= arch_flag::sme2; #endif // YNN_ARCH_ARM #endif // YNN_ENABLE_CPUINFO +#ifdef YNN_ARCH_HEXAGON + result |= arch_flag::hvx; +#endif // YNN_ARCH_HEXAGON return result; }(); return flags; diff --git a/ynnpack/base/arch.h b/ynnpack/base/arch.h index eed63b7541f..abf10d8b151 100644 --- a/ynnpack/base/arch.h +++ b/ynnpack/base/arch.h @@ -46,6 +46,9 @@ enum { sme = 1 << 6, sme2 = 1 << 7, #endif +#ifdef YNN_ARCH_HEXAGON + hvx = 1 << 0, +#endif }; } // namespace arch_flag diff --git a/ynnpack/base/build_config.h b/ynnpack/base/build_config.h index aaa1780ed98..37fc54c10ef 100644 --- a/ynnpack/base/build_config.h +++ b/ynnpack/base/build_config.h @@ -34,6 +34,10 @@ #define YNN_ARCH_ARM #endif +#if defined(__hexagon__) +#define YNN_ARCH_HEXAGON +#endif + // We want to use _Float16 if the compiler supports it fully, but it's // tricky to do this detection; there are compiler versions that define the // type in broken ways. 
We're only going to bother using it if the support is diff --git a/ynnpack/base/hexagon/BUILD b/ynnpack/base/hexagon/BUILD new file mode 100644 index 00000000000..0b3ba456658 --- /dev/null +++ b/ynnpack/base/hexagon/BUILD @@ -0,0 +1,14 @@ +load("@rules_cc//cc:cc_library.bzl", "cc_library") + +licenses(["notice"]) + +cc_library( + name = "test_main", + testonly = True, + srcs = [ + "test_main.cc", + ], + visibility = ["//:__subpackages__"], + deps = ["@com_google_googletest//:gtest"], + alwayslink = True, +) diff --git a/ynnpack/base/hexagon/test_main.cc b/ynnpack/base/hexagon/test_main.cc new file mode 100644 index 00000000000..bff397d9745 --- /dev/null +++ b/ynnpack/base/hexagon/test_main.cc @@ -0,0 +1,17 @@ +// Copyright 2025 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include +#include + +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + + // This calls InitGoogleTest as well. It consumes any arguments it understands + // from argv. 
+ testing::InitGoogleMock(&argc, argv); + + return RUN_ALL_TESTS(); +} diff --git a/ynnpack/build_defs.bzl b/ynnpack/build_defs.bzl index 7fc816e1bc5..077c3925939 100644 --- a/ynnpack/build_defs.bzl +++ b/ynnpack/build_defs.bzl @@ -198,6 +198,10 @@ _YNN_PARAMS_FOR_ARCH = { "cond": "//ynnpack:ynn_enable_x86_amxint8", "copts": ["-mamx-tile", "-mamx-int8"], }, + "hvx": { + "cond": "//ynnpack:ynn_enable_hvx", + "copts": ["-mhvx"], + }, } def _map_copts_to_msvc(copts): @@ -238,23 +242,30 @@ def ynn_kernel_copts(unroll_loops = True): def ynn_binary_linkopts(): return select({ + "//ynnpack:hexagon": [ + "-shared", + "-Wno-unused-command-line-argument", + ], "//conditions:default": [], }) def ynn_binary_malloc(): return select({ + "//ynnpack:hexagon": "@bazel_tools//tools/cpp:malloc", "//conditions:default": "@bazel_tools//tools/cpp:malloc", }) def ynn_test_deps(): return select({ + "//ynnpack:hexagon": [ + "@com_google_googletest//:gtest", + "//ynnpack/base/hexagon:test_main", + ], "//conditions:default": ["@com_google_googletest//:gtest_main"], }) def ynn_benchmark_deps(): - return select({ - "//conditions:default": ["@com_google_benchmark//:benchmark_main"], - }) + return ["@com_google_benchmark//:benchmark_main"] def ynn_cc_library( name, diff --git a/ynnpack/kernels/transpose/BUILD b/ynnpack/kernels/transpose/BUILD index b67b580b6fd..1872479ad1a 100644 --- a/ynnpack/kernels/transpose/BUILD +++ b/ynnpack/kernels/transpose/BUILD @@ -37,6 +37,10 @@ ynn_cc_library( "x86_avx2.h", "x86_avx2.cc", ], + "hvx": [ + "hvx.h", + "hvx.cc", + ], }, visibility = ["//ynnpack:__subpackages__"], deps = [ diff --git a/ynnpack/kernels/transpose/hvx.cc b/ynnpack/kernels/transpose/hvx.cc new file mode 100644 index 00000000000..be899cb0319 --- /dev/null +++ b/ynnpack/kernels/transpose/hvx.cc @@ -0,0 +1,99 @@ +// Copyright 2025 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+ +#include "ynnpack/kernels/transpose/hvx.h" + +#include + +#include +#include +#include +#include + +#include "ynnpack/kernels/transpose/generic.h" +#include "ynnpack/kernels/transpose/interleave.h" +#include "ynnpack/kernels/transpose/transpose.h" + +namespace ynn { + +void transpose_x32_hvx(size_t m, size_t n, size_t n_bytes_a, size_t stride_a, + const void* a, size_t stride_x, void* x) { + transpose>(m, n, n_bytes_a, stride_a, a, stride_x, + x, + std::integral_constant{}); +} +void transpose_x64_hvx(size_t m, size_t n, size_t n_bytes_a, size_t stride_a, + const void* a, size_t stride_x, void* x) { + transpose>(m, n, n_bytes_a, stride_a, a, stride_x, + x, + std::integral_constant{}); +} +void transpose_x128_hvx(size_t m, size_t n, size_t n_bytes_a, size_t stride_a, + const void* a, size_t stride_x, void* x) { + transpose>(m, n, n_bytes_a, stride_a, a, stride_x, + x, + std::integral_constant{}); +} +void transpose_x256_hvx(size_t m, size_t n, size_t n_bytes_a, size_t stride_a, + const void* a, size_t stride_x, void* x) { + transpose>(m, n, n_bytes_a, stride_a, a, stride_x, + x, + std::integral_constant{}); +} +void transpose_x512_hvx(size_t m, size_t n, size_t n_bytes_a, size_t stride_a, + const void* a, size_t stride_x, void* x) { + transpose>(m, n, n_bytes_a, stride_a, a, stride_x, + x, + std::integral_constant{}); +} +void transpose_x1024_hvx(size_t m, size_t n, size_t n_bytes_a, size_t stride_a, + const void* a, size_t stride_x, void* x) { + transpose(m, n, n_bytes_a, stride_a, a, stride_x, x, + std::integral_constant{}); +} + +void interleave2_x8_hvx(size_t factor, size_t m, size_t n, size_t stride_a, + const void* a, void* x) { + assert(factor == 2); + interleave>(m, n, stride_a, a, x, + std::integral_constant{}); +} + +void interleave2_x16_hvx(size_t factor, size_t m, size_t n, size_t stride_a, + const void* a, void* x) { + assert(factor == 2); + interleave>(m, n, stride_a, a, x, + std::integral_constant{}); +} + +void interleave2_x32_hvx(size_t factor, 
size_t m, size_t n, size_t stride_a, + const void* a, void* x) { + assert(factor == 2); + interleave>(m, n, stride_a, a, x, + std::integral_constant{}); +} + +void interleave4_x8_hvx(size_t factor, size_t m, size_t n, size_t stride_a, + const void* a, void* x) { + assert(factor == 4); + interleave>(m, n, stride_a, a, x, + std::integral_constant{}); +} + +void interleave4_x16_hvx(size_t factor, size_t m, size_t n, size_t stride_a, + const void* a, void* x) { + assert(factor == 4); + interleave>(m, n, stride_a, a, x, + std::integral_constant{}); +} + +void interleave4_x32_hvx(size_t factor, size_t m, size_t n, size_t stride_a, + const void* a, void* x) { + assert(factor == 4); + interleave>(m, n, stride_a, a, x, + std::integral_constant{}); +} + +} // namespace ynn diff --git a/ynnpack/kernels/transpose/hvx.h b/ynnpack/kernels/transpose/hvx.h new file mode 100644 index 00000000000..44363e5d0a3 --- /dev/null +++ b/ynnpack/kernels/transpose/hvx.h @@ -0,0 +1,61 @@ +// Copyright 2025 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#ifndef XNNPACK_YNNPACK_KERNELS_TRANSPOSE_HVX_H_ +#define XNNPACK_YNNPACK_KERNELS_TRANSPOSE_HVX_H_ + +#include +#include +#include + +#include +#include +#include +#include + +#include "ynnpack/base/arithmetic.h" + +namespace ynn { + +template +static std::array interleave(ElemSizeBits elem_size_bits, + std::array x) { + HVX_VectorPair x01 = Q6_W_vshuff_VVR(x[1], x[0], -(elem_size_bits / 8)); + return {Q6_V_lo_W(x01), Q6_V_hi_W(x01)}; +} + +template +static std::array load( + std::array, const void* a, size_t stride, size_t m, + std::integral_constant /*n_bytes*/) { + assert(m > 0); + assert(m <= M); + std::array x; + x[0] = *reinterpret_cast(a); + for (size_t i = 1; i < M; ++i) { + x[i] = + i < m + ? 
*reinterpret_cast(offset_bytes(a, i * stride)) + : Q6_V_vsplat_R(0); + } + return x; +} + +template +static void store(std::array x, void* a, size_t stride, size_t m, + std::integral_constant /*n_bytes*/) { + assert(m > 0); + assert(m <= M); + *reinterpret_cast(a) = x[0]; + for (size_t i = 1; i < M; ++i) { + if (i < m) { + *reinterpret_cast(offset_bytes(a, i * stride)) = x[i]; + } + } +} + +} // namespace ynn + +#endif // XNNPACK_YNNPACK_KERNELS_TRANSPOSE_HVX_H_ diff --git a/ynnpack/kernels/transpose/interleave.inc b/ynnpack/kernels/transpose/interleave.inc index 3fe9d0bc70b..83536d238a9 100644 --- a/ynnpack/kernels/transpose/interleave.inc +++ b/ynnpack/kernels/transpose/interleave.inc @@ -27,6 +27,15 @@ YNN_INTERLEAVE_KERNEL(arch_flag::neon, interleave4_x16_neon, 4, 16) YNN_INTERLEAVE_KERNEL(arch_flag::neon, interleave4_x32_neon, 4, 32) #endif // YNN_ARCH_ARM_NEON +#ifdef YNN_ARCH_HVX +YNN_INTERLEAVE_KERNEL(arch_flag::hvx, interleave2_x8_hvx, 2, 8) +YNN_INTERLEAVE_KERNEL(arch_flag::hvx, interleave2_x16_hvx, 2, 16) +YNN_INTERLEAVE_KERNEL(arch_flag::hvx, interleave2_x32_hvx, 2, 32) +YNN_INTERLEAVE_KERNEL(arch_flag::hvx, interleave4_x8_hvx, 4, 8) +YNN_INTERLEAVE_KERNEL(arch_flag::hvx, interleave4_x16_hvx, 4, 16) +YNN_INTERLEAVE_KERNEL(arch_flag::hvx, interleave4_x32_hvx, 4, 32) +#endif // YNN_ARCH_HVX + YNN_INTERLEAVE_KERNEL(arch_flag::none, interleave_x4, 0, 4) YNN_INTERLEAVE_KERNEL(arch_flag::none, interleave_x8, 0, 8) YNN_INTERLEAVE_KERNEL(arch_flag::none, interleave_x16, 0, 16) diff --git a/ynnpack/kernels/transpose/transpose.inc b/ynnpack/kernels/transpose/transpose.inc index 96dd97fdc05..3e3b92a8d5a 100644 --- a/ynnpack/kernels/transpose/transpose.inc +++ b/ynnpack/kernels/transpose/transpose.inc @@ -32,6 +32,15 @@ YNN_TRANSPOSE_KERNEL(arch_flag::neon, transpose_x512_neon, 512) YNN_TRANSPOSE_KERNEL(arch_flag::neon, transpose_x1024_neon, 1024) #endif // YNN_ARCH_ARM_NEON +#ifdef YNN_ARCH_HVX +YNN_TRANSPOSE_KERNEL(arch_flag::hvx, transpose_x32_hvx, 32) 
+YNN_TRANSPOSE_KERNEL(arch_flag::hvx, transpose_x64_hvx, 64) +YNN_TRANSPOSE_KERNEL(arch_flag::hvx, transpose_x128_hvx, 128) +YNN_TRANSPOSE_KERNEL(arch_flag::hvx, transpose_x256_hvx, 256) +YNN_TRANSPOSE_KERNEL(arch_flag::hvx, transpose_x512_hvx, 512) +YNN_TRANSPOSE_KERNEL(arch_flag::hvx, transpose_x1024_hvx, 1024) +#endif // YNN_ARCH_HVX + YNN_TRANSPOSE_KERNEL(arch_flag::none, transpose_x4, 4) YNN_TRANSPOSE_KERNEL(arch_flag::none, transpose_x8, 8) YNN_TRANSPOSE_KERNEL(arch_flag::none, transpose_x16, 16)