google · copybara-service · Nov 14, 2025
diff --git a/ynnpack/BUILD b/ynnpack/BUILD
@@ -68,6 +68,19 @@ selects.config_setting_group(
     ],
 )
 
+# TODO: This has no effect yet; it is only a placeholder for a toolchain configuration.
+config_setting(
+    name = "hexagon-clang",
+    define_values = {"ynn_hexagon_clang": "true"},  # Matches --define=ynn_hexagon_clang=true.
+)
+
+selects.config_setting_group(
+    name = "hexagon",  # Matches when any Hexagon toolchain setting listed below matches.
+    match_any = [
+        ":hexagon-clang",
+    ],
+)
+
 selects.config_setting_group(
     name = "apple_clang",
     match_any = [
@@ -252,3 +265,8 @@ define_build_option(
     name = "ynn_enable_x86_amxint8",
     default_all = [":ynn_enable_x86_amx"],
 )
+
+define_build_option(
+    name = "ynn_enable_hvx",
+    default_all = [":hexagon"],  # Presumably on by default for Hexagon builds; confirm against define_build_option.
+)
diff --git a/ynnpack/base/arch.cc b/ynnpack/base/arch.cc
@@ -78,6 +78,9 @@ uint64_t get_supported_arch_flags() {
     if (cpuinfo_has_arm_sme2()) result |= arch_flag::sme2;
 #endif  // YNN_ARCH_ARM
 #endif  // YNN_ENABLE_CPUINFO
+#ifdef YNN_ARCH_HEXAGON
+    result |= arch_flag::hvx;
+#endif  // YNN_ARCH_HEXAGON
     return result;
   }();
   return flags;

diff --git a/ynnpack/base/arch.h b/ynnpack/base/arch.h
@@ -46,6 +46,9 @@ enum {
   sme = 1 << 6,
   sme2 = 1 << 7,
 #endif
+#ifdef YNN_ARCH_HEXAGON
+  hvx = 1 << 0,
+#endif
 };
 
 }  // namespace arch_flag

diff --git a/ynnpack/base/build_config.h b/ynnpack/base/build_config.h
@@ -34,6 +34,10 @@
 #define YNN_ARCH_ARM
 #endif
 
+#if defined(__hexagon__)  // Predefined by compilers targeting Hexagon.
+#define YNN_ARCH_HEXAGON
+#endif
+
 // We want to use _Float16 if the compiler supports it fully, but it's
 // tricky to do this detection; there are compiler versions that define the
 // type in broken ways. We're only going to bother using it if the support is

diff --git a/ynnpack/base/hexagon/BUILD b/ynnpack/base/hexagon/BUILD
@@ -0,0 +1,14 @@
+load("@rules_cc//cc:cc_library.bzl", "cc_library")
+
+licenses(["notice"])
+
+cc_library(
+    name = "test_main",
+    testonly = True,
+    srcs = [
+        "test_main.cc",
+    ],
+    visibility = ["//:__subpackages__"],
+    deps = ["@com_google_googletest//:gtest"],
+    alwayslink = True,  # Link main() in even though no other object references it.
+)
diff --git a/ynnpack/base/hexagon/test_main.cc b/ynnpack/base/hexagon/test_main.cc
@@ -0,0 +1,17 @@
+// Copyright 2025 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <gtest/gtest.h>
+#include <gmock/gmock.h>
+
+int main(int argc, char** argv) {
+  // Let GoogleTest parse the command line first.
+  ::testing::InitGoogleTest(&argc, argv);
+  // InitGoogleMock initializes GoogleTest as well, and consumes any arguments
+  // it understands from argv.
+  ::testing::InitGoogleMock(&argc, argv);
+  const int status = RUN_ALL_TESTS();
+  return status;
+}
diff --git a/ynnpack/build_defs.bzl b/ynnpack/build_defs.bzl
@@ -198,6 +198,10 @@ _YNN_PARAMS_FOR_ARCH = {
         "cond": "//ynnpack:ynn_enable_x86_amxint8",
         "copts": ["-mamx-tile", "-mamx-int8"],
     },
+    "hvx": {
+        "cond": "//ynnpack:ynn_enable_hvx",
+        "copts": ["-mhvx"],
+    },
 }
 
 def _map_copts_to_msvc(copts):
@@ -238,23 +242,30 @@ def ynn_kernel_copts(unroll_loops = True):
 
 def ynn_binary_linkopts():
     return select({
+        "//ynnpack:hexagon": [
+            "-shared",  # NOTE(review): presumably Hexagon binaries are loaded as shared objects; confirm.
+            "-Wno-unused-command-line-argument",
+        ],
         "//conditions:default": [],
     })
 
 def ynn_binary_malloc():
     return select({
+        "//ynnpack:hexagon": "@bazel_tools//tools/cpp:malloc",  # Currently identical to the default branch.
         "//conditions:default": "@bazel_tools//tools/cpp:malloc",
     })
 
 def ynn_test_deps():
     return select({
+        "//ynnpack:hexagon": [
+            "@com_google_googletest//:gtest",
+            "//ynnpack/base/hexagon:test_main",  # Provides main() instead of gtest_main.
+        ],
         "//conditions:default": ["@com_google_googletest//:gtest_main"],
     })
 
 def ynn_benchmark_deps():
-    return select({
-        "//conditions:default": ["@com_google_benchmark//:benchmark_main"],
-    })
+    return ["@com_google_benchmark//:benchmark_main"]  # Same on every platform, so no select() is needed.
 
 def ynn_cc_library(
         name,

diff --git a/ynnpack/kernels/transpose/BUILD b/ynnpack/kernels/transpose/BUILD
@@ -37,6 +37,10 @@ ynn_cc_library(
             "x86_avx2.h",
             "x86_avx2.cc",
         ],
+        "hvx": [
+            "hvx.h",
+            "hvx.cc",
+        ],
     },
     visibility = ["//ynnpack:__subpackages__"],
     deps = [

diff --git a/ynnpack/kernels/transpose/hvx.cc b/ynnpack/kernels/transpose/hvx.cc
@@ -0,0 +1,99 @@
+// Copyright 2025 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include "ynnpack/kernels/transpose/hvx.h"
+
+#include <hexagon_types.h>
+
+#include <array>
+#include <cassert>
+#include <cstddef>
+#include <type_traits>
+
+#include "ynnpack/kernels/transpose/generic.h"
+#include "ynnpack/kernels/transpose/interleave.h"
+#include "ynnpack/kernels/transpose/transpose.h"
+
+namespace ynn {
+
+void transpose_x32_hvx(size_t m, size_t n, size_t n_bytes_a, size_t stride_a,
+                       const void* a, size_t stride_x, void* x) {
+  using tile = std::array<HVX_Vector, 32>;  // Tile: 32 HVX vectors.
+  transpose<tile>(m, n, n_bytes_a, stride_a, a, stride_x, x,
+                  std::integral_constant<size_t, 32>{});
+}
+void transpose_x64_hvx(size_t m, size_t n, size_t n_bytes_a, size_t stride_a,
+                       const void* a, size_t stride_x, void* x) {
+  using tile = std::array<HVX_Vector, 16>;  // Tile: 16 HVX vectors.
+  transpose<tile>(m, n, n_bytes_a, stride_a, a, stride_x, x,
+                  std::integral_constant<size_t, 64>{});
+}
+void transpose_x128_hvx(size_t m, size_t n, size_t n_bytes_a, size_t stride_a,
+                        const void* a, size_t stride_x, void* x) {
+  using tile = std::array<HVX_Vector, 8>;  // Tile: 8 HVX vectors.
+  transpose<tile>(m, n, n_bytes_a, stride_a, a, stride_x, x,
+                  std::integral_constant<size_t, 128>{});
+}
+void transpose_x256_hvx(size_t m, size_t n, size_t n_bytes_a, size_t stride_a,
+                        const void* a, size_t stride_x, void* x) {
+  using tile = std::array<HVX_Vector, 4>;  // Tile: 4 HVX vectors.
+  transpose<tile>(m, n, n_bytes_a, stride_a, a, stride_x, x,
+                  std::integral_constant<size_t, 256>{});
+}
+void transpose_x512_hvx(size_t m, size_t n, size_t n_bytes_a, size_t stride_a,
+                        const void* a, size_t stride_x, void* x) {
+  using tile = std::array<HVX_Vector, 2>;  // Tile: 2 HVX vectors.
+  transpose<tile>(m, n, n_bytes_a, stride_a, a, stride_x, x,
+                  std::integral_constant<size_t, 512>{});
+}
+void transpose_x1024_hvx(size_t m, size_t n, size_t n_bytes_a, size_t stride_a,
+                         const void* a, size_t stride_x, void* x) {
+  using x1024_tag = std::integral_constant<size_t, 1024>;  // No tile type here.
+  transpose(m, n, n_bytes_a, stride_a, a, stride_x, x, x1024_tag{});
+}
+
+void interleave2_x8_hvx(size_t factor, size_t m, size_t n, size_t stride_a,
+                        const void* a, void* x) {
+  assert(factor == 2);  // This kernel only handles a factor of 2.
+  using tile = std::array<HVX_Vector, 2>;
+  interleave<tile>(m, n, stride_a, a, x, std::integral_constant<size_t, 8>{});
+}
+
+void interleave2_x16_hvx(size_t factor, size_t m, size_t n, size_t stride_a,
+                         const void* a, void* x) {
+  assert(factor == 2);  // This kernel only handles a factor of 2.
+  using tile = std::array<HVX_Vector, 2>;
+  interleave<tile>(m, n, stride_a, a, x, std::integral_constant<size_t, 16>{});
+}
+
+void interleave2_x32_hvx(size_t factor, size_t m, size_t n, size_t stride_a,
+                         const void* a, void* x) {
+  assert(factor == 2);  // This kernel only handles a factor of 2.
+  using tile = std::array<HVX_Vector, 2>;
+  interleave<tile>(m, n, stride_a, a, x, std::integral_constant<size_t, 32>{});
+}
+
+void interleave4_x8_hvx(size_t factor, size_t m, size_t n, size_t stride_a,
+                        const void* a, void* x) {
+  assert(factor == 4);  // This kernel only handles a factor of 4.
+  using tile = std::array<HVX_Vector, 4>;
+  interleave<tile>(m, n, stride_a, a, x, std::integral_constant<size_t, 8>{});
+}
+
+void interleave4_x16_hvx(size_t factor, size_t m, size_t n, size_t stride_a,
+                         const void* a, void* x) {
+  assert(factor == 4);  // This kernel only handles a factor of 4.
+  using tile = std::array<HVX_Vector, 4>;
+  interleave<tile>(m, n, stride_a, a, x, std::integral_constant<size_t, 16>{});
+}
+
+void interleave4_x32_hvx(size_t factor, size_t m, size_t n, size_t stride_a,
+                         const void* a, void* x) {
+  assert(factor == 4);  // This kernel only handles a factor of 4.
+  using tile = std::array<HVX_Vector, 4>;
+  interleave<tile>(m, n, stride_a, a, x, std::integral_constant<size_t, 32>{});
+}
+
+}  // namespace ynn
diff --git a/ynnpack/kernels/transpose/hvx.h b/ynnpack/kernels/transpose/hvx.h
@@ -0,0 +1,61 @@
+// Copyright 2025 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#ifndef XNNPACK_YNNPACK_KERNELS_TRANSPOSE_HVX_H_
+#define XNNPACK_YNNPACK_KERNELS_TRANSPOSE_HVX_H_
+
+#include <hexagon_protos.h>
+#include <hexagon_types.h>
+#include <hvx_hexagon_protos.h>
+
+#include <array>
+#include <cassert>
+#include <cstddef>
+#include <type_traits>
+
+#include "ynnpack/base/arithmetic.h"
+
+namespace ynn {
+
+template <typename ElemSizeBits>
+static std::array<HVX_Vector, 2> interleave(ElemSizeBits elem_size_bits,
+                                            std::array<HVX_Vector, 2> x) {
+  const HVX_VectorPair vv = Q6_W_vshuff_VVR(x[1], x[0], -(elem_size_bits / 8));
+  return {Q6_V_lo_W(vv), Q6_V_hi_W(vv)};  // Low vector first, then high.
+}
+
+// Loads M rows, `stride` bytes apart, via HVX_UVector; rows >= m become zero.
+// NOTE(review): each access spans a whole HVX_UVector, yet n_bytes is 16?
+template <size_t M>
+static std::array<HVX_Vector, M> load(
+    std::array<HVX_Vector, M>, const void* a, size_t stride, size_t m,
+    std::integral_constant<size_t, 16> /*n_bytes*/) {
+  assert(0 < m && m <= M);
+  std::array<HVX_Vector, M> rows;
+  rows[0] = *reinterpret_cast<const HVX_UVector*>(a);
+  for (size_t r = 1; r < M; ++r) {
+    rows[r] = r < m ? *reinterpret_cast<const HVX_UVector*>(
+                          offset_bytes(a, r * stride))
+                    : Q6_V_vsplat_R(0);
+  }
+  return rows;
+}
+
+// Stores rows 0..m-1 of x, `stride` bytes apart, via HVX_UVector.
+// NOTE(review): each access spans a whole HVX_UVector, yet n_bytes is 16?
+template <size_t M>
+static void store(std::array<HVX_Vector, M> x, void* a, size_t stride, size_t m,
+                  std::integral_constant<size_t, 16> /*n_bytes*/) {
+  assert(0 < m && m <= M);
+  *reinterpret_cast<HVX_UVector*>(a) = x[0];
+  for (size_t r = 1; r < M; ++r) {
+    if (r >= m) continue;
+    *reinterpret_cast<HVX_UVector*>(offset_bytes(a, r * stride)) = x[r];
+  }
+}
+
+}  // namespace ynn
+
+#endif  // XNNPACK_YNNPACK_KERNELS_TRANSPOSE_HVX_H_
diff --git a/ynnpack/kernels/transpose/interleave.inc b/ynnpack/kernels/transpose/interleave.inc
@@ -27,6 +27,15 @@ YNN_INTERLEAVE_KERNEL(arch_flag::neon, interleave4_x16_neon, 4, 16)
 YNN_INTERLEAVE_KERNEL(arch_flag::neon, interleave4_x32_neon, 4, 32)
 #endif  // YNN_ARCH_ARM_NEON
 
+#ifdef YNN_ARCH_HVX  // NOTE(review): presumably defined by the build; confirm.
+YNN_INTERLEAVE_KERNEL(arch_flag::hvx, interleave2_x8_hvx, 2, 8)
+YNN_INTERLEAVE_KERNEL(arch_flag::hvx, interleave2_x16_hvx, 2, 16)
+YNN_INTERLEAVE_KERNEL(arch_flag::hvx, interleave2_x32_hvx, 2, 32)
+YNN_INTERLEAVE_KERNEL(arch_flag::hvx, interleave4_x8_hvx, 4, 8)
+YNN_INTERLEAVE_KERNEL(arch_flag::hvx, interleave4_x16_hvx, 4, 16)
+YNN_INTERLEAVE_KERNEL(arch_flag::hvx, interleave4_x32_hvx, 4, 32)
+#endif  // YNN_ARCH_HVX
+
 YNN_INTERLEAVE_KERNEL(arch_flag::none, interleave_x4, 0, 4)
 YNN_INTERLEAVE_KERNEL(arch_flag::none, interleave_x8, 0, 8)
 YNN_INTERLEAVE_KERNEL(arch_flag::none, interleave_x16, 0, 16)

diff --git a/ynnpack/kernels/transpose/transpose.inc b/ynnpack/kernels/transpose/transpose.inc
@@ -32,6 +32,15 @@ YNN_TRANSPOSE_KERNEL(arch_flag::neon, transpose_x512_neon, 512)
 YNN_TRANSPOSE_KERNEL(arch_flag::neon, transpose_x1024_neon, 1024)
 #endif  // YNN_ARCH_ARM_NEON
 
+#ifdef YNN_ARCH_HVX  // NOTE(review): presumably defined by the build; confirm.
+YNN_TRANSPOSE_KERNEL(arch_flag::hvx, transpose_x32_hvx, 32)
+YNN_TRANSPOSE_KERNEL(arch_flag::hvx, transpose_x64_hvx, 64)
+YNN_TRANSPOSE_KERNEL(arch_flag::hvx, transpose_x128_hvx, 128)
+YNN_TRANSPOSE_KERNEL(arch_flag::hvx, transpose_x256_hvx, 256)
+YNN_TRANSPOSE_KERNEL(arch_flag::hvx, transpose_x512_hvx, 512)
+YNN_TRANSPOSE_KERNEL(arch_flag::hvx, transpose_x1024_hvx, 1024)
+#endif  // YNN_ARCH_HVX
+
 YNN_TRANSPOSE_KERNEL(arch_flag::none, transpose_x4, 4)
 YNN_TRANSPOSE_KERNEL(arch_flag::none, transpose_x8, 8)
 YNN_TRANSPOSE_KERNEL(arch_flag::none, transpose_x16, 16)