WARNING: THIS SITE IS A MIRROR OF GITHUB.COM / YOU CANNOT LOG IN OR REGISTER ACCOUNTS HERE / THE CONTENTS ARE PROVIDED AS-IS / THIS SITE ASSUMES NO RESPONSIBILITY FOR ANY DISPLAYED CONTENT OR LINKS / IF YOU FIND SOMETHING THAT MAY NOT BE APPROPRIATE FOR EVERYONE, CONTACT THE ADMIN AT ilovescratch@foxmail.com
Skip to content

Commit c4c989d

Browse files
authored
cpu: aarch64: conv: optimize sve_1x1 kernel (#4405)
1 parent b048ba2 commit c4c989d

File tree

4 files changed

+130
-124
lines changed

4 files changed

+130
-124
lines changed

src/cpu/aarch64/brgemm/jit_brgemm_kernel.cpp

Lines changed: 11 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -223,19 +223,6 @@ struct jit_brgemm_kernel_t : public jit_generator_t {
223223
PReg ld_tail_mask = PReg(3);
224224
PReg gemv_tail_mask = PReg(4);
225225

226-
void add_vl_or_imm(XReg dst, XReg src, int offset) {
227-
// If offset is a multiple of the vector length and
228-
// offset / vector_length is compatible with addvl
229-
// use the addvl instruction. Refer to https://developer.arm.com/documentation/ddi0596/2021-03/SVE-Instructions/ADDVL--Add-multiple-of-vector-register-size-to-scalar-register-
230-
// for details.
231-
if ((offset % cpu_sveLen == 0)
232-
&& (offset / static_cast<int>(cpu_sveLen) >= -32)
233-
&& (offset / cpu_sveLen <= 31))
234-
addvl(dst, src, offset / cpu_sveLen);
235-
else
236-
add_imm(dst, src, offset, X_TMP_0);
237-
}
238-
239226
// Returns the vector register used as accumulator for position (bd, ld).
// Positions are linearised as bd * ld_block + ld and registers are handed
// out downwards, starting from index max_effective_vregs - 1.
ZReg accm(int ld_block, int bd, int ld) const {
    const int acc_slot = bd * ld_block + ld;
    return ZReg(max_effective_vregs - 1 - acc_slot);
}
@@ -727,7 +714,7 @@ void jit_brgemm_kernel_t::zero_accumulators(int bd_block2, bool is_bdb_tail,
727714
const int offset = C_offset(bd, ld);
728715

729716
if (!use_mul_vl(offset - base_offset, 4, cpu_sveLen)) {
730-
add_vl_or_imm(reg_tmp_, x_addr, offset - base_offset);
717+
add_vl_or_imm(reg_tmp_, x_addr, offset - base_offset, X_TMP_0);
731718
base_offset = offset;
732719
x_addr = reg_tmp_;
733720
}
@@ -1061,7 +1048,7 @@ void jit_brgemm_kernel_t::store_accumulators_apply_post_ops(
10611048
const int offset = D_offset(bd, ld);
10621049
if (!use_mul_vl(offset - base_offset,
10631050
types::data_type_size(brg.dt_d), cpu_sveLen)) {
1064-
add_vl_or_imm(reg_tmp_, x_addr, offset - base_offset);
1051+
add_vl_or_imm(reg_tmp_, x_addr, offset - base_offset, X_TMP_0);
10651052
base_offset = offset;
10661053
x_addr = reg_tmp_;
10671054
}
@@ -1207,7 +1194,7 @@ void jit_brgemm_kernel_t::store_accumulators_without_post_ops(
12071194
const auto mask = is_ld_tail ? ld_tail_mask : P_ALL_ONE;
12081195
const int offset = C_offset(bd, ld);
12091196
if (!use_mul_vl(offset - base_offset, 4, cpu_sveLen)) {
1210-
add_vl_or_imm(reg_tmp_, x_addr, offset - base_offset);
1197+
add_vl_or_imm(reg_tmp_, x_addr, offset - base_offset, X_TMP_0);
12111198
base_offset = offset;
12121199
x_addr = reg_tmp_;
12131200
}
@@ -1522,7 +1509,8 @@ void jit_brgemm_kernel_t::gemm_microkernel(int bd_block2, bool is_bdb_tail,
15221509
auto offset_ = offset - base_offset;
15231510
// The ld1rw immediate must be <=252 and a multiple of 4
15241511
// refer to https://developer.arm.com/documentation/ddi0602/2025-09/SVE-Instructions/LD1RW--Load-and-broadcast-unsigned-word-to-vector-
1525-
if ((offset_ > 252 || offset_ % 4 != 0) && !brg.is_gemv) {
1512+
if ((offset_ < 0 || offset_ > 252 || offset_ % 4 != 0)
1513+
&& !brg.is_gemv) {
15261514
add_imm(X_TMP_2, X_TMP_2, offset_, X_TMP_0);
15271515
base_offset += offset_;
15281516
offset_ = 0;
@@ -1573,7 +1561,7 @@ void jit_brgemm_kernel_t::gemm_microkernel(int bd_block2, bool is_bdb_tail,
15731561
auto vmm = accm(ld_block2, bd, ld);
15741562
if (is_emdbd) {
15751563
// The ld1rw immediate must be <=252 and a multiple of 4
1576-
if (A_offset(bd, rd) <= 252
1564+
if (A_offset(bd, rd) >= 0 && A_offset(bd, rd) <= 252
15771565
&& A_offset(bd, rd) % 4 == 0) {
15781566
ld1rw(load().s, mask / T_z,
15791567
ptr(reg_aux_A, A_offset(bd, rd)));
@@ -1602,7 +1590,8 @@ void jit_brgemm_kernel_t::gemm_microkernel(int bd_block2, bool is_bdb_tail,
16021590
} else {
16031591
const int offset = B_offset(ld, rd);
16041592
if (!use_mul_vl(offset - base_offset, 4, cpu_sveLen)) {
1605-
add_vl_or_imm(reg_tmp_, x_addr, offset - base_offset);
1593+
add_vl_or_imm(reg_tmp_, x_addr, offset - base_offset,
1594+
X_TMP_0);
16061595
base_offset = offset;
16071596
x_addr = reg_tmp_;
16081597
}
@@ -1631,7 +1620,7 @@ void jit_brgemm_kernel_t::gemm_microkernel(int bd_block2, bool is_bdb_tail,
16311620
? gemv_tail_mask
16321621
: P_ALL_ONE;
16331622
// The ld1rw immediate must be <= 252 and a multiple of 4
1634-
if (A_offset(bd, rd) <= 252
1623+
if (A_offset(bd, rd) >= 0 && A_offset(bd, rd) <= 252
16351624
&& A_offset(bd, rd) % 4 == 0) {
16361625
ld1rw(z_tmp_1().s, mask / T_z,
16371626
ptr(reg_aux_A, A_offset(bd, rd)));
@@ -1690,9 +1679,9 @@ void jit_brgemm_kernel_t::ldb_loop(int bd_block2, bool is_bdb_tail,
16901679
is_ld_tail, vpad, rows_for_rd_tail);
16911680

16921681
add_vl_or_imm(reg_aux_A, reg_aux_A,
1693-
brg.is_gemv ? cpu_sveLen : rdb_A_offset());
1682+
brg.is_gemv ? cpu_sveLen : rdb_A_offset(), X_TMP_0);
16941683
add_vl_or_imm(reg_aux_B, reg_aux_B,
1695-
brg.is_gemv ? cpu_sveLen : rdb_B_offset());
1684+
brg.is_gemv ? cpu_sveLen : rdb_B_offset(), X_TMP_0);
16961685

16971686
subs(reg_rdb_loop, reg_rdb_loop, 1);
16981687
}

src/cpu/aarch64/jit_generator.hpp

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -261,6 +261,20 @@ class jit_generator_t : public Xbyak_aarch64::CodeGenerator,
261261
return (off_mod == 0 && -8 <= off_mul_vl && off_mul_vl <= 7);
262262
}
263263

264+
// Emits code computing dst = src + offset, preferring a single addvl.
//
// addvl adds a signed multiple of the vector length to a scalar register,
// so it can only be used when offset is an exact multiple of cpu_sveLen and
// the multiplier lies in [-32, 31]. Refer to
// https://developer.arm.com/documentation/ddi0596/2021-03/SVE-Instructions/ADDVL--Add-multiple-of-vector-register-size-to-scalar-register-
// for details. Any other offset is handed to add_imm.
//
// dst    - register receiving the result.
// src    - register the offset is added to.
// offset - signed offset, expressed in the same unit as cpu_sveLen.
// tmp    - scratch register forwarded to add_imm; it is only passed along on
//          the fallback path, the addvl path does not reference it.
inline void add_vl_or_imm(Xbyak_aarch64::XReg dst, Xbyak_aarch64::XReg src,
        int offset, Xbyak_aarch64::XReg tmp) {
    // Perform every check in signed arithmetic. cpu_sveLen is declared
    // outside this block; if it has an unsigned type, mixing it directly with
    // a negative offset converts the offset to a huge unsigned value, which
    // makes the upper-bound test fail and leaves the lower bound unreachable.
    const int vec_len = static_cast<int>(cpu_sveLen);
    const int vl_multiple = offset / vec_len;
    const bool fits_addvl = (offset % vec_len == 0) && (vl_multiple >= -32)
            && (vl_multiple <= 31);

    if (fits_addvl)
        addvl(dst, src, vl_multiple);
    else
        add_imm(dst, src, offset, tmp);
}
277+
264278
template <typename PRegBHSD, typename T>
265279
void set_preg(const PRegBHSD &p, T tail_size,
266280
const Xbyak_aarch64::XReg x_tmp0 = Xbyak_aarch64::XReg(DUMMY_IDX),

0 commit comments

Comments
 (0)