Commit aa3c949a by Connal de Souza Committed by Copybara-Service

Optimize CRC32 Extend for large inputs on Arm

This is a temporary workaround for an apparent compiler bug with pmull(2) instructions. The current hot loop looks like this:

mov	w14, #0xef02,
lsl	x15, x15, #6,
mov	x13, xzr,
movk	w14, #0x740e, lsl #16,
sub	x15, x15, #0x40,
ldr	q4, [x16, #0x4e0],

_LOOP_START:
add	x16, x9, x13,
add	x17, x12, x13,
fmov	d19, x14,            <--------- This is Loop invariant and expensive
add	x13, x13, #0x40,
cmp	x15, x13,
prfm	pldl1keep, [x16, #0x140],
prfm	pldl1keep, [x17, #0x140],
ldp	x18, x0, [x16, #0x40],
crc32cx	w10, w10, x18,
ldp	x2, x18, [x16, #0x50],
crc32cx	w10, w10, x0,
crc32cx	w10, w10, x2,
ldp	x0, x2, [x16, #0x60],
crc32cx	w10, w10, x18,
ldp	x18, x16, [x16, #0x70],
pmull2	v5.1q, v1.2d, v4.2d,
pmull2	v6.1q, v0.2d, v4.2d,
pmull2	v7.1q, v2.2d, v4.2d,
pmull2	v16.1q, v3.2d, v4.2d,
ldp	q17, q18, [x17, #0x40],
crc32cx	w10, w10, x0,
pmull	v1.1q, v1.1d, v19.1d,
crc32cx	w10, w10, x2,
pmull	v0.1q, v0.1d, v19.1d,
crc32cx	w10, w10, x18,
pmull	v2.1q, v2.1d, v19.1d,
crc32cx	w10, w10, x16,
pmull	v3.1q, v3.1d, v19.1d,
ldp	q20, q21, [x17, #0x60],
eor	v1.16b, v17.16b, v1.16b,
eor	v0.16b, v18.16b, v0.16b,
eor	v1.16b, v1.16b, v5.16b,
eor	v2.16b, v20.16b, v2.16b,
eor	v0.16b, v0.16b, v6.16b,
eor	v3.16b, v21.16b, v3.16b,
eor	v2.16b, v2.16b, v7.16b,
eor	v3.16b, v3.16b, v16.16b,
b.ne	_LOOP_START

There is a redundant fmov that moves the same constant into a Neon register every loop iteration to be used in the PMULL instructions. The PMULL2 instructions already have this constant loaded into Neon registers. After this change, both the PMULL and PMULL2 instructions use the values in q4, and they are not reloaded every iteration. This fmov was expensive because it contends for execution units with crc32cx instructions. This is up to 20% faster for large inputs.

PiperOrigin-RevId: 567391972
Change-Id: I4c8e49750cfa5cc5730c3bb713bd9fd67657804a
parent 821756c3
......@@ -225,8 +225,8 @@ inline void V128_Store(V128* dst, V128 data) {
// Using inline assembly as clang does not generate the pmull2 instruction and
// performance drops by 15-20%.
// TODO(b/193678732): Investigate why the compiler decides not to generate
// such instructions and why it becomes so much worse.
// TODO(b/193678732): Investigate why there is a slight performance hit when
// using intrinsics instead of inline assembly.
inline V128 V128_PMulHi(const V128 l, const V128 r) {
uint64x2_t res;
__asm__ __volatile__("pmull2 %0.1q, %1.2d, %2.2d \n\t"
......@@ -235,10 +235,14 @@ inline V128 V128_PMulHi(const V128 l, const V128 r) {
return res;
}
// TODO(b/193678732): Investigate why the compiler decides to move the constant
// loop multiplicands from GPR to Neon registers every loop iteration.
//
// Carry-less (polynomial) multiply of the low 64-bit lanes of `l` and `r`,
// yielding the full 128-bit product. Written in inline assembly — mirroring
// V128_PMulHi above — rather than via the vmull_p64 intrinsic, because with
// the intrinsic the compiler re-materialized the loop-invariant multiplicand
// from a GPR into a Neon register (an `fmov`) on every loop iteration,
// contending with the crc32cx instructions for execution units.
inline V128 V128_PMulLow(const V128 l, const V128 r) {
  uint64x2_t res;
  __asm__ __volatile__("pmull %0.1q, %1.1d, %2.1d \n\t"
                       : "=w"(res)
                       : "w"(l), "w"(r));
  return res;
}
inline V128 V128_PMul01(const V128 l, const V128 r) {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment