Commit aa3c949a by Connal de Souza Committed by Copybara-Service

Optimize CRC32 Extend for large inputs on Arm

This is a temporary workaround for an apparent compiler bug with pmull(2) instructions. The current hot loop looks like this:

mov	w14, #0xef02,
lsl	x15, x15, #6,
mov	x13, xzr,
movk	w14, #0x740e, lsl #16,
sub	x15, x15, #0x40,
ldr	q4, [x16, #0x4e0],

_LOOP_START:
add	x16, x9, x13,
add	x17, x12, x13,
fmov	d19, x14,            <--------- This is Loop invariant and expensive
add	x13, x13, #0x40,
cmp	x15, x13,
prfm	pldl1keep, [x16, #0x140],
prfm	pldl1keep, [x17, #0x140],
ldp	x18, x0, [x16, #0x40],
crc32cx	w10, w10, x18,
ldp	x2, x18, [x16, #0x50],
crc32cx	w10, w10, x0,
crc32cx	w10, w10, x2,
ldp	x0, x2, [x16, #0x60],
crc32cx	w10, w10, x18,
ldp	x18, x16, [x16, #0x70],
pmull2	v5.1q, v1.2d, v4.2d,
pmull2	v6.1q, v0.2d, v4.2d,
pmull2	v7.1q, v2.2d, v4.2d,
pmull2	v16.1q, v3.2d, v4.2d,
ldp	q17, q18, [x17, #0x40],
crc32cx	w10, w10, x0,
pmull	v1.1q, v1.1d, v19.1d,
crc32cx	w10, w10, x2,
pmull	v0.1q, v0.1d, v19.1d,
crc32cx	w10, w10, x18,
pmull	v2.1q, v2.1d, v19.1d,
crc32cx	w10, w10, x16,
pmull	v3.1q, v3.1d, v19.1d,
ldp	q20, q21, [x17, #0x60],
eor	v1.16b, v17.16b, v1.16b,
eor	v0.16b, v18.16b, v0.16b,
eor	v1.16b, v1.16b, v5.16b,
eor	v2.16b, v20.16b, v2.16b,
eor	v0.16b, v0.16b, v6.16b,
eor	v3.16b, v21.16b, v3.16b,
eor	v2.16b, v2.16b, v7.16b,
eor	v3.16b, v3.16b, v16.16b,
b.ne	_LOOP_START

There is a redundant fmov that moves the same constant into a Neon register every loop iteration to be used in the PMULL instructions. The PMULL2 instructions already have this constant loaded into Neon registers. After this change, both the PMULL and PMULL2 instructions use the values in q4, and they are not reloaded every iteration. This fmov was expensive because it contends for execution units with crc32cx instructions. This is up to 20% faster for large inputs.

PiperOrigin-RevId: 567391972
Change-Id: I4c8e49750cfa5cc5730c3bb713bd9fd67657804a
parent 821756c3
......@@ -225,8 +225,8 @@ inline void V128_Store(V128* dst, V128 data) {
// Using inline assembly as clang does not generate the pmull2 instruction and
// performance drops by 15-20%.
// TODO(b/193678732): Investigate why the compiler decides not to generate
// such instructions and why it becomes so much worse.
// TODO(b/193678732): Investigate why there is a slight performance hit when
// using intrinsics instead of inline assembly.
inline V128 V128_PMulHi(const V128 l, const V128 r) {
uint64x2_t res;
__asm__ __volatile__("pmull2 %0.1q, %1.2d, %2.2d \n\t"
......@@ -235,10 +235,14 @@ inline V128 V128_PMulHi(const V128 l, const V128 r) {
return res;
}
// TODO(b/193678732): Investigate why the compiler decides to move the constant
// loop multiplicands from GPR to Neon registers every loop iteration.
//
// Carry-less (polynomial) multiply of the low 64-bit lanes of `l` and `r`,
// yielding the full 128-bit product. Written in inline assembly — mirroring
// V128_PMulHi above — rather than via the vmull_p64 intrinsic, because with
// the intrinsic the compiler re-materialized the loop-invariant multiplicand
// from a GPR into a Neon register (an `fmov`) on every loop iteration,
// contending with the crc32cx instructions for execution units.
inline V128 V128_PMulLow(const V128 l, const V128 r) {
  uint64x2_t res;
  __asm__ __volatile__("pmull %0.1q, %1.1d, %2.1d \n\t"
                       : "=w"(res)
                       : "w"(l), "w"(r));
  return res;
}
inline V128 V128_PMul01(const V128 l, const V128 r) {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment