Commit 61e47a45 by Connal de Souza, committed by Copybara-Service

Optimize crc32 V128_From2x64 on Arm

This removes redundant vector-vector moves and results in Extend being up to 3% faster.

PiperOrigin-RevId: 621948170
Change-Id: Id82816aa6e294d34140ff591103cb20feac79d9a
parent 1ec4a27e
@@ -102,10 +102,11 @@ V128 V128_Xor(const V128 l, const V128 r);
 // Produces an AND operation of |l| and |r|.
 V128 V128_And(const V128 l, const V128 r);
-// Sets two 64 bit integers to one 128 bit vector. The order is reverse.
+// Sets the lower half of a 128 bit register to the given 64-bit value and
+// zeroes the upper half.
 // dst[63:0] := |r|
-// dst[127:64] := |l|
-V128 V128_From2x64(const uint64_t l, const uint64_t r);
+// dst[127:64] := |0|
+V128 V128_From64WithZeroFill(const uint64_t r);
 // Shift |l| right by |imm| bytes while shifting in zeros.
 template <int imm>
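To make the documented contract concrete, here is a minimal usage sketch written against plain NEON, assuming V128 is uint64x2_t as in the Arm section of this header (the test value and the main() harness are mine, not part of the CL):

#include <arm_neon.h>
#include <cassert>
#include <cstdint>

int main() {
  // Mirror of V128_From64WithZeroFill: insert r into lane 0 of a zero vector.
  constexpr uint64x2_t kZero = {0, 0};
  uint64x2_t v = vsetq_lane_u64(0x0123456789abcdefULL, kZero, 0);
  assert(vgetq_lane_u64(v, 0) == 0x0123456789abcdefULL);  // dst[63:0]   == |r|
  assert(vgetq_lane_u64(v, 1) == 0);                      // dst[127:64] == 0
  return 0;
}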
@@ -171,8 +172,8 @@ inline V128 V128_Xor(const V128 l, const V128 r) { return _mm_xor_si128(l, r); }
 inline V128 V128_And(const V128 l, const V128 r) { return _mm_and_si128(l, r); }
-inline V128 V128_From2x64(const uint64_t l, const uint64_t r) {
-  return _mm_set_epi64x(static_cast<int64_t>(l), static_cast<int64_t>(r));
+inline V128 V128_From64WithZeroFill(const uint64_t r) {
+  return _mm_set_epi64x(static_cast<int64_t>(0), static_cast<int64_t>(r));
 }
 template <int imm>
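On the SSE path the change is mechanical: _mm_set_epi64x(0, r) already zero-fills the upper lane, and with a constant zero it typically lowers to a single zero-extending movq, so only the name and signature change. A small equivalence sketch (hypothetical helper names; 64-bit targets assumed):

#include <emmintrin.h>
#include <cstdint>

// Both produce a vector with r in bits [63:0] and zeros in bits [127:64].
__m128i FromSetEpi64x(uint64_t r) {
  return _mm_set_epi64x(0, static_cast<int64_t>(r));
}
__m128i FromCvtsi64(uint64_t r) {
  return _mm_cvtsi64_si128(static_cast<int64_t>(r));  // single movq xmm, r64
}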
@@ -262,10 +263,12 @@ inline V128 V128_Xor(const V128 l, const V128 r) { return veorq_u64(l, r); }
 inline V128 V128_And(const V128 l, const V128 r) { return vandq_u64(l, r); }
-inline V128 V128_From2x64(const uint64_t l, const uint64_t r) {
-  return vcombine_u64(vcreate_u64(r), vcreate_u64(l));
+inline V128 V128_From64WithZeroFill(const uint64_t r) {
+  constexpr uint64x2_t kZero = {0, 0};
+  return vsetq_lane_u64(r, kZero, 0);
 }
 template <int imm>
 inline V128 V128_ShiftRight(const V128 l) {
   return vreinterpretq_u64_s8(
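The Arm hunk is where the win comes from. On AArch64, a write to a 64-bit D register implicitly zeroes the upper half of the corresponding V register, so inserting into lane 0 of a constant-zero vector can lower to a single fmov d0, x0, while the old vcreate/vcombine pairing tended to leave a redundant vector-vector move behind. A standalone before/after sketch (function names are mine, and the codegen notes describe typical compiler output, not a guarantee):

#include <arm_neon.h>
#include <cstdint>

// Before: glue two 64-bit halves together. Compilers often emit an fmov
// plus an extra move into the high lane even though that lane is zero.
uint64x2_t OldFrom2x64ZeroHigh(uint64_t r) {
  return vcombine_u64(vcreate_u64(r), vcreate_u64(0));
}

// After: insert r into lane 0 of an all-zero constant. The D-register
// write zero-fills the upper half, so this can become a single fmov.
uint64x2_t NewFrom64WithZeroFill(uint64_t r) {
  constexpr uint64x2_t kZero = {0, 0};
  return vsetq_lane_u64(r, kZero, 0);
}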
@@ -101,9 +101,9 @@ constexpr size_t kMediumCutoff = 2048;
 namespace {
 uint32_t multiply(uint32_t a, uint32_t b) {
-  V128 shifts = V128_From2x64(0, 1);
-  V128 power = V128_From2x64(0, a);
-  V128 crc = V128_From2x64(0, b);
+  V128 shifts = V128_From64WithZeroFill(1);
+  V128 power = V128_From64WithZeroFill(a);
+  V128 crc = V128_From64WithZeroFill(b);
   V128 res = V128_PMulLow(power, crc);
   // Combine crc values
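The multiply() routine above feeds the zero-filled vectors into V128_PMulLow, a carry-less multiply of the two low 64-bit lanes, which is why setting only the low lane is all the setup needed. For intuition, a scalar reference model of carry-less multiplication (hypothetical helper, not abseil code):

#include <cstdint>

// Carry-less multiply: long multiplication where partial products are
// combined with XOR instead of addition (polynomial arithmetic over GF(2)).
uint64_t Clmul32(uint32_t a, uint32_t b) {
  uint64_t acc = 0;
  for (int i = 0; i < 32; ++i) {
    if ((b >> i) & 1) acc ^= static_cast<uint64_t>(a) << i;
  }
  return acc;
}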
@@ -444,11 +444,11 @@ class CRC32AcceleratedX86ARMCombinedMultipleStreams
     V128 magic = *(reinterpret_cast<const V128*>(kClmulConstants) + bs - 1);
-    V128 tmp = V128_From2x64(0, l64);
+    V128 tmp = V128_From64WithZeroFill(l64);
     V128 res1 = V128_PMulLow(tmp, magic);
-    tmp = V128_From2x64(0, l641);
+    tmp = V128_From64WithZeroFill(l641);
     V128 res2 = V128_PMul10(tmp, magic);
     V128 x = V128_Xor(res1, res2);