Commit 61e47a45 by Connal de Souza, committed by Copybara-Service

Optimize crc32 V128_From2x64 on Arm

This removes redundant vector-vector moves and results in Extend being up to 3% faster.

PiperOrigin-RevId: 621948170
Change-Id: Id82816aa6e294d34140ff591103cb20feac79d9a
parent 1ec4a27e
@@ -102,10 +102,11 @@ V128 V128_Xor(const V128 l, const V128 r);
 // Produces an AND operation of |l| and |r|.
 V128 V128_And(const V128 l, const V128 r);
-// Sets two 64 bit integers to one 128 bit vector. The order is reverse.
+// Sets the lower half of a 128 bit register to the given 64-bit value and
+// zeroes the upper half.
 // dst[63:0] := |r|
-// dst[127:64] := |l|
-V128 V128_From2x64(const uint64_t l, const uint64_t r);
+// dst[127:64] := |0|
+V128 V128_From64WithZeroFill(const uint64_t r);
 // Shift |l| right by |imm| bytes while shifting in zeros.
 template <int imm>
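To make the documented contract concrete, here is a minimal usage sketch written against plain NEON, assuming V128 is uint64x2_t as in the Arm section of this header (the test value and the main() harness are mine, not part of the CL):

#include <arm_neon.h>
#include <cassert>
#include <cstdint>

int main() {
  // Mirror of V128_From64WithZeroFill: insert r into lane 0 of a zero vector.
  constexpr uint64x2_t kZero = {0, 0};
  uint64x2_t v = vsetq_lane_u64(0x0123456789abcdefULL, kZero, 0);
  assert(vgetq_lane_u64(v, 0) == 0x0123456789abcdefULL);  // dst[63:0]   == |r|
  assert(vgetq_lane_u64(v, 1) == 0);                      // dst[127:64] == 0
  return 0;
}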
@@ -171,8 +172,8 @@ inline V128 V128_Xor(const V128 l, const V128 r) { return _mm_xor_si128(l, r); }
 inline V128 V128_And(const V128 l, const V128 r) { return _mm_and_si128(l, r); }
-inline V128 V128_From2x64(const uint64_t l, const uint64_t r) {
-  return _mm_set_epi64x(static_cast<int64_t>(l), static_cast<int64_t>(r));
+inline V128 V128_From64WithZeroFill(const uint64_t r) {
+  return _mm_set_epi64x(static_cast<int64_t>(0), static_cast<int64_t>(r));
 }
 template <int imm>
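On the SSE path the change is mechanical: _mm_set_epi64x(0, r) already zero-fills the upper lane, and with a constant zero it typically lowers to a single zero-extending movq, so only the name and signature change. A small equivalence sketch (hypothetical helper names; 64-bit targets assumed):

#include <emmintrin.h>
#include <cstdint>

// Both produce a vector with r in bits [63:0] and zeros in bits [127:64].
__m128i FromSetEpi64x(uint64_t r) {
  return _mm_set_epi64x(0, static_cast<int64_t>(r));
}
__m128i FromCvtsi64(uint64_t r) {
  return _mm_cvtsi64_si128(static_cast<int64_t>(r));  // single movq xmm, r64
}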
@@ -262,10 +263,12 @@ inline V128 V128_Xor(const V128 l, const V128 r) { return veorq_u64(l, r); }
 inline V128 V128_And(const V128 l, const V128 r) { return vandq_u64(l, r); }
-inline V128 V128_From2x64(const uint64_t l, const uint64_t r) {
-  return vcombine_u64(vcreate_u64(r), vcreate_u64(l));
+inline V128 V128_From64WithZeroFill(const uint64_t r) {
+  constexpr uint64x2_t kZero = {0, 0};
+  return vsetq_lane_u64(r, kZero, 0);
 }
 template <int imm>
 inline V128 V128_ShiftRight(const V128 l) {
   return vreinterpretq_u64_s8(
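The Arm hunk is where the win comes from. On AArch64, a write to a 64-bit D register implicitly zeroes the upper half of the corresponding V register, so inserting into lane 0 of a constant-zero vector can lower to a single fmov d0, x0, while the old vcreate/vcombine pairing tended to leave a redundant vector-vector move behind. A standalone before/after sketch (function names are mine, and the codegen notes describe typical compiler output, not a guarantee):

#include <arm_neon.h>
#include <cstdint>

// Before: glue two 64-bit halves together. Compilers often emit an fmov
// plus an extra move into the high lane even though that lane is zero.
uint64x2_t OldFrom2x64ZeroHigh(uint64_t r) {
  return vcombine_u64(vcreate_u64(r), vcreate_u64(0));
}

// After: insert r into lane 0 of an all-zero constant. The D-register
// write zero-fills the upper half, so this can become a single fmov.
uint64x2_t NewFrom64WithZeroFill(uint64_t r) {
  constexpr uint64x2_t kZero = {0, 0};
  return vsetq_lane_u64(r, kZero, 0);
}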
@@ -101,9 +101,9 @@ constexpr size_t kMediumCutoff = 2048;
 namespace {
 uint32_t multiply(uint32_t a, uint32_t b) {
-  V128 shifts = V128_From2x64(0, 1);
-  V128 power = V128_From2x64(0, a);
-  V128 crc = V128_From2x64(0, b);
+  V128 shifts = V128_From64WithZeroFill(1);
+  V128 power = V128_From64WithZeroFill(a);
+  V128 crc = V128_From64WithZeroFill(b);
   V128 res = V128_PMulLow(power, crc);
   // Combine crc values
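The multiply() routine above feeds the zero-filled vectors into V128_PMulLow, a carry-less multiply of the two low 64-bit lanes, which is why setting only the low lane is all the setup needed. For intuition, a scalar reference model of carry-less multiplication (hypothetical helper, not abseil code):

#include <cstdint>

// Carry-less multiply: long multiplication where partial products are
// combined with XOR instead of addition (polynomial arithmetic over GF(2)).
uint64_t Clmul32(uint32_t a, uint32_t b) {
  uint64_t acc = 0;
  for (int i = 0; i < 32; ++i) {
    if ((b >> i) & 1) acc ^= static_cast<uint64_t>(a) << i;
  }
  return acc;
}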
@@ -444,11 +444,11 @@ class CRC32AcceleratedX86ARMCombinedMultipleStreams
     V128 magic = *(reinterpret_cast<const V128*>(kClmulConstants) + bs - 1);
-    V128 tmp = V128_From2x64(0, l64);
+    V128 tmp = V128_From64WithZeroFill(l64);
     V128 res1 = V128_PMulLow(tmp, magic);
-    tmp = V128_From2x64(0, l641);
+    tmp = V128_From64WithZeroFill(l641);
     V128 res2 = V128_PMul10(tmp, magic);
     V128 x = V128_Xor(res1, res2);