Commit 16452e14 by Chris Mihelich Committed by Copybara-Service

Decoder for Rust-style Punycode encodings of bounded length.

PiperOrigin-RevId: 647093624
Change-Id: Ic76bfa4aa8fb616cb23095ce7bfa30c3812dcb21
parent 63d4b2fe
......@@ -125,6 +125,8 @@ set(ABSL_INTERNAL_DLL_FILES
"debugging/internal/address_is_readable.cc"
"debugging/internal/address_is_readable.h"
"debugging/internal/bounded_utf8_length_sequence.h"
"debugging/internal/decode_rust_punycode.cc"
"debugging/internal/decode_rust_punycode.h"
"debugging/internal/demangle.cc"
"debugging/internal/demangle.h"
"debugging/internal/demangle_rust.cc"
......
......@@ -220,12 +220,14 @@ cc_library(
cc_library(
name = "demangle_internal",
srcs = [
"internal/decode_rust_punycode.cc",
"internal/demangle.cc",
"internal/demangle_rust.cc",
"internal/utf8_for_code_point.cc",
],
hdrs = [
"internal/bounded_utf8_length_sequence.h",
"internal/decode_rust_punycode.h",
"internal/demangle.h",
"internal/demangle_rust.h",
"internal/utf8_for_code_point.h",
......@@ -240,6 +242,7 @@ cc_library(
"//absl/base",
"//absl/base:config",
"//absl/base:core_headers",
"//absl/base:nullability",
"//absl/numeric:bits",
],
)
......@@ -258,6 +261,19 @@ cc_test(
)
cc_test(
name = "decode_rust_punycode_test",
srcs = ["internal/decode_rust_punycode_test.cc"],
copts = ABSL_TEST_COPTS,
linkopts = ABSL_DEFAULT_LINKOPTS,
deps = [
":demangle_internal",
"//absl/base:config",
"@com_google_googletest//:gtest",
"@com_google_googletest//:gtest_main",
],
)
cc_test(
name = "demangle_rust_test",
srcs = ["internal/demangle_rust_test.cc"],
copts = ABSL_TEST_COPTS,
......
......@@ -196,10 +196,12 @@ absl_cc_library(
demangle_internal
HDRS
"internal/bounded_utf8_length_sequence.h"
"internal/decode_rust_punycode.h"
"internal/demangle.h"
"internal/demangle_rust.h"
"internal/utf8_for_code_point.h"
SRCS
"internal/decode_rust_punycode.cc"
"internal/demangle.cc"
"internal/demangle_rust.cc"
"internal/utf8_for_code_point.cc"
......@@ -209,6 +211,7 @@ absl_cc_library(
absl::base
absl::bits
absl::core_headers
absl::nullability
PUBLIC
)
......@@ -227,6 +230,19 @@ absl_cc_test(
absl_cc_test(
NAME
decode_rust_punycode_test
SRCS
"internal/decode_rust_punycode_test.cc"
COPTS
${ABSL_TEST_COPTS}
DEPS
absl::demangle_internal
absl::config
GTest::gmock_main
)
absl_cc_test(
NAME
demangle_rust_test
SRCS
"internal/demangle_rust_test.cc"
......
// Copyright 2024 The Abseil Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "absl/debugging/internal/decode_rust_punycode.h"
#include <cstddef>
#include <cstdint>
#include <cstring>
#include "absl/base/config.h"
#include "absl/base/nullability.h"
#include "absl/debugging/internal/bounded_utf8_length_sequence.h"
#include "absl/debugging/internal/utf8_for_code_point.h"
namespace absl {
ABSL_NAMESPACE_BEGIN
namespace debugging_internal {
namespace {
// Decoding Punycode requires repeated random-access insertion into a stream of
// variable-length UTF-8 code-point encodings. We need this to be tolerably
// fast (no N^2 slowdown for unfortunate inputs), and we can't allocate any data
// structures on the heap (async-signal-safety).
//
// It is pragmatic to impose a moderately low limit on the identifier length and
// bail out if we ever hit it. Then BoundedUtf8LengthSequence efficiently
// determines where to insert the next code point, and memmove efficiently makes
// room for it.
//
// The chosen limit is a round number several times larger than identifiers
// expected in practice, yet still small enough that a memmove of this many
// UTF-8 characters is not much more expensive than the division and modulus
// operations that Punycode decoding requires.
constexpr uint32_t kMaxChars = 256;
// Constants from RFC 3492 section 5.
constexpr uint32_t kBase = 36, kTMin = 1, kTMax = 26, kSkew = 38, kDamp = 700;
constexpr uint32_t kMaxCodePoint = 0x10ffff;
// Overflow threshold in DecodeRustPunycode's inner loop; see comments there.
constexpr uint32_t kMaxI = 1 << 30;
// If punycode_begin .. punycode_end begins with a prefix matching the regular
// expression [0-9a-zA-Z_]+_, removes that prefix, copies all but the final
// underscore into out_begin .. out_end, sets num_ascii_chars to the number of
// bytes copied, and returns true. (A prefix of this sort represents the
// nonempty subsequence of ASCII characters in the corresponding plaintext.)
//
// If punycode_begin .. punycode_end does not contain an underscore, sets
// num_ascii_chars to zero and returns true. (The encoding of a plaintext
// without any ASCII characters does not carry such a prefix.)
//
// Returns false and zeroes num_ascii_chars on failure (either parse error or
// not enough space in the output buffer).
bool ConsumeOptionalAsciiPrefix(const char*& punycode_begin,
const char* const punycode_end,
char* const out_begin,
char* const out_end,
uint32_t& num_ascii_chars) {
num_ascii_chars = 0;
// Remember the last underscore if any. Also use the same string scan to
// reject any ASCII bytes that do not belong in an identifier, including NUL,
// as well as non-ASCII bytes, which should have been delta-encoded instead.
int last_underscore = -1;
for (int i = 0; i < punycode_end - punycode_begin; ++i) {
const char c = punycode_begin[i];
if (c == '_') {
last_underscore = i;
continue;
}
// We write out the meaning of absl::ascii_isalnum rather than call that
// function because its documentation does not promise it will remain
// async-signal-safe under future development.
if ('a' <= c && c <= 'z') continue;
if ('A' <= c && c <= 'Z') continue;
if ('0' <= c && c <= '9') continue;
return false;
}
// If there was no underscore, that means there were no ASCII characters in
// the plaintext, so there is no prefix to consume. Our work is done.
if (last_underscore < 0) return true;
// Otherwise there will be an underscore delimiter somewhere. It can't be
// initial because then there would be no ASCII characters to its left, and no
// delimiter would have been added in that case.
if (last_underscore == 0) return false;
// Any other position is reasonable. Make sure there's room in the buffer.
if (last_underscore + 1 > out_end - out_begin) return false;
// Consume and write out the ASCII characters.
num_ascii_chars = static_cast<uint32_t>(last_underscore);
std::memcpy(out_begin, punycode_begin, num_ascii_chars);
out_begin[num_ascii_chars] = '\0';
punycode_begin += num_ascii_chars + 1;
return true;
}
// Returns the value of `c` as a base-36 digit according to RFC 3492 section 5,
// or -1 if `c` is not such a digit.
int DigitValue(char c) {
if ('0' <= c && c <= '9') return c - '0' + 26;
if ('a' <= c && c <= 'z') return c - 'a';
if ('A' <= c && c <= 'Z') return c - 'A';
return -1;
}
// Consumes the next delta encoding from punycode_begin .. punycode_end,
// updating i accordingly. Returns true on success. Returns false on parse
// failure or arithmetic overflow.
bool ScanNextDelta(const char*& punycode_begin, const char* const punycode_end,
uint32_t bias, uint32_t& i) {
uint64_t w = 1; // 64 bits to prevent overflow in w *= kBase - t
// "for k = base to infinity in steps of base do begin ... end" in RFC 3492
// section 6.2. Each loop iteration scans one digit of the delta.
for (uint32_t k = kBase; punycode_begin != punycode_end; k += kBase) {
const int digit_value = DigitValue(*punycode_begin++);
if (digit_value < 0) return false;
// Compute this in 64-bit arithmetic so we can check for overflow afterward.
const uint64_t new_i = i + static_cast<uint64_t>(digit_value) * w;
// Valid deltas are bounded by (#chars already emitted) * kMaxCodePoint, but
// invalid input could encode an arbitrarily large delta. Nip that in the
// bud here.
static_assert(
kMaxI >= kMaxChars * kMaxCodePoint,
"kMaxI is too small to prevent spurious failures on good input");
if (new_i > kMaxI) return false;
static_assert(
kMaxI < (uint64_t{1} << 32),
"Make kMaxI smaller or i 64 bits wide to prevent silent wraparound");
i = static_cast<uint32_t>(new_i);
// Compute the threshold that determines whether this is the last digit and
// (if not) what the next digit's place value will be. This logic from RFC
// 3492 section 6.2 is explained in section 3.3.
uint32_t t;
if (k <= bias + kTMin) {
t = kTMin;
} else if (k >= bias + kTMax) {
t = kTMax;
} else {
t = k - bias;
}
if (static_cast<uint32_t>(digit_value) < t) return true;
// If this gets too large, the range check on new_i in the next iteration
// will catch it. We know this multiplication will not overwrap because w
// is 64 bits wide.
w *= kBase - t;
}
return false;
}
} // namespace
absl::Nullable<char*> DecodeRustPunycode(DecodeRustPunycodeOptions options) {
const char* punycode_begin = options.punycode_begin;
const char* const punycode_end = options.punycode_end;
char* const out_begin = options.out_begin;
char* const out_end = options.out_end;
// Write a NUL terminator first. Later memcpy calls will keep bumping it
// along to its new right place.
const size_t out_size = static_cast<size_t>(out_end - out_begin);
if (out_size == 0) return nullptr;
*out_begin = '\0';
// RFC 3492 section 6.2 begins here. We retain the names of integer variables
// appearing in that text.
uint32_t n = 128, i = 0, bias = 72, num_chars = 0;
// If there are any ASCII characters, consume them and their trailing
// underscore delimiter.
if (!ConsumeOptionalAsciiPrefix(punycode_begin, punycode_end,
out_begin, out_end, num_chars)) {
return nullptr;
}
uint32_t total_utf8_bytes = num_chars;
BoundedUtf8LengthSequence<kMaxChars> utf8_lengths;
// "while the input is not exhausted do begin ... end"
while (punycode_begin != punycode_end) {
if (num_chars >= kMaxChars) return nullptr;
const uint32_t old_i = i;
if (!ScanNextDelta(punycode_begin, punycode_end, bias, i)) return nullptr;
// Update bias as in RFC 3492 section 6.1. (We have inlined adapt.)
uint32_t delta = i - old_i;
delta /= (old_i == 0 ? kDamp : 2);
delta += delta/(num_chars + 1);
bias = 0;
while (delta > ((kBase - kTMin) * kTMax)/2) {
delta /= kBase - kTMin;
bias += kBase;
}
bias += ((kBase - kTMin + 1) * delta)/(delta + kSkew);
// Back in section 6.2, compute the new code point and insertion index.
static_assert(
kMaxI + kMaxCodePoint < (uint64_t{1} << 32),
"Make kMaxI smaller or n 64 bits wide to prevent silent wraparound");
n += i/(num_chars + 1);
i %= num_chars + 1;
// To actually insert, we need to convert the code point n to UTF-8 and the
// character index i to an index into the byte stream emitted so far. First
// prepare the UTF-8 encoding for n, rejecting surrogates, overlarge values,
// and anything that won't fit into the remaining output storage.
Utf8ForCodePoint utf8_for_code_point(n);
if (!utf8_for_code_point.ok()) return nullptr;
if (total_utf8_bytes + utf8_for_code_point.length + 1 > out_size) {
return nullptr;
}
// Now insert the new character into both our length map and the output.
uint32_t n_index =
utf8_lengths.InsertAndReturnSumOfPredecessors(
i, utf8_for_code_point.length);
std::memmove(
out_begin + n_index + utf8_for_code_point.length, out_begin + n_index,
total_utf8_bytes + 1 - n_index);
std::memcpy(out_begin + n_index, utf8_for_code_point.bytes,
utf8_for_code_point.length);
total_utf8_bytes += utf8_for_code_point.length;
++num_chars;
// Finally, advance to the next state before continuing.
++i;
}
return out_begin + total_utf8_bytes;
}
} // namespace debugging_internal
ABSL_NAMESPACE_END
} // namespace absl
// Copyright 2024 The Abseil Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef ABSL_DEBUGGING_INTERNAL_DECODE_RUST_PUNYCODE_H_
#define ABSL_DEBUGGING_INTERNAL_DECODE_RUST_PUNYCODE_H_
#include "absl/base/config.h"
#include "absl/base/nullability.h"
namespace absl {
ABSL_NAMESPACE_BEGIN
namespace debugging_internal {
struct DecodeRustPunycodeOptions {
const char* punycode_begin;
const char* punycode_end;
char* out_begin;
char* out_end;
};
// Given Rust Punycode in `punycode_begin .. punycode_end`, writes the
// corresponding UTF-8 plaintext into `out_begin .. out_end`, followed by a NUL
// character, and returns a pointer to that final NUL on success. On failure
// returns a null pointer, and the contents of `out_begin .. out_end` are
// unspecified.
//
// Failure occurs in precisely these cases:
// - Any input byte does not match [0-9a-zA-Z_].
// - The first input byte is an underscore, but no other underscore appears in
// the input.
// - The delta sequence does not represent a valid sequence of code-point
// insertions.
// - The plaintext would contain more than 256 code points.
//
// DecodeRustPunycode is async-signal-safe with bounded runtime and a small
// stack footprint, making it suitable for use in demangling Rust symbol names
// from a signal handler.
absl::Nullable<char*> DecodeRustPunycode(DecodeRustPunycodeOptions options);
} // namespace debugging_internal
ABSL_NAMESPACE_END
} // namespace absl
#endif // ABSL_DEBUGGING_INTERNAL_DECODE_RUST_PUNYCODE_H_
// Copyright 2024 The Abseil Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "absl/debugging/internal/decode_rust_punycode.h"
#include <cstddef>
#include <cstring>
#include <string>
#include "gmock/gmock.h"
#include "gtest/gtest.h"
#include "absl/base/config.h"
namespace absl {
ABSL_NAMESPACE_BEGIN
namespace debugging_internal {
namespace {
using ::testing::AllOf;
using ::testing::Eq;
using ::testing::IsNull;
using ::testing::Pointee;
using ::testing::ResultOf;
using ::testing::StrEq;
class DecodeRustPunycodeTest : public ::testing::Test {
protected:
void FillBufferWithNonzeroBytes() {
// The choice of nonzero value to fill with is arbitrary. The point is just
// to fail tests if DecodeRustPunycode forgets to write the final NUL
// character.
std::memset(buffer_storage_, 0xab, sizeof(buffer_storage_));
}
DecodeRustPunycodeOptions WithAmpleSpace() {
FillBufferWithNonzeroBytes();
DecodeRustPunycodeOptions options;
options.punycode_begin = punycode_.data();
options.punycode_end = punycode_.data() + punycode_.size();
options.out_begin = buffer_storage_;
options.out_end = buffer_storage_ + sizeof(buffer_storage_);
return options;
}
DecodeRustPunycodeOptions WithJustEnoughSpace() {
FillBufferWithNonzeroBytes();
const size_t begin_offset = sizeof(buffer_storage_) - plaintext_.size() - 1;
DecodeRustPunycodeOptions options;
options.punycode_begin = punycode_.data();
options.punycode_end = punycode_.data() + punycode_.size();
options.out_begin = buffer_storage_ + begin_offset;
options.out_end = buffer_storage_ + sizeof(buffer_storage_);
return options;
}
DecodeRustPunycodeOptions WithOneByteTooFew() {
FillBufferWithNonzeroBytes();
const size_t begin_offset = sizeof(buffer_storage_) - plaintext_.size();
DecodeRustPunycodeOptions options;
options.punycode_begin = punycode_.data();
options.punycode_end = punycode_.data() + punycode_.size();
options.out_begin = buffer_storage_ + begin_offset;
options.out_end = buffer_storage_ + sizeof(buffer_storage_);
return options;
}
// Matches a correct return value of DecodeRustPunycode when `golden` is the
// expected plaintext output.
auto PointsToTheNulAfter(const std::string& golden) {
const size_t golden_size = golden.size();
return AllOf(
Pointee(Eq('\0')),
ResultOf("preceding string body",
[golden_size](const char* p) { return p - golden_size; },
StrEq(golden)));
}
std::string punycode_;
std::string plaintext_;
char buffer_storage_[1024];
};
TEST_F(DecodeRustPunycodeTest, MapsEmptyToEmpty) {
punycode_ = "";
plaintext_ = "";
ASSERT_THAT(DecodeRustPunycode(WithAmpleSpace()),
PointsToTheNulAfter(plaintext_));
ASSERT_THAT(DecodeRustPunycode(WithJustEnoughSpace()),
PointsToTheNulAfter(plaintext_));
EXPECT_THAT(DecodeRustPunycode(WithOneByteTooFew()), IsNull());
}
TEST_F(DecodeRustPunycodeTest,
StripsTheTrailingDelimiterFromAPureRunOfBasicChars) {
punycode_ = "foo_";
plaintext_ = "foo";
ASSERT_THAT(DecodeRustPunycode(WithAmpleSpace()),
PointsToTheNulAfter(plaintext_));
ASSERT_THAT(DecodeRustPunycode(WithJustEnoughSpace()),
PointsToTheNulAfter(plaintext_));
EXPECT_THAT(DecodeRustPunycode(WithOneByteTooFew()), IsNull());
}
TEST_F(DecodeRustPunycodeTest, TreatsTheLastUnderscoreAsTheDelimiter) {
punycode_ = "foo_bar_";
plaintext_ = "foo_bar";
ASSERT_THAT(DecodeRustPunycode(WithAmpleSpace()),
PointsToTheNulAfter(plaintext_));
ASSERT_THAT(DecodeRustPunycode(WithJustEnoughSpace()),
PointsToTheNulAfter(plaintext_));
EXPECT_THAT(DecodeRustPunycode(WithOneByteTooFew()), IsNull());
}
TEST_F(DecodeRustPunycodeTest, AcceptsALeadingUnderscoreIfNotTheDelimiter) {
punycode_ = "_foo_";
plaintext_ = "_foo";
ASSERT_THAT(DecodeRustPunycode(WithAmpleSpace()),
PointsToTheNulAfter(plaintext_));
ASSERT_THAT(DecodeRustPunycode(WithJustEnoughSpace()),
PointsToTheNulAfter(plaintext_));
EXPECT_THAT(DecodeRustPunycode(WithOneByteTooFew()), IsNull());
}
TEST_F(DecodeRustPunycodeTest, RejectsALeadingUnderscoreDelimiter) {
punycode_ = "_foo";
EXPECT_THAT(DecodeRustPunycode(WithAmpleSpace()), IsNull());
}
TEST_F(DecodeRustPunycodeTest, RejectsEmbeddedNul) {
punycode_ = std::string("foo\0bar_", 8);
EXPECT_THAT(DecodeRustPunycode(WithAmpleSpace()), IsNull());
}
TEST_F(DecodeRustPunycodeTest, RejectsAsciiCharsOtherThanIdentifierChars) {
punycode_ = "foo\007_";
EXPECT_THAT(DecodeRustPunycode(WithAmpleSpace()), IsNull());
punycode_ = "foo-_";
EXPECT_THAT(DecodeRustPunycode(WithAmpleSpace()), IsNull());
punycode_ = "foo;_";
EXPECT_THAT(DecodeRustPunycode(WithAmpleSpace()), IsNull());
punycode_ = "foo\177_";
EXPECT_THAT(DecodeRustPunycode(WithAmpleSpace()), IsNull());
}
TEST_F(DecodeRustPunycodeTest, RejectsRawNonAsciiChars) {
punycode_ = "\x80";
EXPECT_THAT(DecodeRustPunycode(WithAmpleSpace()), IsNull());
punycode_ = "\x80_";
EXPECT_THAT(DecodeRustPunycode(WithAmpleSpace()), IsNull());
punycode_ = "\xff";
EXPECT_THAT(DecodeRustPunycode(WithAmpleSpace()), IsNull());
punycode_ = "\xff_";
EXPECT_THAT(DecodeRustPunycode(WithAmpleSpace()), IsNull());
}
TEST_F(DecodeRustPunycodeTest, RecognizesU0080) {
// a encodes 0, so the output is the smallest non-ASCII code point standing
// alone. (U+0080 PAD is not an identifier character, but DecodeRustPunycode
// does not check whether non-ASCII characters could belong to an identifier.)
punycode_ = "a";
plaintext_ = "\xc2\x80";
ASSERT_THAT(DecodeRustPunycode(WithAmpleSpace()),
PointsToTheNulAfter(plaintext_));
ASSERT_THAT(DecodeRustPunycode(WithJustEnoughSpace()),
PointsToTheNulAfter(plaintext_));
EXPECT_THAT(DecodeRustPunycode(WithOneByteTooFew()), IsNull());
}
TEST_F(DecodeRustPunycodeTest, OneByteDeltaSequencesMustBeA) {
// Because bias = 72 for the first code point, any digit but a/A is nonfinal
// in one of the first two bytes of a delta sequence.
punycode_ = "b";
EXPECT_THAT(DecodeRustPunycode(WithAmpleSpace()), IsNull());
punycode_ = "z";
EXPECT_THAT(DecodeRustPunycode(WithAmpleSpace()), IsNull());
punycode_ = "0";
EXPECT_THAT(DecodeRustPunycode(WithAmpleSpace()), IsNull());
punycode_ = "9";
EXPECT_THAT(DecodeRustPunycode(WithAmpleSpace()), IsNull());
}
TEST_F(DecodeRustPunycodeTest, AcceptsDeltaSequenceBA) {
punycode_ = "ba";
plaintext_ = "\xc2\x81";
ASSERT_THAT(DecodeRustPunycode(WithAmpleSpace()),
PointsToTheNulAfter(plaintext_));
ASSERT_THAT(DecodeRustPunycode(WithJustEnoughSpace()),
PointsToTheNulAfter(plaintext_));
EXPECT_THAT(DecodeRustPunycode(WithOneByteTooFew()), IsNull());
}
TEST_F(DecodeRustPunycodeTest, AcceptsOtherDeltaSequencesWithSecondByteA) {
punycode_ = "ca";
plaintext_ = "\xc2\x82";
EXPECT_THAT(DecodeRustPunycode(WithJustEnoughSpace()),
PointsToTheNulAfter(plaintext_));
punycode_ = "za";
plaintext_ = "\xc2\x99";
EXPECT_THAT(DecodeRustPunycode(WithJustEnoughSpace()),
PointsToTheNulAfter(plaintext_));
punycode_ = "0a";
plaintext_ = "\xc2\x9a";
EXPECT_THAT(DecodeRustPunycode(WithJustEnoughSpace()),
PointsToTheNulAfter(plaintext_));
punycode_ = "1a";
plaintext_ = "\xc2\x9b";
EXPECT_THAT(DecodeRustPunycode(WithJustEnoughSpace()),
PointsToTheNulAfter(plaintext_));
punycode_ = "9a";
plaintext_ = "£"; // Pound sign, U+00A3
EXPECT_THAT(DecodeRustPunycode(WithJustEnoughSpace()),
PointsToTheNulAfter(plaintext_));
}
TEST_F(DecodeRustPunycodeTest, RejectsDeltaWhereTheSecondAndLastDigitIsNotA) {
punycode_ = "bb";
EXPECT_THAT(DecodeRustPunycode(WithAmpleSpace()), IsNull());
punycode_ = "zz";
EXPECT_THAT(DecodeRustPunycode(WithAmpleSpace()), IsNull());
punycode_ = "00";
EXPECT_THAT(DecodeRustPunycode(WithAmpleSpace()), IsNull());
punycode_ = "99";
EXPECT_THAT(DecodeRustPunycode(WithAmpleSpace()), IsNull());
}
TEST_F(DecodeRustPunycodeTest, AcceptsDeltasWithSecondByteBFollowedByA) {
punycode_ = "bba";
plaintext_ = "¤"; // U+00A4
EXPECT_THAT(DecodeRustPunycode(WithJustEnoughSpace()),
PointsToTheNulAfter(plaintext_));
punycode_ = "cba";
plaintext_ = "¥"; // U+00A5
EXPECT_THAT(DecodeRustPunycode(WithJustEnoughSpace()),
PointsToTheNulAfter(plaintext_));
punycode_ = "zba";
plaintext_ = "¼"; // U+00BC
EXPECT_THAT(DecodeRustPunycode(WithJustEnoughSpace()),
PointsToTheNulAfter(plaintext_));
punycode_ = "0ba";
plaintext_ = "½"; // U+00BD
EXPECT_THAT(DecodeRustPunycode(WithJustEnoughSpace()),
PointsToTheNulAfter(plaintext_));
punycode_ = "1ba";
plaintext_ = "¾"; // U+00BE
EXPECT_THAT(DecodeRustPunycode(WithJustEnoughSpace()),
PointsToTheNulAfter(plaintext_));
punycode_ = "9ba";
plaintext_ = "Æ"; // U+00C6
EXPECT_THAT(DecodeRustPunycode(WithJustEnoughSpace()),
PointsToTheNulAfter(plaintext_));
}
// Tests beyond this point use characters allowed in identifiers, so you can
// prepend _RNvC1cu<decimal length><underscore if [0-9_] follows> to a test
// input and run it through another Rust demangler to verify that the
// corresponding golden output is correct.
TEST_F(DecodeRustPunycodeTest, AcceptsTwoByteCharAlone) {
punycode_ = "0ca";
plaintext_ = "à";
ASSERT_THAT(DecodeRustPunycode(WithAmpleSpace()),
PointsToTheNulAfter(plaintext_));
ASSERT_THAT(DecodeRustPunycode(WithJustEnoughSpace()),
PointsToTheNulAfter(plaintext_));
EXPECT_THAT(DecodeRustPunycode(WithOneByteTooFew()), IsNull());
}
TEST_F(DecodeRustPunycodeTest, AcceptsTwoByteCharBeforeBasicChars) {
punycode_ = "_la_mode_yya";
plaintext_ = "à_la_mode";
ASSERT_THAT(DecodeRustPunycode(WithAmpleSpace()),
PointsToTheNulAfter(plaintext_));
ASSERT_THAT(DecodeRustPunycode(WithJustEnoughSpace()),
PointsToTheNulAfter(plaintext_));
EXPECT_THAT(DecodeRustPunycode(WithOneByteTooFew()), IsNull());
}
TEST_F(DecodeRustPunycodeTest, AcceptsTwoByteCharAmidBasicChars) {
punycode_ = "verre__vin_m4a";
plaintext_ = "verre_à_vin";
ASSERT_THAT(DecodeRustPunycode(WithAmpleSpace()),
PointsToTheNulAfter(plaintext_));
ASSERT_THAT(DecodeRustPunycode(WithJustEnoughSpace()),
PointsToTheNulAfter(plaintext_));
EXPECT_THAT(DecodeRustPunycode(WithOneByteTooFew()), IsNull());
}
TEST_F(DecodeRustPunycodeTest, AcceptsTwoByteCharAfterBasicChars) {
punycode_ = "belt_3na";
plaintext_ = "beltà";
ASSERT_THAT(DecodeRustPunycode(WithAmpleSpace()),
PointsToTheNulAfter(plaintext_));
ASSERT_THAT(DecodeRustPunycode(WithJustEnoughSpace()),
PointsToTheNulAfter(plaintext_));
EXPECT_THAT(DecodeRustPunycode(WithOneByteTooFew()), IsNull());
}
TEST_F(DecodeRustPunycodeTest, AcceptsRepeatedTwoByteChar) {
punycode_ = "0caaaa";
plaintext_ = "àààà";
ASSERT_THAT(DecodeRustPunycode(WithAmpleSpace()),
PointsToTheNulAfter(plaintext_));
ASSERT_THAT(DecodeRustPunycode(WithJustEnoughSpace()),
PointsToTheNulAfter(plaintext_));
EXPECT_THAT(DecodeRustPunycode(WithOneByteTooFew()), IsNull());
}
TEST_F(DecodeRustPunycodeTest, AcceptsNearbyTwoByteCharsInOrder) {
punycode_ = "3camsuz";
plaintext_ = "ãéïôù";
ASSERT_THAT(DecodeRustPunycode(WithAmpleSpace()),
PointsToTheNulAfter(plaintext_));
ASSERT_THAT(DecodeRustPunycode(WithJustEnoughSpace()),
PointsToTheNulAfter(plaintext_));
EXPECT_THAT(DecodeRustPunycode(WithOneByteTooFew()), IsNull());
}
TEST_F(DecodeRustPunycodeTest, AcceptsNearbyTwoByteCharsOutOfOrder) {
punycode_ = "3caltsx";
plaintext_ = "ùéôãï";
ASSERT_THAT(DecodeRustPunycode(WithAmpleSpace()),
PointsToTheNulAfter(plaintext_));
ASSERT_THAT(DecodeRustPunycode(WithJustEnoughSpace()),
PointsToTheNulAfter(plaintext_));
EXPECT_THAT(DecodeRustPunycode(WithOneByteTooFew()), IsNull());
}
TEST_F(DecodeRustPunycodeTest, AcceptsThreeByteCharAlone) {
punycode_ = "fiq";
plaintext_ = "中";
ASSERT_THAT(DecodeRustPunycode(WithAmpleSpace()),
PointsToTheNulAfter(plaintext_));
ASSERT_THAT(DecodeRustPunycode(WithJustEnoughSpace()),
PointsToTheNulAfter(plaintext_));
EXPECT_THAT(DecodeRustPunycode(WithOneByteTooFew()), IsNull());
}
TEST_F(DecodeRustPunycodeTest, AcceptsRepeatedThreeByteChar) {
punycode_ = "fiqaaaa";
plaintext_ = "中中中中中";
ASSERT_THAT(DecodeRustPunycode(WithAmpleSpace()),
PointsToTheNulAfter(plaintext_));
ASSERT_THAT(DecodeRustPunycode(WithJustEnoughSpace()),
PointsToTheNulAfter(plaintext_));
EXPECT_THAT(DecodeRustPunycode(WithOneByteTooFew()), IsNull());
}
TEST_F(DecodeRustPunycodeTest, AcceptsThreeByteCharsInOrder) {
punycode_ = "fiq228c";
plaintext_ = "中文";
ASSERT_THAT(DecodeRustPunycode(WithAmpleSpace()),
PointsToTheNulAfter(plaintext_));
ASSERT_THAT(DecodeRustPunycode(WithJustEnoughSpace()),
PointsToTheNulAfter(plaintext_));
EXPECT_THAT(DecodeRustPunycode(WithOneByteTooFew()), IsNull());
}
TEST_F(DecodeRustPunycodeTest, AcceptsNearbyThreeByteCharsOutOfOrder) {
punycode_ = "fiq128c";
plaintext_ = "文中";
ASSERT_THAT(DecodeRustPunycode(WithAmpleSpace()),
PointsToTheNulAfter(plaintext_));
ASSERT_THAT(DecodeRustPunycode(WithJustEnoughSpace()),
PointsToTheNulAfter(plaintext_));
EXPECT_THAT(DecodeRustPunycode(WithOneByteTooFew()), IsNull());
}
TEST_F(DecodeRustPunycodeTest, AcceptsFourByteCharAlone) {
punycode_ = "uy7h";
plaintext_ = "🂻";
ASSERT_THAT(DecodeRustPunycode(WithAmpleSpace()),
PointsToTheNulAfter(plaintext_));
ASSERT_THAT(DecodeRustPunycode(WithJustEnoughSpace()),
PointsToTheNulAfter(plaintext_));
EXPECT_THAT(DecodeRustPunycode(WithOneByteTooFew()), IsNull());
}
TEST_F(DecodeRustPunycodeTest, AcceptsFourByteCharBeforeBasicChars) {
punycode_ = "jack__uh63d";
plaintext_ = "jack_🂻";
ASSERT_THAT(DecodeRustPunycode(WithAmpleSpace()),
PointsToTheNulAfter(plaintext_));
ASSERT_THAT(DecodeRustPunycode(WithJustEnoughSpace()),
PointsToTheNulAfter(plaintext_));
EXPECT_THAT(DecodeRustPunycode(WithOneByteTooFew()), IsNull());
}
TEST_F(DecodeRustPunycodeTest, AcceptsFourByteCharAmidBasicChars) {
punycode_ = "jack__of_hearts_ki37n";
plaintext_ = "jack_🂻_of_hearts";
ASSERT_THAT(DecodeRustPunycode(WithAmpleSpace()),
PointsToTheNulAfter(plaintext_));
ASSERT_THAT(DecodeRustPunycode(WithJustEnoughSpace()),
PointsToTheNulAfter(plaintext_));
EXPECT_THAT(DecodeRustPunycode(WithOneByteTooFew()), IsNull());
}
TEST_F(DecodeRustPunycodeTest, AcceptsFourByteCharAfterBasicChars) {
punycode_ = "_of_hearts_kz45i";
plaintext_ = "🂻_of_hearts";
ASSERT_THAT(DecodeRustPunycode(WithAmpleSpace()),
PointsToTheNulAfter(plaintext_));
ASSERT_THAT(DecodeRustPunycode(WithJustEnoughSpace()),
PointsToTheNulAfter(plaintext_));
EXPECT_THAT(DecodeRustPunycode(WithOneByteTooFew()), IsNull());
}
TEST_F(DecodeRustPunycodeTest, AcceptsRepeatedFourByteChar) {
punycode_ = "uy7haaaa";
plaintext_ = "🂻🂻🂻🂻🂻";
ASSERT_THAT(DecodeRustPunycode(WithAmpleSpace()),
PointsToTheNulAfter(plaintext_));
ASSERT_THAT(DecodeRustPunycode(WithJustEnoughSpace()),
PointsToTheNulAfter(plaintext_));
EXPECT_THAT(DecodeRustPunycode(WithOneByteTooFew()), IsNull());
}
TEST_F(DecodeRustPunycodeTest, AcceptsNearbyFourByteCharsInOrder) {
punycode_ = "8x7hcjmf";
plaintext_ = "🂦🂧🂪🂭🂮";
ASSERT_THAT(DecodeRustPunycode(WithAmpleSpace()),
PointsToTheNulAfter(plaintext_));
ASSERT_THAT(DecodeRustPunycode(WithJustEnoughSpace()),
PointsToTheNulAfter(plaintext_));
EXPECT_THAT(DecodeRustPunycode(WithOneByteTooFew()), IsNull());
}
TEST_F(DecodeRustPunycodeTest, AcceptsNearbyFourByteCharsOutOfOrder) {
punycode_ = "8x7hcild";
plaintext_ = "🂮🂦🂭🂪🂧";
ASSERT_THAT(DecodeRustPunycode(WithAmpleSpace()),
PointsToTheNulAfter(plaintext_));
ASSERT_THAT(DecodeRustPunycode(WithJustEnoughSpace()),
PointsToTheNulAfter(plaintext_));
EXPECT_THAT(DecodeRustPunycode(WithOneByteTooFew()), IsNull());
}
TEST_F(DecodeRustPunycodeTest, AcceptsAMixtureOfByteLengths) {
punycode_ = "3caltsx2079ivf8aiuy7cja3a6ak";
plaintext_ = "ùéôãï中文🂮🂦🂭🂪🂧";
ASSERT_THAT(DecodeRustPunycode(WithAmpleSpace()),
PointsToTheNulAfter(plaintext_));
ASSERT_THAT(DecodeRustPunycode(WithJustEnoughSpace()),
PointsToTheNulAfter(plaintext_));
EXPECT_THAT(DecodeRustPunycode(WithOneByteTooFew()), IsNull());
}
TEST_F(DecodeRustPunycodeTest, RejectsOverlargeDeltas) {
punycode_ = "123456789a";
EXPECT_THAT(DecodeRustPunycode(WithAmpleSpace()), IsNull());
}
// Finally, we test on a few prose and poetry snippets as a defense in depth.
// If our artificial short test inputs did not exercise a bug that is tickled by
// patterns typical of real human writing, maybe real human writing will catch
// that.
//
// These test inputs are extracted from texts old enough to be out of copyright
// that probe a variety of ranges of code-point space. All are longer than 32
// code points, so they exercise the carrying of seminibbles from one uint64_t
// to the next higher one in BoundedUtf8LengthSequence.
// The first three lines of the Old English epic _Beowulf_, mostly ASCII with a
// few archaic two-byte letters interspersed.
TEST_F(DecodeRustPunycodeTest, Beowulf) {
punycode_ = "hwt_we_gardena_in_geardagum_"
"eodcyninga_rym_gefrunon_"
"hu_a_elingas_ellen_fremedon_hxg9c70do9alau";
plaintext_ = "hwæt_we_gardena_in_geardagum_"
"þeodcyninga_þrym_gefrunon_"
"hu_ða_æþelingas_ellen_fremedon";
ASSERT_THAT(DecodeRustPunycode(WithAmpleSpace()),
PointsToTheNulAfter(plaintext_));
ASSERT_THAT(DecodeRustPunycode(WithJustEnoughSpace()),
PointsToTheNulAfter(plaintext_));
EXPECT_THAT(DecodeRustPunycode(WithOneByteTooFew()), IsNull());
}
// The whole of 過故人莊 by the 8th-century Chinese poet 孟浩然
// (Meng Haoran), exercising three-byte-character processing.
TEST_F(DecodeRustPunycodeTest, MengHaoran) {
punycode_ = "gmq4ss0cfvao1e2wg8mcw8b0wkl9a7tt90a8riuvbk7t8kbv9a66ogofvzlf6"
"3d01ybn1u28dyqi5q2cxyyxnk5d2gx1ks9ddvfm17bk6gbsd6wftrav60u4ta";
plaintext_ = "故人具雞黍" "邀我至田家"
"綠樹村邊合" "青山郭外斜"
"開軒面場圃" "把酒話桑麻"
"待到重陽日" "還來就菊花";
ASSERT_THAT(DecodeRustPunycode(WithAmpleSpace()),
PointsToTheNulAfter(plaintext_));
ASSERT_THAT(DecodeRustPunycode(WithJustEnoughSpace()),
PointsToTheNulAfter(plaintext_));
EXPECT_THAT(DecodeRustPunycode(WithOneByteTooFew()), IsNull());
}
// A poem of the 8th-century Japanese poet 山上憶良 (Yamanoue no Okura).
// Japanese mixes two-byte and three-byte characters: a good workout for codecs.
TEST_F(DecodeRustPunycodeTest, YamanoueNoOkura) {
punycode_ = "48jdaa3a6ccpepjrsmlb0q4bwcdtid8fg6c0cai9822utqeruk3om0u4f2wbp0"
"em23do0op23cc2ff70mb6tae8aq759gja";
plaintext_ = "瓜食めば"
"子ども思ほゆ"
"栗食めば"
"まして偲はゆ"
"何処より"
"来りしものそ"
"眼交に"
"もとな懸りて"
"安眠し寝さぬ";
ASSERT_THAT(DecodeRustPunycode(WithAmpleSpace()),
PointsToTheNulAfter(plaintext_));
ASSERT_THAT(DecodeRustPunycode(WithJustEnoughSpace()),
PointsToTheNulAfter(plaintext_));
EXPECT_THAT(DecodeRustPunycode(WithOneByteTooFew()), IsNull());
}
// The first two lines of the Phoenician-language inscription on the sarcophagus
// of Eshmunazar II of Sidon, 6th century BCE. Phoenician and many other
// archaic scripts are allocated in the Supplemental Multilingual Plane (U+10000
// through U+1FFFF) and thus exercise four-byte-character processing.
TEST_F(DecodeRustPunycodeTest, EshmunazarSarcophagus) {
punycode_ = "wj9caaabaabbaaohcacxvhdc7bgxbccbdcjeacddcedcdlddbdbddcdbdcknfcee"
"ifel8del2a7inq9fhcpxikms7a4a9ac9ataaa0g";
plaintext_ = "𐤁𐤉𐤓𐤇𐤁𐤋𐤁𐤔𐤍𐤕𐤏𐤎𐤓"
"𐤅𐤀𐤓𐤁𐤏𐤗𐤖𐤖𐤖𐤖𐤋𐤌𐤋𐤊𐤉𐤌𐤋𐤊"
"𐤀𐤔𐤌𐤍𐤏𐤆𐤓𐤌𐤋𐤊𐤑𐤃𐤍𐤌"
"𐤁𐤍𐤌𐤋𐤊𐤕𐤁𐤍𐤕𐤌𐤋𐤊𐤑𐤃𐤍𐤌"
"𐤃𐤁𐤓𐤌𐤋𐤊𐤀𐤔𐤌𐤍𐤏𐤆𐤓𐤌𐤋𐤊"
"𐤑𐤃𐤍𐤌𐤋𐤀𐤌𐤓𐤍𐤂𐤆𐤋𐤕";
ASSERT_THAT(DecodeRustPunycode(WithAmpleSpace()),
PointsToTheNulAfter(plaintext_));
ASSERT_THAT(DecodeRustPunycode(WithJustEnoughSpace()),
PointsToTheNulAfter(plaintext_));
EXPECT_THAT(DecodeRustPunycode(WithOneByteTooFew()), IsNull());
}
} // namespace
} // namespace debugging_internal
ABSL_NAMESPACE_END
} // namespace absl
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment