Commit d30298a1 by Chris Mihelich Committed by Copybara-Service

UTF-8 encoding library to support Rust Punycode decoding.

PiperOrigin-RevId: 641983507
Change-Id: Iad7933884aef6bfd90d159c049a1d698d19456c6
parent 96cdf6cc
...@@ -134,6 +134,8 @@ set(ABSL_INTERNAL_DLL_FILES ...@@ -134,6 +134,8 @@ set(ABSL_INTERNAL_DLL_FILES
"debugging/internal/stack_consumption.h" "debugging/internal/stack_consumption.h"
"debugging/internal/stacktrace_config.h" "debugging/internal/stacktrace_config.h"
"debugging/internal/symbolize.h" "debugging/internal/symbolize.h"
"debugging/internal/utf8_for_code_point.cc"
"debugging/internal/utf8_for_code_point.h"
"debugging/internal/vdso_support.cc" "debugging/internal/vdso_support.cc"
"debugging/internal/vdso_support.h" "debugging/internal/vdso_support.h"
"functional/any_invocable.h" "functional/any_invocable.h"
......
...@@ -222,10 +222,12 @@ cc_library( ...@@ -222,10 +222,12 @@ cc_library(
srcs = [ srcs = [
"internal/demangle.cc", "internal/demangle.cc",
"internal/demangle_rust.cc", "internal/demangle_rust.cc",
"internal/utf8_for_code_point.cc",
], ],
hdrs = [ hdrs = [
"internal/demangle.h", "internal/demangle.h",
"internal/demangle_rust.h", "internal/demangle_rust.h",
"internal/utf8_for_code_point.h",
], ],
copts = ABSL_DEFAULT_COPTS, copts = ABSL_DEFAULT_COPTS,
linkopts = ABSL_DEFAULT_LINKOPTS, linkopts = ABSL_DEFAULT_LINKOPTS,
...@@ -271,6 +273,19 @@ cc_test( ...@@ -271,6 +273,19 @@ cc_test(
], ],
) )
# Unit test target for the UTF-8 encoding helper; its only source file is
# internal/utf8_for_code_point_test.cc.
# NOTE(review): ":demangle_internal" is presumably the library that compiles
# internal/utf8_for_code_point.{h,cc} -- confirm against the cc_library above.
cc_test(
name = "utf8_for_code_point_test",
srcs = ["internal/utf8_for_code_point_test.cc"],
copts = ABSL_TEST_COPTS,
linkopts = ABSL_DEFAULT_LINKOPTS,
deps = [
":demangle_internal",
"//absl/base:config",
"@com_google_googletest//:gtest",
"@com_google_googletest//:gtest_main",
],
)
cc_library( cc_library(
name = "leak_check", name = "leak_check",
srcs = ["leak_check.cc"], srcs = ["leak_check.cc"],
......
...@@ -197,9 +197,11 @@ absl_cc_library( ...@@ -197,9 +197,11 @@ absl_cc_library(
HDRS HDRS
"internal/demangle.h" "internal/demangle.h"
"internal/demangle_rust.h" "internal/demangle_rust.h"
"internal/utf8_for_code_point.h"
SRCS SRCS
"internal/demangle.cc" "internal/demangle.cc"
"internal/demangle_rust.cc" "internal/demangle_rust.cc"
"internal/utf8_for_code_point.cc"
COPTS COPTS
${ABSL_DEFAULT_COPTS} ${ABSL_DEFAULT_COPTS}
DEPS DEPS
...@@ -238,6 +240,19 @@ absl_cc_test( ...@@ -238,6 +240,19 @@ absl_cc_test(
GTest::gmock_main GTest::gmock_main
) )
# Unit test target for the UTF-8 encoding helper; its only source file is
# internal/utf8_for_code_point_test.cc.
# NOTE(review): absl::demangle_internal is presumably the target that builds
# internal/utf8_for_code_point.{h,cc} -- confirm against the absl_cc_library
# above.
absl_cc_test(
NAME
utf8_for_code_point_test
SRCS
"internal/utf8_for_code_point_test.cc"
COPTS
${ABSL_TEST_COPTS}
DEPS
absl::demangle_internal
absl::config
GTest::gmock_main
)
absl_cc_library( absl_cc_library(
NAME NAME
leak_check leak_check
......
// Copyright 2024 The Abseil Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "absl/debugging/internal/utf8_for_code_point.h"
#include <cstdint>
#include "absl/base/config.h"
namespace absl {
ABSL_NAMESPACE_BEGIN
namespace debugging_internal {
namespace {

// Code point boundaries consulted when choosing a UTF-8 encoding length.

// Inclusive range of surrogate code points, which have no UTF-8 encoding.
constexpr uint32_t kMinSurrogate = 0xd800;
constexpr uint32_t kMaxSurrogate = 0xdfff;

// Largest code point representable in one, two, and three bytes respectively.
constexpr uint32_t kMax1ByteCodePoint = 0x7f;
constexpr uint32_t kMax2ByteCodePoint = 0x7ff;
constexpr uint32_t kMax3ByteCodePoint = 0xffff;

// Largest valid code point; anything above it is rejected.
constexpr uint32_t kMaxCodePoint = 0x10ffff;

}  // namespace
// Encodes code_point as UTF-8 into bytes[0 .. length - 1].
//
// Surrogates (kMinSurrogate..kMaxSurrogate) and values above kMaxCodePoint
// are rejected: nothing is written, so length keeps its in-class default of 0
// and ok() returns false.
Utf8ForCodePoint::Utf8ForCodePoint(uint64_t code_point) {
  const bool is_surrogate =
      code_point >= kMinSurrogate && code_point <= kMaxSurrogate;
  if (is_surrogate || code_point > kMaxCodePoint) {
    return;  // Not encodable; leave length == 0.
  }

  // Builds a 10xxxxxx continuation byte from the six payload bits that start
  // `shift` bits above the bottom of code_point.
  const auto continuation = [code_point](int shift) {
    return static_cast<char>(0x80 | ((code_point >> shift) & 0x3f));
  };

  if (code_point <= kMax1ByteCodePoint) {
    // 0xxxxxxx
    bytes[0] = static_cast<char>(code_point);
    length = 1;
  } else if (code_point <= kMax2ByteCodePoint) {
    // 110xxxxx 10xxxxxx
    bytes[0] = static_cast<char>(0xc0 | (code_point >> 6));
    bytes[1] = continuation(0);
    length = 2;
  } else if (code_point <= kMax3ByteCodePoint) {
    // 1110xxxx 10xxxxxx 10xxxxxx
    bytes[0] = static_cast<char>(0xe0 | (code_point >> 12));
    bytes[1] = continuation(6);
    bytes[2] = continuation(0);
    length = 3;
  } else {
    // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    bytes[0] = static_cast<char>(0xf0 | (code_point >> 18));
    bytes[1] = continuation(12);
    bytes[2] = continuation(6);
    bytes[3] = continuation(0);
    length = 4;
  }
}
} // namespace debugging_internal
ABSL_NAMESPACE_END
} // namespace absl
// Copyright 2024 The Abseil Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef ABSL_DEBUGGING_INTERNAL_UTF8_FOR_CODE_POINT_H_
#define ABSL_DEBUGGING_INTERNAL_UTF8_FOR_CODE_POINT_H_
#include <cstdint>
#include "absl/base/config.h"
namespace absl {
ABSL_NAMESPACE_BEGIN
namespace debugging_internal {
// Value type holding the UTF-8 encoding of a single Unicode code point.
struct Utf8ForCodePoint {
  // Encodes code_point as UTF-8 and stores the result in bytes/length.
  //
  // This is async-signal-safe so that it can be used while symbolizing stack
  // traces from inside a signal handler.
  explicit Utf8ForCodePoint(uint64_t code_point);

  // True when the code_point given to the constructor was valid, i.e. when an
  // encoding was produced.
  bool ok() const { return length > 0; }

  // For a valid code_point, length is between 1 and 4 inclusive and the
  // encoding occupies bytes[0 .. (length - 1)].  For an invalid code_point,
  // length is 0.  Either way, callers must not rely on the contents of
  // bytes[length .. 3]; they are unspecified.
  char bytes[4] = {};
  uint32_t length = 0;
};
} // namespace debugging_internal
ABSL_NAMESPACE_END
} // namespace absl
#endif // ABSL_DEBUGGING_INTERNAL_UTF8_FOR_CODE_POINT_H_
// Copyright 2024 The Abseil Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "absl/debugging/internal/utf8_for_code_point.h"
#include <cstdint>
#include "gtest/gtest.h"
#include "absl/base/config.h"
namespace absl {
ABSL_NAMESPACE_BEGIN
namespace debugging_internal {
namespace {
// U+0000 must encode as a single NUL byte.
TEST(Utf8ForCodePointTest, RecognizesTheSmallestCodePoint) {
  constexpr std::uint64_t kCodePoint = 0;
  const Utf8ForCodePoint encoded(kCodePoint);
  ASSERT_EQ(encoded.length, 1);
  EXPECT_EQ(encoded.bytes[0], '\0');
}
// An ordinary ASCII letter encodes as itself in one byte.
TEST(Utf8ForCodePointTest, RecognizesAsciiSmallA) {
  constexpr std::uint64_t kCodePoint = 'a';
  const Utf8ForCodePoint encoded(kCodePoint);
  ASSERT_EQ(encoded.length, 1);
  EXPECT_EQ(encoded.bytes[0], 'a');
}
// U+007F is the upper boundary of the one-byte encoding.
TEST(Utf8ForCodePointTest, RecognizesTheLargestOneByteCodePoint) {
  constexpr std::uint64_t kCodePoint = 0x7f;
  const Utf8ForCodePoint encoded(kCodePoint);
  ASSERT_EQ(encoded.length, 1);
  EXPECT_EQ(encoded.bytes[0], '\x7f');
}
// U+0080 is the lower boundary of the two-byte encoding: C2 80.
TEST(Utf8ForCodePointTest, RecognizesTheSmallestTwoByteCodePoint) {
  constexpr std::uint64_t kCodePoint = 0x80;
  const Utf8ForCodePoint encoded(kCodePoint);
  ASSERT_EQ(encoded.length, 2);
  EXPECT_EQ(encoded.bytes[0], static_cast<char>(0xc2));
  EXPECT_EQ(encoded.bytes[1], static_cast<char>(0x80));
}
// U+00F1 must match the bytes of the same character written as a string
// literal in this source file.
TEST(Utf8ForCodePointTest, RecognizesSmallNWithTilde) {
  constexpr std::uint64_t kCodePoint = 0xf1;
  const Utf8ForCodePoint encoded(kCodePoint);
  ASSERT_EQ(encoded.length, 2);
  const char* expected = "ñ";
  EXPECT_EQ(encoded.bytes[0], expected[0]);
  EXPECT_EQ(encoded.bytes[1], expected[1]);
}
// U+03A0 must match the bytes of the same character written as a string
// literal in this source file.
TEST(Utf8ForCodePointTest, RecognizesCapitalPi) {
  constexpr std::uint64_t kCodePoint = 0x3a0;
  const Utf8ForCodePoint encoded(kCodePoint);
  ASSERT_EQ(encoded.length, 2);
  const char* expected = "Π";
  EXPECT_EQ(encoded.bytes[0], expected[0]);
  EXPECT_EQ(encoded.bytes[1], expected[1]);
}
// U+07FF is the upper boundary of the two-byte encoding: DF BF.
TEST(Utf8ForCodePointTest, RecognizesTheLargestTwoByteCodePoint) {
  constexpr std::uint64_t kCodePoint = 0x7ff;
  const Utf8ForCodePoint encoded(kCodePoint);
  ASSERT_EQ(encoded.length, 2);
  EXPECT_EQ(encoded.bytes[0], static_cast<char>(0xdf));
  EXPECT_EQ(encoded.bytes[1], static_cast<char>(0xbf));
}
// U+0800 is the lower boundary of the three-byte encoding: E0 A0 80.
TEST(Utf8ForCodePointTest, RecognizesTheSmallestThreeByteCodePoint) {
  constexpr std::uint64_t kCodePoint = 0x800;
  const Utf8ForCodePoint encoded(kCodePoint);
  ASSERT_EQ(encoded.length, 3);
  EXPECT_EQ(encoded.bytes[0], static_cast<char>(0xe0));
  EXPECT_EQ(encoded.bytes[1], static_cast<char>(0xa0));
  EXPECT_EQ(encoded.bytes[2], static_cast<char>(0x80));
}
// U+4E2D must match the bytes of the same character written as a string
// literal in this source file.
TEST(Utf8ForCodePointTest, RecognizesTheChineseCharacterZhong1AsInZhong1Wen2) {
  constexpr std::uint64_t kCodePoint = 0x4e2d;
  const Utf8ForCodePoint encoded(kCodePoint);
  ASSERT_EQ(encoded.length, 3);
  const char* expected = "中";
  EXPECT_EQ(encoded.bytes[0], expected[0]);
  EXPECT_EQ(encoded.bytes[1], expected[1]);
  EXPECT_EQ(encoded.bytes[2], expected[2]);
}
// U+D7FF sits immediately below the surrogate range and is still valid:
// ED 9F BF.
TEST(Utf8ForCodePointTest, RecognizesOneBeforeTheSmallestSurrogate) {
  constexpr std::uint64_t kCodePoint = 0xd7ff;
  const Utf8ForCodePoint encoded(kCodePoint);
  ASSERT_EQ(encoded.length, 3);
  EXPECT_EQ(encoded.bytes[0], static_cast<char>(0xed));
  EXPECT_EQ(encoded.bytes[1], static_cast<char>(0x9f));
  EXPECT_EQ(encoded.bytes[2], static_cast<char>(0xbf));
}
// U+D800, the first surrogate, must be reported as invalid (length 0).
TEST(Utf8ForCodePointTest, RejectsTheSmallestSurrogate) {
  constexpr std::uint64_t kCodePoint = 0xd800;
  const Utf8ForCodePoint encoded(kCodePoint);
  EXPECT_EQ(encoded.length, 0);
}
// U+DFFF, the last surrogate, must be reported as invalid (length 0).
TEST(Utf8ForCodePointTest, RejectsTheLargestSurrogate) {
  constexpr std::uint64_t kCodePoint = 0xdfff;
  const Utf8ForCodePoint encoded(kCodePoint);
  EXPECT_EQ(encoded.length, 0);
}
// U+E000 sits immediately above the surrogate range and is valid again:
// EE 80 80.
TEST(Utf8ForCodePointTest, RecognizesOnePastTheLargestSurrogate) {
  constexpr std::uint64_t kCodePoint = 0xe000;
  const Utf8ForCodePoint encoded(kCodePoint);
  ASSERT_EQ(encoded.length, 3);
  EXPECT_EQ(encoded.bytes[0], static_cast<char>(0xee));
  EXPECT_EQ(encoded.bytes[1], static_cast<char>(0x80));
  EXPECT_EQ(encoded.bytes[2], static_cast<char>(0x80));
}
// U+FFFF is the upper boundary of the three-byte encoding: EF BF BF.
TEST(Utf8ForCodePointTest, RecognizesTheLargestThreeByteCodePoint) {
  constexpr std::uint64_t kCodePoint = 0xffff;
  const Utf8ForCodePoint encoded(kCodePoint);
  ASSERT_EQ(encoded.length, 3);
  EXPECT_EQ(encoded.bytes[0], static_cast<char>(0xef));
  EXPECT_EQ(encoded.bytes[1], static_cast<char>(0xbf));
  EXPECT_EQ(encoded.bytes[2], static_cast<char>(0xbf));
}
// U+10000 is the lower boundary of the four-byte encoding: F0 90 80 80.
TEST(Utf8ForCodePointTest, RecognizesTheSmallestFourByteCodePoint) {
  constexpr std::uint64_t kCodePoint = 0x10000;
  const Utf8ForCodePoint encoded(kCodePoint);
  ASSERT_EQ(encoded.length, 4);
  EXPECT_EQ(encoded.bytes[0], static_cast<char>(0xf0));
  EXPECT_EQ(encoded.bytes[1], static_cast<char>(0x90));
  EXPECT_EQ(encoded.bytes[2], static_cast<char>(0x80));
  EXPECT_EQ(encoded.bytes[3], static_cast<char>(0x80));
}
// U+1F0BB must match the bytes of the same character written as a string
// literal in this source file.
TEST(Utf8ForCodePointTest, RecognizesTheJackOfHearts) {
  constexpr std::uint64_t kCodePoint = 0x1f0bb;
  const Utf8ForCodePoint encoded(kCodePoint);
  ASSERT_EQ(encoded.length, 4);
  const char* expected = "🂻";
  EXPECT_EQ(encoded.bytes[0], expected[0]);
  EXPECT_EQ(encoded.bytes[1], expected[1]);
  EXPECT_EQ(encoded.bytes[2], expected[2]);
  EXPECT_EQ(encoded.bytes[3], expected[3]);
}
// U+10FFFF is the largest value accepted as a code point: F4 8F BF BF.
TEST(Utf8ForCodePointTest, RecognizesTheLargestFourByteCodePoint) {
  constexpr std::uint64_t kCodePoint = 0x10ffff;
  const Utf8ForCodePoint encoded(kCodePoint);
  ASSERT_EQ(encoded.length, 4);
  EXPECT_EQ(encoded.bytes[0], static_cast<char>(0xf4));
  EXPECT_EQ(encoded.bytes[1], static_cast<char>(0x8f));
  EXPECT_EQ(encoded.bytes[2], static_cast<char>(0xbf));
  EXPECT_EQ(encoded.bytes[3], static_cast<char>(0xbf));
}
// 0x110000 is one past the largest code point and must be reported as invalid
// (length 0).
TEST(Utf8ForCodePointTest, RejectsTheSmallestOverlargeCodePoint) {
  constexpr std::uint64_t kCodePoint = 0x110000;
  const Utf8ForCodePoint encoded(kCodePoint);
  EXPECT_EQ(encoded.length, 0);
}
// A value far beyond the code point range must be reported as invalid
// (length 0).  Its low 32 bits are all zero, so only the high half
// distinguishes it from U+0000.
TEST(Utf8ForCodePointTest, RejectsAThroughlyOverlargeCodePoint) {
  constexpr std::uint64_t kCodePoint = 0xffffffff00000000;
  const Utf8ForCodePoint encoded(kCodePoint);
  EXPECT_EQ(encoded.length, 0);
}
// ok() must be true after encoding a valid code point (U+0000 here).
TEST(Utf8ForCodePointTest, OkReturnsTrueForAValidCodePoint) {
  constexpr std::uint64_t kCodePoint = 0;
  const Utf8ForCodePoint encoded(kCodePoint);
  EXPECT_TRUE(encoded.ok());
}
// ok() must be false after being given a value outside the code point range.
TEST(Utf8ForCodePointTest, OkReturnsFalseForAnInvalidCodePoint) {
  constexpr std::uint64_t kCodePoint = 0xffffffff00000000;
  const Utf8ForCodePoint encoded(kCodePoint);
  EXPECT_FALSE(encoded.ok());
}
} // namespace
} // namespace debugging_internal
ABSL_NAMESPACE_END
} // namespace absl
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment