diff --git a/ecmascript/base/number_helper.cpp b/ecmascript/base/number_helper.cpp index a6c3410babc5fbb3f8ae8a33f4cd0e984ee2453d..034e541060e23ed4086a1e506b30839c3d08e409 100644 --- a/ecmascript/base/number_helper.cpp +++ b/ecmascript/base/number_helper.cpp @@ -76,7 +76,7 @@ bool NumberHelper::GotoNonspace(uint8_t **ptr, const uint8_t *end) ++size; utf8Bit >>= 1UL; } - if (base::utf_helper::ConvertRegionUtf8ToUtf16(*ptr, &c, 1, 0) <= 0) { + if (base::utf_helper::ConvertRegionUtf8ToUtf16(*ptr, SIZE_MAX, &c, 1, 0) <= 0) { return true; } } diff --git a/ecmascript/base/utf_helper.cpp b/ecmascript/base/utf_helper.cpp index 5cb168e06f8e10c0c3fccec8f7e428c0999166a5..1859863fa06d6cdabe5c60b3a9c4b45c75a9aa1b 100644 --- a/ecmascript/base/utf_helper.cpp +++ b/ecmascript/base/utf_helper.cpp @@ -14,6 +14,7 @@ */ #include "ecmascript/base/utf_helper.h" +#include // NOLINTNEXTLINE(cppcoreguidelines-macro-usage) static constexpr int32_t U16_SURROGATE_OFFSET = (0xd800 << 10UL) + 0xdc00 - 0x10000; @@ -216,26 +217,34 @@ std::pair ConvertUtf8ToUtf16Pair(const uint8_t *data, bool com return {pair, UtfLength::FOUR}; } -size_t Utf8ToUtf16Size(const uint8_t *utf8) +size_t Utf8ToUtf16Size(const uint8_t *utf8, size_t utf8Len) { size_t res = 0; - while (*utf8 != '\0') { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) - auto [pair, nbytes] = ConvertUtf8ToUtf16Pair(utf8); + const uint8_t *putf8 = utf8; + if (utf8Len == SIZE_MAX) { + utf8Len = strlen((const char *)utf8); + } + while (putf8 - utf8 < (int)utf8Len) { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) + auto [pair, nbytes] = ConvertUtf8ToUtf16Pair(putf8); res += pair > 0xffff ? UtfLength::TWO : UtfLength::ONE; // NOLINT(readability-magic-numbers) - utf8 += nbytes; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) + putf8 += nbytes; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) } return res; } -size_t ConvertRegionUtf8ToUtf16(const uint8_t *utf8In, uint16_t *utf16Out, size_t utf16Len, size_t start) +size_t ConvertRegionUtf8ToUtf16(const uint8_t *utf8In, size_t iLen, uint16_t *utf16Out, size_t utf16Len, size_t start) { ASSERT(utf16Out != nullptr); size_t outPos = 0; - while (*utf8In != '\0') { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) - auto [pair, nbytes] = ConvertUtf8ToUtf16Pair(utf8In); + const uint8_t *putf8 = utf8In; + if (iLen == SIZE_MAX) { + iLen = strlen((const char *)utf8In); + } + while (putf8 - utf8In < (int)iLen) { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) + auto [pair, nbytes] = ConvertUtf8ToUtf16Pair(putf8); auto [pHi, pLo] = utf::SplitUtf16Pair(pair); - utf8In += nbytes; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) + putf8 += nbytes; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) if (start > 0) { start -= nbytes; continue; diff --git a/ecmascript/base/utf_helper.h b/ecmascript/base/utf_helper.h index 29abed490829a3e6d433f3755d681e37f6a3e7ba..d927269d3d98cadab4267c82c032bf23ecb3fc31 100644 --- a/ecmascript/base/utf_helper.h +++ b/ecmascript/base/utf_helper.h @@ -69,9 +69,9 @@ size_t ConvertRegionUtf16ToUtf8(const uint16_t *utf16In, uint8_t *utf8Out, size_ std::pair ConvertUtf8ToUtf16Pair(const uint8_t *data, bool combine = false); -size_t Utf8ToUtf16Size(const uint8_t *utf8); +size_t Utf8ToUtf16Size(const uint8_t *utf8, size_t utf8Len = SIZE_MAX); -size_t ConvertRegionUtf8ToUtf16(const uint8_t *utf8In, uint16_t *utf16Out, size_t utf16Len, size_t start); +size_t ConvertRegionUtf8ToUtf16(const uint8_t *utf8In, size_t iLen, uint16_t *utf16Out, size_t utf16Len, size_t start); static inline uint32_t CombineTwoU16(uint16_t d0, uint16_t d1) { @@ -83,4 +83,4 @@ static inline uint32_t CombineTwoU16(uint16_t d0, uint16_t d1) } } // namespace panda::ecmascript::base::utf_helper -#endif // ECMASCRIPT_BASE_UTF_HELPER_H \ No newline at end of file +#endif // ECMASCRIPT_BASE_UTF_HELPER_H diff --git a/ecmascript/ecma_string-inl.h b/ecmascript/ecma_string-inl.h index 5cffa1c1b1eaabc26f13cfeb27c8d6fb80082cbb..dad7a2b4d71126191788cb15743d79ba2bc3f35d 100644 --- a/ecmascript/ecma_string-inl.h +++ b/ecmascript/ecma_string-inl.h @@ -63,12 +63,12 @@ inline EcmaString *EcmaString::CreateFromUtf8(const uint8_t *utf8Data, uint32_t UNREACHABLE(); } } else { - auto utf16Len = base::utf_helper::Utf8ToUtf16Size(utf8Data); + auto utf16Len = base::utf_helper::Utf8ToUtf16Size(utf8Data, utf8Len); string = AllocStringObject(utf16Len, false, vm); ASSERT(string != nullptr); [[maybe_unused]] auto len = - base::utf_helper::ConvertRegionUtf8ToUtf16(utf8Data, string->GetDataUtf16Writable(), utf16Len, 0); + base::utf_helper::ConvertRegionUtf8ToUtf16(utf8Data, utf8Len, string->GetDataUtf16Writable(), utf16Len, 0); ASSERT(len == utf16Len); } diff --git a/ecmascript/ecma_string.cpp b/ecmascript/ecma_string.cpp index 7e9fa5fbbab0ba01b7f9f55df234b776687d61e3..2fa9b1b13ab9bdca4820327cbeb20cffb935c80b 100644 --- a/ecmascript/ecma_string.cpp +++ b/ecmascript/ecma_string.cpp @@ -239,15 +239,18 @@ int32_t EcmaString::IndexOf(const EcmaString *rhs, int32_t pos) const } // static -bool EcmaString::CanBeCompressed(const uint8_t *utf8Data) +bool EcmaString::CanBeCompressed(const uint8_t *utf8Data, uint32_t utf8Len) { if (!compressedStringsEnabled) { return false; } bool isCompressed = true; int index = 0; + if (utf8Len == UINT32_MAX) { + utf8Len = strlen((const char *)utf8Data); + } // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) - while (utf8Data[index] != '\0') { + while (index < (int)utf8Len) { // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) if (!IsASCIICharacter(utf8Data[index])) { isCompressed = false; @@ -257,7 +260,6 @@ bool EcmaString::CanBeCompressed(const uint8_t *utf8Data) } return isCompressed; } - /* static */ bool EcmaString::CanBeCompressed(const uint16_t *utf16Data, uint32_t utf16Len) { @@ -321,7 +323,7 @@ bool EcmaString::StringsAreEqualUtf8(const EcmaString *str1, const uint8_t *utf8 Span data2(utf8Data, utf8Len); return EcmaString::StringsAreEquals(data1, data2); } - return IsUtf8EqualsUtf16(utf8Data, str1->GetDataUtf16(), str1->GetLength()); + return IsUtf8EqualsUtf16(utf8Data, utf8Len, str1->GetDataUtf16(), str1->GetLength()); } /* static */ @@ -331,7 +333,7 @@ bool EcmaString::StringsAreEqualUtf16(const EcmaString *str1, const uint16_t *ut if (str1->GetLength() != utf16Len) { result = false; } else if (!str1->IsUtf16()) { - result = IsUtf8EqualsUtf16(str1->GetDataUtf8(), utf16Data, utf16Len); + result = IsUtf8EqualsUtf16(str1->GetDataUtf8(), str1->GetLength(), utf16Data, utf16Len); } else { Span data1(str1->GetDataUtf16(), str1->GetLength()); Span data2(utf16Data, utf16Len); @@ -392,13 +394,13 @@ static int32_t ComputeHashForData(const T *data, size_t size) return static_cast(hash); } -static int32_t ComputeHashForUtf8(const uint8_t *utf8Data) +static int32_t ComputeHashForUtf8(const uint8_t *utf8Data, size_t utf8Len) { if (utf8Data == nullptr) { return 0; } uint32_t hash = 0; - while (*utf8Data != '\0') { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) + for (size_t i = 0; i < utf8Len; i++) { constexpr size_t SHIFT = 5; hash = (hash << SHIFT) - hash + *utf8Data++; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) } @@ -422,15 +424,16 @@ uint32_t EcmaString::ComputeHashcode() const } /* static */ -uint32_t EcmaString::ComputeHashcodeUtf8(const uint8_t *utf8Data, bool canBeCompress) +uint32_t EcmaString::ComputeHashcodeUtf8(const uint8_t *utf8Data, size_t utf8Len, bool canBeCompress) { uint32_t hash; if (canBeCompress) { - hash = ComputeHashForUtf8(utf8Data); + hash = ComputeHashForUtf8(utf8Data, utf8Len); } else { - auto utf16Len = base::utf_helper::Utf8ToUtf16Size(utf8Data); + auto utf16Len = base::utf_helper::Utf8ToUtf16Size(utf8Data, utf8Len); CVector tmpBuffer(utf16Len); - [[maybe_unused]] auto len = base::utf_helper::ConvertRegionUtf8ToUtf16(utf8Data, tmpBuffer.data(), utf16Len, 0); + [[maybe_unused]] auto len = base::utf_helper::ConvertRegionUtf8ToUtf16(utf8Data, utf8Len, + tmpBuffer.data(), utf16Len, 0); ASSERT(len == utf16Len); hash = ComputeHashForData(tmpBuffer.data(), utf16Len); } @@ -444,12 +447,14 @@ uint32_t EcmaString::ComputeHashcodeUtf16(const uint16_t *utf16Data, uint32_t le } /* static */ -bool EcmaString::IsUtf8EqualsUtf16(const uint8_t *utf8Data, const uint16_t *utf16Data, uint32_t utf16Len) +bool EcmaString::IsUtf8EqualsUtf16(const uint8_t *utf8Data, uint32_t utf8Len, + const uint16_t *utf16Data, uint32_t utf16Len) { // length is one more than compared utf16Data, don't need convert all utf8Data to utf16Data uint32_t utf8ConvertLength = utf16Len + 1; CVector tmpBuffer(utf8ConvertLength); - auto len = base::utf_helper::ConvertRegionUtf8ToUtf16(utf8Data, tmpBuffer.data(), utf8ConvertLength, 0); + auto len = base::utf_helper::ConvertRegionUtf8ToUtf16(utf8Data, utf8Len, + tmpBuffer.data(), utf8ConvertLength, 0); if (len != utf16Len) { return false; } diff --git a/ecmascript/ecma_string.h b/ecmascript/ecma_string.h index 453b2d12506ca847e00362f9a3d15826078887d4..144118505fcc55dac6000de40cbc5942b1de5465 100644 --- a/ecmascript/ecma_string.h +++ b/ecmascript/ecma_string.h @@ -171,7 +171,7 @@ public: } return length; } - return base::utf_helper::ConvertRegionUtf8ToUtf16(GetDataUtf8(), buf, maxLength, start); + return base::utf_helper::ConvertRegionUtf8ToUtf16(GetDataUtf8(), GetLength(), buf, maxLength, start); } // NOLINTNEXTLINE(modernize-avoid-c-arrays) @@ -245,7 +245,7 @@ public: * Compares strings by bytes, It doesn't check canonical unicode equivalence. */ static bool StringsAreEqualUtf16(const EcmaString *str1, const uint16_t *utf16Data, uint32_t utf16Len); - static uint32_t ComputeHashcodeUtf8(const uint8_t *utf8Data, bool canBeCompress); + static uint32_t ComputeHashcodeUtf8(const uint8_t *utf8Data, size_t utf8Len, bool canBeCompress); static uint32_t ComputeHashcodeUtf16(const uint16_t *utf16Data, uint32_t length); static void SetCompressedStringsEnabled(bool val) @@ -260,7 +260,7 @@ public: static EcmaString *AllocStringObject(size_t length, bool compressed, const EcmaVM *vm); - static bool CanBeCompressed(const uint8_t *utf8Data); + static bool CanBeCompressed(const uint8_t *utf8Data, uint32_t utf8Len = UINT32_MAX); static bool CanBeCompressed(const uint16_t *utf16Data, uint32_t utf16Len); private: @@ -303,7 +303,8 @@ private: * str1 should have the same length as utf16_data. * Converts utf8Data to utf16 and compare it with given utf16_data. */ - static bool IsUtf8EqualsUtf16(const uint8_t *utf8Data, const uint16_t *utf16Data, uint32_t utf16Len); + static bool IsUtf8EqualsUtf16(const uint8_t *utf8Data, uint32_t utf8Len, + const uint16_t *utf16Data, uint32_t utf16Len); template /** diff --git a/ecmascript/ecma_string_table.cpp b/ecmascript/ecma_string_table.cpp index e74381da57e69604b35678e2681b3541a8c1d45c..33629dad267963831cf8d6da61f05bc3f5e191b7 100644 --- a/ecmascript/ecma_string_table.cpp +++ b/ecmascript/ecma_string_table.cpp @@ -26,7 +26,7 @@ EcmaStringTable::EcmaStringTable(const EcmaVM *vm) : vm_(vm) {} EcmaString *EcmaStringTable::GetString(const uint8_t *utf8Data, uint32_t utf8Len, bool canBeCompress) const { - uint32_t hashCode = EcmaString::ComputeHashcodeUtf8(utf8Data, canBeCompress); + uint32_t hashCode = EcmaString::ComputeHashcodeUtf8(utf8Data, utf8Len, canBeCompress); for (auto it = table_.find(hashCode); it != table_.end(); it++) { auto foundedString = it->second; if (EcmaString::StringsAreEqualUtf8(foundedString, utf8Data, utf8Len, canBeCompress)) { diff --git a/ecmascript/object_factory.cpp b/ecmascript/object_factory.cpp index a6262714c550f3bf1ee8509d7545e95129a6636d..1719437e40aaf02beb3f4ee3cc7303ed1f36a209 100644 --- a/ecmascript/object_factory.cpp +++ b/ecmascript/object_factory.cpp @@ -2087,7 +2087,7 @@ JSHandle ObjectFactory::NewFromStdStringUnCheck(const std::string &d JSHandle ObjectFactory::NewFromUtf8(const uint8_t *utf8Data, uint32_t utf8Len) { - bool canBeCompress = EcmaString::CanBeCompressed(utf8Data); + bool canBeCompress = EcmaString::CanBeCompressed(utf8Data, utf8Len); return GetStringFromStringTable(utf8Data, utf8Len, canBeCompress); } diff --git a/ecmascript/tests/ecma_string_test.cpp b/ecmascript/tests/ecma_string_test.cpp index f3e39b5b2e3592787c6ca6481be4c48c2d8825ff..0769dc10f13c8e9a012e5c638262de6f06eb5d59 100644 --- a/ecmascript/tests/ecma_string_test.cpp +++ b/ecmascript/tests/ecma_string_test.cpp @@ -1647,7 +1647,7 @@ HWTEST_F_L0(EcmaStringTest, ComputeHashcodeUtf8) for (uint32_t i = 0; i < lengthEcmaStrU8; i++) { hashExpect = hashExpect * 31 + arrayU8[i]; } - EXPECT_EQ(EcmaString::ComputeHashcodeUtf8(&arrayU8[0], lengthEcmaStrU8), static_cast(hashExpect)); + EXPECT_EQ(EcmaString::ComputeHashcodeUtf8(&arrayU8[0], lengthEcmaStrU8, false), static_cast(hashExpect)); } /*