diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt
index 6b3fc9485ec1a..e615702f7bbb8 100644
--- a/libc/config/linux/x86_64/entrypoints.txt
+++ b/libc/config/linux/x86_64/entrypoints.txt
@@ -1249,6 +1249,7 @@ if(LLVM_LIBC_FULL_BUILD)
 
     # wchar.h entrypoints
     libc.src.wchar.mbrtowc
+    libc.src.wchar.mbsrtowcs
     libc.src.wchar.mbtowc
     libc.src.wchar.wcrtomb
     libc.src.wchar.wctomb
diff --git a/libc/include/wchar.yaml b/libc/include/wchar.yaml
index 397296894829d..576cf09b86696 100644
--- a/libc/include/wchar.yaml
+++ b/libc/include/wchar.yaml
@@ -38,6 +38,15 @@ functions:
       - type: const char *__restrict
       - type: size_t
       - type: mbstate_t *__restrict
+  - name: mbsrtowcs
+    standards:
+      - stdc
+    return_type: size_t
+    arguments:
+      - type: wchar_t *__restrict
+      - type: const char **__restrict
+      - type: size_t
+      - type: mbstate_t *__restrict
   - name: mbtowc
     standards:
       - stdc
diff --git a/libc/src/__support/wchar/CMakeLists.txt b/libc/src/__support/wchar/CMakeLists.txt
index 86a47319f278a..c06b1023180ad 100644
--- a/libc/src/__support/wchar/CMakeLists.txt
+++ b/libc/src/__support/wchar/CMakeLists.txt
@@ -53,3 +53,22 @@ add_object_library(
     .character_converter
     .mbstate
 )
+
+# Internal mbsrtowcs conversion routine; built on the internal mbrtowc.
+add_object_library(
+  mbsrtowcs
+  HDRS
+    mbsrtowcs.h
+  SRCS
+    mbsrtowcs.cpp
+  DEPENDS
+    libc.hdr.types.mbstate_t
+    libc.hdr.types.wchar_t
+    libc.hdr.types.size_t
+    libc.src.__support.common
+    libc.src.__support.error_or
+    libc.src.__support.macros.config
+    .character_converter
+    .mbstate
+    .mbrtowc
+)
diff --git a/libc/src/__support/wchar/character_converter.cpp b/libc/src/__support/wchar/character_converter.cpp
index c54a1b751f402..af48e7d332667 100644
--- a/libc/src/__support/wchar/character_converter.cpp
+++ b/libc/src/__support/wchar/character_converter.cpp
@@ -27,7 +27,6 @@ constexpr uint32_t MASK_ENCODED_BITS =
     mask_trailing_ones<uint32_t, ENCODED_BITS_PER_UTF8>();
 // Maximum value for utf-32 for a utf-8 sequence of a given length
 constexpr char32_t MAX_VALUE_PER_UTF8_LEN[] = {0x7f, 0x7ff, 0xffff, 0x10ffff};
-constexpr int MAX_UTF8_LENGTH = 4;
 
CharacterConverter::CharacterConverter(mbstate *mbstate) { state = mbstate; } diff --git a/libc/src/__support/wchar/character_converter.h b/libc/src/__support/wchar/character_converter.h index d9a63fdc0522c..f032005a79dd5 100644 --- a/libc/src/__support/wchar/character_converter.h +++ b/libc/src/__support/wchar/character_converter.h @@ -18,6 +18,8 @@ namespace LIBC_NAMESPACE_DECL { namespace internal { +constexpr int MAX_UTF8_LENGTH = 4; + class CharacterConverter { private: mbstate *state; diff --git a/libc/src/__support/wchar/mbrtowc.cpp b/libc/src/__support/wchar/mbrtowc.cpp index 3b8f7666026c3..ae61999bdb1dc 100644 --- a/libc/src/__support/wchar/mbrtowc.cpp +++ b/libc/src/__support/wchar/mbrtowc.cpp @@ -36,7 +36,7 @@ ErrorOr mbrtowc(wchar_t *__restrict pwc, const char *__restrict s, return Error(EILSEQ); } auto wc = char_conv.pop_utf32(); - if (wc.has_value()) { + if (wc.has_value() && pwc != nullptr) { *pwc = wc.value(); // null terminator -> return 0 if (wc.value() == L'\0') diff --git a/libc/src/__support/wchar/mbsrtowcs.cpp b/libc/src/__support/wchar/mbsrtowcs.cpp new file mode 100644 index 0000000000000..50ef85755941b --- /dev/null +++ b/libc/src/__support/wchar/mbsrtowcs.cpp @@ -0,0 +1,50 @@ +//===-- Implementation for mbsrtowcs function -------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/__support/wchar/mbsrtowcs.h" +#include "hdr/types/mbstate_t.h" +#include "hdr/types/size_t.h" +#include "hdr/types/wchar_t.h" +#include "src/__support/common.h" +#include "src/__support/error_or.h" +#include "src/__support/macros/config.h" +#include "src/__support/wchar/character_converter.h" +#include "src/__support/wchar/mbrtowc.h" +#include "src/__support/wchar/mbstate.h" + +namespace LIBC_NAMESPACE_DECL { +namespace internal { + +ErrorOr mbsrtowcs(wchar_t *__restrict dst, const char **__restrict src, + size_t len, mbstate *__restrict ps) { + size_t i = 0; + // Converting characters until we reach error or null terminator + for (; i < len; ++i) { + wchar_t temp; + auto check = internal::mbrtowc(dst == nullptr ? &temp : dst, *src, + MAX_UTF8_LENGTH, ps); + // Encoding error/invalid mbstate + if (!check.has_value()) + return check; + // Successfully encoded, check for null terminator + if (temp == L'\0' || (dst != nullptr && *dst == L'\0')) { + *src = nullptr; + return i; + } + // Set src to point right after the last character converted + *src = *src + check.value(); + // Incrementing destination + if (dst != nullptr) + ++dst; + } + return i; +} + +} // namespace internal + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/__support/wchar/mbsrtowcs.h b/libc/src/__support/wchar/mbsrtowcs.h new file mode 100644 index 0000000000000..5eda23fa7baad --- /dev/null +++ b/libc/src/__support/wchar/mbsrtowcs.h @@ -0,0 +1,29 @@ +//===-- Implementation header for mbsrtowcs function ------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_WCHAR_MBSRTOWCS_H
+#define LLVM_LIBC_SRC___SUPPORT_WCHAR_MBSRTOWCS_H
+
+#include "hdr/types/size_t.h"
+#include "hdr/types/wchar_t.h"
+#include "src/__support/common.h"
+#include "src/__support/error_or.h"
+#include "src/__support/macros/config.h"
+#include "src/__support/wchar/mbstate.h"
+
+namespace LIBC_NAMESPACE_DECL {
+namespace internal {
+
+// Converts the multibyte string at *src into at most len wide characters.
+ErrorOr<size_t> mbsrtowcs(wchar_t *__restrict dst, const char **__restrict src,
+                          size_t len, mbstate *__restrict ps);
+
+} // namespace internal
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC___SUPPORT_WCHAR_MBSRTOWCS_H
diff --git a/libc/src/wchar/CMakeLists.txt b/libc/src/wchar/CMakeLists.txt
index 16664100d42c7..7a27ff8544a1e 100644
--- a/libc/src/wchar/CMakeLists.txt
+++ b/libc/src/wchar/CMakeLists.txt
@@ -78,6 +78,24 @@ add_entrypoint_object(
     libc.src.__support.wchar.mbstate
 )
 
+add_entrypoint_object(
+  mbsrtowcs
+  SRCS
+    mbsrtowcs.cpp
+  HDRS
+    mbsrtowcs.h
+  DEPENDS
+    libc.hdr.types.size_t
+    libc.hdr.types.mbstate_t
+    libc.hdr.types.wchar_t
+    libc.src.__support.common
+    libc.src.__support.macros.config
+    libc.src.__support.wchar.mbsrtowcs
+    libc.src.__support.libc_errno
+    libc.src.__support.wchar.mbstate
+    libc.src.__support.macros.null_check
+)
+
 add_entrypoint_object(
   mbtowc
   SRCS
diff --git a/libc/src/wchar/mbsrtowcs.cpp b/libc/src/wchar/mbsrtowcs.cpp
new file mode 100644
index 0000000000000..3c3140ec1717f
--- /dev/null
+++ b/libc/src/wchar/mbsrtowcs.cpp
@@ -0,0 +1,48 @@
+//===-- Implementation of mbsrtowcs ---------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/wchar/mbsrtowcs.h"
+
+#include "hdr/types/mbstate_t.h"
+#include "hdr/types/size_t.h"
+#include "hdr/types/wchar_t.h"
+#include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
+#include "src/__support/macros/config.h"
+#include "src/__support/macros/null_check.h"
+#include "src/__support/wchar/mbsrtowcs.h"
+#include "src/__support/wchar/mbstate.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+// Converts the multibyte string at *src to wide characters stored in dst.
+// A null ps selects a state object private to this function, and a null dst
+// lifts the len limit. On a conversion error errno is set from the internal
+// result and (size_t)-1 is returned; otherwise the internal count is returned.
+LLVM_LIBC_FUNCTION(size_t, mbsrtowcs,
+                   (wchar_t *__restrict dst, const char **__restrict src,
+                    size_t len, mbstate_t *__restrict ps)) {
+  LIBC_CRASH_ON_NULLPTR(src);
+  // State used when the caller does not supply one; it persists across calls.
+  static internal::mbstate internal_mbstate;
+  internal::mbstate *state =
+      ps == nullptr ? &internal_mbstate
+                    : reinterpret_cast<internal::mbstate *>(ps);
+  // Nothing is stored without a destination, so the length limit is dropped.
+  if (dst == nullptr)
+    len = SIZE_MAX;
+  auto ret = internal::mbsrtowcs(dst, src, len, state);
+  if (!ret.has_value()) {
+    // Encoding failure
+    libc_errno = ret.error();
+    return static_cast<size_t>(-1);
+  }
+  return ret.value();
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/wchar/mbsrtowcs.h b/libc/src/wchar/mbsrtowcs.h
new file mode 100644
index 0000000000000..f8d4cc26e63ae
--- /dev/null
+++ b/libc/src/wchar/mbsrtowcs.h
@@ -0,0 +1,24 @@
+//===-- Implementation header for mbsrtowcs -------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_WCHAR_MBSRTOWCS_H
+#define LLVM_LIBC_SRC_WCHAR_MBSRTOWCS_H
+
+#include "hdr/types/mbstate_t.h"
+#include "hdr/types/size_t.h"
+#include "hdr/types/wchar_t.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+size_t mbsrtowcs(wchar_t *__restrict dst, const char **__restrict src,
+                 size_t len, mbstate_t *__restrict ps);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_WCHAR_MBSRTOWCS_H
diff --git a/libc/test/src/wchar/CMakeLists.txt b/libc/test/src/wchar/CMakeLists.txt
index bf16fdd7f8c4d..44f0e7238012b 100644
--- a/libc/test/src/wchar/CMakeLists.txt
+++ b/libc/test/src/wchar/CMakeLists.txt
@@ -39,6 +39,22 @@ add_libc_test(
     libc.test.UnitTest.ErrnoCheckingTest
 )
 
+add_libc_test(
+  mbsrtowcs_test
+  SUITE
+    libc_wchar_unittests
+  SRCS
+    mbsrtowcs_test.cpp
+  DEPENDS
+    libc.src.__support.libc_errno
+    libc.src.__support.wchar.mbstate
+    libc.src.string.memset
+    libc.src.wchar.mbsrtowcs
+    libc.hdr.types.mbstate_t
+    libc.hdr.types.wchar_t
+    libc.test.UnitTest.ErrnoCheckingTest
+)
+
 add_libc_test(
   mbtowc_test
   SUITE
diff --git a/libc/test/src/wchar/mbsrtowcs_test.cpp b/libc/test/src/wchar/mbsrtowcs_test.cpp
new file mode 100644
index 0000000000000..336da37d158b4
--- /dev/null
+++ b/libc/test/src/wchar/mbsrtowcs_test.cpp
@@ -0,0 +1,152 @@
+//===-- Unittests for mbsrtowcs -------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "hdr/types/mbstate_t.h"
+#include "hdr/types/wchar_t.h"
+#include "src/__support/libc_errno.h"
+#include "src/__support/wchar/mbstate.h"
+#include "src/string/memset.h"
+#include "src/wchar/mbsrtowcs.h"
+#include "test/UnitTest/ErrnoCheckingTest.h"
+#include "test/UnitTest/Test.h"
+
+using LlvmLibcMBSRToWCSTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest;
+
+TEST_F(LlvmLibcMBSRToWCSTest, OneByteOneCharacter) {
+  // Zero-initialized state object owned by the test
+  mbstate_t mb;
+  LIBC_NAMESPACE::memset(&mb, 0, sizeof(mbstate_t));
+  const char *ch = "A";
+  wchar_t dest[2];
+  size_t n = LIBC_NAMESPACE::mbsrtowcs(dest, &ch, 2, &mb);
+  ASSERT_ERRNO_SUCCESS();
+  ASSERT_TRUE(dest[0] == L'A');
+  ASSERT_TRUE(dest[1] == L'\0');
+  // Should not count null terminator in number
+  ASSERT_EQ(static_cast<int>(n), 1);
+  // Should set ch to nullptr after reading null terminator
+  ASSERT_EQ(ch, nullptr);
+}
+
+TEST_F(LlvmLibcMBSRToWCSTest, MultiByteOneCharacter) {
+  const char *src = "\xf0\x9f\x98\xb9"; // laughing cat emoji 😹
+  wchar_t dest[2];
+  size_t n = LIBC_NAMESPACE::mbsrtowcs(dest, &src, 2, nullptr);
+  ASSERT_ERRNO_SUCCESS();
+  ASSERT_EQ(static_cast<int>(dest[0]), 128569);
+  ASSERT_TRUE(dest[1] == L'\0');
+  // Should not count null terminator in number
+  ASSERT_EQ(static_cast<int>(n), 1);
+  // Should set src to nullptr after reading null terminator
+  ASSERT_EQ(src, nullptr);
+}
+
+TEST_F(LlvmLibcMBSRToWCSTest, MultiByteTwoCharacters) {
+  // Two laughing cat emojis "😹😹"
+  const char *src = "\xf0\x9f\x98\xb9\xf0\x9f\x98\xb9";
+  wchar_t dest[3];
+  size_t n = LIBC_NAMESPACE::mbsrtowcs(dest, &src, 3, nullptr);
+  ASSERT_ERRNO_SUCCESS();
+  ASSERT_EQ(static_cast<int>(dest[0]), 128569);
+  ASSERT_EQ(static_cast<int>(dest[1]), 128569);
+  ASSERT_TRUE(dest[2] == L'\0');
+  // Should not count null terminator in number
+  ASSERT_EQ(static_cast<int>(n), 2);
+  // Should set src to nullptr after reading null terminator
+  ASSERT_EQ(src, nullptr);
+}
+
+TEST_F(LlvmLibcMBSRToWCSTest, MixedNumberOfBytes) {
+  // 'A', sigma symbol 'Σ', recycling symbol '♻', laughing cat emoji '😹'
+  const char *src = "A\xce\xa3\xe2\x99\xbb\xf0\x9f\x98\xb9";
+  wchar_t dest[5];
+  size_t n = LIBC_NAMESPACE::mbsrtowcs(dest, &src, 5, nullptr);
+  ASSERT_ERRNO_SUCCESS();
+  ASSERT_EQ(static_cast<char>(dest[0]), 'A');
+  ASSERT_EQ(static_cast<int>(dest[1]), 931);
+  ASSERT_EQ(static_cast<int>(dest[2]), 9851);
+  ASSERT_EQ(static_cast<int>(dest[3]), 128569);
+  ASSERT_TRUE(dest[4] == L'\0');
+  // Should not count null terminator in number
+  ASSERT_EQ(static_cast<int>(n), 4);
+  // Should set src to nullptr after reading null terminator
+  ASSERT_EQ(src, nullptr);
+}
+
+TEST_F(LlvmLibcMBSRToWCSTest, ReadLessThanStringLength) {
+  // Four laughing cat emojis "😹😹😹😹"
+  const char *src =
+      "\xf0\x9f\x98\xb9\xf0\x9f\x98\xb9\xf0\x9f\x98\xb9\xf0\x9f\x98\xb9";
+  const char *check = src;
+  wchar_t dest[3];
+  size_t n = LIBC_NAMESPACE::mbsrtowcs(dest, &src, 3, nullptr);
+  ASSERT_ERRNO_SUCCESS();
+  // Should have read 3 emojis
+  ASSERT_EQ(static_cast<int>(n), 3);
+  ASSERT_EQ(static_cast<int>(dest[0]), 128569);
+  ASSERT_EQ(static_cast<int>(dest[1]), 128569);
+  ASSERT_EQ(static_cast<int>(dest[2]), 128569);
+  // src should now point to the 4th cat emoji aka 13th byte
+  ASSERT_EQ((check + 12), src);
+}
+
+TEST_F(LlvmLibcMBSRToWCSTest, InvalidFirstByte) {
+  // 0x80 is invalid first byte of mb character
+  const char *src =
+      "\x80\x9f\x98\xb9\xf0\x9f\x98\xb9\xf0\x9f\x98\xb9\xf0\x9f\x98\xb9";
+  wchar_t dest[3];
+  size_t n = LIBC_NAMESPACE::mbsrtowcs(dest, &src, 3, nullptr);
+  // Should return error and set errno
+  ASSERT_EQ(static_cast<int>(n), -1);
+  ASSERT_ERRNO_EQ(EILSEQ);
+}
+
+TEST_F(LlvmLibcMBSRToWCSTest, InvalidMiddleByte) {
+  // The 7th byte is invalid for a 4 byte character
+  const char *src =
+      "\xf0\x9f\x98\xb9\xf0\x9f\xf0\xb9\xf0\x9f\x98\xb9\xf0\x9f\x98\xb9";
+  wchar_t dest[3];
+  size_t n = LIBC_NAMESPACE::mbsrtowcs(dest, &src, 3, nullptr);
+  // Should return error and set errno
+  ASSERT_EQ(static_cast<int>(n), -1);
+  ASSERT_ERRNO_EQ(EILSEQ);
+}
+
+TEST_F(LlvmLibcMBSRToWCSTest, NullDestination) {
+  // Four laughing cat emojis "😹😹😹😹"
+  const char *src =
+      "\xf0\x9f\x98\xb9\xf0\x9f\x98\xb9\xf0\x9f\x98\xb9\xf0\x9f\x98\xb9";
+  size_t n = LIBC_NAMESPACE::mbsrtowcs(nullptr, &src, 2, nullptr);
+  ASSERT_ERRNO_SUCCESS();
+  // Null destination should ignore len and read till end of string
+  ASSERT_EQ(static_cast<int>(n), 4);
+}
+
+TEST_F(LlvmLibcMBSRToWCSTest, InvalidMBState) {
+  // Zero the state first so only total_bytes is out of range
+  LIBC_NAMESPACE::internal::mbstate inv;
+  LIBC_NAMESPACE::memset(&inv, 0, sizeof(inv));
+  inv.total_bytes = 6;
+  mbstate_t *mb = reinterpret_cast<mbstate_t *>(&inv);
+  // Four laughing cat emojis "😹😹😹😹"
+  const char *src =
+      "\xf0\x9f\x98\xb9\xf0\x9f\x98\xb9\xf0\x9f\x98\xb9\xf0\x9f\x98\xb9";
+  wchar_t dest[3];
+  size_t n = LIBC_NAMESPACE::mbsrtowcs(dest, &src, 3, mb);
+  // Should fail from invalid mbstate
+  ASSERT_EQ(static_cast<int>(n), -1);
+  ASSERT_ERRNO_EQ(EINVAL);
+}
+
+#if defined(LIBC_ADD_NULL_CHECKS) && !defined(LIBC_HAS_SANITIZER)
+TEST_F(LlvmLibcMBSRToWCSTest, NullSource) {
+  // Passing in a nullptr source should crash the program
+  EXPECT_DEATH([] { LIBC_NAMESPACE::mbsrtowcs(nullptr, nullptr, 1, nullptr); },
+               WITH_SIGNAL(-1));
+}
+#endif // LIBC_ADD_NULL_CHECKS && !LIBC_HAS_SANITIZER