kaitai-io · GreyCat · Jul 24, 2023 · Jul 27, 2023 · Apr 6, 2025 · Apr 6, 2025
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -16,6 +16,13 @@ jobs:
           - '98'
           - '11'
           - '20'
+        encoding:
+          - ICONV
+          - ICU
+        exclude:
+          # Exclude ICU on C++98, as modern ICU library requires at least C++11
+          - cpp-standard: '98'
+            encoding: ICU
     steps:
       - uses: actions/checkout@v4
       - name: Install GoogleTest
@@ -26,15 +33,20 @@ jobs:
         run: sudo apt-get update
       - name: Install packages
         run: sudo apt-get install -y iwyu valgrind
+      - name: Install ICU
+        run: sudo apt-get install -y libicu-dev
+        if: matrix.encoding == 'ICU'
       - name: Build
         env:
           CPP_STANDARD: ${{ matrix.cpp-standard }}
+          ENCODING_TYPE: ${{ matrix.encoding }}
           # This tells the C++ compiler to produce debugging info that Valgrind needs to report line numbers.
           # See also https://valgrind.org/docs/manual/manual-core.html#manual-core.started
           CMAKE_BUILD_TYPE: Debug
         run: |
           .build/build \
             -DCMAKE_BUILD_TYPE="$CMAKE_BUILD_TYPE" \
+            -DSTRING_ENCODING_TYPE="$ENCODING_TYPE" \
             -DCMAKE_CXX_STANDARD="$CPP_STANDARD" -DCMAKE_CXX_STANDARD_REQUIRED=ON -DCMAKE_CXX_EXTENSIONS=OFF \
             -DCMAKE_CXX_INCLUDE_WHAT_YOU_USE='include-what-you-use;-Xiwyu;--verbose=3'
       - name: Run tests

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -10,6 +10,8 @@ set (CMAKE_INCLUDE_CURRENT_DIR ON)
 find_package(ZLIB)
 find_package(Iconv)
 
+find_package(ICU COMPONENTS uc)
+
 set (HEADERS
     kaitai/kaitaistream.h
     kaitai/kaitaistruct.h
@@ -20,7 +22,7 @@ set (SOURCES
     kaitai/kaitaistream.cpp
 )
 
-set(STRING_ENCODING_TYPE "ICONV" CACHE STRING "Set the way strings have to be encoded (ICONV|WIN32API|NONE|...)")
+set(STRING_ENCODING_TYPE "ICONV" CACHE STRING "Set the way strings have to be encoded (ICONV|WIN32API|ICU|NONE|...)")
 
 set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
 
@@ -37,6 +39,10 @@ if(Iconv_FOUND)
     target_link_libraries(${PROJECT_NAME} PRIVATE Iconv::Iconv)
 endif()
 
+if(STRING_ENCODING_TYPE STREQUAL "ICU") # link ICU only when it is the selected backend; if ICU was not found, the missing ICU::uc target is reported by CMake
+    target_link_libraries(${PROJECT_NAME} PRIVATE ICU::uc)
+endif()
+
 include(Common.cmake)
 
 install(TARGETS ${PROJECT_NAME}

diff --git a/Common.cmake b/Common.cmake
@@ -2,6 +2,8 @@ if (STRING_ENCODING_TYPE STREQUAL "ICONV")
     target_compile_definitions(${PROJECT_NAME} PRIVATE -DKS_STR_ENCODING_ICONV)
 elseif (STRING_ENCODING_TYPE STREQUAL "WIN32API")
     target_compile_definitions(${PROJECT_NAME} PRIVATE -DKS_STR_ENCODING_WIN32API)
+elseif (STRING_ENCODING_TYPE STREQUAL "ICU")
+    target_compile_definitions(${PROJECT_NAME} PRIVATE -DKS_STR_ENCODING_ICU)
 elseif (STRING_ENCODING_TYPE STREQUAL "NONE")
     target_compile_definitions(${PROJECT_NAME} PRIVATE -DKS_STR_ENCODING_NONE)
 else()

diff --git a/kaitai/kaitaistream.cpp b/kaitai/kaitaistream.cpp
@@ -1197,6 +1197,112 @@ std::string kaitai::kstream::bytes_to_str(const std::string src, int codepage) {
     return utf8;
 }
 
+#elif defined(KS_STR_ENCODING_ICU)
+#include <unicode/ucnv.h>
+#include <unicode/ustring.h>
+
+// Decodes `src` (bytes in the encoding named `src_enc`) into a UTF-8 string using ICU,
+// going through UTF-16 as ICU's intermediate representation. Both converters use the
+// "stop" callbacks, so malformed input raises an error instead of being substituted.
+//
+// Throws unknown_encoding if ICU has no converter for `src_enc`, illegal_seq_in_encoding
+// (carrying the ICU error name) if a conversion fails, and bytes_to_str_error for other
+// ICU failures or when the data is too long for ICU's 32-bit lengths.
+std::string kaitai::kstream::bytes_to_str(const std::string src, const char *src_enc) {
+    // Scope guards: release the ICU converters and the UTF-16 buffer on every exit path,
+    // including exceptions (e.g. std::bad_alloc) thrown after they were acquired.
+    struct converter_guard {
+        UConverter* handle;
+        explicit converter_guard(UConverter* h) : handle(h) {}
+        ~converter_guard() { if (handle) ucnv_close(handle); }
+    };
+    struct uchar_buffer {
+        UChar* data;
+        explicit uchar_buffer(int32_t capacity) : data(new UChar[capacity]) {}
+        ~uchar_buffer() { delete[] data; }
+    };
+
+    UErrorCode err = U_ZERO_ERROR;
+
+    // Open the source converter
+    converter_guard conv(ucnv_open(src_enc, &err));
+    if (U_FAILURE(err)) {
+        if (err == U_FILE_ACCESS_ERROR) {
+            throw unknown_encoding(src_enc);
+        }
+        throw bytes_to_str_error(u_errorName(err));
+    }
+
+    // Open UTF-8 converter
+    converter_guard utf8Conv(ucnv_open("UTF-8", &err));
+    if (U_FAILURE(err)) {
+        throw bytes_to_str_error(u_errorName(err));
+    }
+
+    // Configure source converter to stop on illegal sequences
+    err = U_ZERO_ERROR;
+    ucnv_setToUCallBack(conv.handle, UCNV_TO_U_CALLBACK_STOP, nullptr, nullptr, nullptr, &err);
+    if (U_FAILURE(err)) {
+        throw illegal_seq_in_encoding(u_errorName(err));
+    }
+
+    // ICU takes lengths as int32_t; reject input whose UTF-16 buffer size would overflow
+    if (src.length() > static_cast<std::string::size_type>(INT32_MAX / 2)) {
+        throw bytes_to_str_error("source string is too long");
+    }
+    const int32_t srcLength = static_cast<int32_t>(src.length());
+
+    // Allocate buffer for UTF-16 intermediate representation: ucnv_toUChars() needs at
+    // most 2 * srcLength UChars, as each source byte may expand to a surrogate pair
+    const int32_t uniCapacity = 2 * srcLength;
+    uchar_buffer uniStr(uniCapacity);
+
+    // Convert from source encoding to UTF-16
+    err = U_ZERO_ERROR;
+    const int32_t uniLength = ucnv_toUChars(
+        conv.handle,
+        uniStr.data,
+        uniCapacity,
+        src.c_str(),
+        srcLength,
+        &err);
+    if (U_FAILURE(err)) {
+        throw illegal_seq_in_encoding(u_errorName(err));
+    }
+
+    // Configure target converter to stop on illegal sequences
+    err = U_ZERO_ERROR;
+    ucnv_setFromUCallBack(utf8Conv.handle, UCNV_FROM_U_CALLBACK_STOP, nullptr, nullptr, nullptr, &err);
+    if (U_FAILURE(err)) {
+        throw illegal_seq_in_encoding(u_errorName(err));
+    }
+
+    // Allocate buffer for UTF-8 output, guarding the int32_t size arithmetic against overflow
+    const int32_t utf8MaxCharSize = ucnv_getMaxCharSize(utf8Conv.handle);
+    if (uniLength > INT32_MAX / utf8MaxCharSize - 10) {
+        throw bytes_to_str_error("converted string is too long");
+    }
+    std::string dst(
+        static_cast<std::string::size_type>(UCNV_GET_MAX_BYTES_FOR_STRING(uniLength, utf8MaxCharSize)),
+        '\0');
+
+    // Convert from UTF-16 to UTF-8
+    err = U_ZERO_ERROR;
+    const int32_t outputLength = ucnv_fromUChars(
+        utf8Conv.handle,
+        &dst[0],
+        static_cast<int32_t>(dst.size()),
+        uniStr.data,
+        uniLength,
+        &err);
+    if (U_FAILURE(err)) {
+        throw illegal_seq_in_encoding(u_errorName(err));
+    }
+
+    // Keep only the bytes ICU actually produced
+    dst.resize(outputLength);
+    return dst;
+}
 #else
-#error Need to decide how to handle strings: please define one of: KS_STR_ENCODING_ICONV, KS_STR_ENCODING_WIN32API, KS_STR_ENCODING_NONE
+#error Need to decide how to handle strings: please define one of: KS_STR_ENCODING_ICONV, KS_STR_ENCODING_WIN32API, KS_STR_ENCODING_ICU, KS_STR_ENCODING_NONE
 #endif
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
@@ -23,4 +23,8 @@ target_compile_options(unittest PRIVATE
 # Link the test executable with the main library and the test framework/library
 target_link_libraries(unittest PRIVATE kaitai_struct_cpp_stl_runtime GTest::GTest GTest::Main)
 
+if(ICU_FOUND) # unittest.cpp itself calls ICU's u_cleanup() when built with KS_STR_ENCODING_ICU, so the test binary links ICU directly
+    target_link_libraries(unittest PRIVATE ICU::uc)
+endif()
+
 add_test(NAME unittest COMMAND unittest)
diff --git a/tests/unittest.cpp b/tests/unittest.cpp
@@ -540,6 +540,8 @@ TEST(KaitaiStreamTest, bytes_to_str_invalid_seq_euc_jp_too_short)
         EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: EINVAL"));
 #elif defined(KS_STR_ENCODING_WIN32API)
         EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: MultiByteToWideChar"));
+#elif defined(KS_STR_ENCODING_ICU)
+        EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: U_TRUNCATED_CHAR_FOUND"));
 #else
 #error Unknown KS_STR_ENCODING
 #endif
@@ -556,6 +558,8 @@ TEST(KaitaiStreamTest, bytes_to_str_invalid_seq_gb2312_too_short)
         EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: EINVAL"));
 #elif defined(KS_STR_ENCODING_WIN32API)
         EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: MultiByteToWideChar"));
+#elif defined(KS_STR_ENCODING_ICU)
+        EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: U_TRUNCATED_CHAR_FOUND"));
 #else
 #error Unknown KS_STR_ENCODING
 #endif
@@ -581,6 +585,8 @@ TEST(KaitaiStreamTest, bytes_to_str_invalid_seq_gb2312_two_bytes)
         EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: EILSEQ"));
 #elif defined(KS_STR_ENCODING_WIN32API)
         EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: MultiByteToWideChar"));
+#elif defined(KS_STR_ENCODING_ICU)
+        EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: U_ILLEGAL_CHAR_FOUND"));
 #else
 #error Unknown KS_STR_ENCODING
 #endif
@@ -598,6 +604,8 @@ TEST(KaitaiStreamTest, bytes_to_str_invalid_seq_utf16le_odd_bytes)
         EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: EINVAL"));
 #elif defined(KS_STR_ENCODING_WIN32API)
         EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: incomplete"));
+#elif defined(KS_STR_ENCODING_ICU)
+        EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: U_TRUNCATED_CHAR_FOUND"));
 #else
 #error Unknown KS_STR_ENCODING
 #endif
@@ -616,15 +624,32 @@ TEST(KaitaiStreamTest, bytes_to_str_invalid_seq_utf16le_incomplete_high_surrogat
         EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: EINVAL"));
 #elif defined(KS_STR_ENCODING_WIN32API)
         EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: WideCharToMultiByte"));
+#elif defined(KS_STR_ENCODING_ICU)
+        EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: U_TRUNCATED_CHAR_FOUND"));
 #else
 #error Unknown KS_STR_ENCODING
 #endif
     }
 }
 #endif
 
+#if defined(KS_STR_ENCODING_ICU)
+#include <unicode/uclean.h>
+#endif
+
 int main(int argc, char** argv)
 {
     ::testing::InitGoogleTest(&argc, argv);
-    return RUN_ALL_TESTS();
+    const int ret = RUN_ALL_TESTS();
+#if defined(KS_STR_ENCODING_ICU)
+    // See <https://unicode-org.github.io/icu/userguide/icu/design.html#icu4c-initialization-and-termination>:
+    //
+    // > When an application is terminating it should call the function `u_cleanup()`,
+    // > which frees all heap storage and other system resources that are held internally
+    // > by the ICU library. While the use of `u_cleanup()` is not strictly required,
+    // > failure to call it will cause memory leak checking tools to report problems for
+    // > resources being held by ICU library.
+    u_cleanup();
+#endif
+    return ret;
 }