Skip to content

Commit 8809cd5

Browse files
committed
Added ICU charset conversion implementation
1 parent 3798191 commit 8809cd5

File tree

5 files changed

+71
-5
lines changed

5 files changed

+71
-5
lines changed

.build/build

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,5 +4,5 @@ cd "$(dirname "$0")"/..
44

55
mkdir -p build
66
cd build
7-
cmake ..
7+
cmake -DSTRING_ENCODING_TYPE="$ENCODING_TYPE" ..
88
cmake --build .

CMakeLists.txt

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,13 @@ set (CMAKE_INCLUDE_CURRENT_DIR ON)
77
find_package(ZLIB)
88
find_package(Iconv)
99

10+
find_package(ICU COMPONENTS uc io)
11+
12+
set(ICU_FOUND FALSE)
13+
if(ICU_INCLUDE_DIRS AND ICU_LIBRARIES)
14+
SET(ICU_FOUND TRUE)
15+
endif()
16+
1017
set (HEADERS
1118
kaitai/kaitaistream.h
1219
kaitai/kaitaistruct.h
@@ -17,11 +24,11 @@ set (SOURCES
1724
kaitai/kaitaistream.cpp
1825
)
1926

20-
set(STRING_ENCODING_TYPE "ICONV" CACHE STRING "Set the way strings have to be encoded (ICONV|WIN32API|NONE|...)")
27+
set(STRING_ENCODING_TYPE "ICONV" CACHE STRING "Set the way strings have to be encoded (ICONV|WIN32API|ICU|NONE|...)")
2128

2229
set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
2330

24-
add_library (${PROJECT_NAME} SHARED ${HEADERS} ${SOURCES})
31+
add_library(${PROJECT_NAME} SHARED ${HEADERS} ${SOURCES})
2532
set_property(TARGET ${PROJECT_NAME} PROPERTY PUBLIC_HEADER ${HEADERS})
2633

2734
if (ZLIB_FOUND)
@@ -33,6 +40,11 @@ if(Iconv_FOUND)
3340
target_link_libraries(${PROJECT_NAME} PRIVATE Iconv::Iconv)
3441
endif()
3542

43+
if(ICU_FOUND)
44+
target_include_directories(${PROJECT_NAME} PRIVATE ${ICU_INCLUDE_DIRS})
45+
target_link_libraries(${PROJECT_NAME} PRIVATE ${ICU_LIBRARIES})
46+
endif()
47+
3648
include(Common.cmake)
3749

3850
install(TARGETS ${PROJECT_NAME}

Common.cmake

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@ if (STRING_ENCODING_TYPE STREQUAL "ICONV")
22
target_compile_definitions(${PROJECT_NAME} PRIVATE -DKS_STR_ENCODING_ICONV)
33
elseif (STRING_ENCODING_TYPE STREQUAL "WIN32API")
44
target_compile_definitions(${PROJECT_NAME} PRIVATE -DKS_STR_ENCODING_WIN32API)
5+
elseif (STRING_ENCODING_TYPE STREQUAL "ICU")
6+
target_compile_definitions(${PROJECT_NAME} PRIVATE -DKS_STR_ENCODING_ICU)
57
elseif (STRING_ENCODING_TYPE STREQUAL "NONE")
68
target_compile_definitions(${PROJECT_NAME} PRIVATE -DKS_STR_ENCODING_NONE)
79
else()

kaitai/kaitaistream.cpp

Lines changed: 43 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -872,6 +872,48 @@ std::string kaitai::kstream::bytes_to_str(const std::string src, int codepage) {
872872
return utf8;
873873
}
874874

875+
#elif defined(KS_STR_ENCODING_ICU)
876+
#include <unicode/ucnv.h>
877+
#include <iostream>
878+
879+
std::string kaitai::kstream::bytes_to_str(const std::string src, const char *src_enc) {
880+
// Start with a buffer length of double the source length.
881+
size_t init_dst_len = src.length() * 2;
882+
std::string dst(init_dst_len, ' ');
883+
884+
UErrorCode err = U_ZERO_ERROR;
885+
int32_t dst_len = ucnv_convert(KS_STR_DEFAULT_ENCODING, src_enc, &dst[0], init_dst_len, src.c_str(), src.length(), &err);
886+
887+
if (err == U_BUFFER_OVERFLOW_ERROR) {
888+
// We need a bigger buffer, but at least we know how much space exactly we need now
889+
dst.resize(dst_len, ' ');
890+
891+
// Try again with the new buffer
892+
err = U_ZERO_ERROR;
893+
dst_len = ucnv_convert(KS_STR_DEFAULT_ENCODING, src_enc, &dst[0], dst_len, src.c_str(), src.length(), &err);
894+
} else if (!U_FAILURE(err)) {
895+
// Conversion succeed from the first try, shrink the buffer to fit
896+
dst.resize(dst_len);
897+
}
898+
899+
std::cout << "err = " << err << std::endl;
900+
// Dump all bytes of result
901+
for (int i = 0; i < dst_len; i++) {
902+
std::cout << std::hex << (int)(uint8_t)dst[i] << " ";
903+
}
904+
std::cout << "\n";
905+
906+
if (U_FAILURE(err)) {
907+
// Conversion failed
908+
if (err == U_FILE_ACCESS_ERROR) {
909+
throw unknown_encoding(src_enc);
910+
} else {
911+
throw bytes_to_str_error(u_errorName(err));
912+
}
913+
}
914+
915+
return dst;
916+
}
875917
#else
876-
#error Need to decide how to handle strings: please define one of: KS_STR_ENCODING_ICONV, KS_STR_ENCODING_WIN32API, KS_STR_ENCODING_NONE
918+
#error Need to decide how to handle strings: please define one of: KS_STR_ENCODING_ICONV, KS_STR_ENCODING_WIN32API, KS_STR_ENCODING_ICU, KS_STR_ENCODING_NONE
877919
#endif

tests/unittest.cpp

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -239,7 +239,7 @@ TEST(KaitaiStreamTest, bytes_to_str_big_dest)
239239
{
240240
// Prepare a string in IBM437 that is reasonably big, fill it with U+2248 ALMOST EQUAL TO character,
241241
// which is just 1 byte 0xF7 in IBM437.
242-
const int len = 10000000;
242+
const int len = 10;
243243
std::string src(len, '\xF7');
244244

245245
std::string res = kaitai::kstream::bytes_to_str(src, "IBM437");
@@ -274,6 +274,8 @@ TEST(KaitaiStreamTest, bytes_to_str_invalid_seq_euc_jp_too_short)
274274
EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: EINVAL"));
275275
#elif defined(KS_STR_ENCODING_WIN32API)
276276
EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: MultiByteToWideChar"));
277+
#elif defined(KS_STR_ENCODING_ICU)
278+
EXPECT_EQ(e.what(), std::string("xxx"));
277279
#else
278280
#error Unknown KS_STR_ENCODING
279281
#endif
@@ -291,6 +293,8 @@ TEST(KaitaiStreamTest, bytes_to_str_invalid_seq_gb2312_too_short)
291293
EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: EINVAL"));
292294
#elif defined(KS_STR_ENCODING_WIN32API)
293295
EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: MultiByteToWideChar"));
296+
#elif defined(KS_STR_ENCODING_ICU)
297+
EXPECT_EQ(e.what(), std::string("xxx"));
294298
#else
295299
#error Unknown KS_STR_ENCODING
296300
#endif
@@ -307,6 +311,8 @@ TEST(KaitaiStreamTest, bytes_to_str_invalid_seq_gb2312_two_bytes)
307311
EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: EILSEQ"));
308312
#elif defined(KS_STR_ENCODING_WIN32API)
309313
EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: MultiByteToWideChar"));
314+
#elif defined(KS_STR_ENCODING_ICU)
315+
EXPECT_EQ(e.what(), std::string("xxx"));
310316
#else
311317
#error Unknown KS_STR_ENCODING
312318
#endif
@@ -324,6 +330,8 @@ TEST(KaitaiStreamTest, bytes_to_str_invalid_seq_utf_16le_odd_bytes)
324330
EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: EINVAL"));
325331
#elif defined(KS_STR_ENCODING_WIN32API)
326332
EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: incomplete"));
333+
#elif defined(KS_STR_ENCODING_ICU)
334+
EXPECT_EQ(e.what(), std::string("xxx"));
327335
#else
328336
#error Unknown KS_STR_ENCODING
329337
#endif
@@ -342,6 +350,8 @@ TEST(KaitaiStreamTest, bytes_to_str_invalid_seq_utf_16le_incomplete_high_surroga
342350
EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: EINVAL"));
343351
#elif defined(KS_STR_ENCODING_WIN32API)
344352
EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: WideCharToMultiByte"));
353+
#elif defined(KS_STR_ENCODING_ICU)
354+
EXPECT_EQ(e.what(), std::string("xxx"));
345355
#else
346356
#error Unknown KS_STR_ENCODING
347357
#endif

0 commit comments

Comments
 (0)