Skip to content

Commit 1f42029

Browse files
Fixes for tidy_bytes
admittedly not the cleanest way to do this, although it works for #25.
1 parent bed3f61 commit 1f42029

File tree

1 file changed

+44
-12
lines changed

1 file changed

+44
-12
lines changed

lib/wayback_machine_downloader/tidy_bytes.rb

Lines changed: 44 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -2,24 +2,56 @@
22

33
# essentially, this is for converting a string with a potentially
44
# broken or unknown encoding into a valid UTF-8 string
5+
# @todo: consider using charlock_holmes for this in the future
56
module TidyBytes
7+
# U+FFFD REPLACEMENT CHARACTER, passed as the `replace:` value when bytes
# cannot be converted to UTF-8.
# Written as an escape sequence so the value does not depend on the source
# file's bytes surviving editors or tools that mangle non-ASCII text, and
# frozen because the same object is reused on every call.
UNICODE_REPLACEMENT_CHARACTER = "\uFFFD".freeze
8+
9+
# common encodings to try for best multilingual compatibility
10+
# Candidate source encodings, in the order tidy_bytes tries them.
#
# Order matters: the first candidate that decodes without needing a
# replacement character wins. ISO-8859-1 assigns a character to every one of
# the 256 byte values, so it always decodes cleanly and must come last;
# anything listed after it could never be selected. Windows-1252 leaves a few
# bytes (e.g. 0x81) undefined, so it is tried before ISO-8859-1 and maps
# 0x80-0x9F to punctuation instead of C1 control characters.
#
# No availability filter is applied: referencing an Encoding constant that
# does not exist raises NameError while this literal is being built, so a
# later `select` on Encoding.name_list could never remove anything.
COMMON_ENCODINGS = [
  Encoding::UTF_8,
  Encoding::Windows_1251, # Cyrillic/Russian legacy
  Encoding::GB18030,      # Simplified Chinese
  Encoding::Shift_JIS,    # Japanese
  Encoding::EUC_KR,       # Korean
  Encoding::Windows_1252, # Western European/Latin1 superset
  Encoding::ISO_8859_1    # Western European; accepts every byte, keep last
].freeze
19+
20+
# returns true if the string appears to be binary (has null bytes)
21+
# Reports whether the receiver looks like binary data rather than text.
#
# The only heuristic is the presence of a NUL (0x00) byte. The search runs on
# a BINARY-tagged copy of the bytes (String#b) so it works whatever encoding
# the receiver is tagged with; searching the receiver directly raises
# Encoding::CompatibilityError for ASCII-incompatible encodings such as
# UTF-16. Note that UTF-16/UTF-32 text normally contains NUL bytes and is
# therefore reported as binary.
#
# @return [Boolean] true when at least one byte of the string is 0x00
def binary_data?
  b.include?("\x00".b)
end
24+
25+
# attempts to return a valid UTF-8 version of the string
626
# Returns a valid UTF-8 rendering of the receiver, guessing the source
# encoding when the bytes are not already valid UTF-8.
#
# 1. A string tagged UTF-8 whose bytes are valid is returned unchanged (the
#    receiver itself, not a copy).
# 2. A string for which binary_data? is true is returned as a BINARY-tagged
#    copy; no conversion is attempted.
# 3. Otherwise a copy of the bytes is reinterpreted as each COMMON_ENCODINGS
#    entry in order; the first conversion to UTF-8 that needs no replacement
#    character is returned. The receiver's own encoding tag is not consulted.
# 4. If no candidate converts cleanly, the bytes are treated as UTF-8 and
#    every invalid sequence becomes UNICODE_REPLACEMENT_CHARACTER.
#
# The receiver is never modified.
#
# @return [String] the receiver, a BINARY copy, or a new UTF-8 string
def tidy_bytes
  return self if encoding == Encoding::UTF_8 && valid_encoding?
  return dup.force_encoding("BINARY") if binary_data?

  # With both :invalid and :undef set to :replace, encode substitutes instead
  # of raising conversion errors, so no rescue is needed around it.
  replace_opts = { invalid: :replace, undef: :replace, replace: UNICODE_REPLACEMENT_CHARACTER }
  raw = dup # force_encoding mutates, so work on a private copy

  COMMON_ENCODINGS.each do |candidate|
    converted = raw.force_encoding(candidate).encode(Encoding::UTF_8, **replace_opts)
    clean = converted.valid_encoding? && !converted.include?(UNICODE_REPLACEMENT_CHARACTER)
    return converted if clean
  end

  # No candidate decoded cleanly: keep whatever is valid UTF-8 and replace the
  # rest. A UTF-8 to UTF-8 encode with invalid: :replace scrubs bad sequences.
  raw.force_encoding(Encoding::UTF_8).encode(Encoding::UTF_8, **replace_opts)
end
2456

2557
def tidy_bytes!
@@ -43,4 +75,4 @@ def tidy_bytes!
4375

4476
# Reopens the core String class and mixes in TidyBytes, so the module's
# instance methods are available on every string.
class String
4577
include TidyBytes
46-
end
78+
end

0 commit comments

Comments
 (0)