2
2
3
3
# essentially, this is for converting a string with a potentially
4
4
# broken or unknown encoding into a valid UTF-8 string
5
+ # @todo: consider using charlock_holmes for this in the future
5
6
module TidyBytes
7
# U+FFFD, substituted for bytes that cannot be decoded. Written as an escape
# so the literal survives any re-encoding of this file.
UNICODE_REPLACEMENT_CHARACTER = "\uFFFD".freeze

# Common encodings to try, in priority order, for best multilingual
# compatibility. They are listed by name and resolved with Encoding.find so
# that an encoding missing from this Ruby build is skipped instead of raising
# NameError when the constant is referenced.
COMMON_ENCODINGS = [
  "UTF-8",
  "Windows-1251", # Cyrillic/Russian legacy
  "GB18030",      # Simplified Chinese
  "Shift_JIS",    # Japanese
  "EUC-KR",       # Korean
  "ISO-8859-1",   # Western European
  "Windows-1252"  # Western European/Latin1 superset
].select { |name| Encoding.name_list.include?(name) }.map { |name| Encoding.find(name) }.freeze
19
+
20
+ # returns true if the string appears to be binary (has null bytes)
21
# Heuristic binary check: a string is considered binary when it contains at
# least one NUL byte.
#
# @return [Boolean] true if any byte of the receiver is 0x00
def binary_data?
  # Search the raw bytes (String#b) rather than the tagged string, so that a
  # receiver in a non-ASCII-compatible encoding (e.g. UTF-16) does not raise
  # Encoding::CompatibilityError.
  b.include?("\x00".b)
end
24
+
25
+ # attempts to return a valid UTF-8 version of the string
6
26
def tidy_bytes
7
- # return if the string is already valid UTF-8
8
- return self if self . valid_encoding? && self . encoding == Encoding :: UTF_8
27
+ return self if self . encoding == Encoding :: UTF_8 && self . valid_encoding?
28
+ return self . dup . force_encoding ( "BINARY" ) if binary_data?
9
29
10
- # create a mutable copy so we don't modify the original string
11
30
str = self . dup
31
+ COMMON_ENCODINGS . each do |enc |
32
+ str . force_encoding ( enc )
33
+ begin
34
+ utf8 = str . encode ( Encoding ::UTF_8 , invalid : :replace , undef : :replace , replace : UNICODE_REPLACEMENT_CHARACTER )
35
+ return utf8 if utf8 . valid_encoding? && !utf8 . include? ( UNICODE_REPLACEMENT_CHARACTER )
36
+ rescue Encoding ::UndefinedConversionError , Encoding ::InvalidByteSequenceError
37
+ # try next encoding
38
+ end
39
+ end
12
40
13
- # attempt to encode to UTF-8
14
- begin
15
- return str . encode ( Encoding ::UTF -8 )
16
- rescue Encoding ::UndefinedConversionError , Encoding ::InvalidByteSequenceError
41
+ # if no clean conversion found, try again but accept replacement characters
42
+ str = self . dup
43
+ COMMON_ENCODINGS . each do |enc |
44
+ str . force_encoding ( enc )
45
+ begin
46
+ utf8 = str . encode ( Encoding ::UTF_8 , invalid : :replace , undef : :replace , replace : UNICODE_REPLACEMENT_CHARACTER )
47
+ return utf8 if utf8 . valid_encoding?
48
+ rescue Encoding ::UndefinedConversionError , Encoding ::InvalidByteSequenceError
49
+ # try next encoding
50
+ end
17
51
end
18
52
19
- # if it failed, force the encoding to ISO-8859-1, transcode the
20
- # string to UTF-8, and use replacement options for any characters
21
- # that might still be problematic
22
- str . force_encoding ( Encoding ::ISO_8859_1 ) . encode ( Encoding ::UTF_8 , invalid : :replace , undef : :replace , replace : '�' )
53
+ # fallback: replace all invalid/undefined bytes
54
+ str . encode ( Encoding ::UTF_8 , invalid : :replace , undef : :replace , replace : UNICODE_REPLACEMENT_CHARACTER )
23
55
end
24
56
25
57
def tidy_bytes!
@@ -43,4 +75,4 @@ def tidy_bytes!
43
75
44
76
# Reopens String to mix in TidyBytes, making its instance methods callable
# on every string.
class String
  include TidyBytes
end
0 commit comments