# frozen_string_literal: true

# TidyBytes converts a string whose encoding is broken or unknown into a
# valid UTF-8 string. Bytes that cannot be interpreted in the string's
# declared encoding are read as ISO-8859-1 (Latin-1), where every byte value
# maps to exactly one character.
module TidyBytes
  # Returns a valid UTF-8 version of the receiver without modifying it.
  #
  # * Already valid UTF-8: the receiver itself is returned (no copy).
  # * Tagged UTF-8 but containing invalid bytes: only the invalid bytes are
  #   re-read as ISO-8859-1; valid characters are kept untouched.
  # * Any other encoding: the string is transcoded to UTF-8. If that raises,
  #   the raw bytes are reinterpreted as ISO-8859-1 and transcoded instead.
  #
  # @return [String] a UTF-8 encoded string
  def tidy_bytes
    utf8 = Encoding::UTF_8
    latin1 = Encoding::ISO_8859_1

    if encoding == utf8
      return self if valid_encoding?

      # String#encode is a no-op when source and target encodings match, so
      # it would hand the invalid bytes back unchanged. Repair just the
      # invalid byte runs and leave well-formed characters alone.
      return scrub { |bad_bytes| bad_bytes.dup.force_encoding(latin1).encode(utf8) }
    end

    begin
      # encode returns a new string, so the receiver is left untouched.
      encode(utf8)
    rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError
      # Every byte value is a defined ISO-8859-1 character, so this
      # conversion cannot fail and needs no replacement options. Work on a
      # copy because force_encoding mutates its receiver.
      dup.force_encoding(latin1).encode(utf8)
    end
  end

  # In-place variant of #tidy_bytes: replaces the receiver's content (and
  # encoding) with the tidied result.
  #
  # @return [String] the receiver
  def tidy_bytes!
    replace(tidy_bytes)
  end

  # Hook run when TidyBytes is included into +base+; additionally mixes
  # InstanceMethods into the including class.
  def self.included(base)
    base.send(:include, InstanceMethods)
  end

  # Thin wrappers that forward to the TidyBytes implementations above.
  module InstanceMethods
    # Delegates to TidyBytes#tidy_bytes bound to the receiver.
    def tidy_bytes
      TidyBytes.instance_method(:tidy_bytes).bind(self).call
    end

    # Delegates to TidyBytes#tidy_bytes! bound to the receiver.
    def tidy_bytes!
      TidyBytes.instance_method(:tidy_bytes!).bind(self).call
    end
  end
end

# Make #tidy_bytes and #tidy_bytes! available on every String.
class String
  include TidyBytes
end