Skip to content

Commit bc868e6

Browse files
Refactor tidy_bytes.rb
I'm not sure we can easily determine the original encoding of each site (and I don't think the Wayback Machine does that either), *but* we can at least transcode the text to UTF-8 so that it downloads. This should be mostly useful for other, non-Western European languages. See #25
1 parent 2bf04af commit bc868e6

File tree

1 file changed

+31
-63
lines changed

1 file changed

+31
-63
lines changed
Lines changed: 31 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -1,78 +1,46 @@
11
# frozen_string_literal: true
22

3+
# essentially, this is for converting a string with a potentially
4+
# broken or unknown encoding into a valid UTF-8 string
35
# Mixin that converts a string with a potentially broken or unknown encoding
# into a valid UTF-8 string.
module TidyBytes
  # Returns a valid UTF-8 version of the receiver without modifying it.
  #
  # * A receiver that is already valid UTF-8 is returned as-is (same object).
  # * Otherwise the receiver is transcoded from its declared encoding.
  # * If transcoding raises, or yields a string that is still invalid, the raw
  #   bytes are reinterpreted as ISO-8859-1 and transcoded from there.
  #
  # @return [String] a UTF-8 string
  def tidy_bytes
    return self if encoding == Encoding::UTF_8 && valid_encoding?

    begin
      transcoded = encode(Encoding::UTF_8)
      # Transcoding UTF-8 to UTF-8 is a no-op, so a UTF-8-tagged string with
      # invalid bytes comes back unchanged; only accept a valid result.
      return transcoded if transcoded.valid_encoding?
    rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError
      # Fall through to the ISO-8859-1 fallback below.
    end

    # Work on a copy so the receiver keeps its original encoding tag.
    # "\uFFFD" is the Unicode replacement character, used for anything that
    # still cannot be converted.
    dup.force_encoding(Encoding::ISO_8859_1)
       .encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: "\uFFFD")
  end

  # In-place variant of #tidy_bytes: replaces the receiver's contents with the
  # tidied string.
  #
  # @return [String] the receiver
  def tidy_bytes!
    replace(tidy_bytes)
  end

  # Hook run when TidyBytes is included; also mixes InstanceMethods into the
  # including class.
  def self.included(base)
    base.send(:include, InstanceMethods)
  end

  # Thin wrappers that delegate to the TidyBytes implementations above.
  module InstanceMethods
    def tidy_bytes
      TidyBytes.instance_method(:tidy_bytes).bind(self).call
    end

    def tidy_bytes!
      TidyBytes.instance_method(:tidy_bytes!).bind(self).call
    end
  end
end
7543

7644
# Mix TidyBytes into every String so #tidy_bytes and #tidy_bytes! are
# available on all string instances.
String.send(:include, TidyBytes)

0 commit comments

Comments
 (0)