Skip to content

Commit ea1259a

Browse files
Add new stripquoted keyword arg and fix stripwhitespace (#112)
* Add new stripquoted keyword arg and fix stripwhitespace Fixes #109. As noted in that issue, stripping whitespace *within* quoted strings, IMO, should be considered a bug, since one of the primary reasons for quoting strings in various applications is to delineate the exact characters that make up the string. This PR fixes `stripwhitespace` to preserve whitespace encountered within strings, and only strip whitespace for non-quoted strings (leading or trailing) and leading/trailing around quoted fields. On the other hand, there are legitimate use-cases for also stripping whitespace within quoted strings, so we add a new opt-in `stripquoted` keyword argument that allows the additional precision of also stripping whitespace inside quotes. Note that passing `stripquoted=true` implies `stripwhitespace=true`, so it can be considered a "stronger" version of `stripwhitespace`. * Update src/Parsers.jl Co-authored-by: Nick Robinson <npr251@gmail.com> * Update test/runtests.jl Co-authored-by: Nick Robinson <npr251@gmail.com> Co-authored-by: Nick Robinson <npr251@gmail.com>
1 parent 227ffa1 commit ea1259a

File tree

3 files changed

+46
-21
lines changed

3 files changed

+46
-21
lines changed

src/Parsers.jl

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,8 @@ end
5353
* `quoted=false`: whether parsing should check for `openquotechar` and `closequotechar` characters to signal quoted fields
5454
* `comment=nothing`: a string which, if matched at the start of a line, will make parsing consume the rest of the line
5555
* `ignoreemptylines=false`: after parsing a value, if a newline is detected, another immediately following newline will be checked for and consumed
56-
* `stripwhitespace=false`: if true, leading and trailing whitespace is stripped from string fields
56+
* `stripwhitespace=false`: if true, leading and trailing whitespace is stripped from string fields. Note, however, that for *quoted* strings, whitespace is preserved within quotes (but ignored before/after quote characters). To also strip *within* quotes, see `stripquoted`.
57+
* `stripquoted=false`: if true, whitespace is also stripped within quoted strings. If true, `stripwhitespace` is also set to true.
5758
* `debug=false`: if `true`, various debug logging statements will be printed while parsing; useful when diagnosing why parsing returns certain `Parsers.ReturnCode` values
5859
"""
5960
struct Options
@@ -74,6 +75,7 @@ struct Options
7475
dateformat::Union{Nothing, Format}
7576
cmt::Union{Nothing, PtrLen}
7677
stripwhitespace::Bool
78+
stripquoted::Bool
7779
end
7880

7981
prepare(x::Vector{String}) = sort!(map(ptrlen, x), by=x->x[2], rev=true)
@@ -92,7 +94,7 @@ function Options(
9294
trues::Union{Nothing, Vector{String}},
9395
falses::Union{Nothing, Vector{String}},
9496
dateformat::Union{Nothing, String, Dates.DateFormat, Format},
95-
ignorerepeated, ignoreemptylines, comment, quoted, debug, stripwhitespace=false)
97+
ignorerepeated, ignoreemptylines, comment, quoted, debug, stripwhitespace=false, stripquoted=false)
9698
asciival(wh1) && asciival(wh2) || throw(ArgumentError("whitespace characters must be ASCII"))
9799
asciival(oq) && asciival(cq) && asciival(e) || throw(ArgumentError("openquotechar, closequotechar, and escapechar must be ASCII characters"))
98100
(oq == delim) || (cq == delim) || (e == delim) && throw(ArgumentError("delim argument must be different than openquotechar, closequotechar, and escapechar arguments"))
@@ -134,7 +136,7 @@ function Options(
134136
cmt = ptrlen(comment)
135137
end
136138
df = dateformat === nothing ? nothing : dateformat isa String ? Format(dateformat) : dateformat isa Dates.DateFormat ? Format(dateformat) : dateformat
137-
return Options(refs, sent, ignorerepeated, ignoreemptylines, wh1 % UInt8, wh2 % UInt8, quoted, oq % UInt8, cq % UInt8, e % UInt8, del, decimal % UInt8, trues, falses, df, cmt, stripwhitespace)
139+
return Options(refs, sent, ignorerepeated, ignoreemptylines, wh1 % UInt8, wh2 % UInt8, quoted, oq % UInt8, cq % UInt8, e % UInt8, del, decimal % UInt8, trues, falses, df, cmt, stripwhitespace || stripquoted, stripquoted)
138140
end
139141

140142
Options(;
@@ -155,7 +157,8 @@ Options(;
155157
quoted::Bool=false,
156158
debug::Bool=false,
157159
stripwhitespace::Bool=false,
158-
) = Options(sentinel, wh1, wh2, openquotechar, closequotechar, escapechar, delim, decimal, trues, falses, dateformat, ignorerepeated, ignoreemptylines, comment, quoted, debug, stripwhitespace)
160+
stripquoted::Bool=false,
161+
) = Options(sentinel, wh1, wh2, openquotechar, closequotechar, escapechar, delim, decimal, trues, falses, dateformat, ignorerepeated, ignoreemptylines, comment, quoted, debug, stripwhitespace, stripquoted)
159162

160163
const OPTIONS = Options(nothing, UInt8(' '), UInt8('\t'), UInt8('"'), UInt8('"'), UInt8('"'), nothing, UInt8('.'), nothing, nothing, nothing, false, false, nothing, false, false, false)
161164
const XOPTIONS = Options(missing, UInt8(' '), UInt8('\t'), UInt8('"'), UInt8('"'), UInt8('"'), UInt8(','), UInt8('.'), nothing, nothing, nothing, false, false, nothing, true, false, false)
@@ -206,8 +209,8 @@ A [`Parsers.Result`](@ref) struct is returned, with the following fields:
206209
function xparse end
207210

208211
# for testing purposes only, it's much too slow to dynamically create Options for every xparse call
209-
function xparse(::Type{T}, buf::Union{AbstractVector{UInt8}, AbstractString, IO}; pos::Integer=1, len::Integer=buf isa IO ? 0 : sizeof(buf), sentinel=nothing, wh1::Union{UInt8, Char}=UInt8(' '), wh2::Union{UInt8, Char}=UInt8('\t'), quoted::Bool=true, openquotechar::Union{UInt8, Char}=UInt8('"'), closequotechar::Union{UInt8, Char}=UInt8('"'), escapechar::Union{UInt8, Char}=UInt8('"'), ignorerepeated::Bool=false, ignoreemptylines::Bool=false, delim::Union{UInt8, Char, PtrLen, AbstractString, Nothing}=UInt8(','), decimal::Union{UInt8, Char}=UInt8('.'), comment=nothing, trues=nothing, falses=nothing, dateformat::Union{Nothing, String, Dates.DateFormat}=nothing, debug::Bool=false, stripwhitespace::Bool=false) where {T}
210-
options = Options(sentinel, wh1, wh2, openquotechar, closequotechar, escapechar, delim, decimal, trues, falses, dateformat, ignorerepeated, ignoreemptylines, comment, quoted, debug, stripwhitespace)
212+
function xparse(::Type{T}, buf::Union{AbstractVector{UInt8}, AbstractString, IO}; pos::Integer=1, len::Integer=buf isa IO ? 0 : sizeof(buf), sentinel=nothing, wh1::Union{UInt8, Char}=UInt8(' '), wh2::Union{UInt8, Char}=UInt8('\t'), quoted::Bool=true, openquotechar::Union{UInt8, Char}=UInt8('"'), closequotechar::Union{UInt8, Char}=UInt8('"'), escapechar::Union{UInt8, Char}=UInt8('"'), ignorerepeated::Bool=false, ignoreemptylines::Bool=false, delim::Union{UInt8, Char, PtrLen, AbstractString, Nothing}=UInt8(','), decimal::Union{UInt8, Char}=UInt8('.'), comment=nothing, trues=nothing, falses=nothing, dateformat::Union{Nothing, String, Dates.DateFormat}=nothing, debug::Bool=false, stripwhitespace::Bool=false, stripquoted::Bool=false) where {T}
213+
options = Options(sentinel, wh1, wh2, openquotechar, closequotechar, escapechar, delim, decimal, trues, falses, dateformat, ignorerepeated, ignoreemptylines, comment, quoted, debug, stripwhitespace, stripquoted)
211214
return xparse(T, buf, pos, len, options)
212215
end
213216

src/strings.jl

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ function xparse(::Type{T}, source::Union{AbstractVector{UInt8}, IO}, pos, len, o
4343
pos += 1
4444
incr!(source)
4545
vpos = pos
46-
if options.stripwhitespace
46+
if options.stripquoted
4747
vstartpos = pos
4848
end
4949
if eof(source, pos, len)
@@ -97,7 +97,7 @@ function xparse(::Type{T}, source::Union{AbstractVector{UInt8}, IO}, pos, len, o
9797
code |= INVALID_QUOTED_FIELD | EOF
9898
@goto donedone
9999
end
100-
if options.stripwhitespace && b != options.wh1 && b != options.wh2
100+
if options.stripquoted && b != options.wh1 && b != options.wh2
101101
lastnonwhitespacepos = pos
102102
end
103103
b = peekbyte(source, pos)
@@ -282,7 +282,7 @@ function xparse(::Type{T}, source::Union{AbstractVector{UInt8}, IO}, pos, len, o
282282
if eof(source, pos, len)
283283
code |= EOF
284284
end
285-
if options.stripwhitespace
285+
if options.stripquoted || (options.stripwhitespace && !quoted)
286286
vpos = lastnonwhitespacepos
287287
end
288288
poslen = PosLen(vstartpos, vpos - vstartpos, ismissing, escapedstring(code))

test/runtests.jl

Lines changed: 34 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -195,13 +195,13 @@ testcases = [
195195
(str="1\r\n# \r\n\r\n", kwargs=(ignoreemptylines=true, comment="#",), x=1, code=(OK | NEWLINE | EOF), vpos=1, vlen=1, tlen=10),
196196
(str="1,\r\n# \r\n\r\n,", kwargs=(ignorerepeated=true, ignoreemptylines=true, comment="#", delim=UInt8(',')), x=1, code=(OK | NEWLINE | DELIMITED), vpos=1, vlen=1, tlen=12),
197197
(str="1::\r\n# \r\n\r\n::", kwargs=(ignorerepeated=true, ignoreemptylines=true, comment="#", delim="::"), x=1, code=(OK | NEWLINE | DELIMITED), vpos=1, vlen=1, tlen=14),
198-
# stripwhitespace
199-
(str=" 1", kwargs=(stripwhitespace=true,), x=1, code=(OK | EOF), vpos=2, vlen=1, tlen=2),
200-
(str="{ 1}", kwargs=(stripwhitespace=true,), x=1, code=(OK | QUOTED | EOF), vpos=3, vlen=1, tlen=4),
201-
(str="{1 }", kwargs=(stripwhitespace=true,), x=1, code=(OK | QUOTED | EOF), vpos=2, vlen=1, tlen=4),
202-
(str="1 ", kwargs=(stripwhitespace=true,), x=1, code=(OK | EOF), vpos=1, vlen=1, tlen=2),
203-
(str="1 ,", kwargs=(stripwhitespace=true,delim=UInt8(',')), x=1, code=(OK | DELIMITED), vpos=1, vlen=1, tlen=3),
204-
(str="{1 } ,", kwargs=(stripwhitespace=true,delim=UInt8(',')), x=1, code=(OK | DELIMITED | QUOTED), vpos=2, vlen=1, tlen=6),
198+
# stripquoted
199+
(str=" 1", kwargs=(stripquoted=true,), x=1, code=(OK | EOF), vpos=2, vlen=1, tlen=2),
200+
(str="{ 1}", kwargs=(stripquoted=true,), x=1, code=(OK | QUOTED | EOF), vpos=3, vlen=1, tlen=4),
201+
(str="{1 }", kwargs=(stripquoted=true,), x=1, code=(OK | QUOTED | EOF), vpos=2, vlen=1, tlen=4),
202+
(str="1 ", kwargs=(stripquoted=true,), x=1, code=(OK | EOF), vpos=1, vlen=1, tlen=2),
203+
(str="1 ,", kwargs=(stripquoted=true,delim=UInt8(',')), x=1, code=(OK | DELIMITED), vpos=1, vlen=1, tlen=3),
204+
(str="{1 } ,", kwargs=(stripquoted=true,delim=UInt8(',')), x=1, code=(OK | DELIMITED | QUOTED), vpos=2, vlen=1, tlen=6),
205205
];
206206

207207
for useio in (false, true)
@@ -241,22 +241,44 @@ end
241241
res = Parsers.xparse(String, "{hey there}"; openquotechar='{', closequotechar='}', stripwhitespace=true)
242242
@test res.val.pos == 2 && res.val.len == 9
243243
res = Parsers.xparse(String, "{hey there }"; openquotechar='{', closequotechar='}', stripwhitespace=true)
244-
@test res.val.pos == 2 && res.val.len == 9
244+
@test res.val.pos == 2 && res.val.len == 10
245245
res = Parsers.xparse(String, "{hey there },"; openquotechar='{', closequotechar='}', delim=',', stripwhitespace=true)
246-
@test res.val.pos == 2 && res.val.len == 9
246+
@test res.val.pos == 2 && res.val.len == 10
247247
res = Parsers.xparse(String, "{hey there } ,"; openquotechar='{', closequotechar='}', delim=',', stripwhitespace=true)
248-
@test res.val.pos == 2 && res.val.len == 9
248+
@test res.val.pos == 2 && res.val.len == 10
249249
res = Parsers.xparse(String, "{hey there } a,"; openquotechar='{', closequotechar='}', delim=',', stripwhitespace=true)
250-
@test res.val.pos == 2 && res.val.len == 9 && Parsers.invaliddelimiter(res.code)
250+
@test res.val.pos == 2 && res.val.len == 10 && Parsers.invaliddelimiter(res.code)
251251
res = Parsers.xparse(String, "{hey there } a "; openquotechar='{', closequotechar='}', delim=nothing, stripwhitespace=true)
252-
@test res.val.pos == 2 && res.val.len == 9 && res.tlen == 13
252+
@test res.val.pos == 2 && res.val.len == 10 && res.tlen == 13
253253
res = Parsers.xparse(String, "hey there ,"; delim=',', stripwhitespace=true)
254254
@test res.val.pos == 1 && res.val.len == 9
255255
res = Parsers.xparse(String, " hey there "; stripwhitespace=true)
256256
@test res.val.pos == 2 && res.val.len == 9
257257
res = Parsers.xparse(String, " hey there "; delim=nothing, stripwhitespace=true)
258258
@test res.val.pos == 2 && res.val.len == 9
259259

260+
res = Parsers.xparse(String, "{hey there}"; openquotechar='{', closequotechar='}', stripquoted=true)
261+
@test res.val.pos == 2 && res.val.len == 9
262+
res = Parsers.xparse(String, "{hey there }"; openquotechar='{', closequotechar='}', stripquoted=true)
263+
@test res.val.pos == 2 && res.val.len == 9
264+
res = Parsers.xparse(String, "{hey there },"; openquotechar='{', closequotechar='}', delim=',', stripquoted=true)
265+
@test res.val.pos == 2 && res.val.len == 9
266+
res = Parsers.xparse(String, "{hey there } ,"; openquotechar='{', closequotechar='}', delim=',', stripquoted=true)
267+
@test res.val.pos == 2 && res.val.len == 9
268+
res = Parsers.xparse(String, "{hey there } a,"; openquotechar='{', closequotechar='}', delim=',', stripquoted=true)
269+
@test res.val.pos == 2 && res.val.len == 9 && Parsers.invaliddelimiter(res.code)
270+
res = Parsers.xparse(String, "{hey there } a "; openquotechar='{', closequotechar='}', delim=nothing, stripquoted=true)
271+
@test res.val.pos == 2 && res.val.len == 9 && res.tlen == 13
272+
res = Parsers.xparse(String, "hey there ,"; delim=',', stripquoted=true)
273+
@test res.val.pos == 1 && res.val.len == 9
274+
res = Parsers.xparse(String, " hey there "; stripquoted=true)
275+
@test res.val.pos == 2 && res.val.len == 9
276+
res = Parsers.xparse(String, " hey there "; delim=nothing, stripquoted=true)
277+
@test res.val.pos == 2 && res.val.len == 9
278+
# `stripquoted=true` should always override `stripwhitespace` to `true`
279+
res = Parsers.xparse(String, " hey there "; delim=nothing, stripquoted=true, stripwhitespace=false)
280+
@test res.val.pos == 2 && res.val.len == 9
281+
260282
end # @testset "Core Parsers.xparse"
261283

262284
@testset "ints" begin

0 commit comments

Comments
 (0)