Skip to content

Commit 0eb5f46

Browse files
When delim=groupmark=x, treat x as delim unless input is quoted (#182)
* When delim=groupmark=`x`, treat `x` as delim * Move groupmark checker to helper function * Adapt tests for new `groupmark` handling * Simplify tests * More test cases (based on old test cases) * Bump version --------- Co-authored-by: Drvi <tomas.drvostep@gmail.com>
1 parent d1c6fc5 commit 0eb5f46

File tree

6 files changed

+231
-54
lines changed

6 files changed

+231
-54
lines changed

Project.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
name = "Parsers"
22
uuid = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0"
33
authors = ["quinnj <quinn.jacobd@gmail.com>"]
4-
version = "2.7.2"
4+
version = "2.8.0"
55

66
[deps]
77
Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"

src/Parsers.jl

Lines changed: 54 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -111,7 +111,7 @@ end
111111
* `ignoreemptylines=false`: after parsing a value, if a newline is detected, another immediately following newline will be checked for and consumed
112112
* `stripwhitespace=nothing`: if true, leading and trailing whitespace is stripped from string fields, note that for *quoted* strings however, whitespace is preserved within quotes (but ignored before/after quote characters). To also strip *within* quotes, see `stripquoted`
113113
* `stripquoted=false`: if true, whitespace is also stripped within quoted strings. If true, `stripwhitespace` is also set to true.
114-
* `groupmark=nothing`: optionally specify a single-byte character denoting the number grouping mark, this allows parsing of numbers that have, e.g., thousand separators (`1,000.00`).
114+
* `groupmark=nothing`: optionally specify a single-byte character denoting the number grouping mark; this allows parsing of numbers that have, e.g., thousand separators (`1,000.00`). When `groupmark` is the same character as `delim`, that character is treated as a delimiter unless the value is quoted, so numbers containing group marks must be quoted.
115115
* `rounding=RoundNearest`: optionally specify a rounding mode to use when parsing. No rounding means the result will be marked with `INEXACT` code if the value is not exactly representable in the target type.
116116
"""
117117
struct Options
@@ -141,12 +141,49 @@ function Base.getproperty(x::Options, nm::Symbol)
141141
end
142142
end
143143

144-
const OPTIONS = Options(Flags(false, false, false, false, false, false, false, false, false), UInt8('.'),
145-
Token(UInt8('"')), Token(UInt8('"')), UInt8('"'), Token[], Token(""), Token(""),
146-
nothing, nothing, nothing, nothing, nothing)
147-
const XOPTIONS = Options(Flags(false, false, false, false, true, true, true, false, false), UInt8('.'),
148-
Token(UInt8('"')), Token(UInt8('"')), UInt8('"'), Token[], Token(UInt8(',')), Token(""),
149-
nothing, nothing, nothing, nothing, nothing)
144+
# Get the default options for single-value parsing (i.e. not delimited), used
145+
# by Parsers.parse and Parsers.tryparse via Parsers.xparse2
146+
# Build an `Options` value from keyword arguments, forwarding them positionally
# to the `Options` constructor in the order it expects.
#
# Every keyword has a default, so calling this with no arguments yields the
# baseline configuration; passing a keyword (e.g. `groupmark=UInt8(',')`)
# overrides just that one field. The `delim` default here is the empty token.
function _get_default_options(;
    flags::Flags = Flags(false, false, false, false, false, false, false, false, false),
    decimal::UInt8 = UInt8('.'),
    oq::Token = Token(UInt8('"')),
    cq::Token = Token(UInt8('"')),
    e::UInt8 = UInt8('"'),
    sentinel::Vector{Token} = Token[],
    delim::Token = Token(""),
    cmt::Token = Token(""),
    trues::Union{Nothing, Vector{String}} = nothing,
    falses::Union{Nothing, Vector{String}} = nothing,
    dateformat::Union{Nothing, Format} = nothing,
    groupmark::Union{Nothing, UInt8} = nothing,
    rounding::Union{Nothing, RoundingMode} = nothing,
)
    return Options(
        flags,
        decimal,
        oq,
        cq,
        e,
        sentinel,
        delim,
        cmt,
        trues,
        falses,
        dateformat,
        groupmark,
        rounding,
    )
end
163+
164+
# Get the default options for delimited parsing, used by Parsers.xparse
165+
# Build an `Options` value from keyword arguments, forwarding them positionally
# to the `Options` constructor in the order it expects.
#
# Every keyword has a default, so calling this with no arguments yields the
# baseline configuration; passing a keyword (e.g. `groupmark=UInt8(',')`)
# overrides just that one field. The `delim` default here is a comma token.
function _get_default_xoptions(;
    flags::Flags = Flags(false, false, false, false, true, true, true, false, false),
    decimal::UInt8 = UInt8('.'),
    oq::Token = Token(UInt8('"')),
    cq::Token = Token(UInt8('"')),
    e::UInt8 = UInt8('"'),
    sentinel::Vector{Token} = Token[],
    delim::Token = Token(UInt8(',')),
    cmt::Token = Token(""),
    trues::Union{Nothing, Vector{String}} = nothing,
    falses::Union{Nothing, Vector{String}} = nothing,
    dateformat::Union{Nothing, Format} = nothing,
    groupmark::Union{Nothing, UInt8} = nothing,
    rounding::Union{Nothing, RoundingMode} = nothing,
)
    return Options(
        flags,
        decimal,
        oq,
        cq,
        e,
        sentinel,
        delim,
        cmt,
        trues,
        falses,
        dateformat,
        groupmark,
        rounding,
    )
end
182+
183+
# Module-level constant holding the result of `_get_default_options()` called
# with no keyword overrides.
# What is used by default in Parsers.parse, Parsers.tryparse, Parsers.xparse2
184+
const OPTIONS = _get_default_options()
185+
# Module-level constant holding the result of `_get_default_xoptions()` called
# with no keyword overrides.
# What is used by default in Parsers.xparse
186+
const XOPTIONS = _get_default_xoptions()
150187

151188
# Sort `x` in place by `sizeof` of each element, largest first, and return `x`.
function prepare!(x::Vector)
    return sort!(x; by=sizeof, rev=true)
end
152189
# Report whether the character `c` is an ASCII character.
function asciival(c::Char)
    return isascii(c)
end
@@ -446,6 +483,16 @@ function checkdelim!(source::AbstractVector{UInt8}, pos, len, options::Options)
446483
return pos
447484
end
448485

486+
# Decide whether group marks should be recognized while parsing the current value.
#
# Returns `false` when no group mark is configured. Otherwise returns `true` if
# the value is quoted (the `QUOTED` bit is set in `code`) or if the group mark
# differs from the delimiter. When the two are equal and the value is unquoted,
# returns `false`, so the character is left to be handled as a delimiter.
# NOTE(review): relies on `!=` between the `UInt8` group mark and `opts.delim`
# being defined elsewhere in the package — not visible from here.
@inline function _has_groupmark(opts::Options, code::ReturnCode)
    mark = opts.groupmark
    mark === nothing && return false
    return (code & QUOTED) != 0 || mark != opts.delim
end
495+
449496
include("ints.jl")
450497
include("floats.jl")
451498
include("strings.jl")

src/floats.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -233,7 +233,7 @@ rettype(::Type{T}) where {T} = T === Number ? Nothing : T
233233
@inline function parsedigits(conf::AbstractConf{T}, source, pos, len, b, code, options, digits::IntType, neg::Bool, startpos, overflow_invalid::Bool=false, ndigits::Int=0, f::F=nothing) where {T, IntType, F}
234234
x = zero(T)
235235
anydigits = false
236-
has_groupmark = options.groupmark !== nothing
236+
has_groupmark = _has_groupmark(options, code)
237237
groupmark0 = something(options.groupmark, 0xff) - UInt8('0')
238238

239239
# we already previously checked if `b` was decimal or a digit, so don't need to check explicitly again

src/ints.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ overflowval(::Type{T}) where {T <: Integer} = div(typemax(T) - T(9), T(10))
77
@inline function typeparser(::AbstractConf{T}, source, pos, len, b, code, pl, opts) where {T <: Integer}
88
x = zero(T)
99
neg = false
10-
has_groupmark = opts.groupmark !== nothing
10+
has_groupmark = _has_groupmark(opts, code)
1111
groupmark0 = something(opts.groupmark, 0xff) - UInt8('0')
1212
# start actual int parsing
1313
neg = b == UInt8('-')

test/floats.jl

Lines changed: 81 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -369,48 +369,114 @@ end
369369
@test Parsers.tryparse(Float64, "0e+") === nothing
370370

371371
@testset "groupmark" begin
372-
@test Parsers.xparse(Float64, "100,000,000.99"; groupmark=',').val == 100_000_000.99
373-
@test Parsers.xparse(Float64, "100,000,000"; groupmark=',').val == 100_000_000.0
374-
@test Parsers.xparse(Float64, "1,0,0,0,0,0,0,0,0.99"; groupmark=',').val == 100_000_000.99
372+
# `parse` is used for parsing inputs with a single value in them,
373+
# so when delims==groupmarks, we assume what we see are groupmarks
374+
# Single-value parsing: there is no delimiter to clash with, so every occurrence
# of the group mark inside the number is consumed as a group mark.
@testset "Parsers.parse" begin
    # Build single-value parsing options whose group mark is the byte for `c`.
    groupmark(c::Char) = Parsers._get_default_options(groupmark=UInt8(c))
    @testset "$T" for T in (Float32, Float64)
        # `≈` rather than `==`: the expected literals are Float64/Int, and the
        # Float32 results cannot represent 100_000_000.99 exactly.
        # comma
        @test Parsers.parse(T, "1,0,0,0,0,0,0,0,099e-2", groupmark(',')) ≈ 100_000_000.99
        @test Parsers.parse(T, "100,000,00099e-2", groupmark(',')) ≈ 100_000_000.99
        @test Parsers.parse(T, "100,000,000.99", groupmark(',')) ≈ 100_000_000.99
        @test Parsers.parse(T, "100,000,000", groupmark(',')) ≈ 100_000_000
        # space
        @test Parsers.parse(T, "1 0 0 0 0 0 0 0 099e-2", groupmark(' ')) ≈ 100_000_000.99
        @test Parsers.parse(T, "100 000 00099e-2", groupmark(' ')) ≈ 100_000_000.99
        @test Parsers.parse(T, "100 000 000.99", groupmark(' ')) ≈ 100_000_000.99
        @test Parsers.parse(T, "100 000 000", groupmark(' ')) ≈ 100_000_000
    end
end
389+
@test Parsers.xparse(Float64, "100_000_000.99"; groupmark='_').val == 100_000_000.99
390+
@test Parsers.xparse(Float64, "100_000_000"; groupmark='_').val == 100_000_000.0
391+
@test Parsers.xparse(Float64, "1_0_0_0_0_0_0_0_0.99"; groupmark='_').val == 100_000_000.99
392+
375393
@test Parsers.xparse(Float64, "1 0 0 0 0 0 0 0 0.99"; groupmark=' ').val == 100_000_000.99
376394
@test Parsers.xparse(Float64, "100000000.99"; groupmark=',').val == 100_000_000.99
377395
@test Parsers.xparse(Float64, "100000000.99,aaa"; groupmark=',') == Parsers.Result{Float64}(OK | DELIMITED, 13, 1.0000000099e8)
378396
@test Parsers.xparse(Float64, "\"100,000,000.99\",100"; groupmark=',', openquotechar='"', closequotechar='"') == Parsers.Result{Float64}(Int16(13), 17, 1.0000000099e8)
379-
@test Parsers.xparse(Float64, "100,000,000.99,100"; groupmark=',', openquotechar='"', closequotechar='"') == Parsers.Result{Float64}(Int16(9), 15, 1.0000000099e8)
397+
@test Parsers.xparse(Float64, "100,000,000.99,100"; groupmark=',', openquotechar='"', closequotechar='"') == Parsers.Result{Float64}(Int16(9), 4, 100.0)
398+
@test Parsers.xparse(Float64, "100_000_000.99,100"; groupmark='_', openquotechar='"', closequotechar='"') == Parsers.Result{Float64}(Int16(9), 15, 1.0000000099e8)
380399
@test Parsers.xparse(Float64, "\"100,000,000\",100"; groupmark=',', openquotechar='"', closequotechar='"') == Parsers.Result{Float64}(Int16(13), 14, 1.0e8)
381400
res = Parsers.xparse(Float64, "100,000,000,aaa"; groupmark=',')
382-
@test res.code == EOF | INVALID | INVALID_DELIMITER
383-
@test res.tlen == 15
401+
@test res.code == OK | DELIMITED
402+
@test res.tlen == 4
403+
res = Parsers.xparse(Float64, "100_000_000,aaa"; groupmark='_')
404+
@test res.code == OK | DELIMITED
405+
@test res.tlen == 12
384406

385-
@test Parsers.xparse(Float32, "100,000,000.99"; groupmark=',').val ≈ 100_000_000.99
386-
@test Parsers.xparse(Float32, "100,000,000"; groupmark=',').val ≈ 100_000_000.0
387-
@test Parsers.xparse(Float32, "1,0,0,0,0,0,0,0,0.99"; groupmark=',').val ≈ 100_000_000.99
407+
@test Parsers.xparse(Float32, "100_000_000.99"; groupmark='_').val ≈ 100_000_000.99
408+
@test Parsers.xparse(Float32, "100_000_000"; groupmark='_').val ≈ 100_000_000.0
409+
@test Parsers.xparse(Float32, "1_0_0_0_0_0_0_0_0.99"; groupmark='_').val ≈ 100_000_000.99
388410
@test Parsers.xparse(Float32, "1 0 0 0 0 0 0 0 0.99"; groupmark=' ').val ≈ 100_000_000.99
389411
@test Parsers.xparse(Float32, "100000000.99"; groupmark=',').val ≈ 100_000_000.99
390412
res = Parsers.xparse(Float32, "100000000.99,aaa"; groupmark=',')
391413
@test res.code == OK | DELIMITED
392414
@test res.tlen == 13
393415
@test res.val ≈ 100_000_000.99
394416
res = Parsers.xparse(Float32, "100,000,000,aaa"; groupmark=',')
395-
@test res.code == EOF | INVALID | INVALID_DELIMITER
396-
@test res.tlen == 15
417+
@test res.code == OK | DELIMITED
418+
@test res.tlen == 4
419+
res = Parsers.xparse(Float32, "100_000_000,aaa"; groupmark='_')
420+
@test res.code == OK | DELIMITED
421+
@test res.tlen == 12
422+
423+
@test Parsers.xparse(Float64, "100,000,00099e-2"; groupmark=',').val == 100.0
424+
@test Parsers.xparse(Float64, "100_000_00099e-2"; groupmark='_').val == 100_000_000.99
425+
@test Parsers.xparse(Float64, "1,0,0,0,0,0,0,0,099e-2"; groupmark=',').val == 1.0
426+
@test Parsers.xparse(Float64, "1_0_0_0_0_0_0_0_099e-2"; groupmark='_').val == 100_000_000.99
397427

398-
@test Parsers.xparse(Float64, "100,000,00099e-2"; groupmark=',').val == 100_000_000.99
399-
@test Parsers.xparse(Float64, "1,0,0,0,0,0,0,0,099e-2"; groupmark=',').val == 100_000_000.99
400428
@test Parsers.xparse(Float64, "1 0 0 0 0 0 0 0 099e-2"; groupmark=' ').val == 100_000_000.99
401429
@test Parsers.xparse(Float64, "10000000099e-2"; groupmark=',').val == 100_000_000.99
402430
@test Parsers.xparse(Float64, "10000000099e-2,aaa"; groupmark=',') == Parsers.Result{Float64}(OK | DELIMITED, 15, 1.0000000099e8)
403431
@test Parsers.xparse(Float64, "\"10000000099e-2\",100"; groupmark=',', openquotechar='"', closequotechar='"') == Parsers.Result{Float64}(Int16(13), 17, 1.0000000099e8)
404432
@test Parsers.xparse(Float64, "10000000099e-2,100"; groupmark=',', openquotechar='"', closequotechar='"') == Parsers.Result{Float64}(Int16(9), 15, 1.0000000099e8)
405433

406-
@test Parsers.xparse(Float32, "100,000,00099e-2"; groupmark=',').val ≈ 100_000_000.99
407-
@test Parsers.xparse(Float32, "1,0,0,0,0,0,0,0,099e-2"; groupmark=',').val ≈ 100_000_000.99
434+
@test Parsers.xparse(Float32, "100,000,00099e-2"; groupmark=',').val ≈ 100.0
435+
@test Parsers.xparse(Float32, "100_000_00099e-2"; groupmark='_').val ≈ 100_000_000.99
436+
@test Parsers.xparse(Float32, "1,0,0,0,0,0,0,0,099e-2"; groupmark=',').val ≈ 1.0
437+
@test Parsers.xparse(Float32, "1_0_0_0_0_0_0_0_099e-2"; groupmark='_').val ≈ 100_000_000.99
438+
408439
@test Parsers.xparse(Float32, "1 0 0 0 0 0 0 0 099e-2"; groupmark=' ').val ≈ 100_000_000.99
409440
@test Parsers.xparse(Float32, "10000000099e-2"; groupmark=',').val ≈ 100_000_000.99
410441
res = Parsers.xparse(Float32, "10000000099e-2,aaa"; groupmark=',')
411442
@test res.code == OK | DELIMITED
412443
@test res.tlen == 15
413444
@test res.val ≈ 100_000_000.99
445+
446+
# Delimited parsing of comma-separated rows, with the group mark either equal to
# the delimiter (',') or different from it (' '). Group marks only ever appear
# inside quoted fields, so every row must yield the same four values either way.
@testset "$T groupmark=$(repr(g))" for g in (',',' '), T in (Float32, Float64)
    # Build delimited-parsing options whose group mark is the byte for `c`.
    xgroupmark(c::Char) = Parsers._get_default_xoptions(groupmark=UInt8(c))
    # Every input row below encodes these four fields, in this order.
    expected_vals = (1000.0, 0.0, 2000.0, 3000.0)
    # Groupmark tests for floats
    inputs = [
        "1000,0000,2000,3000",
        "\"1000\",\"0000\",\"2000\",\"3000\"",
        "\"1$(g)0$(g)0$(g)0\",0000,\"2$(g)0$(g)0$(g)0\",3000",
        "1000,\"0$(g)0$(g)0$(g)0\",2000,\"3$(g)0$(g)0$(g)0\"",
        "1000.00,0000.00,2000.00,3000.00",
        "\"1000.00\",\"0000.00\",\"2000.00\",\"3000.00\"",
        "\"1$(g)0$(g)0$(g)0.00\",0000.00,\"2$(g)0$(g)0$(g)0.00\",3000.00",
        "1000,\"0$(g)0$(g)0$(g)0.00\",2000.00,\"3$(g)0$(g)0$(g)0.00\"",
        "1000.00e0,0000.00e0,2000.00e0,3000.00e0",
        "\"1000.00e0\",\"0000.00e0\",\"2000.00e0\",\"3000.00e0\"",
        "\"1$(g)0$(g)0$(g)0.00e0\",0000.00e0,\"2$(g)0$(g)0$(g)0.00e0\",3000.00e0",
        "1000,\"0$(g)0$(g)0$(g)0.00e0\",2000.00e0,\"3$(g)0$(g)0$(g)0.00e0\"",
        "1000e0,0000e0,2000e0,3000e0",
        "\"1000e0\",\"0000e0\",\"2000e0\",\"3000e0\"",
        "\"1$(g)0$(g)0$(g)0e0\",0000e0,\"2$(g)0$(g)0$(g)0e0\",3000e0",
        "1000,\"0$(g)0$(g)0$(g)0e0\",2000e0,\"3$(g)0$(g)0$(g)0e0\"",
    ]
    for input in inputs
        pos = 1
        # `len` is a byte position, so measure the input in code units rather
        # than characters (`length` only coincides for all-ASCII input).
        len = ncodeunits(input)
        # Declared here so the last result is still visible after the inner loop.
        local res
        for expected in expected_vals
            res = Parsers.xparse(T, input, pos, len, xgroupmark(g))
            @test res.val == expected
            @test Parsers.ok(res.code)
            # Advance past the field just parsed, including its delimiter.
            pos += res.tlen
        end
        # The final field must parse cleanly and consume the rest of the input.
        @test Parsers.ok(res.code)
        @test Parsers.eof(res.code)
    end
end
414480
end
415481

416482
@testset "BigFloats" begin

0 commit comments

Comments
 (0)