Skip to content

Commit fd01f68

Browse files
authored
WIP: Fix IterableTables integration, support both missing and DataValue (#204)
- add `missingtype = Missing` keyword argument where needed - add `convertmissing` function
1 parent 0d7493b commit fd01f68

14 files changed

+241
-94
lines changed

NEWS.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,3 +18,8 @@
1818

1919
- **(breaking)** Switch from DataValues to Missing. Related: `dropna` has been changed to `dropmissing`.
2020
- **(breaking)** Depend on OnlineStatsBase rather than OnlineStats.
21+
22+
## v0.10.0
23+
24+
- **(breaking)** Support for both DataValues and Missing (default). When `join` generates missing values, use the keyword argument `missingtype` to set the type (`Missing` or `DataValue`).
25+
- Use `IndexedTables.convertmissing(tbl, T)` to convert the missing values in `tbl` to be of type `T`.

REQUIRE

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,8 @@ julia 0.7
22
OnlineStatsBase 0.9.1
33
PooledArrays 0.4.1
44
WeakRefStrings 0.4.4
5-
TableTraits 0.3.0
6-
TableTraitsUtils 0.2.0
75
IteratorInterfaceExtensions 0.1.0
6+
TableTraits
87
Tables
98
StructArrays 0.2.0
9+
DataValues

src/IndexedTables.jl

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,15 @@
11
module IndexedTables
22

3-
using PooledArrays, SparseArrays, Statistics, WeakRefStrings, TableTraits,
4-
TableTraitsUtils, IteratorInterfaceExtensions
3+
using PooledArrays, SparseArrays, Statistics, WeakRefStrings
54

65
using OnlineStatsBase: OnlineStat, fit!
7-
import Tables
6+
using StructArrays: StructVector, StructArray, foreachfield, fieldarrays,
7+
collect_structarray, staticschema, ArrayInitializer, refine_perm!, collect_structarray,
8+
collect_empty_structarray, grow_to_structarray!, collect_to_structarray!
9+
10+
import Tables, TableTraits, IteratorInterfaceExtensions
11+
12+
import DataValues: DataValue, DataValueArray, isna
813

914
import Base:
1015
show, eltype, length, getindex, setindex!, ndims, map, convert, keys, values,
@@ -14,9 +19,6 @@ import Base:
1419
tuple_type_cons, tuple_type_head, tuple_type_tail, in, convert
1520

1621

17-
using StructArrays: StructVector, StructArray, foreachfield, fieldarrays, collect_structarray, staticschema, ArrayInitializer,
18-
refine_perm!, collect_structarray, collect_empty_structarray, grow_to_structarray!, collect_to_structarray!
19-
2022
#-----------------------------------------------------------------------# exports
2123
export
2224
# macros
@@ -77,9 +79,7 @@ include("reduce.jl")
7779
include("flatten.jl")
7880
include("join.jl")
7981
include("reshape.jl")
80-
81-
# TableTraits/Tables integration
82-
include("tabletraits.jl")
8382
include("tables.jl")
83+
include("tabletraits.jl")
8484

8585
end # module

src/collect.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
const default_initializer = ArrayInitializer(t -> t<:Union{Tuple, NamedTuple, Pair})
1+
const default_initializer = ArrayInitializer(t -> t<:Union{Tuple, NamedTuple, Pair}, (T, sz) -> similar(arrayof(T), sz))
22

33
"""
44
collect_columns(itr)

src/columns.jl

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -423,6 +423,9 @@ function ColDict(t; copy=nothing)
423423
ColDict(Int[], t, convert(Array{Any}, collect(cnames)), Any[columns(t)...], copy)
424424
end
425425

426+
Base.keys(d::ColDict) = d.names
427+
Base.values(d::ColDict) = d.columns
428+
426429
function Base.getindex(d::ColDict{<:Columns})
427430
Columns(Tuple(d.columns); names=d.names)
428431
end

src/indexedtable.jl

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -131,12 +131,9 @@ _impl(impl::Val) = impl
131131
_impl(impl::Val, x::AbstractArray, z...) = _impl(impl, z...)
132132
_impl(x::AbstractArray...) = _impl(Val{:serial}(), x...)
133133

134+
# table methods go through here
134135
function table(cs::Tup; chunks=nothing, kwargs...)
135-
if chunks !== nothing
136-
impl = Val{:distributed}()
137-
else
138-
impl = _impl(astuple(cs)...)
139-
end
136+
impl = chunks !== nothing ? Val(:distributed) : _impl(astuple(cs)...)
140137
table(impl, cs; chunks=chunks, kwargs...)
141138
end
142139

src/join.jl

Lines changed: 57 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ end
5050
end
5151

5252
function _join!(::Val{typ}, ::Val{grp}, ::Val{keepkeys}, f, I, data, ks, lout, rout,
53-
lnull, rnull, lkey, rkey, ldata, rdata, lperm, rperm, init_group, accumulate) where {typ, grp, keepkeys}
53+
lnull, rnull, lkey, rkey, ldata, rdata, lperm, rperm, init_group, accumulate, missingtype) where {typ, grp, keepkeys}
5454

5555
ll, rr = length(lkey), length(rkey)
5656

@@ -102,7 +102,8 @@ function _join!(::Val{typ}, ::Val{grp}, ::Val{keepkeys}, f, I, data, ks, lout, r
102102
# optimized push! method for concat_tup
103103
_push!(Val{:both}(), f, data,
104104
lout, rout, ldata, rdata,
105-
lperm[x], rperm[y], missing, missing)
105+
lperm[x], rperm[y],
106+
missing_instance(missingtype), missing_instance(missingtype))
106107
end
107108
end
108109
else
@@ -167,11 +168,36 @@ function _join!(::Val{typ}, ::Val{grp}, ::Val{keepkeys}, f, I, data, ks, lout, r
167168
lnull_idx, rnull_idx
168169
end
169170

170-
nullrow(t::Type{<:Tuple}) = Tuple(map(x->missing, fieldtypes(t)))
171-
nullrow(t::Type{<:NamedTuple}) = t(Tuple(map(x->missing, fieldtypes(t))))
172-
nullrow(t) = missing
173171

174-
function init_join_output(typ, grp, f, ldata, rdata, left, keepkeys, lkey, rkey, init_group, accumulate)
172+
# Missing
173+
nullrow(t::Type{<:Tuple}, ::Type{Missing}) = Tuple(map(x->missing, fieldtypes(t)))
174+
nullrow(t::Type{<:NamedTuple}, ::Type{Missing}) = t(Tuple(map(x->missing, fieldtypes(t))))
175+
nullrow(t, ::Type{Missing}) = missing
176+
function outvec(col, idxs, ::Type{Missing})
177+
v = convert(Vector{Union{Missing, eltype(col)}}, col)
178+
v[idxs] .= missing
179+
v
180+
end
181+
182+
183+
# DataValue
184+
nullrow(::Type{T}, ::Type{DataValue}) where {T <: Tuple} = Tuple(fieldtype(T, i)() for i = 1:fieldcount(T))
185+
function nullrow(::Type{NamedTuple{names, T}}, ::Type{DataValue}) where {names, T}
186+
NamedTuple{names, T}(Tuple(fieldtype(T, i)() for i = 1:fieldcount(T)))
187+
end
188+
nullrow(t, ::Type{DataValue}) = DataValue()
189+
# Null row for a single `DataValue{...}` element type: build the typed NA value `T()`.
# Dispatches on the *type* (callers pass element types, not instances); the previous
# signature matched a `DataValue` instance and then tried to call it.
nullrow(::Type{T}, ::Type{DataValue}) where {T <: DataValue} = T()
190+
"""
    outvec(col, idxs, ::Type{DataValue})

Return `col` as a `DataValueArray` whose entries at `idxs` are flagged as NA.

If `col` is already a `DataValueArray`, its `isna` mask is updated in place and `col`
itself is returned. Otherwise a new `DataValueArray` is built from `col` and a fresh
mask that is `true` exactly at `idxs`.
"""
function outvec(col, idxs, ::Type{DataValue})
    if col isa DataValueArray
        col.isna[idxs] .= true
        # Return the column explicitly: the value of the `.=` expression above is the
        # broadcast destination (a view into `col.isna`), not the column.
        return col
    end
    nulls = zeros(Bool, length(col))
    nulls[idxs] .= true
    return DataValueArray(col, nulls)
end
199+
200+
function init_join_output(typ, grp, f, ldata, rdata, left, keepkeys, lkey, rkey, init_group, accumulate, missingtype)
175201
lnull = nothing
176202
rnull = nothing
177203
loutput = nothing
@@ -181,16 +207,16 @@ function init_join_output(typ, grp, f, ldata, rdata, left, keepkeys, lkey, rkey,
181207

182208
left_type = eltype(ldata)
183209
if !isa(typ, Union{Val{:left}, Val{:inner}, Val{:anti}})
184-
null_left_type = map_params(x -> Union{Missing, x}, eltype(ldata))
185-
lnull = nullrow(null_left_type)
210+
null_left_type = map_params(x -> type2missingtype(x, missingtype), eltype(ldata))
211+
lnull = nullrow(null_left_type, missingtype)
186212
else
187213
null_left_type = left_type
188214
end
189215

190216
right_type = eltype(rdata)
191217
if !isa(typ, Val{:inner})
192-
null_right_type = map_params(x->Union{Missing, x}, eltype(rdata))
193-
rnull = nullrow(null_right_type)
218+
null_right_type = map_params(x -> type2missingtype(x, missingtype), eltype(rdata))
219+
rnull = nullrow(null_right_type, missingtype)
194220
else
195221
null_right_type = right_type
196222
end
@@ -253,11 +279,19 @@ with each `right` row, resulting in `n_occurrences_left * n_occurrences_right` o
253279
254280
# Options (keyword arguments)
255281
256-
- `how = :inner` -- join method to use. Described below.
257-
- `lkey = pkeys(left)` -- fields from `left` to match on (see [`pkeys`](@ref))
258-
- `rkey = pkeys(right)` -- fields from `right` to match on
259-
- `lselect = Not(lkey)` -- output columns from `left` (see [`Not`](@ref))
260-
- `rselect = Not(rkey)` -- output columns from `right`
282+
- `how = :inner`
283+
- Join method to use. Described below.
284+
- `lkey = pkeys(left)`
285+
- Fields from `left` to match on (see [`pkeys`](@ref)).
286+
- `rkey = pkeys(right)`
287+
- Fields from `right` to match on.
288+
- `lselect = Not(lkey)`
289+
- Output columns from `left` (see [`Not`](@ref)).
290+
- `rselect = Not(rkey)`
291+
- Output columns from `right`.
292+
- `missingtype = Missing`
293+
- Type of missing values that can be created through `:left` and `:outer` joins.
294+
- Other supported option is `DataValue`.
261295
262296
## Join methods (`how = :inner`)
263297
@@ -289,7 +323,8 @@ function Base.join(f, left::Dataset, right::Dataset;
289323
keepkeys=false, # defaults to keeping the keys for only the joined columns
290324
init_group=nothing,
291325
accumulate=nothing,
292-
cache=true)
326+
cache=true,
327+
missingtype=Missing)
293328

294329
if !(how in [:inner, :left, :outer, :anti])
295330
error("Invalid how: supported join types are :inner, :left, :outer, and :anti")
@@ -331,26 +366,18 @@ function Base.join(f, left::Dataset, right::Dataset;
331366
I, data, ks, lout, rout, lnull, rnull, init_group, accumulate =
332367
init_join_output(typ, grp, f, ldata, rdata,
333368
left, keepkeys, lkey, rkey,
334-
init_group, accumulate)
369+
init_group, accumulate, missingtype)
335370

336371
lnull_idx, rnull_idx = _join!(typ, grp, Val{keepkeys}(), f, I,
337372
data, ks, lout, rout, lnull, rnull,
338373
lkey, rkey, ldata, rdata, lperm,
339-
rperm, init_group, accumulate)
374+
rperm, init_group, accumulate, missingtype)
340375

341376
if !isempty(lnull_idx) && lout !== nothing
    # Mark the entries of the left output at `lnull_idx` as missing, using the
    # representation selected by `missingtype` (see `outvec`).
    lout = if lout isa Columns
        Columns(map(col -> outvec(col, lnull_idx, missingtype), columns(lout)))
    else
        # `lout` is not a `Columns` here, so it is passed to `outvec` directly
        # (there is no `col` binding in this branch).
        outvec(lout, lnull_idx, missingtype)
    end
    data = concat_cols(lout, rout)
end
@@ -359,15 +386,9 @@ function Base.join(f, left::Dataset, right::Dataset;
359386
# NOTE(review): `rnulls` is not read anywhere in the visible code; confirm it is dead
# and remove it (the matching `lnulls` on the left side was already dropped).
rnulls = zeros(Bool, length(rout))
rnulls[rnull_idx] .= true
# Mark the entries of the right output at `rnull_idx` as missing, using the
# representation selected by `missingtype` (see `outvec`).
rout = if rout isa Columns
    Columns(map(col -> outvec(col, rnull_idx, missingtype), columns(rout)))
else
    # `rout` is not a `Columns` here, so it is passed to `outvec` directly
    # (there is no `col` binding in this branch).
    outvec(rout, rnull_idx, missingtype)
end
372393
data = concat_cols(lout, rout)
373394
end

src/selection.jl

Lines changed: 70 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ Select all or a subset of columns, or a single column from the table.
1111
4. `AbstractArray` -- returns the array itself. This must be the same length as the table.
1212
5. `Tuple` of `Selection` -- returns a table containing a column for every selector in the tuple. The tuple may also contain the type `Pair{Symbol, Selection}`, which gives the selection a name. This form is most useful when introducing a new column.
1313
6. `Regex` -- returns the columns with names that match the regular expression.
14+
7. `Type` -- returns columns with elements of the given type.
1415
1516
# Examples:
1617
@@ -122,7 +123,7 @@ Returns a new table if `f` returns a tuple or named tuple. If not, returns a ve
122123
function map(f, t::AbstractIndexedTable; select=nothing) end
123124

124125
function map(f, t::Dataset; select=nothing, copy=false, kwargs...)
125-
if isa(f, Tup) && select===nothing
126+
if isa(f, Tup) && select === nothing
126127
select = colnames(t)
127128
elseif select === nothing
128129
select = valuenames(t)
@@ -132,48 +133,90 @@ function map(f, t::Dataset; select=nothing, copy=false, kwargs...)
132133
isa(x, Columns) ? table(x; copy=false, kwargs...) : x
133134
end
134135

135-
function _non_missing(t::Union{Columns, IndexedTable}, sel=(colnames(t)...,))
136+
137+
# Indices of the entries of `v` for which `_ismissing` is false, i.e. the rows to KEEP
# (despite the name). Accepts any `AbstractVector`, not only `Vector`, so that a single
# selected column of another array type (e.g. a `DataValueArray`) is handled too.
missing_indxs(v::AbstractVector) = findall(!_ismissing, v)
138+
139+
function missing_indxs(t::StructArray)
136140
indxs = collect(1:length(t))
137-
by = isa(sel, Tuple) ? sel : (sel,)
138-
bycols = columns(t, by)
139-
d = ColDict(t)
140-
for (key, c) in zip(by, bycols)
141-
x = rows(t, c)
142-
if Missing <: eltype(x)
143-
filt_by_col!(!ismissing, x, indxs)
144-
y = Vector{Base.nonmissingtype(eltype(x))}(undef, length(x))
145-
y[indxs] = x[indxs]
146-
d[key] = y
147-
else
148-
d[key] = x
149-
end
141+
for vec in getfield(t, :fieldarrays)
142+
filter!(i -> !_ismissing(vec[i]), indxs)
150143
end
151-
(d[], indxs)
144+
indxs
152145
end
153146

154147
"""
155-
dropmissing(t)
148+
dropmissing(t)
156149
dropmissing(t, select)
157150
158-
Drop rows of table `t` which contain `missing` values, optionally only
159-
using the columns in `select`.
151+
Drop rows of table `t` which contain missing values (either `Missing` or `DataValue`),
152+
optionally only using the columns in `select`. Column types will be converted to
153+
non-missing types. For example:
160154
161-
Column types will be converted to non-`Missing` types. E.g. `Array{Union{Int, Missing}}`
162-
to `Array{Int}`.
155+
- `Vector{Union{Int, Missing}}` -> `Vector{Int}`
156+
- `DataValueArray{Int}` -> `Vector{Int}`
163157
164158
# Example
165159
166160
t = table([0.1,0.5,missing,0.7], [2,missing,4,5], [missing,6,missing,7], names=[:t,:x,:y])
167161
dropmissing(t)
168162
dropmissing(t, (:t, :x))
169163
"""
170-
function dropmissing(t::Dataset, by=colnames(t))
171-
subtable(_non_missing(t, by)...)
164+
function dropmissing(t::IndexedTable, sel = All())
165+
selection = lowerselection(t, sel)
166+
indxs = missing_indxs(rows(t, selection))
167+
t2 = subtable(t, indxs)
168+
d = ColDict(t2)
169+
for s in selection
170+
T = eltype(d[s])
171+
d[s] = convert(Vector{missingtype2type(T)}, d[s])
172+
end
173+
174+
table(d[], copy=false, perms=t.perms, presorted=true)
172175
end
173176

174-
@deprecate dropna dropmissing
177+
dropmissing(t::NDSparse, sel=All()) = ndsparse(dropmissing(table(t), sel))
178+
179+
Base.@deprecate_binding dropna dropmissing
180+
181+
"""
182+
convertmissing(tbl, missingtype)
183+
184+
Convert the missing value representation in `tbl` to be of type `missingtype`.
185+
186+
# Example
187+
188+
using IndexedTables, DataValues
189+
t = table([1,2,missing], [1,missing,3])
190+
IndexedTables.convertmissing(t, DataValue)
191+
"""
192+
function convertmissing(t::IndexedTable, ::Type{Missing})
193+
d = ColDict(t)
194+
for (k, v) in pairs(d)
195+
T = eltype(v)
196+
if T <: DataValue
197+
indxs = findall(!isna, v)
198+
y = Vector{Union{Missing, missingtype2type(T)}}(missing, length(v))
199+
y[indxs] = get.(v[indxs])
200+
d[k] = y
201+
end
202+
end
203+
subtable(d[], 1:length(t))
204+
end
205+
function convertmissing(t::IndexedTable, ::Type{DataValue})
206+
d = ColDict(t)
207+
for (k, v) in pairs(d)
208+
T = eltype(v)
209+
if Missing <: T
210+
indxs = findall(!ismissing, v)
211+
y = DataValueArray(Vector{Base.nonmissingtype(T)}(undef, length(v)), trues(length(v)))
212+
y[indxs] = v[indxs]
213+
d[k] = y
214+
end
215+
end
216+
subtable(d[], 1:length(t))
217+
end
218+
convertmissing(t::NDSparse, Typ) = ndsparse(convertmissing(table(t), Typ))
175219

176-
filt_by_col!(f, col, indxs) = filter!(i->f(col[i]), indxs)
177220

178221
"""
179222
filter(f, t::Union{IndexedTable, NDSparse}; select)
@@ -215,6 +258,8 @@ function Base.filter(pred::Tuple, t::Dataset; select=nothing)
215258
subtable(t, indxs, presorted=true)
216259
end
217260

261+
filt_by_col!(f, col, indxs) = filter!(i->f(col[i]), indxs)
262+
218263
function Base.filter(pred::Pair, t::Dataset; select=nothing)
219264
filter((pred,), t, select=select)
220265
end

0 commit comments

Comments
 (0)