From 0461bc1935b99da4e5ea89e0e38aab0faf2d3879 Mon Sep 17 00:00:00 2001 From: George Datseris Date: Sat, 14 Jan 2023 10:37:31 +0200 Subject: [PATCH 01/29] allow bin width to be given to fixed rect bin directly --- .../rectangular_binning.jl | 35 +++++++++++++------ 1 file changed, 24 insertions(+), 11 deletions(-) diff --git a/src/encoding_implementations/rectangular_binning.jl b/src/encoding_implementations/rectangular_binning.jl index 25d90a5b1..2e8231e7e 100644 --- a/src/encoding_implementations/rectangular_binning.jl +++ b/src/encoding_implementations/rectangular_binning.jl @@ -39,27 +39,32 @@ const ValidFixedBinInputs = Union{Number, NTuple} """ FixedRectangularBinning <: AbstractBinning - FixedRectangularBinning(ϵmin::NTuple, ϵmax::NTuple, N::Int) + FixedRectangularBinning(ϵmin::NTuple, ϵmax::NTuple, z::Union{Real, Int}) Rectangular box partition of state space where the extent of the grid is explicitly -specified by `ϵmin` and `emax`, and along each dimension, the grid is subdivided into `N` -subintervals. Points falling outside the partition do not contribute to probabilities. +specified by `ϵmin` and `emax`, and along each dimension. + +The third argument `subdiv` decides how the grid is subdivided: +if it is an integer, the grid is subdivided into that many subintervals. +Otherwise it is a real, and it is taken as the edge width. + +Points falling outside the partition do not contribute to probabilities. This binning type leads to a well-defined outcome space without knowledge of input data, see [`ValueHistogram`](@ref). `ϵmin`/`emax` must be `NTuple{D, <:Real}` for input of `D`-dimensional data. - FixedRectangularBinning(ϵmin::Real, ϵmax::Real, N::Int, D::Int = 1) + FixedRectangularBinning(ϵmin::Real, ϵmax::Real, subdiv, D::Int = 1) This is a convenience method where each dimension of the binning has the same extent and the input data are `D` dimensional, which defaults to 1 (timeseries). """ -struct FixedRectangularBinning{D,T<:Real} <: AbstractBinning +struct FixedRectangularBinning{D,T<:Real,S<:Union{<:Real,Int}} <: AbstractBinning ϵmin::NTuple{D,T} ϵmax::NTuple{D,T} - N::Int + subdiv::S end -function FixedRectangularBinning(ϵmin::Real, ϵmax::Real, N::Int, D::Int = 1) +function FixedRectangularBinning(ϵmin::Real, ϵmax::Real, N, D::Int = 1) FixedRectangularBinning(ntuple(x->ϵmin, D), ntuple(x->ϵmax, D), N) end @@ -76,7 +81,9 @@ The second signature does not need `x` because (1) the binning is fixed, and the size of `x` doesn't matter, and (2) because the binning contains the dimensionality information as `ϵmin/max` is already an `NTuple`. -Due to roundoff error when computing bin edges, the computed bin widths +When specifying `n::Int` amount of bins instead of directly providing an +edge with in the `binning`, to roundoff errors typically occur when computing bin edges. +In this case, the computed bin widths are increased to their `nextfloat` `n_eps` times to ensure the correct number of bins is produced. @@ -175,9 +182,15 @@ function RectangularBinEncoding(b::FixedRectangularBinning{D, T}; n_eps = 2) whe ϵmin, ϵmax = b.ϵmin, b.ϵmax mini = SVector{D, T}(ϵmin) maxi = SVector{D, T}(ϵmax) - edgelengths_nonadjusted = @. (maxi .- mini) / b.N - edgelengths = nextfloat.(edgelengths_nonadjusted, n_eps) - histsize = SVector{D,Int}(fill(b.N, D)) + if b.subdiv isa Real + edgelengths = one(SVector{D, T}) .* b.subdiv + histsize = ceil.(Int, (maxi .- mini) ./ edgelengths) + elseif b.subdiv isa Int + N = b.subdiv + edgelengths_nonadjusted = @. (maxi .- mini) / N + edgelengths = nextfloat.(edgelengths_nonadjusted, n_eps) + histsize = SVector{D,Int}(fill(N, D)) + end ci = CartesianIndices(Tuple(histsize)) li = LinearIndices(ci) RectangularBinEncoding(b, typeof(edgelengths)(mini), edgelengths, histsize, ci, li) From e0edcdf9533d164904a529a72c4ad0c0cfeacd53 Mon Sep 17 00:00:00 2001 From: George Datseris Date: Sat, 14 Jan 2023 10:38:10 +0200 Subject: [PATCH 02/29] remove incorrect edge application of nextfloat for given edge --- src/encoding_implementations/rectangular_binning.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/encoding_implementations/rectangular_binning.jl b/src/encoding_implementations/rectangular_binning.jl index 2e8231e7e..870978bf0 100644 --- a/src/encoding_implementations/rectangular_binning.jl +++ b/src/encoding_implementations/rectangular_binning.jl @@ -146,7 +146,7 @@ function RectangularBinEncoding(b::RectangularBinning, x; n_eps = 2) v = ones(SVector{D,T}) if ϵ isa Float64 || ϵ isa AbstractVector{<:AbstractFloat} edgelengths = SVector{D,T}(ϵ .* v) - histsize = ceil.(Int, nextfloat.((maxi .- mini), n_eps) ./ edgelengths) + histsize = ceil.(Int, (maxi .- mini) ./ edgelengths) elseif ϵ isa Int || ϵ isa Vector{Int} edgeslengths_nonadjusted = @. (maxi - mini)/ϵ # Just taking nextfloat once here isn't enough for bins to cover data when using From 37514e88c5a43bd24024a3f78792adf9ac411b38 Mon Sep 17 00:00:00 2001 From: George Datseris Date: Sat, 14 Jan 2023 10:40:31 +0200 Subject: [PATCH 03/29] better doc for fixed rect bin --- src/encoding_implementations/rectangular_binning.jl | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/encoding_implementations/rectangular_binning.jl b/src/encoding_implementations/rectangular_binning.jl index 870978bf0..5e4471a5c 100644 --- a/src/encoding_implementations/rectangular_binning.jl +++ b/src/encoding_implementations/rectangular_binning.jl @@ -39,14 +39,15 @@ const ValidFixedBinInputs = Union{Number, NTuple} """ FixedRectangularBinning <: AbstractBinning - FixedRectangularBinning(ϵmin::NTuple, ϵmax::NTuple, z::Union{Real, Int}) + FixedRectangularBinning(xmin::NTuple, xmax::NTuple, subdiv::Union{Real, Int}) Rectangular box partition of state space where the extent of the grid is explicitly -specified by `ϵmin` and `emax`, and along each dimension. +specified by by providing the minium `xmin` and maximum `xmax` +values along each dimension. The third argument `subdiv` decides how the grid is subdivided: -if it is an integer, the grid is subdivided into that many subintervals. -Otherwise it is a real, and it is taken as the edge width. +if it is an integer, each dimension is subdivided into that many subintervals. +Otherwise it is a real, and it is taken as the edge width for each dimension. Points falling outside the partition do not contribute to probabilities. This binning type leads to a well-defined outcome space without knowledge of input data, @@ -54,7 +55,7 @@ see [`ValueHistogram`](@ref). `ϵmin`/`emax` must be `NTuple{D, <:Real}` for input of `D`-dimensional data. - FixedRectangularBinning(ϵmin::Real, ϵmax::Real, subdiv, D::Int = 1) + FixedRectangularBinning(xmin::Real, xmax::Real, subdiv, D::Int = 1) This is a convenience method where each dimension of the binning has the same extent and the input data are `D` dimensional, which defaults to 1 (timeseries). From 9933870263319ca64a52d8ddcd18c3770b032d98 Mon Sep 17 00:00:00 2001 From: George Datseris Date: Sat, 14 Jan 2023 11:21:16 +0200 Subject: [PATCH 04/29] better docuemnt and clarify bins --- src/encoding_implementations/rectangular_binning.jl | 10 +++++++--- src/probabilities_estimators/value_histogram.jl | 11 ++++++++--- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/src/encoding_implementations/rectangular_binning.jl b/src/encoding_implementations/rectangular_binning.jl index 5e4471a5c..bce8f4a19 100644 --- a/src/encoding_implementations/rectangular_binning.jl +++ b/src/encoding_implementations/rectangular_binning.jl @@ -50,8 +50,12 @@ if it is an integer, each dimension is subdivided into that many subintervals. Otherwise it is a real, and it is taken as the edge width for each dimension. Points falling outside the partition do not contribute to probabilities. -This binning type leads to a well-defined outcome space without knowledge of input data, -see [`ValueHistogram`](@ref). +Since bins are always left-closed-right-open `[a, b)`, keep this in mind +when specifying bin width directly. I.e., if `xmax = 1`, +a data point with value `1` does not fall into any bins! + +`FixedRectangularBinning` leads to a well-defined outcome space without knowledge of input +data, see [`ValueHistogram`](@ref). `ϵmin`/`emax` must be `NTuple{D, <:Real}` for input of `D`-dimensional data. @@ -184,7 +188,7 @@ function RectangularBinEncoding(b::FixedRectangularBinning{D, T}; n_eps = 2) whe mini = SVector{D, T}(ϵmin) maxi = SVector{D, T}(ϵmax) if b.subdiv isa Real - edgelengths = one(SVector{D, T}) .* b.subdiv + edgelengths = ones(SVector{D, T}) .* b.subdiv histsize = ceil.(Int, (maxi .- mini) ./ edgelengths) elseif b.subdiv isa Int N = b.subdiv diff --git a/src/probabilities_estimators/value_histogram.jl b/src/probabilities_estimators/value_histogram.jl index cbddb4063..240427229 100644 --- a/src/probabilities_estimators/value_histogram.jl +++ b/src/probabilities_estimators/value_histogram.jl @@ -27,11 +27,12 @@ and initializes this binning directly. ## Outcomes The outcome space for `ValueHistogram` is the unique bins constructed -from `b`. Each bin is identified by its left (lowest-value) corner. +from `b`. Each bin is identified by its left (lowest-value) corner, +because bins are always left-closed-right-open intervals `[a, b)`. The bins are in data units, not integer (cartesian indices units), and are returned as `SVector`s. -For [`FixedRectangularBinning`](@ref) this is well-defined from the binning, but for -[`RectangularBinning`](@ref) input `x` is needed for a well-defined [`outcome_space`](@ref). +For [`FixedRectangularBinning`](@ref) the [`outcome_space`](@ref) is well-defined from the +binning, but for [`RectangularBinning`](@ref) input `x` is needed as well. """ struct ValueHistogram{B<:AbstractBinning} <: ProbabilitiesEstimator binning::B @@ -63,3 +64,7 @@ function probabilities_and_outcomes(est::ValueHistogram, x) end outcome_space(est::ValueHistogram, x) = outcome_space(RectangularBinEncoding(est.binning, x)) + +function outcome_space(est::ValueHistogram{<:FixedRectangularBinning}) + return outcome_space(RectangularBinEncoding(est.binning)) +end \ No newline at end of file From a180cd0ffbdda7e1f6cd4f577eab9a4798e51f14 Mon Sep 17 00:00:00 2001 From: George Datseris Date: Sat, 14 Jan 2023 12:43:13 +0200 Subject: [PATCH 05/29] compelte rewrite of binnings based on ranges --- .../rectangular_binning.jl | 202 ++++++++++-------- .../estimators/value_histogram.jl | 2 +- 2 files changed, 116 insertions(+), 88 deletions(-) diff --git a/src/encoding_implementations/rectangular_binning.jl b/src/encoding_implementations/rectangular_binning.jl index bce8f4a19..5a9ed3819 100644 --- a/src/encoding_implementations/rectangular_binning.jl +++ b/src/encoding_implementations/rectangular_binning.jl @@ -19,6 +19,8 @@ Rectangular box partition of state space using the scheme `ϵ`, deducing the coordinates of the grid axis minima from the input data. Generally it is preferred to use [`FixedRectangularBinning`](@ref) instead, as it has a well defined outcome space without knowledge of input data. +`RectangularBinning` is re-cast into [`FixedRectangularBinning`](@ref) +once the data are provided, so see that docstring for info on the bin calculation. Binning instructions are deduced from the type of `ϵ` as follows: @@ -35,114 +37,112 @@ struct RectangularBinning{E} <: AbstractBinning ϵ::E end -const ValidFixedBinInputs = Union{Number, NTuple} - """ FixedRectangularBinning <: AbstractBinning - FixedRectangularBinning(xmin::NTuple, xmax::NTuple, subdiv::Union{Real, Int}) + FixedRectangularBinning(ranges::Tuple{<:AbstractRange...}, precise = false) -Rectangular box partition of state space where the extent of the grid is explicitly -specified by by providing the minium `xmin` and maximum `xmax` -values along each dimension. +Rectangular box partition of state space where the partition along each dimension +is explicitly given by each range `ranges`, which is a tuple of `AbstractRange` subtypes. +Typically, each range is the output of the `range` Base function, e.g., +`ranges = (0:0.1:1, range(0, 1; length = 101), range(2.1, 3.2; step = 0.33))`. +All ranges must be sorted. -The third argument `subdiv` decides how the grid is subdivided: -if it is an integer, each dimension is subdivided into that many subintervals. -Otherwise it is a real, and it is taken as the edge width for each dimension. +The optional second argument `precise` dictates whether Julia Base's `TwicePrecision` +is used for when searching where a point falls into the range. +Only useful for edge cases of points being almost exactly on the bin edges, +and comes with a performance downside, so by default it is `false`. Points falling outside the partition do not contribute to probabilities. -Since bins are always left-closed-right-open `[a, b)`, keep this in mind -when specifying bin width directly. I.e., if `xmax = 1`, -a data point with value `1` does not fall into any bins! +Bins are always left-closed-right-open `[a, b)`. +**This means that the last value of +each of the ranges dictates the last right-closing value.** +This value does _not_ belong to the histogram! +E.g., if given a range `r = range(0, 1; length = 11)`, with `r[end] = 1`, +the value `1` is outside the partition and would not attribute any +increase of the probability corresponding to the last bin (here `[0.9, 1)`)! + +Equivalently, the size of the histogram is +`histsize = map(r -> length(r)-1, ranges)`. `FixedRectangularBinning` leads to a well-defined outcome space without knowledge of input data, see [`ValueHistogram`](@ref). -`ϵmin`/`emax` must be `NTuple{D, <:Real}` for input of `D`-dimensional data. + FixedRectangularBinning(range::AbstractRange, D::Int = 1, precise = false) - FixedRectangularBinning(xmin::Real, xmax::Real, subdiv, D::Int = 1) - -This is a convenience method where each dimension of the binning has the same extent +This is a convenience method where each dimension of the binning has the same range and the input data are `D` dimensional, which defaults to 1 (timeseries). """ -struct FixedRectangularBinning{D,T<:Real,S<:Union{<:Real,Int}} <: AbstractBinning - ϵmin::NTuple{D,T} - ϵmax::NTuple{D,T} - subdiv::S +struct FixedRectangularBinning{R<:Tuple} <: AbstractBinning + ranges::R + precise::Bool +end +function FixedRectangularBinning(ranges::R, precise = false) where {R} + if any(r -> !issorted(r), ranges) + throw(ArgumentError("All input ranges must be sorted!")) + end + return FixedRectangularBinning{R}(ranges, precise) +end +# Convenience +function FixedRectangularBinning(r::AbstractRange, D::Int = 1, precise = false) + FixedRectangularBinning(ntuple(x->r, D), precise) +end + +# Deprecations +function FixedRectangularBinning(ϵmin::NTuple{D,T}, ϵmax::NTuple{D,T}, N::Int) + FixedRectangularBinning(ntuple(x->range(ϵmin,ϵmax;length=N), D)) end function FixedRectangularBinning(ϵmin::Real, ϵmax::Real, N, D::Int = 1) - FixedRectangularBinning(ntuple(x->ϵmin, D), ntuple(x->ϵmax, D), N) + if N isa Int + FixedRectangularBinning(ntuple(x-> range(ϵmin, ϵmax; length = N), D)) + else + FixedRectangularBinning(ntuple(x-> range(ϵmin, ϵmax; step = N), D)) + end end """ RectangularBinEncoding <: Encoding - RectangularBinEncoding(binning::RectangularBinning, x; n_eps = 2) - RectangularBinEncoding(binning::FixedRectangularBinning; n_eps = 2) + RectangularBinEncoding(binning::RectangularBinning, x) + RectangularBinEncoding(binning::FixedRectangularBinning) An encoding scheme that [`encode`](@ref)s points `χ ∈ x` into their histogram bins. -It finds the minima along each dimension, and computes appropriate -edge lengths for each dimension of `x` given a rectangular binning. -The second signature does not need `x` because (1) the binning is fixed, and the -size of `x` doesn't matter, and (2) because the binning contains the dimensionality -information as `ϵmin/max` is already an `NTuple`. +The first call signature simply initializes a [`FixedRectangularBinning`](@ref) +and then calls the second call signature. -When specifying `n::Int` amount of bins instead of directly providing an -edge with in the `binning`, to roundoff errors typically occur when computing bin edges. -In this case, the computed bin widths -are increased to their `nextfloat` `n_eps` times -to ensure the correct number of bins is produced. - -See also: [`RectangularBinning`](@ref), [`FixedRectangularBinning`](@ref). +See [`FixedRectangularBinning`](@ref) for info on mapping points to bins. """ -struct RectangularBinEncoding{B, D, T, C, L} <: HistogramEncoding - binning::B # either RectangularBinning or FixedRectangularBinning - mini::SVector{D,T} # fields are either static vectors or numbers - edgelengths::SVector{D,T} - histsize::SVector{D,Int} +struct RectangularBinEncoding{R<:Tuple, C, L} <: HistogramEncoding + ranges::R + precise::Bool + histsize::NTuple{D,Int} ci::C # cartesian indices li::L # linear indices end function Base.show(io::IO, x::RectangularBinEncoding) return print(io, "RectangularBinEncoding\n" * - " box corners: $(x.mini)\n" * - " edgelengths: $(x.edgelengths)\n" * + " ranges: $(x.ranges)\n" * " histogram size: $(x.histsize)" ) end -function encode(e::RectangularBinEncoding, point) - (; mini, edgelengths) = e - # Map a data point to its bin edge (plus one because indexing starts from 1) - bin = floor.(Int, (point .- mini) ./ edgelengths) .+ 1 - cartidx = CartesianIndex(Tuple(bin)) - # We have decided on the arbitrary convention that out of bound points - # will get the special symbol `-1`. Erroring doesn't make sense as it is expected - # that for fixed histograms there may be points outside of them. - if checkbounds(Bool, e.li, cartidx) - return @inbounds e.li[cartidx] - else - return -1 - end -end +################################################################## +# Initialization of encoding +################################################################## +# input data not needed here +RectangularBinEncoding(b::FixedRectangularBinning, x) = +RectangularBinEncoding(b::FixedRectangularBinning) -function decode(e::RectangularBinEncoding{B, D, T}, bin::Int) where {B, D, T} - V = SVector{D,T} - if checkbounds(Bool, e.ci, bin) - @inbounds cartesian = e.ci[bin] - else - error("Cannot decode integer $(bin): out of bounds of underlying histogram.") - end - (; mini, edgelengths) = e - # Remove one because we want lowest value corner, and we get indices starting from 1 - return (V(Tuple(cartesian)) .- 1) .* edgelengths .+ mini +function RectangularBinEncoding(b::FixedRectangularBinning) + ranges = b.binning.ranges + histsize = map(r -> length(r)-1, ranges) + ci = CartesianIndices(Tuple(histsize)) + li = LinearIndices(ci) + RectangularBinEncoding(ranges, b.precise, histsize, ci, li) end -################################################################## -# Initialization of encodings -################################################################## -# Data-controlled grid -function RectangularBinEncoding(b::RectangularBinning, x; n_eps = 2) +# Data-controlled grid: just cast into FixesRectangularBinning +function RectangularBinEncoding(b::RectangularBinning, x) # This function always returns static vectors and is type stable D = dimension(x) T = eltype(x) @@ -150,26 +150,17 @@ function RectangularBinEncoding(b::RectangularBinning, x; n_eps = 2) mini, maxi = minmaxima(x) v = ones(SVector{D,T}) if ϵ isa Float64 || ϵ isa AbstractVector{<:AbstractFloat} - edgelengths = SVector{D,T}(ϵ .* v) - histsize = ceil.(Int, (maxi .- mini) ./ edgelengths) + widths = SVector{D,T}(ϵ .* v) + ranges = ntuple(range(mini[i], maxi[i]; step = widths[i]), D) elseif ϵ isa Int || ϵ isa Vector{Int} - edgeslengths_nonadjusted = @. (maxi - mini)/ϵ - # Just taking nextfloat once here isn't enough for bins to cover data when using - # `encode_as_bin` later, because subtraction and division leads to loss - # of precision. We need a slightly bigger number, so apply nextfloat twice. - edgelengths = SVector{D,T}(nextfloat.(edgeslengths_nonadjusted, n_eps)) - if ϵ isa Vector{Int} - histsize = SVector{D, Int}(ϵ) - else - histsize = SVector{D, Int}(fill(ϵ, D)) - end + lengths = ϵ .* ones(SVector{D,Int}) + ranges = ntuple(range(mini[i], maxi[i]; length = lengths[i]), D) else error("Invalid ϵ for binning of a dataset") end - # Cartesian indices of the underlying histogram - ci = CartesianIndices(Tuple(histsize)) - li = LinearIndices(ci) - RectangularBinEncoding(b, mini, edgelengths, histsize, ci, li) + # By default we have the imprecise version here; + # use `Fixed` if you want precise + return RectangularBinEncoding(FixedRectangularBinning(ranges)) end # fixed grid @@ -201,6 +192,43 @@ function RectangularBinEncoding(b::FixedRectangularBinning{D, T}; n_eps = 2) whe RectangularBinEncoding(b, typeof(edgelengths)(mini), edgelengths, histsize, ci, li) end +################################################################## +# encode/decode +################################################################## +function encode(e::RectangularBinEncoding, point) + ranges = e.ranges + if e.precise + cartidx = CartesianIndex(map(searchsortedlast, ranges, Tuple(point))) + else + # These are extracted at compile time and are `Tuple`s + edgelengths = map(step, ranges) + mini = map(minimum, range) + # Map a data point to its bin edge (plus one because indexing starts from 1) + bin = floor.(Int, (point .- mini) ./ edgelengths) .+ 1 + cartidx = CartesianIndex(bin) + end + # We have decided on the arbitrary convention that out of bound points + # will get the special symbol `-1`. Erroring doesn't make sense as it is expected + # that for fixed histograms there may be points outside of them. + if checkbounds(Bool, e.li, cartidx) + return @inbounds e.li[cartidx] + else + return -1 + end +end + +function decode(e::RectangularBinEncoding{R}, bin::Int) where {R} + V = SVector{length(R), eltype(first(e.ranges))} + if checkbounds(Bool, e.ci, bin) + @inbounds cartesian = e.ci[bin] + else + error("Cannot decode integer $(bin): out of bounds of underlying histogram.") + end + (; mini, edgelengths) = e + # Remove one because we want lowest value corner, and we get indices starting from 1 + return (V(Tuple(cartesian)) .- 1) .* edgelengths .+ mini +end + ################################################################## # Outcomes / total outcomes ################################################################## diff --git a/test/probabilities/estimators/value_histogram.jl b/test/probabilities/estimators/value_histogram.jl index 2d743f4cd..0f2ef49bd 100644 --- a/test/probabilities/estimators/value_histogram.jl +++ b/test/probabilities/estimators/value_histogram.jl @@ -88,7 +88,7 @@ end n = 10 # boxes cover 0 - 1 in steps of slightly more than 0.1 ε = nextfloat(0.1, 2) # this guarantees that we get the same as the `n` above! - bin = FixedRectangularBinning(0, 1, n, 2) + bin = FixedRectangularBinning(0.0, 1.0, n, 2) est = ValueHistogram(bin) p = probabilities(est, x) From 1aa1547fc8e2e86881f2ac15ac40378eeb92005e Mon Sep 17 00:00:00 2001 From: George Datseris Date: Sat, 14 Jan 2023 12:52:32 +0200 Subject: [PATCH 06/29] make compilable --- .../rectangular_binning.jl | 56 ++++++------------- 1 file changed, 16 insertions(+), 40 deletions(-) diff --git a/src/encoding_implementations/rectangular_binning.jl b/src/encoding_implementations/rectangular_binning.jl index 5a9ed3819..58bc054cd 100644 --- a/src/encoding_implementations/rectangular_binning.jl +++ b/src/encoding_implementations/rectangular_binning.jl @@ -53,7 +53,7 @@ Only useful for edge cases of points being almost exactly on the bin edges, and comes with a performance downside, so by default it is `false`. Points falling outside the partition do not contribute to probabilities. -Bins are always left-closed-right-open `[a, b)`. +Bins are always left-closed-right-open: `[a, b)`. **This means that the last value of each of the ranges dictates the last right-closing value.** This value does _not_ belong to the histogram! @@ -61,8 +61,8 @@ E.g., if given a range `r = range(0, 1; length = 11)`, with `r[end] = 1`, the value `1` is outside the partition and would not attribute any increase of the probability corresponding to the last bin (here `[0.9, 1)`)! -Equivalently, the size of the histogram is -`histsize = map(r -> length(r)-1, ranges)`. +**Equivalently, the size of the histogram is +`histsize = map(r -> length(r)-1, ranges)`!** `FixedRectangularBinning` leads to a well-defined outcome space without knowledge of input data, see [`ValueHistogram`](@ref). @@ -88,8 +88,8 @@ function FixedRectangularBinning(r::AbstractRange, D::Int = 1, precise = false) end # Deprecations -function FixedRectangularBinning(ϵmin::NTuple{D,T}, ϵmax::NTuple{D,T}, N::Int) - FixedRectangularBinning(ntuple(x->range(ϵmin,ϵmax;length=N), D)) +function FixedRectangularBinning(ϵmin::NTuple{D,T}, ϵmax::NTuple{D,T}, N::Int) where {D, T} + FixedRectangularBinning(ntuple(x->range(ϵmin[i],ϵmax[i];length=N), D)) end function FixedRectangularBinning(ϵmin::Real, ϵmax::Real, N, D::Int = 1) if N isa Int @@ -111,7 +111,7 @@ and then calls the second call signature. See [`FixedRectangularBinning`](@ref) for info on mapping points to bins. """ -struct RectangularBinEncoding{R<:Tuple, C, L} <: HistogramEncoding +struct RectangularBinEncoding{R<:Tuple, D, C, L} <: HistogramEncoding ranges::R precise::Bool histsize::NTuple{D,Int} @@ -129,17 +129,22 @@ end ################################################################## # Initialization of encoding ################################################################## -# input data not needed here -RectangularBinEncoding(b::FixedRectangularBinning, x) = -RectangularBinEncoding(b::FixedRectangularBinning) - +# Fixed grid function RectangularBinEncoding(b::FixedRectangularBinning) - ranges = b.binning.ranges + ranges = b.ranges histsize = map(r -> length(r)-1, ranges) ci = CartesianIndices(Tuple(histsize)) li = LinearIndices(ci) RectangularBinEncoding(ranges, b.precise, histsize, ci, li) end +function RectangularBinEncoding(b::FixedRectangularBinning, x) + if length(e.ranges) != dimension(x) + throw(ArgumentError(""" + The dimensionality of the `FixedRectangularBinning` and input `x` do not match. + Got $(e.ranges) and $(dimension(x)).""")) + end + return RectangularBinEncoding(b) +end # Data-controlled grid: just cast into FixesRectangularBinning function RectangularBinEncoding(b::RectangularBinning, x) @@ -163,35 +168,6 @@ function RectangularBinEncoding(b::RectangularBinning, x) return RectangularBinEncoding(FixedRectangularBinning(ranges)) end -# fixed grid -function RectangularBinEncoding(b::FixedRectangularBinning{D}, x; n_eps = 2) where {D} - if D != dimension(x) - throw(ArgumentError(""" - The dimensionality of the `FixedRectangularBinning` and input `x` do not match. - Got $(D) and $(dimension(x)).""")) - end - return RectangularBinEncoding(b; n_eps) -end - -function RectangularBinEncoding(b::FixedRectangularBinning{D, T}; n_eps = 2) where {D,T} - # This function always returns static vectors and is type stable - ϵmin, ϵmax = b.ϵmin, b.ϵmax - mini = SVector{D, T}(ϵmin) - maxi = SVector{D, T}(ϵmax) - if b.subdiv isa Real - edgelengths = ones(SVector{D, T}) .* b.subdiv - histsize = ceil.(Int, (maxi .- mini) ./ edgelengths) - elseif b.subdiv isa Int - N = b.subdiv - edgelengths_nonadjusted = @. (maxi .- mini) / N - edgelengths = nextfloat.(edgelengths_nonadjusted, n_eps) - histsize = SVector{D,Int}(fill(N, D)) - end - ci = CartesianIndices(Tuple(histsize)) - li = LinearIndices(ci) - RectangularBinEncoding(b, typeof(edgelengths)(mini), edgelengths, histsize, ci, li) -end - ################################################################## # encode/decode ################################################################## From 8c988c0b2429b3ed7a00babdf0ad9451f15a0f2b Mon Sep 17 00:00:00 2001 From: George Datseris Date: Sat, 14 Jan 2023 13:03:21 +0200 Subject: [PATCH 07/29] super simple outcome space --- .../rectangular_binning.jl | 25 +++++++++++-------- .../value_histogram.jl | 5 +++- 2 files changed, 19 insertions(+), 11 deletions(-) diff --git a/src/encoding_implementations/rectangular_binning.jl b/src/encoding_implementations/rectangular_binning.jl index 58bc054cd..035ce64fd 100644 --- a/src/encoding_implementations/rectangular_binning.jl +++ b/src/encoding_implementations/rectangular_binning.jl @@ -148,7 +148,6 @@ end # Data-controlled grid: just cast into FixesRectangularBinning function RectangularBinEncoding(b::RectangularBinning, x) - # This function always returns static vectors and is type stable D = dimension(x) T = eltype(x) ϵ = b.ϵ @@ -177,10 +176,10 @@ function encode(e::RectangularBinEncoding, point) cartidx = CartesianIndex(map(searchsortedlast, ranges, Tuple(point))) else # These are extracted at compile time and are `Tuple`s - edgelengths = map(step, ranges) - mini = map(minimum, range) + widths = map(step, ranges) + mini = map(minimum, ranges) # Map a data point to its bin edge (plus one because indexing starts from 1) - bin = floor.(Int, (point .- mini) ./ edgelengths) .+ 1 + bin = floor.(Int, (point .- mini) ./ widths) .+ 1 cartidx = CartesianIndex(bin) end # We have decided on the arbitrary convention that out of bound points @@ -193,16 +192,18 @@ function encode(e::RectangularBinEncoding, point) end end -function decode(e::RectangularBinEncoding{R}, bin::Int) where {R} - V = SVector{length(R), eltype(first(e.ranges))} +function decode(e::RectangularBinEncoding, bin::Int) if checkbounds(Bool, e.ci, bin) @inbounds cartesian = e.ci[bin] else error("Cannot decode integer $(bin): out of bounds of underlying histogram.") end - (; mini, edgelengths) = e + ranges = e.ranges + V = SVector{length(ranges), eltype(float(first(ranges)))} + widths = map(step, ranges) + mini = map(minimum, ranges) # Remove one because we want lowest value corner, and we get indices starting from 1 - return (V(Tuple(cartesian)) .- 1) .* edgelengths .+ mini + return (V(Tuple(cartesian)) .- 1) .* widths .+ mini end ################################################################## @@ -211,9 +212,13 @@ end total_outcomes(e::RectangularBinEncoding) = prod(e.histsize) function outcome_space(e::RectangularBinEncoding) - # this is super simple :P could be optimized but its not a frequent operation - return [decode(e, i) for i in 1:total_outcomes(e)] + # This is super simple thanks to using ranges :) + # (and super performant) + reduced_ranges = map(r -> r[1:end-1], e.ranges) + return collect(Iterators.product(reduced_ranges...)) end +outcome_space(b::AbstractBinning, args...) = +outcome_space(RectangularBinEncoding(b, args...)) ################################################################## # low level histogram call diff --git a/src/probabilities_estimators/value_histogram.jl b/src/probabilities_estimators/value_histogram.jl index 240427229..df74104ac 100644 --- a/src/probabilities_estimators/value_histogram.jl +++ b/src/probabilities_estimators/value_histogram.jl @@ -30,7 +30,10 @@ The outcome space for `ValueHistogram` is the unique bins constructed from `b`. Each bin is identified by its left (lowest-value) corner, because bins are always left-closed-right-open intervals `[a, b)`. The bins are in data units, not integer (cartesian indices units), and -are returned as `SVector`s. +are returned as `Tuple`s. For convenience, [`outcome_space`](@ref) +returns the outcomes in the same array format as the histogram +(e.g., `Matrix` for 2D input). + For [`FixedRectangularBinning`](@ref) the [`outcome_space`](@ref) is well-defined from the binning, but for [`RectangularBinning`](@ref) input `x` is needed as well. """ From 6c3f14eab13788010fb6c893f766ba307be71a9b Mon Sep 17 00:00:00 2001 From: George Datseris Date: Sat, 14 Jan 2023 13:04:03 +0200 Subject: [PATCH 08/29] move deprecations to their appropriate file --- src/deprecations.jl | 13 +++++++++++++ src/encoding_implementations/rectangular_binning.jl | 12 ------------ 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/src/deprecations.jl b/src/deprecations.jl index 73476c099..cd590d0a8 100644 --- a/src/deprecations.jl +++ b/src/deprecations.jl @@ -1,3 +1,16 @@ +# from before histogram-from-ranges rework +function FixedRectangularBinning(ϵmin::NTuple{D,T}, ϵmax::NTuple{D,T}, N::Int) where {D, T} + FixedRectangularBinning(ntuple(x->range(ϵmin[i],ϵmax[i];length=N), D)) +end +function FixedRectangularBinning(ϵmin::Real, ϵmax::Real, N, D::Int = 1) + if N isa Int + FixedRectangularBinning(ntuple(x-> range(ϵmin, ϵmax; length = N), D)) + else + FixedRectangularBinning(ntuple(x-> range(ϵmin, ϵmax; step = N), D)) + end +end + + # from before https://github.com/JuliaDynamics/ComplexityMeasures.jl/pull/239 function entropy(e::EntropyDefinition, est::DiffEntropyEst, x) if e isa Shannon diff --git a/src/encoding_implementations/rectangular_binning.jl b/src/encoding_implementations/rectangular_binning.jl index 035ce64fd..a1829099c 100644 --- a/src/encoding_implementations/rectangular_binning.jl +++ b/src/encoding_implementations/rectangular_binning.jl @@ -87,18 +87,6 @@ function FixedRectangularBinning(r::AbstractRange, D::Int = 1, precise = false) FixedRectangularBinning(ntuple(x->r, D), precise) end -# Deprecations -function FixedRectangularBinning(ϵmin::NTuple{D,T}, ϵmax::NTuple{D,T}, N::Int) where {D, T} - FixedRectangularBinning(ntuple(x->range(ϵmin[i],ϵmax[i];length=N), D)) -end -function FixedRectangularBinning(ϵmin::Real, ϵmax::Real, N, D::Int = 1) - if N isa Int - FixedRectangularBinning(ntuple(x-> range(ϵmin, ϵmax; length = N), D)) - else - FixedRectangularBinning(ntuple(x-> range(ϵmin, ϵmax; step = N), D)) - end -end - """ RectangularBinEncoding <: Encoding RectangularBinEncoding(binning::RectangularBinning, x) From 119ce689edd84ba0fc1ffaf32e5eb3a26bc7bd6b Mon Sep 17 00:00:00 2001 From: George Datseris Date: Sat, 14 Jan 2023 13:06:40 +0200 Subject: [PATCH 09/29] more clear docs for rect bin --- .../rectangular_binning.jl | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/src/encoding_implementations/rectangular_binning.jl b/src/encoding_implementations/rectangular_binning.jl index a1829099c..7df603744 100644 --- a/src/encoding_implementations/rectangular_binning.jl +++ b/src/encoding_implementations/rectangular_binning.jl @@ -7,20 +7,19 @@ abstract type HistogramEncoding <: Encoding end ################################################################## # Structs and docstrings ################################################################## -# Notice that the binning types are intermediate structs that are not used directly -# in the source code. Their only purpose is instructions of how to create a -# `RectangularBinEncoder`. All actual source code functionality of `ValueHistogram` -# is implemented based on `RectangularBinEncoder`. +# The binning types are intermediate structs whose only purpose is +# instructions for initializing the `RectangularBinEncoding` """ RectangularBinning(ϵ) <: AbstractBinning Rectangular box partition of state space using the scheme `ϵ`, -deducing the coordinates of the grid axis minima from the input data. -Generally it is preferred to use [`FixedRectangularBinning`](@ref) instead, -as it has a well defined outcome space without knowledge of input data. -`RectangularBinning` is re-cast into [`FixedRectangularBinning`](@ref) -once the data are provided, so see that docstring for info on the bin calculation. +deducing the histogram extent and bin width from the input data. + +`RectangularBinning` is a convenience struct. +It is re-cast into [`FixedRectangularBinning`](@ref) +once the data are provided, so see that docstring for info on the bin calculation +and for the possibility of more precision during the histogram calculation. Binning instructions are deduced from the type of `ϵ` as follows: From 9ae366795e499990dcce542f36ca46cf84c9d8e1 Mon Sep 17 00:00:00 2001 From: George Datseris Date: Sat, 14 Jan 2023 13:16:05 +0200 Subject: [PATCH 10/29] move deprecations to file --- src/ComplexityMeasures.jl | 4 +++- .../rectangular_binning.jl | 14 ++++++++------ 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/src/ComplexityMeasures.jl b/src/ComplexityMeasures.jl index 19968dba2..01e019376 100644 --- a/src/ComplexityMeasures.jl +++ b/src/ComplexityMeasures.jl @@ -21,7 +21,6 @@ include("encodings.jl") include("complexity.jl") include("multiscale.jl") include("convenience.jl") -include("deprecations.jl") # Library implementations (files include other files) include("encoding_implementations/encoding_implementations.jl") @@ -30,6 +29,9 @@ include("entropies_definitions/entropies_definitions.jl") include("entropies_estimators/entropies_estimators.jl") include("complexity_measures/complexity_measures.jl") +# deprecations (must be after all declarations) +include("deprecations.jl") + # Update messages: using Scratch display_update = false diff --git a/src/encoding_implementations/rectangular_binning.jl b/src/encoding_implementations/rectangular_binning.jl index 7df603744..eac7ae1c0 100644 --- a/src/encoding_implementations/rectangular_binning.jl +++ b/src/encoding_implementations/rectangular_binning.jl @@ -65,23 +65,25 @@ increase of the probability corresponding to the last bin (here `[0.9, 1)`)! `FixedRectangularBinning` leads to a well-defined outcome space without knowledge of input data, see [`ValueHistogram`](@ref). - - FixedRectangularBinning(range::AbstractRange, D::Int = 1, precise = false) - -This is a convenience method where each dimension of the binning has the same range -and the input data are `D` dimensional, which defaults to 1 (timeseries). """ struct FixedRectangularBinning{R<:Tuple} <: AbstractBinning ranges::R precise::Bool end + function FixedRectangularBinning(ranges::R, precise = false) where {R} if any(r -> !issorted(r), ranges) throw(ArgumentError("All input ranges must be sorted!")) end return FixedRectangularBinning{R}(ranges, precise) end -# Convenience + +""" + FixedRectangularBinning(range::AbstractRange, D::Int = 1, precise = false) + +This is a convenience method where each dimension of the binning has the same range +and the input data are `D` dimensional, which defaults to 1 (timeseries). +""" function FixedRectangularBinning(r::AbstractRange, D::Int = 1, precise = false) FixedRectangularBinning(ntuple(x->r, D), precise) end From c89427b7d9ff38dfbd2c1b910079c8886f2bd0c4 Mon Sep 17 00:00:00 2001 From: George Datseris Date: Sat, 14 Jan 2023 13:20:15 +0200 Subject: [PATCH 11/29] allow geting FRB from RB --- .../rectangular_binning.jl | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/src/encoding_implementations/rectangular_binning.jl b/src/encoding_implementations/rectangular_binning.jl index eac7ae1c0..aeba450b7 100644 --- a/src/encoding_implementations/rectangular_binning.jl +++ b/src/encoding_implementations/rectangular_binning.jl @@ -18,8 +18,7 @@ deducing the histogram extent and bin width from the input data. `RectangularBinning` is a convenience struct. It is re-cast into [`FixedRectangularBinning`](@ref) -once the data are provided, so see that docstring for info on the bin calculation -and for the possibility of more precision during the histogram calculation. +once the data are provided, so see that docstring for info on the bin calculation. Binning instructions are deduced from the type of `ϵ` as follows: @@ -31,6 +30,9 @@ Binning instructions are deduced from the type of `ϵ` as follows: intervals that cover all data. 4. `ϵ::Vector{Float64}` divides the i-th coordinate axis into intervals of fixed size `ϵ[i]`, starting from the axis minima until the data is completely covered by boxes. + +To ensure all data are covered, the `nextfloat` of the data maximum value is used +as the maximum along each dimension. """ struct RectangularBinning{E} <: AbstractBinning ϵ::E @@ -137,13 +139,19 @@ end # Data-controlled grid: just cast into FixesRectangularBinning function RectangularBinEncoding(b::RectangularBinning, x) + RectangularBinEncoding(FixedRectangularBinning(b, x)) +end +function FixedRectangularBinning(b::RectangularBinning, x) D = dimension(x) T = eltype(x) ϵ = b.ϵ mini, maxi = minmaxima(x) + # use `nextfloat` to ensure all data are covered! + maxi = nextfloat.(maxi) v = ones(SVector{D,T}) if ϵ isa Float64 || ϵ isa AbstractVector{<:AbstractFloat} widths = SVector{D,T}(ϵ .* v) + # use `nextfloat` to ensure all data are covered! ranges = ntuple(range(mini[i], maxi[i]; step = widths[i]), D) elseif ϵ isa Int || ϵ isa Vector{Int} lengths = ϵ .* ones(SVector{D,Int}) @@ -153,7 +161,7 @@ function RectangularBinEncoding(b::RectangularBinning, x) end # By default we have the imprecise version here; # use `Fixed` if you want precise - return RectangularBinEncoding(FixedRectangularBinning(ranges)) + return FixedRectangularBinning(ranges) end ################################################################## From ac40e535ee0a81fc17afae46fa42f86e0ebebc18 Mon Sep 17 00:00:00 2001 From: George Datseris Date: Sat, 14 Jan 2023 13:27:00 +0200 Subject: [PATCH 12/29] use `nextfloat` to ensure promise --- src/encoding_implementations/rectangular_binning.jl | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/encoding_implementations/rectangular_binning.jl b/src/encoding_implementations/rectangular_binning.jl index aeba450b7..707382aae 100644 --- a/src/encoding_implementations/rectangular_binning.jl +++ b/src/encoding_implementations/rectangular_binning.jl @@ -129,7 +129,7 @@ function RectangularBinEncoding(b::FixedRectangularBinning) RectangularBinEncoding(ranges, b.precise, histsize, ci, li) end function RectangularBinEncoding(b::FixedRectangularBinning, x) - if length(e.ranges) != dimension(x) + if length(b.ranges) != dimension(x) throw(ArgumentError(""" The dimensionality of the `FixedRectangularBinning` and input `x` do not match. Got $(e.ranges) and $(dimension(x)).""")) @@ -176,7 +176,9 @@ function encode(e::RectangularBinEncoding, point) widths = map(step, ranges) mini = map(minimum, ranges) # Map a data point to its bin edge (plus one because indexing starts from 1) - bin = floor.(Int, (point .- mini) ./ widths) .+ 1 + # we also use `Tuple` because `point` is `SVector` and the broadcast + # below results in `Vector` if you mix `Tuple` with `SVector`. + bin = floor.(Int, (Tuple(point) .- mini) ./ widths) .+ 1 cartidx = CartesianIndex(bin) end # We have decided on the arbitrary convention that out of bound points From c4fd1def9a0e535aa6e2c699a67bd7224de894ef Mon Sep 17 00:00:00 2001 From: George Datseris Date: Sat, 14 Jan 2023 13:47:30 +0200 Subject: [PATCH 13/29] increase performance of inprecise version --- .../rectangular_binning.jl | 27 ++++++++++--------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/src/encoding_implementations/rectangular_binning.jl b/src/encoding_implementations/rectangular_binning.jl index 707382aae..63e1237d5 100644 --- a/src/encoding_implementations/rectangular_binning.jl +++ b/src/encoding_implementations/rectangular_binning.jl @@ -50,8 +50,8 @@ All ranges must be sorted. The optional second argument `precise` dictates whether Julia Base's `TwicePrecision` is used for when searching where a point falls into the range. -Only useful for edge cases of points being almost exactly on the bin edges, -and comes with a performance downside, so by default it is `false`. +Useful for edge cases of points being almost exactly on the bin edges, +and is exactly twice as slow, so by default it is `false`. Points falling outside the partition do not contribute to probabilities. Bins are always left-closed-right-open: `[a, b)`. @@ -102,10 +102,12 @@ and then calls the second call signature. See [`FixedRectangularBinning`](@ref) for info on mapping points to bins. """ -struct RectangularBinEncoding{R<:Tuple, D, C, L} <: HistogramEncoding +struct RectangularBinEncoding{R<:Tuple, D, T, C, L} <: HistogramEncoding ranges::R precise::Bool - histsize::NTuple{D,Int} + mini::SVector{D, T} + widths::SVector{D, T} + histsize::NTuple{D, Int} ci::C # cartesian indices li::L # linear indices end @@ -124,9 +126,13 @@ end function RectangularBinEncoding(b::FixedRectangularBinning) ranges = b.ranges histsize = map(r -> length(r)-1, ranges) + D = length(ranges) + T = float(eltype(first(ranges))) + mini = SVector{D,T}(map(minimum, ranges)) + widths = SVector{D,T}(map(step, ranges)) ci = CartesianIndices(Tuple(histsize)) li = LinearIndices(ci) - RectangularBinEncoding(ranges, b.precise, histsize, ci, li) + RectangularBinEncoding(ranges, b.precise, mini, widths, histsize, ci, li) end function RectangularBinEncoding(b::FixedRectangularBinning, x) if length(b.ranges) != dimension(x) @@ -170,16 +176,11 @@ end function encode(e::RectangularBinEncoding, point) ranges = e.ranges if e.precise + # Don't know how to make this faster unfurtunately... cartidx = CartesianIndex(map(searchsortedlast, ranges, Tuple(point))) else - # These are extracted at compile time and are `Tuple`s - widths = map(step, ranges) - mini = map(minimum, ranges) - # Map a data point to its bin edge (plus one because indexing starts from 1) - # we also use `Tuple` because `point` is `SVector` and the broadcast - # below results in `Vector` if you mix `Tuple` with `SVector`. - bin = floor.(Int, (Tuple(point) .- mini) ./ widths) .+ 1 - cartidx = CartesianIndex(bin) + bin = floor.(Int, (point .- e.mini) ./ e.widths) .+ 1 + cartidx = CartesianIndex(Tuple(bin)) end # We have decided on the arbitrary convention that out of bound points # will get the special symbol `-1`. Erroring doesn't make sense as it is expected From 053a905ebe08ee5ceada8d9f17b994b6bab0a8e2 Mon Sep 17 00:00:00 2001 From: George Datseris Date: Sat, 14 Jan 2023 13:58:10 +0200 Subject: [PATCH 14/29] correct extraction of length of range --- src/encoding_implementations/rectangular_binning.jl | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/encoding_implementations/rectangular_binning.jl b/src/encoding_implementations/rectangular_binning.jl index 63e1237d5..5a9c9fa3f 100644 --- a/src/encoding_implementations/rectangular_binning.jl +++ b/src/encoding_implementations/rectangular_binning.jl @@ -158,10 +158,12 @@ function FixedRectangularBinning(b::RectangularBinning, x) if ϵ isa Float64 || ϵ isa AbstractVector{<:AbstractFloat} widths = SVector{D,T}(ϵ .* v) # use `nextfloat` to ensure all data are covered! - ranges = ntuple(range(mini[i], maxi[i]; step = widths[i]), D) + ranges = ntuple(i -> range(mini[i], maxi[i]; step = widths[i]), D) elseif ϵ isa Int || ϵ isa Vector{Int} - lengths = ϵ .* ones(SVector{D,Int}) - ranges = ntuple(range(mini[i], maxi[i]; length = lengths[i]), D) + # We add one, because the user input specifies the number of bins, + # and the number of bins is the range length - 1 + lengths = ϵ .* ones(SVector{D,Int}) .+ 1 + ranges = ntuple(i -> range(mini[i], maxi[i]; length = lengths[i]), D) else error("Invalid ϵ for binning of a dataset") end From e3e082dad1969431142b09d7ac72ba188655115e Mon Sep 17 00:00:00 2001 From: George Datseris Date: Sat, 14 Jan 2023 14:46:49 +0200 Subject: [PATCH 15/29] re-write tests --- .../rectangular_binning.jl | 37 ++++-- .../value_histogram.jl | 6 +- .../estimators/value_histogram.jl | 121 +++++++++--------- 3 files changed, 92 insertions(+), 72 deletions(-) diff --git a/src/encoding_implementations/rectangular_binning.jl b/src/encoding_implementations/rectangular_binning.jl index 5a9c9fa3f..118630fad 100644 --- a/src/encoding_implementations/rectangular_binning.jl +++ b/src/encoding_implementations/rectangular_binning.jl @@ -11,14 +11,15 @@ abstract type HistogramEncoding <: Encoding end # instructions for initializing the `RectangularBinEncoding` """ - RectangularBinning(ϵ) <: AbstractBinning + RectangularBinning(ϵ, precise = false) <: AbstractBinning Rectangular box partition of state space using the scheme `ϵ`, deducing the histogram extent and bin width from the input data. `RectangularBinning` is a convenience struct. It is re-cast into [`FixedRectangularBinning`](@ref) -once the data are provided, so see that docstring for info on the bin calculation. +once the data are provided, so see that docstring for info on the bin calculation +and the meaning of `precise`. Binning instructions are deduced from the type of `ϵ` as follows: @@ -32,11 +33,14 @@ Binning instructions are deduced from the type of `ϵ` as follows: `ϵ[i]`, starting from the axis minima until the data is completely covered by boxes. To ensure all data are covered, the `nextfloat` of the data maximum value is used -as the maximum along each dimension. +as the maximum along each dimension. However this will lead to 100% accurate results +only when `precise = true`. """ struct RectangularBinning{E} <: AbstractBinning ϵ::E + precise::Bool end +RectangularBinning(ε) = RectangularBinning(ε, false) """ FixedRectangularBinning <: AbstractBinning @@ -51,7 +55,7 @@ All ranges must be sorted. The optional second argument `precise` dictates whether Julia Base's `TwicePrecision` is used for when searching where a point falls into the range. Useful for edge cases of points being almost exactly on the bin edges, -and is exactly twice as slow, so by default it is `false`. +but it is exactly four times as slow, so by default it is `false`. Points falling outside the partition do not contribute to probabilities. Bins are always left-closed-right-open: `[a, b)`. @@ -157,7 +161,6 @@ function FixedRectangularBinning(b::RectangularBinning, x) v = ones(SVector{D,T}) if ϵ isa Float64 || ϵ isa AbstractVector{<:AbstractFloat} widths = SVector{D,T}(ϵ .* v) - # use `nextfloat` to ensure all data are covered! ranges = ntuple(i -> range(mini[i], maxi[i]; step = widths[i]), D) elseif ϵ isa Int || ϵ isa Vector{Int} # We add one, because the user input specifies the number of bins, @@ -169,7 +172,7 @@ function FixedRectangularBinning(b::RectangularBinning, x) end # By default we have the imprecise version here; # use `Fixed` if you want precise - return FixedRectangularBinning(ranges) + return FixedRectangularBinning(ranges, b.precise) end ################################################################## @@ -198,14 +201,20 @@ function decode(e::RectangularBinEncoding, bin::Int) if checkbounds(Bool, e.ci, bin) @inbounds cartesian = e.ci[bin] else - error("Cannot decode integer $(bin): out of bounds of underlying histogram.") + throw(ArgumentError( + "Cannot decode integer $(bin): out of bounds of underlying binning." + )) end + # The decoding step is rather trivial here; we just index the ranges at the index ranges = e.ranges - V = SVector{length(ranges), eltype(float(first(ranges)))} - widths = map(step, ranges) - mini = map(minimum, ranges) + D = length(ranges) + left_edge = @inbounds ntuple(i -> ranges[i][cartesian[i]], D) + return SVector{D}(left_edge) + # widths = e.widths + # mini = e.mini + # V = SVector{length(ranges), eltype(float(first(ranges)))} # Remove one because we want lowest value corner, and we get indices starting from 1 - return (V(Tuple(cartesian)) .- 1) .* widths .+ mini + # return (V(Tuple(cartesian)) .- 1) .* widths .+ mini end ################################################################## @@ -215,9 +224,11 @@ total_outcomes(e::RectangularBinEncoding) = prod(e.histsize) function outcome_space(e::RectangularBinEncoding) # This is super simple thanks to using ranges :) - # (and super performant) reduced_ranges = map(r -> r[1:end-1], e.ranges) - return collect(Iterators.product(reduced_ranges...)) + iter = Iterators.product(reduced_ranges...) + # Convert to `SVector` because that's the agreed outcome space type + V = SVector{length(e.ranges), eltype(float(first(e.ranges)))} + return V.(iter) end outcome_space(b::AbstractBinning, args...) = outcome_space(RectangularBinEncoding(b, args...)) diff --git a/src/probabilities_estimators/value_histogram.jl b/src/probabilities_estimators/value_histogram.jl index df74104ac..59ff304b4 100644 --- a/src/probabilities_estimators/value_histogram.jl +++ b/src/probabilities_estimators/value_histogram.jl @@ -30,8 +30,10 @@ The outcome space for `ValueHistogram` is the unique bins constructed from `b`. Each bin is identified by its left (lowest-value) corner, because bins are always left-closed-right-open intervals `[a, b)`. The bins are in data units, not integer (cartesian indices units), and -are returned as `Tuple`s. For convenience, [`outcome_space`](@ref) -returns the outcomes in the same array format as the histogram +are returned as `SVector`s, i.e., same type as input data. + +For convenience, [`outcome_space`](@ref) +returns the outcomes in the same array format as the underlying binning (e.g., `Matrix` for 2D input). For [`FixedRectangularBinning`](@ref) the [`outcome_space`](@ref) is well-defined from the diff --git a/test/probabilities/estimators/value_histogram.jl b/test/probabilities/estimators/value_histogram.jl index 0f2ef49bd..f12a1b9d9 100644 --- a/test/probabilities/estimators/value_histogram.jl +++ b/test/probabilities/estimators/value_histogram.jl @@ -2,29 +2,43 @@ using ComplexityMeasures using Test using Random -@testset "Rectangular binning" begin +@testset "Standard ranges binning" begin x = Dataset(rand(Random.MersenneTwister(1234), 100_000, 2)) push!(x, SVector(0, 0)) # ensure both 0 and 1 have values in, exactly. push!(x, SVector(1, 1)) - # All these binnings should give approximately same probabilities - n = 10 # boxes cover 0 - 1 in steps of slightly more than 0.1 - ε = nextfloat(0.1, 2) # this guarantees that we get the same as the `n` above! + # All these binnings give exactly the same ranges because of the 0-1 values + # The value 1 is always included, due to the call of `nextfloat` when making + # ranges for the `FixedRectangularBinning` + n = 10 + ε = nextfloat(0.1) + + # All following binnings are equivalent binnings = [ - n, - ε, RectangularBinning(n), + RectangularBinning(n, true), RectangularBinning(ε), RectangularBinning([n, n]), - RectangularBinning([ε, ε]) + RectangularBinning([ε, ε]), + FixedRectangularBinning(range(0, nextfloat(1.0); length = n+1), 2), + FixedRectangularBinning(range(0, nextfloat(1.0); length = n+1), 2, true), + FixedRectangularBinning( + (range(0, nextfloat(1.0); step = ε), range(0, nextfloat(1.0); step = ε)) + ), + ] for bin in binnings - @testset "bin isa $(typeof(bin))" begin + @testset "bin isa $(nameof(typeof(bin)))" begin est = ValueHistogram(bin) + out = outcome_space(est, x) + @test length(out) == n^2 + p = probabilities(est, x) + # all bins are covered due to random data @test length(p) == 100 + # ensure uniform coverage since input is uniformly random @test all(e -> 0.009 ≤ e ≤ 0.011, p) p2, o = probabilities_and_outcomes(est, x) @@ -34,14 +48,16 @@ using Random @test all(x -> x < 1, maximum(o)) o2 = outcomes(est, x) @test o2 == o - end - end - @testset "Check rogue 1s" begin - b = RectangularBinning(0.1) # no `nextfloat` here, so the rogue (1, 1) is in extra bin! - p = probabilities(ValueHistogram(b), x) - @test length(p) == 100 + 1 - @test p[end] ≈ 1/100_000 atol = 1e-5 + ospace = outcome_space(est, x) + @test ospace isa Matrix{SVector{2, Float64}} + @test size(ospace) == (n,n) + @test SVector(0.0, 0.0) ∈ ospace + + # ensure 1 is included, and must also be in the last bin + rbe = RectangularBinEncoding(bin, x) + @test encode(rbe, SVector(1.0, 1.0)) == n^2 + end end @testset "vector" begin @@ -56,51 +72,42 @@ using Random end end - # An extra bin might appear due to roundoff error after using nextfloat when - # constructing `RectangularBinEncoding`s. - # The following tests ensure with *some* certainty that this does not occur. - @testset "Rogue extra bins" begin - # Concrete examples where a rogue extra bin has appeared. - x1 = [0.5213236385155418, 0.03516318860292644, 0.5437726723245310, 0.52598710966469610, 0.34199879802511246, 0.6017129426606275, 0.6972844365031351, 0.89163995617220900, 0.39009862510518045, 0.06296038912844315, 0.9897176284081909, 0.7935001082966890, 0.890198448900077700, 0.11762640519877565, 0.7849413168095061, 0.13768932585886573, 0.50869900547793430, 0.18042178201388548, 0.28507312391861270, 0.96480406570924970] - x2 = [0.4125754262679051, 0.52844411982339560, 0.4535277505543355, 0.25502420827802674, 0.77862522996085940, 0.6081939026664078, 0.2628674795466387, 0.18846258495465185, 0.93320375283233840, 0.40093871561247874, 0.8032730760974603, 0.3531608285217499, 0.018436525139752136, 0.55541857934068420, 0.9907521337888632, 0.15382361136212420, 0.01774321666660561, 0.67569337507728300, 0.06130971689608822, 0.31417161558476836] - N = 10 - b = RectangularBinning(N) - rb1 = RectangularBinEncoding(b, x1; n_eps = 1) - rb2 = RectangularBinEncoding(b, x1; n_eps = 2) - @test encode(rb1, maximum(x1)) == -1 # shouldn't occur, but does when tolerance is too low - @test encode(rb2, maximum(x1)) == 10 - - rb1 = RectangularBinEncoding(b, x2; n_eps = 1) - rb2 = RectangularBinEncoding(b, x2; n_eps = 2) - @test encode(rb1, maximum(x2)) == -1 # shouldn't occur, but does when tolerance is too low - @test encode(rb2, maximum(x2)) == 10 + @testset "convenience" begin + @test ValueHistogram(n) == ValueHistogram(RectangularBinning(n)) + @test ValueHistogram(ε) == ValueHistogram(RectangularBinning(ε)) end end +@testset "Encodings, edge cases" begin + # Encoding has been practically tested fully in the codes above. + # here we just ensure the API works as expected and edge cases also + # work as expected. + # Concrete examples where a rogue extra bin has appeared. + x1 = [0.5213236385155418, 0.03516318860292644, 0.5437726723245310, 0.52598710966469610, 0.34199879802511246, 0.6017129426606275, 0.6972844365031351, 0.89163995617220900, 0.39009862510518045, 0.06296038912844315, 0.9897176284081909, 0.7935001082966890, 0.890198448900077700, 0.11762640519877565, 0.7849413168095061, 0.13768932585886573, 0.50869900547793430, 0.18042178201388548, 0.28507312391861270, 0.96480406570924970] + N = 10 + + b1 = RectangularBinning(N) + rb1 = RectangularBinEncoding(RectangularBinning(N, false), x1) + rb2 = RectangularBinEncoding(RectangularBinning(N, true), x1) + + # With low accuracy we get this rounding error + @test encode(rb1, maximum(x1)) == -1 + @test encode(rb2, maximum(x1)) == 10 + + x2 = [0.4125754262679051, 0.52844411982339560, 0.4535277505543355, 0.25502420827802674, 0.77862522996085940, 0.6081939026664078, 0.2628674795466387, 0.18846258495465185, 0.93320375283233840, 0.40093871561247874, 0.8032730760974603, 0.3531608285217499, 0.018436525139752136, 0.55541857934068420, 0.9907521337888632, 0.15382361136212420, 0.01774321666660561, 0.67569337507728300, 0.06130971689608822, 0.31417161558476836] + rb1 = RectangularBinEncoding(RectangularBinning(N, false), x2) + rb2 = RectangularBinEncoding(RectangularBinning(N, true), x2) + @test encode(rb1, maximum(x2)) == -1 + @test encode(rb2, maximum(x2)) == 10 + + # and a final analytic test with decode + bin = FixedRectangularBinning(range(0, 1.0; length = 11), 2, true) + rbc = RectangularBinEncoding(bin) + @test encode(rbc, SVector(0.0, 0.0)) == 1 + @test encode(rbc, SVector(1.0, 1.0)) == -1 + @test decode(rbc, 1) == SVector(0.0, 0.0) + @test decode(rbc, 100) == SVector(0.9, 0.9) +end -@testset "Fixed Rectangular binning" begin - - x = Dataset(rand(Random.MersenneTwister(1234), 100_000, 2)) - push!(x, SVector(0, 0)) # ensure both 0 and 1 have values in, exactly. - push!(x, SVector(1, 1)) - # All these binnings should give approximately same probabilities - n = 10 # boxes cover 0 - 1 in steps of slightly more than 0.1 - ε = nextfloat(0.1, 2) # this guarantees that we get the same as the `n` above! - - bin = FixedRectangularBinning(0.0, 1.0, n, 2) - - est = ValueHistogram(bin) - p = probabilities(est, x) - @test length(p) == 100 - @test all(e -> 0.009 ≤ e ≤ 0.011, p) - - p2, o = probabilities_and_outcomes(est, x) - @test p2 == p - @test o isa Vector{SVector{2, Float64}} - @test length(o) == length(p) - @test all(x -> x < 1, maximum(o)) - o2 = outcomes(est, x) - @test o2 == o -end From dd73c8c9ecb4808dbf8d433d5284ad4fa2d8baf4 Mon Sep 17 00:00:00 2001 From: George Datseris Date: Sat, 14 Jan 2023 15:03:56 +0200 Subject: [PATCH 16/29] more clarity in tests --- test/probabilities/estimators/value_histogram.jl | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/test/probabilities/estimators/value_histogram.jl b/test/probabilities/estimators/value_histogram.jl index f12a1b9d9..aac1d37f1 100644 --- a/test/probabilities/estimators/value_histogram.jl +++ b/test/probabilities/estimators/value_histogram.jl @@ -12,9 +12,10 @@ using Random # The value 1 is always included, due to the call of `nextfloat` when making # ranges for the `FixedRectangularBinning` n = 10 - ε = nextfloat(0.1) + ε = nextfloat(0.1) # when dividing 0-nextfloat(1) into 10, you get this width # All following binnings are equivalent + # (`nextfloat` is necessary in Fixed, due to the promise given in the regular) binnings = [ RectangularBinning(n), RectangularBinning(n, true), @@ -26,8 +27,13 @@ using Random FixedRectangularBinning( (range(0, nextfloat(1.0); step = ε), range(0, nextfloat(1.0); step = ε)) ), - ] + # all reduce to these ranges (due to demanding ALL data to be in + # the histogram, i.e., also the `SVector(1,1)`) + casted_ranges = ( + 0.0:0.10000000000000002:1.0000000000000002, + 0.0:0.10000000000000002:1.0000000000000002 + ) for bin in binnings @testset "bin isa $(nameof(typeof(bin)))" begin @@ -57,6 +63,8 @@ using Random # ensure 1 is included, and must also be in the last bin rbe = RectangularBinEncoding(bin, x) @test encode(rbe, SVector(1.0, 1.0)) == n^2 + + @test rbe.ranges == casted_ranges end end From 56df9dee476698cf682191eb00dcac06991572f4 Mon Sep 17 00:00:00 2001 From: George Datseris Date: Sat, 14 Jan 2023 16:37:25 +0200 Subject: [PATCH 17/29] fix diversityy --- src/deprecations.jl | 4 ++-- src/probabilities_estimators/diversity.jl | 6 ++++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/deprecations.jl b/src/deprecations.jl index cd590d0a8..6f2c81472 100644 --- a/src/deprecations.jl +++ b/src/deprecations.jl @@ -4,9 +4,9 @@ function FixedRectangularBinning(ϵmin::NTuple{D,T}, ϵmax::NTuple{D,T}, N::Int) end function FixedRectangularBinning(ϵmin::Real, ϵmax::Real, N, D::Int = 1) if N isa Int - FixedRectangularBinning(ntuple(x-> range(ϵmin, ϵmax; length = N), D)) + FixedRectangularBinning(ntuple(x-> range(ϵmin, nextfloat(ϵmax); length = N), D)) else - FixedRectangularBinning(ntuple(x-> range(ϵmin, ϵmax; step = N), D)) + FixedRectangularBinning(ntuple(x-> range(ϵmin, nextfloat(ϵmax); step = N), D)) end end diff --git a/src/probabilities_estimators/diversity.jl b/src/probabilities_estimators/diversity.jl index 3304d1c51..7d2c1039c 100644 --- a/src/probabilities_estimators/diversity.jl +++ b/src/probabilities_estimators/diversity.jl @@ -21,7 +21,8 @@ Diversity probabilities are computed as follows. 2. Compute ``D = \\{d(\\bf x_t, \\bf x_{t+1}) \\}_{t=1}^{N-mτ-1}``, where ``d(\\cdot, \\cdot)`` is the cosine similarity between two `m`-dimensional vectors in the embedding. -3. Divide the interval `[-1, 1]` into `nbins` equally sized subintervals. +3. Divide the interval `[-1, 1]` into `nbins` equally sized subintervals + (including the value `+1`). 4. Construct a histogram of cosine similarities ``d \\in D`` over those subintervals. 5. Sum-normalize the histogram to obtain probabilities. @@ -70,5 +71,6 @@ end cosine_similarity(xᵢ, xⱼ) = sum(xᵢ .* xⱼ) / (sqrt(sum(xᵢ .^ 2)) * sqrt(sum(xⱼ .^ 2))) function encoding_for_diversity(nbins::Int) - RectangularBinEncoding(FixedRectangularBinning(-1.0, 1.0, nbins)) + binning = FixedRectangularBinning((range(-1.0, nextfloat(1.0); length = nbins),)) + return RectangularBinEncoding(binning) end From bc9fa6af02828c405910e74f2e57d5711f051a42 Mon Sep 17 00:00:00 2001 From: George Datseris Date: Sat, 14 Jan 2023 16:37:35 +0200 Subject: [PATCH 18/29] remove `transferoperatorencoder` in favor of `RectangularBinEncoding` --- .../transfer_operator/transfer_operator.jl | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/src/probabilities_estimators/transfer_operator/transfer_operator.jl b/src/probabilities_estimators/transfer_operator/transfer_operator.jl index 9ca53b850..ed30d4b2a 100644 --- a/src/probabilities_estimators/transfer_operator/transfer_operator.jl +++ b/src/probabilities_estimators/transfer_operator/transfer_operator.jl @@ -179,17 +179,6 @@ struct TransferOperatorApproximationRectangular{ visitors::Vector{Vector{Int}} end -function transferoperatorencoder( - binning::Union{FixedRectangularBinning, RectangularBinning}, x::AbstractDataset -) - if binning isa FixedRectangularBinning - return RectangularBinEncoding(binning) - elseif binning isa RectangularBinning - return RectangularBinEncoding(binning, x) - else - throw(ArgumentError("Binning $(typeof(binning)) not supported for transfer operator")) - end -end """ transferoperator(pts::AbstractDataset, binning::RectangularBinning) → TransferOperatorApproximationRectangular @@ -217,7 +206,7 @@ function transferoperator(pts::AbstractDataset{D, T}, boundary_condition = :circular) where {D, T<:Real} L = length(pts) - encoder = transferoperatorencoder(binning, pts) + encoder = RectangularBinEncoding(binning, pts) # The L points visits a total of L bins, which are the following bins (referenced # here as cartesian coordinates, not absolute bins): @@ -479,7 +468,7 @@ function transfermatrix(iv::InvariantMeasure) end function probabilities_and_outcomes(est::TransferOperator, x::Array_or_Dataset) - encoder = transferoperatorencoder(est.binning, x) + encoder = RectangularBinEncoding(est.binning, x) to = transferoperator(x, encoder.binning) probs = invariantmeasure(to).ρ From 90f654153318037f85cf5fccb59f8c5815b9a6b0 Mon Sep 17 00:00:00 2001 From: George Datseris Date: Sat, 14 Jan 2023 16:42:55 +0200 Subject: [PATCH 19/29] add catesian bin index internal function --- .../rectangular_binning.jl | 29 ++++++++++++++----- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/src/encoding_implementations/rectangular_binning.jl b/src/encoding_implementations/rectangular_binning.jl index 118630fad..8c4178b58 100644 --- a/src/encoding_implementations/rectangular_binning.jl +++ b/src/encoding_implementations/rectangular_binning.jl @@ -179,14 +179,7 @@ end # encode/decode ################################################################## function encode(e::RectangularBinEncoding, point) - ranges = e.ranges - if e.precise - # Don't know how to make this faster unfurtunately... - cartidx = CartesianIndex(map(searchsortedlast, ranges, Tuple(point))) - else - bin = floor.(Int, (point .- e.mini) ./ e.widths) .+ 1 - cartidx = CartesianIndex(Tuple(bin)) - end + # We have decided on the arbitrary convention that out of bound points # will get the special symbol `-1`. Erroring doesn't make sense as it is expected # that for fixed histograms there may be points outside of them. @@ -197,6 +190,26 @@ function encode(e::RectangularBinEncoding, point) end end +""" + cartesian_bin_index(e::RectangularBinEncoding, point::SVector) + +Internal function called by `encode`. Returns the cartesian index of +the given `point` within the binning encapsulated in `e`. +""" +function cartesian_bin_index(e::RectangularBinEncoding, point) + ranges = e.ranges + if e.precise + # Don't know how to make this faster unfurtunately... + cartidx = CartesianIndex(map(searchsortedlast, ranges, Tuple(point))) + else + bin = floor.(Int, (point .- e.mini) ./ e.widths) .+ 1 + cartidx = CartesianIndex(Tuple(bin)) + end + return cartidx +end + + + function decode(e::RectangularBinEncoding, bin::Int) if checkbounds(Bool, e.ci, bin) @inbounds cartesian = e.ci[bin] From 95557852b6cbd91e38e976ab18306812207382d4 Mon Sep 17 00:00:00 2001 From: George Datseris Date: Sat, 14 Jan 2023 16:43:51 +0200 Subject: [PATCH 20/29] actually call the function --- src/encoding_implementations/rectangular_binning.jl | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/encoding_implementations/rectangular_binning.jl b/src/encoding_implementations/rectangular_binning.jl index 8c4178b58..18d83d885 100644 --- a/src/encoding_implementations/rectangular_binning.jl +++ b/src/encoding_implementations/rectangular_binning.jl @@ -179,7 +179,7 @@ end # encode/decode ################################################################## function encode(e::RectangularBinEncoding, point) - + cartidx = cartesian_bin_index(e, point) # We have decided on the arbitrary convention that out of bound points # will get the special symbol `-1`. Erroring doesn't make sense as it is expected # that for fixed histograms there may be points outside of them. @@ -193,8 +193,9 @@ end """ cartesian_bin_index(e::RectangularBinEncoding, point::SVector) -Internal function called by `encode`. Returns the cartesian index of +Return the cartesian index of the given `point` within the binning encapsulated in `e`. +Internal function called by `encode`. """ function cartesian_bin_index(e::RectangularBinEncoding, point) ranges = e.ranges @@ -208,8 +209,6 @@ function cartesian_bin_index(e::RectangularBinEncoding, point) return cartidx end - - function decode(e::RectangularBinEncoding, bin::Int) if checkbounds(Bool, e.ci, bin) @inbounds cartesian = e.ci[bin] From 976ee77d3b11b9fed6fd9646d65a3b9af7fc5602 Mon Sep 17 00:00:00 2001 From: George Datseris Date: Sat, 14 Jan 2023 16:47:25 +0200 Subject: [PATCH 21/29] fix diversity: given bins + 1 to the range --- src/deprecations.jl | 4 ++-- src/probabilities_estimators/diversity.jl | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/deprecations.jl b/src/deprecations.jl index 6f2c81472..8a89fbcb4 100644 --- a/src/deprecations.jl +++ b/src/deprecations.jl @@ -4,9 +4,9 @@ function FixedRectangularBinning(ϵmin::NTuple{D,T}, ϵmax::NTuple{D,T}, N::Int) end function FixedRectangularBinning(ϵmin::Real, ϵmax::Real, N, D::Int = 1) if N isa Int - FixedRectangularBinning(ntuple(x-> range(ϵmin, nextfloat(ϵmax); length = N), D)) + FixedRectangularBinning(ntuple(x-> range(ϵmin, nextfloat(float(ϵmax)); length = N), D)) else - FixedRectangularBinning(ntuple(x-> range(ϵmin, nextfloat(ϵmax); step = N), D)) + FixedRectangularBinning(ntuple(x-> range(ϵmin, nextfloat(float(ϵmax)); step = N), D)) end end diff --git a/src/probabilities_estimators/diversity.jl b/src/probabilities_estimators/diversity.jl index 7d2c1039c..c74c4840c 100644 --- a/src/probabilities_estimators/diversity.jl +++ b/src/probabilities_estimators/diversity.jl @@ -49,7 +49,7 @@ end function probabilities_and_outcomes(est::Diversity, x::AbstractVector{T}) where T <: Real ds, rbc = similarities_and_binning(est, x) - return probabilities_and_outcomes(ValueHistogram(rbc.binning), ds) + return probabilities_and_outcomes(rbc, ds) end outcome_space(est::Diversity) = outcome_space(encoding_for_diversity(est.nbins)) @@ -71,6 +71,6 @@ end cosine_similarity(xᵢ, xⱼ) = sum(xᵢ .* xⱼ) / (sqrt(sum(xᵢ .^ 2)) * sqrt(sum(xⱼ .^ 2))) function encoding_for_diversity(nbins::Int) - binning = FixedRectangularBinning((range(-1.0, nextfloat(1.0); length = nbins),)) + binning = FixedRectangularBinning((range(-1.0, nextfloat(1.0); length = nbins+1),)) return RectangularBinEncoding(binning) end From 14e534aa608822f13b40e29f4d9e23a633320423 Mon Sep 17 00:00:00 2001 From: George Datseris Date: Sat, 14 Jan 2023 16:47:33 +0200 Subject: [PATCH 22/29] allow dispatch to encoding --- src/probabilities_estimators/value_histogram.jl | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/probabilities_estimators/value_histogram.jl b/src/probabilities_estimators/value_histogram.jl index 59ff304b4..be4b45ffb 100644 --- a/src/probabilities_estimators/value_histogram.jl +++ b/src/probabilities_estimators/value_histogram.jl @@ -61,6 +61,9 @@ end function probabilities_and_outcomes(est::ValueHistogram, x) encoding = RectangularBinEncoding(est.binning, x) + return probabilities_and_outcomes(encoding, x) +end +function probabilities_and_outcomes(encoding::RectangularBinEncoding, x) probs, bins = fasthist(encoding, x) # bins are integers here unique!(bins) # `bins` is already sorted from `fasthist!` # Here we transfor the cartesian coordinate based bins into data unit bins: From 65faafd9074efc892538c96a153565f57689fbec Mon Sep 17 00:00:00 2001 From: George Datseris Date: Sat, 14 Jan 2023 16:50:53 +0200 Subject: [PATCH 23/29] fix incorrect definition of outcome_space in TrOp --- .../transfer_operator/transfer_operator.jl | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/probabilities_estimators/transfer_operator/transfer_operator.jl b/src/probabilities_estimators/transfer_operator/transfer_operator.jl index ed30d4b2a..df30f3ede 100644 --- a/src/probabilities_estimators/transfer_operator/transfer_operator.jl +++ b/src/probabilities_estimators/transfer_operator/transfer_operator.jl @@ -481,5 +481,4 @@ function probabilities_and_outcomes(est::TransferOperator, x::Array_or_Dataset) return probs, outcomes end -outcome_space(x, est::TransferOperator) = outcome_space(x, est.binning) -total_outcomes(x, est::TransferOperator) = total_outcomes(x, est.binning) +outcome_space(est::TransferOperator, x) = outcome_space(est.encoder, x) From 77bb880b9d9ddad445bd5edf05016f3967f724dd Mon Sep 17 00:00:00 2001 From: George Datseris Date: Sat, 14 Jan 2023 16:53:28 +0200 Subject: [PATCH 24/29] fix some tests of TrOp... --- .../transfer_operator/transfer_operator.jl | 3 +-- test/probabilities/estimators/transfer_operator.jl | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/probabilities_estimators/transfer_operator/transfer_operator.jl b/src/probabilities_estimators/transfer_operator/transfer_operator.jl index df30f3ede..13b9a0f5e 100644 --- a/src/probabilities_estimators/transfer_operator/transfer_operator.jl +++ b/src/probabilities_estimators/transfer_operator/transfer_operator.jl @@ -468,8 +468,7 @@ function transfermatrix(iv::InvariantMeasure) end function probabilities_and_outcomes(est::TransferOperator, x::Array_or_Dataset) - encoder = RectangularBinEncoding(est.binning, x) - to = transferoperator(x, encoder.binning) + to = transferoperator(x, est.binning) probs = invariantmeasure(to).ρ # Note: bins are *not* sorted. They occur in the order of first appearance, according diff --git a/test/probabilities/estimators/transfer_operator.jl b/test/probabilities/estimators/transfer_operator.jl index 06c061f07..933c11581 100644 --- a/test/probabilities/estimators/transfer_operator.jl +++ b/test/probabilities/estimators/transfer_operator.jl @@ -9,7 +9,7 @@ binnings = [ RectangularBinning(0.2), RectangularBinning([2, 3]), RectangularBinning([0.2, 0.3]), - FixedRectangularBinning(0, 1, 5, 2) + FixedRectangularBinning(rage(0, 1; length = 5), 2) ] # There's not easy way of constructing an analytical example for the resulting From 201f5042cb371f6faa8d6102240b37dd74405ca6 Mon Sep 17 00:00:00 2001 From: George Datseris Date: Sat, 14 Jan 2023 16:56:10 +0200 Subject: [PATCH 25/29] remove completely unused computations from function in TrOp... --- .../transfer_operator/transfer_operator.jl | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/probabilities_estimators/transfer_operator/transfer_operator.jl b/src/probabilities_estimators/transfer_operator/transfer_operator.jl index 13b9a0f5e..11b2928b2 100644 --- a/src/probabilities_estimators/transfer_operator/transfer_operator.jl +++ b/src/probabilities_estimators/transfer_operator/transfer_operator.jl @@ -439,10 +439,6 @@ function invariantmeasure(to::TransferOperatorApproximationRectangular; distribution = distribution ./ colsum_distribution end - # Find partition elements with strictly positive measure. - δ = tolerance/size(to.transfermatrix, 1) - inds_nonzero = findall(distribution .> δ) - # Extract the elements of the invariant measure corresponding to these indices return InvariantMeasure(to, Probabilities(distribution)) end From bb0477473c5b2e342d7c20492da475515e0602bb Mon Sep 17 00:00:00 2001 From: George Datseris Date: Sat, 14 Jan 2023 16:58:03 +0200 Subject: [PATCH 26/29] remove more completely unused codfe from TrOp... --- .../transfer_operator/transfer_operator.jl | 2 -- test/probabilities/estimators/transfer_operator.jl | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/src/probabilities_estimators/transfer_operator/transfer_operator.jl b/src/probabilities_estimators/transfer_operator/transfer_operator.jl index 11b2928b2..b776e79b8 100644 --- a/src/probabilities_estimators/transfer_operator/transfer_operator.jl +++ b/src/probabilities_estimators/transfer_operator/transfer_operator.jl @@ -285,12 +285,10 @@ function transferoperator(pts::AbstractDataset{D, T}, # To which boxes do each of the visitors to bᵢ jump in the next # time step? target_bins = visits_whichbin[timeindices_visiting_pts .+ 1] - unique_target_bins = unique(target_bins) # Count how many points jump from the i-th bin to each of # the unique target bins, and use that to calculate the transition # probability from bᵢ to bⱼ. - #for j in 1:length(unique_target_bins) for (j, bᵤ) in enumerate(unique(target_bins)) n_transitions_i_to_j = sum(target_bins .== bᵤ) diff --git a/test/probabilities/estimators/transfer_operator.jl b/test/probabilities/estimators/transfer_operator.jl index 933c11581..0fe02fc7b 100644 --- a/test/probabilities/estimators/transfer_operator.jl +++ b/test/probabilities/estimators/transfer_operator.jl @@ -9,7 +9,7 @@ binnings = [ RectangularBinning(0.2), RectangularBinning([2, 3]), RectangularBinning([0.2, 0.3]), - FixedRectangularBinning(rage(0, 1; length = 5), 2) + FixedRectangularBinning(range(0, 1; length = 5), 2) ] # There's not easy way of constructing an analytical example for the resulting From e15374176a2f5cff1d45bd29fec55accf0c30479 Mon Sep 17 00:00:00 2001 From: George Datseris Date: Sat, 14 Jan 2023 17:00:46 +0200 Subject: [PATCH 27/29] remove more unused code and comments --- .../transfer_operator/transfer_operator.jl | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/probabilities_estimators/transfer_operator/transfer_operator.jl b/src/probabilities_estimators/transfer_operator/transfer_operator.jl index b776e79b8..8be7c8f5e 100644 --- a/src/probabilities_estimators/transfer_operator/transfer_operator.jl +++ b/src/probabilities_estimators/transfer_operator/transfer_operator.jl @@ -117,8 +117,7 @@ end # for datasets of >100 000+ points function inds_in_terms_of_unique_sorted(x) # assumes sorted @assert issorted(x) - U = unique(x) - N, Nu = length(x), length(U) + N = length(x) prev = view(x, 1) inds = zeros(Int, N) uidx = 1 @@ -236,10 +235,8 @@ function transferoperator(pts::AbstractDataset{D, T}, n_visitsᵢ::Int = 0 if boundary_condition == :circular - #warn("Using circular boundary condition") append!(visits_whichbin, [1]) elseif boundary_condition == :random - #warn("Using random circular boundary condition") append!(visits_whichbin, [rand(1:length(visits_whichbin))]) else error("Boundary condition $(boundary_condition) not implemented") From 937973c944222cd7f0fcbd29fd21bd9eb6f086cf Mon Sep 17 00:00:00 2001 From: George Datseris Date: Sat, 14 Jan 2023 17:24:26 +0200 Subject: [PATCH 28/29] ensre ranges are large enough to cover data in fixed witdth --- .../rectangular_binning.jl | 24 ++++++++++++------- .../estimators/value_histogram.jl | 14 +++++++++++ 2 files changed, 30 insertions(+), 8 deletions(-) diff --git a/src/encoding_implementations/rectangular_binning.jl b/src/encoding_implementations/rectangular_binning.jl index 18d83d885..de161f430 100644 --- a/src/encoding_implementations/rectangular_binning.jl +++ b/src/encoding_implementations/rectangular_binning.jl @@ -32,9 +32,8 @@ Binning instructions are deduced from the type of `ϵ` as follows: 4. `ϵ::Vector{Float64}` divides the i-th coordinate axis into intervals of fixed size `ϵ[i]`, starting from the axis minima until the data is completely covered by boxes. -To ensure all data are covered, the `nextfloat` of the data maximum value is used -as the maximum along each dimension. However this will lead to 100% accurate results -only when `precise = true`. +`RectangularBinning` ensures all input data are covered by extending the created +ranges if need be. """ struct RectangularBinning{E} <: AbstractBinning ϵ::E @@ -156,13 +155,22 @@ function FixedRectangularBinning(b::RectangularBinning, x) T = eltype(x) ϵ = b.ϵ mini, maxi = minmaxima(x) - # use `nextfloat` to ensure all data are covered! - maxi = nextfloat.(maxi) - v = ones(SVector{D,T}) if ϵ isa Float64 || ϵ isa AbstractVector{<:AbstractFloat} - widths = SVector{D,T}(ϵ .* v) - ranges = ntuple(i -> range(mini[i], maxi[i]; step = widths[i]), D) + widths = SVector{D,T}(ϵ .* ones(SVector{D,T})) + # To ensure all points are guaranteed to be covered, we add the width + # to the max, if the max isn't included in the resulting range + function covering_range(i) + r = range(mini[i], maxi[i]; step = widths[i]) + if maxi[i] ∉ r + return range(mini[i], maxi[i] + widths[i]; step = widths[i]) + else + return r + end + end + ranges = ntuple(i -> covering_range(i), D) elseif ϵ isa Int || ϵ isa Vector{Int} + # use `nextfloat` to ensure all data are covered! + maxi = nextfloat.(maxi) # We add one, because the user input specifies the number of bins, # and the number of bins is the range length - 1 lengths = ϵ .* ones(SVector{D,Int}) .+ 1 diff --git a/test/probabilities/estimators/value_histogram.jl b/test/probabilities/estimators/value_histogram.jl index aac1d37f1..ea6fda248 100644 --- a/test/probabilities/estimators/value_histogram.jl +++ b/test/probabilities/estimators/value_histogram.jl @@ -118,4 +118,18 @@ end @test decode(rbc, 100) == SVector(0.9, 0.9) end +@testset "All points covered" begin + x = Dataset(rand(100, 2)) + binnings = [ + RectangularBinning(5, true), + RectangularBinning(0.2, true), + RectangularBinning([2, 4], true), + RectangularBinning([0.5, 0.25], true), + ] + for bin in binnings + rbe = RectangularBinEncoding(bin, x) + visited_bins = map(pᵢ -> encode(rbe, pᵢ), x) + @test -1 ∉ visited_bins + end +end From 68e759f304d8523413f4a404d0a3f07f604c17d5 Mon Sep 17 00:00:00 2001 From: George Datseris Date: Sat, 14 Jan 2023 17:27:31 +0200 Subject: [PATCH 29/29] add changelog entry --- CHANGELOG.md | 4 ++++ Project.toml | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e1776a4a2..bda10d584 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,10 @@ Changelog is kept with respect to version 0.11 of Entropies.jl. From version v2.0 onwards, this package has been renamed to ComplexityMeasures.jl. +## 2.4 +- Rectangular binnings have been reformed to operate based on ranges. This leads to much more intuitive bin sizes and edges. For `RectangularBinning` nothing changes, while for `FixedRectangularBinning` the ranges should be given explicitly. Backwards compatible deprecations have been added. +- This also allows for a new `precise` option that utilizes Base Julia `TwinPrecision` to make more accurate mapping of points to bins at the cost of performance. + ## 2.3 - Like differential entropies, discrete entropies now also have their own estimator type. - The approach of giving both an entropy definition, and an entropy estimator to `entropy` has been dropped. Now the entropy estimators know what definitions they are applied for. This change is a deprecation, i.e., backwards compatible. diff --git a/Project.toml b/Project.toml index 710bd61fe..b79e58b70 100644 --- a/Project.toml +++ b/Project.toml @@ -2,7 +2,7 @@ name = "ComplexityMeasures" uuid = "ab4b797d-85ee-42ba-b621-05d793b346a2" authors = "Kristian Agasøster Haaga , George Datseries " repo = "https://github.com/juliadynamics/ComplexityMeasures.jl.git" -version = "2.3.2" +version = "2.4.0" [deps] Combinatorics = "861a8166-3701-5b0c-9a16-15d98fcdc6aa"