From 66a37c95ea60c1f22e352e5f6de9b33f781005da Mon Sep 17 00:00:00 2001 From: Kristian Haaga Date: Thu, 25 Aug 2022 09:06:35 +0200 Subject: [PATCH 01/12] Walkthrough entropy --- docs/make.jl | 3 +- docs/src/Walkthrough.md | 101 +++++++++++++ docs/src/index.md | 6 + src/Entropies.jl | 3 + src/api.jl | 34 +++++ src/excess/excess_entropy.jl | 35 +++++ src/excess/walkthrough_entropy.jl | 229 ++++++++++++++++++++++++++++++ src/utils.jl | 38 +++++ test/runtests.jl | 13 +- 9 files changed, 459 insertions(+), 3 deletions(-) create mode 100644 docs/src/Walkthrough.md create mode 100644 src/api.jl create mode 100644 src/excess/excess_entropy.jl create mode 100644 src/excess/walkthrough_entropy.jl create mode 100644 src/utils.jl diff --git a/docs/make.jl b/docs/make.jl index 03d18dff6..a993ae309 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -42,7 +42,8 @@ PAGES = [ "Permutation.md", "NearestNeighbors.md", "NaiveKernel.md", - "TimeScaleMODWT.md" + "TimeScaleMODWT.md", + "Walkthrough.md" ], "Non-exported" => "nonexported.md" ] diff --git a/docs/src/Walkthrough.md b/docs/src/Walkthrough.md new file mode 100644 index 000000000..c76e9fa73 --- /dev/null +++ b/docs/src/Walkthrough.md @@ -0,0 +1,101 @@ +# Walkthrough entropy + +```@docs +WalkthroughEntropy +walkthrough_entropy +``` + +## Examples + +Here, we reproduce parts of Fig. 1 from Stoop et al. (2021). + +We start by creating some symbolic time series of length `N`. Then, because we're +generating the walkthrough entropy for multiple positions ``n``, we use +`entropygenerator` for each time series, so that initialization computations only happens +once per time series. Finally, we compute the walkthrough entropy at all positions `1:N`. + +```@example +using Entropies, PyPlot + +N = 1200 +# Generate time series +x_lowfreq = "a"^(N ÷ 3) * "b"^(N ÷ 3) * "c"^(N ÷ 3); +x_hifreq = "abc"^(N ÷ 3) +x_rw3 = rand(['a', 'b', 'c'], N) +x_rw2 = rand(['a', 'b'], N) +x_a = "a"^(N) +x_ab_2 = "ab"^(N ÷ 2) +x_ab_20 = ("a"^20*"b"^20)^(N ÷ 40) +x_ab_200 = ("a"^200*"b"^200)^(N ÷ 400) + +# Initialize entropy generators +method = WalkthroughEntropy() +e_lofreq = entropygenerator(x_lowfreq, method); +e_hifreq = entropygenerator(x_hifreq, method); +e_rw3 = entropygenerator(x_rw3, method); +e_rw2 = entropygenerator(x_rw2, method); +e_a = entropygenerator(x_a, method); +e_ab_2 = entropygenerator(x_ab_2, method); +e_ab_20 = entropygenerator(x_ab_20, method); +e_ab_200 = entropygenerator(x_ab_200, method); + +# Compute walkthrough entropies through positions 1:N +base = MathConstants.e +hs_lofreq = [e_lofreq(i, base = base) for i = 1:N] +hs_hifreq = [e_hifreq(i, base = base) for i = 1:N] +hs_wn3 = [e_rw3(i, base = base) for i = 1:N] +hs_wn2 = [e_rw2(i, base = base) for i = 1:N] +hs_a = [e_a(i, base = base) for i = 1:N] +hs_ab_2 = [e_ab_2(i, base = base) for i = 1:N] +hs_ab_20 = [e_ab_20(i, base = base) for i = 1:N] +hs_ab_200 = [e_ab_200(i, base = base) for i = 1:N] + +# Plot +ns = (1:N |> collect) ./ N +unit = "nats" + + +f = figure(figsize = (10,7)) +ax = subplot(231) +plot(xlabel = "n/N", ylabel = "h [$unit]"); +plot(ns, hs_hifreq, label = "abcabc...") +plot(ns, hs_lofreq, label = "aa...bb..ccc") +xlabel("n/N") +ylabel("h ($unit)") +legend() + +ax = subplot(232) +plot(xlabel = "n/N", ylabel = "h [$unit]"); +plot(ns, hs_wn3, label = "RW (k = 3)") +xlabel("n/N") +ylabel("h ($unit)") +legend() + +ax = subplot(234) +plot(ns, hs_a, label = "k = 1") +plot(ns, hs_ab_2, label = "k = 2, T = 2") +plot(ns, hs_ab_20, label = "k = 2, T = 20") +xlabel("n/N") +ylabel("h ($unit)") +legend() + +ax = subplot(235) +plot(ns, hs_a, label = "k = 1") +plot(ns, hs_ab_2, label = "k = 2, T = 2") +plot(ns, hs_ab_200, label = "k = 2, T = 200") +xlabel("n/N") +ylabel("h ($unit)") +legend() + +ax = subplot(236) +plot(ns, hs_wn2, label = "RW (k = 2)") +plot(ns, hs_ab_2, label = "k = 2, T = 2") +xlabel("n/N") +ylabel("h ($unit)") + +legend() +tight_layout() +PyPlot.savefig("walkthrough_entropy.png") +``` + +![Walkthrough entropy](walkthrough_entropy.png) diff --git a/docs/src/index.md b/docs/src/index.md index 4d29d68ed..88227635b 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -36,6 +36,12 @@ ProbabilitiesEstimator Entropies.genentropy ``` +## Reusable entropy generator + +```@docs +entropygenerator +``` + ## Fast histograms ```@docs diff --git a/src/Entropies.jl b/src/Entropies.jl index 8c5662747..48f65add7 100644 --- a/src/Entropies.jl +++ b/src/Entropies.jl @@ -1,5 +1,7 @@ module Entropies include("core.jl") + include("api.jl") + include("utils.jl") include("histogram_estimation.jl") include("counting_based/CountOccurrences.jl") include("symbolic/symbolic.jl") @@ -7,4 +9,5 @@ module Entropies include("kerneldensity/kerneldensity.jl") include("wavelet/wavelet.jl") include("nearest_neighbors/nearest_neighbors.jl") + include("excess/walkthrough_entropy.jl") end diff --git a/src/api.jl b/src/api.jl new file mode 100644 index 000000000..79fe0b658 --- /dev/null +++ b/src/api.jl @@ -0,0 +1,34 @@ +using Random +export EntropyGenerator +export entropygenerator + + +struct EntropyGenerator{S <: EntropyEstimator, X, A, R <: AbstractRNG} + method::S # method with its input parameters + x::X # input data + init::A # pre-initialized things that speed up entropy estimation + rng::R # random number generator object +end + +""" + entropygenerator(x, method::EntropyEstimator[, rng]) → sg::EntropyGenerator + +Initialize a generator that computes entropies of `x` on demand, based on the given `method`. +This is efficient, because for most methods some things can be initialized and reused +for every computation. Optionally you can provide an `rng::AbstractRNG` object that will +control random number generation and hence establish reproducibility of the +generated entropy values, if they rely on random number generation. By default +`Random.default_rng()` is used. + +Note: not all entropy estimators have this functionality enabled yet. The documentation +strings for individual methods indicate whether entropy generators are available. + +To compute entropy using a generator, call `eg` as a function with the optional `base` +argument, e.g. + +```julia +eg = entropygenerator(x, method) +h = eg(; base = 2) +``` +""" +function entropygenerator end diff --git a/src/excess/excess_entropy.jl b/src/excess/excess_entropy.jl new file mode 100644 index 000000000..649da93d4 --- /dev/null +++ b/src/excess/excess_entropy.jl @@ -0,0 +1,35 @@ + + +struct ExcessEntropy <: EntropyMethod end + +# TODO: not finished. lacks proper normalization of 𝐧, but the paper is a bit unclear +# about this step. +function entropygenerator(x, method::ExcessEntropy, rng = Random.default_rng()) + # 𝐔: unique elements in `x` + # 𝐍: the corresponding frequencies (𝐍[i] = # times 𝐔[i] occurs). + 𝐔, 𝐍 = vec_countmap(x) + + # 𝐧 is a Vector{Vector{Int}}. 𝐧[i][j] contains the number of times the unique element + # 𝐔[j] appears in the subsequence `x[1:i]`. + 𝐧 = visitations_per_position(x, 𝐔) + + init = ( + 𝐔 = 𝐔, + 𝐍 = 𝐍, + 𝐧 = 𝐧, + # ... additional normalized 𝐧 needed for excess entropy. + ) + + return EntropyGenerator(method, x, init, rng) +end + + +""" + excess_entropy(x) + +Compute the length-normalized excess entropy of `x`. +""" +function excess_entropy(x, n; base = MathConstants.e) + g = entropygenerator(x, ExcessEntropy()) + return g(n; base = base) +end diff --git a/src/excess/walkthrough_entropy.jl b/src/excess/walkthrough_entropy.jl new file mode 100644 index 000000000..1fd89197f --- /dev/null +++ b/src/excess/walkthrough_entropy.jl @@ -0,0 +1,229 @@ +export walkthrough_entropy +export WalkthroughEntropy + +""" + WalkthroughEntropy + +The walkthrough entropy method (Stoop et al., 2021)[^Stoop2021]. + + +Does not work with `genentropy`, but combination with `entropygenerator`, we can use +this estimator to compute walkthrough entropy for multiple `n` with a single initialization +step (instead of initializing once per `n`). + +## Examples + +```jldoctest; setup = :(using Entropies) +julia> x = "abc"^2 +"abcabc" + +julia> wg = entropygenerator(x, WalkthroughEntropy()); + +julia> [wg(n) for n = 1:length(x)] +6-element Vector{Float64}: + 1.0986122886681098 + 1.3217558399823195 + 0.9162907318741551 + 1.3217558399823195 + 1.0986122886681098 + -0.0 +``` + +See also: [`entropygenerator`](@ref). + +[^Stoop2021]: Stoop, R. L., Stoop, N., Kanders, K., & Stoop, R. (2021). Excess entropies suggest the physiology of neurons to be primed for higher-level computation. Physical Review Letters, 127(14), 148101. +""" +struct WalkthroughEntropy <: EntropyEstimator end + +# write into the pre-allocated 𝐧ₙ (a vector containing counts for each unique +# state up to index n). +function count_occurrences_up_to_n!(𝐧ₙ, n, x, unique_symbols) + n <= length(x) || throw(ArgumentError("n cannot be larger than length(x)")) + + for i in eachindex(unique_symbols) + s = unique_symbols[i] + for j in 1:n + if (x[j] == s) + 𝐧ₙ[i] += 1 + end + end + end + return 𝐧ₙ +end + +function visitations_per_position(x, unique_symbols) + N = length(x) + + 𝐧 = [zeros(BigInt, length(unique_symbols)) for i = 1:N] + for n in 1:N + count_occurrences_up_to_n!(𝐧[n], n, x, unique_symbols) + end + + return 𝐧 +end + + +function entropygenerator(x, method::WalkthroughEntropy, rng = Random.default_rng()) + # 𝐔: unique elements in `x` + # 𝐍: the corresponding frequencies (𝐍[i] = # times 𝐔[i] occurs). + 𝐔, 𝐍 = vec_countmap(x) + + # 𝐧 is a Vector{Vector{Int}}. 𝐧[i][j] contains the number of times the unique element + # 𝐔[j] appears in the subsequence `x[1:i]`. + 𝐧 = visitations_per_position(x, 𝐔) + + init = ( + 𝐔 = 𝐔, + 𝐍 = 𝐍, + 𝐧 = 𝐧, + ) + + return EntropyGenerator(method, x, init, rng) +end + +# NB: this function, and the following function, don't actually work when n is large - the +# factorial blows up. I've just implemented them for the sake of understanding the formulas. +function outer_weight(n::Int, 𝐍) + factorial(BigInt(n)) / prod(factorial.(BigInt.(𝐍))) +end + +# Also doesn't work in practice, because factorial(N - n) blows up. +function inner_weight(n::Int, N::Int, 𝐍, 𝐧ₙ) + s = length(𝐍) + + denominator_elements = zeros(s) + + for j = 1:s + # The total number of occurrences for the j-th state. + Nⱼ = 𝐍[j] + + # The number of times the j-th state has been visited up to position `n` + nⱼ = 𝐧ₙ[j] + + denominator_elements[j] = factorial(BigInt(Nⱼ - nⱼ)) + end + return factorial(BigInt(N) - BigInt(n)) / prod(denominator_elements) +end + +""" + walkthrough_prob(x, n::Int) + +The walk-through probability (Stoop et al., 2021)[^Stoop2021] for a symbol sequence `x` +(can be a string, or categorical sequence (e.g. integer vector or `Dataset` of state +vectors). + +- `n`: The position within the sequence, where `n ∈ [1, 2, …, N]` and `N` is the total + number of elements in the sequence. + +[^Stoop2021]: Stoop, R. L., Stoop, N., Kanders, K., & Stoop, R. (2021). Excess entropies suggest the physiology of neurons to be primed for higher-level computation. Physical Review Letters, 127(14), 148101. +""" +function walkthrough_prob(x, n::Int, 𝐍, 𝐧) + 𝐧ₙ = 𝐧[n] + N = length(x) + + 𝐏 = 𝐍 ./ N + + # First weight is simple. + w1 = outer_weight(n, 𝐍) + w2 = inner_weight(n, N, 𝐍, 𝐧ₙ) + + c2 = [(pⱼ^nⱼ) * w2 for (pⱼ, nⱼ) in zip(𝐏, 𝐧ₙ)] + c3 = [pⱼ^(𝐧[end][j] - 𝐧[n][j]) for (j, pⱼ) in enumerate(𝐏)] + + return w1 * prod(c2) * prod(c3) +end + +function conditional_walkprob(n::Int, N::Int, 𝐍, 𝐧) + if (n == 0) + return 1.0 + else + 𝐧ₙ = 𝐧[n] + s = length(𝐍) + a = prod([binomial(𝐍[j], 𝐧ₙ[j]) for j = 1:s]) + b = binomial(BigInt(N), BigInt(n)) + return a / b + end +end + +function _walkthrough_entropy(n::Int, N::Int, 𝐍, 𝐧; length_normalize = false, + base = MathConstants.e) + + if (!length_normalize) + # P(𝐧|𝐍) + p = conditional_walkprob(n, N, 𝐍, 𝐧) + return -log(base, p) + else + # P(𝐧|𝐍) + p = conditional_walkprob(n, N, 𝐍, 𝐧) + + # NB: Not sure about the normalization step. + # Why? + # Citing from paper: P(𝐧|𝐍) is the multivariate hypergeometric probability + # distribution (i.e., for sampling without replacement). It has expectation + # 𝔼(𝐧) = n𝐩. + # + # Length-normalized excess entropy is defined as + # H(n) = P(𝐧|𝐍) / P(𝔼(𝐧)|𝐍). + # + # To compute P(𝔼(𝐧)|𝐍), the elements of 𝐧 must be integers (because the binomial + # formula is used). However, 𝔼(𝐧) is in general not an integer vector, because 𝐩 is + # a probability vector, so the integer-vector product n𝐩 yields a vector of floats. + # + # I'm not exactly sure what they do in the original implementation, but here I'll + # just + #𝐄𝐧 = [ceil(StatsBase.mean(nᵢ)) for nᵢ in 𝐧] + p𝐄 = conditional_walkprob(n, N, 𝐍, 𝐧) + -log(base, p / N) + end +end + +function (eg::EntropyGenerator{<:WalkthroughEntropy})(n::Int; + length_normalize = false, base = MathConstants.e) + + 𝐔, 𝐍, 𝐧 = getfield.( + Ref(eg.init), + (:𝐔, :𝐍, :𝐧) + ) + + x = eg.x + N = length(x) + + 0 <= n <= N || throw(ArgumentError("n ∈ [1, 2, …, length(x)] is required. Got n=$n and length(x)=$(length(x))")) + + wte = _walkthrough_entropy(n, N, 𝐍, 𝐧; length_normalize = length_normalize, base = base) + return convert(Float64, wte) +end + + +""" + walkthrough_entropy(x, n::Int; base = MathConstants.e) + +Compute the walk-through entropy (Stoop et al., 2021)[^Stoop2021] to the given `base` at +position `n` for a symbol sequence `x`, where `x` can be any categorical iterable. + +If computing the walkthrough entropy for multiple `n`, use [`entropygenerator`](@ref) with + [`WalkthroughEntropy`](@ref). + +!!! info + This estimator is only available for entropy estimation. Probabilities + cannot be obtained directly. + + +## Examples + +```jldoctest; setup = :(using Entropies) +julia> x = "abc"^10 +"abcabcabcabcabcabcabcabcabcabc" + +julia> walkthrough_entropy(x, 5) +1.9512293105329133 +``` + +[^Stoop2021]: Stoop, R. L., Stoop, N., Kanders, K., & Stoop, R. (2021). Excess entropies suggest the physiology of neurons to be primed for higher-level computation. Physical Review Letters, 127(14), 148101. +""" +function walkthrough_entropy(x, n::Int; base = MathConstants.e) + g = entropygenerator(x, WalkthroughEntropy()) + # The length-normalized walkthrough entropy is the excess entropy, so here we never + # normalize. + return g(n; base = base, length_normalize = false) +end diff --git a/src/utils.jl b/src/utils.jl new file mode 100644 index 000000000..bc7ca4d6f --- /dev/null +++ b/src/utils.jl @@ -0,0 +1,38 @@ +using DelayEmbeddings +# Get characters to sort them. +vec_countmap(x::AbstractString) = vec_countmap(x |> collect) +vec_countmap(x::AbstractDataset) = vec_countmap(x.data) + +function vec_countmap(x, T = BigInt) + L = length(x) + + hist = Vector{T}() + T = eltype(x) + unique_vals = Vector{T}() + + # Reserve enough space for histogram: + sizehint!(hist, L) + sizehint!(unique_vals, L) + sx = sort(x, alg = QuickSort) + + # Fill the histogram by counting consecutive equal values: + prev_val, count = sx[1], 0 + push!(unique_vals, sx[1]) + for val in sx + if val == prev_val + count += 1 + else + push!(hist, count) + push!(unique_vals, val) + prev_val = val + count = 1 + end + end + push!(hist, count) + + # Shrink histogram capacity to fit its size: + sizehint!(hist, length(hist)) + sizehint!(unique_vals, length(unique_vals)) + + return unique_vals, hist +end diff --git a/test/runtests.jl b/test/runtests.jl index cdcd8070a..9fb21cb48 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -262,8 +262,8 @@ end D = Dataset(rand(100, 3)) @testset "Counting visits" begin - @test Entropies.marginal_visits(D, RectangularBinning(0.2), 1:2) isa Vector{Vector{Int}} - @test Entropies.joint_visits(D, RectangularBinning(0.2)) isa Vector{Vector{Int}} + @test Entropies.marginal_visits(D, RectangularBinning(0.2), 1:2) isa Vector{<:AbstractVector} + @test Entropies.joint_visits(D, RectangularBinning(0.2)) isa Vector{<:AbstractVector} end binnings = [ @@ -353,4 +353,13 @@ end @test probabilities(D, TransferOperator(binnings[i])) isa Probabilities end end + + @testset "Walkthrough entropy" begin + x = "ab"^10 + @test WalkthroughEntropy() isa WalkthroughEntropy + @test walkthrough_entropy(x, 5) isa Float64 + eg = entropygenerator(x, WalkthroughEntropy()) + @test eg isa EntropyGenerator + @test [eg(n) for n in 1:length(x)] isa Vector{Float64} + end end From 4bbae113acb620a3e90d539e91b7dc68cb40c9ed Mon Sep 17 00:00:00 2001 From: Kristian Haaga Date: Thu, 25 Aug 2022 09:25:57 +0200 Subject: [PATCH 02/12] Don't include excess entropy --- src/excess/excess_entropy.jl | 35 ----------------------------------- 1 file changed, 35 deletions(-) delete mode 100644 src/excess/excess_entropy.jl diff --git a/src/excess/excess_entropy.jl b/src/excess/excess_entropy.jl deleted file mode 100644 index 649da93d4..000000000 --- a/src/excess/excess_entropy.jl +++ /dev/null @@ -1,35 +0,0 @@ - - -struct ExcessEntropy <: EntropyMethod end - -# TODO: not finished. lacks proper normalization of 𝐧, but the paper is a bit unclear -# about this step. -function entropygenerator(x, method::ExcessEntropy, rng = Random.default_rng()) - # 𝐔: unique elements in `x` - # 𝐍: the corresponding frequencies (𝐍[i] = # times 𝐔[i] occurs). - 𝐔, 𝐍 = vec_countmap(x) - - # 𝐧 is a Vector{Vector{Int}}. 𝐧[i][j] contains the number of times the unique element - # 𝐔[j] appears in the subsequence `x[1:i]`. - 𝐧 = visitations_per_position(x, 𝐔) - - init = ( - 𝐔 = 𝐔, - 𝐍 = 𝐍, - 𝐧 = 𝐧, - # ... additional normalized 𝐧 needed for excess entropy. - ) - - return EntropyGenerator(method, x, init, rng) -end - - -""" - excess_entropy(x) - -Compute the length-normalized excess entropy of `x`. -""" -function excess_entropy(x, n; base = MathConstants.e) - g = entropygenerator(x, ExcessEntropy()) - return g(n; base = base) -end From 38f4953b37da09fb2e1a58e75f462cc041dfcb75 Mon Sep 17 00:00:00 2001 From: Kristian Haaga Date: Thu, 25 Aug 2022 09:26:42 +0200 Subject: [PATCH 03/12] Normalization does nothing --- src/excess/walkthrough_entropy.jl | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/excess/walkthrough_entropy.jl b/src/excess/walkthrough_entropy.jl index 1fd89197f..7d32cb087 100644 --- a/src/excess/walkthrough_entropy.jl +++ b/src/excess/walkthrough_entropy.jl @@ -145,6 +145,8 @@ function conditional_walkprob(n::Int, N::Int, 𝐍, 𝐧) end end +# TODO: normalization does nothing at the moment, but is only needed for excess entropy, +# so this doesn't affect the walkthrough entropy. See comment inside function. function _walkthrough_entropy(n::Int, N::Int, 𝐍, 𝐧; length_normalize = false, base = MathConstants.e) @@ -153,8 +155,8 @@ function _walkthrough_entropy(n::Int, N::Int, 𝐍, 𝐧; length_normalize = fal p = conditional_walkprob(n, N, 𝐍, 𝐧) return -log(base, p) else - # P(𝐧|𝐍) - p = conditional_walkprob(n, N, 𝐍, 𝐧) + # P(𝐧|𝐍) + p = conditional_walkprob(n, N, 𝐍, 𝐧) # NB: Not sure about the normalization step. # Why? @@ -169,11 +171,9 @@ function _walkthrough_entropy(n::Int, N::Int, 𝐍, 𝐧; length_normalize = fal # formula is used). However, 𝔼(𝐧) is in general not an integer vector, because 𝐩 is # a probability vector, so the integer-vector product n𝐩 yields a vector of floats. # - # I'm not exactly sure what they do in the original implementation, but here I'll - # just - #𝐄𝐧 = [ceil(StatsBase.mean(nᵢ)) for nᵢ in 𝐧] - p𝐄 = conditional_walkprob(n, N, 𝐍, 𝐧) - -log(base, p / N) + #𝐄𝐧 = [ceil(Int, StatsBase.mean(nᵢ)) for nᵢ in 𝐧] + #p𝐄 = conditional_walkprob(n, N, 𝐍, 𝐧) + -log(base, p) end end From 75f623de761899525b1456fbfdbab600a7e212f3 Mon Sep 17 00:00:00 2001 From: Kristian Haaga Date: Thu, 25 Aug 2022 09:40:52 +0200 Subject: [PATCH 04/12] Keep all histogram estimation in one place --- src/Entropies.jl | 1 - src/histogram_estimation.jl | 44 ++++++++++++++++++++++++++++++++++++- src/utils.jl | 38 -------------------------------- 3 files changed, 43 insertions(+), 40 deletions(-) delete mode 100644 src/utils.jl diff --git a/src/Entropies.jl b/src/Entropies.jl index 48f65add7..1d8ffc0aa 100644 --- a/src/Entropies.jl +++ b/src/Entropies.jl @@ -1,7 +1,6 @@ module Entropies include("core.jl") include("api.jl") - include("utils.jl") include("histogram_estimation.jl") include("counting_based/CountOccurrences.jl") include("symbolic/symbolic.jl") diff --git a/src/histogram_estimation.jl b/src/histogram_estimation.jl index 928ee6c1f..b396c124d 100644 --- a/src/histogram_estimation.jl +++ b/src/histogram_estimation.jl @@ -28,4 +28,46 @@ function _non0hist(x) return Probabilities(hist ./ L) end _non0hist(x::AbstractDataset) = _non0hist(x.data) -probabilities(x::AbstractDataset) = _non0hist(x.data) \ No newline at end of file +probabilities(x::AbstractDataset) = _non0hist(x.data) + + +using DelayEmbeddings + +# Get both unique elements and their counts +function vec_countmap(x, T = BigInt) + L = length(x) + + hist = Vector{T}() + T = eltype(x) + unique_vals = Vector{T}() + + # Reserve enough space for histogram: + sizehint!(hist, L) + sizehint!(unique_vals, L) + sx = sort(x, alg = QuickSort) + + # Fill the histogram by counting consecutive equal values: + prev_val, count = sx[1], 0 + push!(unique_vals, sx[1]) + for val in sx + if val == prev_val + count += 1 + else + push!(hist, count) + push!(unique_vals, val) + prev_val = val + count = 1 + end + end + push!(hist, count) + + # Shrink histogram capacity to fit its size: + sizehint!(hist, length(hist)) + sizehint!(unique_vals, length(unique_vals)) + + return unique_vals, hist +end + +# Get characters to sort them. +vec_countmap(x::AbstractString) = vec_countmap(x |> collect) +vec_countmap(x::AbstractDataset) = vec_countmap(x.data) diff --git a/src/utils.jl b/src/utils.jl deleted file mode 100644 index bc7ca4d6f..000000000 --- a/src/utils.jl +++ /dev/null @@ -1,38 +0,0 @@ -using DelayEmbeddings -# Get characters to sort them. -vec_countmap(x::AbstractString) = vec_countmap(x |> collect) -vec_countmap(x::AbstractDataset) = vec_countmap(x.data) - -function vec_countmap(x, T = BigInt) - L = length(x) - - hist = Vector{T}() - T = eltype(x) - unique_vals = Vector{T}() - - # Reserve enough space for histogram: - sizehint!(hist, L) - sizehint!(unique_vals, L) - sx = sort(x, alg = QuickSort) - - # Fill the histogram by counting consecutive equal values: - prev_val, count = sx[1], 0 - push!(unique_vals, sx[1]) - for val in sx - if val == prev_val - count += 1 - else - push!(hist, count) - push!(unique_vals, val) - prev_val = val - count = 1 - end - end - push!(hist, count) - - # Shrink histogram capacity to fit its size: - sizehint!(hist, length(hist)) - sizehint!(unique_vals, length(unique_vals)) - - return unique_vals, hist -end From ce9d94b1691810923c3161637f1245fdd566fe04 Mon Sep 17 00:00:00 2001 From: Kristian Haaga Date: Thu, 25 Aug 2022 09:50:06 +0200 Subject: [PATCH 05/12] Add method for multiple `n` --- src/excess/walkthrough_entropy.jl | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/excess/walkthrough_entropy.jl b/src/excess/walkthrough_entropy.jl index 7d32cb087..c16485df3 100644 --- a/src/excess/walkthrough_entropy.jl +++ b/src/excess/walkthrough_entropy.jl @@ -221,9 +221,18 @@ julia> walkthrough_entropy(x, 5) [^Stoop2021]: Stoop, R. L., Stoop, N., Kanders, K., & Stoop, R. (2021). Excess entropies suggest the physiology of neurons to be primed for higher-level computation. Physical Review Letters, 127(14), 148101. """ -function walkthrough_entropy(x, n::Int; base = MathConstants.e) +function walkthrough_entropy(x, n::J, base = MathConstants.e) where {J <: Integer} g = entropygenerator(x, WalkthroughEntropy()) # The length-normalized walkthrough entropy is the excess entropy, so here we never # normalize. return g(n; base = base, length_normalize = false) end + +function walkthrough_entropy(x, n::AbstractVector{J}; + base = MathConstants.e) where {J <: Integer} + + g = entropygenerator(x, WalkthroughEntropy()) + # The length-normalized walkthrough entropy is the excess entropy, so here we never + # normalize. + return [g(nᵢ; base = base, length_normalize = false) for nᵢ in n] +end From ec33801e55911d65e655607e685015728bbf7c7f Mon Sep 17 00:00:00 2001 From: Kristian Haaga Date: Thu, 25 Aug 2022 10:00:11 +0200 Subject: [PATCH 06/12] Separate (unused) walkthrough probability --- .../walkthrough_entropy.jl | 54 ------------------- src/walkthrough/walkthrough_prob.jl | 51 ++++++++++++++++++ 2 files changed, 51 insertions(+), 54 deletions(-) rename src/{excess => walkthrough}/walkthrough_entropy.jl (76%) create mode 100644 src/walkthrough/walkthrough_prob.jl diff --git a/src/excess/walkthrough_entropy.jl b/src/walkthrough/walkthrough_entropy.jl similarity index 76% rename from src/excess/walkthrough_entropy.jl rename to src/walkthrough/walkthrough_entropy.jl index c16485df3..1f15fd1a7 100644 --- a/src/excess/walkthrough_entropy.jl +++ b/src/walkthrough/walkthrough_entropy.jl @@ -6,7 +6,6 @@ export WalkthroughEntropy The walkthrough entropy method (Stoop et al., 2021)[^Stoop2021]. - Does not work with `genentropy`, but combination with `entropygenerator`, we can use this estimator to compute walkthrough entropy for multiple `n` with a single initialization step (instead of initializing once per `n`). @@ -62,7 +61,6 @@ function visitations_per_position(x, unique_symbols) return 𝐧 end - function entropygenerator(x, method::WalkthroughEntropy, rng = Random.default_rng()) # 𝐔: unique elements in `x` # 𝐍: the corresponding frequencies (𝐍[i] = # times 𝐔[i] occurs). @@ -81,58 +79,6 @@ function entropygenerator(x, method::WalkthroughEntropy, rng = Random.default_rn return EntropyGenerator(method, x, init, rng) end -# NB: this function, and the following function, don't actually work when n is large - the -# factorial blows up. I've just implemented them for the sake of understanding the formulas. -function outer_weight(n::Int, 𝐍) - factorial(BigInt(n)) / prod(factorial.(BigInt.(𝐍))) -end - -# Also doesn't work in practice, because factorial(N - n) blows up. -function inner_weight(n::Int, N::Int, 𝐍, 𝐧ₙ) - s = length(𝐍) - - denominator_elements = zeros(s) - - for j = 1:s - # The total number of occurrences for the j-th state. - Nⱼ = 𝐍[j] - - # The number of times the j-th state has been visited up to position `n` - nⱼ = 𝐧ₙ[j] - - denominator_elements[j] = factorial(BigInt(Nⱼ - nⱼ)) - end - return factorial(BigInt(N) - BigInt(n)) / prod(denominator_elements) -end - -""" - walkthrough_prob(x, n::Int) - -The walk-through probability (Stoop et al., 2021)[^Stoop2021] for a symbol sequence `x` -(can be a string, or categorical sequence (e.g. integer vector or `Dataset` of state -vectors). - -- `n`: The position within the sequence, where `n ∈ [1, 2, …, N]` and `N` is the total - number of elements in the sequence. - -[^Stoop2021]: Stoop, R. L., Stoop, N., Kanders, K., & Stoop, R. (2021). Excess entropies suggest the physiology of neurons to be primed for higher-level computation. Physical Review Letters, 127(14), 148101. -""" -function walkthrough_prob(x, n::Int, 𝐍, 𝐧) - 𝐧ₙ = 𝐧[n] - N = length(x) - - 𝐏 = 𝐍 ./ N - - # First weight is simple. - w1 = outer_weight(n, 𝐍) - w2 = inner_weight(n, N, 𝐍, 𝐧ₙ) - - c2 = [(pⱼ^nⱼ) * w2 for (pⱼ, nⱼ) in zip(𝐏, 𝐧ₙ)] - c3 = [pⱼ^(𝐧[end][j] - 𝐧[n][j]) for (j, pⱼ) in enumerate(𝐏)] - - return w1 * prod(c2) * prod(c3) -end - function conditional_walkprob(n::Int, N::Int, 𝐍, 𝐧) if (n == 0) return 1.0 diff --git a/src/walkthrough/walkthrough_prob.jl b/src/walkthrough/walkthrough_prob.jl new file mode 100644 index 000000000..ba45b80b0 --- /dev/null +++ b/src/walkthrough/walkthrough_prob.jl @@ -0,0 +1,51 @@ +# The walkthrough probability is not used in itself at the moment, but is included here for +# completeness for the reproduction of Stoop et al. (2021). + +function outer_weight(n::Int, 𝐍) + factorial(BigInt(n)) / prod(factorial.(BigInt.(𝐍))) +end + +function inner_weight(n::Int, N::Int, 𝐍, 𝐧ₙ) + s = length(𝐍) + + denominator_elements = zeros(s) + + for j = 1:s + # The total number of occurrences for the j-th state. + Nⱼ = 𝐍[j] + + # The number of times the j-th state has been visited up to position `n` + nⱼ = 𝐧ₙ[j] + + denominator_elements[j] = factorial(BigInt(Nⱼ - nⱼ)) + end + return factorial(BigInt(N) - BigInt(n)) / prod(denominator_elements) +end + +""" + walkthrough_prob(x, n::Int) + +The walk-through probability (Stoop et al., 2021)[^Stoop2021] for a symbol sequence `x` +(can be a string, or categorical sequence (e.g. integer vector or `Dataset` of state +vectors). + +- `n`: The position within the sequence, where `n ∈ [1, 2, …, N]` and `N` is the total + number of elements in the sequence. + +[^Stoop2021]: Stoop, R. L., Stoop, N., Kanders, K., & Stoop, R. (2021). Excess entropies suggest the physiology of neurons to be primed for higher-level computation. Physical Review Letters, 127(14), 148101. +""" +function walkthrough_prob(x, n::Int, 𝐍, 𝐧) + 𝐧ₙ = 𝐧[n] + N = length(x) + + 𝐏 = 𝐍 ./ N + + # First weight is simple. + w1 = outer_weight(n, 𝐍) + w2 = inner_weight(n, N, 𝐍, 𝐧ₙ) + + c2 = [(pⱼ^nⱼ) * w2 for (pⱼ, nⱼ) in zip(𝐏, 𝐧ₙ)] + c3 = [pⱼ^(𝐧[end][j] - 𝐧[n][j]) for (j, pⱼ) in enumerate(𝐏)] + + return w1 * prod(c2) * prod(c3) +end From 787cfc93e95adf8d48e33c2b189ba385326fc109 Mon Sep 17 00:00:00 2001 From: Kristian Haaga Date: Thu, 25 Aug 2022 10:03:00 +0200 Subject: [PATCH 07/12] Add Random as dependency --- Project.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/Project.toml b/Project.toml index f495bf3ab..e1ea1fa39 100644 --- a/Project.toml +++ b/Project.toml @@ -10,6 +10,7 @@ Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" Neighborhood = "645ca80c-8b79-4109-87ea-e1f58159d116" QuadGK = "1fd47b50-473d-5c70-9696-f719f8f3bcdc" +Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b" StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" From 0c32d05bb12a0827ac0524611d40300db85b40fe Mon Sep 17 00:00:00 2001 From: Kristian Haaga Date: Thu, 25 Aug 2022 10:03:54 +0200 Subject: [PATCH 08/12] Correct path --- src/Entropies.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Entropies.jl b/src/Entropies.jl index a73b94355..ab1424a85 100644 --- a/src/Entropies.jl +++ b/src/Entropies.jl @@ -8,6 +8,6 @@ module Entropies include("kerneldensity/kerneldensity.jl") include("wavelet/wavelet.jl") include("nearest_neighbors/nearest_neighbors.jl") - include("excess/walkthrough_entropy.jl") + include("walkthrough/walkthrough_entropy.jl") include("dispersion/dispersion_entropy.jl") end From 3f07fe646e1a84bc55839740933f4ef579f89529 Mon Sep 17 00:00:00 2001 From: Kristian Haaga Date: Thu, 25 Aug 2022 10:36:01 +0200 Subject: [PATCH 09/12] Also test walkthrough probability --- src/walkthrough/walkthrough_prob.jl | 10 +++++++++- test/runtests.jl | 15 ++++++++++++++- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/src/walkthrough/walkthrough_prob.jl b/src/walkthrough/walkthrough_prob.jl index ba45b80b0..262445413 100644 --- a/src/walkthrough/walkthrough_prob.jl +++ b/src/walkthrough/walkthrough_prob.jl @@ -23,7 +23,8 @@ function inner_weight(n::Int, N::Int, 𝐍, 𝐧ₙ) end """ - walkthrough_prob(x, n::Int) + walkthrough_prob(x, n::Int, 𝐍, 𝐧) + walkthrough_prob(x, n::Int, g::EntropyGenerator{WalkthroughEntropy}) The walk-through probability (Stoop et al., 2021)[^Stoop2021] for a symbol sequence `x` (can be a string, or categorical sequence (e.g. integer vector or `Dataset` of state @@ -31,6 +32,9 @@ vectors). - `n`: The position within the sequence, where `n ∈ [1, 2, …, N]` and `N` is the total number of elements in the sequence. +- `𝐍`: a vector of counts (frequencies) for each unique state in `x`. +- `𝐧`: a vector of vectors, where inner vectors all have `length(unique(x))` elements, + where `𝐧[i][j]` counts the number of times unique state `j` has appeared in `x[1:i]`. [^Stoop2021]: Stoop, R. L., Stoop, N., Kanders, K., & Stoop, R. (2021). Excess entropies suggest the physiology of neurons to be primed for higher-level computation. Physical Review Letters, 127(14), 148101. """ @@ -49,3 +53,7 @@ function walkthrough_prob(x, n::Int, 𝐍, 𝐧) return w1 * prod(c2) * prod(c3) end + +function walkthrough_prob(x, n, g::EntropyGenerator{<:WalkthroughEntropy}) + walkthrough_prob(x, n, g.init.𝐍, g.init.𝐧) +end diff --git a/test/runtests.jl b/test/runtests.jl index 6ae9fe76e..126c638cd 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -354,7 +354,20 @@ end end end + @testset begin "Walktrough probability" + x = "abc"^5 + eg = entropygenerator(x, WalkthroughEntropy()) + eg.init.𝐍 + wp = walkthrough_prob(x, 3, eg) + ow = outer_weight(3, eg.init.𝐍) + iw = inner_weight(3, length(x), eg.init.𝐍, eg.init.𝐧[3]) + @test wp |> typeof <: BigFloat + @test ow |> typeof <: BigFloat + @test iw |> typeof <: BigFloat + end + @testset "Walkthrough entropy" begin + x = "ab"^10 @test WalkthroughEntropy() isa WalkthroughEntropy @test walkthrough_entropy(x, 5) isa Float64 @@ -362,7 +375,7 @@ end @test eg isa EntropyGenerator @test [eg(n) for n in 1:length(x)] isa Vector{Float64} end - + @testset "Dispersion entropy" begin # Li et al. (2018) recommends using at least 1000 data points when estimating # dispersion entropy. From 9967bb87fb9c0f69ef371a7a514a44d69fe446f8 Mon Sep 17 00:00:00 2001 From: Kristian Haaga Date: Thu, 25 Aug 2022 10:41:17 +0200 Subject: [PATCH 10/12] Methods are not exported --- test/runtests.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/runtests.jl b/test/runtests.jl index 126c638cd..05e5799ca 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -358,9 +358,9 @@ end x = "abc"^5 eg = entropygenerator(x, WalkthroughEntropy()) eg.init.𝐍 - wp = walkthrough_prob(x, 3, eg) - ow = outer_weight(3, eg.init.𝐍) - iw = inner_weight(3, length(x), eg.init.𝐍, eg.init.𝐧[3]) + wp = Entropies.walkthrough_prob(x, 3, eg) + ow = Entropies.outer_weight(3, eg.init.𝐍) + iw = Entropies.inner_weight(3, length(x), eg.init.𝐍, eg.init.𝐧[3]) @test wp |> typeof <: BigFloat @test ow |> typeof <: BigFloat @test iw |> typeof <: BigFloat From c242a5c8faaf14a41708cbd4f38d02a3556cc103 Mon Sep 17 00:00:00 2001 From: Kristian Haaga Date: Thu, 25 Aug 2022 10:54:13 +0200 Subject: [PATCH 11/12] Include file to test it --- src/Entropies.jl | 2 +- src/walkthrough/walkthrough.jl | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) create mode 100644 src/walkthrough/walkthrough.jl diff --git a/src/Entropies.jl b/src/Entropies.jl index ab1424a85..a70f96582 100644 --- a/src/Entropies.jl +++ b/src/Entropies.jl @@ -8,6 +8,6 @@ module Entropies include("kerneldensity/kerneldensity.jl") include("wavelet/wavelet.jl") include("nearest_neighbors/nearest_neighbors.jl") - include("walkthrough/walkthrough_entropy.jl") + include("walkthrough/walkthrough.jl") include("dispersion/dispersion_entropy.jl") end diff --git a/src/walkthrough/walkthrough.jl b/src/walkthrough/walkthrough.jl new file mode 100644 index 000000000..7fb51a42e --- /dev/null +++ b/src/walkthrough/walkthrough.jl @@ -0,0 +1,2 @@ +include("walkthrough_prob.jl") +include("walkthrough_entropy.jl") From dd10654f69ad45304c97e398d3d450dd3f423a37 Mon Sep 17 00:00:00 2001 From: Kristian Haaga Date: Thu, 25 Aug 2022 11:02:57 +0200 Subject: [PATCH 12/12] Estimator is needed in both files --- src/walkthrough/walkthrough.jl | 34 ++++++++++++++++++++++++++ src/walkthrough/walkthrough_entropy.jl | 33 ------------------------- 2 files changed, 34 insertions(+), 33 deletions(-) diff --git a/src/walkthrough/walkthrough.jl b/src/walkthrough/walkthrough.jl index 7fb51a42e..b304e098c 100644 --- a/src/walkthrough/walkthrough.jl +++ b/src/walkthrough/walkthrough.jl @@ -1,2 +1,36 @@ + +""" + WalkthroughEntropy + +The walkthrough entropy method (Stoop et al., 2021)[^Stoop2021]. + +Does not work with `genentropy`, but combination with `entropygenerator`, we can use +this estimator to compute walkthrough entropy for multiple `n` with a single initialization +step (instead of initializing once per `n`). + +## Examples + +```jldoctest; setup = :(using Entropies) +julia> x = "abc"^2 +"abcabc" + +julia> wg = entropygenerator(x, WalkthroughEntropy()); + +julia> [wg(n) for n = 1:length(x)] +6-element Vector{Float64}: + 1.0986122886681098 + 1.3217558399823195 + 0.9162907318741551 + 1.3217558399823195 + 1.0986122886681098 + -0.0 +``` + +See also: [`entropygenerator`](@ref). + +[^Stoop2021]: Stoop, R. L., Stoop, N., Kanders, K., & Stoop, R. (2021). Excess entropies suggest the physiology of neurons to be primed for higher-level computation. Physical Review Letters, 127(14), 148101. +""" +struct WalkthroughEntropy <: EntropyEstimator end + include("walkthrough_prob.jl") include("walkthrough_entropy.jl") diff --git a/src/walkthrough/walkthrough_entropy.jl b/src/walkthrough/walkthrough_entropy.jl index 1f15fd1a7..7820c3980 100644 --- a/src/walkthrough/walkthrough_entropy.jl +++ b/src/walkthrough/walkthrough_entropy.jl @@ -1,39 +1,6 @@ export walkthrough_entropy export WalkthroughEntropy -""" - WalkthroughEntropy - -The walkthrough entropy method (Stoop et al., 2021)[^Stoop2021]. - -Does not work with `genentropy`, but combination with `entropygenerator`, we can use -this estimator to compute walkthrough entropy for multiple `n` with a single initialization -step (instead of initializing once per `n`). - -## Examples - -```jldoctest; setup = :(using Entropies) -julia> x = "abc"^2 -"abcabc" - -julia> wg = entropygenerator(x, WalkthroughEntropy()); - -julia> [wg(n) for n = 1:length(x)] -6-element Vector{Float64}: - 1.0986122886681098 - 1.3217558399823195 - 0.9162907318741551 - 1.3217558399823195 - 1.0986122886681098 - -0.0 -``` - -See also: [`entropygenerator`](@ref). - -[^Stoop2021]: Stoop, R. L., Stoop, N., Kanders, K., & Stoop, R. (2021). Excess entropies suggest the physiology of neurons to be primed for higher-level computation. Physical Review Letters, 127(14), 148101. -""" -struct WalkthroughEntropy <: EntropyEstimator end - # write into the pre-allocated 𝐧ₙ (a vector containing counts for each unique # state up to index n). function count_occurrences_up_to_n!(𝐧ₙ, n, x, unique_symbols)