
Commit 0887cfe

tlienart, ablaom, and adienes authored
Minor release with Gramian training for OLS (#151)
* fix a doc typo
* fixes following discussion around #147
* small adjustments + typo fixes
* first pass at Gramian training for OLS (#146)
* proof of concept
* AbstractMatrix -> AVR
* cleaner impl
* endline
* fix error type
* construct kernels if not passed in
* add test case for implicit gram construction
* last endline
* check for isempty instead of iszero
* Prepare minor release with gramian training

---------

Co-authored-by: Anthony D. Blaom <anthony.blaom@gmail.com>
Co-authored-by: adienes <51664769+adienes@users.noreply.github.com>
1 parent a872d7c commit 0887cfe

File tree: 12 files changed, +86 −16 lines


Project.toml

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 name = "MLJLinearModels"
 uuid = "6ee0df7b-362f-4a72-a706-9e79364fb692"
 authors = ["Thibaut Lienart <tlienart@me.com>"]
-version = "0.9.2"
+version = "0.10.0"

 [deps]
 DocStringExtensions = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae"

src/fit/default.jl

Lines changed: 14 additions & 5 deletions
@@ -33,13 +33,22 @@ $SIGNATURES
 Fit a generalised linear regression model using an appropriate solver based on
 the loss and penalty of the model. A method can, in some cases, be specified.
 """
-function fit(glr::GLR, X::AbstractMatrix{<:Real}, y::AVR;
+function fit(glr::GLR, X::AbstractMatrix{<:Real}, y::AVR; data=nothing,
              solver::Solver=_solver(glr, size(X)))
-    check_nrows(X, y)
-    n, p = size(X)
-    c = getc(glr, y)
-    return _fit(glr, solver, X, y, scratch(n, p, c, i=glr.fit_intercept))
+    if hasproperty(solver, :gram) && solver.gram
+        # interpret X, y as X'X, X'y
+        data = verify_or_construct_gramian(glr, X, y, data)
+        p = size(data.XX, 2)
+        return _fit(glr, solver, data.XX, data.Xy, (; dims=(data.n, p, 0)))
+    else
+        check_nrows(X, y)
+        n, p = size(X)
+        c = getc(glr, y)
+        return _fit(glr, solver, X, y, scratch(n, p, c, i=glr.fit_intercept))
+    end
 end
+fit(glr::GLR; kwargs...) = fit(glr, zeros((0,0)), zeros((0,)); kwargs...)
+

 function scratch(n, p, c=0; i=false)
     p_ = p + Int(i)
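
A minimal sketch of the two entry points this enables (the synthetic data, the `LassoRegression` constructor, and the penalty value are illustrative assumptions, not part of this diff; the Gramian path requires `fit_intercept=false` and a non-categorical loss, as enforced by `verify_or_construct_gramian` further below):

using MLJLinearModels, LinearAlgebra

n, p = 1_000, 5
X = randn(n, p)
y = X * randn(p) + 0.1 * randn(n)

glr = LassoRegression(0.1; fit_intercept=false)  # intercept unsupported in gram mode
solver = ProxGrad(gram=true)

# Pass X and y directly; X'X and X'y are then formed internally ...
θ1 = fit(glr, X, y; solver=solver)

# ... or pass a precomputed Gramian via `data`, omitting X and y
# thanks to the new zero-argument convenience method.
θ2 = fit(glr; data=(XX=X'X, Xy=X'y, n=n), solver=solver)
# θ1 ≈ θ2: both paths solve the same problem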

src/fit/proxgrad.jl

Lines changed: 13 additions & 4 deletions
@@ -3,7 +3,7 @@
 # Assumption: loss has gradient; penalty has prox e.g.: Lasso
 # J(θ) = f(θ) + r(θ) where f is smooth
 function _fit(glr::GLR, solver::ProxGrad, X, y, scratch)
-    _,p,c = npc(scratch)
+    n,p,c = npc(scratch)
     c > 0 && (p *= c)
     # vector caches + eval cache
     θ = zeros(p)    # θ_k
@@ -19,9 +19,18 @@ function _fit(glr::GLR, solver::ProxGrad, X, y, scratch)
     η = 1.0                               # stepsize (1/L)
     acc = ifelse(solver.accel, 1.0, 0.0)  # if 0, no extrapolation (ISTA)
     # functions
-    _f = smooth_objective(glr, X, y; c=c)
-    _fg! = smooth_fg!(glr, X, y, scratch)
-    _prox! = prox!(glr, size(X, 1))
+    _f = if solver.gram
+        smooth_gram_objective(glr, X, y, n)
+    else
+        smooth_objective(glr, X, y; c=c)
+    end
+
+    _fg! = if solver.gram
+        smooth_gram_fg!(glr, X, y, n)
+    else
+        smooth_fg!(glr, X, y, scratch)
+    end
+    _prox! = prox!(glr, n)
     bt_cond = θ̂ ->
         _f(θ̂) > fθ̄ + dot(θ̂ .- θ̄, ∇fθ̄) + sum(abs2.(θ̂ .- θ̄)) / (2η)
     # loop-related

src/fit/solvers.jl

Lines changed: 1 addition & 0 deletions
@@ -133,6 +133,7 @@ Proximal Gradient solver for non-smooth objective functions.
     tol::Float64 = 1e-4    # tol relative change of θ i.e. norm(θ-θ_)/norm(θ)
     max_inner::Int = 100   # β^max_inner should be > 1e-10
     beta::Float64 = 0.8    # in (0, 1); shrinkage in the backtracking step
+    gram::Bool = false     # use precomputed Gramian for lsq where possible
 end

 FISTA(; kwa...) = ProxGrad(;accel = true, kwa...)
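
Since `FISTA` simply forwards its keyword arguments to `ProxGrad` (last line above), the new flag can be set through either constructor; a quick sketch:

s_ista  = ProxGrad(gram=true)         # plain proximal gradient on the Gramian
s_fista = FISTA(gram=true, tol=1e-6)  # accelerated variant, same flag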

src/glr/d_l2loss.jl

Lines changed: 9 additions & 0 deletions
@@ -72,3 +72,12 @@ function smooth_fg!(glr::GLR{L2Loss,<:ENR}, X, y, scratch)
         return glr.loss(r) + get_l2(glr.penalty)(view_θ(glr, θ))
     end
 end
+
+function smooth_gram_fg!(glr::GLR{L2Loss,<:ENR}, XX, Xy, n)
+    λ = get_penalty_scale_l2(glr, n)
+    (g, θ) -> begin
+        _g = XX * θ .- Xy
+        g .= _g .+ λ .* θ
+        return θ'*_g + get_l2(glr.penalty)(view_θ(glr, θ))
+    end
+end
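
The closure above is the dense gradient rewritten in Gramian form: ∇(½‖Xθ − y‖²) = X'(Xθ − y) = (X'X)θ − X'y, so once XX = X'X and Xy = X'y are precomputed each evaluation costs O(p²) instead of O(np). A standalone sanity check of that identity (illustrative, not part of the diff):

using LinearAlgebra

X = randn(100, 4); y = randn(100); θ = randn(4)
XX, Xy = X'X, X'y           # one-off O(np²) precomputation
g_dense = X' * (X*θ - y)    # O(np) per evaluation
g_gram  = XX*θ - Xy         # O(p²) per evaluation
@assert g_dense ≈ g_gram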

src/glr/utils.jl

Lines changed: 4 additions & 1 deletion
@@ -1,4 +1,4 @@
-export objective, smooth_objective
+export objective, smooth_objective, smooth_gram_objective

 # NOTE: RobustLoss are not always everywhere smooth but "smooth-enough".
 const SmoothLoss = Union{L2Loss, LogisticLoss, MultinomialLoss, RobustLoss}
@@ -37,6 +37,9 @@ Return the smooth part of the objective function of a GLR.
 """
 smooth_objective(glr::GLR{<:SmoothLoss,<:ENR}, n) = glr.loss + get_l2(glr.penalty) * ifelse(glr.scale_penalty_with_samples, n, 1.)

+smooth_gram_objective(glr::GLR{<:SmoothLoss,<:ENR}, XX, Xy, n) =
+    θ -> (θ'*XX*θ)/2 - (θ'*Xy) + (get_l2(glr.penalty) * ifelse(glr.scale_penalty_with_samples, n, 1.))(θ)
+
 smooth_objective(::GLR) = @error "Case not implemented yet."

 """

src/mlj/classifiers.jl

Lines changed: 2 additions & 2 deletions
@@ -65,7 +65,7 @@ See also [`MultinomialClassifier`](@ref).
 """some instance of `MLJLinearModels.S` where `S` is one of: `LBFGS`, `Newton`,
 `NewtonCG`, `ProxGrad`; but subject to the following restrictions:

-- If `penalty = :l2`, `ProxGrad` is disallowed. Otherwise, `ProxyGrad` is the only
+- If `penalty = :l2`, `ProxGrad` is disallowed. Otherwise, `ProxGrad` is the only
   option.

 - Unless `scitype(y) <: Finite{2}` (binary target) `Newton` is disallowed.
@@ -142,7 +142,7 @@ See also [`LogisticClassifier`](@ref).
 """some instance of `MLJLinearModels.S` where `S` is one of: `LBFGS`,
 `NewtonCG`, `ProxGrad`; but subject to the following restrictions:

-- If `penalty = :l2`, `ProxGrad` is disallowed. Otherwise, `ProxyGrad` is the only
+- If `penalty = :l2`, `ProxGrad` is disallowed. Otherwise, `ProxGrad` is the only
   option.

 - Unless `scitype(y) <: Finite{2}` (binary target) `Newton` is disallowed.

src/utils.jl

Lines changed: 23 additions & 0 deletions
@@ -9,6 +9,29 @@ function check_nrows(X::AbstractMatrix, y::AbstractVecOrMat)::Nothing
     throw(DimensionMismatch("`X` and `y` must have the same number of rows."))
 end

+function verify_or_construct_gramian(glr, X, y, data)
+    check_nrows(X, y)
+    isnothing(data) && return (; XX = X'X, Xy = X'y, n = length(y))
+
+    !all(hasproperty.(Ref(data), (:XX, :Xy, :n))) && throw(ArgumentError("data must contain XX, Xy, n"))
+    size(data.XX, 1) != size(data.Xy, 1) && throw(DimensionMismatch("`XX` and `Xy` must have the same number of rows."))
+    !issymmetric(data.XX) && throw(ArgumentError("Input `XX` must be symmetric"))
+
+    c = getc(glr, data.Xy)
+    !iszero(c) && throw(ArgumentError("Categorical loss not supported with Gramian kernel"))
+    glr.fit_intercept && throw(ArgumentError("Intercept not supported with Gramian kernel"))
+
+    if any(!isempty, (X, y))
+        all((
+            isapprox(X'X, data.XX; rtol=1e-5),
+            isapprox(X'y, data.Xy; rtol=1e-5),
+            length(y) == data.n
+        )) || throw(ArgumentError("Inputs `X` and `y` do not match inputs `XX` and `Xy`."))
+    end
+
+    return data
+end
+
 """
 $SIGNATURES
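
A sketch of the resulting validation behaviour (the function is internal, hence the module prefix; the model and arrays are illustrative assumptions):

using MLJLinearModels, LinearAlgebra

glr = LassoRegression(0.1; fit_intercept=false)
X = randn(50, 3); y = randn(50)

# Without `data`, the Gramian is constructed from X and y:
d = MLJLinearModels.verify_or_construct_gramian(glr, X, y, nothing)
# d == (XX = X'X, Xy = X'y, n = 50)

# Malformed `data` is rejected, e.g. a non-symmetric XX:
bad = (XX = randn(3, 3), Xy = X'y, n = 50)
MLJLinearModels.verify_or_construct_gramian(glr, zeros(0, 0), zeros(0), bad)
# throws ArgumentError: "Input `XX` must be symmetric"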

test/Project.toml

Lines changed: 1 addition & 0 deletions
@@ -2,6 +2,7 @@
 CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
 DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
 DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab"
+Downloads = "f43a241f-c20a-4ad4-852c-f6b1247861c6"
 ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 MLJ = "add582a8-e3ab-11e8-2d5e-e98b27df1bc7"

test/benchmarks/robust.jl

Lines changed: 2 additions & 2 deletions
@@ -2,9 +2,9 @@

 if DO_COMPARISONS
     @testset "Comp-QR-147" begin
-        using CSV, DataFrames
+        using CSV, DataFrames, Downloads

-        dataset = CSV.read(download("http://freakonometrics.free.fr/rent98_00.txt"), DataFrame)
+        dataset = CSV.read(Downloads.download("http://freakonometrics.free.fr/rent98_00.txt"), DataFrame)
         tau = 0.3

         y = Vector(dataset[!,:rent_euro])
