From 26f6c38103b05843229de1927fcf9c8d78c6a453 Mon Sep 17 00:00:00 2001 From: Jutho Haegeman Date: Wed, 25 Mar 2026 22:11:24 +0100 Subject: [PATCH] add fixedpoint method with 2 algorithms --- src/OptimKit.jl | 68 +++++++++-- src/anderson.jl | 258 +++++++++++++++++++++++++++++++++++++++++ src/simpleiteration.jl | 92 +++++++++++++++ 3 files changed, 411 insertions(+), 7 deletions(-) create mode 100644 src/anderson.jl create mode 100644 src/simpleiteration.jl diff --git a/src/OptimKit.jl b/src/OptimKit.jl index 6415200..f6ab04f 100644 --- a/src/OptimKit.jl +++ b/src/OptimKit.jl @@ -11,12 +11,13 @@ const LS_MAXITER = ScopedValue(10) const LS_MAXFG = ScopedValue(20) const LS_VERBOSITY = ScopedValue(1) -const GRADTOL = ScopedValue(1e-8) +const GRADTOL = ScopedValue(1.0e-8) const MAXITER = ScopedValue(1_000_000) const VERBOSITY = ScopedValue(1) # Default values for the manifold structure _retract(x, d, α) = (add(x, d, α), d) +_invretract(x, y) = add(y, x, -1) _inner(x, v1, v2) = v1 === v2 ? norm(v1)^2 : real(inner(v1, v2)) _transport!(v, xold, d, α, xnew) = v _scale!(v, α) = scale!!(v, α) @@ -26,7 +27,7 @@ _precondition(x, g) = deepcopy(g) _finalize!(x, f, g, numiter) = x, f, g # Default structs for new convergence and termination keywords -@kwdef struct DefaultHasConverged{T<:Real} +@kwdef struct DefaultHasConverged{T <: Real} gradtol::T end @@ -44,6 +45,7 @@ end # Optimization abstract type OptimizationAlgorithm end +abstract type FixedPointAlgorithm end const _xlast = Ref{Any}() const _glast = Ref{Any}() @@ -104,11 +106,61 @@ Also see [`GradientDescent`](@ref), [`ConjugateGradient`](@ref), [`LBFGS`](@ref) """ function optimize end +""" + fixedpoint(fp, x, alg; + finalize! = _finalize!, + shouldstop = DefaultShouldStop(alg.maxiter), + hasconverged = DefaultHasConverged(alg.gradtol), + retract = _retract, invretract = _invretract, + inner = _inner, (transport!) = _transport!, + (scale!) = _scale!, (add!) = _add!, + isometrictransport = (transport! == _transport! && inner == _inner)) + -> x, g, numfp, history + +Find a fixed point of the function `fp` starting from an initial point `x₀` and using the fixed point algorithm `alg`, +which is an instance of `SimpleIteration` or `AndersonMixing`. + +Returns the final point `x`, the residual `g`, the total number of calls to `fp`, +and the history of the norm of the residual across the different iterations. + +The algorithm is run until either `hasconverged(x, false, g, normg)` returns `true` or +`shouldstop(x, false, g, numfp, numiter, time)` returns `true`. The latter case happening before +the former is considered to be a failure to converge, and a warning is issued. + +The keyword arguments are: +- `finalize!::Function`: A function that takes the final point `x`, a dummy variable `f = false`, + the residual `g`, and the iteration number, and returns a possibly modified values for + `x`, `f` and `g`. By default, the identity is used. + It is the user's responsibility to ensure that the modified values do not lead to + inconsistencies within the optimization algorithm. +- `hasconverged::Function`: A function that takes the current point `x`, a dummy variable `f = false`, + the residual `r`, and the norm of the residual, and returns a boolean indicating whether + the fixed point iteration has converged. By default, the norm of the residual is compared to the + tolerance `gradtol` as encoded in the algorithm instance. +- `shouldstop::Function`: A function that takes the current point `x`, a dummy variable `f = false`, + the residuals `g`, the number of calls to `fg`, the iteration number, and the time spent + so far, and returns a boolean indicating whether the optimization should stop. By default, + the number of iterations is compared to the maximum number of iterations as encoded in the + algorithm instance. + +Check the README of this package for further details on creating an algorithm instance, +as well as for the meaning of the remaining keyword arguments and their default values. + +!!! Warning + + The default values of `hasconverged` and `shouldstop` are provided to ensure continuity + with the previous versions of this package. However, this behaviour might change in the + future. + +Also see [`SimpleIteration`](@ref) and [`AndersonMixing`](@ref). +""" +function fixedpoint end + function format_time(t::Float64) - if t < 1e-3 - return @sprintf("%5.1f μs", 1e6*t) + if t < 1.0e-3 + return @sprintf("%5.1f μs", 1.0e6 * t) elseif t < 1 - return @sprintf("%5.1f ms", 1e3*t) + return @sprintf("%5.1f ms", 1.0e3 * t) elseif t < 60 return @sprintf("%5.2f s", t) elseif t < 3600 @@ -122,7 +174,8 @@ include("linesearches.jl") include("gd.jl") include("cg.jl") include("lbfgs.jl") - +include("simpleiteration.jl") +include("anderson.jl") const gd = GradientDescent() const cg = ConjugateGradient() const lbfgs = LBFGS() @@ -131,6 +184,7 @@ export optimize, gd, cg, lbfgs, optimtest export GradientDescent, ConjugateGradient, LBFGS export FletcherReeves, HestenesStiefel, PolakRibiere, HagerZhang, DaiYuan export HagerZhangLineSearch +export fixedpoint, SimpleIteration, AndersonMixing """ optimtest(fg, x, [d]; alpha = -0.1:0.001:0.1, retract = _retract, inner = _inner) @@ -142,7 +196,7 @@ Test the compatibility between the computation of the gradient, the retraction a It is up to the user to check that the values in `dfs1` and `dfs2` match up to expected precision, by inspecting the numerical values or plotting them. If these values don't match, the linesearch in `optimize` cannot be expected to work. """ -function optimtest(fg, x, d=fg(x)[2]; alpha=-0.1:0.001:0.1, retract=_retract, inner=_inner) +function optimtest(fg, x, d = fg(x)[2]; alpha = -0.1:0.001:0.1, retract = _retract, inner = _inner) # evaluate function at given edge points fs_edges = map(alpha) do a f, = fg(retract(x, d, a)[1]) diff --git a/src/anderson.jl b/src/anderson.jl new file mode 100644 index 0000000..1d9327f --- /dev/null +++ b/src/anderson.jl @@ -0,0 +1,258 @@ +""" + struct AndersonMixing{T <: Real} <: FixedPointAlgorithm + AndersonMixing(m::Int; + damping::Real = 1, + maxiter::Int=MAXITER[], # 1_000_000 + gradtol::Real=GRADTOL[], # 1e-8 + verbosity::Int=VERBOSITY[]) # 1 + +Anderson mixing, also known as Anderson acceleration, for fixed point problems. + +## Parameters +- `m::Int`: The number of previous iterates to use for Anderson extrapolation. +- `damping::Real`: The damping parameter for Anderson extrapolation; a value of 1 corresponds to no damping, while a value between 0 and 1 corresponds to under-relaxation. +- `maxiter::Int`: The maximum number of iterations. +- `gradtol::T`: The tolerance for the norm of the residual. +- `verbosity::Int`: The verbosity level of the optimization algorithm. + +The verbosity level use the following scheme: +- 0: no output +- 1: only warnings upon non-convergence +- 2: convergence information at the end of the algorithm +- 3: progress information after each iteration +""" +struct AndersonMixing{T <: Real} <: FixedPointAlgorithm + m::Int + damping::T + maxiter::Int + gradtol::T + verbosity::Int +end +function AndersonMixing(m::Int=8; + damping::Real = 1, + maxiter::Int=MAXITER[], + gradtol::Real=GRADTOL[], + verbosity::Int=VERBOSITY[]) + damping′, gradtol′ = promote(damping, gradtol) + return AndersonMixing(m, damping′, maxiter, gradtol′, verbosity) +end + +using LinearAlgebra +function fixedpoint( + fp::F, x₀, alg::AndersonMixing; + (finalize!) = _finalize!, + shouldstop = DefaultShouldStop(alg.maxiter), + hasconverged = DefaultHasConverged(alg.gradtol), + retract = _retract, invretract = _invretract, + inner = _inner, (transport!) = _transport!, + (scale!) = _scale!, (add!) = _add!, + isometrictransport = (transport! == _transport! && inner == _inner) + ) where {F} + + t₀ = time() + verbosity = alg.verbosity + x = x₀ + fx = fp(x) + numfp = 1 + numiter = 0 + g = invretract(x, fx) # residual, i.e. direction from x to fx, similar to F(x) - x in the vector case + x, _, g = finalize!(x, false, g, numiter) + innergg = inner(x, g, g) + normg = sqrt(innergg) + normghistory = [normg] + verbosity >= 2 && + @info @sprintf("Anderson: initializing with ‖x - F(x)‖ = %.4e", normg) + t = time() - t₀ + _hasconverged = hasconverged(x, false, g, normg) + _shouldstop = shouldstop(x, false, g, numfp, numiter, t) + + if !(_hasconverged || _shouldstop) # first iteration is special + xprev = x + gprev = g + Δxprev = deepcopy(g) + told = t + x, Δx = retract(x, g, 1) + fx = fp(x) + numfp += 1 + numiter += 1 + g = invretract(x, fx) # residual, i.e. direction from x to fx, similar to F(x) - x in the vector case + x, _, g = finalize!(x, false, g, numiter) + innergg = inner(x, g, g) + normg = sqrt(innergg) + push!(normghistory, normg) + t = time() - t₀ + Δt = t - told + _hasconverged = hasconverged(x, false, g, normg) + _shouldstop = shouldstop(x, false, g, numfp, numiter, t) + end + TangentType = typeof(g) + H = AndersonHistory(alg.m, TangentType[], TangentType[]) + overlap_full = zeros(typeof(innergg), alg.m, alg.m) + rhs_full = zeros(typeof(innergg), alg.m) + while !(_hasconverged || _shouldstop) + verbosity >= 3 && + @info @sprintf("Anderson acceleration: iter %4d, Δt %s: ‖f(x) - x‖ = %.4e", numiter, format_time(Δt), normg) + + told = t + # compute new update direction using Anderson extrapolation + gprev = transport!(gprev, xprev, Δxprev, 1, x) + Δg = add!(deepcopy(g), gprev, -1) + # Δx = transport!(deepcopy(Δx), xprev, Δx, 1, x) # transport previous Anderson extrapolation step to current point + + mold = length(H) + push!(H, (Δg, Δx)) + for k in 1:(length(H) - 1) + (Δgₖ, Δxₖ) = H[k] + Δgₖ = transport!(Δgₖ, xprev, Δxprev, 1, x) # transport stored residuals to current point + Δxₖ = transport!(Δxₖ, xprev, Δxprev, 1, x) + H[k] = (Δgₖ, Δxₖ) # update stored residuals and steps to current point + end + m = length(H) + + overlap = view(overlap_full, 1:m, 1:m) + rhs = view(rhs_full, 1:m) + if isometrictransport + if mold == m + @inbounds for j in 1:(m - 1) + for i in 1:(m - 1) + overlap[i, j] = overlap[i + 1, j + 1] + end + end + end + @inbounds for i in 1:(m - 1) + (Δgᵢ, Δxᵢ) = H[i] + overlap[m, i] = overlap[i, m] = inner(x, Δgᵢ, Δg) + end + overlap[m, m] = inner(x, Δg, Δg) + else + @inbounds for j in 1:m + (Δgⱼ, Δxⱼ) = H[j] + for i in 1:(j - 1) + (Δgᵢ, Δxᵢ) = H[i] + overlap[j, i] = overlap[i, j] = inner(x, Δgᵢ, Δgⱼ) + end + overlap[j, j] = inner(x, Δgⱼ, Δgⱼ) + end + end + @inbounds for i in 1:m + (Δgᵢ, Δxᵢ) = H[i] + rhs[i] = inner(x, Δgᵢ, g) + end + # overlapX = [inner(x, H[i][2], H[j][2]) for i in 1:m, j in 1:m] + # overlap += sqrt(eps(one(eltype(overlap)))) * overlapX + overlap += LinearAlgebra.tr(overlap) / size(overlap, 1) * sqrt(eps(one(eltype(overlap)))) * I + Γ = overlap \ rhs # solve least squares problem to get Anderson coefficients + ḡ = deepcopy(g) + Δx = scale!(deepcopy(Δx), 0) + @inbounds for k in 1:m + (Δgₖ, Δxₖ) = H[k] + ḡ = add!(ḡ, Δgₖ, -Γ[k]) + Δx = add!(Δx, Δxₖ, -Γ[k]) # compute Anderson extrapolation direction + end + @show sqrt(inner(x, ḡ, ḡ)) + Δx = add!(Δx, ḡ, alg.damping) # add damping to Anderson extrapolation direction + + # store current quantities as previous quantities + xprev = x + Δxprev = Δx + gprev = g + _xlast[] = x # store result in global variables to debug failures + _glast[] = g + + # take step and evaluate new point + x, Δx = retract(x, Δx, 1) + fx = fp(x) + numfp += 1 + numiter += 1 + g = invretract(x, fx) # Direction from x to fx, i.e. similar to F(x) - x in the vector case + x, _, g = finalize!(x, false, g, numiter) + innergg = inner(x, g, g) + normg = sqrt(innergg) + push!(normghistory, normg) + t = time() - t₀ + Δt = t - told + _hasconverged = hasconverged(x, false, g, normg) + _shouldstop = shouldstop(x, false, g, numfp, numiter, t) + + # check stopping criteria and print info + if _hasconverged || _shouldstop + break + end + end + if _hasconverged + verbosity >= 2 && + @info @sprintf("Anderson acceleration: converged after %d iterations and time %s: ‖f(x) - x‖ = %.4e", numiter, format_time(t), normg) + else + verbosity >= 1 && + @warn @sprintf("Anderson acceleration: not converged to requested tol after %d iterations and time %s: ‖f(x) - x‖ = %.4e", numiter, format_time(t), normg) + end + return x, g, numfp, normghistory +end + +mutable struct AndersonHistory{TangentType} + maxlength::Int + length::Int + first::Int + Δgesiduals::Vector{TangentType} + Δpositions::Vector{TangentType} + function AndersonHistory{T}(maxlength::Int, Δgesiduals::Vector{T}, Δpositions::Vector{T}) where {T} + l = length(Δgesiduals) + @assert l == length(Δpositions) "AndersonHistory: Δgesiduals and Δpositions must have the same length" + @assert l <= maxlength "AndersonHistory: initial history length cannot exceed maxlength" + Δgesiduals = resize!(copy(Δgesiduals), maxlength) + Δpositions = resize!(copy(Δpositions), maxlength) + return new{T}(maxlength, l, 1, Δgesiduals, Δpositions) + end +end +function AndersonHistory(maxlength::Int, Δgesiduals::Vector{T}, Δpositions::Vector{T}) where {T} + return AndersonHistory{T}(maxlength, Δgesiduals, Δpositions) +end + +Base.length(H::AndersonHistory) = H.length + +@inline function Base.getindex(H::AndersonHistory, i::Int) + @boundscheck if i < 1 || i > H.length + throw(BoundsError(H, i)) + end + n = H.maxlength + idx = H.first + i - 1 + idx = ifelse(idx > n, idx - n, idx) + return (getindex(H.Δgesiduals, idx), getindex(H.Δpositions, idx)) +end + +@inline function Base.setindex!(H::AndersonHistory, (Δg, Δx), i) + @boundscheck if i < 1 || i > H.length + throw(BoundsError(H, i)) + end + idx = mod1(H.first + i - 1, H.maxlength) + setindex!(H.Δgesiduals, Δg, idx) + setindex!(H.Δpositions, Δx, idx) + return (Δg, Δx) +end + +@inline function Base.push!(H::AndersonHistory, value) + if H.length < H.maxlength + H.length += 1 + else + H.first = mod1(H.first + 1, H.maxlength) + end + @inbounds setindex!(H, value, H.length) + return H +end +@inline function Base.pop!(H::AndersonHistory) + @inbounds v = H[H.length] + H.length -= 1 + return v +end +@inline function Base.popfirst!(H::AndersonHistory) + @inbounds v = H[1] + H.first = mod1(H.first + 1, H.maxlength) + H.length -= 1 + return v +end + +@inline function Base.empty!(H::AndersonHistory) + H.length = 0 + H.first = 1 + return H +end diff --git a/src/simpleiteration.jl b/src/simpleiteration.jl new file mode 100644 index 0000000..8c4eccd --- /dev/null +++ b/src/simpleiteration.jl @@ -0,0 +1,92 @@ +""" + struct SimpleIteration{T <: Real} <: FixedPointAlgorithm + SimpleIteration(; + maxiter::Int=MAXITER[], # 1_000_000 + gradtol::Real=GRADTOL[], # 1e-8 + verbosity::Int=VERBOSITY[]) # 1 + +Simple iteration for fixed point problems. + +## Parameters +- `maxiter::Int`: The maximum number of iterations. +- `gradtol::Real`: The tolerance for the norm of the residual. +- `verbosity::Int`: The verbosity level of the optimization algorithm. + +The verbosity level use the following scheme: +- 0: no output +- 1: only warnings upon non-convergence +- 2: convergence information at the end of the algorithm +- 3: progress information after each iteration +""" +struct SimpleIteration{T <: Real} <: FixedPointAlgorithm + maxiter::Int + gradtol::T + verbosity::Int +end +function SimpleIteration(; + maxiter::Int=MAXITER[], + gradtol::Real=GRADTOL[], + verbosity::Int=VERBOSITY[]) + return SimpleIteration(maxiter, gradtol, verbosity) +end + + +function fixedpoint( + fp, x₀, alg::SimpleIteration; + (finalize!) = _finalize!, + shouldstop = DefaultShouldStop(alg.maxiter), + hasconverged = DefaultHasConverged(alg.gradtol), + retract = _retract, invretract = _invretract, + inner = _inner, (transport!) = _transport!, + (scale!) = _scale!, (add!) = _add!, + isometrictransport = (transport! == _transport! && inner == _inner) + ) + + t₀ = time() + verbosity = alg.verbosity + x = x₀ + fx = fp(x) + numfp = 1 + numiter = 0 + g = invretract(x, fx) # residual, i.e. direction from x to fx, similar to F(x) - x in the vector case + x, _, g = finalize!(x, false, g, numiter) + innergg = inner(x, g, g) + normg = sqrt(innergg) + normghistory = [normg] + verbosity >= 2 && + @info @sprintf("SimpleIteration: initializing with ‖x - F(x)‖ = %.4e", normg) + t = time() - t₀ + _hasconverged = hasconverged(x, false, g, normg) + _shouldstop = shouldstop(x, false, g, numfp, numiter, t) + + while !(_hasconverged || _shouldstop) + told = t + x = fx + fx = fp(x) + numfp += 1 + numiter += 1 + g = invretract(x, fx) # residual, i.e. direction from x to fx, similar to F(x) - x in the vector case + x, _, g = finalize!(x, false, g, numiter) + innergg = inner(x, g, g) + normg = sqrt(innergg) + push!(normghistory, normg) + t = time() - t₀ + Δt = t - told + _hasconverged = hasconverged(x, false, g, normg) + _shouldstop = shouldstop(x, false, g, numfp, numiter, t) + # check stopping criteria and print info + if _hasconverged || _shouldstop + break + end + verbosity >= 3 && + @info @sprintf("SimpleIteration: iter %4d, Δt %s: ‖f(x) - x‖ = %.4e", numiter, format_time(Δt), normg) + end + if _hasconverged + verbosity >= 2 && + @info @sprintf("SimpleIteration: converged after %d iterations and time %s: ‖f(x) - x‖ = %.4e", numiter, format_time(t), normg) + else + verbosity >= 1 && + @warn @sprintf("SimpleIteration: not converged to requested tol after %d iterations and time %s: ‖f(x) - x‖ = %.4e", numiter, format_time(t), normg) + end + return x, g, numfp, normghistory +end