cmdla/project/L-BFGS/BFGS.jl

module BFGS

using LinearAlgebra: norm, I, dot, diagm, mul!

using ..OracleFunction

export BroydenFletcherGoldfarbShanno, BroydenFletcherGoldfarbShannoDogleg

const armijiowolfeorexact = :exact
BFGSorDFP = :BFGS

function ArmijoWolfeLineSearch(
        f::Union{LeastSquaresF, OracleF},
        x::AbstractArray,
        p::AbstractArray,
        MaxEvaluations::Integer;
        αinit::Real=1,
        τ::Real=1.1,
        c1::Real=1e-4,
        c2::Real=0.9,
        ϵα::Real=1e-16,
        ϵgrad::Real=1e-12,
        safeguard::Real=0.20,
    )::Tuple{Real, Integer}

    ϕ = (α) -> begin
        v = f.eval(x + α * p)
        gradient = f.grad(x + α * p)
        return (v, dot(p, gradient))
    end

    α = αinit
    local αgrad

    ϕ_0, ϕd_0 = ϕ(0)

    while MaxEvaluations > 0
        αcurr, αgrad = ϕ(α)
        MaxEvaluations -= 2

        if (αcurr ≤ ϕ_0 + c1 * α * ϕd_0) && (abs(αgrad) ≤ -c2 * ϕd_0)
            return (α, MaxEvaluations)
        end

        if αgrad ≥ 0
            break
        end
        α *= τ
    end

    αlo = 0
    αhi = α
    αlograd = ϕd_0
    αhigrad = αgrad

    while (MaxEvaluations > 0) && (αhi - αlo) > ϵα && (αgrad > ϵgrad)
        α = (αlo * αhigrad - αhi * αlograd)/(αhigrad - αlograd)
        α = max(
            αlo + (αhi - αlo) * safeguard,
            min(αhi - (αhi - αlo) * safeguard, α)
        )

        αcurr, αgrad = ϕ(α)
        MaxEvaluations -= 2

        if (αcurr ≤ ϕ_0 + c1 * α * ϕd_0) && (abs(αgrad) ≤ -c2 * ϕd_0)
            break
        end

        if αgrad < 0
            αlo = α
            αlograd = αgrad
        else
            αhi = α
            if αhi ≤ ϵα
                break
            end
            αhigrad = αgrad
        end
    end

    return (α, MaxEvaluations)
end

function ExactLineSearch(
        f::LeastSquaresF,
        x::AbstractArray,
        p::AbstractArray,
        MaxEvaluations::Integer
    )
    MaxEvaluations -= 1
    return (tomography(f, x, p), MaxEvaluations)
end


@doc raw"""
```julia
BroydenFletcherGoldfarbShanno(f::Union{LeastSquaresF, OracleF}, [x::AbstractVector{T}, ϵ::T=1e-6, MaxEvaluations::Integer=10000])
```

Computes the minimum of the input function `f`.

### Input

- `f` -- the input function to minimize.
- `x` -- the starting point, if not specified the default one for the function `f` is used.
- `ϵ` -- the tollerance for the stopping criteria.
- `m` -- maximum number of vector to store that compute the approximate hessian.
- `MaxEvaluations` -- maximum number of function evaluations. Both ```f.eval``` and ```f.grad``` are counted.

### Output

A named tuple containing:
- `x` -- the minimum found
- `eval` -- the value of the function at the minimum
- `grad` -- the gradient of the function at the minimum
- `RemainingEvaluations` -- the number of function evaluation not used.

"""
function BroydenFletcherGoldfarbShanno(
        f::Union{LeastSquaresF, OracleF};
        x::Union{Nothing, AbstractVector{T}}=nothing,
        ϵ::T=1e-6,
        MaxEvaluations::Integer=10000
    )::NamedTuple where {T}

    if isnothing(x)
        x = f.starting_point
    end

    gradient = f.grad(x)
    MaxEvaluations -= 1
    normgradient0 = norm(gradient)
    H = diagm(ones(length(x)))
    tmp1 = similar(H)
    tmp2 = similar(H)

    firstEvaluation = true

    while MaxEvaluations > 0 && norm(gradient) > ϵ * normgradient0
        p = -H * gradient # direction

        α, MaxEvaluations =
            if armijiowolfeorexact === :armijiowolfe || f isa OracleF
                ArmijoWolfeLineSearch(f, x, p, MaxEvaluations)
            elseif armijiowolfeorexact === :exact
                ExactLineSearch(f, x, p, MaxEvaluations)
            end

        previousx = x
        x = x + α * p

        previousgradient = gradient
        gradient = f.grad(x)
        MaxEvaluations -= 1

        s = x - previousx
        y = gradient - previousgradient
        ρ = inv(dot(y, s))

        # if its the first iteration then set H to an aproximation of the Hessian
        if firstEvaluation
            mul!(H, I, dot(y, s)/dot(y, y))
            firstEvaluation = false
        end

        if BFGSorDFP == :DFP
            # DFP update -------------------------------------------
            # H = H - (H * y * y' * H)/(y' * H * y) + (s * s')/(y' * s)

            mul!(tmp1, H * y * y', H)
            mul!(tmp2, s, s')
            H .+= -tmp1/dot(y, H, y) .+ ρ * tmp2
        elseif BFGSorDFP == :BFGS
            # BFGS update ------------------------------------------
            # H = (I - ρ * s * y') * H * (I - ρ * y * s') + ρ * s * s'

            mul!(tmp1, H * y, s')
            mul!(tmp2, s, s')
            H .+= ρ * ((1 + ρ * dot(y, H, y)) .* tmp2 .- tmp1 .- tmp1')
        end
    end

    return (;
        :x => x,
        :eval => f.eval(x),
        :grad => gradient,
        :RemainingEvaluations => MaxEvaluations)
end


function BroydenFletcherGoldfarbShannoDogleg(
        f::Union{LeastSquaresF, OracleF};
        x::Union{Nothing, AbstractVector{T}}=nothing,
        ϵ::T=1e-6,
        MaxEvaluations::Integer=10000
    )::NamedTuple where {T}

    if isnothing(x)
        x = f.starting_point
    end

    Δ = 1 # initial size of trust region
    smallestΔ = 1e-4 # smallest size where linear aproximation is applied

    gradient = f.grad(x)
    MaxEvaluations -= 1
    normgradient0 = norm(gradient)
    normgradient = normgradient0
    H = diagm(ones(length(x)))
    B = copy(H)
    tmp1 = similar(H)
    tmp2 = similar(H)
    tmp3 = similar(H)

    firstEvaluation = true

    while MaxEvaluations > 0 && norm(gradient) > ϵ * normgradient0
        # compute s by solving the subproblem min_s grad' * s + 0.5 s' * B * s with norm(s) ≤ Δ
        CauchyPoint = -(Δ/normgradient) * gradient
        τ = if dot(gradient, B, gradient) ≤ 0
                1
            else
                min((normgradient^3)/(Δ * dot(gradient, B, gradient)), 1)
            end

        if Δ ≤ smallestΔ || B == I
            # the Cauchy point is enought for small regions (linear aproximation)
            s = τ * CauchyPoint
        else
            pB = -H * gradient
            pU = -dot(gradient, gradient)/dot(gradient, B, gradient) * gradient

            if norm(pB) ≤ Δ
                # the region is larger than the dogleg
                s = pB
            elseif Δ ≤ norm(pU)
                # the region is smaller than the first step
                s = Δ/norm(pU) * pU
            else
                # solve the quadratic sistem for the dogleg
                one = dot(pU, (pB - pU))
                two = dot(pB - pU, pB - pU)
                three = dot(pU, pU)

                τ = (-one+two + sqrt(one^2 - three * two + two * Δ^2))/two
                s = pU + (τ - 1) * (pB - pU)
            end
        end

        previousx = x
        x = x + s

        previousgradient = gradient
        gradient = f.grad(x)
        normgradient = norm(gradient)
        MaxEvaluations -= 1

        y = gradient - previousgradient
        ρ = inv(dot(y, s))

        ared = f.eval(x) - f.eval(x + s) # actual reduction
        pred = -(dot(gradient, s) + 0.5 * dot(s, B, s)) # predicted reduction
        MaxEvaluations -= 2

        # expand or contract the region
        if (0.75 < ared/pred) && (0.8 * Δ < norm(s))
            Δ = 2 * Δ
        elseif (ared/pred < 0.1)
            Δ = 0.5 * Δ
        end

        # if its the first iteration then set H to an aproximation of the Hessian
        if firstEvaluation
            mul!(H, I, dot(y, s)/dot(y, y))
            firstEvaluation = false
        end

        if BFGSorDFP == :DFP
            # DFP update -------------------------------------------
            # H = H - (H * y * y' * H)/(y' * H * y) + (s * s')/(y' * s)

            mul!(tmp1, H * y * y', H)
            mul!(tmp2, s, s')
            H .+= -tmp1/dot(y, H, y) .+ ρ * tmp2

            mul!(tmp1, y, s')
            tmp2 = I - ρ * tmp1
            mul!(tmp1, tmp2, B)
            mul!(tmp3, tmp1, tmp2')
            mul!(tmp2, y, y')
            B .= tmp3 .+ ρ * tmp2
        elseif BFGSorDFP == :BFGS
            # BFGS update ------------------------------------------
            # H = (I - ρ * s * y') * H * (I - ρ * y * s') + ρ * s * s'

            mul!(tmp1, H * y, s')
            mul!(tmp2, s, s')
            H .+= ρ * ((1 + ρ * dot(y, H, y)) .* tmp2 .- tmp1 .- tmp1')

            mul!(tmp1, B * s * s', B)
            mul!(tmp2, y, y')
            B .+= -tmp1/dot(s, B, s) .+ ρ * tmp2
        end
    end

    return (;
        :x => x,
        :eval => f.eval(x),
        :grad => gradient,
        :RemainingEvaluations => MaxEvaluations)
end

end # module BFGS