# Files
# cmdla/11-22/NCG.jl
# 547 lines, 18 KiB, Julia

using LinearAlgebra, Printf, Plots
"""
    NCG(f; x=nothing, wf=0, rstart=0, eps=1e-6, astart=1, MaxFeval=1000,
        m1=0.01, m2=0.9, tau=0.9, sfgrd=0.2, MInf=-Inf, mina=1e-16,
        plt=nothing, plotatend=false, Plotf=0, printing=true)

Apply a Nonlinear Conjugate Gradient algorithm to the minimization of the
provided function `f`, which must have the following interface:

    (v, g, _) = f(x)

If `x === nothing`, `v` is the best known lower bound on the unconstrained
global optimum of f() (`-Inf` if f() is unbounded below or no such
information is available) and `g` is the standard starting point from which
the algorithm should start. Otherwise `v = f(x)` and `g` is the gradient of
f() at `x` (or a subgradient if f() is not differentiable there, which it
should not be if you are applying this method to it). The third output is
ignored.

# Keywords
- `x`: ([ n x 1 ] real vector or `nothing`, default `nothing`) starting
  point; if `nothing`, the default starting point provided by f() is used.
- `wf`: which NCG formula to use: 0 = Fletcher-Reeves, 1 = Polak-Ribiere,
  2 = Hestenes-Stiefel, 3 = Dai-Yuan.
- `rstart`: if > 0, restarts (setting beta = 0) are performed every
  `n * rstart` iterations.
- `eps`: accuracy of the stopping criterion: stop when the norm of the
  gradient is <= `eps`. A negative value means the *relative* criterion
  `|| g(x) || <= (-eps) * || g(x_0) ||`.
- `astart`: starting value of alpha in the line search (> 0).
- `MaxFeval`: maximum number of function evaluations (iterations are not
  more than that, since each iteration costs at least one evaluation).
- `m1`: first Armijo-Wolfe parameter (sufficient decrease), in (0, 1).
- `m2`: second Armijo-Wolfe parameter (strong curvature condition). It
  should be in (0, 1); if not, it is taken to mean that the simpler
  Backtracking line search should be used instead.
- `tau`: scaling parameter (< 1) of the line searches: the step is divided
  by `tau` in the extrapolation phase of Armijo-Wolfe, and multiplied by
  `tau` in Backtracking.
- `sfgrd`: safeguard parameter of the quadratic interpolation: the new
  point is kept inside `[ as * (1 + sfgrd), am * (1 - sfgrd) ]`, being
  `[ as, am ]` the current interval, whatever interpolation says. If you
  experience problems with the line search taking too many iterations to
  converge at "nasty" points, try to increase this.
- `MInf`: if a value f() <= `MInf` is found, the problem is declared
  unbounded below (a "finite -Inf") and computation stops.
- `mina`: if a stepsize <= `mina` is determined, this is taken as an
  indication that something has gone wrong (the direction is likely not of
  descent) and computation stops; `mina = 0` skips this test.
- `plt`: an existing `Plots.Plot` to draw into, or `nothing` to create one
  lazily when plotting is requested.
- `plotatend`: whether to `display` the plot upon termination.
- `Plotf`: 1 = the trajectory is plotted (when n == 2), 2 = the function
  value / gap is plotted iteration-wise, 3 = function-evaluation-wise;
  anything else = nothing is plotted.
- `printing`: whether the iteration log is printed.

# Returns
`(x, status)`: `x` is the best solution found so far, and `status` is one
of `"optimal"` (gradient norm below threshold), `"unbounded"` (f() went
below `MInf`), `"stopped"` (MaxFeval exhausted; `x` is the best solution
found so far, but not necessarily an optimal one) or `"error"` (a stepsize
<= `mina` was produced, see `mina` above).
"""
function NCG(f;
             x::Union{Nothing, Vector} = nothing,
             wf::Integer = 0,
             rstart::Integer = 0,
             eps::Real = 1e-6,            # NOTE: shadows Base.eps here
             astart::Real = 1,
             MaxFeval::Integer = 1000,
             m1::Real = 0.01,
             m2::Real = 0.9,
             tau::Real = 0.9,
             sfgrd::Real = 0.2,
             MInf::Real = -Inf,
             mina::Real = 1e-16,
             plt = nothing,               # untyped: keeps Plots a soft dependency
             plotatend::Bool = false,
             Plotf::Integer = 0,
             printing::Bool = true)
    # Original MATLAB code by Antonio Frangioni, 10-11-22, version 1.21.

    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    # inner functions - - - - - - - - - - - - - - - - - - - - - - - - - - -
    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

    function f2phi(alpha, derivate = false)
        # Tomography of f along the current direction d:
        #
        #     phi(alpha)  = f(x + alpha * d)
        #     phi'(alpha) = < \nabla f(x + alpha * d), d >
        #
        # The derivative is computed (and returned as second output) only
        # if derivate == true, otherwise the second output is nothing.
        # Side effects: stores the trial point in lastx and its gradient in
        # lastg, increases feval, and logs value/gap if Plotf > 2.
        lastx = x + alpha * d
        phi, lastg, _ = f(lastx)
        if Plotf > 2
            if fStar > -Inf
                push!(gap, (phi - fStar) / max(abs(fStar), 1))
            else
                push!(gap, phi)
            end
        end
        feval += 1
        if derivate
            return (phi, dot(d, lastg))
        end
        return (phi, nothing)
    end

    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

    function ArmijoWolfeLS(phi0, phip0, as, m1, m2, tau)
        # Armijo-Wolfe line search (strong Wolfe condition), with
        # phi0 = phi(0) and phip0 = phi'(0) < 0.
        #
        # Phase 1 (extrapolation): as > 0 is the first step tested; while
        # phi'(as) < 0, as is divided by tau < 1 (hence increased).
        #
        # Phase 2 (interpolation): safeguarded quadratic interpolation on
        # the bracketing interval [am, as] until strong Wolfe holds.
        #
        # Returns the step and the corresponding f-value.
        lsiter = 1                # count iterations of first phase
        local phips, phia         # function-scope: survive the loops below
        while feval <= MaxFeval
            phia, phips = f2phi(as, true)
            if (phia <= phi0 + m1 * as * phip0) && (abs(phips) <= -m2 * phip0)
                if printing
                    @printf("\t%2d", lsiter)
                end
                a = as
                return (a, phia)  # Armijo + strong Wolfe satisfied, done
            end
            if phips >= 0         # derivative is nonnegative: bracketed
                break
            end
            as = as / tau         # tau < 1 => the step is increased
            lsiter += 1
        end
        if printing
            @printf("\t%2d ", lsiter)
        end
        lsiter = 1                # count iterations of second phase
        am = 0
        a = as
        phipm = phip0
        while (feval <= MaxFeval) && ((as - am) > mina) && (phips > 1e-12)
            # new point by safeguarded quadratic interpolation
            a = (am * phips - as * phipm) / (phips - phipm)
            a = max(am + (as - am) * sfgrd, min(as - (as - am) * sfgrd, a))
            phia, phip = f2phi(a, true)
            if (phia <= phi0 + m1 * a * phip0) && (abs(phip) <= -m2 * phip0)
                break             # Armijo + strong Wolfe satisfied, done
            end
            # restrict the interval based on the sign of phi'(a)
            if phip < 0
                am = a
                phipm = phip
            else
                as = a
                if as <= mina
                    break
                end
                phips = phip
            end
            lsiter += 1
        end
        if printing
            @printf("%2d", lsiter)
        end
        return (a, phia)
    end

    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

    function BacktrackingLS(phi0, phip0, as, m1, tau)
        # Backtracking line search, with phi0 = phi(0), phip0 = phi'(0) < 0.
        # as > 0 is the first step tested; it is multiplied by tau < 1
        # until the Armijo condition with parameter m1 is satisfied.
        # Returns the step and the corresponding f-value.
        lsiter = 1                # count ls iterations
        phia = phi0               # while loops have their own scope: keep
                                  # phia a function-level local, defined
                                  # even if the loop body never runs
        while feval <= MaxFeval && as > mina
            phia, _ = f2phi(as)
            if phia <= phi0 + m1 * as * phip0   # Armijo satisfied
                break                           # we are done
            end
            as *= tau
            lsiter += 1
        end
        if printing
            @printf("\t%2d", lsiter)
        end
        return (as, phia)
    end

    # plotting setup- - - - - - - - - - - - - - - - - - - - - - - - - - - -
    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    # NOTE: the original code unconditionally reset Plotf = 1 here, which
    # silently defeated the Plotf keyword argument; the override is gone.
    Interactive = false           # pause (readline) at every iteration
    PXY = Matrix{Float64}(undef, 2, 0)   # trajectory points when Plotf == 1
    local gap
    if Plotf > 1
        # expected number of iterations, used to size the x-axis
        MaxIter = Plotf == 2 ? 50 : 70
        gap = Float64[]
    end
    # create the plot lazily, and only when something will be drawn into it
    if plt === nothing
        if Plotf == 2
            plt = plot(xlims = (0, MaxIter), ylims = (1e-15, 1e+1))
        elseif Plotf > 2
            plt = plot(xlims = (0, MaxIter))
        elseif Plotf == 1
            plt = plot()
        end
    end

    # reading and checking input- - - - - - - - - - - - - - - - - - - - - -
    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    local fStar
    if isnothing(x)
        fStar, x, _ = f(nothing)  # also get the default starting point
    else
        fStar, _, _ = f(nothing)  # only get the lower bound
    end
    n = size(x, 1)
    if wf < 0 || wf > 3
        error("unknown NCG formula $wf")
    end
    if astart <= 0
        error("astart must be > 0")
    end
    if m1 <= 0 || m1 >= 1
        error("m1 is not in (0 ,1)")
    end
    AWLS = (m2 > 0 && m2 < 1)     # otherwise: Backtracking line search
    if tau <= 0 || tau >= 1
        error("tau is not in (0 ,1)")
    end
    if sfgrd <= 0 || sfgrd >= 1
        error("sfgrd is not in (0, 1)")
    end
    if mina < 0
        error("mina is < 0")
    end

    # "global" variables (shared with the inner functions)- - - - - - - - -
    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    lastx = zeros(n)              # last point visited in the line search
    lastg = zeros(n)              # gradient at lastx
    pastg = zeros(n)              # gradient at the previous iterate
    pastd = zeros(n)              # previous search direction
    d = zeros(n)                  # NCG's direction
    feval = 0                     # f() evaluations count (shared with LSs)
    status = "error"

    # initializations - - - - - - - - - - - - - - - - - - - - - - - - - - -
    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    if printing
        @printf("NCG method ")
        if wf == 0
            @printf("(Fletcher-Reeves)\n")
        elseif wf == 1
            @printf("(Polak-Ribiere)\n")
        elseif wf == 2
            @printf("(Hestenes-Stiefel)\n")
        elseif wf == 3
            @printf("(Dai-Yuan)\n")
        end
        if fStar > -Inf
            @printf("feval\trel gap")
        else
            @printf("feval\tf(x)")
        end
        @printf("\t\t|| g(x) ||\tbeta\tls feval\ta*\n\n")
    end
    v, _ = f2phi(0)               # f and gradient at the starting point
    ng = norm(lastg)
    if eps < 0
        # relative criterion: ng <= eps * ng0 reads ng <= |eps| * ||g_0||
        ng0 = -ng
    else
        ng0 = 1                   # un-scaled stopping criterion
    end
    iter = 1                      # iterations count (!= f() evaluations)

    # main loop - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    while true
        # output statistics - - - - - - - - - - - - - - - - - - - - - - - -
        if fStar > -Inf
            gapk = (v - fStar) / max(abs(fStar), 1)
            if printing
                @printf("%4d\t%1.4e\t%1.4e", feval, gapk, ng)
            end
            if Plotf == 2
                push!(gap, gapk)
            end
        else
            if printing
                @printf("%4d\t%1.8e\t\t%1.4e", feval, v, ng)
            end
            if Plotf == 2
                push!(gap, v)
            end
        end
        # stopping criteria - - - - - - - - - - - - - - - - - - - - - - - -
        if ng <= eps * ng0
            status = "optimal"
            break
        end
        if feval > MaxFeval
            status = "stopped"
            break
        end
        # compute search direction- - - - - - - - - - - - - - - - - - - - -
        # formulae could be streamlined somewhat and some norms could be
        # saved from previous iterations
        if iter == 1              # first iteration: standard gradient step
            d = -lastg
            if printing
                @printf("\t")
            end
        else                      # normal iterations: use the NCG formula
            if rstart > 0 && mod(iter, n * rstart) == 0
                beta = 0          # ... unless a restart is being performed
                if printing
                    @printf("\t(res)")
                end
            else
                if wf == 0        # Fletcher-Reeves
                    beta = (ng / norm(pastg))^2
                elseif wf == 1    # Polak-Ribiere (restarted: beta+ >= 0)
                    beta = dot(lastg, lastg - pastg) / norm(pastg)^2
                    beta = max(beta, 0)
                elseif wf == 2    # Hestenes-Stiefel (restarted: beta >= 0)
                    beta = dot(lastg, lastg - pastg) /
                           dot(lastg - pastg, pastd)
                    if beta < 0
                        beta = 0
                    end
                else              # wf == 3: Dai-Yuan
                    beta = ng^2 / dot(lastg - pastg, pastd)
                end
                if printing
                    @printf("\t%1.4f", beta)
                end
            end
            if beta != 0
                d = -lastg + beta * pastd
            else
                d = -lastg        # restart: plain steepest descent
            end
        end
        pastg = lastg             # previous gradient
        pastd = d                 # previous search direction
        # compute step size - - - - - - - - - - - - - - - - - - - - - - - -
        phip0 = dot(lastg, d)
        local a
        if AWLS
            a, v = ArmijoWolfeLS(v, phip0, astart, m1, m2, tau)
        else
            a, v = BacktrackingLS(v, phip0, astart, m1, tau)
        end
        # output statistics - - - - - - - - - - - - - - - - - - - - - - - -
        if printing
            @printf("\t%1.2e\n", a)
        end
        if a <= mina
            status = "error"
            break
        end
        if v <= MInf
            status = "unbounded"
            break
        end
        # compute new point - - - - - - - - - - - - - - - - - - - - - - - -
        # possibly record the trajectory for plotting
        if n == 2 && Plotf == 1
            PXY = hcat(PXY, hcat(x, lastx))
        end
        x = lastx                 # the LS left the accepted point in lastx
        ng = norm(lastg)          # ... and its gradient in lastg
        # iterate - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
        iter += 1
        if Interactive
            readline()
        end
    end
    # end of main loop- - - - - - - - - - - - - - - - - - - - - - - - - - -
    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    if Plotf >= 2
        plot!(plt, gap)
    elseif Plotf == 1 && n == 2
        plot!(plt, PXY[1, :], PXY[2, :])
    end
    if plotatend
        if plt === nothing        # nothing was drawn: show an empty plot
            plt = plot()
        end
        display(plt)
    end
    return (x, status)
end # the end- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -