% chktex-file 9 chktex-file 17
\chapter{Algorithms}\label{ch: algorithms}

\section{QR}
The algorithm is implemented for a general input matrix $A \in \mathbb{R}^{m \times n}$, where $m$ may differ from $n$, i.e.\ the matrix can be rectangular \textit{horizontally} or \textit{vertically}. In this version we store, in a dedicated data structure, a matrix $\Upsilon \in \mathbb{R}^{m \times n}$ of the following form ($m > n$ in this example):
\begin{equation*}
  \Upsilon = {(\upsilon_{i,j})}_{i,j} =
  \begin{tikzpicture}[baseline=-1ex]
    \matrix[%
    matrix of math nodes,
    nodes in empty cells,
    left delimiter={[},right delimiter={]},
    inner xsep=2pt, column sep=6pt,
    ] (m) {%
      \vphantom{1} & * & \cdots & * \\
      & \vphantom{1} & \ddots & \vdots \\
      & & \vphantom{1} & * \\
      & & & \vphantom{1} \\
      u_1 & u_2 & \cdots & u_n \\
      \vphantom{1} & \vphantom{1} & \vphantom{1} & \vphantom{1} \\
    };
    \node[rectangle, draw, fit={(m-1-1) (m-6-1)}, inner sep=-1.5pt, text width=22pt] {};
    \node[rectangle, draw, fit={(m-2-2) (m-6-2)}, inner sep=-1.5pt, text width=22pt] {};
    \node[rectangle, draw, fit={(m-3-3) (m-6-3)}, inner sep=-1.5pt, text width=22pt] {};
    \node[rectangle, draw, fit={(m-4-4) (m-6-4)}, inner sep=-1.5pt, text width=22pt] {};
  \end{tikzpicture}
\end{equation*}
\begin{center}
  $u_k \in \mathbb{R}^{m - k + 1},\ 1 \leq k \leq n$
\end{center}
together with the values of the diagonal of $R$, stored in a vector $d \in \mathbb{R}^{n}$. The $*$ entries are the elements of the upper triangular factor computed by the QR factorization, produced by line 6 of \hyperref[algo: thinQR]{Algorithm~1}. In this way we can lazily perform the products $Qy$ and $Q^T y$ by means of the stored Householder vectors $u_1, \dots, u_n$ (a minimal sketch of these operations is given at the end of this section). On the other hand, to compute a product between the upper triangular part of $\Upsilon$ and an input vector, we reconstruct the upper triangular matrix by taking the elements $\upsilon_{i,j}$ with $j > i$ and attaching the vector $d$ as its diagonal. The zeros below the diagonal of $R$ are ignored.

\begin{algorithm}[H]
  \SetAlgoLined%
  \caption{Thin QR}\label{algo: thinQR}
  \SetKwInOut{Input}{Input}
  \SetKwInOut{Output}{Output}
  \BlankLine%
  \Input{$A \in \mathbb{R}^{m \times n}$}
  \Output{$\Upsilon \in \mathbb{R}^{m \times n},\ d \in \mathbb{R}^{\min(m, n)}$ encoding the implicit $QR$ factorization of $A$}
  \BlankLine%
  $\Upsilon = copy(A)$ \\
  $d = zeros(\min(m, n))$ \\
  \For{$k \in 1, \dots, \min(m, n)$}{
    $u_k, s_k = householder\_vector(\Upsilon[k:m, k])$\\
    $d_k = s_k$ \\
    $\Upsilon[k:m, k+1:n] = \Upsilon[k:m, k+1:n] - 2u_k(u_k^T \Upsilon[k:m, k+1:n])$\\
    $\Upsilon[k:m, k] = u_k$
  }
  \Return$\Upsilon, d$
\end{algorithm}

\begin{algorithm}[H]
  \SetAlgoLined%
  \caption{householder\_vector}\label{algo: householder_vector}
  \SetKwInOut{Input}{Input}
  \SetKwInOut{Output}{Output}
  \BlankLine%
  \Input{$x \in \mathbb{R}^d$}
  \Output{$u \in \mathbb{R}^{d},\ s \in \mathbb{R}$ Householder vector of $x$ and corresponding pivot}
  \BlankLine%
  $s = \norm{x}$ \\
  \If{$x_1 \geq 0$}{
    $s = -s$
  }
  $u = copy(x)$ \\
  $u_1 = u_1 - s$ \\
  $u = u\ / \norm{u}$ \\
  \Return$u, s$
\end{algorithm}

For the complexity analysis we assume $m > n$; the case $n > m$ is analogous. The time complexity of the algorithm is $\Theta\bigl(mn^2 \bigr) \approx \Theta\bigl(n^3 \bigr)$, since $m \approx n$ in (P). We will see in the \hyperref[ch: experiments]{Experiments} chapter that the running time scales linearly with $m$, the size of $\hat{X}$, as expected.
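For illustration only, the storage scheme and the lazy application of $Q^T$ can be sketched as follows in NumPy (0-based indexing; the function and variable names are ours and do not refer to the project code):
\begin{verbatim}
import numpy as np

def householder_vector(x):
    # Algorithm 2: normalized Householder vector u and pivot s.
    s = np.linalg.norm(x)
    if x[0] >= 0:
        s = -s
    u = x.copy()
    u[0] -= s
    return u / np.linalg.norm(u), s

def thin_qr(A):
    # Algorithm 1: Householder vectors are stored below the diagonal of U,
    # R is kept above the diagonal of U plus the vector d = diag(R).
    U = A.astype(float)
    m, n = U.shape
    d = np.zeros(min(m, n))
    for k in range(min(m, n)):
        u, s = householder_vector(U[k:, k])
        d[k] = s
        U[k:, k + 1:] -= 2.0 * np.outer(u, u @ U[k:, k + 1:])
        U[k:, k] = u
    return U, d

def apply_Qt(U, y):
    # Lazy product Q^T y via the stored Householder reflections.
    y = y.astype(float)
    for k in range(min(U.shape)):
        u = U[k:, k]
        y[k:] -= 2.0 * u * (u @ y[k:])
    return y
\end{verbatim}
The product $Qy$ is obtained analogously by applying the same reflections in reverse order, while the strictly upper part of $\Upsilon$ together with $d$ reconstructs $R$ when it is needed explicitly.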
\newpage
\section{L-BFGS}
We follow the notation of \textit{Numerical Optimization}~\cite{Numerical-Optimization-2006} and define $f_k = f(x_k)$ and $\nabla f_k = \nabla f(x_k)$.

\begin{algorithm}[H]
  \SetAlgoLined%
  \caption{Limited Memory BFGS}\label{algo: L-BFGS}
  \SetKwInOut{Input}{Input}
  \SetKwInOut{Output}{Output}
  \BlankLine%
  \Input{$\textbf{f}: \mathbb{R}^n \longrightarrow \mathbb{R},\ \textbf{x} \in \mathbb{R}^n,\ m \text{ memory, } \epsilon \text{ tolerance}$}
  \Output{${\bf x^*}\ \text{ending point},\ {\bf f(x^*)},\ {\bf \nabla f(x^*)}$}
  \BlankLine%
  $k = 0$ \\
  \While{$\norm{\nabla f_k} \geq \epsilon \norm{\nabla f_0}$} {
    \uIf{storage is empty}{
      $H_k^0 = I$
    }\uElse{
      $H_k^0 = \frac{\langle y_{k-1}, s_{k-1} \rangle}{\norm{y_{k-1}}^2} \cdot I$
    }
    Calculate $p_k = -H_k \nabla{f_k}$ with \hyperref[algo: L-BFGS Two-Loop Recursion]{\textbf{Algorithm 4}} \\
    Choose $\alpha_k$ satisfying the Armijo-Wolfe conditions or by exact line search \\
    $x_{k+1} = x_k + \alpha_k p_k$ \\
    $s_k = x_{k+1} - x_k$ \\
    $y_k = \nabla f_{k+1} - \nabla f_k$ \\
    $\mathit{curvature} = \langle y_k, s_k \rangle$ \\
    \uIf{$\mathit{curvature} \leq 10^{-16}$}{
      free the storage and restart from a gradient descent step
    }\uElse{
      $\rho_k = \mathit{curvature}^{-1}$ \\
      Discard the oldest triplet $\{s_{k-m}, y_{k-m}, \rho_{k-m}\}$ from storage \\
      Save $s_k, y_k, \rho_k$
    }
    $k = k + 1$
  }
  \Return$x_k$, $f_k$, $\nabla f_k$
\end{algorithm}

\begin{algorithm}[H]
  \SetAlgoLined%
  \caption{Limited Memory BFGS {-} Two-Loop Recursion}\label{algo: L-BFGS Two-Loop Recursion}
  $q = \nabla f_k$ \\
  \For{$i = (k - 1), \dots, (k - m)$}{
    $\alpha_i = \rho_i s_i^T q$ \\
    $q = q - \alpha_i y_i$ \\
  }
  $r = H_k^0 q$ \\
  \For{$i = (k - m), \dots, (k - 1)$}{
    $\beta = \rho_i y_i^T r$ \\
    $r = r + s_i\bigl(\alpha_i - \beta\bigr)$ \\
  }
  \Return$-r$
\end{algorithm}

In our implementation we keep the triplets $(s_k, y_k, \rho_k)$ in a circular buffer with capacity $m$ and the values of $\alpha_i$ of \hyperref[algo: L-BFGS Two-Loop Recursion]{Algorithm 4} in a stack, so that no explicit indices are needed. When the curvature term $\langle y_k, s_k \rangle$ is too small, we free the storage and restart with a gradient descent step. We prefer an exact line search over an inexact one to compute the step size, since for our problem its computational cost is lower.

\subsection*{Convergence}
To prove that the implemented method converges to the global minimum of the function we optimize, we follow~\cite{convergence_lbfgs} and state the following assumptions about our problem:
\begin{enumerate}
\item\label{algo: convergence1} $f \in C^2$
\item\label{algo: convergence2} The level set $\mathcal{L} = \{ x \in \mathbb{R}^n\ |\ f(x) \leq f(x_0) \}$ is convex
\item\label{algo: convergence3} $\exists\ M_1, M_2 \in \mathbb{R}^+$ such that
  \begin{equation*}
    M_1\norm{z}^2 \leq z^T G(x) z \leq M_2\norm{z}^2\label{eq:6}
  \end{equation*}
  $\forall z \in \mathbb{R}^n$ and $\forall x \in \mathcal{L}$
\end{enumerate}
Following the notation of~\cite{convergence_lbfgs}, we define:
\[ G(x) \coloneqq \nabla^{2}f(x) \]
\[ \bar{G}_k \coloneqq \int_0^1 G(x_k + \tau \alpha_k p_k)\, d\tau \]
From Taylor's theorem:
\begin{equation}\label{algo: definition y_k}
  y_k = \bar{G}_k \alpha_k p_k = \bar{G}_k s_k
\end{equation}
Assumption~\ref{algo: convergence1} holds for our problem by the definition of $f$. Assumption~\ref{algo: convergence2} is proved by \autoref{definitions: hessian tomography}. Assumption~\ref{algo: convergence3} is also a consequence of the fact that the Hessian of $f$ is constant.
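To make the last claim concrete, here is a short verification of ours (not taken from~\cite{convergence_lbfgs}), under the assumption that the constant Hessian $G \coloneqq \nabla^2 f$ is positive definite: the integral defining $\bar{G}_k$ collapses and Assumption~\ref{algo: convergence3} reduces to an eigenvalue bound,
\begin{gather*}
  \bar{G}_k = \int_0^1 G\, d\tau = G, \qquad y_k = G s_k, \\
  \lambda_{\min}(G)\norm{z}^2 \leq z^T G z \leq \lambda_{\max}(G)\norm{z}^2 \quad \forall z \in \mathbb{R}^n,
\end{gather*}
so one may take $M_1 = \lambda_{\min}(G)$ and $M_2 = \lambda_{\max}(G)$.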
% \[ z_k \coloneqq {\bar{G}_k}^{1/2} s_k \]
\begin{mtheo}
  Let $B_0$ be any symmetric positive definite initial matrix, and let $x_0$ be a starting point for which Assumptions~\ref{algo: convergence1},~\ref{algo: convergence2} and~\ref{algo: convergence3} hold. Then the sequence $\{x_k\}$ generated by the L-BFGS algorithm converges linearly to the minimizer $x^*$ of $f$.
\end{mtheo}
\begin{mproof}
  Using \autoref{algo: definition y_k} and Assumption~\ref{algo: convergence3}:
  \[ M_1 \norm{s_k}^2 \leq y_k^T s_k \leq M_2 \norm{s_k}^2 \]
  and:
  \[ \frac{\norm{y_k}^2}{y_k^T s_k} = \frac{s_k^T \bar{G}_k^2 s_k}{s_k^T \bar{G}_k s_k} \leq M_2 \]
  Both the trace and the determinant of the Hessian approximation $B_{k+1}$ can be bounded in terms of the trace and determinant of the starting matrix from which it is constructed:
  \begin{align*}
    \Tr(B_{k+1}) &\leq \Tr(B_k^{(0)}) + \Tilde{m} M_2 \leq M_3 \\
    \det(B_{k+1}) &= \det(B_k^{(0)}) \cdot \prod_{l=0}^{\Tilde{m}-1} \frac{y_l^T s_l}{s_l^T B_k^{(l)} s_l} \geq \det\bigl(B_k^{(0)}\bigr) {\left(\frac{M_1}{M_3}\right)}^{\Tilde{m}} \geq M_4
  \end{align*}
  where $\Tilde{m}$ is the memory size and $M_3$ and $M_4$ are chosen appropriately in $\mathbb{R}^+$. From these two bounds it follows that, for some constant $\delta > 0$:
  \[ \cos(\theta_k) = \frac{s_k^T B_k s_k}{\norm{s_k} \norm{B_k s_k}} \geq \delta \]
  With exact line search the Armijo condition $f(x_k + \alpha_k p_k) \leq f(x_k) + m_1 \alpha_k \nabla f(x_k)^T p_k$ is always satisfied, provided the constant $m_1$ is small enough not to exclude the exact minimizer along $p_k$, and the strong Wolfe condition $\lvert \nabla f(x_k + \alpha_k p_k)^T p_k \rvert \leq m_3 \lvert \nabla f(x_k)^T p_k \rvert$ is also always satisfied, since with exact line search $\nabla f(x_k + \alpha_k p_k)^T p_k = O(u)$. From the two conditions and Assumptions~\ref{algo: convergence1} and~\ref{algo: convergence2} it follows that:
  \begin{align*}
    & f(x_{k+1}) - f(x^*) \leq \bigl(1 - c \cos^2(\theta_k)\bigr) \bigl(f(x_k) - f(x^*)\bigr) \\
    \implies& f(x_k) - f(x^*) \leq {(1 - c \cdot \delta^2)}^k \bigl(f(x_0) - f(x^*)\bigr) \\
    \implies& f(x_k) - f(x^*) \leq r^k \bigl(f(x_0) - f(x^*)\bigr)
  \end{align*}
  for some $r \in [0, 1)$. Using Assumption~\ref{algo: convergence3}:
  \begin{gather*}
    \frac{1}{2} M_1 \norm{x_k - x^*}^2 \leq f(x_k) - f(x^*) \\
    \implies \norm{x_k - x^*} \leq r^{k/2} {\left( 2\, \frac{f(x_0) - f(x^*)}{M_1} \right)}^{1/2}
  \end{gather*}
  so the sequence $\{x_k\}$ converges linearly.
\end{mproof}
The implementation of L-BFGS that uses the Armijo-Wolfe line search also satisfies these assumptions, so it too converges linearly to $x^*$.

%%% Local Variables:
%%% mode: latex
%%% TeX-master: "../main"
%%% TeX-command-extra-options: "-shell-escape"
%%% End: