% chktex-file 9 chktex-file 17
\chapter{Algorithms}\label{ch: algorithms}

\section{QR}
The algorithm is implemented for a general input matrix $A \in \mathbb{R}^{m \times n}$, where $m$ may differ from $n$, i.e.\ the matrix can be rectangular \textit{horizontally} or \textit{vertically}. In this version we store, in a dedicated data structure, a matrix $\Upsilon \in \mathbb{R}^{m \times n}$ of the following form ($m > n$ in this example):
\begin{equation*}
  \Upsilon = {(\upsilon_{i,j})}_{i,j} =
  \begin{tikzpicture}[baseline=-1ex]
    \matrix[%
    matrix of math nodes,
    nodes in empty cells,
    left delimiter={[},right delimiter={]},
    inner xsep=2pt, column sep=6pt,
    ] (m) {%
      \vphantom{1} & * & \cdots & * \\
      & \vphantom{1} & \ddots & \vdots \\
      & & \vphantom{1} & * \\
      & & & \vphantom{1} \\
      u_1 & u_2 & \cdots & u_n \\
      \vphantom{1} & \vphantom{1} & \vphantom{1} & \vphantom{1} \\
    };
    \node[rectangle, draw, fit={(m-1-1) (m-6-1)}, inner sep=-1.5pt, text width=22pt] {};
    \node[rectangle, draw, fit={(m-2-2) (m-6-2)}, inner sep=-1.5pt, text width=22pt] {};
    \node[rectangle, draw, fit={(m-3-3) (m-6-3)}, inner sep=-1.5pt, text width=22pt] {};
    \node[rectangle, draw, fit={(m-4-4) (m-6-4)}, inner sep=-1.5pt, text width=22pt] {};
  \end{tikzpicture}
\end{equation*}
\begin{center}
  $u_k \in \mathbb{R}^{m - k + 1},\ 1 \leq k \leq n$
\end{center}
together with the values of the diagonal of $R$, stored in a vector $d \in \mathbb{R}^{n}$. The $*$ entries are the elements of the upper triangular factor computed by the QR factorization, produced by line 6 of \hyperref[algo: thinQR]{Algorithm~1}. In this way we can lazily perform the products $Qy$ and $Q^T y$ by means of the stored Householder vectors $u_1, \dots, u_n$ (a minimal sketch of these operations is given at the end of this section). On the other hand, to compute a product between the upper triangular part of $\Upsilon$ and an input vector, we reconstruct the upper triangular matrix by taking the elements $\upsilon_{i,j}$ with $j > i$ and attaching the vector $d$ as its diagonal. The zeros below the diagonal of $R$ are ignored.

\begin{algorithm}[H]
  \SetAlgoLined%
  \caption{Thin QR}\label{algo: thinQR}
  \SetKwInOut{Input}{Input}
  \SetKwInOut{Output}{Output}
  \BlankLine%
  \Input{$A \in \mathbb{R}^{m \times n}$}
  \Output{$\Upsilon \in \mathbb{R}^{m \times n},\ d \in \mathbb{R}^{\min(m, n)}$ encoding the implicit $QR$ factorization of $A$}
  \BlankLine%
  $\Upsilon = copy(A)$ \\
  $d = zeros(\min(m, n))$ \\
  \For{$k \in 1, \dots, \min(m, n)$}{
    $u_k, s_k = householder\_vector(\Upsilon[k:m, k])$\\
    $d_k = s_k$ \\
    $\Upsilon[k:m, k+1:n] = \Upsilon[k:m, k+1:n] - 2u_k(u_k^T \Upsilon[k:m, k+1:n])$\\
    $\Upsilon[k:m, k] = u_k$
  }
  \Return$\Upsilon, d$
\end{algorithm}

\begin{algorithm}[H]
  \SetAlgoLined%
  \caption{householder\_vector}\label{algo: householder_vector}
  \SetKwInOut{Input}{Input}
  \SetKwInOut{Output}{Output}
  \BlankLine%
  \Input{$x \in \mathbb{R}^d$}
  \Output{$u \in \mathbb{R}^{d},\ s \in \mathbb{R}$ Householder vector of $x$ and corresponding pivot}
  \BlankLine%
  $s = \norm{x}$ \\
  \If{$x_1 \geq 0$}{
    $s = -s$
  }
  $u = copy(x)$ \\
  $u_1 = u_1 - s$ \\
  $u = u\ / \norm{u}$ \\
  \Return$u, s$
\end{algorithm}

For the complexity analysis we assume $m > n$; the case $n > m$ is analogous. The time complexity of the algorithm is $\Theta\bigl(mn^2 \bigr) \approx \Theta\bigl(n^3 \bigr)$, since $m \approx n$ in (P). We will see in the \hyperref[ch: experiments]{Experiments} chapter that the running time scales linearly with $m$, the size of $\hat{X}$, as expected.
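For illustration only, the storage scheme and the lazy application of $Q^T$ can be sketched as follows in NumPy (0-based indexing; the function and variable names are ours and do not refer to the project code):
\begin{verbatim}
import numpy as np

def householder_vector(x):
    # Algorithm 2: normalized Householder vector u and pivot s.
    s = np.linalg.norm(x)
    if x[0] >= 0:
        s = -s
    u = x.copy()
    u[0] -= s
    return u / np.linalg.norm(u), s

def thin_qr(A):
    # Algorithm 1: Householder vectors are stored below the diagonal of U,
    # R is kept above the diagonal of U plus the vector d = diag(R).
    U = A.astype(float)
    m, n = U.shape
    d = np.zeros(min(m, n))
    for k in range(min(m, n)):
        u, s = householder_vector(U[k:, k])
        d[k] = s
        U[k:, k + 1:] -= 2.0 * np.outer(u, u @ U[k:, k + 1:])
        U[k:, k] = u
    return U, d

def apply_Qt(U, y):
    # Lazy product Q^T y via the stored Householder reflections.
    y = y.astype(float)
    for k in range(min(U.shape)):
        u = U[k:, k]
        y[k:] -= 2.0 * u * (u @ y[k:])
    return y
\end{verbatim}
The product $Qy$ is obtained analogously by applying the same reflections in reverse order, while the strictly upper part of $\Upsilon$ together with $d$ reconstructs $R$ when it is needed explicitly.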
\newpage
\section{L-BFGS}
We follow the notation of \textit{Numerical Optimization}~\cite{Numerical-Optimization-2006} and define $f_k = f(x_k)$ and $\nabla f_k = \nabla f(x_k)$.

\begin{algorithm}[H]
  \SetAlgoLined%
  \caption{Limited Memory BFGS}\label{algo: L-BFGS}
  \SetKwInOut{Input}{Input}
  \SetKwInOut{Output}{Output}
  \BlankLine%
  \Input{$\textbf{f}: \mathbb{R}^n \longrightarrow \mathbb{R},\ \textbf{x} \in \mathbb{R}^n,\ m \text{ memory, } \epsilon \text{ tolerance}$}
  \Output{${\bf x^*}\ \text{ending point},\ {\bf f(x^*)},\ {\bf \nabla f(x^*)}$}
  \BlankLine%
  $k = 0$ \\
  \While{$\norm{\nabla f_k} \geq \epsilon \norm{\nabla f_0}$} {
    \uIf{storage is empty}{
      $H_k^0 = I$
    }\uElse{
      $H_k^0 = \frac{\langle y_{k-1}, s_{k-1} \rangle}{\norm{y_{k-1}}^2} \cdot I$
    }
    Calculate $p_k = -H_k \nabla{f_k}$ with \hyperref[algo: L-BFGS Two-Loop Recursion]{\textbf{Algorithm 4}} \\
    Choose $\alpha_k$ satisfying the Armijo-Wolfe conditions or by exact line search \\
    $x_{k+1} = x_k + \alpha_k p_k$ \\
    $s_k = x_{k+1} - x_k$ \\
    $y_k = \nabla f_{k+1} - \nabla f_k$ \\
    $\mathit{curvature} = \langle y_k, s_k \rangle$ \\
    \uIf{$\mathit{curvature} \leq 10^{-16}$}{
      free the storage and restart from a gradient descent step
    }\uElse{
      $\rho_k = \mathit{curvature}^{-1}$ \\
      Discard the oldest triplet $\{s_{k-m}, y_{k-m}, \rho_{k-m}\}$ from storage \\
      Save $s_k, y_k, \rho_k$
    }
    $k = k + 1$
  }
  \Return$x_k$, $f_k$, $\nabla f_k$
\end{algorithm}

\begin{algorithm}[H]
  \SetAlgoLined%
  \caption{Limited Memory BFGS {-} Two-Loop Recursion}\label{algo: L-BFGS Two-Loop Recursion}
  $q = \nabla f_k$ \\
  \For{$i = (k - 1), \dots, (k - m)$}{
    $\alpha_i = \rho_i s_i^T q$ \\
    $q = q - \alpha_i y_i$ \\
  }
  $r = H_k^0 q$ \\
  \For{$i = (k - m), \dots, (k - 1)$}{
    $\beta = \rho_i y_i^T r$ \\
    $r = r + s_i\bigl(\alpha_i - \beta\bigr)$ \\
  }
  \Return$-r$
\end{algorithm}

In our implementation we keep the triplets $(s_k, y_k, \rho_k)$ in a circular buffer with capacity $m$ and the values of $\alpha_i$ of \hyperref[algo: L-BFGS Two-Loop Recursion]{Algorithm 4} in a stack, so that no explicit indices are needed. When the curvature term $\langle y_k, s_k \rangle$ is too small, we free the storage and restart with a gradient descent step. We prefer an exact line search over an inexact one to compute the step size, since for our problem its computational cost is lower.

\subsection*{Convergence}
To prove that the implemented method converges to the global minimum of the function we optimize, we follow~\cite{convergence_lbfgs} and state the following assumptions about our problem:
\begin{enumerate}
\item\label{algo: convergence1} $f \in C^2$
\item\label{algo: convergence2} The level set $\mathcal{L} = \{ x \in \mathbb{R}^n\ |\ f(x) \leq f(x_0) \}$ is convex
\item\label{algo: convergence3} $\exists\ M_1, M_2 \in \mathbb{R}^+$ such that
  \begin{equation*}
    M_1\norm{z}^2 \leq z^T G(x) z \leq M_2\norm{z}^2\label{eq:6}
  \end{equation*}
  $\forall z \in \mathbb{R}^n$ and $\forall x \in \mathcal{L}$
\end{enumerate}
Following the notation of~\cite{convergence_lbfgs}, we define:
\[ G(x) \coloneqq \nabla^{2}f(x) \]
\[ \bar{G}_k \coloneqq \int_0^1 G(x_k + \tau \alpha_k p_k)\, d\tau \]
From Taylor's theorem:
\begin{equation}\label{algo: definition y_k}
  y_k = \bar{G}_k \alpha_k p_k = \bar{G}_k s_k
\end{equation}
Assumption~\ref{algo: convergence1} holds for our problem by the definition of $f$. Assumption~\ref{algo: convergence2} is proved by \autoref{definitions: hessian tomography}. Assumption~\ref{algo: convergence3} is also a consequence of the fact that the Hessian of $f$ is constant.
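To make the last claim concrete, here is a short verification of ours (not taken from~\cite{convergence_lbfgs}), under the assumption that the constant Hessian $G \coloneqq \nabla^2 f$ is positive definite: the integral defining $\bar{G}_k$ collapses and Assumption~\ref{algo: convergence3} reduces to an eigenvalue bound,
\begin{gather*}
  \bar{G}_k = \int_0^1 G\, d\tau = G, \qquad y_k = G s_k, \\
  \lambda_{\min}(G)\norm{z}^2 \leq z^T G z \leq \lambda_{\max}(G)\norm{z}^2 \quad \forall z \in \mathbb{R}^n,
\end{gather*}
so one may take $M_1 = \lambda_{\min}(G)$ and $M_2 = \lambda_{\max}(G)$.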
% \[ z_k \coloneqq {\bar{G}_k}^{1/2} s_k \]
\begin{mtheo}
  Let $B_0$ be any symmetric positive definite initial matrix, and let $x_0$ be a starting point for which Assumptions~\ref{algo: convergence1},~\ref{algo: convergence2} and~\ref{algo: convergence3} hold. Then the sequence $\{x_k\}$ generated by the L-BFGS algorithm converges linearly to the minimizer $x^*$ of $f$.
\end{mtheo}
\begin{mproof}
  Using \autoref{algo: definition y_k} and Assumption~\ref{algo: convergence3}:
  \[ M_1 \norm{s_k}^2 \leq y_k^T s_k \leq M_2 \norm{s_k}^2 \]
  and:
  \[ \frac{\norm{y_k}^2}{y_k^T s_k} = \frac{s_k^T \bar{G}_k^2 s_k}{s_k^T \bar{G}_k s_k} \leq M_2 \]
  Both the trace and the determinant of the Hessian approximation $B_{k+1}$ can be bounded in terms of the trace and determinant of the starting matrix from which it is constructed:
  \begin{align*}
    \Tr(B_{k+1}) &\leq \Tr(B_k^{(0)}) + \Tilde{m} M_2 \leq M_3 \\
    \det(B_{k+1}) &= \det(B_k^{(0)}) \cdot \prod_{l=0}^{\Tilde{m}-1} \frac{y_l^T s_l}{s_l^T B_k^{(l)} s_l} \geq \det\bigl(B_k^{(0)}\bigr) {\left(\frac{M_1}{M_3}\right)}^{\Tilde{m}} \geq M_4
  \end{align*}
  where $\Tilde{m}$ is the memory size and $M_3$ and $M_4$ are chosen appropriately in $\mathbb{R}^+$. From these two bounds it follows that, for some constant $\delta > 0$:
  \[ \cos(\theta_k) = \frac{s_k^T B_k s_k}{\norm{s_k} \norm{B_k s_k}} \geq \delta \]
  With exact line search the Armijo condition $f(x_k + \alpha_k p_k) \leq f(x_k) + m_1 \alpha_k \nabla f(x_k)^T p_k$ is always satisfied, provided the constant $m_1$ is small enough not to exclude the exact minimizer along $p_k$, and the strong Wolfe condition $\lvert \nabla f(x_k + \alpha_k p_k)^T p_k \rvert \leq m_3 \lvert \nabla f(x_k)^T p_k \rvert$ is also always satisfied, since with exact line search $\nabla f(x_k + \alpha_k p_k)^T p_k = O(u)$. From the two conditions and Assumptions~\ref{algo: convergence1} and~\ref{algo: convergence2} it follows that:
  \begin{align*}
    & f(x_{k+1}) - f(x^*) \leq \bigl(1 - c \cos^2(\theta_k)\bigr) \bigl(f(x_k) - f(x^*)\bigr) \\
    \implies& f(x_k) - f(x^*) \leq {(1 - c \cdot \delta^2)}^k \bigl(f(x_0) - f(x^*)\bigr) \\
    \implies& f(x_k) - f(x^*) \leq r^k \bigl(f(x_0) - f(x^*)\bigr)
  \end{align*}
  for some $r \in [0, 1)$. Using Assumption~\ref{algo: convergence3}:
  \begin{gather*}
    \frac{1}{2} M_1 \norm{x_k - x^*}^2 \leq f(x_k) - f(x^*) \\
    \implies \norm{x_k - x^*} \leq r^{k/2} {\left( 2\, \frac{f(x_0) - f(x^*)}{M_1} \right)}^{1/2}
  \end{gather*}
  so the sequence $\{x_k\}$ converges linearly.
\end{mproof}
The implementation of L-BFGS that uses the Armijo-Wolfe line search also satisfies these assumptions, so it too converges linearly to $x^*$.

%%% Local Variables:
%%% mode: latex
%%% TeX-master: "../main"
%%% TeX-command-extra-options: "-shell-escape"
%%% End: