% chktex-file 9 chktex-file 17
\chapter{Algorithms}\label{ch: algorithms}
\section{QR}
The algorithm is implemented for a general input matrix $A \in \mathbb{R}^{m \times n}$, where $m$ may differ from $n$: the matrix can be rectangular \textit{horizontally} ($m < n$) or \textit{vertically} ($m > n$). In this version we store in a dedicated data structure a matrix $\Upsilon \in \mathbb{R}^{m \times n}$ of the following form ($m > n$ in this example):

\begin{equation*}
    \Upsilon = {(\upsilon_{i,j})}_{i,j} = \begin{tikzpicture}[baseline=-1ex]
        \matrix[%
            matrix of math nodes,
            nodes in empty cells,
            left delimiter={[},right delimiter={]},
            inner xsep=2pt, column sep=6pt,
        ] (m)
        {%
            \vphantom{1} & * & \cdots & * \\
            & \vphantom{1} & \ddots & \vdots \\
            & & \vphantom{1} & * \\
            & & & \vphantom{1} \\
            u_1 & u_2 & \cdots & u_n \\
            \vphantom{1} & \vphantom{1} & \vphantom{1} & \vphantom{1} \\
        };
        \node[rectangle, draw, fit={(m-1-1) (m-6-1)}, inner sep=-1.5pt, text width=22pt] {};
        \node[rectangle, draw, fit={(m-2-2) (m-6-2)}, inner sep=-1.5pt, text width=22pt] {};
        \node[rectangle, draw, fit={(m-3-3) (m-6-3)}, inner sep=-1.5pt, text width=22pt] {};
        \node[rectangle, draw, fit={(m-4-4) (m-6-4)}, inner sep=-1.5pt, text width=22pt] {};
    \end{tikzpicture}
\end{equation*}

\begin{center}
$u_k \in \mathbb{R}^{m - k + 1},\ 1 \leq k \leq n$
\end{center}
and the values of the diagonal of $R$ in a vector $d \in \mathbb{R}^{n}$. The $*$ entries are elements of the upper triangular factor computed during the QR factorization, yielded by line 6 of \hyperref[algo: thinQR]{Algorithm 1}. In this way we can lazily perform the products $Qy$ and $Q^T y$ by means of the stored Householder vectors $u_1, \dots, u_n$. On the other hand, to compute a product between the upper triangular part of $\Upsilon$ and an input vector we reconstruct the upper triangular matrix by taking the elements $\upsilon_{i,j}$ with $j > i$ and attaching the vector $d$ as the diagonal of the resulting matrix. The zeros of the matrix $R$ are ignored.
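
As a purely illustrative sketch (not the code used in this project), the lazy products and the reconstruction of $R$ can be written in a few lines of NumPy. The helper names \texttt{apply\_Qt}, \texttt{apply\_Q} and \texttt{upper\_R} are hypothetical; they assume 0-based indexing, with column $k$ of \texttt{Upsilon} holding $u_k$ in its last $m - k$ entries and \texttt{d} holding the diagonal of $R$:

\begin{verbatim}
import numpy as np

def apply_Qt(Upsilon, y):
    """Compute Q^T y by applying the stored reflectors in forward order."""
    y = y.astype(float)              # work on a copy
    m, n = Upsilon.shape
    for k in range(min(m, n)):
        u = Upsilon[k:, k]
        y[k:] -= 2.0 * u * (u @ y[k:])
    return y

def apply_Q(Upsilon, y):
    """Compute Q y by applying the stored reflectors in reverse order."""
    y = y.astype(float)
    m, n = Upsilon.shape
    for k in reversed(range(min(m, n))):
        u = Upsilon[k:, k]
        y[k:] -= 2.0 * u * (u @ y[k:])
    return y

def upper_R(Upsilon, d):
    """Rebuild the triangular factor: strict upper part of Upsilon plus diag(d)."""
    n = len(d)
    return np.triu(Upsilon[:n, :n], 1) + np.diag(d)
\end{verbatim}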

\begin{algorithm}[H]
    \SetAlgoLined%
    \caption{Thin QR}\label{algo: thinQR}
    \SetKwInOut{Input}{Input}
    \SetKwInOut{Output}{Output}

    \BlankLine%
    \Input{$A \in \mathbb{R}^{m \times n}$}
    \Output{$Q \in \mathbb{R}^{m \times m},\ R \in \mathbb{R}^{m \times n}$ implicit $QR$ factorization of $A$}

    \BlankLine%
    $\Upsilon = copy(A)$ \\
    $d = zeros(\min(m, n))$ \\

    \For{
        $k \in 1 \dots \min(m, n)$
    }{
        $u_k, s_k = householder\_vector(\Upsilon[k:m, k])$\\
        $d_k = s_k$ \\
        $\Upsilon[k:m, k+1:n] = \Upsilon[k:m, k+1:n] - 2u_k(u_k^T \Upsilon[k:m, k+1:n])$\\
        $\Upsilon[k:m, k] = u_k$
    }

    \Return$\Upsilon, d$
\end{algorithm}

\begin{algorithm}[H]
    \SetAlgoLined%
    \caption{householder\_vector}\label{algo: householder_vector}
    \SetKwInOut{Input}{Input}
    \SetKwInOut{Output}{Output}

    \BlankLine%
    \Input{$x \in \mathbb{R}^d$}
    \Output{$u \in \mathbb{R}^{d},\ s \in \mathbb{R}$ Householder vector of $x$}

    \BlankLine%
    $s = \norm{x}$ \\
    \If{$x_1 \geq 0$}{
        $s = -s$
    }
    $u = copy(x)$ \\
    $u_1 = u_1 - s$ \\
    $u = u\ / \norm{u}$ \\

    \Return$u, s$
\end{algorithm}
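
For reference, a minimal NumPy sketch of Algorithms 1 and 2 (again an illustration under the same 0-based indexing assumption, not the project's implementation) could look as follows:

\begin{verbatim}
import numpy as np

def householder_vector(x):
    """Algorithm 2: return (u, s), u the normalized Householder vector of x."""
    s = np.linalg.norm(x)
    if x[0] >= 0:
        s = -s
    u = x.astype(float)                # copy of x
    u[0] -= s                          # no cancellation thanks to the sign choice
    return u / np.linalg.norm(u), s

def thin_qr(A):
    """Algorithm 1: return (Upsilon, d), the implicit QR factorization of A."""
    Upsilon = A.astype(float)          # copy of A
    m, n = Upsilon.shape
    d = np.zeros(min(m, n))
    for k in range(min(m, n)):
        u, s = householder_vector(Upsilon[k:, k])
        d[k] = s
        Upsilon[k:, k + 1:] -= 2.0 * np.outer(u, u @ Upsilon[k:, k + 1:])
        Upsilon[k:, k] = u
    return Upsilon, d
\end{verbatim}

Applying the stored reflectors to an input vector and attaching \texttt{d} as the diagonal of the strict upper part of \texttt{Upsilon}, as in the previous sketch, reproduces $A = QR$ up to rounding error.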

We assume $m > n$, as the case $n > m$ is analogous for the complexity analysis. The time complexity of the algorithm is $\Theta\bigl(mn^2 \bigr) \approx \Theta\bigl(n^3 \bigr)$, because $m \approx n$ in (P). We will see in the \hyperref[ch: experiments]{Experiments chapter} that, as expected, the running time scales linearly with $m$, the size of $\hat{X}$.
\newpage
\section{L-BFGS}

We follow the notation of \textit{Numerical Optimization}~\cite{Numerical-Optimization-2006} and define $f_k = f(x_k)$.

\begin{algorithm}[H]
    \SetAlgoLined%
    \caption{Limited Memory BFGS}\label{algo: L-BFGS}
    \SetKwInOut{Input}{Input}
    \SetKwInOut{Output}{Output}

    \BlankLine%
    \Input{$\textbf{f}: \mathbb{R}^n \longrightarrow \mathbb{R},\ \textbf{x} \in \mathbb{R}^n,\ m \text{ memory, } \epsilon \text{ tolerance}$}
    \Output{${\bf x^*}\ \text{ending point},\ {\bf f(x^*)},\ {\bf \nabla f(x^*)}$}

    \BlankLine%
    $k = 0$ \\
    \While{$\norm{\nabla f_k} \geq \epsilon \norm{\nabla f_0}$} {
        \uIf{storage is empty}{
            $H_k^0 = I$
        }\uElse{
            $H_k^0 = \frac{\langle y_{k-1}, s_{k-1} \rangle}{\norm{y_{k-1}}^2} \cdot I$
        }
        Calculate $p_k = -H_k \nabla f_k$ with \hyperref[algo: L-BFGS Two-Loop Recursion]{\textbf{Algorithm 4}} \\
        Choose $\alpha_k$ satisfying the Armijo--Wolfe conditions or with exact line search \\
        $x_{k+1} = x_k + \alpha_k p_k$ \\
        $s_k = x_{k+1} - x_k$ \\
        $y_k = \nabla f_{k+1} - \nabla f_k$ \\
        $curvature = \langle y_k, s_k \rangle$ \\
        $\rho_k = curvature^{-1}$ \\
        \uIf{$curvature \leq 10^{-16}$}{
            free the storage and start again from gradient descent
        }\uElse{
            Discard the oldest triplet $\{s_{k-m}, y_{k-m}, \rho_{k-m}\}$ from storage \\
            Save $s_k, y_k, \rho_k$
        }

        $k = k + 1$
    }
    \Return$x_k$, $f_k$, $\nabla f_k$
\end{algorithm}
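
As a quick cross-check of the implementation (not a substitute for it), the same minimization can be run with SciPy's reference L-BFGS-B routine; note that its stopping rule is an absolute gradient tolerance, so it only roughly matches the relative criterion used above. Here \texttt{f}, \texttt{grad\_f}, \texttt{x0}, \texttt{m} and \texttt{eps} are hypothetical names for the objective, its gradient, the starting point, the memory and the tolerance:

\begin{verbatim}
from scipy.optimize import minimize

# Reference run with SciPy's L-BFGS-B for comparison purposes only.
res = minimize(f, x0, jac=grad_f, method="L-BFGS-B",
               options={"maxcor": m, "gtol": eps})
x_star, f_star, g_star = res.x, res.fun, res.jac
\end{verbatim}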
\begin{algorithm}[H]
    \SetAlgoLined%
    \caption{Limited Memory BFGS {-} Two-Loop Recursion}\label{algo: L-BFGS Two-Loop Recursion}

    $q = \nabla f_k$ \\
    \For{$i = (k - 1), \dots, (k - m)$}{
        $\alpha_i = \rho_i s_i^T q$ \\
        $q = q - \alpha_i y_i$ \\
    }

    $r = H_k^0 q$ \\
    \For{$i = (k - m), \dots, (k - 1)$}{
        $\beta = \rho_i y_i^T r$ \\
        $r = r + s_i\bigl(\alpha_i - \beta\bigr)$ \\
    }

    \Return$-r$

\end{algorithm}
In our implementation we keep the triplets $(s_k, y_k, \rho_k)$ in a circular buffer with capacity $m$ and the values of $\alpha_i$ used in \hyperref[algo: L-BFGS Two-Loop Recursion]{Algorithm 4} in a stack, so that no explicit indices are needed.
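
As an illustrative sketch of this storage scheme (not the report's actual code), a Python \texttt{deque} with \texttt{maxlen} can play the role of the circular buffer, while a plain list acts as the stack of $\alpha_i$ values; the function name \texttt{two\_loop\_direction} and the scaling argument \texttt{gamma} (corresponding to $H_k^0 = \gamma I$) are hypothetical:

\begin{verbatim}
from collections import deque
import numpy as np

def two_loop_direction(grad, storage, gamma):
    """Algorithm 4: return -H_k grad from the stored (s, y, rho) triplets."""
    q = grad.astype(float)               # work on a copy
    alphas = []                          # stack of alpha_i values
    for s, y, rho in reversed(storage):  # i = k-1, ..., k-m (newest first)
        alpha = rho * (s @ q)
        alphas.append(alpha)
        q -= alpha * y
    r = gamma * q                        # r = H_k^0 q
    for s, y, rho in storage:            # i = k-m, ..., k-1 (oldest first)
        beta = rho * (y @ r)
        r += s * (alphas.pop() - beta)
    return -r

# Circular buffer: a deque with maxlen=m drops the oldest triplet on append.
# storage = deque(maxlen=m)
# storage.append((s_k, y_k, 1.0 / curvature))
\end{verbatim}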
In case the curvature of the function is too small, we free the storage and restart with a gradient step.

We prefer an exact line search over an inexact one to compute the step size, since for our problem its computational cost is lower.
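
For instance, under the assumption (made here only for illustration) that the objective is a quadratic $f(x) = \frac{1}{2} x^T Q x + q^T x$ with $Q$ positive definite, the exact step along a descent direction $p_k$ has the closed form
\[ \alpha_k = \arg\min_{\alpha} f(x_k + \alpha p_k) = -\frac{\nabla f(x_k)^T p_k}{p_k^T Q p_k}, \]
which costs a single Hessian-vector product per iteration.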

\subsection*{Convergence}
To prove that the implemented method converges to the global minimum of the function we have to optimize, we follow~\cite{convergence_lbfgs} and state the following assumptions about our problem:
\begin{enumerate}
    \item\label{algo: convergence1} $f \in C^2$
    \item\label{algo: convergence2} The level set $\mathcal{L} = \{ x \in \mathbb{R}^n\ |\ f(x) \leq f(x_0) \}$ is convex
    \item\label{algo: convergence3} $\exists\ M_1, M_2 \in \mathbb{R}^+$ such that
          \begin{equation*}
              M_1\norm{z}^2 \leq z^T G(x) z \leq M_2\norm{z}^2\label{eq:6}
          \end{equation*}
          $\forall z \in \mathbb{R}^n$ and $\forall x \in \mathcal{L}$
\end{enumerate}

We follow the publication's notation and define:

\[ G(x) \coloneqq \nabla^{2}f(x) \]
\[ \bar{G}_k \coloneqq \int_0^1 G(x_k + \tau \alpha_k p_k)\, d\tau \]

From Taylor's theorem:

\begin{equation}\label{algo: definition y_k}
    y_k = \bar{G}_k \alpha_k p_k = \bar{G}_k s_k
\end{equation}

The first assumption follows directly from the definition of our objective. The second assumption is proved by \autoref{definitions: hessian tomography}. The third assumption is also a consequence of the fact that the Hessian of $f$ is constant.
% \[ z_k \coloneqq {\bar{G}_k}^{1/2} s_k \]

\begin{mtheo}
    Let $B_0$ be any symmetric positive definite initial matrix, and let $x_0$ be a starting point for which Assumptions~\ref{algo: convergence1},~\ref{algo: convergence2} and~\ref{algo: convergence3} hold. Then the sequence $\{x_k\}$ generated by the L-BFGS algorithm converges linearly to the minimizer $x^*$ of $f$.
\end{mtheo}

\begin{mproof}
    Using \autoref{algo: definition y_k} and Assumption~\ref{algo: convergence3}:
    \[ M_1 \norm{s_k}^2 \leq y_k^T s_k \leq M_2 \norm{s_k}^2 \]
    and:
    \[ \frac{\norm{y_k}^2}{y_k^T s_k} = \frac{s_k^T \bar{G}_k^2 s_k}{s_k^T \bar{G}_k s_k} \leq M_2 \]
    Both the trace and the determinant of $B_{k+1}$ can be bounded in terms of the trace and determinant of the initial matrix $B_k^{(0)}$ from which the Hessian approximation is constructed:
    \begin{align*}
        \Tr(B_{k+1})  &\leq \Tr(B_k^{(0)}) + \Tilde{m} M_2 \leq M_3 \\
        \det(B_{k+1}) &= \det(B_k^{(0)}) \cdot \prod_{l=0}^{\Tilde{m}-1} \frac{y_l^T s_l}{s_l^T B_k^{(l)} s_l} \geq \det\left(B_k^{(0)}\right) {\left(\frac{M_1}{M_3}\right)}^{\Tilde{m}} \geq M_4
    \end{align*}
    where $\Tilde{m}$ is the memory size and $M_3$ and $M_4$ are appropriately chosen constants in $\mathbb{R}^+$.

    From these two bounds we have that for some constant $\delta > 0$:

    \[ \cos(\theta_k) = \frac{s_k^T B_k s_k}{\norm{s_k} \norm{B_k s_k}} \geq \delta \]
    With an exact line search the Armijo condition $f(x_k + \alpha_k p_k) \leq f(x_k) + m_1 \alpha_k \nabla f(x_k)^T p_k$ is always satisfied, provided the constant $m_1$ does not exclude the minimum $x_*$, and the strong Wolfe condition $\norm{\nabla f(x_k + \alpha_k p_k)} \leq m_3 \norm{\nabla f(x_k)}$ is always satisfied as well, since $\norm{\nabla f(x_k + \alpha_k p_k)} = O(u)$. From these two conditions and Assumptions~\ref{algo: convergence1} and~\ref{algo: convergence2} it follows that:
    \begin{align*}
                 & f(x_{k+1}) - f(x_*) \leq \bigl(1 - c \cos^2(\theta_k)\bigr) (f(x_k) - f(x_*)) \\
        \implies & f(x_k) - f(x_*) \leq {(1 - c \cdot \delta^2)}^k (f(x_0) - f(x_*)) \\
        \implies & f(x_k) - f(x_*) \leq r^k (f(x_0) - f(x_*))
    \end{align*}
    for some $r \in [0, 1)$. Using Assumption~\ref{algo: convergence3}:
    \begin{gather*}
        \frac{1}{2} M_1 \norm{x_k - x_*}^2 \leq f(x_k) - f(x_*) \\
        \implies \norm{x_k - x_*} \leq r^{k/2} {\left( 2\, \frac{f(x_0) - f(x_*)}{M_1} \right)}^{1/2}
    \end{gather*}
    so the sequence $\{x_k\}$ is linearly convergent.
\end{mproof}

The implementation of L-BFGS that uses the Armijo--Wolfe line search also satisfies these assumptions, so it too converges linearly to $x_*$.
%%% Local Variables:
%%% mode: latex
%%% TeX-master: "../main"
%%% TeX-command-extra-options: "-shell-escape"
%%% End: