Added Project and Report
This commit is contained in:
BIN
Report/(2) - problem definition/images/conditioning.png
Normal file
BIN
Report/(2) - problem definition/images/conditioning.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 44 KiB |
177
Report/(2) - problem definition/problem definition.aux
Normal file
177
Report/(2) - problem definition/problem definition.aux
Normal file
@ -0,0 +1,177 @@
|
||||
\relax
|
||||
\providecommand\hyper@newdestlabel[2]{}
|
||||
\@writefile{toc}{\contentsline {chapter}{\numberline {2}Problem Definition}{2}{chapter.2}\protected@file@percent }
|
||||
\@writefile{lof}{\addvspace {10\p@ }}
|
||||
\@writefile{lot}{\addvspace {10\p@ }}
|
||||
\@writefile{loa}{\addvspace {10\p@ }}
|
||||
\newlabel{ch: problem definition}{{2}{2}{Problem Definition}{chapter.2}{}}
|
||||
\@writefile{toc}{\contentsline {section}{\numberline {2.1}QR}{2}{section.2.1}\protected@file@percent }
|
||||
\@writefile{toc}{\contentsline {section}{\numberline {2.2}L-BFGS}{3}{section.2.2}\protected@file@percent }
|
||||
\newlabel{ch: L-BFGS}{{2.2}{3}{L-BFGS}{section.2.2}{}}
|
||||
\newlabel{definitions: hessian tomography}{{2.2}{4}{L-BFGS}{equation.2.2.2}{}}
|
||||
\@writefile{toc}{\contentsline {section}{\numberline {2.3}Conditioning}{4}{section.2.3}\protected@file@percent }
|
||||
\newlabel{subsec:conditioning}{{2.3}{4}{Conditioning}{section.2.3}{}}
|
||||
\@writefile{lof}{\contentsline {figure}{\numberline {2.1}{\ignorespaces $\kappa (\hat {X})$ \textit {for different values of} $\lambda $}}{5}{figure.caption.2}\protected@file@percent }
|
||||
\providecommand*\caption@xref[2]{\@setref\relax\@undefined{#1}}
|
||||
\newlabel{fig:condition}{{2.1}{5}{$\kappa (\hat {X})$ \textit {for different values of} $\lambda $}{figure.caption.2}{}}
|
||||
\@setckpt{(2) - problem definition/problem definition}{
|
||||
\setcounter{page}{6}
|
||||
\setcounter{equation}{2}
|
||||
\setcounter{enumi}{0}
|
||||
\setcounter{enumii}{0}
|
||||
\setcounter{enumiii}{0}
|
||||
\setcounter{enumiv}{0}
|
||||
\setcounter{footnote}{0}
|
||||
\setcounter{mpfootnote}{0}
|
||||
\setcounter{part}{0}
|
||||
\setcounter{chapter}{2}
|
||||
\setcounter{section}{3}
|
||||
\setcounter{subsection}{0}
|
||||
\setcounter{subsubsection}{0}
|
||||
\setcounter{paragraph}{0}
|
||||
\setcounter{subparagraph}{0}
|
||||
\setcounter{figure}{1}
|
||||
\setcounter{table}{0}
|
||||
\setcounter{tabx@nest}{0}
|
||||
\setcounter{listtotal}{0}
|
||||
\setcounter{listcount}{0}
|
||||
\setcounter{liststart}{0}
|
||||
\setcounter{liststop}{0}
|
||||
\setcounter{citecount}{0}
|
||||
\setcounter{citetotal}{0}
|
||||
\setcounter{multicitecount}{0}
|
||||
\setcounter{multicitetotal}{0}
|
||||
\setcounter{instcount}{0}
|
||||
\setcounter{maxnames}{3}
|
||||
\setcounter{minnames}{3}
|
||||
\setcounter{maxitems}{3}
|
||||
\setcounter{minitems}{1}
|
||||
\setcounter{citecounter}{0}
|
||||
\setcounter{maxcitecounter}{0}
|
||||
\setcounter{savedcitecounter}{0}
|
||||
\setcounter{uniquelist}{0}
|
||||
\setcounter{uniquename}{0}
|
||||
\setcounter{refsection}{0}
|
||||
\setcounter{refsegment}{0}
|
||||
\setcounter{maxextratitle}{0}
|
||||
\setcounter{maxextratitleyear}{0}
|
||||
\setcounter{maxextraname}{0}
|
||||
\setcounter{maxextradate}{0}
|
||||
\setcounter{maxextraalpha}{0}
|
||||
\setcounter{abbrvpenalty}{50}
|
||||
\setcounter{highnamepenalty}{50}
|
||||
\setcounter{lownamepenalty}{25}
|
||||
\setcounter{maxparens}{3}
|
||||
\setcounter{parenlevel}{0}
|
||||
\setcounter{blx@maxsection}{0}
|
||||
\setcounter{mincomprange}{10}
|
||||
\setcounter{maxcomprange}{100000}
|
||||
\setcounter{mincompwidth}{1}
|
||||
\setcounter{afterword}{0}
|
||||
\setcounter{savedafterword}{0}
|
||||
\setcounter{annotator}{0}
|
||||
\setcounter{savedannotator}{0}
|
||||
\setcounter{author}{0}
|
||||
\setcounter{savedauthor}{0}
|
||||
\setcounter{bookauthor}{0}
|
||||
\setcounter{savedbookauthor}{0}
|
||||
\setcounter{commentator}{0}
|
||||
\setcounter{savedcommentator}{0}
|
||||
\setcounter{editor}{0}
|
||||
\setcounter{savededitor}{0}
|
||||
\setcounter{editora}{0}
|
||||
\setcounter{savededitora}{0}
|
||||
\setcounter{editorb}{0}
|
||||
\setcounter{savededitorb}{0}
|
||||
\setcounter{editorc}{0}
|
||||
\setcounter{savededitorc}{0}
|
||||
\setcounter{foreword}{0}
|
||||
\setcounter{savedforeword}{0}
|
||||
\setcounter{holder}{0}
|
||||
\setcounter{savedholder}{0}
|
||||
\setcounter{introduction}{0}
|
||||
\setcounter{savedintroduction}{0}
|
||||
\setcounter{namea}{0}
|
||||
\setcounter{savednamea}{0}
|
||||
\setcounter{nameb}{0}
|
||||
\setcounter{savednameb}{0}
|
||||
\setcounter{namec}{0}
|
||||
\setcounter{savednamec}{0}
|
||||
\setcounter{translator}{0}
|
||||
\setcounter{savedtranslator}{0}
|
||||
\setcounter{shortauthor}{0}
|
||||
\setcounter{savedshortauthor}{0}
|
||||
\setcounter{shorteditor}{0}
|
||||
\setcounter{savedshorteditor}{0}
|
||||
\setcounter{labelname}{0}
|
||||
\setcounter{savedlabelname}{0}
|
||||
\setcounter{institution}{0}
|
||||
\setcounter{savedinstitution}{0}
|
||||
\setcounter{lista}{0}
|
||||
\setcounter{savedlista}{0}
|
||||
\setcounter{listb}{0}
|
||||
\setcounter{savedlistb}{0}
|
||||
\setcounter{listc}{0}
|
||||
\setcounter{savedlistc}{0}
|
||||
\setcounter{listd}{0}
|
||||
\setcounter{savedlistd}{0}
|
||||
\setcounter{liste}{0}
|
||||
\setcounter{savedliste}{0}
|
||||
\setcounter{listf}{0}
|
||||
\setcounter{savedlistf}{0}
|
||||
\setcounter{location}{0}
|
||||
\setcounter{savedlocation}{0}
|
||||
\setcounter{organization}{0}
|
||||
\setcounter{savedorganization}{0}
|
||||
\setcounter{origlocation}{0}
|
||||
\setcounter{savedoriglocation}{0}
|
||||
\setcounter{origpublisher}{0}
|
||||
\setcounter{savedorigpublisher}{0}
|
||||
\setcounter{publisher}{0}
|
||||
\setcounter{savedpublisher}{0}
|
||||
\setcounter{language}{0}
|
||||
\setcounter{savedlanguage}{0}
|
||||
\setcounter{origlanguage}{0}
|
||||
\setcounter{savedoriglanguage}{0}
|
||||
\setcounter{pageref}{0}
|
||||
\setcounter{savedpageref}{0}
|
||||
\setcounter{textcitecount}{0}
|
||||
\setcounter{textcitetotal}{0}
|
||||
\setcounter{textcitemaxnames}{0}
|
||||
\setcounter{biburlbigbreakpenalty}{100}
|
||||
\setcounter{biburlbreakpenalty}{200}
|
||||
\setcounter{biburlnumpenalty}{0}
|
||||
\setcounter{biburlucpenalty}{0}
|
||||
\setcounter{biburllcpenalty}{0}
|
||||
\setcounter{smartand}{1}
|
||||
\setcounter{bbx:relatedcount}{0}
|
||||
\setcounter{bbx:relatedtotal}{0}
|
||||
\setcounter{cbx@tempcnta}{0}
|
||||
\setcounter{cbx@tempcntb}{0}
|
||||
\setcounter{cbx@tempcntc}{0}
|
||||
\setcounter{cbx@tempcntd}{0}
|
||||
\setcounter{float@type}{4}
|
||||
\setcounter{parentequation}{0}
|
||||
\setcounter{ALG@line}{0}
|
||||
\setcounter{ALG@rem}{0}
|
||||
\setcounter{ALG@nested}{0}
|
||||
\setcounter{ALG@Lnr}{2}
|
||||
\setcounter{ALG@blocknr}{10}
|
||||
\setcounter{ALG@storecount}{0}
|
||||
\setcounter{ALG@tmpcounter}{0}
|
||||
\setcounter{thmt@dummyctr}{0}
|
||||
\setcounter{nlinenum}{0}
|
||||
\setcounter{caption@flags}{2}
|
||||
\setcounter{continuedfloat}{0}
|
||||
\setcounter{subfigure}{0}
|
||||
\setcounter{subtable}{0}
|
||||
\setcounter{section@level}{0}
|
||||
\setcounter{Item}{0}
|
||||
\setcounter{Hfootnote}{0}
|
||||
\setcounter{bookmark@seq@number}{0}
|
||||
\setcounter{AlgoLine}{0}
|
||||
\setcounter{algocfline}{0}
|
||||
\setcounter{algocfproc}{0}
|
||||
\setcounter{algocf}{0}
|
||||
\setcounter{mlemma}{0}
|
||||
}
|
||||
147
Report/(2) - problem definition/problem definition.tex
Normal file
147
Report/(2) - problem definition/problem definition.tex
Normal file
@ -0,0 +1,147 @@
|
||||
\chapter{Problem Definition}\label{ch: problem definition}
|
||||
Henceforth, we denote the $2$-norm $\norm{\,\cdot\,}_2$ by the generic norm symbol $\norm{\,\cdot\,}$.\newline
|
||||
Given $\hat{X} \in \mathbb{R}^{(m + n) \times m},\ \hat{y} \in \mathbb{R}^{m + n},\ $ we want to find
|
||||
\[\min_{w}\ \norm{\hat{X}w-\hat{y}}\]
|
||||
|
||||
\section{QR}
|
||||
By performing a QR factorization on $\hat{X}$ we can reformulate the problem as follows:
|
||||
\[
|
||||
\min_{w}\ \norm{\hat{X}w - \hat{y}} = \min_{w}\ \norm{\vphantom{\hat{X}}QRw - \hat{y}}
|
||||
\]
|
||||
with $Q \in \mathbb{R}^{(m + n) \times (m + n)}$ being an orthogonal matrix and $R \in \mathbb{R}^{(m + n) \times m}$ being an upper triangular matrix. Knowing that $R_{ij} = 0,\ \ \forall i > j,\ i = 1, \ldots, m + n,\ j = 1, \ldots, m,\ $ we can write
|
||||
\begin{equation*}
|
||||
\begin{aligned}
|
||||
&R =
|
||||
\begin{bmatrix}
|
||||
R_0 \\
|
||||
0
|
||||
\end{bmatrix},\
|
||||
&R_0 \in \mathbb{R}^{m \times m} \\
|
||||
&Q =
|
||||
\begin{bmatrix}
|
||||
Q_0\ Q_c
|
||||
\end{bmatrix},\
|
||||
&Q_0 \in \mathbb{R}^{(m+n) \times m},\ &Q_c \in \mathbb{R}^{(m+n) \times n}
|
||||
\end{aligned}
|
||||
\end{equation*}
|
||||
Since orthogonal matrices preserve norm-2, we have:
|
||||
\begin{equation*}
|
||||
\begin{aligned}
|
||||
&\min_{w}\ \norm{QRw - \hat{y}} = \min_{w}\ \norm{Q^T(QRw - \hat{y})} = \\
|
||||
&\min_{w}\ \norm{Q^{T}QRw - Q^T\hat{y}} = \\
|
||||
&\min_{w}\ \norm{Rw - Q^T\hat{y}} = \\
|
||||
&\min_{w}\ \norm{
|
||||
\begin{bmatrix}
|
||||
R_0 \\
|
||||
0
|
||||
\end{bmatrix}
|
||||
w -
|
||||
\begin{bmatrix}
|
||||
Q^T_0 \\
|
||||
Q^T_c
|
||||
\end{bmatrix}\hat{y}} = \\
|
||||
& \min_{w}\ \norm{
|
||||
\begin{bmatrix}
|
||||
R_0w - Q^T_0\hat{y} \\
|
||||
- Q^T_c\hat{y}
|
||||
\end{bmatrix}}
|
||||
\end{aligned}
|
||||
\end{equation*}
|
||||
The entries of the second block $- Q^T_c\hat{y}$ do not depend on $w$, meaning that they will appear in the norm independently from $w$. Thus, we can simplify the problem and solve the triangular system
|
||||
\begin{equation*}
|
||||
R_0w - Q^T_0\hat{y} = 0 \iff R_0w = Q^T_0\hat{y}
|
||||
\end{equation*}
|
||||
provided that $R_0$ is invertible.
|
||||
\begin{center}
|
||||
$R_0$ is invertible $\iff \hat{X}$ has full column rank $\iff \hat{X}^T\hat{X} \succ 0$.
|
||||
\end{center}
|
||||
$R_0$ is invertible and the triangular system can be solved via backsubstitution. This claim is proved in \hyperref[proofs: fullcolumn]{the last section}.
|
||||
|
||||
\section{L-BFGS}\label{ch: L-BFGS}
|
||||
|
||||
We can define
|
||||
\begin{equation}
|
||||
\begin{aligned}
|
||||
g(w) = {f(w)}^2 = \norm{\hat{X}w-\hat{y}}^2
|
||||
\end{aligned}
|
||||
\end{equation}
|
||||
and reformulate the problem equivalently in terms of $g(w)$, since squaring is monotonic on the nonnegative values attained by $f$, so both problems share the same minimizers.
|
||||
\begin{equation*}
|
||||
\begin{aligned}
|
||||
\min_{w}\ g(w) = \min_{w}\ \norm{\hat{X}w-\hat{y}}^2 = \min_{w}\ {\bigl(\hat{X}w - \hat{y}\bigr)}^T\bigl(\hat{X}w - \hat{y}\bigr)
|
||||
\end{aligned}
|
||||
\end{equation*}
|
||||
The gradient of $g$ with respect to $w$ is
|
||||
\begin{equation*}
|
||||
\begin{aligned}
|
||||
\nabla g(w) = 2\hat{X}^T\bigl(\hat{X}w - \hat{y}\bigr)
|
||||
\end{aligned}
|
||||
\end{equation*}
|
||||
|
||||
Likewise the gradient of $f(w)$ is as follows:
|
||||
\begin{equation*}
|
||||
\nabla f(w) = \frac{1}{\norm{\hat{X} w - \hat{y}}} \hat{X}^T\bigl(\hat{X}w - \hat{y}\bigr)
|
||||
\end{equation*}
|
||||
but it yields much worse performance in practice, since $f$, unlike $g$, is not quadratic.
|
||||
|
||||
The function is L-smooth since $\forall w, w' \in \mathbb{R}^m,\ \text{with } w \neq w'$:
|
||||
|
||||
\vspace{6pt}
|
||||
|
||||
\begin{tblr}{colspec={crl}, colsep={0pt}}
|
||||
& \(\norm{\nabla g(w) - \nabla g(w')}\) &\(\ \leq L \norm{w - w'}\)\\
|
||||
\(\iff\) & \(\norm{2\hat{X}^T(\hat{X}w - \hat{y}) - 2\hat{X}^T (\hat{X} w' -\hat{y})}\) & \(\ \leq L \norm{w - w'}\) \\
|
||||
\(\iff\) & \(2\norm{\hat{X}^T \hat{X} (w-w')}\) & \(\ \leq L \norm{w - w'}\) \\
|
||||
\(\Longleftarrow\) & \(2\norm{\hat{X}^T \hat{X}} \norm{w-w'}\) & \(\ \leq L \norm{w - w'}\) \\
|
||||
\(\iff\) & \(2\norm{\hat{X}^T \hat{X}}\) & \(\ \leq L\ \)
|
||||
\end{tblr}
|
||||
|
||||
\vspace{6pt}
|
||||
|
||||
The function $g$ is also strongly convex since \( \nabla^2g(w) = \hat{X}^T \hat{X} \succ 0\).
|
||||
|
||||
The tomography of $g(w)$ with respect to the direction $p$ is:
|
||||
\begin{align}
|
||||
\phi(\alpha)&={(\hat{X}(w+\alpha p) - \hat{y})}^T \cdot (\hat{X}(w+\alpha p) - \hat{y}) \notag\\
|
||||
\frac{d \phi(\alpha)}{d \alpha} &= 2 w^T \hat{X}^T \hat{X} p - 2 \hat{y}^T \hat{X} p + 2 \alpha p^T \hat{X}^T \hat{X} p \notag\\
|
||||
\frac{d^2 \phi(\alpha)}{d \alpha^2} &= 2 p^T \hat{X}^T \hat{X} p \label{definitions: hessian tomography}
|
||||
\end{align}
|
||||
|
||||
Since $\frac{d^2 \phi(\alpha)}{d \alpha^2}$ is constant, the tomography is simply a parabola and since $\hat{X}^T \hat{X}$ is positive definite, the dot product $\langle p, p \rangle_{\hat{X}^T \hat{X}}$ is always positive and the parabola always has a minimum. The minimum is found by setting $\frac{d \phi(\alpha)}{d \alpha} = 0$ and solving for $\alpha$:
|
||||
|
||||
\[ \alpha_{\min} = \frac{\hat{y}^T \hat{X} p - w^T \hat{X}^T \hat{X} p}{p^T \hat{X}^T \hat{X} p} \]
|
||||
|
||||
\section{Conditioning}\label{subsec:conditioning}
|
||||
|
||||
We check the condition number $\kappa(\hat{X})$ when the regularization term $\lambda > 0$ varies.
|
||||
\[
|
||||
\kappa(\hat{X}) = \norm{\hat{X}} \norm{\hat{X}^{+}} = \frac{\sigma_1}{\sigma_m} = \sqrt{\frac{\lambda_{\max}}{\lambda_{\min}}}
|
||||
\]
|
||||
with $\sigma_1, \sigma_m$ being respectively the largest and smallest singular values of $\hat{X}$ and $\lambda_{\max}, \lambda_{\min}$ being the largest and smallest eigenvalues of $\hat{X}^T\hat{X}$.\\
|
||||
Knowing that $\hat{X}^T\hat{X} = XX^T + \lambda^2I_m$, we have that
|
||||
\begin{center}
|
||||
\begin{tblr}{colspec={c}, colsep={0pt}, column{1} = {mode = math}}
|
||||
\lambda_{\max} = \lambda_1 + \lambda^2 \\
|
||||
\lambda_{\min} = \lambda_m + \lambda^2 \\
|
||||
\end{tblr}
|
||||
\end{center}
|
||||
with $\lambda_1, \lambda_m$ being the largest and smallest eigenvalues of $XX^T$, which are translated by $\lambda^2$ as a result of adding $\lambda^2I_m$ (\autoref{proof:eigenvalues_translation})\\
|
||||
In \autoref{proofs: eigenvalues} we show that $\lambda_m = 0$ and conclude that $\kappa(\hat{X})$ scales linearly with $\frac{1}{\lambda}$:
|
||||
\[
|
||||
\kappa(\hat{X}) = \sqrt{\frac{\lambda_{\max}}{\lambda_{\min}}} = \sqrt{\frac{\lambda_{1} + \lambda^2}{\lambda_{m} + \lambda^2}} = \frac{\sqrt{\lambda_{1} + \lambda^2}}{{\sqrt{\lambda^2}}} = \frac{\sqrt{\lambda_{1} + \lambda^2}}{\lambda}
|
||||
\]
|
||||
if $\lambda_1 > 0$.
|
||||
|
||||
For $\lambda$ close to zero we have $\frac{\sqrt{\lambda_{1} + \lambda^2}}{\lambda} = O\left(\frac{1}{\lambda}\right)$.
|
||||
This property is witnessed in \autoref{fig:condition}, which is in logarithmic scale:
|
||||
\begin{figure}[htbp]
|
||||
\centering
|
||||
\includegraphics[width=0.7\linewidth]{(2) - problem definition/images/conditioning.png} % chktex 8
|
||||
\caption{$\kappa(\hat{X})$ \textit{for different values of} $\lambda$}\label{fig:condition}
|
||||
\end{figure}
|
||||
|
||||
%%% Local Variables:
|
||||
%%% mode: latex
|
||||
%%% TeX-master: "../main"
|
||||
%%% TeX-command-extra-options: "-shell-escape"
|
||||
%%% End:
|
||||
Reference in New Issue
Block a user