stencilparallelpattern/report/document.tex

\documentclass[12pt, oneside]{article}

%% - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - %%
%%                               Load Packages                               %%
%% - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - %%

\usepackage[
  top=2cm,
  bottom=2cm,
  left=2cm,
  right=2cm,
  headheight=20pt,
  centering
]{geometry}
\geometry{a4paper}

\usepackage[utf8]{inputenc} %% use UTF-8, maybe not needed since 2018
\usepackage[italian,main=english]{babel} %% language

\pagestyle{headings}

\usepackage{scrlayer-scrpage}
\usepackage{csquotes} %% correct language also for citations

\ifoot[]{}
\cfoot[]{}
\ofoot[\pagemark]{\pagemark}
\pagestyle{scrplain}

\usepackage[
  backend=biber,
  style=numeric,
  sorting=ynt
]{biblatex} %% for citations
\addbibresource{document.bib}

\usepackage{import} %% specify path for import

%% math packages
\usepackage{graphicx} %% for pictures
\usepackage{float}
\usepackage{amssymb} %% math symbols
\usepackage{amsmath} %% math matrix etc
\usepackage{minted} %% code block
\usepackage{tabularray} %% better tables
\usepackage{booktabs} %% rules for tables
\usepackage{mathrsfs}
\usepackage{mathtools}
\usepackage{algorithm} %% for algorithms
\usepackage{algpseudocode} %% loads algorithmicx
\usepackage{amsthm}
\usepackage{thmtools} %% theorems

%% plot packages
\usepackage{pgfplots} %% plots used with \begin{tikzpicture}
\usepackage{tikz} %% for pictures
\usetikzlibrary{trees}
\pgfplotsset{width=10cm,compat=newest}

%% design packages
\usepackage{enumitem} %% for lists and enumerating
\usepackage{color}
\usepackage{xcolor,colortbl} % xcolor for defining colors, colortbl for table colors
\usepackage{makecell} %% for multiple lines in cell of table
\usepackage{cancel}
\usepackage{pgfornament} %% ornaments

%% load last
\usepackage[hidelinks]{hyperref} %% links for table of contents, load last
\usepackage{bookmark} %% for better table of contents


%% - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - %%
%%                       Configuration of the packages                       %%
%% - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - %%

%% line spacing factor (1 = single spacing)
\linespread{1}
\raggedbottom %% do not stretch vertical space to fill short pages % chktex 1

%% list headings down to level 3 (subsubsection) in the table of contents
\setcounter{tocdepth}{3}
%% number headings down to level 3 (subsubsection)
\setcounter{secnumdepth}{3}

%% use bar instead of arrow for vectors
\renewcommand{\vec}[1]{\bar{#1}}
%% easy norm: double bars that scale with the argument
%% usage: \norm{x}
\newcommand{\norm}[1]{\left\lVert#1\right\rVert}

% argmin and argmax (starred form: limits go below the operator in displays)
\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator*{\argmin}{arg\,min}

%% compact itemize: less vertical space between items
%% (the unmodified list stays available as the olditemize environment)
%% NOTE(review): the new begin code takes no argument, so an optional
%% argument written after \begin{itemize} is not handed to the original list
\let\olditemize\itemize
\let\endolditemize\enditemize
\renewenvironment{itemize}
  {\olditemize\setlength{\itemsep}{-0.2em}}
  {\endolditemize}

%% items in itemize emph+box
%% usage: \ieb{Class:} for simple item
%%        \ieb[4cm]{Class:} for specific size of box
%%   #1 (optional, default 2cm)  width of the box
%%   #2                          label, emphasised and left-aligned in the box
%% the line ends inside the definition are commented out so that they do
%% not turn into spaces before and after the box
\newcommand{\ieb}[2][2cm]{%
        \makebox[#1][l]{\emph{#2}}%
} %% TODO: replace with description environment (? maybe)

% less vertical space around align & align*
% usage: call \zerodisplayskips where the tighter spacing is wanted;
%        it sets the four display skips to 0pt
% the line ends are commented out so the macro inserts no spaces when it
% is called in the middle of a paragraph
\newcommand{\zerodisplayskips}{%
  \setlength{\abovedisplayskip}{0pt}%
  \setlength{\belowdisplayskip}{0pt}%
  \setlength{\abovedisplayshortskip}{0pt}%
  \setlength{\belowdisplayshortskip}{0pt}%
}

% make dotfill use all the space available
% leaders made of 1em wide boxes, each with a centred dot, stretched by \hfill
% NOTE(review): the final \kern0pt presumably keeps the leaders from being
% discarded at the end of a line --- confirm
% the % after the opening brace stops the line end from adding a space
% in front of the dots
\renewcommand{\dotfill}{%
  \leavevmode\cleaders\hbox to 1.00em{\hss .\hss }\hfill\kern0pt } % chktex 1 chktex 26

%% padding of framed boxes set to minus the rule width, so that padding
%% and rule cancel out
%% NOTE(review): this is a global setting marked as a debugging aid; it
%% affects every \fbox in the document --- confirm it should stay
\setlength{\fboxsep}{-\fboxrule} % for debugging


%% PACKAGE algorithm
%% name printed in the caption of algorithm floats
\floatname{algorithm}{Algorithm}


%% PACKAGE tabularray
%% load the amsmath library of tabularray
\UseTblrLibrary{amsmath}


%% PACKAGE color
%% palette; red and gray are given custom values under their usual names
\definecolor{lightgreen}{rgb}{0.55, 0.87, 0.47}
\definecolor{gray}{rgb}{0.3, 0.3, 0.3}
\definecolor{red}{rgb}{1, 0.1, 0.1}
%% shorthands built on the palette
\newcommand{\rd}[1]{\textcolor{red}{#1}} %% red text
\newcommand{\gry}[1]{\textcolor{gray}{#1}} %% gray text
\newcommand{\lgt}{\cellcolor{lightgreen}} %% light green in tables

%% closing ornament: a centred row of dots
%% usage: \thend{0.5\textwidth}
%%   #1  width of the row of dots
\newcommand{\thend}[1]{%
  \begin{center}
    \begin{minipage}[c][1em][c]{#1}
      \dotfill{}
    \end{minipage}
  \end{center}%
}


%% PACKAGE thmtools
%% style steo: bold heading, medium-weight note, upright body,
%% closed with \qedsymbol
\declaretheoremstyle[
 headfont=\normalfont\bfseries,
 notefont=\mdseries,
 bodyfont=\normalfont,
 qed=\qedsymbol % chktex 1
]{steo}
%% unnumbered environment theorem, set in style steo
\declaretheorem[numbered=no, style=steo]{theorem}

%% style sdef: same fonts as steo, without a closing symbol
\declaretheoremstyle[
  headfont=\normalfont\bfseries,
  notefont=\mdseries,
  bodyfont=\normalfont,
]{sdef}
%% unnumbered environment definition, set in style sdef
\declaretheorem[numbered=no, style=sdef]{definition}

%% style sprf: custom spacing above and below, heading followed by a colon
%% and 1em of space, closed with a black square
\declaretheoremstyle[
  spaceabove=-6pt,
  spacebelow=6pt,
  headfont=\normalfont\bfseries,
  bodyfont=\normalfont,
  postheadspace=1em,
  qed=$\blacksquare$,
  headpunct={:}
]{sprf}
%% unnumbered environment prof, printed with the heading "Proof"
\declaretheorem[name={Proof}, style=sprf, numbered=no]{prof}

%% ......................................................................... %%
%% local changes
% \setcounter{secnumdepth}{0}

\usetikzlibrary{calc}
\usepgfplotslibrary{groupplots}

\usepackage[%
binary-units=true,
prefixes-as-symbols=false,
]{siunitx}

%% microsecond unit written with the literal micro symbol followed by s
%% NOTE(review): presumably spelled out instead of \micro\second so that the
%% package option prefixes-as-symbols=false does not replace the prefix by a
%% power of ten --- confirm before simplifying
\DeclareSIUnit{\microsecond}{\SIUnitSymbolMicro s} % chktex 1

%% start the next plot of a groupplot with the axis settings shared by all
%% performance plots (x from 1 to 64, logarithmic y axis in base 2)
%% usage: \newgroupplot{Title}   (inside a groupplot environment)
%%   #1  title of the plot
\newcommand{\newgroupplot}[1]{
  \nextgroupplot[
  %% braces protect titles that contain a comma or an equals sign
  title = {#1},
  xmin = 1, xmax = 64,
  ymin = 0.005, ymode = log,
  log basis y={2},
  xtick distance = 8,
  log ticks with fixed point,
  grid = both,
  minor tick num = 1,
  major grid style = {lightgray},
  minor grid style = {lightgray!25},
  width = 0.8\textwidth,
  height = 0.5\textwidth,
  legend pos=outer north east,
  ]
}

%% plot every data column of a comma-separated file in the next group plot
%% usage: \plotfile{data/file.dat}{Title}   (inside a groupplot environment)
%%   #1  path of the data file; column 0 is used as x, each further
%%       column becomes one curve labelled with its column name
%%   #2  title of the plot
\newcommand{\plotfile}[2]{
  \newgroupplot{#2}
  %% the data is kept in \plotfiledata so that LaTeX's own \table (the
  %% table environment) is never overwritten
  \pgfplotstableread[col sep=comma]{#1}{\plotfiledata}
  \pgfplotstablegetcolsof{\plotfiledata}
  %% index of the last column (indices start at 0)
  \pgfmathtruncatemacro\numberofcols{\pgfplotsretval-1}
  \pgfplotsinvokeforeach{1,...,\numberofcols}{
    \pgfplotstablegetcolumnnamebyindex{##1}\of{\plotfiledata}\to{\colname}
    \addplot table [y index=##1] {\plotfiledata};
    \addlegendentryexpanded{\colname}
  }
  %% reference lines: the constant 1 and the diagonal y = x
  \addplot[mark=none, black, samples=2, domain=0:64] {1};
  \addplot[domain=1:64,samples=200,color=gray!70,] {x};
}

%% directory searched by \includegraphics for image files
\graphicspath{ {./import/} }

%% - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - %%

%% document metadata
%% NOTE(review): no \maketitle appears in this file; the title page is
%% composed by hand inside the titlepage environment
\title{Report Parallel and Distributed Systems}
\author{
  Elvis Rossi
}
\date{\today}

%% - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - %%

\begin{document}

\hypersetup{pageanchor=false}

\begin{titlepage}
\begin{figure}[!htb]
    \centering
    \includegraphics[keepaspectratio=true,scale=0.8]{figures/marchio.eps}
\end{figure}

\begin{center}
  \large{Master Degree in Computer Science} \\
  \vspace{1cm}
  \large{Report for Parallel and Distributed Systems: paradigms and models} \\
  \vspace{1cm}
  \textbf{\LARGE{``Stencil'' parallel pattern}}
\end{center}

\vspace{7cm}

%% two boxes side by side: teachers on the left, student on the right
%% (\bfseries replaces the obsolete \bf font switch)
\noindent
\begin{minipage}[t]{0.55\textwidth}
  \raggedright%
    \large{\bfseries Teachers: \\
      \ Prof.\ Marco\ Danelutto \\
      \ Prof.\ Patrizio\ Dazzi \\
    }
\end{minipage}\hfill
\noindent
\begin{minipage}[t]{0.4\textwidth}
  \raggedleft%
    \large{\bfseries Student: \\
      Elvis Rossi \\
      ID:\ 561394 \\
    }
\end{minipage}

\vspace{25mm}
\noindent
\centering{
  \large{Academic year 2022--2023}
}

\end{titlepage}

\hypersetup{pageanchor=true}

%% - - - - - - - - - - - - - - - - - %%

\tableofcontents

\newpage


\section{Building and Executing the project}

The project uses \texttt{cmake} to create the native makefiles. The flag \texttt{CMAKE\_BUILD\_TYPE} can be used to specify the type of build; two options are supported: \texttt{Debug} and \texttt{Release}.
The main file creates a \texttt{.csv} file with the execution time of different test cases with input files located in \texttt{./tests}. On macOS, thread pinning for the FastFlow library is disabled since it is not supported by the operating system.

To compile and run the project:

\begin{minted}{bash}
  cmake -DCMAKE_BUILD_TYPE=Release -S . -B build/
  cd build/
  make
  ./main
\end{minted}

\section{Implementation Design}
%% - - - - - - - - - - - - - - - - - - %%
\subsection{Design Choices}
The class \texttt{Stencil} holds both parallel implementations: the one using the FastFlow library and the one using native C++ threads.  The one using C++ threads can be called with the method \texttt{stdthread}.  The operator \texttt{()} instead will use the FastFlow library.  The class can also be used as a node; an example is given in the file ``main.cpp'', where the function \texttt{fastflow} creates a pipe between the reader, the stencil and the writer.

\begin{figure}[H]
  \centering
  \includegraphics[width=0.4\textwidth]{pipeline.eps}
  \caption{Pipeline composed of the reader, the stencil and the writer.}
\end{figure}

The class \texttt{Reader} reads a binary file composed of 4 bytes representing the number of rows, 4 bytes representing the number of columns and then the raw matrix data. Each element is a \texttt{char} in all the test cases. The result is stored in the class \texttt{Task} which will be passed to the next node. If instead the operator \texttt{()} is called, only the data will be returned as a pointer.

The \texttt{Task} class can also support matrices with element types other than \texttt{char}.

The \texttt{Writer} instead writes the task to disk in the same folder, overwriting existing files if present.

%% - - - - - - - - - - - - - - - - - - %%
\subsection{Native C++ Threads}

The structure of the implementation with native C++ threads is as follows:

\begin{algorithm}[H]
  %% driver procedure: iterates over the inputs and hands the work of each
  %% iteration to the thread pool
  \begin{algorithmic}[1]
    \Procedure{stdthread}{$Input,Output$}
      \For{$result \in Input$}
        \State $arena = result$
        \While{$iter>0$}
          \For{$thread \in ThreadPool$}
            \State send a new LAMBDA with appropriate bounds to the threadpool
          \EndFor
          \State swap $arena$ with $result$
          \State $iter = iter - 1$
        \EndWhile
        \State wait for the threadpool to finish
        \State append $result$ to $Output$
      \EndFor
    \EndProcedure
  \end{algorithmic}

  %% job sent to the thread pool: processes one band of rows
  %% \If and \Comment each take ONE braced argument; without the braces only
  %% the next token is taken and the line is typeset wrongly
  \begin{algorithmic}[1]
    \Procedure{lambda}{$l, \Delta$}\Comment{\textit{$l$ is the thread number, $\Delta$ is the amount of rows to process}}
      \For{$x \in \{l \cdot \Delta, \ldots, (l+1) \cdot \Delta - 1\}$}
        %% column indices start at 0, so the last one is Columns - 1
        \For{$y \in \{0, \ldots, Columns - 1\}$}
          \If{$(x, y)$ not in the border}
            \State calculate the neighborhood of $(x, y)$
            \State $arena[x][y] = Stencil(neighborhood)$
          \EndIf
        \EndFor
      \EndFor
    \EndProcedure
  \end{algorithmic}
\end{algorithm}

The threadpool is implemented in the \texttt{threadPool.hpp} and \texttt{threadPool.cpp} files.

Since for each element the work is equivalent, the $\Delta$ used in the lambda function is simply the total number of rows divided by the number of workers, such that each worker has only one job and all jobs are roughly equal in time.

The threadpool uses a queue and once a job is pushed only one thread may execute the function.
Since it is required for all jobs to finish, a condition variable is used to wake a thread that is waiting for all jobs to finish, eliminating the need for active wait.

%% - - - - - - - - - - - - - - - - - - %%
\subsection{FastFlow}

The structure of the implementation using FastFlow is similar to the one with native threads.
Since the \texttt{Stencil} class is a subclass of \texttt{ff\_Map}, the method used for the execution is \texttt{parallel\_for}.

A custom emitter and collector would not have been faster and so the simpler approach of inheriting the methods from \texttt{ff\_Map} was chosen.

\begin{algorithm}[H]
  %% node procedure: repeats the parallel loop for the requested iterations
  \begin{algorithmic}[1]
    \Procedure{fastflow}{$Task$}
      \State $arena = Task$
      \While{$iter>0$}
        \State \texttt{parallel\_for} with LAMBDA as the function to execute
        \State swap $arena$ with $Task$
        \State $iter = iter - 1$
      \EndWhile
      \State return $Task$
    \EndProcedure
  \end{algorithmic}

  %% body of the parallel loop: processes row x
  %% \If takes ONE braced argument; without the braces only the next token
  %% is taken and "then" is typeset before the condition
  \begin{algorithmic}[1]
    \Procedure{lambda}{$x$}
      %% column indices start at 0, so the last one is Columns - 1
      \For{$y \in \{0, \ldots, Columns - 1\}$}
        \If{$(x, y)$ not in the border}
          \State calculate the neighborhood of $(x, y)$
          \State $arena[x][y] = Stencil(neighborhood)$
        \EndIf
      \EndFor
    \EndProcedure
  \end{algorithmic}
\end{algorithm}

%% - - - - - - - - - - - - - - - - - - %%
\section{Performance Analysis}

The matrix data inside the class \texttt{Task} was tested for performance both as a vector of vectors and as a simple contiguous arena. The performance was exactly the same, so the simpler vector of vectors implementation was preferred.

In the file \texttt{main.cpp} a csv file is created from various tests on files from the \texttt{tests/} directory.
The time computed is for reading the file from disk, computing the stencil with different parameters and finally writing again to disk.
Instead of averaging the times of different runs, the minimum of the runs is chosen since outliers skew the mean greatly.
Reading and writing to disk are much faster than the computation except for the largest examples. In those cases the minimum time of reading and writing is subtracted.

Since

\[ T_{\text{total}} = T_{\texttt{Reader}} + T_{\texttt{Stencil}} + T_{\texttt{Writer}} \]

and the value of $T_{\texttt{Reader}} + T_{\texttt{Writer}}$ is known on average, the speedup, scalability and efficiency are calculated as follows:

\begin{align*}
  \text{Speedup}(n) &= \frac{T_{\text{seq}}}{T_{\text{par}}(n) - (T_{\texttt{Reader}} + T_{\texttt{Writer}})} \\
  \text{Scalability}(n) &= \frac{T_{\text{par}}(1) - (T_{\texttt{Reader}} + T_{\texttt{Writer}})}{T_{\text{par}}(n) - (T_{\texttt{Reader}} + T_{\texttt{Writer}})} \\
  \text{Efficiency}(n) &= \frac{\text{Speedup}(n)}{n}
\end{align*}

For very small matrices the efficiency, the speedup and the scalability are very poor for both versions.
For larger examples instead a significant speedup is seen, but the implementation using native threads is slightly faster.

\begin{center}
  \begin{tblr}{
      colspec = {Q[l,m]|Q[r,m]|Q[r,m]},
    }
    Image & $T_{\texttt{Reader}} + T_{\texttt{Writer}}$ in \si{\microsecond} & Size in \si{\byte} \\
    \hline % chktex 44
    empty2x2 & 2218 & 12 \\ % chktex 29
    increasing4x6 & 2054 & 32 \\ % chktex 29
    increasing300x200 & 1301 & 60008 \\ % chktex 29
    random400x2500 & 7101 & 1000008 \\ % chktex 29
    equation & 786324 & 10000008 \\
    equation2 & 2312927 & 30000008 \\
  \end{tblr}
\end{center}

\begin{center}
  \begin{tikzpicture}
    \begin{groupplot}[group style={group size=1 by 2, vertical sep = 1.5cm}]
      \plotfile{data/increasing300x200ff.dat}{Fastflow}

      \plotfile{data/increasing300x200std.dat}{Native Threads}
    \end{groupplot}
    \node (title) at ($(group c1r1.center)+(0,4.5cm)$) {\color{red}{increasing300x200}};
  \end{tikzpicture}
\end{center}

For the file \texttt{increasing300x200} % chktex 29
the FastFlow version has a peak of speedup and scalability when using 4 workers in the stencil stage but quickly loses performance due to the small size of the input. For the native thread version instead the speedup and the scalability always stay above $1$ but have a peak at 32 workers.

\begin{center}
  \begin{tikzpicture}
    \begin{groupplot}[group style={group size=1 by 2, vertical sep = 1.5cm}]
      \plotfile{data/random400x2500ff.dat}{Fastflow}

      \plotfile{data/random400x2500std.dat}{Native Threads}
    \end{groupplot}
    \node (title) at ($(group c1r1.center)+(0,4.5cm)$) {\color{red}{random400x2500}};
  \end{tikzpicture}
\end{center}

The file \texttt{random400x2500} % chktex 29
performs best with 16 workers in the Fastflow implementation and slightly better at 64 workers compared to 32 workers in terms of speedup and scalability but has a significant drop in efficiency from $0.361$ to $0.184$. The relationship between number of workers and speedup is close to linear up to 8 workers.

\begin{center}
  \begin{tikzpicture}
    \begin{groupplot}[group style={group size=1 by 2, vertical sep = 1.5cm}]
      \plotfile{data/equationff.dat}{Fastflow}

      \plotfile{data/equationstd.dat}{Native Threads}
    \end{groupplot}
    \node (title) at ($(group c1r1.center)+(0,4.5cm)$) {\color{red}{equation}};
  \end{tikzpicture}
\end{center}

The file \texttt{equation} more closely follows a linear relationship between speedup or scalability and number of workers for both versions.

\begin{center}
  \begin{tikzpicture}
    \begin{groupplot}[group style={group size=1 by 2, vertical sep = 1.5cm}]
      \plotfile{data/equation2ff.dat}{Fastflow}

      \plotfile{data/equation2std.dat}{Native Threads}
    \end{groupplot}
    \node (title) at ($(group c1r1.center)+(0,4.5cm)$) {\color{red}{equation2}};
  \end{tikzpicture}
\end{center}

As the size of the input increases, the speedup and the scalability both follow linear upward trends with a higher number of threads.

The scalability for both test files \texttt{equation} and \texttt{equation2} never goes below $0.37$, but is slightly better for the implementation with native C++ threads.

The difference in the three quantities between the test with file \texttt{equation} and the test with file \texttt{equation2} is much smaller for the Fastflow version. In the native thread version instead there is a small improvement, especially with a higher number of workers.

\end{document}

%% - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - %%

%%% Local Variables:
%%% TeX-command-extra-options: "-shell-escape"
%%% End: