Compare commits

...

10 Commits

SHA1 Message Date
d15c57b5a1 Update .gitattributes 2023-10-29 02:18:12 +01:00
cb2a139b4f added reasons for current implementation 2023-08-29 17:26:23 +02:00
5d8348b42a fixed memory leak on () operator for stencil class 2023-08-29 17:25:54 +02:00
00e921f219 fixed some latex warnings 2023-08-29 15:34:45 +02:00
0476329279 spelling mistakes 2023-08-26 21:14:54 +02:00
a21e5d46da fixing 2023-08-26 20:21:35 +02:00
278f6d6b7a fixing 2023-08-26 20:20:01 +02:00
24566d7002 ignoring eps files 2023-08-26 20:15:02 +02:00
8fcbdb788e fixes 2023-08-26 20:07:00 +02:00
20125fc29f fixes 2023-08-26 20:01:16 +02:00
6 changed files with 73 additions and 52 deletions

.gitattributes vendored Normal file (+1 line)
View File

@@ -0,0 +1 @@
*.eps linguist-generated
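The linguist-generated attribute marks matching files as generated, so the forge can collapse their diffs by default and exclude them from language statistics; this lines up with the "ignoring eps files" commit above.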

View File

@@ -208,7 +208,7 @@ int main(int argc, char *argv[]) {
vector<pair<int, int>> neig = {make_pair(-1, 1), make_pair(-1, 0),
make_pair(-1, -1), make_pair(0, 1),
make_pair(0, -1), make_pair(1, 1),
make_pair(1, 0), make_pair(1, -1)};
make_pair(1, 0), make_pair(1, -1)};
ofstream csvfile;
csvfile.open("performance.csv");

View File

Binary file not shown.

View File

@@ -15,7 +15,7 @@
\geometry{a4paper}
\usepackage[utf8]{inputenc} %% use UTF-8, maybe not needed since 2018
\usepackage[italian,main=english]{babel} %% language
\usepackage[english]{babel} %% language
\pagestyle{headings}
@@ -32,7 +32,7 @@
style=numeric,
sorting=ynt
]{biblatex} %% for citations
\addbibresource{document.bib}
% \addbibresource{document.bib}
\usepackage{import} %% specify path for import
@@ -175,8 +175,7 @@
\usetikzlibrary{calc}
\usepgfplotslibrary{groupplots}
\usepackage[%
binary-units=true,
\usepackage[
prefixes-as-symbols=false,
]{siunitx}
@@ -205,9 +204,9 @@ prefixes-as-symbols=false,
\pgfplotstableread[col sep=comma]{#1}{\table}
\pgfplotstablegetcolsof{\table}
\pgfmathtruncatemacro\numberofcols{\pgfplotsretval-1}
\pgfplotsinvokeforeach{1,...,\numberofcols}{
\pgfplotsinvokeforeach{1,...,\numberofcols}{ % chktex 11
\pgfplotstablegetcolumnnamebyindex{##1}\of{\table}\to{\colname}
\addplot table [y index=##1] {\table};
\addplot table [y index=##1] {\table}; % chktex 1
\addlegendentryexpanded{\colname}
}
\addplot[mark=none, black, samples=2, domain=0:64] {1};
@@ -275,6 +274,9 @@ prefixes-as-symbols=false,
%% - - - - - - - - - - - - - - - - - %%
\thispagestyle{empty}
\addtocounter{page}{-1}
\tableofcontents
\newpage
@@ -307,10 +309,17 @@ The class \texttt{Stencil} holds both the parallel implementation using the Fast
The class \texttt{Reader} reads a binary file composed of 4 bytes representing the number of rows, 4 bytes representing the number of columns, and then the raw matrix data. Each element is a \texttt{char} in all the test cases. The result is stored in the class \texttt{Task}, which is passed to the next node. If instead the operator \texttt{()} is called, only the data is returned as a pointer.
The \texttt{Task} class can support matrixes of different element type rather than \texttt{char}.
The \texttt{Task} class can support matrices with element types other than \texttt{char}.
The \texttt{Writer} instead writes the task to disk in the same folder, overwriting existing files if present.
The \texttt{Stencil} class divides the matrix into roughly equal parts and distributes them to other workers.
Since the amount of work for simple stencil functions is roughly equal between blocks of columns, the matrix is split into equal blocks and each block is processed by a different worker.
The result is stored in a copy of the original matrix and the pointers are swapped at the end of each iteration.
Since a neighbourhood of columns is needed for the next iteration, the simplest solution of waiting for all workers has been implemented.
A contiguous block of columns reduces the probability of false sharing.
The loops of the worker threads cannot be vectorized by the compiler, since the stencil function may call library functions or use conditional statements.
%% - - - - - - - - - - - - - - - - - - %%
\subsection{Native C++ Threads}
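The binary layout described above for \texttt{Reader} (4-byte row count, 4-byte column count, then raw \texttt{char} elements) can be sketched as below; this is illustrative only, the repository's actual \texttt{Reader} class is not shown in this diff, and row-major order plus native endianness are assumptions.

// Illustrative sketch of the binary format described above:
// 4 bytes rows, 4 bytes cols, then rows*cols raw char elements.
// Row-major order and native endianness are assumptions.
#include <cstdint>
#include <fstream>
#include <string>
#include <vector>

std::vector<std::vector<char>> read_matrix(const std::string &path) {
  std::ifstream in(path, std::ios::binary);
  int32_t rows = 0, cols = 0;
  in.read(reinterpret_cast<char *>(&rows), sizeof(rows));
  in.read(reinterpret_cast<char *>(&cols), sizeof(cols));
  std::vector<std::vector<char>> m(rows, std::vector<char>(cols));
  for (auto &row : m)
    in.read(row.data(), cols); // one row of raw matrix data
  return m;
}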
@@ -320,34 +329,35 @@ The structure of the implementation with native C++ threads is as follows:
\begin{algorithmic}[1]
\Procedure{stdthread}{$Input,Output$}
\For{$result \in Input$}
\State $arena = result$
\State{$arena = result$}
\While{$iter>0$}
\For{$thread \in ThreadPool$}
\State send a new LAMBDA with appropriate bounds to the threadpool
\EndFor
\State swap $arena$ with $result$
\State $iter = iter - 1$
\EndWhile
\State wait for the threadpool to finish
\State append $result$ to $Output$
\EndFor
\EndProcedure
\State{send a new LAMBDA with appropriate bounds to the threadpool}
\EndFor{}
\State{swap $arena$ with $result$}
\State{$iter = iter - 1$}
\EndWhile{}
\State{wait for the threadpool to finish}
\State{append $result$ to $Output$}
\EndFor{}
\EndProcedure{}
\end{algorithmic}
\begin{algorithmic}[1]
\Procedure{lambda}{$l, \Delta$}\Comment \textit{$l$ is the thread number, $\Delta$ is the ammount of rows to process}
\Procedure{lambda}{$l, \Delta$}\Comment{\textit{$l$ is the index of block of rows, $\Delta$ is the number of rows}}
\For{$x \in \{l \cdot \Delta, \ldots, (l+1) \cdot \Delta - 1\}$}
\For{$y \in \{0, \ldots, Columns\}$}
\If $(x, y)$ not in the border
\State calculate the neighborhood of $(x, y)$
\State $arena[x][y] = Stencil(neighborhood)$
\EndIf
\EndFor
\EndFor
\EndProcedure
\If{$(x, y)$ not in the border}
\State{calculate the neighborhood of $(x, y)$}
\State{$arena[x][y] = Stencil(neighborhood)$}
\EndIf{}
\EndFor{}
\EndFor{}
\EndProcedure{}
\end{algorithmic}
\end{algorithm}
The threadpool is implemented in the \texttt{threadPool.hpp} and \texttt{threadPool.cpp} files.
Since the work per element is equivalent, the $\Delta$ used in the lambda function is simply the total number of rows divided by the number of workers, so that each worker has exactly one job and all jobs take roughly the same time.
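A minimal sketch of this block partitioning with plain \texttt{std::thread} (the repository's thread pool interface is not shown in this diff, so the submission mechanism here is an assumption):

// Sketch: split the rows into nworkers contiguous blocks of delta rows,
// as in the pseudocode; the real code submits lambdas to a thread pool.
#include <functional>
#include <thread>
#include <vector>

void run_iteration(int rows, int nworkers,
                   const std::function<void(int, int)> &process_rows) {
  int delta = rows / nworkers; // the Delta from the text
  std::vector<std::thread> threads;
  for (int l = 0; l < nworkers; ++l) {
    int begin = l * delta;
    // the last worker also takes the remainder rows
    int end = (l == nworkers - 1) ? rows : begin + delta;
    threads.emplace_back(process_rows, begin, end);
  }
  for (auto &t : threads)
    t.join(); // barrier: all blocks must finish before the swap
}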
@@ -361,30 +371,31 @@ Since it is required for all jobs to finish, a condition variable is used to wak
The structure of the implementation using FastFlow is similar to the one with native threads.
Since the \texttt{Stencil} class is a subclass of \texttt{ff\_Map}, the method used for the execution is \texttt{parallel\_for}.
A custom emitter and collector would not have been faster and so the simpler approach of inheriting the methods from \texttt{ff\_Map} was chosen.
A custom emitter and collector would not have been significantly faster and so the simpler approach of inheriting the methods from \texttt{ff\_Map} was chosen.
A custom emitter would have had to split the range into as many blocks as there are workers, and the custom collector would have had to act as a barrier for all of them.
\begin{algorithm}[H]
\begin{algorithmic}[1]
\Procedure{fastflow}{$Task$}
\State $arena = Task$
\State{$arena = Task$}
\While{$iter>0$}
\State \texttt{parallel\_for} with LAMBDA as the function to execute
\State swap $arena$ with $Task$
\State $iter = iter - 1$
\EndWhile
\State return $Task$
\EndProcedure
\State{\texttt{parallel\_for} with LAMBDA as the function to execute}
\State{swap $arena$ with $Task$}
\State{$iter = iter - 1$}
\EndWhile{}
\State{return $Task$}
\EndProcedure{}
\end{algorithmic}
\begin{algorithmic}[1]
\Procedure{lambda}{$x$}
\For{$y \in \{0, \ldots, Columns\}$}
\If $(x, y)$ not in the border
\State calculate the neighborhood of $(x, y)$
\State $arena[x][y] = Stencil(neighborhood)$
\EndIf
\EndFor
\EndProcedure
\If{$(x, y)$ not in the border}
\State{calculate the neighborhood of $(x, y)$}
\State{$arena[x][y] = Stencil(neighborhood)$}
\EndIf{}
\EndFor{}
\EndProcedure{}
\end{algorithmic}
\end{algorithm}
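The \texttt{parallel\_for} pattern above can be sketched with FastFlow's standalone \texttt{ParallelFor}; the repository instead inherits the method from \texttt{ff\_Map}, and the stencil computation below is a placeholder:

#include <ff/parallel_for.hpp>
#include <utility>
#include <vector>

// One iteration: each parallel index x is a row; the lambda fills the
// arena from the previous state, skipping the border as in the pseudocode.
void iterate(std::vector<std::vector<char>> &grid,
             std::vector<std::vector<char>> &arena, int nworkers) {
  ff::ParallelFor pf(nworkers);
  const long rows = grid.size(), cols = grid[0].size();
  pf.parallel_for(1, rows - 1, [&](const long x) {
    for (long y = 1; y < cols - 1; ++y)
      arena[x][y] = grid[x][y]; // placeholder for Stencil(neighborhood)
  });
  std::swap(grid, arena); // buffer swap between iterations
}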
@@ -404,11 +415,14 @@ Since
and the value of $T_{\texttt{Reader}} + T_{\texttt{Writer}}$ is known on average, the speedup, scalability, and efficiency are calculated as follows:
\begingroup
\addtolength{\jot}{1em}
\begin{align*}
\text{Speedup}(n) &= \frac{T_{\text{seq}}}{T_{\text{par}}(n) - (T_{\texttt{Reader}} + T_{\texttt{Writer}})} \\
\text{Speedup}(n) &= \frac{T_{\text{seq}} - (T_{\texttt{Reader}} + T_{\texttt{Writer}})}{T_{\text{par}}(n) - (T_{\texttt{Reader}} + T_{\texttt{Writer}})} \\
\text{Scalability}(n) &= \frac{T_{\text{par}}(1) - (T_{\texttt{Reader}} + T_{\texttt{Writer}})}{T_{\text{par}}(n) - (T_{\texttt{Reader}} + T_{\texttt{Writer}})} \\
\text{Efficiency}(n) &= \frac{\text{Speedup}(n)}{n} \\
\text{Efficiency}(n) &= \frac{\text{Speedup}(n)}{n}
\end{align*}
\endgroup
For very small matrices the efficiency, the speedup, and the scalability are very poor for both versions.
For larger inputs, instead, a significant speedup is seen, with the implementation using native threads being slightly faster.
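As a purely illustrative computation (these numbers are not measurements from the report): if $T_{\text{seq}} - (T_{\texttt{Reader}} + T_{\texttt{Writer}}) = 80$ s and $T_{\text{par}}(8) - (T_{\texttt{Reader}} + T_{\texttt{Writer}}) = 12$ s, then

% illustrative numbers only, not measured results
\begin{align*}
\text{Speedup}(8) &= \frac{80}{12} \approx 6.67 \\
\text{Efficiency}(8) &= \frac{6.67}{8} \approx 0.83
\end{align*}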
@@ -454,7 +468,7 @@ the fastflow has a peak of speedup and scalability when using 4 workers in the s
\end{center}
The file \texttt{random400x2500} % chktex 29
performs best with 16 workers in the Fastflow implementation and slightly better at 64 workers compared to 32 workers in terms of speedup and scalability but has a significand drop in efficiency from $0.361$ to $0.184$. The relationship between number of workers and speedup is close to linear up to 8 workers.
performs best with 16 workers in the Fastflow implementation and in the native thread implementation performs slightly better at 64 workers compared to 32 workers in terms of speedup and scalability, but has a significant drop in efficiency from $0.361$ to $0.184$. The relationship between number of workers and speedup is close to linear up to 8 workers.
\begin{center}
\begin{tikzpicture}
@@ -480,11 +494,11 @@ The file \texttt{equation} more closely follows a linear relationship between sp
\end{tikzpicture}
\end{center}
As the size of the input increases the speedup and the scalability both follow linear trends up with a higher ammount of threds.
As the size of the input increases, the speedup and the scalability both follow linear trends up to a higher number of threads.
The scalability for both test files \texttt{equation} and \texttt{equation2} never goes below $0.37$, and is slightly better for the implementation with native C++ threads.
The difference in the three quantities between the test with file \texttt{equation} and the test with file \texttt{euqation1} is much smaller for the Fastflow version. In the native thread version instead there is a small improvement expecially with a higher number of workers.
The difference in the three quantities between the test with file \texttt{equation} and the test with file \texttt{equation1} is much smaller for the Fastflow version. In the native thread version, instead, there is a small improvement, especially with a higher number of workers.
\end{document}

View File

@@ -35,7 +35,7 @@ template <typename T> class Stencil : public ff::ff_Map<Task<T>> {
std::vector<std::promise<Task<T> *> *> *OutputVector);
private:
Task<T> *svc_helper(Task<T> *t);
Task<T> *svc_helper(Task<T> *t, int iterations);
void constructor_helper(std::vector<std::pair<int, int>> neighborhood);
std::function<T(std::vector<T>)> Convolution;
@@ -100,7 +100,7 @@ void Stencil<T>::constructor_helper(
// svc function for fastflow library
template <typename T> Task<T> *Stencil<T>::svc(Task<T> *task) {
task = svc_helper(task);
task = svc_helper(task, this->Iterations);
ff::ff_node::ff_send_out(task);
return this->GO_ON;
}
@@ -112,9 +112,15 @@ Stencil<T>::operator()(std::vector<std::vector<T>> *matrix, int iterations) {
if ((*matrix).size() == 0 || (*matrix)[0].size() == 0) {
return matrix;
}
Task<T> *task = new Task<T>(matrix, (*matrix).size(), (*matrix)[0].size());
task = svc_helper(task);
return task->VectorData;
std::vector<std::vector<T>> *arena = new std::vector<std::vector<T>>();
*arena = *matrix; // work on a copy of the caller's matrix
Task<T> *task = new Task<T>(arena, (*arena).size(), (*arena)[0].size());
task = svc_helper(task, iterations);
*matrix = *task->VectorData; // copy the result back
delete task; // free the temporary task: this was the leak
return matrix;
}
// function for std thread
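The ownership pattern introduced by this fix, in a self-contained hedged sketch (names are illustrative, not the repository's): work on a heap copy, copy the result back, and free everything the function allocated, which is what closes the leak from commit 5d8348b42a.

#include <vector>

// Sketch of the copy-in/copy-out pattern used by the fixed operator():
std::vector<std::vector<char>> *
process_in_place(std::vector<std::vector<char>> *matrix) {
  auto *arena = new std::vector<std::vector<char>>(*matrix); // copy in
  // ... run the stencil iterations on *arena ...
  *matrix = *arena; // copy the result back to the caller's matrix
  delete arena;     // free the temporary: previously this was leaked
  return matrix;    // the caller keeps ownership of its own matrix
}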
@@ -243,8 +249,8 @@ void Stencil<T>::sequential(
}
}
template <typename T> Task<T> *Stencil<T>::svc_helper(Task<T> *task) {
int niter = Iterations;
template <typename T> Task<T> *Stencil<T>::svc_helper(Task<T> *task, int iterations) {
int niter = iterations;
std::vector<std::vector<T>> *arena = new std::vector<std::vector<T>>(0);
*arena = *(task->VectorData);