Added data and begun report

This commit is contained in:
elvis
2023-08-25 20:18:08 +02:00
parent 49180c6b8a
commit ace44a4848
11 changed files with 264 additions and 8 deletions

View File

@ -213,8 +213,6 @@ int main(int argc, char *argv[]) {
ofstream csvfile;
csvfile.open("performance.csv");
goto theend;
for (std::string image : images1) {
cout << endl
<< "\033[1;31mProcessing: \t" << image << "\033[0m" << endl;
@ -399,26 +397,27 @@ int main(int argc, char *argv[]) {
}
}
theend:
cout << "Computing reading and writing speeds" << endl;
for (auto image : images1) {
cout << endl
<< "\033[1;31mProcessing: \t" << image << "\033[0m" << endl;
csvfile << "RWtime:," << image << ",";
vector<long long int> results;
for (int i = 0; i < average_max_rw; ++i) {
results.push_back(reading_writing({image}));
}
csvfile << std::accumulate(results.begin(), results.end(), 0) /
average_max_rw
csvfile << *std::min_element(results.begin(), results.end())
<< "\n";
}
for (auto image : images2) {
cout << endl
<< "\033[1;31mProcessing: \t" << image << "\033[0m" << endl;
csvfile << "RWtime:," << image << ",";
vector<long long int> results;
for (int i = 0; i < average_max_rw; ++i) {
results.push_back(reading_writing({image}));
}
csvfile << std::accumulate(results.begin(), results.end(), 0) /
average_max_rw
csvfile << *std::min_element(results.begin(), results.end())
<< "\n";
}

View File

@ -0,0 +1,8 @@
Number of Workers,Speedup,Scalability,Efficiency
1,1.017,1.000,1.017
2,2.091,2.055,1.045
4,3.757,3.693,0.939
8,6.739,6.624,0.842
16,11.018,10.829,0.689
32,17.190,16.897,0.537
64,23.619,23.215,0.369

View File

@ -0,0 +1,8 @@
Number of Workers,Speedup,Scalability,Efficiency
1,1.047,1.000,1.047
2,2.052,1.959,1.026
4,4.085,3.901,1.021
8,8.162,7.794,1.020
16,15.871,15.155,0.992
32,23.195,22.149,0.725
64,30.969,29.572,0.484

View File

@ -0,0 +1,8 @@
Number of Workers,Speedup,Scalability,Efficiency
1,1.016,1.000,1.016
2,2.085,2.052,1.043
4,3.720,3.660,0.930
8,6.512,6.407,0.814
16,11.130,10.951,0.696
32,17.066,16.791,0.533
64,23.691,23.310,0.370

View File

@ -0,0 +1,8 @@
Number of Workers,Speedup,Scalability,Efficiency
1,1.047,1.000,1.047
2,2.048,1.956,1.024
4,4.062,3.881,1.016
8,7.992,7.635,0.999
16,15.656,14.956,0.978
32,21.868,20.891,0.683
64,27.141,25.928,0.424

View File

@ -0,0 +1,8 @@
Number of Workers,Speedup,Scalability,Efficiency
1,1.012,1.000,1.012
2,1.191,1.176,0.595
4,1.285,1.270,0.321
8,0.965,0.954,0.121
16,0.715,0.706,0.045
32,0.491,0.485,0.015
64,0.341,0.337,0.005

View File

@ -0,0 +1,8 @@
Number of Workers,Speedup,Scalability,Efficiency
1,1.040,1.000,1.040
2,1.943,1.867,0.971
4,3.691,3.547,0.923
8,6.241,5.999,0.780
16,6.018,5.784,0.376
32,7.736,7.436,0.242
64,3.764,3.618,0.059

View File

@ -0,0 +1,8 @@
Number of Workers,Speedup,Scalability,Efficiency
1,1.012,1.000,1.012
2,1.767,1.746,0.883
4,2.994,2.959,0.748
8,4.776,4.721,0.597
16,6.491,6.417,0.406
32,6.342,6.269,0.198
64,5.385,5.323,0.084

View File

@ -0,0 +1,8 @@
Number of Workers,Speedup,Scalability,Efficiency
1,1.049,1.000,1.049
2,2.054,1.959,1.027
4,3.866,3.687,0.966
8,7.178,6.846,0.897
16,10.737,10.239,0.671
32,11.564,11.028,0.361
64,11.761,11.216,0.184

Binary file not shown.

View File

@ -172,6 +172,45 @@
%% local changes
% \setcounter{secnumdepth}{0}
\usetikzlibrary{calc}
\usepgfplotslibrary{groupplots}
\usepackage[%
binary-units=true,
prefixes-as-symbols=false,
]{siunitx}
\DeclareSIUnit{\microsecond}{\SIUnitSymbolMicro s} % chktex 1
\newcommand{\newgroupplot}[1]{
\nextgroupplot[
title = #1,
xmin = 0, xmax = 64,
ymin = 0,
xtick distance = 8,
grid = both,
minor tick num = 1,
major grid style = {lightgray},
minor grid style = {lightgray!25},
width = 0.8\textwidth,
height = 0.5\textwidth,
legend pos=outer north east,
]
}
\newcommand{\plotfile}[2]{
\newgroupplot{#2}
\pgfplotstableread[col sep=comma]{#1}{\table}
\pgfplotstablegetcolsof{\table}
\pgfmathtruncatemacro\numberofcols{\pgfplotsretval-1}
\pgfplotsinvokeforeach{1,...,\numberofcols}{
\pgfplotstablegetcolumnnamebyindex{##1}\of{\table}\to{\colname}
\addplot table [y index=##1] {\table};
\addlegendentryexpanded{\colname}
}
\addplot[mark=none, black, samples=2, domain=0:64] {1};
}
\graphicspath{ {./import/} }
%% - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - %%
@ -187,6 +226,7 @@
\begin{document}
\section{Implementation Design}
%% - - - - - - - - - - - - - - - - - - %%
\subsection{Design Choices}
The class \texttt{Stencil} holds both the parallel implementation using the FastFlow library and using the native C++ threads. The one using C++ threads can be called with the method \texttt{stdthread}. The operator \texttt{()} instead will use the FastFlow library. The class can also be used as a node; an example is given in the file ``main.cpp'', where using the function \texttt{fastflow} creates a pipe between the reader, the stencil and the writer.
@ -196,8 +236,161 @@ The class \texttt{Stencil} holds both the parallel implementation using the Fast
\caption{}
\end{figure}
The class \texttt{Reader} reads a binary file composed of 4 bytes representing the number of rows, 4 bytes representing the number of columns and then the raw matrix data. The result is stored in the class \texttt{Task} which will be passed to the next node in the FastFlow implementation. If instead the operator \texttt{()} is called, the resulting task will be returned via the promise given as input.
The class \texttt{Reader} reads a binary file composed of 4 bytes representing the number of rows, 4 bytes representing the number of columns and then the raw matrix data. Each element is a \texttt{char}. The result is stored in the class \texttt{Task} which will be passed to the next node. If instead the operator \texttt{()} is called, only the data will be returned as a pointer.
The \texttt{Writer} instead writes to disk the task to the same folder, overwriting existing files if present.
%% - - - - - - - - - - - - - - - - - - %%
\subsection{Native C++ Threads}
The structure of the implementation with native C++ threads is as follows:
\begin{algorithm}
\begin{algorithmic}[1]
\Procedure{stdthread}{$Input,Output$}
\For{$result \in Input$}
\State $arena = result$
\While{$iter>0$}
\For{$thread \in ThreadPool$}
\State send a new LAMBDA with appropriate bounds to the threadpool
\EndFor
\State swap $arena$ with $result$
\State $iter = iter - 1$
\EndWhile
\State wait for the threadpool to finish
\State push $result$ to $Output$
\EndFor
\EndProcedure
\end{algorithmic}
\begin{algorithmic}[1]
\Procedure{lambda}{$l, \Delta$}\Comment \textit{$l$ is the thread number, $\Delta$ is the number of rows to process}
\For{$x \in \{l \cdot \Delta, \ldots, (l+1) \cdot \Delta - 1\}$}
\For{$y \in \{0, \ldots, Columns\}$}
\If{$(x, y)$ is not on the border}
\State calculate the neighborhood of $(x, y)$
\State $arena[x][y] = Stencil(neighborhood)$
\EndIf
\EndFor
\EndFor
\EndProcedure
\end{algorithmic}
\end{algorithm}
The threadpool is implemented in the \texttt{threadPool.hpp} and \texttt{threadPool.cpp} files.
Since for each element the work is equivalent, the $\Delta$ used in the lambda function is simply the total number of rows divided by the number of workers, such that each worker has only one job and all jobs are roughly equal in time.
The threadpool uses a queue and once a job is pushed only one thread may execute the function.
Since it is required for all jobs to finish, a condition variable is used to wake a thread that is waiting for all jobs to finish, eliminating the need for active wait.
%% - - - - - - - - - - - - - - - - - - %%
\subsection{FastFlow}
The structure of the implementation using FastFlow is similar to the one using native threads.
Since the \texttt{Stencil} class is a subclass of \texttt{ff\_Map}, the method used for the execution is \texttt{parallel\_for}.
A custom emitter and collector would not have been faster and so the simpler approach of inheriting the methods from \texttt{ff\_Map} was chosen.
\begin{algorithm}
\begin{algorithmic}[1]
\Procedure{fastflow}{$Task$}
\State $arena = Task$
\While{$iter>0$}
\State \texttt{parallel\_for} with LAMBDA as the function to execute
\State swap $arena$ with $result$
\State $iter = iter - 1$
\EndWhile
\State return task
\EndProcedure
\end{algorithmic}
\begin{algorithmic}[1]
\Procedure{lambda}{$x$}
\For{$y \in \{0, \ldots, Columns\}$}
\If{$(x, y)$ is not on the border}
\State calculate the neighborhood of $(x, y)$
\State $arena[x][y] = Stencil(neighborhood)$
\EndIf
\EndFor
\EndProcedure
\end{algorithmic}
\end{algorithm}
%% - - - - - - - - - - - - - - - - - - %%
\section{Performance Analysis}
The matrix data inside the class \texttt{Task} was tested for performance both as a vector of vectors and as a simple contiguous arena. The performance was exactly the same, so the simpler vector-of-vectors implementation was preferred.
In the file \texttt{main.cpp} a csv file is created from various tests on files from the \texttt{tests/} directory.
The time computed is for reading the file from disk, computing the stencil with different parameters and finally writing again to disk.
Reading and writing to disk are much faster than the computation except for the largest examples. In those cases the minimum time of reading and writing is subtracted.
For very small matrices the efficiency, the speedup and the scalability are very poor for both versions.
For larger examples instead a significant speedup is seen, but the implementation using native threads is slightly faster.
\begin{center}
\begin{tblr}{
colspec = {Q[l,m]|Q[r,m]|Q[r,m]},
}
Image & Time in \si{\microsecond} & Size in \si{\byte} \\
\hline % chktex 44
empty2x2 & 2218 & 12 \\ % chktex 29
increasing4x6 & 2054 & 32 \\ % chktex 29
increasing300x200 & 1301 & 60008 \\ % chktex 29
random400x2500 & 7101 & 1000008 \\ % chktex 29
equation & 786324 & 10000008 \\
equation2 & 2312927 & 30000008 \\
\end{tblr}
\end{center}
\begin{center}
\begin{tikzpicture}
\begin{groupplot}[group style={group size=1 by 2, vertical sep = 1.5cm}]
\plotfile{data/increasing300x200ff.dat}{fastflow}
\plotfile{data/increasing300x200std.dat}{stdthread}
\end{groupplot}
\node (title) at ($(group c1r1.center)+(0,4.5cm)$) {\color{red}{increasing300x200}};
\end{tikzpicture}
\end{center}
\begin{center}
\begin{tikzpicture}
\begin{groupplot}[group style={group size=1 by 2, vertical sep = 1.5cm}]
\plotfile{data/random400x2500ff.dat}{fastflow}
\plotfile{data/random400x2500std.dat}{stdthread}
\end{groupplot}
\node (title) at ($(group c1r1.center)+(0,4.5cm)$) {\color{red}{random400x2500}};
\end{tikzpicture}
\end{center}
\begin{center}
\begin{tikzpicture}
\begin{groupplot}[group style={group size=1 by 2, vertical sep = 1.5cm}]
\plotfile{data/equationff.dat}{fastflow}
\plotfile{data/equationstd.dat}{stdthread}
\end{groupplot}
\node (title) at ($(group c1r1.center)+(0,4.5cm)$) {\color{red}{equation}};
\end{tikzpicture}
\end{center}
\begin{center}
\begin{tikzpicture}
\begin{groupplot}[group style={group size=1 by 2, vertical sep = 1.5cm}]
\plotfile{data/equation2ff.dat}{fastflow}
\plotfile{data/equation2std.dat}{stdthread}
\end{groupplot}
\node (title) at ($(group c1r1.center)+(0,4.5cm)$) {\color{red}{equation2}};
\end{tikzpicture}
\end{center}
As the size of the input increases, both the speedup and the scalability follow linear trends up to a higher number of threads.
The scalability for both test files \texttt{equation} and \texttt{equation2} never goes below $0.37$, and it is slightly better for the implementation with native C++ threads.
\end{document}