Compare commits

...

10 Commits

SHA1 Message Date
d15c57b5a1 Update .gitattributes 2023-10-29 02:18:12 +01:00
cb2a139b4f added reasons for current implementation 2023-08-29 17:26:23 +02:00
5d8348b42a fixed memory leak on () operator for stencil class 2023-08-29 17:25:54 +02:00
00e921f219 fixed some latex warnings 2023-08-29 15:34:45 +02:00
0476329279 spelling mistakes 2023-08-26 21:14:54 +02:00
a21e5d46da fixing 2023-08-26 20:21:35 +02:00
278f6d6b7a fixing 2023-08-26 20:20:01 +02:00
24566d7002 ignoring eps files 2023-08-26 20:15:02 +02:00
8fcbdb788e fixes 2023-08-26 20:07:00 +02:00
20125fc29f fixes 2023-08-26 20:01:16 +02:00
6 changed files with 73 additions and 52 deletions

.gitattributes vendored Normal file (+1 line)
View File

@@ -0,0 +1 @@
*.eps linguist-generated
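The linguist-generated attribute marks matching files as generated, so the forge can collapse their diffs by default and exclude them from language statistics; this lines up with the "ignoring eps files" commit above.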

View File

@@ -208,7 +208,7 @@ int main(int argc, char *argv[]) {
vector<pair<int, int>> neig = {make_pair(-1, 1), make_pair(-1, 0),
make_pair(-1, -1), make_pair(0, 1),
make_pair(0, -1), make_pair(1, 1),
make_pair(1, 0), make_pair(1, -1)};
make_pair(1, 0), make_pair(1, -1)};
ofstream csvfile;
csvfile.open("performance.csv");

View File

Binary file not shown.

View File

@@ -15,7 +15,7 @@
\geometry{a4paper}
\usepackage[utf8]{inputenc} %% use UTF-8, maybe not needed since 2018
\usepackage[italian,main=english]{babel} %% language
\usepackage[english]{babel} %% language
\pagestyle{headings}
@@ -32,7 +32,7 @@
style=numeric,
sorting=ynt
]{biblatex} %% for citations
\addbibresource{document.bib}
% \addbibresource{document.bib}
\usepackage{import} %% specify path for import
@@ -175,8 +175,7 @@
\usetikzlibrary{calc}
\usepgfplotslibrary{groupplots}
\usepackage[%
binary-units=true,
\usepackage[
prefixes-as-symbols=false,
]{siunitx}
@@ -205,9 +204,9 @@ prefixes-as-symbols=false,
\pgfplotstableread[col sep=comma]{#1}{\table}
\pgfplotstablegetcolsof{\table}
\pgfmathtruncatemacro\numberofcols{\pgfplotsretval-1}
\pgfplotsinvokeforeach{1,...,\numberofcols}{
\pgfplotsinvokeforeach{1,...,\numberofcols}{ % chktex 11
\pgfplotstablegetcolumnnamebyindex{##1}\of{\table}\to{\colname}
\addplot table [y index=##1] {\table};
\addplot table [y index=##1] {\table}; % chktex 1
\addlegendentryexpanded{\colname}
}
\addplot[mark=none, black, samples=2, domain=0:64] {1};
@@ -275,6 +274,9 @@ prefixes-as-symbols=false,
%% - - - - - - - - - - - - - - - - - %%
\thispagestyle{empty}
\addtocounter{page}{-1}
\tableofcontents
\newpage
@@ -307,10 +309,17 @@ The class \texttt{Stencil} holds both the parallel implementation using the Fast
The class \texttt{Reader} reads a binary file composed of 4 bytes representing the number of rows, 4 bytes representing the number of columns, and then the raw matrix data. Each element is a \texttt{char} in all the test cases. The result is stored in the class \texttt{Task}, which is passed to the next node. If instead the operator \texttt{()} is called, only the data is returned as a pointer.
The \texttt{Task} class can support matrixes of different element type rather than \texttt{char}.
The \texttt{Task} class can support matrices with element types other than \texttt{char}.
The \texttt{Writer} instead writes the task to disk in the same folder, overwriting existing files if present.
The \texttt{Stencil} class divides the matrix into roughly equal parts and distributes them to other workers.
Since the amount of work for simple stencil functions is roughly equal between blocks of columns, the matrix is split into equal blocks and each block is processed by a different worker.
The result is stored in a copy of the original matrix and the pointers are swapped at the end of each iteration.
Since a neighbourhood of columns is needed for the next iteration, the simplest solution of waiting for all workers has been implemented.
A contiguous block of columns reduces the probability of false sharing.
The loops of the worker threads cannot be vectorized by the compiler, since the stencil function may call library functions or use conditional statements.
%% - - - - - - - - - - - - - - - - - - %%
\subsection{Native C++ Threads}
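The binary layout described above for \texttt{Reader} (4-byte row count, 4-byte column count, then raw \texttt{char} elements) can be sketched as below; this is illustrative only, the repository's actual \texttt{Reader} class is not shown in this diff, and row-major order plus native endianness are assumptions.

// Illustrative sketch of the binary format described above:
// 4 bytes rows, 4 bytes cols, then rows*cols raw char elements.
// Row-major order and native endianness are assumptions.
#include <cstdint>
#include <fstream>
#include <string>
#include <vector>

std::vector<std::vector<char>> read_matrix(const std::string &path) {
  std::ifstream in(path, std::ios::binary);
  int32_t rows = 0, cols = 0;
  in.read(reinterpret_cast<char *>(&rows), sizeof(rows));
  in.read(reinterpret_cast<char *>(&cols), sizeof(cols));
  std::vector<std::vector<char>> m(rows, std::vector<char>(cols));
  for (auto &row : m)
    in.read(row.data(), cols); // one row of raw matrix data
  return m;
}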
@@ -320,34 +329,35 @@ The structure of the implementation with native C++ threads is as follows:
\begin{algorithmic}[1]
\Procedure{stdthread}{$Input,Output$}
\For{$result \in Input$}
\State $arena = result$
\State{$arena = result$}
\While{$iter>0$}
\For{$thread \in ThreadPool$}
\State send a new LAMBDA with appropriate bounds to the threadpool
\EndFor
\State swap $arena$ with $result$
\State $iter = iter - 1$
\EndWhile
\State wait for the threadpool to finish
\State append $result$ to $Output$
\EndFor
\EndProcedure
\State{send a new LAMBDA with appropriate bounds to the threadpool}
\EndFor{}
\State{swap $arena$ with $result$}
\State{$iter = iter - 1$}
\EndWhile{}
\State{wait for the threadpool to finish}
\State{append $result$ to $Output$}
\EndFor{}
\EndProcedure{}
\end{algorithmic}
\begin{algorithmic}[1]
\Procedure{lambda}{$l, \Delta$}\Comment \textit{$l$ is the thread number, $\Delta$ is the ammount of rows to process}
\Procedure{lambda}{$l, \Delta$}\Comment{\textit{$l$ is the index of block of rows, $\Delta$ is the number of rows}}
\For{$x \in \{l \cdot \Delta, \ldots, (l+1) \cdot \Delta - 1\}$}
\For{$y \in \{0, \ldots, Columns\}$}
\If $(x, y)$ not in the border
\State calculate the neighborhood of $(x, y)$
\State $arena[x][y] = Stencil(neighborhood)$
\EndIf
\EndFor
\EndFor
\EndProcedure
\If{$(x, y)$ not in the border}
\State{calculate the neighborhood of $(x, y)$}
\State{$arena[x][y] = Stencil(neighborhood)$}
\EndIf{}
\EndFor{}
\EndFor{}
\EndProcedure{}
\end{algorithmic}
\end{algorithm}
The threadpool is implemented in the \texttt{threadPool.hpp} and \texttt{threadPool.cpp} files.
Since the work per element is equivalent, the $\Delta$ used in the lambda function is simply the total number of rows divided by the number of workers, so that each worker has exactly one job and all jobs take roughly the same time.
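A minimal sketch of this block partitioning with plain \texttt{std::thread} (the repository's thread pool interface is not shown in this diff, so the submission mechanism here is an assumption):

// Sketch: split the rows into nworkers contiguous blocks of delta rows,
// as in the pseudocode; the real code submits lambdas to a thread pool.
#include <functional>
#include <thread>
#include <vector>

void run_iteration(int rows, int nworkers,
                   const std::function<void(int, int)> &process_rows) {
  int delta = rows / nworkers; // the Delta from the text
  std::vector<std::thread> threads;
  for (int l = 0; l < nworkers; ++l) {
    int begin = l * delta;
    // the last worker also takes the remainder rows
    int end = (l == nworkers - 1) ? rows : begin + delta;
    threads.emplace_back(process_rows, begin, end);
  }
  for (auto &t : threads)
    t.join(); // barrier: all blocks must finish before the swap
}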
@@ -361,30 +371,31 @@ Since it is required for all jobs to finish, a condition variable is used to wak
The structure of the implementation using FastFlow is similar to the one with native threads.
Since the \texttt{Stencil} class is a subclass of \texttt{ff\_Map}, the method used for the execution is \texttt{parallel\_for}.
A custom emitter and collector would not have been faster and so the simpler approach of inheriting the methods from \texttt{ff\_Map} was chosen.
A custom emitter and collector would not have been significantly faster and so the simpler approach of inheriting the methods from \texttt{ff\_Map} was chosen.
A custom emitter would have had to split the range into as many blocks as there are workers, and the custom collector would have had to act as a barrier for all of them.
\begin{algorithm}[H]
\begin{algorithmic}[1]
\Procedure{fastflow}{$Task$}
\State $arena = Task$
\State{$arena = Task$}
\While{$iter>0$}
\State \texttt{parallel\_for} with LAMBDA as the function to execute
\State swap $arena$ with $Task$
\State $iter = iter - 1$
\EndWhile
\State return $Task$
\EndProcedure
\State{\texttt{parallel\_for} with LAMBDA as the function to execute}
\State{swap $arena$ with $Task$}
\State{$iter = iter - 1$}
\EndWhile{}
\State{return $Task$}
\EndProcedure{}
\end{algorithmic}
\begin{algorithmic}[1]
\Procedure{lambda}{$x$}
\For{$y \in \{0, \ldots, Columns\}$}
\If $(x, y)$ not in the border
\State calculate the neighborhood of $(x, y)$
\State $arena[x][y] = Stencil(neighborhood)$
\EndIf
\EndFor
\EndProcedure
\If{$(x, y)$ not in the border}
\State{calculate the neighborhood of $(x, y)$}
\State{$arena[x][y] = Stencil(neighborhood)$}
\EndIf{}
\EndFor{}
\EndProcedure{}
\end{algorithmic}
\end{algorithm}
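The \texttt{parallel\_for} pattern above can be sketched with FastFlow's standalone \texttt{ParallelFor}; the repository instead inherits the method from \texttt{ff\_Map}, and the stencil computation below is a placeholder:

#include <ff/parallel_for.hpp>
#include <utility>
#include <vector>

// One iteration: each parallel index x is a row; the lambda fills the
// arena from the previous state, skipping the border as in the pseudocode.
void iterate(std::vector<std::vector<char>> &grid,
             std::vector<std::vector<char>> &arena, int nworkers) {
  ff::ParallelFor pf(nworkers);
  const long rows = grid.size(), cols = grid[0].size();
  pf.parallel_for(1, rows - 1, [&](const long x) {
    for (long y = 1; y < cols - 1; ++y)
      arena[x][y] = grid[x][y]; // placeholder for Stencil(neighborhood)
  });
  std::swap(grid, arena); // buffer swap between iterations
}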
@@ -404,11 +415,14 @@ Since
and the value of $T_{\texttt{Reader}} + T_{\texttt{Writer}}$ is known on average, the speedup, scalability, and efficiency are calculated as follows:
\begingroup
\addtolength{\jot}{1em}
\begin{align*}
\text{Speedup}(n) &= \frac{T_{\text{seq}}}{T_{\text{par}}(n) - (T_{\texttt{Reader}} + T_{\texttt{Writer}})} \\
\text{Speedup}(n) &= \frac{T_{\text{seq}} - (T_{\texttt{Reader}} + T_{\texttt{Writer}})}{T_{\text{par}}(n) - (T_{\texttt{Reader}} + T_{\texttt{Writer}})} \\
\text{Scalability}(n) &= \frac{T_{\text{par}}(1) - (T_{\texttt{Reader}} + T_{\texttt{Writer}})}{T_{\text{par}}(n) - (T_{\texttt{Reader}} + T_{\texttt{Writer}})} \\
\text{Efficiency}(n) &= \frac{\text{Speedup}(n)}{n} \\
\text{Efficiency}(n) &= \frac{\text{Speedup}(n)}{n}
\end{align*}
\endgroup
For very small matrices the efficiency, the speedup, and the scalability are very poor for both versions.
For larger inputs, instead, a significant speedup is seen, with the implementation using native threads being slightly faster.
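As a purely illustrative computation (these numbers are not measurements from the report): if $T_{\text{seq}} - (T_{\texttt{Reader}} + T_{\texttt{Writer}}) = 80$ s and $T_{\text{par}}(8) - (T_{\texttt{Reader}} + T_{\texttt{Writer}}) = 12$ s, then

% illustrative numbers only, not measured results
\begin{align*}
\text{Speedup}(8) &= \frac{80}{12} \approx 6.67 \\
\text{Efficiency}(8) &= \frac{6.67}{8} \approx 0.83
\end{align*}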
@@ -454,7 +468,7 @@ the fastflow has a peak of speedup and scalability when using 4 workers in the s
\end{center}
The file \texttt{random400x2500} % chktex 29
performs best with 16 workers in the Fastflow implementation and slightly better at 64 workers compared to 32 workers in terms of speedup and scalability but has a significand drop in efficiency from $0.361$ to $0.184$. The relationship between number of workers and speedup is close to linear up to 8 workers.
performs best with 16 workers in the Fastflow implementation and in the native thread implementation performs slightly better at 64 workers compared to 32 workers in terms of speedup and scalability, but has a significant drop in efficiency from $0.361$ to $0.184$. The relationship between number of workers and speedup is close to linear up to 8 workers.
\begin{center}
\begin{tikzpicture}
@@ -480,11 +494,11 @@ The file \texttt{equation} more closely follows a linear relationship between sp
\end{tikzpicture}
\end{center}
As the size of the input increases the speedup and the scalability both follow linear trends up with a higher ammount of threds.
As the size of the input increases, the speedup and the scalability both follow linear trends up to a higher number of threads.
The scalability for both test files \texttt{equation} and \texttt{equation2} never goes below $0.37$, and is slightly better for the implementation with native C++ threads.
The difference in the three quantities between the test with file \texttt{equation} and the test with file \texttt{euqation1} is much smaller for the Fastflow version. In the native thread version instead there is a small improvement expecially with a higher number of workers.
The difference in the three quantities between the test with file \texttt{equation} and the test with file \texttt{equation1} is much smaller for the Fastflow version. In the native thread version, instead, there is a small improvement, especially with a higher number of workers.
\end{document}

View File

@@ -35,7 +35,7 @@ template <typename T> class Stencil : public ff::ff_Map<Task<T>> {
std::vector<std::promise<Task<T> *> *> *OutputVector);
private:
Task<T> *svc_helper(Task<T> *t);
Task<T> *svc_helper(Task<T> *t, int iterations);
void constructor_helper(std::vector<std::pair<int, int>> neighborhood);
std::function<T(std::vector<T>)> Convolution;
@@ -100,7 +100,7 @@ void Stencil<T>::constructor_helper(
// svc function for fastflow library
template <typename T> Task<T> *Stencil<T>::svc(Task<T> *task) {
task = svc_helper(task);
task = svc_helper(task, this->Iterations);
ff::ff_node::ff_send_out(task);
return this->GO_ON;
}
@@ -112,9 +112,15 @@ Stencil<T>::operator()(std::vector<std::vector<T>> *matrix, int iterations) {
if ((*matrix).size() == 0 || (*matrix)[0].size() == 0) {
return matrix;
}
Task<T> *task = new Task<T>(matrix, (*matrix).size(), (*matrix)[0].size());
task = svc_helper(task);
return task->VectorData;
std::vector<std::vector<T>> *arena = new std::vector<std::vector<T>>();
*arena = *matrix; // work on a copy of the caller's matrix
Task<T> *task = new Task<T>(arena, (*arena).size(), (*arena)[0].size());
task = svc_helper(task, iterations);
*matrix = *task->VectorData; // copy the result back
delete task; // free the temporary task: this was the leak
return matrix;
}
// function for std thread
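The ownership pattern introduced by this fix, in a self-contained hedged sketch (names are illustrative, not the repository's): work on a heap copy, copy the result back, and free everything the function allocated, which is what closes the leak from commit 5d8348b42a.

#include <vector>

// Sketch of the copy-in/copy-out pattern used by the fixed operator():
std::vector<std::vector<char>> *
process_in_place(std::vector<std::vector<char>> *matrix) {
  auto *arena = new std::vector<std::vector<char>>(*matrix); // copy in
  // ... run the stencil iterations on *arena ...
  *matrix = *arena; // copy the result back to the caller's matrix
  delete arena;     // free the temporary: previously this was leaked
  return matrix;    // the caller keeps ownership of its own matrix
}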
@@ -243,8 +249,8 @@ void Stencil<T>::sequential(
}
}
template <typename T> Task<T> *Stencil<T>::svc_helper(Task<T> *task) {
int niter = Iterations;
template <typename T> Task<T> *Stencil<T>::svc_helper(Task<T> *task, int iterations) {
int niter = iterations;
std::vector<std::vector<T>> *arena = new std::vector<std::vector<T>>(0);
*arena = *(task->VectorData);