# Team MMD

## Intel Math Kernel Library (MKL) Summary

The MKL is a library of optimized math routines for science, engineering and financial applications. The library consists of many standard math library along with Intel’s math libraries implementation. Currently it is compatible with Windows, Linux and OS X operating systems. Currently Intel offers native support for C/C++, and Fortran. It is possible to use the MKL with Python but that will depend on the library that you will be required to use.

## Setting up MKL

Next you will need to install and allow for the intergration of the tools with your version of visual studio.
Now when you open a new project there will still be two more steps you will need to add to use MKL. The first is adding the mkl.h header file and the second is going under the project properties and select the use MKL option to yes.

## MKL testing

For our project we wanted to test the performance of MKL verse other Intel parallel solutions such as Cilk Plus and Thread building blocks (TBB).
We also tested the serial implementation for completions sake Serial version :

```   // serial_matmul_naive returns the product a * b using basic serial logic
void serial_matmul_naive(const double* a, const double* b, double* c, int numRows, int numCols) {
for (int i = 0; i < numRows; i++) {
for (int j = 0; j < numCols; j++) {
double sum = 0.0;
for (int k = 0; k < numRows; k++)
sum += a[i * numRows + k] * b[k * numRows + j];
c[i * numRows + j] = sum;
}
}
}
```

Serial version with reversed inner loops :

```   // serial_matmul_reversed returns the product a * b using serial logic loops reversed
void serial_matmul_naive(const double* a, const double* b, double* c, int numRows, int numCols) {
for (int i = 0; i < numRows; i++) {
for (int j = 0; j < numCols; j++) {
double sum = 0.0;
for (int k = 0; k < numRows; k++)
sum += a[i * numRows + k] * b[k * numRows + j];
c[i * numRows + j] = sum;
}
}
}
```

TBB version :

```   // Thread builing blocks matrix multipication
class Body {
double* a;
double* b;
double* c;
int nRows;
int nCols;
public:
Body(double* in1, double* in2, double* out, int nR, int nC) {
a = in1;
b = in2;
c = out;
nRows = nR;
nCols = nC;
}

void operator()(const tbb::blocked_range<int>& r) const {
for (int i = r.begin(); i < r.end() && i < nRows; i++) {
for (int j = 0; j < nCols; j++) {
c[i * nRows + j] = 0.0;
for (int k = 0; k < nRows; k++)
c[i * nRows + j] += a[i * nRows + k] * b[k * nRows + j];
}
}
}
};
```

Cilk plus naive version :

```   // cilkplus_matmul_naive returns the product a * b using cilk_for
void cilkplus_matmul_naive(const double* a, const double* b, double* c, int numRows, int numCols) {
cilk_for(int i = 0; i < numRows; i++) {
for (int j = 0; j < numCols; j++) {
double sum = 0.0;
for (int k = 0; k < numRows; k++)
sum += a[i * numRows + k] * b[k * numRows + j];
c[i * numRows + j] = sum;
}
}
}
```

Cilk plus with vectorization :

```   // cilkplus_matmul_vectorized returns the product a * b using cilk_for and pragma simd (vectorization)
void cilkplus_matmul_vectorized(const double* a, const double* b, double* c, int numRows, int numCols) {
cilk_for(int i = 0; i < numRows; i++) {
for (int j = 0; j < numCols; j++) {
double sum = 0.0;
#pragma simd
for (int k = 0; k < numRows; k++)
sum += a[i * numRows + k] * b[k * numRows + j];
c[i * numRows + j] = sum;
}
}
}
```

MKL version (compressed) :

```   double *A, *B, *C;
double alpha = 1.0, beta = 0.0;
...
A = (double *)mkl_malloc(numRows * numCols * sizeof(double), 64);
B = (double *)mkl_malloc(numRows * numCols * sizeof(double), 64);
C = (double *)mkl_malloc(numRows * numCols * sizeof(double), 64);
...
if (A == NULL || B == NULL || C == NULL) {
std::cout << "\n ERROR: Can't allocate memory for matrices. Aborting... \n\n";
mkl_free(A);
mkl_free(B);
mkl_free(C);
return 3;
}
...
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
numRows, numCols, 1, alpha, A, numRows, B, numCols, beta, C, numRows);
...
//// deallocate mkl
mkl_free(A);
mkl_free(B);
mkl_free(C);
```

## Full code & Results

Here is the full program source code that was used during our tests.

```//DPS921 - Project

#include <iostream>
#include <cstdlib>
#include <chrono>
using namespace std::chrono;

#include <cilk/cilk.h>
#include <tbb/tbb.h>
#include "mkl.h"

// report system time
void reportTime(const char* msg, steady_clock::duration span) {
auto ms = duration_cast<milliseconds>(span);
std::cout << msg << " - took - " <<
ms.count() << " milliseconds" << std::endl;
}

// Print matricies
void printMatricies(const double* a, const double* b, double* mat, int numRows, int numCols) {
if (numRows <= 5 && numCols <= 5)
{
for (int i = 0; i < numRows; ++i) {
for (int j = 0; j < numCols; ++j) {
std::cout << a[i * numRows + j] << " ";
}
std::cout << std::endl;
}
std::cout << std::endl;
for (int i = 0; i < numRows; ++i) {
for (int j = 0; j < numCols; ++j) {
std::cout << b[i * numRows + j] << " ";
}
std::cout << std::endl;
}
std::cout << std::endl;
for (int i = 0; i < numRows; ++i) {
for (int j = 0; j < numCols; ++j) {
std::cout << mat[i * numRows + j] << " ";
}
std::cout << std::endl;
}
std::cout << std::endl;
}
}

// serial_matmul_naive returns the product a * b using basic serial logic
void serial_matmul_naive(const double* a, const double* b, double* c, int numRows, int numCols) {
for (int i = 0; i < numRows; i++) {
for (int j = 0; j < numCols; j++) {
double sum = 0.0;
for (int k = 0; k < numRows; k++)
sum += a[i * numRows + k] * b[k * numRows + j];
c[i * numRows + j] = sum;
}
}
}

// serial_matmul_reversed returns the product a * b using serial logic loops reversed
// TODO: actually test this one
void serial_matmul_reversed(const double* a, const double* b, double* c, int numRows, int numCols) {
for (int i = 0; i < numRows; i++) {
for (int j = 0; j < numCols; j++) {
c[i * numRows + j] = 0.0;
for (int k = 0; k < numRows; k++)
c[i * numRows + j] += a[i * numRows + k] * b[k * numRows + j];
}
}
}

// Thread builing blocks matrix multipication
class Body {
double* a;
double* b;
double* c;
int nRows;
int nCols;
public:
Body(double* in1, double* in2, double* out, int nR, int nC) {
a = in1;
b = in2;
c = out;
nRows = nR;
nCols = nC;
}

void operator()(const tbb::blocked_range<int>& r) const {
for (int i = r.begin(); i < r.end() && i < nRows; i++) {
for (int j = 0; j < nCols; j++) {
c[i * nRows + j] = 0.0;
for (int k = 0; k < nRows; k++)
c[i * nRows + j] += a[i * nRows + k] * b[k * nRows + j];
}
}
}
};

// cilkplus_matmul_naive returns the product a * b using cilk_for
void cilkplus_matmul_naive(const double* a, const double* b, double* c, int numRows, int numCols) {
cilk_for(int i = 0; i < numRows; i++) {
for (int j = 0; j < numCols; j++) {
double sum = 0.0;
for (int k = 0; k < numRows; k++)
sum += a[i * numRows + k] * b[k * numRows + j];
c[i * numRows + j] = sum;
}
}
}

// cilkplus_matmul_vectorized returns the product a * b using cilk_for and pragma simd (vectorization)
void cilkplus_matmul_vectorized(const double* a, const double* b, double* c, int numRows, int numCols) {
cilk_for(int i = 0; i < numRows; i++) {
for (int j = 0; j < numCols; j++) {
double sum = 0.0;
#pragma simd
for (int k = 0; k < numRows; k++)
sum += a[i * numRows + k] * b[k * numRows + j];
c[i * numRows + j] = sum;
}
}
}

int main(int argc, char** argv) {
if (argc != 4) {
std::cerr << "*** Incorrect number of arguments ***\n";
std::cerr << "Usage: " << argv[0] << " Method #rows #columns\n";
return 1;
}

int numRows = std::atoi(argv[2]);
int numCols = std::atoi(argv[3]);

double *a, *b, *mat;
try
{
a = new double[numRows * numCols];
b = new double[numRows * numCols];
mat = new double[numRows * numCols];
}
catch (std::exception& e)
{
std::cerr << "*** Failed to Allocate Memory for "
<< numRows << " by " << numCols << "matrices" << std::endl;
return 2;
}

// initialize a and b
for (int i = 0; i < numRows * numCols; ++i)
{
a[i] = 0.0;
b[i] = 0.0;
}
for (int i = 0; i < numRows; ++i)
{
a[i * numRows + i] = 1.0;
b[i * numRows + i] = 1.0;
}

// Serial
serial_matmul_naive(a, b, mat, numRows, numCols);
printMatricies(a, b, mat, numRows, numCols);
reportTime("\n - Serial matrix naive multiplication ", te - ts);
std::cout << std::endl;

serial_matmul_reversed(a, b, mat, numRows, numCols);
printMatricies(a, b, mat, numRows, numCols);
reportTime("\n - Serial matrix reversed multiplication ", te - ts);
std::cout << std::endl;

// TBB
Body const body(a, b, mat, numRows, numCols);
size_t grainsize = 100000;
tbb::blocked_range<int> range(0, numRows, grainsize);
tbb::parallel_for(range, body);
printMatricies(a, b, mat, numRows, numCols);
reportTime("\n - TBB matrix multiplication ", te - ts);
std::cout << std::endl;

// Cilkplus
cilkplus_matmul_naive(a, b, mat, numRows, numCols);
printMatricies(a, b, mat, numRows, numCols);
reportTime("\n - Cilkplus matrix multiplication naive ", te - ts);
std::cout << std::endl;

cilkplus_matmul_vectorized(a, b, mat, numRows, numCols);
printMatricies(a, b, mat, numRows, numCols);
reportTime("\n - Cilkplus matrix multiplication vectorization ", te - ts);
std::cout << std::endl;

// Math kernel Library
// math kernel //
double *A, *B, *C;
double alpha = 1.0, beta = 0.0;

A = (double *)mkl_malloc(numRows * numCols * sizeof(double), 64);
B = (double *)mkl_malloc(numRows * numCols * sizeof(double), 64);
C = (double *)mkl_malloc(numRows * numCols * sizeof(double), 64);

// initialize a and b
for (int i = 0; i < numRows * numCols; ++i)
{
A[i] = 0.0;
B[i] = 0.0;
}
for (int i = 0; i < numRows; ++i)
{
A[i * numRows + i] = 1.0;
B[i * numRows + i] = 1.0;
}

if (A == NULL || B == NULL || C == NULL) {
std::cout << "\n ERROR: Can't allocate memory for matrices. Aborting... \n\n";
mkl_free(A);
mkl_free(B);
mkl_free(C);
return 3;
}
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
numRows, numCols, 1, alpha, A, numRows, B, numCols, beta, C, numRows);
printMatricies(A, B, C, numRows, numCols);
reportTime("\n - MKL matrix multiplication ", te - ts);
std::cout << std::endl;

//// deallocate mkl
mkl_free(A);
mkl_free(B);
mkl_free(C);
// math kernel //

// deallocate
delete[] a;
delete[] b;
delete[] mat;

std::cout << std::endl;
system("pause");
}

```

And our results