Changes

GPU621/Intel oneMKL - Math Kernel Library

5,754 bytes added, 01:52, 1 December 2021

no edit summary

clock_t startTime = clock();

~~for (r = 0; r < LOOP_COUNT; r++) {~~ for (i = 0; i < m; i++) { for (j = 0; j < n; j++) { sum = 0.0; for (k = 0; k < p; k++) sum += A[p * i + k] * B[n * k + j]; C[n * i + j] = sum; }

}

clock_t endTime = clock();

<br />

! 6

|-

| ~~1500~~9000

| 15.7

| 7.7

Performance is best when the number of MKL threads equals the number of physical cores rather than the number of hardware threads — in these results, 3 rather than 6. <br />

==Source Code==

=Serial=

#include <stdio.h>

#include <stdlib.h>

#include <time.h>

/* Consider adjusting LOOP_COUNT based on the performance of your computer */

/* to make sure that total run time is at least 1 second */

#define LOOP_COUNT 220 //220 for more accurate statistics

/*
 * Serial baseline for the matrix-product benchmark.
 *
 * Multiplies A (m x p) by B (p x n) into C (m x n), all stored row-major in
 * flat arrays, using a plain triple loop, and prints the CPU time consumed by
 * that single pass in milliseconds.
 *
 * The LOOP_COUNT tuning hint is intentionally not printed here: this program
 * times exactly one pass, so LOOP_COUNT has no effect on its measurement.
 *
 * Returns 0 on success, 1 if any of the three matrices cannot be allocated.
 */
int main()
{
    double *A, *B, *C;
    int m, n, p, i, j, k;
    double sum;
    double s_elapsed;

    printf("\n This example demonstrates threading impact on computing real matrix product \n"
           " C=alpha*A*B+beta*C using Intel(R) MKL function dgemm, where A, B, and C are \n"
           " matrices and alpha and beta are double precision scalars \n\n");

    m = 2000, p = 200, n = 1000;

    printf(" Initializing data for matrix multiplication C=A*B for matrix \n"
           " A(%ix%i) and matrix B(%ix%i)\n\n", m, p, p, n);

    /* Standard malloc takes only a size and gives no 64-byte alignment
       guarantee, so the message no longer claims one. */
    printf(" Allocating memory for matrices \n\n");

    /* Widen to size_t before multiplying so the element count cannot
       overflow int for larger dimensions. */
    A = (double*)malloc((size_t)m * p * sizeof(double));
    B = (double*)malloc((size_t)p * n * sizeof(double));
    C = (double*)malloc((size_t)m * n * sizeof(double));

    if (A == NULL || B == NULL || C == NULL) {
        printf("\n ERROR: Can't allocate memory for matrices. Aborting... \n\n");
        /* free(NULL) is a no-op, so releasing all three is safe. */
        free(A);
        free(B);
        free(C);
        return 1;
    }

    printf(" Intializing matrix data \n\n");

    for (i = 0; i < (m * p); i++) {
        A[i] = (double)(i + 1);
    }

    for (i = 0; i < (p * n); i++) {
        B[i] = (double)(-i - 1);
    }

    for (i = 0; i < (m * n); i++) {
        C[i] = 0.0;
    }

    clock_t startTime = clock();

    /* C[i][j] = sum over k of A[i][k] * B[k][j] */
    for (i = 0; i < m; i++) {
        for (j = 0; j < n; j++) {
            sum = 0.0;
            for (k = 0; k < p; k++) {
                sum += A[p * i + k] * B[n * k + j];
            }
            C[n * i + j] = sum;
        }
    }

    clock_t endTime = clock();

    /* clock() returns ticks: convert to seconds in floating point.  Only one
       pass was timed, so the result is not divided by LOOP_COUNT. */
    s_elapsed = (double)(endTime - startTime) / CLOCKS_PER_SEC;

    printf(" == Matrix multiplication using triple nested loop completed == \n"
           " == at %.5f milliseconds == \n\n", (s_elapsed * 1000));

    printf(" Deallocating memory \n\n");

    free(A);
    free(B);
    free(C);

    printf(" Example completed. \n\n");

    return 0;
}

=MKL version=

#include <stdio.h>

#include <stdlib.h>

#include "mkl.h"

/* Consider adjusting LOOP_COUNT based on the performance of your computer */

/* to make sure that total run time is at least 1 second */

#define LOOP_COUNT 220 // 220 for more accurate statistics

/*
 * Benchmarks cblas_dgemm on a (2000 x 200) by (200 x 1000) row-major product.
 *
 * For every thread count from 1 up to twice the value reported by
 * mkl_get_max_threads(), the product is computed LOOP_COUNT times and the
 * mean time per call is printed in milliseconds.
 *
 * Returns 0 on success, 1 if any of the three matrices cannot be allocated.
 */
int main()
{
    const int m = 2000, p = 200, n = 1000;
    const double alpha = 1.0, beta = 0.0;
    double s_elapsed; /* mean seconds per call for the most recent thread count */

    printf("\n This example demonstrates threading impact on computing real matrix product \n"
           " C=alpha*A*B+beta*C using Intel(R) MKL function dgemm, where A, B, and C are \n"
           " matrices and alpha and beta are double precision scalars \n\n");

    printf(" Initializing data for matrix multiplication C=A*B for matrix \n"
           " A(%ix%i) and matrix B(%ix%i)\n\n", m, p, p, n);

    printf(" Allocating memory for matrices aligned on 64-byte boundary for better \n"
           " performance \n\n");

    double *A = (double*)mkl_malloc(m * p * sizeof(double), 64);
    double *B = (double*)mkl_malloc(p * n * sizeof(double), 64);
    double *C = (double*)mkl_malloc(m * n * sizeof(double), 64);

    if (A == NULL || B == NULL || C == NULL) {
        printf("\n ERROR: Can't allocate memory for matrices. Aborting... \n\n");
        mkl_free(A);
        mkl_free(B);
        mkl_free(C);
        return 1;
    }

    printf(" Intializing matrix data \n\n");

    const int a_len = m * p;
    const int b_len = p * n;
    const int c_len = m * n;

    /* A holds 1, 2, 3, ...; B holds -1, -2, -3, ...; C starts at zero. */
    for (int idx = 0; idx < a_len; idx++) {
        A[idx] = (double)(idx + 1);
    }
    for (int idx = 0; idx < b_len; idx++) {
        B[idx] = (double)(-idx - 1);
    }
    for (int idx = 0; idx < c_len; idx++) {
        C[idx] = 0.0;
    }

    const int max_threads = mkl_get_max_threads();
    printf(" Finding max number %d of threads Intel(R) MKL can use for parallel runs \n\n", max_threads);

    const int thread_limit = max_threads * 2;
    printf(" Running Intel(R) MKL from 1 to %i threads \n\n", thread_limit);

    for (int threads = 1; threads <= thread_limit; threads++) {
        /* Reset the output matrix before each thread-count run. */
        for (int idx = 0; idx < c_len; idx++) {
            C[idx] = 0.0;
        }

        mkl_set_num_threads(threads);

        /* One untimed call before the clock starts -- presumably to keep
           one-time setup cost out of the measurement. */
        cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                    m, n, p, alpha, A, p, B, n, beta, C, n);

        const double s_initial = dsecnd();
        for (int rep = 0; rep < LOOP_COUNT; rep++) {
            cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                        m, n, p, alpha, A, p, B, n, beta, C, n);
        }
        s_elapsed = (dsecnd() - s_initial) / LOOP_COUNT;

        printf(" == Matrix multiplication using Intel(R) MKL dgemm completed ==\n"
               " == at %.5f milliseconds using %d thread(s) ==\n\n", (s_elapsed * 1000), threads);
    }

    printf(" Deallocating memory \n\n");
    mkl_free(A);
    mkl_free(B);
    mkl_free(C);

    /* If the last run's per-call time implies the whole timed loop took under
       about 0.9 s, suggest a larger LOOP_COUNT. */
    if (s_elapsed < 0.9 / LOOP_COUNT) {
        const double scale = 1.0 / LOOP_COUNT / s_elapsed;
        const int suggested = (int)(scale * LOOP_COUNT) + 1;
        printf(" It is highly recommended to define LOOP_COUNT for this example on your \n"
               " computer as %i to have total execution time about 1 second for reliability \n"
               " of measurements\n\n", suggested);
    }

    printf(" Example completed. \n\n");
    return 0;
}

==References==

~~references~~https://www.intel.com/content/www/us/en/developer/articles/technical/a-simple-example-to-measure-the-performance-of-an-intel-mkl-function.html

Menglinwu

37

edits

Changes

GPU621/Intel oneMKL - Math Kernel Library

Navigation menu

Personal tools

Namespaces

Variants

Views

More

Search

Navigation

get involved with CDOT

courses

course projects

links

Tools