Difference between revisions of "Team Lion F2017"

From CDOT Wiki
Jump to: navigation, search
(Basic hotspot analysis)
(Concurrency)
Line 47: Line 47:
 
====matmul_0 (Serial)====
 
====matmul_0 (Serial)====
  
 +
<pre>
 +
double matmul_0(const double* a, const double* b, double* c, int n) {
 +
for (int i = 0; i < n; i++) {
 +
for (int j = 0; j < n; j++) {
 +
double sum = 0.0;
 +
for (int k = 0; k < n; k++)
 +
sum += a[i * n + k] * b[k * n + j];
 +
c[i * n + j] = sum;
 +
}
 +
}
 +
double diag = 0.0;
 +
for (int i = 0; i < n; i++)
 +
diag += c[i * n + i];
 +
return diag;
 +
}
 +
</pre>
  
 
[[File:Conc-01.png]]
 
[[File:Conc-01.png]]
Line 53: Line 69:
 
====matmul_1 (Serial with j-k loops reversed)====
 
====matmul_1 (Serial with j-k loops reversed)====
  
 +
<pre>
 +
double matmul_1(const double* a, const double* b, double* c, int n) {
 +
 +
for (int i = 0; i < n; i++) {
 +
for (int k = 0; k < n; k++) {
 +
double sum = 0.0;
 +
for (int j = 0; j < n; j++)
 +
sum += a[i * n + k] * b[k * n + j];
 +
c[i * n + k] = sum;
 +
}
 +
}
 +
double diag = 0.0;
 +
for (int i = 0; i < n; i++)
 +
diag += c[i * n + i];
 +
return diag;
 +
}
 +
</pre>
  
 
[[File:Conc-11.png]]
 
[[File:Conc-11.png]]
Line 59: Line 92:
 
====matmul_2 (Cilk Plus with cilk_for)====
 
====matmul_2 (Cilk Plus with cilk_for)====
  
 +
<pre>
 +
double matmul_2(const double* a, const double* b, double* c, int n) {
 +
 +
cilk_for (int i = 0; i < n; i++) {
 +
cilk_for (int j = 0; j < n; j++) {
 +
double sum = 0.0;
 +
for(int k = 0; k < n; k++) {
 +
sum += a[i * n + k] * b[k * n + j];
 +
}
 +
c[i * n + j] = sum;
 +
}
 +
}
 +
 +
double diag = 0.0;
 +
for (int i = 0; i < n; i++)
 +
diag += c[i * n + i];
 +
return diag;
 +
}
 +
</pre>
  
 
[[File:Conc-21.png]]
 
[[File:Conc-21.png]]
Line 65: Line 117:
 
====matmul_3 (+array notation, reducer)====
 
====matmul_3 (+array notation, reducer)====
  
 +
<pre>
 +
double matmul_3(const double* a, const double* b, double* c, int n) {
 +
 +
cilk_for(int i = 0; i < n; i++) {
 +
cilk_for(int j = 0; j < n; j++) {
 +
double sum = 0.0;
 +
for (int k = 0; k < n; k++) {
 +
sum += a[i * n + k] * b[k * n + j];
 +
}
 +
c[i * n + j] = sum;
 +
}
 +
}
 +
 +
cilk::reducer_opadd <double> diag(0.0);
 +
cilk_for(int i = 0; i < n; i++) {
 +
diag += c[i * n + i];
 +
}
 +
return diag.get_value();
 +
}
 +
</pre>
  
 
[[File:Conc-31.png]]
 
[[File:Conc-31.png]]
Line 71: Line 143:
 
====matmul_4 (+vectorization)====
 
====matmul_4 (+vectorization)====
  
 +
<pre>
 +
double matmul_4(const double* a, const double* b, double* c, int n) {
 +
 +
cilk_for(int i = 0; i < n; i++) {
 +
cilk_for(int j = 0; j < n; j++) {
 +
double sum = 0.0;
 +
#pragma simd
 +
for (int k = 0; k < n; k++) {
 +
sum += a[i * n + k] * b[k * n + j];
 +
}
 +
c[i * n + j] = sum;
 +
}
 +
}
 +
 +
cilk::reducer_opadd <double> diag(0.0);
 +
cilk_for(int i = 0; i < n; i++) {
 +
diag += c[i * n + i];
 +
}
 +
return diag.get_value();
 +
}
 +
</pre>
  
 
[[File:Conc-41.png]]
 
[[File:Conc-41.png]]
 
[[File:Conc-42.png]]
 
[[File:Conc-42.png]]
 +
 +
====Final test with all functions====
 +
 +
 +
[[File:Conc-51.png]]
 +
[[File:Conc-52.png]]
 +
 +
[[File:Conc-53.png]]
  
 
====Final test with all functions====
 
====Final test with all functions====

Revision as of 12:15, 5 January 2018

Group Members

Intel Parallel Studio VTune Amplifier

  1. Jagmeet Bhamber
  2. Shivam Gupta
  3. Yong Kuk Kim

What is VTune Amplifier?

  • A tool created by Intel to provide performance analysis on software.
  • Offers both a GUI and command-line version for both Windows and Linux
  • GUI only for OSX
  • Basic features available on both Intel and AMD processors, but advanced features only for Intel

How to use it?

  • Available as a standalone unit or part of the following packages:
    • Intel Parallel Studio XE Cluster Edition and Professional Edition
    • Intel Media Server Studio Professional Edition
    • Intel System Studio

Can be run on a local machine


Hotspots

Basic hotspot analysis

We used our Workshop 6 as an example to demonstrate this particular aspect of Intel VTune Amplifier.

Summary.PNG

Advanced hotspot analysis

Parallelism

Concurrency

  • Best for visualizing thread parallelism on available cores, finding areas with high or low concurrency, and identifying serial bottlenecks in your code
  • Provides information on how many threads were running at each moment during application execution
  • Includes threads which are currently running or ready to run and therefore are not waiting at a defined waiting or blocking API
  • Also shows CPU time while the hotspot was executing and estimates its effectiveness either by CPU usage or by Threads Concurrency

Results of Concurrency tests on Workshop 6

I ran the Concurrency test on each of the functions in Workshop 6. I isolated each function by commenting out all the others, then ran them one by one to get an idea of how each performs on its own. Finally, I ran them all together to see how the program performs overall.

matmul_0 (Serial)

// Serial baseline. Fills c with the product of a and b, where all three are
// n-by-n matrices addressed as [row * n + col], and returns the sum of the
// main-diagonal entries of c. Nothing is written when n <= 0 and the result
// is 0.0.
double matmul_0(const double* a, const double* b, double* c, int n) {
	for (int row = 0; row < n; ++row) {
		const int rowStart = row * n;
		for (int col = 0; col < n; ++col) {
			// Dot product of row `row` of a with column `col` of b.
			double acc = 0.0;
			int inner = 0;
			while (inner < n) {
				acc += a[rowStart + inner] * b[inner * n + col];
				++inner;
			}
			c[rowStart + col] = acc;
		}
	}

	// Sum the diagonal only after every element of c has been stored.
	double trace = 0.0;
	for (int d = 0; d < n; ++d) {
		trace += c[d * n + d];
	}
	return trace;
}

Conc-01.png Conc-02.png

matmul_1 (Serial with j-k loops reversed)

// Serial multiply with the j and k loops interchanged (i-k-j order).
//
// Fills c with the product of a and b, where all three are n-by-n matrices
// addressed as [row * n + col], and returns the sum of the main-diagonal
// entries of c. Nothing is written when n <= 0 and the result is 0.0.
//
// With k as the middle loop, each c[i][j] can no longer be built in a single
// scalar and stored once: every k contributes a[i][k] * b[k][j] to the whole
// of row i. The row is therefore cleared first and then accumulated in place.
// Each element still receives its terms in increasing k, the same order
// matmul_0 uses.
double matmul_1(const double* a, const double* b, double* c, int n) {
	for (int i = 0; i < n; i++) {
		double* cRow = c + i * n;

		// Row i of c is built by accumulation, so it must start from zero.
		for (int j = 0; j < n; j++)
			cRow[j] = 0.0;

		for (int k = 0; k < n; k++) {
			const double aik = a[i * n + k];  // invariant across the j loop
			const double* bRow = b + k * n;
			for (int j = 0; j < n; j++)
				cRow[j] += aik * bRow[j];
		}
	}
	double diag = 0.0;
	for (int i = 0; i < n; i++)
		diag += c[i * n + i];
	return diag;
}

Conc-11.png Conc-12.png

matmul_2 (Cilk Plus with cilk_for)

double matmul_2(const double* a, const double* b, double* c, int n) {
	
	cilk_for (int i = 0; i < n; i++) {
		cilk_for (int j = 0; j < n; j++) {
			double sum = 0.0;
			for(int k = 0; k < n; k++) {
				sum += a[i * n + k] * b[k * n + j];
			}
			c[i * n + j] = sum;
		}
	}

	double diag = 0.0;
	for (int i = 0; i < n; i++)
		diag += c[i * n + i];
	return diag;
}

Conc-21.png Conc-22.png

matmul_3 (+array notation, reducer)

double matmul_3(const double* a, const double* b, double* c, int n) {
	
	cilk_for(int i = 0; i < n; i++) {
		cilk_for(int j = 0; j < n; j++) {
			double sum = 0.0;
			for (int k = 0; k < n; k++) {
				sum += a[i * n + k] * b[k * n + j];
			}
			c[i * n + j] = sum;
		}
	}

	cilk::reducer_opadd <double> diag(0.0);
	cilk_for(int i = 0; i < n; i++) {
		diag += c[i * n + i];
	}
	return diag.get_value();
}

Conc-31.png Conc-32.png

matmul_4 (+vectorization)

// Cilk Plus variant with an explicitly vectorized inner loop. Fills c with the
// product of a and b (n-by-n, addressed as [row * n + col]) and returns the
// sum of the main-diagonal entries of c, accumulated through a
// cilk::reducer_opadd<double> inside a cilk_for.
//
// The k loop carries a dependence through `sum`, so the simd pragma names it
// in a reduction(+:sum) clause. Without the clause the pragma requests
// vectorization of a loop whose accumulator has not been declared as a
// reduction variable.
// NOTE(review): a vectorized reduction may add the terms in a different order
// than the serial loops, so results can differ from matmul_0 in the last bits.
double matmul_4(const double* a, const double* b, double* c, int n) {
	
	cilk_for(int i = 0; i < n; i++) {
		cilk_for(int j = 0; j < n; j++) {
			double sum = 0.0;
#pragma simd reduction(+:sum)
			for (int k = 0; k < n; k++) {
				sum += a[i * n + k] * b[k * n + j];
			}
			c[i * n + j] = sum;
		}
	}

	cilk::reducer_opadd <double> diag(0.0);
	cilk_for(int i = 0; i < n; i++) {
		diag += c[i * n + i];
	}
	return diag.get_value();
}

Conc-41.png Conc-42.png

Final test with all functions

Conc-51.png Conc-52.png

Conc-53.png

Final test with all functions

Conc-51.png Conc-52.png

Conc-53.png

Locks & Waits

HPC Performance Characterization

Microarchitecture

General Exploration

Memory Access

references

https://en.wikipedia.org/wiki/VTune

https://software.intel.com/en-us/get-started-with-vtune

https://software.intel.com/en-us/vtune-amplifier-help-analysis-types

https://software.intel.com/en-us/vtune-amplifier-help-basic-hotspots-analysis

https://software.intel.com/en-us/vtune-amplifier-help-advanced-hotspots-analysis

https://software.intel.com/en-us/vtune-amplifier-help-concurrency-analysis

https://software.intel.com/en-us/vtune-amplifier-help-locks-and-waits-analysis

https://software.intel.com/en-us/vtune-amplifier-help-hpc-performance-characterization-analysis

https://software.intel.com/en-us/vtune-amplifier-help-general-exploration-analysis

https://software.intel.com/en-us/vtune-amplifier-help-memory-access-analysis