Changes

Jump to: navigation, search

Algo holics

13,805 bytes added, 03:00, 8 April 2019
Kernel Version 2
{| class="wikitable mw-collapsible mw-collapsed"
! Flat profileCall Graph
|-
|
{| class="wikitable mw-collapsible mw-collapsed"
! Flat profileCall Graph
|-
|
=== Assignment 2 ===
We decided Our initial idea was to choose use the neural network code for our assignment 2. But since the algorithm itself was not very accurate (2/10 correct predictions even after 10,000 training iterations), we decided to paralellizemerge sort. The reason being it gives us a good learning opportunity and more importantly Soon we realized that since its Big O classification was n log n, offloading computations to GPU would not be that effective. So, we can potentially speed up the execution time of settled with the program by using CUDAcosine transform library, as described below.
====Cosine Tranformation====
=== Parallelize ===The Cosine_Transform is a simple C++ library which demonstrates properties of the Discrete cosine Transform for real data. The Discrete Cosine Transform or DCT is used to create jpeg (compressed images).
{The formula used here is: | class(√1/n) , if u="wikitable mw0; 0≤v≤n-collapsible mw-collapsed"1! Flat profile C(u,v) = |(√2/n) * cos[((2*v+1)π*u)/2n], if 1≤u≤n-| 1; 0≤v≤n-1
#include <iostream>#include <algorithm>#include <vector>#include <ctime>#include <cuda_runtimeWhere, u is the row index, v is the column index and n is the total number of elements in a row/column in the computational matrix.h>#include "device_launch_parameters.h"
This [https://www.youtube.com/watch?v=tW3Hc0Wrgl0 Link] can be used for better understanding of the above formula. Here is the [https://people.sc.fsu.edu/~jburkardt/cpp_src/cosine_transform/cosine_transform.html source code] used.
__global__ void merge(std=====Profiling=====The flat profile for the above serial code looks like::vector<int> arrayInput, unsigned int b, unsigned int c, unsigned int e) {
int idx {| class= blockIdx.x * blockDim.x + threadIdx.x;"wikitable mw-collapsible mw-collapsed"! Flat Profile|-|
std::vector<int> C(arrayInput);
unsigned int i1 = b;
unsigned int i2 = c + 1; //start point in each piece
unsigned int n1 = c;
unsigned int n2 = e; //end point of each piece
if 1 2 3 4 granularity: each sample hit covers 2 byte(idx s) for 0.68% of 1.47 seconds 5 6 index % time self children called name 7 < espontaneous> 8 [1] 100.0 0.00 1.47 main [1] 9 0.00 1.47 1/1 cosine_transform_test01(int) {[3] 10 ----------------------------------------------- 11 1.47 0.00 1/1 cosine_transform_test01(int) [3] for 12 [2] 100.0 1.47 0.00 1 cosine_transform_data(int, double*) [2] 13 ----------------------------------------------- 14 0.00 1.47 1/1 main [1] 15 [3] 100.0 0.00 1.47 1 cosine_transform_test01(int) [3] 16 1.47 0.00 1/1 cosine_transform_data(int, double*) [2] 17 0.00 0.00 1/1 r8vec_uniform_01_new(int, int&) [14] 18 0.00 0.00 1/1 reportTime(char const*, std::chrono::duration<long, std::ratio<1l, 1000000000l> >) [13] 19 0.00 0.00 1/1 std::common_type<std::chrono::duration<long, std::ratio<1l, 1000000000l> >, std::chrono::duration<long, std::ratio<1l, 1000000000l> > >::type std::chrono::operator-<std::chrono::_V2::s teady_clock, std::chrono::duration<long, std::ratio<1l, 1000000000l> >, std::chrono::duration<long, std::ratio<1l, 1000000000l> > >(unsigned i = b; i std::chrono::time_point<std::chrono::_V2::steady_clock, std::chrono::duration<long, std::ratio<= e; ++i1l, 10 00000000l> > > const&, std::chrono::time_point<std::chrono::_V2::steady_clock, std::chrono::duration<long, std::ratio<1l, 1000000000l> > > const&)[21] { 20 ----------------------------------------------- if 21 0.00 0.00 1/3 std::chrono::duration<long, std::ratio<1l, 1000l> > std::chrono::__duration_cast_impl<std::chrono::duration<long, std::ratio<1l, 1000l> >, std::ratio<1l, 1000000l>, long, true, false>: :__cast<long, std::ratio<1l, 1000000000l> >(std::chrono::duration<long, std::ratio<1l, 1000000000l> > const&) [18] 22 0.00 0.00 2/3 std::common_type<std::chrono::duration<long, std::ratio<1l, 1000000000l> >, std::chrono::duration<long, std::ratio<1l, 1000000000l> > >::type std::chrono::operator-<long, std::ratio<1l , 1000000000l>, long, std::ratio<1l, 1000000000l> >(i1 std::chrono::duration<long, std::ratio<1l, 1000000000l> n1> const&, std::chrono::duration<long, std::ratio<1l, 1000000000l> > const&) [22] 23 [10] 0.0 0.00 0.00 3 std::chrono::duration<long, std::ratio<1l, 1000000000l> >::count() const [10] 24 ----------------------------------------------- 25 0.00 0.00 2/2 std::common_type<std::chrono::duration<long, std::ratio<1l, 1000000000l> >, std::chrono::duration<long, std::ratio<1l, 1000000000l> > >::type std::chrono::operator-<std::chrono::_V2::s teady_clock, std::chrono::duration<long, std::ratio<1l, 1000000000l> >, std::chrono::duration<long, std::ratio<1l, 1000000000l> > >(std::chrono::time_point<std::chrono::_V2::steady_clock, std::chrono::duration<long, std::ratio<1l, 10 00000000l> > > const&, std::chrono::time_point<std::chrono::_V2::steady_clock, std::chrono::duration<long, std::ratio<1l, 1000000000l> > > const&) [21] 26 [11] 0.0 0.00 0.00 2 std::chrono::time_point<std::chrono::_V2::steady_clock, std::chrono::duration<long, std::ratio<1l, 1000000000l> > >::time_since_epoch() const [11] 27 ----------------------------------------------- 28 0.00 0.00 1/1 __libc_csu_init [28] { 29 [12] 0.0 0.00 0.00 1 _GLOBAL__sub_I__Z20r8vec_uniform_01_newiRi [12] C 30 0.00 0.00 1/1 __static_initialization_and_destruction_0(int, int) [i15] = arrayInput 31 ----------------------------------------------- 32 0.00 0.00 1/1 cosine_transform_test01(int) [i23]; ++i2; 33 [13] 0.0 0.00 0.00 1 reportTime(char const*, std::chrono::duration<long, std::ratio<1l, 1000000000l> >) [13] }else if 34 0.00 0.00 1/1 std::enable_if<std::chrono::__is_duration<std::chrono::duration<long, std::ratio<1l, 1000l> > >::value, std::chrono::duration<long, std::ratio<1l, 1000l> > >::type std::chrono::duratio n_cast<std::chrono::duration<long, std::ratio<1l, 1000l> >, long, std::ratio<1l, 1000000000l> >(i2 std::chrono::duration<long, std::ratio<1l, 1000000000l> n2> const&){[17] C 35 0.00 0.00 1/1 std::chrono::duration<long, std::ratio<1l, 1000l> >::count() const [16] 36 ----------------------------------------------- 37 0.00 0.00 1/1 cosine_transform_test01(int) [i3] = arrayInput 38 [i114] 0.0 0.00 0.00 1 r8vec_uniform_01_new(int, int&) [14]; ++i1; 39 ----------------------------------------------- }else if 40 0.00 0.00 1/1 _GLOBAL__sub_I__Z20r8vec_uniform_01_newiRi [12] 41 [15] 0.0 0.00 0.00 1 __static_initialization_and_destruction_0(arrayInputint, int) [i115] 42 ----------------------------------------------- 43 0.00 0.00 1/1 reportTime(char const*, std::chrono::duration<= arrayInputlong, std::ratio<1l, 1000000000l> >) [i213]) { C 44 [i16] = arrayInput 0.0 0.00 0.00 1 std::chrono::duration<long, std::ratio<1l, 1000l> >::count() const [i116]; ++i1; 45 ----------------------------------------------- }else{ 46 0.00 0.00 1/1 reportTime(char const*, std::chrono::duration<long, std::ratio<1l, 1000000000l> >) [13] C 47 [i17] = arrayInput 0.0 0.00 0.00 1 std::enable_if<std::chrono::__is_duration<std::chrono::duration<long, std::ratio<1l, 1000l> > >::value, std::chrono::duration<long, std::ratio<1l, 1000l> > >::type std::chrono::duration_ca st<std::chrono::duration<long, std::ratio<1l, 1000l> >, long, std::ratio<1l, 1000000000l> >(std::chrono::duration<long, std::ratio<1l, 1000000000l> > const&) [17] 48 0.00 0.00 1/1 std::chrono::duration<long, std::ratio<1l, 1000l> > std::chrono::__duration_cast_impl<std::chrono::duration<long, std::ratio<1l, 1000l> >, std::ratio<1l, 1000000l>, long, true, false>: :__cast<long, std::ratio<1l, 1000000000l> >(std::chrono::duration<long, std::ratio<1l, 1000000000l> > const&) [i218]; ++i2; 49 ----------------------------------------------- 50 0.00 0.00 1/1 std::enable_if<std::chrono::__is_duration<std::chrono::duration<long, std::ratio<1l, 1000l> > >::value, std::chrono::duration<long, std::ratio<1l, 1000l> > >::type std::chrono::duratio n_cast<std::chrono::duration<long, std::ratio<1l, 1000l> >, long, std::ratio<1l, 1000000000l> >(std::chrono::duration<long, std::ratio<1l, 1000000000l> > const&) [17] } 51 [18] 0.0 0.00 0.00 1 std::chrono::duration<long, std::ratio<1l, 1000l> > std::chrono::__duration_cast_impl<std::chrono::duration<long, std::ratio<1l, 1000l> >, std::ratio<1l, 1000000l>, long, true, false>::__c ast<long, std::ratio<1l, 1000000000l> >(std::chrono::duration<long, std::ratio<1l, 1000000000l> > const&) [18] } 52 0.00 0.00 1/3 std::chrono::duration<long, std::ratio<1l, 1000000000l> >::count() const [10] |}
As is evident, the algorithm is O(n2) currently. Using thread indices on the GPU to replace the for loops could potentially improve performance.
To increase the efficiency of the program we transformed the '''cosine_transform_data''' function into a kernel named '''cosTransformKernel''' which offloads the compute intense calculation of the program to the GPU.
}
void check_sort(std::vector<int> array); //test sorted arrays=====Kernel Version 1=====void merge_sort(std::vector<int> array); //recurrent merge sort{| class="wikitable mw-collapsible mw-collapsed"//std::vector<int> merge(std::vector<int> array, unsigned int b, unsigned int c, unsigned int e);! Modified Code|-|
int main() # include <iostream>{ # include <iomanip> unsigned int size; # include <ctime> std::cout # include <chrono> # include < "Please enter the size of your array:" cstdlib> # include <cmath> #include < cuda_runtime.h> using namespace std::endl; using namespace std::cin >> chrono; const double pi = 3.141592653589793; const int ntpb = 1024; void cosine_transform_test01 ( int size);
std::vector double *r8vec_uniform_01_new ( int n, int &seed ){ int i; const int i4_huge = 2147483647; int k; double *r; if ( seed == 0 ){ cerr << "\n"; cerr <int> initial_array< "R8VEC_UNIFORM_01_NEW - Fatal error!\n"; cerr << " Input value of SEED = 0.\n"; exit (size1 ); } r = new double[n]; for ( i = 0; i < n; i++ ){ k = seed //array of random elements127773; seed = 16807 * ( seed - k * 127773 ) - k * 2836; if ( seed < 0 ){ seed = seed + i4_huge; } r[i] = ( double ) ( seed ) * 4.656612875E-10; } return r; }
//prefill arrays srand double *cosine_transform_data (time(0)int n, double d[] ){ double angle; double *c; int i; int j; c = new double[n]; for (unsigned int i= 0; i < initial_arrayn; i++ ){ c[i] = 0.size0; for ()j = 0; j < n; j++){ angle = pi * ( double ) ( i* ( 2 * j + 1 ) ) / ( double ) ( 2 * n ) {; c[i] = c[i] + cos ( angle ) * d[j]; } initial_array c[i] = randc[i] * sqrt ( 2.0 / (double )( n ) ); } return c; }
merge_sort void reportTime(initial_arrayconst char* msg, steady_clock::duration span){ auto ms = duration_cast<milliseconds>(span); std::cout << msg << " - took - " << ms.count() << " millisecs" << std::endl; }
/ __global__ void cosTransformKernel(double *a, double * DEBUGGERb, int n){ double angle; const double pi = 3.141592653589793; std::cout << "initial array" << std::endl int i = blockIdx.x * blockDim.x + threadIdx.x; for (unsigned int i j= 0; i j< initial_array.size()n; j++){ angle = pi * (double) (i*(2*j+1)) / (double) {(2*n); std::cout << initial_array b[i] << " "+= cos ( angle ) * a[j]; } std::cout << std::endl b[i] *= sqrt( 2.0 / (double) n ); */ }
int main (int argc, char* argv[] ){ if (argc != 2) { std::cerr << argv[0] << ": invalid number of arguments\n"; std::cerr << "Usage: " << argv[0] << " size_of_vector\n"; return 1; } int n = std::atoi(argv[1]); cosine_transform_test01 (n); return 0; }
void merge_sortcosine_transform_test01 (std::vector<int> arrayInputsize){ int n = size; int seed; double *r; double *hs; double *s = new double[n]; double *d_a; double *d_b; //allocate memory on the device for the randomly generated array and for the array in which transform values will be stored cudaMalloc((void**)&d_a,sizeof(double) * n);{ cudaMalloc((void**)&d_b,sizeof(double) * n); std::cout << "merge sort" << std::endl seed = 123456789; check_sort r = r8vec_uniform_01_new (arrayInputn, seed ); //copy randomly generated values from host to device cudaMemcpy(d_a,r,sizeof(double)* DEBUGGERn,cudaMemcpyHostToDevice); std int nblks = (n + ntpb - 1) / ntpb; steady_clock::cout time_point ts, te; ts = steady_clock::now(); cosTransformKernel<< "initial array" << stdnblks,ntpb>>>(d_a,d_b,size); cudaDeviceSynchronize(); te = steady_clock::endlnow(); reportTime("Cosine Transform on device",te-ts); for cudaMemcpy(s,d_b,sizeof(unsigned int i double)*n,cudaMemcpyDeviceToHost); ts = 0steady_clock::now(); i < array.size hs = cosine_transform_data (n, r ); ++i) { std te = steady_clock::cout << array[i] << now(); reportTime(" Cosine Transform on host",te-ts); } */
int d cudaFree(d_a); cudaFree(d_b); cudaDeviceProp prop delete [] r; cudaGetDevice(&d) delete [] s; cudaGetDeviceProperties(&prop, d) delete [] hs; }
float* d_a; |}
float start_time = clock();
unsigned int n = arrayInput.size();
float* h_a = new float[n]; cudaMalloc((void**)&d_a, n * sizeof(float));The graph for the execution time difference between the device and the host looks like:
[[File:kernel1.png]]
Even though the kernel includes a for-loop the execution time has decreased drastically. Thats because each thread is now responsible for one calculating one element of the final Cos transformed matrix(unit vector).
=== Assignment 3 ===
For optimizing the code better, we thought of removing the iterative loop from the kernel by using threadIdx.y to control calculation of each element's cosine for that position in the supposed matrix. The problem in this was that each thread was in a racing condition to write to the same memory location, to sum up the cosine transformations for all elements of that row. We solved this by using the atomic function. Its prototype is as follows.
double atomicAdd(double* address, double value)
for (unsigned int s = 1; s < n; s *= ===Kernel Version 2) { for (unsigned int b = 0; b < n; b += s * 2) { unsigned int c = std::min(b + s - 1, n - 1); unsigned int e = std::min(c + s, n - 1); merge <<< 1,n >> > (arrayInput, b, c, e);=
{| class="wikitable mw-collapsible mw-collapsed"
! Kernel 2
|-
|
# include <cmath>
# include <cstdlib>
# include <iostream>
# include <iomanip>
# include <ctime>
# include <chrono>
# include <cstdlib>
# include <cmath>
#include <limits>
#include <cuda_runtime.h>
#include <cuda.h>
using namespace std;
using namespace std::chrono;
const double pi = 3.141592653589793;
const unsigned ntpb = 32;
void cosine_transform_test01 ( int size );
} double *r8vec_uniform_01_new ( int n, int &seed ){ } int i; float end_time const int i4_huge = clock2147483647; int k; double *r; if (seed == 0 ) { cerr << "\n"; cerr << "R8VEC_UNIFORM_01_NEW - start_timeFatal error!\n"; /* DEBUGGER cerr << " Input value of SEED = 0.\n"; exit ( 1 ); } r = new double[n]; for (unsigned int i = 0; i < array.size()n; i++i) { std::cout k = seed / 127773; seed = 16807 * ( seed - k * 127773 ) - k * 2836; if ( seed << array0 ){ seed = seed + i4_huge; } r[i] << " "= ( double ) ( seed ) * 4.656612875E-10; } */ check_sort(arrayInput) return r; std::cout << "time: " << end_time / 1000 << std::endl; }
double *cosine_transform_data ( int n, double d[] ){
double angle;
double *c;
int i;
int j;
c = new double[n];
for ( i = 0; i < n; i++ ){
c[i] = 0.0;
for ( j = 0; j < n; j++ ){
angle = pi * ( double ) ( i * ( 2 * j + 1 ) ) / ( double ) ( 2 * n );
c[i] = c[i] + cos ( angle ) * d[j];
}
c[i] = c[i] * sqrt ( 2.0 / ( double ) ( n ) );
}
return c;
}
cudaMemcpy(h_a, d_a, n * sizeof(float), cudaMemcpyDeviceToHost);
void reportTime(const char* msg, steady_clock::duration span) {
auto ms = duration_cast<milliseconds>(span);
std::cout << msg << " - took - " <<
ms.count() << " millisecs" << std::endl;
}
cudaFree __global__ void cosTransformKernel(d_adouble *a, double *b, const int n){ double angle; const double pi = 3.141592653589793; delete int j = blockIdx.x * blockDim.x + threadIdx.x; int i = blockIdx.y * blockDim.y + threadIdx.y; if(i<n && j<n){ angle = pi * ( double ) ( i * ( 2 * j + 1 ) ) / ( double ) ( 2 * n ); double value = cos ( angle ) * a[j] h_a; b[i] = atomicAdd(&b[i], value); } //square root of the whole cos transformed row term if(j==n-1 && i<n){ b[i] *= sqrt ( 2.0 / ( double ) ( n ) ); } }
int main (int argc, char* argv[] ){ if (argc != 2) { std::cerr << argv[0] << ": invalid number of arguments\n"; std::cerr << "Usage: " << argv[0] << " size_of_vector\n"; return 1; } int n = std::atoi(argv[1]); cosine_transform_test01 (n); return 0; }
void cosine_transform_test01 ( int size){ int n = size; int seed; double *r; double *hs; //host side pointer to store the array returned from host side cosine_transform_data, for comparison purposes double *s = new double[n]; //double *t; double *d_a; double *d_b; //allocate memory on the device for the randomly generated array and for the array in which transform values will be stored cudaMalloc((void**)&d_a,sizeof(double) * n); cudaMalloc((void**)&d_b,sizeof(double) * n); seed = 123456789; r = r8vec_uniform_01_new ( n, seed ); //copy randomly generated values from host to device for(int i=0; i<n; i++) s[i]=0.0; cudaMemcpy(d_a,r,sizeof(double)*n,cudaMemcpyHostToDevice); cudaMemcpy(d_b,s,sizeof(double)*n,cudaMemcpyHostToDevice); int nblks = (n + ntpb - 1) / ntpb; dim3 grid(nblks,nblks,1); dim3 block(ntpb,ntpb,1);std steady_clock::vectortime_point ts, te; ts = steady_clock::now(); cosTransformKernel<<<intgrid,block>>> merge(stdd_a,d_b,size); cudaDeviceSynchronize(); te = steady_clock::vector<int> arraynow(); reportTime("Cosine Transform on device",te-ts); cudaMemcpy(s,d_b, unsigned int bsizeof(double)*n, unsigned int ccudaMemcpyDeviceToHost); ts = steady_clock::now(); hs = cosine_transform_data ( n, unsigned int er ) {; std te = steady_clock::vector<int> Cnow(); reportTime(array"Cosine Transform on host",te-ts);
cudaFree(d_a);
cudaFree(d_b);
delete [] r;
delete [] s;
delete [] hs;
//delete [] t;
return;
}
unsigned int i1 = b; unsigned int i2 = c + 1; //start point in each piece unsigned int n1 = c; unsigned int n2 = e; //end point of each piece|}
for (unsigned i = b; i <= e; ++i) { if (i1 > n1) // { C[i] = array[i2]; ++i2; } else if (i2 > n2) C[i] = array[i1]; ++i1; { }else if (array[i1] <= array[i2]) { C[i] = array[i1]; ++i1; } else { C[i] = array[i2]; ++i2; } } return C; Here is a comparison between the naive and optimized kernel
}*/[[File:kernel2.jpg]]
 void check_sort(std::vector<int> array){ Evidently, there is some performance boost for (unsigned int i = 0; i < (arraythe new version. However, each call to atomicAdd by a thread locks the global memory until the old value is read and added to the passed value. This deters faster execution as might be expected.size() - 1); ++i) { if (array[i] >(array[i + 1])) { std::cout << "unsorted" << std::endl; return; } } std::cout << "sorted" << std::endl;}  |} === Assignment 3 ===
57
edits

Navigation menu