Algo holics

=== Assignment 2 ===
Our initial idea was to use the neural network code from assignment 1 for assignment 2. But since the algorithm itself was not very accurate (2/10 correct predictions even after 10,000 training iterations), we decided to parallelize merge sort instead. Soon we realized that, since its Big O classification is n log n, offloading its computations to the GPU would not be that effective. So we settled on the next most interesting topic, the cosine transform library, as described below.

====Cosine Transformation====

The Cosine_Transform is a simple C++ library which demonstrates properties of the Discrete Cosine Transform (DCT) for real data. The DCT is used to create JPEGs (compressed images). The formula used here is:

 C(u,v) = √(1/n)                       if u = 0,        0 ≤ v ≤ n−1
 C(u,v) = √(2/n) · cos[(2v+1)πu / 2n]  if 1 ≤ u ≤ n−1,  0 ≤ v ≤ n−1

where u is the row index, v is the column index, and n is the total number of elements in a row/column of the computational matrix. This [https://www.youtube.com/watch?v=tW3Hc0Wrgl0 video] can be used for a better understanding of the above formula, and here is the serial [https://people.sc.fsu.edu/~jburkardt/cpp_src/cosine_transform/cosine_transform.html source code] we used.
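To make the formula concrete, here is a small self-contained sketch (our own illustration, not part of the library) that fills the n × n coefficient matrix entry by entry, exactly as written above:

 #include <cmath>
 #include <vector>
 
 // Build the n x n DCT coefficient matrix C(u,v) from the formula above.
 // Row u = 0 gets the sqrt(1/n) scale; every other row gets sqrt(2/n).
 std::vector<double> dctMatrix(int n) {
     const double pi = 3.141592653589793;
     std::vector<double> C(n * n);
     for (int u = 0; u < n; u++) {
         double scale = (u == 0) ? std::sqrt(1.0 / n) : std::sqrt(2.0 / n);
         for (int v = 0; v < n; v++) {
             // cos(0) = 1, so the u = 0 row reduces to the constant sqrt(1/n)
             C[u * n + v] = scale * std::cos((2.0 * v + 1.0) * pi * u / (2.0 * n));
         }
     }
     return C;
 }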
=====Profiling=====

The gprof call graph for the above serial code looks like this (lengthy std::chrono template arguments abbreviated to <...>):

{| class="wikitable mw-collapsible mw-collapsed"
! Call Graph
|-
|
 granularity: each sample hit covers 2 byte(s) for 0.68% of 1.47 seconds
 
 index % time    self  children    called     name
                                                  <spontaneous>
 [1]    100.0    0.00    1.47                 main [1]
                 0.00    1.47       1/1           cosine_transform_test01(int) [3]
 -----------------------------------------------
                 1.47    0.00       1/1           cosine_transform_test01(int) [3]
 [2]    100.0    1.47    0.00       1         cosine_transform_data(int, double*) [2]
 -----------------------------------------------
                 0.00    1.47       1/1           main [1]
 [3]    100.0    0.00    1.47       1         cosine_transform_test01(int) [3]
                 1.47    0.00       1/1           cosine_transform_data(int, double*) [2]
                 0.00    0.00       1/1           r8vec_uniform_01_new(int, int&) [14]
                 0.00    0.00       1/1           reportTime(char const*, std::chrono::duration<...>) [13]
                 0.00    0.00       1/1           std::chrono::operator-<...>(std::chrono::time_point<...> const&, std::chrono::time_point<...> const&) [21]
 -----------------------------------------------
                 0.00    0.00       1/3           std::chrono::__duration_cast_impl<...>::__cast<...>(std::chrono::duration<...> const&) [18]
                 0.00    0.00       2/3           std::chrono::operator-<...>(std::chrono::duration<...> const&, std::chrono::duration<...> const&) [22]
 [10]     0.0    0.00    0.00       3         std::chrono::duration<long, std::ratio<1l, 1000000000l> >::count() const [10]
 -----------------------------------------------
                 0.00    0.00       2/2           std::chrono::operator-<...>(std::chrono::time_point<...> const&, std::chrono::time_point<...> const&) [21]
 [11]     0.0    0.00    0.00       2         std::chrono::time_point<std::chrono::_V2::steady_clock, std::chrono::duration<...> >::time_since_epoch() const [11]
 -----------------------------------------------
                 0.00    0.00       1/1           __libc_csu_init [28]
 [12]     0.0    0.00    0.00       1         _GLOBAL__sub_I__Z20r8vec_uniform_01_newiRi [12]
                 0.00    0.00       1/1           __static_initialization_and_destruction_0(int, int) [15]
 -----------------------------------------------
                 0.00    0.00       1/1           cosine_transform_test01(int) [3]
 [13]     0.0    0.00    0.00       1         reportTime(char const*, std::chrono::duration<...>) [13]
                 0.00    0.00       1/1           std::chrono::duration_cast<...>(std::chrono::duration<...> const&) [17]
                 0.00    0.00       1/1           std::chrono::duration<long, std::ratio<1l, 1000l> >::count() const [16]
 -----------------------------------------------
                 0.00    0.00       1/1           cosine_transform_test01(int) [3]
 [14]     0.0    0.00    0.00       1         r8vec_uniform_01_new(int, int&) [14]
 -----------------------------------------------
                 0.00    0.00       1/1           _GLOBAL__sub_I__Z20r8vec_uniform_01_newiRi [12]
 [15]     0.0    0.00    0.00       1         __static_initialization_and_destruction_0(int, int) [15]
 -----------------------------------------------
                 0.00    0.00       1/1           reportTime(char const*, std::chrono::duration<...>) [13]
 [16]     0.0    0.00    0.00       1         std::chrono::duration<long, std::ratio<1l, 1000l> >::count() const [16]
 -----------------------------------------------
                 0.00    0.00       1/1           reportTime(char const*, std::chrono::duration<...>) [13]
 [17]     0.0    0.00    0.00       1         std::chrono::duration_cast<std::chrono::duration<long, std::ratio<1l, 1000l> >, ...>(std::chrono::duration<...> const&) [17]
                 0.00    0.00       1/1           std::chrono::__duration_cast_impl<...>::__cast<...>(std::chrono::duration<...> const&) [18]
 -----------------------------------------------
                 0.00    0.00       1/1           std::chrono::duration_cast<...>(std::chrono::duration<...> const&) [17]
 [18]     0.0    0.00    0.00       1         std::chrono::__duration_cast_impl<...>::__cast<...>(std::chrono::duration<...> const&) [18]
                 0.00    0.00       1/3           std::chrono::duration<long, std::ratio<1l, 1000000000l> >::count() const [10]
|}

As is evident, virtually all of the run time is spent in '''cosine_transform_data''', whose nested for loops make the algorithm O(n²). Using thread indices on the GPU to replace those loops could potentially improve performance. To increase the efficiency of the program we therefore transformed the '''cosine_transform_data''' function into a kernel named '''cosTransformKernel''', which offloads the compute-intensive calculation to the GPU.

=====Kernel Version 1=====

{| class="wikitable mw-collapsible mw-collapsed"
! Modified Code
|-
|
 #include <iostream>
 #include <iomanip>
 #include <ctime>
 #include <chrono>
 #include <cstdlib>
 #include <cmath>
 #include <cuda_runtime.h>
 using namespace std;
 using namespace std::chrono;
 const double pi = 3.141592653589793;
 const int ntpb = 1024;
 void cosine_transform_test01 ( int size );
 
 double *r8vec_uniform_01_new ( int n, int &seed ){
     int i;
     const int i4_huge = 2147483647;
     int k;
     double *r;
     if ( seed == 0 ){
         cerr << "\n";
         cerr << "R8VEC_UNIFORM_01_NEW - Fatal error!\n";
         cerr << "  Input value of SEED = 0.\n";
         exit ( 1 );
     }
     r = new double[n];
     for ( i = 0; i < n; i++ ){
         k = seed / 127773;
         seed = 16807 * ( seed - k * 127773 ) - k * 2836;
         if ( seed < 0 ){
             seed = seed + i4_huge;
         }
         r[i] = ( double ) ( seed ) * 4.656612875E-10;
     }
     return r;
 }
 
 double *cosine_transform_data ( int n, double d[] ){
     double angle;
     double *c;
     int i;
     int j;
     c = new double[n];
     for ( i = 0; i < n; i++ ){
         c[i] = 0.0;
         for ( j = 0; j < n; j++ ){
             angle = pi * ( double ) ( i * ( 2 * j + 1 ) ) / ( double ) ( 2 * n );
             c[i] = c[i] + cos ( angle ) * d[j];
         }
         c[i] = c[i] * sqrt ( 2.0 / ( double ) ( n ) );
     }
     return c;
 }
 
 void reportTime(const char* msg, steady_clock::duration span) {
     auto ms = duration_cast<milliseconds>(span);
     std::cout << msg << " - took - " << ms.count() << " millisecs" << std::endl;
 }
 
 // each thread computes one element of the transformed vector, so the
 // outer for loop of the serial version disappears
 __global__ void cosTransformKernel(double *a, double *b, const int n){
     int i = blockIdx.x * blockDim.x + threadIdx.x;
     if (i < n){ // guard against the threads past the end of the data
         const double pi = 3.141592653589793;
         double sum = 0.0;
         for (int j = 0; j < n; j++){
             double angle = pi * (double)(i * (2 * j + 1)) / (double)(2 * n);
             sum += cos ( angle ) * a[j];
         }
         b[i] = sum * sqrt( 2.0 / (double) n );
     }
 }
 
 int main ( int argc, char* argv[] ){
     if (argc != 2) {
         std::cerr << argv[0] << ": invalid number of arguments\n";
         std::cerr << "Usage: " << argv[0] << " size_of_vector\n";
         return 1;
     }
     int n = std::atoi(argv[1]);
     cosine_transform_test01 (n);
     return 0;
 }
 
 void cosine_transform_test01 ( int size ){
     int n = size;
     int seed;
     double *r;
     double *hs; // host-side result, kept for comparison purposes
     double *s = new double[n];
     double *d_a;
     double *d_b;
     // allocate memory on the device for the randomly generated array and
     // for the array in which the transform values will be stored
     cudaMalloc((void**)&d_a, sizeof(double) * n);
     cudaMalloc((void**)&d_b, sizeof(double) * n);
     seed = 123456789;
     r = r8vec_uniform_01_new ( n, seed );
     // copy the randomly generated values from the host to the device
     cudaMemcpy(d_a, r, sizeof(double) * n, cudaMemcpyHostToDevice);
     int nblks = (n + ntpb - 1) / ntpb;
     steady_clock::time_point ts, te;
     ts = steady_clock::now();
     cosTransformKernel<<<nblks,ntpb>>>(d_a, d_b, size);
     cudaDeviceSynchronize();
     te = steady_clock::now();
     reportTime("Cosine Transform on device", te - ts);
     cudaMemcpy(s, d_b, sizeof(double) * n, cudaMemcpyDeviceToHost);
     ts = steady_clock::now();
     hs = cosine_transform_data ( n, r );
     te = steady_clock::now();
     reportTime("Cosine Transform on host", te - ts);
     cudaFree(d_a);
     cudaFree(d_b);
     delete [] r;
     delete [] s;
     delete [] hs;
 }
|}

The graph of the execution time difference between the device and the host looks like this:

[[File:serialvParallelkernel1.png|200px|thumb|left|Execution times of the serial code versus Kernel Version 1]]

Even though the kernel still contains a for loop, the execution time has decreased drastically. That is because each thread is now responsible for calculating one element of the final cosine-transformed vector.
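Since '''cosine_transform_test01''' keeps both the host result (hs) and the device result (s), a quick element-wise comparison can confirm that the kernel computes the same transform. A minimal sketch (the helper name and the tolerance are our own, illustrative choices):

 #include <cmath>
 #include <iostream>
 
 // Hypothetical helper: returns true when the device result matches the
 // host result to within a small absolute tolerance.
 bool resultsMatch(const double* host, const double* device, int n,
                   double eps = 1e-6) {
     for (int i = 0; i < n; i++) {
         if (std::fabs(host[i] - device[i]) > eps) {
             std::cerr << "mismatch at " << i << ": " << host[i]
                       << " vs " << device[i] << "\n";
             return false;
         }
     }
     return true;
 }

Calling resultsMatch(hs, s, n) just before the memory is released would flag any divergence between the two implementations.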
=== Assignment 3 ===
 
To optimize the code further, we thought of removing the iterative loop from the kernel by using threadIdx.y to control the calculation of each element's cosine term for that position in the matrix. The problem with this was that all the threads of a row were now racing to write to the same memory location while summing up the cosine transformations for that row. We solved this by using an atomic function; note that the double-precision overload below is only available natively on devices of compute capability 6.0 and above. Its prototype is as follows.
double atomicAdd(double* address, double value)
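On older devices, an equivalent can be built from atomicCAS; a sketch of that fallback, adapted from the CUDA C Programming Guide (the name atomicAddDouble is ours):

 // Fallback for devices below compute capability 6.0, where atomicAdd on
 // double is not provided natively; adapted from the atomicCAS-based
 // version in the CUDA C Programming Guide.
 __device__ double atomicAddDouble(double* address, double val) {
     unsigned long long int* address_as_ull = (unsigned long long int*)address;
     unsigned long long int old = *address_as_ull, assumed;
     do {
         assumed = old;
         old = atomicCAS(address_as_ull, assumed,
                         __double_as_longlong(val + __longlong_as_double(assumed)));
     } while (assumed != old); // retry if another thread updated the value meanwhile
     return __longlong_as_double(old);
 }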
 
=====Kernel Version 2=====
 
{| class="wikitable mw-collapsible mw-collapsed"
! Kernel 2
|-
|
#include <iostream>
#include <iomanip>
#include <ctime>
#include <chrono>
#include <cstdlib>
#include <cmath>
#include <limits>
#include <cuda_runtime.h>
#include <cuda.h>
using namespace std;
using namespace std::chrono;
const double pi = 3.141592653589793;
const unsigned ntpb = 32;
void cosine_transform_test01 ( int size );
 
double *r8vec_uniform_01_new ( int n, int &seed ){
    int i;
    const int i4_huge = 2147483647;
    int k;
    double *r;
    if ( seed == 0 ){
        cerr << "\n";
        cerr << "R8VEC_UNIFORM_01_NEW - Fatal error!\n";
        cerr << "  Input value of SEED = 0.\n";
        exit ( 1 );
    }
    r = new double[n];
    for ( i = 0; i < n; i++ ){
        k = seed / 127773;
        seed = 16807 * ( seed - k * 127773 ) - k * 2836;
        if ( seed < 0 ){
            seed = seed + i4_huge;
        }
        r[i] = ( double ) ( seed ) * 4.656612875E-10;
    }
    return r;
}

double *cosine_transform_data ( int n, double d[] ){
    double angle;
    double *c;
    int i;
    int j;
    c = new double[n];
    for ( i = 0; i < n; i++ ){
        c[i] = 0.0;
        for ( j = 0; j < n; j++ ){
            angle = pi * ( double ) ( i * ( 2 * j + 1 ) ) / ( double ) ( 2 * n );
            c[i] = c[i] + cos ( angle ) * d[j];
        }
        c[i] = c[i] * sqrt ( 2.0 / ( double ) ( n ) );
    }
    return c;
}

void reportTime(const char* msg, steady_clock::duration span) {
    auto ms = duration_cast<milliseconds>(span);
    std::cout << msg << " - took - " << ms.count() << " millisecs" << std::endl;
}
 
__global__ void cosTransformKernel(double *a, double *b, const int n){
    const double pi = 3.141592653589793;
    int j = blockIdx.x * blockDim.x + threadIdx.x;
    int i = blockIdx.y * blockDim.y + threadIdx.y;
    if (i < n && j < n){
        double angle = pi * ( double ) ( i * ( 2 * j + 1 ) ) / ( double ) ( 2 * n );
        // atomically accumulate this thread's term into row i's sum;
        // the old value returned by atomicAdd is not needed
        atomicAdd(&b[i], cos ( angle ) * a[j]);
    }
    // scale the row sum by the square root term (this relies on the
    // additions to b[i] having completed, which a single kernel launch
    // does not strictly guarantee)
    if (j == n - 1 && i < n){
        b[i] *= sqrt ( 2.0 / ( double ) ( n ) );
    }
}
 
int main ( int argc, char* argv[] ){
    if (argc != 2) {
        std::cerr << argv[0] << ": invalid number of arguments\n";
        std::cerr << "Usage: " << argv[0] << " size_of_vector\n";
        return 1;
    }
    int n = std::atoi(argv[1]);
    cosine_transform_test01 (n);
    return 0;
}
 
void cosine_transform_test01 ( int size ){
    int n = size;
    int seed;
    double *r;
    double *hs; // host-side pointer for the result of cosine_transform_data, kept for comparison purposes
    double *s = new double[n];
    double *d_a;
    double *d_b;
    // allocate memory on the device for the randomly generated array and
    // for the array in which the transform values will be stored
    cudaMalloc((void**)&d_a, sizeof(double) * n);
    cudaMalloc((void**)&d_b, sizeof(double) * n);
    seed = 123456789;
    r = r8vec_uniform_01_new ( n, seed );
    // zero the host buffer, then copy both arrays to the device; d_b must
    // start at zero because the kernel accumulates into it
    for (int i = 0; i < n; i++)
        s[i] = 0.0;
    cudaMemcpy(d_a, r, sizeof(double) * n, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, s, sizeof(double) * n, cudaMemcpyHostToDevice);
    int nblks = (n + ntpb - 1) / ntpb;
    dim3 grid(nblks, nblks, 1);
    dim3 block(ntpb, ntpb, 1);
    steady_clock::time_point ts, te;
    ts = steady_clock::now();
    cosTransformKernel<<<grid,block>>>(d_a, d_b, size);
    cudaDeviceSynchronize();
    te = steady_clock::now();
    reportTime("Cosine Transform on device", te - ts);
    cudaMemcpy(s, d_b, sizeof(double) * n, cudaMemcpyDeviceToHost);
    ts = steady_clock::now();
    hs = cosine_transform_data ( n, r );
    te = steady_clock::now();
    reportTime("Cosine Transform on host", te - ts);

    cudaFree(d_a);
    cudaFree(d_b);
    delete [] r;
    delete [] s;
    delete [] hs;
}
 
|}
 
Here is a comparison between the naive kernel (version 1) and the optimized kernel (version 2). Note that with ntpb = 32, each version-2 block launches 32 × 32 = 1024 threads, the per-block maximum on current devices, and the grid covers the whole n × n index space.
 
[[File:kernel2.jpg]]
 
Evidently, there is some performance boost in the new version. However, each call to atomicAdd serializes access to the global memory location until the old value has been read and the new sum written back. This contention holds the kernel back from the faster execution one might otherwise expect.
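A further refinement we did not implement would avoid the contention altogether: assign one block per row, let the threads accumulate private partial sums, and combine them with a shared-memory tree reduction, so each row ends with a single global write and no atomics. A sketch of the idea (the kernel name and block size are our own, untested choices):

 // Sketch: one block per row i; threads stride across the columns j,
 // accumulate privately, then combine in shared memory. Only thread 0
 // writes b[i], so no global atomics are needed.
 __global__ void cosTransformRowReduce(const double* a, double* b, int n) {
     extern __shared__ double partial[];
     const double pi = 3.141592653589793;
     int i = blockIdx.x; // row index
     double sum = 0.0;
     for (int j = threadIdx.x; j < n; j += blockDim.x) {
         double angle = pi * (double)(i * (2 * j + 1)) / (double)(2 * n);
         sum += cos(angle) * a[j];
     }
     partial[threadIdx.x] = sum;
     __syncthreads();
     // tree reduction (blockDim.x must be a power of two)
     for (int stride = blockDim.x / 2; stride > 0; stride >>= 1) {
         if (threadIdx.x < stride)
             partial[threadIdx.x] += partial[threadIdx.x + stride];
         __syncthreads();
     }
     if (threadIdx.x == 0)
         b[i] = partial[0] * sqrt(2.0 / (double)n);
 }
 
 // launch: cosTransformRowReduce<<<n, 256, 256 * sizeof(double)>>>(d_a, d_b, n);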