Changes

DPS915 Toad

1,719 bytes added, 21:21, 8 December 2015

→‎Assignment 3

We tried to use shared memory; however, our array was simply too large to fit in it.

In the end, after countless and tiring changes to the code, we settled on this simple configuration as being the fastest for our purpose of negating 1000 images.

Kernel:

<pre>

/*
 * Writes the negative of a flat array of gray values: for every index
 * i < n, new_array[i] = 255 - old_array[i].
 *
 * new_array : device pointer, receives n results.
 * old_array : device pointer, n input values (only read here).
 * n         : number of elements to process.
 *
 * Launch layout: indexing uses only the x dimension of the grid and block.
 * Each thread handles at most one element, so the launch must supply at
 * least n threads in total (gridDim.x * blockDim.x >= n); surplus threads
 * are masked off by the bounds check. No shared memory is used.
 *
 * NOTE(review): 255 is hard-coded as the maximum gray level -- confirm that
 * it matches the image's actual maximum.
 */
__global__ void cuda_negate_image(int * new_array, int * old_array, int n){
    // Flat global index of this thread within the 1D launch.
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    // The grid is rounded up to whole blocks, so the tail threads must not
    // touch memory past the end of the arrays.
    if (i < n)
        new_array[i] = 255 - old_array[i];
}

</pre>

Method:

<pre>

/*
 * Replaces this image's pixels with the negative of oldImage, computed on
 * the GPU.
 *
 * Steps:
 *   1. Flatten oldImage.pixelVal (N rows x M columns) row-major into a 1D
 *      host buffer.
 *   2. Copy the buffer to the device and run cuda_negate_image on it.
 *   3. Copy the result back and unflatten it into this->pixelVal.
 *
 * The kernel is launched 1000 times in a row for benchmarking only: every
 * launch reads the same input and writes the same output, so the final
 * pixels equal those of a single launch.
 *
 * oldImage : source image.
 *            NOTE(review): it is indexed with this image's N and M, so it is
 *            assumed to have the same dimensions -- confirm against callers.
 *
 * Output: prints the pixel count (twice, as before) and the elapsed time of
 * the 1000 launches in milliseconds to cout. If any CUDA call fails, the
 * error string is printed to std::cerr and pixelVal is left unmodified.
 */
void Image::negateImage(Image& oldImage)
{
    const int rows = N;
    const int cols = M;
    const int n = rows * cols;

    cout << rows * cols << endl;

    // Nothing to negate; also avoids an invalid zero-block kernel launch.
    if (n <= 0)
        return;

    const size_t bytes = static_cast<size_t>(n) * sizeof(int);

    // One host staging buffer serves both the upload and the download.
    int * h_pixels = new int[n];

    // Copy the 2D array to the 1D buffer (row major).
    int k = 0;
    for (int i = 0; i < rows; i++) {
        for (int j = 0; j < cols; j++) {
            h_pixels[k] = oldImage.pixelVal[i][j];
            k++;
        }
    }

    // Round up so a partially filled last block still covers the tail.
    const int nblks = (n + ntpb - 1) / ntpb;

    cout << rows * cols;

    // Null-initialised so the unconditional cudaFree calls below are safe
    // even when an allocation never happened.
    int * d_old_array = 0;
    int * d_new_array = 0;

    cudaError_t status = cudaMalloc((void**)&d_old_array, bytes);
    if (status == cudaSuccess)
        status = cudaMalloc((void**)&d_new_array, bytes);
    if (status == cudaSuccess)
        status = cudaMemcpy(d_old_array, h_pixels, bytes, cudaMemcpyHostToDevice);

    if (status == cudaSuccess) {
        const clock_t start_s = clock();

        for (int i = 0; i < 1000; i++) {
            cuda_negate_image<<<nblks, ntpb>>>(d_new_array, d_old_array, n);
        }

        // Launches do not return an error code; a bad launch configuration
        // only shows up through cudaGetLastError().
        status = cudaGetLastError();

        // Kernel launches are asynchronous: without this wait the timer
        // would stop before the kernels have run and would measure only
        // the cost of queuing them.
        if (status == cudaSuccess)
            status = cudaDeviceSynchronize();

        const clock_t stop_s = clock();

        cout << "time: " << (stop_s - start_s) / double(CLOCKS_PER_SEC) * 1000 << endl;
    }

    if (status == cudaSuccess)
        status = cudaMemcpy(h_pixels, d_new_array, bytes, cudaMemcpyDeviceToHost);

    if (status == cudaSuccess) {
        // Copy the 1D buffer back into the 2D array.
        k = 0;
        for (int i = 0; i < rows; i++) {
            for (int j = 0; j < cols; j++) {
                pixelVal[i][j] = h_pixels[k];
                k++;
            }
        }
    } else {
        std::cerr << "negateImage: CUDA error: " << cudaGetErrorString(status) << endl;
    }

    // Release device and host buffers on every path.
    cudaFree(d_old_array);
    cudaFree(d_new_array);
    delete [] h_pixels;
}

</pre>

Kristoffer Vukasinovic

1

edit

CDOT Wiki β

Changes

DPS915 Toad

CDOT Wiki ^β