=== Assignment 2 ===
 
Original CPU Implementation:
 
void Image::rotateImage(int theta, Image& oldImage)
/* rotates the image by theta degrees (the user's input) around the centre of the image */
{
    int r1, c1;
    int rows = oldImage.N;
    int cols = oldImage.M;
    Image tempImage(rows, cols, oldImage.Q);
    float rads = (theta * 3.14159265) / 180.0;
    int r0 = rows / 2;   // centre of rotation
    int c0 = cols / 2;
    // forward mapping: rotate each source pixel into its destination position
    for (int r = 0; r < rows; r++)
    {
        for (int c = 0; c < cols; c++)
        {
            r1 = (int)(r0 + ((r - r0) * cos(rads)) - ((c - c0) * sin(rads)));
            c1 = (int)(c0 + ((r - r0) * sin(rads)) + ((c - c0) * cos(rads)));
            if (inBounds(r1, c1))
            {
                tempImage.pixelVal[r1][c1] = oldImage.pixelVal[r][c];
            }
        }
    }
    // forward mapping leaves holes; patch each unwritten (zero) pixel with its right-hand neighbour
    for (int i = 0; i < rows; i++)
    {
        for (int j = 0; j < cols - 1; j++)   // stop at cols - 1 so j + 1 stays in bounds
        {
            if (tempImage.pixelVal[i][j] == 0)
                tempImage.pixelVal[i][j] = tempImage.pixelVal[i][j + 1];
        }
    }
    oldImage = tempImage;
}
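 
Because the forward mapping scatters source pixels, some destination pixels are never written, which is why the second loop patches zero pixels from their right-hand neighbour. An alternative is to invert the mapping so every destination pixel reads exactly one source pixel and no holes appear. The sketch below assumes the same Image members (N, M, Q, pixelVal) and inBounds helper used above; it is only an illustration, not the implementation submitted for the assignment.
 
// Sketch: inverse-mapping rotation (same Image interface assumed as above).
// For each destination pixel (r, c), rotate back by -theta to find its source pixel.
void Image::rotateImageInverse(int theta, Image& oldImage)
{
    int rows = oldImage.N;
    int cols = oldImage.M;
    Image tempImage(rows, cols, oldImage.Q);
    float rads = (theta * 3.14159265) / 180.0;
    int r0 = rows / 2;
    int c0 = cols / 2;
    for (int r = 0; r < rows; r++)
    {
        for (int c = 0; c < cols; c++)
        {
            // inverse rotation: cos(-x) = cos(x), sin(-x) = -sin(x)
            int rs = (int)(r0 + ((r - r0) * cos(rads)) + ((c - c0) * sin(rads)));
            int cs = (int)(c0 - ((r - r0) * sin(rads)) + ((c - c0) * cos(rads)));
            if (inBounds(rs, cs))
                tempImage.pixelVal[r][c] = oldImage.pixelVal[rs][cs];
        }
    }
    oldImage = tempImage;
}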
 
Parallelized Code:
 
Kernels
 
__device__ bool inBounds(int row, int col, int maxRow, int maxCol) {
    // true only if (row, col) falls inside the image
    return !(row >= maxRow || row < 0 || col >= maxCol || col < 0);
}

__global__ void rotateKernel(int* oldImage, int* newImage, int rows, int cols, float rads) {
    // one thread per source pixel: x indexes the row, y the column
    int r = blockIdx.x * blockDim.x + threadIdx.x;
    int c = blockIdx.y * blockDim.y + threadIdx.y;
    int r0 = rows / 2;   // centre of rotation
    int c0 = cols / 2;
    float sinRads = sinf(rads);
    float cosRads = cosf(rads);
    /* attempted shared-memory staging, left disabled:
    __shared__ int s[ntpb * ntpb];
    s[r * cols + c] = oldImage[r * cols + c]; */
    if (r < rows && c < cols)
    {
        // forward mapping: scatter the source pixel to its rotated destination
        int r1 = (int)(r0 + ((r - r0) * cosRads) - ((c - c0) * sinRads));
        int c1 = (int)(c0 + ((r - r0) * sinRads) + ((c - c0) * cosRads));
        if (inBounds(r1, c1, rows, cols))
        {
            newImage[r1 * cols + c1] = oldImage[r * cols + c];
        }
    }
}
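 
The host code in the next section calls a check() helper that is not shown in this excerpt. A minimal sketch of what such a wrapper might look like, assuming it simply prints the label with the CUDA error string and exits on failure (the real course header may define it differently):
 
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Sketch only: error-checking wrapper like the check() used in the host code below.
void check(const char* msg, cudaError_t err) {
    if (err != cudaSuccess) {
        std::fprintf(stderr, "%s: %s\n", msg, cudaGetErrorString(err));
        std::exit(EXIT_FAILURE);
    }
}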
 
Modified Function
 
void Image::rotateImage(int theta, Image& oldImage)
/* rotates the image by theta degrees (the user's input) around the centre of the image */
{
    int rows = oldImage.N;
    int cols = oldImage.M;
    Image tempImage(rows, cols, oldImage.Q);
    float rads = (theta * 3.14159265) / 180.0;
    // workspace start
    // number of blocks per dimension (assumes a square image)
    int nb = (rows + ntpb - 1) / ntpb;
    // allocate device memory for the source and destination images
    int* d_a;
    check("device a", cudaMalloc((void**)&d_a, rows * cols * sizeof(int)));
    int* d_b;
    check("device b", cudaMalloc((void**)&d_b, rows * cols * sizeof(int)));
    // copy the source image to the device
    // (pixelVal must be one contiguous rows * cols block for this to be valid)
    check("copy to d_a", cudaMemcpy(d_a, oldImage.pixelVal, rows * cols * sizeof(int), cudaMemcpyHostToDevice));
    //check("copy to d_b", cudaMemcpy(d_b, tempImage.pixelVal, rows * cols * sizeof(int), cudaMemcpyHostToDevice));
    // launch the execution configuration: a 2D grid of 2D blocks
    dim3 dGrid(nb, nb);
    dim3 dBlock(ntpb, ntpb);
    rotateKernel<<<dGrid, dBlock>>>(d_a, d_b, rows, cols, rads);
    // check for launch errors
    check("launch error: ", cudaPeekAtLastError());
    // synchronize the device and the host
    check("Synchronize ", cudaDeviceSynchronize());
    // copy the rotated image back to the host
    check("device copy to hc", cudaMemcpy(tempImage.pixelVal, d_b, rows * cols * sizeof(int), cudaMemcpyDeviceToHost));
    // deallocate device memory
    cudaFree(d_a);
    cudaFree(d_b);
    // reset the device
    cudaDeviceReset();
    // workspace end
    // forward mapping leaves holes; patch each unwritten (zero) pixel with its right-hand neighbour
    for (int i = 0; i < rows; i++)
    {
        for (int j = 0; j < cols - 1; j++)   // stop at cols - 1 so j + 1 stays in bounds
        {
            if (tempImage.pixelVal[i * cols + j] == 0)
                tempImage.pixelVal[i * cols + j] = tempImage.pixelVal[i * cols + j + 1];
        }
    }
    oldImage = tempImage;
}
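 
Note that the cudaMemcpy calls above copy pixelVal in a single transfer, which is only valid if pixelVal is one contiguous rows × cols block (as the i * cols + j indexing suggests) rather than an array of row pointers. A sketch of that layout, using the member names from the code above and treating everything else as an assumption:
 
// Sketch only: a flat, row-major pixel buffer that cudaMemcpy can copy in one call.
// N, M, Q and pixelVal are the member names used above; everything else is assumed.
// Copy control is omitted, but a real class needs deep-copy semantics because
// rotateImage assigns tempImage back to oldImage.
class Image {
public:
    int N, M, Q;     // rows, cols, maximum gray level
    int* pixelVal;   // N * M ints in one block; element (r, c) lives at r * M + c
    Image(int rows, int cols, int q)
        : N(rows), M(cols), Q(q), pixelVal(new int[rows * cols]()) {}
};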
 
{| class="wikitable"
! Function
! CPU-only (s)
! GPU + CPU (s)
! Speedup (%)
|-
| function1
| s
| s
| %
|-
| function2
| s
| s
| %
|}
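 
One way the times for the table above could be collected is with a std::chrono stopwatch around each call, comparing a CPU-only build against the GPU build. This is only a sketch; the image variable and the rotateImage(45, image) call in the usage comment are stand-ins, not the course's required harness.
 
#include <chrono>

// Sketch: a generic stopwatch for filling in the table above; the call being
// timed is whatever the wrapped lambda runs.
template <typename F>
double timeSeconds(F&& work) {
    auto start = std::chrono::steady_clock::now();
    work();                                   // run the function under test
    auto end = std::chrono::steady_clock::now();
    return std::chrono::duration<double>(end - start).count();
}

// usage (hypothetical): double cpuTime = timeSeconds([&]{ image.rotateImage(45, image); });
// speedup(%) = (cpuTime - gpuTime) / cpuTime * 100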
=== Assignment 3 ===