Difference between revisions of "GPU610/TeamKappa"

From CDOT Wiki
Jump to: navigation, search
(Assignment 1)
(Assignment 2)
Line 175: Line 175:
  
 
== Assignment 2 ==
 
== Assignment 2 ==
 +
 +
=== Image Manipulation: Rotation ===
 +
 +
==== Benchmark ====
 +
 +
{| class="wikitable" border="1"
 +
! image size !! algorithm time - gpu !! full time - gpu !! algorithm time - scalar !! full time - scalar
 +
|-
 +
| 500 x 600 || 109ms || 117ms || 119ms || 127ms
 +
|-
 +
| 800 x 800 || 113ms || 127ms || 254ms || 256ms
 +
|-
 +
| 1600 x 900 || 122ms || 168ms || 567ms || 592ms
 +
|-
 +
| 1920 x 1080 || 130ms || 203ms || 930ms || 972ms
 +
|-
 +
| 2747 x 1545 || 153ms || 293ms || 1704ms || 1783ms
 +
|}
 +
 +
[[Image:Gpu610_rotateprofile.jpg|481px| ]]
 +
 +
==== Scalar Code ====
 +
 +
void Image::rotateImage(int theta, Image& source) {
 +
 +
    steady_clock::time_point first_start;
 +
    first_start = steady_clock::now();
 +
 +
    int rows = source.N;
 +
    int cols = source.M;
 +
 +
    Image temp(rows, cols, source.Q);
 +
 +
    steady_clock::time_point second_start;
 +
    second_start = steady_clock::now();
 +
 +
    float rads = (theta * 3.14159265) / 180.0;
 +
 +
    for (int r = 0; r < rows; r++) {
 +
 +
        for (int c = 0; c < cols; c++) {
 +
 +
            int new_row = (int)(rows / 2 + ((r - rows / 2) * cos(rads)) - ((c - cols / 2) * sin(rads)));
 +
            int new_col = (int)(cols / 2 + ((r - rows / 2) * sin(rads)) + ((c - cols / 2) * cos(rads)));
 +
 +
            if (inBounds(new_row, new_col)) {
 +
 +
                temp.pixelVal[new_row][new_col] = source.pixelVal[r][c];
 +
 +
            }
 +
 +
        }
 +
 +
    }
 +
 +
    profile("rotate - cuda", steady_clock::now() - second_start);
 +
 +
    for (int r = 0; r < rows; r++) {
 +
 +
        for (int c = 0; c < cols; c++) {
 +
 +
            if (temp.pixelVal[r][c] == 0) {
 +
 +
                temp.pixelVal[r][c] = temp.pixelVal[r][c + 1];
 +
 +
            }
 +
 +
        }
 +
 +
    }
 +
 +
    source = temp;
 +
 +
    profile("rotate - full", steady_clock::now() - first_start);
 +
 +
}
 +
 +
==== GPU Code ====
 +
 +
__global__ void kernel_rotate(int * old_image, int * temp_image, float rads, int rows, int cols) {
 +
 +
    int index = blockIdx.x * blockDim.x + threadIdx.x;
 +
 +
    if (index > rows * cols) {
 +
 +
        return;
 +
 +
    }
 +
 +
    int row = index % rows;
 +
    int col = index / rows;
 +
 +
    int new_row = (int)(rows / 2 + ((row - rows / 2) * cos(rads)) - ((col - cols / 2) * sin(rads)));
 +
    int new_col = (int)(cols / 2 + ((row - rows / 2) * sin(rads)) + ((col - cols / 2) * cos(rads)));
 +
 +
    if (!(new_row >= rows || new_row < 0 || new_col >= cols || new_col < 0)) {
 +
 +
        temp_image[rows * new_col + new_row] = old_image[index];
 +
 +
    }
 +
 +
}
 +
 +
void Image::rotateImage(int theta, Image & source) {
 +
 +
    steady_clock::time_point first_start;
 +
    first_start = steady_clock::now();
 +
 +
    int rows = source.N;
 +
    int cols = source.M;
 +
 +
    int nb = (rows * cols + ntpb - 1) / ntpb;
 +
 +
    int * d_temp_image;
 +
    int * d_old_image;
 +
 +
    int * h_temp_image = new int[rows * cols];
 +
    int * h_old_image = new int[rows * cols];
 +
 +
    for (int r = 0; r < rows; r++) {
 +
 +
        for (int c = 0; c < cols; c++) {
 +
 +
            h_old_image[rows * c + r] = source.pixelVal[r][c];
 +
        }
 +
 +
    }
 +
 +
    steady_clock::time_point second_start;
 +
    second_start = steady_clock::now();
 +
 +
    cudaMalloc((void**)&d_old_image, rows * cols * sizeof(int));
 +
 +
    if (!d_old_image) {
 +
 +
        cout << "CUDA: out of memory (d_old_image)" << endl;
 +
        return;
 +
 +
    }
 +
 +
    cudaMalloc((void**)&d_temp_image, rows * cols * sizeof(int));
 +
 +
    if (!d_temp_image) {
 +
 +
        cout << "CUDA: out of memory (d_temp_image)" << endl;
 +
        return;
 +
 +
    }
 +
 +
    cudaMemcpy(d_old_image, h_old_image, rows * cols * sizeof(int), cudaMemcpyHostToDevice);
 +
 +
    dim3 dGrid(nb);
 +
    dim3 dBlock(ntpb);
 +
 +
    kernel_rotate <<<dGrid, dBlock>>>(d_old_image, d_temp_image, (theta * 3.14159265) / 180.0, rows, cols);
 +
 +
    cudaDeviceSynchronize();
 +
 +
    cudaMemcpy(h_temp_image, d_temp_image, rows * cols * sizeof(int), cudaMemcpyDeviceToHost);
 +
 +
    profile("rotate - cuda", steady_clock::now() - second_start);
 +
 +
    for (int r = 0; r < rows; r++) {
 +
 +
        for (int c = 0; c < cols; c++) {
 +
 +
            if (h_temp_image[rows * c + r] == 0 && c + 1 < cols)
 +
                source.pixelVal[r][c] = h_temp_image[rows * (c + 1) + r];
 +
            else
 +
                source.pixelVal[r][c] = h_temp_image[rows * c + r];
 +
 +
        }
 +
 +
    }
 +
 +
    profile("rotate - full", steady_clock::now() - first_start);
 +
 +
}
 +
 
== Assignment 3 ==
 
== Assignment 3 ==

Revision as of 17:00, 7 November 2015


GPU610/DPS915 | Student List | Group and Project Index | Student Resources | Glossary


Team Kappa

Team Members

  1. Ryan Mullings, Team Member
  2. Andy Cooc, Team Member
  3. Matt Jang, Team Member

Email All

Assignment 1

Program 1: RSA Encryption

Code From Github: link

Profiling

  • gprof: Encryption 10MB
  %   cumulative   self              self     total           
 time   seconds   seconds    calls   s/call   s/call  name    
 60.48      1.50     1.50  3495254     0.00     0.00  fastModularExp(unsigned long long, unsigned long long, unsigned long long)
 32.66      2.31     0.81  3495254     0.00     0.00  findHighestSetBit(unsigned long long)
  4.44      2.42     0.11        1     0.11     2.48  encryption(std::string, std::string, std::string)
  2.42      2.48     0.06  3495254     0.00     0.00  intToByteArray(int, char*)
  0.00      2.48     0.00        1     0.00     0.00  _GLOBAL__sub_I__Z10encryptionSsSsSs
  0.00      2.48     0.00        1     0.00     0.00  _GLOBAL__sub_I_main
  0.00      2.48     0.00        1     0.00     0.00  __static_initialization_and_destruction_0(int, int)
  0.00      2.48     0.00        1     0.00     0.00  __static_initialization_and_destruction_0(int, int)
  • gprof: Encryption 20MB
  %   cumulative   self              self     total           
 time   seconds   seconds    calls   s/call   s/call  name    
 63.65      3.27     3.27  6990507     0.00     0.00  fastModularExp(unsigned long long, unsigned long long, unsigned long long)
 29.04      4.75     1.49  6990507     0.00     0.00  findHighestSetBit(unsigned long long)
  3.90      4.96     0.20        1     0.20     5.13  encryption(std::string, std::string, std::string)
  3.41      5.13     0.17  6990507     0.00     0.00  intToByteArray(int, char*)
  0.00      5.13     0.00        1     0.00     0.00  _GLOBAL__sub_I__Z10encryptionSsSsSs
  0.00      5.13     0.00        1     0.00     0.00  _GLOBAL__sub_I_main
  0.00      5.13     0.00        1     0.00     0.00  __static_initialization_and_destruction_0(int, int)
  0.00      5.13     0.00        1     0.00     0.00  __static_initialization_and_destruction_0(int, int)
  • gprof: Encryption 30MB
  %   cumulative   self              self     total           
 time   seconds   seconds    calls   s/call   s/call  name    
 68.04      5.11     5.11 10485760     0.00     0.00  fastModularExp(unsigned long long, unsigned long long, unsigned long long)
 25.23      7.00     1.90 10485760     0.00     0.00  findHighestSetBit(unsigned long long)
  3.99      7.30     0.30        1     0.30     7.51  encryption(std::string, std::string, std::string)
  2.73      7.51     0.20 10485760     0.00     0.00  intToByteArray(int, char*)
  0.00      7.51     0.00        1     0.00     0.00  _GLOBAL__sub_I__Z10encryptionSsSsSs
  0.00      7.51     0.00        1     0.00     0.00  _GLOBAL__sub_I_main
  0.00      7.51     0.00        1     0.00     0.00  __static_initialization_and_destruction_0(int, int)
  0.00      7.51     0.00        1     0.00     0.00  __static_initialization_and_destruction_0(int, int)
  • gprof: Decryption 10MB
  %   cumulative   self              self     total           
 time   seconds   seconds    calls   s/call   s/call  name    
 69.80      1.71     1.71  3495254     0.00     0.00  fastModularExp(unsigned long long, unsigned long long, unsigned long long)
 21.63      2.24     0.53  3495254     0.00     0.00  findHighestSetBit(unsigned long long)
  6.12      2.39     0.15        1     0.15     2.45  decryption(std::string, std::string, std::string)
  2.45      2.45     0.06  3495254     0.00     0.00  intToByteArray(int, char*)
  0.00      2.45     0.00        1     0.00     0.00  _GLOBAL__sub_I__Z10encryptionSsSsSs
  0.00      2.45     0.00        1     0.00     0.00  _GLOBAL__sub_I_main
  0.00      2.45     0.00        1     0.00     0.00  __static_initialization_and_destruction_0(int, int)
  0.00      2.45     0.00        1     0.00     0.00  __static_initialization_and_destruction_0(int, int)
  • gprof: Decryption 20MB
  %   cumulative   self              self     total           
 time   seconds   seconds    calls   s/call   s/call  name    
 64.11      3.38     3.38  6990507     0.00     0.00  fastModularExp(unsigned long long, unsigned long long, unsigned long long)
 27.18      4.82     1.44  6990507     0.00     0.00  findHighestSetBit(unsigned long long)
  4.73      5.07     0.25        1     0.25     5.28  decryption(std::string, std::string, std::string)
  3.98      5.28     0.21  6990507     0.00     0.00  intToByteArray(int, char*)
  0.00      5.28     0.00        1     0.00     0.00  _GLOBAL__sub_I__Z10encryptionSsSsSs
  0.00      5.28     0.00        1     0.00     0.00  _GLOBAL__sub_I_main
  0.00      5.28     0.00        1     0.00     0.00  __static_initialization_and_destruction_0(int, int)
  0.00      5.28     0.00        1     0.00     0.00  __static_initialization_and_destruction_0(int, int)
  • gprof: Decryption 30MB
  %   cumulative   self              self     total           
 time   seconds   seconds    calls   s/call   s/call  name    
 66.88      5.25     5.25 10485760     0.00     0.00  fastModularExp(unsigned long long, unsigned long long, unsigned long long)
 24.59      7.18     1.93 10485760     0.00     0.00  findHighestSetBit(unsigned long long)
  5.73      7.63     0.45        1     0.45     7.85  decryption(std::string, std::string, std::string)
  2.80      7.85     0.22 10485760     0.00     0.00  intToByteArray(int, char*)
  0.00      7.85     0.00        1     0.00     0.00  _GLOBAL__sub_I__Z10encryptionSsSsSs
  0.00      7.85     0.00        1     0.00     0.00  _GLOBAL__sub_I_main
  0.00      7.85     0.00        1     0.00     0.00  __static_initialization_and_destruction_0(int, int)
  0.00      7.85     0.00        1     0.00     0.00  __static_initialization_and_destruction_0(int, int)

Bottleneck Code

The two slowest parts of this encryption method are fastModularExp and findHighestSetBit. A third function, intToByteArray, takes up a relatively small amount of time but may still be able to be optimized.

unsigned int fastModularExp(ULong a, ULong b, ULong c) {
    ULong result = 1;
    ULong leadingbit = findHighestSetBit(b); // Heighest set bit
    while(leadingbit > 0){ //while there are bits left
        result = ((result*result) % c); //case 1: bit is a 0
        if((b & leadingbit) > 0){
            result = ((result * a) % c); //case 2: if bit is a 1
        }
        leadingbit = leadingbit >> 1;
    }
    return (unsigned int)result;
}
ULong findHighestSetBit(ULong num){
    ULong result = 0;
    for(int i = 63; i >= 0; i--){
        if(num & (1ULL << i)){
            result = 1ULL << i;
            return result;
        }
    }
    return result;
}
byte* intToByteArray(int num, byte *result){
    for(int i = 0; i < 4; i++){
        result[i] = (num & (0xFF << (8 *(3-i)))) >> (8 *(3-i));
    }
    return result;
}

At first glance, the fastest function is also the function that appears like it would be the easiest to run on a GPU. The other code does not look like it can be optimized with a GPU easily as it does not use large arrays of N size. However, after some short internet research, several documents turned up on the topic of using a GPU to make RSA faster (example: here and here). This leaves me hopeful that in the further stages of this assignment that this could be an interesting program to work with.

Ryan Mullings: Program 2-Image Manipulation Processor

I decided to profile an Image manipulation program since it had many different functions that I could play around with. Also depending on the number of files you enter as arguments the processor will display different options Source Code:link to dream in code

Sample Run

File

Code for some of the manipulations

File


Profiles

  • Profile for shrinking image by 2
  %   cumulative                   self     total           
 time   seconds    calls   s/call   s/call  name    
 40.22      5.75     2     0.00     0.00  Image(Image const&)
 27.53      3.32     1     0.00     0.00  readImage(char*, Image&)
 13.23      2.21     1     0.00     2.48  shrinkImage(int, Image&)
 12.67      2.11     1     0.00     0.00  Image(int, int, int)
  6.32      2.48     1     0.00     0.00  writeImage(char*, Image&)
  • Profile for Expanding image by 3 and Rotating by 20 degrees
  %   cumulative                   self     total           
 time   seconds    calls   s/call   s/call  name    
 64.28      11.23    1     0.00     0.00  rotateImage(Image const&)
 16.89      4.51     1     0.00     0.00  enlargeImage(char*, Image&)
 13.23      2.21     1     0.00     0.00  writeImage(int, Image&)

Essentially, all of the functions are the main hotspots from the test runs.


Program 3: Serial pi Calculation - C version

I've decided to profile the calculate (estimate) of pi using a "dartboard" algorithm.

work in progress...

Source Code:link title

Assignment 2

Image Manipulation: Rotation

Benchmark

image size algorithm time - gpu full time - gpu algorithm time - scalar full time - scalar
500 x 600 109ms 117ms 119ms 127ms
800 x 800 113ms 127ms 254ms 256ms
1600 x 900 122ms 168ms 567ms 592ms
1920 x 1080 130ms 203ms 930ms 972ms
2747 x 1545 153ms 293ms 1704ms 1783ms

Gpu610 rotateprofile.jpg

Scalar Code

void Image::rotateImage(int theta, Image& source) {

    steady_clock::time_point first_start;
    first_start = steady_clock::now();

    int rows = source.N;
    int cols = source.M;

    Image temp(rows, cols, source.Q);

    steady_clock::time_point second_start;
    second_start = steady_clock::now();

    float rads = (theta * 3.14159265) / 180.0;

    for (int r = 0; r < rows; r++) {

        for (int c = 0; c < cols; c++) {

            int new_row = (int)(rows / 2 + ((r - rows / 2) * cos(rads)) - ((c - cols / 2) * sin(rads)));
            int new_col = (int)(cols / 2 + ((r - rows / 2) * sin(rads)) + ((c - cols / 2) * cos(rads)));

            if (inBounds(new_row, new_col)) {

                temp.pixelVal[new_row][new_col] = source.pixelVal[r][c];

            }

        }

    }

    profile("rotate - cuda", steady_clock::now() - second_start);

    for (int r = 0; r < rows; r++) {

        for (int c = 0; c < cols; c++) {

            if (temp.pixelVal[r][c] == 0) {

                temp.pixelVal[r][c] = temp.pixelVal[r][c + 1];

            }

        }

    }

    source = temp;

    profile("rotate - full", steady_clock::now() - first_start);

}

GPU Code

__global__ void kernel_rotate(int * old_image, int * temp_image, float rads, int rows, int cols) {

    int index = blockIdx.x * blockDim.x + threadIdx.x;

    if (index > rows * cols) {

        return;

    }

    int row = index % rows;
    int col = index / rows;

    int new_row = (int)(rows / 2 + ((row - rows / 2) * cos(rads)) - ((col - cols / 2) * sin(rads)));
    int new_col = (int)(cols / 2 + ((row - rows / 2) * sin(rads)) + ((col - cols / 2) * cos(rads)));

    if (!(new_row >= rows || new_row < 0 || new_col >= cols || new_col < 0)) {

        temp_image[rows * new_col + new_row] = old_image[index];

    }

}
void Image::rotateImage(int theta, Image & source) {

    steady_clock::time_point first_start;
    first_start = steady_clock::now();

    int rows = source.N;
    int cols = source.M;

    int nb = (rows * cols + ntpb - 1) / ntpb;

    int * d_temp_image;
    int * d_old_image;

    int * h_temp_image = new int[rows * cols];
    int * h_old_image = new int[rows * cols];

    for (int r = 0; r < rows; r++) {

        for (int c = 0; c < cols; c++) {

            h_old_image[rows * c + r] = source.pixelVal[r][c];
        }

    }

    steady_clock::time_point second_start;
    second_start = steady_clock::now();

    cudaMalloc((void**)&d_old_image, rows * cols * sizeof(int));

    if (!d_old_image) {

        cout << "CUDA: out of memory (d_old_image)" << endl;
        return;

    }

    cudaMalloc((void**)&d_temp_image, rows * cols * sizeof(int));

    if (!d_temp_image) {

        cout << "CUDA: out of memory (d_temp_image)" << endl;
        return;

    }

    cudaMemcpy(d_old_image, h_old_image, rows * cols * sizeof(int), cudaMemcpyHostToDevice);

    dim3 dGrid(nb);
    dim3 dBlock(ntpb);

    kernel_rotate <<<dGrid, dBlock>>>(d_old_image, d_temp_image, (theta * 3.14159265) / 180.0, rows, cols);

    cudaDeviceSynchronize();

    cudaMemcpy(h_temp_image, d_temp_image, rows * cols * sizeof(int), cudaMemcpyDeviceToHost);

    profile("rotate - cuda", steady_clock::now() - second_start);

    for (int r = 0; r < rows; r++) {

        for (int c = 0; c < cols; c++) {

            if (h_temp_image[rows * c + r] == 0 && c + 1 < cols)
                source.pixelVal[r][c] = h_temp_image[rows * (c + 1) + r];
            else
                source.pixelVal[r][c] = h_temp_image[rows * c + r];

        }

    }

    profile("rotate - full", steady_clock::now() - first_start);

}

Assignment 3