BarraCUDA Boiz

Assignment 2
Set Samples kernel
setSamples - goes through the entire image and, for every pixel, copies its dimC channel values (converted to float) into that pixel's row of the samples matrix.

__global__ void setSamples(cv::cuda::PtrStepSz<float> samples, cv::cuda::PtrStepSz<uchar> img, int dimC) {
    // One thread per pixel: y comes from the grid's y dimension, x from its x dimension.
    // The dimC interleaved channel values of pixel (y, x) are converted to float and
    // stored in row (y * img.cols + x) of samples, one channel per column.
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    const int x = blockIdx.x * blockDim.x + threadIdx.x;

    // Only threads that map onto a real pixel do any work.
    if (y < img.rows && x < img.cols) {
        const int sampleRow = y * img.cols + x;
        const int firstChannel = x * dimC;   // offset of this pixel's first channel within the row
        int ch = 0;
        while (ch < dimC) {
            samples(sampleRow, ch) = static_cast<float>(img(y, firstChannel + ch));
            ++ch;
        }
    }
}
calculateDistance - for every sample, computes the squared distance to the most recently selected center (row k - 1 of centers), keeps the smallest distance seen so far in minval, and adds that minimum to the running total D.

__global__ void calculateDistance(cv::cuda::PtrStepSz<float> centers, cv::cuda::PtrStepSz<float> samples, int k, int N, int dim, double* minval, float* D) {
    // Compute the squared distance between the most recently sampled center
    // (row k - 1 of centers, so k must be at least 1) and every input sample.
    // minval[i] keeps the smallest distance seen so far for sample i, and the
    // (possibly updated) minimum of every sample is added to *D.
    //
    // *D is only ever added to, never reset here, so it holds a fresh sum only
    // if the caller clears it before the launch. Because the additions are
    // atomic and arrive in no fixed order, the float total may differ in the
    // last bits from run to run.
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    int row = blockIdx.y * blockDim.y + threadIdx.y;

    // Flatten the 2D thread position using the real number of threads per grid
    // row. Using N as the pitch would push every thread with row >= 1 past N,
    // silently skipping samples whenever the x dimension alone does not cover N.
    // 64-bit arithmetic keeps row * pitch from overflowing on large grids.
    long long threadsPerRow = (long long)gridDim.x * blockDim.x;
    long long flat = (long long)col + (long long)row * threadsPerRow;
    if (flat >= N) return;
    int i = (int)flat;

    double dist = 0.0;
    for (int d = 0; d<dim; d++) {
        double diff = centers(k - 1, d) - samples(i, d);
        dist += diff * diff;
    }

    // Update nearest distance if it is smaller than previous ones.
    double best = minval[i];
    if (dist < best) {
        best = dist;
        minval[i] = best;
    }

    // Every thread contributes to the same address; a plain "+=" here is a
    // data race that loses updates, so the accumulation must be atomic.
    atomicAdd(D, (float)best);
}
generateImage - builds the output image: for every pixel it looks up the pixel's cluster in indices and writes the channel values of that cluster's center into the output image "out".
__global__ void generateImage(cv::cuda::PtrStepSz<uchar> out, cv::cuda::PtrStepSz<int> indices, cv::cuda::PtrStepSz<float> centers, int dim) {
    // Generate output image.
    // One thread per pixel: the pixel's entry in indices (row y * out.cols + x,
    // column 0) selects a row of centers, and that row's dim values are cast to
    // uchar and written into the pixel's interleaved channels in out.
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    const int x = blockIdx.x * blockDim.x + threadIdx.x;

    const bool insideImage = (y < out.rows) && (x < out.cols);
    if (!insideImage) return;

    const int centerRow = indices(y * out.cols + x, 0);
    const int firstChannel = x * dim;   // offset of this pixel's first channel within the row
    int ch = 0;
    while (ch < dim) {
        out(y, firstChannel + ch) = static_cast<uchar>(centers(centerRow, ch));
        ++ch;
    }
}
After programming these kernels, we noticed an improvement in performance.
==== Conclusion ====
This program can be improved further by offloading more operations from the CPU to the GPU, but this will require more time and research.
=== Assignment 3 ===

