Changes

Carlos

4,492 bytes removed, 12:53, 8 March 2013

→‎Progress

~~// sat.cpp~~ /* ~~---- Profiling Results for the summarizedAreaTable() function ------~~ ~~Word Problem Seconds~~ ~~250 1.50~~ ~~500 25.87~~ ~~750 173.99~~ ~~1000 658.34~~ ~~--------------------------------------------------------------------~~ */ <pre>

#include <iostream>

---- Profiling Results for the summarizedAreaTable() function ------

~~/* To print the results~~

Word Problem Seconds

~~cout << "a is: " << endl;~~ 250 1.50

~~for(int i = 0; i < size; i++){~~ 500 25.87

~~for(int j = 0; j < size; j++)~~ 750 173.99

~~cout << a[i][j] << " ";~~ 1000 658.34

~~cout << endl;~~

}--------------------------------------------------------------------

*/

~~=== Assignment 2 ===~~

~~#include~~ <~~iostream~~/pre>

~~#include <cstdlib>~~ ~~#include <cuda_runtime.h>~~ ~~using namespace std;~~ ~~/* Initializes the matrix to any random number between 0 and 1 */~~ ~~void initializeMatrix(float* a, int size){~~ ~~float f~~ = ~~1.0 / RAND_MAX;~~ ~~for(int i = 0; i < size * size; i++)~~ ~~a[i] = rand() * f;~~ } ~~/* Displays the matrix */~~ ~~void matrixDisplay(char matrix, float* a, int size){~~ ~~int i = 0;~~ ~~cout << matrix <<" is: " << endl;~~ ~~while(i < size*size){~~ ~~for(int j = 0; j < size; j++, i++)~~ ~~cout << a[i] << " ";~~ ~~cout << endl;~~ } } ~~/* Creates the Summed area table */~~ ~~__global__ void SummedAreaTable(float* A, float* B, int size){~~ ~~int idx = blockIdx.x * blockDim.x + threadIdx.x;~~ ~~float sum = 0;~~ ~~int rest;~~ ~~if(idx < (size*size)){~~ ~~if(idx < size)~~ ~~rest = idx;~~ ~~else{~~ ~~rest = idx - size;~~ ~~while(rest >= size)~~ ~~rest = rest - size;~~ } ~~for(int i = (size*size) - size + rest; i >= idx - rest; i -= size)~~ ~~for(int j = i, k = rest; k >= 0; j--, k--)~~ ~~sum += A[j];~~ ~~B[idx] = sum;~~ } } ~~int main(int argc, char* argv[]){~~ ~~if(argc~~ == Assignment 2~~){ // only one argument (program name + one argument) allowed~~ ~~int size~~ = ~~atoi(argv[1]);~~ ~~int tSize~~ = ~~size * size;~~ ~~int d;~~ ~~int nThreads;~~ ~~int mThreads;~~ ~~int Blocks;~~ ~~int Threads;~~ ~~float *a~~ = ~~new float[tSize];~~ ~~float *A;~~ ~~float *b = new float[tSize];~~ ~~float *B;~~ ~~cudaError_t error; // error handler~~ ~~/* Gets the maximum number of threads and blocks */~~ ~~cudaDeviceProp prop;~~ ~~cudaGetDevice(&d);~~ ~~cudaGetDeviceProperties(&prop, d);~~ ~~nThreads = prop.maxThreadsDim[0];~~ ~~mThreads = nThreads * prop.maxGridSize[0];~~ ~~/* Checks if the size of the matrix is less than the maximum number of threads */~~ ~~if((tSize) < nThreads){~~ ~~Blocks = 1;~~ ~~Threads = tSize;~~ } ~~/* Checks if the size of the matrix is greater than the maximum number of threads */~~ ~~else if((tSize) > nThreads){~~ ~~Blocks = (tSize + nThreads - 1) / 
nThreads;~~ ~~Threads = nThreads;~~ } ~~/* Checks if the size of the matrix is less than the maximum number of threads multiplied by the maximum number of blocks */~~ ~~else if((tSize) > mThreads){~~ ~~tSize = mThreads;~~ ~~Blocks = (tSize + nThreads - 1) / nThreads;~~ ~~Threads = nThreads;~~ } ~~dim3 dGrid(Blocks, Blocks, 1); // sets the grids~~ ~~dim3 dBlock(Threads, Threads, 1); // sets the blocks~~ ~~initializeMatrix(a,size); // initializes the matrix a~~ ~~error = cudaMalloc((void**)&A, tSize * sizeof(float)); // allocates memory on the device for matrix A;~~ ~~if (error != cudaSuccess) {~~ ~~cout << cudaGetErrorString(error) << endl;~~ } ~~error = cudaMalloc((void**)&B, tSize * sizeof(float)); // allocates memory on the device for matrix B;~~ ~~if (error != cudaSuccess) {~~ ~~cout << cudaGetErrorString(error) << endl;~~ } ~~error = cudaMemcpy(A, a, tSize * sizeof(float), cudaMemcpyHostToDevice); // copies the host matrix a into the device matrix A~~ ~~if (error != cudaSuccess) {~~ ~~cout << cudaGetErrorString(error) << endl;~~ } ~~/* Performs the SAT on the device on A and stores it on B */~~ ~~SummedAreaTable<<<Blocks,Threads>>>(A,B,size); // Does the SAT on a and stores it on b~~ ~~cudaDeviceSynchronize(); // synchronizes the host and the device~~ ~~error = cudaGetLastError();~~ ~~if (error != cudaSuccess) {~~ ~~cout << cudaGetErrorString(error) << endl;~~ ~~cudaFree(a);~~ ~~cudaFree(b);~~ ~~delete [] a;~~ ~~delete [] b;~~ ~~return 3;~~ } ~~/* copies device matrix B into host matrix b */~~ ~~error = cudaMemcpy(b, B, tSize * sizeof(float), cudaMemcpyDeviceToHost);~~ ~~if (error != cudaSuccess) {~~ ~~cout << cudaGetErrorString(error) << endl;~~ } ~~//matrixDisplay('a', a, size); //uncomment/comment to display/not display the matrix a~~ ~~//matrixDisplay('b', b, size); //uncomment/comment to display/not display the matrix b~~ ~~/* deallocates space in both host memory and device memory */~~ ~~cudaFree(A);~~ ~~cudaFree(B);~~ ~~delete [] a;~~ ~~delete [] b;~~ 
~~cudaDeviceReset();~~ ~~cout << "Finished" << endl;~~ ~~return 0;~~ } ~~else if(argc < 2){~~ ~~cout << "Please provide a size" << endl; // when no arguments are supplied~~ ~~return 0;~~ } ~~else{~~ ~~cout << "Only one size is allowed" << endl; // when more than one argument(the program name + one or more arguments) is supplied~~ ~~return 0;~~ } }

Carlos Javier Conejo Molero

1

edit

Changes

Carlos

Navigation menu

Personal tools

Namespaces

Variants

Views

More

Search

Navigation

get involved with CDOT

courses

course projects

links

Tools