//%%file TSort.cu #include #include #include #include #include #include "cublas_v2.h" #define MAX_DEPTH 16 #define INSERTION_SORT 32 __device__ void selection_sort( int *data, int left, int right ) { for( int i = left ; i <= right ; ++i ){ int min_val = data[i]; int min_idx = i; // Find the smallest value in the range [left, right]. for( int j = i+1 ; j <= right ; ++j ){ int val_j = data[j]; if( val_j < min_val ){ min_idx = j; min_val = val_j; } } // Swap the values. if( i != min_idx ){ data[min_idx] = data[i]; data[i] = min_val; } } } __global__ void simple_quicksort(int *data, int left, int right, int depth ){ //If we're too deep or there are few elements left, we use an insertion sort... if( depth >= MAX_DEPTH || right-left <= INSERTION_SORT ){ selection_sort( data, left, right ); return; } cudaStream_t s,s1; int *lptr = data+left; int *rptr = data+right; int pivot = data[(left+right)/2]; int lval, rval; int nright, nleft; // Do the partitioning. while (lptr <= rptr){ // Find the next left- and right-hand values to swap lval = *lptr; rval = *rptr; // Move the left pointer as long as the pointed element is smaller than the pivot. while (lval < pivot && lptr < data+right){ lptr++; lval = *lptr; } // Move the right pointer as long as the pointed element is larger than the pivot. while (rval > pivot && rptr > data+left){ rptr--; rval = *rptr; } // If the swap points are valid, do the swap! if (lptr <= rptr){ *lptr = rval; *rptr = lval; lptr++; rptr--; } } // Now the recursive part nright = rptr - data; nleft = lptr - data; // Launch a new block to sort the left part. if (left < (rptr-data)){ cudaStreamCreateWithFlags(&s, cudaStreamNonBlocking); simple_quicksort<<< 1, 1, 0, s >>>(data, left, nright, depth+1); cudaStreamDestroy(s); } // Launch a new block to sort the right part. if ((lptr-data) < right){ cudaStreamCreateWithFlags(&s1, cudaStreamNonBlocking); simple_quicksort<<< 1, 1, 0, s1 >>>(data, nleft, right, depth+1); cudaStreamDestroy(s1); } } void gpu_qsort(int *data, int n){ int* gpuData; int left = 0; int right = n-1; // Prepare CDP for the max depth 'MAX_DEPTH'. cudaDeviceSetLimit(cudaLimitDevRuntimeSyncDepth, MAX_DEPTH); // Allocate GPU memory. cudaMalloc((void**)&gpuData,n*sizeof(int)); cudaMemcpy(gpuData,data, n*sizeof(int), cudaMemcpyHostToDevice); // Launch on device simple_quicksort<<<1, 1>>>(gpuData, left, right, 0); cudaDeviceSynchronize(); // Copy back cudaMemcpy(data,gpuData, n*sizeof(int), cudaMemcpyDeviceToHost); cudaFree(gpuData); cudaDeviceReset(); } //Generates random numbers and assigns them to the array void fillArray(int* arr, int size) { for (int i = 0; i < size; i++) { arr[i] = rand() % size; } } void print(int *arr, int size) { for (int i = 0; i < size; i++) { std::cout << arr[i] << " "; } std::cout << std::endl; } int main(int argc, char *argv[]) { //Get the size of the array int n = std::atoi(argv[1]); // Create 6 arrays of size n and allocate memory for them int *qArray = new int[n]; //Fill the array with randomly generated numbers fillArray(qArray, n); //std::cout << "unordered" << std::endl; //print(qArray, n); std::cout << std::endl; //Call the sorting algorithms 1 by 1 with their respecive array gpu_qsort(qArray, n); std::cout << "Selection Sort performed." << std::endl; //print(qArray, n); //Deallocate the arrays delete[] qArray; //std::cout << cudaGetLastErrorr() << std::endl; return 0; }