// Single-block heap sort demo.
//
// The host generates random integers, stages them as floats, sorts them on the
// device with a (sequential) heap sort executed by one thread, and prints the
// data before and after sorting.
//
// NOTE(review): the original #include targets were lost when this file was
// extracted; the headers below are the ones this code actually needs.
#include <cstdlib>
#include <iostream>
#include <vector>

#include <cuda_runtime.h>
#include <device_launch_parameters.h>

// Threads per block, and the capacity (in floats) of the kernel's shared buffer.
const int ntpb = 512;

// Exchanges the two floats pointed to by a and b.
__host__ __device__ void swap(float *a, float *b) {
    const float c = *a;
    *a = *b;
    *b = c;
}

// Restores the max-heap property for the subtree rooted at index i of
// arr[0..size), assuming both child subtrees already satisfy it.
// Iterative (no recursion) so that it needs no per-thread device stack growth.
__host__ __device__ void heapify(float *arr, int size, int i) {
    for (;;) {
        int largest = i;
        const int l = (i << 1) + 1;  // left child
        const int r = l + 1;         // right child

        if (l < size && arr[l] > arr[largest]) {
            largest = l;
        }
        if (r < size && arr[r] > arr[largest]) {
            largest = r;
        }
        if (largest == i) {
            return;  // heap property holds here; nothing below was disturbed
        }

        swap(&arr[largest], &arr[i]);
        i = largest;  // keep sifting the displaced value down
    }
}

// Rearranges arr[0..size) into a max-heap. A size of 0 or 1 is a no-op
// apart from a harmless heapify call on the single element.
__host__ __device__ void buildHeap(float *arr, int size) {
    // (size - 2) / 2 is the index of the last node that has a child.
    for (int i = (size - 2) / 2; i >= 0; --i) {
        heapify(arr, size, i);
    }
}

// Sorts arr[0..size) into ascending order with heap sort.
// Sequential: must be called by exactly one thread for a given array.
__host__ __device__ void heapSortInPlace(float *arr, int size) {
    buildHeap(arr, size);
    for (int heapSize = size; heapSize > 1;) {
        // Move the current maximum behind the shrinking heap, then repair the root.
        swap(arr, &arr[heapSize - 1]);
        --heapSize;
        heapify(arr, heapSize, 0);
    }
}

// Sorts arr[0..size) ascending on the device.
//
// Launch layout: one block is sufficient; any additional blocks exit
// immediately because the algorithm is sequential and extra blocks would only
// race on the same data. Any blockDim.x works (staging loops stride by it).
// Shared memory: ntpb floats, statically allocated.
// When size <= ntpb the data is staged in shared memory; larger inputs are
// sorted directly in global memory so the shared buffer is never overrun.
__global__ void heapSortKernel(float *arr, int size) {
    __shared__ float s[ntpb];

    // Both conditions are uniform across a block, so returning before the
    // barriers below cannot leave part of a block waiting.
    if (blockIdx.x != 0 || size <= 0) {
        return;
    }

    const bool fitsInShared = size <= ntpb;

    if (fitsInShared) {
        // Only the first `size` elements exist in arr.
        for (int i = threadIdx.x; i < size; i += blockDim.x) {
            s[i] = arr[i];
        }
    }
    __syncthreads();  // staging must be complete before thread 0 sorts

    if (threadIdx.x == 0) {
        heapSortInPlace(fitsInShared ? s : arr, size);
    }
    __syncthreads();  // the sort must be complete before anyone copies out

    if (fitsInShared) {
        for (int i = threadIdx.x; i < size; i += blockDim.x) {
            arr[i] = s[i];
        }
    }
}

// Generates random numbers in [0, size) and assigns them to the array.
void fillArray(int *arr, int size) {
    for (int i = 0; i < size; i++) {
        arr[i] = rand() % size;
    }
}

// Prints the first `size` elements of arr on one line, space separated.
void print(int *arr, int size) {
    for (int i = 0; i < size; i++) {
        std::cout << arr[i] << " ";
    }
    std::cout << std::endl;
}

// Reports a failed CUDA runtime call and terminates the program.
static void checkCuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        std::cerr << "CUDA error (" << what << "): "
                  << cudaGetErrorString(status) << std::endl;
        std::exit(EXIT_FAILURE);
    }
}

// Usage: <program> <numOfElements>
int main(int argc, char *argv[]) {
    if (argc < 2) {
        std::cerr << "Usage: " << (argc > 0 ? argv[0] : "heapsort")
                  << " <numOfElements>" << std::endl;
        return 1;
    }

    const int numOfElements = atoi(argv[1]);
    if (numOfElements <= 0) {
        std::cerr << "numOfElements must be a positive integer" << std::endl;
        return 1;
    }

    // Size in bytes of the device buffer.
    const size_t size = static_cast<size_t>(numOfElements) * sizeof(float);

    // Host data: integers for generation/printing, floats for the device.
    // The kernel operates on float, so the values are converted rather than
    // having their int bit patterns reinterpreted as floats.
    // NOTE(review): values >= 2^24 are not exactly representable as float.
    std::vector<int> heapArray(numOfElements);
    fillArray(heapArray.data(), numOfElements);
    std::vector<float> staging(heapArray.begin(), heapArray.end());

    // Allocate the device vector and upload the input.
    float *d_A = NULL;
    checkCuda(cudaMalloc((void **)&d_A, size), "cudaMalloc");
    checkCuda(cudaMemcpy(d_A, staging.data(), size, cudaMemcpyHostToDevice),
              "cudaMemcpy host-to-device");

    std::cout << "Host Side: ";
    print(heapArray.data(), numOfElements);

    // The sort is sequential, so exactly one block is launched.
    heapSortKernel<<<1, ntpb>>>(d_A, numOfElements);
    checkCuda(cudaGetLastError(), "heapSortKernel launch");
    checkCuda(cudaDeviceSynchronize(), "heapSortKernel execution");

    checkCuda(cudaMemcpy(staging.data(), d_A, size, cudaMemcpyDeviceToHost),
              "cudaMemcpy device-to-host");
    for (int i = 0; i < numOfElements; ++i) {
        heapArray[i] = static_cast<int>(staging[i]);
    }

    std::cout << "Device Side: ";
    print(heapArray.data(), numOfElements);

    // Free device global memory; host vectors release themselves.
    checkCuda(cudaFree(d_A), "cudaFree");
    cudaDeviceReset();
    return 0;
}