=== Assignment 3 ===
Optimization techniques used/attempted:
* Get rid of the for loop in the kernel and use 2D threading within blocks (see the launch sketch after this list)
* Use GPU constant memory for the Jacobi calculation constants
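A minimal sketch of the launch configuration the first point implies, reusing the ntpbXY and nBlocks names from the kernel snippet below (the host-side setup did not survive in this snippet, so treat this as an assumption):
<pre style="color: blue">
const int ntpbXY = 32;                      // threads per block along each dimension
int nBlocks = n / ntpbXY;                   // scale only along the n (column) dimension
dim3 dGrid(nBlocks);                        // one-dimensional grid of blocks
dim3 dBlock(ntpbXY, ntpbXY);                // two-dimensional 32x32 thread blocks
matrixKernel<<<dGrid, dBlock>>>(d_a, d_b);  // each thread handles one cell; no for loop
</pre>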
Because the grid of blocks is one-dimensional, each block works on a vertical strip of the domain and must read the columns owned by its immediate neighbours: the blocks at the far left and right of the grid need ghost cells on one side only, while all other blocks in the middle need ghost cells on both the left and right sides.
In A2 and A3 we only scaled the columns (we would have scaled the rows as well, but we mixed up the row indexing: the 2D version scales along the n dimension, while the 1D versions scale properly along the n dimension); the other dimension always stayed at 32. This made it easy for us to keep the grid of blocks one-dimensional.
If we had scaled the rows as well as the columns, we would most likely have needed a two-dimensional grid of blocks.
This would make the problem more difficult: we would then need not only ghost columns, but also ghost rows on the top and bottom of blocks. That would likely cause synchronization problems, because we would need to make sure that block execution was synchronized along two dimensions instead of just one.
[[File:dps915_gpusquad_2Dghostcells.png]]
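For reference, the two-dimensional grid of blocks described above would need a launch configuration roughly like this hypothetical sketch (we did not implement this):
<pre style="color: blue">
dim3 dGrid(n / ntpbXY, m / ntpbXY);  // 2D grid: blocks along both n and m
dim3 dBlock(ntpbXY, ntpbXY);         // 32x32 threads per block
// each block would now need ghost rows as well as ghost columns, and the
// boundary checks would have to guard both grid dimensions
</pre>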
<nowiki>****************</nowiki>
CODE FOR 1D GRID WITH 2D THREAD SETUP (kernel contains shared and global memory versions)
<nowiki>****************</nowiki>
<pre style="color: blue">
//reconstructed index setup: i, j, and tj were used but never defined in the
//original snippet. i walks the m (row) dimension within a block; j is the
//global index along the scaled n (column) dimension.
int i = threadIdx.y;
int j = blockIdx.x*blockDim.x + threadIdx.x;
int ti = threadIdx.x;
int tj = threadIdx.y;
int tij = threadIdx.x + ntpbXY*threadIdx.y;
//__shared__ float bShared[ntpbXY*(ntpbXY + 2)]; //1D alternative with a couple of ghost columns
int ij = i + d_m*j;

//GLOBAL MEMORY VERSION (COMMENT OUT SHARED SECTION BELOW TO RUN):==================================
//skip the two global boundary columns (first thread of the first block and
//last thread of the last block) and the boundary rows of each block
// if (!(blockIdx.x == 0 && threadIdx.x == 0 || blockIdx.x == (d_nBlocks - 1) && threadIdx.x == (d_ntpbXY - 1)) && threadIdx.y != 0 && threadIdx.y != (d_ntpbXY - 1)) {
//     float x = d_xmin + i*d_dx, y = d_ymin + j*d_dy;
//     float input = abs(x) > 0.5 || abs(y) > 0.5 ? 0 : 1;
//     a[ij] = (input + d_dxxinv*(b[ij - 1] + b[ij + 1])
//         + d_dyyinv*(b[ij - d_m] + b[ij + d_m]))*d_dcent;
// }

//SHARED MEMORY SETUP (COMMENT OUT FOR GLOBAL VERSION)
__shared__ float bShared[ntpbXY + 2][ntpbXY]; //two extra ghost columns
//NOTES ON SHARED INDEX OFFSET:
//the x offset is -1, so the current thread's cell in shared memory is bShared[ti + 1][tj]
__syncthreads();

//SHARED MEMORY VERSION:==========
</pre>
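The body of the shared-memory update did not make it into the snippet above. A minimal sketch of how it could look, under the same assumptions as the reconstructed indices above (each thread loads its own cell into bShared, and the edge threads of each block also fetch the ghost columns from the neighbouring blocks):
<pre style="color: blue">
//each thread copies its own cell into shared memory at the +1 x offset
bShared[ti + 1][tj] = b[ij];
//edge threads also fetch the ghost columns owned by the neighbouring blocks
if (ti == 0 && blockIdx.x != 0)
    bShared[0][tj] = b[ij - d_m];                 //left ghost column
if (ti == d_ntpbXY - 1 && blockIdx.x != d_nBlocks - 1)
    bShared[d_ntpbXY + 1][tj] = b[ij + d_m];      //right ghost column
__syncthreads();

//same stencil as the global version, but all reads of b come from shared memory
if (!(blockIdx.x == 0 && threadIdx.x == 0 || blockIdx.x == (d_nBlocks - 1) && threadIdx.x == (d_ntpbXY - 1))
    && threadIdx.y != 0 && threadIdx.y != (d_ntpbXY - 1)) {
    float x = d_xmin + i*d_dx, y = d_ymin + j*d_dy;
    float input = abs(x) > 0.5 || abs(y) > 0.5 ? 0 : 1;
    a[ij] = (input + d_dxxinv*(bShared[ti + 1][tj - 1] + bShared[ti + 1][tj + 1])
        + d_dyyinv*(bShared[ti][tj] + bShared[ti + 2][tj]))*d_dcent;
}
</pre>
Because Jacobi reads from b and writes to a, the ghost columns only need to be consistent with the previous iteration, so a single __syncthreads() after loading is enough; no cross-block synchronization is required within a kernel launch.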
<nowiki>****************</nowiki>
CODE FOR 1D BLOCK OF 1D THREADS WITH CONSTANT MEMORY: we tried simply replacing the global memory values that were used as calculation constants with constant memory, since these values never need to be modified by kernel logic.
<pre>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>

// Load standard libraries
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <fstream>
#include <cmath>
#include <chrono>
using namespace std;
</pre>
<pre style="color: blue">
__device__ __constant__ int d_m;
__device__ __constant__ int d_n;
__device__ __constant__ float d_xmin;
__device__ __constant__ float d_xmax;
__device__ __constant__ float d_ymin;
__device__ __constant__ float d_ymax;
__device__ __constant__ float d_pi;
__device__ __constant__ float d_omega;
__device__ __constant__ float d_dx;
__device__ __constant__ float d_dy;
__device__ __constant__ float d_dxxinv;
__device__ __constant__ float d_dyyinv;
__device__ __constant__ float d_dcent;
</pre>
<pre>
//the calculation constants that used to be kernel parameters
//(dcent, n, m, xmin, xmax, ymin, ymax, dx, dy, dxxinv, dyyinv)
//now come from constant memory
__global__ void matrixKernel(float* a, float* b) {
    //we are using block and thread indexes to replace some of the iteration logic
    int j = threadIdx.x + 1;
    if (j < d_n - 1) {
        for (int i = 1; i < d_m - 1; i++) {
            int ij = i + d_m*j;
            float x = d_xmin + i*d_dx, y = d_ymin + j*d_dy;
            float input = abs(x) > 0.5 || abs(y) > 0.5 ? 0 : 1;
            a[ij] = (input + d_dxxinv*(b[ij - 1] + b[ij + 1])
                + d_dyyinv*(b[ij - d_m] + b[ij + d_m]))*d_dcent;
        }
    }
}

// Set grid size and number of iterations
const int save_iters = 20;
const int total_iters = 5000;
const int error_every = 2;
const int m = 32, n = 1024;
const float xmin = -1, xmax = 1;
const float ymin = -1, ymax = 1;

// Compute useful constants
const float pi = 3.1415926535897932384626433832795;
const float omega = 2 / (1 + sin(2 * pi / n));
const float dx = (xmax - xmin) / (m - 1);
const float dy = (ymax - ymin) / (n - 1);
const float dxxinv = 1 / (dx*dx);
const float dyyinv = 1 / (dy*dy);
const float dcent = 1 / (2 * (dxxinv + dyyinv));

// Input function
inline float f(int i, int j) {
    float x = xmin + i*dx, y = ymin + j*dy;
    return abs(x) > 0.5 || abs(y) > 0.5 ? 0 : 1;
}

// Common output and error routine
void outputImage(const char* filename, float *a) {
    // Computes the error if sn%error_every==0
    // Saves the matrix if sn<=save_iters
    int i, j, ij = 0, ds = sizeof(float);
    float x, y, data_float;
    const char *pfloat;
    pfloat = (const char*)&data_float;
    ofstream outfile;
    static char fname[256];
    sprintf(fname, "%s.%d", filename, 101);
    outfile.open(fname, fstream::out | fstream::trunc | fstream::binary);
    data_float = m;
    outfile.write(pfloat, ds);
    for (i = 0; i < m; i++) {
        x = xmin + i*dx;
        data_float = x;
        outfile.write(pfloat, ds);
    }
    for (j = 0; j < n; j++) {
        y = ymin + j*dy;
        data_float = y;
        outfile.write(pfloat, ds);
        for (i = 0; i < m; i++) {
            data_float = a[ij++];
            outfile.write(pfloat, ds);
        }
    }
    outfile.close();
}

void dojacobi() {
    int ij, k;
    float *a, *b;
    float *u;
    float *v;
    u = new float[m*n];
    v = new float[m*n];

    // Set initial guess to be identically zero
    for (ij = 0; ij < m*n; ij++) u[ij] = v[ij] = 0;
    a = v;
    b = u;
    float* d_a;
    float* d_b;

    //allocate and copy the two grids to the device
    cudaMalloc((void**)&d_a, m*n * sizeof(float));
    cudaMalloc((void**)&d_b, m*n * sizeof(float));
    cudaMemcpy(d_a, a, n*m * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, b, n*m * sizeof(float), cudaMemcpyHostToDevice);

    int nblocks = n / 1024;
    dim3 dGrid(nblocks);
    dim3 dBlock(1024);

    // Carry out Jacobi iterations, swapping the source and destination
    // grids on alternate iterations
    for (k = 1; k <= total_iters; k++) {
        if (k % 2 == 0) {
            matrixKernel<<<dGrid, dBlock>>>(d_a, d_b);
            cudaDeviceSynchronize();
            if (cudaGetLastError()) { std::cout << "error"; }
        }
        else {
            matrixKernel<<<dGrid, dBlock>>>(d_b, d_a);
            cudaDeviceSynchronize();
            if (cudaGetLastError()) { std::cout << "error"; }
        }
    }
    cudaMemcpy(a, d_a, n*m * sizeof(float), cudaMemcpyDeviceToHost);
    outputImage("jacobi out", a);
    cudaFree(d_a);
    cudaFree(d_b);
    delete[] u;
    delete[] v;
}

int main() {
    //cudaMemcpyToSymbol takes the constant-memory symbol itself, not its
    //address; every constant the kernel reads (d_m, d_n, d_xmin, d_dx, d_dy,
    //d_dxxinv, d_dyyinv, d_dcent) must be copied before launch
    cudaMemcpyToSymbol(d_m, &m, sizeof(int));
    cudaMemcpyToSymbol(d_n, &n, sizeof(int));
    cudaMemcpyToSymbol(d_xmin, &xmin, sizeof(float));
    cudaMemcpyToSymbol(d_xmax, &xmax, sizeof(float));
    cudaMemcpyToSymbol(d_ymin, &ymin, sizeof(float));
    cudaMemcpyToSymbol(d_ymax, &ymax, sizeof(float));
    cudaMemcpyToSymbol(d_pi, &pi, sizeof(float));
    cudaMemcpyToSymbol(d_omega, &omega, sizeof(float));
    cudaMemcpyToSymbol(d_dx, &dx, sizeof(float));
    cudaMemcpyToSymbol(d_dy, &dy, sizeof(float));
    cudaMemcpyToSymbol(d_dxxinv, &dxxinv, sizeof(float));
    cudaMemcpyToSymbol(d_dyyinv, &dyyinv, sizeof(float));
    cudaMemcpyToSymbol(d_dcent, &dcent, sizeof(float));

    std::chrono::steady_clock::time_point ts, te;
    ts = std::chrono::steady_clock::now();
    dojacobi();
    te = std::chrono::steady_clock::now();
    std::chrono::steady_clock::duration duration = te - ts;
    auto ms = std::chrono::duration_cast<std::chrono::milliseconds>(duration);
    std::cout << "Parallel Code Time: " << ms.count() << " ms" << std::endl;
    cudaDeviceReset();
    return 0;
}
</pre>
<nowiki>****************</nowiki>
FINAL TIMINGS
<pre style="color: red">THE GRAPH IMMEDIATELY BELOW IS INCORRECT: there was an error recording the 1D runtimes for assignment 2</pre>
<nowiki>****************</nowiki>
[[File:dps915_gpusquad_a3chart.png]]
 
<nowiki>****************</nowiki>