Changes

Ghost Cells

10,552 bytes added, 02:03, 7 April 2019

→‎Assignment 3

=== Assignment 3 ===

==== Source Codes ====

{| class="wikitable mw-collapsible mw-collapsed"

! PCIe Optimization

</source>

|}

{| class="wikitable mw-collapsible mw-collapsed"

! For-loop Optimization

|-

|

/*

* Poisson Method using two arrays.

* Non-Ghost Cells Method

* Multiple PCIe Calls made, once per iteration

* by Tony Sim

*/

#include <cstring>

#include <cstdlib>

#include <iomanip>

#include <iostream>

#include <string>

#include <cuda_runtime.h>

#include "poisson-alt-ghost2.cuh"

namespace DPS{

Poisson::Poisson(std::ifstream& ifs) {

blockx = 32;

blocky = 32;

std::string line;

nColumns = 0;

bufferSide = 0;

nRowsTotal = 0;

/* find number of columns */

std::getline(ifs,line);

for (size_t i = 0 ; i < line.size() ; i++){

if(line[i]==' ') nColumns++;

}

nColumns++;

/* find number of rows */

nRowsTotal++; /* already fetched one */

while(std::getline(ifs,line))

nRowsTotal++;

ifs.clear();

int sizeX = ((nColumns + 2 + blockx + 2 - 1)/(blockx+2))*(blockx+2);

int sizeY = ((nRowsTotal + 2 + blocky + 2 - 1)/(blocky+2))*(blocky+2);

bufferSize = sizeX * sizeY;

std::cout << "Allocate initial memory" << std::endl;

try{

h_data = new float[ bufferSize ]; /* add edge buffers */

}

catch (...){

throw std::runtime_error("Failed to Allocate Memory");

}

/* readin data */

std::cout <<"Reading in data"<<std::endl;

ifs.seekg(0,ifs.beg);

/* allocate memory to all but the edge buffer, index 0 and max for each row and column */

std::memset(h_data,0,bufferSize);

for (size_t i = 0 ; i < nRowsTotal+2 ; i++){

for (size_t j = 0 ; j < nColumns+2 ; j++){

float val = 0;

if(!(i == 0 || i == nRowsTotal + 1 || j == 0 || j == nColumns + 1))

ifs >> val;

h_data[i*(nColumns+2)+j] = val;

}

std::cout <<"Setting buffer"<<std::endl;

bool state = devMemSet();

}

Poisson::Poisson(const size_t r, const size_t c, float* d) {

bufferSide = 0;

nRowsTotal = r;

nColumns = c;

try{

h_data = new float[(r+2)*(c+2)];

}

catch (...){

throw std::runtime_error("Failed to Allocate Memory");

}

std::memcpy(h_data,d,(r+2)*(c+2)*sizeof(float));

devMemSet();

}

Poisson::~Poisson(){

delete [] h_data;

cudaFree(d_data);

cudaDeviceReset();

}

bool Poisson::devMemSet(){

/* create double buffer */

cudaMalloc(&d_data, bufferSize * sizeof(float));

if(d_data != nullptr){

/* copy the initial information to the first buffer */

cudaError_t state = cudaMemcpy((void*)d_data,(const void*)h_data, bufferSize * sizeof(float),cudaMemcpyHostToDevice);

if(state != cudaSuccess)

std::cerr << "ERROR on devMemSet at cudaMemcpy : " << cudaGetErrorString(state)<< std::endl;

}

return d_data;

}

float* Poisson::operator()(const size_t nIterations, const float wx, const float wy){

/* calculate the grid, block, where block has 1024 threads total */

unsigned int gridx = ((nRowsTotal+2)+blockx-1)/blockx;

unsigned int gridy = ((nRowsTotal+2)+blocky-1)/blocky;

/* create dim3 */

dim3 dBlock= {blockx,blocky};

dim3 dGrid = {gridx,gridy};

/* generate shared memory map that will control ghost cell sharing */

char* hmap = new char[(blockx+2)*(blocky+2)*3];

int stride = 3;

for(int i = 0 ; i < (blockx+2);i++){

for(int j = 0 ; j < (blocky+2);j++){

char val = 0;

char x = 0;

char y = 0;

if(i==1){

val = 1;

x=-1;

y=0;

}

if(j==1){

val = 1;

x=0;

y=-1;

}

if(i==blockx) {

val = 1;

x=1;

y=0;

}

if(j==blocky){

val = 1;

x=0;

y=1;

}

if(i==2 || j==2 || i==31 || j==31)

val = 2;

hmap[(i * (blockx+2) + j)*stride] = val;

hmap[(i * (blockx+2) + j)*stride+1] = x;

hmap[(i * (blockx+2) + j)*stride+2] = y;

}

/* transfer to device */

char* dmap = nullptr;

cudaMalloc(&dmap,(blockx+2)*(blocky+2)*sizeof(char)*3);

cudaMemcpy(dmap,hmap,(blockx+2)*(blocky+2)*sizeof(char)*3,cudaMemcpyHostToDevice);

/* run iterations */

update<<<dGrid,dBlock>>>(d_data,dmap,nColumns, nRowsTotal, wx, wy,nIterations);

/*DEBUG */ h_data[2*(nColumns+2)+2] = 100.0f;

/* output results from device to host */

cudaError_t state = cudaMemcpy(h_data,d_data,(nColumns+2)*(nRowsTotal+2)*sizeof(float),cudaMemcpyDeviceToHost);

if(state != cudaSuccess)

std::cout << "ERROR on () when copying data back to host with : " << cudaGetErrorString(state)<< std::endl;

return h_data;

}

void Poisson::show(std::ostream& ofs) const{

ofs << std::fixed << std::setprecision(1);

for (size_t j = 1; j <= nColumns ; j++) {

for (size_t i = 1 ; i <= nRowsTotal ; i++)

ofs << std::setw(8) << h_data[i * (nColumns+2) + j]<<",";

ofs << std::endl;

}

__global__ void update (float* data, char* dmap, int nCol, int nRow, const float wx, const float wy, unsigned int nIterations){

size_t j = blockDim.x * blockIdx.x + threadIdx.x + 1; /* for x axis */

size_t i = blockDim.y * blockIdx.y + threadIdx.y + 1; /* for y axis */

size_t y = threadIdx.x+1;

size_t x = threadIdx.y+1;

const unsigned int bufferSize = (32+2)*(32+2);

__shared__ float localBuffer[ 2 * bufferSize ]; /* double local buffer with ghost cells */

// __shared__ char lmap[bufferSize];

unsigned int buffer = 0;

float prefetch = 0.0f;

/* copy information into first of the local buffer */

localBuffer[x*(32+2)+y] = data[i*(nCol+2)+j];

__syncthreads();

const char lmap = dmap[(x*(32+2)+y)*3];

const char addx = dmap[(x*(32+2)+y)*3+1];

const char addy = dmap[(x*(32+2)+y)*3+2];

/* prefetch */

if(lmap)

prefetch = data[(i+addx)*(nCol+2)+j+addy] ;

/* run iterations */

for (unsigned int n = 0 ; n < nIterations; n++){

if(lmap)

localBuffer[buffer * bufferSize + (x+addx)*(32+2) + y+addy] = prefetch;

/* Calculate and store into the other buffer */

float curr = localBuffer[buffer*bufferSize + x * (32+2)+ y];

float dir1 = localBuffer[buffer*bufferSize + (x+1) * (32+2) +y];

float dir2 = localBuffer[buffer*bufferSize + (x-1) * (32+2) +y];

float dir3 = localBuffer[buffer*bufferSize + x * (32+2) + y + 1];

float dir4 = localBuffer[buffer*bufferSize + x * (32+2) + y - 1];

localBuffer[(1-buffer)*bufferSize + x*(32+2)+y] = curr + wx*(dir1+dir2-2.0f*curr) + wy*(dir3+dir4-2.0f*curr);

/* flip buffer */

buffer = 1-buffer;

/* for threads in charge of edges, share and obtain ghost cells */

if(lmap){

/* Copy over edges to global memory to be shared with neighboring blocks */

data[i*(nCol+2)+j] = localBuffer[buffer * bufferSize + x * (32+2) + y ];

}

__syncthreads();

if(lmap){

/* Copy back buffers from global memory */

prefetch = data[(i+addx)*(nCol+2)+j+addy] ;

}

/* copy the output back into global memory */

data[i*(nCol+2)+j] = localBuffer[buffer * bufferSize + x * (32+2) + y ];

__syncthreads();

}

</source>

|}

{| class="wikitable mw-collapsible mw-collapsed"

! For-loop Optimization - poissant-alt-ghost2.cuh

|-

|

/*

* Poisson Method using two arrays.

* Non-Ghost Cells Method

* Multiple PCIe Calls made, once per iteration

* by Tony Sim

*/

#ifndef POISSON_H

#define POISSON_H

#include <fstream>

#include <cuda_runtime.h>

namespace DPS{

class Poisson {

unsigned int blockx;

unsigned int blocky;

unsigned int nRowsTotal;

unsigned int nColumns;

unsigned int bufferSize;

float* h_data;

float* d_data;

int bufferSide;

void bufferSwitch(){ bufferSide = 1 - bufferSide; };

bool devMemSet();

public:

Poisson() = delete;

Poisson(std::ifstream& ifs);

Poisson(const size_t r, const size_t c, float* d);

~Poisson();

float* operator()(const size_t iteration, const float wx, const float wy);

float* operator()(const size_t iteration){

return operator()(iteration,0.1,0.1);

}

void show(std::ostream& ofs) const;

};

__global__ void update (float* data, char* dmap, int nCol, int nRow, const float wx, const float wy, unsigned int nIterations);

}

#endif

</source>

|}

==== Optimization Details ====

===== PCIe Version =====

* Coalesced Memory - Large performance boost.

* Prefetch - this had minor to no effect on the performance.

===== For-loop Version =====

* Shared Memory - Small boost. Used technique called Ghost Cells where updated information is shared over global memory as needed to perform the next iteration.

* Prefetch - Small boost. Information are fetched first into register in the previous iteration to be copied in the current iteration prior to calculation.

* Coalesed Memory - Large boost.

* Logic change - To minimize the number of condition calls, a predefined map of instruction was created on the host based on the block dimension information. Using this information, the if statement had been cut down to almost 1/4, showing noticeable performance increase.

==== Result ====

[[File:optimized.png|frame|GPU highlights. Ghost Cell + Prefetch + Coaleased memory + logic change is slightly faster than simpler Prefetch+Coaleased memory that uses Global Memory]]

[[File:all.png|frame|Using GPU significantly improved Calculation Time over the CPU counterparts.]]

Ysim2

70

edits

Changes

Ghost Cells

Navigation menu

Personal tools

Namespaces

Variants

Views

More

Search

Navigation

get involved with CDOT

courses

course projects

links

Tools