Changes

Jump to: navigation, search

Ghost Cells

10,552 bytes added, 02:03, 7 April 2019
Assignment 3
=== Assignment 3 ===
==== Source Codes ====
{| class="wikitable mw-collapsible mw-collapsed"
! PCIe Optimization
</source>
|}
 
 
{| class="wikitable mw-collapsible mw-collapsed"
! For-loop Optimization
|-
|
<source>
/*
* Poisson Method using two arrays.
* Non-Ghost Cells Method
* Multiple PCIe Calls made, once per iteration
* by Tony Sim
*/
#include <cstring>
#include <cstdlib>
#include <iomanip>
#include <iostream>
#include <string>
#include <cuda_runtime.h>
#include "poisson-alt-ghost2.cuh"
 
namespace DPS{
 
Poisson::Poisson(std::ifstream& ifs) {
blockx = 32;
blocky = 32;
 
std::string line;
nColumns = 0;
bufferSide = 0;
nRowsTotal = 0;
/* find number of columns */
std::getline(ifs,line);
for (size_t i = 0 ; i < line.size() ; i++){
if(line[i]==' ') nColumns++;
}
nColumns++;
 
/* find number of rows */
nRowsTotal++; /* already fetched one */
while(std::getline(ifs,line))
nRowsTotal++;
ifs.clear();
 
int sizeX = ((nColumns + 2 + blockx + 2 - 1)/(blockx+2))*(blockx+2);
int sizeY = ((nRowsTotal + 2 + blocky + 2 - 1)/(blocky+2))*(blocky+2);
bufferSize = sizeX * sizeY;
std::cout << "Allocate initial memory" << std::endl;
try{
h_data = new float[ bufferSize ]; /* add edge buffers */
}
catch (...){
throw std::runtime_error("Failed to Allocate Memory");
}
 
/* readin data */
std::cout <<"Reading in data"<<std::endl;
ifs.seekg(0,ifs.beg);
/* allocate memory to all but the edge buffer, index 0 and max for each row and column */
std::memset(h_data,0,bufferSize);
for (size_t i = 0 ; i < nRowsTotal+2 ; i++){
for (size_t j = 0 ; j < nColumns+2 ; j++){
float val = 0;
if(!(i == 0 || i == nRowsTotal + 1 || j == 0 || j == nColumns + 1))
ifs >> val;
h_data[i*(nColumns+2)+j] = val;
}
}
 
std::cout <<"Setting buffer"<<std::endl;
bool state = devMemSet();
 
}
 
Poisson::Poisson(const size_t r, const size_t c, float* d) {
bufferSide = 0;
nRowsTotal = r;
nColumns = c;
try{
h_data = new float[(r+2)*(c+2)];
}
catch (...){
throw std::runtime_error("Failed to Allocate Memory");
}
std::memcpy(h_data,d,(r+2)*(c+2)*sizeof(float));
devMemSet();
}
 
Poisson::~Poisson(){
delete [] h_data;
cudaFree(d_data);
cudaDeviceReset();
}
 
bool Poisson::devMemSet(){
 
/* create double buffer */
cudaMalloc(&d_data, bufferSize * sizeof(float));
 
if(d_data != nullptr){
/* copy the initial information to the first buffer */
cudaError_t state = cudaMemcpy((void*)d_data,(const void*)h_data, bufferSize * sizeof(float),cudaMemcpyHostToDevice);
if(state != cudaSuccess)
std::cerr << "ERROR on devMemSet at cudaMemcpy : " << cudaGetErrorString(state)<< std::endl;
}
return d_data;
}
 
float* Poisson::operator()(const size_t nIterations, const float wx, const float wy){
 
/* calculate the grid, block, where block has 1024 threads total */
unsigned int gridx = ((nRowsTotal+2)+blockx-1)/blockx;
unsigned int gridy = ((nRowsTotal+2)+blocky-1)/blocky;
 
/* create dim3 */
dim3 dBlock= {blockx,blocky};
dim3 dGrid = {gridx,gridy};
 
/* generate shared memory map that will control ghost cell sharing */
char* hmap = new char[(blockx+2)*(blocky+2)*3];
int stride = 3;
for(int i = 0 ; i < (blockx+2);i++){
for(int j = 0 ; j < (blocky+2);j++){
char val = 0;
char x = 0;
char y = 0;
if(i==1){
val = 1;
x=-1;
y=0;
}
if(j==1){
val = 1;
x=0;
y=-1;
}
if(i==blockx) {
val = 1;
x=1;
y=0;
}
if(j==blocky){
val = 1;
x=0;
y=1;
}
if(i==2 || j==2 || i==31 || j==31)
val = 2;
hmap[(i * (blockx+2) + j)*stride] = val;
hmap[(i * (blockx+2) + j)*stride+1] = x;
hmap[(i * (blockx+2) + j)*stride+2] = y;
}
}
/* transfer to device */
char* dmap = nullptr;
cudaMalloc(&dmap,(blockx+2)*(blocky+2)*sizeof(char)*3);
cudaMemcpy(dmap,hmap,(blockx+2)*(blocky+2)*sizeof(char)*3,cudaMemcpyHostToDevice);
 
/* run iterations */
update<<<dGrid,dBlock>>>(d_data,dmap,nColumns, nRowsTotal, wx, wy,nIterations);
 
/*DEBUG */ h_data[2*(nColumns+2)+2] = 100.0f;
/* output results from device to host */
cudaError_t state = cudaMemcpy(h_data,d_data,(nColumns+2)*(nRowsTotal+2)*sizeof(float),cudaMemcpyDeviceToHost);
if(state != cudaSuccess)
std::cout << "ERROR on () when copying data back to host with : " << cudaGetErrorString(state)<< std::endl;
 
return h_data;
}
 
void Poisson::show(std::ostream& ofs) const{
ofs << std::fixed << std::setprecision(1);
for (size_t j = 1; j <= nColumns ; j++) {
for (size_t i = 1 ; i <= nRowsTotal ; i++)
ofs << std::setw(8) << h_data[i * (nColumns+2) + j]<<",";
ofs << std::endl;
}
}
__global__ void update (float* data, char* dmap, int nCol, int nRow, const float wx, const float wy, unsigned int nIterations){
size_t j = blockDim.x * blockIdx.x + threadIdx.x + 1; /* for x axis */
size_t i = blockDim.y * blockIdx.y + threadIdx.y + 1; /* for y axis */
size_t y = threadIdx.x+1;
size_t x = threadIdx.y+1;
 
const unsigned int bufferSize = (32+2)*(32+2);
__shared__ float localBuffer[ 2 * bufferSize ]; /* double local buffer with ghost cells */
// __shared__ char lmap[bufferSize];
 
unsigned int buffer = 0;
 
float prefetch = 0.0f;
/* copy information into first of the local buffer */
localBuffer[x*(32+2)+y] = data[i*(nCol+2)+j];
__syncthreads();
 
const char lmap = dmap[(x*(32+2)+y)*3];
const char addx = dmap[(x*(32+2)+y)*3+1];
const char addy = dmap[(x*(32+2)+y)*3+2];
 
/* prefetch */
if(lmap)
prefetch = data[(i+addx)*(nCol+2)+j+addy] ;
 
/* run iterations */
for (unsigned int n = 0 ; n < nIterations; n++){
if(lmap)
localBuffer[buffer * bufferSize + (x+addx)*(32+2) + y+addy] = prefetch;
/* Calculate and store into the other buffer */
float curr = localBuffer[buffer*bufferSize + x * (32+2)+ y];
float dir1 = localBuffer[buffer*bufferSize + (x+1) * (32+2) +y];
float dir2 = localBuffer[buffer*bufferSize + (x-1) * (32+2) +y];
float dir3 = localBuffer[buffer*bufferSize + x * (32+2) + y + 1];
float dir4 = localBuffer[buffer*bufferSize + x * (32+2) + y - 1];
localBuffer[(1-buffer)*bufferSize + x*(32+2)+y] = curr + wx*(dir1+dir2-2.0f*curr) + wy*(dir3+dir4-2.0f*curr);
/* flip buffer */
buffer = 1-buffer;
/* for threads in charge of edges, share and obtain ghost cells */
if(lmap){
/* Copy over edges to global memory to be shared with neighboring blocks */
data[i*(nCol+2)+j] = localBuffer[buffer * bufferSize + x * (32+2) + y ];
}
__syncthreads();
if(lmap){
/* Copy back buffers from global memory */
prefetch = data[(i+addx)*(nCol+2)+j+addy] ;
}
}
 
/* copy the output back into global memory */
data[i*(nCol+2)+j] = localBuffer[buffer * bufferSize + x * (32+2) + y ];
__syncthreads();
}
}
</source>
|}
{| class="wikitable mw-collapsible mw-collapsed"
! For-loop Optimization - poissant-alt-ghost2.cuh
|-
|
<source>
/*
* Poisson Method using two arrays.
* Non-Ghost Cells Method
* Multiple PCIe Calls made, once per iteration
* by Tony Sim
*/
#ifndef POISSON_H
#define POISSON_H
#include <fstream>
#include <cuda_runtime.h>
 
namespace DPS{
class Poisson {
unsigned int blockx;
unsigned int blocky;
unsigned int nRowsTotal;
unsigned int nColumns;
unsigned int bufferSize;
float* h_data;
float* d_data;
int bufferSide;
 
void bufferSwitch(){ bufferSide = 1 - bufferSide; };
bool devMemSet();
 
public:
Poisson() = delete;
Poisson(std::ifstream& ifs);
Poisson(const size_t r, const size_t c, float* d);
~Poisson();
float* operator()(const size_t iteration, const float wx, const float wy);
float* operator()(const size_t iteration){
return operator()(iteration,0.1,0.1);
}
void show(std::ostream& ofs) const;
};
__global__ void update (float* data, char* dmap, int nCol, int nRow, const float wx, const float wy, unsigned int nIterations);
}
#endif
 
</source>
|}
 
==== Optimization Details ====
===== PCIe Version =====
* Coalesced Memory - Large performance boost.
* Prefetch - this had minor to no effect on the performance.
 
===== For-loop Version =====
* Shared Memory - Small boost. Used technique called Ghost Cells where updated information is shared over global memory as needed to perform the next iteration.
* Prefetch - Small boost. Information are fetched first into register in the previous iteration to be copied in the current iteration prior to calculation.
* Coalesed Memory - Large boost.
* Logic change - To minimize the number of condition calls, a predefined map of instruction was created on the host based on the block dimension information. Using this information, the if statement had been cut down to almost 1/4, showing noticeable performance increase.
 
==== Result ====
[[File:optimized.png|frame|GPU highlights. Ghost Cell + Prefetch + Coaleased memory + logic change is slightly faster than simpler Prefetch+Coaleased memory that uses Global Memory]]
[[File:all.png|frame|Using GPU significantly improved Calculation Time over the CPU counterparts.]]
70
edits

Navigation menu