Changes

Jump to: navigation, search

The B-Team

1,437 bytes removed, 19:20, 30 November 2012
Code
I'm only going to post the kernels that changed. There were also many changes in the host code that helped make the program faster.
<big><pre>
 
// part of common.h
// Multiplier applied to blockIdx.x in getFractal to obtain the pixel column,
// i.e. the block width that kernel expects to be launched with.
const int BLOCK_SIZE_X = 16;
 
// Multiplier applied to blockIdx.y in getFractal to obtain the pixel row,
// i.e. the block height that kernel expects to be launched with.
const int BLOCK_SIZE_Y = 8;
 
// Multiplier applied to blockIdx.x in getFractalSSAA (one thread per entry of
// its pixel list).
const int BLOCK_SIZE_SSAA = 256;
 
// NOTE(review): presumably the block edge for the RGB conversion kernels, but
// the versions visible here use BLOCK_SIZE instead - confirm which is current.
const int BLOCK_SIZE_RGB = 16;
 
 
 
// NOTE(review): not referenced by any kernel shown here. If this is meant to
// be the hardware limit on gridDim.x, note that compute capability 2.x
// devices cap it at 65535 - confirm how the host code compares against it.
const int MAX_GRID_SIZE_X = 65536;
 
 
 
// Largest supersampling factor per axis; getFractalSSAA sizes its per-thread
// sub-pixel buffer as MAX_ALIASING_FACTOR * MAX_ALIASING_FACTOR elements.
const uint8_t MAX_ALIASING_FACTOR = 16;
</pre></big>
<big><pre>
 
// part of main.cpp
 
cudaError_t error;
 
int iDevice;
 
cudaDeviceProp prop;
 
 
// Get device information for total global memory
 
error = cudaGetDevice(&iDevice);
 
if(error != cudaSuccess)
  displayCudeError(error);  
error = cudaGetDeviceProperties(&prop, iDevice);
 
if(error != cudaSuccess)
  displayCudeError(error);  
// The max amount to do per pass demends on the size of GPU memory and the size of unsigned integer.
 
// Global memory is devided by two so that both the value array and RGB array can both fit in memory.
 
DimensionSqType maxPixelsPerPass = (UINT_MAX > (prop.totalGlobalMem / 2)) ? (prop.totalGlobalMem / 2) : UINT_MAX;
 
 
// RGB + alpha is 4 BYTEs. Make sure two copies of the larger can fit in device memory.
 
DimensionType largerType = ((4 * sizeof(BYTE)) < sizeof(ElementType)) ? sizeof(ElementType) : (4 * sizeof(BYTE));
 
 
// Divide by two for extra safty.
 
maxPixelsPerPass /= (largerType * 2);
 
</pre></big>
 
 
 
<big><pre>
/*
 * Escape-time value of the point c = c_r + i*c_i under z -> z^2 + c, with a
 * fractional (smoothed) part.
 *
 * c_i        imaginary part of c (note: passed before the real part)
 * c_r        real part of c
 * iterations iteration limit
 *
 * Iterates, starting from z = c, while the step count is below `iterations`
 * and |z|^2 < 4. Two further steps are then always applied and added to the
 * count (presumably to reduce the error of the smoothing term - confirm).
 * If the resulting count exceeds `iterations` the function returns
 * `iterations`; otherwise it returns count + 1 - log(log(|z|)) / log(2).
 */
__device__ ElementType mandelbrot(ElementType c_i, ElementType c_r, IterationType iterations)
{
    ElementType re = c_r;
    ElementType im = c_i;

    // Squares are cached so each step reuses them for both the escape test
    // and the real-part update.
    ElementType re2 = re * re;
    ElementType im2 = im * im;

    IterationType count = 0;

    for(; count < iterations && re2 + im2 < 4.0; count++)
    {
        // The imaginary part must be updated first: it needs the old real part.
        im = 2.0 * re * im + c_i;
        re = re2 - im2 + c_r;

        re2 = re * re;
        im2 = im * im;
    }

    // Two unconditional extra steps, regardless of why the loop stopped.
    for(int extra = 0; extra < 2; extra++)
    {
        im = 2.0 * re * im + c_i;
        re = re2 - im2 + c_r;

        re2 = re * re;
        im2 = im * im;
    }

    count += 2;

    // Anything that ends up past the limit is reported as exactly the limit.
    if(count > iterations)
    {
        return (ElementType)iterations;
    }

    return (ElementType)count + 1.0 - __logf(__logf(__dsqrt_rn(re2 + im2)))/__logf(2.0);
}
</pre></big>
// Writes the mandelbrot() value of every pixel of a width x height image
// into img (row-major, one ElementType per pixel).
//
// Each thread handles the pixel at column blockIdx.x * BLOCK_SIZE_X +
// threadIdx.x and row blockIdx.y * BLOCK_SIZE_Y + threadIdx.y, so the launch
// is expected to use BLOCK_SIZE_X x BLOCK_SIZE_Y blocks. Threads that fall
// outside the image do nothing.
//
// Pixel (col, row) maps to the complex point
//   real = xMin + col * xScale,  imag = yMax - row * yScale.
__global__ void getFractal(ElementType* img, ElementType yMax, ElementType xMin, ElementType xScale, ElementType yScale, IterationType iterations, DimensionType width, DimensionType height)
{
    const DimensionType col = blockIdx.x * BLOCK_SIZE_X + threadIdx.x;
    const DimensionType row = blockIdx.y * BLOCK_SIZE_Y + threadIdx.y;

    if(col < width && row < height)
    {
        // Widen every term before multiplying so the flat index is computed
        // in DimensionSqType rather than DimensionType.
        const DimensionSqType offset = (DimensionSqType)row * (DimensionSqType)width + (DimensionSqType)col;

        const ElementType imag = yMax - (ElementType)row * yScale;
        const ElementType real = xMin + (ElementType)col * xScale;

        img[offset] = mandelbrot(imag, real, iterations);
    }
}
</pre></big>
// Supersampled re-evaluation of selected pixels.
//
// list       flat pixel indices (row * width + column) to re-evaluate
// length     number of entries in list
// img        image buffer; img[list[k]] is overwritten with the new value
// yMax, xMin, xScale, yScale
//            same pixel-to-plane mapping as getFractal
// iterations iteration limit handed to mandelbrot()
// width      image width, used to split a flat index into column and row
// ssaafactor sub-pixels per axis; must be in [1, MAX_ALIASING_FACTOR]
//
// One thread per list entry: entry index = blockIdx.x * BLOCK_SIZE_SSAA +
// threadIdx.x, so the launch is expected to use 1-D blocks of BLOCK_SIZE_SSAA
// threads. Each thread evaluates ssaafactor x ssaafactor sub-pixel centres
// and stores their median (the mean of the two middle ranks when the count
// is even).
__global__ void getFractalSSAA(ElementType* img, DimensionSqType* list, DimensionSqType length, ElementType yMax, ElementType xMin,
                               ElementType xScale, ElementType yScale, IterationType iterations,
                               DimensionType width, AlisingFactorType ssaafactor)
{
    // The entry index is compared against `length`, so it is kept in the same
    // (wider) type to avoid truncation when DimensionType is narrower.
    DimensionSqType curr = (DimensionSqType)blockIdx.x * BLOCK_SIZE_SSAA + threadIdx.x;

    if(curr >= length)
        return;

    // A factor above MAX_ALIASING_FACTOR would write past the local sub-pixel
    // buffer, and a factor below 1 leaves no samples to take a median of.
    // In both cases the pixel is left untouched.
    if(ssaafactor < 1 || ssaafactor > MAX_ALIASING_FACTOR)
        return;

    DimensionSqType val = list[curr];

    ElementType xSubScale = xScale / ((ElementType)ssaafactor);
    ElementType ySubScale = yScale / ((ElementType)ssaafactor);

    // Get the centre of the top left subpixel
    xMin = xMin + (ElementType)(val % width) * xScale - (xScale / 2.0) + (xSubScale / 2.0);
    yMax = yMax - (ElementType)(val / width) * yScale + (yScale / 2.0) - (ySubScale / 2.0);

    // Get the values for each subpixel of this pixel
    ElementType subpixels[MAX_ALIASING_FACTOR * MAX_ALIASING_FACTOR];

    for(AlisingFactorType x = 0; x < ssaafactor; x++)
    {
        for(AlisingFactorType y = 0; y < ssaafactor; y++)
        {
            subpixels[x * ssaafactor + y] = mandelbrot(yMax - ySubScale * y , xMin + xSubScale * x, iterations);
        }
    }

    AlisingFactorSqType factor2 = (AlisingFactorSqType)ssaafactor * (AlisingFactorSqType)ssaafactor;

    if(factor2 % 2 != 0)
    {
        // Odd sample count: a single middle rank.
        img[val] = getMedian(subpixels, factor2 / 2, factor2);
    }
    else
    {
        // Even sample count: average the two middle ranks.
        img[val] = (getMedian(subpixels, factor2 / 2 - 1, factor2)
                    + getMedian(subpixels, factor2 / 2, factor2))
                   / 2.0;
    }
}
</pre></big>
<big><pre>
// Converts per-pixel values into 3-byte pixels in `image`.
//
// image      output buffer, 3 BYTEs per pixel, tightly packed (no row padding)
// values     one ElementType per pixel, row-major
// width, height
//            image size in pixels
// iterations divisor used to normalise each value before colour lookup
//
// One thread per pixel; threads outside the image do nothing. getRGB fills
// rgbValue[0..2], which are stored in reverse order (index 2 first).
//
// NOTE(review): the pixel coordinates are derived from BLOCK_SIZE, which is
// not among the block-size constants shown with the other kernels - confirm
// it matches the block dimensions used at launch.
__global__ void getBmpRGB(BYTE* image, ElementType* values, DimensionType width, DimensionType height, IterationType iterations)
{
    DimensionType dy = blockIdx.y * BLOCK_SIZE + threadIdx.y;
    DimensionType dx = blockIdx.x * BLOCK_SIZE + threadIdx.x;
    if(dx >= width || dy >= height)
        return;

    // Compute the flat index in DimensionSqType, as getFractal does for the
    // same array, so dy * width (and the * 3 below) cannot overflow
    // DimensionType on large images.
    DimensionSqType c = (DimensionSqType)dy * (DimensionSqType)width + (DimensionSqType)dx;
    DimensionSqType out = c * 3;

    BYTE rgbValue[3];

    getRGB(values[c]/(ElementType)iterations, rgbValue);
    image[out] = rgbValue[2];
    image[out + 1] = rgbValue[1];
    image[out + 2] = rgbValue[0];
}
</pre></big>
<big><pre>
// Converts per-pixel values into 3-byte pixels in `image`, using `map` as a
// lookup table from the integer part of the value to a colour position.
//
// map        lookup table indexed by the integer part of a pixel value
// image      output buffer, 3 BYTEs per pixel, tightly packed (no row padding)
// values     one ElementType per pixel, row-major
// width, height
//            image size in pixels
//
// The colour position is interpolated linearly between map[ival] and
// map[ival + 1] by the fractional part of the value, so map needs at least
// ival + 2 entries for the largest value present.
// NOTE(review): this kernel cannot check that bound itself - confirm the
// host allocates map accordingly.
//
// NOTE(review): the pixel coordinates are derived from BLOCK_SIZE, which is
// not among the block-size constants shown with the other kernels - confirm
// it matches the block dimensions used at launch.
__global__ void getBmpRGBfromHistorgram(ElementType* map, BYTE* image, ElementType* values, DimensionType width, DimensionType height)
{
    DimensionType dy = blockIdx.y * BLOCK_SIZE + threadIdx.y;
    DimensionType dx = blockIdx.x * BLOCK_SIZE + threadIdx.x;
    if(dx >= width || dy >= height)
        return;

    // Compute the flat index in DimensionSqType, as getFractal does for the
    // same array, so dy * width (and the * 3 below) cannot overflow
    // DimensionType on large images.
    DimensionSqType c = (DimensionSqType)dy * (DimensionSqType)width + (DimensionSqType)dx;
    DimensionSqType out = c * 3;

    // Read the value once; it is used for both the integer and fractional part.
    ElementType value = values[c];
    IterationType ival = (IterationType)value;

    ElementType colourVal = map[ival] + (value - (ElementType)ival) * (map[ival + 1] - map[ival]);
    BYTE rgbValue[3];

    getRGB(colourVal, rgbValue);
    image[out] = rgbValue[2];
    image[out + 1] = rgbValue[1];
    image[out + 2] = rgbValue[0];
}
</pre></big>
 
 
= Assignment 2 =
1
edit

Navigation menu