# Changes

## Algo holics

, 02:14, 31 March 2019
Assignment 2
=== Assignment 2 ===
We decided to parallelize the neural network code. It gives us a good learning opportunity and, more importantly, we can potentially reduce the program's execution time by using CUDA.

=== Parallelize ===

{| class="wikitable mw-collapsible mw-collapsed"
! Flat profile
|-
|

#include <algorithm>
#include <cstdlib>
#include <ctime>
#include <iostream>
#include <vector>

#include <cuda_runtime.h>
#include "device_launch_parameters.h"

// Stops the program with a readable message when a CUDA runtime call fails.
// `what` names the operation so the failing step can be identified.
static void checkCuda(cudaError_t status, const char* what)
{
    if (status != cudaSuccess) {
        std::cerr << "CUDA error during " << what << ": "
                  << cudaGetErrorString(status) << std::endl;
        std::exit(EXIT_FAILURE);
    }
}

// One pass of a bottom-up merge sort.
//
// The input `src` (n elements) is viewed as consecutive pairs of runs, each run
// `width` elements long and already sorted. Every pair is merged into the same
// index range of `dst`. The last pair may have a short or missing right run; it
// is still copied so that `dst` is fully populated after the pass.
//
// Launch layout: 1D grid of 1D blocks. One thread merges one pair, and the
// grid-stride loop lets any grid size cover all pairs. No shared memory is used.
// `src` and `dst` must be distinct device buffers of at least n ints.
__global__ void merge(const int* __restrict__ src, int* __restrict__ dst, size_t n, size_t width)
{
    const size_t span = 2 * width;
    const size_t pairCount = (n + span - 1) / span;
    const size_t stride = static_cast<size_t>(gridDim.x) * blockDim.x;

    for (size_t pair = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
         pair < pairCount; pair += stride) {
        const size_t begin = pair * span;                           // first index of the left run
        const size_t mid = (begin + width < n) ? begin + width : n; // one past the left run
        const size_t end = (begin + span < n) ? begin + span : n;   // one past the right run

        size_t left = begin;
        size_t right = mid;
        for (size_t out = begin; out < end; ++out) {
            // Take from the left run while it has elements and either the right
            // run is exhausted or the left head is not greater (<= keeps ties stable).
            const bool takeLeft = left < mid && (right >= end || src[left] <= src[right]);
            dst[out] = takeLeft ? src[left++] : src[right++];
        }
    }
}

void check_sort(std::vector<int> array); //test sorted arrays
void merge_sort(std::vector<int> array); //bottom-up merge sort on the GPU
//CPU reference version of merge (kept commented out further down):
//std::vector<int> merge(std::vector<int> array, unsigned int b, unsigned int c, unsigned int e);

// Reads the element count from standard input, fills an array of that size with
// pseudo-random values and runs the GPU merge sort on it.
int main()
{
    unsigned int size = 0;
    if (!(std::cin >> size)) {
        std::cerr << "expected the number of elements to sort" << std::endl;
        return 1;
    }

    std::vector<int> initial_array(size); //array of random elements

    //prefill array
    srand(static_cast<unsigned int>(time(0)));
    for (unsigned int i = 0; i < initial_array.size(); ++i) {
        initial_array[i] = rand();
    }

    merge_sort(initial_array);

    /* DEBUGGER
    std::cout << "initial array" << std::endl;
    for (unsigned int i = 0; i < initial_array.size(); ++i) {
        std::cout << initial_array[i] << " ";
    }
    std::cout << std::endl;
    */

    return 0;
}

// Sorts a copy of the input on the GPU with a bottom-up merge sort.
//
// The vector is taken by value, so the caller's data is not modified; the result
// is verified with check_sort and the elapsed time is printed. The printed time
// is in seconds and covers the device merge passes only (not allocation or
// host<->device copies), measured with CUDA events.
void merge_sort(std::vector<int> arrayInput)
{
    std::cout << "merge sort" << std::endl;
    if (arrayInput.empty()) {
        // Nothing to sort; also avoids zero-byte device allocations.
        return;
    }
    check_sort(arrayInput);

    const size_t n = arrayInput.size();
    const size_t bytes = n * sizeof(int);
    const unsigned int threadsPerBlock = 256;

    // Two device buffers used alternately: each pass reads one and writes the other.
    int* d_src = nullptr;
    int* d_dst = nullptr;
    checkCuda(cudaMalloc((void**)&d_src, bytes), "cudaMalloc (source buffer)");
    checkCuda(cudaMalloc((void**)&d_dst, bytes), "cudaMalloc (destination buffer)");
    checkCuda(cudaMemcpy(d_src, arrayInput.data(), bytes, cudaMemcpyHostToDevice),
              "host-to-device copy");

    cudaEvent_t started;
    cudaEvent_t finished;
    checkCuda(cudaEventCreate(&started), "cudaEventCreate (start)");
    checkCuda(cudaEventCreate(&finished), "cudaEventCreate (stop)");
    checkCuda(cudaEventRecord(started), "cudaEventRecord (start)");

    // size_t width cannot wrap to 0 the way a 32-bit counter could for huge n.
    for (size_t width = 1; width < n; width *= 2) {
        const size_t span = 2 * width;
        const size_t pairCount = (n + span - 1) / span;
        const unsigned int blocks =
            static_cast<unsigned int>((pairCount + threadsPerBlock - 1) / threadsPerBlock);

        merge<<<blocks, threadsPerBlock>>>(d_src, d_dst, n, width);
        checkCuda(cudaGetLastError(), "merge kernel launch");

        // The freshly written buffer becomes the input of the next pass.
        std::swap(d_src, d_dst);
    }

    checkCuda(cudaEventRecord(finished), "cudaEventRecord (stop)");
    // Waiting on the stop event also surfaces errors raised while kernels ran.
    checkCuda(cudaEventSynchronize(finished), "cudaEventSynchronize");
    float elapsedMs = 0.0f;
    checkCuda(cudaEventElapsedTime(&elapsedMs, started, finished), "cudaEventElapsedTime");

    // After the final swap d_src holds the fully merged data.
    checkCuda(cudaMemcpy(arrayInput.data(), d_src, bytes, cudaMemcpyDeviceToHost),
              "device-to-host copy");

    /* DEBUGGER
    for (unsigned int i = 0; i < arrayInput.size(); ++i) {
        std::cout << arrayInput[i] << " ";
    } */
    check_sort(arrayInput);
    std::cout << "time: " << elapsedMs / 1000.0f << std::endl;

    checkCuda(cudaEventDestroy(started), "cudaEventDestroy (start)");
    checkCuda(cudaEventDestroy(finished), "cudaEventDestroy (stop)");
    checkCuda(cudaFree(d_src), "cudaFree (source buffer)");
    checkCuda(cudaFree(d_dst), "cudaFree (destination buffer)");
}

/*
std::vector<int> merge(std::vector<int> array, unsigned int b, unsigned int c, unsigned int e) {
std::vector<int> C(array);

unsigned int i1 = b;
unsigned int i2 = c + 1; //start point in each piece
unsigned int n1 = c;
unsigned int n2 = e; //end point of each piece

for (unsigned i = b; i <= e; ++i)
{
if (i1 > n1) //
{
C[i] = array[i2];
++i2;
}
else if (i2 > n2)
{
C[i] = array[i1];
++i1;
}else if (array[i1] <= array[i2]) {
C[i] = array[i1];
++i1;
}
else
{
C[i] = array[i2];
++i2;
}
}
return C;

}*/

// Prints "sorted" when the elements are in non-decreasing order and "unsorted"
// otherwise. Empty and single-element arrays are reported as sorted.
void check_sort(std::vector<int> array)
{
    // std::is_sorted handles the empty case; the previous `size() - 1` loop bound
    // wrapped around for an empty vector and read out of bounds.
    const bool inOrder = std::is_sorted(array.begin(), array.end());
    std::cout << (inOrder ? "sorted" : "unsorted") << std::endl;
}

|}

=== Assignment 3 ===
8
edits