Source Code ms3 neural net
The third and final iteration of the neural network.


#include <iostream>
#include <vector>
#include <algorithm>
#include <iterator>
#include <cmath>
#include <cstdlib>
#include <fstream>
#include <sstream>
#include <string>
#include <random>
#include <cassert>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

using namespace std;

void print(const vector <float>& m, int n_rows, int n_columns) {

/* "Couts" the input vector as an n_rows x n_columns matrix.
Inputs:
m: vector, matrix of size n_rows x n_columns
n_rows: int, number of rows in the matrix m
n_columns: int, number of columns in the matrix m
*/

for (int i = 0; i != n_rows; ++i) {
for (int j = 0; j != n_columns; ++j) {
cout << m[i * n_columns + j] << " ";
}
cout << '\n';
}
cout << endl;
}
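// For example (an added illustration, not original): print(vector<float>{1, 2, 3, 4, 5, 6}, 2, 3)
// writes the six values as two rows of three, matching the row-major layout m[i * n_columns + j].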
vector <float> operator/(const vector <float>& m2, const float m1) {

/* Returns the elementwise quotient of a vector and a float
(divides every element of m2 by m1).
Inputs:
m2: vector
m1: float
Output: vector, m2 / m1
*/

const unsigned long VECTOR_SIZE = m2.size();
vector <float> product(VECTOR_SIZE);

for (unsigned i = 0; i != VECTOR_SIZE; ++i) {
product[i] = m2[i] / m1;
};

return product;
}
vector <float> operator-(const vector <float>& m1, const vector <float>& m2) {

/* Returns the difference between two vectors.
Inputs:
m1: vector
m2: vector
Output: vector, m1 - m2, difference between two vectors m1 and m2.
*/

const unsigned long VECTOR_SIZE = m1.size();
vector <float> difference(VECTOR_SIZE);

for (unsigned i = 0; i != VECTOR_SIZE; ++i) {
difference[i] = m1[i] - m2[i];
};

return difference;
}
vector <float> operator-(const vector <float>& m1, const float* m2) {

/* Returns the difference between a vector and a raw float array
(the array is assumed to hold at least m1.size() elements).
Inputs:
m1: vector
m2: float*, raw array
Output: vector, m1 - m2, elementwise difference.
*/

const unsigned long VECTOR_SIZE = m1.size();
vector <float> difference(VECTOR_SIZE);

for (unsigned i = 0; i != VECTOR_SIZE; ++i) {
difference[i] = m1[i] - m2[i];
};

return difference;
}
vector <float> operator*(const float m1, const vector<float>& m2) {

/* Returns the product of a float and a vector (elementwise scaling).
Inputs:
m1: float
m2: vector
Output: vector, m1 * m2, every element of m2 scaled by m1
*/

const unsigned long VECTOR_SIZE = m2.size();
vector <float> product(VECTOR_SIZE);

for (unsigned i = 0; i != VECTOR_SIZE; ++i) {
product[i] = m1 * m2[i];
};

return product;
}
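// Together these overloads give the host code elementwise vector arithmetic:
// (yhat - b_y) computes the output error below, and X_train / 255.0f scales pixels.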

int argmax(const vector <float>& m) {

// Index of the largest element, i.e. the predicted class
return static_cast<int>(distance(m.begin(), max_element(m.begin(), m.end())));
}
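// e.g. argmax({0.1f, 0.7f, 0.2f}) returns 1, the index of the highest score.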



static vector<float> random_vector(const int size)
{
// One generator for the whole run; the distribution keeps the initial
// weights small and positive.
static random_device rd;
static mt19937 generator(rd());
uniform_real_distribution<float> distribution(0.0f, 0.05f);

vector<float> data(size);
generate(data.begin(), data.end(), [&]() { return distribution(generator); });
return data;
}

vector <float> softmax(const vector <float>& z, const int dim) {

const int zsize = static_cast<int>(z.size());
vector <float> out;

for (int i = 0; i != zsize; i += dim) {
vector <float> foo;
for (int j = 0; j != dim; ++j) {
foo.push_back(z[i + j]);
}

// Subtract the row maximum before exponentiating for numerical stability;
// this does not change the softmax result.
float max_foo = *max_element(foo.begin(), foo.end());

for (int j = 0; j != dim; ++j) {
foo[j] = exp(foo[j] - max_foo);
}

float sum_of_elems = 0.0;
for (int j = 0; j != dim; ++j) {
sum_of_elems = sum_of_elems + foo[j];
}

for (int j = 0; j != dim; ++j) {
out.push_back(foo[j] / sum_of_elems);
}
}
return out;
}
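// A worked example (not from the original page): softmax({1, 2, 3}, 3)
// computes e^(x - 3) per element and normalizes, giving roughly
// {0.090, 0.245, 0.665}, which sums to 1; with dim = 3 a 6-element input
// is treated as two independent rows of three scores each.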

__device__ float* ktranspose(float *m, const int C, const int R) {

/* Returns the transpose of the input matrix.
Inputs:
m: pointer to the input matrix, C rows x R columns, row-major
C: int, number of rows in the input matrix
R: int, number of columns in the input matrix
Output: pointer to a freshly device-allocated R x C transpose of m;
the caller is responsible for delete[]-ing it.
*/

float* mT = new float[C * R];

for (unsigned n = 0; n != C * R; n++) {
unsigned i = n / C;
unsigned j = n % C;
mT[n] = m[R * j + i];
}

return mT;
}
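// Index-math sanity check (an added example, not original): for the 2x3
// matrix {1, 2, 3, 4, 5, 6}, ktranspose(m, 2, 3) walks n = 0..5 with
// i = n / 2 and j = n % 2 and produces {1, 4, 2, 5, 3, 6}, the row-major
// 3x2 transpose.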

vector <float> dot(const vector <float>& m1, const vector <float>& m2, const int m1_rows, const int m1_columns, const int m2_columns) {

/* Returns the product of two matrices: m1 x m2.
Inputs:
m1: vector, left matrix of size m1_rows x m1_columns
m2: vector, right matrix of size m1_columns x m2_columns (the number of rows in the right matrix
must be equal to the number of the columns in the left one)
m1_rows: int, number of rows in the left matrix m1
m1_columns: int, number of columns in the left matrix m1
m2_columns: int, number of columns in the right matrix m2
Output: vector, m1 * m2, product of two vectors m1 and m2, a matrix of size m1_rows x m2_columns
*/

vector <float> output(m1_rows*m2_columns);

for (int row = 0; row != m1_rows; ++row) {
for (int col = 0; col != m2_columns; ++col) {
output[row * m2_columns + col] = 0.f;
for (int k = 0; k != m1_columns; ++k) {
output[row * m2_columns + col] += m1[row * m1_columns + k] * m2[k * m2_columns + col];
}
}
}

return output;
}

//version 1 dot product: d_p (ni x nk) = d_a (ni x nj) . d_b (nj x nk),
//matching the (m1_rows, m1_columns, m2_columns) convention of the host dot()
__global__ void kdot(const float* d_a, const float* d_b, int ni, int nj, int nk, float* d_p) {
int i = blockIdx.x * blockDim.x + threadIdx.x; // output row
int j = blockIdx.y * blockDim.y + threadIdx.y; // output column
//matrix multiplication: each thread computes one output element
if (i < ni && j < nk) {
float sum = 0.0f;
for (int k = 0; k < nj; k++)
sum += d_a[i * nj + k] * d_b[k * nk + j];
d_p[i * nk + j] = sum;
}
}
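// Launch sketch (an added sizing example, not from the original page):
// the grid must cover all ni x nk output elements, e.g. for a 256x128 by
// 128x64 product:
//   dim3 block(16, 16);
//   dim3 grid((256 + 15) / 16, (64 + 15) / 16);
//   kdot<<<grid, block>>>(d_a, d_b, 256, 128, 64, d_p);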
void cudaCheck(cudaError_t Error) {
if (Error != cudaSuccess) {
cerr << cudaGetErrorName(Error) << "!";
exit(EXIT_FAILURE);
}
}
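// Typical use is to wrap every runtime call so failures surface immediately,
// e.g. cudaCheck(cudaMalloc((void**)&d_W1, sizeof(float) * 784 * 128));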


vector<string> split(const string &s, char delim) {
stringstream ss(s);
string item;
vector<string> tokens;
while (getline(ss, item, delim)) {
tokens.push_back(item);
}
return tokens;
}
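// e.g. split("5\t0\t128", '\t') yields {"5", "0", "128"}, which is how each
// line of train.txt is broken into a label followed by pixel values.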
// ReLU: b[i] = max(a[i], 0)
__device__ void krelu(float* a, float* b, int n) {
for (int i = 0; i < n; ++i) {
if (a[i] < 0) {
b[i] = 0.0f;
}
else b[i] = a[i];
}
}
// ReLU derivative, computed in place: a[i] becomes 0 where a[i] < 0, else 1
__device__ float* kreluPrime(float* a, int n) {
for (int i = 0; i < n; ++i) {
if (a[i] < 0) {
a[i] = 0.0f;
}
else a[i] = 1.0f;
}
return a;
}
__device__ float* ksoftmax(float *input, size_t input_len) {
assert(input); // a null input would be a caller bug

float m = -INFINITY;
for (size_t i = 0; i < input_len; i++) {
if (input[i] > m) {
m = input[i];
}
}

float sum = 0.0;
for (size_t i = 0; i < input_len; i++) {
sum += expf(input[i] - m);
}

float offset = m + logf(sum);
for (size_t i = 0; i < input_len; i++) {
input[i] = expf(input[i] - offset);
}
return input;
}
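// This is the log-sum-exp formulation: offset = m + log(sum(e^(x - m)))
// equals log(sum(e^x)) computed without overflow, so e^(x - offset) is
// exactly e^x / sum(e^x).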
void get_random_fill(float* a, int size)
{
// One engine for the whole run; range 0 - 0.5
static std::default_random_engine e;
static std::uniform_real_distribution<float> dis(0.0f, 0.5f);
for (int i = 0; i < size; i++) {
a[i] = dis(e);
}
}

__global__ void train(float* d_W1, float* d_W2, float* d_W3, float* d_b_X, float* d_b_Y, float* d_a2, float* d_a1, float* d_dyhat, float* d_dW3, float* d_dW2, float* d_dW1, float* d_dz2, float* d_dz1) {
// Single-threaded orchestrator: launches kdot child kernels via dynamic
// parallelism. d_b_Y is unused here since the output gradient arrives
// precomputed in d_dyhat.
const int BATCH_SIZE = 256;
const float lr = 0.01f / BATCH_SIZE;
const dim3 block(16, 16);

// dW3 = a2^T (64 x BATCH) . dyhat (BATCH x 10) -> 64 x 10
float* a2T = ktranspose(d_a2, BATCH_SIZE, 64);
kdot<<<dim3((64 + 15) / 16, (10 + 15) / 16), block>>>(a2T, d_dyhat, 64, BATCH_SIZE, 10, d_dW3);

// dz2 = dyhat (BATCH x 10) . W3^T (10 x 64) -> BATCH x 64
float* W3T = ktranspose(d_W3, 64, 10);
kdot<<<dim3((BATCH_SIZE + 15) / 16, (64 + 15) / 16), block>>>(d_dyhat, W3T, BATCH_SIZE, 10, 64, d_dz2);
cudaDeviceSynchronize(); // wait for the child kernels before using their output
delete[] a2T;
delete[] W3T;

// dz2 *= relu'(a2), elementwise over the BATCH_SIZE x 64 activation
kreluPrime(d_a2, BATCH_SIZE * 64);
for (int i = 0; i < BATCH_SIZE * 64; i++) {
d_dz2[i] = d_dz2[i] * d_a2[i];
}

// dW2 = a1^T (128 x BATCH) . dz2 (BATCH x 64) -> 128 x 64
float* a1T = ktranspose(d_a1, BATCH_SIZE, 128);
kdot<<<dim3((128 + 15) / 16, (64 + 15) / 16), block>>>(a1T, d_dz2, 128, BATCH_SIZE, 64, d_dW2);

// dz1 = dz2 (BATCH x 64) . W2^T (64 x 128) -> BATCH x 128
float* W2T = ktranspose(d_W2, 128, 64);
kdot<<<dim3((BATCH_SIZE + 15) / 16, (128 + 15) / 16), block>>>(d_dz2, W2T, BATCH_SIZE, 64, 128, d_dz1);
cudaDeviceSynchronize();
delete[] a1T;
delete[] W2T;

// dz1 *= relu'(a1), elementwise over the BATCH_SIZE x 128 activation
kreluPrime(d_a1, BATCH_SIZE * 128);
for (int i = 0; i < BATCH_SIZE * 128; i++) {
d_dz1[i] = d_dz1[i] * d_a1[i];
}

// dW1 = X^T (784 x BATCH) . dz1 (BATCH x 128) -> 784 x 128
float* XT = ktranspose(d_b_X, BATCH_SIZE, 784);
kdot<<<dim3((784 + 15) / 16, (128 + 15) / 16), block>>>(XT, d_dz1, 784, BATCH_SIZE, 128, d_dW1);
cudaDeviceSynchronize();
delete[] XT;

// Updating the parameters: W = W - lr * dW
for (int i = 0; i < (64 * 10); i++) {
d_W3[i] = d_W3[i] - lr * d_dW3[i];
}
for (int i = 0; i < (128 * 64); i++) {
d_W2[i] = d_W2[i] - lr * d_dW2[i];
}
for (int i = 0; i < (784 * 128); i++) {
d_W1[i] = d_W1[i] - lr * d_dW1[i];
}
}
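// In matrix form (a restatement of the loops above, not original text),
// with X the batch, y the one-hot labels, and o the elementwise product:
//   dyhat = yhat - y                    (softmax + cross-entropy gradient)
//   dW3 = a2^T . dyhat                  dz2 = (dyhat . W3^T) o relu'(a2)
//   dW2 = a1^T . dz2                    dz1 = (dz2 . W2^T) o relu'(a1)
//   dW1 = X^T . dz1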
vector <float> relu(const vector <float>& z) {
int size = z.size();
vector <float> output;
for (int i = 0; i < size; ++i) {
if (z[i] < 0) {
output.push_back(0.0);
}
else output.push_back(z[i]);
}
return output;
}
int main(int argc, const char * argv[]) {

string line;
vector<string> line_v;

cout << "Loading data ...\n";
vector<float> X_train;
vector<float> y_train;
ifstream myfile("train.txt");
if (myfile.is_open())
{
while (getline(myfile, line))
{
line_v = split(line, '\t');
int digit = static_cast<int>(strtof(line_v[0].c_str(), nullptr));
// One-hot encode the label into 10 outputs
for (int i = 0; i < 10; ++i) {
if (i == digit)
{
y_train.push_back(1.);
}
else y_train.push_back(0.);
}

int size = static_cast<int>(line_v.size());
for (int i = 1; i < size; ++i) {
X_train.push_back(strtof(line_v[i].c_str(), nullptr));
}
}
X_train = X_train / 255.0f; // scale pixels from 0-255 into 0-1
myfile.close();
}

else cout << "Unable to open file" << '\n';
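// Assumed input format (inferred from the parsing above): one image per
// line of train.txt, tab-separated, the label first and then 784 pixel
// values in 0-255, e.g. "5\t0\t0\t...\t255\t0".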

int xsize = static_cast<int>(X_train.size());
int ysize = static_cast<int>(y_train.size());

// Some hyperparameters for the NN
int BATCH_SIZE = 256;
// float lr = .01 / BATCH_SIZE;

// Random initialization of the weights
vector <float> W1 = random_vector(784 * 128);
vector <float> W2 = random_vector(128 * 64);
vector <float> W3 = random_vector(64 * 10);
float* fW1 = new float[784 * 128];
float* fW2 = new float[128 * 64];
float* fW3 = new float[64 * 10];

// Flatten the weight vectors into raw arrays for cudaMemcpy; one loop
// covers all three since 784*128 is the largest count.
for (int i = 0; i < (784 * 128); i++) {
if (i < (128 * 64)) {
if (i < (64 * 10)) {
fW3[i] = W3[i];
}
fW2[i] = W2[i];
}
fW1[i] = W1[i];
}
//allocate device memory
float* d_W1;
float* d_W2;
float* d_W3;

float* d_b_X;
float* d_b_Y;

float* d_a1;
float* d_a2;

float* d_dyhat;

float* d_dW3;
float* d_dW2;
float* d_dW1;

float* d_dz2;
float* d_dz1;


// device weights
cudaCheck(cudaMalloc((void**)&d_W1, sizeof(float) * 784 * 128));
cudaCheck(cudaMalloc((void**)&d_W2, sizeof(float) * 128 * 64));
cudaCheck(cudaMalloc((void**)&d_W3, sizeof(float) * 64 * 10));

// feed-forward activations: a1 is BATCH_SIZE x 128, a2 is BATCH_SIZE x 64
cudaCheck(cudaMalloc((void**)&d_a1, sizeof(float) * BATCH_SIZE * 128));
cudaCheck(cudaMalloc((void**)&d_a2, sizeof(float) * BATCH_SIZE * 64));

// output gradient: yhat - y is BATCH_SIZE x 10
cudaCheck(cudaMalloc((void**)&d_dyhat, sizeof(float) * BATCH_SIZE * 10));
//backpropagation weight gradients, one per weight matrix
cudaCheck(cudaMalloc((void**)&d_dW3, sizeof(float) * 64 * 10));
cudaCheck(cudaMalloc((void**)&d_dW2, sizeof(float) * 128 * 64));
cudaCheck(cudaMalloc((void**)&d_dW1, sizeof(float) * 784 * 128));
// hidden-layer deltas: dz2 is BATCH_SIZE x 64, dz1 is BATCH_SIZE x 128
cudaCheck(cudaMalloc((void**)&d_dz2, sizeof(float) * BATCH_SIZE * 64));
cudaCheck(cudaMalloc((void**)&d_dz1, sizeof(float) * BATCH_SIZE * 128));

// batch buffers, allocated once instead of on every iteration
cudaCheck(cudaMalloc((void**)&d_b_X, sizeof(float) * BATCH_SIZE * 784));
cudaCheck(cudaMalloc((void**)&d_b_Y, sizeof(float) * BATCH_SIZE * 10));



cudaCheck(cudaMemcpy(d_W1, fW1, sizeof(float) * 784 * 128, cudaMemcpyHostToDevice));
cudaCheck(cudaMemcpy(d_W2, fW2, sizeof(float) * 128 * 64, cudaMemcpyHostToDevice));
cudaCheck(cudaMemcpy(d_W3, fW3, sizeof(float) * 64 * 10, cudaMemcpyHostToDevice));



cout << "Training the model ...\n";
for (unsigned i = 0; i < 10000; ++i) {



// Building batches of input variables (X) and labels (y); the 42000 here
// assumes the Kaggle MNIST train split of 42000 rows
int randindx = rand() % (42000 - BATCH_SIZE);
vector<float> b_X;
vector<float> b_y;
for (int j = randindx * 784; j < (randindx + BATCH_SIZE) * 784; ++j) {
b_X.push_back(X_train[j]);
}
for (int k = randindx * 10; k < (randindx + BATCH_SIZE) * 10; ++k) {
b_y.push_back(y_train[k]);
}
// Feed forward
vector<float> a1 = relu(dot(b_X, W1, BATCH_SIZE, 784, 128));
vector<float> a2 = relu(dot(a1, W2, BATCH_SIZE, 128, 64));
cudaCheck(cudaMemcpy(d_a1, a1.data(), sizeof(float) * a1.size(), cudaMemcpyHostToDevice));
cudaCheck(cudaMemcpy(d_a2, a2.data(), sizeof(float) * a2.size(), cudaMemcpyHostToDevice));

vector<float> yhat = softmax(dot(a2, W3, BATCH_SIZE, 64, 10), 10);


// Back propagation
vector<float> dyhat = (yhat - b_y);
cudaCheck(cudaMemcpy(d_dyhat, dyhat.data(), sizeof(float) * dyhat.size(), cudaMemcpyHostToDevice));
cudaCheck(cudaMemcpy(d_b_X, b_X.data(), sizeof(float) * b_X.size(), cudaMemcpyHostToDevice));
cudaCheck(cudaMemcpy(d_b_Y, b_y.data(), sizeof(float) * b_y.size(), cudaMemcpyHostToDevice));


//parallel: one device thread orchestrates the backward pass and launches
//the kdot child kernels (requires dynamic parallelism)
train << <1, 1 >> > (d_W1, d_W2, d_W3, d_b_X, d_b_Y, d_a2, d_a1, d_dyhat, d_dW3, d_dW2, d_dW1, d_dz2, d_dz1);
cudaCheck(cudaDeviceSynchronize());

// Copy the updated weights back so the next host-side forward pass
// actually uses them; without this the network never learns.
cudaCheck(cudaMemcpy(W1.data(), d_W1, sizeof(float) * 784 * 128, cudaMemcpyDeviceToHost));
cudaCheck(cudaMemcpy(W2.data(), d_W2, sizeof(float) * 128 * 64, cudaMemcpyDeviceToHost));
cudaCheck(cudaMemcpy(W3.data(), d_W3, sizeof(float) * 64 * 10, cudaMemcpyDeviceToHost));



if ((i + 1) % 100 == 0) {
cout << "-----------------------------------------------Epoch " << i + 1 << "--------------------------------------------------" << "\n";
cout << "Predictions:" << "\n";
print(yhat, 10, 10);
cout << "Ground truth:" << "\n";
print(b_y, 10, 10);
vector<float> loss_m = yhat - b_y;
float loss = 0.0;
for (unsigned k = 0; k < BATCH_SIZE * 10; ++k) {
loss += loss_m[k] * loss_m[k];
}
cout << " Loss " << loss / BATCH_SIZE << "\n";
cout << "--------------------------------------------End of Epoch :(------------------------------------------------" << "\n";
};
};

//free device memory
cudaFree(d_dW1);
cudaFree(d_dW2);
cudaFree(d_dW3);
cudaFree(d_dyhat);
cudaFree(d_dz1);
cudaFree(d_dz2);
cudaFree(d_W1);
cudaFree(d_W2);
cudaFree(d_W3);
cudaFree(d_b_X);
cudaFree(d_b_Y);

cudaDeviceReset();
delete[] fW1;
delete[] fW2;
delete[] fW3;
return 0;
}
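// Build note (an assumption, not stated on the original page): train()
// launches kdot from device code, so this needs CUDA dynamic parallelism,
// e.g. with a pre-12.0 toolkit:
//   nvcc -arch=sm_35 -rdc=true ms3.cu -o ms3 -lcudadevrt
// Device-side cudaDeviceSynchronize() was deprecated in CUDA 11.6 and
// removed in 12.0, so newer toolkits would need the backward pass
// orchestrated from the host instead.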