GPU621/Threadless Horsemen

From CDOT Wiki
Revision as of 17:27, 25 November 2018 by Tsarkarcd (talk | contribs) (Comparing Multi-threading in Julia vs OpenMP)
Jump to: navigation, search

Comparing Multi-threading in Julia vs OpenMP

Group Members

  1. Tanvir Sarkar
  2. Nathan Misener

Introduction: The Julia Programming Language

  • used for scientific computing
  • alternative to matlab
  • faster than python, not as fast c, has a repl for quick edit-run debug cycles

Julia's Forms of Parallelism

  • multi-threading (our focus)
  • multi-core / distributed processing (like mpi?)
  • coroutines / green threads

OpenMP vs Julia Code

  • Julia code is more concise than C/C++ (also simpler to initialize arrays, don't need to manage memory)
  • Note that OpenMp code uses loop interchange (ikj) and Julia doesn't (ijk), which will be explained in the next section
OpenMP Julia Multi-threading
#include <omp.h>
#include <iostream>
#include <iomanip>
#include <cstdlib>
#include <chrono>

#pragma once
#define N 512
typedef double array[N];

// matmul returns the product c = a * b
//
void matmul(const double a[][N], const double b[][N], double c[][N], int n) {
    
    #pragma omp parallel for
    for (int i = 0; i < n; i++) {
        for (int k = 0; k < n; k++) {
            for (int j = 0; j < n; j++) {
                c[i][j] += a[i][k] * b[k][j];
            }
        }
    }
}

// checksum returns the sum of the coefficients in matrix x[N][N]
//
double checksum(const double x[][N]) {
    double sum = 0.0;

    #pragma omp parallel for
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            sum += x[i][j];
        }
    }
    return sum;
}

// initialize initializes matrix a[N][N]
//
void initialize(double a[][N]) {

    #pragma omp parallel for
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            a[i][j] = static_cast<double>(i * j) / (N * N);
        }
    }
}

void reportTime(const char* msg, std::chrono::steady_clock::duration span) {
    auto ms = std::chrono::duration_cast<std::chrono::milliseconds>(span);
    std::cout << msg << ": " <<
        ms.count() << " milliseconds" << std::endl;
}

int main(int argc, char** argv) {
    if (argc != 1) {
        std::cerr << "*** Incorrect number of arguments ***\n";
        std::cerr << "Usage: " << argv[0] << "\n";
        return 1;
    }
    std::cout << std::fixed << std::setprecision(2);

    // allocate memory for matrices
    double* a = new double[N*N];
    double* b = new double[N*N];
    double* c = new double[N*N];
    array* aa = (array*)a;
    array* bb = (array*)b;
    array* cc = (array*)c;
    std::cout << "Matrix Multiplication " << N << " by " << N << std::endl;

    // initialize a and b
    std::chrono::steady_clock::time_point ts, te;
    ts = std::chrono::steady_clock::now();
    initialize(aa);
    initialize(bb);
    double* t = c;
    for (int i = 0; i < N * N; i++)
        *t++ = 0.0;
    te = std::chrono::steady_clock::now();
    reportTime("initialization", te - ts);

    // multiply matrices a and b
    ts = std::chrono::steady_clock::now();
    matmul(aa, bb, cc, N);
    te = std::chrono::steady_clock::now();
    reportTime("execution", te - ts);
    std::cout << "checksum = " << checksum(cc) << std::endl;

    // deallocate memory
    delete[] a;
    delete[] b;
    delete[] c;

    std::cout << "max threads: " << omp_get_max_threads() << std::endl;
}
N = 512

function matmul(a, b, c, n)
    Threads.@threads for i = 1:n
        for j = 1:n
            for k = 1:n
                c[i,j] += a[i,k] * b[k,j]
            end
        end
    end

    return c
end 

function checksum(x)
    sum = 0.0

    Threads.@threads for i = 1:N
        for j = 1:N
            sum += x[i,j]
        end
    end

    return sum
end

# creates and returns 2d array of zeroes
function initialize()
    a = zeros(Float64, N, N)

    Threads.@threads for i = 1:N
        for j = 1:N
            a[i,j] = (convert(Float64,((i-1) * (j-1))) / convert(Float64, (N * N)))
        end
    end

    return a
end

a = initialize();
b = initialize();
c = initialize();

start_t = time()

d = matmul(a, b, c, N)

end_t = time()
run_time = end_t - start_t

sum = checksum(d)

println("checksum = $sum")

println("matmul func time (s):")
println(run_time)

println("num threads: ")
nt = Threads.nthreads()
println(nt)

More code here: https://github.com/tsarkarsc/parallel_prog

OpenMP vs Julia Results

  • add graphs
  • recap loop interchange benefits for openmp (locality of reference)
  • discuss julia storing arrays as column major, loop interchange was worse for julia
  • discuss different levels of optimization

Conclusion

  • summary of everything