=====C++11=====
The C++11 thread library provides the mutex class to support mutual exclusion and synchronization. <br>
The mutex class is a synchronization primitive that can be used to protect shared data from being accessed by multiple threads at once.
std::mutex is usually not accessed directly; instead, std::unique_lock and std::lock_guard are used to manage locking.
OpenMP, unfortunately, does not support this style of asynchronous multi-threading, as it is designed for parallelism, not concurrency.
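For illustration, here is a minimal sketch of std::lock_guard protecting shared data; the counter, the thread count, and the function names are illustrative, not part of the course code:

 #include <iostream>
 #include <mutex>
 #include <thread>
 #include <vector>
 
 std::mutex m;     // guards counter
 long counter = 0; // shared data
 
 void increment(int times) {
     for (int i = 0; i < times; ++i) {
         std::lock_guard<std::mutex> lock(m); // acquires m; releases it when lock leaves scope
         ++counter;
     }
 }
 
 int main() {
     std::vector<std::thread> threads;
     for (int t = 0; t < 4; ++t)
         threads.push_back(std::thread(increment, 10000));
     for (auto& t : threads)
         t.join();
     std::cout << "counter = " << counter << std::endl; // always 40000 with the lock in place
 }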
 
===Programming Models===
====SPMD====
 
An example of the SPMD programming model using std::thread: every thread runs the same function, uses its thread ID to select the loop iterations it handles, and accumulates its partial result into a shared std::atomic&lt;double&gt;.
 
 #include <iostream>
 #include <iomanip>
 #include <cstdlib>
 #include <chrono>
 #include <vector>
 #include <thread>
 #include <atomic>
 using namespace std::chrono;
 
 std::atomic<double> pi(0.0); // shared accumulator, initialized so it does not start indeterminate
 
 void reportTime(const char* msg, steady_clock::duration span) {
     auto ms = duration_cast<milliseconds>(span);
     std::cout << msg << " - took - " <<
         ms.count() << " milliseconds" << std::endl;
 }
 
 // SPMD worker: thread ID processes every nthrds-th step, starting at its own ID
 void run(int ID, double stepSize, int nthrds, int n) {
     double x;
     double sum = 0.0;
     for (int i = ID; i < n; i += nthrds) {
         x = (i + 0.5) * stepSize;
         sum += 4.0 / (1.0 + x * x);
     }
     sum *= stepSize;
     // std::atomic<double> has no fetch_add in C++11; pi = pi + sum would be two
     // separate atomic operations and could lose updates, so retry with CAS instead
     double expected = pi.load();
     while (!pi.compare_exchange_weak(expected, expected + sum))
         ; // expected is refreshed on failure; loop until the addition applies atomically
 }
 
 int main(int argc, char** argv) {
     if (argc != 3) {
         std::cerr << argv[0] << ": invalid number of arguments\n";
         return 1;
     }
     int n = std::atoi(argv[1]);
     int numThreads = std::atoi(argv[2]);
     steady_clock::time_point ts, te;
     // calculate pi by integrating the area under 1/(1 + x^2) in n steps
     ts = steady_clock::now();
     std::vector<std::thread> threads(numThreads);
     double stepSize = 1.0 / (double)n;
     for (int ID = 0; ID < numThreads; ID++)
         threads[ID] = std::thread(run, ID, stepSize, numThreads, n);
     for (int i = 0; i < numThreads; i++)
         threads[i].join();
     te = steady_clock::now(); // stop the clock only after all threads have joined
     std::cout << "n = " << n << std::fixed << std::setprecision(15) <<
         "\n pi(exact) = " << 3.141592653589793 <<
         "\n pi(calcd) = " << pi << std::endl;
     reportTime("Integration", te - ts);
     // terminate
     char c;
     std::cout << "Press Enter key to exit ... ";
     std::cin.get(c);
 }
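For comparison, the same SPMD pattern maps directly onto OpenMP, where the runtime supplies the thread ID and thread count. The following is a minimal sketch, not code from the original workshop:

 #include <iostream>
 #include <iomanip>
 #include <cstdlib>
 #include <omp.h>
 
 int main(int argc, char** argv) {
     if (argc != 2) {
         std::cerr << argv[0] << ": invalid number of arguments\n";
         return 1;
     }
     int n = std::atoi(argv[1]);
     double stepSize = 1.0 / (double)n;
     double pi = 0.0;
     #pragma omp parallel
     {
         int nthrds = omp_get_num_threads();
         int ID = omp_get_thread_num();
         double sum = 0.0;
         // same stride-by-thread-count loop as the std::thread version
         for (int i = ID; i < n; i += nthrds) {
             double x = (i + 0.5) * stepSize;
             sum += 4.0 / (1.0 + x * x);
         }
         #pragma omp atomic
         pi += sum * stepSize;
     }
     std::cout << std::fixed << std::setprecision(15)
         << "pi(calcd) = " << pi << std::endl;
 }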
====Question & Answer====
Compiler vendors are also proponents of other execution models (e.g. Intel with Cilk and TBB, GCC with C++11, etc.), and x86 is usually considered an "experimental" platform (other vendors are usually much more conservative).
 
 
====OpenMP code====
 // Workshop 3: scan and reduce implemented with OpenMP
 
 template <typename T, typename R, typename C, typename S>
 int scan(
     const T* in,  // source data
     T* out,       // output data
     int size,     // size of source, output data sets
     R reduce,     // reduction expression
     C combine,    // combine expression
     S scan_fn,    // scan function (exclusive or inclusive)
     T initial     // initial value
 )
 {
     int nthreads = 1;
     if (size > 0) {
         // one slot per potential thread
         int max_threads = omp_get_max_threads();
         T* reduced = new T[max_threads]; // per-tile partial reductions
         T* scanRes = new T[max_threads]; // exclusive scan of those reductions
         #pragma omp parallel
         {
             int ntiles = omp_get_num_threads(); // number of tiles
             int itile = omp_get_thread_num();   // this thread's own tile
             int tile_size = (size - 1) / ntiles + 1;
             int last_tile = ntiles - 1;
             int last_tile_size = size - last_tile * tile_size;
             if (itile == 0)
                 nthreads = ntiles;
             // step 1 - each thread reduces its own tile
             reduced[itile] = reduce(in + itile * tile_size,
                 itile == last_tile ? last_tile_size : tile_size, combine, T(0));
             #pragma omp barrier
             // step 2 - one thread performs an exclusive scan on the per-tile
             // reductions and stores the results in scanRes[]
             #pragma omp single
             excl_scan(reduced, scanRes, ntiles, combine, initial);
             // the single construct ends with an implicit barrier
             // step 3 - each thread scans its own tile, seeded from scanRes[]
             scan_fn(in + itile * tile_size, out + itile * tile_size,
                 itile == last_tile ? last_tile_size : tile_size, combine,
                 scanRes[itile]);
         }
         delete[] reduced;
         delete[] scanRes;
     }
     return nthreads;
 }
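The template above leaves reduce, excl_scan, and the scan function to the caller. Their signatures can be inferred from the call sites; a minimal serial sketch under that assumption (the real workshop versions may differ) is:

 #include <functional>
 
 template <typename T, typename C>
 T reduce(const T* in, int n, C combine, T initial) {
     for (int i = 0; i < n; i++)
         initial = combine(initial, in[i]);
     return initial;
 }
 
 template <typename T, typename C>
 void excl_scan(const T* in, T* out, int n, C combine, T initial) {
     if (n > 0) {
         out[0] = initial; // exclusive: out[i] excludes in[i]
         for (int i = 1; i < n; i++)
             out[i] = combine(out[i - 1], in[i - 1]);
     }
 }
 
 template <typename T, typename C>
 void incl_scan(const T* in, T* out, int n, C combine, T initial) {
     for (int i = 0; i < n; i++) {
         initial = combine(initial, in[i]); // inclusive: out[i] includes in[i]
         out[i] = initial;
     }
 }

With these in place, an inclusive prefix sum over ints would be invoked as scan(a, b, size, reduce&lt;int, std::plus&lt;int&gt;&gt;, std::plus&lt;int&gt;(), incl_scan&lt;int, std::plus&lt;int&gt;&gt;, 0).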
 
====C++11 code====
 
The following program compares the cost of dispatching work to 16 threads 100,000 times: creating fresh std::thread objects on each iteration versus handing the work to an OpenMP parallel region.

 #include <iostream>
 #include <omp.h>
 #include <chrono>
 #include <vector>
 #include <thread>
 using namespace std;
 
 void doNothing() {}
 
 // returns the elapsed time in seconds for the chosen algorithm
 double run(int algorithmToRun) {
     auto startTime = std::chrono::system_clock::now();
     for (int j = 1; j < 100000; ++j) {
         if (algorithmToRun == 1) {
             // create, run, and destroy 16 threads every iteration
             vector<thread> threads;
             for (int i = 0; i < 16; i++)
                 threads.push_back(thread(doNothing));
             for (auto& t : threads)
                 t.join();
         }
         else if (algorithmToRun == 2) {
             // hand the same work to OpenMP's worker threads
             #pragma omp parallel for num_threads(16)
             for (int i = 0; i < 16; i++)
                 doNothing();
         }
     }
     auto endTime = std::chrono::system_clock::now();
     std::chrono::duration<double> elapsed_seconds = endTime - startTime;
     return elapsed_seconds.count(); // fractional seconds; returning an int would truncate the measurement
 }
 
 int main() {
     double cppt = run(1);
     double ompt = run(2);
     cout << "std::thread: " << cppt << " s" << endl;
     cout << "OpenMP:      " << ompt << " s" << endl;
     return 0;
 }

Typical OpenMP runtimes keep their worker threads alive between parallel regions, so the OpenMP variant avoids the thread creation and destruction cost that the std::thread loop pays on every iteration.
