=====C++11=====
The C++11 thread library provides the mutex class to support mutual exclusion and synchronization. <br>
The mutex class is a synchronization primitive that can be used to protect shared data from being accessed by multiple threads at once.
std::mutex is usually not accessed directly; instead, std::unique_lock and std::lock_guard are used to manage locking.
OpenMP, unfortunately, does not support this style of asynchronous multi-threading, as it is designed for parallelism, not concurrency.
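For illustration, here is a minimal sketch of std::lock_guard protecting shared data; the counter, the thread count, and the function names are illustrative, not part of the course code:

 #include <iostream>
 #include <mutex>
 #include <thread>
 #include <vector>
 
 std::mutex m;     // guards counter
 long counter = 0; // shared data
 
 void increment(int times) {
     for (int i = 0; i < times; ++i) {
         std::lock_guard<std::mutex> lock(m); // acquires m; releases it when lock leaves scope
         ++counter;
     }
 }
 
 int main() {
     std::vector<std::thread> threads;
     for (int t = 0; t < 4; ++t)
         threads.push_back(std::thread(increment, 10000));
     for (auto& t : threads)
         t.join();
     std::cout << "counter = " << counter << std::endl; // always 40000 with the lock in place
 }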
 
===Programming Models===
====SPMD====
 
An example of the SPMD programming model using std::thread: every thread runs the same function, uses its thread ID to select the loop iterations it handles, and accumulates its partial result into a shared std::atomic&lt;double&gt;.
 
 #include <iostream>
 #include <iomanip>
 #include <cstdlib>
 #include <chrono>
 #include <vector>
 #include <thread>
 #include <atomic>
 using namespace std::chrono;
 
 std::atomic<double> pi(0.0); // shared accumulator, initialized so it does not start indeterminate
 
 void reportTime(const char* msg, steady_clock::duration span) {
     auto ms = duration_cast<milliseconds>(span);
     std::cout << msg << " - took - " <<
         ms.count() << " milliseconds" << std::endl;
 }
 
 // SPMD worker: thread ID processes every nthrds-th step, starting at its own ID
 void run(int ID, double stepSize, int nthrds, int n) {
     double x;
     double sum = 0.0;
     for (int i = ID; i < n; i += nthrds) {
         x = (i + 0.5) * stepSize;
         sum += 4.0 / (1.0 + x * x);
     }
     sum *= stepSize;
     // std::atomic<double> has no fetch_add in C++11; pi = pi + sum would be two
     // separate atomic operations and could lose updates, so retry with CAS instead
     double expected = pi.load();
     while (!pi.compare_exchange_weak(expected, expected + sum))
         ; // expected is refreshed on failure; loop until the addition applies atomically
 }
 
 int main(int argc, char** argv) {
     if (argc != 3) {
         std::cerr << argv[0] << ": invalid number of arguments\n";
         return 1;
     }
     int n = std::atoi(argv[1]);
     int numThreads = std::atoi(argv[2]);
     steady_clock::time_point ts, te;
     // calculate pi by integrating the area under 1/(1 + x^2) in n steps
     ts = steady_clock::now();
     std::vector<std::thread> threads(numThreads);
     double stepSize = 1.0 / (double)n;
     for (int ID = 0; ID < numThreads; ID++)
         threads[ID] = std::thread(run, ID, stepSize, numThreads, n);
     for (int i = 0; i < numThreads; i++)
         threads[i].join();
     te = steady_clock::now(); // stop the clock only after all threads have joined
     std::cout << "n = " << n << std::fixed << std::setprecision(15) <<
         "\n pi(exact) = " << 3.141592653589793 <<
         "\n pi(calcd) = " << pi << std::endl;
     reportTime("Integration", te - ts);
     // terminate
     char c;
     std::cout << "Press Enter key to exit ... ";
     std::cin.get(c);
 }
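For comparison, the same SPMD pattern maps directly onto OpenMP, where the runtime supplies the thread ID and thread count. The following is a minimal sketch, not code from the original workshop:

 #include <iostream>
 #include <iomanip>
 #include <cstdlib>
 #include <omp.h>
 
 int main(int argc, char** argv) {
     if (argc != 2) {
         std::cerr << argv[0] << ": invalid number of arguments\n";
         return 1;
     }
     int n = std::atoi(argv[1]);
     double stepSize = 1.0 / (double)n;
     double pi = 0.0;
     #pragma omp parallel
     {
         int nthrds = omp_get_num_threads();
         int ID = omp_get_thread_num();
         double sum = 0.0;
         // same stride-by-thread-count loop as the std::thread version
         for (int i = ID; i < n; i += nthrds) {
             double x = (i + 0.5) * stepSize;
             sum += 4.0 / (1.0 + x * x);
         }
         #pragma omp atomic
         pi += sum * stepSize;
     }
     std::cout << std::fixed << std::setprecision(15)
         << "pi(calcd) = " << pi << std::endl;
 }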
====Question & Answer====
Compiler vendors are also proponents of other execution models (e.g. Intel with Cilk and TBB, GCC with C++11, etc.), and x86 is usually considered an "experimental" platform (other vendors are usually much more conservative).
 
 
====OpenMP code====
 // Workshop 3: scan and reduce implemented with OpenMP
 
 template <typename T, typename R, typename C, typename S>
 int scan(
     const T* in,  // source data
     T* out,       // output data
     int size,     // size of source, output data sets
     R reduce,     // reduction expression
     C combine,    // combine expression
     S scan_fn,    // scan function (exclusive or inclusive)
     T initial     // initial value
 )
 {
     int nthreads = 1;
     if (size > 0) {
         // one slot per potential thread
         int max_threads = omp_get_max_threads();
         T* reduced = new T[max_threads]; // per-tile partial reductions
         T* scanRes = new T[max_threads]; // exclusive scan of those reductions
         #pragma omp parallel
         {
             int ntiles = omp_get_num_threads(); // number of tiles
             int itile = omp_get_thread_num();   // this thread's own tile
             int tile_size = (size - 1) / ntiles + 1;
             int last_tile = ntiles - 1;
             int last_tile_size = size - last_tile * tile_size;
             if (itile == 0)
                 nthreads = ntiles;
             // step 1 - each thread reduces its own tile
             reduced[itile] = reduce(in + itile * tile_size,
                 itile == last_tile ? last_tile_size : tile_size, combine, T(0));
             #pragma omp barrier
             // step 2 - one thread performs an exclusive scan on the per-tile
             // reductions and stores the results in scanRes[]
             #pragma omp single
             excl_scan(reduced, scanRes, ntiles, combine, initial);
             // the single construct ends with an implicit barrier
             // step 3 - each thread scans its own tile, seeded from scanRes[]
             scan_fn(in + itile * tile_size, out + itile * tile_size,
                 itile == last_tile ? last_tile_size : tile_size, combine,
                 scanRes[itile]);
         }
         delete[] reduced;
         delete[] scanRes;
     }
     return nthreads;
 }
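The template above leaves reduce, excl_scan, and the scan function to the caller. Their signatures can be inferred from the call sites; a minimal serial sketch under that assumption (the real workshop versions may differ) is:

 #include <functional>
 
 template <typename T, typename C>
 T reduce(const T* in, int n, C combine, T initial) {
     for (int i = 0; i < n; i++)
         initial = combine(initial, in[i]);
     return initial;
 }
 
 template <typename T, typename C>
 void excl_scan(const T* in, T* out, int n, C combine, T initial) {
     if (n > 0) {
         out[0] = initial; // exclusive: out[i] excludes in[i]
         for (int i = 1; i < n; i++)
             out[i] = combine(out[i - 1], in[i - 1]);
     }
 }
 
 template <typename T, typename C>
 void incl_scan(const T* in, T* out, int n, C combine, T initial) {
     for (int i = 0; i < n; i++) {
         initial = combine(initial, in[i]); // inclusive: out[i] includes in[i]
         out[i] = initial;
     }
 }

With these in place, an inclusive prefix sum over ints would be invoked as scan(a, b, size, reduce&lt;int, std::plus&lt;int&gt;&gt;, std::plus&lt;int&gt;(), incl_scan&lt;int, std::plus&lt;int&gt;&gt;, 0).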
 
====C++11 code====
 
The following program compares the cost of dispatching work to 16 threads 100,000 times: creating fresh std::thread objects on each iteration versus handing the work to an OpenMP parallel region.

 #include <iostream>
 #include <omp.h>
 #include <chrono>
 #include <vector>
 #include <thread>
 using namespace std;
 
 void doNothing() {}
 
 // returns the elapsed time in seconds for the chosen algorithm
 double run(int algorithmToRun) {
     auto startTime = std::chrono::system_clock::now();
     for (int j = 1; j < 100000; ++j) {
         if (algorithmToRun == 1) {
             // create, run, and destroy 16 threads every iteration
             vector<thread> threads;
             for (int i = 0; i < 16; i++)
                 threads.push_back(thread(doNothing));
             for (auto& t : threads)
                 t.join();
         }
         else if (algorithmToRun == 2) {
             // hand the same work to OpenMP's worker threads
             #pragma omp parallel for num_threads(16)
             for (int i = 0; i < 16; i++)
                 doNothing();
         }
     }
     auto endTime = std::chrono::system_clock::now();
     std::chrono::duration<double> elapsed_seconds = endTime - startTime;
     return elapsed_seconds.count(); // fractional seconds; returning an int would truncate the measurement
 }
 
 int main() {
     double cppt = run(1);
     double ompt = run(2);
     cout << "std::thread: " << cppt << " s" << endl;
     cout << "OpenMP:      " << ompt << " s" << endl;
     return 0;
 }

Typical OpenMP runtimes keep their worker threads alive between parallel regions, so the OpenMP variant avoids the thread creation and destruction cost that the std::thread loop pays on every iteration.
