DPS921/Intel Math Kernel Library

=== Code Samples ===
These samples were taken directly from Intel's oneAPI code examples, available in the [https://github.com/oneapi-src/oneAPI-samples oneAPI-samples] GitHub repository (the vector-add and matrix_mul samples under DirectProgramming/DPC++/DenseLinearAlgebra).
==== Vector Add ====
<code>
//==============================================================
// Vector Add is the equivalent of a Hello, World! sample for data parallel
// programs. Building and running the sample verifies that your development
// environment is setup correctly and demonstrates the use of the core features
// of DPC++. This sample runs on both CPU and GPU (or FPGA). When run, it
// computes on both the CPU and offload device, then compares results. If the
// code executes on both CPU and offload device, the device name and a success
// message are displayed. And, your development environment is setup correctly!
//
// For comprehensive instructions regarding DPC++ Programming, go to
// https://software.intel.com/en-us/oneapi-programming-guide and search based on
// relevant terms noted in the comments.
//
// DPC++ material used in the code sample:
// • A one dimensional array of data shared between CPU and offload device.
// • A device queue and kernel.
//==============================================================
// Copyright © Intel Corporation
//
// SPDX-License-Identifier: MIT
// =============================================================
#include <CL/sycl.hpp>
#include <array>
#include <iostream>
#if FPGA || FPGA_EMULATOR
#include <CL/sycl/INTEL/fpga_extensions.hpp>
#endif

using namespace sycl;

// Array size for this example.
constexpr size_t array_size = 10000;

// Create an exception handler for asynchronous SYCL exceptions
static auto exception_handler = [](sycl::exception_list e_list) {
  for (std::exception_ptr const &e : e_list) {
    try {
      std::rethrow_exception(e);
    } catch (std::exception const &e) {
#if _DEBUG
      std::cout << "Failure" << std::endl;
#endif
      std::terminate();
    }
  }
};

//************************************
// Vector add in DPC++ on device: returns sum in 4th parameter "sum".
//************************************
void VectorAdd(queue &q, const int *a, const int *b, int *sum, size_t size) {
  // Create the range object for the arrays.
  range<1> num_items{size};

  // Use parallel_for to run vector addition in parallel on device. This
  // executes the kernel.
  // 1st parameter is the number of work items.
  // 2nd parameter is the kernel, a lambda that specifies what to do per
  // work item. The parameter of the lambda is the work item id.
  // DPC++ supports unnamed lambda kernel by default.
  auto e = q.parallel_for(num_items, [=](auto i) { sum[i] = a[i] + b[i]; });

  // q.parallel_for() is an asynchronous call. DPC++ runtime enqueues and runs
  // the kernel asynchronously. Wait for the asynchronous call to complete.
  e.wait();
}

//************************************
// Initialize the array from 0 to array_size - 1
//************************************
void InitializeArray(int *a, size_t size) {
  for (size_t i = 0; i < size; i++) a[i] = i;
}

//************************************
// Demonstrate vector add both in sequential on CPU and in parallel on device.
//************************************
int main() {
  // Create device selector for the device of your interest.
#if FPGA_EMULATOR
  // DPC++ extension: FPGA emulator selector on systems without FPGA card.
  INTEL::fpga_emulator_selector d_selector;
#elif FPGA
  // DPC++ extension: FPGA selector on systems with FPGA card.
  INTEL::fpga_selector d_selector;
#else
  // The default device selector will select the most performant device.
  default_selector d_selector;
#endif

  try {
    queue q(d_selector, exception_handler);

    // Print out the device information used for the kernel code.
    std::cout << "Running on device: "
              << q.get_device().get_info<info::device::name>() << "\n";
    std::cout << "Vector size: " << array_size << "\n";

    // Create arrays with "array_size" to store input and output data. Allocate
    // unified shared memory so that both CPU and device can access them.
    int *a = malloc_shared<int>(array_size, q);
    int *b = malloc_shared<int>(array_size, q);
    int *sum_sequential = malloc_shared<int>(array_size, q);
    int *sum_parallel = malloc_shared<int>(array_size, q);

    if ((a == nullptr) || (b == nullptr) || (sum_sequential == nullptr) ||
        (sum_parallel == nullptr)) {
      if (a != nullptr) free(a, q);
      if (b != nullptr) free(b, q);
      if (sum_sequential != nullptr) free(sum_sequential, q);
      if (sum_parallel != nullptr) free(sum_parallel, q);

      std::cout << "Shared memory allocation failure.\n";
      return -1;
    }

    // Initialize input arrays with values from 0 to array_size - 1
    InitializeArray(a, array_size);
    InitializeArray(b, array_size);

    // Compute the sum of the two arrays in sequential for validation.
    for (size_t i = 0; i < array_size; i++) sum_sequential[i] = a[i] + b[i];

    // Vector addition in DPC++.
    VectorAdd(q, a, b, sum_parallel, array_size);

    // Verify that the two arrays are equal.
    for (size_t i = 0; i < array_size; i++) {
      if (sum_parallel[i] != sum_sequential[i]) {
        std::cout << "Vector add failed on device.\n";
        return -1;
      }
    }

    int indices[]{0, 1, 2, (array_size - 1)};
    constexpr size_t indices_size = sizeof(indices) / sizeof(int);

    // Print out the result of vector add.
    for (int i = 0; i < indices_size; i++) {
      int j = indices[i];
      if (i == indices_size - 1) std::cout << "...\n";
      std::cout << "[" << j << "]: " << j << " + " << j << " = "
                << sum_sequential[j] << "\n";
    }

    free(a, q);
    free(b, q);
    free(sum_sequential, q);
    free(sum_parallel, q);
  } catch (exception const &e) {
    std::cout << "An exception is caught while adding two vectors.\n";
    std::terminate();
  }

  std::cout << "Vector add successfully completed on device.\n";
  return 0;
}
</code>
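The Vector Add sample above is built around unified shared memory (USM): pointers returned by malloc_shared are valid on both the host and the offload device, so the kernel can read and write them directly. The short sketch below is not part of the Intel samples; the array size and the doubling kernel are arbitrary choices for illustration. It strips the sample down to its core pattern: allocate shared memory, launch a parallel_for kernel on a queue, wait, and free.

```cpp
// Minimal USM sketch (illustration only, not from the Intel samples).
#include <CL/sycl.hpp>
#include <iostream>

int main() {
  sycl::queue q{sycl::default_selector{}};  // pick the most performant available device
  constexpr size_t n = 16;

  // malloc_shared memory is accessible from both host and device.
  int *data = sycl::malloc_shared<int>(n, q);
  for (size_t i = 0; i < n; i++) data[i] = static_cast<int>(i);

  // Launch the kernel directly on the queue and wait for it to finish.
  q.parallel_for(sycl::range<1>{n}, [=](auto i) { data[i] *= 2; }).wait();

  std::cout << "data[5] = " << data[5] << "\n";  // host reads the result: 10
  sycl::free(data, q);
  return 0;
}
```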
==== Matrix Mul ====
```
//==============================================================
// Copyright © 2020 Intel Corporation
//
// SPDX-License-Identifier: MIT
// =============================================================

/**
 * Matrix_mul multiplies two large matrices both on the CPU and the offload device,
 * then compares results. If the code executes on both CPU and the offload
 * device, the name of the offload device and a success message are displayed.
 *
 * For comprehensive instructions regarding DPC++ Programming, go to
 * https://software.intel.com/en-us/oneapi-programming-guide and search based on
 * relevant terms noted in the comments.
 */

#include <CL/sycl.hpp>
#include <iostream>
#include <limits>

// dpc_common.hpp can be found in the dev-utilities include folder.
// e.g., $ONEAPI_ROOT/dev-utilities/<version>/include/dpc_common.hpp
#include "dpc_common.hpp"

using namespace std;
using namespace sycl;

/**
 * Each element of the product matrix c[i][j] is computed from a unique row and
 * column of the factor matrices, a[i][k] and b[k][j]
 */

// Matrix size constants.
constexpr int m_size = 150 * 8;  // Must be a multiple of 8.
constexpr int M = m_size / 8;
constexpr int N = m_size / 4;
constexpr int P = m_size / 2;

/**
 * Perform matrix multiplication on host to verify results from device.
 */
int VerifyResult(float (*c_back)[P]);

int main() {
  // Host memory buffer that device will write data back before destruction.
  float(*c_back)[P] = new float[M][P];

  // Initialize c_back
  for (int i = 0; i < M; i++)
    for (int j = 0; j < P; j++) c_back[i][j] = 0.0f;

  // Initialize the device queue with the default selector. The device queue is
  // used to enqueue kernels. It encapsulates all states needed for execution.
  try {
    queue q(default_selector{}, dpc_common::exception_handler);

    cout << "Device: " << q.get_device().get_info<info::device::name>() << "\n";

    // Create 2D buffers for matrices, buffer c is bound with host memory c_back
    buffer<float, 2> a_buf(range(M, N));
    buffer<float, 2> b_buf(range(N, P));
    buffer c_buf(reinterpret_cast<float *>(c_back), range(M, P));

    cout << "Problem size: c(" << M << "," << P << ") = a(" << M << "," << N
         << ") * b(" << N << "," << P << ")\n";

    // Using three command groups to illustrate execution order. The use of
    // first two command groups for initializing matrices is not the most
    // efficient way. It just demonstrates the implicit multiple command group
    // execution ordering.

    // Submit command group to queue to initialize matrix a
    q.submit([&](auto &h) {
      // Get write only access to the buffer on a device
      accessor a(a_buf, h, write_only);

      // Execute kernel.
      h.parallel_for(range(M, N), [=](auto index) {
        // Each element of matrix a is 1.
        a[index] = 1.0f;
      });
    });

    // Submit command group to queue to initialize matrix b
    q.submit([&](auto &h) {
      // Get write only access to the buffer on a device
      accessor b(b_buf, h, write_only);

      // Execute kernel.
      h.parallel_for(range(N, P), [=](auto index) {
        // Each column of b is the sequence 1,2,...,N
        b[index] = index[0] + 1.0f;
      });
    });

    // Submit command group to queue to multiply matrices: c = a * b
    q.submit([&](auto &h) {
      // Read from a and b, write to c
      accessor a(a_buf, h, read_only);
      accessor b(b_buf, h, read_only);
      accessor c(c_buf, h, write_only);

      int width_a = a_buf.get_range()[1];

      // Execute kernel.
      h.parallel_for(range(M, P), [=](auto index) {
        // Get global position in Y direction.
        int row = index[0];
        // Get global position in X direction.
        int col = index[1];

        float sum = 0.0f;

        // Compute the result of one element of c
        for (int i = 0; i < width_a; i++) {
          sum += a[row][i] * b[i][col];
        }

        c[index] = sum;
      });
    });
  } catch (sycl::exception const &e) {
    cout << "An exception is caught while multiplying matrices.\n";
    terminate();
  }

  int result;
  cout << "Result of matrix multiplication using DPC++: ";
  result = VerifyResult(c_back);
  delete[] c_back;

  return result;
}

bool ValueSame(float a, float b) {
  return fabs(a - b) < numeric_limits<float>::epsilon();
}

int VerifyResult(float (*c_back)[P]) {
  // Check that the results are correct by comparing with host computing.
  int i, j, k;

  // 2D arrays on host side.
  float(*a_host)[N] = new float[M][N];
  float(*b_host)[P] = new float[N][P];
  float(*c_host)[P] = new float[M][P];

  // Each element of matrix a is 1.
  for (i = 0; i < M; i++)
    for (j = 0; j < N; j++) a_host[i][j] = 1.0f;

  // Each column of b_host is the sequence 1,2,...,N
  for (i = 0; i < N; i++)
    for (j = 0; j < P; j++) b_host[i][j] = i + 1.0f;

  // c_host is initialized to zero.
  for (i = 0; i < M; i++)
    for (j = 0; j < P; j++) c_host[i][j] = 0.0f;

  for (i = 0; i < M; i++) {
    for (k = 0; k < N; k++) {
      // Each element of the product is just the sum 1+2+...+n
      for (j = 0; j < P; j++) {
        c_host[i][j] += a_host[i][k] * b_host[k][j];
      }
    }
  }

  bool mismatch_found = false;

  // Compare host side results with the result buffer from device side: print
  // mismatched data 5 times only.
  int print_count = 0;

  for (i = 0; i < M; i++) {
    for (j = 0; j < P; j++) {
      if (!ValueSame(c_back[i][j], c_host[i][j])) {
        cout << "Fail - The result is incorrect for element: [" << i << ", "
             << j << "], expected: " << c_host[i][j]
             << ", but found: " << c_back[i][j] << "\n";
        mismatch_found = true;
        print_count++;
        if (print_count == 5) break;
      }
    }

    if (print_count == 5) break;
  }

  delete[] a_host;
  delete[] b_host;
  delete[] c_host;

  if (!mismatch_found) {
    cout << "Success - The results are correct!\n";
    return 0;
  } else {
    cout << "Fail - The results mismatch!\n";
    return -1;
  }
}
```
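Unlike the Vector Add sample, the matrix multiplication sample uses the buffer/accessor model: data is wrapped in a sycl::buffer, each command group requests access through an accessor, and the runtime copies results back to the host when the buffer is destroyed. The sketch below is illustration only, assuming the same DPC++ toolchain as the samples; the vector of eight floats and the add-2 kernel are arbitrary choices.

```cpp
// Minimal buffer/accessor sketch (illustration only, not from the Intel samples).
#include <CL/sycl.hpp>
#include <iostream>
#include <vector>

int main() {
  sycl::queue q{sycl::default_selector{}};
  std::vector<float> host(8, 1.0f);

  {
    // The buffer manages the host data for its lifetime.
    sycl::buffer<float, 1> buf(host.data(), sycl::range<1>(host.size()));

    q.submit([&](sycl::handler &h) {
      // Request read/write access to the buffer inside this command group.
      sycl::accessor acc(buf, h, sycl::read_write);
      h.parallel_for(sycl::range<1>(host.size()),
                     [=](auto i) { acc[i] += 2.0f; });
    });
  }  // buffer destruction waits for the kernel and copies data back to 'host'

  std::cout << "host[0] = " << host[0] << "\n";  // prints 3
  return 0;
}
```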