Changes

Jump to: navigation, search

The parallelizing Express

2,919 bytes added, 20:36, 17 March 2017
Assignment 2
=== Description ===
'''Removing CPU Bottleneck'''
 
Removing the old CPU bottleneck in the ColorTransfer/main.cpp:
}
return u;
}
</pre>
 
''' Added functions and changes'''
 
We wrote a matrix-by-vector multiplication function (<code>sgemv</code>, built on cuBLAS) and made a few minor adjustments to the main loop that applies the color shift to the target (the image to be modified).
 
Matrix by vector
<pre>
void sgemv(const float* h_A, const float* h_B, float* h_C, int n) {
// level 3 calculation: C = alpha * A * B + beta * C
float* devPtrA;
float* devPtrB;
float* devPtrC;
// ... allocate memory on the device
cudaMalloc((void**)&devPtrA, n * n * sizeof(float));
cudaMalloc((void**)&devPtrB, n * sizeof(float));
cudaMalloc((void**)&devPtrC, n * sizeof(float));
// ... create cuBLAS context
cublasHandle_t handle;
cublasStatus_t status;
status = cublasCreate(&handle);
if (status != CUBLAS_STATUS_SUCCESS) {
std::cerr << "***cublasCreate failed***\n";
return;
}
// ... copy host matrices to the device
status = cublasSetMatrix(n, n, sizeof(float), h_A, n, devPtrA, n);
if (status != CUBLAS_STATUS_SUCCESS) {
std::cerr << "***cublasSetMatrix A failed***\n";
return;
}
status = cublasSetMatrix(n sizeof(float), h_B, n, devPtrB, n);
if (status != CUBLAS_STATUS_SUCCESS) {
std::cerr << "***cublasSetVector B failed***\n";
return;
}
// ... calculate matrix-vector product
int ld_d_A = n;
int ld_d_B = n;
int ld_d_C = n;
 
float alpha = 1.0f;
float beta = 0.0f;
status = cublasSgemv(handle, CUBLAS_OP_N, n, n,
&alpha, devPtrA, ld_d_A, devPtrB, ld_d_B, &beta, devPtrC, ld_d_C);
if (status != CUBLAS_STATUS_SUCCESS) {
std::cerr << "***cublasSgemm failed***\n";
return;
}
// ... copy result matrix from the device to the host
status = cublasGetVector(n, sizeof(float), devPtrC, n, h_C, n);
if (status != CUBLAS_STATUS_SUCCESS) {
std::cerr << "***cublasGetVector C failed***\n";
return;
}
// ... destroy cuBLAS context
cublasDestroy(handle);
// ... deallocate device memory
cudaFree(&h_A);
cudaFree(&h_B);
cudaFree(&h_C);
}
</pre>
 
Changes to main loop
 
Old
<pre>
// Transform back from lab to RGB
// Visits every pixel (y, x) of `target` and rewrites it in place.
// NOTE(review): `v`, `eps`, `mlab2LMS` and `mLMS2RGB` are declared outside
// this excerpt; their exact types are not visible here.
for(int y=0; y<target.rows; y++) {
for(int x=0; x<target.cols; x++) {
// Read the pixel as a Color3d and multiply it by mlab2LMS.
v = target.at<Color3d>(y, x);
v = mlab2LMS * v;
// Replace each of the three components by 10^component; components that are
// not greater than -5.0 are set to eps instead.
for(int c=0; c<3; c++) v(c) = v(c) > -5.0 ? pow(10.0, v(c)) : eps;
 
// Multiply by mLMS2RGB and store the result back into the same pixel.
target.at<Color3d>(y, x) = mLMS2RGB * v;
}
}
</pre>
 
New
<pre>
// allocate host memory
float* h_C = new float[3]; // result
 
// Transform back from lab to RGB
for(int y=0; y<target.rows; y++) {
for(int x=0; x<target.cols; x++) {
v = target.at<Color3d>(y, x);
sgemv(&mlab2LMS, v, h_c);
memcpy(v, h_c, sizeof(Color3d));
 
for(int c=0; c<3; c++)
v(c) = v(c) > -5.0 ? pow(10.0, v(c)) : eps;
 
sgemv(&mLMS2RGB , v, h_c);
memcpy(target.at<Color3d>(y, x), h_c, sizeof(Color3d));
}
}
</pre>
94
edits

Navigation menu