Changes

Jump to: navigation, search

The parallelizing Express

697 bytes added, 20:49, 7 April 2017
Assignment 2
'''Removing CPU Bottleneck'''
Removing the old CPU bottleneck in ColorTransfer/main.cpp:
<pre>
''' Added functions and changes'''
To fix this issue, we wrote a device function which handles a matrix-by-vector multiplication. We also wrote a kernel which has the same logic as the host version, so that we could offload all the needed data to the kernel without needing to do multiple back-and-forth transfers. We also wrote a helper function which allocates and transfers all the OpenCV matrices into a form suitable for the kernel to deal with, and made a few minor adjustments to the main loop that deals with the color shift for the target (the image to be modified).
Matrix by vector
<pre>
__device__ void sgemvmatvec(const float* h_Ad_A, const float* h_Bd_B, float* h_C, int nd_C) { // level 2 calculation: C float sum = alpha * A * x + B * y float* devPtrA0; float* devPtrB; float* devPtrC; // ... allocate memory on the device cudaMallocfor ((void**)&devPtrA, n * n * sizeof(float))int i = 0; cudaMalloc((void**)&devPtrB, i < n * sizeof(float)); cudaMalloc((void**)&devPtrC, n * sizeof(float)); // ... create cuBLAS context cublasHandle_t handle; cublasStatus_t status; status = cublasCreate(&handle); if (status != CUBLAS_STATUS_SUCCESS++i) { std::cerr << "sum += d_A[i] *d_B[(i **cublasCreate failed***\n"; return) + tid];
}
  d_C[0] = sum;}<// ... copy host matrices to the devicepre> Kernel<pre> status = cublasSetMatrix__global__ void matvec_kernel(nfloat* d_A, nfloat* d_RGB2, sizeof(float)* d_LMS2, h_Afloat* d_C, const int n, devPtrAint targetrows, nint targetcols, float* d_Tar){ const double eps = 1.0e-4; if for (status !int y = CUBLAS_STATUS_SUCCESS0; y < targetrows; ++y) { std::cerr for (int x = 0; x << "targetcols; ++x) { memcpy(&d_A, h_Tar[y *3 + x], N *sizeof(float));  matvec(&d_A, &d_RGB2, d_C); memcpy(&d_A, h_C, N *cublasSetMatrix A failedsizeof(float));  for (int c = 0; c < 3; c++) d_A[c] = d_A[c] > -5.0 ? pow((double)10.0, (double)d_A[c]) : eps;  matvec(&d_A, &d_LMS2, d_C); memcpy(&h_Tar[y *3 + x], d_C, N **\n"sizeof(float)); return;}
}
}</pre> Helper<pre> inline void vecTransfer(float* h, Color3d* v){ status for (int j = cublasSetMatrix(n sizeof0; j < 3; ++j) h[j] = v->v[j];} //KERNEL Helper function does setup and launchvoid matvec_L(cv::Mat* mRGB2LMS, cv::Mat* mLMS2lab, float* h_C, int tarrow, int tarcol, float* h_Tar){ float *h_A, h_B*h_RGB2, *h_LMS2, *h_C; float *d_A, n*d_RGB2, devPtrB*d_LMS2, n*d_C;  int N = 3;  h_A = (float*)malloc(sizeof(float) * N); h_RGB2 = new float[mRGB2LMS->total()]; if h_LMS2 = new float[mLMS2LMS->total(status !)]; h_C = CUBLAS_STATUS_SUCCESS(float*)malloc(sizeof(float) {* N);  cudaMalloc((void**)&d_A, sizeof(float) * N); std::cerr << " cudaMalloc((void**)&d_RGB2, sizeof(float) *N *N); cudaMalloc((void*cublasSetVector B failed*)&d_LMS2, sizeof(float) *N *\n"N); return cudaMalloc((void**)&d_C, sizeof(float) * N)}Color3d vec;  // ... calculate copy vec and matrixto host pointers vecTransfer(h_A, vec); memcpy(h_RGB2, mRGB2LMS->data, mRGB2LMS-vector product>total()); int ld_d_A = memcpy(h_LMS2, mLMS2Lab->data, mLMS2Lab->total());  cudaMemcpy(d_A, h_A, sizeof(float) * N, cudaMemcpyHostToDevice); cudaMemcpy(d_RGB2, h_RGB2, sizeof(float) * N * N, cudaMemcpyHostToDevice); cudaMemcpy(d_LMS2, h_LMS2, sizeof(float) * N * N, cudaMemcpyHostToDevice);  matvec_kernel<<<N / BLOCK_SIZE + 1, BLOCK_SIZE>>>(d_A, d_RGB2, d_LMS2, d_C, N); //printf("error code: %s\n",cudaGetErrorString(cudaGetLastError()));  cudaMemcpy(h_C, d_C, sizeof(float) * N, cudaMemcpyDeviceToHost);  free(h_A); free(h_RGB2); int ld_d_B = nfree(h_LMS2); int ld_d_C = nfree(h_C);
float alpha = 1.0f; float beta = 0.0f; status = cublasSgemv(handle, CUBLAS_OP_N, n, n, &alpha, devPtrA, ld_d_A, devPtrB, ld_d_B, &beta, devPtrC, ld_d_C); if (status != CUBLAS_STATUS_SUCCESS) { std::cerr << "***cublasSgemm failed***\n"; return; } // ... copy result matrix from the device to the host status = cublasGetVector(n, sizeof(float), devPtrC, n, h_C, n); if cudaFree(status != CUBLAS_STATUS_SUCCESSd_A) { std::cerr << "***cublasGetVector C failed***\n"; return; } // ... destroy cuBLAS context cublasDestroy(handle); // ... deallocate device memory cudaFree(&h_Ad_RGB2); cudaFree(&h_Bd_LMS2); cudaFree(&h_Cd_C);
}
</pre>
 
Changes to main loop
Old
<pre>
// Transform back from lab to RGBfor(int y=0; y<target.rows; y++) { for(int x=0; x<target.cols; x++) { v = target.at<Color3d>(y, x); v = mlab2LMS * v; for(int c=0; c<3; c++) v(c) = v(c) > -5.0 ? pow(10.0, v(c)) : eps;
matvec_L(&v, &mlab2LMS, h_C); memcpy(&v, h_C, N * sizeof(float));  for (int c = 0; c < 3; c++) v(c) = v(c) > -5.0 ? pow(10.0, v(c)) : eps;  matvec_L(&v, &mLMS2RGB, h_C); memcpy(&target.at<Color3d>(y, x) = mLMS2RGB , h_C, N * vsizeof(float)); }
}
</pre>
New
<pre>
// allocate host memory float* h_C h_TARGET = new (float[3]; // result // Transform back from lab to RGBfor*)malloc(int y=0; y<target.rowstotatl()); y++) { formemcpy(int x=0; x<h_TARGET, &target.cols; x++) { v = data, target.at<Color3d>total(y, x)); sgemv matvec_L(&mlab2LMS, v&mLMS2RGB, h_c); memcpy(vh_C, h_ctarget.rows, sizeof(Color3d));  for(int c=0; c<3; c++) v(c) = v(c) > -5target.0 ? pow(10.0, v(c)) : eps;  sgemv(&mLMS2RGB cols, v, h_ch_TARGET); memcpy(&target.at<Color3d>(ydata, x)h_C, h_c, sizeoftarget.total(Color3d)); }}
</pre>
94
edits

Navigation menu