Changes

Jump to: navigation, search

The parallelizing Express

185 bytes added, 00:40, 8 April 2017
Assignment 2
// Multiplies a 3-element vector by a 3x3 matrix, cooperatively across threads.
//
// Each thread whose global index (threadIdx.x + blockIdx.x * blockDim.x) is
// 0, 1 or 2 computes exactly one output component:
//
//     d_C[tid] = sum over i in [0, 3) of d_A[i] * d_B[i * 3 + tid]
//
// Threads with a global index >= 3 do nothing.
//
// Parameters:
//   d_A  input vector, at least 3 floats.
//   d_B  matrix, at least 9 floats, read as d_B[i * 3 + tid].
//   d_C  output vector, at least 3 floats; element tid is written by thread tid.
//
// This function contains no barrier. When the three participating threads
// share d_A / d_C, the caller must synchronise before calling (so d_A is
// fully written) and after calling (before any thread reads another
// thread's component of d_C).
__device__ void matvec(float* d_A, float* d_B, float* d_C)
{
    const int tid = threadIdx.x + blockIdx.x * blockDim.x;

    if (tid < 3)
    {
        float sum = 0.0f;
        for (int i = 0; i < 3; ++i)
        {
            sum += d_A[i] * d_B[(i * 3) + tid];
        }

        // Each thread stores its own component. Writing every result to
        // d_C[0] would make the three threads race on one slot and leave
        // d_C[1] and d_C[2] unwritten.
        d_C[tid] = sum;
    }
}
</pre>
<pre>
// Applies a two-stage per-pixel transform to an image, in place in d_Tar.
//
// For every pixel, in order:
//   1. the pixel's 3 floats are loaded into d_A;
//   2. d_C = matvec(d_A, d_RGB2);
//   3. each component v of d_C is replaced by 10^v when v > -5, otherwise by
//      eps (1.0e-4), and stored back into d_A;
//   4. d_C = matvec(d_A, d_LMS2);
//   5. d_C is written back over the pixel in d_Tar.
//
// Launch / memory requirements:
//   * Only the threads with global index 0, 1 and 2 do arithmetic, one colour
//     component each. They must sit in the same block (blockDim.x >= 3)
//     because they are ordered with __syncthreads().
//   * Pixels are processed one after another; extra threads/blocks only
//     execute the (uniform) loops and barriers.
//   * d_A and d_C are scratch buffers of at least 3 floats each.
//   * d_RGB2 and d_LMS2 hold at least 9 floats each, laid out as matvec expects.
//   * NOTE(review): d_Tar is assumed to be a packed row-major image with
//     3 floats per pixel (targetrows * targetcols * 3 floats) - confirm
//     against the caller.
//   * n is accepted for interface compatibility and is not used.
__global__ void matvec_kernel(float* d_A, float* d_RGB2, float* d_LMS2, float* d_C,
    const int n, int targetrows, int targetcols, float* d_Tar)
{
    const float eps = 1.0e-4f;
    const int tid = threadIdx.x + blockIdx.x * blockDim.x;
    const bool active = tid < 3;

    for (int y = 0; y < targetrows; ++y) {
        for (int x = 0; x < targetcols; ++x) {
            // Three floats per pixel. An offset of y * 3 + x would make
            // neighbouring pixels overlap in memory.
            float* pixel = &d_Tar[(y * targetcols + x) * 3];

            if (active)
                d_A[tid] = pixel[tid];
            __syncthreads();    // d_A fully loaded before anyone reads it

            matvec(d_A, d_RGB2, d_C);
            __syncthreads();    // all reads of d_A done, d_C complete

            if (active) {
                const float v = d_C[tid];
                d_A[tid] = v > -5.0f ? powf(10.0f, v) : eps;
            }
            __syncthreads();    // d_A fully rewritten before the second product

            matvec(d_A, d_LMS2, d_C);
            __syncthreads();    // all reads of d_A done before the next pixel overwrites it

            if (active)
                pixel[tid] = d_C[tid];
        }
    }
}
Helper
<pre>
 
inline void vecTransfer(float* h, Color3d* v)
{
// Host wrapper: uploads two N x N transform matrices and an image, runs
// matvec_kernel over every pixel, and downloads the transformed image.
//
// Parameters:
//   mRGB2LMS  first transform matrix (applied before the 10^x step).
//   mLMS2lab  second transform matrix (applied after the 10^x step).
//   h_C       output: receives the transformed image.
//             NOTE(review): must hold tarrow * tarcol * 3 floats - confirm
//             that the caller allocates it that large.
//   tarrow    image height in pixels.
//   tarcol    image width in pixels.
//   h_Tar     input image, tarrow * tarcol * 3 floats, packed row-major
//             (assumed - this is the layout matvec_kernel indexes); it is
//             only read here.
//
// Both matrices are converted to 32-bit float before upload, so any numeric
// cv::Mat depth is accepted; each must contain exactly N * N values.
// On any failure a message is printed and h_C is left untouched.
void matvec_L(cv::Mat* mRGB2LMS, cv::Mat* mLMS2lab, float* h_C, int tarrow, int tarcol, float* h_Tar)
{
    // Nothing to do for missing buffers or an empty image.
    if (!mRGB2LMS || !mLMS2lab || !h_C || !h_Tar || tarrow <= 0 || tarcol <= 0)
        return;

    // Real byte counts. sizeof(h_C) / sizeof(h_Tar) would only be the size of
    // a pointer, not of the buffer it points to.
    const size_t vecBytes = sizeof(float) * N;
    const size_t matBytes = sizeof(float) * N * N;
    const size_t imgBytes = sizeof(float) * 3 * (size_t)tarrow * (size_t)tarcol;

    // convertTo() yields freshly allocated, continuous float storage, so the
    // matrices can be uploaded directly without intermediate host copies.
    cv::Mat rgb2f, lms2f;
    mRGB2LMS->convertTo(rgb2f, CV_32F);
    mLMS2lab->convertTo(lms2f, CV_32F);
    if (rgb2f.total() * rgb2f.channels() * sizeof(float) != matBytes ||
        lms2f.total() * lms2f.channels() * sizeof(float) != matBytes)
    {
        printf("matvec_L: transform matrices must each hold N * N values\n");
        return;
    }

    float *d_A = 0, *d_RGB2 = 0, *d_LMS2 = 0, *d_C = 0, *d_Tar = 0;

    // d_A and d_C are pure scratch: the kernel writes them before reading,
    // so no host data needs to be uploaded into them.
    cudaError_t err = cudaMalloc((void**)&d_A, vecBytes);
    if (err == cudaSuccess) err = cudaMalloc((void**)&d_C, vecBytes);
    if (err == cudaSuccess) err = cudaMalloc((void**)&d_RGB2, matBytes);
    if (err == cudaSuccess) err = cudaMalloc((void**)&d_LMS2, matBytes);
    if (err == cudaSuccess) err = cudaMalloc((void**)&d_Tar, imgBytes);

    if (err == cudaSuccess)
        err = cudaMemcpy(d_RGB2, rgb2f.ptr<float>(), matBytes, cudaMemcpyHostToDevice);
    if (err == cudaSuccess)
        err = cudaMemcpy(d_LMS2, lms2f.ptr<float>(), matBytes, cudaMemcpyHostToDevice);
    if (err == cudaSuccess)
        err = cudaMemcpy(d_Tar, h_Tar, imgBytes, cudaMemcpyHostToDevice);

    if (err == cudaSuccess)
    {
        // Ceil-division; the kernel needs threads 0..2 in one block, which
        // holds as long as BLOCK_SIZE >= 3.
        const int blocks = (N + BLOCK_SIZE - 1) / BLOCK_SIZE;
        matvec_kernel<<<blocks, BLOCK_SIZE>>>(d_A, d_RGB2, d_LMS2, d_C, N, tarrow, tarcol, d_Tar);
        err = cudaGetLastError();   // launch-configuration errors
    }

    // The kernel leaves its result in d_Tar (d_C is only per-pixel scratch).
    // This blocking copy also waits for the kernel and surfaces any
    // execution error it raised.
    if (err == cudaSuccess)
        err = cudaMemcpy(h_C, d_Tar, imgBytes, cudaMemcpyDeviceToHost);

    if (err != cudaSuccess)
        printf("matvec_L: CUDA error: %s\n", cudaGetErrorString(err));

    // cudaFree accepts null pointers, so every buffer can be released
    // unconditionally (including d_RGB2).
    cudaFree(d_A);
    cudaFree(d_RGB2);
    cudaFree(d_LMS2);
    cudaFree(d_C);
    cudaFree(d_Tar);
}
</pre>
New
<pre>
// Stage the target image in a flat host buffer, transform it on the GPU via
// matvec_L, and copy the result (returned in h_C) back into the image.
// NOTE(review): assumes `target` is a continuous cv::Mat of 3-channel 32-bit
// floats and that h_C can hold a full image - confirm both.
// NOTE(review): h_TARGET is not released here because this fragment does not
// show whether later code still uses it; free() it once it is no longer needed.
const size_t targetBytes = target.total() * target.elemSize();   // size in bytes, not element count
float* h_TARGET = (float *)malloc(targetBytes);
if (h_TARGET != NULL)
{
    // Copy the pixel data itself: target.data is already the pointer,
    // &target.data would be the address of that pointer member.
    memcpy(h_TARGET, target.data, targetBytes);
    matvec_L(&mlab2LMS, &mLMS2RGB, h_C, target.rows, target.cols, h_TARGET);
    memcpy(target.data, h_C, targetBytes);
}
</pre>
94
edits

Navigation menu