#include <chrono> #include <iostream> #include <cstring> #include <string> #include <opencv2/core/core.hpp> #include <opencv2/highgui/highgui.hpp> #include <opencv2/opencv.hpp> #include <vector_types.h> #include "openMP.hpp" #include "CUDA_wrappers.hpp" #include "common/image_helpers.hpp" using namespace cv; using namespace std; int main( int argc, char** argv ) { using namespace std::chrono; if( argc != 2) { cout <<" Usage: convert_to_grayscale imagefile" << endl; return -1; } Mat image, imageGray; uchar4 *imageArray; unsigned char *imageGrayArray; prepareImagePointers(argv[1], image, &imageArray, imageGray, &imageGrayArray, CV_8UC1); int numRows = image.rows, numCols = image.cols; auto start = system_clock::now(); RGBtoGrayscaleOpenMP(imageArray, imageGrayArray, numRows, numCols); auto duration = duration_cast<milliseconds>(system_clock::now() - start); cout<<"OpenMP time (ms):" << duration.count() << endl; memset(imageGrayArray, 0, sizeof(unsigned char)*numRows*numCols); RGBtoGrayscaleCUDA(imageArray, imageGrayArray, numRows, numCols); return 0; }
template <class T1, class T2> void prepareImagePointers(const char * const inputImageFileName, cv::Mat& inputImage, T1** inputImageArray, cv::Mat& outputImage, T2** outputImageArray, const int outputImageType) { using namespace std; using namespace cv; inputImage = imread(inputImageFileName, IMREAD_COLOR); if (inputImage.empty()) { cerr << "Couldn't open input file." << endl; exit(1); } //allocate memory for the output outputImage.create(inputImage.rows, inputImage.cols, outputImageType); cvtColor(inputImage, inputImage, cv::COLOR_BGR2BGRA); *inputImageArray = (T1*)inputImage.ptr<char>(0); *outputImageArray = (T2*)outputImage.ptr<char>(0); }
#include <stdio.h> #include <omp.h> #include <vector_types.h> void RGBtoGrayscaleOpenMP(uchar4 *imageArray, unsigned char *imageGrayArray, int numRows, int numCols) { #pragma omp parallel for collapse(2) for (int i = 0; i < numRows; ++i) { for (int j = 0; j < numCols; ++j) { const uchar4 pixel = imageArray[i*numCols+j]; imageGrayArray[i*numCols+j] = 0.299f*pixel.x + 0.587f*pixel.y+0.114f*pixel.z; } } }
void RGBtoGrayscaleCUDA(const uchar4 * const h_imageRGBA, unsigned char* const h_imageGray, size_t numRows, size_t numCols) { uchar4 *d_imageRGBA; unsigned char *d_imageGray; const size_t numPixels = numRows * numCols; cudaSetDevice(0); checkCudaErrors(cudaGetLastError()); //allocate memory on the device for both input and output checkCudaErrors(cudaMalloc(&d_imageRGBA, sizeof(uchar4) * numPixels)); checkCudaErrors(cudaMalloc(&d_imageGray, sizeof(unsigned char) * numPixels)); //copy input array to the GPU checkCudaErrors(cudaMemcpy(d_imageRGBA, h_imageRGBA, sizeof(uchar4) * numPixels, cudaMemcpyHostToDevice));
#include <cuda.h> #define checkCudaErrors(val) check( (val), #val, __FILE__, __LINE__) template<typename T> void check(T err, const char* const func, const char* const file, const int line) { if (err != cudaSuccess) { std::cerr << "CUDA error at: " << file << ":" << line << std::endl; std::cerr << cudaGetErrorString(err) << " " << func << std::endl; exit(1); } }
dim3 blockSize; dim3 gridSize; int threadNum; cudaEvent_t start, stop; cudaEventCreate(&start); cudaEventCreate(&stop); threadNum = 1024; blockSize = dim3(threadNum, 1, 1); gridSize = dim3(numCols/threadNum+1, numRows, 1); cudaEventRecord(start); rgba_to_grayscale_simple<<<gridSize, blockSize>>>(d_imageRGBA, d_imageGray, numRows, numCols); cudaEventRecord(stop); cudaEventSynchronize(stop); cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError()); float milliseconds = 0; cudaEventElapsedTime(&milliseconds, start, stop); std::cout << "CUDA time simple (ms): " << milliseconds << std::endl;
__global__ void rgba_to_grayscale_simple(const uchar4* const d_imageRGBA, unsigned char* const d_imageGray, int numRows, int numCols) { int y = blockDim.y*blockIdx.y + threadIdx.y; int x = blockDim.x*blockIdx.x + threadIdx.x; if (x>=numCols || y>=numRows) return; const int offset = y*numCols+x; const uchar4 pixel = d_imageRGBA[offset]; d_imageGray[offset] = 0.299f*pixel.x + 0.587f*pixel.y+0.114f*pixel.z; }
checkCudaErrors(cudaMemcpy(h_imageGray, d_imageGray, sizeof(unsigned char) * numPixels, cudaMemcpyDeviceToHost)); cudaFree(d_imageGray); cudaFree(d_imageRGBA);
OpenMP time (ms):45 CUDA time simple (ms): 43.1941
#define WARP_SIZE 32 __global__ void rgba_to_grayscale_optimized(const uchar4* const d_imageRGBA, unsigned char* const d_imageGray, int numRows, int numCols, int elemsPerThread) { int y = blockDim.y*blockIdx.y + threadIdx.y; int x = blockDim.x*blockIdx.x + threadIdx.x; const int loop_start = (x/WARP_SIZE * WARP_SIZE)*(elemsPerThread-1)+x; for (int i=loop_start, j=0; j<elemsPerThread && i<numCols; i+=WARP_SIZE, ++j) { const int offset = y*numCols+i; const uchar4 pixel = d_imageRGBA[offset]; d_imageGray[offset] = 0.299f*pixel.x + 0.587f*pixel.y+0.114f*pixel.z; } }
threadNum=128; const int elemsPerThread = 16; blockSize = dim3(threadNum, 1, 1); gridSize = dim3(numCols / (threadNum*elemsPerThread) + 1, numRows, 1); cudaEventRecord(start); rgba_to_grayscale_optimized<<<gridSize, blockSize>>>(d_imageRGBA, d_imageGray, numRows, numCols, elemsPerThread); cudaEventRecord(stop); cudaEventSynchronize(stop); cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError()); milliseconds = 0; cudaEventElapsedTime(&milliseconds, start, stop); std::cout << "CUDA time optimized (ms): " << milliseconds << std::endl;
OpenMP time (ms):44 CUDA time simple (ms): 53.1625 CUDA time optimized (ms): 15.9273
Source: https://habr.com/ru/post/245503/
All Articles