PostProcessing_kernel.cu 3.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145
  1. // includes, system
  2. #include <stdlib.h>
  3. #include <stdio.h>
  4. #include <string.h>
  5. #include <math.h>
  // Restrict x to the interval [a, b].
  // The upper bound b is applied first and the lower bound a last, so if the
  // bounds are inverted (a > b) the result is a.
  __device__ int clamp(int x, int a, int b)
  {
      const int capped = min(b, x);
      return max(a, capped);
  }
  // Pack a floating-point RGB triple into a single 32-bit int.
  // Each channel is limited to 0..255 by clamp() (the float arguments are
  // converted to int when passed to it), then stored with r in bits 0-7,
  // g in bits 8-15 and b in bits 16-23. The top byte is left at zero.
  __device__ int rgbToInt(float r, float g, float b)
  {
      const int red   = clamp(r, 0.0f, 255.0f);
      const int green = clamp(g, 0.0f, 255.0f);
      const int blue  = clamp(b, 0.0f, 255.0f);
      return red | (green << 8) | (blue << 16);
  }
  // Read one pixel from a row-major image of width x height ints.
  // Coordinates that fall outside the image are snapped to the nearest
  // edge, so border pixels are replicated outward.
  __device__ int getPixel(int *data, int x, int y, int width, int height)
  {
      const int col = clamp(x, 0, width - 1);
      const int row = clamp(y, 0, height - 1);
      return data[row * width + col];
  }
  // Index the block's shared-memory tile at column X, row Y.
  // Expands in place, so `sdata` and `tilew` must be visible where it is used.
  #define SMEM(X, Y) sdata[(Y)*tilew+(X)]

  /*
      2D convolution using shared memory
      - operates on 8-bit RGB data stored in 32-bit int
      - averages every pixel inside a disc of radius r around the output pixel
      - not optimized for performance

      Shared-memory tile (thread block plus a halo of r pixels on each side):
       _____________
      |   :     :   |
      |_ _:_____:_ _|
      |   |     |   |
      |   |     |   |
      |_ _|_____|_ _|
    r |   :     :   |
      |___:_____:___|
        r    bw   r
      <----tilew---->

      Parameters:
        g_data      input image, imgw x imgh packed pixels, row-major
        g_odata     output image, indexed the same way
        imgw, imgh  image size in pixels
        tilew       row stride of the shared tile in ints; must be at least
                    blockDim.x + 2*r or tile rows overlap
        r           disc radius in pixels (r >= 0)
        threshold,
        highlight   kept for interface compatibility; unused while the
                    highlight-brightening step below is disabled

      Launch requirements (provided by the caller, not checked here):
        - 2D thread blocks; the grid may overhang the image, excess threads
          only help fill the tile and write nothing
        - dynamic shared memory of at least
          tilew * (blockDim.y + 2*r) * sizeof(int) bytes
  */
  extern __shared__ int sdata[];

  extern "C"
  __global__ void
  cudaProcess_k(
      int* g_data,
      int* g_odata,
      int imgw,
      int imgh,
      int tilew,
      int r,
      float threshold,
      float highlight)
  {
      // Unused while highlight brightening is disabled; silence warnings.
      (void)threshold;
      (void)highlight;

      const int tx = threadIdx.x;
      const int ty = threadIdx.y;
      const int bw = blockDim.x;
      const int bh = blockDim.y;
      const int blockX = blockIdx.x*bw;   // image x of the block's first column
      const int blockY = blockIdx.y*bh;   // image y of the block's first row
      const int x = blockX + tx;
      const int y = blockY + ty;

      // Copy the tile (block + halo) to shared memory.
      // Every thread walks the tile in block-sized strides, so the centre,
      // the four borders and the four corners are all filled by one loop and
      // the halo is covered even when r is larger than the block.
      // Tile cell (sx, sy) holds image pixel (blockX + sx - r, blockY + sy - r);
      // getPixel clamps coordinates that fall outside the image.
      const int tileCols = bw + 2*r;
      const int tileRows = bh + 2*r;
      for (int sy = ty; sy < tileRows; sy += bh) {
          for (int sx = tx; sx < tileCols; sx += bw) {
              SMEM(sx, sy) = getPixel(g_data, blockX + sx - r, blockY + sy - r, imgw, imgh);
          }
      }

      // Wait for loads to complete. All threads of the block reach this
      // barrier; the early exit below comes after it on purpose.
      __syncthreads();

      // Threads beyond the image edge have no output pixel to produce.
      if (x >= imgw || y >= imgh) {
          return;
      }

      // perform convolution
      const int radiusSq = r*r;
      float rsum = 0.0f;
      float gsum = 0.0f;
      float bsum = 0.0f;
      float samples = 0.0f;

      for (int dy = -r; dy <= r; dy++) {
          for (int dx = -r; dx <= r; dx++) {
              // only sum pixels within disc-shaped kernel
              if (dx*dx + dy*dy > radiusSq) {
                  continue;
              }

              const int pixel = SMEM(r + tx + dx, r + ty + dy);
              const float red   = float(pixel & 0xff);
              const float green = float((pixel >> 8) & 0xff);
              const float blue  = float((pixel >> 16) & 0xff);

              // Highlight brightening (disabled):
              // float lum = (red + green + blue) / (255*3);
              // if (lum > threshold)
              // {
              //     red   *= highlight;
              //     green *= highlight;
              //     blue  *= highlight;
              // }

              rsum += red;
              gsum += green;
              bsum += blue;
              samples += 1.0f;
          }
      }

      // The centre offset (0, 0) is always inside the disc for r >= 0,
      // so samples is at least 1 here.
      rsum /= samples;
      gsum /= samples;
      bsum /= samples;

      g_odata[y*imgw + x] = rgbToInt(rsum, gsum, bsum);
  }