ScalarProduct_kernel.cu

/*
 * Copyright 1993-2009 NVIDIA Corporation. All rights reserved.
 *
 * NVIDIA Corporation and its licensors retain all intellectual property and
 * proprietary rights in and to this software and related documentation.
 * Any use, reproduction, disclosure, or distribution of this software
 * and related documentation without an express license agreement from
 * NVIDIA Corporation is strictly prohibited.
 *
 * Please refer to the applicable NVIDIA end user license agreement (EULA)
 * associated with this source code for terms and conditions that govern
 * your use of this NVIDIA software.
 *
 */
///////////////////////////////////////////////////////////////////////////////
// On G80-class hardware 24-bit multiplication takes 4 clocks per warp
// (the same as for floating point multiplication and addition),
// whereas full 32-bit multiplication takes 16 clocks per warp.
// So if integer multiplication operands are guaranteed to fit into 24 bits
// (i.e. always lie within the [-8M, 8M - 1] range in the signed case),
// explicit 24-bit multiplication is preferred for performance.
///////////////////////////////////////////////////////////////////////////////
#define IMUL(a, b) __mul24(a, b)
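///////////////////////////////////////////////////////////////////////////////
// Worked example (illustrative; the concrete values are assumptions, not taken
// from this file): with elementN = 4096 and vectorN = 256, the operands passed
// to IMUL below are elementN = 4096 and vec <= 255, both far inside the signed
// 24-bit range [-8388608, 8388607], so __mul24 returns the exact product,
// e.g. 4096 * 255 = 1044480.
///////////////////////////////////////////////////////////////////////////////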
///////////////////////////////////////////////////////////////////////////////
// Calculate scalar products of VectorN vectors of ElementN elements on GPU
// Parameter restrictions:
// 1) ElementN is strongly preferred to be a multiple of the warp size to
//    meet the alignment constraints of memory coalescing.
// 2) ACCUM_N must be a power of two.
// (A host-side usage sketch illustrating these parameters is appended after
// the kernel.)
///////////////////////////////////////////////////////////////////////////////
#define ACCUM_N 1024
__global__ void scalarProdGPU(
    float *d_C,
    float *d_A,
    float *d_B,
    int vectorN,
    int elementN
){
    //Accumulators cache
    __shared__ float accumResult[ACCUM_N];

    ////////////////////////////////////////////////////////////////////////////
    // Cycle through every pair of vectors,
    // taking into account that the vector count can differ
    // from the total number of thread blocks
    ////////////////////////////////////////////////////////////////////////////
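    // Illustrative example (the launch configuration is an assumption, not part
    // of this file): with gridDim.x = 128 blocks and vectorN = 256, block 0
    // processes vectors 0 and 128, block 1 processes vectors 1 and 129, and so
    // on; when vectorN <= gridDim.x each block handles at most one vector.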
    for(int vec = blockIdx.x; vec < vectorN; vec += gridDim.x){
        int vectorBase = IMUL(elementN, vec);
        int vectorEnd  = vectorBase + elementN;
        ////////////////////////////////////////////////////////////////////////
        // Each accumulator cycles through the vector elements with a stride
        // equal to the total number of accumulators, ACCUM_N.
        // At this stage ACCUM_N is only preferred to be a multiple of the warp
        // size to meet memory coalescing alignment constraints.
        ////////////////////////////////////////////////////////////////////////
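        // Illustrative example (blockDim.x = 256 is an assumed launch value):
        // with 256 threads per block each thread owns ACCUM_N / 256 = 4
        // accumulators, and accumulator i sums elements i, i + ACCUM_N,
        // i + 2*ACCUM_N, ... of the current vector.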
        for(int iAccum = threadIdx.x; iAccum < ACCUM_N; iAccum += blockDim.x){
            float sum = 0;

            for(int pos = vectorBase + iAccum; pos < vectorEnd; pos += ACCUM_N)
                sum += d_A[pos] * d_B[pos];

            accumResult[iAccum] = sum;
        }
        ////////////////////////////////////////////////////////////////////////
        // Perform tree-like reduction of accumulators' results.
        // ACCUM_N has to be a power of two at this stage.
        ////////////////////////////////////////////////////////////////////////
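        // Illustrative trace (ACCUM_N = 1024 as defined above): the first pass
        // uses stride = 512 and folds accumResult[512..1023] into
        // accumResult[0..511]; successive passes halve the stride (256, 128,
        // ..., 1) until accumResult[0] holds the full sum after
        // log2(1024) = 10 passes.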
        for(int stride = ACCUM_N / 2; stride > 0; stride >>= 1){
            __syncthreads();
            for(int iAccum = threadIdx.x; iAccum < stride; iAccum += blockDim.x)
                accumResult[iAccum] += accumResult[stride + iAccum];
        }

        if(threadIdx.x == 0) d_C[vec] = accumResult[0];
    }
}
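
///////////////////////////////////////////////////////////////////////////////
// Host-side usage sketch (not part of the original kernel source). It shows
// one way scalarProdGPU() might be allocated, launched, and checked against a
// plain CPU scalar product. The grid/block sizes, VECTOR_N, ELEMENT_N, the
// test data, and the tolerance are assumptions chosen for illustration only.
///////////////////////////////////////////////////////////////////////////////
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda_runtime.h>

int main(void){
    const int VECTOR_N  = 256;                 // assumed number of vectors
    const int ELEMENT_N = 4096;                // assumed elements per vector (multiple of warp size)
    const int DATA_N    = VECTOR_N * ELEMENT_N;

    // Host buffers filled with simple deterministic data
    float *h_A = (float *)malloc(DATA_N * sizeof(float));
    float *h_B = (float *)malloc(DATA_N * sizeof(float));
    float *h_C = (float *)malloc(VECTOR_N * sizeof(float));
    for(int i = 0; i < DATA_N; i++){
        h_A[i] = (float)(i % 7);
        h_B[i] = (float)(i % 5);
    }

    // Device buffers
    float *d_A, *d_B, *d_C;
    cudaMalloc((void **)&d_A, DATA_N * sizeof(float));
    cudaMalloc((void **)&d_B, DATA_N * sizeof(float));
    cudaMalloc((void **)&d_C, VECTOR_N * sizeof(float));
    cudaMemcpy(d_A, h_A, DATA_N * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, DATA_N * sizeof(float), cudaMemcpyHostToDevice);

    // One block per vector is not required: the kernel strides over vectors by
    // gridDim.x, so 128 blocks of 256 threads is one reasonable (assumed) choice.
    scalarProdGPU<<<128, 256>>>(d_C, d_A, d_B, VECTOR_N, ELEMENT_N);
    cudaMemcpy(h_C, d_C, VECTOR_N * sizeof(float), cudaMemcpyDeviceToHost);

    // CPU reference: straightforward scalar product per vector
    for(int vec = 0; vec < VECTOR_N; vec++){
        double ref = 0;
        for(int i = 0; i < ELEMENT_N; i++)
            ref += (double)h_A[vec * ELEMENT_N + i] * (double)h_B[vec * ELEMENT_N + i];
        if(fabs(ref - (double)h_C[vec]) > 1e-3 * fabs(ref))
            printf("Mismatch at vector %d: GPU %f vs CPU %f\n", vec, h_C[vec], ref);
    }

    cudaFree(d_C); cudaFree(d_B); cudaFree(d_A);
    free(h_C); free(h_B); free(h_A);
    return 0;
}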