/*
 * Copyright 1993-2009 NVIDIA Corporation. All rights reserved.
 *
 * NVIDIA Corporation and its licensors retain all intellectual property and
 * proprietary rights in and to this software and related documentation.
 * Any use, reproduction, disclosure, or distribution of this software
 * and related documentation without an express license agreement from
 * NVIDIA Corporation is strictly prohibited.
 *
 * Please refer to the applicable NVIDIA end user license agreement (EULA)
 * associated with this source code for terms and conditions that govern
 * your use of this NVIDIA software.
 *
 */
///////////////////////////////////////////////////////////////////////////////
// On G80-class hardware 24-bit multiplication takes 4 clocks per warp
// (the same as for floating-point multiplication and addition),
// whereas full 32-bit multiplication takes 16 clocks per warp.
// So if the integer multiplication operands are guaranteed to fit into 24 bits
// (always lie within the [-8M, 8M - 1] range in the signed case),
// explicit 24-bit multiplication is preferred for performance.
///////////////////////////////////////////////////////////////////////////////
#define IMUL(a, b) __mul24(a, b)
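///////////////////////////////////////////////////////////////////////////////
// Illustrative note (added; not part of the original listing): the kernel
// below uses IMUL for the index product elementN * vec. Both operands stay
// well under 2^23 for realistic problem sizes, so on G80-class hardware the
// macro maps to a single 24-bit multiply instead of an emulated 32-bit one.
///////////////////////////////////////////////////////////////////////////////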
///////////////////////////////////////////////////////////////////////////////
// Calculate scalar products of VectorN vectors of ElementN elements on GPU
// Parameter restrictions:
// 1) ElementN is strongly preferred to be a multiple of the warp size to
//    meet the alignment constraints of memory coalescing.
// 2) ACCUM_N must be a power of two.
///////////////////////////////////////////////////////////////////////////////
#define ACCUM_N 1024
__global__ void scalarProdGPU(
    float *d_C,
    float *d_A,
    float *d_B,
    int vectorN,
    int elementN
){
    // Accumulator cache
    __shared__ float accumResult[ACCUM_N];

    ////////////////////////////////////////////////////////////////////////////
    // Cycle through every pair of vectors,
    // taking into account that the vector count can differ
    // from the total number of thread blocks
    ////////////////////////////////////////////////////////////////////////////
    for(int vec = blockIdx.x; vec < vectorN; vec += gridDim.x){
        int vectorBase = IMUL(elementN, vec);
        int vectorEnd  = vectorBase + elementN;

        ////////////////////////////////////////////////////////////////////////
        // Each accumulator cycles through its vector with a stride equal to
        // the total number of accumulators, ACCUM_N.
        // At this stage ACCUM_N is only preferred to be a multiple of the
        // warp size to meet the alignment constraints of memory coalescing.
        ////////////////////////////////////////////////////////////////////////
        for(int iAccum = threadIdx.x; iAccum < ACCUM_N; iAccum += blockDim.x){
            float sum = 0;

            for(int pos = vectorBase + iAccum; pos < vectorEnd; pos += ACCUM_N)
                sum += d_A[pos] * d_B[pos];

            accumResult[iAccum] = sum;
        }
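        ////////////////////////////////////////////////////////////////////////
        // Worked example (illustrative numbers, added for clarity): if
        // ACCUM_N were 4 and elementN were 8, accumulator 0 would hold
        // a[0]*b[0] + a[4]*b[4], accumulator 1 would hold a[1]*b[1] + a[5]*b[5],
        // and so on, giving ACCUM_N interleaved partial sums of the dot product.
        ////////////////////////////////////////////////////////////////////////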
        ////////////////////////////////////////////////////////////////////////
        // Perform tree-like reduction of the accumulators' results.
        // ACCUM_N has to be a power of two at this stage.
        ////////////////////////////////////////////////////////////////////////
        for(int stride = ACCUM_N / 2; stride > 0; stride >>= 1){
            __syncthreads();

            for(int iAccum = threadIdx.x; iAccum < stride; iAccum += blockDim.x)
                accumResult[iAccum] += accumResult[stride + iAccum];
        }
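        ////////////////////////////////////////////////////////////////////////
        // Trace (illustrative, added for clarity): with ACCUM_N = 8 the
        // passes over s = accumResult are
        //   stride 4: s[i] += s[i + 4] for i in 0..3
        //   stride 2: s[i] += s[i + 2] for i in 0..1
        //   stride 1: s[0] += s[1]
        // so after log2(ACCUM_N) synchronized passes, accumResult[0] holds
        // the complete scalar product for this vector pair.
        ////////////////////////////////////////////////////////////////////////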
        if(threadIdx.x == 0) d_C[vec] = accumResult[0];
    }
}
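
///////////////////////////////////////////////////////////////////////////////
// Host-side usage sketch (added for illustration; not part of the original
// listing). VECTOR_N, ELEMENT_N, and the <<<128, 256>>> launch configuration
// are assumptions chosen for this example, not values required by the kernel:
// blocks stride over vector pairs and threads stride over accumulators, so
// the block size need not equal ACCUM_N.
///////////////////////////////////////////////////////////////////////////////
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

int main(void){
    const int VECTOR_N  = 256;   // assumed number of vector pairs
    const int ELEMENT_N = 4096;  // assumed vector length (multiple of warp size)
    const size_t DATA_SZ   = (size_t)VECTOR_N * ELEMENT_N * sizeof(float);
    const size_t RESULT_SZ = (size_t)VECTOR_N * sizeof(float);

    // Allocate and fill host-side inputs with random values
    float *h_A = (float *)malloc(DATA_SZ);
    float *h_B = (float *)malloc(DATA_SZ);
    float *h_C = (float *)malloc(RESULT_SZ);

    for(int i = 0; i < VECTOR_N * ELEMENT_N; i++){
        h_A[i] = (float)rand() / (float)RAND_MAX;
        h_B[i] = (float)rand() / (float)RAND_MAX;
    }

    // Allocate device buffers and copy the inputs over
    float *d_A, *d_B, *d_C;
    cudaMalloc((void **)&d_A, DATA_SZ);
    cudaMalloc((void **)&d_B, DATA_SZ);
    cudaMalloc((void **)&d_C, RESULT_SZ);
    cudaMemcpy(d_A, h_A, DATA_SZ, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, DATA_SZ, cudaMemcpyHostToDevice);

    // One scalar product per vector pair lands in d_C[0..VECTOR_N-1]
    scalarProdGPU<<<128, 256>>>(d_C, d_A, d_B, VECTOR_N, ELEMENT_N);
    cudaMemcpy(h_C, d_C, RESULT_SZ, cudaMemcpyDeviceToHost);

    printf("scalar product of pair 0: %f\n", h_C[0]);

    cudaFree(d_C);
    cudaFree(d_B);
    cudaFree(d_A);
    free(h_C);
    free(h_B);
    free(h_A);
    return 0;
}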