fScalarProductD.pas 4.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201
  1. unit fScalarProductD;
  2. interface
  3. uses
  4. System.SysUtils,
  5. System.Variants,
  6. System.Classes,
  7. Vcl.Graphics,
  8. Vcl.Controls,
  9. Vcl.Forms,
  10. Vcl.Dialogs,
  11. Vcl.StdCtrls,
  12. CUDA.Compiler,
  13. CUDA.Context,
  14. CUDA.APIComps,
  15. CUDA.Utility;
  type
    { Main form of the scalar-product demo.
      The component fields and event handlers declared before the "private"
      keyword are presumably bound to the form resource pulled in by the
      unit's $R *.dfm directive, so their names must stay as they are. }
    TFormSP = class(TForm)
      // User interface: log output and the button that starts a run.
      Memo1: TMemo;
      Button1: TButton;

      // CUDA infrastructure components.
      GLCUDA1: TGLCUDA;
      GLCUDADevice1: TGLCUDADevice;
      GLCUDACompiler1: TGLCUDACompiler;

      // Module and kernel function launched from Button1Click.
      MainModule: TCUDAModule;
      scalarProdGPU: TCUDAFunction;

      // Input vectors (A, B) and results (C) used by Button1Click.
      // The host* buffers are filled/read by the CPU code and copied
      // to/from the matching device* buffers.
      hostA: TCUDAMemData;
      hostB: TCUDAMemData;
      hostC_CPU: TCUDAMemData; // reference result written by scalarProdCPU
      hostC_GPU: TCUDAMemData; // receives the copy of deviceC
      deviceA: TCUDAMemData;
      deviceB: TCUDAMemData;
      deviceC: TCUDAMemData;

      // Parameter components of the scalarProdGPU kernel function.
      _Z13scalarProdGPUPfS_S_ii_d_C: TCUDAFuncParam;
      _Z13scalarProdGPUPfS_S_ii_d_A: TCUDAFuncParam;
      _Z13scalarProdGPUPfS_S_ii_d_B: TCUDAFuncParam;
      _Z13scalarProdGPUPfS_S_ii_vectorN: TCUDAFuncParam;
      _Z13scalarProdGPUPfS_S_ii_elementN: TCUDAFuncParam;

      // Event handlers.
      procedure FormCreate(Sender: TObject);
      procedure Button1Click(Sender: TObject);
      procedure scalarProdGPUParameterSetup(Sender: TObject);
    private
      // Asset path obtained in FormCreate.
      Path: TFileName;
    public
    end;

  var
    // Global form instance.
    FormSP: TFormSP;
  46. implementation
  47. {$R *.dfm}
  const
    { Number of input vector pairs handled in one run; the value is arbitrary. }
    VECTOR_N = 256;

    { Number of elements in each vector. The value is arbitrary, but a
      multiple of the warp size is strongly preferred so that the memory
      coalescing constraints are met. }
    ELEMENT_N = 4096;
  { Form creation handler.
    Stores the asset path in the Path field and makes its "texture"
    sub-folder the current directory.

    The former statement "pgm := TGLPGMImage.Create" was removed: this unit
    declares no "pgm" variable and uses no unit that provides TGLPGMImage,
    so the line could not compile, and the object it created was never used
    or freed anywhere in the unit. }
  procedure TFormSP.FormCreate(Sender: TObject);
  begin
    Path := GetCurrentAssetPath();
    // NOTE(review): nothing else in this unit reads files from the texture
    // folder; the directory change is kept in case the CUDA components
    // resolve their files relative to the current directory - confirm.
    SetCurrentDir(Path + '\texture');
  end;
  { CPU reference implementation of the batched scalar product.

    hA and hB each hold vectorN vectors of elementN single-precision values
    stored back to back. For every vector index v the dot product of the
    v-th vectors of hA and hB is accumulated in double precision and stored
    as a Single in element v of hC. }
  procedure scalarProdCPU(hC, hA, hB: TCUDAMemData; vectorN, elementN: Integer);
  var
    vecIdx, offset, firstElem, elemIdx: Integer;
    acc: Double;
    left, right: Single;
  begin
    for vecIdx := 0 to vectorN - 1 do
    begin
      // Index of the first element belonging to this vector.
      firstElem := elementN * vecIdx;
      acc := 0;
      for offset := 0 to elementN - 1 do
      begin
        elemIdx := firstElem + offset;
        left := hA.Data<Single>(elemIdx).Scalar;
        right := hB.Data<Single>(elemIdx).Scalar;
        acc := acc + left * right;
      end;
      hC.Data<Single>(vecIdx).Scalar := acc;
    end;
  end;
  { Runs the complete scalar-product test and logs every step to Memo1:
      1. sizes the host and device buffers,
      2. fills the host inputs with random values and copies them to the
         device buffers,
      3. launches the scalarProdGPU kernel, timing the Launch call with a
         CUTIL timer,
      4. copies the GPU result back, computes the CPU reference with
         scalarProdCPU and reports the relative L1 error.
    "TEST PASSED" is logged when the relative L1 error is below 1e-6.

    Changes against the previous revision:
      - the CUTIL timer is released in a finally section, so it no longer
        leaks when any step between creation and deletion raises;
      - the L1 error is printed in scientific notation; '%f' shows only two
        decimals, so every error below 0.005 was displayed as 0.00;
      - the division by the reference sum is guarded against zero. }
  procedure TFormSP.Button1Click(Sender: TObject);
  const
    // Total number of elements in each input array.
    DATA_N = VECTOR_N * ELEMENT_N;
  var
    I: Integer;
    timer: Cardinal;
    sumDelta, sumRef, L1norm: Double;
    val1, val2, delta: Single;
  begin
    if not InitCUTIL then
    begin
      Memo1.Lines.Add('Can''t load cutil32.dll');
      Exit;
    end;

    cutCreateTimer(timer);
    try
      Memo1.Lines.Add('Initializing data...');
      hostA.Width := DATA_N;
      hostB.Width := DATA_N;
      hostC_CPU.Width := VECTOR_N;
      hostC_GPU.Width := VECTOR_N;
      // The RawData reads discard their result.
      // NOTE(review): presumably they force the buffers to be allocated
      // before first use - confirm against TCUDAMemData.
      hostA.RawData;
      hostB.RawData;
      hostC_CPU.RawData;
      hostC_GPU.RawData;

      Memo1.Lines.Add('...allocating GPU memory.');
      deviceA.Width := DATA_N;
      deviceB.Width := DATA_N;
      deviceC.Width := VECTOR_N;
      deviceA.RawData;
      deviceB.RawData;
      deviceC.RawData;

      Memo1.Lines.Add('...generating input data in CPU mem.');
      // Generate the input data on the CPU.
      for I := 0 to DATA_N - 1 do
      begin
        hostA.Data<Single>(I).Scalar := Random;
        hostB.Data<Single>(I).Scalar := Random;
      end;

      Memo1.Lines.Add('...copying input data to GPU mem.');
      // Copy the input vectors to GPU memory for further processing.
      hostA.CopyTo(deviceA);
      hostB.CopyTo(deviceB);
      Memo1.Lines.Add('Data init done.');

      Memo1.Lines.Add('Executing GPU kernel...');
      cutResetTimer(timer);
      cutStartTimer(timer);
      scalarProdGPU.Launch;
      cutStopTimer(timer);
      Memo1.Lines.Add('Launch finished.');
      Memo1.Lines.Add(Format('GPU time: %f (ms)', [cutGetTimerValue(timer)]));

      Memo1.Lines.Add('Reading back GPU result...');
      // Read back the GPU results to compare them with the CPU results.
      deviceC.CopyTo(hostC_GPU);

      Memo1.Lines.Add('Checking GPU results...');
      Memo1.Lines.Add('...running CPU scalar product calculation');
      scalarProdCPU(hostC_CPU, hostA, hostB, VECTOR_N, ELEMENT_N);

      Memo1.Lines.Add('...comparing the results');
      // Accumulate the absolute differences between the GPU and CPU results
      // and the sum of the CPU reference values.
      sumDelta := 0;
      sumRef := 0;
      for I := 0 to VECTOR_N - 1 do
      begin
        val1 := hostC_GPU.Data<Single>(I).Scalar;
        val2 := hostC_CPU.Data<Single>(I).Scalar;
        delta := Abs(val1 - val2);
        sumDelta := sumDelta + delta;
        sumRef := sumRef + val2;
      end;

      // Relative L1 error. With a zero reference sum the absolute error is
      // reported instead of raising a floating-point division error.
      if sumRef <> 0 then
        L1norm := sumDelta / sumRef
      else
        L1norm := sumDelta;

      Memo1.Lines.Add(Format('L1 error: %.7e', [L1norm]));
      if L1norm < 1e-6 then
        Memo1.Lines.Add('TEST PASSED')
      else
        Memo1.Lines.Add('TEST FAILED');
    finally
      // Always release the timer, also when a step above raised.
      cutDeleteTimer(timer);
    end;
  end;
  { Parameter-setup handler of the scalarProdGPU kernel function.
    Passes the kernel arguments in this order: result buffer, the two input
    buffers, the number of vectors and the number of elements per vector.
    NOTE(review): the order presumably has to match the kernel's parameter
    list - confirm against the kernel source. }
  procedure TFormSP.scalarProdGPUParameterSetup(Sender: TObject);
  begin
    scalarProdGPU.SetParam(deviceC);
    scalarProdGPU.SetParam(deviceA);
    scalarProdGPU.SetParam(deviceB);
    scalarProdGPU.SetParam(VECTOR_N);
    scalarProdGPU.SetParam(ELEMENT_N);
  end;
  167. end.