| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201 |
- unit fScalarProductD;
- interface
- uses
- System.SysUtils,
- System.Variants,
- System.Classes,
- Vcl.Graphics,
- Vcl.Controls,
- Vcl.Forms,
- Vcl.Dialogs,
- Vcl.StdCtrls,
- CUDA.Compiler,
- CUDA.Context,
- CUDA.APIComps,
- CUDA.Utility;
- type
- TFormSP = class(TForm)
- GLCUDA1: TGLCUDA;
- GLCUDADevice1: TGLCUDADevice;
- GLCUDACompiler1: TGLCUDACompiler;
- Memo1: TMemo;
- Button1: TButton;
- MainModule: TCUDAModule;
- scalarProdGPU: TCUDAFunction;
- deviceA: TCUDAMemData;
- deviceB: TCUDAMemData;
- deviceC: TCUDAMemData;
- hostC_GPU: TCUDAMemData;
- hostB: TCUDAMemData;
- hostC_CPU: TCUDAMemData;
- hostA: TCUDAMemData;
- _Z13scalarProdGPUPfS_S_ii_d_C: TCUDAFuncParam;
- _Z13scalarProdGPUPfS_S_ii_d_A: TCUDAFuncParam;
- _Z13scalarProdGPUPfS_S_ii_d_B: TCUDAFuncParam;
- _Z13scalarProdGPUPfS_S_ii_vectorN: TCUDAFuncParam;
- _Z13scalarProdGPUPfS_S_ii_elementN: TCUDAFuncParam;
- procedure Button1Click(Sender: TObject);
- procedure scalarProdGPUParameterSetup(Sender: TObject);
- procedure FormCreate(Sender: TObject);
- private
- Path: TFileName;
- public
- end;
- var
- FormSP: TFormSP;
- implementation
- {$R *.dfm}
- const
- // Total number of input vector pairs; arbitrary
- VECTOR_N = 256;
- (* Number of elements per vector; arbitrary,
- but strongly preferred to be a multiple of warp size
- to meet memory coalescing constraints *)
- ELEMENT_N = 4096;
- procedure TFormSP.FormCreate(Sender: TObject);
- begin
- Path := GetCurrentAssetPath();
- SetCurrentDir(Path + '\texture');
- pgm := TGLPGMImage.Create;
- end;
- procedure scalarProdCPU(hC, hA, hB: TCUDAMemData; vectorN, elementN: Integer);
- var
- vec, pos, vectorBase, vectorEnd: Integer;
- sum: Double;
- A, B: Single;
- begin
- for vec := 0 to vectorN - 1 do
- begin
- vectorBase := elementN * vec;
- vectorEnd := vectorBase + elementN;
- sum := 0;
- for pos := vectorBase to vectorEnd - 1 do
- begin
- A := hA.Data<Single>(pos).Scalar;
- B := hB.Data<Single>(pos).Scalar;
- sum := sum + A * B;
- end;
- hC.Data<Single>(vec).Scalar := sum;
- end;
- end;
- procedure TFormSP.Button1Click(Sender: TObject);
- var
- I: Integer;
- timer: Cardinal;
- sumDelta, sumRef, L1norm: Double;
- val1, val2, delta: Single;
- begin
- if not InitCUTIL then
- begin
- Memo1.Lines.Add('Can''t load cutil32.dll');
- exit;
- end;
- cutCreateTimer( Timer );
- Memo1.Lines.Add('Initializing data...');
- hostA.Width := VECTOR_N * ELEMENT_N;
- hostB.Width := VECTOR_N * ELEMENT_N;
- hostC_CPU.Width := VECTOR_N;
- hostC_GPU.Width := VECTOR_N;
- hostA.RawData;
- hostB.RawData;
- hostC_CPU.RawData;
- hostC_GPU.RawData;
- Memo1.Lines.Add('...allocating GPU memory.');
- deviceA.Width := VECTOR_N * ELEMENT_N;
- deviceB.Width := VECTOR_N * ELEMENT_N;
- deviceC.Width := VECTOR_N;
- deviceA.RawData;
- deviceB.RawData;
- deviceC.RawData;
- Memo1.Lines.Add('...generating input data in CPU mem.');
- // Generating input data on CPU
- for I := 0 to VECTOR_N * ELEMENT_N - 1 do
- begin
- hostA.Data<Single>(I).Scalar := Random;
- hostB.Data<Single>(I).Scalar := Random;
- end;
- Memo1.Lines.Add('...copying input data to GPU mem.');
- // Copy options data to GPU memory for further processing
- hostA.CopyTo(deviceA);
- hostB.CopyTo(deviceB);
- Memo1.Lines.Add('Data init done.');
- Memo1.Lines.Add('Executing GPU kernel...');
- cutResetTimer( Timer );
- cutStartTimer( Timer );
- scalarProdGPU.Launch;
- cutStopTimer( Timer );
- Memo1.Lines.Add('Launch finished.');
- Memo1.Lines.Add(Format('GPU time: %f (ms)', [cutGetTimerValue( Timer )]));
- Memo1.Lines.Add('Reading back GPU result...');
- // Read back GPU results to compare them to CPU results
- deviceC.CopyTo(hostC_GPU);
- Memo1.Lines.Add('Checking GPU results...');
- Memo1.Lines.Add('...running CPU scalar product calculation');
- scalarProdCPU(hostC_CPU, hostA, hostB, VECTOR_N, ELEMENT_N);
- Memo1.Lines.Add('...comparing the results');
- // Calculate max absolute difference and L1 distance
- // between CPU and GPU results
- sumDelta := 0;
- sumRef := 0;
- for I := 0 to VECTOR_N - 1 do
- begin
- val1 := hostC_GPU.Data<Single>(I).Scalar;
- val2 := hostC_CPU.Data<Single>(I).Scalar;
- delta := Abs(val1 - val2);
- sumDelta := sumDelta + delta;
- sumRef := sumRef + val2;
- end;
- L1norm := sumDelta / sumRef;
- Memo1.Lines.Add(Format('L1 error: %f', [L1norm]));
- if L1norm < 1e-6 then
- Memo1.Lines.Add('TEST PASSED')
- else
- Memo1.Lines.Add('TEST FAILED');
- cutDeleteTimer( timer );
- end;
- procedure TFormSP.scalarProdGPUParameterSetup(Sender: TObject);
- begin
- with scalarProdGPU do
- begin
- SetParam(deviceC);
- SetParam(deviceA);
- SetParam(deviceB);
- SetParam(VECTOR_N);
- SetParam(ELEMENT_N);
- end;
- end;
- end.
|