123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268 |
- {
- The sample is give at
- http://developer.apple.com/mac/library/samplecode/OpenCL_Hello_World_Example/index.html
- ===========================================================================
- DESCRIPTION:
- A simple "Hello World" compute example showing basic usage of OpenCL which
- calculates the mathematical square (X[i] = pow(X[i],2)) for a buffer of
- floating point values.
- For simplicity, this example is intended to be run from the command line.
- If run from within XCode, open the Run Log (Command-Shift-R) to see the
- output. Alternatively, run the applications from within a Terminal.app
- session to launch from the command line.
- ===========================================================================
- BUILD REQUIREMENTS:
- Mac OS X v10.6 or later
- or
- Windows with NVidia OpenCL SDK Installed
- ===========================================================================
- RUNTIME REQUIREMENTS:
- Mac OS X v10.6 or later
- To use the GPU as a compute device, use one of the following devices:
- - MacBook Pro w/NVidia GeForce 8600M
- - Mac Pro w/NVidia GeForce 8800GT
-
- If you don't have powerful GPU you can try to use CPU instead:
- by changing
- gpu := CL_DEVICE_TYPE_GPU;
- to
- gpu := CL_DEVICE_TYPE_CPU;
- Windows with NVidia OpenCL SDK Installed and libOpenCLXX.dll available
- CL_DEVICE_TYPE_CPU doesn't seem to work for windows
- ===========================================================================
- }
- program testcl;
- {$mode objfpc}{$H+}
- uses
- ctypes, cl;
- // Use a static data size for simplicity
- const
- DATA_SIZE = 1024;
- // Simple compute kernel which computes the square of an input array
- const
- KernelSource : PChar =
- '__kernel void square( '#10+
- ' __global float* input, '#10+
- ' __global float* output, '#10+
- ' const unsigned int count) '#10+
- '{ '#10+
- ' int i = get_global_id(0); '#10+
- ' if(i < count) '#10+
- ' output[i] = input[i] * input[i]; '#10+
- '} '#0;
- var
- err : Integer; // error code returned from api calls
- data : array [0..DATA_SIZE-1] of single; // original data set given to device
- results : array [0..DATA_SIZE-1] of single; // results returned from device
- correct : LongWord; // number of correct results returned
- global : csize_t; // global domain size for our calculation
- local : csize_t; // local domain size for our calculation
- device_id : cl_device_id; // compute device id
- context : cl_context; // compute context
- commands : cl_command_queue; // compute command queue
- prog : cl_program; // compute program
- kernel : cl_kernel; // compute kernel
- input : cl_mem; // device memory used for the input array
- output : cl_mem; // device memory used for the output array
- i : Integer;
- count : Integer;
- gpu : cl_device_type;
- tmpd : single;
- platformids : Pcl_platform_id;
- num_platforms : cl_uint;
-
- begin
- // Fill our data set with random float values
- count := DATA_SIZE;
- for i:=0 to count - 1 do
- data[i]:= random;
- err:=clGetPlatformIDs(0,nil,@num_platforms);
- Writeln('clGetPlatformIDs ', num_platforms);
- if (err <> CL_SUCCESS) then
- begin
- writeln('Error: Cannot get number of platforms!');
- Halt(1);
- end;
- getmem(platformids,num_platforms*sizeof(cl_platform_id));
- err := clGetPlatformIDs(num_platforms, platformids, nil);
- if (err <> CL_SUCCESS) then begin
- Writeln('Error: Failed to platforms!');
- Halt($FF);
- end;
- // Connect to a compute device
- // change CL_DEVICE_TYPE_CPU to CL_DEVICE_TYPE_GPU is you have powerful video (GeForce 8800/8600M or higher)
- gpu := CL_DEVICE_TYPE_GPU;
- device_id:=nil;
- err := clGetDeviceIDs(platformids[0], gpu, 1, @device_id, nil);
- writeln('clGetDeviceIDs ', err);
- if (err <> CL_SUCCESS) then begin
- Writeln('Error: Failed to create a device group!');
- Halt($FF);
- end;
- // Create a compute context
- context := clCreateContext(nil, 1, @device_id, nil, nil, err);
- writeln('clCreateContext ', err);
- if context=nil then begin
- Writeln('Error: Failed to create a compute context!');
- Halt($FF);
- end;
- // Create a command commands
- commands := clCreateCommandQueue(context, device_id, 0, err);
- writeln('clCreateCommandQueue ', err);
- if commands=nil then begin
- Writeln('Error: Failed to create a command commands!');
- Halt($FF);
- end;
- // Create the compute program from the source buffer
- prog:= clCreateProgramWithSource(context, 1, PPChar(@KernelSource), nil, err);
- writeln('clCreateProgramWithSource ', err);
- if prog=nil then begin
- writeln('Error: Failed to create compute program! ');
- Halt($FF);
- end;
- // Build the program executable
- err := clBuildProgram(prog, 0, nil, nil, nil, nil);
- writeln('clBuildProgram ', err);
- if (err <> CL_SUCCESS) then begin
- writeln('Error: Failed to build program executable!');
- Halt(1);
- end;
- // Create the compute kernel in the program we wish to run
- kernel := clCreateKernel(prog, 'square', err);
- writeln('clCreateKernel ', err);
- if (kernel=nil) or (err <> CL_SUCCESS) then begin
- writeln('Error: Failed to create compute kernel!');
- Halt(1);
- end;
-
- err := clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), @local, nil);
- writeln('clGetKernelWorkGroupInfo ', err);
- if (err<>CL_SUCCESS) then begin
- writeln('Error: Failed to retrieve kernel work group info!');
- Halt(1);
- end;
-
-
- // Create the input and output arrays in device memory for our calculation
- input := clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(single) * count, nil, err);
- writeln('clCreateBuffer ', err);
- output := clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(single) * count, nil, err);
- writeln('clCreateBuffer ', err);
- if (input=nil) or (output=nil) then begin
- writeln('Error: Failed to allocate device memory!');
- Halt(1);
- end;
- // Write our data set into the input array in device memory
- err := clEnqueueWriteBuffer(commands, input, CL_TRUE, 0, sizeof(single) * count, @data, 0, nil, nil);
- writeln('clEnqueueWriteBuffer ', err);
- if (err <> CL_SUCCESS) then begin
- writeln('Error: Failed to write to source array!');
- Halt(1);
- end;
- // Set the arguments to our compute kernel
- err := 0;
- err := clSetKernelArg(kernel, 0, sizeof(cl_mem), @input);
- writeln('clSetKernelArg ', err);
- err := err or clSetKernelArg(kernel, 1, sizeof(cl_mem), @output);
- writeln('clSetKernelArg ', err);
- err := err or clSetKernelArg(kernel, 2, sizeof(longword), @count);
- writeln('clSetKernelArg ', err);
- if (err <> CL_SUCCESS) then begin
- writeln('Error: Failed to set kernel arguments! ');
- Halt(1);
- end;
- // Get the maximum work group size for executing the kernel on the device
- err := clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), @local, nil);
- writeln('clGetKernelWorkGroupInfo ', err);
- if (err<>CL_SUCCESS) then begin
- writeln('Error: Failed to retrieve kernel work group info!');
- Halt(1);
- end;
-
- // Execute the kernel over the entire range of our 1d input data set
- // using the maximum number of work group items for this device
- global := count;
- err := clEnqueueNDRangeKernel(commands, kernel, 1, nil, @global, @local, 0, nil, nil);
- writeln('clEnqueueNDRangeKernel ',err);
- if (err<>0) then begin
- writeln('Error: Failed to execute kernel!');
- Halt($FF);
- end;
- // Wait for the command commands to get serviced before reading back results
- err:=clFinish(commands);
- writeln('clFinish ',err);
- // Read back the results from the device to verify the output
- err := clEnqueueReadBuffer( commands, output, CL_TRUE, 0, sizeof(single) * count, @results, 0, nil, nil);
- writeln('clEnqueueReadBuffer ',err);
- if (err <> CL_SUCCESS) then begin
- writeln('Error: Failed to read output array! ', err);
- Halt(1);
- end;
- // Validate our results
- correct := 0;
- for i:= 0 to count - 1 do begin
- // FPU warning:
- //
- // the following check (as in original C sample)
- // if results[i] = data[i] * data[i] then
- //
- // return the incorrect result (FP accuracy?),
- // must store the result to single type variable first,
- // and then compare:
- tmpd:=data[i] * data[i];
- if results[i] = tmpd then inc(correct);
- end;
- // Print a brief summary detailing the results
- writeln('Computed ', correct, '/', count,' correct values!');
- // Shutdown and cleanup
- clReleaseMemObject(input);
- clReleaseMemObject(output);
- clReleaseProgram(prog);
- clReleaseKernel(kernel);
- clReleaseCommandQueue(commands);
- clReleaseContext(context);
- end.
|