mirror of
https://gitlab.com/freepascal.org/fpc/source.git
synced 2025-04-16 19:01:24 +02:00
269 lines
8.5 KiB
ObjectPascal
269 lines
8.5 KiB
ObjectPascal
{
|
|
The sample is give at
|
|
http://developer.apple.com/mac/library/samplecode/OpenCL_Hello_World_Example/index.html
|
|
|
|
===========================================================================
|
|
DESCRIPTION:
|
|
|
|
A simple "Hello World" compute example showing basic usage of OpenCL which
|
|
calculates the mathematical square (X[i] = pow(X[i],2)) for a buffer of
|
|
floating point values.
|
|
|
|
For simplicity, this example is intended to be run from the command line.
|
|
If run from within XCode, open the Run Log (Command-Shift-R) to see the
|
|
output. Alternatively, run the applications from within a Terminal.app
|
|
session to launch from the command line.
|
|
|
|
===========================================================================
|
|
BUILD REQUIREMENTS:
|
|
|
|
Mac OS X v10.6 or later
|
|
or
|
|
Windows with NVidia OpenCL SDK Installed
|
|
|
|
===========================================================================
|
|
RUNTIME REQUIREMENTS:
|
|
|
|
Mac OS X v10.6 or later
|
|
|
|
To use the GPU as a compute device, use one of the following devices:
|
|
- MacBook Pro w/NVidia GeForce 8600M
|
|
- Mac Pro w/NVidia GeForce 8800GT
|
|
|
|
If you don't have powerful GPU you can try to use CPU instead:
|
|
by changing
|
|
gpu := CL_DEVICE_TYPE_GPU;
|
|
to
|
|
gpu := CL_DEVICE_TYPE_CPU;
|
|
|
|
Windows with NVidia OpenCL SDK Installed and libOpenCLXX.dll available
|
|
|
|
CL_DEVICE_TYPE_CPU doesn't seem to work for windows
|
|
|
|
===========================================================================
|
|
}
|
|
program testcl;
|
|
|
|
{$mode objfpc}{$H+}
|
|
|
|
uses
|
|
ctypes, cl;
|
|
|
|
|
|
// Use a static data size for simplicity
|
|
|
|
const
|
|
DATA_SIZE = 1024;
|
|
|
|
// Simple compute kernel which computes the square of an input array
|
|
const
|
|
KernelSource : PChar =
|
|
'__kernel void square( '#10+
|
|
' __global float* input, '#10+
|
|
' __global float* output, '#10+
|
|
' const unsigned int count) '#10+
|
|
'{ '#10+
|
|
' int i = get_global_id(0); '#10+
|
|
' if(i < count) '#10+
|
|
' output[i] = input[i] * input[i]; '#10+
|
|
'} '#0;
|
|
|
|
|
|
var
|
|
err : Integer; // error code returned from api calls
|
|
data : array [0..DATA_SIZE-1] of single; // original data set given to device
|
|
results : array [0..DATA_SIZE-1] of single; // results returned from device
|
|
correct : LongWord; // number of correct results returned
|
|
|
|
global : csize_t; // global domain size for our calculation
|
|
local : csize_t; // local domain size for our calculation
|
|
|
|
device_id : cl_device_id; // compute device id
|
|
context : cl_context; // compute context
|
|
commands : cl_command_queue; // compute command queue
|
|
prog : cl_program; // compute program
|
|
kernel : cl_kernel; // compute kernel
|
|
|
|
input : cl_mem; // device memory used for the input array
|
|
output : cl_mem; // device memory used for the output array
|
|
|
|
i : Integer;
|
|
count : Integer;
|
|
gpu : cl_device_type;
|
|
|
|
tmpd : single;
|
|
platformids : Pcl_platform_id;
|
|
num_platforms : cl_uint;
|
|
|
|
begin
|
|
// Fill our data set with random float values
|
|
count := DATA_SIZE;
|
|
for i:=0 to count - 1 do
|
|
data[i]:= random;
|
|
|
|
err:=clGetPlatformIDs(0,nil,@num_platforms);
|
|
Writeln('clGetPlatformIDs ', num_platforms);
|
|
if (err <> CL_SUCCESS) then
|
|
begin
|
|
writeln('Error: Cannot get number of platforms!');
|
|
Halt(1);
|
|
end;
|
|
|
|
getmem(platformids,num_platforms*sizeof(cl_platform_id));
|
|
|
|
err := clGetPlatformIDs(num_platforms, platformids, nil);
|
|
|
|
if (err <> CL_SUCCESS) then begin
|
|
Writeln('Error: Failed to platforms!');
|
|
Halt($FF);
|
|
end;
|
|
|
|
// Connect to a compute device
|
|
// change CL_DEVICE_TYPE_CPU to CL_DEVICE_TYPE_GPU is you have powerful video (GeForce 8800/8600M or higher)
|
|
gpu := CL_DEVICE_TYPE_GPU;
|
|
|
|
device_id:=nil;
|
|
err := clGetDeviceIDs(platformids[0], gpu, 1, @device_id, nil);
|
|
writeln('clGetDeviceIDs ', err);
|
|
if (err <> CL_SUCCESS) then begin
|
|
Writeln('Error: Failed to create a device group!');
|
|
Halt($FF);
|
|
end;
|
|
|
|
// Create a compute context
|
|
context := clCreateContext(nil, 1, @device_id, nil, nil, err);
|
|
writeln('clCreateContext ', err);
|
|
if context=nil then begin
|
|
Writeln('Error: Failed to create a compute context!');
|
|
Halt($FF);
|
|
end;
|
|
|
|
// Create a command commands
|
|
commands := clCreateCommandQueue(context, device_id, 0, err);
|
|
writeln('clCreateCommandQueue ', err);
|
|
if commands=nil then begin
|
|
Writeln('Error: Failed to create a command commands!');
|
|
Halt($FF);
|
|
end;
|
|
|
|
// Create the compute program from the source buffer
|
|
prog:= clCreateProgramWithSource(context, 1, PPChar(@KernelSource), nil, err);
|
|
writeln('clCreateProgramWithSource ', err);
|
|
if prog=nil then begin
|
|
writeln('Error: Failed to create compute program! ');
|
|
Halt($FF);
|
|
end;
|
|
|
|
// Build the program executable
|
|
err := clBuildProgram(prog, 0, nil, nil, nil, nil);
|
|
writeln('clBuildProgram ', err);
|
|
if (err <> CL_SUCCESS) then begin
|
|
writeln('Error: Failed to build program executable!');
|
|
Halt(1);
|
|
end;
|
|
|
|
// Create the compute kernel in the program we wish to run
|
|
kernel := clCreateKernel(prog, 'square', err);
|
|
writeln('clCreateKernel ', err);
|
|
if (kernel=nil) or (err <> CL_SUCCESS) then begin
|
|
writeln('Error: Failed to create compute kernel!');
|
|
Halt(1);
|
|
end;
|
|
|
|
err := clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), @local, nil);
|
|
writeln('clGetKernelWorkGroupInfo ', err);
|
|
if (err<>CL_SUCCESS) then begin
|
|
writeln('Error: Failed to retrieve kernel work group info!');
|
|
Halt(1);
|
|
end;
|
|
|
|
|
|
// Create the input and output arrays in device memory for our calculation
|
|
input := clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(single) * count, nil, err);
|
|
writeln('clCreateBuffer ', err);
|
|
output := clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(single) * count, nil, err);
|
|
writeln('clCreateBuffer ', err);
|
|
if (input=nil) or (output=nil) then begin
|
|
writeln('Error: Failed to allocate device memory!');
|
|
Halt(1);
|
|
end;
|
|
|
|
// Write our data set into the input array in device memory
|
|
err := clEnqueueWriteBuffer(commands, input, CL_TRUE, 0, sizeof(single) * count, @data, 0, nil, nil);
|
|
writeln('clEnqueueWriteBuffer ', err);
|
|
if (err <> CL_SUCCESS) then begin
|
|
writeln('Error: Failed to write to source array!');
|
|
Halt(1);
|
|
end;
|
|
|
|
// Set the arguments to our compute kernel
|
|
err := 0;
|
|
err := clSetKernelArg(kernel, 0, sizeof(cl_mem), @input);
|
|
writeln('clSetKernelArg ', err);
|
|
err := err or clSetKernelArg(kernel, 1, sizeof(cl_mem), @output);
|
|
writeln('clSetKernelArg ', err);
|
|
err := err or clSetKernelArg(kernel, 2, sizeof(longword), @count);
|
|
writeln('clSetKernelArg ', err);
|
|
if (err <> CL_SUCCESS) then begin
|
|
writeln('Error: Failed to set kernel arguments! ');
|
|
Halt(1);
|
|
end;
|
|
|
|
// Get the maximum work group size for executing the kernel on the device
|
|
err := clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), @local, nil);
|
|
writeln('clGetKernelWorkGroupInfo ', err);
|
|
if (err<>CL_SUCCESS) then begin
|
|
writeln('Error: Failed to retrieve kernel work group info!');
|
|
Halt(1);
|
|
end;
|
|
|
|
// Execute the kernel over the entire range of our 1d input data set
|
|
// using the maximum number of work group items for this device
|
|
global := count;
|
|
err := clEnqueueNDRangeKernel(commands, kernel, 1, nil, @global, @local, 0, nil, nil);
|
|
writeln('clEnqueueNDRangeKernel ',err);
|
|
if (err<>0) then begin
|
|
writeln('Error: Failed to execute kernel!');
|
|
Halt($FF);
|
|
end;
|
|
|
|
// Wait for the command commands to get serviced before reading back results
|
|
err:=clFinish(commands);
|
|
writeln('clFinish ',err);
|
|
|
|
// Read back the results from the device to verify the output
|
|
err := clEnqueueReadBuffer( commands, output, CL_TRUE, 0, sizeof(single) * count, @results, 0, nil, nil);
|
|
writeln('clEnqueueReadBuffer ',err);
|
|
if (err <> CL_SUCCESS) then begin
|
|
writeln('Error: Failed to read output array! ', err);
|
|
Halt(1);
|
|
end;
|
|
|
|
// Validate our results
|
|
correct := 0;
|
|
for i:= 0 to count - 1 do begin
|
|
// FPU warning:
|
|
//
|
|
// the following check (as in original C sample)
|
|
// if results[i] = data[i] * data[i] then
|
|
//
|
|
// return the incorrect result (FP accuracy?),
|
|
// must store the result to single type variable first,
|
|
// and then compare:
|
|
tmpd:=data[i] * data[i];
|
|
if results[i] = tmpd then inc(correct);
|
|
end;
|
|
|
|
// Print a brief summary detailing the results
|
|
writeln('Computed ', correct, '/', count,' correct values!');
|
|
|
|
// Shutdown and cleanup
|
|
clReleaseMemObject(input);
|
|
clReleaseMemObject(output);
|
|
clReleaseProgram(prog);
|
|
clReleaseKernel(kernel);
|
|
clReleaseCommandQueue(commands);
|
|
clReleaseContext(context);
|
|
end.
|
|
|