Running a computation in CUDA is actually quite straightforward: you write a kernel that computes a global index from blockDim, blockIdx, and threadIdx, and launch it over a grid of blocks. An example would be:
/*
 * nvcc cuda_example.cu -o cuda_example && ./cuda_example
 */
#include <stdio.h>
#include <stdlib.h>
#include <math.h>

// Size of array
#define N 1048576

// Kernel
__global__ void add_vectors(double *a, double *b, double *c)
{
    int id = blockDim.x * blockIdx.x + threadIdx.x;
    if(id < N) c[id] = a[id] + b[id];
}

// Main program
int main()
{
    // Number of bytes to allocate for N doubles
    size_t bytes = N*sizeof(double);

    // Allocate memory for arrays A, B, and C on host
    double *A = (double*)malloc(bytes);
    double *B = (double*)malloc(bytes);
    double *C = (double*)malloc(bytes);

    // Allocate memory for arrays d_A, d_B, and d_C on device
    double *d_A, *d_B, *d_C;
    cudaMalloc(&d_A, bytes);
    cudaMalloc(&d_B, bytes);
    cudaMalloc(&d_C, bytes);

    // Fill host arrays A and B
    for(int i=0; i<N; i++)
    {
        A[i] = 1.0;
        B[i] = 2.0;
    }

    // Copy data from host arrays A and B to device arrays d_A and d_B
    cudaMemcpy(d_A, A, bytes, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, B, bytes, cudaMemcpyHostToDevice);

    // Set execution configuration parameters
    //   thr_per_blk: number of CUDA threads per grid block
    //   blk_in_grid: number of blocks in grid
    int thr_per_blk = 256;
    int blk_in_grid = ceil( float(N) / thr_per_blk );

    // Launch kernel
    add_vectors<<< blk_in_grid, thr_per_blk >>>(d_A, d_B, d_C);

    // Copy data from device array d_C to host array C
    cudaMemcpy(C, d_C, bytes, cudaMemcpyDeviceToHost);

    // Verify results
    double tolerance = 1.0e-14;
    for(int i=0; i<N; i++)
    {
        if( fabs(C[i] - 3.0) > tolerance )
        {
            printf("\nError: value of C[%d] = %f instead of 3.0\n\n", i, C[i]);
            exit(1);
        }
    }

    // Free CPU memory
    free(A);
    free(B);
    free(C);

    // Free GPU memory
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

    printf("\n---------------------------\n");
    printf("__SUCCESS__\n");
    printf("---------------------------\n");
    printf("N                 = %d\n", N);
    printf("Threads Per Block = %d\n", thr_per_blk);
    printf("Blocks In Grid    = %d\n", blk_in_grid);
    printf("---------------------------\n\n");

    return 0;
}
There are barely any “abstractions” per se, or at least you don’t think of them that way: all the objects and functions map intuitively onto how we imagine a physical GPU device.
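As a concrete instance of that mapping: with thr_per_blk = 256, the thread with threadIdx.x = 3 in block blockIdx.x = 2 computes id = 2 * 256 + 3 = 515, so it adds element 515 of the two arrays; the if(id < N) guard just handles the case where N is not a multiple of the block size.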
That’s not the case in Apple’s Metal framework, though. Let’s see what the code would look like in Metal, written in Objective-C, for a similar add operation:
/*
 * clang -framework CoreGraphics -framework Foundation -framework Metal example.m -o example.o && ./example.o
 */
#import <Foundation/Foundation.h>
#import <Metal/Metal.h>
#import <CoreGraphics/CoreGraphics.h>

int main(int argc, char** argv)
{
    id<MTLDevice> device = MTLCreateSystemDefaultDevice();
    id<MTLCommandQueue> commandQueue = [device newCommandQueue];

    NSError* compileError;
    MTLCompileOptions* compileOptions = [MTLCompileOptions new];
    id<MTLLibrary> library = [device newLibraryWithSource:
        @"#include <metal_stdlib>\n"
         "using namespace metal;\n"
         "kernel void add(const device float2 *in [[ buffer(0) ]],\n"
         "                device float *out       [[ buffer(1) ]],\n"
         "                uint id [[ thread_position_in_grid ]]) {\n"
         "    out[id] = in[id].x + in[id].y;\n"
         "}\n"
        options: compileOptions error: &compileError];
    id<MTLFunction> kernelFunction = [library newFunctionWithName:@"add"];

    //----------------------------------------------------------------------
    // Pipeline
    NSError *error = NULL;
    id<MTLCommandBuffer> commandBuffer = [commandQueue commandBuffer];
    id<MTLComputeCommandEncoder> encoder = [commandBuffer computeCommandEncoder];
    id<MTLComputePipelineState> pipelineState = [device newComputePipelineStateWithFunction:kernelFunction error:&error];
    [encoder setComputePipelineState:pipelineState];

    //----------------------------------------------------------------------
    // Set data
    float input[] = {1, 2};
    NSInteger dataSize = sizeof(input);
    [encoder setBuffer:[device newBufferWithBytes:input length:dataSize options:0]
                offset:0
               atIndex:0];
    id<MTLBuffer> outputBuffer = [device newBufferWithLength:sizeof(float) options:0];
    [encoder setBuffer:outputBuffer offset:0 atIndex:1];

    //----------------------------------------------------------------------
    // Run kernel
    MTLSize numThreadgroups = {1, 1, 1};
    MTLSize threadsPerGroup = {1, 1, 1};
    [encoder dispatchThreadgroups:numThreadgroups threadsPerThreadgroup:threadsPerGroup];
    [encoder endEncoding];
    [commandBuffer commit];
    [commandBuffer waitUntilCompleted];

    //----------------------------------------------------------------------
    // Results
    float *output = [outputBuffer contents];
    printf("result = %f\n", output[0]);

    return 0;
}
The peculiar Objective-C syntax may seem distracting, but even setting it aside, you can see we are dealing with a lot more nouns, or in other words, layers of object orientation. Here are the steps described in English:
1. Set up: get a handle to the GPU (MTLDevice) and a channel to talk to it (MTLCommandQueue).
2. Compile: compile the kernel source into an MTLLibrary, look up the add function (MTLFunction), and turn it into something runnable (MTLComputePipelineState).
3. Describe the computation: create a command buffer and a compute command encoder, and point the encoder at the pipeline state.
4. Allocate memory: create MTLBuffer objects for the input and output and bind them to the encoder's argument slots.
5. Run: dispatch the threadgroups, end encoding, commit the command buffer, and wait for completion.
Seeing all these symbols made me think about the merits of object-oriented design, especially when compared to CUDA’s simple API. It appears that every simple function call that makes intuitive sense, like cudaMalloc, now has to be wrapped in a dedicated class with special getters and setters, for example:
[encoder setComputePipelineState:[device newComputePipelineStateWithFunction:kernelFunction error:&error]];
But putting the rant aside, the verbosity does yield a nice conceptual model that describes GPU programming in a hardware-agnostic manner, so maybe it’s worth the trade-off in simplicity.
We start with the physical GPU chip, which Metal calls an MTLDevice. Each physical GPU gets its own MTLDevice, like a file handle that represents the underlying hardware.
┌──────────┐┌──────────┐┌──────────┐
│ Physical ││ Physical ││ Physical │
│  GPU 1   ││  GPU 2   ││  GPU 3   │
└──────────┘└──────────┘└──────────┘
   ▲           ▲           ▲
   │           │           │
┌──┴───────┐┌──┴───────┐┌──┴───────┐
│MTLDevice ││MTLDevice ││MTLDevice │
│instance1 ││instance2 ││instance3 │
└──────────┘└──────────┘└──────────┘
You can think of MTLDevice as class MTLDevice: def __init__(self, device_number: int)... in Python, although in Objective-C it is defined as a protocol, and the operating system provides the “instance” to you directly. One of the common ways to access it is by calling the top-level MTLCreateSystemDefaultDevice C function (Objective-C can invoke C functions the normal way).
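In pseudo-Python, that mental model might look something like this (a sketch only: device_number and the returned instance are made up for illustration, the real protocol has no such initializer):

class MTLDevice:  # in Objective-C this is a protocol, not a class you construct yourself
    def __init__(self, device_number: int): ...

def MTLCreateSystemDefaultDevice() -> MTLDevice:
    # the operating system picks the default GPU and hands you the handle
    return MTLDevice(device_number=0)

device = MTLCreateSystemDefaultDevice()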
Obtaining a handle to the device is required to compile (maybe not?) and run code. The Metal code, when compiled into byte code, is called an MTLLibrary. Think of it as just a fancy dictionary:
class MTLLibrary:
    def __init__(self):
        self.mapping = {
            "add": b"\x001010101010010101"  # the binary here is the compiled C++ we wrote earlier
        }
You might think obtaining a reference to the add function is just a matter of indexing into the mapping object, but it seems the object-oriented designers prefer to create a separate class for it, hence MTLFunction:
class MTLFunction:
    def __init__(self, binary: bytes): ...

class MTLLibrary:
    def newFunctionWithName(self, name):
        return MTLFunction(self.mapping[name])
Having the byte code doesn’t mean you can call it, so there’s another class for running it, called an MTLComputePipelineState:
class MTLComputePipelineState:
    def __init__(self, fn: MTLFunction): pass
    def __call__(self): ...
And you call newComputePipelineStateWithFunction on the device, passing in the MTLFunction, to get an instance of it:
class MTLDevice:
    def newComputePipelineStateWithFunction(self, function: MTLFunction):
        return MTLComputePipelineState(function)
To create this MTLLibrary, call the newLibraryWithSource
on your device.
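In the same pseudo-Python spirit (a sketch, not the real signature, which also takes compile options and an error pointer; compile_metal is made up for illustration):

class MTLDevice:
    def newLibraryWithSource(self, source: str) -> MTLLibrary:
        binaries = compile_metal(source)  # hypothetical helper: compile the Metal source into byte code
        return MTLLibrary(binaries)       # the library's mapping is built from these binaries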
Visually:
                                                 newComputePipelineStateWithFunction
     newLibraryWithSource newFunctionWithName
┌─────────┐         ┌──────────┐        ┌───────────┐        ┌─────────────────────────┐
│MTLDevice├────────►│MTLLibrary├───────►│MTLFunction├───────►│ MTLComputePipelineState │
└─────────┘   ▲     └─────┬────┘        └─────┬─────┘        └────────────┬────────────┘
              │           │                   │                           │
┌─────────────┴─┐   ┌─────┴──────┐     ┌──────┴─────┐              ┌──────┴───────┐
│C++ source code│   │c++ binaries│     │ c++ binary │              │c++ executable│
└───────────────┘   └────────────┘     └────────────┘              └──────────────┘
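Chaining the pseudo-Python classes above, the whole compile step looks roughly like this (a sketch under the same made-up model, not real API calls):

device = MTLCreateSystemDefaultDevice()
library = device.newLibraryWithSource(metal_source)             # C++ source code -> c++ binaries
add_fn = library.newFunctionWithName("add")                     # pick one binary out of the library
pipeline = device.newComputePipelineStateWithFunction(add_fn)   # something you can actually run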
Now that we have finished the compilation step, which corresponds to running nvcc file.cu, let’s look at the next step: setting up the computation on the GPU.
You may think that all it takes to prepare the GPU for computation is to 1. allocate memory, 2. specify sizes, and 3. pass in the C++ executable. At least that’s what I thought; there is more to it, though.
First, there needs to be a way to communicate with the GPU. Instead of calling some low-level C functions from other libraries, Metal wraps them all under an instance called MTLCommandQueue. If you want to tell the GPU to allocate some memory, set up the function to execute, and so on, you send those instructions to the MTLCommandQueue. It might be tempting to think of it as:
class MTLCommandQueue:
    def alloc(self, size):
        ...  # call a low level C function
But having it take in direct commands is not ideal: you may have multiple threads of a program all trying to send this alloc command to the same GPU, so there needs to be a mechanism to separate them. Metal calls this MTLCommandBuffer. Again, object-oriented design seems to really want to give an unintuitive name to everything intuitive…
So the model looks like this: essentially we just send a list of instructions, so it is clear where each instruction originated and the ordering is retained.
class MTLCommandQueue:
    def send_instructions(self, commandBuffer):
        for instruction in commandBuffer.instructions:
            externallibrary.run(instruction)

class CommandBuffer:
    def alloc(self, size):
        self.instructions.append(f"alloc({size})")
    def commit(self):
        mtlCommandQueue.send_instructions(self)
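Under this made-up model, usage would look like:

mtlCommandQueue = MTLCommandQueue()
commandBuffer = CommandBuffer()
commandBuffer.alloc(1024)   # only queued inside the buffer, nothing runs yet
commandBuffer.commit()      # the whole batch is handed to the queue, in order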
Obviously, appending strings to a list is not ideal, and there could be multiple threads sending instructions, so there needs to be a structured way to handle them. As a result, instead of manually appending strings, let’s create a separate class to handle it; Metal calls this MTLComputeCommandEncoder:
class CommandBuffer:
    def send_instructions(self, commandEncoder):
        self.instructions.append(commandEncoder)
    def commit(self): ...

class CommandEncoder:
    def dispatchThreadgroups(self): pass
    def setBuffer(self): pass
    def setComputePipelineState(self): pass
    def endEncoding(self):
        commandBuffer.send_instructions(self)
It seems a bit redundant, but you get the idea. In fact, most of the useful functions are defined at the encoder level, as you can see in the pseudo-Python code above. dispatchThreadgroups is analogous to specifying the grid size etc. when launching a CUDA kernel, setBuffer is like cudaMemcpy, and setComputePipelineState is like the add_vectors<<< >>> triple angle bracket in CUDA telling it which function to run.
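Putting it all together in the same pseudo-Python, the launch sequence from the Objective-C example looks roughly like this, with the rough CUDA analogue in the comments (a sketch of the mental model, not real API; the variable names are illustrative):

encoder = commandBuffer.computeCommandEncoder()
encoder.setComputePipelineState(pipeline)        # which kernel to run, like add_vectors<<< >>>
encoder.setBuffer(input_buffer, index=0)         # hand over the data, roughly cudaMemcpy plus the kernel arguments
encoder.setBuffer(output_buffer, index=1)
encoder.dispatchThreadgroups(num_groups, threads_per_group)  # grid / block dimensions
encoder.endEncoding()
commandBuffer.commit()                           # actually send the work to the GPU
commandBuffer.waitUntilCompleted()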