Running a computation in CUDA is actually quite straightforward: you write a kernel that computes a global index from blockDim, blockIdx, and threadIdx, and launch it over a grid of blocks. An example would be:
/*
 * nvcc cuda_example.cu -o cuda_example && ./cuda_example
 */
#include <stdio.h>
#include <stdlib.h>
#include <math.h>

// Size of array
#define N 1048576

// Kernel
__global__ void add_vectors(double *a, double *b, double *c)
{
    int id = blockDim.x * blockIdx.x + threadIdx.x;
    if(id < N) c[id] = a[id] + b[id];
}

// Main program
int main()
{
    // Number of bytes to allocate for N doubles
    size_t bytes = N*sizeof(double);

    // Allocate memory for arrays A, B, and C on host
    double *A = (double*)malloc(bytes);
    double *B = (double*)malloc(bytes);
    double *C = (double*)malloc(bytes);

    // Allocate memory for arrays d_A, d_B, and d_C on device
    double *d_A, *d_B, *d_C;
    cudaMalloc(&d_A, bytes);
    cudaMalloc(&d_B, bytes);
    cudaMalloc(&d_C, bytes);

    // Fill host arrays A and B
    for(int i=0; i<N; i++)
    {
        A[i] = 1.0;
        B[i] = 2.0;
    }

    // Copy data from host arrays A and B to device arrays d_A and d_B
    cudaMemcpy(d_A, A, bytes, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, B, bytes, cudaMemcpyHostToDevice);

    // Set execution configuration parameters
    //   thr_per_blk: number of CUDA threads per grid block
    //   blk_in_grid: number of blocks in grid
    int thr_per_blk = 256;
    int blk_in_grid = ceil( float(N) / thr_per_blk );

    // Launch kernel
    add_vectors<<< blk_in_grid, thr_per_blk >>>(d_A, d_B, d_C);

    // Copy data from device array d_C to host array C
    cudaMemcpy(C, d_C, bytes, cudaMemcpyDeviceToHost);

    // Verify results
    double tolerance = 1.0e-14;
    for(int i=0; i<N; i++)
    {
        if( fabs(C[i] - 3.0) > tolerance )
        {
            printf("\nError: value of C[%d] = %f instead of 3.0\n\n", i, C[i]);
            exit(1);
        }
    }

    // Free CPU memory
    free(A);
    free(B);
    free(C);

    // Free GPU memory
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

    printf("\n---------------------------\n");
    printf("__SUCCESS__\n");
    printf("---------------------------\n");
    printf("N                 = %d\n", N);
    printf("Threads Per Block = %d\n", thr_per_blk);
    printf("Blocks In Grid    = %d\n", blk_in_grid);
    printf("---------------------------\n\n");

    return 0;
}
There are barely any “abstractions” per se, or at least you don’t think of them that way: all the objects and functions map intuitively onto how we imagine a physical GPU device.
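As a concrete instance of that mapping: with thr_per_blk = 256, the thread with threadIdx.x = 3 in block blockIdx.x = 2 computes id = 2 * 256 + 3 = 515, so it adds element 515 of the two arrays; the if(id < N) guard just handles the case where N is not a multiple of the block size.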
That’s not the case in Apple’s Metal framework, though. Let’s see what the code would look like in Metal, written in Objective-C, for a similar add operation:
/*
 * clang -framework CoreGraphics -framework Foundation -framework Metal example.m -o example.o && ./example.o
 */
#import <Foundation/Foundation.h>
#import <Metal/Metal.h>
#import <CoreGraphics/CoreGraphics.h>

int main(int argc, char** argv)
{
    id<MTLDevice> device = MTLCreateSystemDefaultDevice();
    id<MTLCommandQueue> commandQueue = [device newCommandQueue];

    NSError* compileError;
    MTLCompileOptions* compileOptions = [MTLCompileOptions new];
    id<MTLLibrary> library = [device newLibraryWithSource:
        @"#include <metal_stdlib>\n"
         "using namespace metal;\n"
         "kernel void add(const device float2 *in [[ buffer(0) ]],\n"
         "                device float *out       [[ buffer(1) ]],\n"
         "                uint id [[ thread_position_in_grid ]]) {\n"
         "    out[id] = in[id].x + in[id].y;\n"
         "}\n"
        options: compileOptions error: &compileError];
    id<MTLFunction> kernelFunction = [library newFunctionWithName:@"add"];

    //----------------------------------------------------------------------
    // Pipeline
    NSError *error = NULL;
    id<MTLCommandBuffer> commandBuffer = [commandQueue commandBuffer];
    id<MTLComputeCommandEncoder> encoder = [commandBuffer computeCommandEncoder];
    id<MTLComputePipelineState> pipelineState = [device newComputePipelineStateWithFunction:kernelFunction error:&error];
    [encoder setComputePipelineState:pipelineState];

    //----------------------------------------------------------------------
    // Set data
    float input[] = {1, 2};
    NSInteger dataSize = sizeof(input);
    [encoder setBuffer:[device newBufferWithBytes:input length:dataSize options:0]
                offset:0
               atIndex:0];
    id<MTLBuffer> outputBuffer = [device newBufferWithLength:sizeof(float) options:0];
    [encoder setBuffer:outputBuffer offset:0 atIndex:1];

    //----------------------------------------------------------------------
    // Run kernel
    MTLSize numThreadgroups = {1, 1, 1};
    MTLSize threadsPerGroup = {1, 1, 1};
    [encoder dispatchThreadgroups:numThreadgroups threadsPerThreadgroup:threadsPerGroup];
    [encoder endEncoding];
    [commandBuffer commit];
    [commandBuffer waitUntilCompleted];

    //----------------------------------------------------------------------
    // Results
    float *output = [outputBuffer contents];
    printf("result = %f\n", output[0]);

    return 0;
}
The peculiar Objective-C syntax may seem distracting, but even setting it aside, you can see we are dealing with a lot more nouns, or in other words, layers of object orientation. Here are the steps described in English:
1. Set up: get a handle to the GPU (MTLDevice) and a channel to talk to it (MTLCommandQueue).
2. Compile: compile the kernel source into an MTLLibrary, look up the add function (MTLFunction), and turn it into something runnable (MTLComputePipelineState).
3. Describe the computation: create a command buffer and a compute command encoder, and point the encoder at the pipeline state.
4. Allocate memory: create MTLBuffer objects for the input and output and bind them to the encoder's argument slots.
5. Run: dispatch the threadgroups, end encoding, commit the command buffer, and wait for completion.
Seeing all these symbols made me think about the merits of object-oriented design, especially when compared to CUDA’s simple API. It appears that every simple function call that makes intuitive sense, like cudaMalloc, now has to be wrapped in a dedicated class with special getters and setters, for example:
[encoder setComputePipelineState:[device newComputePipelineStateWithFunction:kernelFunction error:&error]];
But putting the rant aside, the verbosity does yield a nice conceptual model that describes GPU programming in a hardware-agnostic manner, so maybe it’s worth the trade-off in simplicity.
We start with the physical GPU chip, which Metal calls an MTLDevice. Each physical GPU gets its own MTLDevice, like a file handle that represents the underlying hardware.
┌──────────┐┌──────────┐┌──────────┐
│ Physical ││ Physical ││ Physical │
│  GPU 1   ││  GPU 2   ││  GPU 3   │
└──────────┘└──────────┘└──────────┘
   ▲           ▲           ▲
   │           │           │
┌──┴───────┐┌──┴───────┐┌──┴───────┐
│MTLDevice ││MTLDevice ││MTLDevice │
│instance1 ││instance2 ││instance3 │
└──────────┘└──────────┘└──────────┘
You can think of MTLDevice as class MTLDevice: def __init__(self, device_number: int)... in Python, although in Objective-C it is defined as a protocol, and the operating system provides the “instance” to you directly. One of the common ways to access it is by calling the top-level MTLCreateSystemDefaultDevice C function (Objective-C can invoke C functions the normal way).
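In pseudo-Python, that mental model might look something like this (a sketch only: device_number and the returned instance are made up for illustration, the real protocol has no such initializer):

class MTLDevice:  # in Objective-C this is a protocol, not a class you construct yourself
    def __init__(self, device_number: int): ...

def MTLCreateSystemDefaultDevice() -> MTLDevice:
    # the operating system picks the default GPU and hands you the handle
    return MTLDevice(device_number=0)

device = MTLCreateSystemDefaultDevice()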
Obtaining a handle to the device is required to compile (maybe not?) and run code. The Metal code, when compiled into byte code, is called an MTLLibrary. Think of it as just a fancy dictionary:
class MTLLibrary:
    def __init__(self):
        self.mapping = {
            "add": b"\x001010101010010101"  # the binary here is the compiled C++ we wrote earlier
        }
You might think obtaining a reference to the add function is just a matter of indexing into the mapping object, but it seems the object-oriented designers prefer to create a separate class for it, hence MTLFunction:
class MTLFunction:
    def __init__(self, binary: bytes): ...

class MTLLibrary:
    def newFunctionWithName(self, name):
        return MTLFunction(self.mapping[name])
Having the byte code doesn’t mean you can call it, so there’s another class for running it, called an MTLComputePipelineState:
class MTLComputePipelineState:
    def __init__(self, fn: MTLFunction): pass
    def __call__(self): ...
And you call newComputePipelineStateWithFunction on the device, passing in the MTLFunction, to get an instance of it:
class MTLDevice:
    def newComputePipelineStateWithFunction(self, function: MTLFunction):
        return MTLComputePipelineState(function)
To create this MTLLibrary, call the newLibraryWithSource
on your device.
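In the same pseudo-Python spirit (a sketch, not the real signature, which also takes compile options and an error pointer; compile_metal is made up for illustration):

class MTLDevice:
    def newLibraryWithSource(self, source: str) -> MTLLibrary:
        binaries = compile_metal(source)  # hypothetical helper: compile the Metal source into byte code
        return MTLLibrary(binaries)       # the library's mapping is built from these binaries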
Visually:
                                                 newComputePipelineStateWithFunction
     newLibraryWithSource newFunctionWithName
┌─────────┐         ┌──────────┐        ┌───────────┐        ┌─────────────────────────┐
│MTLDevice├────────►│MTLLibrary├───────►│MTLFunction├───────►│ MTLComputePipelineState │
└─────────┘   ▲     └─────┬────┘        └─────┬─────┘        └────────────┬────────────┘
              │           │                   │                           │
┌─────────────┴─┐   ┌─────┴──────┐     ┌──────┴─────┐              ┌──────┴───────┐
│C++ source code│   │c++ binaries│     │ c++ binary │              │c++ executable│
└───────────────┘   └────────────┘     └────────────┘              └──────────────┘
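Chaining the pseudo-Python classes above, the whole compile step looks roughly like this (a sketch under the same made-up model, not real API calls):

device = MTLCreateSystemDefaultDevice()
library = device.newLibraryWithSource(metal_source)             # C++ source code -> c++ binaries
add_fn = library.newFunctionWithName("add")                     # pick one binary out of the library
pipeline = device.newComputePipelineStateWithFunction(add_fn)   # something you can actually run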
Now that we have finished the compilation step, which corresponds to running nvcc file.cu, let’s look at the next step: setting up the computation on the GPU.
You may think that all it takes to prepare the GPU for computation is to 1. allocate memory, 2. specify sizes, and 3. pass in the C++ executable. At least that’s what I thought; there is more to it, though.
First, there needs to be a way to communicate with the GPU. Instead of calling some low-level C functions from other libraries, Metal wraps them all under an instance called MTLCommandQueue. If you want to tell the GPU to allocate some memory, set up the function to execute, and so on, you send those instructions to the MTLCommandQueue. It might be tempting to think of it as:
class MTLCommandQueue:
    def alloc(self, size):
        ...  # call a low level C function
But having it take in direct commands is not ideal: you may have multiple threads of a program all trying to send this alloc command to the same GPU, so there needs to be a mechanism to separate them. Metal calls this MTLCommandBuffer. Again, object-oriented design seems to really want to give an unintuitive name to everything intuitive…
So the model looks like this: essentially we just send a list of instructions, so it is clear where each instruction originated and the ordering is retained.
class MTLCommandQueue:
    def send_instructions(self, commandBuffer):
        for instruction in commandBuffer.instructions:
            externallibrary.run(instruction)

class CommandBuffer:
    def alloc(self, size):
        self.instructions.append(f"alloc({size})")
    def commit(self):
        mtlCommandQueue.send_instructions(self)
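Under this made-up model, usage would look like:

mtlCommandQueue = MTLCommandQueue()
commandBuffer = CommandBuffer()
commandBuffer.alloc(1024)   # only queued inside the buffer, nothing runs yet
commandBuffer.commit()      # the whole batch is handed to the queue, in order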
Obviously, appending strings to a list is not ideal, and there could be multiple threads sending instructions, so there needs to be a structured way to handle them. As a result, instead of manually appending strings, let’s create a separate class to handle it; Metal calls this MTLComputeCommandEncoder:
class CommandBuffer:
    def send_instructions(self, commandEncoder):
        self.instructions.append(commandEncoder)
    def commit(self): ...

class CommandEncoder:
    def dispatchThreadgroups(self): pass
    def setBuffer(self): pass
    def setComputePipelineState(self): pass
    def endEncoding(self):
        commandBuffer.send_instructions(self)
It seems a bit redundant, but you get the idea. In fact, most of the useful functions are defined at the encoder level, as you can see in the pseudo-Python code above. dispatchThreadgroups is analogous to specifying the grid size etc. when launching a CUDA kernel, setBuffer is like cudaMemcpy, and setComputePipelineState is like the add_vectors<<< >>> triple angle bracket in CUDA telling it which function to run.
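Putting it all together in the same pseudo-Python, the launch sequence from the Objective-C example looks roughly like this, with the rough CUDA analogue in the comments (a sketch of the mental model, not real API; the variable names are illustrative):

encoder = commandBuffer.computeCommandEncoder()
encoder.setComputePipelineState(pipeline)        # which kernel to run, like add_vectors<<< >>>
encoder.setBuffer(input_buffer, index=0)         # hand over the data, roughly cudaMemcpy plus the kernel arguments
encoder.setBuffer(output_buffer, index=1)
encoder.dispatchThreadgroups(num_groups, threads_per_group)  # grid / block dimensions
encoder.endEncoding()
commandBuffer.commit()                           # actually send the work to the GPU
commandBuffer.waitUntilCompleted()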