The Mnist Feedforward net example

A few posts ago I showed a feedforward pass for a neural net on the XOR example, but no reverse pass. This here is pretty much that neural net except with a reverse pass using the Spiral library. These are the contents of the load_mnist.fsx file:

// Relative paths of the four Mnist files. Note that these are verbatim strings
// (@""), so the leading \ is a literal path separator, not an escape —
// \t here is backslash-t, not a tab.
let testSetData = @"\t10k-images.idx3-ubyte"
let testSetLabels = @"\t10k-labels.idx1-ubyte"
let trainSetData = @"\train-images.idx3-ubyte"
let trainSetLabels = @"\train-labels.idx1-ubyte"

open System
open System.IO

/// One Mnist dataset (images + labels) held in host memory,
/// in both raw byte form and normalized float32 form.
type MnistImageset = {
    num_images : int32
    num_rows : int32
    num_cols : int32
    raw_data : uint8 []         // Raw pixel bytes, num_images*num_rows*num_cols long.
    raw_labels : uint8 []       // One byte (0..9) per image.
    float_data : float32 []     // Pixels scaled to [0,1].
    float_labels : float32 []   // One-hot encoded labels, 10 floats per image.
    }
/// Reads a big-endian int32 from the reader. Mnist headers store their
/// ints in reverse (network) byte order, so the bytes are flipped before
/// being interpreted as a little-endian int.
let readInt32 (r : BinaryReader) = // Because Mnist's ints are in reverse order.
    let arr = r.ReadBytes(4)
    Array.Reverse arr // In-place reverse; the original `arr |> Array.Reverse` discarded the int and returned unit.
    BitConverter.ToInt32(arr, 0)

/// Loads one Mnist data/label file pair from disk and returns it as a
/// MnistImageset record holding both the raw bytes and the normalized floats.
let make_imageset data_path label_path =
    use stream_data = File.OpenRead(data_path)
    use stream_label = File.OpenRead(label_path)
    use reader_data = new BinaryReader(stream_data)
    use reader_label = new BinaryReader(stream_label)

    // idx3 data file header: magic number, image count, rows, cols.
    let magic_number = readInt32 reader_data
    let num_images = readInt32 reader_data
    let num_rows = readInt32 reader_data
    let num_cols = readInt32 reader_data
    let total_num_bytes = num_images * num_rows * num_cols

    let raw_data = reader_data.ReadBytes(total_num_bytes)
    // The idx1 label file has an 8 byte header (magic + count) before the labels.
    let raw_label_data = reader_label.ReadBytes(num_images+8)

    // Scale the pixel bytes to [0,1].
    let float_pixel_values = [|for x in raw_data -> (float32 x)/255.0f|]

    // One-hot encode the labels: 10 floats per image, a 1.0f at the label's index.
    let float_labels = Array.zeroCreate (10*num_images)
    let mutable c = 0
    for x in raw_label_data.[8..] do
        float_labels.[(int x) + c] <- 1.0f
        c <- c+10
    // Record construction — the braces were lost in the original paste,
    // which turned these assignments into dead boolean comparisons.
    {
    num_images = num_images
    num_rows = num_rows
    num_cols = num_cols
    raw_data = raw_data
    raw_labels = raw_label_data.[8..]
    float_data = float_pixel_values
    float_labels = float_labels
    }

The code just loads the Mnist dataset from the given file and returns it in the record above.

// Spiral reverse AD example. Used for testing.
#load "../Spiral Library/ad_utils_spiral_v1.fsx"
open Ad_utils_spiral_v1

#r "../packages/FSharp.Charting.0.90.13/lib/net40/FSharp.Charting.dll"
#r "System.Windows.Forms.DataVisualization.dll"

open FSharp.Charting

#load "load_mnist.fsx"
open Load_mnist
open System

open ManagedCuda
open ManagedCuda.BasicTypes
open ManagedCuda.VectorTypes
open ManagedCuda.CudaBlas
open ManagedCuda.CudaRand
open ManagedCuda.NVRTC
open ManagedCuda.CudaDNN

open System
open System.IO
open System.Collections

// "__SOURCE_DIRECTORY__ + testSetData" gives parsing errors if it is written like "__SOURCE_DIRECTORY__+testSetData"
// __SOURCE_DIRECTORY__ is just a string literal referring to the directory where the script resides.
// Loads the full training and test sets into host memory as MnistImageset records.
let train_data = make_imageset (__SOURCE_DIRECTORY__ + trainSetData) (__SOURCE_DIRECTORY__ + trainSetLabels) 
let test_data = make_imageset (__SOURCE_DIRECTORY__ + testSetData) (__SOURCE_DIRECTORY__ + testSetLabels)

/// Returns a tuple of training set and label set split into minibatches.
/// The whole dataset is first loaded to the GPU, sliced column-wise into
/// batches, and each slice wrapped as a constant (non-trainable) node.
let make_set (s : MnistImageset) batch_size =
    /// Function that splits the dataset along the columns.
    /// The array-comprehension brackets were lost in the original paste.
    let split_cols (x:dMatrix) batch_size =
        [|
        for i=0 to (x.num_cols-1)/batch_size do
            let start_pos = i*batch_size
            // The last minibatch may be smaller than batch_size.
            let end_pos = min ((i+1)*batch_size-1) (x.num_cols-1)
            yield x.[*,start_pos..end_pos]
        |]
    use d_data = dMatrix.create(s.num_rows*s.num_cols,s.num_images,s.float_data) // Loads the data
    use d_label = dMatrix.create(10,s.num_images,s.float_labels) // Loads the labels
    // Array.map (not a bare lambda application) over the minibatch slices.
    let ar_data = split_cols d_data batch_size |> Array.map (fun x -> DM.makeConstantNode x)
    let ar_label = split_cols d_label batch_size |> Array.map (fun x -> DM.makeConstantNode x)
    ar_data, ar_label

// The type of each of these two variable is dMatrix [], dMatrix [] - a tuple.
// Both sets use a minibatch size of 128 columns.
let dtrain = make_set train_data 128
let dtest = make_set test_data 128 

One of the reasons why I am programming in a statically typed language like F# instead of Python is that types are so informative. The functions are like little machines that take in certain types and spit out others. If this code seems difficult you can just hover your mouse over the variables and the IDE will tell you the exact type of everything.

In the above case the last two variables are all dMatrix [], dMatrix [] tuple types.

Here is the code for the feedforward layer.

// A feedforward layer of neurons
type FeedforwardLayer = {
    W:DM     // Input weight matrix
    b:DM     // Bias vector
    a:DM->DM // Activation function (this field was dropped in the paste; its comment survived on the closing brace)
    } with
    /// The trainable base nodes of the layer, for adjoint resets and weight updates.
    member l.ToArray = [|l.W;l.b|]

    /// Rebuilds a layer from its base-node array plus an activation.
    static member fromArray (a : DM[]) act =
        {
         W = a.[0]
         b = a.[1]
         a = act
        }

    /// Creates a layer with uniformly random W (hidden_size x input_size) and b.
    static member createRandomLayer hidden_size input_size act =
        {
         W = DM.makeUniformRandomNode(hidden_size, input_size)
         b = DM.makeUniformRandomNode(hidden_size, 1)
         a = act
        }

    /// Runs the layer: a(W*x + b).
    member l.runLayer (x:DM) =
        linear_layer_matmult [|l.W,x|] (Some l.b) |> l.a

The linear_layer function is just Ax+b. The |> pipe operator at the end forward the result of the previous computation, that is from the call to the linear layer, to the l.a activation function.

// A 784 -> 1024 -> 2048 -> 1024 -> 10 net: three relu hidden layers and a
// clipped steep sigmoid output layer for the 10 Mnist classes.
let l1 = FeedforwardLayer.createRandomLayer 1024 784 relu
let l2 = FeedforwardLayer.createRandomLayer 2048 1024 relu
let l3 = FeedforwardLayer.createRandomLayer 1024 2048 relu
let l4 = FeedforwardLayer.createRandomLayer 10 1024 (clipped_steep_sigmoid 3.0f)
let layers = [|l1;l2;l3;l4|]

This is how layers are created. Currently besides the above, Spiral also has standard RNN, GRU and LSTM classes. More will be added no doubt, as I experiment with reinforcement learning. At the time of writing the Spiral library is about one month old and I am still in the middle of porting my old Alea stuff to it.

// This does not actually train it, it just initiates the tree for later training.
// Threads the input through each layer with a fold and returns a pair of
// (lazy accuracy, cross entropy cost).
let training_loop (data: DM) (targets: DM) (layers: FeedforwardLayer[]) =
    let outputs = layers |> Array.fold (fun state layer -> layer.runLayer state) data
    // I make the accuracy calculation lazy. This is similar to returning a lambda function that calculates the accuracy
    // although in this case it will be calculated at most once.
    lazy get_accuracy targets.r.P outputs.r.P, cross_entropy_cost targets outputs 

The above is the forward pass of the library. The Array.fold part is a bit confusing if you are not familiar with it. I’ll rewrite the function into an equivalent imperative form.

// This does not actually train it, it just initiates the tree for later training.
// Imperative form.
// Equivalent to the fold version above: runs each layer on the previous output.
let training_loop (data: DM) (targets: DM) (layers: FeedforwardLayer[]) =
    let mutable outputs = data
    for x in layers do
        outputs <- x.runLayer outputs
    // I make the accuracy calculation lazy. This is similar to returning a lambda function that calculates the accuracy
    // although in this case it will be calculated at most once.
    lazy get_accuracy targets.r.P outputs.r.P, cross_entropy_cost targets outputs 

The fold function just takes an anonymous function, an initial state and an array as parameters, and iterates over that array. In the last line, the function lazily returns the accuracy value — which is not actually a part of the tree construction or the tape — and non-lazily returns the cross_entropy_cost, which definitely goes into the tape.

The way the accuracy is calculated is worth getting into for a bit.

/// o <- max_col(x)
/// Sets all except one of the max of a column to zero.
/// NOTE(review): the original text of this type was garbled by extraction —
/// the closing braces of the embedded CUDA code, the try/with pattern and the
/// bodies of the last two members are reconstructions of the obvious
/// structure; confirm against the original source.
type DeviceMaxColumnActivationModule() = 
    let block_size = 128

    let kernel_code = "
        //Kernel code:
        extern \"C\" {
            #define INIT __int_as_float(0xff800000) // The constant init for the reduce operations. This is float negative infinity.
            // The max reduce version.
            __device__ inline "+FloatTypeCpp+" warpReduce("+FloatTypeCpp+" value){
	            for (int i=1; i<32; i*=2) {
                    "+FloatTypeCpp+" tmp = __shfl_xor(value, i);
                    value = (tmp > value) ? tmp : value;
                }
	            return value;
            }

            __device__ inline "+FloatTypeCpp+" blockReduce("+FloatTypeCpp+" value){
	            __shared__ "+FloatTypeCpp+" temp[32];
                if (threadIdx.x < 32) temp[threadIdx.x] = INIT; "+FloatTypeCpp+" out_partial = warpReduce(value);
                __syncthreads();
	            if (threadIdx.x % 32 == 0) temp[threadIdx.x / 32] = out_partial;
                __syncthreads();
	            if (threadIdx.x < 32) out_partial = warpReduce(temp[threadIdx.x]);
                return out_partial;
            }

            // Device code
            __global__ void Kernel(const "+FloatTypeCpp+"* A, "+FloatTypeCpp+"* O, const int num_rows, const int num_cols)
            {
                int row = threadIdx.x;
                //const int col = blockIdx.x;
                int col_idx = blockIdx.x*num_rows; "+FloatTypeCpp+" max = INIT; // This is the negative infinity for floats.
                int index = -1;
                // Each block strides down its own column looking for the max.
                while (row < num_rows)
                {
                    if (A[row+col_idx] > max) {
                        max = A[row+col_idx];
                        index = row;
                    }
                    row += blockDim.x;
                }
                __shared__ "+FloatTypeCpp+" max_index;
                if (max == blockReduce(max)) max_index = index;
                __syncthreads();
                index = max_index; // These last four lines are to make absolutely sure that only one max is selected in case there is more than one.
                row = threadIdx.x;
                while (row < num_rows)
                {
                    O[row+col_idx] = (row == index) ? max : 0.0f;
                    row += blockDim.x;
                }
            }
        }"

    let k = new ManagedCuda.NVRTC.CudaRuntimeCompiler(kernel_code,"Kernel")
    do
        // On compilation failure, print the NVRTC log before propagating the error.
        try k.Compile([|"-arch=compute_30"|])
        with
        | :? NVRTCException as x -> 
            printfn "%s" (k.GetLogAsString())
            reraise()

    let kernel = ctx.LoadKernelPTX(k.GetPTX(),"Kernel")

    /// Raw launch: one block per column (n blocks), block_size threads each.
    member t.A(x: CudaDeviceVariable<floatType>, o: CudaDeviceVariable<floatType>, m:int , n: int) =
        kernel.GridDimensions <- dim3(n)
        kernel.BlockDimensions <- dim3(block_size)
        kernel.RunAsync(str.Stream,x.DevicePointer,o.DevicePointer,m,n) |> ignore

    /// In-place variant: writes the column-max activation of x into o.
    member t.A(x: dMatrix, o: dMatrix) =
        if x.rc <> o.rc then failwith "x.rc <> o.rc"
        t.A(x.dArray,o.dArray,x.num_rows,x.num_cols)

    /// Allocating variant: returns a fresh dMatrix with the result.
    member t.A(x: dMatrix) =
        let o = dMatrix.create(x.num_rows,x.num_cols)
        t.A(x,o)
        o
// The modules are created lazily so the Cuda kernels are only compiled on first use.
let maxColumnModule = lazy new DeviceMaxColumnActivationModule()
// Elementwise map-sum: adds 1.0f for every position where the product of the two inputs is nonzero.
let accuracyModule = lazy new DeviceBinaryMapSumModule "(x*y == 0.0f) ? 0.0f : 1.0f;"
// NOTE(review): this function appears truncated by the extraction — it obtains a
// workspace from the tape and resizes it, but the calls into maxColumnModule and
// accuracyModule that the surrounding text describes are missing. Confirm
// against the original source before relying on it.
let get_accuracy targets activations =
    let o = tape.GetDMIf
    o.P.ReplaceIf activations.num_rows activations.num_cols

In the above Cuda kernel, each block iterates over its column and then does a block max reduction. As it was first written, the kernel then just compared the values in the A array with the max and set the values that did not equal the max to zero. This actually caused errors when the values were saturated, as not all values except one of a column would be set to zero. There would be more than one max.

What the above does is neatly avoid that issue by having all threads in a block that equal the max store their indexes into the shared memory at a single location. They each crowd and push each other to store that value and only one of them gets stored. Then the block gets synchronized and they all read that same value, thereby guaranteeing that all threads in a block have the same index. Then all rows except for the one that equals that index get set to zero.

At the end, this outputs a matrix whose columns have only one nonzero value. The above algorithm is actually even more efficient than the previous one, as it does not require reading from the original matrix again.

The way accuracy is then calculated from that is that the column max matrix and the label matrix are elementwise multiplied and then their nonzero elements are set to 1. Then they are summed. That is neatly done in a single operation inside the accuracyModule.

/// Trains the net with plain SGD for num_iters epochs over dtrain and, after
/// each epoch, measures the cost and accuracy on dtest.
/// Returns an array of (training cost, validation cost) pairs, one per epoch.
/// The enclosing [| |] was lost in the paste — the trailing `yield` requires it.
let train_mnist_sgd num_iters learning_rate (layers: FeedforwardLayer[]) =
    [|
    let mutable r' = 0.0f
    // Array.map (not a bare lambda application) collects the base nodes of every layer so they can later be reset.
    let base_nodes = layers |> Array.map (fun x -> x.ToArray) |> Array.concat
    for i=1 to num_iters do
        for x in dtrain do
            let data, target = x
            let _,r = training_loop data target layers // Builds the tape.

            tape.forwardpropTape 0 // Calculates the forward values. Triggers the ff() closures.
            r' <- r' + (!r.r.P/ float32 dtrain.Length) // Adds the cost to the accumulator.
            if System.Single.IsNaN r' then failwith "Nan error"

            for x in base_nodes do x.r.A.setZero() // Resets the base adjoints
            tape.resetTapeAdjoint 0 // Resets the adjoints on the tape
            r.r.A := 1.0f // Pushes 1.0f from the top node
            tape.reversepropTape 0 // Runs the reverse pass over the tape. (The original comment here was a copy-paste of the reset line.)
            add_gradients_to_weights' base_nodes learning_rate // The optimization step
            tape.Clear 0 // Clears the tape without disposing it or the memory buffer. It allows reuse of memory for a 100% gain in speed for the simple recurrent and feedforward case.

        printfn "The training cost at iteration %i is %f" i r'
        let r1 = r'
        r' <- 0.0f
        let mutable acc = 0.0f

        for x in dtest do
            let data, target = x
            let lazy_acc,r = training_loop data target layers // Builds the tape.

            tape.forwardpropTape 0 // Calculates the forward values. Triggers the ff() closures.
            r' <- r' + (!r.r.P/ float32 dtest.Length) // Adds the cost to the accumulator.
            acc <- acc+lazy_acc.Value // Here the accuracy calculation is triggered by accessing it through the Lazy property.

            if System.Single.IsNaN r' then failwith "Nan error"

            // No reverse pass on the test set — only the forward cost and accuracy are needed.
            tape.Clear 0 // Clears the tape without disposing it or the memory buffer. It allows reuse of memory for a 100% gain in speed.

        printfn "The validation cost at iteration %i is %f" i r'
        printfn "The accuracy is %i/10000" (int acc)
        let r2 = r'
        r' <- 0.0f
        yield r1,r2
    |]
let num_iters = 40
let learning_rate = 0.1f
// Runs the training. s is an array of (training cost, validation cost) tuples, one per epoch.
let s = train_mnist_sgd num_iters learning_rate layers

// Splits the cost pairs into two arrays for charting.
let l = [|for l,_ in s do yield l|]
let r = [|for _,r in s do yield r|]

// Plots the training and validation cost curves on a single chart.
(Chart.Combine [|Chart.Line l;Chart.Line r|]).ShowChart()

The above demonstrates what a small part of the actual program the neural net is. We set out to write f(Ax+b) and ended up writing about two thousand lines of code.
Having the accuracy calculation be lazy is a good idea as that ensures that its function does not get triggered on the training set. If not for this I would have to have separate function and I actually do not have the access to the intermediate variables inside the train_mnist_sgd loops.

For reference, I got about 98.3% after 40 iterations. I actually get 98.5% with just two layers.

This might seem like a limit, but it can be improved upon by pretraining the net with an autoencoder.


Leave a Reply

Fill in your details below or click an icon to log in: Logo

You are commenting using your account. Log Out /  Change )

Google photo

You are commenting using your Google account. Log Out /  Change )

Twitter picture

You are commenting using your Twitter account. Log Out /  Change )

Facebook photo

You are commenting using your Facebook account. Log Out /  Change )

Connecting to %s