// Embedded Reber grammar generator — a harder benchmark with long-term dependencies.
// Reference: http://christianherta.de/lehre/dataScience/machineLearning/neuralNetworks/reberGrammar.php

/// One state of the embedded Reber grammar automaton. The `a`-suffixed nodes form
/// the inner Reber graph entered via T, the `b`-suffixed nodes the copy entered via P;
/// the grammar has a long-range dependency because the closing symbol (T or P at
/// Node7a/Node7b) must match the opening branch taken at NodeS.
type reberNode =
    | NodeF  // Start state. Only outputs B.
    | NodeS  // Can only receive B. Outputs T or P.
    | Node0a // Can only receive T. Outputs B.
    | Node1a // Can only receive B. Outputs T or P.
    | Node2a // Can receive T or S. Outputs S or X.
    | Node3a // Can receive P or T. Outputs V or T.
    | Node4a // Can receive X or P. Outputs X or S.
    | Node5a // Can only receive V. Outputs P or V.
    | Node6a // Can receive S or V. Outputs E.
    | Node7a // Can only receive E. Outputs T.
    | Node0b // Can only receive P. Outputs B.
    | Node1b // Can only receive B. Outputs T or P.
    | Node2b // Can receive T or S. Outputs S or X.
    | Node3b // Can receive P or T. Outputs V or T.
    | Node4b // Can receive X or P. Outputs X or S.
    | Node5b // Can only receive V. Outputs P or V.
    | Node6b // Can receive S or V. Outputs E.
    | Node7b // Can only receive E. Outputs P.
    | Node8  // Can receive T or P. Outputs E. Terminal state.
// Shared RNG for the binary branch choices below. System.Random() is time-seeded,
// so the generated example set differs from run to run.
let rng = System.Random()

// One-hot encodings of the 7 grammar symbols, in the fixed index order B T P S X V E.
let b_string = [|1.0f;0.0f;0.0f;0.0f;0.0f;0.0f;0.0f|]
let t_string = [|0.0f;1.0f;0.0f;0.0f;0.0f;0.0f;0.0f|]
let p_string = [|0.0f;0.0f;1.0f;0.0f;0.0f;0.0f;0.0f|]
let s_string = [|0.0f;0.0f;0.0f;1.0f;0.0f;0.0f;0.0f|]
let x_string = [|0.0f;0.0f;0.0f;0.0f;1.0f;0.0f;0.0f|]
let v_string = [|0.0f;0.0f;0.0f;0.0f;0.0f;1.0f;0.0f|]
let e_string = [|0.0f;0.0f;0.0f;0.0f;0.0f;0.0f;1.0f|]

// Two-hot vectors used as prediction targets at binary branches: both successor
// symbols are grammatical, so the target marks both (e.g. t_p_string = "T or P next").
let t_p_string = [|0.0f;1.0f;1.0f;0.0f;0.0f;0.0f;0.0f|]
let t_v_string = [|0.0f;1.0f;0.0f;0.0f;0.0f;1.0f;0.0f|]
let s_x_string = [|0.0f;0.0f;0.0f;1.0f;1.0f;0.0f;0.0f|]
let p_v_string = [|0.0f;0.0f;1.0f;0.0f;0.0f;1.0f;0.0f|]

/// Performs one random walk over the embedded Reber graph.
/// str        - the symbol string accumulated so far.
/// list       - one-hot input vectors, most recent first (reversed at the end).
/// prediction - per-step vectors marking the symbol(s) that may legally follow,
///              most recent first (reversed at the end).
/// node       - current automaton state.
/// Returns (complete string, inputs in order, predictions in order) once Node8
/// emits the closing E.
let rec make_random_reber_string str list prediction node =
    match node with
    | NodeF -> make_random_reber_string "B" [b_string] [b_string] NodeS
    | NodeS ->
        // Outer branch: T enters the 'a' copy of the inner graph, P the 'b' copy.
        // The matching symbol must be re-emitted at Node7a/Node7b — the long-term dependency.
        let p = rng.NextDouble()
        if p > 0.5 then make_random_reber_string (str+"T") (t_string::list) (t_p_string::prediction) Node0a
        else make_random_reber_string (str+"P") (p_string::list) (t_p_string::prediction) Node0b
    | Node0a -> make_random_reber_string (str+"B") (b_string::list) (b_string::prediction) Node1a
    | Node1a ->
        let p = rng.NextDouble()
        if p > 0.5 then make_random_reber_string (str+"T") (t_string::list) (t_p_string::prediction) Node2a
        else make_random_reber_string (str+"P") (p_string::list) (t_p_string::prediction) Node3a
    | Node2a ->
        // S self-loops here; X moves on.
        let p = rng.NextDouble()
        if p > 0.5 then make_random_reber_string (str+"S") (s_string::list) (s_x_string::prediction) Node2a
        else make_random_reber_string (str+"X") (x_string::list) (s_x_string::prediction) Node4a
    | Node3a ->
        // T self-loops here; V moves on.
        let p = rng.NextDouble()
        if p > 0.5 then make_random_reber_string (str+"T") (t_string::list) (t_v_string::prediction) Node3a
        else make_random_reber_string (str+"V") (v_string::list) (t_v_string::prediction) Node5a
    | Node4a ->
        let p = rng.NextDouble()
        if p > 0.5 then make_random_reber_string (str+"X") (x_string::list) (s_x_string::prediction) Node3a
        else make_random_reber_string (str+"S") (s_string::list) (s_x_string::prediction) Node6a
    | Node5a ->
        let p = rng.NextDouble()
        if p > 0.5 then make_random_reber_string (str+"P") (p_string::list) (p_v_string::prediction) Node4a
        else make_random_reber_string (str+"V") (v_string::list) (p_v_string::prediction) Node6a
    | Node6a -> make_random_reber_string (str+"E") (e_string::list) (e_string::prediction) Node7a
    | Node7a -> make_random_reber_string (str+"T") (t_string::list) (t_string::prediction) Node8
    // The 'b' copy of the inner graph mirrors the 'a' copy exactly.
    | Node0b -> make_random_reber_string (str+"B") (b_string::list) (b_string::prediction) Node1b
    | Node1b ->
        let p = rng.NextDouble()
        if p > 0.5 then make_random_reber_string (str+"T") (t_string::list) (t_p_string::prediction) Node2b
        else make_random_reber_string (str+"P") (p_string::list) (t_p_string::prediction) Node3b
    | Node2b ->
        let p = rng.NextDouble()
        if p > 0.5 then make_random_reber_string (str+"S") (s_string::list) (s_x_string::prediction) Node2b
        else make_random_reber_string (str+"X") (x_string::list) (s_x_string::prediction) Node4b
    | Node3b ->
        let p = rng.NextDouble()
        if p > 0.5 then make_random_reber_string (str+"T") (t_string::list) (t_v_string::prediction) Node3b
        else make_random_reber_string (str+"V") (v_string::list) (t_v_string::prediction) Node5b
    | Node4b ->
        let p = rng.NextDouble()
        if p > 0.5 then make_random_reber_string (str+"X") (x_string::list) (s_x_string::prediction) Node3b
        else make_random_reber_string (str+"S") (s_string::list) (s_x_string::prediction) Node6b
    | Node5b ->
        let p = rng.NextDouble()
        if p > 0.5 then make_random_reber_string (str+"P") (p_string::list) (p_v_string::prediction) Node4b
        else make_random_reber_string (str+"V") (v_string::list) (p_v_string::prediction) Node6b
    | Node6b -> make_random_reber_string (str+"E") (e_string::list) (e_string::prediction) Node7b
    | Node7b -> make_random_reber_string (str+"P") (p_string::list) (p_string::prediction) Node8
    // Terminal: append the closing E and put the accumulated lists back in left-to-right order.
    | Node8 -> (str+"E"), ((e_string::list) |> List.rev), ((e_string::prediction) |> List.rev)

open System.Collections.Generic

/// Generates num_examples distinct embedded Reber examples.
/// Uniqueness relies on HashSet.Add returning false for duplicates; the counter
/// only advances when a genuinely new (string, inputs, predictions) triple is added.
let make_reber_set num_examples =
    let mutable c = 0
    let reber_set = new HashSet<string * float32 [] list * float32 [] list>()
    while c < num_examples do
        if reber_set.Add (make_random_reber_string "" [] [] NodeF) then c <- c+1
    reber_set

The above is the string generator. In the examples above, make_reber_set is called to generate 3000 unique random strings, from which those of length 20 are then taken for the training set. They usually came out to around 320 or so examples.

Here are the functions for processing that data to the GPU:

// Spiral reverse AD example. Used for testing.
// Embedded Reber grammar LSTM example.
#load "../Spiral Library/ad_utils_spiral_v1.fsx"
open Ad_utils_spiral_v1
#r "../packages/FSharp.Charting.0.90.13/lib/net40/FSharp.Charting.dll"
#r "System.Windows.Forms.DataVisualization.dll"
open FSharp.Charting
#load "embedded_reber.fsx"
open Embedded_reber

// Pool of unique embedded Reber examples from which train/validation batches are drawn.
let reber_set = make_reber_set 3000

/// Packs every example of exactly target_length symbols into per-timestep constant
/// DM nodes of shape 7 x batch_size (one column per example, one row per symbol).
/// Returns (inputs for steps 0..target_length-1, targets for steps 1..target_length-1).
let make_data_from_set target_length =
    // NOTE(review): named after the length-20 training set, but holds whichever length is requested.
    let twenties = reber_set |> Seq.filter (fun (a,b,c) -> a.Length = target_length) |> Seq.toArray
    let batch_size = (twenties |> Seq.length)
    let d_training_data =
        [|
        for i=0 to target_length-1 do
            // Flatten the i-th one-hot input of every example into one contiguous buffer
            // (7 floats per example, example-major), then wrap it as a 7 x batch_size constant.
            let input =
                [|
                for k=0 to batch_size-1 do
                    let example = twenties.[k]
                    let s, input, output = example
                    yield input.[i]
                |] |> Array.concat
            yield DM.makeConstantNode(7,batch_size,input)
        |]
    let d_target_data =
        [|
        for i=1 to target_length-1 do // The targets are one less than the inputs. This has the effect of shifting them to the left.
            let output =
                [|
                for k=0 to batch_size-1 do
                    let example = twenties.[k]
                    let s, input, output = example
                    yield output.[i]
                |] |> Array.concat
            yield DM.makeConstantNode(7,batch_size,output)
        |]
    d_training_data, d_target_data

Here is the training function:

/// Trains the one-layer LSTM + feedforward readout on the embedded Reber task.
/// Yields a (training cost, validation cost) pair per iteration; stops early if
/// the training cost goes NaN. Gradients are clipped by clip_coef inside
/// add_gradients_to_weights.
let lstm_embedded_reber_train num_iters learning_rate (data: DM[]) (targets: DM[]) (data_v: DM[]) (targets_v: DM[]) clip_coef (l1: LSTMLayer) (l2: FeedforwardLayer) =
    [|
    let l1 = l1
    let l2 = l2
    // All trainable weight matrices of both layers, flattened for the update step.
    let base_nodes = [|l1.ToArray;l2.ToArray|] |> Array.concat
    // Builds the tape for one full unrolled pass on branch i and returns the mean cost node.
    // Step 0 uses runLayerNoH (no previous hidden/cell state); later steps thread (a, c) through.
    let training_loop (data: DM[]) (targets: DM[]) i =
        tape.Select i
        let costs =
            [|
            let mutable a, c = l1.runLayerNoH data.[0]
            let b = l2.runLayer a
            let r = squared_error_cost targets.[0] b
            yield r
            // data.Length-2 because targets has one fewer element than data (see make_data_from_set).
            for i=1 to data.Length-2 do
                let a',c' = l1.runLayer data.[i] a c
                a <- a'; c <- c'
                let b = l2.runLayer a
                let r = squared_error_cost targets.[i] b
                yield r
            |]
        scale (1.0f/float32 (costs.Length-1)) (sum_scalars costs)
    let ts = (data.Length-1)
    // NOTE(review): the trailing "//(data_v.Length-1)" is a comment, so vs = ts —
    // validation and training deliberately share one tape branch (see below).
    let vs = ts//(data_v.Length-1)
    // It is possible to put the validation set on a separate branch and run them that way. In the earlier versions of Spiral before efficient tape rebuilding that is how it was done for speed.
    // In such case each branch would also have its memory buffer. Keep it all on one branch for maximum reuse.
    let mutable r' = 0.0f
    let mutable i = 1
    while i <= num_iters && System.Single.IsNaN r' = false do
        // Validation pass: forward only, then clear the branch so training can reuse its memory.
        let rv = training_loop data_v targets_v vs
        tape.forwardpropTape vs
        printfn "The validation cost is %f at iteration %i" !rv.r.P i
        tape.Clear vs
        // Training pass: forward, then reverse, then SGD step.
        let r = training_loop data targets ts
        tape.forwardpropTape ts
        printfn "The training cost is %f at iteration %i" !r.r.P i
        yield !r.r.P, !rv.r.P
        tape.resetTapeAdjoint -1 // Resets base adjoints
        tape.resetTapeAdjoint ts // Resets the adjoints for the training select
        r.r.A := 1.0f // Seed the top node's adjoint before backprop.
        tape.reversepropTape ts // Runs the reverse step.
        add_gradients_to_weights base_nodes learning_rate clip_coef
        tape.Clear ts
        //tape.Clear vs
        i <- i+1
        r' <- !r.r.P
    |]

Build the tape, run it forward by calling forwardpropTape(), reset the adjoints, run it backwards, this is similar to the previous examples.

In the cost function as there are multiple outputs for each of the time steps, we add them up together to make the result of the function a R^1 scalar.

Here is the last bit of code in the example file that makes the LSTM layers:

// Length-20 strings for training, length-30 for validation (tests generalization to longer sequences).
let d_training_data_20, d_target_data_20 = make_data_from_set 20
let d_training_data_validation, d_target_data_validation = make_data_from_set 30
let hidden_size = 64
// LSTM over the 7-symbol one-hot input; readout maps hidden state back to 7 symbol scores.
let l1 = LSTMLayer.createRandomLSTMLayer hidden_size 7 tanh_ tanh_
let l2 = FeedforwardLayer.createRandomLayer 7 hidden_size (steep_sigmoid 3.0f)
// Add the base nodes to the tape for easier resetting and disposal.
tape.Select -1
// NOTE(review): this binding is unit — Array.iter consumes the array; the name is a leftover.
let base_nodes = [|l1.ToArray;l2.ToArray|] |> Array.concat |> Array.iter (fun x -> tape.Add x)
let learning_rate = 5.0f
// This iteration is to warm up the library. It compiles all the lazy Cuda modules.
// (Its returned cost array is discarded.)
lstm_embedded_reber_train 1 learning_rate d_training_data_20 d_target_data_20 d_training_data_validation d_target_data_validation 1.0f l1 l2
#time
let s = [| yield lstm_embedded_reber_train 99 learning_rate d_training_data_20 d_target_data_20 d_training_data_validation d_target_data_validation 1.0f l1 l2 |] |> Array.concat
#time
// On the GTX 970, I get 3-4s depending on how hot the GPU is.
// Split the (train, validation) cost pairs into two series for charting.
let l = [|for l,_ in s do yield l|]
let r = [|for _,r in s do yield r|]
(Chart.Combine [|Chart.Line l;Chart.Line r|]).ShowChart()
tape.DisposeAll() // Disposes all the elements of all the tapes and then clears them including the memory buffer.

The main difference from the previous example is that the nodes are added directly into the tape on the -1 branch (the default is 0.) That allows us to call resetAdjoints on that branch.

I did not find it much of a time saver, so I omitted this feature when doing the feedforward net examples.

Here is the LSTM layer main body. There is also a GRU in the library as well:

/// An LSTM layer with peephole connections (P_*) from the cell state to the gates.
/// Naming: W_* multiply the input x, U_* the previous hidden state y, P_* the cell
/// state c; b_* are biases. block_input_a/block_output_a are the activations for
/// the block input and the cell output (typically tanh).
type LSTMLayer =
    {W_z:DM // Input weight matrix for the block input
     U_z:DM // Recurrent weight matrix for the block input
     b_z:DM // Bias vector for the block input
     W_i:DM // Input weight matrix for the input gate
     U_i:DM // Recurrent weight matrix for the input gate
     b_i:DM // Bias vector for the input gate
     P_i:DM // Peephole weight matrix for the input gate
     W_f:DM // Input weight matrix for the forget gate
     U_f:DM // Recurrent weight matrix for the forget gate
     b_f:DM // Bias vector for the forget gate
     P_f:DM // Peephole weight matrix for the forget gate
     W_o:DM // Input weight matrix for the output gate
     U_o:DM // Recurrent weight matrix for the output gate
     b_o:DM // Bias vector for the output gate
     P_o:DM // Peephole weight matrix for the output gate
     block_input_a : DM -> DM  // Activation for the block input.
     block_output_a : DM -> DM // Activation applied to the cell state at the output.
    } with

    /// Returns all the weights in an array. Order must match fromArray.
    member l.ToArray =
        [|l.W_z;l.U_z;l.b_z;l.W_i;l.U_i;l.b_i;l.P_i;l.W_f;l.U_f;l.b_f;l.P_f;l.W_o;l.U_o;l.b_o;l.P_o|]

    /// Rebuilds a layer from an array of weights in ToArray order, e.g. to share
    /// weights between layer instances.
    static member fromArray (a: DM[]) block_input_a block_output_a =
        {
        W_z = a.[0]
        U_z = a.[1]
        b_z = a.[2]
        W_i = a.[3]
        U_i = a.[4]
        b_i = a.[5]
        P_i = a.[6]
        W_f = a.[7]
        U_f = a.[8]
        b_f = a.[9]
        P_f = a.[10]
        W_o = a.[11]
        U_o = a.[12]
        b_o = a.[13]
        P_o = a.[14]
        block_input_a = block_input_a
        block_output_a = block_output_a
        }

    /// Creates a layer with uniformly random weights. W_* are hidden_size x input_size,
    /// U_*/P_* are hidden_size x hidden_size, b_* are hidden_size x 1.
    static member createRandomLSTMLayer hidden_size input_size block_input_a block_output_a =
        {
        W_z = DM.makeUniformRandomNode(hidden_size, input_size)
        U_z = DM.makeUniformRandomNode(hidden_size, hidden_size)
        b_z = DM.makeUniformRandomNode(hidden_size, 1)
        W_i = DM.makeUniformRandomNode(hidden_size, input_size)
        U_i = DM.makeUniformRandomNode(hidden_size, hidden_size)
        b_i = DM.makeUniformRandomNode(hidden_size, 1)
        P_i = DM.makeUniformRandomNode(hidden_size, hidden_size)
        W_f = DM.makeUniformRandomNode(hidden_size, input_size)
        U_f = DM.makeUniformRandomNode(hidden_size, hidden_size)
        b_f = DM.makeUniformRandomNode(hidden_size, 1)
        P_f = DM.makeUniformRandomNode(hidden_size, hidden_size)
        W_o = DM.makeUniformRandomNode(hidden_size, input_size)
        U_o = DM.makeUniformRandomNode(hidden_size, hidden_size)
        b_o = DM.makeUniformRandomNode(hidden_size, 1)
        P_o = DM.makeUniformRandomNode(hidden_size, hidden_size)
        block_input_a = block_input_a
        block_output_a = block_output_a
        }

    /// Full LSTM step: x is the input, y the previous hidden state, c the previous
    /// cell state. Returns (new hidden state, new cell state). Note the output gate
    /// peeks at the NEW cell state c'.
    member l.runLayer (x:DM) (y:DM) (c:DM) =
        let block_input = linear_layer [|l.W_z,x;l.U_z,y|] [||] (Some l.b_z) |> l.block_input_a
        let input_gate = linear_layer [|l.W_i,x;l.U_i,y;l.P_i,c|] [||] (Some l.b_i) |> sigmoid
        let forget_gate = linear_layer [|l.W_f,x;l.U_f,y;l.P_f,c|] [||] (Some l.b_f) |> sigmoid
        // c' = block_input * input_gate + c * forget_gate (elementwise products via the second argument).
        let c' = linear_layer [||] [|block_input,input_gate;c,forget_gate|] None
        let output_gate = linear_layer [|l.W_o,x;l.U_o,y;l.P_o,c'|] [||] (Some l.b_o) |> sigmoid
        hadmult (l.block_output_a c') output_gate, c'

    /// First-timestep variant: no previous hidden/cell state, so all U_* and
    /// previous-c terms are dropped.
    // NOTE(review): forget_gate is computed but unused here — there is no previous
    // cell to forget; presumably kept for symmetry.
    member l.runLayerNoH (x:DM) =
        let block_input = linear_layer [|l.W_z,x|] [||] (Some l.b_z) |> l.block_input_a
        let input_gate = linear_layer [|l.W_i,x|] [||] (Some l.b_i) |> sigmoid
        let forget_gate = linear_layer [|l.W_f,x|] [||] (Some l.b_f) |> sigmoid
        let c' = hadmult block_input input_gate
        let output_gate = linear_layer [|l.W_o,x;l.P_o,c'|] [||] (Some l.b_o) |> sigmoid
        hadmult (l.block_output_a c') output_gate, c'

    /// No-input variant: runs the recurrence from (y, c) alone, dropping all W_* terms.
    member l.runLayerNoI (y:DM) (c:DM) =
        let block_input = linear_layer [|l.U_z,y|] [||] (Some l.b_z) |> l.block_input_a
        let input_gate = linear_layer [|l.U_i,y;l.P_i,c|] [||] (Some l.b_i) |> sigmoid
        let forget_gate = linear_layer [|l.U_f,y;l.P_f,c|] [||] (Some l.b_f) |> sigmoid
        let c' = linear_layer [||] [|block_input,input_gate;c,forget_gate|] None
        let output_gate = linear_layer [|l.U_o,y;l.P_o,c'|] [||] (Some l.b_o) |> sigmoid
        hadmult (l.block_output_a c') output_gate, c'

There is a lot of boilerplate, but the sheer effort saved over doing this by hand in the runLayer functions is astounding.

Simply being able to write runLayer and have the confidence that the program will do what the user intended it to is the reason for the library’s existence.

I actually tried making a LSTM and a GRU a few times and failed because the code became such a bloated mess. Then I wrote this library. The failures are what motivated me thus far.

I understand that this tutorial is pretty code dump heavy and intended for a niche audience. If you found it helpful then I am glad. So far there is a dearth of good reverse AD Windows libraries that work on the GPU. I hope with this I’ve filled that gap a little.

It could not have been written nearly as easily in any other language as in F#.

Let me talk about languages and what great benefit they bring to their users and what a great gap there is between ‘possible’ and ‘it will get done.’

In theory this library could have been written in assembly, but not so much in reality.

It stands to a reason that if we imagine looking back to the point where we are now from some distant future, that the tools we use, hardware certainly, but especially software here are lacking.

Currently, the Cuda libraries as they exist today are a straitjacket, and those kernels that I have dumped, while entirely necessary, are an embarrassment to have had to write by hand. Kernels of that sort should have been in the Nvidia library, and they should be open sourced. I cannot imagine that in the future one won’t simply be able to fuse such kernels together in a functional fashion.

Here at the start of 2016, the ML spring has come and I find myself wanting for a good library and writing GPU select routines by hand. I am also constrained by the cuBLAS library and no doubt other researchers are too.

Take for example the gemm function (general matrix-matrix multiply). It is fine if one wants to take the dot product (a*b) of all the vectors in one matrix with all the vectors in the other, but what if, for example, one wants to do (a-b)^2 instead?

Wouldn’t it be interesting to stack a layer of k-means after k-means, like with the WTA autoencoder? What is so special about a*b anyway?

GPU algorithms especially could benefit greatly from increased meta-programming capabilities. GPU kernels while very simplistic, are incredibly intricate and hard to get right. Inserting strings like “(a-b)*(a-b);” is merely the first step. Fusing a series of gemm calls to a single target without the scheduler turning on the red lights and having them switch to using atomics would be another good step.

Without the basics firmly in place nothing else will proceed.

]]>// Encoder stack: three 1024-wide WTA (winner-take-all, k=6) layers over 784 (MNIST) inputs.
let l1 = FeedforwardLayer.createRandomLayer 1024 784 (WTA 6)
let l2 = FeedforwardLayer.createRandomLayer 1024 1024 (WTA 6)
let l3 = FeedforwardLayer.createRandomLayer 1024 1024 (WTA 6)
// Decoder layers tied to the encoders.
let l4 = InverseFeedforwardLayer.createRandomLayer l3 (fun x -> x) // No nonlinearity at the end. Linearities in the final layer cause the individual layers to overfit too badly.
let l5 = InverseFeedforwardLayer.createRandomLayer l2 (fun x -> x)
let l6 = InverseFeedforwardLayer.createRandomLayer l1 (fun x -> x)
let l1' = FeedforwardLayer.fromArray l1.ToArray relu // Makes supervised layers from the same weights.
let l2' = FeedforwardLayer.fromArray l2.ToArray relu
let l3' = FeedforwardLayer.fromArray l3.ToArray relu
// 10-way output layer for the supervised fine-tuning stage.
let l_sig = FeedforwardLayer.createRandomLayer 10 1024 (clipped_steep_sigmoid 3.0f)
let layers_deep_autoencoder = [|[|l1;l2;l3|] |> Array.map (fun x -> x :> IFeedforwardLayer);[|l4;l5;l6|] |> Array.map (fun x -> x :> IFeedforwardLayer);|] |> Array.concat // Upcasting to the base type. The deep autoencoder is not used in this example, but only serves an illustration here.
// Per-stage layer groupings for layerwise pretraining: encoder prefix + matching decoder.
let layers_1 = [|[|l1|] |> Array.map (fun x -> x :> IFeedforwardLayer);[|l6|] |> Array.map (fun x -> x :> IFeedforwardLayer);|] |> Array.concat // Upcasting to the base type. The correct functions will get called with dynamic dispatch.
let layers_2 = [|[|l1;l2|] |> Array.map (fun x -> x :> IFeedforwardLayer);[|l5|] |> Array.map (fun x -> x :> IFeedforwardLayer);|] |> Array.concat // Upcasting to the base type. The correct functions will get called with dynamic dispatch.
let layers_3 = [|[|l1;l2;l3|] |> Array.map (fun x -> x :> IFeedforwardLayer);[|l4|] |> Array.map (fun x -> x :> IFeedforwardLayer);|] |> Array.concat // Upcasting to the base type. The correct functions will get called with dynamic dispatch.
// The full supervised net built from the pretrained weights.
let layers_fine_tune = [|l1';l2';l3';l_sig|] |> Array.map (fun x -> x :> IFeedforwardLayer)

First we create all the layers individually and then we create arrays of such layers grouped together, so we can later pass them into the training function.

This is a three layer net with a small sigmoid output layer on top that we are training.

First we train each layer individually and then we do the whole net at the end.

// These loops are closures. They are not called directly, but passed as parameters into
// the training function. Each returns (cost node, optional lazy accuracy).
// This one is for the first autoencoder.
let loop_1 data targets =
    let outputs = Array.scan(fun state (layer:IFeedforwardLayer) -> (layer.runLayer state)) data layers_1 // Scan is like fold except it returns the intermediates.
    // Reconstruction target is the input to the decoder's mirror point: third from the end.
    let inp = outputs.[outputs.Length-3]
    let out = outputs.[outputs.Length-1]
    squared_error_cost inp out, None
// The targets do nothing in autoencoders, they are here so the type for the supervised net squares out. This one is for the second.
let loop_2 data targets =
    let l,r = layers_2 |> Array.splitAt 1
    let outputs = Array.scan(fun state (layer:IFeedforwardLayer) -> (layer.runLayer state)) data l // Scan is like fold except it returns the intermediates.
    tape.Add(BlockReverse()) // This blocks the reverse pass from running past this point. It is so the gradients get blocked and only the top two layers get trained.
    let outputs = Array.scan(fun state (layer:IFeedforwardLayer) -> (layer.runLayer state)) (outputs |> Array.last) r // Scan is like fold except it returns the intermediates.
    let inp = outputs.[outputs.Length-3]
    let out = outputs.[outputs.Length-1]
    squared_error_cost inp out, None
// Third autoencoder: frozen prefix is the first two layers.
let loop_3 data targets =
    let l,r = layers_3 |> Array.splitAt 2
    let outputs = Array.scan(fun state (layer:IFeedforwardLayer) -> (layer.runLayer state)) data l // Scan is like fold except it returns the intermediates.
    tape.Add(BlockReverse()) // This blocks the reverse pass from running past this point. It is so the gradients get blocked and only the top two layers get trained.
    let outputs = Array.scan(fun state (layer:IFeedforwardLayer) -> (layer.runLayer state)) (outputs |> Array.last) r // Scan is like fold except it returns the intermediates.
    let inp = outputs.[outputs.Length-3]
    let out = outputs.[outputs.Length-1]
    squared_error_cost inp out, None
// This is not for the autoencoder, but for the final logistic regression layer.
// We train it separately first so it does not disrupt the pretrained weights below it.
let loop_3b data targets =
    let l,r = layers_fine_tune |> Array.splitAt 3
    let outputs = Array.scan(fun state (layer:IFeedforwardLayer) -> (layer.runLayer state)) data l // Scan is like fold except it returns the intermediates.
    tape.Add(BlockReverse()) // This blocks the reverse pass from running past this point. It is so the gradients get blocked and only the top two layers get trained.
    let outputs = Array.scan(fun state (layer:IFeedforwardLayer) -> (layer.runLayer state)) (outputs |> Array.last) r // Scan is like fold except it returns the intermediates.
    let out = outputs.[outputs.Length-1]
    squared_error_cost targets out, None
// The full net with the pretrained weights; supervised, so it also reports accuracy lazily.
let loop_fine_tune data targets =
    let outputs = Array.fold(fun state (layer:IFeedforwardLayer) -> (layer.runLayer state)) data layers_fine_tune
    cross_entropy_cost targets outputs, Some (lazy get_accuracy targets.r.P outputs.r.P)
// It might be possible to get more speed by not repeating needless calculations in the lower layers, but that would require switching
// branches and some modifying the training loop, but this is decent enough.
// Doing it like this is in fact the most efficient from a memory standpoint.

In the training functions above, the fold is replaced with scan because for the autoencoders the second to last output is the target, so having the intermediates at hand is necessary.

In the training loop for the second and the third layer, it is necessary to block the gradients from flowing backwards and optimizing the whole net. Unfortunately calling tape.Clear() here would not be recommended as that would reuse the memory used in the low layers in the uppers and corrupt it.

Instead the better choice is to create a separate type and modify the reverse call so it stops running if it encounters it.

An alternative to the above would be to switch branches in the tape, but that would be more complex. It would require us to write not only separate loop closures, but separate training functions as well.

The training function is much the same as before:

/// Plain SGD training over the MNIST minibatches dtrain/dtest (defined elsewhere in the file).
/// training_loop builds the tape and returns (cost node, optional lazy accuracy);
/// yields one (training cost, validation cost) pair per iteration.
let train_mnist_sgd num_iters learning_rate training_loop (layers: IFeedforwardLayer[]) =
    [|
    let mutable r' = 0.0f // Running cost accumulator, reused for both train and test phases.
    let base_nodes = layers |> Array.map (fun x -> x.ToArray) |> Array.concat // Stores all the base nodes of the layer so they can later be reset.
    for i=1 to num_iters do
        for x in dtrain do
            let data, target = x
            let (r:Df), _ = training_loop data target // Builds the tape.
            tape.forwardpropTape 0 // Calculates the forward values. Triggers the ff() closures.
            r' <- r' + (!r.r.P/ float32 dtrain.Length) // Adds the cost to the accumulator.
            if System.Single.IsNaN r' then failwith "Nan error"
            for x in base_nodes do x.r.A.setZero() // Resets the base adjoints
            tape.resetTapeAdjoint 0 // Resets the adjoints for the training select
            r.r.A := 1.0f // Pushes 1.0f from the top node
            tape.reversepropTape 0 // Runs the reverse pass, filling in the adjoints.
            add_gradients_to_weights' base_nodes learning_rate // The optimization step
            tape.Clear 0 // Clears the tape without disposing it or the memory buffer. It allows reuse of memory for a 100% gain in speed for the simple recurrent and feedforward case.
        printfn "The training cost at iteration %i is %f" i r'
        let r1 = r'
        r' <- 0.0f
        let mutable acc = 0.0f
        // Evaluation phase: forward pass only, no weight updates.
        for x in dtest do
            let data, target = x
            let r,lazy_acc = training_loop data target // Builds the tape.
            tape.forwardpropTape 0 // Calculates the forward values. Triggers the ff() closures.
            r' <- r' + (!r.r.P/ float32 dtest.Length) // Adds the cost to the accumulator.
            match lazy_acc with
            | Some (lazy_acc: Lazy<floatType>) -> acc <- acc+lazy_acc.Value // Here the accuracy calculation is triggered by accessing it through the Lazy property.
            | None -> ()
            if System.Single.IsNaN r' then failwith "Nan error"
            tape.Clear 0 // Clears the tape without disposing it or the memory buffer. It allows reuse of memory for a 100% gain in speed.
        printfn "The validation cost at iteration %i is %f" i r'
        if acc <> 0.0f then printfn "The accuracy is %i/10000" (int acc)
        let r2 = r'
        r' <- 0.0f
        yield r1,r2
    |]

Here the accuracy calculation is an optional lazy type. The compiler needed some type annotations to figure that out.

The only thing that remains is to use the above to run the process. It is a five step regiment:

// For the autoencoders it seems 0.1f is a decent learning rate.
// The autoencoders blow up with 0.2f.
// The lower learning rate in the final layer does not help, in fact the higher does.
// My record here is 99.1% after a few hours of playing around.
// Might be possible to do even better with max norm normalization.
// This layerwise pretraining is an old technique by now.
// Here is the more up to date research on this old idea:
// GradNets - http://arxiv.org/abs/1511.06827
// Net2Net - http://arxiv.org/abs/1511.05641
// Five-stage regimen: three layerwise autoencoders, then the output layer alone,
// then full fine-tuning. Each tuple is (loop closure, its layers, iterations, learning rate).
// NOTE(review): the mutable counter loop_iter is declared elsewhere in the file.
for loop,layers,num_iters,learning_rate in [|loop_1,layers_1,10,0.1f;loop_2,layers_2,10,0.1f;loop_3,layers_3,10,0.1f;loop_3b,layers_fine_tune,10,0.1f;loop_fine_tune,layers_fine_tune,30,0.2f|] do
    printfn "Starting training loop %i..." loop_iter
    let s = train_mnist_sgd num_iters learning_rate loop layers
    // Split the (train, validation) cost pairs into two series (charting left disabled).
    let l = [|for l,_ in s do yield l|]
    let r = [|for _,r in s do yield r|]
    //(Chart.Combine [|Chart.Line l;Chart.Line r|]).ShowChart() |> ignore
    loop_iter <- loop_iter+1

All the closures, their layers, the number of iterations to run them and the learning rate are packed into an array and iterated over. The layers need to be packed as well because the adjoints of the base nodes need to be extracted so they can be reset.

What pretraining does is essentially gradually transform the network so it becomes more performant. The view that one is pretraining an already complete net is a misconception. A far better way of viewing it is as a method of transforming a net into a different one.

The way neural net training is currently done, the actual trained nets are very disposable.

That is not that much of a problem when one is playing around with detecting digits on Mnist, but as training times become longer, knowledge transfer becomes more valuable in turn.

In addition to that, the above experiment opens the door to doing evolutionary search over neural net architectures in a more granular fashion, possibly with checkpointing.

There will no doubt be major discoveries coming from this direction in the nearby future as making the transformation a part of the search process greatly eases up difficulty of the problem.

]]>

A few months ago, I spent an enormous amount of time implementing the k-sparse autoencoder as practice for machine learning. In fact, for this new year, I wanted to make this the subject of the autoencoder tutorial, but the method in the paper suffers from some drawbacks. It was fun playing around, but to get the k-sparse autoencoder working properly, one needs to gradually reduce the k-value to its optimal point which is needlessly time consuming.

The k-sparse function is interesting given its applications to sparsity. It is pretty much interchangeable with Relu.

For example, if one actually trains a net with Relu units with a large learning rate and high momentum and uses the k-sparse units during test time, it is possible to get 98% or more which was surprising to me at the time and taught me that a high learning rate combined with momentum has natural regularization properties.

I would not seriously use it, unfortunately, as it requires a select operation on the columns, which requires iterating over them, which in turn requires storing them in local memory. When the columns are relatively medium sized, like 1024, that is fine, but any more than that would put strain on the available registers on the GPU. The columns being large also puts pressure on the algorithm itself. It has to be more complex in order to accommodate the changed nature of the problem.

When the columns are small like 128, it allows one to use a relatively simple select like iteratively finding the next lowest max starting from positive infinity.

This is one of the big themes of parallel programming, in that changing the size of the collection also changes the structure of the problem.

The WTA autoencoder is similar to the k-sparse one in the fully connected case, except there is a transpose operation applied before the select.

// ---------------------------------------------------------------------------
// DeviceMaxSelectColumnActivationModule
// Compiles, at runtime via NVRTC, a CUDA kernel implementing k-max selection
// ("winner-take-all"): all but the k largest entries of each column are set
// to zero. The `.A` overloads select along columns directly from global
// memory; the `.AT` overloads first stage a 32-row tile of the matrix in
// shared memory (a transpose-style read) and select along rows.
// column_size is fixed at construction so NUM_VARS / NUM_COLS_32 / ROW_ITERS
// are compile-time constants and the kernel's loops can be unrolled.
// NOTE(review): relies on `divup`, `FloatTypeCpp`, `ctx`, `str` and `dMatrix`
// defined elsewhere in the library — not visible here; verify against the
// full source. The kernel string below is runtime code, reproduced verbatim.
// ---------------------------------------------------------------------------
/// o <- max_k(tranpose_if(x)) /// Sets all except the k number of max of a column to zero. /// Unlike for the other modules, to ensure it works swiftly, the column size and the number of iterations is fixed so the compiler can unroll the loops. type DeviceMaxSelectColumnActivationModule(column_size: int) = let block_size_x = 32 // This should not be changed for this module. Small block sizes such as these are much more efficient on Maxwell. let block_size_y = 16 // This should be a mutliple of 2. On the GTX 970, 16 seems to work best, although 4,8,16,32 are quite close. let kernel_code = " //Kernel code: extern \"C\" { #define INIT_MIN __int_as_float(0xff800000) // The constant init for the reduce operations. This is the float negative infinity. #define INIT_MAX __int_as_float(0x7f800000) // The constant init for the reduce operations. This is the float positive infinity. #define NUM_VARS "+string (divup column_size 32)+" // This is to ensure that the number of variables is encoded as a constant. #define NUM_COLS_32 "+string ((divup column_size 32)*32)+" // The column size has to be static for the shared memory array. This also to ensures it is a multiple of 32. #define BLOCK_SIZE_Y "+string block_size_y+" // I am not sure whether gridDim.x is a compile time constant. It was not in Alea. #define ROW_ITERS "+string (divup 32 block_size_y)+" // The number of iterations that should be done across the rows in shared memory. typedef "+FloatTypeCpp+" floatType; // The max reduce version. __device__ inline floatType warpReduce(floatType value){ #pragma unroll for (int i=1; i<32; i*=2) { floatType tmp = __shfl_xor(value, i); value = (tmp > value) ? 
tmp : value; } return value; } // Device code __global__ void Kernel(const floatType* A, floatType* O, const int num_rows, const int num_cols, const int k, const int transpose) { if (transpose) { __shared__ floatType ar[32][NUM_COLS_32]; // The reason why the second dimension is a mutliple of 32 is so that in the warp reduction phase, there are no inactive threads. // Inactive threads during the warp shuffle give undefined values. One has to go an extra mile to ensure that they are defined. { const int row = threadIdx.x; const int row_idx = row+blockIdx.x*32; #pragma unroll // Unroll the loops for performance, though it probably will not work on this part as the col = threadIdx.y is non static. for (int col = threadIdx.y; col < NUM_COLS_32; col += BLOCK_SIZE_Y) { // Stores everything into shared memory first by reading from the global in a contiguous manner. ar[row][col] = (row_idx < num_rows && col < num_cols) ? A[row_idx+col*num_rows] : INIT_MIN; } } __syncthreads(); floatType vars[NUM_VARS]; // The local array size needs to be made constant so the variables there get stored into registers instead of being spilled into global memory. #pragma unroll // Unroll the loop for performance. All the variables in the loop conditional are static. for (int row_iter=0; row_iter < ROW_ITERS; row_iter++) { // This loop does max selection on the rows in shared memory. // Unlike for the global memory, not only is shared memory much faster, but it does not need to be read // contiguosly to hit peak performance. // From shared memory, I put it into the local register memory and operate on that for even further speed gains. // This transposed kernel is adapted from the one in the other branch by adding the shared memory steps. floatType upper_bound = INIT_MAX; // This is the positive infinity for floats. floatType lower_bound = INIT_MIN; // This is the negative infinity for floats. #pragma unroll // Loop unrolling for improved performance. 
For this to work the number of unrolls has to be defined as a constant. for (int i=0; i < NUM_VARS;i++) { const int col = threadIdx.x + i*32; const int row_idx = threadIdx.y + row_iter*BLOCK_SIZE_Y; if (row_idx < 32) vars[i] = ar[row_idx][col]; } for (int iters=1; iters <= k; iters++){ #pragma unroll for (int i=0; i < NUM_VARS;i++) { if (vars[i] < upper_bound && lower_bound < vars[i]) lower_bound = vars[i]; } upper_bound = warpReduce(lower_bound); // Lowers the upper bound. lower_bound = INIT_MIN; } #pragma unroll for (int i=0; i < NUM_VARS;i++) { const int col = threadIdx.x + i*32; const int row_idx = threadIdx.y + row_iter*BLOCK_SIZE_Y; if (row_idx < 32) ar[row_idx][col] = (vars[i] < upper_bound) ? 0.0f : vars[i]; } } __syncthreads(); { const int row = threadIdx.x; const int row_idx = row+blockIdx.x*32; #pragma unroll for (int col = threadIdx.y; col < NUM_COLS_32; col += BLOCK_SIZE_Y) { if (row_idx < num_rows && col < num_cols) O[row_idx+col*num_rows] = ar[row][col]; } } } else { // Does not need a to do a tranpose so it reads directly off global memory. //int row = threadIdx.x; //const int col = blockIdx.x; const int col_idx = blockIdx.x*num_rows; floatType upper_bound = INIT_MAX; // This is the positive infinity for floats. floatType lower_bound = INIT_MIN; // This is the negative infinity for floats. floatType vars[NUM_VARS]; // The local array size needs to be made constant so the variables there get stored into registers instead of being spilled into global memory. #pragma unroll // Loop unrolling for improved performance. For this to work the number of unrolls has to be defined as a constant. for (int i=0; i < NUM_VARS;i++) { const int row = threadIdx.x + i*32; const int idx = row+col_idx; vars[i] = (row < num_rows) ? 
A[idx] : INIT_MIN; } for (int iters=1; iters <= k; iters++){ #pragma unroll for (int i=0; i < NUM_VARS;i++) { const int row = threadIdx.x + i*32; if (vars[i] < upper_bound && lower_bound < vars[i]) lower_bound = vars[i]; } upper_bound = warpReduce(lower_bound); // Lowers the upper bound. lower_bound = INIT_MIN; } #pragma unroll for (int i=0; i < NUM_VARS;i++) { const int row = threadIdx.x + i*32; const int idx = row+col_idx; if (row < num_rows){ O[idx] = (vars[i] < upper_bound) ? 0.0f : vars[i]; } } } } } " let k = new ManagedCuda.NVRTC.CudaRuntimeCompiler(kernel_code,"Kernel") do try k.Compile([|"-arch=compute_30"|]) with | NVRTCException as x -> printfn "%s" (k.GetLogAsString()) reraise() let kernel = ctx.LoadKernelPTX(k.GetPTX(),"Kernel") member t.AT(x: CudaDeviceVariable<floatType>, o: CudaDeviceVariable<floatType>, m:int, n: int, k: int) = kernel.GridDimensions <- dim3(divup m 32) kernel.BlockDimensions <- dim3(block_size_x,block_size_y) kernel.RunAsync(str.Stream,x.DevicePointer,o.DevicePointer,m,n,k,1) |> ignore /// Does a transpose in shared memory first. member t.AT(x: dMatrix, k, o: dMatrix) = if x.rc <> o.rc then failwith "x.rc <> o.rc" if x.num_cols <> column_size then failwith "Wrong num_cols." t.AT(x.dArray,o.dArray,x.num_rows,x.num_cols,k) /// Does a transpose in shared memory first. member t.AT(x: dMatrix, k) = let o = dMatrix.create(x.num_rows,x.num_cols) if x.num_cols <> column_size then failwith "Wrong num_cols." t.AT(x.dArray,o.dArray,x.num_rows,x.num_cols,k) o member t.A(x: CudaDeviceVariable<floatType>, o: CudaDeviceVariable<floatType>, m:int, n: int, k: int) = kernel.GridDimensions <- dim3(n) kernel.BlockDimensions <- dim3(block_size_x) kernel.RunAsync(str.Stream,x.DevicePointer,o.DevicePointer,m,n,k,0) |> ignore member t.A(x: dMatrix, k, o: dMatrix) = if x.rc <> o.rc then failwith "x.rc <> o.rc" if divup x.num_rows 32 <> divup column_size 32 then failwith "Wrong num_rows." 
t.A(x.dArray,o.dArray,x.num_rows,x.num_cols,k) member t.A(x: dMatrix, k) = let o = dMatrix.create(x.num_rows,x.num_cols) if divup x.num_rows 32 <> divup column_size 32 then failwith "Wrong num_rows." t.A(x.dArray,o.dArray,x.num_rows,x.num_cols,k) o

The above seems awe inspiring, but one just needs to keep in mind what it does, which is select on rows if called with .AT() and on columns if called with .A()

For performance reasons, in the row version, it first stores the rows of the matrix in shared memory by reading contiguously from the global memory and then from that into registers. It does the select iteratively on the variables in registers.

Probably the most intricate kernel up to now, but what it does is crystal clear.

Here is the library function that uses it:

let DeviceMaxSelectDict = lazy Dictionary<int,DeviceMaxSelectColumnActivationModule>() /// o <- max_k(x) /// Sets all except the k number of max of a column to zero. /// Unlike for the other modules, to ensure it works swiftly, the column size and the number of iterations is fixed so the compiler can unroll the loops. /// This name is for a function wrapper for the Dict that holds the DeviceMaxSelectColumnActivation modules. let DeviceMaxSelectColumnActivationModule column_size = let d = DeviceMaxSelectDict.Value if d.ContainsKey(divup column_size 32) then d.[divup column_size 32] else let t = DeviceMaxSelectColumnActivationModule column_size d.Add(divup column_size 32,t) t /// The winner take all activation. Zeroes out the non top-k values along the row. let sparseActivationErrorModule = lazy new DeviceTrinaryTransformModule "y*((x == 0.0f) ? 0.0f : 1.0f)+z;" let WTA k (a:DM) = let va = a.r.P let el = a.r.A let node = tape.GetDMIf let c = node.P let error = node.A let ff () = let nr,nc = (va).rc node.Resize nr nc DeviceMaxSelectColumnActivationModule(nc).AT(va,k,c) let fb () = sparseActivationErrorModule.Value.A(c,error,el,el) let t = DM.create(node,ff,fb) tape.Add(t) t

Interesting tidbit about the module is that storing the variables into registers requires them to be indexed statically. Not only does their size need to be determined statically during the compilation step, but the variable that indexes them also needs to be static.

The flaw of CUDA C++ unlike for Alea, is that the “#pragma unroll” directives might not get triggered, but as (int i=0; i < NUM_VARS;i++) is all static, one can be decently sure that here it will go as planned.

#pragma unroll // Unroll the loops for performance, though it probably will not work on this part as the col = threadIdx.y is non static. for (int col = threadIdx.y; col < NUM_COLS_32; col += BLOCK_SIZE_Y) { // Stores everything into shared memory first by reading from the global in a contiguous manner. ar[row][col] = (row_idx < num_rows && col < num_cols) ? A[row_idx+col*num_rows] : INIT_MIN; } }

The above probably will not work and I would have to check the PTX code to see if it does. Here is the above rewritten so it gets properly unrolled.

#pragma unroll // Unroll the loops for performance. for (int i = 0; i < NUM_VARS*ROW_ITERS; i++) { // Stores everything into shared memory first by reading from the global in a contiguous manner. const int col = threadIdx.y + i*BLOCK_SIZE_Y; ar[row][col] = (row_idx < num_rows && col < num_cols) ? A[row_idx+col*num_rows] : INIT_MIN; } }

NUM_VARS*ROW_ITERS are all constants that the compiler can replace now. The attempted optimization does not seem to have improved performance any though.

Now about the above wrapper, as the function has to know the number of rows divided by 32 ahead of time, what it does is compile a new kernel on the fly for each **different** column size while keeping the old ones stored inside a Dictionary. Much like for memory buffers this allows for an efficient reuse of components.

With that in place, the script for the autoencoder is trivial to adapt based on the previous example:

// ---------------------------------------------------------------------------
// Autoencoder training script: a 784->1024 WTA(k=6) encoder layer plus a
// tied-weight inverse decoder with no output nonlinearity. training_loop
// folds the input through the layers and returns the squared-error
// reconstruction cost node; train_mnist_sgd then runs SGD epochs over
// dtrain, accumulating average cost, resetting adjoints, backpropagating
// from the cost node and applying gradients, followed by a validation pass
// over dtest, yielding (train_cost, validation_cost) per epoch. The chart
// and bitmap calls at the end visualize the learning curves and weights.
// NOTE(review): depends on tape, DM, dtrain/dtest and layer types defined
// in other fragments of this library; not verifiable in isolation.
// ---------------------------------------------------------------------------
//let l1 = FeedforwardLayer.fromArray (load_data (__SOURCE_DIRECTORY__ + @"\l1_weights.dat") false) (WTAT 6) \\ For loading the weights from file. let l1 = FeedforwardLayer.createRandomLayer 1024 784 (WTA 6) let l2 = InverseFeedforwardLayer.createRandomLayer l1 (fun x -> x) // No nonlinearity at the end. With a steep sigmoid the cost is much better, but the visualizations are less crisp. let layers = [|l1 :> IFeedforwardLayer;l2 :> IFeedforwardLayer|] // Upcasting to the base type. The correct functions will get called with dynamic dispatch. //save_data (__SOURCE_DIRECTORY__ + @"\l1_weights.dat") l1.ToArray // For saving the weights // This does not actually train it, it just initiates the tree for later training. // The autoencoder version. let training_loop (data: DM) (layers: IFeedforwardLayer[]) = let outputs = Array.fold (fun state (layer:IFeedforwardLayer) -> (layer.runLayer state)) data layers squared_error_cost data outputs let train_mnist_sgd num_iters learning_rate (layers: IFeedforwardLayer[]) = [| let mutable r' = 0.0f let base_nodes = layers |> Array.map (fun x -> x.ToArray) |> Array.concat // Stores all the base nodes of the layer so they can later be reset. for i=1 to num_iters do for x in dtrain do let data, target = x let r = training_loop data layers // Builds the tape. tape.forwardpropTape 0 // Calculates the forward values. Triggers the ff() closures. r' <- r' + (!r.r.P/ float32 dtrain.Length) // Adds the cost to the accumulator. if System.Single.IsNaN r' then failwith "Nan error" for x in base_nodes do x.r.A.setZero() // Resets the base adjoints tape.resetTapeAdjoint 0 // Resets the adjoints for the training select r.r.A := 1.0f // Pushes 1.0f from the top node tape.reversepropTape 0 // Resets the adjoints for the test select add_gradients_to_weights' base_nodes learning_rate // The optimization step tape.Clear 0 // Clears the tape without disposing it or the memory buffer. 
It allows reuse of memory for a 100% gain in speed for the simple recurrent and feedforward case. printfn "The training cost at iteration %i is %f" i r' let r1 = r' r' <- 0.0f for x in dtest do let data, target = x let r = training_loop data layers // Builds the tape. tape.forwardpropTape 0 // Calculates the forward values. Triggers the ff() closures. r' <- r' + (!r.r.P/ float32 dtest.Length) // Adds the cost to the accumulator. if System.Single.IsNaN r' then failwith "Nan error" tape.Clear 0 // Clears the tape without disposing it or the memory buffer. It allows reuse of memory for a 100% gain in speed. printfn "The validation cost at iteration %i is %f" i r' let r2 = r' r' <- 0.0f yield r1,r2 |] let num_iters = 20 let learning_rate = 0.1f #time let s = train_mnist_sgd num_iters learning_rate layers #time let l = [|for l,_ in s do yield l|] let r = [|for _,r in s do yield r|] (Chart.Combine [|Chart.Line l;Chart.Line r|]).ShowChart() let bitmap = make_bitmap_from_dmatrix l1.W.r.P 28 28 25 40 bitmap.Save(__SOURCE_DIRECTORY__ + @"\weights.png")

The inverse layer takes in the weight matrix of the first feedforward layer and creates a new random bias vector.

// An inverse feedforward layer of neurons made from a regular one. Used in autoencoders type InverseFeedforwardLayer = { W:DM // Input weight matrix b:DM // Bias vector a:DM->DM } with // Activation function member l.ToArray = [|l.W;l.b|] static member fromArray (a : DM[]) act = { W = a.[0] b = a.[1] a = act } static member createRandomLayer (l: FeedforwardLayer) act = { W = l.W b = DM.makeUniformRandomNode(l.W.r.P.num_cols, 1) a = act } member l.runLayer (x:DM) = addb (matmultTN l.W x) l.b |> l.a interface IFeedforwardLayer with member l.runLayer (x:DM) = l.runLayer x member l.ToArray = l.ToArray

The linear layer for transposed matrix multiplication is not implemented yet, but the above works nicely for now.

The first time I implemented this a few months ago adding the biases was such a chore that I did not bother to add them. Currently, the library is getting a bit big at 2327 lines of code, but it is not the size that determines the complexity of it. What does is its modularity at various levels. The above kernel was written just today in about five hours and it is in a sense the distillation of all my effort in September down to those five hours. It is satisfying.

There is no need to understand all the code fragments, but the purpose of these tutorials is to make the library slightly more familiar to the reader and for myself to revise the material.

The most important post I would say is the basics of AD.

During September of 2015, I think I spent around three weeks or more trying and failing to go above 96% on Mnist with a pretrained net.

As it turned out, there was a bug in the backwards step. One of the map modules was missing a boundary check and when I added it, I realized that I was applying the transpose of the error matrix. Given the error it was amazing that it got as high as 96.5% in the first place.

To get good results with pretraining, it is not actually necessary to do it for a large number of iterations. Even one should be enough in fact.

In a different experiment that used dropout with standard relu units I did just that, and even though the reconstructions were blurry and indistinct compared to the crisp picture at the top of the page, the logistic regression on top of the weights liked it.

More speculatively, the selection functions as employed in the WTA autoencoder have connections to discrete optimization problems. The K-means algorithm in fact is very similar to the sparse activation function and one could potentially use it to cluster cities in the TSP problem for easier problem partitioning. For the k-means specifically, it is also actually possible to optimize it using gradient descent instead of less efficiently by the expectation maximization steps as shown in the Andrew Ng’s course.

To me there is something profound that one can calculate gradients through branches and even sorting functions.

Had I known about the link between math and the general programming, I would have paid more attention in class.

It is speculative, but figuring out the correct way of using backprop like techniques to augment evolutionary optimization techniques could bring about very powerful abilities. I am looking forward to trying it out myself once I am done with these tutorials.

]]>let testSetData = @"\t10k-images.idx3-ubyte" let testSetLabels = @"\t10k-labels.idx1-ubyte" let trainSetData = @"\train-images.idx3-ubyte" let trainSetLabels = @"\train-labels.idx1-ubyte" open System open System.IO type MnistImageset = { num_images : int32 num_rows : int32 num_cols : int32 raw_data : uint8 [] raw_labels : uint8 [] float_data : float32 [] float_labels : float32 [] } let readInt32 (r : BinaryReader) = // Because Mnist's ints are in reverse order. let arr = r.ReadBytes(4) arr |> Array.Reverse BitConverter.ToInt32(arr,0) let make_imageset data_path label_path = use stream_data = File.OpenRead(data_path) use stream_label = File.OpenRead(label_path) use reader_data = new BinaryReader(stream_data) use reader_label = new BinaryReader(stream_label) let magic_number = readInt32 reader_data let num_images = readInt32 reader_data let num_rows = readInt32 reader_data let num_cols = readInt32 reader_data let total_num_bytes = num_images * num_rows * num_cols let raw_data = reader_data.ReadBytes(total_num_bytes) let raw_label_data = reader_label.ReadBytes(num_images+8) let float_pixel_values = [|for x in raw_data -> (float32 x)/255.0f|] let float_labels = Array.zeroCreate (10*num_images) let mutable c = 0 for x in raw_label_data.[8..] do float_labels.[(int x) + c] <- 1.0f c <- c+10 { num_images = num_images num_rows = num_rows num_cols = num_cols raw_data = raw_data raw_labels = raw_label_data.[8..] float_data = float_pixel_values float_labels = float_labels }

The code just loads the Mnist dataset from the given file and returns it in the record above.

// Spiral reverse AD example. Used for testing. #load "../Spiral Library/ad_utils_spiral_v1.fsx" open Ad_utils_spiral_v1 #r "../packages/FSharp.Charting.0.90.13/lib/net40/FSharp.Charting.dll" #r "System.Windows.Forms.DataVisualization.dll" open FSharp.Charting #load "load_mnist.fsx" open Load_mnist open System open ManagedCuda open ManagedCuda.BasicTypes open ManagedCuda.VectorTypes open ManagedCuda.CudaBlas open ManagedCuda.CudaRand open ManagedCuda.NVRTC open ManagedCuda.CudaDNN open System open System.IO open System.Collections // "__SOURCE_DIRECTORY__ + testSetData" gives parsing errors if it is written like "__SOURCE_DIRECTORY__+testSetData" // __SOURCE_DIRECTORY__ is just a string literal refering to the directory where the script resides. let train_data = make_imageset (__SOURCE_DIRECTORY__ + trainSetData) (__SOURCE_DIRECTORY__ + trainSetLabels) let test_data = make_imageset (__SOURCE_DIRECTORY__ + testSetData) (__SOURCE_DIRECTORY__ + testSetLabels) /// Returns a tuple of training set and label set split into minibatches. let make_set (s : MnistImageset) batch_size = /// Function that splits the dataset along the columns. let split_cols (x:dMatrix) batch_size = [| for i=0 to (x.num_cols-1)/batch_size do let start_pos = i*batch_size let end_pos = min ((i+1)*batch_size-1) (x.num_cols-1) yield x.[*,start_pos..end_pos] |] use d_data = dMatrix.create(s.num_rows*s.num_cols,s.num_images,s.float_data) // Loads the data use d_label = dMatrix.create(10,s.num_images,s.float_labels) // Loads the labels let ar_data = split_cols d_data batch_size |> Array.map (fun x -> DM.makeConstantNode x) let ar_label = split_cols d_label batch_size |> Array.map (fun x -> DM.makeConstantNode x) Array.zip ar_data ar_label // The type of each of these two variable is dMatrix [], dMatrix [] - a tuple. let dtrain = make_set train_data 128 let dtest = make_set test_data 128

One of the reasons why I am programming in a statically typed language like F# instead of Python is because types are so informative. The functions are like little machines that take in certain types and spit out another. If this code seems difficult you can just hover your mouse over the variables and the IDE will tell you the exact type of everything.

In the above case the last two variables are all dMatrix [], dMatrix [] tuple types.

Here is the code for the feedforward layer.

// A feedforward layer of neurons type FeedforwardLayer = { W:DM // Input weight matrix b:DM // Bias vector a:DM->DM } with // Activation function member l.ToArray = [|l.W;l.b|] static member fromArray (a : DM[]) act = { W = a.[0] b = a.[1] a = act } static member createRandomLayer hidden_size input_size act = { W = DM.makeUniformRandomNode(hidden_size, input_size) b = DM.makeUniformRandomNode(hidden_size, 1) a = act } member l.runLayer (x:DM) = linear_layer_matmult [|l.W,x|] (Some l.b) |> l.a

The linear_layer function is just Ax+b. The |> pipe operator at the end forward the result of the previous computation, that is from the call to the linear layer, to the l.a activation function.

let l1 = FeedforwardLayer.createRandomLayer 1024 784 relu let l2 = FeedforwardLayer.createRandomLayer 2048 1024 relu let l3 = FeedforwardLayer.createRandomLayer 1024 2048 relu let l4 = FeedforwardLayer.createRandomLayer 10 1024 (clipped_steep_sigmoid 3.0f) let layers = [|l1;l2;l3;l4|]

This is how layers are created. Currently besides the above, Spiral also has standard RNN, GRU and LSTM classes. More will be added no doubt, as I experiment with reinforcement learning. At the time of writing the Spiral library is about one month old and I am still in the middle of porting my old Alea stuff to it.

// This does not actually train it, it just initiates the tree for later training. let training_loop (data: DM) (targets: DM) (layers: FeedforwardLayer[]) = let outputs = layers |> Array.fold (fun state layer -> layer.runLayer state) data // I make the accuracy calculation lazy. This is similar to returning a lambda function that calculates the accuracy // although in this case it will be calculated at most once. lazy get_accuracy targets.r.P outputs.r.P, cross_entropy_cost targets outputs

The above is the forward pass of the library. The Array.fold part is a bit confusing if you are not familiar with it. I’ll rewrite the function into an equivalent imperative form.

// This does not actually train it, it just initiates the tree for later training. // Imperative form. let training_loop (data: DM) (targets: DM) (layers: FeedforwardLayer[]) = let mutable outputs = data for x in layers do outputs <- x.runLayer outputs // I make the accuracy calculation lazy. This is similar to returning a lambda function that calculates the accuracy // although in this case it will be calculated at most once. lazy get_accuracy targets.r.P outputs.r.P, cross_entropy_cost targets outputs

The fold function just takes an anonymous function, an initial state and an array as parameters and iterates over that array. In the last line, it lazily returns the accuracy value, which is not actually a part of the tree construction or the tape, and returns non-lazily the cross_entropy_cost, which definitely goes into the tape.

The way the accuracy calculation works is worth getting into for a bit.

// ---------------------------------------------------------------------------
// DeviceMaxColumnActivationModule: NVRTC-compiled kernel that keeps exactly
// one maximum per column and zeroes the rest. Each block scans its column,
// does a block-level max reduction, and then every thread whose local max
// equals the block max races to write its index into a single shared slot;
// after a barrier all threads read back that one index, guaranteeing a
// unique winner even when several entries tie at the max (the saturation
// case discussed in the surrounding text).
// NOTE(review): relies on ctx, str, FloatTypeCpp, floatType and dMatrix from
// elsewhere in the library. The kernel string is runtime code, verbatim.
// ---------------------------------------------------------------------------
/// o <- max_col(x) /// Sets all except one of the max of a column to zero. type DeviceMaxColumnActivationModule() = let block_size = 128 let kernel_code = " //Kernel code: extern \"C\" { #define INIT __int_as_float(0xff800000) // The constant init for the reduce operations. This is float negative infinity. // The max reduce version. __device__ inline "+FloatTypeCpp+" warpReduce("+FloatTypeCpp+" value){ for (int i=1; i<32; i*=2) { "+FloatTypeCpp+" tmp = __shfl_xor(value, i); value = (tmp > value) ? tmp : value; } return value; } __device__ inline "+FloatTypeCpp+" blockReduce("+FloatTypeCpp+" value){ __shared__ "+FloatTypeCpp+" temp[32]; if (threadIdx.x < 32) temp[threadIdx.x] = INIT; "+FloatTypeCpp+" out_partial = warpReduce(value); __syncthreads(); if (threadIdx.x % 32 == 0) temp[threadIdx.x / 32] = out_partial; __syncthreads(); if (threadIdx.x < 32) out_partial = warpReduce(temp[threadIdx.x]); return out_partial; } // Device code __global__ void Kernel(const "+FloatTypeCpp+"* A, "+FloatTypeCpp+"* O, const int num_rows, const int num_cols) { int row = threadIdx.x; //const int col = blockIdx.x; int col_idx = blockIdx.x*num_rows; "+FloatTypeCpp+" max = INIT; // This is the negative infinity for floats. int index = -1; while (row < num_rows) { if (A[row+col_idx] > max) { max = A[row+col_idx]; index = row; } row += blockDim.x; } __shared__ "+FloatTypeCpp+" max_index; if (max == blockReduce(max)) max_index = index; __syncthreads(); index = max_index; // These last four lines are to make absolutely sure that only one max is selected in case there is more than one. row = threadIdx.x; while (row < num_rows) { O[row+col_idx] = (row == index) ? 
max : 0.0f; row += blockDim.x; } } } " let k = new ManagedCuda.NVRTC.CudaRuntimeCompiler(kernel_code,"Kernel") do try k.Compile([|"-arch=compute_30"|]) with | NVRTCException as x -> printfn "%s" (k.GetLogAsString()) reraise() let kernel = ctx.LoadKernelPTX(k.GetPTX(),"Kernel") member t.A(x: CudaDeviceVariable<floatType>, o: CudaDeviceVariable<floatType>, m:int , n: int) = kernel.GridDimensions <- dim3(n) kernel.BlockDimensions <- dim3(block_size) kernel.RunAsync(str.Stream,x.DevicePointer,o.DevicePointer,m,n) |> ignore member t.A(x: dMatrix, o: dMatrix) = if x.rc <> o.rc then failwith "x.rc <> o.rc" t.A(x.dArray,o.dArray,x.num_rows,x.num_cols) member t.A(x: dMatrix) = let o = dMatrix.create(x.num_rows,x.num_cols) t.A(x.dArray,o.dArray,x.num_rows,x.num_cols) o

let maxColumnModule = lazy new DeviceMaxColumnActivationModule() let accuracyModule = lazy new DeviceBinaryMapSumModule "(x*y == 0.0f) ? 0.0f : 1.0f;" let get_accuracy targets activations = let o = tape.GetDMIf o.P.ReplaceIf activations.num_rows activations.num_cols maxColumnModule.Value.A(activations,o.P) accuracyModule.Value.A(targets,o.P)

In the above Cuda kernel, each blocks iterates over its column and then does a block max reduction. At first the way it was written it then just compared the values in the A array with the max and then set the values that did not equal max to zero. This actually caused errors when the values were saturated as not all values except one of a column would be set to zero. There would be more than one max.

What the above does is neatly avoid that issue by having all threads in a block that equal the max store their indexes into the shared memory at a single location. They each crowd and push each other to store that value and only one of them gets stored. Then the block gets synchronized and they all read that same value, thereby guaranteeing that all threads in a block have the same index. Then all rows except for the one that equals that index get set to zero.

At the end of this is outputted a matrix whose columns only have one value nonzero. The above algorithm is actually even more efficient than the previous as it does not require reading from the original again.

The way accuracy is then calculated from that is that the column max matrix and the label matrix are elementwise multiplied and then their nonzero elements are set to 1. Then they are summed. That is neatly done in a single operation inside the accuracyModule.

// ---------------------------------------------------------------------------
// SGD driver for the MNIST classifier. Per epoch: for each training batch,
// build the tape, run the forward pass, accumulate the average cost (with a
// NaN guard), zero the base-node and tape adjoints, seed the top adjoint
// with 1.0f, reverse-propagate, apply gradients, and clear (but keep the
// buffers of) the tape; then a validation pass over dtest that also forces
// the lazy accuracy. Yields (train_cost, validation_cost) per epoch; the
// trailing script lines time the run and chart both curves.
// ---------------------------------------------------------------------------
let train_mnist_sgd num_iters learning_rate (layers: FeedforwardLayer[]) = [| let mutable r' = 0.0f let base_nodes = layers |> Array.map (fun x -> x.ToArray) |> Array.concat // Stores all the base nodes of the layer so they can later be reset. for i=1 to num_iters do for x in dtrain do let data, target = x let _,r = training_loop data target layers // Builds the tape. tape.forwardpropTape 0 // Calculates the forward values. Triggers the ff() closures. r' <- r' + (!r.r.P/ float32 dtrain.Length) // Adds the cost to the accumulator. if System.Single.IsNaN r' then failwith "Nan error" for x in base_nodes do x.r.A.setZero() // Resets the base adjoints tape.resetTapeAdjoint 0 // Resets the adjoints for the training select r.r.A := 1.0f // Pushes 1.0f from the top node tape.reversepropTape 0 // Resets the adjoints for the test select add_gradients_to_weights' base_nodes learning_rate // The optimization step tape.Clear 0 // Clears the tape without disposing it or the memory buffer. It allows reuse of memory for a 100% gain in speed for the simple recurrent and feedforward case. printfn "The training cost at iteration %i is %f" i r' let r1 = r' r' <- 0.0f let mutable acc = 0.0f for x in dtest do let data, target = x let lazy_acc,r = training_loop data target layers // Builds the tape. tape.forwardpropTape 0 // Calculates the forward values. Triggers the ff() closures. r' <- r' + (!r.r.P/ float32 dtest.Length) // Adds the cost to the accumulator. acc <- acc+lazy_acc.Value // Here the accuracy calculation is triggered by accessing it through the Lazy property. if System.Single.IsNaN r' then failwith "Nan error" tape.Clear 0 // Clears the tape without disposing it or the memory buffer. It allows reuse of memory for a 100% gain in speed. 
printfn "The validation cost at iteration %i is %f" i r' printfn "The accuracy is %i/10000" (int acc) let r2 = r' r' <- 0.0f yield r1,r2 |] let num_iters = 40 let learning_rate = 0.1f #time let s = train_mnist_sgd num_iters learning_rate layers #time let l = [|for l,_ in s do yield l|] let r = [|for _,r in s do yield r|] (Chart.Combine [|Chart.Line l;Chart.Line r|]).ShowChart()

The above just demonstrates just what a small part of the actual program the neural net actually is. We set out to write f(Ax+b) and ended up writing about two thousand lines of code.

Having the accuracy calculation be lazy is a good idea as that ensures that its function does not get triggered on the training set. If not for this I would have to have a separate function, and I actually do not have access to the intermediate variables inside the train_mnist_sgd loops.

For the reference I got about 98.3% after 40 iterations. I actually get 98.5% with just two layers.

This might seem like a limit, but it can be improved upon by pretraining the net with an autoencoder.

]]>type Df_rec = { P: floatType ref A : floatType ref is_constant : bool } with static member create P = {P=P;A=ref 0.0f;is_constant=false} static member createConstant P = {P=P;A=ref 0.0f;is_constant=true} type DM_rec = { P : dMatrix A : dMatrix is_constant : bool } with static member create (P: dMatrix) = {P=P;A=P.zeroLike();is_constant=false} static member createConstant (P: dMatrix) = {P=P;A=dMatrix.createEmpty;is_constant=true} static member createEmpty = {P=dMatrix.createEmpty;A=dMatrix.createEmpty;is_constant=false} static member createEmptyConstant = {P=dMatrix.createEmpty;A=dMatrix.createEmpty;is_constant=true} /// Resizes the primal and the adjoint if they are below nr*nc in size. member t.Resize nr nc = let p = t.P let a = t.A // This is an optimization to prevent an clogup of dMatrix objects here. // GC can't free up memory if the dMatrix instances are pointing to the same dArray. // If the class is larger, replace the reference else the function will mutably just adjust // the num_rows and num_col fields. p.ReplaceIf nr nc a.ReplaceIf nr nc member t.Dispose() = (t.P :> IDisposable).Dispose() (t.A :> IDisposable).Dispose()

At the time of writing, there are three steps in the library, the build step, the forward step and the reverse step. In the forward step, the nodes dynamically resize themselves if they have insufficient capacity to meet the demand by calling the ReplaceIf functions.

// Does nothing. Used as the default forward/backward step for leaf nodes.
let Noop() = ()

/// Dual scalar node: the record r plus the closures that execute its
/// forward (ff) and backward (fb) steps when the tape is run.
type Df = {
    r: Df_rec
    ff : (unit -> unit)
    fb : (unit -> unit)
    }
    static member create(r,ff,fb) = {r=r;ff=ff;fb=fb}
/// Dual matrix node, mutually defined with Df. Leaf constructors below use
/// Noop for ff/fb since leaves have no computation to replay.
and DM = {
    r: DM_rec
    ff : (unit -> unit)
    fb : (unit -> unit)
    }
    static member create(r,ff,fb) = {r=r;ff=ff;fb=fb}
    // Trainable node backed by a freshly allocated hidden_size x input_size matrix.
    static member makeNode(hidden_size, input_size) =
        let p = dMatrix.create(hidden_size,input_size)
        {r=DM_rec.create p;ff=Noop;fb=Noop}
    // Trainable node initialized from a host array.
    static member makeNode(hidden_size, input_size, input: floatType[]) =
        let p = dMatrix.create(hidden_size,input_size, input)
        {r=DM_rec.create p;ff=Noop;fb=Noop}
    static member makeNode p = {r=DM_rec.create p;ff=Noop;fb=Noop}
    // Constant nodes: no adjoint is accumulated for them during the reverse pass.
    static member makeConstantNode(hidden_size, input_size, input: floatType[]) =
        let p = dMatrix.create(hidden_size,input_size, input)
        {r=DM_rec.createConstant p;ff=Noop;fb=Noop}
    static member makeConstantNode p = {r=DM_rec.createConstant p;ff=Noop;fb=Noop}
    // Uniform random init scaled by 1/sqrt(fan_in + fan_out).
    static member makeUniformRandomNode(hidden_size,input_size) =
        let scale = (1.0f / sqrt(hidden_size+input_size |> floatType))
        let p = dMatrix.createRandomUniformMatrix hidden_size input_size scale 0.0f
        {r=DM_rec.create p;ff=Noop;fb=Noop}

The Df and DM types are the types the library operates on respectively. The hierarchy goes like this from top to bottom: DM->DM_rec->dMatrix and vice versa for Df. In addition to the records holding the primals and the adjoints, they each carry anonymous functions that take and return nothing, ff and fb. These functions are closures – closet classes really – that actually execute the forward and the backwards steps.

open System.Collections.Generic

/// The tape. Records the forward/backward closures of the program's steps in
/// definition order and also houses, per selection index, the memory buffer
/// of reusable DM_recs handed out by GetDMIf.
type tapeType() =
    // Key: selection index. Value: (closure tape, (DM_rec memory buffer, buffer pointer)).
    let d = Dictionary<int,List<obj>*(List<DM_rec>*int ref)>()
    let mutable select = 0

    /// Instantiates a new List if none is present at the selection and adds to it,
    /// else it just adds to the selected one. The default select is 0.
    member t.Add a =
        if d.ContainsKey(select) = false then
            let tape = List()
            let memory_dm = List(), ref 0
            d.Add(select, (tape, memory_dm))
            tape.Add(a)
        else
            let tape,_ = d.[select]
            tape.Add(a)

    /// Sets the select to input.
    member t.Select i = select <- i

    /// Runs all the forward functions in the selected tape, from the bottom up.
    // Note: the elements are stored as obj, so the type test pattern (:?) is
    // required here; the original's plain `Df as x` pattern does not compile.
    member t.forwardpropTape select =
        let tape,_ = d.[select]
        for i=0 to tape.Count-1 do
            match tape.[i] with
            | :? Df as x -> x.ff()
            | :? DM as x -> x.ff()
            | _ -> failwith "Type not supported"

    /// Runs all the backward functions in the selected tape, starting from the top.
    member t.reversepropTape select =
        let tape,_ = d.[select]
        for i=tape.Count-1 downto 0 do
            match tape.[i] with
            | :? Df as x -> x.fb()
            | :? DM as x -> x.fb()
            | _ -> failwith "Type not supported"

    /// Resets the adjoints of the selected tape. Must be called before every
    /// reverse pass because the fb closures accumulate into the adjoints.
    member t.resetTapeAdjoint select =
        let tape,_ = d.[select]
        for i=tape.Count-1 downto 0 do
            match tape.[i] with
            | :? Df as x -> x.r.A := 0.0f
            | :? DM as x -> x.r.A.setZero()
            | _ -> failwith "Type not supported"

    /// Resets the primals of the selected tape.
    member t.resetTapePrimal select =
        if d.ContainsKey(select) then
            let tape,_ = d.[select]
            for i=tape.Count-1 downto 0 do
                match tape.[i] with
                | :? Df as x -> x.r.P := 0.0f
                | :? DM as x -> x.r.P.setZero()
                | _ -> failwith "Type not supported"

    /// Disposes all the elements of the select tape and then clears it including the memory buffer.
    member t.Dispose select =
        let tape,mp = d.[select]
        let memory,dm_pointer = mp
        for x in tape do
            match x with
            | :? Df -> () // Scalars hold no device memory.
            | :? DM as x -> x.r.Dispose()
            | _ -> failwith "Type not supported"
        for x in memory do x.Dispose()
        tape.Clear()
        memory.Clear()
        dm_pointer := 0

    /// Clears the select tape without disposing it or the memory buffer.
    /// Also sets the pointer to zero for the select.
    member t.Clear select =
        if d.ContainsKey(select) then
            let tape,mp = d.[select]
            let _,dm_pointer = mp
            tape.Clear()
            dm_pointer := 0

    /// Disposes all the elements of all the tapes and then clears them including the memory buffers.
    member t.DisposeAll() =
        for (tape,mp) in d.Values do
            let memory,dm_pointer = mp
            for x in tape do
                match x with
                | :? Df -> ()
                | :? DM as x -> x.r.Dispose()
                | _ -> failwith "Type not supported"
            for x in memory do x.Dispose()
            tape.Clear()
            memory.Clear()
            dm_pointer := 0

    /// Returns an empty DM_rec if none exists at the pointer and adds it to the
    /// memory buffer, else it returns the DM_rec at the pointer to be reused.
    /// Increments the pointer afterwards.
    member t.GetDMIf =
        if d.ContainsKey(select) then
            let _, mp = d.[select]
            let memory,dm_pointer = mp
            if memory.Count > !dm_pointer then
                // Reuse the previously allocated record at the pointer.
                dm_pointer := !dm_pointer+1
                memory.[!dm_pointer-1]
            else
                // First pass: allocate a fresh empty record and remember it.
                dm_pointer := !dm_pointer+1
                let t = DM_rec.createEmpty
                memory.Add(t)
                t
        else
            // Nothing recorded for this selection yet; create the tape and buffer.
            let tape = List()
            let memory = List()
            let dm_pointer = ref 1
            d.Add(select, (tape, (memory,dm_pointer)))
            let t = DM_rec.createEmpty
            memory.Add(t)
            t

/// The global tape instance.
let tape = tapeType()

The above class is the one most likely to change in the future.

Not only does it store the steps of the program in the order they are defined, it also houses the memory buffers. They are the current solution to reusing GPU allocated memory.

During the first build step GetDMIf is called and it returns an empty DM_rec before storing its location to memory. Then during the forward step, the program resizes those records. Then after the reverse step the local (not global) tape that records the steps is cleared. During the subsequent steps GetDMIf then actually returns the previously allocated DM_recs. It works quite nicely as an optimization and gives us that 100% speed gain for memory reuse. It works for LSTMs quite nicely.

Here are a few examples of library functions. They are similar from the basics of AD tutorial.

// The smart quotes around the kernel expressions in the original were not valid
// F# string literals; they are plain double quotes here.
let hadamaradMultiplicationModule = lazy new DeviceBinaryTransformModule "x*y;"
let hadamaradMultiplicationErrorModule = lazy new DeviceTrinaryTransformModule "x*y+z;"

/// Hadamarad (elementwise) multiplication function.
let hadmult (a: DM) (b: DM) =
    let va = a.r.P
    let vb = b.r.P
    let el = a.r.A
    let er = b.r.A
    let node = tape.GetDMIf
    let c = node.P
    let error = node.A
    // Forward step: resize the output node to a's dimensions, then c <- va .* vb.
    let ff () =
        let nr, nc = va.rc
        node.Resize nr nc
        hadamaradMultiplicationModule.Value.A(va, vb, c)
    // Backward step: accumulate el <- vb .* error + el and er <- va .* error + er,
    // skipping constant nodes whose adjoints are never read.
    let fb () =
        if a.r.is_constant = false then hadamaradMultiplicationErrorModule.Value.A(vb,error,el,el)
        if b.r.is_constant = false then hadamaradMultiplicationErrorModule.Value.A(va,error,er,er)
    let t = DM.create(node,ff,fb)
    tape.Add(t)
    t

In the forward step, it resizes the node which houses the primal and the adjoint and then calls the hadamarad multiplication module function.

The new thing here is the lazy prefix. What that does is wraps an expression in something like a function so it defers evaluation until it needs to be used.

// Deferred evaluation: the multiplication does not run until .Value is read.
// (The original used typographic quotes in the format string, which is not valid F#.)
let a = lazy (5*5)
// *Do something else*
printfn "%i" a.Value // Only now evaluate a.

The above example is not too good, but imagine if you have over a dozen of new DeviceModule expressions like the ones for the Hadamarad module. Each of them requires roughly 0.35s to evaluate even if they are unused. By adding those statements the time it takes to load the library script goes from 9.3s to 4.1s on my machine.

At the end of the function call what has happened is that the program has created a node and two closures with the forward and the backwards steps and added them to the tape.

There are really two ‘tapes’ so to speak in the global tape and they are separate. The closure tape is detached from the memory buffer tape and should they be called on wildly different functions the buffers might become miscalibrated. Not really a problem with machine learning algorithms though. Even the LSTM, which has a branching factor depending on the sequence length, is a pretty benign case.

/// Matrix-matrix multiply.
let matmult (a: DM) (b:DM) =
    let va = a.r.P
    let vb = b.r.P
    let el = a.r.A
    let er = b.r.A
    let node = tape.GetDMIf
    let c = node.P
    let error = node.A
    // Forward step: output has a's rows and b's columns; c <- va * vb.
    let ff () =
        let nr = (va).num_rows
        let nc = (vb).num_cols
        node.Resize nr nc
        gemm2 nT nT 1.0f va vb 0.0f c
    // Backward step: beta = 1.0f means the adjoints are accumulated into, so they
    // must be zeroed before each reverse pass via tape.resetTapeAdjoint.
    let fb () =
        if a.r.is_constant = false then
            gemm2 nT T 1.0f error vb 1.0f el// The derivative with respect to the left argument: el <- error * vb^T + el. Usually error * input.
        if b.r.is_constant = false then
            gemm2 T nT 1.0f va error 1.0f er// The derivative with respect to the right argument: er <- va^T * error + er. Usually weights * error.
    let t = DM.create(node,ff,fb)
    tape.Add(t)
    t

As in all steps, on the forward step, the node is set, but on the backwards steps, all adjoints are added to. This requires resetting the adjoints to zero before every reverse pass manually by calling tape.resetTapeAdjoint().

The above functions are really trivial to add to the library and with the current design, it is readily extensible.

Another example would be the one I just finished recently.

// Forward kernel: clamp x into [coef_x, coef_y] and add coef_z, all in one pass.
let clipModule = lazy new DeviceTrinaryCoefTransformModule "((x < coef_x) ? coef_x : (x > coef_y ? coef_y : x))+coef_z;"
// Backward kernel: pass the incoming error y through only where x is inside the band, accumulate into z.
let clipErrorModule = lazy new DeviceTrinaryCoefTransformModule "y*((x < coef_x) ? 0.0f : (x > coef_y ? 0.0f : 1.0f))+z;"
/// o <- clip(min,max,a)+scalar
/// The clip function. Can be used as Relu by setting max to positive infinity.
/// Can be used to make linear clipped sigmoid by setting min,max,scalar to -0.5f,0.5f,0.5f.
let clip min max a scalar =
    let va = a.r.P
    let el = a.r.A
    let node = tape.GetDMIf
    let c = node.P
    let error = node.A
    // Forward step: the min clamp, max clamp and scalar add are fused into one kernel call.
    let ff () =
        let nr,nc = (va).rc
        node.Resize nr nc
        clipModule.Value.A(min,va,max,va,scalar,va,c)
    // Backward step: el accumulates error where min <= va <= max.
    // NOTE(review): unlike hadmult/matmult there is no a.r.is_constant guard here,
    // and the third coefficient passed (max) is unused by the error expression — confirm intended.
    let fb () =
        clipErrorModule.Value.A(min,va,max,error,max,el,el)
    let t = DM.create(node,ff,fb)
    tape.Add(t)
    t

let inline clipped_sigmoid x = clip 0.0001f 0.9999f (sigmoid x) 0.0f
let inline clipped_steep_sigmoid coef x = clip 0.0001f 0.9999f (steep_sigmoid coef x) 0.0f
let inline relu x = clip 0.0f Single.PositiveInfinity x 0.0f
// The linear versions of the sigmoid and tanh.
let inline clipped_linear_sigmoid x = clip -0.4999f 0.4999f x 0.5f // Clipped linear sigmoid in the [0.001,0.999] range.
let inline linear_sigmoid x = clip -0.5f 0.5f x 0.0f // Linear sigmoid in the [0.0,1.0] range.
let inline linear_tanh x = clip -1.0f 1.0f x 0.0f // Linear tanh in the [-1.0,1.0] range.

A good way to not have to suffer Nan errors in logistic regression is to just clip the min and the max values to something close to 1, but not 1. Otherwise, eventually gradient descent will push the outputs in the final layer to 1 or 0 exactly and the net will blow up. In the library there is also a steep sigmoid variant which works better than the regular sigmoid with LSTMs or so I’ve heard.

I’ve tried clipped linear tanh and sigmoid and they really do not work well though, in feedforward regimes. On the XOR problem the relu gets stuck sometimes. It really depends on problem to problem.

Actually the clipped linear versions worked really well for me two months or so ago while I was writing everything by hand. At the time out of laziness, I did not implement the cost function properly in the last layer and just let all the gradients flow through and the clipped linear function worked better than the regular sigmoid for me.

It might be worth doing some research on mismatched clipped functions that let the gradients flow backwards in an irregular manner.

The really major speed gains that come to the library are from bunching the functions together like I did in the clip function when in the forward step I combined the min, max and the add function in one single pass.

Likewise, one potential improvement is to extend the linear layer into a nonlinear layer. That would eliminate the need to have a primal and an adjoint for the activation step. At that point the library would be as efficient as my previous handwritten (and *faulty*) code.

As an AD library Spiral is really specialized at this point, not so much for reverse mode, but for parallel operations on large collections. As I consider the possibilities, one thing that greatly interests me now is source transformation. What if instead of making these closures, I somehow make it so that the functions translate the tape into native code directly?

In AD literature there are such source transformation tools and compared to their operator overload brethren, they are an order or two of magnitude faster. However, even though I am interested regardless, I do not think there are significant gains to be had in this direction for Spiral. It would be different if it was operating directly on scalars, but the overhead of having extra function calls on large collections is insignificant compared to the scalar case.

There is a lot to be satisfied about with the current design of the library. It essentially combines the best of OO, functional and even imperative design to make coding machine learning algorithms on the GPU a relatively straightforward affair. Past this point, getting the library to the next level is simply a matter of adding features to it. Bug reports and new feature pull requests are welcome.

Oh and before I forget, some of the code in the last few posts is already a tad outdated. I always recommend checking out the source instead of this tutorial.

]]>/// Compiles and wraps the CUDA kernel that copies a rectangular slice out of a matrix.
type DeviceGetSliceModule() =
    let block_size = 256
    let kernel_code = "
        //Kernel code:
        extern \"C\" {
            __global__ void getSliceKernel(const "+FloatTypeCpp+"* matrix, "+FloatTypeCpp+"* out_matrix, const int start_row, const int end_row, const int num_rows, const int start_col, const int end_col, const int num_cols, const unsigned col_major){
                const int stride = blockDim.x * gridDim.x;
                if (col_major){
                    int i = threadIdx.x+blockIdx.x*blockDim.x;
                    const int row_stride = end_row-start_row+1;
                    const int col_stride = end_col-start_col+1;
                    while (1) {
                        const int row_i = i % row_stride;
                        const int col_i = i / row_stride;
                        const int row = start_row+row_i;
                        const int col = start_col+col_i;
                        const int idx = row+col*num_rows;
                        if (row_i < row_stride && col_i < col_stride) {
                            out_matrix[i] = matrix[idx];
                            i += stride;
                        } else return;
                    }
                } else{
                    int i = threadIdx.x+blockIdx.x*blockDim.x;
                    const int row_stride = end_row-start_row+1;
                    const int col_stride = end_col-start_col+1;
                    while (1) {
                        const int row_i = i / col_stride;
                        const int col_i = i % col_stride;
                        const int row = start_row+row_i;
                        const int col = start_col+col_i;
                        const int idx = col+row*num_cols;
                        if (row_i < row_stride && col_i < col_stride) {
                            out_matrix[i] = matrix[idx];
                            i += stride;
                        } else return;
                    }
                }
            }
        }
        "
    let k = new ManagedCuda.NVRTC.CudaRuntimeCompiler(kernel_code,"getSliceKernel")
    do
        // NVRTCException is a .NET exception class, so the type test pattern is required.
        try k.Compile([|"-arch=compute_30"|])
        with :? NVRTCException ->
                printfn "%s" (k.GetLogAsString())
                reraise()
    let kernel = ctx.LoadKernelPTX(k.GetPTX(),"getSliceKernel")

    /// For matrices stored in row major order.
    /// Zero based indexing.
    member t.AR(x: dMatrix, start_row, end_row, start_col, end_col) =
        if (start_row < 0 || start_col < 0) then failwith "start_row < 0 || start_col < 0"
        // Bug fix: the original tested start_col here, leaving end_col unchecked.
        if (end_row >= x.num_rows || end_col >= x.num_cols) then failwith "end_row >= x.num_rows || end_col >= x.num_cols"
        let order = 0u
        let row_stride = end_row-start_row+1
        let col_stride = end_col-start_col+1
        let y = dMatrix.create(row_stride, col_stride)
        let n = row_stride*col_stride
        let gridSize = divup n block_size
        kernel.GridDimensions <- dim3(gridSize)
        kernel.BlockDimensions <- dim3(block_size)
        kernel.RunAsync(str.Stream, x.dArray.DevicePointer,y.dArray.DevicePointer,start_row, end_row, x.num_rows, start_col, end_col, x.num_cols, order) |> ignore
        y

    /// For matrices stored in column major order.
    /// Zero based indexing.
    member t.AC(x: dMatrix, start_row, end_row, start_col, end_col) =
        if (start_row < 0 || start_col < 0) then failwith "start_row < 0 || start_col < 0"
        // Bug fix: the original tested start_col here, leaving end_col unchecked.
        if (end_row >= x.num_rows || end_col >= x.num_cols) then failwith "end_row >= x.num_rows || end_col >= x.num_cols"
        let order = 1u
        let row_stride = end_row-start_row+1
        let col_stride = end_col-start_col+1
        let y = dMatrix.create(row_stride, col_stride)
        let n = row_stride*col_stride
        let gridSize = divup n block_size
        kernel.GridDimensions <- dim3(gridSize)
        kernel.BlockDimensions <- dim3(block_size)
        kernel.RunAsync(str.Stream, x.dArray.DevicePointer,y.dArray.DevicePointer,start_row, end_row, x.num_rows, start_col, end_col, x.num_cols, order) |> ignore
        y

/// Compiles and wraps the CUDA kernel that writes a matrix y into a rectangular slice of x.
type DeviceSetSliceModule() =
    let block_size = 256
    let kernel_code = "
        //Kernel code:
        extern \"C\" {
            __global__ void setSliceKernel("+FloatTypeCpp+"* matrix, const "+FloatTypeCpp+"* out_matrix, const int start_row, const int end_row, const int num_rows, const int start_col, const int end_col, const int num_cols, const unsigned col_major){
                const int stride = blockDim.x * gridDim.x;
                if (col_major){
                    int i = threadIdx.x+blockIdx.x*blockDim.x;
                    const int row_stride = end_row-start_row+1;
                    const int col_stride = end_col-start_col+1;
                    while (1) {
                        const int row_i = i % row_stride;
                        const int col_i = i / row_stride;
                        const int row = start_row+row_i;
                        const int col = start_col+col_i;
                        const int idx = row+col*num_rows;
                        if (row_i < row_stride && col_i < col_stride) {
                            matrix[idx] = out_matrix[i];
                            i += stride;
                        } else return;
                    }
                } else{
                    int i = threadIdx.x+blockIdx.x*blockDim.x;
                    const int row_stride = end_row-start_row+1;
                    const int col_stride = end_col-start_col+1;
                    while (1) {
                        const int row_i = i / col_stride;
                        const int col_i = i % col_stride;
                        const int row = start_row+row_i;
                        const int col = start_col+col_i;
                        const int idx = col+row*num_cols;
                        if (row_i < row_stride && col_i < col_stride) {
                            matrix[idx] = out_matrix[i];
                            i += stride;
                        } else return;
                    }
                }
            }
        }
        "
    let k = new ManagedCuda.NVRTC.CudaRuntimeCompiler(kernel_code,"setSliceKernel")
    do
        try k.Compile([|"-arch=compute_30"|])
        with :? NVRTCException ->
                printfn "%s" (k.GetLogAsString())
                reraise()
    let kernel = ctx.LoadKernelPTX(k.GetPTX(),"setSliceKernel")

    /// For matrices stored in row major order.
    /// Zero based indexing.
    member t.AR(x: dMatrix, y: dMatrix, start_row, end_row, start_col, end_col) =
        if (start_row < 0 || start_col < 0) then failwith "start_row < 0 || start_col < 0"
        // Bug fix: the original tested start_col here, leaving end_col unchecked.
        if (end_row >= x.num_rows || end_col >= x.num_cols) then failwith "end_row >= x.num_rows || end_col >= x.num_cols"
        let order = 0u
        let row_stride = end_row-start_row+1
        let col_stride = end_col-start_col+1
        if y.rc <> (row_stride,col_stride) then failwith "y.rc <> row_stride,col_stride"
        let n = row_stride*col_stride
        let gridSize = divup n block_size
        kernel.GridDimensions <- dim3(gridSize)
        kernel.BlockDimensions <- dim3(block_size)
        kernel.RunAsync(str.Stream, x.dArray.DevicePointer,y.dArray.DevicePointer,start_row, end_row, x.num_rows, start_col, end_col, x.num_cols, order) |> ignore

    /// For matrices stored in column major order.
    /// Zero based indexing.
    member t.AC(x: dMatrix, y: dMatrix, start_row, end_row, start_col, end_col) =
        if (start_row < 0 || start_col < 0) then failwith "start_row < 0 || start_col < 0"
        // Bug fix: the original tested start_col here, leaving end_col unchecked.
        if (end_row >= x.num_rows || end_col >= x.num_cols) then failwith "end_row >= x.num_rows || end_col >= x.num_cols"
        let order = 1u
        let row_stride = end_row-start_row+1
        let col_stride = end_col-start_col+1
        if y.rc <> (row_stride,col_stride) then failwith "y.rc <> row_stride,col_stride"
        let n = row_stride*col_stride
        let gridSize = divup n block_size
        kernel.GridDimensions <- dim3(gridSize)
        kernel.BlockDimensions <- dim3(block_size)
        kernel.RunAsync(str.Stream, x.dArray.DevicePointer,y.dArray.DevicePointer,start_row, end_row, x.num_rows, start_col, end_col, x.num_cols, order) |> ignore

// The Item and GetSlice operators. Column major.
let setsliceModule = DeviceSetSliceModule()
let getsliceModule = DeviceGetSliceModule()

type dMatrix with
    member t.GetSlice(rowStart: int option, rowFinish : int option, colStart: int option, colFinish : int option) =
        let rowStart = defaultArg rowStart 0
        let rowFinish = defaultArg rowFinish (t.num_rows-1)
        let colStart = defaultArg colStart 0
        let colFinish = defaultArg colFinish (t.num_cols-1)
        getsliceModule.AC(t,rowStart,rowFinish,colStart,colFinish)
    member t.GetSlice(row: int, colStart: int option, colFinish: int option) =
        let colStart = defaultArg colStart 0
        // Bug fix: without the parentheses this parsed as (defaultArg colFinish t.num_cols) - 1,
        // silently subtracting one from an explicitly supplied colFinish.
        let colFinish = defaultArg colFinish (t.num_cols-1)
        getsliceModule.AC(t,row,row,colStart,colFinish)
    member t.GetSlice(rowStart: int option, rowFinish: int option, col: int) =
        let rowStart = defaultArg rowStart 0
        // Bug fix: parentheses added for the same precedence reason as above.
        let rowFinish = defaultArg rowFinish (t.num_rows-1)
        getsliceModule.AC(t,rowStart,rowFinish,col,col)
    member t.SetSlice(rowStart: int option, rowFinish : int option, colStart: int option, colFinish : int option, y) =
        let rowStart = defaultArg rowStart 0
        let rowFinish = defaultArg rowFinish (t.num_rows-1)
        let colStart = defaultArg colStart 0
        let colFinish = defaultArg colFinish (t.num_cols-1)
        setsliceModule.AC(t,y,rowStart,rowFinish,colStart,colFinish)
    member t.SetSlice(row: int, colStart: int option, colFinish: int option,y) =
        let colStart = defaultArg colStart 0
        // Bug fix: parentheses added for the same precedence reason as above.
        let colFinish = defaultArg colFinish (t.num_cols-1)
        setsliceModule.AC(t,y,row,row,colStart,colFinish)
    member t.SetSlice(rowStart: int option, rowFinish: int option, col: int,y) =
        // Bug fix: parentheses added for the same precedence reason as above.
        let rowStart = defaultArg rowStart 0
        let rowFinish = defaultArg rowFinish (t.num_rows-1)
        setsliceModule.AC(t,y,rowStart,rowFinish,col,col)

Even though it is 200 lines long, all the above does is let us access a matrix like a 2D array. With this extension it can be read and set using .[1..3,2..5] or something to that effect. These kernels could also be helpful in isolation. There are both column major and row major versions inside the function.

]]>The XOR network rewritten in more mathy notation would be:

y = sum((targets-sigmoid(W2 * tanh(W*input+bias) + bias2))^2)

The challenge of optimizing this neural net is to make sure the W,W2,bias and bias2 variables are such that the cost is as close as possible to zero. To do that one must calculate the derivatives of those variables with respect to the cost function and use an algorithm such as gradient descent to push the weights closer towards their optimal values.

In school I am assuming you learned how to take the derivatives of polynomials and various complex expressions. For myself, I now think that most of that was a waste of time.

It is possible to calculate the derivatives of an arbitrary expression such as the one above completely automatically in both the forward and reverse mode by simple local rules.

To do that let me simplify the above for clarity’s sake and assume that W,W2,bias,bias2 are not matrices, but single variables instead. I’ll also take out the sum as there is nothing to sum here and change ‘targets’ to ‘target’.

y = (target-sigmoid(W2 * tanh(W*input+bias) + bias2))^2

// Scalar version of the XOR net's cost, evaluated with concrete values.
let sigmoid a = 1.0f/(1.0f+exp(-a))
let target = -0.5f
let input = 1.0f
let W = 1.5f
let W2 = 2.0f
let bias = 0.25f
let bias2 = 0.0f
// y = (target - sigmoid(W2 * tanh(W*input + bias) + bias2))^2
let y = (target-sigmoid(W2 * tanh(W*input+bias) + bias2))**2.0f

Output:

val sigmoid : a:float32 ->; float32 val target : float32 = -0.5f val input : float32 = 1.0f val W : float32 = 1.5f val W2 : float32 = 2.0f val bias : float32 = 0.25f val bias2 : float32 = 0.0f val y : float32 = 1.87122381f

A person calculating this by hand, and the compiler itself, would break the large expression down into individual pieces and evaluate them separately.

Let us go through it step by step:

// These are the original assignments. In AD literature the starting variables are denoted from i up to 0, but here they will start at 0.
// Starting from scratch, this would be the evaluation trace of the program had I decompiled it.
let v0 = target
let v1 = input
let v2 = W
let v3 = W2
let v4 = bias
let v5 = bias2
// The first calculation is W*input = v2*v1.
let v6 = v2*v1
// Then comes v6+bias=v6+v4
let v7 = v6+v4
// Then comes tanh(v7)
let v8 = tanh(v7)
// Then comes W2*v8=v3*v8.
let v9 = v3*v8
// Then comes v9+bias2=v9+v5
let v10 = v9+v5
// Then comes sigmoid(v10)
let v11 = sigmoid(v10)
// Then comes target-v11=v0-v11
let v12 = v0-v11
// Then comes v12**2.0f; v13 equals y from the original expression.
let v13 = v12**2.0f

Output:

val v0 : float32 = -0.5f val v1 : float32 = 1.0f val v2 : float32 = 1.5f val v3 : float32 = 2.0f val v4 : float32 = 0.25f val v5 : float32 = 0.0f val v6 : float32 = 1.5f val v7 : float32 = 1.75f val v8 : float32 = 0.941375554f val v9 : float32 = 1.88275111f val v10 : float32 = 1.88275111f val v11 : float32 = 0.867926776f val v12 : float32 = -1.36792684f val v13 : float32 = 1.87122381f

v13 is exactly the same as y.

Let us put aside the above evaluation trace for a moment and consider how one would differentiate much simpler examples – a*b for starters.

// Derivatives of c = a*b taken in isolation: dc/da = b and dc/db = a.
module Isolated =
    let a = 3.0f
    let b = 2.0f
    let isolated_mult a b =
        let c = a*b
        let dc_a = b // dc/da = b - The derivative of c with respect to a
        let dc_b = a // dc/db = a - The derivative of c with respect to b
        c,dc_a,dc_b
    let c,dc_a,dc_b = isolated_mult a b

Output:

module Isolated = begin val a : float32 = 3.0f val b : float32 = 2.0f val isolated_mult : a:float32 -> b:float32 -> float32 * float32 * float32 val dc_b : float32 = 3.0f val dc_a : float32 = 2.0f val c : float32 = 6.0f end

Picture it as a graph with nodes a and b leading into c. In the isolated example above as c has no nodes leading into it the correct steps would indeed be the above. But had this been a part of the larger expression such as (a*a)*(b*b) it would be wrong.

What needs to be added is the error coming from above.

dc_a and dc_b need to know the error at c. In reverse mode parlance such directional gradients or errors are also called adjoints.

// The functions in here propagate the error from upwards to downwards.
module Propagating =
    let a = 3.0f
    let b = 2.0f
    let prop_mult a b =
        let c = a*b
        // http://stackoverflow.com/questions/36636/what-is-a-closure
        let fdc_a error_c = error_c*b // dc/da = error*b - The derivative of c with respect to a. This is a function.
        let fdc_b error_c = error_c*a // dc/db = error*a - The derivative of c with respect to b. This is a function.
        c,fdc_a,fdc_b
    let c,fdc_a,fdc_b = prop_mult a b
    // Feeding 1.0f in at the root recovers the plain derivatives.
    let dc_a = fdc_a 1.0f
    let dc_b = fdc_b 1.0f

Output:

module Propagating = begin val a : float32 = 3.0f val b : float32 = 2.0f val prop_mult : a:float32 -> b:float32 -> float32 * (float32 -> float32) * (float32 -> float32) val fdc_b : (float32 -> float32) val fdc_a : (float32 -> float32) val c : float32 = 6.0f val dc_a : float32 = 2.0f val dc_b : float32 = 3.0f end

In the above I did a little sleight of hand and modified the dc_a and dc_b into functions. Now they need to be fed the error at the node they are at.

Let us try a more complex example: (a*b)*(a*b).

// Reverse-mode differentiation of (a*b)*(a*b) by hand-propagating adjoints.
module Propagating2 =
    let a = 3.0f
    let b = 2.0f
    let prop_mult a b =
        let c = a*b
        // http://stackoverflow.com/questions/36636/what-is-a-closure
        let fdc_a error_c = error_c*b // dc/da = error*b - The derivative of c with respect to a. This is a function.
        let fdc_b error_c = error_c*a // dc/db = error*a - The derivative of c with respect to b. This is a function.
        c,fdc_a,fdc_b
    // (a*b)*(a*b)
    let c,fdc_a,fdc_b = prop_mult a b // c=a*b
    let d,fdc_a',fdc_b' = prop_mult a b // d=a*b
    let e,fde_c,fde_d = prop_mult c d // e=c*d
    let er_c, er_d = fde_c 1.0f, fde_d 1.0f // errors (or adjoints) of e with respect to c and d
    // Bug fix: the original fed er_c into d's closures and er_d into c's. Here
    // er_c = er_d = 6.0f so the printed results were unchanged, but the correct
    // wiring is for each branch to receive the adjoint of its own output.
    let er_a', er_b' = fdc_a' er_d, fdc_b' er_d // adjoints of a and b through d
    let er_a, er_b = fdc_a er_c, fdc_b er_c // adjoints of a and b through c
    // The two contributions to each input's adjoint are summed.
    let adjoint_a = er_a+er_a'
    let adjoint_b = er_b+er_b'

Output:

module Propagating2 = begin val a : float32 = 3.0f val b : float32 = 2.0f val prop_mult : a:float32 -> b:float32 -> float32 * (float32 -> float32) * (float32 -> float32) val fdc_b : (float32 -> float32) val fdc_a : (float32 -> float32) val c : float32 = 6.0f val fdc_b' : (float32 -> float32) val fdc_a' : (float32 -> float32) val d : float32 = 6.0f val fde_d : (float32 -> float32) val fde_c : (float32 -> float32) val e : float32 = 36.0f val er_d : float32 = 6.0f val er_c : float32 = 6.0f val er_b' : float32 = 18.0f val er_a' : float32 = 12.0f val er_b : float32 = 18.0f val er_a : float32 = 12.0f val adjoint_a : float32 = 24.0f val adjoint_b : float32 = 36.0f end

The adjoint of a is 24 and the adjoint of b is 36.

This one is easy to check by hand.

e = a^2*b^2

de / da = 2*a*b^2 = 2*3*2*2 = 24

de / db = 2*a^2*b = 2*3*3*2 = 36

No problem. The above is correct, but requires us to manually push in values into the returned functions. It is easy to modify the above procedure so that is not necessary.

// Same computation, but the adjoints now live inside the Df nodes and each
// backwards closure accumulates into them directly.
module Propagating3 =
    type Df =
        struct // Struct is similar to record or a class except it is stack allocated.
            val P : float32 // primal
            val A : float32 ref // adjoint (reference type)
            new p = {P=p;A=ref 0.0f}
        end
    let a = Df 3.0f
    let b = Df 2.0f
    let mult (a: Df) (b: Df) =
        let c = Df (a.P*b.P)
        // http://stackoverflow.com/questions/36636/what-is-a-closure
        let fb() = //The function for the backwards pass.
            a.A := !c.A*b.P + !a.A
            b.A := !c.A*a.P + !b.A
        c,fb
    // (a*b)*(a*b)
    let c,fc = mult a b // c=a*b
    let d,fd = mult a b // d=a*b
    let e,fe = mult c d // e=c*d
    e.A := 1.0f // Feed the 1.0f at the top.
    // The backward closures are run in reverse order of construction.
    fe() // Crank the machine
    fd() // Crank the machine
    fc() // Crank the machine
    let adjoint_a = !a.A
    let adjoint_b = !b.A

Output:

module Propagating3 = begin type Df = struct new : p:float32 -> Df val P: float32 val A: float32 ref end val a : Df = FSI_0005+Propagating3+Df val b : Df = FSI_0005+Propagating3+Df val mult : a:Df -> b:Df -> Df * (unit -> unit) val fc : (unit -> unit) val c : Df = FSI_0005+Propagating3+Df val fd : (unit -> unit) val d : Df = FSI_0005+Propagating3+Df val fe : (unit -> unit) val e : Df = FSI_0005+Propagating3+Df val adjoint_a : float32 = 24.0f val adjoint_b : float32 = 36.0f end

It is kind of ugly now, but the output is obvious. Behind the scenes, the closures are compiled into classes. The above could be also done in a language like C++ using virtual function overloads, but with closures the pattern is unparalleled in its elegance.

The above is essentially, the main design pattern of the Spiral library. It can be done better still:

// Same again, but the backward closures are recorded on a tape so the reverse
// pass is a single loop instead of manual calls in the right order.
module Propagating4 =
    open System.Collections.Generic
    type Df =
        struct // Struct is similar to record or a class except it is stack allocated.
            val P : float32 // primal
            val A : float32 ref // adjoint (reference type)
            new p = {P=p;A=ref 0.0f}
        end
    let a = Df 3.0f
    let b = Df 2.0f
    let tape = List<unit -> unit>() // List is not really a list, but a dynamic array. unit -> unit is the function type that takes no parameters and returns nothing.
    let mult (a: Df) (b: Df) =
        let c = Df (a.P*b.P)
        // http://stackoverflow.com/questions/36636/what-is-a-closure
        let fb() = //The function for the backwards pass.
            a.A := !c.A*b.P + !a.A
            b.A := !c.A*a.P + !b.A
        tape.Add(fb)
        c
    // (a*b)*(a*b)
    let c = mult a b // c=a*b
    let d = mult a b // d=a*b
    let e = mult c d // e=c*d
    e.A := 1.0f // Feed the 1.0f at the top.
    for i=tape.Count-1 downto 0 do tape.[i]() // Let the computer crank it for you from top to bottom.
    let adjoint_a = !a.A
    let adjoint_b = !b.A

Output:

module Propagating4 = begin type Df = struct new : p:float32 -> Df val P: float32 val A: float32 ref end val a : Df = FSI_0006+Propagating4+Df val b : Df = FSI_0006+Propagating4+Df val tape : System.Collections.Generic.List<(unit -> unit)> val mult : a:Df -> b:Df -> Df val c : Df = FSI_0006+Propagating4+Df val d : Df = FSI_0006+Propagating4+Df val e : Df = FSI_0006+Propagating4+Df val adjoint_a : float32 = 24.0f val adjoint_b : float32 = 36.0f end

Just put all the operations into a tape and then run it backwards. No problem.

I think this is generally the difference between backpropagation and reverse AD. One buries you in calculus and the other just gives you a crank. The tape is the favored approach of reverse AD.

In theory one should not have to study AD explicitly to figure this out, but showing it like this makes a difference.

F# does support operator overloading:

module Propagating5 = open System.Collections.Generic type Df = struct // Struct is similar to record or a class except it is stack allocated. val P : float32 // primal val A : float32 ref // adjoint (reference type) new p = {P=p;A=ref 0.0f} end let a = Df 3.0f let b = Df 2.0f let tape = List<unit -> unit>() // List is not really a list, but a dynamic array. unit -> unit is the function type that takes no parameters and returns nothing. let mult (a: Df) (b: Df) = let c = Df (a.P*b.P) // http://stackoverflow.com/questions/36636/what-is-a-closure let fb() = //The function for the backwards pass. a.A := !c.A*b.P + !a.A b.A := !c.A*a.P + !b.A tape.Add(fb) c type Df with static member inline (*)(a: Df, b: Df) = mult a b // The overloaded * operator // (a*b)*(a*b) let e = (a*b)*(a*b) e.A := 1.0f // Feed the 1.0f at the top. for i=tape.Count-1 downto 0 do tape.[i]() // Let the computer crank it for you from top to bottom. let adjoint_a = !a.A let adjoint_b = !b.A

Output:

module Propagating5 = begin type Df = struct new : p:float32 -> Df val P: float32 val A: float32 ref static member ( * ) : a:Df * b:Df -> Df end val a : Df = FSI_0007+Propagating5+Df val b : Df = FSI_0007+Propagating5+Df val tape : System.Collections.Generic.List<(unit -> unit)> val mult : a:Df -> b:Df -> Df val e : Df = FSI_0007+Propagating5+Df val adjoint_a : float32 = 24.0f val adjoint_b : float32 = 36.0f end

The above overload is just an aesthetic improvement, but right now I haven’t even put it into the Spiral library yet. With this the basic pattern is complete.

With it we can make the backward pass as we go forward.

Let us go back to the first example:

module ReverseADExample = type Df = struct // Struct is similar to record or a class except it is stack allocated. val P : float32 // primal val A : float32 ref // adjoint (reference type) new (p) = {P=p;A=ref 0.0f} new (p,t) = {P=p;A=ref t} end with override t.ToString() = sprintf "(%f,%f)" t.P !t.A // To make F# Interactive print out the fields member t.dup = Df(t.P,!t.A) // Makes a duplicate of the struct. open System.Collections.Generic let tape = List<unit -> unit>() // List is not really a list, but a dynamic array. unit -> unit is the function type that takes no parameters and returns nothing. let sigmoid (a: Df) = let c = Df (1.0f/(1.0f+exp(-a.P))) let fb() = // The function for the backwards pass. a.A := !c.A*c.P*(1.0f-c.P) + !a.A tape.Add(fb) c let tanh (a: Df) = let c = Df (tanh a.P) let fb() = // The function for the backwards pass. a.A := !c.A*(1.0f-c.P*c.P) + !a.A tape.Add(fb) c let pow (a: Df) b = let c = Df (a.P**b) let fb() = // The function for the backwards pass. a.A := !c.A*b*(c.P/a.P) + !a.A tape.Add(fb) c let mult (a: Df) (b: Df) = let c = Df (a.P*b.P) let fb() = //The function for the backwards pass. a.A := !c.A*b.P + !a.A b.A := !c.A*a.P + !b.A tape.Add(fb) c let add (a: Df) (b: Df) = let c = Df (a.P+b.P) let fb() = //The function for the backwards pass. a.A := !c.A + !a.A b.A := !c.A + !b.A tape.Add(fb) c let sub (a: Df) (b: Df) = let c = Df (a.P-b.P) let fb() = //The function for the backwards pass. 
a.A := !c.A + !a.A b.A := !b.A - !c.A tape.Add(fb) c type Df with static member inline (*)(a: Df, b: Df) = mult a b // The overloaded * operator static member inline (+)(a: Df, b: Df) = add a b // The overloaded + operator static member inline (-)(a: Df, b: Df) = sub a b // The overloaded - operator static member inline Pow(a: Df, b) = pow a b // The overloaded ** operator let target = Df -0.5f let input = Df 1.0f let W = Df 1.5f let W2 = Df 2.0f let bias = Df 0.25f let bias2 = Df 0.0f //let y = (target-sigmoid(W2 * tanh(W*input+bias) + bias2))**2.0f // These are the original assignments. In AD literature the starting variables are denoted from i up to 0, but here they will start at 0. // Starting from scratch, this would be the evaluation trace of the program had I decompiled it. let v0 = target.dup // The reason for these duplicates is to emphasize that the F# Interactive will output only the values of the final run. let v1 = input.dup // In this case it makes no difference as they will be the same either way, but it would improper to not copy them. let v2 = W.dup let v3 = W2.dup let v4 = bias.dup let v5 = bias2.dup // The first calculation is W*input = v2*v1. let v6 = v2*v1 // Then comes v6+bias=v6+v4 let v7 = v6+v4 // Then comes tanh(v7) let v8 = tanh(v7) // Then comes W2*v8=v3*v8. let v9 = v3*v8 // Then comes v9+bias2=v9+v5 let v10 = v9+v5 // Then comes sigmoid(v10) let v11 = sigmoid(v10) // Then comes target-v11=v0-v11 let v12 = v0-v11 // Then comes v12**2.0f let v13 = v12**2.0f v13.A := 1.0f // Feed the 1.0f at the top. for i=tape.Count-1 downto 0 do tape.[i]() // Let the computer crank it for you from top to bottom. let adjoint_W = !v2.A let adjoint_W2 = !v3.A let adjoint_bias = !v4.A let adjoint_bias2 = !v5.A // Once more from the top. tape.Clear() let y = (target-sigmoid(W2 * tanh(W*input+bias) + bias2))**2.0f y.A := 1.0f for i=tape.Count-1 downto 0 do tape.[i]() // Let the computer crank it for you from top to bottom. 
let adjoint_W' = !W.A let adjoint_W2' = !W2.A let adjoint_bias' = !bias.A let adjoint_bias2' = !bias2.A

Output:

module ReverseADExample = begin type Df = struct new : p:float32 -> Df val P: float32 val A: float32 ref override ToString : unit -> string static member Pow : a:Df * b:float32 -> Df static member ( + ) : a:Df * b:Df -> Df static member ( * ) : a:Df * b:Df -> Df static member ( - ) : a:Df * b:Df -> Df end val tape : System.Collections.Generic.List<(unit -> unit)> val sigmoid : a:Df -> Df val tanh : a:Df -> Df val pow : a:Df -> b:float32 -> Df val mult : a:Df -> b:Df -> Df val add : a:Df -> b:Df -> Df val sub : a:Df -> b:Df -> Df val target : Df = (-0.500000,-2.735854) val input : Df = (1.000000,0.107078) val W : Df = (1.500000,0.071385) val W2 : Df = (2.000000,0.295225) val bias : Df = (0.250000,0.071385) val bias2 : Df = (0.000000,0.313611) val v0 : Df = (-0.500000,-2.735854) val v1 : Df = (1.000000,0.107078) val v2 : Df = (1.500000,0.071385) val v3 : Df = (2.000000,0.295225) val v4 : Df = (0.250000,0.071385) val v5 : Df = (0.000000,0.313611) val v6 : Df = (1.500000,0.071385) val v7 : Df = (1.750000,0.071385) val v8 : Df = (0.941376,0.627221) val v9 : Df = (1.882751,0.313611) val v10 : Df = (1.882751,0.313611) val v11 : Df = (0.867927,2.735854) val v12 : Df = (-1.367927,-2.735854) val v13 : Df = (1.871224,1.000000) val adjoint_W : float32 = 0.0713853538f val adjoint_W2 : float32 = 0.295225322f val adjoint_bias : float32 = 0.0713853538f val adjoint_bias2 : float32 = 0.313610584f val l : unit = () val y : Df = (1.871224,1.000000) val adjoint_W' : float32 = 0.0713853538f val adjoint_W2' : float32 = 0.295225322f val adjoint_bias' : float32 = 0.0713853538f val adjoint_bias2' : float32 = 0.313610584f end

I checked it with DiffSharp and surprisingly it is correct on the first try. DiffSharp has more advanced features than Spiral, but at the moment it has no Cuda support or inplace operations.

In order for me to explain how to calculate the Hessian using AD, it is necessary for me to explain the forward AD mode first. As the opposite of the reverse mode, it can be used to calculate the derivative of a function R -> R^n. By iterating it similarly to finite difference approximation one might use for gradient checking, it is possible to calculate R^m -> R^n.

As ML algorithms in general are R^m to R that means one would have to rerun the algorithm m times to get all the weight gradients.

Forward mode is pretty simple to understand.

One just attaches an extra number to the Df similar to the adjoint, called the tangent.

module Forward1 = open System.Collections.Generic type Df = struct // Struct is similar to record or a class except it is stack allocated. val P : float32 // primal val T : float32 ref // tangent (reference type) new (p) = {P=p; T=ref 0.0f} new (p,t) = {P=p; T=ref t} end with override t.ToString() = sprintf "(%f,%f)" t.P !t.T // To make F# Interactive print out the fields let a = Df 3.0f let b = Df 2.0f let mult (a: Df) (b: Df) = let cp = a.P*b.P // primal let ct = !a.T * b.P + a.P * !b.T // tangent let c = Df(cp,ct) c type Df with static member inline (*)(a: Df, b: Df) = mult a b // The overloaded * operator // In order to get the derivative of the cost with respect to a, set a's tangent to 1.0f and every others' to 0.0f a.T := 1.0f b.T := 0.0f let c = a*a*b*b // In order to get the derivative of the cost with respect to b, set a's tangent to 1.0f and every others' to 0.0f a.T := 0.0f b.T := 1.0f let c' = a*a*b*b

Output:

module Forward1 = begin type Df = struct new : p:float32 -> Df new : p:float32 * t:float32 -> Df val P: float32 val T: float32 ref override ToString : unit -> string static member ( * ) : a:Df * b:Df -> Df end val a : Df = (3.000000,0.000000) val b : Df = (2.000000,1.000000) val mult : a:Df -> b:Df -> Df val c : Df = (36.000000,24.000000) val c' : Df = (36.000000,36.000000) end

The tangent is in the second field. 24 and 36 seems about right for a and b. Tangents are really just errors from below.

c = a*b

c’ = a’*b+a*b’

In the above equation we just replace the a’ and b’ with these dual numbers as we move up the graph. In the reverse mode we would just do that from the opposite side – on the way down.

To get second order derivatives, we combine reverse and the forward mode:

module Hessian = open System.Collections.Generic type Df = struct // Struct is similar to record or a class except it is stack allocated. val P : float32 // primal val T : float32 ref // tangent (reference type) val A : float32 ref // adjoint (reference type) val TA : float32 ref // tangent of the adjoint (reference type) new (p) = {P=p; T=ref 0.0f; A=ref 0.0f; TA=ref 0.0f} new (p,t) = {P=p; T=ref t; A=ref 0.0f; TA=ref 0.0f} end with override t.ToString() = sprintf "(primal=%f,tangent=%f,adjoint=%f,tangent of adjoint=%f)" t.P !t.T !t.A !t.TA // To make F# Interactive print out the fields let a = Df 3.0f let b = Df 2.0f let tape = List<unit -> unit>() // List is not really a list, but a dynamic array. unit -> unit is the function type that takes no parameters and returns nothing. let mult (a: Df) (b: Df) = let cp = a.P*b.P // primal let ct = !a.T * b.P + a.P * !b.T // tangent let c = Df(cp,ct) let fb() = a.A := !c.A*b.P + !a.A b.A := !c.A*a.P + !b.A // The derivative of !c.A*b.P is !c.TA*b.P + !c.A* !b.T // We also run the forward mode during the reverse. // This calculates the Hessian multiplied by a vector. a.TA := !c.TA*b.P + !c.A* !b.T + !a.TA b.TA := !c.TA*a.P + !c.A* !a.T + !b.TA tape.Add(fb) c type Df with static member inline (*)(a: Df, b: Df) = mult a b // The overloaded * operator // In order to get the derivative of the cost with respect to a, set a's tangent to 1.0f and every others' to 0.0f a.T := 1.0f b.T := 0.0f let c = a*a*b*b c.A := 1.0f for i=tape.Count-1 downto 0 do tape.[i]() // Let the computer crank it for you from top to bottom. printfn "The elements of the Hessian are inside the tangent of the adjoint." printfn "a=%A" a printfn "b=%A" b // Once more from the top. [|a;b|] |> Array.iter (fun x -> x.A := 0.0f;x.T := 0.0f;x.TA := 0.0f) // Reset the adjoints and the tangents of the base variables. 
a.T := 0.0f b.T := 1.0f tape.Clear() let c' = a*a*b*b c'.A := 1.0f for i=tape.Count-1 downto 0 do tape.[i]() // Let the computer crank it for you from top to bottom. printfn "a=%A" a printfn "b=%A" b

Output:

The elements of the Hessian are inside the tangent of the adjoint. a=(primal=3.000000,tangent=1.000000,adjoint=24.000000,tangent of adjoint=8.000000) b=(primal=2.000000,tangent=0.000000,adjoint=36.000000,tangent of adjoint=24.000000) a=(primal=3.000000,tangent=0.000000,adjoint=24.000000,tangent of adjoint=24.000000) b=(primal=2.000000,tangent=1.000000,adjoint=36.000000,tangent of adjoint=18.000000) // I've omitted the rest

So based on the above:

d^2 c / d a^2 = 8

d^2 c / (d a d b) = 24

d^2 c / d b^2 = 18

What I did is attach a tangent to the adjoint so that in effect we are running forward mode on the way down along with the reverse. At the end we get the elements of the Hessian as shown in the output.

With the above tools, one has everything one needs to implement any machine learning algorithm imaginable. For myself, it took me quite a while to figure out second order differentiation. I definitely could not find any info on how to do it online.

Now I know and now that I revealed the secret, you do too.

I do not actually intend to extend the support for this in the library as it would place too great a burden on me as a programmer to support a feature I do not need, but there is satisfaction in knowing how to do this in case it becomes needed.

While the above examples were on scalars, they extend naturally to matrix operations.

Having covered the basics, the next few posts will be a stroll down memory lane.

The full code for post 6 is at the usual place.

In order to make any kind of machine learning algorithm work, the map operations are essential. Relu, sigmoid, tanh…for such functions and for more complex kinds like k-selection, it is necessary to apply map operations to the result of Ax+b.

Before just last month I started working with ManagedCuda, most of my examples were written in Alea. It is a full .NET Cuda compilation solution. It has various advanced features such as the ability to take anonymous function inside its map kernel, compilation to PTX directly off F# code and even a garbage collection scheme for its objects.

Almost by accident, when I rewrote the library the first time in ManagedCuda which does not have nearly as advanced features as Alea – it is mostly a wrapper library – I noticed a 3-4 fold speedup in a straight up comparison. At the time of writing, I am not sure whether that is due to the fact that ManagedCuda provides access to the latest Cuda libraries (7.5) while Alea is stuck on 6.5 currently or something else.

At any rate, interfacing with Cuda even with access to its libraries would be impossible without some means of doing runtime compilation. Thankfully, Nvidia provides exactly such a tool – NVRTC – as a part of its SDK.

It allows one to compile C++ code as a string like so:

let inline divup a b = (a+b-1)/b // Division with rounding up. /// o <- f(x) type DeviceUnaryTransformModule(op: string) = let block_size = 256 let kernel_code = " //Kernel code: extern \"C\" { __device__ inline "+FloatTypeCpp+" op("+FloatTypeCpp+" x) { return "+op+" } // Device code __global__ void Map1Kernel(const "+FloatTypeCpp+"* A, "+FloatTypeCpp+"* O, const int N) { int i = blockDim.x * blockIdx.x + threadIdx.x; const int stride = blockDim.x * gridDim.x; while (i < N) { O[i] = op(A[i]); i += stride; } } } " let k = new ManagedCuda.NVRTC.CudaRuntimeCompiler(kernel_code,"Map1Kernel") do try k.Compile([||]) with | NVRTCException as x -> printfn "%s" (k.GetLogAsString()) reraise() let kernel = ctx.LoadKernelPTX(k.GetPTX(),"Map1Kernel") member t.A(x: CudaDeviceVariable<floatType>) = let n = int x.Size let o = new_dev<floatType> n let gridSize = min (2*numSm*(1024/block_size)) (divup n block_size) kernel.GridDimensions <- dim3(gridSize) kernel.BlockDimensions <- dim3(block_size) kernel.RunAsync(str.Stream, x.DevicePointer,o.DevicePointer,n) |> ignore o member t.A(x: CudaDeviceVariable<floatType>, o: CudaDeviceVariable<floatType>) = let n = int o.Size let gridSize = min (2*numSm*(1024/block_size)) (divup n block_size) kernel.GridDimensions <- dim3(gridSize) kernel.BlockDimensions <- dim3(block_size) kernel.RunAsync(str.Stream, x.DevicePointer,o.DevicePointer,n) |> ignore member t.A(x: dMatrix) = let o = dMatrix.create(x.num_rows,x.num_cols) t.A(x,o) o member t.A(x: dMatrix, o: dMatrix) = if x.rc <> o.rc then failwith "x.rc <> o.rc in DeviceUnaryTransformModule" t.A(x.dArray,o.dArray)

When the above class is created, NVRTC takes in the C++ string and compiles it into a kernel that can then be run like a class function.

Here is an example with the most commonly used ML activations.

let rng = System.Random() let n = 9 let h_a = Array.init n (fun _ -> (rng.NextDouble()-0.5)*6.0 |> floatType) let a = dMatrix.create(3,3,h_a) let sigmoidModule = DeviceUnaryTransformModule "1.0f / (1.0f + expf(-x));" let tanhModule = DeviceUnaryTransformModule "tanhf(x);" let reluModule = DeviceUnaryTransformModule "x > 0.0f ? x : 0.0f;" let sig_a = sigmoidModule.A(a) let tanh_a = tanhModule.A(a) let relu_a = reluModule.A(a) let a' = a.Gather'() let sig_a' = sig_a.Gather'() let tanh_a' = tanh_a.Gather'() let relu_a' = relu_a.Gather'()

Output:

val rng : Random val n : int = 9 val h_a : float32 [] = [|-0.665023863f; -0.248744547f; 0.32134819f; 0.774180651f; -0.298591435f; -2.38772154f; 1.29249966f; 0.437697798f; 1.47583818f|] val a : dMatrix = {num_rows = 3; num_cols = 3; dArray = ManagedCuda.CudaDeviceVariable`1[System.Single];} val sigmoidModule : DeviceUnaryTransformModule val tanhModule : DeviceUnaryTransformModule val reluModule : DeviceUnaryTransformModule val sig_a : dMatrix = {num_rows = 3; num_cols = 3; dArray = ManagedCuda.CudaDeviceVariable`1[System.Single];} val tanh_a : dMatrix = {num_rows = 3; num_cols = 3; dArray = ManagedCuda.CudaDeviceVariable`1[System.Single];} val relu_a : dMatrix = {num_rows = 3; num_cols = 3; dArray = ManagedCuda.CudaDeviceVariable`1[System.Single];} val a' : floatType [,] = [[-0.665023863f; 0.774180651f; 1.29249966f] [-0.248744547f; -0.298591435f; 0.437697798f] [0.32134819f; -2.38772154f; 1.47583818f]] val sig_a' : floatType [,] = [[0.339611977f; 0.684424579f; 0.784569979f] [0.438132524f; 0.42590186f; 0.607710302f] [0.579652786f; 0.0841137916f; 0.813943148f]] val tanh_a' : floatType [,] = [[-0.581697047f; 0.649353862f; 0.859779835f] [-0.24373816f; -0.290023059f; 0.411734343f] [0.310725451f; -0.983272374f; 0.90068531f]] val relu_a' : floatType [,] = [[0.0f; 0.774180651f; 1.29249966f] [0.0f; 0.0f; 0.437697798f] [0.32134819f; 0.0f; 1.47583818f]]

Seems decent enough. The above kernel just iterates over each element in parallel, applying the specified operation that can be inserted as a text string, similar to an anonymous function.

Here are the rest of the simple map kernels:

/// o <- f(x,y) type DeviceBinaryTransformModule(op: string) = let block_size = 256 let kernel_code = " //Kernel code: extern \"C\" { __device__ inline "+FloatTypeCpp+" op("+FloatTypeCpp+" x, "+FloatTypeCpp+" y) { return "+op+" } // Device code __global__ void Map2Kernel(const "+FloatTypeCpp+"* A, const "+FloatTypeCpp+"* B, "+FloatTypeCpp+"* O, const int N) { int i = blockDim.x * blockIdx.x + threadIdx.x; const int stride = blockDim.x * gridDim.x; while (i < N) { O[i] = op(A[i],B[i]); i += stride; } } } " let k = new ManagedCuda.NVRTC.CudaRuntimeCompiler(kernel_code,"Map2Kernel") do try k.Compile([||]) with | NVRTCException as x -> printfn "%s" (k.GetLogAsString()) reraise() let kernel = ctx.LoadKernelPTX(k.GetPTX(),"Map2Kernel") member t.A(x: CudaDeviceVariable<floatType>, y: CudaDeviceVariable<floatType>) = let n = int x.Size let o = new_dev<floatType> n let gridSize = min (2*numSm*(1024/block_size)) (divup n block_size) kernel.GridDimensions <- dim3(gridSize) kernel.BlockDimensions <- dim3(block_size) kernel.RunAsync(str.Stream, x.DevicePointer,y.DevicePointer,o.DevicePointer,n) |> ignore o member t.A(x: CudaDeviceVariable<floatType>, y: CudaDeviceVariable<floatType>, o: CudaDeviceVariable<floatType>) = let n = int o.Size let gridSize = min (2*numSm*(1024/block_size)) (divup n block_size) kernel.GridDimensions <- dim3(gridSize) kernel.BlockDimensions <- dim3(block_size) kernel.RunAsync(str.Stream, x.DevicePointer,y.DevicePointer,o.DevicePointer,n) |> ignore member t.A(x: dMatrix, y: dMatrix) = let o = dMatrix.create(x.num_rows,x.num_cols) t.A(x,y,o) o member t.A(x: dMatrix, y: dMatrix, o: dMatrix) = if x.rc <> y.rc then failwith "x.rc <> y.rc in DeviceBinaryTransformModule" if y.rc <> o.rc then failwith "y.rc <> o.rc in DeviceBinaryTransformModule" t.A(x.dArray,y.dArray,o.dArray) /// o <- f(x,y,z) type DeviceTrinaryTransformModule(op: string) = let block_size = 256 let kernel_code = " //Kernel code: extern \"C\" { __device__ inline "+FloatTypeCpp+" 
op("+FloatTypeCpp+" x, "+FloatTypeCpp+" y, "+FloatTypeCpp+" z) { return "+op+" } // Device code __global__ void Map3Kernel(const "+FloatTypeCpp+"* A, const "+FloatTypeCpp+"* B, const "+FloatTypeCpp+"* C, "+FloatTypeCpp+"* O, const int N) { int i = blockDim.x * blockIdx.x + threadIdx.x; const int stride = blockDim.x * gridDim.x; while (i < N) { O[i] = op(A[i],B[i],C[i]); i += stride; } } } " let k = new ManagedCuda.NVRTC.CudaRuntimeCompiler(kernel_code,"Map3Kernel") do try k.Compile([||]) with | NVRTCException as x -> printfn "%s" (k.GetLogAsString()) reraise() let kernel = ctx.LoadKernelPTX(k.GetPTX(),"Map3Kernel") member t.A(x: CudaDeviceVariable<floatType>, y: CudaDeviceVariable<floatType>, z: CudaDeviceVariable<floatType>) = let n = int x.Size let o = new_dev<floatType> n let gridSize = min (2*numSm*(1024/block_size)) (divup n block_size) kernel.GridDimensions <- dim3(gridSize) kernel.BlockDimensions <- dim3(block_size) kernel.RunAsync(str.Stream, x.DevicePointer,y.DevicePointer,z.DevicePointer,o.DevicePointer,n) |> ignore o member t.A(x: CudaDeviceVariable<floatType>, y: CudaDeviceVariable<floatType>, z: CudaDeviceVariable<floatType>, o: CudaDeviceVariable<floatType>) = let n = int o.Size let gridSize = min (2*numSm*(1024/block_size)) (divup n block_size) kernel.GridDimensions <- dim3(gridSize) kernel.BlockDimensions <- dim3(block_size) kernel.RunAsync(str.Stream, x.DevicePointer,y.DevicePointer,z.DevicePointer,o.DevicePointer,n) |> ignore member t.A(x: dMatrix, y: dMatrix, z: dMatrix) = let o = dMatrix.create(x.num_rows,x.num_cols) t.A(x,y,z,o) o member t.A(x: dMatrix, y: dMatrix, z: dMatrix, o: dMatrix) = if x.rc <> y.rc then failwith "x.rc <> y.rc in DeviceTrinaryTransformModule" if y.rc <> z.rc then failwith "y.rc <> z.rc in DeviceTrinaryTransformModule" if z.rc <> o.rc then failwith "y.rc <> o.rc in DeviceTrinaryTransformModule" t.A(x.dArray,y.dArray,z.dArray,o.dArray) /// o <- sum(f(x)) type DeviceUnaryMapSumModule(op: string) = let block_size = 256 let 
kernel_code = " //Kernel code: extern \"C\" { __device__ inline "+FloatTypeCpp+" op("+FloatTypeCpp+" x) { return "+op+" } __device__ inline "+FloatTypeCpp+" warpDownReduce("+FloatTypeCpp+" value){ for (int i = 16; i>0; i = i / 2) value += __shfl_down(value, i); return value; } // Device code __global__ void MapSumKernel(const "+FloatTypeCpp+"* A, "+FloatTypeCpp+"* O, const int N) { int i = blockDim.x * blockIdx.x + threadIdx.x; const int stride = blockDim.x * gridDim.x; __shared__ "+FloatTypeCpp+" temp[32]; if (threadIdx.x < 32) temp[threadIdx.x] = 0.0f; "+FloatTypeCpp+" acc = 0.0f; while (i < N) { acc += op(A[i]); i += stride; } __syncthreads(); "+FloatTypeCpp+" out_partial = warpDownReduce(acc); if (threadIdx.x % 32 == 0) temp[threadIdx.x / 32] = out_partial; __syncthreads(); if (threadIdx.x < 32) out_partial = warpDownReduce(temp[threadIdx.x]); if (threadIdx.x == 0) atomicAdd(O, out_partial); } } " let k = new ManagedCuda.NVRTC.CudaRuntimeCompiler(kernel_code,"MapSumKernel") do try k.Compile([|"-arch=compute_30"|]) with | NVRTCException as x -> printfn "%s" (k.GetLogAsString()) reraise() let kernel = ctx.LoadKernelPTX(k.GetPTX(),"MapSumKernel") member t.A(x: CudaDeviceVariable<floatType>) = let n = int x.Size use o = new_dev<floatType> 1 o.Memset(0u) let gridSize = min (2*numSm*(1024/block_size)) (divup n block_size) kernel.GridDimensions <- dim3(gridSize) kernel.BlockDimensions <- dim3(block_size) kernel.RunAsync(str.Stream, x.DevicePointer,o.DevicePointer,n) |> ignore o.[SizeT 0] member t.A(x: dMatrix) = t.A(x.dArray) /// o <- sum(f(x,y)) type DeviceBinaryMapSumModule(op: string) = let block_size = 256 let kernel_code = " //Kernel code: extern \"C\" { __device__ inline "+FloatTypeCpp+" op("+FloatTypeCpp+" x, "+FloatTypeCpp+" y) { return "+op+" } __device__ inline "+FloatTypeCpp+" warpDownReduce("+FloatTypeCpp+" value){ for (int i = 16; i>0; i = i / 2) value += __shfl_down(value, i); return value; } // Device code __global__ void Map2SumKernel(const 
"+FloatTypeCpp+"* A, const "+FloatTypeCpp+"* B, "+FloatTypeCpp+"* O, const int N) { int i = blockDim.x * blockIdx.x + threadIdx.x; const int stride = blockDim.x * gridDim.x; __shared__ "+FloatTypeCpp+" temp[32]; if (threadIdx.x < 32) temp[threadIdx.x] = 0.0f; "+FloatTypeCpp+" acc = 0.0f; while (i < N) { acc += op(A[i],B[i]); i += stride; } __syncthreads(); "+FloatTypeCpp+" out_partial = warpDownReduce(acc); if (threadIdx.x % 32 == 0) temp[threadIdx.x / 32] = out_partial; __syncthreads(); if (threadIdx.x < 32) out_partial = warpDownReduce(temp[threadIdx.x]); if (threadIdx.x == 0) atomicAdd(O, out_partial); } } " let k = new ManagedCuda.NVRTC.CudaRuntimeCompiler(kernel_code,"Map2SumKernel") do try k.Compile([|"-arch=compute_30"|]) with | NVRTCException as x -> printfn "%s" (k.GetLogAsString()) reraise() let kernel = ctx.LoadKernelPTX(k.GetPTX(),"Map2SumKernel") member t.A(x: CudaDeviceVariable<floatType>,y: CudaDeviceVariable<floatType>) = let n = int x.Size use o = new_dev<floatType> 1 o.Memset(0u) let gridSize = min (2*numSm*(1024/block_size)) (divup n block_size) kernel.GridDimensions <- dim3(gridSize) kernel.BlockDimensions <- dim3(block_size) kernel.RunAsync(str.Stream, x.DevicePointer,y.DevicePointer,o.DevicePointer,n) |> ignore o.[SizeT 0] member t.A(x: dMatrix,y: dMatrix) = if x.rc <> y.rc then failwith "x.rc <> y.rc in DeviceBinaryMapSumModule" t.A(x.dArray,y.dArray) /// o <- f(coef_x,x) type DeviceUnaryCoefTransformModule(op: string) = let block_size = 256 let kernel_code = " //Kernel code: extern \"C\" { __device__ inline "+FloatTypeCpp+" op("+FloatTypeCpp+" coef_x, "+FloatTypeCpp+" x) { return "+op+" } // Device code __global__ void MapCoefKernel(const "+FloatTypeCpp+" coef_A, const "+FloatTypeCpp+"* A, "+FloatTypeCpp+"* O, const int N) { int i = blockDim.x * blockIdx.x + threadIdx.x; const int stride = blockDim.x * gridDim.x; while (i < N) { O[i] = op(coef_A,A[i]); i += stride; } } } " let k = new 
ManagedCuda.NVRTC.CudaRuntimeCompiler(kernel_code,"MapCoefKernel") do try k.Compile([||]) with | NVRTCException as x -> printfn "%s" (k.GetLogAsString()) reraise() let kernel = ctx.LoadKernelPTX(k.GetPTX(),"MapCoefKernel") member t.A(coef_x: floatType, x: CudaDeviceVariable<floatType>) = let n = int x.Size let o = new_dev<floatType> n let gridSize = min (2*numSm*(1024/block_size)) (divup n block_size) kernel.GridDimensions <- dim3(gridSize) kernel.BlockDimensions <- dim3(block_size) kernel.RunAsync(str.Stream, coef_x,x.DevicePointer,o.DevicePointer,n) |> ignore o member t.A(coef_x: floatType, x: CudaDeviceVariable<floatType>, o: CudaDeviceVariable<floatType>) = let n = int o.Size let gridSize = min (2*numSm*(1024/block_size)) (divup n block_size) kernel.GridDimensions <- dim3(gridSize) kernel.BlockDimensions <- dim3(block_size) kernel.RunAsync(str.Stream, coef_x,x.DevicePointer,o.DevicePointer,n) |> ignore member t.A(coef_x, x: dMatrix) = let o = dMatrix.create(x.num_rows,x.num_cols) t.A(coef_x,x,o) o member t.A(coef_x, x: dMatrix, o: dMatrix) = if x.rc <> o.rc then failwith "x.rc <> o.rc in DeviceUnaryCoefTransformModule" t.A(coef_x,x.dArray,o.dArray) /// o <- f(coef_x,x,coef_y,y) type DeviceBinaryCoefTransformModule(op: string) = let block_size = 256 let kernel_code = " //Kernel code: extern \"C\" { __device__ inline "+FloatTypeCpp+" op("+FloatTypeCpp+" coef_x, "+FloatTypeCpp+" x, "+FloatTypeCpp+" coef_y, "+FloatTypeCpp+" y) { return "+op+" } // Device code __global__ void MapCoef2Kernel(const "+FloatTypeCpp+" coef_A, const "+FloatTypeCpp+"* A, const "+FloatTypeCpp+" coef_B, const "+FloatTypeCpp+"* B, "+FloatTypeCpp+"* O, const int N) { int i = blockDim.x * blockIdx.x + threadIdx.x; const int stride = blockDim.x * gridDim.x; while (i < N) { O[i] = op(coef_A,A[i],coef_B,B[i]); i += stride; } } } " let k = new ManagedCuda.NVRTC.CudaRuntimeCompiler(kernel_code,"MapCoef2Kernel") do try k.Compile([||]) with | NVRTCException as x -> printfn "%s" (k.GetLogAsString()) 
reraise() let kernel = ctx.LoadKernelPTX(k.GetPTX(),"MapCoef2Kernel") member t.A(coef_x: floatType, x: CudaDeviceVariable<floatType>,coef_y: floatType, y: CudaDeviceVariable<floatType>) = let n = int x.Size let o = new_dev<floatType> n let gridSize = min (2*numSm*(1024/block_size)) (divup n block_size) kernel.GridDimensions <- dim3(gridSize) kernel.BlockDimensions <- dim3(block_size) kernel.RunAsync(str.Stream, coef_x,x.DevicePointer,coef_y, y.DevicePointer,o.DevicePointer,n) |> ignore o member t.A(coef_x: floatType, x: CudaDeviceVariable<floatType>, coef_y: floatType, y: CudaDeviceVariable<floatType>, o: CudaDeviceVariable<floatType>) = let n = int o.Size let gridSize = min (2*numSm*(1024/block_size)) (divup n block_size) kernel.GridDimensions <- dim3(gridSize) kernel.BlockDimensions <- dim3(block_size) kernel.RunAsync(str.Stream, coef_x,x.DevicePointer,coef_y,y.DevicePointer,o.DevicePointer,n) |> ignore member t.A(coef_x, x: dMatrix, coef_y, y: dMatrix) = let o = dMatrix.create(x.num_rows,x.num_cols) t.A(coef_x,x,coef_y,y,o) o member t.A(coef_x, x: dMatrix, coef_y, y: dMatrix, o: dMatrix) = if x.rc <> y.rc then failwith "x.rc <> y.rc in DeviceBinaryCoefTransformModule" if y.rc <> o.rc then failwith "y.rc <> o.rc in DeviceBinaryCoefTransformModule" t.A(coef_x,x.dArray,coef_y,y.dArray,o.dArray)

The DeviceMapSum modules do a block sum reduction after applying a map function and then add up the results using atomics. Despite how it might look, in newer Nvidia cards bank conflicts are resolved in L2 cache, making atomics quite fast for reductions.

It might be good to replace my custom reductions with CUB library calls, but I am afraid that it might make compilation times even slower than they already are.

Cuda has ways to go until it catches up to native code compilation speed.

The hidden purpose of dealing with this low level stuff is to build some mental muscle for when neuromorphic architectures arrive. Even if Cuda is hard to code for, the magic brain chips freshly off the presses will be much more so. All the libraries will have to be pretty much redone when that happens, but the design patterns will remain intact and therein lies the value in having firm foundations.

DeviceCoefTransform modules are similar to the standard map ones except they can take extra scalar arguments. With those one can implement gradient clipping or resize and translate matrices.

// The gradient clipping module. let gradclipModule = DeviceUnaryCoefTransformModule "(x < -coef_x) ? -coef_x : (x > coef_x ? coef_x : x);" // coef_x = scale // coef_y = location // y does not get used. let randMapModule = DeviceBinaryCoefTransformModule "coef_x*(x-0.5f)+coef_y;" type dMatrix with /// Generates a matrix sampled from a random uniform distribution in <-1.0f,1.0f] static member createRandomUniformMatrix weights_num_rows weights_num_cols (scaling_factor : floatType) location = let weights_total_size = weights_num_rows*weights_num_cols let cudaBuffer = new_dev<floatType> weights_total_size cudaRandom.GenerateUniform(cudaBuffer) // 2.0f*scaling_factor ensures that it is rescaled around zero if the scaling_factor is 1.0f. randMapModule.A(2.0f*scaling_factor,cudaBuffer,location,cudaBuffer,cudaBuffer) dMatrix.create(weights_num_rows,weights_num_cols,cudaBuffer) /// Fills matrix by sampling from a random uniform distribution in <-1.0f,1.0f] member t.fillRandomUniformMatrix (scaling_factor : floatType) location = let weights_total_size = t.num_rows*t.num_cols cudaRandom.GenerateUniform(t.dArray) // 2.0f*scaling_factor ensures that it is rescaled around zero if the scaling_factor is 1.0f. randMapModule.A(2.0f*scaling_factor,t.dArray,location,t.dArray,t.dArray)

Again I extend the dMatrix type by giving it the ability to create random matrices. Here is a demonstration of the things built so far in tandem:

// Weights and biases for a small 2-3-1 feedforward net, initialized uniformly at random.
let W = dMatrix.createRandomUniformMatrix 3 2 1.0f 0.0f
let bias = dMatrix.createRandomUniformMatrix 3 1 1.0f 0.0f
let W2 = dMatrix.createRandomUniformMatrix 1 3 1.0f 0.0f
let bias2 = dMatrix.createRandomUniformMatrix 1 1 1.0f 0.0f
// The four XOR input patterns stored as columns, and their target outputs.
let input = dMatrix.create(2,4,[|0.0f;0.0f;0.0f;1.0f;1.0f;0.0f;1.0f;1.0f|])
let targets = dMatrix.create(1,4,[|0.0f;1.0f;1.0f;0.0f|])
// Shorthands for the matrix multiply, the broadcast bias add and the activations.
let inline mm a b = gemm nT nT 1.0f a b
let inline badd a b = broadcastAdd 1.0f a 1.0f b
let inline tanh_act (a: dMatrix) = tanhModule.A(a)
let inline sig_act (a: dMatrix) = sigmoidModule.A(a)
// tanh(W*input + bias)
let a1 = badd (mm W input) bias |> tanh_act
// sigmoid(W2*a1 + bias2)
let o = badd (mm W2 a1) bias2 |> sig_act
// Gather everything back to the host for display.
let W' = W.Gather'()
let bias' = bias.Gather'()
let W2' = W2.Gather'()
let bias2' = bias2.Gather'()
let input' = input.Gather'()
let targets' = targets.Gather'()
let a1' = a1.Gather'()
let o' = o.Gather'()
// Squared-error cost summed over the whole batch.
let squareDifferenceModule = new DeviceBinaryMapSumModule "(x-y)*(x-y);"
let L2_cost = squareDifferenceModule.A(targets,o)

Here is an example run:

val W : dMatrix = {num_rows = 3; num_cols = 2; dArray = ManagedCuda.CudaDeviceVariable`1[System.Single];} val bias : dMatrix = {num_rows = 3; num_cols = 1; dArray = ManagedCuda.CudaDeviceVariable`1[System.Single];} val W2 : dMatrix = {num_rows = 1; num_cols = 3; dArray = ManagedCuda.CudaDeviceVariable`1[System.Single];} val bias2 : dMatrix = {num_rows = 1; num_cols = 1; dArray = ManagedCuda.CudaDeviceVariable`1[System.Single];} val input : dMatrix = {num_rows = 2; num_cols = 4; dArray = ManagedCuda.CudaDeviceVariable`1[System.Single];} val targets : dMatrix = {num_rows = 1; num_cols = 4; dArray = ManagedCuda.CudaDeviceVariable`1[System.Single];} val inline mm : a:dMatrix -> b:dMatrix -> dMatrix val inline badd : a:dMatrix -> b:dMatrix -> dMatrix val inline tanh_act : a:dMatrix -> dMatrix val inline sig_act : a:dMatrix -> dMatrix val a1 : dMatrix = {num_rows = 3; num_cols = 4; dArray = ManagedCuda.CudaDeviceVariable`1[System.Single];} val o : dMatrix = {num_rows = 1; num_cols = 4; dArray = ManagedCuda.CudaDeviceVariable`1[System.Single];} val W' : floatType [,] = [[-0.443403065f; -0.249918759f] [0.238359213f; -0.51902771f] [0.175117373f; -0.170489728f]] val bias' : floatType [,] = [[-0.812610984f] [0.265104771f] [-0.430138469f]] val W2' : floatType [,] = [[0.558640003f; -0.573411465f; -0.173779786f]] val bias2' : floatType [,] = [[-0.934492171f]] val input' : floatType [,] = [[0.0f; 0.0f; 1.0f; 1.0f] [0.0f; 1.0f; 0.0f; 1.0f]] val targets' : floatType [,] = [[0.0f; 1.0f; 1.0f; 0.0f]] val a1' : floatType [,] = [[-0.671028078f; -0.786630154f; -0.849961519f; -0.906214595f] [0.259063959f; -0.248602718f; 0.464837015f; -0.0155624701f] [-0.405437022f; -0.537496448f; -0.249632731f; -0.40156284f]] val o' : floatType [,] = [[0.199815348f; 0.242691875f; 0.163491398f; 0.203910157f]] val squareDifferenceModule : DeviceBinaryMapSumModule val L2_cost : floatType = 1.3547678f

The above is a single forward pass of the XOR neural net problem and we have everything we need to make the backwards pass.

The backward pass as I have unfortunately found out is especially hard to get right, even more so for more complex architectures like LSTMs.

For the simple feedforward case as in the above, one could do it by hand, but it is not worth the bother anyway.

As I couldn’t prevent WordPress from stuffing random html into my code this time, all the examples as they are written can be found in the tutorial examples directory. The current post is post 5.

]]>Before anything can be done, the basic 2D matrix type has to be defined. It would be far too unsafe to call Cuda library functions on raw Cuda matrices. Instead, it is very helpful to wrap them inside a class, or in this case a record, which is similar to a class. The difference is that it can’t be inherited from, its fields can be made mutable and there is the ability to make updated copies.

/// The main matrix type. A 2D column-major matrix living in Cuda device memory.
type dMatrix =
    {mutable num_rows:int
     mutable num_cols:int
     mutable dArray: CudaDeviceVariable<floatType>}

    /// The main create function. A substitute for the constructor.
    static member create(num_rows: int,num_cols,dArray: CudaDeviceVariable<floatType>) =
        {num_rows=num_rows;num_cols=num_cols;dArray=dArray}

    /// Allocates an uninitialized device array of num_rows*num_cols.
    /// Throws an exception if it tries to allocate an array of size 0.
    static member create(num_rows: int,num_cols) =
        let q = (num_rows*num_cols) |> SizeT
        let t = new CudaDeviceVariable<floatType>(q)
        {num_rows=num_rows;num_cols=num_cols;dArray=t}

    /// Copies a host to a device array.
    /// Throws an exception if it tries to allocate an array of size 0.
    static member create(num_rows: int,num_cols,dArray: floatType[]) =
        let q = num_rows*num_cols
        if dArray.Length <> q then failwith "Invalid size in dMatrix construction."
        let t = to_dev dArray
        {num_rows=num_rows;num_cols=num_cols;dArray=t}

    /// Returns a new instance of an empty dMatrix (0 x 0, Null device array).
    /// Unlike the let statements, the member statements are always reevaluated.
    static member createEmpty = dMatrix.create(0,0,CudaDeviceVariable.Null)

    /// Returns num_rows, num_cols as a tuple.
    member inline t.rc = t.num_rows, t.num_cols

    /// Sets the matrix to zero.
    member inline t.setZero() = t.dArray.MemsetAsync(0u,str.Stream)

    /// Set the matrix to a value.
    member inline t.set (x: floatType) =
        // Memset works on raw 32-bit words, so the float's bit pattern is reinterpreted as a uint.
        let v = BitConverter.ToUInt32(BitConverter.GetBytes(x),0)
        t.dArray.MemsetAsync(v,str.Stream)

    /// Creates a copy of this matrix with all the values set to zero.
    member inline t.zeroLike() =
        let c = dMatrix.create(t.num_rows,t.num_cols)
        c.setZero()
        c

    /// Copies a matrix.
    member inline t.copy() =
        let c = dMatrix.create(t.num_rows,t.num_cols)
        c.dArray.AsyncCopyToDevice(t.dArray,str)
        c

    /// Resizes the dArray if the current one is smaller than nr*nc (disposing the
    /// old buffer and allocating a new one). Otherwise it only adjusts num_rows and
    /// num_cols, reusing the existing buffer to keep dynamic reallocations to a minimum.
    member inline t.ReplaceIf nr nc =
        if int t.dArray.Size < nr*nc
        then
            (t :> IDisposable).Dispose()
            t.num_rows <- nr
            t.num_cols <- nc
            t.dArray <- new_dev (nr*nc)
        else
            t.num_rows <- nr
            t.num_cols <- nc

    /// Copies a matrix to a host array.
    member inline t.Gather() =
        let h_a = Array.zeroCreate<floatType> (int t.dArray.Size)
        t.dArray.CopyToHost(h_a)
        h_a

    // True when the device array is the Null placeholder used by createEmpty.
    member inline t.isEmpty = t.dArray.Equals(CudaDeviceVariable.Null)

    /// The unmanaged Cuda memory has to be freed explicitly or by letting go of the context by resetting the F# Interactive.
    /// Finalizers work really poorly and can lead to unpredictable bugs when used to manage Cuda memory.
    interface IDisposable with
        member t.Dispose() =
            if t.isEmpty = false then
                t.dArray.Dispose()

The reason why the record fields are mutable is to enable resizing. When an array is resized upwards it disposes its dArray and assigns a new one, while it just changes the num_rows and num_cols fields when it resizes downwards. That is in order to keep the dynamic reallocations to a minimum.

To achieve optimal performance, the amount of memory allocation and deallocation needs to be kept to a minimum. Ideally one should allocate all the memory on the first pass and keep reusing those matrices on the other passes. The versions that can reuse the memory are roughly 3-4 times faster than their more functional equivalents. On the other hand, the functional equivalents that use dynamic allocation solely are definitely much easier to write.

A useful practice I have found is to fluidly transition between the two styles by writing the first versions in a functional manner and then making them faster by making them use less dynamic memory juggling.

The most essential part of a ML library would be that it provides access to the basic linear algebra functions that are in turn, placed inside of other libraries. For fully connected neural nets the functions that power them are the sgemm (A*x) matrix multiply found in the cuBLAS library and the AddTensor (+ bias). The cuDNN user guide can be found under downloads.

A recommendation for those interested in making convolutional functions work is to use the Julia documentation instead as it is much better than the Nvidia’s one. It actually explains what the parameters do.

I won’t be covering convolutional functions in this series, but I have written a bunch of wrappers for them in Alea (the Cuda .Net compiler library), that will have to be adapted to work in ManagedCuda. I’ll do it eventually after I finish my exploration of recurrent nets.

Here is the general matrix multiply:

let T = Operation.Transpose
let nT = Operation.NonTranspose

/// General matrix-matrix multiply from cuBLAS.
/// Returns alpha*op(A)*op(B) as a freshly allocated matrix.
let gemm transa transb (alpha: floatType) (A:dMatrix) (B:dMatrix) =
    // Effective dimensions after the (optional) transposes; the inner ones must agree.
    let a_col = if transa = nT then A.num_cols else A.num_rows
    let b_row = if transb = nT then B.num_rows else B.num_cols
    if a_col <> b_row then failwith (sprintf "a_col <> b_row in gemm! %i <> %i" a_col b_row)
    let m = if transa = nT then A.num_rows else A.num_cols
    let n = if transb = nT then B.num_cols else B.num_rows
    let k = a_col
    // Leading dimensions refer to the operands as stored (column major), before the transpose.
    let lda = if transa = nT then m else k
    let ldb = if transb = nT then k else n
    let ldc = m
    let C_dArray = new CudaDeviceVariable<floatType>(m*n |> SizeT)
    cublas.Gemm(transa, transb, m, n, k, alpha, A.dArray, lda, B.dArray, ldb, 0.0f, C_dArray, ldc)
    dMatrix.create(m,n,C_dArray)

/// General matrix-matrix multiply from cuBLAS. Inplace version:
/// C <- alpha*op(A)*op(B) + beta*C. C must be preallocated with matching dimensions.
let gemm2 transa transb (alpha: floatType) (A:dMatrix) (B:dMatrix) beta (C:dMatrix) =
    let a_col = if transa = nT then A.num_cols else A.num_rows
    let b_row = if transb = nT then B.num_rows else B.num_cols
    if a_col <> b_row then failwith (sprintf "a_col <> b_row in gemm! %i <> %i" a_col b_row)
    let m = if transa = nT then A.num_rows else A.num_cols
    let n = if transb = nT then B.num_cols else B.num_rows
    let k = a_col
    let lda = if transa = nT then m else k
    let ldb = if transb = nT then k else n
    let ldc = m
    let C_dArray = C.dArray
    if m <> C.num_rows || n <> C.num_cols then failwith "m <> C.num_rows || n <> C.num_cols in gemm2"
    cublas.Gemm(transa, transb, m, n, k, alpha, A.dArray, lda, B.dArray, ldb, beta, C_dArray, ldc)

The gemm (general matrix matrix multiply) functions were difficult to figure out as they have some insane parameter rules that force the programmer to do acrobatics with the row and column variables.

At any rate, they are not difficult to figure out. The boundary checks will throw an exception if the matrix dimensions are input incorrectly.

The difference between the two is easy enough to see by the return type. The standard gemm will dynamically allocate and return the dMatrix C, while the inplace version requires the memory to be preallocated.

/// General matrix-matrix addition.
/// Returns alpha*op(A) + beta*op(B) as a freshly allocated matrix, where op is
/// the identity or the transpose depending on transa/transb.
let geam transa transb (alpha: floatType) (A:dMatrix) beta (B:dMatrix) =
    // Effective dimensions after the (optional) transposes; they must agree.
    let a_row = if transa = nT then A.num_rows else A.num_cols
    let a_col = if transa = nT then A.num_cols else A.num_rows
    let b_row = if transb = nT then B.num_rows else B.num_cols
    let b_col = if transb = nT then B.num_cols else B.num_rows
    if a_row <> b_row then failwith (sprintf "a_row <> b_row in geam! %i <> %i" a_row b_row)
    if a_col <> b_col then failwith (sprintf "a_col <> b_col in geam! %i <> %i" a_col b_col)
    // Leading dimensions refer to the operands as stored (column major), so each
    // depends on its own transpose flag.
    let lda = if transa = nT then a_row else a_col
    // Fix: ldb was keyed on transa, which gave the wrong leading dimension for a
    // non-square B whenever transa <> transb. (The transpose helper below was
    // unaffected because it passes T for both flags.)
    let ldb = if transb = nT then b_row else b_col
    let ldc = a_row
    let C_dArray = new CudaDeviceVariable<floatType>(a_row*a_col |> SizeT)
    cublas.Geam(transa, transb, a_row, a_col, alpha, A.dArray, lda, B.dArray, ldb, beta, C_dArray, ldc)
    dMatrix.create(a_row,a_col,C_dArray)

/// General matrix-matrix addition. Inplace version:
/// C <- alpha*op(A) + beta*op(B). C must be preallocated with matching dimensions.
let geam2 transa transb (alpha: floatType) (A:dMatrix) beta (B:dMatrix) (C:dMatrix) =
    let a_row = if transa = nT then A.num_rows else A.num_cols
    let a_col = if transa = nT then A.num_cols else A.num_rows
    let b_row = if transb = nT then B.num_rows else B.num_cols
    let b_col = if transb = nT then B.num_cols else B.num_rows
    if a_row <> b_row then failwith (sprintf "a_row <> b_row in geam2! %i <> %i" a_row b_row)
    if a_col <> b_col then failwith (sprintf "a_col <> b_col in geam2! %i <> %i" a_col b_col)
    // Fix: these two messages used to print a_col/b_col regardless of which check failed.
    if a_row <> C.num_rows then failwith (sprintf "a_row <> C.num_rows in geam2! %i <> %i" a_row C.num_rows)
    if a_col <> C.num_cols then failwith (sprintf "a_col <> C.num_cols in geam2! %i <> %i" a_col C.num_cols)
    let lda = if transa = nT then a_row else a_col
    // Fix: same ldb correction as in geam above.
    let ldb = if transb = nT then b_row else b_col
    let ldc = a_row
    cublas.Geam(transa, transb, a_row, a_col, alpha, A.dArray, lda, B.dArray, ldb, beta, C.dArray, ldc)

let inline transpose t = geam T T 1.0f t 0.0f t // Transpose function

The extended cuBlas function geam(matrix-matrix addition) acts much like the vector vector addition function with a few extra features. Besides adding two matrices together, it is also able to transpose them before doing that, meaning that it can also be used as a transpose function.

// Reusable cuDNN tensor descriptors for the broadcast-add and row-sum wrappers below.
let biasTensorDesc = new TensorDescriptor()
let dstTensorDesc = new TensorDescriptor()

// Picks the cuDNN data type that matches the library-wide floatType alias.
let SpiralCuDNNDataType =
    if typeof<floatType> = typeof<float32> then cudnnDataType.Float
    else if typeof<floatType> = typeof<float> then cudnnDataType.Double
    else failwith "cudnnDataType not supported."

///o <- beta*mat + alpha*vec (matrix-vector broadcast addition)
let broadcastAdd beta (mat: dMatrix) alpha (vec: dMatrix) =
    let TensorFormat = cudnnTensorFormat.NCHW;
    biasTensorDesc.SetTensor4dDescriptor(TensorFormat, SpiralCuDNNDataType, 1, 1, vec.num_rows, vec.num_cols)
    dstTensorDesc.SetTensor4dDescriptor(TensorFormat, SpiralCuDNNDataType, 1, mat.num_cols, mat.num_rows, 1)
    // Work on a copy so the caller's matrix is left untouched; AddTensor writes into its destination buffer.
    let copy_mat = mat.copy()
    cudnn.AddTensor(alpha,biasTensorDesc,vec.dArray,beta,dstTensorDesc,copy_mat.dArray)
    copy_mat

///mat <- beta*mat + alpha*vec (matrix-vector broadcast addition, in place)
let broadcastAdd2 beta (mat: dMatrix) alpha (vec: dMatrix) =
    let TensorFormat = cudnnTensorFormat.NCHW;
    biasTensorDesc.SetTensor4dDescriptor(TensorFormat, SpiralCuDNNDataType, 1, 1, vec.num_rows, vec.num_cols)
    dstTensorDesc.SetTensor4dDescriptor(TensorFormat, SpiralCuDNNDataType, 1, mat.num_cols, mat.num_rows, 1)
    cudnn.AddTensor(alpha,biasTensorDesc,vec.dArray,beta,dstTensorDesc,mat.dArray)

/// o <- sum_across_channels(alpha*mat)
/// For 2D matrices, channels are the columns.
/// The function sums along the rows, returning a freshly allocated column vector.
let rowSum alpha (mat: dMatrix) =
    let TensorFormat = cudnnTensorFormat.NHWC;
    dstTensorDesc.SetTensor4dDescriptor(TensorFormat, SpiralCuDNNDataType, 1, mat.num_rows, 1, mat.num_cols)
    let vec = dMatrix.create(mat.num_rows,1)
    biasTensorDesc.SetTensor4dDescriptor(TensorFormat, SpiralCuDNNDataType, 1, vec.num_rows, 1, vec.num_cols)
    cudnn.ConvolutionBackwardBias(alpha,dstTensorDesc,mat.dArray,0.0f,biasTensorDesc,vec.dArray)
    vec

/// vec <- sum_across_channels(alpha*mat)+beta*vec
/// For 2D matrices, channels are the columns.
/// The function sums along the rows.
let rowSum2 alpha (mat: dMatrix) beta (vec: dMatrix) =
    // Fix: validate vec's dimensions explicitly, consistent with the cuBLAS wrappers.
    if vec.num_rows <> mat.num_rows || vec.num_cols <> 1 then
        failwith (sprintf "Invalid vec dimensions in rowSum2! %i*%i expected %i*1" vec.num_rows vec.num_cols mat.num_rows)
    let TensorFormat = cudnnTensorFormat.NHWC;
    dstTensorDesc.SetTensor4dDescriptor(TensorFormat, SpiralCuDNNDataType, 1, mat.num_rows, 1, mat.num_cols)
    // Fix: describe the destination with vec's own dimensions (previously this used
    // mat.num_rows, silently assuming vec was correctly sized), matching rowSum above.
    biasTensorDesc.SetTensor4dDescriptor(TensorFormat, SpiralCuDNNDataType, 1, vec.num_rows, 1, vec.num_cols)
    cudnn.ConvolutionBackwardBias(alpha,dstTensorDesc,mat.dArray,beta,biasTensorDesc,vec.dArray)

The above functions do broadcast matrix vector addition and summation along the rows, used for adding the biases and calculating their adjoints (error derivatives) respectively.

There is no particular need to think deeply what the particular parameters do here, it is enough to know that this is basically what they do. Just put them in a box mentally somewhere. Unlike the cuBLAS functions these do not have boundary checking inside the loop because cuDNN already takes care of that on its own.

Interesting tidbit worth noting here is that a scalar (or a vector in the case of broadcast) expanded to a matrix in the forward pass does not require averaging of errors in the return pass, but just their summation instead. Just now I noticed that I had this error in the rowSum functions in fact.

With these four basic functions, we have almost everything we need to make our first neural net, though we still need the map kernels for the activation functions and the random function to initialize the weights and the biases.

Before that, here is a demonstration of one of the interesting features of F#: type extensions.

type dMatrix with
    /// For accessing individual elements with the .[a,b] syntax.
    /// The storage is column major, hence the a+b*num_rows index.
    member t.Item
        with get(a: int, b: int) = t.dArray.[a+b*t.num_rows |> SizeT]
        and set(a: int, b: int) (value: floatType) = t.dArray.[a+b*t.num_rows |> SizeT] <- value

    /// For displaying column major matrices inside Array2D (which is row major.)
    member inline t.Gather'() =
        let h_a = Array2D.zeroCreate<floatType> t.num_rows t.num_cols
        use t' = transpose t // Transpose to row major. The use keyword ensures that it is disposed automatically as soon as it goes out of scope.
        t'.dArray.CopyToHost(h_a) // Copy directly to host array.
        h_a

What this snippet does is add two methods to the dMatrix type. The first is a function to access the individual elements like one would a 2D array and the other is to copy the Cuda array to a 2D host one.

As the Array2D class uses row major ordering, a transpose needs to be done before making the transfer, and for that we needed the geam function to be completed first. Because it would be impossible to complete the geam function without defining the dMatrix type first, without type extensions we would have to put everything inside the dMatrix type. It would not be impossible to implement that way, but type extensions are very convenient.

A short example to prove that the above works:

// A 5x3 matrix filled with 1.5f and a 3x4 matrix filled with 2.3f.
let a = dMatrix.create(5,3)
a.set 1.5f
let b = dMatrix.create(3,4)
b.set 2.3f
// Matrix multiply: c is 5x4.
let c = gemm nT nT 1.0f a b
let a' = a.Gather'()
let b' = b.Gather'()
let c' = c.Gather'()
let c'2 = geam nT nT 0.1f c 0.0f c // Multiplies the first array by 0.1f and the second by 0.0f
let c'2' = c'2.Gather'()
// A 5x1 bias column set element by element through the Item extension.
let bias = dMatrix.create(5,1)
for i=0 to 4 do bias.[i,0] <- 0.5f + floatType i
let bias' = bias.Gather'()
// Broadcasts the bias across the columns of c'2.
let d = broadcastAdd 1.0f c'2 1.0f bias
let d' = d.Gather'()
// Sums d along its rows, once with alpha = 1.0f and once with alpha = 2.0f.
let e = rowSum 1.0f d
let e' = e.Gather'()
let e2 = rowSum 2.0f d
let e2' = e2.Gather'()

Here is the output:

val a : dMatrix = {num_rows = 5; num_cols = 3; dArray = ManagedCuda.CudaDeviceVariable`1[System.Single];} val b : dMatrix = {num_rows = 3; num_cols = 4; dArray = ManagedCuda.CudaDeviceVariable`1[System.Single];} val c : dMatrix = {num_rows = 5; num_cols = 4; dArray = ManagedCuda.CudaDeviceVariable`1[System.Single];} val a' : floatType [,] = [[1.5f; 1.5f; 1.5f] [1.5f; 1.5f; 1.5f] [1.5f; 1.5f; 1.5f] [1.5f; 1.5f; 1.5f] [1.5f; 1.5f; 1.5f]] val b' : floatType [,] = [[2.29999995f; 2.29999995f; 2.29999995f; 2.29999995f] [2.29999995f; 2.29999995f; 2.29999995f; 2.29999995f] [2.29999995f; 2.29999995f; 2.29999995f; 2.29999995f]] val c' : floatType [,] = [[10.3499994f; 10.3499994f; 10.3499994f; 10.3499994f] [10.3499994f; 10.3499994f; 10.3499994f; 10.3499994f] [10.3499994f; 10.3499994f; 10.3499994f; 10.3499994f] [10.3499994f; 10.3499994f; 10.3499994f; 10.3499994f] [10.3499994f; 10.3499994f; 10.3499994f; 10.3499994f]] val c'2 : dMatrix = {num_rows = 5; num_cols = 4; dArray = ManagedCuda.CudaDeviceVariable`1[System.Single];} val c'2' : floatType [,] = [[1.03499997f; 1.03499997f; 1.03499997f; 1.03499997f] [1.03499997f; 1.03499997f; 1.03499997f; 1.03499997f] [1.03499997f; 1.03499997f; 1.03499997f; 1.03499997f] [1.03499997f; 1.03499997f; 1.03499997f; 1.03499997f] [1.03499997f; 1.03499997f; 1.03499997f; 1.03499997f]] val bias : dMatrix = {num_rows = 5; num_cols = 1; dArray = ManagedCuda.CudaDeviceVariable`1[System.Single];} val bias' : floatType [,] = [[0.5f] [1.5f] [2.5f] [3.5f] [4.5f]] val d : dMatrix = {num_rows = 5; num_cols = 4; dArray = ManagedCuda.CudaDeviceVariable`1[System.Single];} val d' : floatType [,] = [[1.53499997f; 1.53499997f; 1.53499997f; 1.53499997f] [2.53499985f; 2.53499985f; 2.53499985f; 2.53499985f] [3.53499985f; 3.53499985f; 3.53499985f; 3.53499985f] [4.53499985f; 4.53499985f; 4.53499985f; 4.53499985f] [5.53499985f; 5.53499985f; 5.53499985f; 5.53499985f]] val e : dMatrix = {num_rows = 5; num_cols = 1; dArray = ManagedCuda.CudaDeviceVariable`1[System.Single];} 
val e' : floatType [,] = [[6.13999987f] [10.1399994f] [14.1399994f] [18.1399994f] [22.1399994f]] val e2 : dMatrix = {num_rows = 5; num_cols = 1; dArray = ManagedCuda.CudaDeviceVariable`1[System.Single];} val e2' : floatType [,] = [[12.2799997f] [20.2799988f] [28.2799988f] [36.2799988f] [44.2799988f]]

The above example is pretty innocuous, but I managed to discover a bug related to the finalizer that used to be inside the dMatrix record. It made me decide to remove it from the dMatrix record.

What I am doing here is a bit of a throwback to the old times where things like memory management were done explicitly in languages like C. Unfortunately, with device memory that will also have to be the case. Until GPU programming matures, in backwards times such as this (almost 2016), the programmer has the responsibility of handling device memory.

I would recommend not trying to take the easy path by putting in finalizers as, in fact, in the above example I found an instance of memory corruption. Most likely, for larger examples the errors would be more subtle and devastating.

]]>Assuming you have installed F#, the first step you should take would be to run F# Interactive in 64-bit mode. In VS go into Tools -> Options and just write F# in the search bar. Then go to F# Tools -> F# Interactive. Enable both debugging and the 64-bit mode. Debugging is for later convenience – it won’t slow down the program in any case, but the 64-bit mode will be necessary to run Cuda libraries. Starting with version 7.0 Nvidia dropped support for the 32-bit versions.

I plan to build up the library step by step in the following chapters, but if one wants to peek ahead, here is the repository to an already finished version.

It is similar to Andrej Karpathy’s Javascript library and AndyP’s Julia one (that was also inspired by Karpathy’s) except with more of a focus on getting the most out of one’s hardware.

At the time of this writing GPU programming is nowhere nearly as easy as standard CPU code and even if one just intends to call routines that are packaged in a library there are difficulties one must work around.

F# is a statically typed functional language that can be compiled to an executable, but that can also be interpreted inside the IDE, similar to Python.

All the examples that follow should be run from the F# Interactive.

Let’s test it out. To do anything at all, first we must reference the ManagedCuda files. It can be done like so:

// The Spiral library v1. Basic reverse mode AD on the GPU.
// References to the ManagedCuda assemblies (core, runtime compiler, cuBLAS, cuRAND, NPP and cuDNN).
#r "../packages/ManagedCuda-75-x64.7.5.7/lib/net45/x64/ManagedCuda.dll"
#r "../packages/ManagedCuda-75-x64.7.5.7/lib/net45/x64/NVRTC.dll"
#r "../packages/ManagedCuda-75-x64.7.5.7/lib/net45/x64/CudaBlas.dll"
#r "../packages/ManagedCuda-75-x64.7.5.7/lib/net45/x64/CudaRand.dll"
#r "../packages/ManagedCuda-75-x64.7.5.7/lib/net45/x64/NPP.dll"
#r "../packages/ManagedCuda-CudaDNN.3.0/lib/net45/CudaDNN.dll"

To run the lines just select them and then using right click and send them to F# Interactive. If the packages are installed something like the following should show up.

--> Referenced 'C:\Users\Marko\Documents\Visual Studio 2015\Projects\Spiral Library\Tutorial examples\../packages/ManagedCuda-75-x64.7.5.7/lib/net45/x64/ManagedCuda.dll' --> Referenced 'C:\Users\Marko\Documents\Visual Studio 2015\Projects\Spiral Library\Tutorial examples\../packages/ManagedCuda-75-x64.7.5.7/lib/net45/x64/NVRTC.dll' --> Referenced 'C:\Users\Marko\Documents\Visual Studio 2015\Projects\Spiral Library\Tutorial examples\../packages/ManagedCuda-75-x64.7.5.7/lib/net45/x64/CudaBlas.dll' --> Referenced 'C:\Users\Marko\Documents\Visual Studio 2015\Projects\Spiral Library\Tutorial examples\../packages/ManagedCuda-75-x64.7.5.7/lib/net45/x64/CudaRand.dll' --> Referenced 'C:\Users\Marko\Documents\Visual Studio 2015\Projects\Spiral Library\Tutorial examples\../packages/ManagedCuda-75-x64.7.5.7/lib/net45/x64/NPP.dll' --> Referenced 'C:\Users\Marko\Documents\Visual Studio 2015\Projects\Spiral Library\Tutorial examples\../packages/ManagedCuda-CudaDNN.3.0/lib/net45/CudaDNN.dll'

That is roughly all the setup that is necessary to run this.

Unlike the host or CPU code, which tends to run a single thread, for performance reasons GPU kernels tend to be launched asynchronously. To do that they need to be set on a single stream so that the scheduler can manage them. And the stream must be a part of a context which is analogous to a CPU process.

// Open up the namespaces.
open ManagedCuda
open ManagedCuda.BasicTypes
open ManagedCuda.VectorTypes
open ManagedCuda.CudaBlas
open ManagedCuda.CudaRand
open ManagedCuda.NVRTC
open ManagedCuda.CudaDNN

open System
open System.IO
open System.Collections

// Initialize the context. Analogous to a CPU process. Cuda tries to offload as much as possible during context creation so there aren't
// any unexpected delays later.
let ctx = new CudaContext()
let numSm = ctx.GetDeviceInfo().MultiProcessorCount // The number of streaming multiprocessors on the device.

// Make a stream class.
let str = new CudaStream()
// Set the Cuda libraries handles to the above stream.
let cublas = CudaBlas(str.Stream)
let cudnn = new CudaDNN.CudaDNNContext()
cudnn.SetStream(str)
let cudaRandom = new CudaRand.CudaRandDevice(GeneratorType.PseudoDefault)
cudaRandom.SetStream(str.Stream)

// Type aliasing trick to make Spiral more generic. It is incomplete at the moment though due to Cuda math function being non-overloadable.
type floatType = float32
let inline floatType x = float32 x
let FloatTypeCpp = "float" // The matching type name for generated Cuda C kernels.

/// Copies a host array to device.
let inline to_dev (host_ar: 't []) =
    let dev = new CudaDeviceVariable<'t>(SizeT host_ar.Length)
    dev.CopyToDevice(host_ar)
    dev

/// Copies a device array to host.
let inline to_host (dev_ar: CudaDeviceVariable<'t>) =
    let host = Array.zeroCreate<'t> (int dev_ar.Size)
    dev_ar.CopyToHost(host)
    host

/// Copies the device array to host. Extends the CudaDeviceVariable class.
type CudaDeviceVariable<'t when 't: struct and 't: (new: unit -> 't) and 't:> System.ValueType> with
    member inline this.Gather() = to_host this

/// Allocates a new device array without initializing it.
let inline new_dev<'t when 't: struct and 't: (new: unit -> 't) and 't:> System.ValueType> (n: int) =
    new CudaDeviceVariable<'t>(SizeT n)

A short example would be as follows:

// Round-trip a small array: host -> device, mutate one element on the device, device -> host.
let a = [|1.0f;2.0f;3.0f|]
let a' = to_dev a
a'.[SizeT 0] <- 5.0f // Annoyingly, it is necessary to explicitly convert ints to SizeT when accessing individual items in the CudaDeviceVariable class.
let b = to_host a'

This prints out:

val a : float32 [] = [|1.0f; 2.0f; 3.0f|] val a' : ManagedCuda.CudaDeviceVariable<float32> val b : float32 [] = [|5.0f; 2.0f; 3.0f|]

A word of warning: for performance reasons, you should never iterate over the elements of a Cuda array individually as in the above. It would be the absolutely wrong way to do it and would kill the performance of an algorithm.

Moving data back and forth from device to host is one of the slowest operations that exist even in batch mode and doing it individually like the above is doubly so. The library only ever uses it to gather the results of a reduction operation for that reason.

]]>