The pretrained net

Using the WTA function one can pretrain the layers of a net and then fine-tune them together with the sigmoid layer added on top. This does not require any additions to the library apart from the BlockReverse() type.
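
For intuition, a k-winner-take-all activation keeps only the strongest activations and zeroes out the rest. How exactly the library's WTA kernel interprets its parameter (per sample, per hidden unit across the minibatch, or as a percentage) is not shown here, so the following per-sample version is only an illustrative sketch:

// Illustrative k-winner-take-all on a single sample: keep the k largest
// activations, zero out everything else. This is a CPU sketch for exposition,
// not the library's GPU implementation.
let wta (k: int) (activations: float32[]) =
    let sorted = Array.sortDescending activations
    let threshold = sorted.[min (k - 1) (sorted.Length - 1)]
    activations |> Array.map (fun a -> if a >= threshold then a else 0.0f)

wta 2 [|0.1f; 0.9f; 0.4f; 0.7f|] // [|0.0f; 0.9f; 0.0f; 0.7f|]

With that in mind, the layers are set up as follows: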

let l1 = FeedforwardLayer.createRandomLayer 1024 784 (WTA 6)
let l2 = FeedforwardLayer.createRandomLayer 1024 1024 (WTA 6)
let l3 = FeedforwardLayer.createRandomLayer 1024 1024 (WTA 6)
let l4 = InverseFeedforwardLayer.createRandomLayer l3 (fun x -> x) // No nonlinearity at the end. Nonlinearities in the final layer cause the individual layers to overfit too badly.
let l5 = InverseFeedforwardLayer.createRandomLayer l2 (fun x -> x)
let l6 = InverseFeedforwardLayer.createRandomLayer l1 (fun x -> x)

let l1' = FeedforwardLayer.fromArray l1.ToArray relu // Makes supervised layers from the same weights.
let l2' = FeedforwardLayer.fromArray l2.ToArray relu
let l3' = FeedforwardLayer.fromArray l3.ToArray relu
let l_sig = FeedforwardLayer.createRandomLayer 10 1024 (clipped_steep_sigmoid 3.0f)

let layers_deep_autoencoder = [|[|l1;l2;l3|] |> Array.map (fun x -> x :> IFeedforwardLayer);[|l4;l5;l6|] |> Array.map (fun x -> x :> IFeedforwardLayer);|] |> Array.concat // Upcasting to the base type. The deep autoencoder is not used in this example; it only serves as an illustration here.
let layers_1 = [|[|l1|] |> Array.map (fun x -> x :> IFeedforwardLayer);[|l6|] |> Array.map (fun x -> x :> IFeedforwardLayer);|] |> Array.concat // Upcasting to the base type. The correct functions will get called with dynamic dispatch.
let layers_2 = [|[|l1;l2|] |> Array.map (fun x -> x :> IFeedforwardLayer);[|l5|] |> Array.map (fun x -> x :> IFeedforwardLayer);|] |> Array.concat // Upcasting to the base type. The correct functions will get called with dynamic dispatch.
let layers_3 = [|[|l1;l2;l3|] |> Array.map (fun x -> x :> IFeedforwardLayer);[|l4|] |> Array.map (fun x -> x :> IFeedforwardLayer);|] |> Array.concat // Upcasting to the base type. The correct functions will get called with dynamic dispatch.
let layers_fine_tune = [|l1';l2';l3';l_sig|] |> Array.map (fun x -> x :> IFeedforwardLayer)

First we create all the layers individually and then we create arrays of such layers grouped together, so we can later pass them into the training function.

The net we are training is a three-layer net with a small sigmoid output layer on top.

First we train each layer individually and then we do the whole net at the end.

let loop_1 data targets = // These loops are closures. They are not called directly, but passed as parameters into the training function. This one is for the first autoencoder
    let outputs = Array.scan(fun state (layer:IFeedforwardLayer) -> (layer.runLayer state)) data layers_1 // Scan is like fold except it returns the intermediates.
    let inp = outputs.[outputs.Length-3]
    let out = outputs.[outputs.Length-1]
    squared_error_cost inp out, None

let loop_2 data targets = // The targets are unused in the autoencoders; they are only here so the signature matches the supervised net. This one is for the second autoencoder.
    let l,r = layers_2 |> Array.splitAt 1
    let outputs = Array.scan(fun state (layer:IFeedforwardLayer) -> (layer.runLayer state)) data l // Scan is like fold except it returns the intermediates.
    tape.Add(BlockReverse()) // This blocks the reverse pass from running past this point. It is so the gradients get blocked and only the top two layers get trained.
    let outputs = Array.scan(fun state (layer:IFeedforwardLayer) -> (layer.runLayer state)) (outputs |> Array.last) r // Scan is like fold except it returns the intermediates.
    let inp = outputs.[outputs.Length-3]
    let out = outputs.[outputs.Length-1]
    squared_error_cost inp out, None

let loop_3 data targets =
    let l,r = layers_3 |> Array.splitAt 2
    let outputs = Array.scan(fun state (layer:IFeedforwardLayer) -> (layer.runLayer state)) data l // Scan is like fold except it returns the intermediates.
    tape.Add(BlockReverse()) // This blocks the reverse pass from running past this point. It is so the gradients get blocked and only the top two layers get trained.
    let outputs = Array.scan(fun state (layer:IFeedforwardLayer) -> (layer.runLayer state)) (outputs |> Array.last) r // Scan is like fold except it returns the intermediates.
    let inp = outputs.[outputs.Length-3]
    let out = outputs.[outputs.Length-1]
    squared_error_cost inp out, None

let loop_3b data targets = // This is not for the autoencoder, but for the final logistic regression layer. We train it separately first so it does not disrupt the pretrained weights below it.
    let l,r = layers_fine_tune |> Array.splitAt 3
    let outputs = Array.scan(fun state (layer:IFeedforwardLayer) -> (layer.runLayer state)) data l // Scan is like fold except it returns the intermediates.
    tape.Add(BlockReverse()) // This blocks the reverse pass from running past this point, so the gradients get blocked and only the top sigmoid layer gets trained.
    let outputs = Array.scan(fun state (layer:IFeedforwardLayer) -> (layer.runLayer state)) (outputs |> Array.last) r // Scan is like fold except it returns the intermediates.
    let out = outputs.[outputs.Length-1]
    squared_error_cost targets out, None

let loop_fine_tune data targets = // The full net with the pretrained weights.
    let outputs = Array.fold(fun state (layer:IFeedforwardLayer) -> (layer.runLayer state)) data layers_fine_tune
    cross_entropy_cost targets outputs, Some (lazy get_accuracy targets.r.P outputs.r.P)

// It might be possible to get more speed by not repeating needless calculations in the lower layers, but that would require switching
// branches and some modification of the training loop; this is decent enough.
// Doing it like this is in fact the most efficient approach from a memory standpoint.

In the training functions above, the fold is replaced with scan because for the autoencoders the reconstruction target is one of the intermediate outputs (the input to the encoder layer currently being trained), so having the intermediates at hand is necessary.
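
The difference between fold and scan is easy to see with the plain F# core functions:

// Array.fold only returns the final accumulated state, while Array.scan also
// keeps the initial state and every intermediate one.
let folded  = Array.fold (fun acc x -> acc + x) 0 [|1; 2; 3|] // 6
let scanned = Array.scan (fun acc x -> acc + x) 0 [|1; 2; 3|] // [|0; 1; 3; 6|]

For loop_1, the scan over [|l1;l6|] therefore yields the input data, the encoding and the reconstruction, and the squared error compares the first and the last of these.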

In the training loops for the second and the third layer, it is necessary to block the gradients from flowing backwards and optimizing the whole net. Unfortunately, calling tape.Clear() here is not recommended, as the upper layers would then reuse the memory already holding the lower layers' results and corrupt it.

Instead, the better choice is to create a separate type and modify the reverse call so that it stops running when it encounters one.
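
The library's internals are not reproduced in this post, so the following is only a rough sketch of the idea with hypothetical types; the real tape and node representations differ:

// Hypothetical sketch: BlockReverse is an inert marker pushed onto the tape.
// The reverse pass walks the tape from the end and stops as soon as it meets
// one, so the adjoints of everything recorded before the marker stay untouched.
type BlockReverse() = class end

let reverseUntilBlocked (tape: ResizeArray<obj>) (runReverse: obj -> unit) =
    let mutable i = tape.Count - 1
    let mutable blocked = false
    while i >= 0 && not blocked do
        match tape.[i] with
        | :? BlockReverse -> blocked <- true // Gradients stop flowing past this point.
        | node -> runReverse node            // Backpropagate through an ordinary node.
        i <- i - 1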

An alternative to the above would be to switch branches in the tape, but that would be more complex. It would require us to write not only separate loop closures, but separate training functions as well.

The training function is much the same as before:

let train_mnist_sgd num_iters learning_rate training_loop (layers: IFeedforwardLayer[]) =
    [|
    let mutable r' = 0.0f
    let base_nodes = layers |> Array.map (fun x -> x.ToArray) |> Array.concat // Stores all the base nodes of the layers so their adjoints can later be reset.
    for i=1 to num_iters do
        for x in dtrain do
            let data, target = x
            let (r:Df), _ = training_loop data target // Builds the tape.

            tape.forwardpropTape 0 // Calculates the forward values. Triggers the ff() closures.
            r' <- r' + (!r.r.P/ float32 dtrain.Length) // Adds the cost to the accumulator.
            if System.Single.IsNaN r' then failwith "Nan error"

            for x in base_nodes do x.r.A.setZero() // Resets the base adjoints
            tape.resetTapeAdjoint 0 // Resets the adjoints for the training select
            r.r.A := 1.0f // Pushes 1.0f from the top node
            tape.reversepropTape 0 // Runs the reverse pass, propagating the adjoints backwards down the tape.
            add_gradients_to_weights' base_nodes learning_rate // The optimization step
            tape.Clear 0 // Clears the tape without disposing it or the memory buffer. It allows reuse of memory for a 100% gain in speed for the simple recurrent and feedforward case.

        printfn "The training cost at iteration %i is %f" i r'
        let r1 = r'
        r' <- 0.0f
        let mutable acc = 0.0f

        for x in dtest do
            let data, target = x
            let r,lazy_acc = training_loop data target // Builds the tape.

            tape.forwardpropTape 0 // Calculates the forward values. Triggers the ff() closures.
            r' <- r' + (!r.r.P/ float32 dtest.Length) // Adds the cost to the accumulator.
            match lazy_acc with
            | Some (lazy_acc: Lazy<floatType>) -> acc <- acc+lazy_acc.Value // The accuracy calculation is triggered here by forcing the lazy value.
            | None -> ()

            if System.Single.IsNaN r' then failwith "Nan error"

            tape.Clear 0 // Clears the tape without disposing it or the memory buffer. It allows reuse of memory for a 100% gain in speed.

        printfn "The validation cost at iteration %i is %f" i r'
        if acc <> 0.0f then printfn "The accuracy is %i/10000" (int acc)
        let r2 = r'
        r' <- 0.0f
        yield r1,r2
    |]

Here the accuracy calculation is an optional lazy type. The compiler needed some type annotations to figure that out.
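
Stripped of the library context, the pattern is just an option wrapping a lazy value (floatType in this code is float32):

// The loops return a cost together with an optional, lazily evaluated accuracy.
// The autoencoder loops return None; the supervised loop returns Some, and the
// accuracy is only computed when the lazy value is actually forced.
let report (cost: float32) (lazy_acc: Lazy<float32> option) =
    match lazy_acc with
    | Some acc -> printfn "cost=%f accuracy=%f" cost acc.Value
    | None -> printfn "cost=%f" cost

report 0.35f (Some (lazy 97.8f)) // Forces the accuracy computation.
report 0.12f None                // The accuracy is never computed.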

The only thing that remains is to use the above to run the process. It is a five-step regimen:

// For the autoencoders it seems 0.1f is a decent learning rate.
// The autoencoders blow up with 0.2f.
// A lower learning rate for the final layer does not help; in fact, a higher one does.
// My record here is 99.1% after a few hours of playing around.
// Might be possible to do even better with max norm normalization.

// This layerwise pretraining is an old technique by now.
// Here is the more up to date research on this old idea: 
// GradNets - http://arxiv.org/abs/1511.06827
// Net2Net - http://arxiv.org/abs/1511.05641
for loop,layers,num_iters,learning_rate in
    [|loop_1,layers_1,10,0.1f
      loop_2,layers_2,10,0.1f
      loop_3,layers_3,10,0.1f
      loop_3b,layers_fine_tune,10,0.1f
      loop_fine_tune,layers_fine_tune,30,0.2f|] do
    printfn "Starting training loop %i..." loop_iter
    let s = train_mnist_sgd num_iters learning_rate loop layers

    let l = [|for l,_ in s do yield l|]
    let r = [|for _,r in s do yield r|]

    //(Chart.Combine [|Chart.Line l;Chart.Line r|]).ShowChart() |> ignore

    loop_iter <- loop_iter+1

All the closures, their layers, the number of iterations to run them and the learning rate are packed into an array and iterated over. The layers need to be packed as well because the adjoints of the base nodes need to be extracted so they can be reset.

On pretraining

What pretraining essentially does is gradually transform the network so that it becomes more performant. The view that one is pretraining an already complete net is misleading; a far better way of viewing it is as a method for transforming a net into a different one.

The way neural net training is currently done, the actual trained nets are very disposable.

That is not much of a problem when one is playing around with detecting digits on MNIST, but as training times grow larger, knowledge transfer becomes more valuable in turn.

In addition to that, the above experiment opens the door to doing evolutionary search over neural net architectures in a more granular fashion, possibly with checkpointing.

There will no doubt be major discoveries coming from this direction in the near future, as making the transformation a part of the search process greatly eases the difficulty of the problem.
