Source code for theanets.trainer

# -*- coding: utf-8 -*-

'''This module contains optimization methods for neural networks.

Many optimization methods are general-purpose optimization routines that happen
to be pretty good for training neural networks; these are provided by
``downhill``. The other methods here --- :class:`SampleTrainer`,
:class:`SupervisedPretrainer`, and :class:`UnsupervisedPretrainer` --- are more
specific to neural networks, often taking advantage of the layered structure of
many common network architectures.
'''

import downhill
import itertools
import numpy as np

from . import layers
from . import util
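
# A minimal usage sketch (comments only, not part of the module): the trainers
# defined here are normally not constructed directly. A model built with
# theanets is trained through its ``train``/``itertrain`` methods, which pick
# a trainer by algorithm name. ``net``, ``X``, and ``Y`` are placeholders for
# a constructed model and numpy training arrays:
#
#     net.train([X, Y], algo='rmsprop')      # downhill-backed optimization
#     for tm, vm in net.itertrain([X, Y]):   # same thing, one step at a time
#         print(tm['loss'])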


class DownhillTrainer(object):
    '''Wrapper for using trainers from ``downhill``.'''

    def __init__(self, algo, network):
        self.algo = algo
        self.network = network

    def itertrain(self, train, valid=None, **kwargs):
        '''Train a model using a training and validation set.

        This method yields a series of monitor values to the caller. After
        every iteration, a pair of monitor dictionaries is generated: one
        evaluated on the training dataset, and another evaluated on the
        validation dataset. The validation monitors might not be updated during
        every training iteration; in this case, the most recent validation
        monitors will be yielded along with the training monitors.

        Parameters
        ----------
        train : :class:`Dataset <theanets.dataset.Dataset>`
            A set of training data for computing updates to model parameters.
        valid : :class:`Dataset <theanets.dataset.Dataset>`
            A set of validation data for computing monitor values and
            determining when the loss has stopped improving.

        Yields
        ------
        training : dict
            A dictionary mapping monitor names to values, evaluated on the
            training dataset.
        validation : dict
            A dictionary containing monitor values evaluated on the validation
            dataset.
        '''
        for monitors in downhill.build(
                algo=self.algo,
                loss=self.network.loss(**kwargs),
                updates=self.network.updates(**kwargs),
                monitors=self.network.monitors(**kwargs),
                inputs=self.network.variables,
                params=self.network.params,
                monitor_gradients=kwargs.get('monitor_gradients', False),
        ).iterate(train, valid=valid, **kwargs):
            yield monitors


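# Sketch of using DownhillTrainer directly (assumes ``net`` is a constructed
# theanets model and ``train_set``/``valid_set`` are data that downhill
# accepts, e.g. lists of numpy arrays); ``'nag'`` is one of downhill's
# algorithm names:
#
#     trainer = DownhillTrainer('nag', net)
#     for tm, vm in trainer.itertrain(train_set, valid=valid_set):
#         print('train loss', tm['loss'], '- valid loss', vm['loss'])

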
class SampleTrainer(object):
    '''This trainer replaces network weights with samples from the input.'''

    @staticmethod
    def reservoir(xs, n, rng):
        '''Select a random sample of n items from xs.'''
        pool = []
        for i, x in enumerate(xs):
            if len(pool) < n:
                pool.append(x / np.linalg.norm(x))
                continue
            j = rng.randint(i + 1)
            if j < n:
                pool[j] = x / np.linalg.norm(x)
        # if the pool still has fewer than n items, pad with distorted random
        # duplicates from the source data.
        L = len(pool)
        S = np.std(pool, axis=0)
        while len(pool) < n:
            x = pool[rng.randint(L)]
            pool.append(x + S * rng.randn(*x.shape))
        return np.array(pool, dtype=pool[0].dtype)

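    # Self-contained example of the reservoir helper (numpy only): draw 4
    # unit-normalized rows from a stream of 100 random 8-dimensional vectors.
    #
    #     rng = np.random.RandomState(13)
    #     sample = SampleTrainer.reservoir(rng.randn(100, 8), 4, rng)
    #     sample.shape    # -> (4, 8); each row has unit L2 norm
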
    def __init__(self, network):
        self.network = network

    def itertrain(self, train, valid=None, **kwargs):
        '''Train a model using a training and validation set.

        This method yields a series of monitor values to the caller. After
        every iteration, a pair of monitor dictionaries is generated: one
        evaluated on the training dataset, and another evaluated on the
        validation dataset. The validation monitors might not be updated during
        every training iteration; in this case, the most recent validation
        monitors will be yielded along with the training monitors.

        Parameters
        ----------
        train : :class:`Dataset <theanets.dataset.Dataset>`
            A set of training data for computing updates to model parameters.
        valid : :class:`Dataset <theanets.dataset.Dataset>`
            A set of validation data for computing monitor values and
            determining when the loss has stopped improving.

        Yields
        ------
        training : dict
            A dictionary mapping monitor names to values, evaluated on the
            training dataset.
        validation : dict
            A dictionary containing monitor values evaluated on the validation
            dataset.
        '''
        ifci = itertools.chain.from_iterable

        def first(x):
            return x[0] if isinstance(x, (tuple, list)) else x

        def last(x):
            return x[-1] if isinstance(x, (tuple, list)) else x

        odim = idim = None
        for t in train:
            idim = first(t).shape[-1]
            odim = last(t).shape[-1]

        rng = kwargs.get('rng')
        if rng is None or isinstance(rng, int):
            rng = np.random.RandomState(rng)

        # set output (decoding) weights on the network.
        samples = ifci(last(t) for t in train)
        for param in self.network.layers[-1].params:
            shape = param.get_value(borrow=True).shape
            if len(shape) == 2 and shape[1] == odim:
                arr = np.vstack(SampleTrainer.reservoir(samples, shape[0], rng))
                util.log('setting {}: {}', param.name, shape)
                param.set_value(arr / np.sqrt((arr * arr).sum(axis=1))[:, None])

        # set input (encoding) weights on the network.
        samples = ifci(first(t) for t in train)
        for i, layer in enumerate(self.network.layers):
            for param in layer.params:
                shape = param.get_value(borrow=True).shape
                if len(shape) == 2 and shape[0] == idim:
                    arr = np.vstack(
                        SampleTrainer.reservoir(samples, shape[1], rng)).T
                    util.log('setting {}: {}', param.name, shape)
                    param.set_value(arr / np.sqrt((arr * arr).sum(axis=0)))
            samples = ifci(self.network.feed_forward(
                first(t))[i-1] for t in train)

        yield dict(loss=0), dict(loss=0)


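# Usage sketch (assumes ``net`` is a theanets model and ``X`` is a 2-D float
# array of training vectors): this trainer only seeds the model's weight
# matrices with normalized samples drawn from the data, and yields a single
# pair of dummy monitor dictionaries.
#
#     for tm, vm in SampleTrainer(net).itertrain([X]):
#         pass    # weights are now initialized from training samples

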
class SupervisedPretrainer(object):
    '''This trainer adapts parameters using a supervised pretraining approach.

    In this variant, we create "taps" at increasing depths into the original
    network weights, training only those weights that are below the tap. So,
    for a hypothetical binary classifier network with layers [3, 4, 5, 6, 2],
    we would first insert a tap after the first hidden layer (effectively a
    binary classifier in a [3, 4, (2)] configuration, where (2) indicates that
    the corresponding layer is the tap, not present in the original) and train
    just that network. Then we insert a tap at the next layer (effectively
    training a [3, 4, 5, (2)] classifier, re-using the trained weights for the
    3 x 4 layer), and so forth. When we get to training the last layer, i.e.,
    [3, 4, 5, 6, 2], then we just train all of the layers in the original
    network.

    For autoencoder networks with tied weights, consider an example with
    layers [3, 4, 5, 6, 5', 4', 3'], where the prime indicates that the layer
    is tied. In cases like this, we train the "outermost" pair of layers
    first, then add the next pair of layers inward, etc. The training for our
    example would start with [3, 4, 3'], then proceed to [3, 4, 5, 4', 3'],
    and then finish by training all the layers in the original network.

    By using layers from the original network whenever possible, we preserve
    all of the relevant settings of noise, dropouts, loss function, and the
    like, in addition to removing the need for copying trained weights around
    between different :class:`Network <theanets.graph.Network>` instances.

    References
    ----------
    .. [Ben06] Y. Bengio, P. Lamblin, D. Popovici, & H. Larochelle. (NIPS
       2006) "Greedy Layer-Wise Training of Deep Networks"
       http://machinelearning.wustl.edu/mlpapers/paper_files/NIPS2006_739.pdf

       The appendix also contains pseudocode for the approaches:
       http://www.iro.umontreal.ca/~pift6266/A06/refs/appendix_dbn_supervised.pdf
    '''

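    # A toy sketch of the layer-size progression described above, for an
    # untied [3, 4, 5, 6, 2] classifier; it mirrors the loop in ``itertrain``
    # below, where the trailing 2 in the first two stages is the temporary
    # 'lwout' tap layer:
    #
    #     sizes = [3, 4, 5, 6, 2]
    #     L = len(sizes) - 1
    #     for i in range(1, L):
    #         print(sizes if i == L - 1 else sizes[:i + 1] + [sizes[-1]])
    #     # -> [3, 4, 2], then [3, 4, 5, 2], then [3, 4, 5, 6, 2]
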
    def __init__(self, algo, network):
        self.algo = algo
        self.network = network

    def itertrain(self, train, valid=None, **kwargs):
        '''Train a model using a training and validation set.

        This method yields a series of monitor values to the caller. After
        every iteration, a pair of monitor dictionaries is generated: one
        evaluated on the training dataset, and another evaluated on the
        validation dataset. The validation monitors might not be updated during
        every training iteration; in this case, the most recent validation
        monitors will be yielded along with the training monitors.

        Parameters
        ----------
        train : :class:`Dataset <theanets.dataset.Dataset>`
            A set of training data for computing updates to model parameters.
        valid : :class:`Dataset <theanets.dataset.Dataset>`
            A set of validation data for computing monitor values and
            determining when the loss has stopped improving.

        Yields
        ------
        training : dict
            A dictionary mapping monitor names to values, evaluated on the
            training dataset.
        validation : dict
            A dictionary containing monitor values evaluated on the validation
            dataset.
        '''
        net = self.network
        original = list(net.layers)
        output_name = original[-1].output_name
        tied = any(isinstance(l, layers.Tied) for l in original)
        L = 1 + len(original) // 2 if tied else len(original) - 1
        for i in range(1, L):
            tail = []
            if i == L - 1:
                net.layers = original
            elif tied:
                net.layers = original[:i+1]
                for j in range(i):
                    prev = tail[-1] if tail else net.layers[-1]
                    tail.append(layers.Layer.build(
                        'tied', partner=original[i-j].name, inputs=prev.name))
                net.layers = original[:i+1] + tail
            else:
                tail.append(layers.Layer.build(
                    'feedforward',
                    name='lwout',
                    inputs=original[i].output_name,
                    size=original[-1].output_size,
                    activation=original[-1].kwargs['activation']))
                net.layers = original[:i+1] + tail
            util.log('layerwise: training {}',
                     ' -> '.join(l.name for l in net.layers))
            [l.bind(net, initialize=False) for l in net.layers]
            [l.setup() for l in tail]
            net.losses[0].output_name = net.layers[-1].output_name
            trainer = DownhillTrainer(self.algo, net)
            for monitors in trainer.itertrain(train, valid, **kwargs):
                yield monitors
        net.layers = original
        net.losses[0].output_name = output_name


class UnsupervisedPretrainer(object):
    '''Train a classification model using an unsupervised pre-training step.

    This trainer is a bit of glue code that creates a "shadow" autoencoder
    based on a current network model, trains the autoencoder, and then
    transfers the trained weights back to the original model.

    This code is intended mostly as a proof-of-concept to demonstrate how
    shadow networks can be created, and how trainers can call other trainers
    for lots of different types of training regimens.
    '''

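    # Illustration of the shadow construction in ``itertrain`` below (layer
    # names and sizes are hypothetical): for a classifier whose encoding
    # layers are 'in' (784), 'hid1' (256), and 'hid2' (64), the shadow spec is
    #
    #     [<spec of 'in'>, <spec of 'hid1'>, <spec of 'hid2'>,
    #      dict(form='tied', partner='hid2', activation=<hid2 activation>),
    #      dict(form='tied', partner='hid1', activation='linear')]
    #
    # i.e. a [784, 256, 64, 256, 784] autoencoder whose decoding layers share
    # weights with the original encoding layers.
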
    def __init__(self, algo, network):
        self.algo = algo
        self.network = network

    def itertrain(self, train, valid=None, **kwargs):
        '''Train a model using a training and validation set.

        This method yields a series of monitor values to the caller. After
        every iteration, a pair of monitor dictionaries is generated: one
        evaluated on the training dataset, and another evaluated on the
        validation dataset. The validation monitors might not be updated during
        every training iteration; in this case, the most recent validation
        monitors will be yielded along with the training monitors.

        Parameters
        ----------
        train : :class:`Dataset <theanets.dataset.Dataset>`
            A set of training data for computing updates to model parameters.
        valid : :class:`Dataset <theanets.dataset.Dataset>`
            A set of validation data for computing monitor values and
            determining when the loss has stopped improving.

        Yields
        ------
        training : dict
            A dictionary mapping monitor names to values, evaluated on the
            training dataset.
        validation : dict
            A dictionary containing monitor values evaluated on the validation
            dataset.
        '''
        from . import feedforward

        original_layer_names = set(l.name for l in self.network.layers[:-1])

        # construct a "shadow" of the input network, using the original
        # network's encoding layers, with tied weights in an autoencoder
        # configuration.
        layers_ = list(l.to_spec() for l in self.network.layers[:-1])
        for i, l in enumerate(layers_[::-1][:-2]):
            layers_.append(dict(
                form='tied', partner=l['name'], activation=l['activation']))
        layers_.append(dict(
            form='tied', partner=layers_[1]['name'], activation='linear'))

        util.log('creating shadow network')
        ae = feedforward.Autoencoder(layers=layers_)

        # train the autoencoder using the supervised layerwise pretrainer.
        pre = SupervisedPretrainer(self.algo, ae)
        for monitors in pre.itertrain(train, valid, **kwargs):
            yield monitors

        # copy trained parameter values back to our original network.
        for param in ae.params:
            l, p = param.name.split('.')
            if l in original_layer_names:
                util.log('copying pretrained parameter {}', param.name)
                self.network.find(l, p).set_value(param.get_value())

        util.log('completed unsupervised pretraining')
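

# End-to-end usage sketch (hypothetical layer sizes; assumes ``X`` and ``Y``
# are numpy arrays of inputs and integer class labels): pretrain a
# classifier's encoding layers as an autoencoder, then fine-tune with labels.
#
#     import theanets
#     net = theanets.Classifier([784, 256, 64, 10])
#     for tm, vm in UnsupervisedPretrainer('rmsprop', net).itertrain([X]):
#         print('autoencoder loss', tm['loss'])
#     net.train([X, Y], algo='nag')    # supervised fine-tuning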