Source code for theanets.trainer

# -*- coding: utf-8 -*-

'''This module contains optimization methods for neural networks.

Many optimization methods are general-purpose optimization routines that happen
to be pretty good for training neural networks; these are provided by
``downhill``. The other methods here --- :class:`SampleTrainer`,
:class:`SupervisedPretrainer`, and :class:`UnsupervisedPretrainer` --- are more
specific to neural networks, often taking advantage of the layered structure of
many common network architectures.
'''

import downhill
import itertools
import numpy as np

from . import layers
from . import util
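
# A minimal usage sketch (comments only, not part of the module): the trainers
# defined here are normally not constructed directly. A model built with
# theanets is trained through its ``train``/``itertrain`` methods, which pick
# a trainer by algorithm name. ``net``, ``X``, and ``Y`` are placeholders for
# a constructed model and numpy training arrays:
#
#     net.train([X, Y], algo='rmsprop')      # downhill-backed optimization
#     for tm, vm in net.itertrain([X, Y]):   # same thing, one step at a time
#         print(tm['loss'])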


class DownhillTrainer(object):
    '''Wrapper for using trainers from ``downhill``.'''

    def __init__(self, algo, network):
        self.algo = algo
        self.network = network

    def itertrain(self, train, valid=None, **kwargs):
        '''Train a model using a training and validation set.

        This method yields a series of monitor values to the caller. After
        every iteration, a pair of monitor dictionaries is generated: one
        evaluated on the training dataset, and another evaluated on the
        validation dataset. The validation monitors might not be updated during
        every training iteration; in this case, the most recent validation
        monitors will be yielded along with the training monitors.

        Parameters
        ----------
        train : :class:`Dataset <theanets.dataset.Dataset>`
            A set of training data for computing updates to model parameters.
        valid : :class:`Dataset <theanets.dataset.Dataset>`
            A set of validation data for computing monitor values and
            determining when the loss has stopped improving.

        Yields
        ------
        training : dict
            A dictionary mapping monitor names to values, evaluated on the
            training dataset.
        validation : dict
            A dictionary containing monitor values evaluated on the validation
            dataset.
        '''
        for monitors in downhill.build(
                algo=self.algo,
                loss=self.network.loss(**kwargs),
                updates=self.network.updates(**kwargs),
                monitors=self.network.monitors(**kwargs),
                inputs=self.network.variables,
                params=self.network.params,
                monitor_gradients=kwargs.get('monitor_gradients', False),
        ).iterate(train, valid=valid, **kwargs):
            yield monitors


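# Sketch of using DownhillTrainer directly (assumes ``net`` is a constructed
# theanets model and ``train_set``/``valid_set`` are data that downhill
# accepts, e.g. lists of numpy arrays); ``'nag'`` is one of downhill's
# algorithm names:
#
#     trainer = DownhillTrainer('nag', net)
#     for tm, vm in trainer.itertrain(train_set, valid=valid_set):
#         print('train loss', tm['loss'], '- valid loss', vm['loss'])

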
class SampleTrainer(object):
    '''This trainer replaces network weights with samples from the input.'''

    @staticmethod
    def reservoir(xs, n, rng):
        '''Select a random sample of n items from xs.'''
        pool = []
        for i, x in enumerate(xs):
            if len(pool) < n:
                pool.append(x / np.linalg.norm(x))
                continue
            j = rng.randint(i + 1)
            if j < n:
                pool[j] = x / np.linalg.norm(x)
        # if the pool still has fewer than n items, pad with distorted random
        # duplicates from the source data.
        L = len(pool)
        S = np.std(pool, axis=0)
        while len(pool) < n:
            x = pool[rng.randint(L)]
            pool.append(x + S * rng.randn(*x.shape))
        return np.array(pool, dtype=pool[0].dtype)

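    # Self-contained example of the reservoir helper (numpy only): draw 4
    # unit-normalized rows from a stream of 100 random 8-dimensional vectors.
    #
    #     rng = np.random.RandomState(13)
    #     sample = SampleTrainer.reservoir(rng.randn(100, 8), 4, rng)
    #     sample.shape    # -> (4, 8); each row has unit L2 norm
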
    def __init__(self, network):
        self.network = network

    def itertrain(self, train, valid=None, **kwargs):
        '''Train a model using a training and validation set.

        This method yields a series of monitor values to the caller. After
        every iteration, a pair of monitor dictionaries is generated: one
        evaluated on the training dataset, and another evaluated on the
        validation dataset. The validation monitors might not be updated during
        every training iteration; in this case, the most recent validation
        monitors will be yielded along with the training monitors.

        Parameters
        ----------
        train : :class:`Dataset <theanets.dataset.Dataset>`
            A set of training data for computing updates to model parameters.
        valid : :class:`Dataset <theanets.dataset.Dataset>`
            A set of validation data for computing monitor values and
            determining when the loss has stopped improving.

        Yields
        ------
        training : dict
            A dictionary mapping monitor names to values, evaluated on the
            training dataset.
        validation : dict
            A dictionary containing monitor values evaluated on the validation
            dataset.
        '''
        ifci = itertools.chain.from_iterable

        def first(x):
            return x[0] if isinstance(x, (tuple, list)) else x

        def last(x):
            return x[-1] if isinstance(x, (tuple, list)) else x

        odim = idim = None
        for t in train:
            idim = first(t).shape[-1]
            odim = last(t).shape[-1]

        rng = kwargs.get('rng')
        if rng is None or isinstance(rng, int):
            rng = np.random.RandomState(rng)

        # set output (decoding) weights on the network.
        samples = ifci(last(t) for t in train)
        for param in self.network.layers[-1].params:
            shape = param.get_value(borrow=True).shape
            if len(shape) == 2 and shape[1] == odim:
                arr = np.vstack(SampleTrainer.reservoir(samples, shape[0], rng))
                util.log('setting {}: {}', param.name, shape)
                param.set_value(arr / np.sqrt((arr * arr).sum(axis=1))[:, None])

        # set input (encoding) weights on the network.
        samples = ifci(first(t) for t in train)
        for i, layer in enumerate(self.network.layers):
            for param in layer.params:
                shape = param.get_value(borrow=True).shape
                if len(shape) == 2 and shape[0] == idim:
                    arr = np.vstack(
                        SampleTrainer.reservoir(samples, shape[1], rng)).T
                    util.log('setting {}: {}', param.name, shape)
                    param.set_value(arr / np.sqrt((arr * arr).sum(axis=0)))
            samples = ifci(self.network.feed_forward(
                first(t))[i-1] for t in train)

        yield dict(loss=0), dict(loss=0)


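# Usage sketch (assumes ``net`` is a theanets model and ``X`` is a 2-D float
# array of training vectors): this trainer only seeds the model's weight
# matrices with normalized samples drawn from the data, and yields a single
# pair of dummy monitor dictionaries.
#
#     for tm, vm in SampleTrainer(net).itertrain([X]):
#         pass    # weights are now initialized from training samples

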
class SupervisedPretrainer(object):
    '''This trainer adapts parameters using a supervised pretraining approach.

    In this variant, we create "taps" at increasing depths into the original
    network weights, training only those weights that are below the tap. So,
    for a hypothetical binary classifier network with layers [3, 4, 5, 6, 2],
    we would first insert a tap after the first hidden layer (effectively a
    binary classifier in a [3, 4, (2)] configuration, where (2) indicates that
    the corresponding layer is the tap, not present in the original) and train
    just that network. Then we insert a tap at the next layer (effectively
    training a [3, 4, 5, (2)] classifier, re-using the trained weights for the
    3 x 4 layer), and so forth. When we get to training the last layer, i.e.,
    [3, 4, 5, 6, 2], then we just train all of the layers in the original
    network.

    For autoencoder networks with tied weights, consider an example with
    layers [3, 4, 5, 6, 5', 4', 3'], where the prime indicates that the layer
    is tied. In cases like this, we train the "outermost" pair of layers
    first, then add the next pair of layers inward, etc. The training for our
    example would start with [3, 4, 3'], then proceed to [3, 4, 5, 4', 3'],
    and then finish by training all the layers in the original network.

    By using layers from the original network whenever possible, we preserve
    all of the relevant settings of noise, dropouts, loss function, and the
    like, in addition to removing the need for copying trained weights around
    between different :class:`Network <theanets.graph.Network>` instances.

    References
    ----------
    .. [Ben06] Y. Bengio, P. Lamblin, D. Popovici, & H. Larochelle. (NIPS
       2006) "Greedy Layer-Wise Training of Deep Networks"
       http://machinelearning.wustl.edu/mlpapers/paper_files/NIPS2006_739.pdf

       The appendix also contains pseudocode for the approaches:
       http://www.iro.umontreal.ca/~pift6266/A06/refs/appendix_dbn_supervised.pdf
    '''

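    # A toy sketch of the layer-size progression described above, for an
    # untied [3, 4, 5, 6, 2] classifier; it mirrors the loop in ``itertrain``
    # below, where the trailing 2 in the first two stages is the temporary
    # 'lwout' tap layer:
    #
    #     sizes = [3, 4, 5, 6, 2]
    #     L = len(sizes) - 1
    #     for i in range(1, L):
    #         print(sizes if i == L - 1 else sizes[:i + 1] + [sizes[-1]])
    #     # -> [3, 4, 2], then [3, 4, 5, 2], then [3, 4, 5, 6, 2]
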
    def __init__(self, algo, network):
        self.algo = algo
        self.network = network

    def itertrain(self, train, valid=None, **kwargs):
        '''Train a model using a training and validation set.

        This method yields a series of monitor values to the caller. After
        every iteration, a pair of monitor dictionaries is generated: one
        evaluated on the training dataset, and another evaluated on the
        validation dataset. The validation monitors might not be updated during
        every training iteration; in this case, the most recent validation
        monitors will be yielded along with the training monitors.

        Parameters
        ----------
        train : :class:`Dataset <theanets.dataset.Dataset>`
            A set of training data for computing updates to model parameters.
        valid : :class:`Dataset <theanets.dataset.Dataset>`
            A set of validation data for computing monitor values and
            determining when the loss has stopped improving.

        Yields
        ------
        training : dict
            A dictionary mapping monitor names to values, evaluated on the
            training dataset.
        validation : dict
            A dictionary containing monitor values evaluated on the validation
            dataset.
        '''
        net = self.network
        original = list(net.layers)
        output_name = original[-1].output_name
        tied = any(isinstance(l, layers.Tied) for l in original)
        L = 1 + len(original) // 2 if tied else len(original) - 1
        for i in range(1, L):
            tail = []
            if i == L - 1:
                net.layers = original
            elif tied:
                net.layers = original[:i+1]
                for j in range(i):
                    prev = tail[-1] if tail else net.layers[-1]
                    tail.append(layers.Layer.build(
                        'tied', partner=original[i-j].name, inputs=prev.name))
                net.layers = original[:i+1] + tail
            else:
                tail.append(layers.Layer.build(
                    'feedforward',
                    name='lwout',
                    inputs=original[i].output_name,
                    size=original[-1].output_size,
                    activation=original[-1].kwargs['activation']))
                net.layers = original[:i+1] + tail
            util.log('layerwise: training {}',
                     ' -> '.join(l.name for l in net.layers))
            [l.bind(net, initialize=False) for l in net.layers]
            [l.setup() for l in tail]
            net.losses[0].output_name = net.layers[-1].output_name
            trainer = DownhillTrainer(self.algo, net)
            for monitors in trainer.itertrain(train, valid, **kwargs):
                yield monitors
        net.layers = original
        net.losses[0].output_name = output_name


class UnsupervisedPretrainer(object):
    '''Train a classification model using an unsupervised pre-training step.

    This trainer is a bit of glue code that creates a "shadow" autoencoder
    based on a current network model, trains the autoencoder, and then
    transfers the trained weights back to the original model.

    This code is intended mostly as a proof-of-concept to demonstrate how
    shadow networks can be created, and how trainers can call other trainers
    for lots of different types of training regimens.
    '''

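    # Illustration of the shadow construction in ``itertrain`` below (layer
    # names and sizes are hypothetical): for a classifier whose encoding
    # layers are 'in' (784), 'hid1' (256), and 'hid2' (64), the shadow spec is
    #
    #     [<spec of 'in'>, <spec of 'hid1'>, <spec of 'hid2'>,
    #      dict(form='tied', partner='hid2', activation=<hid2 activation>),
    #      dict(form='tied', partner='hid1', activation='linear')]
    #
    # i.e. a [784, 256, 64, 256, 784] autoencoder whose decoding layers share
    # weights with the original encoding layers.
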
    def __init__(self, algo, network):
        self.algo = algo
        self.network = network

    def itertrain(self, train, valid=None, **kwargs):
        '''Train a model using a training and validation set.

        This method yields a series of monitor values to the caller. After
        every iteration, a pair of monitor dictionaries is generated: one
        evaluated on the training dataset, and another evaluated on the
        validation dataset. The validation monitors might not be updated during
        every training iteration; in this case, the most recent validation
        monitors will be yielded along with the training monitors.

        Parameters
        ----------
        train : :class:`Dataset <theanets.dataset.Dataset>`
            A set of training data for computing updates to model parameters.
        valid : :class:`Dataset <theanets.dataset.Dataset>`
            A set of validation data for computing monitor values and
            determining when the loss has stopped improving.

        Yields
        ------
        training : dict
            A dictionary mapping monitor names to values, evaluated on the
            training dataset.
        validation : dict
            A dictionary containing monitor values evaluated on the validation
            dataset.
        '''
        from . import feedforward

        original_layer_names = set(l.name for l in self.network.layers[:-1])

        # construct a "shadow" of the input network, using the original
        # network's encoding layers, with tied weights in an autoencoder
        # configuration.
        layers_ = list(l.to_spec() for l in self.network.layers[:-1])
        for i, l in enumerate(layers_[::-1][:-2]):
            layers_.append(dict(
                form='tied', partner=l['name'], activation=l['activation']))
        layers_.append(dict(
            form='tied', partner=layers_[1]['name'], activation='linear'))

        util.log('creating shadow network')
        ae = feedforward.Autoencoder(layers=layers_)

        # train the autoencoder using the supervised layerwise pretrainer.
        pre = SupervisedPretrainer(self.algo, ae)
        for monitors in pre.itertrain(train, valid, **kwargs):
            yield monitors

        # copy trained parameter values back to our original network.
        for param in ae.params:
            l, p = param.name.split('.')
            if l in original_layer_names:
                util.log('copying pretrained parameter {}', param.name)
                self.network.find(l, p).set_value(param.get_value())

        util.log('completed unsupervised pretraining')
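

# End-to-end usage sketch (hypothetical layer sizes; assumes ``X`` and ``Y``
# are numpy arrays of inputs and integer class labels): pretrain a
# classifier's encoding layers as an autoencoder, then fine-tune with labels.
#
#     import theanets
#     net = theanets.Classifier([784, 256, 64, 10])
#     for tm, vm in UnsupervisedPretrainer('rmsprop', net).itertrain([X]):
#         print('autoencoder loss', tm['loss'])
#     net.train([X, Y], algo='nag')    # supervised fine-tuning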