# Source code for theanets.activations

# -*- coding: utf-8 -*-

r'''Activation functions for network layers.

Activation functions are normally constructed using the :func:`build` function.
Commonly available functions are:

- "linear"
- "logistic" (or "sigmoid")
- "tanh"
- "softmax" (typically used for :class:`classifier <theanets.feedforward.Classifier>`
  output layers)
- "relu" (or "rect:max")
- "rect:min"
- "rect:minmax"
- "softplus" (continuous approximation of "relu")
- "norm:mean": subtractive (mean) batch normalization
- "norm:max": divisive (max) batch normalization
- "norm:std": divisive (standard deviation) batch normalization
- "norm:z": z-score batch normalization

Additionally, the names of all classes defined in this module can be used as
keys when building an activation function.
'''

import functools
import numpy as np
import theano
import theano.tensor as TT

from . import util


def _identity(x):
    '''Return the input unchanged; this implements the "linear" activation.'''
    return x


def _relu(x):
    '''Rectify the input from below at zero.

    The result equals ``max(x, 0)`` elementwise, expressed using only
    addition, ``abs`` and a division by two.
    '''
    magnitude = abs(x)
    return (x + magnitude) / 2


def _trel(x):
    '''Truncate the input from above at one.

    The result equals ``min(x, 1)`` elementwise, expressed using only
    addition, ``abs`` and a division by two.
    '''
    distance = abs(x - 1)
    return (x + 1 - distance) / 2


def _rect(x):
    '''Clip the input to the interval [0, 1].

    Negative values map to zero, values above one map to one, and values
    in between pass through unchanged.
    '''
    magnitude = abs(x)
    distance = abs(x - 1)
    return (magnitude + 1 - distance) / 2


def _norm_mean(x):
    '''Subtract from the input its mean taken along the last axis.'''
    center = x.mean(axis=-1, keepdims=True)
    return x - center


def _norm_max(x):
    '''Divide the input by its largest absolute value along the last axis.

    A small constant (1e-8) is added to the divisor so that an all-zero
    input does not cause a division by zero.
    '''
    peak = abs(x).max(axis=-1, keepdims=True)
    return x / (peak + 1e-8)


def _norm_std(x):
    '''Divide the input by its standard deviation along the last axis.

    A small constant (1e-8) is added to the divisor so that a constant
    input does not cause a division by zero.
    '''
    spread = x.std(axis=-1, keepdims=True)
    return x / (spread + 1e-8)


def _norm_z(x):
    '''Compute z-scores of the input along the last axis.

    The mean along the last axis is subtracted, and the result is divided
    by the standard deviation along that axis plus a small constant (1e-8)
    that guards against division by zero.
    '''
    centered = x - x.mean(axis=-1, keepdims=True)
    spread = x.std(axis=-1, keepdims=True) + 1e-8
    return centered / spread


def _softmax(x):
    '''Compute a softmax over the last axis of the input.

    The maximum along the last axis is subtracted before exponentiating,
    which leaves the result unchanged mathematically but keeps the
    exponentials from overflowing.
    '''
    shifted = x - x.max(axis=-1, keepdims=True)
    exps = TT.exp(shifted)
    total = exps.sum(axis=-1, keepdims=True)
    return exps / total


# Lookup table from activation name to a parameter-free callable. Several
# names are aliases that map to the very same callable object.
COMMON = {
    # Sigmoidal squashing functions.
    'tanh': TT.tanh,
    'logistic': TT.nnet.sigmoid,
    'sigmoid': TT.nnet.sigmoid,

    # Softmax, typically used on classifier outputs.
    'softmax': _softmax,

    # Linear and piecewise-linear functions, plus the smooth softplus.
    'linear': _identity,
    'softplus': TT.nnet.softplus,
    'relu': _relu,
    'rect:max': _relu,
    'rect:min': _trel,
    'rect:minmax': _rect,

    # Normalizations computed along the last axis.
    'norm:mean': _norm_mean,
    'norm:max': _norm_max,
    'norm:std': _norm_std,
    'norm:z': _norm_z,
}


def build(name, layer, **kwargs):
    '''Construct an activation function by name.

    Three kinds of names are understood:

    - A key of ``COMMON`` (for example ``'relu'``) yields the corresponding
      parameter-free callable, tagged with a ``name`` attribute and an empty
      ``params`` list.
    - Several names joined by ``'+'`` (for example ``'relu+tanh'``) are built
      one by one and chained with :class:`Compose`; the leftmost activation
      is applied first.
    - Any other name is handed to ``Activation.build``. A name of the form
      ``'maxout:k'`` is split at the colon and ``k`` is passed along as the
      integer keyword argument ``pieces``.

    Parameters
    ----------
    name : str or :class:`Activation`
        The name of the type of activation function to build, or an
        already-created instance of an activation function.
    layer : :class:`theanets.layers.Layer`
        The layer to which this activation will be applied.
    kwargs : dict
        Additional named arguments to pass to the activation constructor.

    Returns
    -------
    activation : callable
        A neural network activation function. This is an :class:`Activation`
        instance, except for names found in ``COMMON``, which produce a plain
        callable carrying ``name`` and ``params`` attributes.

    Raises
    ------
    ValueError
        If a ``'maxout:k'`` name is given and ``k`` is not an integer.
    '''
    if isinstance(name, Activation):
        return name

    if '+' in name:
        parts = (build(n, layer, **kwargs) for n in name.split('+'))
        return functools.reduce(Compose, parts)

    act = COMMON.get(name)
    if act is not None:
        # NOTE: the callables in COMMON are shared module-level objects and
        # some names alias the same object (e.g. 'relu' and 'rect:max'), so
        # the attributes set here are visible to every user of that callable
        # and reflect whichever name was built most recently.
        act.name = name
        act.params = []
        return act

    if name.lower().startswith('maxout') and ':' in name:
        name, pieces = name.split(':', 1)
        kwargs['pieces'] = int(pieces)
    kwargs['name'] = name
    kwargs['layer'] = layer
    return Activation.build(name, **kwargs)


class Activation(util.Registrar(str('Base'), (), {})):
    '''An activation function for a neural network layer.

    Subclasses implement :meth:`__call__` to map a symbolic input to a
    symbolic output, and append any learnable shared variables they create
    to ``params``.

    Parameters
    ----------
    name : str
        Name of this activation function.
    layer : :class:`Layer`
        The layer to which this function is applied.
    kwargs : dict
        Additional named arguments; stored unchanged on the instance.

    Attributes
    ----------
    name : str
        Name of this activation function.
    layer : :class:`Layer`
        The layer to which this function is applied.
    kwargs : dict
        The additional named arguments given at construction time.
    params : list
        Parameters owned by this activation; empty in the base class.
    '''

    def __init__(self, name, layer, **kwargs):
        self.name = name
        self.layer = layer
        self.kwargs = kwargs
        self.params = []

    def __call__(self, x):
        '''Compute a symbolic expression for this activation function.

        Parameters
        ----------
        x : Theano expression
            A Theano expression representing the input to this activation
            function.

        Returns
        -------
        y : Theano expression
            A Theano expression representing the output from this activation
            function.

        Raises
        ------
        NotImplementedError
            Always, in this base class; subclasses must override this method.
        '''
        raise NotImplementedError


class Compose(Activation):
    r'''Compose two activation functions.

    Calling the composition on ``x`` evaluates ``g(f(x))``: ``f`` is applied
    first and ``g`` second.

    Parameters
    ----------
    f : callable
        The inner activation, applied directly to the input.
    g : callable
        The outer activation, applied to the output of ``f``.

    Attributes
    ----------
    name : str
        Formatted as ``'<g.name>(<f.name>)'``.
    layer : None
        A composition is not bound to a layer of its own.
    kwargs : dict
        Always empty.
    params : list
        The ``params`` of ``g`` followed by those of ``f``; an activation
        lacking a ``params`` attribute contributes nothing.
    '''

    def __init__(self, f, g):
        # The base-class initializer is deliberately not invoked; every
        # attribute it would set is assigned here instead.
        self.f, self.g = f, g
        self.name = '{}({})'.format(g.name, f.name)
        self.layer = None
        self.kwargs = {}
        outer_params = getattr(g, 'params', [])
        inner_params = getattr(f, 'params', [])
        self.params = outer_params + inner_params

    def __call__(self, x):
        inner = self.f(x)
        return self.g(inner)


class Prelu(Activation):
    r'''Parametric rectified linear activation with learnable leak rate.

    This activation is characterized by two linear pieces joined at the origin.
    For negative inputs, the unit response is a linear function of the input
    with slope :math:`r` (the "leak rate"). For positive inputs, the unit
    response is the identity function:

    .. math::
       f(x) = \left\{ \begin{eqnarray*} rx &\qquad& \mbox{if } x < 0 \\
                      x &\qquad& \mbox{otherwise} \end{eqnarray*} \right.

    This activation allocates a separate leak rate for each unit in its layer.

    The learnable parameter ``leak`` is passed through an exponential before
    it is used, so the slope applied to negative inputs is
    :math:`r = \exp(\mbox{leak})`, which is always positive.

    Attributes
    ----------
    leak : Theano shared variable
        One value per layer output unit, initialized to 0.1 times the absolute
        value of a draw from the layer's random number generator.

    References
    ----------
    K He, X Zhang, S Ren, J Sun (2015), "Delving Deep into Rectifiers:
    Surpassing Human-Level Performance on ImageNet Classification"
    http://arxiv.org/abs/1502.01852
    '''

    __extra_registration_keys__ = ['leaky-relu']

    def __init__(self, *args, **kwargs):
        super(Prelu, self).__init__(*args, **kwargs)
        arr = self.layer.rng.randn(self.layer.output_size).astype(util.FLOAT)
        self.leak = theano.shared(0.1 * abs(arr), name=self.layer._fmt('leak'))
        self.params.append(self.leak)

    def __call__(self, x):
        # (x + |x|) / 2 keeps the positive part of x and (x - |x|) / 2 keeps
        # the negative part; only the latter is scaled by the leak slope.
        positive = (x + abs(x)) / 2
        negative = (x - abs(x)) / 2
        return positive + TT.exp(self.leak) * negative


class LGrelu(Activation):
    r'''Rectified linear activation with learnable leak rate and gain.

    This activation is characterized by two linear pieces joined at the origin.
    For negative inputs, the unit response is a linear function of the input
    with slope :math:`r` (the "leak rate"). For positive inputs, the unit
    response is a different linear function of the input with slope :math:`g`
    (the "gain"):

    .. math::
       f(x) = \left\{ \begin{eqnarray*} rx &\qquad& \mbox{if } x < 0 \\
                       gx &\qquad& \mbox{otherwise} \end{eqnarray*} \right.

    This activation allocates a separate leak and gain rate for each unit in its
    layer.

    Both learnable parameters are passed through an exponential before they
    are used, so the slopes actually applied are
    :math:`r = \exp(\mbox{leak})` and :math:`g = \exp(\mbox{gain})`, which are
    always positive.

    Attributes
    ----------
    gain : Theano shared variable
        One value per layer output unit, initialized to 0.1 times the absolute
        value of a draw from the layer's random number generator.
    leak : Theano shared variable
        One value per layer output unit, initialized the same way as ``gain``
        from a separate draw.
    '''

    __extra_registration_keys__ = ['leaky-gain-relu']

    def __init__(self, *args, **kwargs):
        super(LGrelu, self).__init__(*args, **kwargs)
        arr = self.layer.rng.randn(self.layer.output_size).astype(util.FLOAT)
        self.gain = theano.shared(0.1 * abs(arr), name=self.layer._fmt('gain'))
        self.params.append(self.gain)
        arr = self.layer.rng.randn(self.layer.output_size).astype(util.FLOAT)
        self.leak = theano.shared(0.1 * abs(arr), name=self.layer._fmt('leak'))
        self.params.append(self.leak)

    def __call__(self, x):
        # (x + |x|) / 2 keeps the positive part of x and (x - |x|) / 2 keeps
        # the negative part; each part is scaled by its own slope.
        return (TT.exp(self.gain) * (x + abs(x)) / 2 +
                TT.exp(self.leak) * (x - abs(x)) / 2)


class Elu(Activation):
    r'''Exponential linear activation with learnable gain.

    This activation is characterized by two pieces joined at the origin. For
    negative inputs, the unit response is a decaying exponential function of the
    input with saturation :math:`\alpha`. For positive inputs, the unit response
    is the identity linear function of the input:

    .. math::
       f(x) = \left\{ \begin{eqnarray*} \alpha (\exp(x) - 1) &\qquad& \mbox{if } x < 0 \\
                       x &\qquad& \mbox{otherwise} \end{eqnarray*} \right.

    This activation allocates a separate gain for each unit in its layer.

    The learnable parameter ``gain`` is passed through an exponential before
    it is used, so the saturation is :math:`\alpha = \exp(\mbox{gain})`, which
    is always positive.

    Attributes
    ----------
    gain : Theano shared variable
        One value per layer output unit, initialized to 0.1 times the absolute
        value of a draw from the layer's random number generator.
    '''

    __extra_registration_keys__ = []

    def __init__(self, *args, **kwargs):
        super(Elu, self).__init__(*args, **kwargs)
        arr = self.layer.rng.randn(self.layer.output_size).astype(util.FLOAT)
        self.gain = theano.shared(0.1 * abs(arr), name=self.layer._fmt('gain'))
        self.params.append(self.gain)

    def __call__(self, x):
        # Clamp the exponent's argument at zero instead of masking the result
        # with (x < 0): for large positive x, exp(x) overflows to inf, and
        # inf * 0 would put NaN into both the output and the gradient. With
        # the clamp the exponential term is exactly zero whenever x >= 0.
        saturating = TT.exp(TT.minimum(x, 0)) - 1
        return x * (x >= 0) + TT.exp(self.gain) * saturating


class Maxout(Activation):
    r'''Arbitrary piecewise linear activation.

    This activation is unusual in that it requires a parameter at initialization
    time: the number of linear pieces to use. Consider a layer for the moment
    with just one unit. A maxout activation with :math:`k` pieces uses a slope
    :math:`m_k` and an intercept :math:`b_k` for each linear piece. It then
    transforms the input to the maximum of all of the pieces:

    .. math::
       f(x) = \max_k m_k x + b_k

    The parameters :math:`m_k` and :math:`b_k` are learnable.

    For layers with more than one unit, the maxout activation allocates a slope
    :math:`m_{ki}` and intercept :math:`b_{ki}` for each unit :math:`i` and each
    piece :math:`k`. The activation for unit :math:`x_i` is:

    .. math::
       f(x_i) = \max_k m_{ki} x_i + b_{ki}

    Again, the slope and intercept parameters are learnable.

    This activation is actually a generalization of the rectified linear
    activations; to see how, just allocate 2 pieces and set the intercepts to 0.
    The slopes of the ``relu`` activation are given by :math:`m = (0, 1)`, those
    of the :class:`Prelu` function are given by :math:`m = (r, 1)`, and those of
    the :class:`LGrelu` are given by :math:`m = (r, g)` where :math:`r` is the
    leak rate parameter and :math:`g` is a gain parameter.

    .. note::

       To use this activation in a network layer specification, provide an
       activation string of the form ``'maxout:k'``, where ``k`` is an integer
       giving the number of piecewise functions.

       For example, the layer tuple ``(100, 'rnn', 'maxout:10')`` specifies a
       vanilla :class:`RNN <theanets.layers.recurrent.RNN>` layer with 100 units
       and a maxout activation with 10 pieces.

    Parameters
    ----------
    pieces : int
        Number of linear pieces to use in the activation. This keyword
        argument is required.

    Attributes
    ----------
    pieces : int
        Number of linear pieces.
    slope : Theano shared variable
        Slopes, with one row per layer output unit and one column per piece,
        drawn from the layer's random number generator.
    intercept : Theano shared variable
        Intercepts, laid out and initialized like ``slope`` from a separate
        draw.

    Raises
    ------
    KeyError
        If the ``pieces`` keyword argument is not supplied.
    '''

    def __init__(self, *args, **kwargs):
        super(Maxout, self).__init__(*args, **kwargs)

        self.pieces = kwargs['pieces']

        m = self.layer.rng.randn(self.layer.output_size, self.pieces).astype(util.FLOAT)
        self.slope = theano.shared(m, name=self.layer._fmt('slope'))
        self.params.append(self.slope)

        b = self.layer.rng.randn(self.layer.output_size, self.pieces).astype(util.FLOAT)
        self.intercept = theano.shared(b, name=self.layer._fmt('intercept'))
        self.params.append(self.intercept)

    def __call__(self, x):
        # Append a broadcastable trailing axis to x so that every unit's input
        # is combined with each of its pieces, then keep the largest response
        # across that trailing (pieces) axis.
        dims = list(range(x.ndim)) + ['x']
        return (x.dimshuffle(*dims) * self.slope + self.intercept).max(axis=-1)