# -*- coding: utf-8 -*-
r'''Activation functions for network layers.
Activation functions are normally constructed using the :func:`build` function.
Commonly available functions are:
- "linear"
- "logistic" (or "sigmoid")
- "tanh"
- "softmax" (typically used for :class:`classifier <theanets.feedforward.Classifier>`
output layers)
- "relu" (or "rect:max")
- "rect:min"
- "rect:minmax"
- "softplus" (continuous approximation of "relu")
- "norm:mean": subtractive (mean) batch normalization
- "norm:max": divisive (max) batch normalization
- "norm:std": divisive (standard deviation) batch normalization
- "norm:z": z-score batch normalization
Additionally, the names of all classes defined in this module can be used as
keys when building an activation function.
'''
import functools
import numpy as np
import theano
import theano.tensor as TT
from . import util
def _identity(x):
    '''Return the input unchanged (the "linear" activation).'''
    return x
def _relu(x):
    '''Rectify ``x`` from below at zero.

    Written with ``abs`` rather than a comparison: ``(x + |x|) / 2`` is
    algebraically ``max(x, 0)``.
    '''
    magnitude = abs(x)
    return (x + magnitude) / 2
def _trel(x):
    '''Truncate ``x`` from above at one.

    ``(x + 1 - |x - 1|) / 2`` is algebraically ``min(x, 1)``.
    '''
    distance_from_one = abs(x - 1)
    return (x + 1 - distance_from_one) / 2
def _rect(x):
    '''Clip ``x`` to the interval [0, 1].

    ``(|x| + 1 - |x - 1|) / 2`` is zero for negative inputs, the identity
    between zero and one, and one for inputs above one.
    '''
    magnitude = abs(x)
    distance_from_one = abs(x - 1)
    return (magnitude + 1 - distance_from_one) / 2
def _norm_mean(x):
    '''Subtract from ``x`` its mean taken along the last axis.'''
    center = x.mean(axis=-1, keepdims=True)
    return x - center
def _norm_max(x):
    '''Divide ``x`` by its largest absolute value along the last axis.

    A small constant (1e-8) is added to the divisor so that an all-zero
    row does not divide by zero.
    '''
    peak = abs(x).max(axis=-1, keepdims=True)
    return x / (peak + 1e-8)
def _norm_std(x):
    '''Divide ``x`` by its standard deviation along the last axis.

    A small constant (1e-8) is added to the divisor so that a constant row
    does not divide by zero.
    '''
    spread = x.std(axis=-1, keepdims=True)
    return x / (spread + 1e-8)
def _norm_z(x):
    '''Z-score ``x`` along the last axis.

    The mean along the last axis is subtracted, then the result is divided
    by the standard deviation along that axis (plus 1e-8, which keeps the
    divisor non-zero for constant rows).
    '''
    centered = x - x.mean(axis=-1, keepdims=True)
    spread = x.std(axis=-1, keepdims=True) + 1e-8
    return centered / spread
def _softmax(x):
    '''Compute a softmax over the last axis of ``x``.

    The maximum along the last axis is subtracted before exponentiating,
    so every exponent argument is at most zero; this leaves the result
    unchanged mathematically while avoiding overflow in ``exp``.
    '''
    peak = x.max(axis=-1, keepdims=True)
    unnormalized = TT.exp(x - peak)
    total = unnormalized.sum(axis=-1, keepdims=True)
    return unnormalized / total
# Lookup table of parameter-free activations, keyed by activation name.
# Several keys are aliases for the same callable ('logistic'/'sigmoid' and
# 'relu'/'rect:max'), so those entries share a single underlying object.
COMMON = {
    # s-shaped
    'tanh': TT.tanh,
    'logistic': TT.nnet.sigmoid,
    'sigmoid': TT.nnet.sigmoid,
    # softmax (typically for classification)
    'softmax': _softmax,
    # linear variants
    'linear': _identity,
    'softplus': TT.nnet.softplus,
    'relu': _relu,  # max(x, 0)
    'rect:max': _relu,  # alias of 'relu'
    'rect:min': _trel,  # min(x, 1)
    'rect:minmax': _rect,  # clip to [0, 1]
    # batch normalization (each statistic is taken along the last axis)
    'norm:mean': _norm_mean,
    'norm:max': _norm_max,
    'norm:std': _norm_std,
    'norm:z': _norm_z,
}
def build(name, layer, **kwargs):
    '''Construct an activation function by name.

    Parameters
    ----------
    name : str or :class:`Activation`
        The name of the type of activation function to build, or an
        already-created instance of an activation function (which is
        returned as-is). Several names may be joined with ``'+'``, e.g.
        ``'tanh+relu'``; the named activations are then composed and
        applied from left to right.
    layer : :class:`theanets.layers.Layer`
        The layer to which this activation will be applied.
    kwargs : dict
        Additional named arguments to pass to the activation constructor.
        When ``name`` is a ``'+'`` composition, every component receives
        the same keyword arguments.

    Returns
    -------
    activation : :class:`Activation`
        A neural network activation function instance. For names found in
        :data:`COMMON` the result is a lightweight callable carrying
        ``name`` and ``params`` attributes rather than an
        :class:`Activation` subclass instance.
    '''
    if isinstance(name, Activation):
        return name

    if '+' in name:
        # reduce(Compose, [a, b, c]) -> Compose(Compose(a, b), c), and
        # Compose(f, g)(x) == g(f(x)), so pieces apply left to right.
        return functools.reduce(
            Compose, (build(n, layer, **kwargs) for n in name.split('+')))

    act = COMMON.get(name)
    if act is not None:
        # Wrap in a fresh partial before attaching attributes. Entries in
        # COMMON are shared module-level objects (some are Theano's own op
        # instances, and several keys alias the same callable), so writing
        # ``name``/``params`` onto them directly would leak state between
        # calls -- e.g. building 'rect:max' would rename an activation
        # previously built as 'relu'.
        act = functools.partial(act)
        act.name = name
        act.params = []
        return act

    # 'maxout:k' carries its piece count in the name; split it off and
    # hand it to the constructor as a keyword argument.
    if name.lower().startswith('maxout') and ':' in name:
        name, pieces = name.split(':', 1)
        kwargs['pieces'] = int(pieces)

    kwargs['name'] = name
    kwargs['layer'] = layer
    # Activation.build comes from the class's base, which is created by
    # util.Registrar (defined outside this module).
    return Activation.build(name, **kwargs)
class Activation(util.Registrar(str('Base'), (), {})):
    '''Base class for activation functions applied to a network layer.

    Subclasses override :meth:`__call__` to produce the activation's
    output expression, and may append learnable values to ``params``.

    Parameters
    ----------
    name : str
        Name of this activation function.
    layer : :class:`Layer`
        The layer to which this function is applied.
    kwargs : dict
        Any further keyword arguments; stored unchanged on the instance.

    Attributes
    ----------
    name : str
        Name of this activation function.
    layer : :class:`Layer`
        The layer to which this function is applied.
    kwargs : dict
        Extra keyword arguments given at construction time.
    params : list
        Parameters owned by this activation; empty for the base class.
    '''

    def __init__(self, name, layer, **kwargs):
        self.name, self.layer = name, layer
        self.kwargs = kwargs
        self.params = []

    def __call__(self, x):
        '''Compute a symbolic expression for this activation function.

        Parameters
        ----------
        x : Theano expression
            A Theano expression representing the input to this activation
            function.

        Returns
        -------
        y : Theano expression
            A Theano expression representing the output from this
            activation function.

        Raises
        ------
        NotImplementedError
            Always, in this base class; subclasses supply the behavior.
        '''
        raise NotImplementedError
class Compose(Activation):
    r'''Chain two activation functions into one.

    Calling the composition on ``x`` evaluates ``g(f(x))``: ``f`` is
    applied first and ``g`` second.

    Parameters
    ----------
    f : callable
        The inner activation, applied to the input.
    g : callable
        The outer activation, applied to the output of ``f``.

    Attributes
    ----------
    name : str
        ``'<g.name>(<f.name>)'``.
    layer : None
        A composition is not bound to a layer of its own.
    params : list
        The parameters of ``g`` followed by those of ``f``; a component
        without a ``params`` attribute contributes nothing.
    '''

    def __init__(self, f, g):
        # The base initializer is deliberately not called; every attribute
        # it would set is assigned here instead.
        self.f, self.g = f, g
        self.name = '{}({})'.format(g.name, f.name)
        self.layer = None
        self.kwargs = {}
        outer_params = getattr(g, 'params', [])
        inner_params = getattr(f, 'params', [])
        self.params = outer_params + inner_params

    def __call__(self, x):
        inner = self.f(x)
        return self.g(inner)
class Prelu(Activation):
    r'''Parametric rectified linear activation with learnable leak rate.

    This activation consists of two linear pieces that meet at the origin.
    Negative inputs are scaled by a slope :math:`r` (the "leak rate");
    non-negative inputs pass through unchanged:

    .. math::
        f(x) = \left\{ \begin{eqnarray*} rx &\qquad& \mbox{if } x < 0 \\
                 x &\qquad& \mbox{otherwise} \end{eqnarray*} \right.

    Each unit in the layer gets its own leak rate. The stored parameter
    ``leak`` is exponentiated when the activation is applied, so the
    effective slope is :math:`r = \exp(\mbox{leak})` and is always
    positive.

    References
    ----------
    K He, X Zhang, S Ren, J Sun (2015), "Delving Deep into Rectifiers:
    Surpassing Human-Level Performance on ImageNet Classification"
    http://arxiv.org/abs/1502.01852
    '''

    __extra_registration_keys__ = ['leaky-relu']

    def __init__(self, *args, **kwargs):
        super(Prelu, self).__init__(*args, **kwargs)
        # One value per output unit, drawn from the layer's own generator.
        noise = self.layer.rng.randn(self.layer.output_size)
        initial = 0.1 * abs(noise.astype(util.FLOAT))
        self.leak = theano.shared(initial, name=self.layer._fmt('leak'))
        self.params.append(self.leak)

    def __call__(self, x):
        magnitude = abs(x)
        # (x + |x|) / 2 keeps the positive part of x, (x - |x|) / 2 the
        # negative part; only the latter is scaled by the leak rate.
        positive_part = (x + magnitude) / 2
        negative_part = TT.exp(self.leak) * (x - magnitude) / 2
        return positive_part + negative_part
class LGrelu(Activation):
    r'''Rectified linear activation with learnable leak rate and gain.

    This activation consists of two linear pieces that meet at the origin.
    Negative inputs are scaled by a slope :math:`r` (the "leak rate") and
    non-negative inputs by a separate slope :math:`g` (the "gain"):

    .. math::
        f(x) = \left\{ \begin{eqnarray*} rx &\qquad& \mbox{if } x < 0 \\
                 gx &\qquad& \mbox{otherwise} \end{eqnarray*} \right.

    Each unit in the layer gets its own leak rate and gain. Both stored
    parameters are exponentiated when the activation is applied, so the
    effective slopes are :math:`r = \exp(\mbox{leak})` and
    :math:`g = \exp(\mbox{gain})`, and are always positive.
    '''

    __extra_registration_keys__ = ['leaky-gain-relu']

    def __init__(self, *args, **kwargs):
        super(LGrelu, self).__init__(*args, **kwargs)
        # The gain is created before the leak: this fixes both the order
        # of draws from the layer's generator and the order of ``params``.
        for label in ('gain', 'leak'):
            noise = self.layer.rng.randn(self.layer.output_size)
            initial = 0.1 * abs(noise.astype(util.FLOAT))
            variable = theano.shared(initial, name=self.layer._fmt(label))
            setattr(self, label, variable)
            self.params.append(variable)

    def __call__(self, x):
        magnitude = abs(x)
        # (x + |x|) / 2 keeps the positive part of x, (x - |x|) / 2 the
        # negative part; each side is scaled by its own slope.
        positive_part = TT.exp(self.gain) * (x + magnitude) / 2
        negative_part = TT.exp(self.leak) * (x - magnitude) / 2
        return positive_part + negative_part
class Elu(Activation):
    r'''Exponential linear activation with learnable gain.

    This activation consists of two pieces that meet at the origin. For
    negative inputs, the unit response is a decaying exponential function
    of the input with saturation :math:`\alpha`. For non-negative inputs,
    the unit response is the identity function:

    .. math::
        f(x) = \left\{ \begin{eqnarray*} \alpha (exp(x) - 1) &\qquad& \mbox{if } x < 0 \\
                 x &\qquad& \mbox{otherwise} \end{eqnarray*} \right.

    Each unit in the layer gets its own gain. The stored parameter
    ``gain`` is exponentiated when the activation is applied, so
    :math:`\alpha = \exp(\mbox{gain})` and is always positive.
    '''

    __extra_registration_keys__ = []

    def __init__(self, *args, **kwargs):
        super(Elu, self).__init__(*args, **kwargs)
        # One value per output unit, drawn from the layer's own generator.
        arr = self.layer.rng.randn(self.layer.output_size).astype(util.FLOAT)
        self.gain = theano.shared(0.1 * abs(arr), name=self.layer._fmt('gain'))
        self.params.append(self.gain)

    def __call__(self, x):
        # Both branches are evaluated for every element and then selected
        # with 0/1 masks. Clamp the exponent argument at zero first: for a
        # large positive x, exp(x) overflows to inf and inf * 0 is nan,
        # which would poison the otherwise-unused exponential branch.
        # For x < 0 the clamp is a no-op; for x >= 0 the branch becomes
        # exp(0) - 1 == 0, which the mask discards anyway.
        saturating = TT.exp(self.gain) * (TT.exp(TT.minimum(x, 0)) - 1)
        return x * (x >= 0) + saturating * (x < 0)
class Maxout(Activation):
    r'''Arbitrary piecewise linear activation.

    Unlike the other activations, this one needs a parameter when it is
    created: the number of linear pieces to use. Consider first a layer
    with a single unit. A maxout activation with :math:`k` pieces holds a
    slope :math:`m_k` and an intercept :math:`b_k` for each linear piece,
    and maps the input to the largest value among the pieces:

    .. math::
        f(x) = \max_k m_k x + b_k

    The parameters :math:`m_k` and :math:`b_k` are learnable.

    For layers with more than one unit, the activation holds a slope
    :math:`m_{ki}` and an intercept :math:`b_{ki}` for each unit :math:`i`
    and each piece :math:`k`. The activation for unit :math:`x_i` is:

    .. math::
        f(x_i) = \max_k m_{ki} x_i + b_{ki}

    Again, the slope and intercept parameters are learnable.

    This activation generalizes the rectified linear activations; to see
    how, allocate 2 pieces and set the intercepts to 0. The slopes of the
    ``relu`` activation are then :math:`m = (0, 1)`, those of
    :class:`Prelu` are :math:`m = (r, 1)`, and those of :class:`LGrelu`
    are :math:`m = (r, g)`, where :math:`r` is the leak rate parameter and
    :math:`g` is a gain parameter.

    .. note::

       To use this activation in a network layer specification, provide
       an activation string of the form ``'maxout:k'``, where ``k`` is an
       integer giving the number of piecewise functions.

       For example, the layer tuple ``(100, 'rnn', 'maxout:10')``
       specifies a vanilla :class:`RNN <theanets.layers.recurrent.RNN>`
       layer with 100 units and a maxout activation with 10 pieces.

    Parameters
    ----------
    pieces : int
        Number of linear pieces to use in the activation. Must be given
        as a keyword argument.
    '''

    def __init__(self, *args, **kwargs):
        super(Maxout, self).__init__(*args, **kwargs)
        self.pieces = kwargs['pieces']
        # Slopes are created before intercepts: this fixes both the order
        # of draws from the layer's generator and the order of ``params``.
        for label in ('slope', 'intercept'):
            values = self.layer.rng.randn(
                self.layer.output_size, self.pieces).astype(util.FLOAT)
            variable = theano.shared(values, name=self.layer._fmt(label))
            setattr(self, label, variable)
            self.params.append(variable)

    def __call__(self, x):
        # Append a broadcastable trailing axis to x so that it lines up
        # against the (output_size, pieces) parameters, then keep the
        # largest piece for each unit.
        pattern = list(range(x.ndim))
        pattern.append('x')
        expanded = x.dimshuffle(*pattern)
        return (expanded * self.slope + self.intercept).max(axis=-1)