# Copyright 2014-2019 The ODL contributors
#
# This file is part of ODL.
#
# This Source Code Form is subject to the terms of the Mozilla Public License,
# v. 2.0. If a copy of the MPL was not distributed with this file, You can
# obtain one at https://mozilla.org/MPL/2.0/.
"""(Quasi-)Newton schemes to find zeros of functionals."""
from __future__ import print_function, division, absolute_import
import numpy as np
from odl.solvers.util import ConstantLineSearch
from odl.solvers.iterative.iterative import conjugate_gradient
__all__ = ('newtons_method', 'bfgs_method', 'broydens_method')
def _bfgs_direction(s, y, x, hessinv_estimate=None):
r"""Compute ``Hn^-1(x)`` for the L-BFGS method.
Parameters
----------
s : sequence of `LinearSpaceElement`
The ``s`` coefficients in the BFGS update, see Notes.
y : sequence of `LinearSpaceElement`
The ``y`` coefficients in the BFGS update, see Notes.
x : `LinearSpaceElement`
Point in which to evaluate the product.
hessinv_estimate : `Operator`, optional
        Initial estimate of the inverse Hessian ``H0^-1``.
Returns
-------
r : ``x.space`` element
The result of ``Hn^-1(x)``.
Notes
-----
:math:`H_n^{-1}` is defined recursively as
.. math::
H_{n+1}^{-1} =
\left(I - \frac{ s_n y_n^T}{y_n^T s_n} \right)
H_{n}^{-1}
\left(I - \frac{ y_n s_n^T}{y_n^T s_n} \right) +
\frac{s_n s_n^T}{y_n^T \, s_n}
    with :math:`H_0^{-1}` given by ``hessinv_estimate``.
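
    Examples
    --------
    A minimal sketch (assuming ``odl.rn`` for the example space): with no
    stored correction pairs, the two-loop recursion reduces to applying
    ``hessinv_estimate`` (here the default identity) to ``x``:

    >>> import odl
    >>> space = odl.rn(3)
    >>> x = space.element([1, 2, 3])
    >>> r = _bfgs_direction([], [], x)
    >>> (r - x).norm() == 0
    True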
"""
assert len(s) == len(y)
r = x.copy()
alphas = np.zeros(len(s))
rhos = np.zeros(len(s))
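    # Backward pass of the two-loop recursion: compute and store alpha_i,
    # then subtract alpha_i * y_i from r, starting from the newest pair.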
for i in reversed(range(len(s))):
rhos[i] = 1.0 / y[i].inner(s[i])
alphas[i] = rhos[i] * (s[i].inner(r))
r.lincomb(1, r, -alphas[i], y[i])
if hessinv_estimate is not None:
r = hessinv_estimate(r)
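    # Forward pass: add (alpha_i - beta_i) * s_i to r, oldest pair first.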
for i in range(len(s)):
beta = rhos[i] * (y[i].inner(r))
r.lincomb(1, r, alphas[i] - beta, s[i])
return r
def _broydens_direction(s, y, x, hessinv_estimate=None, impl='first'):
r"""Compute ``Hn^-1(x)`` for Broydens method.
Parameters
----------
    s : sequence of `LinearSpaceElement`
        The ``s`` coefficients in the Broyden update, see Notes.
    y : sequence of `LinearSpaceElement`
        The ``y`` coefficients in the Broyden update, see Notes.
x : `LinearSpaceElement`
Point in which to evaluate the product.
hessinv_estimate : `Operator`, optional
        Initial estimate of the inverse Hessian ``H0^-1``.
impl : {'first', 'second'}, optional
        The type of Broyden's method to use.
Returns
-------
r : ``x.space`` element
The result of ``Hn^-1(x)``.
Notes
-----
For ``impl = 'first'``, :math:`H_n^{-1}` is defined recursively as
.. math::
H_{n+1}^{-1} = \left(I + s_n y_n^T \right) H_{n}^{-1}
and for ``impl = 'second'``:
.. math::
H_{n+1}^{-1} = H_{n}^{-1} + s_n y_n^T
    with :math:`H_0^{-1}` given by ``hessinv_estimate``.
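
    Examples
    --------
    A minimal sketch (assuming ``odl.rn`` for the example space): with a
    single stored pair and ``impl='first'``, the result is
    ``(I + s0 y0^T) x``:

    >>> import odl
    >>> space = odl.rn(2)
    >>> s0 = space.element([1, 0])
    >>> y0 = space.element([0, 1])
    >>> x = space.element([2, 3])
    >>> r = _broydens_direction([s0], [y0], x)
    >>> (r - space.element([5, 3])).norm() == 0
    True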
"""
assert len(s) == len(y)
if hessinv_estimate is not None:
r = hessinv_estimate(x)
else:
r = x.copy()
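    # Successively apply the stored rank-one updates, oldest first.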
for i in range(len(s)):
if impl == 'first':
r.lincomb(1, r, y[i].inner(r), s[i])
elif impl == 'second':
r.lincomb(1, r, y[i].inner(x), s[i])
else:
raise RuntimeError('unknown `impl`')
return r
def newtons_method(f, x, line_search=1.0, maxiter=1000, tol=1e-16,
cg_iter=None, callback=None):
r"""Newton's method for minimizing a functional.
Notes
-----
This is a general and optimized implementation of Newton's method
for solving the problem:
.. math::
\min f(x)
for a differentiable function
:math:`f: \mathcal{X}\to \mathbb{R}` on a Hilbert space
:math:`\mathcal{X}`. It does so by finding a zero of the gradient
.. math::
\nabla f: \mathcal{X} \to \mathcal{X}.
    This amounts to applying Newton's root-finding method to the gradient.
The algorithm is well-known and there is a vast literature about it.
Among others, the method is described in [BV2004], Sections 9.5
and 10.2 (`book available online
<http://stanford.edu/~boyd/cvxbook/bv_cvxbook.pdf>`_),
[GNS2009], Section 2.7 for solving nonlinear equations and Section
    11.3 for its use in minimization, and the Wikipedia article on
    `Newton's method <https://en.wikipedia.org/wiki/Newton's_method>`_.
    The algorithm works by iteratively solving

    .. math::
        \nabla^2 f(x_k) p_k = -\nabla f(x_k)

    for the search direction :math:`p_k` and then updating as

    .. math::
        x_{k+1} = x_k + \alpha p_k,

    where :math:`\alpha` is a suitable step length (see the
    references). In this implementation, the system of equations is
    solved using the conjugate gradient method.
Parameters
----------
f : `Functional`
Goal functional. Needs to have ``f.gradient`` and
``f.gradient.derivative``.
    x : ``f.domain`` element
        Starting point of the iteration.
line_search : float or `LineSearch`, optional
Strategy to choose the step length. If a float is given, uses it as a
fixed step length.
maxiter : int, optional
Maximum number of iterations.
tol : float, optional
Tolerance that should be used for terminating the iteration.
    cg_iter : int, optional
        Number of iterations in the conjugate gradient solver used to
        compute the search direction.
callback : callable, optional
        Object executing code per iteration, e.g. plotting each iterate.
References
----------
[BV2004] Boyd, S, and Vandenberghe, L. *Convex optimization*.
Cambridge university press, 2004.
[GNS2009] Griva, I, Nash, S G, and Sofer, A. *Linear and nonlinear
optimization*. Siam, 2009.
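
    Examples
    --------
    A minimal usage sketch on a simple quadratic problem (assuming
    ``odl.rn`` and ``odl.solvers.L2NormSquared``); for a quadratic
    functional, a single Newton step reaches the minimum:

    >>> import odl
    >>> space = odl.rn(2)
    >>> f = odl.solvers.L2NormSquared(space)
    >>> x = space.element([1, 1])
    >>> newtons_method(f, x, maxiter=5)
    >>> x.norm() < 1e-6
    True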
"""
# TODO: update doc
grad = f.gradient
if x not in grad.domain:
raise TypeError('`x` {!r} is not in the domain of `f` {!r}'
''.format(x, grad.domain))
if not callable(line_search):
line_search = ConstantLineSearch(line_search)
if cg_iter is None:
        # Motivated by the fact that for A x = b with x and b in R^n,
        # CG needs at most n iterations to solve the system exactly.
cg_iter = grad.domain.size
# TODO: optimize by using lincomb and avoiding to create copies
for _ in range(maxiter):
# Initialize the search direction to 0
search_direction = x.space.zero()
# Compute hessian (as operator) and gradient in the current point
hessian = grad.derivative(x)
deriv_in_point = grad(x)
# Solving A*x = b for x, in this case f''(x)*p = -f'(x)
# TODO: Let the user provide/choose method for how to solve this?
try:
hessian_inverse = hessian.inverse
except NotImplementedError:
conjugate_gradient(hessian, search_direction,
-deriv_in_point, cg_iter)
else:
hessian_inverse(-deriv_in_point, out=search_direction)
# Computing step length
dir_deriv = search_direction.inner(deriv_in_point)
if np.abs(dir_deriv) <= tol:
return
step_length = line_search(x, search_direction, dir_deriv)
# Updating
x += step_length * search_direction
if callback is not None:
callback(x)
def bfgs_method(f, x, line_search=1.0, maxiter=1000, tol=1e-15, num_store=None,
hessinv_estimate=None, callback=None):
r"""Quasi-Newton BFGS method to minimize a differentiable function.
    Can use either the regular BFGS method or the limited-memory BFGS method.
Notes
-----
This is a general and optimized implementation of a quasi-Newton
method with BFGS update for solving a general unconstrained
optimization problem
.. math::
\min f(x)
for a differentiable function
:math:`f: \mathcal{X}\to \mathbb{R}` on a Hilbert space
:math:`\mathcal{X}`. It does so by finding a zero of the gradient
.. math::
\nabla f: \mathcal{X} \to \mathcal{X}.
    The QN method is an approximate Newton method, where the Hessian
    is approximated and gradually updated in each step. This
    implementation uses the rank-two BFGS update scheme, where the
    inverse of the Hessian is recalculated in each iteration.
The algorithm is described in [GNS2009], Section 12.3 and in the
`BFGS Wikipedia article
<https://en.wikipedia.org/wiki/Broyden%E2%80%93Fletcher%E2%80%93\
Goldfarb%E2%80%93Shanno_algorithm>`_
Parameters
----------
f : `Functional`
Functional with ``f.gradient``.
x : ``f.domain`` element
        Starting point of the iteration.
line_search : float or `LineSearch`, optional
Strategy to choose the step length. If a float is given, uses it as a
fixed step length.
maxiter : int, optional
Maximum number of iterations.
tol : float, optional
Tolerance that should be used for terminating the iteration.
num_store : int, optional
Maximum number of correction factors to store. For ``None``, the method
is the regular BFGS method. For an integer, the method becomes the
Limited Memory BFGS method.
hessinv_estimate : `Operator`, optional
Initial estimate of the inverse of the Hessian operator. Needs to be an
operator from ``f.domain`` to ``f.domain``.
Default: Identity on ``f.domain``
callback : callable, optional
Object executing code per iteration, e.g. plotting each iterate.
References
----------
[GNS2009] Griva, I, Nash, S G, and Sofer, A. *Linear and nonlinear
optimization*. Siam, 2009.
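
    Examples
    --------
    A minimal usage sketch on a simple quadratic problem (assuming
    ``odl.rn`` and ``odl.solvers.L2NormSquared``):

    >>> import odl
    >>> space = odl.rn(2)
    >>> f = odl.solvers.L2NormSquared(space)
    >>> x = space.element([1, 1])
    >>> bfgs_method(f, x, maxiter=10)
    >>> x.norm() < 1e-6
    True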
"""
grad = f.gradient
if x not in grad.domain:
raise TypeError('`x` {!r} is not in the domain of `grad` {!r}'
''.format(x, grad.domain))
if not callable(line_search):
line_search = ConstantLineSearch(line_search)
ys = []
ss = []
grad_x = grad(x)
for i in range(maxiter):
# Determine a stepsize using line search
search_dir = -_bfgs_direction(ss, ys, grad_x, hessinv_estimate)
dir_deriv = search_dir.inner(grad_x)
if np.abs(dir_deriv) == 0:
return # we found an optimum
step = line_search(x, direction=search_dir, dir_derivative=dir_deriv)
# Update x
x_update = search_dir
x_update *= step
x += x_update
grad_x, grad_diff = grad(x), grad_x
# grad_diff = grad(x) - grad(x_old)
grad_diff.lincomb(-1, grad_diff, 1, grad_x)
y_inner_s = grad_diff.inner(x_update)
# Test for convergence
if np.abs(y_inner_s) < tol:
if grad_x.norm() < tol:
return
else:
# Reset if needed
ys = []
ss = []
continue
# Update Hessian
ys.append(grad_diff)
ss.append(x_update)
if num_store is not None:
# Throw away factors if they are too many.
ss = ss[-num_store:]
ys = ys[-num_store:]
if callback is not None:
callback(x)
def broydens_method(f, x, line_search=1.0, impl='first', maxiter=1000,
tol=1e-15, hessinv_estimate=None,
callback=None):
r"""Broyden's first method, a quasi-Newton scheme.
Notes
-----
This is a general and optimized implementation of Broyden's method,
a quasi-Newton method for solving a general unconstrained optimization
problem
.. math::
\min f(x)
for a differentiable function
:math:`f: \mathcal{X}\to \mathbb{R}` on a Hilbert space
:math:`\mathcal{X}`. It does so by finding a zero of the gradient
.. math::
\nabla f: \mathcal{X} \to \mathcal{X}
using a Newton-type update scheme with approximate Hessian.
The algorithm is described in [Bro1965] and [Kva1991], and in a
`Wikipedia article
<https://en.wikipedia.org/wiki/Broyden's_method>`_.
Parameters
----------
f : `Functional`
        Functional with ``f.gradient``.
x : ``f.domain`` element
        Starting point of the iteration.
line_search : float or `LineSearch`, optional
Strategy to choose the step length. If a float is given, uses it as a
fixed step length.
impl : {'first', 'second'}, optional
        Which version of Broyden's method to use. The first is also known
        as Broyden's 'good' method, while the second is known as
        Broyden's 'bad' method.
maxiter : int, optional
Maximum number of iterations.
tol : float, optional
Tolerance that should be used for terminating the iteration.
hessinv_estimate : `Operator`, optional
Initial estimate of the inverse of the Hessian operator. Needs to be an
operator from ``f.domain`` to ``f.domain``.
Default: Identity on ``f.domain``
callback : callable, optional
Object executing code per iteration, e.g. plotting each iterate.
References
----------
[Bro1965] Broyden, C G. *A class of methods for solving nonlinear
    simultaneous equations*. Mathematics of Computation, 19 (1965),
    pp 577--593.
[Kva1991] Kvaalen, E. *A faster Broyden method*. BIT Numerical
Mathematics 31 (1991), pp 369--372.
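
    Examples
    --------
    A minimal usage sketch on a simple quadratic problem (assuming
    ``odl.rn`` and ``odl.solvers.L2NormSquared``):

    >>> import odl
    >>> space = odl.rn(2)
    >>> f = odl.solvers.L2NormSquared(space)
    >>> x = space.element([1, 1])
    >>> broydens_method(f, x, maxiter=10)
    >>> x.norm() < 1e-6
    True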
"""
grad = f.gradient
if x not in grad.domain:
raise TypeError('`x` {!r} is not in the domain of `grad` {!r}'
''.format(x, grad.domain))
if not callable(line_search):
line_search = ConstantLineSearch(line_search)
impl, impl_in = str(impl).lower(), impl
if impl not in ('first', 'second'):
raise ValueError('`impl` {!r} not understood'
''.format(impl_in))
ss = []
ys = []
grad_x = grad(x)
for i in range(maxiter):
# find step size
search_dir = -_broydens_direction(ss, ys, grad_x,
hessinv_estimate, impl)
dir_deriv = search_dir.inner(grad_x)
if np.abs(dir_deriv) == 0:
return # we found an optimum
step = line_search(x, search_dir, dir_deriv)
# update x
x_update = step * search_dir
x += x_update
# compute new gradient
grad_x, grad_x_old = grad(x), grad_x
delta_grad = grad_x - grad_x_old
# update hessian.
# TODO: reuse from above
v = _broydens_direction(ss, ys, delta_grad, hessinv_estimate,
impl)
if impl == 'first':
divisor = x_update.inner(v)
# Test for convergence
if np.abs(divisor) < tol:
if grad_x.norm() < tol:
return
else:
# Reset if needed
ys = []
ss = []
continue
u = (x_update - v) / divisor
ss.append(u)
ys.append(x_update)
elif impl == 'second':
divisor = delta_grad.inner(delta_grad)
# Test for convergence
if np.abs(divisor) < tol:
if grad_x.norm() < tol:
return
else:
# Reset if needed
ys = []
ss = []
continue
u = (x_update - v) / divisor
ss.append(u)
ys.append(delta_grad)
if callback is not None:
callback(x)
if __name__ == '__main__':
from odl.util.testutils import run_doctests
run_doctests()