
# Copyright 2014-2019 The ODL contributors
#
# This file is part of ODL.
#
# This Source Code Form is subject to the terms of the Mozilla Public License,
# v. 2.0. If a copy of the MPL was not distributed with this file, You can
# obtain one at https://mozilla.org/MPL/2.0/.

"""(Quasi-)Newton schemes to find zeros of functionals."""

from __future__ import print_function, division, absolute_import
import numpy as np

from odl.solvers.util import ConstantLineSearch
from odl.solvers.iterative.iterative import conjugate_gradient


__all__ = ('newtons_method', 'bfgs_method', 'broydens_method')


def _bfgs_direction(s, y, x, hessinv_estimate=None):
    r"""Compute ``Hn^-1(x)`` for the L-BFGS method.

    Parameters
    ----------
    s : sequence of `LinearSpaceElement`
        The ``s`` coefficients in the BFGS update, see Notes.
    y : sequence of `LinearSpaceElement`
        The ``y`` coefficients in the BFGS update, see Notes.
    x : `LinearSpaceElement`
        Point in which to evaluate the product.
    hessinv_estimate : `Operator`, optional
        Initial estimate of the inverse Hessian ``H0^-1``.

    Returns
    -------
    r : ``x.space`` element
        The result of ``Hn^-1(x)``.

    Notes
    -----
    :math:`H_n^{-1}` is defined recursively as

    .. math::
        H_{n+1}^{-1} =
        \left(I - \frac{ s_n y_n^T}{y_n^T s_n} \right)
        H_{n}^{-1}
        \left(I - \frac{ y_n s_n^T}{y_n^T s_n} \right) +
        \frac{s_n s_n^T}{y_n^T \, s_n}

    Here, :math:`H_0^{-1}` is given by ``hessinv_estimate``.
    """
    assert len(s) == len(y)

    r = x.copy()
    alphas = np.zeros(len(s))
    rhos = np.zeros(len(s))

    for i in reversed(range(len(s))):
        rhos[i] = 1.0 / y[i].inner(s[i])
        alphas[i] = rhos[i] * (s[i].inner(r))
        r.lincomb(1, r, -alphas[i], y[i])

    if hessinv_estimate is not None:
        r = hessinv_estimate(r)

    for i in range(len(s)):
        beta = rhos[i] * (y[i].inner(r))
        r.lincomb(1, r, alphas[i] - beta, s[i])

    return r
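

# Illustrative sketch (not part of the original module): one way the two-loop
# recursion above might be exercised on a tiny space. ``odl.rn`` is assumed to
# provide the usual ODL API; the numbers are arbitrary.
def _example_bfgs_direction():
    import odl

    space = odl.rn(2)
    # A single stored correction pair (s_0, y_0) and a point to evaluate in
    s = [space.element([1.0, 0.0])]
    y = [space.element([2.0, 0.0])]
    grad = space.element([1.0, 1.0])

    # Returns an approximation of H_1^{-1} applied to ``grad``
    return _bfgs_direction(s, y, grad)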


def _broydens_direction(s, y, x, hessinv_estimate=None, impl='first'):
    r"""Compute ``Hn^-1(x)`` for Broydens method.

    Parameters
    ----------
    s : sequence of `LinearSpaceElement`
        The ``s`` coefficients in the Broyden update, see Notes.
    y : sequence of `LinearSpaceElement`
        The ``y`` coefficients in the Broyden update, see Notes.
    x : `LinearSpaceElement`
        Point in which to evaluate the product.
    hessinv_estimate : `Operator`, optional
        Initial estimate of the inverse Hessian ``H0^-1``.
    impl : {'first', 'second'}, optional
        The type of Broyden's method to use.

    Returns
    -------
    r : ``x.space`` element
        The result of ``Hn^-1(x)``.

    Notes
    -----
    For ``impl = 'first'``, :math:`H_n^{-1}` is defined recursively as

    .. math::
        H_{n+1}^{-1} = \left(I + s_n y_n^T \right) H_{n}^{-1}

    and for ``impl = 'second'``:

    .. math::
        H_{n+1}^{-1} = H_{n}^{-1} + s_n y_n^T

    Here, :math:`H_0^{-1}` is given by ``hessinv_estimate``.
    """
    assert len(s) == len(y)

    if hessinv_estimate is not None:
        r = hessinv_estimate(x)
    else:
        r = x.copy()

    for i in range(len(s)):
        if impl == 'first':
            r.lincomb(1, r, y[i].inner(r), s[i])
        elif impl == 'second':
            r.lincomb(1, r, y[i].inner(x), s[i])
        else:
            raise RuntimeError('unknown `impl`')

    return r
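

# Illustrative sketch (not part of the original module): evaluating the
# Broyden product for both variants on a tiny space. ``odl.rn`` is assumed to
# provide the usual ODL API; the numbers are arbitrary.
def _example_broydens_direction():
    import odl

    space = odl.rn(2)
    s = [space.element([1.0, 0.0])]
    y = [space.element([0.0, 1.0])]
    x = space.element([1.0, 1.0])

    # 'first' applies the updates to the running result, 'second' always
    # uses the original input ``x``; with a single (s, y) pair they agree.
    r_first = _broydens_direction(s, y, x, impl='first')
    r_second = _broydens_direction(s, y, x, impl='second')
    return r_first, r_second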


def newtons_method(f, x, line_search=1.0, maxiter=1000, tol=1e-16,
                   cg_iter=None, callback=None):
    r"""Newton's method for minimizing a functional.

    Notes
    -----
    This is a general and optimized implementation of Newton's method
    for solving the problem

    .. math::
        \min f(x)

    for a differentiable function
    :math:`f: \mathcal{X} \to \mathbb{R}` on a Hilbert space
    :math:`\mathcal{X}`. It does so by finding a zero of the gradient

    .. math::
        \nabla f: \mathcal{X} \to \mathcal{X},

    i.e. by solving a root-finding problem.

    The algorithm is well-known and there is a vast literature about it.
    Among others, the method is described in [BV2004], Sections 9.5 and 10.2
    (`book available online
    <http://stanford.edu/~boyd/cvxbook/bv_cvxbook.pdf>`_),
    [GNS2009], Section 2.7 for solving nonlinear equations and Section 11.3
    for its use in minimization, and the Wikipedia article on
    `Newton's_method <https://en.wikipedia.org/wiki/Newton's_method>`_.

    The algorithm works by iteratively solving

    .. math::
        \nabla^2 f(x_k)\, p_k = -\nabla f(x_k)

    for the search direction :math:`p_k` and then updating as

    .. math::
        x_{k+1} = x_k + \alpha p_k,

    where :math:`\alpha` is a suitable step length (see the references). In
    this implementation the system of equations is solved using the
    conjugate gradient method.

    Parameters
    ----------
    f : `Functional`
        Goal functional. Needs to have ``f.gradient`` and
        ``f.gradient.derivative``.
    x : ``f.domain`` element
        Starting point of the iteration.
    line_search : float or `LineSearch`, optional
        Strategy to choose the step length. If a float is given, uses it as a
        fixed step length.
    maxiter : int, optional
        Maximum number of iterations.
    tol : float, optional
        Tolerance that should be used for terminating the iteration.
    cg_iter : int, optional
        Number of iterations in the conjugate gradient solver, for computing
        the search direction.
    callback : callable, optional
        Object executing code per iteration, e.g. plotting each iterate.

    References
    ----------
    [BV2004] Boyd, S, and Vandenberghe, L. *Convex optimization*.
    Cambridge university press, 2004.

    [GNS2009] Griva, I, Nash, S G, and Sofer, A. *Linear and nonlinear
    optimization*. Siam, 2009.
    """
    # TODO: update doc
    grad = f.gradient
    if x not in grad.domain:
        raise TypeError('`x` {!r} is not in the domain of `f` {!r}'
                        ''.format(x, grad.domain))

    if not callable(line_search):
        line_search = ConstantLineSearch(line_search)

    if cg_iter is None:
        # Motivated by the fact that CG solves Ax = b with x, b in R^n in at
        # most n iterations
        cg_iter = grad.domain.size

    # TODO: optimize by using lincomb and avoiding to create copies
    for _ in range(maxiter):
        # Initialize the search direction to 0
        search_direction = x.space.zero()

        # Compute hessian (as operator) and gradient in the current point
        hessian = grad.derivative(x)
        deriv_in_point = grad(x)

        # Solving A*x = b for x, in this case f''(x)*p = -f'(x)
        # TODO: Let the user provide/choose method for how to solve this?
        try:
            hessian_inverse = hessian.inverse
        except NotImplementedError:
            conjugate_gradient(hessian, search_direction, -deriv_in_point,
                               cg_iter)
        else:
            hessian_inverse(-deriv_in_point, out=search_direction)

        # Computing step length
        dir_deriv = search_direction.inner(deriv_in_point)
        if np.abs(dir_deriv) <= tol:
            return
        step_length = line_search(x, search_direction, dir_deriv)

        # Updating
        x += step_length * search_direction

        if callback is not None:
            callback(x)
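

# Illustrative sketch (not part of the original module): minimizing the simple
# quadratic f(x) = ||x||_2^2 with ``newtons_method``. ``odl.rn`` and
# ``odl.solvers.L2NormSquared`` are assumed to provide the usual ODL API.
def _example_newtons_method():
    import odl

    space = odl.rn(3)
    f = odl.solvers.L2NormSquared(space)  # gradient 2*x, Hessian 2*I
    x = space.element([1.0, 2.0, 3.0])

    # A single exact Newton step solves a quadratic problem; a few iterations
    # are allowed to be on the safe side.
    newtons_method(f, x, line_search=1.0, maxiter=5)

    # ``x`` is updated in place and should now be close to ``space.zero()``
    return x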


def bfgs_method(f, x, line_search=1.0, maxiter=1000, tol=1e-15,
                num_store=None, hessinv_estimate=None, callback=None):
    r"""Quasi-Newton BFGS method to minimize a differentiable function.

    Can use either the regular BFGS method or the limited memory BFGS method.

    Notes
    -----
    This is a general and optimized implementation of a quasi-Newton
    method with BFGS update for solving a general unconstrained
    optimization problem

    .. math::
        \min f(x)

    for a differentiable function
    :math:`f: \mathcal{X} \to \mathbb{R}` on a Hilbert space
    :math:`\mathcal{X}`. It does so by finding a zero of the gradient

    .. math::
        \nabla f: \mathcal{X} \to \mathcal{X}.

    The QN method is an approximate Newton method, where the Hessian
    is approximated and gradually updated in each step. This
    implementation uses the rank-two BFGS update schema where the
    inverse of the Hessian is recalculated in each iteration.

    The algorithm is described in [GNS2009], Section 12.3 and in the
    `BFGS Wikipedia article
    <https://en.wikipedia.org/wiki/Broyden%E2%80%93Fletcher%E2%80%93Goldfarb%E2%80%93Shanno_algorithm>`_.

    Parameters
    ----------
    f : `Functional`
        Functional with ``f.gradient``.
    x : ``f.domain`` element
        Starting point of the iteration.
    line_search : float or `LineSearch`, optional
        Strategy to choose the step length. If a float is given, uses it as a
        fixed step length.
    maxiter : int, optional
        Maximum number of iterations.
    tol : float, optional
        Tolerance that should be used for terminating the iteration.
    num_store : int, optional
        Maximum number of correction factors to store. For ``None``, the
        method is the regular BFGS method. For an integer, the method
        becomes the Limited Memory BFGS method.
    hessinv_estimate : `Operator`, optional
        Initial estimate of the inverse of the Hessian operator. Needs to
        be an operator from ``f.domain`` to ``f.domain``.
        Default: Identity on ``f.domain``
    callback : callable, optional
        Object executing code per iteration, e.g. plotting each iterate.

    References
    ----------
    [GNS2009] Griva, I, Nash, S G, and Sofer, A. *Linear and nonlinear
    optimization*. Siam, 2009.
    """
    grad = f.gradient
    if x not in grad.domain:
        raise TypeError('`x` {!r} is not in the domain of `grad` {!r}'
                        ''.format(x, grad.domain))

    if not callable(line_search):
        line_search = ConstantLineSearch(line_search)

    ys = []
    ss = []

    grad_x = grad(x)
    for i in range(maxiter):
        # Determine a stepsize using line search
        search_dir = -_bfgs_direction(ss, ys, grad_x, hessinv_estimate)
        dir_deriv = search_dir.inner(grad_x)
        if np.abs(dir_deriv) == 0:
            return  # we found an optimum
        step = line_search(x, direction=search_dir, dir_derivative=dir_deriv)

        # Update x
        x_update = search_dir
        x_update *= step
        x += x_update

        grad_x, grad_diff = grad(x), grad_x
        # grad_diff = grad(x) - grad(x_old)
        grad_diff.lincomb(-1, grad_diff, 1, grad_x)

        y_inner_s = grad_diff.inner(x_update)

        # Test for convergence
        if np.abs(y_inner_s) < tol:
            if grad_x.norm() < tol:
                return
            else:
                # Reset if needed
                ys = []
                ss = []
                continue

        # Update Hessian
        ys.append(grad_diff)
        ss.append(x_update)
        if num_store is not None:
            # Throw away factors if they are too many.
            ss = ss[-num_store:]
            ys = ys[-num_store:]

        if callback is not None:
            callback(x)
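

# Illustrative sketch (not part of the original module): limited-memory BFGS
# on the same simple quadratic, with a backtracking line search.
# ``odl.solvers.BacktrackingLineSearch`` and ``odl.solvers.L2NormSquared`` are
# assumed to provide the usual ODL API.
def _example_bfgs_method():
    import odl

    space = odl.rn(3)
    f = odl.solvers.L2NormSquared(space)
    x = space.element([1.0, 2.0, 3.0])

    line_search = odl.solvers.BacktrackingLineSearch(f)
    bfgs_method(f, x, line_search=line_search, maxiter=50, num_store=5)

    # ``x`` is updated in place; for this quadratic it should be close to 0
    return x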


def broydens_method(f, x, line_search=1.0, impl='first', maxiter=1000,
                    tol=1e-15, hessinv_estimate=None, callback=None):
    r"""Broyden's method, a quasi-Newton scheme.

    Notes
    -----
    This is a general and optimized implementation of Broyden's method,
    a quasi-Newton method for solving a general unconstrained optimization
    problem

    .. math::
        \min f(x)

    for a differentiable function
    :math:`f: \mathcal{X} \to \mathbb{R}` on a Hilbert space
    :math:`\mathcal{X}`. It does so by finding a zero of the gradient

    .. math::
        \nabla f: \mathcal{X} \to \mathcal{X}

    using a Newton-type update scheme with approximate Hessian.

    The algorithm is described in [Bro1965] and [Kva1991], and in a
    `Wikipedia article <https://en.wikipedia.org/wiki/Broyden's_method>`_.

    Parameters
    ----------
    f : `Functional`
        Functional with ``f.gradient``.
    x : ``f.domain`` element
        Starting point of the iteration.
    line_search : float or `LineSearch`, optional
        Strategy to choose the step length. If a float is given, uses it as a
        fixed step length.
    impl : {'first', 'second'}, optional
        What version of Broyden's method to use. The first is also known as
        Broyden's 'good' method, while the second is known as Broyden's
        'bad' method.
    maxiter : int, optional
        Maximum number of iterations.
    tol : float, optional
        Tolerance that should be used for terminating the iteration.
    hessinv_estimate : `Operator`, optional
        Initial estimate of the inverse of the Hessian operator. Needs to
        be an operator from ``f.domain`` to ``f.domain``.
        Default: Identity on ``f.domain``
    callback : callable, optional
        Object executing code per iteration, e.g. plotting each iterate.

    References
    ----------
    [Bro1965] Broyden, C G. *A class of methods for solving nonlinear
    simultaneous equations*. Mathematics of Computation, 19 (1965),
    pp 577--593.

    [Kva1991] Kvaalen, E. *A faster Broyden method*. BIT Numerical
    Mathematics 31 (1991), pp 369--372.
    """
    grad = f.gradient
    if x not in grad.domain:
        raise TypeError('`x` {!r} is not in the domain of `grad` {!r}'
                        ''.format(x, grad.domain))

    if not callable(line_search):
        line_search = ConstantLineSearch(line_search)

    impl, impl_in = str(impl).lower(), impl
    if impl not in ('first', 'second'):
        raise ValueError('`impl` {!r} not understood'
                         ''.format(impl_in))

    ss = []
    ys = []

    grad_x = grad(x)
    for i in range(maxiter):
        # Find step size
        search_dir = -_broydens_direction(ss, ys, grad_x, hessinv_estimate,
                                          impl)
        dir_deriv = search_dir.inner(grad_x)
        if np.abs(dir_deriv) == 0:
            return  # we found an optimum
        step = line_search(x, search_dir, dir_deriv)

        # Update x
        x_update = step * search_dir
        x += x_update

        # Compute new gradient
        grad_x, grad_x_old = grad(x), grad_x
        delta_grad = grad_x - grad_x_old

        # Update the Hessian approximation
        # TODO: reuse from above
        v = _broydens_direction(ss, ys, delta_grad, hessinv_estimate, impl)
        if impl == 'first':
            divisor = x_update.inner(v)

            # Test for convergence
            if np.abs(divisor) < tol:
                if grad_x.norm() < tol:
                    return
                else:
                    # Reset if needed
                    ys = []
                    ss = []
                    continue

            u = (x_update - v) / divisor
            ss.append(u)
            ys.append(x_update)
        elif impl == 'second':
            divisor = delta_grad.inner(delta_grad)

            # Test for convergence
            if np.abs(divisor) < tol:
                if grad_x.norm() < tol:
                    return
                else:
                    # Reset if needed
                    ys = []
                    ss = []
                    continue

            u = (x_update - v) / divisor
            ss.append(u)
            ys.append(delta_grad)

        if callback is not None:
            callback(x)
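

# Illustrative sketch (not part of the original module): Broyden's 'good'
# method on the same simple quadratic, using a fixed step length. ``odl.rn``
# and ``odl.solvers.L2NormSquared`` are assumed to provide the usual ODL API.
def _example_broydens_method():
    import odl

    space = odl.rn(3)
    f = odl.solvers.L2NormSquared(space)
    x = space.element([1.0, 2.0, 3.0])

    # With H_0^{-1} = I the first search direction is -2*x, so a fixed step
    # of 0.5 already reaches the minimizer of this particular quadratic.
    broydens_method(f, x, line_search=0.5, impl='first', maxiter=100)

    # ``x`` is updated in place; for this quadratic it should be close to 0
    return x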


if __name__ == '__main__':
    from odl.util.testutils import run_doctests
    run_doctests()