\documentclass[reqno]{amsart}
\usepackage{hyperref}
\AtBeginDocument{{\noindent\small \emph{
Electronic Journal of Differential Equations},
Vol. 2007(2007), No. 124, pp. 1--13.\newline
ISSN: 1072-6691. URL: http://ejde.math.txstate.edu
or http://ejde.math.unt.edu
\newline ftp ejde.math.txstate.edu (login: ftp)}
\thanks{\copyright 2007 Texas State University - San Marcos.}
\vspace{9mm}}
\begin{document}
\title[\hfilneg EJDE-2007/124\hfil Newton's method]
{Newton's method in the context of gradients}
\author[J. Kar\'atson, J. W. Neuberger \hfil EJDE-2007/124\hfilneg]
{J. Kar\'atson, J. W. Neuberger} % in alphabetical order
\address{J\'anos Kar\'atson \newline
Dept. of Applied Analysis, ELTE Univ.,
Budapest, H-1518 Pf. 120, Hungary}
\email{karatson@cs.elte.hu}
\address{John W. Neuberger \newline
Dept. of Mathematics, Univ. of North Texas,
Denton, TX 70203-1430, USA}
\email{jwn@unt.edu}
\thanks{Submitted August 8, 2005. Published September 24, 2007.}
\subjclass[2000]{65J15}
\keywords{Newton's method; Sobolev; gradients}
\begin{abstract}
This paper gives a common theoretical treatment for gradient and
Newton type methods for general classes of problems. First, for
Euler-Lagrange equations Newton's method is characterized as an
(asymptotically) optimal variable steepest descent method.
Second, Sobolev gradient type minimization is developed for
general problems using a continuous Newton method which takes
into account a `boundary condition' operator.
\end{abstract}
\maketitle
\numberwithin{equation}{section}
\newtheorem{theorem}{Theorem}[section]
\newtheorem{remark}[theorem]{Remark}
\section{Introduction}
\label{secint}
Gradient and Newton type methods are among the most important
approaches for the solution of nonlinear equations, both in
$\mathbb{R}^n$ and in abstract spaces. The latter are often
connected to PDE applications, and here the involvement of Sobolev
spaces has proved an efficient strategy, see e.g. \cite{sv, Neu6}
on the Sobolev gradient approach and \cite{Ax82, GGZ} on Newton
type methods. Further applications of Sobolev space iterations are
found in \cite{book}.
The two types of methods (gradient and Newton) are generally
considered as two different approaches, although their connection
has been studied in some papers, see e.g. \cite{CN} in the context
of continuous steepest-descent, \cite{varprec} on variable
preconditioning and quasi-Newton methods, and \cite[Chapter 7]{sv}
on Newton's method and constrained optimization.
The goal of this paper is to establish a common theoretical
framework in which gradient and Newton type methods can be
treated, and thereby to clarify the relation of the two types of
methods for general classes of problems.
Note that there are two distinct ways systems of differential
equations may be placed into an optimization setting. Sometimes it
is possible to show that a given system of PDEs are Euler-Lagrange
equations for some functional $\phi$. In the more general case
one looks for the critical points of a least-squares functional
associated with the given system. Furthermore, one can approach
Newton type methods also in two different ways: from numerical
aspect it is the study of the discrete (i.e. iterative) solution
method that is mostly relevant, whereas continuous Newton methods
can lead to attractive theoretical results.
The first part of this paper characterizes Newton's method in the
Euler-Lagrange case as an (asymptotically) optimal variable
steepest descent method for the iterative minimization of the
corresponding functional. The second part treats the more general
(either Euler-Lagrange or least squares) case and develops Sobolev
gradient type minimization using a continuous Newton method which
takes into account a `boundary condition' operator.
\section{Unconstrained optimization: Newton's method as a variable
steepest descent}
\label{secvar}
Let $H$ be a real Hilbert space and $F:H\to H$ an operator
which has a potential $\phi:H\to\mathbb{R}$; i.e.,
\begin{equation} \label{phigrfh}
\phi'(u)h = \langle F(u), h\rangle \quad (u,h\in H)
\end{equation}
in Gateaux sense. We consider the operator equation
\begin{equation}
\label{opeq} F(u)\, = 0 \end{equation} and study the
relationship between steepest descent and Newton's method.
We will observe that Newton's method can be regarded as a special
variable steepest descent iteration, where the latter means that
the gradients of $\phi$ are taken with respect to stepwise
redefined inner products. Then our main result states the
following principle: whereas the descents in the ordinary gradient
method are steepest with respect to different directions, in
Newton's method they are steepest with respect to both different
directions and inner products. This optimality is understood in a
(second order) asymptotic sense in the neighbourhood of the
solution.
\subsection{Fixed and variable steepest descent iterations}
% \label{secvar}
A steepest descent iteration corresponding to the gradient
$\phi'$ in (\ref{phigrfh}) is
\begin{equation}
\label{sdsqt} u_{n+1}= u_n - \alpha_n F(u_n)
\end{equation}
with some constants $\alpha_n>0$.
Our aim is to modify this sequence by
varying the inner product of the space $H$.
\subsubsection{Steepest descent under a fixed inner product}
First we modify the sequence (\ref{sdsqt}) by introducing another
fixed inner product. For this purpose let $B:H\to H$ be a bounded
self-adjoint linear operator which is {\it strongly positive}
(i.e. it has a positive lower bound $p>0$), and let
$$
\langle u,v \rangle _B\equiv \langle Bu,v \rangle \quad (u,v\in
H).
$$
Denote by $ \nabla_B \phi $ the gradient of $\phi$ with respect
to the energy inner product $\langle \cdot , \cdot \rangle_B$. Then
$$
\langle \nabla_B \phi (u),v \rangle_{B} = \frac{\partial \phi
}{\partial v}(u) = \phi'(u)v = \langle F(u),v \rangle =
\langle B^{-1} F(u),v \rangle_{B} \qquad (u,v\in H),
$$
which implies
\begin{equation}
\label{phigrsreg}
\nabla_B \phi (u) = B^{-1} F(u) \quad (u\in H).
\end{equation}
That is, the change of the inner product yields the change of the
gradient of $\phi$, namely, the modified gradient is expressed
as the preconditioned version of the original one. Consequently, a
steepest descent iteration corresponding to the gradient
$\phi_B'$ is the preconditioned sequence
\begin{equation}
\label{prsqf} u_{n+1}= u_n - \alpha_n B^{-1} F(u_n)
\end{equation}
with some constants $\alpha_n>0$.
Convergence results for such sequences are well-known if $\phi$ is
strongly convex, which can be formulated in terms of the operator
$F$ (see e.g. the monographs \cite{book, GGZ}). For instance, if
the spectral bounds of the operators $F'(u)$ are between uniform
constants $M\ge m>0$ (in the original resp. the energy inner
product), then the constant stepsize $\alpha_n \equiv 2/(M+m)$
yields convergence with ratio
$$
q=\frac{M-m}{M+m}
$$
for the sequences (\ref{sdsqt}) and (\ref{prsqf}), resp. Clearly,
the aim of the change of the inner product is to achieve better
spectral bounds in the new inner product. For instance, for PDEs a
sometimes dramatic improvement can be achieved by using the
Sobolev inner product instead of the original $L^2$ one (see the
monograph \cite{sv} on Sobolev gradients).
\subsubsection{Steepest descent under a variable inner product}
Assume that the $n$th term of an iterative sequence is
constructed and let $B_n:H\to H$ be a strongly positive bounded
self-adjoint linear operator. It follows similarly to
(\ref{phigrsreg}) that the gradient of $\phi$ with respect to
the inner product $\langle . , . \rangle _{B_n} $ is
\begin{equation}
\label{phigrbnf} \nabla_{B_n} \phi = B_n^{-1} F(u) \quad (u\in H).
\end{equation}
The relation (\ref{phigrbnf}) means that a one-step iterative sequence
\begin{equation}
\label{varsqgr}
u_{n+1}= u_n -\alpha_n \, B_n^{-1} F(u_n)
\end{equation}
(with some constants $\alpha_n>0$) is a variable steepest
descent iteration corresponding to $\phi$ such that in the
$n$th step the gradient of $\phi$ is taken with respect to the
inner product $\langle . , . \rangle _{B_n} $.
Several such types of iterative method are known including
variable metric methods (see e.g. the monograph \cite{R2}). In
this context `variable' is understood as depending on the step
$n$. We note that Sobolev gradients under variable inner product
can also be defined in the context of continuous steepest descent,
and the inner product may depend continuously on each element of
the Sobolev space (see \cite{Neu3, Neu6}).
Convergence results for sequences of the form
(\ref{varsqgr}) are given in \cite{finnof, varprec},
formulated again for convex functionals in terms of spectral
bounds. Namely, under the stepwise spectral equivalence relation
\begin{equation}
\label{ekvgrpr}
m_n \langle B_n h,h \rangle \le \langle F'(u_n)h,h \rangle \le M_n
\langle B_n h,h \rangle \quad (n\in\mathbb{N}, \, h\in H)
\end{equation}
(with some constants $M_n\ge m_n>0$) and assuming the Lipschitz
continuity of $F'$, one can achieve convergence with ratio
$$
q=\limsup \frac{M_n-m_n}{M_n+m_n}\, .
$$
(This convergence is global if $\alpha_n$ includes damping.) In
particular, superlinear convergence can also be obtained when
$q=0$, and its rate is characterized by the speed as $M_n/ m_n\to
1$.
Clearly, the variable steepest descent iteration (\ref{varsqgr})
can also be regarded as a quasi-Newton method, since the relation
(\ref{ekvgrpr}) provides the operators $B_n$ as approximations of
$F'(u_n)$. Moreover,
the choice $B_n=F'(u_n)$ yields optimal spectral bounds
$m_n=M_n=1$ in (\ref{ekvgrpr}), and the corresponding variable
steepest descent iteration (\ref{varsqgr}) becomes Newton's method
with quadratic convergence speed.
\subsubsection{Conclusion}
Altogether, we may observe the following relationship between steepest
descent and Newton methods. A usual steepest descent method
defines an optimal descent direction under a fixed inner product,
but the search for an optimal descent may also include the
stepwise change of inner product. If these inner products are
looked for among energy inner products $\langle . , .
\rangle_{B_n}$ corresponding to (\ref{ekvgrpr}), then a resulting
variable steepest descent iteration coincides with a
quasi-Newton method. Under the special choice $B_n=F'(u_n)$ we
obtain Newton's method itself in this way, and the convergence
results suggest that the optimal convergence is obtained with this
choice. Roughly speaking, this means the following principle:
whereas the descents in the gradient method are steepest with
respect to different directions, in Newton's method they are
steepest with respect to both different directions and inner
products.
However, the above principle is not proved by the quoted
convergence results themselves. Namely, in their proof in
\cite{varprec} they {\it a priori} compare the rate of
quasi-Newton method to the exact Newton's method, hence the
obtained convergence estimates are obviously not better than those
for the exact Newton's method. Therefore our goal in the next
section is to verify the above stated principle in a proper sense.
\subsection{Newton's method as an optimal variable steepest descent}
% \label{secopt}
We consider the operator equation (\ref{opeq}) and the
corresponding potential $\phi:H\to\mathbb{R}$. In this subsection
we assume that $\phi$ is uniformly convex and $\phi''$ is locally
Lipschitz continuous. More exactly, formulated in terms of the
operator $F$ in (\ref{phigrfh}), we impose the following
conditions:
\begin{enumerate}
\item[(i)] $F$ is Gateaux differentiable;
\item[(ii)] for every $R>0$ there exist constants
$P\ge p>0$ such that
\begin{equation} \label{uellcond}
p\|h\|^2 \le \langle F'(u)h,h \rangle \le P\|h\|^2 \quad
(\|u\|\le R, \ h\in H);
\end{equation}
\item[(iii)] for every $R>0$ there exists a constant $L>0$
such that
$$
\| F'(u)-F'(v)\| \le L \|u-v\| \quad (\|u\|, \|v\|\le R).
$$
\end{enumerate}
These conditions themselves do not ensure that equation (\ref{opeq})
has a solution, hence we impose condition
\begin{enumerate}
\item[(iv)] equation (\ref{opeq}) has a solution $u^{\ast}\in H$.
\end{enumerate}
Then the solution $u^{\ast}$ is unique and also minimizes $\phi$. We note
that the existence of $u^{\ast}$ is already ensured if the lower bound
$p=p(R)$ in condition (ii) satisfies
$\lim_{R\to \infty} R p(R)=+\infty$,
or if $p$ does not depend on $R$ at all (see e.g. \cite{book,GGZ}).
Let $u_0\in H$ and let a variable steepest descent iteration
be constructed in the form
(\ref{varsqgr}):
\begin{equation} \label{varsqgr2}
u_{k+1}= u_k -\alpha_k B_k^{-1} F(u_k)
\end{equation}
with suitable constants $\alpha_k>0$ and strongly positive
self-adjoint operators $B_k$.
Let $n\in\mathbb{N}$ and assume that the $n$th term of the sequence
(\ref{varsqgr2}) is constructed. The stepsize $\alpha_n$ yields
steepest descent with respect to $B_n$ if $\phi(u_{n+1})$ coincides with
the number
$$
\mu(B_n) \equiv \min_{\alpha>0} \phi(u_n - \alpha B_n^{-1}
F(u_n)) .
$$
We wish to choose $B_n$ such that this value is the smallest possible
within the class of strongly positive operators
\begin{equation}
\label{calbdef}
\mathcal{B} \equiv \{ B\in L(H) \hbox{ self-adjoint}:
\exists \, p>0 \quad \langle Bh,h \rangle \ge p\|h\|^2 \quad
( h\in H) \}
\end{equation}
where $L(H)$ denotes the set of bounded linear operators on $H$.
(The strong positivity is needed to yield $R(B_n)=H$, by which the
existence of $B_n^{-1} F(u_n)$ is ensured in the iteration.)
Moreover, when $B_n\in \mathcal{B}$ is varied then one can
incorporate the number $\alpha$ in $B_n$, since $\alpha B_n\in
\mathcal{B}$ as well for any $\alpha>0$. That is, it suffices to
replace $\mu(B_n)$ by
\begin{equation}
\label{mbndef}
m(B_n) \equiv \phi(u_n - B_n^{-1} F(u_n)) \,,
\end{equation}
and to look for
$$
\min_{B_n\in \mathcal{B}} \ m(B_n) \, .
$$
Our aim is to verify that
\begin{equation}
\label{roughres}
\min_{B_n\in \mathcal{B}} \ m(B_n) = m(F'(u_n)) \quad \hbox{\it
up to second order}
\end{equation}
as $u_n\to u^{\ast}$; i.e., the Newton iteration realizes
asymptotically the stepwise optimal steepest descent among
different inner products in the neighbourhood of $u^{\ast}$.
(That is, the descents in Newton's method are asymptotically
steepest with respect to both different directions and inner
products.) We note that, clearly, the asymptotic result cannot be
replaced by an exact one, this can be seen for fixed $u_n$ by an
arbitrary nonlocal change of $\phi$ along the descent direction.
The result (\ref{roughres}) can be given an exact formulation in
the following way. First we define for any $\nu_1>0$ the set
\begin{equation}
\label{calbpedef}
\mathcal{B}(\nu_1) \equiv \{ B\in L(H) \hbox{ self-adjoint}:
\; \langle Bh,h \rangle \ge \nu_1\|h\|^2 \;
( h\in H) \};
\end{equation}
i.e., the subset of $\mathcal{B}$ with operators having the common lower
bound $\nu_1>0$.
\begin{theorem} \label{thm1}
Let conditions (i)-(iv) be satisfied. Let $u_0\in H$ and let the
sequence $(u_k)$ be given by (\ref{varsqgr2}) with some constants
$\alpha_k>0$ and operators $B_k\in \mathcal{B}$, with
$\mathcal{B}$ defined in (\ref{calbdef}).
Let $n\in\mathbb{N}$ be fixed, $m(B_n)$ defined by (\ref{mbndef}) and let
\begin{equation} \label{mkbndef}
\hat m(B_n) \equiv \beta + \frac 12 \left\langle
H_n (B_n^{-1} g_n - H_n^{-1} g_n), B_n^{-1} g_n - H_n^{-1} g_n \right\rangle
\,,
\end{equation}
where
\begin{equation}
\label{bghdef}
\beta = \phi(u^{\ast}), \quad g_n = F(u_n), \quad H_n = F'(u_n).
\end{equation}
Then
\begin{enumerate}
\item[(1)] there holds
$$
\min_{B_n\in \mathcal{B}} \hat m(B_n) = \hat m(F'(u_n));
$$
\item[(2)] $\hat m(B_n)$ is the second order approximation of
$m(B_n)$; i.e., for any $\nu_1>0$ and $B_n\in \mathcal{B}(\nu_1)$
\begin{equation}
\label{propk}
|m(B_n) - \hat m(B_n)| \le C \|u_n-u^{\ast}\|^3
\end{equation}
(with $\mathcal{B}(\nu_1)$ defined by (\ref{calbpedef})), where $C>0$
depends on $u_0$ and $\nu_1$, but does not depend on $B_n$ or
$u_n$.
\end{enumerate}
\end{theorem}
\begin{proof} (1) This part of the theorem is obvious since, using
that $H_n =F'(u_n)$ is positive definite by assumption (ii), we
obtain
$$
\hat m(B_n) \ge \beta = \hat m(H_n) = \hat m(F'(u_n)) .
$$
(2) We verify the required estimate in four steps.
(i) First we prove that
\begin{equation}
\label{rnull} \|u_n-u^{\ast}\| \le R_0 \,,
\end{equation}
where $R_0$ depends on $u_0$; that is, the initial guess
determines an {\it a priori} bound for a ball $B(u^{\ast},R_0)$
around $u^{\ast}$ containing the sequence (\ref{varsqgr2}). For
this it suffices to prove that the level set corresponding to
$\phi(u_0)$ is contained in such a ball; i.e.,
\begin{equation} \label{levball}
\{ u\in H: \phi(u) \le \phi(u_0) \} \subset B(u^{\ast}, R_0) ,
\end{equation}
since $u_n$ is a descent sequence with respect to $\phi$.
Let $u\in H$ be fixed and consider the real function
$$
f(t):= \phi\Bigl( u^{\ast} + t
\frac{u-u^{\ast}}{\|u-u^{\ast}\|} \Bigr) \quad (t\in\mathbb{R}),
$$
which is $C^2$, convex and has its minimum at 0. Assumption (ii)
implies that there exists $p_1>0$ such that
$$
\langle \phi''(v)h,h \rangle \ge p_1\|h\|^2 \quad
(\|v-u^{\ast}\|\le 1, h\in H),
$$
and hence
$$
f''(t) \ge p_1 \quad (|t|\le 1).
$$
Then elementary calculus yields that $f'(1)\ge p_1$ and
$f(1)-f(0)\ge p_1/2$, hence
\begin{align*}
\phi(u)-\phi(u^{\ast})
& = f(\|u-u^{\ast}\|) -f(1)+ f(1)- f(0) \\
& \ge f'(1)(\|u-u^{\ast}\|-1) + f(1)- f(0) \\
& \ge p_1 \Bigl(\|u-u^{\ast}\| - \frac 12 \Bigr) .
\end{align*}
This implies that if
$$
\|u-u^{\ast}\| \ge \frac 1{p_1} \Bigl( \phi(u_0)-\phi(u^{\ast})\Bigr)
+ \frac 12 \equiv R_0
$$
then $\phi(u) \ge \phi(u_0)$; that is, (\ref{levball}) holds with
this $R_0$.
(ii) In the sequel we omit the index $n$ for notational
simplicity, and let
$$
u = u_n, \quad g = g_n , \quad H = H_n , \quad B = B_n ,
$$
where $g_n = F(u_n)$ and $H_n = F'(u_n)$ were defined in
(\ref{bghdef}). Using this notation, (\ref{mbndef}) turns into
\begin{equation} \label{mbdef}
m(B) = \phi(u - B^{-1} g) \, .
\end{equation}
Further, we fix $\nu_1>0$ and assume that $B\in \mathcal{B}(\nu_1)$
as defined by (\ref{calbpedef}).
Now we verify that
\begin{equation} \label{mbecse}
m(B) = \phi(u) - \langle B^{-1} g, g \rangle + \frac 12 \langle HB^{-1} g,
B^{-1} g \rangle + R_1 \,,
\end{equation}
where
\begin{equation} \label{re}
|R_1| \le C_1 \|u-u^{\ast}\|^3
\end{equation}
with $C_1>0$ depending only on $u_0$ and $\nu_1$. Let $z=B^{-1} g$.
Then the Taylor expansion yields
\begin{equation} \label{taylore}
m(B) = \phi(u - z) = \phi(u) - \langle \phi'(u), z \rangle + \frac 12 \langle \phi''(u)z, z\rangle + R_1 \,,
\end{equation}
here the Lipschitz continuity of $\phi''$ implies
\begin{equation} \label{taylork}
|R_1| \le \frac{L_0}{6} \|z\|^3
\end{equation}
where $L_0$ is the Lipschitz constant corresponding to the ball
$B(u^{\ast}, R_0)$ according to assumption (iii). Here
\begin{equation} \label{ghism}
\nabla\phi (u)=F(u)=g \quad \hbox{and} \quad \nabla^2 \phi
(u)=F'(u)=H ,
\end{equation}
hence the definition of $z$ and the symmetry of $B$ yield
$$
\langle \nabla\phi (u), z \rangle = \langle B^{-1} g, g \rangle
, \quad \langle \nabla\phi '(u)z, z\rangle = \langle HB^{-1} g,
B^{-1} g \rangle
$$
and in order to verify (\ref{re}) it suffices to prove that
\begin{equation} \label{zbecs}
\|z\| \le K_1 \|u-u^{\ast}\|
\end{equation}
with $K_1>0$ depending on $u_0$ and $\nu_1$. The Taylor expansion
for $\nabla\phi $ yields
\begin{equation} \label{gbecs}
g=\nabla\phi (u) = \nabla\phi (u^{\ast}) + \nabla^2 \phi
(u^{\ast})(u-u^{\ast}) + \varrho_1 \,,
\end{equation}
where
$$
|\varrho_1| \le \frac{L_0}{2} \|u-u^{\ast}\|^2
$$
with $L_0$ as above. Here $\nabla\phi (u^{\ast})=0$. Let $P_0$ be
the upper spectral bound of $\nabla^2 \phi $ on the ball
$B(u^{\ast}, R_0)$, obtained from assumption (ii). Then, also
using (\ref{rnull}), we have
\begin{equation} \label{gbecse}
\|g\| \le P_0 \|u-u^{\ast}\| + \frac{L_0}{2} \|u-u^{\ast}\|^2
\le \Bigl( P_0 + \frac{L_0 R_0}{2} \Bigr) \|u-u^{\ast}\|
= K_0 \|u-u^{\ast}\| .
\end{equation}
From this the assumption $B\in \mathcal{B}(\nu_1)$ yields
$$
\|z\| = \|B^{-1} g\| \le (K_0/\nu_1) \|u-u^{\ast}\|,
$$
hence (\ref{zbecs}) holds with $K_1= K_0/\nu_1$ and thus
(\ref{mbecse})-(\ref{re}) are verified.
(iii) Now we prove that
\begin{equation} \label{fibecse}
\phi(u) = \beta + \frac 12 \langle H^{-1} g, g \rangle + R_2 \,,
\end{equation}
where
\begin{equation} \label{rk}
|R_2| \le C_2 \|u-u^{\ast}\|^3
\end{equation}
with $C_2>0$ depending only on $u_0$ and $\nu_1$. Similarly to
(\ref{taylore})-(\ref{taylork}), we have
$$
\phi(u) = \phi(u^{\ast}) + \langle \nabla\phi (u^{\ast}),
u-u^{\ast} \rangle + \frac 12 \langle \nabla^2 \phi (u^{\ast})(u-u^{\ast}),
u-u^{\ast}\rangle + \varrho_2 \,,
$$
where
$$
|\varrho_2| \le \frac{L_0 }{6} \|u-u^{\ast}\|^3 .
$$
Here $\phi(u^{\ast})= \beta $, $\nabla\phi (u^{\ast})=0$ and
$$
|\langle \nabla^2 \phi (u^{\ast})(u-u^{\ast}), u-u^{\ast}\rangle -
\langle H(u-u^{\ast}), u-u^{\ast}\rangle| \le L_0 \|u-u^{\ast}\|^3
$$
from $H=\nabla^2 \phi (u)$ and the Lipschitz condition. Hence
$$
\phi(u) = \beta + \frac 12 \langle H(u-u^{\ast}),
u-u^{\ast}\rangle + \varrho_3 \,,
$$
where
$$
|\varrho_3| \le \frac{2L_0}{3} \|u-u^{\ast}\|^3 .
$$
Therefore it remains to prove that
\begin{equation} \label{hbecs}
|\langle H(u-u^{\ast}), u-u^{\ast}\rangle - \langle H^{-1} g,
g \rangle | \le C_3 \|u-u^{\ast}\|^3 .
\end{equation}
Here (\ref{gbecs}) implies
$$
g=\nabla\phi (u) = \nabla^2 \phi (u^{\ast})(u-u^{\ast}) +
\varrho_1 = H(u-u^{\ast}) + (\nabla^2 \phi
(u^{\ast})-H)(u-u^{\ast}) + \varrho_1 \,.
$$
Using again the Lipschitz condition for $\nabla^2 \phi $, we have
$$
\|(\nabla^2 \phi (u^{\ast})-H)(u-u^{\ast})\| \le L_0
\|u-u^{\ast}\|^2 ,
$$
hence
\begin{equation} \label{ghkapcs}
g= H(u-u^{\ast}) + \varrho_4
\end{equation}
with
\begin{equation} \label{robecs}
|\varrho_4| \le C_4 \|u-u^{\ast}\|^2 .
\end{equation}
Setting (\ref{ghkapcs}) into the left-hand side expression in
(\ref{hbecs}) and using the symmetry of $H$, we obtain
\begin{align*}
|\langle H(u-u^{\ast}), u-u^{\ast}\rangle - \langle H^{-1} g, g \rangle |
&= |\langle g-\varrho_4 , H^{-1}(g-\varrho_4) \rangle
- \langle H^{-1} g, g \rangle| \\
&= |- 2\langle H^{-1} g, \varrho_4\rangle
+ \langle H^{-1} \varrho_4, \varrho_4\rangle| \\
&\le 2 |\langle H^{-1} g, \varrho_4\rangle|
+ |\langle H^{-1} \varrho_4, \varrho_4\rangle|\, .
\end{align*}
Let $p_0$ be the lower spectral bound of $\nabla^2 \phi $ on the
ball $B(u^{\ast}, R_0)$, obtained from assumption (ii). Then
$\|H^{-1}\| \le 1/p_0$. Hence, using (\ref{gbecse}),
(\ref{robecs}) and (\ref{rnull}), we have
\begin{align*}
|\langle H(u-u^{\ast}), u-u^{\ast}\rangle - \langle H^{-1} g, g \rangle |
&\le \frac{1}{p_0} \Bigl( 2 \|g\| \|\varrho_4\| + \|\varrho_4\|^2 \Bigr) \\
&\le \frac{1}{p_0} \Bigl( 2 K_0 C_4 \|u-u^{\ast}\|^3 + C_4^2
\|u-u^{\ast}\|^4 \Bigr)\\
& \le \frac{1}{p_0} \Bigl( 2 K_0 C_4 + R_0
C_4^2 \Bigr) \|u-u^{\ast}\|^3 ,
\end{align*}
that is, (\ref{hbecs}) holds and thus (\ref{fibecse})-(\ref{rk})
are verified.
(iv) Let us set (\ref{fibecse}) into (\ref{mbecse}) and use
the notation $R_3=R_1+R_2$:
\begin{align*}
m(B) &= \beta + \frac 12 \langle H^{-1} g, g \rangle
- \langle B^{-1} g, g \rangle + \frac 12 \langle HB^{-1} g,
B^{-1} g \rangle + R_3 \\
&= \beta+ \frac 12 \langle H (B^{-1} g - H^{-1} g),
B^{-1} g - H^{-1} g \rangle+ R_3 \\
&= \hat m(B) + R_3 \,,
\end{align*}
where by (\ref{re}) and (\ref{rk}),
$$
|R_3| \le C \|u-u^{\ast}\|^3
$$
with $C=C_1+C_2$. Therefore (\ref{propk}) is true and the proof
is complete.
\end{proof}
\begin{remark} \label{rmk1} \rm
A main application of the above theorem arises for second order
nonlinear elliptic problems. Then one can define various Sobolev
gradients using different weight functions in the Sobolev inner
product. For instance, in the case of Dirichlet problems one can
use weighted Sobolev norms $ \langle h,h\rangle_w=\int_\Omega
w(x) |\nabla h|^2\, dx$ where $w$ is a positive bounded function,
or more generally $ \langle h,h\rangle_W=\int_\Omega W(x) \nabla
h \cdot \nabla h\,dx$ where $W$ is a bounded uniformly positive
definite matrix function. Such weighted norms can be written as
$\langle Bh,h\rangle_{H^1_0}$ with some operator $B$ as in
(\ref{calbpedef}) on the space $H=H^1_0(\Omega)$, where $\langle
.,.\rangle_{H^1_0}$ denotes the standard Sobolev inner product,
hence the optimality result of Theorem \ref{thm1} covers such Sobolev gradient
preconditioners.
\end{remark}
\section{Constrained optimization for Newton's method and Sobolev gradients}
\label{secopt}
A different interpretation of Newton's method in Sobolev gradient context
uses minimization subject to constraints, which we build up using
a continuous Newton method. Suppose that $\phi$ is a $C^3$
function from $\mathbb{R}^n$ into $\mathbb{R}$. What philosophy might guide a
choice of a numerically efficient gradient for $\phi$? We first
give a quick development for the unconstrained case which gives a
somewhat different point of view to the previous section. We then
pass to the constrained case.
If $\phi$ arises from a discretization of a system of differential
equations then the ordinary gradient, a list of partial derivatives of
$\phi$ is a very poor choice for numerical purposes. We
illustrate this by a simple example in which the underlying
equation is $u' - u = 0$ on $[0,1]$. For $n$ a positive
integer, a finite dimensional least-squares formulation is, with
$\delta = 1/n$,
\begin{equation} \label{leastsquares}
\phi (u_0,u_1,\dots,u_n) = \frac{1}{2} \sum^n_{k=1} (\frac{u_k -
u_{k-1}}{\delta} - \frac{u_k + u_{k-1}}{2})^2,
\end{equation}
where $(u_0,u_1,\dots,u_n) \in \mathbb{R}^{n+1}$. It may be seen that if
$(u_0,u_1,\dots,u_n)$ is a critical point of $\phi$ then $\phi
(u_0,u_1,\dots,u_n) = 0$ and so
\begin{equation*}
\frac{u_k - u_{k-1}}{\delta} - \frac{u_k + u_{k-1}}{2} = 0, \quad
k=1,\dots,n,
\end{equation*}
which are precisely the equations to be satisfied by the
Crank--Nicolson method for this problem. It is widely understood
that the ordinary gradient of $\phi$ is a disaster numerically
using steepest descent. By contrast, consider the gradient of
$\phi$ taken with respect to the following finite dimensional
emulation of the Sobolev space $H^{1,2}([0,1])$:
\begin{equation}\label{snorm}
\alpha(u_0,u_1,\dots,u_n) = \|u\|_S^2 = \sum^n_{k=1} ((\frac{u_k -
u_{k-1}}{\delta})^2 + (\frac{u_k + u_{k-1}}{2})^2),
\end{equation}
$u = (u_0,u_1,\dots,u_n) \in \mathbb{R}^{n+1}$. The Sobolev gradient of
$\phi$ at such a $u$ is the element $(\nabla_S \phi)(u)$ so that
\begin{equation*}
\phi'(u)h = \langle h , (\nabla_S \phi)(u) \rangle_S,
\quad h \in \mathbb{R}^{n+1},
\end{equation*}
where $\langle \cdot,\cdot \rangle_S$ denotes the inner product
associated with \eqref{snorm}.
In \cite{sv}, it is indicated that about seven steepest descent
iterations suffice using the Sobolev gradient, whereas for steepest
descent with the ordinary gradient a large number of iterations is
required (on the order of 30, 5000, 500000 iterations for
$n=10,20,40$, respectively).
In the above example we might have been guided in our choice of metric
by the fact that the Sobolev space $H^{1,2}([0,1])$ is a good
choice of a metric for the underlying continuous least squares
problem
\begin{equation*}
\Phi (u) = \frac{1}{2} \int^1_0 (u' - u )^2, \quad u \in
H^{1,2}([0,1]).
\end{equation*}
That this Sobolev metric renders $\Phi$ differentiable (in contrast
with trying to define $\Phi$ as a densely defined everywhere
discontinuous function on $L_2([0,1])$) is a good indication that
its finite dimensional emulation should provide a good numerical
gradient.
Examining \eqref{leastsquares}, \eqref{snorm} together we see that
elements $(u_0,u_1,\dots,u_n)$ have similar sensitivity (i.e.,
similar sized partial derivatives) in both expressions. Note that
the first and last components of such a vector have sensitivity
quite different from the other $n-1$ components. Roughly, when
various components of the argument of $\phi$ have widely different
sensitivity, the resulting gradient is very likely to have poor
numerical properties. As explained in \cite{book, sv}, the Sobolev
gradient compensates, yielding an organized way to define a
preconditioned version of the original gradient. This phenomenon is
pervasive for functionals which arise from discretizations of
systems of differential equations. In what follows, we see how to
achieve this benefit when a natural norm is not available.
Essentially we see how Newton's method fits into the family of
Sobolev gradients.
Suppose $\phi$ is a $C^3$ real-valued function on $\mathbb{R}^n$ and that
a more or less obvious norm as in \eqref{snorm} has not presented
itself. Following the opening remarks in \cite{sv}, if $u \in \mathbb{R}^n$
define $\beta: \mathbb{R}^n \to \mathbb{R}$ by
\begin{equation*}
\beta(h) = \phi(h+u), \quad h \in \mathbb{R}^n.
\end{equation*}
For $h$ close to zero, one might expect the sensitivity in $\beta$
of various components of $h$ to somewhat match their sensitivity
in $\phi' (u) h$. Now
\begin{equation*}
\phi'(u) h = \langle h, (\nabla \phi)(u) \rangle_{\mathbb{R}^n},
\end{equation*}
using the ordinary gradient of $\phi$ and
\begin{equation*}
\beta'(u) h = \langle h,(\nabla \phi)(u+h) \rangle_{\mathbb{R}^n}.
\end{equation*}
For sensitivities of $h$ in both of $\beta'(u)h$ and
$\phi'(u) h$ to approximately match, one might ask that
$(\nabla \phi)(u)$ and $(\nabla \beta)(u)$ (ordinary gradients) be
dependent. The following result indicates conditions under which
this dependency can be found.
\begin{theorem} \label{thm1b}
Suppose $u \in \mathbb{R}^n$ and $\phi$ is a $C^3$ function from
$\mathbb{R}^n$ to $\mathbb{R}$ so that $((\nabla \phi)'(u))^{-1}$ exists.
Then there is an
open interval $J$ containing $1$ and a function $z: J \to
\mathbb{R}^n$ so that
\begin{equation*}
t (\nabla \phi)(u) = (\nabla \phi)(z(t)), \quad t \in J.
\end{equation*}
\end{theorem}
\begin{proof}
Denote by $\gamma$ a positive number so that if $\|y - u\| \le
\gamma$, then $((\nabla \phi)'(y))^{-1}$ exists. By basic
existence and uniqueness theory for ODE, there is an open interval
$J$ containing $1$ and $z: J \to \mathbb{R}^n$ so that $z(1) = u$
and
\begin{equation}\label{cnm}
z'(t) = ((\nabla \phi)' (z(t)))^{-1} (\nabla
\phi)(u), \quad t \in J
\end{equation}
and hence
\begin{equation}\label{sdnobc}
((\nabla \phi)(z))' (t) = (\nabla \phi)(u), \quad t \in J.
\end{equation}
Consequently,
\begin{gather}
(\nabla \phi)(z(t)) - (\nabla \phi)(z(1)) = (t-1)(\nabla \phi)(u)
, \quad t \in J, \notag\\
\label{concl}
(\nabla \phi)(z(t)) = t (\nabla \phi)(u), \quad t \in J
\end{gather}
since $z(1) = u$.
\end{proof}
Thus starting at $z(1) = u$, the path followed by the solution $z$ to
\eqref{cnm} is a trajectory under a version of continuous Newton's
method since $(\nabla \phi)(u)$ in \eqref{concl} may be replaced
by $(\nabla \phi)(z(t))$, $t \in J$, with just a change of scalar
multiples due to the fact that the vector field directions are not
altered. Hence \eqref{cnm} traces out, in a sense, a path of
equi-sensitivity. If the interval $J$ can be chosen to include
$0$, then $z(0)$ will be a sought after zero of $\nabla \phi$.
By \cite{Nm} one may substantially reduce the $C^3$ differentiability
in the preceding. This reference also indicates how some of the
above considerations apply to systems of PDE in which indicated
inverses do not exist.
We now turn to a constrained optimization setting motivated in part by the
above. Two versions are indicated, one for Sobolev gradient
steepest descent and the other for continuous Newton's method.
First recall that there are two distinct ways systems of differential
equations may be placed into an optimization setting. Sometimes
a given system of PDE are Euler-Lagrange equations for some functional $\Phi$. In this case
critical points of $\Phi$ are precisely solutions to the given
system of PDE. In the second case for $F: X \to Y$ a
$C^2$ function from a Hilbert space $X$ into a Hilbert space $Y$,
think of
\begin{equation*}
F(u) = 0
\end{equation*}
as representing a system of differential equations. Such a system
may often be placed in an optimization setting by defining
\begin{equation}\label{phif}
\Phi(u) = \frac{1}{2} \|F(u)\|^2_Y, \quad u \in X.
\end{equation}
It is common that, for $u \in X$, the range of $F'(u)$ is
dense in $Y$. In this case it follows that $u \in X$ is a zero of
$F$ if and only if it is a critical point of $\Phi$ (see
\cite{sv}).
In either the Euler-Lagrange or the least squares cases one might
want a critical point of $\Phi$ which lies in some manifold
contained in $X$. A convenient way that such a manifold might be
specified is by means of a function $B$ from $X$ into a third
Hilbert space $S$. In effect one can specify `boundary conditions'
or, more accurately, supplementary conditions on a given system by
requiring that
\begin{equation}\label{bu}
B(u) = 0
\end{equation}
in addition to \eqref{phif}. For each $u \in X$, denote by $P_B
(u)$ the orthogonal projection of $X$ onto $N(B'(u))$. For
$X$ a finite dimensional space assume that $B'(u)
B'(u)^*$ has an inverse for all $u \in X$ where $B'(u)^*$
is the adjoint of $B'(u)$ considered as a member of
$L(X,S)$. This is a natural assumption in that $S$ would
generally have smaller dimension than $X$.
With this assumption it may be seen that
\begin{equation*}
P_B (u) = I - B'(u)^* (B'(u)B'(u)^*)^{-1}
B'(u), \quad u \in X
\end{equation*}
since $P_B(u)$ is idempotent, symmetric and has range
$N(B'(u))$. We make the additional assumption that $P_B$
is $C^1$. For $\phi$ as in \eqref{phif} and $\phi'(x)h =
\langle h,(\nabla \phi)(x) \rangle_X$, $x,h \in X$, define
\begin{equation*}
(\nabla_B \phi)(x) = P_B(x) (\nabla \phi)(x),\quad x \in X.
\end{equation*}
Then if
\begin{equation}\label{sdb}
z(0) = x \in X, \quad z'(t) = -(\nabla_B \phi)(z(t)), \quad
t \ge 0,
\end{equation}
we have the following result.
\begin{theorem} \label{thm2}
For $z$ as in \eqref{sdb},
\begin{equation*}
B(z)'(t) = 0, \quad t \ge 0.
\end{equation*}
\end{theorem}
This follows since
\begin{equation}\label{sdbc}
B(z)'(t) = -B'(z(t)) P_B(z(t)) (\nabla \phi)(z(t)) = 0, \quad t \ge 0.
\end{equation}
Thus if in \eqref{sdb}, $B(x) = 0$ it follows that $B(z(t)) = 0, t \ge 0$
and hence if
\begin{equation*}
u = \lim_{t \to \infty} z(t),
\end{equation*}
then $B(u) = 0$ as well as $(\nabla \phi)(u) = 0$.
We now give a similar development for continuous Newton's method by
means of the following result. Denote by each of $X,Y,S$ a Banach space.
For $x \in X,\; r > 0$, $X_r(x)$ denotes
the ball in $X$ of radius $r$ centered at $x$.
\begin{theorem} \label{nmbc}
Suppose $r > 0$, $x_0 \in X$, $F: X_r(x_0) \to Y$, $B:
X_r(x_0) \to S$ are each $C^1$, $B(x_0) = 0$. Suppose
also that $h: X_r(x_0) \to X$ is a locally Lipschitzian
function so that if $x \in X_r(x_0)$ then
\begin{equation}\label{linver}
F'(x) (h(x)) = -F(x_0) \hbox{ and } h(x) \in N(B'(x)), \; \|h(x)\|_X \le r.
\end{equation}
Denote by $z: [0,1] \to X_r(x_0)$ so that
\begin{equation}\label{cnm2}
z(0) = x_0, \quad z'(t) = h(z(t)), \quad t \in [0,1].
\end{equation}
Then
$F(z(1)) = 0$ and $B(z(1)) = 0$.
\end{theorem}
\begin{proof}
Note that $z(t) \in X_r(x_0)$ since $h(z(t)) \in X_r(0)$, $t \in [0,1]$.
Also note that
\begin{equation*}
(Bz)' (t) = B'(z(t)) z'(t) = B'(z(t)) h(z(t)) = 0, \quad t \in [0,1]
\end{equation*}
and so $B(z(t)) = 0$, $t \in [0,1]$ since $B(x_0) = 0$.
Hence $B(z(1)) = 0$.
But also,
\begin{equation*}
F(z)'(t) = F'(z(t)) z'(t) = F'(z(t)) h(z(t)) = -F(x_0), \quad
t \in [0,1]
\end{equation*}
and so
\begin{equation*}
F(z(t)) - F(x_0) = -t F(x_0)
\end{equation*}
that is,
\begin{equation*}
F(z(t)) = (1-t) F(x_0), \quad t \in [0,1].
\end{equation*}
Thus $F(z(1)) = 0$.
\end{proof}
In case $F'(x)$ has an inverse, continuous and defined on all of $X$,
one may take in place of \eqref{cnm2} the following:
\begin{equation}\label{nvf1}
h(x) = -F'(x)^{-1} F(x_0), \; x \in X,
\end{equation}
more likely recognizable as a Newton vector field or else the conventional
field:
\begin{equation}\label{nvf2}
h(x) = -F'(x)^{-1} F(x), \quad x \in X.
\end{equation}
With \eqref{nvf1} continuous Newton's method is on $[0,1]$ and with
\eqref{nvf2} continuous Newton's method is on $[0,\infty)$.
In these last two cases, there is no possibility of imposing further
boundary conditions using a function $B$.
\begin{thebibliography}{99}
\bibitem{Ax82} Axelsson, O.,
\emph{On global convergence of iterative methods},
in: Iterative solution of nonlinear systems of equations,
pp. 1-19, Lecture Notes in Math. 953, Springer, 1982.
\bibitem{finnof} Axelsson, O., Farag\'o I., Kar\'atson J.;
On the application of preconditioning operators for nonlinear
elliptic problems, in: {\it Conjugate Gradient Algorithms and Finite
Element Methods}, pp. 247-261, Springer, 2004.
\bibitem{CN} Castro, A., Neuberger, J. W.;
An inverse function theorem via continuous Newton's method.
Proc. WCNA, Part 5 (Catania, 2000), {\it Nonlinear Anal.} 47 (2001),
no. 5, 3223--3229.
\bibitem{book} Farag\'o, I., Kar\'atson, J.; {\it
Numerical solution of nonlinear elliptic problems via
preconditioning operators. Theory and applications.}
Advances in Computation, Volume 11, NOVA Science Publishers, New York, 2002.
\bibitem{GGZ} Gajewski, H., Gr\"oger, K.,
Zacharias, K.; {\it Nichtlineare Operatorgleichungen und
Operatordifferentialgleichungen,} Akademie-Verlag, Berlin, 1974.
\bibitem{K-A} Kantorovich, L. V. and Akilov, G.P.; \emph{Functional
Analysis, } Pergamon Press, 1982.
\bibitem{varprec} Kar\'atson J., Farag\'o I.;
Variable preconditioning via quasi-Newton methods
for nonlinear problems in Hilbert space,
{\it SIAM J. Numer. Anal.} {\bf 41} (2003), No. 4,
1242-1262.
\bibitem{sv} Neuberger, J. W.;
\emph{Sobolev Gradients and Differential
Equations}, Springer Lecture Notes in Mathematics $1670$, 1997.
\bibitem{Neu03} Neuberger, J. W.; Integrated form of continuous Newton's method,
Evolution equations, 331--336, {\it Lecture Notes in Pure and
Appl. Math.}, 234, Dekker, New York, 2003.
\bibitem{Nm}
Neuberger, J. W.; A near minimal hypothesis Nash-Moser Theorem,
{\it Int. J. Pure. Appl. Math.}, 4 (2003), 269-280.
\bibitem{Neu3} Neuberger, J. W., Renka, R. J.; Minimal surfaces
and Sobolev gradients. {\it SIAM J. Sci. Comput.} 16 (1995), no.
6, 1412--1427.
\bibitem{Neu6} Neuberger, J. W.; Renka, R. J.;
Sobolev gradients: introduction, applications, problems, to appear
in: {\it Contemporary Mathematics} (AMS, Northern Arizona)
\bibitem{R2} Rheinboldt, W. C.;
{\it Methods for solving systems of nonlinear equations} (second edition),
CBMS-NSF Regional Conference Series in Applied Mathematics, 70,
SIAM, Philadelphia, PA, 1998.
\end{thebibliography}
\end{document}