\documentclass[reqno]{amsart}
\usepackage{hyperref}
\usepackage{graphicx}

\AtBeginDocument{{\noindent\small
Eighth Mississippi State - UAB Conference on Differential Equations and
Computational Simulations.
{\em Electronic Journal of Differential Equations},
Conf. 19 (2010),  pp. 15--30.\newline
ISSN: 1072-6691. URL: http://ejde.math.txstate.edu or http://ejde.math.unt.edu
\newline ftp ejde.math.txstate.edu}
\thanks{\copyright 2010 Texas State University - San Marcos.}
\vspace{9mm}}


\begin{document}  \setcounter{page}{15}
\title[\hfilneg EJDE-2010/Conf/19\hfil Variational data assimilation]
{Variational data assimilation for discrete Burgers equation}

\author[A. Apte, D. Auroux, M. Ramaswamy \hfil EJDE-2010/Conf/19\hfilneg]
{Amit Apte, Didier Auroux, Mythily Ramaswamy}  % in alphabetical order

\address{Amit Apte \newline
TIFR Center for Applicable mathematics, Post Bag No. 6503,
Chikkabommasandra, Bangalore 560065 India}
\email{apte@math.tifrbng.res.in}

\address{Didier Auroux \newline
Laboratoire J.-A. Dieudonne, Universite de Nice Sophia
Antipolis, Parc Valrose, F-06108 Nice cedex 2, France}
\email{auroux@unice.fr}

\address{Mythily Ramaswamy \newline
TIFR Center for Applicable mathematics, Post Bag No. 6503,
Chikkabommasandra, Bangalore 560065 India}
\email{mythily@math.tifrbng.res.in}

\thanks{Published September 25, 2010.}
\subjclass[2000]{35L03}
\keywords{Variational data assimilation; Burgers equation; \hfill\break\indent
Lax-Friedrichs scheme}

\begin{abstract}
 We present an optimal control formulation of the data assimilation
 problem for the Burgers' equation, with the initial condition as the
 control. First the convergence of the implicit Lax-Friedrichs
 numerical discretization scheme is presented. Then we study the
 dependence of the convergence of the associated minimization problem
 on different terms in the cost function, specifically, the weight for
 the regularization and the number of observations, as well as the
 \emph{a priori} approximation of the initial condition. We present
 numerical evidence for multiple minima of the cost function without
 regularization, while only a single minimum is seen for the
 regularized problem.
\end{abstract}

\maketitle
\numberwithin{equation}{section}
\newtheorem{theorem}{Theorem}[section]
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{remark}[theorem]{Remark}

\def\R{{\mathbb R}}

\section{Introduction}

In recent years, there has been great interest in development of
methods aimed at blending together sophisticated computational
models of complex systems and the vast amounts of data about these
systems that is now commonly available. E.g., Satellites provide
detailed observations of the atmosphere and the oceans
\cite{webs}. As a result, \emph{data assimilation}, which refers
to the process of combining data and the model output, has
received a lot of attention not only from researchers in sciences
and engineering, but also from the mathematical community in order
to develop sound mathematical and statistical foundations for
these methods.\cite{ApteJ08} There is a host of methods, such as
the 4d-var (four dimensional variational) and EnKF (Ensemble
Kalman Filter) which are prevalent in the atmospheric and oceanic
sciences,\cite{Ben01, Kalnay03, LewisL06, Evensen07,
LeDimetTalagrand86, Evensen03} while many new ones, such as
Bayesian sampling,\cite{ApteH07, CotterD10a} back and forth
nudging,\cite{Auroux09, AurouxBlum08} are under intense
development. The main aim of this paper is to study some of these
methods, in a model which is simple enough so as to be
mathematically tractable but at the same time retains the
essential features of relevance to applications in atmospheric
sciences.

In particular, we will consider a dynamical model, in our example, the
Burgers' equation, which is well-posed as an initial value
problem. The main problem of data assimilation is that of estimating
in an ``optimal'' manner the initial condition, given a set of noisy
data which are obtained by observing the time evolution of a ``true''
initial condition of the physical dynamical systems which this model
represents. In the context of geophysics, this is the problem of
initialization of the numerical model.

We have chosen Burgers' equation as a dynamical model for several
reasons. Firstly, it is a very well studied forward model -- the
existence and uniqueness of solutions is well known.\cite{lions72} In
fact, uniqueness of the solution of a specific variational formulation
of the initialization problem is also known,\cite{Wh08} see
sec.~\ref{sec:4dvar}. Furthermore, data assimilation and related
problems using Burgers' equation have been studied by several
authors.\cite{LDD94, ChungL97, LundvallK06, CastroP08, CastroP10}
Unlike these previous studies, one of the aims of this study will be
to understand the effect of ``data density,'' i.e. increasing number
of observations, on the optimal state estimates that we discuss.
Lastly, our main aim is to use this model to study data assimilation
methods based on Bayesian approaches and compare them with existing
methods such as 4d-var and EnKF. This is still work in progress and we
will present these results elsewhere in future. The major limitation
of this study in particular, and Burgers' equation in general, is that
the data we use will be ``synthetic'' or simulated data and not from
any physical system. Thus we will not be dealing with the issues of
errors in modelling.

This article presents initial results from a larger study, of
which overall scope is as follows. We would like to study the
Bayesian formulation of the variational approach stated below
(Sec.~\ref{sec:4dvar}), where the cost function is seen as the
logarithm of a posterior distribution function on appropriate
function spaces. Such approach is being developed in other data
assimilation problems as well.\cite{CotterD10a, CotterD10b}
Further, we will formulate a version of the Kalman filter on these
spaces, and then compare the two methods.

The paper is organized as follows. We will first discuss the data
assimilation problem in the framework of optimal control theory, and
present some of the available theoretical results. We will then
describe the analysis of the numerical methods we use for discretizing
the continuous problem. The novel feature here is a discussion about
the convergence of viscous Burgers' equation over a bounded domain. We
will also present numerical results corroborating this analysis. We
will then present the gradient descent methods and the numerical
results for minimization of the cost functional whose minima represent
the optimal estimate of the state based on the data.

We will end the paper with a discussion of our ongoing work and its
relation to other contexts.

\section{4d-var method for Burgers' equation}
\label{sec:4dvar}

We will first formulate the data assimilation problem as an optimal
control problem by constructing a cost function which measures
``distance'' between the given observations and our estimate of it,
and whose minimum will give the optimal initial condition. This is
precisely the commonly used 4d-var method.\cite{LeDimetTalagrand86}
Such a minimization problem is ill-posed.\cite{LundvallK06} We will
use Tikhonov regularization by adding a ``background''
guess.\cite{Wh08, Kirsch96} Another regularization of this problem,
given by the Landweber iteration method, is discussed in
\cite{LundvallK06}.

\subsection{Cost function}

Let us consider a model given by viscous Burgers' equation
\begin{equation}\label{eq:burger}
\frac{\partial z}{\partial t} +
\frac{1}{2}\frac{\partial (z^2) }{\partial x} = \mu \frac{\partial
^2 z}{\partial x^2}
\end{equation}
for $(x,t) \in \Omega = (0,1) \times (0,T) $ and for a positive
parameter $\mu$, with Dirichlet boundary conditions
$$
z(0,t)  =  0  =   z(1,t)
$$
and initial condition
$$
z(x,0) = u(x).
$$
For $u \in L^2 ( \Omega) $, it is known that there exists a unique
solution $z \in L^2 ( 0,T; H^1_0 ( \Omega) ) \cap C ([0,T] ; L^2 (
\Omega)) $ (see for example  \cite[Chapter 2, section 2]{GoR}). For
initial conditions with better regularity, the solution is also in a
better space (see for example \cite{Wh08, LundvallK06}).

In parallel with the discussion in \cite{Wh08}, we will assume that
the observations $Z^d(t)$ at a fixed time $t$ are in a Hilbert space
$\mathcal{Z}$ and that the state of the system $z(t;u)$ at time $t$ is
related to the data by the \emph{observation operator}, which is a
linear mapping $C$; i.e., $C z(t;u) \in \mathcal{Z}$ as well. We will
consider two distinct cases.
\begin{itemize}
\item \emph{Continuous observations}: The observations will be
  available continuously in the domain $\Omega(0,T)$. The cost
  function can be written as
  \begin{equation} \label{eq:costcont}
  J(u) = \frac{1}{2}\int_0^T \| C(z(t;u)) -Z^d
  \|_\mathcal{Z}^2 \ dx \ dt
  + \frac{\alpha}{2} \int_0^1 | u - u^b|^2 \ dx
  \end{equation}
  where $\| \cdot \|_\mathcal{Z}$ is a norm on $\mathcal{Z}$. We will
  only consider the case $C = \textrm{id}$ and the $L^2$ norm. We
  note that the above cost function is not the most natural one to
  consider when the observations are noisy, but continuous in time.
\item \emph{Discrete observations}: The observations are taken at a
  finite number $M$ of points in ``space'' $x$ and finitely many
  times. In particular,
  \begin{align*}
    C(z(t_i)) = \{z(x_1,t_i), \dots, z(x_M,t_i)\} \quad \text{for}
    \quad 0 \le t_1 < t_2 \dots < t_N = T
  \end{align*}
  The cost function in this case is
  \begin{equation} \label{eq:costdiscr}
  J(u) = \frac{1}{2} \sum_{i=1}^N | C(z(t_i,u)) -Z^d(t_i) |^2
  + \frac{\alpha}{2} \int_0^1 | u - u^b|^2 \ dx\,,
  \end{equation}
  where the norm in the sum is simply the $L^2$ norm on $\mathcal{Z}
  \equiv \R^M$ which is finite-dimentional.
\end{itemize}
In both the above cases, the ``optimal'' initial condition will be
given by the minimum of the cost function. Thus, we
will look for
$$ \overline{u} =  \mathop{\rm argmin}_{ u \in \mathcal{U} }  J(u) $$
for a reasonable class of controls, say
$ \mathcal{U} = H^1_0 ( 0,1)$.
Note that we have used Tikhonov regularization with
$u^b (x) $ being an \emph{a priori} approximation of the unknown
initial condition $u$. We will later discuss the dependence of the
minimum on both $\alpha$ as well as on $u^b(x)$.

\subsection{Adjoint equations}

In the continuous observation case, one can show that under reasonable
assumptions, there exists at least one solution $\overline{u}$ to the
minimization problem. Further, the first order optimality conditions
verified by $\overline{u}$ can be derived using the fact
$ J' (\overline{u} ) = 0$ and the existence of the co-state vector
$P \in L^2 ( 0,T; H^1_0 ( \Omega))$, satisfying
\begin{equation}
 -\frac{\partial P}{\partial t} - \overline{z} \frac{\partial P}{\partial x}
- \mu \frac{\partial ^2 P}{\partial x^2} = C^*
[C(\overline{z}(\overline{u})) - Z^d]
\quad \textrm{with} \quad
P (x,T) = 0 .
\label{eq:adjoint}
\end{equation}
Here $\overline{z}(\overline{u})$ is the solution of
\eqref{eq:burger} with initial condition $\overline{u}$ and $C^*$
is the adjoint of $C$.  In \cite{Wh08} sufficient conditions are
derived, namely smallness of $T$, so that $J$ admits a unique
minimum. The condition on smallness of $T$ depends on the observations
as well as the viscosity $\mu$, and is difficult to verify in
practice.  Our main interest will be in studying numerical methods for
finding a minimum and understanding whether this minimum is unique. We
will also use the co-state or adjoint equation in the minimization
algorithm in order to calculate the gradient of the cost function.

In the case of discrete observations, we are not aware of any results
for existence of a minimum. But, even in that case, the above co-state
equation with the right hand set to zero, and with jump conditions
\[
  P(x,t_i-) = P(x,t_i+) - \nabla_z | C(z(t_i,u)) -Z^d(t_i)
  |^2 \quad\text{for}\quad  i = N, N-1, \dots, 1
\]
at observations times, can be used to calculate the gradient of the
cost function. Adjoint methods have been discussed previously for
various different optimal control problems,
e.g., \cite{NoackW07,LDD98}.

\section{Numerical Methods}

Here we will consider two finite difference schemes for Burgers'
equation and indicate their convergence and then derive the adjoint
schemes and describe the 4d-var algorithm using steepest descent
method. Throughout this section, we will use super- and sub-scripts for
time and space discretization, respectively. In particular, for any
function $U(x,t)$, let us denote
\[
  U_j^m = U(x_j, t_m)
  \quad \text{for } j=0, \dots, (n+1),
  \; m = 0, \dots, N,
  \]
where
\[
  x_j = j \Delta x, \quad (n+1)\Delta x = 1, \quad
  t_m = m \Delta t, \quad N \Delta t = T.
\]

\subsection{Schemes for Burgers' equation}

We will consider two schemes here -- the implicit Lax-Friedrichs
scheme and the ``centered difference'' scheme. We will show that for
the implicit Lax-Friedrichs scheme, the time step $\Delta t$ can be
chosen to be much larger than that for the centered difference scheme
and for the rest of the numerical work we will focus only on the
implicit Lax-Friedrichs scheme.

\subsubsection{Implicit Lax-Friedrichs scheme}

Let us first consider the following descretization of the Burgers'
equation.
\begin{equation}  \label{eq:lf}
\begin{aligned}
\mathcal{L}^{\textrm{LF}} U (x_j, t_m)
 &=
\frac{U_j^{m+1} - \frac{U^m_{j+1}  + U^m_{j-1}}{2}}{\Delta t } +
\frac{1}{4 \Delta x} ((U^m_{j+1})^2 - (U^m_{j-1})^2 )   \\
&\quad -  \frac{\mu}{(\Delta x)^2}(U^{m+1}_{j+1} - 2U^{m+1}_j +
U^{m+1}_{j-1}) = 0, \quad 1 \leq j \leq N-1.
\end{aligned}
\end{equation}
We first calculate the local truncation error
$T_j^{m} = T (x_j, t_m)$, which is obtained by applying the scheme
to the exact
solution $z (x_j, t_m):$
$$
T (x, t) =\mathcal{L}^{\textrm{LF}}(z (x, t))\,.
$$
Assuming the solution to be smooth and using Taylor's theorem, we have
for $k = \Delta t$ and $h = \Delta x $,
\begin{gather*}
  \frac{1}{k}\big\{z (x, t +k) - \frac{1}{2}
  [z (x+ h, t) + z( x- h, t)]\big\}
  = z_t + \frac{1}{2} z_{tt} k - \frac{1}{2} z_{xx}
  \frac{h ^2}{k} + o\big(k +\frac{h ^2}{k} \big) ,
\\
\begin{aligned}
&\frac{\mu}{h^2} [ z (  x+ h, t + k)
  -  2  z (x,t +k) + z ( x -h, t+ k)]\\
& =  \mu z_{xx}   + \frac{ \mu}{12} z_{xxxx} h^2
   + \mu z_{xxt} k  + o(h ^2+k) ,
\end{aligned} \\
  \frac{1}{4h}[z^2(x+h,t) - z^2(x-h,t)]
  = \frac{1}{2} (z^2)_x +
  (z z_{xxx} + 3 z_x z_{xx}) \frac{h^2}{6}
  + o(h^3),
\end{gather*}
where $z_x = z_x(x,t)$ etc. on the right hand side. Using
the above expressions, we obtain
\begin{align*}
\mathcal{L}^{\textrm{LF}} z (x_j, t_m)
&=(z_t + zz_x - \mu z_{xx}) +
(\frac{1}{2}z_{tt} - \mu z_{xxt} )k
- z_{xx} \frac{h^2}{2k}\\
&\quad + (3 z_x z_{xx} + zz_{xxx}
- \frac{\mu}{2} z_{xxxx} )\frac{h^2}{6}
 + o( k + h^2 +  \frac{h^2}{k})
\end{align*}
Choosing $h,k$ small such that $h/k$ is a positive constant,
\begin{equation}
\begin{aligned}
|T (x , t)|
&  \leq (\frac{1}{2}|z_{tt}| + \mu |z_{xxt}|) k
+ |z_{xx} | \frac{h^2}{2 k}
+ (3|z_x z_{xx}| + |zz_{xxx}|+ \frac{\mu}{2} |z_{xxxx}|)\frac{h^2}{6}
 \\
&\quad +  o (k + h^2 + \frac{h^2}{k})
\end{aligned} \label{eq:trunc}
\end{equation}
and using the fact that the derivatives of $z$ are bounded in our
domain, and for $h/k$ a constant,
$$
|T (x , t)|  \leq  C k.
$$
This shows that the local truncation error goes to zero as $k$ goes to
zero with $\frac{h}{k}$ constant. Thus the scheme is consistent. [In
fact, this is true for $h^p/k$ constant for any $0 < p < 2$ and the
order of the scheme in this case is $\min(1,2/p-1)$. Thus, $p \le 1$
gives the scheme of highest order which is one.]

To prove convergence, we will extend $U^m_j $ as a piecewise
constant function $U_k (x,t)$ for a time step $k$ for all $(x,t) $ in
our domain and define the error to be
$$
e_k (x,t) = U_k (x,t) - z(x,t)
$$
where $z$ is the solution of \eqref{eq:burger}.  The scheme is
said to be convergent if this error converges to zero in some norm as
$k$ tends to zero.  (See \cite{RLV} for notations and definitions.)
Henceforth, we will drop the subscript $k$ for the time-step $k$.

Let us multiply \eqref{eq:lf} by $k$ and write the scheme as
\begin{equation} \label{eq:lf1}
k \mathcal{L}^{\textrm{LF}} U^m_{j}
= ( A U^{m+1})_j  - [H (U^m)]_j
\end{equation}
where $H$ is a nonlinear operator defined by
\begin{equation}
[H (U^m)]_j
:= \frac{1}{2}\ (U^m_{j+1} + U^m_{j-1})
- \frac{k}{4h}[ (U^m_{j+1})^2
  - (U^m_{j-1})^2 ]
\end{equation}
and the matrix $A$ is symmetric, tridiagonal with $(1+ 2 \mu k/h^2 )$
as diagonal and $(-\mu k/h^2)$ as sub- and super-diagonal
entries. Noting that
$$\mathcal{L}^{\textrm{LF}} U^m_{j} =0, \quad \textrm{and} \quad
\mathcal{L}^{\textrm{LF}} z(x_j,t_k) =  (T_k)^m_j,$$
we get
$$
(Ae^{m+1})_j = [H(U^m)]_j- [H(z^m)]_j - kT^m_j.
$$
We need to estimate the first term on the RHS in order to get a
recurrence relation for the error. Before doing that,
we estimate the norm of the inverse of the matrix $A$.
Let us denote for any vector $x = (x_1,x_2, \cdots, x_n) \in \mathbb{R}^n $,
$$
\|x\|_\infty =  \max_{1 \leq j \leq n} \ |x_j|
$$
and for the bounded function $z$ defined on the domain
$I \times [0,T]$,
$$
\|z \|_{\infty} = \sup_{I \times [0,T]} |z(x,t)|\,.
$$

\begin{lemma} \label{lem3.1}
$\|A^{-1}\|_\infty \le 1$
\end{lemma}

\begin{proof}
By definition,
$$
\| A^{-1} \|_\infty = \max_{\|x\|_\infty = 1} \| A^{-1} x \|_\infty
$$
There exists $x^\ast \in \mathbb{R}^n$ such that
$\|x^\ast\|_\infty = 1$ and
$\| A^{-1} \|_\infty = \| A^{-1} x^\ast \|_\infty$.

Let assume that $\| A^{-1} \|_\infty > 1$. Then
$\| A^{-1} x^\ast \|_\infty > 1$. Let $y=A^{-1}x^\ast$; i.e.,
$x^\ast = Ay$.  Then we
have $\| y \|_\infty > 1$ and $\| Ay \|_\infty = 1$.

Let $i_0$ such that $y_{i_0} = \| y \|_\infty > 1$. (Replace $y$ by
$-y$ if the maximum of the absolute values is reached with a negative
value). Then the corresponding row of $Ay$ is
$$
(Ay)_{i_0} = y_{i_0} + K (2 y_{i_0} - y_{i_0-1} - y_{i_0+1})
$$
where $K = \frac{\mu k}{h^2}>0$ in the decomposition $A = I + K B$.

If $i_0 = 1$ or $n$, then there is only one ``-1'' in the
corresponding row of $B$. As $y_{i_0}\ge y_i$ for all $i$, then
$(Ay)_{i_0} \ge y_{i_0} > 1$. Thus $\|Ay\|_\infty >
1$. There is a contradiction, and thus $\|A^{-1}\|_\infty \le 1$.
\end{proof}


\begin{remark} \label{rmk3.2} \rm
Note that this norm becomes very close to $1$ if $K$ is close to $0$
(but by definition of $K$, it should be a quite large real number), or
if all the components of $y$ are close ($y_{i_0} = y_{i_0-1} =
y_{i_0+1} \Rightarrow (Ay)_{i_0} = y_{i_0}$). It is possible to
consider $y = (1\ 1\ 1 \ \dots \ 1)^T$ (as $\|y\|_\infty = 1$), and
then, $Ay = y$ and then $A^{-1}y = y$ and $\|A^{-1}\|_\infty \ge 1$
(and this is the maximum).
\end{remark}


\begin{lemma} \label{lem3.3} Let
$C_m = \max \{ \|U^m\|_\infty, \|z \|_{\infty} \}$
 for  $0 \leq m \leq N$. If  the CFL condition
\begin{equation}
  \frac{k C_0}{h} \leq 1 \label{eq:cfl}
\end{equation}
holds for $k,h$ with $ kN \leq T $ and
$\frac{k}{h}  $ a positive constant,
then
$$
C_m  \leq C_0  \quad \forall \; 1 \leq m \leq N.
$$
\end{lemma}

\begin{proof}
We can prove this by induction.  From \eqref{eq:lf1},
we have for $m=1$,
\begin{align*}
( A U^{1})_j
&= [H (U^0)]_j  = \frac{1}{2} (U^0_{j+1} + U^0_{j-1})
- \frac{k}{4h}[ (U^0_{j+1})^2  - (U^0_{j-1})^2 ] \\
&= ( \frac{1}{2}- \frac{k}{2h} U^0_{j+1}) U^{0}_{j+1}   +
(\frac{1}{2}+ \frac{k}{2h} U^0_{j-1} ) U^{0}_{j-1} \\
&= \frac{1}{2}( U^0_{j+1}+ U^0_{j-1}) \leq  \|U^0\|_{\infty}
\end{align*}
by using the CFL condition. By the previous lemma and the
definition of $C_0$,  it follows that
$$
\|U^1\|_\infty = \|  A^{-1} A U^{1} \|_\infty
\leq  \|U^0\|_\infty \leq C_0
$$
Hence the condition $\frac{k C_1 }{h} \leq 1 $ also holds.
Now using the assumption $C_m \leq C_0$ and the condition
$\frac{k C_m }{h} \leq 1 $,
proceeding as above, we can show that
$$
C_{m+1}\leq C_0.
$$
Thus the lemma follows.
\end{proof}

\begin{lemma} \label{lem3.4}
If the CFL condition \eqref{eq:cfl}
holds  for $k,h$  with $ kN \leq T $ and
$\frac{k}{h}  $ a positive constant, then we have for every
$m$,  $1 \leq m \leq N$,
$$
|(Ae^{m+1})_j| \leq \| e^{m} \|_{\infty} +  k|(T_k)^m_j|.
$$
\end{lemma}

\begin{proof}
 From the previous lemma, it follows that the CFL condition
$$
\frac{k C_m}{h} \leq 1
$$
holds for all $m$, $1 \leq m \leq N$.
Consider
\begin{align*}
&(H (U^m))_j -  (H (z(x_j,t_m))_j \\
 &= \frac{1}{2} (e^{m}_{j+1} + e^{m}_{j-1}) -\frac{k}{4h}
\{ (U^m_{j+1})^2 - (z^m_{j+1})^2 -
((U^m_{j-1})^2 )-(z^m_{j-1})^2 )\}\\
&=  ( \frac{1}{2}- \frac{k}{2h} \theta^m_{j+1}) e^{m}_{j+1}
+ (\frac{1}{2}+ \frac{k}{2h} \theta^m_{j-1} ) e^{m}_{j-1}
\end{align*}
using mean value theorem, for some $\theta^m_{j+1}$ between
$U^{m}_{j+1}$ and $z( x_j, t_m) $. By the CFL condition, both
coefficients are positive and hence
\begin{align*}
| (H (U^m))_j -  (H (z(x_j,t_m)_j |
& \leq ( \frac{1}{2}- \frac{k}{2h} \theta^m_{j+1})
|e^{m}_{j+1}|   + (\frac{1}{2}+ \frac{k}{2h} \theta^m_{j-1} )
|e^{m}_{j-1}| \\
 & \leq   \frac{1}{2}( |e^{m}_{j+1}| + |e^{m}_{j-1}|) \\
 & \leq  \|e^m\|_{\infty}
\end{align*}
Thus we obtain
$$
|(Ae^{m+1})_j | \leq  \|e^{m}\|_{\infty}
 + k |(T_k)^m_j| .
$$
\end{proof}

Using all these estimates, now we can conclude the convergence
 of the scheme.

\begin{theorem} \label{thm3.5}
If the CFL condition \eqref{eq:cfl}
holds  for $k,h$ with $ kN \leq T $ and
$\frac{k}{h}  $ a positive constant,
the implicit Lax-Friedrichs scheme is convergent.
\end{theorem}

\begin{proof}
 From the previous lemmas, we have
\begin{align*}
 \|e^{m+1}\|_{\infty}
 &=  \max_j|( A^{-1} Ae^{m+1})_j | \leq \| A^{-1}\|
   \| Ae^{m+1}\|_{\infty} \\
 & \leq   \|e^{m}\|_{\infty} + k |(T_k)^m_j| .
\end{align*}
Defining $E^{m+1} = \|e^{m+1}\|_{\infty} $, we have the recurrence
relation
$$
E^{m+1} \leq E^{m} +  k \max_j |(T_k)^m_j|
$$
Solving this iteratively, we get
$$
E^{m+1} \leq E^{0} +  k  \sum_{i=0}^m \max_j |(T_k)^i_j|
$$
Recall that for a smooth solution $z$, the local truncation error,
$|T(x,t)| \leq Ck$ for some constant depending on the bounds of the
derivatives of $z$. Using $mk \leq T $ we obtain
$$
E^{m+1} \leq E^{0} +  k  C T.
$$
This shows that the error goes to zero as $k$ goes to zero with
$\frac{k}{h} $
fixed as a positive constant.
\end{proof}

\begin{remark} \label{rmk3.6} \rm
It is interesting to note that the viscous term treated implicitly
makes the Lax-Friedrichs scheme stable, while the explicit treatment
of the viscous term has been shown not to
converge.\cite{CastroP10,Tadmor84} Indeed, Lax-Friedrichs scheme (see
equation \ref{eq:lf}) can be rewritten in the following way:
\begin{equation} \label{eq:lf2}
\begin{aligned}
&U_j^{m+1} \\
&= \frac{U^m_{j+1} + U^m_{j-1}}{2} - \frac{\Delta t}{4 \Delta x}
((U^m_{j+1})^2 - (U^m_{j-1})^2 ) + \frac{\mu\, \Delta t}{(\Delta
  x)^2}(U^{m}_{j+1} - 2U^{m}_j + U^{m}_{j-1})
 \\
&= U_j^m + \frac{U^m_{j+1} -2 U^m_j + U^m_{j-1}}{2} - \frac{\Delta t}{4
  \Delta x} ((U^m_{j+1})^2 - (U^m_{j-1})^2 ) \\
  &\quad + \frac{\mu\, \Delta t}{(\Delta
  x)^2}(U^{m}_{j+1} - 2U^{m}_j + U^{m}_{j-1}).
\end{aligned}
\end{equation}
The viscosity of the numerical scheme is then
$(1/2 + \mu \Delta t / \Delta x^2)$. The CFL condition
\cite{CastroP10,Tadmor84} for stability of
conservative schemes requires that the numerical viscosity be no
greater than $1/2$, which is not satisfied as long as $\mu > 0$.
\end{remark}

\subsubsection{Centered difference scheme}

The next scheme for Burgers' equation is the implicit centered
scheme,
\begin{equation} \label{eq:centre}
\begin{aligned}
\mathcal{L}^{\rm C} U(x_j,t_m)
&=\frac{U_j^{m+1} - U_j^m}{\Delta t} + \frac{1}{4 \Delta x} (
(U^m_{j+1})^2 - (U^m_{j-1})^2 )  \\
&\quad - \frac{\mu}{(\Delta x)^2}
( U_{j+1}^{m+1} - 2U^{m+1}_{j} + U^{m+1}_{j-1})
\end{aligned}
\end{equation}
Let us first calculate the local truncation error
$T_j^{m} = T (x_j, t_m)$, in this case. As before assuming
the solution to be smooth and using Taylor's theorem, we have
for $k = \Delta t $ and $h =\Delta x $,
$$
\frac{z (x, t + k) - z (x, t)}{k}  = z _t  (x, t) +
\frac{1}{2} z_{tt} ( x , t ) k + \circ (k) ,
$$
Choosing $h$ and $k$ small as before, we get the local truncation
error estimate
\begin{equation} \label{error1}
| T (x_j,t_m)| \leq  \Big(\frac{1}{2}|z_{tt}|
+ \mu|z_{xxt}|\Big) k + \big(3|z_x z_{xx}|
+ |zz_{xxx}|+ \frac{\mu}{2} |z_{xxxx}|\big)
\frac{h^2}{6}
 + o(k  + h^2)
\end{equation}
Thus the local truncation error goes to zero as $k$ and $h$ go to zero
and the scheme is consistent (for any path in the $(h,k)$ plane,
unlike the implicit Lax-Friedrichs scheme).

In the earlier case, we could get a recurrence relation for the error
and from there an error bound. In this case also, we need to check
that the local errors do not amplify too much. Heuristically one can
check that the amplification is not too large, at least in the
linearized equation and then conclude for the nonlinear equation as
the solution is smooth.  We follow the approach as outlined in
\cite{GoR} for the linearized Burgers' equation with frozen
coefficients
$$
u_t + a u_x = \mu u_{xx}
$$
by taking Fourier transform and check Von Neumann's stability
condition.

Extending the functions to whole of the real line by $0$, we can
define
$$
U^m ( \xi) = \frac{1}{\sqrt{2 \pi} } \int_0^{\infty}
e^{-\iota x \xi} U^m (x,t) dx .
$$
Then by taking Fourier transform of the scheme with respect to $x$, we
get
$$
\big[ 1 + 4 \mu \frac{k}{h^2} \sin^2 (\frac{h \xi}{2} )
\big] U^{m+1} (\xi)
 = \big[ 1 - \frac{\iota k a}{ h} \sin (\frac{h \xi}{2} )
\cos(\frac{h \xi}{2}) \big]U^m (\xi)
$$
which gives the amplification factor
$$
H(\xi) = \frac{1 - \frac{k a}{ h} ( \iota \ s \sqrt{1-s^2}
  )}{1 +  \frac{4 \mu k} {h^2} s^2 }
$$
where $s = \sin (h \xi / 2)$. We have, $|H(\xi)| \leq 1 $ if,
for example,
$$
\big(\frac{k a}{ h}\big)^2  \leq 4 \mu \frac{k} {h^2}
$$
This gives a stability condition $ k \leq \frac{4 \mu}{a^2}. $ If this
holds, the scheme for the linearized equation converges as the global
error goes to zero when we refine $k$. In the nonlinear case,
instability may appear if the solution $z$ becomes unbounded
(``$a$'' plays the role of the norm of $u$).

\subsection{Adjoint Schemes}

We have already presented the ``continuous adjoint'' in
eq.~\eqref{eq:adjoint}. In this section we discuss the adjoint of the
Burgers' equation discretized using the implicit Lax-Friedrichs
scheme. We have chosen to use this scheme in our numerical
implementation, rather than discretizing the continuous adjoint
equation.

The discretized cost function is
$$
J(u) = \frac{1}{2} \sum^{N}_{m=0}
\sum^{n-1}_{j=1} | U^m_j - \hat{U}^m_j |^2
$$
where $ u = (u_j)^{n-1}_{j=1} \in \R^{n-1}$ and
$U^m = (U^m_j)^{n-1}_{j=1}$ is the solution of the numerical
scheme with $u$ as the initial condition and
$ \hat{U}= (\hat{U}^m_j)_j $ is the given
observation. More generally, let us take
$$
J(u) = \sum^N_{m=0} g (U^m)
$$
with $g$ a scalar function of
$U^m $.  Let us write the scheme as
\begin{gather*}
 A U^{m+1} =  H (U^m),  \quad 0 \leq m \leq N-1 \\
 U^0 = u
\end{gather*}
for a nonlinear operator $H$ from $\R^{n-1}$ to $\R^{n-1}$.
We define the augmented functional as
$$
\tilde{J}  = \sum^N_{m=0} g (U^m) + \sum^{N-1}_{m=0}
\langle p^{m}, (A U^{m+1} - H (U^m))\rangle
$$
Here $\langle \cdot , \cdot \rangle $ denotes the usual scalar
product in $\R^{n-1}$.
Taking first variations,
\begin{align*}
\delta \tilde{J}
&=  \sum^N_{m=0}\langle g' (U^m) ,\delta U^m \rangle+
\sum^{N-1}_{m=0} \langle p^{m}, A \delta U^{m+1})\rangle -
\langle  p^{m}, H'
(U^m) \delta U^m)\rangle \\
& = \sum^N_{m=0}\langle g ' (U^m), \delta U^m\rangle +
\sum^N_{m=1} \
\langle  A^{T} p^{m-1},  \ \delta U^m \rangle - \sum^{N-1}_{m=0}
\langle {H ' (U^m)}^{T}p^m, \delta U^m \rangle \\
&=  \sum^{N-1}_{m=1} \langle g' (U^m) + A^T  p^{m-1} - H'
(U^m)^T p^m,\delta U^m \rangle + \langle g' (U^0), \delta U^0 \rangle\\
&\quad - \langle p^{0}, H ' (U^0) \delta U^0 \rangle 
 + \langle g' (U^N), \delta U^N \rangle +\langle A^T p^{N-1},
\delta U^N\rangle = 0
\end{align*}
since at the optimal $u$, the first variation has to vanish.
This gives the adjoint scheme as
\begin{gather*}
A^T p^{m-1} - H' (U^m)^T p^m  =  - g'
(U^m),  \quad  1 \leq m \leq N-1 \\
g' (U^N) + A^T p^{N-1} =  0.
\end{gather*}
The gradient of $J$ is given by
$$
g' (U^0) = H'(U^0)^T  p_0 .
$$
Thus for the implicit Lax-Friedrichs scheme, the adjoint scheme is
\begin{align*}
&\frac{p_j^{m-1} - \frac{p^m_{j+1}  + p^m_{j-1}}{2}}{\Delta t } +
\frac{1}{2 \Delta x} U^m_{j}( p^m_{j-1}  - p^m_{j+1} )
 -   \frac{\mu}{(\Delta x)^2}(p^{m-1}_{j+1} - 2p^{m-1}_j +
p^{m-1}_{j-1}) \\
&= -(U^m_{j} - {\hat{U}}^m_{j} )
\end{align*}
for $j$, $ 1 \leq j \leq n-1$.
The gradient of $J$ is given by
$$
\nabla J (u) = [ \frac{1}{2} (B^1)^T - \frac{k}{2 h} (B^0)^T ] p^0
$$
where $B^1$ is the tridiagonal matrix with zero along the diagonal and
$1$ in off the diagonal while $B^0$ is the tridiagonal matrix with
zero on the diagonal and $U^0 = (U^0_j) $ above the diagonal and $-U^0
$ below the diagonal.

\section{Numerical Results}

In this section, we will first compare the numerical results for
convergence of the implicit Lax-Friedrichs and the centered difference
schemes, with \eqref{eq:trunc} and \eqref{error1},
respectively. Next we discuss the various numerical experiments with
the gradient descent method for finding a minimum of the cost
function, for various choices of regularization and for varying number
of observations.

\subsection{Implicit Lax-Friedrichs and centered difference schemes}

In order to compare the truncation error given by
\eqref{eq:trunc} and \eqref{error1} with numerical truncation
error, we will use the exact solutions of Burgers' equation obtained,
e.g., by Cole-Hopf transformation \cite{BentonP72}.

Fig. \ref{fig:lferr} shows the dependence on $\Delta x = h$, on $\mu$,
and on $k/h$ of the $L^2$-norm (in space variable $x$) of the local
truncation error $T(x,t)$ for fixed $t$. Using the variable $\lambda =
k/h$, we write \eqref{eq:trunc} as
\begin{equation}
  T(\lambda; h, \mu) = a(\mu)h\lambda + b(\mu)h\frac{1}{\lambda}
  + c(\mu)h^2
\label{eq:trunclh}
\end{equation}
where $a,b,c$ are functions of $\mu$ through the derivatives of the
solution as they appear in \eqref{eq:trunc}. The minimum of the
graph of the error $T$ vs. $\lambda$ occurs at
\begin{equation}
  \lambda_{\rm opt} = \sqrt{\frac{b(\mu)}{a(\mu)}},
\label{eq:lambdaopt}
\end{equation}
which is independent of $h$ but depends on $\mu$. We also see that the
minima depends on the solution under consideration and the time
$t$. It is seen in Fig.~\ref{fig:lferr}(d) that the ``optimal''
$\lambda$ decreases with $\mu$, approximately as $1/\mu$. Numerically,
we observed that for fixed $\mu$ and $h$, the $\lambda_{\rm opt}$
for solution with an initial condition with fewer zeros is larger than
$\lambda_{\rm opt}$ for the solution with more zeros.  Furthermore,
at fixed $\lambda$ and $\mu$, the error increases with $h$. All these
conclusions are clearly seen in Fig.~\ref{fig:lferr}. Based on this
discussion, the choice of $(h,k)$ is made as follows. We first choose
the smallest possible $h$, which is mainly limited by the memory
available for computation. With this $h$, the $k$ is chosen to be
greater than the largest $\lambda_{\rm opt}$, but smaller than
$C_0/h$ in order to satisfy the CFL condition \eqref{eq:cfl}.

\begin{figure}[t!] 
\begin{center}
 \includegraphics[width=0.47\textwidth,height=0.35\textwidth]{fig1a} % {lferr_dtdx_diffdx}
 \includegraphics[width=0.47\textwidth,height=0.35\textwidth]{fig1b} % {lferr_dtdx_diffmu}
\\
(a)\hspace{0.47\textwidth} (b) \\
 \includegraphics[width=0.47\textwidth,height=0.35\textwidth]{fig1c} %{lferr_dx_diffmu}
 \includegraphics[width=0.47\textwidth,height=0.35\textwidth]{fig1d}%{lferr_optdtdx_mu}
\\
(c) \hspace{0.47\textwidth} (d) %\label{fig:lferr-d}}
\end{center}
\caption{The $L^2$ norm, in ``space'' index $j$, of the truncation
      error $T_j^m$ Eq.~\eqref{eq:trunc} for the implicit
      Lax-Friedrichs scheme.
%
      (a) As a function of $\lambda = k/h$ for fixed $\mu = 0.5$. The
      thick lines are for one initial condition for various $h$, while
      thin ones for another initial condition for various $h$.
%
      (b) As a function of $\lambda$ for fixed $h =
      10^{-3}$. Different lines are for different $\mu$.
%
      (c) As a function of $h$ for fixed $\lambda = 1.2$. Different
      lines are for different $\mu$.
%
      (d) The optimal $\lambda$ as a function of $\mu$ -- it is seen
      to decrease approximately as $1/\mu$.
%
      The numerical results are in complete agreement with
      \eqref{eq:trunclh}-\eqref{eq:lambdaopt}.}
\label{fig:lferr}
 \end{figure}

The behaviour of the truncation error for the centered implicit scheme
is much simpler. As seen in Fig.~\ref{fig:cerr}, the smaller the
$(h,k)$, the smaller is the truncation error. Thus, in this case, we
must choose the smallest possible values of $(h,k)$, limited only by
the memory and computation time.

\begin{figure}[t!] 
\begin{center}
 \includegraphics[width=0.30\textwidth,height=0.25\textwidth]{fig2a} % cerr_dtdx_diffmu
 \includegraphics[width=0.30\textwidth,height=0.25\textwidth]{fig2b} % cerr_dtdx_diffdx
 \includegraphics[width=0.30\textwidth,height=0.25\textwidth]{fig2c} % cerr_dx_diffmu
\\
(a)\hspace{0.30\textwidth}(b)\hspace{0.30\textwidth}(c)
\end{center}
\caption{The $L^2$ norm, in ``space'' index $j$, of the truncation
      error $T_j^m$ Eq.~\eqref{error1} for the implicit centered
      scheme.
%
      (a) As a function of $\lambda$ for fixed $h = 10^{-3}$. The
      different lines are for different $\mu$.
%
      (b) As a function of $\lambda$ for fixed $\mu = 0.5$. Different
      lines are for different $h$.
%
      (c) As a function of $h$ for fixed $\lambda = 0.1$. Different
      lines are for different $\mu$.
%
      We see that in this case, the error increases with $\lambda$ and
      $h$, in agreement with Eq.~\eqref{error1}. }
    \label{fig:cerr}
\end{figure}

\subsection{Gradient descent algorithm}

Now we will discuss the minima of the cost function $J$ found using
the gradient descent method. We perform the ``identical twin
experiments'' as follows. We choose an initial condition
$u^{\rm true}$. We solve the Burgers' equation numerically and
generate the data $Z^d$. This is then used to evaluate the cost
function $J$. We would like to understand the relation between the
minimum $u^{\rm min}$ found using numerical method and this
``true'' initial condition $u^{\rm true}$.

\subsubsection{Non-regularized cost function with $\alpha = 0$}

In the case of discrete observations, we first look at the behaviour
of the gradient descent as the number of observations is increased,
when $\alpha = 0$ in the cost function, i.e., without the
regularization term. We see from fig.~\ref{fig:jvsn1}(a) that the rate of
decrease of $J$ with each descent step strongly depends on the number
of observations.

We also see, from fig.~\ref{fig:jvsn1}(b), that irrespective of the
number of observations, the minimum is different from the true initial
condition. But in the case when $\alpha = 0$, one of the minima of the
cost function is certainly $u^{\rm true}$ since $J \ge 0$ and
$J(u^{\rm true}) = 0$. (Note that this is only true when the
observations $Z^d$ are without any noise, which is the case in our
numerical experiments. The case of noisy observations is of great
interest but will be discussed elsewhere).

This seems to indicate that this cost function for $\alpha = 0$ could
have multiple local minima. Indeed, starting with an initial guess
which is close to $u^{\rm true}$, we find that the gradient descent
converges to a minimum which is very close, within numerical accuracy,
to $u^{\rm true}$.

\begin{figure}[t!] 
\begin{center}
\includegraphics[width=0.47\textwidth,height=0.25\textwidth]{fig3a} % JvsN_ic4
 \includegraphics[width=0.47\textwidth,height=0.25\textwidth]{fig3b} % umin_utrue_ic4
\\  (a)\hspace{0.47\textwidth}(b)
 \end{center}
    \caption{The behaviour of the steepest descent for $\alpha = 0$.
%
      (a) The cost function $J$ as a function of the gradient descent
      step, for different number of observations. The more the
      observations, the faster the gradient descent converges.
%
      (b) The $u^{\rm min}$ and the $u^{\rm true}$. We see that
      $u^{\rm min}$ is almost independent of the number of
      observation (all four lines are almost identical), but is
      different from $u^{\rm true}$.
%
}  \label{fig:jvsn1} % \label{fig:umin1}
 \end{figure}

\subsubsection{Regularized cost function with $\alpha \neq 1$}

First we discuss the minima of $J$ with $\alpha \neq 0$ and with $u_b
= u^{\rm true}$. In this case, we could numerically find only a
single minimum which is close, within numerical accuracy, to
$u^{\rm true}$. This is because setting $u_b = u^{\rm true}$
corresponds to ``perfect observations'' of the initial condition and
thus the regularization term of the cost function dominates. Clearly,
the unique minimum of $\| u - u^{\rm true} \|^2$ is
$u^{\rm true}$. We also see that the gradient descent converges
very fast [within a $O(10)$ iterations].

The most interesting case is when $\alpha \neq 0$ and $u_b \neq
u^{\rm true}$. This is of obvious interest, since in practice, the
\emph{a priori} guess $u_b$ would certainly not be the ``true'' state
$u^{\rm true}$. The behaviour of the gradient descent in this case
is shown in fig.~\ref{fig:jvsn2}(a). It is clear that the larger the
$\alpha$, the faster the convergence. We also see that the presence of
the regularization leads to much faster convergence.  We have also
seen (but not shown in the figure) that the convergence does not
depend so strongly on the number of observations as it does in the
case $\alpha = 0$. We also note that the minimum in this case is
certainly $J_{\rm min} \neq 0$, since both the ``background term''
and the ``observational term'' cannot be zero simultaneously.

Fig.~\ref{fig:jvsn2}(b) shows the minima obtained with varying
$\alpha$. We see that irrespective of the initial guess, the gradient
descent converges to a single minima. (This is true for various other
guesses, but for clarity, only two minima for each value of $\alpha$
are shown in the figure.) Thus it seems that even for observations
which are discrete in time and space, the cost function has a unique
minima.

The figure also shows $u^{\rm true}$ and $u^b$ for comparison. We
clearly see that as $\alpha$ increases, the $u^{\rm min}$ comes
closer to $u^b$, as expected. The minimum is approximately a linear combination between
$u^{\rm true}$ and $u^b$.

\begin{figure}[t!] 
\begin{center}
 \includegraphics[width=0.47\textwidth,height=0.25\textwidth]{fig4a} % jvsn_alpha
 \includegraphics[width=0.47\textwidth,height=0.25\textwidth]{fig4b} %  all_umin1
 \\ (a)\hspace{0.47\textwidth}(b)
\end{center}
    \caption{The behaviour of the steepest descent for $\alpha \neq 0$.
%
      (a) The cost function $J$ as a function of the gradient descent
      step, for different values of $\alpha$. The larger the value of
      $\alpha$, the faster the gradient descent converges.
%
      (b) The $u^{\rm min}$ along with $u^{\rm true}$ and
      $u^b$. We see that $u^{\rm min}$ is like an ``interpolation''
      between $u^{\rm true}$ and $u^b$, and is independent of the
      initial guess of the gradient descent.
} \label{fig:jvsn2}  %  \label{fig:umin2}
\end{figure} 

\section{Conclusion}

In summary, we have discussed the data assimilation problem in the
optimal control setting, specifically for the Burgers' equation. This
leads us to the numerical study of the discretization of Burgers'
equation and the gradient descent algorithm for minimization of an
appropriate cost function.

We first prove the convergence properties of the implicit
Lax-Friedrichs discretization scheme under the CFL condition. We
present numerical results that support the estimates for the
truncation error and which clearly show that the implicit
Lax-Friedrichs scheme allows much larger time steps than the centered
difference scheme.

Next, we study the convergence of the gradient descent algorithm and
its dependence on the various parameters of the problems, namely, the
number of observations, the relative weight in the cost function of
the regularization and the data, and the \emph{a priori} approximation
$u^b$ of the initial condition. We have presented numerical
indications that the cost function without regularization has multiple
minima, while the regularized cost function has unique minimum. The
rate of convergence depends strongly on the number of observations in
the former case, but not the latter case. The minimum obtained is seen
to be a combination of the \emph{a priori} background and the ``true''
state of the system as given by the observations. The interesting case
of noisy observations, as well as probabilistic formulation of this
model will be reported in the future.

\subsection*{Acknowledgements}

The authors would like to thank Sumanth Bharadwaj for his help with
the computations. AA and MR would like to thank Praveen Chandrasekhar
and Enrique Zuazua for illuminating discussions. DA and MR would like
to thank the Indo-French Center for the Promotion of Advanced
Research/ Centre Franco-Indien Pour la Promotion de la Recherche
Avancee, for the financial support for this work through the project
IFCPAR 3701-1.

\begin{thebibliography}{10}

\bibitem{ApteH07}
A.~Apte, M.~Hairer, A.M. Stuart, and J.~Voss, \emph{Sampling the posterior: An
  approach to non-gaussian data assimilation}, Physica D \textbf{230} (2007),
  50--64.

\bibitem{ApteJ08}
A.~Apte, C.~K.~R.~T. Jones, A.~M. Stuart, and J.~Voss, \emph{Ensemble data
  assimilation}, Int.~J.~Numerical Methods in Fluids \textbf{56} (2008),
  1033--1046.

\bibitem{Auroux09}
D.~Auroux, \emph{The back and forth nudging algorithm applied to a shallow
  water model, comparison and hybridization with the {4D-VAR}}, Int. J. Numer.
  Methods Fluids \textbf{61} (2009), 911--929.

\bibitem{AurouxBlum08}
D.~Auroux and J.~Blum, \emph{A nudging-based data assimilation method for
  oceanographic problems: the {B}ack and {F}orth {N}udging ({BFN}) algorithm},
  Nonlin. Proc. Geophys. \textbf{15} (2008), 305--319.

\bibitem{Ben01}
Andrew~F. Bennett, \emph{Inverse modeling of the ocean and atmosphere},
  Cambridge {U}niversity {P}ress, 2002.

\bibitem{BentonP72}
E.~R. Benton and G.~W. Platzman, \emph{A table of solutions of the
  one-dimensional {B}urgers' equation}, Quart. Appl. Math. \textbf{30} (1972),
  195--212.

\bibitem{CastroP08}
Carlos Castro, Francisco Palacios, and Enrique Zuazua, \emph{An alternating
  descent method for the optimal control of inviscid {B}urgers' equation in the
  presence of shocks}, Math.~Models and Methods in Appl.Sci. \textbf{18}
  (2008), 369--416.

\bibitem{CastroP10}
Carlos Castro, Francisco Palacios, and Enrique Zuazua,
 \emph{Optimal control and vanishing viscosity for the {B}urgers'
  equation}, Integral Methods in Science and Engineering (C.~Constanda and
  M.~E. P\'erez, eds.), vol.~2, Birkhauser, Boston, 2010, pp.~65--90.

\bibitem{ChungL97}
Wanglung Chung, John~M. Lewis, S.~Lakshmivarahan, and S.~K. Dhall, \emph{Effect
  of data distribution in data assimilation using {B}urgers' equation},
  Proceedings of the 1997 {ACM} symposium on Applied computing, 1997,
  pp.~521--526.

\bibitem{CotterD10a}
S.L. Cotter, M.~Dashti, J.C. Robinson, and A.M. Stuart, \emph{Bayesian inverse
  problems for functions and applications to fluid mechanics}, Inverse Problems
  \textbf{25} (2009), 115008.

\bibitem{CotterD10b}
S.L. Cotter, M.~Dashti, and A.M. Stuart, \emph{Approximation of bayesian
  inverse problems for pdes}, SIAM J. Numer. Anal \textbf{48} (2010), 322--345.

\bibitem{Evensen03}
G.~Evensen, \emph{The ensemble {K}alman filter: Theoretical formulation and
  practical implementation}, Ocean Dynamics \textbf{53} (2003), 343--367.

\bibitem{Evensen07}
Geir Evensen, \emph{Data assimilation: the ensemble {K}alman filter}, Springer,
  2007.

\bibitem{GoR}
Edwige Godlewski and Pierre {A}rnaud Raviart, \emph{Hyperbolic systems of
  conservation laws}, Mathematics and {A}pplications, no. 3/4, Ellipses, 1991.

\bibitem{webs}
See e.g.
  \url{http://france.meteofrance.com/france/observations?62059.path=animations%
atellite} or \url{http://www.mausam.gov.in/WEBIMD/satellitemainpage.jsp} as
  instances of the vast amount of weather data available.

\bibitem{Kalnay03}
Eugenia Kalnay, \emph{Atmospheric modeling, data assimilation, and
  predictability}, Cambridge {U}niversity {P}ress, 2003.

\bibitem{Kirsch96}
A.~Kirsch, \emph{An introduction to the mathematical theory of inverse
  problems}, Springer-Verlag, New York, 1996.

\bibitem{LeDimetTalagrand86}
F.-X. Le~Dimet and O.~Talagrand, \emph{Variational algorithms for analysis and
  assimilation of meteorological observations: theoretical aspects}, Tellus
  \textbf{38A} (1986), 97--110.

\bibitem{LDD94}
J.-M. Lellouche, J.-L. Devenon, and I.~Dekeyser, \emph{Boundary control of
  {B}urgers' equation -- a numerical approach}, Comput.~Math.~Applic.
  \textbf{28} (1994), 33--44.

\bibitem{LDD98}
J.-M. Lellouche, J.-L. Devenon, and I.~Dekeyser, 
\emph{Data assimilation by optimal control method in a {3D} costal
  oceanic model: the problem of discretization}, J.~Atmos.~Ocean.~Tech.
  \textbf{15} (1998), 470--481.

\bibitem{RLV}
Randall~J. Le{V}eque, \emph{Numerical methods for conservation laws}, second
  ed., Lectures in Mathematics, Birkhauser Verlag, ETH Zurich, 1992.

\bibitem{LewisL06}
J.~M. Lewis, S.~Lakshmivarahan, and S.~K. Dhall, \emph{Dynamic data
  assimilation: a least squares approach}, Encyclopedia of Mathematics and its
  Applications, no. 104, Cambridge University Press, 2006.

\bibitem{lions72}
J.L. Lions, \emph{Control of systems governed by partial differential
  equations}, Springer-Verlag, New York, 1972.

\bibitem{LundvallK06}
Johan Lundvall, Vladimir Kozlov, and Per Weinerfelt, \emph{Iterative methods
  for data assimilation for {B}urgers's equation}, Journal of Inverse and
  Ill-Posed Problems \textbf{14} (2006), 505--535.

\bibitem{NoackW07}
Antje Noack and Andrea Walther, \emph{Adjoint concepts for the optimal control
  of {B}urgers' equation}, Comput. Optim. Applic. \textbf{36} (2007), 109--133.

\bibitem{Tadmor84}
E.~Tadmor, \emph{Numerical viscosity and the entropy condition for conservative
  difference schemes}, Math.~Comp \textbf{43} (1984), 369--381.

\bibitem{Wh08}
Luther~W. White, \emph{A study of uniqueness for the initialization problem for
  {B}urgers' equation}, J.~Math.~Anal.~Appl. \textbf{172} (1993), 412.

\end{thebibliography}


\end{document}