\appendix
\chapter{Formulas for Matrices and Gaussians}
\label{cha:MatrixFormulas}
Here I review some material necessary for deriving
Eqns.~\eqref{eq:KUpdate}--\eqref{eq:smoothing}. Similar material
appears in Appendix A of Kailath et al.~\cite{KSH00}.
\subsubsection{Block Matrix Inverse}
\index{block matrix inverse}%
\index{matrix inverse|see{block matrix inverse}}%
If $G$ is an $n\times n$ invertible matrix, $K$ is an $m\times m$
invertible matrix, and $H$ and $J$ are $n\times m$ and $m\times n$
respectively, then direct matrix multiplication verifies that
\begin{subequations}
\label{eq:MatrixInverse}
\begin{align}
\begin{bmatrix}
(G-HK^{-1}J)^{-1} & -(G-HK^{-1}J)^{-1}HK^{-1} \\
-(K-JG^{-1}H)^{-1}JG^{-1} & (K-JG^{-1}H)^{-1}
\end{bmatrix}
\begin{bmatrix}
G & H \\ J & K
\end{bmatrix}
&=
\begin{bmatrix}
\id & 0 \\ 0 & \id
\end{bmatrix}\\ \nonumber \\
\begin{bmatrix}
G & H \\ J & K
\end{bmatrix}
\begin{bmatrix}
(G-HK^{-1}J)^{-1} & -G^{-1}H(K-JG^{-1}H)^{-1} \\
-K^{-1}J(G-HK^{-1}J)^{-1} & (K-JG^{-1}H)^{-1}
\end{bmatrix}
&=
\begin{bmatrix}
\id & 0 \\ 0 & \id
\end{bmatrix},
\end{align}
\end{subequations}
assuming that $(G-HK^{-1}J)^{-1}$ and $(K-JG^{-1}H)^{-1}$ exist.
One can derive other expressions for the inverse by using the Sherman
Morrison Woodbury formula (Eqn.~\eqref{eq:SMW}) to expand terms in
Eqn.~\eqref{eq:MatrixInverse}.
By noting
\begin{multline*}
\begin{bmatrix}
(G-HK^{-1}J)^{-1} & -G^{-1}H(K-JG^{-1}H)^{-1} \\
0 & (K-JG^{-1}H)^{-1}
\end{bmatrix}
\begin{bmatrix}
G & H \\ J & K
\end{bmatrix}\\
=
\begin{bmatrix}
\id & 0 \\
(K-JG^{-1}H)^{-1}J & (K-JG^{-1}H)^{-1} K
\end{bmatrix}
\end{multline*}
and taking the determinant of both sides
\renewcommand{\det}[1]{\left| #1 \right|}
\begin{equation*}
\det{(K-JG^{-1}H)^{-1}} \det{K} = \det{(G-HK^{-1}J)^{-1}}
\det{(K-JG^{-1}H)^{-1}} \det{
\begin{bmatrix}
G & H \\ J & K
\end{bmatrix}}
\end{equation*}
one finds the following formula for determinants
\index{block matrix determinant}
\index{determinant|see{block matrix determinant}}
\begin{equation}
\label{eq:BlockDet}
\det{ \begin{bmatrix} G & H \\ J & K \end{bmatrix}} = \det{K}
\det{(G-HK^{-1}J)}
\end{equation}
\subsubsection{Sherman Morrison Woodbury Formula}
\index{Sherman Morrison Woodbury formula}
If $G$ and $J$ are invertible matrices, $H$ and $K$ have dimensions so
that $\left( G + HJK \right)$ makes sense, and $\left(KG^{-1}H +
J^{-1} \right)^{-1}$ exists, then
\begin{equation}
\label{eq:SMW}
\left( G + HJK \right)^{-1} = G^{-1} - G^{-1}H\left(KG^{-1}H +
J^{-1}\right)^{-1}KG^{-1}.
\end{equation}
Multiplying both sides by $\left( G + HJK \right)$ verifies the
formula. The special case
\begin{equation}
\label{eq:MatrixInversion}
\left(L^{-1} + H\transpose J^{-1}H \right)^{-1} = L - LH\transpose
\left( HLH\transpose + J \right)^{-1}HL
\end{equation}
is efficient when the dimension, $m$, of $J$ is smaller than the
dimension $n$ of $L$ because an $n\times n$ matrix inversion on the
left is replaced by an $m \times m$ matrix inversion on the right.
\subsubsection{Marginal and Conditional Distributions of a Gaussian}
\index{marginal distribution, Gaussian}
\index{conditional distribution, Gaussian}
Suppose that $W = \begin{bmatrix} U \\ V \end{bmatrix}$ is a Gaussian
random variable with an $n$ dimensional component $U$ and an $m$
dimensional component $V$. I write its distribution
%
\nomenclature[rNormal]{$\Normal \left( \mu,\Sigma \right)$}{A
\emph{normal} or Gaussian distribution function; $\mu$ is an $n$
dimensional vector and $\Sigma$ is an $n\times n$ matrix. Writing
$X\sim \Normal \left( \mu,\Sigma \right)$ means $X$ is distributed
normally with mean $\mu$ and covariance $\Sigma$ and the
probability density at any particular vector $x$ is
$\NormalE{\mu}{\Sigma}{x}$.
} % end \nomenclature
%
\nomenclature[rNormalE]{$\NormalE{\mu}{\Sigma}{x}$}{The value of a
Gaussian probability density function evaluated at the $n$
dimensional vector $x$, \ie
\begin{equation*}
\NormalE{\mu}{\Sigma}{x} \equiv \frac{1}
{\sqrt{(2\pi)^n\left|\Sigma\right|}} e^{ -\frac{1}{2}
(x-\mu)\transpose \Sigma^{-1} (x-\mu)}
\end{equation*}
where $\left|\Sigma\right|$ is the determinant of the covariance
matrix $\Sigma$.
} % end \nomenclature
\begin{equation*}
W \sim \Normal \left( \mu_W,\Sigma_W \right) \text{ or equivalently
} P(w) = \NormalE{\mu_W}{\Sigma_W}{w}
\end{equation*}
with
\begin{equation*}
\mu_W = \begin{bmatrix} \mu_U\\\mu_V \end{bmatrix} \text{ and }
\Sigma_W = \begin{bmatrix} \Sigma_{UU} & \Sigma_{UV}\\ \Sigma_{VU}
& \Sigma_{VV} \end{bmatrix} \equiv \begin{bmatrix} A & C \\
C\transpose & B \end{bmatrix},
\end{equation*}
where I have introduced $A \equiv \Sigma_{UU}$, $B \equiv
\Sigma_{VV}$, and $C \equiv \Sigma_{UV}$ to shorten the notation.
If I denote
\begin{equation*}
\Sigma_W^{-1} = \begin{bmatrix} D & F \\ F\transpose & E
\end{bmatrix},
\end{equation*}
then from Eqns.~\eqref{eq:MatrixInverse} and~\eqref{eq:SMW}
\begin{subequations}
\label{eq:MI:ABCDEF}
\begin{align}
\label{eq:MI:D}
D &= (A - CB^{-1}C\transpose)^{-1} & &= A^{-1} + A^{-1}C E
C\transpose A^{-1}\\
\label{eq:MI:E}
E &= (B - C\transpose A^{-1}C)^{-1} & &= B^{-1} + B^{-1}C\transpose
D C B^{-1} \\
\label{eq:MI:F}
F &= -A^{-1}C E & &= -DCB^{-1}.
\end{align}
\end{subequations}
In this notation, the marginal distributions are
\begin{subequations}
\label{eq:Gauss-Marginal}
\begin{align}
P\left(u \right) &= \int P\left(u,v \right) dv \\
&= \NormalE{\mu_U}{A}{u} \\
P\left(v \right) &= \int P\left(u,v \right) du \\
&= \NormalE{\mu_V}{B}{v},
\end{align}
\end{subequations}
and the conditional distributions are
\begin{subequations}
\label{eq:Gauss-Conditional}
\begin{align}
P\left(u |v \right) &= \frac{ P\left(u,v \right)}{P\left(v \right)} \\
&= \NormalE{\mu_U + CB^{-1}(v-\mu_V)}{D^{-1}}{u} \\
P\left(v |u \right) &= \frac{ P\left(v,u \right)}{P\left(u \right)} \\
&= \NormalE{\mu_V + C\transpose A^{-1} (u - \mu_U)}{E^{-1}}{v}
\end{align}
\end{subequations}
Notice that the covariance of the \emph{marginal} distribution of $U$
is given by the $UU$ block of $\Sigma_W$, but that the inverse
covariance of the \emph{conditional} distribution of $U$ is given by
the $UU$ block of $\Sigma_W^{-1}$.
As a check of these formulas, I examine $P\left(u |v \right) P\left(v
\right)$ and find
\begin{align*}
P\left(u |v \right) P\left(v \right) &= \frac{\sqrt{\left| D
\right|}} {\sqrt{(2\pi)^n}} e^{ -\frac{1}{2} \left(u - \mu_U -
CB^{-1}(v - \mu_V) \right)\transpose D \left(u - \mu_U -
CB^{-1}(v - \mu_V) \right)
} \\
&\quad \times \frac{1}{\sqrt{(2\pi)^m \left| B \right|}} e^{ -\frac{1}{2}
(v-\mu_V)\transpose B^{-1} (v-\mu_V)}\\
%
&= \frac{1} {\sqrt{(2\pi)^{n+m} \left| \Sigma_W
\right| }} \exp \bigg( -\frac{1}{2} \Big[ \\
&\quad \left( u - \mu_U - CB^{-1}(v - \mu_V) \right)\transpose D
\left(u - \mu_U - CB^{-1}(v - \mu_V) \right) \\
&\quad + (v-\mu_V)\transpose B^{-1} (v-\mu_V) \Big]\bigg)\\
%
&= \frac{1} {\sqrt{(2\pi)^{n+m} \left| \Sigma_W \right| }}\\
&\quad\times e^{ -\frac{1}{2} \left((u-\mu_U)\transpose D (u-\mu_U) +
2 (v-\mu_V)\transpose F\transpose (u-\mu_U) +
(v-\mu_V)\transpose E (v-\mu_V)\right)}\\
&= P \left(u,v \right)
\end{align*}
so all is right with the world. In the above, Eqn.~\eqref{eq:BlockDet}
implies that $\frac{\sqrt{\det{D}}}{\sqrt{\det{B}}} =
\frac{1}{\sqrt{\det{\Sigma_W}}}$.
\subsubsection{Completing the Square}
\index{completing the square}
Some of the derivations in section~\ref{sec:KDerive} rely on a
procedure called \emph{completing the square}, which I illustrate
with the following example. Suppose that the function $f(u)$ is the
product of two $n$ dimensional Gaussians,
$\Normal\left(\mu_1,\Sigma_1\right)$ and
$\Normal\left(\mu_2,\Sigma_2\right)$, \ie
\begin{align}
\label{eq:Qu.a}
f(u) &= \frac{1} {\sqrt{(2\pi)^n \left| \Sigma_1 \right| }}
e^{-\frac{1}{2} ( u - \mu_1)\transpose \Sigma_1^{-1} ( u - \mu_1)}
\frac{1} {\sqrt{(2\pi)^n \left| \Sigma_2 \right| }}
e^{-\frac{1}{2} ( u - \mu_2)\transpose \Sigma_2^{-1} ( u - \mu_2)}\\
\label{eq:Qu.b}
&= \frac{1} {\sqrt{(2\pi)^{2n} \left| \Sigma_1 \right| \left|
\Sigma_2 \right| }} e^{-\frac{1}{2}\big[ ( u -
\mu_1)\transpose \Sigma_1^{-1} ( u - \mu_1) + ( u -
\mu_2)\transpose \Sigma_2^{-1} ( u - \mu_2)\big]}\\
\label{eq:Qu.c}
&\equiv \frac{1} {\sqrt{(2\pi)^{2n} \left| \Sigma_1 \right| \left|
\Sigma_2 \right| }} e^{-\frac{1}{2}\big[Q(u)\big]}.
\end{align}
By expanding the function $Q(u)$ in the exponent, I find:
\begin{align}
\label{eq:Qu.d}
Q(u) &= u\transpose \left( \Sigma_1^{-1} + \Sigma_2^{-1} \right) u -
2 u\transpose \left( \Sigma_1^{-1} \mu_1 + \Sigma_2^{-1} \mu_2
\right) + \mu_1\transpose \Sigma_1^{-1} \mu_1 + \mu_2\transpose
\Sigma_2^{-1} \mu_2 \\
\label{eq:Qu.e}
&= u\transpose q u - 2 u\transpose l + s
\end{align}
where the quadratic, linear, and scalar terms are
\begin{align*}
q &= \left( \Sigma_1^{-1} + \Sigma_2^{-1} \right) \\
l &= \left( \Sigma_1^{-1} \mu_1 + \Sigma_2^{-1} \mu_2 \right) \\
s &= \mu_1\transpose \Sigma_1^{-1} \mu_1 + \mu_2\transpose
\Sigma_2^{-1} \mu_2
\end{align*}
respectively.
\emph{Completing the square} means finding values $\mu$, $\Sigma$, and
$R$ for which Eqn.~\eqref{eq:Qu.e} takes the form
\begin{equation}
\label{eq:Qu.f}
Q(u) = (u - \mu)\transpose \Sigma^{-1} (u - \mu) + R,
\end{equation}
where $R$ is not a function of $u$. One can verify by substitution
that the solution is
\begin{align*}
\Sigma^{-1} &= q\\
\mu &= \Sigma l\\
R &= s - \mu\transpose \Sigma^{-1} \mu.
\end{align*}
For the product of Gaussians example \eqref{eq:Qu.a},
\begin{align*}
\Sigma^{-1} &= \Sigma_1^{-1} + \Sigma_2^{-1} \\
\mu &= \Sigma \left( \Sigma_1^{-1} \mu_1 + \Sigma_2^{-1} \mu_2
\right) \\
&= \left( \Sigma_1^{-1} + \Sigma_2^{-1} \right)^{-1} \left(
\Sigma_1^{-1} \mu_1 + \Sigma_2^{-1} \mu_2 \right) \\
R &= \mu_1\transpose \Sigma_1^{-1} \mu_1 + \mu_2\transpose
\Sigma_2^{-1} \mu_2 - \mu\transpose \Sigma^{-1} \mu \\
&= \mu_1\transpose \Sigma_1^{-1} \mu_1 + \mu_2\transpose
\Sigma_2^{-1} \mu_2 - \left( \Sigma_1^{-1} \mu_1 + \Sigma_2^{-1}
\mu_2 \right) \transpose \left( \Sigma_1^{-1} + \Sigma_2^{-1}
\right)^{-1} \left( \Sigma_1^{-1} \mu_1 + \Sigma_2^{-1} \mu_2
\right).
\end{align*}
In words the product of two Gaussian density functions is an
unnormalized Gaussian density function in which the inverse covariance
is the sum of the inverse covariances of the factors and the mean is
the average of the factor means weighted by the inverse covariances.
\chapter{Notes on Software}
\label{cha:Software}
A software package that \emph{makes} this book is available at
www.siam.org.fraser.xxxx. The package consists of LaTeX source files,
scripts, code and data required for making the figures and ultimately
the book. On a gnu/linux system, after unpacking the files and
installing the prerequisites, typing ``make'' in the top level
directory will create a copy of the book after some hours of
computation.
I will reorganize the software before the book is distributed to make
the HMM tools a separate package that has few prerequisites. The
remainder of the software, the \emph{book package}, will require most
of the dependencies listed in the next section to run.
\section*{Dependencies}
\label{sec:SWdep}
The original code relied on many free software packages. Although I
developed the code on debian gnu/linux systems, it will run on any of
the many other platforms for which the dependencies are available. In
addition to the standard gnu development tools, the \emph{book
package} will require the following and their dependencies:
\begin{description}
\item[gnuplot] Old reliable plotting package driven with either a
command line interface or a script
\item[libgsl0-dev] GNU Scientific Library
\item[python] An interpreted, interactive, object-oriented,
extensible programming language
\item[python-dev] Header files necessary for compiling code wrapped
by swig
\item[python-matplotlib] A matlab-like GUI driven plotting package.
I use matplotlib to plot a spectrogram.
\item[python-scipy] Scientific methods for python. Scipy was
developed as I wrote the book. Hoping for both stability and
access to a rich set of tools, I switched between using the
scipy, numeric, and numarray packages too often. Now, scipy is
reasonably stable and complete.
\item[python-tk] Python interface to the Tk GUI toolkit
\item[rubber] A tool to automate latex compilations. Rubber figures
out how many latex passes are needed and also runs
metapost/bibtex/dvips/ etc.~as necessary. Although I recommend
rubber, the code will run without it.
\item[swig] The \emph{Simplified Wrapper and Interface Generator}
wraps code written in C or C++ for any of a number of higher level
programming languages. I used it to make fast C implementations
of the basic HMM algorithms callable from python.
\item[tetex or texlive] TeX, LaTeX and much supporting software
\item[transfig] For translating xfig drawings to eps
\end{description}
\section*{Data}
\label{sec:SWdata}
I used the following sets of data for examples. The book package will
include extracts that are sufficient to build the book.
\begin{description}
\item[Tang's laser data] Carl Otto Weiss \index{Weiss, C.\ O.} mailed
me a CD full of data from various experiments that he and Tang did
in the 1980s and 1990s. Although I analyzed many of the files, I
finally used only a file called \emph{LP5.DAT} in the book (see
Section~\ref{sec:laser}).
\item[H.~L.~Mencken's \emph{A Book of Prefaces}] I used Mencken's book
for the parts of speech example in Section~\ref{sec:POSpeech}.
Although you can get Mencken's book from \emph{Project
Gutenberg}~\cite{gutenberg}, my book package will include a parsed
version of the text.
\item[CINC2000 ECG data] I used Dr.~Thomas Penzel's ECG measurements
throughout Chapter~\ref{chap:apnea}. You can get the raw data from\\
www.physionet.org/physiobank/database/apnea-ecg. In the book
package, I will include the much smaller files that contain
estimates of the timing of heart beats.
\end{description}
\section*{Clarity and Efficiency}
\label{sec:Clarity}
Perhaps the clearest way to describe and code the algorithms of
Chapter~\ref{chap:algorithms} is in terms of vector and matrix
operations. For example, see Kevin Murphy's \emph{Hidden Markov Model
(HMM) Toolbox} at www.cs.ubc.ca/\textasciitilde murphyk/Bayes/hmm.html which uses
MATLAB\texttrademark. In preliminary work on the code for this book,
Ralf Juengling \index{Juengling, Ralf} used the python \emph{scipy}
package to similar effect. Here is his implementation of the entire
forward algorithm:
\begin{alltt}
def forward(hmm, y):
    P_S0, P_ScS, P_YcS = hmm.parameters()
    S = len(hmm.states)
    T = len(y)
    # create data structures alpha and gamma
    alpha = N.zeros((T, S), N.float64) # alpha[t,s] = P(s|y(0..t))
    gamma = N.zeros((T, 1), N.float64) # gamma[t] = P(y(t)|y(0..t-1))
    # fill up alpha and gamma
    alpha[0] = P_YcS[y[0]]*P_S0
    gamma[0] = N.sum(alpha[0])
    alpha[0] /= gamma[0]
    for t in xrange(1, T):
        P_ScY_1_prev = N.dot(P_ScS, alpha[t-1]) # Eqn. \eqref{eq:forwardB}
        P_SYcY_1_prev = P_YcS[y[t]]*P_ScY_1_prev # Eqn. \eqref{eq:forwardC}
        gamma[t] = N.sum(P_SYcY_1_prev) # Eqn. \eqref{eq:forwardD}
        alpha[t] = P_SYcY_1_prev/gamma[t] # Eqn. \eqref{eq:forwardA}
    return alpha, gamma
\end{alltt}
The comments refer to the formulas in Chapter~\ref{chap:algorithms}
implemented by each line.
As I developed code for the examples in the book, I abandoned the
simplicity of Juengling's code in the pursuit of flexibility and
speed. To have code that could use special observation models such as
those in Chapters~\ref{chap:variants} and~\ref{chap:apnea}, I
replaced \emph{P\_YcS[yt]} with a function call. To make code faster,
I wrote routines in C and called them from python using swig. And to
have code that could operate on the large models required for
Fig.~\ref{fig:LikeLor} I used hash tables (python
\emph{dictionaries}) rather than lists or arrays. The result is a
collection of code that is flexible and fast enough to build the book
in a few hours but not as easy to read as I would have liked.
After many modifications of observation models as I worked on
Chapter~\ref{chap:apnea}, I found that caching the conditional
probabilities of observations given the states works well. Originally
I used a class method to calculate $P(\ti{y}{t}|s)$ for particular
values of $s$ and $t$ wherever the code required such a probability.
To use different observation models, I wrote subclasses that redefined
the method. The technique worked but there were drawbacks. By
profiling the python code, I found that most of the time was spent
calling the methods that implemented $P(\ti{y}{t}|s)$. For the
simplest observation models (discrete outputs which can be implemented
as arrays), Karl Hegbloom examined the details and reported:
\begin{quote}
Actually, it was not the actual method calls that took so much time.
The problem was that Python had to traverse its
object-representation data structures in order to find the methods.
The result was returned by code about three levels deep. The
interpreter dutifully performed that look-up chaining over and over
again, once for each run-time method call.
Python is a dynamic language where variables are not constrained to
hold values of a particular type. When you declare a variable, you
just name it, without saying anything to Python about the data type
of the object that will be kept at the location it stands for. This
language characteristic, known as ``duck typing'', makes it easy to
implement \emph{interface polymorphism}. ``If it walks like a duck
and quacks like a duck, it must be a duck.'' To make that work,
method look-up must happen at run-time. Since there is no syntax
for declaring the variable to hold only a certain data-type, the
Python byte-compiler has no way of resolving those at compile-time.
\end{quote}
Reorganizing the code so that before any call to the forward
algorithm, \ie, each iteration of the Baum-Welch algorithm and before
any call to the Viterbi algorithm, a single call to an observation
method calculates and stores $P(\ti{y}{t}|s)$ for every value of $s$
and $t$, resolved the drawbacks at the expense of using more
memory\footnote{For big problems the modification requires about 30\%
more memory to run the Baum-Welch algorithm.}.
%%% Local Variables:
%%% TeX-master: "main"
%%% eval: (load-file "hmmkeys")
%%% End:
% LocalWords: Welch Viterbi HMM