This document is (c) David J.C. MacKay, 2001

It originates from http://www.inference.phy.cam.ac.uk/mackay/itprnn/book.html

It contains the text of David MacKay's book, Information theory, inference, and learning algorithms. (latex source)

Copying and distribution of this file are NOT PERMITTED.

The file is provided for convenience of anyone wishing to make a web-based search of the text of the book.

% This document is (c) David J.C. MacKay, 2001
%
% It originates from http://www.inference.phy.cam.ac.uk/mackay/itprnn/
%                    http://www.inference.phy.cam.ac.uk/mackay/itprnn/book.html
%
% It contains the text of David MacKay's book,
%   Information theory, inference, and learning algorithms.
% (latex source)
%
% Copying and distribution of this file are NOT PERMITTED.
%
% The file is provided for convenience of anyone wishing to
% make a web-based search of the text of the book.

% was book2e.tex is now book.tex   (and still latex2e)
\documentclass[11pt]{book}%
\usepackage{floatflt}
\usepackage{hangingsecnum}% makes sec numbers sit in the left margin
%\usepackage{mparhack}
\usepackage{mparhackright-209}% makes all margin pars go in right margin
\usepackage{marginfig}%   Defines many macros for making various styles of figure with captions
\usepackage{symbols}
%\usepackage{twoside}
\usepackage{myalgorith}%   defines the Algorithm environment as a float
\usepackage{aside}%        defines the {aside} environment
\usepackage{chapsummary}%  helps me compile index-like objects
\usepackage{chapternotes}% lots of assorted stuff
\usepackage{lsalike}%      defines citation commands
\usepackage{booktabs}%     makes nice quality tables
\usepackage{prechapter}%   defines a chapter-like object
\usepackage{mycaption}%    defines ``\indented''and \@makecaption
% additions post-Sat 5/10/02
\usepackage{latexsym}%  needed in order to make use of the \Box command
\usepackage{tocloft}% implements my look of table of contents
\usepackage{tocloftcomp}% implements my look of table of contents
\usepackage{mychapter}% defines chapter command, including the look of the new chapter page
                      % also defines the look of the section and subsection commands
\usepackage{mycenter}%  modifies center to reduce vertical space waste
\usepackage{mypart}%    modifies part to not cleardoublepage 
\usepackage{myheadings}% redefines the pagestyle ``headings''
% \usepackage{headingmods}% redefines the pagestyle ``headings'' (similar to myheadings)
% \usepackage{myindents}% defines parindent and leftmargin
\usepackage{graphics}% enables rotating of boxes
% \usepackage{boldmathgk}%  provides bold alpha etc. (doesn't work)
\usepackage{fixmath}%  provides bold alpha etc.
\usepackage{makeidx}
\makeindex
%
\newcommand{\thedraft}{3.141} 
\renewcommand{\textfraction}{0.10}
\pagestyle{headings}
\begin{document}
\bibliographystyle{lsalike}
%\newcommand{\bf}{\textbf}
%\newcommand{\sf}{\textsf}
%%\newcommand{\em}{\textem}
%\newcommand{\rm}{\textrm}
%\newcommand{\tt}{\texttt}
%\newcommand{\sl}{\textsl}
%\newcommand{\sc}{\textsc}
%
%
%
 

 
 
% chapter.tex
% 
% this contains a few common definitions for all chapters 
% of the itprnn book
\newcommand{\adhoc}{ad hoc}
\newcommand{\busstop}{bus-stop}
\newcommand{\mynewpage}{\newpage}% switch this off later Sun 3/2/02
% see also tex/inputs/itchapter.sty
% chapternotes.sty is where there is an index
\newcommand{\fN}{f\!N}
\newcommand{\exercisetitlestyle}{\sf}
%
% used in sumproduct.tex and gallager.tex
\newcommand{\Mn}{{\cal M}(n)}
\newcommand{\Nm}{{\cal N}(m)}
%\newcommand{\N}{{\cal N}}
%
% the delta function that is 1 if true (defined in notation.tex)
\newcommand{\truth}{1}
% 
% used in gene.tex
\newcommand{\deltaf}{\delta\! f}
\newcommand{\tI}{\tilde{I}}
\newcommand{\Kp}{K_{\rm{p}}}
\newcommand{\Ks}{K_{\rm{s}}}
%
% end
% lang4.tex - distributions.tex
\newcommand{\lI}{I}
%
% clust.tex
\newcommand{\rnk}{r^{(n)}_k}
\newcommand{\hkn}{\hat{k}^{(n)}}
% good sizes: 
% -0.45: 1.25
% -0.25: 0.65
% -0.4 0.8
% Figure-inclusion helpers for the clustering chapter (clust.tex).
% All of them include a PostScript file from below octave/kmeansoft/ with
% \psfig, rotated by -90 degrees, and use negative \hspace to pull the
% picture to the left / trim its right-hand side.
%
% \softfig{<name>}: includes octave/kmeansoft/ps1/<name>.ps at width 0.8in,
% preceded by a -0.4in horizontal shift.
\newcommand{\softfig}[1]{\hspace{-0.4in}\psfig{figure=octave/kmeansoft/ps1/#1.ps,width=0.8in,angle=-90}}
% \softtfa{<name>}{<t>}{<dir>}: one-column tabular whose first row is the
% heading $t=<t>$ and whose second row is octave/kmeansoft/<dir>/<name>.ps
% at width 1.2in.
\newcommand{\softtfa}[3]{\begin{tabular}{c}{$t=#2$}\\
\hspace*{-0.4in}\mbox{\psfig{figure=octave/kmeansoft/#3/#1.ps,width=1.2in,angle=-90}\hspace*{-0.2in}}\\
\end{tabular}}
% \softtfabig: same arguments and layout as \softtfa, width 1.5in, shift -0.6in.
\newcommand{\softtfabig}[3]{\begin{tabular}{c}{$t=#2$}\\
\hspace*{-0.6in}\mbox{\psfig{figure=octave/kmeansoft/#3/#1.ps,width=1.5in,angle=-90}\hspace*{-0.2in}}\\
\end{tabular}}
% \softtfabigb: same arguments and layout as \softtfa, width 1.625in, shift -0.45in.
\newcommand{\softtfabigb}[3]{\begin{tabular}{c}{$t=#2$}\\
\hspace*{-0.45in}\mbox{\psfig{figure=octave/kmeansoft/#3/#1.ps,width=1.625in,angle=-90}\hspace*{-0.2in}}\\
\end{tabular}}
% Two-argument shorthands {<name>}{<t>} that fix the directory argument:
% ps1 for \softtf, \softtfbig, \softtfbigb; ps3 for \softtfb, \softtfbbig.
% NOTE(review): \softtfbbig expands to \softtfabigb (1.625in) whereas
% \softtfbig expands to \softtfabig (1.5in) -- confirm that the ps3 `big'
% variant is really meant to use the larger size.
\newcommand{\softtf}[2]{\softtfa{#1}{#2}{ps1}}
\newcommand{\softtfbig}[2]{\softtfabig{#1}{#2}{ps1}}
\newcommand{\softtfbigb}[2]{\softtfabigb{#1}{#2}{ps1}}
\newcommand{\softtfb}[2]{\softtfa{#1}{#2}{ps3}}
\newcommand{\softtfbbig}[2]{\softtfabigb{#1}{#2}{ps3}}
% \softfc{<name>}: octave/kmeansoft/ps5/<name>.ps at width 1.32in in a
% one-column tabular, with no $t=...$ heading row.
\newcommand{\softfc}[1]{\begin{tabular}{c}%
\hspace*{-0.2in}\mbox{\psfig{figure=octave/kmeansoft/ps5/#1.ps,width=1.32in,angle=-90}\hspace*{-0.2in}}\\
\end{tabular}}
% end
%
% used in _p1 and _l2
\newcommand{\hpheight}{26mm}
\newcommand{\wow}{\marginpar{\raisebox{-12pt}{\psfig{figure=figs/wow.eps,width=1in}}}}
%
% used in _l1.tex:::::::
\renewcommand{\q}{{f}}
\newcommand{\obr}[3]{\overbrace{{#1}\,{#2}\,{#3}}}
\newcommand{\ubr}[3]{\underbrace{{#1}\,{#2}\,{#3}}}
\newcommand{\nbr}[3]{{{#1}\,{#2}\,{#3}}}
%
%
\newcommand{\fitpath}{/home/mackay/octave/fit/ps}% used in fit.tex (gaussian fitting, octave)
% CUP style: 
\renewcommand{\ie}{i.e.}
\renewcommand{\eg}{e.g.}
\renewcommand{\NB}{N.B.}
%
% symbols i e and d in maths (operators)
\newcommand{\im}{{\rm i}}
\newcommand{\e}{{\rm e}}
% \d is already defined
%
% conclusionbox: sets its contents in a quote environment between two
% horizontal rules, each \textwidth wide and 0.1pt thick.
% The negative \vskip amounts (-18pt, -8pt, -14pt) pull the rules and the
% quoted text closer together than the default quote spacing; the final
% \vskip 6pt adds space below the closing rule.
\newenvironment{conclusionbox}%
{\vskip 0.1pt \noindent\rule{\textwidth}{0.1pt}\vskip -18pt\begin{quote}\vskip -8pt}%
{\end{quote}\vskip -14pt \noindent\rule{\textwidth}{0.1pt}\vskip 6pt}
% {\vskip 0.1pt \noindent\rule{\textwidth}{0.1pt}\vskip -12pt\begin{quote}}%
% {\end{quote}\vskip -12pt \noindent\rule{\textwidth}{0.1pt}}
\newcommand{\dy}{\d y}
\newcommand{\plus}{+}
\newcommand{\Wenglish}{Wenglish}% winglish
\newcommand{\wenglish}{\Wenglish}% winglish
\newcommand{\percent}{{per cent}}% in USA only: percent
%
%\newcommand{\nonexaminable}{$^{*}$}
\newcommand{\nonexaminable}{}
%
% for exact sampling chapter
\newcommand{\envelope}{summary state}
%
\def\unit#1{\,{\rm #1}}
\def\cm{\unit{cm}}
\def\grams{\unit{g}}
% this is a 209 versus 2e problem: (huffman.latex edited instead)
%\def\tenrm{\rm}
%\def\tenit{\it}
%
% other problems: \pem
\renewcommand{\textfraction}{0.1}
%
% for use in free text: 
\newcommand{\bits}{{\rm bits}}
\newcommand{\bita}{{\rm bit}}
% for use in equations or in '1 bit'
\newcommand{\ubits}{\,{\bits}}
\newcommand{\ubit}{\,{\bita}}
%
%
% used in ch 1:
\newcommand{\pB}{p_{\rm B}}
\newcommand{\pb}{p_{\rm b}}
%
% ch 2:
\newcommand{\sixtythree}{{\tt sixty-three}}
\newcommand{\aep}{`asymptotic equipartition' principle}
%
% used in alpha: 
\newcommand{\sla}{\sqrt{\lambda_a}}
\newcommand{\kga}{\kappa\gamma}
\newcommand{\kkgg}{\kappa^2\gamma^2}
\newcommand{\skg}{\sqrt{\kappa\gamma}}
\newcommand{\TYP}{{\rm \scriptscriptstyle TYP}}
%
\newcommand{\bb}{{\bf b}} 
\newcommand{\eq}{\mbox{$=$}}
%
% used in ising.tex and _s4.tex
% J=+1 are in states1, J=-1 are in states
%\newcommand{\risingsample}[1]{\psfig{figure=isingfigs/states1/#1.ps,width=1.82in}}
\newcommand{\risingsample}[1]{\psfig{figure=isingfigs/states1/#1.ps,width=1in}}% was 1.75
\newcommand{\smallrisingsample}[1]{\psfig{figure=isingfigs/states1/#1.ps,width=0.6in}}% was 1.2 was 0.9
\newcommand{\Hisingsample}[1]{\psfig{figure=isingfigs/states1/#1.ps,width=2.6in}}
\newcommand{\hisingsample}[1]{\psfig{figure=isingfigs/states/#1.ps,width=2.6in}}
\newcommand{\bighisingsample}[1]{\psfig{figure=isingfigs/states/#1.ps,width=3.86in}}
%
% used in _noiseless.tex
\newcommand{\Connectionmatrix}{Connection matrix}
\newcommand{\connectionmatrix}{connection matrix}
\newcommand{\connectionmatrices}{connection matrices}
%\newcommand{\cwM}{M}% codeword number
%\newcommand{\cwm}{m}% codeword number
\newcommand{\cwM}{S}% codeword number
\newcommand{\cwm}{s}% codeword number
\newcommand{\sa}{\alpha}% signal amplitude in gaussian channel
%
\newcommand{\cmA}{A}% connection matrix symbol
\newcommand{\bcmA}{{\bf \cmA}}% connection matrix symbol
\newcommand{\bAcm}{{\bcmA}}
\newtheorem{ctheorem}{Theorem}[chapter]
\newtheorem{definc}{Definition}[chapter]
% Cross-reference helpers.  Each takes a label and prints the kind of object
% in words followed by the number produced by \ref{<label>}.
% Lower-case names are for use mid-sentence; capitalised names are for the
% start of a sentence.  \secref/\Secref and \chref/\Chref are short synonyms
% for \sectionref/\Sectionref and \chapterref/\Chapterref.
% The word and the number are joined by a tie (~) so that a line or page
% break can never separate, e.g., `section' from `4.2'.
\newcommand{\appendixref}[1]{appendix~\ref{#1}}
\newcommand{\Appendixref}[1]{Appendix~\ref{#1}}
\newcommand{\sectionref}[1]{section~\ref{#1}}
\newcommand{\Sectionref}[1]{Section~\ref{#1}}
\newcommand{\secref}[1]{section~\ref{#1}}
\newcommand{\Secref}[1]{Section~\ref{#1}}
%\newcommand{\chapterref}[1]{chapter #1}% why?
%\newcommand{\Chapterref}[1]{Chapter #1}% why?
\newcommand{\chapterref}[1]{chapter~\ref{#1}}
\newcommand{\Chapterref}[1]{Chapter~\ref{#1}}
\newcommand{\chref}[1]{chapter~\ref{#1}}
\newcommand{\Chref}[1]{Chapter~\ref{#1}}
\newcommand{\chone}{\ref{ch.one}}
\newcommand{\chtwo}{\ref{ch.two}}
\newcommand{\chthree}{\ref{ch.three}}
\newcommand{\chfour}{\ref{ch.four}}
\newcommand{\chfive}{\ref{ch.five}}
\newcommand{\chsix}{\ref{ch.six}}
\newcommand{\chseven}{\ref{ch.ecc}}
\newcommand{\cheight}{\ref{ch.bayes}}
\newcommand{\chthirteen}{\ref{ch.single.neuron.class}}% single neuron
\newcommand{\chfourteen}{\ref{ch.single.neuron.bayes}}% single neuron bayes? 
\newcommand{\chtwelve}{\ref{ch.nn.intro}}% intro to nn
\newcommand{\chcover}{\ref{ch.cover}}
\newcommand{\chbayes}{\ref{ch.bayes}}
\newcommand{\secpulse}{\ref{sec.pulse}}% 7.2.1?}
\newcommand{\secthirteenthree}{13.3?}
\newcommand{\secmetrop}{\ref{sec.metrop}}% 11.3?}
\newcommand{\figooo}{?1.11?}
\newcommand{\eqgamma}{8.27?}
\newcommand{\TSP}{travelling salesman problem} 
\newcommand{\vfe}{variational free energy}
\newcommand{\vfem}{variational free energy minimization}
% could make this \ch6 = \ref{ch6}
% author, title etc is in here....
% {headerinfo.tex}% uses special commands
\setcounter{secnumdepth}{2}%
\newcommand{\indep}{\bot}% upside down pi desired
\newcommand{\dbf}{\sl}% boldface in definitions
\newcommand{\dem}{\sl}% emphasized definitions in text
% \solutionb{<number>}{<arg>}: sets the counter solution_number to <number>
% and then passes <arg> on to \solutiona (defined elsewhere).
% The first line ends with % so that the line break inside the definition
% does not become a space token, which would be typeset as a stray space
% whenever the macro is used in horizontal mode.
\newcommand{\solutionb}[2]{\setcounter{solution_number}{#1}%
\solutiona{#2}}
\newcommand{\lsolution}[2]{\section{Solution to exercise {#1}}{#2}}
%
%
\newcommand{\FIGS}{/home/mackay/book/FIGS}
\newcommand{\bookfigs}{/home/mackay/book/figs}
\newcommand{\figsinter}{/home/mackay/handbook/figs/inter}
\newcommand{\exburglar}{\exerciseref{ex.burglar}}
\newcommand{\exnine}{\exerciseref{ex.invP}}%10}
\newcommand{\exseven}{\exerciseonlyref{ex.weigh}}% use deprecated!
% was \exseven ....  \exerciseref{ex.expectn}}%9}
\newcommand{\exaseven}{\exerciseref{ex.R9}}%{7}
\newcommand{\exten}{\exerciseref{ex.expectng}}%{11}
\newcommand{\exfourteen}{\exerciseref{ex.Hadditive}}%{15}
\newcommand{\exfifteen}{\exerciseref{ex.Hcondnal}}%{16}
\newcommand{\exeighteen}{\exerciseref{ex.Hmutualineq}}%{19}
\newcommand{\extwenty}{\exerciseref{ex.rel.ent}}%{21}
\newcommand{\extwentyone}{\exerciseref{ex.joint}}%{22}% the joint ensemble
\newcommand{\extwentytwo}{\exerciseref{ex.dataprocineq}}%{23}
\newcommand{\extwentythree}{\exerciseref{ex.zxymod2}}%{24}
\newcommand{\extwentyfour}{\exerciseref{ex.waithead}}%{25}
\newcommand{\extwentyfive}{\exerciseref{ex.sumdice}}%{26}
\newcommand{\extwentysix}{\exerciseref{ex.RN}}%{27}
\newcommand{\extwentyseven}{\exerciseref{ex.RNGaussian}}%{28}
\newcommand{\exthirtyone}{\exerciseref{ex.logit}}%{32}% logistic
\newcommand{\exthirtysix}{\exerciseref{ex.exponential}}%{37}% 
\newcommand{\exthirtyseven}{\exerciseref{ex.blood}}%{38}% forensic
\newcommand{\exfiftythree}{\exerciseref{ex.}}%{53}% integers
\newcommand{\eqsixteenfive}{16.5}
\newcommand{\Kraft}{Kraft}% Kraft--McMillan
\newcommand{\exrelent}{\exerciseref{ex.rel.ent}}%{20} %% \ref{ex.rel.ent}
\newcommand{\eqKL}{1.24} %% \eqref{eq.KL}
\newcommand{\bSigma}{{\mathbf{\Sigma}}}
\newcommand{\sumproduct}{sum-product}
%
% for cpi material
%
\newcommand{\sigbias}{\sigma_{\rm bias}}
\newcommand{\sigin}{\sigma_{\rm in}}
\newcommand{\sigout}{\sigma_{\rm out}}
\newcommand{\abias}{\alpha_{\rm bias}}
\newcommand{\ain}{\alpha_{\rm in}}
\newcommand{\aout}{\alpha_{\rm out}}
%\newcommand{\bff}{\bf}
\newcommand{\handfigs}{/home/mackay/handbook/figs} 
\newcommand{\mjofigs}{/home/mackay/figs/mjo} 
\newcommand{\FIGSlearning}{/home/mackay/book/FIGS/learning}
\newcommand{\codefigs}{/home/mackay/_doc/code/ps/ps} 
%
% mncEL stuff
%
\newcommand{\ebnowide}[1]{\mbox{\psfig{figure=../../code/#1.ps,width=2.8in,angle=-90}}}
\newcommand{\fem}{m}
\newcommand{\feM}{M}
\newcommand{\fel}{n}
\newcommand{\feL}{N}
\renewcommand{\L}{N}
\newcommand{\feLm}{{\cal N}(m)}
\newcommand{\feMl}{{\cal M}(n)}
\newcommand{\feK}{N}
\newcommand{\fek}{n}
\newcommand{\feKn}{{\cal N}(m)}
\newcommand{\feNk}{{\cal M}(n)}
\newcommand{\feN}{M}
\newcommand{\fen}{m}
\newcommand{\fer}{r}
\newcommand{\GL}{GL}
\newcommand{\SMN}{GL}
\newcommand{\NMN}{MN}
\newcommand{\MN}{MN}
\renewcommand{\check}{check}% was relationship
\newcommand{\checks}{checks}% was relationship
\newcommand{\fs}{f_{\rm s}}
\newcommand{\fn}{f_{\rm n}}
\newcommand{\llncspunc}{.}
\newcommand{\lcA}{{H}} 
\newcommand{\rmncNall}{/home/mackay/_doc/code/rmncNall} 
% Short textual labels 1A, 2A, 1B, 2B.
% NOTE(review): \thrA and \thrB expand to 2A and 2B, i.e. exactly the same
% text as \twoA and \twoB.  This may be a deliberate remapping of an older
% `3A'/`3B' name onto 2A/2B, or a typo -- confirm against the places where
% \thrA and \thrB are used.
\newcommand{\oneA}{1A}
\newcommand{\twoA}{2A}
\newcommand{\thrA}{2A}
\newcommand{\oneB}{1B}
\newcommand{\twoB}{2B}
\newcommand{\thrB}{2B}
\newcommand{\bndips}{/home/mackay/_doc/code/bndips}
\newcommand{\codeps}{/home/mackay/_doc/code/ps}
\newcommand{\equalnode}{\raisebox{-1pt}[0in][0in]{\psfig{figure=figs/gallager/equal.eps,width=8pt}\hspace{0mm}}}
\newcommand{\plusnode}{\raisebox{-1pt}[0in][0in]{\psfig{figure=figs/gallager/plus.eps,width=8pt}\hspace{0mm}}}
% 
% Three layouts for a small two-by-two table with labelled rows and columns.
% All three take the same nine arguments and produce a bottom-aligned
% ([b]) tabular:
%   #1      text for the top-left cell
%   #2      heading above the value columns
%   #3, #4  the two values; each is used twice, once as a column heading
%           and once as a row label
%   #5      label for the rows
%   #6, #7  entries of the row labelled #3
%   #8, #9  entries of the row labelled #4
%
% \fourfourtable: #1 is followed by a colon; #2 spans the two value columns;
% a rule is drawn under the column headings (\cline{3-4}) and a vertical
% bar to the right of the row labels; #5 sits to the left of the first
% row label.  The \\[-0.1in] and \\[-7pt] tighten the row spacing.
\newcommand{\fourfourtable}[9]{\begin{tabular}[b]{lcc@{\hspace{4pt}}c}
 \multicolumn{1}{l}{#1:} &  & \multicolumn{2}{c}{#2} \\[-0.1in]%  \cline{1-1}
 & & {#3} & {#4} \\ \cline{3-4}
{#5} &\multicolumn{1}{l|}{#3} & {#6} & {#7} \\[-7pt] 
     &\multicolumn{1}{l|}{#4} & {#8} & {#9} \\ 
\end{tabular}}
% \fourfourtableb: #1, #2, #3, #4 share the first row; #5 gets a row of its
% own spanning the first two columns; the row labels #3 and #4 then appear
% in the first column, separated from the entries by a vertical bar.
\newcommand{\fourfourtableb}[9]{\begin{tabular}[b]{l|c@{\hspace{1pt}}c@{\hspace{3pt}}c}
 {#1} & {#2}  & {#3} & {#4} \\ \cline{1-1}\cline{3-4}
\multicolumn{2}{l}{#5} & & \\ 
\multicolumn{1}{l|}{#3} & & {#6} & {#7} \\[-5pt] 
\multicolumn{1}{l|}{#4} & & {#8} & {#9} \\ 
\end{tabular}}
% \fourfourtableold: same cell arrangement as \fourfourtableb, but fully
% ruled (vertical bars between all columns and \hline after each row from
% the #5 row onwards); the row labels span the first two columns.
\newcommand{\fourfourtableold}[9]{\begin{tabular}[b]{l|c|c|c|}
 {#1} & {#2}  & {#3} & {#4} \\ \cline{1-1}
\multicolumn{2}{l|}{#5} & & \\ \hline
\multicolumn{2}{l|}{#3} & {#6} & {#7} \\ \hline
\multicolumn{2}{l|}{#4} & {#8} & {#9} \\ \hline
\end{tabular}}
\newcommand{\mathsstrut}{\rule[-3mm]{0pt}{8mm}}
%
% for ra.tex
%
% Standard figure widths, in steps of 0.35in.  They are plain text macros
% (not lengths) and are substituted directly into the width= option of
% \psfig below.
\newcommand{\halfw}{0.35in}
\newcommand{\onew}{0.7in}
\newcommand{\onehalfw}{1.05in}
\newcommand{\twow}{1.4in}
\newcommand{\twohalfw}{1.75in}
% \GHfig...{<file>}: include GHps/<file> at one of the widths above.
% The argument is used as given, so it must include any file extension.
% NOTE(review): \GHfigquarter uses \twohalfw (1.75in), the same width as
% \GHfigdouble and the widest of the set, which does not match its name --
% confirm that this is intended.
\newcommand{\GHfig}[1]{\psfig{figure=GHps/#1,width=\onehalfw}}% for rate 1/3
\newcommand{\GHfigone}[1]{\psfig{figure=GHps/#1,width=\onew}}%
\newcommand{\GHfigthird}[1]{\psfig{figure=GHps/#1,width=\halfw}}
\newcommand{\GHfigquarter}[1]{\psfig{figure=GHps/#1,width=\twohalfw}}
\newcommand{\GHfigtwo}[1]{\psfig{figure=GHps/#1,width=\twow}}% for rate 1/2
\newcommand{\GHfigdouble}[1]{\psfig{figure=GHps/#1,width=\twohalfw}}% for five wide
% extra wide fitting::::::::::: (for turbo)
% These two use fixed widths (2in and 1.2in) instead of the macros above.
\newcommand{\GHfigdoubleE}[1]{\psfig{figure=GHps/#1,width=2in}}% for five wide
\newcommand{\GHfigE}[1]{\psfig{figure=GHps/#1,width=1.2in}}% for rate 1/3
%
\newcommand{\GHdrawfig}[1]{\psfig{figure=GHps/#1,width=1.5in}}% was 1.8
\newcommand{\standardfig}[1]{\psfig{figure=rirreg/#1,width=1.8in,angle=-90}}
\newcommand{\loopsfig}[1]{\psfig{figure=rirreg/loops.#1,height=1.85in,width=1.8in,angle=-90}}
\newcommand{\titledfig}[2]{\begin{tabular}{c}%
{#1}\\%
\standardfig{#2}\\%
\end{tabular}%
}

%
% for the single neuron chapters
%
% Counter funcfignum, initialised to 1.  It is not referenced by the macros
% defined in this group; presumably it is used by the chapter files.
\newcounter{funcfignum}
\setcounter{funcfignum}{1}
% \funcfig{<w1>}{<w2>}: for use inside a picture environment.
% Places, at picture coordinates (<w1>,<w2>), a zero-size box aligned at its
% bottom edge that contains the figure \FIGSlearning/f.<w1>.<w2>.ps
% (1.3in by 1.3in, rotated by -90 degrees) with the label $\bw = (<w1>,<w2>)$
% underneath.  The two arguments therefore serve both as the position and
% as part of the file name.
\newcommand{\funcfig}[2]{
	\put(#1,#2){\makebox(0,0)[b]{
	\begin{tabular}{@{}c@{}}
		\psfig{figure=\FIGSlearning/f.#1.#2.ps,height=1.3in,width=1.3in,angle=-90} \\[-0.15in]
 			$\bw = (#1,#2)$		
\\	\end{tabular}
		}
	}
}
% \wflatfig{<file>}: includes \FIGSlearning/<file> (height 2.43in, width
% 2.064in, rotated by -90 degrees) inside a picture of size 1.5 by 1.3 with
% lower-left corner (0.30,0.40), \unitlength being 1in.  The picture is
% smaller than the graphic, so it acts as a window of reserved space.
\newcommand{\wflatfig}[1]{
	\begin{tabular}{@{}c@{}}\setlength{\unitlength}{1in}\begin{picture}(1.5,1.3)(0.30,0.40)
		\psfig{figure=\FIGSlearning/#1,height=2.43in,width=2.064in,angle=-90}
% was 1.3,1.3
	\end{picture}\\\end{tabular}
}
% \wsurfig{<file>}: as \wflatfig, with a 1.8in by 1.8in graphic in a
% 1.5 by 1.5 picture whose lower-left corner is (0,0).
\newcommand{\wsurfig}[1]{
	\begin{tabular}{@{}c@{}}\setlength{\unitlength}{1in}\begin{picture}(1.5,1.5)(0,0)
		\psfig{figure=\FIGSlearning/#1,height=1.8in,width=1.8in,angle=-90}
% was 1.5,1.5
	\end{picture}\end{tabular}
}
% \datfig{<file>}: as \wflatfig, with a graphic of height 1.2in and width
% 1.412in in a 1 by 1 picture whose lower-left corner is (0.30,0.1).
\newcommand{\datfig}[1]{
	\begin{tabular}{@{}c@{}}\setlength{\unitlength}{1in}\begin{picture}(1,1)(0.30,0.1)
		\psfig{figure=\FIGSlearning/#1,height=1.2in,width=1.412in,angle=-90}
% was 1,1
	\end{picture}\end{tabular}
}
\newcommand{\optens}{optimal input distribution}% used in l5.tex, l6.tex, s5.tex
% \dilbertcopy: the bracketed copyright credit printed with Dilbert images.
% \copyright takes no argument, so the symbol has to be separated from the
% neighbouring word and year explicitly; ties (~) are used so that
% `Copyright (c) 1997' is never broken across lines.
\newcommand{\dilbertcopy}{{[Dilbert image Copyright~\copyright~1997 United Feature Syndicate, Inc.,
 used with permission.]}}
\newcommand{\Rnine}{\mbox{R}_9}
\newcommand{\Rthree}{\mbox{R}_3}
\newcommand{\eof}{{\Box}}
\newcommand{\teof}{\mbox{$\Box$}}% for use in text
\newcommand{\ta}{{\tt{a}}}
\newcommand{\tb}{{\tt{b}}}
%\newcommand{\dits}{dits}
%\newcommand{\dit}{dit}
\newcommand{\dits}{bans}
\newcommand{\dit}{ban}
%
% used in l5
%
\newcommand{\BSC}{binary symmetric channel}
\newcommand{\BEC}{binary erasure channel}
\newcommand{\subsubpunc}{}% change to . if subsubsections are given in-line headings
%
% convolutional code definitions
%
\newcommand{\cta}{t^{(a)}}
\newcommand{\ctb}{t^{(b)}}
\newcommand{\z}{z}
\newcommand{\lfsr}{linear feedback shift register} 
%
% definitions for including hinton diagrams from extended directory
%
\newcommand{\ecfig}[1]{\psfig{figure=extended/ps/#1.ps,silent=}}
% extra argument
\newcommand{\ecfigb}[2]{\psfig{figure=extended/ps/#1.ps,#2,silent=}}
%
% used in _s1 and in _linear maybe
%%%%%%%%%%% see /home/mackay/code/bucky
\newcommand{\buckypsfig}[1]{\mbox{\psfig{figure=buckyps/#1,width=1.2in}}}
\newcommand{\buckypsfigw}[1]{\mbox{\psfig{figure=buckyps/#1,width=1.75in}}}
\newcommand{\buckypsgraph}[1]{\mbox{\psfig{figure=buckyps/#1,width=1.2in,angle=-90}}}
\newcommand{\buckypsgraphb}[1]{\mbox{\psfig{figure=buckyps/#1,width=1.75in,angle=-90}}}
\newcommand{\buckypsgraphB}[1]{\mbox{\psfig{figure=buckyps/#1,width=2.2in,angle=-90}}}
%%%%%%%%%%%%%%

%%%%%%%%%%%%%%%55
% for l1a
%%%%%%%%%%%%%%%%%%
% example
% \bigrampicture{3.538mm}{hd_conbigram.ps}
% \bigrampicture{3.538mm}{hd_conbigram.ps,width=278pt}%%%%%%% 278 is the original size
% This used to work fine in latex209 then needed rejigging in 2e.
% (alignment of g,j,p,q,y wrong at the bottom) (saved to graveyard.tex
% \bigrampicture{<unitlength>}{<file>}{<font>}
% Draws a 27-by-27 bigram diagram with labelled axes.
%   #1  value assigned to \unitlength; one unit is the spacing between
%       neighbouring labels
%   #2  file name below bigrams/, inserted into the \psfig option list, so
%       further comma-separated psfig options may be appended to it
%       (e.g. `hd_conbigram.ps,width=278pt')
%   #3  font/size declaration placed in front of every label
% The picture is 30 by 30 units with lower-left corner (0,-30).  The graphic
% is rotated by -90 degrees and anchored by its bottom-left corner at
% (0.15,-27.8).  Column labels a..z and `--' are placed at x=1..27, y=-29;
% row labels a..z and `--' at x=-0.2, y=-1..-27, right-aligned.  The axis
% names $y$ and $x$ are placed at (29,-29) and (-0.2,1).
% In the column labels the letters g, j, p, q and y are wrapped in
% \raisebox{0mm}[0mm][0mm]{...}, which gives them zero height and depth so
% that their descenders do not disturb the bottom alignment of the row.
\newcommand{\bigrampicture}[3]%args are unitlength,picturename-and-picturesize,font-request
{%%%%%%%%%
\setlength{\unitlength}{#1}
\begin{picture}(30,30)(0,-30)% was 28,28   0,-28
\put(0.15,-27.8){\makebox(0,0)[bl]{\psfig{figure=bigrams/#2,angle=-90}}}
\put(1,-29){\makebox(0,0)[b]{{#3\tt a}}}
\put(2,-29){\makebox(0,0)[b]{{#3\tt b}}}
\put(3,-29){\makebox(0,0)[b]{{#3\tt c}}}
\put(4,-29){\makebox(0,0)[b]{{#3\tt d}}}
\put(5,-29){\makebox(0,0)[b]{{#3\tt e}}}
\put(6,-29){\makebox(0,0)[b]{{#3\tt f}}}
\put(7,-29){\makebox(0,0)[b]{\raisebox{0mm}[0mm][0mm]{#3\tt g}}}
\put(8,-29){\makebox(0,0)[b]{{#3\tt h}}}
\put(9,-29){\makebox(0,0)[b]{{#3\tt i}}}
\put(10,-29){\makebox(0,0)[b]{\raisebox{0mm}[0mm][0mm]{#3\tt j}}}
\put(11,-29){\makebox(0,0)[b]{{#3\tt k}}}
\put(12,-29){\makebox(0,0)[b]{{#3\tt l}}}
\put(13,-29){\makebox(0,0)[b]{{#3\tt m}}}
\put(14,-29){\makebox(0,0)[b]{{#3\tt n}}}
\put(15,-29){\makebox(0,0)[b]{{#3\tt o}}}
\put(16,-29){\makebox(0,0)[b]{\raisebox{0mm}[0mm][0mm]{#3\tt p}}}
\put(17,-29){\makebox(0,0)[b]{\raisebox{0mm}[0mm][0mm]{#3\tt q}}}
\put(18,-29){\makebox(0,0)[b]{{#3\tt r}}}
\put(19,-29){\makebox(0,0)[b]{{#3\tt s}}}
\put(20,-29){\makebox(0,0)[b]{{#3\tt t}}}
\put(21,-29){\makebox(0,0)[b]{{#3\tt u}}}
\put(22,-29){\makebox(0,0)[b]{{#3\tt v}}}
\put(23,-29){\makebox(0,0)[b]{{#3\tt w}}}
\put(24,-29){\makebox(0,0)[b]{{#3\tt x}}}
\put(25,-29){\makebox(0,0)[b]{\raisebox{0mm}[0mm][0mm]{#3\tt y}}}
\put(26,-29){\makebox(0,0)[b]{{#3\tt z}}}
\put(27,-29){\makebox(0,0)[b]{{#3--}}}
% they used to be at height -29 and were aligned  bottom
%\put(27,-29){\makebox(0,0)[b]{{#3\verb+-+}}}
%      
\put(29,-29){\makebox(0,0)[r]{#3$y$}}
%
\put(-0.2,-1){\makebox(0,0)[r]{{#3\tt a}}}
\put(-0.2,-2){\makebox(0,0)[r]{{#3\tt b}}}
\put(-0.2,-3){\makebox(0,0)[r]{{#3\tt c}}}
\put(-0.2,-4){\makebox(0,0)[r]{{#3\tt d}}}
\put(-0.2,-5){\makebox(0,0)[r]{{#3\tt e}}}
\put(-0.2,-6){\makebox(0,0)[r]{{#3\tt f}}}
\put(-0.2,-7){\makebox(0,0)[r]{{#3\tt g}}}
\put(-0.2,-8){\makebox(0,0)[r]{{#3\tt h}}}
\put(-0.2,-9){\makebox(0,0)[r]{{#3\tt i}}}
\put(-0.2,-10){\makebox(0,0)[r]{{#3\tt j}}}
\put(-0.2,-11){\makebox(0,0)[r]{{#3\tt k}}}
\put(-0.2,-12){\makebox(0,0)[r]{{#3\tt l}}}
\put(-0.2,-13){\makebox(0,0)[r]{{#3\tt m}}}
\put(-0.2,-14){\makebox(0,0)[r]{{#3\tt n}}}
\put(-0.2,-15){\makebox(0,0)[r]{{#3\tt o}}}
\put(-0.2,-16){\makebox(0,0)[r]{{#3\tt p}}}
\put(-0.2,-17){\makebox(0,0)[r]{{#3\tt q}}}
\put(-0.2,-18){\makebox(0,0)[r]{{#3\tt r}}}
\put(-0.2,-19){\makebox(0,0)[r]{{#3\tt s}}}
\put(-0.2,-20){\makebox(0,0)[r]{{#3\tt t}}}
\put(-0.2,-21){\makebox(0,0)[r]{{#3\tt u}}}
\put(-0.2,-22){\makebox(0,0)[r]{{#3\tt v}}}
\put(-0.2,-23){\makebox(0,0)[r]{{#3\tt w}}}
\put(-0.2,-24){\makebox(0,0)[r]{{#3\tt x}}}
\put(-0.2,-25){\makebox(0,0)[r]{{#3\tt y}}}
\put(-0.2,-26){\makebox(0,0)[r]{{#3\tt z}}}
\put(-0.2,-27){\makebox(0,0)[r]{{#3--}}}
%\put(-0.2,-27){\makebox(0,0)[r]{{#3\verb+-+}}}

\put(-0.2,1){\makebox(0,0)[r]{#3$x$}}
\end{picture}
}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% from theorems.tex for exact.tex
\newcommand{\PGB}{P^{G}_B}
\newcommand{\PGb}{P^{G}_b}
\newcommand{\PB}{P_B}
\newcommand{\Pb}{P_b}

% included by l2.tex
% definitions for weighings.tex and for text
% shows weighing trees, ternary
%
% decisions of what to weigh are shown in square boxes with 126 over 345 (l:r)
% state of valid hypotheses are listed in double boxes
% or maybe dashboxes?
% three arrows: up means left heavy, straight means right heavy, down is balance
%
% Picture-mode building blocks for the weighing trees.
% Positions, sizes and directions are passed as comma-separated pairs
% WITHOUT parentheses, e.g. \mysbox{10,20}{30,12}{...}.
%
% \mysbox{<x,y>}{<w,h>}{<text>}: framed box of size (w,h) at (x,y) whose
% contents are set in a one-column centred tabular.
\newcommand{\mysbox}[3]{%
  \put(#1){\framebox(#2){\begin{tabular}{c}#3\end{tabular}}}%
}
% \mydbox: currently drawn exactly like \mysbox.
\newcommand{\mydbox}[3]{\mysbox{#1}{#2}{#3}}
% \myuvector{<x,y>}{<dx,dy>}{<length>}: arrow from (x,y) with slope (dx,dy).
\newcommand{\myuvector}[3]{\put(#1){\vector(#2){#3}}}
% \mydvector: same as \myuvector; the caller supplies the downward slope.
\newcommand{\mydvector}[3]{\myuvector{#1}{#2}{#3}}
% \mysvector{<x,y>}{<length>}: horizontal arrow, i.e. slope (1,0).
\newcommand{\mysvector}[2]{\myuvector{#1}{1,0}{#2}}
% \mythreevector{<x,y>}{<dx>}{<dy>}{<length>}: three arrows fanning out from
% (x,y) -- slope (dx,dy), slope (dx,-dy), and horizontal.
\newcommand{\mythreevector}[4]{%
  \myuvector{#1}{#2,#3}{#4}%
  \mydvector{#1}{#2,-#3}{#4}%
  \mysvector{#1}{#4}%
}
%
%\newcommand{\h1}{\mbox{$1^+$}}
%\newcommand{\l1}{\mbox{$1^-$}}
%\newcommand{\h2}{\mbox{$2^+$}}
%\newcommand{\l2}{\mbox{$2^-$}}
%\newcommand{\h3}{\mbox{$3^+$}}
%\newcommand{\l3}{\mbox{$3^-$}}
%\newcommand{\h4}{\mbox{$4^+$}}
%\newcommand{\l4}{\mbox{$4^-$}}
%\newcommand{\h5}{\mbox{$5^+$}}
%\newcommand{\l5}{\mbox{$5^-$}}
%\newcommand{\h6}{\mbox{$6^+$}}
%\newcommand{\l6}{\mbox{$6^-$}}
%\newcommand{\h7}{\mbox{$7^+$}}
%\newcommand{\l7}{\mbox{$7^-$}}
%\newcommand{\h8}{\mbox{$8^+$}}
%\newcommand{\l8}{\mbox{$8^-$}}
%\newcommand{\h9}{\mbox{$9^+$}}
%\newcommand{\l9}{\mbox{$9^-$}}
%\newcommand{\h10}{\mbox{$10^+$}}
%\newcommand{\l10}{\mbox{$10^-$}}
%\newcommand{\h11}{\mbox{$11^+$}}
%\newcommand{\l11}{\mbox{$11^-$}}
%\newcommand{\h12}{\mbox{$12^+$}}
%\newcommand{\l12}{\mbox{$12^-$}}

\title{Information Theory,  Inference, \& Learning Algorithms}
\shortlecturetitle{}
\shortauthor{David J.C. MacKay}
% the book - called by book.tex
% thebook.tex
% Mon 7/10/02
\setcounter{page}{0} % set to current value    
\setcounter{exercise_number}{1} % set to imminent value 
%
\setcounter{secnumdepth}{1}    % sets the level at which subsection numbering stops
\setcounter{tocdepth}{0}
\newcommand{\mysetcounter}[2]{}%was {\setcounter{#1}{#2}}
% useful for forcing pagenumbers in drafts
%\setcounter{tocdepth}{1}    
\renewcommand{\bs}{{\bf s}}
\newcommand{\figs}{/home/mackay/handbook/figs} % while in bayes chapter 
% \addtocounter{page}{-1}
\thispagestyle{empty} 
\begin{center}
~\\[1.5in]
{\Huge \bf Information Theory,  \\[0.2in]  
% Pattern Recognition  \\[0.1in] 
 Inference,\\[0.2in]
 and Learning Algorithms\\[1in]
% Probability \\[0.2in]
% and Neural Networks\\[1in]
}
{\Large\sf David J.C. MacKay } \\[0.3in]
\copyright  1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003\\[1.3in]
Draft \thedraft\ \today\\ 
\end{center}
\dvipsb{frontpage}
\newpage
% choose one of these: 
% \input{cambridgefrontstuff.tex}
%
% alternate
\fakesection{Roadmap}
% \subchapter{Part III Physics}
\section*{Information Theory, Pattern Recognition and Neural Networks}
% {\large  Handout number 3.}\medskip
%
% Do not be disturbed by missing pagenumbers: the handouts do
% not include all the  book's chapters.
%
% Because the handouts are based on  different drafts of the book, there may
% be occasional mismatches between pagereferences, etc., where
% cross-references occur between this and the other handouts.
%
% Nonexaminable material is indicated by the symbol
% \nonexaminable.
%

\section*{Approximate roadmap for the eight-week course in Cambridge}
 The course will cover about 16 chapters of this book.
 The rest of the book is provided for your interest.
 The book contains numerous exercises with worked solutions.
% less than half the material in this book.
\medskip

\noindent
\begin{tabular}{@{}lp{5in}}
 Lecture 1 & Introduction to Information Theory.
		 \Chref{ch.one}.
% The lecture is mirrored by
\\
 Before lecture 2 &  Work on \exerciseref{ex.weigh}.
\\
	 & Read chapters \ref{ch.prob.ent} and \ref{ch.two}
                  and work on  exercises  in \chref{ch.prob.ent}.
% listed in the introduction		to \chref{ch.two}.
\\
 Lecture 2--3 &  Information content \& typicality. \Chref{ch.two}.
\\
 Lecture 4 & Symbol codes.  \Chref{ch.three}.
\\
 Lecture 5 & Arithmetic codes.  \Chref{ch.ac} (sections \ref{sec.startofch4}--\ref{sec.stopbeforeLZ} only).
\\
	 &  Read \chref{ch.prefive} and do the exercises.
\\
 Lecture 6 & Noisy channels. Definition of mutual information and capacity.
		\Chref{ch.five}.
\\
 Lecture 7--8 & The noisy channel coding theorem.
		\Chref{ch.six} (but not section \ref{sec.ch6stop} onwards).
\\
 Lecture 9 & Clustering. Bayesian inference. \Chref{ch1b}, \ref{ch.clustering}, \ref{ch.ml}.\\
% as a data modelling problem. \\
          & Read \chref{ch.ising} (Ising models).
\\
 Lecture 10--11 & Monte Carlo methods. \Chref{ch.mc}, \ref{ch.mc2}. \\
 Lecture 12 & Variational methods. \Chref{ch.mft}. \\
 Lecture 13 & Neural networks -- the single neuron. 
		\Chref{ch.single.neuron.class}. \\
 Lecture 14 & Capacity of the single neuron.  \Chref{ch.single.neuron.capacity}. \\
 Lecture 15 & Learning as inference.  \Chref{ch.single.neuron.bayes}. \\
 Lecture 16 & The Hopfield network. Content-addressable memory.  \Chref{ch.hopfield}. \\
\end{tabular}



\subsection*{About the exercises}
 I firmly believe that one can only understand a subject by 
 recreating it for oneself. To this end, I think it is essential to
 work through some exercises on each topic.  For guidance, each exercise
 has a rating (similar to that used by \citeasnoun{Knuth_vol1})
 from 1 to 5 that indicates the level of difficulty.

 In addition, exercises that are especially recommended
 are marked by a marginal encouraging rat -- \dorat.
 Exercises that require the use of a computer may be
 marked with a {\sl C}.
% will have 
% a rating such as A1, A5, C1 or C5. 
% The letter  indicates how important I think the exercise is:
% A = very important $\ldots$ C = not essential to the flow of the
% book. The number indicates the difficulty of the problem: 
% 1 = easy, 5 = research project.

 I'll circulate detailed recommendations on exercises
 as the course progresses.

 Answers to many of the exercises are provided. Please use them
 wisely.

 

%\begin{table}[htbp]
%\caption[a]
\begin{realcenter}
\fbox{
\begin{tabular}{ll}
%\begin{minipage}{3in}
{\sf Summary of codes for exercises}\\%[0.2in]
% \hspace{0.2in}
\begin{tabular}[b]{cl}
\dorat & Especially recommended \\[0.2in]
{\ensuremath{\triangleright}} & Recommended \\
{\sl C}   & Some parts require a computer \\
\end{tabular}
%\end{minipage}
&
\begin{tabular}[b]{cl}
\pdifficulty{1} & Simple (one minute) \\
\pdifficulty{2} &  Medium (quarter hour) \\
\pdifficulty{3} &  Moderately hard \\
\pdifficulty{4} &  Hard \\
\pdifficulty{5} &  Research project \\[0.2in]
\end{tabular}
\\
\end{tabular}
}
\end{realcenter}
%\end{table}



%%%%%%%%%%%%%%
\newpage
\tableofcontents
\dvipsb{toc}
% \input{extrafrontstuff.tex}% aims dedication, about the author, etc
% see also tex/oldaims.tex
% for some good stuff.
% and tex/typicalreaders.tex
%
%% \input{tex/overview2001.tex}
 
\prechapter{About Chapter} 
\setcounter{page}{1} % set to current value    
\label{pch.one}
%
% pre-chapter 1
%
\fakesection{Before ch 1}
 I hope you will find the mathematics in the first chapter easy.
 You will need to be familiar with the \ind{binomial distribution}.
% , reviewed below.
 And to solve the exercises in the text --
 which I urge you to do -- you will need to remember {\dem\ind{Stirling's
 approximation}\/}\index{approximation, Stirling's}
 for the factorial function, $%\beq
	 x! \simeq  x^{x}  e^{-x} 
$,
 and be able to
 apply it to ${{N}\choose{r}} =
 \frac{N!}{(N-r)!r!}$.\marginpar{\footnotesize{Unfamiliar notation?\\ See
 appendix \ref{app.notation}, \pref{app.notation}.}}
% $x!$
 These topics are reviewed below.

\subsection*{The binomial distribution}
\label{sec.first.binomial}
\exampl{ex.binomial}{
 A bent coin has probability $f$ of coming up heads.
 The coin is tossed $N$ times.
 What is the  probability
 distribution of the number of heads, $r$?
 What are the \ind{mean} and \ind{variance} of $r$?
}

\amarginfig{t}{%
\begin{tabular}{r}
% $P(r|f,N)$\\
\mbox{\psfig{figure=bigrams/urn.f.g.ps,angle=-90,width=1.51in}}%
\\
\mbox{\psfig{figure=bigrams/urn.f.l.ps,angle=-90,width=1.64in}}%
\\[-0.1in]
\multicolumn{1}{c}{$r$}
\\
\end{tabular}
%}{%
\caption[a]{The binomial distribution $P(r|f\eq 0.3,\,N \eq 10)$,
 on a linear scale (top) and  a logarithmic scale (bottom).}
\label{fig.binomial}
}
% see bigrams/README

\noindent
%\begin{Sexample}{ex.binomial}
{\sf Solution:}
\label{sec.first.binomial.sol}
 The number of heads
 has a binomial distribution.
\beq P(r|f,N) = {N \choose r} f^{r} (1-f)^{N-r} \eeq
 The mean, $\Exp [ r ]$, and variance, $\var[r]$,
 of this distribution are
 defined by
\beq
 \Exp [ r ] \equiv \sum_{r=0}^{N} P(r|f,N) \, r
\label{eq.mean.def}
\eeq
\beqan
 \var[r] & \equiv &
\Exp \left[ \left( r  -   \Exp [ r ] \right)^2 \right] \\
& = &
\Exp [ r^2 ] -  \left( \Exp [ r ] \right)^2
 =  \sum_{r=0}^{N} P(r|f,N) r^2 -  \left( \Exp [ r ] \right)^2 .
\label{eq.var.sum}
\eeqan
%
 Rather than evaluating the sums over $r$ (\ref{eq.mean.def},\ref{eq.var.sum}) directly,
 it is easiest to  obtain the mean and variance by noting that $r$
 is the sum of $N$ {\em independent\/}
% , identically distributed
 random variables, namely, the number of heads in the
 first toss (which is either zero or one),
 the number of heads in the second toss, and so forth.
 In general, 
\beq
\begin{array}{rcll}
 \Exp [ x + y ] &=&  \Exp [ x ] +  \Exp [ y ]  & \mbox{for any random variables $x$ and $y$};
\\
 \var [ x + y ] &=&  \var [ x ] +  \var [ y ]  & \mbox{if $x$ and $y$ are independent}.
\end{array}
\eeq
 So the mean of $r$ is the sum of the means of those random
 variables, and the variance of $r$ is the sum of their variances.
% its mean and variance are given by adding the means and variances
% of those random variables, respectively.
 The mean number of heads in a single toss
 is $f\times 1 + (1-f)\times 0 = f$, and the variance of the
 number of heads in a single toss  is
\beq
 \left[ f\times 1^2 + (1-f)\times 0^2 \right] - f^2 = f - f^2 = f(1-f),
\eeq
 so the mean and variance of $r$ are:
\beq \Exp [ r ] = N f
%\eeq\beq
\hspace{0.5in} \mbox{and} \hspace{0.5in}
 \var[r] = N f (1-f) .
\eeq
%\end{Sexample}
% ADD END PROOF SYMBOL HERE !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

\subsection*{Approximating $x!$ and ${{N}\choose{r}}$}

\amarginfig{t}{%
\begin{tabular}{r}
\mbox{\psfig{figure=bigrams/poisson.g.ps,angle=-90,width=1.5in}}%
\\
\mbox{\psfig{figure=bigrams/poisson.l.ps,angle=-90,width=1.64in}}%
\\[-0.1in]
\multicolumn{1}{c}{$r$}
\\
\end{tabular}
%}{%
\caption[a]{The Poisson distribution $P(r\,|\,\l\eq 15)$,
 on a linear scale (top) and  a logarithmic scale (bottom).}
\label{fig.poisson}
}
% see bigrams/README
\label{sec.poisson}


% FAVOURITE BIT
\noindent
 Let's derive Stirling's approximation by an unconventional route.
 We start from the \ind{Poisson distribution},
\beq
	P( r | \l ) = e^{-\l} \frac{\l^r}{r!} \:\:\:\:
  \:\: r\in \{ 0,1,2,\ldots\} .
\label{eq.poisson}
\eeq
%
% \noindent
For large $\l$, this distribution is well approximated -- at least\index{approximation by Gaussian}
 in the vicinity of $r \simeq \l$ -- by
 a Gaussian distribution with mean $\l$ and variance $\l$:
% So,
\beq
  e^{-\l} \frac{\l^r}{r!} \simeq \frac{1}{\sqrt{2\pi \l}}
		e^{{\textstyle -\frac{(r-\l)^2}{2\l}}} .
\eeq
 Let's plug  $r=\l$ into this formula.
\beqan
  e^{-\l} \frac{\l^{\l}}{\l!} &\simeq& \frac{1}{\sqrt{2\pi \l}}
\\
\Rightarrow \l! &\simeq&  \l^{\l} \, e^{-\l}  \sqrt{2\pi \l}  .
\eeqan
 This is {\bf Stirling's approximation}
 for the \ind{factorial} function,
 including several of the correction
 terms that are usually forgotten.
\beq
	 x! \simeq  x^{x}  e^{-x}  \sqrt{2\pi x}  \:\:\:\Leftrightarrow\:\:\:
	\ln x! \simeq x \ln x - x + {\textstyle\frac{1}{2}} \ln {2\pi x} .
\label{eq.stirling}
\eeq
%
 We can use this approximation
% the approximation
%$%\beq
%	 x! \simeq  x^{x}  e^{-x} $
 to approximate\index{combination}
$%\beq
	{{N}\choose{r}} \equiv \frac{N!}{(N-r)!r!}
$.%\eeq
\beqan
\ln	{{N}\choose{r}} 
%	&	\simeq &
% N  [ \ln N - 1 ] - (N-r) [ \ln (N-r) - 1 ] - r [ \ln r - 1 ]
%\\
 & \simeq & (N-r) \ln\frac{N}{N-r} + r \ln\frac{N}{r}
 .
\label{eq.choose.approx}
\eeqan
 Since all the terms in this equation are logarithms,
 this result can be rewritten in any base.\marginpar{\small Recall that
$\displaystyle{ \log_2 x = \frac{ \log_e x }{ \log_e 2} }$.\\[0.03in]
 Note that $\displaystyle\frac{\partial  \log_2 x }{\partial x} =
 \frac{1}{\log_e 2}\,\frac{1}{x}$.
}
%\fakesubsection*{My rule about log and ln}
 We will denote\index{conventions!logarithms}\index{notation!logarithms}
 natural logarithms ($\log_e$) by `ln', and \ind{logarithms}
 to base 2 ($\log_2$)
 by `$\log$'.

 If we introduce the {\dbf\ind{binary entropy function}},
\beq
 H_2(x) \equiv x \log \frac{1}{x} + (1-x) \log \frac{1}{(1-x)} ,
\eeq
 then we can rewrite the approximation
 (\ref{eq.choose.approx})
%\beq
%$ \log	{{N}\choose{r}} 
%  \simeq  (N-r) \log \frac{N}{N-r} + r \log \frac{N}{r} 
%$
%\eeq
 as
\amarginfig{t}{\small%
\begin{center}
\mbox{
\hspace{-6mm}
% \hspace{6.2mm}
\raisebox{\hpheight}{$H_2(x)$}
% to put H at left:
\hspace{-7.5mm}
% \hspace{-20mm}
\mbox{\psfig{figure=figs/H2.ps,%
width=42mm,angle=-90}}$x$
}
% see also H2p.tex

\end{center}
\caption[a]{The  binary entropy function.}
% $H_2(x)$.}
\label{fig.h2x}
}
\beq
\log	{{N}\choose{r}} 
  \simeq N H_2(r/N) ,
\label{eq.stirling.choose.l}
\eeq
 or, equivalently,
% \:\:\:\Leftrightarrow\:\:\:
\beq
	{{N}\choose{r}} 
  \simeq 2^{N H_2(r/N)} .
\label{eq.stirling.choose}
\eeq
 If we need a more accurate approximation, we
 can include terms of the next order from
 Stirling's approximation
 (\ref{eq.stirling}):
\beq
\log	{{N}\choose{r}} 
  \simeq N H_2(r/N) -
 {\textstyle\frac{1}{2}} \log \left[ {2\pi N \, \frac{N\!-\!r}{N} \,
                                       \frac{r}{N}}  \right]
.
\label{eq.H2approxaccurate}
\eeq
%
% - {\textstyle\frac{1}{2}} \ln {2\pi N}
% + {\textstyle\frac{1}{2}} \ln {2\pi N-r}
% + {\textstyle\frac{1}{2}} \ln {2\pi r}
%
% ln += {\textstyle\frac{1}{2}} \ln {2\pi (N-r)(r)/N}
% log_2 += {\textstyle\frac{1}{2}} \log_2 {2\pi (N-r)(r)/N}
% or
% log_2 += {\textstyle\frac{1}{2}} \log_2 {2\pi N}
%        + {\textstyle\frac{1}{2}} \log_2 {\frac{(N-r)}{N}\frac{r}{N}} 
% log_2 += {\textstyle\frac{1}{2}} \log_2 {2\pi \frac{(N-r)}{N}\frac{r}{N} N} 


\chapter{Introduction to Information Theory}
\label{ch.one}
\label{chone}
\addtopic{3}{infotheory}
\addtopic{2}{inference}
\addtrack{1}{inferencecourse}
\addtrack{3}{infotheorycourse}
\addtrack{3}{itprnncourse}
% % \part{Information Theory}
% \chapter{Introduction to Information Theory}
\label{ch1}
%\section{Communication over noisy channels}
% One of the principal questions addressed by information theory is 
%  Shannon's ground-breaking paper on `The Mathematical Theory of 
%  Communication' opens thus:
\begin{quotation}
\noindent
 The fundamental problem of communication is that of reproducing at one point
 either exactly or approximately a message selected at another point.
\\
\mbox{~} \hfill {\em (Claude Shannon, 1948)}  \\
%
\end{quotation}

\noindent
 In the first half of 
 this book we
%are going to
 study how to measure information content; 
 we
% are going to
% learn by how much data from a given source 
% can be compressed; we
% are going to
 learn how
% , practically, to
% achieve  data compression;
 to compress data; and we
% are going to
 learn  how to communicate 
 perfectly over  imperfect communication channels. 

 We start by getting a feeling for this last problem. 

\section[How can we achieve perfect communication?]{How
 can we achieve perfect communication over an imperfect, noisy
 communication channel?}
 Some examples of noisy communication channels are:
\bit
\item
 an analogue telephone 
 line,\marginpar{\footnotesize
\setlength{\unitlength}{1mm}%
 \begin{picture}(45,10)(0,5)
\put(0,10){\makebox(0,0)[l]{\shortstack{modem}}}
\put(21,10){\makebox(0,0)[l]{\shortstack{phone\\line}}}
\put(39,10){\makebox(0,0)[l]{\shortstack{modem}}}
\put(15,10){\vector(1,0){3}}
\put(32,10){\vector(1,0){3}}
\end{picture}
}
 over which two modems communicate digital information;
\item
 the radio communication link from the  Jupiter-orbiting spacecraft,
  Galileo,\marginpar{\footnotesize
\setlength{\unitlength}{1mm}%
 \begin{picture}(45,10)(0,5)
\put(0,10){\makebox(0,0)[l]{\shortstack{Galileo}}}
\put(21,10){\makebox(0,0)[l]{\shortstack{radio\\waves}}}
\put(39,10){\makebox(0,0)[l]{\shortstack{Earth}}}
\put(15,10){\vector(1,0){3}}
\put(32,10){\vector(1,0){3}}
\end{picture}
}
 to Earth;
\item
\marginpar{\footnotesize
\setlength{\unitlength}{1mm}%
 \begin{picture}(30,20)(0,0)
\put(0,10){\makebox(0,0)[l]{\shortstack{parent\\cell}}}
\put(16,2){\makebox(0,0)[l]{\shortstack{daughter\\cell}}}
\put(16,16){\makebox(0,0)[l]{\shortstack{daughter\\cell}}}
\put(10,10){\vector(1,1){5}}
\put(10,10){\vector(1,-1){5}}
\end{picture}
}reproducing cells, in which the daughter cells' \ind{DNA}
 contains information from the parent
% cell or
 cells;
\item 
 \marginpar{\footnotesize
\setlength{\unitlength}{1mm}%
 \begin{picture}(45,10)(0,5)
\put(0,10){\makebox(0,0)[l]{\shortstack{computer\\ memory}}}
\put(20,10){\makebox(0,0)[l]{\shortstack{disc\\drive}}}
\put(33,10){\makebox(0,0)[l]{\shortstack{computer\\ memory}}}
\put(15,10){\vector(1,0){3}}
\put(29,10){\vector(1,0){3}}
\end{picture}
}a disc drive.
\eit
 The last example shows that communication doesn't have to involve 
 information going from one {\em place\/}  to another. When 
 we write a file on a disc drive, we'll
% typically
 read it off
% again 
 in the same location -- but at a later {\em time}.

 These channels are noisy. A telephone line  suffers
 from cross-talk with other lines; the hardware in the 
 line distorts and adds noise to the transmitted signal.  The deep
 space network that listens to Galileo's puny transmitter
% fairy-bulb power
 receives background radiation  from
 terrestrial and cosmic sources.
 DNA is subject to mutations and damage. 
 A \ind{disc drive}, which  writes
 a binary digit (a one or zero, also known as a {\dbf bit}) by aligning a patch of magnetic
 material in one of two orientations, may later
% , with some probability,
 fail to read out the stored binary digit:
% that was stored
 the patch of material might  spontaneously flip
 magnetization, or
 a glitch of
 background noise might cause the reading circuit
 to report the wrong 
 value for the binary digit, or  the writing head might not induce 
 the magnetization in the first place because of interference
 from neighbouring bits.

 In all these cases, if we transmit data, \eg, a string 
 of bits, over the channel, there is some probability that 
 the received message will not be identical to the transmitted message. 
% And in all cases,
 We would prefer to have a communication channel for
 which this probability was zero -- or so close to zero that 
 for practical purposes it is indistinguishable from zero.  

 Let's consider
% the example of
 a noisy disc drive
% having the property
 that transmits  each bit  correctly
% transmitted
 with probability
 $(1-f)$ and incorrectly  with probability $f$. 
 This model
% favourite
 communication channel  is known 
 as the {\dbf{\ind{binary symmetric channel}}} (\figref{fig.bsc1}).

\begin{figure}[htbp]
\figuremargin{%
\[
\begin{array}{c}
\setlength{\unitlength}{0.46mm}
\begin{picture}(30,20)(-5,0)
\put(-4,9){{\makebox(0,0)[r]{$x$}}}
\put(5,2){\vector(1,0){10}}
\put(5,16){\vector(1,0){10}}
\put(5,4){\vector(1,1){10}}
\put(5,14){\vector(1,-1){10}}
\put(4,2){\makebox(0,0)[r]{1}}
\put(4,16){\makebox(0,0)[r]{0}}
\put(16,2){\makebox(0,0)[l]{1}}
\put(16,16){\makebox(0,0)[l]{0}}
\put(24,9){{\makebox(0,0)[l]{$y$}}}
\end{picture}
\end{array}
%
\:\:\:
\begin{array}{ccl}%%%%% {c@{}c@{}l} %%%%% (for twocolumn style)
        P(y\eq 0|x\eq 0) &\!=\!& 1 - \q ; \\        P(y\eq 1|x\eq 0) &\!=\!& \q ;
\end{array} 
\begin{array}{ccl}
        P(y\eq 0|x\eq 1) &\!=\!&  \q ; \\ P(y\eq 1|x\eq 1) &\!=\!& 1 - \q .
\end{array} 
\]
}{%
\caption[a]{The binary symmetric channel. The 
 transmitted symbol is $x$ and the 
 received symbol $y$. The noise level, the probability of a bit's being
 flipped, is $f$.}
\label{fig.bsc1}
}%
\end{figure}
\begin{figure}[htbp]
\figuremargin{%
\begin{mycenter}
\begin{tabular}{rcl}
\psfig{figure=bitmaps/dilbert.ps,width=1.2in} 
&\hspace{0.1in}%
\raisebox{0.22in}{%
\setlength{\unitlength}{1.2mm}%
\begin{picture}(20,20)(0,0)%
\put(10,1){\makebox(0,0)[t]{$(1-f)$}}
\put(10,17){\makebox(0,0)[b]{$(1-f)$}}
\put(12,9.5){\makebox(0,0)[l]{$f$}}
% \put(10,16.5){\makebox(0,0)[b]{$(1-f)$}}
\put(5,2){\vector(1,0){10}}
\put(5,16){\vector(1,0){10}}
\put(5,4){\vector(1,1){10}}
\put(5,14){\vector(1,-1){10}}
\put(4,2){\makebox(0,0)[r]{{1}}}
\put(4,16){\makebox(0,0)[r]{{0}}}
\put(16,2){\makebox(0,0)[l]{{1}}}
\put(16,16){\makebox(0,0)[l]{{0}}}
\end{picture}%
}%
\hspace{0.385in}&
\psfig{figure=_is/10000.10.ps,width=1.2in} \\
% & & \makebox[0in][l]{\large 10\% of bits are flipped} \\
\end{tabular}
\end{mycenter}
}{%
\caption[a]{A binary data sequence of length 10000 transmitted over 
 a binary symmetric channel with  noise level $f=0.1$.
\dilbertcopy}
\label{fig.bsc.dil}
}%
\end{figure}

\noindent
 As an example,
% For the sake of argument,
 let's imagine that $f=0.1$, that is, ten \percent\ of the bits are 
 flipped (\figref{fig.bsc.dil}).
% For a disc drive to be useful, we would prefer that it should 
% flip no bits at all in its entire lifetime.
 A useful disc drive would  flip no bits at all in its entire lifetime.
%  
 If we expect to read and write a 
 gigabyte per day for ten years,  we require a bit error 
 probability  of the order of $10^{-15}$, or smaller.
 There are two approaches to this goal. 


\subsection{The physical solution}
 The physical solution is to improve the physical characteristics of 
 the communication channel to reduce its error probability. We could 
 improve our disc drive by
% , for example,
\ben
\item
  using more reliable components in its circuitry;
\item
  evacuating the air from the disc enclosure so as
 to eliminate the turbulence that perturbs the 
 reading head from the  track; 
\item
  using a larger magnetic patch to represent each bit;  or 
\item 
   using higher-power signals or cooling the 
 circuitry in order to reduce thermal noise. 
\een
 These physical modifications
 typically
 increase the cost of the communication 
 channel.
%   unit of area  making the disc spin at a slower rate

%
% the system solution
% 
\begin{figure}%[htbp]
\figuremargin{%
\setlength{\unitlength}{1.25mm}
\begin{mycenter}
\begin{picture}(50,40)(-10,5)
\put(0,5){\framebox(25,10){\begin{tabular}{c}Noisy\\ channel\end{tabular}}}
\put(-20,20){\framebox(25,10){\begin{tabular}{c}Encoder\end{tabular}}}
\put(20,20){\framebox(25,10){\begin{tabular}{c}Decoder\end{tabular}}}
%\put(-20,40){\framebox(25,10){\begin{tabular}{c}Compressor\end{tabular}}}
%\put(20,40){\framebox(25,10){\begin{tabular}{c}Decompressor\end{tabular}}}
%\put(-50,20){\makebox(25,10){\begin{tabular}{c}{\sc Source}\\{\sc coding}\end{tabular}}}
% \put(-50,40){\makebox(25,10){\begin{tabular}{c}{\sc Channel}\\{\sc coding}\end{tabular}}}
\put(-20,37){\makebox(25,12){Source}}
%
\put(-10,14){\makebox(0,0){$\bt$}}
\put(-10,34){\makebox(0,0){$\bs$}}
\put(35,14){\makebox(0,0){$\br$}}
\put(35,34){\makebox(0,0){$\hat{\bs}$}}

\put(-7.5,18){\line(0,-1){8}}  
\put(-7.5,10){\vector(1,0){6}} 
\put(32.5,10){\vector(0,1){8}}
\put(32.5,10){\line(-1,0){6}}
%
\put(32.5,31){\vector(0,1){8}}
%\put(32.5,51){\vector(0,1){5}}
\put(-7.5,39){\vector(0,-1){8}}
%\put(-7.5,55){\vector(0,-1){5}}
\end{picture}
\end{mycenter}
}{%
\caption[a]{The `system' solution for
         achieving 
% almost perfect 
        reliable communication
        over a noisy channel. The encoding system introduces
 systematic redundancy
%        in a systematic way 
        into the transmitted vector $\bt$. The decoding system 
   uses this known redundancy to deduce
 from  the 
        received vector $\br$
 {\em both\/}
 the original source vector
        {\em and\/}
 the noise introduced by the channel.
}
\label{system.solution}
}%
\end{figure}
\subsection{The `system' solution}
 Information theory\index{information theory} and
 \ind{coding theory}\index{system} offer
 an alternative (and much more exciting)
 approach: we accept the given noisy channel as it is
 and 
 add communication {\dem systems\/} to it so that we 
 can {detect\/} and {correct\/} the errors introduced by the 
% noise.
 channel.
 As shown in \figref{system.solution}, we   add an 
 {\dem\ind{encoder}\/} before the channel and a {\dem\ind{decoder}\/} after 
 it. The encoder encodes the source message $\bs$ 
 into a {\dem transmitted\/} message $\bt$,
% the idea is that the  encoder adds
 adding {\dem\ind{redundancy}\/} to the original message in some way. The 
 channel adds noise to the transmitted message, yielding a received 
 message $\br$. The decoder uses the  known redundancy 
 introduced by the encoding system to infer both the original signal 
 $\bs$ and the added noise.
% added by the channel was. 

 Whereas  physical solutions give incremental channel improvements
 only  at an ever-increasing cost,
% we hope to find
% there exist
 system solutions  can turn noisy channels into reliable
 communication channels
 with the only cost being a  {\em computational\/} requirement 
 at the encoder and decoder.
% (and the delay associated with those computations.
%
% suggested addition: 
% So, as the cost of computation falls, the cost of reliability will fall as well.

{\dbf Information theory} is concerned with the theoretical limitations and 
% theoretical 
 potentials of such  systems. `What is the best error-correcting 
 performance we could achieve?'

{\dbf Coding theory} is concerned with the creation of practical 
 encoding and decoding systems.
 
% Some
\section{Error-correcting codes for the binary symmetric channel}
 We now consider  examples of encoding and decoding systems. 
 What is the simplest way to  add useful redundancy to a transmission? 
 [To make the rules of the game clear:
 we want to be able to detect {\em and\/} correct errors;
 and retransmission is not an option. We  get only
one chance to encode, transmit,
 and decode.]

\subsection{Repetition codes}
\label{sec.r3}
 A straightforward idea is to repeat every bit of the message a prearranged
 number of times -- for example, three times, as shown in \figref{fig.r3}. 
 We call this {\dem \ind{repetition code}\/} `$\Rthree$'.

%\begin{figure}[htbp]
%\figuremargin{%
\amarginfig{c}{
\begin{mycenter}
\begin{tabular}{c@{\hspace{0.3in}}c} \toprule % \hline
%        Source sequence $\bs$ &  Transmitted sequence $\bt$ \\ \hline
        Source  &  Transmitted  \\[-0.02in] % was -0.1, which was too much
         sequence  &   sequence  \\ 
         $\bs$ &   $\bt$ \\ \midrule % \hline
        \tt 0 &\tt  000 \\
        \tt 1 &\tt  111  \\ \bottomrule % \hline
\end{tabular} 
\end{mycenter}
%}{%
\caption[a]{The repetition code {$\Rthree$}.}
\label{fig.r3}
}%
%\end{figure}

% \noindent
%
 Imagine that
% what might happen if 
 we transmit the source message
\[
 \bs = \mbox{\tt 0 0 1 0 1 1 0}
\]
 over a binary 
 symmetric channel with noise level $f=0.1$ using this repetition code. 
 We can describe the channel as `adding' a sparse noise vector $\bn$ to the 
 transmitted vector -- adding in modulo 2 arithmetic, \ie, the binary algebra in which 
 {\tt 1}+{\tt 1}={\tt 0}.  A possible noise
 vector $\bn$ and received vector $\br = \bt + \bn$
 are shown in 
 \figref{fig.r3.transmission}.
\begin{figure}[htbp]
%
% here i should switch the \[ \] for a display that does not introduce
% white space at the top (about 0.1in)
%
\figuremargin{%
\[
        \begin{array}{rccccccc}
        \bs & {\tt 0}&{\tt 0}&{\tt 1}&{\tt 0}&{\tt 1}&{\tt 1}&{\tt 0} \\
        \bt & \obr{{\tt 0}}{{\tt 0}}{{\tt 0}}&\obr{{\tt 0}}{{\tt 0}}{{\tt 0}}&\obr{{\tt 1}}{{\tt 1}}{{\tt 1}}&\obr{{\tt 0}}{{\tt 0}}{{\tt 0}}&
                \obr{{\tt 1}}{{\tt 1}}{{\tt 1}}&\obr{{\tt 1}}{{\tt 1}}{{\tt 1}}& \obr{{\tt 0}}{{\tt 0}}{{\tt 0}} \\ 
        \bn & \nbr{{\tt 0}}{{\tt 0}}{{\tt 0}}& \nbr{{\tt 0}}{{\tt 0}}{{\tt 1}}&   \nbr{{\tt 0}}{{\tt 0}}{{\tt 0}}&  \nbr{{\tt 0}}{{\tt 0}}{{\tt 0}}&
                \nbr{{\tt 1}}{{\tt 0}}{{\tt 1}}&   \nbr{{\tt 0}}{{\tt 0}}{{\tt 0}}&  \nbr{{\tt 0}}{{\tt 0}}{{\tt 0}} \\ \cline{2-8}
        \br &  \nbr{{\tt 0}}{{\tt 0}}{{\tt 0}}& \nbr{{\tt 0}}{{\tt 0}}{{\tt 1}}&   \nbr{{\tt 1}}{{\tt 1}}{{\tt 1}}&  \nbr{{\tt 0}}{{\tt 0}}{{\tt 0}}&
                \nbr{{\tt 0}}{{\tt 1}}{{\tt 0}}&   \nbr{{\tt 1}}{{\tt 1}}{{\tt 1}}&  \nbr{{\tt 0}}{{\tt 0}}{{\tt 0}} 
        \end{array}
\]
}{%
\caption{An example transmission using $\mbox{R}_3$.}
\label{fig.r3.transmission}
}
\end{figure}

%\noindent
 How should we decode this received vector?
%
% optimality not clear - should justify? 
%
% Perhaps you can see  that
 The optimal algorithm looks at the received
 bits three at a time and takes 
 a \ind{majority vote} (\algref{alg.r3}).
%
\begin{aside}
% 
 At the risk of explaining the obvious, let's prove this result.
 The optimal decoding decision
 (optimal in the sense
 of having the smallest probability of being wrong)
 is to find which value of $\bs$
 is most probable, given $\br$.\index{MAP}
% to make clear the assumptions.
 Consider the decoding of a single bit $s$, which was encoded
 as
% after encoding as
 $\bt(s)$ 
 and  gave rise to three received bits $\br = r_1r_2r_3$.
 By \ind{Bayes's theorem},\label{sec.bayes.used} the {\dem posterior
 probability\/} of $s$ is
\beq
	P(s \,|\, r_1r_2r_3 ) = \frac{ P( r_1r_2r_3 \,|\, s ) P( s ) }
				{ P( r_1r_2r_3 ) } .
\label{eq.bayestheorem}
\eeq
 We can spell out the posterior probability of the two alternatives thus:
\beq
	P(s\!=\!{\tt 1} \,|\, r_1r_2r_3 ) = \frac{ P( r_1r_2r_3 \,|\, s\!=\!{\tt 1} )
						P( s\!=\!{\tt 1} ) }
				{ P( r_1r_2r_3 ) } ; 
\label{eq.post1}
\eeq
\beq
	P(s\!=\!{\tt 0} \,|\, r_1r_2r_3 ) = \frac{ P( r_1r_2r_3 \,|\, s\!=\!{\tt 0} )
						P( s\!=\!{\tt 0} ) }
				{ P( r_1r_2r_3 ) } .
\label{eq.post0}
\eeq
%
 This \ind{posterior probability} is determined by two factors:
  the
 {\dem{\ind{prior} probability\/}} $P(s)$, and 
 the data-dependent term $P( r_1r_2r_3 \,|\, s )$, which is called
 the {\dem{\ind{likelihood}\/}} of $s$.
 The normalizing constant $P( r_1r_2r_3 )$
% is irrelevant to
 needn't be computed when finding
 the optimal decoding decision,
 which is to guess $\hat{s}\!=\!{\tt 0}$
 if $P(s\!=\!{\tt 0} \,|\, \br ) > P(s\!=\!{\tt 1} \,|\, \br )$,
 and $\hat{s}\!=\!{\tt 1}$ otherwise.

 To find
 $P(s\!=\!{\tt 0} \,|\, \br )$ and $P(s\!=\!{\tt 1} \,|\, \br )$,
% the optimal decoding decision,
 we must  make an assumption about the prior probabilities of the
 two hypotheses ${s}\!=\!{\tt 0}$ and ${s}\!=\!{\tt 1}$, and we 
 must make an assumption about the probability of $\br$ given
 $s$.
% $\bt(s)$.
 We  assume that the prior probabilities are equal:
 $P( {s}\!=\!{\tt 0}) = P( {s}\!=\!{\tt 1}) = 0.5$; 
 then  maximizing the posterior probability $P(s\,|\,\br)$ is
 equivalent to maximizing the likelihood $P(\br\,|\,s)$.\index{maximum likelihood}
 And  we  assume that the
 channel is a binary symmetric channel with noise level $f<0.5$, so that
 the likelihood is
\beq
	P( \br \,|\, s ) = P(\br \,|\, \bt(s) ) = \prod_{n=1}^N
		P(r_n \,|\, t_n(s) ) ,
\eeq
 where $N=3$ is the number of transmitted bits in the block
 we are considering, and
\beq
 P(r_n\,|\,t_n) = \left\{ \begin{array}{lll}
 (1\!-\!f) & \mbox{if} &  r_n=t_n \\
 f & \mbox{if} & r_n \neq t_n. \end{array} \right.
\eeq
 Thus the likelihood ratio for the
 two hypotheses is
% if we define $
\beq
	\frac{P(\br\,|\, s\!=\!{\tt 1})}{P(\br\,|\, s\!=\!{\tt 0})}
%	= \left( \frac{ (1-f) }{f} \right)^{
	= \prod_{n=1}^N
		\frac{P(r_n \,|\, t_n({\tt 1}) )}{P(r_n \,|\, t_n({\tt 0}) )} ;
\label{eq.likelihood.bsc}
\eeq
 each factor
% $P(r_n \,|\, t_n(s) )$
 $\frac{P(r_n \,|\, t_n({\tt 1}) )}{P(r_n \,|\, t_n({\tt 0}) )}$
 equals $\frac{ (1-f) }{f}$ if $r_n=1$ and $\frac{f}{ (1-f) }$ if
 $r_n=0$. 
 The ratio $\gamma \equiv \frac{ (1-f) }{f}$ is greater than 1,
 since $f<0.5$, so the winning hypothesis is the one with the most
 `votes', each vote counting for a factor of $\gamma$ in the
% posterior probability.
 likelihood ratio. 

 Thus the majority-vote decoder shown in \algref{fig.r3d}
 is the optimal decoder if we assume that 
 the channel is  a binary symmetric channel and that the 
 two possible source messages {\tt 0} and {\tt 1} 
 have equal prior probability.
\end{aside}

\begin{algorithm}[htbp]
\algorithmmargin{%
\begin{mycenter}
\begin{tabular}{ccc} % \toprule % \hline
        Received sequence $\br$ &
 Likelihood ratio $\frac{P(\br\,|\, s\hspace{-0.2mm}=\hspace{-0.2mm}{\tt 1})}{P(\br\,|\, s\hspace{-0.2mm}=\hspace{-0.2mm}{\tt 0})}$
 &
 Decoded sequence $\hat{\bs}$ \\ \midrule
\tt      000 & $\gamma^{-3}$ &\tt 0 \\
\tt      001 & $\gamma^{-1}$ &\tt 0 \\
\tt      010 & $\gamma^{-1}$ &\tt 0 \\
\tt      100 & $\gamma^{-1}$ &\tt 0 \\
\tt      101 & $\gamma^{1}$  &\tt 1 \\
\tt      110 & $\gamma^{1}$  &\tt 1 \\
\tt      011 & $\gamma^{1}$  &\tt 1 \\
\tt      111 & $\gamma^{3}$  &\tt 1 \\
% \bottomrule
\end{tabular} 
\end{mycenter}
}{%
\caption[a]{Majority-vote decoding algorithm for {$\Rthree$}.
 Also shown are the likelihood ratios (\ref{eq.likelihood.bsc}), assuming
%  This is the optimal decoder if
 the channel is a binary symmetric channel; $\gamma \equiv (1-f)/f$.}
%
\label{fig.r3d}
\label{alg.r3}
}%
\end{algorithm}

%\noindent
 We now apply the majority-vote decoder to the received vector of  \figref{fig.r3.transmission}.
 The first  three received bits are all ${\tt 0}$, so
 we decode this triplet 
 as a ${\tt 0}$. 
 In the second triplet of \figref{fig.r3.transmission},
 there are two {\tt 0}s and one {\tt 1}, so  we decode 
 this triplet as a ${\tt 0}$ -- which in this case corrects the error.
 Not all errors are corrected, however. If we are unlucky and 
 two errors fall in a single block, as in the fifth triplet of 
 \figref{fig.r3.transmission}, 
 then the decoding rule gets the wrong answer, as shown in 
 \figref{fig.decoding.R3}. 
% \Figref{fig.decoding.R3}
% shows the result of decoding the received vector 
% from \figref{fig.r3.transmission}.
\begin{figure}[htbp]
\figuremargin{%
\[
        \begin{array}{rccccccc}
        \bs & {\tt 0}&{\tt 0}&{\tt 1}&{\tt 0}&{\tt 1}&{\tt 1}&{\tt 0} \\
        \bt & \obr{{\tt 0}}{{\tt 0}}{{\tt 0}}&\obr{{\tt 0}}{{\tt 0}}{{\tt 0}}&\obr{{\tt 1}}{{\tt 1}}{{\tt 1}}&\obr{{\tt 0}}{{\tt 0}}{{\tt 0}}&
                \obr{{\tt 1}}{{\tt 1}}{{\tt 1}}&\obr{{\tt 1}}{{\tt 1}}{{\tt 1}}& \obr{{\tt 0}}{{\tt 0}}{{\tt 0}} \\ 
        \bn & \nbr{{\tt 0}}{{\tt 0}}{{\tt 0}}& \nbr{{\tt 0}}{{\tt 0}}{{\tt 1}}&   \nbr{{\tt 0}}{{\tt 0}}{{\tt 0}}&  \nbr{{\tt 0}}{{\tt 0}}{{\tt 0}}&
                \nbr{{\tt 1}}{{\tt 0}}{{\tt 1}}&   \nbr{{\tt 0}}{{\tt 0}}{{\tt 0}}&  \nbr{{\tt 0}}{{\tt 0}}{{\tt 0}} \\ \cline{2-8} 
        \br &  \ubr{{\tt 0}}{{\tt 0}}{{\tt 0}}& \ubr{{\tt 0}}{{\tt 0}}{{\tt 1}}&   \ubr{{\tt 1}}{{\tt 1}}{{\tt 1}}&  \ubr{{\tt 0}}{{\tt 0}}{{\tt 0}}&
                \ubr{{\tt 0}}{{\tt 1}}{{\tt 0}}&   \ubr{{\tt 1}}{{\tt 1}}{{\tt 1}}&  \ubr{{\tt 0}}{{\tt 0}}{{\tt 0}} \\ 
        \hat{\bs} &     {\tt 0}&{\tt 0}&{\tt 1}&{\tt 0}&{\tt 0}&{\tt 1}&{\tt 0} \\
        \mbox{corrected errors} &
                         &\star & & & & & \\ 
        \mbox{undetected errors} &
                         & & & &\star & & 
        \end{array}
\]
}{%
\caption{Decoding
% Applying the maximum likelihood decoder for $\mbox{R}_3$ to 
 the received vector 
 from \protect\figref{fig.r3.transmission}.}
\label{fig.decoding.R3}
}%
\end{figure}

\noindent
% Thus the error probability is reduced by the use of this code. 
% It is easy to compute the error probability.

% Exercise 1.1. Could this be made an Example, i.e. worked through in
%        the text? -- for a beginner, there is a lot in it, and it seems to
%        be important.
%
% see exercise.sty
\exercissx{2}{ex.R3ep}{%%%%%%%% keep this as A2, but cut it from the ITPRNN list
 Show\marginpar{\footnotesize The exercise's rating, \eg,
% `{\em{A}}2'
 `[{\sl2}]'
 indicates its  difficulty:
 `1' exercises are the easiest.
% An exercise rated {\em{A}}2 is important and should not prove too difficult.
 Exercises that are accompanied by a marginal rat are especially recommended.
}
 that  the error probability is reduced by the use of {$\Rthree$}
 by computing the error probability of
 this code for a binary symmetric channel
 with noise level $f$.
%Do so.
}
%
% This fig is 0.1 inch too wide, 9801
%
\begin{figure}
%\fullwidthfigure{%
%\figuredangle{% this hung off the bottom of the page
\figuremarginb{% I think this may make a collision?
\begin{center}
\setlength{\unitlength}{0.8in}% was 0.75 98.12. changed to 0.8 99.01
\begin{picture}(7,4.3)(0,1.4)
\put(0,5){\makebox(0,0)[tl]{\psfig{figure=bitmaps/dilbert.ps,width=1in}}}
\put(0.625,5.4){\makebox(0,0){\Large$\bs$}}
\thicklines
\put(1.35,4.75){\vector(1,0){0.4}}
\put(1.55,5.4){\makebox(0,0){{\sc encoder}}}
\put(2,5){\makebox(0,0)[tl]{\psfig{figure=poster/10000.r3.ps,width=1in}}}
\put(2.625,5.4){\makebox(0,0){\Large$\bt$}}
\put(3.6,5.4){\makebox(0,0){{\sc channel}}}
\put(3.6,5.15){\makebox(0,0){$f={10\%}$}}
\put(3.4,4.75){\vector(1,0){0.4}}
\put(4,5){\makebox(0,0)[tl]{\psfig{figure=poster/10000.r3.0.10.ps,width=1in}}}
\put(4.625,5.4){\makebox(0,0){\Large$\br$}}
\put(5.6,5.4){\makebox(0,0){{\sc decoder}}}
%\put(5.6,3.4){\makebox(0,0)[tl]{\parbox[t]{1.75in}{{\em The decoder takes the majority vote of the three signals.}}}}
\put(5.4,4.75){\vector(1,0){0.4}}
\put(6,5){\makebox(0,0)[tl]{\psfig{figure=poster/10000.r3.0.10.d.ps,width=1in}}}
\put(6.625,5.4){\makebox(0,0){\Large$\hat{\bs}$}}
\end{picture}
\end{center}
}{%
\caption[a]{Transmitting 10000 source bits over a binary symmetric channel
 with $f=10\%$
% 0.1$
  using a repetition code and the majority-vote decoding 
 algorithm. The  probability 
 of decoded bit error has fallen to about 3\%; the rate has fallen 
 to 1/3.}
% \dilbertcopy
\label{fig.r3.dilbert}
}%
\end{figure}

 The error probability is dominated by the probability that two
 bits in a block of three are flipped, which scales as $f^2$.
%
% JARGON??????
%
 In the
 case of the binary symmetric channel with $f=0.1$, the {$\Rthree$} code has a
 probability of error, after decoding, of $\pb \simeq 0.03$ per bit.
 \Figref{fig.r3.dilbert} shows the 
 result of transmitting a binary
 image over a binary symmetric channel
 using the repetition code.

%  Should `rate' be explicitly defined?

The repetition code $\Rthree$ has therefore reduced the probability of
 error, as desired.
 Yet we have lost something: our
 {\em rate\/} of information transfer has fallen by a factor of
 three. So if we use a repetition code to communicate data over a telephone
 line, it will reduce the error frequency, but it will also reduce our
 communication rate. We will have to pay three times as much for each
 phone call.
% there will also be a delay
 Similarly,
%As for our disc drive,
 we would need three of the original noisy gigabyte disc drives
 in order to create a one-gigabyte disc drive with $\pb=0.03$.

 Can we 
% What happens as we try to 
 push the error probability lower, to the
 values required for a
% quality
 sellable disc drive -- $10^{-15}$?
 We could achieve lower error probabilities by using repetition 
 codes with more repetitions. 

\exercissx{3}{ex.R60}{
\ben
\item
 Show that the probability of error of $\RN$, the repetition
 code with  $N$ repetitions,  is 
\beq
 p_{\rm b} = \sum_{n=(N+1)/2}^{N} {{N}\choose{n}} f^n (1-f)^{N-n} ,
\eeq
 for  odd $N$.
\item
 Assuming $f = 0.1$, which of the terms in this sum is the biggest?
 How much bigger is it than the second-biggest term? 
\item
 Use \ind{Stirling's approximation} to approximate
% get rid of
 the ${{N}\choose{n}}$
 in the largest term, and find,
 approximately, the probability of error of the repetition
 code with  $N$ repetitions.
\item
  Assuming $f = 0.1$, find how many repetitions
 are required
% show that it takes a repetition 
% code with rate about $1/60$
 to get the probability of error 
 down to $10^{-15}$. [Answer: about 60.]
\een
}
 So to build a {\em single\/}
 gigabyte disc drive 
 with the required reliability from noisy gigabyte drives with $f=0.1$, 
 we would need {\em sixty\/} of the  noisy disc drives.
 The tradeoff between error probability and rate for repetition 
 codes is shown in \figref{fig.pbR.R}.
%
%  see end of l1.tex for method, also see poster1.gnu
%
\newcommand{\pbobject}{\hspace{-0.15in}\raisebox{1.62in}{$\pb$}%
\hspace{-0.05in}}
\begin{figure}
\figuremargin{%
\begin{center}
\begin{tabular}{cc}
\hspace{-0.2in}\psfig{figure=\codefigs/rep.1.ps,angle=-90,width=2.6in} &
\pbobject\psfig{figure=\codefigs/rep.1.l.ps,angle=-90,width=2.6in}  \\
\end{tabular} 
\end{center}
}{%
\caption[a]{Error probability $\pb$ versus rate for repetition codes
 over a binary symmetric channel with $f=0.1$.
 The right-hand figure shows $\pb$ on a logarithmic scale. We would like 
 the rate to be large and $\pb$ to be small.
}
\label{fig.pbR.R}
}%
\end{figure}
%  see end of this file for method


\subsection{Block codes -- the (7,4) Hamming code}
\label{sec.ham74}
 We would  like to  communicate with 
 tiny probability of error {\em and\/} at a substantial rate.
 Can we improve on repetition codes? What if we add redundancy to 
 {\dem blocks\/} of data instead of 
% redundantly 
 encoding one bit at a time?
% You may already have heard of the idea of `parity check bits'. 
 We now 
 study a simple {\dem{block code}}.

 A {\dem \ind{block code}\/} is a rule for converting a sequence of source
 bits $\bs$, of length $K$, say, into a transmitted sequence $\bt$ of length
 $N$ bits. To add redundancy, we make $N$ 
 greater than $K$. In a {\dem linear\/} block code,
 the extra $N-K$ bits are linear functions of the
 original $K$ bits; these extra bits are called {\dem\ind{parity check bits}}.
 An example of a \ind{linear block code}\index{error-correcting code!linear} is the \mbox{\dem(7,4)
 \ind{Hamming code}}, which transmits $N=7$ bits for every $K=4$ source
 bits.

\begin{figure}[htbp]
\figuremargin{\small%
\begin{center}
\begin{tabular}{cc}
(a)\psfig{figure=hamming/encode.eps,angle=-90,width=1.3in}  &
(b)\psfig{figure=hamming/correct.eps,angle=-90,width=1.3in}  \\
\end{tabular} 
\end{center}
}{
\caption[a]{Pictorial representation of encoding for the Hamming (7,4) 
 code.
% a and  b are not explained in the caption. Does this matter? 
%
% The parity check bits $t_5,t_6,t_7$ are set so that the parity within
%% each circle is even.
}
\label{fig.74h.pictorial}
\label{fig.hamming.pictorial}
}
\end{figure}
%
 The encoding operation for the code is shown pictorially
 in \figref{fig.74h.pictorial}.
%
% \subsubsection{Encoding}
 We arrange the seven transmitted bits in three intersecting circles.
%  as shown in \figref{fig.hamming.encode}.
 The first four
 transmitted bits,
 $t_1 t_2 t_3 t_4$, are set equal to the four source bits,
 $s_1 s_2 s_3 s_4$.
 The parity check bits\index{parity check bits}
 $t_5 t_6 t_7$ are set so that the {\dem\ind{parity}\/}
 within each circle is even:
 the first parity check bit is the parity of the first three source bits
 (that is, it is
%zero
 {\tt 0} if the sum of those bits is even, and
% one
 {\tt 1} if the sum  is odd); 
 the second is the parity of the last three; and the third parity bit 
 is the parity of source bits one, three and four. 

 As an example, \figref{fig.74h.pictorial}b shows the transmitted 
 codeword  for the case $\bs = {\tt 1000}$. 
% idea for rewriting this: go straight to pictorial story, leave out the 
% matrix description for another time.
%
%
%\noindent
%
 Table~\ref{tab.74h} shows the codewords generated
 by each of the  $2^4=$ sixteen settings of the four source bits.
% Notice that the first four transmitted bits are 
% identical to the four source bits, and the remaining three bits 
% are parity bits:
 The special property of these codewords is that
 any pair 
 differ from each other in at least three bits.
\begin{table}[htbp]
\figuremargin{%
\begin{center}
\mbox{\small
\begin{tabular}{cc} \toprule
%       Source sequence
 $\bs$ & 
% Transmitted sequence
               $\bt$ \\ \midrule
\tt     0000 &\tt 0000000 \\
\tt     0001 &\tt 0001011 \\
\tt     0010 &\tt 0010111 \\
\tt     0011 &\tt 0011100 \\ \bottomrule
\end{tabular} \hspace{0.02in}
\begin{tabular}{cc} \toprule
         $\bs$ &   $\bt$ \\ \midrule
\tt     0100 &\tt 0100110 \\
\tt     0101 &\tt 0101101 \\
\tt     0110 &\tt 0110001 \\
\tt     0111 &\tt 0111010 \\ \bottomrule
\end{tabular} \hspace{0.02in}
\begin{tabular}{cc} \toprule 
         $\bs$ &   $\bt$ \\ \midrule
\tt     1000 &\tt 1000101 \\
\tt     1001 &\tt 1001110 \\
\tt     1010 &\tt 1010010 \\
\tt     1011 &\tt 1011001 \\ \bottomrule
\end{tabular} \hspace{0.02in}
\begin{tabular}{cc} \toprule
         $\bs$ &   $\bt$ \\ \midrule
\tt     1100 &\tt 1100011 \\
\tt     1101 &\tt 1101000 \\
\tt     1110 &\tt 1110100 \\
\tt     1111 &\tt 1111111 \\ \bottomrule
\end{tabular}
}%%%%%%%%% end of row of four tables
\end{center} 
}{%
\caption[a]{The sixteen codewords
  $\{ \bt \}$ of the (7,4) Hamming  code. Any pair of
  codewords 
% have the % beautiful % elegant property that they
 differ from each other in at least three bits.}
%\label{fig.hamming.encode}
\label{tab.74h}
\label{tab.h74}
\label{fig.h74}
\label{fig.74h}
}
\end{table}
%

\begin{aside}
 Because the Hamming code is a   {linear\/} code, it can\indexs{error-correcting code!linear}
 be written  compactly in terms of matrices as follows. 
% It is a 
% {\em linear\/} code; that is, t
 The transmitted codeword $\bt$ is
% can be
 obtained 
 from the source sequence $\bs$ by a linear operation,
\beq
        \bt = \bG^{\T} \bs,
\label{eq.encode}
\eeq
 where $\bG$ is the {\dem\ind{generator matrix}} of the code,
\beq
 \bG^{\T} = {\left[ \begin{array}{cccc} 
\tt 1 &\tt 0 &\tt 0 &\tt 0 \\
\tt 0 &\tt 1 &\tt 0 &\tt 0 \\
\tt 0 &\tt 0 &\tt 1 &\tt 0 \\
\tt 0 &\tt 0 &\tt 0 &\tt 1 \\
\tt 1 &\tt 1 &\tt 1 &\tt 0 \\
\tt 0 &\tt 1 &\tt 1 &\tt 1 \\
\tt 1 &\tt 0 &\tt 1 &\tt 1  \end{array} \right] } ,
\label{eq.h74.gen}
\eeq 
 and the encoding operation (\ref{eq.encode}) uses 
  modulo-2 arithmetic [${\tt 1}+{\tt 1}={\tt{0}}$, ${\tt 0}+{\tt 1}={\tt 1}$, etc.].
%\footnote{My notational 
% convention  is that  all  vectors -- $\bs$, $\bt$, etc.\ --
% are column vectors, except that in the figures where many 
% vectors are listed, they are displayed as row vectors. The 
% generator matrix $\bG$  is written ..... as to retain 
% consistency with established notation in coding texts.}

% \begin{aside}
 In the encoding operation
 (\ref{eq.encode}) I have assumed that $\bs$ and $\bt$ are 
 column vectors. If instead they are row vectors, then this equation 
 is replaced by
\beq
        \bt =  \bs \bG,	
\label{eq.encodeT}
\eeq
 where 
\beq
       \bG = \left[ \begin{array}{ccccccc} 
 \tt 1& \tt 0& \tt 0& \tt 0& \tt 1& \tt 0& \tt 1 \\
 \tt 0& \tt 1& \tt 0& \tt 0& \tt 1& \tt 1& \tt 0 \\
 \tt 0& \tt 0& \tt 1& \tt 0& \tt 1& \tt 1& \tt 1 \\
 \tt 0& \tt 0& \tt 0& \tt 1& \tt 0& \tt 1& \tt 1 \\
  \end{array} \right] .
\label{eq.Generator}
\eeq
% f you are like me, you may
 I find it easier to relate to
 the right-multiplication (\ref{eq.encode})
 than the left-multiplication (\ref{eq.encodeT}).
% -- I like my matrices to act to the right.
 Many coding theory texts use the left-multiplying conventions 
 (\ref{eq.encodeT}--\ref{eq.Generator}), however.

 The rows of the generator matrix (\ref{eq.Generator}) can be 
 viewed as defining four basis vectors lying in a seven-dimensional
 binary space. The sixteen codewords are obtained by making all 
 possible linear combinations
% binary sums
 of these vectors.
\end{aside}


%
% should I add a cast of characters here?
% s,t,r,s^

\subsubsection{Decoding the (7,4) Hamming code}
 When we invent a more complex encoder $\bs \rightarrow \bt$,
 the task of decoding the
 received vector $\br$ becomes less straightforward. Remember that
 {\em any\/} of the bits may have been flipped, including the parity bits. 
% We can't assume that the  three extra parity bits 
%(The reader who
% is eager to see the denouement of the plot may skip ahead to section
% \ref{sec.code.perf}.)
 

% General defn of optimal decoder 
 If we assume that the channel is a binary symmetric channel and that
 all source vectors are equiprobable, 
% {\em a priori},
 then  the
 optimal decoder
% is one that
 identifies the source vector $\bs$ whose
 encoding $\bt(\bs)$ differs from the received vector $\br$ in the
 fewest bits. [{Refer to the likelihood function 
% equation
% {eq.bayestheorem}--\ref{eq.likelihood.bsc}}
 \bref{eq.likelihood.bsc}} to see why this is so.]
 We could solve the decoding problem by measuring how far $\br$
 is from each of the 
 sixteen codewords in \tabref{tab.74h} then picking the closest.
 Is there a more efficient way of finding the most probable source vector?


\subsubsection{Syndrome decoding for the Hamming code}
\label{sec.syndromedecoding}
 For the (7,4) Hamming code there is a pictorial solution to the 
% syndrome
 decoding problem, based on the  encoding  picture,
 \figref{fig.74h.pictorial}. 
%
% \subsubsection{Decoding}
%
% sanjoy says this is CONFUSING - tried to importve it Sat 22/12/01
% also romke did not like it

 As a first example, let's assume the transmission was
 $\bt = {\tt 1000101}$ and the noise flips the second bit,
 so the received vector is
 $\br = {\tt 1000101}\oplus{\tt{0100000}} = {\tt{1100101}}$.
% \ie,  $\bn=({\tt 0},{\tt 1},{\tt 0},{\tt 0},{\tt 0}, {\tt 0},{\tt 0})$,
% and the received vector 
 We write the received vector  into the three circles
 as shown in \figref{fig.hamming.decode}(a), and
 look at each of the three circles to see whether its parity is even.
 The circles whose parity is {\em{not}\/} even are shown by
 dashed lines in \figref{fig.hamming.decode}b.
% The fact that all codewords differ from each other in at least 
% three bits means that if the noise has flipped any one or two bits, 
% the received vector will no longer be a valid codeword, and some of 
% the parity checks  will be broken.
%
 The decoding task is
%We want
 to find the smallest
 set of flipped bits that can account for these violations
 of the parity rules.
% violated.
 [The  pattern of violations of the parity checks is called the {\dem\ind{syndrome}}, and can be written as a binary vector -- for example,
 in \figref{fig.hamming.decode}b, the syndrome is $\bz = ({\tt1},{\tt1},{\tt0})$,
 because the first two circles are `unhappy' (parity {\tt1}) and the
 third circle is `happy' (parity {\tt0}).]
% RESTORE ME:
%, and the task of  syndrome decoding 
% syndrome (just as a
% \ind{doctor} might seek the most probable underlying \ind{disease} to account for
% the symptoms shown by a \ind{patient}).


\begin{figure}% [htbp]
\figuremargin{\small%
\begin{center}
\begin{tabular}{ccc}
(a)\psfig{figure=hamming/decode.eps,angle=0,width=1.3in}  \\
(b)\psfig{figure=hamming/s2.eps,angle=-90,width=1.3in}  &
(c)\psfig{figure=hamming/t5.eps,angle=-90,width=1.3in}  &
(d)\psfig{figure=hamming/s3.eps,angle=-90,width=1.3in}  \\[0.3in]
\multicolumn{3}{c}{%
(e)\psfig{figure=hamming/s3.t7.eps,angle=0,width=1.3in}  
\setlength{\unitlength}{1in}
\begin{picture}(0.4,0.6)(0,0)
\put(0,0.6){\vector(1,0){0.6}}
\end{picture}
% \raisebox{0.6in}{$\rightarrow$}
(${\rm e}'$)\psfig{figure=hamming/s3.t7.d.eps,angle=0,width=1.3in}  
}\\
\end{tabular} 
\end{center}
}{%
\caption[a]{Pictorial representation of decoding of the Hamming (7,4) 
 code. The received vector is written into the diagram
 as shown in (a).
 In (b,c,d,e), the received vector is
 shown, assuming that the transmitted vector was
 as in
% The bits that are flipped relative to
 \protect
 \figref{fig.hamming.pictorial}(b) and the bits labelled by $\star$
 were flipped. The violated 
 parity checks are highlighted by dashed circles. One of the seven bits 
 is the most probable suspect to account for each `\ind{syndrome}', \ie, each 
 pattern of violated and satisfied parity checks. 

 In examples  (b), (c), and (d), the most probable suspect is
 the one bit that was flipped.

 In example (e), two bits  have been flipped, $s_3$ and $t_7$.
 The most probable suspect is $r_2$, marked by a circle in (${\rm e}'$),
 which shows the output of the decoding algorithm. 
% each circle is even.
}\label{fig.hamming.decode}
\label{fig.hamming.s2}% these labels were in the wrong place feb 2000
\label{fig.hamming.s3}
\label{fig.hamming.correct}
}
\end{figure}
%
% ACTION: sanjoy still thinks this part is hard to follow - fixed Sat 22/12/01?
 To solve the decoding task,
% problem,
 we ask the question:
 can we find  a unique bit that lies {\em inside\/}
 all the `unhappy' circles and {\em outside\/} all the
 `happy' circles? If so, the flipping of that bit
 would account for the observed
 syndrome. 
 In the case shown in   \figref{fig.hamming.s2}(b),
 the bit $r_2$
% that was flipped
 lies  inside the  two unhappy circles and outside the happy
 circle;
 no other single bit has this property, so
 $r_2$  is the only single bit capable of explaining the syndrome.

 Let's work through a couple more examples. 
  \Figref{fig.hamming.s2}(c) shows what happens if one of the
 parity bits, $t_5$, is flipped by the noise. Just one of the checks
 is violated. Only $r_5$ lies inside this unhappy circle and outside
the other two happy circles,
 so $r_5$ is  identified as the only single bit
 capable of explaining the syndrome.

 If the central bit $r_3$ is received flipped, 
  \figref{fig.hamming.s3}(d) shows that all three checks are violated;
 only $r_3$ lies inside all three circles, so $r_3$  is
 identified as the  suspect bit.

 If you try flipping any one of the seven bits, you'll find 
 that a different syndrome is obtained in each case -- seven non-zero syndromes,
 one for each bit. There is only 
 one other syndrome, the all-zero syndrome. So if
 the channel is a binary symmetric channel with a
 small noise level $f$, the optimal 
 decoder unflips at most one bit, depending  on the 
 syndrome, as shown in \algref{tab.hamming.decode}.
 Each syndrome could have been caused by other noise patterns
 too, but any other noise pattern that has the same syndrome 
 must be less probable because it involves a larger number of 
 noise events.

%\begin{figure}
%\figuremargin{%
\begin{algorithm}
\algorithmmargin{%
\begin{center}
\begin{tabular}{c*{8}{c}}
% Fri 4/1/02 removed toprule and bottomrule because algorithm has its own frame
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% \toprule
Syndrome $\bz$ &            {\tt 000} & {\tt 001} & {\tt 010} & {\tt 011} & {\tt 100} & {\tt 101} & {\tt 110} & {\tt 111} \\ \midrule
Unflip this bit  & {\small{\em none}} &   $r_7$      & $r_6$          &        $r_4$  &  $r_5$       & $r_1$          & $r_2$         & $r_3$ \\
% \bottomrule
% Unflip this bit  & {\small{\em none}} &   7      & 6          &        4  &  5       & 1          & 2         & 3 \\
% \bottomrule
% this is appropriate only if z =z3,z2,z1: 
% Unflip this bit  & {\small{\em none}} & 5      & 6          &        2  &  7       & 1          & 4         & 3 \\  \hline
\end{tabular}
\end{center} 
%\begin{center}
%\begin{tabular}{cc}  \hline
%Syndrome $\bz$ & % 3 2 1 !!!!!!!!!!!!!!!!!!!
%Flip this bit  \\  \hline
% 000 &{\small{\em none}} \\
% 001 &5\\
% 010 &6\\
% 011 &2\\
% 100 &7\\
% 101 &1\\
% 110 &4\\
% 111 &3 \\  \hline
%\end{tabular}
%\end{center}
}{%
\caption[a]{Actions taken by the optimal decoder for the (7,4) Hamming 
 code, assuming a binary symmetric channel with small noise level $f$.
 The syndrome vector $\bz$ lists whether each parity check is 
 violated ({\tt 1}) or satisfied ({\tt 0}),
 going through the checks in the order
 of the bits $r_5$, $r_6$,
 and $r_7$. }
\label{tab.hamming.decode}
}%
\end{algorithm}

 What happens if the noise actually flips more than one bit? 
  \Figref{fig.hamming.s3}(e) shows the situation when two bits, 
 $r_3$ and $r_7$, are received flipped.  The syndrome, {\tt 110},
 makes us suspect the single bit $r_2$; so our optimal decoding algorithm 
 flips this bit, giving a decoded pattern with three errors 
 as shown in   \figref{fig.hamming.s3}(${\rm e}'$).
 If we use the optimal decoding algorithm, 
 any two-bit error pattern will lead to a decoded seven-bit vector 
 that contains three errors. 

\subsection{General view of decoding for linear codes: syndrome decoding}
\label{sec.syndromedecoding2}
\begin{aside}
% {\em (Does some of this stuff belong earlier in the pictorial area?)}
 We can also  describe the decoding problem
 for a linear code in terms of matrices.\index{syndrome decoding} 
% In the  case of a linear code and a  symmetric channel, 
% the decoding task can be re-expressed as {\bf syndrome decoding}.
% Let's assume that the noise level $f$ is less than $1/2$.
 The first four received bits, $r_1r_2r_3r_4$, purport to be  
 the four source bits; and the received bits $r_5r_6r_7$ purport
 to be the parities of the source bits, as defined by the generator 
 matrix $\bG$. We evaluate the three parity check bits for the 
 received bits, $r_1 r_2r_3 r_4$, and see whether
 they match the three received 
 bits, $r_5r_6r_7$. The differences (modulo 2) between 
 these two triplets are called the {\dbf\ind{syndrome}}
 of the received vector. 
 If the syndrome is zero -- if all three parity checks are happy
% agree with  the corresponding received bits
 -- then the received vector is a codeword, 
 and the most probable decoding is given by reading out its first four 
 bits.  If the syndrome is non-zero, then
% we are certain that
 the noise 
 sequence for this block was non-zero, and the syndrome is our 
 pointer to the most probable error pattern. 

 The computation of  the  syndrome vector is a
 linear operation. If we define the $3 \times 4$ matrix $\bP$
 such that  the matrix of 
 equation (\ref{eq.h74.gen})
is
\beq
        \bG^{\T} = \left[ \begin{array}{c}{\bI_4}\\
 \bP\end{array} \right], 
\eeq
 where $\bI_4$ is the $4\times 4$ identity matrix,  then 
 the syndrome vector is $\bz  = \bH \br$, where the {\dbf\ind{parity check matrix}}
 $\bH$ is given by $\bH =  \left[ \begin{array}{cc} -\bP & \bI_3 \end{array}
 \right]$; in  modulo 2 arithmetic, $-1 \equiv 1$, so
\beq
        \bH =   \left[ \begin{array}{cc} \bP & \bI_3 \end{array}
 \right] = \left[ 
 \begin{array}{ccccccc} 
\tt  1&\tt 1&\tt 1&\tt 0&\tt 1&\tt 0&\tt 0 \\
\tt  0&\tt 1&\tt 1&\tt 1&\tt 0&\tt 1&\tt 0 \\
\tt  1&\tt 0&\tt 1&\tt 1&\tt 0&\tt 0&\tt 1
 \end{array} \right] .
\label{eq.pcmatrix}
\eeq
 All the codewords $\bt = \bG^{\T} \bs$ of the code satisfy
\beq
	\bH \bt = \left[ {\tt \begin{array}{c} \tt0\\ \tt0\\ \tt0 \end{array} } \right] .
% (0,0,0)  .
\eeq
\exercisxB{2}{ex.GHis0}{
 Prove that this is so by evaluating the $3\times4$ matrix $\bH \bG^{\T}$.
}
 Since the received vector $\br$ is given by $\br = \bG^{\T}\bs + \bn$,
% and $\bH \bG^{\T}$=0, 
 the syndrome-decoding problem is  to find the
 most probable noise vector $\bn$ satisfying
 the equation 
\beq
        \bH \bn = \bz .
\eeq
 A decoding algorithm that solves this problem is called 
 a {\dem maximum-likelihood decoder}. We will discuss 
 decoding problems like this  in later  chapters. 
%\footnote{Somewhere in this book
% I need to spell out Bayes' theorem for decoding. Here would be 
% a good spot; but on the other hand, people can understand decoding
% intuitively, they don't need Bayes theorem and they might find it 
% a hindrance if they were not only being hit by 
% Shannon's theorem but also by likliehoods and priors.}
%
% ACTION NEEDED ????????????????????????????????????????
%
\end{aside}

\begin{figure}
%\fullwidthfigure{%
\figuredanglenudge{%
\begin{center}
\setlength{\unitlength}{0.8in}% was 1in, with figures 1.25 wide % then was 0.8 with 1in
\begin{picture}(7,2.7)(0,2.8)
\put(0,5){\makebox(0,0)[tl]{\psfig{figure=bitmaps/dilbert.ps,width=1in}}}
\put(0.625,5.4){\makebox(0,0){\Large$\bs$}}
\thicklines
\put(1.35,4.75){\vector(1,0){0.4}}
\put(1.55,5.4){\makebox(0,0){{\sc encoder}}}
\put(2,5){\makebox(0,0)[tl]{\psfig{figure=poster/10000.h74.ps,width=1in}}}
\put(1.982,3.75){\makebox(0,0)[tr]{{parity bits} $\left.\rule[-0.342in]{0pt}{0.342in} \right\{$}}
\put(2.625,5.4){\makebox(0,0){\Large$\bt$}}
\put(3.6,5.4){\makebox(0,0){{\sc channel}}}
\put(3.6,5.15){\makebox(0,0){$f={10\%}$}}
\put(3.4,4.75){\vector(1,0){0.4}}
\put(4,5){\makebox(0,0)[tl]{\psfig{figure=poster/10000.h74.0.10.ps,width=1in}}}
\put(4.625,5.4){\makebox(0,0){\Large$\br$}}
\put(5.6,5.4){\makebox(0,0){{\sc decoder}}}
%\put(5.6,3.5){\makebox(0,0)[tl]{\parbox[t]{1.75in}{{\em The decoder picks the $\hat{\bs}$ with maximum likelihood.}}}}
\put(5.4,4.75){\vector(1,0){0.4}}
\put(6,5){\makebox(0,0)[tl]{\psfig{figure=poster/10000.h74.0.10.d.ps,width=1in}}}
\put(6.625,5.4){\makebox(0,0){\Large$\hat{\bs}$}}
\end{picture}
\end{center}
}{%
\caption[a]{Transmitting 10,000 source bits over a binary symmetric channel
 with $f=10\%$
%0.1$
  using a (7,4) Hamming code. The  probability 
 of decoded bit error is about 7\%.}
% \dilbertcopy}
\label{fig.h74.dilbert}
}{0.7in}% third argument is the upward nudge of the caption
\end{figure}
\subsection{Summary of the (7,4) Hamming code's properties}
 Every possible received vector of length 7 bits is either a codeword,
 or it's one flip away from a codeword.\index{Hamming code}

 Since there are three parity constraints, each of which might
 or might not be violated, there are
 $2\times 2\times 2= 8$
% eight
 distinct syndromes. They can be divided 
 into seven non-zero syndromes --  one
 for each of the one-bit error patterns --
 and the all-zero syndrome, corresponding to the zero-noise case. 

 The optimal decoder takes no action if the syndrome is zero, 
 otherwise it uses this mapping of non-zero syndromes onto one-bit error 
 patterns to unflip the suspect bit. 

 There is a {\dbf decoding error}   if the four decoded bits $\hat{s}_1,
 \ldots, \hat{s}_4$ do not all match the source bits ${s}_1,
 \ldots, {s}_4$. The {\dbf probability of block error} $\pB$ is 
 the probability that one or more of the decoded bits in one block fail to 
 match the corresponding source bits,
\beq
 \pB = P( \hat{\bs} \neq \bs ) .
\eeq
 The {\dbf probability of bit error} $\pb$ is 
 the average probability
%  per decoded bit
 that a decoded bit fails to 
 match the corresponding source bit,
\beq
        \pb =  \frac{1}{K} \sum_{k=1}^K P( \hat{s}_k \neq s_k ) .
\eeq

 In the case of the Hamming code, 
 a decoding error will occur whenever  the noise has flipped more than 
 one bit in a block of seven. 
%  Any noise pattern that flips more than one bit will give rise to one of 
%  these syndromes, and our decoder will make an erroneous decision. 
%
 The probability of block error is thus the probability that two or more 
 bits are flipped in a block. This probability scales as $O(f^2)$, as did the 
 probability of error for the repetition code 
 $\Rthree$. But notice that the Hamming code 
 communicates at a greater rate, $R=4/7$. 

 \Figref{fig.h74.dilbert} shows a binary image transmitted over 
 a binary symmetric channel using the (7,4) Hamming code. 
 About 7\% of the decoded bits are in error. Notice that 
 the errors are correlated:
% with each other:
 often two or three successive
 decoded bits are flipped.

\exercissxA{1}{ex.Hdecode}{
 This exercise and the next three  refer to the 
  (7,4) \ind{Hamming code}.  Decode the received strings:
\ben
\item $\br = {\tt 1101011}$ % 10
\item $\br = {\tt 0110110}$ % 4
\item $\br = {\tt 0100111}$ % 4
\item $\br = {\tt 1111111}$. % 15
\een
}
\exercisxA{2}{ex.H74p}{
\ben \item
 Calculate the probability of block error $p_{\rm B}$ of the (7,4) Hamming 
 code 
 as a function of the noise level $f$ and show that to leading order
% \footnote{Do I need to explain what this means? Or use a different
%  terminology? Maybe only physicists are familiar?} 
%
% ACTION!!!
%
 it goes as $21 f^2$.
\item
% }
% \exercis{}{
 $^{B3}$
 Show that to leading order  the probability of 
 bit error $\pb$ goes as $9 f^2$. 
\een}
\exercisxA{2}{ex.H74zero}{
% Hamming (7,4) code.
 Find some noise vectors that give the all-zero syndrome (that is, 
 noise vectors that leave all the parity checks unviolated). 
 How many such noise vectors are there?
}
% they are the codewords. 
\exercisxB{2}{ex.H74detail}{
% Hamming (7,4) code.
 I asserted above that a block decoding error will result 
 whenever two or more bits are flipped in a single block. 
 Show that this is indeed so. [In principle, there might be 
 error patterns that, after decoding, led only to the corruption 
 of the parity bits, without the source bits' being incorrectly 
 decoded.] 
}
\exercisxB{2}{ex.R9}{
 Consider
% the encoder for
 the repetition code $\Rnine$. One way 
 of viewing this code is as a \index{concatenation!error-correcting codes}{\dbf{concatenation}} of  $\Rthree$ with 
 $\Rthree$. We first encode the source stream with $\Rthree$, then encode 
 the resulting output with $\Rthree$. We could call this code `$\Rthree^2$'.
 This idea motivates an alternative decoding algorithm, in which we decode the 
 bits three at a time using the decoder for $\Rthree$; then decode the
 decoded bits from that first decoder using the decoder for $\Rthree$. 

 Evaluate the probability of error for this decoder and compare it with the
 probability of error for the optimal decoder for $\Rnine$. 

 Do the concatenated encoder and decoder for $\Rthree^2$ have advantages over 
 those for $\Rnine$? 
}

\subsection{Summary of codes' performances}
\label{sec.code.perf}
 Figure~\ref{fig.pbR.RH} shows the performance of \ind{repetition code}s and
 the \ind{Hamming code}. It also shows the performance of a family of linear
 block codes that are generalizations of Hamming codes, \ind{BCH codes}.  
% Reed-Muller codes, and
%  see end of this file for method
% 
\begin{figure}[htbp]
\figuremargin{%
\begin{center}
\begin{tabular}{cc}
\hspace{-0.2in}\psfig{figure=\codefigs/rephambch.1.ps,angle=-90,width=2.6in} &
\pbobject\psfig{figure=\codefigs/rephambch.1.l.ps,angle=-90,width=2.6in}  \\
\end{tabular} 
\end{center}
}{%
\caption[a]{Error probability $\pb$ versus rate $R$ for repetition codes,
  the (7,4) Hamming code and BCH codes with block lengths up to 1023
 over a binary symmetric channel with $f=0.1$.
 The right-hand figure shows $\pb$ on a logarithmic scale.}
\label{fig.pbR.RH}
}
\end{figure}
%

%\noindent
% use this noindent if the ``h'' (here) works, otherwise new para.
 This figure shows that we can, using linear block codes, achieve better
 performance than repetition codes; but the asymptotic situation still
 looks  grim. 
\exercisxA{4}{ex.makecode}{
% invent your own code 
 Design an error-correcting code and a decoding algorithm for it,
 compute its probability of error, 
 and add it to figure~\ref{fig.pbR.RH}.
 [Don't worry if you find it difficult to make a code better than the
 Hamming code, or if you find it difficult to find a good
 decoder for your code; that's the point of this exercise.]
}
\exercisxA{3}{ex.makecode2error}{
 A (7,4) Hamming code
 can correct any {\em one\/} error;  might there be a (10,4) code
 that can correct any two errors? What about a (9,4) code?

 {\sf Optional extra:} Does the answer to this question
 depend on whether the code is linear or nonlinear?
}
\exercisxA{4}{ex.makecode2}{
	 Design an error-correcting code, other than
 a repetition code, that can
 correct any {\em two\/} errors  in a block of size $N$.
}

\section{What performance can the best codes achieve?}
 There seems to be a trade-off between the decoded bit-error
 probability $\pb$ (which we would like to reduce) and the rate $R$ (which
 we would like to keep large).  How can this trade-off be
 characterized?
%  Can we do better than repetition codes?
 What points in
 the $(R,\pb)$ plane are achievable?  This question was addressed by
 \ind{Shannon} in his pioneering paper of 1948, in which he both created the
 field of information theory and solved most of its fundamental
 problems.
%  in the same paper.

 At that time  there was a widespread belief that the 
 boundary between achievable and nonachievable points in the 
 $(R,\pb)$ plane was a curve passing through the origin $(R,\pb) = (0,0)$; 
 if this were so, then,  in order to achieve a vanishingly small 
 error probability $\pb$, one would have to reduce the rate 
 correspondingly close to zero.
%  (figure ref here).
% This would seem a reasonable guess, 
% in accordance with the general rule that the better something works
% the more you have to pay for it. 
%
% ACTION: sanjoy doesn't like This
%
 `No pain, no gain.'

 However, Shannon proved the remarkable result that\wow\
% , for any given  channel,
 the boundary 
 between achievable and nonachievable points meets the $R$ 
 axis at a {\em non-zero\/} value $R=C$, as shown in \figref{fig.pbR.RHS}. 
\begin{figure}[htbp]
\figuremargin{%
\begin{center}
\begin{tabular}{cc}
\hspace{-0.2in}\psfig{figure=\codefigs/repshan.1.ps,angle=-90,width=2.6in} &
\pbobject\psfig{figure=\codefigs/repshan.1.l.ps,angle=-90,width=2.6in}  \\
\end{tabular} 
\end{center}
}{%
\caption[a]{Shannon's noisy-channel coding theorem.
 The solid curve shows  the Shannon limit 
 on achievable values of $(R,\pb)$  for
 the binary symmetric channel with $f=0.1$.
 Rates up to $R=C$ are achievable with arbitrarily small $\pb$.
 The  points show the performance of some textbook codes,
 as in \protect\figref{fig.pbR.RH}.


 The equation defining the Shannon limit (the solid curve) is 
%\[
        $R = \linefrac{C}{(1-H_2(\pb))},$
%\]
 where $C$ and $H_2$ are defined in \protect \eqref{eq.capacity}.
}
\label{fig.pbR.RHS}
}
\end{figure}
%  see end of this file for method
%
 For any channel, there exist codes that make it possible to 
 communicate  with {\em arbitrarily small\/} probability of 
 error $\pb$ at non-zero rates.  The first half of this book (parts I--III) will be 
 devoted  to understanding  this  remarkable result, which is called 
 the {\dbf\ind{noisy-channel coding theorem}}.

\subsection{Example: $f=0.1$}% A few details}
 The maximum rate at which communication is possible with arbitrarily
 small $\pb$ is called the {\dbf\ind{capacity}} of the channel.\index{channel!capacity} 
 The formula for the capacity of a binary
 symmetric channel  with noise level $f$ is\index{binary entropy function} 
\beq
        C(f) = 1 - H_2(f) = 1 - \left[ f \log_2
                 \frac{1}{f} + (1-f) \log_2 \frac{1}{1-f} \right] ;
\label{eq.capacity}
\eeq
 the channel we were discussing earlier with noise level $f=0.1$
 has capacity $C \simeq 0.53$. Let us consider what this means in terms 
 of noisy disc drives. The \ind{repetition code} $\Rthree$ could communicate over this
 channel with $\pb=0.03$ at a rate $R = 1/3$. Thus we know how
 to  build a single  gigabyte disc drive with $\pb = 0.03$
 from three noisy  gigabyte disc drives. We also know how to make 
 a single  gigabyte disc drive 
 with  $\pb \simeq 10^{-15}$ from sixty
 noisy one-gigabyte drives \exercisebref{ex.R60}.
 And now \ind{Shannon} passes by, notices us 
 \ind{juggling}
% tinkering 
 with disc drives and codes and says:
\begin{quotation}
\noindent
        `What performance are you trying to achieve? 
        $10^{-15}$? You don't need {\em sixty\/} disc drives   -- 
        you can get that performance with just  
         {\em two\/} disc drives (since 1/2 is  less than $0.53$).  
%       (The capacity is 0.53, so the number of disc drives needed at 
%       capacity is 1/0.53.)
%       `
        And if you want $\pb = 10^{-18}$
% , or $10^{-21}$,
         or $10^{-24}$ or anything,
        you can get there with  two disc drives too!'
\end{quotation}
%\begin{aside}
 [Strictly, the above statements might  not be quite right, since,
 as we shall see, Shannon
 proved his 
 noisy-channel coding theorem
%proves the achievability of ever smaller
% error probabilities at a given rate $Ra$)
 is defined to be $\int_{a}^{b} \d v \: P(v)$. $P(v)dv$ is dimensionless.
 The density $P(v)$ is a dimensional
 quantity, having dimensions inverse to the dimensions of $v$ -- in contrast
 to discrete  probabilities, which are dimensionless.
 Conditional and joint probability densities 
 are defined  in just the same way as conditional and joint probabilities.
% , which is why I choose not to use different notation for them.
 }}
 solve the problem posed in \exampleref{exa.bentcoin}.
 Sketch the posterior distribution of $f_H$
 and compute the probability that the $N\!+\!1$th outcome will be a head,
 for
\ben
\item	$N=3$ and $n_H=0$; 
\item	$N=3$ and $n_H=2$; 
\item	$N=10$ and $n_H=3$; 
\item
	$N=300$ and $n_H=29$.
\een
 You will find the beta integral useful: 
\beq
 \int_0^1 d p_a \: p_a^{F_a} (1-p_a)^{F_b}  = 
        \frac{\Gamma(F_a+1)\Gamma(F_b+1)}{ \Gamma(F_a+F_b+2) } 
        = \frac{ F_a! F_b! }{ (F_a + F_b + 1)! } .
\eeq
 You may also find it instructive to look back at
 \exampleref{ex.ip.urns} and \eqref{eq.laplace.succession.first}.
}
 
 People sometimes confuse assigning a prior distribution 
 to an unknown parameter such as $f_H$ with making an initial guess 
 of the {\em{value}\/} of the parameter. 
% But priors  are not values, they are distributions.
 But the prior over $f_H$, $P(f_H)$, is not a simple statement
 like `initially, I would guess $f_H = \dhalf$'. 
 The prior is a probability density over $f_H$ which 
  specifies the prior degree of belief that $f_H$ lies
 in any interval $(f,f+\delta f)$. It may well be the case
 that our prior   for $f_H$ is symmetric about $\dhalf$, so that the
 {\em mean\/} of  $f_H$ under the prior is $\dhalf$.
%under our prior for $f_H$, the {\em mean\/} of  $f_H$ is $\dhalf$
% -- on symmetry grounds for example.
 In this case, the 
 predictive distribution {\em for the first toss\/} $x_1$ would indeed be 
\beq
	P(x_1 \!=\! \mbox{head}) = 
	\int \! df_H \: P(f_H) P(x_1 \!=\! \mbox{head}| f_H)
	= \int \! df_H \: P(f_H)  f_H = \dhalf .
\eeq
 But the prediction for subsequent tosses will depend on
 the whole prior distribution, not just its mean.

\subsubsection{Data compression and inverse probability}
 Consider the following task.
\exampl{ex.compressme}{
 Write a computer  program capable of compressing  binary files like this
 one:\par
\begin{center}{\footnotesize%was tiny
{\tt 0000000000000000000010010001000000100000010000000000000000000000000000000000001010000000000000110000}\\
{\tt 1000000000010000100000000010000000000000000000000100000000000000000100000000011000001000000011000100}\\
{\tt 0000000001001000000000010001000000000000000011000000000000000000000000000010000000000000000100000000}\\[0.1in]% added this space Sat 21/12/02
}
\end{center}
%  This file contains N=300 and n_1 = 29
 The string shown contains $n_1=29$ {\tt 1}s 
 and $n_0=271$ {\tt 0}s.
% What is the probability that the next character in this file
% is a {\tt 1}? 
}
 Intuitively, compression works by taking advantage of the predictability
 of a file. In this case, the source of the file
 appears more likely to emit
 {\tt 0}s than {\tt 1}s. A data compression program that compresses
 this file must, implicitly or explicitly, be addressing the
 question `What is the probability that the next character in this file
 is a {\tt 1}?' 


 Do you think this problem is similar in character
 to \exampleref{exa.bentcoin}? I do. One of the themes
 of this book is  that data compression and
 data modelling are one and the same, and that they should
 both be addressed, like the  urn of example \ref{ex.ip.urns},
 using inverse probability. 
 \Exampleonlyref{ex.compressme} is solved in  \chref{ch4}.
%
% SOLVE IT HERE???
%
\subsection{The likelihood principle}
\label{sec.lp}
 Please solve the following two exercises.
\exampl{ex.lp1}{
 \marginfig[t]{\psfig{figure=figs/urnsA.ps,width=1.6in}}Urn
 A contains three balls: one black, and two white;
 urn B contains three balls: two black, and one white.
 One of the urns is selected at random and one ball
 is drawn. The ball is black. What is the probability
 that the selected urn is urn A?
}
%
\exampl{ex.lp2}{
 \marginfig{\psfig{figure=figs/urns.ps,width=1.6in}}Urn
 A contains five balls: one black, two white, one green and one pink;
 urn B contains five hundred balls:
 two hundred black, one hundred white, 50 yellow, 40 cyan, 30 sienna,
 25 green, 25 silver, 20 gold, and 10  purple.
 [One fifth of A's balls are black; two-fifths of B's are black.]
 One of the urns is selected at random and one ball
 is drawn. The ball is black. What is the probability
 that the urn is urn A?
}
%
 What do you notice about your solutions?  Does each answer
 depend on the detailed contents of each urn?

 The details of the other possible outcomes and their probabilities
 are irrelevant. All that matters is the probability of the outcome
 that actually happened (here, that the ball drawn was  black) given the different
 hypotheses. We need only  to know the {\em likelihood}, \ie,
 how the probability  of the  data that happened varies with the
 hypothesis.%
\amarginfig{b}{
 {\sf The likelihood principle:}
 given a generative model for data $d$ given parameters $\btheta$, $P(d|\btheta)$,
 and having observed a particular outcome $d_1$, all inferences
 and predictions should depend only on the function $P(d_1|\btheta)$. 
}
 This simple rule about inference
 is known as the {\dbf\ind{likelihood principle}}.\label{sec.likelihoodprinciple}
%
% NOTE %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%
% { \em (connect back to this point when discussing
% early stopping and inference in problems where the stopping rule is not known.)}
%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% README NOTE!!!!!!!!!!
 [And,  in spite of the simplicity of this principle,
 many classical statistical methods violate it.]\index{classical statistics!criticisms}

\newpage
\section{Definition of entropy and related functions}
\begin{description} 
\item[The Shannon information content of an outcome $x$] is defined to be
%	We define for each $x \in \A_X$, $
\beq
	h(x) = \log_2 \frac{1}{P(x)} .
\eeq 
% We can interpret $h(a_i)$ as the information content of the event 
% $x \eq  a_i$.
 It is measured in bits. [The word `bit' is also used to
 denote a variable whose value is 0 or 1; I hope context will
 always make clear which of the two meanings is intended.]

\noindent
 In the next few chapters, we will establish  	that 
 the Shannon information content  $h(a_i)$  is indeed a natural measure of 
 the  information content of the event $x \eq a_i$.
 At that point, we will shorten the name of this quantity to 
 `the information content'. 

\marginfig{%
\begin{center}\small%footnotesize
%
% vertical table of a-z with probabilities, and information contents too;
% four decimal place
\begin{tabular}[t]{cccr} \toprule
$i$ & $a_i$ & $p_i$ & \multicolumn{1}{c}{$h(p_i)$} \\ \midrule
% $i$ & $a_i$ & $p_i$ & \multicolumn{1}{c}{$\log_2 \frac{1}{p_i}$} \\ \midrule
%
1 & {\tt a} &.0575  &  4.1 \\ 
2 & {\tt b} &.0128  &  6.3 \\ 
3 & {\tt c} &.0263  &  5.2 \\ 
4 & {\tt d} &.0285  &  5.1 \\ 
5 & {\tt e} &.0913  &  3.5 \\ 
6 & {\tt f} &.0173  &  5.9 \\ 
7 & {\tt g} &.0133  &  6.2 \\ 
8 & {\tt h} &.0313  &  5.0 \\ 
9 & {\tt i} &.0599  &  4.1 \\ 
10 &{\tt j} &.0006  & 10.7 \\ 
11 &{\tt k} &.0084  &  6.9 \\ 
12 &{\tt l} &.0335  &  4.9 \\ 
13 &{\tt m} &.0235  &  5.4 \\ 
14 &{\tt n} &.0596  &  4.1 \\ 
15 &{\tt o} &.0689  &  3.9 \\ 
16 &{\tt p} &.0192  &  5.7 \\ 
17 &{\tt q} &.0008  & 10.3 \\ 
18 &{\tt r} &.0508  &  4.3 \\ 
19 &{\tt s} &.0567  &  4.1 \\ 
20 &{\tt t} &.0706  &  3.8 \\ 
21 &{\tt u} &.0334  &  4.9 \\ 
22 &{\tt v} &.0069  &  7.2 \\ 
23 &{\tt w} &.0119  &  6.4 \\ 
24 &{\tt x} &.0073  &  7.1 \\ 
25 &{\tt y} &.0164  &  5.9 \\ 
26 &{\tt z} &.0007  & 10.4 \\ 
27 &{\tt{-}}&.1928  &  2.4 \\ \midrule
%27 &\verb+-+&.1928  &  2.4 \\ \midrule
 & & & \\[-0.1in]
\multicolumn{3}{r}{
$\displaystyle \sum_i p_i \log_2 \frac{1}{p_i}$
} & 4.1  \\ \bottomrule % 4.11
\end{tabular}\\ 

\end{center}
%  vertical table of a-z with probabilities, and information contents too;
\caption[a]{Shannon information contents of the outcomes {\tt a}--{\tt z}.}
\label{fig.monogram.log}
}
%
 The fourth column  in  \figref{fig.monogram.log} shows the Shannon 
 information content of the 27 possible outcomes when
 a 
 random character is picked from an English document. The 
 outcome
% character
 $x={\tt z}$ has a Shannon information content of
 10.4 bits, and $x={\tt e}$ has an information content of 3.5 bits.


\item[The entropy of an ensemble $X$] is defined to be the average Shannon information 
 content of an outcome:
% from that ensemble:
\beq
	H(X) \equiv \sum_{x \in \A_X} P(x) \log \frac{1}{P(x)}, 
\eeq
%\beq
%	H(X) = \sum_i p_i \log \frac{1}{p_i}, 
%\eeq
 with the convention for $P(x) \eq 0$ that \mbox{$0 \times \log 1/0 \equiv 0$},
 since \mbox{$\lim_{\theta\rightarrow 0^{+}} \theta \log 1/\theta \eq  0 $}.

 Like the information content, entropy is measured in bits. 

 When it is convenient, we may also write $H(X)$ as $H(\bp)$, 
 where $\bp$ is the vector $(p_1,p_2,\ldots,p_I)$.
 Another name for the entropy of $X$ is the uncertainty of $X$. 
\end{description}
\noindent
% The entropy is  a measure of the information content or 
% `uncertainty' of $x$. The question of why entropy is a 
% fundamental measure of information content will  be discussed in the 
% forthcoming chapters. Here w

% was continued example
\exampl{eg.mono}{
 The entropy of a 
 randomly selected letter in an English document
 is about  4.11 bits, assuming its probability 
 is as given in  \figref{fig.monogram.log}.
%, p.\ \pageref{fig.monogram}.
%   \tabref{tab.mono}. 
 We obtain this number  by  averaging $\log 1/p_i$ (shown in the fourth 
 column) under the probability distribution $p_i$ (shown in the third column).  
}

 We now note some properties of the entropy function.
\bit
\item  
	$H(X) \geq 0$ with equality iff $p_i \eq  1$ for one $i$.
\marginpar{\footnotesize{`iff' is short for 
 `if and only if'.}}
\item Entropy is maximized if $\bp$ is uniform:
\beq
	H(X) \leq \log(|\A_X|)
 \:\: \mbox{ with equality iff $p_i \eq  1/|\A_X|$ for all $i$. }
\eeq
% \footnote{Exercise: Prove this assertion.}
 {\sf Notation:}\index{notation!absolute value}\index{notation!set size}
 the vertical bars `$|\cdot|$'
 have two meanings.
% If $X$ is an ensemble, then
 If $\A_X$ is  a set, 
 $|\A_X|$ denotes the number of elements in  $\A_X$;
 if $x$ is a number,
% for example, the value of a random variable,
 then $|x|$ is the absolute  value of $x$.
\eit
%
% Mon 22/1/01
 The {\dem\ind{redundancy}}
 measures the fractional difference
 between $H(X)$ and its maximum possible value,
 $\log(|\A_X|)$.
\begin{description}% 
\item[The redundancy of $X$] is:
\beq
	1 - \frac{H(X)}{\log |\A_X|} .
\eeq
	We won't make use of `redundancy'
% need this definition
 in this book, so
 I have not assigned a symbol to it.
% -- it would be redundant.
\end{description}
% ha ha
% funny but true.
% example: X is select a codeword from a code - H(X) = K, but |X| = 2^N
%
% Redundancy = 1 - R
%  of code


\begin{description}% duplicated in _l1a and _p5A
\item[The joint entropy of $X,Y$] is:
\beq
	H(X,Y) = \sum_{xy \in \A_X\A_Y} P(x,y) \log \frac{1}{P(x,y)}.
\eeq
	Entropy is additive for independent random variables:
\beq
	H(X,Y) = H(X) +H(Y) \:\mbox{ iff }\: P(x,y)=P(x)P(y).
\label{eq.ent.indep}% also appears in p5a (.again)
\eeq
\end{description}
\label{sec.entropy.end.parta}
 Our definitions for information content
 so far  apply only to discrete probability distributions
 over finite sets $\A_X$.  The definitions can be extended
 to infinite sets, though the entropy may then be infinite.
 The case of a probability {\em density\/} over a continuous set is
 addressed in section \ref{sec.entropy.continuous}. 
 Further important definitions and exercises to do with entropy
 will come along in  section \ref{sec.entropy.contd}.

\section{Decomposability of the entropy}
 The entropy function satisfies a recursive property
 that can be very useful when computing entropies.
 For convenience, we'll stretch our notation
 so that we can write $H(X)$ as $H(\bp)$, where
 $\bp$ is the probability vector  associated with the ensemble $X$.

 Let's illustrate the property by an example first.
 Imagine that a random variable $x \in \{ 0,1,2 \}$
 is created by first flipping a fair coin to determine
 whether $x = 0$; then, if $x$ is not 0,
 flipping a fair coin a second time to determine whether
 $x$ is 1 or 2.
 The probability distribution of $x$ is
\beq
	P( x\! =\! 0 )  = \frac{1}{2} ; \:\:
	P( x\! =\! 1 )  = \frac{1}{4} ; \:\:
	P( x\! =\! 2 )  = \frac{1}{4} .
\eeq
 What is the entropy of $X$? We can either compute it by brute
 force:
\beq
	H(X) = \frac{1}{2} \log 2 +  \frac{1}{4} \log 4  +  \frac{1}{4} \log 4
	     = 1.5 ; 
\eeq
 or we can use the following decomposition, in which the value of $x$
 is revealed gradually.
 Imagine  first learning whether $x\! =\! 0$, and then,
 if $x$ is not $0$, learning which non-zero value is the case. The revelation
 of whether  $x\! =\! 0$ or not entails revealing a
 binary variable whose probability distribution is $\{\dhalf,\dhalf \}$.
 This revelation has an entropy $H(\dhalf,\dhalf) = \frac{1}{2} \log 2 +\frac{1}{2} \log 2 = 1\ubit$.
 If  $x$ is not $0$,  we learn the value of  the second  coin flip.
 This too is  a
 binary variable whose probability distribution is $\{\dhalf,\dhalf\}$, and whose entropy is
 $1\ubit$.
 We only get to experience the second revelation half the time, however,
 so the entropy can be written:
\beq
	H(X) = H( \dhalf , \dhalf ) +  \dhalf  \, H( \dhalf , \dhalf ) .
\eeq

 Generalizing, the observation we are making about the entropy
 of any probability distribution $\bp = \{ p_1, p_2, \ldots , p_I \}$
 is that 
\beq
	H(\bp) =
	H( p_1 , 1\!-\!p_1 )
	+ (1\!-\!p_1)
	H \! \left(
	\frac{p_2}{1\!-\!p_1} , 
	\frac{p_3}{1\!-\!p_1} , \ldots ,
	\frac{p_I}{1\!-\!p_1}
\right) .
\label{eq.entropydecompose}
\eeq

 When it's written as a formula, this property
 looks regrettably ugly; nevertheless it is a simple
 property and one that you should make use of.

 Generalizing further, the entropy also has the property for any $m$
 that
\beqan
	H(\bp) &=&
	H\left[ ( p_1+p_2+\ldots+p_m ) ,  ( p_{m+1}+p_{m+2}+\ldots+p_I )  \right]
\nonumber
\\
&&+   ( p_1+p_2+\ldots+p_m )
	H\! \left(
	\frac{p_1}{ ( p_1+\ldots+p_m ) } , 
	\frac{p_2}{ ( p_1+\ldots+p_m ) } ,
\ldots ,
	\frac{p_m}{ ( p_1+\ldots+p_m ) }
\right) 
\nonumber
\\
&&	+  ( p_{m+1}+
%p_{m+2}+
                    \ldots+p_I )
	H \! \left(
	\frac{p_{m+1}}{ ( p_{m+1}+\ldots+p_I ) } , 
%	\frac{p_{m+2}}{ ( p_{m+1}+\ldots+p_I ) } ,
 \ldots ,
	\frac{p_I}{ ( p_{m+1}+\ldots+p_I ) }
\right) .
\label{eq.entdecompose2}
\eeqan
\exampl{example.entropy}{
 A source produces a character $x$
 from the alphabet $\A = \{ {\tt 0}, {\tt 1}, \ldots, {\tt 9}, {\tt a}, {\tt b}, \ldots, {\tt z} \}$;
 with probability $\dthird$, $x$ is a numeral (${\tt 0}, \ldots, {\tt 9}$);
 with probability $\dthird$, $x$ is a vowel (${\tt a}, {\tt e}, {\tt i}, {\tt o}, {\tt u}$);
 and with probability $\dthird$ it's one of the 21 consonants. All numerals are equiprobable,
 and the same goes for vowels and consonants.
 Estimate  the entropy of $X$.
}
\solution\
 $\log 3 + \frac{1}{3} ( \log 10  + \log 5 + \log 21 )= \log 3 +  \frac{1}{3}  \log 1050 \simeq \log 30\ubits$.
%> pr log(36)/log(2)
%5.16992500144231
%> pr log(30)/log(2)
%4.90689059560852
%> pr (log(3) +log(1050)/3.0 )/log(2)
%4.93035370490565
% This may be compared with the maximum entropy for an alphabet
% of 36 characters, $\log 36\ubits$. 

\section{Gibbs's inequality}
%  We will also find useful the following:
\begin{description}
\item[The \ind{relative entropy} {\em or\/} \ind{Kullback-Leibler divergence}]
	\marginpar{\footnotesize{The `ei' in L{\bf{ei}}bler is pronounced\index{pronunciation}
 the same as in h{\bf{ei}}st.}}between two probability distributions $P(x)$ and $Q(x)$ 
	that are defined over the same alphabet $\A_X$ is
\beq
	D_{\rm KL}(P||Q) = \sum_x P(x) \log \frac{P(x)}{Q(x)} .
\label{eq.KL}
\label{eq.DKL}
\eeq
 The relative entropy satisfies {\dem\ind{Gibbs's inequality}}
\beq
	D_{\rm KL}(P||Q) \geq 0
\eeq
 with equality only if $P \eq Q$.  Note that in general
 the relative entropy is not symmetric under interchange of the
 distributions $P$ and $Q$:
 in general
 $D_{\rm KL}(P||Q) \neq D_{\rm KL}(Q||P)$, so $D_{\rm KL}$,
 although it is sometimes called the `\ind{K-L distance}',
 is not strictly a
 distance\index{distance!$D_{\rm KL}$}.
% `distance\index{distance!$D_{\rm KL}$}'.
%  It is also known as the `discrimination' or `divergence',
 The \ind{relative entropy} is important in pattern recognition and neural networks, 
 as well as in information theory.
%
% could include that aston guy's stuff here on (pq)^1/2?
%
% see also ../notation.tex
%
\end{description}
 Gibbs's inequality is probably the most important inequality in this book. 
 It, and many other inequalities, can be proved
 using the concept of convexity.
\section{Jensen's inequality for convex functions}
\begin{aside}
 The
 words `\ind{\convexsmile}'
 and `\ind{\concavefrown}' may be  pronounced `convex-smile'
 and `concave-frown'.
 This terminology has useful redundancy: while  one
  may forget which way up `convex' and `concave' are,
 it is harder to    confuse a smile with a frown.
\end{aside}
\begin{description}
%
\item[{\Convexsmile\ functions}.] A function $f(x)$ is {\dem \ind{\convexsmile}\/}
 over $(a,b)$ if
\amarginfig{c}{%
\footnotesize
\setlength{\unitlength}{0.75mm}
\begin{tabular}{c}
\begin{picture}(60,60)(0,0)
\put(0,0){\makebox(60,65){\psfig{figure=figs/convex.eps,angle=-90,width=45mm}}}
\put(10,8){\makebox(0,0){$x_1$}}
\put(48,8){\makebox(0,0){$x_2$}}
\put(17,2){\makebox(0,0)[l]{$x^* = \lambda x_1 + (1-\lambda)x_2$}}
\put(31,23){\makebox(0,0){$f(x^*)$}}
\put(35,39){\makebox(0,0){$\lambda f(x_1) + (1-\lambda)f(x_2)$}}
\end{picture}
\end{tabular}
\caption[a]{Definition of convexity.}
\label{fig.convex}
}\ 
 any chord of the function
 lies above the function,
  as shown in \figref{fig.convex}; that is,
 for all $x_1,x_2
 \in (a,b)$ and $0\leq \lambda \leq 1$, 
\beq
	f( \lambda x_1 + (1-\lambda)x_2 )  \:\:\leq \:\:\
		\lambda f(x_1) + (1-\lambda) f(x_2 ) .
\eeq
  A function $f$ is {\dem strictly
 \convexsmile\/} if, for all $x_1,x_2 \in (a,b)$ with $x_1 \neq x_2$,
 the equality holds only
 for $\lambda \eq 0$ and $\lambda\eq 1$.

 Similar definitions apply to \concavefrown\ and strictly \concavefrown\
 functions.
\end{description}
\newcommand{\tinyfunction}[2]{
\begin{tabular}{@{}c@{}}
{\small{#1}}
\\[-0.25in]
\psfig{figure=figs/#2.ps,width=1.06in,angle=-90}
\\
\end{tabular}
}
 Some strictly \convexsmile\ functions are
\bit
\item $x^2$, $e^x$ and $e^{-x}$ for all $x$; 
\item  $\log (1/x)$ and $x \log x$ for $x>0$.
\eit
\begin{figure}[htbp]
\figuremargin{%
\begin{center}
\raisebox{0.4in}{%
\begin{tabular}[c]{c@{}c@{}c@{}c}
\tinyfunction{$x^2$}{convex_xx} &
\tinyfunction{$e^{-x}$}{convex_exp-x} &
\tinyfunction{$\log \frac{1}{x}$}{convex_logix} &
\tinyfunction{$x \log x$}{convex_xlogx} \\[0.2in]
%\tinyfunction{$x^2$}{convex_xx} &
%\tinyfunction{$e^{-x}$}{convex_exp-x} \\[0.42in]
%\tinyfunction{$\log \frac{1}{x}$}{convex_logix} &
%\tinyfunction{$x \log x$}{convex_xlogx} \\[0.2in]
\end{tabular}
}
\end{center}
}{%
\caption[a]{\Convexsmile\ functions.}
\label{fig.convexf}
}%
\end{figure}
\begin{description}
\item[Jensen's inequality.]  If $f$ is a \convexsmile\ function
 and $x$ is a random variable then:
\beq
	\Exp\left[ f(x) \right] \geq f\!\left( \Exp[x] \right) ,
\label{eq.jensen}
\eeq
 where $\Exp$ denotes \ind{expectation}. If $f$ is strictly \convexsmile\ and 
 $\Exp\left[ f(x) \right] \eq  f\!\left( \Exp[x] \right)$, then the random 
 variable $x$ is a constant.
% (with probability 1).
% |!!!!!!!!!!!!!!!!! removed pedantry

 \ind{Jensen's inequality} can also be rewritten for a
 \concavefrown\ function, with the direction of the inequality
 reversed.
\end{description}

 A physical version of Jensen's \ind{inequality} runs as follows.
\amarginfig{b}{\mbox{\psfig{figure=figs/jensenmass.ps,width=1.75in,angle=-90}}}
\begin{quote}
 If a collection of 
 masses $p_i$ are placed on a
 \convexsmile\ curve $f(x)$
 at locations $(x_i, f(x_i))$, then the
 \ind{centre of gravity} of those masses, which  is at $\left( \Exp[x],
 \Exp\left[ f(x) \right] \right)$, lies above the curve.
\end{quote}
 If this fails to convince you, then feel free to
 do the following exercise.
\exercisxC{2}{ex.jensenpf}{
 Prove \ind{Jensen's  inequality}.
}

\exampl{ex.jensen}{
 Three squares have average area $\bar{A} = 100\,{\rm m}^2$.
 The average of the lengths of their sides is $\bar{l} = 10\,{\rm m}$.
 What can be said about the size of the largest of the 
 three squares? [Use Jensen's inequality.]

}
\solution\
 Let $x$ be the length of the side of a square, and let the 
 probability of $x$ be $\dthird,\dthird,\dthird$ over the 
 three lengths $l_1,l_2,l_3$. Then the information that we have is
 that $\Exp\left[ x \right]=10$ and  $\Exp\left[ f(x) \right]=100$, 
 where $f(x) = x^2$ is the function mapping lengths to areas. 
 This is a strictly \convexsmile\ function. 
 We notice that the equality 
  $\Exp\left[ f(x) \right] \eq  f\!\left( \Exp[x] \right)$ holds, 
 therefore $x$ is a constant, and the three lengths 
 must all be equal. The area of the largest square is 100$\,{\rm m}^2$.


\subsection{Convexity and concavity also relate to maximization}
 If $f(\bx)$ is \concavefrown\ and there exists a point at which
\beq
	\frac{\partial f}{\partial x_k} = 0 \:\: \mbox{for all $k$},
% \forall k
\eeq 
 then $f(\bx)$ has its maximum value at  that point.

 The converse does not hold: if a  \concavefrown\  $f(\bx)$ is maximized at
 some $\bx$ it is not necessarily true that the gradient
 $\grad f(\bx)$ is equal
 to zero there. For example, $f(x) = -|x|$ is maximized at $x=0$
 where its derivative is undefined; and $f(p) = \log(p),$ for
 a probability
 $p \in (0,1)$, is maximized on the boundary of the range,
 at $p=1$, where the gradient $df(p)/dp =1$.
%, since $f$ might for example 
% be an increasing function with no maximum  such as $\log x$, 
% or its maximum might be located at a point $\bx$
% on the boundary of the range of $\bx$. 
%
%{\em  (is this use of range correct?)}







% exercises from that. 
%
% exercises that belong between old chapters 1 and 2.
%
% see also _p5a.tex for moved exercises.
%
\section{Exercises}
\label{sec.exercise.block1}
\subsection*{Expectations and entropies}
 You are probably familiar with the idea of computing the \ind{expectation}\index{notation!expectation} 
 of a function of $x$, 
\beq
	\Exp\left[ f(x) \right] =	\left< f(x) \right> = \sum_{x} P(x) f(x) .
\eeq
 Maybe you are not so comfortable with computing this expectation 
 in cases where the function  $f(x)$  depends on
 the probability $P(x)$. The next few 
 examples address this concern.

\exercissxA{1}{ex.expectn}{ 
 Let $p_a = 0.1$, $p_b = 0.2$, $p_c = 0.7$. 
 Let $f(a) = 10$, $f(b) = 5$, and $f(c) = 10/7$. 
 What is $\Exp\left[ f(x) \right]$?
 What is $\Exp\left[ 1/P(x) \right]$?
}
\exercisxA{2}{ex.invP}{
 For an arbitrary ensemble, what is $\Exp\left[ 1/P(x) \right]$? 
}
\exercisxB{2}{ex.expectng}{
 Let $p_a = 0.1$, $p_b = 0.2$, $p_c = 0.7$. 
 Let $g(a) = 0$, $g(b) = 1$, and $g(c) = 0$. 
 What is $\Exp\left[ g(x) \right]$?
}
\exercisxB{1}{ex.expectng2}{
 Let $p_a = 0.1$, $p_b = 0.2$, $p_c = 0.7$. 
 What is the probability that $P(x) \in [0.15,0.5]$?
 What is 
\[
 P\left( \left| \log \frac{P(x)}{ 0.2} \right| > 0.05 \right) ?
\]
}
\exercisxA{3}{ex.Hineq}{
 Prove the assertion that 
	$H(X) \leq \log(|X|)$ with equality iff $p_i \eq  1/|X|$ for all $i$. 
 ($|X|$ denotes the number of elements in the set $\A_X$.)
 [Hint: use Jensen's inequality (\ref{eq.jensen}); if  your
 first attempt to use Jensen does not succeed, remember that
 Jensen involves both a random variable and a function,
 and you have quite a lot of freedom in choosing
 these; think about whether
 your chosen function $f$ should be convex or concave;
 further hint: try $u=1/p_i$ as the random variable.]
}
\exercissxB{3}{ex.rel.ent}{
 Prove that the relative entropy (\eqref{eq.KL}) 
 satisfies $D_{\rm KL}(P||Q) \geq 0$ (\ind{Gibbs's inequality})
 with equality only if $P \eq Q$.

% You may find this result
% helps with the previous two exercises. Note  (moved to _p5a.tex)
%
%  refer to this in mean field theory chapter {ch.mft}
%
}
%
\exercisxB{3}{ex.Hwords}{
	The probability $p_n$ of the
 $n$th most frequent word in English is roughly approximated
 by
\beq
 p_n \simeq \left\{
\begin{array}{ll}
\frac{0.1}{n} & \mbox{for $n \in 1 \ldots 12367$.}
% 8727$.}
\\
0 & n > 12367 .
\end{array}
\right.
\eeq
 [This remarkable $1/n$ law is known as Zipf's law,
 and applies to the word frequencies of many languages
% cite Shannon collection p.197 - except he has the number 8727, wrong!
% could also cite Gell-Mann
 \cite{zipf}.]
 If we assume that English is generated by picking
 words at random according to this distribution,
 what is the entropy of English (per word)?
 [This calculation  can be found in `Prediction and Entropy of Printed English', C.E.\ Shannon,
 Bell Syst.\ Tech.\ J.\ {\bf 30}, p\pdot50--64 (1951), but, inexplicably,
 the great man made numerical errors in it.] 
% , in bits per word?
}
% Decomposeability of the entropy
\exercisxB{2}{ex.entropydecompose}{
 Prove that the entropy is
 indeed decomposable as described in 
 \eqsref{eq.entropydecompose}{eq.entdecompose2}.
}
\exercisxB{2}{ex.decomposeexample}{
 A random variable $x \in \{0,1,2,3\}$ is selected
 by flipping a bent coin with bias $f$ to determine whether
 the outcome is in $\{0,1\}$ or $\{ 2,3\}$;
\amarginfig{t}{%
\begin{center}\small%footnotesize
\setlength{\unitlength}{0.6mm}
\begin{picture}(30,50)(-10,-15)
\put(-6,25){{\makebox(0,0)[r]{$f$}}}
\put(-6,5){{\makebox(0,0)[r]{$1\!-\!f$}}}
\put(-10,15){\vector(1,1){17}}
\put(-10,15){\vector(1,-1){17}}
\put(10,35){\vector(1,1){10}}
\put(10,35){\vector(1,-1){10}}
\put(16,45){{\makebox(0,0)[r]{$g$}}}
\put(16,25){{\makebox(0,0)[r]{$1\!-\!g$}}}
\put(16,5){{\makebox(0,0)[r]{$h$}}}
\put(16,-15){{\makebox(0,0)[r]{$1\!-\!h$}}}
\put(10,-5){\vector(1,1){10}}
\put(10,-5){\vector(1,-1){10}}
\put(24,45){{\makebox(0,0)[l]{\tt 0}}}
\put(24,25){{\makebox(0,0)[l]{\tt 1}}}
\put(24,5){{\makebox(0,0)[l]{\tt 2}}}
\put(24,-15){{\makebox(0,0)[l]{\tt 3}}}
\end{picture}
\end{center}
}
 then either  flipping a second bent coin with bias $g$
 or a third bent coin with bias $h$ respectively.
 Write down the probability distribution of $x$.
 Use   the
 decomposability of the entropy (\ref{eq.entdecompose2})
 to find the entropy of $X$. [Notice how compact
 an expression results if you make use of the binary entropy
 function $H_2(x)$, compared with writing out the four-term
 entropy explicitly.]
}
\exercisxB{2}{ex.waithead0}{
 An unbiased coin is flipped until one head is thrown. What is the 
 entropy of the random variable $x \in \{1,2,3,\ldots\}$, the number of
 flips?
 Repeat the calculation for the case of a biased coin with probability $f$
 of coming up heads.
  [Hint: solve the problem both directly  and by using  the
 decomposability of the entropy (\ref{eq.entropydecompose}).]
%
}
%
% removed joint entropy questions.
%
\subsection*{Forward probability}%  problems}
\exercisxB{1}{ex.balls}{
 An urn contains $w$ white balls and $b$ black balls.
 Two balls are drawn, one after the other, without replacement.
 Prove that the probability that the first ball
 is white is equal to the probability that the second is white.
}
%
\exercisxB{2}{ex.buffon}{
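 A circular \ind{coin} of diameter $a$ is thrown onto a \ind{square} grid
 whose squares are $b \times b$. ($a<b$)
 What is the probability that the coin will lie entirely within one square?
 [Ans: $(1-a/b)^2$]
}
% NOTE(review): the source text from `$a<' up to `$F>' was lost in extraction
% (everything between a `<' and the next `>' was stripped as if it were a
% markup tag).  The end of the exercise above and the exercise below are
% reconstructed; any exercises that originally lay between them are missing
% and must be restored from the original source.  The label and difficulty
% rating of the exercise below are placeholders -- TODO confirm.
\exercisxB{2}{ex.fredolder}{
 You meet Fred. Fred tells you he has two brothers, Alf and Bob.
 What is the probability that Fred is older than Bob?
 Fred tells you that he is older than Alf. Now,
 what is the probability that Fred is older than Bob?
 (That is, what is the conditional probability that
 $F>B$ given that $F>A$?)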
}
\exercisxB{2}{ex.liars}{
 The inhabitants of an island tell the
 truth one third of the time. They lie with  probability  2/3.

 On an occasion, after one of them made a statement,
 you ask another `was that statement true?'
 and he says `yes'.

 What is the probability that the statement was indeed true?
% [Ans: 1/5].
}

%
\exercisxB{2}{ex.R3error}{
 Compare two ways of computing the probability of error of
 the repetition code $\Rthree$, assuming a binary
 symmetric channel (you
 did this once for \exerciseref{ex.R3ep}) and confirm that they
 give the same answer.
\begin{description}
\item[Binomial distribution method.]
	Add  the probability that all three bits are
 flipped to the probability that exactly two bits are flipped.
\item[Sum rule method.]
% Using the different possible inferences]
 Using the \ind{sum rule},
 compute  the marginal probability that $\br$ takes on each of
 the eight possible values, $P(\br)$.
 [$P(\br) = \sum_s P(s)P(\br|s)$.]
  Then compute
 the posterior probability of $s$ for each of the  eight
 values of $\br$. [In fact, by symmetry, only two example
 cases
 $\br = ({\tt0}{\tt0}{\tt0})$ and 
 $\br = ({\tt0}{\tt0}{\tt1})$ need  be considered.]
\marginpar{\footnotesize{\Eqref{eq.bayestheorem} gives the posterior probability of
 the input $s$, given the received vector $\br$.
}}
% $\br = ({\tt1},{\tt1},{\tt0})$, 
% $\br = ({\tt1},{\tt1},{\tt1})$,
 Notice that some of the
 inferred bits are better determined than others.
 From the posterior probability $P(s|\br)$ you can read out
 the case-by-case error probability,
 the probability that the more probable hypothesis
 is not correct, $P(\mbox{error}|\br)$.
 Find the average error probability using the sum rule,
\beq
	P(\mbox{error}) = \sum_{\br} P(\br) P(\mbox{error}|\br) .
\eeq
\end{description}
}

%

\subsection*{Inference problems}
\exercissxA{2}{ex.logit}{
	If $q=1-p$ and $a = \log \linefrac{p}{q}$, show that
\beq
	p = \frac{1}{1+\exp(-a)} .
\label{eq.sigmoid}
\label{eq.logistic}
\eeq
 Sketch this function and find its relationship to $\tanh(a)$.
}
%
% is this exercise inappopriate now because we have not defined
% joint ensembles yet?
%
\exercisxB{2}{ex.BTadditive}{
	Let $x$ and $y$ be correlated random variables with
 $x$ a binary variable taking values in $\A_X = \{ 0,1 \}$.
	Use Bayes's theorem to show that the log posterior probability 
	ratio for $x$ given $y$ is
\beq
	\log \frac{P(x=1|y)}{P(x=0|y)} = \log \frac{P(y|x=1)}{P(y|x=0)}
		+ \log \frac{P(x=1)}{P(x=0)}  .
\eeq
}
% define ODDS ?
\exercisxB{2}{ex.d1d2}{
	Let $x$, $d_1$ and $d_2$ be random variables such that $d_1$
 and $d_2$  are conditionally independent given a binary variable $x$.
% (That is, $P(x,d_1,d_2)
%  = P(x)P(d_1|x)P(d_2|x)$.)
%
% somewhere I need to introduce graphical repns and define
%
% TO DO!!! TODO
%
% (\ind{conditional independence} is discussed further in section XXX.)
%
% and give examples. A and C children of B. and A->B->C
% Jensen defn is
% A is cond indep of B given C if
%  A|B,C = A|C
% which is symmetric, implying by BT
%  B|A,C = B|C
% pf
%  B|A,C = A|B,C B|C / A|C = B|C
% my defn here is 
%  A,B,C = C  A|C  B|C
% proof: 
%  A,B,C =  C  A|C  B|C,A =  .
% NB graphical model and decomposition are not 1-1 related. The two
% graphs  A and C children of B. and A->B->C  both have a joint prob
% that can be factorized  in either way. 
%
% $x$ is a binary variable taking values in $\A_X = \{ 0,1 \}$.
	Use Bayes's theorem to show that the  posterior probability 
	ratio for $x$ given $\{d_i \}$ is
\beq
	 \frac{P(x=1|\{d_i \} )}{P(x=0| \{d_i \})} = 
	 \frac{P(d_1|x=1)}{P(d_1|x=0)}
		 \frac{P(d_2|x=1)}{P(d_2|x=0)}
		 \frac{P(x=1)}{P(x=0)}  .
\eeq
}

\subsection*{Life in high-dimensional spaces}
%{Life in $\R^N$}
\index{life in high dimensions}
\index{high dimensions, life in}
 Probability distributions and volumes have some unexpected 
 properties in high-dimensional spaces.

% The real line is denoted by $\R$. An $N$--dimensional real space 
% is denoted by $\R^N$.
\exercisxA{2}{ex.RN}{
 Consider a sphere of radius $r$ in an $N$-dimensional real space.
% dimensions.
 Show that the 
 fraction of the volume of the sphere that
 is 
 in the surface shell lying
 at values of the radius between $r- \epsilon$ and $r$, where $0 < \epsilon < r$, is:
\beq
 f = 1 - \left( 1 - \frac{\epsilon}{r} \right)^N .
\eeq
% from Bishop p.29
 Evaluate $f$ for the cases $N=2$, $N=10$ 
 and $N=1000$,  with (a) $\epsilon/r = 0.01$; (b)  $\epsilon/r = 0.5$. 

 {\sf Implication:} points that are uniformly distributed in a sphere in $N$ 
 dimensions, where $N$ is large, are very likely to be in a \ind{thin shell} 
 near the surface.
% (From Bishop (1995).)
}



%%% Local Variables: 
%%% TeX-master: ../book.tex
%%% End:

%
%
%
\fakesection{Inference problems}
\subsection*{Further inference problems}
\nopagebreak[4]
 At this point you have a choice: if you'd
 like to read more about inference, you can look at these exercises
 and read \chref{ch1b}; if you're eager to get on to
 information theory, data compression, and noisy channels, you
 should skip to \chref{ch2}.
\exercissxB{2}{ex.dieexponential}{
	A die is selected at random from two twenty-faced dice 
 on which the symbols 1--10 are written with non-uniform frequency
 as follows.
\begin{center}
\begin{tabular}{l@{\hspace{0.2in}}*{10}{l}} \toprule
Symbol & 1 & 2 & 3 & 4  & 5 & 6 & 7  & 8 & 9 & 10 \\  \midrule
Number of faces of die A & 
        6 & 4  & 3 & 2 & 1 &1 &1 &1 &1 & 0 \\
Number of faces of die B & 
        3 & 3  & 2 & 2 & 2 &2 &2 &2 &1 & 1 \\ \bottomrule
\end{tabular}
\end{center}
 The randomly chosen die is rolled 7 times, with the following
 outcomes:
\begin{center}
 5, 3, 9, 3, 8, 4, 7. %  Sat 21/12/02   tried cutting this \\
\end{center}
 What is the probability that the die is die A?
}
\exercisxB{2}{ex.dieexponentialb}{
 Assume that there is a third twenty-faced die, die C, on which the symbols 
 1--20 are written once each. 
 As above, one of the three dice is selected at random and rolled
 7 times, giving the outcomes:
% \begin{center}
 3, 5, 4, 8, 3, 9, 7. \\
% \end{center}
 What is the probability that the die is (a) die A, (b) die B, (c) die C?
}

% no solution pointer
\exercisxA{3}{ex.exponential}{ {\exercisetitlestyle Inferring a decay constant}\\ 
%\begin{quotation}
	Unstable particles are emitted from a source and decay at a
	distance $x$, a real number
	 that has an exponential probability distribution
	with characteristic length $\lambda$.  Decay events can only
	be observed if they occur in a window extending from $x=1\cm$
	to $x=20\cm$. $N$ decays are observed at locations $\{x_1 ,
	\ldots , x_N\}$. 
% ($x_n$ is a real number.)
	 What is $\lambda$?

%\end{quotation}
\begin{center}
\mbox{\psfig{figure=\FIGS/decay.ps,width=3in,angle=90,%
bbllx=154mm,bblly=147mm,bbury=257mm,bburx=175mm}}\\
\end{center}
}
\nopagebreak[4]
\exercisxB{2}{ex.phonetest}{% ????????????????? needs solution adding (was phonecheck!)
 You move into a new house; the phone is connected, and
% you are unsure of your phone number --
 you're pretty sure that
 the \ind{phone number}\index{telephone number} is
% it's
 {\tt 740511}, but not as sure as you would like to be.
% 
 As an experiment, you pick up the phone and dial {\tt 740511};
 you obtain a `busy' signal.
 Are you now more sure of your phone number? If so, how much?
}
%
% no solution pointer
% \subsection*{Genetic test evidence}
% \begin{quotation}
\exercisxB{3}{ex.blood}{ {\exercisetitlestyle Forensic evidence} \\
% Two people have left traces of their own blood at the scene of a
% crime.  Their blood groups can be reliably identified from these
% traces and are found 
% to be of type `O' (a common type in the local population, having
% frequency 60\%) and of type `AB' (a rare type, with frequency 1\%).
% A suspect is tested and found to have type `O' blood. 
% A careless lawyer might claim that the fact that the suspect's
% blood type was found at the scene is positive evidence for the theory
% that he was present. But do these data
% $D=$ \{type `O' and `AB' blood were found at scene\} make it more
% probable that this suspect was one of the two people present at the
% crime? 
 Two people have left traces of their own blood at the scene of a
 crime. 
 A suspect, Oliver, is tested and found to have type `O' blood.
 The blood groups of the two traces 
 are found
 to be of type `O' (a common type in the local population, having
 frequency 60\%) and of type `AB' (a rare type, with frequency 1\%).
  Do these data
 (type `O' and `AB' blood were found at scene) give evidence in favour 
 of the proposition  that Oliver was one of the two people present at the
 crime? 

}
% \end{quotation}


\dvips
% include urn.tex here for another forward probability exercise.
%
\subchapter{Solutions to Chapter \protect\ref{ch.prob.ent}'s exercises} 
\fakesection{_s1aa solutions}
%=================================
\soln{ex.independence.bigram}{
 No, they are not independent. If they were then all the
 conditional distributions $P(y|x)$ would be identical
 functions of $y$, regardless of $x$ (\cf\ \figref{fig.conbigrams}).
}
\soln{ex.fp.toss}{ 
 We define  the fraction $f_B \equiv B/K$.
\ben 
\item
 The number of black balls
 has a binomial distribution.
\beq P(n_B\,|\,f_B,N) = {N \choose n_B} f_B^{n_B} (1-f_B)^{N-n_B} \eeq
\item
 The mean and variance of this distribution are: 
\beq \Exp [ n_B ] = N f_B \eeq
\beq \var[n_B] = N f_B (1-f_B) .
\label{eq.variance.binomial}
\eeq
 These results were derived in \exampleref{ex.binomial}.
 The standard deviation of $n_B$ is $\sqrt{\var[n_B]} = \sqrt{N f_B (1-f_B)}$.
% on page \pageref{sec.first.binomial.sol}.

 When $B/K = 1/5$ and $N=5$, 
 the expectation and variance of   $n_B$ are
 1 and 4/5. The standard deviation is 0.89.

 When $B/K = 1/5$ and $N=400$, 
 the expectation and variance of   $n_B$ are
 80 and 64. The standard deviation is 8.
\een
}
\soln{ex.fp.chi}{
 The numerator of the  quantity
\[%beq
 z = \frac{(n_B - f_B N)^2}{ {N f_B (1-f_B)} }
%\label{eq.chisquared}
\]%eeq
 can be recognised as\index{chi-squared}
 $\left( n_B - \Exp [ n_B ] \right)^2$;
 the denominator is equal to
 the variance of $n_B$ (\ref{eq.variance.binomial}),
 which is by definition the expectation of the numerator.
 So the expectation of $z$ is 1. [A random variable like $z$,
 which measures the deviation of data from the
 expected
% average
 value, is sometimes called $\chi^2$ (chi-squared).]

 In the case $N=5$ and $f_B = 1/5$, $N f_B$ is 1, and
 $\var[n_B]$ is 4/5. The numerator has five possible values, only
 one of which is smaller than 1:
 $(n_B - f_B N)^2 = 0$ has probability $P(n_B \!=\! 1)= 0.4096$.
% $(n_B - f_B N)^2 = 1$ has probability $P(n_B = 0)+P(n_B = 2)= $ ;
% $(n_B - f_B N)^2 = 4$ has probability $P(n_B = 3)= $ ;
% $(n_B - f_B N)^2 = 9$ has probability $P(n_B = 4)= $ ;
% $(n_B - f_B N)^2 = 16$ has probability $P(n_B = 5)= $ ;
 So the probability that $z < 1$ is 0.4096.
% 
}
%
% stole solution from here
%
%%%%%%%%%%%%%%%%%%%%%%%%%% added 99 9 14
\soln{ex.jensenpf}{
 We wish to prove, given the property 
\beq
	f( \lambda x_1 + (1-\lambda)x_2 ) \:\: \leq  \:\:
		\lambda f(x_1) + (1-\lambda) f(x_2 ) ,
\label{eq.convexdefn}
\eeq
 that, if $\sum p_i = 1$ and $p_i \geq 0$, 
\beq%
%	\Exp\left[ f(x) \right] \geq f\left( \Exp[x] \right) ,
	\sum_{i=1}^I p_i  f(x_i)  \geq f\left( \sum_{i=1}^I p_i x_i  \right) .
\eeq
 We proceed by recursion, working from the right hand side. (This proof
 does not
% needs further work to
 handle
% awkward
 cases where some $p_i=0$; such
 details are left to the pedantic reader.) At the first line we
 use the definition of convexity (\ref{eq.convexdefn}) with
 $\lambda = \frac{p_1}{\sum_{i=1}^I p_i } = p_1$; at the second line,
 $\lambda =  \frac{p_2}{\sum_{i=2}^I p_i }$.
% , and so forth.
\fakesection{temporary solution}
\begin{eqnarray}
\lefteqn{  f\left( \sum_{i=1}^I p_i x_i  \right) = 
% &=&
 f\left( p_1 x_1  +  \sum_{i=2}^I p_i x_i
 \right) } \nonumber
\\
&\leq&
 p_1 f(x_1) +  \left[ \sum_{i=2}^I p_i \right]
		\left[  f\left(  \sum_{i=2}^I p_i x_i
	\left/ \sum_{i=2}^I p_i \right. \right) \right]
 \\
&\leq&
 p_1 f(x_1) +  \left[ \sum_{i=2}^I p_i \right]
	     \left[
             \frac{p_2}
	{\sum_{i=2}^I p_i }             f\left( x_2 \right)
		+  \frac{\sum_{i=3}^I p_i}
                       {\sum_{i=2}^I p_i }
		 f\left( \sum_{i=3}^I p_i x_i
 \left/ \sum_{i=3}^I p_i \right. \right)
            \right] ,
\nonumber
% probably cut this last line, just show one itn of recursion
%
\end{eqnarray}
  and so forth. %
\hfill   $\Box$%\epfs% end proof symbol








}
%%%%%%%%%%%%%%%%%%%%
%
%

%\soln{ex.weigh}{
% See chapter \chtwo.
%}
%
\soln{ex.expectn}{
 $p_a = 0.1$, $p_b = 0.2$, $p_c = 0.7$. 
 $f(a) = 10$, $f(b) = 5$, and $f(c) = 10/7$. 
\beq
	\Exp\left[ f(x) \right] = 0.1 \times 10 + 0.2 \times 5 + 0.7 \times 10/7 = 3.
\eeq
 For each $x$, $f(x) = 1/P(x)$, so 
\beq
 \Exp\left[ 1/P(x) \right] = \Exp\left[ f(x) \right] = 3.
\eeq
}
%
\soln{ex.invP}{
 For general $X$, 
\beq
	\Exp\left[ 1/P(x) \right] = \sum_{x\in \A_X} P(x)  1/P(x) = 
	\sum_{x\in \A_X} 1 = | \A_X | .
\eeq
}
%
\soln{ex.expectng}{
  $p_a = 0.1$, $p_b = 0.2$, $p_c = 0.7$. 
  $g(a) = 0$, $g(b) = 1$, and $g(c) = 0$. 
\beq
	\Exp\left[ g(x) \right]=p_b = 0.2.
\eeq
}
\soln{ex.expectng2}{
\beq
	P\left( P(x) \! \in \! [0.15,0.5] \right) = p_b = 0.2 .
\eeq
\beq
	 P\left( \left| \log \frac{P(x)}{ 0.2} \right| > 0.05 \right) 
		= p_a + p_c = 0.8 .
\eeq
}
%
\soln{ex.Hineq}{
 This type of question can be approached in two ways:
 either  by differentiating
 the function to be maximized, finding the maximum, and proving
 it is a global maximum; this strategy is somewhat risky since it is possible 
 for the maximum of a function to be at the boundary of the space,
 at a place where the derivative is not zero.
 Alternatively, a carefully chosen inequality 
 can establish the answer. The second method is much neater.

\begin{Prooflike}{Proof by differentiation (not the recommended method)}
\beqan
	H(X) &=& \sum_i p_i \log \frac{1}{p_i} \\
	\frac{\partial H(X)}{\partial p_i} &=&  \log \frac{1}{p_i} - 1 .
\eeqan 
 We maximize $H(X)$ subject to the constraint $\sum_i p_i = 1$, which can be enforced
 with a Lagrange multiplier:
\beqan
	G(\bp) & \equiv & H(X) + \lambda \left( \sum_i p_i - 1 \right) \\
	\frac{\partial  G(\bp)}{\partial p_i}  &=&  \log \frac{1}{p_i} - 1 + \lambda .
\eeqan
 At a maximum, 
\beqan
	\log \frac{1}{p_i} - 1 + \lambda &=& 0 \\
\Rightarrow \log \frac{1}{p_i} &=& 1 - \l ,
\eeqan
 so all the $p_i$ are equal. That this extremum is indeed a maximum
 is established by finding the curvature:
\beq
	\frac{\partial^2  G(\bp)}{\partial p_i \partial p_j}  = -\frac{1}{p_i}
	\delta_{ij} ,
\eeq
 which is negative definite. \hfill
\end{Prooflike}
\begin{Prooflike}{Proof using Jensen's inequality (recommended method)}
 First a reminder of the inequality.
\begin{quotation}
\noindent
 If $f$ is a convex function
 and $x$ is a random variable then:
\beq
	\Exp\left[ f(x) \right] \geq f\left( \Exp[x] \right) .
\eeq
 If $f$ is strictly convex and 
 $\Exp\left[ f(x) \right] \eq  f\left( \Exp[x] \right)$, then the random 
 variable $x$ is a constant
 (with probability 1). 
\end{quotation}

 The secret of a proof using Jensen's inequality is to choose the 
 right function and the right random variable. 
 We could define 
% $f(u) = \log \frac{1}{u}$ and 
\beq
	f(u) = \log \frac{1}{u} = - \log u
\eeq
 (which is a convex function) and 
 think of $H(X) = \sum p_i \log \frac{1}{p_i}$ as the 
 mean of  $f(u)$ where  $u=P(x)$, but this 
 would not get us there -- it would give us an inequality in the 
 wrong direction. If instead we define 
\beq
	u = 1/P(x)
\eeq
 then we find:
% this introduces an extra minus sign:
\beq
	H(X) = - \Exp\left[ f( 1/P(x) ) \right]
	 \leq - f\left( \Exp[ 1/P(x) ] \right)  ;
\eeq
 now we know from   \exerciseref{ex.invP}\ that $\Exp[ 1/P(x) ] = |\A_X|$, so
\beq
	H(X)   \leq - f\left( |\A_X| \right) = \log  |\A_X| .
\eeq
 Equality only holds if the random variable $u = 1/P(x)$ is a constant, 
 which means $P(x)$ is a constant for all $x$.  
\end{Prooflike}
}
%
\soln{ex.Hwords}{
The entropy is 9.7
% 11.8
 bits per word.
% , which is 2.6 bits per letter  WRONG - shannon (p197) is in error
}
%
% solns moved to _s5A.tex
%
\soln{ex.decomposeexample}{
\beq
H(X)= H_2(f) + f H_2(g) + (1-f) H_2(h) .
\eeq
}
%
\soln{ex.waithead0}{
 The probability that there are $x-1$ tails and then one head,
 so we  get the first head on the $x$th
 toss, is
\beq
	P(x) = (1-f)^{x-1} f .
\eeq
 If the first toss is a tail, the probability distribution for
 the future looks just like it did before we made the first toss.
 Thus we have a recursive expression for the entropy:
\beq
	H(X) = H_2(  f ) + (1-f) H(X)  .
\eeq
 Rearranging,
\beq
	H(X) =  H_2(  f )  / f .
\eeq
}
%
\soln{ex.rel.ent}{
\beq
	D_{\rm KL}(P||Q) = \sum_x P(x) \log \frac{P(x)}{Q(x)} .
% \label{eq.KL}
\eeq
\label{sec.gibbs.proof}% cross ref problem? Tue 12/12/00
 We prove \ind{Gibbs's inequality} using \ind{Jensen's inequality}. 
 Let $f(u) = \log 1/u$ and $u=\frac{Q(x)}{P(x)}$. 
 Then 
\beqan
	D_{\rm KL}(P||Q) & =& \Exp[ f( Q(x)/P(x) ) ]
\\ &\geq&
 f\left(
	\sum_x P(x) \frac{Q(x)}{P(x)} \right)
	= \log \left( \frac{1}{\sum_x Q(x)} \right) = 0,
\eeqan
 with equality only if $u=\frac{Q(x)}{P(x)}$ is a constant, that is, 
 if $Q(x) = P(x)$.\hfill$\epfsymbol$\\

\begin{Prooflike}{Second solution}
 In the above proof the expectations were with respect to
 the probability distribution $P(x)$.  A second solution method
 uses Jensen's inequality with $Q(x)$ instead.
 We define $f(u) = u \log u$ and let $u = \frac{P(x)}{Q(x)}$.
 Then
\beqan
	D_{\rm KL}(P||Q)& =&
 \sum_x Q(x) \frac{P(x)}{Q(x)} \log
 	\frac{P(x)}{Q(x)} = \sum_x Q(x) f\left( \frac{P(x)}{Q(x)} \right) \\
	&\geq& f\left( \sum_x Q(x) \frac{P(x)}{Q(x)} \right) = f(1) = 0,
\eeqan
 with equality only if $u=\frac{P(x)}{Q(x)}$ is a constant, that is, 
 if $Q(x) = P(x)$.
\end{Prooflike}
}
%
\fakesection{waithead solution}
\soln{ex.waithead}{
 The probability of the number of tails $t$ is 
\beq
	P(t) = \left(\frac{1}{2}\right)^{t} \frac{1}{2} 
		\:\mbox{ for $t\geq 0$}.
\eeq
 The expected number of heads is 1, by definition of the problem.
 The expected number of tails is 
\beq
	\Exp[t] =
	\sum_{t=0}^{\infty} t \left(\frac{1}{2}\right)^{t} \frac{1}{2} ,
\eeq
 which may be shown to be 1 in a variety of ways. For example, since 
 the situation after one tail is thrown is equivalent to the opening 
 situation, we can write down the recurrence relation
\beq
	\Exp[t] = \frac{1}{2} ( 1 + \Exp[t] )  + \frac{1}{2}0 \:\:
 \Rightarrow \:\: \Exp[t] = 1.
\eeq
% if we define $S=\Exp[t]$ then we can subtract $S/2$ from $S$ to obtain 
% a geometric series:
%\beq
%	(1-1/2)S = \sum_{t=0}^{\infty} \left(\frac{1}{2}\right)^{t+1}
%		= \frac{1/2}{1-1/2} = 1
%\eeq
% which gives $S=2$ --- what?
%%%%%%%%%%%%%%%%
%, for example, introducing 
% $Z(\beta) \equiv \sum_t \left(\frac{1}{2}\right)^{\beta t} \frac{1}{2}
% = \frac{1}{2}/\left(1 - (\linefrac{1}{2})^{\beta}\right)$:
%\beq
%	\sum_{t=0}^{\infty} t \left(\frac{1}{2}\right)^{t} \frac{1}{2}
%	= \frac{d}{d\beta} \log Z
%\eeq

 The probability distribution of the `estimator' $\hat{f} = 1/(1+t)$,
 given that $f=1/2$, is plotted 
 in \figref{fig.f.estimator}. The  probability of $\hat{f}$
 is simply the probability of the corresponding
 value of $t$.
%
% gnuplot
% load 'figs/festimator.gnu'
%\begin{figure}
%\figuremargin{%
\marginfig{%
\begin{center}
\begin{tabular}{c}
$P(\hat{f})$\\[-0.3in]
\mbox{\psfig{figure=figs/festimator.ps,angle=-90,width=2in}}\\
\hspace{1.82in}$\hat{f}$
\end{tabular}
\end{center}
%}{%
\caption[a]{The probability distribution of the estimator $\hat{f} = 1/(1+t)$, 
 given that $f=1/2$.}
% , so that  $P(t) = 1/2^{t+1}$.}
\label{fig.f.estimator}
%}
%\end{figure}
}
}
\soln{ex.waitbus}{
\ben
\item
	The mean number of rolls from one six to the next six is six
 (assuming
	we
% don't count the first of the two sixes).
 start counting rolls after   the first of the two sixes).
	The probability that the next six occurs on the $r$th
 roll is the probability of {\em not\/} getting a six
 for $r-1$ rolls multiplied by the probability of then
getting a six:
\beq
 P(r_1 \!=\! r) = \left( \frac{5}{6} \right)^{r-1} \frac{1}{6}.
\eeq
	This  probability distribution of the number of rolls, $r$,
 may be called 
	an \ind{exponential distribution}, since 
\beq
 P(r_1 \!=\! r) = e^{-\alpha r} / Z, 
\eeq
 where $\alpha = \ln({6}/5)$.
\item
  The mean number of rolls from the clock until the next six is six.
\item
 	The mean number of rolls, going back in time,
	until the most recent six is six.
\item
	The mean number of rolls from the six before
	the clock struck to the six after the clock struck
	is the sum of the answers to (b) and (c), less one,
% (assuming	we don't count the first of the two sixes),
 that is, eleven.
\item
	Rather than explaining the difference between (a)
% six and
 and  (d), let me give another hint.\index{bus-stop fallacy}\index{waiting for a bus}
% see gnu/waitbus.gnu
 Imagine that the buses in Poissonville  arrive independently at random
 (a \ind{Poisson process}), with, on average, one bus every six minutes.
 Imagine that passengers turn up at {\busstop}s at a uniform rate,
% random also,
 and are scooped up by the bus without delay, so the
 space between two buses remains constant.
 Buses that follow gaps bigger than six minutes
 become overcrowded. The passengers' representative complains that
 two-thirds of  all passengers found themselves on overcrowded buses.
 The bus operator claims, `no, no -- only one third
 of our buses are overcrowded'. Can both these claims be true? 
\een
\amarginfig{b}{%
\begin{center}
\mbox{\hspace{-0.3in}\psfig{figure=figs/waitbus.ps,angle=-90,width=2.05in}}\\[-0.2in]
\end{center}
\caption[a]{The probability distribution of the number
 of rolls $r_1$
 from one 6 to the next
  (falling solid line),
\[%\beq
	P(r_1 \!=\! r) = \left( \frac{5}{6} \right)^{r-1} \frac{1}{6} ,
\]%\eeq
 and the probability distribution (dashed line)
 of
% the quantity $r_{\rm tot}=r_1+r_2-1$,
 the number of rolls from the 6 before 1pm to the next 6,
% where $r_1$ and $r_2$ are the numbers of rolls before
% and after the clock strikes,
 $r_{\rm tot}$, 
\[%\beq
	P(r_{\rm tot} \!=\! r) = r \, \left( \frac{5}{6} \right)^{r-1}
		\left( \frac{1}{6} \right)^2 
 .
\]%\eeq
 The probability $P(r_1>6)$ is about 1/3; the probability
 $P(r_{\rm tot} > 6 )$ is about 2/3. The mean of $r_1$ is 6, and the
 mean of $r_{\rm tot}$ is 11.
}
% other elegant ways of saying it:
% P( number rolls from one 6 to the next)
% P( number of rolls from the 6 before 1pm to the next)
}% end figure
}% end solbn






%
\soln{ex.sumdice}{
\ben \item For the outcomes $\{2,3,4,5,6,7,8,9,10,11,12\}$, 
 the probabilities are $\P = \{ 
\frac{1}{36},
\frac{2}{36},
\frac{3}{36},
\frac{4}{36},
\frac{5}{36},
\frac{6}{36},
\frac{5}{36},
\frac{4}{36},
\frac{3}{36},
\frac{2}{36},
\frac{1}{36}\}%
$.
\item The value of one die has mean $3.5$ and variance $35/12$. 
 So the sum of one hundred has mean $350$ and variance $3500/12 \simeq 292$,
 and by the central limit theorem the probability distribution 
 is roughly Gaussian (but confined to the integers), with 
 this mean and variance.
\item
	In order to obtain a sum that has a uniform distribution 
 we have to start from random variables some of which
 have a spiky distribution 
 with the probability mass concentrated at the extremes. 
 The unique solution is to have one ordinary die and one with faces 6,6,6,0,0,0.
% That this solution is unique can be proved with an argument 
% that starts by noting 
% that each of the 12 outcomes has to be realized
% by 3 distinct microstates (a microstate
% being one of the 36 particular orientations
% of the two dice).  To create outcome `12'
% in three ways there must be one six on 
% one dice and three sixes on the other; 
% similarly to create outcome `1' three ways, there 
% must be one die with three zeroes on it
% and one with one one.
 Yes, a uniform  distribution can be created,
 for example by labelling the $r$th die with
 the numbers $\{0,1,2,3,4,5\}\times 6^r$.
\een  
}
% \subsection{Move this solution}
%
\subsection*{Conditional probability}
\soln{ex.brothers}{
 Assuming ignorance about the order of the ages $F$, $A$, and $B$,
 the six possible hypotheses have equal probability.
 The probability that $F>B$ is $\dhalf$.

 The conditional  probability that $F>B$ given that $F>A$
 is given by the joint probability divided by the marginal probability:
\beq
		P(  F>B |  F>A ) = \frac{		P(  F>B ,  F>A ) }
	{		P(  F>A )}
= \frac{ \dfrac{2}{6} }{ \dhalf }
% 2/6  / 1/2
 = \frac{2}{3} .
\eeq
 (The joint probability that $F>B$ and $F>A$ is the probability that
 Fred is the oldest, which is $\dthird$.)
}
%
% \soln{ex.R3error}{
%
\fakesection{r3 error soln}
\soln{ex.R3error}{
\begin{description}
\item[Binomial distribution method.]
 From the solution to \exerciseonlyref{ex.R3ep}, 
 $p_B = 3 f^2 (1-f) + f^3$.\index{repetition code}
\item[Sum rule method.]
 The marginal probabilities of the eight values of $\br$ are\index{sum rule}
 illustrated by: 
\beq
 P(\br \eq {\tt0}{\tt0}{\tt0} ) = \dhalf (1-f)^3 + \dhalf f^3 ,
\eeq
\beq
 P(\br \eq {\tt0}{\tt0}{\tt1} ) = \dhalf f(1-f)^2 + \dhalf f^2(1-f)
 =  \dhalf f(1-f) .
\eeq
 The posterior probabilities are represented by 
\beq
 P( s\eq{\tt1} | \br \eq {\tt0}{\tt0}{\tt0} )  = \frac{  f^3  }
		{   (1-f)^3 +  f^3 }
\eeq
 and
\beq
 P( s\eq{\tt1} | \br \eq {\tt0}{\tt0}{\tt1} )
		= \frac{  (1-f)f^2  }
			{   f(1-f)^2 +  f^2(1-f) }
		= f .
\eeq
 The probabilities of error in these representative cases are thus
\beq
 P(\mbox{error}|\br \eq  {\tt0}{\tt0}{\tt0} )  =  \frac{  f^3  }
		{   (1-f)^3 +  f^3 }
\eeq
 and 
\beq
 P(\mbox{error}|\br \eq  {\tt0}{\tt0}{\tt1} )  =  f .
\eeq
 Notice that while the average probability of error of $\Rthree$ is
 about $3 f^2$, the probability that any {\em{particular}\/} bit is
 wrong is either about $f^3$ or $f$.

 The average error probability, using the sum rule, is
\beqa
	P(\mbox{error}) &=& \sum_{\br} P(\br) P(\mbox{error}|\br) \\
 &=& 2 [\dhalf (1-f)^3 + \dhalf f^3]  \frac{  f^3  }
		{   (1-f)^3 +  f^3 }
 + 6  [\dhalf f(1-f)] f .
\eeqa
\marginpar{\vspace{-0.8in}\par\footnotesize{The first two terms are for the cases $\br = \tt000$ and $\tt111$;
 the remaining 6 are for the other outcomes, which share the
 same
 probability of occurring and identical  error probability, $f$.}}%
 So
\beqa
	P(\mbox{error}) 
 &=&   f^3 
 + 3   f^2(1-f) .
\eeqa
\end{description}
}



%
%

\subsection*{Inference problems}
\soln{ex.logit}{
\beqan
	a& =& \log \frac{p}{q}
\\ 
\Rightarrow \hspace{0.6in} \frac{p}{q} & = & e^a
\eeqan
 And $q=1-p$ gives
\beqan
	\frac{p}{1-p} & =&   e^a
\\ \Rightarrow p & = & \frac{e^a}{e^a+1} = \frac{1}{1+\exp(-a)} .
\eeqan
 The hyperbolic tangent is
\beq
	\tanh(a) = \frac{e^a -e^{-a}}{e^a + e^{-a}}
\eeq
 so 
\beqan
	f(a)& \equiv& \frac{1}{1+\exp(-a)} =
\frac{1}{2}	\left( \frac{1-e^{-a}}{1+e^{-a}} + 1 \right) \nonumber \\
	&=&  \frac{1}{2}\left(  \frac{ e^{a/2} - e^{-a/2} }{
			e^{a/2} + e^{-a/2}} +1 \right)
	= \frac{1}{2} ( \tanh(a/2) + 1 ) .
\eeqan
}	
\soln{ex.BTadditive}{
\beqan
 P(x|y) &=& \frac{P(y|x)P(x) }{P(y)}
\\%\eeq\beq
\Rightarrow
 \frac{P(x=1|y)}{P(x=0|y)} &=&  \frac{P(y|x=1)}{P(y|x=0)}
		 \frac{P(x=1)}{P(x=0)}  
\\%\eeq\beq
\Rightarrow
\log \frac{P(x=1|y)}{P(x=0|y)} &=& \log \frac{P(y|x=1)}{P(y|x=0)}
		+ \log \frac{P(x=1)}{P(x=0)}  .
\eeqan
}
\soln{ex.d1d2}{
 The conditional independence of $d_1$ and $d_2$ given $x$
 means
\beq
	P(x,d_1,d_2)  = P(x)P(d_1|x)P(d_2|x) .
\eeq
 This gives a separation of the posterior probability ratio 
 into a series of factors, one for each data point, times 
 the prior probability ratio.
\beqan
 \frac{P(x=1|\{d_i \} )}{P(x=0| \{d_i \})} &=& 
	 \frac{P(\{d_i\}|x=1)}{P(\{d_i\}|x=0)}
		 \frac{P(x=1)}{P(x=0)}  
\\ &=&
	 \frac{P(d_1|x=1)}{P(d_1|x=0)}
		 \frac{P(d_2|x=1)}{P(d_2|x=0)}
		 \frac{P(x=1)}{P(x=0)}  .
\eeqan
}

%
%
\subsection*{Life in high-dimensional spaces}
\soln{ex.RN}{
 The \ind{volume} of a \ind{hypersphere} of radius $r$ in $N$ dimensions is 
\beq
	V(r,N) = \frac{\pi^{N/2}}{(N/2)!} r^{N} .
\eeq
 For this question all that we need is the $r$-dependence, 
 $V(r,N)  \propto r^{N} .$
 So	the fractional  volume in $(r-\epsilon,r)$ is
\beq
	\frac{	r^{N} - (r-\epsilon)^N }{ r^N} = 
		1 -\left( 1 -\frac{\epsilon}{r}\right)^N .
\eeq
 The  fractional volumes in the shells for the required cases are:
\begin{center}
\begin{tabular}[t]{cccc} \toprule
$N$ & 2 & 10 & 1000 \\ \midrule 
$\epsilon/r = 0.01$ & 0.02  & 0.096 & 0.99996 \\
$\epsilon/r = 0.5\phantom{0}$  & 0.75  & 0.999 & $1 - 2^{-1000}$ \\  \bottomrule
\end{tabular}\\
\end{center}
 Notice that no matter how small $\epsilon$ is, for large enough $N$ 
 essentially all the probability mass is in the surface shell of thickness 
 $\epsilon$.
}
% see also _s1A.tex

%\input{tex/_s1a.tex} nothing there any more
\fakesection{_s1A solutions}
%=================================
% quake
%
\subsection*{Solutions to further inference problems}
\soln{ex.dieexponential}{
 Let the data be $D$. Assuming equal prior probabilities, 
\beqan
	\frac{P(A|D)}{P(B|D)} = \frac{1}{2}\frac{3}{2}\frac{1}{1}\frac{3}{2}
				\frac{1}{2}\frac{2}{2}\frac{1}{2} = 9/32 ,
\eeqan	
 and $P(A|D) = 9/41.$
% (check me).
}
\soln{ex.dieexponentialb}{
 The probability of the data given each hypothesis is:
\beq
	P(D|A) = \frac{3}{20}\frac{1}{20}\frac{2}{20}\frac{1}{20} 
			\frac{3}{20}\frac{1}{20} \frac{1}{20} =
	 \frac{18}{20^7} ;
\eeq	
\beq
	P(D|B) = \frac{2}{20}\frac{2}{20}\frac{2}{20}\frac{2}{20} 
			\frac{2}{20}\frac{1}{20} \frac{2}{20}
			= \frac{64}{20^7} ;
\eeq	
\beq
	P(D|C) = \frac{1}{20}\frac{1}{20}\frac{1}{20}\frac{1}{20} 
			\frac{1}{20}\frac{1}{20} \frac{1}{20} 
		= \frac{1}{20^7}.
\eeq	
 So
\beq
	P(A|D) = \frac{18}{18+64+1} = \frac{18}{83} ; \hspace{0.3in}
	P(B|D) = \frac{64}{83} ;\hspace{0.3in} 
	P(C|D) = \frac{1}{83} .
\eeq
}
\soln{ex.phonetest}{% was phonecheck
	There are two hypotheses.
 $\H_0$: your number is {\tt 740511}; $\H_1$: it is another number.
 The data, $D$, are `when I dialled {\tt 740511}, I got a busy signal'.
 What is the probability of $D$, given each hypothesis?
 If your number is {\tt 740511}, then we expect a busy signal with certainty:
\[
	P(D|\H_0) = 1  .
\]
 On the other hand, if $\H_1$ is true, then the probability that the number dialled
 returns a busy signal is smaller than 1, since various other outcomes
 were also possible (a ringing tone, or a number-unobtainable signal,
 for example).  The value of this probability $P(D|\H_1)$
 will depend on  the probability $\alpha$ that a random phone number
 similar to your own phone number would be a valid phone number,
 and on the probability $\beta$ that you get a busy signal when you dial
 a valid phone number.

% 37 per col, 4 cols per page, 250 pages.
% 20 per col, 3 cols per page, 270 pages.
% 50,000. maybe another 50% ex-directory?
 I estimate from the size of
 my phone book that Cambridge has about 75,000 valid phone numbers, all of length six
 digits. The probability that a random six-digit number is valid is
 therefore about $75{,}000/10^6 = 0.075$. If we exclude numbers beginning with 0, 1, and 9
 from the random choice, the probability $\alpha$
 is about $75{,}000/700{,}000 \simeq 0.1$.
 If we assume that
 telephone numbers are clustered then  a misremembered number
 might be more likely to be valid than a randomly chosen number; so
 the probability, $\alpha$,
 that our guessed number would be valid, assuming $\H_1$ is true,
 might be bigger than 0.1. Anyway, $\alpha$  must be somewhere between 0.1 and 1.
 We can carry forward this uncertainty in the probability
 and see how much it matters at the end.

 The  probability $\beta$ that you get a busy signal when you dial
 a valid phone number is equal to the fraction of phones you think are in use
 or off-the-hook
 when you make your tentative call.
 This fraction varies from town to town and with the time of day.
 In Cambridge, during the day, I would guess that about 1\% of phones
 are in use. At 4am,
% four in the morning,
 maybe 0.1\%, or fewer.

 The probability $P(D|\H_1)$ is the product of $\alpha$ and $\beta$,
 that is, about $0.1 \times 0.01 = 10^{-3}$. According to
 our estimates, there's about a one-in-a-thousand
 chance of getting a busy signal when you dial a random number;
 or one-in-a-hundred, if valid numbers are strongly clustered;
 or one-in-$10^4$, if you dial in the wee hours.

 How  do the data affect your beliefs about your phone number?
 The posterior probability ratio is the likelihood ratio
 times the prior probability ratio:
\beq
	\frac{ P(\H_0|D) }{ P(\H_1|D) }
=	\frac{ P(D|\H_0) }{ P(D|\H_1) }
	\frac{ P(\H_0) }{ P(\H_1) } .
\eeq
 The likelihood ratio is about 100-to-1 or 1000-to-1, so the posterior
 probability ratio is swung by a factor of 100 or 1000 in favour of $\H_0$.
 If the prior probability of $\H_0$ was 0.5 then the posterior
 probability is
\beq
	 P(\H_0|D)  = \frac{1}{1 + \frac{ P(\H_1|D) }{ P(\H_0|D) } }
		\simeq  0.99 \: \mbox{or} \: 0.999 . 
\eeq
}

%\soln{ex.exponential}{
% See chapter \chbayes.
%}
%\soln{ex.blood}{
% See chapter \chbayes.
%}
%

 The other exercises are discussed in the next chapter.


%%%%%%%%%%%%%%%%%%%%%%%%%%
\dvipsb{solutions 1a}
% now another inference chapter !
\prechapter{About   Chapter} 
\fakesection{About the first Bayes chapter}
 If you'd like to get on with data compression, information
 content
 and entropy, you can skip to chapter~\ref{ch2}. 
 Data compression and data modelling are
 intimately connected, however, so you'll probably
 want to come back to this chapter
 by the time you get to chapter~\ref{ch4}. 

% move this later
%
% The exercises in this chapter are not a prerequisite for
% chapters \ref{ch2}--\ref{ch7}.

\fakesection{prerequisites for chapter 8}
 Before reading chapter \cheight, you should have worked on
% finished 
% all the exercises in chapter \chone, in particular, 
 \exerciserefrange{ex.logit}{ex.exponential}.
%
%  \exthirtyone--\exthirtysix.
% uvw to HXY>0


%%%%%%%%%% (many are repeated from _s1aa)
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% \prechapter{About Chapter}
\mysetcounter{page}{54} 
\chapter{More about  Inference}
\label{ch.bayes}\label{ch1b}
\addtopic{3}{inference}
\addtrack{3}{inferencecourse}
\addtrack{1}{infotheorycourse}
\addtrack{2}{itprnncourse}
% contains the decay problem, the bent coin, and blood.
%
%
% solutions to exercises are in _s8.tex
%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\fakesection{Inference intro}
 It is not a controversial statement that Bayes's theorem
 provides the correct language for describing the inference of a 
 message communicated over a
 noisy channel, as we used it in \chref{ch1} (\pref{sec.bayes.used}).
 But strangely, when it comes to other
 inference problems, the use of
% approaches based on
 Bayes's theorem
 is not so widespread.
%let's take a little tour of other applications of 
% probabilistic inference. 

 Coherent inference can always be mapped onto probabilities (Cox, 1946).
% \cite{cox}.
  Many
 textbooks on statistics do not mention this fact, so maybe it is worth
 using an example to emphasize the contrast between Bayesian inference
 and the orthodox methods of statistical inference.
% involving
% estimators, confidence intervals, hypothesis testing, etc.
 If this topic interests you, excellent further reading is
 to be found in the works of Jaynes, for example,
 \citeasnoun{Jaynes.intervals}.

\section{A first example of probability theory}
\label{sec.decay}
 When I was an undergraduate in Cambridge, I was privileged to receive
 supervisions from Steve Gull. Sitting at his desk in a dishevelled
 office in St.\ John's College, I asked him how one ought to answer an
 old Tripos question (\exerciseonlyref{ex.exponential}):
\begin{quotation}
	Unstable particles are emitted from a source and decay at a
	distance $x$, a real number
	 that has an exponential probability distribution
	with characteristic length $\lambda$.  Decay events can only
	be observed if they occur in a window extending from $x=1\cm$
	to $x=20\cm$. $N$ decays are observed at locations $\{x_1 ,
	\ldots , x_N\}$. 
% ($x_n$ is a real number.)
	 What is $\lambda$?

\end{quotation}
\begin{center}
\mbox{\psfig{figure=\FIGS/decay.ps,width=3in,angle=90,%
bbllx=154mm,bblly=147mm,bbury=257mm,bburx=175mm}}\\
\end{center}
 I had scratched my head over this for some time.
 My education had provided me with a couple of  approaches to solving
 such inference problems: constructing `\ind{estimator}s'
 of the unknown parameters; or  `fitting' the model to
 the data, or a processed version of the data.

 Since the mean of an unconstrained exponential distribution is $\l$,
 it seemed reasonable to examine the sample mean $\bar{x} = \sum_n x_n / N$
 and see
 if  an estimator $\hat{\l}$  could be obtained from it.
 It was evident that the {estimator}
 $\hat{\l}=\bar{x}-1$ would be appropriate for
 $\lambda \ll 20\,$cm, but not for cases where the
 truncation of the distribution at the right hand side
 is significant; with a little ingenuity and the introduction of
 ad hoc bins, promising estimators for $\lambda \gg 20$ cm could be
 constructed.  But there was no obvious estimator that would work
 under all conditions.

 Nor could I find a satisfactory
 approach based on fitting the density $P(x|\lambda)$ to
 a histogram derived from the data.  I was stuck.
 
 What is the general solution to this problem and others like it?
 Is it always necessary, when confronted by a new inference problem,
 to grope in the dark for appropriate `estimators' and worry
 about finding the `best' estimator (whatever that means)?

%% I hope you have already stopped and thought about this question.
% problem. 
% \\ \mbox{~}\dotfill\ \mbox{~} \\
% \newpage

 Steve 
% Gull
 wrote down the probability of one data point, given $\l$: 
\beq
        P(x|\lambda) =\left\{ \begin{array}{ll}
        {\textstyle \dfrac{1}{\l}} 
        e^{-x/\lambda } / Z(\lambda) & 1 < x < 20 \\
 0                                      & {\rm otherwise }
        \end{array} \right.
\label{basic.likelihood}
\eeq
where 
\beq
        Z(\l) = \int_1^{20} dx \: \frac{1}{\l} e^{-x/\lambda } = \left(e^{-1/\l} - e^{-20 /\l} \right).
\label{basic.likelihood.Z}
\eeq
 This seemed obvious enough. 
 Then he wrote {\dem{\ind{Bayes's theorem}}}: 
\beqan
\label{bayes.theorem}
% \begin{array}{l}
 P(\l|\{x_1, \ldots, x_N\}) &=& 
        \frac{P(\{x\}|\lambda) P(\l)}{P(\{x\}) } \\
%&& \hspace{0.5in}
 &\propto&    \frac{1}{\left( \l Z(\l) \right)^N}
                 \exp \left( \textstyle - \sum_1^N x_n / \l \right)  P(\l) 
 .
% \end{array}
\label{basic.posterior}
\eeqan
 Suddenly, the straightforward distribution $P(\{x_1 ,\ldots, x_N \}|
 \l)$, defining the probability of the data given the hypothesis $\l$,
 was being turned on its head so as to define the probability of a
 hypothesis given the data.  A simple figure showed the probability of
 a single data point $P(x|\l)$ as a familiar function of $x$, for
 different values of $\l$ (figure \ref{decay.like.1}).  Each curve was
 an innocent exponential, normalized to have area 1.  Plotting the
 same function as a function of $\l$ for a fixed value of $x$,
 something remarkable happens: a peak emerges (figure
 \ref{decay.like.2}). To help understand these two points
 of view of the one function, \figref{decay.probandlike}
 shows a surface plot of   $P(x|\l)$ as a function of $x$ and $\l$.

\begin{figure}
\figuremargin{%
\begin{center}
\mbox{\psfig{figure=\FIGS/decay.like.1.ps,%
width=2 in,angle=-90}}
\end{center}
}{%
\caption{{The probability density $P(x|\l)$ as a function of $x$.}}
\label{decay.like.1}
}%
\end{figure}
\begin{figure}
\figuremargin{%
\begin{center}
\mbox{\psfig{figure=\FIGS/decay.like.2.ps,%
width=2 in,angle=-90}}
\end{center}
}{%
\caption[a]{{The probability density $P(x|\l)$ as a function of $\l$,
 for three different values of $x$.}
 \small
 When plotted this way round, the function is known as 
 the {\dem\ind{likelihood}\/} of $\l$.
 The marks indicate the three values of $\l$, $\l=2,5,10$,
 that were used in the preceding figure.
}
\label{decay.like.2}
}
\end{figure}
%\begin{figure}
%\figuremargin{%
\marginfig{
\begin{center}
\begin{tabular}{c}
\makebox[0pt][l]{\hspace*{0.21in}\raisebox{0.435in}{$x$}}%
\mbox{\psfig{figure=\FIGS/probandlike.ps,%
width=2in,angle=-90}%
\makebox[0pt][l]{\hspace*{-0.352in}\raisebox{0.435in}{$\l$}}}\\[-0.3in]% was -0.6 Sat 5/10/02
\end{tabular}\end{center}
%}{%
\caption[a]{{The probability density $P(x|\l)$ as a function of  $x$
 and $\l$.}
}
\label{decay.probandlike}
}
%\end{figure}
\begin{figure}
\figuremargin{%
\begin{center}
\mbox{\psfig{figure=\FIGS/decay.like.xxx.ps,%
width=2in,angle=-90}}
\end{center}
}{%
\caption[a]{{The likelihood function in the case of a six-point  dataset, 
  $P(\{x\} = \{1.5,2,3,4,5,12\}|\lambda)$, as a function of   $\l$.}
}
\label{decay.like.xxx}
}
\end{figure}
 For a dataset consisting of several  points, \eg, the
 six points
 $\{x\}_{n=1}^{N} = \{1.5,2,3,4,5,12\}$,  the likelihood function
 $P(\{x\}|\lambda)$ is the product of the $N$ functions of $\l$, 
 $P(x_n|\l)$ (\figref{decay.like.xxx}).
%
% Added Mon 4/2/02 
\marginpar{\footnotesize{[If you have any difficulty understanding this chapter I recommend
 ensuring you are happy with 
 exercises \ref{ex.dieexponential} and \ref{ex.dieexponentialb} (\pref{ex.dieexponentialb})
 then noting their similarity to 
 \exerciseonlyref{ex.exponential}.]}}

 Steve summarised Bayes's theorem
% (equation \ref{bayes.theorem})
 as
 embodying the fact that what you know about $\lambda$ 
 after the data arrive is what
 you knew before [$P(\lambda)$], and what the data told you 
 [$P(\{x\}|\lambda)$]. Probabilities are used here to 
 quantify degrees of belief. 
% The probability 
% of $\lambda$ is a quantification of what you know about $\lambda$. 
 To nip possible confusion in the bud, it must be
 emphasized that the hypothesis $\lambda$ which correctly describes
 the situation is {\em not\/} a stochastic variable, and the fact that
 the Bayesian uses a probability\index{probability!Bayesian}
 distribution $P$ does {\em not\/} mean
 that he thinks of the world as stochastically changing its nature
 between the states described by the different hypotheses. He uses the
 notation of probabilities to represent his {\em beliefs\/} about the mutually
 exclusive micro-hypotheses (here, values of $\l$),
 of which only one is actually true.  That
 probabilities can denote degrees of belief, given assumptions, seemed
 intuitive to me.
% , and is proved  by Cox  (1946). 
% \citeasnoun{cox}.
% . Anyone who does not find it reasonable to use
% probabilities to quantify degrees of belief can read
% paper, where it is proved to be
% valid.


\label{sec.decayb}
 The posterior probability distribution
% of equation
 (\ref{basic.posterior}) represents 
 the unique and complete solution to the problem. 
 There is no need to invent\index{classical statistics!criticisms}
 `estimators'; nor do we need to invent 
 criteria for comparing alternative estimators with each other. 
 Whereas orthodox statisticians offer twenty ways of solving a
 problem, and another twenty different criteria for deciding which of
 these solutions is the best, Bayesian statistics only offers one
 answer to a well-posed problem.

\subsection{Assumptions in inference}
 Our inference is conditional on our assumptions [for example, the
 prior $P(\lambda)$]. Critics view such priors as a difficulty because 
 they are  `subjective', but I
 don't see how it could be otherwise.  How can one perform inference
 without making assumptions? 
 I believe that it is of great value that Bayesian
 methods force one to make these tacit assumptions explicit.  

 First,
 once assumptions are made, the inferences are objective and unique,
 reproducible with complete agreement by anyone who has the same
 information and makes the same assumptions.  For example, given the
 assumptions listed above, $\H$, and the data $D$,
% from an experiment
% measuring decay lengths,
 everyone will agree about the posterior
 probability of the decay length $\l$:
\beq
P(\l|D,\H) = \frac{ P(D|\l,\H) P(\l|\H) }{ P(D|\H) } .
\eeq

 Second, when the assumptions are explicit, they are easier to
 criticize, and easier to modify -- indeed,
 we can quantify the sensitivity of our inferences to
 the details of the assumptions. For example,
 we can note from the likelihood curves 
 in figure \ref{decay.like.2} that in the case of a single data point at 
 $x=5$, the likelihood 
 function is less strongly peaked than in the case $x=3$;  the 
 details of the prior $P(\lambda)$ become  increasingly important as the sample 
 mean $\bar{x}$ gets closer to the middle of the window, 10.5. In the case 
 $x=12$, the likelihood function doesn't have a peak at all -- such data 
 merely rule out small values of $\lambda$, and don't give any information 
 about the relative probabilities of large values of $\lambda$. So 
 in this case, the details of the prior at the small $\lambda$ end 
 of things are not important, but at  the large $\lambda$ end, the prior 
 is important. 
%  is whatever we knew before 
%   the experiment, \ie, our prior.

 Third, when we are not sure which of various alternative assumptions
 is the most appropriate for a problem, we can treat this question as
 another inference task.  Thus, given data $D$, we can
% learn from the data  
 compare alternative assumptions $\H$ using Bayes's theorem: 
\beq
P(\H|D,\I) = \frac{ P(D|\H,\I) P(\H|\I) }{ P(D|\I) } ,
\label{basic.ev}
\eeq
 where $\I$ denotes the highest assumptions, which we are not
 questioning.  

 Fourth, we can take into account our uncertainty regarding such
 assumptions when we make subsequent predictions. Rather than choosing
 one particular assumption $\H^{*}$, and working out our predictions
 about some quantity $\bt$, $P(\bt|D,\H^{*},\I)$, we obtain
 predictions that take into account our uncertainty about $\H$ by
 using the sum rule:
\beq
P(\bt | D, \I) = \sum_{\H} P(\bt | D, \H , \I ) P(\H|D,\I) .
\label{basic.marg}
\eeq
 This is another contrast with orthodox statistics, in which it is
 conventional to `test' a default model, and then, if the test
 `accepts' the model at some `significance level', to use exclusively that model  to make
 predictions.

 Steve thus persuaded me that
\begin{quotation}
        Probability theory reaches parts that ad hoc methods cannot reach.
\end{quotation}
% However, that is a topic for another lecture. 

 Let's look at a few more examples of simple inference problems. 
\section{The bent coin}
\label{sec.bentcoin}
 A \ind{bent coin}\index{inference problems!bent coin}
 is tossed $F$ times; we observe a sequence $\bs$ of 
 heads and tails (which we'll denote by the symbols $\ta$ and $\tb$).
 We wish to know the bias of the coin, and predict 
 the probability that the next toss will result in a head. 
 We first encountered this task in \exampleref{exa.bentcoin},
 and we will encounter it again
 in chapter \chfour, when we discuss adaptive data compression. 
% the adaptive encoder for $a$s and $b$s. 
 It is also the original inference problem studied by
% Rev.\
 {Thomas Bayes}
 in his essay published in 1763.\index{Bayes, Rev.\ Thomas}
% cite{Bayes}

 As in
% \chref{ch.prob.ent}
 \exerciseref{ex.postpa}, we will
 assume 
% In chapter \chfour\ we assumed
 a uniform prior distribution and
 obtain a posterior distribution by multiplying by the likelihood. A
 critic might object, `where did this prior come from?'  I will not
 claim that the uniform prior is in any way fundamental; indeed
 we'll give examples of nonuniform priors later.  The prior is
% It is simply
 a subjective assumption. One of the themes of this book is:
%
% put this back somewhere?
%
% One way to justify the need for a prior is
% to assume, as in  chapter \chfour,
% that our task is simply to make a code to encode the
% outcome $\bs$ as efficiently as possible. We have to compress the
% data from the source somehow, and any choice of a compression scheme
% must correspond to a prior distribution over coin biases.  I see no
% way round this.  The choice of code implies an assumed probability
% distribution over outcomes.
%\begin{quotation}
\begin{quote}
\noindent
        You can't do  inference -- or  data compression -- without
 making assumptions.
%        You can't do data compression -- or inference -- without
% making assumptions.
\end{quote}
%\end{quotation}

%
% change notation? f_H?????????????????????????????????
% 
\subsubsection*{Likelihood function}
 We give the name $\H_1$ to our assumptions. [We'll be introducing
 an alternative set of assumptions in a moment.]
 The probability, given $p_{\ta}$, that  $F$ tosses
 result in a sequence $\bs$
 that contains $\{F_{\ta},F_{\tb}\}$ counts of the two outcomes
%  $\{ a , b \}$
 is
\beq
        P( \bs | p_{\ta} , F,\H_1 ) =  p_{\ta}^{F_{\ta}} (1-p_{\ta})^{F_{\tb}} .
\label{eq.pa.likeb}
\eeq
 [{For example, $P(\bs\!=\!{\tt aaba}|p_{\ta},F\!=\!4,\H_1)
 = p_{\ta}p_{\ta}(1-p_{\ta})p_{\ta}.$}]
 This function of $p_{\ta}$ (\ref{eq.pa.likeb}) defines the likelihood function.
% Model 1
 Our first model assumes a uniform prior distribution for $p_{\ta}$,
\beq
        P(p_{\ta}|\H_1) = 1 , \: \: \: \: \: \: p_{\ta} \in [0,1] 
\label{eq.pa.priorb}
\eeq
 and $p_{\tb} \equiv 1-p_{\ta}$.


\subsubsection{Inferring  unknown parameters}
 Given a string of length $F$ of which $F_{\ta}$ are $\ta$s and 
 $F_{\tb}$ are $\tb$s we are interested in (a) inferring 
 what $p_{\ta}$ might be;  (b) predicting the probability of an $\ta$ 
 or $\tb$ being the next character.

 Assuming $\H_1$ to be true, the posterior probability of $p_{\ta}$, given a
 string $\bs$ of length $F$ that has 
 counts  $\{F_{\ta},F_{\tb}\}$, is, by Bayes's theorem,
\beqan
        P( p_{\ta} | \bs ,F,\H_1) &=& 
        \frac{  P( \bs | p_{\ta} , F,\H_1 ) P(p_{\ta}|\H_1) }{ P(  \bs | F,\H_1 )  } .
\label{eq.pa.post}
\label{eq.pa.post.again}
\eeqan 
 The factor $P( \bs | p_{\ta} , F,\H_1 )$, which, as a function
 of $p_{\ta}$, is known as the likelihood function,
 was given in \eqref{eq.pa.likeb}; the prior
 $P(p_{\ta}|\H_1)$  was given in \eqref{eq.pa.priorb}. 
 Our inference of $p_{\ta}$ is thus:
% The posterior 
\beqan
        P( p_{\ta} | \bs ,F,\H_1) &=& 
        \frac{    p_{\ta}^{F_{\ta}} (1-p_{\ta})^{F_{\tb}}  }{ P(  \bs | F,\H_1 )  } .
\label{eq.pa.postb.again}
\eeqan 
 The normalizing constant is given by the beta integral
\beq
        P(  \bs | F,\H_1 )  = \int_0^1 d p_{\ta} \: p_{\ta}^{F_{\ta}} (1-p_{\ta})^{F_{\tb}} = 
        \frac{\Gamma(F_{\ta}+1)\Gamma(F_{\tb}+1)}{ \Gamma(F_{\ta}+F_{\tb}+2) } 
        = \frac{ F_{\ta}! F_{\tb}! }{ (F_{\ta} + F_{\tb} + 1)! } .
\label{eq.evidenceZ}
\eeq
 Our inference of $p_{\ta}$, assuming $\H_1$ to be true,
 is thus given by \eqref{eq.pa.postb.again}. 

%%%%%%%%%%%%%
\exercisxA{2}{ex.postpaII}{
 Sketch the posterior probability $P( p_{\ta} | \bs\eq {\tt aba} ,F\eq 3)$.
 What is the most probable value of $p_{\ta}$ (\ie, the value that maximizes 
 the posterior probability density)? What is the mean value of $p_{\ta}$ 
 under this distribution?

 Answer the same questions for
 the posterior probability $P( p_{\ta} | \bs\eq {\tt bbb} ,F\eq 3)$.
}
 
\subsubsection{From inferences to predictions}
 Our prediction about the next toss, the probability of  the next toss's  being an $\ta$,
 is obtained by integrating over $p_{\ta}$. This has the effect of 
 taking into account our uncertainty about $p_{\ta}$ when making predictions.
 By the sum rule,
\beqan
        P(\ta |  \bs ,F)& =& \int d p_{\ta} \: P(\ta | p_{\ta} ) P(p_{\ta} | \bs,F )  .
\eeqan
 The probability of an $\ta$ given $p_{\ta}$ is simply $p_{\ta}$, 
 so
\beqan
\lefteqn{        P(\ta |  \bs ,F)  
        = \int d p_{\ta} \: p_{\ta} \frac{p_{\ta}^{F_{\ta}} (1-p_{\ta})^{F_{\tb}}}
        {P(  \bs | F ) }  }
\\
&=& \int d p_{\ta} \: \frac{p_{\ta}^{F_{\ta}+1} (1-p_{\ta})^{F_{\tb}}}
        {P(  \bs | F ) } 
\\
&=& \left.
% \frac
 { \left[ \frac{ (F_{\ta}+1)! F_{\tb}! }{ (F_{\ta} + F_{\tb} + 2)! } \right] } \right/
 { \left[  \frac{ F_{\ta}! F_{\tb}! }{ (F_{\ta} + F_{\tb} + 1)! } \right] } 
\:\: = \:\: \frac{ F_{\ta}+1 }{ F_{\ta} + F_{\tb} + 2 } ,
\label{eq.laplacederived}
\eeqan
 which is known as {\dem{\ind{Laplace's rule}}}.


\section{The bent coin and model comparison}
\label{sec.bentcoin2}
 Imagine that a scientist introduces another theory for our data. 
 He asserts that the source is not really a bent coin but is really a 
 perfectly formed die with one face painted heads (`$\ta$') and the other five
 painted tails (`$\tb$'). Thus the parameter $p_{\ta}$, which in the original model,
 $\H_1$, could take any value between 0 and 1, is according 
 to the new hypothesis, $\H_0$, not a free parameter at all; rather, it
 is equal to 
% p_{\ta} = 
 $1/6$. [This hypothesis is termed $\H_0$ so that the suffix of each model
 indicates its number of free parameters.] 

 How can we compare these two models in the light of data? 
 We wish to
 infer  how probable 
 $\H_1$ is relative to $\H_0$.
% , so we can use Bayes's theorem again. 
% Let us write down the first model's probabilities again.

% {\em Here we repeat some material from the arithmetic coding
% chapter, chapter \ref{ch4}.}

\subsubsection*{Model comparison as inference}
 In order to perform model comparison, we write down 
 Bayes's theorem again, but this time with a different 
 argument on the left hand side. We wish to know how probable 
 $\H_1$ is given the data.
\beq
 P( \H_1 | \bs ,F ) = \frac{ P(  \bs | F,\H_1 )  P( \H_1 ) }{  P(  \bs | F) } .
\eeq
 Similarly, the posterior probability of $\H_0$ is 
\beq
 P( \H_0 | \bs ,F ) = \frac{ P(  \bs | F,\H_0 )  P( \H_0 ) }{  P(  \bs | F) }.
\eeq
 The normalizing constant in both cases is $P(\bs|F)$, which is the total 
 probability of getting the observed data.
% regardless of which model  is true.
 If $\H_1$ and $\H_0$ are the only models under 
 consideration, this  probability is given by the sum rule: 
\beq
         P(  \bs | F) =  P(  \bs | F,\H_1 )  P( \H_1 ) 
                 + P(  \bs | F,\H_0 )  P( \H_0 ) .
\eeq
 To evaluate the posterior probabilities of the hypotheses we 
 need to assign values to the prior probabilities $P( \H_1 )$ 
 and $P( \H_0 )$; in this case, we might set these to 1/2 each. And
 we need to evaluate the data-dependent terms
 $P(  \bs | F,\H_1 )$ and $P(  \bs | F,\H_0 )$. 
 We can give names to these quantities. 
 The quantity $P(  \bs | F,\H_1 )$ is a measure of how much the data 
 favour $\H_1$, and we call it the {\dbf\ind{evidence}} for model $\H_1$. 
 We already encountered this quantity in equation (\ref{eq.pa.post.again})
 where it appeared 
 as the normalizing constant of the first inference we made -- the 
 inference of $p_{\ta}$ given the data. 

\begin{description}
\item[Model comparison --  message number 1:]
 The evidence for a model is usually
 the normalizing constant of an earlier Bayesian inference.
\end{description}

 We evaluated the normalizing constant for model $\H_1$ in
 (\ref{eq.evidenceZ}).
 The evidence for model $\H_0$ is very simple because this model 
 has no parameters to infer. Defining $p_0$ to be $1/6$, we have
\beq
        P(  \bs | F,\H_0 )  =  p_0^{F_{\ta}} (1-p_0)^{F_{\tb}} .
\eeq

 Thus the posterior probability ratio  of model $\H_1$ to model $\H_0$ is
\beqan
\frac{ P( \H_1 | \bs ,F )}
{P( \H_0 | \bs ,F )}
& =&
 \frac{ P( \bs | F,\H_1 ) P( \H_1 ) }
      { P( \bs | F,\H_0 ) P( \H_0 ) }
\\ 
 &=& 
\frac{ \frac{ F_{\ta}! F_{\tb}! }{ (F_{\ta} + F_{\tb} + 1)! } }{  p_0^{F_{\ta}} (1-p_0)^{F_{\tb}} } .
\label{eq.compare.final}
\eeqan
 Some values of this posterior probability ratio are illustrated in 
 table \ref{tab.mod.comp}. The first five lines illustrate that 
 some outcomes  favour one model, and some favour the other.
 No  outcome is completely incompatible with either model.
\begin{table}
\figuremargin{%
\begin{center}
\begin{tabular}{cccl}  \toprule
$F$ & Data $(F_{\ta},F_{\tb})$ & $\displaystyle \frac{ P( \H_1 | \bs ,F )}
                                {P( \H_0 | \bs ,F )}$ \\  \midrule
6 & (5,1) & 222.2 & \\
6 & (3,3) & 2.67 &\\
6 & (2,4) & 0.71 & =  1/1.4 \\
6 & (1,5) & 0.356 & = 1/2.8 \\
6 & (0,6) & 0.427 & = 1/2.3 \\ \midrule
20 & (10,10) & 96.5 & \\
20 & (3,17) & 0.2 & = 1/5 \\
20 & (0,20) & 1.83 &  \\  \bottomrule
\end{tabular}
\end{center}
}{%
\caption{Outcome of model comparison between models $\H_1$ and $\H_0$
 for the `bent coin'. Model $\H_0$ states that  $p_{\ta}=1/6, p_{\tb}=5/6$.}
\label{tab.mod.comp}
}
\end{table}
 With small amounts of data (six tosses, say) it is typically not the case that 
 one of the two models is overwhelmingly more probable than 
 the other. But with more data, the evidence against $\H_0$ given 
 by any data set with the ratio $F_{\ta}:F_{\tb}$ differing from 1:5 mounts up.
%
% add figure showing some typical histories
%
 You can't predict in advance how much data is needed to be pretty sure
 which theory is true.\index{key points!how much data needed}  It depends what $p_{\ta}$ is.
%
% THIS IS A VERY GENERAL
% message for machine learning.

% corrected Wed 28/11/01
 The simpler model, $\H_0$, since it has no adjustable parameters, 
 is able to lose out by the biggest margin. The odds may be hundreds to one 
 against it. The more complex model can never lose out 
 by a large margin; there's no data set that is actually {\em unlikely\/}
 given model $\H_1$.
\exercisxB{2}{ex.evidencebounds}{
 Show that after $F$ tosses have taken place, the
 biggest value that the log evidence ratio
\beq
\log \frac{ P( \bs | F,\H_1 ) }
          { P( \bs | F,\H_0 ) }
\eeq
 can have scales {\em linearly\/} with $F$ if
 $\H_1$ is more probable, but
 the log evidence in favour of $\H_0$ can grow
 at most as $\log F$.
}
\exercisxB{3}{ex.evidenceest}{
 Putting your sampling theory hat on, assuming $F_{\ta}$ has not yet been measured, 
 compute a plausible range that
% the mean and variance -- or some sort of most probable value, and indication of spread -- of the
 the log evidence ratio might lie in, as a function of $F$ and
 the true value of $p_{\ta}$,
 and sketch it
 as a function of $F$ for $p_{\ta}=p_0=1/6$, $p_{\ta}=0.25$,
 and $p_{\ta}=1/2$.
 [Hint:  sketch the log evidence as a function
 of the random variable $F_{\ta}$ and work out the mean
 and standard deviation of $F_{\ta}$.]
% [Hint: Taylor-expand the log evidence as a function
% of $F_{\ta}$.]
}
\subsection{Typical behaviour of the evidence}
% see figs/sixtoone
% and bin/sixtoone.p
 \Figref{fig.evidencetyp} shows the log evidence ratio
 as a function of the number of
 tosses, $F$, in a number of simulated experiments.
 In the left-hand experiments, $\H_0$ was true.
 In the right-hand ones, $\H_1$ was true, and the value of
 $p_{\ta}$ was either 0.25 or 0.5.
% \newcommand{\sixtoone}[2]{%  in newcommands1.tex
\begin{figure}
\figuremargin{%
\small%
\begin{center}
\begin{tabular}{cccc}
$\H_0$ is true &&
\multicolumn{2}{c}{$\H_1$ is true} \\ \cmidrule{1-1}\cmidrule{3-4}
\sixtoone{$p_{\ta}=1/6$}{h09}&&
\sixtoone{$p_{\ta}=0.25$}{h69}&
\sixtoone{$p_{\ta}=0.5$}{h29}\\
\sixtoone{}{h08}&&
\sixtoone{}{h68}&
\sixtoone{}{h28}\\
\sixtoone{}{h07}&&
\sixtoone{}{h67}&
\sixtoone{}{h27}\\
\end{tabular}
\end{center}
}{%
\caption[a]{Typical behaviour of the evidence in favour of $\H_1$ as
 bent coin tosses accumulate
 under three different conditions. Horizontal axis is the number of
 tosses, $F$. The vertical axis on the left is
$\log \frac{ P( \bs | F,\H_1 ) }
          { P( \bs | F,\H_0 ) }$;
  the right hand vertical axis shows the values of 
$\frac{ P( \bs | F,\H_1 ) }
          { P( \bs | F,\H_0 ) }$.

 (See also \protect\figref{fig.evidenceMSD}, \pref{fig.evidenceMSD}.)
}
\label{fig.evidencetyp}
}%
\end{figure}
 

 We will discuss model comparison more in a later chapter. 

\section{An example of legal evidence}
 The following example
 (\exerciseonlyref{ex.blood})  illustrates that there is more 
 to Bayesian inference than the priors.

\begin{quote}
% Two people have left traces of their own blood at the scene of a
% crime.  Their blood groups can be reliably identified from these
% traces and are found 
% to be of type `O' (a common type in the local population, having
% frequency 60\%) and of type `AB' (a rare type, with frequency 1\%).
% A suspect is tested and found to have type `O' blood. 
% A careless lawyer might claim that the fact that the suspect's
% blood type was found at the scene is positive evidence for the theory
% that he was present. But do these data
% $D=$ \{type `O' and `AB' blood were found at scene\} make it more
% probable that this suspect was one of the two people present at the
% crime? 
 Two people have left traces of their own blood at the scene of a
 crime. 
 A suspect, Oliver, is tested and found to have type `O' blood.
 The blood groups of the two traces 
 are found
 to be of type `O' (a common type in the local population, having
 frequency 60\%) and of type `AB' (a rare type, with frequency 1\%).
  Do these data
 (type `O' and `AB' blood were found at scene) give evidence in favour 
 of the proposition  that Oliver was one of the two people present at the
 crime? 

\end{quote}
 A careless \ind{lawyer} might claim that the fact that the suspect's
 blood type was found at the scene is positive evidence for the theory
 that he was present. But this is not so.

 Denote the proposition `the suspect and one unknown person were
 present' by $S$. The alternative, $\bar{S}$, states `two unknown people
 from the population were present'. 
 The prior  in  this problem is the prior probability ratio between the 
 propositions $S$ and $\bar{S}$. This quantity is important to the final 
 verdict and would be based on all other available information 
 in the case. Our task here is just to evaluate the contribution made by the 
 data $D$, that is, the likelihood ratio, $P(D|S,\H)/P(D|\bar{S},\H)$.
 In my view, a jury's task should generally be to multiply together carefully 
 evaluated 
 likelihood ratios from each independent piece of admissible evidence
 with an equally carefully reasoned prior probability.
 [This  view is shared by many statisticians but learned British appeal judges\index{judge}   
 recently disagreed and actually overturned the verdict of a trial
 because the \index{jury}{jurors} {\em had\/} been taught to use Bayes's theorem to 
 handle complicated \ind{DNA} evidence.]

%
 The probability of the data given $S$ is the probability that one unknown person 
 drawn from the population has blood type AB:
\beq
P(D|S,\H) = p_{\rm{AB}} 
\eeq
 (since given $S$, we already know that one trace will be of type O). 
The probability of the data given  $\bar{S}$ is the 
probability that  two unknown people drawn from the population have 
types O and AB: 
\beq
P(D|\bar{S},\H) = 2 \, p_{\rm{O}} \, p_{\rm{AB}} .
\eeq
 In these equations $\H$ denotes the assumptions that two people were
 present and left blood there, and that the probability distribution
 of the blood groups of unknown people in an explanation is the same
 as the population frequencies. 
% Our posterior probability ratio for
% $S$ relative to $\bar{S}$ is obtained by multiplying the probability
% ratio based on all other independent information by the ratio of
% these likelihoods. The most straightforward way to summarize the
% contribution of any piece of evidence is in terms of a likelihood
% ratio.

 Dividing, we obtain the likelihood ratio: 
\beq
        \frac{P(D|S,\H)}{P(D|\bar{S},\H)} = \frac{1}{2 p_{\rm O}} 
        = \frac{1}{2 \times 0.6}
         = 0.83 .
\eeq
 Thus the data in fact provide weak evidence {\em against\/} the
 supposition that Oliver was present.

 This result may be found surprising, so let us examine it from
 various points of view. First consider the case of another suspect,
 Alberto, 
 who has type AB.  Intuitively, the data do provide evidence in favour
 of the theory $S'$ that this suspect was present, relative to the
 null hypothesis $\bar{S}$. And indeed the likelihood ratio in this
 case is:
\beq
        \frac{P(D|S',\H)}{P(D|\bar{S},\H)} = \frac{1}{2\, p_{\rm{AB}}} = 50.
\eeq 
 Now let us change the situation slightly; imagine that 99\% of people
 are of blood type O, and the rest are of type AB. Only these two 
 blood types exist in the population. The data at the
 scene are the same as before. Consider again how these data influence
 our beliefs about Oliver,
 a suspect of type O, and Alberto, a suspect of type
 AB.  Intuitively, we still believe that the presence of the rare AB
 blood provides positive evidence that  \ind{Alberto} was
 there.  But does
% we still have the feeling that
 the fact that type O
 blood was detected at the scene favour the hypothesis that
 Oliver was present? If this were the case, that would mean that
 regardless of who the suspect is, the data make it more probable they
 were present; everyone in the population would be
 under greater suspicion, which would be absurd.  The data may be {\em
 compatible\/} with any suspect of either blood type being present, but
 if they provide  evidence {\em for\/} some theories, they must also
 provide evidence {\em against\/} other theories.

 Here is another way of thinking about this: imagine that instead of
 two people's blood stains there are ten, and that in the entire local
 population of one hundred, there are ninety type O suspects and ten
 type AB suspects.
% Initially all 100 people are suspects. 
 Consider a particular type O suspect, \ind{Oliver}: without any other information,
 and before the blood test results come in,
 there is a one in 10 chance that he was at the scene, since
 we know that 10 out of the 100 suspects were present.  We now get the
 results of blood tests, and find that {\em nine\/} of the ten stains are of
 type AB, and {\em one\/} of the stains is of type O. Does this make it more
 likely that Oliver was there? No,
% although he could have been,
 there is now only a one in ninety chance that he was there, since we
 know that only one person present was of type O.

 Maybe the intuition is aided finally by writing down the formulae for
 the general case where $n_{\rm{O}}$ blood stains of individuals of type O
 are found, and $n_{\rm{AB}}$ of type $\rm{AB}$, a total of $N$ individuals in
 all, and unknown people come from a large population with fractions
 $p_{\rm{O}}, p_{\rm{AB}}$. (There may be other blood types too.) 
 The task is to evaluate the likelihood ratio for the
 two hypotheses:  $S$, `the type O suspect (Oliver)
 and $N\!-\!1$ unknown others
 left $N$ stains'; and $\bar{S}$, `$N$ unknowns left $N$ stains'. The
 probability of the data under hypothesis $\bar{S}$ is just the
 probability of getting $n_{\rm{O}}, n_{\rm{AB}}$ individuals of the two types
 when $N$ individuals are drawn at random from the population:
\beq
        P(n_{\rm{O}},n_{\rm{AB}}|\bar{S}) = 
        \frac{ N! }{ n_{\rm{O}} ! n_{\rm{AB}}! } p_{\rm{O}}^{n_{\rm{O}}} p_{\rm{AB}}^{n_{\rm{AB}}} .
\eeq
 In the case of hypothesis $S$, we need  the distribution of
 the $N\!-\!1$ other individuals:
\beq
        P(n_{\rm{O}},n_{\rm{AB}}|S) = 
        \frac{ (N-1)! }{ (n_{\rm{O}}-1)!  n_{\rm{AB}}! } p_{\rm{O}}^{n_{\rm{O}}-1} p_{\rm{AB}}^{n_{\rm{AB}}} .
\eeq
 The likelihood ratio is:
\beq
        \frac{ P(n_{\rm{O}},n_{\rm{AB}}|S) }{ P(n_{\rm{O}},n_{\rm{AB}}|\bar{S}) }
        = \frac{n_{\rm{O}}/N}{p_{\rm{O}}} .
\eeq
 This is an instructive result. The likelihood ratio, \ie\ the
 contribution of these data to the question of whether Oliver
 was present, depends simply on a comparison of the frequency
 of his blood type
% type O blood
 in the observed data with the background frequency 
% of type O blood
 in the population. There is no dependence on the counts
 of the other types found at the scene, or their frequencies in the
 population.  If there are more type O stains than the average number
 expected  under hypothesis $\bar{S}$, then the data give
 evidence in favour of the presence of Oliver.
 Conversely, if there are fewer type O stains than the expected number
 under $\bar{S}$, then the data reduce the probability of the
 hypothesis that he was there.  In the special case $n_{\rm{O}}/N = p_{\rm{O}}$, the
 data contribute no evidence either way, regardless of the fact that
 the data are compatible with the hypothesis $S$.


\section{Exercises}
% \subsection*{The game show}
%\subsubsection*{The normal rules}
%\subsubsection*{The earthquake scenario}
\exercisxA{2}{ex.3doors}{
  {\sf The \ind{three doors},\index{Monty Hall problem} normal rules.}
% "Let's Make A Deal," hosted by Monty Hall

 On a \ind{game show},\index{doors, on game show}
 a contestant is told the rules as 
 follows:
\begin{quote}
 There are three doors, labelled 1, 2, 3. A single
 prize has been hidden behind one of 
 them. You get to select one door. Initially your chosen door will {\em not\/} 
 be opened. Instead, the gameshow host will open one of the other two doors, 
 and {\em he will do so in such a way as not to reveal the prize.}
 For example, if you first
 choose door 1, he will then open {one\/} of doors 2 and 3, and it 
 is guaranteed that he will choose which one to open so that
 the prize will not be revealed. 

 At this point, you will be given a fresh choice of door:
 you can either stick with your first choice,
 or you can switch to the other 
 closed door.  All the doors will then be opened and 
 you will  receive whatever is behind your final 
 choice of door.
\end{quote}
  Imagine that the contestant chooses door 1 first; then the gameshow host 
 opens door 3, revealing nothing behind the door, as promised. 
 Should the contestant (a) stick with door 1, or (b)
 switch to door 2, or (c) does it make no difference?
}
\exercisxA{2}{ex.3doorsb}{
 {\sf The three doors,  earthquake scenario.}

 Imagine that the game happens again
 and  just as the gameshow host is about to open one of the 
 doors a violent earthquake\index{earthquake, during game show}
 rattles the  building and one of the 
 three doors flies open. It happens to be door 3, and it 
 happens not to have the prize behind it. The contestant had initially 
 chosen door 1.

 Repositioning his toup\'ee,
 the host suggests, `OK, since you chose door 1 initially, 
 door 3 is a valid door for me to open, according to the
 rules of the game; I'll let door 3 stay open. Let's carry on 
 as if nothing happened.'
 
 Should the contestant stick with door 1, or switch to door 2, or
 does it make no difference? Assume that the prize was placed randomly, that
 the gameshow host does not know where it is, and that the door flew open
 because its latch was broken by the earthquake.

 [A similar alternative scenario is a game show whose {\em confused host\/}\index{confused gameshow host}
 forgets the rules, and  where the prize is, and opens one of
 the unchosen doors at random. He opens door 3, and the prize is not revealed.
 Should the contestant choose what's behind door 1 or door 2?
 Does the optimal decision for
 the contestant depend on the contestant's \ind{belief}s about
 whether  the gameshow host is confused or not?]\index{game show}\index{three doors}\index{doors, on game show}\index{prize, on game show}\index{Monty Hall problem}
}
\exercisxB{2}{ex.girlboy}{
%\subsection
{\sf Another example in which the emphasis is not on priors.}
%\begin{quote}
 You visit a family whose three children are all at the local school.
 You don't know anything about the sexes of the children.
 While walking clumsily round the home, you stumble through
 one of the  three unlabelled bedroom doors that you know
 belong, one each, to the three children, and find that the bedroom
 contains \ind{girlie stuff} in sufficient quantities to
 convince you that the child who lives in that bedroom
 is a girl.
  Later, you sneak a look at a letter addressed to the parents,
 which reads ``From the Headmaster:
  we are sending this letter to all parents who have male children at
 the  school to inform them about the following \ind{boyish matters}\ldots''.

 These two sources of evidence establish that at least
 one of the children is
 a girl, and that at least one of the children is a boy.
 What are the probabilities that there are (a) two girls and one boy;
 (b) two boys and one girl?
%\end{quote}
}
% Another example of legal evidence}
\exercisxB{2}{ex.simpsons}{
 Mrs\ S is found stabbed in her family
 garden.
% \index{Simpson, O.J., similar case to} 
 Mr\ S behaves strangely after her death and is considered as
 a suspect. On investigation of police and social records 
 it is found that Mr\ S had  beaten up his wife on at least 
 nine previous occasions. The prosecution advances this 
 data as evidence in favour of the hypothesis that Mr\ S is 
 guilty of the murder. 
 `Ah no,' says 
% Mr.\ Merd-Kopf,
 Mr\ S's highly paid lawyer,\index{lawyer}\index{wife-beater}\index{murder}  
 `{\em statistically}, only one in a  thousand wife-beaters 
 actually goes on to murder his wife.\footnote{In the U.S.A., it 
 is estimated that 
% http://www.umn.edu/mincava/papers/factoid.htm
 2 million women are abused each year by their partners.
 In 1994, 4,739 women were victims of homicide; of those,
% 28 \percent,
 1,326 women (28\%)
    were slain by husbands and boyfriends.\\ (Sources: 
 {\tt http://www.umn.edu/mincava/papers/factoid.htm,\\ 
 http://www.gunfree.inter.net/vpc/womenfs.htm})
% http://www.gunfree.inter.net/vpc/womenfs.htm
%  In keeping 
% with the fictitious nature of this story, the $1/100,000$ 
% figure was made up by me.
 }\label{footnote.murder} So the wife-beating
% , which  is not denied by Mr\ S,
 is not strong evidence at all. In fact, 
 given the wife-beating evidence alone, it's extremely unlikely 
 that he would be the murderer of his wife -- only a 
 $1/1000$ chance. You should therefore find him innocent.'

 Is the lawyer
% Mr\ Merd-Kopf
 right to imply that the history of wife-beating does
 not point to Mr\ S's being the murderer? Or is the lawyer a lying slimy trickster? If 
 the latter, what is wrong with his argument?

 [Having received an indignant letter from a lawyer about
 the preceding paragraph, I'd like to
 add an extra inference exercise at this point:
 {\em Does my suggestion that Mr.\ S.'s lawyer
 may have been a  lying slimy trickster imply that
 I believe {\em all} lawyers are  lying slimy tricksters?} [Answer: No.]]
}
% Lewis Carroll's Pillow Problem
\exercisxB{2}{ex.bagcounter}{ A bag contains one counter, known to be
 either white or black. A white counter is put in, the bag is shaken,
 and a counter is drawn out, which proves to be white. What is now the
 chance of drawing a white counter?
 [Notice that
 the state of the bag, after the operations, is exactly identical to its state before.] 
}

%\subsection*{Another quasi-legal story}
%     \exercis{ex.}{
% During a radio chat show on the health consequences of 
% secondary smoking, it is reported by an expert that 
% twelve recent studies have  investigated whether 
% there was a link between secondary smoking and cancer. 
% Of these, eleven studies  failed to establish a link
% and one study  found significant evidence of a causal 
% link -- secondary smoking increasing the risk of getting 
% cancer.  The expert said that the net evidence from these 
% twelve results was that there was significant evidence of a causal
% link. 
%
% Shortly thereafter, a Mr.\ N.T.\ Social called in in support 
% of smokers' ``rights'' to pollute public air. `If eleven 
% of the studies didn't find a link, and only one found a link, 
% then it's eleven to one  that there isn't a link, isn't it?'
%
% `Well, you clearly don't understand statistics, do you?' responded
% the condescending  host. 
%
% Can you suggest a more helpful explanation of the expert's statement?
%}
% euro.tex
\exercisxB{2}{ex.eurotoss}{
 A statistical statement appeared in 
% \footnote{Quoted by  Charlotte Denny and Sarah Dennis
 {\em The Guardian} on Friday January 4, 2002:
\begin{quote}
 When spun on edge 250
                times, a Belgian one-euro
 coin came up heads 140 times and tails 110. 
 ``It looks very suspicious to me,'' said Barry Blight, a statistics lecturer
  at the London School of Economics.
 ``If the coin were unbiased the
  chance of getting a result as extreme as that would be less than 7\%.''
\end{quote}
 But {\em do\/} these
 data give evidence that the coin is biased rather than fair?

[Hint: see \eqref{eq.compare.final}.]
}

% \input{tex/bayes_occam.tex}
\dvips 
\subchapter{Solutions to Chapter \protect\ref{ch.bayes}'s exercises} % 
\fakesection{Bent coin exercise solns}
\begin{figure}[htbp]
\figuremargin{%
\footnotesize
\begin{center}
\begin{tabular}{cc}
(a) \psfig{figure=figs/aba.ps,width=2in,angle=-90}&
(b) \psfig{figure=figs/bbb.ps,width=2in,angle=-90}\\
$P( p_{\tt{a}} | \bs\!=\!{\tt{aba}} ,F\!=\!3) \propto p_{\tt{a}}^2 (1-p_{\tt{a}})$
&
 $P( p_{\tt{a}} | \bs\!=\!{\tt{bbb}} ,F\!=\!3) \propto (1-p_{\tt{a}})^3$ \\
\end{tabular}
\end{center}
}{%
\caption[a]{Posterior probability for the bias $p_a$ of a bent coin given
 two different data sets.}
\label{fig.aba.bbb}
}%
\end{figure}
\soln{ex.postpa}{
\ben
\item
 $P( p_{\tt{a}} | \bs\!=\!{\tt{aba}} ,F\!=\!3) \propto p_{\tt{a}}^2 (1-p_{\tt{a}})$.
 The most probable value of $p_{\tt{a}}$ (\ie, the value that maximizes 
 the posterior probability density) is $2/3$.
 The mean value of $p_{\tt{a}}$  is $3/5$. 

 See \figref{fig.aba.bbb}(a).
\item
 $P( p_{\tt{a}} | \bs\!=\!{\tt{bbb}} ,F\!=\!3) \propto (1-p_{\tt{a}})^3$.
 The most probable value of $p_{\tt{a}}$ (\ie, the value that maximizes 
 the posterior probability density) is $0$.
 The mean value of $p_{\tt{a}}$  is $1/5$. 

 See \figref{fig.aba.bbb}(b).
\een
}
%/home/mackay/_courses/itprnn/figs
%gnuplot> plot x**2*(1-x)
%gnuplot> set xrange [0:1]
%gnuplot> replot
%gnuplot> set nokey
%gnuplot> set size 0.4,0.4
%gnuplot> replot
%gnuplot> set noytics
%gnuplot> replot
%gnuplot> set yrange [0:0.4]
%gnuplot> replot
%gnuplot> set yrange [0:0.17] 
%gnuplot> replot             
%gnuplot> set term post
%Terminal type set to 'postscript'
%Options are 'landscape monochrome dashed "Helvetica" 14'
%gnuplot> set output "aba.ps"
%gnuplot> replot
%gnuplot> set term X
%Terminal type set to 'X11'
%gnuplot> set yrange [0:1]   
%gnuplot> plot (1-x)**3
%gnuplot> set term post      
%Terminal type set to 'postscript'
%Options are 'landscape monochrome dashed "Helvetica" 14'
%gnuplot> set output "bbb.ps"
%gnuplot> replot


\fakesection{evidence est}
\begin{figure}[htbp]
\figuremargin{%
\small%
\begin{center}
\begin{tabular}{cccc}
$\H_0$ is true &&
\multicolumn{2}{c}{$\H_1$ is true} \\ \cmidrule{1-1}\cmidrule{3-4}
\sixtoone{$p_a=1/6$}{h0MSD}&&
\sixtoone{$p_a=0.25$}{h6MSD}&
\sixtoone{$p_a=0.5$}{h2MSD}\\
\end{tabular}
\end{center}
}{%
\caption[a]{Range of plausible values of the log evidence in favour of $\H_1$ as
 a function of $F$. The vertical axis on the left is
$\log \frac{ P( \bs | F,\H_1 ) }
          { P( \bs | F,\H_0 ) }$;
  the right hand vertical axis shows the values of 
$\frac{ P( \bs | F,\H_1 ) }
          { P( \bs | F,\H_0 ) }$.

 The solid line shows the log evidence if the random variable $F_a$
 takes on its mean value, $F_a = p_aF$. The dotted lines show (approximately)
 the log evidence if $F_a$ is at its 2.5th or 97.5th percentile.

 (See also \protect\figref{fig.evidencetyp}, \pref{fig.evidencetyp}.)
 }
\label{fig.evidenceMSD}
}%
\end{figure}
\soln{ex.evidenceest}{
 The curves in \figref{fig.evidenceMSD} were found by finding the mean and standard deviation
 of $F_a$, then setting $F_a$ to (mean $\pm$ two standard deviations)
 to get a 95\% plausible range for $F_a$, and computing the three
 corresponding values of the log evidence ratio.

}%
\fakesection{simpsons}
\soln{ex.simpsons}{
 The statistic quoted by the lawyer indicates the 
% {prior\/} 
 probability
% \index{Simpson, O.J., similar case to}%
%\index{Simpson, O.J., allusion to}
\index{lawyer}\index{wife-beater}\index{murder} 
 that a randomly selected wife-beater will also murder his wife. 
 The probability that the husband was the murderer, {\em given
 that the wife has been murdered}, is a completely different quantity. 

 To deduce the latter, we need to make  further assumptions about 
 the probability of the wife's being murdered by someone else. 
 If she lives in a neighbourhood with frequent random murders, then 
 this probability is large and the posterior probability that 
 the husband did it (in the absence of other evidence) may not 
 be very large. But in more peaceful regions, it may well be
 that the most likely person to have murdered you, if you are found
 murdered, is 
 one of your closest relatives.

%{\em  Numbers here.}
 Let's work out some illustrative numbers with the help
 of the statistics on page \pageref{footnote.murder}.
 Let $m=1$ denote the proposition that a woman has been murdered;
 $h=1$, the proposition that the husband did it; and $b=1$,
 the proposition that he beat her in the year preceding the
 murder. The statement `someone else did it'
 is denoted by  $h=0$.
 We need to define $P(h|m=1)$, $P(b|h=1,m=1)$, and $P(b=1|h=0,m=1)$
 in order to compute the posterior probability $P(h=1|b=1,m=1)$.
 From the statistics, we can read out  $P(h=1|m=1)=0.28$.
 And if two million women out of 100 million are beaten,
 then $P(b=1|h=0,m=1)=0.02$. Finally, we need a
 value for  $P(b|h=1,m=1)$: if a man murders his wife, how likely is
 it that this is the first time he laid a finger on her? I
 expect it's pretty unlikely; so maybe  $P(b=1|h=1,m=1)$ is 0.9
 or larger.

 By Bayes's theorem, then,
\beq
	P(h=1|b=1,m=1)
 = \frac{ .9 \times .28 }{  .9 \times .28 + .02 \times .72 }
	\simeq 95\% .
\eeq
 One way to make obvious the dishonesty of the slimy lawyer on \pref{ex.simpsons}
 is to construct arguments, with the same logical structure
 as his, that
 are clearly wrong. For example, the lawyer could say `Not only
 was Mrs.\ S murdered, she was murdered between 4.02pm and
 4.03pm. {\em Statistically}, only one in a {\em million\/} wife-beaters 
 actually goes on to murder his wife between 4.02pm and
 4.03pm. So the wife-beating
% , which  is not denied by Mr.\ S,
 is not strong evidence at all. In fact, 
 given the wife-beating evidence alone, it's extremely unlikely 
 that he would murder his wife in this way -- only a 
 $1/1000000$ chance.'
}
\soln{ex.3doors}{
 Let $\H_i$ denote the hypothesis that the prize is behind 
 door $i$.
 We make the following assumptions: the three hypotheses
 $\H_1$, $\H_2$ and $\H_3$ are equiprobable {\em a priori}, \ie, 
\beq
	P(\H_1) = P(\H_2) = P(\H_3) = \frac{1}{3} .
\eeq
 The datum we receive, after choosing door 1,
 is one of $D=3$ and $D=2$ (meaning door 3 or 2 is opened, respectively).
 We assume that these two possible outcomes have the following probabilities.
 If the prize is behind door 1 then the host has a free choice; in 
 this case we assume that the host selects at random between $D=2$ and $D=3$.
 Otherwise the choice of the host is forced and the probabilities
 are 0 and 1.
\beq
\begin{array}{|r@{\,}c@{\,}l|r@{\,}c@{\,}l|r@{\,}c@{\,}l|}
	P( D\!\!=\!\!2 | \H_1) &=& \dfrac{1}{2}  & 
	P( D\!\!=\!\!2 | \H_2) &=& 0  & 
	P( D\!\!=\!\!2 | \H_3) &=& {1} \\
	P( D\!\!=\!\!3 | \H_1) &=& \dfrac{1}{2}  & 
	P( D\!\!=\!\!3 | \H_2) &=& {1}  & 
	P( D\!\!=\!\!3 | \H_3) &=& 0
\end{array} 
\eeq
 Now, using Bayes's theorem, we evaluate the posterior probabilities
 of the hypotheses:
\beq
	P( \H_i | D\!\!=\!\!3 ) = \frac{P( D\!\!=\!\!3 | \H_i)  P(\H_i) }{P(D\!\!=\!\!3) }
\eeq
\beq
\begin{array}{|r@{\,}c@{\,}l|r@{\,}c@{\,}l|r@{\,}c@{\,}l|}
	P(\H_1 | D\!\!=\!\!3) &=& \frac{ (1/2)  (1/3) }{P(D=3) }  & 
	P(\H_2 | D\!\!=\!\!3) &=& \frac{ ({1})  (1/3) }{P(D=3) }  & 
	P(\H_3 | D\!\!=\!\!3) &=& \frac{ ({0})  (1/3) }{P(D=3) } 
\end{array}
\eeq
 The denominator $P(D\!\!=\!\!3)$ is  $(1/2)$ because it is the  normalizing 
 constant for this posterior distribution. 
So
\beq
\begin{array}{|rcl|rcl|rcl|}
	P( \H_1  | D\!\!=\!\!3 ) &=&	 \dfrac{ 1}{3} &
P(\H_2 | D\!\!=\!\!3) &=&	 \dfrac{ 2}{3} &
P(\H_3 | D\!\!=\!\!3) &=&	 0 .
\end{array} 
\eeq
 So the contestant should switch to door 2 in order to have
 the biggest chance of getting the prize.

 Many people find this outcome surprising. There are two 
 ways to make it more intuitive. One is to play the game thirty
 times with a friend and keep track of the frequency with 
 which switching gets the prize. Alternatively, 
 you can perform a thought experiment in which the game is 
 played with a million doors. The rules are now that the contestant
 chooses one door, then the game show host opens 
 999,998 doors in such a way as not to reveal the prize, leaving 
 the {\em contestant's\/}
 selected door  and {\em one other door\/}
 closed. The contestant may 
now stick or switch. 
 Imagine the contestant confronted by a million doors, of which 
  doors 1 and 234,598  have not been opened, door 1 having been 
 the contestant's initial guess. Where do you think the prize is?
}
%
\soln{ex.3doorsb}{
% earthquake rules.
 If door 3 is opened by an earthquake, the inference comes out
 differently --- even though visually the scene looks the same.  The
 nature of the data, and the probability of the data, are both now
 different.  The possible data outcomes are, firstly, that any number
 of the doors might have opened. We could label the eight possible
 outcomes $\bd = (0,0,0), (0,0,1), (0,1,0), (1,0,0), (0,1,1), \ldots,
 (1,1,1)$. Secondly, it might be that the prize is visible after the
 earthquake has opened one or more doors.  So the data $D$ consists of
 the value of $\bd$, and a statement of whether the prize was
 revealed.  It is hard to say what the probabilities of these outcomes
 are, since they depend on our beliefs about the reliability
 of the door latches and the properties of earthquakes,
 but it is possible to extract the desired posterior probability
 without naming the values of $P(\bd|\H_i)$ for each $\bd$.  All that
 matters are the relative values of the quantities $P(D|\H_1)$,
 $P(D|\H_2)$, $P(D|\H_3)$, for the value of $D$ that actually occurred.
 [This is the {\dem\ind{likelihood principle}\/} which
 we met in \sectionref{sec.lp}.]
% !!!!!!!!! add page ref?
 The  value of $D$ that actually occurred is
 $\bd = (0,0,1)$, and no prize visible. First, it is clear that
 $P(D|\H_3)=0$, since the datum that no prize is visible is
 incompatible with $\H_3$.  Now, assuming that the contestant selected
 door 1, how does the probability $P(D|\H_1)$ compare with
 $P(D|\H_2)$?  Assuming that earthquakes are not sensitive to
 decisions of game show contestants,
 these two quantities have to be equal,  by symmetry. We don't know how likely it is
 that door 3 falls off its hinges, but however likely it is, it's just
 as likely to do so whether the prize is behind door 1 or door 2.  So,
 if $P(D|\H_1)$ and $P(D|\H_2)$ are equal, we obtain:
\beq
 \begin{array}{|r@{}c@{}l|r@{}c@{}l|r@{}c@{}l|}
	P(\H_1 | D) &=& \frac{ P(D|\H_1)  (1/3) }{P(D) }  & 
	P(\H_2 | D) &=& \frac{ P(D|\H_2)  (1/3) }{P(D) }  & 
	P(\H_3 | D) &=& \frac{ P(D|\H_3)  (1/3) }{P(D) } 
\\
 &=&	 \dfrac{ 1}{2} &
 &=&	 \dfrac{ 1}{2} &
 &=&	 0 .
\end{array} 
\eeq
 The two possible hypotheses are now equally likely.

 If we assume that 
 the host knows where the prize is and might be acting 
 deceptively, then the answer might be further modified, because we 
 have to view the host's words as part of the data.

 Confused? It's well worth  making sure you
 understand these two gameshow  problems.
 Don't worry, I slipped up on the second problem, the
 first time I met it.

 There is a general rule which  helps immensely
 in confusing probability problems:\index{key points!how to solve probability problems}
\begin{quote}
 Always write down the probability of everything.\\ \hfill {\em (Steve Gull)}
\end{quote}
 From this joint probability, any desired inference can
 be mechanically obtained (\figref{fig.everything}).
\amarginfig{b}{
\begin{center}
\newcommand{\tabwidth}{30}
\newcommand{\tabheight}{80}
\setlength{\unitlength}{1mm}{
\begin{picture}(43,92)(-13,0)
\put(15,90){\makebox(0,0){\small\sf{Where the prize is}}}
\put( 5,85){\makebox(0,0){\small{door}}}
\put(15,85){\makebox(0,0){\small{door}}}
\put(25,85){\makebox(0,0){\small{door}}}
\put( 5,82){\makebox(0,0){\small{1}}}
\put(15,82){\makebox(0,0){\small{2}}}
\put(25,82){\makebox(0,0){\small{3}}}
\put(-1, 5){\makebox(0,0)[r]{\footnotesize{1,2,3}}}
\put(-1,15){\makebox(0,0)[r]{\footnotesize{2,3}}}
\put(-1,25){\makebox(0,0)[r]{\footnotesize{1,3}}}
\put(-1,35){\makebox(0,0)[r]{\footnotesize{1,2}}}
\put(-1,45){\makebox(0,0)[r]{\footnotesize{3}}}
\put( 5,75){\makebox(0,0){\footnotesize{$\displaystyle\frac{p_{\rm none}}{3}$}}}
\put(15,75){\makebox(0,0){\footnotesize{$\displaystyle\frac{p_{\rm none}}{3}$}}}
\put(25,75){\makebox(0,0){\footnotesize{$\displaystyle\frac{p_{\rm none}}{3}$}}}
\put( 5,45){\makebox(0,0){\footnotesize{$\displaystyle\frac{p_{3}}{3}$}}}
\put(15,45){\makebox(0,0){\footnotesize{$\displaystyle\frac{p_{3}}{3}$}}}
\put(25,45){\makebox(0,0){\footnotesize{$\displaystyle\frac{p_{3}}{3}$}}}
\put( 5, 5){\makebox(0,0){\footnotesize{$\displaystyle\frac{p_{1,2,3}}{3}$}}}
\put(15, 5){\makebox(0,0){\footnotesize{$\displaystyle\frac{p_{1,2,3}}{3}$}}}
\put(25, 5){\makebox(0,0){\footnotesize{$\displaystyle\frac{p_{1,2,3}}{3}$}}}
\put(-1,55){\makebox(0,0)[r]{\footnotesize{2}}}
\put(-1,65){\makebox(0,0)[r]{\footnotesize{1}}}
\put(-1,75){\makebox(0,0)[r]{\footnotesize{none}}}
\put(-12,40){\makebox(0,0){\rotatebox{90}{\small\sf{Which doors opened by earthquake}}}}
\multiput(0,0)(0,10){9}{\line(1,0){\tabwidth}}
\multiput(0,0)(10,0){4}{\line(0,1){\tabheight}}
\end{picture}}
\end{center}
\caption[a]{The probability of everything, for the second three-door problem,
 assuming an earthquake has just occurred.
 Here, $p_3$ is the probability that door 3 alone is opened by an earthquake.}
\label{fig.everything}
}
}
\soln{ex.eurotoss}{
% see also
% http://www.dartmouth.edu/~chance/chance_news/recent_news/chance_news_11.02.html
% for lots of practical info on coin biases.
%%%%%%%%%%%%%%%%%%%%%%%%%%% included by _s8.tex
% First, could confirm his sampling theory
%Sampling theory:  number of heads $\sim 125 \pm 8$
%$ \sqrt{62.5}$
%so two-tail probability is
% pr 2*(1-myerf(14.5/7.9))    ans = 0.066440
% if the data were 141 out of 250 then we get 
%  2*(1-myerf(15.5/7.9))    ans = 0.049760
 \index{euro}We compare the models $\H_0$ -- the coin is fair --
 and $\H_1$ -- the \ind{coin} is biased, with
 the prior on its bias set to the uniform
 distribution $P(p|\H_1)=1$.  
% ent, as defined  in this chapter.
\amarginfig{t}{
\begin{center}
\mbox{\psfig{figure=gnu/euro.ps,width=1.4in,angle=-90}}
\end{center}
\caption[a]{The probability distribution of the
 number of heads given the two hypotheses, that
 the coin is fair, and that it is biased, with
 the prior distribution of the bias being uniform.
 The outcome ($D = 140$ heads) gives weak evidence
 in favour of $\H_0$, the  hypothesis that the coin is fair.}
\label{fig.euro}
}
 [The use of a uniform prior seems reasonable to me, since I know
 that some coins, such as American pennies,
 have severe biases when spun on edge; so the situations $p=0.01$ or $p=0.1$
 or $p=0.95$ would not surprise me.]
\begin{aside}
 When I mention $\H_0$ -- the coin is fair -- a pedant would say, `how
 absurd to even consider that the coin is fair -- any coin is surely
 biased to some extent'. And of course I would agree. So will pedants
 kindly understand $\H_0$ as meaning `the coin is fair to within
 one part in a thousand, \ie, $p \in 0.5\pm 0.001$'.
\end{aside}
 The likelihood ratio is:
% given  in \eqref{eq.compare.final}.
\beq
% Bayesian approach: Model comparison:
\frac{ P( D|\H_1  )}
      {P( D|\H_0  )}
= \frac{ \frac{ 140! 110! }{ 251! } }{  1/2^{250} } = 0.48 .
\eeq
 Thus the data give scarcely any evidence
 either way; in fact they
 give weak evidence (two to one) in favour of $\H_0$!
% load 'gnu/euro.gnu'

 `No, no', objects the believer in bias, `your silly uniform
 prior doesn't represent {\em my\/} prior beliefs about
 the bias of biased coins -- I was {\em expecting\/}  only  a small bias'. 
 To be as generous as possible to $\H_1$,
 let's see how well it could fare
 if the prior were presciently set.
 Let us allow a prior of the form
\beq
	P(p|\H_1,\a) = \frac{1}{Z(\a)} p^{\a-1}(1-p)^{\a-1},
	\:\:\:\: \mbox{where $Z(\a)=\Gamma(\alpha)^2/\Gamma(2 \alpha)$}
\eeq
 (a Beta
% Dirichlet (or Beta)
 distribution, with the original uniform prior reproduced
 by setting  $\a=1$). By tweaking $\alpha$, 
 the likelihood ratio for $\H_1$ over $\H_0$,
\beq
 \frac{ P( D|\H_1,\a  )}
      {P( D|\H_0 )} =
 \frac{\Gamma(140 \!+\! \alpha) \, \Gamma(110 \!+\! \alpha) \, \Gamma(2 \alpha) 2^{250}}
              {  \Gamma(250 \!+\! 2 \alpha) \, \Gamma(\alpha)^2 },
\eeq
 can
 be increased a little. It
 is  shown for several values of $\a$ in  \figref{fig.eurot}.
%
% fig.eurot WAS here but has been moved away to avoid a crunch
% 
 Even the most favourable choice of $\alpha$ ($\a \simeq 50$)
 can 
 yield a likelihood ratio of only two to one in favour of
 $\H_1$.

 In conclusion, the data are not `very suspicious'. They
 can be construed as giving at most two-to-one evidence
 in favour of one or other of the two hypotheses.

\begin{aside}
 Are these wimpy likelihood ratios the fault
 of over-restrictive
 priors? Is there any way of producing
 a `very suspicious' conclusion?
 The prior that is best-matched to the data,
 in terms of likelihood, 
%  and one that surely has to be viewed as unreasonable,
 is the prior that sets $p$ to $f \equiv 140/250$ with probability
 one. Let's call this model $\H_*$.
% , since it is a parameterless model like $\H_0$.
 The likelihood ratio  is $P(D|\H_*)/P(D|\H_0) = 2^{250} f^{140} (1-f)^{110}
 =6.1$.  So the strongest evidence that these data can possibly
 muster against the hypothesis that there is no bias is six-to-one.
\end{aside}
% b.blight@lse.ac.uk
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% alternate answers for the case of 141 heads where
% the P value is 0.05 (0.04976)
%
%The outcomes of the computations for this case (141 from 250)
% are
% alpha , likelihood ratio
%
%.3678794412, .3166098681
%1., .6110726692
%2.718281828, 1.049115229
%7.389056099, 1.627382387
%20.08553692, 2.181864309
%54.59815003, 2.303276774
%148.4131591, 1.882663014
%403.4287935, 1.419011740
%1096.633158, 1.168433218
%2980.957987, 1.063851106
%8103.083928, 1.023737702
%22026.46579, 1.008765749
%
% and H_BF achieves 7.796


% This figure belongs earlier.
 \amarginfig{t}{
{\footnotesize
\begin{tabular}{r@{}l@{$\:\:\:$}r@{\hspace*{0.3in}}r@{}l}
\toprule
\multicolumn{2}{c}{$\alpha$}&
\multicolumn{3}{c}{$\displaystyle \frac{ P( D|\H_1,\a  )}
                                        {P( D|\H_0     )}$}\\
\midrule
 &.37 & & &.25\\
1&.0  & & &.48\\
2&.7  & & &.82\\
7&.4  & &1&.3\\
20&   & &1&.8\\
55&   & &1&.9\\
148&  & &1&.7\\
403&  & &1&.3\\
1096& & &1&.1\\
% from euro.dat
\bottomrule
\end{tabular}
}
\caption[a]{Likelihood ratio for various choices of
 the prior distribution's hyperparameter $\alpha$.
}
\label{fig.eurot}
}%
 While we are noticing the absurdly misleading\index{sermon!sampling theory}\index{P-value}
 answers that `sampling theory' statistics produces,
 such as the P-value of 7\% in the  exercise we just solved,
 let's stick the boot in.\label{sec.sampling5percent}
 If we make a tiny change to the data set, increasing the
 number of heads in 250 tosses from 140 to 141,
 we find that the P-value goes below the mystical value of 0.05
 (the P-value is 0.0497).
 The classical statistician would happily squeak `the probability
 of getting a result as extreme as 141 heads is smaller than 0.05 --
 we thus reject the null hypothesis at a significance level of 5\%'.
 The correct answer
 is  shown for several values of $\a$ in  \figref{fig.eurot141}.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% alternate answers for the case of 141 heads where
% the P value is 0.05 (0.04976)
% Radford: Using R, I get that the true p-value (with genuine binomial
%probabilities) for 141 out of 250 is 0.04970679, close to your value.
%5
%The outcomes of the computations for this case (141 from 250)
% are
% alpha , likelihood ratio
%
%.3678794412, .3166098681
%1., .6110726692
%2.718281828, 1.049115229
%7.389056099, 1.627382387
%20.08553692, 2.181864309
%54.59815003, 2.303276774
%148.4131591, 1.882663014
%403.4287935, 1.419011740
%1096.633158, 1.168433218
%2980.957987, 1.063851106
%8103.083928, 1.023737702
%22026.46579, 1.008765749
%
% and H_BF achieves 7.796
 The values worth highlighting from this table are, first,
 the likelihood ratio when $\H_1$ uses the standard uniform prior,
 which is 1:0.61 in favour of the {\em null hypothesis\/} $\H_0$.
 Second, the  most favourable choice of $\a$, from the
 point of view of $\H_1$, can only 
 yield a likelihood ratio of about 2.3:1 in favour of
 $\H_1$.\label{sec.pvalue05}

 Be warned! A P-value of 0.05 is often interpreted
% gives the impression to many
 as implying 
 that the odds are stacked about twenty-to-one
 {\em against\/} the null hypothesis. But the truth in this case
 is that the evidence
 either slightly  {\em favours\/} the  null  hypothesis,
 or disfavours it by at most three to one, depending on
 the choice of prior.
\amarginfig{t}{
{\footnotesize
\begin{tabular}{r@{}l@{$\:\:\:$}r@{\hspace*{0.3in}}r@{}l}
\toprule
\multicolumn{2}{c}{$\alpha$}&
\multicolumn{3}{c}{$\displaystyle \frac{ P( D'|\H_1,\a  )}
                                        {P( D'|\H_0     )}$  }\\
\midrule
 &.37 & & &.32\\
1&.0  & & &.61\\
2&.7  & &1&.0\\
7&.4  & &1&.6\\
20&   & &2&.2\\
55&   & &2&.3\\
148&  & &1&.9\\
403&  & &1&.4\\
1096& & &1&.2\\
% from euro.dat
\bottomrule
\end{tabular}
}
\caption[a]{Likelihood ratio for various choices of
 the prior distribution's hyperparameter $\alpha$, when the data are
 $D'=141$ heads in 250 trials.
}
\label{fig.eurot141}
}
%

% P-values
 The \ind{P-value}s and `\ind{significance levels}' of \ind{classical statistics}\index{sermon!classical statistics}
 should be treated with {\em extreme caution}.\index{caution!sampling theory}
% This is the  last we will see of them in this book.
 Shun them!
 Here ends the sermon.\index{sermon!sampling theory}
% Classical statistics  and  Microsoft Windows 95 --
% two of the greatest evils to come out of the twentieth century.

}





\dvipsb{solutions bayes}
% \input{tex/_l1b.tex}
%
% message passing was here
%
\part{Data Compression} 
\prechapter{About    Chapter}
\fakesection{prerequisites for chapter 2}
%
 In this chapter we 
 discuss how to measure the information content of the outcome
 of a random experiment. 

 This chapter has some tough bits.
 If you find the mathematical details  hard,
% to follow,
 skim through them and keep going -- you'll be able to enjoy chapters
 \ref{ch3} and \ref{ch4} without this chapter's tools.

% of typicality.
\amarginfig{t}{%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Cast of characters}
\footnotesize
\begin{tabular}{@{}lp{1.14in}}
\multicolumn{2}{c}{
{\sf Notation}
}\\
\midrule
$x \in \A$  & $x$ is a {\dem{member}\/} of the \ind{set} $\A$ \\
$\S \subset \A$  & $\S$ is a {\dem\ind{subset}\/} of the set $\A$ \\
$\S \subseteq \A$  & $\S$ is a {\ind{subset}} of, or equal to, the set $\A$ \\
% \union
$\V = \B \cup \A$
       & $\V$ is the {\dem\ind{union}\/} of the sets $\B$ and $\A$ \\
$|\A|$ & number of elements in set $\A$\\

\bottomrule
\end{tabular} \medskip
% end marginstuff
}%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

 Before reading \chref{ch2}, you should have
read
% section \ref{ch1.secprob}
 \chref{ch1.secprob}
 and
 worked on
% \exerciseref{ex.expectn}.
% It  will also help if you have worked on
%
% do I need to ensure that {ex.Hadditive} occurs earlier? 
%
  \exerciseonlyrange{ex.expectn}{ex.Hineq} and \ref{ex.sumdice}
%  \exerciseonlyrangeshort{ex.sumdice}{ex.RNGaussian}
 \pagerange{ex.invP}{ex.sumdice},
% {ex.RNGaussian}.
% exercises \exnine-\exfourteen\ and \extwentyfive-\extwentyseven.
 and \exerciseonlyref{ex.weigh} below.

 The following
 exercise is intended to
 help you think about how to measure information content. 
% Please work on this exercise now.
% weighing
% ITPRNN Problem 1
%
% weighing problem
%
\fakesection{the weighing problem}
\exercis{ex.weigh}{
  -- {\em Please work on this problem before reading  chapter \chtwo.}

 \index{weighing problem}You are given 12 balls, all equal in weight except for
 one that is either heavier or lighter. You are also given a two-pan
 \ind{balance} to use.
% , which you are to use as few times as possible. 
 In each use of the balance you may put {any\/} number of the 12
 balls on the left pan, and the same number on the right pan, and push
 a button to initiate the weighing; there are three possible outcomes:
 either the weights are equal, or the balls on the left are heavier,
 or the balls on the left are lighter.  Your task is to design a
 strategy to determine which is the odd ball {\em and\/} whether it is
 heavier or lighter than the others {\em in as few uses of the balance
 as possible}.

% There will be a prize for the best answer.

 While thinking about this problem, 
 you
% should
 may find it helpful to 
 consider the following questions:
\ben
\item How can one measure {\dem\ind{information}}?
\item When you have identified the odd ball and whether it is heavy or 
	light, how much information have you gained?
\item Once  you have designed a strategy, draw a tree showing,
 for each of the possible   outcomes
 of a weighing, what weighing you perform next.
 At each node in the tree, how much information have the outcomes
 so far given you, and how much information remains to be
 gained?
% What is the probability of each of the possible outcomes of the first
% weighing?
%\item
% What is the most information you can get from a single weighing?
%	How much information do you get from a single weighing
% if the three outcomes are equally probable? 
%\item What is the smallest number of weighings that might conceivably 
%be sufficient  always to identify the odd ball and whether it is heavy
%or light?
\item How much information is gained when you learn (i) the state of a
 flipped coin; (ii) the states of two flipped coins; 
 (iii)  the outcome when a four-sided die is rolled?
\item
 How much information is gained on the first step of the weighing 
 problem if 6 balls are weighed against the other 6?  How much is gained
 if 4 are weighed against 4 on the first step, leaving out 4 balls?
% the other 4 aside?
\een
}
% 
% How many possible outcomes of an e weighing process are there? To put it another way, imagine that you report the outcome by sending a postcard  which says, for example, "ball number 5 is heavy", how many  prepare a postcard 
% 
% how many outcomes are there?
% How many possible states of the world are y
% if you tell someone ball number x is heavy, how much info have you given
% them? how much information can be conveyed by $k$ uses of the balance? 
% 
% 
% make clear that you can put any objects on the scales,
% don't have to weigh 6 vs 6.
% no cheating by gradually adding weights
% 
% katriona's problem: 4 bits, randomly rotated every time you ask them
% to be flipped.
% 
% hhhh llll gggg
% hhll  lhgg    lh
% if left is h then
% hh or l
% so do h vs h
% 
% else  gggg gggg ????
% -> ?? ?g
% -> hh l       or ggg -> wegh last dude (1 bit)
% do h vs h
% 
% if 13 and good avail, -  hhhhh llll* gggg
% hhll lhgg     hhl
% 






\mysetcounter{page}{76}
\chapter{The Source Coding Theorem}
\label{ch.two}\label{ch2}\label{chtwo}
\addtopic{3}{infotheory}
\addtopic{3}{probability}
\addtopic{2}{inference}
%\addtopic{3}{computation}
\addtrack{1}{inferencecourse}
\addtrack{3}{infotheorycourse}
\addtrack{3}{itprnncourse}
% _l2.tex 
% \part{Data Compression} 
% \chapter{The Source Coding Theorem}
%
% I introduce the idea of a  "name" (or label?) here, and should clarify
% (example 2.1)
%
% E = 13%, Q,Z = 0.1%
% TH = 3.7%
% 
%  New plan for this chapter: 
% \section{Key concept}
%  Rather than $H(\bp)$ being the measure of information content of
%  an ensemble, 
%  I want the central idea of this chapter to be that 
%  $\log 1/P(\bx)$ is the information content of a particular
%  outcome $\bx$. $H$ is then of interest because it is the average 
%  information content. 
% 
%  An example to illustrate this is `hunt the professor'. Or crack 
%  the combination. Guess the PIN. 
%  An absent-minded professor wishes to remember an 
%  integer between 1 and 256, that is, eight bits of information.
%  He takes 256 large numbered cardboard boxes, and climbs
%  in the  box whose number is the integer to be remembered.
%  The only way to find him 
%  is to open the lid of a box. A single experiment involves 
%  opening a particular box. The outcome is either $x={\tt n}$ -- no 
%  professor -- or $x={\tt y}$ -- the professor is in there. 
%  The probabilities are 
% \beq
% 	P(x\eq {\tt n}) = 255/256; P(x\eq {\tt y}) = 1/256.
% \eeq
%  We open box $n$.
%  If the professor is revealed, we have learned the integer, 
%  and thus recovered 8 bits of information. If he is not revealed, 
%  we have learned very little -- simply that the 
%  integer is not $n$. The information contents are:
% \beq
% 	h(x\eq 0) = \log_2( 256/255) = 0.0056 ; h(x\eq 1) = \log_2 256 = 8 .
% \eeq
%  The average information content is 
% \beq
% 	H(X) = 0.037 \bits .
% \eeq
%  This example shows that in the event of an improbable outcome's occuring, 
%  a large amount of information really is conveyed. 
% 
% \section{Weighing problem}
%  The weighing problem remains useful, let's keep it. 
% 
% \section{Source coding theorem}
%  Relate `information content' $\log 1/P$ to message length 
%  in two steps. First, establish the AEP, that 
%  the outcome from an ensemble $X^N$
%  is very likely to lie in a typical set having `information 
%  content' close to NH. 
% 
%  Second, show that we can count the number of elements in the 
%  typical set, give them all names, and the number of 
%  names will be about $2^{NH}$. 
% 
%  At what point should $H_{\delta}$ be introduced? 

\section{How to measure the information content of a random variable?}
 In the next few chapters, we'll be talking about probability
 distributions and random variables. Most of the time
 we can get by with sloppy notation, but occasionally, we will need
 precise notation. Here is the
%definition and
 notation that we established in chapter \ref{ch.prob.ent}.\indexs{ensemble}
%
\sloppy
\begin{description} 
\item[An ensemble] $X$ is a triple $(x,\A_X, \P_X)$,
  where the {\dem outcome\/} $x$ is the value of a random variable,
% whose value $x$ can take on a
 which   takes on one of a 
	set of possible values,
% the alphabet
% {\em outcomes}, 
	$\A_X \eq  \{a_1,a_2,\ldots,a_i,\ldots, a_I\}$,
%	\ie, possible values for a random variable $x$
%	and a probability distribution over them, 
	having probabilities
	$\P_X \eq \{p_1,p_2,\ldots, p_I\}$, with $P(x\eq a_i) = p_i$, 
	$p_i  \geq 0$ and $\sum_{a_i \in \A_X} P(x \eq a_i) = 1$.
\end{description}

%\begin{description}
%\item[An ensemble] $X$ is a random variable $x$ taking on a value
% from a 	set of possible {\em outcomes}, 
%	$$\A_X \eq  \{a_1,\ldots,a_I\},$$ 
%	having probabilities
%	$$\P_X = \{p_1,\ldots, p_I\},$$ with $P(x\eq a_i) = p_i$, 
%	$p_i  \geq 0$ and $\sum_{x \in \A_X} P(x) = 1$.
%\end{description}
% An ensemble is a set of possible values for a random variable
%	and a probability distribution over them.
{How can we measure the information content of an outcome
 $x = a_i$ from such an ensemble?}
 In this chapter we examine the assertions 
\ben
\item
 that  the
% It is claimed that the 
 {\dem{\ind{Shannon} information content}},\index{information content!how to measure}
\beq
	h(x\eq a_i) \equiv \log_2 \frac{1}{p_i},
\eeq
 is a sensible measure of the information content of the outcome 
 $x = a_i$, and
\item
 that 
 the {\dem{\ind{entropy}}} of the ensemble,
\beq 
	H(X) = \sum_i p_i \log_2 \frac{1}{p_i},
\eeq
 is a sensible measure of the ensemble's average information content.
\een
\begin{figure}[htbp]
\figuremargin{%1
{\small%
\begin{center}
\mbox{
\mbox{
\hspace{-9mm}
\mbox{\psfig{figure=figs/h.ps,%
width=42mm,angle=-90}}$p$
\hspace{-35mm}
\makebox[0in][l]{\raisebox{\hpheight}{$h(p)=  \log_2 \displaystyle \frac{1}{p}$ }}
\hspace{35mm}
}
\hspace{0.9mm}
\begin{tabular}[b]{ccc}\toprule
$p$ & $h(p)$ & $H_2(p)$ \\ \midrule
0.001             & 10.0            & 0.011 \\ %  9.96578 & 0.0114078
0.01\phantom{0}   & \phantom{1}6.6  & 0.081 \\
0.1\phantom{01}   & \phantom{1}3.3  & 0.47\phantom{1} \\
0.2\phantom{01}   & \phantom{1}2.3  & 0.72\phantom{1} \\
0.5\phantom{01}   & \phantom{1}1.0  & 1.0\phantom{01} \\ \bottomrule
\end{tabular}
\mbox{
% to put H at left: \hspace{1.2mm}
\hspace{6.2mm}
\raisebox{\hpheight}{$H_2(p)$}
% to put H at left: \hspace{-7.5mm}
\hspace{-20mm}
\mbox{\psfig{figure=figs/H2.ps,%
width=42mm,angle=-90}}$p$
}
% see also H2x.tex

\end{center}
}% end small
}{%
\caption[a]{The Shannon information content $h(p) =  \log_2 \frac{1}{p}$ and 
 the binary entropy function $H_2(p)=H(p,1\!-\!p)=p \log_2 \frac{1}{p}
  + (1-p)\log_2 \frac{1}{(1-p)}$ as functions of $p$.}
\label{fig.h2}
}%
\end{figure}
% gnuplot 
% load 'figs/l2.gnu'

\noindent
 \Figref{fig.h2} shows the Shannon information content 
 of an outcome with probability $p$, as a function of $p$.
 The less probable an outcome is, the greater its
 Shannon information content. 
 \Figref{fig.h2} also shows 
% $h(p) =  \log_2 \frac{1}{p}$,
 the binary entropy function, 
\beq
 H_2(p)=H(p,1\!-\!p)=p \log_2 \frac{1}{p}
  + (1-p)\log_2 \frac{1}{(1-p)} ,
\eeq
 which is the entropy   of the ensemble $X$ whose alphabet and probability
 distribution are 
 $\A_X = \{ a , b \}, \P_X = \{ p , (1-p) \}$.
%

\subsection{Information content of independent random variables}
 Why should $\log 1/p_i$ have anything to do with the
 information content? Why not some other function of $p_i$?
 We'll explore this question in  detail shortly,
 but first, notice a nice property of this particular function
 $h(x)=\log 1/p(x)$.

 Imagine learning the value of two {\em independent\/} random
 variables, $x$ and $y$.
 The definition of independence is that the probability
 distribution is separable into a {\em product}:
\beq
	P(x,y) = P(x) P(y) .
\eeq
 Intuitively, we might want any measure of
 the `amount of information gained' to have the property of
 {\em additivity} --
 that is,
 for independent random variables $x$ and $y$, 
 the information gained when we learn $x$ and $y$ should 
 equal  the sum of  the information gained if $x$ alone were learned
 and  the information gained if $y$ alone were learned.

 The Shannon information content of the outcome $x,y$ is
\beq
	h(x,y) = \log \frac{1}{P(x,y)}
	= \log \frac{1}{P(x)P(y)} 
	= \log \frac{1}{P(x)} 
	+ \log \frac{1}{P(y)} 
\eeq
 so it does indeed satisfy
\beq
	h(x,y) =  h(x) + h(y), \:\:\mbox{if $x$ and $y$ are independent.}
\eeq
\exercisxA{1}{ex.Hadditive}{
	Show that, if $x$ and $y$ are independent,
	the entropy of the outcome $x,y$
	satisfies
\beq
	H(X,Y) = H(X) + H(Y) .
\eeq
 In words, entropy is additive for independent variables.
}

 We now  explore these ideas with some examples;
 then, in section \ref{sec.aep} and in chapters \ref{ch3}
 and \ref{ch4}, we  prove that 
 the Shannon information content and the entropy  are 
 related to the number of bits  needed to describe 
 the  outcome of an experiment.

% \section{Thinking about information content}
% \subsection{Ensembles with maximum average information content}
%  The first property of the entropy that we will 
%  consider is the property that you proved when you solved
%  \exerciseref{ex.Hineq}: the entropy of an ensemble 
%  $X$ is biggest if  all the outcomes 
%  have equal probability $p_i \eq  1/|X|$.
% 
%  If entropy  measures the average information content
%  of an ensemble, then this idea of equiprobable outcomes
%  should have relevance for the design of efficient experiments.

\subsection{The weighing problem: designing informative experiments}
 Have you solved the \ind{weighing problem} \exercisebref{ex.weigh}\
 yet? Are you sure? Notice that in three uses of the balance --
 which  reads either `left heavier', `right heavier', or `balanced' --
 the number 
 of conceivable outcomes is $3^3=27$, whereas the number of possible 
 states of the world is 24: the odd ball could be any of twelve balls, 
 and it could be heavy or light. So in principle, the problem might be 
 solvable in three weighings -- but not in two, since $3^2 < 24$.  

 If you know how you 
 {can} determine the odd weight {\em and\/} whether  it is heavy or 
 light in {\em three\/} weighings, then you may read on.
 If you haven't found a strategy that always gets there in three weighings,
 I encourage you to think about  \exerciseonlyref{ex.weigh}  some more.
% {ex.weigh}

% \subsection{Information from experiments}
 Why is your strategy optimal? What is it about your series of weighings
 that allows useful information to be gained as quickly as possible?
\begin{figure}%[htbp]
\fullwidthfigureright{%
% included by l2.tex
%
% shows weighing trees, ternary
%
% decisions of what to weigh are shown in square boxes with 126 over 345 (l:r)
% state of valid hypotheses are listed in double boxes
% three arrows, up means left heavy,  straight means right heavy, down is balance
% actually s and d boxes end up having the same defn.
%
\setlength{\unitlength}{0.56mm}% page width is 160mm % was 6mm
\begin{center}
\small
\begin{picture}(260,260)(-50,-130)
%
%   initial state 
%
% all 24 hypotheses
\mydbox{-50,-100}{15,200}{$1^+$\\$2^+$\\$3^+$\\$4^+$\\$5^+$\\$6^+$\\$7^+$\\
$8^+$\\$9^+$\\$10^+$\\$11^+$\\$12^+$\\$1^-$\\$2^-$\\$3^-$\\$4^-$\\
$5^-$\\$6^-$\\$7^-$\\$8^-$\\$9^-$\\$10^-$\\$11^-$\\$12^-$}
\mysbox{-30,-8}{25,16}{$\displaystyle\frac{1\,2\,3\,4}{5\,6\,7\,8}$}
\put(-30,10){\makebox(25,8){weigh}}
%
% 1st arrows
%
\mythreevector{0,0}{1}{3}{30}
%
% first three boxes of hypotheses % boxes of actions 
% #1 is bottom left corner, so has to be offset by height of box
% #2 is dimensions of box
%
% each digit is about 10 high
%
\mydbox{40,55}{15,70}{$1^+$\\$2^+$\\$3^+$\\$4^+$\\$5^-$\\$6^-$\\$7^-$\\$8^-$}
\mysbox{65,82}{25,16}{$\displaystyle\frac{1\,2\,6}{3\,4\,5}$}
\put(65,100){\makebox(25,8){weigh}}
\mydbox{40,-35}{15,70}{$1^-$\\$2^-$\\$3^-$\\$4^-$\\$5^+$\\$6^+$\\$7^+$\\$8^+$}
\mysbox{65,-8}{25,16}{$\displaystyle\frac{1\,2\,6}{3\,4\,5}$}
\put(65,10){\makebox(25,8){weigh}}
\mydbox{40,-125}{15,70}{$9^+$\\$10^+$\\$11^+$\\$12^+$\\$9^-$\\$10^-$\\$11^-$\\$12^-$}
\mysbox{65,-98}{25,16}{$\displaystyle\frac{9\,10\,11}{1\,2\,3}$}
\put(65,-80){\makebox(25,8){weigh}}
%
%    2nd arrows 
%
\mythreevector{95,90}{1}{2}{15}
\mythreevector{95,0}{1}{2}{15}
\mythreevector{95,-90}{1}{2}{15}
% nine intermediate states. top ones
\mydbox{115,113}{35,14}{$1^+2^+5^-$}
\mysbox{155,112}{25,16}{$\displaystyle\frac{1}{2}$}
\mydbox{115,83}{35,14}{$3^+4^+6^-$}
\mysbox{155,82}{25,16}{$\displaystyle\frac{3}{4}$}
\mydbox{115,53}{35,14}{$7^-8^-$}
\mysbox{155,52}{25,16}{$\displaystyle\frac{1}{7}$}
% nine intermediate states. mid ones
\mydbox{115,23}{35,14}{$6^+3^-4^-$}
\mysbox{155,22}{25,16}{$\displaystyle\frac{3}{4}$}
\mydbox{115,-7}{35,14}{$1^-2^-5^+$}
\mysbox{155,-8}{25,16}{$\displaystyle\frac{1}{2}$}
\mydbox{115,-37}{35,14}{$7^+8^+$}
\mysbox{155,-38}{25,16}{$\displaystyle\frac{7}{1}$}
% nine intermediate states. bot ones
\mydbox{115,-67}{35,14}{$9^+10^+11^+$}
\mysbox{155,-68}{25,16}{$\displaystyle\frac{9}{10}$}
\mydbox{115,-97}{35,14}{$9^-10^-11^-$}
\mysbox{155,-98}{25,16}{$\displaystyle\frac{9}{10}$}
\mydbox{115,-127}{35,14}{$12^+12^-$}
\mysbox{155,-128}{25,16}{$\displaystyle\frac{12}{1}$}
% 3rd arrows mainline
\mythreevector{185,60}{1}{1}{10}
\mythreevector{185,0}{1}{1}{10}
\mythreevector{185,-60}{1}{1}{10}
% other branch lines
\mythreevector{185,120}{1}{1}{10}
\mythreevector{185,90}{1}{1}{10}
\mythreevector{185,30}{1}{1}{10}
\mythreevector{185,-30}{1}{1}{10}
\mythreevector{185,-90}{1}{1}{10}
\mythreevector{185,-120}{1}{1}{10}
% final answers aligned at 200,x*10
\mydbox{200,126}{10,8}{$1^+$}
\mydbox{200,116}{10,8}{$2^+$}
\mydbox{200,106}{10,8}{$5^-$}
\mydbox{200,96}{10,8}{$3^+$}
\mydbox{200,86}{10,8}{$4^+$}
\mydbox{200,76}{10,8}{$6^-$}
\mydbox{200,66}{10,8}{$7^-$}
\mydbox{200,56}{10,8}{$\star$}% ---------- impossible outcome (1 vs 7: right pan cannot be heavier)
\mydbox{200,46}{10,8}{$8^-$}% balanced, so the odd ball is 8
\mydbox{200,36}{10,8}{$4^-$}
\mydbox{200,26}{10,8}{$3^-$}
\mydbox{200,16}{10,8}{$6^+$}
\mydbox{200,6}{10,8}{$2^-$}
\mydbox{200,-4}{10,8}{$1^-$}% the middle, 0
\mydbox{200,-14}{10,8}{$5^+$}
\mydbox{200,-24}{10,8}{$7^+$}
\mydbox{200,-34}{10,8}{$\star$}% impossible outcome (7 vs 1: right pan cannot be heavier)
\mydbox{200,-44}{10,8}{$8^+$}% balanced, so the odd ball is 8
\mydbox{200,-54}{10,8}{$9^+$}
\mydbox{200,-64}{10,8}{$10^+$}
\mydbox{200,-74}{10,8}{$11^+$}
\mydbox{200,-84}{10,8}{$10^-$}
\mydbox{200,-94}{10,8}{$9^-$}
\mydbox{200,-104}{10,8}{$11^-$}
\mydbox{200,-114}{10,8}{$12^+$}
\mydbox{200,-124}{10,8}{$12^-$}
\mydbox{200,-134}{10,8}{$\star$}
\end{picture}
\end{center}

}{%
\caption[a]{An optimal solution to the weighing problem. 
%
 At each step there are two boxes: the left box  shows which hypotheses are still
 possible; the right box shows the balls involved in  the next weighing.
 The 24 hypotheses are written $1^+,
% 2^+,\ldots,1^-,
 \ldots, 12^-$, 
 with, \eg, $1^+$ denoting that 1 is the odd ball and
 it is heavy.  
 Weighings are written by listing the names of the balls on the 
 two pans, separated by a line; for example, in the first weighing,
% $\displaystyle\frac{1\,2\,3\,4}{5\,6\,7\,8}$ denotes that
 balls 1,
 2, 3, and 4 are put on the left hand side and 5, 6, 7 and 8 on the
 right.
 In each triplet of arrows the upper arrow leads to the situation when 
 the left side is heavier, the middle arrow to the situation  when the right side is heavier, 
 and the lower arrow  to the situation when the outcome is balanced.
 The three points labelled $\star$
% arrows without subsequent boxes at the right hand side
 correspond to impossible outcomes.
%The total number of outcomes
% of the weighing process is 24, which equals $3^3 - 3$, so we would expect
% this ternary tree of depth three to have three spare branches.
}
\label{fig.weighing}
}%
\end{figure}
 The answer is that at each step of an optimal 
 procedure, the three outcomes (`left heavier', `right heavier', and `balance')
 are {\em as close as possible to equiprobable}. 
 An optimal solution is shown in \figref{fig.weighing}. 
 
 Suboptimal strategies, such as weighing balls 1--6 against 7--12
 on the first step, do not achieve all outcomes with equal probability:
 these two sets of balls can never  balance, so the only possible
 outcomes are `left heavy' and `right heavy'.
% Similarly, strategies
% that after an unbalanced initial result
% do not mix together balls that might be heavy with balls that 
% might be light are incapable of giving one of the three outcomes.
 Such a binary outcome only rules out half of the possible
 hypotheses, so a  strategy that uses such outcomes must sometimes
 take longer to find the right answer.
% Some suboptimal strategies produce binary trees rather than ternary trees like 
% the one in \figref{fig.weighing}, and binary trees 
% are  necessarily deeper than balanced ternary trees
% with the same number  of leaves. 

 The insight that the outcomes should be as near as possible
 to equiprobable makes 
 it easier to search for an optimal strategy. The first weighing 
 must divide the 24 possible hypotheses into three groups of eight. Then 
 the second weighing must be chosen so that there is a 3:3:2
 split of the hypotheses. 

 Thus we might conclude:
\begin{quote}
{\em An outcome of a random experiment is guaranteed to be  most informative
 if the probability distribution over outcomes is uniform.}
\end{quote}
 This conclusion agrees with 
 the  property of the entropy that you proved when you solved
 \exerciseref{ex.Hineq}: the entropy of an ensemble 
 $X$ is biggest if  all the outcomes 
 have equal probability $p_i \eq  1/|\A_X|$.

\subsection{Guessing games}
  In the game of \ind{twenty questions}, one player thinks of
  an object, and the other player attempts to guess what the object is
  by asking questions that have yes/no answers, for example,  
  `is it alive?', or `is it human?'
 The aim is to identify the object with as few questions
  as possible.
  What is the best strategy for playing this game?
  For simplicity, imagine that we are playing the rather dull 
  version of twenty questions called `sixty--three'.
% % two hundred and fifty five'.
%  In this game, the permitted objects are the $2^6$ integers 
%  $\A_X = \{ 0 , 1 , 2 , \dots 63 \}$.
%  One player selects an $x \in \A_X$, and we ask 
%  questions  that have yes/no answers in order to identify $x$. 

\exampl{example.sixtythree}{ {\sf The game `sixty--three'}.
 What's the smallest number of   yes/no  questions needed 
 to identify an integer $x$ between 0 and 63?\index{twenty questions}
}
 Intuitively,
 the best questions successively divide 
 the 64 possibilities into equal sized sets.
Six questions suffice.
 One reasonable strategy asks the following questions: 
%
% want a computer program environment here.
%
\begin{quote}
\begin{tabbing}
 {\sf 1:} is $x \geq 32$? \\
 {\sf 2:} is $x \mod 32 \geq 16$? \\
 {\sf 3:} is $x \mod 16 \geq 8$? \\
 {\sf 4:} is $x \mod 8 \geq 4$? \\
 {\sf 5:} is $x \mod 4 \geq 2$? \\
 {\sf 6:} is $x \mod 2 = 1$? 
\end{tabbing}
\end{quote}
%
% I'd like to put this in a comment column on the right beside the 'code':
%
 [The notation $x \mod 32$, pronounced `$x$ modulo 32', denotes the remainder
 when $x$ is divided by 32; for example, $35 \mod 32 = 3$
 and $32 \mod 32 = 0$.]

 The answers to these questions, if translated 
 from  $\{\mbox{yes},\mbox{no}\}$
 to $\{{\tt{1}},{\tt{0}}\}$, 
 give the binary expansion of $x$, for example 
 $35 \Rightarrow {\tt{100011}}$. 

 What are the 
 Shannon information contents of the outcomes in this example?  
 If we assume that all values of $x$ are equally likely, then the
 answers to the questions are independent  and each has 
% entropy $H_2(0.5) = 1 \ubit$. The
 Shannon information content
% of each answer is
 $\log_2 (1/0.5)
 =  1 \ubit$;  the total Shannon information gained 
 is always six bits. Furthermore, the number  $x$ that we learn from 
 these questions is a six--bit binary number. Our questioning 
 strategy defines a way of encoding the random variable $x$
 as a binary file.

 So far, the  Shannon information content  makes sense:
 it measures the length of a binary file that encodes
 $x$. 
%
 However, we have not yet studied ensembles where the 
 outcomes have unequal probabilities. Does the 
 Shannon information content make sense there too?

\fakesection{Submarine figure}
%
% \subgrid: rules the 8-by-8 board of the submarine figure inside a picture:
% nine vertical and nine horizontal lines, each 80 units long and 10 units apart.
% Line ends are commented out so that no spaces are introduced.
\newcommand{\subgrid}{%
\multiput(0,0)(10,0){9}{\line(0,1){80}}%
\multiput(0,0)(0,10){9}{\line(1,0){80}}%
}
% \sublabels: coordinate labels for the board drawn by \subgrid.
% Column numbers 1--8 run left to right beneath the grid (y = -5);
% row letters A--H run top to bottom to the left of the grid (x = -5).
% Each label is centred on its cell (cells are 10 units wide).
\newcommand{\sublabels}{
% column numbers, left to right
\put( 5,-5){\makebox(0,0){\tiny{1}}}
\put(15,-5){\makebox(0,0){\tiny{2}}}
\put(25,-5){\makebox(0,0){\tiny{3}}}
\put(35,-5){\makebox(0,0){\tiny{4}}}
\put(45,-5){\makebox(0,0){\tiny{5}}}
\put(55,-5){\makebox(0,0){\tiny{6}}}
\put(65,-5){\makebox(0,0){\tiny{7}}}
\put(75,-5){\makebox(0,0){\tiny{8}}}
% row letters, bottom to top
\put(-5, 5){\makebox(0,0){\sf\tiny{H}}}
\put(-5,15){\makebox(0,0){\sf\tiny{G}}}
\put(-5,25){\makebox(0,0){\sf\tiny{F}}}
\put(-5,35){\makebox(0,0){\sf\tiny{E}}}
\put(-5,45){\makebox(0,0){\sf\tiny{D}}}
\put(-5,55){\makebox(0,0){\sf\tiny{C}}}
\put(-5,65){\makebox(0,0){\sf\tiny{B}}}
\put(-5,75){\makebox(0,0){\sf\tiny{A}}}
}
% \misssixteen: crosses added on top of \missthirtytwo in the last two panels
% of the submarine figure.  Seventeen marks are placed; the one at (25,15)
% coincides with the first shot (G3), which those panels also draw themselves.
\newcommand{\misssixteen}{
% column 5: rows B and D only
\put(45,65){\makebox(0,0){$\times$}}
\put(45,45){\makebox(0,0){$\times$}}
% column 4: all eight rows, H up to A
\multiput(35,5)(0,10){8}{\makebox(0,0){$\times$}}
% column 3: rows G up to A (H3 is where the submarine is hit)
\multiput(25,15)(0,10){7}{\makebox(0,0){$\times$}}
}
% \missthirtytwo: crosses for the squares fired at by the time of the
% 32-shot panel of the submarine figure.  Thirty-one marks are placed here;
% the panels draw the first shot, G3 at (25,15), themselves.
\newcommand{\missthirtytwo}{
% columns 6, 7 and 8: every row, H up to A
\multiput(55,5)(0,10){8}{\makebox(0,0){$\times$}}
\multiput(65,5)(0,10){8}{\makebox(0,0){$\times$}}
\multiput(75,5)(0,10){8}{\makebox(0,0){$\times$}}
% column 5: rows H--E, then C and A
% (B5 at (45,65) and D5 at (45,45) are deliberately left out here;
%  \misssixteen draws them)
\multiput(45,5)(0,10){4}{\makebox(0,0){$\times$}}
\put(45,55){\makebox(0,0){$\times$}}
\put(45,75){\makebox(0,0){$\times$}}
% B1, the second shot
\put(5,65){\makebox(0,0){$\times$}}
}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%% submarine figure %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{figure}
\figuredangle{%
\begin{center}
%\begin{tabular}{l@{\hspace{-1mm}}*{5}{@{\hspace{2pt}}c}} \toprule
\begin{tabular}{l@{\hspace{0mm}}*{5}{@{\hspace{8.5mm}}c}} \toprule
% moves made &  1 & 2 & 32 & 48 & 49 \\
 &
%
% 1 miss
%
% this fig actually needs extra width on left, but there is nothing there.
\setlength{\unitlength}{0.26mm} 
\begin{picture}(80,95)(0,-10)\subgrid\sublabels
\put(25,15){\makebox(0,0){$\times$}}
\put(25,15){\circle{15}}
\end{picture} 
 &
%
% 2 miss
%
\setlength{\unitlength}{0.26mm} 
\begin{picture}(80,95)(0,-10)\subgrid
\put(25,15){\makebox(0,0){$\times$}}
\put(5,65){\makebox(0,0){$\times$}}
\put(5,65){\circle{15}}
\end{picture} 
 &
%
% 32 miss
%
\setlength{\unitlength}{0.26mm} 
\begin{picture}(80,95)(0,-10)\subgrid
\put(25,15){\makebox(0,0){$\times$}}
\put(45,35){\circle{15}}
\missthirtytwo
\end{picture} 
 &
%
% 48 miss (the 49th shot, the hit, is the final panel below)
%
\setlength{\unitlength}{0.26mm} 
\begin{picture}(80,95)(0,-10)\subgrid
\put(25,15){\makebox(0,0){$\times$}}
\put(5,65){\makebox(0,0){$\times$}}
\missthirtytwo
\misssixteen
\put(25,25){\circle{15}}
\end{picture} 
&
\setlength{\unitlength}{0.26mm} 
\begin{picture}(80,95)(0,-10)\subgrid
\put(25,15){\makebox(0,0){$\times$}}
\put(5,65){\makebox(0,0){$\times$}}
\missthirtytwo
\misssixteen
%%%%%%%%%%%%%%%%%%%%%%% hit the submarine: 
\put(25,5){\circle{15}}
\put(25,5){\makebox(0,0){\tiny\bf S}}
\end{picture} 
 \\
move \# &  1 & 2 & 32 & 48 & 49 \\
question
& G3
& B1
& E5 
& F3
& H3 \\
 outcome     
&  $x = {\tt n}$ % $(\times)$
&  $x = {\tt n}$ %$(\times)$
&  $x = {\tt n}$ %$(\times)$
&  $x = {\tt n}$ %$(\times)$
&  $x = {\tt y}$ %({\small\bf S})
 \\[0.1in]
 $P(x)$ 
& 	$\displaystyle\frac{63}{64}$  
& 	$\displaystyle\frac{62}{63}$  
& 	$\displaystyle\frac{32}{33}$  
& 	$\displaystyle\frac{16}{17}$  
& 	$\displaystyle\frac{1}{16}$  
 \\[0.15in]
 $h(x)$ 
& 	 0.0227
& 	 0.0230
& 	 0.0443 
% & 	 0.0430 -------- 0.9556 , just before 32 are pasted
& 	 0.0874
& 	 4.0 
 \\[0.05in]
 Total info.
& 	 0.0227
&  0.0458
&  1.0
&  2.0
&  6.0
 \\  \bottomrule
\end{tabular}
\end{center}
}{%
\caption[a]{A game of submarine. The submarine is hit on the 49th attempt.}
\label{fig.sub}
}%
\end{figure}

\subsection{The game of {\ind{submarine}}: how many bits can one bit convey?}
 In the game of {\ind{battleships}}, each player hides a fleet of 
 ships in a sea represented by a square grid. On each 
 turn, one player
 attempts to hit the other's ships by firing at one square
 in the opponent's sea. The response to a selected square such 
 as `G3' is either `miss', `hit', or `hit and destroyed'.

 In a
% rather
 boring version of battleships called {\tt submarine}, 
 each player hides just one submarine in one square of 
 an eight-by-eight grid. 
 \Figref{fig.sub} shows a few pictures of  this game in progress:
 the circle represents the square that is being fired at, and the
 $\times$s show squares in which the outcome was $x={\tt{n}}$; the
 submarine is hit (outcome $x={\tt{y}}$ shown by
 the symbol $\bs$) on the 49th attempt.
 
 Each shot made by a player defines an ensemble. The 
 two possible outcomes are $\{  {\tt{y}} ,{\tt{n}}\}$, 
 corresponding to a hit and a miss, and their probabilities
 depend on the state of the board. 
 At the beginning, $P({\tt{y}}) = \linefrac{1}{64}$ and 
 $P({\tt{n}}) = \linefrac{63}{64}$. 
 At the second shot, if the first shot missed,
% enemy sub has not yet been hit, 
 $P({\tt{y}}) = \linefrac{1}{63}$ and $P({\tt{n}}) = \linefrac{62}{63}$. 
 At the third shot, if the first two shots missed,
% enemy submarine has not yet been hit, 
 $P({\tt{y}}) = \linefrac{1}{62}$ and $P({\tt{n}}) = \linefrac{61}{62}$. 

% According to the Shannon information content, t
 The  Shannon information
 gained from an outcome $x$ is $h(x) = \log (1/P(x))$.
% Let's investigate this assertion.
 If we are lucky, and hit the submarine on the first shot, then 
\beq
	h(x) = h_{(1)}({\tt y}) = \log_2 64 = 6 \ubits .
\eeq
 Now, it might seem a little strange that
 one binary outcome can convey six bits.
% , but it does make sense. W
 But we have learnt the hiding place,
% where the submarine was,
 which 
 could have been any of 64 squares; so we have, by one lucky 
 binary question, indeed learnt six bits. 

 What if the first shot misses?  The Shannon information that we gain from this outcome
 is
\beq
	h(x) = h_{(1)}({\tt n}) = \log_2 \frac{64}{63} = 0.0227 \ubits .
\eeq
 Does this make sense? It is not so obvious. Let's keep going.
 If our second shot also misses, the Shannon information 
 content of the second outcome is
\beq
	 h_{(2)}({\tt n}) = \log_2 \frac{63}{62} = 0.0230 \ubits .
\eeq
 If we miss thirty-two times (firing at a new square each time), 
 the total Shannon information gained is
\beqan
%\hspace*{-0.2in}
\lefteqn{ \log_2 \frac{64}{63} + \log_2 \frac{63}{62} + \cdots +
	\log_2 \frac{33}{32} } \nonumber \\
&	\!\!\!=\!\!\! &  0.0227  +  0.0230 + \cdots + 0.0443  \:\:=\:\:
  1.0 \ubits .
\eeqan
 Why this round number? Well, what have we learnt? We now know
 that the submarine is not in any of the 32 squares we fired at; 
 learning that fact is just like playing a game of \sixtythree\ 
 (\pref{example.sixtythree}),
 asking as our first question `is $x$  one of the 
 thirty--two numbers corresponding to these squares I fired at?',
 and receiving the answer `no'.  This answer rules out half of the 
 hypotheses, so it gives us one bit.
%It doesn't matter what the 
% outcome might have been; all that matters is the probability 
% of what actually happened.

 After 48 unsuccessful shots, the information 
 gained is 2 bits: the unknown location has been narrowed down to
 one quarter of the original hypothesis space.

 What if we hit the submarine on the 49th shot, when there 
 were 16 squares left? 
 The Shannon information content of this outcome is
\beq
		 h_{(49)}({\tt y}) = \log_2 16 = 4.0 \ubits .
\eeq
 The total Shannon information content of all the outcomes is
\beqan
\lefteqn{	\log_2 \frac{64}{63} + \log_2 \frac{63}{62} + \cdots +
%	\log_2 \frac{33}{32} + \cdots +
	\log_2 \frac{17}{16} + 
	\log_2 \frac{16}{1} }
  \nonumber \\
	&=&  0.0227  +  0.0230 + \cdots
% + 0.0430 + \cdots 
		+ 0.0874 + 4.0 \:\: =\:\: 6.0 \ubits .
\label{eq.sum.me}
\eeqan
 So once we know where the submarine is, the total Shannon information 
 content gained is 6 bits.

 This result holds regardless of when  
 we hit the submarine. If we hit it when there are  $n$ squares 
 left to choose from --   $n$ was 16 in 
 \eqref{eq.sum.me} -- then the total information gained
 is: 
\beqan
\lefteqn{	\log_2 \frac{64}{63} + \log_2 \frac{63}{62} + \cdots +
	\log_2 \frac{n+1}{n} + 
	\log_2 \frac{n}{1} } \nonumber \\
&=& \log_2 \left[
	\frac{64}{63} \times \frac{63}{62} \times \cdots
                   \times \frac{n+1}{n} \times \frac{n}{1} \right]
%\times 63 \times \cdots \times (n+1) \times n}
%		{63 \times 62 \times \cdots \times n \times 1} 
	\:\:=\:\: \log_2 \frac{64}{1}\:\: =\:\: 6 \,\bits.
\eeqan

%
% add winglish here?
%
% follows in lecture 2, after submarine
%
% aim: introduce the language of Wenglish
% and demonstrate Shannon info content.

 What have we learned from the examples so far?
 I think the {\tt submarine} example makes quite a convincing
 case for the claim that the Shannon information content
 is a sensible measure of information content.
 And the game of {\tt sixty-three} shows that
 the Shannon information content can be  intimately connected
 to the size of a file that encodes the outcomes of
 a random experiment, thus suggesting a possible connection to
 data compression.

 In case you're not convinced, let's look at one more example.

 

\subsection{The \Wenglish\ language}
\label{sec.wenglish}
% [this section under construction]}
 {\dem{\ind{\Wenglish}}} is a  language similar to \ind{English}.
 \Wenglish\ sentences consist of words drawn at random from the
 \Wenglish\ dictionary, which contains $2^{15}=32{,}768$ words, all of length 5
 characters. Each word in the \Wenglish\ dictionary was constructed
% by the \Wenglish\  language committee, who created each of those 32,768 words
 at random by picking five letters from the
 probability distribution over {\tt a$\ldots$z} depicted
 in \figref{fig.monogram}.
% Since all words are five characters long

%\begin{figure}
%\figuremargin{
\marginfig{\small
\begin{center}
\begin{tabular}{rc} \toprule
%  & Word \\ \midrule
1 & {\tt{aaail}} \\
2 & {\tt{aaaiu}} \\
3   & {\tt{aaald}} \\
    & $\vdots$ \\
129 & {\tt{abati}} \\
    & $\vdots$ \\
2047 & {\tt{azpan}} \\
2048 & {\tt{aztdn}} \\
    & $\vdots$ \\
    & $\vdots$ \\
 16384   & {\tt{odrcr}} \\
    & $\vdots$ \\
    & $\vdots$ \\
 32737 & {\tt{zatnt}} \\
    & $\vdots$ \\
 32768 & {\tt{zxast}} \\ \bottomrule
\end{tabular}
\end{center}
%}{
\caption[a]{The \Wenglish\ dictionary.}
\label{fig.wenglish}
}
%\end{figure}
% 5366+1219+2602+2718+8377+1785+1280+3058+5903+70+800+3431+2319+5470+6526+1896+539+4660+5453+6767+3108+652+1388+765+1564+78
% 77794
 Some entries from the dictionary are shown in
  alphabetical order in \figref{fig.wenglish}.
 Notice that the number of words in the \ind{dictionary}
 (32,768)
 is much smaller than the total number of possible  words of length 5 letters,
 $26^5 \simeq 12{,}000{,}000$.

 Because the probability of the letter {{\tt{z}}} is about $1/1000$,
 only 32 of the words in the dictionary begin with the letter {\tt z}.
 In contrast,  the probability of the letter {{\tt{a}}} is about $0.0625$,
 and 2048 of the words begin  with the letter {\tt a}. Of those 2048 words,
 two start {\tt az}, and 128 start {\tt aa}.

 Let's imagine that we are reading a \Wenglish\ document, and let's discuss
 the Shannon \ind{information content} of the characters as we acquire them.
 If we are given the text one word at a time, the Shannon information
 content of each five-character word is $\log 32768 = 15$ bits,
 since \Wenglish\ uses all its words with equal probability. The
 average information content per character is 3 bits.

 Now let's look at the information content if we read the document
 one character at a time.
 If, say, the first letter of a word is {\tt a}, the Shannon information
 content is
 $\log 1/ 0.0625 \simeq 4$ bits.
 If the first letter is {\tt z}, the Shannon information content
 is $\log 1/0.001 \simeq 10$ bits.
 The information content is thus highly variable
 at the first character. The total information
 content  of the 5 characters in a word, however,
 is exactly 15 bits; so the letters that
 follow an initial {\tt{z}} have lower average  information content
 per character than the letters that follow an initial {\tt{a}}.
 A rare initial letter such as {\tt{z}} indeed conveys
 more information about what the word is
 than a common initial letter.


 Similarly, in English, if  rare characters occur at the start of the word (\eg, {\tt{xyl}\ldots}),
 then often we can identify the whole word immediately; whereas
 words that start with common characters (\eg, {\tt{pro}\ldots}) require more characters
 before we can identify them.

% Does this make sense? Well, in English,
% the first few characters of a word do very often fully identify the whole word.
%
% {\em MORE HERE........}
 







\section{Data compression}
 \index{data compression}\index{source code}The
 preceding examples justify the idea that the Shannon \ind{information 
 content} of an outcome is a natural  measure of its
 \ind{information content}.  Improbable outcomes
 do convey more information than probable outcomes.
 We now discuss the  information content 
 of a source by considering how many bits are needed to describe 
 the  outcome of an experiment.
% , that is, by studying {data compression}. 

 If we can show that we can  compress data from a particular source 
 into a file of $L$ bits per source symbol and recover the data reliably,
 then we will say that the average information 
 content of that source is at most
% less than or equal to
 $L$ bits per symbol.
%
% cut Sat 13/1/01
%
% We will show that, for any source, the information content of the source 
% is intimately  related to its entropy.

\subsection{Example: Compression of text files}
 A file is composed of a sequence of bytes.  A byte is composed of 8
 bits\marginpar{\footnotesize{Here we use the word `bit' with its meaning, `a
 symbol with two values', not to be confused with the
 unit of information content.}}
 and can have a decimal value between 0 and 255.  A
 typical text file is composed of the
 ASCII character set (decimal values 0 to 127). 
 This character set uses only 
 seven of the eight bits in a byte. 
\exercissxB{1}{ex.ascii}{
 By how much could the size of a file be reduced given that 
 it is an ASCII file? How would you achieve this reduction?
} 
 Intuitively, it seems reasonable to assert that an ASCII file 
 contains $7/8$ as much information as an arbitrary file of the same 
 size, since we already know one out of every eight bits before we even 
 look at the file. 
 This is a 
% very
 simple example of redundancy. 
 Most sources of data have further redundancy: English text files
 use the ASCII characters with non-equal frequency; certain pairs 
 of letters are more probable than others;  and entire words 
 can be predicted given the context and a semantic understanding
 of the text.
% this par is repeated in l4. 

% compressibility.

\subsection{Some simple data compression methods that define
     measures of information content}
%
% IDEA: connect back to opening
%
 One way of measuring the information content of a  random variable 
 is  simply to count the number of  {\em possible\/} outcomes,
 $|\A_X|$. (The number of elements in a set $\A$ is denoted by $|\A|$.)
 If we  gave a binary name  to each outcome, the length 
 of each name would be $\log_2 |\A_X|$ bits, if $|\A_X|$ happened
 to be a power of 2.
 We thus make the following definition.  
\begin{description}%%%% was: [Perfect information content] Raw bit content
%%%%%%%%%%%%%%%%%%%%%%% see newcommands1.tex
\item[The \perfectic] of $X$ is
\beq
	H_0(X) = \log_2 |\A_X| .
\eeq 
 \end{description}
 $H_0(X)$ is a lower bound for 
 the number of binary questions that are always guaranteed to identify
 an outcome from the ensemble $X$.
 It is an additive quantity: the \perfectic\ of an ordered pair $x,y$,
 having $|\A_X||\A_Y|$  
 possible outcomes,
 satisfies    
\beq
	H_0(X,Y)= H_0(X) + H_0(Y).
\eeq

 This measure of information content does not include any
 probabilistic element, and the encoding  rule it corresponds to
 does not `compress' the source data, it simply maps each
 outcome
% source character
 to a constant-length binary string.
 
\exercisxA{2}{ex.compress.possible}{
 Could there be a compressor that maps
 an outcome $x$ to a binary code $c(x)$, and a decompressor
 that maps $c$ back to $x$, such that {\em every
 possible outcome\/} is compressed into a binary code
 of length {\em shorter\/}
 than $H_0(X)$ bits?
}
 Even though  a simple counting argument\index{compression!of {\em any\/} file}
 shows that it is impossible to make a reversible
 compression program that reduces the size of {\em all files},
 amateur compression enthusiasts frequently announce that they have invented
 a program that  can do this -- indeed that they can further compress
 compressed files by putting them through their compressor several\index{compression!of already-compressed files}\index{myths!compression}
 times. Stranger yet, patents have
 been granted to these modern-day \ind{alchemists}. See
 the {\tt{comp.compression}} frequently asked questions
% \verb+http://www.faqs.org/faqs/compression-faq/part1/+
 for further reading.\footnote{\tt{http://sunsite.org.uk/public/usenet/news-faqs/comp.compression/}}
%\footnote{\verb+http://www.lib.ox.ac.uk/internet/news/faq/+}
% ............by_category.compression-faq.html+}
% http://www.faqs.org/faqs/compression-faq/part1/preamble.html

 There are only two ways in which a `compressor' can actually
 compress files:
\ben
\item
	A {\dem lossy\/} compressor compresses some\index{compression!lossy}
	files, but maps some files
% {\em distinct\/} files are mapped
 to the
	{\em same\/} encoding. We'll assume that
	the user requires perfect recovery of the source
	file, so  the occurrence of one of these
	 confusable files leads to a failure (though in 
	applications such as \ind{image compression}, lossy compression is viewed as
 satisfactory).  We'll denote by
  $\delta$ 
 the probability of the
	source string's being one of the confusable files, so a
 lossy compressor\index{error probability!in compression}
	has a probability $\delta$ of
	failure.	If $\delta$ can be made very small then
	a lossy compressor may be practically useful. 
\item
	A {\dem lossless} compressor maps all files
 to different encodings; if it
% f a lossless compressor
 shortens some files,\index{compression!lossless}
	it necessarily {\em  makes others longer}.  We try to design the
	compressor so that the probability that a
	file is lengthened is very small, and the probability that
 it is shortened is large.
\een
 In this chapter we  discuss a simple lossy compressor.
 In subsequent chapters we  discuss  lossless compression
 methods.

%
\section{Information content defined in terms of lossy
 compression}
%

 Whichever type of compressor we construct, we need somehow to
 take into account the {\em probabilities\/} of the  different outcomes. 
 Imagine comparing the information contents of
 two text files -- one
 in which all 128 ASCII characters are used with equal probability,
 and one in which the characters are used with their frequencies 
 in English text.
%: $P(x={\tt e})=$, 
% $P(x={\tt e})=$, $P(x={\tt e})=$,$P(x={\tt e})=$,$P(x={\tt e})=$, \ldots
% $P(x={\tt e})=$, \ldots. 
% only the characters {\tt 0} and {\tt 1} are used. 
 Can we define a measure of information content that
 distinguishes between these two files? Intuitively,
 the latter file contains less information per character
 because it is more predictable.

%And a file of {\tt 0}s 
% and {\tt 1}s in which nearly all the characters are {\tt 0}s 
% conveys even less information. 
% Maybe introducing 0 and 1 is not a good idea. 
% At this point I start talking in terms of compression. 
% How can we include a probabilistic element?
 One simple way to use
 our knowledge that some symbols have a smaller probability is
 to imagine recoding the observations into a smaller alphabet -- thus losing
 the ability to encode some of the more improbable
 symbols -- and then measuring the \perfectic\ of the new alphabet.
% choice here - could either map multiple symbols onto 
% one, so the compression is lossy, 
% or could define no entry at all for some symbols, so compression
% fails. 
%  The general mapping situation is not ideal since I really want all 
% the losers to be mapped to one symbol. Student might imagine mapping
% Z and z to Z, Y and y to Y.. and claim they are losing little info.
% But this messes up the defn of delta.
 For example, 
 we might take a risk when compressing English text, guessing that the most
 infrequent  characters won't occur, 
 and make a reduced ASCII code that omits the characters
% for example, 
%  `\verb+!+', `\verb+@+', `\verb+#+',
%  `\verb+$+', `\verb+%+', `\verb+^+', `\verb+*+', `\verb+~+', 
%  `\verb+<+', `\verb+>+', `\verb+/+',   `\verb+\+',  `\verb+_+',
%  `\verb+{+',  `\verb+}+',  `\verb+[+',  `\verb+]+',
%  and `\verb+|+',
 $\{$ \verb+!+, \verb+@+, \verb+#+,
% \verb+$+, $
 \verb+%+, \verb+^+, \verb+*+, \verb+~+, 
 \verb+<+, \verb+>+, \verb+/+,   \verb+\+,  \verb+_+,
 \verb+{+,  \verb+}+,  \verb+[+,  \verb+]+, \verb+|+ $\}$,
 thereby reducing the size of the alphabet
% the total number of characters
 by seventeen.
%
% cut this dec 2000
% Thus we can give new
%%%%  a (not necessarily unique)
% names to a {\em subset\/} of the possible outcomes and count how many names we
% use.
 The larger the risk we are willing to take, the smaller
 our final alphabet becomes.
% ] the number of names we need.
% We thus relax the exhaustive requirement of the definition of 
%
% aside
%
% We could imagine doing this to the numbers coming out of the guessing 
% game with which this chapter started, for example. It seems 
% quite unlikely that the subject would have to guess 25, 26 or 27 times 
% to get the next letter; these outcomes 
%%`27' is
% are very improbable, 
% and we might be willing to record the sequence of numbers using 
% 24 symbols only, taking the gamble that in fact more guesses might 
% be needed. 

 We  introduce a parameter $\delta$ that describes the risk we 
 are taking when using this compression method:  $\delta$ is 
 the probability that there will be no name for an outcome $x$.
\exampl{exHdelta}{
 Let 
\beq
\begin{array}{l*{14}{@{\,}c}}
     & \A_X & = & \{  & {\tt a},& {\tt b},&{\tt c},&{\tt d},&{\tt e},&{\tt f},&{\tt g},&{\tt h} & \}, \\
 \mbox{and }\:\:
  & \P_X & = & \bigl\{  &    \frac{1}{4} ,&    \frac{1}{4} ,&   \frac{1}{4} ,&  \frac{3}{16} ,&  \frac{1}{64} ,&  \frac{1}{64} ,&  \frac{1}{64} ,&  \frac{1}{64}  & \bigr\} .
\end{array} 
\eeq
 The \perfectic\ of this ensemble is 3 bits, corresponding to 
 8 binary names.
 But notice that $P( x \in \{ {\tt a}, {\tt b}, {\tt c}, {\tt d} \} ) = 15/16$.
 So if we are willing to run a risk of $\delta=1/16$ of not having a name
 for $x$, then we can get by with four names --
 half as many names as are needed if
 every $x \in \A_X$  has a name.

 Figure \ref{fig.delta.examples} shows binary names that could be given 
 to the different outcomes in the cases $\delta = 0$ and $\delta = 1/16$.
 When $\delta=0$ we need 3 bits to encode the outcome;
 when $\delta=1/16$ we only need 2 bits. 
}

%\begin{figure}[htbp]
%\figuremargin{%
\amarginfig{t}{
\begin{center}
\begin{tabular}{cc} 
\toprule
\multicolumn{2}{c}{$\delta = 0$}
\\
\midrule
$x$ & $c(x)$ \\ \midrule
{\tt a} & {\tt{000}} \\
{\tt b} & {\tt{001}} \\
{\tt c} & {\tt{010}} \\
{\tt d} & {\tt{011}} \\
{\tt e} & {\tt{100}} \\
{\tt f} & {\tt{101}} \\
{\tt g} & {\tt{110}} \\
{\tt h} & {\tt{111}} \\
 \bottomrule
\end{tabular}
% \hspace{0.61in}
\hspace{0.1in}
\begin{tabular}{cc} 
\toprule
\multicolumn{2}{c}{$\delta = 1/16$}
\\
\midrule
$x$ & $c(x)$ \\ \midrule
{\tt a} & {\tt{00}} \\
{\tt b} & {\tt{01}} \\
{\tt c} & {\tt{10}} \\
{\tt d} & {\tt{11}} \\
{\tt e} & $-$ \\
{\tt f} & $-$ \\
{\tt g} & $-$ \\
{\tt h} & $-$ \\
 \bottomrule
\end{tabular}
\end{center}
%}{%
\caption[a]{Binary names for the outcomes,
 for two failure probabilities $\delta$.}
\label{fig.delta.examples}
}%
%\end{figure}

%\noindent
 Let us now formalize this idea.\index{source code}
%
 To make a compression strategy with risk $\delta$,
% we consider all  subsets $T$ of the alphabet $\A_X$ and 
% seek out
 we make the smallest possible subset
 $S_{\delta}$ such that  the
 probability that $x$ is not in $S_{\delta}$ is less than or equal to 
 $\delta$, \ie,
 $P(x \not\in S_{\delta} ) \leq \delta$. For each value of $\delta$ we can then
 define a new measure of information content -- the log of the size
 of this smallest subset $S_{\delta}$. [In ensembles in which
 several elements have the same probability, there may be several
 smallest subsets that contain different elements, but all that matters
 is their sizes (which are equal), so we will not dwell on this ambiguity.]
% worry about this possibility.
\begin{description}
\item[The smallest $\delta$-sufficient subset] $S_{\delta}$ is the smallest
	subset of $\A_X$ satisfying
\beq
	P(x \in S_{\delta} ) \geq 1 - \delta.
\eeq
%\beq
% S_{\delta} = \argmin 
%\eeq
\end{description}
 The subset  $S_{\delta}$ can be constructed by
 ranking the elements of $\A_X$ in order of decreasing probability
 and adding successive elements starting from the
 most probable elements
% front of the list
 until the total
 probability is $\geq$ ($1\!-\!\delta$).

 We can make a data compression code by assigning a binary name
 to each element of the smallest sufficient subset. This compression  
 scheme motivates the following measure of information content: 
\begin{description}
\item[The \essentialic] of $X$ is: %%%%% was ESSENTIAL information content
% consider risk-delta bit content?
\beq
	H_{\delta}(X) = \log_2 |S_{\delta}|
% =	\log_2 \min 	\left\{ |S| : S\subseteq \A_X,
%% P(S)\geq 1-\delta \right\}.
% P(x \in S)\geq 1-\delta \right\}.
\eeq
\end{description}
 Note that $H_0(X)$ is the special case of $H_{\delta}(X)$ with $\delta = 0$ 
 (if $P(x) > 0$ for all $x \in \A_X$). 
%
 [{\sf Caution:} Do not confuse $H_0(X)$ and $H_{\delta}(X)$
 with the function $H_2(p)$ displayed in \figref{fig.h2}.] 

%%%%%%%(Should  I change notation to avoid confusion?)
%
\newcommand{\gapline}{\cline{1-4}\cline{6-9}}
\begin{figure}
\figuremargin{%
\begin{center}
\footnotesize%
\begin{tabular}{rc}
(a)&
%%%%%%%% written by hand
%
% picture of Sdelta for X
%
% rewritten to change orientation 99 07 26. (old version preserved in X.tex.bak)
%
\newcommand{\forestgap}{-4}
\newcommand{\forest}[3]{\multiput(#1)(\forestgap,0){#2}{\line(0,1){#3}}}
\setlength{\unitlength}{0.61pt}
\begin{picture}(600,205)(-660,-105)% was (600,220)(-660,-120) Sun 22/12/02
% - log P = 2.0 , 2.4 and 6.0
\forest{-200,0}{3}{64}% 4/16 = 16/64
\forest{-241,0}{1}{48}% 3/16 = 12/64
\forest{-600,0}{4}{10}%  1/64
% \forest{600,0}{4}{4}%  1/64 should be height 4 to be literal
% axis: 
\put(-660,95){\vector(1,0){520}}
%
% axis labels
\put(-125,105){\makebox(0,0)[b]{$\log_2 P(x)$}}
\put(-200,100){\makebox(0,0)[b]{$-2$}}
\put(-241,100){\makebox(0,0)[b]{$-2.4$}}
\put(-400,100){\makebox(0,0)[b]{$-4$}}
\put(-600,100){\makebox(0,0)[b]{$-6$}}
%
% the S0 box 445+185=630
\put(-630,-15){\framebox(445,95){}}
\put(-638,35){\makebox(0,0)[r]{$S_0$}}
% S 1/16 box
\put(-271,-10){\framebox(81,85){}}
\put(-274,35){\makebox(0,0)[r]{$S_{\frac{1}{16}}$}}
%
% object labels
\put(-200,-60){\makebox(0,0)[t]{{\tt a},{\tt b},{\tt c}}}
\put(-241,-60){\makebox(0,0)[t]{{\tt d}}}
\put(-600,-60){\makebox(0,0)[t]{{\tt e},{\tt f},{\tt g},{\tt h}}}
\put(-200,-50){\vector(0,1){45}}
\put(-241,-50){\vector(0,1){45}}
\put(-600,-50){\vector(0,1){45}}
\end{picture}
%
%
%

(b)&
\mbox{\makebox[0in][r]{\raisebox{1.3in}{$H_{\delta}(X)$}}\hspace{-5mm}%
\psfig{figure=Hdelta/byhand/X.ps,%
width=70mm,angle=-90}$\delta$}%
\\
\end{tabular}
\end{center}
}{%
\caption[a]{(a) The outcomes of $X$ (from \protect\exampleref{exHdelta}),
 ranked by their probability.
 (b) The 
 \essentialic\ $H_{\delta}(X)$. The labels on the graph
 show the smallest sufficient set as a function of $\delta$.
  Note  $H_0(X) = 3$ bits and $H_{1/16}(X) = 2$ bits. 
}
\label{fig.hd.1}
}
\end{figure}

%\noindent
{\Figref{fig.hd.1} shows $H_{\delta}(X)$ for the ensemble
 of \exampleonlyref{exHdelta} as a function of  $\delta$.
}  

\subsection{Extended ensembles}
% The compression method we're studying in which a subset of
% outcomes are given binary names is not giving us a
% measure of information content for a single symbol.
%
% sanjoy wants a motivation here.
%
 Is this compression method any more useful if we compress
 {\em blocks\/} of symbols from a source?\index{source code!block code}\index{ensemble!extended}\index{extended ensemble}
%

 We now turn to examples where the outcome $\bx = (x_1,x_2,\ldots, x_N)$ is a string of  $N$
 independent identically distributed random variables
 from a single ensemble $X$. 
 We will denote by
% $\bX$ or
 $X^N$ the ensemble $( X_1, X_2, \ldots, X_N )$.
% for which $\bx$ is the random variable.
 Remember that entropy is additive for independent variables (\exerciseref{ex.Hadditive}),
% \footnote{There should have been an exercise on this by now.}
 so 
% $H(\bX) = N H(X)$. 
 $H(X^N) = N H(X)$. 

\exampl{ex.Nfrom.1}{
% {\sf Example 2:}
 Consider a string of $N$ flips of a bent coin,
 $\bx = (x_1,x_2,\ldots, x_N)$, where $x_n \in
 \{{\tt{0}},{\tt{1}}\}$, with probabilities $p_0 \eq 0.9,$ $p_1 \eq
 0.1$. The most probable strings $\bx$ are those with most {\tt{0}}s.  If
 $r(\bx)$ is the number of {\tt{1}}s in $\bx$ then
\beq
% |p_0,p_1
 P(\bx) = p_0^{N-r(\bx)} p_1^{r(\bx)} .
\eeq
 To evaluate  $H_{\delta}(X^N)$
 we must find the smallest sufficient subset $S_{\delta}$.
 This  subset will contain 
 all $\bx$ with $r(\bx) = 0, 1, 2, \ldots$, up to some $r_{\max}(\delta)-1$,
 and some of the $\bx$ with $r(\bx) = r_{\max}(\delta)$.
% Working backwards, we can evaluate the cumulative probability 
% $P(r(\bx) \leq r)$ and evaluate the size of the subset $T(r): \{ \bx:
% r(\bx) \leq r \}$. 
%\beq
%	|T(r)| = \sum_{r=0}^{r} \frac{N!}{(N-r)!r!}
%\label{l2.T}
%\eeq
%\beq
%	P(r(\bx) \leq r)  = \sum_{r=0}^{r} \frac{N!}{(N-r)!r!}  p_0^{N-r} p_1^{r}
%\label{l2.Pr}
%\eeq
% We can then plot $\log |T(r)|$ versus $P(r(\bx) \leq r)$. This defines 
% a graph of $H_{\delta}(\bX)$ against $\delta$. 
 Figures \ref{fig.hd.4} and \ref{fig.hd.10} 
% Figure \ref{fig.hd.4}
 show  graphs of $H_{\delta}(X^N)$ against
 $\delta$ for the cases $N=4$ and $N=10$. The steps are the values of
 $\delta$ at which $|S_{\delta}|$ changes by~1, and the cusps where the slope
 of the staircase changes are the points
 where $r_{\max}$ changes by 1.  
}
\exercisxC{2}{ex.cusps}{
 What are the mathematical shapes of the curves between the cusps? 
}
% , both with $p_1 =
% 0.1$.  The points defined by equations (\ref{l2.T}) and (\ref{l2.Pr})
% are the cusps in the curve.
%
% I think this figure may be sick. CHECK IT.
%
\renewcommand{\gapline}{\cline{1-3}\cline{5-8}}
\begin{figure}
\figuremargin{%
%
% this table done by hand with help of (above hd.p command) /home/mackay/itp/Hdelta> more figs/4.tex
%
\begin{center}
\footnotesize%
\begin{tabular}{r@{\hspace*{-0.3in}}c}
(a)&
%%%%%%%% written by hand    see also X.tex
%
% picture of Sdelta for X^4
%
\newcommand{\axislevel}{24}
\newcommand{\axislevelp}{29.5}
\newcommand{\axislevelm}{21}
\newcommand{\axislevelmm}{18}
\newcommand{\forestgap}{-0.7}
\newcommand{\forest}[3]{\multiput(#1)(\forestgap,0){#2}{\line(0,1){#3}}}
%
%
%
\setlength{\unitlength}{2.2pt}%
\begin{picture}(155,50)(-143,-20)% adjusted vertical height from 50 to 60 Sat 5/10/02. And put back again Sun 22/12/02  was (-143,-22) Sun 22/12/02
% - log P = 2.0 , 2.4 and 6.0
\forest{-6.1,0}{1}{16}% heights fictitious
\forest{-37.3,0}{4}{12.5}% 
\forest{-68.5,0}{6}{9.4}% 69.5
\forest{-100.8,0}{4}{6.3}%
\forest{-132.9,0}{1}{4.2}%
% axis: 
\put(-143,\axislevelm){\vector(1,0){151.0}}
%
% axis labels
\put(5,\axislevelp){\makebox(0,0)[b]{\small$\log_2 P(x)$}}
\put(0,\axislevel){\makebox(0,0)[b]{\small$0$}}
\put(-20,\axislevel){\makebox(0,0)[b]{\small$-2$}}
\put(-40,\axislevel){\makebox(0,0)[b]{\small$-4$}}
\put(-60,\axislevel){\makebox(0,0)[b]{\small$-6$}}
\put(-80,\axislevel){\makebox(0,0)[b]{\small$-8$}}
\put(-100,\axislevel){\makebox(0,0)[b]{\small$-10$}}
\put(-120,\axislevel){\makebox(0,0)[b]{\small$-12$}}
\put(-140,\axislevel){\makebox(0,0)[b]{\small$-14$}}
%
% this box is right size for the whole set
%\put(0,-2.5){\framebox(140,\axislevelm){}}
%\put(142,13){\makebox(0,0)[l]{\small$S_0$}}
% this box is round 3 clumps
\put(-83.5,-2.5){\framebox(83.5,\axislevelm){}}
\put(-84.5,13){\makebox(0,0)[r]{\small$S_{0.01}$}}
% a smaller box round 3 clumps
%\put(2.5,-1){\framebox(81,\axislevelmm){}}
%
\put(-53.5,-1){\framebox(51,\axislevelmm){}}
\put(-54.5,13){\makebox(0,0)[r]{\small$S_{0.1}$}}
%
% object labels
\put(-6.1,-12){\makebox(0,0)[t]{\footnotesize{\tt 0000}}}
\put(-37.7,-12){\makebox(0,0)[t]{\footnotesize{\tt 0010},{\tt 0001},\ldots}}
\put(-69.5,-12){\makebox(0,0)[t]{\footnotesize{\tt 0110},{\tt 1010},$\ldots$}}
\put(-101.2,-12){\makebox(0,0)[t]{\footnotesize{\tt 1101},{\tt 1011},$\ldots$}}
\put(-132.9,-12){\makebox(0,0)[t]{\footnotesize{\tt 1111}}}
\multiput(-6.1,-10)(-31.6,0){5}{\vector(0,1){5}}  
\end{picture}
%
%
%
%

(b)&
\makebox[0in][r]{\raisebox{1.3in}{$H_{\delta}(X^4)$}}\hspace{-5mm}%
\psfig{figure=Hdelta/figs/hd/4.ps,%
width=65mm,angle=-90}$\delta$%%
%
% 
% useful for making table: 
% hd.p mmin=4 mmax=4 mstep=6 scale_by_n=0 plot_sub_graphs=1 latex=1 
%
\end{tabular}
\end{center}
}{%
%
% I think this figure may be sick. CHECK IT.
%
\caption[a]{(a) The sixteen outcomes of the ensemble $X^4$ with $p_1=0.1$, ranked by probability. (b) The
 \essentialic\ $H_{\delta}(X^4)$. The upper
 schematic diagram indicates the strings'
 probabilities by the vertical lines' lengths (not to scale).}
\label{fig.hd.4}
}%
\end{figure}
%
%
%
\begin{figure}%[htbp]
\figuremargin{%
\begin{center}
\mbox{%%%%%%%%%%%%% (twocol) %}\\ \mbox{
\makebox[0in][r]{\raisebox{1.3in}{$H_{\delta}(X^{10})$}}\hspace{-5mm}%
\psfig{figure=Hdelta/figs/hd/10.ps,%
width=65mm,angle=-90}$\delta$}
% command, in Hdelta: 
% hd.p mmin=4 mmax=10 mstep=6 scale_by_n=0 plot_sub_graphs=1 | gnuplot 
\end{center}
}{%
\caption[a]{$H_{\delta}(X^N)$ 	for  $N=10$ binary variables with $p_1=0.1$.}
\label{fig.hd.10}
}%
\end{figure}

 For the examples shown in figures \ref{fig.hd.1}--\ref{fig.hd.10},
 $H_{\delta}(X^N)$ depends strongly on the 
 value of $\delta$, so it might not seem  a  fundamental or useful 
 definition of information content.  
 But we will consider what happens as $N$, the number of independent variables
 in $X^N$, increases. We will find the remarkable result that 
 $H_{\delta}(X^N)$ becomes almost independent of $\delta$ -- and for all 
 $\delta$ it is very close to $N H(X)$, where $H(X)$ is the 
 entropy of one of the random variables.
% sketch? 
\begin{figure}
\figuremargin{%
\begin{center}
\mbox{\makebox[0in][r]{\raisebox{1.3in}{$\frac{1}{N}H_{\delta}(X^{N})$}}\hspace{-5mm}%
\psfig{figure=Hdelta/figs/hd/all.10.1010.ps,%
width=65mm,angle=-90}$\delta$}
\end{center}
}{%
\caption[a]{$\frac{1}{N} H_{\delta}(X^{N})$ 
	for  $N=10, 210, \dots,1010$ binary variables with $p_1=0.1$.}
\label{fig.hd.10.1010}
}
\end{figure}


 \Figref{fig.hd.10.1010} illustrates this asymptotic tendency for 
 the binary ensemble of  example \ref{ex.Nfrom.1}.
% discussed earlier with $N$ binary variables with $p_1 = 0.1$. 
 As $N$ increases, $\frac{1}{N} H_{\delta}(X^N)$  becomes an increasingly 
 flat function, except for tails close to $\delta=0$ and $1$.
%  The limiting value of  the plateau is $H(X) = 0.47$.
% We will explain and prove this result in the remainder of
% this chapter. Let's first note the implications of this result.
% The limiting value of the plateau, which for  $N$ binary variables with $p_1 = 0.1$
% appears to be about 0.5, defines how much compression is possible:
% $N$  binary variables with $p_1 = 0.1$ can be compressed into
% about $N/2$ bits, with a probability of error $\delta$ which
% can be any value between 0 and 1.
% We will show that the plateau value to which  $\frac{1}{N} H_{\delta}(X^N)$
% tends, for large $N$, is the entropy, $H(X)$.
%
% IDEA: Box this next sentence?
%
 As long as we are allowed
 a tiny probability of error $\delta$, compression down to
 $NH$ bits is possible. Even if we are allowed a large probability of error,
 we still can  compress only down to $NH$ bits.
%
% IDEA: Box above?
%
 This is the \ind{source coding theorem}.
% \subsection{The theorem}
\begin{ctheorem}
\label{thm.sct}
 {\sf Shannon's source coding theorem.}
% HOW TO NAME THIS?????????????????
% this name is taken later
	Let $X$ be an ensemble with entropy $H(X) = H$ bits. Given $\epsilon>0$
 and $0<\delta<1$, there exists a positive integer $N_0$ such that for 
 $N>N_0$, 
\beq
 \left| \frac{1}{N} H_{\delta}(X^N) - H \right| < \epsilon. 
\eeq
\end{ctheorem}
%
% sanjoy wants explan here
%
% The reason that increasing $N$ helps is that, if $N$ is large,
% the outcome $\bx$ 

\section{Typicality}
 Why does increasing $N$ help?\index{typicality}
 Let's examine long strings from $X^N$.
 Table \ref{tab.typical.tcl} shows fifteen samples from $X^N$ 
 for  $N=100$ and $p_1=0.1$.
\begin{figure}
\figuremargin{%
\begin{center}
\begin{tabular}{lr} \toprule
$\bx$ &
% \multicolumn{1}{c}{$\log_2(P(\bx))$}
\hspace{-0.3in}{$\log_2(P(\bx))$}
% {\rule[-3mm]{0pt}{8mm}}%strut
 \\ \midrule
% REQUIRE MONOSPACED FONT!!!
{\tinytt{%VERB
...1...................1.....1....1.1.......1........1...........1.....................1.......11...%END
}} & $-$50.1  \\
{\tinytt{%VERB
......................1.....1.....1.......1....1.........1.....................................1....%END
}} & $-$37.3  \\
{\tinytt{%VERB
........1....1..1...1....11..1.1.........11.........................1...1.1..1...1................1.%END
}} & $-$65.9  \\
{\tinytt{%VERB
1.1...1................1.......................11.1..1............................1.....1..1.11.....%END
}} & $-$56.4  \\
{\tinytt{%VERB
...11...........1...1.....1.1......1..........1....1...1.....1............1.........................%END
}} & $-$53.2  \\
{\tinytt{%VERB
..............1......1.........1.1.......1..........1............1...1......................1.......%END
}} & $-$43.7  \\
{\tinytt{%VERB
.....1........1.......1...1............1............1...........1......1..11........................%END
}} & $-$46.8  \\
{\tinytt{%VERB
.....1..1..1...............111...................1...............1.........1.1...1...1.............1%END
}} & $-$56.4  \\
{\tinytt{%VERB
.........1..........1.....1......1..........1....1..............................................1...%END
}} & $-$37.3  \\
{\tinytt{%VERB
......1........................1..............1.....1..1.1.1..1...................................1.%END
}} & $-$43.7  \\
{\tinytt{%VERB
1.......................1..........1...1...................1....1....1........1..11..1.1...1........%END
}} & $-$56.4  \\
{\tinytt{%VERB
...........11.1.........1................1......1.....................1.............................%END
}} & $-$37.3  \\
{\tinytt{%VERB
.1..........1...1.1.............1.......11...........1.1...1..............1.............11..........%END
}} & $-$56.4  \\
{\tinytt{%VERB
......1...1..1.....1..11.1.1.1...1.....................1............1.............1..1..............%END
}} & $-$59.5  \\
{\tinytt{%VERB
............11.1......1....1..1............................1.......1..............1.......1.........%END
}} & $-$46.8  \\ \midrule % [0.2in]
%															 
{\tinytt{%VERB
....................................................................................................%END
}} & $-$15.2 \\
{\tinytt{%VERB
1111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111%END
}} & $-$332.1\\
%
\bottomrule
\end{tabular}
\end{center}
}{%
\caption[a]{The top 15 strings are samples from $X^{100}$, 
 where $p_1 = 0.1$ and $p_0 = 0.9$. 
 The bottom two are the most and least probable strings in this ensemble.
 The final column shows the 
% Compare the
 log-probabilities of the random strings,
 which may be compared with the entropy
% with 
% the \aep: $H(X) = 0.469$, so
 $H(X^{100}) = 46.9$ bits.}
\label{tab.typical.tcl}
}
\end{figure}
% 1000 Typical set size +/-    28.46 has log_2(p(x)) within +/-    90.22
%  i.e. 1/N (logp) is within 0.090
% 100  Typical set size +/-        9 has log_2(p(x)) within +/-    28.53
%  i.e. 1/N(logp) is within 0.285
% 200  Typical set size +/-    12.73 has log_2(p(x)) within +/-    40.35
%
% N=100 alternative (see hd.p for the commands)
%
\begin{figure}
\fullwidthfigureright{
%\figuremargin{%
\begin{center}
\begin{tabular}{r@{\hspace*{-0in}}c@{\hspace*{-0.1in}}c} \toprule
 & $N=100$ & $N=1000$ \\ \midrule
\raisebox{0.71in}{\small$n(r) = {N \choose r}$}
  & \mbox{\psfig{figure=Hdelta/figs/num/100.ps,%
width=50mm,angle=-90}} 
  & \mbox{\psfig{figure=Hdelta/figs/num/1000.ps,%
width=50mm,angle=-90}} \\
\raisebox{0.71in}{\small$p(\bx) = p_1^r (1-p_1)^{N-r}$}
 & \mbox{\psfig{figure=Hdelta/figs/per/100.ps,%
width=50mm,angle=-90}}%
\makebox[0in][r]{\raisebox{0.4in}{%
\psfig{figure=Hdelta/figs/perdet/100.ps,%
width=30mm,angle=-90}}\hspace{0.2in}} 
&
\\
\raisebox{0.71in}{\small$\log_2 p(\bx)$}
 & \mbox{\psfig{figure=Hdelta/figs/logper/100.ps,%
width=50mm,angle=-90}} 
 & \mbox{\psfig{figure=Hdelta/figs/logper/1000.ps,%
width=50mm,angle=-90}} \\
\raisebox{0.71in}{\small$n(r)p(\bx)= {N \choose r} p_1^r (1-p_1)^{N-r}$} 
& \mbox{\psfig{figure=Hdelta/figs/tot/100.ps,%
width=50mm,angle=-90}}
& \mbox{\psfig{figure=Hdelta/figs/tot/1000.ps,%
width=50mm,angle=-90}}
% \makebox[0in][l]{$r$}
\\
 & 
$r$ & $r$  \\ \bottomrule
\end{tabular}
\end{center}
}{%
\caption[a]{Anatomy of the typical set $T$.
	For  $p_1=0.1$ 
 and $N=100$ and $N=1000$, these graphs show $n(r)$, the number of 
 strings containing $r$ {\tt{1}}s; the probability $p(\bx)$ of a single 
 string that contains $r$ {\tt{1}}s; the same probability on a 
 log scale; and the total probability 
 $n(r)p(\bx)$ of all strings that contain $r$ {\tt{1}}s. 
 The number $r$ is on the  horizontal axis. 
 The plot of $\log_2 p(\bx)$ also shows by a dotted line the mean value of
 $\log_2 p(\bx) = -N H_2(p_1)$ which equals $-46.9$
 when $N=100$ and $-469$ when $N=1000$. The typical set includes 
 only the strings that have $\log_2 p(\bx)$ close to this value.
 The range marked {\sf T} shows the set $T_{N \beta}$ (as defined
 in \protect\sectionref{sec.ts})
 for $N=100$ and $\beta = 0.29$ (left) and  $N=1000$,  $\beta = 0.09$ (right).
} 
\label{fig.num.per.tot}
}%
\end{figure}
 The probability of a string $\bx$ that contains $r$ {\tt{1}}s and
 $N\!-\!r$ {\tt{0}}s is
\beq
	p(\bx) = p_1^r (1-p_1)^{N-r} .
\eeq
 The number of strings that contain $r$ {\tt{1}}s  is
\beq
	n(r) =  {N \choose r} .
\eeq
 So the number of {\tt{1}}s, $r$, has a binomial distribution:
\beq
	p(r) =  {N \choose r} p_1^r (1-p_1)^{N-r} .
\eeq
 These functions are shown in \figref{fig.num.per.tot}.
The mean of $r$ is $N p_1$, and its standard deviation is
 $\sqrt{N p_1 (1-p_1)}$ (\pref{sec.first.binomial}).
 If $N$ is 100 then
\beq
	r \sim 	N p_1 \pm  \sqrt{N p_1 (1-p_1)} \simeq 10 \pm 3 .
\eeq
 If $N=1000$ then 
\beq
	r \sim 100 \pm 10 .
\eeq
 Notice that as $N$ gets bigger, the probability distribution
 of $r$ becomes more concentrated, in the sense that
 while the
  range of possible values of $r$  grows
 as $N$, the standard deviation of $r$  only
 grows as $\sqrt{N}$. 
 That $r$ is most likely to fall
 in a small range of values implies
 that the outcome $\bx$ is also most likely to
 fall in a corresponding  small  subset of outcomes
 that we will call the {{\dbf\inds{typical set}}}. 

\subsection{Definition of the typical set}
\label{sec.ts}
% Let us generalize our discussion to an arbitrary ensemble $X$
% with alphabet $\A_X$
% and define typicality.
 Let us define \ind{typicality}\index{typical set!for compression}
 for an arbitrary ensemble $X$
 with alphabet $\A_X$.
 Our definition of a typical string will
 involve the string's probability.
 A long string
% message
 of $N$ symbols will usually
 contain
% with high	probability
 about $p_1N$ occurrences of the first symbol,
	$p_2N$ occurrences of the second, etc. Hence the probability
	of this string
% long message
 is roughly
\beq
	p(\bx)_{\rm TYP}
 = p(x_1)p(x_2)p(x_3) \ldots p(x_N)
 \simeq p_1^{(p_1N)} p_2^{(p_2N)} \ldots p_I^{(p_IN)}
\eeq
%  p_i^{p_iN}
 so that
 the information content of a typical string is
\beq
	\log_2 \frac{1}{p(\bx)}
 \simeq N \sum_i p_i \log_2 \frac{1}{p_i} = N H . 
\eeq
	So the random variable $\log_2 \dfrac{1}{p(\bx)}$,
%	So the random variable $\frac{1}{N} \log_2 \frac{1}{p(\bx)}$,
% which is the average information content per symbol, is
 which is the  information content of $\bx$, is
	very likely to be close in  value  to $N H$.
 We build our definition of typicality on this observation.

 We   define  the typical elements of $\A_X^N$ to be
	those elements that
	have probability  close to $2^{-NH}$. (Note that the typical set,
 unlike the
% best subset for compression
 smallest sufficient subset,  does
	{\em not\/} include the most probable elements of $\A_X^N$, but we
	will show that these most probable elements
 contribute negligible probability.)

 We introduce a parameter $\beta$ that defines how close
 the probability has to be  to   $2^{-NH}$ for
 an element to be `typical'.
% $\beta$-
 We call the set of typical elements the typical set,
% $T$, or, to be more precise,
 $T_{N \beta}$
% , where the parameter $\beta$
%% controls the breadth of the typical set by defining
% defines what we mean by a probability `close' to $2^{-NH}$:
\beq
	T_{N\b} \equiv \left\{ \bx\in\A_X^N : 
	\left| \frac{1}{N} \log_2 \frac{1}{P(\bx)} - H \right| < \b 
	\right\} .
\label{eq.TNb}
\eeq
%
% check whether < has propagated to all necvessary places
%

 We will show  that whatever value of $\beta$ we choose,
 the typical set  contains almost all the probability
 as $N$ increases. 

 This important result is sometimes called the
 `Asymptotic Equipartition' Principle.
% \newpage
%\section{`Asymptotic Equipartition' and Source Coding}
\label{sec.aep}
%	We will prove the following result: 
\begin{description}
\item[`Asymptotic Equipartition' Principle.]
% (AEP).]
 For an ensemble of $N$ independent identically distributed (i.i.d.)
 random variables 
 $X^N \equiv ( X_1, X_2, \ldots, X_N )$, with $N$ sufficiently large, 
 the outcome $\bx = (x_1,x_2,\ldots, x_N)$ is almost certain to belong 
 to a subset of $\A_X^N$ having only $2^{N H(X)}$ members, each having 
 probability `close to' $2^{-N H(X)}$.
\end{description}
 Notice that if $H(X) < H_0(X)$ then $2^{N H(X)}$ is a {\em tiny\/}
 fraction of the number of possible outcomes $|\A_X^N|=|\A_X|^N=2^{N
 H_0(X)}.$

\begin{aside}
 The term \ind{equipartition} is chosen to describe the idea
 that the members of the typical set have {\em roughly equal\/}
 probability. [This should not be taken too literally, hence my
 use of quotes around `asymptotic equipartition';
% in the phrase \aep;
 see page \pageref{sec.aep.caveat}.]

 A second meaning for equipartition, in thermal physics,
 is the idea that each degree of freedom of a classical system
 has {\em equal\/} average energy, $\dhalf kT$. This second meaning
 is not intended here.
\end{aside}

%
	The \aep\ is equivalent to:
\begin{description}
\item[Shannon's source coding theorem (verbal statement).]
	 $N$ i.i.d.\ random variables each 
	with entropy $H(X)$ can be compressed into more than $NH(X)$ bits with 
	negligible risk of information  loss, as $N\rightarrow \infty$; 
	conversely if they are compressed into fewer than $NH(X)$ bits 
 	it is virtually certain that information will be lost.
\end{description}
 These two theorems are equivalent
	because we can define a compression algorithm that gives a distinct 
 name of length $N H(X)$ bits to each $\bx$ in the typical set.
% probable subset. 
% as follows: 
% enumerate the $\bx$ belonging to 
% the subset of $2^{N H(X)}$ equiprobable outcomes as 000\ldots000, 
% 000\ldots001, etc. 


\begin{figure}
\figuredangle{%
\begin{center}
%%%%%%%% written by hand    see also X.tex
%
% picture of Sdelta for X^100
%
\newcommand{\axislevel}{27}
\newcommand{\axislevelp}{32.5}
\newcommand{\axislevelm}{24}
\newcommand{\axislevelmm}{21}
\newcommand{\forestgap}{-0.4}
\newcommand{\forestgab}{-0.6}
\newcommand{\forestgac}{-0.56}
\newcommand{\forestgad}{-0.52}
\newcommand{\forestgae}{-0.48}
\newcommand{\forestgaf}{-0.44}
% \newcommand{\forestgag}{0.48}
%\newcommand{\forestgap}{0.35} was .35 when I went up to 14.
\newcommand{\forest}[3]{\multiput(#1)(\forestgap,0){#2}{\line(0,1){#3}}}
\newcommand{\foresb}[4]{\multiput(#1)(#4,0){#2}{\line(0,1){#3}}}
%
% picture
%
%\setlength{\unitlength}{2.45pt}%
\setlength{\unitlength}{2.87pt}%
\begin{picture}(170,71)(-170,-42)
\forest{0,0}{1}{16.5}%
\foresb{-5,0}{2}{16}{\forestgab}
\foresb{-10,0}{3}{15.5}{\forestgab}
\foresb{-15,0}{4}{15}{\forestgac}
\foresb{-20,0}{5}{14.5}{\forestgad}
\foresb{-25,0}{6}{14}{\forestgae}
\foresb{-30,0}{7}{13.5}{\forestgaf}
\foresb{-35,0}{8}{13}{\forestgap}
\foresb{-40,0}{9}{12.5}{\forestgap}
\forest{-45,0}{10}{12}%
\forest{-50,0}{11}{11.5}%
\forest{-55,0}{12}{11}%
\forest{-60,0}{12}{10.5}%
\forest{-65,0}{12}{10}%
\forest{-70,0}{12}{9.5}%
\forest{-75,0}{12}{9}%
\forest{-80,0}{12}{8.5}%
\forest{-85,0}{12}{8}%
\forest{-90,0}{12}{7.5}%
\forest{-95,0}{12}{7}%
\forest{-100,0}{12}{6.5}%
\forest{-105,0}{12}{6}%
\forest{-110,0}{12}{5.5}%
\forest{-115,0}{11}{5}%
\forest{-120,0}{10}{4.5}%
\foresb{-125,0}{9}{4.2}{\forestgap}
\foresb{-130,0}{8}{3.9}{\forestgap}
\foresb{-135,0}{7}{3.6}{\forestgaf}
\foresb{-140,0}{6}{3.3}{\forestgae}
\foresb{-145,0}{5}{3.0}{\forestgad}
\foresb{-150,0}{4}{2.7}{\forestgac}
\foresb{-155,0}{3}{2.4}{\forestgab}
\foresb{-160,0}{2}{2.1}{\forestgab}
\forest{-165,0}{1}{1.8}%
%
% axis: 
\put(-168,\axislevelm){\vector(1,0){171.0}}
%
% axis labels
\put(0,\axislevelp){\makebox(0,0)[br]{\small$\log_2 P(x)$}}
\put(-42.4,\axislevel){\makebox(0,0)[b]{\small$-NH(X)$}}
% tic mark  (was at -40 until Tue 8/1/02)
\put(-42.4,\axislevelm){\line(0,1){2}}
% the S0 box
%\put(-3,-2.5){\framebox(172,\axislevelm){}}
%\put(142,16){\makebox(0,0)[l]{$S_0$}}
%
%
% typical set box
\put(-49.5,-1){\framebox(15,\axislevelmm){}}
\put(-51,16){\makebox(0,0)[r]{$T_{N\b}$}}
%
% object labels
\put(0,-40){\vector(0,1){35}}  
\put(-15,-35){\vector(0,1){30}}  
%\put(26,-30){\vector(0,1){25}}  
\put(-36,-25){\vector(0,1){20}}  
\put(-46,-20){\vector(0,1){15}}  
%\put(56,-15){\vector(0,1){10}}  
\put(-155,-10){\vector(0,1){5}}  
\put( 0,-40){\makebox(0,0)[tr]{\footnotesize{{\tt 0000000000000}\ldots{\tt{00000000000}}}}}
\put(-15,-35){\makebox(0,0)[tr]{\footnotesize{{\tt 0001000000000}\ldots{\tt{00000000000}}}}}
%\put(26,-30){\makebox(0,0)[tl]{\footnotesize{{\tt 0000001000000}\ldots{\tt{00000010000}}}}}
\put(-36,-25){\makebox(0,0)[tr]{\footnotesize{{\tt 0100000001000}\ldots{\tt{00010000000}}}}}
\put(-46,-20){\makebox(0,0)[tr]{\footnotesize{{\tt 0000100000010}\ldots{\tt{00001000010}}}}}
%\put(56,-15){\makebox(0,0)[tl]{\footnotesize{{\tt 0100001000100}\ldots{\tt{00010100100}}}}}
\put(-155,-10){\makebox(0,0)[tl]{\footnotesize{{\tt 1111111111110}\ldots{\tt{11111110111}}}}}
\end{picture}
%
%
%
%

\end{center}
}{%
\caption[a]{Schematic diagram showing all strings
 in the ensemble $X^{N}$
% with $p_0 = 0.9, p_1=0.1$
% of large length $N$
 ranked by their probability, and
 the typical set $T_{N\b}$.}
\label{fig.typical.set.explain}
}%
\end{figure}


\section{Proofs}
 This section may be skipped if found tough going.


\subsection{The law of large numbers}
 Our proof of the source coding theorem  uses  the law of 
 large numbers.
\begin{description}
% \item[A random variable $u$] is any real function of $x$, 
\item[Mean and variance] of a real random variable%
%\footnote
\marginpar{\footnotesize
 Technical note: 
	strictly I am assuming here that $u$ is a function $u(x)$ of a
	sample $x$ from a finite discrete ensemble $X$. Then the
	summations $\sum_u P(u) f(u)$ should be written $\sum_x P(x)
	f(u(x))$.  This means that $P(u)$ is a finite sum of delta
	functions.  This restriction guarantees that the mean and
	variance of $u$ do exist, which is not necessarily the case for general
	$P(u)$.  } 
 are $\Exp[u] = \bar{u} = \sum_u P(u) u$ and $\var(u) =
	\sigma^2_u = \Exp[(u-\bar{u})^2] = \sum_u P(u) (u - \bar{u})^2.$

\item[Chebyshev's inequality 1.]
	Let $t$ be a non-negative real random variable, and\index{Chebyshev inequality} 
 let $\a$ be a positive real number.  Then\index{inequality}
\beq
	P(t \geq \a) \:\leq\: \frac{\bar{t}}{\a}.
\label{eq.cheb.1}
\eeq

	{\sf Proof:} $P(t \geq \a) = \sum_{t \geq \a} P(t)$. 
 We multiply each 
 term by $t/\a \geq 1$ and obtain: 
 $P(t \geq \a) \leq \sum_{t \geq \a} P(t) t/\a.$
 We add the (non-negative) missing terms and obtain:
 $P(t \geq \a) \leq \sum_{t} P(t) t/\a = \bar{t}/\a$.

\item[Chebyshev's inequality 2.]
	Let $x$ be a random variable, and let $\a$ be a positive real number.
 Then
\beq
	P\left( (x-\bar{x})^2 \geq \a \right) \:\leq\: \sigma^2_x / \a.
\eeq

{\sf Proof:} Take $t = (x-\bar{x})^2$ and apply the previous proposition.

\item[Weak law of large numbers.]
	Take $x$ to be the average of $N$ independent random variables 
 $h_1, \ldots , h_N$, having common mean $\bar{h}$ and common variance  
 $\sigma^2_h$: $x = \frac{1}{N} \sum_{n=1}^N h_n$. Then 
\beq
	P( (x-\bar{h})^2 \geq \a ) \leq \sigma^2_h/\a N.
\eeq

{\sf Proof:} obtained by showing that $\bar{x}=\bar{h}$ and that 
 $\sigma^2_x = \sigma^2_h/ N$.

\end{description}
 We are interested in $x$ being very close to the mean ($\a$ very small).
 No matter how large $\sigma^2_h$ is, and no matter how small the
 required $\a$ is, and no matter how small the desired probability of
 $(x-\bar{h})^2 \geq \a$, we can always achieve it by
 taking $N$ large enough.

\subsection{Proof of theorem \protect\ref{thm.sct} (\pref{thm.sct})}
% the source coding theorem}
% or could say theorem 1
 We apply the law of large numbers to the random variable $\frac{1}{N}
 \log_2 \frac{1}{P(\bx)}$ defined for $\bx$ drawn from the ensemble $X^N$. 
 This random variable can be written as the average of $N$ information
 contents 
 $h_n = \log_2 ( 1 / P(x_n))$, each of which is a random variable with 
 mean $H = H(X)$ and variance $\sigma^2 \equiv \var[ \log_2 ( 1 / P(x_n)) ]$.
 (Each  term $h_n$
 is in fact the Shannon information content of the $n$th
 outcome.)

 We again define the typical set with parameters $N$ and $\beta$ thus: 
\beq
	T_{N\b} = \left\{ \bx\in\A_X^N : 
	\left[ \frac{1}{N} \log_2 \frac{1}{P(\bx)} - H \right]^2 < \b^2 
	\right\} .
\label{eq.TNb.2}
\eeq
 For all $\bx \in T_{N\b}$, the probability of $\bx$ satisfies
\beq
2^{-N(H+\b)} < P(\bx) < 2^{-N(H-\b)}.
\eeq
 And by the law of large numbers, 
\beq
	P(\bx \in T_{N\b}) \geq 1 - \frac{\sigma^2}{\b^2 N} .
\eeq
 We have thus proved the \aep. As $N$ increases, the probability
 that $\bx$ falls in  $T_{N\b}$ approaches 1, for any $\beta$.
 How does this result relate to source coding?

%	We will prove the \aep\ first; then w
 We must relate $T_{N\b}$ to $H_{\delta}(X^N)$.
 We will
 show that for any given $\delta$ there is
	a sufficiently big $N$ such that
	$H_{\delta}(X^N) \simeq N H$.


\subsubsection{Part 1:  $\frac{1}{N} H_{\delta}(X^N) <  H + 
	\epsilon$.}
% of the source coding theorem.
%
% More words here reminding what H_delta is
%
 The set $T_{N\b}$ is not the best subset for  compression. So the
 size of $T_{N\b}$ gives an upper bound on $H_{\delta}$.
 We show how {\em small} $H_{\delta}(X^N)$ must be by calculating
% the  largest cardinality that $T_{N\b}$ could have.
 how big  $T_{N\b}$  could possibly be.
 We are
 free to set $\beta$ to any convenient value.
 The smallest possible 
 probability that a member of $T_{N\b}$ can have is  $2^{-N(H+\b)}$, and 
 the  total probability that $T_{N\b}$ contains can't be any bigger
 than 1. So 
\beq
	|T_{N\b}|  \,  2^{-N(H+\b)}  < 1 ,
\eeq
 that is, the size of the typical set is bounded by
% so we can bound
\beq
	|T_{N\b}| < 2^{N(H+\b)} . 
\eeq
 If we set $\b = \epsilon$ and $N_0$ such that
 $\frac{\sigma^2}{\epsilon^2 N_0} \leq \delta$, then $P(T_{N\b}) \geq
 1 - \delta$ for all $N \geq N_0$,
 and the set $T_{N\b}$ becomes a witness to the fact that
 $H_{\delta}(X^N) \leq \log_2 | T_{N\b} | < N ( H + \epsilon)$.
%
\amarginfig{b}{
{\footnotesize
\setlength{\unitlength}{1.2mm}
\begin{picture}(40,40)(-5,0)
\put(5,5){\makebox(0,0)[bl]{\psfig{figure=figs/gallager/Hdeltaconcept.eps,width=36mm}}}
\put(5,35){\makebox(0,0){$\frac{1}{N} H_{\delta}(X^N)$}}
\put(5,27){\makebox(0,0)[r]{$H_0(X)$}}
\put(5,4){\makebox(0,0)[t]{$0$}}
\put(30,4){\makebox(0,0)[t]{$1$}}
\put(35,4){\makebox(0,0)[t]{$\delta$}}
\put(33,11){\makebox(0,0)[l]{$H-\epsilon$}}
\put(33,15){\makebox(0,0)[l]{$H$}}
\put(33,19){\makebox(0,0)[l]{$H+\epsilon$}}
\end{picture}
}
\caption[a]{Schematic illustration of the two parts of the theorem.
 Given any $\delta$ and $\epsilon$, we show that
 for large enough $N$, $\frac{1}{N} H_{\delta}(X^N)$
 lies (1) below the line 
 $H+\epsilon$ and (2) above the line $H-\epsilon$.}
\label{fig.Hd.schem}
}
\subsubsection{Part 2: $\frac{1}{N} H_{\delta}(X^N) > 
	H - \epsilon$.}
% of the source coding theorem.} 

%
% needs work ,sanjoy says: 
%
% (jan 99)_
%
 Imagine that someone claims this second part is not so -- that,
 for any $N$, the 
 smallest $\delta$-sufficient subset $S_{\delta}$ is smaller than the above
 inequality would allow.
% They claim that 
% $|S_{}| \leq 2^{N(H-\epsilon)}$   and $P(\bx \in S_{})
% \geq 1 - \delta$.
 We can   make use of our typical set to show that they must be mistaken.
 Remember that we are free to set $\beta$ to any value we choose.
 We will set $\beta = \epsilon/2$, so that our task is to 
 prove that  a 
% that an alternative {\em smaller\/}
 subset $S'_{}$ having 
 $|S'_{}| \leq 2^{N(H-2\beta)}$ and achieving $P(\bx \in S'_{}) \geq 1 - \delta$
 cannot exist (for $N$ greater than an $N_0$ that we will specify).
%(We attach the
% prime to $S$ to denote the fact that this is a conjectured smallest subset.)

 So, let us consider the probability of falling in this rival smaller subset $S'_{}$.
 The probability of the subset $S'_{}$ is\marginpar[t]{%
\begin{center}
\raisebox{-0.5in}[0in][0in]{
%%%%%%%% written by hand   Sun 22/12/02
%
% Venn picture
%
%
\setlength{\unitlength}{0.321pt}%
{\begin{picture}(452,215)(-173,-132)% 
% axis labels
\put(-100,39){\makebox(0,0)[r]{\small$T_{N\b}$}}
\put(100,39){\makebox(0,0)[l]{\small$S'$}}
\thinlines
\put(-33,-1){\circle{126}}
\thicklines
\put(33,-1){\circle{126}}
\thinlines
\put(18,-85){\vector(-1,4){18}}
\put(33,-90){\makebox(0,0)[t]{\small$ S'_{} \cap T_{N\b} $}}
\put(105,-51){\vector(-1,1){40}}
\put(112,-39){\makebox(0,0)[tl]{\small$ S'_{} \cap \overline{T_{N\b}} $}}
\end{picture}}
%
%
%
%

\end{center}}
\beq
	P(\bx \in S'_{} \cap T_{N\b}) + 
 P(\bx \in S'_{} \cap \overline{T_{N\b}}),
\eeq
 where $\overline{T_{N\b}}$ denotes 
 the complement $\{ \bx \not \in T_{N\b}\}$.
 The maximum value of the first term is found if
 $S'_{} \cap T_{N\b} $ contains
 $2^{N(H-2\beta)}$ outcomes all with the maximum probability,   
 $2^{-N(H-\beta)}$. The maximum value  the second term can have is 
 $P( \bx \not \in T_{N\b})$. So: 
\beq
	P(\bx \in S'_{}) \, \leq  \, 2^{N(H-2\beta)}
                \, 2^{-N(H-\beta)}
      + \frac{\sigma^2}{\b^2 N} 
	= 2^{-N \b} + \frac{\sigma^2}{\b^2 N} .
\eeq
 We can now set $\b = \epsilon/2$ and $N_0$ such that $P(\bx \in S'_{}) < 1-
 \delta$, which shows that $S'$ cannot satisfy the definition of
 a sufficient subset $S_{\delta}$.
 Thus {\em any\/} subset $S'$ with size
 $|S'| \leq 2^{N(H-\epsilon)}$ has probability less than $1-\delta$, so
 by the definition of $H_\delta$, $H_{\delta}(X^N) > N ( H - \epsilon)$.

% this sentence used to be below at
% hereherehere
 Thus for large enough $N$, 
 the function
 $\frac{1}{N} H_{\delta}(X^N)$ is essentially a constant function of $\delta$,
 for $0 < \delta < 1$,
 as  illustrated in figures \ref{fig.hd.10.1010}
 and \ref{fig.Hd.schem}. \hfill $\Box$


\section{Comments}
 The source coding theorem  (\pref{thm.sct}) has two parts,
	$\frac{1}{N} H_{\delta}(X^N)  < H + \epsilon$, 
 and
 $\frac{1}{N} H_{\delta}(X^N) > 
	H - \epsilon$.
% $H  -\frac{1}{N} H_{\delta}(X^N)< \epsilon$.
 Both results  are interesting. 

 The first part tells us that even if the probability of
 error $\delta$ is extremely small, 
 the
% average
 number of bits per symbol
 $\frac{1}{N} H_{\delta}(X^N)$ needed to specify a long $N$-symbol 
 string $\bx$ with vanishingly 
 small error probability does not 
 have to exceed $H+ \epsilon$ bits. 
 We  need to have only a tiny tolerance for error, and the number of bits 
 required drops significantly from $H_0(X)$ to $(H + \epsilon)$. 

 What happens if we are yet more tolerant to compression errors? Part
 2 tells us that even if $\delta$ is very close to 1, so that  errors
  are made most of the time, the average number of bits per symbol needed to
 specify $\bx$ must  still  be at least $H - \epsilon$ bits. These two
 extremes tell us that regardless of our specific allowance for error,
 the number of bits per symbol needed to specify $\bx$ is
% boils down to
 $H$ bits; no more and no less. 
\medskip

% hereherehere

%In section 2.4.2 `$\epsilon$ can decrease with increasing $N$'. I'd prefer
%something like $N$ increases with decreasing $\epsilon$', since $N$ 
%depends on $\epsilon$ and not vice versa -- if I got it right.
% caution warning
\subsection{Caveat regarding `asymptotic equipartition'}
\label{sec.aep.caveat}
 \index{warnings|see{caution}}\index{caveats|see{caution}}\index{caution!equipartition}I
 put the words `asymptotic equipartition' in quotes because 
 it is important not to\index{asymptotic equipartition!why it is a misleading term}
% be misled into
 think that the 
 elements of the typical set $T_{N\beta}$
 really do have roughly the same 
 probability as each other. They are  similar in probability only
 in the sense that their values of $\log_2 \frac{1}{P(\bx)}$ are 
 within $2 N \beta$ of each other. Now, as $\beta$ is decreased,
 how does $N$ have to increase, if we are to keep our bound on the
 mass of the typical set, 
 $P(\bx \in T_{N\beta}) \geq 1 - \frac{\sigma^2}{\beta^2 N}$, constant?
% CHANGED 9802:
% Since $\beta$ can decrease
%scales
% with increasing
 $N$ must grow as $1/ \beta^2$, so, if we write
 $\beta$ in terms of 
 $N$ as $\alpha/\sqrt{N}$, for some constant $\alpha$, then
 the  most probable string in the typical set will be of order 
 $2^{\alpha \sqrt{N}}$ times greater than the least probable string in the 
 typical set. As $\beta$ decreases, $N$ increases,
 and this ratio $2^{\alpha \sqrt{N}}$ grows exponentially.
 Thus we  have `equipartition'  only in a weak sense!
% relative

%\section{Summary and overview}
%\section{Where next}
% We have established that the entropy $H(X)$ measures
% the average information content of an ensemble.
%%
% In this chapter we discussed a lossy {block}-compression scheme that 
% used large blocks of fixed size.
% In the next chapter we  discuss variable length compression schemes that are
% practical for small block sizes and that are not lossy.
%%
%
\section{Exercises}
% weighing problems in here
% ITPRNN Problem 1a
%
\subsection*{Weighing problems}
%
\exercisxB{1}{ex.weighexplain}{
 While some people, when  they first  encounter   
 the
 weighing problem with  12 balls and the three-outcome balance (\exerciseref{ex.weigh}),
 think that weighing six balls against six balls is a good first weighing,
 others say `no, weighing six against six conveys {\em no\/} information
 at all'.  Explain to the second group why they are right and why they
 are wrong.  Compute the information gained about {\em  which is the
 odd ball\/}, and the information gained about {\em  which is the
 odd ball and whether it is heavy or light}.
}
\exercissxB{2}{ex.binaryweigh}{
 You are given 16 balls, all of which are equal in weight except for
 one that is either heavier or lighter. You are also given a bizarre
 two-pan balance that can  report only two outcomes: `the two sides balance'
 or `the two sides do not balance'.
 Design a
 strategy to determine which is the odd ball {in as few uses of the balance
 as possible}.
}
\exercisxB{2}{ex.flourforty}{
	You have a two-pan balance; your job is to weigh
 out bags of flour with integer weights  1 to 40 pounds inclusive.
 How many weights do you need? [You are allowed
 to put  weights on either pan. You're only allowed to
 put one flour bag on the balance at a time.]
}
\exercissxC{4}{ex.twelve.generalize.weigh}{ 
\ben
\item% {ex.weigh}
 Is it possible to solve  \exerciseref{ex.weigh}
 (the
 weighing problem with  12 balls and the three-outcome balance)
 using a sequence of three {\em fixed\/} weighings, such that the
 balls chosen for the second weighing do not depend on the outcome of the first, and
 the third weighing does not depend on the first or second?
\item
 Find a  solution to the general weighing problem in which exactly one of  $N$
 balls is odd.
 Show that in $W$ weighings, an odd ball can be identified from among 
$N = (3^W - 3 )/2$ balls.
%How large can $N$ be if you are allowed $W$ weighings? 
% How are the weighings arranged in the case of the largest $N$? 
\een
}
\exercissxC{3}{ex.twelve.two.weigh}{ 
 You are given 12 balls and the three-outcome balance 
 of \exerciseonlyref{ex.weigh}; this time, {\em two} of the balls are odd;
 each odd ball may be heavy or light, and we don't know which.
 We want to identify the odd balls and in which direction they are odd.
\ben
\item
 {\em Estimate\/} how many weighings are required by the optimal strategy.
 And what if there are three odd balls?
%\item
% How do your answers change if it is known in advance that 
% the odd balls will all have the same bias (all heavy, or all light)?
\item
 How do your answers change if it is known that all the regular balls
 weigh 100\grams, that light balls weigh 99\grams, and heavy ones
 weigh 110\grams?
\een
}

% end weighing
\subsection*{Source coding with a lossy compressor, with loss $\delta$}
\exercisxB{2}{ex.Hd46}{
 Let ${\cal P}_X = \{ 0.4,0.6 \}$. Sketch $\frac{1}{N} H_{\delta}(X^N)$
 as a function of $\delta$ for $N=1,2$ and 100.
}
\exercisxB{2}{ex.Hd55}{
 Let ${\cal P}_Y = \{ 0.5,0.5 \}$. Sketch $\frac{1}{N} H_{\delta}(Y^N)$
 as a function of $\delta$ for $N=1,2,3$ and 100.
}
\exercissxB{2}{ex.HdSB}{ 
% (For Physicists)
 Discuss the 
 relationship
% similarities
 between the proof of the \aep\ and the  equivalence 
 (for large systems) of the Boltzmann entropy and the Gibbs entropy.}
\subsection*{Distributions that don't obey the law of large numbers}
%
% Cauchy distbn here? 
 The law of large numbers, which we used in this chapter, 
 shows that the mean  of a set of $N$ i.i.d.\ random variables 
 has a probability distribution that becomes 
% more concentrated
 narrower, with width $\propto 1/\sqrt{N}$, as $N$ increases. 
 However, we have proved this property only for 
 discrete random variables,  that is, for real numbers 
 taking on a {\em finite\/} set of possible values. 
 While many random variables
 with continuous probability distributions also satisfy the 
 law of large numbers, there are important distributions that 
 do not. Some continuous distributions do not have 
 a mean or variance. 
\exercisxB{3}{ex.cauchy}{
 Sketch the \ind{Cauchy distribution}
\beq
	P(x) = \frac{1}{Z} \frac{1}{x^2 + 1} , \:\:\:\: x \in (-\infty,\infty).
\eeq
 What is its normalizing constant $Z$? Can you evaluate
 its mean or variance?

 Consider the sum $z=x_1 + x_2$, where $x_1$ and $x_2$ are independent 
 random variables from a Cauchy 
 distribution. What is $P(z)$? What is the probability 
 distribution of the mean of $x_1$ and $x_2$, $\bar{x}=(x_1+x_2)/2$?
 What is the 
 probability
 distribution of the mean of $N$ samples from this \ind{Cauchy distribution}? 
}
%
\subsection{Other asymptotic properties}
% Levy flights too?
\exercisxC{3}{ex.chernoff}{ {\sf\ind{Chernoff bound}.}
 We derived the weak law of large numbers from Chebyshev's inequality\index{Chebyshev inequality}
 (\ref{eq.cheb.1}) by letting the random variable $t$
 in the inequality
$%\beq
	P(t \geq \a) \:\leq\: \bar{t}/\a
%\label{eq.cheb.1a}
$
 be a function, $t = (x-\bar{x})^2$,
 of the random variable $x$ we were interested in.

 Other useful inequalities can be obtained by using other
 functions. The \ind{Chernoff bound}, which is useful\index{bound}
 for bounding the \ind{tail}s of a distribution, is obtained by
 letting $t = \exp( s x)$.

 Show that
\beq
	P( x \geq a ) \leq e^{-sa} g(s) , \:\:\:\mbox{ for any $s>0$ }
\eeq
 and 
\beq
	P( x \leq a ) \leq e^{-sa} g(s) , \:\:\:\mbox{ for any $s<0$ }
\eeq
 where $g(s)$ is the moment-generating function of $x$,
\beq
	g(s) = \sum_x  P(x) e^{sx} .
\eeq
%
% Hence show that if $z$ is a sum of $N$ random variables $x$,
%\beq
%	P( z \geq a ) \leq  
%\eeq
}
% end






%
\subsection*{Curious functions related to $p \log 1/p$}
\exercisxE{4}{ex.fxxxxx}{
 This exercise has {no purpose at all}; it's  included
 for the enjoyment of those who like mathematical curiosities.

 Sketch the function
\beq
	f(x) = x^{x^{x^{x^{x^{\cdot^{\cdot^{\cdot}}}}}}} 
%	f(x) = x^{x^{x^{x^{x^{\ddots}}}}} 
\eeq
 for $x \geq 0$.
% To be explicit about the order in which the powers are evaluated, 
% here's another definition of $f$:
%\beq
%	f(x) = x^{\left(x^{\left(x^{\cdot^{\cdot^{\cdot}}}\right)}\right)}
%\eeq
 {\sf Hint:}
 Work out the inverse function to $f$ -- that is, the function $g(y)$
 such that if $x=g(y)$ then $y=f(x)$ --  it's closely related to
 $p \log 1/p$.
% {\sf Hints:}
%\ben
%\item Consider $f(\sqrt{2})$:
% you might be able to persuade yourself
% that $f(\sqrt{2})=2$. You might also be able
% to persuade yourself that  $f(\sqrt{2})=4$. What's going on?
% [Yes, a two-valued function.]
%\item
% For a given  $x$, if $f(x)=y$, then we have $y = x^{y}$, so
% $y$ is found at the intersection of the curves $u_1(y)=x^y$ and $u_2(y)=y$.
%\item
% Work out the inverse function to $f$ -- that is, the function $g(y)$
% such that if $x=g(y)$ then $y=f(x)$ -- hint: it's closely related to
% $p \log 1/p$.
%\een
}



\dvips
%\chapter{The Source Coding Theorem (old version of this Chapter)}
%\label{ch.two.old}
%\input{tex/_l2old.tex}
%\dvips
\subchapter{Solutions to Chapter \protect\ref{ch.two}'s exercises} 
\fakesection{_s2}
% chapter 2
% ex 39...
%
\soln{ex.ascii}{
 An ASCII file can be reduced to 7/8 of its original size. This reduction 
 could be achieved by a  block code that maps 8-byte blocks 
 into 7-byte blocks by copying the
% . The mapping would copy
 56 information-carrying bits  into 
 7 bytes.  
}
\soln{ex.compress.possible}{
% Theorem:
%  No program can compress without loss *all* files of size >= N bits, for
%  any given integer N >= 0.
%
%Proof:
%  Assume that the program can compress without loss all files of size >= N
%  bits.  Compress with this program all the 2^N files which have exactly N
%  bits.  All compressed files have at most N-1 bits, so there are at most
%  (2^N)-1 different compressed files [2^(N-1) files of size N-1, 2^(N-2) of
%  size N-2, and so on, down to 1 file of size 0]. So at least two different
%  input files must compress to the same output file. Hence the compression
%  program cannot be lossless.
%
%The proof is called the "counting argument". It uses the so-called
 The pigeon-hole
principle states: you can't put 16 pigeons into 15 holes without using one of the
holes twice.

 Similarly, you can't give $|\A_X|$ outcomes unique 
 binary names of some length $l$
 shorter than $\log_2 |\A_X|$ bits, because there are only $2^l$
 such binary names, and $l < \log_2 |\A_X|$ implies $2^l <  |\A_X|$,
 so at least two different inputs to the compressor would compress to
 the same output file.
}
\soln{ex.cusps}{
 Between the cusps, all the changes in 
 probability are equal, and the number of elements 
 in $T$ changes by one at each step. So $H_{\delta}$ 
 varies logarithmically with $(-\delta)$.
% NEEDS WORK!
}
\soln{ex.binaryweigh}{
 Going by the rule of thumb that the most efficient strategy is the
 most informative strategy, in the sense of having all possible
 outcomes as near as possible to equiprobable, we want the first
 weighing to have outcomes `the two sides balance' in eight cases and
 `the two sides do not balance' in eight cases.  This is achieved by
 initially weighing 1,2,3,4 against 5,6,7,8, leaving the other eight
 balls aside. Iterating this binary division of the
 possibilities, we arrive at a strategy requiring 4 weighings.

 The above strategy for designing a sequence of binary 
 experiments by constructing a binary tree  from the top down 
 is actually not always optimal; the optimal 
 method of constructing a binary tree will be explained  in the 
 next chapter. 
}
%
% Another solution from Conway:
% Label them
% F AM NOT LICKED
% then use these divisions
% MA DO   LIKE
% ME TO   FIND
% FAKE    COIN
%
%\soln{ex.twelve.generalize.weigh}{
% Thu, 28 Jan 1999 19:19:30 -0500 (EST)
% From:
% 
\begin{Sexercise}{ex.twelve.generalize.weigh}
 This solution was  found by Dyson and Lyness in 1946
 and presented in the following elegant form  by
 {John Conway}\index{Conway, John} in 1999.\footnote{Posting to  
 {\tt{geometry-puzzles@forum.swarthmore.edu}}
 Thu, 28 Jan 1999.
}
%
 Be warned: the symbols A, B, and C are used to  name the
 balls, to name the pans of the balance, 
 to name the outcomes, and to name
 the possible states of the odd ball!
\ben%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% enumerate 1
\item
   Label the 12 balls by the sequences
%
% verbatim not allowed in the argument of a command
%
{\small
\begin{verbatim}
   AAB  ABA  ABB  ABC  BBC  BCA  BCB  BCC  CAA  CAB  CAC  CCA
\end{verbatim}
}
and in the
{\small
\begin{verbatim}
1st               AAB ABA ABB ABC           BBC BCA BCB BCC
2nd weighings put AAB CAA CAB CAC in pan A, ABA ABB ABC BBC in pan B.
3rd               ABA BCA CAA CCA           AAB ABB BCB CAB
\end{verbatim}
}
 Now in a given weighing, a pan will either end up in the
\bit
\item
   {\tt C}anonical position ({\tt C}) that it assumes when the pans are balanced, or
\item
   {\tt A}bove that position ({\tt A}), or
\item
   {\tt B}elow it ({\tt B}),
\eit
 so the weighings determine a sequence of three of these letters.

   If this sequence is {\tt CCC}, then there's no odd ball.  Otherwise,
for {\em just one\/} of the two pans, the sequence is among the 12 above,
and names the odd ball, whose weight is {\tt A}bove or {\tt B}elow the proper
one according as the pan is  {\tt A}  or  {\tt B}.
\item

 In $W$  weighings the odd ball can be identified from
 among
\beq
 N = (3^W - 3 )/2
\eeq
 balls in the same way, by labelling them with all
 the non-constant sequences of  $W$  letters from  {\tt A}, {\tt B}, {\tt C}  whose
 first change is  A--to--B  or  B--to--C  or  C--to--A, and at the
 $w$th weighing putting those whose  $w$th  letter is  {\tt A}  in pan {\tt A} 
 and those whose  $w$th  letter is  {\tt B}  in pan {\tt B}.
\een
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%}
\end{Sexercise}
\begincuttable
\soln{ex.twelve.two.weigh}{
\ben
\item
 A sloppy answer to this question counts the number of possible
 states, ${{12}\choose{2}} 2^2 = 264$, and takes its base 3 logarithm, 
 which is 5.07, which exceeds 5.
 We might estimate that six weighings
 suffice to find the state of the two odd balls among 12.  If there 
 are three odd balls then there are  ${{12}\choose{3}} 2^3 = 1760$
  states, whose logarithm is 6.80, so seven weighings might 
 be estimated to suffice.

  However, these answers neglect the possibility
 that we will learn something more from our experiments than 
 just which are the odd balls.
  Let us define the oddness of an odd ball to be the absolute 
 value of the difference between its weight and the regular weight.
 There is a good chance that we will 
 also learn something about the relative oddnesses 
 of the two odd balls.
%
% If, say, balls A and B are both heavy, 
% and A is heavier than B,
% there  is a good chance that the optimal weighing strategy 
% will at some point put ball A on one side of the balance 
% and ball B on the other, along with a load of regular balls;
% the outcome of this weighing 
% reveals, at the end of the day, that A was heavier than B, which 
% is not something we were asked to find out.  From the
 If balls $m$ and $n$ are the odd balls,
 there  is a good chance that the optimal weighing strategy 
 will at some point put ball $m$ on one side of the balance 
 and ball $n$  on the other, along with a load of regular balls;
 if $m$ and $n$  are both heavy balls, say,
 the outcome of this weighing will
% allow us to deduce
 reveal, at the end of the day, whether $m$ was heavier than $n$, or lighter,
 or the same, which 
 is not something we were asked to find out.  From the
 point of view of the task, finding the relative oddnesses 
 of the two balls  is a waste of experimental capacity.

 A more careful estimate takes this annoying possibility into account.

 In the case of two odd balls,
 a complete description of the balls, including a ranking of their 
 oddnesses,  has three times as many states as we counted above (the
 two odd balls could be odd by the same amount, or by amounts
 that differ), \ie,
 $264\times 3 = 792$ outcomes, whose logarithm is 6.07.
 Thus to identify the {\em full\/} state 
 of the system in 6 weighings is impossible --- at least seven are needed.
 I don't know whether the original
 problem can be solved in 6 weighings.
%with a strategy that
% sometimes avoids finding the ranking of the oddnesses. 

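 In the case of three odd balls, there are $3!=6$ possible rankings
 of the oddnesses if the oddnesses are different (\eg, 
 $0 < \epsilon_1 < \epsilon_2 < \epsilon_3$).
% NOTE(review): the text that originally followed the first `<' above
% (the rest of this solution and the opening of the next one) was lost
% in extraction; the inequality shown is a reconstruction -- confirm
% against the original source.
% gnuplot> set size 0.6,0.6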
% gnuplot>  set output 'figs/hd/all.1.100.ps'
% gnuplot> plot 'figs/1' u 5:6 t 'N=1' w l, 'figs/2' u 5:6 t 'N=2' w l, 'figs/100' u 5:6 t 'N=100' w l
 The curves $\frac{1}{N} H_{\delta}(X^N)$
 as a function of $\delta$ for $N=1,2$ and 100 are shown in \figref{fig.hd.1.100}.
% and table \ref{tab.Hdelta.0.4}.
 Note that $H_2(0.4) = 0.971$ bits.
\begin{figure}[htbp]
%\figuremargin{%
\figuredanglenudge{%
\begin{center}
\begin{tabular}[t]{rl}
\begin{tabular}[t]{l}\vspace{0in}\\% alignment hack
\mbox{\psfig{figure=Hdelta/figs/hd/all.1.100.ps,%
width=60mm,angle=-90}}
\end{tabular}
%
\hspace{0in}
&
%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{tabular}[t]{r@{--}lcc} \toprule
\multicolumn{4}{c}{$N=1$} \\ \midrule
%    delta          1/N Hdelta        2^{Hdelta}
\multicolumn{2}{c}{$\delta$} & $\frac{1}{N} H_{\delta}(\bX)$ & $2^{H_{\delta}(\bX)}$ 
% raise the roof!
% {\rule[-3mm]{0pt}{8mm}}
\\ \midrule
0    &    0.4 &           1   &         2           \\
0.4  &      1 &           0   &         1           \\ \bottomrule
\end{tabular}
\hspace{0.1in} 
\begin{tabular}[t]{r@{--}lcc} \toprule% {r@{--}lcc}
\multicolumn{4}{c}{$N=2$} \\  \midrule
%    delta          1/N Hdelta        2^{Hdelta}
\multicolumn{2}{c}{$\delta$} & $\frac{1}{N} H_{\delta}(\bX)$ & $2^{H_{\delta}(\bX)}$ 
% raise the roof!
% {\rule[-3mm]{0pt}{8mm}}
\\ \midrule 
0 &         0.16  &           1  &           4            \\
0.16 &      0.4   &    0.79248   &          3            \\
0.4 &       0.64  &         0.5  &           2            \\
0.64 &         1  &           0  &           1            \\ \bottomrule
\end{tabular}\\
\end{tabular}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\end{center}
}{%
\caption[a]{$\frac{1}{N} H_{\delta}(\bX)$ (vertical axis) against $\delta$ (horizontal), 
	for  $N=1, 2, 100$ binary variables with $p_1=0.4$.}
\label{fig.hd.1.100}
\label{tab.Hdelta.0.4}
}{0.25in}
\end{figure}
%\begin{table}[htbp]
%\figuremargin{%
%\begin{center}
%\end{center}
%}{%
%\caption[a]{Values of $\frac{1}{N} H_{\delta}(\bX)$  against $\delta$.}
%% add 0.4 to this caption
%\label{tab.Hdelta.0.4}
%}
%\end{table}
%
}
\soln{ex.Hd55}{% ex 43 
%  hd.p p=0.5 mmin=1 mmax=3 mstep=1 scale_by_n=1 plot_sub_graphs=1 | gnuplot 
%  hd.p p=0.5 mmin=100 mmax=100 mstep=1 suppress_early_detail=1 scale_by_n=1 plot_sub_graphs=1 | gnuplot 
% plot 'figs/1' u 5:6 t 'N=1' w l,'figs/2' u 5:6 t 'N=2' w l,'figs/3' u 5:6 t 'N=3' w l, 'figs/100' u 5:6 t 'N=100' w l
% gnuplot> set term postscript
% Terminal type set to 'postscript'
% Options are 'landscape monochrome dashed "Helvetica" 14'
% gnuplot> set output 'figs/hd/1.2.3.100.ps
% gnuplot> replot
 The curves $\frac{1}{N} H_{\delta}(Y^N)$
 as a function of $\delta$ for $N=1,2,3$ and 100 are shown in \figref{fig.hd.1.2.3.100}.
% and table \ref{tab.Hdelta.0.5}.
 Note that $H_2(0.5) = 1$ bit.
\begin{figure}[htbp]
\figuredangle{%
\begin{center}
\mbox{%
\begin{tabular}[t]{r}\vspace{0in}\\% alignment hack
\mbox{\psfig{figure=Hdelta/figs/hd/1.2.3.100.ps,%
width=60mm,angle=-90}}\end{tabular}
%
\hspace{0in} 
%%%%%%%%%%%%%%%%%%%
\begin{tabular}[t]{r@{--}lcc} \toprule % {r@{--}lcc}  \midrule
\multicolumn{4}{c}{$N=2$} \\ \midrule
%    delta          1/N Hdelta        2^{Hdelta} 
\multicolumn{2}{c}{$\delta$} & $\frac{1}{N} H_{\delta}(\bY)$ & $2^{H_{\delta}(\bY)}$ 
% raise the roof!
%{\rule[-3mm]{0pt}{8mm}}
\\ \midrule
%
0 &         0.25  &           1  &           4            \\
0.25 &      0.5   &    0.79248   &          3            \\
0.5 &       0.75  &         0.5  &           2            \\
0.75 &         1  &           0  &           1            \\ \bottomrule
\end{tabular}
\hspace{0.1in} 
\begin{tabular}[t]{r@{--}lcc} \toprule
\multicolumn{4}{c}{$N=3$} \\ \midrule
%    delta          1/N Hdelta        2^{Hdelta}
\multicolumn{2}{c}{$\delta$} & $\frac{1}{N} H_{\delta}(\bY)$ & $2^{H_{\delta}(\bY)}$ 
% raise the roof!
%{\rule[-3mm]{0pt}{8mm}}
\\ \midrule
% 
0&      0.125 &           1   &         8           \\
0.125&   0.25 &     0.93578   &         7           \\
0.25 &  0.375 &     0.86165   &         6           \\
0.375 &   0.5 &     0.77398   &         5           \\
0.5  &  0.625 &     0.66667   &         4           \\
0.625 &  0.75 &     0.52832   &         3           \\
0.75 &  0.875 &     0.33333   &         2           \\
0.875 &     1 &           0   &         1           \\ \bottomrule
\end{tabular}
%%%%%%%%%%%%%%%%%%%%%%%%%
%
}
\end{center}
}{%
\caption[a]{$\frac{1}{N} H_{\delta}(\bY)$ (vertical axis) against $\delta$ (horizontal), 
	for  $N=1, 2, 3, 100$ binary variables with $p_1=0.5$.}
\label{fig.hd.1.2.3.100}
\label{tab.Hdelta.0.5}
}%
\end{figure}
%
%\begin{table}[htbp]
%\figuremargin{%
%\begin{center}
%\end{center}
%}{%
%\caption[a]{Values of $\frac{1}{N} H_{\delta}(\bY)$  against $\delta$.}
%% add 0.5 to this caption
%\label{tab.Hdelta.0.5}
%}
%\end{table}
}
\soln{ex.HdSB}{
 The Gibbs entropy is $\kB \sum_i p_i \log \frac{1}{p_i}$, where $i$
 runs over all states of the system. This entropy is equivalent  (apart from the factor of $\kB$) 
 to  the Shannon entropy of the ensemble. 

 Whereas the Gibbs entropy can be
 defined for any ensemble, the Boltzmann entropy is only
 defined for  {\dem microcanonical\/} ensembles, which
 have a probability distribution that is uniform over a
 set of accessible states.
 The Boltzmann entropy is defined to be $S_{\rm B} = \kB \log \Omega$
 where $\Omega$ is the number of accessible states 
 of the  microcanonical  ensemble. This is equivalent 
 (apart from the factor of $\kB$) to the perfect information content 
 $H_0$ of that constrained
 ensemble. The Gibbs entropy of a microcanonical
 ensemble is trivially equal to the Boltzmann entropy. 

 We now  consider a   \ind{thermal distribution} (the
 {\dem\ind{canonical}\/} ensemble),
 where the probability of a state  $\bx$ is 
\beq
%	P(\bx) =\frac{1}{Z} \exp( - \beta E(\bx) )? 
	P(\bx) =\frac{1}{Z} \exp\left( - \frac{ E(\bx) }{\kB T} \right) . 
\eeq
 With this canonical ensemble we can associate a
 corresponding microcanonical ensemble,
% typically
% usually
 an ensemble 
 with  total energy  fixed to the mean
 energy of the canonical ensemble
 (fixed to within some precision $\epsilon$).
% Recalling that under the 
% thermal distribution (the canonical ensemble) we see that
 Now, fixing the total energy to a precision $\epsilon$ is equivalent to 
 fixing the value of $\log 1/P(\bx)$ to within
% $\epsilon/\beta$.
 $\epsilon / (\kB T)$.
 Our definition of the typical set 
 $T_{N \beta}$ was precisely that it consisted of all elements that 
 have a  value of $\log P(\bx)$ very close to the mean value
 of $\log P(\bx)$ under the canonical ensemble, $- N H(X)$. 
 Thus the microcanonical ensemble is equivalent to 
 a uniform distribution over 
% constraining the state $\bx$ to be in 
 the typical set of the canonical ensemble. 

 Our proof of the \aep\  thus proves --- for the 
 case of a system whose energy is separable into a sum of independent
 terms --- that the 
 Boltzmann entropy of the microcanonical ensemble 
 is very close (for large $N$) to the Gibbs entropy of 
 the canonical ensemble, if the energy of the microcanonical
 ensemble is constrained to equal the mean energy of the 
 canonical ensemble.
}
\soln{ex.cauchy}{
 The normalizing constant of  the \ind{Cauchy distribution}
\[
	P(x) = \frac{1}{Z} \frac{1}{x^2 + 1} 
\]
 is
\beq
	Z = \int^{\infty}_{-\infty} dx \: \frac{1}{x^2 + 1}
  = \left[ {\tan}^{-1} x \right]^{\infty}_{-\infty} = \frac{\pi}{2} - \frac{-\pi}{2} = \pi .
\eeq
 The mean and variance of this distribution are both undefined. (The distribution
 is symmetrical about zero, but this does not imply that its mean is zero. The mean 
 is the value of a divergent integral.)
% ; depending what limiting procedure we 
%  define to evaluate this integral we 
 The sum $z=x_1 + x_2$, where $x_1$ and $x_2$ both 
 have Cauchy distributions, has probability density given by the convolution
\beq
 P(z) = \frac{1}{\pi^2} \int^{\infty}_{-\infty} dx_1 \:
	\frac{1}{x_1^2 + 1}
	\frac{1}{(z-x_1)^2 + 1}
 , 
\eeq
%  Introducing $\Delta \equiv x_1-x_2$ this can be written more symmetrically 
%  as
% \beq
%  P(z) = \frac{1}{\pi^2} \int^{\infty}_{-\infty} d \Delta \:
% \eeq
 which after a considerable labour using standard methods
%\footnote{Can anyone 
% give me an elegant solution?} 
 gives
\beq
	P(z) = \frac{1}{\pi^2} 2 \frac{\pi}{z^2+4} = \frac{2}{\pi}  \frac{1}{z^2+2^2} ,
\label{eq.cauchysum}
\eeq
 which we recognize as a Cauchy distribution with width parameter 2
 (where the original distribution has width parameter 1).
 This implies that the mean of the two points, $\bar{x} = (x_1+x_2)/2 = z/2$, 
 has a Cauchy distribution with width parameter 1. Generalizing, the mean 
 of $N$ samples from a Cauchy distribution is Cauchy-distributed 
 with the {\em same parameters\/} as the individual samples. The probability 
 distribution of the mean does {\em not\/} become narrower 
 as $1/\sqrt{N}$. 

 {\em The central limit theorem does not apply to the \ind{Cauchy distribution}, 
 because it does not have a finite \ind{variance}.}

 An alternative neat method for getting to \eqref{eq.cauchysum} makes 
 use of the Fourier transform of the Cauchy distribution, which is 
 a biexponential $e^{-|\omega|}$. Convolution in real space 
 corresponds to multiplication in Fourier space,
 so the \ind{Fourier transform} of $P(z)$ is simply $e^{-|2 \omega|}$.
 Reversing the transform, we obtain \eqref{eq.cauchysum}.
}
%\begincuttable
\soln{ex.fxxxxx}{
\amarginfig{c}{
\begin{center}
\begin{tabular}{c}
\psfig{figure=gnu/fxxxxx50.ps,width=1.7in,angle=-90}\\
\psfig{figure=gnu/fxxxxx5.ps,width=1.7in,angle=-90}\\
\psfig{figure=gnu/fxxxxx.5.ps,width=1.7in,angle=-90}\\
\end{tabular}
\end{center}
%}{%                  gnu: load 'fxxxxx.gnu'
\caption[a]{
% The function
$\displaystyle
	f(x) = x_{\:,}^{x^{x^{x^{x^{\cdot^{\cdot^{\cdot}}}}}}} 
$ shown at three different scales.}
\label{fig.xxxxx}
}%
 The function $f(x)$
%\beq
%	f(x) = x^{x^{x^{x^{x^{\ddots}}}}} 
%\eeq
 has inverse function 
% to $f$ is
\beq
 g(y) = y^{1/y}. 
\eeq
 Note
\beq
	\log g(y) = \frac{1}{y} \log y .
\eeq
 I obtained a tentative graph of $f(x)$ by plotting $g(y)$ with
 $y$ along the vertical axis and $g(y)$ along the horizontal
 axis. The resulting  graph suggests that $f(x)$
 is single valued for $x \in (0,1)$, and looks surprisingly well-behaved
 and ordinary. For $x \in (1, e^{1/e})$, $f(x)$ is two-valued.
 $f(\sqrt{2})$ is  equal both to 2 and 4.
 For $x > e^{1/e}$ (which is about 1.44), $f(x)$ is infinite.
% undefined.
 However, it might be argued that this approach to sketching $f(x)$
 is  only partly valid, if we define $f$ as the  limit of the
 sequence of functions  $x$, 
 $x^x$, $x^{x^x}, \ldots$;
	 this sequence does not
 have a limit for
% , below
% pr (1.0/exp(1.0))**exp(1.0)
% 0.0659880358453126
 $0 \leq x \leq  (1/e)^e \simeq 0.07$
 on account of a pitchfork \ind{bifurcation} at $x=(1/e)^e$;
 and for $x \in (1,e^{1/e})$, the sequence's limit is single-valued --
 the lower of the two values sketched in the figure.
% load 'fxxxxx.gnu2'
%
}
%\endcuttable




\dvipsb{solutions source coding}
\prechapter{About     Chapter}
\fakesection{intro for chapter 3}
 In the last chapter, we saw a proof of the fundamental status of the entropy 
 as a measure of average information content.
 We defined a data compression scheme using
 {\em fixed length block codes}, and
 proved that as  $N$ increases,
 it is possible to encode $N$ i.i.d.\ variables 
 $\bx = (x_1,\ldots,x_N)$ into a block of $N(H(X)+\epsilon)$ bits
 with vanishing probability of error, whereas if we attempt to 
 encode $X^N$ into $N(H(X)-\epsilon)$ bits, the probability of 
 error is virtually 1.

        We thus verified the {\em possibility\/} of 
 data compression, but the block coding defined in the proof 
 did not  give a  practical algorithm. 
 In this chapter and the next,
 we  study practical data compression algorithms. 
 Whereas the last chapter's compression scheme
 used large blocks of {\em fixed\/} size and was
 {\em lossy}, in the next chapter we discuss
 {\em variable-length\/} compression schemes that are
 practical for small block sizes and that are {\em not lossy}.

 Imagine a rubber glove filled with water. If we compress two
 fingers of the glove, some other part of the glove has
 to expand, because
 the total volume of water is constant. [Water is essentially
 incompressible.] Similarly, when we shorten
 the codewords for some outcomes, there must be other
 codewords that get longer, if the scheme is not lossy.
 In this chapter we will discover the information-theoretic
 equivalent of water volume.
% the constant volume of water in the glove.
%%
\medskip

\fakesection{prerequisites for chapter 3}
 Before reading chapter \chthree, you should have worked on 
 \extwenty.
\medskip

 We will use the\index{notation!intervals} 
 following notation for intervals:
% the statement
\begin{center}
\begin{tabular}{ll}
 $x \in [1 ,2)$ & means that $x \geq 1$ and $x < 2$; \\
% the statement 
 $x \in (1 ,2]$ & means that $x > 1$ and $x \leq 2$.\\
\end{tabular}
\end{center}
 

% {All these definitions of source
%        codes, Huffman codes, etc., can be generalized to codes over
%        other $q$-ary alphabets, but little is lost by concentrating on 
%        the binary case.} 


%\chapter{Data Compression II: Symbol Codes}
\mysetcounter{page}{102}
\chapter{Symbol Codes}
\label{ch.three}
\addtopic{3}{infotheory}
\addtopic{1}{probability}
%\addtopic{2}{inference}
%\addtopic{3}{computation}
%\addtrack{1}{inferencecourse}
\addtrack{3}{infotheorycourse}
\addtrack{3}{itprnncourse}
% %.tex 
% \documentstyle[twoside,11pt,chapternotes,lsalike]{itchapter}
% \begin{document}
% \bibliographystyle{lsalike} 
% \input{psfig.tex} 
% \include{/home/mackay/tex/newcommands1}
% \include{/home/mackay/tex/newcommands2}
% \input{itprnnchapter.tex} 
% \setcounter{chapter}{2}%  set to previous value
% \setcounter{page}{34} % set to current value 
% \setcounter{exercise_number}{45} % set to imminent value
% % 
% \renewcommand{\bs}{{\bf s}}
% \newcommand{\eq}{\mbox{$=$}}
% \chapter{Data Compression II: Symbol Codes}
% % \section*{Source Coding: Lossless data compression with symbol codes}
% % Practical source coding
\label{ch3}
%\section{Symbol codes}
 In this chapter, we  discuss
 {\dem variable-length symbol codes\/}\indexs{symbol code},\index{source code!symbol code}
% , variable-length},
 which encode one source symbol at a time, instead of encoding huge strings of 
 $N$ source symbols. These codes  are 
 {\dem lossless:}
 unlike the last chapter's block codes, they are guaranteed to
 compress and  decompress without
 any errors; but there is a chance that the codes may sometimes produce 
 encoded strings longer  than the original source string.

 The idea is that we can achieve compression, on average,
 by assigning {\em shorter\/} encodings to the more
probable outcomes and {\em longer\/} encodings to the less probable.

 The key issues are:
\begin{description}
\item[What are the implications if a symbol code is {\em lossless\/}?]
 If some codewords are shortened, by how much do other codewords
 have to be lengthened?
\item[Making compression practical.]
 How can we ensure that a symbol code is easy to decode?
\item[Optimal symbol codes.]
 How should we assign codelengths to achieve the best
 compression, and what is the best achievable compression?
\end{description}

 We  again verify the 
 fundamental status of the Shannon information content and the entropy, proving:\index{source coding theorem}
%
%
\begin{description}
\item[Source coding theorem (symbol codes).]
        There exists a variable-length encoding $C$ of an ensemble
 $X$ such that the average length of an encoded symbol, 
 $L(C,X)$, satisfies
 $L(C,X) \in \left[ H(X) ,  H(X) + 1 \right)$.

The average length is equal to the entropy $H(X)$ only if the codelength
 for each outcome is equal to its Shannon information content.
\end{description}
%
 We will also define a constructive  procedure, the 
 \index{Huffman code}Huffman 
 coding algorithm, that produces optimal symbol codes.\index{symbol code!optimal}\index{source code!symbol code!optimal} 

\begin{description}
\item[Notation for alphabets.]  $\A^N$ denotes the set of 
        ordered $N$-tuples of elements from the set $\A$, \ie,
        all strings of length $N$. 
        The symbol $\A^+$ will denote the set of all strings of finite
        length composed of elements from the set $\A$. 
\end{description}
\exampla{ $\{{\tt{0}},{\tt{1}}\}^3 = \{{\tt{0}}{\tt{0}}{\tt{0}},{\tt{0}}{\tt{0}}{\tt{1}},{\tt{0}}{\tt{1}}{\tt{0}},{\tt{0}}{\tt{1}}{\tt{1}},{\tt{1}}{\tt{0}}{\tt{0}},{\tt{1}}{\tt{0}}{\tt{1}},{\tt{1}}{\tt{1}}{\tt{0}},{\tt{1}}{\tt{1}}{\tt{1}}\}$. }
\exampla{
        $\{{\tt{0}},{\tt{1}}\}^+ = \{ {\tt{0}} , {\tt{1}} , {\tt{0}}{\tt{0}} , {\tt{0}}{\tt{1}} , {\tt{1}}{\tt{0}} , {\tt{1}}{\tt{1}} , {\tt{0}}{\tt{0}}{\tt{0}} , {\tt{0}}{\tt{0}}{\tt{1}} , \ldots \}$.
}
% This notation is borrowed from the standard notation for expressions 
% in computer science
\section{Symbol codes}
\label{sec.symbol.code.intro}
\begin{description}
\item[A (binary) symbol code]
        $C$ for an ensemble $X$ is a mapping from the range of $x$,
        $\A_X \eq \{a_1,\ldots, $ $a_I\}$, to $\{{\tt{0}},{\tt{1}}\}^+$.
% a set of finite length strings of symbols 
%       from an alphabet (NAME?). 
        $c(x)$ will denote the {\dem{codeword}\/}\indexs{symbol code!codeword}
 corresponding to $x$, 
        and $l(x)$ will denote its length, with $l_i = l(a_i)$.

        The {\dem \inds{extended code}\/} $C^+$ 
        is a mapping from $\A_X^+$ to $\{{\tt{0}},{\tt{1}}\}^+$
        obtained by concatenation, without punctuation, of the 
 corresponding codewords:\index{concatenation!in compression} 
\beq
        c^+(x_1 x_2 \ldots x_N) = c(x_1)c(x_2)\ldots c(x_N) .
\eeq

 [The term `\ind{mapping}' here is a synonym for `function'.] 
\end{description}
\exampla{
 A symbol code for the ensemble 
 $X$ defined by
\beq
\begin{array}{*{4}{c}*{5}{@{\,}c}}
             & \A_X & = & \{ & {\tt a}, & {\tt b}, & {\tt c}, & {\tt d} & \} , \\
             & \P_X & = & \{ & \dhalf, & \dquarter, & \deighth, & \deighth  &  \}, 
\end{array}
\eeq
% : \A_X = \{{\tt{a}},{\tt{b}},{\tt{c}},{\tt{d}}\},$ $\P_X = \{ \dhalf,\dquarter,\deighth,\deighth \}$
 is   $C_0$, shown in the margin.
% = \{ {\tt{1}}{\tt{0}}{\tt{0}}{\tt{0}}, {\tt{0}}{\tt{1}}{\tt{0}}{\tt{0}}, {\tt{0}}{\tt{0}}{\tt{1}}{\tt{0}}, {\tt{0}}{\tt{0}}{\tt{0}}{\tt{1}}\}$.
\marginpar{
\begin{center}
$C_0$: 
\begin{tabular}{clc} \toprule
$a_i$ & $c(a_i)$ & $l_i$ 
% {\rule[-3mm]{0pt}{8mm}}%strut
\\ \midrule 
{\tt a} & {\tt 1000}   &   4      \\
{\tt b} & {\tt 0100}   &     4    \\
{\tt c} & {\tt 0010}   &    4     \\
{\tt d} & {\tt 0001}   &   4      \\
 \bottomrule
\end{tabular}
\end{center}
}

 Using the extended code, we may encode ${\tt{acdbac}}$
 as
\beq
	c^{+}({\tt{acdbac}}) =
 {\tt{1000}} 
 {\tt{0010}} 
 {\tt{0001}}
 {\tt{0100}} 
 {\tt{1000}}
 {\tt{0010}}
\eeq
}
 There are three basic requirements for a useful symbol code. 
 First, any encoded string must have a unique decoding.
  Second, the symbol code must be easy to decode.
 And third, the code should achieve as much compression as possible.
\subsection{Any encoded string must have a unique decoding}
\begin{description}
\item[A code $C(X)$ is uniquely decodeable] if, under the 
 extended code $C^+$, no two distinct
 strings have the same encoding,
% every element of $\A_X^+$  maps into a different string,
 \ie, 
\beq
        \forall \, \bx,\by \in \A_X^+, \:\: \bx \not = \by  \:\:  \Rightarrow   \:\:
        c^+(\bx) \not = c^+(\by).
\label{eq.UD}
\eeq
%cnp22@maths.cam.ac.uk:
% I'm missing the word `injectivity'. This would explain, why 
% (3.2) is necessary for an inverse function.
%
% {\em I believe mathematicians would put it this way:
% a code is uniquely decodeable if the extended code is an injective
% mapping.}
\end{description}
 The code $C_0$ defined above is  an example of a uniquely decodeable
 code.

\subsection{The symbol code must be easy to decode}
 A symbol code 
 is easiest to decode if it is possible to identify the end of a 
 codeword as soon as it arrives, which means that no codeword can 
 be a {\dem \inds{prefix}\/} of another codeword.
%
% {\em (Need a defn of a prefix here.)}
%\marginpar{\footnotesize
% [A word $c$
%% \in \A^{+}$
% is a {\dem prefix\/} of another word $d$
%% \in \A^{+}$
% if there exists a tail string $t$
%% \in \A^{*}
% such that the concatenation $ct$ is
% identical to $d$. For example, {\tt 1} is a prefix of {\tt 101},
% and so is {\tt 10}.]
%}
 [A word $c$
% \in \A^{+}$
 is a {\dem prefix\/} of another word $d$
% \in \A^{+}$
 if there exists a tail string $t$
% \in \A^{*}
 such that the concatenation $ct$ is
 identical to $d$. For example, {\tt 1} is a prefix of {\tt 101},
 and so is {\tt 10}.]

%
 We will show later that we don't lose 
 any performance if we constrain our symbol code to be 
 a prefix code. 
\begin{description}
\item[A symbol code is called a \inds{prefix code}]
 if no codeword is a prefix of 
 any other codeword.

 A prefix code is also known as an {\dem\ind{instantaneous}\/}
 or {\dem\ind{self-punctuating}\/}
 code, because an encoded string  can be decoded 
 from left to right without looking ahead to subsequent 
 codewords. The end of a codeword is immediately recognizable.
 A prefix code is  uniquely decodeable.


\end{description}
\begin{aside}
 {Prefix codes are also
% is more accurately called
 known as  `prefix-free codes' or  `prefix condition codes'.}
\end{aside}

 Prefix codes correspond to trees.

\exampla{
\marginpar[t]{\mbox{\small$C_1$ \psfig{figure=figs/C1.ps,angle=-90,width=1in}}}
        The code $C_1 = \{ {\tt{0}} , {\tt{1}}{\tt{0}}{\tt{1}} \}$ is a prefix code because 
        ${\tt{0}}$ is not a prefix of {\tt{1}}{\tt{0}}{\tt{1}}, nor is {\tt{1}}{\tt{0}}{\tt{1}} a prefix of {\tt{0}}.

}
\exampla{
        Let $C_2 = \{ {\tt{1}} , {\tt{1}}{\tt{0}}{\tt{1}} \}$. This code is not a prefix code because 
        ${\tt{1}}$ is  a prefix of {\tt{1}}{\tt{0}}{\tt{1}}.
}
\exampla{
% \marginpar[t]{\mbox{\small\raisebox{0.4in}[0in][0in]{$C_3$} \psfig{figure=figs/C3.ps,angle=-90,width=1in}}}
 The code $C_3 = \{ 
{\tt 0}   ,
{\tt 10}  ,
{\tt 110} ,
{\tt 111}
\}$
 is a prefix code.
%
}
%%%%%%%%%%%%%%%
\exampla{
\marginpar[t]{\mbox{\small\raisebox{0.4in}[0in][0in]{$C_3$} \psfig{figure=figs/C3.ps,angle=-90,width=1in}}\\[0.21in]
\mbox{\small%
\raisebox{0.2in}[0in][0in]{$C_4$} \psfig{figure=figs/C4.ps,angle=-90,width=0.681in}%
}\\[0.05in]
\small\raggedright
 Prefix codes can be represented on binary trees. {\dem Complete\/} prefix codes
 correspond to binary trees with no unused branches. $C_1$ is an incomplete code.}
 The code $C_4 = \{ 
{\tt 00}   ,
{\tt 01}  ,
{\tt 10} ,
{\tt 11}
\}$
 is a prefix code.
%
}
%%%%%%%%%%%%%%%

\exercissxA{1}{ex.C1101}{
        Is $C_2$ uniquely decodeable?
}
%
% example
%
% morse code with spaces stripped out. Is it a prefix code? Is it UD?
% (no,no)
%
\exampla{
% ref corrected 9802
 Consider  \exerciseref{ex.weigh} and \figref{fig.weighing} (\pref{fig.weighing}).
 Any weighing strategy that identifies the odd ball and whether it 
 is heavy or light can be viewed as assigning a  {\em ternary\/}
 code to each of the 24 possible states. 
 This code is a prefix code.
}
\subsection{The code should achieve as much compression as possible}
\begin{description}
\item[The expected length $L(C,X)$] of a symbol code $C$ for ensemble $X$ is 
\beq
        L(C,X) = \sum_{x \in \A_X} P(x) \, l(x).
\eeq
 We may also write this quantity as
\beq
	L(C,X) = \sum_{i=1}^{I} p_i l_i
\eeq
 where $I = |\A_X|$. 
\end{description}
%
\exampla{
% {\sf Example 1:}
\marginpar[b]{
\begin{center}
$C_3$:\\[0.1in] 
\begin{tabular}{cllcc} \toprule
$a_i$ & $c(a_i)$ & $p_i$  &
% \multicolumn{1}{c}{$\log_2 \frac{1}{p_i}$}
 $h(p_i)$
 & $l_i$ 
% {\rule[-3mm]{0pt}{8mm}}%strut
\\ \midrule 
{\tt a} & {\tt 0}   & \dhalf         &  1.0     &   1      \\
{\tt b} & {\tt 10}  & \dquarter        &  2.0     &   2      \\
{\tt c} & {\tt 110} & \deighth       &  3.0     &   3      \\
{\tt d} & {\tt 111} & \deighth       &  3.0     &   3      \\
 \bottomrule
\end{tabular}
\end{center}
}
 Let 
\beq
\begin{array}{*{4}{c}*{5}{@{\,}c}}
             & \A_X & = & \{ & {\tt a}, & {\tt b}, & {\tt c}, & {\tt d} & \} , \\
\mbox{and} \:\:& \P_X & = & \{ & \dhalf, & \dquarter, & \deighth, & \deighth  &  \}, 
\end{array}
\eeq
 and  consider the code $C_3$.
% $c(a)\eq {\tt{0}}$, $ c(b)\eq {\tt{1}}{\tt{0}}$,
% $c(c)\eq {\tt{1}}{\tt{1}}{\tt{0}}$, $ c(d)\eq {\tt{1}}{\tt{1}}{\tt{1}}$.
%
 The entropy of $X$ is 1.75 bits, and the expected length $L(C_3,X)$ of this 
 code is also 1.75 bits. The sequence of symbols $\bx\eq ({\tt acdbac})$ is 
% 134213
 encoded as $c^+(\bx)={\tt{0110111100110}}$. 
% You can confirm that no other sequence of 
% symbols $\bx$ has the same encoding.
% In fact,
 $C_3$ is a {prefix code\/}
 and is therefore \inds{uniquely decodeable}. 
 Notice that the codeword lengths satisfy $l_i \eq  \log_2 (1/p_i)$,  or
 equivalently,
 $p_i \eq  2^{-l_i}$.
}
%\medskip
%
%\noindent {\sf Example 2:}
\exampla{
 Consider the fixed length code for the same ensemble
 $X$, $C_4$.
% $ c(1)\eq {\tt{00}}$, $ c(2)\eq {\tt{01}}$, $ c(3)\eq {\tt{10}}$, $ c(4)\eq {\tt{11}}$.
%
% C4 by itself in a table, moved to graveyard
\marginpar[b]{
\begin{center}
 \begin{tabular}{cll} \toprule
% $a_i$
 &
$C_4$&
$C_5$
%&$C_6$
% (header row is terminated by the \\ preceding \midrule below)
% $c(a_i)$ & $p_i$  &
% \multicolumn{1}{c}{$\log_2 \frac{1}{p_i}$}
% $h(p_i)$ & $l_i$
% {\rule[-3mm]{0pt}{8mm}}%strut
\\ \midrule 
{\tt a} & {\tt 00} & {\tt 0}        \\
{\tt b} & {\tt 01} & {\tt 1}         \\
{\tt c} & {\tt 10} & {\tt 00}     \\
{\tt d} & {\tt 11} & {\tt 11}     \\
 \bottomrule
\end{tabular}
\end{center}
}
 The expected length $L(C_4,X)$ is 2 bits.
}
% edskip
% 
% \noindent {\sf Example 3:}
\exampla{
 Consider $C_5$.
%$ c(1)\eq {\tt{0}}$, $ c(2)\eq {\tt{1}}$, $ c(3)\eq {\tt{00}}$,  $c(4)\eq {\tt{11}}$.
 The expected 
 length $L(C_5,X)$ is 1.25 bits, which is less than $H(X)$. 
 But the code is not uniquely decodeable. 
 The sequence $\bx\eq ({\tt acdbac})$
% 134213)$
 encodes as {\tt{000111000}}, which can also be 
 decoded as $({\tt cabdca})$.
}
% \medskip
% 
% \noindent {\sf Example 4:}
\exampla{
 Consider the code $C_6$.
\marginpar[b]{
\begin{center}
$C_6$:\\[0.1in]
 \begin{tabular}{cllcc} \toprule
$a_i$  & $c(a_i)$    & $p_i$  &
% {$\log_2 \frac{1}{p_i}$}
 $h(p_i)$
 & $l_i$ 
% {\rule[-3mm]{0pt}{8mm}}%strut
\\ \midrule 
{\tt a} & {\tt 0}    & \dhalf         &  1.0     &   1     \\
{\tt b} & {\tt 01}   & \dquarter        &  2.0     &   2     \\
{\tt c} & {\tt 011}  & \deighth       &  3.0     &   3     \\
{\tt d} & {\tt 111}  & \deighth       &  3.0     &   3     \\
 \bottomrule
\end{tabular}
\end{center}
}
%$ c(1)\eq {\tt{0}}$, $ c(2)\eq {\tt{01}}$, $ c(3)\eq {\tt{011}}$,  $c(4)\eq {\tt{111}}$. 
 The  expected length $L(C_6,X)$ of this 
 code is  1.75 bits. The sequence of symbols $\bx\eq ({\tt acdbac})$ is 
 encoded as $c^+(\bx)={\tt{0011111010011}}$. 

 Is $C_6$  a {prefix code}?
 It is not, because $c({\tt a}) = {\tt 0}$ is a prefix of $c({\tt b})$ and $c({\tt c})$. 

 Is $C_6$ {uniquely decodeable}? This is not so obvious. If you think that
 it might {\em not\/} be {uniquely decodeable},  try to prove it 
 so by finding a pair of strings $\bx$ and $\by$ that have the same
 encoding. [The definition of unique decodeability is given in \eqref{eq.UD}.]

 $C_6$ certainly isn't {\em easy\/} to decode. 
 When we receive `{\tt{00}}', it is possible that $\bx$ could start `{\tt{aa}}', 
 `{\tt{ab}}' or `{\tt{ac}}'. Once we have received `{\tt{001111}}', the second symbol 
 is still ambiguous, as $\bx$ could be `{\tt{abd}}\ldots' or `{\tt{acd}}\ldots'. 
 But eventually a unique decoding crystallizes, once the next {\tt{0}} appears in the 
 encoded stream. 

 $C_6$ {\em is\/} in fact {uniquely decodeable}. Comparing with the prefix code $C_3$, 
 we see that the codewords of $C_6$ are  the reverse of $C_3$'s.
 That $C_3$ is uniquely decodeable proves that $C_6$ is too, since
 any string from $C_6$ is identical to a string from $C_3$ read backwards. 
}
% \medskip
% something I recall reading in cover was a contrary statement that said that
% with a nonprefix code it will take an arb long time to figure things out. 
% maybe that was just a w.c. result.

% What is it that distinguishes a uniquely

\section{What limit is imposed by unique decodeability?}
 We now ask, given a list of positive integers $\{ l_i
 \}$, does there exist a uniquely decodeable\index{uniquely decodeable}\index{source code!uniquely decodeable} code with those
 integers as its codeword lengths?
 At this stage, we  ignore the probabilities of the different
 symbols; once we understand unique decodeability better, we'll
 reintroduce the probabilities and discuss how to make
 an {\dem optimal\/} uniquely decodeable symbol code. 

 In the examples above, we have observed that if we take a code 
 such as $\{{\tt{00}},{\tt{01}},{\tt{10}},{\tt{11}}\}$, and
 shorten one of its codewords, 
 for example ${\tt{00}} \rightarrow {\tt{0}}$, then we can  retain unique 
 decodeability only if we lengthen  other codewords.
 Thus there seems to be a constrained budget\index{symbol code!budget} that we can spend
 on codewords, with shorter codewords being more expensive.

 Let us explore the nature of this \ind{budget}. 
 If we build a code purely from codewords of length $l$ equal 
 to three, how many 
 codewords can we have and retain unique decodeability?
 The answer is $2^l = 8$. Once we have chosen all eight 
 of these codewords, is there any way we could add to the code another 
 codeword of some {\em other\/} length and retain unique decodeability? 
 It would seem not.

 What if we make a code that includes a length-one codeword, `{\tt{0}}', 
 with the other codewords being of length three?  How many length-three
 codewords can we have?
 If we restrict attention to prefix codes, then
% it is clear  that
 we can  have only four codewords of length three, namely 
 $\{ {\tt{100}},{\tt{101}},{\tt{110}},{\tt{111}} \}$. What about other codes? Is there any other 
 way of choosing codewords of length 3 that can give more codewords? 
 Intuitively, we  think this unlikely. 
 A codeword of length $3$ appears to 
 have a cost that is $2^{2}$ times smaller than a codeword of length 1. 
% "... cost ... times smaller ..."; I suspect some
%        readers may have difficulty with this sentence.

 Let's  define a total budget of size 1, 
 which we can spend on codewords.
 If we set the cost of a codeword whose length is $l$ to $2^{-l}$,
 then we have a pricing system that fits the examples
 discussed above. Codewords of length 3 cost $\deighth$ each;
 codewords of length 1 cost $1/2$ each. 
 We can spend our budget on any codewords.
 If we go over our budget then the code will certainly not be 
 uniquely decodeable. If, on the other hand,
\beq
	\sum_i 2^{-l_i} \leq 1,
\label{eq.kraft}
\eeq
 then the code may be uniquely decodeable. This inequality is
 the \inds{\Kraft\ inequality}.\label{sec.kraft}
%
%  Symbol Coding Budget
%
\begin{figure}
\figuremargin{%
\begin{center}
\mbox{\psfig{figure=figs/budget1.eps,height=3in}\ \psfig{figure=figs/budgetmax.eps,height=3in}}
\end{center}
}{%
\caption[a]{The symbol coding \ind{budget}.\index{supermarket}\indexs{symbol code!budget}
 The `cost' $2^{-l}$ of each codeword
 (with length $l$)
 is indicated by the size of the box it is written in. The total budget 
 available when making a uniquely decodeable code is 1.}
\label{fig.budget1}
}%
\end{figure}
\begin{figure}
\figuredangle{%
\begin{center}
\mbox{
%\begin{tabular}{cc}
% $C_0$ & $C_3$ \\
%\psfig{figure=figs/budget0.eps,height=1.48in}&
%\psfig{figure=figs/budget3.eps,height=1.48in} \\[0.2in]
% $C_4$ & $C_6$ \\
%\psfig{figure=figs/budget4.eps,height=1.48in}&
%\psfig{figure=figs/budget6.eps,height=1.48in}\\
%\end{tabular}}
\begin{tabular}{cccc}
 $C_0$ & $C_3$ &  $C_4$ & $C_6$ \\
\psfig{figure=figs/budget0.eps,height=1.66in}&
\psfig{figure=figs/budget3.eps,height=1.66in}&
\psfig{figure=figs/budget4.eps,height=1.66in}&
\psfig{figure=figs/budget6.eps,height=1.66in}\\
\end{tabular}}
\end{center}
}{%
\caption[a]{Selections of codewords made by codes $C_0,C_3,C_4$ and $C_6$
 from section \protect\ref{sec.symbol.code.intro}.}
\label{fig.budget0}
\label{fig.budget6}
}%
\end{figure}
\begin{description}
\item[\Kraft\ inequality.] 
 For any uniquely decodeable code $C$ over the binary alphabet $\{0,1\}$, 
 the codeword lengths must satisfy:
\beq
        \sum_{i=1}^I 2^{-l_i} \leq 1 ,
\eeq
 where $I = |\A_X|$.
\end{description}
\begin{description}
\item[Completeness.]
 If a uniquely 
 decodeable code satisfies the \Kraft\ inequality with equality 
 then  it is called a {\dbf complete} code.
\end{description}
% It is less obvious that t
 We want  codes that are uniquely decodeable; 
 prefix codes are uniquely decodeable, and are easy to decode.
% ;  and it is  easy to assess whether a code is a prefix code. 
% codes that are not prefix codes are less straightforward to decode than 
% prefix codes. 
 So life would be simpler for us if we could restrict attention to prefix
 codes.\index{prefix code}  
 Fortunately,
% we can prove that
 for any source there {\em is\/}
 an optimal symbol code that is also a prefix
 code. 
% We wi, and we will discuss an 
% algorithm    we can restrict attention to prefix
% codes.
% The following
% result is also true:
\begin{description}
\item[\Kraft\ inequality and prefix codes.]
 Given a set of codeword lengths that satisfy
 the Kraft inequality,
% this inequality,
 there exists a uniquely decodeable prefix
 code\index{source code!prefix code}\index{prefix code} with these
 codeword lengths.
\end{description}
\begin{aside}
%\subsection*{The small print}
 The Kraft inequality
% , which appears on page \pageref{sec.kraft},
 might be more accurately referred to 
 as the Kraft--McMillan inequality: 
 Kraft (1949) proved that if the inequality is satisfied, 
 then a prefix code exists with the given lengths.
 McMillan (1956) proved the converse, that unique decodeability 
 implies that the inequality holds.
\end{aside}
\begin{prooflike}{Proof of the \Kraft\ inequality}
%
        Define $S = \sum_i 2^{-l_i}$.
 Consider the quantity 
\beq
        S^N = \left[ \sum_i 2^{-l_i} \right]^N 
        = \sum_{i_1=1}^{I} \sum_{i_2=1}^{I} \ldots \sum_{i_N=1}^{I}
         2^{-\displaystyle \left(l_{i_1} + l_{i_2} + \ldots + l_{i_N} \right) } .
\eeq
 The quantity in the exponent, $\left(l_{i_1} + l_{i_2} + \ldots
 + l_{i_N} \right)$, is the length of the encoding of the string $\bx =
 a_{i_1} a_{i_2} \ldots a_{i_N}$. For every string $\bx$ 
 of length $N$, there is one term in the above sum. Introduce an 
 array $A_l$ that counts how many strings $\bx$ have encoded length $l$. 
 Then, defining $l_{\min} = \min_i l_i$ and $l_{\max} = \max_i l_i$:
\beq
        S^N = \sum_{l = N l_{\min} }^{N l_{\max}} 2^{-l} A_l .
\eeq
 Now assume $C$ is
 uniquely decodeable, so that for all $\bx \not = \by$, 
 $c^+(\bx) \not = c^+(\by)$. Concentrate on the $\bx$ that have encoded 
 length $l$. There are a total of $2^l$ distinct bit strings of length $l$, 
 so it must be the case that $A_l \leq 2^l$. 
%
 So
\beq
        S^N = \sum_{l = N l_{\min} }^{N l_{\max}} 2^{-l} A_l \leq
         \sum_{l = N l_{\min} }^{N l_{\max}} 1 \:\: \leq \:\:  N l_{\max}.
\label{eq.kraft.climax}
\eeq
 Thus $S^N \leq l_{\max} N$ for all $N$.
 Now if $S$ were greater than 1, then as $N$ increases,
 $S^N$ would be an exponentially growing function, and for large enough
 $N$, an exponential always exceeds a polynomial such as $l_{\max} N$.
 But our  result $(S^N \leq l_{\max} N)$
% \ref{eq.kraft.climax}
 is true for {\em any\/} $N$.
 Therefore $S \leq 1$. \hfill
% Q.E.D. 
%
% to have
% enabled me to understand it the first time round, it would have been
% sufficient to have said 'for the inequality to be true for all N,
% regardless of how large, S has to be <= 1.'
%
\end{prooflike}


\exercissxB{3}{ex.KIconverse}{ 
% (optional)
 Prove
 the  result stated above,
 that for any set of codeword lengths $\{ l_i \}$
 satisfying the \Kraft\ inequality, there is a prefix code having those 
 lengths.
}
 A pictorial view of the \Kraft\ inequality may  help you solve this exercise.
 Imagine that we are choosing the codewords to make a symbol code. 
 We can draw the set of all candidate codewords
% that we might  include in a code
 in a figure that
 shows the `cost' of the codeword by the area of a box (\figref{fig.budget1}). 
 The total budget available -- the `1' on the right hand side of 
 the \Kraft\ inequality -- is shown at one side. 
 Some of the codes discussed in section \ref{sec.symbol.code.intro}
 are illustrated in figure \ref{fig.budget0}. Notice that the codes that 
 are prefix codes, $C_0$, $C_3$,
 and $C_4$,  have the property that to the right of any selected 
 codeword, there are no other selected codewords --
 because prefix codes correspond to trees.
% The {\em complete\/} prefix codes  $C_0$, $C_3$,
% and $C_4$ have the property that
% the codewords abut 
% Notice also that the 
% `incomplete' code 
% -\ref{fig.budget6}.
 Notice that a {\em complete\/} prefix code
 corresponds to a {\em complete\/} tree having no unused branches.

\medskip

 We are now ready to put back the symbols' probabilities $\{ p_i \}$.
 Given a set of symbol probabilities (the English language
 probabilities of \figref{fig.monogram}, for example),
 how do we make the best symbol code --  one  with the smallest
 possible expected length $L(C,X)$? And what is that smallest possible
 expected length?
 It's not
 obvious how to assign the codeword lengths.
 If we give short codewords to the more probable
 symbols then the expected length might be reduced; on the other
 hand, shortening some codewords necessarily causes others
 to lengthen, by the Kraft inequality.

\section{What's the most compression that we can hope for?}
% there must be a compromise.
% of s
% Of the four codes  displayed in figure \ref{fig.budget0},
% $C_3$ and $C_6$
 We wish to minimize the expected length of a code,  
\beqan
        L(C,X) &=& \sum_i p_i l_i .
\eeqan

 As you might have guessed, the entropy  appears as the 
% It is easy to show that there is a 
 lower bound on the expected length of a code.
\begin{description}
\item[Lower bound on expected length.] The expected length $L(C,X)$ 
 of a uniquely decodeable code 
 is bounded below by $H(X)$. 

\item[{\sf Proof:}]
% Introduce the optimum codelengths $l^*_i \equiv \log (1/p_i)$, 
        We define the {\dem\inds{implicit probabilities}\/}
 $q_i \equiv 2^{-l_i}/z$,
        where $z\eq \sum_{i'} 2^{-l_{i'}}$, so that $l_i \eq  \log 1/q_i -
        \log z$.  We then use Gibbs's inequality,
        $\sum_i p_i \log 1/q_i \geq \sum_i p_i \log 1/p_i$, with
        equality if $q_i \eq  p_i$, and the \Kraft\ inequality $z\leq 1$:
\beqan
        L(C,X) &=& \sum_i p_i l_i =
        \sum_i p_i \log 1/q_i - \log z
\label{eq.expected.length}
\\
        & \geq & \sum_i p_i \log 1/p_i - \log z
\\
        & \geq & H(X) . 
\eeqan
        The equality $L(C,X) \eq  H(X)$ is achieved only if the \Kraft\ 
        equality $z
% \sum_i 2^{-l_i} 
        \eq  1$ is satisfied, and if 
        the codelengths satisfy $l_i \eq  \log (1/p_i)$. \hfill $\Box$

\end{description}
 This is an important result so let's say it again: 
\begin{description}
\item[Optimal source codelengths.]
        The\index{source code!optimal lengths}
         expected length is minimized and is equal to 
 $H(X)$ only if the codelengths 
        are equal to the {\dem Shannon information contents}:
\beq
        l_i = \log_2 (1/p_i)  .
\eeq
\item[Implicit probabilities defined by codelengths.]
	Conversely, any choice of codelengths $\{l_i\}$ {\em implicitly\/}
 defines a probability distribution $\{q_i\}$,
\beq
	q_i \equiv 2^{-l_i}/z  ,
\eeq
 for  which  those codelengths would be the  optimal codelengths.
 If the code is complete then $z=1$ and the implicit probabilities 
 are given by $q_i =  2^{-l_i}$.
\end{description}
%  This is one of the central themes of this course.
%
%
%
\section{How much can we compress?}
 So, we can't compress below the entropy.
% using a symbol code.
 How close can we expect  to get to the entropy?
% if we are using a symbol code?
% \section{Existence of good symbol codes}
\begin{ctheorem}
{\sf Source coding theorem for symbol codes.}
 For an ensemble $X$ there exists a prefix code $C$ with  expected length 
 satisfying\indexs{extra bit} 
\beq
        H(X) \leq L(C,X) < H(X) + 1.
\label{eq.source.coding.symbol}
\eeq
\label{th.source.coding.symbol}
\end{ctheorem}
\begin{prooflike}{Proof}    We set the codelengths to integers slightly 
 larger than the optimum lengths:
\beq
        l_i = \lceil \log_2 (1/p_i) \rceil ,
\eeq
        where $\lceil l^* \rceil$ denotes the smallest integer greater
        than or equal to $l^*$.
 [We are not asserting that the {\em optimal\/} code necessarily uses
 these lengths, we are simply choosing these lengths 
 because we can use them to prove the theorem.]

  We check that there {\em is\/} a
        prefix code with these lengths by confirming that the
        \Kraft\ inequality is satisfied.
\beq
	\sum_i 2^{-l_i} = \sum_i 2^{-\lceil \log_2 (1/p_i) \rceil} 
	\leq \sum_i 2^{ -\log_2 (1/p_i) } = \sum_i p_i = 1 . 
\eeq

        Then we confirm
\beq
	L(C,X) = \sum_i p_i \lceil \log (1/p_i) \rceil
        < \sum_i p_i ( \log (1/p_i) + 1 ) = H(X) + 1.
\eeq
% corrected < to =  , 9802
%
\end{prooflike}

\subsection{The cost of using the wrong codelengths}
 If we use a code whose lengths are not equal to the optimal 
 codelengths,  the average message length will be larger
 than the entropy.

%when        we use the `wrong' code. 
 If the true probabilities are $\{ p_i
        \}$ and we use a complete code with lengths $l_i$,
% that satisfy the
%         \Kraft\ equality (that is, 
%  the  \Kraft\ inequality with equality),
  we can view those lengths as defining 
        \ind{implicit probabilities} $q_i = 2^{-{l_i}}$.
% l_i \eq  \log 1/q_i$ such
%       that $\sum_i q_i \eq  1$, then 
        Continuing from \eqref{eq.expected.length},
 the average length is
\beq
	L(C,X) = H(X)+\sum_i p_i \log p_i/q_i,
\eeq
        \ie, it exceeds the entropy by the Kullback--Leibler divergence
        $D_{\rm KL}(\bp||\bq)$ (as defined on  \pref{eq.KL}).

\section{Optimal source coding with symbol codes:  Huffman coding}
 Given a set of probabilities $\P$, how can we design an optimal
 prefix code? For example,
 what is the best symbol code for the English language ensemble
 shown in \figref{fig.elfig}? 
% hinton diagram with labels and probabilities to 4 d.p.
%%%%%%%%%%%%%%%%%%%%%%%
\marginfig{%
\setlength{\unitlength}{3.538mm}
\begin{picture}(2,28.9)(-28,0)
\put(-28,0.15){\makebox(0,0)[bl]{\psfig{figure=bigrams/hd_marg.ps,angle=-90}}}
\put(-28.65,28.7){\makebox(0,0)[r]{{$x$}}}          %   0.06 
\put(-25.70,28.7){\makebox(0,0)[l]{{$P(x)$}}}
\put(-28.65,27){\makebox(0,0)[r]{{\tt a}}}          %   0.06   0.0575
\put(-25.60,27){\makebox(0,0)[l]{{\footnotesize   0.0575 }}}             
\put(-28.65,26){\makebox(0,0)[r]{{\tt b}}}	    %   0.01   0.0128
\put(-25.60,26){\makebox(0,0)[l]{{\footnotesize   0.0128 }}}             
\put(-28.65,25){\makebox(0,0)[r]{{\tt c}}}	    %   0.03   0.0263
\put(-25.60,25){\makebox(0,0)[l]{{\footnotesize   0.0263 }}}             
\put(-28.65,24){\makebox(0,0)[r]{{\tt d}}}	    %   0.03   0.0285
\put(-25.60,24){\makebox(0,0)[l]{{\footnotesize   0.0285 }}}             
\put(-28.65,23){\makebox(0,0)[r]{{\tt e}}}	    %   0.09   0.0913
\put(-25.60,23){\makebox(0,0)[l]{{\footnotesize   0.0913 }}}             
\put(-28.65,22){\makebox(0,0)[r]{{\tt f}}}	    %   0.02   0.0173
\put(-25.60,22){\makebox(0,0)[l]{{\footnotesize   0.0173 }}}             
\put(-28.65,21){\makebox(0,0)[r]{{\tt g}}}	    %   0.01   0.0133
\put(-25.60,21){\makebox(0,0)[l]{{\footnotesize   0.0133 }}}             
\put(-28.65,20){\makebox(0,0)[r]{{\tt h}}}	    %   0.03   0.0313
\put(-25.60,20){\makebox(0,0)[l]{{\footnotesize   0.0313 }}}             
\put(-28.65,19){\makebox(0,0)[r]{{\tt i}}}	    %   0.06   0.0599
\put(-25.60,19){\makebox(0,0)[l]{{\footnotesize   0.0599 }}}             
\put(-28.65,18){\makebox(0,0)[r]{{\tt j}}}	    %   0.00   0.0006
\put(-25.60,18){\makebox(0,0)[l]{{\footnotesize   0.0006 }}}             
\put(-28.65,17){\makebox(0,0)[r]{{\tt k}}}	    %   0.01   0.0084
\put(-25.60,17){\makebox(0,0)[l]{{\footnotesize   0.0084 }}}             
\put(-28.65,16){\makebox(0,0)[r]{{\tt l}}}	    %   0.04   0.0335
\put(-25.60,16){\makebox(0,0)[l]{{\footnotesize   0.0335 }}}             
\put(-28.65,15){\makebox(0,0)[r]{{\tt m}}}	    %   0.02   0.0235
\put(-25.60,15){\makebox(0,0)[l]{{\footnotesize   0.0235 }}}             
\put(-28.65,14){\makebox(0,0)[r]{{\tt n}}}	    %   0.06   0.0596
\put(-25.60,14){\makebox(0,0)[l]{{\footnotesize   0.0596 }}}             
\put(-28.65,13){\makebox(0,0)[r]{{\tt o}}}	    %   0.07   0.0689
\put(-25.60,13){\makebox(0,0)[l]{{\footnotesize   0.0689 }}}             
\put(-28.65,12){\makebox(0,0)[r]{{\tt p}}}	    %   0.02   0.0192
\put(-25.60,12){\makebox(0,0)[l]{{\footnotesize   0.0192 }}}             
\put(-28.65,11){\makebox(0,0)[r]{{\tt q}}}	    %   0.01   0.0008
\put(-25.60,11){\makebox(0,0)[l]{{\footnotesize   0.0008 }}}             
\put(-28.65,10){\makebox(0,0)[r]{{\tt r}}}	    %   0.05   0.0508
\put(-25.60,10){\makebox(0,0)[l]{{\footnotesize   0.0508 }}}             
\put(-28.65,9 ){\makebox(0,0)[r]{{\tt s}}}	    %   0.06   0.0567
\put(-25.60,9 ){\makebox(0,0)[l]{{\footnotesize   0.0567 }}}             
\put(-28.65,8 ){\makebox(0,0)[r]{{\tt t}}}	    %   0.07   0.0706
\put(-25.60,8 ){\makebox(0,0)[l]{{\footnotesize   0.0706 }}}             
\put(-28.65,7 ){\makebox(0,0)[r]{{\tt u}}}	    %   0.03   0.0334
\put(-25.60,7 ){\makebox(0,0)[l]{{\footnotesize   0.0334 }}}             
\put(-28.65,6 ){\makebox(0,0)[r]{{\tt v}}}	    %   0.01   0.0069
\put(-25.60,6 ){\makebox(0,0)[l]{{\footnotesize   0.0069 }}}             
\put(-28.65,5 ){\makebox(0,0)[r]{{\tt w}}}	    %   0.01   0.0119
\put(-25.60,5 ){\makebox(0,0)[l]{{\footnotesize   0.0119 }}}             
\put(-28.65,4 ){\makebox(0,0)[r]{{\tt x}}}	    %   0.01   0.0073
\put(-25.60,4 ){\makebox(0,0)[l]{{\footnotesize   0.0073 }}}             
\put(-28.65,3 ){\makebox(0,0)[r]{{\tt y}}}	    %   0.02   0.0164
\put(-25.60,3 ){\makebox(0,0)[l]{{\footnotesize   0.0164 }}}             
\put(-28.65,2 ){\makebox(0,0)[r]{{\tt z}}}	    %   0.00   0.0007
\put(-25.60,2 ){\makebox(0,0)[l]{{\footnotesize   0.0007 }}}             
\put(-28.65,1 ){\makebox(0,0)[r]{{$-$}}}       %   0.19   0.1928
%\put(-28.65,1 ){\makebox(0,0)[r]{{\verb+-+}}}       %   0.19   0.1928
\put(-25.60,1 ){\makebox(0,0)[l]{{\footnotesize   0.1928 }}}
\end{picture}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%






















































\caption[a]{An ensemble in need of a symbol code.}\label{fig.elfig}}
 When we say `optimal', let's assume our aim is to minimize the
 expected length $L(C,X)$.

\subsection{How not to do it}
 One might try
 to roughly split the set $\A_X$ in two, and
 continue bisecting the subsets so as to define a binary tree from the
 root. This construction has the right spirit, as in the weighing problem, 
% is how the {\em Shannon-Fano code\/} is constructed, 
 but it is not
 necessarily optimal; it achieves $L(C,X) \leq H(X) + 2$. 
%
% find a reference for proof of this?
%
%{\em [Is Shannon-Fano
% the correct name? According to Goldie and Pinch this has a different
% meaning. Check.]}
\subsection{The Huffman coding algorithm}
 We now present a beautifully simple algorithm for finding an optimal
 prefix code.
 \indexs{Huffman code}The trick is to
 construct the code {\em backwards\/} starting from the tails of the
 codewords;  {\em we build the binary tree  from its leaves}.
%\begin{description}
\ben
\item%[{\sf 1.}]
 Take the two least probable symbols in the alphabet. These two symbols 
 will be given the longest codewords, which will have equal length, 
 and differ only in the last digit. 
\item%[{\sf 2.}]
 Combine these two symbols into a single symbol, and repeat.
\een
%\end{description}
 Since each step reduces the size of the alphabet by one, 
 this algorithm will have assigned strings to all the symbols 
 after  $|\A_X|-1$ steps.
\exampla{
% {\sf Example:}
 \begin{tabular}[t]{*{11}{@{\,}l}}
 Let \hspace{0.1in} & $\A_X$  &=&$\{$& {\tt a},&{\tt b},&{\tt c},&{\tt d},&{\tt e} &$\}$ \\
 and \hspace{0.1in} &  $\P_X$  &=&$\{$& 0.25,  &0.25,  & 0.2,  & 0.15, & 0.15   & $\}$.
 \end{tabular}
\begin{center}
% \framebox{\psfig{figure=figs/huffman.ps,%
%angle=-90}}
\setlength{\unitlength}{0.015in}%was0125
\begin{picture}(200,95)(40,40)
\put( 60,105){\makebox(0,0)[lb]{\raisebox{0pt}[0pt][0pt]{0.25}}}
\put( 60,090){\makebox(0,0)[lb]{\raisebox{0pt}[0pt][0pt]{0.25}}}
\put( 60,075){\makebox(0,0)[lb]{\raisebox{0pt}[0pt][0pt]{0.2}}}
\put( 60,060){\makebox(0,0)[lb]{\raisebox{0pt}[0pt][0pt]{0.15}}}
\put( 60,045){\makebox(0,0)[lb]{\raisebox{0pt}[0pt][0pt]{0.15}}}
\put(100,105){\makebox(0,0)[lb]{\raisebox{0pt}[0pt][0pt]{0.25}}}
\put(100,090){\makebox(0,0)[lb]{\raisebox{0pt}[0pt][0pt]{0.25}}}
\put(100,075){\makebox(0,0)[lb]{\raisebox{0pt}[0pt][0pt]{0.2}}}
\put(100,060){\makebox(0,0)[lb]{\raisebox{0pt}[0pt][0pt]{0.3}}}
\put(140,105){\makebox(0,0)[lb]{\raisebox{0pt}[0pt][0pt]{0.25}}}
\put(140,090){\makebox(0,0)[lb]{\raisebox{0pt}[0pt][0pt]{0.45}}}
\put(140,060){\makebox(0,0)[lb]{\raisebox{0pt}[0pt][0pt]{0.3}}}
\put(180,105){\makebox(0,0)[lb]{\raisebox{0pt}[0pt][0pt]{0.55}}}
\put(180,090){\makebox(0,0)[lb]{\raisebox{0pt}[0pt][0pt]{0.45}}}
\put(220,105){\makebox(0,0)[lb]{\raisebox{0pt}[0pt][0pt]{1.0}}}
\put( 40,105){\makebox(0,0)[lb]{\raisebox{0pt}[0pt][0pt]{\tt a}}}
\put( 40,090){\makebox(0,0)[lb]{\raisebox{0pt}[0pt][0pt]{\tt b}}}
\put( 40,075){\makebox(0,0)[lb]{\raisebox{0pt}[0pt][0pt]{\tt c}}}
\put( 40,060){\makebox(0,0)[lb]{\raisebox{0pt}[0pt][0pt]{\tt d}}}
\put( 40,045){\makebox(0,0)[lb]{\raisebox{0pt}[0pt][0pt]{\tt e}}}
\put( 85,067){\makebox(0,0)[lb]{\raisebox{0pt}[0pt][0pt]{\tt 0}}}
\put( 85,045){\makebox(0,0)[lb]{\raisebox{0pt}[0pt][0pt]{\tt 1}}}
\put(125,097){\makebox(0,0)[lb]{\raisebox{0pt}[0pt][0pt]{\tt 0}}}
\put(125,075){\makebox(0,0)[lb]{\raisebox{0pt}[0pt][0pt]{\tt 1}}}
\put(165,112){\makebox(0,0)[lb]{\raisebox{0pt}[0pt][0pt]{\tt 0}}}
\put(165,065){\makebox(0,0)[lb]{\raisebox{0pt}[0pt][0pt]{\tt 1}}}
\put(205,112){\makebox(0,0)[lb]{\raisebox{0pt}[0pt][0pt]{\tt 0}}}
\put(205,090){\makebox(0,0)[lb]{\raisebox{0pt}[0pt][0pt]{\tt 1}}}
\thinlines
\put( 80,110){\line( 1, 0){ 15}}
\put( 80,095){\line( 1, 0){ 15}}
\put( 80,080){\line( 1, 0){ 15}}
\put( 80,065){\line( 1, 0){ 15}}
\put( 95,065){\line(-1,-1){ 15}}

\put(120,110){\line( 1, 0){ 15}}
\put(120,065){\line( 1, 0){ 15}}
\put(120,095){\line( 1, 0){ 15}}
\put(135,095){\line(-1,-1){ 15}}

\put(160,095){\line( 1, 0){ 15}}
\put(160,110){\line( 1, 0){ 15}}
\put(175,110){\line(-1,-3){ 15}}

\put(200,110){\line( 1, 0){ 15}}
\put(215,110){\line(-1,-1){ 15}}
\put( 40,125){\makebox(0,0)[bl]{\raisebox{0pt}[0pt][0pt]{$x$}}}
\put( 85,125){\makebox(0,0)[b]{\raisebox{0pt}[0pt][0pt]{step 1}}}
\put(125,125){\makebox(0,0)[b]{\raisebox{0pt}[0pt][0pt]{step 2}}}
\put(165,125){\makebox(0,0)[b]{\raisebox{0pt}[0pt][0pt]{step 3}}}
\put(205,125){\makebox(0,0)[b]{\raisebox{0pt}[0pt][0pt]{step 4}}}
\end{picture}

\end{center}
 The  codewords are then obtained by concatenating the binary digits
 in reverse order:
% Codewords
 $C = \{ {\tt{00}}, {\tt{10}} , {\tt{11}}, {\tt{010}}, {\tt{011}} \}$.
\margintab{
\begin{center}
\begin{tabular}{clrrl} \toprule
$a_i$  & $p_i$  &
 \multicolumn{1}{c}{$h(p_i)$%$\log_2 \frac{1}{p_i}$}
 }
& $l_i$ & $c(a_i)$
%{\rule[-3mm]{0pt}{8mm}}%strut
\\ \midrule 
{\tt a} & 0.25        &  2.0     &   2 & {\tt 00}       \\
{\tt b} & 0.25        &  2.0     &   2 & {\tt 10}       \\
{\tt c} & 0.2         &  2.3     &   2 & {\tt 11}       \\
{\tt d} & 0.15        &  2.7     &   3 & {\tt 010}      \\
{\tt e} & 0.15        &  2.7     &   3 & {\tt 011}      \\ \bottomrule
\end{tabular}
\end{center}
\caption[a]{Code created by the Huffman algorithm.}
\label{tab.huffman}
}
 The codelengths  selected by the Huffman algorithm (column 4
 of \tabref{tab.huffman}) are
 in some cases longer and in some cases shorter than 
 the ideal codelengths, the  Shannon information contents $\log_2 \dfrac{1}{p_i}$ (column 3).
 The expected length of the code is $L=2.30$ bits, whereas the
 entropy is $H=2.2855$ bits.
}
 If at any point there is more than one way of selecting the two least 
 probable symbols then the choice may be made in any manner -- the 
 expected length of the code will not depend on the choice.
\exercissxC{3}{ex.Huffmanconverse}{ 
% (Optional)
 Prove\index{Huffman code!`optimality'} 
 that there is no better symbol code for a source than the
 Huffman code.
}
%
\exampla{
 We can make a Huffman code for the probability distribution
 over the alphabet introduced in \figref{fig.monogram}.
 The result is shown in \figref{fig.monogram.huffman}.
 This code has an expected length of    4.15 bits; the entropy of
 the ensemble is     4.11 bits.
% It is interesting to notice how
% some symbols, for example {\tt q}, receive codelengths that
% differ by more than 1 bit from
 Observe the disparities between the assigned
 codelengths and the ideal codelengths
 $\log_2 \frac{1}{p_i}$.
}
%%%%%%%%%%%%%%%%%%%%%%%%% alphabet of english!
\begin{figure}
\figuremargin{%
\begin{center}
\mbox{\small
\begin{tabular}{clrrl}  \toprule
$a_i$  & $p_i$  & \multicolumn{1}{c}{$\log_2 \frac{1}{p_i}$}  & $l_i$ & $c(a_i)$
%{\rule[-3mm]{0pt}{8mm}}%strut
\\[0in] \midrule
{\tt a}&    0.0575 & 4.1  &   4 & {\tt 0000       } \\
{\tt b}&    0.0128 & 6.3  &   6 & {\tt 001000       } \\
{\tt c}&    0.0263 & 5.2  &   5 & {\tt 00101       } \\
{\tt d}&    0.0285 & 5.1  &   5 & {\tt 10000       } \\
{\tt e}&    0.0913 & 3.5  &   4 & {\tt 1100       } \\
{\tt f}&    0.0173 & 5.9  &   6 & {\tt 111000       } \\
{\tt g}&    0.0133 & 6.2  &   6 & {\tt 001001       } \\
{\tt h}&    0.0313 & 5.0  &   5 & {\tt 10001       } \\
{\tt i}&    0.0599 & 4.1  &   4 & {\tt 1001       } \\
{\tt j}&    0.0006 & 10.7 &  10 & {\tt 1101000000       } \\
{\tt k}&    0.0084 & 6.9  &   7 & {\tt 1010000       } \\
{\tt l}&    0.0335 & 4.9  &   5 & {\tt 11101       } \\
{\tt m}&    0.0235 & 5.4  &   6 & {\tt 110101       } \\
{\tt n}&    0.0596 & 4.1  &   4 & {\tt 0001       } \\
{\tt o}&    0.0689 & 3.9  &   4 & {\tt 1011       } \\
{\tt p}&    0.0192 & 5.7  &   6 & {\tt 111001       } \\
{\tt q}&    0.0008 & 10.3 &   9 & {\tt 110100001       } \\
{\tt r}&    0.0508 & 4.3  &   5 & {\tt 11011       } \\
{\tt s}&    0.0567 & 4.1  &   4 & {\tt 0011       } \\
{\tt t}&    0.0706 & 3.8  &   4 & {\tt 1111       } \\
{\tt u}&    0.0334 & 4.9  &   5 & {\tt 10101       } \\
{\tt v}&    0.0069 & 7.2  &   8 & {\tt 11010001       } \\
{\tt w}&    0.0119 & 6.4  &   7 & {\tt 1101001       } \\
{\tt x}&    0.0073 & 7.1  &   7 & {\tt 1010001       } \\
{\tt y}&    0.0164 & 5.9  &   6 & {\tt 101001       } \\
{\tt z}&    0.0007 & 10.4 &  10 & {\tt 1101000001       } \\
{--}& 0.1928 & 2.4  &   2 & {\tt 01       } \\  \bottomrule
%{\verb+-+}& 0.1928 & 2.4  &   2 & {\tt 01       } \\  \bottomrule
\end{tabular}
\hspace*{0.5in}\raisebox{-2in}{\psfig{figure=tex/sortedtree.eps,width=1.972in}}
}
\end{center}
}{%
\caption[a]{Huffman code for the English language ensemble (monogram statistics).}
%  introduced in \protect\figref{fig.monogram}.}
\label{fig.monogram.huffman}
}%
\end{figure}
% see \cite[p. 97]{Cover&Thomas}
% \medskip
\subsection{Constructing a binary tree top-down is suboptimal}
 In previous chapters we studied weighing problems
 in which we built ternary or binary trees. 
 We noticed that balanced trees -- ones in which at every step, the two 
 possible outcomes were as close as possible to equiprobable --
 appeared to describe the most efficient experiments. 
 This gave an intuitive motivation for entropy as a measure of information 
 content.

 It is not the case, however, that optimal codes can {\em always\/}
 be constructed
 by a greedy top-down method in which the alphabet
 is successively divided into subsets that are as near as possible to equiprobable.
% /home/mackay/itp/huffman> huffman.p latex=1 < fiftywrong3
\exampla{
 Find the optimal binary symbol code for the ensemble:
\beq
\begin{array}{*{3}{@{\,}c@{\,}}*{6}{c@{,\,}}*{2}{@{\,}c}}
\A_X & = & \{ & 
{\tt a} & 
{\tt b} & 
{\tt c} & 
{\tt d} & 
{\tt e} & 
{\tt f} & 
{\tt g} &
 \} \\ 
\P_X & = & \{ 
& 0.01 
& 0.24 
& 0.05 
& 0.20 
& 0.47 
& 0.01 
& 0.02 
& \} \\ 
\end{array} .
\eeq
 Notice that a greedy top-down method can split this set into two
% equiprobable
 subsets
 $\{ {\tt a},{\tt b},{\tt c},{\tt d} \}$ and $\{{\tt e},{\tt f},{\tt g}\}$
 which both have probability $1/2$,
 and that  $\{ {\tt a},{\tt b},{\tt c},{\tt d} \}$  can be divided
 into
% equiprobable
 subsets $\{ {\tt a},{\tt b} \}$ and $\{{\tt c},{\tt d}\}$,
 which have probability $1/4$; 
 so  a greedy top-down method gives the code shown
 in the third column of  \tabref{tab.greed},\margintab{
\begin{center}
\begin{tabular}{clll} \toprule
$a_i$  & $p_i$  & Greedy  & Huffman   \\[0in] \midrule 
{\tt a} & .01  & {\tt 000}  & {\tt 000000}     \\
{\tt b} & .24  & {\tt 001}  & {\tt 01}             \\
{\tt c} & .05  & {\tt 010}  & {\tt 0001}         \\
{\tt d} & .20  & {\tt 011}  & {\tt 001}           \\
{\tt e} & .47  & {\tt 10}   & {\tt 1}              \\
{\tt f} & .01  & {\tt 110}  & {\tt 000001}     \\
{\tt g} & .02  & {\tt 111}  & {\tt 00001}       \\
 \bottomrule
\end{tabular}
\end{center}
\caption[a]{A greedily-constructed code compared with the Huffman code.}
\label{tab.greed}
}
 which has expected length 2.53.
 The Huffman coding algorithm yields the code shown in the fourth
 column,
%\begin{center}
%\begin{tabular}{clrrl} \toprule
%$a_i$  & $p_i$  & \multicolumn{1}{c}{$\log_2 \frac{1}{p_i}$}  & $l_i$ & $c(a_i)$
%%{\rule[-3mm]{0pt}{8mm}}%strut
%\\[0in] \midrule 
%{\tt a} & 0.01        &  6.6     &   6 & {\tt 000000}   \\
%{\tt b} & 0.24        &  2.1     &   2 & {\tt 01}       \\
%{\tt c} & 0.05        &  4.3     &   4 & {\tt 0001}     \\
%{\tt d} & 0.20        &  2.3     &   3 & {\tt 001}      \\
%{\tt e} & 0.47        &  1.1     &   1 & {\tt 1}        \\
%{\tt f} & 0.01        &  6.6     &   6 & {\tt 000001}   \\
%{\tt g} & 0.02        &  5.6     &   5 & {\tt 00001}    \\
% \bottomrule
%\end{tabular}
%\end{center}
 which has 
 expected length       1.97.
% entropy     1.9323
%
}


%\subsection{Twenty questions}
% The Huffman algorithm defines the optimal way to 
% play `twenty questions'. 
%
% {\em [MORE HERE]}

\section{Disadvantages of the Huffman code}
\label{sec.huffman.probs}
 The Huffman\index{Huffman code!disadvantages}\index{symbol code!disadvantages}
 algorithm produces an
 optimal symbol code for an ensemble, but this is not the end of the
 story. Both the word `ensemble' and the phrase `symbol code' 
 need careful attention. 
%\begin{description}
%\item[Changing ensemble.] 
\subsection{Changing ensemble}
        If we wish to communicate a sequence of outcomes from one
        unchanging ensemble, then a Huffman code may be convenient.
        But often the appropriate ensemble changes. If, for
        example, we are compressing text, then the symbol frequencies
        will vary with context: the letter  {\tt{u}} is
 much more probable after a {\tt{q}} than after an {\tt{e}}.  And
        furthermore, our knowledge of these context-dependent symbol
        frequencies will also change as we learn 
% accumulate statistics on
	the statistical properties     of the
	text source.\index{adaptive models}
% So our probabilities	should change 

        Huffman codes do not handle  changing
        ensemble probabilities with any elegance.
 One brute-force approach would be to
        recompute the Huffman code every time the probability over
        symbols changes. Another attitude is to deny the option of
        adaptation, and instead to run through the entire file in
        advance and compute a good probability distribution, which will
        then remain fixed throughout transmission. The code itself must 
 also be communicated in this scenario. Such a technique is
        not only cumbersome and restrictive, it is also suboptimal,
        since the initial message specifying the code and the document
        itself are partially redundant.
% -- knowing the algorithm that
%       defines the code for a given document, one can deduce what the
%       initial header has to be from the .
        This technique  therefore wastes bits.
% flag this: 
% could discuss bits back here
%
\subsection{The extra bit}
%item[The extra bit.] 
        An equally serious problem with Huffman codes is the
        innocuous-looking `\ind{extra bit}' relative to the ideal average
        length of $H(X)$ -- a Huffman code achieves a length that
        satisfies $H(X) \leq L(C,X) < H(X) + 1,$ as proved in theorem
        \ref{th.source.coding.symbol}.
%\eqref{eq.source.coding.symbol}).
  A
        Huffman code thus incurs an overhead of between 0 and 1 bits per
        symbol. If $H(X)$ were large, then this overhead would be an
        unimportant fractional increase.  But for many applications,
        the entropy may be as low as one bit per symbol, or even smaller,
        so  the overhead
%`$+1$'
 $L(C,X)- H(X)$ may dominate the encoded file length. Consider English
        text: in some contexts, long strings of characters may be
        highly predictable.
% , as we saw in the guessing game of chapter \chtwo.
% given a simple model of the language. 
        For
        example, in the context `{\verb+strings_of_ch+}', one might
        predict the next nine symbols to be `{\verb+aracters_+}' with
        a probability of 0.99 each. A traditional Huffman code would
        be obliged to use at least one bit per character, making a total cost
        of nine bits where virtually no information is being
        conveyed (0.13 bits in total, to be precise).
 The entropy of English, given a good model, is about
        one bit per character \cite{shannon93}, so a Huffman code is likely to be highly
% nearly 100\%
        inefficient.

        A traditional patch-up of Huffman codes uses them to compress
        {\dem blocks\/} of symbols, for example the `extended sources'
        $X^N$ we discussed in chapter \chtwo.
% \ref{ch2}
% rather than defining a code for       single symbols. 
        The overhead per block is at most 1 bit so the 
 overhead per symbol
% goes down as
 is at most $1/N$ bits. For
        sufficiently large blocks, the problem of the extra bit may be
        removed -- but only at the expense of (a) losing the elegant
        instantaneous decodeability of  simple Huffman coding; and 
         (b) having 
        to compute the probabilities of all relevant strings and build 
        the associated Huffman tree. One will end up explicitly 
 computing the 
 probabilities and codes for a huge number of strings, most
 of which will never actually occur. 

% A further problem is that it may not be appropriate to model
%        successive symbols as coming independently from a single ensemble
%        $X$.  As we already asserted, any decent model for text will
%        assign a probability over symbols that depends on the context.
%  A changing probability distribution over symbols is
%        not incompatible with the construction of Huffman codes for
%        blocks of symbols. One could consider each possible sequence,
%        computing the relevant probability distributions along the way
%        to evaluate the probability of the entire sequence, then build
%        a Huffman tree for the sequences.  One could account for
%        dependencies between blocks as well, if one were willing to
%        use a different Huffman code each time.  But this modified
% encoder would be
%        computationally expensive, since for large block sizes an
%        exponentially large number of possible sequences would have 
%        to be considered along with their adaptive probabilities.
%% is context-dependent.
% \end{description}
% \medskip

\subsection{Beyond symbol codes}
%
        Huffman codes, therefore, although widely trumpeted as 
        `optimal', have many defects for practical
 purposes.\index{Huffman code!`optimality'}
 They {\em are\/}  optimal {\em symbol\/} codes, but  for practical 
 purposes {\em we don't want a symbol code}.

        The defects of Huffman codes are rectified by {\dem arithmetic
        coding},\index{arithmetic coding} which dispenses with the
        restriction that each symbol must translate into an integer
        number of bits. Arithmetic coding is the main topic of the next 
        chapter.
% is not a symbol coding. This
%       we will discuss next.
%       In an arithmetic code, the probabilistic modelling is clearly 
%       separated from the encoding operation.


\section{Summary}
\begin{description}
\item[Kraft inequality.]
 If a code is {\dbf uniquely decodeable} its lengths must satisfy
\beq
	\sum_i 2^{-l_i } \leq 1 .
\eeq
 For any lengths satisfying the Kraft inequality, there exists
 a prefix code with those lengths.

\item[Optimal source codelengths for an ensemble] are equal to the 
 Shannon information contents
\beq
	l_i = \log_2 \frac{1}{p_i} ,
\eeq
 and conversely, any choice of codelengths defines
 {\dbf implicit probabilities}
\beq
	q_i = \frac{2^{-l_i}}{z} .
\eeq

\item[The relative entropy] $D_{\rm KL}(\bp||\bq)$ measures
 how many bits per symbol are wasted by using a
% mismatched
 code whose implicit probabilities are $\bq$, when
 the ensemble's true probability distribution is $\bp$.

\item[Source coding theorem for symbol codes.]
 For an ensemble $X$, there exists a prefix code
 whose expected length satisfies 
\beq
	H(X) \leq L(C,X) < H(X) + 1 .
\eeq
% The expected length is only equal to the entropy if the
 
\item[The Huffman coding algorithm] generates an optimal symbol code
  iteratively. At each iteration,  the two least probable symbols are combined.
\end{description}

\section{Exercises}
\exercissxB{2}{ex.Cnud}{
 Is the code $\{ {\tt 00}, {\tt 11}, {\tt 0101}, {\tt 111}, {\tt 1010},
 {\tt 100100}, {\tt 0110} \}$
% $\{ 00,11,0101,111,1010,100100,0110 \}$
 uniquely decodeable?
}
\exercisxB{2}{ex.Ctern}{
 Is the ternary code
 $\{ {\tt 00},{\tt 012},{\tt 0110},{\tt 0112},{\tt 100},{\tt 201},{\tt 212},{\tt 22} \}$ uniquely decodeable?
}
\exercisxA{3}{ex.HuffX2X3}{
 Make  Huffman codes for $X^2$, $X^3$ and $X^4$ where ${\cal A}_X = \{ 0,1 \}$ 
 and ${\cal P}_X = \{ 0.9,0.1 \}$. Compute their expected lengths and compare 
 them with the entropies $H(X^2)$, $H(X^3)$ and $H(X^4)$.

 Repeat this exercise for $X^2$  and $X^4$ where ${\cal P}_X = \{ 0.6,0.4 \}$.
}
\exercisxA{2}{ex.Huffambig}{
 Find a  probability distribution $\{ p_1,p_2,p_3,p_4 \}$ such that 
 there are {\em two\/} optimal codes that assign different lengths $\{ l_i \}$
 to the four symbols.
}
\exercisxC{3}{ex.Huffambigb}{
 (Continuation of \exerciseonlyref{ex.Huffambig}.)
 Assume that the four probabilities  $\{ p_1,p_2,p_3,p_4 \}$ are ordered
 such that $p_1 \geq p_2 \geq p_3 \geq p_4 \geq 0$.  Call the 
 set of  
 all probability vectors $\bp$  such that 
 there are {\em two\/} optimal codes with different lengths the set 
 `$\cal Q$'.
 Give a complete description of $\cal Q$. 
 Find three probability vectors $\bq^{(1)}$,  $\bq^{(2)}$,  $\bq^{(3)}$, 
 which are the \ind{convex hull} of  $\cal Q$, \ie, such that 
 any $\bp \in \cal Q$ can be written as 
\beq
        \bp = \mu_1 \bq^{(1)} + \mu_2 \bq^{(2)}  +\mu_3 \bq^{(3)} ,
\eeq
 where $\{\mu_i\}$ are positive.
}
\exercisxB{1}{ex.twenty.questions}{
 Write a short essay discussing how to play
 the game of {\sf{\ind{twenty questions}}} optimally.
 [In twenty questions, one player thinks of an object,
 and the other player has to guess the object using as few binary
 questions as possible, preferably fewer than twenty.]
}
\exercisxB{2}{ex.make.huffman.suck}{
 Make ensembles for which the difference between the entropy
 and the expected length of the Huffman code is as big as possible.
}% 14. Gallager, R. G., "Variations on a Theme by Huffman", 
%     IEEE Trans. on Information Theory, Vol. IT-24, No. 6, Nov. 1978, pp. 668-674. 
%
% DONE from {tex/huffmanI.tex} !!!!!!!!!!!!!!!!!!!!!!!!!!!!!! add this after Mon 11/4/22
\exercisxB{2}{ex.huffman.uniform}
{
% from 02q.tex on rum 
 A source $X$  has an alphabet 
 of eleven characters $$\{  a , b , c , d , e ,   f , g , h , i , j , k \},$$
 all of which have equal probability, $1/11$.
% State the meaning of the ideal codelengths

  Find an {optimal uniquely decodeable  symbol code}
  for this source.
 How much greater is the expected length of this optimal code  
 than the entropy of $X$?

}
\exercisxB{2}{ex.huffman.uniform2}{
 Consider the optimal symbol code for an ensemble $X$ with alphabet size 
 $I$ from which all symbols have identical probability 
 $p = 1/I$. $I$ is not a power of 2.

 Show that the fraction $f^+$ of the $I$ symbols  that are assigned 
 codelengths equal to 
\beq
 l^+ \equiv \lceil \log_2 I \rceil
\eeq
 satisfies 
\beq
        f^+ =  2 - \frac{2^{l^+}}{I} 
\label{eq.HIf}
\eeq
 and that the expected length of the optimal symbol code
 is 
\beq
        L = l^+ -1 + f^+ .
\label{eq.HIL}
\eeq
 By differentiating 
 the {\em excess length\/}
\beq
        \Delta L \equiv L - H(X)
\eeq
 with respect to $I$, show that the excess
 length  is bounded by
\beq
        \Delta L \leq 1 - \frac{ \ln ( \ln 2 )}{ \ln 2}  -\frac{ 1 }{ \ln 2}
                = 0.086 .
\eeq

}
\exercisxB{2}{ex.Huff99}{
 Consider a sparse binary source with ${\cal P}_X = \{ 0.99 , 0.01  \}$. 
 Discuss how Huffman codes could be used to compress this source
 {\em efficiently}. 
%  The entropy - hint: could think about run length encoding?
%
}
\exercisxB{2}{ex.poisonglass}{
%   p.111 martin gardner mathematical carnival{Gardner:Carnival}
 {\em Scientific American\/} carried the following puzzle in 1975.
% roughly!
\begin{description}
\item[The poisoned glass.]% This should be \exercisetitlestyle ?
 `Mathematicians are curious birds', the police commissioner said to
 his wife. `You see, we had all those partly filled glasses lined up
 in rows on a table in the hotel kitchen. Only one contained poison,
 and we wanted to know which one before searching that glass for
 fingerprints. Our lab could test the liquid in each glass, but the
 tests take time and money, so we wanted to make as few of them as
 possible by simultaneously testing mixtures of small samples from
 groups of glasses. The university sent over a mathematics professor
 to help us. He counted the glasses, smiled and said:

` ``Pick any glass you want, Commissioner. We'll test it first.''

` ``But won't that waste a test?'' I asked.

` ``No,'' he said, ``it's part of the best procedure. We can test one glass
 first. It doesn't matter which one.'' '

 `How many glasses were there to start with?' the commissioner's wife asked.

 `I don't remember. Somewhere between 100 and 200.'

 What was the exact number of glasses?

\end{description}% \cite{Gardner:Carnival}
 Solve this puzzle and then explain why the professor was in fact 
 wrong and the commissioner was right. What is in fact the optimal procedure
 for identifying the one poisoned glass? What is the expected waste
 relative to this optimum if one followed the professor's strategy?
 Explain the relationship to symbol coding.
}
% could get worked up over the all zero codeword, which corresponds to 
% a possible non-detection; if this would require an extra test
% then presumably the story is a bit different, with some deliberate 
% skewing of the tree to make it more likely that we get a positive 
%result along the way.
\exercisxA{2}{ex.optimalcodep1}{% problem fixed Tue 12/12/00
 Assume that a sequence of symbols
 from the ensemble $X$ introduced at the beginning of this
 chapter is compressed using the code $C_3$.
\amarginfig{b}{
\begin{center}
$C_3$:\\[0.1in] 
\begin{tabular}{cllcc} \toprule
$a_i$ & $c(a_i)$ & $p_i$  & \multicolumn{1}{c}{$h({p_i})$}  & $l_i$ 
% {\rule[-3mm]{0pt}{8mm}}%strut
\\ \midrule 
{\tt a} & {\tt 0}   & \dhalf         &  1.0     &   1      \\
{\tt b} & {\tt 10}  & \dquarter        &  2.0     &   2      \\
{\tt c} & {\tt 110} & \deighth       &  3.0     &   3      \\
{\tt d} & {\tt 111} & \deighth       &  3.0     &   3      \\
 \bottomrule
\end{tabular}
\end{center}
}
 Imagine picking one bit at random from
 the binary encoded sequence $\bc = c(x_1)c(x_2)c(x_3)\ldots$.
 What is the probability  that this bit is a 1?
}
\exercissxB{2}{ex.Huffmanqary}{ 
% (Optional)
 How should the\index{Huffman code!general alphabet} binary 
 Huffman encoding scheme be modified to make optimal symbol codes 
 in an encoding alphabet with $q$ symbols? (Also known as `radix $q$'.)
}
% answer, Hamming p.73: 
% add enough states with probability zero to make the total 
% number of states equal to $k(q-1)+1$, for some integer $k$.
%  then repeatedly combine $q$ into 1


% \end{document} 
% 
% \item[A code $C(X)$ is {\em non-singular\/}] if every element of $\A_X$ 
%  maps into a different string, \ie, 
% \beq
%       a_i \not = a_j \Rightarrow c(a_i) \not = c(a_j).
% \eeq
% 
% \item[The extension $C^+$ of a code $C$] is a mapping from finite length 
%  strings of $\A_X$ to $\{0,1\}^+$
% % finite length strings of NAME? 
%  defined by the concatentation:
% \beq
%       c(x_1 x_2 \ldots x_N) = c(x_1)c(x_2)\ldots c(x_N)
% \eeq
% 
% \item[A code is uniquely decodeable] if its extension is non-singular.
%
\subsection*{Mixture codes}
 It is a tempting idea to construct a `\ind{metacode}' from several symbol
 codes that assign different-length codewords to the alternative
 symbols.  For example, we might switch from one
  code to another, choosing whichever assigns the shortest codeword
   to the current symbol.
   Clearly we cannot do this for free.\index{bits back}
   If one wishes to  choose between two codes, then 
 it is necessary to lengthen the message in a way that 
 indicates which of the two codes is being used. If we indicate this
 choice by 
 a single leading bit, it will be found that the resulting code 
 is suboptimal because it is incomplete (that is,
 it fails the Kraft equality).
\exercisxA{3}{ex.mixsubopt}{
 Prove that this metacode  is incomplete,
 and explain why this combined code is 
 suboptimal.
}
  

%
% need more on prefix property to make clear how strings are decodeable,
% self-punctuating.
\dvips
\subchapter{Solutions to Chapter \protect\ref{ch3}'s exercises} 
\fakesection{solns 3}
\soln{ex.C1101}{
 Yes,
 $C_2 = \{ {\tt{1}} , {\tt{1}}{\tt{0}}{\tt{1}} \}$
% $C_2 = \{ 1 , 101 \}$
 is uniquely decodeable, even though 
 it is not a prefix code, because no two different strings 
 can map onto the same string; only the codeword $c(a_2)={\tt 101}$ contains 
 the symbol {\tt0}. 
}
\soln{ex.KIconverse}{
 We wish to prove that for any set of codeword lengths $\{ l_i \}$
 satisfying the \Kraft\ inequality, there is a prefix code having those 
 lengths.
%
%  Symbol Coding Budget -- cut this figure later, it is already in _l3
%
\begin{figure}[htbp]
\figuremargin{%
\begin{center}
\mbox{\psfig{figure=figs/budget1.eps,height=3in}\ \psfig{figure=figs/budgetmax.eps,height=3in}}
\end{center}
}{%
\caption[a]{The codeword supermarket and
 the symbol coding budget. The `cost' $2^{-l}$ of each codeword
 (with length $l$)
 is indicated by the size of the box it is written in. The total budget 
 available when making a uniquely decodeable code is 1.}
\label{fig.budget1a}
}%
\end{figure}
 This is readily proved by thinking of  
 the codewords  illustrated in \figref{fig.budget1a}
 as being in a `codeword supermarket', with size indicating 
 cost. We imagine purchasing 
 codewords one at a time, starting from the shortest codewords (\ie, the biggest
 purchases), 
 using the  budget shown at the right of \figref{fig.budget1a}. 
 We start at one side of the codeword supermarket, say the top, 
 and purchase the first codeword of the required length. We advance down 
 the supermarket a distance $2^{-l}$, and purchase the next codeword 
  of the next required length, and so forth. 
 Because the codeword lengths are getting longer, and the corresponding 
 intervals are getting shorter, we can always buy 
 an adjacent codeword to the latest purchase, so there is no wasting 
 of the budget. Thus at the $I$th codeword we have advanced 
 a distance $\sum 2^{-l_i}$ down the supermarket; 
 if $\sum 2^{-l_i} \leq 1$, we will have purchased 
 all the codewords without running out of budget.
}
\soln{ex.Huffmanconverse}{
 The proof that Huffman coding is optimal depends on 
 proving that the key step in the algorithm --- the decision to give
% combination of 
 the two symbols
 with smallest probability equal encoded lengths
 --- cannot lead to a larger expected length 
 than any other code. We can prove this by contradiction. 

 Assume that 
 the two symbols with smallest probability, called $a$ and $b$, 
 to which the Huffman algorithm would assign equal length 
 codewords, 
 do {\em not\/} have equal lengths in {\em any\/}
 optimal symbol code. 
 The optimal symbol code is some 
 other rival code in which these two codewords 
 have  unequal lengths $l_a$ and $l_b$ with $l_a < l_b$.
 Without loss of
 generality we can assume that this other  code is a complete prefix code, 
 because any codelengths of a uniquely decodeable code
 can  be realized by a prefix code.  
%  We now consider transforming the other code into a new code 
%  in which we interchange \ldots

 In this rival code, there must be some other symbol $c$ whose 
 probability $p_c$ is greater than $p_a$ and whose length 
 in the rival code is greater than or equal to $l_b$, because 
 the code for $b$ must have an adjacent codeword of equal or greater
 length --- a complete prefix code never has a solo codeword 
 of the maximum length.
\begin{figure}[htbp]
\figuremargin{%
\begin{tabular}{llllll} \toprule % \hline
symbol & \multicolumn{2}{c}{probability} & Huffman  & Rival code's & Modified rival \\
  & & & codewords & codewords & code \\ \midrule % [0.1in]\hline
$a$ & $p_a$ & \framebox[0.15in]{} &  \framebox[1.50cm]{$c_{\rm H}(a)$} & \framebox[1.0cm]{$c_{\rm R}(a)$} &  \framebox[1.6cm]{$c_{\rm R}(c)$}   
\\[0.1in]	   
$b$ & $p_b$ & \framebox[0.1in]{}  &  \framebox[1.50cm]{$c_{\rm H}(b)$} & \framebox[1.5cm]{$c_{\rm R}(b)$} &  \framebox[1.5cm]{$c_{\rm R}(b)$} 
\\[0.1in]		                                         
$c$ & $p_c$ & \framebox[0.25in]{} &  \framebox[0.95cm]{$c_{\rm H}(c)$} & \framebox[1.6cm]{$c_{\rm R}(c)$} &  \framebox[1.0cm]{$c_{\rm R}(a)$}   
\\ \bottomrule  % [0.1in] \hline
\end{tabular}
}{%
\caption[a]{Proof that Huffman coding makes an optimal symbol code.
% The proof works by contradiction. 
 We assume that the rival code, which is said to be optimal, assigns {\em unequal\/} length 
 codewords to the two symbols with smallest probability,  $a$ and $b$. 
 By interchanging  codewords $a$ and $c$ of the rival code, where $c$ is a
 symbol with rival codelength as long as $b$'s, we can make 
 a code better than the rival code. This shows that the rival code
 was not optimal.
}
\label{fig.huffman.optimal}
}%
\end{figure}

 Consider exchanging the codewords of $a$ and $c$ (\figref{fig.huffman.optimal}), so that 
 $a$ is encoded with the longer codeword that was $c$'s, and 
 $c$, which is more probable than $a$, gets the shorter codeword. 
 Clearly this reduces the expected length of the code. 
 The change in expected length is $(p_a-p_c)(l_c-l_a)$. 
 Thus we have contradicted the assumption that the rival code is optimal.
 Therefore it is valid to give the two symbols
 with smallest probability equal encoded lengths. 
 Huffman coding produces optimal symbol codes.  
}
\soln{ex.Cnud}{
 The code $\{ {\tt 00}, {\tt 11}, {\tt 0101}, {\tt 111}, {\tt 1010},
 {\tt 100100},$ ${\tt 0110} \}$ is not 
 uniquely decodeable because ${\tt 11111}$ can be realized from $c(2)c(4)$ 
 and $c(4)c(2)$.
}
\soln{ex.Ctern}{
 The ternary code
 $\{ {\tt 00},{\tt 012},{\tt 0110},{\tt 0112},$ ${\tt 100},{\tt 201},{\tt 212},{\tt 22} \}$
% $\{ 00,012,0110,0112,100,201,$ $212,22 \}$
 {\em is\/} uniquely decodeable
 because it is a prefix code.
}
\soln{ex.HuffX2X3}{
 A Huffman code 
 for $X^2$ where ${\cal A}_X = \{ {\tt 0},{\tt 1} \}$ 
 and ${\cal P}_X = \{ 0.9,0.1 \}$ 
 is $\{{\tt 00},{\tt 01},{\tt 10},{\tt 11}\} \rightarrow
 \{{\tt 1},{\tt 01},{\tt 000},{\tt 001}\}$.
 This code has $L(C,X^2) = 1.29$, whereas the entropy $H(X^2)$ is 0.938.

  
A Huffman code  for $X^3$ is
\[ \{{\tt 000},{\tt 100},{\tt 010},{\tt 001},{\tt 101},{\tt 011},{\tt 110},{\tt 111}\}
 \rightarrow
 \{{\tt 1},{\tt 011},{\tt 010},{\tt 001},
 {\tt 00000},{\tt 00001},{\tt 00010},{\tt 
 00011}\}.
\]
% corrected from 1.472 to 1.598
% 9802
 This has expected length $L(C,X^3) =  1.598$ whereas  the entropy $H(X^3)$
 is 1.4069.

  A Huffman code  for $X^4$ maps the sixteen source strings to the
 following codelengths: 
\[
\begin{array}{c}
 \{ {\tt 0000},{\tt 1000},{\tt 0100},{\tt 0010},{\tt 0001},{\tt 1100},{\tt 0110},{\tt 0011},{\tt 0101}, 
 {\tt 1010},{\tt 1001},{\tt 1110},{\tt 1101}, \\
 {\tt 1011},{\tt 0111},{\tt 1111} \}
 \rightarrow \:\: \{ 1,3,3,3,4,6,7,7,7,7,7,9,9,9,10,10 \}.
% 10,10,9,9,9,7,7,7,7,7,6,4,3, 3,3,1\}.
\end{array}
\]
 This has expected length $L(C,X^4) =  1.9702$ whereas  the entropy $H(X^4)$
 is 1.876.
% 

% 0.6,0.4
 When ${\cal P}_X = \{ 0.6,0.4 \}$, the Huffman code for $X^2$ has lengths
 $\{ 2,2,2,2 \}$; the expected length is 2 bits, and the
 entropy is 1.94 bits. A
 Huffman code  for $X^4$ is shown in \figref{fig.X4huff2}.
% , has lengths
% $\{0000,1000,0100,0010,0001,1100,0110,0011,0101,1010,1001,1110,1101,1011,0111,1111\} \rightarrow$
% $\{3,3,4,4,4,4,4,4,4,4,4,4,5,5,5,5\}$.
 The expected length is 3.92 bits, and the entropy is 3.88 bits.
% see tmp3 for soln using huffman.p
% $\{0000,1000,0100,0010,0001,1100,0110,0011,0101,1010,1001,1110,1101,1011,0111,1111\} \rightarrow \{5,5,5,5,4,4,4,4,4,4,4,4,4,4,3,3\}$.
}
% see tmp3 for use of huffman.p
%\begin{figure}
%\figuremargin{%
\marginfig{\footnotesize
\begin{center}
\begin{tabular}{clrl} \toprule % \hline
$a_i$  & $p_i$  &
% \multicolumn{1}{c}{$h({p_i})$}  &
 $l_i$ & $c(a_i)$
% {\rule[-3mm]{0pt}{8mm}}%strut
% \\[0.1in] \hline
\\ \midrule 
{\tt 0000} & 0.1296      &   3 & {\tt 000 }\\ 
{\tt 0001} & 0.0864      &   4 & {\tt 0100 }\\ 
{\tt 0010} & 0.0864      &   4 & {\tt 0110 }\\ 
{\tt 0100} & 0.0864      &   4 & {\tt 0111 }\\ 
{\tt 1000} & 0.0864      &   3 & {\tt 100 }\\ 
{\tt 1100} & 0.0576      &   4 & {\tt 1010 }\\ 
{\tt 1010} & 0.0576      &   4 & {\tt 1100 }\\ 
{\tt 1001} & 0.0576      &   4 & {\tt 1101 }\\ 
{\tt 0110} & 0.0576      &   4 & {\tt 1110 }\\ 
{\tt 0101} & 0.0576      &   4 & {\tt 1111 }\\ 
{\tt 0011} & 0.0576      &   4 & {\tt 0010 }\\ 
{\tt 1110} & 0.0384      &   5 & {\tt 00110 }\\ 
{\tt 1101} & 0.0384      &   5 & {\tt 01010 }\\ 
{\tt 1011} & 0.0384      &   5 & {\tt 01011 }\\ 
{\tt 0111} & 0.0384      &   4 & {\tt 1011 }\\ 
{\tt 1111} & 0.0256      &   5 & {\tt 00111 }\\ \bottomrule %\hline
%expected length     3.9248
%entropy     3.8838
\end{tabular}
\end{center}
%}{%
\caption[a]{Huffman code for $X^4$ when $p_0=0.6$. Column 3 shows the
 assigned codelengths and column 4 the codewords. Notice some strings
 whose probabilities are identical, \eg, the fourth and fifth,
 receive different codelengths.}
\label{fig.X4huff2}
}%
%\end{figure}
\soln{ex.Huffambig}{
 The set of probabilities  $\{ p_1,p_2,p_3,p_4 \} = 
 \{ \dsixth,\dsixth,\dthird,\dthird\}$ gives rise to two different optimal 
 sets of codelengths, because at the second step of the Huffman 
 coding algorithm we can choose any of the three possible pairings. 
 We may either put them in a constant length code
 $\{ {\tt00},{\tt01},{\tt10},{\tt11} \}$ or
 the code $\{ {\tt000},{\tt001},{\tt01},{\tt1} \}$. 
 Both codes have expected length 2.

 Another solution is  $\{ p_1,p_2,p_3,p_4 \}$ $=$ 
 $\{ \dfifth,\dfifth,\dfifth,\dtwofifth\}$.
% =$ $\{ 0.2 , 0.2 , 0.2 , 0.4 \} $.

 And a third is  $\{ p_1,p_2,p_3,p_4 \} = 
 \{ \dthird,\dthird,\dthird,0\}$.
}
\soln{ex.Huffambigb}{
        Probability vectors leading to a free choice in the Huffman 
 coding algorithm satisfy $p_1 \geq p_2 \geq p_3 \geq p_4 \geq 0$ and 
\beq
        p_1 = p_3 + p_4 .
\label{eq.Huffambig}
\eeq
%  The 
% % reason for this is that the 
%  first step  of the Huffman coding algorithm always combines the 
% symbols with smallest probability giving a new symbol with 
%  probability $p_3 + p_4$. The only way we can get alternative 
%  lengths is if this probability is equal to 
 The convex hull of $\cal Q$ is most easily obtained by 
 turning two of the three inequalities 
 $p_1 \geq p_2 \geq p_3 \geq p_4$ into equalities, and then solving 
 \eqref{eq.Huffambig} for $\bp$. Each choice of equalities gives 
 rise to one of  the set of three vectors
\beq
        \{ \dthird,\dthird,\dsixth,\dsixth\} , \:
 \{ \dtwofifth,\dfifth,\dfifth,\dfifth\} \mbox{ and } \{ \dthird ,\dthird,\dthird,0\}.
\eeq
}
\soln{ex.make.huffman.suck}{
 Let $p_{\max}$ be the largest probability in $p_1,p_2,\ldots,p_I$.
 The difference between the  expected length
 $L$ and the entropy $H$  can be no bigger than
 $\max ( p_{\max} , 0.086 )$ \cite{Gallager78}.

%
 See exercises \ref{ex.huffman.uniform}--\ref{ex.huffman.uniform2} to understand
 where the curious 0.086 comes from.
}
\soln{ex.huffman.uniform}{
% removed to  cutsolutions.tex
 Length $-$ entropy = 0.086.
%length / entropy     1.0249

}
\begincuttable
\soln{ex.Huff99}{
 The sparse source ${\cal P}_X = \{ 0.99 , 0.01  \}$ 
 could be compressed with a Huffman code based on blocks of 
 length $N$, but $N$ would need to be quite large 
 for the code to be efficient. The probability of the all--{\tt{0}} sequence
 of length $N$
 has to be reduced to about 0.5 or smaller for the code to be efficient.
 This sets $N \simeq \log 0.5/\log 0.99 = 69$. 
 The Huffman code would then have $2^{69}$ entries in its tree, 
 which probably exceeds the memory capacity of all the computers 
 in this universe and several others. 
 
 There are other ways that we could describe the data stream. One 
 is run-length encoding. We could chop the source into 
 the substrings ${\tt{1}},{\tt{01}},{\tt{001}},{\tt{0001}},{\tt{00001}},\ldots$ with the last elements
 in the set being, say, two strings of equal maximum length
 ${\tt{00}}\ldots{\tt{01}}$ and ${\tt{00}}\ldots{\tt{00}}$. 
 We can give names to each of these strings and compute their 
 probabilities, which are not hugely dissimilar to each other. 
 This list of probabilities starts $\{ 0.01, 0.0099, 0.009801 , \ldots\}$. 
 For this code to be efficient, the string with largest probability 
 should have probability about 0.5 or smaller; this means that we would 
 make a code out of about 69 such strings.  It is perfectly feasible to 
 make such a code.  The only difficulty with this code is the issue
 of termination. If a sparse file ends with a string of 20 {\tt 0}s
 still left to transmit, what do we do? This problem has arisen 
 because we failed to include the end--of--file character
 in our source alphabet. The best solution to this 
 problem is to use an arithmetic code as described in the next chapter. 
}
\ENDcuttable
\soln{ex.poisonglass}{
 The poisoned glass problem is intended to have the solution `129',
 this being the only number of the form $2^m + 1$
% power--of--two plus one
 between 100 and 200.
 However the optimal strategy, assuming all glasses have equal probability, 
 is to design a Huffman code for the glasses. This produces a binary 
 tree in which each pair of branches have almost equal weight.
 On the first measurement, either
 64 or 65 of the glasses are tested. (Given the 
 assumption  that one of the glasses is poisoned, it makes no difference 
 which; however, going for 65 might be viewed as preferable if there 
 were any uncertainty over this assumption.) There is a 2/129 probability
 that an extra test is needed after seven tests have occurred. So the 
 expected number of tests is 7$\frac{2}{129}$, whereas the 
 strategy of the  professor  takes 8 tests with probability $128/129$
 and one test with probability $1/129$, giving 
 a mean number of tests $7\frac{122}{129}$. The expected waste is $40/43$
 tests.
%  glasses, pairing them
}
\soln{ex.optimalcodep1}{% problem fixed Tue 12/12/00
	There are two ways to answer this problem correctly,
 and one popular way  to answer it incorrectly.
 Let's give the incorrect answer first:
\begin{description}
\item[Erroneous answer.]
 ``We can pick a random bit by first picking a
 random source symbol $x_i$ with probability $p_i$,
 then picking a random bit from $c(x_i)$. If we define $f_i$
 to be the fraction of the bits of $c(x_i)$ that are {\tt 1}s,
 we find
\marginpar[b]{\small
\begin{center}
$C_3$: 
\begin{tabular}{cllc} \toprule
$a_i$ & $c(a_i)$ & $p_i$  & $l_i$ 
\\ \midrule 
{\tt a} & {\tt 0}   & \dhalf     &   1      \\
{\tt b} & {\tt 10}  & \dquarter        &   2      \\
{\tt c} & {\tt 110} & \deighth       &   3      \\
{\tt d} & {\tt 111} & \deighth       &   3      \\
 \bottomrule
\end{tabular}
\end{center}
}
\beqan
	P(\mbox{bit is {\tt 1}}) &=& \sum_i p_i f_i
\label{eq.wrongp1}
	\\ &=&
	\dfrac{1}{2} \times 0 + 
	\dfrac{1}{4} \times \dfrac{1}{2} + 
	\dfrac{1}{8} \times \dfrac{2}{3} + 
	\dfrac{1}{8} \times 1
	= \dthird \mbox{.''}
\eeqan
\end{description}
 This answer is wrong because it falls for the \ind{bus-stop fallacy},\index{paradoxes}
 which was introduced in \exerciseref{ex.waitbus}: if buses arrive
 at random, and we are interested in `the average time from  one bus until
 the next', we must distinguish two possible averages:
 (a) the average time from a randomly chosen bus until the next;
 (b) the average time between the bus you just missed and the next bus.
 The second `average' is twice as big as the first because,
 by waiting for a bus at a random time, you bias your selection of
 a bus in favour of buses that follow a large gap. You're unlikely
 to catch a bus that comes 10 seconds after a preceding bus!
 Similarly, the symbols {\tt c} and {\tt d} get encoded into
 longer-length binary strings than {\tt a}, so when we pick a bit
 from the compressed string at random, we are more likely
 to land in a bit belonging to a {\tt c} or a {\tt d}
 than would be given by the probabilities $p_i$ in the
 expectation (\ref{eq.wrongp1}). All the probabilities need to
 be scaled up by $l_i$, and renormalised.
\begin{description}
\item[Correct answer in the same style.]
 Every time symbol $x_i$ is encoded, $l_i$ bits
 are added to the binary string, of which $f_i l_i$ are {\tt 1}s.
 The expected number of  {\tt 1}s added per symbol is
\beq
	\sum_i p_i f_i l_i ;
\eeq
 and the expected total number of bits added per symbol is
\beq
	\sum_i p_i  l_i .
\eeq
 So the fraction of {\tt 1}s in the transmitted string is
\beqan
	P(\mbox{bit is {\tt 1}}) &=& \frac{ \sum_i p_i f_i l_i }{ \sum_i p_i  l_i }
\label{eq.rightp1}
	\\ &=&
\frac{	\dfrac{1}{2} \times 0 + 
	\dfrac{1}{4} \times 1 + 
	\dfrac{1}{8} \times 2 + 
	\dfrac{1}{8} \times 3
}{ \dfrac{7}{4} }
	= \frac{\dfrac{7}{8}}{\dfrac{7}{4}}  = 1/2  .
\nonumber
\eeqan
\end{description}
 For a general symbol code and a general ensemble,
 the expectation (\ref{eq.rightp1}) is the correct answer.
 But in this case, there is a more powerful argument
 we can use.
\begin{description}
\item[Information-theoretic answer.]
 The encoded string $\bc$ is the output of
 an optimal compressor that compresses samples from
 $X$ down to an expected length of $H(X)$ bits. We can't expect to compress
 this data any further. But if the probability $P(\mbox{bit is {\tt 1}})$
 were not equal to $\dhalf$ then it {\em would\/} be possible to compress
 the binary string further (using a block compression code, say).
 Therefore  $P(\mbox{bit is {\tt 1}})$
 must be equal to $\dhalf$; indeed the probability of any sequence
 of $l$ bits in the compressed stream taking on any particular
 value must be $2^{-l}$.  The output of a perfect compressor is always
 perfectly random bits.

\begincuttable
 To put it another way, if the probability $P(\mbox{bit is {\tt 1}})$
 were not equal to $\dhalf$, then the information content per bit of
 the compressed string would be at most $H_2( P(\mbox{{\tt 1}}) )$,
 which would be less than 1;
 but this contradicts the fact that we can recover the original data
 from $\bc$, so the information content per bit of the
 compressed string must be $H(X)/L(C,X)=1$.
\ENDcuttable
\end{description}
}
%
% this one is a new addition 
%
\soln{ex.Huffmanqary}{ The \index{Huffman code!general alphabet}{general Huffman coding algorithm} for 
 an encoding alphabet with $q$ symbols
 has one difference from the binary case. 
 The process of combining $q$ symbols into 
 1 symbol reduces the number of symbols by $q\!-\!1$. 
 So if we start with $A$ symbols, we'll only end up 
 with a complete $q$--ary tree if $A \mod (q\!-\!1)$ is equal 
 to 1. 
 Otherwise, we know that whatever prefix code we make, it 
 will be an incomplete tree with a number of missing 
 leaves equal, modulo $(q\!-\!1)$, to  $A \mod (q\!-\!1) - 1$. 
 For example, if a ternary tree is built for eight symbols, 
 then there will unavoidably be one missing leaf in the tree.

 The optimal $q$--ary code is made by putting these 
 extra leaves in the longest branch of the tree. This can be achieved
 by adding the appropriate number of symbols to the original source 
 symbol set, all of these extra symbols having probability zero. 
 The total number of states is then equal to $r(q\!-\!1)+1$, for some
 integer $r$.
 The symbols are then repeatedly combined by taking 
 the $q$ symbols with smallest probability and replacing them 
 by a single symbol, as in the binary Huffman coding algorithm.}

\soln{ex.mixsubopt}{ 
%This is important but I haven't written it yet. 
 We wish to show that a greedy \ind{metacode}, which 
 picks the code which gives the shortest encoding, is 
 actually suboptimal, because it fails to satisfy the Kraft 
 inequality with equality: its Kraft sum is strictly less than 1. 

% For generality, let's call the 
% that the objects to be encoded, 
% $x$, `symbols'.
 We'll assume that each symbol $x$ is
 assigned lengths $l_k(x)$ by each of the candidate codes $C_k$. 
 Let us assume there are $K$ alternative codes and that we can 
 encode which code is being used with a header of length $\log_2 K$
 bits. 
 Then the metacode assigns lengths $l'(x)$ that are given by 
\beq
        l'(x) = \log_2 K + \min_k l_k(x) .
\eeq
 We compute the Kraft sum:
\beq
 S = \sum_x 2^{- l'(x)}
                = \frac{1}{K}  \sum_x 2^{- \min_k l_k(x)} .
\eeq 
 Let's divide the set $\A_X$ into non--overlapping  subsets $\{\A_k\}_{k=1}^{K}$ 
 such that  subset $\A_k$ contains all the symbols  $x$ 
 that  the metacode sends via code $k$.
 Then 
\beq
        S = \frac{1}{K} \sum_k \sum_{x \in \A_{k}}  2^{- l_k(x)} .
\eeq
 Now if one sub--code $k$ satisfies the Kraft equality
 $\sum_{x\in \A_X} 2^{- l_k(x)}  \eq 1$, then 
 it must be the case that 
\beq
 \sum_{x \in \A_{k}}  2^{- l_k(x)}  \leq 1 , 
\label{eq.from.kraft}
\eeq
 with equality only if all the symbols $x$ are in $\A_k$, which would mean that we
 are only using one of the $K$ codes. 
 So
\beq
        S \leq \frac{1}{K} \sum_{k=1}^K 1 = 1 ,
\eeq
 with equality only if \eqref{eq.from.kraft} is an equality for all codes $k$. 
 But   it's impossible for all the symbols to be in {\em all\/} the 
 non--overlapping  subsets $\{\A_k\}_{k=1}^{K}$, so 
 we can't have equality  (\ref{eq.from.kraft}) holding 
 for {\em all\/} $k$.
 So
\beq
        S < 1 .
\eeq

 Another way of seeing that a mixture code is suboptimal is to consider
 the binary tree that it defines.  Think of the special case of two
 codes.  The first bit we send identifies which code we are using.
 Now, in a complete code, any subsequent binary string is a valid
 string. But once we know that we are using, say, code A, we know that
 what follows can only be a codeword corresponding to a symbol $x$
 whose encoding is shorter under code A than code B.  So some strings
 are invalid continuations, and the mixture code is incomplete
  and suboptimal. 
 
%%% MAYBE!!!!!!!!!!!!!!
 We will  further discuss this issue
 and its relationship to probabilistic modelling 
 in the chapter on `\ind{bits back} coding'.
}

% \dvipsb{solutions 3}
\prechapter{About      Chapter}
\fakesection{prerequisites for chapter known as 4}
 Before reading chapter \chfour, you should have read  the previous chapter
 and worked on 
 most of the exercises in it.

 We'll also make use of some Bayesian modelling ideas
 that arrived in the vicinity of \exerciseref{ex.postpa}.

% Arithmetic coding has been invented several times,
% by Elias, by Rissanen, and 
% but is only slowly becoming well known

 {The description of Lempel-Ziv coding  is based on that of Cover and Thomas (1991).}

%\chapter{Data Compression III: Stream Codes}
\mysetcounter{page}{126}
\chapter{Stream Codes}
\label{ch.four}
\label{ch.ac}
\addtopic{3}{infotheory}
\addtopic{3}{probability}
\addtopic{1}{inference}
%\addtopic{3}{computation}
%\addtrack{1}{inferencecourse}
\addtrack{3}{infotheorycourse}
\addtrack{3}{itprnncourse}
% _l4.tex  
\fakesection{Data Compression III: Stream Codes}
%
% still need to change notation for R(|)
%
\label{ch4}
 In this chapter we discuss  two data
 compression schemes.\index{source code!stream codes|(}\index{stream codes|(}
%% that constitute the state of the art. 

 {\dem\indexs{arithmetic coding}Arithmetic coding}
 is a beautiful   method that goes 
 hand in hand with the philosophy that compression of data 
 from a source entails
 probabilistic modelling of that source. As of 1999, 
 the best compression methods for text files use arithmetic coding,
 and several state-of-the-art image compression systems
 use it too.

  {\dem\ind{Lempel-Ziv coding}} is a `universal' method, 
%  in my opinion an ugly hack, but 
 designed under the philosophy that we would like a single compression
 algorithm that will do a reasonable job for {\em any\/} source.
 In fact, for many real
 life sources, this
 algorithm's universal properties hold only 
 in the limit of unfeasibly large amounts of data, but,
 all the same, Lempel-Ziv compression is   widely used
 and often  effective.

\section{The guessing game}
\label{sec.startofch4}
 As a motivation for these\index{game!guessing}
 two compression  methods,
 let us consider the redundancy in a 
 typical
% imagine compressing a 
 \ind{English} text file. Such files have redundancy at several levels: for example,
 they contain the ASCII characters with non-equal frequency; certain consecutive
 pairs  of letters are more probable than others; and entire words 
 can be predicted given the context and a semantic understanding
 of the text.

 To illustrate the redundancy of English, and a curious way in which 
 it could be compressed, we can imagine a \ind{guessing game}
 in which an English speaker repeatedly
 attempts to predict the next character
 in a text file. 

% \subsection{The guessing game}
\label{sec.guessing}
% Could discuss the compression of English text by guessing 
 For simplicity, let us assume that the allowed alphabet consists
 of
 the 26 upper case letters {\tt  A,B,C,\ldots, Z} and a space `{\tt -}'.
 The game involves asking the subject to guess the next character
 repeatedly, the only feedback being whether the guess is correct
 or not, until the character is correctly guessed. 
 After a correct guess, we note the number of guesses that 
 were made when the character was identified, and ask the subject
 to guess the next character in the same way. 

 One sentence 
% given by Shannon
 gave the following result when a human was asked to guess it.
%  in a guessing game.
 The numbers of guesses 
 are listed below each character.
%  and the idea of having an identical twin. This introduces the idea 
%  of mapping to a different alphabet with nonuniform probability.
%  The guessing game. From Shannon.
\begin{center}
%\begin{tabular}{*{36}{c@{\,\,}}}
\begin{tabular}{*{36}{p{0.15in}@{}}}
\small\tt
T&\small\tt H&\small\tt E&\small\tt R&\small\tt E&\small\tt -&\small\tt I&\small\tt S&\small\tt -&\small\tt N&\small\tt O&\small\tt -&\small\tt R&\small\tt E&\small\tt V&\small\tt E&\small\tt R&\small\tt S&\small\tt E&\small\tt -&\small\tt O&\small\tt N&\small\tt -&\small\tt A&\small\tt -&\small\tt M&\small\tt O&\small\tt T&\small\tt O&\small\tt R&\small\tt C&\small\tt Y&\small\tt C&\small\tt L&\small\tt E&\small\tt -\\
\footnotesize
1&\footnotesize 1&\footnotesize 1&\footnotesize 5&\footnotesize 1&\footnotesize 1&\footnotesize 2&\footnotesize 1&\footnotesize 1&\footnotesize 2&\footnotesize 1&\footnotesize 1&\footnotesize \hspace{-0.05in}1\hspace{-0.25mm}5&\footnotesize 1&\footnotesize \hspace{-0.05in}1\hspace{-0.25mm}7&\footnotesize 1&\footnotesize 1&\footnotesize 1&\footnotesize 2&\footnotesize 1&\footnotesize 3&\footnotesize 2&\footnotesize 1&\footnotesize 2&\footnotesize 2&\footnotesize 7&\footnotesize 1&\footnotesize 1&\footnotesize 1&\footnotesize 1&\footnotesize 4&\footnotesize 1&\footnotesize 1&\footnotesize 1&\footnotesize 1&\footnotesize 1\\
\end{tabular}
\end{center}
 Notice that in many cases, the next letter is guessed immediately, in one guess.
 In other cases, particularly at the start of syllables,
 more guesses are needed.

 What do this game and these results offer us?
 First, they demonstrate the redundancy of English from the point of
 view of an English speaker. 
 Second, this game might be used in
 a data compression scheme, as follows.

% encoding
 The string of numbers `1, 1, 1, 5, 1, \ldots', listed above,
 was obtained by presenting
 the text to the subject. The maximum number of guesses that the
 subject will make for a given letter is twenty-seven, so what the subject is
 doing for us is performing a time-varying mapping of the twenty-seven letters
 $\{ {\tt A,B,C,\ldots, Z,-}\}$ onto the twenty-seven numbers $\{1,2,3,\ldots,
 27\}$, which we can view as symbols in a new alphabet. The total number of
 symbols has not been reduced, but since he uses some of
 these symbols much more frequently than others -- for example, 1 and
 2 -- it should be  easy to compress this new string of
 symbols.
% ; we will discuss data compression
%% the details  of how to do this
% properly  shortly. 

% decoding
 How would the {\em uncompression\/} of the sequence of numbers
 `1, 1, 1, 5, 1, \ldots' work? At uncompression time,
 we do not have the original string `{\small\tt{THERE}}\ldots', we just 
 have the encoded sequence. Imagine that our subject has an
 absolutely \ind{identical twin}\index{twin}
%({\em absolutely\/} identical)
 who also
 plays the guessing game\index{guessing game} with us, as if we
%, the experimenters,
 knew the source text.
 If we stop him whenever he has made a
 number of guesses equal to the given number, then he will have just 
 guessed the correct letter, and we can then say `yes, that's right', 
 and move to the next character.
 Alternatively, if the identical twin is not available, we could
 design a compression system with the help of just one human as follows.
 We choose a window length $L$, that is, a number of characters of context
 to show the human. For every one of the $27^L$ possible
 strings of length $L$, we ask them, `What would you predict is the next character?',
 and `If that prediction were wrong, what would your next guesses be?'.
 After tabulating their answers to these $26 \times 27^L$ questions,
 we could use two copies of these enormous tables at the encoder and the
 decoder in place of the two human twins.

 These systems are clearly  unrealistic  for practical compression, 
 but they illustrate several principles that we will make use of now.



\section{Arithmetic Codes}
\label{sec.ac}
% In lecture 2 we discussed fixed length block codes. 
 When we discussed variable-length symbol codes, and the optimal 
 Huffman algorithm for constructing them, we concluded by pointing 
 out two practical
 and theoretical problems with Huffman codes (section~\ref{sec.huffman.probs}). 

%
% index decision:  {arithmetic coding} not  {arithmetic codes}
%
        These defects are rectified by {\dem\index{arithmetic coding}{arithmetic codes}}, which
 were  invented by Elias\nocite{EliasACmentionedpages61to62},\index{Elias}
 by \index{Rissanen, Jorma}{Rissanen} and by \ind{Pasco}, 
 and subsequently made practical by
% Witten, Neal, and Cleary. 
 \citeasnoun{arith_coding}.\index{Neal, Radford} 
        In an arithmetic code, the
 probabilistic modelling is clearly separated from the encoding
 operation. 
 The system is rather similar to the guessing game.\index{guessing game}
%  that we considered  in chapter \chtwo.
 The human predictor is replaced by a
 {\dem\ind{probabilistic model}} of the source. 
 As each symbol is produced by the source, the probabilistic model 
 supplies a {\dem\ind{predictive distribution}}
 over all possible values of the next
 symbol, that is, a list of positive numbers $\{ p_i \}$ that sum to 
 one. If we choose to model the source as producing i.i.d.\ symbols with some
 known distribution, 
 then the predictive distribution is the same every time; but arithmetic
 coding can with equal ease handle  complex  adaptive models that produce 
 context-dependent
% time-varying
 predictive distributions.  The predictive  model is usually
 implemented in a computer program.
% a model which hypothesizes arbitrary 
% context-dependencies and non-stationarities, and which learns as it 
% goes, so that predictive distributions in any given context gradually 
% sharpen up. 
% I will give  an example later on, 
% of an adaptive model producing appropriate  probabilities 
% but first let us discuss the arithmetic coding algorithm itself. 

 The encoder makes use of the model's predictions  to create a 
 binary string. The decoder makes use of an identical twin of the 
 model (just as in the guessing \index{guessing game}game) to interpret the binary string.

 Let the source alphabet be $\A_X = \{a_1 ,\ldots, a_I\}$, and let the 
 $I$th symbol $a_I$ have the special meaning `end of transmission'.
 The source
 spits out a sequence $x_1,x_2,\ldots,x_n,\ldots$. The source does {\em not\/}
 necessarily produce i.i.d.\ symbols.
 We will assume that a computer program is provided to the encoder
 that assigns a predictive 
 probability distribution over $a_i$ given the sequence that has occurred 
 thus far, 
 $P(x_n \eq a_i|x_1,\ldots,x_{n-1})$. 
% Nor will we assume that the source
% is correctly modeled by $P$. But if it is, then arithmetic coding achieves 
% the Shannon rate.
%
% The encoder will send a binary transmission to the receiver.
%
        The receiver has  an identical program that produces the 
 same predictive 
 probability distribution $P(x_n \eq a_i|x_1,\ldots,x_{n-1})$.
% and uses  it to interpret the received message.
\medskip

\begin{figure}[htbp]
\figuremargin{%
\begin{center}
\setlength{\unitlength}{1mm}
\begin{picture}(50,40)(0,0)
\put(18,40){\makebox(0,0)[r]{0.00}}
\put(18,30){\makebox(0,0)[r]{0.25}}
\put(18,20){\makebox(0,0)[r]{0.50}}
\put(18,10){\makebox(0,0)[r]{0.75}}
\put(18, 0){\makebox(0,0)[r]{1.00}}
%
% major horizontals
%
\put(20,40){\line(1,0){37}}
\put(20,30){\line(1,0){13}}
\put(20,20){\line(1,0){28}}
\put(20,10){\line(1,0){13}}
\put(20, 0){\line(1,0){37}}
%
% biggest intervals
%
\put(45,30){\vector(0,1){9}}
\put(45,30){\vector(0,-1){9}}
\put(47,30){\makebox(0,0)[l]{{\tt{0}}}}
\put(45,10){\vector(0,1){9}}
\put(45,10){\vector(0,-1){9}}
\put(47,10){\makebox(0,0)[l]{{\tt{1}}}}
%
\put(35,25){\vector(0,1){4}}
\put(35,25){\vector(0,-1){4}}
\put(37,25){\makebox(0,0)[l]{{\tt{01}}}}
% some subdivs
\put(20,35){\line(1,0){7}}
\put(20,25){\line(1,0){7}}
\put(20,15){\line(1,0){7}}
\put(20, 5){\line(1,0){7}}
%
% 01101 = 13/32 = 16.25 
% 01110 = 14/32 = 17.5  
\put(20,23.75){\line(1,0){4}}
\put(20,22.50){\line(1,0){4}}
\put(62,23.125){\makebox(0,0)[l]{{\tt{01101}}}}
%
% interrupted pointer: 
\put(60,23.125){\line(-1,0){14}}
\put(44,23.125){\line(-1,0){8}}
\put(34,23.125){\vector(-1,0){9.5}}
%
\end{picture}
\end{center}
}{%
\caption[a]{Binary strings define real intervals within the real line $[0,1)$.
  We first encountered a picture like this when we discussed the 
 \ind{symbol-code supermarket} in chapter~\ref{ch3}.
}
\label{fig.arith.Rbinary}
}%
\end{figure}
\subsection{Concepts for understanding arithmetic coding}
\begin{aside}
%\item[Notation for intervals.]
 {\sf Notation for intervals.} The interval $[0.01, 0.10)$ is all numbers 
 between $0.01$ and $0.10$, including $0.01\dot{0}\equiv0.01000\ldots$ but not $0.10\dot{0}\equiv0.10000\ldots$.
\end{aside}

        A binary transmission defines an interval within
 the real line from 0 to 1. For example, the string {\tt{01}} is
 interpreted as a binary real number 0.01\ldots, which corresponds to
 the interval  $[0.01, 0.10)$ (binary), \ie, the interval
 $[0.25,0.50)$ (base ten). 

%
% why strange line breaks?
%
 The longer string  {\tt{01101}} corresponds to a smaller
 interval $[0.01101,$ $0.01110)$. Because {\tt{01101}} has the first string, 
 {\tt{01}}, as a prefix, the new interval is a sub-interval
 of the interval $[0.01, 0.10)$.
 A one-megabyte binary file ($2^{23}$ bits) is thus viewed as specifying a number 
 between 0 and 1 to a precision of about two million
% $10^7$
 decimal places -- {two million decimal digits, because
 each byte translates into a little more than two decimal digits.}
% byte = 8 bits ~= 2 digits.
% 
% one meg-byte = 2^3 * 2^20  = 2^23 binary places -> 2.5*10^7  or (2**23=8388608) .
% shall I tell you a bedtime number between 0 and 1 to 10^7 d.p. darling?
%
\medskip

        Similarly, we can divide the real line $[0,1)$ into $I$ intervals of 
 lengths equal to the probabilities $P(x_1 \eq a_i)$, as shown
 in \figref{fig.arith.R}.
%                   upsidedown
% p1 = 6         -- 34   mid: 37  w = 3-1
% p2 = 16 cum 22 -- 18   mid: 26  w = 8-1
% last = 6 cum   --  6   mid:  3  w = 3-1
\newcommand{\aonelevel}{34}
\newcommand{\atwolevel}{18}
\newcommand{\apenlevel}{6}% penultimate
\newcommand{\apenmid}{12}% put dots here
\newcommand{\aonemid}{37}
\newcommand{\aonew}{2}
\newcommand{\atwow}{7}
\newcommand{\atwomid}{26}
\newcommand{\aIw}{2}
\newcommand{\aImid}{3}
\begin{figure}[htbp]
\figuremargin{%
\begin{center}
\setlength{\unitlength}{1mm}
\begin{picture}(50,40)(0,0)
\put(18,40){\makebox(0,0)[r]{0.00}}
\put(18,\aonelevel){\makebox(0,0)[r]{$P(x_1\eq a_1)$}}
\put(18,\atwolevel){\makebox(0,0)[r]{$P(x_1\eq a_1)+P(x_1\eq a_2)$}}
\put(18,\apenlevel){\makebox(0,0)[r]{$P(x_1\eq a_1)+\ldots+P(x_1\eq a_{I\!-\!1})$}}
\put(18, 0){\makebox(0,0)[r]{1.0}}
%
% major horizontals
%
\put(20,40){\line(1,0){37}}
\put(20,\aonelevel){\line(1,0){20}}
\put(20,\atwolevel){\line(1,0){20}}
\put(20,\apenlevel){\line(1,0){20}}
\put(20, 0){\line(1,0){37}}
\put(30,\apenmid){\makebox(0,0)[l]{$\vdots$}}
%
% biggest intervals
%
\put(35,\aonemid){\vector(0,1){\aonew}}
\put(35,\aonemid){\vector(0,-1){\aonew}}
\put(37,\aonemid){\makebox(0,0)[l]{$a_1$}}% or $P(x_1\eq a_1)$}}
\put(35,\atwomid){\vector(0,1){\atwow}}
\put(35,\atwomid){\vector(0,-1){\atwow}}
\put(37,\atwomid){\makebox(0,0)[l]{$a_2$}}% or $P(x_1\eq a_2)$}}
\put(35,\aImid){\vector(0,1){\aIw}}
\put(35,\aImid){\vector(0,-1){\aIw}}
\put(37,\aImid){\makebox(0,0)[l]{$a_I$}}% or $P(x_1\eq a_I)$}}
\put(37,\apenmid){\makebox(0,0)[l]{$\vdots$}}
%
\put(20,23){\line(1,0){4}}% beg of a5
\put(20,20){\line(1,0){4}}% end a5
%
\put(62,21.5){\makebox(0,0)[l]{$a_2 a_5$}}
% interrupted pointer: 
\put(60,21.5){\line(-1,0){24}}
\put(34,21.5){\vector(-1,0){9.5}}
%
% a2a1: 34 is the top
%
\put(20,30){\line(1,0){4}}% end of a1
\put(20,28){\line(1,0){4}}% end of a2
\put(20,25){\line(1,0){4}}% end of a3
%
\put(62,32){\makebox(0,0)[l]{$a_2 a_1$}}
% interrup pointer: 
\put(60,32){\line(-1,0){24}}
\put(34,32){\vector(-1,0){9.5}}
%
\end{picture}
\end{center}
}{%
\caption[a]{A probabilistic model defines real
 intervals within the real line $[0,1)$.}
\label{fig.arith.R}
}%
\end{figure}

 We may then take each interval $a_i$ and subdivide it into intervals
 denoted $a_ia_1,a_ia_2,\ldots, a_ia_I$, such that the length of
 $a_ia_j$ is proportional to $P(x_2 \eq a_j|x_1 \eq a_i)$. Indeed the
 length of the interval $a_ia_j$ will be precisely the joint probability
\beq
	P(x_1 \eq
 a_i,x_2\eq a_j)=P(x_1\eq a_i)P(x_2 \eq a_j|x_1 \eq a_i).
\eeq

        Iterating this procedure, the interval $[0,1)$ can be divided
 into a sequence of intervals corresponding to all possible finite
 length strings $x_1x_2\ldots x_N$, such that the length of an
 interval is equal to the probability of the string given our model.
% This iterative procedure

\subsection{Formulae describing arithmetic coding}
\begin{aside}
 The process depicted in \figref{fig.arith.R} can be written
 explicitly as follows.
 The  intervals are  defined in terms of the lower and upper cumulative probabilities
\beqan
        Q_{n}(a_i|x_1,\ldots,x_{n-1})
 &       \equiv & \sum_{i'\eq 1 }^{i-1} P(x_n \eq a_{i'}|x_1,\ldots,x_{n-1}) ,
\label{eq.arith.Q} \\
        R_{n}(a_i|x_1,\ldots,x_{n-1})
  &      \equiv & \sum_{i'\eq 1 }^{i} P(x_n \eq a_{i'}|x_1,\ldots,x_{n-1}) .
\label{eq.arith.R}
\eeqan
%
 As the $n$th  symbol arrives, we subdivide the $(n-1)$th
 interval at the points defined by $Q_n$ and $R_n$. 
 For example, starting with the first symbol,
 the intervals `$a_1$',  `$a_2$',
% `$a_3$',
 and `$a_I$'  are
% first interval, 
% which we will call
\beq
 a_1 \leftrightarrow  [Q_{1}(a_1),R_{1}(a_1))= [0,P(x_1 \eq a_1)) ,
\eeq
\beq
 a_2 \leftrightarrow [Q_{1}(a_2),R_{1}(a_2))=
 \left[
 P(x_1\eq a_1),P(x_1\eq  a_1)+P(x_1\eq a_2) \right) ,
\eeq
%\beq
% a_3 \leftrightarrow [Q_{1}(a_3),R_{1}(a_3))=
% \left[
% P(x\eq a_1)+P(x\eq a_2) , P(x\eq  a_1)+P(x\eq a_2) +P(x\eq a_3)\right),
%\eeq
 and
\beq
 a_I  \leftrightarrow
 \left[ Q_{1}(a_{I}) , R_{1}(a_{I}) \right)
 = \left[ P(x_1\eq a_1)+\ldots+P(x_1\eq a_{I\!-\!1}) ,1.0 \right) .
\eeq
 Algorithm~\ref{alg.ac} describes the general procedure.
\end{aside}

\begin{algorithm}
\algorithmmargin{%
\begin{center}
\begin{tabular}{l}
%\begin{description}% should be ALGORITHM
%\item[Iterative procedure to find the interval $[u,v)$
% corresponding to
% 	for the string   $x_1x_2\ldots x_N$]
%
 {\tt $u$ := 0.0} \\
 {\tt $v$ := 1.0} \\
 {\tt $p$ := $v-u$} \\
 {\tt for $n$ = 1 to $N$  ( } \\
 \hspace*{0.5in} Compute the cumulative probabilities $Q_n$ and $R_n$
	\protect(\ref{eq.arith.Q},\ref{eq.arith.R})
%       $\{  R_{n}(a_i|x_1,\ldots,x_{n-1})  \}_{i=1}^{I}$
%% $\{ R_{n,i|x_1,\ldots,x_{n-1}} \}_{i=0}^{I}$
%        using \eqref{eq.arith.R} \\
 \\
 \hspace*{0.5in} {\tt $v$ := $u + p R_{n}(x_n|x_1,\ldots,x_{n-1}) $  } \\
 \hspace*{0.5in} {\tt $u$ := $u + p Q_{n}(x_n|x_1,\ldots,x_{n-1}) $  } \\
 \hspace*{0.5in}  {\tt $p$ := $v-u$} \\
 {\tt  ) } \\
% {\tt   \verb+}+ } \\
\end{tabular} 
\end{center}
%\end{description}
}{
\caption[a]{Arithmetic coding.
 Iterative procedure to find the interval $[u,v)$
 	for the string   $x_1x_2\ldots x_N$.
}
\label{alg.ac}
}
\end{algorithm}
        To encode a string $x_1x_2\ldots x_N$,
 we  locate the interval corresponding to $x_1x_2\ldots x_N$, and 
 send a binary string whose interval lies within 
 that interval. This encoding can be performed
 on the fly, as we now illustrate.

% \eof defined in itprnnchapter
\subsection{Example: compressing the tosses of a bent coin}
 Imagine that we watch as a bent coin is tossed some number of times.
 [cf.\ \exampleref{exa.bentcoin} and  \secref{sec.bentcoin}
 (\pref{sec.bentcoin}).]
 The two outcomes when the coin is tossed
 are denoted $\tt a$ and $\tt b$. A third possibility is that the
 experiment is halted, an event denoted by the  `end of file' symbol, `$\eof$'.
 Because the coin is bent, we expect that the probabilities of the outcomes $\tt a$
 and $\tt b$ are not equal, though beforehand we don't know which
 is the more probable outcome.

% Let $\A_X=\{a,b,\eof\}$, where 
% $a$ and $\tb$ make up a binary alphabet with 
% $\eof$ is an `end of file' symbol.
\subsubsection{Encoding\subsubpunc}
 Let the source string be `$\tt bbba\eof$'. We pass along the string one symbol
 at a time and use our model to compute the probability
 distribution of the next symbol given the string thus far.
 Let these probabilities be: 
\[\begin{array}{l*{3}{r@{\eq}l}} \toprule
\mbox{Context } \\
\mbox{(sequence thus far) }
       & \multicolumn{6}{c}{\mbox{Probability of next symbol}} \\[0.05in] \midrule
& P( \ta ) &  0.425 & P( \tb ) &  0.425 & P( \eof ) &  0.15 \\[0.05in]
\tb& P( \ta | \tb ) &   0.28 & P( \tb | \tb ) &   0.57 & P( \eof | \tb ) &   0.15 \\[0.05in]
\tb\tb&P( \ta | \tb\tb ) &   0.21 & P( \tb | \tb\tb ) &   0.64 & P( \eof | \tb\tb ) &   0.15 \\[0.05in]
\tb\tb\tb&P( \ta | \tb\tb\tb ) &   0.17 & P( \tb | \tb\tb\tb ) &   0.68 & P( \eof | \tb\tb\tb ) &   0.15 \\[0.05in]
\tb\tb\tb\ta& P( \ta | \tb\tb\tb\ta ) &   0.28 & P( \tb | \tb\tb\tb\ta ) &   0.57 & P( \eof | \tb\tb\tb\ta ) &   0.15 \\ \bottomrule
\end{array}
\]
 \Figref{fig.ac} shows the corresponding intervals.  The
 interval $\tb$ is the middle 0.425 of $[0,1)$. The interval $\tb\tb$ is the
 middle 0.567 of $\tb$, and so forth.
% in the following figure. 

\begin{figure}[htbp]
\figuremargin{%
\begin{center}
% created by ac.p only_show_data=1 > ac/ac_data.tex   %%%%%%% and edited by hand
\mbox{
\hspace{-0.1in}\small
\setlength{\unitlength}{4.8in}
%\setlength{\unitlength}{5.75in}
\begin{picture}(0.59130434782608698452,1)(-0.29565217391304349226,0)
\thinlines
% line    0.0000 from   -0.5000 to    0.0000 
\put(  -0.2957,   1.0000){\line(1,0){   0.2957}}
% a at   -0.4500,   0.2125
\put(  -0.2811,   0.7875){\makebox(0,0)[r]{\tt{a}}}
% line    0.4250 from   -0.5000 to    0.0000 
\put(  -0.2957,   0.5750){\line(1,0){   0.2957}}
% b at   -0.4500,   0.6375
\put(  -0.2811,   0.3625){\makebox(0,0)[r]{\tt{b}}}
% line    0.8500 from   -0.5000 to    0.0000 
\put(  -0.2957,   0.1500){\line(1,0){   0.2957}}
% \teof at   -0.4500,   0.9250
\put(  -0.2811,   0.0750){\makebox(0,0)[r]{\teof}}
% line    1.0000 from   -0.5000 to    0.0000 
\put(  -0.2957,   0.0000){\line(1,0){   0.2957}}
% ba at   -0.3500,   0.4852
\put(  -0.2220,   0.5148){\makebox(0,0)[r]{\tt{ba}}}
% line    0.5454 from   -0.4500 to    0.0000 
\put(  -0.2661,   0.4546){\line(1,0){   0.2661}}
% bb at   -0.3500,   0.6658
\put(  -0.2220,   0.3342){\makebox(0,0)[r]{\tt{bb}}}
% line    0.7862 from   -0.4500 to    0.0000 
\put(  -0.2661,   0.2138){\line(1,0){   0.2661}}
% b\teof at   -0.3500,   0.8181
\put(  -0.2220,   0.1819){\makebox(0,0)[r]{\tt{b\teof}}}
% bba at   -0.2300,   0.5710
\put(  -0.1510,   0.4290){\makebox(0,0)[r]{\tt{bba}}}
% line    0.5966 from   -0.3500 to    0.0000 
\put(  -0.2070,   0.4034){\line(1,0){   0.2070}}
% bbb at   -0.2300,   0.6734
\put(  -0.1510,   0.3266){\makebox(0,0)[r]{\tt{bbb}}}
% line    0.7501 from   -0.3500 to    0.0000 
\put(  -0.2070,   0.2499){\line(1,0){   0.2070}}
% bb\teof at   -0.2300,   0.7682
\put(  -0.1510,   0.2318){\makebox(0,0)[r]{\tt{bb\teof}}}
% bbba at   -0.1000,   0.6096
\put(  -0.0741,   0.3904){\makebox(0,0)[r]{\tt{bbba}}}
% line    0.6227 from   -0.2300 to    0.0000 
\put(  -0.1360,   0.3773){\line(1,0){   0.1360}}
% bbbb at   -0.1000,   0.6749
\put(  -0.0741,   0.3251){\makebox(0,0)[r]{\tt{bbbb}}}
% line    0.7271 from   -0.2300 to    0.0000 
\put(  -0.1360,   0.2729){\line(1,0){   0.1360}}
% bbb\teof at   -0.1000,   0.7386
\put(  -0.0741,   0.2614){\makebox(0,0)[r]{\tt{bbb\teof}}}
% line    0.6040 from   -0.1000 to    0.0000 
\put(  -0.0591,   0.3960){\line(1,0){   0.0591}}
% line    0.6188 from   -0.1000 to    0.0000 
\put(  -0.0591,   0.3812){\line(1,0){   0.0591}}
% line    0.0000 from    0.0100 to    0.5000 
\put(   0.0059,   1.0000){\line(1,0){   0.2897}}
% 0 at    0.0100,   0.2500
\put(   0.2811,   0.7500){\makebox(0,0)[l]{\tt0}}
% line    0.5000 from    0.0100 to    0.5000 
\put(   0.0059,   0.5000){\line(1,0){   0.2897}}
% 1 at    0.0100,   0.7500
\put(   0.2811,   0.2500){\makebox(0,0)[l]{\tt1}}
% line    1.0000 from    0.0100 to    0.5000 
\put(   0.0059,   0.0000){\line(1,0){   0.2897}}
% 00 at    0.0100,   0.1250
\put(   0.2397,   0.8750){\makebox(0,0)[l]{\tt00}}
% line    0.2500 from    0.0100 to    0.4500 
\put(   0.0059,   0.7500){\line(1,0){   0.2602}}
% 01 at    0.0100,   0.3750
\put(   0.2397,   0.6250){\makebox(0,0)[l]{\tt01}}
% 000 at    0.0100,   0.0625
\put(   0.1806,   0.9375){\makebox(0,0)[l]{\tt000}}
% line    0.1250 from    0.0100 to    0.3800 
\put(   0.0059,   0.8750){\line(1,0){   0.2188}}
% 001 at    0.0100,   0.1875
\put(   0.1806,   0.8125){\makebox(0,0)[l]{\tt001}}
% 0000 at    0.0100,   0.0312
% was at 0.1037, move 0.02 right -> 1207
\put(   0.1207,   0.9688){\makebox(0,0)[l]{\tt0000}}
% line    0.0625 from    0.0100 to    0.2800 
\put(   0.0059,   0.9375){\line(1,0){   0.1597}}
% 0001 at    0.0100,   0.0938
\put(   0.1207,   0.9062){\makebox(0,0)[l]{\tt0001}}
% 00000 at    0.0100,   0.0156
\put(   0.0387,   0.9844){\makebox(0,0)[l]{\tt00000}}
% line    0.0312 from    0.0100 to    0.1500 
\put(   0.0059,   0.9688){\line(1,0){   0.0828}}
% 00001 at    0.0100,   0.0469
\put(   0.0387,   0.9531){\makebox(0,0)[l]{\tt00001}}
% line    0.0156 from    0.0100 to    0.0400 
\put(   0.0059,   0.9844){\line(1,0){   0.0177}}
% line    0.0078 from    0.0100 to    0.0200 
\put(   0.0059,   0.9922){\line(1,0){   0.0059}}
% line    0.0234 from    0.0100 to    0.0200 
\put(   0.0059,   0.9766){\line(1,0){   0.0059}}
% line    0.0469 from    0.0100 to    0.0400 
\put(   0.0059,   0.9531){\line(1,0){   0.0177}}
% line    0.0391 from    0.0100 to    0.0200 
\put(   0.0059,   0.9609){\line(1,0){   0.0059}}
% line    0.0547 from    0.0100 to    0.0200 
\put(   0.0059,   0.9453){\line(1,0){   0.0059}}
% 00010 at    0.0100,   0.0781
\put(   0.0387,   0.9219){\makebox(0,0)[l]{\tt00010}}
% line    0.0938 from    0.0100 to    0.1500 
\put(   0.0059,   0.9062){\line(1,0){   0.0828}}
% 00011 at    0.0100,   0.1094
\put(   0.0387,   0.8906){\makebox(0,0)[l]{\tt00011}}
% line    0.0781 from    0.0100 to    0.0400 
\put(   0.0059,   0.9219){\line(1,0){   0.0177}}
% line    0.0703 from    0.0100 to    0.0200 
\put(   0.0059,   0.9297){\line(1,0){   0.0059}}
% line    0.0859 from    0.0100 to    0.0200 
\put(   0.0059,   0.9141){\line(1,0){   0.0059}}
% line    0.1094 from    0.0100 to    0.0400 
\put(   0.0059,   0.8906){\line(1,0){   0.0177}}
% line    0.1016 from    0.0100 to    0.0200 
\put(   0.0059,   0.8984){\line(1,0){   0.0059}}
% line    0.1172 from    0.0100 to    0.0200 
\put(   0.0059,   0.8828){\line(1,0){   0.0059}}
% 0010 at    0.0100,   0.1562
\put(   0.1207,   0.8438){\makebox(0,0)[l]{\tt0010}}
% line    0.1875 from    0.0100 to    0.2800 
\put(   0.0059,   0.8125){\line(1,0){   0.1597}}
% 0011 at    0.0100,   0.2188
\put(   0.1207,   0.7812){\makebox(0,0)[l]{\tt0011}}
% 00100 at    0.0100,   0.1406
\put(   0.0387,   0.8594){\makebox(0,0)[l]{\tt00100}}
% line    0.1562 from    0.0100 to    0.1500 
\put(   0.0059,   0.8438){\line(1,0){   0.0828}}
% 00101 at    0.0100,   0.1719
\put(   0.0387,   0.8281){\makebox(0,0)[l]{\tt00101}}
% line    0.1406 from    0.0100 to    0.0400 
\put(   0.0059,   0.8594){\line(1,0){   0.0177}}
% line    0.1328 from    0.0100 to    0.0200 
\put(   0.0059,   0.8672){\line(1,0){   0.0059}}
% line    0.1484 from    0.0100 to    0.0200 
\put(   0.0059,   0.8516){\line(1,0){   0.0059}}
% line    0.1719 from    0.0100 to    0.0400 
\put(   0.0059,   0.8281){\line(1,0){   0.0177}}
% line    0.1641 from    0.0100 to    0.0200 
\put(   0.0059,   0.8359){\line(1,0){   0.0059}}
% line    0.1797 from    0.0100 to    0.0200 
\put(   0.0059,   0.8203){\line(1,0){   0.0059}}
% 00110 at    0.0100,   0.2031
\put(   0.0387,   0.7969){\makebox(0,0)[l]{\tt00110}}
% line    0.2188 from    0.0100 to    0.1500 
\put(   0.0059,   0.7812){\line(1,0){   0.0828}}
% 00111 at    0.0100,   0.2344
\put(   0.0387,   0.7656){\makebox(0,0)[l]{\tt00111}}
% line    0.2031 from    0.0100 to    0.0400 
\put(   0.0059,   0.7969){\line(1,0){   0.0177}}
% line    0.1953 from    0.0100 to    0.0200 
\put(   0.0059,   0.8047){\line(1,0){   0.0059}}
% line    0.2109 from    0.0100 to    0.0200 
\put(   0.0059,   0.7891){\line(1,0){   0.0059}}
% line    0.2344 from    0.0100 to    0.0400 
\put(   0.0059,   0.7656){\line(1,0){   0.0177}}
% line    0.2266 from    0.0100 to    0.0200 
\put(   0.0059,   0.7734){\line(1,0){   0.0059}}
% line    0.2422 from    0.0100 to    0.0200 
\put(   0.0059,   0.7578){\line(1,0){   0.0059}}
% 010 at    0.0100,   0.3125
\put(   0.1806,   0.6875){\makebox(0,0)[l]{\tt010}}
% line    0.3750 from    0.0100 to    0.3800 
\put(   0.0059,   0.6250){\line(1,0){   0.2188}}
% 011 at    0.0100,   0.4375
\put(   0.1806,   0.5625){\makebox(0,0)[l]{\tt011}}
% 0100 at    0.0100,   0.2812
\put(   0.1207,   0.7188){\makebox(0,0)[l]{\tt0100}}
% line    0.3125 from    0.0100 to    0.2800 
\put(   0.0059,   0.6875){\line(1,0){   0.1597}}
% 0101 at    0.0100,   0.3438
\put(   0.1207,   0.6562){\makebox(0,0)[l]{\tt0101}}
% 01000 at    0.0100,   0.2656
\put(   0.0387,   0.7344){\makebox(0,0)[l]{\tt01000}}
% line    0.2812 from    0.0100 to    0.1500 
\put(   0.0059,   0.7188){\line(1,0){   0.0828}}
% 01001 at    0.0100,   0.2969
\put(   0.0387,   0.7031){\makebox(0,0)[l]{\tt01001}}
% line    0.2656 from    0.0100 to    0.0400 
\put(   0.0059,   0.7344){\line(1,0){   0.0177}}
% line    0.2578 from    0.0100 to    0.0200 
\put(   0.0059,   0.7422){\line(1,0){   0.0059}}
% line    0.2734 from    0.0100 to    0.0200 
\put(   0.0059,   0.7266){\line(1,0){   0.0059}}
% line    0.2969 from    0.0100 to    0.0400 
\put(   0.0059,   0.7031){\line(1,0){   0.0177}}
% line    0.2891 from    0.0100 to    0.0200 
\put(   0.0059,   0.7109){\line(1,0){   0.0059}}
% line    0.3047 from    0.0100 to    0.0200 
\put(   0.0059,   0.6953){\line(1,0){   0.0059}}
% 01010 at    0.0100,   0.3281
\put(   0.0387,   0.6719){\makebox(0,0)[l]{\tt01010}}
% line    0.3438 from    0.0100 to    0.1500 
\put(   0.0059,   0.6562){\line(1,0){   0.0828}}
% 01011 at    0.0100,   0.3594
\put(   0.0387,   0.6406){\makebox(0,0)[l]{\tt01011}}
% line    0.3281 from    0.0100 to    0.0400 
\put(   0.0059,   0.6719){\line(1,0){   0.0177}}
% line    0.3203 from    0.0100 to    0.0200 
\put(   0.0059,   0.6797){\line(1,0){   0.0059}}
% line    0.3359 from    0.0100 to    0.0200 
\put(   0.0059,   0.6641){\line(1,0){   0.0059}}
% line    0.3594 from    0.0100 to    0.0400 
\put(   0.0059,   0.6406){\line(1,0){   0.0177}}
% line    0.3516 from    0.0100 to    0.0200 
\put(   0.0059,   0.6484){\line(1,0){   0.0059}}
% line    0.3672 from    0.0100 to    0.0200 
\put(   0.0059,   0.6328){\line(1,0){   0.0059}}
% 0110 at    0.0100,   0.4062
\put(   0.1207,   0.5938){\makebox(0,0)[l]{\tt0110}}
% line    0.4375 from    0.0100 to    0.2800 
\put(   0.0059,   0.5625){\line(1,0){   0.1597}}
% 0111 at    0.0100,   0.4688
\put(   0.1207,   0.5312){\makebox(0,0)[l]{\tt0111}}
% 01100 at    0.0100,   0.3906
\put(   0.0387,   0.6094){\makebox(0,0)[l]{\tt01100}}
% line    0.4062 from    0.0100 to    0.1500 
\put(   0.0059,   0.5938){\line(1,0){   0.0828}}
% 01101 at    0.0100,   0.4219
\put(   0.0387,   0.5781){\makebox(0,0)[l]{\tt01101}}
% line    0.3906 from    0.0100 to    0.0400 
\put(   0.0059,   0.6094){\line(1,0){   0.0177}}
% line    0.3828 from    0.0100 to    0.0200 
\put(   0.0059,   0.6172){\line(1,0){   0.0059}}
% line    0.3984 from    0.0100 to    0.0200 
\put(   0.0059,   0.6016){\line(1,0){   0.0059}}
% line    0.4219 from    0.0100 to    0.0400 
\put(   0.0059,   0.5781){\line(1,0){   0.0177}}
% line    0.4141 from    0.0100 to    0.0200 
\put(   0.0059,   0.5859){\line(1,0){   0.0059}}
% line    0.4297 from    0.0100 to    0.0200 
\put(   0.0059,   0.5703){\line(1,0){   0.0059}}
% 01110 at    0.0100,   0.4531
\put(   0.0387,   0.5469){\makebox(0,0)[l]{\tt01110}}
% line    0.4688 from    0.0100 to    0.1500 
\put(   0.0059,   0.5312){\line(1,0){   0.0828}}
% 01111 at    0.0100,   0.4844
\put(   0.0387,   0.5156){\makebox(0,0)[l]{\tt01111}}
% line    0.4531 from    0.0100 to    0.0400 
\put(   0.0059,   0.5469){\line(1,0){   0.0177}}
% line    0.4453 from    0.0100 to    0.0200 
\put(   0.0059,   0.5547){\line(1,0){   0.0059}}
% line    0.4609 from    0.0100 to    0.0200 
\put(   0.0059,   0.5391){\line(1,0){   0.0059}}
% line    0.4844 from    0.0100 to    0.0400 
\put(   0.0059,   0.5156){\line(1,0){   0.0177}}
% line    0.4766 from    0.0100 to    0.0200 
\put(   0.0059,   0.5234){\line(1,0){   0.0059}}
% line    0.4922 from    0.0100 to    0.0200 
\put(   0.0059,   0.5078){\line(1,0){   0.0059}}
% 10 at    0.0100,   0.6250
\put(   0.2397,   0.3750){\makebox(0,0)[l]{\tt10}}
% line    0.7500 from    0.0100 to    0.4500 
\put(   0.0059,   0.2500){\line(1,0){   0.2602}}
% 11 at    0.0100,   0.8750
\put(   0.2397,   0.1250){\makebox(0,0)[l]{\tt11}}
% 100 at    0.0100,   0.5625
\put(   0.1806,   0.4375){\makebox(0,0)[l]{\tt100}}
% line    0.6250 from    0.0100 to    0.3800 
\put(   0.0059,   0.3750){\line(1,0){   0.2188}}
% 101 at    0.0100,   0.6875
\put(   0.1806,   0.3125){\makebox(0,0)[l]{\tt101}}
% 1000 at    0.0100,   0.5312
\put(   0.1207,   0.4688){\makebox(0,0)[l]{\tt1000}}
% line    0.5625 from    0.0100 to    0.2800 
\put(   0.0059,   0.4375){\line(1,0){   0.1597}}
% 1001 at    0.0100,   0.5938
\put(   0.1207,   0.4062){\makebox(0,0)[l]{\tt1001}}
% 10000 at    0.0100,   0.5156
\put(   0.0387,   0.4844){\makebox(0,0)[l]{\tt10000}}
% line    0.5312 from    0.0100 to    0.1500 
\put(   0.0059,   0.4688){\line(1,0){   0.0828}}
% 10001 at    0.0100,   0.5469
\put(   0.0387,   0.4531){\makebox(0,0)[l]{\tt10001}}
% line    0.5156 from    0.0100 to    0.0400 
\put(   0.0059,   0.4844){\line(1,0){   0.0177}}
% line    0.5078 from    0.0100 to    0.0200 
\put(   0.0059,   0.4922){\line(1,0){   0.0059}}
% line    0.5234 from    0.0100 to    0.0200 
\put(   0.0059,   0.4766){\line(1,0){   0.0059}}
% line    0.5469 from    0.0100 to    0.0400 
\put(   0.0059,   0.4531){\line(1,0){   0.0177}}
% line    0.5391 from    0.0100 to    0.0200 
\put(   0.0059,   0.4609){\line(1,0){   0.0059}}
% line    0.5547 from    0.0100 to    0.0200 
\put(   0.0059,   0.4453){\line(1,0){   0.0059}}
% 10010 at    0.0100,   0.5781
\put(   0.0387,   0.4219){\makebox(0,0)[l]{\tt10010}}
% line    0.5938 from    0.0100 to    0.1500 
\put(   0.0059,   0.4062){\line(1,0){   0.0828}}
% 10011 at    0.0100,   0.6094
\put(   0.0387,   0.3906){\makebox(0,0)[l]{\tt10011}}
% line    0.5781 from    0.0100 to    0.0400 
\put(   0.0059,   0.4219){\line(1,0){   0.0177}}
% line    0.5703 from    0.0100 to    0.0200 
\put(   0.0059,   0.4297){\line(1,0){   0.0059}}
% line    0.5859 from    0.0100 to    0.0200 
\put(   0.0059,   0.4141){\line(1,0){   0.0059}}
% line    0.6094 from    0.0100 to    0.0400 
\put(   0.0059,   0.3906){\line(1,0){   0.0177}}
% line    0.6016 from    0.0100 to    0.0200 
\put(   0.0059,   0.3984){\line(1,0){   0.0059}}
% line    0.6172 from    0.0100 to    0.0200 
\put(   0.0059,   0.3828){\line(1,0){   0.0059}}
% 1010 at    0.0100,   0.6562
\put(   0.1207,   0.3438){\makebox(0,0)[l]{\tt1010}}
% line    0.6875 from    0.0100 to    0.2800 
\put(   0.0059,   0.3125){\line(1,0){   0.1597}}
% 1011 at    0.0100,   0.7188
\put(   0.1207,   0.2812){\makebox(0,0)[l]{\tt1011}}
% 10100 at    0.0100,   0.6406
\put(   0.0387,   0.3594){\makebox(0,0)[l]{\tt10100}}
% line    0.6562 from    0.0100 to    0.1500 
\put(   0.0059,   0.3438){\line(1,0){   0.0828}}
% 10101 at    0.0100,   0.6719
\put(   0.0387,   0.3281){\makebox(0,0)[l]{\tt10101}}
% line    0.6406 from    0.0100 to    0.0400 
\put(   0.0059,   0.3594){\line(1,0){   0.0177}}
% line    0.6328 from    0.0100 to    0.0200 
\put(   0.0059,   0.3672){\line(1,0){   0.0059}}
% line    0.6484 from    0.0100 to    0.0200 
\put(   0.0059,   0.3516){\line(1,0){   0.0059}}
% line    0.6719 from    0.0100 to    0.0400 
\put(   0.0059,   0.3281){\line(1,0){   0.0177}}
% line    0.6641 from    0.0100 to    0.0200 
\put(   0.0059,   0.3359){\line(1,0){   0.0059}}
% line    0.6797 from    0.0100 to    0.0200 
\put(   0.0059,   0.3203){\line(1,0){   0.0059}}
% 10110 at    0.0100,   0.7031
\put(   0.0387,   0.2969){\makebox(0,0)[l]{\tt10110}}
% line    0.7188 from    0.0100 to    0.1500 
\put(   0.0059,   0.2812){\line(1,0){   0.0828}}
% 10111 at    0.0100,   0.7344
\put(   0.0387,   0.2656){\makebox(0,0)[l]{\tt10111}}
% line    0.7031 from    0.0100 to    0.0400 
\put(   0.0059,   0.2969){\line(1,0){   0.0177}}
% line    0.6953 from    0.0100 to    0.0200 
\put(   0.0059,   0.3047){\line(1,0){   0.0059}}
% line    0.7109 from    0.0100 to    0.0200 
\put(   0.0059,   0.2891){\line(1,0){   0.0059}}
% line    0.7344 from    0.0100 to    0.0400 
\put(   0.0059,   0.2656){\line(1,0){   0.0177}}
% line    0.7266 from    0.0100 to    0.0200 
\put(   0.0059,   0.2734){\line(1,0){   0.0059}}
% line    0.7422 from    0.0100 to    0.0200 
\put(   0.0059,   0.2578){\line(1,0){   0.0059}}
% 110 at    0.0100,   0.8125
\put(   0.1806,   0.1875){\makebox(0,0)[l]{\tt110}}
% line    0.8750 from    0.0100 to    0.3800 
\put(   0.0059,   0.1250){\line(1,0){   0.2188}}
% 111 at    0.0100,   0.9375
\put(   0.1806,   0.0625){\makebox(0,0)[l]{\tt111}}
% 1100 at    0.0100,   0.7812
\put(   0.1207,   0.2188){\makebox(0,0)[l]{\tt1100}}
% line    0.8125 from    0.0100 to    0.2800 
\put(   0.0059,   0.1875){\line(1,0){   0.1597}}
% 1101 at    0.0100,   0.8438
\put(   0.1207,   0.1562){\makebox(0,0)[l]{\tt1101}}
% 11000 at    0.0100,   0.7656
\put(   0.0387,   0.2344){\makebox(0,0)[l]{\tt11000}}
% line    0.7812 from    0.0100 to    0.1500 
\put(   0.0059,   0.2188){\line(1,0){   0.0828}}
% 11001 at    0.0100,   0.7969
\put(   0.0387,   0.2031){\makebox(0,0)[l]{\tt11001}}
% line    0.7656 from    0.0100 to    0.0400 
\put(   0.0059,   0.2344){\line(1,0){   0.0177}}
% line    0.7578 from    0.0100 to    0.0200 
\put(   0.0059,   0.2422){\line(1,0){   0.0059}}
% line    0.7734 from    0.0100 to    0.0200 
\put(   0.0059,   0.2266){\line(1,0){   0.0059}}
% line    0.7969 from    0.0100 to    0.0400 
\put(   0.0059,   0.2031){\line(1,0){   0.0177}}
% line    0.7891 from    0.0100 to    0.0200 
\put(   0.0059,   0.2109){\line(1,0){   0.0059}}
% line    0.8047 from    0.0100 to    0.0200 
\put(   0.0059,   0.1953){\line(1,0){   0.0059}}
% 11010 at    0.0100,   0.8281
\put(   0.0387,   0.1719){\makebox(0,0)[l]{\tt11010}}
% line    0.8438 from    0.0100 to    0.1500 
\put(   0.0059,   0.1562){\line(1,0){   0.0828}}
% 11011 at    0.0100,   0.8594
\put(   0.0387,   0.1406){\makebox(0,0)[l]{\tt11011}}
% line    0.8281 from    0.0100 to    0.0400 
\put(   0.0059,   0.1719){\line(1,0){   0.0177}}
% line    0.8203 from    0.0100 to    0.0200 
\put(   0.0059,   0.1797){\line(1,0){   0.0059}}
% line    0.8359 from    0.0100 to    0.0200 
\put(   0.0059,   0.1641){\line(1,0){   0.0059}}
% line    0.8594 from    0.0100 to    0.0400 
\put(   0.0059,   0.1406){\line(1,0){   0.0177}}
% line    0.8516 from    0.0100 to    0.0200 
\put(   0.0059,   0.1484){\line(1,0){   0.0059}}
% line    0.8672 from    0.0100 to    0.0200 
\put(   0.0059,   0.1328){\line(1,0){   0.0059}}
% 1110 at    0.0100,   0.9062
\put(   0.1207,   0.0938){\makebox(0,0)[l]{\tt1110}}
% line    0.9375 from    0.0100 to    0.2800 
\put(   0.0059,   0.0625){\line(1,0){   0.1597}}
% 1111 at    0.0100,   0.9688
\put(   0.1207,   0.0312){\makebox(0,0)[l]{\tt1111}}
% 11100 at    0.0100,   0.8906
\put(   0.0387,   0.1094){\makebox(0,0)[l]{\tt11100}}
% line    0.9062 from    0.0100 to    0.1500 
\put(   0.0059,   0.0938){\line(1,0){   0.0828}}
% 11101 at    0.0100,   0.9219
\put(   0.0387,   0.0781){\makebox(0,0)[l]{\tt11101}}
% line    0.8906 from    0.0100 to    0.0400 
\put(   0.0059,   0.1094){\line(1,0){   0.0177}}
% line    0.8828 from    0.0100 to    0.0200 
\put(   0.0059,   0.1172){\line(1,0){   0.0059}}
% line    0.8984 from    0.0100 to    0.0200 
\put(   0.0059,   0.1016){\line(1,0){   0.0059}}
% line    0.9219 from    0.0100 to    0.0400 
\put(   0.0059,   0.0781){\line(1,0){   0.0177}}
% line    0.9141 from    0.0100 to    0.0200 
\put(   0.0059,   0.0859){\line(1,0){   0.0059}}
% line    0.9297 from    0.0100 to    0.0200 
\put(   0.0059,   0.0703){\line(1,0){   0.0059}}
% 11110 at    0.0100,   0.9531
\put(   0.0387,   0.0469){\makebox(0,0)[l]{\tt11110}}
% line    0.9688 from    0.0100 to    0.1500 
\put(   0.0059,   0.0312){\line(1,0){   0.0828}}
% 11111 at    0.0100,   0.9844
\put(   0.0387,   0.0156){\makebox(0,0)[l]{\tt11111}}
% line    0.9531 from    0.0100 to    0.0400 
\put(   0.0059,   0.0469){\line(1,0){   0.0177}}
% line    0.9453 from    0.0100 to    0.0200 
\put(   0.0059,   0.0547){\line(1,0){   0.0059}}
% line    0.9609 from    0.0100 to    0.0200 
\put(   0.0059,   0.0391){\line(1,0){   0.0059}}
% line    0.9844 from    0.0100 to    0.0400 
\put(   0.0059,   0.0156){\line(1,0){   0.0177}}
% line    0.9766 from    0.0100 to    0.0200 
\put(   0.0059,   0.0234){\line(1,0){   0.0059}}
% line    0.9922 from    0.0100 to    0.0200 
\put(   0.0059,   0.0078){\line(1,0){   0.0059}}
\end{picture}

% Negative horizontal skip: shifts the raised picture that follows 0.04in to
% the left.  The trailing % must stay -- it keeps the end of this line from
% inserting an inter-word space ahead of the \raisebox on the next line.
\hspace{-0.04in}% was -.25
\raisebox{1.1895in}{% was 1.425
\setlength{\unitlength}{33.39in}
%\setlength{\unitlength}{40in}
\begin{picture}(0.085,0.04)(-0.0425,0.37)
\thinlines
% 
% wings added by hand
\put(  -0.0408 ,   0.4082){\line(-1,-3){   0.005}}
\put(  -0.0408 ,   0.3730){\line(-1,3){   0.005}}
%
% arrow identifying the final interval added by hand
% the center of the interval is 0010 below this point
% 10011110  (0.3809)
% 0.0017 is the length of the stubby lines
%
% want vector's tip to end at height 0.37995 and x=0.0010
% 4*34 = 136 -> 36635
% this was perfectly positioned
%\put(   0.0040,   0.36635){\makebox(0,0)[tl]{\tt100111101}}
%\put(   0.0044,   0.36635){\vector(-1,4){0.0034}}
% but I shifted it to this for arty reasons
\put(   0.0048,   0.36635)