%Communication Complexity Scribe Notes template
\ifx\CompleteCourse\relax
\ClassScribeSetupA
\else
\documentclass[11pt]{article}
\newtheorem{theorem}{Theorem}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{corollary}[theorem]{Corollary}
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{homework}{Homework}
\newenvironment{definition}{\begin{trivlist}\item[]{\bf Definition}\ }%
{\end{trivlist}}
\newenvironment{fact}{\begin{trivlist}\item[]{\bf Fact}\ }%
{\end{trivlist}}
\newenvironment{example}{\begin{trivlist}\item[]{\bf Example}\ }%
{\end{trivlist}}
\newenvironment{proof}{\begin{trivlist}\item[]{\bf Proof}\ }%
{\end{trivlist}}
% Make the page large
\addtolength{\textwidth}{1.50in}
\addtolength{\textheight}{1.00in}
\addtolength{\evensidemargin}{-0.75in}
\addtolength{\oddsidemargin}{-0.75in}
\addtolength{\topmargin}{-.50in}
% \vdashsub{X} makes a turnstyle with subscript "X"
% \vdashsup{X} makes a turnstyle with superscript "X"
\newdimen\srbdimenA
\newcommand{\vdashsupsub}[2]{ \mathop{
\setbox251 = \hbox{$\scriptstyle #1$}
\setbox252 = \hbox{$\scriptstyle #2$}
\ifdim \wd251<\wd252 \srbdimenA = \wd252 \else \srbdimenA = \wd251 \fi
\setbox255 = \hbox {${\srbAvdash \vphantom( \kern -\srbdimenA \kern +.05em}
^{\hbox to\srbdimenA{\hfill \box251\hfill}}
_{\hbox to\srbdimenA{\hfill \box252\hfill}}$}
\box255 \kern .05em}}
\newcommand{\srbAvdash}{\hbox{ \vrule height1.4ex width0.02em
\dimen255 = \srbdimenA
\advance\dimen255 by 0.1em
\vbox{\hrule width\srbdimenA height0.02em
\kern .65ex }}}
\newcommand{\vdashsup}[1]{\vdashsupsub{{#1}}{\mbox{~}}}
\newcommand{\vdashsub}[1]{\vdashsupsub{\mbox{~}}{#1}}
\fi
% FOR THE SCRIBE: CUSTOMIZE THE ENTRIES BELOW:
% Fill in the following information particular to these scribe notes:
\usepackage{amsmath}
\usepackage{amssymb}
\newcommand{\disc}{\mathrm{disc}}
\newcommand{\RR}{\mathbb{R}}
\newcommand{\ZZ}{\mathbb{Z}}
\newcommand{\cv}[1]{\mathbf{1}_{#1}}
\newcommand{\abs}[1]{\left\lvert#1\right\rvert}
\newcommand{\norm}[1]{\left\lVert#1\right\rVert}
\newcommand{\p}[1]{\left(#1\right)}
\newcommand{\bra}[1]{\left[#1\right]}
\newcommand{\T}[1]{{#1}^{\mathrm{T}}}
\newcommand{\IP}{\mathrm{IP}}
\newcommand{\sdeg}{\mathrm{sign\text{-}deg}}
\newcommand{\edeg}[1][\epsilon]{#1\mathrm{\text{-}deg}}
\newcommand{\DTC}{\mathrm{DTC}}
\DeclareMathOperator*{\Exp}{\mathbb{E}}
\def\scribeone{Toniann Pitassi} % Who is the scribe?
\def\classdate{Fall, 2014} % Date of the class
\def\classnumber{3} % Is this the first, second, ...?
% Here are some commands that stay the same for the whole class.
\def\classinstructor{Toniann Pitassi}
\def\classtitle{Communication Complexity}
\def\doctitle{\textup{CS 2429 - Foundations of Communication Complexity}}
\def\classid{\textup{Lecture \#\classnumber: \classdate}}
% Put your macros for these scribe notes HERE
% It is best to use as few as possible.
% environments for "theorem", "corollary", "lemma", "fact" "definition"
% "homework", "proof", "example", "proposition"
% are already defined above.
% Start the document
\ifx\CompleteCourse\relax
\ClassScribeSetupB
\else
\def\makeatletter{\catcode`\@=11\relax}
\def\makeatother{\catcode`\@=12\relax}
\makeatletter
\def\ps@scribeheadings{\let\@mkboth\@gobbletwo
\def\@oddhead{\sl\doctitle \hfill \classid
}\def\@oddfoot{\hfil \rm \thepage \hfil}\def\@evenhead{\@oddhead}%
\def\@evenfoot{\@oddfoot}\def\sectionmark##1{}\def\subsectionmark##1{}}
\makeatother
\pagestyle{scribeheadings}
\begin{document}
\bibliographystyle{siam}
\fi
\begin{center}
\Large\bf\doctitle\\[1em]
%\Large\bf\classid\\[1em]
{\large\bf Lecturer: \classinstructor}\\[.5em]
%{\large\bf Scribe Notes by: \scribeone}
\end{center}
\vspace*{.4in}
% HERE IS WHERE YOUR SCRIBE NOTES SHOULD START
% DELETE ALL OF ROB'S TEXT AND ENTER YOUR OWN.
\section{Randomized Communication Complexity}
\subsection{Definitions}
\paragraph{}
A \emph{(private coin) randomized protocol} is a protocol where Alice and Bob have access to random strings $r_A$ and $r_B$, respectively. These two strings are chosen independently, according to some probability distribution. We can classify randomized protocols by considering different types of error:
\begin{itemize}
\item \emph{zero-error protocol $\mathcal{P}$}:
\[
\forall x,y\colon\ \Pr_{r_A,r_B}[\mathcal{P}(x,r_A,y,r_B) = f(x,y)]=1
\]
\item \emph{$\epsilon$-error protocol $\mathcal{P}$}:
\[
\forall x,y\colon\ \Pr_{r_A,r_B}[\mathcal{P}(x,r_A,y,r_B) = f(x,y)] \geq 1-\epsilon
\]
\item \emph{one-sided $\epsilon$-error protocol $\mathcal{P}$}:
\[
\begin{array}{rl}
\forall x,y: & f(x,y)=0 \Rightarrow \Pr_{r_A,r_B}[\mathcal{P}(x,r_A,y,r_B)=0]=1\\
& f(x,y)=1 \Rightarrow \Pr_{r_A,r_B}[\mathcal{P}(x,r_A,y,r_B)=1] \geq 1-\epsilon
\end{array}
\]
\end{itemize}
\paragraph{}
Due to randomization, the number of bits exchanged may differ in different executions of the protocol on the same input $(x,y)$. So, there are two natural choices for measuring the running time of a randomized protocol:
\begin{itemize}
\item The \emph{worst case running time $\mathcal{P}$ on input $(x,y)$} is the maximum number of bits communicated over all choices of the random strings $r_A$ and $r_B$. The \emph{worst case cost of $\mathcal{P}$} is the maximum, over all inputs $(x,y)$, of the worst case running time of $\mathcal{P}$ on $(x,y)$.
\item The \emph{average case running time $\mathcal{P}$ on input $(x,y)$} is the expected number of bits communicated over all choices of the random strings $r_A$ and $r_B$. The \emph{average case cost of $\mathcal{P}$} is the maximum, over all inputs $(x,y)$, of the average case running time of $\mathcal{P}$ on $(x,y)$.
\end{itemize}
So, for a function $f : X \times Y \rightarrow \{0,1\}$, we define the following complexity measures.
All of these definitions are for private coin protocols.
\begin{itemize}
\item $R_0(f)$ is the minimum average case cost of a randomized protocol that computes $f$ with zero error.
\item For $0 < \epsilon < \frac{1}{2}$, $R_{\epsilon}(f)$ is the minimum worst case cost of a randomized protocol that computes $f$ with error $\epsilon$.
\item For $0 < \epsilon < 1$, $R_{\epsilon}^1(f)$ is the minimum worst case cost of a randomized protocol that computes $f$ with one-sided error $\epsilon$.
\end{itemize}
\paragraph{}
These lead naturally to the following complexity classes:
\begin{itemize}
\item $ZPP^{cc} = \{f\ |\ R_0(f) \in O(\textrm{polylog}(n))\}$
\item $BPP^{cc} = \{f\ |\ R_{\epsilon}(f) \in O(\textrm{polylog}(n))\}$
\item $RP^{cc} = \{f\ |\ R_{\epsilon}^1(f) \in O(\textrm{polylog}(n))\}$
\end{itemize}
\paragraph{}
Analogous definitions hold in a \emph{public coin} model, that is, a model where both Alice and Bob see the results of a single series of random coin flips. A randomized protocol in the public coin model can be viewed as a distribution of deterministic protocols, that is, Alice and Bob choose together a string $r$ (according to a probability distribution $\Pi$, and independently of $x$ and $y$) and then follow the deterministic protocol $P_r$. The \emph{success probability} of a public coin protocol on input $(x,y)$ is the probability of choosing a deterministic protocol, according to the probability distribution $\Pi$, that computes $f(x,y)$ correctly. We use the same complexity measures as in the private coin model, but add a superscript `pub', i.e., $R^{pub}_0(f)$, $R^{pub}_{\epsilon}(f)$, $R^{1\ pub}_{\epsilon}(f)$. We have previously seen the following facts:
\begin{itemize}
\item $R^{pub}_{\epsilon}(f) \leq R_{\epsilon}(f)$
\item for every $\delta >0$ and every $\epsilon > 0$, $R_{\epsilon + \delta}(f) \leq R^{pub}_{\epsilon}(f) + O(\log n + \log \delta^{-1})$
\end{itemize}
\subsection{Distributional Complexity}
\paragraph{}
Let $\mu$ be a probability distribution over $X \times Y$, $X = \{0,1\}^n$, $Y = \{0,1\}^n$. The \emph{$(\mu,\epsilon)$-distributional communication complexity} of $f$, $D^{\mu}_{\epsilon}(f)$, is the cost of the best deterministic protocol that gives the correct answer for $f$ on at least a $(1-\epsilon)$ fraction of all inputs in $X \times Y$, weighted by $\mu$.
\begin{theorem}\label{thm:distrib}
$R^{pub}_{\epsilon}(f) = \max_{\mu} D^{\mu}_{\epsilon}(f)$
\end{theorem}
\begin{proof}
First, we show that $R^{pub}_{\epsilon}(f) \geq \max_{\mu} D^{\mu}_{\epsilon}(f)$. Let $\mathcal{P}$ be a randomized public coin protocol with worst-case cost $R^{pub}_{\epsilon}(f)$ that computes $f$ with success probability at least $1-\epsilon$ for every input $(x,y)$. Therefore, if $\Pi$ is the probability distribution of $\mathcal{P}$'s public coin flips,
\[
\Pr_{r \in \Pi,(x,y) \in (X \times Y)_\mu}(\mathcal{P}_r(x,y) = f(x,y)) \geq 1-\epsilon
\]
By a counting argument, there exists a fixed choice of public coin flips $r'$ such that
\[
\Pr_{(x,y) \in (X \times Y)_\mu}(\mathcal{P}_{r'}(x,y) = f(x,y)) \geq 1-\epsilon
\]
Thus, $\mathcal{P}_{r'}$ is a deterministic protocol that gives the correct answer for $f$ on at least a $1-\epsilon$ fraction of all inputs in $X \times Y$, weighted by $\mu$. So, $R^{pub}_{\epsilon}(f) \geq \textrm{cost}(\mathcal{P}_{r'}) \geq \max_{\mu} D^{\mu}_{\epsilon}(f)$.\\
Next, we show that $R^{pub}_{\epsilon}(f) \leq \max_{\mu} D^{\mu}_{\epsilon}(f)$. Let $c = \max_{\mu} D^{\mu}_{\epsilon}(f)$.
\subsubsection{Minimax Theorem}
%We define a two-player zero-sum game:
%\begin{itemize}
%\item Player $P1$ has all deterministic $c$-bit communication protocols. Player $P2$ has all inputs $X \times Y$.
%\item $P1$ chooses a protocol $\mathcal{P}$, $P2$ chooses an input $(x,y)$ (independently of one another).
%\item $P1$ wins if $\mathcal{P}(x,y) = f(x,y)$, otherwise $P2$ wins.
%\end{itemize}
%Each mixed strategy of $P2$ can be viewed as a distribution $\mu '$ on
%the inputs.
%Since $D^{\mu '}_{\epsilon}(f) \leq c$, there is a protocol that $P1$ can
%pick that ensures that the expected payment is at least $1-\epsilon$.
%By von Neumann's Minimax theorem for zero sum games,
%$P1$ has a randomized strategy that guarantees payoff $1-\epsilon$ for every
%choice $(x,y)$ of $P2$.
We will show this direction of the theorem by an application of
Von Neumann's Minimax Theorem.
In a two-player, zero-sum game, there are two players,
$P1$ and $P2$. $P1$ has a finite set $A=\{a_1,\ldots,a_m\}$ of pure strategies,
and $P2$ has a finite set of pure strategies, $B=\{b_1,\ldots,b_n\}$.
Each player has a utility for each pair $(a_i,b_j)$ of actions.
The utility for $P1$ is denoted by $U_1(a_i,b_j)$ and the utility for
$P2$ is denoted by $U_2(a_i,b_j)$.
It is a zero-sum game if for all $i,j$ $U_1(a_i,b_j) = - U_2(a_i,b_j)$.
In our case, for each $(a_i,b_j)$, one of the players will win
and the other one will lose.
Each player can use a mixed strategy by creating a probability
mass function and playing each pure strategy with a fixed probability.
Let $p_i$ denote the probability that $P1$ plays action $a_i$ and
let $q_j$ denote the probability that $P2$ plays action $b_j$.
Since $p$ and $q$ are probabilities, we have that each $p_i, q_j \geq 0$,
and the sum of the $p_i$'s is 1, and the sum of the $q_j$'s is 1.
A mixed strategy for $P1$ will be denoted by $p$, and similarly
$q$ denotes a mixed strategy for $P2$.
For each mixed strategy pair $(p,q)$, the payoff $M(p,q)$ is
defined to be
$$\sum_{i=1}^m \sum_{j=1}^n p_i M(a_i,b_j) q_j.$$
When $P1$ uses pure strategy $a_i$ and $P2$ uses mixed strategy
$q$, then $M(a_i,q) = \sum_{j=1}^n M(a_i,b_j)q_j$, and analogously
for $M(p,b_j)$.
We let $P$ and $Q$ denote the set of all mixed strategies available
to player 1 and 2 respectively.
Player $P1$'s objective is to select a mixed strategy $p \in P$
so as to maximize $\min_q M(p,q)$, and at the same time
$P2$'s objective is to select a mixed strategy $q \in Q$ so as to
minimize $\max_p M(p,q)$.
The Minimax theorem states that for every two-person zero-sum game,
there exists an equilibrium strategy. That is,
there exists a value $v$ such that
$$\max_p \min_q M(p,q) = \min_q \max_p M(p,q) = v.$$
In other words, in every two-person zero-sum game with finite strategies,
there exists a value $v$ and a mixed strategy for each player such that:
(a) given Player 2's strategy, the best payoff for Player 1 is $v$,
and (b) given Player 1's strategy, the best payoff for Player 2 is $-v$.
In our context, we define a two-player zero-sum game as follows:
\begin{itemize}
\item $P1$ (the protocol designer): his pure strategies are
all $c$-bit deterministic protocols ${\cal P}_r$, one for each
choice of coin flips. His mixed strategies are all
randomized protocols, $P$ (each of which is a distribution over the deterministic protocols).
\item $P2$ (the adversary): her pure strategies are all inputs $(x,y)$.
Her mixed strategies are all distributions $\mu$ over the inputs.
\item $P1$ has payoff $1$ if ${\cal P}_r(x,y) = f(x,y)$ and -1 otherwise. That is,
the designer ($P1$) wins the game iff
this protocol is correct on $(x,y)$, and otherwise $P2$ wins.
\end{itemize}
We are given as our assumption that for all distributions $\mu$ over
inputs $(x,y)$, there exists a pure strategy (a protocol) $P$ such that
the probability of a win is at least $1-\epsilon$.
This means that $\min_{\mu} \max_{P} M(\mu,P) \geq 1-\epsilon$.
(Since for each choice of $\mu$, there is a fixed strategy $P_r$ that
achieves payoff $1-\epsilon$, so no matter what $\mu$ we choose,
the designer will be able to come up with a protocol that wins $1-\epsilon$
of the time.)
Now by the Minimax theorem, this means that
$\max_{P} \min_{\mu} M(\mu,P) \geq 1-\epsilon$.
From this it follows that there is a randomized strategy $P$
such that for all fixed $(x,y)$, the payoff is at least $1-\epsilon$.
%Each mixed strategy of $P1$ is a distribution over all $c$-bit protocols, and
%each mixed strategy of $P2$ is a distribution $\mu'$ over inputs.
%By our assumption $c=max_{\mu} D^{\mu}_{\epsilon}(f)$,
%for every $\mu$, there exists a $c$-bit deterministic protocol that
%has error at most $1-\epsilon$ with respect to distribution $\mu$.
%That is, there is a mixed strategy
%Since $D^{\mu'}_{\epsilon}(f) \leq c$, there is a protocol that $P1$ can pick that
%ensures that the expected payment is at least $1-\epsilon$ for every $(x,y)$.
%Now by the above Minimax theorem, this implies that $P1$ has a randomized
%strategy that guarantees payoff $1-\epsilon$ for every choice $(x,y)$ of $P2$.
%This mixed strategy for $P1$ is a randomized
%This randomized strategy is a distribution $\Pi$ over $c$-bit
%deterministic protocols, so it is a randomized public coin protocol
%$\mathcal{P}$ for $f$ with cost at most $c$ and error at most $\epsilon$.
%Therefore, $c \geq \textrm{cost}(\mathcal{P}) \geq R^{pub}_{\epsilon}(f)$.
%\hfill \fbox{}
\end{proof}
Theorem \ref{thm:distrib} is useful because, for any choice of $\mu$, a lower bound for $D_{\epsilon}^{\mu}$ gives a lower bound on $R_{\epsilon}^{pub}(f)$.
\begin{definition}
A distribution $\mu$ over $X \times Y$ is a \emph{product distribution} if $\mu(x,y) = \mu_X(x) \cdot \mu_Y(y)$ for some distributions $\mu_X$ over $X$ and $\mu_Y$ over $Y$. Let $R^{[\ ]}(f) = \max_{\mu}D^{\mu}(f)$, where the maximum is taken over all product distributions $\mu$.
\end{definition}
\underline{Exercise}: Prove that $R_{\epsilon}^{[\ ]}(DISJ) = O(\sqrt{n}\log{n})$. On the other hand, show that $R_{\epsilon}(DISJ)=\Theta(n)$.
\paragraph{}Sherstov showed a separation between product and non-product distributional complexity by proving the existence of a function $f$ such that $R^{[\ ]}(f) = \Theta(1)$ but $R_{\epsilon}(f) = \Theta(n)$.
\section{Lower Bounds for Randomized Protocols: Discrepancy}
We now consider a technique for proving lower bounds for $D_{\epsilon}^{\mu}$. It consists of finding an upper bound for the size of rectangles in $M_f$ that are ``almost" monochromatic. If we can prove that all such rectangles
for a given function $f$ are small, then we need a lot of rectangles to ``cover" the function.
\begin{definition}
Let $f : X \times Y \rightarrow \{0,1\}$ be a function, $R$ be any rectangle, and $\mu$ be a probability distribution on $X \times Y$.
$$Disc_{\mu}(R) = | \mu(R \cap f^{-1}(1)) - \mu(R \cap f^{-1}(0))|.$$
The discrepancy of $f$ under $\mu$ is the maximum over all possible rectangles:
$$Disc_{\mu}(f) = \max_R Disc_{\mu}(R).$$
If $f$ has small discrepancy it means (informally) that all
large rectangles are roughly balanced.
Consider a deterministic protocol that partitions the input space
into rectangles $R_1, \ldots, R_{2^c}$. And suppose it has success
probability $2/3$ with respect to $\mu$. The best thing that the
protocol can do if it has to give one output $a_i$ for all inputs
in the rectangle $R_i$ is to set $a_i$ to the bit value with the
highest weight in that rectangle.
This contributes $\mu(R_i \cap f^{-1}(a_i))$ to the success probability
and $\mu(R_i \cap f^{-1}(1-a_i))$ to the failure probability.
Thus the overall success probability is
$\sum_i \mu(R_i \cap f^{-1}(a_i))$ and the overall error
probability is $\sum_i \mu(R_i \cap f^{-1}(1-a_i))$.
Since the difference between these two has to be at least
$2/3 - 1/3 = 1/3$, we have
\begin{eqnarray}
1/3 & \leq & \sum_{i=1}^{2^c} \mu(R_i \cap f^{-1}(a_i)) - \sum_{i=1}^{2^c}
\mu(R_i \cap f^{-1}(1-a_i)) \\
& \leq & \sum_{i=1}^{2^c} | \mu(R_i \cap f^{-1}(a_i)) -
\mu(R_i \cap f^{-1}(1-a_i))| \\
& = & \sum_{i=1}^{2^c} Disc_{\mu}(R_i) \\
& \leq & 2^c Disc_{\mu}(f).
\end{eqnarray}
This gives a lower bound on communication:
$c \geq \log\left(\frac{1}{3\, Disc_{\mu}(f)}\right)$.
To get a lower bound for randomized protocols, it suffices
to find a distribution $\mu$ such that $Disc_{\mu}(f)$ is small.
We have proved
\begin{theorem}
For every distribution $\mu$, $D^{\mu}_{1/3}(f) \geq \log \left(\frac{1}{3\, Disc_{\mu}(f)}\right).$
\end{theorem}
%\begin{eqnarray*}
%{Disc}_{\mu}(R,f) & = & \left | \Pr_{\mu}[f(x,y)=0 \textrm{ and } (x,y) \in R] - \Pr_{\mu}[f(x,y)=1 \textrm{ and } (x,y) \in R] \right | \\
%& = & \Pr_{\mu} [(x,y) \in R] \times
%|\Pr_{\mu}[f(x,y)=0| (x,y) \in R] - \Pr_{\mu}[f(x,y)=1| (x,y) \in R]|
%\end{eqnarray*}
%Another common expression for the descrepancy is:
%$$Disc_{\mu}(R,f) = \sum_{x,y \in R} (-1)^{f(x,y)} \mu_{x,y}.$$
%The \emph{discrepancy of $f$ according to $\mu$} is
%$$Disc_{\mu}(f) = \max_{R}\{\textrm{Disc}_{\mu}(R,f)\} $$
%where the maximum is taken over all rectangles $R$.
%\end{definition}
%\begin{proposition}\label{prop:discrep}
%For every function $f : X \times Y \rightarrow \{0,1\}$, every probability distribution $\mu$ on $X \times Y$, and every $\epsilon \geq 0$,
%\[
%D^{\mu}_{\frac{1}{2}-\epsilon}(f) \geq \log_{2}(\frac{2\epsilon}{\textrm{Disc}_{\mu}(f)})
%\]
%\end{proposition}
%
%\begin{proof}
%Let $\mathcal{P}$ be a $c$-bit deterministic protocol for $f$ which is correct with probability at least $\frac{1}{2} + \epsilon$, where the inputs are weighted by $\mu$. Then,
%\[
%\begin{array}{rcl}
%(\frac{1}{2} + \epsilon) - (\frac{1}{2} - \epsilon) & \leq & \displaystyle \Pr_{\mu}[\mathcal{P}(x,y) = f(x,y)] - \Pr_{\mu}[\mathcal{P}(x,y) \neq f(x,y)]\\
%2\epsilon &=& \displaystyle \sum_{\ell} \left( \Pr_{\mu}[\mathcal{P}(x,y) = f(x,y) \textrm{ and } (x,y) \in R_{\ell}] - \Pr_{\mu}[\mathcal{P}(x,y) \neq f(x,y) \textrm{ and } (x,y) \in R_{\ell}] \right)
%
%\end{array}
%\]
%where the summation is over all leaves $\ell$ of the protocol. Since each leaf designates either a 0 or a 1, we can bound this expression from above by
%\[
%\sum_{\ell} \left | \Pr_{\mu}[f(x,y)=0 \textrm{ and } (x,y) \in R_{\ell}] - \Pr_{\mu}[f(x,y)=1 \textrm{ and } (x,y) \in R_{\ell}] \right |
%\]
%Each $R_\ell$ is a rectangle, so each of the terms in this sum is bounded from above by $\textrm{Disc}_\mu(f)$. Since there at most $2^c$ leaves, we get $2\epsilon \leq 2^c\cdot \textrm{Disc}_\mu(f)$, which implies the result.
%\hfill \fbox{}
%\end{proof}
We now demonstrate how to prove a lower bound for the inner product (IP) function by calculating the discrepancy of IP according to the uniform distribution.
Before we prove this result, we will study the communication matrix
for the IP function for $n=3$ to get some intuition.
We will actually switch things a little bit and analyze the
matrix whose $(x,y)$ entry is $(-1)^{x \cdot y}$. This is
just the communication matrix for IP, with 0's replaced
by 1's and 1's replaced by -1's.
With this switch of basis, the associated IP matrices are the Hadamard matrices.
Hadamard matrices are defined to be square matrices where
each entry is either $+1$ or $-1$ and such that all pairs of
rows are mutually orthogonal.
The $IP$ matrix, $H_n$, for $n=3$ looks like this:
\medskip
\begin{tabular}{llllllllllll}
1 & 1 & 1 & 1 & 1 & 1 & 1 & 1 \\
1 & -1 & 1 & -1 & 1 & -1 & 1 & -1 \\
1 & 1 & -1 & -1 & 1 & 1 & -1 & -1 \\
1 & -1 & -1 & 1 & 1 & -1 & -1 & 1 \\
1 & 1 & 1 & 1 & -1 & -1 & -1 & -1 \\
1 & -1 & 1 & -1 & -1 & 1 & -1 & 1 \\
1 & 1 & -1 & -1 & -1 & -1 & 1 & 1 \\
1 & -1 & -1 & 1 & -1 & 1 & 1 & -1 \\
\end{tabular}
\medskip
More generally $H_0 = [1]$ and $H_n$ is built from
$H_{n-1}$ as follows: the lower right quadrant of $H_n$ is
equal to $-H_{n-1}$ and the other three quadrants are equal
to $H_{n-1}$.
The following facts are easy to prove about $H_n$:
\begin{itemize}
\item Every pair of rows is orthogonal, and therefore $H_n^2 = N \cdot I$.
\item We can interpret the rows as parity functions
\item The matrix is symmetric about the diagonal
\item The eigenvectors form an orthonormal basis. (That is,
$\langle v_i, v_j \rangle = 0$ for all $i \neq j$, and $\norm{v_i}^2 = 1$ for all $i$.)
\item The only eigenvalues of $H_n$ are $\pm \sqrt{N}$.
\end{itemize}
%First, we prove the following result, known as the Lindsey Lemma.
%\begin{lemma}(Lindsey's Lemma)
%$$2^{2n} disc_{\mu}(IP_n,A \times B) \leq \sqrt {2^n |A| \cdot |B|}.$$
%\end{lemma}
We want to find the eigenvalues of the Hadamard matrices, as
claimed in the last bullet point above.
Recall these are defined by the following recursive construction:
\[
H_0 = [1], \qquad H_{n+1} = \left[ \begin{array} {cc} H_n & H_n \\ H_n & -H_n \end{array} \right].
\]
\begin{lemma}
For each $n$, $H_n^2 = H_n \T{H_n} = 2^n I_{2^n}$.
\end{lemma}
\begin{proof}
The proof is by induction. Since $H_0 = I_1$, the lemma is correct for $n=0$.
Given that $H_n^2 = 2^n I$, we can calculate $H_{n+1}^2$ explicitly:
\begin{align*}
H_{n+1}^2 &= \left[ \begin{array}{cc} H_n & H_n \\ H_n & -H_n \end{array} \right]^2 \\
&= \left[ \begin{array} {cc} H_n^2 + H_n^2 & H_n^2 - H_n^2 \\ H_n^2 - H_n^2 & H_n^2 + H_n^2
\end{array} \right] = \left[ \begin{array} {cc} 2^{n+1} I_{2^n} & 0 \\ 0 & 2^{n+1} I_{2^n}
\end{array} \right] = 2^{n+1} I_{2^{n+1}}.
\end{align*}
\end{proof}
\begin{corollary}
The eigenvalues of $H_n$ are all $\pm 2^{n/2}$.
%Moreover, if $n > 0$, then half of the eigenvalues are~$+2^{n/2}$, and half of them~$-2^{n/2}$.
\end{corollary}
\begin{proof}
By the above lemma, $H_n \T{H_n} v = 2^n v$ for all $v$, and therefore
$2^n$ is the only eigenvalue of $H_n \T{H_n}$.
Thus, if $\lambda$ is an eigenvalue of $H_n$ then $\lambda^2 = 2^n$, so the only eigenvalues of $H_n$ are $\pm 2^{n/2}$.
\end{proof}
%The norm of $A$ is defined to be $max_{v, ||v||=1} ||A v||$,
%where $||v||$ is the Euclidean norm.
%It is clear that $||A|| = max_{\lambda} {\sqrt \lambda}$,
%where $\lambda$ is an eigenvector of $AA^T$.
%Since $H H^T$ has only one eigenvector, it follows that
%$||H|| = {\sqrt 2^n}$.
%If $\lambda$ is an eigenvalue of $H_n$ then
%$\lambda^2$ is an eigenvalue of $2^n I$, so $\lambda^2 = 2^{n}$.
%Let $Tr(M)$ denote the trace of matrix $M$, that is the sum of the diagonal
%entries of $M$. The trace of a matrix is the sum of the eigenvalues.
%For all $n > 0$, $\mathrm{Tr}(H_n) = 0$ by construction, where exactly half
%of the eigenvalues are positive and half are negative.
%Since all are either plus/minus 1, it follows that half are
%$+2^{n/2}$ and half are $-2^{n/2}$.
%the upper half being
%positive and the lower half being negative.
%and so exactly half of the eigenvalues are positive, and exactly half are negative.
%\end{proof}
%Lindsey's lemma follows.
%\begin{lemma}[Lindsey's Lemma]
%We have $2^{2n} \disc_{\IP_n}(A \times B) \leq \sqrt{2^n |A| \cdot |B|}$.
%We will use the fact that, for any two rows $r_i$ and $r_\ell$ of a Hadamard matrix, $ = 0$ whenever $i \neq \ell$.
%
%\begin{lemma}\label{lem:lindsey}
%Let $H$ be an $N \times N$ Hadamard matrix. Let $K = S \times T$, where $|S|=a$ and $|T|=b$, be an $(a \times b)$ submatrix of $H$. The absolute value of the sum of all entries in $K$ is bounded above by $\sqrt{abN}$.
%\end{lemma}
%\begin{proof}
%Let $\alpha = \displaystyle \sum_{i \in S} \sum_{j \in T} K_{ij}$. Let $K_i$ denote the $i^{th}$ row of $K$ and define $\overline{y} = \displaystyle \sum_{i \in S} K_i$. Denote by $\overline{x}$ the vector such that
%\[
%\overline{x}_{j} = \left\{
%\begin{array}{rl}
%1 & \textrm{if $j \in T$}\\
%0 & \textrm{if $j \notin T$}
%\end{array}\right.
%\]
%It follows that $\alpha = <\overline{x},\overline{y}>$. But,
%\[
%\begin{array}{rcl}
%<\overline{x},\overline{y}>^2 & \leq & |\overline{x}|_2^2 \cdot |\overline{y}|_2^2\\
%& = & b\cdot |\overline{y}|_2^2\\
%& = & b\cdot <\overline{y},\overline{y}>\\
%& = & b\cdot <\sum_{i \in S} K_i\ ,\ \sum_{i \in S} K_i>\\
%& = & b\cdot \sum_{i \in S}\sum_{\ell \in S}\\
%& = & b\cdot \sum_{i \in S}\ \ \textrm{(as $=0$ when $i\neq \ell$)}\\
%& = & b\cdot(aN)
%\end{array}
%\]
%
%where the first inequality follows by Cauchy-Swartz.
%Thus, $\alpha = \sqrt{abN}$.
%\hfill \fbox{}
%\end{proof}
%Now, we calculate an upper bound on the discrepancy of IP according to the uniform distribution. We define $f = IP$ as:
%\[
%
%f(x,y) = \left\{
%\begin{array}{rl}
%1 & \textrm{if $\sum_{i=1}^{n}x_iy_i$ mod $2 = 0$}\\
%-1 & \textrm{if $\sum_{i=1}^{n}x_iy_i$ mod $2 = 1$}
%\end{array}\right.
%\]
%Then, any matrix $M_f$ is a $2^n \times 2^n$ Hadamard matrix. So, for any rectangle $K = S \times T$ with $|S|=a$ and $|T|=b$,
%\[
%\begin{array}{rcl}
%\textrm{Disc}_{uniform}(K,f) & = & \sum_{i \in S} \sum_{j \in T} K_{ij}\\
%& \leq & \frac{\sqrt{ab2^n}}{2^{2n}}\ \ \ \textrm{by Lemma \ref{lem:lindsey}}
%\end{array}
%\]
%As $a,b \leq 2^n$,
%\[
%\begin{array}{rcl}
%\textrm{Disc}_{uniform}(f) & \leq & \frac{\sqrt{2^n2^n2^n}}{2^{2n}}\\
%&=& 2^{-\frac{n}{2}}
%\end{array}
%\]
%So, by Proposition \ref{prop:discrep}, $D_{\frac{1}{2}-\epsilon}^{\mu}(IP) \geq \log_2\left( \frac{2\epsilon}{2^{-\frac{n}{2}}}\right) = \frac{n}{2} + 1 + \log_2(\epsilon)$.
We denote the discrepancy of~$f$ (with respect to the uniform distribution) and a
rectangle $A\times B$ by $\disc(f, A\times B)$.
All our results can be generalized to arbitrary distributions by multiplying each entry of $M_f$ by the probability of the corresponding cell.
Recall that Boolean functions can be considered as taking values in either $\{0,1\}$ or $\{+1,-1\}$. In this section, we will use the $\pm 1$ convention
when describing the matrices and rectangles.
We use the notation $\cv{A}$ for the characteristic vector of~$A$, which contains~$1$ in positions corresponding to the elements of~$A$, and 0's elsewhere.
\subsection{The Eigenvalue Method}
The eigenvalue method upper bounds the discrepancy using the maximal eigenvalue of $M_f$.
\begin{lemma} [Eigenvalue Bound]
Let $f$ be a symmetric Boolean function, i.e. $f(x,y)=f(y,x)$. Then
\[ \disc(f, A\times B) \leq 2^{-2n} \lambda_\mathrm{max} \sqrt{|A| \cdot |B|}, \]
where $n=|x|=|y|$ is the input size, and $\lambda_\mathrm{max}$ is the largest eigenvalue of the symmetric matrix~$M_f$.
\end{lemma}
\begin{proof}
Since $M_f$ is symmetric, its eigenvectors~$v_i$ form an orthonormal basis for~$\RR^{2^n}$.
Denote by~$\lambda_i$ the eigenvalue corresponding to~$v_i$, so that $M_f v_i = \lambda_i v_i$.
Expand the characteristic vectors of~$A$ and~$B$ in this basis:
\[ \cv{A} = \sum \alpha_i v_i, \qquad \cv{B} = \sum \beta_i v_i
\]
Putting these expansions into the definition of discrepancy, we are almost done.
Since $2^{2n} \disc(f,A \times B)$ is equal to the absolute value of
the difference between the number of 1's and the number of 0's in
$A \times B$, we have:
\begin{align*}
2^{2n} \disc(f,A\times B) &= \abs{\T{\cv{A}} M_f \cv{B}} \\
&= \abs{ \T{\p{\sum \alpha_i v_i}} \p{\sum \beta_i \lambda_i v_i} } \\
&= \abs{ \sum \alpha_i \beta_i \lambda_i } \leq \lambda_\mathrm{max} \abs{\sum \alpha_i \beta_i}.
\end{align*}
Note that $\sum \alpha_i^2 = \norm{\cv{A}}^2 = |A|$
by Parseval's identity, and similarly $\sum \beta_i^2 = |B|$.
(Parseval's identity relates the values of the Fourier coefficients
to the values of the function. Namely, it states that for any function
$f: \{0,1\}^n \rightarrow \RR$, the sum of the
squares of the Fourier coefficients of $f$ is equal to
the sum of the squares of the values of $f$. Note that in our case we have not normalized.
If we had normalized -- so that the Fourier coefficients were
normalized -- then the sum of the squares of the Fourier coefficients
of $f$ would be equal to $\Exp[f^2]$.)
The lemma follows from an application of Cauchy-Schwarz:
\begin{align*}
2^{2n} \disc(f, A\times B) &\leq \lambda_\mathrm{max} \abs{\sum \alpha_i \beta_i} \\
&\leq \lambda_\mathrm{max} \sqrt{\sum \alpha_i^2} \sqrt{\sum \beta_i^2} =
\lambda_\mathrm{max} \sqrt{|A| \cdot |B|}.
\end{align*}
\end{proof}
We are now ready to prove Lindsey's Lemma which gives a bound
on the discrepancy of the inner product function:
\begin{lemma}[Lindsey's Lemma]
$2^{2n} \disc(\IP_n, A \times B) \leq \sqrt{2^n |A| \cdot |B|}$.
Here $\IP_n(x,y) = \sum x_i y_i \pmod{2}$.
\end{lemma}
\begin{proof}
The matrix corresponding to~$\IP_n$ is~$H_n$. We have shown that $\lambda_\mathrm{max}(H_n) = 2^{n/2}$, and so the lemma follows by the Eigenvalue Bound.
\end{proof}
We are now ready to prove the following theorem.
\begin{theorem}
$R(\IP_n) = \Omega(n)$
\end{theorem}
By Lindsey's Lemma, the bound on the discrepancy is largest when $|A|=|B| = 2^n$,
and this gives
$\disc(\IP_n, A \times B) \leq 2^{3n/2} \cdot 2^{-2n} = 2^{-n/2}$.
Thus
$R(\IP_n) \geq \log\left(\frac{1}{3\disc(\IP_n)}\right) = \log(2^{n/2}/3) = \Omega(n)$.
%\section{Application: lower bound on the area-time tradeoff in chips}
%
%Suppose we have a chip that computes a boolean function
%$f: \{0,1\}^m \rightarrow \{0,1\}$. Abstractly, the chip can be viewed
%as a planar rectangle with $m$ input ports and one output port.
%Its width $a$ and height $b$ are measured in units $\Delta$, which is the
%minimal width of a wire.
%The area is $A = ab$. The chip works in cycles.
%In each cycle, ports can do local computation and can send a bit
%(across some wire) to another port. The time $T$ is the number of
%cycles that the chip uses for its computation.
%Now make some imaginary "cut" in the chip, call the $m_A$ input
%ports on the left "Alice" (and call their $m_A$ bits $x$) and
%call the $m_B$ input ports on the right "Bob" (with $m_B$ bits $y$).
%We can make the cut so that only $O({\sqrt {A}})$ wires go between
%the left and the right.
%Note that the chip solves the communication complexity
%problem $f(x,y)$ using only $O(T {\sqrt A})$ many bits of
%communication: in each cycle it only sends $O({\sqrt A})$ bits
%between left and right. Hence the communication complexity
%$D(f)$ (for our specific split of variables) gives a lower
%bound on $T {\sqrt A}$.
\end{document}