\documentclass[10pt]{article}
\newtheorem{define}{Definition}
%\newcommand{\Z}{{\mathbb{Z}}}
%\usepackage{psfig}
%\usepackage{amsmath,amssymb,amsthm}
\usepackage{amsmath,amssymb}
\newcommand{\BC}{\{-1,1\}^{n}}

\oddsidemargin=0.15in
\evensidemargin=0.15in
\topmargin=-.5in
\textheight=9in
\textwidth=6.25in

\begin{document}
\input{../preamble.tex}

%{lecture number}{lecture date}{Ronitt Rubinfeld}{Your name}
\lecture{6}{February 22, 2007}{Ronitt Rubinfeld}{Ning Xie}

%%%% body goes in here %%%%
In this lecture, we continue studying the problem of testing 
whether a monotone distribution is uniform. 
We will restrict to distributions over the Boolean cube. 
First recall some notation. 
Let $p:\BC \rightarrow \mathbb{R}^{\geq 0}$ be a 
monotone distribution over the Boolean cube. Define
$\delta(x)=2^{n}p(x)-1$, that is, the weight deviation from
uniform distribution. Note that since $p(x)$ is monotone, so
is $\delta(x)$. As we proved last time, $\sum_{x}\delta(x)=0$ and
$\sum_{x}|\delta(x)|=2^{n}\epsilon$, where
$\epsilon$ is the $\ell_{1}$-distance between $p$ and uniform
distribution, $\epsilon=|p-U|_{1}$.
We also define, for each $x\in \BC$, the bias of $x$
to be $bias(x)=\sum_{i=1}^{n}x_{i}$. 
It is easy to see that for uniform distribution
$\mathbb{E}_{U}[bias(x)]=0$.
Our main result for this lecture
is the follow lemma:
\begin{lemma}
Let $p$ be a monotone distribution over Boolean cube and 
$\epsilon=\sum_{x\in \BC}|p(x)-\frac{1}{2^{n}}|$, then
\[
\mathbb{E}_{p}[bias(x)] \geq \epsilon.
\]
\end{lemma}
\begin{proof}
Let $S=\mathbb{E}_{p}[bias(x)]$, our goal is to obtain a lower bound for it in 
term of $\epsilon$. Since $p(x)=\frac{1}{2^n}+\frac{\delta(x)}{2^n}$ and the expectation
of bias(x) over the uniform distribution is zero, we can rewrite $S$ as
\[
S=\sum_{x}\sum_{i=1}^{n}x_{i}p(x) = 
\frac{1}{2^{n}}\sum_{x}\sum_{i=1}^{n}x_{i}\delta(x).
\]
Our proof strategy is to first transform this sum over nodes to a sum over the edges
in the Boolean cube. Then we get a lower bound of the sum over the edges by 
considering a so-called ``canonical path''. To get an idea of how the first
step works, consider the 2-dimensional Boolean cube with $A=(1,1),
B=(1,-1), C=(-1,1)$ and $D=(-1,-1)$. Then we have
\begin{align*}
2^{n}S=&\delta_{A}1+\delta_{A}1+\delta_{B}1+\delta_{B}(-1)+\delta_{C}(-1)+\delta_{C}1
+\delta_{D}(-1)+\delta_{D}(-1) \\
=&(\delta_{A}-\delta_{B})+(\delta_{A}-\delta_{C})+(\delta_{B}-\delta_{D})+
(\delta_{C}-\delta_{D}).
\end{align*}
If we assign the weight $\delta(u)-\delta(v)$ to each edge $(u,v)\in E$,
then $S$ can be expressed as a sum of the weights of all edges in the 
Boolean cube\footnote{
Note that the Boolean cube naturally gives rise to
a graph: the vertices are the $2^{n}$ elements in $\BC$
and there is an edge between vertices $x$ and $y$ if and only if they differ 
at exactly one coordinate.}. 
Also note that we only sum over \emph{downward} edges, 
that is, edges $(u,v)$ with $u > v$. 

Next we define two sets of points:
\[
P=\{x|\delta(x) \geq 0 \}
\] 
and 
\[
N=\{x|\delta(x) < 0 \}.
\] 
For each pair of points $x$ and $y$ with $x \in P$ and $y \in N$, define
\begin{align*}
\Delta(x,y) & =|\delta(x)-\delta(y)| \\
            & =|\delta(x)| + |\delta(y)|,
\end{align*}
because $\delta(y) < 0$.
We have 
\begin{align*}
 \sum_{x \in P, y \in N}\Delta(x, y) 
&=\sum_{x \in P, y \in N}\delta(x) + \sum_{x \in P, y \in N}\delta(y) \\
&=|P|\frac{\epsilon}{2}2^{n}+|N|\frac{\epsilon}{2}2^{n}=\frac{\epsilon}{2}2^{2n}. 
\end{align*}

Let $x, y \in \BC$. Let $(z_{0}=x, z_{1}, \ldots, z_{k-1}, z_{k}=y)$
be a path from $x$ to $y$ along the edges of the Boolean cube,
where $(z_{i}, z_{i+1}) \in E$, for $0 \leq i \leq k-1$.
The ``$z$-distance path between $x$ and $y$'' is defined to be
\[
dist_{z_{xy}}(x,y)=\sum_{i=0}^{k-1}|\delta(z_{i})-\delta(z_{i+1})| 
\geq \Delta(x,y),
\]
where the last step follows from triangle inequality.

Now we can rewrite the sum over distance pairs in P and N as
\begin{align*}
\frac{\epsilon}{2}2^{2n} = \sum_{x \in P, y \in N}\Delta(x, y) 
& \leq \sum_{x \in P, y \in N}dist_{z_{xy}}(x, y)
\qquad \text{(for arbitrary choice of path $z_{xy}$)} \\
& =\sum_{x \in P, y \in N} 
\sum_{j=0}^{|z_{xy}-1|}|\delta(z_{xy}^{j})-\delta(z_{xy}^{j+1})|.  
\end{align*}
The key point here is to pick a routing path so that there is
no edge get crossed too many times.

Now we introduce a very useful concept called ``canonical path''.
For $x \in P$ and $y \in N$, $z_{xy}$ is the canonical path
from $x$ to $y$ obtained by scanning from left to right and
flip bits to match $y$ whenever the scanned bit of current string
is different from that of $y$. Here is an example:
\begin{align*}
x=& (1, -1, 1, 1, -1) \\
  & (1, 1, 1, 1, -1)\\
  & (1, 1, -1, 1, -1)\\
y=& (1,1,-1,1,1).
\end{align*} 

Now let us count the number of downward paths between 
$x \in P$ and $y \in N$ crossing a given edge. A naive bound
is $2^{2n}$. A more careful analysis is the following. Suppose
the edge is $(a1b, a -1 b)$, where $a \in \{-1,1\}^{k}$ and
$b \in \{-1,1\}^{n-k-1}$. Then according to the definition
of canonical path, the starting vertex must be of the form $(\alpha 1 b)$
and the ending vertex must be of the form $(a-1\beta)$, where
$\alpha \in \{-1,1\}^{k}$ and
$\beta \in \{-1,1\}^{n-k-1}$. Then the number of downward paths crossing
the edge is simply the total number of choices of strings $\alpha$ and
$\beta$, which is $2^{n-1}$.

Now we have
\begin{align*}
\frac{\epsilon}{2}2^{2n}
=&\sum_{x \in P, y \in N}\Delta(x, y) \\
=&\sum_{x \in P, y \in N} 
\sum_{j=0}^{|z_{xy}|-1}|\delta(z_{xy}^{j})-\delta(z_{xy}^{j+1})| \\
\leq & \sum_{(u,v)\in E}|\delta(u)-\delta(v)|\cdot(\text{max. \# of times $(u,v)$
used in any canonical path})\\
=& 2^{n-1}\sum_{(u,v)\in E}|\delta(u)-\delta(v)|.
\end{align*}

Note that, since $p$ is monotone and we
are summing over downward edges, $\delta(u) \geq \delta(v)$. Therefore we have
\begin{align*}
\epsilon 
\leq & \frac{1}{2^{n}}\sum_{(u,v)\in E}|\delta(u)-\delta(v)|\\
=&\frac{1}{2^{n}}\sum_{i=1}^{n}\sum_{(u,v)\in E_{i}}|\delta(u)-\delta(v)|\\
=&\frac{1}{2^{n}}\sum_{i=1}^{n}\sum_{(u,v)\in E_{i}}(\delta(u)-\delta(v))\\
=&\frac{1}{2^{n}}\sum_{i=1}^{n}\sum_{x \in \BC}x_{i}\delta(x)\\
=&\mathbb{E}[bias(x)].
\end{align*}
This completes the proof of the lemma. 
\end{proof}

The testing algorithm is the following
\begin{center}
\fbox{
\begin{minipage}[t]{6cm}
\begin{tabbing}
\textbf{Testing Uniformity of Monotone Distributions over Boolean Cube} \\
Pick $s$ samples $X^{(1)}, \ldots, X^{(s)} \in p$, with
 $s=\Theta(\frac{n}{\epsilon^{2}}\log \frac{n}{\epsilon})$ \\
Fail if any $X^{(i)}$ has $|bias(X^{(i)})| \geq \sqrt{2n\log(20s)}$ \\
Let $\hat{\mu}=\frac{1}{s}\sum_{i=1}^{s}bias(X^{(i)})$ \\
If $\hat{\mu} \geq \frac{\epsilon}{4}$ fail\\
Else pass 
\end{tabbing}
\end{minipage}
}
\end{center}

The tricky part of the algorithm is in the first step. We check 
all the sample strings to see if anyone has large bias. If
so, we fail immediately. This is for a technical reason. Due to the following
additive Chernoff bound, in order to reduce the number of samples needed
to get desired concentration around mean, we need to reduce the range of the 
i.i.d. random variables. Note that this is essentially the best we can do:
if we further reduce the threshold to a smaller value, we will not be able 
to let uniform distribution pass the test with high probability.


\begin{theorem}[Additive Chernoff bound, a.k.a. Hoeffding bound]
Let $X_{1},\ldots, X_{m}$ be i.i.d. random variables ranging in
$[-a, a]$. Let $\hat{\mu} = \frac{1}{m}\sum_{i=1}^{m} X_i$
and $\mu = \mathbb{E}[\hat{\mu}]$.
Then for all $\rho > 0 $, we have 
$\Pr[|\hat{\mu} - \mu|\geq \rho] \leq 
2\cdot e^{-\frac{\rho^{2}}{2a^{2}}m}$.
\end{theorem}

Now we will show that if $p$ is the uniform distribution, then the testing 
algorithm will pass with high probability. 
The analysis for distributions that are far from uniform is left for next lecture.
The intuition is, if $p$ is the uniform distribution, then for large enough $n$, 
the distribution of $bias(x)$ approaches a Gausssian distribuion with mean $0$
and variance $\Theta(n)$.
Therefore, $bias(x)$ is concentrated in $[-\Theta(\sqrt{n}), \Theta(\sqrt{n})]$
and we are unlikely to see any sample with large bias.

Formally, let $s$ be the number of samples and define $\rho = \sqrt{\frac{2\log(20s)}{n}}$.
Let $X^{(i)}$ be any sample and consider the random variable $Y_{1}, \ldots, Y_{n}$ to be
the $n$ bits of $X^{(i)}$.
Applying the additive Chernoff bound with $a=1$ and $m=n$ gives
\[
\Pr[|bias(X^{(i)})|\geq \sqrt{2n\log(20s)}] = \Pr[|bias(X^{(i)})|\geq\rho n] 
= \Pr[\frac{1}{n}|bias(X^{(i)})|\geq\rho] \leq 2e^{-\rho^{2}n/2}=\frac{1}{10s}
\]

By union bound, $\Pr[\text{there is an $i$ such that $bias(X^{(i)}) \geq \sqrt{2n\log(20s)}$}] \leq \frac{1}{10}$.
Therefore, if $p$ is the uniform distribution, the probability that step 1 outputs fail is at most 1/10.

If we reach step 3, then there is no sample has large bias. Let $D'=\{x \in D| |bias(X^{(i)})| < \sqrt{2n\log(20s)}\}$
and $p'=p_{|D}$ be the restriction of $p$ to $D'$. Since $p$ is uniform over $D$, $p'$ is uniform over $D'$.
Then by symmetry, $\mathbb{E}_{p'}[bias(x)]=0$. Apply the additive Chernoff bound again, but this time
set $a=\sqrt{2n\log(20s)}, \rho=\frac{\epsilon}{4}$ and $m=s$, we have
\[
\Pr[\bar{\mu} \geq \frac{\epsilon}{4}] \leq 2e^{-\epsilon^{2}s/64n\log(20s)} < \frac{1}{10}.
\]
Combined with the previous calculation, we see that the total probability of output fail is at most 1/5.
\end{document}