\documentclass[10pt]{article}
\newtheorem{define}{Definition}
%\newcommand{\Z}{{\mathbb{Z}}}
%\usepackage{psfig}

\usepackage{amsmath}

\oddsidemargin=0.15in
\evensidemargin=0.15in
\topmargin=-.5in
\textheight=9in
\textwidth=6.25in

\begin{document}
\input{../preamble.tex}

\lecture{3}{February 13, 2007}{Ronitt Rubinfeld}{Jeremy Hurwitz}

In this lecture, we will prove the correctness of the equality tester for $L_1$.

\vspace{1em}
\textsc{$L_1$-Dist-Test}($p,q,\epsilon$) \\
\indent \indent $m \leftarrow O(\max(\epsilon^{-2},4)n^{\frac{2}{3}}\log n)$ \\
\indent \indent $S_p \leftarrow m$ samples from $p$ \\
\indent \indent $S_q \leftarrow m$ samples from $q$ \\
\indent \indent $S \leftarrow S_p \cup S_q$ \\ 
\indent \indent discard any element that appears less than $(1 - \frac{\epsilon}{63})mn^{-\frac{2}{3}}$ times in $S$ \\
\indent \indent \IF \ $S \neq \emptyset$ \\
\indent \indent \indent $\hat{p_i} \leftarrow$ number of times $i$ appears in $S_p$ \\
\indent \indent \indent $\hat{q_i} \leftarrow$ number of times $i$ appears in $S_q$ \\
\indent \indent \indent {\bf fail} \IF \ $\displaystyle\sum_{i \in S} |\hat{p_i}- \hat{q_i}| > \frac{\epsilon m}{8}$ \\
\indent \indent \indent define $p'$ as output of following process \\
\indent \indent \indent \indent sample $x \in p$ \\
\indent \indent \indent \indent \IF \ $x \notin S$ output $x$ \\
\indent \indent \indent \indent \ELSE \ output $x \in U_D$ \\
\indent  \indent \indent define $q'$ similarly \\
\indent \indent \ELSE \ $p' \leftarrow p$, \  $q' \leftarrow q$ \\
\indent \indent run $T(p', q', \frac{\epsilon}{2 \sqrt{n}})$ with $O(\frac{n^{\frac{2}{3}}}{\epsilon^4})$ samples \\
\vspace{1em}


$T$ is a tester described in the last lecture which
passes $p,q$ for which the $L_2$ distance
is at most $\epsilon/2$ and fails $p,q$ for which the $L_2$ distance
is at least $\epsilon$.  We assume that the error probability
is at most 1/6.

\section{Useful Claims, Observations, and a Lemma}
We begin by proving a series of claims. We will then analyze the behavior of the algorithm.

\begin{claim}
$\abs{S}\leq 2n^{2/3}(1-\frac{\epsilon}{63})^{-1}$
\label{cards}
\end{claim}
\begin{proof}
If $\abs{S}> 2n^{2/3}(1-\frac{\epsilon}{63})^{-1}$, then
\begin{align*}
\#samples &\geq \abs{S}\cdot(1-\epsilon)mn^{-2/3} \\
					&> 2n^{2/3}(1-\frac{\epsilon}{63})^{-1} \cdot (1-\epsilon)mn^{-2/3} \\
					&= 2m 
\end{align*}
But we only have $2m$ samples. So we have a contradiction.
\end{proof}

\begin{lemma}
With probability greater than $1-\frac{1}{6}$, for all $i$ \\
$(\star)\left\{
\parbox{5in}{
(1) If $p_i\geq\epsilon^2n^{-2/3}$, then $\abs{\hat{p_i}-p_i}<\frac{\epsilon}{63}\max(p_i,n^{-2/3})$\\
(2) If $p_i\leq \epsilon^2 n^{-2/3}$, then $\hat{p_i}<(1-\frac{\epsilon}{63}n^{-2/3})$}\right.$
\label{approx}
\end{lemma}
This means that, most of the time, the "heavy" $p_i$s are well approximated and "very light" $p_i$s will be discarded from $S$. Anytime I refer to Lemma~\ref{approx} "holding", I mean that (1) and (2) are satisfied by the current sampling.

\begin{proof}
The lemma follows from Chernoff bounds.
\end{proof}

\begin{claim}
If $(\star)$ holds, then 
$$\abs{\sum_{i\in S}{\abs{\hat{p_i}-\hat{q_i}}}-\sum_{i\in S}{\abs{p_i-q_i}}}\leq\frac{\epsilon}{9}$$
\label{e9}
\end{claim}
\begin{proof}
Since $(\star)$ holds, $p_i$ and $q_i$ are well approximated.

Therefore, given $i\in S$,
\begin{align*}
\abs{\abs{\hat{p_i}-\hat{q_i}}-\abs{p_i-q_i}} 
	&< \frac{\epsilon}{63}(\max(p_i,n^{-2/3}))+\max(q_i,n^{2/3}) \\
	&< \frac{\epsilon}{63}(p_i+q_i+2n^{-2/3}) \\
\end{align*}

Therefore,
\begin{align*}
\sum_{i\in S}{\abs{\abs{\hat{p_i}-\hat{q_i}}-\abs{p_i-q_i}}}
	&< \sum_{i\in S}{\frac{\epsilon}{63}(p_i+q_i+2n^{-2/3})} \\
	&\leq \frac{\epsilon}{63}(1+1+2n^{-2/3}\cdot\abs{S}) \\
	&\leq \frac{\epsilon}{9}
\end{align*}
for sufficiently small $\epsilon$, by Claim~\ref{cards}.
\end{proof}

\begin{observation} ~\\
\indent (a) For all $i\not\in S$, $p_i'= p_i+\Pr[S]\frac{1}{\abs{D}}$ \\
\indent (b) For all $i\in S$, $p_i'= \Pr[S]\frac{1}{\abs{D}}$ \\
\indent (c) $\sum_{i\not\in S}{\abs{p_i-q_i}}=\sum_{i\not\in S}{\abs{p_i'-q_i'}}$
\label{obser}
\end{observation}

\begin{claim}
If $(\star)$ holds, then for all $i\not\in S$, $p_i'<\frac{1}{\abs{D}}+n^{-2/3}$.
\end{claim}
\begin{proof}
Suppose that $p_i'\geq \frac{1}{\abs{D}}+n^{-2/3}$. Then $p_i\geq n^{-2/3}$. 

Assuming that $(\star)$ holds, $p_i$ is approximated well. So 
\begin{align*}
\hat{p_i} 
	&\geq p_i-\frac{\epsilon}{63}\max(p_i,n^{-2/3}) = p_i(1-\frac{\epsilon}{63})
	&\geq n^{-2/3}(1-\frac{\epsilon}{63})
\end{align*}
which would mean that $i$ was in $S$, contradicting the requirement on $S$.
\end{proof}

Finally, we consider the number of samples that the $L_2$ algorithm reqiures. Given parameters $$\epsilon'=\frac{\epsilon}{2\sqrt{n}} \quad b=n^{-2/3}+\frac{1}{\abs{D}}<2n^{-2/3}$$
we need $$m\geq c(b^2+\sqrt{b}\epsilon^2)\epsilon^{-4}=O(n^{2/3}\epsilon^{-4})$$ samples.

\section{Behavior of the $L_1$ tester}
We analyze the algorithm separately for the case $p=q$ and the case $|p-q|_1>\epsilon$.

\subsection{$p=q$}
Since $p=q$, $\sum_i{\abs{p_i-q_i}}=0$. By Claim~\ref{e9}, and assuming $(\star)$ holds, $$\sum_i{\abs{\hat{p_i}-\hat{q_i}}}\leq\frac{\epsilon}{9}<\frac{\epsilon}{8}$$
which means that the distribution does not fail the first test with probability $1-\frac{1}{6}$.

Since $p=q$ originally, $p'=q'$, so the $L_2$ test returns the correct answer with probability $1-\frac{5}{6}$.

So the total failure rate is bounded by $\frac{1}{6}+\frac{1}{6}=\frac{1}{3}$, as desired.

\subsection{$|p-q|_1>\epsilon$}
Since $|p-q|_1>\epsilon$, at least one of $$\text{ (I) }\sum_{i\in S}{\abs{p_i-q_i}}> \frac{\epsilon}{2} \quad\text{ or }\quad\text{ (II) }\sum_{i\not\in S}{\abs{p_i-q_i}}> \frac{\epsilon}{2}$$
is true.

\subsubsection*{Case I}
If (I) holds, then assuming $(\star)$ holds, $$\sum_{i\in S}{\abs{\hat{p_i}-\hat{q_i}}}\geq \frac{\epsilon}{2}-\frac{\epsilon}{9}>\frac{\epsilon}{8}$$ which means that the algorithm correctly returns "fail". So the algorithm returns correctly at least $1-\frac{1}{6}$ of the time in this case.

\subsubsection*{Case II}
If (II) holds, then $$\sum_{i\not\in S}{\abs{p_i-q_i}}\geq \frac{\epsilon}{2}$$ which means that $$\sum_{i\not\in S}{\abs{p_i'-q_i'}}\geq\frac{\epsilon}{2}$$ by Observation~\ref{obser}. Therefore, $$\abs{\abs{p'-q'}}_2\geq\frac{\epsilon}{2\sqrt{n}}$$ so the $L_2$ test runs correctly with probability $1-\frac{5}{6}$.

So the total failure rate is again bounded by $\frac{1}{6}+\frac{1}{6}=\frac{1}{3}$, as desired.
\end{document}