\documentclass[10pt]{article}
\newtheorem{define}{Definition}
\usepackage{amssymb}
\usepackage{amsmath}

\newcommand{\normltwo}[1]{\norm{#1}_2}
\newcommand{\normlone}[1]{|#1|_1}
\newcommand{\Exp}{{\mathbb{E}}}

\oddsidemargin=0.15in
\evensidemargin=0.15in
\topmargin=-.5in
\textheight=9in
\textwidth=6.25in

\begin{document}
\input{preamble.tex}

%{lecture number}{lecture date}{Ronitt Rubinfeld}{Your name}
\lecture{2}{February 8, 2007}{Ronitt Rubinfeld}{Adriana Lopez}

In this lecture, we will prove a theorem from last lecture that states that there exists a {\it closeness tester} in $L_2$.  We will also give an algorithm for $L_1$-distance testing.  

Recall the following definitions and facts from last lecture: 
\begin{definition}
$p_x = \Pr[\mbox{$p$ outputs $x$}]$
\end{definition}

\begin{definition}
(collision probability of $p$ and $q$) \\
$CP(p,q) = \Pr[\mbox{sample from $p$ equals sample from $q$}] = \displaystyle\sum_x p_xq_x$
\end{definition} 

\begin{definition}
(self-collision probability of $p$) \\
$SCP(p) = \Pr[\mbox{2 samples from $p$ are equal}] = \displaystyle\sum_x p_x^2 = \normltwo{p}^2$
\end{definition}

\section{$L_2$-Distance Testing}

\begin{theorem}
\label{testerthm}
For all $\epsilon, p, q$, there exists an $O(\epsilon^{-4})$ sample tester $T$ such that
\begin{itemize}
\item If $\normltwo{p-q} < \frac{\epsilon}{2}$ then $\Pr[T \ accepts] \geq \frac{2}{3}$.
\item If $\normltwo{p-q}  > \epsilon$ then $\Pr[T \ rejects] \geq \frac{2}{3}$. 
\end{itemize}
\end{theorem}

We will prove that tester $T$ from last lecture (reproduced below) satisfies the conditions stated in Theorem \ref{testerthm}.  \\
  \\
\indent $T(p,q, \epsilon)$: \\
\indent \indent $m \leftarrow O(\epsilon^{-4})$ \\
\indent \indent $S_p \leftarrow m$ samples from $p$ \\ 
\indent \indent $S_q \leftarrow m$ samples from $q$ \\
\indent \indent $r_p \leftarrow$ number of self-collisions in $S_p$ \\
\indent \indent $r_q \leftarrow$ number of self-collisions in $S_q$ \\
\indent \indent $Q_p \leftarrow m$ (new) samples from $p$ \\
\indent \indent $Q_q \leftarrow m$ (new) samples from $q$ \\
\indent \indent $r_{pq} \leftarrow$ number of pairs $i,j$ s.t. $i$th sample in $Q_p$ is equal to $j$th sample from $Q_q$ \\
\indent \indent $r \leftarrow \frac{2m}{m-1}(r_p + r_q)$ \\
\indent \indent $s \leftarrow 2r_{pq}$ \\
\indent \indent \IF \ $r-s > \frac{m^2\epsilon^2}{2}$ \\ 
\indent \indent \indent \THEN \ reject \\
\indent \indent \indent \ELSE  \ accept \\

Also recall the following fact from last lecture:
$\Exp[r-s] = m^2\normltwo{p-q}^2$ 

\begin{lemma}
\label{var-r-s}
Let $b = \max_x p_x, q_x$.  Then $\Var[r-s] \leq O(m^3b^2 + m^2b)$.  
\end{lemma}

\begin{proof}
First notice that since $r$ and $s$ are independent, we have that $\Var[r-s]=\Var[r]+\Var[s]$.  
We start by computing $\Var[s]$.  Let $c_{ij}$ be the indicator random variable defined as follows:
$$c_{ij} =  \left\{ \begin{array} {ll}
1 & \mbox{ if the $i$th sample from $Q_p$ is equal to the $j$th sample from $Q_q$} \\
0 & \mbox{ otherwise}
\end{array}\right.$$
Notice that $\Exp[c_{ij}] = CP(p,q)$. Define $\bar{c_{ij}} = c_{ij} - \Exp[c_{ij}]$, and notice that $\Exp[\bar{c_{ij}}] = 0$.   It is easy to verify that for all $i,j,k,l$ we have  $\Exp[\bar{c_{ij}}\bar{c_{kl}}] = \Exp[c_{ij}c_{kl}] + \Exp[c_{ij}]\Exp[c_{kl}]$, and thus $\Exp[\bar{c_{ij}}\bar{c_{kl}}] \leq \Exp[c_{ij}c_{kl}]$.  We will use this fact later in the proof.  

>From the definition of $s$, we get 
\begin{eqnarray}
\Var[s] &=& 4\Var[\displaystyle\sum_{i,j} c_{ij}] = 4 \Exp[(\displaystyle\sum_{i,j} c_{ij} - \Exp[\displaystyle\sum_{i,j} c_{ij}])^2]  \nonumber \\
 & = & 4 \Exp[(\displaystyle\sum_{i,j} (c_{ij} - \Exp[c_{ij}]))^2]  = 4\Exp[(\displaystyle\sum_{i,j} \bar{c_{ij}})^2] \nonumber \\
 & = & 4\Exp[\displaystyle\sum_{i,j} {\bar{c_{ij}}}^2] + 4\Exp[\displaystyle\sum_{\substack{i,j,k,l \\ (i,j) \neq (k,l)}} \bar{c_{ij}} \bar{c_{kl}}] \label{vars}
 \end{eqnarray}

We will bound each of the summands in \ref{vars} separately.  Using the fact that we proved above, we get that
$$\Exp[{\bar{c_{ij}}}^2] \leq \Exp[c_{ij}^2] = \Exp[c_{ij}] = CP(p,q)$$
This implies that
\begin{eqnarray}
\Exp[\displaystyle\sum_{i,j} {\bar{c_{ij}}}^2] & \leq  & m^2 CP(p,q) = m^2 \displaystyle\sum_x p_xq_x \nonumber \\
 & \leq & m^2 \displaystyle\sum_x bq_x = m^2b \displaystyle\sum_x q_x \nonumber \\
 & = & m^2b \label{sum1}
\end{eqnarray}

To bound the second summand, first notice that in the case that $i \neq k, j \neq l$ we know that $\bar{c_{ij}}$ and $\bar{c_{kl}}$ are independent, so that $\Exp[\bar{c_{ij}}\bar{c_{kl}}] = \Exp[\bar{c_{ij}}] \Exp[\bar{c_{kl}}] = 0$. This means that
\begin{eqnarray}
\Exp[\displaystyle\sum_{\substack{i,j,k,l \\ (i,j) \neq (k,l)}} \bar{c_{ij}} \bar{c_{kl}}] & = & \displaystyle\sum_{i, j \neq l} \Exp[\bar{c_{ij}}\bar{c_{il}}] + \displaystyle\sum_{j, i \neq k} \Exp[\bar{c_{ij}}\bar{c_{kj}}] \nonumber \\
& \leq & \displaystyle\sum_{i, j \neq l} \Exp[c_{ij}c_{il}] + \displaystyle\sum_{j, i \neq k} \Exp[c_{ij}c_{kj}] \nonumber \\
& \leq & \displaystyle\sum_{i, j \neq l} \displaystyle\sum_x p_xq_x^2 + \displaystyle\sum_{j, i \neq k} \displaystyle\sum_x p_x^2q_x \nonumber \\
& = & m\binom{m}{2}  (\displaystyle\sum_x p_xq_x^2 + \displaystyle\sum_x p_x^2q_x) \nonumber \\
& \leq & 2m^3b^2 (\displaystyle\sum_x q_x + \displaystyle\sum_x p_x) \nonumber \\
& = & 4m^3b^2 \label{sum2}
\end{eqnarray}

Plugging in \ref{sum1} and \ref{sum2} in \ref{vars}, we get that 
$\Var[s] \leq 4m^2b + 16m^3b^2 = O(m^2b + m^3b^2)$.  Similarly, we can prove that $\Var[r] = O(m^2b + m^3b^2)$ and thus $\Var[r-s] = \Var[r] + \Var[s] = O(m^2b + m^3b^2)$.  
\end{proof}

\begin{proofof}{Theorem 1}
We use Chebyshev's inequality to bound the probability that $r-s$ is far from its expectation.  Let Event $A$ be the event that $|(r-s) - \Exp[r-s]| > \frac{m^2\epsilon^2}{4}$.  Using Chebyshev and lemma \ref{var-r-s}, we have that 
$$\Pr[A]  \leq O\left(\frac{m^3b^2 + m^2b}{m^4 \epsilon^4}\right)$$

If we pick $m$ such that $m > \frac{c(b^2 + \epsilon^2 \sqrt{b})}{\epsilon^4}$ with large enough $c$, we can get $O\left(\frac{m^3b^2 + m^2b}{m^4 \epsilon^4}\right) \leq \frac{1}{3}$, or in other words, there exists constant $c$ such that picking $m > \frac{c(b^2 + \epsilon^2 \sqrt{b})}{\epsilon^4}$ yields $O\left(\frac{m^3b^2 + m^2b}{m^4 \epsilon^4}\right) \leq \frac{1}{3}$, and therefore $\Pr[A] \leq \frac{1}{3}$.  

Suppose that $\normltwo{p-q} \leq \frac{\epsilon}{2}$.  Then $\Exp[r-s] = m^2\normltwo{p-q}^2 \leq \frac{m^2\epsilon^2}{4}$.    This means that 
$$\Pr\left[|(r-s) - \frac{m^2\epsilon^2}{4}| \leq  \frac{m^2\epsilon^2}{4}\right] = 1 - \Pr[A] \geq \frac{2}{3}$$
$$\Pr[r-s < \frac{m^2\epsilon^2}{2}] \geq \frac{2}{3}$$
Therefore, if $\normltwo{p-q} \leq \frac{\epsilon}{2}$, $T$ will accept with probability at least $\frac{2}{3}$. 

Now suppose $\normltwo{p-q} > \epsilon$.  Then $\Exp[r-s] = m^2\normltwo{p-q}^2 > m^2\epsilon^2$. This means that
$$\Pr\left[|(r-s) - m^2\epsilon^2| \leq \frac{m^2\epsilon^2}{4}\right] = 1 - \Pr[A] \geq \frac{2}{3}$$
$$\Pr\left[r-s > \frac{3m^2\epsilon^2}{4}\right] \leq \frac{2}{3}$$
Therefore, if $\normltwo{p-q} > \epsilon$ then $T$ will fail with probability at least $\frac{2}{3}$.
\end{proofof}

\section{$L_1$-Distance Testing}

Recall from last lecture that $\normltwo{p-q} \leq \normlone{p-q} \leq \sqrt{n} \normltwo{p-q}$.  An idea would be to use the $L_2$-distance tester $T$ from above with $\epsilon = \frac{\epsilon_0}{\sqrt{n}}$.  If $\normlone{p-q} = 0$, then $\normltwo{p-q} = 0$ and $T$ will accept with at least probability $\frac{2}{3}$.  If $\normlone{p-q} > \epsilon_0$ then $\normltwo{p-q} \geq \frac{\normlone{p-q}}{\sqrt{n}} \geq \frac{\epsilon_0}{\sqrt{n}} $ and $T$ will fail with probability at least $\frac{2}{3}$.  So running $T$ with $\epsilon = \frac{\epsilon_0}{\sqrt{n}}$ gives a valid $L_1$-distance tester.  However, the running time of such tester is $O(n^2)$, which is larger than the running time of the naive tester!

What we will do is combine the naive tester with the $L_2$-distance tester.  We will first use the naive tester to discard high probability elements, and then we'll run $T$ on the remaining elements.  \\


$L_1$-Distance Tester $(p,q,\epsilon)$ \\
\indent \indent $m \leftarrow O(\max(\epsilon{-2},4)n^{\frac{2}{3}}\log n)$ \\
\indent \indent $S_p \leftarrow m$ samples from $p$ \\
\indent \indent $S_q \leftarrow m$ samples from $q$ \\
\indent \indent $S \leftarrow S_p \cup S_q$ \\ 
\indent \indent discard any element that appears less than $(1 - \frac{\epsilon}{63})mn^{-\frac{2}{3}}$ times in $S$ \\
\indent \indent \IF \ $S \neq \emptyset$ \\
\indent \indent \indent $\hat{p_i} \leftarrow$ number of times $i$ appears in $S_p$ \\
\indent \indent \indent $\hat{q_i} \leftarrow$ number of times $i$ appears in $S_q$ \\
\indent \indent \indent {\bf fail} \IF \ $\displaystyle\sum_{i \in S} |\hat{p_i}- \hat{q_i}| > \frac{\epsilon m}{8}$ \\
\indent \indent \indent define $p'$ as output of following process \\
\indent \indent \indent \indent sample $x \in p$ \\
\indent \indent \indent \indent \IF \ $x \notin S$ output $x$ \\
\indent \indent \indent \indent \ELSE \ output $x \in U_D$ \\
\indent  \indent \indent define $q'$ similarly \\
\indent \indent \ELSE \ $p' \leftarrow p$, \  $q' \leftarrow q$ \\
\indent \indent run $T(p', q', \frac{\epsilon}{2 \sqrt{n}})$ with $O(\frac{n^{\frac{2}{3}}}{\epsilon^4})$ samples \\

Notice that $S$ contains high-probability elements.  We will call them ``heavy" elements.  It is possible to prove that with probability at least $1- \frac{1}{n}$, $S$ contains all ``heavy" elements.  We formalize this as follows. 

\begin{definition}
An element $i \in D$ is {\rm heavy} if $p_i > n^{-\frac{2}{3}}$ or if $q_i > n^{-\frac{2}{3}}$.
\end{definition}

\begin{lemma}
With probability at least $1- \frac{1}{n}$, for every $i$:
\begin{enumerate}
\item If $p_i \geq \epsilon^2n^{-\frac{2}{3}}$ then $|\hat{p_i} - p_i| < \frac{\epsilon}{63} \max(p_i, n^{-\frac{2}{3}})$, or in other words, if $i$ is heavy then $i \in S$.   
\item If $p_i < \epsilon^2n^{-\frac{2}{3}}$ then $\hat{p_i} < (1 - \frac{\epsilon}{63})n^{-\frac{2}{3}}$, or in other words, if $i$ is not heavy then $i \notin S$. 
\end{enumerate}
\end{lemma}

\begin{proof-idea}
Use Chernoff bounds. 
\end{proof-idea}

\end{document}