
\documentclass[10pt]{article}
\newtheorem{define}{Definition}
%\newcommand{\Z}{{\mathbb{Z}}}
%\usepackage{amsthm}
\usepackage{epsfig}

\oddsidemargin=0.15in
\evensidemargin=0.15in
\topmargin=-.5in
\textheight=9in
\textwidth=6.25in

\begin{document}
\input{preamble.tex}

%{lecture number}{lecture date}{Ronitt Rubinfeld}{Your name}
\lecture{1}{February 6, 2007}{Ronitt Rubinfeld}{Chih-yu Chao}

%%%% body goes in here %%%%

\section{Testing Global Properties of Distributions}
We will begin by considering how to test properties of distributions
over large domains.  We assume that the only access we have
to the distributions is via samples of the distribution, and
we will focus on the number of samples required in order
to answer our questions about the distribution.
We will begin by studying the sample complexity of determining
whether two distributions are
identical, or far apart according to some measure of distance.
% If a distribution is of high entropy, it means that we will have to change a lot of elements to move the distribution. 
\subsection{Closeness}

\begin{figure}[h]
\begin{center}
\includegraphics[scale=0.6]{f01.eps}
\caption{The black box that outputs the distribution of $p$} \label{F:boxp}
\end{center}
\end{figure}

Here, we will first define the closeness of distributions. Suppose we are given a distribution $p$ in domain $D$, where $|D| = n$, and probabilities ($p_1, \dots p_n$), such that \par
$p_x = \Pr[p \textrm{ outputs } x] \leftarrow$ unknown to the algorithm, $\  \sum_{x \in D}\limits$ $p_x=1$. \\
We will consider algorithms that are given oracle access to a independent identical distribution $p$ (see Figure \ref{F:boxp}) -- each time we press the button on the black box, we get a sample $i$ with probability $p_i$. 
\par

We would like to consider the $L_1$ distance between two distributions: 
\begin{define}[$L_1$ distance]{
\[|p-q|_1 = \sum_x\limits |p_x - q_x|\]
}
\end{define}
Another common  measure is the $L_2$ distance, defined as follows.
\begin{define}[$L_2$ distance]{
\[\|p-q\|_2 = \sqrt{\sum_x\limits(p_x-q_x)^2} \]
}
\end{define}

Note that the L$_1$ norm is defined as
\begin{define}[L$_1$ norm]{
\[|p| = \sum_x\limits |p_x|\]}

\end{define}
and the L$_2$ norm is defined as
\begin{define}[L$_2$ norm]{
\[\|p\| \equiv \sqrt{\sum_x\limits p_x^2}\]
}
\end{define}

\begin{itemize}
\item Often we just look at $\|p-q\|^2_2$, and later we will use this to estimate the distance bewteen two probability distributions.
\item A well-known fact is that $ \|p-q\|_2 \leq |p-q|_1 \leq \sqrt{n} \|p-q\|_2 $, but this is not enough to get a good estimate.
\end{itemize}

Other popular distance measures include
\begin{itemize}
\item[] {\bf KL divergence:} $ p \| q = \sum_i\limits p_i \log \frac{p_i}{q_i}$ \	 \	(not symmetric, not transitive)\\
and
\item[] {\bf Earth Movers Distance (EMD)}\\
\end{itemize}

\subsection{Testing Closeness of Two Distributions}
We would like to test whether two probability distributions $p$ and $q$ are ``close''. 
The black box {\bf p} in Figure \ref{F:boxpq} outputs the distribution $p$, 
while the black box {\bf q} outputs the distribution $q$. 
However, the values of the probabilities of each element
are not accessible to us; 
the only information we know is the samples from the black boxes.

\begin{figure}[h]
\begin{center}
\includegraphics[scale=0.4]{f02.eps}
\caption{The tester output tells us whether the distributions $p$ and $q$ are close} \label{F:boxpq}
\end{center}
\end{figure}
\par
We define $\mathcal{T}$ to be a ``closeness tester'' in 
$\begin{array}{l}\textrm{ L}_1 \\(\textrm{L}_2)
\end{array}$ distance if it satisfies:

\begin{enumerate}
\item if $p=q$, $\Pr[ \mathcal{T}\textrm{ outputs ``pass''}] \geq \frac{2}{3}$
\item if 
$\begin{array}{l}\ \ |p-q|_1 > \varepsilon \\
(\|p-q\|_2 > \varepsilon)\end{array}$, $\Pr[\mathcal{T}\textrm{ outputs ``fail''}] \geq \frac{2}{3}$
\end{enumerate}

\subsubsection*{Comments:}
\begin{enumerate}
\item $\frac{2}{3}$ is an arbitrary constant $> \frac{1}{2}$\\
can repeat $O(\log \frac{1}{\beta})$ times and take majority to get error $< \beta$
\item Probabilities are over coin tosses of algorithm and samples \\
not any assumptions on $p, q$
\item interest in sample/time complexity of $\mathcal{T}$
\end{enumerate}

\subsubsection{Naive Algorithm}
We can use a naive algorithm to test whether two distributions $p$ and $q$ are close:\\
\begin{itemize}
\item[]{}take $m$ samples of $p$ \\
\item[]{}take $m$ samples of $q$ \\
\item[]{}estimate $\hat{p_x} \leftarrow \frac{\textrm{\# of samples of } p \textrm{ that are } x}{m}$
\item[]{}estimate $\hat{q_x} \leftarrow \frac{\textrm{\# of samples of } q \textrm{ that are } x}{m}$
\item[]{}if 
$\begin{array}{l}\ \sum_x\limits|\hat{p_x}-\hat{q_x}| > \frac{\varepsilon}{2} \\
(\sum_x\limits(\hat{p_x}-\hat{q_x})^2 > \frac{\varepsilon}{2})\end{array}$, then ``fail''
\end{itemize}
We will need $\Omega(n)$ samples for this algorithm.

\subsubsection{Collision Probability}
Then, we define the following notions:
\begin{define}[collision probability of $p, q$]
{\[CP(p, q)=\Pr[\textrm{sample from }p\ = \textrm{ sample from }q] = \sum_x\limits p_x q_x\]}
\end{define}

\begin{define}[self-collision probability]
{\[SCP(p)=\Pr[2 \textrm{ samples from }p \textrm{ are equal}] = \sum_x\limits p_x^2 = \|p\|_2^2\]}
\end{define}
\par
The above definitions show that the $L_2$ distance can be expressed in terms of collision probabilities:\\
\[\|p-q\|_2^2 = \sum_x\limits(p_x - q_x)^2 = \sum_x\limits(p_x^2 - 2p_x q_x + q_x^2)= SCP(p) + SCP(q) -2CP(p,q)\] \\
To estimate collision probability $CP(p, q)$, we can do the following:\\
\begin{itemize}
\item[]{}$\textrm{Pick } m \textrm{ samples} \qquad
\begin{array}{l}
u_1 \dots u_m \quad \in P \\
v_1 \dots v_m \quad \in Q \\
\end{array}$
\item[]{}$\sigma_i \leftarrow \left\{ \begin{array}{ll} 1 & \textrm{if } u_i = v_i \\ 0 & \textrm{otherwise} \end{array}\right.$
\item[]{}output $\frac{\sum\sigma_i}{m}$
\end{itemize}

Note that $CP(U_D, U_D)=\sum\left ( \frac{1}{n}\right ) ^2=\frac{1}{n}$.

\subsubsection{The Tester}
The tester is to estimate the $L_2$ distance by estimating the $CP$ and the $SCP$.
\begin{itemize}
\item[]{}Given $p, q, \sum$
\item[]{}$m \leftarrow O(\varepsilon^{-4})$
\item[]{}
$\begin{array}{l}SCP(p), \\ SCP(q)\end{array}\left\{ 
\begin{array}{l}
S_p \leftarrow m \textrm{ samples of }p\\
S_q \leftarrow m \textrm{ samples of }q\\
r_p \leftarrow \textrm{\# of self-collisions in }S_p\\
r_q \leftarrow \textrm{\# of self-collisions in }S_q\\
\end{array}\right.$
\item[]{}
$CP(p, q) \left\{\begin{array}{l}
Q_p \leftarrow m \textrm{ samples from }p\\
Q_q \leftarrow m \textrm{ samples from }q\\
r_{pq} \leftarrow \textrm{\# } i \ j \ \textrm{such that } i^{th} \textrm{ sample in } Q_p = j^{th} \textrm{ sample in } Q_q\\
\end{array}\right.$
\item[]{}$r \leftarrow \frac{2m}{m-1}(r_p + r_q)$
\item[]{}$S \leftarrow 2r_{pq}$
\item[]{} if $r-S > \frac{m^2\varepsilon^2}{2}$, reject
\end{itemize}

\begin{theorem}
$\forall \ \varepsilon, p, q \ \exists \textrm{ an algorithm }\mathcal{T} \textrm{ that runs in}\ O(\varepsilon^{-4})$ sample \& time complexity which satisfies:
\begin{enumerate}
\item if $\|p-q\|_2 < \frac{\varepsilon}{2},\ \Pr[\mathcal{T} \textrm{ outputs ``pass''}] \geq \frac{2}{3}$
\item if $\|p-q\|_2 > \varepsilon,\ \Pr[\mathcal{T} \textrm{ outputs ``fail''}] \geq \frac{2}{3}$
\end{enumerate}
\end{theorem}

\begin{claim}
If $p = U_D$, and $S$ \& $q$ are chosen as follows:
\begin{itemize}
\item[]{}$S \leftarrow \textrm{random subset of }D \textrm{ of size } \frac{n}{2}$
\item[]{}$q \leftarrow U_s$
\end{itemize}
We need $\Omega(\sqrt{n})$ queries to distinguish input $(p, p)$ from input $(p, q)$.
\end{claim}
\begin{proof-idea}{
WLOG, assume the algorithm gives the same result for any permutation of underlying domain. (can only use collision statistics, \emph{i.e.} how many elements appear once, twice, \dots)\\
With high probability, if $m$ is $O(\sqrt{n})$, no collisions.
}
\end{proof-idea}

Note that for the $p,q$ in the above proof,
\[|p-q|_1 = \sum\frac{1}{n} = 1 \qquad \textrm{(really far)}\]
\[\|p-q\|_2 = \sqrt{\sum{\frac{1}{n^2}}} = 
\frac{1}{\sqrt{n}}\qquad \textrm{(tiny)}\]
Thus, the proof says very little about a lower bound for
estimating the $L_2$ distance.


\subsubsection{Analysis}
We would like to show that the expectation of the random variable $r-S$ is equal to the quantity we want to measure. In the next lecture, we will bound the variance of this random variable and use Chebychev's inequality to show that the value of the variable will be close to its expection.
\begin{claim}[Expectation]{The following equalities hold:
\begin{itemize}
\item[]{}$E[r_p] = {m \choose 2}\|p\|^2$
\item[]{}$E[r_q] = {m \choose 2}\|q\|^2$
\item[]{}$E[r_{pq}] = m^2CP(p, q)$
\item[]{}$E[S] = 2m^2CP(p, q)$
\item[]{}$E[r-S] = \frac{2m}{m-1}{m \choose 2} \left ( \|p\|^2 + \|q\|^2\right ) - 2m^2CP(p, q) = m^2\|p-q\|^2_2$
\end{itemize}
}\end{claim}

\begin{proof}{$E[r_p] = {m \choose 2}\|p\|^2$
\[\sigma_{ij} \leftarrow \left\{ \begin{array}{ll}
1 & \textrm{ if } i^{th}\textrm{ sample in }S_p = j^{th}\textrm{ sample in }S_p \\
0 &\textrm{ otherwise}\end{array}\right.\]
\[\Pr[\sigma_{ij}=1] = E[\sigma_{ij}] = \|p\|^2\]
\[r_p = \sum\sigma_{ij},\textrm{ so }E[r_p] = {m \choose 2}\|p\|^2\]
}
\end{proof}

\end{document}
