
\documentclass[10pt]{article}
\newtheorem{define}{Definition}
%\newcommand{\Z}{{\mathbb{Z}}}
%\usepackage{psfig}

\oddsidemargin=0.15in
\evensidemargin=0.15in
\topmargin=-.5in
\textheight=9in
\textwidth=6.25in

\newcommand{\dist}{\mathsf{dist}}
\newcommand{\calF}{\mathcal{F}}

\begin{document}
\input{preamble.tex}

%{lecture number}{lecture date}{Ronitt Rubinfeld}{Your name}
\lecture{22}{April 26, 2007}{Ronitt Rubinfeld}{Brendan Juba}

%%%% body goes in here %%%%

\section{Overview}
We will continue examining sublinear time algorithms for clustering. Last time,
we considered a set of $n$ points and gave an algorithm to decide, in a constant
number of queries, ``are they $(k,b)$-radius clusterable?'' Today we'll give an
algorithm for a completely different notion of clustering, which examines the
{\em average} distance of the points from their centers rather than the {\em
maximum} distance. Our algorithm will make $O(\log n)$ queries, which is worse,
but outputs an approximation of how well the data can be clustered rather than
simply testing whether or not a clustering exists. In fact, we'll even see how
to {\em find} a concise representation of an approximate clustering in sublinear
time. It's worth noting that the exact version of this problem is also NP-complete.

\section{Notation and preliminaries}
Let $X$ be a set of $n$ points such that for any two points $x,y\in X$, the 
{\em distance} between $x$ and $y$, $\dist(x,y)$, is at most $M$. Given $k$ {\em
centers}, $c_1,\ldots,c_k\in X$, we define
\[
f_{c_1,\ldots,c_k}(x)=\min_i\dist(x,c_i)
\]
We remark that, in clustering, one should {\em always} check whether the cluster
centers are allowed to be arbitrary points, or whether they are restricted to
come from the input data. In this case, notice that they must lie in $X$, the
set we wish to cluster.

We will define the {\em cost of a clustering} to be the average over all points
of the distance to the closest center, i.e., the cost of the clustering
$f_{c_1,\ldots,c_k}$ is
\[
E_X[f_{c_1,\ldots,c_k}(x)]=\frac{1}{|X|}\sum_{x\in X}f_{c_1,\ldots,c_k}(x)
\]
Our goal to choose centers $c_1,\ldots,c_k$ to minimize this quantity; ideally,
we would like to find the optimal clustering
\[
f^o={\arg\min}_{f_{c_1,\ldots,c_k}:c_1,\ldots,c_k\in X}E_X[f_{c_1,\ldots,c_k}]
\]
and calculate $E_X[f^o]$. Instead, we will output some $\hat{f}$ approximating
$f^o$ and an estimate of $E_X[f^o]$.

For a {\em sample} $S\subset X$ and centers $c_1,\ldots,c_k\in X$, we will also 
consider $E_S[f_{c_1,\ldots,c_k}]$, i.e., the average cost of points in the
sample under the clustering $f_{c_1,\ldots,c_k}$. Notice that $c_1,\ldots,c_k$
are not restricted to lie in $X$!

\section{The Algorithm}
Our algorithm is particularly simple. We pick a sample $S$ of $m=
O\left(\frac{M^2}{\varepsilon^2}\left(k\log n+\ln\frac{2}{\delta}\right)\right)$
points independently and uniformly at random from $X$. We then find, by 
exhaustive search,
\[
(\hat{c}_1,\ldots,\hat{c}_k)={\arg\min}_{c_1,\ldots,c_k\in S}E_S[f_{c_1,\ldots,c
_k}]
\]
Let $\hat{f}=f_{\hat{c}_1,\ldots,\hat{c}_k}$. We then output $\hat{c}_1,\ldots,
\hat{c}_k$ and $E_S[\hat{f}]$.

\section{Analysis}
The correctness of our algorithm is established by the following:
\begin{theorem}[Mishra, Oblinger, Pitt]
With probability at least $1-\delta$,
\[
E_X[f^o]-\varepsilon\leq E_S[\hat{f}]\leq 2E_X[f^o]+2\varepsilon
\]
and $E_X[\hat{f}]\leq 2E_X[f^o]+3\varepsilon$.
\end{theorem}
We will prove this shortly. First, we will require a couple of lemmas.

Recall that the analysis of our algorithms for the MST problem depended on the 
ratio of the maximum weight to the minimum weight. Our dependence on the maximum
distance between points comes from our use of a Chernoff bound on the sums of
bounded range random variables in the following lemma:
\begin{lemma}\label{lem1}
Let $\calF$ be a set of functions on $X$ such that $\forall f\in\calF,x\in X$
$0\leq f(x)\leq M$. Let $S=\{s_1,\ldots,s_m\}$ be samples chosen uniformly at 
random from $X$ such that 
$m\geq\frac{M^2}{2\varepsilon^2}\left(\ln|\calF|+\ln\frac{2}{\delta}\right)$.
Then
\[
\Pr[\exists f\in\calF:|E_X[f]-E_S[f]|\geq\varepsilon]\leq\delta
\]
\end{lemma}
\begin{proof}
Recall our additive Chernoff bound for bounded range random variables: given
$X_1,\ldots,X_m$ i.i.d. random variables with range $[-a,a]$, $\hat\mu=\frac
{1}{m}\sum_{i=1}^mX_i$ and $\mu=E[\hat\mu]$, $\forall\rho>0$,
\[
\Pr[|\hat\mu-\mu|>\rho]\leq 2\exp\left(-\frac{\rho^2}{2a^2}m\right)
\]
Fix $f\in\calF$. Let $a=M/2$, $X_i=f(s_i)-M/2$, and put $\rho=\varepsilon$. 
Notice that $\hat\mu=E_S[f]-M/2$ and $\mu=E_X[f]-M/2$. Therefore, the bound 
tells us that
\[
\Pr[|E_S[f]-E_X[f]|>\varepsilon]\leq 2\exp\left(-\frac{2\varepsilon^2}{M^2}m
\right)\leq \frac{\delta}{|\calF|}
\]
for our fixed choice of $f\in\calF$. By a union bound over each fixed 
$f\in\calF$ now,
\[
\Pr[\exists f\in\calF:|E_S[f]-E_X[f]|>\varepsilon]\leq\sum_{f\in\calF}
\Pr[|E_S[f]-E_X[f]|>\varepsilon]\leq\delta
\]
\end{proof}

Now, our algorithm minimizes the cost on its sample $S$ over choices of centers
from $S$ rather than $X$. We argue that this is not more than a factor of two
worse than we would have achieved if we minimized the cost on $S$ over choices
of centers from all of $X$:
\begin{lemma}\label{lem2}
Let $\calF_S=\{f_{c_1,\ldots,c_k}:c_1,\ldots,c_k\in S\}$, $\calF_X=\{f_{c_1,
\ldots,c_k}:c_1,\ldots,c_k\in X\}$. If $f^*={\arg\min}_{f\in\calF_X}E_S[f]$,
then $\exists\hat{f}\in\calF_S$ such that $E_S[\hat{f}]\leq 2E_S[f^*]$.
\end{lemma}
\begin{proof}
Suppose $f^*=f_{c_1,\ldots,c_k}$ for $c_1,\ldots,c_k\in X$. Let $c_i'$ be the
closest point in the sample to $c_i$, and let $\hat{f}=f_{c_1',\ldots,c_k'}$.

For $p\in S$, suppose $f^*$ assigned $p$ to the cluster with center $c_i$ and
$\hat{f}$ assigned $p$ to the center $c'_j$. (Note that in general we may have
$i\neq j$.) Now, by our choice of assignments:
\[
\dist(p,c'_j)\leq\dist(p,c'_i)\leq\dist(p,c_i)+\dist(c_i,c'_i)
\]
Notice, since $p\in S$ and $c'_i$ was chosen to be the closest point in $S$ to
$c_i$, $\dist(c_i,c'_i)\leq\dist(p,c_i)$. Therefore, we see that $\dist(p,c'_j)
\leq 2\dist(p,c_i)$, and hence
\[
E_S[\hat{f}]=
\frac{1}{|S|}\sum_{p\in S}f_{c'_1,\ldots,c'_k}(p)\leq
\frac{1}{|S|}\sum_{p\in S}2f_{c_1,\ldots,c_k}(p)=
2E_S[f^*]
\]
\end{proof}

\begin{proof}{\bf (of Theorem) }
First, observe that by Lemma \ref{lem2}, (again letting $f^*={\arg\min}_{f\in
\calF_X}E_S[f]$)
\[
E_S[\hat{f}]\leq 2E_S[f^*]\leq 2E_S[f^o]
\]
where the second inequality follows since $f^*$ is chosen to be an element of
$\calF_X$ that minimizes $E_S[f^*]$, while $f^o$ is just some other element of 
$\calF_X$.

Second, notice that the number of choices for $c_1,\ldots,c_k\in X$ is $n^k$, so
$|\calF_X|=n^k$, and by Lemma \ref{lem1} $m$ is sufficiently large that with
probability at least $1-\delta$, $\forall f\in\calF_X$
\[
|E_X[f]-E_S[f]|\leq\varepsilon
\]
In particular, for $\hat{f}$,
\[
E_X[\hat{f}]\leq E_S[\hat{f}]+\varepsilon
\]
where, since $f^o$ was chosen to minimize $E_X[f^o]$ and $\hat{f}$ is some other
element of $\calF_X$, $E_X[f^o]\leq E_X[\hat{f}]$, so
\[
E_X[f^o]-\varepsilon\leq E_S[\hat{f}]
\]
Now, notice that Lemma \ref{lem1} also guarantees for $f^o$ that
\[
E_S[f^o]\leq E_X[f^o]+\varepsilon
\]
so, returning to our first inequality above, we further find
\[
E_S[\hat{f}]\leq 2E_X[f^o]+2\varepsilon
\mathrm{\ and\ }
E_X[\hat{f}]\leq 2E_X[f^o]+2\varepsilon+\varepsilon
\]
as needed.
\end{proof}

\section{Preview}
Next time we will talk about testing properties of functions rather than testing
properties of data. One may well wonder what the difference will be, and 
justifiably so---there won't be any. We will view the function as being given by
an input/output table, where we query an oracle for an input of the function and
obtain the corresponding output. In this setting ``$\varepsilon$-closeness''
will be taken to mean that we need to change the table in at most a $\varepsilon
$-fraction of its locations to make the function satisfy a given property.

The first problem we will consider will be monotonicity testing, i.e., looking at
the table to test whether it is sorted or whether we need to delete a
$\varepsilon$-fraction of the entries to make it sorted. We will first consider
the one-dimensional case, and then move to higher dimensions.
\end{document}




