\documentclass[10pt]{article}
\newtheorem{definition}{Definition}
%\newcommand{\Z}{{\mathbb{Z}}}
%\usepackage{psfig}
%\usepackage{amsmath,amssymb,amsthm}
\usepackage{amsmath,amssymb}
\newcommand{\BC}{\{-1,1\}^{n}}
\newcommand{\B}{{\mathbb B}}
\newcommand{\Rd}{\mathbb{R}^{d}}
\oddsidemargin=0.15in
\evensidemargin=0.15in
\topmargin=-.5in
\textheight=9in
\textwidth=6.25in
\begin{document}
\input{preamble.tex}
%{lecture number}{lecture date}{Ronitt Rubinfeld}{Your name}
\lecture{21}{April 24, 2007}{Ronitt Rubinfeld}{Ning Xie}
%%%% body goes in here %%%%
In this lecture, we are going to study property testing for clustering problems.
\section{Clustering problems}
We will be working in $d$-dimensional Euclidean space $\Rd$. For any two points $x$ and $y$ in $\R^{d}$,
their $L_2$-distance (Euclidean distance) is defined to be $\text{dist}(x,y)=\sqrt{\sum_{i=1}^{d}(x_{i}-y_{i})^{2}}$.
A Euclidean ball at center $x_{0}\in \R^{d}$ of radius $r$ is the set of points that are within distance $r$ from $x_{0}$,
i.e., $\B_{r}(x_{0})=\{x \in \R^{d}: \text{dist}(x, x_{0}) \leq r\}$.
Let $X$ be a set of $n$ points in $\R^{d}$.
\begin{definition}
$X$ is {\bf $\mathbf{(k,b)}$-radius clusterable} ($(k,b)$-r.c. for short) if $X$ can be partitioned into $k$ subsets
such that each subset can be covered by a Euclidean ball of radius at most $b$.
\end{definition}
Given a set of $n$ points in $\R^{d}$, the problem of deciding if these points are $(k,b)$-r.c. is known to be
NP-hard. Therefore it is unlikely that there is a polynomial-time algorithm for this problem. Can we find an efficient
property testing algorithm for Clustering?
\section{Testing algorithm}
First we define a subset $X$ of $\R^{d}$ of size $n$ to be $\epsilon$-far from $(k,b)$-r.c. if we need to
delete at least $\epsilon n$ points from $X$ in order to make it $(k,b)$-r.c.
We have the following surprisingly efficient testing algorithm for Clustering:
\begin{center}
\fbox{
\begin{minipage}[t]{12cm}
\textbf{Testing Algorithm for Clustering}
\begin{description}
\item[1] sample $m=\Theta(\frac{dk}{\epsilon}\log \frac{dk}{\epsilon})$ points from $X$
\item[2] if the sampled points are $(k,b)$-r.c., PASS;
\item[ ] else FAIL
\end{description}
\end{minipage}
}
\end{center}
There are algorithms that find $k$ balls with minimum radius that contain all $m$ sampled points (known as
Euclidean $k$-Center Problem) in time $O(m^{kd+2})$. Therefore the testing algorithm has both sample complexity
and time complexity independent of $n$, the size of the input.
The rest of this lecture is devoted to proving the following theorem, which shows the correctness of the
testing algorithm.
\begin{theorem} \label{Thm-main}
\begin{enumerate}
\item If $X$ is $(k,b)$-r.c., then tester passes;
\item If $X$ is $\epsilon$-far from $(k,b)$-r.c., then $\Pr[\text{tester fails}] \geq 3/4$.
\end{enumerate}
\end{theorem}
It is clear that if $X$ is $(k,b)$-r.c., then the tester always passes. The non-trivial part of the proof is the
second part. In order to prove the second part, we need to introduce
an important concept called shatter exponent.
\section{Shatter exponent}
We begin with some notations.
\begin{enumerate}
\item $\B_{b}=\{B: B ~\text{is a ball in $\R^{d}$ of radius at most $b$}\}$, i.e., $\B_{b}$ is the set of all balls
in $\R^{d}$ of radius $\leq b$.
\item $\B_{k,b}=\{S: S ~\text{is a union of at most $k$ balls in $\B_{b}$}\}$.
\item $\B_{k,b}^{c}=\{\bar{S}:S \in \B_{k,b}\}$.
\end{enumerate}
\begin{observation}
If $X \subseteq S$ for some $S \in \B_{k,b}$, then $X$ is $(k,b)$-r.c. Conversely, if $X$ is not
$(k,b)$-r.c., then for all $S \in \B_{k,b}$, $X \nsubseteq S$. Or equivalently, if $X$ is not
$(k,b)$-r.c., then for all $\bar{S} \in \B_{k,b}^{c}$, $X \cap \bar{S} \neq \emptyset$.
Similarly, if $X$ is $\epsilon$-far from $(k,b)$-r.c., then $\forall S \in \B_{k,b}$, $|X \setminus S|\geq \epsilon n$;
so, $\forall \bar{S} \in \B_{k,b}^{c}$, $|X \cap \bar{S}|\geq \epsilon n$.
\end{observation}
Therefore, our plan is to show that, with high probability, the sample points contain at least one point of each
$\bar{S} \in \B_{k,b}^{c}$. To this end, we introduce the following definitions.
\paragraph{Definitions}
\begin{enumerate}
\item Let $X \subseteq \R^{d}$ be a point set and $\mathcal{S}$ be a family of subsets of $X$
(i.e., $\mathcal{S} \subseteq 2^{X}$).
A subset $N \subseteq X$ is an \emph{$\epsilon$-net on $X$ w.r.t. $\mathcal{S}$} if for every $S \in \mathcal{S}$
with $|X \cap S| \geq \epsilon |X|$ we have $N \cap S \neq \emptyset$.
Therefore, our hope is to show, w.h.p., the sample set is an $\epsilon$-net on $X$ w.r.t. $\B_{k,b}^{c}$.
\item For any $A \subseteq \R^{d}$, define the projection of $\mathcal{S}$ on $A$ to be
$\varphi_{\mathcal{S}}(A)=\{A \cap S: S \in \mathcal{S}\}$.
\item For each $m\in \mathbb{N}$, define $\varPhi_{\mathcal{S}}(m)$ to be the maximum size of a projection of
$\mathcal{S}$ on a set of size $m$: $\varPhi_{\mathcal{S}}(m)=\max_{A: |A|=m}|\varphi_{\mathcal{S}}(A)|$.
\item The {\bf shatter exponent} of $(X, \mathcal{S})$, denoted SE($X, \mathcal{S}$)\footnote{The shatter exponent is
a lower bound on the corresponding VC-dimension of $(X, \mathcal{S})$. The
VC-dimension (Vapnik-Chervonenkis dimension), a very important notion in machine learning theory, is defined as follows.
For any $A \subseteq X$, we say $A$ is shattered if $\varphi_{\mathcal{S}}(A)=2^{A}$, i.e., the projection of
$\mathcal{S}$ on $A$ includes all possible subsets of $A$. The VC-dimension of $(X, \mathcal{S})$ is the
cardinality of the largest set $A$ that is shattered: $\text{VC-dim}(X, \mathcal{S})=\max\{|A|: A~\text{is shattered}\}$.
By definition, for every $m \leq \text{VC-dim}(X, \mathcal{S})$, $\varPhi_{\mathcal{S}}(m)=2^{m}$. It can be
shown that, for all $(X, \mathcal{S})$, $\text{SE}(X, \mathcal{S}) \leq \text{VC-dim}(X, \mathcal{S})$.}, is
the minimum $\ell$ such that
there exists a constant $c$ with $\varPhi_{\mathcal{S}}(m) \leq c m^{\ell}$ for all $m \geq 2 $.
\end{enumerate}
The importance of shatter exponent is demonstrated in the following theorem.
\begin{theorem}[Haussler and Welzl, 1987]
Let $X$ be a set of points in $\R^{d}$ and let $\mathcal{S}$ be a family of subsets of $X$. Let $U$ be a random
sample set from $X$ of size
\[
|U| \geq \Theta\left(\frac{\text{SE}(X, \mathcal{S})}{\epsilon}\log \frac{\text{SE}(X, \mathcal{S})}{\epsilon}\right),
\]
then $\Pr[U~\text{is an $\epsilon$-net on $X$ w.r.t. $\mathcal{S}$}] \geq 3/4$.
\end{theorem}
We are not going to prove this theorem here; interested readers are referred to the original paper.
Combining our previous observations with this theorem, our task of proving Theorem~\ref{Thm-main} is thus
reduced to proving that $\text{SE}(X, \B_{k,b}^{c})= O(kd)$.
\section{Bounding the shatter exponent}
We will begin with a simple fact for $d$-dimensional Euclidean space. Consider the simplest one dimensional case.
Let $A$ be a set of points on a line. Define $B^{*}(A)$ to be the minimum segment that covers $A$. Then
there are two points $a$ and $b$ in $A$, namely the two endpoints on the line, such that the minimum segment that covers
$\{a,b\}$ is the same segment that covers $A$. Generalizing this observation to higher dimension,
for a point set $A$ in $\R^{d}$, define
$B^{*}(A)$ to be the minimum radius ball in $\R^{d}$ that covers $A$; then we have
\begin{fact}\label{fact-1}
For any finite point set $A\subseteq \R^{d}$, there exists a subset $A' \subseteq A$ with $|A'| \leq d+1$ such that
$B^{*}(A')=B^{*}(A)$.
\end{fact}
It follows that the total number of different minimum radius covering balls for $X$ is at most
$\binom{n}{d+1} < n^{d+1}$.
Our next observation is, for clustering purposes, it is enough to argue in terms of \emph{minimum radius covering balls}
w.r.t. set $X$. That is, all of our previous discussions on the relation between $X$ being $(k,b)$-r.c. and
$b$-radius balls in $\R^{d}$ still apply if we replace the general balls with minimum radius covering balls
w.r.t. $X$. Combining this observation with Fact~\ref{fact-1} gives
\begin{align*}
\varPhi_{\B_{1,b}}(n)
&=\text{the number of minimum radius covering balls w.r.t. $X$ of radius at most $b$}\\
&\leq \text{the number of minimum radius covering balls w.r.t. $X$}\\
&< n^{d+1}.
\end{align*}
Therefore $\text{SE}(X, \B_{1,b}) \leq d+1$ by definition. Since every set in $\B_{k,b}$ is a union of at most
$k$ balls from $\B_{b}$, we get $\text{SE}(X, \B_{k,b}) \leq k(d+1)$. Finally, note that for any $A \subseteq X$,
the intersection $A \cap S$ determines $A \cap \bar{S} = A \setminus S$, so the projections of $\B_{k,b}$ and
$\B_{k,b}^{c}$ on any set have the same size. Therefore, $\text{SE}(X, \B_{k,b}^{c})=\text{SE}(X, \B_{k,b})
\leq k(d+1)=O(kd)$. This completes the proof of Theorem~\ref{Thm-main}.
\end{document}