\documentclass[10pt]{article}
\newtheorem{define}{Definition}
%\newcommand{\Z}{{\mathbb{Z}}}
%\usepackage{psfig}

\usepackage{amstext}
\usepackage{graphicx}

\oddsidemargin=0.15in
\evensidemargin=0.15in
\topmargin=-.5in
\textheight=9in
\textwidth=6.25in

\begin{document}
\input{preamble.tex}

\lecture{11}{March 13, 2007}{Ronitt Rubinfeld}{Christina Sauper}

\section{Estimating the Number of Connected Components}

Given a graph $G(V,E)$ with max degree $d$ and adjacency list representation
and some $\epsilon$, we want to give an additive estimate of the number of
connected components to within $\epsilon$$n$.

\subsection{Main Idea}

Define:
\begin{eqnarray*}
n_u & \equiv & \text{number of nodes in } u\text{'s component, where }u \in V\\
\end{eqnarray*}

\begin{fact} 

    For any connected component $A \subseteq V$:
    \[\sum_{u \in A} \frac{1}{n_u} = \sum_{u \in A} \frac{1}{|A|} = 1\]
    In addition, there are $\sum_{u \in V} \frac{1}{n_u}$ connected components.
\end{fact}
\noindent
    Determining this value exactly takes $\bigO(n^2)$ time, but we will
    estimate the sum and the values of $n_u$. \\

    
\noindent
Define:
\begin{eqnarray*}
\hat{n}_u & \equiv & \min \set{ \text{nodes in }u\text{'s component},
\frac{2}{\epsilon} } \\
\hat{c} & = & \sum_{u \in V} \frac{1}{\hat{n}_u} \\
\end{eqnarray*}

\begin{fact}  The error in estimating $\frac{1}{\hat{n}_u}$ is small.

\[\abs{\frac{1}{\hat{n}_u} - \frac{1}{n_u}} \leq \frac{\epsilon}{2}\]
\end{fact}

Either $\hat{n}_u = n_u$ or $n_u > \hat{n}_u = \frac{2}{\epsilon}$.  In the
latter case, $\frac{\epsilon}{2} = \frac{1}{\hat{n}_u} \geq \frac{1}{n_u} \geq
0$.  Therefore, the error is small, at most $\frac{\epsilon}{2}$.

\begin{corollary} $\frac{1}{\hat{n}_u}$ is a good estimate of connected
components.

\[\sum_{u \in V} \abs{\frac{1}{n_u} - \frac{1}{\hat{n}_u}} \leq
\frac{{\epsilon}{n}}{2}\]
\[c-\frac{{\epsilon}{n}}{2} \leq \frac{1}{\hat{n}_u} \leq c+\frac{{\epsilon}{n}}{2}\]
\end{corollary}

\begin{fact} We can compute $\hat{n}_u$ in $\bigO(\frac{d}{\epsilon})$ time.
\end{fact}

Take $\frac{2}{\epsilon}$ steps of a BFS.  If we see the entire connected
component, set $\hat{n}_u = n_u = \frac{1}{\text{size}}$.  Otherwise,
$\hat{n}_u = \frac{2}{\epsilon}$.\\


\noindent
Summing these $\hat{n}_u$ values yields a linear time algorithm.  Now, we want
to estimate this sum by estimating the average cluster size ($\sum_{u \in V}
\frac{1}{\hat{n}_u}$) and multiplying by $|V|$.

\subsection{Algorithm}

\vspace{1em}
\textsc{Approx\_Num\_CC}($G,\epsilon$)\\
\indent \indent Choose $r = \bigO(\frac{1}{\epsilon^3})$ nodes $u_1$ \dots
$u_r$\\
\indent \indent $\forall u_i$ compute $\hat{n}_{u_i}$\\
\indent \indent Output $\tilde{c} = \frac{n}{r} \sum_{i=1}^r
\frac{1}{\hat{n}_{u_i}}$\\
\vspace{1em}

Runtime of this algorithm is $\bigO(\frac{1}{\epsilon^3} \cdot
\frac{d}{\epsilon}) = \bigO(\frac{d}{\epsilon^4})$.

\begin{theorem} $\Pr \left [ \abs{\tilde{c}-\hat{c}} \leq \frac{\epsilon}{2}n \right ] \geq
\frac{3}{4}$
\end{theorem}

\begin{corollary} Since $\abs{c-\tilde{c}} \leq \abs{c-\hat{c}} +
\abs{\hat{c}-\tilde{c}}$ and $\abs{c-\hat{c}} \leq \frac{{\epsilon}n}{2}$:
\[\Pr \left [ \abs{c-\tilde{c}} \leq {\epsilon}n \right ] \geq \frac{3}{4}\]
\end{corollary}

\begin{proofof}{theorem} We know upper and lower bounds on our estimated
average cluster size:
\[\forall{i} \frac{\epsilon}{2} \leq \frac{1}{\hat{n}_i} \leq 1\]

Using Chernoff bounds, we can compute the error probability for the estimated
cluster size:

\[\Pr \left [ \abs{\frac{1}{r} \sum_{1 \leq i \leq r} \frac{1}{\hat{n}_{u_i}}
- \E \left [\frac{1}{\hat{n}_{u_i} } \right ] } > \frac{\epsilon}{2}\E \left
[\frac{1}{\hat{n}_{u_i}} \right ] \right ] \leq \exp \left ({-\bigO(r\E \left
[\frac{1}{\hat{n}_{u_i}}\right ] \cdot \left (
\frac{\epsilon}{2}\right)^2)}\right ) \leq \frac{1}{4}\] 

Here, using $r = \frac{c}{\epsilon^3}$ samples is good enough for constant
$c$.  The cutoff bound gets a better running time by bounding the maximum vs.
minimum cluster sizes.

Likewise, we can see the error probability for the estimated sum:

\begin{center}
\begin{tabular}{cccc}
$\Pr \left [ \right .$ & $\abs{ \frac{n}{r} \cdot \sum \frac{1}{\hat{n}_{u_i}}
- n \cdot \E \left [ \frac{1}{\hat{n}_{u_i}} \right ] }$ & $\leq$ & $\left .
\epsilon \cdot \E \left [ \frac{n}{\hat{n}_{u_i}}\right ] \right ] \geq
\frac{3}{4}$\\ $\Pr \left [ \right .$ & $\abs{ \tilde{c} -
\hat{c}(=\sum\frac{1}{\hat{n}})}$ & $\leq$ & $\left . \epsilon \cdot
\hat{c}(\leq n)\right ] \geq \frac{3}{4}$\\
\end{tabular}
\end{center}

\end{proofof}

\section{Minimum Spanning Tree}

\subsection{Definitions}

Given a graph $G = (V, E)$ of degree $\leq d$, in adjacency list format and
with edge weights $w_{ij} \in {1 \dots w} \cup {\infty}$.  We will assume
the graph is connected; i.e., there is a minimum spanning tree of finite
weight.

For a tree $T \subseteq E$: 
\begin{eqnarray*}
w(T) & = & \sum_{(ij) \in T} w_{ij} \\
M & = & \min_{T \text{ spans } G} w(T) \\
\end{eqnarray*}

We will assume that all weights are positive and finite, therefore $n-1 \leq M
\le \infty$.

\subsection{Main Idea}

Our goal is to output $\hat{M}$ such that $(1-\epsilon)M \leq \hat{M} \leq (1+\epsilon)M$.  This is close to an $\epsilon$-multiplicative estimate because $\frac{1}{1+\epsilon} \approx 1-\epsilon$.

\noindent
Given a graph $G$:

\begin{eqnarray*}
G^{(i)} & = & \text{edges of }G\text{ which have weight at least }i \\
c^{(i)} & = & \text{number of connected components in }G^{(i)} \\
\end{eqnarray*}

So the number of edges of weight at least $k$ is $c^{(k-1)}-1$.

\noindent
For example:

\begin{tabular}{c|c}
\includegraphics[width=.61in,height=.64in]{graph_a2.png}&\includegraphics[width=.61in,height=.64in]{graph_a1.png} \\
\hline
$G^{(1)}$, $c^{(1)} = 2$ & $G^{(2)}$, $c^{(2)} = 1$ \\
\hline
\end{tabular}

$MST(G) = (n-1) + (c^{(1)} - 1) = n-2 + c^{(1)} = 4$\\

\begin{tabular}{c|c|c}
\includegraphics[width=.63in,height=.72in]{graph_b3.png}&\includegraphics[width=.63in,height=.72in]{graph_b2.png}&\includegraphics[width=.68in,height=.72in]{graph_b1.png} \\
\hline
$G^{(1)}$, $c^{(1)} = 3$ & $G^{(2)}$, $c^{(2)} = 2$ & $G^{(3)}$, $c^{(3)} = 1$ \\
\hline
\end{tabular}

$MST(G) = (n-1) + (c^{(1)} - 1) + (c^{(2)} - 1) = n-3 + c^{(1)} + c^{(2)} = 7$

\begin{claim} $MST(G) = n-w + \sum_{1 \leq i \leq w-1} C^{(i)}$
\end{claim}
\begin{proof}

Let $\alpha_i =$ the number of weight $i$ edges in the MST.

\begin{fact} For any MST of G, $\alpha_i$'s are the same.  Note that $\sum_{i=l+1}^w \alpha_i = c^{(l)} - 1$, and in particular $\sum_{i=1}^w \alpha_i = n-1$; $\alpha_w = c^{(w-1)} - 1$.
\end{fact}

\begin{eqnarray*}
MST(G) & = & \sum_{i=1}^w i\alpha_i \\
 & = & \sum_{i=1}^w \alpha_i + \sum_{i=2}^w \alpha_i + \dots + \alpha_w \\
 & = & n-1 + c^{(1)}-1 + c^{(2)} - 1 + \dots + c^{(w-1)} - 1 \\
 & = & n-w + \sum_{i=1}^{w-1} c^{(i)} \\
\end{eqnarray*}
\end{proof}


\subsection{Algorithm}

\begin{tabbing}
\qquad\=\qquad\=\kill
\textsc{MST\_Approx\_Alg}($G$, $\epsilon$, $w$) \\
\> \FOR $\quad i = 1 \dots w-1$ \\
\> \> $\hat{c}^{(i)} = $\textsc{Approx\_Num\_CC}($G^{(i)}$, $\frac{\epsilon}{w}$) \\
\> Output $\hat{M} = n-w + \sum_{i=1}^{w-1} c^{(i)}$
\end{tabbing}

\noindent
Run time:

There are $w$ calls to \textsc{Approx\_Num\_CC} (run time $\bigO(d/(\frac{\epsilon}{w})^4)$), for an overall run time of $\bigO(\frac{dw^5}{\epsilon^4})$. Because this running time depends on $w$, it is best when there is a good max to min ratio of edge weights.

\begin{proof-sketch}
$\forall i |\hat{c}^{(i)} - c^{(i)}| \leq \frac{\epsilon}{w}n$ (with high enough probability) then $|M-\hat{M}| \leq {\epsilon}n$.

\noindent
Since $M > n$:

$(1-\epsilon)M \leq \hat{M} \leq M+{\epsilon}n \leq M+{\epsilon}M = (1+\epsilon)M$

The lower bound is proved similarly.
\end{proof-sketch}

\noindent
\end{document}