
\documentclass[10pt]{article}
\newtheorem{dfn}{Definition}
\newtheorem{clm}{Claim}
%\newcommand{\Z}{{\mathbb{Z}}}
%\usepackage{psfig}

\oddsidemargin=0.15in
\evensidemargin=0.15in
\topmargin=-.5in
\textheight=9in
\textwidth=6.25in

% pseudocode package:
\usepackage[noend]{algorithm2e}
\incmargin{1em}
\restylealgo{unboxed}
\linesnumbered
\dontprintsemicolon
\SetArgSty{rm}
\SetTitleSty{textsc}{11pt}
\SetKwComment{Comment}{ // }{}

% custom commands:
\newcommand{\textbi}[1]{\textbf{\textit{#1}}}
\newcommand{\la}{\leftarrow}
\newcommand{\ra}{\rightarrow}
\newcommand{\NL}{\textrm{NL}}
\renewcommand{\L}{\textrm{L}}
\newcommand{\A}{\mathcal{A}}
\newcommand{\C}{\mathcal{C}}
\newcommand{\Obj}{\textrm{Obj}}
\newcommand{\eps}{\varepsilon}
\newcommand{\La}{\Leftarrow}

\begin{document}
\input{preamble.tex}

%{lecture number}{lecture date}{Ronitt Rubinfeld}{Your name}
\lecture{10}{March 8, 2007}{Ronitt Rubinfeld}{Khanh Do Ba}

\section{Recap}
\subsection{Algorithm and notation}

LZ77$(w)$\\
\begin{algorithm}[H]
	$t \la 1$\;
	\Repeat{$t>n\;\;(=|w|)$}{
		find longest substring $w_t...w_{t+\ell-1}$ s.t. $\exists$ index $p<t$ with
		$w_p...w_{p+\ell-1} = w_t...w_{t+\ell-1}$\;
		\If{none}{
			next symbol $=w_t$\;
			$t \la t+1$\;
		}\Else{
			next symbol $=(p,\ell)$\;
			$t \la t+\ell$\;
		}
	}
\end{algorithm}

Recall the following notation.
\begin{eqnarray*}
	n_\ell(w) &=& \textrm{\# compressed segments of length $\ell$ in $w$, not counting
												alphabet symbols and last}\\
						& & \textrm{compressed segment}\\
	C_{LZ}(w) &=& \textrm{\# symbols in compressed string (\# pairs + \# alphabet
												symbols)}\\
	d_\ell(w) &=& \textrm{\# distinct substrings of length $\ell$}
\end{eqnarray*}

\subsection{Main theorem}

Recall the main theorem, which forms the basis of our LZ-Approx algorithm.
\begin{theorem}\label{thm:1} For any integer $\ell_0>1$, let
	\[
		m=m(\ell_0)=\max_{\ell\in[\ell_0]}\frac{d_\ell(w)}{\ell}.
	\]
	Then
	\[
		m \le C_{LZ}(w) \le 4\left(m\log\ell_0 + \frac{n}{\ell_0}\right).
	\]
\end{theorem}

Last lecture, we already proved the first inequality. We also claimed that
	\begin{enumerate}
		\item[(*)] For every $\ell \in \{1,\dots,\ell_0/2\}$ (wlog, assume $\ell_0$ is even),
					\[
						\sum_{k=1}^\ell n_k \le 2(m+1)\sum_{k=1}^\ell \frac{1}{k}.
					\]
		\item[(**)] For every $\ell \ge 1$,
					\[
						\sum_{k=\ell+1}^n n_k \le \frac{n}{\ell+1}.
					\]
	\end{enumerate}
and proved (**), as well as that (*) and (**) together imply the second inequality.
Today, it remains to prove (*).

\section{Proof of (*) and Theorem \ref{thm:1}}

\begin{proof}
	We reexpress the summation as
	\[
		\sum_{k=1}^\ell kn_k = A + B,
	\]
	where $A =$ \# locations within $\{\ell,\dots,n-\ell\}$ that lie in compressed
	substrings of length $\le\ell$, and $B =$ \# locations within $[n]\setminus
	\{\ell,\dots,n-\ell\}$ that lie in compressed substrings of length $\le\ell$.
	Clearly, $B \le 2\ell$. Let $C =$ \# distinct length-$2\ell$ substrings, so that
	$C \le 2m\ell$ by the definition of $m$. It remains to show that $A \le C$ by
	giving a one-to-one mapping $f: A \ra C$.\\
	
	Define $f$ as follows. For each location $j \in A \subset \{\ell,\dots,n-\ell\}$, 
	let $f(j)$ be the substring of $w$ of length $2\ell$ ending at location $j+\ell$,
	i.e., $w_{j-\ell+1}..w_{j+\ell}$. Clearly $f$ is well-defined, so we are left to
	show that it is one-to-one.\\
	
	Now, let us call a substring $w_t..w_{t+k-1}$ ``new''
	if there exists no $p<t$ such that $w_p..w_{p+k-1}=w_t..w_{t+k-1}$. It follows
	that if $w_t..w_{t+k-1}$ is a compressed substring of length $k$, then
	since LZ always adds the longest previously seen substring, adding any suffix
	would make it new. Moreover, as long as we add some suffix, we can add as much
	of a prefix as we like and the substring would still be new. In other words, a
	substring which entirely contains a compressed substring, plus at least one
	location after it, must be new. In particular, $f(j)$ is new for any $j\in A$,
	since the compressed segment containing $j$ (which is of length $\le\ell$) can
	start no further left than $w_{j-\ell+1}$ and end no further right than
	$w_{j+\ell-1}$, hence is (right-hand-side) strictly contained in
	$f(j) = w_{j-\ell+1}..w_{j+\ell}$. This completes the proof of the theorem.
\end{proof}

\section{Models of Computation for Sublinear Algorithms}

Previously, we studied inputs that were black-box probability distributions, out of
which we were only allowed to take independent samples. In that case we wanted
algorithms sublinear in the domain size. Now, our model allows random access
to the input.  That is,
we are given a large input string and wish to only read a sublinear amount of
it. For example, this was the case in our LZ-Approx algorithm, where we read
random \emph{segments} of the input, instead of single, independent locations.\\

Question: must it always be randomized and/or approximate?

\subsection{Deterministic?}

Consider the problem of computing the diameter of a set of points.
\begin{eqnarray*}
	\textrm{Given:}  && \textrm{$m\times m$ distance matrix $D$, symmetric and
										 satisfying triangle inequality}\\
	\textrm{Output:} && \textrm{diameter $d = \max_{ij}D_{ij}$}
\end{eqnarray*}

We wish to compute $d$ in time sublinear in the input size $n=m^2$. Consider the
following deterministic algorithm.\\

\begin{algorithm}[H]
	Pick $x$ arbitrarily\;
	Pick $y$ to maximize $D_{xy}$\;
	Output $D_{xy}$
\end{algorithm}

\begin{clm}
	$d/2 \le D_{xy} \le d$.
\end{clm}

\begin{proof}
	Let $i,j$ be such that $D_{ij}=d$. Then by triangle inequality,
	\begin{eqnarray*}
		d=D_{ij} &\le& D_{ix}+D_{xj}\\
						 &\le& D_{xy}+D_{xy}\\
						 &\le& 2D_{xy},
	\end{eqnarray*}
	so $D_{xy} \ge d/2$. The other inequality is obvious.
\end{proof}

This is provided as an example of a nontrivial sublinear \emph{deterministic}
algorithm, but in general, we will almost always deal with randomized approximation
algorithms.

\subsection{Property testing}

The general idea in property testing is as we have seen: if the data has the
property, we pass; if the data is ``far'' from having the property, we fail; if it
is in between, we make no guarantee. The important variable here is the definition
of ``far.''

\subsection{A property-tester for connectivity}

Given a graph $G=(V,E)$, where $|V|=n$, $|E|=m$, multiple edges are allowed,
self-loops are not, degree is bounded by $d$, and $G$ is given in adjacency list
format, we must tell whether $G$ is connected. More precisely, if $G$ is connected,
$\Pr[\textrm{output pass}] \ge 2/3$, and if $G$ is $\eps$-far from connected,
$\Pr[\textrm{output fail}] \ge 2/3$.\\

We attempt one definition of being $\eps$-far from connected.

\begin{dfn}\label{dfn:1}
	$G$ is $\eps$-far from connected if we must add at least $\eps dn$ edges to make
	it connected.
\end{dfn}

A potential problem is adding these edges can drastically change the degree bound
$d$. Alternately, let us define

\begin{dfn}
	$P_{n,d} = \{G' :
						 G'\textrm{ is a connected $n$-vertex graph with max degree}\le d\}$.
\end{dfn}

\begin{dfn}
	$G$ is $\eps$-far from $P_{n,d}$ if we must add/delete at least $\eps dn$ edges
	to make it a member of $P_{n,d}$.
\end{dfn}

For this algorithm, we will use Definition \ref{dfn:1}. The key idea is as follows.
If $G$ is far from connected, then it has a lot of connected components, which
means it has a lot of small connected components, which means a random vertex is
reasonably likely to land in a small connected component.\\

Conn-Tester$(G)$\\
\begin{algorithm}[H]
	Choose $m = O\left(\frac{1}{\eps d}\right)$ vertices uniformly at random\;
	\For{each vertex $s$}{
		Run BFS until (a) $> \frac{2}{\eps d}$ new nodes explored or
									(b) $s$ is found to be in a component of size $\le \frac{2}{\eps d}$\;
	}
	\lIf{(b) ever happens}{FAIL }\lElse{PASS}
\end{algorithm}

The number of queries is
\[
	O\left(\frac{1}{\eps d} \cdot \frac{2}{\eps d} \cdot d\right)
		= O\left(\frac{1}{\eps^2d}\right),
\]
where $\frac{1}{\eps d}$ is the number of starting vertices, $\frac{2}{\eps d}$ is
the number of BFS steps, and $d$ is the cost of a BFS step.

\subsection{Correctness of algorithm}

\begin{theorem} The algorithm is a property tester for connectivity. More precisely,
	\begin{enumerate}
		\item[(1)] If $G$ is connected then Conn-Tester$(G)$ passes.
		\item[(2)] If $G$ is $\eps$-far from connected, then
							 $\Pr[\textrm{Conn-Tester$(G)$ fails}] \ge 2/3$.
	\end{enumerate}
\end{theorem}

\begin{proof}
	(1) is obvious. For (2), observe that if $G$ is $\eps$-far from connected, then it
	has $> \eps dn$ connected components. A direct consequence of this is that $G$
	must have $\ge \frac{\eps dn}{2}$ components of size $\le \frac{2}{\eps d}$.
	In particular, each of these components has at least one vertex, so the number of
	vertices in small ($\le \frac{2}{\eps d}$) components is at least
	$\frac{\eps dn}{2}$. It follows that
	\[
		\Pr[\textrm{random $s\in V$ lies in component of size $\le\frac{2}{\eps d}$}]
			\ge \frac{\eps dn/2}{n}
			= \frac{\eps d}{2}.
	\]
	Thus, using the standard approximation $(1-1/x)^x \approx 1/e$,
	\[
		\Pr[\textrm{no $s$ lies in small component}]
			\le \left(1-\frac{\eps d}{2}\right)^{c/\eps d}
			\approx e^{-c/2}
			\le 1/3
	\]
	for large enough $c$.
\end{proof}

\end{document}
