
\documentclass[10pt]{article}
\newtheorem{define}{Definition}
%\newcommand{\Z}{{\mathbb{Z}}}
%\usepackage{psfig}

\oddsidemargin=0.15in
\evensidemargin=0.15in
\topmargin=-.5in
\textheight=9in
\textwidth=6.25in

% pseudocode package:
\usepackage[noend]{algorithm2e}
\incmargin{1em}
\restylealgo{unboxed}
\linesnumbered
\dontprintsemicolon
\SetArgSty{rm}
\SetTitleSty{textsc}{11pt}
\SetKwComment{Comment}{ // }{}

% custom commands:
\newcommand{\textbi}[1]{\textbf{\textit{#1}}}
\newcommand{\la}{\leftarrow}
\newcommand{\ra}{\rightarrow}
\newcommand{\NL}{\textrm{NL}}
\renewcommand{\L}{\textrm{L}}
\newcommand{\A}{\mathcal{A}}
\newcommand{\C}{\mathcal{C}}
\newcommand{\Obj}{\textrm{Obj}}
\newcommand{\eps}{\varepsilon}
\newcommand{\La}{\Leftarrow}

\begin{document}
\input{preamble.tex}

%{lecture number}{lecture date}{Ronitt Rubinfeld}{Your name}
\lecture{9}{March 6, 2007}{Ronitt Rubinfeld}{Khanh Do Ba}

\section{Lempel-Ziv Compression}
\subsection{The algorithm}

LZ77$(w)$\\
\begin{algorithm}[H]
	$t \la 1$\;
	\Repeat{$t>n\;\;(=|w|)$}{
		find longest substring $w_t...w_{t+\ell-1}$ s.t. $\exists$ index $p<t$ with
		$w_p...w_{p+\ell-1} = w_t...w_{t+\ell-1}$\;
		\If{none}{
			next symbol $=w_t$\;
			$t \la t+1$\;
		}\Else{
			next symbol $=(p,\ell)$\;
			$t \la t+\ell$\;
		}
	}
\end{algorithm}

\subsection{Some notation}

The following notation will be used extensively.
\begin{eqnarray*}
	n_\ell(w) &=& \textrm{\# compressed segments of length $\ell$ in $w$, not 
	counting alphabet symbols nor last compressed segment}\\
	C_{LZ}(w) &=& \textrm{\# symbols in compressed string (\# pairs + \# alphabet
												symbols}\\
	d_\ell(w) &=& \textrm{\# distinct substrings of length $\ell$}
\end{eqnarray*}

Examples:
\begin{enumerate}
	\item $aaaaaaa$ is encoded as $a(1,6)$ and has $d_\ell(w)=1 ~\forall\ell\in[7]$.
	\item $abcd$ is encoded as $abcd$.
	\item $abaabaaabaaa$ is encoded as $aa(1,1)(1,4)(4,5)$, where the compressed
				segments are broken up as $a,b,a,abaa,abaaa$.
	\item $abcaabbccaaabbbccc$ is encoded as $abc(1,1)(1,2)(3,2)(3,3)(5,3)(7,3)(3,1)$.
				The compressed segments are broken up as $a,b,c,a,ab,bc,caa,abb,bcc,c$, and
				\begin{eqnarray*}
					d_1(w) &=& 3\\
					d_2(w) &=& 7\\
					d_3(w) &=& 12\\
					d_4(w) &=& 13\\
					n_1(w) &=& 2\\
					n_2(w) &=& 2\\
					n_3(w) &=& 3\\
					n_{\ell>3}(w) &=& 0.
				\end{eqnarray*}
\end{enumerate}

\section{Estimating LZ Compressibility}
\subsection{Main theorem}

The key idea is the following theorem.
\begin{theorem} For any integer $\ell_0>1$, let
	\[
		m=m(\ell_0)=\max_{\ell\in[\ell_0]}\frac{d_\ell(w)}{\ell}.
	\]
	Then
	\[
		m \le C_{LZ}(w) \le 4\left(m\log\ell_0 + \frac{n}{\ell_0}\right).
	\]
\end{theorem}

Given this theorem, to approximate $C_{LZ}(w)$ we need to compute
$2m\sqrt{\log\ell_0} + (\textrm{something})$, where (something) accounts for the
$n/\ell_0$ term. We cannot get a purely multiplicative approximation, so we will
have to settle with a combination of multiplicative and additive errors.

\subsection{The algorithm}

We assume the algorithm EstimateColors$(w,\ell,\gamma,\delta)$, where $w$ is a
string, $\ell$ is a substring length, $\gamma$ is a multiplicative
approximation parameter, and $\delta$ is a confidence parameter.
EstimateColors outputs, with probability $\ge 1-\delta$, a
$\gamma$-multiplicative estimate for $d_\ell(w)$, using
$O\left(\frac{n}{\gamma^2}\ell\log\frac{1}{\delta}\right)$ queries.
We essentially saw how to construct this subroutine in the last lecture (although
instead of querying a random location in $w$, here we will pick a random start index
$i$ and look at the length $\ell$ word beginning at location $i$).\\

\noindent LZ-Approx$(w,A,\eps)$\\
\begin{algorithm}[H]
	$\ell_0 \la 2/A\eps$, $B \la A/2\sqrt{\log\ell_0}$\;
	\For{each $\ell \le \ell_0$}{
		$\hat{d}_\ell \la \textrm{EstimateColors}(w,\ell,B,1/3\ell_0)$\;
	}
	$\hat{m} \la \max_\ell (\hat{d}_\ell/\ell)$\;
	output $\hat{C}_{LZ} = 2\hat{m}\sqrt{\log\ell_0} + \eps n$
\end{algorithm}

\subsection{Algorithm behavior}

Using the union bound, we can say that
\[
	\Pr[\textrm{all $\hat{d}_\ell$'s are good}] \ge 1 - \frac{\ell_0}{3\ell_0}
																							\ge \frac{2}{3}.
\]
Now, if each $\hat{d}_\ell$ is a good $B$-multiplicative estimate, then $\hat{m}$
is as well, giving us
\[
	\frac{\hat{m}}{B} \le m
										\le C_{LZ}
										\le 4\left(m\log\ell_0 + \frac{n}{\ell_0}\right)
										\le 4\left(B\hat{m}\log\ell_0 + \frac{n}{\ell_0}\right).
\]
Rearranging the first, third and fifth term, we get
\[
	\frac{A}{B}\cdot\frac{C_{LZ}-4n/\ell_0}{4B\log\ell_0} + \eps n
			\le \frac{A}{B}\cdot\hat{m} + \eps n
			\le \frac{A}{B}\cdot B\cdot C_{LZ} + \eps n
			= A\cdot C_{LZ} + \eps n.
\]
Plugging in $A/B = 2\sqrt{\log\ell_0}$ and $4n/\ell_0 = 2nA\eps$, we get
\[
	(2\sqrt{\log\ell_0})\cdot\frac{C_{LZ} - 2nA\eps}
					{\frac{4A}{2\sqrt{\log\ell_0}}\cdot\log\ell_0}
			=   \frac{C_{LZ}}{A} - \eps n
			\le 2\hat{m}\sqrt{\log\ell_0} + \eps n
			\le A\cdot C_{LZ} + \eps n,
\]
which gives us our $A$-multiplicative, $\eps n$-additive approximation (with
probability $\ge 2/3$).

\subsection{Query complexity}

Each call to EstimateColors uses $O\left(\frac{n}{B^2}\ell_0\log3\ell_0\right)$
queries, so overall we use $O\left(\frac{n}{B^2}\ell_0^2\log3\ell_0\right)$
queries. Observe, however, that the calls to EstimateColors can reuse queries:
their respective probabilities of success would no longer be independent, but this
is not a problem as we only use the union bound. Thus our query complexity works
out to $\tilde{O}\left(\frac{n}{A^3\eps}\right)$. For instance, with
$\eps=n^{-1/2}$ and $A=n^{1/4}$, we use $O(n^{3/4})$ queries to get a
$n^{1/4}$-multiplicative, $n^{1/2}$-additive estimate.

\subsection{Proof of theorem}

It remains to prove our main theorem. We do this via two lemmas, each of which
implies one of the inequalities in our theorem.

\begin{lemma}[first inequality]
	For every $\ell \in [\ell_0]$, $d_\ell(w) \le C_{LZ}(w)\cdot\ell$.
\end{lemma}

\begin{proof}

	\noindent$(\ell=1)$:
		\[
			d_1(w) = \textrm{\# distinct alphabet characters}\\
						 \le C_{LZ}(w)
		\]
		since when LZ77 sees a new alphabet character it simply outputs it.
	
	\noindent$(\ell>1)$:
	
		We scan from left to right and count the number of new length-$\ell$ substrings.
		Observe that if a substring is entirely contained within some compressed segment,
		it cannot be new! Hence, any new substring must start in one compressed segment
		and end in another (i.e., it must straddle a boundary). But before each boundary,
		there are only $\ell-1$ possible starting positions that would allow a
		length-$\ell$ substring to straddle the boundary, and there are only
		$C_{LZ}(w)-1$ boundaries, so
		\[
			d_\ell(w) \le (C_{LZ}(w)-1)(\ell-1) \le C_{LZ}(w)\cdot\ell,
		\]
		proving our lemma.							
\end{proof}

\begin{lemma}[second inequality]
	Fix any $\ell_0\in[n]$. If $m$ is such that for every $\ell\in[\ell_0]$,
	$d_\ell(w) \le m\ell$, then
	\[
		C_{LZ}(w) \le 4\left(m\log\ell_0 + \frac{n}{\ell_0}\right).
	\]
\end{lemma}

\begin{proof}
	Let $n_\ell = n_\ell(w)$ and $d_\ell = d_\ell(w)$. We will show two things:
	\begin{enumerate}
		\item For every $\ell \in \{1,\dots,\ell_0/2\}$ (wlog, assume $\ell_0$ is even),
					\[
						\sum_{k=1}^\ell n_k \le 2(m+1)\sum_{k=1}^\ell \frac{1}{k}.
					\]
		\item For every $\ell \ge 1$,
					\[
						\sum_{k=\ell+1}^n n_k \le \frac{n}{\ell+1}.
					\]
	\end{enumerate}
	The proof of 2 is easy:
	\[
		(\ell+1)\sum_{k=\ell+1}^n n_k \le \sum_{k=\ell+1}^n kn_k \le n.
	\]
	Where the last inequality follows since the compressed segments are disjoint.

	The proof of 1 we defer until next lecture. Now, we put them together with
	$\ell=\ell_0/2$ to get
	\begin{eqnarray*}
		C_{LZ}(w) & = & \sum_{k=1}^n n_k + 1\\
							&\le& 2(m+1)\sum_{k=1}^{\ell_0/2} \frac{1}{k} + \frac{n}{\ell_0/2+1} + 1\\
							&\le& 2(m+1)(\ln\ell_0+1) + \frac{2n}{\ell_0}\\
							&\le& 4\left(m\log\ell_0 + \frac{n}{\ell_0}\right),
	\end{eqnarray*}
	completing the proof of our lemma and hence our theorem (modulo the proof of 1).
\end{proof}

\end{document}
