\documentclass[10pt]{article}
\newtheorem{define}{Definition}
\usepackage{amsmath,amsfonts}

%\newcommand{\Z}{{\mathbb{Z}}}
\newcommand{\e}{\epsilon}
\newcommand{\bq}{\{-1,1\}^{n}}
\newcommand{\Exp}{{\mathbb{E}}}
%\usepackage{psfig}

\oddsidemargin=0.15in
\evensidemargin=0.15in
\topmargin=-.5in
\textheight=9in
\textwidth=6.25in

\begin{document}
\input{preamble.tex}

%{lecture number}{lecture date}{Ronitt Rubinfeld}{Your name}
\lecture{19: PRG for Space-Bounded Computation}{April 16, 2008}{Ronitt Rubinfeld (lecture given by Krzysztof Onak)}{Ning Xie}

\section{Definitions and Models of Computation}
A randomized algorithm $A$ can be thought of as a function $A:\{0,1\}^{n}\times \{0,1\}^{R(n)}\to 
\{\mathrm{Accept},\mathrm{Reject}\}$, that is, 
function $A$ is a \emph{deterministic} algorithm that takes two input strings $x$ and $y$, where
$x$ is the ``real'' input to the randomized algorithm and $y$ is the random string used during
the computation.

\begin{definition}
A (deterministic) function $G:\{0,1\}^{m} \to \{0,1\}^{R(n)}$ is a
\emph{pseudorandom generator (PRG)} for algorithm $A$ with parameter $\epsilon$
if for all $x$,
\[
\left|\Pr_{y}[\text{$A(x,y)$ accepts}]-\Pr_{z}[\text{$A(x, G(z))$ accepts}]\right|\leq \epsilon.
\]
\end{definition} 

We are going to study the following PRG construction.

\begin{theorem}[Nisan 1990]
For any algorithm $A$ that runs in $S(n) = \Omega(\log n)$ space and uses $R(n)$ random bits, there is a pseudorandom generator for $A$ with parameter $\frac{1}{10}$ that uses $O(S(n) \log R(n))$ random bits and runs in $O(S(n) \log R(n))$ space. 
\end{theorem}

The following claim easily follows.

\begin{corollary}[Nisan 1990]
If a randomized algorithm $A$ runs in $S(n) = \Omega(\log n)$ space and uses $R(n)$ random bits, then $A$ can
be converted into a randomized algorithm $A'$ that runs in $O(S(n)\log R(n))$ space and uses $O(S(n)\log R(n))$ random bits.
\end{corollary}

Consider the model of Turing machine computation when space complexity is
our main concern. The TM has two tapes: one is a read-only input tape of size $n$ and the other is
a work tape of size $S(n)$. The space complexity of the TM is $S(n)$ (that is, the read-only
input tape is not counted). Such a TM has at most $n\cdot 2^{O(S(n))}$ configurations, and
if $S(n)=\Omega(\log n)$, this can be bounded by $2^{O(S(n))}$ configurations.

\section{Pairwise Independent Hash Functions and Hash Mixing Lemma}
\begin{definition}
Let $H=\{h:\{0,1\}^{r} \to \{0,1\}^{r}\}$ be a set of functions. $H$ is called
a family of \emph{pairwise independent hash functions} (or a \emph{universal family of hash functions})
if for all $x_{1}\neq x_{2}$ and for all $y_{1}, y_{2}\in \{0,1\}^{r}$,
\[
\Pr_{h\in H}[\text{$h(x_{1})=y_{1}$ and $h(x_{2})=y_{2}$}]=2^{-2r}.
\]
\end{definition}

For our PRG construction purposes, we only need the following well-known fact about universal hash functions.
\begin{fact}
For every $r>0$, there exists a small family $H$ of pairwise independent hash functions that go from $\{0,1\}^r$ to $\{0,1\}^r$ such that each $h\in H$ can be represented by $O(r)$ bits and $h(x)$ can be computed in $O(r)$ space.
\end{fact}
For example, we can take $H$ to be the set of all affine functions over the field $\mathbb{F}_{2^{r}}$.

The next lemma about families of pairwise independent hash functions will be the main technical tool in our proof.
We first need to introduce the following definitions.
\begin{definition}
For a subset $A$ of $\{0,1\}^r$, $\mu(A) = \frac{|A|}{2^r}$.
\end{definition}

\begin{definition}
Let $A, B \subseteq \{0,1\}^{r}$, $h:\{0,1\}^{r} \to \{0,1\}^{r}$ and $\epsilon > 0$.
We say $h$ is \emph{$(\epsilon, A, B)$-good} if 
\[
\left|\Pr_{y\in \{0,1\}^{r}}[\text{$y\in A$ and $h(y)\in B$}]-
 \Pr_{y,z \in \{0,1\}^{r}}[\text{$y\in A$ and $z\in B$}]\right|\leq \epsilon,
\]
or equivalently,
\[
\left|\Pr_{y\in \{0,1\}^{r}}[\text{$y\in A$ and $h(y)\in B$}]-
 \mu(A)\mu(B)\right|\leq \epsilon.
\]
\end{definition}

\begin{lemma}[Hash Mixing Lemma]
Let $H$ be a universal family of hash functions that map $\{0,1\}^r$ to $\{0,1\}^r$, then for any $A, B \subseteq \{0,1\}^{r}$,
\[\Pr_{h\in H}[\text{$h$ is not $(\epsilon, A, B)$-good}] \leq \epsilon,\]
where $\epsilon = 2^{-r/3}$.
\end{lemma}

\begin{proof}
We would like to bound the number of $h\in H$ such that $\left|\Pr_{y\in \{0,1\}^{r}}[\text{$y\in A$ and $h(y)\in B$}]-
\mu(A)\mu(B)\right| > \epsilon$, or equivalently, the number of $h$'s with
\begin{equation}\label{Eq1}
\left|\Pr_{y\in A}[h(y)\in B]- \mu(B)\right| > \frac{\epsilon}{\mu(A)}. 
\end{equation}
Now define an indicator random variable $Z_{y}^{h}$ by
\[
Z_{y}^{h}=
\begin{cases}
1 &\text{if $h(y)\in B$},\\
0 &\text{otherwise}.
\end{cases}
\]
By multiplying both sides by $|A|$, we can rewrite (\ref{Eq1}) in terms of $Z_{y}^{h}$ as
\[
\left|\sum_{y\in A}Z_{y}^{h}- |A|\mu(B)\right| > \frac{\epsilon|A|}{\mu(A)}=\epsilon \cdot 2^{r}.
\]
First note that, since $H$ is a pairwise independent family of hash functions, one can easily check that it is
also $1$-wise independent. Namely, $\Pr_{h\in H}[h(x)=y]=2^{-r}$ for all $x$ and $y$.
It follows that $\Exp{[Z_{y}^{h}]}=\mu(B)$ and
$\Exp{\left[\sum_{y\in A}Z_{y}^{h}\right]}=|A|\mu(B)$. 

Let $Y=\sum_{y\in A}Z_{y}^{h}$, a random variable that depends on $h$. As we already know that
 $\Exp{[Y]}=|A|\mu(B)$, our plan is to compute the variance of $Y$ and use Chebyshev's inequality to
bound from above the probability that $Y$ deviates from its mean.
\begin{eqnarray*}
\Exp{\left[Y^{2}\right]} &=& \Exp{\left[(\sum_{y\in A}Z_{y}^{h})^{2}\right]}=\Exp{\left[\sum_{y \in A}\sum_{z \in A}Z_{y}^{h}Z_{z}^{h}\right]}\\
&=&\Exp{\left[\sum_{y \in A}Z_{y}^{h}Z_{y}^{h}\right]}+\Exp{\left[\sum_{y \in A}\sum_{z \in A, z\neq y}Z_{y}^{h}Z_{z}^{h}\right]}\\
&=&\Exp{\left[\sum_{y \in A}Z_{y}^{h}\right]}+\sum_{y \in A}\sum_{z \in A, z\neq y}\Exp{[Z_{y}^{h}]}\Exp{[Z_{z}^{h}]}\\
&=&|A|\mu(B)+|A|(|A|-1)\mu(B)^{2},
\end{eqnarray*}
where in the second-to-last step, we use the fact that $H$ is a family of 
universal (pairwise independent) hash functions. Therefore,
\[
\Var{[Y]}=\Exp{[Y^{2}]}-\Exp{[Y]}^{2}=|A|\mu(B)+|A|(|A|-1)\mu(B)^{2}-(|A|\mu(B))^{2} \le |A|\mu(B).
\]
Now applying Chebyshev's inequality, which says $\Pr[|Y-\Exp{[Y]}|>\delta]\le\frac{\Var{[Y]}}{\delta^{2}}$,
with $\delta=\epsilon \cdot 2^{r}$, we get
\[
\Pr\left[\left|\sum_{y\in A}Z_{y}^{h}- |A|\mu(B)\right| > \epsilon 2^r \right]\le
\frac{|A|\mu(B)}{\epsilon^{2}2^{2r}}
\le \frac{2^r \cdot 1}{2^{-2r/3} \cdot 2^{2r}} = 2^{-r/3} = \epsilon.\]
This completes the proof of the lemma.
\end{proof}

\section{Nisan's Pseudorandom Generator}
%\setlength{unitlength}{5cm}
\begin{figure}
\begin{center}
\begin{picture}(300,200)(0,80)
% Root
\put(150,180){\circle*{5}}
\put(155,180){$y$}
% First layer
\put(150,180){\vector(-1,-2){20}}
\put(150,180){\vector(1,-2){20}}
\put(162, 160){$\longleftarrow$ $h_{2}$}
% Second layer
\put(128,138){\circle*{5}}
\put(118,138){$y$}
\put(172,138){\circle*{5}}
\put(176,138){$h_{2}(y)$}
% Third layer
\put(128,138){\vector(-1,-3){15}}
\put(128,138){\vector(1,-3){15}}
\put(172,138){\vector(-1,-3){15}}
\put(172,138){\vector(1,-3){15}}
\put(112,91){\circle*{5}}
\put(144,91){\circle*{5}}
\put(156,91){\circle*{5}}
\put(188,91){\circle*{5}}
\put(185, 110){$\longleftarrow$ $h_{1}$}
\put(108,81){$y$}
\put(128,81){$h_{1}(y)$}
\put(152,81){$h_{2}(y)$}
\put(182,81){$h_{1}(h_{2}(y))$}
\end{picture}
\end{center}
\caption{An example of how to construct the generator when $\ell=2$. We assign a randomly chosen hash
function for each layer (in this example $h_{1}$ and $h_{2}$ are the hash functions). The left child is
simply the same string as the parent node and the right child is obtained by applying the hash function
of that layer to the string at the parent node. The output of the generator is the concatenation of
all the strings on the bottom layer (in this example, the output is $y\circ h_{1}(y) \circ h_{2}(y)
\circ h_{1}(h_{2}(y))$).}
\label{fig:PRG}
\end{figure}

Now we describe how to construct the PRGs that ``fool'' space-bounded computation. Define a generator
$G_{\ell}:\{0,1\}^{r}\times H^{\ell}\to (\{0,1\}^{r})^{2^{\ell}}$, where $H$ is
a family of universal hash functions.
We define $G_{\ell}$ recursively as:
\[
G_{0}(y)=y;
\]
and
\[ 
G_{\ell}(y, h_{1}, \ldots, h_{\ell-1}, h_{\ell})=
G_{\ell -1}(y, h_{1}, \ldots, h_{\ell-1})
\circ 
G_{\ell -1}(h_{\ell}(y), h_{1}, \ldots, h_{\ell-1}),
\]
where $\circ$ denotes concatenation. 
That is, we first randomly pick $\ell$ hash functions from $H$ and then recursively apply these 
hash functions to the seed input $y$ of length $r$ to obtain a pseudorandom string 
$G_{\ell}(y, h_{1}, \ldots, h_{\ell})$ of length $2^{\ell}\cdot r$.
An example with $\ell=2$ is illustrated in Figure~\ref{fig:PRG}.

We now consider the following model of randomized computation.
Let us fix $x$, the input to our algorithm $A$. We now create a finite state automaton $Q$
with states corresponding to all possible configurations of the Turing machine on $x$.
Transitions between states in the automaton are driven by consecutive random bits
delivered to the algorithm. Let us denote the number of all states of the automaton by $T$.
Recall that $T = 2^{O(S(n))}$. One of the states is the start state, and some of the states
are marked as accepting states. If after $R(n)$ transitions corresponding to $R(n)$ random bits,
$Q$ ends up at an accepting state, it accepts the input. Otherwise, $Q$ rejects the input.

Let $D$ be a distribution over $\{0,1\}^k$, sequences of $k$ bits. We denote by $Q(D)$
the probability transition matrix of size $T \times T$. The $(i,j)$-th entry of $Q(D)$
equals the probability that the length-$k$ sequence of bits chosen according to $D$
results in transition from the $i$-th state to the $j$-th state. If we know $Q(D)$ for a distribution $D$ on $R(n)$ bits, we can compute the corresponding probability of accepting the input.

Let $U_{\{0,1\}^{n}}$ denote the uniform distribution over $\{0,1\}^{n}$.
Therefore, if the random bits are truly random, the transition matrix of $Q$ will be
$Q(U_{\{0,1\}^{r\cdot 2^{k}}})$. However, if the random bits come from a pseudorandom
generator, the corresponding transition matrix may be different. We need to define a measure of
distance between the effects of two distributions. We use the standard $\ell_1$-norm for this purpose.

\begin{definition}
For any $x \in \R^{s}$, $\|x\|=\sum_{i=1}^{s}|x_{i}|$. 
For any $s \times s$ real-valued matrix $Q$, the $\ell_1$-norm of $Q$ is
\[
\|Q\|=\sup_{\|x\|=1}\|x Q\|.
\]
\end{definition}

\begin{definition}
A sequence of hash functions $(h_{1}, \ldots, h_{k})$ is called $\epsilon$-good if
\[
\|Q(G_{k}(U_{\{0,1\}^{r}}, h_{1}, \ldots, h_{k}))-Q(U_{\{0,1\}^{r\cdot 2^{k}}})\| \leq \epsilon.
\]
\end{definition} 

The correctness of Nisan's PRG follows from the following main lemma.
\begin{lemma}
\[
\Pr[\text{$(h_{1}, \ldots, h_{k})$ is not $(2^{k}-1)T^2\epsilon$-good}]\leq k T^3 \epsilon.
\]
\end{lemma}
\end{document}