\documentclass[10pt]{article}
\newtheorem{define}{Definition}
%\newcommand{\Z}{{\mathbb{Z}}}
%\usepackage{psfig}

\oddsidemargin=0.15in
\evensidemargin=0.15in
\topmargin=-.5in
\textheight=9in
\textwidth=6.25in

\begin{document}
\input{preamble.tex}

%{lecture number}{lecture date}{Ronitt Rubinfeld}{Your name}
\lecture{15}{April 5, 2006}{Ronitt Rubinfeld}{Jinwoo Shin}
Today, we will review some linear algebra facts which need for us,
and show how a rapidly mixing graph helps randomness.
\section{Linear Algebra Review}
\begin{definition}
$\nu$ is an eigenvector of $A$ with the corresponding eigenvalue
$\lambda$ if $\nu A=\lambda \nu$.
\end{definition}
\begin{definition}
The $L_2$-norm of a vector $v=(v_1,v_2,\dots,v_n)$ is defined as
$\sqrt{\sum_{i=1}^{n} {v_i}^2}$.
\end{definition}
\begin{definition}
The vectors $v^{(1)},\dots,v^{(n)}$ are orthonormal if
$$v^{(i)}\cdot v^{(j)}=\cases{1 & \hbox{if  }i=j \cr 0 & \hbox{otherwise}}$$
where the inner product $v \cdot w$ is defined as $\sum v_i \cdot
w_i$.
\end{definition}
For example, the transition matrix $P$ of the $d$-regular undirected
graph $G$ has an uniform stationary distribution, because $P$ is
doubly stochastic. Therefore, $P$ has an uniform eigenvector
$(\frac1n,\dots,\frac1n)$ with the corresponding eigenvalue $1$.
Also, because the scalar producted vector to the eigenvector is
still an eigenvector, $(\frac1{\sqrt n},\dots,\frac1{\sqrt n})$ is
an eigenvector which has $1$ as the $L_2$-norm. The following
theorem is important in analyzing our algorithm.
\begin{theorem}
If the transition matrix $P$ is a real and symmetric matrix, there
exist eigenvectors $v^{(1)},\dots,v^{(n)}$ which form orthonormal
basis with corresponding eigenvalues $1=\lambda_1 \geq |\lambda_2|
\geq \dots \geq |\lambda_n|$.
\end{theorem}
Let's assume $P$ has eigenvectors $v^{(1)},\dots,v^{(n)}$ with
corresponding eigenvalues $\lambda_1,\dots,\lambda_n$. Then we can
observe the following facts.
\begin{fact}
\begin{itemize}
\item $\alpha P$ has eigenvectors $v^{(1)},\dots,v^{(n)}$ with
corresponding eigenvalues $\alpha\lambda_1,\dots,\alpha\lambda_n$.
\item $P+I$ has eigenvectors $v^{(1)},\dots,v^{(n)}$ with
corresponding eigenvalues $\lambda_1+1,\dots,\lambda_n+1$.
\item $P^k$ has eigenvectors $v^{(1)},\dots,v^{(n)}$ with
corresponding eigenvalues ${\lambda_1}^k,\dots,{\lambda_n}^k$.
\end{itemize}
\end{fact}
Also, if $v^{(1)},\dots,v^{(n)}$ form orthonormal basis, any vector
$w$ can be expressed the linear combination of $v^{(i)}$'s ($w=\sum
\alpha_i v^{(i)}$) and the $L_2$-norm of $w$ is $\sqrt{\sum
{\alpha_i}^2}$.
\section{Mixing Time and Eigenvalues}
Under the observation of the previous section, we can prove the
following main theorem which tells about the mixing time of random
walks.
\begin{theorem}
If $P$ is a transition matrix of an undirected, non-bipartitie,
$d$-regular and connected graph, and $\pi_0$,$\bar \pi$ are the starting
distribution and the stationary distribution respectively, then
$$\parallel \pi_0 P^t - \bar \pi \parallel \leq |\lambda_2 |^t$$
\end{theorem}
\begin{proof}
From the theorem of the previous section, $P$ has eigenvectors
$v^{(1)},\dots,v^{(n)}$ which form orthonormal basis with
corresponding eigenvalues $1=\lambda_1 \geq |\lambda_2| \geq \dots
\geq |\lambda_n|$. Hence, $\pi_0$ can be expressed as $\sum_{i=1}^n
\alpha_i v^{(i)}$, and $\pi_0 P^t = \sum_{i=1}^n \alpha_i v^{(i)}
P^t = \sum_{i=1}^n \alpha_i {\lambda_i}^t v^{(i)}$. Also,
\begin{eqnarray*}
\parallel \pi_0 P^t - \alpha_1 v^{(1)} \parallel &=& \parallel
\sum_{i=2}^n \alpha_i {\lambda_i}^t v^{(i)}
\parallel\\
&=&\sqrt{\sum_{i=2}^n {\alpha_i}^2 {\lambda_i}^{2t}}\\
&\leq& |\lambda_2|^t \cdot \sqrt{\sum_{i=2}^n {\alpha_i}^2}\\
&\leq& |\lambda_2|^t \cdot \parallel \pi_0 \parallel\\
&\leq& |\lambda_2|^t
\end{eqnarray*}
The last inequality holds because the $L_1$-norm of $\pi_0$ is 1 and
its $L_2$-norm is less than its $L_1$-norm. We can check $\alpha_1
v^{(1)}=\bar \pi$ by setting $\pi_0=\bar \pi$ in the above result
and letting $t$ go to $\infty$. Therefore, our result follows.
\end{proof}
\section{Reducing Randomness Requirements}
Suppose probabilistic
polynomial algorithm $A$ outputs a correct value of a function
$f$ with high probability. Let $f$ be a binary function from
$\{0,1\}^n$ to $\{0,1\}$ and assume $A$ 
%be an probabilistic polynomial algorithm which 
tosses $r$ coins such that $\forall x, \Pr[A(x) \neq f(x)]
\leq \frac1{100}$. For getting a better error-ratio (up to $2^{-k}$),
our first algorithm goes like this.
\begin{eqnarray*}
&&\hbox{1. Repeat }O(k)\hbox{ times}\\
&&\hbox{ 1.1. Pick a random }s=(s_1,\dots,s_r) \in \{0,1\}^r.\\
&&\hbox{ 1.2. Run }A(x)\hbox{ with coins }s_1,\dots,s_r.\\
&&\hbox{2. Output the majority answer of the step 1.}
\end{eqnarray*}
This algorithm is just running $O(k)$ copies of $A$ and getting the
majority answer. We can analyze using the Chernoff bounds why this
gives $2^{-k}$ as an error-ratio as we did in the first problem of
the first homework. As you check easily, our first algorithm needs
$O(kr)$ random bits. Our goal is construction an new algorithm which
needs less random bits. This is possible if we use a random walk in a
rapidly mixing graph. For our purpose, let's assume there exists an
undirected, non-bipartitie, $d$-regular and connected graph of $2^r$
nodes which has a transition matrix $P$ such that the absolute value
of its second eigenvalue($\lambda_2$) is less than
$\frac1{10}$. From the theorem in the previous section, we can see
that $\lambda_2$ guarantees the mixing time of the random walk in
the graph. Our second algorithm goes like this.
\begin{eqnarray*}
&&\hbox{1. Pick a random node }s=(s_1,\dots,s_r) \in \{0,1\}^r.\\
&&\hbox{2. Repeat }7k\hbox{ times}\\
&&\hbox{ 2.1. Let a new }s\hbox{ be a random neighbor of an old }s.\\
&&\hbox{ 2.2. Run }A(x)\hbox{ with coins }s_1,\dots,s_r.\\
&&\hbox{3. Output the majority answer of the step 2.}
\end{eqnarray*}
We can easily check that our second algorithm uses $r+7k\cdot \lceil
\log d \rceil = r+O(k)$ random bits.(Assume $d$ is constant.)
Obviously, it is better than $O(kr)$ random bits in our first
algorithm. Now define some notions for analyzing our second
algorithm.(Our analysis is for knowing why our second algorithm
gives $2^{-k}$ as an error-ratio.)
\begin{define}
\begin{itemize}
\item Let $B$ be $\{s|A(x) \neq f(x)$ if $A$ runs with coins $s\}$.
\item Let $N$ be a diagonal matrix such that $N_{ii}=1$ if $i\in B$.
\item Let $M$ be a diagonal matrix such that $M_{ii}=1$ if $i\notin B$.
\end{itemize}
\end{define}
We can see that $|B| \leq \frac1{100}$. Call $s$ 'bad' if $s \in B$,
and 'good' otherwise. Then, if $\rho$ is a probability distribution,
$|\rho N| = \Pr[s$ is bad$]$ and $|\rho M| = \Pr[s$ is good$]$. Let $S$ be the sequence of
"good/bad"("correct/incorrect") of length $7k$, and define $Q_i$ as
follows,
$$Q_i=\cases{M & \hbox{if $S_i$ is 'correct'.} \cr N & \hbox{if $S_i$ is 'incorrect'.}}$$
Then, $\Pr[S]=|\rho PQ_1 \dots PQ_{7k}|$ holds. (This is not a trivial
fact.) Using these notations, we will show why our second algorithm
gives a lower error-ratio ($2^{-k}$) in the next lecture.
\end{document}
