\documentclass[11pt]{article}
\input{../preamble.tex}
\usepackage{amsmath}

\newcommand{\ep}{\epsilon}
\newcommand{\de}{\delta}
\newcommand{\bscp}{\mbox{BSC}_p}
\newcommand{\DD}{\mathcal{D}}
\newcommand{\Ex}{\mbox{Exp}}
\newcommand{\Ball}{\mbox{Ball}}

\begin{document}

\lecture{2}{September 9, 2002}{Madhu Sudan}{Deniss \v{C}ebikins}

\section{Introduction}

In this lecture, we consider classical scenarios in the theory of
communication, in which a sender (which we will refer to as the
\emph{source}) wants to transmit a piece of information, which we will
represent as a binary string, to a receiver. The transmission is to be
conducted through a \emph{channel}, which has certain properties
depending on the scenario we are considering. The natural goal is to
set up a scheme that requires transmission of as little data as
possible and that minimizes the chance of error.

The first situation we deal with concerns the \emph{noiseless
channel}. In this case there is no need to account for transmission
errors, so we can concentrate on minimizing the number of bits that
have to be sent to the receiver. In the second scenario we have a
\emph{noisy channel}, which alters every bit that is transmitted
through it with a small probability $p$. We set a condition that the
probability of the receiver getting an incorrect message is at most
$\ep$, and we show the existence of a certain transmission scheme that
satisfies the condition, and we also show that this scheme has the
best possible efficiency in terms of the amount of data being transmitted.

\section{Noiseless Channel}

Suppose that the source produces a sequence of $n$ independent bits
such that each bit has value $0$ with probability $1-p$ and value $1$
with probability $p$. We use the notation $x \leftarrow \bscp$ to
indicate that a sequence (string) $x$ is generated according to this
principle. Since we would like to minimize the number of bits that we
send to the receiver, it is reasonable for the sender to encode the
message using an encoding function $E$, then send the encoded message, 
so that the receiver can decode the message using a decoding function $D$.
Formally, we need to construct a pair of functions
$$
E\ :\ \{0,1\}^k \rightarrow \{0,1\}^*
$$
$$
D\ :\ \{0,1\}^* \rightarrow \{0,1\}^n
$$
such that $D(E(x)) = x$ for all $x\in \{0,1\}^n$. Our goal is to
minimize the value of the expected size 
$\Ex_{x\leftarrow \bscp}[|E(x)|]$
of the encoded message.

We define a \emph{distribution} on a finite set $S$ to be a function
$\DD\ :\ S \rightarrow [0,1]$ such that $\sum_{x\in S} \DD(x) =
1$. The function 
$$H(\DD) = -\sum_{x\in S} \DD(x) \log \DD(x)$$
is the \emph{entropy function}. The \emph{binary entropy function} is
defined as follows:
$$
H_2(p) = -p\log p - (1-p)\log (1-p).
$$
Notice that $H_2(p) = H(\DD)$ for the distribution $\DD(0) = 1-p$,
$\DD(1) = p$ on the set $\{0,1\}$.

The following result solves the above problem of minimizing data
transmission in a noiseless channel.

\begin{theorem}\label{noiseless}
\emph{\textbf{(Noiseless Coding Theorem)}} There exist functions $E$ and $D$
  satisfying the condition $D(E(x)) = x$ such that
$$
\Ex_{x\leftarrow \bscp} [|E(x)|] = (H_2(p) + o(1))\cdot n,
$$
where $n = |x|$ is the size of the original message.
\end{theorem}

\section{Noisy Channel}

Suppose that the source generates purely random strings (i.e. bits
have values $0$ or $1$ with equal probability). Consider a noisy
channel which alters every bit transmitted through it with probability
$p < 1/2$. We can view the transmission process as follows: on input
$z$, the channel generates a string $\eta \leftarrow \bscp$ of length
$|z|$ and
outputs the modified string $z+\eta$.

As in the previous scenario, we would like to obtain functions
$$
E\ :\ \{0,1\}^k \rightarrow \{0,1\}^n
$$
$$
D\ :\ \{0,1\}^n \rightarrow \{0,1\}^k
$$
for encoding and decoding the message, respectively. One of our goals
is to obtain a pair of such functions such that the probability that
the receiver gets a wrong message after decoding the transmission is
bounded above by a prescribed number $\ep > 0$. Formally, this
condition can be written as follows:
$$
\Pr_{\eta\leftarrow BSC_p \atop x\leftarrow \{0,1\}^k} [D(E(x)+\eta)
  \neq x] \leq \ep.
$$
Having stated the condition, our problem is to maximize the value of
$k/n$ for which such $E$ and $D$ exist.

\begin{theorem}\label{noisy}
\emph{\textbf{(Noisy Coding Theorem)}} Given $\de>0$ and $\ep>0$, there
exist functions $E$ and $D$ satisfying the above conditions such that
$$
{k\over n} = 1-H_2(p)-\de
$$
if $k$ and $n$ are sufficiently large.
\end{theorem}

We use the well-known lemma to prove the theorem.

\begin{lemma}\label{chernoff}
\emph{\textbf{(Chernoff Bound)}} Let $\DD$ be a distribution on
  $\{0,1\}$, and let $x_1$, \dots, $x_N$ be independent bits chosen
  from $\DD$. If $\mu = \Ex_{x\leftarrow\DD}[x]$, then for any $\lambda$ the
  following inequality holds:
$$
\Pr_{x_i\leftarrow\DD}\left[\left|{\sum_{i=1}^N x_i\over N} -
  \mu\right|\geq \lambda\right] \leq e^{-\lambda^2\cdot {N\over 2}}.
$$
\end{lemma}

\noindent
\textit{Proof of Theorem \ref{noisy}.} Let $k$ and $n$ satisfy the
inequality
$$
{k\over n} = 1-H_2(p)-\de,
$$
and suppose that $E\ :\ \{0,1\}^k \rightarrow \{0,1\}^n$ is an
arbitrary function. Define $D(y)$ to be a string $x$ for which the
Hamming distance $\Delta(E(x),y)$ is the smallest possible.

Choose a small constant $\gamma >0$ such that
$$
H_2((p+\gamma)) = H_2(p) + \delta' < H_2(p) + \delta.
$$

Now suppose that a string $x\in\{0,1\}^k$ and the value $E(x) \in
\{0,1\}^n$ are fixed, and the rest of values of $E$ are chosen at
random. For a string $\eta \leftarrow \bscp$ of length $n$, we say
that 
$\eta$ is \emph{bad} if $wt(\eta) > \left(p+\gamma\right)\cdot
n$. It follows from Lemma \ref{chernoff} that there exists a constant
$c>1$ such that
$$
\Pr_{\eta\leftarrow BSC_p} [\mbox{$\eta$ is bad}] < c^{-n}. \eqno{(1)}
$$
(Indeed, we have $\Ex_{\eta\leftarrow BSC_p}[wt(\eta)/n] = p$, so
setting $\lambda = \gamma$ implies that the probability of $\eta$ being
bad is less than $e^{-\gamma^2\cdot {n\over 2}} = c^{-n}$, where $c>1$.)

Define $\Ball(z,r)$ to be the set of all strings $z'$ such that
$\Delta(z,z')\leq r$.
Let $y = E(x) + \eta$. 
Note that
$$
|\Ball(y, (p+\gamma)\cdot n)| = \sum_{i=0}^{(p+\gamma)\cdot n}
\left(n\atop i\right) \leq 1+(p+\gamma)\cdot n \cdot \left(n\atop
(p+\gamma)\cdot n\right) =
$$
$$
=
1 + (p+\gamma)\cdot n\cdot 2^{(H_2(p+\gamma)+o(1))\cdot n} =
2^{(H_2(p) + \delta' + o(1))\cdot n}
$$
since $(p+\gamma)\cdot n = 2^{o(1)\cdot n}$.

Fix $x' \neq x$. Then
$$
\Pr_{E} \left[E(x') \in \Ball\left(y,\left(p+\gamma\right)\cdot
  n \right) \right] = {\left|\Ball\left(y,\left(p+\gamma\right)\cdot
  n\right)\right|\over 2^n} \approx {2^{(H_2(p) + \delta' +
      o(1))\cdot n}\over 2^n}.
$$
We conclude that
$$
\Pr_{E} [\exists\ x'\neq x\ \mbox{s.t.}\
  E(x')\in\Ball(y,(p+\gamma)\cdot n)] \leq {2^k\cdot
  2^{(H_2(p)+\delta'+o(1))\cdot n}\over 2^n} 
= 2^{(\delta'-\delta+o(1))\cdot n}
< a^{-n} \eqno{(2)}
$$
for some $a>1$. If $\eta$ is not bad, i.e. $wt(\eta) \leq
(p+\gamma)\cdot n$, and there is no string $x'\neq x$
such that $\Delta(y, x') \leq (p+\de)\cdot n$, then we have $D(y) =
D(E(x)+\eta)= x$, so the receiver gets the correct message. We
conclude from equations $(1)$ and $(2)$ that
$$
\Pr_{E,\eta} [\mbox{ the receiver gets a wrong message }] < a^{-n} + c^{-n}
$$
for any fixed $x$. It follows that
$$
\Pr_{x,\eta} [\mbox{ the receiver gets a wrong message }] < a^{-n} + c^{-n}
$$
for some function $E$. Finally, observe that $a^{-n} + c^{-n} < \ep$
for sufficiently large $n$. The theorem follows.

\section{Converse Theorem}

The following result shows that the bound in Theorem \ref{noisy}
is in fact tight.

\begin{theorem}\label{converse}
Let $k$ and $n$ satisfy the relation
$$
{k\over n} = 1 - H_2(p) + \delta(n)
$$
for some $\delta(n) = \Omega(1)$.
Then 
$$
\lim_{k,n\rightarrow \infty} \Pr_\eta[D(E(x)+\eta) = x] = 0
$$
for any pair of functions $(E,D)$.
\end{theorem}

We briefly describe the idea of the proof. If $\eta \leftarrow \bscp$,
then with high probability we have $wt(\eta) \geq pn$. There are at
least $\left(n\atop pn\right)$ strings $z$ such that $wt(z) \geq pn$,
and the string $\eta$ equals each such string with probability of at
most $1/\left(n\atop pn\right)$.
Thus in order for the probability of successful decoding to be bounded
below by a positive constant, we need to have for every $x$ about $c\cdot
\left(n\atop pn\right)$ strings $y$ such that $D(y) = x$
(here $c>0$). However,
$$
2^k\cdot c \cdot \left(n\atop pn\right) \approx 2^k \cdot c \cdot
2^{H_2(p)n + o(1)} = c\cdot 2^{n + n\delta (n) + o(1)} > 2^n
$$
for sufficiently large $n$. This is a contradiction, as there are
only $2^n$ strings of length $n$.
\end{document}