\documentclass[10pt]{article}

\oddsidemargin=0.15in \evensidemargin=0.15in \topmargin=-.5in
\textheight=9in \textwidth=6.25in

\usepackage{amsfonts,amssymb,amsmath,amsthm}

%\newtheorem{theorem}{Theorem}
%\newtheorem{proposition}{Proposition}
%\newtheorem{claim}{Claim}
%\newtheorem{conj}[theorem]{Conjecture}
%\newcounter{cor} \newtheorem{corollary}[cor]{Corollary}

\setcounter{secnumdepth}{3} %no more than ? depth numbering
%\newcounter{lammata} \newtheorem{lemma}[lammata]{Lemma}
%\newcounter{def} \newtheorem{definition}[def]{Definition}

\newcommand\remove[1]{}
 \newcommand{\comment}[1]{\begin{quote} \texttt{Adi's Comment: #1}
\end{quote}}

\def\endproof{{\hfill\rule{2mm}{2mm}}}
\newcommand\card[1]{\left| #1 \right|}
\newcommand\tup[1]{\langle #1 \rangle}
\newcommand\sett[2]{\left\{ \left. #1 \;\right\vert #2 \right\}}
\newcommand\settright[2]{\left\{ #1 \;\left\vert\; #2 \right.\right\}}
\newcommand\set[1]{{\left\{ #1 \right\}}}
\renewcommand\emptyset{\phi}
\newcommand\Prob[2]{{\Pr_{#1}\left[ {#2} \right]}}
\newcommand\cProb[3]{{\Pr_{#1}\left[ \left. #3 \;\right\vert #2 \right]}}
\newcommand\Expc[2]{{{\bf E}_{#1}\left[ {#2} \right]}}
\newcommand\ang[1]{\< {#1}\>}
\newcommand\norm[1]{\| #1 \|}
\newcommand\power[1]{\set{0,1}^{#1}}
\newcommand\defeq{\stackrel{def}{=}}
\newcommand\mycases[4]{{
\left\{
\begin{array}{ll}
{#1} & {#2}\\\\
{#3} & {#4}
\end{array}
\right. }}
%\def\RR{\mathbb R}
\def\del{\delta}
\def\eps{\varepsilon}
\def\Sig{\Sigma}
\def\sig{\sigma}
\def\bks{{\backslash}}
\def\nek{,\ldots,}
\def\<{\langle}
\def\>{\rangle}
\def\ie{\textit{i.e.}}
\def\eg{\textit{e.g.}}
\def\a{\alpha}
\def\b{\beta}
\def\f{\frac}
\def\o{\omega}
\def\t{\theta}
\def\ora{\overrightarrow}
\def\cal{\mathcal}
\newcommand\expectation{\mathop{{\mathbb{E}}}}
\def\e{\expectation}
\def\l{\left}
\def\r{\right}

\newcommand{\pr}{\medskip\noindent\textit{Proof}. }
\newcommand{\prs}{\medskip\noindent\textit{Proof Sketch}. }
\newcommand{\pri}{\medskip\noindent\textit{Proof Idea}. }
\newcommand\drightarrow{\stackrel{d}{\rightsquigarrow}}

\newcommand\mymod[1]{[{#1}]_p}
\newcommand\half{{\frac 1 2}}

\newcommand\ff{{\mathsf f}}
\def\gg{{\mathsf g}}
\newcommand\hh{{\mathsf h}}
\newcommand\dd{{\mathsf d}}
\newcommand\V{{\cal V}}
\newcommand\Complex{\mathbb{C}}
%\newcommand\abs[1]{\mathsf{abs}(#1)}
\newcommand\floor[1]{\lfloor(#1)\rfloor}
\newcommand\ceil[1]{\lceil(#1)\rceil}

%\newcounter{rem} \newtheorem{remark}[rem]{Remark}
\newcounter{ex} \newtheorem{example}[ex]{Example}
\newtheorem{notation}{Notataion}
\newtheorem{aside}{Aside}

% \newcommand{\R}{\mathcal{R}}
 \newcommand{\A}{\mathcal{A}}
 \newcommand{\U}{\mathcal{U}}
 \newcommand{\zo}{\set{0,1}}
 % \newcommand{\e}{\mathbb E}
% \newcommand{\defeq}{\doteq}

\def\Ball{\cal B}
%\def\F{\mathbb F}

\begin{document}
\input{preamble.tex}

\lecture{3}{September 15, 2004}{Madhu Sudan}{Adi Akavia}


\section*{Today's Plan}
\begin{itemize}
\item Converse coding theorem
\item Shannon vs. Hamming theories
\item Goals for the rest of the course
\item Tools we use in this course
\end{itemize}

\section*{Shannon's Converse Theorem}

We complete our exposition of Shannon's theory from last lesson in
proving Shannon's Converse Theorem. Namely, we show that there
exists no encryption and decryption algorithms (not necessarily
efficient) such that transmit messages in rate exceeding $H(p)$.

Let us denote by $BSC_{p,n}$ the Binary Symmetric channel that
transmits $n$ bits in each time step, where each bit of the
message is flipped w.p. $p\leq \half$.

\begin{theorem} [Shannon's Converse Theorem]
$\forall BSC_{p,n},\ n \ge n_0,\ \eps>0, R > 1-H(p) + \eps n$, and for all encoding and decoding $E\colon\zo^{Rn}\to\zo^n$ and
$D\colon\zo^{n}\to\zo^{Rn}$,
$$\Pr_{m\in\zo^{Rn},\eta\in BSC_{p,n}}[D(E(m)+\eta) = m] \le
\exp(-n)$$
\end{theorem}
\pr Informally speaking, the proof is based on two facts: On the
one hand, there are roughly $Volume(Ball(p\pm\eps),n)) \approx
2^{H(p)n}$ received words are likely to be a corrupted version of
$m$, since w.h.p $np\pm \eps$ errors falls in any transmitted
message $m$. On the other hand, the average number of received
words that decode to $m$ is $2^{n-k}$, since
$D\colon\zo^{n}\to\zo^k$, for $k=Rn$. Now, since $2^{H(p)n}
>> 2^{n-k}$, then each corrupted word is mapped back to its
original with only a very small probability.

More formally, fix some encoding and decoding mappings $E,D$. Let
$I_{m,\eta}$ be a {\em correct decoding indicator}, namely,
$I_{m,\eta} = 1$ if $D(E(m)+\eta) = m$, and $I_{m,\eta} = 0$ o/w.
We are interested in the sum $\f 1 {2^k}\sum_{m,\eta} I_{m,\eta}$.
To compute this sum, note that when we fix some received vector $r
= E(m)+\eta$, we can write the summation as $\sum_r\sum_m
I_{m,r-E(m)}$. Now, since there is a unique $m_r$ s.t. $D(r)=m_r$,
then $I_{m,r-E(m)}=1$ iff $m=m_r$, and $I_{m,r-E(m)}=0$ otherwise.
Therefore,
$$\sum_{m,\eta} I_{m,\eta} = 2^n$$

To compute the probability of correct decoding, fix $p'=p-\eps$ (note that $p'$
still satisfies $R>1-H(p')$). We consider two types of error vectors $\eta$: (a) $\eta$ has at most $p'n$ non-zero entries, and (b) $\eta$ has more than $p'n$ non-zero entries. By Chernoff Bound, event (a) happens with very small probability $\exp(-n)$. Therefore,
\begin{eqnarray*}
\Pr[\mbox{correct decoding}] &\le& \exp(-n) + \Pr[\mbox{correct decoding}|\eta\mbox{ has more than $p'n$ non-zero entries}]\\
&=&\exp(-n) + \sum_{\eta\notin \Ball(p'n,n)}\sum_m
\Pr[\mbox{message $m$, error $\eta$, }I_{m,\eta}=1]
\end{eqnarray*}
As the choices of message,  error and decoding algorithm are independent, and $\Pr[m] = 2^{-k}$, $\Pr[\eta | \eta\notin \Ball(p'n,n)] \le \f 1 {{n \choose {p'n}}} \approx 2^{-H(p')n}$, we have:
$\Pr[\mbox{message $m$, error $\eta$, $I_{m,\eta}=1$}] \le 2^{-k} \cdot 2^{-H(p')n}$. Therefore,
\begin{eqnarray*}
\Pr[\mbox{correct decoding}] &\le& \exp(-n) + \sum_{\eta\notin B(p'n,n),m} 2^{-k} \cdot 2^{-H(p')n}
\cdot I_{m\eta} \\
&\leq& \exp(-n) + 2^{-(k+H(p')n-n)} = \exp(-n)
\end{eqnarray*}
\hfill$\square$

\section*{Generalizations of Shannon's Theorems}

Shannon's paper profoundly influenced both theory and practice of communication, as well as other fields. The theorems we stated and proved so far cover only a small part of the paper. A few generalization we'd like to mention follows.

Shannon not only considers the Binary Symmetric channel but also channels where the input and output are taken from arbitrary alphabets
$\Sigma$ and $\Gamma$. E.g, $\Sigma =\set{\pm 1}$, $\Gamma =
\mathbb{R}$. Shannon shows that as long as the error has some
finite variance, then we might as well think of the channel as
having some {\em finite} capacity (even when the alphabet in infinite as in $\mathbb{R}$!).

Shannon also considers more general probability distributions on the error of the
channel. In particular, Shannon considers Markovian error model. Markovian models can capture situations where there are bursts of huge amounts of error, by considering a finite set of correlated states. This models influenced not only communication and coding theory but also other area such as Natural Language Processing, where it lead to the $n$-gram model.


\section*{Shannon vs. Hamming}

We now contrast the works of Shannon and Hamming

\begin{itemize}
\item Both state formal mathematical models, and prove both feasibility and
infeasibility.
\item Constructive vs. Non-constructions. Shanon work deals with
constructive settings of encoding and decoding algorithms, while Hamming consider codes as combinatorial objects $\set{E(x)}_x$. Yet, Hamming's
proves constructive results (yielding the Hamming code), while the
technique of the probabilistic method used by Shannon is very un-constructive.
\item Worst-case vs. Probabilistic/Average case theory. Hamming deals with adversarial situations, namely, he analyzes worst-case scenarios; in contrast, Shanon analyzes average-case scenarios with a predefined error model of the channel, or message generation model for the source. Shannon average-case theory gives is far more complete than Hamming's worst-case theory, where there are still many open questions.
\end{itemize}

\section*{Course Goals}

In this course we explore questions arising both from Hamming's and from Shannon's works.

\subsection*{Targets arising from Shannon's Theory}

The main target arising from Shannon's theory is to explicitly find efficient encoding and decoding function. In particular, for Binary Symmetric channel, $BSC_p$, can we come up with polynomial-time encoding
and decoding functions? Or, better yet, linear-time functions?

Shannon tells us that for any rate $R<1-H(p)$ we can encode/decode. However, wjem we are guaranteed $1-H(p)-R = \eps$, we are also interested in studying the complexity as a function of $\eps$. In fact, a lot of the current research is
concentrated on this type of questions.

\subsection*{Targets arising from Hamming's Theory Targets}

To find efficient encoding and decoding algorithms we naturally require good codes. This leads us to targets of Hamming Theory of constructing "good codes", and associating with them efficient encoding/decoding functions. Let us remark that in the analysis of this codes we might consider both adversarial and probabilistic models.

To be more accurate, we must specify what are "Good Codes". Let us list the the interesting parameters and whether we want them to be maximized or minimized.

\noindent{\bf Parameter of Error-Correcting Codes $n,k,d,q$}
\begin{itemize}
\item $n$ is the block length, namely, the code is a subset $\Sigma^n$. We'd like to minimize $n$.
\item $k$ is the information length, that is the length of the messages. Note that the size of the code is $\card{Code} = \card{\Sigma^k}$. We'd like to maximize $k$.
\item $d$ is the minimum distance of code $d=\min_{x\neq y}\Delta_{Hamming}(x,y)$. We'd like to maximize $d$.
\item $q$ is the alphabet size $\card{\Sigma}$.
It is not clear whether want $q$ to be minimized of maximized. Nonetheless, we general assume that we want to minimize $q$, as empirical observation indicate that
it's easier to design good codes with smaller alphabet.
\end{itemize}

To simplify the parameters, we usually consider normalized
parameters: $R = \f k n$ (to be maximized),  $\delta = \f d n$ (to be minimized), and study $R$ vs. $\delta$. To further simplify the parameters, we also often restrict ourselves to $q=2$.

\section*{Tools}

The tools we use in this class usually comes from either Probability Theory or Algebra of Finite Fields. A summary of those tools follows.

\subsection*{Probability Theory}
\begin{itemize}
\item Linearity of expectation
\item Union bound
\item Probability of product of independence variables is the product of their individual probabilities
\item Tail bounds (\ie, the probability that a random variable deviates from its mean):
\begin{itemize}
\item Markov's Inequality: If $X>0$, then $\Pr[X>kE[X]]\le 1/k$
\item Checyshev's Bound: Mardov's inequality applied to $(X-E[X])^2$)
\item  Chernoff Bound: if $X_1...X_n \in [0,1]$ are independent random variables with expectations $E[X_i]=p$, then $\Pr[ \card{\f {\sum_i X_i}{n} - p} > \eps] \le
e^{-\eps^{2n/2}}$
\end{itemize}
\end{itemize}

\subsection*{Algebra of Finite Fields}

\subsubsection*{Fields and Vector Spaces}

\begin{definition}[Field]
A {\em Field} $(\F, +, \cdot, 0, 1)$ is a set $\F$ with addition and multiplication operations $+,\cdot$ (respectively) and special elements $0,1$ such that:
\begin{itemize}
\item addition forms a commutative group on the elements of $\F$,
\item multiplication forms a commutative group on the elements of $\F\setminus\set{0}$, and
\item there is a distribution-law of multiplication over addition.
\end{itemize}
\end{definition}
The important thing for us is that finite fields exists.

\begin{theorem}
For any prime $p$, and $m\in Z^+$, there exists a field $\F_{p^m}$ of size
$p^m$.
\end{theorem}
Where $m=1$ the field of $p$ elements is $\F_p=\set{0,...,p-1}$
with addition and multiplication modulo $p$. For $m>1$, the field
$\F_{p^m}$ is defined by an irreducible
polynomial\footnote{Irreducible polynomial over $\F_p$ is one that
can't be factored over $\F_p$} $f(x)$ of degree $m$ with
coefficients from $\F_p$, and the operations are taken modulo $p$
as well as modulo $f(x)$. The set of all polynomials over $\F_p$
modulo $p$ and $f(x)$ has precisely $p^m$ elements and it fulfills
the requirements of a field. For example, $$\F_{18} =
\set{\mbox{polynomials of deg}\le 17\mbox{ with $0/1$
coefficients, $+$ and $\cdot$ are mod $2$ and mod }x^{18}+x^9+1}$$

\begin{definition}[Vector Space]
A {\em Vector Space} $V$ over a field $\F$ is a quadruple
$(\F,V,+,\cdot)$, where $V=\F^n$, $+$ is vector addition (\ie,
$(v_1,...,v_n)+(u_1,...,u_n) = (v_1+u_1,...,v_n+u_n)$), and
$\cdot$ is a scalar product (\ie, $\a(v_1,...,v_n) = (\a
v_1,...,\a v_n) $).
\end{definition}


%\subsubsection{Linear Sub-Spaces and Linear Codes}
\subsubsection*{Linear Codes}

We'd like the codes we construct to have nice properties such as succinct
representation, efficient encoding, and efficient decoding. Linear codes (as defined below) is a family of codes that has two of these properties: succinct representation, efficient encoding.

Let us first define what is a {\em linear subspace}.
\begin{definition}[Linear Subspace]
Let $V$ be a vector space over a field $\F$. A subspace
$L\subseteq V$ is a {\em linear subspace} if every $v_1, v_2\in L$
and $\a\in\F$ satisfy: $v_1+v_2\in L$ and $\a v1\in L$.
\end{definition}

Now we define linear codes.
\begin{definition}[Linear Codes]
For $\Sigma$ a field $\F$, $C\subseteq \Sigma^n$ is a {\em linear
code} iff $C$ is a linear subspace of $\F^n$.
\end{definition}

Linear codes have the two desirable properties of succinct
representation and efficient encoding.

\noindent{\bf Succinct representation}: a linear codes is defined
by a {\em basis} (\ie, a set $b_1,...,b_k\in C$ s.t. $\forall
\a_1,\a_k\in\F$, $\sum\a_ib_i = 0$ implies $\a_1,..,\a_k =0$).

\noindent{\bf Efficient encoding}: the basis representing the
linear code also defines the generator matrix $G$ by having $b_i$
as the $i$th row of $G$. This yields an efficient encoding of each
$x$ by $Gx$.

An alternate specification of Linear subspace is by its {\em null
space}: $$C^{\perp}\sett{y}{\tup{y,v}=0 \forall v\in C}$$(where
the inner product $\tup{x,y}=\sum x_iy_i$ for any $x,y\in\F$). For
linear codes $C$, the null space $C^{\perp}$ is also linear, and
$dim(C^\perp)+dim(C)=n$.

Empirically, there seem to be no harm in restricting ourselves to
linear code. Therefore the study of linear codes will be a big
emphasis of this course. In particular we'll consider codes based
on polynomials over finite fields.

\subsubsection*{Polynomials over finite fields}
$\F[X]$ denotes the ring of polynomials over $\F$. The elements of $\F[X]$ are vectors $(c_0,...,c_k)$ which are associated with the formal polynomial $\sum_{i=0}^k c_i x^i$ with addition and multiplication defined the standard way.

The following theorem proves to be incredibly useful. The proof is not hard.
\begin{theorem} [Evaluation of polynomials]
Let $C(x)=\sum_{i=0}^k c_i x^i$, and let $C(\a_1),....,C(\a_q)$ be
evaluate of $C(x)$ over fields' elements $\a_1,...,\a_q$. If $q>k$,
then these evaluations specify the polynomial $C(x)$ uniquely.
\end{theorem}

See more details in the algebra notes on web.

\end{document}

\remove{In constructing codes, we often make $\Sigma$ a field, and then the code $\Sigma^n$ is a vector space. We'd like it the code to be a vector space with "nice" properties, such as being a linear subspace: }
