\documentclass[10pt]{article}
\usepackage{epsfig,color}
\newtheorem{define}{Definition}
%\newcommand{\Z}{{\mathbb{Z}}}
%\usepackage{psfig}

\oddsidemargin=0.15in
\evensidemargin=0.15in
\topmargin=-.5in
\textheight=9in
\textwidth=6.25in

\newenvironment{proof-of-theorem}[1]{\noindent{\bf Proof of Theorem
    #1}\hspace*{1em}}{\qed\bigskip}

\begin{document}
\input{preamble.tex}

%{lecture number}{lecture date}{Ronitt Rubinfeld}{Your name}
\lecture{7}{March 1, 2006}{Ronitt Rubinfeld}{Jelani Nelson}

%%%% body goes in here %%%%

\section{Recap of Previous Lectures}
\begin{enumerate}
\item Goldreich-Levin Algorithm:

Given $\theta$ and oracle access to $f:\{\pm 1\}^n \rightarrow
  \{\pm 1\}$, output a list $L$ of subsets of $[n]$ such that $\forall
  S \subseteq [n]$, $S$ will be output if $|\hat{f}(S)| > \theta$, and
  will not be output if $|\hat{f}(S)| < \theta / 2$.  Both of these
  guarantees are with high probability, and the algorithm runs in time
  poly$(n, \frac {1}{\theta})$.

\item Sampling Theorem:

$O(k / \delta^2)$ samples allows us to produce an estimate
$\tilde{f}(S)$ of $\hat{f}(S)$ such that $Pr_S [|\tilde{f}(S) -
\hat{f}(S)|~>~\delta]~\le~e^{-k}$.
\end{enumerate}

\section{Decision Trees}

\begin{figure}[h]
\begin{center}
\input{decision.latex}
\caption{A decision tree for computing the majority function on three
  variables.}
\label{figure:decision}
\end{center}
\end{figure}

A decision tree is simply a way to represent a function $f: \{\pm 1\}^n
\rightarrow \{\pm 1\} $.  It is a
rooted binary tree, where each node is a leaf or has exactly two
children.  Leaves contain the values $\{\pm 1\}$, signifying that all
inputs whose prefixes match the edges taken from the root path are
mapped by the function to the value in the leaf.  Nodes containing
exactly two children contain a variable $x_i$ and have two edges
labeled $+1$ and $-1$, dictacting which way to branch based on the
value of $x_i$ in the input.

\begin{definition}
The $L_1$ norm of a function $L_1(f)$ is defined as $\sum_{S \subseteq
  [n]} |\hat{f}(S)|$.
\end{definition}

\begin{theorem}
If $f$ is computable by an $m$-node decision tree, then $L_1(f) \le m$.
\end{theorem}
\begin{proof}
For each leaf $\ell$ define a function $g_{\ell}: \{\pm 1\}^n
\rightarrow \{0, 1\}$.  We would like to define $g_{\ell}$ so that
$g_{\ell}(x)$ is 1 if following $x$ down the tree causes us to arrive
at $\ell$ and 0 otherwise.  Let $V_{\ell}$ be the set of variables
visited on the path from the tree's root to $\ell$.  We define

\[ g_{\ell}(x) = \prod_{i \in V_{\ell}} \frac{1 \pm x_i}{2} \]

The sign is a plus if $x_1$ should be $+1$ to arrive at $\ell$ and is
$-1$ otherwise.  Notice that $g_{\ell}(x)$ can also be written as

\[ \frac{1}{2^{|V_{\ell}|}} \sum_{S \subseteq V_{\ell}} (\pm 1)
\prod_{i \in S} x_i \]

The sign within the sum is negative iff the number of times $-1$ taken
is odd.  Also note that $\prod_{i \in S} x_i$ is just the definition
of $\chi_S(x)$, so the above sum is simply $\frac{1}{2^{|V_{\ell}|}}
\cdot 2^{|V_{\ell}|} = 1$.

Now, $f(X) = \sum_{\ell} g_{\ell}(x) \cdot val(\ell)$ (where
$val(\ell) \in \{\pm 1\}$).  So finally we have

\begin{eqnarray*}
L_1(f) &=& \sum_T |\hat{f}(T)| \\
&=& \sum_T \left|\sum_{\ell} \hat{g}_{\ell}(T) val(\ell)\right| \\
&\le& \sum_T \sum_{\ell} |\hat{g}_{\ell}(T)| \\
&=& \sum_{\ell} \sum_T |\hat{g}_{\ell}(T)| \\
&=& \# leaves
\end{eqnarray*}

Since the number of leaves is certainly at most the number of total
nodes, the claim follows.
\end{proof}

\section{Weak Learning of Monotone Functions}
First we will define the partial order $\preceq$ on $\{\pm 1\}^n$.
For any $x,y \in \{\pm 1\}^n$, we say that $x \preceq y$ iff $\forall
i \ x_i \le y_i$.

\begin{definition}
We say that $f: \{\pm 1\}^n \rightarrow \{\pm 1\}$ is {\em monotone} if $x
\preceq y \Rightarrow f(x) \le f(y)$.
\end{definition}

\begin{theorem}\label{thm:weak-learn}
$\forall f$ monotone, there is a function $g \in \{-1, 1, x_1, x_2,
\ldots, x_n\}$ such that $Pr_x[f(x) = g(x)] \ge \frac{1}{2} +
\Omega(\frac{1}{n})$.
\end{theorem}

Before we prove Theorem \ref{thm:weak-learn}, we will introduce the
notion of the influence of a variable.

In the following, let $u_i = (1, 1, \ldots, 1, -1, 1, \ldots, 1)$ be the vector that
has a $-1$ in the
$ith$ location and $1$'s elsewhere.  

\begin{definition}
The {\em influence of
the variable $x_i$} on the function $f: \{\pm 1\}^n \rightarrow \{\pm
1\}$ to be

\[ Inf_i(f) = Pr_x[f(x) \neq f(x \cdot u_i)] \]

where the multiplication $x \cdot u_i$ is component-wise.  We then
define the {\em influence of $f$} to be

\[ Inf(f) = \sum_{i=1}^n Inf_i(f) \]
\end{definition}

We will also refer to edges between points in $\{\pm 1\}^n$.  There
will be an edge between two points iff all coordinates except for one
match exactly.  We say that the edge crosses the $i$th direction if
the $i$th coordinate is the one that differs.  Also, we refer to the
set of points where $f$ is $+1$ as red points, points where it is
$-1$ as blue, and edges as red-blue if they are incident on one red
and one blue point.

Then 

\[Inf_i(f) = \frac{\hbox{total \# red-blue edges in }
  i\hbox{th direction}}{\hbox{total \# edges in } i\hbox{th
    direction}} = \frac{\hbox{total \# red-blue edges in }
  i\hbox{th direction}}{2^{n-1}} \]

 and 

\[ Inf(f) = \frac{\hbox{total \# red-blue edges}}{\hbox{total \#
    edges}} = \frac{\hbox{total \# of red-blue edges}}{n2^{n-1}} \]

\begin{lemma}
If $f$ is monotone, then $Inf_i(f) = \hat{f}(\{i\})$.
\end{lemma}

\begin{proof-of-theorem}{\ref{thm:weak-learn}}
Assume $Pr[f(x) = 1] \in [\frac{1}{4}, \frac{3}{4}]$, else one of
$\{\pm 1\}$ agrees with $f$ with probability at least $\frac{3}{4}$.
Now we will use a canonical path argument.  Suppose $x$ is red and $y$
is blue.  A canonical path from $x$ to $y$ is computed by scanning the
bits left to right, following an edge where $x_i \neq y_i$.  For
example, the following is a path from $(+1,+1,-1,-1)$ to
$(-1,-1,+1,+1)$:

\begin{center}
\begin{tabular}{cccc}
+1 & +1 & -1 & -1 \\
-1 & +1 & -1 & -1 \\
-1 & -1 & -1 & -1 \\
-1 & -1 & +1 & -1 \\
-1 & -1 & +1 & +1
\end{tabular}
\end{center}

Now, if $Pr[f(x) = 1] \in [\frac{1}{4},\frac{3}{4}]$, then the numbers
of red and blue nodes are each at least $\frac{1}{4}2^n$.  
So the
number of red-blue pairs is at least $\frac{1}{16}2^{2n}$.  Now for
any given edge, let us analyze the number of $x,y$ pairs whose
canonical paths traverse that edge.  We assume the edge crosses the
$i$th direction.  The length $(n-i)$ suffix of $x$ must match that of
the two vertices the edge is incident upon, and the length $i - 1$
prefix of $y$ must match the prefixes of those two vertices.
Furthermore, the $i$th element of $y$ is constrained to be the
opposite of that of $x$.  Thus we have $2^i$ choices for $x$ and
$2^{n-i}$ choices for $y$, for a total of at most $2^n$ $x,y$ pairs
whose canonical paths cross a given edge.  Each canonical path between
a red-blue pair must cross at least one red-blue edge. 
So, there are at least
$\frac{1}{16}2^{n}$ red-blue
edges, and thus there are at least $\frac{1}{16n}2^{n}$ red-blue edges
in direction $i$, for some $i$.  So now we have

\[ \hat{f}(\{i\}) = Inf_i(f) \ge \frac{\frac{1}{16n}2^n}{2^{n-1}} =
\frac{1}{8n} \]

Since $\hat{f}(S) = 2Pr_x[f(x) = \chi_S(x)]$, the above leads to
$Pr_x[f(x) = x_i] \ge \frac{1}{2} + \frac{1}{16n}$.
\end{proof-of-theorem}

In fact, one can do better using the Kruskal-Katona theorem to show
that one of the functions $\{-1, +1, majority\ function\}$ has
$\frac{1}{2} + \Omega(\frac{1}{\sqrt{n}})$ agreement with any monotone
function.  Imagine that the points of $\{\pm 1\}^n$ are placed at
vertices of a hypercube, rotated so that $(+1,+1,\ldots,+1)$ is at the
very top.  We then say that a point is at level $i$ if it contains
exactly $i$ $+1$'s.  Kruskal-Katona tells us about the rate at which
$p_k = Pr[f(x) = 1|x\hbox{ at level } k]$ changes as $k$ decreases.
%More quantitatively, Kruskal-Katona says that $p_i^i \le p_j^j$
%whenever $i \le j$.  
We will not go into the proof details here though.

\end{document}




