\documentclass[10pt]{article}
\usepackage{amsmath,amsfonts}
\usepackage[on]{auto-pst-pdf}
\usepackage{pst-tree}

\oddsidemargin=0.15in
\evensidemargin=0.15in
\topmargin=-.5in
\textheight=9in
\textwidth=6.25in

\def\calA{\mathcal{A}}
\def\scrC{\mathbf{C}}
\def\scrD{\mathbf{D}}
\def\e{\varepsilon}
\newcommand{\paren}[1]{\left( #1 \right)}
\newcommand{\brkt}[1]{\left[ #1 \right]}
\newcommand{\Inf}{{\rm Inf}}

\begin{document}
    \input{preamble.tex}
    \lecture{10}{March 10, 2008}{Ronitt Rubinfeld}{Alex Cornejo}
    \section*{Last Lecture}

    \begin{definition}[L1 Norm]
        Let $f: \set{\pm 1}^n \to \R$, then $L_1(f) = \sum_S \abs{\hat{f}(S)}$.
    \end{definition}
    
    \begin{claim}
        Given $\e$, $S_\e = \set{S \subseteq [n] : \abs{\hat{f}(S)} \ge \frac{\e}{L_1(f)}}$, we have
        \begin{enumerate}
            \item $\abs{S_\e} \le \frac{(L_1(f))^2}{\e}$
            \item $\sum_{S \in S_\e} \hat{f}(S)^2 \ge 1-\e$
        \end{enumerate}
    \end{claim}
    
    \begin{theorem}
        \label{thm1}
        Boolean functions can be learned to $\e$-accuracy with
        ${\rm poly}(n,L_1(f),1/\e)$ queries under uniform distribution.
    \end{theorem}
    
    \begin{definition}[Monotone functions]
        We assume the following ordering of vectors in $\set{\pm 1}^n$: $x \le y$ if and only if for all coordinates $i$, $x_i \le y_i$. A function $f$ is \emph{monotone} if $\forall x \le y$, $f(x) \le f(y)$.
    \end{definition}
    
    \section{Learning Decision Trees}
    
    We show how to apply learning with $\e$-accuracy using ${\rm poly}(n,L_1(f),1/\e)$ queries to decision trees.
    
    \begin{minipage}{2in}
    \begin{center}
    \begin{postscript}
    $
        \tiny
        \pstree[treesep=.8cm,levelsep=1cm]{\Tcircle{x_1}}{\pstree{\Tcircle{x_2}\tlput{+1}}{\Tr{}\tlput{+1}\pstree{\Tcircle{x_3}\trput{-1}}{\Tcircle{x_4}\tlput{+1}\Tcircle[doubleline=true]{-1}\trput{-1}}}\Tcircle{x_3}\trput{-1}}
    $
    \end{postscript}
    \end{center}
    \end{minipage}
    \begin{minipage}{4.25in}
    \begin{theorem}
        \label{thm2}
        If $f$ has a size $t$ decision tree then $L_1(f) \le t$, where $t$ is the
        number of nodes in the tree.
    \end{theorem}
    
    Notice that Theorem \ref{thm2} together with Theorem \ref{thm1} imply that
    we can learn decision trees with ${\rm poly}(n,t,1/\e)$ queries.
    \end{minipage}
    
    \begin{proof}
        For each leaf $\ell$, we define the following function
        \[
            g_\ell (x) = \begin{cases} 1 & \mbox{if } x\mbox{ reaches } \ell, \\
            0 & \mbox{otherwise}.
            \end{cases}
        \]
W.l.o.g.\ the variables on the path to $\ell$ are $x_1,\ldots,x_k$ and
        they always take the ``-1'' direction. Then
        \[
            g_\ell (x) = \paren{\frac{1-x_1}{2}}\paren{\frac{1-x_2}{2}}\cdots\paren{\frac{1-x_k}{2}} = \sum_{S \subseteq [k]} \frac{(-1)^{|S|}}{2^k} \chi_S
        \]
        Notice that $L_1(g_\ell) = \sum_{S \subseteq [k]} \frac{1}{2^k} = 1$.
        
        We now proceed to define $f$ in terms of $g$.
        
        \begin{align*}
            f(x) =& \sum_{\parbox{1cm}{\centering paths~$\ell$}} g_\ell(x) \cdot \underbrace{\paren{\parbox{2.3cm}{\centering output of leaf at end of $\ell$}}}_{\pm 1} \\
            \hat{f}(S) =& \sum_{\parbox{1cm}{\centering paths~$\ell$}} \hat{g}_\ell(S) \cdot \underbrace{\paren{\parbox{2.3cm}{\centering output of leaf at end of $\ell$}}}_{\pm 1}
        \end{align*}
        
        Finally, we evaluate the $L_1$ norm of $f$.
        
        \begin{align*}
            L_1(f) &= \sum_S \abs{\hat{f}(S)} \\
            &= \sum_S \abs{\sum_{\parbox{1cm}{\centering paths~$\ell$}} \pm \hat{g}_\ell(S)} \\
            &\le \sum_{\parbox{1cm}{\centering paths~$\ell$}} \;\; \underbrace{\sum_S \abs{\hat{g}_\ell(S)}}_{L_1(g_\ell) = 1} \\
            &= \mbox{number of paths} \\
            &\le t
        \end{align*}
    \end{proof}
    
    \section{Learning monotone functions}
    
    
        \noindent{\bf Comment:} {\sl You can improve on the algorithm we are about to describe
        by restricting the set of our potential hypothesis $g$ to $\pm 1$ and majority functions (instead of
        dictators). Instead of $\Omega(\frac{1}{n})$ advantage, this would give $\Omega(\frac{1}{\sqrt{n}})$ advantage. It is also possible to remove the queries using the low degree algorithm and sampling on the order of $2^{\sqrt{n}}$.}
    
\bigskip
    Throughout the following we assume that we want to learn a function with respect to the uniform distribution.
    We also assume access to queries. Furthermore, we call each pair $(x_1,\ldots,x_{k-1},-1,x_{k+1},\ldots,x_n)$
    and $(x_1,\ldots,x_{k-1},+1,x_{k+1},\ldots,x_n)$ an \emph{edge} in the hypercube $\set{\pm 1}^n$.
    
    \begin{theorem}
        \label{thmfinal}For each monotone function $f : \set{\pm 1}^n \to \set{\pm 1}$,
         there exists a function $g \in \set{\pm 1, x_1, \ldots, x_n}$ such
        that $\Pr_x \brkt{f(x) = g(x)} \ge \half + \Omega(\frac{1}{n})$.
    \end{theorem}
    
	
    \begin{minipage}{2in}
    \vspace{1.2in}
    \begin{center}
    \begin{postscript}
        \rput{45}{%
        \psframe(0,0)(2,2)
        \psbezier{-}(0,1.5)(1,1)(1.5,1.5)(2,0.5)
        \psdots[dotstyle=|](0.2,1.4)(0.4,1.35)(0.6,1.3)(0.8,1.26)(1,1.22)(1.2,1.2)(1.4,1.15)(1.6,1.05)(1.8,0.8)
        \color{red}
        \rput[b]{*0}(2,2){$+1,\ldots,+1$}
        \rput[b]{*0}(1.5,1.5){$+1$}
        \color{blue}
        \rput[b]{*0}(.5,.5){$-1$}
        \rput[t]{*0}(0,0){$-1,\ldots,-1$}}
    \end{postscript}
    \vspace{.1in}
    \end{center}
    \end{minipage}
    \begin{minipage}{4.25in}
        This figure represents a Boolean hypercube. \\
        $2^n$ nodes. \\
        $2^{n-1}$ edges in direction $i$\\
        $n 2^{n-1}$ total edges. \\
        A \emph{cut edge} connects a red node to a blue node.
    \end{minipage}

    \begin{definition}[Influence of the $i^{\text{th}}$ variable]
        $$\Inf_i(f) = \underbrace{\hat{f}(\set{i})}_{\mbox{Homework 2}} = \underbrace{2\Pr\brkt{f(x) = x_i}-1}_{\mbox{a previous lecture}}$$
        \[
            \Inf_i(f) = \frac{\mbox{\# of cut edges in $i^{th}$ direction}}{2^{n-1}}
        \]
    \end{definition}
    
    \begin{definition}[Total influence]
        \begin{align*}
           \Inf(f) &= \sum_{i=1}^n \Inf_i (f) \\
           &= \frac{\mbox{\# of cut edges}}{2^{n-1}}
        \end{align*}
    \end{definition}
    
    \paragraph{Plan of attack.} To show that $\Inf_i(f)$ is $\Omega(1/n)$ for some $i$, we
    will first define the concept of a canonical path and use it to prove a
    lower bound.
    
    \begin{definition}[Canonical path]
        For all $(x,y)$ such that $x$ is red and $y$ is blue, a \emph{canonical
        path from $x$ to $y$} scans bits from left to right, flipping bits
        where needed. Each flip corresponds to a step in the path.
    \end{definition}
    
    \begin{center}
        \begin{tabular}{rrrrrr}
            $x=$ & -1 & +1 & +1 & +1 & +1 \\
            & -1 & {\bf -1} & +1 & +1 & +1 \\
            & -1 & -1 & {\bf -1} & +1 & +1 \\
            $y=$ & -1 & -1 & -1 & +1 & -1
        \end{tabular}
    \end{center}
    
    \begin{observation}
        It is clear that since the start of a canonical path is red and the
        end is blue, then there exists at least one edge $(u,v)$ in the path
        such that $u$ is red and $v$ is blue.
    \end{observation}
    
    We can assume that $\Pr \brkt{f(x) =1 } \in \brkt{\frac{1}{4},\frac{3}{4}}$
    since otherwise we could use one of the constant $\pm 1$ functions to approximate
    $f$.
    Under this assumption, how many red-blue $(x,y)$ pairs can we expect?
    
    \[
        \ge \paren{\frac{1}{4} 2^n}^2 = \frac{1}{16} 2^{2n}
    \]

    \begin{lemma}
        \label{lem3}
        For any given edge, there are $\le 2^n$ canonical paths which cross it.
    \end{lemma}
    
    \begin{proof}
        Consider an edge $(w,w^{\oplus i})$, a part of a canonical path from $x$
        to $y$. Notice that $w$  and $w^{\oplus i}$ have Hamming distance one and therefore only
        differ in one bit (the $i^{th}$ bit).
        
        We argue that due to the definition of canonical paths, there are
        a limited number of paths that can share an edge.

        \begin{center}
            \begin{postscript}
                $\begin{array}{rccccccc}
                    x & \rnode{x1}{ } & & & & & &\rnode{x2}{ } \\
                    & &  & & i & & & \\
                    w & \pnode(0,.1){w1} & & & \rnode{wm}{b} & & &\pnode(0,.1){w2} \\
                    w^{\oplus i} & \pnode(0,.1){z1} & & & \rnode{zm}{\neg{b}} & & &\pnode(0,.1){z2} \\
                    & y_1 & \ldots & y_{i-1}&  & x_{i+1}&\ldots & x_n \\
                    y & \rnode{y1}{ } & & & & & &\rnode{y2}{ } \\
                \end{array}$
                \ncbox[boxsize=.15]{x1}{x2}
                \ncbox[boxsize=.15]{y1}{y2}
                \ncbox[boxsize=.15]{z1}{z2}
                \ncbox[boxsize=.15]{w1}{w2}
                \ncbox[boxsize=.3,nodesep=2pt]{wm}{zm}
            \end{postscript}
        \end{center}        
        
        For any canonical path between $(x',y')$ that crosses the edge $(w,w^{\oplus i})$,
        the prefix $y'_1\ldots y'_{i-1}$ of $y'$ has to be the same as the prefix of $w$.
	This gives us at most $2^{n-i}$ choices for the last $n-i$ bits of $y'$.
        Analogously, the suffix $x'_{i+1}\ldots x'_n$ of $x'$ has to be the same as the suffix of $w$,
	and we have at most $2^{i-1}$ choices for the first bits of $x'$.
        Therefore there are $\le 2^n$ settings of $x'$ and $y'$ consistent with the edge.
    \end{proof}
    
    Since we know that each canonical path has at least one red-blue edge,
    using lemma \ref{lem3}, we can now give a lower bound on the number of red-blue
    edges.
    
    \[
        \mbox{\# of red-blue edges} \ge \frac{\frac{1}{16} 2^{2n}}{2^{n}} = \frac{1}{16} 2^n
    \]
    
    Therefore by the pigeon-hole principle, there is $i$ such that $\ge \frac{1}{16 n} 2^n$ red-blue edges exist in direction $i$.
    Finally, using the definition for the influence of a variable,
    
    \[
        \Inf_i(f) \ge \frac{\frac{1}{16 n} 2^n}{2^{n-1}} = \frac{1}{8n}.
    \]
    Since $\Pr\brkt{f(x) = x_i}=\half+\Inf_i(f)/2$, we have
    that
    \begin{align*}
        \Pr\brkt{f(x) = x_i} &\ge \half + \frac{1}{16 n} \\
        &= \half + \Omega\paren{\frac{1}{n}}.
    \end{align*}
    This completes our proof of Theorem \ref{thmfinal}.
    
    \section{Next Lecture: Boosting PAC Learners}
    
    \begin{definition}[PAC learning]
        An algorithm $\calA$ \emph{PAC learns} a concept class $\scrC$ if $\forall c
        \in \scrC$, $\forall$ distributions $\scrD$, $\forall \e, \delta > 0$,
        given examples of $c$ using $\scrD$, then $\calA$ outputs a hypothesis
        $h$ such that with probability $\ge 1-\delta$
        \[
            \Pr_\scrD \brkt{c(x) \neq h(x)} \le \e.
        \]
    \end{definition}

    \begin{definition}[Weak PAC learning]
        An algorithm $\mathcal{WL}$ \emph{weakly PAC learns} a concept class $\scrC$ with parameter $\tau$ if $\forall
        c \in \scrC$, $\forall$ distributions $\scrD$,
        $\forall \delta > 0$, given examples of $c$ according to distribution $\scrD$
        algorithm $\mathcal{WL}$ outputs a hypothesis $h$ such that with probability
        $\ge 1 -\delta$
        \[
            \Pr_\scrD \brkt{c(x) \neq h(x)} \le \half - \tau.
        \]
        
        The parameter $\tau$ is referred to as the \emph{advantage of the weak learner}.
    \end{definition}
    
    For some years it was thought that these two problems were separate, but
    Schapire proved that it is possible to boost weak PAC learners to ``strong'' PAC
    learners.
    
    \begin{theorem}[Schapire]
        If $\scrC$ can be weakly learned, then $\scrC$ can be ``strongly'' learned.
    \end{theorem}

    Notice that we cannot apply these definitions to boost the algorithm
    presented in the previous section, since the algorithm we developed relied
    on the assumption of a uniform distribution, and thus it is not a proper ``weak PAC
    learner''.
\end{document}
