\documentclass[10pt]{article}
\newtheorem{define}{Definition}
\usepackage{color}

\oddsidemargin=0.15in
\evensidemargin=0.15in
\topmargin=-.5in
\textheight=9in
\textwidth=6.25in

\begin{document}
\input{preamble.tex}
\newtheorem{note}[theorem]{Note}
\newcommand{\RETURN}{{\bf return}}
\newcommand{\Adv}{{\rm Adv}}

\lecture{11}{March 12, 2008}{Ronitt Rubinfeld}{Yoong Keok Lee}

%%%% body goes in here %%%%

Today, we will show how a weak PAC (Probably Approximately Correct)
learning algorithm can be boosted to a strong one.  This result has
far-reaching implications beyond computational learning theory.

\section{Introduction}
\begin{define}
  An algorithm $\mathsf{A}$ (``strongly'') PAC learns a concept class
  $\mathcal{F}$ if~
  $\forall f \in \mathcal{F}, 
  \forall \mbox{distribution } \mathcal{D},
  \forall \epsilon,\delta > 0$,
  with probability $\ge 1-\delta$, 
  given examples $\in \mathcal{D}$ labelled according to $f$, 
  $\mathsf{A}$ outputs $h$ such that
  \begin{equation}
    \Pr_{\mathcal{D}}[h(x) \ne f(x)] \le \epsilon.
  \end{equation}
\end{define}
\begin{remark}
  \begin{itemize}
  \item $\epsilon$ is called the accuracy parameter, and $\delta$ is
    called the security parameter or the failure probability.
  \item Parameter $\delta$ is inconsequential here: As long as it is
    reasonably small, we can drive it down to an arbitrarily small
    value.  (Refer to Question 2 in Homework 2.)  For this reason, we
    shall be omitting this parameter from here onwards.
  \item Hypothesis $h$ does not necessarily have to be in concept
    class $\mathcal{F}$. If it does, then the model is called a proper
    learning model.
  \item Distribution $\mathcal{D}$ does not have to be uniform either.
    It can be any distribution, and therefore, the algorithm is
    distribution-free.
  \end{itemize}
\end{remark}

\begin{define}
  An algorithm $\mathsf{WL}$ {\color{blue}\bf weakly} PAC learns a concept class
  $\mathcal{F}$ if~
  $\forall f \in \mathcal{F}, 
  \forall \mbox{distribution } \mathcal{D},
  {\color{blue}\mathbf{\exists \gamma > 0}},\forall \delta > 0$,
  with probability $\ge 1-\delta$, 
  given examples $\in \mathcal{D}$ labelled according to $f$, 
  $\mathsf{WL}$ outputs $c$ such that
  \begin{equation}
    \Pr_{\mathcal{D}}[c(x) \ne f(x)] \le {\color{blue}\mathbf{ \frac{1}{2} - \frac{\gamma}{2}}}.
  \end{equation}
\end{define}
\begin{define}
  The term $\frac{\gamma}{2}$ is called the \emph{advantage} of $\mathsf{WL}$.
\end{define}
\begin{remark}
  Here, we assume that the concept class $\mathcal{F}$ is Boolean, and so
  hypothesis $c$ may be doing just slightly better than one of the two constant functions. Also,
  note that $\mathsf{WL}$ must be able to output such $c$ {\em for all
    distributions}, not just, say, the uniform distribution.
\end{remark}

\begin{theorem}
  If $\mathcal{F}$ can be weakly learned, then $\mathcal{F}$ can be strongly learned.
\end{theorem}

\section{A Boosting Algorithm}
In this section, we present an algorithm which boosts a weak
learner to a strong one, hence proving the above theorem.  There are
several variants of the algorithm, but they revolve around the same idea.

\subsection{The Intuition}
Suppose a weak learner is only $51\%$ accurate.  We can first learn
a weak hypothesis, filter away examples which are correctly
classified, and then call the weak learner on the remaining $49\%$ of
the data.  To increase the collective coverage of the hypotheses, we
can repeat alternating between the filtering and the learning steps.
A natural question is: Given an unseen example, which hypothesis shall
we use?  The basic idea of the boosting algorithm is to construct a
filtering mechanism so that the majority vote of the collective
hypotheses works out.

\subsection{The Algorithm}
Given a weak learner $\mathsf{WL}$, a distribution $\mathcal{D}$, a concept
$f$, parameters $\epsilon$ and $\gamma$, the boosting algorithm
$\mathsf{Boost}$ is the following: (We illustrate the case for the
uniform distribution. Note that the algorithm can be easily modified
to be distribution-free although we are not showing it here.)

\begin{tabbing}
  $\mathsf{Boost}(\mathsf{WL},\mathcal{D},f,\epsilon,\gamma)$ \\
  \qquad {\bf initialize} distribution $\mathcal{D}_0 = \mathcal{D} = \mathcal{U}$ \\
  \qquad \quad Use weak learner $\mathsf{WL}$ to generate weak hypothesis $c_1$ such that $\Pr_{\mathcal{D}_0}[f(x)=c_1(x)] \ge \frac{1}{2} + \frac{\gamma}{2}$\\
  \qquad \quad Set current hypothesis $h=c_1$ \\
  \qquad \FOR~$i = 1 \mbox{ \TO~} T$ \\
  \qquad \quad (1) Construct $\mathcal{D}_i$ with the filtering mechanism $\mathsf{Filter}(\mathcal{D},h=\mbox{maj}(c_1,\ldots,c_i),f,\epsilon,\gamma)$ \\
  \qquad \quad (2) Run $\mathsf{WL}$ on $\mathcal{D}_i$ to get weak hypothesis $c_{i+1}$ such that $\Pr_{\mathcal{D}_i}[f(x)=c_{i+1}(x)] \ge \frac{1}{2} + \frac{\gamma}{2}$ \\
  \qquad \quad (3) Update $h=\mbox{maj}(c_1,\ldots,c_{i+1})$ \\
  \qquad \RETURN~$h=\mbox{maj}(c_1,\ldots,c_{T+1})$ such that $\Pr_{\mathcal{D}}[f(x)=h(x)] \ge 1-\epsilon$
\end{tabbing}

\begin{tabbing}
  $\mathsf{Filter}(\mathcal{D},h,f,\epsilon,\gamma)$ \\
  \qquad \DO~until we have the desired number of examples\\
  \qquad \quad Draw an example $x$ from $\mathcal{D}$ \\
  \qquad \quad \IF~$h=\mbox{maj}(c_1,\ldots,c_i)$ is wrong on $x$, \THEN~keep $x$\\
  \qquad \quad \ELSE~\IF~\# of $c_i$'s right - \# of $c_i$'s wrong $> \frac{1}{\epsilon \gamma}$, \THEN~throw $x$ away \\
  \qquad \quad \ELSE, say \# of $c_i$'s right - \# of $c_i$'s wrong = $\frac{\alpha} {\epsilon \gamma}$, \THEN~keep $x$ with probability $1-\alpha$\\
  \qquad \RETURN~all retained examples $\mathcal{D}_{i}$
\end{tabbing}

The algorithm assumes the weak learner never fails.  (Recall that we
can easily decrease the probability of failure.)  Before giving the
bound $T$ on the maximum number of iterations needed, we first
introduce some notations.

\section{Preliminaries} \label{sec:notations}
Here are some notations and their properties:
\begin{enumerate}
  \item \(
      R_c(x) = \left\{ \begin{array}{c l}
          +1 & \mbox{if $f(x) = c(x)$} \\
          -1 & \mbox{o.w.}\\
      \end{array} \right. \)
    \quad gives $+1$ if (weak) hypothesis $c$ is right on example $x$
  \item $N_i(x) = \sum_{1 \le j \le i} R_{c_j}(x)$ \quad is the number
    of $c_j$'s ($1 \le j \le i$) that are right on $x$ minus the number that are wrong
  \item \(
    M_i(x) = \left\{ \begin{array}{c l}
        1 & \mbox{if $N_i(x) \le 0$} \\
        0 & \mbox{if $N_i(x) \ge \frac{1}{\epsilon \gamma}$} \\
        1 - \epsilon \gamma N_i(x) & \mbox{o.w.}
      \end{array} \right. \)
    \quad \\is a ``measure'' which upper bounds the error of hypothesis $h=\mbox{maj}(c_1,\ldots,c_i)$ on example $x$.
  \item $\mu(M) = \frac{1}{2^n}\sum_x M(x) \ge \mbox{error}(h) \ge
    \epsilon$ \quad is the ``mean'' of $M$.  It upper bounds the error
    of $h$ and therefore also $\epsilon$.  {(We actually estimate
      $\mu(M)$ by sampling in each iteration and stop if $\mu(M) <
      \epsilon$.)}
  \item $|M| = \sum_x M(x) = 2^n\mu(M)$ \qquad is the total ``mass'' of all examples according to ``measure'' $M$.
  \item $D_M(x) = \frac{M(x)}{|M|}$ \qquad is a distribution over $x$
    given $M$. (Note that we obtain $\mathcal{D}_i$ with $c_i$, and so
    $D_{M_i} = \mathcal{D}_i$.)
  \item $\Adv_c(M) = \sum_x R_c(x)M(x)$ \qquad is the advantage of $c$ on $M$. (Random guessing gives $0$.)
  \item $\Adv_c(M) \ge \gamma|M|$ iff $\Pr_{x \in D_M}[c(x)=f(x)] \ge \frac{1}{2} + \frac{\gamma}{2}$
  \item If $\Pr_{x\in D_M}[c(x)=f(x)] \ge \frac{1}{2} + \frac{\gamma}{2}$ and $\mu(M) \ge \epsilon$, then $\Adv_c(M) \ge_{(8)} \gamma |M| = \gamma 2^n \mu(M) \ge_{(4)} \gamma 2^n \epsilon$ \label{item:adv}
\end{enumerate}

\section{Convergence Proof}

\begin{claim}\label{claim:Aix}$A_i(x) = \sum_{0 \le j \le i-1} R_{c_{j+1}}(x)M_j(x) <
  \frac{1}{\epsilon \gamma} + 0.5\epsilon \gamma i$
\end{claim}
Before proving this claim, we first use it to bound the maximum number
of iterations required by the boosting algorithm. Hence, if a concept
can be weakly PAC learned, then it can be (``strongly'') PAC learned.
\begin{claim}
  The maximum number of iterations required by the boosting algorithm is $\le \frac{2}{\gamma^2 \epsilon^2}$.
\end{claim}
\begin{proof}
  We prove the claim by showing that assuming the algorithm does not
  stop after $\frac{2}{\gamma^2 \epsilon^2}$ iterations leads to a
  contradiction.  Suppose the algorithm continues to run after
  iteration $i_0 > \frac{2}{(\epsilon \gamma)^2}$ (i.e.\ $\mu(M_j) \ge
  \epsilon$ for all $j \le i_0$).  Then a lower bound on $\sum_x
  A_{i_0+1}(x)$ can be derived as follows:
  \begin{eqnarray}
    \sum_x A_{i_0+1}(x) & = & \sum_x \sum_{0 \le j \le i_0} R_{c_{j+1}}(x)M_j(x) \\
    & = & \sum_{0 \le j \le i_0} \underbrace{\sum_x R_{c_{j+1}}(x)M_j(x)}_{\Adv_{c_{j+1}}(M_j)} \\
    & \ge & (i_0+1) \gamma 2^n \epsilon \mbox{\qquad (using property~\ref{item:adv} in section~\ref{sec:notations})}
  \end{eqnarray}
  
  Using Claim~\ref{claim:Aix} leads to an upper bound:
  \begin{eqnarray}
    \sum_x A_{i_0+1}(x) & < & \sum_x (\frac{1}{\epsilon \gamma} + 0.5\epsilon \gamma (i_0+1))\\
    & = & 2^n(\frac{1}{\epsilon \gamma} + 0.5\epsilon \gamma (i_0+1))
  \end{eqnarray}
  
  Using both bounds, $(i_0+1) \gamma 2^n \epsilon \le \sum_x
  A_{i_0+1}(x) < 2^n(\frac{1}{\epsilon \gamma} + 0.5\epsilon \gamma
  (i_0+1)) \Rightarrow i_0 + 1 < \frac{2}{\gamma^2 \epsilon^2}$, which
  contradicts $i_0 > \frac{2}{\gamma^2 \epsilon^2}$.  So, the algorithm
  must run for $\frac{2}{\gamma^2
    \epsilon^2}$ iterations or less.
\end{proof}

\begin{fact}[The Elevator Argument] If one rides an elevator from the
  ground floor, then one ascends from the $k$-th to the $(k+1)$-th
  floor at most $1$ more time than one descends from the $(k+1)$-th to
  the $k$-th floor.  (Analogous argument holds when traveling from the
  ground floor to basements.)
\end{fact}

\begin{proofof}{Claim~\ref{claim:Aix}}
  The process of adding each term of $N_i(x)$ corresponds to an
  elevator ride with $R_{c_j}(x)$ dictating the direction and partial
  sum $N_j(x)$ denoting the current level.  The plan is to first match
  pairs of $R_{c_{j+1}}(x)M_j(x)$ terms and obtain an upper bound of
  their sum using properties of function $M_j(x)$.  As for the
  unmatched terms, we can bound the number of them (using the Elevator
  Argument) and also their sums. And so, an upper bound for $A_i(x)$ can
  be obtained.
  \paragraph{Matched Pairs}
  \begin{tabbing}
  For each $k \ge 0$,\\
  \quad match $j$ such that $N_j(x) = k$ and $N_{j+1}(x) = k+1$\\
  \quad with $j'$ such that $N_{j'}(x) = k+1$ and $N_{j'+1}(x) = k$
  \end{tabbing}
  For each matched pair of terms corresponding to indices $a=j,b=j'$,
  the sum is\\ $\underbrace{R_{c_{a+1}}(x)}_{+1}
  \underbrace{M_a(x)}_{N_a(x)=k}+ \underbrace{R_{c_{b+1}}(x)}_{-1}
  \underbrace{M_b(x)}_{N_b(x)=k+1} = M_a(x)~-~M_b(x)$.
  \begin{tabbing}
    If \= $0 \le k \le \frac{1}{\epsilon \gamma}$ or $0 \le k+1 \le \frac{1}{\epsilon \gamma}$,~then\\
    \quad $M_a(x) - M_b(x) \le \epsilon \gamma$  (because $\frac{M_b(x) - M_a(x)}{k+1-k}$ is the slope of $M_i(x)$ which is $\ge -\epsilon \gamma$),\\
    else\\
    \quad $M_a(x) - M_b(x) = 0$.
  \end{tabbing}
  We can arrive at the same result for $k < 0$.  Since $A_i(x)$ has $i$
  terms, there are at most $i/2$ matched pairs, each contributing at
  most $\epsilon \gamma$.  Therefore, the total contribution of matched
  pairs is $\le 0.5\epsilon \gamma i$.
  \paragraph{Unmatched Terms}
  Notice that unmatched terms are in the ``same direction'', i.e.\ all
  $R_{c_j}(x)$'s are either negative or positive.  Suppose all
  $R_{c_j}(x)$'s are negative (i.e.\ $-1$), then their contribution to
  the sum is negative (because each term becomes $-M_j(x) \le 0$).  So
  they do not loosen the upper bound we already derived from matched
  pairs.

  Suppose all $R_{c_j}(x)$'s are positive (i.e.\ $+1$).  Then $N_j(x)
  \ge 0$, and so each term is $M_j(x) = 1-\epsilon \gamma N_j(x)$ if
  $N_j(x) \in [0, \frac{1}{\epsilon \gamma}]$ and $0$ otherwise.  The
  Elevator Argument tells us that there is at most one unmatched $N_j(x)$
  for each integer value in the interval $[0, \frac{1}{\epsilon
    \gamma}]$, and so their total contribution (the sum of an
  arithmetic series from $0$ to $1$ with $\frac{1}{\epsilon \gamma}$
  terms) is $\le \frac{1}{2\epsilon \gamma}$ $< \frac{1}{\epsilon
    \gamma}$.

  Summing up the total contribution from both matched and unmatched
  terms gives $A_i(x) < \frac{1}{\epsilon \gamma} + 0.5\epsilon \gamma i$.
\end{proofof}

\end{document}