\documentclass[10pt]{article}
\newtheorem{define}{Definition}
\usepackage{color}
\oddsidemargin=0.15in
\evensidemargin=0.15in
\topmargin=-.5in
\textheight=9in
\textwidth=6.25in
\begin{document}
\input{preamble.tex}
\newtheorem{note}[theorem]{Note}
\newcommand{\RETURN}{{\bf return}}
\newcommand{\Adv}{{\rm Adv}}
\lecture{11}{March 12, 2008}{Ronitt Rubinfeld}{Yoong Keok Lee}
%%%% body goes in here %%%%
Today, we will show how a weak PAC (Probably Approximately Correct)
learning algorithm can be boosted to a strong one. This result has
far-reaching implications beyond computational learning theory.
\section{Introduction}
\begin{define}
An algorithm $\mathsf{A}$ (``strongly'') PAC learns a concept class
$\mathcal{F}$ if~
$\forall f \in \mathcal{F},
\forall \mbox{distribution } \mathcal{D},
\forall \epsilon,\delta > 0$,
with probability $\ge 1-\delta$,
given examples $\in \mathcal{D}$ labelled according to $f$,
$\mathsf{A}$ outputs $h$ such that
\begin{equation}
\Pr_{\mathcal{D}}[h(x) \ne f(x)] \le \epsilon.
\end{equation}
\end{define}
\begin{remark}
\begin{itemize}
\item $\epsilon$ is called the accuracy parameter, and $\delta$ is
called the security parameter or the failure probability.
\item Parameter $\delta$ is inconsequential here: As long as it is
reasonably small, we can drive it down to an arbitrarily small
value. (Refer to Question 2 in Homework 2.) For this reason, we
shall be omitting this parameter from here onwards.
\item Hypothesis $h$ does not necessarily have to be in concept
class $\mathcal{F}$. If it does, then the model is called a proper
learning model.
\item Distribution $\mathcal{D}$ does not have to be uniform either.
It can be any distribution, and therefore, the algorithm is
distribution-free.
\end{itemize}
\end{remark}
\begin{define}
An algorithm $\mathsf{WL}$ {\color{blue}\bf weakly} PAC learns a concept class
$\mathcal{F}$ if~
$\forall f \in \mathcal{F},
\forall \mbox{distribution } \mathcal{D},
{\color{blue}\mathbf{\exists \gamma > 0}},\forall \delta > 0$,
with probability $\ge 1-\delta$,
given examples $\in \mathcal{D}$ labelled according to $f$,
$\mathsf{WL}$ outputs $c$ such that
\begin{equation}
\Pr_{\mathcal{D}}[c(x) \ne f(x)] \le {\color{blue}\mathbf{ \frac{1}{2} - \frac{\gamma}{2}}}.
\end{equation}
\end{define}
\begin{define}
The term $\frac{\gamma}{2}$ is called the \emph{advantage} of $\mathsf{WL}$.
\end{define}
\begin{remark}
Here, we assume that the concept class $\mathcal{F}$ is Boolean, and so
hypothesis $c$ can just be doing slightly better than one of the two constant functions. Also,
note that $\mathsf{WL}$ must be able to output such $c$ {\em for all
distributions}, not just, say, the uniform distribution.
\end{remark}
\begin{theorem}
If $\mathcal{F}$ can be weakly learned, then $\mathcal{F}$ can be strongly learned.
\end{theorem}
\section{A Boosting Algorithm}
In this section, we present an algorithm which boosts a weak
learner to a strong one, hence proving the above theorem. There are
several variants of the algorithm, but they revolve around the same idea.
\subsection{The Intuition}
Suppose a weak learner is only $51\%$ accurate. We can first learn
a weak hypothesis, filter away examples which are correctly
classified, and then call the weak learner on the remaining $49\%$ of
the data. To increase the collective coverage of the hypotheses, we
can repeat alternating between the filtering and the learning steps.
A natural question is: Given an unseen example, which hypothesis shall
we use? The basic idea of the boosting algorithm is to construct a
filtering mechanism so that the majority vote of the collective
hypotheses works out.
\subsection{The Algorithm}
Given a weak learner $\mathsf{WL}$, a distribution $\mathcal{D}$, a concept
$f$, parameters $\epsilon$ and $\gamma$, the boosting algorithm
$\mathsf{Boost}$ is the following: (We illustrate the case for the
uniform distribution. Note that the algorithm can be easily modified
to be distribution-free although we are not showing it here.)
\begin{tabbing}
$\mathsf{Boost}(\mathsf{WL},\mathcal{D},f,\epsilon,\gamma)$ \\
\qquad {\bf initialize} distribution $\mathcal{D}_0 = \mathcal{D} = \mathcal{U}$ \\
\qquad \quad Use weak learner $\mathsf{WL}$ to generate weak hypothesis $c_1$ such that $\Pr_{\mathcal{D}_0}[f(x)=c_1(x)] \ge \frac{1}{2} + \frac{\gamma}{2}$\\
\qquad \quad Set current hypothesis $h=c_1$ \\
\qquad \FOR~$i = 1 \mbox{ \TO~} T$ \\
\qquad \quad (1) Construct $\mathcal{D}_i$ with the filtering mechanism $\mathsf{Filter}(\mathcal{D},h=\mbox{maj}(c_1,\ldots,c_i),f,\epsilon,\gamma)$ \\
\qquad \quad (2) Run $\mathsf{WL}$ on $\mathcal{D}_i$ to get weak hypothesis $c_{i+1}$ such that $\Pr_{\mathcal{D}_i}[f(x)=c_{i+1}(x)] \ge \frac{1}{2} + \frac{\gamma}{2}$ \\
\qquad \quad (3) Update $h=\mbox{maj}(c_1,\ldots,c_{i+1})$ \\
\qquad \RETURN~$h=\mbox{maj}(c_1,\ldots,c_{T+1})$ such that $\Pr_{\mathcal{D}}[f(x)=h(x)] \ge 1-\epsilon$
\end{tabbing}
\begin{tabbing}
$\mathsf{Filter}(\mathcal{D},h,f,\epsilon,\gamma)$ \\
\qquad \DO~until we have the desired number of examples\\
\qquad \quad Draw an example $x$ from $\mathcal{D}$ \\
\qquad \quad \IF~$h=\mbox{maj}(c_1,\ldots,c_i)$ is wrong on $x$, \THEN~keep $x$\\
\qquad \quad \ELSE~\IF~\# of $c_i$'s right - \# of $c_i$'s wrong $> \frac{1}{\epsilon \gamma}$, \THEN~throw $x$ away \\
\qquad \quad \ELSE, say \# of $c_i$'s right - \# of $c_i$'s wrong = $\frac{\alpha} {\epsilon \gamma}$, \THEN~keep $x$ with probability $1-\alpha$\\
\qquad \RETURN~all retained examples $\mathcal{D}_{i+1}$
\end{tabbing}
The algorithm assumes the weak learner never fails. (Recall that we
can easily decrease the probability of failure.) Before giving the
bound $T$ on the maximum number of iterations needed, we first
introduce some notations.
\section{Preliminaries} \label{sec:notations}
Here are some notations and their properties:
\begin{enumerate}
\item \(
R_c(x) = \left\{ \begin{array}{c l}
+1 & \mbox{if $f(x) = c(x)$} \\
-1 & \mbox{o.w.}\\
\end{array} \right. \)
\quad gives $+1$ if (weak) hypothesis $c$ is right on example $x$
\item $N_i(x) = \sum_{1 \le j \le i} R_{c_j}(x)$ \quad is the number
of right $c$'s exceeding the wrong ones
\item \(
M_i(x) = \left\{ \begin{array}{c l}
1 & \mbox{if $N_i(x) \le 0$} \\
0 & \mbox{if $N_i(x) \ge \frac{1}{\epsilon \gamma}$} \\
1 - \epsilon \gamma N_i(x) & \mbox{o.w.}
\end{array} \right. \)
\quad \\is a ``measure'' which upper bounds the error of hypothesis $h=\mbox{maj}(c_1,\ldots,c_i)$ on example $x$.
\item $\mu(M) = \frac{1}{2^n}\sum_x M(x) \ge \mbox{error}(h) \ge
\epsilon$ \quad is the ``mean'' of $M$. It upper bounds the error
of $h$ and therefore also $\epsilon$. {(We actually estimate
$\mu(M)$ by sampling in each iteration and stop if $\mu(M) <
\epsilon$.)}
\item $|M| = \sum_x M(x) = 2^n\mu(M)$ \qquad is the total ``mass'' of all examples according to ``measure'' $M$.
\item $D_M(x) = \frac{M(x)}{|M|}$ \qquad is a distribution over $x$
given $M$. (Note that we obtain $\mathcal{D}_i$ with $c_i$, and so
$D_{M_i} = \mathcal{D}_i$.)
\item $\Adv_c(M) = \sum_x R_c(x)M(x)$ \qquad is the advantage of $c$ on $M$. (Random guessing gives $0$.)
\item $\Adv_c(M) \ge \gamma|M|$ iff $\Pr_{x \in D_M}[c(x)=f(x)] \ge \frac{1}{2} + \frac{\gamma}{2}$
\item If $\Pr_{x\in D_M}[c(x)=f(x)] \ge \frac{1}{2} + \frac{\gamma}{2}$ and $\mu(M) \ge \epsilon$, then $\Adv_c(M) \ge_{(8)} \gamma |M| = \gamma 2^n \mu(M) \ge_{(4)} \gamma 2^n \epsilon$ \label{item:adv}
\end{enumerate}
\section{Convergence Proof}
\begin{claim}\label{claim:Aix}$A_i(x) = \sum_{0 \le j \le i-1} R_{c_{j+1}}(x)M_j(x) <
\frac{1}{\epsilon \gamma} + 0.5\epsilon \gamma i$
\end{claim}
Before proving this claim, we first use it to bound the maximum number
of iterations required by the boosting algorithm. Hence, if a concept
can be weakly PAC learned, then it can be (``strongly'') PAC learned.
\begin{claim}
The maximum number of iterations required by the boosting algorithm is $\le \frac{2}{\gamma^2 \epsilon^2}$.
\end{claim}
\begin{proof}
We prove the claim by showing that assuming the algorithm does not
stop after $\frac{2}{\gamma^2 \epsilon^2}$ iterations leads to a
contradiction. Suppose the algorithm continues to run after
iteration $i_0 > \frac{2}{(\epsilon \gamma)^2}$ (i.e.\ $\mu(M_{i_0}) \ge
\epsilon$); then a lower bound can be derived as follows:
\begin{eqnarray}
\sum_x A_{i_0+1}(x) & = & \sum_x \sum_{0 \le j \le i_0} R_{c_{j+1}}(x)M_j(x) \\
& = & \sum_{0 \le j \le i_0} \underbrace{\sum_x R_{c_{j+1}}(x)M_j(x)}_{\Adv_{c_{j+1}}(M_j)} \\
& \ge & (i_0+1) \gamma 2^n \epsilon \mbox{\qquad (using property~\ref{item:adv} in section~\ref{sec:notations})}
\end{eqnarray}
Using Claim~\ref{claim:Aix} leads to an upper bound:
\begin{eqnarray}
\sum_x A_{i_0+1}(x) & < & \sum_x (\frac{1}{\epsilon \gamma} + 0.5\epsilon \gamma i_0)\\
& = & 2^n(\frac{1}{\epsilon \gamma} + 0.5\epsilon \gamma i_0)
\end{eqnarray}
Using both bounds, $(i_0+1) \gamma 2^n \epsilon \le \sum_x
A_{i_0+1}(x) < 2^n(\frac{1}{\epsilon \gamma} + 0.5\epsilon \gamma
i_0) \Rightarrow i_0 < \frac{2}{\gamma^2 \epsilon^2}$, we arrive at a
contradiction. So, the algorithm must run for $\frac{2}{\gamma^2
\epsilon^2}$ iterations or less.
\end{proof}
\begin{fact}[The Elevator Argument] If one rides an elevator from the
ground floor, then one ascends from the $k$-th to the $(k+1)$-th
floor at most $1$ more time than one descends from the $(k+1)$-th to
the $k$-th floor. (Analogous argument holds when traveling from the
ground floor to basements.)
\end{fact}
\begin{proofof}{Claim~\ref{claim:Aix}}
The process of adding each term of $N_i(x)$ corresponds to an
elevator ride with $R_{c_j}(x)$ dictating the direction and partial
sum $N_j(x)$ denoting the current level. The plan is to first match
pairs of $R_{c_{j+1}}(x)M_j(x)$ terms and obtain an upper bound of
their sum using properties of function $M_j(x)$. As for the
unmatched pairs, we can bound the number of them (using the Elevator
Argument) and also their sums. And so, an upper bound for $A_i(x)$ can
be obtained.
\paragraph{Matched Pairs}
\begin{tabbing}
For each $k \ge 0$,\\
\quad match $j$ such that $N_j(x) = k$ and $N_{j+1}(x) = k+1$\\
\quad with $j'$ such that $N_{j'}(x) = k+1$ and $N_{j'+1}(x) = k$
\end{tabbing}
For each matched pair of terms corresponding to indices $a=j,b=j'$,
the sum is\\ $\underbrace{R_{c_{a+1}}(x)}_{+1}
\underbrace{M_a(x)}_{N_a(x)=k}+ \underbrace{R_{c_{b+1}}(x)}_{-1}
\underbrace{M_b(x)}_{N_b(x)=k+1} = M_a(x)~-~M_b(x)$.
\begin{tabbing}
If \= $0 \le k \le \frac{1}{\epsilon \gamma}$ or $0 \le k+1 \le \frac{1}{\epsilon \gamma}$,~then\\
\quad $M_a(x) - M_b(x) \le \epsilon \gamma$ (because $\frac{M_b(x) - M_a(x)}{k+1-k}$ is the slope of $M_i(x)$ which is $\ge -\epsilon \gamma$),\\
else\\
\quad $M_a(x) - M_b(x) = 0$.
\end{tabbing}
We can arrive at the same result for $k < 0$. Therefore, the total
contribution of matched pairs is $\le 0.5\epsilon \gamma i$ (because
$A_i(x)$ has $i$ terms).
\paragraph{Unmatched Terms}
Notice that unmatched terms are in the ``same direction'', i.e.\ all
$R_{c_j}(x)$'s are either negative or positive. Suppose all
$R_{c_j}(x)$'s are negative (i.e.\ $-1$), then their contribution to
the sum is negative (because each term becomes $-M_j(x) \le 0$). So
they do not loosen the upper bound we already derived from matched
pairs.
Suppose all $R_{c_j}(x)$'s are positive (i.e.\ $+1$). Then $N_j(x)
\ge 0$, and so each term is $M_j(x) = 1-\epsilon \gamma N_j(x)$ if
$N_j(x) \in [0, \frac{1}{\epsilon \gamma}]$ and $0$ otherwise. The
Elevator Argument tells us that there is at most one unmatched $N_j(x)$
for each integer value in the interval $[0, \frac{1}{\epsilon
\gamma}]$, and so the total contribution of them (the sum of an
arithmetic series from $0$ to $1$ with $\frac{1}{\epsilon \gamma}$
terms) is $\le \frac{1}{2\epsilon \gamma} < \frac{1}{\epsilon
\gamma}$.
Summing up the total contribution from both matched and unmatched
terms gives $A_i(x) < \frac{1}{\epsilon \gamma} + 0.5\epsilon \gamma i$.
\end{proofof}
\end{document}