\documentclass[10pt]{article}
\newtheorem{define}{Definition}
\usepackage{amssymb} 
%\newcommand{\Z}{{\mathbb{Z}}}
%\usepackage{psfig}


\newcommand{\error}{{\mbox{error}}}
\newcommand{\bit}{{\{0 , 1\}}}
\newcommand{\pmone}{{\{-1 , 1\}}}
\newcommand{\bigand}{\displaystyle\bigwedge}
\newcommand{\bigsum}{\displaystyle\sum}
\newcommand{\cala}{{\cal A}}

\oddsidemargin=0.15in
\evensidemargin=0.15in
\topmargin=-.5in
\textheight=9in
\textwidth=6.25in

\begin{document}
\input{preamble.tex}

%{lecture number}{lecture date}{Ronitt Rubinfeld}{Your name}
\lecture{5}{February 20, 2008}{Ronitt Rubinfeld}{Sam McVeety}

%%%% body goes in here %%%%

%% 6.842 Notes 2-20-2008

\section{Uniform Distribution Learning}

%% Figure 1

\begin{definition} Given a hypothesis $h$, the \emph{error of $h$ with $f$} is $\error(h) = \displaystyle\Pr_{x \in_U D} [f(x) \neq h(x)]$.
\end{definition}

\begin{definition}A \emph{uniform distribution learning algorithm} for concept class $\calc$ is an algorithm $\cala$ such that 
\begin{enumerate}
\item $\cala$ is given $\epsilon, \delta$, and has access to a set of samples $(x,f(x))$,
where each $x$ is chosen uniformly from the domain.
\item $\cala$ outputs $h$ such that with probability at least $1 - \delta$, the error of $h$ with $f$ is at most $\epsilon$.
\end{enumerate}
\end{definition}

We don't know $f$, but we do know it is in concept class $\calc$, and we are given access to an example oracle.  The parameter $\delta$ is often referred to as the ``security'' or ``confidence'' parameter.  The parameter $\epsilon$ is the approximation error or ``accuracy parameter''.

\subsection{Parameters of Interest}
\begin{itemize}
\item $m$: ``Sample complexity''.  The number of randomly chosen examples that we get from the oracle.
\item $\epsilon$: Accuracy parameter.  Discussed above.
\item $\delta$: Security parameter.  Discussed above.
\item Running time?  Might be different than the sample complexity or query complexity.  Note that, trivially,  $m \leq$ running time.  We would like to get something polynomial in $\frac{1}{\epsilon}$, $\frac{1}{\delta}$ (actually, $\log\frac{1}{\delta}$), $\log \abs{\mathcal{C}}$, and $\log \abs{D}$.  Note that the requirement on the domain space results from the fact that we must write out any queries that we perform, so they should be simple to describe.
\item Description of $h$ should be relatively compact.  Some models restrict this further.
\end{itemize}

Note that $h$ is not necessarily in $\mathcal{C}$.  Some models (``proper learning algorithms'') do require $h \in \mathcal{C}$, but we will not restrict our attention to them.

\subsection{Remarks}
\begin{itemize}
\item Special case of the PAC (probably approximately correct) model.  More generally, the example oracle Ex($f$) draws each $x$ from an unknown distribution $\mathcal{D}$, and the error is measured with respect to $\mathcal{D}$:
$$ \error(h) = \Pr_{x \in \mathcal{D}} [f(x) \neq h(x)]$$
\item Can get the dependence on $\delta $ to be $\log \frac{1}{\delta}$.  
\item If the running time doesn't matter, only sample complexity, then it is easy to write learning algorithms.  As we will see, though, the algorithm that makes this possible is infeasible in practice.
\end{itemize}

\section{Brute Force Algorithm}
\begin{itemize}
\item Draw $M = \frac{1}{\epsilon} ( \ln \abs{\mathcal{C}} + \ln \frac{1}{\delta}) $ samples.  $\mathcal{C}$ is generally regarded as a class of exponential size.
\item Search over all $h \in \mathcal{C}$ until one $h$ labels all examples correctly and output it.  Given multiple choices, choose arbitrarily.  Notice that the sample complexity is fine, because we reuse our samples for each iteration of the search.
\end{itemize}
Motivation:  we say that $h$ is \emph{bad} if the error of $h$ with respect to $f$ is at least $\epsilon$.  Whether $h = f$ on all inputs or just on most doesn't really matter.  

\begin{eqnarray*}
\Pr[\mbox{a given bad } h \mbox{ is consistent with examples}] & \leq &(1 - \epsilon)^M \\
 \Pr[\mbox{any bad } h \mbox{ ``survives''}] & \leq &\abs{\calc} (1 - \epsilon)^M  \\
 & = &\abs{\calc} (1 - \epsilon)^{\frac{1}{\epsilon} ( \ln \abs{\mathcal{C}} + \ln \frac{1}{\delta})} \\
 & \leq &\abs{\calc} e^{-( \ln \abs{\mathcal{C}} + \ln \frac{1}{\delta})} = \delta 
\end{eqnarray*}

Therefore, this algorithm is unlikely to output any bad $h$.
\begin{remark}Ignoring the sample size bounds leads to bad statistics, where the sample size is insufficient for an inordinately large class space to rule out all of the bad $h$ possibilities.  The key is to define in advance the concept class, rather than expanding the concept class to fit ``interesting'' results.
\end{remark}

\section{Benefits of Learning}
\begin{itemize}
\item Learning allows for prediction of $f$ on other values of $x$.
\item Learning can also allow for (lossy) compression.
% $$ (x_1, f(x_1)), (x_2, f(x_2)) \ldots (x_m, f(x_m)) \Rightarrow x_1, x_2, \ldots x_m \mbox{  gives a description of } h \mbox { in } \log\abs{\calc} \mbox{ bits} $$
% 
% \begin{center}
% $ (x_1, f(x_1)), (x_2, f(x_2)) \ldots (x_m, f(x_m)) \Rightarrow x_1, x_2, \ldots x_m$ gives a description of $h$ in $\log\abs{\calc}$ bits
% \end{center}

\end{itemize}

\section{Example: Monomial Functions}
\begin{itemize}
\item $\calc  = $ conjunctions over $\bit^n$.  $\abs{\calc} = 2^n$
\item Example: $f = x_i x_j x_k$
\item Cannot learn efficiently with 0 error.  Think about a very long conjunction, which is 0 unless none of the variables are 0.  In a polynomial number of queries, there is no way to detect this.
\item Brute Force needs $\frac{1}{\epsilon} \left(\ln 2^n + \ln \frac{1}{\delta}\right) = O\left(\frac{n}{\epsilon} + \frac{1}{\epsilon} \ln \frac{1}{\delta}\right)$ samples. %% ln or log?
\end{itemize}

%% check notes afterward

\subsection{Poly-time Algorithm}

We say that a sample (or an input) $(x,f(x))$ is \emph{positive} if $f(x) = 1$. If $f(x)=0$, we say that the sample is \emph{negative}. Let us describe the algorithm. We will set $k$ that appears in the algorithm later on.

\begin{enumerate}
\item Draw $O(\frac{1}{\epsilon^2}\log\frac{1}{\delta})$ samples to estimate the fraction of positive inputs within $\epsilon/4$ with probability at least $1-\frac{\delta}{2}$.

\item If less than an $\epsilon/2$ fraction of the samples are positive, output $h(x) = 0$.

% \item Start with something close to the 0 function
% \item Draw $poly(\frac{1}{\delta})$ many $x$s 
% \item Use Chernoff bounds to find an estimate that is within $\epsilon/4$ 
% \item If estimate is less than $\epsilon / 2$
% \item Otherwise, estimate is at least $\epsilon/2$ 
% \item $Pr[f(x) = 1] \geq \epsilon/4$
\item Take $\frac{k}{\epsilon}$ samples.

\item Let $V = \lbrace i : \mbox{$x_i=1$ in all positive examples} \rbrace$.

\item Output hypothesis $h(x) = \bigand_{i \in V} x_i$.
\end{enumerate}

%% Figure 2

Why does this work? First we check if the function is sufficiently far from the all-zero function. If it is not, we can output the all-zero function as our hypothesis (Step 2).  Otherwise, we know in Steps 3--5 that $\Pr_x[f(x)=1] \ge \epsilon/4$. If $x_i$ is in $f$, then it must be turned on in every positive sample.  If it isn't, then in each positive sample it is on with probability $1/2$ and off with probability $1/2$.
We say that an index $i$ is \emph{bad}, if $x_i$ does not appear in the monomial. Therefore, for $k = 8\ln \frac{2n}{\delta}$, we have
\begin{eqnarray*}
\Pr[\mbox{any bad $i$ survives}] & \le& \sum_i \Pr[\mbox{$i$ survives, given it is bad}] \\
& \le & \sum_i \Pr[\hbox{$x_i=1$ in all positive samples}\;|\;\mbox{$i$ is bad}]\\
& \le & \sum_i \left(1 - \Pr[f(x)=1 \land x_i=0 \;|\; \mbox{$i$ is bad}]\right)^{k/\epsilon}\\
& \le & \sum_i \left(1 - \Pr[f(x)=1 \;|\; \mbox{$i$ is bad}] \cdot \Pr[x_i=0 \;|\; \mbox{$i$ is bad $\land$  $f(x)=1$}]\right)^{k/\epsilon}\\
& \le & \sum_i \left(1 - \frac{\epsilon}{4} \cdot \frac{1}{2}\right)^{k/\epsilon} \le \sum_i e^{-\ln \frac{2n}{\delta}} \le \sum_i \frac{\delta}{2n} \le \frac{\delta}{2}.
\end{eqnarray*}
Note that if we detect all bad indices $i$, then the hypothesis $h$ output in Step 5 exactly equals $f$.

In total, we only require $O\left(\frac{1}{\epsilon}\log\frac{n}{\delta} + \frac{1}{\epsilon^2}\log\frac{1}{\delta}\right)$ samples, which means that our algorithm is efficient.

Notice that learning requires logarithmically many queries, even for singletons, while dictator testing takes a constant number of queries.  So, testing can be much faster than learning.

\section{Learning via Fourier Coefficients}
Approximating a single Fourier coefficient of an unknown function.
\begin{lemma} We can approximate any specific Fourier coefficient $\hat{f}(s)$ to within $\gamma$ (i.e., $\abs{\mbox{output} - \hat{f}(s)} < \gamma$) with probability $\geq 1 - \delta$ in $O\left(\frac{1}{\gamma^2} \log \frac{1}{\delta}\right)$ samples.
\end{lemma}
\begin{proof}
We have $\hat{f}(s) = 2 \times \Pr[f = \chi_s]  - 1$, and we can estimate $\Pr[f = \chi_s]$ to within $\pm \gamma/2$ using Chernoff bounds.
\end{proof}

It is thought to be unlikely in this model that one can efficiently find the ``heavy'' Fourier coefficients with non-trivial values.  Exhaustive search is intractable, and no other method is immediately clear.  If we are dealing with a function class where most of the weight is in the coefficients with small $\abs{s}$, we can exhaustively check these values.
We will see this approach at work in the next lecture.

% \subsection{Low-degree Algorithm}
% \begin{definition} $f : \pmone^n \rightarrow \mathbb{R}$ has \emph{$\alpha(\epsilon, n)$ Fourier concentration} if 
% $$ \bigsum_{S\subseteq[n] : |S| > n} \hat{f}^2(S) \leq \epsilon.$$
% For Boolean $f$, we have $\bigsum_{S\subseteq[n], |S|\le n} \hat{f}^2(S) > 1 - \epsilon.$
% \end{definition}
% 
% We now define the ``And'' function: $\mbox{And}(x_1, \ldots, x_k) = 1$ if for all $i$, $x_i = 1$; $0$ otherwise.
% Consider $f$ such that $f = 1$ if all $x_i$ are $-1$, 0 otherwise.
% 
% $$ f(x_1, \ldots, x_k) = \frac{(1 - x_1)}{2}\frac{(1 - x_2)}{2} \cdots \frac{(1 - x_k)}{2} $$
% If $x_i = -1$ for all $i$, then we get 1, otherwise we get a 0.  Note that this gives you the Fourier representation of $f$ on inputs 1 through $k$. 
% 
% The number of non-zero weight values of $s$ is approximately $\log 4/\epsilon$.  If we take the And of a small number of variables, then there are no non-zero weights for large values of $s$.  If $k$ is big, then all Fourier coefficients are small, except possibly the zero coefficient.
% 
% If $k \geq \log 4/\epsilon$ then for each of the $2^k - 1$ non-zero Fourier coefficients $\chi_s$ for $s \neq \phi$ we have $$\hat{f}^2(s)  = \Theta\left(\frac{1}{2^2k}\right)$$
% $$ \bigsum_{\abs{s} > 0, s \leq [k]} \hat{f}^2(s) \leq 2^k \Theta\left(\frac{1}{2^2k}\right) \leq \Theta\left(\frac{1}{2^k}\right) < \epsilon$$


\end{document}