\documentclass[10pt]{article}
\usepackage{amsfonts}
\newtheorem{define}{Definition}
%\newcommand{\Z}{{\mathbb{Z}}}
%\usepackage{psfig}
\oddsidemargin=0.15in
\evensidemargin=0.15in
\topmargin=-.5in
\textheight=9in
\textwidth=6.25in
\begin{document}
\input{preamble.tex}
\renewcommand{\AND}{{\rm AND}}
\newcommand{\sign}{\mathop{\rm sign}}
%{lecture number}{lecture date}{Ronitt Rubinfeld}{Your name}
\lecture{6}{February 25, 2008}{Ronitt Rubinfeld}{Debmalya Panigrahi}
%%%% body goes in here %%%%
Today, we will describe and analyze the {\em Low Degree Algorithm}.
\section{Preliminaries}
We need a few definitions.
\begin{define}
A function $f: \{\pm 1\}^n \rightarrow \R$ is said to have
$\alpha(\epsilon, n)$ concentration if
\begin{equation}
\sum_{S\subseteq [n],|S|>\alpha(\epsilon, n)} \hat{f}(S)^2\leq \epsilon,
~\forall~0 < \epsilon < 1.
\end{equation}
\end{define}
In particular, for Boolean functions
$f: \{\pm 1\}^n \rightarrow \{\pm 1\}$, the above relation
is equivalent to
\begin{equation}
\sum_{S\subseteq [n], |S|\leq\alpha(\epsilon, n)}\hat{f}(S)^2\geq 1-\epsilon,
~\forall~0 < \epsilon < 1.
\end{equation}
This follows from Parseval's theorem,
which states that
\begin{equation}
\sum_{S\subseteq [n]}\hat{f}(S)^2 = 1.
\end{equation}
Intuitively, if $\alpha$ assumes small values,
then the concentration of the function is on the low-degree terms of its
Fourier decomposition. (Throughout these notes, we will use the terms
{\em Fourier concentration} and {\em Fourier degree} interchangeably.)
Let us now consider some functions which satisfy this property, i.e.
have low Fourier concentration.
The first example we consider is the class of {\em junta} functions. These
functions depend on only $k \ll n$ variables. Clearly, if $f$ is such a
function which does not depend on a variable $i$, then
\begin{equation}
\hat{f}(S) = 0,
~\forall~S\subseteq [n],~i\in S.
\end{equation}
Since the junta function depends
on only $k$ variables, a direct consequence of the above observation is that
\begin{equation}
\sum_{S\subseteq [n], |S|>k} \hat{f}(S)^2 = 0.
\end{equation}
Thus, the junta function has Fourier degree at most $k$ for any value of
$\epsilon$.
Our next example is the following slightly unusually defined AND function.
\begin{equation}
\AND(x_1, x_2, \ldots, x_k) = \left\{\begin{array}{ll}
1 & {\rm if}~\forall i,~x_i = -1 \\
-1 & {\rm otherwise}.
\end{array}\right.
\end{equation}
\begin{theorem}
The $\AND$ function defined above has Fourier concentration $\log (4/\epsilon)$.
\end{theorem}
\begin{proof}
We consider two cases depending on the value of $\epsilon$.\\
{\em Case 1:} $k\leq \log(4/\epsilon)$. In this case, from our analysis of the
junta function above, we directly obtain
\begin{equation}
\sum_{S\subseteq [n], |S|>\log(4/\epsilon)} \hat{f}(S)^2 = 0 < \epsilon.
\end{equation}
{\em Case 2:} $k > \log(4/\epsilon)$. Define
\begin{eqnarray}
g(x) & = & \left\{\begin{array}{ll}
1 & {\rm if}~\forall i,~x_i = -1 \\
0 & {\rm otherwise}
\end{array}\right.\\
g(x) & = & (\frac{1-x_1}{2})(\frac{1-x_2}{2})\ldots (\frac{1-x_k}{2}) \\
& = & \sum_{S\subseteq [k]}\frac{(-1)^{|S|}}{2^k} \chi_S.
\end{eqnarray}
Now,
\begin{eqnarray}
\AND(x) & = & 2g(x) - 1\\
& = & (-1 + 2^{1-k}) + \sum_{S\subseteq [k], |S| > 0} (-1)^{|S|}2^{1-k}\chi_S.
\end{eqnarray}
Clearly, there are at most $2^k$ non-zero Fourier coefficients corresponding
to characters $S$ such that $|S|>0$. Thus,
\begin{eqnarray}
\sum_{S\subseteq [k], |S|>\log(4/\epsilon)} \hat{f}(S)^2 &
\leq & \sum_{S\subseteq [k], |S|>0} \hat{f}(S)^2\\
& \leq & [(-1)^{|S|}2^{1-k}]^2 2^k\\
& = & \frac{4}{2^k}\\
& < & \epsilon.
\end{eqnarray}
The last inequality follows from the assumption that $k > \log(4/\epsilon)$.
\end{proof}
\section{The Low Degree Algorithm}
The low degree algorithm is a uniform distribution learning algorithm which
is efficient for functions with low Fourier degree. The algorithm is presented
in Figure~\ref{fig:low-degree}. The
sub-routine {\sc SingleCoeff} is the algorithm we saw
in the previous lecture for learning a single Fourier coefficient
of a function. Also, as an optimization, note that the result of a
single set of $m$ queries to the function $f$ can be re-used in
all the iterations of the algorithm since the only property we
need to ensure is that the samples in any single iteration are
independent and uniformly distributed.
\begin{figure}
\centering
\noindent \fbox{
\begin{minipage}{5.5in}
%\centering
{\bf Input:} Oracle access to function $f$ under uniform sampling,
Fourier degree
of $f$ denoted by $d = \alpha(\epsilon, n)$, accuracy parameter $\tau$
and confidence or security parameter $\delta$.
\begin{enumerate}
\item Let $m = O(\frac{n^d}{\tau}\ln \frac{n^d}{\delta})$.
\item Collect $m$ samples from the function.
\item For each set $S\subseteq [n]$ such that $|S|\leq d$,
let $C_S$ be an estimate for $\hat{f}(S)$ obtained using sub-routine {\sc SingleCoeff} with $m$ independent samples from the function $f$.
\item Let $h = \sum_{S\subseteq [n], |S|\leq d} C_S \chi_S$.
\item Output $\sign(h)$ as the estimate of $f$.
\end{enumerate}
\end{minipage}
}
\caption{The low-degree algorithm}
\label{fig:low-degree}
\end{figure}
We now analyze the low-degree algorithm. We prove two lemmas
which will lead us to the ultimate result.
\begin{lemma}\label{lma:first}
If $f$ has $\alpha(\epsilon, n)$ Fourier concentration, then $h$ computed by the algorithm satisfies
\begin{equation}
E_x[(f(x)-h(x))^2] \leq \epsilon + \tau,
\end{equation}
with probability $\geq 1-\delta$.
\end{lemma}
To prove this lemma, we need the following lemma.
\begin{lemma}
With probability $\geq 1-\delta$, for each $S$ satisfying $|S|\leq d$,
\begin{equation}
|C_S - \hat{f}(S)| \leq \gamma,
\end{equation}
where $\gamma = \sqrt{\frac{\tau}{n^d}}$.
\end{lemma}
\begin{proof}
Since the
$O(\frac{n^d}{\tau}\log \frac{n^d}{\delta}) = O(\frac{1}{\gamma^2}\log \frac{n^d}{\delta})$ samples used to
estimate any Fourier coefficient $\hat{f}(S)$ are independent,
we can claim, using Chernoff bound, that
\begin{equation}
|C_S - \hat{f}(S)| > \gamma
\end{equation}
with probability $\leq \frac{\delta}{n^d}$. Since there are $\leq n^d$
sets $S$ such that $|S|\leq d$, the lemma follows using union bound.
\end{proof}
We now use this lemma to prove Lemma~\ref{lma:first}.
\begin{proof-of-lemma}{\ref{lma:first}}
Define
\begin{equation}
g(x) = f(x) - h(x).
\end{equation}
Since the Fourier decomposition is linear, for each $S\subseteq [n]$,
\begin{equation}
\hat{g}(S) = \hat{f}(S) - \hat{h}(S).
\end{equation}
From the algorithm, for any $S$ such that $|S| > d$, $\hat{h}(S) = 0$.
Thus,
\begin{eqnarray}
\sum_{S\subseteq [n], |S|>d}\hat{g}(S)^2 & = & \sum_{S\subseteq [n], |S|>d}\hat{f}(S)^2\\
& \leq & \epsilon.
\end{eqnarray}
The second inequality holds since $d = \alpha(\epsilon, n)$.
On the other hand, for each $S$ such that $|S|\leq d$,
\begin{eqnarray}
\hat{g}(S)^2 & = & (\hat{f}(S) - \hat{h}(S))^2\\
& \leq & \gamma^2,
\end{eqnarray}
with probability $\geq 1-\delta$.
The second inequality follows from the preceding lemma. Since there are
$\leq n^d$ sets $S$ such that $|S|\leq d$,
\begin{eqnarray}
\sum_{S\subseteq [n], |S|\leq d} \hat{g}(S)^2 & \leq & \gamma^2 n^d\\
& = & \tau.
\end{eqnarray}
We now have
\begin{eqnarray}
E_x[(f(x)-h(x))^2] & = & E_x[g(x)^2]\\
& = & \sum_{S\subseteq [n]} \hat{g}(S)^2\\
& = & \sum_{S\subseteq [n], |S|\leq d} \hat{g}(S)^2 + \sum_{S\subseteq [n], |S|> d} \hat{g}(S)^2\\
& \leq & \tau + \epsilon,
\end{eqnarray}
with probability $\geq 1-\delta$. The second equation follows from
Plancherel's theorem.
\end{proof-of-lemma}
We now prove another lemma which will be useful in the final result.
\begin{lemma}\label{lma:second}
For any functions $f: \{\pm 1\}^n\rightarrow \{\pm 1\}$ and
$h: \{\pm 1\}^n\rightarrow \R$,
\begin{equation}
Pr[f(x)\not= \sign(h(x))] \leq E_x[(f(x)-h(x))^2].
\end{equation}
\end{lemma}
\begin{proof}
We have
\begin{equation}
E_x[(f(x)-h(x))^2] = \frac{1}{2^n}\sum_x (f(x)-h(x))^2,
\end{equation}
while
\begin{equation}
Pr[f(x)\not= \sign(h(x))] = \frac{1}{2^n}\sum_x {\bf 1}(x),
\end{equation}
where $\bf 1$ is defined as
\begin{equation}
{\bf 1}(x) = \left\{\begin{array}{ll}
1 & {\rm if}~f(x)\not=\sign(h(x))\\
0 & {\rm otherwise.}
\end{array}
\right.
\end{equation}
Now, for any $x$, if $f(x)\not= \sign(h(x))$, then
\begin{equation}
(f(x)-h(x))^2 \geq 1 = {\bf 1}(x),
\end{equation}
and if $f(x) = \sign(h(x))$, then
\begin{equation}
(f(x)-h(x))^2 \geq 0 = {\bf 1}(x).
\end{equation}
Thus,
\begin{equation}
\sum_x (f(x)-h(x))^2 \geq \sum_x {\bf 1}(x).
\end{equation}
\end{proof}
We now arrive at our Main Theorem. However, before stating it, we
need another definition.
\begin{define}
The Fourier concentration of a concept class of functions is the
maximum Fourier concentration among all the functions in the class.
\end{define}
\begin{theorem}[Main Theorem]
Let concept class $\cal C$ have Fourier concentration
$d = \alpha(\epsilon, n)$. Then, there exists a
$q = O(\frac{n^d}{\epsilon}\log \frac{n^d}{\delta})$-sample uniform
distribution learning algorithm for $\cal C$ (i.e. the algorithm uses
$q$ uniformly distributed independent samples and with probability
$\geq 1-\delta$, outputs a hypothesis $h'$ such that
$Pr[f(x)\not= h'(x)]\leq 2\epsilon$).
\end{theorem}
\begin{proof}
The algorithm is to simply run the low-degree algorithm for
$\tau = \epsilon$. The error bound follows from Lemmas~\ref{lma:first}
and \ref{lma:second} given above.
\end{proof}
What class of functions can we learn using the low-degree algorithm?
One such large class of functions that can be learned in quasi-polynomial
($n^{{\rm polylog}(n)}$) time using this algorithm are those which can be
represented by
constant depth circuits. Recall that a circuit consists of gates
(typically AND, OR and NOT gates; we assume that the first two types
of gates have unbounded fan-in), constants 0 and 1 and variables
$X_1, X_2, \ldots, X_n$. It turns out that any function can be
represented by a constant depth circuit, but we would also like
to ensure that the size of the circuit is polynomial in the number
of input variables $n$. So, one central question is whether all
functions can be computed using constant-depth, polynomial-sized
circuits. The answer is no. In fact, it was proved by
Furst, Saxe and Sipser
that the class of parity functions cannot be computed
using constant depth, polynomial sized circuits.
But what about functions which are computable using constant depth,
polynomial sized circuits? The following theorem explores the
Fourier concentration of such functions.
\begin{theorem}[H{\aa}stad; Linial, Mansour, and Nisan]
For any function $f$ computable using a size-$s$ depth-$d$ circuit,
\begin{equation}
\sum_{S\subseteq [n], |S|>t} \hat{f}(S)^2 \leq \alpha,
\end{equation}
for $t=O((\log \frac{2s}{\alpha})^{d-1})$.
\end{theorem}
Taking $s$ as a polynomial in $n$, $d$ as a constant and
$\alpha = O(\epsilon)$ gives us $t=O(\log^d \frac{n}{\epsilon})$.
Using our main theorem, we can conclude that the number of samples
required for learning such a constant depth polynomial sized circuit
is $n^{O(\log^d \frac{n}{\epsilon})}$, which is quasipolynomial in $n$.
For DNF formulas, this bound can be improved
to $n^{O(\log\log n)}$ samples.
\end{document}