\documentclass[10pt]{article}
\usepackage{amsmath,amssymb}
\newtheorem{define}{Definition}

\oddsidemargin=0.15in
\evensidemargin=0.15in
\topmargin=-.5in
\textheight=9in
\textwidth=6.25in

\begin{document}
\input{preamble.tex}

%{lecture number}{lecture date}{Ronitt Rubinfeld}{Your name}
\lecture{9}{March 5, 2008}{Ronitt Rubinfeld}{Andrew Correa}

\section{Last Time}

Today we look at the problem of learning Fourier coefficients with queries.
The goal is to output all $S$ s.t.\ $|\hat{f}(S)|\ge\theta$, and every $S$ that is
output should have $|\hat{f}(S)|\ge\frac{\theta}{2}$.

We explore the following full binary tree. The $2^k$ nodes on the $k$-th level, $0 \le k \le n$, correspond to
all subsets of $[k]$. A node $S \subseteq [k]$ on the $k$-th level has two children that correspond to $S$ and $S \cup \{k+1\}$. The leaves of the tree correspond to all subsets of $[n]$, and therefore to all Fourier coefficients.

For a node $S_1 \subseteq [k]$ on the $k$-th level, we define
$$f_{k,S_1}(x_{k+1}, \ldots, x_{n}) = \sum_{T_2\subseteq\{k+1,\ldots,n\}} \hat f(S_1 \cup T_2)\cdot \chi_{T_2}(x_{k+1},\ldots,x_n).$$

Our plan is to go down subtrees of value $\mathbb E_x[f^2_{k,S_1}(x)] \ge \frac{\theta^2}{2}$.

% With algorithm:
% \begin{itemize}
% 	\item Fix: $0\le k\le n$
% 	\item \hspace{1em} $S_1 \subseteq[k]$ \hspace{6em}\,\,\,\,``current level''
% 	\item \hspace{1em} $f_{k,S_1}: \{\pm1\}^n\mapsto\mathbb{R}$
% 		\hspace{2em} ``current node''
% 	\item \hspace{2em} s.t. $f_{k,S_1}(x)=\sum_{T_2\subseteq\{k+1,\ldots,n\}}\hat{f}
% 		(S_1\cup T_2)\chi_{T_2}(x)$
% \end{itemize}
% \paragraph{Plan:} Go down subtrees whose root value of $\mathbb{E}
% [f^2_{k,S_1}(x)]\ge\frac{\theta^2}{2}$

\section{Learning Fourier Coefficients with Queries}
So far we looked at the case where we had no choice of which $x$ to use. We had to
make the best of the pairs $(x,f(x))$ we were given. This time we look at the case when we are
allowed to choose which $x$ to query, so we can take advantage of that. The basic
idea is that we will be using an exhaustive search of a (hopefully very) pruned binary tree.
How do we prune it? We will be using an ``amazing'' oracle.

\paragraph{Remember Parseval's:} $1=\mathbb{E}_x[f^2(x)]=\sum_S\hat{f}^2(S)$.

\begin{claim}
\hspace{1em}$\forall k,S_1\subseteq[k]$\[\mathbb{E}_x[f^2_{k,S_1}(x)]=
	\sum_{T_2}\hat{f}^2(S_1\cup T_2)\]
\end{claim}
\begin{proof}
This follows directly from Parseval's, as the Fourier coefficients of $f_{k,S_1}$ are exactly the values $\hat{f}(S_1\cup T_2)$ for $T_2\subseteq\{k+1,\ldots,n\}$.
\end{proof}

Now we must prove that we are traversing an appropriate number (i.e.\ not
too many and not too few) of nodes of the tree. In other words, we must prove
both that we are pruning the bad nodes, whose $\mathbb{E}_x[f^2_{k,S_1}(x)]
<\frac{\theta^2}{2}$, and at the same time traversing into the nodes
whose $\mathbb{E}_x[f^2_{k,S_1}(x)]\ge\frac{\theta^2}{2}$.

\begin{fact}[Not too Few]
\hspace{1em}For all subsets $S_1$, if there exists a
	$\tilde{T}_2$ such that $|\hat{f}(S_1\cup\tilde{T}_2)|>\theta$,
	then: \[\mathbb{E}_x[f^2_{k,S_1}(x)]=\sum_{T_2}\hat{f}^2(S_1\cup T_2)
			  \ge\hat{f}^2(S_1\cup\tilde{T}_2)\ge\theta^2\]
\end{fact}
Thus we know that we will not visit too few nodes in the tree. \\
\indent \ldots but what if we visit too many \ldots

\begin{lemma}[Not too Many]
\hspace{1em}For all $\theta > 0$, we have:
\begin{enumerate}
	\item At most $\frac{1}{\theta^2}$ sets $S$ satisfy $|\hat{f}(S)|\ge\theta$.
	\item For all $0\le k\le n$, at most $\frac{1}{\theta^2}$ functions
		$f_{k,S_1}$ have $\mathbb{E}_x[f^2_{k,S_1}(x)]\ge\theta^2$.
\end{enumerate}
\end{lemma}

Notice two things. First, notice that Part 1 bounds the number of returned
nodes $S$ and Part 2 bounds the running time, so both the amount of returned
data \emph{and} the running time are bounded from above. Second, notice that
while the actual values will differ slightly from those above, it will be
by only a constant factor. \\

\begin{proof}
\begin{enumerate}
	\item Assume that Part 1 (above) is false. Then:\[1=\sum_S\hat{f}^2(S)>\left(
		\frac{1}{\theta^2}\right)\cdot\theta^2=1\]
		And thus $1>1$ which is a contradiction.
	\item Assume that Part 2 (above) is false. Then given $k$:
		\begin{eqnarray*}
			1&=&\sum_S\hat{f}^2(S) \\
			&=& \sum_{S_1\subseteq[k]}\sum_{T_2\subseteq\{k+1,\ldots,n\}}
						\hat{f}^2(S_1\cup T_2) \\
			&=& \sum_{S_1\subseteq[k]}\mathbb{E}_x[f^2_{k,S_1}(x)] \\
			&>& \left(\frac{1}{\theta^2}\right)\cdot\theta^2 ~ = ~ 1
		\end{eqnarray*}
		And thus $1>1$, which is (still) a contradiction.
\end{enumerate}
\end{proof}

Now that we know that the algorithm neither leaves anything out nor selects
too much, how can we speed it up? How can we quickly \emph{estimate}
$f_{k,S_1}(x)$?

\begin{lemma}[$f_{k,S_1}(x)$ Estimation]
	\[f_{k,S_1}(x)=\mathbb{E}_{y\in\{\pm1\}^k}[f(yx)\chi_{S_1}(y)]\]
\end{lemma}

Note that this can be estimated by sampling. Picking random $y$'s and outputting
the average value gives a $\gamma$-additive approximation, by Chernoff Bounds, using
$O(\frac{1}{\gamma^2}\log\frac{1}{\delta})$ samples (where $\delta$ is the allowed
failure probability). Then we can use these estimates to estimate $\mathbb{E}_x[f^2_{k,S_1}(x)]$, also by Chernoff Bounds.

\begin{proof}
First notice some (maybe obvious) things:
\begin{itemize}
\item $f(yx)=\sum_T\hat{f}(T)\chi_T(yx)$
\item For $T=T_1\cup T_2$, where $T_1\subseteq[k]$, and $T_2\subseteq\{k+1,\ldots,n\}$,
	$$\chi_T(yx)=\chi_{T_1}(y)\chi_{T_2}(x).$$
\end{itemize}
Now the proof:
\begin{eqnarray*}
	\mathbb{E}_y\left[f(yx)\chi_{S_1}(y)\right] & = & \mathbb{E}_y\left[\underbrace{\sum_{T_1}
		\sum_{T_2}\hat{f}(T_1\cup T_2)\chi_{T_1}(y)\chi_{T_2}(x)}_{f(yx)}\chi_{S_1}(y)\right] \\
	& = & \sum_{T_1}\sum_{T_2}\hat{f}(T_1\cup T_2)\chi_{T_2}(x)\underbrace{\mathbb{E}_y\left[
		\chi_{T_1}(y)\chi_{S_1}(y)\right]}_{0 \textrm{ unless }T_1=S_1} \\
	& = & \sum_{T_2}\hat{f}(S_1\cup T_2)\chi_{T_2}(x) \\
	& = & f_{k,S_1}(x)
\end{eqnarray*}
\end{proof}

So the algorithm then becomes:
\begin{enumerate}
\item If $k=n$, output $S_1$.
\item Else: 
\begin{enumerate}
\item If estimate of $\mathbb{E}[f^2_{k+1,S_1\cup\{k+1\}}(x)] \ge\frac{\theta^2}{2}$, recurse on $(k+1,S_1\cup\{k+1\})$.
\item If estimate of $\mathbb{E}[f^2_{k+1,S_1}(x)]\ge\frac{\theta^2}{2}$, recurse on $(k+1,S_1)$.
\end{enumerate}
\end{enumerate}

\begin{theorem}
\hspace{1em} For all $\theta>0$, this algorithm outputs $\mathcal{S}=
\{S_1,\ldots,S_l\}$ with $l=O\left(\frac{1}{\theta^2}\right)$ such that
with probability greater than $1-\delta$:
\begin{enumerate}
	\item $\forall S\in\mathcal{S}, \hspace{1em}|\hat{f}(S)|\ge\frac{\theta}{2}$,
	\item $\forall S\notin\mathcal{S}, \hspace{1em}|\hat{f}(S)|<\theta$,
\end{enumerate}
with query complexity ${\rm poly}(n,\frac{1}{\theta},\log\frac{1}{\delta})$.
\end{theorem}

\begin{proof}
\begin{enumerate}
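	\item If $|\hat{f}(S)|<\frac{\theta}{2}$, then $\hat{f}^2(S)<\frac{\theta^2}{4}$. At the leaf for $S$ we have $\mathbb{E}_x[f^2_{n,S}(x)]=\hat{f}^2(S)$, so a sufficiently accurate estimate is below $\frac{\theta^2}{2}$ and $S$ is not output.
	\item If $|\hat{f}(S)|>\theta$, then by the ``Not too Few'' fact every ancestor of $S$ has value at least $\theta^2$, so all ancestors of $S$ will continue the recursion.
\end{enumerate}
The total number of nodes explored will be $O(\frac{n}{\theta^2})$.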
\end{proof}

After we learn a small set of significant Fourier coefficients, we can use the following theorem
to compute a function close to $f$.

\begin{theorem}\label{thm:approximate_function}
There exists an algorithm, which given $\mathcal{S}\subseteq2^{[n]}$ such that
$\sum_{S\in\mathcal{S}}\hat{f}^2(S)\ge1-\varepsilon$ and examples, with probability
$1-\delta$ outputs $g:\{\pm1\}^n\mapsto\mathbb{R}$ such that:
\begin{itemize}
	\item $g(x)=\sum_{S\in\mathcal{S}}C_S\chi_S(x)$
	\item $\Pr[f(x)\ne{\rm sign}(g(x))]\le\varepsilon+\tau$
	\item The number of examples/queries is ${\rm poly}(n,|\mathcal{S}|,\frac{1}{\tau},\log\frac{1}{\delta})$.
\end{itemize}
\end{theorem}

\begin{proof}
The proof uses the same method as the analysis of the low degree algorithm.
\end{proof}

\paragraph{In general:} \emph{Good} functions for our approach are functions like the parity
functions $\chi_S$ (only one non-zero Fourier coefficient).
\emph{Problem} functions are, for instance, functions of the form:
\[P_2=(x_1\wedge x_2)\oplus\ldots\oplus(x_{n-1}\wedge x_n)=\sum_S
\pm2^{-\frac{n}{2}}\cdot\chi_S\]
since they have many small Fourier coefficients.

\begin{definition}
For a function $f:\{\pm1\}^n\mapsto\mathbb{R}$ the \emph{$L_1$ norm} of
$f$ is $L_1(f)=\sum_S|\hat{f}(S)|$.
\end{definition}

Notice that for the above example of good and bad functions, the $L_1$ norm of a
good function is 1 and the $L_1$ norm of a bad function is $2^{n/2}$.

\begin{claim}
Let $f:\{\pm1\}^n\mapsto\{\pm1\}$ be a function. Given $\varepsilon$, let $\mathcal{S}_\varepsilon=\left\{S\subseteq[n]\,\middle|\,
|\hat{f}(S)|\ge\frac{\varepsilon}{L_1(f)}\right\}$. It holds:
\begin{enumerate}
	\item $\sum_{S\in\mathcal{S}_\varepsilon}\hat{f}^2(S)\ge1-\varepsilon$
	\item $|\mathcal{S}_\varepsilon|\le\frac{L_1^2(f)}{\varepsilon}$
\end{enumerate}
\end{claim}

\begin{proof}
\begin{enumerate}
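	\item $\sum_{S\notin\mathcal{S}_\varepsilon}\hat{f}^2(S)\le\frac{\varepsilon}{L_1(f)}
		\sum_S|\hat{f}(S)|\le\varepsilon$, so by Parseval's the coefficients in $\mathcal{S}_\varepsilon$ carry weight at least $1-\varepsilon$.
	\item $|\mathcal{S}_\varepsilon|\frac{\varepsilon}{L_1(f)}\le\sum_{S\in
		\mathcal{S}_\varepsilon}|\hat{f}(S)|\le\sum_{S\subseteq[n]}|\hat{f}(S)|=
		L_1(f)$, and rearranging gives the bound.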
\end{enumerate}
\end{proof}

\begin{theorem}
We can learn any Boolean function $f$ to $\varepsilon$-accuracy with
queries in time ${\rm poly}(n,L_1(f),\frac{1}{\varepsilon})$.
\end{theorem}

\begin{proof-sketch}
Suppose first that we know $L_1(f)$. We first learn all coefficients in
$\mathcal{S}_\varepsilon$ (plus perhaps a few other coefficients).
We are interested in coefficients with $|\hat f(S)| \ge \frac{\varepsilon}{L_1(f)}$,
and we run the algorithm that uses queries to find a set of size $O(|\mathcal{S}_\varepsilon|)$
that contains all of them. Then, we run the algorithm of Theorem~\ref{thm:approximate_function}
to compute $f$'s approximation.

But we are not given $L_1(f)$, so what should we do? We try setting $L_1(f)$
to $1, 2, 4, 8, \ldots$. Each time we compare the $g$ that we obtain to $f$ on random samples to test
if we are done.
\end{proof-sketch}

% \paragraph{Question:} The question is class was raised: Can we make this
% faster by using some sort of greedy depth-first search to alter the $\theta$
% parameter on the fly?
% 
% \paragraph{Answer:} Probably. Though the above mthod is worse by only a
% constant factor of 2, so it would not be a huge gain.

\end{document}