\documentclass[10pt]{article}
\newtheorem{define}{Definition}
\usepackage{amsmath,amsfonts}
%\newcommand{\Z}{{\mathbb{Z}}}
\newcommand{\e}{\epsilon}
\newcommand{\bq}{\{-1,1\}^{n}}
\newcommand{\Exp}{{\mathbb{E}}}
%\usepackage{psfig}
\oddsidemargin=0.15in
\evensidemargin=0.15in
\topmargin=-.5in
\textheight=9in
\textwidth=6.25in
\newcommand{\NS}{{\rm \,NS}}
\begin{document}
\input{preamble.tex}
%{lecture number}{lecture date}{Ronitt Rubinfeld}{Your name}
\lecture{07: Learning Halfspaces}{February 27, 2008}{Ronitt Rubinfeld}{Ning Xie}
\section{Review of Last Lecture}
Last time we said that a function
$f:\{-1,1\}^{n} \to \mathbb{R}$ has $\alpha(\epsilon, n)$-Fourier
concentration if
\[
\sum_{S\subseteq[n], |S|>\alpha(\epsilon,n)}\hat{f}(S)^{2}\leq \epsilon
\]
for all
$0 < \epsilon < 1$. For functions that have $d=\alpha(\epsilon,n)$-Fourier concentration, we showed
the \emph{Low Degree Algorithm} for learning such functions: estimate all the low-degree Fourier coefficients (that is, $\hat{f}(S)$ for all $|S|\leq d$) and output the sign of the estimated low-degree polynomial (output hypothesis $\mathrm{sign}(\sum_{S:|S|\leq d}C_{S}\chi_{S}(x))$, where the $C_{S}$ are the estimated Fourier coefficients). Today we are going to see further applications of the Low Degree Algorithm in learning theory.
\section{Noise Sensitivity}
\begin{definition}[Linear Threshold Function]
A Boolean function $h(x)$ is called a Halfspace Function (or Linear Threshold Function) if
$h$ can be written as $h(x)=\mathrm{sign}(\sum_{i=1}^{n}w_{i}x_{i}-\theta)$, where
$w_{i}$ are real numbers called weights and $\mathrm{sign}(x)$ is $1$ if $x\geq 0$ and $-1$ otherwise.
\end{definition}
We are going to see an algorithm that learns halfspaces (under the uniform distribution) with sample complexity
$n^{O(1/\epsilon^{2})}$. There are other learning algorithms with better sample complexity.
The advantage of the algorithm we study is that it can be easily generalized to learn any function that depends
on a constant number of halfspaces. The main tool we are going to use is the Low Degree Algorithm but
combined with a key new idea: noise sensitivity.
\begin{definition}[Noise Operator]
For any $0 < \epsilon < 1/2$, define the noise operator $N_{\e}:\bq \to \bq$ such that
each bit of $N_{\e}(x)$ is obtained by randomly flipping each bit of $x$ independently with probability
$\e$. That is, independently for each $1\leq i \leq n$, $\Pr[N_{\e}(x)_{i}=-x_{i}]=\e$.
\end{definition}
\begin{definition}[Noise Sensitivity]
For any Boolean function $f$, define its noise sensitivity, denoted by $\NS_{\e}(f)$, to be
\[
\NS_{\e}(f)=\Pr_{x, {\rm random\ noise}}[f(x) \neq f(N_{\e}(x))]
\]
\end{definition}
Note that the notion of noise operator is similar to the $\delta$-biased distribution we saw in
H{\r{a}}stad's test. One may think H{\r{a}}stad's dictator testing algorithm tests both
linearity and noise sensitivity at the same time. An easy fact is, if $x$ is uniform over $\bq$ then
so is $N_{\e}(x)$.
We next see the noise sensitivities of some functions.
\begin{fact}[Dictator Function]
If $f(x)=x_{i}$, then $\NS_{\e}(f)=\e$.
\end{fact}
\begin{fact}[AND Function]
If $f(x)=x_{1}\wedge \cdots \wedge x_{k}$, then $\NS_{\e}(f)=\frac{1}{2^{k-1}}(1-(1-\e)^{k})$. This is because
\begin{align*}
\NS_{\e}(f)&=\Pr[f(x)=-1 \mbox{ and } f(N_{\e}(x))=1] + \Pr[f(x)=1 \mbox{ and } f(N_{\e}(x))=-1]\\
&=2\Pr[f(x)=1 \mbox{ and } f(N_{\e}(x))=-1]\\
&=2\frac{1}{2^{k}}(1-(1-\e)^{k}).
\end{align*}
Note that for $k \ll \frac{1}{\e}$, $\NS_{\e}(f) \approx \frac{k\e}{2^{k-1}}$. If
$k\gg \frac{1}{\e}$, then $\NS_{\e}(f) \approx \frac{1-e^{-k\e}}{2^{k-1}}$.
\end{fact}
\begin{fact}[Majority Function]
If $f(x)=\mathrm{MAJ}(x_{1}, \ldots, x_{n})=\mathrm{sign}(x_{1}+\cdots+x_{n})$, then
$\NS_{\e}(f)=O(\sqrt{\e})$.
\end{fact}
\begin{proof-sketch}
Here we only give a rough outline of the proof. One may think of computing the majority of $x$ as a random walk
on the real line. The random walk starts from the origin and at step $i$ it flips a fair coin to determine the value
of $x_{i}$ and moves left or right accordingly. After $n$ steps, it stops and outputs $1$ if it ends at
some position $z\geq 0$ and outputs $-1$ otherwise. A well-known fact is that the expected distance from the origin
after $n$ unbiased coin-flips is $\Theta(\sqrt{n})$. In fact, if $c$ is a sufficiently small constant, then
the probability that the random walk ends at distance from origin $\geq c\sqrt{n}$ is pretty high.
One way of seeing this fact is to consider the weight distribution of vectors in the Boolean cube.
Although $\sum_{i}x_{i}=0$ is the most likely configuration,
there are only $\Theta(\frac{2^{n}}{\sqrt{n}})$ vectors at this point.
In fact, almost all vectors are distributed between $\sum_{i}x_{i}=-\sqrt{n}$ and $\sum_{i}x_{i}=\sqrt{n}$.
Now we consider $N_{\e}(x)$ as a second random walk starting from the endpoint of the previous walk
(that is, starts from $\sum_{i}x_{i}$). This time there are only $\e n$ coin-flips and each coin-flip outputs
$1$ and $-1$ equally likely. Note that since we are ``correcting'' the previous noiseless random walk,
the step size of the second walk is $2$ and consequently the expected displacement is $2\sqrt{\e n}$.
Suppose the first random walk ends at $c\sqrt{n}$ for some small constant $c$. Then by Markov's inequality,
\begin{align*}
&\quad \Pr[\text{2nd walk leaves us on the other side of origin}] \\
&\leq \Pr[\text{the displacement of the second walk is larger than $c\sqrt{n}$}] \\
&\leq \frac{2\sqrt{\e n}}{c\sqrt{n}}=O(\sqrt{\e}).
\end{align*}
\end{proof-sketch}
In fact, it is known that this bound on the noise sensitivity of Majority functions is tight (up to a constant factor).
That is, $\NS_{\e}(\mathrm{MAJ})=\Theta(\sqrt{\e})$.
\begin{fact}[Linear Threshold Function \mbox{[Peres]}]\label{fact:Peres}
For any linear threshold function LTF,
\[
\NS_{\e}(\mathrm{LTF})\leq 8.8\sqrt{\e}.
\]
\end{fact}
\begin{fact}[Parity Function]
If $f(x)=\chi_{S}(x)$ for some $S \subseteq [n]$, then
\[
\NS_{\e}(f)=\frac{1-(1-2\e)^{|S|}}{2}.
\]
\end{fact}
This fact is a special case of the theorem we are going to prove next.
\begin{theorem}
For any Boolean function $f:\bq \to \{-1,1\}$,
\[
\NS_{\e}(f)=\frac{1}{2}-\frac{1}{2}\sum_{S\subseteq [n]}(1-2\e)^{|S|}\hat{f}(S)^{2}.
\]
\end{theorem}
\begin{proof}
By the definition of noise sensitivity, we have
{\allowdisplaybreaks
\begin{align*}
\NS_{\e}(f) &= \Pr_{x, y=N_{\e}(x)}[f(x)\neq f(y)]\\
&=\Exp[\mathbf{1}_{f(x)\neq f(y)}]\\
&=\Exp[\frac{(f(x)-f(y))^{2}}{4}] \quad \text{(since $f$ is a Boolean-valued function)}\\
&=\Exp[\frac{2-2f(x)f(y)}{4}] \\
&=\frac{1}{2}-\frac{1}{2}\Exp_{x,y}[f(x)f(y)] \\
&=\frac{1}{2}-\frac{1}{2}\sum_{S,T \subseteq [n]}\hat{f}(S)\hat{f}(T)\Exp_{x,y}[\chi_{S}(x)\chi_{T}(y)]\\
&=\frac{1}{2}-\frac{1}{2}\sum_{S\subseteq [n]}\hat{f}(S)^{2}\Exp_{x,y}[\chi_{S}(x)\chi_{S}(y)].
\end{align*}
Note that $\chi_{S}(x)$ and $\chi_{S}(y)$ take values in $\{-1,1\}$. So
if we let $e_{x_{i}}$ (resp. $e_{y_{i}}$) denote the vector that has value $x_{i}$ (resp. $y_{i}$) at position $i$ and
$1$ at all other places, then
\begin{align*}
\Exp_{x,y}[\chi_{S}(x)\chi_{S}(y)] &= \Exp_{x,y}[\prod_{i=1}^{n}\chi_{S}(e_{x_{i}})\chi_{S}(e_{y_{i}})]\\
&= \Exp_{x,y}[\prod_{i \in S }\chi_{S}(e_{x_{i}})\chi_{S}(e_{y_{i}})]\\
&= \prod_{i \in S} \Exp_{x,y}[\chi_{S}(e_{x_{i}})\chi_{S}(e_{y_{i}})]\\
&=\prod_{i \in S}(\Pr[\chi_{S}(e_{x_{i}})=\chi_{S}(e_{y_{i}})]-\Pr[\chi_{S}(e_{x_{i}}) \neq \chi_{S}(e_{y_{i}})])\\
&=\prod_{i \in S}(\Pr[x_{i}=y_{i}]-\Pr[x_{i} \neq y_{i}])\\
&=\prod_{i \in S}(1-2\NS_{\e}(x_{i}))\\
&=(1-2\e)^{|S|}.
\end{align*}
}% end of allowdisplaybreaks
This completes the proof of the theorem.
\end{proof}
\section{Noise Sensitivity vs.\ Fourier Concentration}
The main reason that we study noise sensitivity is the following connection between noise sensitivity
and Fourier concentration for Boolean functions.
\begin{theorem}\label{thm:connection}
Let $f:\bq \to \{-1,1\}$ be a Boolean function and let $0 < \gamma < 1/2$. Then
\[
\sum_{|S|\geq 1/\gamma}\hat{f}(S)^{2} < 2.32\NS_{\gamma}(f).
\]
\end{theorem}
\begin{proof}
\begin{align*}
2\NS_{\gamma}(f) &= 1- \sum_{S \subseteq [n]}(1-2\gamma)^{|S|}\hat{f}(S)^{2}\\
&= \sum_{S \subseteq [n]}\hat{f}(S)^{2} - \sum_{S \subseteq [n]}(1-2\gamma)^{|S|}\hat{f}(S)^{2} \\
&= \sum_{S \subseteq [n]}(1-(1-2\gamma)^{|S|})\hat{f}(S)^{2}\\
&\geq \sum_{|S|\geq 1/\gamma}(1-(1-2\gamma)^{1/\gamma})\hat{f}(S)^{2}\\
&\geq \sum_{|S|\geq 1/\gamma}(1-e^{-2})\hat{f}(S)^{2}.
\end{align*}
Finally by numerical calculation, $\frac{2}{1-e^{-2}}< 2.32$.
\end{proof}
The following is a simple corollary of Theorem~\ref{thm:connection} which says that
a Boolean function $f$ has small Fourier concentration if there is a good upper bound on
the noise sensitivity of $f$.
\begin{corollary}\label{cor:main}
Let $f:\bq \to \{-1,1\}$ be a Boolean function and let $\beta:[0,1/2] \to [0,1/2]$ be a real-valued function
such that $\NS_{\gamma}(f) \leq \beta(\gamma)$. Then
\[
\sum_{|S|\geq \left(\beta^{-1}(\frac{\e}{2.32})\right)^{-1}}\hat{f}(S)^{2} \leq \e,
\]
where $\beta^{-1}$ is the inverse function for function $\beta$.
\end{corollary}
\section{Application: Learning Halfspaces and Intersections of Halfspaces}
Now it is easy to see the following corollary by combining Fact~\ref{fact:Peres} and Corollary~\ref{cor:main}:
\begin{corollary}\label{cor:LTF}
If $f:\bq \to \{-1,1\}$ is a halfspace function, then
\[
\sum_{|S|\geq O(\frac{1}{\e^{2}})}\hat{f}(S)^{2} \leq \e.
\]
Therefore, by applying the Low Degree Algorithm to $f$, we see that halfspace functions can be learned
with $n^{O(\frac{1}{\e^{2}})}$ samples under the uniform distribution.
\end{corollary}
Note that the Fourier concentration bound of halfspace functions in Corollary~\ref{cor:LTF} can be easily
generalized to arbitrary functions that depend on $k$ halfspace functions by upper bounding the noise sensitivity
of such functions. Let $h_{1}, \ldots, h_{k}$ be $k$ arbitrary halfspace functions.
Let $g: \{-1,1\}^{k} \to \{-1,1\}$ be any Boolean function defined on $k$ variables. Define
$f(x)=g(h_{1}(x), \ldots, h_{k}(x))$. Then we have the following upper bound on the noise sensitivity of $f$.
\begin{theorem}
\[
\NS_{\e}(f) \leq 8.8k\sqrt{\e}.
\]
\end{theorem}
\begin{proof}
\begin{align*}
\NS_{\e}(f) &= \Pr[g(h_{1}(x), \ldots, h_{k}(x)) \neq g(h_{1}(N_{\e}(x)), \ldots, h_{k}(N_{\e}(x)))]\\
&\leq \sum_{i=1}^{k}\Pr[h_{i}(x) \neq h_{i}(N_{\e}(x))] \quad \text{(By union bound)}\\
&\leq k\cdot 8.8\sqrt{\e}. \quad \text{(By Fact~\ref{fact:Peres})}
\end{align*}
\end{proof}
Applying the Low Degree Algorithm again, we conclude that any function that depends on
$k$ halfspace functions can be learned
with $n^{O(\frac{k^{2}}{\e^{2}})}$ samples under the uniform distribution.
\end{document}