\documentclass[10pt]{article}
\newtheorem{define}{Definition}
%\newcommand{\Z}{{\mathbb{Z}}}
%\usepackage{psfig}

\oddsidemargin=0.15in
\evensidemargin=0.15in
\topmargin=-.5in
\textheight=9in
\textwidth=6.25in

\begin{document}
\input{preamble.tex}

%{lecture number}{lecture date}{Ronitt Rubinfeld}{Your name}
\lecture{4}{February 21, 2006}{Ronitt Rubinfeld}{Daniel Dumitran}

%%%% body goes in here %%%%

\section{Fourier Representation}

Let us consider the functions $f: \{\pm1\}^n \rightarrow \{\pm1\}$ and
the following inner product 
$\langle f, g \rangle = \frac{1}{2^n}\displaystyle\sum_{x\in\{\pm1\}^n}f(x)g(x)$.

\begin{definition}
Let $S\in\{\pm1\}^n$. We define $\chi_S : \{\pm1\}^n \rightarrow \{\pm1\}$ with
$\chi_S(x) = \displaystyle\prod_{i\;st\;s_i=-1}x_i$. Note that throughout the
lectures, $S$ is sometimes used as a subset of $[n]$ instead of a vector. If
$S\subseteq[n]$ then $\chi_S(x) = \displaystyle\prod_{i\in S}x_i$.
\end{definition}

Notice that functions $\chi_S$ form an orthonormal basis under inner product
$\langle \rangle$ (i.e. $\langle \chi_S, \chi_T \rangle = \delta_{S,T}$
\footnote{$\delta(S,T)$ is $1$ if $S=T$ and $0$ otherwise.}).

\begin{definition}
  $\forall S \in \{\pm1\}^n$ we define
  $\hat{f}(S)=\langle f, \chi_S \rangle = 
  \frac{1}{2^n}\displaystyle\sum_{x\in\{\pm1\}^n}f(x)\chi_S(x)$
\end{definition}

\begin{theorem}
  $\forall f$ we have 
  $f(x)=\frac{1}{2^n}\displaystyle\sum_{z\in\{\pm1\}^n}\hat{f}(x)\chi_z(x)$
\end{theorem}

\begin{remark}
  $f$ linear $\Leftrightarrow \exists S\in \{\pm1\}^n$ st $\forall T\in \{\pm1\}^n$
  we have $\hat{f}(T) = \delta_{S, T}$.
\end{remark}

\begin{definition}
  $dist(f,g) = Pr_{x\in\{\pm1\}^n}[f(x) \neq g(x)]$
\end{definition}

\begin{lemma}\label{lemma1}
  $\forall S \in \{\pm1\}^n$ and $f:\{\pm1\}^n \rightarrow \{\pm1\}$, we have
  $\hat{f}(S)=1-2*dist(f, \chi_S)$
\end{lemma}

\begin{proofof}{Lemma \ref{lemma1}}
  \begin{eqnarray*}
    \hat{f}(S)
    &=& \frac{1}{2^n}\sum_{x\in\{\pm1\}^n}f(x)\chi_S(x) \\
    &=& \frac{1}{2^n}[\sum_{x\;st\;f(x)=\chi_S(x)}f(x)\chi_S(x) +
                      \sum_{x\;st\;f(x)\neq\chi_S(x)}f(x)\chi_S(x)]
  \end{eqnarray*}
  However, $f(x)\chi_S(x)$ is $1$ for all terms in the first sum and $-1$ for
  all terms in the second sum. Therefore

  \begin{eqnarray*}
    \hat{f}(S)
    &=& \frac{1}{2^n}[2^n - 2\sum_{x\;st\;f(x)\neq\chi_S(x)}-1] \\
    &=& 1 - 2Pr[f(x) \neq \chi_S(x)] \\
    &=& 1 - dist(f, \chi_S(x))    
  \end{eqnarray*}   

\end{proofof}

Let $S \neq T$ be two elements in $\{\pm1\}^n$. We have
\begin{eqnarray*}
  dist(\chi_S, \chi_T) &=& \frac{1-\hat{\chi_T}(S)}{2} \\
  &=& \frac{1-\langle \chi_S,\chi_T \rangle}{2} \\
  &=& \frac{1}{2}
\end{eqnarray*}

What this tells us is that two different linear functions agree on
EXACTLY half of their inputs.

\section{Parseval's Identity (for Boolean functions only)}

\begin{lemma}\label{lemma2}
  $\forall f: \{\pm1\}^n \rightarrow \{\pm1\}$ we have
  $\displaystyle\sum_{S\in\{\pm1\}^n}[\hat{f}(S)]^2=1$.\\
  (For the general case, we have
  $<f,f>=\displaystyle\sum_{S\in\{\pm1\}^n}[\hat{f}(S)]^2$.)

\end{lemma}

We are going to prove the lemma for Boolean functions only.

\begin{proofof}{Lemma \ref{lemma2}}
  If $f$ is Boolean, we have $<f,f>=1$ because
  \begin{eqnarray*}
    <f,f> &=& \frac{1}{2^n}\sum_{x\in\{\pm1\}^n}f^2(x) \\
          &=& \frac{1}{2^n}\sum_{x\in\{\pm1\}^n}1 \\
          &=& \frac{1}{2^n}2^n \\
          &=& 1
  \end{eqnarray*}
  However, we also have
  \begin{eqnarray*}
    <f,f> &=& <\sum_{S\in\{\pm1\}^n}\hat{f}(S)\chi_S, 
               \sum_{T\in\{\pm1\}^n}\hat{f}(T)\chi_T> \\
          &=& \sum_{S,T} \hat{f}(S) \hat{f}(T) <\chi_S, \chi_T> \\
          &=& \sum_{S,T} \hat{f}(S) \hat{f}(T) \delta_{S,T} \\
          &=& \sum_{S} [\hat{f}(S)]^2*1 \\
          &=& \sum_{S} [\hat{f}(S)]^2
  \end{eqnarray*}
  Therefore $\displaystyle\sum_{S} [\hat{f}(S)]^2=1$
\end{proofof}

\section{More Linearity Testing}

We have
\begin{eqnarray*}
  f(xy)=f(x)f(y) &\Leftrightarrow& f(x)f(y)f(xy)=1 \\
                 &\Leftrightarrow& \frac{1-f(x)f(y)f(xy)}{2}=0
\end{eqnarray*}
and
\begin{eqnarray*}
  f(xy)\neq(x)f(y) &\Leftrightarrow& f(x)f(y)f(xy)=-1 \\
                    &\Leftrightarrow& \frac{1-f(x)f(y)f(xy)}{2}=1.
\end{eqnarray*}
It is therefore a natural choice to use the indicator variable 
$I[\frac{1-f(x)f(y)f(xy)}{2}]$
in order to measure the probability of group law failure of
a function $f$.

\begin{eqnarray*}
  E_{x,y}[f(x)f(y)f(xy)] &=& E_{x,y}\{[\sum_{S}\hat{f}(S)\chi_S(x)]
                                      [\sum_{T}\hat{f}(T)\chi_T(y)]
                                      [\sum_{U}\hat{f}(U)\chi_U(xy)]\} \\
                         &=& E_{x,y}[\sum_{S,T,U} \hat{f}(S)\hat{f}(T)\hat{f}(U)
                                                  \chi_S(x)\chi_T(y)\chi_U(xy)] \\
                         &=& \sum_{S,T,U}\{ \hat{f}(S)\hat{f}(T)\hat{f}(U)
                             E_{x,y}[\chi_S(x)\chi_T(y)\chi_U(xy)]\}
\end{eqnarray*}
Let us first compute $E_{x,y}[\chi_S(x)\chi_T(y)\chi_U(xy)]$. There are two cases to analyze:
($S=T=U$) and ($S\neq U$ or $T\neq U$). \\
If $S=T=U$, we have
\begin{eqnarray*}
E_{x,y}[\chi_S(x)\chi_T(y)\chi_U(xy)]
   &=& E_{x,y}[\chi_S(x)\chi_S(y)\chi_S(xy)] \\
   &=& E_{x,y}[\prod_{i\in S}x_i \prod_{i\in S}y_i \prod_{i\in S}(x_i y_i)] \\
   &=& E_{x,y}[\prod_{i\in S}(x_i y_i)^2] \\
   &=& E_{x,y}[\prod_{i\in S}1] = 1.
\end{eqnarray*}

If $S\neq U$ or $T\neq U$, then
\begin{eqnarray*}
E_{x,y}[\chi_S(x)\chi_T(y)\chi_U(xy)]
  &=& E_{x,y} [\prod_{i\in S}x_i \prod_{j\in T}y_j (\prod_{k\in U}x_k \prod_{l\in U}y_l)]  \\
  &=& E_{x,y} [\prod_{i\in S \Delta U}x_i \prod_{j\in T \Delta U}y_j] \\
  &=& E_x[\prod_{i\in S \Delta U}x_i] * E_y[\prod_{j\in T \Delta U}y_j] \\
  &=& 0
\end{eqnarray*}
because either $E_x[\displaystyle\prod_{i\in S \Delta U}x_i]$ or 
$E_y[\displaystyle\prod_{j\in T \Delta U}y_j]$ is $0$. \\
Having computed $E_{x,y}[\chi_S(x)\chi_T(y)\chi_U(xy)]$, we come back to
$E_{x,y}[f(x)f(y)f(xy)]$:

\begin{eqnarray*}
  E_{x,y}[f(x)f(y)f(xy)] 
    &=& \sum_{S,T,U}\{ \hat{f}(S)\hat{f}(T)\hat{f}(U)
              E_{x,y}[\chi_S(x)\chi_T(y)\chi_U(xy)]\} \\
    &=& \sum_S{[\hat{f}(S)]^3} \leq max [ \hat{f}(S) \sum_S{\hat{f}^2(S)}] \\
    &=& max[\hat{f}(s)] (due\;to\;Parseval's\;identity)\\
    &=& 1 - 2 min[dist(f, \chi_S)].
\end{eqnarray*}
Therefore, we know that $Pr[group\;law\;failure] \geq min_S[dist(f, \chi_S)]$.


\section{Learning functions with Sparse Fourier Representation}

\begin{definition}
  Let $f: \{\pm1\}^n \rightarrow \{\pm1\}$ and $g: \{\pm1\}^n \rightarrow \Re$. \\
  We say that $g$ \emph {$\epsilon$-approximates f} (in $L_2$-norm) if
  $E_x[(f(x)-g(x))^2] \leq \epsilon$.
\end{definition}

We will use the sign of $g$ to predict the values of $f$ (we are not interested in the
magnitude of $g$; just its sign). If $f(x) \neq sign(g(x))$, we have a \emph{prediction error}.

\begin{claim}\label{claim1}
  $Pr[f(x) \neq sign(g(x))] \leq E_x[(f(x)-g(x)^2]$
\end{claim}
\begin{proofof}{Claim \ref{claim1}}
   We will analyze $I[f(x) \neq sign(g(x))] = 1 - \delta_{f(x), sign(g(x))}$. \\
   \ \ \ Let us denote the indicator variable above by $I$. 
   There are two cases to analyze depending
   if $f(x)$ is equal or not to $sign(g(x))$. \\
   \ \ \ If $f(x) = sign(g(x))$ then obviously we have $I=0$. We also know 
   that $(f(x)-g(x))^2\geq0$, therefore $I \leq (f(x)-g(x))^2$. \\
   \ \ \ If $f(x) \neq sign(g(x))$ then $I=1$; however, in this case, 
   $(f(x)-g(x))^2\geq1$. This means that $I \leq (f(x)-g(x))^2$. \\
   \ \ \ We have seen that $I \leq (f(x)-g(x))^2$ regardless of $x$.
\begin{eqnarray*}
  \forall x \; I[f(x) \neq sign(g(x))] \leq (f(x)-g(x))^2
    &\Rightarrow& E_x[I[f(x) \neq sign(g(x))]] \leq E_x[(f(x)-g(x))^2] \\
    &\Leftrightarrow& Pr_x[f(x) \neq sign(g(x))] \leq E_x[(f(x)-g(x))^2]    
\end{eqnarray*}
   
\end{proofof}
\end{document}