\documentclass[runningheads, 11pt]{article}
\usepackage{graphicx,verbatim,xspace,color}
\usepackage{amsmath,amssymb,amsthm}
\usepackage{fullpage}
\theoremstyle{plain}
\newtheorem{theorem}{Theorem}[section]
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{corollary}[theorem]{Corollary}
\newtheorem{claim}[theorem]{Claim}
\theoremstyle{definition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{remark}[theorem]{Remark}
\setlength{\oddsidemargin}{.25in}
\setlength{\evensidemargin}{.25in}
\setlength{\textwidth}{6in}
\setlength{\topmargin}{-0.4in}
\setlength{\textheight}{8.5in}
\newcommand{\heading}[6]{
\renewcommand{\thepage}{\arabic{page}}
\noindent
\begin{center}
\framebox{
\vbox{
\hbox to 5.78in { \textbf{#2} \hfill #3 }
\vspace{4mm}
\hbox to 5.78in { {\Large \hfill #6 \hfill} }
\vspace{2mm}
\hbox to 5.78in { \textit{Instructor: #4 \hfill #5} }
}
}
\end{center}
\vspace*{4mm}
}
%
%\newenvironment{proof}{\noindent{\bf Proof:} \hspace*{1mm}}{
% \hspace*{\fill} $\Box$ }
%\newenvironment{proof_of}[1]{\noindent {\bf Proof of #1:}
% \hspace*{1mm}}{\hspace*{\fill} $\Box$ }
%\newenvironment{proof_claim}{\begin{quotation} \noindent}{
% \hspace*{\fill} $\diamond$ \end{quotation}}
\newcommand{\lecturenotes}[4]{\heading{#1}{6.S897 Algebra and Computation}{#2}{Madhu Sudan}{Scribe: #3}{#4}}
\newcommand{\lecturenum}{19} % lecture number
\newcommand{\lecturetitle}{Lecture 19 -- Computing Partial Derivatives and Depth Reduction} % topic of lecture
\newcommand{\lecturedate}{April 23, 2012} % date of lecture
\newcommand{\studentname}{Travis Hance} % full name of student (i.e., you)
\newcommand{\parder}[2]{\ensuremath{\frac{\partial #1}{\partial #2}}}
\newcommand{\G}{\mathcal{G}}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%555
\begin{document}
\lecturenotes{\lecturenum}{\lecturedate}{\studentname}{\lecturetitle}
\section{Overview}
In this lecture we cover:
\begin{itemize}
\item \cite{bs} If we can compute polynomial $f$ with an arithmetic circuit of size $s$, then we can compute all of its partial derivatives $(\parder{f}{x_1}, \dotsc, \parder{f}{x_n})$ in size $O(s)$.
\item \cite{vsbr} Depth reduction: If $f$ can be computed with a circuit of size $s$, then $f$ can be computed in size $\mathsf{poly}(s)$ and depth $O(\log^2 s)$.
\end{itemize}
\section{Clarification of arithmetic circuit model}
We might ask the question: how powerful are arithmetic circuits as a model of computation? We cannot compute everything using arithmetic circuits; for instance, we cannot compute square roots or compute the gcd of two polynomials, because whatever we compute must be a polynomial in the inputs.
However, we may wonder about something such as the determinant: it is a polynomial in its input variables, and there is a polynomial time algorithm (Gaussian elimination) to compute it. However, this algorithm involves branching: we have to test whether a value is nonzero in order to see if it can be used as a pivot. However, this is not an issue: if we just make a circuit to compute the determinant treating input variables as formal variables, then we do not need to use branching. In the end we will have a circuit computing $f/g$ which equals the determinant wherever it is defined. We already know from the last lecture that we can eliminate division and get a circuit for computing determinant.
This method works in general for computing polynomials with arithmetic circuits if you can compute them with branches of the form ``does $y_i = 0$?''.
Also note that there are known explicit circuits for computing the determinant by inductively computing the characteristic polynomial \cite{berkowitz}.
\section{Computing partial derivatives \cite{bs}}
The key idea is to use a ``top-down'' approach rather than a ``bottom-up'' approach. That is, instead of computing, for each gate, the partial derivatives of that gate with respect to the input variables, we compute the partial derivatives of the output with respect to gates.
Suppose that a polynomial $f(x_1,\dotsc,x_n)$ is computed by a circuit of size $s$, that is, a series of $s$ steps of the form $y_i = y_j \lozenge y_k$ (or $y_i = y_j \lozenge x_k$ or $y_i = x_j \lozenge x_k$) where $\lozenge \in \{+, \times\}$. Rather than thinking of the end result $y_s$ as a function of the input variables $x_1,\dotsc,x_n$, we imagine a function $g(x_1,\dotsc,x_n, y_1, \dotsc, y_s)$ which computes the end result that we can write in many different ways. More specifically,
\begin{itemize}
\item Let $g_s(x_1,\dotsc,x_n,y_1,\dotsc,y_s) = y_s$
\item For $1\le i\le s$, let $g_{i-1} = g_{i}|_{y_i \leftarrow y_j \lozenge y_k}$, that is, $g_{i-1}$ is obtained by substituting $y_j \lozenge y_k$ for $y_i$ in $g_{i}$, where in the circuit $y_i$ is computed as $y_j \lozenge y_k$. (By abuse of notation, we will always write $y_j$ and $y_k$ when in fact either could be one of the inputs $x_1,\dotsc,x_n$ instead.)
\end{itemize}
Thus for any $i$, $g_i$ is a polynomial in $x_1,\dotsc,x_n,y_1,\dotsc,y_{i}$, and so $g_0$ is the original polynomial $f$ in only the input variables. Thus our goal is to compute
\begin{equation*}
\left(\parder{g_0}{x_1}, \dotsc, \parder{g_0}{x_n}\right)
\end{equation*}
The idea is to compute $\{\parder{g_i}{y_j}(x_1,\dotsc,x_n,y_1,\dotsc,y_s)\}$ for all $i,j$. Note that while there are clearly more than $O(s)$ such derivatives, many of them will be equal and so we will only need $O(s)$ gates.
Since $g_s(x_1,\dotsc,x_n,y_1,\dotsc,y_s) = y_s$, we initially have $\partial g_s/\partial y_s = 1$, and $\parder{g_s}{y_j} = 0$ for all $j < s$.
Now consider $g_{i-1}$ for $1 \le i \le s$. Recall that $g_{i-1} = g_{i}|_{y_i \leftarrow y_j \lozenge y_k}$. Naturally, we have
\begin{equation} \label{derivative-y-i}
\parder{g_{i-1}}{y_i} = 0
\end{equation}
since $y_i$ does not appear anywhere in the polynomial $g_{i-1}$. Since we are just making a substitution $y_i \leftarrow y_j \lozenge y_k$, by the chain rule,
\begin{equation} \label{derivative-y-j}
\parder{g_{i-1}}{y_j} = \parder{g_i}{y_j} + \parder{g_i}{y_i}\cdot \left(\frac{\partial}{\partial y_j}(y_j \lozenge y_k)\right)
\end{equation}
where of course $(\partial / \partial y_j)(y_j + y_k) = 1$ and $(\partial / \partial y_j)(y_j \times y_k) = y_k$. We get a similar equation for $\partial g_{i-1}/\partial y_k$. Finally, for all other $y_\ell$ ($\ell \ne i,j,k$) we get
\begin{equation} \label{derivative-y-l}
\parder{g_{i-1}}{y_\ell} = \parder{g_i}{y_\ell}
\end{equation}
Remember that in identities \eqref{derivative-y-j} and \eqref{derivative-y-l}, the equality is not of formal polynomials in $x_1,\dotsc,x_n,y_1,\dotsc,y_s$; rather, $y_1,\dotsc,y_s$ should be considered as polynomials in $x_1,\dotsc,x_n$.
Thus, using \eqref{derivative-y-i}, \eqref{derivative-y-j}, and \eqref{derivative-y-l} we can compute all the derivatives $\partial g_{i-1} / \partial y_j$ for all $j$ from the partial derivatives $\partial g_{i} / \partial y_j$, using only a constant number of gates. Thus in $O(s)$ gates we can compute the partial derivatives $\partial g_0 / \partial x_j$ as desired.
\section{Depth Reduction \cite{vsbr}}
In this section we show the following result:
\begin{theorem}\label{depth-reduction}
Let $\phi$ be a circuit computing a polynomial $f$. If the size of $\phi$ is $s$ and $\deg f \le s$ then there is a circuit computing $f$ with size $\mathsf{poly}(s)$ and depth $O(\log^2(s))$. \end{theorem}
From the last lecture we know that there are small arithmetic circuits computing the homogeneous parts of $f$. Thus we assume that $f$ is homogeneous and that $\phi$ is a homogeneous circuit, i.e., the output of each gate is a homogeneous polynomial.
Now some notation: for any gate $v$, let $f_v$ be the polynomial in $x_1,\dotsc,x_n$ computed at gate $v$. Furthermore, if $v$ and $w$ are gates, we can think of $v$ as computing a polynomial from inputs $x_1,\dotsc,x_n,w$ by ``cutting off'' the gate $w$ and pretending its output is just another input to the circuit. Thus, we can write $f_v(x_1,\dotsc,x_n) = g_{v,w}(x_1,\dotsc,x_n, w)|_{w = f_w}$. We then define
\begin{equation}
\partial_w f_v = \left.\parder{g_{v,w}}{w}\right|_{w = f_w}
\end{equation}
This is the notion of a partial derivative of a gate with respect to another gate that we will use in this construction.
To compute $f$ using polylogarithmic depth, we will proceed in stages. At stage $i$ we will:
\begin{itemize}
\item Compute all polynomials $f_v$ such that $2^i < \deg f_v \le 2^{i+1}$.
\item Compute all $\partial_w f_v$ such that $2^i < \deg v - \deg w \le 2^{i+1}$ and $\deg w \le \deg v \le 2\deg w$.
\end{itemize}
Each stage will be done with $\mathsf{poly}(s)$ size and with a depth-$2$ (unbounded fan-in) circuit. Any unbounded fan-in gate can be replaced with a constant fan-in, logarithmic depth circuit.
To compute all these things, we rely on the following:
\begin{lemma}\label{depth-reduction-lemma}
Let
\begin{equation}
\G_m \stackrel{\text{\emph{def}}}{=} \{t : t = t_1 \times t_2, \deg(t) > m, \deg(t_1),\deg(t_2) \le m\}.
\end{equation}
Then,
\begin{itemize}
\item[(i)] For all gates $v$ with $m < \deg(f_v) \le 2m$, we have
\begin{equation}\label{lemma-first-statement}
f_v = \sum_{t \in \G_m} f_t \partial_t f_v
\end{equation}
\item[(ii)] For all gates $v,w$ with $\deg(w) \le m < \deg(v) \le 2\deg(w)$, we have
\begin{equation}
\partial_w f_v = \sum_{t \in \G_m} (\partial_w f_t) (\partial_t f_v)
\end{equation}
\end{itemize}
\end{lemma}
\begin{proof}
\begin{itemize}
\item[(i)] We use induction on the depth of $v$. The base case has $v \in \G_m$. For this, we want to show that
\begin{equation}
f_v = f_v \cdot \partial_v f_v + \sum_{t \in \G_m \backslash \{v\}} f_t \cdot \partial_t f_v
\end{equation}
but this is true because $\partial_v f_v = 1$ and, for any other $t \in \G_m$, there can be no path in $\phi$ from $t$ to $v$, so $\partial_t f_v = 0$.
For induction, suppose $v = v_1 \lozenge v_2$, where we already know the statement (\ref{lemma-first-statement}) holds for $v_1$ and $v_2$. In the `+' case, we have $f_v = f_{v_1} + f_{v_2}$ and so
\begin{align*}
f_v &= \sum_{t\in\G_m} f_t \partial_t f_{v_1} + \sum_{t\in\G_m}f_t \partial_t f_{v_2}\\
&= \sum_{t \in \G_m} f_t(\partial_t f_{v_1} + \partial_t f_{v_2})\\
&= \sum_{t\in\G_m} f_t(\partial_t f_v)
\end{align*}
For the `$\times$' case, assume without loss of generality that $\deg f_{v_1} \ge \deg f_{v_2}$. Then we must have $\deg(v_2) \le m$, and since every $t \in \G_m$ has $\deg(t) > m$, we get $\partial_t f_{v_2} = 0$. Thus, by the product rule,
\begin{equation}
\partial_t f_v = (\partial_t f_{v_1})(f_{v_2}) + (\partial_t f_{v_2})(f_{v_1}) = (\partial_t f_{v_1})(f_{v_2}).
\end{equation}
Hence, $f_v = f_{v_1}f_{v_2}$ so
\begin{align*}
f_v &= \left(\sum_{t \in \G_m} f_t \partial_t f_{v_1}\right) f_{v_2}\\
&= \sum_{t \in \G_m} f_t \cdot (f_{v_2} \partial_t f_{v_1})\\
&= \sum_{t \in \G_m} f_t \partial_t f_v
\end{align*}
which completes the induction step.
\item[(ii)] We simply take the derivative $\partial_w$ of both sides of the result from (i). Expanding with the product rule gives
\begin{equation}
\partial_w f_v = \sum_{t \in \G_m} \left( (\partial_w f_t) (\partial_t f_v) + (f_t)(\partial_w\partial_t f_v)\right)
\end{equation}
But $\partial_w \partial_t f_v$ is always $0$ since $\deg(v) \le 2\deg(w) < \deg(w) + \deg(t)$. Thus this simplifies to the desired result.
\end{itemize}
\end{proof}
Now, Lemma~\ref{depth-reduction-lemma} immediately lets us complete the construction outlined above, thus proving Theorem~\ref{depth-reduction}.
\bibliographystyle{alpha}
%\bibliography{lect18}
\begin{thebibliography}{9}
\bibitem{bs}
W. Baur, V. Strassen. \emph{The complexity of partial derivatives.} Theoretical Computer Science, Volume 22, Issue 3, February 1983, Pages 317-330, ISSN 0304-3975, 10.1016/0304-3975(83)90110-X.
\bibitem{berkowitz}
S. J. Berkowitz. \emph{On computing the determinant in small parallel time using a small number of processors.} Inf. Prod. Letters 18, pages 147-150, 1984.
\bibitem{vsbr}
L. G. Valiant, S. Skyum, S. Berkowitz, C. Rackoff. \emph{Fast parallel computation of polynomials using few processors.} SIAM J. Comput. 12(4), pp. 641-644, 1983.
\end{thebibliography}
\end{document}