\documentclass[11pt]{article}
\bibliographystyle{plain}
\usepackage{amssymb}
\usepackage{times}
\usepackage{color}
%\usepackage{doublespace}
\thispagestyle{empty}
\newcommand{\mse}{mean-square error }
%TPAMI-0029-0403
\newcommand{\hide}[1]{}
\newcommand{\out}[1]{}
\newcommand{\ui}{^{(i)}}
\newcommand{\us}{^{(s)}}
\newcommand{\beq}{\begin{equation}}
\newcommand{\eeq}{\end{equation}}
\newcommand{\beqa}{\begin{eqnarray}}
\newcommand{\eeqa}{\end{eqnarray}}
\newcommand{\ie} {{\it i.e., }}
\newcommand{\eg} {{\it e.g., }}
\newcommand{\cl}[1]{{{\cal{#1}}}}
\newcommand{\mr}[1]{{\mathrm{#1}}}
\newcommand{\mb}[1]{{\mathbf{#1}}}
%\newcommand{\changedStan}[1] {{\textcolor{red}{#1}}}
\newcommand{\changedStan}[1] {#1}
%\newcommand{\changedRom}[1] {{\textcolor{green}{#1}}}
\newcommand{\changedRom}[1] {#1}
\newcommand{\changedRev}[1] {{\textcolor{blue}{#1}}}
\newcommand{\mycaption}[3]{\renewcommand{\baselinestretch}{1}\caption[#1]{#2.}{#3}\renewcommand{\baselinestretch}{1.5}}
\newcommand{\mycaptionS}[1]{\renewcommand{\baselinestretch}{1}\caption[#1]{\small #1}\renewcommand{\baselinestretch}{1.5}}
\newcommand{\CapMViewc}{{Estimating same hand pose at $26$ viewpoints. The feedback function used was estimated from data. The figure has two sets of columns. Each column has the ground truth, MO, and best three MS samples. The viewpoint $(\beta_1,\beta_2)$ is indicated on the right side of each column}}
\newcommand{\CapMViewb}{{Example estimated hand poses at random viewpoints obtained using the MS algorithm. Feedback function was estimated from data. Columns 1-2 show the ground truth and the estimate using the MO algorithm, columns 3-7 show sample 1-4 and 12 obtained via the MS approach (S1 is the most probable sample)}}
\newcommand{\CapMViewcE}{{Estimating same hand pose at $26$ viewpoints. The feedback function used was the computer graphics rendering. The figure has two sets of columns. Each column has the ground truth, MO, and best three MS samples. The viewpoint $(\beta_1,\beta_2)$ is indicated on the right side of each column}}
\newcommand{\CapMViewbE}{{Example estimated hand poses at random viewpoints obtained using the MS approach. Feedback function was computer graphics rendering. Columns 1-2 show the ground truth and the estimate using the MO algorithm, columns 3-7 show sample 1-4 and 12 obtained via the MS approach (S1 is the most probable sample)}}
%-----
\newcommand{\CapTestI}{{40 examples of estimated hand poses chosen uniformly at random. Reconstruction found using the Mean Output (MO) approach. The feedback function used was estimated from data. Each example consists of a pair of images: ground-truth (top), and estimate obtained using the mean output algorithm (bottom)}}
\newcommand{\CapTestIE}{{40 examples of estimated hand poses chosen uniformly at random. Reconstruction found using the Mean Output (MO) approach. The feedback function was computed using computer graphics rendering. Each example consists of a pair of images: ground-truth (top), and estimate obtained using the mean output algorithm (bottom). For comparison, the frames are the same as those used when feedback was estimated from data}}
\newcommand{\CapRTestI}{{40 examples of estimated hand poses captured every 0.9 secs. from real video (RV). Reconstruction found using the Mean Output (MO) approach. The feedback function used was estimated from data. }} %Each example consists of a pair of images: ground-truth (top), and estimate obtained using the mean output algorithm (bottom)}}
\newcommand{\CapRTestIE}{{40 examples of estimated hand poses captured every 0.9 secs from real video (RV). Reconstruction found using the Mean Output (MO) approach. The feedback function was computed using computer graphics rendering}} %Each example consists of a pair of images: input video frame (top), and estimate obtained using the mean output algorithm (bottom). Note: for comparison frames are same as those used when feedback was estimated from data}}
\newcommand{\CapTestII}{{Example estimated hand poses obtained using the Multiple Sample (MS) approach. The feedback function was estimated from data. Columns 1-2 show the ground truth and the MO solution, columns 3-7 show sample 1-4 and 12 obtained via the MS approach (S1 is the most probable sample)}}
\newcommand{\CapTestIIE}{{Example estimated hand poses obtained using the Multiple Sample (MS) approach. The feedback function was computed using computer graphics rendering. Columns 1-2 show the ground truth and the MO solution, columns 3-7 show sample 1-4 and 12 obtained via the MS approach (S1 is the most probable sample)}}
\newcommand{\CapRTestII}{{Example estimated hand poses obtained using the Multiple Sample (MS) approach using real video (RV). The feedback function was estimated from data. Columns 1-2 show the input video frame and the MO solution, columns 3-7 show sorted samples (1-4 and 12) obtained via the MS approach where S1 is the most probable sample.}}
\newcommand{\CapRTestIIE}{{Example estimated hand poses obtained using the Multiple Sample (MS) approach and real video (RV). The feedback function was computed using computer graphics rendering. Columns 1-2 show the input video frame and the MO solution, columns 3-7 show sorted samples (1-4 and 12) obtained via the MS approach where S1 is the most probable sample.}} %%. Frames were chosen every 0.9 secs. Columns 1-2 show the input video frame and the MO solution, columns 3-7 show sample 1-4 and 12 obtained via the MS approach (S1 is the most probable sample)}}
\newcommand{\CapTestIIWE}{{Example estimated hand poses obtained using the Multiple Sample (MS) approach. The feedback function was computed using computer graphics rendering. Column 1 shows ground truth, columns 2-6 show sample 1-4 and 12 obtained via the MS approach (S1 is the most probable sample)}}
%%%% ----- Multiple
\newcommand{\CapMTestI}{{40 examples of estimated hand poses chosen uniformly at random and reconstruction found using Mean Output (MO) approach. The feedback function used was estimated from data. Each example consists of a pair of images: ground-truth (top), and estimate obtained using the mean output algorithm (bottom)}}
\newcommand{\CapMTestIE}{{40 examples of estimated hand poses chosen uniformly at random and reconstruction found using Mean Output (MO) approach. The feedback function was computed using computer graphics rendering. Each example consists of a pair of images: ground-truth (top), and estimate obtained using the mean output algorithm (bottom). Note: for comparison frames are same as those used when feedback was estimated from data}}
\newcommand{\CapMTestII}{{Example estimated hand poses obtained using the Multiple Sample (MS) approach. Views and poses were chosen uniformly at random. The feedback function was estimated from data. Columns 1-2 show the ground truth and the MO solution, columns 3-7 show sample 1-4 and 12 obtained via the MS approach (S1 is the most probable sample)}}
\newcommand{\CapMTestIIE}{{Example estimated hand poses obtained using the Multiple Sample (MS) approach. Views and poses were chosen uniformly at random. The feedback function was computed using computer graphics rendering. Columns 1-2 show the ground truth and the MO solution, columns 3-7 show sample 1-4 and 12 obtained via the MS approach (S1 is the most probable sample)}}
\newcommand{\CapRTestBodyII}{{Example estimated body poses obtained using the Multiple Sample (MS) approach using real video (RV). The feedback function was estimated from data. Frames were chosen every $\frac{2}{3}$ secs. Column 1 shows the input video frame, columns 2-6 show sample 1-4 and 12 obtained via the MS approach (S1 is the most probable sample)}}
\usepackage{psfig}
%\pssilent
%\renewcommand{\psfig}[1]{}
\renewcommand{\dbltopfraction}{1.0}
\renewcommand{\textfraction}{0.0}
\renewcommand{\topfraction}{1.0}
\renewcommand{\bottomfraction}{1.0}
\renewcommand{\baselinestretch}{1.5}
%\def\changed#1{{\bf{#1}}}
\def\changed#1{#1}
\psdraft
\newcommand{\comment}[1]{{\large\it #1}}
\setlength{\textwidth}{6.5in} \setlength{\textheight}{9.0in}
\setlength{\oddsidemargin}{0.0in} %\setlength{\topmargin}{-0.5in}
\begin{document}
\thispagestyle{empty}
%\title{A generative/discriminative framework for estimating articulated pose from a single image}
\title{{\vspace*{-1in}\normalsize{\tt Submitted 2004. Portions of this paper appeared in NIPS 14 and ICCV 01}}\\
{\sf\LARGE Combining generative and discriminative models in a framework for articulated
pose estimation}}
%~\\
%~\\
%~\\
%}
\author{\begin{tabular}{cc}
R\'{o}mer Rosales\footnote{R\'{o}mer Rosales is now at MIT Computer Science and Artificial Intelligence Laboratory, email romer@csail.mit.edu} & Stan Sclaroff\\
Probabilistic and Statistical Inference Group& Image and Video
Computing Group \\
Dept.\ of Electrical and Computer Engineering & Dept.\ of Computer
Science \\
University of Toronto & Boston University \\
Toronto, ON M5S 3G4 CANADA & Boston, MA 02215 USA\\
romer@psi.toronto.edu & sclaroff@cs.bu.edu
\end{tabular}}
\date{~}
%\date{Version of \today}
\renewcommand{\baselinestretch}{1.}
\maketitle \thispagestyle{empty}
\renewcommand{\baselinestretch}{1.5}
%without the need for manual initialization, non-linear optimization.
% the camera perspective and
%orthogonal projection models.
%% \begin{abstract}
%% A probabilistic, nonlinear supervised learning model is proposed:
%% the Specialized Mappings Architecture (SMA). The SMA employs a
%% set of several mapping functions that are estimated automatically
%% from training data. Each specialized function maps certain domains
%% of the input space (e.g., image features) onto the output space
%% (e.g., articulated body parameters). One important advantage of
%% the SMA is that it can model ambiguous, one-to-many mappings that
%% may yield multiple valid output hypotheses. Once learned, the
%% mapping functions generate a set of output hypotheses for a given
%% input via a statistical inference procedure. The SMA inference
%% procedure incorporates an inverse mapping or feedback function,
%% which enables the SMA to evaluate the likelihood of each
%% hypothesis. Possible feedback functions include computer graphics
%% rendering routines that can generate images for given hypotheses.
%% The SMA employs a variant of the Expectation-Maximization
%% algorithm for simultaneous learning of the specialized domains
%% along with the mapping functions, and approximate strategies for
%% inference. The framework is demonstrated in a computer vision
%% system that can estimate the articulated pose parameters of a
%% human body or human hands, given image silhouettes. The accuracy
%% and stability of the SMA are also tested using synthetic images of
%% human bodies and hands, where ground truth is known.
%% \end{abstract}
%which allowed us to derive inference
%methods based on the possibility of alternatively use different sets
%of conditional independence assumptions specified by the forward and
%inverse models. The inverse function
\begin{abstract}
A framework is presented that enables reliable estimation of the
articulated pose of the human body or human hand, given a single
image. Pose estimation is formulated as a statistical inference
problem, where the goal is to find a posterior probability
distribution over poses. The framework combines two modeling
approaches, one discriminative and the other generative. The
discriminative model consists of a set of several mapping functions
that are estimated automatically from a labeled training set of body
poses and their respective image features. A key advantage of the
discriminative formulation is that it can model ambiguous, one-to-many
mappings that may yield multiple valid articulated pose hypotheses.
The generative model is defined in terms of a computer graphics
rendering of poses. While this generative model offers an accurate
way to relate observed (image features) and hidden (body pose) random
variables, it is difficult to use it directly in pose estimation,
since inference is intractable. In contrast, inference with the
discriminative model is tractable, but considerably less accurate for
the problem of interest. A combined discriminative/generative
formulation is derived that leverages the complementary strengths of
both models in a principled framework for articulated pose
inference. Two efficient pose inference algorithms are derived from
this formulation; the first is deterministic and the second
non-deterministic. Performance of the framework is quantitatively
evaluated in estimating articulated pose of both the human hand and
human body.
\hide{We consider the problem of 3D and 2D articulated body pose
estimation/inference from visual features obtained from a single
image. 3D pose estimation (\eg MAP estimation) is generally considered
ill-posed since one cannot fully recover the body pose due that
information is lost after projection to the image plane. Here we
formulate this problem as a statistical inference problem, where the
goal is to find a posterior probability distribution over poses given
features from a single image. Statistical learning can be accomplished
by using labeled training data of body poses and their respective
image features.
Generative models offer a principled way of accounting for hidden
random variables (body pose). However, despite the fact that we can
define an accurate generative model for this problem, inference is
intractable because of the complex non-linear generative process. On
the other hand, we can introduce discriminative models where inference
is tractable. Unfortunately, these models are considerably less
accurate for the problem of interest, since it not clear how to build
appropriate discriminative models for the problem at hand (\ie a
probability distribution that captures the structure of the
problem). These two viewpoints are complementary and thus an ideal
approach should exploit their individual advantages to make inference
both accurate and feasible.
We provide a natural and principled way to combine these models. In
our approach, a discriminative model is learned from training
data. Unlike simply using it to define a posterior distribution over
body poses, the discriminative model is combined with the generative
model to better approximate the intractable posterior distribution
implied by the generative model. We offer theoretical justification
for the resulting inference algorithm and also, provide two algorithms
for MAP estimation that are efficient and have clear advantages over
standard body tracking methods. Performance is thoroughly evaluated
using synthetic and real visual data in the tasks of estimating hand
and human body pose. We show that, even though one image seems
insufficient to recover body pose, our method clearly provides
accurate estimates using a very fast algorithm.}
%To do:
%Refer to Zhu, and others, related in the methodology.
%Check the results and address 'bad' results comments
%Check other comments
%Fix abstract
%%Check the KL stuff and later paragraphs in the critical section
%Pedro Felzenswalb
\hide{
Stan: There are two things I still need to address
(1)add some related work, reviewers wanted us to add Zhu,Felzenswalb,Jojic-Frey...
(2)Address the 'bad' results comments
I am running some experiments to get the absolute joint error per joint instead of RMSE (divided by the # DOF) as I did for the previous paper. I realized that RMSE indeed may be a bad way to show the results. I am also going to say what the random performance is (to avoid the comments like, 'some joints are estimated at around chance')
For a final version, we need to
(*) Spell check, delete 'regions' in figures
(**) Take out 5 pages (any suggestions)
The largest change were Sec. 5.1 -- 5.4, they were practically re-written.
}
\hide{
We provide a theoretical justification for this.... [Jaak 98]
Incorporating.... [J98]
A probabilistic, nonlinear supervised learning model is proposed: the
Specialized Mappings Architecture (SMA). The SMA employs a set of
several forward mapping functions that are estimated automatically
from training data. Each specialized function maps certain domains of
the input space (e.g., image features) onto the output space (e.g.,
articulated body parameters). The SMA can model ambiguous, one-to-many
mappings that may yield multiple valid output hypotheses. Once
learned, the mapping functions generate a set of output hypotheses for
a given input via a statistical inference procedure. The SMA inference
procedure incorporates an inverse mapping or feedback function in
evaluating the likelihood of each of the hypothesis. Possible feedback
functions include computer graphics rendering routines that can
generate images for given hypotheses. The SMA employs a variant of
the Expectation-Maximization algorithm for simultaneous learning of
the specialized domains along with the mapping functions, and
approximate strategies for inference. The framework is demonstrated in
a computer vision system that can estimate the articulated pose
parameters of a human's body or hands, given silhouettes from a single
image. The accuracy and stability of the SMA are also tested using
synthetic images of human bodies and hands, where ground truth is
known.}
%\hide{In the SMA
%formulation it is possible to use different sets of conditional
%independence assumptions in the forward and inverse models if
%desired.}
%In both
%tests, excellent performance is attained.
%SSChanged: Commented out last sentence...
%% It's just begging for abuse from the reviewers.
%% Let the reader be the judge please.
%% RR:OK
%% Reworded the abstract a little. The abstract already says what's important.
%% One need not say things like "An important aspect of the approach...." etc.
%% RR: After the clarification at the beginning of my email, you'll see that this is not correct:
% 'In the SMA formulation
%it is possible to use different sets of conditional independence
%assumptions in the forward and inverse models if desired.'
% That's why I prefer:
%'It incorporates an inverse
%mapping or feedback function, which allowed us to derive inference
%methods based on the possibility of alternatively use different sets
%of conditional independence assumptions specified by the forward and
%inverse models. The inverse function enables the SMA to evaluate the
%likelihood of each of the hypothesis.'
%% The knowledge of the inverse function allowed us to use both sets of CIA's (at the same time for interence).
%% Otherwise SMA would have been like most ML methods
\end{abstract}
\paragraph{Keywords:} Human body pose, hand pose, nonrigid and articulated pose
estimation, statistical inference, generative and discriminative models,
mixture models, Expectation Maximization algorithm.
%\paragraph{Keywords:} Human Body Pose, estimation of articulated structure, supervised learning, combination of generative and discriminative models, statistical inference, Expectation Maximization algorithm, hand shape.
\newpage
%%%%%%%%%%%\renewcommand{\psfig}[1]{}
\section{Introduction}
\hide{ An essential task for vision systems is to infer the state of
the world given some form of visual observations. From a computational
perspective, this often involves facing an ill-posed problem; for
example, relevant information may be lost via projection of the
three-dimensional world into a two-dimensional image. As a result, it
is often the case that multiple valid interpretations of an image are
possible. Solving an ill-posed problem requires some form of
additional information, usually provided as a model of the underlying
process. Interestingly, in their day to day life, humans are
surprisingly adept at interpreting the visual world. }
\changedStan{An essential task for vision systems is to infer the
state of the world given some form of visual observations. From a
computational perspective, this typically involves facing an
ill-posed problem; relevant information is lost via projection of
the three-dimensional world into a two-dimensional image. In this
paper, the focus is on inferring the pose of an articulated object
in an image, in particular the pose of a human body or human hand.
Humans can often solve such pose inference problems, even when
given only a relatively poor-quality, low-resolution, monocular
image. It is believed that humans employ extensive prior knowledge
about human body structure and motion in solving this ill-posed
task \cite{Johansson73}. In this paper, we consider how a computer
vision system might learn such knowledge in the form of
probabilistic models, and how to employ such models in an
algorithm for reliable pose inference.}
%% R_Nov_Change: Took out this paragraph
%% Let us consider an example body pose inference task: given only a
%% person's silhouette, estimate that person's articulated body pose. To
%% be concrete, let us define articulated pose in terms of: (a) the 2D
%% locations of the person's joints in the image, or (b) the 3D locations
%% of the person's joints in Euclidean space. Imagine drawing marks on
%% the silhouette image that approximately label the joints: left elbow,
%% right elbow, left knee, right knee, and so on. Also consider a
%% plausible 3D pose interpretation for this silhouette. While this
%% inference task seems relatively simple for a human to perform, the
%% task is quite challenging, using either representation (a) or (b), for
%% current computer vision systems.
%RRChange ... using either representat
%SS: OK
% An example image is shown in Fig.\ \ref{fig:exampleTask}.
\changedStan{For purposes of computation, the inference task can be
defined as follows: given an observation vector $\mb{x}\in \Re^c$ that
was extracted from an image of a person, infer the parameterized
articulated pose as a vector $\mb{h} \in \Re^t$. The vector spaces
${\Re}^c$ and ${\Re}^t$ are continuous. In a generic machine learning
framework, pose inference might be regarded as a function
$\varphi:{\Re}^c \rightarrow{\Re}^t$ that maps an input vector of
visual observations to an output vector describing the most likely
articulated pose. More generally, the mapping function could produce a
posterior probability distribution, $\varphi:{\Re}^c \rightarrow {\cl
P}$, where ${\cl P}$ is a family of probability density functions on
$\Re^t$.} \hide{There are many different aspects in this problem. Some
of them have been the focus of a lot of attention in statistical
learning. They are by no means solved except for certain basic
instances (\eg see \cite{Pearl88}). For many real-world problems this
is usually not the case.} \changedStan{A number of general questions
arise. What type or form should the mapping function $\varphi$ take?
How can the mapping function be estimated from training data? How can
the approach incorporate prior knowledge about the problem structure?
How can inference be performed efficiently or approximately if exact
inference is intractable? These questions are fundamental and common
in statistical learning, and only in certain basic problem instances
\cite{Pearl88} are the answers immediately clear.}
%%RRChange ... a single pose ...
%%SS: OK
\changedRom{Several perspectives or viewpoints could be taken to
approach learning tasks. However, often it is not clear which seems
more suitable for the problem at hand. It will be useful for the
purpose of this paper to distinguish two major viewpoints: the
generative and the discriminative viewpoints (\eg see
\cite{NG01DisGen}). In the case of learning generative models, a joint
distribution $p(\mb{x},\mb{h})$ over the random variables of the model
(here simply $\mb{h}$ and $\mb{x}$) is estimated from data. Then,
given an observation, \eg $\mb{x}$, a posterior probability
$p(\mb{h}|\mb{x})$ over the unobserved random variables could, in
theory\footnote{However, in practice this task can be intractable or
lack analytic solutions. This is an important open problem in
statistics.}, be calculated by invoking Bayes' rule. In contrast,
using discriminative models the posterior distribution
$q(\mb{h}|\mb{x})$ is directly learned or estimated (see \eg
\cite{McLachlan92,Rubinstein97,NG01DisGen} for further comparisons
between these viewpoints). In this paper, we favor the idea that for
pose estimation, the advantages of each of these viewpoints could be
exploited in a single framework.}
If we try to learn a mapping directly, let us say by estimating the
parameters of a parameterized function $\phi:{\Re}^c
\rightarrow{\Re}^t$ as in a discriminative approach, we encounter
several problems. The form required for $\phi$ may not be simple,
because the mapping from observations to articulated poses is
generally ambiguous (one-to-many), and therefore no single function
can perform this mapping. An example is illustrated in
Fig.~\ref{fig:mappingAmbiguity}. The arm locations cannot be uniquely
inferred given the silhouette $\mb{x}$ and therefore,
$\mb{a}$--$\mb{h}$ are all plausible pose configurations. The hands
and arms can move in such a way that the silhouette does not
change. Note also that pose $\mb{c}$ is the reflection of $\mb{a}$:
the camera looks at the back rather than at the front of the
body. There may be different regions in ${\Re}^t$ that correspond to
ranges of valid poses, and these regions may not be connected, e.g.,
some viewed from the front and others from behind.
\psfigurepath{./figs}
\begin{figure}[t]
\centerline{\psfig{figure=FixedIntro.prn,angle=270,width=3in,clip=t}}
%\psfig{figure=FixedIntro.prn,angle=270,width=3in,clip=t}}
\mycaptionS{\small Example ambiguity in mapping body silhouette
cues in ${\Re}^c$ to articulated body poses in ${\Re}^t$. Given
silhouette $\mb{x}$, poses $\mb{a}$--$\mb{h}$ are all valid
hypotheses. In general, entire regions in ${\Re}^t$ may contain
valid poses. \label{fig:mappingAmbiguity}}
\end{figure}
%% Since you say that you can't use a single function, the next statement is not needed.
\hide{Even though one may be tempted to just increase the
complexity of the function $\phi$ and consider this choice as
necessary (due to the apparent intricacy of the problem at hand)
\footnote{Moreover, unnecessarily increasing the complexity of
$\phi$ can have other awful consequences such as overfitting.}, a
fundamental idea in this paper is that this choice may not be
necessary, as will be seen next.}
%Complexity, and doesn't include knowledge
%%RRChange ...In fact
%%SS: OK
Let us now consider the inverse problem: given an articulated pose
vector $\mb{a}$, generate its silhouette $\mb{x}$. \changedStan{With
a good computer graphics model of the human body, one can easily
render the silhouette $\mb{x}$. Thus, we can define a very accurate
generative model of what we refer to as the inverse mapping of pose
parameters to image features $\zeta:{\Re}^t\rightarrow{\Re}^c$. This
is a key part of our problem definition and it will play an important
role in developing the framework presented in this paper. \changedRev
{Note that this generative process is not perfectly one-to-one, even
given camera parameters, because of noise, clothing, anthropometric
variations, etc., but approximately so, at least from a mathematical
convenience perspective}. In fact, the inverse mapping $\zeta$
provides very useful information about the structure of the problem;
however, it cannot be incorporated straightforwardly in a
discriminative approach. Despite the simplicity of $\zeta$, its
inverse may still be complex or not even exist.}
\changedStan{In summary, the one-to-many nature of the problem of
mapping image features to body poses precludes the use of
discriminative supervised learning methods \changedRom{that fit a single (or
finite number of) functions to the data}, e.g., most neural networks,
support vector machines, simple least-squares, boosting, etc. On the
other hand, we have access to the {\it inverse}
$\zeta:{\Re}^t\rightarrow{\Re}^c$ that given a body pose can produce
the corresponding image features, which can be used to define a very
accurate generative model.
%% ROMER (from Stan): This next sentence is not supported yet.
However, as will be shown later, this accurate generative model is
challenging to use directly in body pose inference. The view taken in this
paper is that it can be effective to use the individual advantages of these
two complementary approaches (discriminative and generative) to formulate
an efficient solution to the pose inference/learning problem.}
\changedRev{The paper is structured as follows. Sec.~2 presents the
related work and how our work fits and differs from existing
approaches for pose estimation. Sec.~3 starts by proposing
independent discriminative and generative models for the problem at
hand without explicitly creating a connection between them. Sec.~4
assumes that these two models are given and introduces
inference. First, inference is presented for each model separately and
its shortcomings discussed, then a method that combines both models is
introduced. Sec.~4.3 presents the foundations, while Sec.~4.4 and 4.5
concentrate on algorithms. Sec.~5 shows how to learn the models
proposed. Sec.~6 presents the applications considered and Sec.~7 shows
the results of the experimental evaluation. Sec.~8 discusses further
issues and provides some concluding remarks.}
%% This access to the {\it inverse} map, as well as the
%% one-to-many forward ambiguity are two of the key characteristics
%% of our problem that make it different from other supervised
%% learning problems. The core algorithmic challenges are: 1.)
%% estimating the specialized domains and functions in an optimal way
%% that also takes into account the form of the specialized
%% functions, and 2.) using the knowledge of the inverse function to
%% formulate efficient inference and learning algorithms.
%%RRChange [added full paragraph]
%%SS: I removed this stuff before, because it's redundant.
%% I remove it again.
%% The first sentence is simply a restatement of the paragraph
%% before it. And the last sentence of paragraph before that.
%% It simply won't fit.
%RR: Can we somehow state that 'these two are the main characteristics of the problem we are trying to solve which make it different from other supervised learning problems'. It will emphasize that we are doing something different. I think it is important.
%RR:
%This paragraph also states clearly what are the fundamental problems (why it is difficult).
%I think this paragraph is a great summary of the whole machine learning part of the paper. It should be kept somehow.
%% SS: OK, OK. It's your thesis after all :)
%% But as I point out in my email this paragraph is mostly redundant.
%RRChange .... More importantly w
%%SS: is it really more important than other stuff already in this paragraph?
%% Also, I don't know what you mean at all. It's machine learning after all.
%% you have to pick a functional form. If you feel it's important, can you
%% try to explain it in email? It's not really clear what you mean here
%% at all. I would prefer you leave it out.
%%RR:OK
%\psfigurepath{../ICCV01/iccv01/figs}
\begin{figure}[t]
\centerline{\psfig{figure=FigMapsEP.prn,width=5in,clip=t,angle=270}}
%\centerline{
%%(a) \psfig{figure=Learning2.ps,width=0.48\textwidth,clip=t}
%%~(b) \psfig{figure=Inference2.ps,width=0.46\textwidth,clip=t}
%%(a) \psfig{figure=map.GIF.eps,width=0.4325\textwidth,clip=t}
%(a) \psfig{figure=mapFixed.prn,width=0.4325\textwidth,clip=t}
%~~~(b) \psfig{figure=fb.GIF.eps,width=0.414\textwidth,clip=t}
%}
%\begin{figure}[t]
%\vspace*{2.5in}
\label{fig:SMAexample} \mycaptionS{\small
Simplified schematic illustration behind our method for the case of
inferring body pose: (a) Given an input vector $\mb{x}$, we generate a
set of hypotheses. (b) The inverse mapping function $\zeta$ is
employed in evaluating each hypothesis.}
\end{figure}
\hide{
\changed{
The basic concepts are illustrated in Fig.\ \ref{fig:SMAexample}. For
a given input $\mb{x}$, the discriminative model generate a set of output
hypotheses. We then exploit the generative model (defined by the
inverse mapping $\zeta$) to evaluate the probability of each
hypothesis.}}
%%SS: adjusted the size (smaller)
%%% RomerV5: took out following paragraph
\hide{An important advantage of this approach is that it can model
ambiguous, one-to-many mappings that may yield multiple valid output
hypotheses. Unlike other learning approaches that employ a set of
mapping functions (\eg \cite{Friedman91,Hinton98,Jordan94}), this
approach incorporates an inverse mapping $\zeta$ in probabilistic
inference. The framework is evaluated in a computer vision system that
can estimate the articulated pose parameters of a human body or human
hands, given real image silhouettes. Accuracy and stability are also
tested using synthetic images of human bodies and hands, where ground
truth is known.}
%% %% For related work
%% Several other learning models use a similar concept of fitting
%% surfaces to the observed data by splitting the input space into
%% several regions and approximating simpler functions in these regions
%% (\eg \cite{Jordan94,Hinton98,Friedman91}). However, in these
%% approaches, the inverse map is not incorporated in the estimation
%% algorithm because it is not considered in the problem definition and
%% it is necessary to make the forward model more complex.
%% 1111111111111
\section{Related Work}
\label{sec:RelWork}
In computer vision, recovery of articulated body pose from images is
often formulated as a {\it tracking} problem. Usually, link-joint
models comprised of 2D or 3D geometric primitives are designed
beforehand to roughly match the specific morphology of the target in
question
\cite{Bregler98,Deutscher00,Gavrila95,OrmSidBlaHas01,Rehg95,shimada,Felzenszwalb00,Sminchisescu01}.
Mesh models have also been used as an alternative to link-joint models
\cite{heap}. At each frame, these geometric models are fitted to the
image to minimize some cost function that favors the overlap of the
model and associated image regions (or motion). \changedRom{Although
usually not stated, the fitting or cost function in many cases
implicitly defines (or can be used to define) a generative model of
the observed image}. Despite their descriptive power, this family of
approaches has a number of critical drawbacks. Generally, a non-linear
optimization problem must be solved at every frame (sometimes
equivalent to inference in a complex generative model). Careful manual
placement of the model on the first frame in a video sequence is also
required. Moreover, tracking in subsequent frames tends to be
sensitive to errors in initialization and numerical drift; as a
result, these systems cannot recover from tracking errors in the
middle of a sequence.
To address these weaknesses, specialized dynamical models have
been proposed \cite{Isard98J,OrmSidBlaHas01,PavRehMac01}. These
methods learn a prior distribution over some specific motion
class, such as walking. This prior is used to predict and
hopefully improve the pose estimates in future frames. However,
this strong prior substantially limits the generality of the
motions that can be tracked; a prior for a given class of motions
is generally useless when used for tracking objects undergoing a
different class of motion, e.g., walking vs. dancing.
Other methods for constrained tracking include
\cite{Black95}, where a subspace of allowable motions is
learned from a set of examples. These examples and the model
(usually linear) are hoped to be sufficient to span the set of
possible motions to be seen during tracking. Thus, pose inference
involves finding a linear projection of the observed data onto the
motion subspace. This subspace approach enforces a strong prior;
as mentioned previously, this limits the generalization of the
model to classes of motions not seen in the training set.
Furthermore, articulated motion is generally non-linear, and
cannot be easily explained as a linear projection.
In our approach we avoid matching image features (e.g., image
regions, points, or articulated models) from frame to frame.
Therefore, we do not refer to our approach as {\it tracking}, per
se. This is in direct contrast with the techniques mentioned
above. A number of other approaches also depart from the
aforementioned tracking paradigm. We summarize these next.
In \cite{Howe99} a statistical approach is employed in
reconstructing the 3D motions of a human figure. The approach
employs a Gaussian probability model for short human motion
sequences. It is assumed that 2D tracking of the joint positions
in the image is given; therefore, this assumption implicitly
incurs the restrictions found in all tracking approaches.
In \cite{Perona00} dynamic programming is used to calculate the best
global matching of image points to predefined body joints, given a
learned probability density function of the position and velocity of
body features. Although not explicitly mentioned by the authors, the
probability function is defined by a triangulated acyclic graph. Thus,
inference is feasible due to the running intersection property
\cite{Jordan99,Pearl88}. Still, in this approach, the image points
and model initialization must be provided by hand or through some
other method.
In \cite{Brand99}, the manifold of human body dynamics is modeled
via a hidden Markov model with an entropic prior. Once the states
are inferred from observations, a quadratic cost function is used
to generate a continuous path in configuration space, \ie body
pose space.
In all of the non-tracking approaches just referred to, models of {\em
motion} were estimated from data. Although the approach presented in
this paper can be used to model dynamics, we argue that when general
human motion dynamics are to be learned, the amount of training data,
model complexity, and computational resources required are
impractical. As a consequence, models with unacceptably large priors
towards specific motions are generated. Although by not modeling the
dynamics we may be ignoring information that could be used to further
constrain the inference process, there are some benefits. For
instance, a model for inferring body pose that does not consider
dynamics provides invariance with respect to speed (\ie sampling
differences) and direction in which motions are performed. This
happens simply because this model treats configurations as temporally
independent of each other. Other approaches that use a single image
include \cite{Kakadiaris00,Haritaoglu98a,Taylor00}; however, most of
these methods also require that projected joint locations be given as
input. In our approach this is not necessary.
%Lee85 erased orourke80 out
Our approach can be thought of as mapping visual features to likely body
configurations. Following a machine learning paradigm, stochastic
functions that map visual features to pose parameters are approximated
from training data. A unique aspect of our approach is the combined
use of (1) these mapping functions (defining a discriminative model)
with (2) the inverse mapping function $\zeta$ (defining a generative
model). Generally speaking, after multiple poses have been inferred
from just the visual cues, $\zeta$ transforms these pose
configurations back to the visual cue (observation) space. In this
space, we can then automatically {\it choose} among a set of
reconstruction hypotheses. This is a fully probabilistic inference
process. Our approach avoids the need for manual initialization or
tracking; it thereby avoids the consequent disadvantages of
tracking. Remarkably, relatively few computations are required for
inference. We will now formalize and explain our approach in detail.
%RRChange .This is a ...
%% SS: OK
\renewcommand\arraystretch{0.8} %% SS: This changes separation between table rows
\begin{table}[t] {\small
\begin{tabular}{|ll|}
\hline
number of training examples & $N$\\
training set & $\cl{Z}=\{\mb{z}_1,...,\mb{z}_N\}$ \\
training example (input,output) pair & $\mb{z}_i = (\upsilon_i,\psi_i)$ \\
input (feature) training vector & $\upsilon_i \in \Re^c$ \\
output (pose) training vector & $\psi_i \in \Re^t$\\
\hline
generative and discriminative models probability distributions & $p$,$q$ (respectively)\\
observation random variable (\eg image moments) & $\mb{x} \in {\Re}^c$\\
hidden random variable of pose parameters & $\mb{h} \in{\Re}^t $\\
feedback (rendering) function (for generative model)&$\zeta:{\Re}^t\rightarrow{\Re}^c$\\
\hline
number of samples during inference& $S$\\
a particular observation or input image feature & $\mb{x}^*$\\
output (pose) hypothesis ( a sample from $q(\mb{h}|\mb{x}^*)$)& $\mb{h}_k$\\
estimate of most likely output hypothesis & $\hat{\mb{h}}$\\
\hline
%Mapping functions (one for each mixture distribution component)& $\Phi = \{\phi_1,\dots,\phi_M\}$\\
discrete set of labels for mixture components&${\cal C}=\{1,\dots,M\}$\\
hidden random variables assigning mixture component to training samples & $\mb{y}=(y_1,\dots,y_N), y_i\in{\cal C}$ \\
prior probability that mixture component $k$ will be used & $\lambda_k = Q(y=k)$\\
mapping function parameter vector & $\theta_k$\\
discriminative model parameters (to be learned) & $\theta=(\theta_1,\dots,\theta_M,\lambda_1,\dots,\lambda_M)$\\
posterior probability of $k$-th mixture component for $\mb{z}_i$ during EM& $\tilde{Q}(y_i=k)=Q(y_i=k|\psi_i,\upsilon_i,\theta)$ \\
\hline
\end{tabular}}
\mycaptionS{Some mathematical symbols used in this paper.} \label{tab:symbols}
\end{table}
\section{Probabilistic Models}
\label{sec:ProMod}
We propose a probabilistic, nonlinear framework for combining generative
and discriminative models for articulated pose estimation. The framework
employs a set of $M$ functions $\phi_k:{\Re}^c \rightarrow{\Re}^t$, each
associated to a mixture component in a mixture distribution. Each function
maps certain sub-domains of the input space (cues) onto the output space
(poses). These functions are estimated automatically from training data
via a variant of the Expectation-Maximization algorithm. The learned
conditional distribution is then used as an approximation to an accurate
(generative model) distribution defined using the inverse function $\zeta$,
for which inference is intractable. This basic idea is shown in a schematic
way in Fig.~\ref{fig:SMAexample}. The approximation is employed in a
similar way as a proposal distribution is used to approximate sampling from
a more complex distribution.
We begin by formally defining both the discriminative and generative models
to be employed. The discriminative model will be estimated from training
data and the generative model will be defined by a rendering function
$\zeta$. These models represent two views of the same problem and will be
used together in our framework.
\subsection{The Discriminative Model}
\hide{
In our approach, the discriminative model is represented by a set
of mapping functions. These functions are estimated from training data,
via a supervised learning procedure. }
Let $\cl{Z}=\{\mb{z}_1,...,\mb{z}_N\}$ be an observed training set of
input-output pairs $\mb{z}_i = (\upsilon_i,\psi_i)$. Each $\upsilon_i
\in \Re^c$ is an input (feature) vector, and each $\psi_i \in \Re^t$
is its corresponding output (pose) vector. A summary of mathematical
symbols used in this formulation is provided in Table
\ref{tab:symbols}.
We will approach our discriminative problem as one of hidden variable
density estimation. We begin by introducing the unobserved random
variable $\mb{y}=(y_1,\dots,y_N)$. In our model any $y_i$ has as its
domain the discrete set $\cl{C}=\{1,\dots,M\}$ of labels for the
specialized mapping functions, and can be thought of as the function
number used to map the $i$-th training pair, $\mb{z}_i$. Thus $M$ is
the number of specialized mapping functions. Our model uses parameters
$\theta=(\theta_1,\dots,\theta_M,\lambda_1,\dots,\lambda_M)$, where
$\theta_k$ represents the parameters of the $k$-th mapping function,
and %$\lambda=(\lambda_1,\dots,\lambda_M)$, where $\lambda_k=Q(y=k)$
is the prior probability that the $k$-th mapping function will be used
to map an input-output pair.
Using a maximum-likelihood criterion, we are interested in finding the
optimal parameter settings for our model; thus, we seek to maximize
the joint log-probability:
%\footnote{This is almost
%identical to taking a MAP estimate viewpoint and considering the
%parameters $\theta$ as random variables with uniform prior in some
%(bounded) interval}
\begin{equation}
\label{eq:LeaDisMod}
\theta^* = \arg\max_\theta \log q(\cl{Z}|\theta).
\end{equation}
Assuming independence of observations given $\theta$, we obtain:
\begin{eqnarray}
\theta^*&=&\arg\max_\theta \sum_i \log q(\mb{z}_i|\theta)\\
%&=&\arg\max_\theta \sum_i \log \sum_k q(\mb{z}_i|y_i=k,\theta)
%Q(y_i=k|\theta)\\ \label{eq:OptEq}
&=& \arg\max_\theta \sum_i \log
\sum_k q(\psi_i|\upsilon_i,y_i=k,\theta)Q(y_i=k|\theta)
q(\upsilon_i), \label{eq:LogSum}
\end{eqnarray}
where we used the independence assumption
$q(\upsilon|\theta)=q(\upsilon)$. \changed{The term $q(\upsilon_i)$
describes how input patterns occur. For solving Eq.~\ref{eq:LogSum},
this term is approximated by the empirical distribution implied by our
training data; as a consequence, patterns that occur more often will
have a larger effect in the maximization of Eq.~\ref{eq:LogSum}.} Due
to the sum of terms inside the logarithm of Eq.~\ref{eq:LogSum}, this
optimization is generally intractable. However, a variety of
practical approximate optimization methods exist, for example, methods
that are based on alternating minimizations
\cite{Csiszar84}. Expectation Maximization (EM)
\cite{Dempster77,Neal98} updates are described in Sec.\ \ref{sec:Lea}.
\subsubsection{Choice of a Likelihood Function}
Note that the above formulation is general. In particular, the form of
the probability $q(\psi|\upsilon,y,\theta)$ was not specified.
A key question in instantiating our approach is: what form should be
used for $q(\psi|\upsilon,y,\theta)$? This is the probability that
output $\psi$ was generated by the mapping function $y$, given the
input $\upsilon$ and model parameters $\theta$. In this work we
analyze the following possible cases:
\begin{enumerate}
\item A Gaussian joint distribution of input-output vectors:
%\begin{equation}
$q(\upsilon,\psi|y,\theta)=\cl{N}((\upsilon,\psi);\mu_{y},\Sigma_{y})$.
%\end{equation}
\item A Gaussian distribution, whose mean is the output of the
$y$-th mapping function:
%\begin{equation}
$ q(\psi|\upsilon,y,\theta)
=\cl{N}(\psi;\phi_{y}(\upsilon,\theta),\Sigma_{y})$.
%\end{equation}
\end{enumerate}
\out{
One way to interpret (2) is that the error in estimating $\psi$,
given we know what mapping function to use, is Gaussian
distributed. %The distribution's mean is the output of the
%specialized function, and its covariance is dependent on the
%specialized function used.
These are the two forms tested in our experiments; however, this formulation is general, and can accept other forms for the
likelihood function.}
\subsection{The Generative Model}
Our approach also involves the use of a generative model of images (or
image features). In the problem of human body pose estimation from a
single image this generative model can be defined in a simple way. We
will assume that an image or image features are generated by sampling
a pose from a prior distribution $p(\mb{h})$ and an image is then
generated using the rendering function $\zeta$ such that: \beqa
\label{eq:zetaNormalDist}
%\label{eq:gmtd}
p(\mb{x}|\mb{h})={\cl N}(\mb{x};\zeta(\mb{h}),\Sigma_\zeta). \eeqa It
is important to notice that despite the fact that the generative model
can be defined in a simple manner, the function $\zeta$ is of a
complex form. In our case, this makes probabilistic inference
intractable as will be further explained later.
\hide{In establishing a connection to previous methods, this
inference problem is usually referred to as {\it tracking}. Fitting an
articulated model (\eg composed of solid primitives) is equivalent to
a form of probabilistic inference with several important, well-known
drawbacks: this problem requires non-linear optimization of a very
complex function and a good initial guess is difficult to determine
automatically (this is usually provided by manual placement of the
articulated model).}
%This form of fitting also have other drawbacks already
%explained.
\section{Inference}
\label{sec:InfSMA}
%%\changed whole section!!
\changed{ In this section, we refer to probabilistic inference as
finding a full probability distribution for $\mb{h}$ given that
$\mb{x}=\mb{x}^*$ once an observation $\mb{x}^*$ has been made (\eg
some image features were observed).}
\subsection{Inference using the Discriminative Model Alone}
\label{sec:InfSMADis}
\changed{ \out{Learning the discriminative model yields a set
of functions that map elements from the input space to the
output space. Each of the specialized functions maps different parts
of the input space with different levels of accuracy. This mapping
behavior is described probabilistically by $q$ in
Eq.~\ref{eq:LeaDisMod}.} A valid approach to inference is to use the
discriminative model alone. In order to understand how this differs
from our proposed solution (where we combine both, generative and
discriminative models), we will now show what inference involves in
terms of maximum a posteriori (MAP) estimation using the
discriminative model.}
\changed{In a general sense inference involves finding a
full probability distribution for $\mb{h}$ given $\mb{x}^*$; the
discriminative model directly provides this expression. In MAP
estimation we just have to maximize it (\ie we want to find the most
likely output hypothesis $\mb{h} \in \Re^t$ for a given observation
$\mb{x}^* \in \Re^c$):
%\begin{equation}
$\hat{\mb{h}}=\arg\max_\mb{h} q(\mb{h}|\mb{x}^*)=\arg\max_\mb{h} \sum_y
q(\mb{h}|\mb{x}^*,y) Q(y)$,
%\end{equation}
where $q(\mb{h}|\mb{x}^*)$ is a shorthand for
$q(\mb{h}|\mb{x}=\mb{x}^*)$. Any further treatment depends on the
properties of the probability distributions involved.}
In both Cases (1) and (2) considered in previous sections, we can
write $q(\mb{h}|\mb{x},y)=
{\cl{N}}(\mb{h};\phi_y(\mb{x}),\Sigma_y)$.
%In Case (2), by
%definition this is exactly the form of the conditional
%distribution. In Case (1), the form of $\phi_y$ and the covariance
%are described in Eqs. \ref{eq:phiGaussian} and
%\ref{eq:sigmaGaussian}.
\changed{Thus, in either case we have that $q(\mb{h}|\mb{x}^*)$ is a mixture of
Gaussians and if we want to find the MAP estimate we need to solve: }
%\begin{equation}
$\label{eq:StdInf}
\hat{\mb{h}}=\arg\max_\mb{h} \sum_y
{\cl{N}}(\mb{h};\phi_y(\mb{x}^*),\Sigma_y) Q(y).$
%\label{eq:hMix}
%\end{equation}
\changedRev{This result was obtained by employing the MAP principle
using our discriminative model alone. Here we have assumed that we
know the model. In practice we need to estimate or {\it learn} it
(learning will be covered in the next section), but in general,
$q(\mb{h}|\mb{x})$ will usually be an approximation to the true
distribution defined by $p(\mb{h}|\mb{x})$, obtained using the
training data. Even though we could simply adopt the above MAP
estimate as a solution, it should not be surprising that we could
improve upon this by using our knowledge of $p$, the generative
model.}
%However, we have yet to make use of the inverse (rendering) function
%$\zeta:{\Re}^t\rightarrow{\Re}^c$ in our framework.
%RRChange Eq.~\ref{eq:StdInf}...
%% SS: OK
\subsection{Inference Using the Generative Model Alone}
\label{sec:Inf2}
%(recall that the generative model is built from knowledge of the function $\zeta$, the image generating function, thus we use the term 'true' posterior)
Using the generative model, inference involves finding the posterior $p(\mb{h}|\mb{x}=\mb{x}^*)$ ($p(\mb{h}|\mb{x}^*)$ as a shorthand):
\changedRev{
\beqa
p(\mb{h}|\mb{x}^*)&=&\frac{1}{p(\mb{x}^*)}p(\mb{x}^*|\mb{h}) p(\mb{h})=\frac{1}{Z_p}{\cl N}(\mb{x}^*;\zeta(\mb{h}),\Sigma_\zeta)p(\mb{h})\\
\label{eq:Zp}
Z_p&=&\int {\cl N}(\mb{x}^*;\zeta(\mb{h}),\Sigma_\zeta)p(\mb{h})d\mb{h}.
\eeqa}
There are, however, at least two difficult obstacles for achieving this: (1) the integral in Eq.~\ref{eq:Zp} cannot be solved easily and moreover, (2) we do not have an expression for $p(\mb{h})$.
\changedRev{ In MAP estimation we do not need to be concerned about
obstacle (1) since in MAP the goal is to find
$\hat{\mb{h}}=\arg\max_\mb{h} p(\mb{h}|\mb{x})=\arg\max_\mb{h} {\cl
N}(\mb{x};\zeta(\mb{h}),\Sigma_\zeta)p(\mb{h})$ because $Z_p$ is a
constant with respect to this optimization problem. However, solving
for $\hat{\mb{h}}$ given the observed $\mb{x}^*$ is a daunting task,
the space of $\mb{h}$ is too large to explore exhaustively and
$\zeta(\mb{h})$ too complex to apply standard directed search
techniques adequately. If we could start the search using a point
$\mb{h}_0$ that we knew was close enough to the best $\mb{h}$, then
this problem could be mitigated. This idea is often employed in
solving tracking problems, \ie when we have close enough frames (in
time and space) and the previous frame estimate can be trusted
(alternatively many previous estimates can be employed; \eg
\cite{Isard98J}). However, here we solve a different problem where we
only have a single image.}
%\end{equation}
\hide{
needless to say that in the case of body pose estimation, this is in
general, a highly complex non-linear optimization problem (tracking) as we have seen before.}
\changedRev{ A key idea in this paper is that both obstacles would
become much simpler if, somehow, we could accurately obtain samples
from $p(\mb{h}|\mb{x})$. Those samples could be used to (1)
approximate this posterior and (2) find the sample with highest
probability and use it as a MAP estimate (or additionally use it as an
initial point to search for a better estimate). However we cannot even
evaluate $p(\mb{h}|\mb{x})$ and, in addition, accurately sampling from
a given distribution is in general an open problem (\eg
\cite{Mackay98}). }
\hide{A key idea in this paper is that the problems above would become much
simpler if, somehow, we could accurately obtain samples from
$p(\mb{h},\mb{x})$. Those samples could be used to (1) approximate
$Z_p$ and thus the posterior of interest $p(\mb{h}|\mb{x})$ and (2)
find the sample for which $p(\mb{h}|\mb{x})$ is highest for MAP
estimation). However sampling accurately from a given distribution, in
particular $p(\mb{h},\mb{x})$, is in general an open problem
(\eg\cite{Mackay98}).}
\subsection{Inference and Importance Sampling. Combining Generative and Discriminative Models}
\label{sec:GenInf}
\changedRev{
In general, sampling can be used to estimate expectations of a given function
$I(\mb{h})$ with respect to some probability density $\pi(\mb{h})$ that we can
evaluate at any point, but that we cannot sample from. Let us say we
need to calculate the integral
%\beqa
$\label{eq:int}
{\cl I}=\int \pi(\mb{h})I(\mb{h})d\mb{h}$,
%\eeqa
by approximating ${\cl I}$ employing $S$ samples: $\hat{\cl
I}=\frac{1}{S}\sum_{s=1}^S I(\mb{h}^{(s)})$. Let $p(\mb{h}|\mb{x}^*)$
correspond to $\pi(\mb{h})$ ($I(\mb{h})$ can be any function of the
pose), but note we cannot evaluate $p(\mb{h}|\mb{x}^*)$. However, in
the importance sampling method, it is only necessary to evaluate the
distribution up to a multiplicative factor. It turns out that in our
problem we can evaluate $p(\mb{h},\mb{x}^*)$ which is enough since it
is proportional to $p(\mb{h}|\mb{x}^*)$.}
\changedRev{ The question is how to appropriately generate the samples
to obtain the best estimate. In the importance sampling method we
first come up with a proposal distribution $\pi'(\mb{h})$, which we
can also evaluate but from which it is possible to sample; then we
sample from $\pi'(\mb{h})$, but also correct for the bias introduced
when sampling, obtaining: } \changedRev{ \beqa \hat{\cl
I}=\frac{1}{R}\sum_{r=1}^R
\frac{p(\mb{h}^{(r)},\mb{x}^*)}{\pi'(\mb{h}^{(r)})}I(\mb{h}^{(r)}).
\eeqa } \changedRev{ It can be shown that when $R\rightarrow\infty$,
$\sqrt{R}(\hat{\cl I}-{\cl I})\sim{\cl N}(0,\sigma^{2}_{\pi'})$, with:
$\sigma^{2}_{\pi'}=\int
(\frac{p(\mb{h},\mb{x}^*)}{\pi'(\mb{h})}I(\mb{h})-{\cl I})^2
\pi'(\mb{h})d\mb{h}$. Thus, the expected variance of our estimate is
proportional to $\sigma^{2}_{\pi'}$ and inversely proportional to $R$
\cite{Mackay98}. } \changedRev{ Since minimizing variance is a
reasonable criterion to consider, we would like to know what the
optimal proposal distribution $\pi'$ is in terms of minimizing the
estimate variance $\sigma^{2}_{\pi'}$ for a fixed $R$. The optimal
proposal distribution is given by a result in
\cite{Rubinstein81,Cheng00}: \beqa
\label{eq:Just}
\pi'(\mb{h})=\frac{p(\mb{h},\mb{x}^*)}{\int p(\mb{h},\mb{x}^*)
d\mb{h}}, \eeqa which is equal to $p(\mb{h}|\mb{x}^*)$. }
\changedRev{
This makes sense in our simple case (for a general proof, see
\cite{Rubinstein81}), since this is the distribution in the initial
integral we wanted to solve. One would expect that in the limit of infinite samples, the
best estimate for ${\cl I}$ whatever the function $I$ is, should be
obtained when sampling from the exact distribution involved in the
integral. Of course, we know that in our case we cannot sample from
it. However, now we know (1) that from an importance sampling
perspective, we should sample from $p(\mb{h}|\mb{x}^*)$ to minimize
variance, which is a reasonable criterion, and also (2) that in this
result there is no reference to the explicit $p(\mb{h})$.}
\changedRev{The main reason behind using generative and discriminative
models together is to tackle this particular problem of sampling from
a good distribution. We can use the learned distribution
$q(\mb{h}|\mb{x})$ (the discriminative model) to approximate
$p(\mb{h}|\mb{x})$, but just at $\mb{x}=\mb{x}^*$. As we will see in
the next section, we will build this approximation employing the
maximum likelihood principle. This can also be seen as finding a
discriminative distribution $q$ that is close to the (sampled) $p$
distribution (or empirical distribution) in terms of the KL divergence
\cite{Amari95} (see appendix for further discussion).}
\subsection{Non-deterministic MAP Estimation: Multiple Samples (MS)}
\label{sec:MS}
We are usually interested in providing likely samples from the
posterior distribution, in particular we might be interested in the most likely $\mb{h}$. This is the idea behind MAP estimation, where we are interested in
finding
%\beqa
%\label{eq:ForMAP}
$\hat{\mb{h}}=\arg\max_\mb{h} p(\mb{h}|\mb{x}^*)=\arg\max_\mb{h} p(\mb{x}^*|\mb{h})p(\mb{h})$.
%\eeqa
We know that the discriminative model distribution $q(\mb{h}|\mb{x})$
tries to approximate $p(\mb{h}|\mb{x})$, and therefore it is good at
minimizing the variance of the estimator. Due to this, we will use the
discriminative model distribution to provide samples for MAP
estimation. In MAP estimation, we sample ${\cl H}_{Spl}= \{\mb{h}_s
\}_{s=1...S}$ using the proposal distribution
$q(\mb{h}|\mb{x}^*)$. Given the samples, the problem then becomes a
discrete optimization problem that can be solved easily:
\begin{equation}
\label{eq:ASolM}
{\hat s}=\arg\max_s p(\mb{x}^*|\mb{h}_s)=\arg\min_s
(\mb{x}^*-\zeta(\mb{h}_s))^\top \Sigma_\zeta^{-1}
(\mb{x}^*-\zeta(\mb{h}_s)),
\end{equation}
by using the Gaussian form of $p(\mb{x}|\mb{h})$ as given in Eq.\
\ref{eq:zetaNormalDist}. We remark that after using the samples ${\cl
H}_{Spl}$ as a starting point, other more sophisticated methods could
be employed. For example we could use Markov chain Monte Carlo (MCMC)
sampling \cite{Mackay98,Zhu03} to search for regions of higher
probability. Also, instead of stochastic methods, we could employ
standard gradient descent methods to locally search for more likely
poses $\mb{h}$ (as in tracking). These methods may be helpful for some
distributions but in general have several drawbacks: (1) They are
usually very slow in high dimensions and (2) given finite time, they
are not very useful/accurate if the posterior probability is very
complex. Some methods have been proposed to alleviate these problems,
but this goes beyond our current contribution. Keeping this extension
in mind, in this paper we simply use the original samples ${\cl
H}_{Spl}$ to search for a MAP estimate. These estimates proved to be
sufficiently accurate during our experiments.
%% Let us assume that we can approximate $\sum_y p(\mb{h}|\mb{x},y)
%% P(y)$ by a set of samples generated according to
%% $p(\mb{h}|\mb{x},y) P(y)$ and a kernel function
%% $K(\mb{h},\mb{h}_s)$, such that $K(\mb{h},\mb{h}_s) \geq 0$ and
%% $\int K(\mb{h},\mb{h}_s) d\mb{h}=1$ for any given $\mb{h}_s$.
%% Given a set of samples ${\cl H}_{Spl}= \{\mb{h}_s \}_{s=1...S}$,
%% we can construct the approximation $\sum_y p(\mb{h}|\mb{x},y) P(y)
%% \approx \frac{1}{S} \sum_{s=1}^S K(\mb{h},\mb{h}_s)$. We now
%% consider two simple forms for the kernel function $K$.
%% If we use a Dirac delta function kernel centered at each sample
%% $K(\mb{h},\mb{h}_s)=\delta(\mb{h}-\mb{h}_s)$, then we have: $
%% \mb{h}^* \approx \arg\max_\mb{h} p(\mb{x}|\mb{h})
%% \frac{1}{S}\sum_{s=1}^S \delta(\mb{h}-\mb{h}_s)$. This can be
%% reduced to an equivalent discrete optimization problem where the
%% goal is to find the most likely sample $s^*$:
%% \begin{equation}
%% \label{eq:ASolM}
%% s^*=\arg\max_s p(\mb{x}|\mb{h}_s)=\arg\min_s
%% (\mb{x}-\zeta(\mb{h}_s))^\top \Sigma_\zeta
%% (\mb{x}-\zeta(\mb{h}_s)),
%% \end{equation}
%% by using the Gaussian form of $p(\mb{x}|\mb{h})$ as given in Eq.\
%% \ref{eq:zetaNormalDist}.
%% If instead we use Gaussian kernels centered at each sample
%% $K(\mb{h},\mb{h}_s)={\cl N}(\mb{h};\mb{h}_s,\Sigma_{Spl})$, then
%% we have: $\mb{h}^* \approx \arg\max_\mb{h} p(\mb{x}|\mb{h})
%% \frac{1}{S}\sum_{s=1}^S {\cl N}(\mb{h};\mb{h}_s,\Sigma_{Spl})$.
%% This approximation is harder to use in practice. Unlike the Dirac
%% delta kernel approximation, the Gaussian approximation cannot be
%% reduced to an equivalent discrete optimization since there is no
%% guarantee that the optimal $\mb{h}$ for this form is among the
%% samples in general.
\subsection{Deterministic MAP Estimation: Mean Output (MO)}
\label{sec:MO}
In certain applications, it might be advantageous to have a very
fast method for computing MAP estimates. Two examples are: when
working with multiple articulated bodies and in dynamic settings where
it is necessary to provide estimates at a high rate. Even though the
time complexity of MS scales linearly with the number of samples, this
might not be fast enough. Motivated by speed constraints, here we
propose a very fast MAP estimation algorithm that still performs well
in experiments. Unlike MS, this algorithm is deterministic.
The structure of the problem, as well as the form of the
discriminative distribution components (\ie conditioned on the mixture
label) $q(\mb{h}|\mb{x},y)$ employed (Gaussian), make it possible to
construct this deterministic approximation. The basic intuition is
straightforward. For a given $\mb{x}=\mb{x}^*$, we {\em ask} each
mapping function $\phi_k$ to give its most likely estimate for
$\mb{h}$. We then evaluate the probability of each function's
estimate via the generative model distribution
$p(\mb{x}^*|\mb{h})$. This approximation is good in practice, as will
be demonstrated in the experiments.
To justify this deterministic approximation, we note that due to
\changed{its} concavity properties, the probability of the mean is
maximal in a Gaussian distribution; \ie it is the most-likely value.
Formally, in both Case (1) and Case (2) described earlier,
$q(E[\mb{h}|\mb{x}^*,y,\theta]\,|\,\mb{x}^*,y,\theta)\geq q(\mb{h}'|\mb{x}^*,y,\theta)$, for
any $\mb{h}'$. Consider again the set of samples ${\cl H}_{Spl}=
\{\mb{h}_s \}_{s=1...S}$ generated in the MS approximation. We can
build a set of samples ${\cl H}_{\phi}=\{\mb{h}_{k}^{\phi}
\}_{k=1...M}$ that has the property
%\begin{equation}
$\forall y, \max_k q(\mb{h}_{k}^{\phi}|\mb{x}^*,y) \geq \max_s
q(\mb{h}_{s}|\mb{x}^*,y)$,
%\end{equation}
simply by setting $\mb{h}_{k}^{\phi}=\phi_k(\mb{x}^*,\theta)$.
This insight leads to a deterministic approximation for inference, the
{\it Mean Output} solution (MO). This approximate solution relies on
the observation that by considering the means $\phi_k(\mb{x}^*)$, we
would be considering the most likely output of each mapping
function (\ie each mixture component in the discriminative model),
given the input. Obviously we expect the discriminative model to provide
a good approximation of our generative model posterior distribution as
discussed above. Also, the smaller the overlap among the distributions
associated with each function, the better the accuracy of
this approximation.
In MO approximate inference, the expression to be minimized is the
same as that used in Eq.\ \ref{eq:ASolM}, except for the use of
the $M$ means instead of the $S$ samples:
\begin{equation}
\hat{k}=\arg\max_{k \in {\cl C}} p(\mb{x}^*|\mb{h}_{k}^{\phi})
=\arg\min_{k \in {\cl C}} (\mb{x}^*-\zeta(\mb{h}_{k}^{\phi}))^\top
\Sigma_\zeta^{-1} (\mb{x}^*-\zeta(\mb{h}_{k}^{\phi})). \label{eq:ASolU}
\end{equation}
This generally requires substantially less computation than would be
required in the MS approach.
\begin{figure}[h]
\fbox{
\begin{minipage}{6.1in}
{\bf Summary of Inference Algorithms}\\
Input: visual features $\mb{x}^*$ computed from single image, generative, and discriminative models.
\begin{itemize}
\item MO Algorithm
\begin{enumerate}
\item For each function $\phi_k, k=1,...,|{\cl C}|$
\begin{enumerate}
\item Compute $\mb{h}_k=\phi_k(\mb{x}^*)$ using the discriminative model
\item Compute $p(\mb{x}^*|\mb{h}_k)$ using generative model by rendering from $\mb{h}_k$ (apply $\zeta(h_k)$)
\end{enumerate}
\end{enumerate}
Output: MAP estimate $\hat{\mb{h}}\leftarrow$ pick the $\mb{h}_k$ that maximizes $p(\mb{x}^*|\mb{h}_k)$ (use Eq.~\ref{eq:ASolU})
%\hline
%-----------------------------------------------------------------------------------------------------------------
\item MS Algorithm (extra input required: number $S$)
\begin{enumerate}
\item Generate $S$ samples $\mb{h}_s$ from $q(\mb{h}|\mb{x}^*)$
\item For each $s=1,...,S$
\begin{enumerate}
\item Compute $p(\mb{x}^*|\mb{h}_s)$ using generative model by rendering from $\mb{h}_s$ (apply $\zeta(h_s)$)
\end{enumerate}
\end{enumerate}
Output: MAP estimate $\hat{\mb{h}}\leftarrow$ pick the $\mb{h}_s$ that maximizes $p(\mb{x}^*|\mb{h}_s)$ (use Eq.~\ref{eq:ASolM})
%\end{itemize}
\end{itemize}
\end{minipage}
}
%\caption{\small \small Algorithm.}
%\label{fig:InfAlg}
\end{figure}
\section{Learning}
\label{sec:Lea}
\changed{An approximation method will be used in learning the
discriminative model parameters}. We will employ an Expectation
Maximization (EM) approach. EM provides a general framework for
solving the maximum likelihood parameter estimation problem in
statistical models with hidden variables, like Eq.\
\ref{eq:LogSum}. Since the EM algorithm is well known
\cite{Dempster77,Amari95,Neal98}, we will only provide derivations
specific to our formulation.
Note that the unobserved random variables $y_i$ are independent given
$\mb{z}_i$. Thus, the E-step reduces to computing the posterior
probabilities for each $y_i$ given the model parameters and observed
data. We will denote this posterior
$Q(y_i=k|\psi_i,\upsilon_i,\theta)$ using the shortcut notation
$\tilde{Q}^{(t)}(y_i=k)$. We then have:
\begin{equation}
\tilde{Q}^{(t)}(y_i=k)=\lambda_{k}q
(\psi_i|\upsilon_i,y_i=k,\theta^{(t-1)})/\sum_{j \in \cl{C}} \lambda_j
q(\psi_i|\upsilon_i,y_i=j,\theta^{(t-1)}).
\end{equation}
Stated differently, this step estimates the responsibility of each
mapping function, $\phi_k$ for each data point,
$\mb{z}_i$. \changed{$\tilde{Q}^{(t)}(y_i=k)$ represents the so called
responsibility of function $k$ for data pair $i$. Also recall that
$\lambda_k=Q(y_i=k)$ is the prior probability that function $k$ will be
used.}
The M-step consists of finding $\theta^{(t)}=\arg\max_\theta
E_{\tilde{Q}^{(t)}}[\log q(\cl{Z},\mb{y}|\theta)]$. In both of our
cases we can show that this is equivalent to finding:
\begin{equation}
\label{eq:MDef} \theta^{(t)}=\arg\max_{\theta} \sum_i \sum_{k \in
\cl{C}} \tilde{Q}^{(t)}(y_i=k) [\log q(\mb{z}_i|y_i=k,\theta)+ \log
Q(y_i=k|\theta)].
\end{equation}
It is important to mention that this is valid if
$q(\mb{z}_i|\theta)$ depends on $y_i$ and not on $y_j$, for any
$j\neq i$. Note that for the distributions discussed above, this
is true. We now present solutions for the cases described above.
\subsection{Case (1)}
In this case we have:
\begin{equation} q(\upsilon,\psi|y,\theta)=
\cl{N}(\upsilon,\psi;\mu_{y},\Sigma_{y})= \cl{N}(\left[
\begin{array}{c}
\upsilon \\
\psi \\
\end{array}
\right];\left[
\begin{array}{c}
\mu_\upsilon \\
\mu_\psi \\
\end{array}
\right],
\left[
\begin{array}{cc}
\Sigma_{\upsilon\upsilon} & \Sigma_{\upsilon\psi}\\
\Sigma_{\upsilon\psi}^\top & \Sigma_{\psi\psi} \\
\end{array}
\right] )_{y},
\end{equation}
where the subscript $y$ is simply the mapping function number. We can
show that the parameter learning problem is reduced to a mixture of
Gaussian estimation, for which it is straightforward to estimate
$\theta$ using EM. Moreover, the Bayesian estimate of $\psi$ given an
observed $\upsilon$ is also Gaussian: $
%\begin{equation}
q(\psi|\upsilon,y,\theta)=\cl{N}(\psi;\mu_\psi+\Sigma_{\upsilon\psi}^\top\Sigma_{\upsilon\upsilon}^{-1}(\upsilon-\mu_\upsilon),\Sigma_{\psi\psi}-\Sigma_{\upsilon\psi}^\top\Sigma_{\upsilon\upsilon}^{-1}\Sigma_{\upsilon\psi})_{y}.$
%\end{equation}
Therefore in case (1), each function $\phi_{k}$ is just the mean of
the conditional distribution
\begin{equation}
\phi_k(\upsilon,\theta)=(\mu_\psi+\Sigma_{\upsilon\psi}^\top\Sigma_{\upsilon\upsilon}^{-1}(\upsilon-\mu_\upsilon))_{y=k}.
\label{eq:phiGaussian}
\end{equation}
The confidence of the estimate is given by the covariance
%\begin{equation}
$\Sigma_k =
(\Sigma_{\psi\psi}-\Sigma_{\upsilon\psi}^\top\Sigma_{\upsilon\upsilon}^{-1}\Sigma_{\upsilon\psi})_{y=k}.$
%\label{eq:sigmaGaussian}
%\end{equation}
However, this expression
does not depend on the input, a sometimes undesirable consequence
of the given model. Thus, each function $\phi_k$ is linear in the
input vector from ${\Re}^c$.
\subsection{Case (2)}
In this case we have:
\begin{eqnarray}
\label{eq:lambda_der}
\frac{\partial E}{\partial \lambda_k} &=& \sum_i \tilde{Q}^{(t)}(y_i=k)
\frac{\partial}{\partial \lambda_k} \log Q(y_i=k|\theta)\\
\label{eq:sigma_der} \frac{\partial E}{\partial \Sigma_k} &=&
\sum_i \tilde{Q}^{(t)}(y_i=k)
\frac{\partial}{\partial \Sigma_k} \log q(\psi_i|y_i=k,\upsilon_i,\theta_k)\\
\frac{\partial E}{\partial \theta_k}&=&\sum_i
\tilde{Q}^{(t)}(y_i=k) [(\frac{\partial}{\partial
\theta_k}\phi_{k}(\upsilon_i,\theta_k))^\top\Sigma_{k}^{-1}
(\psi_i-\phi_{k}(\upsilon_i,\theta_k))], \label{eq:theta_up0}
\end{eqnarray}
where $E$ is the cost function that we would like to maximize in Eq.~\ref{eq:MDef}.
%RRChange Eq above, [there was an error]
%%SS: OK
This gives the following update rules for $\lambda_k$ and
$\Sigma_k$, where Lagrange multipliers were used to incorporate
the constraint that the sum of the $\lambda_k$'s is 1:
\begin{eqnarray}
\label{eq:lambda_up}
\lambda_k^{(t)}&=&\frac{1}{N}\sum_i\tilde{Q}^{(t)}(y_i=k)\\
\label{eq:Sigma_up} \Sigma_k^{(t)}&=&\sum_i
\tilde{Q}^{(t)}(y_i=k) (\psi_i-\phi_{k}(\upsilon_i,\theta_k))
(\psi_i-\phi_{k}(\upsilon_i,\theta_k))^\top/\sum_i\tilde{Q}^{(t)}(y_i=k).
\end{eqnarray}
To keep the formulation general, we have not yet defined the form
of the mapping functions $\phi_k$. Whether or not we can find
a closed form solution for the update of $\theta_k$ depends on the
form of $\phi_k$. For example if $\phi_k$ is a non-linear
function, we may have to use iterative optimization to find
$\theta_{k}^{(t)}$. If $\phi_k$ yields a quadratic form, then a
closed form update exists.
%\comment{Is there some place in this paper that provides the
%details of the update for the $\phi_k$ you used? For instance NN
%or other function? As it is, the paper is incomplete; it does not
%give all of the details that someone needs to duplicate your
%system.}
\changed{Regarding our generative model, there is very little
learning involved. If $\zeta$ is very accurate, then we could also
tell very accurately the image that will be generated given a body
pose $\mb{h}$. In practice $\zeta$ can be defined only approximately.
We account for this by properly setting $\Sigma_\zeta$ depending on
how much noise is expected to be present in the observations. This can
also account for inaccuracies in the geometric model.}
% ATT the prior, ATT \Simga_z
% However, the prior probability over poses
%$p(\mb{h})$ is unknown, but interestingly, as we will see in the
%following section, we do not need to specify it in our generative
%model.}
\out{
\subsection{Stochastic Learning}
The aforementioned optimization equations for the discriminative model
can be used to find a local minimum given the initial parameter
values. In order to improve this process, and avoid some of the local
minima that inevitably arise, we use an annealing schedule on the
$\tilde{Q}^{(t)}$ probabilities during the M-step. In this way, we
redefine:
\begin{equation}
\tilde{Q}^{(t)}(y_i=j) \leftarrow
\frac{e^{\log(\tilde{Q}^{(t)}(y_i=j))/T(t)}}{\sum_{k \in \cl{C}}
e^{\log(\tilde{Q}^{(t)}(y_i=k))/T(t)} }.
\end{equation}
In our experiments, the temperature parameter $T$ decays
exponentially. This step not only helps in avoiding local minima,
but it also creates two desirable effects. It forces
$\tilde{Q}^{(t)}(y_i=j)$ to be binary (either $1$ or $0$) at low
temperatures; as a consequence each point will tend to be mapped
by only one function at the end of optimization.
Moreover, it makes $\tilde{Q}^{(t)}(y_i=k)$ ($k=1,2,...,M$) be
fairly uniform at high temperatures, making the optimization less
dependent on initialization.}
%Note that in some cases, there is no closed-form solution for the
%M-step. In practice we have decided to perform two or three
%iterations per M-step. A source of randomness added to the process
%so far described consists of choosing data points randomly and
%uniformly distributed when performing the M-step. These two
%variants of the M-step have been justified in the sense of a
%partial M-step \cite{Neal98}.
%\comment{While the above paragraph makes some sense, it is really
%unclear how you are actually performing the M-step. It is best
%understood in an example (say for your MLP that would be used in
%the experiments anyway). Perhaps a new subsection is needed here
%to give a summary of the learning algorithm, and the MLP example.}
\section{Example Application: Articulated Pose from Visual Features}
\label{sec:Apps}
The formulation presented in this paper is rather general, and could
be applied in a number of supervised learning problems for which the
output-to-input (feedback) map is relatively easy to compute;
\changed{ thus allowing us to specify an accurate generative
model}. To demonstrate and test our framework, we have developed a
system that uses our approach to infer articulated pose from
low-level visual features. In particular, we focused on pose
estimation of the human hand and body from an image silhouette. In
this class of computer vision applications, ground truth datasets for
training can be obtained via motion capture gloves or body suits, and
computer graphics rendering can be used to generate the input-output
pairs used in supervised learning. We will now give details of this
demonstration system.
\subsection{3D Hand Pose Estimation}
\label{sec:AppsHand}
The goal is to recover detailed 3D hand pose from silhouette features
computed from a single color image. Hand pose is defined in terms of
the hand joint angles. In general, we are also interested in global
orientation of the hand. We explore two applications: estimation of
the internal joint angles only, and later, estimation of both internal
joint angles and global orientation of the hand.
\subsubsection{Hand Model}
We utilize the hand model provided in the VirtualHand programming
library \cite{virtual_hand}. The model parameters are 22 joint
angles. For the index, middle, ring and pinky finger, there is an
angle for each of the distal, proximal and metacarpophalangeal
joints. For the thumb, there is an inner joint angle, an outer
joint angle and two angles for the trapeziometacarpal joint. There
are also abduction angles between the following pairs of
successive fingers: index/middle, middle/ring and ring/pinky.
Finally, there is an angle for the palm arch, an angle measuring
wrist flexion and an angle measuring the wrist bending towards the
pinky finger. However, because the former two wrist angles also
encode global orientation, we decided not to model them in our
application. Hence, ignoring these two angles, our model has 20
DOF for the internal hand configuration.
All of these 20 angles are relative to two global orientation
angles. These two angles will encode the camera viewpoint (or
alternatively hand 3D rotation). Imagine a sphere surrounding the
hand model, \ie a fixed hand center point is at the center of the
sphere. For ease of reference, we will employ the widely used
latitude and longitude notions. The first angle $\beta_1$
represents the latitude from which we are looking at the hand, the
second angle $\beta_2$ represents the longitude. We have defined
$\beta_1 \in [0,\pi]$, with zero and $\pi$ being the {\it poles}
of the sphere and $\beta_2 \in [0,2\pi)$. Thus, in summary our
full hand model has 22 DOF.
\psfigurepath{./figs/}
\begin{figure}[t]
\centerline{\small
\psfig{figure=AllViewsHandBin.ps,width=0.5\textwidth} }
\mycaptionS{\small Example of the 86 silhouettes obtained via
computer graphics rendering for a given 3D hand pose. Views are
distributed approximately uniformly over the view
sphere.}\label{fig:HAllViews}
\end{figure}
\subsubsection{3D Hand Motion Datasets}
\label{sec:3DHDS}
Using a CyberGlove, we collected approximately 9,000 examples of 3D
hand poses. This data included hand configurations from American Sign
Language (ASL) and other configurations informally performed by
several subjects. Using computer graphics and an artificial hand
model, we then rendered each captured hand pose from multiple
viewpoints on the view sphere. We defined a set of 86 viewpoint angle
pairs $(\beta_1,\beta_2)$ so that the sphere surface is sampled
approximately uniformly. Thus we obtained a full dataset of $9,000
\times 86$ views. Each view has an associated binary image mask
(silhouette), and a 22 DOF pose vector. Fig.\ \ref{fig:HAllViews}
shows the 86 viewpoints used in the dataset for a particular
configuration.
From these silhouettes, we extract the visual features that will
be used for further processing. In our implementation, we used two
classes of features (these features are not used together): Hu
moments and Alt moments. Alt moments \cite{Alt62} are translation
and scale invariant, but not rotation invariant. Hu moments
\cite{Hu62} are invariant to translation and scaling, but also
invariant to rotation in the image plane. These moment features
were used in our implementation because they are relatively easy
to compute, and they provide invariants that are appropriate for
our demonstration application. However, our general formulation can be used with other visual feature representations
if desired. Detailed examination of the feature selection problem
is outside the scope of this paper, and remains a topic for future
research.
We define two experimental datasets:
\begin{enumerate}
\item {\em Hand-Single-View:} In this dataset, the hand is viewed
from only one viewpoint ($\beta_1=\pi/2$, $\beta_2=0$), generally
making the palm of the hand visible. Silhouette features are
computed using Alt moments. This yields approximately 9,000
input-output pairs.
\item {\em Hand-All-Views:} In this dataset, the hand is viewed
from all 86 viewpoints. Silhouette features are computed using Hu
moments. This yields approximately 750,000 input-output pairs.
\end{enumerate}
\subsubsection{Hand Detection and Segmentation}
\label{sec:segment}
For live video input, we will use video sequences collected with a
color digital camera. It will be assumed that these sequences have a
static background and only one person is present. In this
implementation, we are not considering hand occlusion analysis, which
by itself is a difficult task. Our system tracks both hands of the
user automatically using a skin color tracker \cite{sigal_2000,RosalesICCV01}.
%RRChange .... and the person is facing towards the camera [is not needed]
%%SS: OK
\subsection{2D Human Body Pose Estimation}
\label{sec:2DBP}
In this application, our goal is to recover the articulated pose of a
human body observed in a single image. The methodology followed is
very similar to that used in the estimation of hand pose. However,
instead of joint angles, body pose will be specified in terms of
marker positions at a predetermined set of joints. We will estimate
the 2D positions of these body markers in the image plane.
\subsubsection{Human Body Model}
The human body model is defined in terms of 20 3D marker positions
(60 DOF). The 20 markers are distributed as follows: three markers
for the head, three markers for the hip/back bone articulation,
plus one marker for each shoulder, elbow, wrist, hand, knee,
ankle, and foot. For computer graphics rendering, the body model
is composed of cylinders of equal width. The cylinders connect the
markers to form the standard human body structure. The thorax is
modeled using a wider cylinder. Because we are only interested in
the shape of the projected model, we do not include texture or
illumination in our rendering.
\subsubsection{Human Body Pose Dataset}
Human body motion capture data was obtained from several sources:
http://www.biovision.com, Matt Brand's dataset \cite{Brand99}, and
several demo sequences in the software package {\em Character Studio}.
In total there are 32 captured sequences that depict variations of
different activities: dancing, walking, kicking, waving, throwing,
jumping, signaling, crouching down. The total number of frames
collected is approximately 7,000, mostly at 30 frames/second. Using
computer graphics and our artificial body model, we then rendered each
frame from 16 equally-spaced viewpoints on the equator of the view
sphere centered at the hip of the body model. For each view, we also
used the camera model to obtain the 2D marker positions in the image
plane. Thus we obtained a full dataset of approximately $7,000 \times
16$ views. Each view has an associated binary image mask (silhouette)
and a 40 DOF projected marker vector. From the silhouettes, we extract
the visual features that will be used as input. We have chosen Alt
moments \cite{Alt62} as our visual features, mainly due to their ease
of computation and invariance to translation and scaling. We call
this the {\em Body-All-Views} dataset.
\subsubsection{Detection and Segmentation}
\label{sec:BodyDet}
For live video input, we use sequences collected with a color digital
camera. It is assumed that these sequences have a static background,
only one person is present, and the person is fully-visible. We use a
simple and widely-used human body segmentation scheme
\cite{Hogg83,Wren96}. The technique employs statistical learning to
acquire a model of the background appearance, where each pixel's color
(luminance) is represented by a Gaussian distribution. Segmentation is
then approached using maximum-likelihood, where each pixel is
classified as belonging to the background or the foreground (human
body).
\out{
The above process yields a set of input-output (cue-pose) pairs to
be used in our experiments. In this case, the cues are the Alt
moments for a particular view, and the pose is encoded in terms of
the projected locations of the body markers in the image plane (40
DOF).}
\subsection{Common Implementation Details}
We now briefly discuss implementation details common to both
applications.
\subsubsection{Mapping Functions}
In Sec.\ \ref{sec:ProMod}, it was not specified what class of
(deterministic) mapping functions ${\phi_k}$ were to be used. Our
framework is practically independent of this choice. However, from
Eq.\ \ref{eq:theta_up0} we can notice that there are clear advantages
in the M-step if these functions are differentiable with respect to
their parameters. In the case of quadratic or linear functions, the
M-step can be performed exactly in one step. However, the power of
these functions is limited. In our implementation each function takes
the form of a multi-layer perceptron with one hidden layer (MLP); a
widely used feedforward neural network architecture. For this
non-linear function there does not exist a closed-form solution for
Eq.~\ref{eq:theta_up0}, \changedRev{but one can see that the M-step is
like a weighted version of backpropagation for each MLP}. We used four
to five iterations of the conjugate gradient descent method per
M-step.
\hide{
% this paragraph isn't needed
For the non-linear one hidden layer perceptrons, there does not
exist a closed form solution for Eq.~\ref{eq:theta_up0}. We use
the conjugate gradient (CG) optimization method, for performing
the M-step. If $\phi_k$ is a one hidden layer perceptron with
parameters $\theta_k$, we have:
%% This is simply a restatement of Eq 14. with nothing new. So it should not be included.
\beqa \frac{\partial E}{\partial \theta_k}&=&\sum_n
\tilde{Q}(y_i)[(\frac{\partial}{\partial
\theta_k}\phi_{k}(\upsilon_n,\theta_k))^\top\Sigma_{k}^{-1}
(\psi_n-\phi_{k}(\upsilon_n,\theta_k))], \eeqa
% I thought about adding this, but it seems non-essential at this point (sorry).
Since in a one hidden layer perceptron the parameters are a set of
real-valued weights, let us explicitly denote the parameters of
$\phi_k$ as $\theta^k=\{w_{jil}^k\}$, where $w_{jil}^k$ denote the
synaptic weight from node $i$ to node $j$ in layer $l$, for the
function $k$ \cite{HaykinBook96}. Also, denote $\varphi$ the
non-linear function relating input $s$ to output activity $r$ in the
hidden layer nodes, \ie $r_i^{(2)}=\varphi(s_i^{(2)})$, the output
nodes are assumed linear, \ie $r_i^{(3)}=\alpha s_i^{(3)}$
\footnote{In both cases the biases are embedded in the function
definitions.}. With this re-parameterization, we can then show that
the gradient for function $k$ is:
If $l=2$ ($w$ connects the hidden with the output layer): \beqa
\frac{\partial}{\partial
w_{jil}^k}\phi_{k}(\upsilon,\theta_k)=-r_i^{(3)}\varphi'(s_j^{(3)})
\eeqa
If $l=1$ ($w$ connects the input to the hidden layer): \beqa
\frac{\partial}{\partial
w_{jil}^k}\phi_{k}(\upsilon,\theta_k)=-r_i^{(3)}\varphi'(s_j^{(3)})\sum_q
\varphi'(s_q^{(2)}) w_{qj}^{(2)} , \eeqa
with $s_i^l$ the input in node $i$ in layer $l$ and $r_i$ its
corresponding output activity.}
\subsubsection{Generative Model Details: Feedback Functions}
There are at least two ways to define this function. On the one hand,
$\zeta$ could be a computer graphics rendering function. On the other
hand, we could estimate an approximate $\hat{\zeta}$ given a set of
output-input training examples. In our implementation, we experimented
with both ideas. For $\zeta$, we used computer graphics renderings of
our hand and body models obtained via OpenGL. For $\hat{\zeta}$, we
used a one-layer MLP, with twenty hidden nodes (however the method is
overall independent of the functional form chosen). In our experience,
this provides an adequate and efficient approximation.
%RRChange .... with twenty hidden [replaced] with five hidden...
%%SS: OK
The approximate feedback function is useful primarily because it is
faster to compute than a graphical rendering followed by visual
feature computation. \changed{The key issue to keep in mind is that the
feedback mapping is assumed to be simple (one-to-one or even
many-to-one) or that it has a known form, otherwise if we assume too
simple functional forms, we would only introduce more estimation
errors. Of course, this is just a practical issue}. If the feedback
mapping is too complex to approximate easily, we could always rely on
the available feedback function $\zeta$.
%%RRChange [Added] many-to-one
%%SS: OK
\subsubsection{Computational Performance}
For an Athlon 1400 PC with 2GB memory, running unoptimized Matlab 6.0
code, it takes approximately five hours to train a model with 10
dimensions (input) and 10 dimensions (output), using 4500 patterns,
and 40 single hidden layer MLPs with five hidden nodes
each. The system can infer body poses at approximately 11 frames per
second, using the Mean Output (MO) algorithm. \changed{This
approach's} related computations take approximately 70\% of this
time. This time includes OpenGL-based rendering of body poses in
$\zeta$. The rest is spent in segmentation and feature
calculations. The Multiple Sample (MS) algorithm takes time
proportional to the number of samples used. Of course, segmentation
and feature computation for the segmented image is done only once. We
noticed that for our implementation, if we use the approximate
feedback function, $\hat\zeta$, the rendering time is reduced to
approximately one-fourth.
%%RRChange, [I looked at my notes and fixed this]
%%SS: OK
\subsubsection{Early Stopping During Training}
During model training, we used cross-validation for early stopping and
to avoid over-fitting as follows:
%\footnote{The Minimum Description
%Length (MDL) principle \cite{Rissanen86} was also used to avoid
%overfitting as explained in the experiments}:
%%RRChange [added footnote]
%%SS: Removed: I must insist (sorry).
%% This is redundant with text elsewhere and unrelated to early stopping.
\begin{itemize}
\item {\em Training data:} Stop if the log-likelihood changes less
than 0.5\% averaged over the last ten iterations.
\item {\em Held out data:} Stop if the held out data
log-likelihood average change is negative over the last ten
iterations. Held out data was chosen in the same way as the
training and test data.
\item {\em Number of iterations:} Stop if a maximum of 200
iterations is reached.
\end{itemize}
\setlength{\tabcolsep}{1pt} %%SS: This changes separation between table columns
\renewcommand\arraystretch{0.25} %% SS: This changes separation between table rows
\section{Experimental Results}
\label{sec:Exp}
We now present experimental results obtained using our approach in
estimating the pose of the human hand and body. For many additional
performance experiments not included due to space limitations, the
reader is referred to \cite{RosalesPhDThesis} and for several MO
estimation videos to
http://www.psi.toronto.edu/$\sim$romer/SMAHandVideos.htm. The SMA
application independent Matlab code can be found at
http://www.psi.toronto.edu/$\sim$romer/SMACode.htm.
\subsection{Hand Pose Estimation Given a Fixed Camera Viewpoint}
\label{sec:FixCam}
In our first experiments, our approach is tested in the task of
recovering 3D human hand pose given a fixed camera viewpoint: a
view towards the palm of the hand. For training, we used the {\it
Hand-Single-View} dataset, which contains a total of approximately
9,000 examples. Of these, 3,000 were used for training and the
rest for testing. All experiments were performed on a test set
that shared no common poses with the training set. The
input-output pairs were then defined as follows. The input
consisted of 10 Alt moments computed from the silhouette of the
hand, as described in Sec.\ \ref{sec:AppsHand}. The output
consisted of 20 joint angles of a human hand linearly encoded by
nine values using Principal Component Analysis (PCA).
The number of mixture components for the discriminative model (mapping
functions) was set to 20. This number was found to be optimal in the
sense of the Minimum Description Length (MDL) principle
\cite{Rissanen86}; we found this number via a rough model search
(testing MDL and getting the score for the optimized model with
10,12,...,24 functions). Each mapping function (for each of the
Gaussians in the mixture) was an MLP with seven hidden neurons.
\subsubsection{Quantitative Results}
We randomly selected approximately 4,000 frames not included in the
training set. Since ground-truth is available, we used the average
absolute difference per joint angle (between ground-truth and
estimate) as the error measure. Table~\ref{tab:Err1} summarizes our
results (see caption).
\begin{table}[t] {
\begin{center}
\begin{tabular}{|c|c|c|c|c|c|c|c|c|}
\hline
& MO-MAP ($\hat{\zeta}$) & MS-MAP ($\hat{\zeta}$) & MS-20 ($\hat{\zeta})$ & MO-MAP ($\zeta$) & MS-MAP ($\zeta$) & MS-20 ($\zeta$) & Rand/train & Range \\
\hline
\hline
$\hat{\cl E}$ & 0.1322 & 0.1667 & 0.1465 & 0.1651 & 0.1769 &0.1785 & 0.4294 & 1.55\\
\hline
$\sigma^2_{\hat{\cl E}}$ & 0.0317 & 0.0415 &0.0371 & 0.0425 & 0.0452 &0.0547 & 0.1630 & -\\
\hline
\end{tabular}
\end{center}}
\mycaptionS{Mean absolute error $\hat{\cl E}$ and variance
$\sigma^2_{\hat{\cl E}}$. Inference performance using different
rendering functions ($\zeta$ and $\hat{\zeta}$) and inference
algorithms (MO-MAP and MS-MAP). Also shown, the accuracy of the most
probable reconstructions given by MS (MS-20). As a point of
comparison, results are presented for an algorithm that randomly
chooses one of the training examples as result (Rand/train). The
average range of the data is also shown as a reference point. All
units are in radians.}
\label{tab:Err1}
\end{table}
\hide{
Using the estimated feedback function $\hat\zeta$ in the
Mean Output approach (MO), the average $L_2$ error between
reconstruction and ground-truth was $0.1863$ radians (approximately
$10^o$), with variance $0.0185$. These error estimates are averaged
over joint angles. We ran this experiment with the same test set, but
instead used the computer graphics rendering feedback function
$\zeta$. When using $\zeta$, similar accuracy was obtained. The
average $L_2$ error between reconstruction and ground-truth in this
case was $0.241$ radians, with variance $0.0312$. Their symmetric KL
divergence is 0.134 bits. In \cite{RosalesPhDThesis}, we explain in
detail possible reasons for this difference in performance.
%SChange previous paragraph
}
\psfigurepath{./figs/H90}
\begin{figure*}[ht]
\centerline{\begin{tabular}{rcccccccccc}
GT &
\psfig{figure=HandData90_R10i.TestGT.mat.02096.eps,width=0.61in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.03973.eps,width=0.61in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.03275.eps,width=0.61in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.01965.eps,width=0.61in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.01265.eps,width=0.61in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.00655.eps,width=0.61in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.01729.eps,width=0.61in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.02576.eps,width=0.61in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.01877.eps,width=0.61in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.01091.eps,width=0.61in,clip=t} \\
%
MO &
\psfig{figure=Res_softH2V20-2_0_R10i.02096.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.03973.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.03275.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.01965.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.01265.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.00655.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.01729.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.02576.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.01877.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.01091.eps,width=0.61in,clip=t} \\
%
\\
\out{
\hline
\\
%
GT &
\psfig{figure=HandData90_R10i.TestGT.mat.03942.eps,width=0.61in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.03569.eps,width=0.61in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.01572.eps,width=0.61in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.02273.eps,width=0.61in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.02575.eps,width=0.61in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.01681.eps,width=0.61in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.01659.eps,width=0.61in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.02401.eps,width=0.61in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.02751.eps,width=0.61in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.02183.eps,width=0.61in,clip=t} \\
%
MO &
\psfig{figure=Res_softH2V20-2_0_R10i.03942.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.03569.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.01572.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.02273.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.02575.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.01681.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.01659.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.02401.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.02751.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.02183.eps,width=0.61in,clip=t} \\
%
\\}
\hline
\\
%
GT &
\psfig{figure=HandData90_R10i.TestGT.mat.02663.eps,width=0.61in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.03842.eps,width=0.61in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.02162.eps,width=0.61in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.02353.eps,width=0.61in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.02369.eps,width=0.61in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.04272.eps,width=0.61in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.04048.eps,width=0.61in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.03872.eps,width=0.61in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.03856.eps,width=0.61in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.03840.eps,width=0.61in,clip=t} \\
%
MO &
\psfig{figure=Res_softH2V20-2_0_R10i.02663.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.03842.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.02162.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.02353.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.02369.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.04272.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.04048.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.03872.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.03856.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.03840.eps,width=0.61in,clip=t} \\
%
\\
\hline
\\
%
GT &
\psfig{figure=HandData90_R10i.TestGT.mat.03296.eps,width=0.61in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.02928.eps,width=0.61in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.02896.eps,width=0.61in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.02784.eps,width=0.61in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.02672.eps,width=0.61in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.01825.eps,width=0.61in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.02576.eps,width=0.61in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.02449.eps,width=0.61in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.02001.eps,width=0.61in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.03328.eps,width=0.61in,clip=t}\\
%
MO &
\psfig{figure=Res_softH2V20-2_0_R10i.03296.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.02928.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.02896.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.02784.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.02672.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.01825.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.02576.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.02449.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.02001.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.03328.eps,width=0.61in,clip=t} \\
\end{tabular}
} \mycaption{Estimated hand poses using Mean Output (MO) algorithm
and $\hat{\zeta}$}{\small \CapTestI}{} \label{fig:H90Res}
\end{figure*}
These experiments quantitatively confirmed that MO inference provides
a reasonable approximation, at least for this dataset. Recall from
Sec.\ \ref{sec:MO} that MO inference was based on the premise that the
most-likely reconstruction given by each \changed{discriminative
mixture component} provides a good approximation to the best solution
given by the full probability distribution.
Fig.\ \ref{fig:H90Res} shows example reconstructions obtained via
the MO approach. In many cases, the reconstruction is close to the
ground truth. In other cases, the silhouette is highly ambiguous,
and the reconstruction does not match ground truth. A good example
is shown in image pair number 34 (the last row-pair, fourth column),
where the camera's image plane is perpendicular to the axis of
the pinky finger. Note that the estimated hand pose disagrees with
the ground-truth in the several joint angles associated with this
finger. Similar effects with other joint angles can be seen in
example pairs 8, 16, 27, etc.
Ambiguous configurations are indeed very common with a binary
image representation. Note that in other ambiguous cases shown in
Fig.\ \ref{fig:H90Res} reconstruction is closer to ground truth,
\eg pairs 19, 20, etc. Possible reasons for this agreement are
diverse:
\begin{enumerate}
\item The input is not really ambiguous (probabilistically speaking)
in the observation space. The other possible outputs (geometrically
speaking) associated with this input may be very unlikely given the
training set. This depends on the underlying structure of the
configuration manifold. One of the main goals of a learning algorithm
is to find this structure. Indeed these results show that our
algorithm is finding this structure, since in most cases, MO finds a
valid sample from the manifold.
%RRChange [last sentence]
%%SS: OK
\item \changed{ The learned discriminative model was accurate at
modeling the given input using a single mixture component} (\ie few
mapping functions were trained to map this input; therefore, the rest
of the functions produced irrelevant (bad) outputs).
\item By chance, among many very similarly probable solutions, the
{\it right} one was chosen. Of course, even with the help of chance in
this case, the discriminative model needed to be accurate enough at
approximating the true posterior so that samples were relevant at all.
%mapping functions needed to provide the
%right mapping for the given input $\mb{x}$.
\end{enumerate}
\hide{
The accuracy of the Multiple Samples (MS) inference approach was
tested in similar experiments with approximately $4,000$ randomly
chosen test examples not included in the training set. When the
estimated feedback function $\hat\zeta$ was used, the mean $L_2$ error
of the most likely sample to the ground-truth was $0.2202$ radians
with variance $0.0228$. The mean error and variance from the best 20
samples was $0.308$ and $0.0323$ respectively. When we performed the
same experiment, but instead used the computer graphics feedback
function $\zeta$, we obtained a mean error of $0.2628$ radians with
variance $0.0242$ for the most likely sample. The mean error of the
best 20 samples was $0.3128$ radians with variance $0.0300$.
}
\subsubsection{Performance Comparison with Respect to Discriminative Model Alone and a Competing Approach}
\label{sec:H90Comparison}
\changedRev{In this section we experimentally compare our method
with the purely discriminative approach, that is, without employing the
generative model (but only the discriminative one). One can see this
test as a way to measure how effectively the generative model
disambiguates among poses; thus illustrating its level of contribution
in the overall approach. In addition, for further validation, we also
compare our method against the standard MLP, trained using
backpropagation to {\it globally} map image features to 3D poses. }
\changedRev{As before we follow the MAP principle to determine the
best pose $\hat{\mb{h}}$ given input features and a model. Recall
from Sec.~\ref{sec:InfSMADis} that the MAP estimate is given by
$\hat{\mb{h}}=\arg\max_\mb{h} \sum_y
{\cl{N}}(\mb{h};\phi_y(\mb{x}^*),\Sigma_y) Q(y)$. Since this function
is not concave, we used a simple heuristic to choose a maximum. We
performed gradient ascent starting at each of the $M$ points
$\{\phi_y(\mb{x}^*)\}_{y \in {\cl C}}$, and set $\hat{\mb{h}}$ to the
highest point ever reached. }
\changedRev{As expected this method performed poorly. The mean
absolute error and variance for this dataset were $0.3702$ and
$0.2117$ respectively, just better than randomly choosing a pose from
the training set (Table \ref{tab:Err1}). This should cause no surprise
at all since the discriminative model alone is not designed to ``know''
what the right mixture component is, given any input presented. More
formally, the mixture parameters $Q(\mb{y})$ do not depend on the
input. The high variance can be attributed to the inconsistent usage of
good and bad functions to map the input. The role of the generative
model in our approach is essentially that of providing information
about what function (mixture component) is appropriate given the
input. }
\changedRev{For an external comparison, we now compare our
full approach against the widely used MLP. Note that unlike above,
here we use one MLP in the standard way, that is, as a function
approximation approach to map inputs to outputs using the whole
training set (trained using backpropagation). MLP is an {\it
off-the-shelf}, yet commonly effective, method. }
\changedRev{For this comparison, we varied the number of parameters
(number of weights and biases) in a considerably broad range. Results
are shown in Table \ref{tab:H90Comp} as a function of the number of
hidden nodes. In order to establish fair comparison with our model, we
need to use the same number of parameters. It turns out that the
number of hidden nodes of the MLP must be equal to $K\sqrt{M}$ (where
$K$ is the number of hidden nodes for each function in our approach and
$M$ is the number of functions); for this experiment this number is
approximately $22$ (shown in bold face).}
\changedRev{
\begin{table}[t] {
\begin{center}
\begin{tabular}{|c|c|c|c|c|c|c|c|c|c|}
\hline
Number of Hidden Nodes & 16 & {\bf 22} & 28 & 34 & 40 & 46 & 52 & 58 & 64\\
\hline
\hline
$\hat{\cl E}_{MLP}$ & 0.2039 & {\bf 0.1953} & 0.1851 & 0.1784 & 0.1733 & 0.1729 & 0.1738 & 0.1891 & 0.2003\\
\hline
$\sigma^2_{\hat{\cl E}_{MLP}}$ & 0.0354 & {\bf 0.0324} & 0.0294 & 0.0280 & 0.0266 & 0.0278 & 0.0341 & 0.0419 & 0.0512\\
\hline
\end{tabular}
\end{center}}
\mycaptionS{Performance comparison between our
discriminative-generative approach against the standard MLP (results
are averages from 10 runs per model). The table shows the mean
absolute error and variance using the same training/test sets as our
method (see Table \ref{tab:Err1}). Overall the performance of our
method is from 1.09 to 1.48 times better than this approach using
the same number of parameters and from 0.97 to 1.31 times better
when letting the MLP have any number of parameters in the table (46
hidden nodes).}
\label{tab:H90Comp}
\end{table}}
\changedRev{By comparing the results from Tables \ref{tab:Err1} and
\ref{tab:H90Comp} we can observe that (1) for a fair comparison with
similar number of free parameters, our discriminative-generative
method (for all inference algorithms in Table \ref{tab:Err1}) clearly
outperforms the MLP and (2) when the MLP is allowed to have more
parameters, our method still outperforms the MLP in average; however,
for a few inputs the performance is similar or better for MLP. Note
that when the number of parameters for the MLP is larger, the variance
also diminishes considerably. However, we should remark that to achieve
such performance, the MLP needed to employ approximately $1.7$ times
the number of parameters employed by our model. }
\subsubsection{Experiments with Real Images}
\label{sec:H90RealImgs}
We now test our approach using uncalibrated video sequences, where the
camera is pointing towards the palm of a person's hand. On average,
the hand occupied an area of approximately $200 \times 200$
pixels. Segmentation was obtained as described in Sec.\
\ref{sec:segment}. In the first experiment, we use the MO approach to
obtain a single {\it best} estimate for each segmented hand. Estimates
for 40 frames, taken 0.9 seconds apart, are shown in Fig.\
\ref{fig:Real90TestIE}. Visually we can notice that in most cases the
estimate is a plausible explanation of the segmented silhouette.
However, there are also a few inaccurate reconstructions.
%as seen in the fourth row, columns 1 and 5.
%% SS: (figure changed, so these numbers are incorrect)
In general, it is expected that the model cannot perform well
in all configurations (this is true for almost any machine
learning model) due to the following reasons:
\begin{enumerate}
\item The proposal distribution $q(\mb{h}|\mb{x})$ does not resemble
the true posterior distribution $p(\mb{h}|\mb{x})$ at the particular
$\mb{x}=\mb{x}^*$: learning is the result of optimizing an {\it
expected} or average error.
\item The real hand and synthetic hand model features are similar
but not the same. Anthropometric differences can influence
inference accuracy.
\item Even the best model could fail in some configurations.
Information theory tells us that this is always the case except
when the {\it information} in the features is equal to the entropy
of the body pose configurations; in other words, when features
tell us everything needed about the configuration. Otherwise,
there might be multiple explanations for a given visual feature
vector.
\end{enumerate}
In order to test the ability of the system to provide these multiple
explanations, we tested the Multiple Samples (MS) approach. Fig.\
\ref{fig:Real90TestII} shows the estimates found using MS. These
estimates can be interpreted as possible hypotheses of hand
configurations given the silhouettes. \changed{Note that MS tends to
bias the hypotheses towards samples from the distribution
$q(\mb{h}|\mb{x}^*)$, but we can account for this when building a full
probability distribution, as explained in Sec.~\ref{sec:GenInf}.}
\psfigurepath{./figs/RealResultsHand2}
\begin{figure*}[ht]
\centerline{\small \begin{tabular}{rcccccccccc}
RV &
\psfig{figure=Subsampled1_F71_770.rle.00001.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00006.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00011.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00016.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00021.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00026.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00031.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00036.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00041.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00046.eps,width=0.61in,clip=t} \\
%
MO &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00001.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00006.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00011.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00016.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00021.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00026.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00031.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00036.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00041.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00046.eps,width=0.61in,clip=t} \\
%
\\
\hline
\\
%
RV &
\psfig{figure=Subsampled1_F71_770.rle.00051.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00056.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00061.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00066.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00071.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00076.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00081.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00086.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00091.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00096.eps,width=0.61in,clip=t} \\
%
MO &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00051.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00056.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00061.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00066.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00071.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00076.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00081.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00086.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00091.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00096.eps,width=0.61in,clip=t} \\
%
\\
\hline
\\
%
RV &
\psfig{figure=Subsampled1_F71_770.rle.00101.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00106.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00111.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00116.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00121.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00126.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00131.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00136.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00141.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00146.eps,width=0.61in,clip=t} \\
%
MO &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00101.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00106.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00111.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00116.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00121.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00126.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00131.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00136.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00141.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00146.eps,width=0.61in,clip=t} \\
\out{
\\
\hline
\\
%
RV &
\psfig{figure=Subsampled1_F71_770.rle.00151.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00156.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00161.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00166.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00171.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00176.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00181.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00186.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00191.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00196.eps,width=0.61in,clip=t} \\
%
MO &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00151.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00156.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00161.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00166.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00171.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00176.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00181.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00186.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00191.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00196.eps,width=0.61in,clip=t}\\}
\end{tabular}
} \mycaption{Hand pose estimates in real video sequences (RV)
using the Mean Output algorithm (MO).}{\small \CapRTestIE}{}
\label{fig:Real90TestIE}
\end{figure*}
\psfigurepath{./figs/RealResultsHand2}
\begin{figure*}[ht]
\centerline{
\begin{tabular}{ccccccc}
RV & MO& S1& S2& S3 & S4 & S12 \\
%
\psfig{figure=Subsampled1_F71_770.rle.00010.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_0_R10i_Fts_Subsampled1_F71_770.mat.00010.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00010_001.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00010_002.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00010_003.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00010_004.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00010_012.eps,width=0.61in,clip=t}\\
%
\psfig{figure=Subsampled1_F71_770.rle.00019.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_0_R10i_Fts_Subsampled1_F71_770.mat.00019.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00019_001.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00019_002.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00019_003.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00019_004.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00019_012.eps,width=0.61in,clip=t}\\
%
\psfig{figure=Subsampled1_F71_770.rle.00028.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_0_R10i_Fts_Subsampled1_F71_770.mat.00028.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00028_001.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00028_002.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00028_003.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00028_004.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00028_012.eps,width=0.61in,clip=t}\\
%
\psfig{figure=Subsampled1_F71_770.rle.00037.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_0_R10i_Fts_Subsampled1_F71_770.mat.00037.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00037_001.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00037_002.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00037_003.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00037_004.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00037_012.eps,width=0.61in,clip=t}\\
%
\psfig{figure=Subsampled1_F71_770.rle.00046.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_0_R10i_Fts_Subsampled1_F71_770.mat.00046.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00046_001.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00046_002.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00046_003.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00046_004.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00046_012.eps,width=0.61in,clip=t}\\
%
\psfig{figure=Subsampled1_F71_770.rle.00055.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_0_R10i_Fts_Subsampled1_F71_770.mat.00055.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00055_001.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00055_002.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00055_003.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00055_004.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00055_012.eps,width=0.61in,clip=t}\\
%
\psfig{figure=Subsampled1_F71_770.rle.00064.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_0_R10i_Fts_Subsampled1_F71_770.mat.00064.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00064_001.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00064_002.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00064_003.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00064_004.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00064_012.eps,width=0.61in,clip=t}\\
%
\psfig{figure=Subsampled1_F71_770.rle.00073.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_0_R10i_Fts_Subsampled1_F71_770.mat.00073.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00073_001.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00073_002.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00073_003.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00073_004.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00073_012.eps,width=0.61in,clip=t}\\
%
\psfig{figure=Subsampled1_F71_770.rle.00082.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_0_R10i_Fts_Subsampled1_F71_770.mat.00082.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00082_001.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00082_002.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00082_003.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00082_004.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00082_012.eps,width=0.61in,clip=t}\\
%
\psfig{figure=Subsampled1_F71_770.rle.00091.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_0_R10i_Fts_Subsampled1_F71_770.mat.00091.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00091_001.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00091_002.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00091_003.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00091_004.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00091_012.eps,width=0.61in,clip=t}\\
\end{tabular}}
\mycaption{Hand pose estimates in real sequences using multiple
sampling algorithm}{\small \CapRTestII}{} \label{fig:Real90TestII}
\end{figure*}
\subsection{3D Hand Pose Reconstruction Given an Unrestricted Camera Viewpoint}
\label{sec:3DHan}
Our approach is now tested in the task of recovering 3D human hand pose
from an unknown camera viewpoint. For training, we used the {\it
Hand-All-Views} dataset, which contains a total of approximately
750,000 examples. Of these, 18,000 were used for training and the
rest for testing. The input-output pairs were then defined as
follows. The input consisted of seven Hu moments computed from
the silhouette of the hand, as described in Sec.\
\ref{sec:AppsHand}. The output consisted of 20 internal joint
angles of the hand and two orientation angles. This 22 DOF
representation was linearly encoded by nine values using PCA.
The number of \changed{mixture components} (mapping functions) was
set to 45. This number was determined via the MDL criterion, as
before (testing for the best MDL score using models with $35, 37,
\ldots, 51$ functions). Each function was an MLP with seven hidden
nodes.
\subsubsection{Quantitative Results}
As before, we computed the absolute error in estimating hand pose, and
quantitatively compared this measure across views. Fig.\
\ref{fig:HandPerf1SampleS} shows the error of the most likely estimate
found using the MO approach. From the graphs we see that views towards
the palm of the hand ($90^\circ$) are slightly easier to reconstruct
on average, while the variance seems similar across views. As
expected, the average error is higher than that obtained for the fixed
view hand pose reconstruction experiments. It seems that for
unrestricted hand views it is slightly advantageous to use the computer
graphics feedback function $\zeta$. This is probably because
estimating this inverse mapping $\hat\zeta$ \changed{(to define the
generative model)} over unrestricted viewpoint is more complicated
than for only frontal hand views (and the mapping is likely to be more
complex also).
%%RChange previous paragraph
%SChange Previous paragraph
Fig.\ \ref{fig:HandPerf1SampleM} shows the results using the MS
approach. Fig.\ \ref{fig:HandPerf1SampleM}(a) shows the error
associated with the best sample. This error behaves very similarly to
the MO error. Fig.\ \ref{fig:HandPerf1SampleM}(b) shows the average
error computed using the best 20 samples. This error is higher than
that of the best sample. Note that this is not an obvious result given
that the best sample is determined without having knowledge of
ground-truth. In fact, if the average error of the best 20 samples
were lower than that of the best sample, then we could infer that our
algorithm is very inaccurate at determining what samples are
better. Thus this result positively endorses our MS algorithm.
%%RRChange Thus this result positively endorses our MS algorithm.
%%SS: OK
For comparison, we used the ground-truth to select the best sample,
based on minimum error. In other words, we have an oracle that picks
the sample closest to the ground-truth. The resulting performance
graph is shown in Fig.\ \ref{fig:HandPerf1SampleM}(c). This
represents the lower-bound on the reconstruction error using the
learned forward model. The graph is interesting in the sense that it
separates the errors from the forward and feedback models.
%ATT!!!!
%The feedback model produces a RMSE $< 0.35$ across views. This is
%roughly half the total RMSE error produced by our method overall.
\psfigurepath{./figs}
\begin{figure}[t]
\centerline{(a)
\psfig{figure=GraphViewsRes_softH2V31-7_7_GR7i.mat.eps,width=1.8in,clip=t}
%GG_Res_softH2V31-7_7_GR7i.mat.EType_0.eps,width=3in,clip=t}
~~~(b)
\psfig{figure=GraphViewsRes_softH2V31-7_7_R7iE.mat.eps,width=1.8in,clip=t}
%\psfig{figure=GG_Res_softH2V31-7_7_R7iE.mat.EType_0.eps,width=3in,clip=t}
} \mycaption{Unrestricted view model performance using Mean Output
(MO) and $\hat{\zeta}$}{\small Mean Output (MO) inference
performance for unrestricted view tests at given viewpoint
latitudes (averaging over longitude). The feedback function is (a)
the estimated $\hat\zeta$ (b) the computer graphics rendering
$\zeta$. A frontal view of the hand palm is at latitude
$\beta_1=\pi/2$, longitude $\beta_2=0$. For reference, the performance of an algorithm that chooses the estimate at random from the training data is shown. The angle range is on average 1.87 radians}{}
\label{fig:HandPerf1SampleS}
\end{figure}
%%RChange previous figure
%SChange previous caption
\psfigurepath{./figs}
\begin{figure}[t]
\centerline{\small (a)
\psfig{figure=GraphViewsMS1.eps,width=1.8in,clip=t}
%GG_Res_softH2V31-7_7_R7iM.mat.EType_1.eps,width=2.0in,clip=t}
\small (b)
\psfig{figure=GraphViewsMS2.eps,width=1.8in,clip=t}
%\psfig{figure=GG_Res_softH2V31-7_7_R7iM.mat.EType_2.eps,width=2.0in,clip=t}
\small (c)
\psfig{figure=GraphViewsMS3.eps,width=1.8in,clip=t}
%\psfig{figure=GG_Res_softH2V31-7_7_R7iM.mat.EType_3.eps,width=2.0in,clip=t}
} \mycaption{Unrestricted view model performance using multiple
sampling and $\hat{\zeta}$}{\small Multiple Samples (MS) inference
for unrestricted view tests at given viewpoint latitudes
(averaging over longitude). The feedback function is the estimated
$\hat{\zeta}$. A frontal view of the hand palm is at latitude
$\beta_1=\pi/2$, longitude $\beta_2=0$. (a) Most probable sample.
(b) Average over all samples (20 most probable samples taken). (c)
Best sample (determined using ground-truth information for
comparison). For reference, the performance of an algorithm that chooses the estimate at random from the training data is shown. The angle range is on average 1.87 radians}{} \label{fig:HandPerf1SampleM}
\end{figure}
\subsubsection{Performance Comparison with Respect to Discriminative Model Alone and a Competing Approach}
\label{sec:3DHanComparison}
\changedRev{
In parallel with Sec.~\ref{sec:H90Comparison}, we now compare our
full method against the purely discriminative portion. This is done
to illustrate the level of contribution of the generative model in the
overall approach. Similarly as before, we also compare our method
against the standard MLP for this dataset.}
\changedRev{
Using the discriminative model alone, the mean absolute error and
variance for this dataset were $0.6102$ and $0.5117$
respectively. Since the importance of the generative method in the
overall approach should by now be clear, we will not discuss
this point further. Results are analogous to those from
Sec.~\ref{sec:H90Comparison}.}
\changedRev{ As before, we compare our full approach against the
standard MLP (trained on the same training set as our
approach). Results are shown in Table \ref{tab:3DHanComp}. Note that
when the MLP contains 47 hidden nodes, the number of parameters is
comparable with that of our discriminative-generative model. The
performance of our approach for this dataset is shown in
Figs.~\ref{fig:HandPerf1SampleS} and \ref{fig:HandPerf1SampleM}.}
\changedRev{
\begin{table}[t] {
\begin{center}
\begin{tabular}{|c|c|c|c|c|c|c|c|c|c|c}
\hline
Number of Hidden Nodes & 35 & {\bf 47} & 59 & 71 & 83 & 95 \\
\hline
\hline
$\hat{\cl E}_{MLP}$ & 0.5775 & {\bf 0.5714} & 0.5585 & 0.5534 & 0.5511 & 0.5514 \\
\hline
$\sigma^2_{\hat{\cl E}_{MLP}}$ & 0.3572 & {\bf 0.3680} & 0.3512 & 0.3637 & 0.3794 & 0.4111 \\
\hline
\end{tabular}
\end{center}}
\mycaptionS{Performance comparison between our
discriminative-generative approach against the standard MLP (results
are averages from 10 runs per model). The table shows the mean
absolute error and variance using the same training/test sets as our
method. Overall the performance of our method is 1.31
times better than this approach using the same number of parameters
and 1.28 times better when letting the MLP have any number
of parameters in the table (83 hidden nodes). }
\label{tab:3DHanComp}
\end{table}}
\changedRev{At first sight the performance comparison seems similar to
that of our previous task with fixed viewpoint. However, a more
careful look at Table~\ref{tab:3DHanComp} reveals that (1) our method
clearly outperforms the MLP even when the MLP uses more than double
the number of parameters with respect to our model, a significant
difference from Sec.~\ref{sec:H90Comparison} (fixed viewpoint) where
performance was more even when letting the MLP have more parameters;
(2) also unlike Sec.~\ref{sec:H90Comparison} the variance is much
larger than that of the estimates computed by our approach. The key
difference between the fixed viewpoint dataset and this dataset
(unrestricted view) is that the mapping from visual features to hand
pose is much more ambiguous when any view is allowed. This illustrates
that one-to-many datasets are not well suited for function approximation
methods, and our method can indeed provide a clearer advantage in
these cases.}
\hide{By comparing the results in
Figs.~\ref{fig:HandPerf1SampleS} and \ref{fig:HandPerf1SampleM}
(illustrating the performance of our proposed method) and
\ref{tab:3DHan0Comp}}
\subsubsection{Experiments with Real Images}
\label{sec:HAnyRealImgs}
We test our approach using video of hands (in any orientation)
collected from a single uncalibrated camera. Pose estimates from 40
frames (taken 0.9 secs apart) obtained via the MO approach are
shown in Fig.\ \ref{fig:RealAnyTestIE}. Note that there are
incorrectly-segmented hands in this sequence. We decided to leave
these in to avoid frame rearrangements (losing the uniform frame
sampling), to show that segmentation does not always work correctly,
and to show that this approach is inherently robust to extreme
segmentation errors. In this experiment, there was usually visual
agreement between reconstruction and estimate as seen in the
figure. Note that even for a human observer, looking at the segmented
silhouettes in the figure, reconstruction is sometimes
ambiguous. There are also some configurations for which the system did
not perform correctly.
Fig.\ \ref{fig:RealAnyTestIIE} shows the estimates obtained via the MS
approach. The frames shown were taken approximately every 0.9
seconds. In the second row, we can see some limitations of the Hu
moment feature space: sometimes, different hand orientations are very
similar in the feature space. These apparently different hypotheses
are close to each other in terms of their probability, given the
features. The same effect repeats clearly in the third and sixth
row. This problem might be alleviated by using a different input
feature space. At an extreme one might consider the full silhouette as
a feature. Of course there are important trade-offs to take into
account when considering different features; e.g., invariants, and
dimensionality.
\psfigurepath{./figs/RealResultsH2Unr}
\begin{figure*}[ht]
\centerline{\small
\begin{tabular}{rcccccccccc}
RV &
\psfig{figure=Subsampled1_F771_1269.rle.00001.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F771_1269.rle.00006.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F771_1269.rle.00011.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F771_1269.rle.00016.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F771_1269.rle.00021.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F771_1269.rle.00026.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F771_1269.rle.00031.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F771_1269.rle.00036.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F771_1269.rle.00041.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F771_1269.rle.00046.eps,width=0.61in,clip=t} \\
%
MO &
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00001.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00006.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00011.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00016.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00021.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00026.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00031.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00036.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00041.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00046.eps,width=0.61in,clip=t} \\
\out{
\\
\hline
\\
%
RV &
\psfig{figure=Subsampled1_F771_1269.rle.00051.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F771_1269.rle.00056.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F771_1269.rle.00061.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F771_1269.rle.00066.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F771_1269.rle.00071.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F771_1269.rle.00076.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F771_1269.rle.00081.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F771_1269.rle.00086.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F771_1269.rle.00091.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F771_1269.rle.00096.eps,width=0.61in,clip=t} \\
%
MO &
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00051.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00056.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00061.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00066.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00071.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00076.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00081.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00086.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00091.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00096.eps,width=0.61in,clip=t} \\}
%
\\
\hline
\\
%
RV &
\psfig{figure=Subsampled1_F771_1269.rle.00101.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F771_1269.rle.00106.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F771_1269.rle.00111.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F771_1269.rle.00116.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F771_1269.rle.00121.eps,width=0.61in,clip=t}&
\psfig{figure=Subsampled1_F771_1269.rle.00126.eps,width=0.61in,clip=t}&
\psfig{figure=Subsampled1_F771_1269.rle.00131.eps,width=0.61in,clip=t}&
\psfig{figure=Subsampled1_F771_1269.rle.00136.eps,width=0.61in,clip=t}&
\psfig{figure=Subsampled1_F771_1269.rle.00141.eps,width=0.61in,clip=t}&
\psfig{figure=Subsampled1_F771_1269.rle.00146.eps,width=0.61in,clip=t}\\
%
MO &
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00101.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00106.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00111.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00116.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00121.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00126.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00131.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00136.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00141.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00146.eps,width=0.61in,clip=t}\\
%
\\
\hline
\\
%
RV &
\psfig{figure=Subsampled1_F771_1269.rle.00151.eps,width=0.61in,clip=t}&
\psfig{figure=Subsampled1_F771_1269.rle.00156.eps,width=0.61in,clip=t}&
\psfig{figure=Subsampled1_F771_1269.rle.00161.eps,width=0.61in,clip=t}&
\psfig{figure=Subsampled1_F771_1269.rle.00166.eps,width=0.61in,clip=t}&
\psfig{figure=Subsampled1_F771_1269.rle.00171.eps,width=0.61in,clip=t}&
\psfig{figure=Subsampled1_F771_1269.rle.00176.eps,width=0.61in,clip=t}&
\psfig{figure=Subsampled1_F771_1269.rle.00181.eps,width=0.61in,clip=t}&
\psfig{figure=Subsampled1_F771_1269.rle.00186.eps,width=0.61in,clip=t}&
\psfig{figure=Subsampled1_F771_1269.rle.00191.eps,width=0.61in,clip=t}&
\psfig{figure=Subsampled1_F771_1269.rle.00196.eps,width=0.61in,clip=t}\\
%
MO &
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00151.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00156.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00161.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00166.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00171.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00176.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00181.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00186.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00191.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00196.eps,width=0.61in,clip=t}\\
\end{tabular}}
\mycaption{Estimated hand poses from real sequences using Mean
Output (MO) algorithm and $\zeta$}{\small \CapRTestIE}{}
\label{fig:RealAnyTestIE}
\end{figure*}
\psfigurepath{./figs/RealResultsH2UnrM}
\begin{figure*}[ht]
\centerline{
\begin{tabular}{ccccccc}
RV & MO & S1 & S2 & S3 & S4 & S12 \\
%
\psfig{figure=Subsampled1_F771_1269.rle.00151.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00151.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00051_001.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00051_002.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00051_003.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00051_004.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00051_012.eps,width=0.61in,clip=t}\\
%
\psfig{figure=Subsampled1_F771_1269.rle.00160.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00160.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00054_001.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00054_002.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00054_003.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00054_004.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00054_012.eps,width=0.61in,clip=t}\\
%
\psfig{figure=Subsampled1_F771_1269.rle.00169.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00169.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00057_001.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00057_002.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00057_003.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00057_004.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00057_012.eps,width=0.61in,clip=t}\\
%
\psfig{figure=Subsampled1_F771_1269.rle.00178.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00178.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00060_001.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00060_002.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00060_003.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00060_004.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00060_012.eps,width=0.61in,clip=t}\\
%
\psfig{figure=Subsampled1_F771_1269.rle.00187.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00187.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00063_001.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00063_002.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00063_003.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00063_004.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00063_012.eps,width=0.61in,clip=t}\\
%
\psfig{figure=Subsampled1_F771_1269.rle.00196.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00196.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00066_001.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00066_002.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00066_003.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00066_004.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00066_012.eps,width=0.61in,clip=t}\\
%
\psfig{figure=Subsampled1_F771_1269.rle.00214.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00214.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00072_001.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00072_002.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00072_003.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00072_004.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00072_012.eps,width=0.61in,clip=t}\\
%
\psfig{figure=Subsampled1_F771_1269.rle.00223.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00223.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00075_001.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00075_002.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00075_003.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00075_004.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00075_012.eps,width=0.61in,clip=t}\\
%
\psfig{figure=Subsampled1_F771_1269.rle.00001.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00001.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00001_001.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00001_002.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00001_003.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00001_004.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00001_012.eps,width=0.61in,clip=t}\\
%
\psfig{figure=Subsampled1_F771_1269.rle.00064.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00064.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00022_001.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00022_002.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00022_003.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00022_004.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00022_012.eps,width=0.61in,clip=t}\\
%
\psfig{figure=Subsampled1_F771_1269.rle.00091.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00091.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00031_001.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00031_002.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00031_003.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00031_004.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00031_012.eps,width=0.61in,clip=t}\\
% Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00031_012
\end{tabular}} \mycaption{Estimated hand poses from real video (RV) sequences using
Mean Output (MO) and Multiple Samples (MS) inference.}{\small \CapRTestIIE}{}
\label{fig:RealAnyTestIIE}
\end{figure*}
\subsection{2D Human Body Pose Reconstruction}\label{sec:2DHum}
In order to show that our approach can be employed, with no change, to
perform other similar tasks (possibly with a different
representation), here we now conduct performance tests in the task of
estimating human body pose from a single image. The goal is to
estimate the 2D locations of body markers in the image, given visual
features computed from the person's silhouette. In this experiment, we
use the {\it Body-All-Views} dataset, which contains a total of
over 100,000 samples. Of these, 8,000 were used for training and the
rest for testing. The input-output pairs were defined as follows. The
input consisted of the 10 Alt moments computed from the
silhouette. The output consisted of 20 2D marker positions (40 DOF),
which were then linearly encoded by nine values using PCA.
The number of \changed{mixture components in the discriminative model}
was set to 15. This number was determined via the MDL criterion,
exactly as before. Each function is an MLP with seven hidden nodes.
\subsubsection{Quantitative Results}
Fig.\ \ref{fig:ArtC} shows the reconstruction obtained with the MO
approach for frames taken from three synthetic sequences
excluded from the training set.
The agreement between reconstruction and observation is easy to
perceive for all frames. Also, for self-occluding configurations,
the estimate is still similar to ground-truth.
%It is important to
%remark that no human intervention nor pose initialization was
%required.
%RRChange, It is important to remark that
%SS: Removed. This is redundant. I can't point to at least
% two other places
% in the paper where you say this already.
Fig.\ \ref{fig:ArtCP} shows the average marker error and variance per
body orientation in percentage of body height. Note that the error is
bigger for orientations closer to $0$ and $\pi$ radians. This
intuitively agrees with the notion that at those angles (side-views),
there is less visibility of the body parts. We consider this
performance promising, given the complexity of the task and the
simplicity of the approach. Just as a reference point, by choosing
poses at random from those in the training set, the RMSE was 10.35\%
of body height (with a standard deviation of 4.4\%). In related work,
quantitative performance has usually been ignored, in part due to the
lack of ground-truth and standard evaluation datasets.
%SChanged above paragraph
%% \begin{figure}[h]
%% \parbox[c]{0.615\textwidth}{
%% \psfigurepath{../NIPS01/epsArt}
%% \centerline{GT
%% \psfig{figure=ArtSil_00000.Art40.eps,width=0.65in,clip=t}
%% \hspace*{-0.5em}\psfig{figure=ArtSil_00001.Art40.eps,width=0.65in,clip=t}
%% \hspace*{-0.5em}\psfig{figure=ArtSil_00002.Art40.eps,width=0.65in,clip=t}
%% \hspace*{-0.5em}\psfig{figure=ArtSil_00019.Art40.eps,width=0.65in,clip=t}
%% \hspace*{-0.5em}\psfig{figure=ArtSil_00023.Art40.eps,width=0.65in,clip=t}
%% \hspace*{-0.5em}\psfig{figure=ArtSil_00038.Art40.eps,width=0.65in,clip=t}
%% } \centerline{MO
%% \psfig{figure=000000-431602080.Art40.tif.eps,width=0.65in,clip=t}
%% \hspace*{-0.5em}\psfig{figure=000001-431602080.Art40.tif.eps,width=0.65in,clip=t}
%% \hspace*{-0.5em}\psfig{figure=000002-431602080.Art40.tif.eps,width=0.65in,clip=t}
%% \hspace*{-0.5em}\psfig{figure=000019-431602080.Art40.tif.eps,width=0.65in,clip=t}
%% \hspace*{-0.5em}\psfig{figure=000023-431602080.Art40.tif.eps,width=0.65in,clip=t}
%% \hspace*{-0.5em}\psfig{figure=000038-431602080.Art40.tif.eps,width=0.65in,clip=t}
%% } \vspace{-.2in}\rule[.0in]{4.0in}{0.01in}
%% \psfigurepath{../NIPS01/epsArt2}
%% \centerline{GT
%% \psfig{figure=ArtSil_00004.Art40.eps,width=0.65in,clip=t}
%% \hspace*{-0.5em}\psfig{figure=ArtSil_00005.Art40.eps,width=0.65in,clip=t}
%% \hspace*{-0.5em}\psfig{figure=ArtSil_00006.Art40.eps,width=0.65in,clip=t}
%% \hspace*{-0.5em}\psfig{figure=ArtSil_00007.Art40.eps,width=0.65in,clip=t}
%% \hspace*{-0.5em}\psfig{figure=ArtSil_00009.Art40.eps,width=0.65in,clip=t}
%% \hspace*{-0.5em}\psfig{figure=ArtSil_00013.Art40.eps,width=0.65in,clip=t}
%% } \centerline{MO
%% \psfig{figure=000004-431602080.Art40.tif.eps,width=0.65in,clip=t}
%% \hspace*{-0.5em}\psfig{figure=000005-431602080.Art40.tif.eps,width=0.65in,clip=t}
%% \hspace*{-0.5em}\psfig{figure=000006-431602080.Art40.tif.eps,width=0.65in,clip=t}
%% \hspace*{-0.5em}\psfig{figure=000007-431602080.Art40.tif.eps,width=0.65in,clip=t}
%% \hspace*{-0.5em}\psfig{figure=000009-431602080.Art40.tif.eps,width=0.65in,clip=t}
%% \hspace*{-0.5em}\psfig{figure=000013-431602080.Art40.tif.eps,width=0.65in,clip=t}
%% } \vspace{-.4in}\rule[.0in]{4.0in}{0.01in} \centerline{GT
%% \psfig{figure=ArtSil_00035.Art40.eps,width=0.65in,clip=t}
%% \psfig{figure=ArtSil_00036.Art40.eps,width=0.65in,clip=t}
%% \hspace*{-0.5em}\psfig{figure=ArtSil_00038.Art40.eps,width=0.65in,clip=t}
%% \hspace*{-0.5em}\psfig{figure=ArtSil_00041.Art40.eps,width=0.65in,clip=t}
%% \hspace*{-0.5em}\psfig{figure=ArtSil_00045.Art40.eps,width=0.65in,clip=t}
%% \hspace*{-0.5em}\psfig{figure=ArtSil_00049.Art40.eps,width=0.65in,clip=t}
%% } \centerline{MO
%% \psfig{figure=000035-431602080.Art40.tif.eps,width=0.65in,clip=t}
%% \hspace*{-0.5em}\psfig{figure=000036-431602080.Art40.tif.eps,width=0.65in,clip=t}
%% \hspace*{-0.5em}\psfig{figure=000038-431602080.Art40.tif.eps,width=0.65in,clip=t}
%% \hspace*{-0.5em}\psfig{figure=000041-431602080.Art40.tif.eps,width=0.65in,clip=t}
%% \hspace*{-0.5em}\psfig{figure=000045-431602080.Art40.tif.eps,width=0.65in,clip=t}
%% \hspace*{-0.5em}\psfig{figure=000049-431602080.Art40.tif.eps,width=0.65in,clip=t}
%% } } \hfill
%% \parbox[c]{0.42\textwidth}{
%% \psfigurepath{../NIPS01/eps}
%% \centerline{\psfig{figure=ViewPointTest.eps,width=0.4\textwidth,clip=t}}}
%% \mycaptionS{\small Left: Example reconstruction of several test
%% sequences with CG-generated silhouettes. Each set consists of
%% input images and reconstruction (every 5th frame). Right: Marker
%% root-mean-square-error and variance per camera viewpoint (every
%% $2\pi/32$ rads.). Units are percentage of body height. Approx.
%% 110,000 test poses were used. } \label{fig:ArtC}
%% \end{figure}
\psfigurepath{../NIPS01/epsArt/}
\begin{figure}[t]
\centerline{\small
\begin{tabular}{rccccccccc}
GT &
\psfig{figure=ArtSil_00000.Art40.eps,width=0.65in,clip=t} &
\psfig{figure=ArtSil_00001.Art40.eps,width=0.65in,clip=t} &
\psfig{figure=ArtSil_00002.Art40.eps,width=0.65in,clip=t} &
\psfig{figure=ArtSil_00019.Art40.eps,width=0.65in,clip=t} &
\psfig{figure=ArtSil_00023.Art40.eps,width=0.65in,clip=t} &
\psfig{figure=ArtSil_00038.Art40.eps,width=0.65in,clip=t} &
\psfigurepath{../NIPS01/epsArt2/}
\psfig{figure=ArtSil_00004.Art40.eps,width=0.65in,clip=t} &
\psfigurepath{../NIPS01/epsArt2/}
\psfig{figure=ArtSil_00005.Art40.eps,width=0.65in,clip=t} &
\psfigurepath{../NIPS01/epsArt2/}
\psfig{figure=ArtSil_00006.Art40.eps,width=0.65in,clip=t} \\
MO &
\psfig{figure=000000-431602080.Art40.tif.eps,width=0.65in,clip=t} &
\psfig{figure=000001-431602080.Art40.tif.eps,width=0.65in,clip=t} &
\psfig{figure=000002-431602080.Art40.tif.eps,width=0.65in,clip=t} &
\psfig{figure=000019-431602080.Art40.tif.eps,width=0.65in,clip=t} &
\psfig{figure=000023-431602080.Art40.tif.eps,width=0.65in,clip=t} &
\psfig{figure=000038-431602080.Art40.tif.eps,width=0.65in,clip=t} &
\psfigurepath{../NIPS01/epsArt2/}
\psfig{figure=000004-431602080.Art40.tif.eps,width=0.65in,clip=t} &
\psfigurepath{../NIPS01/epsArt2/}
\psfig{figure=000005-431602080.Art40.tif.eps,width=0.65in,clip=t} &
\psfigurepath{../NIPS01/epsArt2/}
\psfig{figure=000006-431602080.Art40.tif.eps,width=0.65in,clip=t} \\
\\
\hline
\\
GT &
\psfigurepath{../NIPS01/epsArt2/}\psfig{figure=ArtSil_00007.Art40.eps,width=0.65in,clip=t} &
\psfigurepath{../NIPS01/epsArt2/}\psfig{figure=ArtSil_00009.Art40.eps,width=0.65in,clip=t} &
\psfigurepath{../NIPS01/epsArt2/}\psfig{figure=ArtSil_00013.Art40.eps,width=0.65in,clip=t} &
\psfigurepath{../NIPS01/epsArt2/}\psfig{figure=ArtSil_00035.Art40.eps,width=0.65in,clip=t} &
\psfigurepath{../NIPS01/epsArt2/}\psfig{figure=ArtSil_00036.Art40.eps,width=0.65in,clip=t} &
\psfigurepath{../NIPS01/epsArt2/}\psfig{figure=ArtSil_00038.Art40.eps,width=0.65in,clip=t} &
\psfigurepath{../NIPS01/epsArt2/}\psfig{figure=ArtSil_00041.Art40.eps,width=0.65in,clip=t} &
\psfigurepath{../NIPS01/epsArt2/}\psfig{figure=ArtSil_00045.Art40.eps,width=0.65in,clip=t} &
\psfigurepath{../NIPS01/epsArt2/}\psfig{figure=ArtSil_00049.Art40.eps,width=0.65in,clip=t} \\
MO &
\psfigurepath{../NIPS01/epsArt2/}\psfig{figure=000007-431602080.Art40.tif.eps,width=0.65in,clip=t} &
\psfigurepath{../NIPS01/epsArt2/}\psfig{figure=000009-431602080.Art40.tif.eps,width=0.65in,clip=t} &
\psfigurepath{../NIPS01/epsArt2/}\psfig{figure=000013-431602080.Art40.tif.eps,width=0.65in,clip=t} &
\psfigurepath{../NIPS01/epsArt2/}\psfig{figure=000035-431602080.Art40.tif.eps,width=0.65in,clip=t} &
\psfigurepath{../NIPS01/epsArt2/}\psfig{figure=000036-431602080.Art40.tif.eps,width=0.65in,clip=t} &
\psfigurepath{../NIPS01/epsArt2/}\psfig{figure=000038-431602080.Art40.tif.eps,width=0.65in,clip=t} &
\psfigurepath{../NIPS01/epsArt2/}\psfig{figure=000041-431602080.Art40.tif.eps,width=0.65in,clip=t} &
\psfigurepath{../NIPS01/epsArt2/}\psfig{figure=000045-431602080.Art40.tif.eps,width=0.65in,clip=t} &
\psfigurepath{../NIPS01/epsArt2/}\psfig{figure=000049-431602080.Art40.tif.eps,width=0.65in,clip=t} \\
\end{tabular}
}\hfill \mycaptionS{\small Example reconstruction of frames from
test sequences with computer graphics-generated silhouettes.
%Each %several
%set consists of input images and reconstruction.
% (every 5th frame).
} \label{fig:ArtC}
\end{figure}
\psfigurepath{../NIPS01/eps}
\begin{figure}[t]
\centerline{\psfig{figure=ViewPointTest.eps,width=0.3\textwidth,clip=t}}
\mycaptionS{\small Root mean-square-error (divided by number of
markers) and variance per camera viewpoint (every $2\pi/32$
rads.). Units are percentage of body height. Approx. 110,000 test
poses were used.}
\label{fig:ArtCP}
\end{figure}
\subsubsection{Experiments with Real Images}
We now test the approach using real video sequences of human body
motion. We use the basic segmentation approach described in
Sec.~\ref{sec:BodyDet} to obtain silhouettes. Fig.~\ref{fig:ExampR0}
shows examples of system performance obtained via the MO approach for
several relatively complex motion sequences. Even though the
characteristics of the segmented body differ from the ones used for
training, good performance is still achieved. Most reconstructions are
visually close to what can be thought of as the right pose
reconstruction. Body orientation is also accurate. \changed{In the
Figure, we can see two particularly difficult configurations at the
second row of real video (RV) images, fourth--sixth columns; the arm
configuration is difficult to estimate}. \changed{ This could be due
to the lack of relevant training data; as a consequence, the
discriminative model $q$ may not approximate the generative model $p$
very well around the input vector. In general, an important issue to
keep in mind is that the visual differences between the rendered model
and the real body observed could become critical and thus accurate
rendering may be desirable. This varies from application to
application; however in any case the general inference approach
presented here remains the same.}
%We used 60 specialized functions, each one was a MLP with five
%hidden nodes.
\hide{ Fig.\ \ref{fig:RealBodyMS} shows the top-ranked pose samples
obtained via the MS approach. Note that despite low-quality
segmentation, the system outputs reasonably accurate pose
hypotheses. Orientation is accurate and the relative limb
relationships are maintained. However, we can observe that some poses
are inherently difficult and the estimate lacks enough pose detail to
be perceived as a good estimate. For example, the eighth row shows a
side view of a person raising one arm while keeping the other arm at
rest. The resulting MS estimates all show a side-view, however none
has the correct arm configuration. }
%One difference with respect to the hand pose estimation task is
%that the rendering quality or realism for body pose is poorer for
%the human body renderer.
In this work, we did not pursue use of a more realistic human body
renderer. Due to differences in shape and width of body components
observed in training versus testing, the visual features may differ.
This is a relevant point since in almost all learning models, it is
expected that the training data be a good approximation to the real
test data. Improving the match between visual features used in
training and testing, and thus potentially the overall performance, is
an area that we plan to investigate in future
research. \changed{Despite the fact that we have ignored differences
in anthropometric characteristics between CG and real silhouettes, the
performance observed for both articulated objects (hands - human
bodies) is excellent given that only a single image is assumed
available.}
%In theory this could allow us to adapt our algorithm to different body
%or hand anthropometric characteristics.
%\hide{
\psfigurepath{../NIPS01/eps}
\begin{figure}[h]
\centerline{\small \begin{tabular}{rcccccccccc}
RV &
\psfig{figure=Sil_00001.1.eps,width=0.65in,clip=t} &
\psfig{figure=Sil_00002.1.eps,width=0.65in,clip=t} &
\psfig{figure=Sil_00003.1.eps,width=0.65in,clip=t} &
\psfig{figure=Sil_00004.1.eps,width=0.65in,clip=t} &
\psfig{figure=Sil_00005.1.eps,width=0.65in,clip=t} &
\psfig{figure=Sil_00000.2.eps,width=0.65in,clip=t} &
\psfig{figure=Sil_00001.2.eps,width=0.65in,clip=t} &
\psfig{figure=Sil_00002.2.eps,width=0.65in,clip=t} &
\psfig{figure=Sil_00000.3.eps,width=0.65in,clip=t} &
\psfig{figure=Sil_00001.3.eps,width=0.65in,clip=t}\\
MO &
\psfig{figure=000001-431602080.1.tif.eps,width=0.65in,clip=t} &
\psfig{figure=000002-431602080.1.tif.eps,width=0.65in,clip=t} &
\psfig{figure=000003-431602080.1.tif.eps,width=0.65in,clip=t} &
\psfig{figure=000004-431602080.1.tif.eps,width=0.65in,clip=t} &
\psfig{figure=000005-431602080.1.tif.eps,width=0.65in,clip=t} &
\psfig{figure=000000-431602080.2.tif.eps,width=0.65in,clip=t} &
\psfig{figure=000001-431602080.2.tif.eps,width=0.65in,clip=t} &
\psfig{figure=000002-431602080.2.tif.eps,width=0.65in,clip=t} &
\psfig{figure=000000-431602080.3.tif.eps,width=0.65in,clip=t} &
\psfig{figure=000001-431602080.3.tif.eps,width=0.65in,clip=t}\\
\\
\hline
\\
RV &
\psfig{figure=Sil_00001.4.eps,width=0.65in,clip=t} &
\psfig{figure=Sil_00002.4.eps,width=0.65in,clip=t} &
\psfig{figure=Sil_00003.4.eps,width=0.65in,clip=t} &
\psfig{figure=Sil_00004.4.eps,width=0.65in,clip=t} &
\psfig{figure=Sil_00005.4.eps,width=0.65in,clip=t} &
\psfig{figure=Sil_00006.4.eps,width=0.65in,clip=t} &
\psfig{figure=Sil_00007.4.eps,width=0.65in,clip=t} &
\psfig{figure=Sil_00008.4.eps,width=0.65in,clip=t} &
\psfig{figure=Sil_00009.4.eps,width=0.65in,clip=t} &
\psfig{figure=Sil_00010.4.eps,width=0.65in,clip=t} \\
MO &
\psfig{figure=000001-431602080.4.tif.eps,width=0.65in,clip=t} &
\psfig{figure=000002-431602080.4.tif.eps,width=0.65in,clip=t} &
\psfig{figure=000003-431602080.4.tif.eps,width=0.65in,clip=t} &
\psfig{figure=000004-431602080.4.tif.eps,width=0.65in,clip=t} &
\psfig{figure=000005-431602080.4.tif.eps,width=0.65in,clip=t} &
\psfig{figure=000006-431602080.4.tif.eps,width=0.65in,clip=t} &
\psfig{figure=000007-431602080.4.tif.eps,width=0.65in,clip=t} &
\psfig{figure=000008-431602080.4.tif.eps,width=0.65in,clip=t} &
\psfig{figure=000009-431602080.4.tif.eps,width=0.65in,clip=t} &
\psfig{figure=000010-431602080.4.tif.eps,width=0.65in,clip=t}\\
\end{tabular}
} \caption{\small Reconstruction obtained from observing a human
subject (every 10th frame).}
\label{fig:ExampR0}
\end{figure}
\hide{
\psfigurepath{./figs/ResRealBodyMS}
\begin{figure*}[ht]
\centerline{
\begin{tabular}{cccccc}
RV & S1 & S2 & S3 & S4 & S12 \\
%
\psfig{figure=Sil_00001.1.eps,width=0.65in,clip=t}&
\psfig{figure=00001_S001.1.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00001_S006.1.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00001_S007.1.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00001_S008.1.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00001_S009.1.tif.eps,width=0.65in,clip=t} \\
%
\psfig{figure=Sil_00003.1.eps,width=0.65in,clip=t}&
\psfig{figure=00003_S001.1.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00003_S002.1.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00003_S005.1.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00003_S006.1.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00003_S008.1.tif.eps,width=0.65in,clip=t}\\
%
\psfig{figure=Sil_00005.1.eps,width=0.65in,clip=t}&
\psfig{figure=00005_S001.1.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00005_S003.1.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00005_S005.1.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00005_S006.1.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00005_S011.1.tif.eps,width=0.65in,clip=t}\\
%
\psfig{figure=Sil_00001.2.eps,width=0.65in,clip=t}&
\psfig{figure=00001_S007.2.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00001_S008.2.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00001_S009.2.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00001_S010.2.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00001_S011.2.tif.eps,width=0.65in,clip=t}\\
%
\psfig{figure=Sil_00001.3.eps,width=0.65in,clip=t}&
\psfig{figure=00001_S003.3.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00001_S004.3.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00001_S005.3.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00001_S006.3.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00001_S007.3.tif.eps,width=0.65in,clip=t}\\
%
\psfig{figure=Sil_00001.4.eps,width=0.65in,clip=t}&
\psfig{figure=00001_S001.4.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00001_S002.4.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00001_S003.4.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00001_S006.4.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00001_S007.4.tif.eps,width=0.65in,clip=t}\\
%
\psfig{figure=Sil_00003.4.eps,width=0.65in,clip=t}&
\psfig{figure=00002_S002.4.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00002_S005.4.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00003_S007.4.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00003_S008.4.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00003_S010.4.tif.eps,width=0.65in,clip=t}\\
%
\psfig{figure=Sil_00005.4.eps,width=0.65in,clip=t}&
\psfig{figure=00005_S003.4.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00005_S004.4.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00005_S005.4.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00005_S006.4.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00005_S007.4.tif.eps,width=0.65in,clip=t} \\
%
\psfig{figure=Sil_00007.4.eps,width=0.65in,clip=t}&
\psfig{figure=00007_S001.4.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00007_S002.4.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00007_S003.4.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00007_S004.4.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00007_S005.4.tif.eps,width=0.65in,clip=t}\\
%
\psfig{figure=Sil_00009.4.eps,width=0.65in,clip=t}&
\psfig{figure=00009_S001.4.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00009_S002.4.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00009_S003.4.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00009_S006.4.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00009_S010.4.tif.eps,width=0.65in,clip=t}\\
%
\psfig{figure=Sil_00010.4.eps,width=0.65in,clip=t}&
\psfig{figure=00010_S001.4.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00010_S003.4.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00010_S004.4.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00010_S005.4.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00010_S006.4.tif.eps,width=0.65in,clip=t}\\
%
\psfig{figure=Sil_00012.4.eps,width=0.65in,clip=t}&
\psfig{figure=00012_S001.4.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00012_S002.4.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00012_S007.4.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00012_S009.4.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00012_S012.4.tif.eps,width=0.65in,clip=t}\\
%
\psfig{figure=Sil_00001.5.eps,width=0.65in,clip=t}&
\psfig{figure=00001_S004.5.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00001_S008.5.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00001_S009.5.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00001_S011.5.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00001_S012.5.tif.eps,width=0.65in,clip=t}\\
\end{tabular}}
\mycaptionS{Estimated body poses from real sequences obtained via
MS inference.}\label{fig:RealBodyMS}
\end{figure*}
}
\section{Conclusions}
\label{sec:Dis}
\changed{ In this paper, we have described a novel method that allows
us to infer 3D and 2D articulated body pose from observed
visual features in a single image, a problem usually regarded as
ill-posed. This was done by combining generative and discriminative
models to solve the complex probabilistic inference problem. This
approach is most useful when the generative model is accurate (\eg we
have an inverse mapping function) but it is difficult to perform
inference using this model alone.}
\changed{
In order to solve the inference problem (and also perform MAP
estimation), we have shown that a mathematically sound approach is to
use a discriminative model and learn its parameters using relevant
training data. The probability distribution implied by the
discriminative model can be used as a proposal distribution to
generate samples and find a posterior probability distribution
(perform approximate inference) under the (accurate but complex)
generative model.}
%q=p, then done???? OIK
%% In this paper, we have described a novel method that allow us to
%% combine generative and discriminative models for proabbilistic
%% inference. The SMA employs a set of several mapping functions that are
%% learned from training data. Each specialized function maps certain
%% domains of the input space onto the output space. The SMA learning
%% formulation uses ideas from Maximum Likelihood estimation and latent
%% variable models. A variant of the Expectation-Maximization algorithm
%% is used for simultaneous learning of the specialized domains along
%% with the mapping functions. One key advantage of the SMA is that it
%% can model ambiguous, one-to-many mappings that may yield multiple
%% valid output hypotheses.
%% %Once learned, the mapping
%% %functions generate a set of output hypotheses for a given input
%% %via a statistical inference procedure.
%% Another key advantage of the SMA formulation is its incorporation
%% of a feedback or inverse function, $\zeta$ in statistical
%% inference.
%% %if
%% %desired.}
%% %To the best
%% %of our knowledge, we do not know of any other probabilistic
%% %formulation undertaking these ideas.
\changed{ When comparing it to other relevant methods, we can find
alternative (dual) interpretations of this framework. The use of a
generative model (through $\zeta$) affords an alternative to complex
discriminative models; for example, it is an alternative to the gating
networks of the Mixture of Experts paradigm \cite{Jordan94}. In
general, instead of learning increasingly complex discriminative
models such as \cite{Hinton98,Friedman91}, we can exploit an accurate
generative model and learn a simpler discriminative
model. \changedRom{A clear advantage of using a generative model in
this way is that it can provide useful information on the structure of
the problem, a structure that discriminative models try to blindly
uncover. }}
\out{The discriminative
model in our approach assumes that the mixing factors are independent
of the input, as seen in Sec.\ \ref{sec:ProMod}. At first sight, this
seems to limit the architecture's expressiveness. However, the
combination of discriminative (also referred here as 'forward') and
generative models eliminates this independence assumption. In other
words, the generative model $\zeta$ provides an alternative that
avoids increasing the discriminative model complexity without
restricting model expressiveness.}
%% Note that
%% in our formulation formulation, different sets of appropriate
%% conditional independence assumptions are specified by the forward and
%% inverse models.
%% In
%% applications such as those presented in this paper, $\zeta$ can be a
%% computer graphics rendering function or an approximation $\hat{\zeta}$
%% can itself be learned from training data. Thus, the SMA exploits
%% available prior information about the structure of the problem.
%%RRChange This allowed us... [very important]
%%SS: OK. I reworded slightly to make it clearer/shorter.
%%RR: If you remove 'if desired' it would be OK, since SMA needs them both
%%SS: Hope it's OK with you.
%%RR:
%% I would prefer this, hope it is clear what's the point from my email
%% 'Another key advantage of the SMA formulation is its incorporation of a
%% feedback or inverse function, $\zeta$ in statistical inference. This
%% allowed us to derive an inference method was based on the possibility
%% of alternatively use different sets of conditional independence
%% assumptions specified by the forward and inverse models'
%%RRChange To the best of our knowledge, we do not know of any other probabilistic formulation undertaking these ideas. [I think we should emphasize the novelty here]
%%SS: I removed this. added word ``novel'' in prior sentence.
%%RR: Do you think it is too risky to say that? or why did you remove it?
%%RRChange [deleted] ....learned from training data \footnote{It is important to add that the use of $\zeta$ does not limit the possibility of having multi-modal posteriors over $\mb{x}$.}
%%SS: OK
Our approach was demonstrated in a computer vision system that can
estimate the articulated pose parameters of a human body or human
hands, given features computed from a single image. This is a
particularly difficult problem because this mapping is highly
ambiguous, complex and it is infeasible to perform inference using the
discriminative model. We have obtained promising results even using a
very simple set of image features, such as moment invariants of the
body silhouette. Choosing the best subset of image features for this
application is by itself a complex problem, and a topic of ongoing
research.
This approach offers several advantages over many previous methods for
articulated pose estimation. These have tried in numerous ways to use
camera geometry and/or model registration to perform pose estimation,
resulting in iterative procedures that require careful choice of
initial conditions (model placement). We have shown how in some cases
these alternative approaches could be seen as inferring a posterior
distribution using the generative model only. \changedRom{We have used
camera geometry for defining our generative model but have not tried
to solve the resulting optimization problem directly, instead we have
had the help of the proposed discriminative model}. In our approach no
iterative minimization methods are used in pose inference. Moreover,
inference is fully automatic -- no manual initialization of the
articulated model is required. \changedRev {Although our method does
not use iterative optimization for inference, it is related to a
family of approaches that use top-down error correction along with a
bottom up process in an iterative fashion. In image processing for
example, one could think of this idea as useful for recovering from
segmentation errors. For some problems this is the case. However, a
word of caution is necessary: the process of re-iterating up and down
steps and obtaining estimates alternating between a pair of spaces (or
sets) does not, in general, guarantee (1) convergence towards the desired
value (pose in our case) or (2) convergence at all. Only under
specific conditions can this desirable behavior be attained. An
excellent reference guide for this problem in the context of
statistics is \cite{Csiszar84}. Interestingly this idea is related to
the Expectation Maximization algorithm for learning.}
A set of previous approaches attempt to learn articulated model
dynamics \cite{Brand99,Howe99,Perona00}; however, learning dynamics
requires substantially more training data, and tends to produce
systems that are biased towards specific motions \changedRev{(\eg
requiring that we know the motion being performed beforehand)}. Our
framework avoids this and infers pose from a single image
only. \changedRev{It is clear that in highly constrained environments
and where motion is available, models of dynamics can provide an
enormous advantage. In this paper we have defined a different problem,
where motion is not available.}
\hide{ Applications need not be limited to the vision domain. As a
simple example, one could apply this approach in speech recognition
problems, where the input space is given by features computed on
acoustic signals (\eg cepstral coefficients), and the output space
could be the space of phonemes. In this case, the generative model
(feedback function) would involve an acoustical rendering of phonemes.
} Several interesting problems remain for future work within the
context of articulated pose estimation; for example, (1) adapting the
system to a specific body morphology, one of the major issues
affecting performance, and (2) integrating pose estimation with
image segmentation for potentially greater robustness to occlusion and
noise.
\changedRev{ Another general problem is how to learn what the best
(\eg visual) features are for specific problems or datasets. This old
but important problem has spawned numerous approaches. From general
information theoretic \cite{Cover91} techniques based on maximizing
mutual information to approaches specific to image processing
(\eg \cite{Iijima73} for character recognition). Roughly speaking, one
wants to obtain a few features that can distinguish among the patterns
we care about, for a specific task or data set. In general this
problem is difficult because the structure of the space of features
cannot be represented in a simple way or made amenable to optimization. This
concept can be seen as that of {\it learning the features}, and it is
closely connected to that of {\it learning the mapping functions}; in
fact, they can be seen as two views of the same problem.}
Methods for incorporating knowledge of dynamics in the same framework
should be investigated, as discussed in
\cite{RosalesPhDThesis}. \changedRev{In this work we have concentrated
on single images, which have practical importance in many tasks, such
as model initialization (\eg for tracking), recovery (\eg when
tracking is lost or not reliable), pose from single image (\eg when
photographs need to be used as sources), etc., and should be
considered a different yet still difficult problem from that of motion
tracking or pose estimation with dynamics information.}
\hide{ this is
evident from the type of approaches followed by previous work
(Sec.~\ref{sec:RelWork})}
While promising advances have been made, extending our framework to
incorporate the above concepts remains a topic for future
investigation.
%%% Adaptive extra learning in the q model
\section*{Acknowledgments} The hand sequences used in our
experiments were collected in collaboration with Vassilis Athitsos at
Boston University. We thank Tommi Jaakkola at MIT, Quaid Morris at
University of Toronto, and Matt Brand at MERL for their valuable
suggestions and interesting discussions. This research was supported
in part by the U.S.\ Office of Naval Research under grants
N000140310108 and N000140110444, and the U.S.\ National Science
Foundation under grants IIS-0208876 and IIS-9809340.
\section*{Appendix}
The KL divergence between the empirical distribution $p_e$
(represented by the training data) and the model $q$, parameterized by
$\mb{\theta}$ is:
\beqa {\rm
KL}(p_e(\mb{x},\mb{h})||q(\mb{x},\mb{h}))=\int p_e(\mb{x},\mb{h}) \log
[p_e(\mb{x},\mb{h})/q(\mb{x},\mb{h})] d\mb{h} d\mb{x}, \eeqa which can
be proven to be equivalent to: \beqa \arg\min_{\mb{\theta}}
E_{p_e(\mb{x})}[{\rm KL}(p_e(\mb{h}|\mb{x})||q(\mb{h}|\mb{x}))], \eeqa
where $\theta$ parameterizes $q$. In practice, the expectation becomes
a sum over the training data pairs, and we obtain
Eq.~\ref{eq:LeaDisMod}. Thus, the optimal distribution in this sense
is the one that results from solving Eq.~\ref{eq:LeaDisMod}, to obtain
$q(\mb{h}|\mb{x})$. Of course, we assume that the data is composed of
representative examples from $p$, so that the empirical distribution
$p_e$ is at all useful.
%ATT
Eq.~\ref{eq:Just} justifies this choice since it tells us that in order
to find a good approximation for the posterior $p(\mb{h}|\mb{x})$ we
should find a proposal distribution that is similar to it, as
intuitively expected. We may then ask if we could use this proposal
distribution alone. The reason why this is not a good idea is that,
since we cannot usually find a proposal distribution that matches the
true posterior perfectly, using this proposal distribution alone is
expected to perform worse than when combined with our accurate
generative model. This is mainly because in regions where the proposal
distribution $q$ is bad at approximating $p$, we can always evaluate
$p$ and note the error or discrepancy.
%ATT^
The distribution $q(\mb{h}|\mb{x})$ is an approximation to
$p(\mb{h}|\mb{x})$ in the space of all distributions with the
structure specified by the discriminative model (a mixture model in
our case). For Gaussian mixture models, it is known that this
approximation can be made as accurate as we wish in the limit of
infinite data and mixture components. Interestingly, we know what we
need in order to obtain a good approximation to the posterior, even if
we do not know explicitly what $p(\mb{h})$ is in our generative
model. In practice $p(\mb{h})$ can be estimated from data using a
density estimation method, but we simply used the uniform distribution
(in a reasonably finite domain).
\hide{
Interestingly, we do not need to know explicitly what $p(\mb{h})$ is
in our generative model (but of course, this is implicitly specified
by the training data). Thus, even if we use a not-so-good assumption
for $p(\mb{h})$, we still know what we need to do in order to achieve
a good estimate of the posterior. This is helpful since we do not
really know accurately what $p(\mb{h})$ is (given that we may not have
enough data to estimate it accurately). In the following we simply use
a uniform distribution (in a reasonable finite domain).}
\changedRev{
Throughout the paper we were concerned mainly with MAP estimation. For
the sake of completeness, if we are interested in computing the
probability of a body pose $\mb{h}$, given an observation of features
$\mb{x}^*$, we use the expression: \beqa
\hat{p}(\mb{h}|\mb{x}^*)=\frac{1}{\hat{Z}_p}{\cl
N}(\mb{x}^*;\zeta(\mb{h}),\Sigma_\zeta)p(\mb{h}), \eeqa with
$\hat{Z}_p$ given by $\frac{1}{S} \sum_{s=1}^S
p(\mb{x}^*,\mb{h}\us)/q(\mb{h}\us|\mb{x}^{*})$, using importance
sampling with proposal distribution $q(\mb{h}|\mb{x}^*)$ to obtain the samples $\mb{h}\us$.}
%ATT H
%% Assuming
%% that the data is composed by representative examples from $p$ we hope
%% to learn a good
%% Thus, this justifies why learning a discriminative distribution
%% $q(\mb{h}|\mb{x})$ is a sensible approach. When learning $q$ from
%% training data, we are trying to approximate $p$ assuming that the data
%% is composed by representative examples from the true distribution.
%% It is known that a mixture of Gaussians can approximate
%% any distribution if enough mixture components are used. Thus, in the
%% limit of infinite data and a large enough mixture our discriminative
%% distribution $q$ could in theory approximate the generative
%% distribution $p$.
%% p(x*)=\int p(x*,h) dh
%% approx with (using IS)
%% p(x*)=1/R \sum p(x*,h)/p'(h)
%% Rubinstein says that the best dist that we can use to sample is p(x*,h) normalized, so that it is a valid pdf, which is p(x*,h)/int p(x*,h) dh= p(x*,h)/int p(x*) = p(h|x*)!!!!
\renewcommand{\baselinestretch}{1}
\bibliography{thesis}
\end{document}