@InCollection{BR93,
  author    = {Avrim Blum and Ronald L. Rivest},
  title     = {Training a 3-node neural network is {NP}-complete},
  booktitle = {Machine Learning: From Theory to Applications},
  editor    = {Stephen Jos\'{e} Hanson and Werner Remmele and Ronald L. Rivest},
  series    = {Lecture Notes in Computer Science},
  volume    = {661},
  pages     = {9--28},
  publisher = {Springer},
  year      = {1993},
  isbn      = {978-3-540-56483-6},
  doi       = {10.1007/3-540-56483-7_20},
  url       = {http://dx.doi.org/10.1007/3-540-56483-7_20},
  note      = {(Reprinted with permission from Neural Networks 5,1 (January 1992), Avrim Blum and Ronald L. Rivest, ``Training a three-node neural net is NP-complete''.)},
  abstract  = {We show for many simple two-layer networks whose nodes compute linear threshold functions of their inputs that training is NP-complete. For any training algorithm for one of these networks there will be some sets of training data on which it performs poorly, either by running for more than an amount of time polynomial in the input length, or by producing sub-optimal weights. Thus, these networks differ fundamentally from the perceptron in a worst-case computational sense. \par The theorems and proofs are in a sense fragile; they do not imply that training is necessarily hard for networks other than those specifically mentioned. They do, however, suggest that one cannot escape computational difficulties simply by considering only very simple or very regular networks. \par On a somewhat more positive note, we present two networks such that the second is both more powerful than the first and can be trained in polynomial time, even though the first is NP-complete to train. This shows that computational intractability does not depend directly on network power and provides theoretical support for the idea that finding an appropriate network and input encoding for one's training problem is an important part of the training process. \par An open problem is whether the NP-completeness results can be extended to neural networks that use the differentiable logistic linear functions. We conjecture that training remains NP-complete when these functions are used since their use does not seem to alter significantly the expressive power of a neural network. However, our proof techniques break down. Note that Judd [9], for the networks he considers, shows NP-completeness for a wide variety of node functions including logistic linear functions.},
}