MultiAgentDecisionProcess  Release 0.2.1
DICEPSPlanner.cpp
28 #include "DICEPSPlanner.h"
29 #include "ValueFunctionDecPOMDPDiscrete.h"
30 #include "JPPVValuePair.h"
31 #include "SimulationDecPOMDPDiscrete.h"
32 #include "SimulationResult.h"
33 #include <float.h>
34 
35 using namespace std;
36 
37 #define DEBUG_DICEPSPlanner 0
38 #define DEBUG_DICEPSPlannerTIMINGS 1
39 
40 DICEPSPlanner::DICEPSPlanner(
41  const PlanningUnitMADPDiscreteParameters &params,
42  DecPOMDPDiscreteInterface* p,
43  size_t horizon,
44  size_t nrRestarts,
45  size_t nrIterations,
46  size_t nrSamples,
47  size_t nrSamplesForUpdate,
48  bool use_hard_threshold, //(gamma in CE papers)
49  double CEalpha, //the learning rate
50  size_t nrEvalRuns, // value approximation runs (set 0 for exact eval)
51  int verbose
52 ) :
53  PlanningUnitDecPOMDPDiscrete(params, horizon, p)
54  ,_m_foundPolicy(*this)
55 
56 {
57  _m_nrRestarts = 1;//nrRestarts;//TODO is it desirable to have this thing
58  //restart by itself?
59 
60  _m_nrIterations = nrIterations;
61  _m_nrSampledJointPolicies = nrSamples;
62  _m_nrJointPoliciesForUpdate = nrSamplesForUpdate;
63  _m_use_gamma = use_hard_threshold;
64  _m_alpha = CEalpha;
65  _m_nrEvalRuns = nrEvalRuns;
66  _m_verbose = verbose;
68 }
69 
70 DICEPSPlanner::DICEPSPlanner(
71  DecPOMDPDiscreteInterface* p,
72  int horizon,
73  size_t nrRestarts,
74  size_t nrIterations,
75  size_t nrSamples,
76  size_t nrSamplesForUpdate,
77  bool use_hard_threshold, //(gamma in CE papers)
78  double CEalpha, //the learning rate
79  size_t nrEvalRuns, // value approximation runs (set 0 for exact eval)
80  int verbose
81  ) :
82  PlanningUnitDecPOMDPDiscrete(horizon, p),
83  _m_foundPolicy(*this)
84 {
85  _m_nrRestarts = 1;//nrRestarts; //TODO is it desirable to have this thing
86  //restart by itself?
87  _m_nrIterations = nrIterations;
88  _m_nrSampledJointPolicies = nrSamples;
89  _m_nrJointPoliciesForUpdate = nrSamplesForUpdate;
90  _m_use_gamma = use_hard_threshold;
91  _m_alpha = CEalpha;
92  _m_nrEvalRuns = nrEvalRuns;
93  _m_verbose = verbose;
95 }
96 
97 DICEPSPlanner::DICEPSPlanner(
98  const PlanningUnitMADPDiscreteParameters &params,
99  DecPOMDPDiscreteInterface* p,
100  size_t horizon,
101  size_t nrRestarts,
102  size_t nrIterations,
103  size_t nrSamples,
104  size_t nrSamplesForUpdate,
105  bool use_hard_threshold, //(gamma in CE papers)
106  double CEalpha, //the learning rate
107  size_t nrEvalRuns, // value approximation runs (set 0 for exact eval)
108  bool convergenceStats,
109  ofstream & convergenceStatsFile,
110  int verbose
111 ) :
112  PlanningUnitDecPOMDPDiscrete(params, horizon, p)
113  //,_m_outputConvergenceFile(convergenceStatsFile)
114  ,_m_foundPolicy(*this)
115 {
116  _m_nrRestarts = 1;//nrRestarts;//TODO is it desirable to have this thing
117  //restart by itself?
118 
119  _m_nrIterations = nrIterations;
120  _m_nrSampledJointPolicies = nrSamples;
121  _m_nrJointPoliciesForUpdate = nrSamplesForUpdate;
122  _m_use_gamma = use_hard_threshold;
123  _m_alpha = CEalpha;
124  _m_nrEvalRuns = nrEvalRuns;
125  _m_outputConvergenceStatistics = convergenceStats;
126  _m_outputConvergenceFile = &convergenceStatsFile;
127  _m_verbose = verbose;
128 }
129 DICEPSPlanner::DICEPSPlanner(
130  DecPOMDPDiscreteInterface* p,
131  int horizon,
132  size_t nrRestarts,
133  size_t nrIterations,
134  size_t nrSamples,
135  size_t nrSamplesForUpdate,
136  bool use_hard_threshold, //(gamma in CE papers)
137  double CEalpha, //the learning rate
138  size_t nrEvalRuns, // value approximation runs (set 0 for exact eval)
139  bool convergenceStats,
140  ofstream & convergenceStatsFile,
141  int verbose
142  ) :
143  PlanningUnitDecPOMDPDiscrete(horizon, p)
144 // ,_m_outputConvergenceFile(convergenceStatsFile)
145  ,_m_foundPolicy(*this)
146 {
147  _m_nrRestarts = 1;//nrRestarts; //TODO is it desirable to have this thing
148  //restart by itself?
149  _m_nrIterations = nrIterations;
150  _m_nrSampledJointPolicies = nrSamples;
151  _m_nrJointPoliciesForUpdate = nrSamplesForUpdate;
152  _m_use_gamma = use_hard_threshold;
153  _m_alpha = CEalpha;
154  _m_nrEvalRuns = nrEvalRuns;
155  _m_outputConvergenceStatistics = convergenceStats;
156  _m_outputConvergenceFile = &convergenceStatsFile;
157  _m_verbose = verbose;
158 }
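
The four constructors only differ in whether planner parameters and convergence-statistics output are supplied; the remaining arguments are the cross-entropy hyperparameters. Below is a hypothetical construction call, following the (p, horizon, ...) overload reconstructed above; the model pointer and the numeric values are placeholders chosen for illustration, not defaults prescribed by the library.

// Hypothetical usage sketch (not part of DICEPSPlanner.cpp). The argument order
// follows the (p, horizon, ...) constructor above; "decpomdp" stands for any
// model implementing DecPOMDPDiscreteInterface, obtained elsewhere.
DecPOMDPDiscreteInterface* decpomdp = 0; // placeholder: supply a real model
DICEPSPlanner planner(
    decpomdp,
    3,     // horizon
    1,     // nrRestarts (the implementation currently forces this to 1 anyway)
    30,    // nrIterations: CE iterations per restart
    100,   // nrSamples: joint policies sampled per iteration
    10,    // nrSamplesForUpdate: size of the elite set used to update Xi
    true,  // use_hard_threshold: only accept samples with value above gamma
    0.3,   // CEalpha: smoothing factor of the distribution update
    50,    // nrEvalRuns: simulation runs per sampled policy (0 = exact evaluation)
    1      // verbose
);
planner.Plan();
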
159 
160 
161 void DICEPSPlanner::Plan()
162 {
163  StartTimer("DICEPS::Plan()");
164  double v_best = -DBL_MAX;
165  // Index jpolI_best = 0;
166  JointPolicyPureVector jpol_best( *this, OHIST_INDEX ); //temporary?
167 
168  //the algorithm has the following form:
169  //  create initial joint policy distribution
170  //  for number of restarts
171  //      while improving
172  //          sample joint policies
173  //          evaluate and rank the sampled policies
174  //          update the probability distribution
175 
176  //get some vars:
177  size_t nrAgents = GetReferred()->GetNrAgents();
178  StartTimer("DICEPS::create-Xi");
179 
180  //create initial joint policy distribution
181  //Xi is the parameter 'vector' for the joint probability distribution
182  //with the following form:
183  //Xi[agentI][ohistI][actionI] (= Pr(actionI | ohistI, agentI) )
184  vector< vector< vector<double> > > Xi (nrAgents);
185  for(Index agentI=0; agentI < nrAgents; agentI++)
186  {
187  vector< vector<double> >& ta_vec = Xi.at(agentI);
188  size_t nrOHists = GetNrObservationHistories(agentI);
189  size_t nrAcs = GetReferred()->GetNrActions(agentI);
190  ta_vec = vector< vector<double> >(nrOHists,
191  vector<double>(nrAcs, 1.0 / nrAcs) ); //uniform dist. at start
192  }
193  StopTimer("DICEPS::create-Xi");
194 
195  //for number of restarts
196  for(Index restart=0; restart < _m_nrRestarts; restart++)
197  {
198  StartTimer("DICEPS::run(restart)");
199  //while improving - TODO think of a way to test this!
200  //for now, we use a fixed number of iterations:
201  for(Index iter=0; iter < _m_nrIterations; iter++)
202  {
203 #if DEBUG_DICEPSPlannerTIMINGS
204  StartTimer("DICEPS::(CE)iteration");
205 #endif
206  list<JPPVValuePair*> best_samples;
207  double v_xth_best = -DBL_MAX;
208  double v_gamma = -DBL_MAX;
209  for(Index sample=0; sample < _m_nrSampledJointPolicies; sample++)
210  {
211 #if DEBUG_DICEPSPlannerTIMINGS
212  StartTimer("DICEPS::(CE)sample");
213 #endif
214  JointPolicyPureVector* p_jpol = new JointPolicyPureVector( *this );
215 
216  //sample next joint policy
217  // by sampling individual policy for each agent
218  vector< PolicyPureVector* > & BGpolicies = p_jpol->
219  GetIndividualPolicies();
220  for(Index agentI=0; agentI < nrAgents; agentI++) {
221  SampleIndividualPolicy(*(BGpolicies.at(agentI)),Xi[agentI]);
222  }
223 
224 #if DEBUG_DICEPSPlanner
225  {
226  cout << "sampled new policy: "<<
227  p_jpol->GetIndex();
228  }
229 #endif
230 
231  //evaluate the JointPolicy
232  double v;
233 
234 #if DEBUG_DICEPSPlannerTIMINGS
235  StartTimer("DICEPS::(CE)sample evaluation");
236 #endif
237  if (_m_nrEvalRuns > 0)
238  {
239  // use approximate evaluation:
240  // larger _m_nrEvalRuns yields better approximations
241 
242  v = ApproximateEvaluate(*p_jpol, _m_nrEvalRuns);
243 #if DEBUG_DICEPSPlanner
244  cout << ", approx. value= " << v << endl;
245 #endif
246  }
247  else
248  {
249  // use exact evaluation
250 
251  ValueFunctionDecPOMDPDiscrete vf(*this, *p_jpol);
252  v = vf.CalculateV(true);
253 #if DEBUG_DICEPSPlanner
254  cout << ", value="<<v<<endl;
255 #endif
256  }
257 #if DEBUG_DICEPSPlannerTIMINGS
258  StopTimer("DICEPS::(CE)sample evaluation");
259 #endif
260 
261 
262 
263  //retain it if it ranks among the best...
264  //we maintain an ordered list which contains the x best
265  //policies (pol-val pairs). (x = _m_nrJointPoliciesForUpdate)
266  //front() is the highest ranked policy and
267  //back() the lowest ranked one.
268  //
269  // TODO: [jkooij] use STL Priority Queue for this?
270 
271  if(
272  //either we have not sampled x policies
273  ( best_samples.size() < _m_nrJointPoliciesForUpdate
274  ||
275  //or the value of this policy is better
276  (v > v_xth_best)
277  )
278  && ( (!_m_use_gamma) || (v > v_gamma) )
279  )
280  {
281  if (best_samples.size() == _m_nrJointPoliciesForUpdate)
282  {
283 #if DEBUG_DICEPSPlanner
284  cout << "best_samples full: making space...";
285 #endif
286  delete best_samples.back();
287  best_samples.pop_back(); //make room
288  }
289  JPPVValuePair* polval = new JPPVValuePair(p_jpol,v);
290  OrderedInsertJPPVValuePair(polval, best_samples);
291  JPPVValuePair* back = best_samples.back();
292  v_xth_best = back->GetValue();
293 #if DEBUG_DICEPSPlanner
294  cout << "inserted pol (v="<<v<<") - v_xth_best now:"
295  << v_xth_best << endl;
296  cout <<"best_samples contains the following pol/val pairs:"
297  << endl;
298  PrintBestSamples(best_samples);
299  cout << endl;
300 #endif
301  }
302  else
303  {
304  delete p_jpol;
305  }
306 #if DEBUG_DICEPSPlannerTIMINGS
307  StopTimer("DICEPS::(CE)sample");
308 #endif
309  } //end for samples
310 
311  if(_m_use_gamma) //update the gamma
312  {
313  JPPVValuePair* back = best_samples.back();
314  v_gamma = back->GetValue();
315  if(_m_verbose >= 1)
316  cout << "new v_gamma="<<v_gamma<<endl;
317  }
318 
319  //retain the very best sample:
320  double v_best_this_iter = best_samples.front()->GetValue();
321  if(v_best_this_iter > v_best)
322  {
323  if(_m_verbose >= 1)
324  cout << "new absolute best="<<v_best_this_iter <<
325  " (old="<< v_best <<")"<<endl;
326  v_best = v_best_this_iter;
327  //an LIndex is too short to hold the joint policy index, so store the policy itself
328  //jpolI_best = best_samples.front()->GetJPPV()->GetIndex();
329  jpol_best = *(best_samples.front()->GetJPPV());
330  }
331 
332  //update the probability distribution
333  UpdateCEProbDistribution(Xi, best_samples);
334 
335 #if DEBUG_DICEPSPlanner
336  for(Index agentI=0; agentI < nrAgents; agentI++)
337  {
338  cout << "updated parameter vector for agent "<<agentI<<":";
339  PrintVectorCout(Xi[agentI]);
340  cout << endl;
341  }
342 #endif
343  if(_m_outputConvergenceStatistics)
344  {
345  size_t nrCREvals = 2000; //number of simulation runs used to estimate V(Xi) for the convergence statistics
346  double r_total = 0.0;
347  JointPolicyPureVector* p_jpol=new JointPolicyPureVector(*this);
348  for(Index cr=0; cr < nrCREvals; cr++)
349  {
350  //sample a policy from the new distribution
351  vector< PolicyPureVector* > & BGpolicies = p_jpol->
352  GetIndividualPolicies();
353  for(Index agentI=0; agentI < nrAgents; agentI++)
354  SampleIndividualPolicy(
355  *(BGpolicies.at(agentI)), Xi[agentI] );
356  //evaluate 1 run of this policy
357  r_total += ApproximateEvaluate(*p_jpol, 1);
358  }
359  delete p_jpol;
360  r_total /= (double)nrCREvals;
361  cout << "iteration " << iter << " ended, V(Xi)="<< r_total;
362  cout << endl;
363  char mean[20];
364  sprintf(mean, "%.6f", r_total);
365  (*_m_outputConvergenceFile) << mean <<"\t";
366 
367  }
368 
369  //delete list
370  while(!best_samples.empty())
371  {
372  delete best_samples.front();
373  best_samples.pop_front();
374  }
375 
376 #if DEBUG_DICEPSPlannerTIMINGS
377  StopTimer("DICEPS::(CE)iteration");
378 #endif
379  } //end for iterations
380  StopTimer("DICEPS::run(restart)");
381  if(_m_outputConvergenceStatistics)
382  (*_m_outputConvergenceFile) << endl;
383  } // end for restarts
384 
385 
386 
387 
388  // store best found joint policy
389  _m_foundPolicy = jpol_best;
390 
391  if (_m_nrEvalRuns > 0) {
392  // the expected reward for the found policy is an approximation
393  //so we now determine its value more accurately
394  size_t Ventries = GetNrStates() * GetNrJointObservationHistories();
395  //cout << "Value function entries=" << Ventries <<
396  //" (S="<<GetNrStates() <<
397  //", JOH="<<GetNrJointObservationHistories() <<")"<<endl;
398  if(Ventries > 20000)
399  {
400  StartTimer("DICEPS::FoundJPolAccurateEvaluation()");
401  _m_expectedRewardFoundPolicy =
402  ApproximateEvaluate(jpol_best, 20000);
403  StopTimer("DICEPS::FoundJPolAccurateEvaluation()");
404  }
405  else
406  {
407  StartTimer("DICEPS::FoundJPolExactEvaluation()");
408  // so we evaluate it once using the exact method
409  ValueFunctionDecPOMDPDiscrete vf(*this, jpol_best);
410  _m_expectedRewardFoundPolicy = vf.CalculateV(true);
411  StopTimer("DICEPS::FoundJPolExactEvaluation()");
412  }
413  } else {
414  // the expected reward for the found policy is already given, use it
415  _m_expectedRewardFoundPolicy = v_best;
416  }
417 
418  StopTimer("DICEPS::Plan()");
419 }
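
The comment block at lines 168-174 summarizes the cross-entropy recipe that Plan() implements: sample joint policies from a parameterized distribution Xi, rank them, and pull Xi toward the elite fraction. Below is a minimal, self-contained sketch of that loop in plain C++ over a single categorical choice, with a noisy stand-in for policy evaluation; it illustrates the technique and is not MADP code.

#include <algorithm>
#include <functional>
#include <iostream>
#include <random>
#include <utility>
#include <vector>

// Toy cross-entropy search over one categorical decision with nrA choices.
// evaluate() is replaced by a noisy lookup into trueValue.
int main()
{
    const int nrA = 4, nrSamples = 100, nrElite = 10, nrIterations = 20;
    const double alpha = 0.3;
    std::vector<double> xi(nrA, 1.0 / nrA);                 // the CE distribution
    std::vector<double> trueValue = {0.1, 0.7, 0.3, 0.5};   // hidden "policy values"
    std::mt19937 rng(42);
    std::normal_distribution<double> noise(0.0, 0.1);

    for (int iter = 0; iter < nrIterations; ++iter)
    {
        // 1) sample candidate "policies" and evaluate them (noisily)
        std::discrete_distribution<int> sampler(xi.begin(), xi.end());
        std::vector<std::pair<double,int> > scored;         // (value, action)
        for (int s = 0; s < nrSamples; ++s)
        {
            int a = sampler(rng);
            scored.push_back(std::make_pair(trueValue[a] + noise(rng), a));
        }
        // 2) keep the nrElite best samples (the "best_samples" list above)
        std::partial_sort(scored.begin(), scored.begin() + nrElite, scored.end(),
                          std::greater<std::pair<double,int> >());
        // 3) move xi toward the empirical frequencies of the elite set
        std::vector<double> freq(nrA, 0.0);
        for (int e = 0; e < nrElite; ++e)
            freq[scored[e].second] += 1.0 / nrElite;
        for (int a = 0; a < nrA; ++a)
            xi[a] = (1.0 - alpha) * xi[a] + alpha * freq[a];
    }
    std::cout << "Pr(best action) ~= " << xi[1] << std::endl; // should be close to 1
    return 0;
}

The same three steps appear in Plan() above, with Xi indexed per agent and observation history, and with the elite set kept in best_samples.
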
420 
421 void DICEPSPlanner::SampleIndividualPolicy(PolicyPureVector& pol,
422  const vector< vector<double> >& ohistActionProbs )
423 {
424  vector< vector<double> >::const_iterator it = ohistActionProbs.begin();
425  vector< vector<double> >::const_iterator last = ohistActionProbs.end();
426  Index ohistI = 0;
427  while(it != last)
428  {
429  //the action probabilities for the observation history pointed to by *it
430  const vector<double> & action_probs = *it;
431  double r = ((double)rand()) / RAND_MAX;
432  double cumulativeActionProb = 0.0;
433 
434  Index aI = 0; //the action index
435  vector<double>::const_iterator a_it = action_probs.begin();
436  vector<double>::const_iterator a_last = action_probs.end();
437  while(a_it != a_last)
438  {
439  double prob_aI = *a_it;
440  cumulativeActionProb += prob_aI;
441  if(cumulativeActionProb >= r) //action aI is sampled
442  break;
443  aI++;
444  a_it++;
445  }
446  pol.SetAction(ohistI, aI);
447 
448  ohistI++;
449  it++;
450  }
451 
452 }
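
SampleIndividualPolicy draws one action per observation history by inverse-CDF ("roulette wheel") sampling on rand()/RAND_MAX. Below is a standalone sketch of the same idea using <random>; the final clamp guards against the probabilities summing to slightly less than one, which in the loop above could leave aI one past the last action. This is an illustration, not part of the class.

#include <cstddef>
#include <random>
#include <vector>

// Draw an index from a categorical distribution given its probabilities.
// Equivalent to the cumulative-sum loop above, but clamps the index so that
// round-off in the probabilities can never select an out-of-range action.
std::size_t SampleCategorical(const std::vector<double>& probs, std::mt19937& rng)
{
    std::uniform_real_distribution<double> unif(0.0, 1.0);
    double r = unif(rng);
    double cumulative = 0.0;
    for (std::size_t i = 0; i < probs.size(); ++i)
    {
        cumulative += probs[i];
        if (r <= cumulative)
            return i;
    }
    return probs.size() - 1; // guard against round-off
}

std::discrete_distribution would do the same job directly; the explicit loop is kept here only to mirror the structure of the member function above.
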
453 
454 void DICEPSPlanner::OrderedInsertJPPVValuePair(JPPVValuePair* pv,
455  list< JPPVValuePair*>& l)
456 {
457  double v_pv = pv->GetValue();
458  list<JPPVValuePair*>::iterator it = l.begin(); //=front - highest values
459  list<JPPVValuePair*>::iterator last = l.end(); //=back - lowest values
460  while(it != last)
461  {
462  JPPVValuePair* temp = *it;
463  double val = temp->GetValue();
464  if( v_pv < val )
465  {
466  it++;
467  }
468  else
469  {
470  l.insert(it, pv);
471  return;
472  }
473  }
474  //reached the end: pv ranks last (or the list was empty), so append it
475  l.insert(last, pv);
476 }
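
The TODO at line 269 asks whether the hand-rolled ordered list could be replaced by an STL container. Below is a sketch of the bounded "keep the x best" idea with std::multiset, shown on plain doubles to stay self-contained; the real list stores JPPVValuePair pointers, so a comparator on GetValue() and explicit deletes would be needed. This is an alternative sketch, not the library's implementation.

#include <cstddef>
#include <iostream>
#include <set>

// Keep only the maxSize largest values seen so far. The smallest retained
// value (*begin()) plays the role of v_xth_best in the planner above.
class EliteSet
{
public:
    explicit EliteSet(std::size_t maxSize) : _m_maxSize(maxSize) {}

    void Insert(double v)
    {
        if (_m_values.size() < _m_maxSize)
            _m_values.insert(v);
        else if (v > *_m_values.begin())        // better than the current worst
        {
            _m_values.erase(_m_values.begin()); // drop the worst element
            _m_values.insert(v);
        }
    }
    double Worst() const { return *_m_values.begin(); }
    double Best()  const { return *_m_values.rbegin(); }

private:
    std::size_t _m_maxSize;
    std::multiset<double> _m_values;            // sorted ascending
};

int main()
{
    EliteSet elite(3);
    double samples[] = {0.2, 0.9, 0.1, 0.5, 0.7};
    for (double v : samples)
        elite.Insert(v);
    std::cout << elite.Worst() << " .. " << elite.Best() << std::endl; // 0.5 .. 0.9
    return 0;
}
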
477 
478 
479 void DICEPSPlanner::PrintBestSamples( const list< JPPVValuePair*>& l)
480 {
481  list<JPPVValuePair*>::const_iterator it = l.begin(); //=front - highest values
482  list<JPPVValuePair*>::const_iterator last = l.end(); //=back - lowest values
483  while(it != last)
484  {
485  JPPVValuePair* temp = *it;
486  double val = temp->GetValue();
487  cout << val << ", ";
488  //LIndex i = temp->GetJPol()->GetIndex();
489  //cout << "<p"<<i<<", v"<<val<<">,";
490  it++;
491  }
492 }
493 
494 
495 
496 
497 
498 void DICEPSPlanner::UpdateCEProbDistribution(
499  vector< vector< vector<double> > >& Xi,
500  const list<JPPVValuePair* >& best_samples)
501 {
502  size_t nrAgents = Xi.size();
503  size_t nrSamples = best_samples.size();
504  //get counts
505  vector< vector< vector< unsigned int > > > counts;
506  list<JPPVValuePair* >::const_iterator it = best_samples.begin();
507  list<JPPVValuePair* >::const_iterator last = best_samples.end();
508  for(Index agI=0; agI < nrAgents; agI++)
509  {
510  size_t nrH = GetNrObservationHistories(agI);
511  counts.push_back(
512  vector< vector< unsigned int > >(
513  nrH,
514  vector< unsigned int >(GetReferred()->GetNrActions(agI))
515  )
516  );
517  }
518 
519  while(it != last)
520  {
521  JointPolicyPureVector* p_jpol= (*it)->GetJPPV();
522  vector< PolicyPureVector* > & policies =
523  p_jpol->GetIndividualPolicies();
524  for(Index agI=0; agI < nrAgents; agI++)
525  {
526  size_t nrH = GetNrObservationHistories(agI);
527  for(Index ohistI=0; ohistI < nrH; ohistI++)
528  {
529  Index acI = policies[agI]->GetActionIndex(ohistI);
530  counts[agI][ohistI][acI]++;
531  }
532  }
533  it++;
534  }
535  // update
536 
537  for(Index agI=0; agI < nrAgents; agI++) {
538  size_t nrH = GetNrObservationHistories(agI);
539  for(Index ohistI=0; ohistI < nrH; ohistI++) {
540  for(Index acI=0; acI < GetReferred()->GetNrActions(agI); acI++)
541  {
542  double new_prob = ((double)counts[agI][ohistI][acI])/nrSamples;
543  Xi.at(agI).at(ohistI).at(acI) =
544  (1 - _m_alpha) * Xi.at(agI).at(ohistI).at(acI) +
545  _m_alpha * new_prob;
546  }
547  }
548  }
549 
550 }
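
UpdateCEProbDistribution is the standard smoothed cross-entropy update: for every (agent, history, action) entry, Xi_new = (1 - alpha) * Xi_old + alpha * count/nrSamples, where count is how often the elite samples chose that action for that history. A small standalone numeric check of the formula for one history with three actions:

#include <cstdio>
#include <vector>

int main()
{
    // One observation history with three actions; 10 elite samples observed.
    const unsigned int counts[3] = {6, 3, 1};  // how often each action was chosen
    const double nrSamples = 10.0, alpha = 0.3;
    std::vector<double> xi(3, 1.0 / 3.0);      // old (uniform) parameters

    for (int a = 0; a < 3; ++a)
    {
        double empirical = counts[a] / nrSamples;           // 0.6, 0.3, 0.1
        xi[a] = (1.0 - alpha) * xi[a] + alpha * empirical;  // smoothed update
        std::printf("Pr(a%d) = %.4f\n", a, xi[a]);
    }
    // Expected output: 0.4133, 0.3233, 0.2633 (still sums to 1)
    return 0;
}

Because the empirical frequencies sum to one and the old parameters sum to one, the updated parameters remain a valid probability distribution for each observation history.
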
551 
552 double DICEPSPlanner::ApproximateEvaluate(JointPolicyDiscrete& jpol, int nrRuns)
553 {
554  // perform approximate evaluation of a joint policy
555  // by evaluating its value over nrRuns random simulation runs.
556 
557  SimulationDecPOMDPDiscrete simulator(*this, nrRuns);
558 #if DEBUG_DICEPSPlannerTIMINGS
559  StartTimer("DICEPS::(CE)sample evaluation: simulator.RunSimulations()");
560 #endif
561  SimulationResult simres = simulator.RunSimulations(&jpol);
562 #if DEBUG_DICEPSPlannerTIMINGS
563  StopTimer("DICEPS::(CE)sample evaluation: simulator.RunSimulations()");
564 #endif
565 
566  return simres.GetAvgReward();
567 }
568