// Compile-time debug switches for this file; both are tested with #if further
// down.
//
// DEBUG_DICEPSPlanner: when nonzero, compiles in the verbose cout tracing that
// is wrapped in `#if DEBUG_DICEPSPlanner` (e.g. the "best_samples full: making
// space..." and "inserted pol (v=...)" messages). Disabled (0) here.
37 #define DEBUG_DICEPSPlanner 0
// DEBUG_DICEPSPlannerTIMINGS: when nonzero, compiles in the StartTimer/StopTimer
// calls that are wrapped in `#if DEBUG_DICEPSPlannerTIMINGS` (e.g. the
// "DICEPS::(CE)sample evaluation" timers). Enabled (1) here.
38 #define DEBUG_DICEPSPlannerTIMINGS 1
47 size_t nrSamplesForUpdate,
48 bool use_hard_threshold,
54 ,_m_foundPolicy(*this)
76 size_t nrSamplesForUpdate,
77 bool use_hard_threshold,
104 size_t nrSamplesForUpdate,
105 bool use_hard_threshold,
108 bool convergenceStats,
109 ofstream & convergenceStatsFile,
114 ,_m_foundPolicy(*this)
135 size_t nrSamplesForUpdate,
136 bool use_hard_threshold,
139 bool convergenceStats,
140 ofstream & convergenceStatsFile,
145 ,_m_foundPolicy(*this)
164 double v_best = -DBL_MAX;
184 vector< vector< vector<double> > > Xi (nrAgents);
185 for(
Index agentI=0; agentI < nrAgents; agentI++)
187 vector< vector<double> >& ta_vec = Xi.at(agentI);
190 ta_vec = vector< vector<double> >(nrOHists,
191 vector<double>(nrAcs, 1.0 / nrAcs) );
203 #if DEBUG_DICEPSPlannerTIMINGS
206 list<JPPVValuePair*> best_samples;
207 double v_xth_best = -DBL_MAX;
208 double v_gamma = -DBL_MAX;
211 #if DEBUG_DICEPSPlannerTIMINGS
218 vector< PolicyPureVector* > & BGpolicies = p_jpol->
219 GetIndividualPolicies();
220 for(
Index agentI=0; agentI < nrAgents; agentI++) {
224 #if DEBUG_DICEPSPlanner
226 cout <<
"sampled new policy: "<<
234 #if DEBUG_DICEPSPlannerTIMINGS
244 cout <<
", approx. value= " << approxV << endl;
254 cout <<
", value="<<v<<endl;
257 #if DEBUG_DICEPSPlannerTIMINGS
258 StopTimer(
"DICEPS::(CE)sample evaluation");
283 #if DEBUG_DICEPSPlanner
284 cout <<
"best_samples full: making space...";
286 delete best_samples.back();
287 best_samples.pop_back();
293 #if DEBUG_DICEPSPlanner
294 cout <<
"inserted pol (v="<<v<<
") - v_xth_best now:"
295 << v_xth_best << endl;
296 cout <<
"best_samples contains the following pol/val pairs:"
306 #if DEBUG_DICEPSPlannerTIMINGS
316 cout <<
"new v_gamma="<<v_gamma<<endl;
320 double v_best_this_iter = best_samples.front()->GetValue();
321 if(v_best_this_iter > v_best)
324 cout <<
"new absolute best="<<v_best_this_iter <<
325 " (old="<< v_best <<
")"<<endl;
326 v_best = v_best_this_iter;
329 jpol_best = *(best_samples.front()->GetJPPV());
335 #if DEBUG_DICEPSPlanner
336 for(
Index agentI=0; agentI < nrAgents; agentI++)
338 cout <<
"updated parameter vector for agent "<<agentI<<
":";
345 size_t nrCREvals = 2000;
346 double r_total = 0.0;
348 for(
Index cr=0; cr < nrCREvals; cr++)
351 vector< PolicyPureVector* > & BGpolicies = p_jpol->
352 GetIndividualPolicies();
353 for(
Index agentI=0; agentI < nrAgents; agentI++)
355 *(BGpolicies.at(agentI)), Xi[agentI] );
360 r_total /= (double)nrCREvals;
361 cout <<
"iteration " << iter <<
" ended, V(Xi)="<< r_total;
364 sprintf(mean,
"%.6f", r_total);
365 (*_m_outputConvergenceFile) << mean <<
"\t";
370 while(!best_samples.empty())
372 delete best_samples.front();
373 best_samples.pop_front();
376 #if DEBUG_DICEPSPlannerTIMINGS
382 (*_m_outputConvergenceFile) << endl;
400 StartTimer(
"DICEPS::FoundJPolAccurateEvaluation()");
403 StopTimer(
"DICEPS::FoundJPolAccurateEvaluation()");
407 StartTimer(
"DICEPS::FoundJPolExactEvaluation()");
411 StopTimer(
"DICEPS::FoundJPolExactEvaluation()");
422 const vector< vector<double> >& ohistActionProbs )
424 vector< vector<double> >::const_iterator it = ohistActionProbs.begin();
425 vector< vector<double> >::const_iterator last = ohistActionProbs.end();
430 const vector<double> & action_probs = *it;
431 double r = ((double)rand()) / RAND_MAX;
432 double cumulativeActionProb = 0.0;
435 vector<double>::const_iterator a_it = action_probs.begin();
436 vector<double>::const_iterator a_last = action_probs.end();
437 while(a_it != a_last)
439 double prob_aI = *a_it;
440 cumulativeActionProb += prob_aI;
441 if(cumulativeActionProb >= r)
455 list< JPPVValuePair*>& l)
458 list<JPPVValuePair*>::iterator it = l.begin();
459 list<JPPVValuePair*>::iterator last = l.end();
481 list<JPPVValuePair*>::const_iterator it = l.begin();
482 list<JPPVValuePair*>::const_iterator last = l.end();
487 cout <<
""<<val<<
", ";
499 vector< vector< vector<double> > >& Xi,
500 const list<JPPVValuePair* >& best_samples)
502 size_t nrAgents = Xi.size();
503 size_t nrSamples = best_samples.size();
505 vector< vector< vector< unsigned int > > > counts;
506 list<JPPVValuePair* >::const_iterator it = best_samples.begin();
507 list<JPPVValuePair* >::const_iterator last = best_samples.end();
508 for(
Index agI=0; agI < nrAgents; agI++)
512 vector< vector< unsigned int > >(
522 vector< PolicyPureVector* > & policies =
524 for(
Index agI=0; agI < nrAgents; agI++)
527 for(
Index ohistI=0; ohistI < nrH; ohistI++)
529 Index acI = policies[agI]->GetActionIndex(ohistI);
530 counts[agI][ohistI][acI]++;
537 for(
Index agI=0; agI < nrAgents; agI++) {
539 for(
Index ohistI=0; ohistI < nrH; ohistI++) {
542 double new_prob = ((double)counts[agI][ohistI][acI])/nrSamples;
543 Xi.at(agI).at(ohistI).at(acI) =
544 (1 -
_m_alpha) * Xi.at(agI).at(ohistI).at(acI) +
558 #if DEBUG_DICEPSPlannerTIMINGS
559 StartTimer(
"DICEPS::(CE)sample evaluation: simulator.RunSimulations()");
562 #if DEBUG_DICEPSPlannerTIMINGS
563 StopTimer(
"DICEPS::(CE)sample evaluation: simulator.RunSimulations()");