// Compile-time debug switch for the JESP dynamic-programming planner;
// 0 disables the verbose per-recursion tracing guarded by if(DEBUG_DPJESP).
// NOTE(review): a `constexpr bool` would be preferable to a macro, but only
// if no `#if DEBUG_DPJESP` usage exists elsewhere in the file — confirm first.
38 #define DEBUG_DPJESP 0
// --- Fragment of Plan() (original lines 64-101; interior lines missing from
// this view, so comments below are annotations only — no code changed). ---
// Banner announcing the start of the exhaustive JESP planning loop.
64 cout <<
"\n---------------------------------"<<endl;
65 cout <<
"Exhaustive JESP - Plan() started"<<endl;
66 cout <<
"---------------------------------"<<endl;
// Best joint value found so far; seeded to -DBL_MAX so any real value beats it.
68 double v_best = -DBL_MAX;
// Optional trace of the (presumably random) initial joint policy.
// NOTE(review): initialization itself happens on lines not visible here.
74 if(
DEBUG_DPJESP) {cout <<
"joint policy randomly initialized to:";
// Counts consecutive agents whose best response did not improve the value;
// presumably the alternating-maximization loop terminates when all agents
// fail to improve — TODO confirm against the (unseen) loop condition.
78 size_t nr_non_improving_agents = 0;
// A strictly better joint policy was found: print it (debug) ...
88 {cout <<
">>>Plan: new best policy:"<<endl; best->
Print();}
// ... and reset the no-improvement counter (line 90); otherwise count the
// agent as non-improving (line 93).
90 nr_non_improving_agents = 0;
93 nr_non_improving_agents++;
// Closing banner with the resulting policy.
100 cout <<
"Exhaustive JESP - resulting policy:"<<endl;
101 cout <<
"------------------------------------"<<endl;
// --- Fragment of ExhaustiveBestResponse (original lines 112-113). ---
// Debug banner identifying which agent's best response is being computed.
112 cout <<
"JESPDynamicProgrammingPlanner::ExhaustiveBestResponse called "
113 <<
"for agent " << agentI << endl;
// --- Fragments of DPBestResponseRecursively (original lines 157-245; many
// interior lines are missing from this view — comments only, no code changed).
// Recursively computes agent agentI's best-response action for the
// action-observation history aohI given belief B at the given stage, writing
// the chosen actions into new_pol and (apparently) returning the value. ---
// Out-parameter: best-response policy, indexed by action-observation history.
157 vector<Index>& new_pol
// Build an indentation string of `stage` tabs for the nested debug output.
163 for(
Index tab=0; tab < stage; tab++)
165 string tabss = tabsss.str();
166 cout << tabss<<
">>DPBestResponseRecursively(ag="<<agentI<<
", aoh="<<aohI<<
", B, stage="<<stage<<
", jpol) called, with " << endl <<tabss<<
"B="<<endl;
// Collect the indices of all agents other than agentI (the filtering `if`
// is presumably on a line not visible here — TODO confirm).
169 vector<Index> otherAgentIndices;
170 for(
Index agI=0; agI < GetNrAgents(); agI++)
172 otherAgentIndices.push_back(agI);
// Sizes of agentI's action/observation sets, via the referred problem model.
174 size_t nrA = GetReferred()->GetNrActions(agentI);
175 size_t nrO = GetReferred()->GetNrObservations(agentI);
176 size_t nrAgents = GetNrAgents();
// Best Q(b,a) over agentI's actions; seeded to -DBL_MAX.
177 double v_max = -DBL_MAX;
// Evaluate each candidate action of agentI.
179 for(
Index actionI=0; actionI < nrA; actionI++)
// Assemble the joint action: agentI's candidate plus each other agent's
// action prescribed by jpol for its observation history (actJ is computed
// on lines not visible here).
190 vector<Index> actions(nrAgents);
191 actions.at(agentI) = actionI;
192 for(
Index j=0; j < otherAgentIndices.size(); j++)
194 Index agJ = otherAgentIndices[j];
195 Index oHistJ = oHistI_others[j];
197 actions.at(agJ) = actJ;
200 Index jaI = IndividualToJointActionIndices(actions);
// Expected immediate reward under belief B: sum over belief points of
// P(eI) * R(sI, jaI). (The loop over eI/sI is on lines not visible here.)
202 Rba += B.
Get(eI) * GetReward(sI, jaI);
// Non-final stage: recurse over agentI's possible next observations.
207 if(stage < GetHorizon() - 1)
// Accumulates sum_o P(o|b,a) as a sanity check (accumulation line unseen).
209 double check_p = 0.0;
210 for(
Index observI=0; observI < nrO; observI++)
// Bao becomes the updated belief after (actionI, observI); Update also
// returns the observation probability P(o|b,a).
213 double Po_ba = Bao.
Update(B, actionI, observI, jpol);
215 GetSuccessorAOHI(agentI, aohI, actionI, observI);
// Future value from the successor history/belief at the next stage.
216 double F_ao = DPBestResponseRecursively(agentI, next_aohI,
217 Bao, stage+1, jpol, new_pol);
// BUG(review): tolerance reads `1e7` — almost certainly a typo for `1e-7`;
// as written, |check_p - 1| can never exceed 1e7 for a probability mass, so
// this sanity check can never fire. Also, plain `abs` on a double risks
// binding the integer overload (depending on headers); prefer std::fabs.
// (doc_update cannot change code — fix separately.)
221 if( abs(check_p - 1) > 1e7)
222 throw E(
"Po_ba not summing to 1");
// Debug: report Q(b,a) = immediate reward Rba + expected future value F.
227 cout << tabss<<
"(stage="<<stage<<
")actionI="<<actionI<<
", Q(b,a)="<< v_a <<
228 " (= R+F = "<<Rba<<
" + "<< F <<
")"<<endl;
237 cout << tabss<<
">>ENDED DPBestResponseRecursively(ag="<<agentI<<
", aoh="<<aohI
238 <<
", B, stage="<<stage<<
", jpol) called, with " <<endl<<tabss<<
"B="<<endl;
239 cout << tabss<<
"Selected actionI="<<a_br<<
" with Q(b,a)="
// Record the best-response action for this action-observation history.
245 new_pol.at(aohI) = a_br;
// --- Fragments of ConstructPolicyRecursively (original lines 257-277;
// interior lines missing from this view — comments only, no code changed).
// Walks the best-response mapping new_pol (indexed by action-observation
// history) and presumably copies it onto an observation-history-indexed
// policy, recursing over successor histories — TODO confirm against the
// unseen body lines. ---
257 vector<Index>& new_pol
// Indentation string of `stage` tabs for nested debug output.
262 for(
Index tab=0; tab < stage; tab++)
264 string tabss = tabsss.str();
// Best-response action previously stored for this action-observation history.
267 Index best_a = new_pol[aohI];
270 size_t nrO = GetReferred()->GetNrObservations(agentI);
// Non-final stage: recurse into every successor history reachable by
// taking best_a and receiving each possible observation oI.
271 if(stage < GetHorizon() - 1)
273 for(
Index oI=0; oI < nrO; oI++)
275 Index next_aohI = GetSuccessorAOHI(agentI, aohI, best_a, oI);
276 Index next_ohI = GetSuccessorOHI(agentI, ohI, oI);
277 ConstructPolicyRecursively(agentI, next_aohI, next_ohI, stage+1,