MultiAgentDecisionProcess Release 0.2.1
JESPDynamicProgrammingPlanner.cpp
#include "JESPDynamicProgrammingPlanner.h"
#include "JointPolicyPureVector.h"
#include "JointBeliefInterface.h"
#include "IndividualBeliefJESP.h"
#include <float.h>
#include <cmath>
#include <sstream>

using namespace std;

#define DEBUG_DPBR 0
#define DEBUG_DPJESP 0

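//JESPDynamicProgrammingPlanner implements JESP (Joint Equilibrium-based
//Search for Policies): it repeatedly computes a best response for one agent
//while keeping the policies of the other agents fixed. In this variant the
//best response is computed by dynamic programming over augmented
//(individual JESP) beliefs, rather than by exhaustive policy enumeration.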
JESPDynamicProgrammingPlanner::JESPDynamicProgrammingPlanner(
    const PlanningUnitMADPDiscreteParameters &params,
    size_t horizon,
    DecPOMDPDiscreteInterface* p
    ) :
    PlanningUnitDecPOMDPDiscrete(params, horizon, p),
    _m_foundPolicy(*this)
    //,_m_exhBRBestPol(*this)
{
}

JESPDynamicProgrammingPlanner::JESPDynamicProgrammingPlanner(
    int horizon,
    DecPOMDPDiscreteInterface* p
    ) :
    PlanningUnitDecPOMDPDiscrete(horizon, p),
    _m_foundPolicy(*this)
    //,_m_exhBRBestPol(*this)
{
}

void JESPDynamicProgrammingPlanner::Plan()
{
    if(DEBUG_DPJESP){
        cout << "\n---------------------------------"<<endl;
        cout << "DP JESP - Plan() started"<<endl;
        cout << "---------------------------------"<<endl;
    }
    double v_best = -DBL_MAX;
    JointPolicyPureVector* jpol = new JointPolicyPureVector(*this);
    JointPolicyPureVector* best = new JointPolicyPureVector(*this);
    jpol->RandomInitialization();
    //jpol->ZeroInitialization();

    if(DEBUG_DPJESP) {cout << "joint policy randomly initialized to:";
        jpol->Print();}

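    //JESP performs alternating maximization: each iteration keeps the
    //policies of all agents but one fixed and computes a best response for
    //the selected agent. The loop below stops when the best responses for
    //the other agents no longer improve the value (a local optimum), or
    //after a safety cap of 1000 iterations.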
    int stop = 0;
    size_t nr_non_improving_agents = 0;
    while(nr_non_improving_agents < GetReferred()->GetNrAgents() - 1
          && stop++ < 1000)
    {
        int agentI = GetNextAgentIndex();
        double v = DynamicProgrammingBestResponse(jpol, agentI);
        if(v > v_best + 1e-9)
        {
            (*best) = (*jpol);
            if(DEBUG_DPJESP)
            {cout << ">>>Plan: new best policy:"<<endl; best->Print();}
            v_best = v;
            nr_non_improving_agents = 0;
        }
        else
            nr_non_improving_agents++;
    }
    _m_foundPolicy = *best;

    if(DEBUG_DPJESP){
        cout << "DP JESP - resulting policy:"<<endl;
        cout << "------------------------------------"<<endl;
        best->Print();
    }
    delete jpol;
    delete best;
}

double JESPDynamicProgrammingPlanner::DynamicProgrammingBestResponse(
    JointPolicyPureVector* jpol, Index agentI)
{
#if DEBUG_DPBR
    cout << "JESPDynamicProgrammingPlanner::DynamicProgrammingBestResponse called "
         << "for agent " << agentI << endl;
#endif

    //create the initial *augmented* POMDP belief B^0(b^0, oHist_{!=i}^0)
    //(B is an augmented POMDP belief, b is a joint belief (just over states))
    //
    //DP
    //compute
    //  V(B^0) = \max_a Q(B^0, a)
    //         = \max_a [ R(B^0, a) + sum_o P(o|B^0,a) V(B^1) ]
    //
    //where a and o are individual actions/observations of agent i.
    //
    //BELIEF UPDATE
    //the belief update (computing B^1 from B^0) is done as follows
    //(we denote this agent by i, and the other agent by j; for simplicity
    //we assume there is just one other agent):
    //
    //  Bi^1(s',oHistj') = (1 / P(oi|Bi^0,ai)) *
    //      sum_s Bi^0(s, oHistj) P(s',oi,oj|s,ai,polj(oHistj))
    //
    //BELIEF implementation
    //in order to maintain a probability for each <s,oHist_j^t> pair
    //we need to enumerate them
    IndividualBeliefJESP B0( agentI, 0, *this );
    B0.Set( *GetProblem()->GetISD() );
    vector<Index> newpol (GetNrActionObservationHistories(agentI), 0);
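    //newpol[aohI] will hold the best-response action for each
    //action-observation history of agentI. DPBestResponseRecursively fills
    //it in; ConstructPolicyRecursively then copies the reachable entries
    //into jpol, which is indexed by observation histories.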
    double v0 = DPBestResponseRecursively(agentI, 0, B0, 0, jpol, newpol);
#if DEBUG_DPBR
//    { cout << "Best response V="<<v0<<endl;}
//    { cout << "policy="; jpol->Print();}
#endif
    ConstructPolicyRecursively(agentI, 0, 0, 0, jpol, newpol);
    return(v0);
}

double JESPDynamicProgrammingPlanner::DPBestResponseRecursively
    (
    const Index agentI, //the agent we are computing the best response for
    const Index aohI, //the action-observation history of agentI
    const IndividualBeliefJESP& B, //the belief for which we compute the value+action
    const Index stage, //the stage of B
    JointPolicyPureVector* jpol, //the joint policy
    vector<Index>& new_pol
    )
{

#if DEBUG_DPBR
    stringstream tabsss;
    for(Index tab=0; tab < stage; tab++)
        tabsss << "\t";
    string tabss = tabsss.str();
    cout << tabss<<">>DPBestResponseRecursively(ag="<<agentI<<", aoh="<<aohI
         <<", B, stage="<<stage<< ", jpol) called, with " << endl <<tabss<<"B="<<endl;
    B.Print();
#endif
    vector<Index> otherAgentIndices;
    for(Index agI=0; agI < GetNrAgents(); agI++)
        if(agI != agentI)
            otherAgentIndices.push_back(agI);

    size_t nrA = GetReferred()->GetNrActions(agentI);
    size_t nrO = GetReferred()->GetNrObservations(agentI);
    size_t nrAgents = GetNrAgents();
    double v_max = -DBL_MAX; //highest expected value
    Index a_br = 0; //and the corresponding best-response action
    for(Index actionI=0; actionI < nrA; actionI++) //compute the value of each action
    {
        double v_a = 0;
        //Compute the expected immediate reward
        double Rba = 0.0;
        for(Index eI=0; eI < B.Size(); eI++) //eI is an index over e=<s,oHistJ>
        {
//-> check if this should be put in a function?
//   (duplicated in IndividualBeliefJESP::Update() )
            Index sI = B.GetStateIndex(eI);
            vector<Index> oHistI_others = B.GetOthersObservationHistIndex(eI);
            vector<Index> actions(nrAgents);
            actions.at(agentI) = actionI;
            for(Index j=0; j < otherAgentIndices.size(); j++)
            {
                Index agJ = otherAgentIndices[j];
                Index oHistJ = oHistI_others[j]; //indexed by j, not by agJ!
                Index actJ = jpol->GetActionIndex(agJ, oHistJ);
                actions.at(agJ) = actJ;
            }
            Index jaI = IndividualToJointActionIndices(actions);
// <-
            Rba += B.Get(eI) * GetReward(sI, jaI);
        }
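        //at this point Rba = sum_e B(e) * R(s(e), ja(e)), where e ranges
        //over <state, other-agents' observation history> pairs and ja(e)
        //is the joint action <actionI, pol_j(oHistJ(e))>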

        //Compute the future reward
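        //(for each individual observation: update the augmented belief,
        // recurse one stage deeper to obtain V(B^{t+1}), and weight that
        // value by P(o|B,a); the probabilities are accumulated in check_p
        // as a sanity check)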
        double F = 0.0;
        if(stage < GetHorizon() - 1)
        {
            double check_p = 0.0;
            for(Index observI=0; observI < nrO; observI++)
            {
                IndividualBeliefJESP Bao(agentI, stage+1, *this);
                double Po_ba = Bao.Update(B, actionI, observI, jpol);
                Index next_aohI =
                    GetSuccessorAOHI(agentI, aohI, actionI, observI);
                double F_ao = DPBestResponseRecursively(agentI, next_aohI,
                        Bao, stage+1, jpol, new_pol);
                F += Po_ba * F_ao;
                check_p += Po_ba;
            }
            if( fabs(check_p - 1) > 1e-7 )
                throw E("Po_ba not summing to 1");
        }
        v_a = Rba + F;
#if DEBUG_DPBR
        cout << tabss<<"(stage="<<stage<<") actionI="<<actionI<<", Q(b,a)="<< v_a <<
            " (= R+F = "<<Rba<<" + "<< F <<")"<<endl;
#endif
        if(v_a > v_max)
        {
            v_max = v_a;
            a_br = actionI;
        }
    }
#if DEBUG_DPBR
    cout << tabss<<">>ENDED DPBestResponseRecursively(ag="<<agentI<<", aoh="<<aohI
         <<", B, stage="<<stage<< ", jpol), with " <<endl<<tabss<<"B="<<endl;
    cout << tabss<<"Selected actionI="<<a_br<<" with Q(b,a)="
         << v_max<< endl;
#endif
    //NO!!! do not write into jpol here: jpol is indexed by observation
    //histories, so setting the action for this aohI could overwrite the
    //best action already stored for the same observation history reached
    //via a different action history.
    //jpol->SetAction(agentI, aohI, a_br);
    new_pol.at(aohI) = a_br;
    return v_max;
}

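//ConstructPolicyRecursively walks the action-observation histories that are
//actually reached when following the best-response actions stored in
//new_pol, and writes those actions into jpol at the corresponding
//observation-history indices.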
void JESPDynamicProgrammingPlanner::ConstructPolicyRecursively
    (
    const Index agentI, //the agent we are computing the best response for
    const Index aohI, //the action-observation history of agentI
    const Index ohI, //the observation history of agentI
    const Index stage, //the current stage
    JointPolicyPureVector* jpol, //the joint policy
    vector<Index>& new_pol
    )
{
#if DEBUG_DPBR
    stringstream tabsss;
    for(Index tab=0; tab < stage; tab++)
        tabsss << "\t";
    string tabss = tabsss.str();
#endif
    //the action that is best at this aohI
    Index best_a = new_pol[aohI];
    jpol->SetAction(agentI, ohI, best_a);

    size_t nrO = GetReferred()->GetNrObservations(agentI);
    if(stage < GetHorizon() - 1)
    {
        for(Index oI=0; oI < nrO; oI++)
        {
            Index next_aohI = GetSuccessorAOHI(agentI, aohI, best_a, oI);
            Index next_ohI = GetSuccessorOHI(agentI, ohI, oI);
            ConstructPolicyRecursively(agentI, next_aohI, next_ohI, stage+1,
                    jpol, new_pol);
        }
    }
#if DEBUG_DPBR
#endif
}
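
//Minimal usage sketch (assumes a DecPOMDPDiscreteInterface* decpomdp and a
//planning horizon h; the accessor used to retrieve the found joint policy
//is declared in JESPDynamicProgrammingPlanner.h, not shown in this file):
//
//    JESPDynamicProgrammingPlanner jesp(h, decpomdp);
//    jesp.Plan();
//    //the best joint policy found is now stored in _m_foundPolicy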