MultiAgentDecisionProcess  Release 0.2.1
BayesianGameForDecPOMDPStage.cpp
//(assumed additional includes, needed by the definitions below)
#include "BayesianGameForDecPOMDPStage.h"
#include "PartialJointPolicyPureVector.h"
#include "JointPolicyPureVector.h"
#include "JointBeliefInterface.h"
#include "QFunctionJAOHInterface.h"
#include "BeliefIteratorGeneric.h"

#define DEBUG_BG4DECPOMDP1 0
#define DEBUG_BG4DECPOMDP2 0
#define DEBUG_BG4DECPOMDP3 0
#define DEBUG_BG4DECPOMDP4 0

using namespace std;

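//Overview (added note): this file implements a Bayesian game with identical
//payoffs that models a single stage ts of a Dec-POMDP, given a past joint
//policy jpol^(ts-1) and a heuristic Q-value function:
// - a joint type corresponds to a joint observation history of length ts,
// - its probability is P(jaoh | b^0, jpol^(ts-1)),
// - its utility for joint action ja is the heuristic value Q(jaoh, ja).
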
//Default constructor
BayesianGameForDecPOMDPStage::BayesianGameForDecPOMDPStage(
        const PlanningUnitDecPOMDPDiscrete* pu, //(reconstructed parameter)
        const QFunctionJAOHInterface* q,
        const PartialJointPolicyDiscretePure* pastJPol
        )
    :
    BayesianGameForDecPOMDPStageInterface( pastJPol ), //(assumed base initializer)
    //BayesianGameBase(
        //pu->GetNrAgents(),
        //pu->GetNrActions(),
        //pu->GetNrObservationHistoriesVector( pastJPol->GetDepth() )),
    BayesianGameIdenticalPayoff(
        pu->GetNrAgents(),
        pu->GetNrActions(),
        pu->GetNrObservationHistoriesVector( pastJPol->GetDepth() )
        //the depth of the past policy = the stage we construct the BG
        //for ( jpol^ts-1 has depth ts )
        )
    ,_m_pu(pu)
    ,_m_qHeuristic(q)
    //,_m_pastReward(0.0)
    ,_m_JBs( GetNrJointTypes() )
    ,_m_areCachedImmediateRewards(false)
{
    //extra stuff we need to do...
    Initialize();
}
//Constructor that does not fill the BG (the heuristic and beliefs are left empty)
BayesianGameForDecPOMDPStage::BayesianGameForDecPOMDPStage(
        const PlanningUnitDecPOMDPDiscrete* pu //(reconstructed parameter)
        )
    :
    BayesianGameForDecPOMDPStageInterface( 0 ), //(assumed base initializer)
    BayesianGameIdenticalPayoff(
        pu->GetNrAgents(),
        pu->GetNrActions(),
        pu->GetNrObservationHistoriesVector(0)
        )
    ,_m_pu(pu)
    ,_m_qHeuristic(0)
    ,_m_JBs( 0 )
    ,_m_areCachedImmediateRewards(false)
{
}
//Protected constructor used by derived classes.
BayesianGameForDecPOMDPStage::BayesianGameForDecPOMDPStage(
        const PlanningUnitDecPOMDPDiscrete* pu, //(reconstructed parameter)
        const QFunctionJAOHInterface* q,
        Index t,
        size_t nrAgents,
        const vector<size_t>& nrActions,
        const vector<size_t>& nrTypes
        )
    :
    BayesianGameForDecPOMDPStageInterface( t ) //(assumed base initializer)
    //,BayesianGameBase( nrAgents, nrActions, nrTypes )
    ,BayesianGameIdenticalPayoff( nrAgents, nrActions, nrTypes )
    ,_m_pu(pu)
    ,_m_qHeuristic(q)
    ,_m_JBs( GetNrJointTypes() )
    ,_m_areCachedImmediateRewards(false)
{
    //here we do NOT do any extra stuff! The derived class (that called this
    //protected constructor) is responsible for filling
    //-the utilities
    //-the probabilities
    //-_m_JBs
    //-...etc.
}
//Copy constructor.
BayesianGameForDecPOMDPStage::BayesianGameForDecPOMDPStage(
        const BayesianGameForDecPOMDPStage& o)
    :
    BayesianGameForDecPOMDPStageInterface( o ), //(assumed base initializer)
    //BayesianGameBase( o ),
    BayesianGameIdenticalPayoff( o ) //(assumed base initializer)
    ,_m_pu(o._m_pu)
    ,_m_qHeuristic(o._m_qHeuristic)
    //,_m_pastReward(o._m_pastReward)
    ,_m_areCachedImmediateRewards(o._m_areCachedImmediateRewards)
    ,_m_immR(o._m_immR) //does this work for std::vector< std::vector<double> >? guess so
{
    //make deep copy of beliefs in _m_JBs
    //\todo The following members should be deep copied (because we free
    //this stuff on deletion of the object); no point in implementing
    //this right now... TODO
    //_m_JBs
    //_m_typeLists
    throw( E("BayesianGameForDecPOMDPStage copy constructor not fully implemented yet.") );
}
//Destructor
BayesianGameForDecPOMDPStage::~BayesianGameForDecPOMDPStage()
{
    //we don't need to delete any of the other members, it seems.
    //free all the joint beliefs:
    std::vector< JointBeliefInterface* >::iterator it = _m_JBs.begin();
    std::vector< JointBeliefInterface* >::iterator last = _m_JBs.end();
    while(it != last)
    {
        delete *it;
        it++;
    }
}
//Copy assignment operator
BayesianGameForDecPOMDPStage&
BayesianGameForDecPOMDPStage::operator= (const BayesianGameForDecPOMDPStage& o)
{
    if (this == &o) return *this; // Gracefully handle self assignment
    // Put the normal assignment duties here...
    BayesianGameIdenticalPayoff::operator=(o); //(assumed: also assign the base part)
    _m_pu = o._m_pu;
    _m_qHeuristic = o._m_qHeuristic; //(assumed, mirrors the copy constructor)
    //_m_pastReward = o._m_pastReward;
    //TODO make deep copy of _m_JBs
    _m_areCachedImmediateRewards = o._m_areCachedImmediateRewards; //(assumed)
    _m_immR = o._m_immR; //does this work for std::vector< std::vector<double> >? guess so
    throw E("BayesianGameForDecPOMDPStage::operator= not fully implemented yet...");

    return *this;
}


void BayesianGameForDecPOMDPStage::Fill_FirstOHtsI(Index ts,
        vector<Index>& firstOHtsI)
{
    //because the OHs are constructed breadth-first, we know the OHs for agent i
    //for this time step are numbered:
    //firstOHtsGI[i]...firstOHtsGI[i]+nrOH[i]-1
    //
    //(read: first-OH-for-time-step-ts its Global Index)
    //
    //i.e. ohGI = ohI + firstOHtsGI

    for(Index agI=0; agI < GetNrAgents(); agI++)
    {
        Index fI = _m_pu->GetFirstObservationHistoryIndex(agI, ts); //(assumed API call)
        firstOHtsI.push_back(fI);
    }
}
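//Example of the index mapping above (added note, illustrative numbers): if
//agent i has 2 observations, its observation histories are numbered 0 (empty
//history), 1-2 (length 1) and 3-6 (length 2); for ts=2 this gives
//firstOHtsI[i] = 3, so a BG type index 'type' maps to ohI = type + 3.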


//this function extends a previous policy jpolPrevTs for ts-1 with the
//behavior specified by the policy of the BayesianGame for time step ts (jpolBG).
PartialJointPolicyDiscretePure* BayesianGameForDecPOMDPStage::ConstructExtendedPolicy(
        //(return type and name reconstructed)
        PartialJointPolicyDiscretePure& jpolPrevTs,
        JointPolicyDiscretePure& jpolBG,
        vector<size_t>& nrOHts,
        vector<Index>& firstOHtsI
        )
{
    //check policy types
    if(jpolPrevTs.GetIndexDomainCategory() != PolicyGlobals::OHIST_INDEX) //(assumed check)
        throw E("BayesianGameForDecPOMDPStage::ConstructExtendedPolicy --- jpolPrevTs.GetIndexDomainCategory() != PolicyGlobals::OHIST_INDEX ");
    if(jpolBG.GetIndexDomainCategory() != PolicyGlobals::TYPE_INDEX) //(assumed check)
        throw E("BayesianGameForDecPOMDPStage::ConstructExtendedPolicy --- jpolBG.GetIndexDomainCategory() != PolicyGlobals::TYPE_INDEX ");
    //construct a policy for the Dec-POMDP:
    //a copy of jpolPrevTs extended to this time step (ts) by jpolBG
    PartialJointPolicyDiscretePure* jpolTs = new
        PartialJointPolicyPureVector(jpolPrevTs);
    jpolTs->SetDepth( jpolTs->GetDepth()+1 );
    for(Index agentI=0; agentI < GetNrAgents(); agentI++)
    {
        for(Index type = 0; type < nrOHts[agentI]; type++)
        {
            Index ohI = type + firstOHtsI[agentI];
            jpolTs->SetAction(agentI, ohI,
                              jpolBG.GetActionIndex(agentI, type) );
        }
    }
    return(jpolTs);
}
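//Usage note (added): the returned policy is allocated with 'new' and wraps a
//copy of jpolPrevTs, so the caller owns it and must delete it; jpolPrevTs
//itself is left unchanged.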

void BayesianGameForDecPOMDPStage::Fill_joI_Array(const Index ts,
        //(name reconstructed)
        const vector<Index>& indTypes,
        const vector<Index>& firstOHtsI,
        Index* joI_arr)
{
    //convert indiv. type indices to indiv. observation history indices:
    vector<Index> indOHI = vector<Index>(indTypes);
    // indivObservations[tI][agI] will contain the observation agI received at tI+1
    vector< vector<Index> > indivObservations(ts, vector<Index>(GetNrAgents()) );
    for(Index agentI=0; agentI < GetNrAgents(); agentI++)
    {
        indOHI[agentI] += firstOHtsI[agentI];
        Index obsArr[ts];
        _m_pu->GetObservationHistoryArrays(agentI, indOHI[agentI], ts, obsArr);
        //now obsArr is filled and can be copied into indivObservations
        for(Index tI=0; tI < ts; tI++)
            indivObservations.at(tI).at(agentI) = obsArr[tI];
    }

    for(Index tI=0; tI < ts; tI++)
        joI_arr[tI] = _m_pu->IndividualToJointObservationIndices( //(assumed call)
                indivObservations[tI] );
}

//compute the joint actions taken by jpolPrevTs when joIs is the true joint
//observation history at stage ts.
void BayesianGameForDecPOMDPStage::Fill_jaI_Array(
        //(name reconstructed)
        Index ts,
        Index joIs[], //the array of joint observations received
        const JointPolicyDiscretePure* jpolPrevTs,
        Index* jaI_arr
        )
{
    Index johI = 0;
    Index t = 0;
    while(t < ts)
    {
        Index ja = jpolPrevTs->GetJointActionIndex(johI);
        jaI_arr[t] = ja;

        Index next_joI = joIs[t];
        johI = _m_pu->GetSuccessorJOHI(johI, next_joI);
        t++;
    }
}
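//Walk-through of the loop above (added note): starting from the empty joint
//observation history (johI = 0), at every step t the joint action prescribed
//by jpolPrevTs for the current history is recorded in jaI_arr[t], after which
//johI is advanced with the joint observation joIs[t]. After the loop,
//jaI_arr holds the joint actions taken at stages 0..ts-1.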


//Computes, for the joint action/observation sequences jaI_arr/joI_arr up to
//stage ts, the resulting joint action-observation history index jaohI, its
//probability PjaohI, and the expected reward accumulated over stages
//0...ts-1 given that this JAOH occurs.
void BayesianGameForDecPOMDPStage::ProbRewardForjoahI(
        //(name reconstructed)
        //input args
        Index ts, Index jtI, Index* jaI_arr, Index* joI_arr,
        //output args
        Index& jaohI, double& PjaohI, double& ExpR_0_prevTS_thisJAOH )
{
    //first we get the initial jaoh
    JointActionObservationHistoryTree* jaoht =                      //(assumed call)
        _m_pu->GetJointActionObservationHistoryTree(Globals::INITIAL_JAOHI);

    double CPjaohI = 1.0;
    PjaohI = CPjaohI; // == 1.0

    // get the initial belief
    JointBeliefInterface* jb = _m_pu->GetNewJointBeliefFromISD(); //(assumed call)

    Index tI = 0;
    while(tI < ts)
    {
        //calculate the R for tI
        double ExpR_0_prevTS_thisJAOH_thisT = 0.0;
#if USE_BeliefIteratorGeneric
        BeliefIteratorGeneric it = jb->GetIterator(); //(assumed call)
        do
        {
            double R_si_ja = _m_pu->GetReward(it.GetStateIndex(), jaI_arr[tI]);
            if(DEBUG_BG4DECPOMDP4)
                cout << "R(s="<<it.GetStateIndex()<<",ja="<<jaI_arr[tI]<<")="<< R_si_ja << "\n";
            ExpR_0_prevTS_thisJAOH_thisT += it.GetProbability() * R_si_ja;
        } while(it.Next());
#else
        for(Index sI=0; sI < _m_pu->GetNrStates(); sI++)
        {
            double R_si_ja = _m_pu->GetReward(sI, jaI_arr[tI]);
            if(DEBUG_BG4DECPOMDP4)
                cout << "R(s="<<sI<<",ja="<<jaI_arr[tI]<<")="<< R_si_ja << "\n";
            ExpR_0_prevTS_thisJAOH_thisT += jb->Get(sI) * R_si_ja;
        }
#endif
        ExpR_0_prevTS_thisJAOH += ExpR_0_prevTS_thisJAOH_thisT;
        if(DEBUG_BG4DECPOMDP4)
        {
            cout << "calculating expected reward R(oaHist,a) for tI="<<tI
                 <<" oaHist:"; jaoht->GetJointActionObservationHistory()->Print();
            cout << endl; cout << "R(b,a) (exp reward for jtI=" << jtI <<
                ", tI="<<tI<<") is "<< ExpR_0_prevTS_thisJAOH_thisT <<endl;
        }
        jaoht = jaoht->GetSuccessor( jaI_arr[tI], joI_arr[tI] );
        jaohI = jaoht->GetIndex();

        CPjaohI = jb->Update( *_m_pu->GetReferred(), jaI_arr[tI], joI_arr[tI] );
        PjaohI = PjaohI * CPjaohI;
        tI++;
    }
    delete jb; //free the belief allocated with 'new'
    if(DEBUG_BG4DECPOMDP4)
    {
        cout << "expected previous reward (up to ts-1) for (jtI=" << jtI <<
            ") ";
        jaoht->GetJointActionObservationHistory()->Print(); //(assumed)
        cout << " is "<< ExpR_0_prevTS_thisJAOH <<endl << endl;
    }
}
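//In other words (added summary): PjaohI is the product of the per-stage
//belief-update probabilities P(o_{t+1} | b_t, a_t), and
//ExpR_0_prevTS_thisJAOH is the sum over stages t < ts of the expected
//immediate reward E_{s ~ b_t}[ R(s, a_t) ] along this particular
//action-observation history.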

/*
// ExpR_0_prevTS is returned by this function.
BayesianGameIdenticalPayoff *
BayesianGameForDecPOMDPStage::ConstructBayesianGame (
    //input
    const JointPolicyDiscretePure* jpolPrevTs,
    //output arguments:
    vector<size_t>& nrOHts,
    size_t& nrJOHts,
    vector<Index>& firstOHtsI,
    double &ExpR_0_prevTS)// const //Should be fixed: can't make const because of timers...
{
    size_t depth = jpolPrevTs->GetDepth(); // = depth = ts(jpolPrevTs) + 1

    //stringstream ss;
    //ss << "GMAA::ConstructBG_ts" << depth;
    //_m_pu->StartTimer(ss.str());

    if(DEBUG_BG4DECPOMDP1)
        cout << ">>>ConstructBayesianGame(from ConstructAndValuateNextPolicies)"
             << " ts=" << depth << endl;
    if(DEBUG_BG4DECPOMDP2)
        cout <<" - previous policy: " << jpolPrevTs->SoftPrintBrief() << endl;

    Index ts = depth; //jpol = jpol^ts-1, we construct BG for ts == depth
    //vector<size_t> nrOHts; //the number of types for the BG we're constructing

    for(Index agentI = 0; agentI < GetNrAgents(); agentI++)
        nrOHts.push_back( _m_pu->GetNrObservationHistories(agentI, ts) );
    //size_t
    nrJOHts = _m_pu->GetNrJointObservationHistories(ts);

    //stores the indices of the first OH for time step ts for each agent
    //vector<Index> firstOHtsI;
    Fill_FirstOHtsI(ts, firstOHtsI);

    //the expected reward for time-steps 0...ts-1
    //this will be calculated as Sum_jaoh P(jaoh) R_{0...ts-1}(jaoh)
    ExpR_0_prevTS = 0.0;

    //initialize the BG:
    BayesianGameIdenticalPayoff *bg_ts=this;
        // new BayesianGameIdenticalPayoff(GetNrAgents(), GetNrActions(), nrOHts);

    if(DEBUG_BG4DECPOMDP2)
        cout <<"constructing the BG for time-step..." <<ts<<endl;

    //for each joint obs. history (type of the BG), we determine the actions
    //that jpolPrevTs would have specified (i.e., we determine JAOH, the act-obs.
    //history). This is then used to compute:
    // -the probability of this joint obs. history (given jpolPrevTs)
    // -the expected reward over 0...ts-1 GIVEN that this JAOH occurs
    for(Index jtI = 0; jtI < nrJOHts; jtI++)
    {
        if(DEBUG_BG4DECPOMDP2)
            PrintProgress("jtI",jtI,nrJOHts, 10);

        //we loop over Joint Type indices - these correspond to
        //joint observation history indices, but non-trivially, so let's first
        //compute the joint observation history
        const vector<Index> indTypes = bg_ts->JointToIndividualTypeIndices(jtI);

        //array for the joint observations at ts=1,...,ts
        Index joI_arr[ts];
        Fill_joI_Array(ts, indTypes, firstOHtsI, joI_arr);
        const JointObservationHistoryTree* joht = Get_joht(ts, joI_arr);

        //see what joint action-observation history corresponds to
        // previous policy jpolPrevTs

        //first get all actions taken
        Index jaI_arr[ts];//the taken joint actions at t=0,...,ts-1
        Fill_jaI_Array(ts, joht, jpolPrevTs, jaI_arr);
        //now we know the taken actions and the observation history, so we
        //can reconstruct the joint action-observation history and its
        //probability.
        Index jaohI = 0;
        double PjaohI = 0.0;
        //the cumulative reward over 0...ts-1 GIVEN that this JAOH occurs.
        double ExpR_0_prevTS_thisJAOH = 0.0;
        //get the joah Index and corresponding reward and prob.
        ProbRewardForjoahI(ts, jtI, jaI_arr,joI_arr, jaohI, PjaohI,
                           ExpR_0_prevTS_thisJAOH );
        ExpR_0_prevTS += ExpR_0_prevTS_thisJAOH * PjaohI;

        //now we have found the jaohI corresponding to johI (jtI) and
        //previous policy jpolPrevTs, so we can get the Q-value and prob. for
        //the BG.
        bg_ts->SetProbability(jtI, PjaohI);
        for(Index jaI=0; jaI < GetNrJointActions(); jaI++)
        {
            if(PjaohI>0) // asking for a heuristic Q for a history
                         // that cannot have occurred might lead to
                         // problems (QMDP cannot compute a belief for
                         // instance, so just put 0
            {
                double ut = _m_qHeuristic->GetQ(jaohI, jaI);
                bg_ts->SetUtility(jtI, jaI, ut );
            }
            else
                bg_ts->SetUtility(jtI, jaI, 0);
        }

    }//end for jtI
    //now the Bayesian game is constructed completely.

    if(DEBUG_BG4DECPOMDP2)
        cout <<"BG for time-step" <<ts<<" constructed, nrJOHts="
             << nrJOHts << ", nrJPols=" << bg_ts->GetNrJointPolicies() << endl;
#if DEBUG_BG4DECPOMDP3
    if(DEBUG_BG4DECPOMDP3)
        cout << "previously obtained expected reward="<<ExpR_0_prevTS<<endl;
#endif

    if(bg_ts->GetNrJointPolicies()==0)
        throw(E("BayesianGameForDecPOMDPStage:ConstructBayesianGame nrJPols==0, possible overflow?"));

    //StopTimer(ss.str());
    //_m_bgCounter++;
    //if(_m_bgBaseFilename!="")
    //{
        //stringstream ss;
        //ss << _m_bgBaseFilename << _m_bgCounter;
        //BayesianGameIdenticalPayoff::Save(*bg_ts,ss.str());
    //}
    return(bg_ts);
}
*/

void BayesianGameForDecPOMDPStage::Initialize()
{
    //stringstream ss;
    //ss << "GMAA::ConstructBG_ts" << depth;
    //_m_pu->StartTimer(ss.str());

    Index ts = _m_t;
    if(_m_pJPol == 0)
    {
        cerr << "Warning: Initialize called without past joint policy: aborting."
             << endl;
        return;
    }

    if(DEBUG_BG4DECPOMDP1)
        cout << ">>>BayesianGameForDecPOMDPStage::Initialize() called for"
             << " ts=" << ts << endl;
    if(DEBUG_BG4DECPOMDP2)
        cout << " - previous policy: " << _m_pJPol->SoftPrintBrief() << endl;

    size_t nrJOHts = _m_pu->GetNrJointObservationHistories(ts);
    const JointPolicyDiscretePure* jpolPrevTs = _m_pJPol;

    //stores the indices of the first OH for time step ts for each agent
    vector<Index> firstOHtsI;
    Fill_FirstOHtsI(ts, firstOHtsI);

    //the expected reward for time-steps 0...ts-1
    //this will be calculated as Sum_jaoh P(jaoh) R_{0...ts-1}(jaoh)
    //double ExpR_0_prevTS = 0.0;

    BayesianGameIdenticalPayoff *bg_ts = this;
    //for each joint obs. history (type of the BG), we determine the actions
    //that jpolPrevTs would have specified (i.e., we determine JAOH, the
    //act-obs. history). This is then used to compute:
    // -the probability of this joint obs. history (given jpolPrevTs)
    // -the expected reward over 0...ts-1 GIVEN that this JAOH occurs
    for(Index jtI = 0; jtI < nrJOHts; jtI++)
    {
        if(DEBUG_BG4DECPOMDP2)
            PrintProgress("jtI", jtI, nrJOHts, 10);

        //we loop over Joint Type indices - these correspond to
        //joint observation history indices, but non-trivially, so let's first
        //compute the joint observation history
        const vector<Index> indTypes = bg_ts->JointToIndividualTypeIndices(jtI);

        //array for the joint observations at t=1,...,ts
        Index joI_arr[ts];
        Fill_joI_Array(ts, indTypes, firstOHtsI, joI_arr);

        //we don't want to be dependent on the generation of joint
        //observation histories
        //const JointObservationHistoryTree* joht = Get_joht(ts, joI_arr);
        //Index johI = _m_pu->GetJointObservationHistoryIndex(ts, joI_arr);

        //see what joint action-observation history corresponds to the
        //previous policy jpolPrevTs

        //first get all actions taken
        Index jaI_arr[ts];//the joint actions taken at t=0,...,ts-1
        Fill_jaI_Array(ts, joI_arr, jpolPrevTs, jaI_arr);
        //now we know the taken actions and the observation history, so we
        //can reconstruct the joint action-observation history and its
        //probability.

        //the cumulative reward over 0...ts-1 GIVEN that this JAOH occurs.
        //double ExpR_0_prevTS_thisJAOH = 0.0;
        //get the jaoh Index and corresponding reward and prob.
        //new:
        Index jaohI = 0;
        double PjaohI = 1.0;
        JointBeliefInterface* jb = _m_pu->GetNewJointBeliefFromISD(); //(assumed call)
        if(ts > 0)
        {
            vector< Index > jaI_vec(&jaI_arr[0], &jaI_arr[ts]);
            vector< Index > joI_vec(&joI_arr[0], &joI_arr[ts]);
            jaohI = _m_pu->GetJointActionObservationHistoryIndex(ts, jaI_vec,
                                                                 joI_vec);
            PjaohI = _m_pu->GetJAOHProbsRecursively(jb, jaI_arr, joI_arr, 0, ts);
        }
        _m_JBs.at(jtI) = jb;

/*      //old:
        ProbRewardForjoahI(ts, jtI, jaI_arr, joI_arr, jaohI, PjaohI,
                           ExpR_0_prevTS_thisJAOH );
        ExpR_0_prevTS += ExpR_0_prevTS_thisJAOH * PjaohI;
*/

        //now we have found the jaohI corresponding to johI (jtI) and the
        //previous policy jpolPrevTs, so we can get the Q-value and prob. for
        //the BG.
        bg_ts->SetProbability(jtI, PjaohI);
        for(Index jaI=0; jaI < GetNrJointActions(); jaI++)
        {
            if(PjaohI > 0) // asking for a heuristic Q for a history
                           // that cannot have occurred might lead to
                           // problems (QMDP cannot compute a belief, for
                           // instance), so just put 0
            {
                double ut = _m_qHeuristic->GetQ(jaohI, jaI);
                bg_ts->SetUtility(jtI, jaI, ut );
            }
            else
                bg_ts->SetUtility(jtI, jaI, 0);
        }

    }//end for jtI
    //now the Bayesian game is constructed completely.

    //perhaps store the previous reward somewhere?
    //_m_pastReward = ExpR_0_prevTS;
}
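//Net effect of Initialize() (added summary): after this call the BG holds,
//for every joint type jtI (= joint observation history of length ts),
//  P(jtI)      = P(jaohI | b^0, jpolPrevTs)  (set via SetProbability), and
//  u(jtI, jaI) = Q(jaohI, jaI)               (set via SetUtility),
//while _m_JBs[jtI] caches the joint belief that results from following
//jpolPrevTs along that history.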


void BayesianGameForDecPOMDPStage::ComputeAllImmediateRewards()
{
    if(_m_areCachedImmediateRewards) //(assumed guard)
        _m_immR.clear();

    size_t nrJT = GetNrJointTypes();
    size_t nrJA = GetNrJointActions();
    _m_immR = vector< vector<double> >(nrJT, vector<double>(nrJA, 0.0) );

    for(Index jtI=0; jtI < nrJT; jtI++)
        for(Index jaI=0; jaI < nrJA; jaI++)
            _m_immR.at(jtI).at(jaI) = ComputeImmediateReward(jtI, jaI);

    _m_areCachedImmediateRewards = true; //(assumed: mark the cache as filled)
}

double BayesianGameForDecPOMDPStage::ComputeImmediateReward(Index jtI, Index jaI)
    //(reconstructed signature)
{
    //Index jaohI = _m_jaohReps.at(jtI);
    JointBeliefInterface* jb = _m_JBs.at(jtI);
    BeliefIteratorGeneric bit = jb->GetIterator(); //(assumed call)
    double r = 0.0;
    do{
        Index s = bit.GetStateIndex();
        double p = bit.GetProbability();
        r += p * _m_pu->GetReward(s, jaI);
    }while( bit.Next() );
    return r;
}
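//Equivalently (added summary): the cached immediate reward for joint type jtI
//and joint action jaI is the expectation of the Dec-POMDP reward under the
//joint belief associated with that type,
//  R(jtI, jaI) = sum_s b_{jtI}(s) * R(s, jaI).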

double BayesianGameForDecPOMDPStage::ComputeDiscountedImmediateRewardForJPol(
        const JointPolicyDiscretePure* jpolBG) const //(reconstructed signature)
{
    //evaluate the expected immediate payoff of this jpolBG
    double r = 0.0; //immediate reward (exact)
    size_t nrJT = this->GetNrJointTypes();
    for(Index jt = 0; jt < nrJT; jt++)
    {
        Index jaI = jpolBG->GetJointActionIndex(jt);
        double jt_prob = this->GetProbability(jt);
        double jt_r = this->GetImmediateReward(jt, jaI);
        r += jt_prob * jt_r;
    }
    double discount = _m_pu->GetDiscount();
    double discT = pow(discount, (double)(_m_t) );
    double discounted_r = discT * r;
    return discounted_r;
}
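//In formula form (added summary): for a BG policy beta,
//  value = gamma^t * sum_{jt} P(jt) * R(jt, beta(jt)),
//i.e. the expected immediate reward of stage t under the type distribution,
//discounted back to stage 0 with discount factor gamma = GetDiscount().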


string BayesianGameForDecPOMDPStage::SoftPrint() const //(reconstructed signature)
{
    stringstream ss;
    ss << "Bayesian Game for stage t="<<_m_t<<" of a Dec-POMDP"<<endl;
    if(_m_pJPol != 0)
        ss << "Past policy that led to this BG=" << _m_pJPol->SoftPrint()<<endl;
    return(ss.str());
}