MultiAgentDecisionProcess  Release 0.2.1
GMAA_MAAstar.cpp
#include <vector>
#include <float.h>
#include "GMAA_MAAstar.h"
#include "JPPVValuePair.h"
#include "BayesianGameForDecPOMDPStage.h" //assumed missing include: this type is used below


using namespace std;

GMAA_MAAstar::GMAA_MAAstar(
    const PlanningUnitMADPDiscreteParameters &params, //parameter type assumed
    size_t horizon,
    DecPOMDPDiscreteInterface* p, //parameter type assumed
    int verboseness
    ) :
//    PlanningUnitDecPOMDPDiscrete(params, horizon, p), //virtual base must be called directly
    GeneralizedMAAStarPlannerForDecPOMDPDiscrete(params, horizon, p, verboseness)
{
}

GMAA_MAAstar::GMAA_MAAstar(
    size_t horizon,
    DecPOMDPDiscreteInterface* p) : //second parameter assumed from the base-class call conventions
    GeneralizedMAAStarPlannerForDecPOMDPDiscrete(horizon, p)
{
}

bool GMAA_MAAstar::ConstructAndValuateNextPolicies( //signature reconstructed; pool types assumed
    PartialPolicyPoolItemInterface* ppi,
    PartialPolicyPoolInterface* poolOfNextPolicies)
{
    //return(ConstructAndValuateNextPoliciesExactBG( ppi, poolOfNextPolicies));
    //moved this function here since MAA* is the only GMAA variant that does it
    //in this way...
    PartialJointPolicyPureVector* jpolPrevTs = dynamic_cast
        <PartialJointPolicyPureVector* >(ppi->GetJPol());//jpol^ts-1
    size_t depth = jpolPrevTs->GetDepth(); // = depth = ts(jpolPrevTs) + 1
    size_t ts = depth; //jpol = jpol^ts-1, we construct BG for ts == depth
    bool is_last_ts = (ts == GetHorizon() - 1);

    double pastReward_prevTs = jpolPrevTs->GetPastReward();

    vector<Index> firstOHtsI(GetNrAgents());
    for(Index agI=0; agI < GetNrAgents(); agI++)
        firstOHtsI.at(agI) = GetFirstObservationHistoryIndex(agI, ts);
    //size_t nrJOHts;

    // Construct the bayesian game for this timestep -
    //BayesianGameIdenticalPayoff *bg_ts=ConstructBayesianGame(jpolPrevTs,
    //nrOHts, nrJOHts, firstOHtsI, ExpR_0_prevTS);
    BayesianGameForDecPOMDPStage* bg_ts = new BayesianGameForDecPOMDPStage( //type assumed from the accessors used below
        this,
        _m_qHeuristic, //assumed: the planner's heuristic Q-function member
        jpolPrevTs
        );
    // This also returns ExpR_0_prevTS (the expected reward for time-steps
    // 0...ts-1 (given jpolPrevTs) ), nrOHts, nrJOHts and firstOHtsI.
//    double ExpR_0_prevTS = bg_ts->GetPastReward();

    const vector<size_t>& nrOHts = bg_ts->GetNrTypes();
#if DEBUG_GMAA3
    if(_m_verboseness >= 3) {
        cout << "Constructed BG:"<<endl;
        bg_ts->Print();
    }
#endif
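    // Illustrative note (not in the original source): the enumeration below
    // walks over every pure joint BG policy via operator++ on jpolBG. Agent i
    // has nrOHts[i] types (one per length-ts observation history) and |A_i|
    // actions, so the number of joint BG policies is
    //     nrJPols = prod_i |A_i|^(nrOHts[i]).
    // E.g. 2 agents with 2 actions and 4 types each already gives
    // 2^4 * 2^4 = 256 joint policies, which is why this enumeration dominates
    // the cost of expanding a MAA* node.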


    //the policy for the Bayesian game
    JointPolicyPureVector jpolBG = JointPolicyPureVector(*bg_ts); //reconstructed: declared as bestLBjpolBG below

    /* Do the full enumeration of all joint policies for the BG...*/
    bool carry_over = false;
    LIndex nrJPols = bg_ts->GetNrJointPolicies();
    LIndex i = 0;

    if(_m_verboseness >= 3)
        cout << "starting on solution of BG for t="<<ts<<" with nrJPols="
            <<nrJPols<<endl;

    stringstream ss;
    ss << "GMAA::NextExact_ts" << ts;
    StartTimer(ss.str());

    //some variables used when this is the last time-step
    double maxLBv = -DBL_MAX;
    JointPolicyPureVector bestLBjpolBG = JointPolicyPureVector(*bg_ts);
    double newPastReward = 0.0;
    //we cache the immediate rewards in the BG...
    bg_ts->ComputeAllImmediateRewards(); //assumed counterpart of ClearAllImmediateRewards() below
    while(!carry_over)
    {
        if(_m_verboseness >= 0)
            PrintProgress("BG joint policy", i++, nrJPols, 10000);
        //eval the expected future payoff of this jpolBG
        double f = 0.0;
        double r = 0.0; //immediate reward (exact)
        size_t nrJT = bg_ts->GetNrJointTypes();
        for(Index jt = 0; jt < nrJT ; jt++)
        {
            Index jaI = jpolBG.GetJointActionIndex(jt);
            double jt_prob = bg_ts->GetProbability(jt);
            double jt_r = bg_ts->GetImmediateReward(jt, jaI);
            r += jt_prob * jt_r;
            double jt_util = bg_ts->GetUtility(jt, jaI);
            f += jt_prob * jt_util;
        }
        //add the expected reward for 0...ts-1
        //NOTE the expected reward has been computed as
        //f = R + gamma * F
        //by the heuristic. That means that f itself is not yet
        //discounted!!:
        double discount = GetDiscount();
        double discT = pow(discount , (double)(ts) );
        double discounted_F = discT * f;
        //double v = ExpR_0_prevTS + discounted_F;
        double v = pastReward_prevTs + discounted_F;
        // the new past reward:
        double discounted_r = discT * r;

#if DEBUG_GMAA4
        cout <<"v = pastReward_prevTs + g^t * f = "
            << v <<" = "
            << pastReward_prevTs<<" + "
            << discounted_F << " "
            << "(g^t * f ="<<discT << " * " << f << ")"
            << endl;
#endif
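        // Worked example (illustrative numbers, not from the original source):
        // with discount g = 0.9, ts = 2, pastReward_prevTs = 1.5 and heuristic
        // payoff f = 4.0 for this BG policy, the node value used for ordering is
        //     v = 1.5 + 0.9^2 * 4.0 = 1.5 + 3.24 = 4.74,
        // whereas the exact past reward handed to the child node uses only the
        // exact immediate reward r:  newPastReward = 1.5 + 0.9^2 * r.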

        newPastReward = pastReward_prevTs + discounted_r;
        if(!is_last_ts)
        {
            //not the last time step...construct and return all
            //partial DecPOMDP policies,

            // construct a policy for the DecPOMDP: a copy of jpolPrevTs
            // extended to this time step (ts) by jpolBG
            PartialJointPolicyDiscretePure* jpolTs = //return type of ConstructExtendedJointPolicy assumed
                ConstructExtendedJointPolicy(*jpolPrevTs,
                    jpolBG, nrOHts, firstOHtsI);
            jpolTs->SetPastReward(newPastReward);

            // wrap the policy and put it in the pool of next policies.
            // (these next policies are candidates to be added to the
            // main policy pool)
            // Warning: jpolTs may have been deleted afterwards
            poolOfNextPolicies->Insert( NewPPI(jpolTs, v) );

            // conserve memory
            // is this necessary? - this is performed by
            // 'SelectPoliciesToProcessFurther'
            // seems like duplicate work...
            //if(poolOfNextPolicies->Size()>(_m_nrPoliciesToProcess*10))
            //Prune(poolOfNextPolicies,_m_nrPoliciesToProcess);
        }
        else
        {
            //if using a heuristic that does not specify the exact immediate
            //reward, we still want to return the *EXACT* payoff here...
            v = newPastReward;
            //this is the last time step:
            // -values are lowerbounds, so if we find a policy with total
            //  reward equal to the upperbound of the parent (ppi->
            //  GetValue() ) then we can stop evaluating other policies.
            // -we will only need to return 1 policy and value
            //  (so we do not put them in a pool)
            if(v > maxLBv)
            {
                bestLBjpolBG = jpolBG;
                maxLBv = v;
            }
            if(v >= ppi->GetValue() - 1e-8)
            {
                if(_m_verboseness >= 0)
                    cout << "GMAA_MAAstar::ConstructAndValuateNextPolicies"<<
                        " Last time step, found tight bound: "<<
                        "found value v="<<v<<", parent (upperbound) value="<<
                        ppi->GetValue() << endl;
                break;
            }
        }
        carry_over = ++jpolBG; //next policy for the BG...
    }
    //empty the imm reward cache
    bg_ts->ClearAllImmediateRewards();

    //if this is the last time step, we have not constructed the Dec-POMDP
    //policy which will be returned yet, so we do this now:
    if(is_last_ts)
    {
        PartialJointPolicyDiscretePure* jpolTs = //return type of ConstructExtendedJointPolicy assumed
            ConstructExtendedJointPolicy(
                *jpolPrevTs,
                bestLBjpolBG,
                nrOHts,
                firstOHtsI
                );
        jpolTs->SetPastReward(newPastReward);
        // Warning: jpolTs may have been deleted afterwards
        poolOfNextPolicies->Insert( NewPPI(jpolTs, maxLBv ) );
    }

    if(_m_verboseness >= 2)
        cout <<"Solved BG for t="<<ts<<". Max. expected value (including "
            <<"true expected reward up to this ts:"<<pastReward_prevTs<<") is:"
            <<poolOfNextPolicies->Select()->GetValue() <<endl;

    delete(bg_ts);
    StopTimer(ss.str());
    //if we created a BG for the last time step t=h-1 - we have a lowerbound
    return(is_last_ts);

}

void GMAA_MAAstar::SelectPoliciesToProcessFurther( //pool type assumed
    PartialPolicyPoolInterface* poolOfNextPolicies, bool are_LBs, double bestLB)
{
    //for MAA*, all policies are processed further when they aren't full
    //policies (i.e. lower bounds)
    //so, unless these are full policies...

    if(are_LBs)
    {
        while(!poolOfNextPolicies->Empty())
            poolOfNextPolicies->Pop();
    }
    //we can return immediately...
    return;
}
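// Illustrative note on SelectPoliciesToProcessFurther above (not in the
// original source): when are_LBs is true the pool holds full-horizon policies.
// Their values are exact lower bounds that the caller has already used to
// update its best lower bound; since a full policy cannot be extended any
// further, nothing in this pool needs further processing and it is emptied.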

PartialPolicyPoolItemInterface* GMAA_MAAstar::NewPPI( //signature reconstructed; argument type assumed
    PartialJointPolicyDiscretePure* jp, double v) const
{
    //TODO this conversion to an index does save a lot of space, so
    //if we can enable this again would be better.
    //PartialPolicyPoolItemInterface* ppi=new JPPVIndexValPair(jp,v);
    //delete jp;
    PartialPolicyPoolItemInterface* ppi = new JPPVValuePair(jp, v); //reconstructed from the include above and the commented-out alternative
    return (ppi);
}
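For orientation, the following is a minimal, hypothetical driver showing how this planner is typically wired up. It is not part of GMAA_MAAstar.cpp; the problem and heuristic classes (ProblemDecTiger, QMDP) and the SetQHeuristic / Plan / GetExpectedReward calls are assumptions based on the surrounding MADP toolbox API and may differ between releases.

// Hypothetical usage sketch (not part of this file); names outside
// GMAA_MAAstar are assumptions about the MADP toolbox API.
#include <iostream>
#include "ProblemDecTiger.h"   // assumed bundled benchmark problem
#include "GMAA_MAAstar.h"
#include "QMDP.h"              // assumed admissible Q-value heuristic

int main()
{
    ProblemDecTiger dectiger;           // the Dec-POMDP to plan for
    GMAA_MAAstar gmaa(3, &dectiger);    // horizon 3, uses the 2nd constructor above

    QMDP qheur(gmaa);                   // heuristic Q-function used as the BG utility
    qheur.Compute();
    gmaa.SetQHeuristic(&qheur);         // assumed setter on the GMAA base class

    gmaa.Plan();                        // MAA* search: repeatedly expands the best partial
                                        // policy via ConstructAndValuateNextPolicies()
    std::cout << "expected value: " << gmaa.GetExpectedReward() << std::endl;
    return 0;
}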