MultiAgentDecisionProcess  Release 0.2.1
MDPValueIteration.cpp
#include "MDPValueIteration.h"
#include <fstream>
#include <cfloat>     // DBL_MAX
#include <cmath>      // abs(double)
#include <algorithm>  // max
#include <boost/numeric/ublas/matrix_sparse.hpp>

using namespace std;

//Default constructor
MDPValueIteration::MDPValueIteration(const PlanningUnitDecPOMDPDiscrete& pu) :
    MDPSolver(pu)
{
    _m_initialized = false;
}

//Destructor
MDPValueIteration::~MDPValueIteration()
{
}

void MDPValueIteration::Initialize()
{
    StartTimer("Initialize");

    size_t horizon = GetPU()->GetHorizon();
    size_t nrS = GetPU()->GetNrStates();
    size_t nrJA = GetPU()->GetNrJointActions();

    // A finite-horizon problem needs one Q-table per time step;
    // for an infinite-horizon problem a single Q-table suffices.
    size_t nrQfunctions;
    if(horizon==MAXHORIZON)
    {
        _m_finiteHorizon=false;
        nrQfunctions=1;
    }
    else
    {
        _m_finiteHorizon=true;
        nrQfunctions=horizon;
    }

    // Start with all Q-values set to 0.
    QTable tempTable(nrS,nrJA);
    for(unsigned int s=0;s!=nrS;++s)
        for(unsigned int ja=0;ja!=nrJA;++ja)
            tempTable(s,ja)=0;

    for(Index t=0; t < nrQfunctions; t++)
        _m_QValues.push_back(tempTable);

    _m_initialized = true;

    StopTimer("Initialize");
}

QTables MDPValueIteration::GetQTables() const
{
    return(_m_QValues);
}

QTable MDPValueIteration::GetQTable(Index time_step) const
{
    return(_m_QValues.at(time_step));
}

void MDPValueIteration::SetQTables(const QTables &Qs)
{
    _m_QValues=Qs;
}

void MDPValueIteration::SetQTable(const QTable &Q, Index time_step)
{
    _m_QValues[time_step]=Q;
}

void MDPValueIteration::Plan()
{
    if(!_m_initialized)
        Initialize();

    StartTimer("Plan");

    size_t horizon = GetPU()->GetHorizon();
    size_t nrS = GetPU()->GetNrStates();
    size_t nrJA = GetPU()->GetNrJointActions();

    double R_i,R_f,maxQsuc;

    // Cache the immediate rewards for speed.
    QTable immReward(nrS,nrJA);
    for(Index sI = 0; sI < nrS; sI++)
        for(Index jaI = 0; jaI < nrJA; jaI++)
            immReward(sI,jaI)=GetPU()->GetReward(sI, jaI);

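    /* Finite-horizon backup implemented below:
     *   Q_t(s,ja) = R(s,ja) + sum_{s'} P(s'|s,ja) * max_{ja'} Q_{t+1}(s',ja')
     * swept backwards from the last stage t = horizon-1, where the
     * future-reward term is zero. */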
    if(_m_finiteHorizon)
    {
        for(size_t t = horizon - 1; true; t--)
        {
            for(Index sI = 0; sI < nrS; sI++)
            {
                for(Index jaI = 0; jaI < nrJA; jaI++)
                {
                    //calc. expected immediate reward
                    R_i = immReward(sI,jaI);
                    R_f = 0.0;
                    if(t < horizon - 1)
                    {
                        //calc. expected future reward
                        for(Index ssucI = 0; ssucI < nrS; ssucI++)
                        {
                            //find the best action at ssucI
                            maxQsuc = -DBL_MAX;
                            for(Index jasucI = 0; jasucI < nrJA; jasucI++)
                                maxQsuc = max(_m_QValues[t+1](ssucI,jasucI),
                                              maxQsuc);

                            R_f += GetPU()->GetTransitionProbability(sI, jaI,
                                                                     ssucI)
                                * maxQsuc;
                        }//done calc. expected future reward
                    }
                    _m_QValues[t](sI,jaI) = R_i + R_f;
                }//end for jaI
            }//end for sI
            if(t == 0) //escape from the loop (t is unsigned!)
                break;
        }
    }
    else // infinite-horizon problem
    {
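        /* Infinite-horizon backup, iterated on a single Q-table until
         * the largest change drops below the 1e-4 threshold:
         *   Q(s,ja) = R(s,ja) + gamma * sum_{s'} P(s'|s,ja) * max_{ja'} Q(s',ja')
         */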
        double maxDelta=DBL_MAX;
        double gamma=GetPU()->GetDiscount();
        QTable oldQtable;

        // In the infinite-horizon case it is typically worth
        // caching the transition model.
        typedef boost::numeric::ublas::compressed_matrix<double> CMatrix;
        vector<CMatrix*> T;
        CMatrix *Ta;
        double p;
        for(unsigned int a=0;a!=nrJA;++a)
        {
#if DEBUG_MDPValueIteration
#endif
            StartTimer("CacheTransitionModel");
            Ta=new CMatrix(nrS,nrS);

            // Store only the nonzero transition probabilities.
            for(unsigned int s=0;s!=nrS;++s)
                for(unsigned int s1=0;s1!=nrS;++s1)
                {
                    p=GetPU()->GetTransitionProbability(s,a,s1);
                    if(p>0)
                        (*Ta)(s,s1)=p;
                }

            T.push_back(Ta);
            StopTimer("CacheTransitionModel");
        }
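        /* The compressed matrices hold only entries with P(s'|s,ja) > 0,
         * so the sweeps below visit reachable successor states only. */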

        Index sI,ssucI;
        while(maxDelta>1e-4)
        {
            StartTimer("Iteration");
            maxDelta=0;
            oldQtable=_m_QValues[0];
            for(Index jaI = 0; jaI < nrJA; jaI++)
            {
                for(CMatrix::const_iterator1 ri=T[jaI]->begin1();
                    ri!=T[jaI]->end1(); ++ri)
                {
                    sI=ri.index1();

                    //calc. expected immediate reward
                    R_i = immReward(sI,jaI);
                    R_f = 0.0;
                    //calc. expected future reward
                    for(CMatrix::const_iterator2 ci = ri.begin();
                        ci != ri.end(); ++ci)
                    {
                        ssucI=ci.index2();

                        //find the best action at ssucI
                        maxQsuc = -DBL_MAX;
                        for(Index jasucI = 0; jasucI < nrJA; jasucI++)
                            maxQsuc = max(oldQtable(ssucI,jasucI),
                                          maxQsuc);

                        R_f += *ci * maxQsuc;
                    }//done calc. expected future reward

                    _m_QValues[0](sI,jaI) = R_i + gamma*R_f;
                    maxDelta=max(maxDelta,
                                 abs(oldQtable(sI,jaI)-
                                     _m_QValues[0](sI,jaI)));
                }//end for sI
            }//end for jaI

            StopTimer("Iteration");

#if DEBUG_MDPValueIteration
            cout << "delta " << maxDelta << endl;
#endif
        }

        // Free the cached transition model.
        for(unsigned int a=0;a!=nrJA;++a)
            delete T[a];
    }
    StopTimer("Plan");

#if DEBUG_MDPValueIteration
#endif
}
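
A minimal usage sketch (not part of the file above), assuming a
PlanningUnitDecPOMDPDiscrete pu has already been constructed for the problem
at hand; the state index s0 is a hypothetical placeholder:

    // Sketch only: pu and s0 are assumed to exist; accessor names
    // follow those used in MDPValueIteration.cpp above.
    MDPValueIteration vi(pu);
    vi.Plan();                      // Initialize() + value iteration
    QTable Q0 = vi.GetQTable(0);    // Q-values for time step 0

    // Pick the greedy joint action in state s0.
    Index bestJA = 0;
    for(Index ja = 1; ja < pu.GetNrJointActions(); ja++)
        if(Q0(s0, ja) > Q0(s0, bestJA))
            bestJA = ja;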