on-disk-point-set-list.h

Go to the documentation of this file.
00001 // Copyright 2007, Massachusetts Institute of Technology.
00002 // The use of this code is permitted for research only. There is
00003 // absolutely no warranty for this software.
00004 //
00005 // Author: John Lee (jjl@mit.edu)
00006 //
00007 // OnDiskPointSets are read-only. They are meant to be used for large
00008 // point set lists which do not fit in memory.
00009 // OnDiskPointSet operates with two caches: an "area" cache and an LRU cache.
00010 // The area cache is a small subsequence of the entire point set list.
00011 // When we want to access something in the point set list, we do the following:
00012 //   1. Check the area cache for it. If it's there, return it.
00013 //   2. Check the LRU cache for it.
00014 //        2a. If it's there, return it, and also move it to the front of
00015 //            the LRU cache. The area cache is unchanged.
00016 //        2b. If it's not there, then add it to the LRU cache, evicting the
00017 //            least-recently-used item if needed. ALSO, we reset the area
00018 //            cache, such that the first element of the area cache is the
00019 //            requested item.
00020 //
00021 // WARNING: the [] operator returns a const ref, which may become invalid
00022 // the next time you invoke the [] operator. BE CAREFUL!
00023 //
00024 
00025 #ifndef POINT_SET_ON_DISK_POINT_SET_LIST_H
00026 #define POINT_SET_ON_DISK_POINT_SET_LIST_H
00027 
00028 #include <vector>
00029 #include <string>
00030 #include <iostream>
00031 #include <fstream>
00032 #include <list>
00033 #include "point_set/point-set-list.h"
00034 #include "point_set/point-set.h"
00035 
00036 using namespace std;
00037 
00038 namespace libpmk {
00040 
00052 class OnDiskPointSetList : public PointSetList {
00053 public:
00055 
00060   OnDiskPointSetList(string filename);
00061 
00063 
00068   OnDiskPointSetList(string filename, int lru_cache_size,
00069                      int area_cache_size);
00070   virtual ~OnDiskPointSetList();
00071 
00072   virtual int point_set_size() const;
00073   virtual int point_size() const;
00074   virtual int point_dim() const;
00075   virtual bool GetSetCardinalities(vector<int>* destination) const;
00076 
00079 
00093   virtual const PointSet& point_set(int index) const;
00094 
00096   static const int DEFAULT_LRU_CACHE_SIZE;
00097 
00099   static const int DEFAULT_AREA_CACHE_SIZE;
00100 
00101 private:
00102   // Returns true iff the <index>th point set in the whole list is
00103   // in the area cache.
00104   bool IsInAreaCache(int index) const;
00105 
00106   // Seeks to the <index>th point set list on the disk and fills up
00107   // the area cache with
00108   //   <index>, <index> + 1, ..., <index> + cache_size_ - 1.
00109   //
00110   // If the end is reached, then cache_ will just end there, so
00111   // the value of cache_.size() may not always be equal to
00112   // this.cache_size_ .
00113   // The resulting location of the file pointer will be the very next
00114   // point set (i.e., the one that would follow the point set at the
00115   // end of the current cache). It will be EOF if we've hit the end.
00116   // This function sets cache_offset_ appropriately.
00117   void SeekAndFillAreaCache(int index) const;
00118 
00119 
00120   // Seeks to the start, then scans through and computes
00121   // num_point_sets_, num_points_, and point_dim_.  It also
00122   // populates the vectors <pointers_> and <num_points_> It then
00123   // returns the file pointer to where it was originally.
00124   void GetListInfo() const;
00125 
00126   // A vector of size <num_point_sets_>. The value of pointers_[ii]
00127   // gives you the position in the stream where you can find the
00128   // iith point set.
00129   mutable vector<streampos> pointers_;
00130 
00131   // A vector of size <num_point_sets_> which tells you how many points
00132   // there are in each point set.
00133   mutable vector<int> num_points_per_set_;
00134 
00135   // Useful metadata
00136   mutable int num_point_sets_;
00137   mutable int num_points_;
00138   mutable int point_dim_;
00139   mutable ifstream input_stream_;
00140 
00141   // cache_[0] is the <cache_offset_>th point set in the whole list.
00142   int max_lru_size_;
00143   int max_area_size_;
00144 
00145   mutable int area_cache_offset_;
00146 
00147   // The LRU cache is such that the most-recently-used is at the front
00148   // and least-recently-used (to evict) is at the back. We need to store
00149   // pairs. The first element in each pair is the index of that point in
00150   // the entire point set list, and the second element in the pair is
00151   // the point set itself.
00152   mutable list<pair<int, PointSet> > lru_cache_;
00153   mutable vector<PointSet> area_cache_;
00154 
00155   // Disallow evil constructors (don't allow users to make copies of
00156   // OnDiskPointSetLists)
00157   OnDiskPointSetList& operator=(const OnDiskPointSetList& foo);
00158   OnDiskPointSetList(const OnDiskPointSetList& foo);
00159 
00160 };
00161 }  // namespace libpmk
00162 
00163 #endif  // POINT_SET_ON_DISK_POINT_SET_LIST_H

Generated on Fri Sep 21 11:39:04 2007 for libpmk2 by  doxygen 1.5.1