00001 // Copyright 2007, Massachusetts Institute of Technology. 00002 // The use of this code is permitted for research only. There is 00003 // absolutely no warranty for this software. 00004 // 00005 // Author: John Lee (jjl@mit.edu) 00006 // 00007 // OnDiskPointSets are read-only. They are meant to be used for large 00008 // point set lists which do not fit in memory. 00009 // OnDiskPointSet operates with two caches: an "area" cache and an LRU cache. 00010 // The area cache is a small subsequence of the entire point set list. 00011 // When we want to access something in the point set list, we do the following: 00012 // 1. Check the area cache for it. If it's there, return it. 00013 // 2. Check the LRU cache for it. 00014 // 2a. If it's there, return it, and also move it to the front of 00015 // the LRU cache. The area cache is unchanged. 00016 // 2b. If it's not there, then add it to the LRU cache, evicting the 00017 // least-recently-used item if needed. ALSO, we reset the area 00018 // cache, such that the first element of the area cache is the 00019 // requested item. 00020 // 00021 // WARNING: the [] operator returns a const ref, which may become invalid 00022 // the next time you invoke the [] operator. BE CAREFUL! 00023 // 00024 00025 #ifndef POINT_SET_ON_DISK_POINT_SET_LIST_H 00026 #define POINT_SET_ON_DISK_POINT_SET_LIST_H 00027 00028 #include <vector> 00029 #include <string> 00030 #include <iostream> 00031 #include <fstream> 00032 #include <list> 00033 #include "point_set/point-set-list.h" 00034 #include "point_set/point-set.h" 00035 00036 using namespace std; 00037 00038 namespace libpmk { 00040 00052 class OnDiskPointSetList : public PointSetList { 00053 public: 00055 00060 OnDiskPointSetList(string filename); 00061 00063 00068 OnDiskPointSetList(string filename, int lru_cache_size, 00069 int area_cache_size); 00070 virtual ~OnDiskPointSetList(); 00071 00072 virtual int point_set_size() const; 00073 virtual int point_size() const; 00074 virtual int point_dim() const; 00075 virtual bool GetSetCardinalities(vector<int>* destination) const; 00076 00079 00093 virtual const PointSet& point_set(int index) const; 00094 00096 static const int DEFAULT_LRU_CACHE_SIZE; 00097 00099 static const int DEFAULT_AREA_CACHE_SIZE; 00100 00101 private: 00102 // Returns true iff the <index>th point set in the whole list is 00103 // in the area cache. 00104 bool IsInAreaCache(int index) const; 00105 00106 // Seeks to the <index>th point set list on the disk and fills up 00107 // the area cache with 00108 // <index>, <index> + 1, ..., <index> + cache_size_ - 1. 00109 // 00110 // If the end is reached, then cache_ will just end there, so 00111 // the value of cache_.size() may not always be equal to 00112 // this.cache_size_ . 00113 // The resulting location of the file pointer will be the very next 00114 // point set (i.e., the one that would follow the point set at the 00115 // end of the current cache). It will be EOF if we've hit the end. 00116 // This function sets cache_offset_ appropriately. 00117 void SeekAndFillAreaCache(int index) const; 00118 00119 00120 // Seeks to the start, then scans through and computes 00121 // num_point_sets_, num_points_, and point_dim_. It also 00122 // populates the vectors <pointers_> and <num_points_> It then 00123 // returns the file pointer to where it was originally. 00124 void GetListInfo() const; 00125 00126 // A vector of size <num_point_sets_>. The value of pointers_[ii] 00127 // gives you the position in the stream where you can find the 00128 // iith point set. 00129 mutable vector<streampos> pointers_; 00130 00131 // A vector of size <num_point_sets_> which tells you how many points 00132 // there are in each point set. 00133 mutable vector<int> num_points_per_set_; 00134 00135 // Useful metadata 00136 mutable int num_point_sets_; 00137 mutable int num_points_; 00138 mutable int point_dim_; 00139 mutable ifstream input_stream_; 00140 00141 // cache_[0] is the <cache_offset_>th point set in the whole list. 00142 int max_lru_size_; 00143 int max_area_size_; 00144 00145 mutable int area_cache_offset_; 00146 00147 // The LRU cache is such that the most-recently-used is at the front 00148 // and least-recently-used (to evict) is at the back. We need to store 00149 // pairs. The first element in each pair is the index of that point in 00150 // the entire point set list, and the second element in the pair is 00151 // the point set itself. 00152 mutable list<pair<int, PointSet> > lru_cache_; 00153 mutable vector<PointSet> area_cache_; 00154 00155 // Disallow evil constructors (don't allow users to make copies of 00156 // OnDiskPointSetLists) 00157 OnDiskPointSetList& operator=(const OnDiskPointSetList& foo); 00158 OnDiskPointSetList(const OnDiskPointSetList& foo); 00159 00160 }; 00161 } // namespace libpmk 00162 00163 #endif // POINT_SET_ON_DISK_POINT_SET_LIST_H