00001 // Copyright 2007, Massachusetts Institute of Technology. 00002 // The use of this code is permitted for research only. There is 00003 // absolutely no warranty for this software. 00004 // 00005 // Author: John Lee (jjl@mit.edu) 00006 // 00007 // OnDiskPointSets are read-only. They are meant to be used for large 00008 // point set lists which do not fit in memory. 00009 // OnDiskPointSet operates with two caches: an "area" cache and an LRU cache. 00010 // The area cache is a small subsequence of the entire point set list. 00011 // When we want to access something in the point set list, we do the following: 00012 // 1. Check the area cache for it. If it's there, return it. 00013 // 2. Check the LRU cache for it. 00014 // 2a. If it's there, return it, and also move it to the front of 00015 // the LRU cache. The area cache is unchanged. 00016 // 2b. If it's not there, then add it to the LRU cache, evicting the 00017 // least-recently-used item if needed. ALSO, we reset the area 00018 // cache, such that the first element of the area cache is the 00019 // requested item. 00020 // 00021 // WARNING: the [] operator returns a const ref, which may become invalid 00022 // the next time you invoke the [] operator. BE CAREFUL! 00023 // 00024 00025 #ifndef POINT_SET_ON_DISK_POINT_SET_LIST_H 00026 #define POINT_SET_ON_DISK_POINT_SET_LIST_H 00027 00028 #include <assert.h> 00029 #include <vector> 00030 #include <string> 00031 #include <iostream> 00032 #include <fstream> 00033 #include <list> 00034 #include "point_set/point-set-list.h" 00035 #include "point_set/point-set.h" 00036 #include "point_set/point-ref.h" 00037 #include "point_set/feature.h" 00038 00039 using namespace std; 00040 00041 namespace libpmk { 00043 00055 class OnDiskPointSetList : public PointSetList { 00056 public: 00058 00063 OnDiskPointSetList(string filename); 00064 00066 00071 OnDiskPointSetList(string filename, int lru_cache_size, 00072 int area_cache_size); 00073 virtual ~OnDiskPointSetList(); 00074 00075 virtual int GetNumPointSets() const; 00076 virtual int GetNumPoints() const; 00077 virtual int GetFeatureDim() const; 00078 virtual vector<int> GetSetCardinalities() const; 00079 00082 00096 virtual const PointSet& operator[](int index) const; 00097 00099 static const int DEFAULT_LRU_CACHE_SIZE; 00100 00102 static const int DEFAULT_AREA_CACHE_SIZE; 00103 00104 private: 00105 // Returns true iff the <index>th point set in the whole list is 00106 // in the area cache. 00107 bool IsInAreaCache(int index) const; 00108 00109 // Seeks to the <index>th point set list on the disk and fills up 00110 // the area cache with 00111 // <index>, <index> + 1, ..., <index> + cache_size_ - 1. 00112 // 00113 // If the end is reached, then cache_ will just end there, so 00114 // the value of cache_.size() may not always be equal to 00115 // this.cache_size_ . 00116 // The resulting location of the file pointer will be the very next 00117 // point set (i.e., the one that would follow the point set at the 00118 // end of the current cache). It will be EOF if we've hit the end. 00119 // This function sets cache_offset_ appropriately. 00120 void SeekAndFillAreaCache(int index) const; 00121 00122 00123 // Seeks to the start, then scans through and computes 00124 // num_point_sets_, num_points_, and feature_dim_. It also 00125 // populates the vectors <pointers_> and <num_features_> It then 00126 // returns the file pointer to where it was originally. 00127 void GetListInfo() const; 00128 00129 // A vector of size <num_point_sets_>. The value of pointers_[ii] 00130 // gives you the position in the stream where you can find the 00131 // iith point set. 00132 mutable vector<streampos> pointers_; 00133 00134 // A vector of size <num_point_sets_> which tells you how many features 00135 // there are in each point set. 00136 mutable vector<int> num_features_; 00137 00138 // Useful metadata 00139 mutable int num_point_sets_; 00140 mutable int num_points_; 00141 mutable int feature_dim_; 00142 mutable ifstream input_stream_; 00143 00144 // cache_[0] is the <cache_offset_>th point set in the whole list. 00145 int max_lru_size_; 00146 int max_area_size_; 00147 00148 mutable int area_cache_offset_; 00149 00150 // The LRU cache is such that the most-recently-used is at the front 00151 // and least-recently-used (to evict) is at the back. We need to store 00152 // pairs. The first element in each pair is the index of that point in 00153 // the entire point set list, and the second element in the pair is 00154 // the point set itself. 00155 mutable list<pair<int, PointSet> > lru_cache_; 00156 mutable vector<PointSet> area_cache_; 00157 00158 // Disallow evil constructors (don't allow users to make copies of 00159 // OnDiskPointSetLists) 00160 OnDiskPointSetList& operator=(const OnDiskPointSetList& foo); 00161 OnDiskPointSetList(const OnDiskPointSetList& foo); 00162 00163 }; 00164 } // namespace libpmk 00165 #endif // POINT_SET_POINT_SET_LIST_H