on-disk-point-set-list.h

Go to the documentation of this file.
00001 // Copyright 2007, Massachusetts Institute of Technology.
00002 // The use of this code is permitted for research only. There is
00003 // absolutely no warranty for this software.
00004 //
00005 // Author: John Lee (jjl@mit.edu)
00006 //
00007 // OnDiskPointSets are read-only. They are meant to be used for large
00008 // point set lists which do not fit in memory.
00009 // OnDiskPointSet operates with two caches: an "area" cache and an LRU cache.
00010 // The area cache is a small subsequence of the entire point set list.
00011 // When we want to access something in the point set list, we do the following:
00012 //   1. Check the area cache for it. If it's there, return it.
00013 //   2. Check the LRU cache for it.
00014 //        2a. If it's there, return it, and also move it to the front of
00015 //            the LRU cache. The area cache is unchanged.
00016 //        2b. If it's not there, then add it to the LRU cache, evicting the
00017 //            least-recently-used item if needed. ALSO, we reset the area
00018 //            cache, such that the first element of the area cache is the
00019 //            requested item.
00020 //
00021 // WARNING: the [] operator returns a const ref, which may become invalid
00022 // the next time you invoke the [] operator. BE CAREFUL!
00023 //
00024 
00025 #ifndef POINT_SET_ON_DISK_POINT_SET_LIST_H
00026 #define POINT_SET_ON_DISK_POINT_SET_LIST_H
00027 
00028 #include <assert.h>
00029 #include <vector>
00030 #include <string>
00031 #include <iostream>
00032 #include <fstream>
00033 #include <list>
00034 #include "point_set/point-set-list.h"
00035 #include "point_set/point-set.h"
00036 #include "point_set/point-ref.h"
00037 #include "point_set/feature.h"
00038 
00039 using namespace std;
00040 
00041 namespace libpmk {
00043 
00055 class OnDiskPointSetList : public PointSetList {
00056  public:
00058 
00063    OnDiskPointSetList(string filename);
00064 
00066 
00071    OnDiskPointSetList(string filename, int lru_cache_size,
00072                       int area_cache_size);
00073    virtual ~OnDiskPointSetList();
00074 
00075    virtual int GetNumPointSets() const;
00076    virtual int GetNumPoints() const;
00077    virtual int GetFeatureDim() const;
00078    virtual vector<int> GetSetCardinalities() const;
00079 
00082 
00096    virtual const PointSet& operator[](int index) const;
00097 
00099    static const int DEFAULT_LRU_CACHE_SIZE;
00100 
00102    static const int DEFAULT_AREA_CACHE_SIZE;
00103 
00104  private:
00105    // Returns true iff the <index>th point set in the whole list is
00106    // in the area cache.
00107    bool IsInAreaCache(int index) const;
00108 
00109    // Seeks to the <index>th point set list on the disk and fills up
00110    // the area cache with
00111    //   <index>, <index> + 1, ..., <index> + cache_size_ - 1.
00112    //
00113    // If the end is reached, then cache_ will just end there, so
00114    // the value of cache_.size() may not always be equal to
00115    // this.cache_size_ .
00116    // The resulting location of the file pointer will be the very next
00117    // point set (i.e., the one that would follow the point set at the
00118    // end of the current cache). It will be EOF if we've hit the end.
00119    // This function sets cache_offset_ appropriately.
00120    void SeekAndFillAreaCache(int index) const;
00121    
00122 
00123    // Seeks to the start, then scans through and computes
00124    // num_point_sets_, num_points_, and feature_dim_.  It also
00125    // populates the vectors <pointers_> and <num_features_> It then
00126    // returns the file pointer to where it was originally.
00127    void GetListInfo() const;
00128    
00129    // A vector of size <num_point_sets_>. The value of pointers_[ii]
00130    // gives you the position in the stream where you can find the
00131    // iith point set.
00132    mutable vector<streampos> pointers_;
00133 
00134    // A vector of size <num_point_sets_> which tells you how many features
00135    // there are in each point set.
00136    mutable vector<int> num_features_;
00137    
00138    // Useful metadata
00139    mutable int num_point_sets_;
00140    mutable int num_points_;
00141    mutable int feature_dim_;
00142    mutable ifstream input_stream_;
00143 
00144    // cache_[0] is the <cache_offset_>th point set in the whole list.
00145    int max_lru_size_;
00146    int max_area_size_;
00147 
00148    mutable int area_cache_offset_;
00149    
00150    // The LRU cache is such that the most-recently-used is at the front
00151    // and least-recently-used (to evict) is at the back. We need to store
00152    // pairs. The first element in each pair is the index of that point in
00153    // the entire point set list, and the second element in the pair is
00154    // the point set itself.
00155    mutable list<pair<int, PointSet> > lru_cache_;
00156    mutable vector<PointSet> area_cache_;
00157 
00158    // Disallow evil constructors (don't allow users to make copies of 
00159    // OnDiskPointSetLists)
00160    OnDiskPointSetList& operator=(const OnDiskPointSetList& foo);
00161    OnDiskPointSetList(const OnDiskPointSetList& foo);
00162    
00163 };
00164 }  // namespace libpmk
00165 #endif  // POINT_SET_POINT_SET_LIST_H

Generated on Wed May 2 11:17:12 2007 for libpmk by  doxygen 1.5.1