SparseIntVect.h

Go to the documentation of this file.
00001 // $Id: SparseIntVect.h 776 2008-07-28 03:40:30Z glandrum $
00002 //
00003 //  Copyright (C) 2007-2008 Greg Landrum
00004 //
00005 //  @@ All Rights Reserved @@
00006 //
00007 #ifndef __RD_SPARSE_INT_VECT_20070921__
00008 #define __RD_SPARSE_INT_VECT_20070921__
00009 
00010 #include <map>
00011 #include <string>
00012 #include <RDGeneral/Invariant.h>
00013 #include <sstream>
00014 #include <RDBoost/Exceptions.h>
00015 #include <RDGeneral/StreamOps.h>
00016 #include <boost/cstdint.hpp>
00017 
00018 const int ci_SPARSEINTVECT_VERSION=0x0001; //!< version number to use in pickles
00019 namespace RDKit{
00020   //! a class for efficiently storing sparse vectors of ints
00021   template <typename IndexType>
00022   class SparseIntVect {
00023   public:
00024     typedef std::map<IndexType,int> StorageType;
00025   
00026     SparseIntVect() : d_length(0) {};
00027 
00028     //! initialize with a particular length
00029     SparseIntVect(IndexType length) : d_length(length) {};
00030 
00031     //! Copy constructor
00032     SparseIntVect(const SparseIntVect<IndexType> &other){
00033       d_length=other.d_length;
00034       d_data.insert(other.d_data.begin(),other.d_data.end());
00035     }
00036 
00037     //! constructor from a pickle
00038     SparseIntVect(const std::string pkl){
00039       initFromText(pkl.c_str(),pkl.size());
00040     };
00041     //! constructor from a pickle
00042     SparseIntVect(const char *pkl,const unsigned int len){
00043       initFromText(pkl,len);
00044     };
00045 
00046     //! destructor (doesn't need to do anything)
00047     ~SparseIntVect() {}
00048 
00049     //! return the value at an index
00050     int getVal(IndexType idx) const {
00051       if(idx<0||idx>=d_length){
00052         throw IndexErrorException(static_cast<int>(idx));
00053       }
00054       int res=0;
00055       typename StorageType::const_iterator iter=d_data.find(idx);
00056       if(iter!=d_data.end()){
00057         res=iter->second;
00058       }
00059       return res;
00060     };
00061     int operator[] (IndexType idx) const { return getVal(idx); };
00062 
00063     //! set the value at an index
00064     void setVal(IndexType idx, int val){
00065       if(idx<0||idx>=d_length){
00066         throw IndexErrorException(static_cast<int>(idx));
00067       }
00068       if(val!=0){
00069         d_data[idx]=val;
00070       } else {
00071         d_data.erase(idx);
00072       }
00073     };
00074     //! returns the length
00075     IndexType getLength() const { return d_length; };
00076 
00077     //! returns the sum of all the elements in the vect
00078     int getTotalVal() const {
00079       int res=0;
00080       typename StorageType::const_iterator iter;
00081       for(iter=d_data.begin();iter!=d_data.end();++iter){
00082         res+=iter->second;
00083       }
00084       return res;
00085     };
00086 
00087 
00088     //! returns our nonzero elements as a map(IndexType->int)
00089     const StorageType &getNonzeroElements() const {
00090       return d_data;
00091     }
00092 
00093 
00094     //! this is a "fuzzy" intesection, the final value
00095     //! of each element is equal to the minimum from
00096     //! the two vects.
00097     SparseIntVect<IndexType> &
00098     operator&= (const SparseIntVect<IndexType> &other) {
00099       if(other.d_length!=d_length){
00100         throw ValueErrorException("SparseIntVect size mismatch");
00101       }
00102 
00103       typename StorageType::iterator iter=d_data.begin();
00104       typename StorageType::const_iterator oIter=other.d_data.begin();
00105       while(iter!=d_data.end()){
00106         // we're relying on the fact that the maps are sorted:
00107         while(oIter!=other.d_data.end() && oIter->first < iter->first){
00108           ++oIter;
00109         }
00110         if(oIter!=other.d_data.end() && oIter->first==iter->first){
00111           // found it:
00112           if(oIter->second<iter->second){
00113             iter->second=oIter->second;
00114           }
00115           ++oIter;
00116           ++iter;
00117         } else {
00118           // not there; our value is zero, which means
00119           // we should remove this value:
00120           typename StorageType::iterator tmpIter=iter;
00121           ++tmpIter;
00122           d_data.erase(iter);
00123           iter=tmpIter;
00124         }
00125       }
00126       return *this;
00127     };
00128     const SparseIntVect<IndexType> 
00129     operator& (const SparseIntVect<IndexType> &other) const {
00130       SparseIntVect<IndexType> res(*this);
00131       return res&=other;
00132     }
00133 
00134     //! this is a "fuzzy" union, the final value
00135     //! of each element is equal to the maximum from
00136     //! the two vects.
00137     SparseIntVect<IndexType> &
00138     operator|= (const SparseIntVect<IndexType> &other) {
00139       if(other.d_length!=d_length){
00140         throw ValueErrorException("SparseIntVect size mismatch");
00141       }
00142 
00143       typename StorageType::iterator iter=d_data.begin();
00144       typename StorageType::const_iterator oIter=other.d_data.begin();
00145       while(iter!=d_data.end()){
00146         // we're relying on the fact that the maps are sorted:
00147         while(oIter!=other.d_data.end() &&
00148               oIter->first < iter->first){
00149           d_data[oIter->first]=oIter->second;
00150           ++oIter;
00151         }
00152         if(oIter!=other.d_data.end() && oIter->first==iter->first){
00153           // found it:
00154           if(oIter->second>iter->second){
00155             iter->second=oIter->second;
00156           }
00157           ++oIter;
00158         }
00159         ++iter;
00160       }
00161       // finish up the other vect:
00162       while(oIter!=other.d_data.end()){
00163         d_data[oIter->first]=oIter->second;
00164         ++oIter;
00165       }
00166       return *this;
00167     };
00168     const SparseIntVect<IndexType> 
00169     operator| (const SparseIntVect<IndexType> &other) const {
00170       SparseIntVect<IndexType> res(*this);
00171       return res|=other;
00172     }
00173 
00174     SparseIntVect<IndexType> &
00175     operator+= (const SparseIntVect<IndexType> &other) {
00176       if(other.d_length!=d_length){
00177         throw ValueErrorException("SparseIntVect size mismatch");
00178       }
00179       typename StorageType::iterator iter=d_data.begin();
00180       typename StorageType::const_iterator oIter=other.d_data.begin();
00181       while(oIter!=other.d_data.end()){
00182         while(iter!=d_data.end() &&
00183               iter->first < oIter->first){
00184           ++iter;
00185         }
00186         if(iter!=d_data.end() && oIter->first==iter->first){
00187           // found it:
00188           iter->second+=oIter->second;
00189           if(!iter->second){
00190             typename StorageType::iterator tIter=iter;
00191             ++tIter;
00192             d_data.erase(iter);
00193             iter=tIter;
00194           } else {
00195             ++iter;
00196           }
00197         } else {
00198           d_data[oIter->first]=oIter->second;
00199         }
00200         ++oIter;
00201       }
00202       return *this;
00203     };
00204     const SparseIntVect<IndexType> 
00205     operator+ (const SparseIntVect<IndexType> &other) const {
00206       SparseIntVect<IndexType> res(*this);
00207       return res+=other;
00208     }
00209 
00210     SparseIntVect<IndexType> &
00211     operator-= (const SparseIntVect<IndexType> &other) {
00212       if(other.d_length!=d_length){
00213         throw ValueErrorException("SparseIntVect size mismatch");
00214       }
00215       typename StorageType::iterator iter=d_data.begin();
00216       typename StorageType::const_iterator oIter=other.d_data.begin();
00217       while(oIter!=other.d_data.end()){
00218         while(iter!=d_data.end() &&
00219               iter->first < oIter->first){
00220           ++iter;
00221         }
00222         if(iter!=d_data.end() && oIter->first==iter->first){
00223           // found it:
00224           iter->second-=oIter->second;
00225           if(!iter->second){
00226             typename StorageType::iterator tIter=iter;
00227             ++tIter;
00228             d_data.erase(iter);
00229             iter=tIter;
00230           } else {
00231             ++iter;
00232           }
00233         } else {
00234           d_data[oIter->first]=-oIter->second;
00235         }
00236         ++oIter;
00237       }
00238       return *this;
00239     };
00240     const SparseIntVect<IndexType> 
00241     operator- (const SparseIntVect<IndexType> &other) const {
00242       SparseIntVect<IndexType> res(*this);
00243       return res-=other;
00244     }
00245 
00246     bool operator==(const SparseIntVect<IndexType> &v2) const{
00247       if(d_length!=v2.d_length){
00248         return false;
00249       }
00250       return d_data==v2.d_data;
00251     }
00252     bool operator!=(const SparseIntVect<IndexType> &v2) const {
00253       return !(*this==v2);
00254     }
00255 
00256     //! returns a binary string representation (pickle)
00257     std::string toString() const {
00258       std::stringstream ss(std::ios_base::binary|std::ios_base::out|std::ios_base::in);
00259       boost::uint32_t tInt;
00260       tInt=ci_SPARSEINTVECT_VERSION;
00261       streamWrite(ss,tInt);
00262       tInt=sizeof(IndexType);
00263       streamWrite(ss,tInt);
00264       streamWrite(ss,d_length);
00265       IndexType nEntries=d_data.size();
00266       streamWrite(ss,nEntries);
00267 
00268       typename StorageType::const_iterator iter=d_data.begin();
00269       while(iter!=d_data.end()){
00270         streamWrite(ss,iter->first);
00271         boost::int32_t tInt=iter->second;
00272         streamWrite(ss,tInt);
00273         ++iter;
00274       }
00275       return ss.str();
00276     };
00277 
00278     void fromString(const std::string &txt) {
00279       initFromText(txt.c_str(),txt.length());
00280     }
00281 
00282   private:
00283     IndexType d_length;
00284     StorageType d_data;
00285     
00286     void initFromText(const char *pkl,const unsigned int len) {
00287       d_data.clear();
00288       std::stringstream ss(std::ios_base::binary|std::ios_base::out|std::ios_base::in);
00289       ss.write(pkl,len);
00290       
00291       boost::uint32_t vers;
00292       streamRead(ss,vers);
00293       if(vers==0x0001){
00294         boost::uint32_t tInt;
00295         streamRead(ss,tInt);
00296         if(tInt>sizeof(IndexType)){
00297           throw ValueErrorException("IndexType cannot accomodate index size in SparseIntVect pickle");
00298         }
00299         switch(tInt){
00300         case sizeof(char):
00301           readVals<unsigned char>(ss);break;
00302         case sizeof(boost::int32_t):
00303           readVals<boost::uint32_t>(ss);break;
00304         case sizeof(boost::int64_t):
00305           readVals<boost::uint64_t>(ss);break;
00306         default:
00307           throw ValueErrorException("unreadable format");
00308         }
00309       } else {
00310         throw ValueErrorException("bad version in SparseIntVect pickle");
00311       }
00312     };
00313     template <typename T>
00314     void readVals(std::stringstream &ss){
00315       PRECONDITION(sizeof(T)<=sizeof(IndexType),"invalid size");
00316       T tVal;
00317       streamRead(ss,tVal);
00318       d_length=tVal;
00319       T nEntries;
00320       streamRead(ss,nEntries);
00321       for(T i=0;i<nEntries;++i){
00322         streamRead(ss,tVal);
00323         boost::int32_t val;
00324         streamRead(ss,val);
00325         d_data[tVal]=val;
00326       }
00327     }
00328   };
00329 
00330   template <typename IndexType, typename SequenceType>
00331   void updateFromSequence(SparseIntVect<IndexType> &vect,
00332                           const SequenceType &seq){
00333     typename SequenceType::const_iterator seqIt;
00334     for(seqIt=seq.begin();seqIt!=seq.end();++seqIt){
00335       // EFF: probably not the most efficient approach
00336       IndexType idx=*seqIt;
00337       vect.setVal(idx,vect.getVal(idx)+1);
00338     }
00339   }
00340 
00341   template <typename IndexType>
00342   double DiceSimilarity(const SparseIntVect<IndexType> &v1,
00343                         const SparseIntVect<IndexType> &v2,
00344                         bool returnDistance=false,
00345                         double bounds=0.0){
00346     if(v1.getLength()!=v2.getLength()){
00347       throw ValueErrorException("SparseIntVect size mismatch");
00348     }
00349     double v1Sum=v1.getTotalVal();
00350     double v2Sum=v2.getTotalVal();
00351     double denom=v1Sum+v2Sum;
00352     if(fabs(denom)<1e-6){
00353       if(returnDistance){
00354         return 1.0;
00355       } else {
00356         return 0.0;
00357       }
00358     }
00359     if(!returnDistance && bounds>0.0){
00360       double minV=v1Sum<v2Sum?v1Sum:v2Sum;
00361       if(2.*minV/denom<bounds){
00362         return 0.0;
00363       }
00364     }
00365 
00366     double numer=0.0;
00367     // we're doing : (v1&v2).getTotalVal(), but w/o generating
00368     // the other vector:
00369     typename SparseIntVect<IndexType>::StorageType::const_iterator iter1,iter2;
00370     iter1=v1.getNonzeroElements().begin();
00371     iter2=v2.getNonzeroElements().begin();
00372     while(iter1 != v1.getNonzeroElements().end()){
00373       while(iter2!=v2.getNonzeroElements().end() && iter2->first < iter1->first){
00374         ++iter2;
00375       }
00376       if(iter2!=v2.getNonzeroElements().end()){
00377         if(iter2->first == iter1->first){
00378           if(iter2->second<iter1->second){
00379             numer += iter2->second;
00380           } else {
00381             numer += iter1->second;
00382           }
00383           ++iter2;
00384         }
00385         ++iter1;
00386       } else {
00387         break;
00388       }
00389     }
00390     double sim=2.*numer/denom;
00391     if(returnDistance) sim = 1.-sim;
00392     return sim;
00393   }
00394 } 
00395 
00396 
00397 
00398 #endif

Generated on Tue Oct 7 06:10:11 2008 for RDCode by  doxygen 1.5.5