SparseIntVect.h

Go to the documentation of this file.
00001 // $Id: SparseIntVect.h 1734 2011-05-27 03:29:56Z glandrum $
00002 //
00003 //  Copyright (C) 2007-2008 Greg Landrum
00004 //
00005 //  @@ All Rights Reserved @@
00006 //  This file is part of the RDKit.
00007 //  The contents are covered by the terms of the BSD license
00008 //  which is included in the file license.txt, found at the root
00009 //  of the RDKit source tree.
00010 //
00011 #ifndef __RD_SPARSE_INT_VECT_20070921__
00012 #define __RD_SPARSE_INT_VECT_20070921__
00013 
00014 #include <map>
00015 #include <string>
00016 #include <RDGeneral/Invariant.h>
00017 #include <sstream>
00018 #include <RDBoost/Exceptions.h>
00019 #include <RDGeneral/StreamOps.h>
00020 #include <boost/cstdint.hpp>
00021 
00022 
00023 const int ci_SPARSEINTVECT_VERSION=0x0001; //!< version number to use in pickles
00024 namespace RDKit{
00025   //! a class for efficiently storing sparse vectors of ints
00026   template <typename IndexType>
00027   class SparseIntVect {
00028   public:
00029     typedef std::map<IndexType,int> StorageType;
00030   
00031     SparseIntVect() : d_length(0) {};
00032 
00033     //! initialize with a particular length
00034     SparseIntVect(IndexType length) : d_length(length) {};
00035 
00036     //! Copy constructor
00037     SparseIntVect(const SparseIntVect<IndexType> &other){
00038       d_length=other.d_length;
00039       d_data.insert(other.d_data.begin(),other.d_data.end());
00040     }
00041 
00042     //! constructor from a pickle
00043     SparseIntVect(const std::string pkl){
00044       initFromText(pkl.c_str(),pkl.size());
00045     };
00046     //! constructor from a pickle
00047     SparseIntVect(const char *pkl,const unsigned int len){
00048       initFromText(pkl,len);
00049     };
00050 
00051     //! destructor (doesn't need to do anything)
00052     ~SparseIntVect() {}
00053 
00054 #ifdef __clang__
00055 #pragma clang diagnostic push
00056 #pragma clang diagnostic ignored "-Wtautological-compare"
00057 #endif
00058     //! return the value at an index
00059     int getVal(IndexType idx) const {
00060       if(idx<0||idx>=d_length){
00061         throw IndexErrorException(static_cast<int>(idx));
00062       }
00063       int res=0;
00064       typename StorageType::const_iterator iter=d_data.find(idx);
00065       if(iter!=d_data.end()){
00066         res=iter->second;
00067       }
00068       return res;
00069     };
00070 
00071     //! set the value at an index
00072     void setVal(IndexType idx, int val){
00073       if(idx<0||idx>=d_length){
00074         throw IndexErrorException(static_cast<int>(idx));
00075       }
00076       if(val!=0){
00077         d_data[idx]=val;
00078       } else {
00079         d_data.erase(idx);
00080       }
00081     };
00082 #ifdef __clang__
00083 #pragma clang diagnostic pop
00084 #endif
00085     //! support indexing using []
00086     int operator[] (IndexType idx) const { return getVal(idx); };
00087 
00088     //! returns the length
00089     IndexType getLength() const { return d_length; };
00090 
00091     //! returns the sum of all the elements in the vect
00092     //! the doAbs argument toggles summing the absolute values of the elements
00093     int getTotalVal(bool doAbs=false) const {
00094       int res=0;
00095       typename StorageType::const_iterator iter;
00096       for(iter=d_data.begin();iter!=d_data.end();++iter){
00097         if(!doAbs) res+=iter->second;
00098         else res+=abs(iter->second);
00099       }
00100       return res;
00101     };
00102 
00103 
00104     //! returns our nonzero elements as a map(IndexType->int)
00105     const StorageType &getNonzeroElements() const {
00106       return d_data;
00107     }
00108 
00109 
00110     //! this is a "fuzzy" intesection, the final value
00111     //! of each element is equal to the minimum from
00112     //! the two vects.
00113     SparseIntVect<IndexType> &
00114     operator&= (const SparseIntVect<IndexType> &other) {
00115       if(other.d_length!=d_length){
00116         throw ValueErrorException("SparseIntVect size mismatch");
00117       }
00118 
00119       typename StorageType::iterator iter=d_data.begin();
00120       typename StorageType::const_iterator oIter=other.d_data.begin();
00121       while(iter!=d_data.end()){
00122         // we're relying on the fact that the maps are sorted:
00123         while(oIter!=other.d_data.end() && oIter->first < iter->first){
00124           ++oIter;
00125         }
00126         if(oIter!=other.d_data.end() && oIter->first==iter->first){
00127           // found it:
00128           if(oIter->second<iter->second){
00129             iter->second=oIter->second;
00130           }
00131           ++oIter;
00132           ++iter;
00133         } else {
00134           // not there; our value is zero, which means
00135           // we should remove this value:
00136           typename StorageType::iterator tmpIter=iter;
00137           ++tmpIter;
00138           d_data.erase(iter);
00139           iter=tmpIter;
00140         }
00141       }
00142       return *this;
00143     };
00144     const SparseIntVect<IndexType> 
00145     operator& (const SparseIntVect<IndexType> &other) const {
00146       SparseIntVect<IndexType> res(*this);
00147       return res&=other;
00148     }
00149 
00150     //! this is a "fuzzy" union, the final value
00151     //! of each element is equal to the maximum from
00152     //! the two vects.
00153     SparseIntVect<IndexType> &
00154     operator|= (const SparseIntVect<IndexType> &other) {
00155       if(other.d_length!=d_length){
00156         throw ValueErrorException("SparseIntVect size mismatch");
00157       }
00158 
00159       typename StorageType::iterator iter=d_data.begin();
00160       typename StorageType::const_iterator oIter=other.d_data.begin();
00161       while(iter!=d_data.end()){
00162         // we're relying on the fact that the maps are sorted:
00163         while(oIter!=other.d_data.end() &&
00164               oIter->first < iter->first){
00165           d_data[oIter->first]=oIter->second;
00166           ++oIter;
00167         }
00168         if(oIter!=other.d_data.end() && oIter->first==iter->first){
00169           // found it:
00170           if(oIter->second>iter->second){
00171             iter->second=oIter->second;
00172           }
00173           ++oIter;
00174         }
00175         ++iter;
00176       }
00177       // finish up the other vect:
00178       while(oIter!=other.d_data.end()){
00179         d_data[oIter->first]=oIter->second;
00180         ++oIter;
00181       }
00182       return *this;
00183     };
00184     const SparseIntVect<IndexType> 
00185     operator| (const SparseIntVect<IndexType> &other) const {
00186       SparseIntVect<IndexType> res(*this);
00187       return res|=other;
00188     }
00189 
00190     SparseIntVect<IndexType> &
00191     operator+= (const SparseIntVect<IndexType> &other) {
00192       if(other.d_length!=d_length){
00193         throw ValueErrorException("SparseIntVect size mismatch");
00194       }
00195       typename StorageType::iterator iter=d_data.begin();
00196       typename StorageType::const_iterator oIter=other.d_data.begin();
00197       while(oIter!=other.d_data.end()){
00198         while(iter!=d_data.end() &&
00199               iter->first < oIter->first){
00200           ++iter;
00201         }
00202         if(iter!=d_data.end() && oIter->first==iter->first){
00203           // found it:
00204           iter->second+=oIter->second;
00205           if(!iter->second){
00206             typename StorageType::iterator tIter=iter;
00207             ++tIter;
00208             d_data.erase(iter);
00209             iter=tIter;
00210           } else {
00211             ++iter;
00212           }
00213         } else {
00214           d_data[oIter->first]=oIter->second;
00215         }
00216         ++oIter;
00217       }
00218       return *this;
00219     };
00220     const SparseIntVect<IndexType> 
00221     operator+ (const SparseIntVect<IndexType> &other) const {
00222       SparseIntVect<IndexType> res(*this);
00223       return res+=other;
00224     }
00225 
00226     SparseIntVect<IndexType> &
00227     operator-= (const SparseIntVect<IndexType> &other) {
00228       if(other.d_length!=d_length){
00229         throw ValueErrorException("SparseIntVect size mismatch");
00230       }
00231       typename StorageType::iterator iter=d_data.begin();
00232       typename StorageType::const_iterator oIter=other.d_data.begin();
00233       while(oIter!=other.d_data.end()){
00234         while(iter!=d_data.end() &&
00235               iter->first < oIter->first){
00236           ++iter;
00237         }
00238         if(iter!=d_data.end() && oIter->first==iter->first){
00239           // found it:
00240           iter->second-=oIter->second;
00241           if(!iter->second){
00242             typename StorageType::iterator tIter=iter;
00243             ++tIter;
00244             d_data.erase(iter);
00245             iter=tIter;
00246           } else {
00247             ++iter;
00248           }
00249         } else {
00250           d_data[oIter->first] = -oIter->second;
00251         }
00252         ++oIter;
00253       }
00254       return *this;
00255     };
00256     const SparseIntVect<IndexType> 
00257     operator- (const SparseIntVect<IndexType> &other) const {
00258       SparseIntVect<IndexType> res(*this);
00259       return res-=other;
00260     }
00261 
00262     bool operator==(const SparseIntVect<IndexType> &v2) const{
00263       if(d_length!=v2.d_length){
00264         return false;
00265       }
00266       return d_data==v2.d_data;
00267     }
00268     bool operator!=(const SparseIntVect<IndexType> &v2) const {
00269       return !(*this==v2);
00270     }
00271 
00272     //! returns a binary string representation (pickle)
00273     std::string toString() const {
00274       std::stringstream ss(std::ios_base::binary|std::ios_base::out|std::ios_base::in);
00275       boost::uint32_t tInt;
00276       tInt=ci_SPARSEINTVECT_VERSION;
00277       streamWrite(ss,tInt);
00278       tInt=sizeof(IndexType);
00279       streamWrite(ss,tInt);
00280       streamWrite(ss,d_length);
00281       IndexType nEntries=d_data.size();
00282       streamWrite(ss,nEntries);
00283 
00284       typename StorageType::const_iterator iter=d_data.begin();
00285       while(iter!=d_data.end()){
00286         streamWrite(ss,iter->first);
00287         boost::int32_t tInt=iter->second;
00288         streamWrite(ss,tInt);
00289         ++iter;
00290       }
00291       return ss.str();
00292     };
00293 
00294     void fromString(const std::string &txt) {
00295       initFromText(txt.c_str(),txt.length());
00296     }
00297 
00298   private:
00299     IndexType d_length;
00300     StorageType d_data;
00301     
00302     void initFromText(const char *pkl,const unsigned int len) {
00303       d_data.clear();
00304       std::stringstream ss(std::ios_base::binary|std::ios_base::out|std::ios_base::in);
00305       ss.write(pkl,len);
00306       
00307       boost::uint32_t vers;
00308       streamRead(ss,vers);
00309       if(vers==0x0001){
00310         boost::uint32_t tInt;
00311         streamRead(ss,tInt);
00312         if(tInt>sizeof(IndexType)){
00313           throw ValueErrorException("IndexType cannot accomodate index size in SparseIntVect pickle");
00314         }
00315         switch(tInt){
00316         case sizeof(char):
00317           readVals<unsigned char>(ss);break;
00318         case sizeof(boost::int32_t):
00319           readVals<boost::uint32_t>(ss);break;
00320         case sizeof(boost::int64_t):
00321           readVals<boost::uint64_t>(ss);break;
00322         default:
00323           throw ValueErrorException("unreadable format");
00324         }
00325       } else {
00326         throw ValueErrorException("bad version in SparseIntVect pickle");
00327       }
00328     };
00329     template <typename T>
00330     void readVals(std::stringstream &ss){
00331       PRECONDITION(sizeof(T)<=sizeof(IndexType),"invalid size");
00332       T tVal;
00333       streamRead(ss,tVal);
00334       d_length=tVal;
00335       T nEntries;
00336       streamRead(ss,nEntries);
00337       for(T i=0;i<nEntries;++i){
00338         streamRead(ss,tVal);
00339         boost::int32_t val;
00340         streamRead(ss,val);
00341         d_data[tVal]=val;
00342       }
00343     }
00344   };
00345 
00346   template <typename IndexType, typename SequenceType>
00347   void updateFromSequence(SparseIntVect<IndexType> &vect,
00348                           const SequenceType &seq){
00349     typename SequenceType::const_iterator seqIt;
00350     for(seqIt=seq.begin();seqIt!=seq.end();++seqIt){
00351       // EFF: probably not the most efficient approach
00352       IndexType idx=*seqIt;
00353       vect.setVal(idx,vect.getVal(idx)+1);
00354     }
00355   }
00356 
00357   namespace {
00358     template <typename IndexType>
00359     void calcVectParams(const SparseIntVect<IndexType> &v1,
00360                           const SparseIntVect<IndexType> &v2,
00361                           double &v1Sum,double &v2Sum,
00362                           double &andSum){
00363       if(v1.getLength()!=v2.getLength()){
00364         throw ValueErrorException("SparseIntVect size mismatch");
00365       }
00366       v1Sum=v2Sum=andSum=0.0;
00367       // we're doing : (v1&v2).getTotalVal(), but w/o generating
00368       // the other vector:
00369       typename SparseIntVect<IndexType>::StorageType::const_iterator iter1,iter2;
00370       iter1=v1.getNonzeroElements().begin();
00371       if(iter1!=v1.getNonzeroElements().end()) v1Sum+=abs(iter1->second); 
00372       iter2=v2.getNonzeroElements().begin();
00373       if(iter2!=v2.getNonzeroElements().end()) v2Sum+=abs(iter2->second); 
00374       while(iter1 != v1.getNonzeroElements().end()){
00375         while(iter2!=v2.getNonzeroElements().end() && iter2->first < iter1->first){
00376           ++iter2;
00377           if(iter2!=v2.getNonzeroElements().end()) v2Sum+=abs(iter2->second); 
00378         }
00379         if(iter2!=v2.getNonzeroElements().end()){
00380           if(iter2->first == iter1->first){
00381             if(abs(iter2->second)<abs(iter1->second)){
00382               andSum += abs(iter2->second);
00383             } else {
00384               andSum += abs(iter1->second);
00385             }
00386             ++iter2;
00387             if(iter2!=v2.getNonzeroElements().end()) v2Sum+=abs(iter2->second); 
00388           }
00389           ++iter1;
00390           if(iter1!=v1.getNonzeroElements().end()) v1Sum+=abs(iter1->second); 
00391         } else {
00392           break;
00393         }
00394       }
00395       if(iter1 != v1.getNonzeroElements().end()){
00396         ++iter1;
00397         while(iter1!=v1.getNonzeroElements().end()){
00398           v1Sum+=abs(iter1->second);
00399           ++iter1;
00400         }
00401       }
00402       if(iter2!=v2.getNonzeroElements().end()){
00403         ++iter2;
00404         while(iter2!=v2.getNonzeroElements().end()){
00405           v2Sum+=abs(iter2->second);
00406           ++iter2;
00407         }
00408       }
00409     }
00410   }
00411   
00412   template <typename IndexType>
00413   double DiceSimilarity(const SparseIntVect<IndexType> &v1,
00414                         const SparseIntVect<IndexType> &v2,
00415                         bool returnDistance=false,
00416                         double bounds=0.0){
00417     if(v1.getLength()!=v2.getLength()){
00418       throw ValueErrorException("SparseIntVect size mismatch");
00419     }
00420     double v1Sum=0.0;
00421     double v2Sum=0.0;
00422     if(!returnDistance && bounds>0.0){
00423       v1Sum=v1.getTotalVal(true);
00424       v2Sum=v2.getTotalVal(true);
00425       double denom=v1Sum+v2Sum;
00426       if(fabs(denom)<1e-6){
00427         if(returnDistance){
00428           return 1.0;
00429         } else {
00430           return 0.0;
00431         }
00432       }
00433       double minV=v1Sum<v2Sum?v1Sum:v2Sum;
00434       if(2.*minV/denom<bounds){
00435         return 0.0;
00436       }
00437       v1Sum=0.0;
00438       v2Sum=0.0;
00439     }
00440 
00441     double numer=0.0;
00442 
00443     calcVectParams(v1,v2,v1Sum,v2Sum,numer);
00444 
00445     double denom=v1Sum+v2Sum;
00446     double sim;
00447     if(fabs(denom)<1e-6){
00448       sim=0.0;
00449     } else {
00450       sim=2.*numer/denom;
00451     }
00452     if(returnDistance) sim = 1.-sim;
00453     //std::cerr<<" "<<v1Sum<<" "<<v2Sum<<" " << numer << " " << sim <<std::endl;
00454     return sim;
00455   }
00456 
00457 
00458   template <typename IndexType>
00459   double TverskySimilarity(const SparseIntVect<IndexType> &v1,
00460                            const SparseIntVect<IndexType> &v2,
00461                            double a, double b,
00462                            bool returnDistance=false,
00463                            double bounds=0.0){
00464     if(v1.getLength()!=v2.getLength()){
00465       throw ValueErrorException("SparseIntVect size mismatch");
00466     }
00467     double v1Sum=0.0;
00468     double v2Sum=0.0;
00469     double andSum=0.0;
00470 
00471     calcVectParams(v1,v2,v1Sum,v2Sum,andSum);
00472 
00473     double denom=a*v1Sum+b*v2Sum+(1-a-b)*andSum;
00474     double sim;
00475 
00476     if(fabs(denom)<1e-6){
00477       sim=0.0;
00478     } else {
00479       sim=andSum/denom;
00480     }
00481     if(returnDistance) sim = 1.-sim;
00482     //std::cerr<<" "<<v1Sum<<" "<<v2Sum<<" " << numer << " " << sim <<std::endl;
00483     return sim;
00484   }
00485 
00486   template <typename IndexType>
00487   double TanimotoSimilarity(const SparseIntVect<IndexType> &v1,
00488                             const SparseIntVect<IndexType> &v2,
00489                             bool returnDistance=false,
00490                             double bounds=0.0){
00491     return TverskySimilarity(v1,v2,1.0,1.0,returnDistance,bounds);
00492   }
00493 
00494 } 
00495 
00496 
00497 
00498 #endif