00001
00002
00003
00004
00005
00006
00007 #ifndef __RD_SPARSE_INT_VECT_20070921__
00008 #define __RD_SPARSE_INT_VECT_20070921__
00009
00010 #include <map>
00011 #include <string>
00012 #include <RDGeneral/Invariant.h>
00013 #include <sstream>
00014 #include <RDBoost/Exceptions.h>
00015 #include <RDGeneral/StreamOps.h>
00016 #include <boost/cstdint.hpp>
00017
00018 const int ci_SPARSEINTVECT_VERSION=0x0001;
00019 namespace RDKit{
00020
00021 template <typename IndexType>
00022 class SparseIntVect {
00023 public:
00024 typedef std::map<IndexType,int> StorageType;
00025
00026 SparseIntVect() : d_length(0) {};
00027
00028
00029 SparseIntVect(IndexType length) : d_length(length) {};
00030
00031
00032 SparseIntVect(const SparseIntVect<IndexType> &other){
00033 d_length=other.d_length;
00034 d_data.insert(other.d_data.begin(),other.d_data.end());
00035 }
00036
00037
00038 SparseIntVect(const std::string pkl){
00039 initFromText(pkl.c_str(),pkl.size());
00040 };
00041
00042 SparseIntVect(const char *pkl,const unsigned int len){
00043 initFromText(pkl,len);
00044 };
00045
00046
00047 ~SparseIntVect() {}
00048
00049
00050 int getVal(IndexType idx) const {
00051 if(idx<0||idx>=d_length){
00052 throw IndexErrorException(static_cast<int>(idx));
00053 }
00054 int res=0;
00055 typename StorageType::const_iterator iter=d_data.find(idx);
00056 if(iter!=d_data.end()){
00057 res=iter->second;
00058 }
00059 return res;
00060 };
00061 int operator[] (IndexType idx) const { return getVal(idx); };
00062
00063
00064 void setVal(IndexType idx, int val){
00065 if(idx<0||idx>=d_length){
00066 throw IndexErrorException(static_cast<int>(idx));
00067 }
00068 if(val!=0){
00069 d_data[idx]=val;
00070 } else {
00071 d_data.erase(idx);
00072 }
00073 };
00074
00075 IndexType getLength() const { return d_length; };
00076
00077
00078 int getTotalVal() const {
00079 int res=0;
00080 typename StorageType::const_iterator iter;
00081 for(iter=d_data.begin();iter!=d_data.end();++iter){
00082 res+=iter->second;
00083 }
00084 return res;
00085 };
00086
00087
00088
00089 const StorageType &getNonzeroElements() const {
00090 return d_data;
00091 }
00092
00093
00094
00095
00096
00097 SparseIntVect<IndexType> &
00098 operator&= (const SparseIntVect<IndexType> &other) {
00099 if(other.d_length!=d_length){
00100 throw ValueErrorException("SparseIntVect size mismatch");
00101 }
00102
00103 typename StorageType::iterator iter=d_data.begin();
00104 typename StorageType::const_iterator oIter=other.d_data.begin();
00105 while(iter!=d_data.end()){
00106
00107 while(oIter!=other.d_data.end() && oIter->first < iter->first){
00108 ++oIter;
00109 }
00110 if(oIter!=other.d_data.end() && oIter->first==iter->first){
00111
00112 if(oIter->second<iter->second){
00113 iter->second=oIter->second;
00114 }
00115 ++oIter;
00116 ++iter;
00117 } else {
00118
00119
00120 typename StorageType::iterator tmpIter=iter;
00121 ++tmpIter;
00122 d_data.erase(iter);
00123 iter=tmpIter;
00124 }
00125 }
00126 return *this;
00127 };
00128 const SparseIntVect<IndexType>
00129 operator& (const SparseIntVect<IndexType> &other) const {
00130 SparseIntVect<IndexType> res(*this);
00131 return res&=other;
00132 }
00133
00134
00135
00136
00137 SparseIntVect<IndexType> &
00138 operator|= (const SparseIntVect<IndexType> &other) {
00139 if(other.d_length!=d_length){
00140 throw ValueErrorException("SparseIntVect size mismatch");
00141 }
00142
00143 typename StorageType::iterator iter=d_data.begin();
00144 typename StorageType::const_iterator oIter=other.d_data.begin();
00145 while(iter!=d_data.end()){
00146
00147 while(oIter!=other.d_data.end() &&
00148 oIter->first < iter->first){
00149 d_data[oIter->first]=oIter->second;
00150 ++oIter;
00151 }
00152 if(oIter!=other.d_data.end() && oIter->first==iter->first){
00153
00154 if(oIter->second>iter->second){
00155 iter->second=oIter->second;
00156 }
00157 ++oIter;
00158 }
00159 ++iter;
00160 }
00161
00162 while(oIter!=other.d_data.end()){
00163 d_data[oIter->first]=oIter->second;
00164 ++oIter;
00165 }
00166 return *this;
00167 };
00168 const SparseIntVect<IndexType>
00169 operator| (const SparseIntVect<IndexType> &other) const {
00170 SparseIntVect<IndexType> res(*this);
00171 return res|=other;
00172 }
00173
00174 SparseIntVect<IndexType> &
00175 operator+= (const SparseIntVect<IndexType> &other) {
00176 if(other.d_length!=d_length){
00177 throw ValueErrorException("SparseIntVect size mismatch");
00178 }
00179 typename StorageType::iterator iter=d_data.begin();
00180 typename StorageType::const_iterator oIter=other.d_data.begin();
00181 while(oIter!=other.d_data.end()){
00182 while(iter!=d_data.end() &&
00183 iter->first < oIter->first){
00184 ++iter;
00185 }
00186 if(iter!=d_data.end() && oIter->first==iter->first){
00187
00188 iter->second+=oIter->second;
00189 if(!iter->second){
00190 typename StorageType::iterator tIter=iter;
00191 ++tIter;
00192 d_data.erase(iter);
00193 iter=tIter;
00194 } else {
00195 ++iter;
00196 }
00197 } else {
00198 d_data[oIter->first]=oIter->second;
00199 }
00200 ++oIter;
00201 }
00202 return *this;
00203 };
00204 const SparseIntVect<IndexType>
00205 operator+ (const SparseIntVect<IndexType> &other) const {
00206 SparseIntVect<IndexType> res(*this);
00207 return res+=other;
00208 }
00209
00210 SparseIntVect<IndexType> &
00211 operator-= (const SparseIntVect<IndexType> &other) {
00212 if(other.d_length!=d_length){
00213 throw ValueErrorException("SparseIntVect size mismatch");
00214 }
00215 typename StorageType::iterator iter=d_data.begin();
00216 typename StorageType::const_iterator oIter=other.d_data.begin();
00217 while(oIter!=other.d_data.end()){
00218 while(iter!=d_data.end() &&
00219 iter->first < oIter->first){
00220 ++iter;
00221 }
00222 if(iter!=d_data.end() && oIter->first==iter->first){
00223
00224 iter->second-=oIter->second;
00225 if(!iter->second){
00226 typename StorageType::iterator tIter=iter;
00227 ++tIter;
00228 d_data.erase(iter);
00229 iter=tIter;
00230 } else {
00231 ++iter;
00232 }
00233 } else {
00234 d_data[oIter->first]=-oIter->second;
00235 }
00236 ++oIter;
00237 }
00238 return *this;
00239 };
00240 const SparseIntVect<IndexType>
00241 operator- (const SparseIntVect<IndexType> &other) const {
00242 SparseIntVect<IndexType> res(*this);
00243 return res-=other;
00244 }
00245
00246 bool operator==(const SparseIntVect<IndexType> &v2) const{
00247 if(d_length!=v2.d_length){
00248 return false;
00249 }
00250 return d_data==v2.d_data;
00251 }
00252 bool operator!=(const SparseIntVect<IndexType> &v2) const {
00253 return !(*this==v2);
00254 }
00255
00256
00257 std::string toString() const {
00258 std::stringstream ss(std::ios_base::binary|std::ios_base::out|std::ios_base::in);
00259 boost::uint32_t tInt;
00260 tInt=ci_SPARSEINTVECT_VERSION;
00261 streamWrite(ss,tInt);
00262 tInt=sizeof(IndexType);
00263 streamWrite(ss,tInt);
00264 streamWrite(ss,d_length);
00265 IndexType nEntries=d_data.size();
00266 streamWrite(ss,nEntries);
00267
00268 typename StorageType::const_iterator iter=d_data.begin();
00269 while(iter!=d_data.end()){
00270 streamWrite(ss,iter->first);
00271 boost::int32_t tInt=iter->second;
00272 streamWrite(ss,tInt);
00273 ++iter;
00274 }
00275 return ss.str();
00276 };
00277
00278 void fromString(const std::string &txt) {
00279 initFromText(txt.c_str(),txt.length());
00280 }
00281
00282 private:
00283 IndexType d_length;
00284 StorageType d_data;
00285
00286 void initFromText(const char *pkl,const unsigned int len) {
00287 d_data.clear();
00288 std::stringstream ss(std::ios_base::binary|std::ios_base::out|std::ios_base::in);
00289 ss.write(pkl,len);
00290
00291 boost::uint32_t vers;
00292 streamRead(ss,vers);
00293 if(vers==0x0001){
00294 boost::uint32_t tInt;
00295 streamRead(ss,tInt);
00296 if(tInt>sizeof(IndexType)){
00297 throw ValueErrorException("IndexType cannot accomodate index size in SparseIntVect pickle");
00298 }
00299 switch(tInt){
00300 case sizeof(char):
00301 readVals<unsigned char>(ss);break;
00302 case sizeof(boost::int32_t):
00303 readVals<boost::uint32_t>(ss);break;
00304 case sizeof(boost::int64_t):
00305 readVals<boost::uint64_t>(ss);break;
00306 default:
00307 throw ValueErrorException("unreadable format");
00308 }
00309 } else {
00310 throw ValueErrorException("bad version in SparseIntVect pickle");
00311 }
00312 };
00313 template <typename T>
00314 void readVals(std::stringstream &ss){
00315 PRECONDITION(sizeof(T)<=sizeof(IndexType),"invalid size");
00316 T tVal;
00317 streamRead(ss,tVal);
00318 d_length=tVal;
00319 T nEntries;
00320 streamRead(ss,nEntries);
00321 for(T i=0;i<nEntries;++i){
00322 streamRead(ss,tVal);
00323 boost::int32_t val;
00324 streamRead(ss,val);
00325 d_data[tVal]=val;
00326 }
00327 }
00328 };
00329
00330 template <typename IndexType, typename SequenceType>
00331 void updateFromSequence(SparseIntVect<IndexType> &vect,
00332 const SequenceType &seq){
00333 typename SequenceType::const_iterator seqIt;
00334 for(seqIt=seq.begin();seqIt!=seq.end();++seqIt){
00335
00336 IndexType idx=*seqIt;
00337 vect.setVal(idx,vect.getVal(idx)+1);
00338 }
00339 }
00340
00341 template <typename IndexType>
00342 double DiceSimilarity(const SparseIntVect<IndexType> &v1,
00343 const SparseIntVect<IndexType> &v2,
00344 bool returnDistance=false,
00345 double bounds=0.0){
00346 if(v1.getLength()!=v2.getLength()){
00347 throw ValueErrorException("SparseIntVect size mismatch");
00348 }
00349 double v1Sum=v1.getTotalVal();
00350 double v2Sum=v2.getTotalVal();
00351 double denom=v1Sum+v2Sum;
00352 if(fabs(denom)<1e-6){
00353 if(returnDistance){
00354 return 1.0;
00355 } else {
00356 return 0.0;
00357 }
00358 }
00359 if(!returnDistance && bounds>0.0){
00360 double minV=v1Sum<v2Sum?v1Sum:v2Sum;
00361 if(2.*minV/denom<bounds){
00362 return 0.0;
00363 }
00364 }
00365
00366 double numer=0.0;
00367
00368
00369 typename SparseIntVect<IndexType>::StorageType::const_iterator iter1,iter2;
00370 iter1=v1.getNonzeroElements().begin();
00371 iter2=v2.getNonzeroElements().begin();
00372 while(iter1 != v1.getNonzeroElements().end()){
00373 while(iter2!=v2.getNonzeroElements().end() && iter2->first < iter1->first){
00374 ++iter2;
00375 }
00376 if(iter2!=v2.getNonzeroElements().end()){
00377 if(iter2->first == iter1->first){
00378 if(iter2->second<iter1->second){
00379 numer += iter2->second;
00380 } else {
00381 numer += iter1->second;
00382 }
00383 ++iter2;
00384 }
00385 ++iter1;
00386 } else {
00387 break;
00388 }
00389 }
00390 double sim=2.*numer/denom;
00391 if(returnDistance) sim = 1.-sim;
00392 return sim;
00393 }
00394 }
00395
00396
00397
00398 #endif