00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011 #ifndef __RD_SPARSE_INT_VECT_20070921__
00012 #define __RD_SPARSE_INT_VECT_20070921__
00013
00014 #include <map>
00015 #include <string>
00016 #include <RDGeneral/Invariant.h>
00017 #include <sstream>
00018 #include <RDBoost/Exceptions.h>
00019 #include <RDGeneral/StreamOps.h>
00020 #include <boost/cstdint.hpp>
00021
00022
00023 const int ci_SPARSEINTVECT_VERSION=0x0001;
00024 namespace RDKit{
00025
00026 template <typename IndexType>
00027 class SparseIntVect {
00028 public:
00029 typedef std::map<IndexType,int> StorageType;
00030
00031 SparseIntVect() : d_length(0) {};
00032
00033
00034 SparseIntVect(IndexType length) : d_length(length) {};
00035
00036
00037 SparseIntVect(const SparseIntVect<IndexType> &other){
00038 d_length=other.d_length;
00039 d_data.insert(other.d_data.begin(),other.d_data.end());
00040 }
00041
00042
00043 SparseIntVect(const std::string pkl){
00044 initFromText(pkl.c_str(),pkl.size());
00045 };
00046
00047 SparseIntVect(const char *pkl,const unsigned int len){
00048 initFromText(pkl,len);
00049 };
00050
00051
00052 ~SparseIntVect() {}
00053
00054 #ifdef __clang__
00055 #pragma clang diagnostic push
00056 #pragma clang diagnostic ignored "-Wtautological-compare"
00057 #endif
00058
00059 int getVal(IndexType idx) const {
00060 if(idx<0||idx>=d_length){
00061 throw IndexErrorException(static_cast<int>(idx));
00062 }
00063 int res=0;
00064 typename StorageType::const_iterator iter=d_data.find(idx);
00065 if(iter!=d_data.end()){
00066 res=iter->second;
00067 }
00068 return res;
00069 };
00070
00071
00072 void setVal(IndexType idx, int val){
00073 if(idx<0||idx>=d_length){
00074 throw IndexErrorException(static_cast<int>(idx));
00075 }
00076 if(val!=0){
00077 d_data[idx]=val;
00078 } else {
00079 d_data.erase(idx);
00080 }
00081 };
00082 #ifdef __clang__
00083 #pragma clang diagnostic pop
00084 #endif
00085
00086 int operator[] (IndexType idx) const { return getVal(idx); };
00087
00088
00089 IndexType getLength() const { return d_length; };
00090
00091
00092
00093 int getTotalVal(bool doAbs=false) const {
00094 int res=0;
00095 typename StorageType::const_iterator iter;
00096 for(iter=d_data.begin();iter!=d_data.end();++iter){
00097 if(!doAbs) res+=iter->second;
00098 else res+=abs(iter->second);
00099 }
00100 return res;
00101 };
00102
00103
00104
00105 const StorageType &getNonzeroElements() const {
00106 return d_data;
00107 }
00108
00109
00110
00111
00112
00113 SparseIntVect<IndexType> &
00114 operator&= (const SparseIntVect<IndexType> &other) {
00115 if(other.d_length!=d_length){
00116 throw ValueErrorException("SparseIntVect size mismatch");
00117 }
00118
00119 typename StorageType::iterator iter=d_data.begin();
00120 typename StorageType::const_iterator oIter=other.d_data.begin();
00121 while(iter!=d_data.end()){
00122
00123 while(oIter!=other.d_data.end() && oIter->first < iter->first){
00124 ++oIter;
00125 }
00126 if(oIter!=other.d_data.end() && oIter->first==iter->first){
00127
00128 if(oIter->second<iter->second){
00129 iter->second=oIter->second;
00130 }
00131 ++oIter;
00132 ++iter;
00133 } else {
00134
00135
00136 typename StorageType::iterator tmpIter=iter;
00137 ++tmpIter;
00138 d_data.erase(iter);
00139 iter=tmpIter;
00140 }
00141 }
00142 return *this;
00143 };
00144 const SparseIntVect<IndexType>
00145 operator& (const SparseIntVect<IndexType> &other) const {
00146 SparseIntVect<IndexType> res(*this);
00147 return res&=other;
00148 }
00149
00150
00151
00152
00153 SparseIntVect<IndexType> &
00154 operator|= (const SparseIntVect<IndexType> &other) {
00155 if(other.d_length!=d_length){
00156 throw ValueErrorException("SparseIntVect size mismatch");
00157 }
00158
00159 typename StorageType::iterator iter=d_data.begin();
00160 typename StorageType::const_iterator oIter=other.d_data.begin();
00161 while(iter!=d_data.end()){
00162
00163 while(oIter!=other.d_data.end() &&
00164 oIter->first < iter->first){
00165 d_data[oIter->first]=oIter->second;
00166 ++oIter;
00167 }
00168 if(oIter!=other.d_data.end() && oIter->first==iter->first){
00169
00170 if(oIter->second>iter->second){
00171 iter->second=oIter->second;
00172 }
00173 ++oIter;
00174 }
00175 ++iter;
00176 }
00177
00178 while(oIter!=other.d_data.end()){
00179 d_data[oIter->first]=oIter->second;
00180 ++oIter;
00181 }
00182 return *this;
00183 };
00184 const SparseIntVect<IndexType>
00185 operator| (const SparseIntVect<IndexType> &other) const {
00186 SparseIntVect<IndexType> res(*this);
00187 return res|=other;
00188 }
00189
00190 SparseIntVect<IndexType> &
00191 operator+= (const SparseIntVect<IndexType> &other) {
00192 if(other.d_length!=d_length){
00193 throw ValueErrorException("SparseIntVect size mismatch");
00194 }
00195 typename StorageType::iterator iter=d_data.begin();
00196 typename StorageType::const_iterator oIter=other.d_data.begin();
00197 while(oIter!=other.d_data.end()){
00198 while(iter!=d_data.end() &&
00199 iter->first < oIter->first){
00200 ++iter;
00201 }
00202 if(iter!=d_data.end() && oIter->first==iter->first){
00203
00204 iter->second+=oIter->second;
00205 if(!iter->second){
00206 typename StorageType::iterator tIter=iter;
00207 ++tIter;
00208 d_data.erase(iter);
00209 iter=tIter;
00210 } else {
00211 ++iter;
00212 }
00213 } else {
00214 d_data[oIter->first]=oIter->second;
00215 }
00216 ++oIter;
00217 }
00218 return *this;
00219 };
00220 const SparseIntVect<IndexType>
00221 operator+ (const SparseIntVect<IndexType> &other) const {
00222 SparseIntVect<IndexType> res(*this);
00223 return res+=other;
00224 }
00225
00226 SparseIntVect<IndexType> &
00227 operator-= (const SparseIntVect<IndexType> &other) {
00228 if(other.d_length!=d_length){
00229 throw ValueErrorException("SparseIntVect size mismatch");
00230 }
00231 typename StorageType::iterator iter=d_data.begin();
00232 typename StorageType::const_iterator oIter=other.d_data.begin();
00233 while(oIter!=other.d_data.end()){
00234 while(iter!=d_data.end() &&
00235 iter->first < oIter->first){
00236 ++iter;
00237 }
00238 if(iter!=d_data.end() && oIter->first==iter->first){
00239
00240 iter->second-=oIter->second;
00241 if(!iter->second){
00242 typename StorageType::iterator tIter=iter;
00243 ++tIter;
00244 d_data.erase(iter);
00245 iter=tIter;
00246 } else {
00247 ++iter;
00248 }
00249 } else {
00250 d_data[oIter->first] = -oIter->second;
00251 }
00252 ++oIter;
00253 }
00254 return *this;
00255 };
00256 const SparseIntVect<IndexType>
00257 operator- (const SparseIntVect<IndexType> &other) const {
00258 SparseIntVect<IndexType> res(*this);
00259 return res-=other;
00260 }
00261
00262 bool operator==(const SparseIntVect<IndexType> &v2) const{
00263 if(d_length!=v2.d_length){
00264 return false;
00265 }
00266 return d_data==v2.d_data;
00267 }
00268 bool operator!=(const SparseIntVect<IndexType> &v2) const {
00269 return !(*this==v2);
00270 }
00271
00272
00273 std::string toString() const {
00274 std::stringstream ss(std::ios_base::binary|std::ios_base::out|std::ios_base::in);
00275 boost::uint32_t tInt;
00276 tInt=ci_SPARSEINTVECT_VERSION;
00277 streamWrite(ss,tInt);
00278 tInt=sizeof(IndexType);
00279 streamWrite(ss,tInt);
00280 streamWrite(ss,d_length);
00281 IndexType nEntries=d_data.size();
00282 streamWrite(ss,nEntries);
00283
00284 typename StorageType::const_iterator iter=d_data.begin();
00285 while(iter!=d_data.end()){
00286 streamWrite(ss,iter->first);
00287 boost::int32_t tInt=iter->second;
00288 streamWrite(ss,tInt);
00289 ++iter;
00290 }
00291 return ss.str();
00292 };
00293
00294 void fromString(const std::string &txt) {
00295 initFromText(txt.c_str(),txt.length());
00296 }
00297
00298 private:
00299 IndexType d_length;
00300 StorageType d_data;
00301
00302 void initFromText(const char *pkl,const unsigned int len) {
00303 d_data.clear();
00304 std::stringstream ss(std::ios_base::binary|std::ios_base::out|std::ios_base::in);
00305 ss.write(pkl,len);
00306
00307 boost::uint32_t vers;
00308 streamRead(ss,vers);
00309 if(vers==0x0001){
00310 boost::uint32_t tInt;
00311 streamRead(ss,tInt);
00312 if(tInt>sizeof(IndexType)){
00313 throw ValueErrorException("IndexType cannot accomodate index size in SparseIntVect pickle");
00314 }
00315 switch(tInt){
00316 case sizeof(char):
00317 readVals<unsigned char>(ss);break;
00318 case sizeof(boost::int32_t):
00319 readVals<boost::uint32_t>(ss);break;
00320 case sizeof(boost::int64_t):
00321 readVals<boost::uint64_t>(ss);break;
00322 default:
00323 throw ValueErrorException("unreadable format");
00324 }
00325 } else {
00326 throw ValueErrorException("bad version in SparseIntVect pickle");
00327 }
00328 };
00329 template <typename T>
00330 void readVals(std::stringstream &ss){
00331 PRECONDITION(sizeof(T)<=sizeof(IndexType),"invalid size");
00332 T tVal;
00333 streamRead(ss,tVal);
00334 d_length=tVal;
00335 T nEntries;
00336 streamRead(ss,nEntries);
00337 for(T i=0;i<nEntries;++i){
00338 streamRead(ss,tVal);
00339 boost::int32_t val;
00340 streamRead(ss,val);
00341 d_data[tVal]=val;
00342 }
00343 }
00344 };
00345
00346 template <typename IndexType, typename SequenceType>
00347 void updateFromSequence(SparseIntVect<IndexType> &vect,
00348 const SequenceType &seq){
00349 typename SequenceType::const_iterator seqIt;
00350 for(seqIt=seq.begin();seqIt!=seq.end();++seqIt){
00351
00352 IndexType idx=*seqIt;
00353 vect.setVal(idx,vect.getVal(idx)+1);
00354 }
00355 }
00356
00357 namespace {
00358 template <typename IndexType>
00359 void calcVectParams(const SparseIntVect<IndexType> &v1,
00360 const SparseIntVect<IndexType> &v2,
00361 double &v1Sum,double &v2Sum,
00362 double &andSum){
00363 if(v1.getLength()!=v2.getLength()){
00364 throw ValueErrorException("SparseIntVect size mismatch");
00365 }
00366 v1Sum=v2Sum=andSum=0.0;
00367
00368
00369 typename SparseIntVect<IndexType>::StorageType::const_iterator iter1,iter2;
00370 iter1=v1.getNonzeroElements().begin();
00371 if(iter1!=v1.getNonzeroElements().end()) v1Sum+=abs(iter1->second);
00372 iter2=v2.getNonzeroElements().begin();
00373 if(iter2!=v2.getNonzeroElements().end()) v2Sum+=abs(iter2->second);
00374 while(iter1 != v1.getNonzeroElements().end()){
00375 while(iter2!=v2.getNonzeroElements().end() && iter2->first < iter1->first){
00376 ++iter2;
00377 if(iter2!=v2.getNonzeroElements().end()) v2Sum+=abs(iter2->second);
00378 }
00379 if(iter2!=v2.getNonzeroElements().end()){
00380 if(iter2->first == iter1->first){
00381 if(abs(iter2->second)<abs(iter1->second)){
00382 andSum += abs(iter2->second);
00383 } else {
00384 andSum += abs(iter1->second);
00385 }
00386 ++iter2;
00387 if(iter2!=v2.getNonzeroElements().end()) v2Sum+=abs(iter2->second);
00388 }
00389 ++iter1;
00390 if(iter1!=v1.getNonzeroElements().end()) v1Sum+=abs(iter1->second);
00391 } else {
00392 break;
00393 }
00394 }
00395 if(iter1 != v1.getNonzeroElements().end()){
00396 ++iter1;
00397 while(iter1!=v1.getNonzeroElements().end()){
00398 v1Sum+=abs(iter1->second);
00399 ++iter1;
00400 }
00401 }
00402 if(iter2!=v2.getNonzeroElements().end()){
00403 ++iter2;
00404 while(iter2!=v2.getNonzeroElements().end()){
00405 v2Sum+=abs(iter2->second);
00406 ++iter2;
00407 }
00408 }
00409 }
00410 }
00411
00412 template <typename IndexType>
00413 double DiceSimilarity(const SparseIntVect<IndexType> &v1,
00414 const SparseIntVect<IndexType> &v2,
00415 bool returnDistance=false,
00416 double bounds=0.0){
00417 if(v1.getLength()!=v2.getLength()){
00418 throw ValueErrorException("SparseIntVect size mismatch");
00419 }
00420 double v1Sum=0.0;
00421 double v2Sum=0.0;
00422 if(!returnDistance && bounds>0.0){
00423 v1Sum=v1.getTotalVal(true);
00424 v2Sum=v2.getTotalVal(true);
00425 double denom=v1Sum+v2Sum;
00426 if(fabs(denom)<1e-6){
00427 if(returnDistance){
00428 return 1.0;
00429 } else {
00430 return 0.0;
00431 }
00432 }
00433 double minV=v1Sum<v2Sum?v1Sum:v2Sum;
00434 if(2.*minV/denom<bounds){
00435 return 0.0;
00436 }
00437 v1Sum=0.0;
00438 v2Sum=0.0;
00439 }
00440
00441 double numer=0.0;
00442
00443 calcVectParams(v1,v2,v1Sum,v2Sum,numer);
00444
00445 double denom=v1Sum+v2Sum;
00446 double sim;
00447 if(fabs(denom)<1e-6){
00448 sim=0.0;
00449 } else {
00450 sim=2.*numer/denom;
00451 }
00452 if(returnDistance) sim = 1.-sim;
00453
00454 return sim;
00455 }
00456
00457
00458 template <typename IndexType>
00459 double TverskySimilarity(const SparseIntVect<IndexType> &v1,
00460 const SparseIntVect<IndexType> &v2,
00461 double a, double b,
00462 bool returnDistance=false,
00463 double bounds=0.0){
00464 if(v1.getLength()!=v2.getLength()){
00465 throw ValueErrorException("SparseIntVect size mismatch");
00466 }
00467 double v1Sum=0.0;
00468 double v2Sum=0.0;
00469 double andSum=0.0;
00470
00471 calcVectParams(v1,v2,v1Sum,v2Sum,andSum);
00472
00473 double denom=a*v1Sum+b*v2Sum+(1-a-b)*andSum;
00474 double sim;
00475
00476 if(fabs(denom)<1e-6){
00477 sim=0.0;
00478 } else {
00479 sim=andSum/denom;
00480 }
00481 if(returnDistance) sim = 1.-sim;
00482
00483 return sim;
00484 }
00485
00486 template <typename IndexType>
00487 double TanimotoSimilarity(const SparseIntVect<IndexType> &v1,
00488 const SparseIntVect<IndexType> &v2,
00489 bool returnDistance=false,
00490 double bounds=0.0){
00491 return TverskySimilarity(v1,v2,1.0,1.0,returnDistance,bounds);
00492 }
00493
00494 }
00495
00496
00497
00498 #endif