RDKit
Open-source cheminformatics and machine learning.
Loading...
Searching...
No Matches
InfoBitRanker.h
Go to the documentation of this file.
1// $Id$
2//
3// Copyright (C) 2003-2007 Greg Landrum and Rational Discovery LLC
4// @@ All Rights Reserved @@
5// This file is part of the RDKit.
6// The contents are covered by the terms of the BSD license
7// which is included in the file license.txt, found at the root
8// of the RDKit source tree.
9//
10
11#include <RDGeneral/export.h>
12#ifndef _RD_INFORANKER_H_
13#define _RD_INFORANKER_H_
14
15#include <RDGeneral/types.h>
17
18/*! \brief Class used to rank bits based on a specified measure of information
19 *
20 * Basically a primitive mimic of the CombiChem "signal" functionality
21 * To use:
22 * - create an instance of this class
23 * - loop over the fingerprints in the dataset, calling the accumulateVotes
24 * method on each one
25 * - call getTopN to get the top n ranked bits
26 *
27 * Sample usage and results from the python wrapper:
28 * Here's a small set of vectors:
29 * >>> for i,bv in enumerate(bvs): print bv.ToBitString(),acts[i]
30 * ...
31 * 0001 0
32 * 0101 0
33 * 0010 1
34 * 1110 1
35 *
36 * Default ranker, using infogain:
37 * >>> ranker = InfoBitRanker(4,2)
38 * >>> for i,bv in enumerate(bvs): ranker.AccumulateVotes(bv,acts[i])
39 * ...
40 * >>> for bit,gain,n0,n1 in ranker.GetTopN(3): print
41 *int(bit),'%.3f'%gain,int(n0),int(n1)
42 * ...
43 * 3 1.000 2 0
44 * 2 1.000 0 2
45 * 0 0.311 0 1
46 *
47 * Using the biased infogain:
48 * >>> ranker = InfoBitRanker(4,2,InfoTheory.InfoType.BIASENTROPY)
49 * >>> ranker.SetBiasList((1,))
50 * >>> for i,bv in enumerate(bvs): ranker.AccumulateVotes(bv,acts[i])
51 * ...
52 * >>> for bit,gain,n0,n1 in ranker.GetTopN(3): print
53 *int(bit),'%.3f'%gain,int(n0),int(n1)
54 * ...
55 * 2 1.000 0 2
56 * 0 0.311 0 1
57 * 1 0.000 1 1
58 *
59 * A chi squared ranker is also available:
60 * >>> ranker = InfoBitRanker(4,2,InfoTheory.InfoType.CHISQUARE)
61 * >>> for i,bv in enumerate(bvs): ranker.AccumulateVotes(bv,acts[i])
62 * ...
63 * >>> for bit,gain,n0,n1 in ranker.GetTopN(3): print
64 *int(bit),'%.3f'%gain,int(n0),int(n1)
65 * ...
66 * 3 4.000 2 0
67 * 2 4.000 0 2
68 * 0 1.333 0 1
69 *
70 * As is a biased chi squared:
71 * >>> ranker = InfoBitRanker(4,2,InfoTheory.InfoType.BIASCHISQUARE)
72 * >>> ranker.SetBiasList((1,))
73 * >>> for i,bv in enumerate(bvs): ranker.AccumulateVotes(bv,acts[i])
74 * ...
75 * >>> for bit,gain,n0,n1 in ranker.GetTopN(3): print
76 *int(bit),'%.3f'%gain,int(n0),int(n1)
77 * ...
78 * 2 4.000 0 2
79 * 0 1.333 0 1
80 * 1 0.000 1 1
81 */
82namespace RDInfoTheory {
83typedef std::vector<RDKit::USHORT> USHORT_VECT;
84typedef std::vector<USHORT_VECT> VECT_USHORT_VECT;
85
87 public:
88 /*! \brief the type of measure for information
89 *
90 */
91 typedef enum {
96 } InfoType;
97
98 /*! \brief Constructor
99 *
100 * ARGUMENTS:
101 *
102 * - nBits: the dimension of the bit vectors or the fingerprint length
103 * - nClasses: the number of classes used in the classification problem
104 *(e.g. active,
105 * moderately active, inactive etc.). It is assumed that the
106 *classes are
107 * numbered from 0 to (nClasses - 1)
108 * - infoType: the type of information metric
109 */
110 InfoBitRanker(unsigned int nBits, unsigned int nClasses,
112 : d_dims(nBits), d_classes(nClasses), d_type(infoType) {
113 d_counts.resize(0);
114 for (unsigned int i = 0; i < nClasses; i++) {
115 USHORT_VECT cCount;
116 cCount.resize(d_dims, 0);
117 d_counts.push_back(cCount);
118 }
119 d_clsCount.resize(d_classes, 0);
120 d_nInst = 0;
121 d_top = 0;
122 dp_topBits = nullptr;
123 d_biasList.resize(0);
124 dp_maskBits = nullptr;
125 }
126
128 if (dp_topBits) {
129 delete[] dp_topBits;
130 }
131 if (dp_maskBits) {
132 delete dp_maskBits;
133 }
134 }
135
136 /*! \brief Accumulate the votes for all the bits turned on in a bit vector
137 *
138 * ARGUMENTS:
139 *
140 * - bv : bit vector that supports [] operator
141 * - label : the class label for the bit vector. It is assumed that 0 <=
142 * label < nClasses
143 */
144 void accumulateVotes(const ExplicitBitVect &bv, unsigned int label);
145 void accumulateVotes(const SparseBitVect &bv, unsigned int label);
146
147 /*! \brief Returns the top n bits ranked by the information metric
148 *
149 * This is actually the function where most of the work of ranking is
150 *happening
151 *
152 * \param num the number of top ranked bits that are required
153 *
154 * \return a pointer to an information array. The client should *not*
155 * delete this
156 */
157 double *getTopN(unsigned int num);
158
159 /*! \brief Return the number of labelled instances (examples) or fingerprints
160 * seen so far
161 *
162 * \return the current value of the d_nInst counter */
163 unsigned int getNumInstances() const { return d_nInst; }
164
165 /*! \brief Return the number of classes
166 *
167 * \return d_classes, which the constructor initializes from nClasses */
168 unsigned int getNumClasses() const { return d_classes; }
169
170 /*! \brief Set the classes to which the entropy calculation should be biased
171 *
172 * This list contains a set of class ids used when in the BIASENTROPY mode of
173 *ranking bits.
174 * In this mode, a bit must be correlated higher with one of the biased
175 *classes than all the
176 * other classes. For example, in a two class problem with actives and
177 *inactives, the fraction of
178 * actives that hit the bit has to be greater than the fraction of inactives
179 *that hit the bit
180 *
181 * ARGUMENTS:
182 * classList - list of class ids that we want a bias towards
183 */
184 void setBiasList(RDKit::INT_VECT &classList);
185
186 /*! \brief Set the bits to be used as a mask
187 *
188 * If this function is called, only the bits which are present in the
189 * maskBits list will be used.
190 *
191 * ARGUMENTS:
192 * maskBits - the bits to be considered
193 */
195
196 /*! \brief Write the top N bits to a stream
197 *
198 */
199 void writeTopBitsToStream(std::ostream *outStream) const;
200
201 /*! \brief Write the top bits to a file
202 *
203 */
204 void writeTopBitsToFile(const std::string &fileName) const;
205
206 private:
207 /*! \brief check if we want to compute the info content for a bit based on the
208 *bias list
209 *
210 * This is what happens here:
211 * - the fraction of items in each class that hit a particular bit are
212 *computed
213 * - the maximum of these fractions for classes that are not in the
214 *biasList is computed
215 * - If this maximum is less than the fraction for at least one of the
216 * classes in the biaslist, the bit is considered good
217 * ARGUMENTS:
218 * - resMat : the result matrix, one dimensional matrix of dimension (2*(num
219 *of classes))
220 * a 2D structure is assumed with the first row containing number
221 *of items of each class
222 * with the bit set and the second row to entries of each class
223 *with the bit turned off
224 */
225 bool BiasCheckBit(RDKit::USHORT *resMat) const;
226
227 /*! \brief Compute the biased info entropy gain based on the bias list
228 *
229 * This is what happens here:
230 * - we call BiasCheckBit to see if the bit qualifies to compute the
231 *infocontent
232 * - If this bit is ok then we call InfoEntropyGain otherwise we return 0.0
233 *
234 * ARGUMENTS:
235 * - resMat : the result matrix, one dimensional matrix of dimension (2*(num
236 *of classes))
237 * a 2D structure is assumed with the first row containing number
238 *of items of each class
239 * with the bit set and the second row to entries of each class
240 *with the bit turned off
241 */
242 double BiasInfoEntropyGain(RDKit::USHORT *resMat) const;
243
244 /*! \brief Compute the biased chi-square value based on the bias list
245 *
246 * This is what happens here:
247 * - we call BiasCheckBit to see if the bit qualifies to compute the
248 *infocontent
249 * - If this bit is ok then we compute the chi-square value otherwise we return 0.0
250 *
251 * ARGUMENTS:
252 * - resMat : the result matrix, one dimensional matrix of dimension (2*(num
253 *of classes))
254 * a 2D structure is assumed with the first row containing number
255 *of items of each class
256 * with the bit set and the second row to entries of each class
257 *with the bit turned off
258 */
259 double BiasChiSquareGain(RDKit::USHORT *resMat) const;
260
261 unsigned int d_dims; // the number of bits in the fingerprints
262 unsigned int d_classes; // the number of classes (active, inactive,
263 // moderately active etc.)
264 InfoType d_type; // the type of information measure (one of the InfoType
265 // values, e.g. entropy or chi square)
266 VECT_USHORT_VECT d_counts; // storage for counting the number of hits for
267 // each bit for each class
268 USHORT_VECT d_clsCount; // counter for the number of instances of each class
269 double *dp_topBits; // storage for the top ranked bits and the corresponding
270 // statistics
271 unsigned int d_top; // the number of bits that have been ranked
272 unsigned int d_nInst; // total number of instances or fingerprints used
273 // to accumulate votes
275 d_biasList; // if we want a bias towards certain classes in ranking bits
276 ExplicitBitVect *dp_maskBits; // allows only certain bits to be considered
277};
278} // namespace RDInfoTheory
279#endif
Pulls in all the BitVect classes.
a class for bit vectors that are densely occupied
void accumulateVotes(const ExplicitBitVect &bv, unsigned int label)
Accumulate the votes for all the bits turned on in a bit vector.
InfoType
the type of measure for information
void setMaskBits(RDKit::INT_VECT &maskBits)
Set the bits to be used as a mask.
void writeTopBitsToFile(const std::string &fileName) const
Write the top bits to a file.
InfoBitRanker(unsigned int nBits, unsigned int nClasses, InfoType infoType=InfoBitRanker::ENTROPY)
Constructor.
unsigned int getNumClasses() const
return the number of classes
void accumulateVotes(const SparseBitVect &bv, unsigned int label)
double * getTopN(unsigned int num)
Returns the top n bits ranked by the information metric.
unsigned int getNumInstances() const
return the number of labelled instances(examples) or fingerprints seen so far
void writeTopBitsToStream(std::ostream *outStream) const
Write the top N bits to a stream.
void setBiasList(RDKit::INT_VECT &classList)
Set the classes to which the entropy calculation should be biased.
a class for bit vectors that are sparsely occupied.
#define RDKIT_INFOTHEORY_EXPORT
Definition export.h:265
Class used to rank bits based on a specified measure of information.
std::vector< RDKit::USHORT > USHORT_VECT
std::vector< USHORT_VECT > VECT_USHORT_VECT
std::vector< int > INT_VECT
Definition types.h:303
unsigned short USHORT
Definition types.h:300