RDKit
Open-source cheminformatics and machine learning.
Loading...
Searching...
No Matches
InfoGainFuncs.h
Go to the documentation of this file.
1// $Id$
2//
3// Copyright (C) 2003 Rational Discovery LLC
4//
5
#include <RDGeneral/export.h>
#ifndef INFOGAINFUNC_H
#define INFOGAINFUNC_H

#include <cmath>
#include <cstddef>
#include <vector>

#include <RDGeneral/types.h>
11
12namespace RDInfoTheory {
13
// Computes the chi-square statistic of a contingency table.
//
// The table is stored row-major in dMat: element (i, j) is read from
// dMat[i * dim2 + j]. Each column corresponds to a class and each row to a
// state of the descriptor (or variable). For a 3x3 problem the table looks
// like:
//
//                1     2     3    Totals
//     1      |  N11   N12   N13     R1
//     2      |  N21   N22   N23     R2
//     3      |  N31   N32   N33     R3
//     Totals |  C1    C2    C3      N
//
// The chi-square formula used is
//     chi = sum_i( (N / Ri) * sum_j( Nij^2 / Cj ) ) - N
//
// Parameters:
//   dMat - pointer to dim1 * dim2 cell values (read only, never modified)
//   dim1 - number of rows
//   dim2 - number of columns
//
// Returns the chi-square value. Returns 0.0 when either dimension is not
// positive. Rows and columns whose total is zero are skipped, which is
// equivalent to removing them from the table (this assumes the cells are
// non-negative counts, so a zero total means every cell in it is zero).
template <class T>
double ChiSquare(T *dMat, long int dim1, long int dim2) {
  if (dim1 <= 0 || dim2 <= 0) {
    return 0.0;
  }
  const std::size_t nRows = static_cast<std::size_t>(dim1);
  const std::size_t nCols = static_cast<std::size_t>(dim2);

  // Row and column totals, gathered in a single pass over the table.
  std::vector<T> rowSums(nRows, T(0));
  std::vector<T> colSums(nCols, T(0));
  for (std::size_t i = 0; i < nRows; ++i) {
    const T *row = dMat + i * nCols;
    for (std::size_t j = 0; j < nCols; ++j) {
      rowSums[i] += row[j];
      colSums[j] += row[j];
    }
  }

  // The grand total is kept as a double so that fractional or very large
  // totals are not truncated.
  double total = 0.0;
  for (std::size_t i = 0; i < nRows; ++i) {
    total += static_cast<double>(rowSums[i]);
  }

  double chi = 0.0;
  for (std::size_t i = 0; i < nRows; ++i) {
    const double rowTotal = static_cast<double>(rowSums[i]);
    if (rowTotal == 0.0) {
      continue;  // empty row: dividing by its total would yield inf/NaN
    }
    const T *row = dMat + i * nCols;
    double rchi = 0.0;
    for (std::size_t j = 0; j < nCols; ++j) {
      const double colTotal = static_cast<double>(colSums[j]);
      if (colTotal == 0.0) {
        continue;  // empty column: would otherwise evaluate 0/0
      }
      const double nij = static_cast<double>(row[j]);
      rchi += nij * nij / colTotal;
    }
    chi += (total / rowTotal) * rchi;
  }
  return chi - total;
}
66
// Computes the informational (Shannon) entropy, in bits, of a set of values.
//
// Each entry tPtr[i] is divided by the sum of all entries to obtain a
// fraction d_i, and the result is
//     -sum_i( d_i * ln(d_i) ) / ln(2)
// Entries whose fraction is zero contribute nothing.
//
// Parameters:
//   tPtr - pointer to dim values (read only, never modified)
//   dim  - number of values
//
// Returns the entropy, or 0.0 when the values sum to zero (which includes
// dim <= 0, since no entries are visited in that case).
template <class T>
double InfoEntropy(T *tPtr, long int dim) {
  // The index type matches dim so that large arrays cannot overflow it.
  T nInstances = 0;
  for (long int i = 0; i < dim; ++i) {
    nInstances += tPtr[i];
  }

  double accum = 0.0;
  if (nInstances != 0) {
    for (long int i = 0; i < dim; ++i) {
      const double d = static_cast<double>(tPtr[i]) / nInstances;
      if (d != 0) {
        accum += -d * std::log(d);
      }
    }
  }
  // Convert from natural-log units (nats) to bits.
  return accum / std::log(2.0);
}
87
// Computes the information entropy gain of a contingency table.
//
// The table is stored row-major in dMat: element (i, j) is read from
// dMat[i * dim2 + j]. With Ri the total of row i, N the grand total and
// colTotals the vector of column totals, the value returned is
//     InfoEntropy(colTotals) - sum_i( Ri * InfoEntropy(row i) ) / N
//
// Parameters:
//   dMat - pointer to dim1 * dim2 cell values (read only, never modified)
//   dim1 - number of rows
//   dim2 - number of columns
//
// Returns the gain, or 0.0 when the table is empty or its grand total is
// zero.
template <class T>
double InfoEntropyGain(T *dMat, long int dim1, long int dim2) {
  if (dim1 <= 0 || dim2 <= 0) {
    return 0.0;
  }
  const std::size_t nRows = static_cast<std::size_t>(dim1);
  const std::size_t nCols = static_cast<std::size_t>(dim2);

  // Row totals (variableRes) and column totals (overallRes), gathered in a
  // single pass over the table.
  std::vector<T> variableRes(nRows, T(0));
  std::vector<T> overallRes(nCols, T(0));
  for (std::size_t i = 0; i < nRows; ++i) {
    const T *row = dMat + i * nCols;
    for (std::size_t j = 0; j < nCols; ++j) {
      variableRes[i] += row[j];
      overallRes[j] += row[j];
    }
  }

  // The grand total is kept as a double so that fractional or very large
  // totals are not truncated.
  double total = 0.0;
  for (std::size_t j = 0; j < nCols; ++j) {
    total += static_cast<double>(overallRes[j]);
  }
  if (total == 0.0) {
    return 0.0;
  }

  // Weighted sum of the per-row entropies.
  double term2 = 0.0;
  for (std::size_t i = 0; i < nRows; ++i) {
    term2 += variableRes[i] * InfoEntropy(dMat + i * nCols, dim2);
  }
  term2 /= total;

  return InfoEntropy(overallRes.data(), dim2) - term2;
}
138} // namespace RDInfoTheory
139#endif
Class used to rank bits based on a specified measure of information.
double InfoEntropyGain(T *dMat, long int dim1, long int dim2)
double ChiSquare(T *dMat, long int dim1, long int dim2)
double InfoEntropy(T *tPtr, long int dim)