RDKit
Open-source cheminformatics and machine learning.
Loading...
Searching...
No Matches
LeaderPicker.h
Go to the documentation of this file.
1//
2// Copyright (C) 2003-2007 Greg Landrum and Rational Discovery LLC
3// Copyright (C) 2017-2019 Greg Landrum and NextMove Software
4//
5// @@ All Rights Reserved @@
6// This file is part of the RDKit.
7// The contents are covered by the terms of the BSD license
8// which is included in the file license.txt, found at the root
9// of the RDKit source tree.
10//
11#ifndef RD_LEADERPICKER_H
12#define RD_LEADERPICKER_H
13
14#include <RDGeneral/types.h>
15#include <RDGeneral/utils.h>
16#include <RDGeneral/Invariant.h>
17#include <RDGeneral/RDLog.h>
19#include <RDGeneral/RDThreads.h>
20#include <cstdlib>
21#include "DistPicker.h"
22
23namespace RDPickers {
24
25/*! \brief Implements the Leader algorithm for picking a subset of item from a
26 *pool
27 *
28 * This class inherits from the DistPicker and implements a specific picking
29 *strategy aimed at diversity. See documentation for "pick()" member function
30 *for the algorithm details
31 */
32class LeaderPicker : public DistPicker {
33 public:
34 double default_threshold{0.0};
36
37 /*! \brief Default Constructor
38 *
39 */
41 LeaderPicker(double threshold)
42 : default_threshold(threshold), default_nthreads(1) {}
43 LeaderPicker(double threshold, int nthreads)
44 : default_threshold(threshold), default_nthreads(nthreads) {}
45
46 /*! \brief Contains the implementation for a lazy Leader diversity picker
47 *
48 * See the documentation for the pick() method for details about the algorithm
49 *
50 * \param func - a function (or functor) taking two unsigned ints as
51 *arguments and returning the distance (as a double) between those two
52 *elements.
53 *
54 * \param poolSize - the size of the pool to pick the items from. It is
55 *assumed that the distance matrix above contains the right number of
56 *elements; i.e. poolSize*(poolSize-1)
57 *
58 * \param pickSize - the number items to pick from pool (<= poolSize)
59 *
60 * \param firstPicks - (optional)the first items in the pick list
61 *
62 * \param seed - (optional) seed for the random number generator. If this is
63 *<0 the generator will be seeded with a random number.
64 */
65 template <typename T>
66 RDKit::INT_VECT lazyPick(T &func, unsigned int poolSize,
67 unsigned int pickSize) const;
68
69 template <typename T>
70 RDKit::INT_VECT lazyPick(T &func, unsigned int poolSize,
71 unsigned int pickSize, double threshold) const;
72
73 template <typename T>
74 RDKit::INT_VECT lazyPick(T &func, unsigned int poolSize,
75 unsigned int pickSize,
76 const RDKit::INT_VECT &firstPicks,
77 double threshold) const;
78
79 template <typename T>
80 RDKit::INT_VECT lazyPick(T &func, unsigned int poolSize,
81 unsigned int pickSize,
82 const RDKit::INT_VECT &firstPicks, double threshold,
83 int nthreads) const;
84
85 /*! \brief Contains the implementation for the Leader diversity picker
86 *
87 * \param distMat - distance matrix - a vector of double. It is assumed that
88 *only the lower triangle element of the matrix are supplied in a 1D array\n
89 *
90 * \param poolSize - the size of the pool to pick the items from. It is
91 *assumed that the distance matrix above contains the right number of
92 *elements; i.e. poolSize*(poolSize-1) \n
93 *
94 * \param pickSize - maximum number items to pick from pool (<= poolSize)
95 *
96 * \param firstPicks - indices of the items used to seed the pick set.
97 */
98 RDKit::INT_VECT pick(const double *distMat, unsigned int poolSize,
99 unsigned int pickSize, const RDKit::INT_VECT &firstPicks,
100 double threshold, int nthreads) const {
101 CHECK_INVARIANT(distMat, "Invalid Distance Matrix");
102 if (!poolSize) {
103 throw ValueErrorException("empty pool to pick from");
104 }
105 if (poolSize < pickSize) {
106 throw ValueErrorException("pickSize cannot be larger than the poolSize");
107 }
108 distmatFunctor functor(distMat);
109 return this->lazyPick(functor, poolSize, pickSize, firstPicks, threshold,
110 nthreads);
111 }
112
113 /*! \overload */
114 RDKit::INT_VECT pick(const double *distMat, unsigned int poolSize,
115 unsigned int pickSize) const override {
117 return pick(distMat, poolSize, pickSize, iv, default_threshold,
119 }
120};
121
122#if defined(RDK_BUILD_THREADSAFE_SSS)
123#if defined(unix) || defined(__unix__) || defined(__unix)
124#define USE_THREADED_LEADERPICKER
125#endif
126#endif
127
128#ifdef USE_THREADED_LEADERPICKER
129// Note that this block of code currently only works on linux (which is why it's
130// disabled by default elsewhere). In order to work on other platforms we need
131// cross-platform threading primitives which support a barrier; or a rewrite.
132// Given that we will get the cross-platform threading for free with C++20, I
133// think it makes sense to just wait
134template <typename T>
135void *LeaderPickerWork(void *arg);
136
137template <typename T>
138struct LeaderPickerState {
139 typedef struct {
140 int *ptr;
141 unsigned int capacity;
142 unsigned int len;
143 unsigned int next[2];
144 } LeaderPickerBlock;
145 typedef struct {
146 LeaderPickerState<T> *stat;
147 pthread_t tid;
148 unsigned int id;
149 } LeaderPickerThread;
150
151 std::vector<LeaderPickerThread> threads;
152 std::vector<LeaderPickerBlock> blocks;
153 pthread_barrier_t wait;
154 pthread_barrier_t done;
155 std::vector<int> v;
156 LeaderPickerBlock *head_block;
157 unsigned int thread_op;
158 unsigned int nthreads;
159 unsigned int tick;
160 double threshold;
161 int query;
162 T *func;
163
164 LeaderPickerState(unsigned int count, int nt) {
165 v.resize(count);
166 for (unsigned int i = 0; i < count; i++) {
167 v[i] = i;
168 }
169
170 // InitializeBlocks
171 unsigned int bcount;
172 unsigned int bsize;
173 if (nt > 1) {
174 bsize = 4096;
175 bcount = (count + (bsize - 1)) / bsize;
176 unsigned int tasks = (bcount + 1) / 2;
177 // limit number of threads to available work
178 if (nt > (int)tasks) {
179 nt = tasks;
180 }
181 } else {
182 bsize = 32768;
183 bcount = (count + (bsize - 1)) / bsize;
184 }
185 blocks.resize(bcount);
186 head_block = &blocks[0];
187 tick = 0;
188
189 if (bcount > 1) {
190 int *ptr = &v[0];
191 unsigned int len = count;
192 for (unsigned int i = 0; i < bcount; i++) {
193 LeaderPickerBlock *block = &blocks[i];
194 block->ptr = ptr;
195 if (len > bsize) {
196 block->capacity = bsize;
197 block->len = bsize;
198 block->next[0] = i + 1;
199 } else {
200 block->capacity = len;
201 block->len = len;
202 block->next[0] = 0;
203 break;
204 }
205 ptr += bsize;
206 len -= bsize;
207 }
208 } else {
209 head_block->capacity = count;
210 head_block->len = count;
211 head_block->next[0] = 0;
212 head_block->next[1] = 0;
213 head_block->ptr = &v[0];
214 }
215
216 // InitializeThreads
217 if (nt > 1) {
218 nthreads = nt;
219 pthread_barrier_init(&wait, NULL, nthreads + 1);
220 pthread_barrier_init(&done, NULL, nthreads + 1);
221
222 threads.resize(nt);
223 for (unsigned int i = 0; i < nthreads; i++) {
224 threads[i].id = i;
225 threads[i].stat = this;
226 pthread_create(&threads[i].tid, NULL, LeaderPickerWork<T>,
227 (void *)&threads[i]);
228 }
229 } else {
230 nthreads = 1;
231 }
232 }
233
234 ~LeaderPickerState() {
235 if (nthreads > 1) {
236 thread_op = 1;
237 pthread_barrier_wait(&wait);
238 for (unsigned int i = 0; i < nthreads; i++) {
239 pthread_join(threads[i].tid, 0);
240 }
241 pthread_barrier_destroy(&wait);
242 pthread_barrier_destroy(&done);
243 }
244 }
245
246 bool empty() {
247 while (head_block) {
248 if (head_block->len) {
249 return false;
250 }
251 unsigned int next_tick = head_block->next[tick];
252 if (!next_tick) {
253 return true;
254 }
255 head_block = &blocks[next_tick];
256 }
257 return true;
258 }
259
260 unsigned int compact(int *dst, int *src, unsigned int len) {
261 unsigned int count = 0;
262 for (unsigned int i = 0; i < len; i++) {
263 if ((*func)(query, src[i]) > threshold) {
264 dst[count++] = src[i];
265 }
266 }
267 return count;
268 }
269
270 void compact_job(unsigned int cycle) {
271 // On entry, next[tick] for each block is the current linked list.
272 // On exit, next[tock] is the linked list for the next iteration.
273 unsigned int tock = tick ^ 1;
274
275 LeaderPickerBlock *list = head_block;
276 for (;;) {
277 unsigned int next_tick = list->next[tick];
278 if (next_tick) {
279 LeaderPickerBlock *next = &blocks[next_tick];
280 unsigned int next_next_tick = next->next[tick];
281 if (cycle == 0) {
282 list->len = compact(list->ptr, list->ptr, list->len);
283 if (list->len + next->len <= list->capacity) {
284 list->len += compact(list->ptr + list->len, next->ptr, next->len);
285 list->next[tock] = next_next_tick;
286 } else {
287 next->len = compact(next->ptr, next->ptr, next->len);
288 if (next->len) {
289 list->next[tock] = next_tick;
290 next->next[tock] = next_next_tick;
291 } else {
292 list->next[tock] = next_next_tick;
293 }
294 }
295 cycle = nthreads - 1;
296 } else {
297 cycle--;
298 }
299 if (next_next_tick) {
300 list = &blocks[next_next_tick];
301 } else {
302 break;
303 }
304 } else {
305 if (cycle == 0) {
306 list->len = compact(list->ptr, list->ptr, list->len);
307 list->next[tock] = 0;
308 }
309 break;
310 }
311 }
312 }
313
314 void compact(int pick) {
315 query = pick;
316 if (nthreads > 1) {
317 thread_op = 0;
318 pthread_barrier_wait(&wait);
319 pthread_barrier_wait(&done);
320 } else {
321 compact_job(0);
322 }
323 tick ^= 1;
324 }
325
326 int compact_next() {
327 compact(head_block->ptr[0]);
328 return query;
329 }
330};
331
332// This is the loop the worker threads run
333template <typename T>
334void *LeaderPickerWork(void *arg) {
335 typename LeaderPickerState<T>::LeaderPickerThread *thread;
336 thread = (typename LeaderPickerState<T>::LeaderPickerThread *)arg;
337 LeaderPickerState<T> *stat = thread->stat;
338
339 for (;;) {
340 pthread_barrier_wait(&stat->wait);
341 if (stat->thread_op) {
342 return (void *)0;
343 }
344 stat->compact_job(thread->id);
345 pthread_barrier_wait(&stat->done);
346 }
347}
348#else
349
350template <typename T>
352 std::vector<int> v;
353 unsigned int left;
354 double threshold;
355 int query;
357
358 LeaderPickerState(unsigned int count, int)
359 : left(count), threshold(0.0), query(0), func(nullptr) {
360 v.resize(count);
361 for (unsigned int i = 0; i < count; i++) {
362 v[i] = i;
363 }
364 }
365
366 bool empty() { return left == 0; }
367
368 unsigned int compact(int *dst, int *src, unsigned int len) {
369 unsigned int count = 0;
370 for (unsigned int i = 0; i < len; i++) {
371 double ld = (*func)(query, src[i]);
372 // std::cerr << query << "-" << src[i] << " " << ld << std::endl;
373 if (ld > threshold) {
374 dst[count++] = src[i];
375 }
376 }
377 return count;
378 }
379
380 void compact(int pick) {
381 query = pick;
382 left = compact(&v[0], &v[0], left);
383 }
384
386 query = v[0];
387 left = compact(&v[0], &v[1], left - 1);
388 return query;
389 }
390};
391
392#endif
393// we implement this here in order to allow arbitrary functors without link
394// errors
395template <typename T>
396RDKit::INT_VECT LeaderPicker::lazyPick(T &func, unsigned int poolSize,
397 unsigned int pickSize,
398 const RDKit::INT_VECT &firstPicks,
399 double threshold, int nthreads) const {
400 if (!poolSize) {
401 throw ValueErrorException("empty pool to pick from");
402 }
403
404 if (poolSize < pickSize) {
405 throw ValueErrorException("pickSize cannot be larger than the poolSize");
406 }
407
408 if (!pickSize) {
409 pickSize = poolSize;
410 }
411 RDKit::INT_VECT picks;
412
413 nthreads = RDKit::getNumThreadsToUse(nthreads);
414
415 LeaderPickerState<T> stat(poolSize, nthreads);
416 stat.threshold = threshold;
417 stat.func = &func;
418
419 unsigned int picked = 0; // picks.size()
420 unsigned int pick = 0;
421
422 if (!firstPicks.empty()) {
423 for (RDKit::INT_VECT::const_iterator pIdx = firstPicks.begin();
424 pIdx != firstPicks.end(); ++pIdx) {
425 pick = static_cast<unsigned int>(*pIdx);
426 if (pick >= poolSize) {
427 throw ValueErrorException("pick index was larger than the poolSize");
428 }
429 picks.push_back(pick);
430 stat.compact(pick);
431 picked++;
432 }
433 }
434
435 while (picked < pickSize && !stat.empty()) {
436 pick = stat.compact_next();
437 picks.push_back(pick);
438 picked++;
439 }
440 return picks;
441}
442
443template <typename T>
444RDKit::INT_VECT LeaderPicker::lazyPick(T &func, unsigned int poolSize,
445 unsigned int pickSize) const {
446 RDKit::INT_VECT firstPicks;
447 return LeaderPicker::lazyPick(func, poolSize, pickSize, firstPicks,
449}
450
451template <typename T>
452RDKit::INT_VECT LeaderPicker::lazyPick(T &func, unsigned int poolSize,
453 unsigned int pickSize,
454 double threshold) const {
455 RDKit::INT_VECT firstPicks;
456 return LeaderPicker::lazyPick(func, poolSize, pickSize, firstPicks, threshold,
458}
459template <typename T>
460RDKit::INT_VECT LeaderPicker::lazyPick(T &func, unsigned int poolSize,
461 unsigned int pickSize,
462 const RDKit::INT_VECT &firstPicks,
463 double threshold) const {
464 return LeaderPicker::lazyPick(func, poolSize, pickSize, firstPicks, threshold,
466}
467
468}; // namespace RDPickers
469
470#endif
#define CHECK_INVARIANT(expr, mess)
Definition Invariant.h:101
Abstract base class to do perform item picking (typically molecules) using a distance matrix.
Definition DistPicker.h:46
Implements the Leader algorithm for picking a subset of item from a pool.
RDKit::INT_VECT lazyPick(T &func, unsigned int poolSize, unsigned int pickSize) const
Contains the implementation for a lazy Leader diversity picker.
RDKit::INT_VECT pick(const double *distMat, unsigned int poolSize, unsigned int pickSize, const RDKit::INT_VECT &firstPicks, double threshold, int nthreads) const
Contains the implementation for the Leader diversity picker.
LeaderPicker()
Default Constructor.
LeaderPicker(double threshold)
LeaderPicker(double threshold, int nthreads)
RDKit::INT_VECT pick(const double *distMat, unsigned int poolSize, unsigned int pickSize) const override
Class to allow us to throw a ValueError from C++ and have it make it back to Python.
Definition Exceptions.h:40
RDKIT_RGROUPDECOMPOSITION_EXPORT const std::string done
std::vector< int > INT_VECT
Definition types.h:284
unsigned int getNumThreadsToUse(int target)
Definition RDThreads.h:37
unsigned int compact(int *dst, int *src, unsigned int len)
LeaderPickerState(unsigned int count, int)