RDKit
Open-source cheminformatics and machine learning.
Loading...
Searching...
No Matches
python_streambuf.h
Go to the documentation of this file.
1//
2// This file is part of the CCTBX distribution:
3// http://cctbx.sourceforge.net/
4// Downloaded from here:
5// http://cctbx.svn.sourceforge.net/viewvc/cctbx/trunk/boost_adaptbx/python_streambuf.h?revision=13619
6//
7// Copyright (c) 2006, The Regents of the University of
8// California, through Lawrence Berkeley National Laboratory (subject to
9// receipt of any required approvals from the U.S. Dept. of Energy). All
10// rights reserved.
11//
12// The license is here:
13// http://cctbx.svn.sourceforge.net/viewvc/cctbx/trunk/boost_adaptbx/LICENSE_2_0.txt?revision=5148
14//
15#include <RDGeneral/export.h>
16#ifndef BOOST_ADAPTBX_PYTHON_STREAMBUF_H
17#define BOOST_ADAPTBX_PYTHON_STREAMBUF_H
19#include <boost/python/object.hpp>
20#include <boost/python/str.hpp>
21#include <boost/python/extract.hpp>
22
23#include <boost/optional.hpp>
24#include <boost/utility/typed_in_place_factory.hpp>
26
27#include <RDGeneral/Invariant.h>
29
30#include <streambuf>
31
32namespace boost_adaptbx {
33namespace python {
34
35namespace bp = boost::python;
36
37/// A stream buffer getting data from and putting data into a Python file object
38/** The aims are as follows:
39
40 - Given a C++ function acting on a standard stream, e.g.
41
42 \code
43 void read_inputs(std::istream& input) {
44 ...
45 input >> something >> something_else;
46 }
47 \endcode
48
49 and given a piece of Python code which creates a file-like object,
50 to be able to pass this file object to that C++ function, e.g.
51
52 \code
53 import gzip
54 gzip_file_obj = gzip.GzipFile(...)
55 read_inputs(gzip_file_obj)
56 \endcode
57
58 and have the standard stream pull data from and put data into the Python
59 file object.
60
61 - When Python \c read_inputs() returns, the Python object is able to
62 continue reading or writing where the C++ code left off.
63
64 - Operations in C++ on mere files should be competitively fast compared
65 to the direct use of \c std::fstream.
66
67
68 \b Motivation
69
70 - the standard Python library's offering of file-like objects (files,
71 compressed files and archives, network, ...) is far superior to the
72 offering of streams in the C++ standard library and Boost C++ libraries.
73
74 - i/o code involves a fair amount of text processing which is more
75 efficiently prototyped in Python but then one may need to rewrite
76 a time-critical part in C++, in as seamless a manner as possible.
77
78 \b Usage
79
80 This is a two-step process:
81
82 - a trivial wrapper function
83
84 \code
85 using boost_adaptbx::python::streambuf;
86 void read_inputs_wrapper(streambuf& input)
87 {
88 streambuf::istream is(input);
89 read_inputs(is);
90 }
91
92 def("read_inputs", read_inputs_wrapper);
93 \endcode
94
95 which has to be written every time one wants a Python binding for
96 such a C++ function.
97
98 - the Python side
99
100 \code
101 from boost.python import streambuf
102 read_inputs(streambuf(python_file_obj=obj, buffer_size=1024))
103 \endcode
104
105 \c buffer_size is optional. See also: \c default_buffer_size
106
107 Note: references are to the C++ standard (the numbers between parentheses
108 at the end of references are margin markers).
109*/
110class streambuf : public std::basic_streambuf<char> {
111 private:
112 typedef std::basic_streambuf<char> base_t;
113
114 public:
115 /* The syntax
116 using base_t::char_type;
117 would be nicer but Visual Studio C++ 8 chokes on it
118 */
119 typedef base_t::char_type char_type;
120 typedef base_t::int_type int_type;
121 typedef base_t::pos_type pos_type;
122 typedef base_t::off_type off_type;
123 typedef base_t::traits_type traits_type;
124
125 // work around Visual C++ 7.1 problem
126 inline static int traits_type_eof() { return traits_type::eof(); }
127
128 /// The default size of the read and write buffer.
129 /** They are respectively used to buffer data read from and data written to
130 the Python file object. It can be modified from Python.
131 */
132 const static std::size_t default_buffer_size = 1024;
133
134 /// Construct from a Python file object
135 /** if buffer_size is 0 the current default_buffer_size is used.
136 */
137 streambuf(bp::object &python_file_obj, std::size_t buffer_size_ = 0)
138 : py_read(getattr(python_file_obj, "read", bp::object())),
139 py_write(getattr(python_file_obj, "write", bp::object())),
140 py_seek(getattr(python_file_obj, "seek", bp::object())),
141 py_tell(getattr(python_file_obj, "tell", bp::object())),
142 buffer_size(buffer_size_ != 0 ? buffer_size_ : default_buffer_size),
143 write_buffer(nullptr),
144 pos_of_read_buffer_end_in_py_file(0),
145 pos_of_write_buffer_end_in_py_file(buffer_size),
146 farthest_pptr(nullptr) {
147 TEST_ASSERT(buffer_size != 0);
148 /* Some Python file objects (e.g. sys.stdout and sys.stdin)
149 have non-functional seek and tell. If so, assign None to
150 py_tell and py_seek.
151 */
152 if (py_tell != bp::object()) {
153 try {
154 off_type py_pos = bp::extract<off_type>(py_tell());
155 if (py_seek != bp::object()) {
156 /* Make sure we can actually seek.
157 bzip2 readers from python have a seek method, but it fails
158 when they are in write mode.
159 */
160 py_seek(py_pos);
161 }
162 } catch (bp::error_already_set &) {
163 py_tell = bp::object();
164 py_seek = bp::object();
165 /* Boost.Python does not do any Python exception handling whatsoever
166 So we need to catch it by hand like so.
167 */
168 PyErr_Clear();
169 }
170 }
171
172 if (py_write != bp::object()) {
173 // C-like string to make debugging easier
174 write_buffer = new char[buffer_size + 1];
175 write_buffer[buffer_size] = '\0';
176 setp(write_buffer, write_buffer + buffer_size); // 27.5.2.4.5 (5)
177 farthest_pptr = pptr();
178 } else {
179 // The first attempt at output will result in a call to overflow
180 setp(nullptr, nullptr);
181 }
182
183 if (py_tell != bp::object()) {
184 off_type py_pos = bp::extract<off_type>(py_tell());
185 pos_of_read_buffer_end_in_py_file = py_pos;
186 pos_of_write_buffer_end_in_py_file = py_pos;
187 }
188 }
189
190 /// constructor to enforce a mode (binary or text)
191 streambuf(bp::object &python_file_obj, char mode,
192 std::size_t buffer_size_ = 0)
193 : streambuf(python_file_obj, buffer_size_) {
194#if 1
195 bp::object io_mod = bp::import("io");
196 CHECK_INVARIANT(io_mod, "module not found");
197 bp::object iobase = io_mod.attr("TextIOBase");
198 CHECK_INVARIANT(iobase, "base class not found");
199#else
200 // using statics to save an undetermined amount of time results in
201 // alarming seg faults on windows. so we don't do it. Keep this here
202 // for the moment though in case someone manages to figure that out in
203 // the future
204 static bp::object io_mod = bp::object();
205 static bp::object iobase = bp::object();
206 if (!io_mod) io_mod = bp::import("io");
207 if (io_mod && !iobase) iobase = io_mod.attr("TextIOBase");
208 CHECK_INVARIANT(io_mod, "module not found");
209 CHECK_INVARIANT(iobase, "base class not found");
210#endif
211
212 df_isTextMode = PyObject_IsInstance(python_file_obj.ptr(), iobase.ptr());
213 switch (mode) {
214 case 's': /// yeah, is redundant, but it is somehow natural to do "s"
215 case 't':
216 if (!df_isTextMode) {
218 "Need a text mode file object like StringIO or a file opened "
219 "with mode 't'");
220 }
221 break;
222 case 'b':
223 if (df_isTextMode) {
225 "Need a binary mode file object like BytesIO or a file opened "
226 "with mode 'b'");
227 }
228 break;
229 default:
230 throw std::invalid_argument("bad mode character");
231 }
232 }
233
234 /// Mundane destructor freeing the allocated resources
235 ~streambuf() override {
236 if (write_buffer) {
237 delete[] write_buffer;
238 }
239 }
240
241 /// C.f. C++ standard section 27.5.2.4.3
242 /** It is essential to override this virtual function for the stream
243 member function readsome to work correctly (c.f. 27.6.1.3, alinea 30)
244 */
245 std::streamsize showmanyc() override {
246 int_type const failure = traits_type::eof();
247 int_type status = underflow();
248 if (status == failure) {
249 return -1;
250 }
251 return egptr() - gptr();
252 }
253
254 /// C.f. C++ standard section 27.5.2.4.3
255 int_type underflow() override {
256 int_type const failure = traits_type::eof();
257 if (py_read == bp::object()) {
258 throw std::invalid_argument(
259 "That Python file object has no 'read' attribute");
260 }
261 read_buffer = py_read(buffer_size);
262 char *read_buffer_data;
263 bp::ssize_t py_n_read;
264 if (PyBytes_AsStringAndSize(read_buffer.ptr(), &read_buffer_data,
265 &py_n_read) == -1) {
266 setg(nullptr, nullptr, nullptr);
267 throw std::invalid_argument(
268 "The method 'read' of the Python file object "
269 "did not return a string.");
270 }
271 off_type n_read = (off_type)py_n_read;
272 pos_of_read_buffer_end_in_py_file += n_read;
273 setg(read_buffer_data, read_buffer_data, read_buffer_data + n_read);
274 // ^^^27.5.2.3.1 (4)
275 if (n_read == 0) {
276 return failure;
277 }
278 return traits_type::to_int_type(read_buffer_data[0]);
279 }
280
281 /// C.f. C++ standard section 27.5.2.4.5
283 if (py_write == bp::object()) {
284 throw std::invalid_argument(
285 "That Python file object has no 'write' attribute");
286 }
287 farthest_pptr = std::max(farthest_pptr, pptr());
288 off_type n_written = (off_type)(farthest_pptr - pbase());
289 off_type orig_n_written = n_written;
290 const unsigned int STD_ASCII = 0x7F;
291 if (df_isTextMode && static_cast<unsigned int>(c) > STD_ASCII) {
292 // we're somewhere in the middle of a utf8 block. If we
293 // only write part of it we'll end up with an exception,
294 // so push everything that could be utf8 into the next block
295 while (n_written > 0 && static_cast<unsigned int>(
296 write_buffer[n_written - 1]) > STD_ASCII) {
297 --n_written;
298 }
299 }
300 bp::str chunk(pbase(), pbase() + n_written);
301 py_write(chunk);
302
303 if ((!df_isTextMode || static_cast<unsigned int>(c) <= STD_ASCII) &&
304 !traits_type::eq_int_type(c, traits_type::eof())) {
305 py_write(traits_type::to_char_type(c));
306 n_written++;
307 }
308
309 setp(pbase(), epptr());
310 // ^^^ 27.5.2.4.5 (5)
311 farthest_pptr = pptr();
312 if (n_written) {
313 pos_of_write_buffer_end_in_py_file += n_written;
314 if (df_isTextMode && static_cast<unsigned int>(c) > STD_ASCII &&
315 !traits_type::eq_int_type(c, traits_type::eof())) {
316 size_t n_to_copy = orig_n_written - n_written;
317
318 for (size_t i = 0; i < n_to_copy; ++i) {
319 sputc(write_buffer[n_written + i]);
320 ++farthest_pptr;
321 }
322 sputc(c);
323 ++farthest_pptr;
324 }
325 }
326 return traits_type::eq_int_type(c, traits_type::eof())
327 ? traits_type::not_eof(c)
328 : c;
329 }
330
331 /// Update the python file to reflect the state of this stream buffer
332 /** Empty the write buffer into the Python file object and set the seek
333 position of the latter accordingly (C++ standard section 27.5.2.4.2).
334 If there is no write buffer or it is empty, but there is a non-empty
335 read buffer, set the Python file object seek position to the
336 seek position in that read buffer.
337 */
338 int sync() override {
339 int result = 0;
340 farthest_pptr = std::max(farthest_pptr, pptr());
341 if (farthest_pptr && farthest_pptr > pbase()) {
342 off_type delta = pptr() - farthest_pptr;
343 int_type status = overflow();
344 if (traits_type::eq_int_type(status, traits_type::eof())) {
345 result = -1;
346 }
347 if (py_seek != bp::object()) {
348 py_seek(delta, 1);
349 }
350 } else if (gptr() && gptr() < egptr()) {
351 if (py_seek != bp::object()) {
352 py_seek(gptr() - egptr(), 1);
353 }
354 }
355 return result;
356 }
357
358 /// C.f. C++ standard section 27.5.2.4.2
359 /** This implementation is optimised to look whether the position is within
360 the buffers, so as to avoid calling Python seek or tell. It is
361 important for many applications that the overhead of calling into Python
362 is avoided as much as possible (e.g. parsers which may do a lot of
363 backtracking)
364 */
365 pos_type seekoff(off_type off, std::ios_base::seekdir way,
366 std::ios_base::openmode which =
367 std::ios_base::in | std::ios_base::out) override {
368 /* In practice, "which" is either std::ios_base::in or out
369 since we end up here because either seekp or seekg was called
370 on the stream using this buffer. That simplifies the code
371 in a few places.
372 */
373 int const failure = off_type(-1);
374
375 if (py_seek == bp::object()) {
376 throw std::invalid_argument(
377 "That Python file object has no 'seek' attribute");
378 }
379
380 // we need the read buffer to contain something!
381 if (which == std::ios_base::in && !gptr()) {
382 if (traits_type::eq_int_type(underflow(), traits_type::eof())) {
383 return failure;
384 }
385 }
386
387 // compute the whence parameter for Python seek
388 int whence;
389 switch (way) {
390 case std::ios_base::beg:
391 whence = 0;
392 break;
393 case std::ios_base::cur:
394 whence = 1;
395 break;
396 case std::ios_base::end:
397 whence = 2;
398 break;
399 default:
400 return failure;
401 }
402
403 // Let's have a go
404 boost::optional<off_type> result =
405 seekoff_without_calling_python(off, way, which);
406 if (!result) {
407 // we need to call Python
408 if (which == std::ios_base::out) {
409 overflow();
410 }
411 if (way == std::ios_base::cur) {
412 if (which == std::ios_base::in) {
413 off -= egptr() - gptr();
414 } else if (which == std::ios_base::out) {
415 off += pptr() - pbase();
416 }
417 }
418 py_seek(off, whence);
419 result = off_type(bp::extract<off_type>(py_tell()));
420 if (which == std::ios_base::in) {
421 underflow();
422 }
423 }
424 return *result;
425 }
426
427 /// C.f. C++ standard section 27.5.2.4.2
429 std::ios_base::openmode which =
430 std::ios_base::in | std::ios_base::out) override {
431 return streambuf::seekoff(sp, std::ios_base::beg, which);
432 }
433
434 private:
435 bp::object py_read, py_write, py_seek, py_tell;
436
437 std::size_t buffer_size;
438
439 /* This is actually a Python string and the actual read buffer is
440 its internal data, i.e. an array of characters. We use a Boost.Python
441 object so as to hold on it: as a result, the actual buffer can't
442 go away.
443 */
444 bp::object read_buffer;
445
446 /* A mere array of char's allocated on the heap at construction time and
447 de-allocated only at destruction time.
448 */
449 char *write_buffer;
450 bool df_isTextMode;
451
452 off_type pos_of_read_buffer_end_in_py_file,
453 pos_of_write_buffer_end_in_py_file;
454
455 // the farthest place the buffer has been written into
456 char *farthest_pptr;
457
458 boost::optional<off_type> seekoff_without_calling_python(
459 off_type off, std::ios_base::seekdir way, std::ios_base::openmode which) {
460 boost::optional<off_type> const failure = off_type(-1);
461
462 // Buffer range and current position
463 off_type buf_begin, buf_end, buf_cur, upper_bound;
464 off_type pos_of_buffer_end_in_py_file;
465 if (which == std::ios_base::in) {
466 pos_of_buffer_end_in_py_file = pos_of_read_buffer_end_in_py_file;
467 buf_begin = reinterpret_cast<std::streamsize>(eback());
468 buf_cur = reinterpret_cast<std::streamsize>(gptr());
469 buf_end = reinterpret_cast<std::streamsize>(egptr());
470 upper_bound = buf_end;
471 } else if (which == std::ios_base::out) {
472 pos_of_buffer_end_in_py_file = pos_of_write_buffer_end_in_py_file;
473 buf_begin = reinterpret_cast<std::streamsize>(pbase());
474 buf_cur = reinterpret_cast<std::streamsize>(pptr());
475 buf_end = reinterpret_cast<std::streamsize>(epptr());
476 farthest_pptr = std::max(farthest_pptr, pptr());
477 upper_bound = reinterpret_cast<std::streamsize>(farthest_pptr) + 1;
478 } else {
479 CHECK_INVARIANT(0, "unreachable code");
480 }
481
482 // Sought position in "buffer coordinate"
483 off_type buf_sought;
484 if (way == std::ios_base::cur) {
485 buf_sought = buf_cur + off;
486 } else if (way == std::ios_base::beg) {
487 buf_sought = buf_end + (off - pos_of_buffer_end_in_py_file);
488 } else if (way == std::ios_base::end) {
489 return failure;
490 } else {
491 CHECK_INVARIANT(0, "unreachable code");
492 }
493
494 // if the sought position is not in the buffer, give up
495 if (buf_sought < buf_begin || buf_sought >= upper_bound) {
496 return failure;
497 }
498
499 // we are in wonderland
500 if (which == std::ios_base::in) {
501 gbump(buf_sought - buf_cur);
502 } else if (which == std::ios_base::out) {
503 pbump(buf_sought - buf_cur);
504 }
505 return pos_of_buffer_end_in_py_file + (buf_sought - buf_end);
506 }
507
508 public:
509 class istream : public std::istream {
510 public:
511 istream(streambuf &buf) : std::istream(&buf) {
512 exceptions(std::ios_base::badbit);
513 }
514
515 ~istream() override {
516 // do nothing.
517 // This used to do:
518 // if (this->good()) this->sync();
519 // but that caused problems if the underlying file had been closed
520 // (see github #579) and really doesn't seem necessary for what we're
521 // doing.
522 }
523 };
524
525 class ostream : public std::ostream {
526 public:
527 ostream(streambuf &buf) : std::ostream(&buf) {
528 exceptions(std::ios_base::badbit);
529 }
530
531 // overload that takes ownership of the streambuf ptr
532 ostream(streambuf *buf) : std::ostream(buf), m_buf(buf) {
533 exceptions(std::ios_base::badbit);
534 }
535
536 ~ostream() override {
537 if (this->good()) {
538 this->flush();
539 }
540 delete m_buf;
541 }
542
543 private:
544 streambuf *m_buf = nullptr;
545 };
546};
547
548// std::size_t streambuf::default_buffer_size = 1024;
549
552
553 streambuf_capsule(bp::object &python_file_obj, std::size_t buffer_size = 0)
554 : python_streambuf(python_file_obj, buffer_size) {}
555};
556
558 ostream(bp::object &python_file_obj, std::size_t buffer_size = 0)
559 : streambuf_capsule(python_file_obj, buffer_size),
561
562 ~ostream() noexcept override {
563 if (this->good()) {
564 this->flush();
565 }
566 }
567};
568} // namespace python
569} // namespace boost_adaptbx
570
571#endif // GUARD
#define TEST_ASSERT(expr)
Definition Invariant.h:151
#define CHECK_INVARIANT(expr, mess)
Definition Invariant.h:100
Class to allow us to throw a ValueError from C++ and have it make it back to Python.
Definition Exceptions.h:40
A stream buffer getting data from and putting data into a Python file object.
~streambuf() override
Mundane destructor freeing the allocated resources.
static const std::size_t default_buffer_size
The default size of the read and write buffer.
pos_type seekpos(pos_type sp, std::ios_base::openmode which=std::ios_base::in|std::ios_base::out) override
C.f. C++ standard section 27.5.2.4.2.
std::streamsize showmanyc() override
C.f. C++ standard section 27.5.2.4.3.
pos_type seekoff(off_type off, std::ios_base::seekdir way, std::ios_base::openmode which=std::ios_base::in|std::ios_base::out) override
C.f. C++ standard section 27.5.2.4.2.
streambuf(bp::object &python_file_obj, char mode, std::size_t buffer_size_=0)
constructor to enforce a mode (binary or text)
int sync() override
Update the python file to reflect the state of this stream buffer.
int_type overflow(int_type c=traits_type_eof()) override
C.f. C++ standard section 27.5.2.4.5.
int_type underflow() override
C.f. C++ standard section 27.5.2.4.3.
streambuf(bp::object &python_file_obj, std::size_t buffer_size_=0)
Construct from a Python file object.
ostream(bp::object &python_file_obj, std::size_t buffer_size=0)
streambuf_capsule(bp::object &python_file_obj, std::size_t buffer_size=0)