python_streambuf.h

Go to the documentation of this file.
00001 //
00002 //  This file is part of the CCTBX distribution:
00003 //     http://cctbx.sourceforge.net/
00004 //  Downloaded from here: 
00005 //    http://cctbx.svn.sourceforge.net/viewvc/cctbx/trunk/boost_adaptbx/python_streambuf.h?revision=13619
00006 //
00007 //  Copyright (c) 2006, The Regents of the University of
00008 //  California, through Lawrence Berkeley National Laboratory (subject to
00009 //  receipt of any required approvals from the U.S. Dept. of Energy).  All
00010 //  rights reserved.
00011 //
00012 //  The license is here:
00013 //    http://cctbx.svn.sourceforge.net/viewvc/cctbx/trunk/boost_adaptbx/LICENSE_2_0.txt?revision=5148
00014 //
00015 #ifndef BOOST_ADAPTBX_PYTHON_STREAMBUF_H
00016 #define BOOST_ADAPTBX_PYTHON_STREAMBUF_H
00017 
#include <boost/python/object.hpp>
#include <boost/python/str.hpp>
#include <boost/python/extract.hpp>

#include <boost/optional.hpp>
#include <boost/utility/typed_in_place_factory.hpp>

//#include <tbxx/error_utils.hpp>
#include <RDGeneral/Invariant.h>

#include <algorithm>
#include <iostream>
#include <stdexcept>
#include <streambuf>
00030 
00031 namespace boost_adaptbx { namespace python {
00032 
00033 namespace bp = boost::python;
00034 
00035 /// A stream buffer getting data from and putting data into a Python file object
00036 /** The aims are as follows:
00037 
00038     - Given a C++ function acting on a standard stream, e.g.
00039 
00040       \code
00041       void read_inputs(std::istream& input) {
00042         ...
00043         input >> something >> something_else;
00044       }
00045       \endcode
00046 
00047       and given a piece of Python code which creates a file-like object,
00048       to be able to pass this file object to that C++ function, e.g.
00049 
00050       \code
00051       import gzip
00052       gzip_file_obj = gzip.GzipFile(...)
00053       read_inputs(gzip_file_obj)
00054       \endcode
00055 
00056       and have the standard stream pull data from and put data into the Python
00057       file object.
00058 
00059     - When Python \c read_inputs() returns, the Python object is able to
00060       continue reading or writing where the C++ code left off.
00061 
00062     - Operations in C++ on mere files should be competitively fast compared
00063       to the direct use of \c std::fstream.
00064 
00065 
00066     \b Motivation
00067 
00068       - the standard Python library offer of file-like objects (files,
00069         compressed files and archives, network, ...) is far superior to the
00070         offer of streams in the C++ standard library and Boost C++ libraries.
00071 
00072       - i/o code involves a fair amount of text processing which is more
00073         efficiently prototyped in Python but then one may need to rewrite
00074         a time-critical part in C++, in as seamless a manner as possible.
00075 
00076     \b Usage
00077 
00078     This is 2-step:
00079 
00080       - a trivial wrapper function
00081 
00082         \code
00083           using boost_adaptbx::python::streambuf;
00084           void read_inputs_wrapper(streambuf& input)
00085           {
00086             streambuf::istream is(input);
00087             read_inputs(is);
00088           }
00089 
00090           def("read_inputs", read_inputs_wrapper);
00091         \endcode
00092 
00093         which has to be written every time one wants a Python binding for
00094         such a C++ function.
00095 
00096       - the Python side
00097 
00098         \code
00099           from boost.python import streambuf
00100           read_inputs(streambuf(python_file_obj=obj, buffer_size=1024))
00101         \endcode
00102 
00103         \c buffer_size is optional. See also: \c default_buffer_size
00104 
00105   Note: references are to the C++ standard (the numbers between parentheses
00106   at the end of references are margin markers).
00107 */
00108 class streambuf : public std::basic_streambuf<char>
00109 {
00110   private:
00111     typedef std::basic_streambuf<char> base_t;
00112 
00113   public:
00114     /* The syntax
00115         using base_t::char_type;
00116        would be nicer but Visual Studio C++ 8 chokes on it
00117     */
00118     typedef base_t::char_type   char_type;
00119     typedef base_t::int_type    int_type;
00120     typedef base_t::pos_type    pos_type;
00121     typedef base_t::off_type    off_type;
00122     typedef base_t::traits_type traits_type;
00123 
00124     // work around Visual C++ 7.1 problem
00125     inline static int
00126     traits_type_eof() { return traits_type::eof(); }
00127 
00128     /// The default size of the read and write buffer.
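00129     /** The read buffer and the write buffer, which respectively hold data
00130         read from and data written to the Python file object, both have this
00131         size by default. It is a compile-time constant; pass a non-zero
00131         buffer_size to the constructor to use another size for one instance.
00131     */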
00132     const static std::size_t default_buffer_size=1024;
00133 
00134     /// Construct from a Python file object
00135     /** if buffer_size is 0 the current default_buffer_size is used.
00136     */
00137     streambuf(
00138       bp::object& python_file_obj,
00139       std::size_t buffer_size_=0)
00140     :
00141       py_read (getattr(python_file_obj, "read",  bp::object())),
00142       py_write(getattr(python_file_obj, "write", bp::object())),
00143       py_seek (getattr(python_file_obj, "seek",  bp::object())),
00144       py_tell (getattr(python_file_obj, "tell",  bp::object())),
00145       buffer_size(buffer_size_ != 0 ? buffer_size_ : default_buffer_size),
00146       write_buffer(0),
00147       pos_of_read_buffer_end_in_py_file(0),
00148       pos_of_write_buffer_end_in_py_file(buffer_size),
00149       farthest_pptr(0)
00150     {
00151       TEST_ASSERT(buffer_size != 0);
00152       /* Some Python file objects (e.g. sys.stdout and sys.stdin)
00153          have non-functional seek and tell. If so, assign None to
00154          py_tell and py_seek.
00155        */
00156       if (py_tell != bp::object()) {
00157         try {
00158           py_tell();
00159         }
00160         catch (bp::error_already_set&) {
00161           py_tell = bp::object();
00162           py_seek = bp::object();
00163           /* Boost.Python does not do any Python exception handling whatsoever
00164              So we need to catch it by hand like so.
00165            */
00166           PyErr_Clear();
00167         }
00168       }
00169 
00170       if (py_write != bp::object()) {
00171         // C-like string to make debugging easier
00172         write_buffer = new char[buffer_size + 1];
00173         write_buffer[buffer_size] = '\0';
00174         setp(write_buffer, write_buffer + buffer_size);  // 27.5.2.4.5 (5)
00175         farthest_pptr = pptr();
00176       }
00177       else {
00178         // The first attempt at output will result in a call to overflow
00179         setp(0, 0);
00180       }
00181 
00182       if (py_tell != bp::object()) {
00183         off_type py_pos = bp::extract<off_type>(py_tell());
00184         pos_of_read_buffer_end_in_py_file = py_pos;
00185         pos_of_write_buffer_end_in_py_file = py_pos;
00186       }
00187     }
00188 
00189     /// Mundane destructor freeing the allocated resources
00190     virtual ~streambuf() {
00191       if (write_buffer) delete[] write_buffer;
00192     }
00193 
00194     /// C.f. C++ standard section 27.5.2.4.3
00195     /** It is essential to override this virtual function for the stream
00196         member function readsome to work correctly (c.f. 27.6.1.3, alinea 30)
00197      */
00198     virtual std::streamsize showmanyc() {
00199       int_type const failure = traits_type::eof();
00200       int_type status = underflow();
00201       if (status == failure) return -1;
00202       return egptr() - gptr();
00203     }
00204 
00205     /// C.f. C++ standard section 27.5.2.4.3
00206     virtual int_type underflow() {
00207       int_type const failure = traits_type::eof();
00208       if (py_read == bp::object()) {
00209         throw std::invalid_argument(
00210           "That Python file object has no 'read' attribute");
00211       }
00212       read_buffer = py_read(buffer_size);
00213       char *read_buffer_data;
00214       bp::ssize_t py_n_read;
00215       if (PyString_AsStringAndSize(read_buffer.ptr(),
00216                                    &read_buffer_data, &py_n_read) == -1) {
00217         setg(0, 0, 0);
00218         throw std::invalid_argument(
00219           "The method 'read' of the Python file object "
00220           "did not return a string.");
00221       }
00222       off_type n_read = (off_type)py_n_read;
00223       pos_of_read_buffer_end_in_py_file += n_read;
00224       setg(read_buffer_data, read_buffer_data, read_buffer_data + n_read);
00225       // ^^^27.5.2.3.1 (4)
00226       if (n_read == 0) return failure;
00227       return traits_type::to_int_type(read_buffer_data[0]);
00228     }
00229 
00230     /// C.f. C++ standard section 27.5.2.4.5
00231     virtual int_type overflow(int_type c=traits_type_eof()) {
00232       if (py_write == bp::object()) {
00233         throw std::invalid_argument(
00234           "That Python file object has no 'write' attribute");
00235       }
00236       farthest_pptr = std::max(farthest_pptr, pptr());
00237       off_type n_written = (off_type)(farthest_pptr - pbase());
00238       bp::str chunk(pbase(), farthest_pptr);
00239       py_write(chunk);
00240       if (!traits_type::eq_int_type(c, traits_type::eof())) {
00241         py_write(traits_type::to_char_type(c));
00242         n_written++;
00243       }
00244       if (n_written) {
00245         pos_of_write_buffer_end_in_py_file += n_written;
00246         setp(pbase(), epptr());
00247         // ^^^ 27.5.2.4.5 (5)
00248         farthest_pptr = pptr();
00249       }
00250       return traits_type::eq_int_type(
00251         c, traits_type::eof()) ? traits_type::not_eof(c) : c;
00252     }
00253 
00254     /// Update the python file to reflect the state of this stream buffer
00255     /** Empty the write buffer into the Python file object and set the seek
00256         position of the latter accordingly (C++ standard section 27.5.2.4.2).
00257         If there is no write buffer or it is empty, but there is a non-empty
00258         read buffer, set the Python file object seek position to the
00259         seek position in that read buffer.
00260     */
00261     virtual int sync() {
00262       int result = 0;
00263       farthest_pptr = std::max(farthest_pptr, pptr());
00264       if (farthest_pptr && farthest_pptr > pbase()) {
00265         off_type delta = pptr() - farthest_pptr;
00266         int_type status = overflow();
00267         if (traits_type::eq_int_type(status, traits_type::eof())) result = -1;
00268         if (py_seek != bp::object()) py_seek(delta, 1);
00269       }
00270       else if (gptr() && gptr() < egptr()) {
00271         if (py_seek != bp::object()) py_seek(gptr() - egptr(), 1);
00272       }
00273       return result;
00274     }
00275 
00276     /// C.f. C++ standard section 27.5.2.4.2
00277     /** This implementation is optimised to look whether the position is within
00278         the buffers, so as to avoid calling Python seek or tell. It is
00279         important for many applications that the overhead of calling into Python
00280         is avoided as much as possible (e.g. parsers which may do a lot of
00281         backtracking)
00282     */
00283     virtual
00284     pos_type seekoff(off_type off, std::ios_base::seekdir way,
00285                      std::ios_base::openmode which=  std::ios_base::in
00286                                                    | std::ios_base::out)
00287     {
00288       /* In practice, "which" is either std::ios_base::in or out
00289          since we end up here because either seekp or seekg was called
00290          on the stream using this buffer. That simplifies the code
00291          in a few places.
00292       */
00293       int const failure = off_type(-1);
00294 
00295       if (py_seek == bp::object()) {
00296         throw std::invalid_argument(
00297           "That Python file object has no 'seek' attribute");
00298       }
00299 
00300       // we need the read buffer to contain something!
00301       if (which == std::ios_base::in && !gptr()) {
00302         if (traits_type::eq_int_type(underflow(), traits_type::eof())) {
00303           return failure;
00304         }
00305       }
00306 
00307       // compute the whence parameter for Python seek
00308       int whence;
00309       switch (way) {
00310         case std::ios_base::beg:
00311           whence = 0;
00312           break;
00313         case std::ios_base::cur:
00314           whence = 1;
00315           break;
00316         case std::ios_base::end:
00317           whence = 2;
00318           break;
00319         default:
00320           return failure;
00321       }
00322 
00323       // Let's have a go
00324       boost::optional<off_type> result = seekoff_without_calling_python(
00325         off, way, which);
00326       if (!result) {
00327         // we need to call Python
00328         if (which == std::ios_base::out) overflow();
00329         if (way == std::ios_base::cur) {
00330           if      (which == std::ios_base::in)  off -= egptr() - gptr();
00331           else if (which == std::ios_base::out) off += pptr() - pbase();
00332         }
00333         py_seek(off, whence);
00334         result = off_type(bp::extract<off_type>(py_tell()));
00335         if (which == std::ios_base::in) underflow();
00336       }
00337       return *result;
00338     }
00339 
00340     /// C.f. C++ standard section 27.5.2.4.2
00341     virtual
00342     pos_type seekpos(pos_type sp,
00343                      std::ios_base::openmode which=  std::ios_base::in
00344                                                    | std::ios_base::out)
00345     {
00346       return streambuf::seekoff(sp, std::ios_base::beg, which);
00347     }
00348 
00349   private:
00350     bp::object py_read, py_write, py_seek, py_tell;
00351 
00352     std::size_t buffer_size;
00353 
00354     /* This is actually a Python string and the actual read buffer is
00355        its internal data, i.e. an array of characters. We use a Boost.Python
00356        object so as to hold on it: as a result, the actual buffer can't
00357        go away.
00358     */
00359     bp::object read_buffer;
00360 
00361     /* A mere array of char's allocated on the heap at construction time and
00362        de-allocated only at destruction time.
00363     */
00364     char *write_buffer;
00365 
00366     off_type pos_of_read_buffer_end_in_py_file,
00367              pos_of_write_buffer_end_in_py_file;
00368 
00369     // the farthest place the buffer has been written into
00370     char *farthest_pptr;
00371 
00372 
00373     boost::optional<off_type> seekoff_without_calling_python(
00374       off_type off,
00375       std::ios_base::seekdir way,
00376       std::ios_base::openmode which)
00377     {
00378       boost::optional<off_type> const failure;
00379 
00380       // Buffer range and current position
00381       off_type buf_begin, buf_end, buf_cur, upper_bound;
00382       off_type pos_of_buffer_end_in_py_file;
00383       if (which == std::ios_base::in) {
00384         pos_of_buffer_end_in_py_file = pos_of_read_buffer_end_in_py_file;
00385         buf_begin = reinterpret_cast<std::streamsize>(eback());
00386         buf_cur = reinterpret_cast<std::streamsize>(gptr());
00387         buf_end = reinterpret_cast<std::streamsize>(egptr());
00388         upper_bound = buf_end;
00389       }
00390       else if (which == std::ios_base::out) {
00391         pos_of_buffer_end_in_py_file = pos_of_write_buffer_end_in_py_file;
00392         buf_begin = reinterpret_cast<std::streamsize>(pbase());
00393         buf_cur = reinterpret_cast<std::streamsize>(pptr());
00394         buf_end = reinterpret_cast<std::streamsize>(epptr());
00395         farthest_pptr = std::max(farthest_pptr, pptr());
00396         upper_bound = reinterpret_cast<std::streamsize>(farthest_pptr) + 1;
00397       }
00398       else {
00399         CHECK_INVARIANT(0,"unreachable code");
00400       }
00401 
00402       // Sought position in "buffer coordinate"
00403       off_type buf_sought;
00404       if (way == std::ios_base::cur) {
00405         buf_sought = buf_cur + off;
00406       }
00407       else if (way == std::ios_base::beg) {
00408         buf_sought = buf_end + (off - pos_of_buffer_end_in_py_file);
00409       }
00410       else if (way == std::ios_base::end) {
00411         return failure;
00412       }
00413       else {
00414         CHECK_INVARIANT(0,"unreachable code");
00415       }
00416 
00417       // if the sought position is not in the buffer, give up
00418       if (buf_sought < buf_begin || buf_sought >= upper_bound) return failure;
00419 
00420       // we are in wonderland
00421       if      (which == std::ios_base::in)  gbump(buf_sought - buf_cur);
00422       else if (which == std::ios_base::out) pbump(buf_sought - buf_cur);
00423       return pos_of_buffer_end_in_py_file + (buf_sought - buf_end);
00424     }
00425 
00426   public:
00427 
00428     class istream : public std::istream
00429     {
00430       public:
00431         istream(streambuf& buf) : std::istream(&buf)
00432         {
00433           exceptions(std::ios_base::badbit);
00434         }
00435 
00436         ~istream() { if (this->good()) this->sync(); }
00437     };
00438 
00439     class ostream : public std::ostream
00440     {
00441       public:
00442         ostream(streambuf& buf) : std::ostream(&buf)
00443         {
00444           exceptions(std::ios_base::badbit);
00445         }
00446 
00447         ~ostream() { if (this->good()) this->flush(); }
00448     };
00449 };
00450 
00451     //std::size_t streambuf::default_buffer_size = 1024;
00452 
00453 struct streambuf_capsule
00454 {
00455   streambuf python_streambuf;
00456 
00457   streambuf_capsule(
00458     bp::object& python_file_obj,
00459     std::size_t buffer_size=0)
00460   :
00461     python_streambuf(python_file_obj, buffer_size)
00462   {}
00463 };
00464 
00465 struct ostream : private streambuf_capsule, streambuf::ostream
00466 {
00467   ostream(
00468     bp::object& python_file_obj,
00469     std::size_t buffer_size=0)
00470   :
00471     streambuf_capsule(python_file_obj, buffer_size),
00472     streambuf::ostream(python_streambuf)
00473   {}
00474 
00475   ~ostream()
00476   {
00477     try {
00478       if (this->good()) this->flush();
00479     }
00480     catch (bp::error_already_set&) {
00481       PyErr_Clear();
00482       throw std::runtime_error(
00483         "Problem closing python ostream.\n"
00484         "  Known limitation: the error is unrecoverable. Sorry.\n"
00485         "  Suggestion for programmer: add ostream.flush() before"
00486         " returning.");
00487     }
00488   }
00489 };
00490 
00491 }} // boost_adaptbx::python
00492 
00493 #endif // GUARD