RDKit
Open-source cheminformatics and machine learning.
Loading...
Searching...
No Matches
GeneralFileReader.h
Go to the documentation of this file.
1//
2// Copyright (C) 2020 Shrey Aryan
3//
4// @@ All Rights Reserved @@
5// This file is part of the RDKit.
6// The contents are covered by the terms of the BSD license
7// which is included in the file license.txt, found at the root
8// of the RDKit source tree.
9//
10#ifndef GENERAL_FILE_READER_H
11#define GENERAL_FILE_READER_H
13#include <RDStreams/streams.h>
14
15#include <boost/algorithm/string.hpp>
16#include <memory>
17#include <string>
18#include <vector>
19
20#include "MolSupplier.h"
23
24namespace RDKit {
28 bool takeOwnership = true;
29 bool sanitize = true;
30 bool removeHs = true;
31 bool strictParsing = true;
32
33 std::string delimiter = "\t";
34 int smilesColumn = 0;
35 int nameColumn = 1;
36 bool titleLine = true;
37
38 std::string nameRecord = "";
39 int confId2D = -1;
40 int confId3D = 0;
41
43};
//! structure-file extensions (lower case, without the leading dot) that
//! determineFormat() matches against the end of a path
const std::vector<std::string> supportedFileFormats = {
    "sdf", "mae", "maegz", "sdfgz", "smi", "csv", "txt", "tsv", "tdt"};
//! currently supported compression formats
const std::vector<std::string> supportedCompressionFormats = {"gz"};
49
50//! given file path determines the file and compression format
51//! returns true on success, otherwise false
52//! Note: Error handeling is done in the getSupplier method
53
54inline void determineFormat(const std::string path, std::string &fileFormat,
55 std::string &compressionFormat) {
56 //! filename without compression format
57 std::string basename;
58 //! Special case maegz.
59 //! NOTE: also supporting case-insensitive filesystems
60 if (boost::algorithm::iends_with(path, ".maegz")) {
61 fileFormat = "mae";
62 compressionFormat = "gz";
63 return;
64 } else if (boost::algorithm::iends_with(path, ".sdfgz")) {
65 fileFormat = "sdf";
66 compressionFormat = "gz";
67 return;
68 } else if (boost::algorithm::iends_with(path, ".gz")) {
69 compressionFormat = "gz";
70 basename = path.substr(0, path.size() - 3);
71 } else if (boost::algorithm::iends_with(path, ".zst") ||
72 boost::algorithm::iends_with(path, ".bz2") ||
73 boost::algorithm::iends_with(path, ".7z")) {
74 throw BadFileException(
75 "Unsupported compression extension (.zst, .bz2, .7z) given path: " +
76 path);
77 } else {
78 basename = path;
79 compressionFormat = "";
80 }
81 for (auto const &suffix : supportedFileFormats) {
82 if (boost::algorithm::iends_with(basename, "." + suffix)) {
83 fileFormat = suffix;
84 return;
85 }
86 }
87 throw BadFileException(
88 "Unsupported structure or compression extension given path: " + path);
89}
90
91//! returns a new MolSupplier object based on the file name instantiated
92//! with the relevant options provided in the SupplierOptions struct
93/*!
94 <b>Note:</b>
95 - the caller owns the memory and therefore the pointer must be deleted
96*/
97
98inline std::unique_ptr<FileParsers::MolSupplier> getSupplier(
99 const std::string &path, const struct SupplierOptions &opt) {
100 std::string fileFormat = "";
101 std::string compressionFormat = "";
102 //! get the file and compression format form the path
103 determineFormat(path, fileFormat, compressionFormat);
104
105 std::istream *strm;
106 if (compressionFormat.empty()) {
107 strm = new std::ifstream(path.c_str(), std::ios::in | std::ios::binary);
108 } else {
109#ifdef RDK_USE_BOOST_IOSTREAMS
110 strm = new gzstream(path);
111#else
112 throw BadFileException(
113 "compressed files are only supported if the RDKit is built with boost::iostreams support");
114#endif
115 }
116
117 if ((!(*strm)) || strm->bad()) {
118 std::ostringstream errout;
119 errout << "Bad input file " << path;
120 delete strm;
121 throw BadFileException(errout.str());
122 }
123 strm->peek();
124 if (strm->bad() || strm->eof()) {
125 std::ostringstream errout;
126 errout << "Invalid input file " << path;
127 delete strm;
128 throw BadFileException(errout.str());
129 }
130
131#ifdef RDK_BUILD_THREADSAFE_SSS
132 FileParsers::MultithreadedMolSupplier::Parameters params;
133 params.numWriterThreads = getNumThreadsToUse(opt.numWriterThreads);
134#endif
135 //! Dispatch to the appropriate supplier
136 if (fileFormat == "sdf") {
137 FileParsers::MolFileParserParams parseParams;
138 parseParams.sanitize = opt.sanitize;
139 parseParams.removeHs = opt.removeHs;
140 parseParams.strictParsing = opt.strictParsing;
141#ifdef RDK_BUILD_THREADSAFE_SSS
142 if (params.numWriterThreads > 1) {
143 return std::make_unique<FileParsers::MultithreadedSDMolSupplier>(
144 strm, true, params, parseParams);
145 }
146#endif
147 return std::make_unique<FileParsers::ForwardSDMolSupplier>(strm, true,
148 parseParams);
149 }
150
151 else if (fileFormat == "smi" || fileFormat == "csv" || fileFormat == "txt" ||
152 fileFormat == "tsv") {
153 FileParsers::SmilesMolSupplierParams parseParams;
154 parseParams.delimiter = opt.delimiter;
155 parseParams.smilesColumn = opt.smilesColumn;
156 parseParams.nameColumn = opt.nameColumn;
157 parseParams.titleLine = opt.titleLine;
158 parseParams.parseParameters.sanitize = opt.sanitize;
159#ifdef RDK_BUILD_THREADSAFE_SSS
160 if (params.numWriterThreads > 1) {
161 return std::make_unique<FileParsers::MultithreadedSmilesMolSupplier>(
162 strm, true, params, parseParams);
163 }
164#endif
165 return std::make_unique<FileParsers::SmilesMolSupplier>(strm, true,
166 parseParams);
167 }
168#ifdef RDK_BUILD_MAEPARSER_SUPPORT
169 else if (fileFormat == "mae") {
170 FileParsers::MaeMolSupplierParams parseParams;
171 parseParams.sanitize = opt.sanitize;
172 parseParams.removeHs = opt.removeHs;
173 return std::make_unique<FileParsers::MaeMolSupplier>(strm, true,
174 parseParams);
175 }
176#endif
177 else if (fileFormat == "tdt") {
178 FileParsers::TDTMolSupplierParams parseParams;
179 parseParams.nameRecord = opt.nameRecord;
180 parseParams.confId2D = opt.confId2D;
181 parseParams.confId3D = opt.confId3D;
182 parseParams.parseParameters.sanitize = opt.sanitize;
183 return std::make_unique<FileParsers::TDTMolSupplier>(strm, true,
184 parseParams);
185 }
186 throw BadFileException("Unsupported file format: " + fileFormat);
187}
188
189} // namespace GeneralMolSupplier
190} // namespace RDKit
191#endif
used by various file parsing classes to indicate a bad file
const std::vector< std::string > supportedCompressionFormats
current supported compression formats
const std::vector< std::string > supportedFileFormats
current supported file formats
void determineFormat(const std::string path, std::string &fileFormat, std::string &compressionFormat)
std::unique_ptr< FileParsers::MolSupplier > getSupplier(const std::string &path, const struct SupplierOptions &opt)
Std stuff.
unsigned int getNumThreadsToUse(int target)
Definition RDThreads.h:37