FST  openfst-1.7.3
OpenFst Library
sttable.h
Go to the documentation of this file.
1 // See www.openfst.org for extensive documentation on this weighted
2 // finite-state transducer library.
3 //
4 // A generic string-to-type table file format.
5 //
6 // This is not meant as a generalization of SSTable. This is more of a simple
7 // replacement for SSTable in order to provide an open-source implementation
8 // of the FAR format for the external version of the FST library.
9 
10 #ifndef FST_EXTENSIONS_FAR_STTABLE_H_
11 #define FST_EXTENSIONS_FAR_STTABLE_H_
12 
13 #include <algorithm>
14 #include <istream>
15 #include <memory>
16 
17 #include <fstream>
18 #include <fst/util.h>
19 
20 namespace fst {
21 
22 static constexpr int32 kSTTableMagicNumber = 2125656924;
23 static constexpr int32 kSTTableFileVersion = 1;
24 
25 // String-type table writing class for an object of type T using a functor
26 // Writer. The Writer functor must provide at least the following interface:
27 //
28 // struct Writer {
29 // void operator()(std::ostream &, const T &) const;
30 // };
31 template <class T, class Writer>
33  public:
34  explicit STTableWriter(const std::string &filename)
35  : stream_(filename, std::ios_base::out | std::ios_base::binary),
36  error_(false) {
37  WriteType(stream_, kSTTableMagicNumber);
38  WriteType(stream_, kSTTableFileVersion);
39  if (stream_.fail()) {
40  FSTERROR() << "STTableWriter::STTableWriter: Error writing to file: "
41  << filename;
42  error_ = true;
43  }
44  }
45 
46  static STTableWriter<T, Writer> *Create(const std::string &filename) {
47  if (filename.empty()) {
48  LOG(ERROR) << "STTableWriter: Writing to standard out unsupported.";
49  return nullptr;
50  }
51  return new STTableWriter<T, Writer>(filename);
52  }
53 
54  void Add(const std::string &key, const T &t) {
55  if (key.empty()) {
56  FSTERROR() << "STTableWriter::Add: Key empty: " << key;
57  error_ = true;
58  } else if (key < last_key_) {
59  FSTERROR() << "STTableWriter::Add: Key out of order: " << key;
60  error_ = true;
61  }
62  if (error_) return;
63  last_key_ = key;
64  positions_.push_back(stream_.tellp());
65  WriteType(stream_, key);
66  entry_writer_(stream_, t);
67  }
68 
69  bool Error() const { return error_; }
70 
72  WriteType(stream_, positions_);
73  WriteType(stream_, static_cast<int64>(positions_.size()));
74  }
75 
76  private:
77  Writer entry_writer_;
78  std::ofstream stream_;
79  std::vector<int64> positions_; // Position in file of each key-entry pair.
80  std::string last_key_; // Last key.
81  bool error_;
82 
83  STTableWriter(const STTableWriter &) = delete;
84  STTableWriter &operator=(const STTableWriter &) = delete;
85 };
86 
87 // String-type table reading class for object of type T using a functor Reader.
88 // Reader must provide at least the following interface:
89 //
90 // struct Reader {
91 // T *operator()(std::istream &) const;
92 // };
93 //
94 template <class T, class Reader>
96  public:
97  explicit STTableReader(const std::vector<std::string> &filenames)
98  : sources_(filenames), error_(false) {
99  compare_.reset(new Compare(&keys_));
100  keys_.resize(filenames.size());
101  streams_.resize(filenames.size(), 0);
102  positions_.resize(filenames.size());
103  for (size_t i = 0; i < filenames.size(); ++i) {
104  streams_[i] = new std::ifstream(
105  filenames[i], std::ios_base::in | std::ios_base::binary);
106  if (streams_[i]->fail()) {
107  FSTERROR() << "STTableReader::STTableReader: Error reading file: "
108  << filenames[i];
109  error_ = true;
110  return;
111  }
112  int32 magic_number = 0;
113  ReadType(*streams_[i], &magic_number);
114  int32 file_version = 0;
115  ReadType(*streams_[i], &file_version);
116  if (magic_number != kSTTableMagicNumber) {
117  FSTERROR() << "STTableReader::STTableReader: Wrong file type: "
118  << filenames[i];
119  error_ = true;
120  return;
121  }
122  if (file_version != kSTTableFileVersion) {
123  FSTERROR() << "STTableReader::STTableReader: Wrong file version: "
124  << filenames[i];
125  error_ = true;
126  return;
127  }
128  int64 num_entries;
129  streams_[i]->seekg(-static_cast<int>(sizeof(int64)), std::ios_base::end);
130  ReadType(*streams_[i], &num_entries);
131  if (num_entries > 0) {
132  streams_[i]->seekg(-static_cast<int>(sizeof(int64)) * (num_entries + 1),
133  std::ios_base::end);
134  positions_[i].resize(num_entries);
135  for (size_t j = 0; (j < num_entries) && (!streams_[i]->fail()); ++j) {
136  ReadType(*streams_[i], &(positions_[i][j]));
137  }
138  streams_[i]->seekg(positions_[i][0]);
139  if (streams_[i]->fail()) {
140  FSTERROR() << "STTableReader::STTableReader: Error reading file: "
141  << filenames[i];
142  error_ = true;
143  return;
144  }
145  }
146  }
147  MakeHeap();
148  }
149 
151  for (auto &stream : streams_) delete stream;
152  }
153 
154  static STTableReader<T, Reader> *Open(const std::string &filename) {
155  if (filename.empty()) {
156  LOG(ERROR) << "STTableReader: Operation not supported on standard input";
157  return nullptr;
158  }
159  std::vector<std::string> filenames;
160  filenames.push_back(filename);
161  return new STTableReader<T, Reader>(filenames);
162  }
163 
165  const std::vector<std::string> &filenames) {
166  return new STTableReader<T, Reader>(filenames);
167  }
168 
169  void Reset() {
170  if (error_) return;
171  for (size_t i = 0; i < streams_.size(); ++i)
172  streams_[i]->seekg(positions_[i].front());
173  MakeHeap();
174  }
175 
176  bool Find(const std::string &key) {
177  if (error_) return false;
178  for (size_t i = 0; i < streams_.size(); ++i) LowerBound(i, key);
179  MakeHeap();
180  if (heap_.empty()) return false;
181  return keys_[current_] == key;
182  }
183 
184  bool Done() const { return error_ || heap_.empty(); }
185 
186  void Next() {
187  if (error_) return;
188  if (streams_[current_]->tellg() <= positions_[current_].back()) {
189  ReadType(*(streams_[current_]), &(keys_[current_]));
190  if (streams_[current_]->fail()) {
191  FSTERROR() << "STTableReader: Error reading file: "
192  << sources_[current_];
193  error_ = true;
194  return;
195  }
196  std::push_heap(heap_.begin(), heap_.end(), *compare_);
197  } else {
198  heap_.pop_back();
199  }
200  if (!heap_.empty()) PopHeap();
201  }
202 
203  const std::string &GetKey() const { return keys_[current_]; }
204 
205  const T *GetEntry() const { return entry_.get(); }
206 
207  bool Error() const { return error_; }
208 
209  private:
210  // Comparison functor used to compare stream IDs in the heap.
211  struct Compare {
212  explicit Compare(const std::vector<std::string> *keys) : keys(keys) {}
213 
214  bool operator()(size_t i, size_t j) const {
215  return (*keys)[i] > (*keys)[j];
216  };
217 
218  private:
219  const std::vector<std::string> *keys;
220  };
221 
222  // Positions the stream at the position corresponding to the lower bound for
223  // the specified key.
224  void LowerBound(size_t id, const std::string &find_key) {
225  auto *strm = streams_[id];
226  const auto &positions = positions_[id];
227  if (positions.empty()) return;
228  size_t low = 0;
229  size_t high = positions.size() - 1;
230  while (low < high) {
231  size_t mid = (low + high) / 2;
232  strm->seekg(positions[mid]);
233  std::string key;
234  ReadType(*strm, &key);
235  if (key > find_key) {
236  high = mid;
237  } else if (key < find_key) {
238  low = mid + 1;
239  } else {
240  for (size_t i = mid; i > low; --i) {
241  strm->seekg(positions[i - 1]);
242  ReadType(*strm, &key);
243  if (key != find_key) {
244  strm->seekg(positions[i]);
245  return;
246  }
247  }
248  strm->seekg(positions[low]);
249  return;
250  }
251  }
252  strm->seekg(positions[low]);
253  }
254 
255  // Adds all streams to the heap.
256  void MakeHeap() {
257  heap_.clear();
258  for (size_t i = 0; i < streams_.size(); ++i) {
259  if (positions_[i].empty()) continue;
260  ReadType(*streams_[i], &(keys_[i]));
261  if (streams_[i]->fail()) {
262  FSTERROR() << "STTableReader: Error reading file: " << sources_[i];
263  error_ = true;
264  return;
265  }
266  heap_.push_back(i);
267  }
268  if (heap_.empty()) return;
269  std::make_heap(heap_.begin(), heap_.end(), *compare_);
270  PopHeap();
271  }
272 
273  // Positions the stream with the lowest key at the top of the heap, sets
274  // current_ to the ID of that stream, and reads the current entry from that
275  // stream.
276  void PopHeap() {
277  std::pop_heap(heap_.begin(), heap_.end(), *compare_);
278  current_ = heap_.back();
279  entry_.reset(entry_reader_(*streams_[current_]));
280  if (!entry_) error_ = true;
281  if (streams_[current_]->fail()) {
282  FSTERROR() << "STTableReader: Error reading entry for key: "
283  << keys_[current_] << ", file: " << sources_[current_];
284  error_ = true;
285  }
286  }
287 
288  Reader entry_reader_;
289  std::vector<std::istream *> streams_; // Input streams.
290  std::vector<std::string> sources_; // Corresponding file names.
291  std::vector<std::vector<int64>> positions_; // Index of positions.
292  std::vector<std::string> keys_; // Lowest unread key for each stream.
293  std::vector<int64> heap_; // Heap containing ID of streams with unread keys.
294  int64 current_; // ID of current stream to be read.
295  std::unique_ptr<Compare> compare_; // Functor comparing stream IDs.
296  mutable std::unique_ptr<T> entry_; // The currently read entry.
297  bool error_;
298 };
299 
300 // String-type table header reading function template on the entry header type.
301 // The Header type must provide at least the following interface:
302 //
303 // struct Header {
304 // void Read(std::istream &istrm, const string &filename);
305 // };
306 template <class Header>
307 bool ReadSTTableHeader(const std::string &filename, Header *header) {
308  if (filename.empty()) {
309  LOG(ERROR) << "ReadSTTable: Can't read header from standard input";
310  return false;
311  }
312  std::ifstream strm(filename, std::ios_base::in | std::ios_base::binary);
313  if (!strm) {
314  LOG(ERROR) << "ReadSTTableHeader: Could not open file: " << filename;
315  return false;
316  }
317  int32 magic_number = 0;
318  ReadType(strm, &magic_number);
319  int32 file_version = 0;
320  ReadType(strm, &file_version);
321  if (magic_number != kSTTableMagicNumber) {
322  LOG(ERROR) << "ReadSTTableHeader: Wrong file type: " << filename;
323  return false;
324  }
325  if (file_version != kSTTableFileVersion) {
326  LOG(ERROR) << "ReadSTTableHeader: Wrong file version: " << filename;
327  return false;
328  }
329  int64 i = -1;
330  strm.seekg(-static_cast<int>(sizeof(int64)), std::ios_base::end);
331  ReadType(strm, &i); // Reads number of entries
332  if (strm.fail()) {
333  LOG(ERROR) << "ReadSTTableHeader: Error reading file: " << filename;
334  return false;
335  }
336  if (i == 0) return true; // No entry header to read.
337  strm.seekg(-2 * static_cast<int>(sizeof(int64)), std::ios_base::end);
338  ReadType(strm, &i); // Reads position for last entry in file.
339  strm.seekg(i);
340  std::string key;
341  ReadType(strm, &key);
342  header->Read(strm, filename + ":" + key);
343  if (strm.fail()) {
344  LOG(ERROR) << "ReadSTTableHeader: Error reading file: " << filename;
345  return false;
346  }
347  return true;
348 }
349 
350 bool IsSTTable(const std::string &filename);
351 
352 } // namespace fst
353 
354 #endif // FST_EXTENSIONS_FAR_STTABLE_H_
bool ReadSTTableHeader(const std::string &filename, Header *header)
Definition: sttable.h:307
#define LOG(type)
Definition: log.h:46
static STTableReader< T, Reader > * Open(const std::vector< std::string > &filenames)
Definition: sttable.h:164
STTableReader(const std::vector< std::string > &filenames)
Definition: sttable.h:97
int64_t int64
Definition: types.h:27
std::ostream & WriteType(std::ostream &strm, const T t)
Definition: util.h:155
#define FSTERROR()
Definition: util.h:35
bool Error() const
Definition: sttable.h:207
bool Done() const
Definition: sttable.h:184
const std::string & GetKey() const
Definition: sttable.h:203
bool IsSTTable(const std::string &filename)
Definition: sttable.cc:9
static STTableReader< T, Reader > * Open(const std::string &filename)
Definition: sttable.h:154
static STTableWriter< T, Writer > * Create(const std::string &filename)
Definition: sttable.h:46
int32_t int32
Definition: types.h:26
bool Find(const std::string &key)
Definition: sttable.h:176
STTableWriter(const std::string &filename)
Definition: sttable.h:34
const T * GetEntry() const
Definition: sttable.h:205
std::istream & ReadType(std::istream &strm, T *t)
Definition: util.h:47
bool Error() const
Definition: sttable.h:69
void Add(const std::string &key, const T &t)
Definition: sttable.h:54