FST  openfst-1.8.3
OpenFst Library
sttable.h
Go to the documentation of this file.
1 // Copyright 2005-2024 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the 'License');
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an 'AS IS' BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 //
15 // See www.openfst.org for extensive documentation on this weighted
16 // finite-state transducer library.
17 //
18 // A generic string-to-type table file format.
19 //
20 // This is not meant as a generalization of SSTable. This is more of a simple
21 // replacement for SSTable in order to provide an open-source implementation
22 // of the FAR format for the external version of the FST library.
23 
24 #ifndef FST_EXTENSIONS_FAR_STTABLE_H_
25 #define FST_EXTENSIONS_FAR_STTABLE_H_
26 
27 #include <algorithm>
28 #include <cstddef>
29 #include <cstdint>
30 #include <ios>
31 #include <istream>
32 #include <memory>
33 #include <string>
34 #include <vector>
35 
36 #include <fst/log.h>
37 #include <fstream>
38 #include <fst/util.h>
39 #include <string_view>
40 
41 namespace fst {
42 
43 inline constexpr int32_t kSTTableMagicNumber = 2125656924;
44 inline constexpr int32_t kSTTableFileVersion = 1;
45 
46 // String-type table writing class for an object of type T using a functor
47 // Writer. The Writer functor must provide at least the following interface:
48 //
49 // struct Writer {
50 // void operator()(std::ostream &, const T &) const;
51 // };
52 template <class T, class Writer>
54  public:
55  explicit STTableWriter(std::string_view source)
56  : stream_(std::string(source),
57  std::ios_base::out | std::ios_base::binary),
58  error_(false) {
59  WriteType(stream_, kSTTableMagicNumber);
60  WriteType(stream_, kSTTableFileVersion);
61  if (stream_.fail()) {
62  FSTERROR() << "STTableWriter::STTableWriter: Error writing to file: "
63  << source;
64  error_ = true;
65  }
66  }
67 
68  static STTableWriter<T, Writer> *Create(std::string_view source) {
69  if (source.empty()) {
70  LOG(ERROR) << "STTableWriter: Writing to standard out unsupported.";
71  return nullptr;
72  }
73  return new STTableWriter<T, Writer>(source);
74  }
75 
76  void Add(std::string_view key, const T &t) {
77  if (key.empty()) {
78  FSTERROR() << "STTableWriter::Add: Key empty: " << key;
79  error_ = true;
80  } else if (key < last_key_) {
81  FSTERROR() << "STTableWriter::Add: Key out of order: " << key;
82  error_ = true;
83  }
84  if (error_) return;
85  last_key_.assign(key.data(), key.size());
86  positions_.push_back(stream_.tellp());
87  WriteType(stream_, key);
88  entry_writer_(stream_, t);
89  }
90 
91  bool Error() const { return error_; }
92 
94  WriteType(stream_, positions_);
95  WriteType(stream_, static_cast<int64_t>(positions_.size()));
96  }
97 
98  private:
99  Writer entry_writer_;
100  std::ofstream stream_;
101  std::vector<int64_t> positions_; // Position in file of each key-entry pair.
102  std::string last_key_; // Last key.
103  bool error_;
104 
105  STTableWriter(const STTableWriter &) = delete;
106  STTableWriter &operator=(const STTableWriter &) = delete;
107 };
108 
109 // String-type table reading class for object of type T using a functor Reader.
110 // Reader must provide at least the following interface:
111 //
112 // struct Reader {
113 // T *operator()(std::istream &) const;
114 // };
115 //
116 template <class T, class Reader>
118  public:
119  explicit STTableReader(const std::vector<std::string> &sources)
120  : sources_(sources), error_(false) {
121  compare_.reset(new Compare(&keys_));
122  keys_.resize(sources.size());
123  streams_.resize(sources.size(), nullptr);
124  positions_.resize(sources.size());
125  for (size_t i = 0; i < sources.size(); ++i) {
126  streams_[i] = new std::ifstream(
127  sources[i], std::ios_base::in | std::ios_base::binary);
128  if (streams_[i]->fail()) {
129  FSTERROR() << "STTableReader::STTableReader: Error reading file: "
130  << sources[i];
131  error_ = true;
132  return;
133  }
134  int32_t magic_number = 0;
135  ReadType(*streams_[i], &magic_number);
136  int32_t file_version = 0;
137  ReadType(*streams_[i], &file_version);
138  if (magic_number != kSTTableMagicNumber) {
139  FSTERROR() << "STTableReader::STTableReader: Wrong file type: "
140  << sources[i];
141  error_ = true;
142  return;
143  }
144  if (file_version != kSTTableFileVersion) {
145  FSTERROR() << "STTableReader::STTableReader: Wrong file version: "
146  << sources[i];
147  error_ = true;
148  return;
149  }
150  int64_t num_entries;
151  streams_[i]->seekg(-static_cast<int>(sizeof(int64_t)),
152  std::ios_base::end);
153  ReadType(*streams_[i], &num_entries);
154  if (num_entries > 0) {
155  streams_[i]->seekg(
156  -static_cast<int>(sizeof(int64_t)) * (num_entries + 1),
157  std::ios_base::end);
158  positions_[i].resize(num_entries);
159  for (size_t j = 0; (j < num_entries) && (!streams_[i]->fail()); ++j) {
160  ReadType(*streams_[i], &(positions_[i][j]));
161  }
162  streams_[i]->seekg(positions_[i][0]);
163  if (streams_[i]->fail()) {
164  FSTERROR() << "STTableReader::STTableReader: Error reading file: "
165  << sources[i];
166  error_ = true;
167  return;
168  }
169  }
170  }
171  MakeHeap();
172  }
173 
175  for (auto &stream : streams_) delete stream;
176  }
177 
178  static STTableReader<T, Reader> *Open(std::string_view source) {
179  if (source.empty()) {
180  LOG(ERROR) << "STTableReader: Operation not supported on standard input";
181  return nullptr;
182  }
183  std::vector<std::string> sources;
184  sources.push_back(std::string(source));
185  return new STTableReader<T, Reader>(sources);
186  }
187 
189  const std::vector<std::string> &sources) {
190  return new STTableReader<T, Reader>(sources);
191  }
192 
193  void Reset() {
194  if (error_) return;
195  for (size_t i = 0; i < streams_.size(); ++i) {
196  if (!positions_[i].empty()) {
197  streams_[i]->seekg(positions_[i].front());
198  }
199  }
200  MakeHeap();
201  }
202 
203  bool Find(std::string_view key) {
204  if (error_) return false;
205  for (size_t i = 0; i < streams_.size(); ++i) LowerBound(i, key);
206  MakeHeap();
207  if (heap_.empty()) return false;
208  return keys_[current_] == key;
209  }
210 
211  bool Done() const { return error_ || heap_.empty(); }
212 
213  void Next() {
214  if (error_) return;
215  if (streams_[current_]->tellg() <= positions_[current_].back()) {
216  ReadType(*(streams_[current_]), &(keys_[current_]));
217  if (streams_[current_]->fail()) {
218  FSTERROR() << "STTableReader: Error reading file: "
219  << sources_[current_];
220  error_ = true;
221  return;
222  }
223  std::push_heap(heap_.begin(), heap_.end(), *compare_);
224  } else {
225  heap_.pop_back();
226  }
227  if (!heap_.empty()) PopHeap();
228  }
229 
230  const std::string &GetKey() const { return keys_[current_]; }
231 
232  const T *GetEntry() const { return entry_.get(); }
233 
234  bool Error() const { return error_; }
235 
236  private:
237  // Comparison functor used to compare stream IDs in the heap.
238  struct Compare {
239  explicit Compare(const std::vector<std::string> *keys) : keys(keys) {}
240 
241  bool operator()(size_t i, size_t j) const {
242  return (*keys)[i] > (*keys)[j];
243  }
244 
245  private:
246  const std::vector<std::string> *keys;
247  };
248 
249  // Positions the stream at the position corresponding to the lower bound for
250  // the specified key.
251  void LowerBound(size_t id, std::string_view find_key) {
252  auto *strm = streams_[id];
253  const auto &positions = positions_[id];
254  if (positions.empty()) return;
255  size_t low = 0;
256  size_t high = positions.size() - 1;
257  while (low < high) {
258  size_t mid = (low + high) / 2;
259  strm->seekg(positions[mid]);
260  std::string key;
261  ReadType(*strm, &key);
262  if (key > find_key) {
263  high = mid;
264  } else if (key < find_key) {
265  low = mid + 1;
266  } else {
267  for (size_t i = mid; i > low; --i) {
268  strm->seekg(positions[i - 1]);
269  ReadType(*strm, &key);
270  if (key != find_key) {
271  strm->seekg(positions[i]);
272  return;
273  }
274  }
275  strm->seekg(positions[low]);
276  return;
277  }
278  }
279  strm->seekg(positions[low]);
280  }
281 
282  // Adds all streams to the heap.
283  void MakeHeap() {
284  heap_.clear();
285  for (size_t i = 0; i < streams_.size(); ++i) {
286  if (positions_[i].empty()) continue;
287  ReadType(*streams_[i], &(keys_[i]));
288  if (streams_[i]->fail()) {
289  FSTERROR() << "STTableReader: Error reading file: " << sources_[i];
290  error_ = true;
291  return;
292  }
293  heap_.push_back(i);
294  }
295  if (heap_.empty()) return;
296  std::make_heap(heap_.begin(), heap_.end(), *compare_);
297  PopHeap();
298  }
299 
300  // Positions the stream with the lowest key at the top of the heap, sets
301  // current_ to the ID of that stream, and reads the current entry from that
302  // stream.
303  void PopHeap() {
304  std::pop_heap(heap_.begin(), heap_.end(), *compare_);
305  current_ = heap_.back();
306  entry_.reset(entry_reader_(*streams_[current_]));
307  if (!entry_) error_ = true;
308  if (streams_[current_]->fail()) {
309  FSTERROR() << "STTableReader: Error reading entry for key: "
310  << keys_[current_] << ", file: " << sources_[current_];
311  error_ = true;
312  }
313  }
314 
315  Reader entry_reader_;
316  std::vector<std::istream *> streams_; // Input streams.
317  std::vector<std::string> sources_; // Corresponding file names.
318  std::vector<std::vector<int64_t>> positions_; // Index of positions.
319  std::vector<std::string> keys_; // Lowest unread key for each stream.
320  std::vector<int64_t>
321  heap_; // Heap containing ID of streams with unread keys.
322  int64_t current_; // ID of current stream to be read.
323  std::unique_ptr<Compare> compare_; // Functor comparing stream IDs.
324  mutable std::unique_ptr<T> entry_; // The currently read entry.
325  bool error_;
326 };
327 
328 // String-type table header reading function template on the entry header type.
329 // The Header type must provide at least the following interface:
330 //
331 // struct Header {
332 // void Read(std::istream &istrm, const string &source);
333 // };
334 template <class Header>
335 bool ReadSTTableHeader(const std::string &source, Header *header) {
336  if (source.empty()) {
337  LOG(ERROR) << "ReadSTTable: Can't read header from standard input";
338  return false;
339  }
340  std::ifstream strm(source, std::ios_base::in | std::ios_base::binary);
341  if (!strm) {
342  LOG(ERROR) << "ReadSTTableHeader: Could not open file: " << source;
343  return false;
344  }
345  int32_t magic_number = 0;
346  ReadType(strm, &magic_number);
347  int32_t file_version = 0;
348  ReadType(strm, &file_version);
349  if (magic_number != kSTTableMagicNumber) {
350  LOG(ERROR) << "ReadSTTableHeader: Wrong file type: " << source;
351  return false;
352  }
353  if (file_version != kSTTableFileVersion) {
354  LOG(ERROR) << "ReadSTTableHeader: Wrong file version: " << source;
355  return false;
356  }
357  int64_t i = -1;
358  strm.seekg(-static_cast<int>(sizeof(int64_t)), std::ios_base::end);
359  ReadType(strm, &i); // Reads number of entries
360  if (strm.fail()) {
361  LOG(ERROR) << "ReadSTTableHeader: Error reading file: " << source;
362  return false;
363  }
364  if (i == 0) return true; // No entry header to read.
365  strm.seekg(-2 * static_cast<int>(sizeof(int64_t)), std::ios_base::end);
366  ReadType(strm, &i); // Reads position for last entry in file.
367  strm.seekg(i);
368  std::string key;
369  ReadType(strm, &key);
370  if (!header->Read(strm, source + ":" + key)) {
371  LOG(ERROR) << "ReadSTTableHeader: Error reading FstHeader: " << source;
372  return false;
373  }
374  if (strm.fail()) {
375  LOG(ERROR) << "ReadSTTableHeader: Error reading file: " << source;
376  return false;
377  }
378  return true;
379 }
380 
381 bool IsSTTable(std::string_view source);
382 
383 } // namespace fst
384 
385 #endif // FST_EXTENSIONS_FAR_STTABLE_H_
constexpr int32_t kSTTableMagicNumber
Definition: sttable.h:43
constexpr int32_t kSTTableFileVersion
Definition: sttable.h:44
void Add(std::string_view key, const T &t)
Definition: sttable.h:76
#define LOG(type)
Definition: log.h:53
bool Find(std::string_view key)
Definition: sttable.h:203
static STTableReader< T, Reader > * Open(const std::vector< std::string > &sources)
Definition: sttable.h:188
std::ostream & WriteType(std::ostream &strm, const T t)
Definition: util.h:228
#define FSTERROR()
Definition: util.h:56
bool Error() const
Definition: sttable.h:234
STTableReader(const std::vector< std::string > &sources)
Definition: sttable.h:119
bool Done() const
Definition: sttable.h:211
const std::string & GetKey() const
Definition: sttable.h:230
bool ReadSTTableHeader(const std::string &source, Header *header)
Definition: sttable.h:335
STTableWriter(std::string_view source)
Definition: sttable.h:55
static STTableWriter< T, Writer > * Create(std::string_view source)
Definition: sttable.h:68
const T * GetEntry() const
Definition: sttable.h:232
std::istream & ReadType(std::istream &strm, T *t)
Definition: util.h:80
bool IsSTTable(std::string_view source)
Definition: sttable.cc:29
bool Error() const
Definition: sttable.h:91
static STTableReader< T, Reader > * Open(std::string_view source)
Definition: sttable.h:178