FST  openfst-1.8.2
OpenFst Library
sttable.h
Go to the documentation of this file.
1 // Copyright 2005-2020 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the 'License');
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an 'AS IS' BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 //
15 // See www.openfst.org for extensive documentation on this weighted
16 // finite-state transducer library.
17 //
18 // A generic string-to-type table file format.
19 //
20 // This is not meant as a generalization of SSTable. This is more of a simple
21 // replacement for SSTable in order to provide an open-source implementation
22 // of the FAR format for the external version of the FST library.
23 
24 #ifndef FST_EXTENSIONS_FAR_STTABLE_H_
25 #define FST_EXTENSIONS_FAR_STTABLE_H_
26 
27 #include <algorithm>
28 #include <cstdint>
29 #include <istream>
30 #include <memory>
31 #include <string>
32 
33 #include <fstream>
34 #include <fst/util.h>
35 #include <string_view>
36 
37 namespace fst {
38 
39 inline constexpr int32_t kSTTableMagicNumber = 2125656924;
40 inline constexpr int32_t kSTTableFileVersion = 1;
41 
42 // String-type table writing class for an object of type T using a functor
43 // Writer. The Writer functor must provide at least the following interface:
44 //
45 // struct Writer {
46 // void operator()(std::ostream &, const T &) const;
47 // };
48 template <class T, class Writer>
50  public:
51  explicit STTableWriter(const std::string &source)
52  : stream_(source, std::ios_base::out | std::ios_base::binary),
53  error_(false) {
54  WriteType(stream_, kSTTableMagicNumber);
55  WriteType(stream_, kSTTableFileVersion);
56  if (stream_.fail()) {
57  FSTERROR() << "STTableWriter::STTableWriter: Error writing to file: "
58  << source;
59  error_ = true;
60  }
61  }
62 
63  static STTableWriter<T, Writer> *Create(const std::string &source) {
64  if (source.empty()) {
65  LOG(ERROR) << "STTableWriter: Writing to standard out unsupported.";
66  return nullptr;
67  }
68  return new STTableWriter<T, Writer>(source);
69  }
70 
71  void Add(std::string_view key, const T &t) {
72  if (key.empty()) {
73  FSTERROR() << "STTableWriter::Add: Key empty: " << key;
74  error_ = true;
75  } else if (key < last_key_) {
76  FSTERROR() << "STTableWriter::Add: Key out of order: " << key;
77  error_ = true;
78  }
79  if (error_) return;
80  last_key_.assign(key.data(), key.size());
81  positions_.push_back(stream_.tellp());
82  WriteType(stream_, key);
83  entry_writer_(stream_, t);
84  }
85 
86  bool Error() const { return error_; }
87 
89  WriteType(stream_, positions_);
90  WriteType(stream_, static_cast<int64_t>(positions_.size()));
91  }
92 
93  private:
94  Writer entry_writer_;
95  std::ofstream stream_;
96  std::vector<int64_t> positions_; // Position in file of each key-entry pair.
97  std::string last_key_; // Last key.
98  bool error_;
99 
100  STTableWriter(const STTableWriter &) = delete;
101  STTableWriter &operator=(const STTableWriter &) = delete;
102 };
103 
104 // String-type table reading class for object of type T using a functor Reader.
105 // Reader must provide at least the following interface:
106 //
107 // struct Reader {
108 // T *operator()(std::istream &) const;
109 // };
110 //
111 template <class T, class Reader>
113  public:
114  explicit STTableReader(const std::vector<std::string> &sources)
115  : sources_(sources), error_(false) {
116  compare_.reset(new Compare(&keys_));
117  keys_.resize(sources.size());
118  streams_.resize(sources.size(), nullptr);
119  positions_.resize(sources.size());
120  for (size_t i = 0; i < sources.size(); ++i) {
121  streams_[i] = new std::ifstream(
122  sources[i], std::ios_base::in | std::ios_base::binary);
123  if (streams_[i]->fail()) {
124  FSTERROR() << "STTableReader::STTableReader: Error reading file: "
125  << sources[i];
126  error_ = true;
127  return;
128  }
129  int32_t magic_number = 0;
130  ReadType(*streams_[i], &magic_number);
131  int32_t file_version = 0;
132  ReadType(*streams_[i], &file_version);
133  if (magic_number != kSTTableMagicNumber) {
134  FSTERROR() << "STTableReader::STTableReader: Wrong file type: "
135  << sources[i];
136  error_ = true;
137  return;
138  }
139  if (file_version != kSTTableFileVersion) {
140  FSTERROR() << "STTableReader::STTableReader: Wrong file version: "
141  << sources[i];
142  error_ = true;
143  return;
144  }
145  int64_t num_entries;
146  streams_[i]->seekg(-static_cast<int>(sizeof(int64_t)),
147  std::ios_base::end);
148  ReadType(*streams_[i], &num_entries);
149  if (num_entries > 0) {
150  streams_[i]->seekg(
151  -static_cast<int>(sizeof(int64_t)) * (num_entries + 1),
152  std::ios_base::end);
153  positions_[i].resize(num_entries);
154  for (size_t j = 0; (j < num_entries) && (!streams_[i]->fail()); ++j) {
155  ReadType(*streams_[i], &(positions_[i][j]));
156  }
157  streams_[i]->seekg(positions_[i][0]);
158  if (streams_[i]->fail()) {
159  FSTERROR() << "STTableReader::STTableReader: Error reading file: "
160  << sources[i];
161  error_ = true;
162  return;
163  }
164  }
165  }
166  MakeHeap();
167  }
168 
170  for (auto &stream : streams_) delete stream;
171  }
172 
173  static STTableReader<T, Reader> *Open(const std::string &source) {
174  if (source.empty()) {
175  LOG(ERROR) << "STTableReader: Operation not supported on standard input";
176  return nullptr;
177  }
178  std::vector<std::string> sources;
179  sources.push_back(source);
180  return new STTableReader<T, Reader>(sources);
181  }
182 
184  const std::vector<std::string> &sources) {
185  return new STTableReader<T, Reader>(sources);
186  }
187 
188  void Reset() {
189  if (error_) return;
190  for (size_t i = 0; i < streams_.size(); ++i) {
191  if (!positions_[i].empty()) {
192  streams_[i]->seekg(positions_[i].front());
193  }
194  }
195  MakeHeap();
196  }
197 
198  bool Find(std::string_view key) {
199  if (error_) return false;
200  for (size_t i = 0; i < streams_.size(); ++i) LowerBound(i, key);
201  MakeHeap();
202  if (heap_.empty()) return false;
203  return keys_[current_] == key;
204  }
205 
206  bool Done() const { return error_ || heap_.empty(); }
207 
208  void Next() {
209  if (error_) return;
210  if (streams_[current_]->tellg() <= positions_[current_].back()) {
211  ReadType(*(streams_[current_]), &(keys_[current_]));
212  if (streams_[current_]->fail()) {
213  FSTERROR() << "STTableReader: Error reading file: "
214  << sources_[current_];
215  error_ = true;
216  return;
217  }
218  std::push_heap(heap_.begin(), heap_.end(), *compare_);
219  } else {
220  heap_.pop_back();
221  }
222  if (!heap_.empty()) PopHeap();
223  }
224 
225  const std::string &GetKey() const { return keys_[current_]; }
226 
227  const T *GetEntry() const { return entry_.get(); }
228 
229  bool Error() const { return error_; }
230 
231  private:
232  // Comparison functor used to compare stream IDs in the heap.
233  struct Compare {
234  explicit Compare(const std::vector<std::string> *keys) : keys(keys) {}
235 
236  bool operator()(size_t i, size_t j) const {
237  return (*keys)[i] > (*keys)[j];
238  }
239 
240  private:
241  const std::vector<std::string> *keys;
242  };
243 
244  // Positions the stream at the position corresponding to the lower bound for
245  // the specified key.
246  void LowerBound(size_t id, std::string_view find_key) {
247  auto *strm = streams_[id];
248  const auto &positions = positions_[id];
249  if (positions.empty()) return;
250  size_t low = 0;
251  size_t high = positions.size() - 1;
252  while (low < high) {
253  size_t mid = (low + high) / 2;
254  strm->seekg(positions[mid]);
255  std::string key;
256  ReadType(*strm, &key);
257  if (key > find_key) {
258  high = mid;
259  } else if (key < find_key) {
260  low = mid + 1;
261  } else {
262  for (size_t i = mid; i > low; --i) {
263  strm->seekg(positions[i - 1]);
264  ReadType(*strm, &key);
265  if (key != find_key) {
266  strm->seekg(positions[i]);
267  return;
268  }
269  }
270  strm->seekg(positions[low]);
271  return;
272  }
273  }
274  strm->seekg(positions[low]);
275  }
276 
277  // Adds all streams to the heap.
278  void MakeHeap() {
279  heap_.clear();
280  for (size_t i = 0; i < streams_.size(); ++i) {
281  if (positions_[i].empty()) continue;
282  ReadType(*streams_[i], &(keys_[i]));
283  if (streams_[i]->fail()) {
284  FSTERROR() << "STTableReader: Error reading file: " << sources_[i];
285  error_ = true;
286  return;
287  }
288  heap_.push_back(i);
289  }
290  if (heap_.empty()) return;
291  std::make_heap(heap_.begin(), heap_.end(), *compare_);
292  PopHeap();
293  }
294 
295  // Positions the stream with the lowest key at the top of the heap, sets
296  // current_ to the ID of that stream, and reads the current entry from that
297  // stream.
298  void PopHeap() {
299  std::pop_heap(heap_.begin(), heap_.end(), *compare_);
300  current_ = heap_.back();
301  entry_.reset(entry_reader_(*streams_[current_]));
302  if (!entry_) error_ = true;
303  if (streams_[current_]->fail()) {
304  FSTERROR() << "STTableReader: Error reading entry for key: "
305  << keys_[current_] << ", file: " << sources_[current_];
306  error_ = true;
307  }
308  }
309 
310  Reader entry_reader_;
311  std::vector<std::istream *> streams_; // Input streams.
312  std::vector<std::string> sources_; // Corresponding file names.
313  std::vector<std::vector<int64_t>> positions_; // Index of positions.
314  std::vector<std::string> keys_; // Lowest unread key for each stream.
315  std::vector<int64_t>
316  heap_; // Heap containing ID of streams with unread keys.
317  int64_t current_; // ID of current stream to be read.
318  std::unique_ptr<Compare> compare_; // Functor comparing stream IDs.
319  mutable std::unique_ptr<T> entry_; // The currently read entry.
320  bool error_;
321 };
322 
323 // String-type table header reading function template on the entry header type.
324 // The Header type must provide at least the following interface:
325 //
326 // struct Header {
327 // void Read(std::istream &istrm, const string &source);
328 // };
329 template <class Header>
330 bool ReadSTTableHeader(const std::string &source, Header *header) {
331  if (source.empty()) {
332  LOG(ERROR) << "ReadSTTable: Can't read header from standard input";
333  return false;
334  }
335  std::ifstream strm(source, std::ios_base::in | std::ios_base::binary);
336  if (!strm) {
337  LOG(ERROR) << "ReadSTTableHeader: Could not open file: " << source;
338  return false;
339  }
340  int32_t magic_number = 0;
341  ReadType(strm, &magic_number);
342  int32_t file_version = 0;
343  ReadType(strm, &file_version);
344  if (magic_number != kSTTableMagicNumber) {
345  LOG(ERROR) << "ReadSTTableHeader: Wrong file type: " << source;
346  return false;
347  }
348  if (file_version != kSTTableFileVersion) {
349  LOG(ERROR) << "ReadSTTableHeader: Wrong file version: " << source;
350  return false;
351  }
352  int64_t i = -1;
353  strm.seekg(-static_cast<int>(sizeof(int64_t)), std::ios_base::end);
354  ReadType(strm, &i); // Reads number of entries
355  if (strm.fail()) {
356  LOG(ERROR) << "ReadSTTableHeader: Error reading file: " << source;
357  return false;
358  }
359  if (i == 0) return true; // No entry header to read.
360  strm.seekg(-2 * static_cast<int>(sizeof(int64_t)), std::ios_base::end);
361  ReadType(strm, &i); // Reads position for last entry in file.
362  strm.seekg(i);
363  std::string key;
364  ReadType(strm, &key);
365  if (!header->Read(strm, source + ":" + key)) {
366  LOG(ERROR) << "ReadSTTableHeader: Error reading FstHeader: " << source;
367  return false;
368  }
369  if (strm.fail()) {
370  LOG(ERROR) << "ReadSTTableHeader: Error reading file: " << source;
371  return false;
372  }
373  return true;
374 }
375 
376 bool IsSTTable(const std::string &source);
377 
378 } // namespace fst
379 
380 #endif // FST_EXTENSIONS_FAR_STTABLE_H_
static STTableReader< T, Reader > * Open(const std::string &source)
Definition: sttable.h:173
constexpr int32_t kSTTableMagicNumber
Definition: sttable.h:39
constexpr int32_t kSTTableFileVersion
Definition: sttable.h:40
static STTableWriter< T, Writer > * Create(const std::string &source)
Definition: sttable.h:63
void Add(std::string_view key, const T &t)
Definition: sttable.h:71
#define LOG(type)
Definition: log.h:49
bool Find(std::string_view key)
Definition: sttable.h:198
static STTableReader< T, Reader > * Open(const std::vector< std::string > &sources)
Definition: sttable.h:183
std::ostream & WriteType(std::ostream &strm, const T t)
Definition: util.h:211
#define FSTERROR()
Definition: util.h:53
STTableWriter(const std::string &source)
Definition: sttable.h:51
bool Error() const
Definition: sttable.h:229
STTableReader(const std::vector< std::string > &sources)
Definition: sttable.h:114
bool Done() const
Definition: sttable.h:206
const std::string & GetKey() const
Definition: sttable.h:225
bool ReadSTTableHeader(const std::string &source, Header *header)
Definition: sttable.h:330
bool IsSTTable(const std::string &source)
Definition: sttable.cc:27
const T * GetEntry() const
Definition: sttable.h:227
std::istream & ReadType(std::istream &strm, T *t)
Definition: util.h:65
bool Error() const
Definition: sttable.h:86