FST  openfst-1.6.1
OpenFst Library
sttable.h
Go to the documentation of this file.
1 // See www.openfst.org for extensive documentation on this weighted
2 // finite-state transducer library.
3 //
4 // A generic string-to-type table file format.
5 //
6 // This is not meant as a generalization of SSTable. This is more of a simple
7 // replacement for SSTable in order to provide an open-source implementation
8 // of the FAR format for the external version of the FST library.
9 
10 #ifndef FST_EXTENSIONS_FAR_STTABLE_H_
11 #define FST_EXTENSIONS_FAR_STTABLE_H_
12 
13 #include <algorithm>
14 #include <istream>
15 #include <memory>
16 
17 #include <fstream>
18 #include <fst/util.h>
19 
20 namespace fst {
21 
22 static constexpr int32 kSTTableMagicNumber = 2125656924;
23 static constexpr int32 kSTTableFileVersion = 1;
24 
25 // String-type table writing class for an object of type T using a functor
26 // Writer. The Writer functor must provide at least the following interface:
27 //
28 // struct Writer {
29 // void operator()(std::ostream &, const T &) const;
30 // };
31 template <class T, class Writer>
33  public:
34  explicit STTableWriter(const string &filename)
35  : stream_(filename, std::ios_base::out | std::ios_base::binary),
36  error_(false) {
37  WriteType(stream_, kSTTableMagicNumber);
38  WriteType(stream_, kSTTableFileVersion);
39  if (stream_.fail()) {
40  FSTERROR() << "STTableWriter::STTableWriter: Error writing to file: "
41  << filename;
42  error_ = true;
43  }
44  }
45 
46  static STTableWriter<T, Writer> *Create(const string &filename) {
47  if (filename.empty()) {
48  LOG(ERROR) << "STTableWriter: Writing to standard out unsupported.";
49  return nullptr;
50  }
51  return new STTableWriter<T, Writer>(filename);
52  }
53 
54  void Add(const string &key, const T &t) {
55  if (key == "") {
56  FSTERROR() << "STTableWriter::Add: Key empty: " << key;
57  error_ = true;
58  } else if (key < last_key_) {
59  FSTERROR() << "STTableWriter::Add: Key out of order: " << key;
60  error_ = true;
61  }
62  if (error_) return;
63  last_key_ = key;
64  positions_.push_back(stream_.tellp());
65  WriteType(stream_, key);
66  entry_writer_(stream_, t);
67  }
68 
69  bool Error() const { return error_; }
70 
72  WriteType(stream_, positions_);
73  WriteType(stream_, static_cast<int64>(positions_.size()));
74  }
75 
76  private:
77  Writer entry_writer_;
78  std::ofstream stream_;
79  std::vector<int64> positions_; // Position in file of each key-entry pair.
80  string last_key_; // Last key.
81  bool error_;
82 
83  STTableWriter(const STTableWriter &) = delete;
84  STTableWriter &operator=(const STTableWriter &) = delete;
85 };
86 
87 // String-type table reading class for object of type T using a functor Reader.
88 // Reader must provide at least the following interface:
89 //
90 // struct Reader {
91 // T *operator()(std::istream &) const;
92 // };
93 //
94 template <class T, class Reader>
96  public:
97  explicit STTableReader(const std::vector<string> &filenames)
98  : sources_(filenames), error_(false) {
99  compare_.reset(new Compare(&keys_));
100  keys_.resize(filenames.size());
101  streams_.resize(filenames.size(), 0);
102  positions_.resize(filenames.size());
103  for (size_t i = 0; i < filenames.size(); ++i) {
104  streams_[i] = new std::ifstream(
105  filenames[i], std::ios_base::in | std::ios_base::binary);
106  int32 magic_number = 0;
107  ReadType(*streams_[i], &magic_number);
108  int32 file_version = 0;
109  ReadType(*streams_[i], &file_version);
110  if (magic_number != kSTTableMagicNumber) {
111  FSTERROR() << "STTableReader::STTableReader: Wrong file type: "
112  << filenames[i];
113  error_ = true;
114  return;
115  }
116  if (file_version != kSTTableFileVersion) {
117  FSTERROR() << "STTableReader::STTableReader: Wrong file version: "
118  << filenames[i];
119  error_ = true;
120  return;
121  }
122  int64 num_entries;
123  streams_[i]->seekg(-static_cast<int>(sizeof(int64)), std::ios_base::end);
124  ReadType(*streams_[i], &num_entries);
125  if (num_entries > 0) {
126  streams_[i]->seekg(-static_cast<int>(sizeof(int64)) * (num_entries + 1),
127  std::ios_base::end);
128  positions_[i].resize(num_entries);
129  for (size_t j = 0; (j < num_entries) && (!streams_[i]->fail()); ++j) {
130  ReadType(*streams_[i], &(positions_[i][j]));
131  }
132  streams_[i]->seekg(positions_[i][0]);
133  if (streams_[i]->fail()) {
134  FSTERROR() << "STTableReader::STTableReader: Error reading file: "
135  << filenames[i];
136  error_ = true;
137  return;
138  }
139  }
140  }
141  MakeHeap();
142  }
143 
145  for (auto &stream : streams_) delete stream;
146  }
147 
148  static STTableReader<T, Reader> *Open(const string &filename) {
149  if (filename.empty()) {
150  LOG(ERROR) << "STTableReader: Operation not supported on standard input";
151  return nullptr;
152  }
153  std::vector<string> filenames;
154  filenames.push_back(filename);
155  return new STTableReader<T, Reader>(filenames);
156  }
157 
158  static STTableReader<T, Reader> *Open(const std::vector<string> &filenames) {
159  return new STTableReader<T, Reader>(filenames);
160  }
161 
162  void Reset() {
163  if (error_) return;
164  for (size_t i = 0; i < streams_.size(); ++i)
165  streams_[i]->seekg(positions_[i].front());
166  MakeHeap();
167  }
168 
169  bool Find(const string &key) {
170  if (error_) return false;
171  for (size_t i = 0; i < streams_.size(); ++i) LowerBound(i, key);
172  MakeHeap();
173  if (heap_.empty()) return false;
174  return keys_[current_] == key;
175  }
176 
177  bool Done() const { return error_ || heap_.empty(); }
178 
179  void Next() {
180  if (error_) return;
181  if (streams_[current_]->tellg() <= positions_[current_].back()) {
182  ReadType(*(streams_[current_]), &(keys_[current_]));
183  if (streams_[current_]->fail()) {
184  FSTERROR() << "STTableReader: Error reading file: "
185  << sources_[current_];
186  error_ = true;
187  return;
188  }
189  std::push_heap(heap_.begin(), heap_.end(), *compare_);
190  } else {
191  heap_.pop_back();
192  }
193  if (!heap_.empty()) PopHeap();
194  }
195 
196  const string &GetKey() const { return keys_[current_]; }
197 
198  const T *GetEntry() const { return entry_.get(); }
199 
200  bool Error() const { return error_; }
201 
202  private:
203  // Comparison functor used to compare stream IDs in the heap.
204  struct Compare {
205  explicit Compare(const std::vector<string> *keys) : keys(keys) {}
206 
207  bool operator()(size_t i, size_t j) const {
208  return (*keys)[i] > (*keys)[j];
209  };
210 
211  private:
212  const std::vector<string> *keys;
213  };
214 
215  // Positions the stream at the position corresponding to the lower bound for
216  // the specified key.
217  void LowerBound(size_t id, const string &find_key) {
218  auto *strm = streams_[id];
219  const auto &positions = positions_[id];
220  if (positions.empty()) return;
221  size_t low = 0;
222  size_t high = positions.size() - 1;
223  while (low < high) {
224  size_t mid = (low + high) / 2;
225  strm->seekg(positions[mid]);
226  string key;
227  ReadType(*strm, &key);
228  if (key > find_key) {
229  high = mid;
230  } else if (key < find_key) {
231  low = mid + 1;
232  } else {
233  for (size_t i = mid; i > low; --i) {
234  strm->seekg(positions[i - 1]);
235  ReadType(*strm, &key);
236  if (key != find_key) {
237  strm->seekg(positions[i]);
238  return;
239  }
240  }
241  strm->seekg(positions[low]);
242  return;
243  }
244  }
245  strm->seekg(positions[low]);
246  }
247 
248  // Adds all streams to the heap.
249  void MakeHeap() {
250  heap_.clear();
251  for (size_t i = 0; i < streams_.size(); ++i) {
252  if (positions_[i].empty()) continue;
253  ReadType(*streams_[i], &(keys_[i]));
254  if (streams_[i]->fail()) {
255  FSTERROR() << "STTableReader: Error reading file: " << sources_[i];
256  error_ = true;
257  return;
258  }
259  heap_.push_back(i);
260  }
261  if (heap_.empty()) return;
262  std::make_heap(heap_.begin(), heap_.end(), *compare_);
263  PopHeap();
264  }
265 
266  // Positions the stream with the lowest key at the top of the heap, sets
267  // current_ to the ID of that stream, and reads the current entry from that
268  // stream.
269  void PopHeap() {
270  std::pop_heap(heap_.begin(), heap_.end(), *compare_);
271  current_ = heap_.back();
272  entry_.reset(entry_reader_(*streams_[current_]));
273  if (!entry_) error_ = true;
274  if (streams_[current_]->fail()) {
275  FSTERROR() << "STTableReader: Error reading entry for key: "
276  << keys_[current_] << ", file: " << sources_[current_];
277  error_ = true;
278  }
279  }
280 
281  Reader entry_reader_;
282  std::vector<std::istream *> streams_; // Input streams.
283  std::vector<string> sources_; // Corresponding file names.
284  std::vector<std::vector<int64>> positions_; // Index of positions.
285  std::vector<string> keys_; // Lowest unread key for each stream.
286  std::vector<int64> heap_; // Heap containing ID of streams with unread keys.
287  int64 current_; // ID of current stream to be read.
288  std::unique_ptr<Compare> compare_; // Functor comparing stream IDs.
289  mutable std::unique_ptr<T> entry_; // The currently read entry.
290  bool error_;
291 };
292 
293 // String-type table header reading function template on the entry header type.
294 // The Header type must provide at least the following interface:
295 //
296 // struct Header {
297 // void Read(std::istream &istrm, const string &filename);
298 // };
299 template <class Header>
300 bool ReadSTTableHeader(const string &filename, Header *header) {
301  if (filename.empty()) {
302  LOG(ERROR) << "ReadSTTable: Can't read header from standard input";
303  return false;
304  }
305  std::ifstream strm(filename, std::ios_base::in | std::ios_base::binary);
306  if (!strm) {
307  LOG(ERROR) << "ReadSTTableHeader: Could not open file: " << filename;
308  return false;
309  }
310  int32 magic_number = 0;
311  ReadType(strm, &magic_number);
312  int32 file_version = 0;
313  ReadType(strm, &file_version);
314  if (magic_number != kSTTableMagicNumber) {
315  LOG(ERROR) << "ReadSTTableHeader: Wrong file type: " << filename;
316  return false;
317  }
318  if (file_version != kSTTableFileVersion) {
319  LOG(ERROR) << "ReadSTTableHeader: Wrong file version: " << filename;
320  return false;
321  }
322  int64 i = -1;
323  strm.seekg(-static_cast<int>(sizeof(int64)), std::ios_base::end);
324  ReadType(strm, &i); // Reads number of entries
325  if (strm.fail()) {
326  LOG(ERROR) << "ReadSTTableHeader: Error reading file: " << filename;
327  return false;
328  }
329  if (i == 0) return true; // No entry header to read.
330  strm.seekg(-2 * static_cast<int>(sizeof(int64)), std::ios_base::end);
331  ReadType(strm, &i); // Reads position for last entry in file.
332  strm.seekg(i);
333  string key;
334  ReadType(strm, &key);
335  header->Read(strm, filename + ":" + key);
336  if (strm.fail()) {
337  LOG(ERROR) << "ReadSTTableHeader: Error reading file: " << filename;
338  return false;
339  }
340  return true;
341 }
342 
343 bool IsSTTable(const string &filename);
344 
345 } // namespace fst
346 
347 #endif // FST_EXTENSIONS_FAR_STTABLE_H_
static STTableWriter< T, Writer > * Create(const string &filename)
Definition: sttable.h:46
bool IsSTTable(const string &filename)
Definition: sttable.cc:9
bool ReadSTTableHeader(const string &filename, Header *header)
Definition: sttable.h:300
static STTableReader< T, Reader > * Open(const std::vector< string > &filenames)
Definition: sttable.h:158
#define LOG(type)
Definition: log.h:48
static STTableReader< T, Reader > * Open(const string &filename)
Definition: sttable.h:148
int64_t int64
Definition: types.h:27
std::ostream & WriteType(std::ostream &strm, const T t)
Definition: util.h:152
#define FSTERROR()
Definition: util.h:32
bool Error() const
Definition: sttable.h:200
bool Done() const
Definition: sttable.h:177
bool Find(const string &key)
Definition: sttable.h:169
void Add(const string &key, const T &t)
Definition: sttable.h:54
const string & GetKey() const
Definition: sttable.h:196
STTableReader(const std::vector< string > &filenames)
Definition: sttable.h:97
int32_t int32
Definition: types.h:26
STTableWriter(const string &filename)
Definition: sttable.h:34
const T * GetEntry() const
Definition: sttable.h:198
std::istream & ReadType(std::istream &strm, T *t)
Definition: util.h:44
bool Error() const
Definition: sttable.h:69