FST  openfst-1.7.2
OpenFst Library
sttable.h
Go to the documentation of this file.
1 // See www.openfst.org for extensive documentation on this weighted
2 // finite-state transducer library.
3 //
4 // A generic string-to-type table file format.
5 //
6 // This is not meant as a generalization of SSTable. This is more of a simple
7 // replacement for SSTable in order to provide an open-source implementation
8 // of the FAR format for the external version of the FST library.
9 
10 #ifndef FST_EXTENSIONS_FAR_STTABLE_H_
11 #define FST_EXTENSIONS_FAR_STTABLE_H_
12 
13 #include <algorithm>
14 #include <istream>
15 #include <memory>
16 
17 #include <fstream>
18 #include <fst/util.h>
19 
20 namespace fst {
21 
22 static constexpr int32 kSTTableMagicNumber = 2125656924;
23 static constexpr int32 kSTTableFileVersion = 1;
24 
25 // String-type table writing class for an object of type T using a functor
26 // Writer. The Writer functor must provide at least the following interface:
27 //
28 // struct Writer {
29 // void operator()(std::ostream &, const T &) const;
30 // };
31 template <class T, class Writer>
33  public:
34  explicit STTableWriter(const string &filename)
35  : stream_(filename, std::ios_base::out | std::ios_base::binary),
36  error_(false) {
37  WriteType(stream_, kSTTableMagicNumber);
38  WriteType(stream_, kSTTableFileVersion);
39  if (stream_.fail()) {
40  FSTERROR() << "STTableWriter::STTableWriter: Error writing to file: "
41  << filename;
42  error_ = true;
43  }
44  }
45 
46  static STTableWriter<T, Writer> *Create(const string &filename) {
47  if (filename.empty()) {
48  LOG(ERROR) << "STTableWriter: Writing to standard out unsupported.";
49  return nullptr;
50  }
51  return new STTableWriter<T, Writer>(filename);
52  }
53 
54  void Add(const string &key, const T &t) {
55  if (key == "") {
56  FSTERROR() << "STTableWriter::Add: Key empty: " << key;
57  error_ = true;
58  } else if (key < last_key_) {
59  FSTERROR() << "STTableWriter::Add: Key out of order: " << key;
60  error_ = true;
61  }
62  if (error_) return;
63  last_key_ = key;
64  positions_.push_back(stream_.tellp());
65  WriteType(stream_, key);
66  entry_writer_(stream_, t);
67  }
68 
69  bool Error() const { return error_; }
70 
72  WriteType(stream_, positions_);
73  WriteType(stream_, static_cast<int64>(positions_.size()));
74  }
75 
76  private:
77  Writer entry_writer_;
78  std::ofstream stream_;
79  std::vector<int64> positions_; // Position in file of each key-entry pair.
80  string last_key_; // Last key.
81  bool error_;
82 
83  STTableWriter(const STTableWriter &) = delete;
84  STTableWriter &operator=(const STTableWriter &) = delete;
85 };
86 
87 // String-type table reading class for object of type T using a functor Reader.
88 // Reader must provide at least the following interface:
89 //
90 // struct Reader {
91 // T *operator()(std::istream &) const;
92 // };
93 //
94 template <class T, class Reader>
96  public:
97  explicit STTableReader(const std::vector<string> &filenames)
98  : sources_(filenames), error_(false) {
99  compare_.reset(new Compare(&keys_));
100  keys_.resize(filenames.size());
101  streams_.resize(filenames.size(), 0);
102  positions_.resize(filenames.size());
103  for (size_t i = 0; i < filenames.size(); ++i) {
104  streams_[i] = new std::ifstream(
105  filenames[i], std::ios_base::in | std::ios_base::binary);
106  if (streams_[i]->fail()) {
107  FSTERROR() << "STTableReader::STTableReader: Error reading file: "
108  << filenames[i];
109  error_ = true;
110  return;
111  }
112  int32 magic_number = 0;
113  ReadType(*streams_[i], &magic_number);
114  int32 file_version = 0;
115  ReadType(*streams_[i], &file_version);
116  if (magic_number != kSTTableMagicNumber) {
117  FSTERROR() << "STTableReader::STTableReader: Wrong file type: "
118  << filenames[i];
119  error_ = true;
120  return;
121  }
122  if (file_version != kSTTableFileVersion) {
123  FSTERROR() << "STTableReader::STTableReader: Wrong file version: "
124  << filenames[i];
125  error_ = true;
126  return;
127  }
128  int64 num_entries;
129  streams_[i]->seekg(-static_cast<int>(sizeof(int64)), std::ios_base::end);
130  ReadType(*streams_[i], &num_entries);
131  if (num_entries > 0) {
132  streams_[i]->seekg(-static_cast<int>(sizeof(int64)) * (num_entries + 1),
133  std::ios_base::end);
134  positions_[i].resize(num_entries);
135  for (size_t j = 0; (j < num_entries) && (!streams_[i]->fail()); ++j) {
136  ReadType(*streams_[i], &(positions_[i][j]));
137  }
138  streams_[i]->seekg(positions_[i][0]);
139  if (streams_[i]->fail()) {
140  FSTERROR() << "STTableReader::STTableReader: Error reading file: "
141  << filenames[i];
142  error_ = true;
143  return;
144  }
145  }
146  }
147  MakeHeap();
148  }
149 
151  for (auto &stream : streams_) delete stream;
152  }
153 
154  static STTableReader<T, Reader> *Open(const string &filename) {
155  if (filename.empty()) {
156  LOG(ERROR) << "STTableReader: Operation not supported on standard input";
157  return nullptr;
158  }
159  std::vector<string> filenames;
160  filenames.push_back(filename);
161  return new STTableReader<T, Reader>(filenames);
162  }
163 
164  static STTableReader<T, Reader> *Open(const std::vector<string> &filenames) {
165  return new STTableReader<T, Reader>(filenames);
166  }
167 
168  void Reset() {
169  if (error_) return;
170  for (size_t i = 0; i < streams_.size(); ++i)
171  streams_[i]->seekg(positions_[i].front());
172  MakeHeap();
173  }
174 
175  bool Find(const string &key) {
176  if (error_) return false;
177  for (size_t i = 0; i < streams_.size(); ++i) LowerBound(i, key);
178  MakeHeap();
179  if (heap_.empty()) return false;
180  return keys_[current_] == key;
181  }
182 
183  bool Done() const { return error_ || heap_.empty(); }
184 
185  void Next() {
186  if (error_) return;
187  if (streams_[current_]->tellg() <= positions_[current_].back()) {
188  ReadType(*(streams_[current_]), &(keys_[current_]));
189  if (streams_[current_]->fail()) {
190  FSTERROR() << "STTableReader: Error reading file: "
191  << sources_[current_];
192  error_ = true;
193  return;
194  }
195  std::push_heap(heap_.begin(), heap_.end(), *compare_);
196  } else {
197  heap_.pop_back();
198  }
199  if (!heap_.empty()) PopHeap();
200  }
201 
202  const string &GetKey() const { return keys_[current_]; }
203 
204  const T *GetEntry() const { return entry_.get(); }
205 
206  bool Error() const { return error_; }
207 
208  private:
209  // Comparison functor used to compare stream IDs in the heap.
210  struct Compare {
211  explicit Compare(const std::vector<string> *keys) : keys(keys) {}
212 
213  bool operator()(size_t i, size_t j) const {
214  return (*keys)[i] > (*keys)[j];
215  };
216 
217  private:
218  const std::vector<string> *keys;
219  };
220 
221  // Positions the stream at the position corresponding to the lower bound for
222  // the specified key.
223  void LowerBound(size_t id, const string &find_key) {
224  auto *strm = streams_[id];
225  const auto &positions = positions_[id];
226  if (positions.empty()) return;
227  size_t low = 0;
228  size_t high = positions.size() - 1;
229  while (low < high) {
230  size_t mid = (low + high) / 2;
231  strm->seekg(positions[mid]);
232  string key;
233  ReadType(*strm, &key);
234  if (key > find_key) {
235  high = mid;
236  } else if (key < find_key) {
237  low = mid + 1;
238  } else {
239  for (size_t i = mid; i > low; --i) {
240  strm->seekg(positions[i - 1]);
241  ReadType(*strm, &key);
242  if (key != find_key) {
243  strm->seekg(positions[i]);
244  return;
245  }
246  }
247  strm->seekg(positions[low]);
248  return;
249  }
250  }
251  strm->seekg(positions[low]);
252  }
253 
254  // Adds all streams to the heap.
255  void MakeHeap() {
256  heap_.clear();
257  for (size_t i = 0; i < streams_.size(); ++i) {
258  if (positions_[i].empty()) continue;
259  ReadType(*streams_[i], &(keys_[i]));
260  if (streams_[i]->fail()) {
261  FSTERROR() << "STTableReader: Error reading file: " << sources_[i];
262  error_ = true;
263  return;
264  }
265  heap_.push_back(i);
266  }
267  if (heap_.empty()) return;
268  std::make_heap(heap_.begin(), heap_.end(), *compare_);
269  PopHeap();
270  }
271 
272  // Positions the stream with the lowest key at the top of the heap, sets
273  // current_ to the ID of that stream, and reads the current entry from that
274  // stream.
275  void PopHeap() {
276  std::pop_heap(heap_.begin(), heap_.end(), *compare_);
277  current_ = heap_.back();
278  entry_.reset(entry_reader_(*streams_[current_]));
279  if (!entry_) error_ = true;
280  if (streams_[current_]->fail()) {
281  FSTERROR() << "STTableReader: Error reading entry for key: "
282  << keys_[current_] << ", file: " << sources_[current_];
283  error_ = true;
284  }
285  }
286 
287  Reader entry_reader_;
288  std::vector<std::istream *> streams_; // Input streams.
289  std::vector<string> sources_; // Corresponding file names.
290  std::vector<std::vector<int64>> positions_; // Index of positions.
291  std::vector<string> keys_; // Lowest unread key for each stream.
292  std::vector<int64> heap_; // Heap containing ID of streams with unread keys.
293  int64 current_; // ID of current stream to be read.
294  std::unique_ptr<Compare> compare_; // Functor comparing stream IDs.
295  mutable std::unique_ptr<T> entry_; // The currently read entry.
296  bool error_;
297 };
298 
299 // String-type table header reading function template on the entry header type.
300 // The Header type must provide at least the following interface:
301 //
302 // struct Header {
303 // void Read(std::istream &istrm, const string &filename);
304 // };
305 template <class Header>
306 bool ReadSTTableHeader(const string &filename, Header *header) {
307  if (filename.empty()) {
308  LOG(ERROR) << "ReadSTTable: Can't read header from standard input";
309  return false;
310  }
311  std::ifstream strm(filename, std::ios_base::in | std::ios_base::binary);
312  if (!strm) {
313  LOG(ERROR) << "ReadSTTableHeader: Could not open file: " << filename;
314  return false;
315  }
316  int32 magic_number = 0;
317  ReadType(strm, &magic_number);
318  int32 file_version = 0;
319  ReadType(strm, &file_version);
320  if (magic_number != kSTTableMagicNumber) {
321  LOG(ERROR) << "ReadSTTableHeader: Wrong file type: " << filename;
322  return false;
323  }
324  if (file_version != kSTTableFileVersion) {
325  LOG(ERROR) << "ReadSTTableHeader: Wrong file version: " << filename;
326  return false;
327  }
328  int64 i = -1;
329  strm.seekg(-static_cast<int>(sizeof(int64)), std::ios_base::end);
330  ReadType(strm, &i); // Reads number of entries
331  if (strm.fail()) {
332  LOG(ERROR) << "ReadSTTableHeader: Error reading file: " << filename;
333  return false;
334  }
335  if (i == 0) return true; // No entry header to read.
336  strm.seekg(-2 * static_cast<int>(sizeof(int64)), std::ios_base::end);
337  ReadType(strm, &i); // Reads position for last entry in file.
338  strm.seekg(i);
339  string key;
340  ReadType(strm, &key);
341  header->Read(strm, filename + ":" + key);
342  if (strm.fail()) {
343  LOG(ERROR) << "ReadSTTableHeader: Error reading file: " << filename;
344  return false;
345  }
346  return true;
347 }
348 
349 bool IsSTTable(const string &filename);
350 
351 } // namespace fst
352 
353 #endif // FST_EXTENSIONS_FAR_STTABLE_H_
static STTableWriter< T, Writer > * Create(const string &filename)
Definition: sttable.h:46
bool IsSTTable(const string &filename)
Definition: sttable.cc:9
bool ReadSTTableHeader(const string &filename, Header *header)
Definition: sttable.h:306
static STTableReader< T, Reader > * Open(const std::vector< string > &filenames)
Definition: sttable.h:164
#define LOG(type)
Definition: log.h:48
static STTableReader< T, Reader > * Open(const string &filename)
Definition: sttable.h:154
int64_t int64
Definition: types.h:27
std::ostream & WriteType(std::ostream &strm, const T t)
Definition: util.h:155
#define FSTERROR()
Definition: util.h:35
bool Error() const
Definition: sttable.h:206
bool Done() const
Definition: sttable.h:183
bool Find(const string &key)
Definition: sttable.h:175
void Add(const string &key, const T &t)
Definition: sttable.h:54
const string & GetKey() const
Definition: sttable.h:202
STTableReader(const std::vector< string > &filenames)
Definition: sttable.h:97
int32_t int32
Definition: types.h:26
STTableWriter(const string &filename)
Definition: sttable.h:34
const T * GetEntry() const
Definition: sttable.h:204
std::istream & ReadType(std::istream &strm, T *t)
Definition: util.h:47
bool Error() const
Definition: sttable.h:69