FST  openfst-1.8.4
OpenFst Library
util.h
Go to the documentation of this file.
1 // Copyright 2005-2024 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the 'License');
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an 'AS IS' BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 //
15 // See www.openfst.org for extensive documentation on this weighted
16 // finite-state transducer library.
17 //
18 // FST utility inline definitions.
19 
20 #ifndef FST_UTIL_H_
21 #define FST_UTIL_H_
22 
23 #include <array>
24 #include <cstddef>
25 #include <cstdint>
26 #include <ios>
27 #include <iostream>
28 #include <istream>
29 #include <iterator>
30 #include <list>
31 #include <map>
32 #include <optional>
33 #include <ostream>
34 #include <set>
35 #include <sstream>
36 #include <string>
37 #include <tuple>
38 #include <type_traits>
39 #include <unordered_map>
40 #include <unordered_set>
41 #include <utility>
42 #include <vector>
43 
44 #include <fst/compat.h>
45 #include <fst/flags.h>
46 #include <fst/log.h>
47 #include <fstream>
48 #include <fst/mapped-file.h>
49 #include <unordered_map>
50 #include <string_view>
51 #include <optional>
52 
53 // Utility for error handling.
54 
55 DECLARE_bool(fst_error_fatal);
56 
57 #define FSTERROR() \
58  (FST_FLAGS_fst_error_fatal ? LOG(FATAL) : LOG(ERROR))
59 
60 namespace fst {
61 
62 // Utility for type I/O. For portability of serialized objects across
63 // architectures, care must be taken so that only fixed-size types (like
64 // `int32_t`) are used with `WriteType`/`ReadType`, not types that may differ in
65 // size depending on the architecture, such as `int`. For `enum` types, a
66 // fixed-size base (like `enum E : int32_t`) should be used. Objects are
67 // written and read in the host byte order, so will not be portable across
68 // different endiannesses.
69 
namespace internal {
// True when `T` is a scalar handled by `ReadType`/`WriteType`: any arithmetic
// type (boolean, integral, floating-point) or any enumeration.
template <class T>
inline constexpr bool IsScalarIOTypeV =
    std::disjunction_v<std::is_arithmetic<T>, std::is_enum<T>>;
}  // namespace internal
76 
77 // Reads types from an input stream.
78 
// Class-type case: forwards to the object's own `Read` member and returns
// whatever stream reference that call yields.
template <class T, typename std::enable_if_t<std::is_class_v<T>, T> * = nullptr>
inline std::istream &ReadType(std::istream &strm, T *t) {
  auto &object = *t;
  return object.Read(strm);
}
84 
85 // Numeric (boolean, integral, floating-point) or enum case.
86 template <class T, typename std::enable_if_t<internal::IsScalarIOTypeV<T>, T>
87  * = nullptr>
88 inline std::istream &ReadType(std::istream &strm, T *t) {
89  return strm.read(reinterpret_cast<char *>(t), sizeof(T));
90 }
91 
92 // Numeric (boolean, integral, floating-point) or enum case only.
93 template <class T>
94 inline std::istream &ReadType(std::istream &strm, size_t n, T *t) {
95  static_assert(internal::IsScalarIOTypeV<T>,
96  "Type not supported for batch read.");
97  return strm.read(reinterpret_cast<char *>(t), sizeof(T) * n);
98 }
99 
100 // String case.
101 inline std::istream &ReadType(std::istream &strm, std::string *s) {
102  s->clear();
103  int32_t ns = 0;
104  ReadType(strm, &ns);
105  if (ns <= 0) return strm;
106  s->resize(ns);
107  ReadType(strm, ns, s->data());
108  return strm;
109 }
110 
// Forward declarations of the container overloads, so that the templates
// defined below can refer to them before their definitions appear.
template <class... Args>
std::istream &ReadType(std::istream &strm, std::vector<Args...> *c);

template <class... Args>
std::istream &ReadType(std::istream &strm, std::list<Args...> *c);

template <class... Args>
std::istream &ReadType(std::istream &strm, std::set<Args...> *c);

template <class... Args>
std::istream &ReadType(std::istream &strm, std::map<Args...> *c);

template <class... Args>
std::istream &ReadType(std::istream &strm, std::unordered_map<Args...> *c);

template <class... Args>
std::istream &ReadType(std::istream &strm, std::unordered_set<Args...> *c);
124 
// Pair case: reads `first`, then `second`, each through its own `ReadType`
// overload.
template <typename S, typename T>
inline std::istream &ReadType(std::istream &strm, std::pair<S, T> *p) {
  auto &[first, second] = *p;
  ReadType(strm, &first);
  ReadType(strm, &second);
  return strm;
}
132 
// Pair with a const first member. The constness of `first` is cast away so the
// member can be filled in place, then `second` is read.
template <typename S, typename T>
inline std::istream &ReadType(std::istream &strm, std::pair<const S, T> *p) {
  auto &[key, mapped] = *p;
  ReadType(strm, const_cast<S *>(&key));
  ReadType(strm, &mapped);
  return strm;
}
139 
140 namespace internal {
141 template <class C, class ReserveFn>
142 std::istream &ReadContainerType(std::istream &strm, C *c, ReserveFn reserve) {
143  c->clear();
144  int64_t n = 0;
145  ReadType(strm, &n);
146  reserve(c, n);
147  auto insert = std::inserter(*c, c->begin());
148  for (int64_t i = 0; i < n; ++i) {
149  typename C::value_type value;
150  ReadType(strm, &value);
151  *insert = value;
152  }
153  return strm;
154 }
155 
156 // Generic vector case.
157 template <typename T, class A,
158  typename std::enable_if_t<std::is_class_v<T>, T> * = nullptr>
159 inline std::istream &ReadVectorType(std::istream &strm, std::vector<T, A> *c) {
161  strm, c, [](decltype(c) v, int n) { v->reserve(n); });
162 }
163 
164 // Vector of numerics (boolean, integral, floating-point, char) or enum case.
165 template <
166  typename T, class A,
167  typename std::enable_if_t<internal::IsScalarIOTypeV<T>, T> * = nullptr>
168 inline std::istream &ReadVectorType(std::istream &strm, std::vector<T, A> *c) {
169  c->clear();
170  int64_t n = 0;
171  ReadType(strm, &n);
172  if (n == 0) return strm;
173  c->resize(n);
174  ReadType(strm, n, c->data());
175  return strm;
176 }
177 } // namespace internal
178 
179 template <class T, size_t N>
180 std::istream &ReadType(std::istream &strm, std::array<T, N> *c) {
181  if constexpr (internal::IsScalarIOTypeV<T>) {
182  ReadType(strm, c->size(), c->data());
183  } else {
184  for (auto &v : *c) ReadType(strm, &v);
185  }
186  return strm;
187 }
188 
189 template <class... T>
190 std::istream &ReadType(std::istream &strm, std::vector<T...> *c) {
191  return internal::ReadVectorType(strm, c);
192 }
193 
194 template <class... T>
195 std::istream &ReadType(std::istream &strm, std::list<T...> *c) {
196  return internal::ReadContainerType(strm, c, [](decltype(c) v, int n) {});
197 }
198 
199 template <class... T>
200 std::istream &ReadType(std::istream &strm, std::set<T...> *c) {
201  return internal::ReadContainerType(strm, c, [](decltype(c) v, int n) {});
202 }
203 
204 template <class... T>
205 std::istream &ReadType(std::istream &strm, std::map<T...> *c) {
206  return internal::ReadContainerType(strm, c, [](decltype(c) v, int n) {});
207 }
208 
209 template <class... T>
210 std::istream &ReadType(std::istream &strm, std::unordered_set<T...> *c) {
212  strm, c, [](decltype(c) v, int n) { v->reserve(n); });
213 }
214 
215 template <class... T>
216 std::istream &ReadType(std::istream &strm, std::unordered_map<T...> *c) {
218  strm, c, [](decltype(c) v, int n) { v->reserve(n); });
219 }
220 
221 // Writes types to an output stream.
222 
// Class-type case: forwards to the object's `Write` member. Types convertible
// to `std::string_view` are excluded here because the string overload handles
// them.
template <class T,
          typename std::enable_if_t<
              std::is_class_v<T> && !std::is_convertible_v<T, std::string_view>,
              T> * = nullptr>
inline std::ostream &WriteType(std::ostream &strm, const T t) {
  t.Write(strm);
  return strm;
}
233 
234 // Numeric (boolean, integral, floating-point) or enum case.
235 template <class T, typename std::enable_if_t<internal::IsScalarIOTypeV<T>, T>
236  * = nullptr>
237 inline std::ostream &WriteType(std::ostream &strm, const T t) {
238  return strm.write(reinterpret_cast<const char *>(&t), sizeof(T));
239 }
240 
241 // Numeric (boolean, integral, floating-point) or enum case only.
242 template <class T>
243 inline std::ostream &WriteType(std::ostream &strm, size_t n, const T *t) {
244  static_assert(internal::IsScalarIOTypeV<T>,
245  "Type not supported for batch write.");
246  return strm.write(reinterpret_cast<const char *>(t), sizeof(T) * n);
247 }
248 
249 inline std::ostream &WriteType(std::ostream &strm, std::string_view s) {
250  int32_t ns = s.size();
251  WriteType(strm, ns);
252  return strm.write(s.data(), ns);
253 }
254 
// Forward declarations of the container overloads, so that the templates
// defined below can refer to them before their definitions appear.

template <typename... Args>
std::ostream &WriteType(std::ostream &strm, const std::vector<Args...> &c);

template <typename... Args>
std::ostream &WriteType(std::ostream &strm, const std::list<Args...> &c);

template <typename... Args>
std::ostream &WriteType(std::ostream &strm, const std::set<Args...> &c);

template <typename... Args>
std::ostream &WriteType(std::ostream &strm, const std::map<Args...> &c);

template <typename... Args>
std::ostream &WriteType(std::ostream &strm,
                        const std::unordered_map<Args...> &c);

template <typename... Args>
std::ostream &WriteType(std::ostream &strm,
                        const std::unordered_set<Args...> &c);
274 
// Pair case: writes `first`, then `second`, each through its own `WriteType`
// overload.
template <typename S, typename T>
inline std::ostream &WriteType(std::ostream &strm, const std::pair<S, T> &p) {
  const auto &[first, second] = p;
  WriteType(strm, first);
  WriteType(strm, second);
  return strm;
}
282 
283 namespace internal {
// Writes every element of `c` in iteration order, without a leading count.
template <class C>
std::ostream &WriteSequence(std::ostream &strm, const C &c) {
  for (auto it = c.begin(); it != c.end(); ++it) {
    WriteType(strm, *it);
  }
  return strm;
}
291 
292 template <class C>
293 std::ostream &WriteContainer(std::ostream &strm, const C &c) {
294  const int64_t n = c.size();
295  WriteType(strm, n);
296  WriteSequence(strm, c);
297  return strm;
298 }
299 } // namespace internal
300 
301 template <class T, size_t N>
302 std::ostream &WriteType(std::ostream &strm, const std::array<T, N> &c) {
303  return internal::WriteSequence(strm, c);
304 }
305 
306 template <typename... T>
307 std::ostream &WriteType(std::ostream &strm, const std::vector<T...> &c) {
308  return internal::WriteContainer(strm, c);
309 }
310 
311 template <typename... T>
312 std::ostream &WriteType(std::ostream &strm, const std::list<T...> &c) {
313  return internal::WriteContainer(strm, c);
314 }
315 
316 template <typename... T>
317 std::ostream &WriteType(std::ostream &strm, const std::set<T...> &c) {
318  return internal::WriteContainer(strm, c);
319 }
320 
321 template <typename... T>
322 std::ostream &WriteType(std::ostream &strm, const std::map<T...> &c) {
323  return internal::WriteContainer(strm, c);
324 }
325 
326 template <typename... T>
327 std::ostream &WriteType(std::ostream &strm, const std::unordered_map<T...> &c) {
328  return internal::WriteContainer(strm, c);
329 }
330 
331 template <typename... T>
332 std::ostream &WriteType(std::ostream &strm, const std::unordered_set<T...> &c) {
333  return internal::WriteContainer(strm, c);
334 }
335 
336 // Utilities for converting between int64_t or Weight and string.
337 
// Parses a signed 64-bit integer, written in the given `base`, from `s`. The
// text may contain only digits, optionally preceded by a minus sign; prefixes
// such as "0x" are not accepted. Yields the parsed value only when the whole
// string was consumed by the parse; otherwise yields `std::nullopt`.
std::optional<int64_t> ParseInt64(std::string_view s, int base = 10);
343 
// Converts `s` to a 64-bit integer. `source` and `nline` identify where `s`
// came from (`ReadIntPairs` passes the file name and the current line number).
// NOTE(review): the definition is not in this header. `error`, when non-null,
// appears to report conversion failure (`ReadIntPairs` gives up when it is
// set) -- confirm against the definition.
int64_t StrToInt64(std::string_view s, std::string_view source, size_t nline,
                   bool *error = nullptr);
346 
347 template <typename Weight>
348 Weight StrToWeight(std::string_view s) {
349  Weight w;
350  std::istringstream strm(std::string{s});
351  strm >> w;
352  if (!strm) {
353  FSTERROR() << "StrToWeight: Bad weight: " << s;
354  return Weight::NoWeight();
355  }
356  return w;
357 }
358 
// Formats a weight as text using its stream insertion operator, with the
// stream precision set to 9.
template <typename Weight>
std::string WeightToStr(Weight w) {
  std::ostringstream buffer;
  buffer.precision(9);
  buffer << w;
  return buffer.str();
}
366 
367 // Utilities for reading/writing integer pairs (typically labels).
368 
369 template <typename I>
370 bool ReadIntPairs(std::string_view source,
371  std::vector<std::pair<I, I>> *pairs) {
372  std::ifstream strm(std::string(source), std::ios_base::in);
373  if (!strm) {
374  LOG(ERROR) << "ReadIntPairs: Can't open file: " << source;
375  return false;
376  }
377  const int kLineLen = 8096;
378  char line[kLineLen];
379  size_t nline = 0;
380  pairs->clear();
381  while (strm.getline(line, kLineLen)) {
382  ++nline;
383  std::vector<std::string_view> col =
384  StrSplit(line, ByAnyChar("\n\t "), SkipEmpty());
385  // empty line or comment?
386  if (col.empty() || col[0].empty() || col[0][0] == '#') continue;
387  if (col.size() != 2) {
388  LOG(ERROR) << "ReadIntPairs: Bad number of columns, "
389  << "file = " << source << ", line = " << nline;
390  return false;
391  }
392  bool err;
393  I i1 = StrToInt64(col[0], source, nline, &err);
394  if (err) return false;
395  I i2 = StrToInt64(col[1], source, nline, &err);
396  if (err) return false;
397  pairs->emplace_back(i1, i2);
398  }
399  return true;
400 }
401 
402 template <typename I>
403 bool WriteIntPairs(std::string_view source,
404  const std::vector<std::pair<I, I>> &pairs) {
405  std::ofstream fstrm;
406  if (!source.empty()) {
407  fstrm.open(std::string(source));
408  if (!fstrm) {
409  LOG(ERROR) << "WriteIntPairs: Can't open file: " << source;
410  return false;
411  }
412  }
413  std::ostream &ostrm = fstrm.is_open() ? fstrm : std::cout;
414  for (const auto &pair : pairs) {
415  ostrm << pair.first << "\t" << pair.second << "\n";
416  }
417  return !!ostrm;
418 }
419 
420 // Utilities for reading/writing label pairs.
421 
// Label-typed front end for `ReadIntPairs`; returns its result unchanged.
template <typename Label>
bool ReadLabelPairs(std::string_view source,
                    std::vector<std::pair<Label, Label>> *pairs) {
  const bool ok = ReadIntPairs(source, pairs);
  return ok;
}
427 
// Label-typed front end for `WriteIntPairs`; returns its result unchanged.
template <typename Label>
bool WriteLabelPairs(std::string_view source,
                     const std::vector<std::pair<Label, Label>> &pairs) {
  const bool ok = WriteIntPairs(source, pairs);
  return ok;
}
433 
// Utility for converting a type name to a legal C symbol.
// NOTE(review): the definition is not in this header. The mutable pointer
// parameter and `void` return suggest `*s` is rewritten in place -- confirm
// against the definition.

void ConvertToLegalCSymbol(std::string *s);
437 
438 // Utilities for stream I/O.
439 
440 bool AlignInput(std::istream &strm, size_t align = MappedFile::kArchAlignment);
441 bool AlignOutput(std::ostream &strm, size_t align = MappedFile::kArchAlignment);
442 
// An associative container for which testing membership is faster than an STL
// set if members are restricted to an interval that excludes most non-members.
//
// `Key` must be usable as a non-type template parameter, provide ==, != and <,
// and support the integer arithmetic used by the dense-range test in Member().
// `NoKey` marks "no bound" while the set is empty and must not be used as a
// stored key. Find() returns an STL const_iterator to the match found,
// otherwise it equals End().
template <class Key, Key NoKey>
class CompactSet {
 public:
  using const_iterator = typename std::set<Key>::const_iterator;

  // Creates an empty set; both bounds are `NoKey`.
  CompactSet() : min_key_(NoKey), max_key_(NoKey) {}

  CompactSet(const CompactSet &) = default;

  // Adds `key` and widens the bounds to include it.
  void Insert(Key key) {
    set_.insert(key);
    if (min_key_ == NoKey || key < min_key_) min_key_ = key;
    if (max_key_ == NoKey || max_key_ < key) max_key_ = key;
  }

  // Removes `key` (a no-op if absent) and refreshes the bounds.
  void Erase(Key key) {
    set_.erase(key);
    if (set_.empty()) {
      min_key_ = max_key_ = NoKey;
    } else {
      // Take the bounds from the remaining elements. Stepping a bound by one
      // could land it on `NoKey` (e.g. keys {-2, 0} with NoKey == -1), after
      // which Find() and Member() would treat a non-empty set as empty.
      min_key_ = *set_.begin();
      max_key_ = *set_.rbegin();
    }
  }

  // Removes all keys and resets both bounds to `NoKey`.
  void Clear() {
    set_.clear();
    min_key_ = max_key_ = NoKey;
  }

  // Returns an iterator to `key`, or End() if it is not stored. Keys outside
  // the current bounds are rejected without searching the set.
  const_iterator Find(Key key) const {
    if (min_key_ == NoKey || key < min_key_ || max_key_ < key) {
      return set_.end();
    }
    return set_.find(key);
  }

  // Returns true iff `key` is stored.
  bool Member(Key key) const {
    if (min_key_ == NoKey || key < min_key_ || max_key_ < key) {
      return false;  // out of range
    }
    if (max_key_ + 1 == min_key_ + set_.size()) {
      return true;  // dense range: every key between the bounds is stored
    }
    return set_.count(key) != 0;
  }

  const_iterator Begin() const { return set_.begin(); }

  const_iterator End() const { return set_.end(); }

  // All stored keys are greater than or equal to this value.
  Key LowerBound() const { return min_key_; }

  // All stored keys are less than or equal to this value.
  Key UpperBound() const { return max_key_; }

 private:
  std::set<Key> set_;
  Key min_key_;
  Key max_key_;

  void operator=(const CompactSet &) = delete;
};
514 
515 } // namespace fst
516 
517 #endif // FST_UTIL_H_
bool AlignInput(std::istream &strm, size_t align=MappedFile::kArchAlignment)
Definition: util.cc:85
void ConvertToLegalCSymbol(std::string *s)
Definition: util.cc:75
std::ostream & WriteSequence(std::ostream &strm, const C &c)
Definition: util.h:285
Key LowerBound() const
Definition: util.h:502
bool WriteLabelPairs(std::string_view source, const std::vector< std::pair< Label, Label >> &pairs)
Definition: util.h:429
#define LOG(type)
Definition: log.h:53
bool AlignOutput(std::ostream &strm, size_t align=MappedFile::kArchAlignment)
Definition: util.cc:101
static constexpr size_t kArchAlignment
Definition: mapped-file.h:100
void Erase(Key key)
Definition: util.h:463
constexpr int kLineLen
Definition: symbol-table.h:58
Key UpperBound() const
Definition: util.h:505
internal::StringSplitter StrSplit(std::string_view full, ByAnyChar delim)
Definition: compat.cc:75
typename std::set< Label >::const_iterator const_iterator
Definition: util.h:451
std::ostream & WriteType(std::ostream &strm, const T t)
Definition: util.h:229
#define FSTERROR()
Definition: util.h:57
std::optional< int64_t > ParseInt64(std::string_view s, int base=10)
Definition: util.cc:46
DECLARE_bool(fst_error_fatal)
const_iterator Find(Key key) const
Definition: util.h:479
void Insert(Key key)
Definition: util.h:457
void Clear()
Definition: util.h:474
std::string WeightToStr(Weight w)
Definition: util.h:360
constexpr bool IsScalarIOTypeV
Definition: util.h:73
bool Member(Key key) const
Definition: util.h:487
std::ostream & WriteContainer(std::ostream &strm, const C &c)
Definition: util.h:293
std::istream & ReadType(std::istream &strm, T *t)
Definition: util.h:81
const_iterator Begin() const
Definition: util.h:497
std::istream & ReadContainerType(std::istream &strm, C *c, ReserveFn reserve)
Definition: util.h:142
bool ReadLabelPairs(std::string_view source, std::vector< std::pair< Label, Label >> *pairs)
Definition: util.h:423
int64_t StrToInt64(std::string_view s, std::string_view source, size_t nline, bool *error=nullptr)
Definition: util.cc:62
bool ReadIntPairs(std::string_view source, std::vector< std::pair< I, I >> *pairs)
Definition: util.h:370
std::istream & ReadVectorType(std::istream &strm, std::vector< T, A > *c)
Definition: util.h:159
const_iterator End() const
Definition: util.h:499
Weight StrToWeight(std::string_view s)
Definition: util.h:348
bool WriteIntPairs(std::string_view source, const std::vector< std::pair< I, I >> &pairs)
Definition: util.h:403