FST  openfst-1.8.3
OpenFst Library
bi-table.h
Go to the documentation of this file.
1 // Copyright 2005-2024 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the 'License');
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an 'AS IS' BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 //
15 // See www.openfst.org for extensive documentation on this weighted
16 // finite-state transducer library.
17 //
18 // Classes for representing a bijective mapping between an arbitrary entry
19 // of type T and a signed integral ID.
20 
21 #ifndef FST_BI_TABLE_H_
22 #define FST_BI_TABLE_H_
23 
24 #include <sys/types.h>
25 
26 #include <cstddef>
27 #include <cstdint>
28 #include <deque>
29 #include <functional>
30 #include <memory>
31 #include <type_traits>
32 #include <unordered_set>
33 #include <vector>
34 
35 #include <fst/log.h>
36 #include <fst/memory.h>
37 #include <fst/windows_defs.inc>
38 #include <unordered_map>
39 #include <unordered_set>
40 #include <functional>
41 
42 namespace fst {
43 
44 // Bitables model bijective mappings between entries of an arbitrary type T and
45 // an signed integral ID of type I. The IDs are allocated starting from 0 in
46 // order.
47 //
48 // template <class I, class T>
49 // class BiTable {
50 // public:
51 //
52 // // Required constructors.
53 // BiTable();
54 //
55 // // Looks up integer ID from entry. If it doesn't exist and insert
56 // / is true, adds it; otherwise, returns -1.
57 // I FindId(const T &entry, bool insert = true);
58 //
59 // // Looks up entry from integer ID.
60 // const T &FindEntry(I) const;
61 //
62 // // Returns number of stored entries.
63 // I Size() const;
64 // };
65 
66 // An implementation using a hash map for the entry to ID mapping. H is the
67 // hash function and E is the equality function.
68 template <class I, class T, class H = std::hash<T>, class E = std::equal_to<T>>
69 class HashBiTable {
70  public:
71  // Reserves space for table_size elements.
72  explicit HashBiTable(size_t table_size = 0, const H &h = H(),
73  const E &e = E())
74  : hash_func_(h),
75  hash_equal_(e),
76  entry2id_(table_size, hash_func_, hash_equal_) {
77  if (table_size) id2entry_.reserve(table_size);
78  }
79 
81  : hash_func_(table.hash_func_),
82  hash_equal_(table.hash_equal_),
83  entry2id_(table.entry2id_.begin(), table.entry2id_.end(),
84  table.entry2id_.size(), hash_func_, hash_equal_),
85  id2entry_(table.id2entry_) {}
86 
87  I FindId(const T &entry, bool insert = true) {
88  if (!insert) {
89  const auto it = entry2id_.find(entry);
90  return it == entry2id_.end() ? -1 : it->second - 1;
91  }
92  I &id_ref = entry2id_[entry];
93  if (id_ref == 0) { // T not found; stores and assigns a new ID.
94  id2entry_.push_back(entry);
95  id_ref = id2entry_.size();
96  }
97  return id_ref - 1; // NB: id_ref = ID + 1.
98  }
99 
100  const T &FindEntry(I s) const { return id2entry_[s]; }
101 
102  I Size() const { return id2entry_.size(); }
103 
104  // TODO(riley): Add fancy clear-to-size, as in CompactHashBiTable.
105  void Clear() {
106  entry2id_.clear();
107  id2entry_.clear();
108  }
109 
110  private:
111  H hash_func_;
112  E hash_equal_;
113  std::unordered_map<T, I, H, E> entry2id_;
114  std::vector<T> id2entry_;
115 };
116 
117 // Enables alternative hash set representations below.
119 
120 // Default hash set is STL hash_set.
121 template <class K, class H, class E, HSType HS>
122 struct HashSet : public std::unordered_set<K, H, E, PoolAllocator<K>> {
123  private:
124  using Base = std::unordered_set<K, H, E, PoolAllocator<K>>;
125  public:
126  using Base::Base;
127 
128  void rehash(size_t n) {}
129 };
130 
131 // An implementation using a hash set for the entry to ID mapping. The hash set
132 // holds keys which are either the ID or kCurrentKey. These keys can be mapped
133 // to entries either by looking up in the entry vector or, if kCurrentKey, in
134 // current_entry_. The hash and key equality functions map to entries first. H
135 // is the hash function and E is the equality function.
136 template <class I, class T, class H = std::hash<T>, class E = std::equal_to<T>,
137  HSType HS = HS_FLAT>
139  static_assert(HS == HS_STL || HS == HS_FLAT, "Unsupported hash set type");
140 
141  public:
142  friend class HashFunc;
143  friend class HashEqual;
144 
145  // Reserves space for table_size elements.
146  explicit CompactHashBiTable(size_t table_size = 0, const H &h = H(),
147  const E &e = E())
148  : hash_func_(h),
149  hash_equal_(e),
150  compact_hash_func_(*this),
151  compact_hash_equal_(*this),
152  keys_(table_size, compact_hash_func_, compact_hash_equal_) {
153  if (table_size) id2entry_.reserve(table_size);
154  }
155 
157  : hash_func_(table.hash_func_),
158  hash_equal_(table.hash_equal_),
159  compact_hash_func_(*this),
160  compact_hash_equal_(*this),
161  id2entry_(table.id2entry_),
162  keys_(table.keys_.begin(), table.keys_.end(), table.keys_.size(),
163  compact_hash_func_, compact_hash_equal_) {}
164 
165  I FindId(const T &entry, bool insert = true) {
166  current_entry_ = &entry;
167  if (insert) {
168  auto [iter, was_inserted] = keys_.insert(kCurrentKey);
169  if (!was_inserted) return *iter; // Already exists.
170  // Overwrites kCurrentKey with a new key value; this is safe because it
171  // doesn't affect hashing or equality testing.
172  I key = id2entry_.size();
173  const_cast<I &>(*iter) = key;
174  id2entry_.push_back(entry);
175  return key;
176  }
177  const auto it = keys_.find(kCurrentKey);
178  return it == keys_.end() ? -1 : *it;
179  }
180 
181  const T &FindEntry(I s) const { return id2entry_[s]; }
182 
183  I Size() const { return id2entry_.size(); }
184 
185  // Clears content; with argument, erases last n IDs.
186  void Clear(ssize_t n = -1) {
187  if (n < 0 || n >= id2entry_.size()) { // Clears completely.
188  keys_.clear();
189  id2entry_.clear();
190  } else if (n == id2entry_.size() - 1) { // Leaves only key 0.
191  const T entry = FindEntry(0);
192  keys_.clear();
193  id2entry_.clear();
194  FindId(entry, true);
195  } else {
196  while (n-- > 0) {
197  I key = id2entry_.size() - 1;
198  keys_.erase(key);
199  id2entry_.pop_back();
200  }
201  keys_.rehash(0);
202  }
203  }
204 
205  private:
206  static_assert(std::is_signed_v<I>, "I must be a signed type");
207  // ... otherwise >= kCurrentKey comparisons as used below don't work.
208  // TODO(rybach): (1) don't use >= for key comparison, (2) allow unsigned key
209  // types.
210  static constexpr I kCurrentKey = -1;
211 
212  class HashFunc {
213  public:
214  explicit HashFunc(const CompactHashBiTable &ht) : ht_(&ht) {}
215 
216  size_t operator()(I k) const {
217  if (k >= kCurrentKey) {
218  return (ht_->hash_func_)(ht_->Key2Entry(k));
219  } else {
220  return 0;
221  }
222  }
223 
224  private:
225  const CompactHashBiTable *ht_;
226  };
227 
228  class HashEqual {
229  public:
230  explicit HashEqual(const CompactHashBiTable &ht) : ht_(&ht) {}
231 
232  bool operator()(I k1, I k2) const {
233  if (k1 == k2) {
234  return true;
235  } else if (k1 >= kCurrentKey && k2 >= kCurrentKey) {
236  return (ht_->hash_equal_)(ht_->Key2Entry(k1), ht_->Key2Entry(k2));
237  } else {
238  return false;
239  }
240  }
241 
242  private:
243  const CompactHashBiTable *ht_;
244  };
245 
247 
248  const T &Key2Entry(I k) const {
249  if (k == kCurrentKey) {
250  return *current_entry_;
251  } else {
252  return id2entry_[k];
253  }
254  }
255 
256  H hash_func_;
257  E hash_equal_;
258  HashFunc compact_hash_func_;
259  HashEqual compact_hash_equal_;
260  std::vector<T> id2entry_;
261  KeyHashSet keys_;
262  const T *current_entry_;
263 };
264 
265 // An implementation using a vector for the entry to ID mapping. It is passed a
266 // function object FP that should fingerprint entries uniquely to an integer
267 // that can used as a vector index. Normally, VectorBiTable constructs the FP
268 // object. The user can instead pass in this object.
269 template <class I, class T, class FP>
271  public:
272  // Reserves table_size cells of space.
273  explicit VectorBiTable(const FP &fp = FP(), size_t table_size = 0) : fp_(fp) {
274  if (table_size) id2entry_.reserve(table_size);
275  }
276 
278  : fp_(table.fp_), fp2id_(table.fp2id_), id2entry_(table.id2entry_) {}
279 
280  I FindId(const T &entry, bool insert = true) {
281  ssize_t fp = (fp_)(entry);
282  if (fp >= fp2id_.size()) fp2id_.resize(fp + 1);
283  I &id_ref = fp2id_[fp];
284  if (id_ref == 0) { // T not found.
285  if (insert) { // Stores and assigns a new ID.
286  id2entry_.push_back(entry);
287  id_ref = id2entry_.size();
288  } else {
289  return -1;
290  }
291  }
292  return id_ref - 1; // NB: id_ref = ID + 1.
293  }
294 
295  const T &FindEntry(I s) const { return id2entry_[s]; }
296 
297  I Size() const { return id2entry_.size(); }
298 
299  const FP &Fingerprint() const { return fp_; }
300 
301  private:
302  FP fp_;
303  std::vector<I> fp2id_;
304  std::vector<T> id2entry_;
305 };
306 
307 // An implementation using a vector and a compact hash table. The selecting
308 // functor S returns true for entries to be hashed in the vector. The
309 // fingerprinting functor FP returns a unique fingerprint for each entry to be
310 // hashed in the vector (these need to be suitable for indexing in a vector).
311 // The hash functor H is used when hashing entry into the compact hash table.
312 template <class I, class T, class S, class FP, class H = std::hash<T>,
313  HSType HS = HS_FLAT>
315  public:
316  friend class HashFunc;
317  friend class HashEqual;
318 
319  explicit VectorHashBiTable(const S &s = S(), const FP &fp = FP(),
320  const H &h = H(), size_t vector_size = 0,
321  size_t entry_size = 0)
322  : selector_(s),
323  fp_(fp),
324  h_(h),
325  hash_func_(*this),
326  hash_equal_(*this),
327  keys_(0, hash_func_, hash_equal_) {
328  if (vector_size) fp2id_.reserve(vector_size);
329  if (entry_size) id2entry_.reserve(entry_size);
330  }
331 
333  : selector_(table.s_),
334  fp_(table.fp_),
335  h_(table.h_),
336  id2entry_(table.id2entry_),
337  fp2id_(table.fp2id_),
338  hash_func_(*this),
339  hash_equal_(*this),
340  keys_(table.keys_.size(), hash_func_, hash_equal_) {
341  keys_.insert(table.keys_.begin(), table.keys_.end());
342  }
343 
344  I FindId(const T &entry, bool insert = true) {
345  if ((selector_)(entry)) { // Uses the vector if selector_(entry) == true.
346  uint64_t fp = (fp_)(entry);
347  if (fp2id_.size() <= fp) fp2id_.resize(fp + 1, 0);
348  if (fp2id_[fp] == 0) { // T not found.
349  if (insert) { // Stores and assigns a new ID.
350  id2entry_.push_back(entry);
351  fp2id_[fp] = id2entry_.size();
352  } else {
353  return -1;
354  }
355  }
356  return fp2id_[fp] - 1; // NB: assoc_value = ID + 1.
357  } else { // Uses the hash table otherwise.
358  current_entry_ = &entry;
359  if (const auto it = keys_.find(kCurrentKey); it != keys_.end()) {
360  return *it;
361  } else {
362  if (insert) {
363  I key = id2entry_.size();
364  id2entry_.push_back(entry);
365  keys_.insert(key);
366  return key;
367  } else {
368  return -1;
369  }
370  }
371  }
372  }
373 
374  const T &FindEntry(I s) const { return id2entry_[s]; }
375 
376  I Size() const { return id2entry_.size(); }
377 
378  const S &Selector() const { return selector_; }
379 
380  const FP &Fingerprint() const { return fp_; }
381 
382  const H &HashFunction() const { return h_; }
383 
384  private:
385  static constexpr I kCurrentKey = -1;
386 
387  class HashFunc {
388  public:
389  explicit HashFunc(const VectorHashBiTable &ht) : ht_(&ht) {}
390 
391  size_t operator()(I k) const {
392  if (k >= kCurrentKey) {
393  return (ht_->h_)(ht_->Key2Entry(k));
394  } else {
395  return 0;
396  }
397  }
398 
399  private:
400  const VectorHashBiTable *ht_;
401  };
402 
403  class HashEqual {
404  public:
405  explicit HashEqual(const VectorHashBiTable &ht) : ht_(&ht) {}
406 
407  bool operator()(I k1, I k2) const {
408  if (k1 >= kCurrentKey && k2 >= kCurrentKey) {
409  return ht_->Key2Entry(k1) == ht_->Key2Entry(k2);
410  } else {
411  return k1 == k2;
412  }
413  }
414 
415  private:
416  const VectorHashBiTable *ht_;
417  };
418 
420 
421  const T &Key2Entry(I k) const {
422  if (k == kCurrentKey) {
423  return *current_entry_;
424  } else {
425  return id2entry_[k];
426  }
427  }
428 
429  S selector_; // True if entry hashed into vector.
430  FP fp_; // Fingerprint used for hashing into vector.
431  H h_; // Hash funcion used for hashing into hash_set.
432 
433  std::vector<T> id2entry_; // Maps state IDs to entry.
434  std::vector<I> fp2id_; // Maps entry fingerprints to IDs.
435 
436  // Compact implementation of the hash table mapping entries to state IDs
437  // using the hash function h_.
438  HashFunc hash_func_;
439  HashEqual hash_equal_;
440  KeyHashSet keys_;
441  const T *current_entry_;
442 };
443 
444 // An implementation using a hash map for the entry to ID mapping. This version
445 // permits erasing of arbitrary states. The entry T must have == defined and
446 // its default constructor must produce a entry that will never be seen. F is
447 // the hash function.
448 template <class I, class T, class F>
450  public:
451  ErasableBiTable() : first_(0) {}
452 
453  I FindId(const T &entry, bool insert = true) {
454  I &id_ref = entry2id_[entry];
455  if (id_ref == 0) { // T not found.
456  if (insert) { // Stores and assigns a new ID.
457  id2entry_.push_back(entry);
458  id_ref = id2entry_.size() + first_;
459  } else {
460  return -1;
461  }
462  }
463  return id_ref - 1; // NB: id_ref = ID + 1.
464  }
465 
466  const T &FindEntry(I s) const { return id2entry_[s - first_]; }
467 
468  I Size() const { return id2entry_.size(); }
469 
470  void Erase(I s) {
471  auto &ref = id2entry_[s - first_];
472  entry2id_.erase(ref);
473  ref = empty_entry_;
474  while (!id2entry_.empty() && id2entry_.front() == empty_entry_) {
475  id2entry_.pop_front();
476  ++first_;
477  }
478  }
479 
480  private:
481  std::unordered_map<T, I, F> entry2id_;
482  std::deque<T> id2entry_;
483  const T empty_entry_;
484  I first_; // I of first element in the deque.
485 };
486 
487 } // namespace fst
488 
489 #endif // FST_BI_TABLE_H_
VectorHashBiTable(const VectorHashBiTable< I, T, S, FP, H, HS > &table)
Definition: bi-table.h:332
I FindId(const T &entry, bool insert=true)
Definition: bi-table.h:280
const T & FindEntry(I s) const
Definition: bi-table.h:181
const T & FindEntry(I s) const
Definition: bi-table.h:100
const S & Selector() const
Definition: bi-table.h:378
I FindId(const T &entry, bool insert=true)
Definition: bi-table.h:344
VectorBiTable(const FP &fp=FP(), size_t table_size=0)
Definition: bi-table.h:273
const FP & Fingerprint() const
Definition: bi-table.h:299
const T & FindEntry(I s) const
Definition: bi-table.h:466
VectorHashBiTable(const S &s=S(), const FP &fp=FP(), const H &h=H(), size_t vector_size=0, size_t entry_size=0)
Definition: bi-table.h:319
I Size() const
Definition: bi-table.h:102
HashBiTable(const HashBiTable< I, T, H, E > &table)
Definition: bi-table.h:80
I Size() const
Definition: bi-table.h:468
const T & FindEntry(I s) const
Definition: bi-table.h:295
I Size() const
Definition: bi-table.h:297
const FP & Fingerprint() const
Definition: bi-table.h:380
HashBiTable(size_t table_size=0, const H &h=H(), const E &e=E())
Definition: bi-table.h:72
I FindId(const T &entry, bool insert=true)
Definition: bi-table.h:453
HSType
Definition: bi-table.h:118
VectorBiTable(const VectorBiTable< I, T, FP > &table)
Definition: bi-table.h:277
I FindId(const T &entry, bool insert=true)
Definition: bi-table.h:87
const T & FindEntry(I s) const
Definition: bi-table.h:374
CompactHashBiTable(size_t table_size=0, const H &h=H(), const E &e=E())
Definition: bi-table.h:146
const H & HashFunction() const
Definition: bi-table.h:382
void Clear(ssize_t n=-1)
Definition: bi-table.h:186
CompactHashBiTable(const CompactHashBiTable &table)
Definition: bi-table.h:156
I FindId(const T &entry, bool insert=true)
Definition: bi-table.h:165
void rehash(size_t n)
Definition: bi-table.h:128