FST  openfst-1.8.2.post1
OpenFst Library
bi-table.h
Go to the documentation of this file.
1 // Copyright 2005-2020 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the 'License');
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an 'AS IS' BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 //
15 // See www.openfst.org for extensive documentation on this weighted
16 // finite-state transducer library.
17 //
18 // Classes for representing a bijective mapping between an arbitrary entry
19 // of type T and a signed integral ID.
20 
21 #ifndef FST_BI_TABLE_H_
22 #define FST_BI_TABLE_H_
23 
24 #include <cstdint>
25 #include <deque>
26 #include <functional>
27 #include <memory>
28 #include <type_traits>
29 #include <unordered_set>
30 #include <vector>
31 
32 #include <fst/log.h>
33 #include <fst/memory.h>
34 #include <fst/windows_defs.inc>
35 #include <unordered_map>
36 #include <unordered_set>
37 
38 namespace fst {
39 
40 // Bitables model bijective mappings between entries of an arbitrary type T and
41 // an signed integral ID of type I. The IDs are allocated starting from 0 in
42 // order.
43 //
44 // template <class I, class T>
45 // class BiTable {
46 // public:
47 //
48 // // Required constructors.
49 // BiTable();
50 //
51 // // Looks up integer ID from entry. If it doesn't exist and insert
52 // / is true, adds it; otherwise, returns -1.
53 // I FindId(const T &entry, bool insert = true);
54 //
55 // // Looks up entry from integer ID.
56 // const T &FindEntry(I) const;
57 //
58 // // Returns number of stored entries.
59 // I Size() const;
60 // };
61 
62 // An implementation using a hash map for the entry to ID mapping. H is the
63 // hash function and E is the equality function.
64 template <class I, class T, class H, class E = std::equal_to<T>>
65 class HashBiTable {
66  public:
67  // Reserves space for table_size elements.
68  explicit HashBiTable(size_t table_size = 0, const H &h = H(),
69  const E &e = E())
70  : hash_func_(h),
71  hash_equal_(e),
72  entry2id_(table_size, hash_func_, hash_equal_) {
73  if (table_size) id2entry_.reserve(table_size);
74  }
75 
77  : hash_func_(table.hash_func_),
78  hash_equal_(table.hash_equal_),
79  entry2id_(table.entry2id_.begin(), table.entry2id_.end(),
80  table.entry2id_.size(), hash_func_, hash_equal_),
81  id2entry_(table.id2entry_) {}
82 
83  I FindId(const T &entry, bool insert = true) {
84  if (!insert) {
85  const auto it = entry2id_.find(entry);
86  return it == entry2id_.end() ? -1 : it->second - 1;
87  }
88  I &id_ref = entry2id_[entry];
89  if (id_ref == 0) { // T not found; stores and assigns a new ID.
90  id2entry_.push_back(entry);
91  id_ref = id2entry_.size();
92  }
93  return id_ref - 1; // NB: id_ref = ID + 1.
94  }
95 
96  const T &FindEntry(I s) const { return id2entry_[s]; }
97 
98  I Size() const { return id2entry_.size(); }
99 
100  // TODO(riley): Add fancy clear-to-size, as in CompactHashBiTable.
101  void Clear() {
102  entry2id_.clear();
103  id2entry_.clear();
104  }
105 
106  private:
107  H hash_func_;
108  E hash_equal_;
109  std::unordered_map<T, I, H, E> entry2id_;
110  std::vector<T> id2entry_;
111 };
112 
113 // Enables alternative hash set representations below.
115 
116 // Default hash set is STL hash_set.
117 template <class K, class H, class E, HSType HS>
118 struct HashSet : public std::unordered_set<K, H, E, PoolAllocator<K>> {
119  private:
120  using Base = std::unordered_set<K, H, E, PoolAllocator<K>>;
121  public:
122  using Base::Base;
123 
124  void rehash(size_t n) {}
125 };
126 
127 // An implementation using a hash set for the entry to ID mapping. The hash set
128 // holds keys which are either the ID or kCurrentKey. These keys can be mapped
129 // to entries either by looking up in the entry vector or, if kCurrentKey, in
130 // current_entry_. The hash and key equality functions map to entries first. H
131 // is the hash function and E is the equality function.
132 template <class I, class T, class H, class E = std::equal_to<T>,
133  HSType HS = HS_FLAT>
135  static_assert(HS == HS_STL || HS == HS_FLAT, "Unsupported hash set type");
136 
137  public:
138  friend class HashFunc;
139  friend class HashEqual;
140 
141  // Reserves space for table_size elements.
142  explicit CompactHashBiTable(size_t table_size = 0, const H &h = H(),
143  const E &e = E())
144  : hash_func_(h),
145  hash_equal_(e),
146  compact_hash_func_(*this),
147  compact_hash_equal_(*this),
148  keys_(table_size, compact_hash_func_, compact_hash_equal_) {
149  if (table_size) id2entry_.reserve(table_size);
150  }
151 
153  : hash_func_(table.hash_func_),
154  hash_equal_(table.hash_equal_),
155  compact_hash_func_(*this),
156  compact_hash_equal_(*this),
157  id2entry_(table.id2entry_),
158  keys_(table.keys_.begin(), table.keys_.end(), table.keys_.size(),
159  compact_hash_func_, compact_hash_equal_) {}
160 
161  I FindId(const T &entry, bool insert = true) {
162  current_entry_ = &entry;
163  if (insert) {
164  auto [iter, was_inserted] = keys_.insert(kCurrentKey);
165  if (!was_inserted) return *iter; // Already exists.
166  // Overwrites kCurrentKey with a new key value; this is safe because it
167  // doesn't affect hashing or equality testing.
168  I key = id2entry_.size();
169  const_cast<I &>(*iter) = key;
170  id2entry_.push_back(entry);
171  return key;
172  }
173  const auto it = keys_.find(kCurrentKey);
174  return it == keys_.end() ? -1 : *it;
175  }
176 
177  const T &FindEntry(I s) const { return id2entry_[s]; }
178 
179  I Size() const { return id2entry_.size(); }
180 
181  // Clears content; with argument, erases last n IDs.
182  void Clear(ssize_t n = -1) {
183  if (n < 0 || n >= id2entry_.size()) { // Clears completely.
184  keys_.clear();
185  id2entry_.clear();
186  } else if (n == id2entry_.size() - 1) { // Leaves only key 0.
187  const T entry = FindEntry(0);
188  keys_.clear();
189  id2entry_.clear();
190  FindId(entry, true);
191  } else {
192  while (n-- > 0) {
193  I key = id2entry_.size() - 1;
194  keys_.erase(key);
195  id2entry_.pop_back();
196  }
197  keys_.rehash(0);
198  }
199  }
200 
201  private:
202  static_assert(std::is_signed_v<I>, "I must be a signed type");
203  // ... otherwise >= kCurrentKey comparisons as used below don't work.
204  // TODO(rybach): (1) don't use >= for key comparison, (2) allow unsigned key
205  // types.
206  static constexpr I kCurrentKey = -1;
207 
208  class HashFunc {
209  public:
210  explicit HashFunc(const CompactHashBiTable &ht) : ht_(&ht) {}
211 
212  size_t operator()(I k) const {
213  if (k >= kCurrentKey) {
214  return (ht_->hash_func_)(ht_->Key2Entry(k));
215  } else {
216  return 0;
217  }
218  }
219 
220  private:
221  const CompactHashBiTable *ht_;
222  };
223 
224  class HashEqual {
225  public:
226  explicit HashEqual(const CompactHashBiTable &ht) : ht_(&ht) {}
227 
228  bool operator()(I k1, I k2) const {
229  if (k1 == k2) {
230  return true;
231  } else if (k1 >= kCurrentKey && k2 >= kCurrentKey) {
232  return (ht_->hash_equal_)(ht_->Key2Entry(k1), ht_->Key2Entry(k2));
233  } else {
234  return false;
235  }
236  }
237 
238  private:
239  const CompactHashBiTable *ht_;
240  };
241 
243 
244  const T &Key2Entry(I k) const {
245  if (k == kCurrentKey) {
246  return *current_entry_;
247  } else {
248  return id2entry_[k];
249  }
250  }
251 
252  H hash_func_;
253  E hash_equal_;
254  HashFunc compact_hash_func_;
255  HashEqual compact_hash_equal_;
256  std::vector<T> id2entry_;
257  KeyHashSet keys_;
258  const T *current_entry_;
259 };
260 
261 // An implementation using a vector for the entry to ID mapping. It is passed a
262 // function object FP that should fingerprint entries uniquely to an integer
263 // that can used as a vector index. Normally, VectorBiTable constructs the FP
264 // object. The user can instead pass in this object.
265 template <class I, class T, class FP>
267  public:
268  // Reserves table_size cells of space.
269  explicit VectorBiTable(const FP &fp = FP(), size_t table_size = 0) : fp_(fp) {
270  if (table_size) id2entry_.reserve(table_size);
271  }
272 
274  : fp_(table.fp_), fp2id_(table.fp2id_), id2entry_(table.id2entry_) {}
275 
276  I FindId(const T &entry, bool insert = true) {
277  ssize_t fp = (fp_)(entry);
278  if (fp >= fp2id_.size()) fp2id_.resize(fp + 1);
279  I &id_ref = fp2id_[fp];
280  if (id_ref == 0) { // T not found.
281  if (insert) { // Stores and assigns a new ID.
282  id2entry_.push_back(entry);
283  id_ref = id2entry_.size();
284  } else {
285  return -1;
286  }
287  }
288  return id_ref - 1; // NB: id_ref = ID + 1.
289  }
290 
291  const T &FindEntry(I s) const { return id2entry_[s]; }
292 
293  I Size() const { return id2entry_.size(); }
294 
295  const FP &Fingerprint() const { return fp_; }
296 
297  private:
298  FP fp_;
299  std::vector<I> fp2id_;
300  std::vector<T> id2entry_;
301 };
302 
303 // An implementation using a vector and a compact hash table. The selecting
304 // functor S returns true for entries to be hashed in the vector. The
305 // fingerprinting functor FP returns a unique fingerprint for each entry to be
306 // hashed in the vector (these need to be suitable for indexing in a vector).
307 // The hash functor H is used when hashing entry into the compact hash table.
308 template <class I, class T, class S, class FP, class H, HSType HS = HS_FLAT>
310  public:
311  friend class HashFunc;
312  friend class HashEqual;
313 
314  explicit VectorHashBiTable(const S &s = S(), const FP &fp = FP(),
315  const H &h = H(), size_t vector_size = 0,
316  size_t entry_size = 0)
317  : selector_(s),
318  fp_(fp),
319  h_(h),
320  hash_func_(*this),
321  hash_equal_(*this),
322  keys_(0, hash_func_, hash_equal_) {
323  if (vector_size) fp2id_.reserve(vector_size);
324  if (entry_size) id2entry_.reserve(entry_size);
325  }
326 
328  : selector_(table.s_),
329  fp_(table.fp_),
330  h_(table.h_),
331  id2entry_(table.id2entry_),
332  fp2id_(table.fp2id_),
333  hash_func_(*this),
334  hash_equal_(*this),
335  keys_(table.keys_.size(), hash_func_, hash_equal_) {
336  keys_.insert(table.keys_.begin(), table.keys_.end());
337  }
338 
339  I FindId(const T &entry, bool insert = true) {
340  if ((selector_)(entry)) { // Uses the vector if selector_(entry) == true.
341  uint64_t fp = (fp_)(entry);
342  if (fp2id_.size() <= fp) fp2id_.resize(fp + 1, 0);
343  if (fp2id_[fp] == 0) { // T not found.
344  if (insert) { // Stores and assigns a new ID.
345  id2entry_.push_back(entry);
346  fp2id_[fp] = id2entry_.size();
347  } else {
348  return -1;
349  }
350  }
351  return fp2id_[fp] - 1; // NB: assoc_value = ID + 1.
352  } else { // Uses the hash table otherwise.
353  current_entry_ = &entry;
354  const auto it = keys_.find(kCurrentKey);
355  if (it == keys_.end()) {
356  if (insert) {
357  I key = id2entry_.size();
358  id2entry_.push_back(entry);
359  keys_.insert(key);
360  return key;
361  } else {
362  return -1;
363  }
364  } else {
365  return *it;
366  }
367  }
368  }
369 
370  const T &FindEntry(I s) const { return id2entry_[s]; }
371 
372  I Size() const { return id2entry_.size(); }
373 
374  const S &Selector() const { return selector_; }
375 
376  const FP &Fingerprint() const { return fp_; }
377 
378  const H &Hash() const { return h_; }
379 
380  private:
381  static constexpr I kCurrentKey = -1;
382 
383  class HashFunc {
384  public:
385  explicit HashFunc(const VectorHashBiTable &ht) : ht_(&ht) {}
386 
387  size_t operator()(I k) const {
388  if (k >= kCurrentKey) {
389  return (ht_->h_)(ht_->Key2Entry(k));
390  } else {
391  return 0;
392  }
393  }
394 
395  private:
396  const VectorHashBiTable *ht_;
397  };
398 
399  class HashEqual {
400  public:
401  explicit HashEqual(const VectorHashBiTable &ht) : ht_(&ht) {}
402 
403  bool operator()(I k1, I k2) const {
404  if (k1 >= kCurrentKey && k2 >= kCurrentKey) {
405  return ht_->Key2Entry(k1) == ht_->Key2Entry(k2);
406  } else {
407  return k1 == k2;
408  }
409  }
410 
411  private:
412  const VectorHashBiTable *ht_;
413  };
414 
416 
417  const T &Key2Entry(I k) const {
418  if (k == kCurrentKey) {
419  return *current_entry_;
420  } else {
421  return id2entry_[k];
422  }
423  }
424 
425  S selector_; // True if entry hashed into vector.
426  FP fp_; // Fingerprint used for hashing into vector.
427  H h_; // Hash funcion used for hashing into hash_set.
428 
429  std::vector<T> id2entry_; // Maps state IDs to entry.
430  std::vector<I> fp2id_; // Maps entry fingerprints to IDs.
431 
432  // Compact implementation of the hash table mapping entries to state IDs
433  // using the hash function h_.
434  HashFunc hash_func_;
435  HashEqual hash_equal_;
436  KeyHashSet keys_;
437  const T *current_entry_;
438 };
439 
440 // An implementation using a hash map for the entry to ID mapping. This version
441 // permits erasing of arbitrary states. The entry T must have == defined and
442 // its default constructor must produce a entry that will never be seen. F is
443 // the hash function.
444 template <class I, class T, class F>
446  public:
447  ErasableBiTable() : first_(0) {}
448 
449  I FindId(const T &entry, bool insert = true) {
450  I &id_ref = entry2id_[entry];
451  if (id_ref == 0) { // T not found.
452  if (insert) { // Stores and assigns a new ID.
453  id2entry_.push_back(entry);
454  id_ref = id2entry_.size() + first_;
455  } else {
456  return -1;
457  }
458  }
459  return id_ref - 1; // NB: id_ref = ID + 1.
460  }
461 
462  const T &FindEntry(I s) const { return id2entry_[s - first_]; }
463 
464  I Size() const { return id2entry_.size(); }
465 
466  void Erase(I s) {
467  auto &ref = id2entry_[s - first_];
468  entry2id_.erase(ref);
469  ref = empty_entry_;
470  while (!id2entry_.empty() && id2entry_.front() == empty_entry_) {
471  id2entry_.pop_front();
472  ++first_;
473  }
474  }
475 
476  private:
477  std::unordered_map<T, I, F> entry2id_;
478  std::deque<T> id2entry_;
479  const T empty_entry_;
480  I first_; // I of first element in the deque.
481 };
482 
483 } // namespace fst
484 
485 #endif // FST_BI_TABLE_H_
VectorHashBiTable(const VectorHashBiTable< I, T, S, FP, H, HS > &table)
Definition: bi-table.h:327
I FindId(const T &entry, bool insert=true)
Definition: bi-table.h:276
const T & FindEntry(I s) const
Definition: bi-table.h:177
const T & FindEntry(I s) const
Definition: bi-table.h:96
const S & Selector() const
Definition: bi-table.h:374
I FindId(const T &entry, bool insert=true)
Definition: bi-table.h:339
VectorBiTable(const FP &fp=FP(), size_t table_size=0)
Definition: bi-table.h:269
const FP & Fingerprint() const
Definition: bi-table.h:295
const T & FindEntry(I s) const
Definition: bi-table.h:462
VectorHashBiTable(const S &s=S(), const FP &fp=FP(), const H &h=H(), size_t vector_size=0, size_t entry_size=0)
Definition: bi-table.h:314
I Size() const
Definition: bi-table.h:98
HashBiTable(const HashBiTable< I, T, H, E > &table)
Definition: bi-table.h:76
I Size() const
Definition: bi-table.h:464
const T & FindEntry(I s) const
Definition: bi-table.h:291
I Size() const
Definition: bi-table.h:293
const FP & Fingerprint() const
Definition: bi-table.h:376
HashBiTable(size_t table_size=0, const H &h=H(), const E &e=E())
Definition: bi-table.h:68
I FindId(const T &entry, bool insert=true)
Definition: bi-table.h:449
HSType
Definition: bi-table.h:114
VectorBiTable(const VectorBiTable< I, T, FP > &table)
Definition: bi-table.h:273
const H & Hash() const
Definition: bi-table.h:378
I FindId(const T &entry, bool insert=true)
Definition: bi-table.h:83
const T & FindEntry(I s) const
Definition: bi-table.h:370
CompactHashBiTable(size_t table_size=0, const H &h=H(), const E &e=E())
Definition: bi-table.h:142
void Clear(ssize_t n=-1)
Definition: bi-table.h:182
CompactHashBiTable(const CompactHashBiTable &table)
Definition: bi-table.h:152
I FindId(const T &entry, bool insert=true)
Definition: bi-table.h:161
void rehash(size_t n)
Definition: bi-table.h:124