FST  openfst-1.7.1
OpenFst Library
symbol-table.h
Go to the documentation of this file.
1 // See www.openfst.org for extensive documentation on this weighted
2 // finite-state transducer library.
3 //
4 // Classes to provide symbol-to-integer and integer-to-symbol mappings.
5 
6 #ifndef FST_SYMBOL_TABLE_H_
7 #define FST_SYMBOL_TABLE_H_
8 
9 #include <functional>
10 #include <ios>
11 #include <iostream>
12 #include <memory>
13 #include <sstream>
14 #include <string>
15 #include <utility>
16 #include <vector>
17 
18 #include <fst/compat.h>
19 #include <fst/flags.h>
20 #include <fst/log.h>
21 #include <fstream>
22 #include <map>
23 
24 DECLARE_bool(fst_compat_symbols);
25 
26 namespace fst {
27 
28 constexpr int64 kNoSymbol = -1;
29 
30 // WARNING: Reading via symbol table read options should
31 // not be used. This is a temporary work around for
32 // reading symbol ranges of previously stored symbol sets.
35 
37  std::vector<std::pair<int64, int64>> string_hash_ranges,
38  const string &source)
39  : string_hash_ranges(std::move(string_hash_ranges)), source(source) {}
40 
41  std::vector<std::pair<int64, int64>> string_hash_ranges;
42  string source;
43 };
44 
46  explicit SymbolTableTextOptions(bool allow_negative_labels = false);
47 
50 };
51 
52 namespace internal {
53 
54 // List of symbols with a dense hash for looking up symbol index, rehashing at
55 // 75% occupancy.
57  public:
59 
61 
62  std::pair<int64, bool> InsertOrFind(const string &key);
63 
64  int64 Find(const string &key) const;
65 
66  size_t Size() const { return symbols_.size(); }
67 
68  const string &GetSymbol(size_t idx) const { return symbols_[idx]; }
69 
70  void RemoveSymbol(size_t idx);
71 
72  private:
73  // num_buckets must be power of 2.
74  void Rehash(size_t num_buckets);
75 
76  int64 empty_;
77  std::vector<string> symbols_;
78  std::hash<string> str_hash_;
79  std::vector<int64> buckets_;
80  uint64 hash_mask_;
81 };
82 
84  public:
85  explicit SymbolTableImpl(const string &name)
86  : name_(name),
87  available_key_(0),
88  dense_key_limit_(0),
89  check_sum_finalized_(false) {}
90 
92  : name_(impl.name_),
93  available_key_(impl.available_key_),
94  dense_key_limit_(impl.dense_key_limit_),
95  symbols_(impl.symbols_),
96  idx_key_(impl.idx_key_),
97  key_map_(impl.key_map_),
98  check_sum_finalized_(false) {}
99 
100  int64 AddSymbol(const string &symbol, int64 key);
101 
102  int64 AddSymbol(const string &symbol) {
103  return AddSymbol(symbol, available_key_);
104  }
105 
106  // Removes the symbol with the given key. The removal is costly
107  // (O(NumSymbols)) and may reduce the efficiency of Find() because of a
108  // potentially reduced size of the dense key interval.
109  void RemoveSymbol(int64 key);
110 
111  static SymbolTableImpl *ReadText(
112  std::istream &strm, const string &name,
114 
115  static SymbolTableImpl* Read(std::istream &strm,
116  const SymbolTableReadOptions &opts);
117 
118  bool Write(std::ostream &strm) const;
119 
120  // Return the string associated with the key. If the key is out of
121  // range (<0, >max), return an empty string.
122  string Find(int64 key) const {
123  int64 idx = key;
124  if (key < 0 || key >= dense_key_limit_) {
125  const auto it = key_map_.find(key);
126  if (it == key_map_.end()) return "";
127  idx = it->second;
128  }
129  if (idx < 0 || idx >= symbols_.Size()) return "";
130  return symbols_.GetSymbol(idx);
131  }
132 
133  // Returns the key associated with the symbol; if the symbol
134  // does not exists, returns kNoSymbol.
135  int64 Find(const string &symbol) const {
136  int64 idx = symbols_.Find(symbol);
137  if (idx == kNoSymbol || idx < dense_key_limit_) return idx;
138  return idx_key_[idx - dense_key_limit_];
139  }
140 
141  bool Member(int64 key) const { return !Find(key).empty(); }
142 
143  bool Member(const string &symbol) const { return Find(symbol) != kNoSymbol; }
144 
145  int64 GetNthKey(ssize_t pos) const {
146  if (pos < 0 || pos >= symbols_.Size()) return kNoSymbol;
147  if (pos < dense_key_limit_) return pos;
148  return Find(symbols_.GetSymbol(pos));
149  }
150 
151  const string &Name() const { return name_; }
152 
153  void SetName(const string &new_name) { name_ = new_name; }
154 
155  const string &CheckSum() const {
156  MaybeRecomputeCheckSum();
157  return check_sum_string_;
158  }
159 
160  const string &LabeledCheckSum() const {
161  MaybeRecomputeCheckSum();
162  return labeled_check_sum_string_;
163  }
164 
165  int64 AvailableKey() const { return available_key_; }
166 
167  size_t NumSymbols() const { return symbols_.Size(); }
168 
169  private:
170  // Recomputes the checksums (both of them) if we've had changes since the last
171  // computation (i.e., if check_sum_finalized_ is false).
172  // Takes ~2.5 microseconds (dbg) or ~230 nanoseconds (opt) on a 2.67GHz Xeon
173  // if the checksum is up-to-date (requiring no recomputation).
174  void MaybeRecomputeCheckSum() const;
175 
176  string name_;
177  int64 available_key_;
178  int64 dense_key_limit_;
179 
180  DenseSymbolMap symbols_;
181  // Maps index to key for index >= dense_key_limit:
182  // key = idx_key_[index - dense_key_limit]
183  std::vector<int64> idx_key_;
184  // Maps key to index for key >= dense_key_limit_.
185  // index = key_map_[key]
186  std::map<int64, int64> key_map_;
187 
188  mutable bool check_sum_finalized_;
189  mutable string check_sum_string_;
190  mutable string labeled_check_sum_string_;
191  mutable Mutex check_sum_mutex_;
192 };
193 
194 } // namespace internal
195 
196 // Symbol (string) to integer (and reverse) mapping.
197 //
198 // The SymbolTable implements the mappings of labels to strings and reverse.
199 // SymbolTables are used to describe the alphabet of the input and output
200 // labels for arcs in a Finite State Transducer.
201 //
202 // SymbolTables are reference-counted and can therefore be shared across
203 // multiple machines. For example a language model grammar G, with a
204 // SymbolTable for the words in the language model can share this symbol
205 // table with the lexical representation L o G.
206 class SymbolTable {
207  public:
208  // Constructs symbol table with an optional name.
209  explicit SymbolTable(const string &name = "<unspecified>")
210  : impl_(std::make_shared<internal::SymbolTableImpl>(name)) {}
211 
212  virtual ~SymbolTable() {}
213 
214  // Reads a text representation of the symbol table from an istream. Pass a
215  // name to give the resulting SymbolTable.
217  std::istream &strm, const string &name,
219  auto *impl = internal::SymbolTableImpl::ReadText(strm, name, opts);
220  return impl ? new SymbolTable(impl) : nullptr;
221  }
222 
223  // Reads a text representation of the symbol table.
224  static SymbolTable *ReadText(const string &filename,
226  std::ifstream strm(filename, std::ios_base::in);
227  if (!strm.good()) {
228  LOG(ERROR) << "SymbolTable::ReadText: Can't open file " << filename;
229  return nullptr;
230  }
231  return ReadText(strm, filename, opts);
232  }
233 
234  // WARNING: Reading via symbol table read options should not be used. This is
235  // a temporary work-around.
236  static SymbolTable* Read(std::istream &strm,
237  const SymbolTableReadOptions &opts) {
238  auto *impl = internal::SymbolTableImpl::Read(strm, opts);
239  return (impl) ? new SymbolTable(impl) : nullptr;
240  }
241 
242  // Reads a binary dump of the symbol table from a stream.
243  static SymbolTable *Read(std::istream &strm,
244  const string &source) {
246  opts.source = source;
247  return Read(strm, opts);
248  }
249 
250  // Reads a binary dump of the symbol table.
251  static SymbolTable *Read(const string& filename) {
252  std::ifstream strm(filename,
253  std::ios_base::in | std::ios_base::binary);
254  if (!strm.good()) {
255  LOG(ERROR) << "SymbolTable::Read: Can't open file " << filename;
256  return nullptr;
257  }
258  return Read(strm, filename);
259  }
260 
261  // DERIVABLE INTERFACE
262 
263  // Creates a reference counted copy.
264  virtual SymbolTable *Copy() const { return new SymbolTable(*this); }
265 
266  // Adds a symbol with given key to table. A symbol table also keeps track of
267  // the last available key (highest key value in the symbol table).
268  virtual int64 AddSymbol(const string &symbol, int64 key) {
269  MutateCheck();
270  return impl_->AddSymbol(symbol, key);
271  }
272 
273  // Adds a symbol to the table. The associated value key is automatically
274  // assigned by the symbol table.
275  virtual int64 AddSymbol(const string &symbol) {
276  MutateCheck();
277  return impl_->AddSymbol(symbol);
278  }
279 
280  // Adds another symbol table to this table. All key values will be offset
281  // by the current available key (highest key value in the symbol table).
282  // Note string symbols with the same key value will still have the same
283  // key value after the symbol table has been merged, but a different
284  // value. Adding symbol tables do not result in changes in the base table.
285  virtual void AddTable(const SymbolTable &table);
286 
287  // Returns the current available key (i.e., highest key + 1) in the symbol
288  // table.
289  virtual int64 AvailableKey() const { return impl_->AvailableKey(); }
290 
291  // Return the label-agnostic MD5 check-sum for this table. All new symbols
292  // added to the table will result in an updated checksum. Deprecated.
293  virtual const string &CheckSum() const { return impl_->CheckSum(); }
294 
295  virtual int64 GetNthKey(ssize_t pos) const { return impl_->GetNthKey(pos); }
296 
297  // Returns the string associated with the key; if the key is out of
298  // range (<0, >max), returns an empty string.
299  virtual string Find(int64 key) const { return impl_->Find(key); }
300 
301  // Returns the key associated with the symbol; if the symbol does not exist,
302  // kNoSymbol is returned.
303  virtual int64 Find(const string &symbol) const { return impl_->Find(symbol); }
304 
305  // Same as CheckSum(), but returns an label-dependent version.
306  virtual const string &LabeledCheckSum() const {
307  return impl_->LabeledCheckSum();
308  }
309 
310  virtual bool Member(int64 key) const { return impl_->Member(key); }
311 
312  virtual bool Member(const string &symbol) const {
313  return impl_->Member(symbol);
314  }
315 
316  // Returns the name of the symbol table.
317  virtual const string &Name() const { return impl_->Name(); }
318 
319  // Returns the current number of symbols in table (not necessarily equal to
320  // AvailableKey()).
321  virtual size_t NumSymbols() const { return impl_->NumSymbols(); }
322 
323  virtual void RemoveSymbol(int64 key) {
324  MutateCheck();
325  return impl_->RemoveSymbol(key);
326  }
327 
328  // Sets the name of the symbol table.
329  virtual void SetName(const string &new_name) {
330  MutateCheck();
331  impl_->SetName(new_name);
332  }
333 
334  virtual bool Write(std::ostream &strm) const { return impl_->Write(strm); }
335 
336  virtual bool Write(const string &filename) const {
337  std::ofstream strm(filename,
338  std::ios_base::out | std::ios_base::binary);
339  if (!strm.good()) {
340  LOG(ERROR) << "SymbolTable::Write: Can't open file " << filename;
341  return false;
342  }
343  return Write(strm);
344  }
345 
346  // Dump a text representation of the symbol table via a stream.
347  virtual bool WriteText(std::ostream &strm,
348  const SymbolTableTextOptions &opts = SymbolTableTextOptions()) const;
349 
350  // Dump an text representation of the symbol table.
351  virtual bool WriteText(const string &filename) const {
352  std::ofstream strm(filename);
353  if (!strm.good()) {
354  LOG(ERROR) << "SymbolTable::WriteText: Can't open file " << filename;
355  return false;
356  }
357  return WriteText(strm);
358  }
359 
360  private:
361  explicit SymbolTable(internal::SymbolTableImpl *impl) : impl_(impl) {}
362 
363  void MutateCheck() {
364  if (!impl_.unique()) impl_.reset(new internal::SymbolTableImpl(*impl_));
365  }
366 
367  const internal::SymbolTableImpl *Impl() const { return impl_.get(); }
368 
369  private:
370  std::shared_ptr<internal::SymbolTableImpl> impl_;
371 };
372 
373 // Iterator class for symbols in a symbol table.
375  public:
376  explicit SymbolTableIterator(const SymbolTable &table)
377  : table_(table),
378  pos_(0),
379  nsymbols_(table.NumSymbols()),
380  key_(table.GetNthKey(0)) {}
381 
383 
384  // Returns whether iterator is done.
385  bool Done() const { return (pos_ == nsymbols_); }
386 
387  // Return the key of the current symbol.
388  int64 Value() const { return key_; }
389 
390  // Return the string of the current symbol.
391  string Symbol() const { return table_.Find(key_); }
392 
393  // Advances iterator.
394  void Next() {
395  ++pos_;
396  if (pos_ < nsymbols_) key_ = table_.GetNthKey(pos_);
397  }
398 
399  // Resets iterator.
400  void Reset() {
401  pos_ = 0;
402  key_ = table_.GetNthKey(0);
403  }
404 
405  private:
406  const SymbolTable &table_;
407  ssize_t pos_;
408  size_t nsymbols_;
409  int64 key_;
410 };
411 
412 // Relabels a symbol table as specified by the input vector of pairs
413 // (old label, new label). The new symbol table only retains symbols
414 // for which a relabeling is explicitly specified.
415 //
416 // TODO(allauzen): consider adding options to allow for some form of implicit
417 // identity relabeling.
418 template <class Label>
420  const std::vector<std::pair<Label, Label>> &pairs) {
421  auto *new_table = new SymbolTable(
422  table->Name().empty() ? string()
423  : (string("relabeled_") + table->Name()));
424  for (const auto &pair : pairs) {
425  new_table->AddSymbol(table->Find(pair.first), pair.second);
426  }
427  return new_table;
428 }
429 
430 // Returns true if the two symbol tables have equal checksums. Passing in
431 // nullptr for either table always returns true.
432 bool CompatSymbols(const SymbolTable *syms1, const SymbolTable *syms2,
433  bool warning = true);
434 
435 // Symbol Table serialization.
436 
437 void SymbolTableToString(const SymbolTable *table, string *result);
438 
439 SymbolTable *StringToSymbolTable(const string &str);
440 
441 } // namespace fst
442 
443 #endif // FST_SYMBOL_TABLE_H_
SymbolTableIterator(const SymbolTable &table)
Definition: symbol-table.h:376
virtual const string & LabeledCheckSum() const
Definition: symbol-table.h:306
virtual int64 GetNthKey(ssize_t pos) const
Definition: symbol-table.h:295
uint64_t uint64
Definition: types.h:32
const string & GetSymbol(size_t idx) const
Definition: symbol-table.h:68
virtual bool Write(std::ostream &strm) const
Definition: symbol-table.h:334
virtual const string & CheckSum() const
Definition: symbol-table.h:293
virtual SymbolTable * Copy() const
Definition: symbol-table.h:264
virtual int64 AddSymbol(const string &symbol)
Definition: symbol-table.h:275
int64 AddSymbol(const string &symbol)
Definition: symbol-table.h:102
#define LOG(type)
Definition: log.h:48
const string & Name() const
Definition: symbol-table.h:151
const string & LabeledCheckSum() const
Definition: symbol-table.h:160
SymbolTableImpl(const string &name)
Definition: symbol-table.h:85
virtual size_t NumSymbols() const
Definition: symbol-table.h:321
virtual bool Member(const string &symbol) const
Definition: symbol-table.h:312
void SetName(const string &new_name)
Definition: symbol-table.h:153
int64_t int64
Definition: types.h:27
constexpr int64 kNoSymbol
Definition: symbol-table.h:28
bool Member(const string &symbol) const
Definition: symbol-table.h:143
SymbolTable(const string &name="<unspecified>")
Definition: symbol-table.h:209
SymbolTableImpl(const SymbolTableImpl &impl)
Definition: symbol-table.h:91
bool Member(int64 key) const
Definition: symbol-table.h:141
static SymbolTable * Read(std::istream &strm, const SymbolTableReadOptions &opts)
Definition: symbol-table.h:236
static SymbolTable * Read(std::istream &strm, const string &source)
Definition: symbol-table.h:243
static SymbolTable * Read(const string &filename)
Definition: symbol-table.h:251
SymbolTable * StringToSymbolTable(const string &str)
virtual int64 AddSymbol(const string &symbol, int64 key)
Definition: symbol-table.h:268
string Symbol() const
Definition: symbol-table.h:391
static SymbolTableImpl * ReadText(std::istream &strm, const string &name, const SymbolTableTextOptions &opts=SymbolTableTextOptions())
Definition: symbol-table.cc:89
virtual void SetName(const string &new_name)
Definition: symbol-table.h:329
static SymbolTableImpl * Read(std::istream &strm, const SymbolTableReadOptions &opts)
virtual string Find(int64 key) const
Definition: symbol-table.h:299
void SymbolTableToString(const SymbolTable *table, string *result)
virtual bool WriteText(const string &filename) const
Definition: symbol-table.h:351
static SymbolTable * ReadText(const string &filename, const SymbolTableTextOptions &opts=SymbolTableTextOptions())
Definition: symbol-table.h:224
std::vector< std::pair< int64, int64 > > string_hash_ranges
Definition: symbol-table.h:41
int64 Find(const string &symbol) const
Definition: symbol-table.h:135
virtual int64 AvailableKey() const
Definition: symbol-table.h:289
virtual bool Member(int64 key) const
Definition: symbol-table.h:310
virtual bool Write(const string &filename) const
Definition: symbol-table.h:336
const string & CheckSum() const
Definition: symbol-table.h:155
SymbolTableReadOptions(std::vector< std::pair< int64, int64 >> string_hash_ranges, const string &source)
Definition: symbol-table.h:36
virtual ~SymbolTable()
Definition: symbol-table.h:212
virtual const string & Name() const
Definition: symbol-table.h:317
DECLARE_bool(fst_compat_symbols)
virtual int64 Find(const string &symbol) const
Definition: symbol-table.h:303
bool CompatSymbols(const SymbolTable *syms1, const SymbolTable *syms2, bool warning=true)
string Find(int64 key) const
Definition: symbol-table.h:122
int64 GetNthKey(ssize_t pos) const
Definition: symbol-table.h:145
virtual void RemoveSymbol(int64 key)
Definition: symbol-table.h:323
SymbolTable * RelabelSymbolTable(const SymbolTable *table, const std::vector< std::pair< Label, Label >> &pairs)
Definition: symbol-table.h:419
static SymbolTable * ReadText(std::istream &strm, const string &name, const SymbolTableTextOptions &opts=SymbolTableTextOptions())
Definition: symbol-table.h:216