FST  openfst-1.7.2
OpenFst Library
symbol-table.h
Go to the documentation of this file.
1 // See www.openfst.org for extensive documentation on this weighted
2 // finite-state transducer library.
3 //
4 // Classes to provide symbol-to-integer and integer-to-symbol mappings.
5 
6 #ifndef FST_SYMBOL_TABLE_H_
7 #define FST_SYMBOL_TABLE_H_
8 
9 #include <functional>
10 #include <ios>
11 #include <iostream>
12 #include <memory>
13 #include <sstream>
14 #include <string>
15 #include <utility>
16 #include <vector>
17 
18 #include <fst/compat.h>
19 #include <fst/flags.h>
20 #include <fst/log.h>
21 #include <fstream>
22 #include <map>
23 
24 DECLARE_bool(fst_compat_symbols);
25 
26 namespace fst {
27 
28 constexpr int64 kNoSymbol = -1;
29 
30 // WARNING: Reading via symbol table read options should
31 // not be used. This is a temporary work around for
32 // reading symbol ranges of previously stored symbol sets.
35 
37  std::vector<std::pair<int64, int64>> string_hash_ranges,
38  const string &source)
39  : string_hash_ranges(std::move(string_hash_ranges)), source(source) {}
40 
41  std::vector<std::pair<int64, int64>> string_hash_ranges;
42  string source;
43 };
44 
46  explicit SymbolTableTextOptions(bool allow_negative_labels = false);
47 
50 };
51 
52 namespace internal {
53 
54 extern const int kLineLen;
55 
56 // List of symbols with a dense hash for looking up symbol index, rehashing at
57 // 75% occupancy.
59  public:
61 
63 
64  std::pair<int64, bool> InsertOrFind(const string &key);
65 
66  int64 Find(const string &key) const;
67 
68  size_t Size() const { return symbols_.size(); }
69 
70  const string &GetSymbol(size_t idx) const { return symbols_[idx]; }
71 
72  void RemoveSymbol(size_t idx);
73 
74  private:
75  // num_buckets must be power of 2.
76  void Rehash(size_t num_buckets);
77 
78  int64 empty_;
79  std::vector<string> symbols_;
80  std::hash<string> str_hash_;
81  std::vector<int64> buckets_;
82  uint64 hash_mask_;
83 };
84 
86  public:
87  explicit SymbolTableImpl(const string &name)
88  : name_(name),
89  available_key_(0),
90  dense_key_limit_(0),
91  check_sum_finalized_(false) {}
92 
94  : name_(impl.name_),
95  available_key_(impl.available_key_),
96  dense_key_limit_(impl.dense_key_limit_),
97  symbols_(impl.symbols_),
98  idx_key_(impl.idx_key_),
99  key_map_(impl.key_map_),
100  check_sum_finalized_(false) {}
101 
102  int64 AddSymbol(const string &symbol, int64 key);
103 
104  int64 AddSymbol(const string &symbol) {
105  return AddSymbol(symbol, available_key_);
106  }
107 
108  // Removes the symbol with the given key. The removal is costly
109  // (O(NumSymbols)) and may reduce the efficiency of Find() because of a
110  // potentially reduced size of the dense key interval.
111  void RemoveSymbol(int64 key);
112 
113  static SymbolTableImpl *ReadText(
114  std::istream &strm, const string &name,
116 
117  static SymbolTableImpl* Read(std::istream &strm,
118  const SymbolTableReadOptions &opts);
119 
120  bool Write(std::ostream &strm) const;
121 
122  // Return the string associated with the key. If the key is out of
123  // range (<0, >max), return an empty string.
124  string Find(int64 key) const {
125  int64 idx = key;
126  if (key < 0 || key >= dense_key_limit_) {
127  const auto it = key_map_.find(key);
128  if (it == key_map_.end()) return "";
129  idx = it->second;
130  }
131  if (idx < 0 || idx >= symbols_.Size()) return "";
132  return symbols_.GetSymbol(idx);
133  }
134 
135  // Returns the key associated with the symbol; if the symbol
136  // does not exists, returns kNoSymbol.
137  int64 Find(const string &symbol) const {
138  int64 idx = symbols_.Find(symbol);
139  if (idx == kNoSymbol || idx < dense_key_limit_) return idx;
140  return idx_key_[idx - dense_key_limit_];
141  }
142 
143  bool Member(int64 key) const { return !Find(key).empty(); }
144 
145  bool Member(const string &symbol) const { return Find(symbol) != kNoSymbol; }
146 
147  int64 GetNthKey(ssize_t pos) const {
148  if (pos < 0 || pos >= symbols_.Size()) return kNoSymbol;
149  if (pos < dense_key_limit_) return pos;
150  return Find(symbols_.GetSymbol(pos));
151  }
152 
153  const string &Name() const { return name_; }
154 
155  void SetName(const string &new_name) { name_ = new_name; }
156 
157  const string &CheckSum() const {
158  MaybeRecomputeCheckSum();
159  return check_sum_string_;
160  }
161 
162  const string &LabeledCheckSum() const {
163  MaybeRecomputeCheckSum();
164  return labeled_check_sum_string_;
165  }
166 
167  int64 AvailableKey() const { return available_key_; }
168 
169  size_t NumSymbols() const { return symbols_.Size(); }
170 
171  private:
172  // Recomputes the checksums (both of them) if we've had changes since the last
173  // computation (i.e., if check_sum_finalized_ is false).
174  // Takes ~2.5 microseconds (dbg) or ~230 nanoseconds (opt) on a 2.67GHz Xeon
175  // if the checksum is up-to-date (requiring no recomputation).
176  void MaybeRecomputeCheckSum() const;
177 
178  string name_;
179  int64 available_key_;
180  int64 dense_key_limit_;
181 
182  DenseSymbolMap symbols_;
183  // Maps index to key for index >= dense_key_limit:
184  // key = idx_key_[index - dense_key_limit]
185  std::vector<int64> idx_key_;
186  // Maps key to index for key >= dense_key_limit_.
187  // index = key_map_[key]
188  std::map<int64, int64> key_map_;
189 
190  mutable bool check_sum_finalized_;
191  mutable string check_sum_string_;
192  mutable string labeled_check_sum_string_;
193  mutable Mutex check_sum_mutex_;
194 };
195 
196 } // namespace internal
197 
198 // Symbol (string) to integer (and reverse) mapping.
199 //
200 // The SymbolTable implements the mappings of labels to strings and reverse.
201 // SymbolTables are used to describe the alphabet of the input and output
202 // labels for arcs in a Finite State Transducer.
203 //
204 // SymbolTables are reference-counted and can therefore be shared across
205 // multiple machines. For example a language model grammar G, with a
206 // SymbolTable for the words in the language model can share this symbol
207 // table with the lexical representation L o G.
208 class SymbolTable {
209  public:
210  // Constructs symbol table with an optional name.
211  explicit SymbolTable(const string &name = "<unspecified>")
212  : impl_(std::make_shared<internal::SymbolTableImpl>(name)) {}
213 
214  virtual ~SymbolTable() {}
215 
216  // Reads a text representation of the symbol table from an istream. Pass a
217  // name to give the resulting SymbolTable.
219  std::istream &strm, const string &name,
221  auto *impl = internal::SymbolTableImpl::ReadText(strm, name, opts);
222  return impl ? new SymbolTable(impl) : nullptr;
223  }
224 
225  // Reads a text representation of the symbol table.
226  static SymbolTable *ReadText(const string &filename,
228  std::ifstream strm(filename, std::ios_base::in);
229  if (!strm.good()) {
230  LOG(ERROR) << "SymbolTable::ReadText: Can't open file " << filename;
231  return nullptr;
232  }
233  return ReadText(strm, filename, opts);
234  }
235 
236  // WARNING: Reading via symbol table read options should not be used. This is
237  // a temporary work-around.
238  static SymbolTable* Read(std::istream &strm,
239  const SymbolTableReadOptions &opts) {
240  auto *impl = internal::SymbolTableImpl::Read(strm, opts);
241  return (impl) ? new SymbolTable(impl) : nullptr;
242  }
243 
244  // Reads a binary dump of the symbol table from a stream.
245  static SymbolTable *Read(std::istream &strm,
246  const string &source) {
248  opts.source = source;
249  return Read(strm, opts);
250  }
251 
252  // Reads a binary dump of the symbol table.
253  static SymbolTable *Read(const string& filename) {
254  std::ifstream strm(filename,
255  std::ios_base::in | std::ios_base::binary);
256  if (!strm.good()) {
257  LOG(ERROR) << "SymbolTable::Read: Can't open file " << filename;
258  return nullptr;
259  }
260  return Read(strm, filename);
261  }
262 
263  // DERIVABLE INTERFACE
264 
265  // Creates a reference counted copy.
266  virtual SymbolTable *Copy() const { return new SymbolTable(*this); }
267 
268  // Adds a symbol with given key to table. A symbol table also keeps track of
269  // the last available key (highest key value in the symbol table).
270  virtual int64 AddSymbol(const string &symbol, int64 key) {
271  MutateCheck();
272  return impl_->AddSymbol(symbol, key);
273  }
274 
275  // Adds a symbol to the table. The associated value key is automatically
276  // assigned by the symbol table.
277  virtual int64 AddSymbol(const string &symbol) {
278  MutateCheck();
279  return impl_->AddSymbol(symbol);
280  }
281 
282  // Adds another symbol table to this table. All key values will be offset
283  // by the current available key (highest key value in the symbol table).
284  // Note string symbols with the same key value will still have the same
285  // key value after the symbol table has been merged, but a different
286  // value. Adding symbol tables do not result in changes in the base table.
287  virtual void AddTable(const SymbolTable &table);
288 
289  // Returns the current available key (i.e., highest key + 1) in the symbol
290  // table.
291  virtual int64 AvailableKey() const { return impl_->AvailableKey(); }
292 
293  // Return the label-agnostic MD5 check-sum for this table. All new symbols
294  // added to the table will result in an updated checksum. Deprecated.
295  virtual const string &CheckSum() const { return impl_->CheckSum(); }
296 
297  virtual int64 GetNthKey(ssize_t pos) const { return impl_->GetNthKey(pos); }
298 
299  // Returns the string associated with the key; if the key is out of
300  // range (<0, >max), returns an empty string.
301  virtual string Find(int64 key) const { return impl_->Find(key); }
302 
303  // Returns the key associated with the symbol; if the symbol does not exist,
304  // kNoSymbol is returned.
305  virtual int64 Find(const string &symbol) const { return impl_->Find(symbol); }
306 
307  // Same as CheckSum(), but returns an label-dependent version.
308  virtual const string &LabeledCheckSum() const {
309  return impl_->LabeledCheckSum();
310  }
311 
312  virtual bool Member(int64 key) const { return impl_->Member(key); }
313 
314  virtual bool Member(const string &symbol) const {
315  return impl_->Member(symbol);
316  }
317 
318  // Returns the name of the symbol table.
319  virtual const string &Name() const { return impl_->Name(); }
320 
321  // Returns the current number of symbols in table (not necessarily equal to
322  // AvailableKey()).
323  virtual size_t NumSymbols() const { return impl_->NumSymbols(); }
324 
325  virtual void RemoveSymbol(int64 key) {
326  MutateCheck();
327  return impl_->RemoveSymbol(key);
328  }
329 
330  // Sets the name of the symbol table.
331  virtual void SetName(const string &new_name) {
332  MutateCheck();
333  impl_->SetName(new_name);
334  }
335 
336  virtual bool Write(std::ostream &strm) const { return impl_->Write(strm); }
337 
338  virtual bool Write(const string &filename) const {
339  std::ofstream strm(filename,
340  std::ios_base::out | std::ios_base::binary);
341  if (!strm.good()) {
342  LOG(ERROR) << "SymbolTable::Write: Can't open file " << filename;
343  return false;
344  }
345  return Write(strm);
346  }
347 
348  // Dump a text representation of the symbol table via a stream.
349  virtual bool WriteText(std::ostream &strm,
350  const SymbolTableTextOptions &opts = SymbolTableTextOptions()) const;
351 
352  // Dump an text representation of the symbol table.
353  virtual bool WriteText(const string &filename) const {
354  std::ofstream strm(filename);
355  if (!strm.good()) {
356  LOG(ERROR) << "SymbolTable::WriteText: Can't open file " << filename;
357  return false;
358  }
359  return WriteText(strm);
360  }
361 
362  private:
363  explicit SymbolTable(internal::SymbolTableImpl *impl) : impl_(impl) {}
364 
365  void MutateCheck() {
366  if (!impl_.unique()) impl_.reset(new internal::SymbolTableImpl(*impl_));
367  }
368 
369  const internal::SymbolTableImpl *Impl() const { return impl_.get(); }
370 
371  private:
372  std::shared_ptr<internal::SymbolTableImpl> impl_;
373 };
374 
375 // Iterator class for symbols in a symbol table.
377  public:
378  explicit SymbolTableIterator(const SymbolTable &table)
379  : table_(table),
380  pos_(0),
381  nsymbols_(table.NumSymbols()),
382  key_(table.GetNthKey(0)) {}
383 
385 
386  // Returns whether iterator is done.
387  bool Done() const { return (pos_ == nsymbols_); }
388 
389  // Return the key of the current symbol.
390  int64 Value() const { return key_; }
391 
392  // Return the string of the current symbol.
393  string Symbol() const { return table_.Find(key_); }
394 
395  // Advances iterator.
396  void Next() {
397  ++pos_;
398  if (pos_ < nsymbols_) key_ = table_.GetNthKey(pos_);
399  }
400 
401  // Resets iterator.
402  void Reset() {
403  pos_ = 0;
404  key_ = table_.GetNthKey(0);
405  }
406 
407  private:
408  const SymbolTable &table_;
409  ssize_t pos_;
410  size_t nsymbols_;
411  int64 key_;
412 };
413 
414 // Relabels a symbol table as specified by the input vector of pairs
415 // (old label, new label). The new symbol table only retains symbols
416 // for which a relabeling is explicitly specified.
417 //
418 // TODO(allauzen): consider adding options to allow for some form of implicit
419 // identity relabeling.
420 template <class Label>
422  const std::vector<std::pair<Label, Label>> &pairs) {
423  auto *new_table = new SymbolTable(
424  table->Name().empty() ? string()
425  : (string("relabeled_") + table->Name()));
426  for (const auto &pair : pairs) {
427  new_table->AddSymbol(table->Find(pair.first), pair.second);
428  }
429  return new_table;
430 }
431 
432 // Returns true if the two symbol tables have equal checksums. Passing in
433 // nullptr for either table always returns true.
434 bool CompatSymbols(const SymbolTable *syms1, const SymbolTable *syms2,
435  bool warning = true);
436 
437 // Symbol Table serialization.
438 
439 void SymbolTableToString(const SymbolTable *table, string *result);
440 
441 SymbolTable *StringToSymbolTable(const string &str);
442 
443 } // namespace fst
444 
445 #endif // FST_SYMBOL_TABLE_H_
SymbolTableIterator(const SymbolTable &table)
Definition: symbol-table.h:378
virtual const string & LabeledCheckSum() const
Definition: symbol-table.h:308
virtual int64 GetNthKey(ssize_t pos) const
Definition: symbol-table.h:297
uint64_t uint64
Definition: types.h:32
const string & GetSymbol(size_t idx) const
Definition: symbol-table.h:70
virtual bool Write(std::ostream &strm) const
Definition: symbol-table.h:336
virtual const string & CheckSum() const
Definition: symbol-table.h:295
virtual SymbolTable * Copy() const
Definition: symbol-table.h:266
virtual int64 AddSymbol(const string &symbol)
Definition: symbol-table.h:277
int64 AddSymbol(const string &symbol)
Definition: symbol-table.h:104
#define LOG(type)
Definition: log.h:48
const string & Name() const
Definition: symbol-table.h:153
const string & LabeledCheckSum() const
Definition: symbol-table.h:162
SymbolTableImpl(const string &name)
Definition: symbol-table.h:87
virtual size_t NumSymbols() const
Definition: symbol-table.h:323
virtual bool Member(const string &symbol) const
Definition: symbol-table.h:314
void SetName(const string &new_name)
Definition: symbol-table.h:155
int64_t int64
Definition: types.h:27
constexpr int64 kNoSymbol
Definition: symbol-table.h:28
bool Member(const string &symbol) const
Definition: symbol-table.h:145
SymbolTable(const string &name="<unspecified>")
Definition: symbol-table.h:211
SymbolTableImpl(const SymbolTableImpl &impl)
Definition: symbol-table.h:93
bool Member(int64 key) const
Definition: symbol-table.h:143
static SymbolTable * Read(std::istream &strm, const SymbolTableReadOptions &opts)
Definition: symbol-table.h:238
static SymbolTable * Read(std::istream &strm, const string &source)
Definition: symbol-table.h:245
static SymbolTable * Read(const string &filename)
Definition: symbol-table.h:253
SymbolTable * StringToSymbolTable(const string &str)
virtual int64 AddSymbol(const string &symbol, int64 key)
Definition: symbol-table.h:270
string Symbol() const
Definition: symbol-table.h:393
static SymbolTableImpl * ReadText(std::istream &strm, const string &name, const SymbolTableTextOptions &opts=SymbolTableTextOptions())
Definition: symbol-table.cc:90
const int kLineLen
Definition: symbol-table.cc:28
virtual void SetName(const string &new_name)
Definition: symbol-table.h:331
static SymbolTableImpl * Read(std::istream &strm, const SymbolTableReadOptions &opts)
virtual string Find(int64 key) const
Definition: symbol-table.h:301
void SymbolTableToString(const SymbolTable *table, string *result)
virtual bool WriteText(const string &filename) const
Definition: symbol-table.h:353
static SymbolTable * ReadText(const string &filename, const SymbolTableTextOptions &opts=SymbolTableTextOptions())
Definition: symbol-table.h:226
std::vector< std::pair< int64, int64 > > string_hash_ranges
Definition: symbol-table.h:41
int64 Find(const string &symbol) const
Definition: symbol-table.h:137
virtual int64 AvailableKey() const
Definition: symbol-table.h:291
virtual bool Member(int64 key) const
Definition: symbol-table.h:312
virtual bool Write(const string &filename) const
Definition: symbol-table.h:338
const string & CheckSum() const
Definition: symbol-table.h:157
SymbolTableReadOptions(std::vector< std::pair< int64, int64 >> string_hash_ranges, const string &source)
Definition: symbol-table.h:36
virtual ~SymbolTable()
Definition: symbol-table.h:214
virtual const string & Name() const
Definition: symbol-table.h:319
DECLARE_bool(fst_compat_symbols)
virtual int64 Find(const string &symbol) const
Definition: symbol-table.h:305
bool CompatSymbols(const SymbolTable *syms1, const SymbolTable *syms2, bool warning=true)
string Find(int64 key) const
Definition: symbol-table.h:124
int64 GetNthKey(ssize_t pos) const
Definition: symbol-table.h:147
virtual void RemoveSymbol(int64 key)
Definition: symbol-table.h:325
SymbolTable * RelabelSymbolTable(const SymbolTable *table, const std::vector< std::pair< Label, Label >> &pairs)
Definition: symbol-table.h:421
static SymbolTable * ReadText(std::istream &strm, const string &name, const SymbolTableTextOptions &opts=SymbolTableTextOptions())
Definition: symbol-table.h:218