FST  openfst-1.8.3
OpenFst Library
string.h
Go to the documentation of this file.
1 // Copyright 2005-2024 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the 'License');
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an 'AS IS' BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 //
15 // See www.openfst.org for extensive documentation on this weighted
16 // finite-state transducer library.
17 //
18 // Utilities to convert strings into FSTs.
19 
20 #ifndef FST_STRING_H_
21 #define FST_STRING_H_
22 
23 #include <cstdint>
24 #include <memory>
25 #include <optional>
26 #include <ostream>
27 #include <sstream>
28 #include <string>
29 #include <utility>
30 #include <vector>
31 
32 #include <fst/flags.h>
33 #include <fst/log.h>
34 #include <fst/arc.h>
35 #include <fst/compact-fst.h>
36 #include <fst/fst.h>
37 #include <fst/icu.h>
38 #include <fst/mutable-fst.h>
39 #include <fst/properties.h>
40 #include <fst/symbol-table.h>
41 #include <fst/util.h>
42 #include <fst/compat.h>
43 #include <string_view>
44 
45 DECLARE_string(fst_field_separator);
46 
47 namespace fst {
48 
// Kinds of tokenization understood by the string compiler and printer.
enum class TokenType : uint8_t { SYMBOL = 1, BYTE = 2, UTF8 = 3 };

// Writes the lowercase name of the token type ("symbol", "byte" or "utf8") to
// the stream. A value that matches no enumerator writes nothing.
inline std::ostream &operator<<(std::ostream &strm,
                                const TokenType &token_type) {
  const char *name = nullptr;
  switch (token_type) {
    case TokenType::SYMBOL:
      name = "symbol";
      break;
    case TokenType::BYTE:
      name = "byte";
      break;
    case TokenType::UTF8:
      name = "utf8";
      break;
  }
  // No enumerator matched: leave the stream untouched.
  if (name == nullptr) return strm;
  return strm << name;
}
63 
64 namespace internal {
65 
66 template <class Label>
67 bool ConvertSymbolToLabel(std::string_view str, const SymbolTable *syms,
68  Label unknown_label, Label *output) {
69  int64_t n;
70  if (syms) {
71  n = syms->Find(str);
72  if ((n == kNoSymbol) && (unknown_label != kNoLabel)) n = unknown_label;
73  if (n == kNoSymbol) {
74  LOG(ERROR) << "ConvertSymbolToLabel: Symbol \"" << str
75  << "\" is not mapped to any integer label, symbol table = "
76  << syms->Name();
77  return false;
78  }
79  } else {
80  const auto maybe_n = ParseInt64(str);
81  if (!maybe_n.has_value()) {
82  LOG(ERROR) << "ConvertSymbolToLabel: Bad label integer "
83  << "= \"" << str << "\"";
84  return false;
85  }
86  n = *maybe_n;
87  }
88  *output = n;
89  return true;
90 }
91 
92 template <class Label>
94  std::string_view str, TokenType token_type, const SymbolTable *syms,
95  Label unknown_label, std::vector<Label> *labels,
96  std::string_view sep = FST_FLAGS_fst_field_separator) {
97  labels->clear();
98  switch (token_type) {
99  case TokenType::BYTE: {
100  labels->reserve(str.size());
101  return ByteStringToLabels(str, labels);
102  }
103  case TokenType::UTF8: {
104  return UTF8StringToLabels(str, labels);
105  }
106  case TokenType::SYMBOL: {
107  const std::string separator = fst::StrCat("\n", sep);
108  for (std::string_view c :
109  StrSplit(str, ByAnyChar(separator), SkipEmpty())) {
110  Label label;
111  if (!ConvertSymbolToLabel(c, syms, unknown_label, &label)) return false;
112  labels->push_back(label);
113  }
114  return true;
115  }
116  }
117  return false; // Unreachable.
118 }
119 
120 // The last character of 'sep' is used as a separator between symbols.
121 // Additionally, epsilon symbols will be printed only if omit_epsilon
122 // is false.
123 template <class Label>
124 bool LabelsToSymbolString(const std::vector<Label> &labels, std::string *str,
125  const SymbolTable &syms, std::string_view sep,
126  bool omit_epsilon) {
127  std::stringstream ostrm;
128  sep.remove_prefix(sep.size() - 1); // We only respect the final char of sep.
129  std::string_view delim = "";
130  for (auto label : labels) {
131  if (omit_epsilon && !label) continue;
132  ostrm << delim;
133  const std::string &symbol = syms.Find(label);
134  if (symbol.empty()) {
135  LOG(ERROR) << "LabelsToSymbolString: Label " << label
136  << " is not mapped onto any textual symbol in symbol table "
137  << syms.Name();
138  return false;
139  }
140  ostrm << symbol;
141  delim = sep;
142  }
143  *str = ostrm.str();
144  return !!ostrm;
145 }
146 
// Writes the given labels as integers into *str.
//
// The last character of 'sep' is used as a separator between labels; an empty
// 'sep' yields no separator at all. Epsilon (zero) labels are printed only if
// omit_epsilon is false.
//
// Returns whether the internal stream is still in a good state after
// formatting.
template <class Label>
bool LabelsToNumericString(const std::vector<Label> &labels, std::string *str,
                           std::string_view sep, bool omit_epsilon) {
  std::stringstream ostrm;
  // We only respect the final char of sep. The emptiness check matters:
  // sep.size() - 1 would wrap around for an empty view and remove_prefix
  // would then have undefined behavior.
  if (!sep.empty()) sep.remove_prefix(sep.size() - 1);
  std::string_view delim = "";
  for (auto label : labels) {
    if (omit_epsilon && !label) continue;
    ostrm << delim;
    ostrm << label;
    // Only labels after the first printed one are preceded by the separator.
    delim = sep;
  }
  *str = ostrm.str();
  return !!ostrm;
}
165 
166 } // namespace internal
167 
168 // Functor for compiling a string in an FST.
169 template <class Arc>
170 class OPENFST_DEPRECATED("allow_negative is no-op") StringCompiler {
171  public:
172  using Label = typename Arc::Label;
173  using StateId = typename Arc::StateId;
174  using Weight = typename Arc::Weight;
175 
176  explicit StringCompiler(TokenType token_type = TokenType::BYTE,
177  const SymbolTable *syms = nullptr,
178  Label unknown_label = kNoLabel)
179  : token_type_(token_type), syms_(syms), unknown_label_(unknown_label) {}
180 
181  // Compiles string into an FST. With SYMBOL token type, sep is used to
182  // specify the set of char separators between symbols, in addition
183  // of '\n' which is always treated as a separator.
184  // Returns true on success.
185  template <class FST>
186  bool operator()(
187  std::string_view str, FST *fst,
188  std::string_view sep = FST_FLAGS_fst_field_separator) const {
189  std::vector<Label> labels;
190  if (!internal::ConvertStringToLabels(str, token_type_, syms_,
191  unknown_label_, &labels, sep)) {
192  return false;
193  }
194  Compile(labels, fst);
195  return true;
196  }
197 
198  // Same as above but allows to specify a weight for the string.
199  template <class FST>
200  bool operator()(
201  std::string_view str, FST *fst, Weight weight,
202  std::string_view sep = FST_FLAGS_fst_field_separator) const {
203  std::vector<Label> labels;
204  if (!internal::ConvertStringToLabels(str, token_type_, syms_,
205  unknown_label_, &labels, sep)) {
206  return false;
207  }
208  Compile(labels, fst, std::move(weight));
209  return true;
210  }
211 
212  private:
213  void Compile(const std::vector<Label> &labels, MutableFst<Arc> *fst,
214  Weight weight = Weight::One()) const {
215  fst->DeleteStates();
216  auto state = fst->AddState();
217  fst->SetStart(state);
218  fst->AddStates(labels.size());
219  for (auto label : labels) {
220  fst->AddArc(state, Arc(label, label, state + 1));
221  ++state;
222  }
223  fst->SetFinal(state, std::move(weight));
225  }
226 
227  template <class Unsigned>
228  void Compile(const std::vector<Label> &labels,
229  CompactStringFst<Arc, Unsigned> *fst) const {
230  using Compactor = typename CompactStringFst<Arc, Unsigned>::Compactor;
231  fst->SetCompactor(
232  std::make_shared<Compactor>(labels.begin(), labels.end()));
233  }
234 
235  template <class Unsigned>
236  void Compile(const std::vector<Label> &labels,
238  Weight weight = Weight::One()) const {
239  std::vector<std::pair<Label, Weight>> compacts;
240  compacts.reserve(labels.size() + 1);
241  for (StateId i = 0; i < static_cast<StateId>(labels.size()) - 1; ++i) {
242  compacts.emplace_back(labels[i], Weight::One());
243  }
244  compacts.emplace_back(!labels.empty() ? labels.back() : kNoLabel, weight);
245  using Compactor =
247  fst->SetCompactor(
248  std::make_shared<Compactor>(compacts.begin(), compacts.end()));
249  }
250 
251  const TokenType token_type_;
252  const SymbolTable *syms_; // Symbol table (used when token type is symbol).
253  const Label unknown_label_; // Label for token missing from symbol table.
254 
255  StringCompiler(const StringCompiler &) = delete;
256  StringCompiler &operator=(const StringCompiler &) = delete;
257 };
258 
259 // A useful alias when using StdArc.
260 using StdStringCompiler = StringCompiler<StdArc>;
261 
262 // Helpers for StringPrinter.
263 
264 // Converts an FST to a vector of output labels. To get input labels, use
265 // Project or Invert. Returns true on success. Use only with string FSTs; may
266 // loop for non-string FSTs.
267 template <class Arc>
269  std::vector<typename Arc::Label> *labels) {
270  labels->clear();
271  auto s = fst.Start();
272  if (s == kNoStateId) {
273  LOG(ERROR) << "StringFstToOutputLabels: Invalid start state";
274  return false;
275  }
276  while (fst.Final(s) == Arc::Weight::Zero()) {
277  ArcIterator<Fst<Arc>> aiter(fst, s);
278  if (aiter.Done()) {
279  LOG(ERROR) << "StringFstToOutputLabels: Does not reach final state";
280  return false;
281  }
282  const auto &arc = aiter.Value();
283  labels->push_back(arc.olabel);
284  s = arc.nextstate;
285  aiter.Next();
286  if (!aiter.Done()) {
287  LOG(ERROR) << "StringFstToOutputLabels: State " << s
288  << " has multiple outgoing arcs";
289  return false;
290  }
291  }
292  if (fst.NumArcs(s) != 0) {
293  LOG(ERROR) << "StringFstToOutputLabels: Final state " << s
294  << " has outgoing arc(s)";
295  return false;
296  }
297  return true;
298 }
299 
300 // Same as above but also computes the path weight. The output weight parameter
301 // is only set if labels extraction is successful.
302 template <class Arc>
304  std::vector<typename Arc::Label> *labels,
305  typename Arc::Weight *weight) {
306  labels->clear();
307  auto path_weight = Arc::Weight::One();
308  auto s = fst.Start();
309  if (s == kNoStateId) {
310  LOG(ERROR) << "StringFstToOutputLabels: Invalid start state";
311  return false;
312  }
313  auto final_weight = fst.Final(s);
314  while (final_weight == Arc::Weight::Zero()) {
315  ArcIterator<Fst<Arc>> aiter(fst, s);
316  if (aiter.Done()) {
317  LOG(ERROR) << "StringFstToOutputLabels: Does not reach final state";
318  return false;
319  }
320  const auto &arc = aiter.Value();
321  labels->push_back(arc.olabel);
322  path_weight = Times(path_weight, arc.weight);
323  s = arc.nextstate;
324  aiter.Next();
325  if (!aiter.Done()) {
326  LOG(ERROR) << "StringFstToOutputLabels: State " << s
327  << " has multiple outgoing arcs";
328  return false;
329  }
330  final_weight = fst.Final(s);
331  }
332  if (fst.NumArcs(s) != 0) {
333  LOG(ERROR) << "StringFstToOutputLabels: Final state " << s
334  << " has outgoing arc(s)";
335  return false;
336  }
337  *weight = Times(path_weight, final_weight);
338  return true;
339 }
340 
341 // Converts a list of symbols to a string. If the token type is SYMBOL, the last
342 // character of sep is used to separate textual symbols. Additionally, if the
343 // token type is SYMBOL, epsilon symbols will be printed only if omit_epsilon
344 // is false. Returns true on success.
345 template <class Label>
347  const std::vector<Label> &labels, std::string *str,
348  TokenType ttype = TokenType::BYTE, const SymbolTable *syms = nullptr,
349  std::string_view sep = FST_FLAGS_fst_field_separator,
350  bool omit_epsilon = true) {
351  switch (ttype) {
352  case TokenType::BYTE: {
353  return LabelsToByteString(labels, str);
354  }
355  case TokenType::UTF8: {
356  return LabelsToUTF8String(labels, str);
357  }
358  case TokenType::SYMBOL: {
359  return syms ? internal::LabelsToSymbolString(labels, str, *syms, sep,
360  omit_epsilon)
361  : internal::LabelsToNumericString(labels, str, sep,
362  omit_epsilon);
363  }
364  }
365  return false;
366 }
367 
368 // Functor for printing a string FST as a string.
369 template <class Arc>
371  public:
372  using Label = typename Arc::Label;
373  using Weight = typename Arc::Weight;
374 
375  explicit StringPrinter(TokenType token_type = TokenType::BYTE,
376  const SymbolTable *syms = nullptr,
377  bool omit_epsilon = true)
378  : token_type_(token_type), syms_(syms), omit_epsilon_(omit_epsilon) {}
379 
380  // Converts the FST into a string. With SYMBOL token type, the last character
381  // of sep is used as a separator between symbols. Returns true on success.
383  const Fst<Arc> &fst, std::string *str,
384  std::string_view sep = FST_FLAGS_fst_field_separator) const {
385  std::vector<Label> labels;
386  return StringFstToOutputLabels(fst, &labels) &&
387  LabelsToString(labels, str, token_type_, syms_, sep, omit_epsilon_);
388  }
389 
390  // Same as above but also computes the path weight. The output weight
391  // parameter is only set if labels extraction is successful.
393  const Fst<Arc> &fst, std::string *str, Weight *weight,
394  std::string_view sep = FST_FLAGS_fst_field_separator) const {
395  std::vector<Label> labels;
396  return StringFstToOutputLabels(fst, &labels, weight) &&
397  LabelsToString(labels, str, token_type_, syms_, sep, omit_epsilon_);
398  }
399 
400  private:
401  const TokenType token_type_;
402  const SymbolTable *syms_;
403  const bool omit_epsilon_;
404 
405  StringPrinter(const StringPrinter &) = delete;
406  StringPrinter &operator=(const StringPrinter &) = delete;
407 };
408 
409 // A useful alias when using StdArc.
411 
412 } // namespace fst
413 
414 #endif // FST_STRING_H_
const std::string & Name() const
Definition: symbol-table.h:466
bool StringFstToOutputLabels(const Fst< Arc > &fst, std::vector< typename Arc::Label > *labels)
Definition: string.h:268
constexpr int kNoLabel
Definition: fst.h:195
class OPENFST_DEPRECATED("allow_negative is no-op") StringCompiler
Definition: string.h:170
StringCompiler< StdArc > StdStringCompiler
Definition: string.h:260
virtual size_t NumArcs(StateId) const =0
ErrorWeight Times(const ErrorWeight &, const ErrorWeight &)
Definition: error-weight.h:64
#define LOG(type)
Definition: log.h:53
constexpr uint64_t kCompiledStringProperties
Definition: properties.h:157
virtual Weight Final(StateId) const =0
virtual void SetStart(StateId)=0
typename Arc::Weight Weight
Definition: string.h:373
constexpr int64_t kNoSymbol
Definition: symbol-table.h:55
constexpr int kNoStateId
Definition: fst.h:196
internal::StringSplitter StrSplit(std::string_view full, ByAnyChar delim)
Definition: compat.cc:77
const Arc & Value() const
Definition: fst.h:536
void Compile(std::istream &istrm, const std::string &source, const std::string &dest, const std::string &fst_type, const std::string &arc_type, const SymbolTable *isyms, const SymbolTable *osyms, const SymbolTable *ssyms, bool accep, bool ikeep, bool okeep, bool nkeep)
Definition: compile.cc:32
std::optional< int64_t > ParseInt64(std::string_view s, int base=10)
Definition: util.cc:46
bool operator()(const Fst< Arc > &fst, std::string *str, std::string_view sep=FST_FLAGS_fst_field_separator) const
Definition: string.h:382
virtual void SetProperties(uint64_t props, uint64_t mask)=0
DECLARE_string(fst_field_separator)
See www.openfst.org for extensive documentation on this weighted.
std::ostream & operator<<(std::ostream &strm, const ErrorWeight &)
Definition: error-weight.h:71
bool UTF8StringToLabels(std::string_view str, std::vector< Label > *labels)
Definition: icu.h:53
typename Arc::Label Label
Definition: string.h:372
bool LabelsToSymbolString(const std::vector< Label > &labels, std::string *str, const SymbolTable &syms, std::string_view sep, bool omit_epsilon)
Definition: string.h:124
virtual StateId Start() const =0
bool Done() const
Definition: fst.h:532
bool ConvertStringToLabels(std::string_view str, TokenType token_type, const SymbolTable *syms, Label unknown_label, std::vector< Label > *labels, std::string_view sep=FST_FLAGS_fst_field_separator)
Definition: string.h:93
bool ConvertSymbolToLabel(std::string_view str, const SymbolTable *syms, Label unknown_label, Label *output)
Definition: string.h:67
bool LabelsToUTF8String(const std::vector< Label > &labels, std::string *str)
Definition: icu.h:103
TokenType
Definition: string.h:49
virtual void AddArc(StateId, const Arc &)=0
void SetCompactor(std::shared_ptr< Compactor > compactor)
Definition: compact-fst.h:1198
bool operator()(const Fst< Arc > &fst, std::string *str, Weight *weight, std::string_view sep=FST_FLAGS_fst_field_separator) const
Definition: string.h:392
std::string StrCat(const StringOrInt &s1, const StringOrInt &s2)
Definition: compat.h:286
bool LabelsToString(const std::vector< Label > &labels, std::string *str, TokenType ttype=TokenType::BYTE, const SymbolTable *syms=nullptr, std::string_view sep=FST_FLAGS_fst_field_separator, bool omit_epsilon=true)
Definition: string.h:346
virtual StateId AddState()=0
bool LabelsToNumericString(const std::vector< Label > &labels, std::string *str, std::string_view sep, bool omit_epsilon)
Definition: string.h:151
virtual void SetFinal(StateId s, Weight weight=Weight::One())=0
virtual void DeleteStates(const std::vector< StateId > &)=0
bool ByteStringToLabels(std::string_view str, std::vector< Label > *labels)
Definition: icu.h:41
std::string Find(int64_t key) const
Definition: symbol-table.h:450
bool LabelsToByteString(const std::vector< Label > &labels, std::string *str)
Definition: icu.h:93
StringPrinter(TokenType token_type=TokenType::BYTE, const SymbolTable *syms=nullptr, bool omit_epsilon=true)
Definition: string.h:375
void Next()
Definition: fst.h:540
virtual void AddStates(size_t)=0