FST  openfst-1.8.2.post1
OpenFst Library
string.h
Go to the documentation of this file.
1 // Copyright 2005-2020 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the 'License');
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an 'AS IS' BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 //
15 // See www.openfst.org for extensive documentation on this weighted
16 // finite-state transducer library.
17 //
18 // Utilities to convert strings into FSTs.
19 
20 #ifndef FST_STRING_H_
21 #define FST_STRING_H_
22 
23 #include <cstdint>
24 #include <memory>
25 #include <ostream>
26 #include <sstream>
27 #include <string>
28 #include <vector>
29 
30 #include <fst/flags.h>
31 #include <fst/log.h>
32 
33 #include <fst/compact-fst.h>
34 #include <fst/icu.h>
35 #include <fst/mutable-fst.h>
36 #include <fst/properties.h>
37 #include <fst/symbol-table.h>
38 #include <fst/util.h>
39 
40 #include <fst/compat.h>
41 #include <string_view>
42 
43 DECLARE_string(fst_field_separator);
44 
45 namespace fst {
46 
// Selects how a string is tokenized into labels (and printed back).
// The underlying values are fixed explicitly.
enum class TokenType : uint8_t {
  SYMBOL = 1,
  BYTE = 2,
  UTF8 = 3,
};

// Writes the lowercase name of token_type ("symbol", "byte" or "utf8") to
// strm and returns strm. A value outside the enumeration writes nothing.
inline std::ostream &operator<<(std::ostream &strm,
                                const TokenType &token_type) {
  const char *name = nullptr;
  switch (token_type) {
    case TokenType::SYMBOL:
      name = "symbol";
      break;
    case TokenType::BYTE:
      name = "byte";
      break;
    case TokenType::UTF8:
      name = "utf8";
      break;
  }
  if (name != nullptr) strm << name;
  return strm;
}
61 
62 namespace internal {
63 
64 template <class Label>
65 bool ConvertSymbolToLabel(std::string_view str, const SymbolTable *syms,
66  Label unknown_label, bool allow_negative,
67  Label *output) {
68  int64_t n;
69  if (syms) {
70  n = syms->Find(str);
71  if ((n == kNoSymbol) && (unknown_label != kNoLabel)) n = unknown_label;
72  if (n == kNoSymbol || (!allow_negative && n < 0)) {
73  LOG(ERROR) << "ConvertSymbolToLabel: Symbol \"" << str
74  << "\" is not mapped to any integer label, symbol table = "
75  << syms->Name();
76  return false;
77  }
78  } else {
79  const auto maybe_n = ParseInt64(str);
80  if (!maybe_n.has_value() || (!allow_negative && *maybe_n < 0)) {
81  LOG(ERROR) << "ConvertSymbolToLabel: Bad label integer "
82  << "= \"" << str << "\"";
83  return false;
84  }
85  n = *maybe_n;
86  }
87  *output = n;
88  return true;
89 }
90 
91 template <class Label>
93  std::string_view str, TokenType token_type, const SymbolTable *syms,
94  Label unknown_label, bool allow_negative, std::vector<Label> *labels,
95  const std::string &sep = FST_FLAGS_fst_field_separator) {
96  labels->clear();
97  switch (token_type) {
98  case TokenType::BYTE: {
99  labels->reserve(str.size());
100  return ByteStringToLabels(str, labels);
101  }
102  case TokenType::UTF8: {
103  return UTF8StringToLabels(str, labels);
104  }
105  case TokenType::SYMBOL: {
106  const std::string separator = fst::StrCat("\n", sep);
107  for (std::string_view c :
108  StrSplit(str, ByAnyChar(separator), SkipEmpty())) {
109  Label label;
110  if (!ConvertSymbolToLabel(c, syms, unknown_label, allow_negative,
111  &label)) {
112  return false;
113  }
114  labels->push_back(label);
115  }
116  return true;
117  }
118  }
119  return false; // Unreachable.
120 }
121 
122 // The last character of 'sep' is used as a separator between symbols.
123 // Additionally, epsilon symbols will be printed only if omit_epsilon
124 // is false.
125 template <class Label>
126 bool LabelsToSymbolString(const std::vector<Label> &labels, std::string *str,
127  const SymbolTable &syms, std::string_view sep,
128  bool omit_epsilon) {
129  std::stringstream ostrm;
130  sep.remove_prefix(sep.size() - 1); // We only respect the final char of sep.
131  std::string_view delim = "";
132  for (auto label : labels) {
133  if (omit_epsilon && !label) continue;
134  ostrm << delim;
135  const std::string &symbol = syms.Find(label);
136  if (symbol.empty()) {
137  LOG(ERROR) << "LabelsToSymbolString: Label " << label
138  << " is not mapped onto any textual symbol in symbol table "
139  << syms.Name();
140  return false;
141  }
142  ostrm << symbol;
143  delim = sep;
144  }
145  *str = ostrm.str();
146  return !!ostrm;
147 }
148 
// Renders labels as integers joined by a delimiter and stores the result in
// *str.
//
// Only the last character of sep is used as the delimiter; an empty sep
// means no delimiter at all. Labels equal to zero (epsilon) are skipped when
// omit_epsilon is true.
//
// Returns whether the underlying stream is still in a good state.
template <class Label>
bool LabelsToNumericString(const std::vector<Label> &labels, std::string *str,
                           std::string_view sep, bool omit_epsilon) {
  std::stringstream ostrm;
  // We only respect the final char of sep. The emptiness check matters:
  // remove_prefix(size() - 1) on an empty view would wrap around and is
  // undefined behaviour.
  if (!sep.empty()) sep.remove_prefix(sep.size() - 1);
  std::string_view delim = "";
  for (auto label : labels) {
    if (omit_epsilon && !label) continue;
    ostrm << delim;
    ostrm << label;
    // The delimiter is only emitted before the second and later labels.
    delim = sep;
  }
  *str = ostrm.str();
  return !!ostrm;
}
167 
168 } // namespace internal
169 
170 // Functor for compiling a string in an FST.
171 template <class Arc>
173  public:
174  using Label = typename Arc::Label;
175  using StateId = typename Arc::StateId;
176  using Weight = typename Arc::Weight;
177 
178  explicit StringCompiler(TokenType token_type = TokenType::BYTE,
179  const SymbolTable *syms = nullptr,
180  Label unknown_label = kNoLabel,
181  bool allow_negative = false)
182  : token_type_(token_type),
183  syms_(syms),
184  unknown_label_(unknown_label),
185  allow_negative_(allow_negative) {}
186 
187  // Compiles string into an FST. With SYMBOL token type, sep is used to
188  // specify the set of char separators between symbols, in addition
189  // of '\n' which is always treated as a separator.
190  // Returns true on success.
191  template <class FST>
193  std::string_view str, FST *fst,
194  const std::string &sep = FST_FLAGS_fst_field_separator) const {
195  std::vector<Label> labels;
196  if (!internal::ConvertStringToLabels(str, token_type_, syms_,
197  unknown_label_, allow_negative_,
198  &labels, sep)) {
199  return false;
200  }
201  Compile(labels, fst);
202  return true;
203  }
204 
205  // Same as above but allows to specify a weight for the string.
206  template <class FST>
208  std::string_view str, FST *fst, Weight weight,
209  const std::string &sep = FST_FLAGS_fst_field_separator) const {
210  std::vector<Label> labels;
211  if (!internal::ConvertStringToLabels(str, token_type_, syms_,
212  unknown_label_, allow_negative_,
213  &labels, sep)) {
214  return false;
215  }
216  Compile(labels, fst, std::move(weight));
217  return true;
218  }
219 
220  private:
221  void Compile(const std::vector<Label> &labels, MutableFst<Arc> *fst,
222  Weight weight = Weight::One()) const {
223  fst->DeleteStates();
224  auto state = fst->AddState();
225  fst->SetStart(state);
226  fst->AddStates(labels.size());
227  for (auto label : labels) {
228  fst->AddArc(state, Arc(label, label, state + 1));
229  ++state;
230  }
231  fst->SetFinal(state, std::move(weight));
233  }
234 
235  template <class Unsigned>
236  void Compile(const std::vector<Label> &labels,
237  CompactStringFst<Arc, Unsigned> *fst) const {
238  using Compactor = typename CompactStringFst<Arc, Unsigned>::Compactor;
239  fst->SetCompactor(
240  std::make_shared<Compactor>(labels.begin(), labels.end()));
241  }
242 
243  template <class Unsigned>
244  void Compile(const std::vector<Label> &labels,
246  Weight weight = Weight::One()) const {
247  std::vector<std::pair<Label, Weight>> compacts;
248  compacts.reserve(labels.size() + 1);
249  for (StateId i = 0; i < static_cast<StateId>(labels.size()) - 1; ++i) {
250  compacts.emplace_back(labels[i], Weight::One());
251  }
252  compacts.emplace_back(!labels.empty() ? labels.back() : kNoLabel, weight);
253  using Compactor =
255  fst->SetCompactor(
256  std::make_shared<Compactor>(compacts.begin(), compacts.end()));
257  }
258 
259  const TokenType token_type_;
260  const SymbolTable *syms_; // Symbol table (used when token type is symbol).
261  const Label unknown_label_; // Label for token missing from symbol table.
262  const bool allow_negative_; // Negative labels allowed?
263 
264  StringCompiler(const StringCompiler &) = delete;
265  StringCompiler &operator=(const StringCompiler &) = delete;
266 };
267 
268 // A useful alias when using StdArc.
270 
271 // Helpers for StringPrinter.
272 
273 // Converts an FST to a vector of output labels. To get input labels, use
274 // Project or Invert. Returns true on success. Use only with string FSTs; may
275 // loop for non-string FSTs.
276 template <class Arc>
278  std::vector<typename Arc::Label> *labels) {
279  labels->clear();
280  auto s = fst.Start();
281  if (s == kNoStateId) {
282  LOG(ERROR) << "StringFstToOutputLabels: Invalid start state";
283  return false;
284  }
285  while (fst.Final(s) == Arc::Weight::Zero()) {
286  ArcIterator<Fst<Arc>> aiter(fst, s);
287  if (aiter.Done()) {
288  LOG(ERROR) << "StringFstToOutputLabels: Does not reach final state";
289  return false;
290  }
291  const auto &arc = aiter.Value();
292  labels->push_back(arc.olabel);
293  s = arc.nextstate;
294  aiter.Next();
295  if (!aiter.Done()) {
296  LOG(ERROR) << "StringFstToOutputLabels: State " << s
297  << " has multiple outgoing arcs";
298  return false;
299  }
300  }
301  if (fst.NumArcs(s) != 0) {
302  LOG(ERROR) << "StringFstToOutputLabels: Final state " << s
303  << " has outgoing arc(s)";
304  return false;
305  }
306  return true;
307 }
308 
309 // Converts a list of symbols to a string. If the token type is SYMBOL, the last
310 // character of sep is used to separate textual symbols. Additionally, if the
311 // token type is SYMBOL, epsilon symbols will be printed only if omit_epsilon
312 // is false. Returns true on success.
313 template <class Label>
315  const std::vector<Label> &labels, std::string *str,
316  TokenType ttype = TokenType::BYTE, const SymbolTable *syms = nullptr,
317  const std::string &sep = FST_FLAGS_fst_field_separator,
318  bool omit_epsilon = true) {
319  switch (ttype) {
320  case TokenType::BYTE: {
321  return LabelsToByteString(labels, str);
322  }
323  case TokenType::UTF8: {
324  return LabelsToUTF8String(labels, str);
325  }
326  case TokenType::SYMBOL: {
327  return syms ? internal::LabelsToSymbolString(labels, str, *syms, sep,
328  omit_epsilon)
329  : internal::LabelsToNumericString(labels, str, sep,
330  omit_epsilon);
331  }
332  }
333  return false;
334 }
335 
336 // Functor for printing a string FST as a string.
337 template <class Arc>
339  public:
340  using Label = typename Arc::Label;
341 
342  explicit StringPrinter(TokenType token_type = TokenType::BYTE,
343  const SymbolTable *syms = nullptr,
344  bool omit_epsilon = true)
345  : token_type_(token_type), syms_(syms), omit_epsilon_(omit_epsilon) {}
346 
347  // Converts the FST into a string. With SYMBOL token type, the last character
348  // of sep is used as a separator between symbols. Returns true on success.
350  const Fst<Arc> &fst, std::string *str,
351  const std::string &sep = FST_FLAGS_fst_field_separator) const {
352  std::vector<Label> labels;
353  return StringFstToOutputLabels(fst, &labels) &&
354  LabelsToString(labels, str, token_type_, syms_, sep, omit_epsilon_);
355  }
356 
357  private:
358  const TokenType token_type_;
359  const SymbolTable *syms_;
360  const bool omit_epsilon_;
361 
362  StringPrinter(const StringPrinter &) = delete;
363  StringPrinter &operator=(const StringPrinter &) = delete;
364 };
365 
366 // A useful alias when using StdArc.
368 
369 } // namespace fst
370 
371 #endif // FST_STRING_H_
const std::string & Name() const
Definition: symbol-table.h:462
void Compile(std::istream &istrm, const std::string &source, const std::string &dest, const std::string &fst_type, const std::string &arc_type, const SymbolTable *isyms, const SymbolTable *osyms, const SymbolTable *ssyms, bool accep, bool ikeep, bool okeep, bool nkeep, bool allow_negative_labels)
Definition: compile.cc:28
bool StringFstToOutputLabels(const Fst< Arc > &fst, std::vector< typename Arc::Label > *labels)
Definition: string.h:277
bool LabelsToString(const std::vector< Label > &labels, std::string *str, TokenType ttype=TokenType::BYTE, const SymbolTable *syms=nullptr, const std::string &sep=FST_FLAGS_fst_field_separator, bool omit_epsilon=true)
Definition: string.h:314
constexpr int kNoLabel
Definition: fst.h:201
bool operator()(std::string_view str, FST *fst, const std::string &sep=FST_FLAGS_fst_field_separator) const
Definition: string.h:192
virtual size_t NumArcs(StateId) const =0
bool ConvertSymbolToLabel(std::string_view str, const SymbolTable *syms, Label unknown_label, bool allow_negative, Label *output)
Definition: string.h:65
typename Arc::StateId StateId
Definition: string.h:175
#define LOG(type)
Definition: log.h:49
constexpr uint64_t kCompiledStringProperties
Definition: properties.h:156
virtual Weight Final(StateId) const =0
virtual void SetStart(StateId)=0
constexpr int64_t kNoSymbol
Definition: symbol-table.h:49
constexpr int kNoStateId
Definition: fst.h:202
internal::StringSplitter StrSplit(std::string_view full, ByAnyChar delim)
Definition: compat.cc:81
const Arc & Value() const
Definition: fst.h:537
typename Arc::Weight Weight
Definition: string.h:176
std::optional< int64_t > ParseInt64(std::string_view s, int base=10)
Definition: util.cc:42
typename Arc::Label Label
Definition: string.h:174
virtual void SetProperties(uint64_t props, uint64_t mask)=0
StringCompiler(TokenType token_type=TokenType::BYTE, const SymbolTable *syms=nullptr, Label unknown_label=kNoLabel, bool allow_negative=false)
Definition: string.h:178
DECLARE_string(fst_field_separator)
std::ostream & operator<<(std::ostream &strm, const ErrorWeight &)
Definition: error-weight.h:70
bool UTF8StringToLabels(std::string_view str, std::vector< Label > *labels)
Definition: icu.h:52
typename Arc::Label Label
Definition: string.h:340
bool ConvertStringToLabels(std::string_view str, TokenType token_type, const SymbolTable *syms, Label unknown_label, bool allow_negative, std::vector< Label > *labels, const std::string &sep=FST_FLAGS_fst_field_separator)
Definition: string.h:92
bool LabelsToSymbolString(const std::vector< Label > &labels, std::string *str, const SymbolTable &syms, std::string_view sep, bool omit_epsilon)
Definition: string.h:126
virtual StateId Start() const =0
bool Done() const
Definition: fst.h:533
bool operator()(std::string_view str, FST *fst, Weight weight, const std::string &sep=FST_FLAGS_fst_field_separator) const
Definition: string.h:207
bool LabelsToUTF8String(const std::vector< Label > &labels, std::string *str)
Definition: icu.h:102
TokenType
Definition: string.h:47
virtual void AddArc(StateId, const Arc &)=0
void SetCompactor(std::shared_ptr< Compactor > compactor)
Definition: compact-fst.h:1185
std::string StrCat(const StringOrInt &s1, const StringOrInt &s2)
Definition: compat.h:279
virtual StateId AddState()=0
bool LabelsToNumericString(const std::vector< Label > &labels, std::string *str, std::string_view sep, bool omit_epsilon)
Definition: string.h:153
virtual void SetFinal(StateId s, Weight weight=Weight::One())=0
virtual void DeleteStates(const std::vector< StateId > &)=0
bool ByteStringToLabels(std::string_view str, std::vector< Label > *labels)
Definition: icu.h:40
std::string Find(int64_t key) const
Definition: symbol-table.h:446
bool LabelsToByteString(const std::vector< Label > &labels, std::string *str)
Definition: icu.h:92
bool operator()(const Fst< Arc > &fst, std::string *str, const std::string &sep=FST_FLAGS_fst_field_separator) const
Definition: string.h:349
StringPrinter(TokenType token_type=TokenType::BYTE, const SymbolTable *syms=nullptr, bool omit_epsilon=true)
Definition: string.h:342
void Next()
Definition: fst.h:541
virtual void AddStates(size_t)=0