FST  openfst-1.7.2
OpenFst Library
string.h
Go to the documentation of this file.
1 // See www.openfst.org for extensive documentation on this weighted
2 // finite-state transducer library.
3 //
4 // Utilities to convert strings into FSTs.
5 
6 #ifndef FST_STRING_H_
7 #define FST_STRING_H_
8 
9 #include <memory>
10 #include <sstream>
11 #include <string>
12 #include <vector>
13 
14 #include <fst/flags.h>
15 #include <fst/log.h>
16 
17 #include <fst/compact-fst.h>
18 #include <fst/icu.h>
19 #include <fst/mutable-fst.h>
20 #include <fst/util.h>
21 
22 
23 DECLARE_string(fst_field_separator);
24 
25 namespace fst {
26 
// Selects how a string is split into labels (and how labels are rendered back
// into a string).
enum StringTokenType {
  SYMBOL = 1,  // Separator-delimited tokens mapped through a symbol table.
  BYTE = 2,    // Handled by ByteStringToLabels / LabelsToByteString.
  UTF8 = 3     // Handled by UTF8StringToLabels / LabelsToUTF8String.
};
28 
29 namespace internal {
30 
31 template <class Label>
32 bool ConvertSymbolToLabel(const char *str, const SymbolTable *syms,
33  Label unknown_label, bool allow_negative,
34  Label *output) {
35  int64 n;
36  if (syms) {
37  n = syms->Find(str);
38  if ((n == -1) && (unknown_label != kNoLabel)) n = unknown_label;
39  if (n == -1 || (!allow_negative && n < 0)) {
40  LOG(ERROR) << "ConvertSymbolToLabel: Symbol \"" << str
41  << "\" is not mapped to any integer label, symbol table = "
42  << syms->Name();
43  return false;
44  }
45  } else {
46  char *p;
47  n = strtoll(str, &p, 10);
48  if (p < str + strlen(str) || (!allow_negative && n < 0)) {
49  LOG(ERROR) << "ConvertSymbolToLabel: Bad label integer "
50  << "= \"" << str << "\"";
51  return false;
52  }
53  }
54  *output = n;
55  return true;
56 }
57 
58 template <class Label>
59 bool ConvertStringToLabels(const string &str, StringTokenType token_type,
60  const SymbolTable *syms, Label unknown_label,
61  bool allow_negative, std::vector<Label> *labels) {
62  labels->clear();
63  if (token_type == StringTokenType::BYTE) {
64  labels->reserve(str.size());
65  return ByteStringToLabels(str, labels);
66  } else if (token_type == StringTokenType::UTF8) {
67  return UTF8StringToLabels(str, labels);
68  } else {
69  std::unique_ptr<char[]> c_str(new char[str.size() + 1]);
70  str.copy(c_str.get(), str.size());
71  c_str[str.size()] = 0;
72  std::vector<char *> vec;
73  const string separator = "\n" + FLAGS_fst_field_separator;
74  SplitString(c_str.get(), separator.c_str(), &vec, true);
75  for (const char *c : vec) {
76  Label label;
77  if (!ConvertSymbolToLabel(c, syms, unknown_label, allow_negative,
78  &label)) {
79  return false;
80  }
81  labels->push_back(label);
82  }
83  }
84  return true;
85 }
86 
87 // The sep string is used as a separator between symbols, unless it is nullptr,
88 // in which case the last character of FLAGS_fst_field_separator is used.
89 template <class Label>
90 bool LabelsToSymbolString(const std::vector<Label> &labels, string *str,
91  const SymbolTable &syms,
92  const string *sep = nullptr) {
93  std::stringstream ostrm;
94  string delim = "";
95  for (auto label : labels) {
96  ostrm << delim;
97  const string &symbol = syms.Find(label);
98  if (symbol.empty()) {
99  LOG(ERROR) << "LabelToSymbolString: Label " << label
100  << " is not mapped onto any textual symbol in symbol table "
101  << syms.Name();
102  return false;
103  }
104  ostrm << symbol;
105  delim = sep != nullptr ? *sep : string(1, FLAGS_fst_field_separator.back());
106  }
107  *str = ostrm.str();
108  return !!ostrm;
109 }
110 
111 } // namespace internal
112 
113 // TODO(kbg): Move this just before StringPrinter.
114 // Converts an FST to a vector of output labels. To get input labels, use
115 // Project or Invert. Returns true on success. Use only with string FSTs; may
116 // loop for non-string FSTs.
117 template <class Arc>
119  std::vector<typename Arc::Label> *labels) {
120  labels->clear();
121  auto s = fst.Start();
122  if (s == kNoStateId) {
123  LOG(ERROR) << "StringFstToOutputLabels: Invalid start state";
124  return false;
125  }
126  while (fst.Final(s) == Arc::Weight::Zero()) {
127  ArcIterator<Fst<Arc>> aiter(fst, s);
128  if (aiter.Done()) {
129  LOG(ERROR) << "StringFstToOutputLabels: Does not reach final state";
130  return false;
131  }
132  const auto &arc = aiter.Value();
133  labels->push_back(arc.olabel);
134  s = arc.nextstate;
135  aiter.Next();
136  if (!aiter.Done()) {
137  LOG(ERROR) << "StringFstToOutputLabels: State has multiple outgoing arcs";
138  return false;
139  }
140  }
141  if (fst.NumArcs(s) != 0) {
142  LOG(ERROR) << "FstToOutputLabels: Final state has outgoing arc(s)";
143  return false;
144  }
145  return true;
146 }
147 
148 // TODO(kbg): Move this just before StringPrinter.
149 // Converts a list of symbols to a string. Returns true on success. If the token
150 // type is SYMBOL and sep is provided, it is used to separate textual symbols.
151 // If the token type is SYMBOL and it is not provided, the last character of
152 // FLAGS_fst_field_separator is used.
153 template <class Label>
154 bool LabelsToString(const std::vector<Label> &labels, string *str,
155  StringTokenType ttype = BYTE,
156  const SymbolTable *syms = nullptr,
157  const string *sep = nullptr) {
158  switch (ttype) {
159  case StringTokenType::BYTE: {
160  return LabelsToByteString(labels, str);
161  }
162  case StringTokenType::UTF8: {
163  return LabelsToUTF8String(labels, str);
164  }
166  return internal::LabelsToSymbolString(labels, str, *syms, sep);
167  }
168  }
169  return false;
170 }
171 
172 // Functor for compiling a string in an FST.
173 template <class Arc>
175  public:
176  using Label = typename Arc::Label;
177  using StateId = typename Arc::StateId;
178  using Weight = typename Arc::Weight;
179 
180  explicit StringCompiler(StringTokenType token_type,
181  const SymbolTable *syms = nullptr,
182  Label unknown_label = kNoLabel,
183  bool allow_negative = false)
184  : token_type_(token_type),
185  syms_(syms),
186  unknown_label_(unknown_label),
187  allow_negative_(allow_negative) {}
188 
189  // Compiles string into an FST.
190  template <class FST>
191  bool operator()(const string &str, FST *fst) const {
192  std::vector<Label> labels;
193  if (!internal::ConvertStringToLabels(str, token_type_, syms_,
194  unknown_label_, allow_negative_,
195  &labels)) {
196  return false;
197  }
198  Compile(labels, fst);
199  return true;
200  }
201 
202  template <class FST>
203  bool operator()(const string &str, FST *fst, Weight weight) const {
204  std::vector<Label> labels;
205  if (!internal::ConvertStringToLabels(str, token_type_, syms_,
206  unknown_label_, allow_negative_,
207  &labels)) {
208  return false;
209  }
210  Compile(labels, fst, std::move(weight));
211  return true;
212  }
213 
214  private:
215  void Compile(const std::vector<Label> &labels, MutableFst<Arc> *fst,
216  Weight weight = Weight::One()) const {
217  fst->DeleteStates();
218  while (fst->NumStates() <= labels.size()) fst->AddState();
219  for (StateId i = 0; i < labels.size(); ++i) {
220  fst->AddArc(i, Arc(labels[i], labels[i], Weight::One(), i + 1));
221  }
222  fst->SetStart(0);
223  fst->SetFinal(labels.size(), std::move(weight));
224  }
225 
226  template <class Unsigned>
227  void Compile(const std::vector<Label> &labels,
228  CompactStringFst<Arc, Unsigned> *fst) const {
229  fst->SetCompactElements(labels.begin(), labels.end());
230  }
231 
232  template <class Unsigned>
233  void Compile(const std::vector<Label> &labels,
235  const Weight &weight = Weight::One()) const {
236  std::vector<std::pair<Label, Weight>> compacts;
237  compacts.reserve(labels.size() + 1);
238  for (StateId i = 0; i < static_cast<StateId>(labels.size()) - 1; ++i) {
239  compacts.emplace_back(labels[i], Weight::One());
240  }
241  compacts.emplace_back(!labels.empty() ? labels.back() : kNoLabel, weight);
242  fst->SetCompactElements(compacts.begin(), compacts.end());
243  }
244 
245  const StringTokenType token_type_;
246  const SymbolTable *syms_; // Symbol table (used when token type is symbol).
247  const Label unknown_label_; // Label for token missing from symbol table.
248  const bool allow_negative_; // Negative labels allowed?
249 
250  StringCompiler(const StringCompiler &) = delete;
251  StringCompiler &operator=(const StringCompiler &) = delete;
252 };
253 
254 // Helpers for StringPrinter.
255 
256 // Functor for printing a string FST as a string.
257 template <class Arc>
259  public:
260  using Label = typename Arc::Label;
261 
262  explicit StringPrinter(StringTokenType token_type,
263  const SymbolTable *syms = nullptr)
264  : token_type_(token_type), syms_(syms) {}
265 
266  // Converts the FST into a string. With SYMBOL token type, sep is used as a
267  // separator between symbols, unless it is nullptr, in which case the last
268  // character of FLAGS_fst_field_separator is used. Returns true on success.
269  bool operator()(const Fst<Arc> &fst, string *str,
270  const string *sep = nullptr) const {
271  std::vector<Label> labels;
272  return (StringFstToOutputLabels(fst, &labels) &&
273  LabelsToString(labels, str, token_type_, syms_, sep));
274  }
275 
276  private:
277  const StringTokenType token_type_;
278  const SymbolTable *syms_;
279 
280  StringPrinter(const StringPrinter &) = delete;
281  StringPrinter &operator=(const StringPrinter &) = delete;
282 };
283 
284 } // namespace fst
285 
286 #endif // FST_STRING_H_
bool StringFstToOutputLabels(const Fst< Arc > &fst, std::vector< typename Arc::Label > *labels)
Definition: string.h:118
constexpr int kNoLabel
Definition: fst.h:179
bool LabelsToString(const std::vector< Label > &labels, string *str, StringTokenType ttype=BYTE, const SymbolTable *syms=nullptr, const string *sep=nullptr)
Definition: string.h:154
StringTokenType
Definition: string.h:27
virtual size_t NumArcs(StateId) const =0
StringCompiler(StringTokenType token_type, const SymbolTable *syms=nullptr, Label unknown_label=kNoLabel, bool allow_negative=false)
Definition: string.h:180
virtual void AddArc(StateId, const Arc &arc)=0
typename Arc::StateId StateId
Definition: string.h:177
#define LOG(type)
Definition: log.h:48
virtual Weight Final(StateId) const =0
virtual void SetStart(StateId)=0
bool LabelsToUTF8String(const std::vector< Label > &labels, string *str)
Definition: icu.h:86
bool UTF8StringToLabels(const string &str, std::vector< Label > *labels)
Definition: icu.h:36
constexpr int kNoStateId
Definition: fst.h:180
bool ConvertStringToLabels(const string &str, StringTokenType token_type, const SymbolTable *syms, Label unknown_label, bool allow_negative, std::vector< Label > *labels)
Definition: string.h:59
const Arc & Value() const
Definition: fst.h:503
typename Arc::Weight Weight
Definition: string.h:178
int64_t int64
Definition: types.h:27
bool ByteStringToLabels(const string &str, std::vector< Label > *labels)
Definition: icu.h:24
void SplitString(char *line, const char *delim, std::vector< char * > *vec, bool omit_empty_strings)
Definition: util.cc:22
bool ConvertSymbolToLabel(const char *str, const SymbolTable *syms, Label unknown_label, bool allow_negative, Label *output)
Definition: string.h:32
bool operator()(const string &str, FST *fst, Weight weight) const
Definition: string.h:203
virtual void SetFinal(StateId, Weight)=0
bool operator()(const string &str, FST *fst) const
Definition: string.h:191
typename Arc::Label Label
Definition: string.h:176
DECLARE_string(fst_field_separator)
typename Arc::Label Label
Definition: string.h:260
virtual StateId Start() const =0
bool Done() const
Definition: fst.h:499
virtual string Find(int64 key) const
Definition: symbol-table.h:301
StringPrinter(StringTokenType token_type, const SymbolTable *syms=nullptr)
Definition: string.h:262
virtual StateId AddState()=0
bool LabelsToByteString(const std::vector< Label > &labels, string *str)
Definition: icu.h:76
virtual void DeleteStates(const std::vector< StateId > &)=0
virtual const string & Name() const
Definition: symbol-table.h:319
bool operator()(const Fst< Arc > &fst, string *str, const string *sep=nullptr) const
Definition: string.h:269
bool LabelsToSymbolString(const std::vector< Label > &labels, string *str, const SymbolTable &syms, const string *sep=nullptr)
Definition: string.h:90
virtual StateId NumStates() const =0
void SetCompactElements(const Iterator &b, const Iterator &e)
Definition: compact-fst.h:1123
void Next()
Definition: fst.h:507