FST  openfst-1.7.1
OpenFst Library
string.h
Go to the documentation of this file.
1 // See www.openfst.org for extensive documentation on this weighted
2 // finite-state transducer library.
3 //
4 // Utilities to convert strings into FSTs.
5 
6 #ifndef FST_STRING_H_
7 #define FST_STRING_H_
8 
9 #include <memory>
10 #include <sstream>
11 #include <string>
12 #include <vector>
13 
14 #include <fst/flags.h>
15 #include <fst/log.h>
16 
17 #include <fst/compact-fst.h>
18 #include <fst/icu.h>
19 #include <fst/mutable-fst.h>
20 #include <fst/util.h>
21 
22 
23 DECLARE_string(fst_field_separator);
24 
25 namespace fst {
26 
// Selects how a string is broken into labels (and how labels are rendered
// back into a string).
enum StringTokenType {
  SYMBOL = 1,  // Separator-delimited tokens: symbol names or integers.
  BYTE = 2,    // Converted via ByteStringToLabels / LabelsToByteString.
  UTF8 = 3     // Converted via UTF8StringToLabels / LabelsToUTF8String.
};
28 
29 namespace internal {
30 
31 template <class Label>
32 bool ConvertSymbolToLabel(const char *str, const SymbolTable *syms,
33  Label unknown_label, bool allow_negative,
34  Label *output) {
35  int64 n;
36  if (syms) {
37  n = syms->Find(str);
38  if ((n == -1) && (unknown_label != kNoLabel)) n = unknown_label;
39  if (n == -1 || (!allow_negative && n < 0)) {
40  VLOG(1) << "ConvertSymbolToLabel: Symbol \"" << str
41  << "\" is not mapped to any integer label, symbol table = "
42  << syms->Name();
43  return false;
44  }
45  } else {
46  char *p;
47  n = strtoll(str, &p, 10);
48  if (p < str + strlen(str) || (!allow_negative && n < 0)) {
49  VLOG(1) << "ConvertSymbolToLabel: Bad label integer "
50  << "= \"" << str << "\"";
51  return false;
52  }
53  }
54  *output = n;
55  return true;
56 }
57 
58 template <class Label>
59 bool ConvertStringToLabels(const string &str, StringTokenType token_type,
60  const SymbolTable *syms, Label unknown_label,
61  bool allow_negative, std::vector<Label> *labels) {
62  labels->clear();
63  if (token_type == StringTokenType::BYTE) {
64  labels->reserve(str.size());
65  return ByteStringToLabels(str, labels);
66  } else if (token_type == StringTokenType::UTF8) {
67  return UTF8StringToLabels(str, labels);
68  } else {
69  std::unique_ptr<char[]> c_str(new char[str.size() + 1]);
70  str.copy(c_str.get(), str.size());
71  c_str[str.size()] = 0;
72  std::vector<char *> vec;
73  const string separator = "\n" + FLAGS_fst_field_separator;
74  SplitString(c_str.get(), separator.c_str(), &vec, true);
75  for (const char *c : vec) {
76  Label label;
77  if (!ConvertSymbolToLabel(c, syms, unknown_label, allow_negative,
78  &label)) {
79  return false;
80  }
81  labels->push_back(label);
82  }
83  }
84  return true;
85 }
86 
87 } // namespace internal
88 
89 // Functor for compiling a string in an FST.
90 template <class Arc>
92  public:
93  using Label = typename Arc::Label;
94  using StateId = typename Arc::StateId;
95  using Weight = typename Arc::Weight;
96 
97  explicit StringCompiler(StringTokenType token_type,
98  const SymbolTable *syms = nullptr,
99  Label unknown_label = kNoLabel,
100  bool allow_negative = false)
101  : token_type_(token_type),
102  syms_(syms),
103  unknown_label_(unknown_label),
104  allow_negative_(allow_negative) {}
105 
106  // Compiles string into an FST.
107  template <class FST>
108  bool operator()(const string &str, FST *fst) const {
109  std::vector<Label> labels;
110  if (!internal::ConvertStringToLabels(str, token_type_, syms_,
111  unknown_label_, allow_negative_,
112  &labels)) {
113  return false;
114  }
115  Compile(labels, fst);
116  return true;
117  }
118 
119  template <class FST>
120  bool operator()(const string &str, FST *fst, Weight weight) const {
121  std::vector<Label> labels;
122  if (!internal::ConvertStringToLabels(str, token_type_, syms_,
123  unknown_label_, allow_negative_,
124  &labels)) {
125  return false;
126  }
127  Compile(labels, fst, std::move(weight));
128  return true;
129  }
130 
131  private:
132  void Compile(const std::vector<Label> &labels, MutableFst<Arc> *fst,
133  Weight weight = Weight::One()) const {
134  fst->DeleteStates();
135  while (fst->NumStates() <= labels.size()) fst->AddState();
136  for (StateId i = 0; i < labels.size(); ++i) {
137  fst->AddArc(i, Arc(labels[i], labels[i], Weight::One(), i + 1));
138  }
139  fst->SetStart(0);
140  fst->SetFinal(labels.size(), std::move(weight));
141  }
142 
143  template <class Unsigned>
144  void Compile(const std::vector<Label> &labels,
145  CompactStringFst<Arc, Unsigned> *fst) const {
146  fst->SetCompactElements(labels.begin(), labels.end());
147  }
148 
149  template <class Unsigned>
150  void Compile(const std::vector<Label> &labels,
152  const Weight &weight = Weight::One()) const {
153  std::vector<std::pair<Label, Weight>> compacts;
154  compacts.reserve(labels.size() + 1);
155  for (StateId i = 0; i < static_cast<StateId>(labels.size()) - 1; ++i) {
156  compacts.emplace_back(labels[i], Weight::One());
157  }
158  compacts.emplace_back(!labels.empty() ? labels.back() : kNoLabel, weight);
159  fst->SetCompactElements(compacts.begin(), compacts.end());
160  }
161 
162  const StringTokenType token_type_;
163  const SymbolTable *syms_; // Symbol table (used when token type is symbol).
164  const Label unknown_label_; // Label for token missing from symbol table.
165  const bool allow_negative_; // Negative labels allowed?
166 
167  StringCompiler(const StringCompiler &) = delete;
168  StringCompiler &operator=(const StringCompiler &) = delete;
169 };
170 
171 // Functor for printing a string FST as a string.
172 template <class Arc>
174  public:
175  using Label = typename Arc::Label;
176  using StateId = typename Arc::StateId;
177  using Weight = typename Arc::Weight;
178 
179  explicit StringPrinter(StringTokenType token_type,
180  const SymbolTable *syms = nullptr)
181  : token_type_(token_type), syms_(syms) {}
182 
183  // Converts the FST into a string. If a SYMBOL-based fst, then use the sep as
184  // the separator between symbols. If sep is nullptr, then use the last char
185  // in the FLAGS_fst_field_separator. This is to maintain backwards
186  // compatibility with the code before the sep argument was added.
187  bool operator()(const Fst<Arc> &fst, string *result,
188  const string* sep = nullptr) {
189  if (!FstToLabels(fst)) {
190  VLOG(1) << "StringPrinter::operator(): FST is not a string";
191  return false;
192  }
193  result->clear();
194  if (token_type_ == StringTokenType::SYMBOL) {
195  std::stringstream sstrm;
196  for (size_t i = 0; i < labels_.size(); ++i) {
197  if (i) {
198  if (sep == nullptr) {
199  sstrm << *(FLAGS_fst_field_separator.rbegin());
200  } else {
201  sstrm << *sep;
202  }
203  }
204  if (!PrintLabel(labels_[i], sstrm)) return false;
205  }
206  *result = sstrm.str();
207  } else if (token_type_ == StringTokenType::BYTE) {
208  return LabelsToByteString(labels_, result);
209  } else if (token_type_ == StringTokenType::UTF8) {
210  return LabelsToUTF8String(labels_, result);
211  } else {
212  VLOG(1) << "StringPrinter::operator(): Unknown token type: "
213  << token_type_;
214  return false;
215  }
216  return true;
217  }
218 
219  private:
220  bool FstToLabels(const Fst<Arc> &fst) {
221  labels_.clear();
222  auto s = fst.Start();
223  if (s == kNoStateId) {
224  VLOG(2) << "StringPrinter::FstToLabels: Invalid starting state for "
225  << "string FST";
226  return false;
227  }
228  while (fst.Final(s) == Weight::Zero()) {
229  ArcIterator<Fst<Arc>> aiter(fst, s);
230  if (aiter.Done()) {
231  VLOG(2) << "StringPrinter::FstToLabels: String FST traversal does "
232  << "not reach final state";
233  return false;
234  }
235  const auto &arc = aiter.Value();
236  labels_.push_back(arc.olabel);
237  s = arc.nextstate;
238  if (s == kNoStateId) {
239  VLOG(2) << "StringPrinter::FstToLabels: Transition to invalid state";
240  return false;
241  }
242  aiter.Next();
243  if (!aiter.Done()) {
244  VLOG(2) << "StringPrinter::FstToLabels: State with multiple "
245  << "outgoing arcs found";
246  return false;
247  }
248  }
249  return true;
250  }
251 
252  bool PrintLabel(Label label, std::ostream &ostrm) {
253  if (syms_) {
254  const auto symbol = syms_->Find(label);
255  if (symbol == "") {
256  VLOG(2) << "StringPrinter::PrintLabel: Integer " << label << " is not "
257  << "mapped to any textual symbol, symbol table = "
258  << syms_->Name();
259  return false;
260  }
261  ostrm << symbol;
262  } else {
263  ostrm << label;
264  }
265  return true;
266  }
267 
268  const StringTokenType token_type_;
269  const SymbolTable *syms_; // Symbol table (used when token type is symbol).
270  std::vector<Label> labels_; // Input FST labels.
271 
272  StringPrinter(const StringPrinter &) = delete;
273  StringPrinter &operator=(const StringPrinter &) = delete;
274 };
275 
276 } // namespace fst
277 
278 #endif // FST_STRING_H_
constexpr int kNoLabel
Definition: fst.h:179
StringTokenType
Definition: string.h:27
StringCompiler(StringTokenType token_type, const SymbolTable *syms=nullptr, Label unknown_label=kNoLabel, bool allow_negative=false)
Definition: string.h:97
virtual void AddArc(StateId, const Arc &arc)=0
typename Arc::StateId StateId
Definition: string.h:94
virtual Weight Final(StateId) const =0
virtual void SetStart(StateId)=0
bool LabelsToUTF8String(const std::vector< Label > &labels, string *str)
Definition: icu.h:86
typename Arc::Weight Weight
Definition: string.h:177
bool UTF8StringToLabels(const string &str, std::vector< Label > *labels)
Definition: icu.h:36
constexpr int kNoStateId
Definition: fst.h:180
bool ConvertStringToLabels(const string &str, StringTokenType token_type, const SymbolTable *syms, Label unknown_label, bool allow_negative, std::vector< Label > *labels)
Definition: string.h:59
const Arc & Value() const
Definition: fst.h:503
typename Arc::Weight Weight
Definition: string.h:95
int64_t int64
Definition: types.h:27
bool ByteStringToLabels(const string &str, std::vector< Label > *labels)
Definition: icu.h:24
void SplitString(char *line, const char *delim, std::vector< char * > *vec, bool omit_empty_strings)
Definition: util.cc:22
bool ConvertSymbolToLabel(const char *str, const SymbolTable *syms, Label unknown_label, bool allow_negative, Label *output)
Definition: string.h:32
bool operator()(const string &str, FST *fst, Weight weight) const
Definition: string.h:120
virtual void SetFinal(StateId, Weight)=0
bool operator()(const string &str, FST *fst) const
Definition: string.h:108
typename Arc::Label Label
Definition: string.h:93
DECLARE_string(fst_field_separator)
typename Arc::Label Label
Definition: string.h:175
#define VLOG(level)
Definition: log.h:49
virtual StateId Start() const =0
bool Done() const
Definition: fst.h:499
virtual string Find(int64 key) const
Definition: symbol-table.h:299
StringPrinter(StringTokenType token_type, const SymbolTable *syms=nullptr)
Definition: string.h:179
bool operator()(const Fst< Arc > &fst, string *result, const string *sep=nullptr)
Definition: string.h:187
virtual StateId AddState()=0
bool LabelsToByteString(const std::vector< Label > &labels, string *str)
Definition: icu.h:76
virtual void DeleteStates(const std::vector< StateId > &)=0
virtual const string & Name() const
Definition: symbol-table.h:317
virtual StateId NumStates() const =0
typename Arc::StateId StateId
Definition: string.h:176
void SetCompactElements(const Iterator &b, const Iterator &e)
Definition: compact-fst.h:1123
void Next()
Definition: fst.h:507