FST  openfst-1.7.3
OpenFst Library
compile-strings.h
Go to the documentation of this file.
1 // See www.openfst.org for extensive documentation on this weighted
2 // finite-state transducer library.
3 
4 #ifndef FST_EXTENSIONS_FAR_COMPILE_STRINGS_H_
5 #define FST_EXTENSIONS_FAR_COMPILE_STRINGS_H_
6 
7 #include <libgen.h>
8 
9 #include <fstream>
10 #include <istream>
11 #include <string>
12 #include <vector>
13 
14 #include <fst/extensions/far/far.h>
15 #include <fstream>
16 #include <fst/string.h>
17 
18 namespace fst {
19 
20 // Constructs a reader that provides FSTs from a file (stream) either on a
21 // line-by-line basis or on a per-stream basis. Note that the freshly
22 // constructed reader is already set to the first input.
23 //
24 // Sample usage:
25 //
26 // for (StringReader<Arc> reader(...); !reader.Done(); reader.Next()) {
27 // auto *fst = reader.GetVectorFst();
28 // }
29 template <class Arc>
30 class StringReader {
31  public:
32  using Label = typename Arc::Label;
33  using Weight = typename Arc::Weight;
34 
35  enum EntryType { LINE = 1, FILE = 2 };
36 
37  StringReader(std::istream &istrm, const std::string &source,
38  EntryType entry_type, StringTokenType token_type,
39  bool allow_negative_labels, const SymbolTable *syms = nullptr,
40  Label unknown_label = kNoStateId)
41  : nline_(0),
42  istrm_(istrm),
43  source_(source),
44  entry_type_(entry_type),
45  token_type_(token_type),
46  symbols_(syms),
47  done_(false),
48  compiler_(token_type, syms, unknown_label, allow_negative_labels) {
49  Next(); // Initialize the reader to the first input.
50  }
51 
52  bool Done() { return done_; }
53 
54  void Next() {
55  VLOG(1) << "Processing source " << source_ << " at line " << nline_;
56  if (!istrm_) { // We're done if we have no more input.
57  done_ = true;
58  return;
59  }
60  if (entry_type_ == LINE) {
61  getline(istrm_, content_);
62  ++nline_;
63  } else {
64  content_.clear();
65  std::string line;
66  while (getline(istrm_, line)) {
67  ++nline_;
68  content_.append(line);
69  content_.append("\n");
70  }
71  }
72  if (!istrm_ && content_.empty()) // We're also done if we read off all the
73  done_ = true; // whitespace at the end of a file.
74  }
75 
76  VectorFst<Arc> *GetVectorFst(bool keep_symbols = false) {
77  std::unique_ptr<VectorFst<Arc>> fst(new VectorFst<Arc>());
78  if (keep_symbols) {
79  fst->SetInputSymbols(symbols_);
80  fst->SetOutputSymbols(symbols_);
81  }
82  if (compiler_(content_, fst.get())) {
83  return fst.release();
84  } else {
85  return nullptr;
86  }
87  }
88 
89  CompactStringFst<Arc> *GetCompactFst(bool keep_symbols = false) {
90  std::unique_ptr<CompactStringFst<Arc>> fst;
91  if (keep_symbols) {
92  VectorFst<Arc> tmp;
93  tmp.SetInputSymbols(symbols_);
94  tmp.SetOutputSymbols(symbols_);
95  fst.reset(new CompactStringFst<Arc>(tmp));
96  } else {
97  fst.reset(new CompactStringFst<Arc>());
98  }
99  if (compiler_(content_, fst.get())) {
100  return fst.release();
101  } else {
102  return nullptr;
103  }
104  }
105 
106  private:
107  size_t nline_;
108  std::istream &istrm_;
109  std::string source_;
110  EntryType entry_type_;
111  StringTokenType token_type_;
112  const SymbolTable *symbols_;
113  bool done_;
114  StringCompiler<Arc> compiler_;
115  std::string content_; // The actual content of the input stream's next FST.
116 
117  StringReader(const StringReader &) = delete;
118  StringReader &operator=(const StringReader &) = delete;
119 };
120 
// Computes the minimal length required to encode each line number of the
// given file as a decimal number, or zero if the file is not seekable.
//
// Only declared in this header; the definition lives elsewhere.
// FarCompileStrings() uses the result as the zero-padded width of generated
// keys and treats a return value of zero as an error.
int KeySize(const char *filename);
124 
125 template <class Arc>
126 void FarCompileStrings(const std::vector<std::string> &in_fnames,
127  const std::string &out_fname,
128  const std::string &fst_type, const FarType &far_type,
129  int32 generate_keys, FarEntryType fet, FarTokenType tt,
130  const std::string &symbols_fname,
131  const std::string &unknown_symbol, bool keep_symbols,
132  bool initial_symbols, bool allow_negative_labels,
133  const std::string &key_prefix,
134  const std::string &key_suffix) {
135  typename StringReader<Arc>::EntryType entry_type;
136  if (fet == FET_LINE) {
137  entry_type = StringReader<Arc>::LINE;
138  } else if (fet == FET_FILE) {
139  entry_type = StringReader<Arc>::FILE;
140  } else {
141  FSTERROR() << "FarCompileStrings: Unknown entry type";
142  return;
143  }
144  StringTokenType token_type;
145  if (tt == FTT_SYMBOL) {
146  token_type = StringTokenType::SYMBOL;
147  } else if (tt == FTT_BYTE) {
148  token_type = StringTokenType::BYTE;
149  } else if (tt == FTT_UTF8) {
150  token_type = StringTokenType::UTF8;
151  } else {
152  FSTERROR() << "FarCompileStrings: Unknown token type";
153  return;
154  }
155  bool compact;
156  if (fst_type.empty() || (fst_type == "vector")) {
157  compact = false;
158  } else if (fst_type == "compact") {
159  compact = true;
160  } else {
161  FSTERROR() << "FarCompileStrings: Unknown FST type: " << fst_type;
162  return;
163  }
164  std::unique_ptr<const SymbolTable> syms;
165  typename Arc::Label unknown_label = kNoLabel;
166  if (!symbols_fname.empty()) {
167  const SymbolTableTextOptions opts(allow_negative_labels);
168  syms.reset(SymbolTable::ReadText(symbols_fname, opts));
169  if (!syms) {
170  LOG(ERROR) << "FarCompileStrings: Error reading symbol table: "
171  << symbols_fname;
172  return;
173  }
174  if (!unknown_symbol.empty()) {
175  unknown_label = syms->Find(unknown_symbol);
176  if (unknown_label == kNoLabel) {
177  FSTERROR() << "FarCompileStrings: Label \"" << unknown_label
178  << "\" missing from symbol table: " << symbols_fname;
179  return;
180  }
181  }
182  }
183  std::unique_ptr<FarWriter<Arc>> far_writer(
184  FarWriter<Arc>::Create(out_fname, far_type));
185  if (!far_writer) return;
186  int n = 0;
187  for (const auto &in_fname : in_fnames) {
188  // Don't try to call KeySize("").
189  if (generate_keys == 0 && in_fname.empty()) {
190  FSTERROR() << "FarCompileStrings: Read from a file instead of stdin or"
191  << " set the --generate_keys flag.";
192  return;
193  }
194  const int key_size =
195  generate_keys ? generate_keys : (entry_type == StringReader<Arc>::FILE
196  ? 1 : KeySize(in_fname.c_str()));
197  if (key_size == 0) {
198  FSTERROR() << "FarCompileStrings: " << in_fname << " is not seekable. "
199  << "Read from a file instead or set the --generate_keys flag.";
200  return;
201  }
202  std::ifstream fstrm;
203  if (!in_fname.empty()) {
204  fstrm.open(in_fname);
205  if (!fstrm) {
206  FSTERROR() << "FarCompileStrings: Can't open file: " << in_fname;
207  return;
208  }
209  }
210  std::istream &istrm = fstrm.is_open() ? fstrm : std::cin;
211  bool keep_syms = keep_symbols;
212  for (StringReader<Arc> reader(
213  istrm, in_fname.empty() ? "stdin" : in_fname, entry_type,
214  token_type, allow_negative_labels, syms.get(), unknown_label);
215  !reader.Done(); reader.Next()) {
216  ++n;
217  std::unique_ptr<const Fst<Arc>> fst;
218  if (compact) {
219  fst.reset(reader.GetCompactFst(keep_syms));
220  } else {
221  fst.reset(reader.GetVectorFst(keep_syms));
222  }
223  if (initial_symbols) keep_syms = false;
224  if (!fst) {
225  FSTERROR() << "FarCompileStrings: Compiling string number " << n
226  << " in file " << in_fname << " failed with token_type = "
227  << (tt == FTT_BYTE
228  ? "byte"
229  : (tt == FTT_UTF8
230  ? "utf8"
231  : (tt == FTT_SYMBOL ? "symbol" : "unknown")))
232  << " and entry_type = "
233  << (fet == FET_LINE
234  ? "line"
235  : (fet == FET_FILE ? "file" : "unknown"));
236  return;
237  }
238  std::ostringstream keybuf;
239  keybuf.width(key_size);
240  keybuf.fill('0');
241  keybuf << n;
242  std::string key;
243  if (generate_keys > 0) {
244  key = keybuf.str();
245  } else {
246  auto *filename = new char[in_fname.size() + 1];
247  strcpy(filename, in_fname.c_str());
248  key = basename(filename);
249  if (entry_type != StringReader<Arc>::FILE) {
250  key += "-";
251  key += keybuf.str();
252  }
253  delete[] filename;
254  }
255  far_writer->Add(key_prefix + key + key_suffix, *fst);
256  }
257  if (generate_keys == 0) n = 0;
258  }
259 }
260 
261 } // namespace fst
262 
263 #endif // FST_EXTENSIONS_FAR_COMPILE_STRINGS_H_
constexpr int kNoLabel
Definition: fst.h:178
StringTokenType
Definition: string.h:29
int KeySize(const char *filename)
Definition: strings.cc:19
#define LOG(type)
Definition: log.h:46
constexpr int kNoStateId
Definition: fst.h:179
FarTokenType
Definition: far.h:23
typename Arc::Weight Weight
#define FSTERROR()
Definition: util.h:35
CompactStringFst< Arc > * GetCompactFst(bool keep_symbols=false)
void SetOutputSymbols(const SymbolTable *osyms) override
Definition: mutable-fst.h:382
void FarCompileStrings(const std::vector< std::string > &in_fnames, const std::string &out_fname, const std::string &fst_type, const FarType &far_type, int32 generate_keys, FarEntryType fet, FarTokenType tt, const std::string &symbols_fname, const std::string &unknown_symbol, bool keep_symbols, bool initial_symbols, bool allow_negative_labels, const std::string &key_prefix, const std::string &key_suffix)
FarType
Definition: far.h:71
void SetInputSymbols(const SymbolTable *isyms) override
Definition: mutable-fst.h:377
typename Arc::Label Label
#define VLOG(level)
Definition: log.h:47
VectorFst< Arc > * GetVectorFst(bool keep_symbols=false)
int32_t int32
Definition: types.h:26
static SymbolTable * ReadText(std::istream &strm, const std::string &name, const SymbolTableTextOptions &opts=SymbolTableTextOptions())
Definition: symbol-table.h:248
StringReader(std::istream &istrm, const std::string &source, EntryType entry_type, StringTokenType token_type, bool allow_negative_labels, const SymbolTable *syms=nullptr, Label unknown_label=kNoStateId)
FarEntryType
Definition: far.h:21