FST  openfst-1.8.3
OpenFst Library
compile-strings.h
Go to the documentation of this file.
1 // Copyright 2005-2024 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the 'License');
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an 'AS IS' BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 //
15 // See www.openfst.org for extensive documentation on this weighted
16 // finite-state transducer library.
17 
18 #ifndef FST_EXTENSIONS_FAR_COMPILE_STRINGS_H_
19 #define FST_EXTENSIONS_FAR_COMPILE_STRINGS_H_
20 
21 #include <libgen.h>
22 
23 #include <cstddef>
24 #include <cstdint>
25 #include <cstring>
26 #include <fstream>
27 #include <iostream>
28 #include <istream>
29 #include <memory>
30 #include <sstream>
31 #include <string>
32 #include <vector>
33 
34 #include <fst/log.h>
35 #include <fst/extensions/far/far.h>
36 #include <fst/compact-fst.h>
37 #include <fstream>
38 #include <fst/fst.h>
39 #include <fst/string.h>
40 #include <fst/symbol-table.h>
41 #include <fst/util.h>
42 #include <fst/vector-fst.h>
43 #include <string_view>
44 
45 namespace fst {
46 namespace internal {
47 
48 // Constructs a reader that provides FSTs from a file (stream) either on a
49 // line-by-line basis or on a per-stream basis. Note that the freshly
50 // constructed reader is already set to the first input.
51 //
52 // Sample usage:
53 //
54 // for (StringReader<Arc> reader(...); !reader.Done(); reader.Next()) {
55 // auto *fst = reader.GetVectorFst();
56 // }
57 template <class Arc>
58 class StringReader {
59  public:
60  using Label = typename Arc::Label;
61  using Weight = typename Arc::Weight;
62 
63  StringReader(std::istream &istrm, const std::string &source,
64  FarEntryType entry_type, TokenType token_type,
65  const SymbolTable *syms = nullptr,
66  Label unknown_label = kNoStateId)
67  : nline_(0),
68  istrm_(istrm),
69  source_(source),
70  entry_type_(entry_type),
71  token_type_(token_type),
72  symbols_(syms),
73  done_(false),
74  compiler_(token_type, syms, unknown_label) {
75  Next(); // Initialize the reader to the first input.
76  }
77 
78  bool Done() { return done_; }
79 
80  void Next() {
81  VLOG(1) << "Processing source " << source_ << " at line " << nline_;
82  if (!istrm_) { // We're done if we have no more input.
83  done_ = true;
84  return;
85  }
86  if (entry_type_ == FarEntryType::LINE) {
87  std::getline(istrm_, content_);
88  ++nline_;
89  } else {
90  content_.clear();
91  std::string line;
92  while (std::getline(istrm_, line)) {
93  ++nline_;
94  content_.append(line);
95  content_.append("\n");
96  }
97  }
98  if (!istrm_ && content_.empty()) // We're also done if we read off all the
99  done_ = true; // whitespace at the end of a file.
100  }
101 
102  VectorFst<Arc> *GetVectorFst(bool keep_symbols = false) {
103  std::unique_ptr<VectorFst<Arc>> fst(new VectorFst<Arc>());
104  if (keep_symbols) {
105  fst->SetInputSymbols(symbols_);
106  fst->SetOutputSymbols(symbols_);
107  }
108  if (compiler_(content_, fst.get())) {
109  return fst.release();
110  } else {
111  return nullptr;
112  }
113  }
114 
115  CompactStringFst<Arc> *GetCompactFst(bool keep_symbols = false) {
116  std::unique_ptr<CompactStringFst<Arc>> fst;
117  if (keep_symbols) {
118  VectorFst<Arc> tmp;
119  tmp.SetInputSymbols(symbols_);
120  tmp.SetOutputSymbols(symbols_);
121  fst.reset(new CompactStringFst<Arc>(tmp));
122  } else {
123  fst.reset(new CompactStringFst<Arc>());
124  }
125  if (compiler_(content_, fst.get())) {
126  return fst.release();
127  } else {
128  return nullptr;
129  }
130  }
131 
132  private:
133  size_t nline_;
134  std::istream &istrm_;
135  std::string source_;
136  FarEntryType entry_type_;
137  TokenType token_type_;
138  const SymbolTable *symbols_;
139  bool done_;
140  StringCompiler<Arc> compiler_;
141  std::string content_; // The actual content of the input stream's next FST.
142 
143  StringReader(const StringReader &) = delete;
144  StringReader &operator=(const StringReader &) = delete;
145 };
146 
147 // Computes the minimal number of decimal digits required to encode every line
148 // number of `source`, or returns zero if the file is not seekable.
149 int KeySize(std::string_view source);
150 
151 } // namespace internal
152 
153 template <class Arc>
154 void CompileStrings(const std::vector<std::string> &sources,
155  FarWriter<Arc> &writer, std::string_view fst_type,
156  int32_t generate_keys, FarEntryType entry_type,
157  TokenType token_type, const std::string &symbols_source,
158  const std::string &unknown_symbol, bool keep_symbols,
159  bool initial_symbols, const std::string &key_prefix,
160  const std::string &key_suffix) {
161  bool compact;
162  if (fst_type.empty() || (fst_type == "vector")) {
163  compact = false;
164  } else if (fst_type == "compact") {
165  compact = true;
166  } else {
167  FSTERROR() << "CompileStrings: Unknown FST type: " << fst_type;
168  return;
169  }
170  std::unique_ptr<const SymbolTable> syms;
171  typename Arc::Label unknown_label = kNoLabel;
172  if (!symbols_source.empty()) {
173  syms.reset(SymbolTable::ReadText(symbols_source,
174  FST_FLAGS_fst_field_separator));
175  if (!syms) {
176  LOG(ERROR) << "CompileStrings: Error reading symbol table: "
177  << symbols_source;
178  return;
179  }
180  if (!unknown_symbol.empty()) {
181  unknown_label = syms->Find(unknown_symbol);
182  if (unknown_label == kNoLabel) {
183  FSTERROR() << "CompileStrings: Label \"" << unknown_label
184  << "\" missing from symbol table: " << symbols_source;
185  return;
186  }
187  }
188  }
189  int n = 0;
190  for (const auto &in_source : sources) {
191  // Don't try to call KeySize("").
192  if (generate_keys == 0 && in_source.empty()) {
193  FSTERROR() << "CompileStrings: Read from a file instead of stdin or"
194  << " set the --generate_keys flag.";
195  return;
196  }
197  const int key_size = generate_keys ? generate_keys
198  : (entry_type == FarEntryType::FILE
199  ? 1
200  : internal::KeySize(in_source));
201  if (key_size == 0) {
202  FSTERROR() << "CompileStrings: " << in_source << " is not seekable. "
203  << "Read from a file instead or set the --generate_keys flag.";
204  return;
205  }
206  std::ifstream fstrm;
207  if (!in_source.empty()) {
208  fstrm.open(in_source);
209  if (!fstrm) {
210  FSTERROR() << "CompileStrings: Can't open file: " << in_source;
211  return;
212  }
213  }
214  std::istream &istrm = fstrm.is_open() ? fstrm : std::cin;
215  bool keep_syms = keep_symbols;
216  for (internal::StringReader<Arc> reader(
217  istrm, in_source.empty() ? "stdin" : in_source, entry_type,
218  token_type, syms.get(), unknown_label);
219  !reader.Done(); reader.Next()) {
220  ++n;
221  std::unique_ptr<const Fst<Arc>> fst;
222  if (compact) {
223  fst.reset(reader.GetCompactFst(keep_syms));
224  } else {
225  fst.reset(reader.GetVectorFst(keep_syms));
226  }
227  if (initial_symbols) keep_syms = false;
228  if (!fst) {
229  FSTERROR() << "CompileStrings: Compiling string number " << n
230  << " in file " << in_source
231  << " failed with token_type = " << token_type
232  << " and entry_type = "
233  << (entry_type == FarEntryType::LINE
234  ? "line"
235  : (entry_type == FarEntryType::FILE ? "file"
236  : "unknown"));
237  return;
238  }
239  std::ostringstream keybuf;
240  keybuf.width(key_size);
241  keybuf.fill('0');
242  keybuf << n;
243  std::string key;
244  if (generate_keys > 0) {
245  key = keybuf.str();
246  } else {
247  auto source =
248  fst::make_unique_for_overwrite<char[]>(in_source.size() + 1);
249  strcpy(source.get(), in_source.c_str()); // NOLINT(runtime/printf)
250  key = basename(source.get());
251  if (entry_type != FarEntryType::FILE) {
252  key += "-";
253  key += keybuf.str();
254  }
255  }
256  writer.Add(key_prefix + key + key_suffix, *fst);
257  }
258  if (generate_keys == 0) n = 0;
259  }
260 }
261 
262 } // namespace fst
263 
264 #endif // FST_EXTENSIONS_FAR_COMPILE_STRINGS_H_
constexpr int kNoLabel
Definition: fst.h:195
#define LOG(type)
Definition: log.h:53
constexpr int kNoStateId
Definition: fst.h:196
typename Arc::Label Label
VectorFst< Arc > * GetVectorFst(bool keep_symbols=false)
virtual void Add(std::string_view key, const Fst< Arc > &fst)=0
#define FSTERROR()
Definition: util.h:56
void SetOutputSymbols(const SymbolTable *osyms) override
Definition: mutable-fst.h:401
int KeySize(std::string_view source)
void SetInputSymbols(const SymbolTable *isyms) override
Definition: mutable-fst.h:396
StringReader(std::istream &istrm, const std::string &source, FarEntryType entry_type, TokenType token_type, const SymbolTable *syms=nullptr, Label unknown_label=kNoStateId)
#define VLOG(level)
Definition: log.h:54
static SymbolTable * ReadText(std::istream &strm, std::string_view name, const std::string &sep=FST_FLAGS_fst_field_separator)
Definition: symbol-table.h:381
TokenType
Definition: string.h:49
CompactStringFst< Arc > * GetCompactFst(bool keep_symbols=false)
void CompileStrings(const std::vector< std::string > &sources, FarWriter< Arc > &writer, std::string_view fst_type, int32_t generate_keys, FarEntryType entry_type, TokenType token_type, const std::string &symbols_source, const std::string &unknown_symbol, bool keep_symbols, bool initial_symbols, const std::string &key_prefix, const std::string &key_suffix)
typename Arc::Weight Weight
FarEntryType
Definition: far.h:49