FST  openfst-1.8.4
OpenFst Library
linearscript.cc
Go to the documentation of this file.
1 // Copyright 2005-2024 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the 'License');
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an 'AS IS' BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 //
15 // See www.openfst.org for extensive documentation on this weighted
16 // finite-state transducer library.
17 
18 #include <fst/compat.h>
20 
21 #include <cctype>
22 #include <cstddef>
23 #include <functional>
24 #include <set>
25 #include <sstream>
26 #include <string>
27 #include <vector>
28 
29 #include <fst/flags.h>
30 #include <fst/log.h>
31 #include <fstream>
32 #include <fst/script/script-impl.h>
33 
34 DEFINE_string(delimiter, "|",
35  "Single non-white-space character delimiter inside sequences of "
36  "feature symbols and output symbols");
37 DEFINE_string(empty_symbol, "<empty>",
38  "Special symbol that designates an empty sequence");
39 
40 DEFINE_string(start_symbol, "<s>", "Start of sentence symbol");
41 DEFINE_string(end_symbol, "</s>", "End of sentence symbol");
42 
43 DEFINE_bool(classifier, false,
44  "Treat input model as a classifier instead of a tagger");
45 
46 namespace fst {
47 namespace script {
48 
50  return FST_FLAGS_delimiter.size() == 1 &&
51  !std::isspace(FST_FLAGS_delimiter[0]);
52 }
53 
55  bool okay = !FST_FLAGS_empty_symbol.empty();
56  for (size_t i = 0; i < FST_FLAGS_empty_symbol.size(); ++i) {
57  char c = FST_FLAGS_empty_symbol[i];
58  if (std::isspace(c)) okay = false;
59  }
60  return okay;
61 }
62 
63 void LinearCompile(const std::string &arc_type,
64  const std::string &epsilon_symbol,
65  const std::string &unknown_symbol, const std::string &vocab,
66  char **models, int models_len, const std::string &out,
67  const std::string &save_isymbols,
68  const std::string &save_fsymbols,
69  const std::string &save_osymbols) {
70  LinearCompileArgs args(epsilon_symbol, unknown_symbol, vocab, models,
71  models_len, out, save_isymbols, save_fsymbols,
72  save_osymbols);
73  Apply<Operation<LinearCompileArgs>>("LinearCompileTpl", arc_type, &args);
74 }
75 
77 
// Splits `str` into its maximal runs of non-white-space characters.
//
// `out` is cleared first and then receives the tokens in order of appearance.
// Leading, trailing and repeated white space produce no empty tokens, so an
// empty or all-white-space `str` leaves `out` empty.
void SplitByWhitespace(const std::string &str, std::vector<std::string> *out) {
  out->clear();
  std::istringstream strm(str);
  std::string token;
  // Formatted extraction skips white space and stops at end of input.
  while (strm >> token) out->push_back(token);
}
84 
85 int ScanNumClasses(char **models, int models_len) {
86  std::set<std::string, std::less<>> preds;
87  for (int i = 0; i < models_len; ++i) {
88  std::ifstream in(models[i]);
89  if (!in) LOG(FATAL) << "Failed to open " << models[i];
90  std::string line;
91  std::getline(in, line);
92  size_t num_line = 1;
93  while (std::getline(in, line)) {
94  ++num_line;
95  std::vector<std::string> fields;
96  SplitByWhitespace(line, &fields);
97  if (fields.size() != 3)
98  LOG(FATAL) << "Wrong number of fields in source " << models[i]
99  << ", line " << num_line;
100  preds.insert(fields[1]);
101  }
102  }
103  return preds.size();
104 }
105 
106 } // namespace script
107 } // namespace fst
void SplitByWhitespace(const std::string &str, std::vector< std::string > *out)
Definition: linearscript.cc:78
REGISTER_FST_OPERATION_3ARCS(Compress, CompressArgs)
std::tuple< const std::string &, const std::string &, const std::string &, char **, int, const std::string &, const std::string &, const std::string &, const std::string & > LinearCompileArgs
Definition: linearscript.h:52
#define LOG(type)
Definition: log.h:53
DEFINE_string(delimiter,"|","Single non-white-space character delimiter inside sequences of ""feature symbols and output symbols")
bool ValidateEmptySymbol()
Definition: linearscript.cc:54
int ScanNumClasses(char **models, int models_len)
Definition: linearscript.cc:85
void LinearCompile(const std::string &arc_type, const std::string &epsilon_symbol, const std::string &unknown_symbol, const std::string &vocab, char **models, int models_len, const std::string &out, const std::string &save_isymbols, const std::string &save_fsymbols, const std::string &save_osymbols)
Definition: linearscript.cc:63
DEFINE_bool(classifier, false,"Treat input model as a classifier instead of a tagger")
bool ValidateDelimiter()
Definition: linearscript.cc:49
void LinearCompileTpl(LinearCompileArgs *args)
Definition: linearscript.h:302