FST  openfst-1.8.3
OpenFst Library
linearscript.cc
Go to the documentation of this file.
1 // Copyright 2005-2024 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the 'License');
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an 'AS IS' BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 //
15 // See www.openfst.org for extensive documentation on this weighted
16 // finite-state transducer library.
17 
18 #include <fst/compat.h>
20 
21 #include <cctype>
22 #include <cstddef>
23 #include <functional>
24 #include <set>
25 #include <sstream>
26 #include <string>
27 #include <vector>
28 
29 #include <fst/flags.h>
30 #include <fst/log.h>
32 #include <fst/arc.h>
33 #include <fst/cache.h>
34 #include <fstream>
35 #include <fst/float-weight.h>
36 #include <fst/script/script-impl.h>
37 #include <unordered_map>
38 
// Command-line flags read by this translation unit.
//
// `delimiter` and `empty_symbol` are constrained by the validation code further
// down in this file: the delimiter must be exactly one non-whitespace
// character, and the empty symbol must be non-empty and contain no whitespace.
39 DEFINE_string(delimiter, "|",
40  "Single non-white-space character delimiter inside sequences of "
41  "feature symbols and output symbols");
42 DEFINE_string(empty_symbol, "<empty>",
43  "Special symbol that designates an empty sequence");
44 
// Sentence-boundary marker symbols.
45 DEFINE_string(start_symbol, "<s>", "Start of sentence symbol");
46 DEFINE_string(end_symbol, "</s>", "End of sentence symbol");
47 
// Selects classifier mode instead of the default tagger mode.
48 DEFINE_bool(classifier, false,
49  "Treat input model as a classifier instead of a tagger");
50 
51 namespace fst {
52 namespace script {
53 
55  return FST_FLAGS_delimiter.size() == 1 &&
56  !std::isspace(FST_FLAGS_delimiter[0]);
57 }
58 
60  bool okay = !FST_FLAGS_empty_symbol.empty();
61  for (size_t i = 0; i < FST_FLAGS_empty_symbol.size(); ++i) {
62  char c = FST_FLAGS_empty_symbol[i];
63  if (std::isspace(c)) okay = false;
64  }
65  return okay;
66 }
67 
68 void LinearCompile(const std::string &arc_type,
69  const std::string &epsilon_symbol,
70  const std::string &unknown_symbol, const std::string &vocab,
71  char **models, int models_len, const std::string &out,
72  const std::string &save_isymbols,
73  const std::string &save_fsymbols,
74  const std::string &save_osymbols) {
75  LinearCompileArgs args(epsilon_symbol, unknown_symbol, vocab, models,
76  models_len, out, save_isymbols, save_fsymbols,
77  save_osymbols);
78  Apply<Operation<LinearCompileArgs>>("LinearCompileTpl", arc_type, &args);
79 }
80 
82 
// Splits `str` into whitespace-separated tokens.
//
// Any previous contents of `*out` are discarded. Tokens are appended in the
// order they appear; leading, trailing and repeated whitespace produce no
// empty tokens, so an empty or all-whitespace input leaves `*out` empty.
void SplitByWhitespace(const std::string &str, std::vector<std::string> *out) {
  out->clear();
  std::istringstream tokens(str);
  // Stream extraction skips whitespace and stops at end of input.
  for (std::string token; tokens >> token;) {
    out->push_back(token);
  }
}
89 
90 int ScanNumClasses(char **models, int models_len) {
91  std::set<std::string, std::less<>> preds;
92  for (int i = 0; i < models_len; ++i) {
93  std::ifstream in(models[i]);
94  if (!in) LOG(FATAL) << "Failed to open " << models[i];
95  std::string line;
96  std::getline(in, line);
97  size_t num_line = 1;
98  while (std::getline(in, line)) {
99  ++num_line;
100  std::vector<std::string> fields;
101  SplitByWhitespace(line, &fields);
102  if (fields.size() != 3)
103  LOG(FATAL) << "Wrong number of fields in source " << models[i]
104  << ", line " << num_line;
105  preds.insert(fields[1]);
106  }
107  }
108  return preds.size();
109 }
110 
111 } // namespace script
112 } // namespace fst
void SplitByWhitespace(const std::string &str, std::vector< std::string > *out)
Definition: linearscript.cc:83
REGISTER_FST_OPERATION_3ARCS(Compress, CompressArgs)
std::tuple< const std::string &, const std::string &, const std::string &, char **, int, const std::string &, const std::string &, const std::string &, const std::string & > LinearCompileArgs
Definition: linearscript.h:52
#define LOG(type)
Definition: log.h:53
DEFINE_string(delimiter,"|","Single non-white-space character delimiter inside sequences of ""feature symbols and output symbols")
bool ValidateEmptySymbol()
Definition: linearscript.cc:59
int ScanNumClasses(char **models, int models_len)
Definition: linearscript.cc:90
void LinearCompile(const std::string &arc_type, const std::string &epsilon_symbol, const std::string &unknown_symbol, const std::string &vocab, char **models, int models_len, const std::string &out, const std::string &save_isymbols, const std::string &save_fsymbols, const std::string &save_osymbols)
Definition: linearscript.cc:68
DEFINE_bool(classifier, false,"Treat input model as a classifier instead of a tagger")
bool ValidateDelimiter()
Definition: linearscript.cc:54
void LinearCompileTpl(LinearCompileArgs *args)
Definition: linearscript.h:302