FST  openfst-1.8.3
OpenFst Library
icu.h
Go to the documentation of this file.
1 // Copyright 2005-2024 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the 'License');
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an 'AS IS' BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 //
15 // See www.openfst.org for extensive documentation on this weighted
16 // finite-state transducer library.
17 //
18 // This library implements an unrestricted Thompson/Pike UTF-8 parser and
19 // serializer. UTF-8 is a restricted subset of this byte stream encoding. For
20 // a description of the encoding details, see:
21 //
22 // http://en.wikipedia.org/wiki/UTF-8
23 
24 #ifndef FST_ICU_H_
25 #define FST_ICU_H_
26 
27 #include <cstdint>
28 #include <sstream>
29 #include <string>
30 #include <vector>
31 
32 #include <fst/log.h>
33 #include <string_view>
34 
35 namespace fst {
36 
// Appends the bytes of `str` to `*labels`, one label per byte. Each byte is
// read as an unsigned value (0-255) before being converted to `Label`, so a
// Label type with at least 8 bits of precision holds every byte value.
// Labels already present in `*labels` are kept. Always returns `true`; the
// boolean result exists only so the signature matches the other converters.
template <class Label>
bool ByteStringToLabels(std::string_view str, std::vector<Label> *labels) {
  for (auto pos = str.cbegin(); pos != str.cend(); ++pos) {
    // Go through unsigned char so bytes >= 0x80 do not become negative.
    const auto byte = static_cast<unsigned char>(*pos);
    labels->push_back(byte);
  }
  return true;
}
45 
46 // This function writes UTF-8 strings into a vector of Labels, truncating if
47 // necessary. It is possible to use this sensibly with as little as 16 bits of
48 // Label precision (i.e., when all characters are within the Basic Multilingual
49 // Plane). With 21 bits, one can label all UTF-8 labelpoints, including those
50 // from the various Astral Planes. Naturally, it is safe to use this with larger
51 // Labels (e.g., 64 bits).
52 template <class Label>
53 bool UTF8StringToLabels(std::string_view str, std::vector<Label> *labels) {
54  for (auto it = str.begin(); it != str.end();) {
55  int c = *it & 0xff;
56  ++it;
57  if ((c & 0x80) == 0) {
58  labels->push_back(c);
59  } else {
60  if ((c & 0xc0) == 0x80) {
61  LOG(ERROR) << "UTF8StringToLabels: Continuation byte as lead byte";
62  return false;
63  }
64  int count =
65  (c >= 0xc0) + (c >= 0xe0) + (c >= 0xf0) + (c >= 0xf8) + (c >= 0xfc);
66  int32_t label = c & ((1 << (6 - count)) - 1);
67  while (count != 0) {
68  if (it == str.end()) {
69  LOG(ERROR) << "UTF8StringToLabels: Truncated UTF-8 byte sequence";
70  return false;
71  }
72  char cb = *it;
73  ++it;
74  if ((cb & 0xc0) != 0x80) {
75  LOG(ERROR) << "UTF8StringToLabels: Missing/invalid continuation byte";
76  return false;
77  }
78  label = (label << 6) | (cb & 0x3f);
79  --count;
80  }
81  if (label < 0) {
82  // Should be unreachable.
83  LOG(ERROR) << "UTF8StringToLabels: Invalid character found: " << c;
84  return false;
85  }
86  labels->push_back(label);
87  }
88  }
89  return true;
90 }
91 
// Writes `labels` to `*str` as a byte string, one byte per label, replacing
// the previous contents of `*str`. Each label is narrowed to `char` first;
// any label whose narrowed value is 0 (including label 0 itself) is skipped,
// so the result never contains a NUL byte. Always returns `true`.
template <class Label>
bool LabelsToByteString(const std::vector<Label> &labels, std::string *str) {
  // Append straight into a string instead of going through a stream: at most
  // one byte is emitted per label, so a single reservation is enough.
  std::string bytes;
  bytes.reserve(labels.size());
  for (const char byte : labels) {
    if (byte != 0) bytes.push_back(byte);
  }
  str->swap(bytes);
  return true;
}
101 
102 template <class Label>
103 bool LabelsToUTF8String(const std::vector<Label> &labels, std::string *str) {
104  std::ostringstream ostrm;
105  for (const int32_t label : labels) {
106  if (label < 0) {
107  LOG(ERROR) << "LabelsToUTF8String: Invalid character found: " << label;
108  return false;
109  } else if (label == 0) {
110  continue;
111  } else if (label < 0x80) {
112  ostrm << static_cast<char>(label);
113  } else if (label < 0x800) {
114  ostrm << static_cast<char>((label >> 6) | 0xc0);
115  ostrm << static_cast<char>((label & 0x3f) | 0x80);
116  } else if (label < 0x10000) {
117  ostrm << static_cast<char>((label >> 12) | 0xe0);
118  ostrm << static_cast<char>(((label >> 6) & 0x3f) | 0x80);
119  ostrm << static_cast<char>((label & 0x3f) | 0x80);
120  } else if (label < 0x200000) {
121  ostrm << static_cast<char>((label >> 18) | 0xf0);
122  ostrm << static_cast<char>(((label >> 12) & 0x3f) | 0x80);
123  ostrm << static_cast<char>(((label >> 6) & 0x3f) | 0x80);
124  ostrm << static_cast<char>((label & 0x3f) | 0x80);
125  } else if (label < 0x4000000) {
126  ostrm << static_cast<char>((label >> 24) | 0xf8);
127  ostrm << static_cast<char>(((label >> 18) & 0x3f) | 0x80);
128  ostrm << static_cast<char>(((label >> 12) & 0x3f) | 0x80);
129  ostrm << static_cast<char>(((label >> 6) & 0x3f) | 0x80);
130  ostrm << static_cast<char>((label & 0x3f) | 0x80);
131  } else {
132  ostrm << static_cast<char>((label >> 30) | 0xfc);
133  ostrm << static_cast<char>(((label >> 24) & 0x3f) | 0x80);
134  ostrm << static_cast<char>(((label >> 18) & 0x3f) | 0x80);
135  ostrm << static_cast<char>(((label >> 12) & 0x3f) | 0x80);
136  ostrm << static_cast<char>(((label >> 6) & 0x3f) | 0x80);
137  ostrm << static_cast<char>((label & 0x3f) | 0x80);
138  }
139  }
140  *str = ostrm.str();
141  return !!ostrm;
142 }
143 
144 } // namespace fst
145 
146 #endif // FST_ICU_H_
#define LOG(type)
Definition: log.h:53
bool UTF8StringToLabels(std::string_view str, std::vector< Label > *labels)
Definition: icu.h:53
bool LabelsToUTF8String(const std::vector< Label > &labels, std::string *str)
Definition: icu.h:103
bool ByteStringToLabels(std::string_view str, std::vector< Label > *labels)
Definition: icu.h:41
bool LabelsToByteString(const std::vector< Label > &labels, std::string *str)
Definition: icu.h:93