FST  openfst-1.7.2
OpenFst Library
icu.h
Go to the documentation of this file.
1 // See www.openfst.org for extensive documentation on this weighted
2 // finite-state transducer library.
3 //
4 // This library implements an unrestricted Thompson/Pike UTF-8 parser and
5 // serializer. UTF-8 is a restricted subset of this byte stream encoding. For
6 // a description of the encoding details, see:
7 //
8 // http://en.wikipedia.org/wiki/UTF-8
9 
10 #ifndef FST_ICU_H_
11 #define FST_ICU_H_
12 
13 #include <sstream>
14 #include <vector>
15 
16 #include <fst/log.h>
17 
18 namespace fst {
19 
// Appends the bytes of `str` to `labels`, one label per byte. Each byte is read
// as an unsigned value before being converted to Label, truncating if
// necessary; Labels with as little as 8 bits of precision can be used sensibly.
// Always returns `true`; the return value exists only for compatibility.
template <class Label>
bool ByteStringToLabels(const string &str, std::vector<Label> *labels) {
  for (auto pos = str.begin(); pos != str.end(); ++pos) {
    const unsigned char byte = *pos;
    labels->push_back(byte);
  }
  return true;
}
28 
29 // This function writes UTF-8 strings into a vector of Labels, truncating if
30 // necessary. It is possible to use this sensibly with as little as 16 bits of
31 // Label precision (i.e., when all characters are within the Basic Multilingual
32 // Plane). With 21 bits, one can label all UTF-8 labelpoints, including those
33 // from the various Astral Planes. Naturally, it is safe to use this with larger
34 // Labels (e.g., 64 bits).
35 template <class Label>
36 bool UTF8StringToLabels(const string &str, std::vector<Label> *labels) {
37  for (auto it = str.begin(); it != str.end();) {
38  int c = *it & 0xff;
39  ++it;
40  if ((c & 0x80) == 0) {
41  labels->push_back(c);
42  } else {
43  if ((c & 0xc0) == 0x80) {
44  LOG(ERROR) << "UTF8StringToLabels: Continuation byte as lead byte";
45  return false;
46  }
47  int count =
48  (c >= 0xc0) + (c >= 0xe0) + (c >= 0xf0) + (c >= 0xf8) + (c >= 0xfc);
49  int32 label = c & ((1 << (6 - count)) - 1);
50  while (count != 0) {
51  if (it == str.end()) {
52  LOG(ERROR) << "UTF8StringToLabels: Truncated UTF-8 byte sequence";
53  return false;
54  }
55  char cb = *it;
56  ++it;
57  if ((cb & 0xc0) != 0x80) {
58  LOG(ERROR) << "UTF8StringToLabels: Missing/invalid continuation byte";
59  return false;
60  }
61  label = (label << 6) | (cb & 0x3f);
62  --count;
63  }
64  if (label < 0) {
65  // Should be unreachable.
66  LOG(ERROR) << "UTF8StringToLabels: Invalid character found: " << c;
67  return false;
68  }
69  labels->push_back(label);
70  }
71  }
72  return true;
73 }
74 
// Serializes `labels` into the bytestring `str`, one byte per label. Each label
// is first converted to `char`, truncating if necessary; labels whose converted
// value is zero are skipped. Returns `true` if the underlying string stream has
// not failed after all bytes were written.
template <class Label>
bool LabelsToByteString(const std::vector<Label> &labels, string *str) {
  std::ostringstream buffer;
  for (auto pos = labels.begin(); pos != labels.end(); ++pos) {
    const char byte = *pos;
    if (byte == 0) continue;
    buffer << byte;
  }
  *str = buffer.str();
  return !buffer.fail();
}
84 
85 template <class Label>
86 bool LabelsToUTF8String(const std::vector<Label> &labels, string *str) {
87  std::ostringstream ostrm;
88  for (const int32 label : labels) {
89  if (label < 0) {
90  LOG(ERROR) << "LabelsToUTF8String: Invalid character found: " << label;
91  return false;
92  } else if (label == 0) {
93  continue;
94  } else if (label < 0x80) {
95  ostrm << static_cast<char>(label);
96  } else if (label < 0x800) {
97  ostrm << static_cast<char>((label >> 6) | 0xc0);
98  ostrm << static_cast<char>((label & 0x3f) | 0x80);
99  } else if (label < 0x10000) {
100  ostrm << static_cast<char>((label >> 12) | 0xe0);
101  ostrm << static_cast<char>(((label >> 6) & 0x3f) | 0x80);
102  ostrm << static_cast<char>((label & 0x3f) | 0x80);
103  } else if (label < 0x200000) {
104  ostrm << static_cast<char>((label >> 18) | 0xf0);
105  ostrm << static_cast<char>(((label >> 12) & 0x3f) | 0x80);
106  ostrm << static_cast<char>(((label >> 6) & 0x3f) | 0x80);
107  ostrm << static_cast<char>((label & 0x3f) | 0x80);
108  } else if (label < 0x4000000) {
109  ostrm << static_cast<char>((label >> 24) | 0xf8);
110  ostrm << static_cast<char>(((label >> 18) & 0x3f) | 0x80);
111  ostrm << static_cast<char>(((label >> 12) & 0x3f) | 0x80);
112  ostrm << static_cast<char>(((label >> 6) & 0x3f) | 0x80);
113  ostrm << static_cast<char>((label & 0x3f) | 0x80);
114  } else {
115  ostrm << static_cast<char>((label >> 30) | 0xfc);
116  ostrm << static_cast<char>(((label >> 24) & 0x3f) | 0x80);
117  ostrm << static_cast<char>(((label >> 18) & 0x3f) | 0x80);
118  ostrm << static_cast<char>(((label >> 12) & 0x3f) | 0x80);
119  ostrm << static_cast<char>(((label >> 6) & 0x3f) | 0x80);
120  ostrm << static_cast<char>((label & 0x3f) | 0x80);
121  }
122  }
123  *str = ostrm.str();
124  return !!ostrm;
125 }
126 
127 } // namespace fst
128 
129 #endif // FST_ICU_H_
#define LOG(type)
Definition: log.h:48
bool LabelsToUTF8String(const std::vector< Label > &labels, string *str)
Definition: icu.h:86
bool UTF8StringToLabels(const string &str, std::vector< Label > *labels)
Definition: icu.h:36
bool ByteStringToLabels(const string &str, std::vector< Label > *labels)
Definition: icu.h:24
int32_t int32
Definition: types.h:26
bool LabelsToByteString(const std::vector< Label > &labels, string *str)
Definition: icu.h:76