45 #include <string_view> 49 "Require symbol tables to match when appropriate");
51 "Set of characters used as a separator between printed fields");
// Magic number for the binary symbol-table stream: the visible write path
// emits it with WriteType(strm, kSymbolTableMagicNumber), and the read path
// starts by reading an int32_t magic number.
57 static constexpr int32_t kSymbolTableMagicNumber = 2125658996;
// Member-initializer fragment (the constructor's signature line is not part
// of this excerpt): the bucket array starts with 1 << 4 slots, each set to
// kEmptyBucket.
61 buckets_(1 << 4, kEmptyBucket),
// hash_mask_ is size - 1; the probe loops below wrap with `& hash_mask_`.
62 hash_mask_(buckets_.size() - 1) {}
// Body fragment of the insert-or-find routine (its signature line and its
// final lines are not part of this excerpt). Visible behavior: grow the
// bucket array when it is too full, probe for `key`, return the existing index
// paired with `false` when found, otherwise append `key` to symbols_.
65 static constexpr
float kMaxOccupancyRatio = 0.75;
// Double the number of buckets once Size() reaches 75% of the bucket count.
66 if (
Size() >= kMaxOccupancyRatio * buckets_.size()) {
67 Rehash(buckets_.size() * 2);
// Linear probing: start at the key's hash slot and step by one (wrapping via
// hash_mask_) until an empty bucket is reached.
69 size_t idx = GetHash(key);
70 while (buckets_[idx] != kEmptyBucket) {
71 const auto stored_value = buckets_[idx];
// The bucket holds an index into symbols_; a match means the key is already
// stored, so report it as not newly inserted.
72 if (symbols_[stored_value] == key)
return {stored_value,
false};
73 idx = (idx + 1) & hash_mask_;
// Not found: the new symbol will take the next index, i.e. the current Size().
// NOTE(review): the lines that store `next` into the bucket and return are not
// visible here.
75 const auto next =
Size();
77 symbols_.emplace_back(key);
// Lookup fragment (signature line not part of this excerpt): probe linearly
// from the key's hash slot and return the stored symbol index on a match.
// NOTE(review): the value returned when an empty bucket is reached is on a
// line not shown here.
82 size_t idx = str_hash_(key) & hash_mask_;
83 while (buckets_[idx] != kEmptyBucket) {
84 const auto stored_value = buckets_[idx];
85 if (symbols_[stored_value] == key)
return stored_value;
// Advance to the next bucket, wrapping around with the mask.
86 idx = (idx + 1) & hash_mask_;
// Resizes the bucket array to `num_buckets` slots, clears every slot, and
// then walks all stored symbols to find each one a free slot again.
// NOTE(review): the end of this function (storing the symbol index into the
// free slot and the closing braces) is not part of this excerpt.
91 void DenseSymbolMap::Rehash(
size_t num_buckets) {
92 buckets_.resize(num_buckets);
// The mask is recomputed as size - 1 so that `& hash_mask_` stays in range.
93 hash_mask_ = buckets_.size() - 1;
// All old bucket contents are discarded; the table is rebuilt from symbols_.
94 std::fill(buckets_.begin(), buckets_.end(), kEmptyBucket);
95 for (
size_t i = 0; i <
Size(); ++i) {
96 size_t idx = GetHash(symbols_[i]);
// Linear probing: skip occupied buckets until an empty one is found.
97 while (buckets_[idx] != kEmptyBucket) {
98 idx = (idx + 1) & hash_mask_;
// Fragment (signature line not part of this excerpt): erase the symbol at
// position `idx`, then rebuild the buckets at their current size. Erasing
// shifts the positions of later symbols, and Rehash refills the buckets from
// symbols_.
105 symbols_.erase(symbols_.begin() + idx);
106 Rehash(buckets_.size());
// Fragment (signature line not part of this excerpt): every entry of `table`
// is added by its symbol string only; the entry's label is not passed to
// AddSymbol in the visible call.
112 for (
const auto &item : table) {
113 AddSymbol(item.Symbol());
// Fragments of several ConstSymbolTableImpl member functions (signatures and
// return statements are not part of this excerpt). Each visible statement
// logs a FATAL error naming the operation that this implementation rejects.
117 std::unique_ptr<SymbolTableImplBase>
// Copying is rejected.
119 LOG(FATAL) <<
"ConstSymbolTableImpl can't be copied";
// Adding a symbol is rejected.
124 LOG(FATAL) <<
"ConstSymbolTableImpl does not support AddSymbol";
// Removing a symbol is rejected.
133 LOG(FATAL) <<
"ConstSymbolTableImpl does not support RemoveSymbol";
// Renaming is rejected.
137 LOG(FATAL) <<
"ConstSymbolTableImpl does not support SetName";
// Merging another table is rejected.
141 LOG(FATAL) <<
"ConstSymbolTableImpl does not support AddTable";
// Reads a textual symbol table from `strm`, one entry per line, into a newly
// created SymbolTableImpl named `name`.
//   strm: input stream read line by line.
//   name: name given to the new table; also printed as "file" in error logs.
//   sep:  characters accepted as field separators.
// Returns the new table as a raw pointer released from a unique_ptr.
// NOTE(review): several lines of this function (line buffer declaration, the
// split call, label parsing, error returns) are not part of this excerpt.
145 std::istream &strm, std::string_view name, std::string_view sep) {
146 auto impl = std::make_unique<SymbolTableImpl>(name);
// A newline is appended to the caller's separator characters.
149 const std::string separator =
fst::StrCat(sep,
"\n");
// Keep reading until getline reports failure.
150 while (!strm.getline(line,
kLineLen).fail()) {
152 const std::vector<std::string_view> col =
// Lines that yield no columns are skipped silently.
154 if (col.empty())
continue;
// Every other line must have exactly two columns; otherwise an error is
// logged with the column count, file name, line number and line text.
155 if (col.size() != 2) {
156 LOG(ERROR) <<
"SymbolTable::ReadText: Bad number of columns (" 157 << col.size() <<
"), " 158 <<
"file = " << name <<
", line = " << nline <<
": " << line;
// First column is the symbol text, second column is its label.
161 std::string_view symbol = col[0];
162 std::string_view value = col[1];
// The label is rejected when it has no parsed value or equals kNoSymbol.
// NOTE(review): maybe_key is produced on a line not shown; presumably it is
// the parsed integer form of `value` — confirm.
164 if (!maybe_key.has_value() || *maybe_key ==
kNoSymbol) {
165 LOG(ERROR) <<
"SymbolTable::ReadText: Bad label (" << value <<
"), " 166 <<
"file = " << name <<
", line = " << nline;
169 impl->AddSymbol(symbol, *maybe_key);
// Ownership leaves the unique_ptr here; the caller receives a raw pointer.
172 return impl.release();
// Recomputes both checksum strings unless they are already marked final.
// check_sum_string_ is built from the symbol strings only;
// labeled_check_sum_string_ is built from symbol/key pairs.
// NOTE(review): some lines (the early return inside the locked re-check, the
// checksum object declarations, closing braces) are not part of this excerpt.
175 void SymbolTableImpl::MaybeRecomputeCheckSum()
const {
// First check, made before taking the mutex.
178 if (check_sum_finalized_)
return;
// Take the mutex and test the flag a second time while holding it.
181 MutexLock check_sum_lock(&check_sum_mutex_);
182 if (check_sum_finalized_) {
// Unlabeled checksum: each symbol string followed by a single NUL byte.
187 for (
size_t i = 0; i < symbols_.Size(); ++i) {
188 check_sum.
Update(symbols_.GetSymbol(i));
189 check_sum.
Update(std::string_view{
"\0", 1});
191 check_sum_string_ = check_sum.
Digest();
// Labeled checksum, first part: indices below dense_key_limit_, each hashed
// as "<symbol>\t<index>".
194 for (int64_t i = 0; i < dense_key_limit_; ++i) {
195 std::ostringstream line;
196 line << symbols_.GetSymbol(i) <<
'\t' << i;
197 labeled_check_sum.
Update(line.str());
// Labeled checksum, second part: key_map_ entries in map iteration order.
199 using citer = std::map<int64_t, int64_t>::const_iterator;
200 for (citer it = key_map_.begin(); it != key_map_.end(); ++it) {
// Keys below dense_key_limit_ are skipped; that range was hashed above.
203 if (it->first < dense_key_limit_)
continue;
204 std::ostringstream line;
// it->second is the symbol's position, it->first is its key.
205 line << symbols_.GetSymbol(it->second) <<
'\t' << it->first;
206 labeled_check_sum.
Update(line.str());
208 labeled_check_sum_string_ = labeled_check_sum.
Digest();
// Mark both strings as up to date.
209 check_sum_finalized_ =
true;
// Fragment of the key-to-symbol lookup (signature line and the assignments
// to `idx` are not part of this excerpt). Returns the symbol string for `key`,
// or an empty string when it cannot be resolved.
// Keys outside [0, dense_key_limit_) are looked up in key_map_.
214 if (key < 0 || key >= dense_key_limit_) {
215 const auto it = key_map_.find(key);
// Unknown key: empty string.
216 if (it == key_map_.end())
return "";
// An index outside the symbol storage also yields an empty string.
219 if (idx < 0 || idx >= symbols_.Size())
return "";
220 return symbols_.GetSymbol(idx);
// Fragment of AddSymbol(symbol, key) (signature line, some conditions and
// the final return are not part of this excerpt).
// The symbol is inserted into, or located in, the dense symbol map first.
225 if (
const auto &[insert_key, inserted] = symbols_.InsertOrFind(symbol);
// Existing-symbol path: compare the key already associated with the symbol
// against the requested key.
227 const auto key_already = GetNthKey(insert_key);
// Same key as before: nothing to change.
228 if (key_already == key)
return key;
// Different key: per the log text, the newly supplied key is ignored.
229 VLOG(1) <<
"SymbolTable::AddSymbol: symbol = " << symbol
230 <<
" already in symbol_map_ with key = " << key_already
231 <<
" but supplied new key = " << key <<
" (ignoring new key)";
// New-symbol path: tests whether `key` is exactly the position of the symbol
// just appended and also equals the current dense limit.
// NOTE(review): the body of this branch is not shown; presumably it extends
// the dense range — confirm.
234 if (key + 1 == static_cast<int64_t>(symbols_.Size()) &&
235 key == dense_key_limit_) {
// Otherwise record the key for this position and map the key to the position
// of the last stored symbol.
238 idx_key_.push_back(key);
239 key_map_[key] = symbols_.Size() - 1;
// Keep available_key_ strictly above every key seen on this path.
241 if (key >= available_key_) available_key_ = key + 1;
// The table changed, so the cached checksums are marked stale.
242 check_sum_finalized_ =
false;
// Fragment of RemoveSymbol(key) (signature line, the assignments to `idx` and
// several loop bodies/closing braces are not part of this excerpt).
// Keys outside [0, dense_key_limit_) are resolved through key_map_; an
// unknown key makes the call return without changes.
250 if (key < 0 || key >= dense_key_limit_) {
251 auto iter = key_map_.find(key);
252 if (iter == key_map_.end())
return;
254 key_map_.erase(iter);
// Nothing is removed when the resolved index is outside the symbol storage.
256 if (idx < 0 || idx >= static_cast<int64_t>(symbols_.Size()))
return;
257 symbols_.RemoveSymbol(idx);
// Mapped positions above the removed index are decremented by one.
259 for (
auto &k : key_map_) {
260 if (k.second > idx) --k.second;
// Case: the removed key was inside the dense range.
262 if (key >= 0 && key < dense_key_limit_) {
// The dense range is cut back to end just before the removed key.
264 const auto new_dense_key_limit = key;
// NOTE(review): the body of this loop over the keys after `key` is not shown;
// presumably it records those keys in key_map_ — confirm.
265 for (int64_t i = key + 1; i < dense_key_limit_; ++i) {
// idx_key_ is resized to the number of symbols at or above the new limit.
269 idx_key_.resize(symbols_.Size() - new_dense_key_limit);
// Existing idx_key_ entries are copied toward the back, iterating downward.
270 for (int64_t i = symbols_.Size(); i >= dense_key_limit_; --i) {
271 idx_key_[i - new_dense_key_limit - 1] = idx_key_[i - dense_key_limit_];
// The leading entries are filled with i + 1 for each i from the new limit up
// to, but not including, the old limit minus one.
274 for (int64_t i = new_dense_key_limit; i < dense_key_limit_ - 1; ++i) {
275 idx_key_[i - new_dense_key_limit] = i + 1;
277 dense_key_limit_ = new_dense_key_limit;
// Other case (the branch keyword is not shown): shift idx_key_ entries after
// the removed position down by one.
280 for (
size_t i = idx - dense_key_limit_; i + 1 < idx_key_.size(); ++i) {
281 idx_key_[i] = idx_key_[i + 1];
// If the removed key was the one just below available_key_, lower
// available_key_ to it.
285 if (key == available_key_ - 1) available_key_ = key;
// Fragment of the binary reader (the start of the signature and several
// reads, checks and error returns are not part of this excerpt).
//   strm:   input stream holding the serialized table.
//   source: description of the input, used only in error messages here.
// Returns the new table as a raw pointer released from a unique_ptr.
289 std::istream &strm, std::string_view source) {
// Receives the magic number read from the stream (the read is not shown).
290 int32_t magic_number = 0;
293 LOG(ERROR) <<
"SymbolTable::Read: Read failed: " << source;
// `name` comes from lines not shown; presumably read from the stream.
298 auto impl = std::make_unique<SymbolTableImpl>(name);
// available_key_ is restored directly from the stream.
299 ReadType(strm, &impl->available_key_);
303 LOG(ERROR) <<
"SymbolTable::Read: Read failed: " << source;
// Checksums are marked stale before the entries are loaded.
308 impl->check_sum_finalized_ =
false;
// Each of the `size` entries is added through AddSymbol(symbol, key).
309 for (int64_t i = 0; i < size; ++i) {
313 LOG(ERROR) <<
"SymbolTable::Read: Read failed: " << source;
316 impl->AddSymbol(symbol, key);
// Ownership leaves the unique_ptr here; the caller receives a raw pointer.
319 return impl.release();
// Fragment of the binary writer (signature line, several WriteType calls and
// the return statements are not part of this excerpt).
// The magic number is emitted with WriteType.
323 WriteType(strm, kSymbolTableMagicNumber);
326 const int64_t size = symbols_.Size();
// Entries with indices below dense_key_limit_ are visited first (loop body
// not shown).
328 for (int64_t i = 0; i < dense_key_limit_; ++i) {
// Then every key_map_ entry; p.second is the position of the symbol string.
332 for (
const auto &p : key_map_) {
333 WriteType(strm, symbols_.GetSymbol(p.second));
// Logged on a failure path whose condition is not shown.
338 LOG(ERROR) <<
"SymbolTable::Write: Write failed";
// Fragment of the file-based text reader (start of the signature and the
// open check are not part of this excerpt): opens `source` for input and
// delegates to the stream overload, passing `source` as the table name.
349 std::string_view sep) {
350 std::ifstream strm(source, std::ios_base::in);
// Logged when the file cannot be opened (the condition is not shown).
352 LOG(ERROR) <<
"SymbolTable::ReadText: Can't open file: " << source;
355 return ReadText(strm, source, sep);
// Fragment of the file-based binary writer (signature line, conditions and
// some returns are not part of this excerpt).
// A non-empty `source` is opened as a binary output file.
359 if (!source.empty()) {
360 std::ofstream strm(source,
361 std::ios_base::out | std::ios_base::binary);
// Logged when the file cannot be opened (the condition is not shown).
363 LOG(ERROR) <<
"SymbolTable::Write: Can't open file: " << source;
// Logged when writing to the file fails (the condition is not shown).
367 LOG(ERROR) <<
"SymbolTable::Write: Write failed: " << source;
// Otherwise the table is written to standard output.
372 return Write(std::cout);
// Fragment of the stream-based text writer (signature line, any checks before
// the loop, and the return are not part of this excerpt). Each entry is
// written as "<symbol><sep[0]><label>\n".
377 for (
const auto &item : *
this) {
378 std::ostringstream line;
// Only the first character of `sep` is used as the field separator.
// NOTE(review): sep[0] requires a non-empty `sep`; no such check is visible in
// this excerpt — confirm it exists above.
379 line << item.Symbol() << sep[0] << item.Label() <<
'\n';
380 strm.write(line.str().data(), line.str().length());
// Fragment of the file-based text writer (start of the signature, conditions
// and some returns are not part of this excerpt): opens `sink` and delegates
// to the stream overload; another path writes to standard output.
386 std::string_view sep)
const {
388 std::ofstream strm(sink);
// Logged when the file cannot be opened (the condition is not shown).
390 LOG(ERROR) <<
"SymbolTable::WriteText: Can't open file: " << sink;
// A failed stream write is logged with the sink name.
393 if (!WriteText(strm, sep)) {
394 LOG(ERROR) <<
"SymbolTable::WriteText: Write failed: " << sink;
// Fallback path: write to standard output.
399 return WriteText(std::cout, sep);
// Fragment of the symbol-table compatibility check (signature line, the rest
// of the condition and the returns are not part of this excerpt).
// When the fst_compat_symbols flag is off, tables always count as compatible.
406 if (!FST_FLAGS_fst_compat_symbols)
return true;
// With both tables present and a further condition (not shown) met, a warning
// about mismatching checksums and the table sizes is logged.
407 if (syms1 && syms2 &&
410 LOG(WARNING) <<
"CompatSymbols: Symbol table checksums do not match. " 411 <<
"Table sizes are " << syms1->
NumSymbols() <<
" and " 421 std::ostringstream ostrm;
// NOTE(review): from the `ostrm` declaration onward these lines appear to
// belong to separate string-conversion helpers whose signatures are not shown.
// The text accumulated in ostrm is copied into *result.
423 *result = ostrm.str();
// An input string stream is created over `str`.
427 std::istringstream istrm(str);
bool Write(std::ostream &strm) const
bool WriteText(std::ostream &strm, std::string_view sep=FST_FLAGS_fst_field_separator) const
void SymbolTableToString(const SymbolTable *table, std::string *result)
std::pair< int64_t, bool > InsertOrFind(std::string_view key)
int64_t AddSymbol(std::string_view symbol, int64_t key) final
static SymbolTableImpl * Read(std::istream &strm, std::string_view source)
DEFINE_bool(fst_compat_symbols, true,"Require symbol tables to match when appropriate")
SymbolTable * StringToSymbolTable(const std::string &str)
constexpr int64_t kNoSymbol
std::string Find(int64_t key) const override
internal::StringSplitter StrSplit(std::string_view full, ByAnyChar delim)
size_t NumSymbols() const
std::ostream & WriteType(std::ostream &strm, const T t)
void SetName(std::string_view new_name) final
std::unique_ptr< SymbolTableImplBase > Copy() const final
std::optional< int64_t > ParseInt64(std::string_view s, int base=10)
bool Write(std::ostream &strm) const override
void AddTable(const SymbolTable &table) override
int64_t Find(std::string_view key) const
void RemoveSymbol(int64_t key) override
int64_t AddSymbol(std::string_view symbol, int64_t key) override
void Update(std::string_view data)
static SymbolTableImpl * ReadText(std::istream &strm, std::string_view name, std::string_view sep=FST_FLAGS_fst_field_separator)
std::string StrCat(const StringOrInt &s1, const StringOrInt &s2)
std::istream & ReadType(std::istream &strm, T *t)
bool CompatSymbols(const SymbolTable *syms1, const SymbolTable *syms2, bool warning=true)
static SymbolTable * ReadText(std::istream &strm, std::string_view name, std::string_view sep=FST_FLAGS_fst_field_separator)
const std::string & LabeledCheckSum() const
void AddTable(const SymbolTable &table) final
static SymbolTable * Read(std::istream &strm, std::string_view source)
DEFINE_string(fst_field_separator,"\t ","Set of characters used as a separator between printed fields")
void RemoveSymbol(int64_t key) final
void RemoveSymbol(size_t idx)