43 #include <string_view> 47 "Require symbol tables to match when appropriate");
49 "Set of characters used as a separator between printed fields");
55 static constexpr int32_t kSymbolTableMagicNumber = 2125658996;
59 buckets_(1 << 4, kEmptyBucket),
60 hash_mask_(buckets_.size() - 1) {}
63 static constexpr
float kMaxOccupancyRatio = 0.75;
64 if (
Size() >= kMaxOccupancyRatio * buckets_.size()) {
65 Rehash(buckets_.size() * 2);
67 size_t idx = GetHash(key);
68 while (buckets_[idx] != kEmptyBucket) {
69 const auto stored_value = buckets_[idx];
70 if (symbols_[stored_value] == key)
return {stored_value,
false};
71 idx = (idx + 1) & hash_mask_;
73 const auto next =
Size();
75 symbols_.emplace_back(key);
80 size_t idx = str_hash_(key) & hash_mask_;
81 while (buckets_[idx] != kEmptyBucket) {
82 const auto stored_value = buckets_[idx];
83 if (symbols_[stored_value] == key)
return stored_value;
84 idx = (idx + 1) & hash_mask_;
89 void DenseSymbolMap::Rehash(
size_t num_buckets) {
90 buckets_.resize(num_buckets);
91 hash_mask_ = buckets_.size() - 1;
92 std::fill(buckets_.begin(), buckets_.end(), kEmptyBucket);
93 for (
size_t i = 0; i <
Size(); ++i) {
94 size_t idx = GetHash(symbols_[i]);
95 while (buckets_[idx] != kEmptyBucket) {
96 idx = (idx + 1) & hash_mask_;
103 symbols_.erase(symbols_.begin() + idx);
104 Rehash(buckets_.size());
110 for (
const auto &item : table) {
111 AddSymbol(item.Symbol());
115 std::unique_ptr<SymbolTableImplBase>
117 LOG(FATAL) <<
"ConstSymbolTableImpl can't be copied";
122 LOG(FATAL) <<
"ConstSymbolTableImpl does not support AddSymbol";
131 LOG(FATAL) <<
"ConstSymbolTableImpl does not support RemoveSymbol";
135 LOG(FATAL) <<
"ConstSymbolTableImpl does not support SetName";
139 LOG(FATAL) <<
"ConstSymbolTableImpl does not support AddTable";
143 std::istream &strm, std::string_view name,
const std::string &sep) {
144 auto impl = std::make_unique<SymbolTableImpl>(name);
147 const auto separator = sep +
"\n";
148 while (!strm.getline(line,
kLineLen).fail()) {
150 const std::vector<std::string_view> col =
152 if (col.empty())
continue;
153 if (col.size() != 2) {
154 LOG(ERROR) <<
"SymbolTable::ReadText: Bad number of columns (" 155 << col.size() <<
"), " 156 <<
"file = " << name <<
", line = " << nline <<
": " << line;
159 std::string_view symbol = col[0];
160 std::string_view value = col[1];
162 if (!maybe_key.has_value() || *maybe_key ==
kNoSymbol) {
163 LOG(ERROR) <<
"SymbolTable::ReadText: Bad label (" << value <<
"), " 164 <<
"file = " << name <<
", line = " << nline;
167 impl->AddSymbol(symbol, *maybe_key);
170 return impl.release();
173 void SymbolTableImpl::MaybeRecomputeCheckSum()
const {
176 if (check_sum_finalized_)
return;
179 MutexLock check_sum_lock(&check_sum_mutex_);
180 if (check_sum_finalized_) {
185 for (
size_t i = 0; i < symbols_.Size(); ++i) {
186 check_sum.
Update(symbols_.GetSymbol(i));
187 check_sum.
Update(std::string_view{
"\0", 1});
189 check_sum_string_ = check_sum.
Digest();
192 for (int64_t i = 0; i < dense_key_limit_; ++i) {
193 std::ostringstream line;
194 line << symbols_.GetSymbol(i) <<
'\t' << i;
195 labeled_check_sum.
Update(line.str());
197 using citer = std::map<int64_t, int64_t>::const_iterator;
198 for (citer it = key_map_.begin(); it != key_map_.end(); ++it) {
201 if (it->first < dense_key_limit_)
continue;
202 std::ostringstream line;
203 line << symbols_.GetSymbol(it->second) <<
'\t' << it->first;
204 labeled_check_sum.
Update(line.str());
206 labeled_check_sum_string_ = labeled_check_sum.
Digest();
207 check_sum_finalized_ =
true;
212 if (key < 0 || key >= dense_key_limit_) {
213 const auto it = key_map_.find(key);
214 if (it == key_map_.end())
return "";
217 if (idx < 0 || idx >= symbols_.Size())
return "";
218 return symbols_.GetSymbol(idx);
223 if (
const auto &[insert_key, inserted] = symbols_.InsertOrFind(symbol);
225 const auto key_already = GetNthKey(insert_key);
226 if (key_already == key)
return key;
227 VLOG(1) <<
"SymbolTable::AddSymbol: symbol = " << symbol
228 <<
" already in symbol_map_ with key = " << key_already
229 <<
" but supplied new key = " << key <<
" (ignoring new key)";
232 if (key + 1 == static_cast<int64_t>(symbols_.Size()) &&
233 key == dense_key_limit_) {
236 idx_key_.push_back(key);
237 key_map_[key] = symbols_.Size() - 1;
239 if (key >= available_key_) available_key_ = key + 1;
240 check_sum_finalized_ =
false;
248 if (key < 0 || key >= dense_key_limit_) {
249 auto iter = key_map_.find(key);
250 if (iter == key_map_.end())
return;
252 key_map_.erase(iter);
254 if (idx < 0 || idx >= static_cast<int64_t>(symbols_.Size()))
return;
255 symbols_.RemoveSymbol(idx);
257 for (
auto &k : key_map_) {
258 if (k.second > idx) --k.second;
260 if (key >= 0 && key < dense_key_limit_) {
262 const auto new_dense_key_limit = key;
263 for (int64_t i = key + 1; i < dense_key_limit_; ++i) {
267 idx_key_.resize(symbols_.Size() - new_dense_key_limit);
268 for (int64_t i = symbols_.Size(); i >= dense_key_limit_; --i) {
269 idx_key_[i - new_dense_key_limit - 1] = idx_key_[i - dense_key_limit_];
272 for (int64_t i = new_dense_key_limit; i < dense_key_limit_ - 1; ++i) {
273 idx_key_[i - new_dense_key_limit] = i + 1;
275 dense_key_limit_ = new_dense_key_limit;
278 for (
size_t i = idx - dense_key_limit_; i + 1 < idx_key_.size(); ++i) {
279 idx_key_[i] = idx_key_[i + 1];
283 if (key == available_key_ - 1) available_key_ = key;
287 std::istream &strm, std::string_view source) {
288 int32_t magic_number = 0;
291 LOG(ERROR) <<
"SymbolTable::Read: Read failed: " << source;
296 auto impl = std::make_unique<SymbolTableImpl>(name);
297 ReadType(strm, &impl->available_key_);
301 LOG(ERROR) <<
"SymbolTable::Read: Read failed: " << source;
306 impl->check_sum_finalized_ =
false;
307 for (int64_t i = 0; i < size; ++i) {
311 LOG(ERROR) <<
"SymbolTable::Read: Read failed: " << source;
314 impl->AddSymbol(symbol, key);
317 return impl.release();
321 WriteType(strm, kSymbolTableMagicNumber);
324 const int64_t size = symbols_.Size();
326 for (int64_t i = 0; i < dense_key_limit_; ++i) {
330 for (
const auto &p : key_map_) {
331 WriteType(strm, symbols_.GetSymbol(p.second));
336 LOG(ERROR) <<
"SymbolTable::Write: Write failed";
347 const std::string &sep) {
348 std::ifstream strm(source, std::ios_base::in);
350 LOG(ERROR) <<
"SymbolTable::ReadText: Can't open file: " << source;
353 return ReadText(strm, source, sep);
357 if (!source.empty()) {
358 std::ofstream strm(source,
359 std::ios_base::out | std::ios_base::binary);
361 LOG(ERROR) <<
"SymbolTable::Write: Can't open file: " << source;
365 LOG(ERROR) <<
"SymbolTable::Write: Write failed: " << source;
370 return Write(std::cout);
375 for (
const auto &item : *
this) {
376 std::ostringstream line;
377 line << item.Symbol() << sep[0] << item.Label() <<
'\n';
378 strm.write(line.str().data(), line.str().length());
384 const std::string &sep)
const {
386 std::ofstream strm(sink);
388 LOG(ERROR) <<
"SymbolTable::WriteText: Can't open file: " << sink;
391 if (!WriteText(strm, sep)) {
392 LOG(ERROR) <<
"SymbolTable::WriteText: Write failed: " << sink;
397 return WriteText(std::cout, sep);
404 if (!FST_FLAGS_fst_compat_symbols)
return true;
405 if (syms1 && syms2 &&
408 LOG(WARNING) <<
"CompatSymbols: Symbol table checksums do not match. " 409 <<
"Table sizes are " << syms1->
NumSymbols() <<
" and " 419 std::ostringstream ostrm;
421 *result = ostrm.str();
425 std::istringstream istrm(str);
bool Write(std::ostream &strm) const
void SymbolTableToString(const SymbolTable *table, std::string *result)
std::pair< int64_t, bool > InsertOrFind(std::string_view key)
int64_t AddSymbol(std::string_view symbol, int64_t key) final
static SymbolTableImpl * Read(std::istream &strm, std::string_view source)
DEFINE_bool(fst_compat_symbols, true,"Require symbol tables to match when appropriate")
SymbolTable * StringToSymbolTable(const std::string &str)
constexpr int64_t kNoSymbol
std::string Find(int64_t key) const override
internal::StringSplitter StrSplit(std::string_view full, ByAnyChar delim)
size_t NumSymbols() const
std::ostream & WriteType(std::ostream &strm, const T t)
void SetName(std::string_view new_name) final
std::unique_ptr< SymbolTableImplBase > Copy() const final
std::optional< int64_t > ParseInt64(std::string_view s, int base=10)
bool Write(std::ostream &strm) const override
void AddTable(const SymbolTable &table) override
int64_t Find(std::string_view key) const
void RemoveSymbol(int64_t key) override
int64_t AddSymbol(std::string_view symbol, int64_t key) override
void Update(std::string_view data)
static SymbolTable * ReadText(std::istream &strm, std::string_view name, const std::string &sep=FST_FLAGS_fst_field_separator)
static SymbolTableImpl * ReadText(std::istream &strm, std::string_view name, const std::string &sep=FST_FLAGS_fst_field_separator)
bool WriteText(std::ostream &strm, const std::string &sep=FST_FLAGS_fst_field_separator) const
std::istream & ReadType(std::istream &strm, T *t)
bool CompatSymbols(const SymbolTable *syms1, const SymbolTable *syms2, bool warning=true)
const std::string & LabeledCheckSum() const
void AddTable(const SymbolTable &table) final
static SymbolTable * Read(std::istream &strm, std::string_view source)
DEFINE_string(fst_field_separator,"\t ","Set of characters used as a separator between printed fields")
void RemoveSymbol(int64_t key) final
void RemoveSymbol(size_t idx)