FST  openfst-1.7.1
OpenFst Library
bitmap-index.h
Go to the documentation of this file.
1 // See www.openfst.org for extensive documentation on this weighted
2 // finite-state transducer library.
3 
4 #ifndef FST_EXTENSIONS_NGRAM_BITMAP_INDEX_H_
5 #define FST_EXTENSIONS_NGRAM_BITMAP_INDEX_H_
6 
7 #include <utility>
8 #include <vector>
9 
10 #include <fst/compat.h>
11 
12 // This class is a bitstring storage class with an index that allows
13 // seeking to the Nth set or clear bit in time O(Log(N)) where N is
14 // the length of the bit vector. In addition, it allows counting set or
15 // clear bits over ranges in constant time.
16 //
17 // This is accomplished by maintaining an "secondary" index of limited
18 // size in bits that maintains a running count of the number of bits set
19 // in each block of bitmap data. A block is defined as the number of
20 // uint64 values that can fit in the secondary index before an overflow
21 // occurs.
22 //
23 // To handle overflows, a "primary" index containing a running count of
24 // bits set in each block is created using the type uint64.
25 
26 namespace fst {
27 
28 class BitmapIndex {
29  public:
30  static size_t StorageSize(size_t size) {
31  return ((size + kStorageBlockMask) >> kStorageLogBitSize);
32  }
33 
34  BitmapIndex() : bits_(nullptr), size_(0) {}
35 
36  bool Get(size_t index) const {
37  return (bits_[index >> kStorageLogBitSize] &
38  (kOne << (index & kStorageBlockMask))) != 0;
39  }
40 
41  static void Set(uint64* bits, size_t index) {
42  bits[index >> kStorageLogBitSize] |= (kOne << (index & kStorageBlockMask));
43  }
44 
45  static void Clear(uint64* bits, size_t index) {
46  bits[index >> kStorageLogBitSize] &= ~(kOne << (index & kStorageBlockMask));
47  }
48 
49  size_t Bits() const { return size_; }
50 
51  size_t ArraySize() const { return StorageSize(size_); }
52 
53  // Returns the number of one bits in the bitmap
54  size_t GetOnesCount() const {
55  return primary_index_[primary_index_size() - 1];
56  }
57 
58  // Returns the number of one bits in positions 0 to limit - 1.
59  // REQUIRES: limit <= Bits()
60  size_t Rank1(size_t end) const;
61 
62  // Returns the number of one bits in the range start to end - 1.
63  // REQUIRES: limit <= Bits()
64  size_t GetOnesCountInRange(size_t start, size_t end) const {
65  return Rank1(end) - Rank1(start);
66  }
67 
68  // Returns the number of zero bits in positions 0 to limit - 1.
69  // REQUIRES: limit <= Bits()
70  size_t Rank0(size_t end) const { return end - Rank1(end); }
71 
72  // Returns the number of zero bits in the range start to end - 1.
73  // REQUIRES: limit <= Bits()
74  size_t GetZeroesCountInRange(size_t start, size_t end) const {
75  return end - start - GetOnesCountInRange(start, end);
76  }
77 
78  // Return true if any bit between begin inclusive and end exclusive
79  // is set. 0 <= begin <= end <= Bits() is required.
80  //
81  bool TestRange(size_t start, size_t end) const {
82  return Rank1(end) > Rank1(start);
83  }
84 
85  // Returns the offset to the nth set bit (zero based)
86  // or Bits() if index >= number of ones
87  size_t Select1(size_t bit_index) const;
88 
89  // Returns the offset to the nth clear bit (zero based)
90  // or Bits() if index > number of
91  size_t Select0(size_t bit_index) const;
92 
93  // Returns the offset of the nth and nth+1 clear bit (zero based),
94  // equivalent to two calls to Select0, but more efficient.
95  std::pair<size_t, size_t> Select0s(size_t bit_index) const;
96 
97  // Rebuilds from index for the associated Bitmap, should be called
98  // whenever changes have been made to the Bitmap or else behavior
99  // of the indexed bitmap methods will be undefined.
100  void BuildIndex(const uint64* bits, size_t size);
101 
102  // the secondary index accumulates counts until it can possibly overflow
103  // this constant computes the number of uint64 units that can fit into
104  // units the size of uint16.
105  static const uint64 kOne = 1;
106  static const uint32 kStorageBitSize = 64;
107  static const uint32 kStorageLogBitSize = 6;
109  ((1 << 16) - 1) >> kStorageLogBitSize;
110 
111  private:
112  static const uint32 kStorageBlockMask = kStorageBitSize - 1;
113 
114  // returns, from the index, the count of ones up to array_index
115  size_t get_index_ones_count(size_t array_index) const;
116 
117  // because the indexes, both primary and secondary, contain a running
118  // count of the population of one bits contained in [0,i), there is
119  // no reason to have an element in the zeroth position as this value would
120  // necessarily be zero. (The bits are indexed in a zero based way.) Thus
121  // we don't store the 0th element in either index. Both of the following
122  // functions, if greater than 0, must be decremented by one before retreiving
123  // the value from the corresponding array.
124  // returns the 1 + the block that contains the bitindex in question
125  // the inverted version works the same but looks for zeros using an inverted
126  // view of the index
127  size_t find_primary_block(size_t bit_index) const;
128 
129  size_t find_inverted_primary_block(size_t bit_index) const;
130 
131  // similarly, the secondary index (which resets its count to zero at
132  // the end of every kSecondaryBlockSize entries) does not store the element
133  // at 0. Note that the rem_bit_index parameter is the number of bits
134  // within the secondary block, after the bits accounted for by the primary
135  // block have been removed (i.e. the remaining bits) And, because we
136  // reset to zero with each new block, there is no need to store those
137  // actual zeros.
138  // returns 1 + the secondary block that contains the bitindex in question
139  size_t find_secondary_block(size_t block, size_t rem_bit_index) const;
140 
141  size_t find_inverted_secondary_block(size_t block,
142  size_t rem_bit_index) const;
143 
144  // We create a primary index based upon the number of secondary index
145  // blocks. The primary index uses fields wide enough to accomodate any
146  // index of the bitarray so cannot overflow
147  // The primary index is the actual running
148  // count of one bits set for all blocks (and, thus, all uint64s).
149  size_t primary_index_size() const {
150  return (ArraySize() + kSecondaryBlockSize - 1) / kSecondaryBlockSize;
151  }
152 
153  const uint64* bits_;
154  size_t size_;
155 
156  // The primary index contains the running popcount of all blocks
157  // which means the nth value contains the popcounts of
158  // [0,n*kSecondaryBlockSize], however, the 0th element is omitted.
159  std::vector<uint32> primary_index_;
160  // The secondary index contains the running popcount of the associated
161  // bitmap. It is the same length (in units of uint16) as the
162  // bitmap's map is in units of uint64s.
163  std::vector<uint16> secondary_index_;
164 };
165 
166 } // end namespace fst
167 
168 #endif // FST_EXTENSIONS_NGRAM_BITMAP_INDEX_H_
uint64_t uint64
Definition: types.h:32
static const uint64 kOne
Definition: bitmap-index.h:105
static const uint32 kSecondaryBlockSize
Definition: bitmap-index.h:108
size_t Bits() const
Definition: bitmap-index.h:49
size_t Select1(size_t bit_index) const
Definition: bitmap-index.cc:57
size_t Rank1(size_t end) const
Definition: bitmap-index.cc:43
size_t Select0(size_t bit_index) const
Definition: bitmap-index.cc:77
size_t GetOnesCount() const
Definition: bitmap-index.h:54
static void Clear(uint64 *bits, size_t index)
Definition: bitmap-index.h:45
static void Set(uint64 *bits, size_t index)
Definition: bitmap-index.h:41
void BuildIndex(const uint64 *bits, size_t size)
size_t GetZeroesCountInRange(size_t start, size_t end) const
Definition: bitmap-index.h:74
bool TestRange(size_t start, size_t end) const
Definition: bitmap-index.h:81
static size_t StorageSize(size_t size)
Definition: bitmap-index.h:30
bool Get(size_t index) const
Definition: bitmap-index.h:36
size_t GetOnesCountInRange(size_t start, size_t end) const
Definition: bitmap-index.h:64
size_t ArraySize() const
Definition: bitmap-index.h:51
uint32_t uint32
Definition: types.h:31
std::pair< size_t, size_t > Select0s(size_t bit_index) const
Definition: bitmap-index.cc:98
static const uint32 kStorageLogBitSize
Definition: bitmap-index.h:107
static const uint32 kStorageBitSize
Definition: bitmap-index.h:106
size_t Rank0(size_t end) const
Definition: bitmap-index.h:70