FST  openfst-1.8.3
OpenFst Library
nthbit.h
Go to the documentation of this file.
1 // Copyright 2005-2024 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the 'License');
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an 'AS IS' BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 //
15 // See www.openfst.org for extensive documentation on this weighted
16 // finite-state transducer library.
17 
18 #ifndef FST_EXTENSIONS_NGRAM_NTHBIT_H_
19 #define FST_EXTENSIONS_NGRAM_NTHBIT_H_
20 
21 #include <array>
22 #include <cstdint>
23 
24 #if defined(__aarch64__)
25 #include <arm_neon.h>
26 #endif
27 
28 #include <fst/log.h>
29 
30 #if defined(__BMI2__) // Intel Bit Manipulation Instruction Set 2
31 // PDEP requires BMI2; this is present starting with Haswell.
32 
33 #include <immintrin.h>
34 
35 namespace fst {
36 // Returns the position (0-63) of the r-th 1 bit in v.
37 // 0 <= r < CountOnes(v) <= 64. Therefore, v must not be 0.
38 inline int nth_bit(uint64_t v, uint32_t r) {
39  DCHECK_NE(v, 0);
40  DCHECK_LE(0, r);
41  DCHECK_LT(r, __builtin_popcountll(v));
42 
43  // PDEP example from https://stackoverflow.com/a/27453505
44  // __builtin_ctzll is UB for 0, but the conditions above ensure that can't
45  // happen.
46  return __builtin_ctzll(_pdep_u64(uint64_t{1} << r, v));
47 }
48 } // namespace fst
49 
50 #elif SIZE_MAX == UINT32_MAX
51 // Detect 32-bit architectures via size_t.
52 
53 namespace fst {
54 // Returns the position (0-63) of the r-th 1 bit in v.
55 // 0 <= r < CountOnes(v) <= 64. Therefore, v must not be 0.
56 int nth_bit(uint64_t v, uint32_t r);
57 } // namespace fst
58 
59 #elif SIZE_MAX == UINT64_MAX
60 // Default 64-bit version, used by ARM64 and Intel < Haswell.
61 
62 namespace fst {
63 namespace internal {
64 
65 constexpr std::array<uint64_t, 64> PrefixSumOverflows() {
66  std::array<uint64_t, 64> a{};
67  constexpr uint64_t kOnesStep8 = 0x0101010101010101;
68  for (int i = 0; i < 64; ++i) {
69  a[i] = (0x7F - i) * kOnesStep8;
70  }
71  return a;
72 }
73 
74 constexpr std::array<uint64_t, 64> kPrefixSumOverflow = PrefixSumOverflows();
75 
76 extern const uint8_t kSelectInByte[2048];
77 } // namespace internal
78 
79 // Returns the position (0-63) of the r-th 1 bit in v.
80 // 0 <= r < CountOnes(v) <= 64. Therefore, v must not be 0.
81 //
82 // This version is based on the paper "Broadword Implementation of
83 // Rank/Select Queries" by Sebastiano Vigna, p. 5, Algorithm 2, with
84 // improvements from "Optimized Succinct Data Structures for Massive Data"
85 // by Gog & Petri, 2014.
86 inline int nth_bit(const uint64_t v, const uint32_t r) {
87  constexpr uint64_t kOnesStep8 = 0x0101010101010101;
88  constexpr uint64_t kMSBsStep8 = 0x80 * kOnesStep8;
89 
90  DCHECK_NE(v, 0);
91  DCHECK_LE(0, r);
92  DCHECK_LT(r, __builtin_popcountll(v));
93 
94 #if defined(__aarch64__)
95  // Use the ARM64 CNT instruction to compute a byte-wise popcount.
96  const uint64_t s =
97  reinterpret_cast<uint64_t>(vcnt_u8(reinterpret_cast<uint8x8_t>(v)));
98 #else
99  constexpr uint64_t kOnesStep4 = 0x1111111111111111;
100  uint64_t s = v;
101  s = s - ((s >> 1) & (0x5 * kOnesStep4));
102  s = (s & (0x3 * kOnesStep4)) + ((s >> 2) & (0x3 * kOnesStep4));
103  s = (s + (s >> 4)) & (0xF * kOnesStep8);
104 #endif
105  // s now contains the byte-wise popcounts of v.
106 
107  // byte_sums contains partial sums of the byte-wise popcounts.
108  // That is, byte i contains the popcounts of bytes <= i.
109  uint64_t byte_sums = s * kOnesStep8;
110 
111  // kPrefixSumOverflow[r] == (0x7F - r) * kOnesStep8, so the high bit is
112  // still set if byte_sums - r > 0, or byte_sums > r. The first one set
113  // is in the byte with the sum larger than r (since r is 0-based),
114  // so this is the byte we need.
115  const uint64_t b = (byte_sums + internal::kPrefixSumOverflow[r]) & kMSBsStep8;
116  // The first bit set is the high bit in the byte, so
117  // num_trailing_zeros == 8 * byte_nr + 7 and the byte number is the
118  // number of trailing zeros divided by 8.
119  const int byte_nr = __builtin_ctzll(b) >> 3;
120  const int shift = byte_nr << 3;
121  // The top byte contains the whole-word popcount; we never need that.
122  byte_sums <<= 8;
123  // Paper uses reinterpret_cast<uint8_t *>; use shift/mask instead.
124  const int rank_in_byte = r - (byte_sums >> shift) & 0xFF;
125  return shift +
126  internal::kSelectInByte[(rank_in_byte << 8) + ((v >> shift) & 0xFF)];
127 }
128 } // namespace fst
129 
130 #else
131 
132 #error Unrecognized architecture size
133 
134 #endif
135 
136 #endif // FST_EXTENSIONS_NGRAM_NTHBIT_H_
#define DCHECK_LT(x, y)
Definition: log.h:76
#define DCHECK_LE(x, y)
Definition: log.h:78
#define DCHECK_NE(x, y)
Definition: log.h:80
int nth_bit(const uint64_t v, uint32_t r)
Definition: nthbit.cc:235