Skip to content

Instantly share code, notes, and snippets.

@vurtun
Last active October 31, 2024 15:32
Show Gist options
  • Save vurtun/47c14b1abd98911b10e5f0a45c254616 to your computer and use it in GitHub Desktop.
Save vurtun/47c14b1abd98911b10e5f0a45c254616 to your computer and use it in GitHub Desktop.
object string filter using sse2
#ifdef _MSC_VER
#define alignto(x) __declspec(align(x))
#define bit_cnt(u) __popcnt(u)
#define bit_cnt64(u) __popcnt64(u)
static int bit_ffs32(unsigned int u) {_BitScanForward(&u, u); return casti(u);}
static int bit_ffs64(unsigned long long u) {_BitScanForward64(&u, u); return casti(u);}
#else /* GCC, CLANG */
#define alignto(x) __attribute__((aligned(x)))
#define bit_cnt(u) __builtin_popcount(u)
#define bit_cnt64(u) __builtin_popcountll(u)
#define bit_ffs32(u) __builtin_ctz(u)
#define bit_ffs64(u) __builtin_ctzll(u)
#endif
static int
fltr_obj(struct guid* res, const char *obj_desc_buf, const char *obj_desc_buf_end,
const struct guid* obj_ids, const char *fltr_str, const char *fltr_str_end){
/* This algorithm uses as a predicate equality of the first and the last characters from the substring. These two characters are populated in two registers
* F and L respectively. Then in each iteration two chunks of strings are loaded. The first chunk (A) is read from offset i (where i is the current offset)
* and the second chunk (B) is read from offset i + k - 1, where k is sub string's length. Then we compute a vector expression F == A and B == L.
* This step yields a byte vector (or a bit mask), where "true" values denote position of potential substring occurrences.
* Finally, just at these positions an exact comparisons of sub strings are performed.
*
* Example: Let's assume 8-byte registers. We're searching for word "cat", thus:
*
* F = [ c | c | c | c | c | c | c | c ]
* L = [ t | t | t | t | t | t | t | t ]
*
* We're searching in the string "a_cat_tries". In the first iteration the register A gets data from offset 0, B from offset 2:
*
* A = [ a | _ | c | a | t | _ | t | r ]
* B = [ c | a | t | _ | t | r | i | e ]
*
* Now we compare:
*
* AF = ( A == F ) = [ 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 ]
* BL = ( B == L ) = [ 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 ]
*
* After merging comparison results, i.e.AF & BL, we get following mask :
*
* mask = [ 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 ]
*
* Since the mask is non-zero, it means there are possible substring occurrences.
* As we see, there is only one non-zero element at index 2, thus only one sub string comparison must be performed.
*
* For the actual search of all objects we use a single buffer containing all descriptions of all objects separated by '\0'.
* In addition a separate array of object ids is kept to map from description based on number of `\0` encountered to array index
* and therefore object id.
*/
if(fltr_str == fltr_str_end){
return 0;
}
int val_idx = 0;
int guid_idx = 0;
int needle_cnt = fltr_str_end - fltr_str;
const __m128i zero = _mm_set1_epi8(0u);
const __m128i first = _mm_set1_epi8(fltr_str[0] );
const __m128i last = _mm_set1_epi8(fltr_str_end[-1] );
for (const char *it = obj_desc_buf; it < obj_desc_buf_end; it += 16) {
__m128i blk_first = _mm_loadu_si128((const __m128i*)it );
__m128i blk_last = _mm_loadu_si128((const __m128i*)it + needle_cnt - 1u);
__m128i equal_first = _mm_cmpeq_epi8(first, blk_first);
__m128i equal_last = _mm_cmpeq_epi8(last, blk_last);
__m128i equal_zero = _mm_cmpeq_epi8(blk_first, zero);
__m128i equal = _mm_and_si128(equal_first,equal_last);
unsigned zero_mask = _mm_movemask_epi8(equal_zero);
unsigned msk = _mm_movemask_epi8(equal);
while(msk != 0u) {
unsigned bit_pos = bit_ffs32( msk );
if(memcmp(it + bit_pos, fltr_str, needle_cnt ) == 0) {
unsigned cur_idx = guid_idx + bit_cnt(zero_mask & ((1u << bit_pos) - 1u));
res[val_idx] = obj_ids[cur_idx];
val_idx++;
}
msk = msk & ( msk - 1 );
}
guid_idx += bit_cnt(zero_mask);
}
return val_idx;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment