Last active
October 31, 2024 15:32
-
-
Save vurtun/47c14b1abd98911b10e5f0a45c254616 to your computer and use it in GitHub Desktop.
object string filter using sse2
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#ifdef _MSC_VER | |
#define alignto(x) __declspec(align(x)) | |
#define bit_cnt(u) __popcnt(u) | |
#define bit_cnt64(u) __popcnt64(u) | |
static int bit_ffs32(unsigned int u) {_BitScanForward(&u, u); return casti(u);} | |
static int bit_ffs64(unsigned long long u) {_BitScanForward64(&u, u); return casti(u);} | |
#else /* GCC, CLANG */ | |
#define alignto(x) __attribute__((aligned(x))) | |
#define bit_cnt(u) __builtin_popcount(u) | |
#define bit_cnt64(u) __builtin_popcountll(u) | |
#define bit_ffs32(u) __builtin_ctz(u) | |
#define bit_ffs64(u) __builtin_ctzll(u) | |
#endif | |
static int | |
fltr_obj(struct guid* res, const char *obj_desc_buf, const char *obj_desc_buf_end, | |
const struct guid* obj_ids, const char *fltr_str, const char *fltr_str_end){ | |
/* This algorithm uses as a predicate equality of the first and the last characters from the substring. These two characters are populated in two registers | |
* F and L respectively. Then in each iteration two chunks of strings are loaded. The first chunk (A) is read from offset i (where i is the current offset) | |
* and the second chunk (B) is read from offset i + k - 1, where k is sub string's length. Then we compute a vector expression F == A and B == L. | |
* This step yields a byte vector (or a bit mask), where "true" values denote position of potential substring occurrences. | |
* Finally, just at these positions an exact comparisons of sub strings are performed. | |
* | |
* Example: Let's assume 8-byte registers. We're searching for word "cat", thus: | |
* | |
* F = [ c | c | c | c | c | c | c | c ] | |
* L = [ t | t | t | t | t | t | t | t ] | |
* | |
* We're searching in the string "a_cat_tries". In the first iteration the register A gets data from offset 0, B from offset 2: | |
* | |
* A = [ a | _ | c | a | t | _ | t | r ] | |
* B = [ c | a | t | _ | t | r | i | e ] | |
* | |
* Now we compare: | |
* | |
* AF = ( A == F ) = [ 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 ] | |
* BL = ( B == L ) = [ 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 ] | |
* | |
* After merging comparison results, i.e.AF & BL, we get following mask : | |
* | |
* mask = [ 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 ] | |
* | |
* Since the mask is non-zero, it means there are possible substring occurrences. | |
* As we see, there is only one non-zero element at index 2, thus only one sub string comparison must be performed. | |
* | |
* For the actual search of all objects we use a single buffer containing all descriptions of all objects separated by '\0'. | |
* In addition a separate array of object ids is kept to map from description based on number of `\0` encountered to array index | |
* and therefore object id. | |
*/ | |
if(fltr_str == fltr_str_end){ | |
return 0; | |
} | |
int val_idx = 0; | |
int guid_idx = 0; | |
int needle_cnt = fltr_str_end - fltr_str; | |
const __m128i zero = _mm_set1_epi8(0u); | |
const __m128i first = _mm_set1_epi8(fltr_str[0] ); | |
const __m128i last = _mm_set1_epi8(fltr_str_end[-1] ); | |
for (const char *it = obj_desc_buf; it < obj_desc_buf_end; it += 16) { | |
__m128i blk_first = _mm_loadu_si128((const __m128i*)it ); | |
__m128i blk_last = _mm_loadu_si128((const __m128i*)it + needle_cnt - 1u); | |
__m128i equal_first = _mm_cmpeq_epi8(first, blk_first); | |
__m128i equal_last = _mm_cmpeq_epi8(last, blk_last); | |
__m128i equal_zero = _mm_cmpeq_epi8(blk_first, zero); | |
__m128i equal = _mm_and_si128(equal_first,equal_last); | |
unsigned zero_mask = _mm_movemask_epi8(equal_zero); | |
unsigned msk = _mm_movemask_epi8(equal); | |
while(msk != 0u) { | |
unsigned bit_pos = bit_ffs32( msk ); | |
if(memcmp(it + bit_pos, fltr_str, needle_cnt ) == 0) { | |
unsigned cur_idx = guid_idx + bit_cnt(zero_mask & ((1u << bit_pos) - 1u)); | |
res[val_idx] = obj_ids[cur_idx]; | |
val_idx++; | |
} | |
msk = msk & ( msk - 1 ); | |
} | |
guid_idx += bit_cnt(zero_mask); | |
} | |
return val_idx; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment