/* Mirror of https://github.com/ii64/sonic.git (synced 2026-06-21 00:46:43 +08:00); 484 lines, 15 KiB, C. */
/*
|
|
* Copyright 2021 ByteDance Inc.
|
|
*
|
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
* you may not use this file except in compliance with the License.
|
|
* You may obtain a copy of the License at
|
|
*
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
*
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
* See the License for the specific language governing permissions and
|
|
* limitations under the License.
|
|
*/
|
|
|
|
#include "native.h"
|
|
|
|
#if USE_SSE
|
|
/*
 * Building blocks for scanning inputs of at least 16 bytes.
 *
 * Every macro below expands inside a function that has `const char *s` and a
 * modifiable `size_t nb` in scope. Several of them `return` directly from
 * that enclosing function.
 */

/* Declare the loop state: `v` (last kernel result), `n` (running total added
 * to the returned offset) and `p` (read cursor, starting at `s`). */
#define loop_decl()         \
    size_t       v;         \
    size_t       n = 0;     \
    const char * p = s;

/*
 * While at least `size` bytes remain, apply `func` to the vector produced by
 * `load` at the cursor. A result below `size` ends the scan and the enclosing
 * function returns `n + v`; otherwise the result is added to `n` and the
 * cursor advances by `size` bytes.
 */
#define loop_simd(size, load, func, ...) {                                      \
    while (nb >= size) {                                                        \
        if ((v = func(load((const void *)(p)), ## __VA_ARGS__)) < size) {       \
            return n + v;                                                       \
        } else {                                                                \
            n  += v;                                                            \
            p  += size;                                                         \
            nb -= size;                                                         \
        }                                                                       \
    }                                                                           \
}

/* The 32-byte stage only exists when AVX2 is enabled; without it both macros
 * expand to nothing. */
#if !USE_AVX2
#define loop_zero()
#define loop_m256(func, ...)
#else
#define loop_zero()             _mm256_zeroupper();
#define loop_m256(func, ...)    loop_simd(32, _mm256_loadu_si256, func, ## __VA_ARGS__)
#endif

/* 16-byte stage. */
#define loop_m128(func, ...)    loop_simd(16, _mm_loadu_si128, func, ## __VA_ARGS__)

/*
 * Final step: re-examine the last 16 bytes of the input (a window that may
 * overlap bytes already processed) and return. `nb - 16` is computed in
 * size_t arithmetic and relies on wrap-around when fewer than 16 bytes
 * remain.
 */
#define loop_last(func, ...)    return func(_mm_loadu_si128(as_m128c(p + nb - 16)), ## __VA_ARGS__) + n + nb - 16;

/*
 * Full pipeline for nb >= 16: declare state, run the 32-byte stage with
 * `func_avx2` (if enabled), then the 16-byte stage and the final overlapping
 * window with `func_sse2`.
 */
#define loop_bulk(func, ...) {                      \
    loop_decl()                                     \
    loop_m256(func ## _avx2, ## __VA_ARGS__)        \
    loop_zero();                                    \
    loop_m128(func ## _sse2, ## __VA_ARGS__)        \
    loop_last(func ## _sse2, ## __VA_ARGS__)        \
}
|
|
|
|
/*
 * Short-input path (nb < 16): build a 16-byte vector without reading past the
 * input, then return `func_sse2` of it from the enclosing function.
 *
 * The vector starts as all 0xff bytes; the switch deliberately falls through
 * from `case nb` down to `case 1`, copying s[nb-1] .. s[0] into lanes
 * nb-1 .. 0. Lanes at or beyond `nb` keep the 0xff filler.
 *
 * Expects `const char *s` and `size_t nb` in the enclosing scope.
 */
#define loop_duff(func, ...) {                                  \
    size_t  r = nb;                                             \
    __m128i m = _mm_set1_epi8(0xff);                            \
                                                                \
    /* remaining bytes */                                       \
    switch (r) {                                                \
        case 15 : m = _mm_insert_epi8(m, s[14], 14);            \
        case 14 : m = _mm_insert_epi8(m, s[13], 13);            \
        case 13 : m = _mm_insert_epi8(m, s[12], 12);            \
        case 12 : m = _mm_insert_epi8(m, s[11], 11);            \
        case 11 : m = _mm_insert_epi8(m, s[10], 10);            \
        case 10 : m = _mm_insert_epi8(m, s[ 9],  9);            \
        case  9 : m = _mm_insert_epi8(m, s[ 8],  8);            \
        case  8 : m = _mm_insert_epi8(m, s[ 7],  7);            \
        case  7 : m = _mm_insert_epi8(m, s[ 6],  6);            \
        case  6 : m = _mm_insert_epi8(m, s[ 5],  5);            \
        case  5 : m = _mm_insert_epi8(m, s[ 4],  4);            \
        case  4 : m = _mm_insert_epi8(m, s[ 3],  3);            \
        case  3 : m = _mm_insert_epi8(m, s[ 2],  2);            \
        case  2 : m = _mm_insert_epi8(m, s[ 1],  1);            \
        case  1 : m = _mm_insert_epi8(m, s[ 0],  0);            \
        default : return func ## _sse2(m, ## __VA_ARGS__);      \
    }                                                           \
}
|
|
|
|
static inline size_t lspace_sse2(__m128i v0) {
|
|
__m128i v1 = _mm_cmpeq_epi8 (v0, _mm_set1_epi8(' '));
|
|
__m128i v2 = _mm_cmpeq_epi8 (v0, _mm_set1_epi8('\t'));
|
|
__m128i v3 = _mm_cmpeq_epi8 (v0, _mm_set1_epi8('\n'));
|
|
__m128i v4 = _mm_cmpeq_epi8 (v0, _mm_set1_epi8('\r'));
|
|
__m128i v5 = _mm_or_si128 (v1, v2);
|
|
__m128i v6 = _mm_or_si128 (v3, v4);
|
|
__m128i v7 = _mm_or_si128 (v5, v6);
|
|
uint32_t v8 = _mm_movemask_epi8 (v7);
|
|
uint32_t v9 = __builtin_ctz (~v8);
|
|
return v9;
|
|
}
|
|
|
|
#if USE_AVX2
/*
 * 32-byte counterpart of lspace_sse2: count the leading whitespace bytes
 * (' ', '\t', '\n', '\r') of a 32-byte vector.
 *
 * Returns the index of the first lane that is not one of those four bytes,
 * or 32 when all lanes are whitespace.
 */
static inline size_t lspace_avx2(__m256i v0) {
    __m256i is_blank = _mm256_cmpeq_epi8(v0, _mm256_set1_epi8(' '));
    __m256i is_tab   = _mm256_cmpeq_epi8(v0, _mm256_set1_epi8('\t'));
    __m256i is_lf    = _mm256_cmpeq_epi8(v0, _mm256_set1_epi8('\n'));
    __m256i is_cr    = _mm256_cmpeq_epi8(v0, _mm256_set1_epi8('\r'));
    __m256i is_ws    = _mm256_or_si256(_mm256_or_si256(is_blank, is_tab),
                                       _mm256_or_si256(is_lf, is_cr));

    /* one bit per lane, bits 0..31 */
    uint32_t mask = _mm256_movemask_epi8(is_ws);

    /* Widening before the complement sets bits 32..63, so the operand is
     * never zero and the count saturates at 32. */
    uint64_t first = __builtin_ctzll(~(uint64_t)(mask));
    return first;
}
#endif
|
|
|
|
static inline size_t lquote_sse2(__m128i v0) {
|
|
__m128i v1 = _mm_cmpgt_epi8 (v0, _mm_set1_epi8(-1));
|
|
__m128i v2 = _mm_cmplt_epi8 (v0, _mm_set1_epi8(' '));
|
|
__m128i v3 = _mm_cmpeq_epi8 (v0, _mm_set1_epi8('"'));
|
|
__m128i v4 = _mm_cmpeq_epi8 (v0, _mm_set1_epi8('\\'));
|
|
__m128i v5 = _mm_and_si128 (v1, v2);
|
|
__m128i v6 = _mm_or_si128 (v3, v4);
|
|
__m128i v7 = _mm_or_si128 (v5, v6);
|
|
uint32_t v8 = _mm_movemask_epi8 (v7);
|
|
uint32_t v9 = __builtin_ctz (v8 | 0xffff0000);
|
|
return v9;
|
|
}
|
|
|
|
#if USE_AVX2
/*
 * 32-byte counterpart of lquote_sse2: find the first byte of a 32-byte
 * vector that is '"', '\\' or a control byte in the range 0..31.
 *
 * Returns the lane index of that byte, or 32 when no lane qualifies.
 */
static inline size_t lquote_avx2(__m256i v0) {
    /* control bytes: signed value > -1 and NOT > 31 */
    __m256i non_negative = _mm256_cmpgt_epi8(v0, _mm256_set1_epi8(-1));
    __m256i above_ctrl   = _mm256_cmpgt_epi8(v0, _mm256_set1_epi8(31));
    __m256i is_control   = _mm256_andnot_si256(above_ctrl, non_negative);

    __m256i is_dquote    = _mm256_cmpeq_epi8(v0, _mm256_set1_epi8('"'));
    __m256i is_bslash    = _mm256_cmpeq_epi8(v0, _mm256_set1_epi8('\\'));
    __m256i hit          = _mm256_or_si256(is_control, _mm256_or_si256(is_dquote, is_bslash));

    uint32_t mask = _mm256_movemask_epi8(hit);

    /* Setting bits 32..63 keeps the operand non-zero and caps the result
     * at 32. */
    uint64_t first = __builtin_ctzll((uint64_t)mask | 0xffffffff00000000);
    return first;
}
#endif
|
|
|
|
/*
 * Find the first lane of a 16-byte vector equal to either `(char)c0` or
 * `(char)c1`. Only the low 8 bits of each needle are used.
 *
 * Returns the lane index of the first match, or 16 when neither byte occurs.
 */
static inline size_t strchr2_sse2(__m128i v0, uint64_t c0, uint64_t c1) {
    __m128i eq_first  = _mm_cmpeq_epi8(v0, _mm_set1_epi8((char)c0));
    __m128i eq_second = _mm_cmpeq_epi8(v0, _mm_set1_epi8((char)c1));
    __m128i hit       = _mm_or_si128(eq_first, eq_second);

    uint32_t mask = (uint32_t)_mm_movemask_epi8(hit);

    /* Setting bits 16..31 keeps the operand non-zero and caps the result
     * at 16. */
    return (size_t)__builtin_ctz(mask | 0xffff0000);
}
|
|
|
|
#if USE_AVX2
/*
 * 32-byte counterpart of strchr2_sse2: find the first lane of a 32-byte
 * vector equal to either `(char)c0` or `(char)c1`.
 *
 * Returns the lane index of the first match, or 32 when neither byte occurs.
 */
static inline size_t strchr2_avx2(__m256i v0, uint64_t c0, uint64_t c1) {
    __m256i eq_first  = _mm256_cmpeq_epi8(v0, _mm256_set1_epi8((char)c0));
    __m256i eq_second = _mm256_cmpeq_epi8(v0, _mm256_set1_epi8((char)c1));
    __m256i hit       = _mm256_or_si256(eq_first, eq_second);

    uint32_t mask = _mm256_movemask_epi8(hit);

    /* Setting bits 32..63 keeps the operand non-zero and caps the result
     * at 32. */
    uint64_t first = __builtin_ctzll((uint64_t)mask | 0xffffffff00000000);
    return first;
}
#endif
|
|
|
|
/*
 * Entry point used as the whole body of a scanning function that has
 * `const char *s` and `size_t nb` in scope. Every branch returns from the
 * enclosing function:
 *   - nb == 0  : returns 0;
 *   - nb < 16  : loop_duff (padded single-vector path);
 *   - otherwise: loop_bulk (vector loops plus overlapping tail).
 */
#define do_simd(func, ...) {                \
    if (nb == 0) {                          \
        return 0;                           \
    } else if (nb < 16) {                   \
        loop_duff(func, ## __VA_ARGS__)     \
    } else {                                \
        loop_bulk(func, ## __VA_ARGS__)     \
    }                                       \
}
|
|
#endif
|
|
|
|
/* Non-zero when `c` is '"', '\\' or lies in 0..31. Evaluates `c` more than once. */
#define is_quote(c)     ((c) == '"' || (c) == '\\' || ((c) >= 0 && (c) <= 31))

/* Non-zero when `c` is ' ', '\t', '\n' or '\r'. Evaluates `c` more than once. */
#define is_space(c)     ((c) == ' ' || (c) == '\t' || (c) == '\n' || (c) == '\r')

/*
 * Count the whitespace bytes (' ', '\t', '\n', '\r') at the start of
 * s[0 .. nb).
 *
 * Returns the offset of the first non-whitespace byte, or `nb` when the
 * whole range is whitespace.
 */
static inline size_t lspace_p(const char *s, size_t nb) {
#if USE_SSE
    do_simd(lspace)
#else
    size_t i = 0;

    /* Advance WHILE the byte is whitespace: this matches the SIMD kernels,
     * which return the index of the first non-whitespace lane. */
    while (i < nb && is_space(s[i])) {
        i++;
    }
    return i;
#endif
}
|
|
|
|
/*
 * Find the first byte in s[0 .. nb) that is '"', '\\' or a control byte in
 * 0..31.
 *
 * Returns the offset of that byte. When no such byte exists the result is
 * >= nb: the scalar path returns exactly `nb`, while the SIMD short-input
 * path (nb < 16) returns the unclamped kernel result, which can be 16.
 * Callers should therefore treat any value >= nb as "not found".
 */
static inline size_t lquote_p(const char *s, size_t nb) {
#if USE_SSE
    do_simd(lquote)
#else
    size_t i = 0;

    while (i < nb && !is_quote(s[i])) {
        i++;
    }
    return i;
#endif
}
|
|
|
|
/*
 * Locate the first occurrence of the byte `(char)ch` in p[0 .. nb).
 *
 * Returns the zero-based offset of the match. When the byte does not occur
 * inside the range the result is >= nb (either `nb` or (size_t)-1), so
 * callers must test `result >= nb` rather than compare against one sentinel.
 *
 * The SIMD path uses aligned loads only. It therefore reads whole aligned
 * blocks, which can include bytes before `p` and after `p + nb`; matches
 * outside the requested range are rejected.
 */
static inline size_t strchr1_p(const char *p, size_t nb, uint64_t ch) {
#if USE_SSE
    int64_t  r;
    uint32_t t;

    /* prepare the vector */
    ssize_t      n = nb;            /* bytes left; may go negative after alignment */
    uintptr_t    m = (uintptr_t)p;
    const char * q = p;             /* start of the range, for offset computation */

#if USE_AVX2
#define ALIGN_VAL       31
#define _mm_or          _mm256_or_si256
#define _mm_load        _mm256_load_si256
#define _mm_cmpeq(a, b) _mm256_cmpeq_epi8(a, b)
#define _mm_testz(v)    _mm256_testz_si256(v, v)
#define _mm_movemask(v) _mm256_movemask_epi8(v)
    __m256i a;
    __m256i b;
    __m256i c;
    __m256i d;
    __m256i u;
    __m256i v;
    __m256i w;
    __m256i x = _mm256_set1_epi8((char)ch);
#else
#define ALIGN_VAL       15
#define _mm_or          _mm_or_si128
#define _mm_load        _mm_load_si128
#define _mm_cmpeq(a, b) _mm_cmpeq_epi8(a, b)
#define _mm_testz(v)    (_mm_movemask_epi8(v) == 0)
#define _mm_movemask(v) _mm_movemask_epi8(v)
    __m128i a;
    __m128i b;
    __m128i c;
    __m128i d;
    __m128i u;
    __m128i v;
    __m128i w;
    __m128i x = _mm_set1_epi8((char)ch);
#endif

#define BLOCK_SIZE      (ALIGN_VAL + 1)
#define BLOCK_MASK      (1ull << BLOCK_SIZE)    /* sentinel bit so ctz never sees zero */
#define BLOCK_LARGE     (BLOCK_SIZE * 4)

    /* check for pointer alignment */
    if (m & ALIGN_VAL) {
        /* load the aligned block that contains `p` */
        v = _mm_load     ((const void *)(m & -BLOCK_SIZE));
        v = _mm_cmpeq    (v, x);
        r = _mm_movemask (v);

        /* check for match in the first characters; the shift discards the
         * lanes that precede `p` */
        if ((r >>= (t = m & ALIGN_VAL)) != 0) {
            if ((r = __builtin_ctzll(r | BLOCK_MASK)) < n) {
                return r;
            } else {
                return -1;
            }
        }

        /* make the pointer aligned */
        p += BLOCK_SIZE - t;
        n -= BLOCK_SIZE - t;
    }

    /* attempt to compare 4 blocks at a time */
    while (n >= BLOCK_LARGE) {
        a = _mm_load  ((const void *)(p + BLOCK_SIZE * 0));
        b = _mm_load  ((const void *)(p + BLOCK_SIZE * 1));
        c = _mm_load  ((const void *)(p + BLOCK_SIZE * 2));
        d = _mm_load  ((const void *)(p + BLOCK_SIZE * 3));
        a = _mm_cmpeq (a, x);
        b = _mm_cmpeq (b, x);
        c = _mm_cmpeq (c, x);
        d = _mm_cmpeq (d, x);
        u = _mm_or    (a, b);
        v = _mm_or    (c, d);
        w = _mm_or    (u, v);

        /* check if anything matches */
        if (_mm_testz(w)) {
            p += BLOCK_LARGE;
            n -= BLOCK_LARGE;
            continue;
        }

        /* match something in the 4-blocks region */
        if ((r = _mm_movemask(a)) != 0) {
            return p - q + __builtin_ctzll(r | BLOCK_MASK);
        } else if ((r = _mm_movemask(b)) != 0) {
            return p - q + __builtin_ctzll(r | BLOCK_MASK) + BLOCK_SIZE;
        } else if ((r = _mm_movemask(c)) != 0) {
            return p - q + __builtin_ctzll(r | BLOCK_MASK) + BLOCK_SIZE * 2;
        } else {
            return p - q + __builtin_ctzll(_mm_movemask(d) | BLOCK_MASK) + BLOCK_SIZE * 3;
        }
    }

    /* Check every remaining block, at most 4 times. The loop stops once no
     * in-range bytes are left (n > 0), so it never loads a block that lies
     * entirely past the end of the range. */
    for (int i = 0; i < 4 && n > 0; i++) {
        v = _mm_load     ((const void *)p);
        v = _mm_cmpeq    (v, x);
        r = _mm_movemask (v);

        /* found something */
        if (r != 0) {
            if ((r = __builtin_ctzll(r | BLOCK_MASK)) >= n) {
                return -1;
            } else {
                return p - q + r;
            }
        }

        /* otherwise advance to next block */
        p += BLOCK_SIZE;
        n -= BLOCK_SIZE;
    }

    /* drop every helper macro defined above so none leaks out of this function */
#undef _mm_or
#undef _mm_load
#undef _mm_cmpeq
#undef _mm_testz
#undef _mm_movemask
#undef ALIGN_VAL
#undef BLOCK_SIZE
#undef BLOCK_MASK
#undef BLOCK_LARGE
#else
    /* Compare as `char`, like the SIMD path, so only the low 8 bits of `ch`
     * take part regardless of the signedness of plain char. */
    for (size_t i = 0; i < nb; i++) {
        if (p[i] == (char)ch) {
            return i;
        }
    }
#endif

    /* not found */
    return nb;
}
|
|
|
|
/*
 * Locate the first byte in s[0 .. nb) equal to `(char)c0` or `(char)c1`.
 *
 * Returns the offset of the match. When neither byte occurs the result is
 * >= nb (exactly `nb` on the scalar path; the SIMD short-input path returns
 * the unclamped kernel result), so callers must test `result >= nb`.
 */
static inline size_t strchr2_p(const char *s, size_t nb, uint64_t c0, uint64_t c1) {
#if USE_SSE
    do_simd(strchr2, c0, c1)
#else
    /* Narrow the needles once, as the SIMD kernels do, so the comparison
     * does not depend on whether plain char is signed. */
    const char first  = (char)c0;
    const char second = (char)c1;
    size_t     i      = 0;

    while (i < nb && s[i] != first && s[i] != second) {
        i++;
    }
    return i;
#endif
}
|
|
|
|
size_t lzero(const char *p, size_t n) {
|
|
#if USE_SSE
|
|
#if USE_AVX
|
|
__m256i a;
|
|
__m256i b;
|
|
__m256i c;
|
|
__m256i d;
|
|
__m256i u;
|
|
__m256i v;
|
|
__m256i w;
|
|
__m256i y = _mm256_set1_epi8(0xff);
|
|
__m256i z = _mm256_setzero_si256();
|
|
#define BLOCK_SIZE 32
|
|
#else
|
|
__m128i a;
|
|
__m128i b;
|
|
__m128i c;
|
|
__m128i d;
|
|
__m128i u;
|
|
__m128i v;
|
|
__m128i w;
|
|
__m128i z = _mm_setzero_si128();
|
|
#define BLOCK_SIZE 16
|
|
#endif
|
|
|
|
#if USE_AVX2
|
|
#define _mm_load _mm256_load_si256
|
|
#define _mm_and(a, b) _mm256_and_si256(a, b)
|
|
#define _mm_cmpeq(a, b) _mm256_cmpeq_epi8(a, b)
|
|
#define _mm_testinz(v) (!_mm256_testc_si256(v, y))
|
|
#elif USE_AVX
|
|
#define _mm_load _mm256_load_si256
|
|
#define _mm_and(a, b) _mm256_and_ps((__m256)a, (__m256)b)
|
|
#define _mm_cmpeq(a, b) _mm256_cmp_ps(a, b, _CMP_EQ_OQ)
|
|
#define _mm_testinz(v) (!_mm256_testc_si256(v, y))
|
|
#else
|
|
#define _mm_load _mm_load_si128
|
|
#define _mm_and(a, b) _mm_and_si128(a, b)
|
|
#define _mm_cmpeq(a, b) _mm_cmpeq_epi8(a, b)
|
|
#define _mm_testinz(v) (_mm_movemask_epi8(v) != 0xffff)
|
|
#endif
|
|
|
|
/* multi-block loop */
|
|
while (n >= BLOCK_SIZE * 4) {
|
|
a = _mm_load ((const void *)(p + BLOCK_SIZE * 0));
|
|
b = _mm_load ((const void *)(p + BLOCK_SIZE * 1));
|
|
c = _mm_load ((const void *)(p + BLOCK_SIZE * 2));
|
|
d = _mm_load ((const void *)(p + BLOCK_SIZE * 3));
|
|
a = _mm_cmpeq (a, z);
|
|
b = _mm_cmpeq (b, z);
|
|
c = _mm_cmpeq (c, z);
|
|
d = _mm_cmpeq (d, z);
|
|
u = _mm_and (a, b);
|
|
v = _mm_and (c, d);
|
|
w = _mm_and (u, v);
|
|
|
|
/* test for zeros */
|
|
if (_mm_testinz(w)) {
|
|
return 1;
|
|
}
|
|
|
|
/* move to next block */
|
|
p += BLOCK_SIZE * 4;
|
|
n -= BLOCK_SIZE * 4;
|
|
}
|
|
|
|
/* single block loop */
|
|
while (n >= BLOCK_SIZE) {
|
|
a = _mm_load ((const void *)(p));
|
|
b = _mm_cmpeq (a, z);
|
|
|
|
/* test for zeros */
|
|
if (_mm_testinz(b)) {
|
|
return 1;
|
|
}
|
|
|
|
/* move to next block */
|
|
p += BLOCK_SIZE;
|
|
n -= BLOCK_SIZE;
|
|
}
|
|
|
|
#undef _mm_load
|
|
#undef _mm_cmpeq
|
|
#undef _mm_bitand
|
|
#undef _mm_testinz
|
|
#undef BLOCK_SIZE
|
|
#endif
|
|
|
|
/* 8 bytes loop */
|
|
while (n >= 8) {
|
|
if (*(uint64_t *)p) {
|
|
return 1;
|
|
} else {
|
|
p += 8;
|
|
n -= 8;
|
|
}
|
|
}
|
|
|
|
/* 4 bytes test */
|
|
if (n >= 4) {
|
|
if (*(uint32_t *)p) {
|
|
return 1;
|
|
} else {
|
|
p += 4;
|
|
n -= 4;
|
|
}
|
|
}
|
|
|
|
/* 2 bytes test */
|
|
if (n >= 2) {
|
|
if (*(uint16_t *)p) {
|
|
return 1;
|
|
} else {
|
|
p += 2;
|
|
n -= 2;
|
|
}
|
|
}
|
|
|
|
/* the final byte */
|
|
if (n == 0) {
|
|
return 0;
|
|
} else {
|
|
return *p != 0;
|
|
}
|
|
}
|
|
|
|
size_t lquote(const GoString *s, size_t p) {
|
|
return lquote_p(s->buf + p, s->len - p) + p;
|
|
}
|
|
|
|
/*
 * Skip the whitespace that starts at offset `p` of the buffer sp[0 .. nb)
 * and return the offset, relative to `sp`, at which lspace_p stopped.
 *
 * When `p` is already at or beyond `nb`, `p` is returned unchanged instead
 * of letting `nb - p` wrap around.
 */
size_t lspace(const char *sp, size_t nb, size_t p) {
    if (p >= nb) {
        return p;
    }
    return lspace_p(sp + p, nb - p) + p;
}
|
|
|
|
ssize_t strchr1(const GoString *s, size_t p, char ch) {
|
|
size_t n = s->len - p;
|
|
size_t v = strchr1_p(s->buf + p, n, ch);
|
|
return v >= n ? -1 : v + p;
|
|
}
|
|
|
|
ssize_t strchr2(const GoString *s, size_t p, char c0, char c1) {
|
|
size_t n = s->len - p;
|
|
size_t v = strchr2_p(s->buf + p, n, c0, c1);
|
|
return v >= n ? -1 : v + p;
|
|
}
|