2
0
Fork 0
mirror of https://github.com/ii64/sonic.git synced 2026-06-21 00:46:43 +08:00

opt: skip space use shuffle (#416)

This commit is contained in:
liu 2023-05-16 13:24:22 +08:00 committed by GitHub
parent ebbe7589ca
commit d83abb5435
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
7 changed files with 9710 additions and 8575 deletions

File diff suppressed because it is too large Load diff

View file

@ -9,28 +9,28 @@ package avx
func __native_entry__() uintptr func __native_entry__() uintptr
var ( var (
_subr__f32toa = __native_entry__() + 28800 _subr__f32toa = __native_entry__() + 31264
_subr__f64toa = __native_entry__() + 448 _subr__f64toa = __native_entry__() + 192
_subr__get_by_path = __native_entry__() + 25664 _subr__get_by_path = __native_entry__() + 25856
_subr__html_escape = __native_entry__() + 9296 _subr__html_escape = __native_entry__() + 9040
_subr__i64toa = __native_entry__() + 3744 _subr__i64toa = __native_entry__() + 3488
_subr__lspace = __native_entry__() + 80 _subr__lspace = __native_entry__() + 16
_subr__quote = __native_entry__() + 5136 _subr__quote = __native_entry__() + 4880
_subr__skip_array = __native_entry__() + 18592 _subr__skip_array = __native_entry__() + 17952
_subr__skip_number = __native_entry__() + 22224 _subr__skip_number = __native_entry__() + 21952
_subr__skip_object = __native_entry__() + 20640 _subr__skip_object = __native_entry__() + 20368
_subr__skip_one = __native_entry__() + 22384 _subr__skip_one = __native_entry__() + 22112
_subr__skip_one_fast = __native_entry__() + 22624 _subr__skip_one_fast = __native_entry__() + 22352
_subr__u64toa = __native_entry__() + 3856 _subr__u64toa = __native_entry__() + 3600
_subr__unquote = __native_entry__() + 6928 _subr__unquote = __native_entry__() + 6672
_subr__validate_one = __native_entry__() + 22448 _subr__validate_one = __native_entry__() + 22176
_subr__validate_utf8 = __native_entry__() + 27552 _subr__validate_utf8 = __native_entry__() + 30000
_subr__validate_utf8_fast = __native_entry__() + 28224 _subr__validate_utf8_fast = __native_entry__() + 30672
_subr__value = __native_entry__() + 12480 _subr__value = __native_entry__() + 12224
_subr__vnumber = __native_entry__() + 16256 _subr__vnumber = __native_entry__() + 15616
_subr__vsigned = __native_entry__() + 17872 _subr__vsigned = __native_entry__() + 17232
_subr__vstring = __native_entry__() + 14704 _subr__vstring = __native_entry__() + 14064
_subr__vunsigned = __native_entry__() + 18240 _subr__vunsigned = __native_entry__() + 17600
) )
const ( const (
@ -45,7 +45,7 @@ const (
_stack__skip_number = 72 _stack__skip_number = 72
_stack__skip_object = 128 _stack__skip_object = 128
_stack__skip_one = 128 _stack__skip_one = 128
_stack__skip_one_fast = 216 _stack__skip_one_fast = 200
_stack__u64toa = 8 _stack__u64toa = 8
_stack__unquote = 88 _stack__unquote = 88
_stack__validate_one = 128 _stack__validate_one = 128

File diff suppressed because it is too large Load diff

View file

@ -9,34 +9,34 @@ package avx2
func __native_entry__() uintptr func __native_entry__() uintptr
var ( var (
_subr__f32toa = __native_entry__() + 34720 _subr__f32toa = __native_entry__() + 33888
_subr__f64toa = __native_entry__() + 736 _subr__f64toa = __native_entry__() + 288
_subr__get_by_path = __native_entry__() + 29248 _subr__get_by_path = __native_entry__() + 28336
_subr__html_escape = __native_entry__() + 10944 _subr__html_escape = __native_entry__() + 10496
_subr__i64toa = __native_entry__() + 4032 _subr__i64toa = __native_entry__() + 3584
_subr__lspace = __native_entry__() + 224 _subr__lspace = __native_entry__() + 64
_subr__quote = __native_entry__() + 5520 _subr__quote = __native_entry__() + 5072
_subr__skip_array = __native_entry__() + 21616 _subr__skip_array = __native_entry__() + 20688
_subr__skip_number = __native_entry__() + 25840 _subr__skip_number = __native_entry__() + 24912
_subr__skip_object = __native_entry__() + 23648 _subr__skip_object = __native_entry__() + 22736
_subr__skip_one = __native_entry__() + 26000 _subr__skip_one = __native_entry__() + 25072
_subr__skip_one_fast = __native_entry__() + 26416 _subr__skip_one_fast = __native_entry__() + 25488
_subr__u64toa = __native_entry__() + 4144 _subr__u64toa = __native_entry__() + 3696
_subr__unquote = __native_entry__() + 8336 _subr__unquote = __native_entry__() + 7888
_subr__validate_one = __native_entry__() + 26064 _subr__validate_one = __native_entry__() + 25136
_subr__validate_utf8 = __native_entry__() + 31152 _subr__validate_utf8 = __native_entry__() + 30320
_subr__validate_utf8_fast = __native_entry__() + 32112 _subr__validate_utf8_fast = __native_entry__() + 31280
_subr__value = __native_entry__() + 15472 _subr__value = __native_entry__() + 15024
_subr__vnumber = __native_entry__() + 19280 _subr__vnumber = __native_entry__() + 18352
_subr__vsigned = __native_entry__() + 20896 _subr__vsigned = __native_entry__() + 19968
_subr__vstring = __native_entry__() + 17952 _subr__vstring = __native_entry__() + 17024
_subr__vunsigned = __native_entry__() + 21264 _subr__vunsigned = __native_entry__() + 20336
) )
const ( const (
_stack__f32toa = 48 _stack__f32toa = 48
_stack__f64toa = 80 _stack__f64toa = 80
_stack__get_by_path = 312 _stack__get_by_path = 296
_stack__html_escape = 72 _stack__html_escape = 72
_stack__i64toa = 16 _stack__i64toa = 16
_stack__lspace = 8 _stack__lspace = 8
@ -45,7 +45,7 @@ const (
_stack__skip_number = 72 _stack__skip_number = 72
_stack__skip_object = 128 _stack__skip_object = 128
_stack__skip_one = 128 _stack__skip_one = 128
_stack__skip_one_fast = 224 _stack__skip_one_fast = 208
_stack__u64toa = 8 _stack__u64toa = 8
_stack__unquote = 72 _stack__unquote = 72
_stack__validate_one = 128 _stack__validate_one = 128

File diff suppressed because it is too large Load diff

View file

@ -9,34 +9,34 @@ package sse
func __native_entry__() uintptr func __native_entry__() uintptr
var ( var (
_subr__f32toa = __native_entry__() + 29440 _subr__f32toa = __native_entry__() + 31760
_subr__f64toa = __native_entry__() + 448 _subr__f64toa = __native_entry__() + 160
_subr__get_by_path = __native_entry__() + 26304 _subr__get_by_path = __native_entry__() + 26384
_subr__html_escape = __native_entry__() + 9360 _subr__html_escape = __native_entry__() + 9072
_subr__i64toa = __native_entry__() + 3712 _subr__i64toa = __native_entry__() + 3424
_subr__lspace = __native_entry__() + 80 _subr__lspace = __native_entry__() + 16
_subr__quote = __native_entry__() + 5152 _subr__quote = __native_entry__() + 4864
_subr__skip_array = __native_entry__() + 18800 _subr__skip_array = __native_entry__() + 18112
_subr__skip_number = __native_entry__() + 22448 _subr__skip_number = __native_entry__() + 22128
_subr__skip_object = __native_entry__() + 20832 _subr__skip_object = __native_entry__() + 20512
_subr__skip_one = __native_entry__() + 22608 _subr__skip_one = __native_entry__() + 22288
_subr__skip_one_fast = __native_entry__() + 22832 _subr__skip_one_fast = __native_entry__() + 22512
_subr__u64toa = __native_entry__() + 3840 _subr__u64toa = __native_entry__() + 3552
_subr__unquote = __native_entry__() + 6992 _subr__unquote = __native_entry__() + 6704
_subr__validate_one = __native_entry__() + 22656 _subr__validate_one = __native_entry__() + 22336
_subr__validate_utf8 = __native_entry__() + 28208 _subr__validate_utf8 = __native_entry__() + 30528
_subr__validate_utf8_fast = __native_entry__() + 28880 _subr__validate_utf8_fast = __native_entry__() + 31200
_subr__value = __native_entry__() + 12560 _subr__value = __native_entry__() + 12272
_subr__vnumber = __native_entry__() + 16416 _subr__vnumber = __native_entry__() + 15728
_subr__vsigned = __native_entry__() + 18064 _subr__vsigned = __native_entry__() + 17376
_subr__vstring = __native_entry__() + 14800 _subr__vstring = __native_entry__() + 14112
_subr__vunsigned = __native_entry__() + 18448 _subr__vunsigned = __native_entry__() + 17760
) )
const ( const (
_stack__f32toa = 48 _stack__f32toa = 48
_stack__f64toa = 80 _stack__f64toa = 80
_stack__get_by_path = 256 _stack__get_by_path = 240
_stack__html_escape = 64 _stack__html_escape = 64
_stack__i64toa = 16 _stack__i64toa = 16
_stack__lspace = 8 _stack__lspace = 8
@ -45,7 +45,7 @@ const (
_stack__skip_number = 72 _stack__skip_number = 72
_stack__skip_object = 128 _stack__skip_object = 128
_stack__skip_one = 128 _stack__skip_one = 128
_stack__skip_one_fast = 168 _stack__skip_one_fast = 136
_stack__u64toa = 8 _stack__u64toa = 8
_stack__unquote = 88 _stack__unquote = 88
_stack__validate_one = 128 _stack__validate_one = 128

View file

@ -16,14 +16,7 @@
#include "native.h" #include "native.h"
#if USE_AVX2
static const uintptr_t ALIGN_MASK = 31;
#else
static const uintptr_t ALIGN_MASK = 15;
#endif
size_t lspace(const char *sp, size_t nb, size_t p) { size_t lspace(const char *sp, size_t nb, size_t p) {
int32_t ms;
const char * ss = sp; const char * ss = sp;
/* seek to `p` */ /* seek to `p` */
@ -31,64 +24,28 @@ size_t lspace(const char *sp, size_t nb, size_t p) {
nb -= p; nb -= p;
/* likely to run into non-spaces within a few characters, try scalar code first */ /* likely to run into non-spaces within a few characters, try scalar code first */
while (nb > 0 && ((uintptr_t)sp & ALIGN_MASK)) {
switch ((nb--, *sp++)) {
case ' ' : break;
case '\r' : break;
case '\n' : break;
case '\t' : break;
default : return sp - ss - 1;
}
}
#if USE_AVX2 #if USE_AVX2
__m256i space_tab = _mm256_setr_epi8(
'\x20', 0, 0, 0, 0, 0, 0, 0,
0, '\x09', '\x0A', 0, 0, '\x0D', 0, 0,
'\x20', 0, 0, 0, 0, 0, 0, 0,
0, '\x09', '\x0A', 0, 0, '\x0D', 0, 0
);
/* 32-byte loop */ /* 32-byte loop */
while (likely(nb >= 32)) { while (likely(nb >= 32)) {
__m256i x = _mm256_load_si256 ((const void *)sp); __m256i input = _mm256_loadu_si256((__m256i*)sp);
__m256i a = _mm256_cmpeq_epi8 (x, _mm256_set1_epi8(' ')); __m256i shuffle = _mm256_shuffle_epi8(space_tab, input);
__m256i b = _mm256_cmpeq_epi8 (x, _mm256_set1_epi8('\t')); __m256i result = _mm256_cmpeq_epi8(input, shuffle);
__m256i c = _mm256_cmpeq_epi8 (x, _mm256_set1_epi8('\n')); int32_t mask = _mm256_movemask_epi8(result);
__m256i d = _mm256_cmpeq_epi8 (x, _mm256_set1_epi8('\r')); if (mask != -1) {
__m256i u = _mm256_or_si256 (a, b); return sp - ss + __builtin_ctzll(~(uint64_t)mask);
__m256i v = _mm256_or_si256 (c, d);
__m256i w = _mm256_or_si256 (u, v);
/* check for matches */
if ((ms = _mm256_movemask_epi8(w)) != -1) {
_mm256_zeroupper();
return sp - ss + __builtin_ctzll(~(uint64_t)ms);
} }
/* move to next block */
sp += 32; sp += 32;
nb -= 32; nb -= 32;
} }
/* clear upper half to avoid AVX-SSE transition penalty */
_mm256_zeroupper();
#endif #endif
/* 16-byte loop */
while (likely(nb >= 16)) {
__m128i x = _mm_load_si128 ((const void *)sp);
__m128i a = _mm_cmpeq_epi8 (x, _mm_set1_epi8(' '));
__m128i b = _mm_cmpeq_epi8 (x, _mm_set1_epi8('\t'));
__m128i c = _mm_cmpeq_epi8 (x, _mm_set1_epi8('\n'));
__m128i d = _mm_cmpeq_epi8 (x, _mm_set1_epi8('\r'));
__m128i u = _mm_or_si128 (a, b);
__m128i v = _mm_or_si128 (c, d);
__m128i w = _mm_or_si128 (u, v);
/* check for matches */
if ((ms = _mm_movemask_epi8(w)) != 0xffff) {
return sp - ss + __builtin_ctz(~ms);
}
/* move to next block */
sp += 16;
nb -= 16;
}
/* remaining bytes, do with scalar code */ /* remaining bytes, do with scalar code */
while (nb-- > 0) { while (nb-- > 0) {
switch (*sp++) { switch (*sp++) {