2
0
Fork 0
mirror of https://github.com/ii64/sonic.git synced 2026-06-21 00:46:43 +08:00
sonic/native/fastbytes.c
liu 56e81a633e
fix: use sse instead of sse4 (#305)
* fix: use sse instead of sse4

* fix: use dispatch

* fix: remove lzero

Co-authored-by: liuqiang <liuqiang.06@bytedance.com>
2022-09-26 12:45:01 +08:00

105 lines
No EOL
3.1 KiB
C

/*
* Copyright 2021 ByteDance Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "native.h"
/*
 * Low address bits that must all be zero before lspace() switches from its
 * byte-at-a-time prologue to aligned vector loads: one less than the width,
 * in bytes, of the widest vector the build loads (32 with AVX2, 16 otherwise).
 */
#if USE_AVX2
static const uintptr_t ALIGN_MASK = 32 - 1;
#else
static const uintptr_t ALIGN_MASK = 16 - 1;
#endif
/*
 * lspace -- find the first non-whitespace byte at or after offset `p`.
 *
 * Whitespace here means exactly ' ', '\t', '\n' and '\r'.
 *
 *   sp  start of the buffer
 *   nb  total length of the buffer, in bytes
 *   p   offset into the buffer at which scanning starts
 *
 * Returns the offset, relative to `sp`, of the first byte in [p, nb) that is
 * not whitespace, or `nb` when every byte from `p` onwards is whitespace.
 * A start offset at or beyond the end of the buffer also yields `nb`, so the
 * result is never greater than `nb`.
 */
size_t lspace(const char *sp, size_t nb, size_t p) {
    int32_t      ms;
    const char * ss = sp;

    /* nothing to scan; this also stops the unsigned `nb -= p` below from
     * wrapping around to a huge byte count when `p > nb` */
    if (p >= nb) {
        return nb;
    }

    /* seek to `p` */
    sp += p;
    nb -= p;

    /* likely to run into non-spaces within a few characters, try scalar code
     * first; the loop also stops once `sp` has no ALIGN_MASK bits set, which
     * is the alignment the aligned vector loads below rely on */
    while (nb > 0 && ((uintptr_t)sp & ALIGN_MASK)) {
        switch (*sp) {
            case ' '  :
            case '\r' :
            case '\n' :
            case '\t' : break;
            default   : return (size_t)(sp - ss);
        }
        sp++;
        nb--;
    }

#if USE_AVX2
    /* 32-byte loop */
    while (likely(nb >= 32)) {
        __m256i x = _mm256_load_si256 ((const void *)sp);
        __m256i a = _mm256_cmpeq_epi8 (x, _mm256_set1_epi8(' '));
        __m256i b = _mm256_cmpeq_epi8 (x, _mm256_set1_epi8('\t'));
        __m256i c = _mm256_cmpeq_epi8 (x, _mm256_set1_epi8('\n'));
        __m256i d = _mm256_cmpeq_epi8 (x, _mm256_set1_epi8('\r'));
        __m256i u = _mm256_or_si256   (a, b);
        __m256i v = _mm256_or_si256   (c, d);
        __m256i w = _mm256_or_si256   (u, v);

        /* one mask bit per byte, set when that byte is whitespace; -1 means
         * all 32 bytes matched, otherwise the lowest clear bit marks the
         * first non-whitespace byte (the inverted mask is non-zero here, so
         * the count-trailing-zeros call is well defined) */
        if ((ms = _mm256_movemask_epi8(w)) != -1) {
            _mm256_zeroupper();
            return (size_t)(sp - ss) + (size_t)__builtin_ctz(~(uint32_t)ms);
        }

        /* move to next block */
        sp += 32;
        nb -= 32;
    }

    /* clear upper half to avoid AVX-SSE transition penalty */
    _mm256_zeroupper();
#endif

    /* 16-byte loop */
    while (likely(nb >= 16)) {
        __m128i x = _mm_load_si128 ((const void *)sp);
        __m128i a = _mm_cmpeq_epi8 (x, _mm_set1_epi8(' '));
        __m128i b = _mm_cmpeq_epi8 (x, _mm_set1_epi8('\t'));
        __m128i c = _mm_cmpeq_epi8 (x, _mm_set1_epi8('\n'));
        __m128i d = _mm_cmpeq_epi8 (x, _mm_set1_epi8('\r'));
        __m128i u = _mm_or_si128   (a, b);
        __m128i v = _mm_or_si128   (c, d);
        __m128i w = _mm_or_si128   (u, v);

        /* same scheme with 16 mask bits: 0xffff means all 16 bytes matched */
        if ((ms = _mm_movemask_epi8(w)) != 0xffff) {
            return (size_t)(sp - ss) + (size_t)__builtin_ctz(~(uint32_t)ms);
        }

        /* move to next block */
        sp += 16;
        nb -= 16;
    }

    /* remaining bytes, do with scalar code */
    while (nb > 0) {
        switch (*sp) {
            case ' '  :
            case '\r' :
            case '\n' :
            case '\t' : break;
            default   : return (size_t)(sp - ss);
        }
        sp++;
        nb--;
    }

    /* all the characters are spaces */
    return (size_t)(sp - ss);
}