/* sonic/native/fastbytes.c */

/*
 * Copyright 2021 ByteDance Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "native.h"

/*
 * Low address bits that must all be zero before the aligned vector loads in
 * this file may be issued: the widest vector register in use, minus one.
 */
#if USE_AVX2
static const uintptr_t ALIGN_MASK = 32 - 1;    /* 32-byte (256-bit) vectors */
#else
static const uintptr_t ALIGN_MASK = 16 - 1;    /* 16-byte (128-bit) vectors */
#endif

/* Reports whether `c` is one of the four bytes that lspace() skips:
 * space, horizontal tab, line feed or carriage return. */
static inline int lspace_is_blank(char c) {
    switch (c) {
        case ' '  :
        case '\t' :
        case '\n' :
        case '\r' : return 1;
        default   : return 0;
    }
}

/*
 * Skips leading whitespace (' ', '\t', '\n', '\r') in a byte buffer.
 *
 *   sp - start of the buffer
 *   nb - total length of the buffer, in bytes
 *   p  - offset into the buffer at which scanning starts
 *
 * Returns the offset, relative to `sp`, of the first non-whitespace byte at
 * or after `p`, or `nb` when every byte in [p, nb) is whitespace. If `p` is
 * at or beyond `nb` there is nothing to scan and `nb` is returned without
 * reading the buffer.
 *
 * The scan runs in three phases: a scalar loop up to the next vector-aligned
 * address, aligned vector loops over whole blocks, then a scalar loop over
 * whatever is left.
 */
size_t lspace(const char *sp, size_t nb, size_t p) {
    const char * ss = sp;

    /* nothing to scan; this also keeps `nb - p` below from wrapping around,
     * which would send every loop far past the end of the buffer */
    if (p >= nb) {
        return nb;
    }

    /* seek to `p` */
    sp += p;
    nb -= p;

    /* likely to run into non-spaces within a few characters, try scalar code first */
    while (nb > 0 && ((uintptr_t)sp & ALIGN_MASK)) {
        if (!lspace_is_blank(*sp)) {
            return sp - ss;
        }
        sp++;
        nb--;
    }

    /* from here on either `nb` is zero or `sp` has all ALIGN_MASK bits clear,
     * which is what the aligned loads below rely on */

#if USE_AVX2
    /* 32-byte loop */
    while (likely(nb >= 32)) {
        __m256i x = _mm256_load_si256 ((const void *)sp);
        __m256i a = _mm256_cmpeq_epi8 (x, _mm256_set1_epi8(' '));
        __m256i b = _mm256_cmpeq_epi8 (x, _mm256_set1_epi8('\t'));
        __m256i c = _mm256_cmpeq_epi8 (x, _mm256_set1_epi8('\n'));
        __m256i d = _mm256_cmpeq_epi8 (x, _mm256_set1_epi8('\r'));
        __m256i u = _mm256_or_si256   (a, b);
        __m256i v = _mm256_or_si256   (c, d);
        __m256i w = _mm256_or_si256   (u, v);

        /* one mask bit per byte, set where the byte is whitespace */
        int32_t ms = _mm256_movemask_epi8(w);

        /* check for matches: anything other than all-ones means a non-space */
        if (ms != -1) {
            _mm256_zeroupper();

            /* `~ms` is non-zero here, its lowest set bit is the first non-space */
            return sp - ss + __builtin_ctz(~(uint32_t)ms);
        }

        /* move to next block */
        sp += 32;
        nb -= 32;
    }

    /* clear upper half to avoid AVX-SSE transition penalty */
    _mm256_zeroupper();
#endif

    /* 16-byte loop */
    while (likely(nb >= 16)) {
        __m128i x = _mm_load_si128 ((const void *)sp);
        __m128i a = _mm_cmpeq_epi8 (x, _mm_set1_epi8(' '));
        __m128i b = _mm_cmpeq_epi8 (x, _mm_set1_epi8('\t'));
        __m128i c = _mm_cmpeq_epi8 (x, _mm_set1_epi8('\n'));
        __m128i d = _mm_cmpeq_epi8 (x, _mm_set1_epi8('\r'));
        __m128i u = _mm_or_si128   (a, b);
        __m128i v = _mm_or_si128   (c, d);
        __m128i w = _mm_or_si128   (u, v);

        /* one mask bit per byte in the low 16 bits, set where the byte is whitespace */
        int32_t ms = _mm_movemask_epi8(w);

        /* check for matches: a clear bit among the low 16 means a non-space */
        if (ms != 0xffff) {
            return sp - ss + __builtin_ctz(~(uint32_t)ms);
        }

        /* move to next block */
        sp += 16;
        nb -= 16;
    }

    /* remaining bytes, do with scalar code */
    for (; nb > 0; sp++, nb--) {
        if (!lspace_is_blank(*sp)) {
            return sp - ss;
        }
    }

    /* all the characters are spaces */
    return sp - ss;
}