sonic/native/parsing.c

/*
 * Copyright 2021 ByteDance Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "native.h"

static const char _UnquoteTab[256] = {
    ['/' ] = '/',
    ['"' ] = '"',
    ['b' ] = '\b',
    ['f' ] = '\f',
    ['n' ] = '\n',
    ['r' ] = '\r',
    ['t' ] = '\t',
    ['u' ] = -1,
    ['\\'] = '\\',
};

static inline ssize_t memcchr_p32(const char *s, ssize_t nb, char *p) {
    int64_t      r;
    ssize_t      n = nb;
    const char * q = s;

#if USE_AVX2
    __m256i u;
    __m256i v;
    __m256i b = _mm256_set1_epi8('\\');

    /* process every 32 bytes */
    while (n >= 32) {
        u = _mm256_loadu_si256  ((const void *)s);
        v = _mm256_cmpeq_epi8   (u, b);
            _mm256_storeu_si256 ((void *)p, u);

        /* check for matches */
        if ((r = _mm256_movemask_epi8(v)) != 0) {
            return s - q + __builtin_ctzll(r | (1ull << 32));
        }

        /* move to the next 32 bytes */
        s += 32;
        p += 32;
        n -= 32;
    }
#endif

#if USE_AVX2
    _mm256_zeroupper();
#endif

#if USE_SSE
    __m128i x;
    __m128i y;
    __m128i a = _mm_set1_epi8('\\');

    /* process every 16 bytes */
    while (n >= 16) {
        x = _mm_loadu_si128  ((const void *)s);
        y = _mm_cmpeq_epi8   (x, a);
            _mm_storeu_si128 ((void *)p, x);

        /* check for matches */
        if ((r = _mm_movemask_epi8(y)) != 0) {
            return s - q + __builtin_ctzll(r | (1 << 16));
        }

        /* move to the next 16 bytes */
        s += 16;
        p += 16;
        n -= 16;
    }
#endif

    /* remaining bytes, do with scalar code */
    while (n--) {
        if (*s != '\\') {
            *p++ = *s++;
        } else {
            return s - q;
        }
    }

    /* nothing found, but everything was copied */
    return -1;
}

#define ALL_01h     (~0ul / 255)
#define ALL_7fh     (ALL_01h * 127)
#define ALL_80h     (ALL_01h * 128)

static inline uint32_t hasless(uint32_t x, uint8_t n) {
    return (x - ALL_01h * n) & ~x & ALL_80h;
}

static inline uint32_t hasmore(uint32_t x, uint8_t n) {
    return (x + ALL_01h * (127 - n) | x) & ALL_80h;
}

static inline uint32_t hasbetween(uint32_t x, uint8_t m, uint8_t n) {
    return (ALL_01h * (127 + n) - (x & ALL_7fh) & ~x & (x & ALL_7fh) + ALL_01h * (127 - m)) & ALL_80h;
}

#undef ALL_01h
#undef ALL_7fh
#undef ALL_80h

static inline char ishex(char c) {
    return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F');
}

static inline void unirep(char **dp) {
    *(*dp)++ = 0xef;
    *(*dp)++ = 0xbf;
    *(*dp)++ = 0xbd;
}

static inline char unhex16_is(const char *s) {
    uint32_t v = *(uint32_t *)s;
    return !(hasless(v, '0') || hasmore(v, 'f') || hasbetween(v, '9', 'A') || hasbetween(v, 'F', 'a'));
}

static inline uint32_t unhex16_fast(const char *s) {
    uint32_t a = __builtin_bswap32(*(uint32_t *)s);
    uint32_t b = 9 * ((~a & 0x10101010) >> 4) + (a & 0x0f0f0f0f);
    uint32_t c = (b >> 4) | b;
    uint32_t d = ((c >> 8) & 0xff00) | (c & 0x00ff);
    return d;
}

ssize_t unquote(const char *sp, ssize_t nb, char *dp, ssize_t *ep, uint64_t flags) {
    ssize_t      n;
    ssize_t      x = nb;
    const char * s = sp;
    const char * p = dp;

    /* scan & copy all the non-escape characters */
    while (nb && (n = (*sp == '\\' ? 0 : memcchr_p32(sp, nb, dp))) != -1) {
        char     cc;
        uint32_t r0;
        uint32_t r1;

        /* skip the plain text */
        dp += n;
        sp += n + 2;
        nb -= n + 2;

        /* check for EOF */
        if (nb < 0) {
            *ep = x;
            return -ERR_EOF;
        }

        /* check for double unquote */
        if (unlikely(flags & F_DBLUNQ)) {
            int  nr = nb;
            char c1 = sp[-1];

            /* must have at least 1 character left */
            if (nr == 0) {
                *ep = x;
                return -ERR_EOF;
            }

            /* every quote must be a double quote */
            if (c1 != '\\') {
                *ep = sp - s - 1;
                return -ERR_INVAL;
            }

            /* special case of '\\\\' and '\\\"' */
            if (*sp == '\\') {
                if (nr < 2) {
                    *ep = x;
                    return -ERR_EOF;
                } else if (sp[1] != '"' && sp[1] != '\\') {
                    *ep = sp - s + 1;
                    return -ERR_INVAL;
                } else {
                    sp++;
                    nb--;
                }
            }

            /* skip the second escape */
            sp++;
            nb--;
        }

        /* check for escape sequence */
        if ((cc = _UnquoteTab[(uint8_t)sp[-1]]) == 0) {
            *ep = sp - s - 1;
            return -ERR_ESCAPE;
        }

        /* check for simple escape sequence */
        if (cc != -1) {
            *dp++ = cc;
            continue;
        }

        /* must have at least 4 characters */
        if (nb < 4) {
            *ep = x;
            return -ERR_EOF;
        }

        /* check for hexadecimal characters */
        if (!unhex16_is(sp)) {
            *ep = sp - s;
            for (int i = 0; i < 4 && ishex(*sp); i++, sp++) ++*ep;
            return -ERR_INVAL;
        }

        /* decode the code-point */
        r0 = unhex16_fast(sp);
        sp += 4;
        nb -= 4;

        /* ASCII characters, unlikely */
        if (unlikely(r0 <= 0x7f)) {
            *dp++ = (char)r0;
            continue;
        }

        /* latin-1 characters, unlikely */
        if (unlikely(r0 <= 0x07ff)) {
            *dp++ = (char)(0xc0 | (r0 >> 6));
            *dp++ = (char)(0x80 | (r0 & 0x3f));
            continue;
        }

        /* 3-byte characters, likely */
        if (likely(r0 < 0xd800 || r0 > 0xdfff)) {
            *dp++ = (char)(0xe0 | ((r0 >> 12)       ));
            *dp++ = (char)(0x80 | ((r0 >>  6) & 0x3f));
            *dp++ = (char)(0x80 | ((r0      ) & 0x3f));
            continue;
        }

        /* check for double unquote */
        if (unlikely(flags & F_DBLUNQ)) {
            if (nb < 1) {
                *ep = x;
                return -ERR_EOF;
            } else if (sp[0] != '\\') {
                *ep = sp - s - 4;
                return -ERR_UNICODE;
            } else {
                nb--;
                sp++;
            }
        }

        /* surrogate half, must follows by the other half */
        if (nb < 6 || r0 > 0xdbff || sp[0] != '\\' || sp[1] != 'u') {
            if (likely(flags & F_UNIREP)) {
                unirep(&dp);
                continue;
            } else {
                *ep = sp - s - ((flags & F_DBLUNQ) ? 5 : 4);
                return -ERR_UNICODE;
            }
        }

        /* check the hexadecimal escape */
        if (!unhex16_is(sp + 2)) {
            *ep = sp - s + 2;
            for (int i = 0; i < 4 && ishex(sp[2]); i++, sp++) ++*ep;
            return -ERR_INVAL;
        }

        /* decode the second code-point */
        r1 = unhex16_fast(sp + 2);
        sp += 6;
        nb -= 6;

        /* it must be the other half */
        if (r1 < 0xdc00 || r1 > 0xdfff) {
            if (likely(!(flags & F_UNIREP))) {
                *ep = sp - s - 4;
                return -ERR_UNICODE;
            } else {
                unirep(&dp);
                unirep(&dp);
                continue;
            }
        }

        /* merge two surrogates */
        r0 = (r0 - 0xd800) << 10;
        r1 = (r1 - 0xdc00) + 0x010000;
        r0 += r1;

        /* check the code point range */
        if (r0 > 0x10ffff) {
            if (likely(!(flags & F_UNIREP))) {
                *ep = sp - s - 4;
                return -ERR_UNICODE;
            } else {
                unirep(&dp);
                continue;
            }
        }

        /* encode the character */
        *dp++ = (char)(0xf0 | ((r0 >> 18)       ));
        *dp++ = (char)(0x80 | ((r0 >> 12) & 0x3f));
        *dp++ = (char)(0x80 | ((r0 >>  6) & 0x3f));
        *dp++ = (char)(0x80 | ((r0      ) & 0x3f));
    }

    /* calculate the result length */
    return dp + nb - p;
}