/* Mirror of https://github.com/ii64/sonic.git (synced 2026-06-21 00:46:43 +08:00); 326 lines, 8.4 KiB, C. */
/*
 * Copyright 2021 ByteDance Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
|
|
|
|
#include "native.h"
|
|
|
|
/*
 * Escape lookup table, indexed by the byte that follows a backslash.
 *
 *   0     : the byte does not introduce a known escape (unquote() reports
 *           ERR_ESCAPE for it)
 *   -1    : "\u", handled by unquote() as four hexadecimal digits
 *   other : the single character the escape expands to
 */
static const char _UnquoteTab[256] = {
    /* characters that stand for themselves */
    ['"' ] = '"',
    ['/' ] = '/',
    ['\\'] = '\\',

    /* control-character shorthands */
    ['b' ] = '\b',
    ['f' ] = '\f',
    ['n' ] = '\n',
    ['r' ] = '\r',
    ['t' ] = '\t',

    /* marker for the "\uXXXX" form */
    ['u' ] = -1,
};
|
|
|
|
/*
 * Copy bytes from `s` to `p` until the first backslash is met.
 *
 *   s  : source bytes
 *   nb : number of readable bytes at `s`
 *   p  : destination buffer
 *
 * Returns the offset (relative to `s`) of the first backslash, or -1 when
 * none of the `nb` bytes is a backslash, in which case all `nb` bytes have
 * been copied to `p`.
 *
 * The vectorised paths store a whole 32/16-byte chunk before testing it, so
 * bytes located after the backslash inside that chunk are written to `p` as
 * well; the scalar tail stops exactly at the backslash.
 */
static inline ssize_t memcchr_p32(const char *s, ssize_t nb, char *p) {
    ssize_t      n = nb;
    const char * q = s;

#if USE_AVX2
    __m256i b = _mm256_set1_epi8('\\');

    /* process every 32 bytes */
    while (n >= 32) {
        __m256i u = _mm256_loadu_si256 ((const void *)s);
        __m256i v = _mm256_cmpeq_epi8  (u, b);
        int64_t r = _mm256_movemask_epi8(v);

        _mm256_storeu_si256((void *)p, u);

        /* check for matches */
        if (r != 0) {
            /* clear the upper YMM halves on this exit as well, exactly like
             * the fall-through path below does */
            _mm256_zeroupper();
            return s - q + __builtin_ctzll(r | (1ull << 32));
        }

        /* move to the next 32 bytes */
        s += 32;
        p += 32;
        n -= 32;
    }

    /* done with 256-bit registers */
    _mm256_zeroupper();
#endif

#if USE_SSE
    __m128i a = _mm_set1_epi8('\\');

    /* process every 16 bytes */
    while (n >= 16) {
        __m128i x = _mm_loadu_si128 ((const void *)s);
        __m128i y = _mm_cmpeq_epi8  (x, a);
        int64_t r = _mm_movemask_epi8(y);

        _mm_storeu_si128((void *)p, x);

        /* check for matches */
        if (r != 0) {
            return s - q + __builtin_ctzll(r | (1 << 16));
        }

        /* move to the next 16 bytes */
        s += 16;
        p += 16;
        n -= 16;
    }
#endif

    /* remaining bytes, do with scalar code */
    while (n--) {
        if (*s != '\\') {
            *p++ = *s++;
        } else {
            return s - q;
        }
    }

    /* nothing found, but everything was copied */
    return -1;
}
|
|
|
|
/*
 * Byte-parallel predicates on a 32-bit word treated as four byte lanes.
 * Each helper returns non-zero when at least one lane satisfies the
 * condition; callers only test the result against zero.
 */
#define LANE_LSB    0x01010101u     /* 0x01 replicated in every lane */
#define LANE_LOW7   0x7f7f7f7fu     /* 0x7f replicated in every lane */
#define LANE_MSB    0x80808080u     /* 0x80 replicated in every lane */

/* non-zero when some byte of `x` is less than `n` */
static inline uint32_t hasless(uint32_t x, uint8_t n) {
    const uint32_t bound = LANE_LSB * n;

    return (x - bound) & ~x & LANE_MSB;
}

/* non-zero when some byte of `x` is greater than `n` */
static inline uint32_t hasmore(uint32_t x, uint8_t n) {
    const uint32_t bias = LANE_LSB * (uint32_t)(127 - n);

    return ((x + bias) | x) & LANE_MSB;
}

/* non-zero when some byte of `x` lies strictly between `m` and `n` */
static inline uint32_t hasbetween(uint32_t x, uint8_t m, uint8_t n) {
    const uint32_t low7    = x & LANE_LOW7;
    const uint32_t below_n = LANE_LSB * (uint32_t)(127 + n) - low7;
    const uint32_t above_m = low7 + LANE_LSB * (uint32_t)(127 - m);

    return below_n & ~x & above_m & LANE_MSB;
}

#undef LANE_LSB
#undef LANE_LOW7
#undef LANE_MSB
|
|
|
|
/* Returns 1 when `c` is an ASCII hexadecimal digit (0-9, a-f, A-F), 0 otherwise. */
static inline char ishex(char c) {
    if (c >= '0' && c <= '9') {
        return 1;
    }

    if (c >= 'a' && c <= 'f') {
        return 1;
    }

    return c >= 'A' && c <= 'F';
}
|
|
|
|
/*
 * Write the three bytes EF BF BD (the UTF-8 encoding of U+FFFD) at `*dp`
 * and advance `*dp` past them.
 */
static inline void unirep(char **dp) {
    char *out = *dp;

    out[0] = (char)0xef;
    out[1] = (char)0xbf;
    out[2] = (char)0xbd;

    *dp = out + 3;
}
|
|
|
|
/*
 * Returns non-zero when the four bytes at `s` are all ASCII hexadecimal
 * digits. Exactly four bytes are read from `s`.
 */
static inline char unhex16_is(const char *s) {
    uint32_t v;

    /* load through memcpy rather than a casted pointer: `s` may be unaligned
     * and a `char` buffer must not be read through a `uint32_t` lvalue */
    __builtin_memcpy(&v, s, sizeof(v));

    /* reject any byte below '0', above 'f', or inside the two gaps
     * ('9'..'A') and ('F'..'a') */
    return !(hasless(v, '0') || hasmore(v, 'f') || hasbetween(v, '9', 'A') || hasbetween(v, 'F', 'a'));
}
|
|
|
|
/*
 * Convert the four hexadecimal digits at `s` into their 16-bit value.
 * The digits are not validated here; unquote() checks them with
 * unhex16_is() first.
 *
 * The byte swap places s[0] in the most significant byte on a little-endian
 * host, which is what the nibble packing below relies on.
 */
static inline uint32_t unhex16_fast(const char *s) {
    uint32_t raw;

    /* load through memcpy rather than a casted pointer: `s` may be unaligned
     * and a `char` buffer must not be read through a `uint32_t` lvalue */
    __builtin_memcpy(&raw, s, sizeof(raw));

    uint32_t a = __builtin_bswap32(raw);

    /* digits have bit 4 set, letters do not: letters get 9 added to their low nibble */
    uint32_t b = 9 * ((~a & 0x10101010) >> 4) + (a & 0x0f0f0f0f);

    /* merge each pair of nibbles into one byte, then pack the two bytes together */
    uint32_t c = (b >> 4) | b;
    uint32_t d = ((c >> 8) & 0xff00) | (c & 0x00ff);
    return d;
}
|
|
|
|
ssize_t unquote(const char *sp, ssize_t nb, char *dp, ssize_t *ep, uint64_t flags) {
|
|
ssize_t n;
|
|
ssize_t x = nb;
|
|
const char * s = sp;
|
|
const char * p = dp;
|
|
|
|
/* scan & copy all the non-escape characters */
|
|
while (nb && (n = (*sp == '\\' ? 0 : memcchr_p32(sp, nb, dp))) != -1) {
|
|
char cc;
|
|
uint32_t r0;
|
|
uint32_t r1;
|
|
|
|
/* skip the plain text */
|
|
dp += n;
|
|
sp += n + 2;
|
|
nb -= n + 2;
|
|
|
|
/* check for EOF */
|
|
if (nb < 0) {
|
|
*ep = x;
|
|
return -ERR_EOF;
|
|
}
|
|
|
|
/* check for double unquote */
|
|
if (unlikely(flags & F_DBLUNQ)) {
|
|
int nr = nb;
|
|
char c1 = sp[-1];
|
|
|
|
/* must have at least 1 character left */
|
|
if (nr == 0) {
|
|
*ep = x;
|
|
return -ERR_EOF;
|
|
}
|
|
|
|
/* every quote must be a double quote */
|
|
if (c1 != '\\') {
|
|
*ep = sp - s - 1;
|
|
return -ERR_INVAL;
|
|
}
|
|
|
|
/* special case of '\\\\' and '\\\"' */
|
|
if (*sp == '\\') {
|
|
if (nr < 2) {
|
|
*ep = x;
|
|
return -ERR_EOF;
|
|
} else if (sp[1] != '"' && sp[1] != '\\') {
|
|
*ep = sp - s + 1;
|
|
return -ERR_INVAL;
|
|
} else {
|
|
sp++;
|
|
nb--;
|
|
}
|
|
}
|
|
|
|
/* skip the second escape */
|
|
sp++;
|
|
nb--;
|
|
}
|
|
|
|
/* check for escape sequence */
|
|
if ((cc = _UnquoteTab[(uint8_t)sp[-1]]) == 0) {
|
|
*ep = sp - s - 1;
|
|
return -ERR_ESCAPE;
|
|
}
|
|
|
|
/* check for simple escape sequence */
|
|
if (cc != -1) {
|
|
*dp++ = cc;
|
|
continue;
|
|
}
|
|
|
|
/* must have at least 4 characters */
|
|
if (nb < 4) {
|
|
*ep = x;
|
|
return -ERR_EOF;
|
|
}
|
|
|
|
/* check for hexadecimal characters */
|
|
if (!unhex16_is(sp)) {
|
|
*ep = sp - s;
|
|
for (int i = 0; i < 4 && ishex(*sp); i++, sp++) ++*ep;
|
|
return -ERR_INVAL;
|
|
}
|
|
|
|
/* decode the code-point */
|
|
r0 = unhex16_fast(sp);
|
|
sp += 4;
|
|
nb -= 4;
|
|
|
|
/* ASCII characters, unlikely */
|
|
if (unlikely(r0 <= 0x7f)) {
|
|
*dp++ = (char)r0;
|
|
continue;
|
|
}
|
|
|
|
/* latin-1 characters, unlikely */
|
|
if (unlikely(r0 <= 0x07ff)) {
|
|
*dp++ = (char)(0xc0 | (r0 >> 6));
|
|
*dp++ = (char)(0x80 | (r0 & 0x3f));
|
|
continue;
|
|
}
|
|
|
|
/* 3-byte characters, likely */
|
|
if (likely(r0 < 0xd800 || r0 > 0xdfff)) {
|
|
*dp++ = (char)(0xe0 | ((r0 >> 12) ));
|
|
*dp++ = (char)(0x80 | ((r0 >> 6) & 0x3f));
|
|
*dp++ = (char)(0x80 | ((r0 ) & 0x3f));
|
|
continue;
|
|
}
|
|
|
|
/* check for double unquote */
|
|
if (unlikely(flags & F_DBLUNQ)) {
|
|
if (nb < 1) {
|
|
*ep = x;
|
|
return -ERR_EOF;
|
|
} else if (sp[0] != '\\') {
|
|
*ep = sp - s - 4;
|
|
return -ERR_UNICODE;
|
|
} else {
|
|
nb--;
|
|
sp++;
|
|
}
|
|
}
|
|
|
|
/* surrogate half, must follows by the other half */
|
|
if (nb < 6 || r0 > 0xdbff || sp[0] != '\\' || sp[1] != 'u') {
|
|
if (likely(flags & F_UNIREP)) {
|
|
unirep(&dp);
|
|
continue;
|
|
} else {
|
|
*ep = sp - s - ((flags & F_DBLUNQ) ? 5 : 4);
|
|
return -ERR_UNICODE;
|
|
}
|
|
}
|
|
|
|
/* check the hexadecimal escape */
|
|
if (!unhex16_is(sp + 2)) {
|
|
*ep = sp - s + 2;
|
|
for (int i = 0; i < 4 && ishex(sp[2]); i++, sp++) ++*ep;
|
|
return -ERR_INVAL;
|
|
}
|
|
|
|
/* decode the second code-point */
|
|
r1 = unhex16_fast(sp + 2);
|
|
sp += 6;
|
|
nb -= 6;
|
|
|
|
/* it must be the other half */
|
|
if (r1 < 0xdc00 || r1 > 0xdfff) {
|
|
if (likely(!(flags & F_UNIREP))) {
|
|
*ep = sp - s - 4;
|
|
return -ERR_UNICODE;
|
|
} else {
|
|
unirep(&dp);
|
|
unirep(&dp);
|
|
continue;
|
|
}
|
|
}
|
|
|
|
/* merge two surrogates */
|
|
r0 = (r0 - 0xd800) << 10;
|
|
r1 = (r1 - 0xdc00) + 0x010000;
|
|
r0 += r1;
|
|
|
|
/* check the code point range */
|
|
if (r0 > 0x10ffff) {
|
|
if (likely(!(flags & F_UNIREP))) {
|
|
*ep = sp - s - 4;
|
|
return -ERR_UNICODE;
|
|
} else {
|
|
unirep(&dp);
|
|
continue;
|
|
}
|
|
}
|
|
|
|
/* encode the character */
|
|
*dp++ = (char)(0xf0 | ((r0 >> 18) ));
|
|
*dp++ = (char)(0x80 | ((r0 >> 12) & 0x3f));
|
|
*dp++ = (char)(0x80 | ((r0 >> 6) & 0x3f));
|
|
*dp++ = (char)(0x80 | ((r0 ) & 0x3f));
|
|
}
|
|
|
|
/* calculate the result length */
|
|
return dp + nb - p;
|
|
}
|