2
0
Fork 0
mirror of https://github.com/ii64/sonic.git synced 2026-06-21 00:46:43 +08:00
sonic/native/utf8.c
liu 7475b256ce
support JSON validate (#189)
* fix: check unescaped control chars in decode

* feat: add utf8 validate func

* feat: validate utf8 in json string

* feat: add validateone api

* fix: check unicode pointer for surrogate

* clang12 compile

* feat: Import `Valid()` and `Skip()`

* opt: use looktable

* fix utf-8 validate performance problem

* fix: utf-8 validate bug

* clang12 build

* feat: (encoder) accelerate validating json from `json.Marshaler`

chore!:
- `encoder.NoCompactMarshaler`changes to `encoder.CompactMarshaler`, which means compacting operation is not open by default

* fix: only one json value is `Valid()`

Co-authored-by: liuqiang <liuqiang.06@bytedance.com>
Co-authored-by: duanyi.aster <duanyi.aster@bytedance.com>
2022-02-21 16:35:53 +08:00

183 lines
No EOL
6.1 KiB
C

/*
* Copyright (c) 2009 The Go Authors. All rights reserved.
* Modifications Copyright 2021 ByteDance Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "native.h"
// Builds a 16-bit mask from the top bit of every byte in vv: bit i is set
// when byte i is >= 0x80, i.e. lies outside the ASCII range 0x00 ~ 0x7F.
// A zero result therefore means all 16 bytes are ASCII.
static inline int _mm_ascii_mask(__m128i vv) {
    const int high_bits = _mm_movemask_epi8(vv);
    return high_bits;
}
#if USE_AVX2
// Builds a 32-bit mask from the top bit of every byte in vv: bit i is set
// when byte i is >= 0x80, i.e. lies outside the ASCII range 0x00 ~ 0x7F.
// A zero result therefore means all 32 bytes are ASCII.
static inline int _mm256_ascii_mask(__m256i vv) {
    const int high_bits = _mm256_movemask_epi8(vv);
    return high_bits;
}
#endif
// Reports whether ch is a 7-bit ASCII byte (top bit clear).
static inline bool is_ascii(uint8_t ch) {
    return (ch & 0x80) == 0;
}
// Bounds of an ordinary continuation byte (10xx xxxx).
static const uint8_t locb = 0x80;
static const uint8_t hicb = 0xBF;

// Markers stored in the `first` table. Each value is written as
// (accept-range index << 4) | sequence size in bytes.
static const uint8_t xx = (0xF << 4) | 1; // invalid lead byte: size 1
static const uint8_t as = (0xF << 4) | 0; // ASCII byte: size 1
static const uint8_t s1 = (0 << 4) | 2;   // accept range 0, size 2
static const uint8_t s2 = (1 << 4) | 3;   // accept range 1, size 3
static const uint8_t s3 = (0 << 4) | 3;   // accept range 0, size 3
static const uint8_t s4 = (2 << 4) | 3;   // accept range 2, size 3
static const uint8_t s5 = (3 << 4) | 4;   // accept range 3, size 4
static const uint8_t s6 = (0 << 4) | 4;   // accept range 0, size 4
static const uint8_t s7 = (4 << 4) | 4;   // accept range 4, size 4
// first is information about the first byte in a UTF-8 sequence.
// The table is indexed by the byte value. Each entry packs two fields:
//   - low 3 bits:  sequence size in bytes (0 for the ASCII marker `as`,
//                  1 for the invalid marker `xx`, 2..4 for multi-byte leads);
//   - high 4 bits: index into `ranges` giving the allowed second-byte range.
// For `as` and `xx` the high nibble is 0xF and does not name a real range.
static const uint8_t first[256] = {
//  0   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x00-0x0F
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x10-0x1F
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x20-0x2F
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x30-0x3F
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x40-0x4F
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x50-0x5F
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x60-0x6F
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x70-0x7F
//  0   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
// 0x80-0xBF are continuation bytes and can never start a sequence.
xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x80-0x8F
xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x90-0x9F
xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xA0-0xAF
xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xB0-0xBF
// 0xC0 and 0xC1 are rejected: they could only encode overlong 2-byte forms.
xx, xx, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xC0-0xCF
s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xD0-0xDF
// 0xE0 (s2) and 0xED (s4) use restricted second-byte ranges.
s2, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s4, s3, s3, // 0xE0-0xEF
// 0xF0 (s5) and 0xF4 (s7) use restricted second-byte ranges; 0xF5+ is invalid.
s5, s6, s6, s6, s7, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xF0-0xFF
};
/*
 * AcceptRange is the inclusive [lo, hi] interval of byte values permitted
 * for the second byte of a UTF-8 sequence.
 */
struct AcceptRange {
    /* smallest permitted second byte */
    uint8_t lo;
    /* largest permitted second byte */
    uint8_t hi;
};
// ranges is indexed by the high nibble of a `first` table entry.
// It has 16 entries so that every possible nibble value (0x0..0xF) indexes
// inside the array: the `as` and `xx` markers carry nibble 0xF, and a smaller
// array would make a lookup with that nibble an out-of-bounds read.
// Only entries 0..4 are meaningful; the rest are zero-initialized.
static const struct AcceptRange ranges[16] = {
    [0] = {locb, hicb}, // any continuation byte
    [1] = {0xA0, hicb}, // lead byte 0xE0
    [2] = {locb, 0x9F}, // lead byte 0xED
    [3] = {0x90, hicb}, // lead byte 0xF0
    [4] = {locb, 0x8F}, // lead byte 0xF4
};
// UTF-8 code point | first byte | second byte | third byte | fourth byte
// U+0000 - U+007F | 0___ ____
// U+0080 - U+07FF | 110_ ____ | 10__ ____
// U+0800 - U+D7FF | 1110 ____ | 10__ ____ | 10__ ____
// U+D800 - U+DFFF | reserved for UTF-16 surrogate pairs
// U+E000 - U+FFFF | 1110 ____ | 10__ ____ | 10__ ____
// U+10000 - U+10FFFF | 1111 0___ | 10__ ____ | 10__ ____ | 10__ ____
// Checks the UTF-8 sequence that starts at sp[0] and returns its length.
//
//   sp: pointer to the first byte of the sequence.
//   n:  number of readable bytes starting at sp.
//
// Returns 1 for an ASCII byte, 2..4 for a well-formed multi-byte sequence,
// and 0 when the sequence is malformed, truncated by n, or n is 0.
static inline ssize_t nonascii_is_utf8(const uint8_t* sp, size_t n) {
    /* nothing to read: do not touch sp[0] */
    if (n == 0) {
        return 0;
    }

    uint8_t mask = first[sp[0]];
    uint8_t size = mask & 7;

    /* ascii chars */
    if (size == 0) {
        return 1;
    }
    /* invalid lead byte, or sequence runs past the end of the input */
    if (size == 1 || n < size) {
        return 0;
    }

    /* Only multi-byte leads reach this point, and their high nibble is a
     * real range index (0..4). The ASCII/invalid markers carry nibble 0xF
     * and must never be used to index `ranges`. */
    struct AcceptRange accept = ranges[mask >> 4];

    switch (size) {
        case 4 : if (sp[3] < locb || hicb < sp[3]) return 0;
                 /* fallthrough */
        case 3 : if (sp[2] < locb || hicb < sp[2]) return 0;
                 /* fallthrough */
        case 2 : if (sp[1] < accept.lo || accept.hi < sp[1]) return 0;
                 break;
        default: return 0;
    }
    return size;
}
// find_non_ascii scans the nb bytes starting at sp and returns the offset of
// the first byte that is not ASCII (>= 0x80), or -1 when no such byte exists.
ssize_t find_non_ascii(const uint8_t*sp, ssize_t nb) {
    const uint8_t *cur = sp;
    int64_t hits;

#if USE_AVX2
    /* 32 bytes per step */
    for (; nb >= 32; nb -= 32, cur += 32) {
        __m256i chunk = _mm256_loadu_si256((const void *)cur);
        hits = _mm256_ascii_mask(chunk);
        if (unlikely(hits != 0)) {
            /* lowest set bit = first non-ASCII byte in this chunk */
            return cur - sp + __builtin_ctzll(hits);
        }
    }

    /* clear upper half to avoid AVX-SSE transition penalty */
    _mm256_zeroupper();
#endif

    /* 16 bytes per step */
    for (; nb >= 16; nb -= 16, cur += 16) {
        __m128i chunk = _mm_loadu_si128((const void *)cur);
        hits = _mm_ascii_mask(chunk);
        if (unlikely(hits != 0)) {
            return cur - sp + __builtin_ctzll(hits);
        }
    }

    /* fewer than 16 bytes left: check them one at a time */
    for (; nb > 0; nb--, cur++) {
        if (!is_ascii(*cur)) {
            return cur - sp;
        }
    }

    /* nothing found */
    return -1;
}
// utf8_validate checks whether the nb bytes at sp are valid UTF-8.
// Returns -1 if the input is valid; otherwise returns the byte offset of the
// first invalid sequence.
ssize_t utf8_validate(const char *sp, ssize_t nb) {
    const uint8_t *start = (const uint8_t *)sp;
    const uint8_t *cur   = start;

    while (nb > 0) {
        /* Runs of non-ASCII bytes are common, so test the current byte first
         * and only launch the ASCII scan when it is actually ASCII. */
        ssize_t skip = 0;
        if (is_ascii(*cur)) {
            skip = find_non_ascii(cur, nb);
            if (skip == -1) {
                /* the rest of the input is pure ASCII */
                break;
            }
        }

        /* no non-ASCII byte inside the remaining input */
        if (skip >= nb) {
            return -1;
        }
        cur += skip;
        nb  -= skip;

        /* validate the sequence that starts here */
        ssize_t len = nonascii_is_utf8(cur, nb);
        if (unlikely(len == 0)) {
            return cur - start;
        }
        cur += len;
        nb  -= len;
    }
    return -1;
}