/*
 * Copyright (c) 2009 The Go Authors. All rights reserved.
 * Modifications Copyright 2021 ByteDance Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "native.h"

// Byte mask of a 128-bit vector, as produced by _mm_movemask_epi8.
// ASCII bytes are 0x00 ~ 0x7F, so callers treat a non-zero result as
// "this chunk contains a non-ASCII byte".
static inline int _mm_ascii_mask(__m128i vv) {
    const int lanes = _mm_movemask_epi8(vv);
    return lanes;
}

#if USE_AVX2
// 256-bit counterpart of _mm_ascii_mask, built on _mm256_movemask_epi8.
static inline int _mm256_ascii_mask(__m256i vv) {
    const int lanes = _mm256_movemask_epi8(vv);
    return lanes;
}
#endif

// True when ch is in the ASCII range 0x00 ~ 0x7F (top bit clear).
static inline bool is_ascii(uint8_t ch) {
    return (ch & 0x80) == 0;
}

// The default lowest and highest continuation byte.
static const uint8_t locb = 0x80;
static const uint8_t hicb = 0xBF;

// Lead-byte classes stored in first[]: the low bits carry the sequence size,
// the high nibble carries the accept-range number.
static const uint8_t xx = 0xF1;  // invalid: size 1
static const uint8_t as = 0xF0;  // ASCII: size 1
static const uint8_t s1 = 0x02;  // accept 0, size 2
static const uint8_t s2 = 0x13;  // accept 1, size 3
static const uint8_t s3 = 0x03;  // accept 0, size 3
static const uint8_t s4 = 0x23;  // accept 2, size 3
static const uint8_t s5 = 0x34;  // accept 3, size 4
static const uint8_t s6 = 0x04;  // accept 0, size 4
static const uint8_t s7 = 0x44;  // accept 4, size 4

// first is information about the first byte in a UTF-8 sequence.
static const uint8_t first[256] = { // 1 2 3 4 5 6 7 8 9 A B C D E F as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x00-0x0F as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x10-0x1F as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x20-0x2F as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x30-0x3F as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x40-0x4F as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x50-0x5F as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x60-0x6F as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x70-0x7F // 1 2 3 4 5 6 7 8 9 A B C D E F xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x80-0x8F xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x90-0x9F xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xA0-0xAF xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xB0-0xBF xx, xx, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xC0-0xCF s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xD0-0xDF s2, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s4, s3, s3, // 0xE0-0xEF s5, s6, s6, s6, s7, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xF0-0xFF }; // AcceptRange gives the range of valid values for the second byte in a UTF-8 // sequence. struct AcceptRange { uint8_t lo; // lowest value for second byte. uint8_t hi; // highest value for second byte. }; // ranges has size 16 to avoid bounds checks in the code that uses it. 
const static struct AcceptRange ranges[5] = { {locb, hicb}, // 0 {0xA0, hicb}, // 1 {locb, 0x9F}, // 2 {0x90, hicb}, // 3 {locb, 0x8F}, // 4 }; // UTF-8 code point | first byte | second byte | third byte | fourth byte // U+0000 - U+007F | 0___ ____ // U+0080 - U+07FF | 110_ ____ | 10__ ____ // U+0800 - U+D7FF | 1110 ____ | 10__ ____ | 10__ ____ // U+D800 - U+DFFF | reserved for UTF-16 surrogate pairs // U+E000 - U+FFFF | 1110 ____ | 10__ ____ | 10__ ____ // U+10000 - U+10FFFF | 1111 0___ | 10__ ____ | 10__ ____ | 10__ ____ // checks non-ascii characters, and returns the utf-8 length static inline ssize_t nonascii_is_utf8(const uint8_t* sp, size_t n) { uint8_t mask = first[sp[0]]; uint8_t size = mask & 7; if (n < size) { return 0; } struct AcceptRange accept = ranges[mask >> 4]; switch (size) { case 4 : if (sp[3] < locb || hicb < sp[3]) return 0; case 3 : if (sp[2] < locb || hicb < sp[2]) return 0; case 2 : if (sp[1] < accept.lo || accept.hi < sp[1]) return 0; break; case 1 : return 0; // invalid chars case 0 : return 1; // ascii chars default: return 0; } return size; } ssize_t find_non_ascii(const uint8_t*sp, ssize_t nb) { const uint8_t* ss = sp; int64_t m; #if USE_AVX2 while (nb >= 32) { __m256i v = _mm256_loadu_si256 ((const void *)(sp)); if (unlikely((m = _mm256_ascii_mask(v)) != 0)) { return sp - ss + __builtin_ctzll(m); } nb -= 32; sp += 32; } /* clear spper half to avoid AVX-SSE transition penalty */ _mm256_zeroupper(); #endif while (nb >= 16) { __m128i v = _mm_loadu_si128 ((const void *)(sp)); if (unlikely((m = _mm_ascii_mask(v)) != 0)) { return sp - ss + __builtin_ctzll(m); } nb -= 16; sp += 16; } /* remaining bytes, do with scalar code */ while (nb-- > 0) { if (is_ascii(*sp)) { sp++; } else { return sp - ss; } } /* nothing found */ return -1; } // utf8_validate validates whether the JSON string is valid UTF-8. // return -1 if validate, otherwise, return the error postion. 
//   sp  start of the buffer to check.
//   nb  number of bytes in the buffer; nothing is read when nb <= 0.
//
// The result is -1 when no invalid sequence is found, otherwise the byte
// offset (from sp) of the first sequence rejected by nonascii_is_utf8.
ssize_t utf8_validate(const char *sp, ssize_t nb) {
    const uint8_t* base = (const uint8_t*)sp;
    const uint8_t* cur  = base;

    while (nb > 0) {
        /* When the current byte is already non-ASCII there is nothing to
         * skip; this keeps runs of consecutive non-ASCII characters off the
         * vector scan. */
        ssize_t skip = is_ascii(*cur) ? find_non_ascii(cur, nb) : 0;

        /* only ASCII bytes remain */
        if (skip == -1) {
            return -1;
        }

        /* an offset at or beyond the end leaves nothing to validate */
        if (skip >= nb) {
            return -1;
        }
        cur += skip;
        nb  -= skip;

        /* validate the sequence starting at the non-ASCII byte */
        ssize_t len = nonascii_is_utf8(cur, nb);
        if (unlikely(len == 0)) {
            return cur - base;
        }
        cur += len;
        nb  -= len;
    }

    return -1;
}