mirror of
https://github.com/ii64/sonic.git
synced 2026-06-21 00:46:43 +08:00
* opt: faster f64toa * feat:(option) add option `MaxInlineDepth` for addjust compilation inline depth (#287) * feat: make compilation depth changeable * feat: add option `DefaultMaxInlineDepth` * add recurse depth = 10 * refactor * doc: readme and comment * opt: add `_MAX_FIELDS` to limit the inlining of big struct * update license * fix typo * feat:(ast) support cast `null` to empty value (#278) * fix: recompile * test: add ftoa benchmarks * feat: add f32toa * fix: add license * opt: optimize decimal * fix benchmarks * fix: poor performance in atof32 * test: add more integer test Co-authored-by: liuqiang <liuqiang.06@bytedance.com> Co-authored-by: Yi Duan <duanyi.aster@bytedance.com>
115 lines
No EOL
4.2 KiB
C
115 lines
No EOL
4.2 KiB
C
#ifndef UTF8_H
|
|
#define UTF8_H
|
|
/*
 * Copyright (C) 2019 Yaoyuan <ibireme@gmail.com>.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * This file may have been modified by ByteDance authors. All ByteDance
 * Modifications are Copyright 2022 ByteDance Authors.
 */
|
|
|
|
/*
 * Classify the multi-byte UTF-8 sequence that starts at the lowest byte of
 * `ubin`.
 *
 * `ubin` holds up to four consecutive input bytes, with the first byte of the
 * sequence in bits 0..7, the second in bits 8..15, and so on (i.e. the value
 * produced by a little-endian 32-bit load). Bytes beyond the end of the
 * sequence are ignored by the masks, so the caller only has to guarantee that
 * the bytes it does not own are zero-padded.
 *
 * Returns 2, 3 or 4 when `ubin` starts with a well-formed 2-, 3- or 4-byte
 * sequence, and 0 otherwise. Note that 0 is also returned for a 1-byte
 * (ASCII) lead byte: only multi-byte sequences are recognised here.
 */
static inline ssize_t valid_utf8_4byte(uint32_t ubin) {
    /*
     Each unicode code point is encoded as 1 to 4 bytes in UTF-8 encoding,
     we use 4-byte mask and pattern value to validate UTF-8 byte sequence,
     this requires the input data to have 4-byte zero padding.
     ---------------------------------------------------
     1 byte
     unicode range [U+0000, U+007F]
     unicode min   [.......0]
     unicode max   [.1111111]
     bit pattern   [0.......]
     ---------------------------------------------------
     2 byte
     unicode range [U+0080, U+07FF]
     unicode min   [......10 ..000000]
     unicode max   [...11111 ..111111]
     bit require   [...xxxx. ........] (1E 00)
     bit mask      [xxx..... xx......] (E0 C0)
     bit pattern   [110..... 10......] (C0 80)
     ---------------------------------------------------
     3 byte
     unicode range [U+0800, U+FFFF]
     unicode min   [........ ..100000 ..000000]
     unicode max   [....1111 ..111111 ..111111]
     bit require   [....xxxx ..x..... ........] (0F 20 00)
     bit mask      [xxxx.... xx...... xx......] (F0 C0 C0)
     bit pattern   [1110.... 10...... 10......] (E0 80 80)
     ---------------------------------------------------
     3 byte invalid (reserved for surrogate halves)
     unicode range [U+D800, U+DFFF]
     unicode min   [....1101 ..100000 ..000000]
     unicode max   [....1101 ..111111 ..111111]
     bit mask      [....xxxx ..x..... ........] (0F 20 00)
     bit pattern   [....1101 ..1..... ........] (0D 20 00)
     ---------------------------------------------------
     4 byte
     unicode range [U+10000, U+10FFFF]
     unicode min   [........ ...10000 ..000000 ..000000]
     unicode max   [.....100 ..001111 ..111111 ..111111]
     bit err0      [.....100 ........ ........ ........] (04 00 00 00)
     bit err1      [.....011 ..110000 ........ ........] (03 30 00 00)
     bit require   [.....xxx ..xx.... ........ ........] (07 30 00 00)
     bit mask      [xxxxx... xx...... xx...... xx......] (F8 C0 C0 C0)
     bit pattern   [11110... 10...... 10...... 10......] (F0 80 80 80)
     ---------------------------------------------------
     The constants below are the byte-swapped form of the values in
     parentheses, because the first input byte lives in the low 8 bits.
     */
    const uint32_t b2_mask = 0x0000C0E0u;
    const uint32_t b2_patt = 0x000080C0u;
    const uint32_t b2_requ = 0x0000001Eu;
    const uint32_t b3_mask = 0x00C0C0F0u;
    const uint32_t b3_patt = 0x008080E0u;
    const uint32_t b3_requ = 0x0000200Fu;
    const uint32_t b3_erro = 0x0000200Du;
    const uint32_t b4_mask = 0xC0C0C0F8u;
    const uint32_t b4_patt = 0x808080F0u;
    const uint32_t b4_requ = 0x00003007u;
    const uint32_t b4_err0 = 0x00000004u;
    const uint32_t b4_err1 = 0x00003003u;

    /* 3-byte form: the payload bits that distinguish it from a shorter
     * encoding must not all be zero (overlong), and must not spell out a
     * UTF-16 surrogate (U+D800..U+DFFF). */
    const uint32_t req3 = ubin & b3_requ;
    if ((ubin & b3_mask) == b3_patt && req3 != 0 && req3 != b3_erro) {
        return 3;
    }

    /* 2-byte form: at least one of the high payload bits must be set,
     * otherwise the code point would have fit in a single byte. */
    if ((ubin & b2_mask) == b2_patt && (ubin & b2_requ) != 0) {
        return 2;
    }

    /* 4-byte form: reject overlong encodings (all required bits clear) and
     * anything above U+10FFFF (err0 bit set together with any err1 bit). */
    const uint32_t req4 = ubin & b4_requ;
    if ((ubin & b4_mask) == b4_patt && req4 != 0 &&
        ((req4 & b4_err0) == 0 || (req4 & b4_err1) == 0)) {
        return 4;
    }

    return 0;
}
|
|
|
|
/*
 * Pack the first `nb` bytes at `sp` into a uint32_t, with sp[0] in bits 0..7,
 * sp[1] in bits 8..15 and sp[2] in bits 16..23. Unused high bits are zero.
 *
 *   nb == 0 : nothing is read, returns 0
 *   nb == 1 : reads sp[0]
 *   nb == 2 : reads sp[0..1]
 *   nb >= 3 : reads sp[0..2] (never more than three bytes)
 *
 * The bytes are assembled one at a time instead of through a casted
 * uint16_t load, so `sp` needs no particular alignment and no object is
 * accessed through an incompatible pointer type. The resulting layout is
 * the first-byte-in-low-bits layout that valid_utf8_4byte's masks expect.
 */
static inline uint32_t less4byte_to_uint32(const char* sp, size_t nb) {
    const unsigned char *bytes = (const unsigned char *)sp;

    if (nb == 0) {
        return 0;
    }

    uint32_t packed = bytes[0];
    if (nb == 1) {
        return packed;
    }

    packed |= (uint32_t)bytes[1] << 8;
    if (nb == 2) {
        return packed;
    }

    return packed | ((uint32_t)bytes[2] << 16);
}
|
|
|
|
#endif |