2
0
Fork 0
mirror of https://github.com/ii64/sonic.git synced 2026-06-21 00:46:43 +08:00
sonic/native/utf8.h
liu ccc0f3f1e3
opt: faster ftoa (#291)
* opt: faster f64toa

* feat:(option) add option `MaxInlineDepth` to adjust compilation inline depth (#287)

* feat: make compilation depth changeable

* feat: add option `DefaultMaxInlineDepth`

* add recurse depth = 10

* refactor

* doc: readme and comment

* opt: add `_MAX_FIELDS` to limit the inlining of big struct

* update license

* fix typo

* feat:(ast) support cast `null` to empty value (#278)

* fix: recompile

* test: add ftoa benchmarks

* feat: add f32toa

* fix: add license

* opt: optimize decimal

* fix benchmarks

* fix: poor performance in atof32

* test: add more integer test

Co-authored-by: liuqiang <liuqiang.06@bytedance.com>
Co-authored-by: Yi Duan <duanyi.aster@bytedance.com>
2022-09-20 10:26:18 +08:00

115 lines
No EOL
4.2 KiB
C

#ifndef UTF8_H
#define UTF8_H
/*
* Copyright (C) 2019 Yaoyuan <ibireme@gmail.com>.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* This file may have been modified by ByteDance authors. All ByteDance
* Modifications are Copyright 2022 ByteDance Authors.
*/
/*
 * Validates the multi-byte UTF-8 sequence that starts at the lowest byte of
 * `ubin`.
 *
 * `ubin` holds up to four consecutive input bytes with the first (lead) byte
 * in bits 0..7, the second in bits 8..15, and so on, as the masks below show.
 *
 * Returns 2, 3 or 4 (the sequence length in bytes) when the leading bytes
 * form a well-formed 2-, 3- or 4-byte sequence, and 0 otherwise. Single-byte
 * (ASCII) input is not checked here and therefore also yields 0.
 */
static inline ssize_t valid_utf8_4byte(uint32_t ubin) {
    /*
     Each unicode code point is encoded as 1 to 4 bytes in UTF-8 encoding,
     we use 4-byte mask and pattern value to validate UTF-8 byte sequence,
     this requires the input data to have 4-byte zero padding.
     ---------------------------------------------------
     1 byte
     unicode range [U+0000, U+007F]
     unicode min   [.......0]
     unicode max   [.1111111]
     bit pattern   [0.......]
     ---------------------------------------------------
     2 byte
     unicode range [U+0080, U+07FF]
     unicode min   [......10 ..000000]
     unicode max   [...11111 ..111111]
     bit require   [...xxxx. ........] (1E 00)
     bit mask      [xxx..... xx......] (E0 C0)
     bit pattern   [110..... 10......] (C0 80)
     ---------------------------------------------------
     3 byte
     unicode range [U+0800, U+FFFF]
     unicode min   [........ ..100000 ..000000]
     unicode max   [....1111 ..111111 ..111111]
     bit require   [....xxxx ..x..... ........] (0F 20 00)
     bit mask      [xxxx.... xx...... xx......] (F0 C0 C0)
     bit pattern   [1110.... 10...... 10......] (E0 80 80)
     ---------------------------------------------------
     3 byte invalid (reserved for surrogate halves)
     unicode range [U+D800, U+DFFF]
     unicode min   [....1101 ..100000 ..000000]
     unicode max   [....1101 ..111111 ..111111]
     bit mask      [....xxxx ..x..... ........] (0F 20 00)
     bit pattern   [....1101 ..1..... ........] (0D 20 00)
     ---------------------------------------------------
     4 byte
     unicode range [U+10000, U+10FFFF]
     unicode min   [........ ...10000 ..000000 ..000000]
     unicode max   [.....100 ..001111 ..111111 ..111111]
     bit err0      [.....100 ........ ........ ........] (04 00 00 00)
     bit err1      [.....011 ..110000 ........ ........] (03 30 00 00)
     bit require   [.....xxx ..xx.... ........ ........] (07 30 00 00)
     bit mask      [xxxxx... xx...... xx...... xx......] (F8 C0 C0 C0)
     bit pattern   [11110... 10...... 10...... 10......] (F0 80 80 80)
     ---------------------------------------------------
     The constants below are the byte-swapped form of the table values,
     because the lead byte sits in the lowest 8 bits of `ubin`.
     */
    const uint32_t b2_mask = 0x0000C0E0UL;
    const uint32_t b2_patt = 0x000080C0UL;
    const uint32_t b2_requ = 0x0000001EUL;
    const uint32_t b3_mask = 0x00C0C0F0UL;
    const uint32_t b3_patt = 0x008080E0UL;
    const uint32_t b3_requ = 0x0000200FUL;
    const uint32_t b3_erro = 0x0000200DUL;
    const uint32_t b4_mask = 0xC0C0C0F8UL;
    const uint32_t b4_patt = 0x808080F0UL;
    const uint32_t b4_requ = 0x00003007UL;
    const uint32_t b4_err0 = 0x00000004UL;
    const uint32_t b4_err1 = 0x00003003UL;

    /*
     * The checks are written as plain expressions instead of function-like
     * macros so that nothing leaks out of this header into the including
     * translation unit.
     */

    /* 3-byte sequence: the payload bits must be non-zero (rejects overlong
       forms) and must not match the surrogate pattern (U+D800..U+DFFF). */
    if ((ubin & b3_mask) == b3_patt) {
        const uint32_t bits3 = ubin & b3_requ;
        if (bits3 != 0 && bits3 != b3_erro) {
            return 3;
        }
    }

    /* 2-byte sequence: at least one of the required bits must be set,
       otherwise the encoding is overlong (lead byte C0 or C1). */
    if ((ubin & b2_mask) == b2_patt && (ubin & b2_requ) != 0) {
        return 2;
    }

    /* 4-byte sequence: the payload bits must be non-zero (rejects overlong
       forms); when the err0 bit is set, every err1 bit must be clear so
       that the code point stays at or below U+10FFFF. */
    if ((ubin & b4_mask) == b4_patt) {
        const uint32_t bits4 = ubin & b4_requ;
        if (bits4 != 0 &&
            ((bits4 & b4_err0) == 0 || (bits4 & b4_err1) == 0)) {
            return 4;
        }
    }

    return 0;
}
/*
 * Packs the first bytes at `sp` into a uint32_t with sp[0] in bits 0..7,
 * sp[1] in bits 8..15 and sp[2] in bits 16..23 -- the layout the masks in
 * valid_utf8_4byte() are written for. The unused high bits are zero.
 *
 * nb == 1 reads one byte, nb == 2 reads two bytes, and any other value of
 * `nb` reads three bytes, so `sp` must point to at least that many readable
 * bytes.
 *
 * The bytes are read one at a time through a byte pointer rather than via a
 * uint16_t* cast, so the function has no alignment requirement on `sp` and
 * does not type-pun the character buffer.
 */
static inline uint32_t less4byte_to_uint32(const char* sp, size_t nb) {
    const uint8_t *bytes = (const uint8_t *)sp;
    uint32_t packed = bytes[0];

    if (nb == 1) {
        return packed;
    }

    packed |= (uint32_t)bytes[1] << 8;
    if (nb == 2) {
        return packed;
    }

    return packed | ((uint32_t)bytes[2] << 16);
}
#endif