2
0
Fork 0
mirror of https://github.com/ii64/sonic.git synced 2026-06-21 00:46:43 +08:00

add ValidateString option into decode (#253)

* fix: add escape validate

* feat: add validatestring option

* add print debug

* add jit debug in decoder

* fix go1.16 decoder debug

* fix: not change others

* fix generic parse bug

* remove debug info

* test twitter json

* fix: option typos

* test: add bug test

* fix validate bugs

* fix: validate

* remove files

* re-compile

* update license

* add flags on `skip_xxx()`

* fix internal native tests

* re-compile

* add validstring for perftest

* modify api

* fix readme

* fix comment

* add license

* rename to ValidateString

* fix xprintf.h

* add debug

Co-authored-by: liuqiang <liuqiang.06@bytedance.com>
Co-authored-by: duanyi.aster <duanyi.aster@bytedance.com>
This commit is contained in:
liu 2022-08-11 19:06:11 +08:00 committed by GitHub
parent b36771ba37
commit de2dc2c35a
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
28 changed files with 11818 additions and 11911 deletions

View file

@ -3,7 +3,7 @@
A blazingly fast JSON serializing &amp; deserializing library, accelerated by JIT (just-in-time compiling) and SIMD (single-instruction-multiple-data).
## Requirement
- Go 1.15/1.16/1.17/1.18
- Go 1.15/1.16/1.17/1.18/1.19
- Linux/MacOS/Windows
- Amd64 CPU with AVX instruction set
@ -261,10 +261,10 @@ println(string(buf) == string(exp)) // true
- modification: `Set()`, `SetByIndex()`, `Add()`
## Compatibility
Sonic **DOSE NOT** ensure to support all environments, due to the difficulty of developing high-performance codes. For developers who use sonic to build their applications in different environments (ex: developing on M1 Mac but running on linux server), or those who want to handle JSON strictly consistent with `encoding/json`, we provide some compatible APIs as `sonic.API`
- `ConfigDefault`: the sonic's default config (`EscapeHTML=false`,`SortKeys=false`...) to run on sonic-supporting environment. It will fall back to `encoding/json` with corresponding config , and some options like `SortKeys=false` will be invalid.
- `ConfigStd`: the std-compatible config (`EscapeHTML=true`,`SortKeys=true`...) to run on sonic-supporting environment. It whill fall back to `encoding/json`.
- `ConfigFastest`: the fastest config (`NoQuoteTextMarshaler=true`) to run on sonic-supporting environment. It will fall back to `encoding/json` with corresponding config , and some options will be invalid.
Sonic **DOES NOT** guarantee support for all environments, due to the difficulty of developing high-performance code. For developers who use sonic to build their applications in different environments (e.g. developing on an M1 Mac but running on a Linux server), or those who want to handle JSON strictly consistently with `encoding/json`, we provide some compatible APIs as `sonic.API`
- `ConfigDefault`: sonic's default config (`EscapeHTML=false`,`SortKeys=false`...) to run in sonic-supporting environments. It will fall back to `encoding/json` with the corresponding config, and some options like `SortKeys=false` will be invalid.
- `ConfigStd`: the std-compatible config (`EscapeHTML=true`,`SortKeys=true`...) to run in sonic-supporting environments. It will fall back to `encoding/json`.
- `ConfigFastest`: the fastest config (`NoQuoteTextMarshaler=true`) to run in sonic-supporting environments. It will fall back to `encoding/json` with the corresponding config, and some options will be invalid.
## Tips

3
api.go
View file

@ -64,6 +64,9 @@ import (
// CopyString indicates decoder to decode string values by copying instead of referring.
CopyString bool
// ValidateString indicates decoder to validate string values: decoder will return errors when
// invalid UTF-8 chars or unescaped control chars(\u0000-\u001f) are found in a string value of JSON.
ValidateString bool
}
var (

View file

@ -319,7 +319,7 @@ func (self *Parser) Parse() (Node, types.ParsingError) {
func (self *Parser) skip() (int, types.ParsingError) {
fsm := types.NewStateMachine()
start := native.SkipOne(&self.s, &self.p, fsm)
start := native.SkipOne(&self.s, &self.p, fsm, uint64(0))
types.FreeStateMachine(fsm)
if start < 0 {

View file

@ -35,8 +35,10 @@ import (
`testing`
`time`
`unsafe`
`unicode/utf8`
`github.com/bytedance/sonic/decoder`
`github.com/davecgh/go-spew/spew`
)
type T struct {
@ -410,6 +412,7 @@ type unmarshalTest struct {
useNumber bool
golden bool
disallowUnknownFields bool
validateString bool
}
type B struct {
@ -696,11 +699,13 @@ var unmarshalTests = []unmarshalTest{
in: "\"hello\xffworld\"",
ptr: new(string),
out: "hello\xffworld",
validateString: false,
},
{
in: "\"hello\xc2\xc2world\"",
ptr: new(string),
out: "hello\xc2\xc2world",
validateString: false,
},
{
in: "\"hello\xc2\xffworld\"",
@ -999,6 +1004,17 @@ var unmarshalTests = []unmarshalTest{
ptr: new(map[string]json.Number),
err: fmt.Errorf("json: invalid number literal, trying to unmarshal %q into Number", `"invalid"`),
},
{in: `\u`, ptr: new(interface{}), err: fmt.Errorf("json: invald char"), validateString: true},
{in: `\u`, ptr: new(string), err: fmt.Errorf("json: invald char"), validateString: true},
{in: "\"\x00\"", ptr: new(interface{}), err: fmt.Errorf("json: invald char"), validateString: true},
{in: "\"\x00\"", ptr: new(string), err: fmt.Errorf("json: invald char"), validateString: true},
{in: "\"\xff\"", ptr: new(interface{}), err: fmt.Errorf("json: invald char"), validateString: true},
{in: "\"\xff\"", ptr: new(string), err: fmt.Errorf("json: invald char"), validateString: true},
{in: "\"\x00\"", ptr: new(interface{}), out: interface{}("\x00"), validateString: false},
{in: "\"\x00\"", ptr: new(string), out: "\x00", validateString: false},
{in: "\"\xff\"", ptr: new(interface{}), out: interface{}("\xff"), validateString: false},
{in: "\"\xff\"", ptr: new(string), out: "\xff", validateString: false},
}
func trim(b []byte) []byte {
@ -1128,14 +1144,20 @@ func TestUnmarshal(t *testing.T) {
}
dec := decoder.NewDecoder(tt.in)
validUtf8 := true
if tt.useNumber {
dec.UseNumber()
}
if tt.disallowUnknownFields {
dec.DisallowUnknownFields()
}
if err := dec.Decode(v.Interface()); (err == nil) != (tt.err == nil) {
t.Errorf("#%d: %v, want %v", i, err, tt.err)
if tt.validateString {
dec.ValidateString()
validUtf8 = utf8.Valid([]byte(tt.in))
}
if err := dec.Decode(v.Interface()); (err == nil) != (tt.err == nil && validUtf8) {
spew.Dump(tt.in)
t.Fatalf("#%d: %v, want %v", i, err, tt.err)
continue
} else if err != nil {
continue
@ -2203,7 +2225,6 @@ func TestInvalidStringOption(t *testing.T) {
if err != nil {
t.Fatalf("Marshal: %v", err)
}
err = Unmarshal(data, &item)
if err != nil {
t.Fatalf("Unmarshal: %v", err)

View file

@ -387,6 +387,7 @@ func (self *_Assembler) call_sf(fn obj.Addr) {
self.Emit("MOVQ", _IC, _ARG_ic) // MOVQ IC, ic<>+16(FP)
self.Emit("LEAQ", _ARG_ic, _SI) // LEAQ ic<>+16(FP), SI
self.Emit("LEAQ", jit.Ptr(_ST, _FsmOffset), _DX) // LEAQ _FsmOffset(ST), DX
self.Emit("MOVQ", _ARG_fv, _CX)
self.call(fn) // CALL ${fn}
self.Emit("MOVQ", _ARG_ic, _IC) // MOVQ ic<>+16(FP), IC
}
@ -591,7 +592,8 @@ func (self *_Assembler) check_eof(d int64) {
}
}
func (self *_Assembler) parse_string() {
func (self *_Assembler) parse_string() { // the vstring call takes the decoder flags (incl. the validate flag) as its last parameter
self.Emit("MOVQ", _ARG_fv, _CX) // MOVQ fv, CX — load the flags; presumably CX carries the 4th native argument (TODO confirm against the call ABI)
self.call_vf(_F_vstring) // emit the call to _F_vstring through call_vf
self.check_err() // emit the error check that follows the call
}

View file

@ -404,6 +404,7 @@ func (self *_Assembler) call_sf(fn obj.Addr) {
self.Emit("MOVQ", _IC, _ARG_ic) // MOVQ IC, ic<>+16(FP)
self.Emit("LEAQ", _ARG_ic, _SI) // LEAQ ic<>+16(FP), SI
self.Emit("LEAQ", jit.Ptr(_ST, _FsmOffset), _DX) // LEAQ _FsmOffset(ST), DX
self.Emit("MOVQ", _ARG_fv, _CX)
self.callc(fn)
self.Emit("MOVQ", _ARG_ic, _IC) // MOVQ ic<>+16(FP), IC
}
@ -602,6 +603,7 @@ func (self *_Assembler) check_eof(d int64) {
}
// parse_string emits the code that parses a JSON string: it loads the decoder
// flags into CX so they are passed along to the vstring call (whose last
// parameter is the flags word), emits that call, then emits the error check.
func (self *_Assembler) parse_string() {
self.Emit("MOVQ", _ARG_fv, _CX) // MOVQ fv, CX — presumably the 4th native argument (TODO confirm against the call ABI)
self.call_vf(_F_vstring) // emit the call to _F_vstring through call_vf
self.check_err() // emit the error check that follows the call
}
@ -933,6 +935,8 @@ func (self *_Assembler) mapassign_utext(t reflect.Type, addressable bool) {
var (
_F_skip_one = jit.Imm(int64(native.S_skip_one))
_F_skip_array = jit.Imm(int64(native.S_skip_array))
_F_skip_object = jit.Imm(int64(native.S_skip_object))
_F_skip_number = jit.Imm(int64(native.S_skip_number))
)
@ -1031,11 +1035,6 @@ var (
_F_decodeValue = jit.Imm(int64(_subr_decode_value))
)
var (
_F_skip_array = jit.Imm(int64(native.S_skip_array))
_F_skip_object = jit.Imm(int64(native.S_skip_object))
)
var (
_F_FieldMap_GetCaseInsensitive obj.Addr
)

View file

@ -33,6 +33,7 @@ const (
_F_disable_urc
_F_disable_unknown
_F_copy_string
_F_validate_string
_F_allow_control = 31
)
@ -45,6 +46,7 @@ const (
OptionUseUnicodeErrors Options = 1 << _F_disable_urc
OptionDisableUnknown Options = 1 << _F_disable_unknown
OptionCopyString Options = 1 << _F_copy_string
OptionValidateString Options = 1 << _F_validate_string
)
func (self *Decoder) SetOptions(opts Options) {
@ -139,6 +141,13 @@ func (self *Decoder) CopyString() {
self.f |= 1 << _F_copy_string
}
// ValidateString causes the Decoder to validate JSON string values while decoding.
// With this option set, decoding returns an error when a string value contains
// unescaped control chars (0x00-0x1f) or invalid UTF-8 chars.
func (self *Decoder) ValidateString() {
self.f |= 1 << _F_validate_string // set the validate-string bit in the decoder flags
}
// Pretouch compiles vt ahead-of-time to avoid JIT compilation on-the-fly, in
// order to reduce the first-hit latency.
//
@ -199,7 +208,7 @@ func Skip(data []byte) (start int, end int) {
s := rt.Mem2Str(data)
p := 0
m := types.NewStateMachine()
ret := native.SkipOne(&s, &p, m)
ret := native.SkipOne(&s, &p, m, uint64(0))
types.FreeStateMachine(m)
return ret, p
}

View file

@ -67,12 +67,12 @@ func __unquote(sp unsafe.Pointer, nb int, dp unsafe.Pointer, ep *int, flags uint
//go:nosplit
//go:noescape
//goland:noinspection GoUnusedParameter
func __value(s unsafe.Pointer, n int, p int, v *types.JsonState, allow_control int) (ret int)
func __value(s unsafe.Pointer, n int, p int, v *types.JsonState, flags uint64) (ret int)
//go:nosplit
//go:noescape
//goland:noinspection GoUnusedParameter
func __vstring(s *string, p *int, v *types.JsonState)
func __vstring(s *string, p *int, v *types.JsonState, flags uint64)
//go:nosplit
//go:noescape
@ -92,17 +92,17 @@ func __vunsigned(s *string, p *int, v *types.JsonState)
//go:nosplit
//go:noescape
//goland:noinspection GoUnusedParameter
func __skip_one(s *string, p *int, m *types.StateMachine) (ret int)
func __skip_one(s *string, p *int, m *types.StateMachine, flags uint64) (ret int)
//go:nosplit
//go:noescape
//goland:noinspection GoUnusedParameter
func __skip_array(s *string, p *int, m *types.StateMachine) (ret int)
func __skip_array(s *string, p *int, m *types.StateMachine, flags uint64) (ret int)
//go:nosplit
//go:noescape
//goland:noinspection GoUnusedParameter
func __skip_object(s *string, p *int, m *types.StateMachine) (ret int)
func __skip_object(s *string, p *int, m *types.StateMachine, flags uint64) (ret int)
//go:nosplit
//go:noescape

File diff suppressed because it is too large Load diff

View file

@ -237,21 +237,39 @@ func TestNative_Vstring(t *testing.T) {
var v types.JsonState
i := 0
s := `test"test\n2"`
__vstring(&s, &i, &v)
__vstring(&s, &i, &v, 0)
assert.Equal(t, 5, i)
assert.Equal(t, -1, v.Ep)
assert.Equal(t, int64(0), v.Iv)
__vstring(&s, &i, &v)
__vstring(&s, &i, &v, 0)
assert.Equal(t, 13, i)
assert.Equal(t, 9, v.Ep)
assert.Equal(t, int64(5), v.Iv)
}
// TestNative_Vstring_ValidUnescapedChars checks that, with F_VALIDATE_STRING set,
// __vstring rejects a string containing an unescaped control byte (0x1f) by
// storing -ERR_INVALID_CHAR in v.Vt.
func TestNative_Vstring_ValidUnescapedChars(t *testing.T) {
var v types.JsonState
valid := uint64(types.F_VALIDATE_STRING) // flags word with only the validate bit set
i := 0
s := "test\x1f\"" // string body with a raw 0x1f before the closing quote
__vstring(&s, &i, &v, valid)
assert.Equal(t, -int(types.ERR_INVALID_CHAR), int(v.Vt)) // error is reported as a negated code in Vt
}
// TestNative_Vstring_ValidUtf8 checks that, with F_VALIDATE_STRING set,
// __vstring rejects a string containing the byte 0xff (never valid in UTF-8)
// by storing -ERR_INVALID_CHAR in v.Vt.
func TestNative_Vstring_ValidUtf8(t *testing.T) {
var v types.JsonState
valid := uint64(types.F_VALIDATE_STRING) // flags word with only the validate bit set
i := 0
s := "test\xff\"" // string body with a raw 0xff before the closing quote
__vstring(&s, &i, &v, valid)
assert.Equal(t, -int(types.ERR_INVALID_CHAR), int(v.Vt)) // error is reported as a negated code in Vt
}
func TestNative_VstringEscapeEOF(t *testing.T) {
var v types.JsonState
i := 0
s := `xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\"xxxxxxxxxxxxxxxxxxxxxxxxxxxxx"x`
__vstring(&s, &i, &v)
__vstring(&s, &i, &v, 0)
assert.Equal(t, 95, i)
assert.Equal(t, 63, v.Ep)
assert.Equal(t, int64(0), v.Iv)
@ -312,7 +330,7 @@ func TestNative_VstringHangUpOnRandomData(t *testing.T) {
p := 1
s := rt.Mem2Str(v)
var js types.JsonState
__vstring(&s, &p, &js)
__vstring(&s, &p, &js, 0)
fmt.Printf("js: %s\n", spew.Sdump(js))
}
@ -506,36 +524,36 @@ func TestNative_Vunsigned(t *testing.T) {
func TestNative_SkipOne(t *testing.T) {
p := 0
s := ` {"asdf": [null, true, false, 1, 2.0, -3]}, 1234.5`
q := __skip_one(&s, &p, &types.StateMachine{})
q := __skip_one(&s, &p, &types.StateMachine{}, uint64(0))
assert.Equal(t, 42, p)
assert.Equal(t, 1, q)
p = 0
s = `1 2.5 -3 "asdf\nqwer" true false null {} []`
q = __skip_one(&s, &p, &types.StateMachine{})
q = __skip_one(&s, &p, &types.StateMachine{}, uint64(0))
assert.Equal(t, 1, p)
assert.Equal(t, 0, q)
q = __skip_one(&s, &p, &types.StateMachine{})
q = __skip_one(&s, &p, &types.StateMachine{}, uint64(0))
assert.Equal(t, 5, p)
assert.Equal(t, 2, q)
q = __skip_one(&s, &p, &types.StateMachine{})
q = __skip_one(&s, &p, &types.StateMachine{}, uint64(0))
assert.Equal(t, 8, p)
assert.Equal(t, 6, q)
q = __skip_one(&s, &p, &types.StateMachine{})
q = __skip_one(&s, &p, &types.StateMachine{}, uint64(0))
assert.Equal(t, 21, p)
assert.Equal(t, 9, q)
q = __skip_one(&s, &p, &types.StateMachine{})
q = __skip_one(&s, &p, &types.StateMachine{}, uint64(0))
assert.Equal(t, 26, p)
assert.Equal(t, 22, q)
q = __skip_one(&s, &p, &types.StateMachine{})
q = __skip_one(&s, &p, &types.StateMachine{}, uint64(0))
assert.Equal(t, 32, p)
assert.Equal(t, 27, q)
q = __skip_one(&s, &p, &types.StateMachine{})
q = __skip_one(&s, &p, &types.StateMachine{}, uint64(0))
assert.Equal(t, 37, p)
assert.Equal(t, 33, q)
q = __skip_one(&s, &p, &types.StateMachine{})
q = __skip_one(&s, &p, &types.StateMachine{}, uint64(0))
assert.Equal(t, 40, p)
assert.Equal(t, 38, q)
q = __skip_one(&s, &p, &types.StateMachine{})
q = __skip_one(&s, &p, &types.StateMachine{}, uint64(0))
assert.Equal(t, 43, p)
assert.Equal(t, 41, q)
}
@ -547,7 +565,7 @@ func TestNative_SkipOne_Error(t *testing.T) {
`"asdf`, `"\\\"`,
}) {
p := 0
q := __skip_one(&s, &p, &types.StateMachine{})
q := __skip_one(&s, &p, &types.StateMachine{}, uint64(0))
assert.True(t, q < 0)
}
}
@ -555,14 +573,14 @@ func TestNative_SkipOne_Error(t *testing.T) {
func TestNative_SkipArray(t *testing.T) {
p := 0
s := `null, true, false, 1, 2.0, -3, {"asdf": "wqer"}],`
__skip_array(&s, &p, &types.StateMachine{})
__skip_array(&s, &p, &types.StateMachine{}, uint64(0))
assert.Equal(t, p, 48)
}
func TestNative_SkipObject(t *testing.T) {
p := 0
s := `"asdf": "wqer"},`
__skip_object(&s, &p, &types.StateMachine{})
__skip_object(&s, &p, &types.StateMachine{}, uint64(0))
assert.Equal(t, p, 15)
}

View file

@ -9,45 +9,45 @@ package avx
func __native_entry__() uintptr
var (
_subr__f64toa = __native_entry__() + 630
_subr__html_escape = __native_entry__() + 8581
_subr__i64toa = __native_entry__() + 3642
_subr__lspace = __native_entry__() + 301
_subr__f64toa = __native_entry__() + 570
_subr__html_escape = __native_entry__() + 9062
_subr__i64toa = __native_entry__() + 3205
_subr__lspace = __native_entry__() + 251
_subr__lzero = __native_entry__() + 13
_subr__quote = __native_entry__() + 4955
_subr__skip_array = __native_entry__() + 17819
_subr__skip_number = __native_entry__() + 20937
_subr__skip_object = __native_entry__() + 17856
_subr__skip_one = __native_entry__() + 16120
_subr__u64toa = __native_entry__() + 3735
_subr__unquote = __native_entry__() + 6426
_subr__validate_one = __native_entry__() + 21054
_subr__value = __native_entry__() + 11301
_subr__vnumber = __native_entry__() + 14278
_subr__vsigned = __native_entry__() + 15592
_subr__vstring = __native_entry__() + 13243
_subr__vunsigned = __native_entry__() + 15851
_subr__quote = __native_entry__() + 4498
_subr__skip_array = __native_entry__() + 19852
_subr__skip_number = __native_entry__() + 21130
_subr__skip_object = __native_entry__() + 19887
_subr__skip_one = __native_entry__() + 18078
_subr__u64toa = __native_entry__() + 3300
_subr__unquote = __native_entry__() + 6037
_subr__validate_one = __native_entry__() + 21247
_subr__value = __native_entry__() + 11651
_subr__vnumber = __native_entry__() + 16191
_subr__vsigned = __native_entry__() + 17496
_subr__vstring = __native_entry__() + 13546
_subr__vunsigned = __native_entry__() + 17776
)
const (
_stack__f64toa = 120
_stack__html_escape = 72
_stack__f64toa = 136
_stack__html_escape = 64
_stack__i64toa = 24
_stack__lspace = 8
_stack__lzero = 8
_stack__quote = 56
_stack__quote = 80
_stack__skip_array = 144
_stack__skip_number = 96
_stack__skip_number = 80
_stack__skip_object = 144
_stack__skip_one = 144
_stack__u64toa = 8
_stack__unquote = 88
_stack__unquote = 72
_stack__validate_one = 144
_stack__value = 416
_stack__vnumber = 312
_stack__vsigned = 16
_stack__vstring = 128
_stack__vunsigned = 8
_stack__vstring = 136
_stack__vunsigned = 24
)
var (

View file

@ -67,12 +67,12 @@ func __unquote(sp unsafe.Pointer, nb int, dp unsafe.Pointer, ep *int, flags uint
//go:nosplit
//go:noescape
//goland:noinspection GoUnusedParameter
func __value(s unsafe.Pointer, n int, p int, v *types.JsonState, allow_control int) (ret int)
func __value(s unsafe.Pointer, n int, p int, v *types.JsonState, flags uint64) (ret int)
//go:nosplit
//go:noescape
//goland:noinspection GoUnusedParameter
func __vstring(s *string, p *int, v *types.JsonState)
func __vstring(s *string, p *int, v *types.JsonState, flags uint64)
//go:nosplit
//go:noescape
@ -92,17 +92,17 @@ func __vunsigned(s *string, p *int, v *types.JsonState)
//go:nosplit
//go:noescape
//goland:noinspection GoUnusedParameter
func __skip_one(s *string, p *int, m *types.StateMachine) (ret int)
func __skip_one(s *string, p *int, m *types.StateMachine, flags uint64) (ret int)
//go:nosplit
//go:noescape
//goland:noinspection GoUnusedParameter
func __skip_array(s *string, p *int, m *types.StateMachine) (ret int)
func __skip_array(s *string, p *int, m *types.StateMachine, flags uint64) (ret int)
//go:nosplit
//go:noescape
//goland:noinspection GoUnusedParameter
func __skip_object(s *string, p *int, m *types.StateMachine) (ret int)
func __skip_object(s *string, p *int, m *types.StateMachine, flags uint64) (ret int)
//go:nosplit
//go:noescape

File diff suppressed because it is too large Load diff

View file

@ -237,21 +237,39 @@ func TestNative_Vstring(t *testing.T) {
var v types.JsonState
i := 0
s := `test"test\n2"`
__vstring(&s, &i, &v)
__vstring(&s, &i, &v, 0)
assert.Equal(t, 5, i)
assert.Equal(t, -1, v.Ep)
assert.Equal(t, int64(0), v.Iv)
__vstring(&s, &i, &v)
__vstring(&s, &i, &v, 0)
assert.Equal(t, 13, i)
assert.Equal(t, 9, v.Ep)
assert.Equal(t, int64(5), v.Iv)
}
// TestNative_Vstring_ValidUnescapedChars checks that, with F_VALIDATE_STRING set,
// __vstring rejects a string containing an unescaped control byte (0x1f) by
// storing -ERR_INVALID_CHAR in v.Vt.
func TestNative_Vstring_ValidUnescapedChars(t *testing.T) {
var v types.JsonState
valid := uint64(types.F_VALIDATE_STRING) // flags word with only the validate bit set
i := 0
s := "test\x1f\"" // string body with a raw 0x1f before the closing quote
__vstring(&s, &i, &v, valid)
assert.Equal(t, -int(types.ERR_INVALID_CHAR), int(v.Vt)) // error is reported as a negated code in Vt
}
// TestNative_Vstring_ValidUtf8 checks that, with F_VALIDATE_STRING set,
// __vstring rejects a string containing the byte 0xff (never valid in UTF-8)
// by storing -ERR_INVALID_CHAR in v.Vt.
func TestNative_Vstring_ValidUtf8(t *testing.T) {
var v types.JsonState
valid := uint64(types.F_VALIDATE_STRING) // flags word with only the validate bit set
i := 0
s := "test\xff\"" // string body with a raw 0xff before the closing quote
__vstring(&s, &i, &v, valid)
assert.Equal(t, -int(types.ERR_INVALID_CHAR), int(v.Vt)) // error is reported as a negated code in Vt
}
func TestNative_VstringEscapeEOF(t *testing.T) {
var v types.JsonState
i := 0
s := `xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\"xxxxxxxxxxxxxxxxxxxxxxxxxxxxx"x`
__vstring(&s, &i, &v)
__vstring(&s, &i, &v, 0)
assert.Equal(t, 95, i)
assert.Equal(t, 63, v.Ep)
assert.Equal(t, int64(0), v.Iv)
@ -312,7 +330,7 @@ func TestNative_VstringHangUpOnRandomData(t *testing.T) {
p := 1
s := rt.Mem2Str(v)
var js types.JsonState
__vstring(&s, &p, &js)
__vstring(&s, &p, &js, 0)
fmt.Printf("js: %s\n", spew.Sdump(js))
}
@ -506,36 +524,36 @@ func TestNative_Vunsigned(t *testing.T) {
func TestNative_SkipOne(t *testing.T) {
p := 0
s := ` {"asdf": [null, true, false, 1, 2.0, -3]}, 1234.5`
q := __skip_one(&s, &p, &types.StateMachine{})
q := __skip_one(&s, &p, &types.StateMachine{}, uint64(0))
assert.Equal(t, 42, p)
assert.Equal(t, 1, q)
p = 0
s = `1 2.5 -3 "asdf\nqwer" true false null {} []`
q = __skip_one(&s, &p, &types.StateMachine{})
q = __skip_one(&s, &p, &types.StateMachine{}, uint64(0))
assert.Equal(t, 1, p)
assert.Equal(t, 0, q)
q = __skip_one(&s, &p, &types.StateMachine{})
q = __skip_one(&s, &p, &types.StateMachine{}, uint64(0))
assert.Equal(t, 5, p)
assert.Equal(t, 2, q)
q = __skip_one(&s, &p, &types.StateMachine{})
q = __skip_one(&s, &p, &types.StateMachine{}, uint64(0))
assert.Equal(t, 8, p)
assert.Equal(t, 6, q)
q = __skip_one(&s, &p, &types.StateMachine{})
q = __skip_one(&s, &p, &types.StateMachine{}, uint64(0))
assert.Equal(t, 21, p)
assert.Equal(t, 9, q)
q = __skip_one(&s, &p, &types.StateMachine{})
q = __skip_one(&s, &p, &types.StateMachine{}, uint64(0))
assert.Equal(t, 26, p)
assert.Equal(t, 22, q)
q = __skip_one(&s, &p, &types.StateMachine{})
q = __skip_one(&s, &p, &types.StateMachine{}, uint64(0))
assert.Equal(t, 32, p)
assert.Equal(t, 27, q)
q = __skip_one(&s, &p, &types.StateMachine{})
q = __skip_one(&s, &p, &types.StateMachine{}, uint64(0))
assert.Equal(t, 37, p)
assert.Equal(t, 33, q)
q = __skip_one(&s, &p, &types.StateMachine{})
q = __skip_one(&s, &p, &types.StateMachine{}, uint64(0))
assert.Equal(t, 40, p)
assert.Equal(t, 38, q)
q = __skip_one(&s, &p, &types.StateMachine{})
q = __skip_one(&s, &p, &types.StateMachine{}, uint64(0))
assert.Equal(t, 43, p)
assert.Equal(t, 41, q)
}
@ -547,7 +565,7 @@ func TestNative_SkipOne_Error(t *testing.T) {
`"asdf`, `"\\\"`,
}) {
p := 0
q := __skip_one(&s, &p, &types.StateMachine{})
q := __skip_one(&s, &p, &types.StateMachine{}, uint64(0))
assert.True(t, q < 0)
}
}
@ -555,14 +573,14 @@ func TestNative_SkipOne_Error(t *testing.T) {
func TestNative_SkipArray(t *testing.T) {
p := 0
s := `null, true, false, 1, 2.0, -3, {"asdf": "wqer"}],`
__skip_array(&s, &p, &types.StateMachine{})
__skip_array(&s, &p, &types.StateMachine{}, uint64(0))
assert.Equal(t, p, 48)
}
func TestNative_SkipObject(t *testing.T) {
p := 0
s := `"asdf": "wqer"},`
__skip_object(&s, &p, &types.StateMachine{})
__skip_object(&s, &p, &types.StateMachine{}, uint64(0))
assert.Equal(t, p, 15)
}

View file

@ -9,45 +9,45 @@ package avx2
func __native_entry__() uintptr
var (
_subr__f64toa = __native_entry__() + 903
_subr__html_escape = __native_entry__() + 10249
_subr__i64toa = __native_entry__() + 3915
_subr__lspace = __native_entry__() + 429
_subr__f64toa = __native_entry__() + 814
_subr__html_escape = __native_entry__() + 10717
_subr__i64toa = __native_entry__() + 3449
_subr__lspace = __native_entry__() + 379
_subr__lzero = __native_entry__() + 13
_subr__quote = __native_entry__() + 5328
_subr__skip_array = __native_entry__() + 21867
_subr__skip_number = __native_entry__() + 25515
_subr__skip_object = __native_entry__() + 21904
_subr__skip_one = __native_entry__() + 19172
_subr__u64toa = __native_entry__() + 4008
_subr__unquote = __native_entry__() + 7794
_subr__validate_one = __native_entry__() + 25632
_subr__value = __native_entry__() + 14495
_subr__vnumber = __native_entry__() + 17330
_subr__vsigned = __native_entry__() + 18644
_subr__vstring = __native_entry__() + 16453
_subr__vunsigned = __native_entry__() + 18903
_subr__quote = __native_entry__() + 4842
_subr__skip_array = __native_entry__() + 22748
_subr__skip_number = __native_entry__() + 24641
_subr__skip_object = __native_entry__() + 22783
_subr__skip_one = __native_entry__() + 20939
_subr__u64toa = __native_entry__() + 3544
_subr__unquote = __native_entry__() + 7467
_subr__validate_one = __native_entry__() + 24758
_subr__value = __native_entry__() + 14548
_subr__vnumber = __native_entry__() + 19052
_subr__vsigned = __native_entry__() + 20357
_subr__vstring = __native_entry__() + 16711
_subr__vunsigned = __native_entry__() + 20637
)
const (
_stack__f64toa = 120
_stack__f64toa = 136
_stack__html_escape = 72
_stack__i64toa = 24
_stack__lspace = 8
_stack__lzero = 8
_stack__quote = 56
_stack__quote = 72
_stack__skip_array = 152
_stack__skip_number = 96
_stack__skip_number = 88
_stack__skip_object = 152
_stack__skip_one = 152
_stack__u64toa = 8
_stack__unquote = 72
_stack__validate_one = 152
_stack__value = 408
_stack__value = 416
_stack__vnumber = 312
_stack__vsigned = 16
_stack__vstring = 112
_stack__vunsigned = 8
_stack__vstring = 136
_stack__vunsigned = 24
)
var (

View file

@ -77,12 +77,12 @@ func HTMLEscape(s unsafe.Pointer, nb int, dp unsafe.Pointer, dn *int) int
//go:nosplit
//go:noescape
//goland:noinspection GoUnusedParameter
func Value(s unsafe.Pointer, n int, p int, v *types.JsonState, allow_control int) int
func Value(s unsafe.Pointer, n int, p int, v *types.JsonState, flags uint64) int
//go:nosplit
//go:noescape
//goland:noinspection GoUnusedParameter
func SkipOne(s *string, p *int, m *types.StateMachine) int
func SkipOne(s *string, p *int, m *types.StateMachine, flags uint64) int
//go:nosplit
//go:noescape

View file

@ -48,7 +48,7 @@ TEXT ·Value(SB), NOSPLIT, $0 - 48
JMP github·combytedancesonicinternalnativeavx2·__value(SB)
JMP github·combytedancesonicinternalnativeavx·__value(SB)
TEXT ·SkipOne(SB), NOSPLIT, $0 - 32
TEXT ·SkipOne(SB), NOSPLIT, $0 - 40
CMPB github·combytedancesonicinternalcpu·HasAVX2(SB), $0
JE 2(PC)
JMP github·combytedancesonicinternalnativeavx2·__skip_one(SB)

View file

@ -65,12 +65,12 @@ func __unquote(sp unsafe.Pointer, nb int, dp unsafe.Pointer, ep *int, flags uint
//go:nosplit
//go:noescape
//goland:noinspection GoUnusedParameter
func __value(s unsafe.Pointer, n int, p int, v *types.JsonState, allow_control int) (ret int)
func __value(s unsafe.Pointer, n int, p int, v *types.JsonState, flags uint64) (ret int)
//go:nosplit
//go:noescape
//goland:noinspection GoUnusedParameter
func __vstring(s *string, p *int, v *types.JsonState)
func __vstring(s *string, p *int, v *types.JsonState, flags uint64)
//go:nosplit
//go:noescape
@ -90,17 +90,17 @@ func __vunsigned(s *string, p *int, v *types.JsonState)
//go:nosplit
//go:noescape
//goland:noinspection GoUnusedParameter
func __skip_one(s *string, p *int, m *types.StateMachine) (ret int)
func __skip_one(s *string, p *int, m *types.StateMachine, flags uint64) (ret int)
//go:nosplit
//go:noescape
//goland:noinspection GoUnusedParameter
func __skip_array(s *string, p *int, m *types.StateMachine) (ret int)
func __skip_array(s *string, p *int, m *types.StateMachine, flags uint64) (ret int)
//go:nosplit
//go:noescape
//goland:noinspection GoUnusedParameter
func __skip_object(s *string, p *int, m *types.StateMachine) (ret int)
func __skip_object(s *string, p *int, m *types.StateMachine, flags uint64) (ret int)
//go:nosplit
//go:noescape

View file

@ -235,21 +235,39 @@ func TestNative_Vstring(t *testing.T) {
var v types.JsonState
i := 0
s := `test"test\n2"`
__vstring(&s, &i, &v)
__vstring(&s, &i, &v, 0)
assert.Equal(t, 5, i)
assert.Equal(t, -1, v.Ep)
assert.Equal(t, int64(0), v.Iv)
__vstring(&s, &i, &v)
__vstring(&s, &i, &v, 0)
assert.Equal(t, 13, i)
assert.Equal(t, 9, v.Ep)
assert.Equal(t, int64(5), v.Iv)
}
// TestNative_Vstring_ValidUnescapedChars checks that, with F_VALIDATE_STRING set,
// __vstring rejects a string containing an unescaped control byte (0x1f) by
// storing -ERR_INVALID_CHAR in v.Vt.
func TestNative_Vstring_ValidUnescapedChars(t *testing.T) {
var v types.JsonState
valid := uint64(types.F_VALIDATE_STRING) // flags word with only the validate bit set
i := 0
s := "test\x1f\"" // string body with a raw 0x1f before the closing quote
__vstring(&s, &i, &v, valid)
assert.Equal(t, -int(types.ERR_INVALID_CHAR), int(v.Vt)) // error is reported as a negated code in Vt
}
// TestNative_Vstring_ValidUtf8 checks that, with F_VALIDATE_STRING set,
// __vstring rejects a string containing the byte 0xff (never valid in UTF-8)
// by storing -ERR_INVALID_CHAR in v.Vt.
func TestNative_Vstring_ValidUtf8(t *testing.T) {
var v types.JsonState
valid := uint64(types.F_VALIDATE_STRING) // flags word with only the validate bit set
i := 0
s := "test\xff\"" // string body with a raw 0xff before the closing quote
__vstring(&s, &i, &v, valid)
assert.Equal(t, -int(types.ERR_INVALID_CHAR), int(v.Vt)) // error is reported as a negated code in Vt
}
func TestNative_VstringEscapeEOF(t *testing.T) {
var v types.JsonState
i := 0
s := `xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\"xxxxxxxxxxxxxxxxxxxxxxxxxxxxx"x`
__vstring(&s, &i, &v)
__vstring(&s, &i, &v, 0)
assert.Equal(t, 95, i)
assert.Equal(t, 63, v.Ep)
assert.Equal(t, int64(0), v.Iv)
@ -310,7 +328,7 @@ func TestNative_VstringHangUpOnRandomData(t *testing.T) {
p := 1
s := rt.Mem2Str(v)
var js types.JsonState
__vstring(&s, &p, &js)
__vstring(&s, &p, &js, 0)
fmt.Printf("js: %s\n", spew.Sdump(js))
}
@ -504,36 +522,36 @@ func TestNative_Vunsigned(t *testing.T) {
func TestNative_SkipOne(t *testing.T) {
p := 0
s := ` {"asdf": [null, true, false, 1, 2.0, -3]}, 1234.5`
q := __skip_one(&s, &p, &types.StateMachine{})
q := __skip_one(&s, &p, &types.StateMachine{}, uint64(0))
assert.Equal(t, 42, p)
assert.Equal(t, 1, q)
p = 0
s = `1 2.5 -3 "asdf\nqwer" true false null {} []`
q = __skip_one(&s, &p, &types.StateMachine{})
q = __skip_one(&s, &p, &types.StateMachine{}, uint64(0))
assert.Equal(t, 1, p)
assert.Equal(t, 0, q)
q = __skip_one(&s, &p, &types.StateMachine{})
q = __skip_one(&s, &p, &types.StateMachine{}, uint64(0))
assert.Equal(t, 5, p)
assert.Equal(t, 2, q)
q = __skip_one(&s, &p, &types.StateMachine{})
q = __skip_one(&s, &p, &types.StateMachine{}, uint64(0))
assert.Equal(t, 8, p)
assert.Equal(t, 6, q)
q = __skip_one(&s, &p, &types.StateMachine{})
q = __skip_one(&s, &p, &types.StateMachine{}, uint64(0))
assert.Equal(t, 21, p)
assert.Equal(t, 9, q)
q = __skip_one(&s, &p, &types.StateMachine{})
q = __skip_one(&s, &p, &types.StateMachine{}, uint64(0))
assert.Equal(t, 26, p)
assert.Equal(t, 22, q)
q = __skip_one(&s, &p, &types.StateMachine{})
q = __skip_one(&s, &p, &types.StateMachine{}, uint64(0))
assert.Equal(t, 32, p)
assert.Equal(t, 27, q)
q = __skip_one(&s, &p, &types.StateMachine{})
q = __skip_one(&s, &p, &types.StateMachine{}, uint64(0))
assert.Equal(t, 37, p)
assert.Equal(t, 33, q)
q = __skip_one(&s, &p, &types.StateMachine{})
q = __skip_one(&s, &p, &types.StateMachine{}, uint64(0))
assert.Equal(t, 40, p)
assert.Equal(t, 38, q)
q = __skip_one(&s, &p, &types.StateMachine{})
q = __skip_one(&s, &p, &types.StateMachine{}, uint64(0))
assert.Equal(t, 43, p)
assert.Equal(t, 41, q)
}
@ -545,7 +563,7 @@ func TestNative_SkipOne_Error(t *testing.T) {
`"asdf`, `"\\\"`,
}) {
p := 0
q := __skip_one(&s, &p, &types.StateMachine{})
q := __skip_one(&s, &p, &types.StateMachine{}, uint64(0))
assert.True(t, q < 0)
}
}
@ -553,14 +571,14 @@ func TestNative_SkipOne_Error(t *testing.T) {
func TestNative_SkipArray(t *testing.T) {
p := 0
s := `null, true, false, 1, 2.0, -3, {"asdf": "wqer"}],`
__skip_array(&s, &p, &types.StateMachine{})
__skip_array(&s, &p, &types.StateMachine{}, uint64(0))
assert.Equal(t, p, 48)
}
func TestNative_SkipObject(t *testing.T) {
p := 0
s := `"asdf": "wqer"},`
__skip_object(&s, &p, &types.StateMachine{})
__skip_object(&s, &p, &types.StateMachine{}, uint64(0))
assert.Equal(t, p, 15)
}

View file

@ -45,11 +45,13 @@ const (
const (
B_DOUBLE_UNQUOTE = 0
B_UNICODE_REPLACE = 1
B_VALIDATE_STRING = 5
)
const (
F_DOUBLE_UNQUOTE = 1 << B_DOUBLE_UNQUOTE
F_UNICODE_REPLACE = 1 << B_UNICODE_REPLACE
F_VALIDATE_STRING = 1 << B_VALIDATE_STRING
)
const (

21
licenses/LICENSE-yyjson Normal file
View file

@ -0,0 +1,21 @@
MIT License
Copyright (c) 2020 YaoYuan <ibireme@gmail.com>
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View file

@ -13,7 +13,6 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "native.h"
#include "fastbytes.c"
#include "fastfloat.c"
@ -21,5 +20,4 @@
#include "parsing.c"
#include "atof_eisel_lemire.c"
#include "atof_native.c"
#include "scanning.c"
#include "utf8.c"
#include "scanning.c"

View file

@ -109,22 +109,22 @@ ssize_t unquote(const char *sp, ssize_t nb, char *dp, ssize_t *ep, uint64_t flag
ssize_t html_escape(const char *sp, ssize_t nb, char *dp, ssize_t *dn);
long value(const char *s, size_t n, long p, JsonState *ret, uint64_t flags);
void vstring(const GoString *src, long *p, JsonState *ret);
void vstring(const GoString *src, long *p, JsonState *ret, uint64_t flags);
void vnumber(const GoString *src, long *p, JsonState *ret);
void vsigned(const GoString *src, long *p, JsonState *ret);
void vunsigned(const GoString *src, long *p, JsonState *ret);
long skip_one(const GoString *src, long *p, StateMachine *m);
long skip_array(const GoString *src, long *p, StateMachine *m);
long skip_object(const GoString *src, long *p, StateMachine *m);
long skip_one(const GoString *src, long *p, StateMachine *m, uint64_t flags);
long skip_array(const GoString *src, long *p, StateMachine *m, uint64_t flags);
long skip_object(const GoString *src, long *p, StateMachine *m, uint64_t flags);
long skip_string(const GoString *src, long *p);
long skip_string(const GoString *src, long *p, uint64_t flags);
long skip_negative(const GoString *src, long *p);
long skip_positive(const GoString *src, long *p);
long skip_number(const GoString *src, long *p);
bool atof_eisel_lemire64(uint64_t mant, int exp10, int sgn, double *val);
double atof_native(const char *sp, ssize_t nb, char* dbuf, ssize_t cap);
double atof_native(const char *sp, ssize_t nb, char *dbuf, ssize_t cap);
ssize_t utf8_validate(const char *sp, ssize_t nb);
long validate_string(const GoString *src, long *p);

View file

@ -15,13 +15,16 @@
*/
#include "native.h"
static const char *CS_ARRAY = "[]{},\"[]{},\"[]{}";
static const char *CS_OBJECT = "[]{},:\"[]{}:,\"[]";
#include "utf8.h"
#include "test/xprintf.h"
static const uint64_t ODD_MASK = 0xaaaaaaaaaaaaaaaa;
static const uint64_t EVEN_MASK = 0x5555555555555555;
// NOTE: mask referenced from decoder/decoder.go
static const uint64_t MASK_VALIDATE_STRING = 1ull << 5;
static const uint64_t MASK_ALLOW_CONTROL = 1ull << 31;
static const double P10_TAB[23] = {
/* <= the connvertion to double is not exact when less than 1 => */ 1e-000,
1e+001, 1e+002, 1e+003, 1e+004, 1e+005, 1e+006, 1e+007, 1e+008, 1e+009, 1e+010,
@ -110,7 +113,7 @@ static inline int64_t advance_dword(const GoString *src, long *p, long dec, int6
}
}
static inline ssize_t advance_string(const GoString *src, long p, int64_t *ep) {
static inline ssize_t advance_string_default(const GoString *src, long p, int64_t *ep) {
char ch;
uint64_t es;
uint64_t fe;
@ -228,7 +231,6 @@ static inline ssize_t advance_string(const GoString *src, long p, int64_t *ep) {
m0 = ((uint64_t)s3 << 48) | ((uint64_t)s2 << 32) | ((uint64_t)s1 << 16) | (uint64_t)s0;
m1 = ((uint64_t)t3 << 48) | ((uint64_t)t2 << 32) | ((uint64_t)t1 << 16) | (uint64_t)t0;
#endif
/** update first quote position */
if (unlikely(m1 != 0)) {
ep_setx(sp - ss + __builtin_ctzll(m1))
@ -238,7 +240,7 @@ static inline ssize_t advance_string(const GoString *src, long p, int64_t *ep) {
if (unlikely(m1 != 0 || cr != 0)) {
m0_mask(add64)
}
/* check for end quote */
if (m0 != 0) {
return sp - ss + __builtin_ctzll(m0) + 1;
@ -273,7 +275,7 @@ static inline ssize_t advance_string(const GoString *src, long p, int64_t *ep) {
m0 = ((uint64_t)s1 << 16) | (uint64_t)s0;
m1 = ((uint64_t)t1 << 16) | (uint64_t)t0;
#endif
/** update first quote position */
if (unlikely(m1 != 0)) {
ep_setx(sp - ss + __builtin_ctzll(m1))
@ -283,7 +285,7 @@ static inline ssize_t advance_string(const GoString *src, long p, int64_t *ep) {
if (unlikely(m1 != 0 || cr != 0)) {
m0_mask(add32)
}
/* check for end quote */
if (m0 != 0) {
return sp - ss + __builtin_ctzll(m0) + 1;
@ -329,17 +331,6 @@ static inline ssize_t advance_string(const GoString *src, long p, int64_t *ep) {
}
}
/* Returns a 16-bit mask whose bit i is set when byte i of `v` equals byte i of `t`. */
static inline int _mm_get_mask(__m128i v, __m128i t) {
    __m128i equal_lanes = _mm_cmpeq_epi8(v, t);
    return _mm_movemask_epi8(equal_lanes);
}
// control chars: 0x00 ~ 0x1F
// Returns a 16-bit mask whose bit i is set when byte i of `v` lies in
// 0x00..0x1F. Bytes are compared as signed values, so 0x80..0xFF never match.
static inline int _mm_cchars_mask(__m128i v) {
    __m128i non_negative = _mm_cmpgt_epi8(v, _mm_set1_epi8(-1));
    __m128i above_ctrl   = _mm_cmpgt_epi8(v, _mm_set1_epi8(31));
    /* (NOT above_ctrl) AND non_negative  =>  0 <= byte <= 31 */
    __m128i is_ctrl      = _mm_andnot_si128(above_ctrl, non_negative);
    return _mm_movemask_epi8(is_ctrl);
}
#if USE_AVX2
static inline int _mm256_get_mask(__m256i v, __m256i t) {
@ -353,33 +344,52 @@ static inline int _mm256_cchars_mask(__m256i v) {
return _mm256_movemask_epi8 (_mm256_andnot_si256 (e2, e1));
}
// ascii: 0x00 ~ 0x7F
static inline int _mm256_nonascii_mask(__m256i v) {
return _mm256_movemask_epi8(v);
}
#endif
static inline ssize_t advance_validate_string(const GoString *src, long p, int64_t *ep) {
/* Returns a 16-bit mask whose bit i is set when byte i of `v` equals byte i of `t`. */
static inline int _mm_get_mask(__m128i v, __m128i t) {
    __m128i equal_lanes = _mm_cmpeq_epi8(v, t);
    return _mm_movemask_epi8(equal_lanes);
}
// control chars: 0x00 ~ 0x1F
// Returns a 16-bit mask whose bit i is set when byte i of `v` lies in
// 0x00..0x1F. Bytes are compared as signed values, so 0x80..0xFF never match.
static inline int _mm_cchars_mask(__m128i v) {
    __m128i non_negative = _mm_cmpgt_epi8(v, _mm_set1_epi8(-1));
    __m128i above_ctrl   = _mm_cmpgt_epi8(v, _mm_set1_epi8(31));
    /* (NOT above_ctrl) AND non_negative  =>  0 <= byte <= 31 */
    __m128i is_ctrl      = _mm_andnot_si128(above_ctrl, non_negative);
    return _mm_movemask_epi8(is_ctrl);
}
// ascii is 0x00 ~ 0x7F; this reports the bytes that are NOT ascii.
// Returns a 16-bit mask whose bit i is the top bit of byte i of `v`,
// i.e. bit i is set exactly when byte i is >= 0x80.
static inline int _mm_nonascii_mask(__m128i v) {
    int high_bits = _mm_movemask_epi8(v);
    return high_bits;
}
static inline ssize_t advance_string_validate(const GoString *src, long p, int64_t *ep) {
char ch;
uint64_t es;
uint64_t fe;
uint64_t os;
uint64_t m0;
uint64_t m1;
uint64_t m2;
uint64_t m0, m1, m2, m3;
uint64_t es, fe, os;
uint64_t cr = 0;
long qp = 0;
long np = 0;
/* prevent out-of-bounds accessing */
if (unlikely(src->len == p)) {
return -ERR_EOF;
}
long up = 0;
/* buffer pointers */
size_t nb = src->len;
const char * sp = src->buf;
const char * ss = src->buf;
#define ep_init() *ep = -1;
#define ep_setc() ep_setx(sp - ss - 1)
#define ep_setx(x) if (*ep == -1) { *ep = (x); }
/* prevent out-of-bounds accessing */
if (unlikely(nb == p)) {
return -ERR_EOF;
}
#define ep_init() *ep = -1;
#define ep_setc() ep_setx(sp - ss - 1)
#define ep_setx(x) if (*ep == -1) { *ep = (x); }
#define ep_seterr(x) *ep = (x);
/* seek to `p` */
nb -= p;
@ -397,6 +407,7 @@ static inline ssize_t advance_validate_string(const GoString *src, long p, int64
uint32_t s0, s1;
uint32_t t0, t1;
uint32_t c0, c1;
uint32_t u0, u1;
#else
/* initialize vectors */
__m128i v0;
@ -410,6 +421,7 @@ static inline ssize_t advance_validate_string(const GoString *src, long p, int64
uint32_t s0, s1, s2, s3;
uint32_t t0, t1, t2, t3;
uint32_t c0, c1, c2, c3;
uint32_t u0, u1, u2, u3;
#endif
#define m0_mask(add) \
@ -419,6 +431,7 @@ static inline ssize_t advance_validate_string(const GoString *src, long p, int64
es = add(os, m1, &cr) << 1; \
m0 &= ~(fe & (es ^ EVEN_MASK));
simd_advance:
/* 64-byte SIMD loop */
while (likely(nb >= 64)) {
#if USE_AVX2
@ -430,9 +443,12 @@ static inline ssize_t advance_validate_string(const GoString *src, long p, int64
t1 = _mm256_get_mask(v1, cx);
c0 = _mm256_cchars_mask(v0);
c1 = _mm256_cchars_mask(v1);
u0 = _mm256_nonascii_mask(v0);
u1 = _mm256_nonascii_mask(v1);
m0 = ((uint64_t)s1 << 32) | (uint64_t)s0;
m1 = ((uint64_t)t1 << 32) | (uint64_t)t0;
m2 = ((uint64_t)c1 << 32) | (uint64_t)c0;
m3 = ((uint64_t)u1 << 32) | (uint64_t)u0;
#else
v0 = _mm_loadu_si128 ((const void *)(sp + 0));
v1 = _mm_loadu_si128 ((const void *)(sp + 16));
@ -450,12 +466,17 @@ static inline ssize_t advance_validate_string(const GoString *src, long p, int64
c1 = _mm_cchars_mask(v1);
c2 = _mm_cchars_mask(v2);
c3 = _mm_cchars_mask(v3);
u0 = _mm_nonascii_mask(v0);
u1 = _mm_nonascii_mask(v1);
u2 = _mm_nonascii_mask(v2);
u3 = _mm_nonascii_mask(v3);
m0 = ((uint64_t)s3 << 48) | ((uint64_t)s2 << 32) | ((uint64_t)s1 << 16) | (uint64_t)s0;
m1 = ((uint64_t)t3 << 48) | ((uint64_t)t2 << 32) | ((uint64_t)t1 << 16) | (uint64_t)t0;
m2 = ((uint64_t)c3 << 48) | ((uint64_t)c2 << 32) | ((uint64_t)c1 << 16) | (uint64_t)c0;
m3 = ((uint64_t)u3 << 48) | ((uint64_t)u2 << 32) | ((uint64_t)u1 << 16) | (uint64_t)u0;
#endif
/** update first quote position */
if (unlikely(m1 != 0)) {
ep_setx(sp - ss + __builtin_ctzll(m1))
@ -466,23 +487,35 @@ static inline ssize_t advance_validate_string(const GoString *src, long p, int64
m0_mask(add64)
}
qp = m0 ? __builtin_ctzll(m0) : 64;
np = m2 ? __builtin_ctzll(m2) : 64;
up = m3 ? __builtin_ctzll(m3) : 64;
/* get the position of end quote */
if (m0 != 0) {
qp = sp - ss + __builtin_ctzll(m0) + 1;
/* check control chars in JSON string */
if (unlikely(m2 !=0 && (np = sp - ss + __builtin_ctzll(m2)) < qp)) {
ep_setx(np) // set error position
if (unlikely(np < qp)) {
ep_seterr(sp - ss + np)
return -ERR_INVAL;
}
return qp;
if (up < qp) {
goto valid_utf8;
}
return sp - ss + qp + 1;
}
/* check control chars in JSON string */
if (unlikely(m2 != 0)) {
ep_setx(sp - ss + __builtin_ctzll(m2))
ep_setx(sp - ss + np)
return -ERR_INVAL;
}
if (unlikely(m3 != 0)) {
goto valid_utf8;
}
/* move to the next block */
sp += 64;
nb -= 64;
@ -495,9 +528,11 @@ static inline ssize_t advance_validate_string(const GoString *src, long p, int64
s0 = _mm256_get_mask (v0, cq);
t0 = _mm256_get_mask (v0, cx);
c0 = _mm256_cchars_mask(v0);
u0 = _mm256_nonascii_mask(v0);
m0 = (uint64_t)s0;
m1 = (uint64_t)t0;
m2 = (uint64_t)c0;
m3 = (uint64_t)u0;
#else
v0 = _mm_loadu_si128 ((const void *)(sp + 0));
v1 = _mm_loadu_si128 ((const void *)(sp + 16));
@ -507,11 +542,14 @@ static inline ssize_t advance_validate_string(const GoString *src, long p, int64
t1 = _mm_get_mask(v1, cx);
c0 = _mm_cchars_mask(v0);
c1 = _mm_cchars_mask(v1);
u0 = _mm_nonascii_mask(v0);
u1 = _mm_nonascii_mask(v1);
m0 = ((uint64_t)s1 << 16) | (uint64_t)s0;
m1 = ((uint64_t)t1 << 16) | (uint64_t)t0;
m2 = ((uint64_t)c1 << 16) | (uint64_t)c0;
m3 = ((uint64_t)u1 << 16) | (uint64_t)u0;
#endif
/** update first quote position */
if (unlikely(m1 != 0)) {
ep_setx(sp - ss + __builtin_ctzll(m1))
@ -521,24 +559,34 @@ static inline ssize_t advance_validate_string(const GoString *src, long p, int64
if (unlikely(m1 != 0 || cr != 0)) {
m0_mask(add32)
}
qp = m0 ? __builtin_ctzll(m0) : 64;
up = m3 ? __builtin_ctzll(m3) : 64;
np = m2 ? __builtin_ctzll(m2) : 64;
/* get the position of end quote */
if (m0 != 0) {
qp = sp - ss + __builtin_ctzll(m0) + 1;
/* check control chars in JSON string */
if (unlikely(m2 !=0 && (np = sp - ss + __builtin_ctzll(m2)) < qp)) {
ep_setx(np) // set error position
if (unlikely(np < qp)) {
ep_seterr(sp - ss + np)
return -ERR_INVAL;
}
return qp;
if (up < qp) {
goto valid_utf8;
}
return sp - ss + qp + 1;
}
/* check control chars in JSON string */
if (unlikely(m2 != 0)) {
ep_setx(sp - ss + __builtin_ctzll(m2))
ep_seterr(sp - ss + __builtin_ctzll(m2))
return -ERR_INVAL;
}
if (m3 != 0) {
goto valid_utf8;
}
/* move to the next block */
sp += 32;
nb -= 32;
@ -554,43 +602,89 @@ static inline ssize_t advance_validate_string(const GoString *src, long p, int64
}
}
remain:
/* handle the remaining bytes with scalar code */
while (nb-- > 0 && (ch = *sp++) != '"') {
while (nb > 0) {
ch = *sp;
if (ch == '"') {
return sp - ss + 1;
}
/* valid the escaped chars */
if (unlikely(ch == '\\')) {
if (nb == 0) {
if (nb == 1) {
return -ERR_EOF;
} else {
ep_setc()
sp++, nb--;
}
} else if (unlikely( ch >= 0 && ch <= 0x1f)) { // control chars
ep_setc()
ep_setx(sp - ss)
sp += 2, nb -= 2;
continue;
}
/* valid unescaped chars */
if (unlikely( ch >= 0 && ch <= 0x1f)) { // control chars
ep_seterr(sp - ss)
return -ERR_INVAL;
}
/* valid utf8 chars */
if (ch & 0x80) {
uint32_t ubin = nb >= 4 ? *(uint32_t*)sp : less4byte_to_uint32(sp, nb);
if ((up = valid_utf8_4byte(ubin))) {
sp += up, nb -= up;
continue;
}
ep_seterr(sp - ss)
return -ERR_INVAL;
}
sp++, nb--;
}
return -ERR_EOF;
valid_utf8:
sp += up, nb -= up;
while (likely(nb >= 4)) {
up = valid_utf8_4byte(*(uint32_t*)sp);
if (unlikely(up == 0)) {
ep_seterr(sp - ss)
return -ERR_INVAL;
}
/* check continous utf-8 */
sp += up, nb -= up;
if (nb > 0 && (*(uint8_t*)sp & 0x80)) {
continue;
}
/* clear the last carried bit */
cr = 0;
goto simd_advance;
}
goto remain;
#undef ep_init
#undef ep_setc
#undef ep_setx
#undef ep_seterr
#undef m0_mask
}
/* check for quotes */
if (ch == '"') {
return sp - ss;
static inline ssize_t advance_string(const GoString *src, long p, int64_t *ep, uint64_t flags) {
if ((flags & MASK_VALIDATE_STRING) != 0) {
return advance_string_validate(src, p, ep);
} else {
return -ERR_EOF;
return advance_string_default(src, p, ep);
}
}
/** Value Scanning Routines **/
const uint64_t MASK_ALLOW_CONTROL = 1ul<<31;
long value(const char *s, size_t n, long p, JsonState *ret, uint64_t flags) {
long q = p;
GoString m = {.buf = s, .len = n};
bool allow_control = (flags&MASK_ALLOW_CONTROL) != 0;
bool allow_control = (flags & MASK_ALLOW_CONTROL) != 0;
/* parse the next identifier, q is UNSAFE, may cause out-of-bounds accessing */
switch (advance_ns(&m, &q)) {
case '-' : /* fallthrough */
@ -604,7 +698,7 @@ long value(const char *s, size_t n, long p, JsonState *ret, uint64_t flags) {
case '7' : /* fallthrough */
case '8' : /* fallthrough */
case '9' : vdigits(&m, &q, ret, flags) ; return q;
case '"' : vstring(&m, &q, ret) ; return q;
case '"' : vstring(&m, &q, ret, flags) ; return q;
case 'n' : ret->vt = advance_dword(&m, &q, 1, V_NULL, VS_NULL) ; return q;
case 't' : ret->vt = advance_dword(&m, &q, 1, V_TRUE, VS_TRUE) ; return q;
case 'f' : ret->vt = advance_dword(&m, &q, 0, V_FALSE, VS_ALSE) ; return q;
@ -619,11 +713,11 @@ long value(const char *s, size_t n, long p, JsonState *ret, uint64_t flags) {
}
}
void vstring(const GoString *src, long *p, JsonState *ret) {
void vstring(const GoString *src, long *p, JsonState *ret, uint64_t flags) {
int64_t v = -1;
int64_t i = *p;
ssize_t e = advance_string(src, i, &v);
ssize_t e = advance_string(src, i, &v, flags);
/* check for errors */
if (e < 0) {
*p = src->len;
@ -992,10 +1086,7 @@ static inline long fsm_push(StateMachine *self, int vt) {
}
}
#define VALID_DEFAULT 0 // basic validate, except JSON string.
#define VALID_FULL 1 // also validate JSON string, including control chars or invalid UTF-8.
static inline long fsm_exec(StateMachine *self, const GoString *src, long *p, int validate_flag) {
static inline long fsm_exec(StateMachine *self, const GoString *src, long *p, uint64_t flags) {
int vt;
char ch;
long vi = -1;
@ -1042,7 +1133,7 @@ static inline long fsm_exec(StateMachine *self, const GoString *src, long *p, in
case FSM_KEY: {
FSM_CHAR('"');
FSM_REPL(self, FSM_ELEM);
FSM_XERR(skip_string(src, p));
FSM_XERR(skip_string(src, p, flags));
continue;
}
@ -1080,11 +1171,7 @@ static inline long fsm_exec(StateMachine *self, const GoString *src, long *p, in
/* the quote of the first key */
case '"': {
FSM_REPL(self, FSM_OBJ);
if (validate_flag == VALID_DEFAULT) {
FSM_XERR(skip_string(src, p));
} else if (validate_flag == VALID_FULL) {
FSM_XERR(validate_string(src, p));
}
FSM_XERR(skip_string(src, p, flags));
FSM_XERR(fsm_push(self, FSM_ELEM));
continue;
}
@ -1110,14 +1197,8 @@ static inline long fsm_exec(StateMachine *self, const GoString *src, long *p, in
case 'f' : FSM_XERR(advance_dword(src, p, 0, *p - 1, VS_ALSE)); break;
case '[' : FSM_XERR(fsm_push(self, FSM_ARR_0)); break;
case '{' : FSM_XERR(fsm_push(self, FSM_OBJ_0)); break;
case '"' : {
if (validate_flag == VALID_DEFAULT) {
FSM_XERR(skip_string(src, p));
} else if (validate_flag == VALID_FULL) {
FSM_XERR(validate_string(src, p));
}
break;
}
case '"' : FSM_XERR(skip_string(src, p, flags)); break;
case 0 : return -ERR_EOF;
default : return -ERR_INVAL;
}
}
@ -1343,58 +1424,36 @@ check_index:
#undef check_sidx
#undef check_vidx
long skip_one(const GoString *src, long *p, StateMachine *m) {
long skip_one(const GoString *src, long *p, StateMachine *m, uint64_t flags) {
fsm_init(m, FSM_VAL);
return fsm_exec(m, src, p, VALID_DEFAULT);
return fsm_exec(m, src, p, flags);
}
long skip_array(const GoString *src, long *p, StateMachine *m) {
long skip_array(const GoString *src, long *p, StateMachine *m, uint64_t flags) {
fsm_init(m, FSM_ARR_0);
return fsm_exec(m, src, p, VALID_DEFAULT);
return fsm_exec(m, src, p, flags);
}
long skip_object(const GoString *src, long *p, StateMachine *m) {
long skip_object(const GoString *src, long *p, StateMachine *m, uint64_t flags) {
fsm_init(m, FSM_OBJ_0);
return fsm_exec(m, src, p, VALID_DEFAULT);
return fsm_exec(m, src, p, flags);
}
long skip_string(const GoString *src, long *p) {
long skip_string(const GoString *src, long *p, uint64_t flags) {
int64_t v;
ssize_t q = *p - 1;
ssize_t e = advance_string(src, *p, &v);
ssize_t e = advance_string(src, *p, &v, flags);
/* check for errors, and update the position */
if (e >= 0) {
*p = e;
return q;
} else {
*p = src->len;
*p = v;
return e;
}
}
/*
 * validate_string scans the JSON string whose body starts at *p (just after
 * the opening quote) and additionally checks that the body is valid UTF-8.
 *
 * On success *p is moved to the value returned by advance_validate_string
 * and the function returns the original *p - 1. On failure a negative error
 * code is returned and *p is set to src->len for -ERR_EOF, to the position
 * reported by the scanner for other scan errors, or to the offset of the
 * first invalid UTF-8 byte.
 */
long validate_string(const GoString *src, long *p) {
    int64_t    err_pos;
    const long start = *p;
    ssize_t    end   = advance_validate_string(src, start, &err_pos);

    /* the scanner itself failed */
    if (end < 0) {
        if (end == -ERR_EOF) {
            *p = src->len;
        } else {
            *p = err_pos;
        }
        return end;
    }

    /* the scan succeeded, now validate the bytes between the two quotes */
    ssize_t body_len = end - start - 1;
    ssize_t bad      = utf8_validate(src->buf + start, body_len);
    if (bad >= 0) {
        *p = start + bad;
        return -ERR_INVAL;
    }

    *p = end;
    return start - 1;
}
long skip_negative(const GoString *src, long *p) {
long i = *p;
long r = do_skip_number(src->buf + i, src->len - i);
@ -1456,5 +1515,5 @@ long skip_number(const GoString *src, long *p) {
long validate_one(const GoString *src, long *p, StateMachine *m) {
fsm_init(m, FSM_VAL);
return fsm_exec(m, src, p, VALID_FULL);
return fsm_exec(m, src, p, MASK_VALIDATE_STRING);
}

204
native/test/xprintf.h Normal file
View file

@ -0,0 +1,204 @@
/*
* Copyright 2022 ByteDance Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef XPRINTF_H
#define XPRINTF_H
#include <sys/types.h>
/*
 * write_syscall issues a raw `syscall` instruction to write `n` bytes starting
 * at `s` to file descriptor 1. The function is `naked`, so the body below is
 * the entire function: no prologue/epilogue is generated and the asm returns
 * with its own `retq`.
 *
 * Register shuffle performed by the asm:
 *   %rsi -> %rdx   (byte count becomes the 3rd syscall argument)
 *   %rdi -> %rsi   (buffer pointer becomes the 2nd syscall argument)
 *   $1   -> %rdi   (file descriptor 1)
 *   $0x02000004 -> %rax (syscall number)
 *
 * assumes `s` arrives in %rdi and `n` in %rsi (System V AMD64 calling
 * convention) — TODO confirm for every toolchain this header is built with.
 * NOTE(review): 0x02000004 looks like the macOS encoding of write(2)
 * (BSD class 0x2000000 | 4); Linux x86-64 uses 1 for write, so this debug
 * helper presumably only works when the code runs on macOS — confirm.
 */
static void __attribute__((naked)) write_syscall(const char *s, size_t n)
{
asm volatile(
"movq %rsi, %rdx"
"\n"
"movq %rdi, %rsi"
"\n"
"movq $1, %rdi"
"\n"
"movq $0x02000004, %rax"
"\n"
"syscall"
"\n"
"retq"
"\n");
}
/* printch writes the single character `ch` through write_syscall. */
static void printch(const char ch)
{
    const char *one = &ch;
    write_syscall(one, sizeof(ch));
}
/*
 * printstr writes the NUL-terminated string `s` (terminator excluded) through
 * write_syscall. The length is counted by hand, no libc call is made.
 */
static void printstr(const char *s)
{
    size_t len = 0;
    for (const char *cur = s; *cur != 0; cur++)
    {
        len++;
    }
    write_syscall(s, len);
}
/*
 * printint prints `v` in decimal, with a leading '-' for negative values.
 *
 * The digits are produced right-to-left into a 32-byte scratch buffer whose
 * last byte stays 0 and acts as the string terminator (an int64_t needs at
 * most 19 digits plus a sign).
 */
static void printint(int64_t v)
{
    char buf[32] = {};
    char *p = &buf[31];
    uint64_t mag;

    if (v == 0)
    {
        printch('0');
        return;
    }

    /* Take the magnitude in the unsigned domain: `-v` overflows (undefined
     * behaviour) when v == INT64_MIN, while 0 - (uint64_t)v is well defined. */
    mag = (v < 0) ? (uint64_t)0 - (uint64_t)v : (uint64_t)v;

    while (mag)
    {
        *--p = (char)(mag % 10) + '0';
        mag /= 10;
    }
    if (v < 0)
    {
        *--p = '-';
    }
    printstr(p);
}
static const char tab[] = "0123456789abcdef";
/*
 * printhex prints `v` in lowercase hexadecimal without a prefix or leading
 * zeros; zero is printed as a single '0'. Digits are written right-to-left
 * into a scratch buffer whose last byte stays 0 as the terminator.
 */
static void printhex(uintptr_t v)
{
    if (v == 0)
    {
        printch('0');
        return;
    }
    char digits[32] = {};
    char *cur = &digits[31];
    for (; v != 0; v >>= 4)
    {
        cur--;
        *cur = tab[v & 0x0f];
    }
    printstr(cur);
}
#define MAX_BUF_LEN 100
/*
 * printbytes prints the bytes of `s` as comma-separated two-digit hex values
 * enclosed in square brackets. When the slice is longer than MAX_BUF_LEN only
 * its last MAX_BUF_LEN bytes are printed.
 */
static void printbytes(GoSlice *s)
{
    int i = 0;
    printch('[');
    if (s->len > MAX_BUF_LEN)
    {
        /* skip everything but the tail */
        i = s->len - MAX_BUF_LEN;
    }
    while (i < s->len)
    {
        int byte = s->buf[i];
        printch(tab[(byte & 0xf0) >> 4]);
        printch(tab[byte & 0x0f]);
        if (i != s->len - 1)
        {
            printch(',');
        }
        i++;
    }
    printch(']');
}
/*
 * printgostr prints the contents of `s` between double quotes. Strings of
 * MAX_BUF_LEN bytes or more are truncated to their last MAX_BUF_LEN bytes.
 */
static void printgostr(GoString *s)
{
    printch('"');
    if (s->len >= MAX_BUF_LEN)
    {
        /* long string: only the tail is shown */
        write_syscall(&s->buf[s->len - MAX_BUF_LEN], MAX_BUF_LEN);
    }
    else
    {
        write_syscall(s->buf, s->len);
    }
    printch('"');
}
/*
 * xprintf is a tiny printf-like debug printer. It compiles to an empty
 * function unless DEBUG is defined.
 *
 * Supported conversions:
 *   %%  literal '%'
 *   %s  GoString *   (printed quoted, see printgostr)
 *   %d  int64_t
 *   %f  double       (converted to an integer and printed with printint)
 *   %c  character    (passed as int, as every variadic char is)
 *   %x  uintptr_t    (hexadecimal)
 *   %l  GoSlice *    (hex byte dump, see printbytes)
 * Any other conversion character is consumed and ignored; a '%' at the very
 * end of the format string ends the output.
 */
static void xprintf(const char *fmt, ...)
{
#ifdef DEBUG
    __builtin_va_list va;
    char buf[256] = {};
    char *p = buf;
    /* last writable slot; buf[255] is reserved for the terminator */
    char *const last = &buf[sizeof(buf) - 1];
    __builtin_va_start(va, fmt);
    while (*fmt != 0)
    {
        if (*fmt != '%')
        {
            /* flush before the literal run would overflow the scratch buffer */
            if (p == last)
            {
                *p = 0;
                printstr(buf);
                p = buf;
            }
            *p++ = *fmt++;
            continue;
        }
        /* flush the literal text collected so far, then handle the conversion */
        *p = 0;
        p = buf;
        fmt++;
        printstr(buf);
        /* dangling '%': stop here instead of stepping past the terminator */
        if (*fmt == 0)
        {
            break;
        }
        switch (*fmt++)
        {
        case '%':
        {
            printch('%');
            break;
        }
        case 's':
        {
            printgostr(__builtin_va_arg(va, GoString *));
            break;
        }
        case 'd':
        {
            printint(__builtin_va_arg(va, int64_t));
            break;
        }
        case 'f':
        {
            printint((int64_t)__builtin_va_arg(va, double));
            break;
        }
        case 'c':
        {
            /* char arguments are promoted to int when passed through `...`;
             * reading them back as `char` is undefined behaviour */
            printch((char)__builtin_va_arg(va, int));
            break;
        }
        case 'x':
        {
            printhex(__builtin_va_arg(va, uintptr_t));
            break;
        }
        case 'l':
        {
            printbytes(__builtin_va_arg(va, GoSlice *));
            break;
        }
        }
    }
    __builtin_va_end(va);
    if (p != buf)
    {
        *p = 0;
        printstr(buf);
    }
#endif
}
#endif // XPRINTF_H

View file

@ -1,183 +0,0 @@
/*
* Copyright (c) 2009 The Go Authors. All rights reserved.
* Modifications Copyright 2021 ByteDance Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "native.h"
// ascii is 0x00 ~ 0x7F.
// Returns a 16-bit mask built from the top bit of every byte of `vv`, so the
// result is zero exactly when all 16 bytes are ascii.
static inline int _mm_ascii_mask(__m128i vv) {
    int high_bits = _mm_movemask_epi8(vv);
    return high_bits;
}
#if USE_AVX2
// ascii: 0x00 ~ 0x7F
static inline int _mm256_ascii_mask(__m256i vv) {
return _mm256_movemask_epi8(vv);
}
#endif
/* is_ascii reports whether `ch` is in 0x00..0x7F, i.e. its top bit is clear. */
static inline bool is_ascii(uint8_t ch) {
    return (ch & 0x80) == 0;
}
// The default lowest and highest continuation byte.
const static uint8_t locb = 0x80;
const static uint8_t hicb = 0xBF;
const static uint8_t xx = 0xF1; // invalid: size 1
const static uint8_t as = 0xF0; // ASCII: size 1
const static uint8_t s1 = 0x02; // accept 0, size 2
const static uint8_t s2 = 0x13; // accept 1, size 3
const static uint8_t s3 = 0x03; // accept 0, size 3
const static uint8_t s4 = 0x23; // accept 2, size 3
const static uint8_t s5 = 0x34; // accept 3, size 4
const static uint8_t s6 = 0x04; // accept 0, size 4
const static uint8_t s7 = 0x44; // accept 4, size 4
// first is information about the first byte in a UTF-8 sequence.
static const uint8_t first[256] = {
// 1 2 3 4 5 6 7 8 9 A B C D E F
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x00-0x0F
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x10-0x1F
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x20-0x2F
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x30-0x3F
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x40-0x4F
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x50-0x5F
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x60-0x6F
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x70-0x7F
// 1 2 3 4 5 6 7 8 9 A B C D E F
xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x80-0x8F
xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x90-0x9F
xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xA0-0xAF
xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xB0-0xBF
xx, xx, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xC0-0xCF
s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xD0-0xDF
s2, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s4, s3, s3, // 0xE0-0xEF
s5, s6, s6, s6, s7, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xF0-0xFF
};
// AcceptRange gives the range of valid values for the second byte in a UTF-8
// sequence.
struct AcceptRange {
uint8_t lo; // lowest value for second byte.
uint8_t hi; // highest value for second byte.
};
// ranges is indexed with the high nibble of a first[] entry. That nibble is
// 0xF for the `xx` (0xF1) and `as` (0xF0) markers, so the table needs 16
// entries to keep every possible index in bounds. Only entries 0-4 carry
// meaningful ranges; the remaining ones are zero-initialized and never
// consulted for an accepted sequence.
const static struct AcceptRange ranges[16] = {
    {locb, hicb}, // 0
    {0xA0, hicb}, // 1
    {locb, 0x9F}, // 2
    {0x90, hicb}, // 3
    {locb, 0x8F}, // 4
};
// UTF-8 code point | first byte | second byte | third byte | fourth byte
// U+0000 - U+007F | 0___ ____
// U+0080 - U+07FF | 110_ ____ | 10__ ____
// U+0800 - U+D7FF | 1110 ____ | 10__ ____ | 10__ ____
// U+D800 - U+DFFF | reserved for UTF-16 surrogate pairs
// U+E000 - U+FFFF | 1110 ____ | 10__ ____ | 10__ ____
// U+10000 - U+10FFFF | 1111 0___ | 10__ ____ | 10__ ____ | 10__ ____
// nonascii_is_utf8 validates the UTF-8 sequence that starts at sp[0], with `n`
// bytes available. It returns the length of the sequence in bytes (1 for an
// ascii byte, 2-4 for a multi-byte sequence) or 0 when the sequence is invalid
// or truncated.
static inline ssize_t nonascii_is_utf8(const uint8_t* sp, size_t n) {
    uint8_t mask = first[sp[0]];
    uint8_t size = mask & 7;

    /* Markers with size 0 (ascii) and size 1 (invalid lead byte) carry 0xF in
     * their high nibble, which is not a valid accept-range index, so they are
     * resolved before the range table is touched. */
    if (size == 0) {
        return 1;
    }
    if (size == 1) {
        return 0;
    }

    /* truncated sequence */
    if (n < size) {
        return 0;
    }

    /* the second byte has a lead-dependent range, the others must be 0x80-0xBF */
    struct AcceptRange accept = ranges[mask >> 4];
    switch (size) {
        case 4 : if (sp[3] < locb || hicb < sp[3]) return 0; /* fallthrough */
        case 3 : if (sp[2] < locb || hicb < sp[2]) return 0; /* fallthrough */
        case 2 : if (sp[1] < accept.lo || accept.hi < sp[1]) return 0; break;
        default: return 0;
    }
    return size;
}
/*
 * find_non_ascii returns the offset of the first byte >= 0x80 within the `nb`
 * bytes starting at `sp`, or -1 when every byte is ascii. Wide blocks are
 * checked with SIMD (32 bytes at a time when USE_AVX2 is set, then 16 bytes),
 * and the remaining tail byte by byte.
 */
ssize_t find_non_ascii(const uint8_t*sp, ssize_t nb) {
    const uint8_t* base = sp;
    int64_t bits;

#if USE_AVX2
    for (; nb >= 32; nb -= 32, sp += 32) {
        __m256i block = _mm256_loadu_si256 ((const void *)(sp));
        bits = _mm256_ascii_mask(block);
        if (unlikely(bits != 0)) {
            return sp - base + __builtin_ctzll(bits);
        }
    }

    /* clear upper half to avoid AVX-SSE transition penalty */
    _mm256_zeroupper();
#endif

    for (; nb >= 16; nb -= 16, sp += 16) {
        __m128i block = _mm_loadu_si128 ((const void *)(sp));
        bits = _mm_ascii_mask(block);
        if (unlikely(bits != 0)) {
            return sp - base + __builtin_ctzll(bits);
        }
    }

    /* remaining bytes, do with scalar code */
    for (; nb > 0; nb--, sp++) {
        if (!is_ascii(*sp)) {
            return sp - base;
        }
    }

    /* nothing found */
    return -1;
}
// utf8_validate checks whether the `nb` bytes starting at `sp` are valid UTF-8.
// It returns -1 when they are valid, otherwise the offset of the first
// invalid sequence.
ssize_t utf8_validate(const char *sp, ssize_t nb) {
    const uint8_t* start = (const uint8_t*)sp;
    const uint8_t* cur   = start;

    while (nb > 0) {
        ssize_t skip = 0;

        /* Runs of consecutive non-ascii chars skip the search entirely;
         * otherwise jump over the ascii prefix. */
        if (is_ascii(*cur)) {
            skip = find_non_ascii(cur, nb);
            /* no non-ascii byte left in the string */
            if (skip == -1 || skip >= nb) {
                return -1;
            }
        }
        cur += skip;
        nb  -= skip;

        /* validate the non-ascii sequence */
        ssize_t width = nonascii_is_utf8(cur, nb);
        if (unlikely(width == 0)) {
            return cur - start;
        }
        cur += width;
        nb  -= width;
    }
    return -1;
}

121
native/utf8.h Normal file
View file

@ -0,0 +1,121 @@
#ifndef UTF8_H
#define UTF8_H
/*
* Copyright (C) 2019 Yaoyuan <ibireme@gmail.com>.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* This file may have been modified by ByteDance authors. All ByteDance
* Modifications are Copyright 2022 ByteDance Authors.
*/
/*
 * valid_utf8_4byte checks whether the little-endian word `ubin` (first byte of
 * the input in the lowest 8 bits) begins with a valid multi-byte UTF-8
 * sequence. It returns the length of that sequence (2, 3 or 4), or 0 when the
 * leading bytes are not a valid multi-byte sequence. A leading ascii byte
 * also yields 0: callers are expected to handle ascii themselves.
 */
static inline ssize_t valid_utf8_4byte(uint32_t ubin) {
    /*
    Each unicode code point is encoded as 1 to 4 bytes in UTF-8 encoding,
    we use 4-byte mask and pattern value to validate UTF-8 byte sequence,
    this requires the input data to have 4-byte zero padding.
    ---------------------------------------------------
    1 byte
    unicode range [U+0000, U+007F]
    unicode min   [.......0]
    unicode max   [.1111111]
    bit pattern   [0.......]
    ---------------------------------------------------
    2 byte
    unicode range [U+0080, U+07FF]
    unicode min   [......10 ..000000]
    unicode max   [...11111 ..111111]
    bit require   [...xxxx. ........] (1E 00)
    bit mask      [xxx..... xx......] (E0 C0)
    bit pattern   [110..... 10......] (C0 80)
    ---------------------------------------------------
    3 byte
    unicode range [U+0800, U+FFFF]
    unicode min   [........ ..100000 ..000000]
    unicode max   [....1111 ..111111 ..111111]
    bit require   [....xxxx ..x..... ........] (0F 20 00)
    bit mask      [xxxx.... xx...... xx......] (F0 C0 C0)
    bit pattern   [1110.... 10...... 10......] (E0 80 80)
    ---------------------------------------------------
    3 byte invalid (reserved for surrogate halves)
    unicode range [U+D800, U+DFFF]
    unicode min   [....1101 ..100000 ..000000]
    unicode max   [....1101 ..111111 ..111111]
    bit mask      [....xxxx ..x..... ........] (0F 20 00)
    bit pattern   [....1101 ..1..... ........] (0D 20 00)
    ---------------------------------------------------
    4 byte
    unicode range [U+10000, U+10FFFF]
    unicode min   [........ ...10000 ..000000 ..000000]
    unicode max   [.....100 ..001111 ..111111 ..111111]
    bit err0      [.....100 ........ ........ ........] (04 00 00 00)
    bit err1      [.....011 ..110000 ........ ........] (03 30 00 00)
    bit require   [.....xxx ..xx.... ........ ........] (07 30 00 00)
    bit mask      [xxxxx... xx...... xx...... xx......] (F8 C0 C0 C0)
    bit pattern   [11110... 10...... 10...... 10......] (F0 80 80 80)
    ---------------------------------------------------
    */
    const uint32_t b2_mask = 0x0000C0E0UL;
    const uint32_t b2_patt = 0x000080C0UL;
    const uint32_t b2_requ = 0x0000001EUL;
    const uint32_t b3_mask = 0x00C0C0F0UL;
    const uint32_t b3_patt = 0x008080E0UL;
    const uint32_t b3_requ = 0x0000200FUL;
    const uint32_t b3_erro = 0x0000200DUL;
    const uint32_t b4_mask = 0xC0C0C0F8UL;
    const uint32_t b4_patt = 0x808080F0UL;
    const uint32_t b4_requ = 0x00003007UL;
    const uint32_t b4_err0 = 0x00000004UL;
    const uint32_t b4_err1 = 0x00003003UL;
    uint32_t req;

    /* 3-byte sequence: the required bits must be non-zero (rejects overlong
     * forms) and must not match the surrogate pattern. Tested first, as in
     * the original ordering 3 -> 2 -> 4. */
    if ((ubin & b3_mask) == b3_patt) {
        req = ubin & b3_requ;
        if (req != 0 && req != b3_erro) {
            return 3;
        }
    }

    /* 2-byte sequence: at least one required bit set (rejects overlong C0/C1) */
    if ((ubin & b2_mask) == b2_patt && (ubin & b2_requ) != 0) {
        return 2;
    }

    /* 4-byte sequence: required bits non-zero (rejects overlong forms) and not
     * beyond U+10FFFF (err0 and err1 bits must not both be present) */
    if ((ubin & b4_mask) == b4_patt) {
        req = ubin & b4_requ;
        if (req != 0 && ((req & b4_err0) == 0 || (req & b4_err1) == 0)) {
            return 4;
        }
    }

    return 0;
}
/*
 * less4byte_to_uint32 packs the 1, 2 or 3 bytes at `sp` into the low bytes of
 * a uint32_t, first byte in the lowest 8 bits (the layout valid_utf8_4byte
 * expects); the unused high bytes are zero. Any `nb` other than 1 or 2 is
 * treated as 3, so `sp` must then reference at least 3 readable bytes.
 *
 * The value is assembled byte by byte: `sp` has no alignment guarantee, so
 * loading a uint16_t through it would be a misaligned/aliasing access.
 */
static inline uint32_t less4byte_to_uint32(const char* sp, size_t nb) {
    const uint8_t *b = (const uint8_t *)sp;
    uint32_t word = b[0];
    if (nb == 1) return word;
    word |= (uint32_t)b[1] << 8;
    if (nb == 2) return word;
    word |= (uint32_t)b[2] << 16;
    return word;
}
#endif

View file

@ -61,6 +61,8 @@ type frozenConfig struct {
// Froze convert the Config to API
func (cfg Config) Froze() API {
api := &frozenConfig{Config: cfg}
// configure encoder options:
if cfg.EscapeHTML {
api.encoderOpts |= encoder.EscapeHTML
}
@ -73,6 +75,11 @@ func (cfg Config) Froze() API {
if cfg.NoQuoteTextMarshaler {
api.encoderOpts |= encoder.NoQuoteTextMarshaler
}
if cfg.NoNullSliceOrMap {
api.encoderOpts |= encoder.NoNullSliceOrMap
}
// configure decoder options:
if cfg.UseInt64 {
api.decoderOpts |= decoder.OptionUseInt64
}
@ -85,8 +92,8 @@ func (cfg Config) Froze() API {
if cfg.CopyString {
api.decoderOpts |= decoder.OptionCopyString
}
if cfg.NoNullSliceOrMap {
api.encoderOpts |= encoder.NoNullSliceOrMap
if cfg.ValidateString {
api.decoderOpts |= decoder.OptionValidateString
}
return api
}
@ -111,7 +118,6 @@ func (cfg *frozenConfig) MarshalIndent(val interface{}, prefix, indent string) (
func (cfg *frozenConfig) UnmarshalFromString(buf string, val interface{}) error {
dec := decoder.NewDecoder(buf)
dec.SetOptions(cfg.decoderOpts)
err := dec.Decode(val)
pos := dec.Pos()