mirror of
https://github.com/ii64/sonic.git
synced 2026-06-21 00:46:43 +08:00
support JSON validate (#189)
* fix: check unescaped control chars in decode * feat: add utf8 validate func * feat: validate utf8 in json string * feat: add validateone api * fix: check unicode pointer for surrogate * clang12 compile * feat: Import `Valid()` and `Skip()` * opt: use looktable * fix utf-8 validate performance problem * fix: utf-8 validate bug * clang12 build * feat: (encoder) accelerate validating json from `json.Marshaler` chore!: - `encoder.NoCompactMarshaler`changes to `encoder.CompactMarshaler`, which means compacting operation is not open by default * fix: only one json value is `Valid()` Co-authored-by: liuqiang <liuqiang.06@bytedance.com> Co-authored-by: duanyi.aster <duanyi.aster@bytedance.com>
This commit is contained in:
parent
5be8dafe41
commit
7475b256ce
28 changed files with 3812 additions and 1615 deletions
13
README.md
13
README.md
|
|
@ -146,13 +146,8 @@ import "github.com/bytedance/sonic"
|
|||
v := map[string]string{"&&":{"<>"}}
|
||||
ret, err := Encode(v, EscapeHTML) // ret == `{"\u0026\u0026":{"X":"\u003c\u003e"}}`
|
||||
```
|
||||
### Optimization Options
|
||||
- encoder.NoCompactMarshaler
|
||||
|
||||
When marshaling `json.RawMessage` or `json.Marshaler`, sonic ensures validating and compacting their output JSON string. The higher the ratio of these kinds of data is, the much this feature impacts encoding performance. Therefore, we provide option `encoder.NoCompactMarshaler` to skip the compacting process, which means your marshaler's outputs **MUST** be valid JSON. If not, **Undocumented Behavior** may happen.
|
||||
- encoder.NoQuoteTextMarshaler
|
||||
|
||||
We also provide option `encoder.NoQuoteTextMarshaler` to avoid quoting the output string of `encoding.TextMarshaler`.
|
||||
### Compact Format
|
||||
Sonic encodes primitive objects (struct/map...) as compact-format JSON by default, except when marshaling `json.RawMessage` or `json.Marshaler`: sonic validates their output JSON but **DOES NOT** compact it, for performance reasons. We provide the option `encoder.CompactMarshaler` to add the compacting process.
|
||||
|
||||
### Print Syntax Error
|
||||
```go
|
||||
|
|
@ -253,8 +248,8 @@ import (
|
|||
// you can set compile recursive depth in Pretouch for better stability in JIT.
|
||||
err := sonic.Pretouch(reflect.TypeOf(v), option.WithCompileRecursiveDepth(depth))
|
||||
```
|
||||
### Accelerate `json.RawMessage\json.Marshaler\encoding.TextMarshaler`
|
||||
To ensure data security, sonic.Encoder validates and escapes JSON values from these interfaces by default, which may degrade performance much if most of your data is in form of them. We provide two options `encoder.NoCompactMarshaler` (for `json.RawMessage\json.Marshaler`) and `encoder.NoQuoteTextMarshaler` (for `encoding.TextMarshaler`) to avoid validating and escaping operations, which means you **MUST** ensure the validity of JSON values from these interfaces by your own.
|
||||
### Accelerate `encoding.TextMarshaler`
|
||||
To ensure data security, sonic.Encoder quotes and escapes string values from `encoding.TextMarshaler` interfaces by default, which may degrade performance significantly if most of your data is in that form. We provide `encoder.NoQuoteTextMarshaler` to skip these operations, which means you **MUST** ensure that their output strings are escaped and quoted in accordance with [RFC8259](https://datatracker.ietf.org/doc/html/rfc8259).
|
||||
|
||||
### Pass string or []byte?
|
||||
For alignment with `encoding/json`, we provide APIs that take `[]byte` as an argument, but the string-to-bytes copy is performed at the same time for safety, which may hurt performance when the original JSON is huge. Therefore, you can use `UnmarshalString` and `GetFromString` to pass a string, as long as your original data is a string or a **nocopy-cast** is safe for your []byte.
|
||||
|
|
|
|||
|
|
@ -94,7 +94,7 @@ func TestEncodeValue(t *testing.T) {
|
|||
{NewObject([]Pair{}), `{}`, false},
|
||||
{NewBytes([]byte("hello, world")), `"aGVsbG8sIHdvcmxk"`, false},
|
||||
{NewAny(obj), string(buf), false},
|
||||
{NewRaw(`[{ }]`), "[{}]", false},
|
||||
{NewRaw(`[{ }]`), "[{ }]", false},
|
||||
{Node{}, "", true},
|
||||
{Node{t: types.ValueType(1)}, "", true},
|
||||
}
|
||||
|
|
|
|||
|
|
@ -18,7 +18,6 @@ package ast
|
|||
|
||||
import (
|
||||
`fmt`
|
||||
`sync`
|
||||
`unsafe`
|
||||
|
||||
`github.com/bytedance/sonic/decoder`
|
||||
|
|
@ -47,12 +46,6 @@ type Parser struct {
|
|||
skipValue bool
|
||||
}
|
||||
|
||||
var stackPool = sync.Pool{
|
||||
New: func()interface{}{
|
||||
return &types.StateMachine{}
|
||||
},
|
||||
}
|
||||
|
||||
/** Parser Private Methods **/
|
||||
|
||||
func (self *Parser) delim() types.ParsingError {
|
||||
|
|
@ -325,9 +318,9 @@ func (self *Parser) Parse() (Node, types.ParsingError) {
|
|||
}
|
||||
|
||||
func (self *Parser) skip() (int, types.ParsingError) {
|
||||
fsm := stackPool.Get().(*types.StateMachine)
|
||||
fsm := types.NewStateMachine()
|
||||
start := native.SkipOne(&self.s, &self.p, fsm)
|
||||
stackPool.Put(fsm)
|
||||
types.FreeStateMachine(fsm)
|
||||
|
||||
if start < 0 {
|
||||
return self.p, types.ParsingError(-start)
|
||||
|
|
|
|||
|
|
@ -22,6 +22,8 @@ import (
|
|||
`runtime`
|
||||
|
||||
`github.com/bytedance/sonic/internal/rt`
|
||||
`github.com/bytedance/sonic/internal/native`
|
||||
`github.com/bytedance/sonic/internal/native/types`
|
||||
`github.com/bytedance/sonic/option`
|
||||
)
|
||||
|
||||
|
|
@ -160,4 +162,15 @@ func pretouchRec(vtm map[reflect.Type]bool, opts option.CompileOptions) error {
|
|||
}
|
||||
opts.RecursiveDepth -= 1
|
||||
return pretouchRec(next, opts)
|
||||
}
|
||||
|
||||
// Skip skips only one json value, and returns first non-blank character position and its ending position if it is valid.
|
||||
// Otherwise, start holds a negative error code and end holds the position of the invalid character.
|
||||
func Skip(data []byte) (start int, end int) {
|
||||
s := rt.Mem2Str(data)
|
||||
p := 0
|
||||
m := types.NewStateMachine()
|
||||
ret := native.SkipOne(&s, &p, m)
|
||||
types.FreeStateMachine(m)
|
||||
return ret, p
|
||||
}
|
||||
|
|
@ -29,6 +29,7 @@ import (
|
|||
`github.com/json-iterator/go`
|
||||
`github.com/stretchr/testify/assert`
|
||||
`github.com/stretchr/testify/require`
|
||||
`github.com/bytedance/sonic/internal/rt`
|
||||
)
|
||||
|
||||
func TestMain(m *testing.M) {
|
||||
|
|
@ -333,3 +334,14 @@ func BenchmarkDecoder_Parallel_Binding_GoJson(b *testing.B) {
|
|||
})
|
||||
}
|
||||
|
||||
func BenchmarkSkip_Sonic(b *testing.B) {
|
||||
var data = rt.Str2Mem(TwitterJson)
|
||||
if ret, _ := Skip(data); ret < 0 {
|
||||
b.Fatal()
|
||||
}
|
||||
b.SetBytes(int64(len(TwitterJson)))
|
||||
b.ResetTimer()
|
||||
for i:=0; i<b.N; i++ {
|
||||
_, _ = Skip(data)
|
||||
}
|
||||
}
|
||||
|
|
@ -24,6 +24,7 @@ import (
|
|||
`unsafe`
|
||||
|
||||
`github.com/bytedance/sonic/internal/native`
|
||||
`github.com/bytedance/sonic/internal/native/types`
|
||||
`github.com/bytedance/sonic/internal/rt`
|
||||
`github.com/bytedance/sonic/option`
|
||||
)
|
||||
|
|
@ -34,7 +35,7 @@ type Options uint64
|
|||
const (
|
||||
bitSortMapKeys = iota
|
||||
bitEscapeHTML
|
||||
bitNoCompactMarshaler
|
||||
bitCompactMarshaler
|
||||
bitNoQuoteTextMarshaler
|
||||
)
|
||||
|
||||
|
|
@ -49,9 +50,9 @@ const (
|
|||
// WARNING: This hurts performance A LOT, USE WITH CARE.
|
||||
EscapeHTML Options = 1 << bitEscapeHTML
|
||||
|
||||
// NoCompactMarshaler indicates that the output JSON from json.Marshaler
|
||||
// CompactMarshaler indicates that the output JSON from json.Marshaler
|
||||
// is always compact and needs no validation
|
||||
NoCompactMarshaler Options = 1 << bitNoCompactMarshaler
|
||||
CompactMarshaler Options = 1 << bitCompactMarshaler
|
||||
|
||||
// NoQuoteTextMarshaler indicates that the output text from encoding.TextMarshaler
|
||||
// is always escaped string and needs no quoting
|
||||
|
|
@ -276,4 +277,28 @@ func pretouchRec(vtm map[reflect.Type]bool, opts option.CompileOptions) error {
|
|||
}
|
||||
opts.RecursiveDepth -= 1
|
||||
return pretouchRec(next, opts)
|
||||
}
|
||||
|
||||
// Valid validates json and returns first non-blank character position,
|
||||
// if it is only one valid json value.
|
||||
// Otherwise returns invalid character position using start.
|
||||
func Valid(data []byte) (ok bool, start int) {
|
||||
n := len(data)
|
||||
if n == 0 {
|
||||
return false, -1
|
||||
}
|
||||
s := rt.Mem2Str(data)
|
||||
p := 0
|
||||
m := types.NewStateMachine()
|
||||
ret := native.ValidateOne(&s, &p, m)
|
||||
types.FreeStateMachine(m)
|
||||
if ret < 0 {
|
||||
return false, p-1
|
||||
}
|
||||
for ;p < n; p++ {
|
||||
if (types.SPACE_MASK & (1 << data[p])) == 0 {
|
||||
return false, p
|
||||
}
|
||||
}
|
||||
return true, ret
|
||||
}
|
||||
|
|
@ -29,6 +29,7 @@ import (
|
|||
gojson `github.com/goccy/go-json`
|
||||
`github.com/json-iterator/go`
|
||||
`github.com/stretchr/testify/require`
|
||||
`github.com/bytedance/sonic/internal/rt`
|
||||
)
|
||||
|
||||
func TestMain(m *testing.M) {
|
||||
|
|
@ -122,19 +123,34 @@ func TestEncoder_Marshaler(t *testing.T) {
|
|||
v := MarshalerStruct{V: MarshalerImpl{X: 12345}}
|
||||
ret, err := Encode(&v, 0)
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, `{"V":12345}`, string(ret))
|
||||
require.Equal(t, `{"V":12345 }`, string(ret))
|
||||
ret, err = Encode(v, 0)
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, `{"V":{"X":12345}}`, string(ret))
|
||||
|
||||
ret2, err2 := Encode(&v, NoCompactMarshaler)
|
||||
ret2, err2 := Encode(&v, 0)
|
||||
require.NoError(t, err2)
|
||||
require.Equal(t, `{"V":12345 }`, string(ret2))
|
||||
ret3, err3 := Encode(v, NoCompactMarshaler)
|
||||
ret3, err3 := Encode(v, CompactMarshaler)
|
||||
require.NoError(t, err3)
|
||||
require.Equal(t, `{"V":{"X":12345}}`, string(ret3))
|
||||
}
|
||||
|
||||
type MarshalerErrorStruct struct {
|
||||
V MarshalerImpl
|
||||
}
|
||||
|
||||
func (self *MarshalerErrorStruct) MarshalJSON() ([]byte, error) {
|
||||
return []byte(`[""] {`), nil
|
||||
}
|
||||
|
||||
func TestMarshalerError(t *testing.T) {
|
||||
v := MarshalerErrorStruct{}
|
||||
ret, err := Encode(&v, 0)
|
||||
require.EqualError(t, err, `invalid Marshaler output json syntax at 5: "[\"\"] {"`)
|
||||
require.Equal(t, []byte(nil), ret)
|
||||
}
|
||||
|
||||
type RawMessageStruct struct {
|
||||
X json.RawMessage
|
||||
}
|
||||
|
|
@ -145,11 +161,11 @@ func TestEncoder_RawMessage(t *testing.T) {
|
|||
}
|
||||
ret, err := Encode(&rms, 0)
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, `{"X":123456}`, string(ret))
|
||||
|
||||
ret, err = Encode(&rms, NoCompactMarshaler)
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, `{"X":123456 }`, string(ret))
|
||||
|
||||
ret, err = Encode(&rms, CompactMarshaler)
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, `{"X":123456}`, string(ret))
|
||||
}
|
||||
|
||||
type TextMarshalerImpl struct {
|
||||
|
|
@ -234,11 +250,11 @@ func TestEncoder_MapSortKey(t *testing.T) {
|
|||
}
|
||||
|
||||
func BenchmarkEncoder_Generic_Sonic(b *testing.B) {
|
||||
_, _ = Encode(_GenericValue, SortMapKeys | EscapeHTML)
|
||||
_, _ = Encode(_GenericValue, SortMapKeys | EscapeHTML | CompactMarshaler)
|
||||
b.SetBytes(int64(len(TwitterJson)))
|
||||
b.ResetTimer()
|
||||
for i := 0; i < b.N; i++ {
|
||||
_, _ = Encode(_GenericValue, SortMapKeys | EscapeHTML)
|
||||
_, _ = Encode(_GenericValue, SortMapKeys | EscapeHTML | CompactMarshaler)
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -279,20 +295,20 @@ func BenchmarkEncoder_Generic_StdLib(b *testing.B) {
|
|||
}
|
||||
|
||||
func BenchmarkEncoder_Binding_Sonic(b *testing.B) {
|
||||
_, _ = Encode(&_BindingValue, SortMapKeys | EscapeHTML)
|
||||
_, _ = Encode(&_BindingValue, SortMapKeys | EscapeHTML | CompactMarshaler)
|
||||
b.SetBytes(int64(len(TwitterJson)))
|
||||
b.ResetTimer()
|
||||
for i := 0; i < b.N; i++ {
|
||||
_, _ = Encode(&_BindingValue, SortMapKeys | EscapeHTML)
|
||||
_, _ = Encode(&_BindingValue, SortMapKeys | EscapeHTML | CompactMarshaler)
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkEncoder_Binding_Sonic_Fast(b *testing.B) {
|
||||
_, _ = Encode(&_BindingValue, NoCompactMarshaler | NoQuoteTextMarshaler)
|
||||
_, _ = Encode(&_BindingValue, NoQuoteTextMarshaler)
|
||||
b.SetBytes(int64(len(TwitterJson)))
|
||||
b.ResetTimer()
|
||||
for i := 0; i < b.N; i++ {
|
||||
_, _ = Encode(&_BindingValue, NoCompactMarshaler | NoQuoteTextMarshaler)
|
||||
_, _ = Encode(&_BindingValue, NoQuoteTextMarshaler)
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -324,23 +340,23 @@ func BenchmarkEncoder_Binding_StdLib(b *testing.B) {
|
|||
}
|
||||
|
||||
func BenchmarkEncoder_Parallel_Generic_Sonic(b *testing.B) {
|
||||
_, _ = Encode(_GenericValue, SortMapKeys | EscapeHTML)
|
||||
_, _ = Encode(_GenericValue, SortMapKeys | EscapeHTML | CompactMarshaler)
|
||||
b.SetBytes(int64(len(TwitterJson)))
|
||||
b.ResetTimer()
|
||||
b.RunParallel(func(pb *testing.PB) {
|
||||
for pb.Next() {
|
||||
_, _ = Encode(_GenericValue, SortMapKeys | EscapeHTML)
|
||||
_, _ = Encode(_GenericValue, SortMapKeys | EscapeHTML | CompactMarshaler)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func BenchmarkEncoder_Parallel_Generic_Sonic_Fast(b *testing.B) {
|
||||
_, _ = Encode(_GenericValue, NoCompactMarshaler | NoQuoteTextMarshaler)
|
||||
_, _ = Encode(_GenericValue, NoQuoteTextMarshaler)
|
||||
b.SetBytes(int64(len(TwitterJson)))
|
||||
b.ResetTimer()
|
||||
b.RunParallel(func(pb *testing.PB) {
|
||||
for pb.Next() {
|
||||
_, _ = Encode(_GenericValue, NoCompactMarshaler | NoQuoteTextMarshaler)
|
||||
_, _ = Encode(_GenericValue, NoQuoteTextMarshaler)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
|
@ -379,23 +395,23 @@ func BenchmarkEncoder_Parallel_Generic_StdLib(b *testing.B) {
|
|||
}
|
||||
|
||||
func BenchmarkEncoder_Parallel_Binding_Sonic(b *testing.B) {
|
||||
_, _ = Encode(&_BindingValue, SortMapKeys | EscapeHTML)
|
||||
_, _ = Encode(&_BindingValue, SortMapKeys | EscapeHTML | CompactMarshaler)
|
||||
b.SetBytes(int64(len(TwitterJson)))
|
||||
b.ResetTimer()
|
||||
b.RunParallel(func(pb *testing.PB) {
|
||||
for pb.Next() {
|
||||
_, _ = Encode(&_BindingValue, SortMapKeys | EscapeHTML)
|
||||
_, _ = Encode(&_BindingValue, SortMapKeys | EscapeHTML | CompactMarshaler)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func BenchmarkEncoder_Parallel_Binding_Sonic_Fast(b *testing.B) {
|
||||
_, _ = Encode(&_BindingValue, NoCompactMarshaler | NoQuoteTextMarshaler)
|
||||
_, _ = Encode(&_BindingValue, NoQuoteTextMarshaler)
|
||||
b.SetBytes(int64(len(TwitterJson)))
|
||||
b.ResetTimer()
|
||||
b.RunParallel(func(pb *testing.PB) {
|
||||
for pb.Next() {
|
||||
_, _ = Encode(&_BindingValue, NoCompactMarshaler | NoQuoteTextMarshaler)
|
||||
_, _ = Encode(&_BindingValue, NoQuoteTextMarshaler)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
|
@ -455,4 +471,44 @@ func BenchmarkHTMLEscape_StdLib(b *testing.B) {
|
|||
buf = out.Bytes()
|
||||
}
|
||||
_ = buf
|
||||
}
|
||||
|
||||
|
||||
func BenchmarkValidate_Sonic(b *testing.B) {
|
||||
var data = rt.Str2Mem(TwitterJson)
|
||||
ok, s := Valid(data)
|
||||
if !ok {
|
||||
b.Fatal(s)
|
||||
}
|
||||
b.SetBytes(int64(len(TwitterJson)))
|
||||
b.ResetTimer()
|
||||
for i:=0; i<b.N; i++ {
|
||||
_, _ = Valid(data)
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkValidate_Std(b *testing.B) {
|
||||
var data = rt.Str2Mem(TwitterJson)
|
||||
if !json.Valid(data) {
|
||||
b.Fatal()
|
||||
}
|
||||
b.SetBytes(int64(len(TwitterJson)))
|
||||
b.ResetTimer()
|
||||
for i:=0; i<b.N; i++ {
|
||||
_ = json.Valid(data)
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkCompact_Std(b *testing.B) {
|
||||
var data = rt.Str2Mem(TwitterJson)
|
||||
var dst = bytes.NewBuffer(nil)
|
||||
if err := json.Compact(dst, data); err != nil {
|
||||
b.Fatal(err)
|
||||
}
|
||||
b.SetBytes(int64(len(TwitterJson)))
|
||||
b.ResetTimer()
|
||||
for i:=0; i<b.N; i++ {
|
||||
dst.Reset()
|
||||
_ = json.Compact(dst, data)
|
||||
}
|
||||
}
|
||||
|
|
@ -18,6 +18,7 @@ package encoder
|
|||
|
||||
import (
|
||||
`encoding/json`
|
||||
`fmt`
|
||||
`reflect`
|
||||
`strconv`
|
||||
)
|
||||
|
|
@ -42,3 +43,7 @@ func error_number(number json.Number) error {
|
|||
Value : reflect.ValueOf(number),
|
||||
}
|
||||
}
|
||||
|
||||
func error_marshaler(ret []byte, pos int) error {
|
||||
return fmt.Errorf("invalid Marshaler output json syntax at %d: %q", pos, ret)
|
||||
}
|
||||
|
|
|
|||
|
|
@ -83,11 +83,14 @@ func encodeJsonMarshaler(buf *[]byte, val json.Marshaler, opt Options) error {
|
|||
if ret, err := val.MarshalJSON(); err != nil {
|
||||
return err
|
||||
} else {
|
||||
if opt & NoCompactMarshaler != 0 {
|
||||
*buf = append(*buf, ret...)
|
||||
return nil
|
||||
if opt & CompactMarshaler != 0 {
|
||||
return compact(buf, ret)
|
||||
}
|
||||
return compact(buf, ret)
|
||||
if ok, s := Valid(ret); !ok {
|
||||
return error_marshaler(ret, s)
|
||||
}
|
||||
*buf = append(*buf, ret...)
|
||||
return nil
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -103,3 +103,8 @@ func __skip_array(s *string, p *int, m *types.StateMachine) (ret int)
|
|||
//go:noescape
|
||||
//goland:noinspection GoUnusedParameter
|
||||
func __skip_object(s *string, p *int, m *types.StateMachine) (ret int)
|
||||
|
||||
//go:nosplit
|
||||
//go:noescape
|
||||
//goland:noinspection GoUnusedParameter
|
||||
func __validate_one(s *string, p *int, m *types.StateMachine) (ret int)
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load diff
|
|
@ -257,6 +257,51 @@ func TestNative_VstringEscapeEOF(t *testing.T) {
|
|||
assert.Equal(t, int64(0), v.Iv)
|
||||
}
|
||||
|
||||
func TestNative_ValidateOne(t *testing.T) {
|
||||
{
|
||||
p := 0
|
||||
s := "\"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\\n\\r\\b\\f😁ſ景\xef\xbf\xbf\xf4\x8f\xbf\xbf\xc2\x80xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\""
|
||||
r := __validate_one(&s, &p, &types.StateMachine{})
|
||||
assert.Equal(t, len(s), p)
|
||||
assert.Equal(t, 0, r)
|
||||
}
|
||||
{
|
||||
p := 0
|
||||
s := "\"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\bxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\"x"
|
||||
r := __validate_one(&s, &p, &types.StateMachine{})
|
||||
assert.Equal(t, 64, p)
|
||||
assert.Equal(t, -int(types.ERR_INVALID_CHAR), r)
|
||||
}
|
||||
{
|
||||
p := 0
|
||||
s := "\"\x00\"x"
|
||||
r := __validate_one(&s, &p, &types.StateMachine{})
|
||||
assert.Equal(t, 1, p)
|
||||
assert.Equal(t, -int(types.ERR_INVALID_CHAR), r)
|
||||
}
|
||||
{
|
||||
p := 0
|
||||
s := "\"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\x80xxxxxxxxxxxxxxxxxxxxxxxxxxxxx\"x"
|
||||
r := __validate_one(&s, &p, &types.StateMachine{})
|
||||
assert.Equal(t, 64, p)
|
||||
assert.Equal(t, -int(types.ERR_INVALID_CHAR), r)
|
||||
}
|
||||
{
|
||||
p := 0
|
||||
s := "\"\x80\"x"
|
||||
r := __validate_one(&s, &p, &types.StateMachine{})
|
||||
assert.Equal(t, 1, p)
|
||||
assert.Equal(t, -int(types.ERR_INVALID_CHAR), r)
|
||||
}
|
||||
{
|
||||
p := 0
|
||||
s := "\"\xed\xbf\xbf\"x"
|
||||
r := __validate_one(&s, &p, &types.StateMachine{})
|
||||
assert.Equal(t, 1, p)
|
||||
assert.Equal(t, -int(types.ERR_INVALID_CHAR), r)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNative_VstringHangUpOnRandomData(t *testing.T) {
|
||||
v, e := hex.DecodeString(
|
||||
"228dc61efd54ef80a908fb6026b7f2d5f92a257ba8b347c995f259eb8685376a" +
|
||||
|
|
|
|||
|
|
@ -9,22 +9,23 @@ package avx
|
|||
func __native_entry__() uintptr
|
||||
|
||||
var (
|
||||
_subr__f64toa = __native_entry__() + 630
|
||||
_subr__html_escape = __native_entry__() + 8160
|
||||
_subr__i64toa = __native_entry__() + 3642
|
||||
_subr__lspace = __native_entry__() + 301
|
||||
_subr__lzero = __native_entry__() + 13
|
||||
_subr__quote = __native_entry__() + 4955
|
||||
_subr__skip_array = __native_entry__() + 17223
|
||||
_subr__skip_object = __native_entry__() + 17258
|
||||
_subr__skip_one = __native_entry__() + 15444
|
||||
_subr__u64toa = __native_entry__() + 3735
|
||||
_subr__unquote = __native_entry__() + 6005
|
||||
_subr__value = __native_entry__() + 10806
|
||||
_subr__vnumber = __native_entry__() + 13602
|
||||
_subr__vsigned = __native_entry__() + 14916
|
||||
_subr__vstring = __native_entry__() + 12567
|
||||
_subr__vunsigned = __native_entry__() + 15175
|
||||
_subr__f64toa = __native_entry__() + 630
|
||||
_subr__html_escape = __native_entry__() + 8160
|
||||
_subr__i64toa = __native_entry__() + 3642
|
||||
_subr__lspace = __native_entry__() + 301
|
||||
_subr__lzero = __native_entry__() + 13
|
||||
_subr__quote = __native_entry__() + 4955
|
||||
_subr__skip_array = __native_entry__() + 17296
|
||||
_subr__skip_object = __native_entry__() + 17333
|
||||
_subr__skip_one = __native_entry__() + 15444
|
||||
_subr__u64toa = __native_entry__() + 3735
|
||||
_subr__unquote = __native_entry__() + 6005
|
||||
_subr__validate_one = __native_entry__() + 20414
|
||||
_subr__value = __native_entry__() + 10806
|
||||
_subr__vnumber = __native_entry__() + 13602
|
||||
_subr__vsigned = __native_entry__() + 14916
|
||||
_subr__vstring = __native_entry__() + 12567
|
||||
_subr__vunsigned = __native_entry__() + 15175
|
||||
)
|
||||
|
||||
const (
|
||||
|
|
@ -34,11 +35,12 @@ const (
|
|||
_stack__lspace = 8
|
||||
_stack__lzero = 8
|
||||
_stack__quote = 80
|
||||
_stack__skip_array = 144
|
||||
_stack__skip_object = 144
|
||||
_stack__skip_one = 144
|
||||
_stack__skip_array = 160
|
||||
_stack__skip_object = 160
|
||||
_stack__skip_one = 160
|
||||
_stack__u64toa = 8
|
||||
_stack__unquote = 88
|
||||
_stack__validate_one = 160
|
||||
_stack__value = 400
|
||||
_stack__vnumber = 312
|
||||
_stack__vsigned = 16
|
||||
|
|
@ -58,6 +60,7 @@ var (
|
|||
_ = _subr__skip_one
|
||||
_ = _subr__u64toa
|
||||
_ = _subr__unquote
|
||||
_ = _subr__validate_one
|
||||
_ = _subr__value
|
||||
_ = _subr__vnumber
|
||||
_ = _subr__vsigned
|
||||
|
|
@ -77,6 +80,7 @@ const (
|
|||
_ = _stack__skip_one
|
||||
_ = _stack__u64toa
|
||||
_ = _stack__unquote
|
||||
_ = _stack__validate_one
|
||||
_ = _stack__value
|
||||
_ = _stack__vnumber
|
||||
_ = _stack__vsigned
|
||||
|
|
|
|||
|
|
@ -103,3 +103,8 @@ func __skip_array(s *string, p *int, m *types.StateMachine) (ret int)
|
|||
//go:noescape
|
||||
//goland:noinspection GoUnusedParameter
|
||||
func __skip_object(s *string, p *int, m *types.StateMachine) (ret int)
|
||||
|
||||
//go:nosplit
|
||||
//go:noescape
|
||||
//goland:noinspection GoUnusedParameter
|
||||
func __validate_one(s *string, p *int, m *types.StateMachine) (ret int)
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load diff
|
|
@ -257,6 +257,51 @@ func TestNative_VstringEscapeEOF(t *testing.T) {
|
|||
assert.Equal(t, int64(0), v.Iv)
|
||||
}
|
||||
|
||||
func TestNative_ValidateOne(t *testing.T) {
|
||||
{
|
||||
p := 0
|
||||
s := "\"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\\n\\r\\b\\f😁ſ景\xef\xbf\xbf\xf4\x8f\xbf\xbf\xc2\x80xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\""
|
||||
r := __validate_one(&s, &p, &types.StateMachine{})
|
||||
assert.Equal(t, len(s), p)
|
||||
assert.Equal(t, 0, r)
|
||||
}
|
||||
{
|
||||
p := 0
|
||||
s := "\"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\bxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\"x"
|
||||
r := __validate_one(&s, &p, &types.StateMachine{})
|
||||
assert.Equal(t, 64, p)
|
||||
assert.Equal(t, -int(types.ERR_INVALID_CHAR), r)
|
||||
}
|
||||
{
|
||||
p := 0
|
||||
s := "\"\x00\"x"
|
||||
r := __validate_one(&s, &p, &types.StateMachine{})
|
||||
assert.Equal(t, 1, p)
|
||||
assert.Equal(t, -int(types.ERR_INVALID_CHAR), r)
|
||||
}
|
||||
{
|
||||
p := 0
|
||||
s := "\"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\x80xxxxxxxxxxxxxxxxxxxxxxxxxxxxx\"x"
|
||||
r := __validate_one(&s, &p, &types.StateMachine{})
|
||||
assert.Equal(t, 64, p)
|
||||
assert.Equal(t, -int(types.ERR_INVALID_CHAR), r)
|
||||
}
|
||||
{
|
||||
p := 0
|
||||
s := "\"\x80\"x"
|
||||
r := __validate_one(&s, &p, &types.StateMachine{})
|
||||
assert.Equal(t, 1, p)
|
||||
assert.Equal(t, -int(types.ERR_INVALID_CHAR), r)
|
||||
}
|
||||
{
|
||||
p := 0
|
||||
s := "\"\xed\xbf\xbf\"x"
|
||||
r := __validate_one(&s, &p, &types.StateMachine{})
|
||||
assert.Equal(t, 1, p)
|
||||
assert.Equal(t, -int(types.ERR_INVALID_CHAR), r)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNative_VstringHangUpOnRandomData(t *testing.T) {
|
||||
v, e := hex.DecodeString(
|
||||
"228dc61efd54ef80a908fb6026b7f2d5f92a257ba8b347c995f259eb8685376a" +
|
||||
|
|
|
|||
|
|
@ -9,22 +9,23 @@ package avx2
|
|||
func __native_entry__() uintptr
|
||||
|
||||
var (
|
||||
_subr__f64toa = __native_entry__() + 903
|
||||
_subr__html_escape = __native_entry__() + 9535
|
||||
_subr__i64toa = __native_entry__() + 3915
|
||||
_subr__lspace = __native_entry__() + 429
|
||||
_subr__lzero = __native_entry__() + 13
|
||||
_subr__quote = __native_entry__() + 5328
|
||||
_subr__skip_array = __native_entry__() + 21058
|
||||
_subr__skip_object = __native_entry__() + 21093
|
||||
_subr__skip_one = __native_entry__() + 18201
|
||||
_subr__u64toa = __native_entry__() + 4008
|
||||
_subr__unquote = __native_entry__() + 7080
|
||||
_subr__value = __native_entry__() + 13707
|
||||
_subr__vnumber = __native_entry__() + 16359
|
||||
_subr__vsigned = __native_entry__() + 17673
|
||||
_subr__vstring = __native_entry__() + 15482
|
||||
_subr__vunsigned = __native_entry__() + 17932
|
||||
_subr__f64toa = __native_entry__() + 903
|
||||
_subr__html_escape = __native_entry__() + 9535
|
||||
_subr__i64toa = __native_entry__() + 3915
|
||||
_subr__lspace = __native_entry__() + 429
|
||||
_subr__lzero = __native_entry__() + 13
|
||||
_subr__quote = __native_entry__() + 5328
|
||||
_subr__skip_array = __native_entry__() + 21301
|
||||
_subr__skip_object = __native_entry__() + 21338
|
||||
_subr__skip_one = __native_entry__() + 18201
|
||||
_subr__u64toa = __native_entry__() + 4008
|
||||
_subr__unquote = __native_entry__() + 7080
|
||||
_subr__validate_one = __native_entry__() + 24949
|
||||
_subr__value = __native_entry__() + 13707
|
||||
_subr__vnumber = __native_entry__() + 16359
|
||||
_subr__vsigned = __native_entry__() + 17673
|
||||
_subr__vstring = __native_entry__() + 15482
|
||||
_subr__vunsigned = __native_entry__() + 17932
|
||||
)
|
||||
|
||||
const (
|
||||
|
|
@ -39,6 +40,7 @@ const (
|
|||
_stack__skip_one = 136
|
||||
_stack__u64toa = 8
|
||||
_stack__unquote = 72
|
||||
_stack__validate_one = 136
|
||||
_stack__value = 392
|
||||
_stack__vnumber = 312
|
||||
_stack__vsigned = 16
|
||||
|
|
@ -58,6 +60,7 @@ var (
|
|||
_ = _subr__skip_one
|
||||
_ = _subr__u64toa
|
||||
_ = _subr__unquote
|
||||
_ = _subr__validate_one
|
||||
_ = _subr__value
|
||||
_ = _subr__vnumber
|
||||
_ = _subr__vsigned
|
||||
|
|
@ -77,6 +80,7 @@ const (
|
|||
_ = _stack__skip_one
|
||||
_ = _stack__u64toa
|
||||
_ = _stack__unquote
|
||||
_ = _stack__validate_one
|
||||
_ = _stack__value
|
||||
_ = _stack__vnumber
|
||||
_ = _stack__vsigned
|
||||
|
|
|
|||
|
|
@ -83,6 +83,11 @@ func Value(s unsafe.Pointer, n int, p int, v *types.JsonState, allow_control int
|
|||
//goland:noinspection GoUnusedParameter
|
||||
func SkipOne(s *string, p *int, m *types.StateMachine) int
|
||||
|
||||
//go:nosplit
|
||||
//go:noescape
|
||||
//goland:noinspection GoUnusedParameter
|
||||
func ValidateOne(s *string, p *int, m *types.StateMachine) int
|
||||
|
||||
//go:nosplit
|
||||
//go:noescape
|
||||
//goland:noinspection GoUnusedParameter
|
||||
|
|
|
|||
|
|
@ -54,6 +54,12 @@ TEXT ·SkipOne(SB), NOSPLIT, $0 - 32
|
|||
JMP github·com∕bytedance∕sonic∕internal∕native∕avx2·__skip_one(SB)
|
||||
JMP github·com∕bytedance∕sonic∕internal∕native∕avx·__skip_one(SB)
|
||||
|
||||
TEXT ·ValidateOne(SB), NOSPLIT, $0 - 32
|
||||
CMPB github·com∕bytedance∕sonic∕internal∕cpu·HasAVX2(SB), $0
|
||||
JE 2(PC)
|
||||
JMP github·com∕bytedance∕sonic∕internal∕native∕avx2·__validate_one(SB)
|
||||
JMP github·com∕bytedance∕sonic∕internal∕native∕avx·__validate_one(SB)
|
||||
|
||||
TEXT ·I64toa(SB), NOSPLIT, $0 - 32
|
||||
CMPB github·com∕bytedance∕sonic∕internal∕cpu·HasAVX2(SB), $0
|
||||
JE 2(PC)
|
||||
|
|
|
|||
|
|
@ -101,3 +101,8 @@ func __skip_array(s *string, p *int, m *types.StateMachine) (ret int)
|
|||
//go:noescape
|
||||
//goland:noinspection GoUnusedParameter
|
||||
func __skip_object(s *string, p *int, m *types.StateMachine) (ret int)
|
||||
|
||||
//go:nosplit
|
||||
//go:noescape
|
||||
//goland:noinspection GoUnusedParameter
|
||||
func __validate_one(s *string, p *int, m *types.StateMachine) (ret int)
|
||||
|
|
@ -255,6 +255,51 @@ func TestNative_VstringEscapeEOF(t *testing.T) {
|
|||
assert.Equal(t, int64(0), v.Iv)
|
||||
}
|
||||
|
||||
func TestNative_ValidateOne(t *testing.T) {
|
||||
{
|
||||
p := 0
|
||||
s := "\"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\\n\\r\\b\\f😁ſ景\xef\xbf\xbf\xf4\x8f\xbf\xbf\xc2\x80xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\""
|
||||
r := __validate_one(&s, &p, &types.StateMachine{})
|
||||
assert.Equal(t, len(s), p)
|
||||
assert.Equal(t, 0, r)
|
||||
}
|
||||
{
|
||||
p := 0
|
||||
s := "\"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\bxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\"x"
|
||||
r := __validate_one(&s, &p, &types.StateMachine{})
|
||||
assert.Equal(t, 64, p)
|
||||
assert.Equal(t, -int(types.ERR_INVALID_CHAR), r)
|
||||
}
|
||||
{
|
||||
p := 0
|
||||
s := "\"\x00\"x"
|
||||
r := __validate_one(&s, &p, &types.StateMachine{})
|
||||
assert.Equal(t, 1, p)
|
||||
assert.Equal(t, -int(types.ERR_INVALID_CHAR), r)
|
||||
}
|
||||
{
|
||||
p := 0
|
||||
s := "\"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\x80xxxxxxxxxxxxxxxxxxxxxxxxxxxxx\"x"
|
||||
r := __validate_one(&s, &p, &types.StateMachine{})
|
||||
assert.Equal(t, 64, p)
|
||||
assert.Equal(t, -int(types.ERR_INVALID_CHAR), r)
|
||||
}
|
||||
{
|
||||
p := 0
|
||||
s := "\"\x80\"x"
|
||||
r := __validate_one(&s, &p, &types.StateMachine{})
|
||||
assert.Equal(t, 1, p)
|
||||
assert.Equal(t, -int(types.ERR_INVALID_CHAR), r)
|
||||
}
|
||||
{
|
||||
p := 0
|
||||
s := "\"\xed\xbf\xbf\"x"
|
||||
r := __validate_one(&s, &p, &types.StateMachine{})
|
||||
assert.Equal(t, 1, p)
|
||||
assert.Equal(t, -int(types.ERR_INVALID_CHAR), r)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNative_VstringHangUpOnRandomData(t *testing.T) {
|
||||
v, e := hex.DecodeString(
|
||||
"228dc61efd54ef80a908fb6026b7f2d5f92a257ba8b347c995f259eb8685376a" +
|
||||
|
|
|
|||
|
|
@ -18,6 +18,7 @@ package types
|
|||
|
||||
import (
|
||||
`fmt`
|
||||
`sync`
|
||||
)
|
||||
|
||||
type ValueType int
|
||||
|
|
@ -55,6 +56,10 @@ const (
|
|||
MAX_RECURSE = 65536
|
||||
)
|
||||
|
||||
const (
|
||||
SPACE_MASK = (1 << ' ') | (1 << '\t') | (1 << '\r') | (1 << '\n')
|
||||
)
|
||||
|
||||
const (
|
||||
ERR_EOF ParsingError = 1
|
||||
ERR_INVALID_CHAR ParsingError = 2
|
||||
|
|
@ -103,3 +108,18 @@ type StateMachine struct {
|
|||
Sp int
|
||||
Vt [MAX_RECURSE]int
|
||||
}
|
||||
|
||||
var stackPool = sync.Pool{
|
||||
New: func()interface{}{
|
||||
return &StateMachine{}
|
||||
},
|
||||
}
|
||||
|
||||
func NewStateMachine() *StateMachine {
|
||||
return stackPool.Get().(*StateMachine)
|
||||
}
|
||||
|
||||
func FreeStateMachine(fsm *StateMachine) {
|
||||
stackPool.Put(fsm)
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -90,7 +90,7 @@ type GoMapIterator struct {
|
|||
|
||||
type GoItab struct {
|
||||
it unsafe.Pointer
|
||||
vt *GoType
|
||||
Vt *GoType
|
||||
hv uint32
|
||||
_ [4]byte
|
||||
fn [1]uintptr
|
||||
|
|
@ -186,6 +186,10 @@ func UnpackEface(v interface{}) GoEface {
|
|||
return *(*GoEface)(unsafe.Pointer(&v))
|
||||
}
|
||||
|
||||
func UnpackIface(v interface{}) GoIface {
|
||||
return *(*GoIface)(unsafe.Pointer(&v))
|
||||
}
|
||||
|
||||
func findReflectRtypeItab() *GoItab {
|
||||
v := reflect.TypeOf(struct{}{})
|
||||
return (*GoIface)(unsafe.Pointer(&v)).Itab
|
||||
|
|
|
|||
|
|
@ -22,3 +22,4 @@
|
|||
#include "atof_eisel_lemire.c"
|
||||
#include "atof_native.c"
|
||||
#include "scanning.c"
|
||||
#include "utf8.c"
|
||||
|
|
|
|||
|
|
@ -125,4 +125,8 @@ long skip_positive(const GoString *src, long *p);
|
|||
bool atof_eisel_lemire64(uint64_t mant, int exp10, int sgn, double *val);
|
||||
double atof_native(const char *sp, ssize_t nb, char* dbuf, ssize_t cap);
|
||||
|
||||
ssize_t utf8_validate(const char *sp, ssize_t nb);
|
||||
long validate_string(const GoString *src, long *p);
|
||||
long validate_one(const GoString *src, long *p, StateMachine *m);
|
||||
|
||||
#endif
|
||||
|
|
|
|||
|
|
@ -317,6 +317,259 @@ static inline ssize_t advance_string(const GoString *src, long p, int64_t *ep) {
|
|||
}
|
||||
}
|
||||
|
||||
/* Compares the 16 bytes of `v` with the 16 bytes of `t` and returns a bitmask
 * in which bit i is set when byte i of both vectors is equal. */
static inline int _mm_get_mask(__m128i v, __m128i t) {
    __m128i eq = _mm_cmpeq_epi8(v, t);
    return _mm_movemask_epi8(eq);
}
|
||||
|
||||
/* Control characters are the bytes 0x00 ~ 0x1F.
 * Returns a bitmask in which bit i is set when byte i of `v` is one of them.
 * The comparisons are signed, so bytes >= 0x80 (negative as int8) are
 * excluded by the `> -1` test. */
static inline int _mm_cchars_mask(__m128i v) {
    __m128i non_negative = _mm_cmpgt_epi8(v, _mm_set1_epi8(-1));
    __m128i below_space  = _mm_cmpgt_epi8(_mm_set1_epi8(32), v);
    return _mm_movemask_epi8(_mm_and_si128(non_negative, below_space));
}
|
||||
|
||||
#if USE_AVX2
|
||||
|
||||
static inline int _mm256_get_mask(__m256i v, __m256i t) {
|
||||
return _mm256_movemask_epi8(_mm256_cmpeq_epi8(v, t));
|
||||
}
|
||||
|
||||
// contrl char: 0x00 ~ 0x1F
|
||||
static inline int _mm256_cchars_mask(__m256i v) {
|
||||
__m256i e1 = _mm256_cmpgt_epi8 (v, _mm256_set1_epi8(-1));
|
||||
__m256i e2 = _mm256_cmpgt_epi8 (v, _mm256_set1_epi8(31));
|
||||
return _mm256_movemask_epi8 (_mm256_andnot_si256 (e2, e1));
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
static inline ssize_t advance_validate_string(const GoString *src, long p, int64_t *ep) {
|
||||
char ch;
|
||||
uint64_t es;
|
||||
uint64_t fe;
|
||||
uint64_t os;
|
||||
uint64_t m0;
|
||||
uint64_t m1;
|
||||
uint64_t m2;
|
||||
uint64_t cr = 0;
|
||||
long qp = 0;
|
||||
long np = 0;
|
||||
|
||||
/* prevent out-of-bounds accessing */
|
||||
if (unlikely(src->len == p)) {
|
||||
return -ERR_EOF;
|
||||
}
|
||||
|
||||
/* buffer pointers */
|
||||
size_t nb = src->len;
|
||||
const char * sp = src->buf;
|
||||
const char * ss = src->buf;
|
||||
|
||||
#define ep_init() *ep = -1;
|
||||
#define ep_setc() ep_setx(sp - ss - 1)
|
||||
#define ep_setx(x) if (*ep == -1) { *ep = (x); }
|
||||
|
||||
/* seek to `p` */
|
||||
nb -= p;
|
||||
sp += p;
|
||||
ep_init()
|
||||
|
||||
#if USE_AVX2
|
||||
/* initialize vectors */
|
||||
__m256i v0;
|
||||
__m256i v1;
|
||||
__m256i cq = _mm256_set1_epi8('"');
|
||||
__m256i cx = _mm256_set1_epi8('\\');
|
||||
|
||||
/* partial masks */
|
||||
uint32_t s0, s1;
|
||||
uint32_t t0, t1;
|
||||
uint32_t c0, c1;
|
||||
#else
|
||||
/* initialize vectors */
|
||||
__m128i v0;
|
||||
__m128i v1;
|
||||
__m128i v2;
|
||||
__m128i v3;
|
||||
__m128i cq = _mm_set1_epi8('"');
|
||||
__m128i cx = _mm_set1_epi8('\\');
|
||||
|
||||
/* partial masks */
|
||||
uint32_t s0, s1, s2, s3;
|
||||
uint32_t t0, t1, t2, t3;
|
||||
uint32_t c0, c1, c2, c3;
|
||||
#endif
|
||||
|
||||
#define m0_mask(add) \
|
||||
m1 &= ~cr; \
|
||||
fe = (m1 << 1) | cr; \
|
||||
os = (m1 & ~fe) & ODD_MASK; \
|
||||
es = add(os, m1, &cr) << 1; \
|
||||
m0 &= ~(fe & (es ^ EVEN_MASK));
|
||||
|
||||
/* 64-byte SIMD loop */
|
||||
while (likely(nb >= 64)) {
|
||||
#if USE_AVX2
|
||||
v0 = _mm256_loadu_si256 ((const void *)(sp + 0));
|
||||
v1 = _mm256_loadu_si256 ((const void *)(sp + 32));
|
||||
s0 = _mm256_get_mask(v0, cq);
|
||||
s1 = _mm256_get_mask(v1, cq);
|
||||
t0 = _mm256_get_mask(v0, cx);
|
||||
t1 = _mm256_get_mask(v1, cx);
|
||||
c0 = _mm256_cchars_mask(v0);
|
||||
c1 = _mm256_cchars_mask(v1);
|
||||
m0 = ((uint64_t)s1 << 32) | (uint64_t)s0;
|
||||
m1 = ((uint64_t)t1 << 32) | (uint64_t)t0;
|
||||
m2 = ((uint64_t)c1 << 32) | (uint64_t)c0;
|
||||
#else
|
||||
v0 = _mm_loadu_si128 ((const void *)(sp + 0));
|
||||
v1 = _mm_loadu_si128 ((const void *)(sp + 16));
|
||||
v2 = _mm_loadu_si128 ((const void *)(sp + 32));
|
||||
v3 = _mm_loadu_si128 ((const void *)(sp + 48));
|
||||
s0 = _mm_get_mask(v0, cq);
|
||||
s1 = _mm_get_mask(v1, cq);
|
||||
s2 = _mm_get_mask(v2, cq);
|
||||
s3 = _mm_get_mask(v3, cq);
|
||||
t0 = _mm_get_mask(v0, cx);
|
||||
t1 = _mm_get_mask(v1, cx);
|
||||
t2 = _mm_get_mask(v2, cx);
|
||||
t3 = _mm_get_mask(v3, cx);
|
||||
c0 = _mm_cchars_mask(v0);
|
||||
c1 = _mm_cchars_mask(v1);
|
||||
c2 = _mm_cchars_mask(v2);
|
||||
c3 = _mm_cchars_mask(v3);
|
||||
m0 = ((uint64_t)s3 << 48) | ((uint64_t)s2 << 32) | ((uint64_t)s1 << 16) | (uint64_t)s0;
|
||||
m1 = ((uint64_t)t3 << 48) | ((uint64_t)t2 << 32) | ((uint64_t)t1 << 16) | (uint64_t)t0;
|
||||
m2 = ((uint64_t)c3 << 48) | ((uint64_t)c2 << 32) | ((uint64_t)c1 << 16) | (uint64_t)c0;
|
||||
|
||||
#endif
|
||||
|
||||
/** update first quote position */
|
||||
if (unlikely(m1 != 0)) {
|
||||
ep_setx(sp - ss + __builtin_ctzll(m1))
|
||||
}
|
||||
|
||||
/** mask all the escaped quotes */
|
||||
if (unlikely(m1 != 0 || cr != 0)) {
|
||||
m0_mask(add64)
|
||||
}
|
||||
|
||||
/* get the position of end quote */
|
||||
if (m0 != 0) {
|
||||
qp = sp - ss + __builtin_ctzll(m0) + 1;
|
||||
/* check control chars in JSON string */
|
||||
if (unlikely(m2 !=0 && (np = sp - ss + __builtin_ctzll(m2)) < qp)) {
|
||||
ep_setx(np) // set error position
|
||||
return -ERR_INVAL;
|
||||
}
|
||||
return qp;
|
||||
}
|
||||
|
||||
/* check control chars in JSON string */
|
||||
if (unlikely(m2 != 0)) {
|
||||
ep_setx(sp - ss + __builtin_ctzll(m2))
|
||||
return -ERR_INVAL;
|
||||
}
|
||||
|
||||
/* move to the next block */
|
||||
sp += 64;
|
||||
nb -= 64;
|
||||
}
|
||||
|
||||
/* 32-byte SIMD round */
|
||||
if (likely(nb >= 32)) {
|
||||
#if USE_AVX2
|
||||
v0 = _mm256_loadu_si256 ((const void *)sp);
|
||||
s0 = _mm256_get_mask (v0, cq);
|
||||
t0 = _mm256_get_mask (v0, cx);
|
||||
c0 = _mm256_cchars_mask(v0);
|
||||
m0 = (uint64_t)s0;
|
||||
m1 = (uint64_t)t0;
|
||||
m2 = (uint64_t)c0;
|
||||
#else
|
||||
v0 = _mm_loadu_si128 ((const void *)(sp + 0));
|
||||
v1 = _mm_loadu_si128 ((const void *)(sp + 16));
|
||||
s0 = _mm_get_mask(v0, cq);
|
||||
s1 = _mm_get_mask(v1, cq);
|
||||
t0 = _mm_get_mask(v0, cx);
|
||||
t1 = _mm_get_mask(v1, cx);
|
||||
c0 = _mm_cchars_mask(v0);
|
||||
c1 = _mm_cchars_mask(v1);
|
||||
m0 = ((uint64_t)s1 << 16) | (uint64_t)s0;
|
||||
m1 = ((uint64_t)t1 << 16) | (uint64_t)t0;
|
||||
m2 = ((uint64_t)c1 << 16) | (uint64_t)c0;
|
||||
#endif
|
||||
|
||||
/** update first quote position */
|
||||
if (unlikely(m1 != 0)) {
|
||||
ep_setx(sp - ss + __builtin_ctzll(m1))
|
||||
}
|
||||
|
||||
/** mask all the escaped quotes */
|
||||
if (unlikely(m1 != 0 || cr != 0)) {
|
||||
m0_mask(add32)
|
||||
}
|
||||
|
||||
/* get the position of end quote */
|
||||
if (m0 != 0) {
|
||||
qp = sp - ss + __builtin_ctzll(m0) + 1;
|
||||
/* check control chars in JSON string */
|
||||
if (unlikely(m2 !=0 && (np = sp - ss + __builtin_ctzll(m2)) < qp)) {
|
||||
ep_setx(np) // set error position
|
||||
return -ERR_INVAL;
|
||||
}
|
||||
return qp;
|
||||
}
|
||||
|
||||
/* check control chars in JSON string */
|
||||
if (unlikely(m2 != 0)) {
|
||||
ep_setx(sp - ss + __builtin_ctzll(m2))
|
||||
return -ERR_INVAL;
|
||||
}
|
||||
|
||||
/* move to the next block */
|
||||
sp += 32;
|
||||
nb -= 32;
|
||||
}
|
||||
|
||||
/* check for carry */
|
||||
if (unlikely(cr != 0)) {
|
||||
if (nb == 0) {
|
||||
return -ERR_EOF;
|
||||
} else {
|
||||
ep_setc()
|
||||
sp++, nb--;
|
||||
}
|
||||
}
|
||||
|
||||
/* handle the remaining bytes with scalar code */
|
||||
while (nb-- > 0 && (ch = *sp++) != '"') {
|
||||
if (unlikely(ch == '\\')) {
|
||||
if (nb == 0) {
|
||||
return -ERR_EOF;
|
||||
} else {
|
||||
ep_setc()
|
||||
sp++, nb--;
|
||||
}
|
||||
} else if (unlikely( ch >= 0 && ch <= 0x1f)) { // control chars
|
||||
ep_setc()
|
||||
return -ERR_INVAL;
|
||||
}
|
||||
}
|
||||
|
||||
#undef ep_init
|
||||
#undef ep_setc
|
||||
#undef ep_setx
|
||||
#undef m0_mask
|
||||
|
||||
/* check for quotes */
|
||||
if (ch == '"') {
|
||||
return sp - ss;
|
||||
} else {
|
||||
return -ERR_EOF;
|
||||
}
|
||||
}
|
||||
|
||||
/** Value Scanning Routines **/
|
||||
|
||||
long value(const char *s, size_t n, long p, JsonState *ret, int allow_control) {
|
||||
|
|
@ -724,7 +977,10 @@ static inline long fsm_push(StateMachine *self, int vt) {
|
|||
}
|
||||
}
|
||||
|
||||
static inline long fsm_exec(StateMachine *self, const GoString *src, long *p) {
|
||||
#define VALID_DEFAULT 0 // basic validate, except JSON string.
|
||||
#define VALID_FULL 1 // also validate JSON string, including control chars or invalid UTF-8.
|
||||
|
||||
static inline long fsm_exec(StateMachine *self, const GoString *src, long *p, int validate_flag) {
|
||||
int vt;
|
||||
char ch;
|
||||
long vi = -1;
|
||||
|
|
@ -806,7 +1062,11 @@ static inline long fsm_exec(StateMachine *self, const GoString *src, long *p) {
|
|||
/* the quote of the first key */
|
||||
case '"': {
|
||||
FSM_REPL(self, FSM_OBJ);
|
||||
FSM_XERR(skip_string(src, p));
|
||||
if (validate_flag == VALID_DEFAULT) {
|
||||
FSM_XERR(skip_string(src, p));
|
||||
} else if (validate_flag == VALID_FULL) {
|
||||
FSM_XERR(validate_string(src, p));
|
||||
}
|
||||
FSM_XERR(fsm_push(self, FSM_ELEM));
|
||||
continue;
|
||||
}
|
||||
|
|
@ -830,9 +1090,16 @@ static inline long fsm_exec(StateMachine *self, const GoString *src, long *p) {
|
|||
case 'n' : FSM_XERR(advance_dword(src, p, 1, *p - 1, VS_NULL)); break;
|
||||
case 't' : FSM_XERR(advance_dword(src, p, 1, *p - 1, VS_TRUE)); break;
|
||||
case 'f' : FSM_XERR(advance_dword(src, p, 0, *p - 1, VS_ALSE)); break;
|
||||
case '"' : FSM_XERR(skip_string(src, p)); break;
|
||||
case '[' : FSM_XERR(fsm_push(self, FSM_ARR_0)); break;
|
||||
case '{' : FSM_XERR(fsm_push(self, FSM_OBJ_0)); break;
|
||||
case '"' : {
|
||||
if (validate_flag == VALID_DEFAULT) {
|
||||
FSM_XERR(skip_string(src, p));
|
||||
} else if (validate_flag == VALID_FULL) {
|
||||
FSM_XERR(validate_string(src, p));
|
||||
}
|
||||
break;
|
||||
}
|
||||
case 0 : return -ERR_EOF;
|
||||
default : return -ERR_INVAL;
|
||||
}
|
||||
|
|
@ -1061,17 +1328,17 @@ check_index:
|
|||
|
||||
long skip_one(const GoString *src, long *p, StateMachine *m) {
|
||||
fsm_init(m, FSM_VAL);
|
||||
return fsm_exec(m, src, p);
|
||||
return fsm_exec(m, src, p, VALID_DEFAULT);
|
||||
}
|
||||
|
||||
long skip_array(const GoString *src, long *p, StateMachine *m) {
|
||||
fsm_init(m, FSM_ARR_0);
|
||||
return fsm_exec(m, src, p);
|
||||
return fsm_exec(m, src, p, VALID_DEFAULT);
|
||||
}
|
||||
|
||||
long skip_object(const GoString *src, long *p, StateMachine *m) {
|
||||
fsm_init(m, FSM_OBJ_0);
|
||||
return fsm_exec(m, src, p);
|
||||
return fsm_exec(m, src, p, VALID_DEFAULT);
|
||||
}
|
||||
|
||||
long skip_string(const GoString *src, long *p) {
|
||||
|
|
@ -1089,6 +1356,28 @@ long skip_string(const GoString *src, long *p) {
|
|||
}
|
||||
}
|
||||
|
||||
long validate_string(const GoString *src, long *p) {
|
||||
int64_t v;
|
||||
ssize_t q = *p - 1;
|
||||
ssize_t e = advance_validate_string(src, *p, &v);
|
||||
|
||||
/* check for errors in string advance */
|
||||
if (e < 0) {
|
||||
*p = e == -ERR_EOF ? src->len : v;
|
||||
return e;
|
||||
}
|
||||
|
||||
/* check for errors in UTF-8 validate */
|
||||
ssize_t nb = e - *p - 1;
|
||||
ssize_t r = utf8_validate(src->buf + *p, nb);
|
||||
if (r >= 0) {
|
||||
*p += r;
|
||||
return -ERR_INVAL;
|
||||
}
|
||||
*p = e;
|
||||
return q;
|
||||
}
|
||||
|
||||
long skip_negative(const GoString *src, long *p) {
|
||||
long i = *p;
|
||||
long r = skip_number(src->buf + i, src->len - i);
|
||||
|
|
@ -1118,3 +1407,8 @@ long skip_positive(const GoString *src, long *p) {
|
|||
*p += r - 1;
|
||||
return i;
|
||||
}
|
||||
|
||||
/*
 * validate_one runs the state machine `m` over one JSON value starting at
 * `*p`, in VALID_FULL mode, so string contents are checked as well as the
 * structure. Returns whatever fsm_exec returns.
 */
long validate_one(const GoString *src, long *p, StateMachine *m) {
    long rv;

    fsm_init(m, FSM_VAL);
    rv = fsm_exec(m, src, p, VALID_FULL);
    return rv;
}
|
||||
183
native/utf8.c
Normal file
183
native/utf8.c
Normal file
|
|
@ -0,0 +1,183 @@
|
|||
/*
|
||||
* Copyright (c) 2009 The Go Authors. All rights reserved.
|
||||
* Modifications Copyright 2021 ByteDance Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "native.h"
|
||||
|
||||
/* ASCII bytes are 0x00 ~ 0x7F, i.e. bytes whose top bit is clear.
 * Returns a bitmask in which bit i is set when byte i of `vv` has its top bit
 * set, so a zero result means all 16 bytes are ASCII. */
static inline int _mm_ascii_mask(__m128i vv) {
    int top_bits = _mm_movemask_epi8(vv);
    return top_bits;
}
|
||||
|
||||
#if USE_AVX2
|
||||
|
||||
// ascii: 0x00 ~ 0x7F
|
||||
static inline int _mm256_ascii_mask(__m256i vv) {
|
||||
return _mm256_movemask_epi8(vv);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
/* Reports whether `ch` is an ASCII byte, i.e. its top bit is clear. */
static inline bool is_ascii(uint8_t ch) {
    return (ch & 0x80) == 0;
}
|
||||
|
||||
/* Default lowest and highest value of a UTF-8 continuation byte. */
static const uint8_t locb = 0x80;
static const uint8_t hicb = 0xBF;

/* Entries of the `first` table. The low 3 bits hold the sequence size in
 * bytes; the high 4 bits hold the index into `ranges` that constrains the
 * second byte. */
static const uint8_t xx = 0xF1; /* invalid: size 1 */
static const uint8_t as = 0xF0; /* ASCII: size 1 */
static const uint8_t s1 = 0x02; /* accept 0, size 2 */
static const uint8_t s2 = 0x13; /* accept 1, size 3 */
static const uint8_t s3 = 0x03; /* accept 0, size 3 */
static const uint8_t s4 = 0x23; /* accept 2, size 3 */
static const uint8_t s5 = 0x34; /* accept 3, size 4 */
static const uint8_t s6 = 0x04; /* accept 0, size 4 */
static const uint8_t s7 = 0x44; /* accept 4, size 4 */
|
||||
|
||||
// first is information about the first byte in a UTF-8 sequence.
|
||||
static const uint8_t first[256] = {
|
||||
// 1 2 3 4 5 6 7 8 9 A B C D E F
|
||||
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x00-0x0F
|
||||
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x10-0x1F
|
||||
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x20-0x2F
|
||||
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x30-0x3F
|
||||
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x40-0x4F
|
||||
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x50-0x5F
|
||||
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x60-0x6F
|
||||
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x70-0x7F
|
||||
// 1 2 3 4 5 6 7 8 9 A B C D E F
|
||||
xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x80-0x8F
|
||||
xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x90-0x9F
|
||||
xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xA0-0xAF
|
||||
xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xB0-0xBF
|
||||
xx, xx, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xC0-0xCF
|
||||
s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xD0-0xDF
|
||||
s2, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s4, s3, s3, // 0xE0-0xEF
|
||||
s5, s6, s6, s6, s7, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xF0-0xFF
|
||||
};
|
||||
|
||||
/* AcceptRange is the inclusive range [lo, hi] of values allowed for the
 * second byte of a UTF-8 sequence. */
struct AcceptRange {
    /* lowest value for second byte */
    uint8_t lo;
    /* highest value for second byte */
    uint8_t hi;
};
|
||||
|
||||
// ranges has size 16 to avoid bounds checks in the code that uses it.
|
||||
const static struct AcceptRange ranges[5] = {
|
||||
{locb, hicb}, // 0
|
||||
{0xA0, hicb}, // 1
|
||||
{locb, 0x9F}, // 2
|
||||
{0x90, hicb}, // 3
|
||||
{locb, 0x8F}, // 4
|
||||
};
|
||||
|
||||
// UTF-8 code point | first byte | second byte | third byte | fourth byte
|
||||
// U+0000 - U+007F | 0___ ____
|
||||
// U+0080 - U+07FF | 110_ ____ | 10__ ____
|
||||
// U+0800 - U+D7FF | 1110 ____ | 10__ ____ | 10__ ____
|
||||
// U+D800 - U+DFFF | reserved for UTF-16 surrogate pairs
|
||||
// U+E000 - U+FFFF | 1110 ____ | 10__ ____ | 10__ ____
|
||||
// U+10000 - U+10FFFF | 1111 0___ | 10__ ____ | 10__ ____ | 10__ ____
|
||||
// checks non-ascii characters, and returns the utf-8 length
|
||||
static inline ssize_t nonascii_is_utf8(const uint8_t* sp, size_t n) {
|
||||
uint8_t mask = first[sp[0]];
|
||||
uint8_t size = mask & 7;
|
||||
if (n < size) {
|
||||
return 0;
|
||||
}
|
||||
struct AcceptRange accept = ranges[mask >> 4];
|
||||
switch (size) {
|
||||
case 4 : if (sp[3] < locb || hicb < sp[3]) return 0;
|
||||
case 3 : if (sp[2] < locb || hicb < sp[2]) return 0;
|
||||
case 2 : if (sp[1] < accept.lo || accept.hi < sp[1]) return 0; break;
|
||||
case 1 : return 0; // invalid chars
|
||||
case 0 : return 1; // ascii chars
|
||||
default: return 0;
|
||||
}
|
||||
return size;
|
||||
}
|
||||
|
||||
/*
 * find_non_ascii returns the offset of the first non-ASCII byte (top bit set)
 * within the `nb` bytes starting at `sp`, or -1 when every byte is ASCII.
 */
ssize_t find_non_ascii(const uint8_t*sp, ssize_t nb) {
    const uint8_t *const base = sp;
    int64_t              bits;

#if USE_AVX2
    /* 32 bytes per round */
    for (; nb >= 32; nb -= 32, sp += 32) {
        bits = _mm256_ascii_mask(_mm256_loadu_si256((const void *)(sp)));
        if (unlikely(bits != 0)) {
            return sp - base + __builtin_ctzll(bits);
        }
    }

    /* clear upper half to avoid AVX-SSE transition penalty */
    _mm256_zeroupper();
#endif

    /* 16 bytes per round */
    for (; nb >= 16; nb -= 16, sp += 16) {
        bits = _mm_ascii_mask(_mm_loadu_si128((const void *)(sp)));
        if (unlikely(bits != 0)) {
            return sp - base + __builtin_ctzll(bits);
        }
    }

    /* remaining bytes, one at a time */
    for (; nb > 0; nb--, sp++) {
        if (!is_ascii(*sp)) {
            return sp - base;
        }
    }

    /* nothing found */
    return -1;
}
|
||||
|
||||
/*
 * utf8_validate checks whether the `nb` bytes starting at `sp` are valid
 * UTF-8. Returns -1 when they are valid, otherwise the offset of the first
 * byte of the offending sequence.
 */
ssize_t utf8_validate(const char *sp, ssize_t nb) {
    const uint8_t *const begin = (const uint8_t *)sp;
    const uint8_t       *cur   = begin;

    while (nb > 0) {
        /* Skip a run of ASCII bytes. When the current byte is already
         * non-ASCII the search is skipped, which keeps runs of consecutive
         * non-ASCII characters cheap. */
        if (is_ascii(*cur)) {
            ssize_t skip = find_non_ascii(cur, nb);

            /* no non-ascii byte left in the string */
            if (skip == -1 || skip >= nb) {
                return -1;
            }
            cur += skip;
            nb  -= skip;
        }

        /* validate the non-ascii sequence */
        ssize_t len = nonascii_is_utf8(cur, nb);
        if (unlikely(len == 0)) {
            return cur - begin;
        }
        cur += len;
        nb  -= len;
    }

    return -1;
}
|
||||
6
sonic.go
6
sonic.go
|
|
@ -28,10 +28,6 @@ import (
|
|||
`github.com/bytedance/sonic/internal/rt`
|
||||
)
|
||||
|
||||
const (
|
||||
_SpaceMask = (1 << ' ') | (1 << '\t') | (1 << '\r') | (1 << '\n')
|
||||
)
|
||||
|
||||
// Marshal returns the JSON encoding of v.
|
||||
func Marshal(val interface{}) ([]byte, error) {
|
||||
return encoder.Encode(val, 0)
|
||||
|
|
@ -56,7 +52,7 @@ func UnmarshalString(buf string, val interface{}) error {
|
|||
|
||||
/* skip all the trailing spaces */
|
||||
if pos != len(buf) {
|
||||
for pos < len(buf) && (_SpaceMask & (1 << buf[pos])) != 0 {
|
||||
for pos < len(buf) && (types.SPACE_MASK & (1 << buf[pos])) != 0 {
|
||||
pos++
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in a new issue