2
0
Fork 0
mirror of https://github.com/ii64/sonic.git synced 2026-06-21 00:46:43 +08:00

feat: repl invalid utf8 in serde by option (#357)

This commit is contained in:
liu 2023-02-10 18:55:27 +08:00 committed by GitHub
parent f87d87de7a
commit 02865de676
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
52 changed files with 23478 additions and 20789 deletions

View file

@ -1,6 +1,6 @@
name: License Check name: License Check
on: push on: pull_request
jobs: jobs:
build: build:

View file

@ -1,6 +1,6 @@
name: Push Check Go1.18-Linux-X64 name: Push Check Go1.18-Linux-X64
on: push on: pull_request
jobs: jobs:
build: build:

View file

@ -1,6 +1,6 @@
name: Push Check Linux-ARM name: Push Check Linux-ARM
on: push on: pull_request
jobs: jobs:
build: build:

View file

@ -1,6 +1,6 @@
name: Push Check Linux-X64 name: Push Check Linux-X64
on: push on: pull_request
jobs: jobs:
build: build:

View file

@ -1,6 +1,6 @@
name: Push Check Linux-Qemu name: Push Check Linux-Qemu
on: push on: pull_request
jobs: jobs:
build: build:

View file

@ -1,6 +1,6 @@
name: Push Check Windows-X64 name: Push Check Windows-X64
on: push on: pull_request
jobs: jobs:
build: build:

View file

@ -35,7 +35,7 @@ CC_amd64 := clang
ASM2ASM_amd64 := tools/asm2asm/asm2asm.py ASM2ASM_amd64 := tools/asm2asm/asm2asm.py
CFLAGS := -mno-red-zone CFLAGS := -mno-red-zone
CFLAGS += -arch x86_64 CFLAGS += -target x86_64-apple-macos11
CFLAGS += -fno-asynchronous-unwind-tables CFLAGS += -fno-asynchronous-unwind-tables
CFLAGS += -fno-builtin CFLAGS += -fno-builtin
CFLAGS += -fno-exceptions CFLAGS += -fno-exceptions
@ -100,8 +100,10 @@ endef
all: ${ARCH} all: ${ARCH}
clean: clean:
rm -vfr ${TMP_DIR}/{sse,avx,avx2} for arch in ${ARCH}; do \
rm -vfr ${OUT_DIR}/{sse,avx,avx2} rm -vfr ${TMP_DIR}/$${arch}; \
rm -vfr ${OUT_DIR}/$${arch}; \
done
$(foreach \ $(foreach \
arch, \ arch, \

10
api.go
View file

@ -66,8 +66,8 @@ type Config struct {
// CopyString indicates decoder to decode string values by copying instead of referring. // CopyString indicates decoder to decode string values by copying instead of referring.
CopyString bool CopyString bool
// ValidateString indicates decoder to valid string values: decoder will return errors when // ValidateString indicates decoder and encoder to valid string values: decoder will return errors
// invalid UTF-8 chars or unescaped control chars(\u0000-\u001f) in the string value of JSON. // when unescaped control chars(\u0000-\u001f) in the string value of JSON.
ValidateString bool ValidateString bool
} }
@ -81,6 +81,7 @@ var (
SortMapKeys: true, SortMapKeys: true,
CompactMarshaler: true, CompactMarshaler: true,
CopyString : true, CopyString : true,
ValidateString : true,
}.Froze() }.Froze()
// ConfigFastest is the fastest config of APIs, aiming at speed. // ConfigFastest is the fastest config of APIs, aiming at speed.
@ -164,12 +165,15 @@ func UnmarshalString(buf string, val interface{}) error {
return ConfigDefault.UnmarshalFromString(buf, val) return ConfigDefault.UnmarshalFromString(buf, val)
} }
// Get searches the given path json, // Get searches the given path from json,
// and returns its representing ast.Node. // and returns its representing ast.Node.
// //
// Each path arg must be integer or string: // Each path arg must be integer or string:
// - Integer means searching current node as array // - Integer means searching current node as array
// - String means searching current node as object // - String means searching current node as object
//
// Note: this API expects the JSON to be at least well-formed;
// otherwise it may return unexpected results.
func Get(src []byte, path ...interface{}) (ast.Node, error) { func Get(src []byte, path ...interface{}) (ast.Node, error) {
return GetFromString(string(src), path...) return GetFromString(string(src), path...)
} }

View file

@ -691,7 +691,10 @@ func (self *Node) AddAny(val interface{}) error {
} }
// GetByPath load given path on demands, // GetByPath load given path on demands,
// which only ensure nodes before this path got parsed // which only ensure nodes before this path got parsed.
//
// Note: this API expects the JSON to be at least well-formed;
// otherwise it may return unexpected results.
func (self *Node) GetByPath(path ...interface{}) *Node { func (self *Node) GetByPath(path ...interface{}) *Node {
if !self.Valid() { if !self.Valid() {
return self return self

View file

@ -34,7 +34,6 @@ import (
`strings` `strings`
`testing` `testing`
`time` `time`
`unicode/utf8`
`unsafe` `unsafe`
`github.com/bytedance/sonic/decoder` `github.com/bytedance/sonic/decoder`
@ -1011,8 +1010,8 @@ var unmarshalTests = []unmarshalTest{
{in: "\"\x00\"", ptr: new(interface{}), err: fmt.Errorf("json: invald char"), validateString: true}, {in: "\"\x00\"", ptr: new(interface{}), err: fmt.Errorf("json: invald char"), validateString: true},
{in: "\"\x00\"", ptr: new(string), err: fmt.Errorf("json: invald char"), validateString: true}, {in: "\"\x00\"", ptr: new(string), err: fmt.Errorf("json: invald char"), validateString: true},
{in: "\"\xff\"", ptr: new(interface{}), err: fmt.Errorf("json: invald char"), validateString: true}, {in: "\"\xff\"", ptr: new(interface{}), out: interface{}("\ufffd"), validateString: true},
{in: "\"\xff\"", ptr: new(string), err: fmt.Errorf("json: invald char"), validateString: true}, {in: "\"\xff\"", ptr: new(string), out: "\ufffd", validateString: true},
{in: "\"\x00\"", ptr: new(interface{}), out: interface{}("\x00"), validateString: false}, {in: "\"\x00\"", ptr: new(interface{}), out: interface{}("\x00"), validateString: false},
{in: "\"\x00\"", ptr: new(string), out: "\x00", validateString: false}, {in: "\"\x00\"", ptr: new(string), out: "\x00", validateString: false},
{in: "\"\xff\"", ptr: new(interface{}), out: interface{}("\xff"), validateString: false}, {in: "\"\xff\"", ptr: new(interface{}), out: interface{}("\xff"), validateString: false},
@ -1147,7 +1146,6 @@ func TestUnmarshal(t *testing.T) {
} }
dec := decoder.NewDecoder(tt.in) dec := decoder.NewDecoder(tt.in)
validUtf8 := true
if tt.useNumber { if tt.useNumber {
dec.UseNumber() dec.UseNumber()
} }
@ -1156,10 +1154,9 @@ func TestUnmarshal(t *testing.T) {
} }
if tt.validateString { if tt.validateString {
dec.ValidateString() dec.ValidateString()
validUtf8 = utf8.Valid([]byte(tt.in))
} }
if err := dec.Decode(v.Interface()); (err == nil) != (tt.err == nil && validUtf8) { if err := dec.Decode(v.Interface()); (err == nil) != (tt.err == nil) {
spew.Dump(tt.in) spew.Dump(tt)
t.Fatalf("#%d: %v, want %v", i, err, tt.err) t.Fatalf("#%d: %v, want %v", i, err, tt.err)
continue continue
} else if err != nil { } else if err != nil {
@ -2524,3 +2521,68 @@ func TestChangeTool(t *testing.T) {
} }
} }
// TestDecoder_LongestInvalidUtf8 decodes JSON strings whose payload consists
// solely of the byte 0x80 (not valid UTF-8 on its own), using lengths on and
// around 4096 plus one much longer input, and checks each against
// encoding/json through testDecodeInvalidUtf8.
func TestDecoder_LongestInvalidUtf8(t *testing.T) {
	sizes := []int{4096, 4095, 4097, 12345}
	for _, n := range sizes {
		payload := "\"" + strings.Repeat("\x80", n) + "\""
		testDecodeInvalidUtf8(t, []byte(payload))
	}
}
// testDecodeInvalidUtf8 unmarshals data into a string with both ConfigStd and
// encoding/json and requires the two to agree: either both report an error,
// or both succeed and produce the same decoded string.
func testDecodeInvalidUtf8(t *testing.T, data []byte) {
	var sgot, jgot string
	serr := ConfigStd.Unmarshal(data, &sgot)
	jerr := json.Unmarshal(data, &jgot)
	// encoding/json is the reference implementation, so its result belongs in
	// the "expected" slot; this keeps testify's failure output labelled correctly.
	assert.Equal(t, jerr != nil, serr != nil, "json err: %v, sonic err: %v", jerr, serr)
	if jerr == nil {
		assert.Equal(t, jgot, sgot)
	}
}
// needEscape reports whether b is a byte the random JSON generators prefix
// with a backslash: any control byte below 0x20, a double quote, or a
// backslash.
func needEscape(b byte) bool {
	if b < 0x20 {
		return true
	}
	switch b {
	case '"', '\\':
		return true
	default:
		return false
	}
}
// genRandJsonBytes builds a double-quoted byte sequence holding length
// uniformly random bytes (0x00-0xff). Every byte for which needEscape is true
// is preceded by a backslash. The result is not guaranteed to be valid UTF-8
// or valid JSON.
func genRandJsonBytes(length int) []byte {
	var out bytes.Buffer
	out.WriteByte('"')
	for remaining := length; remaining > 0; remaining-- {
		b := byte(rand.Intn(0xff + 1))
		if needEscape(b) {
			out.WriteByte('\\')
		}
		out.WriteByte(b)
	}
	out.WriteByte('"')
	return out.Bytes()
}
// genRandJsonRune builds a double-quoted byte sequence holding length random
// code points drawn from 0 through 0x10FFFF. ASCII values for which
// needEscape is true are written as a backslash followed by the raw byte;
// everything else is written with bytes.Buffer.WriteRune.
func genRandJsonRune(length int) []byte {
	var out bytes.Buffer
	out.WriteByte('"')
	for remaining := length; remaining > 0; remaining-- {
		cp := rand.Intn(0x10FFFF + 1)
		if cp >= 0x80 || !needEscape(byte(cp)) {
			out.WriteRune(rune(cp))
			continue
		}
		out.WriteByte('\\')
		out.WriteByte(byte(cp))
	}
	out.WriteByte('"')
	return out.Bytes()
}
func TestDecoder_RandomInvalidUtf8(t *testing.T) {
nums := 1000
maxLen := 1000
for i := 0; i < nums; i++ {
length := rand.Intn(maxLen)
testDecodeInvalidUtf8(t, genRandJsonBytes(length))
testDecodeInvalidUtf8(t, genRandJsonRune(length))
}
}

View file

@ -25,6 +25,7 @@ import (
`github.com/bytedance/sonic/internal/native/types` `github.com/bytedance/sonic/internal/native/types`
`github.com/bytedance/sonic/internal/rt` `github.com/bytedance/sonic/internal/rt`
`github.com/bytedance/sonic/option` `github.com/bytedance/sonic/option`
`github.com/bytedance/sonic/utf8`
) )
const ( const (
@ -80,9 +81,39 @@ func (self *Decoder) Reset(s string) {
// self.f = 0 // self.f = 0
} }
// CheckTrailings verifies that nothing but whitespace follows the decoder's
// current position in its input.
//
// It returns nil when every remaining character is in types.SPACE_MASK (or
// when nothing remains). Otherwise it returns a SyntaxError with code
// types.ERR_INVALID_CHAR whose Pos is the offset where the scan stopped.
// The decoder's own position is left untouched.
func (self *Decoder) CheckTrailings() error {
	src := self.s
	end := len(src)
	idx := self.i

	/* step over trailing spaces; a no-op when idx is already at or past the end */
	for idx < end && (types.SPACE_MASK & (1 << src[idx])) != 0 {
		idx++
	}

	/* anything other than EOF here is junk after the JSON value */
	if idx != end {
		return SyntaxError {
			Src  : src,
			Pos  : idx,
			Code : types.ERR_INVALID_CHAR,
		}
	}
	return nil
}
// Decode parses the JSON-encoded data from current position and stores the result // Decode parses the JSON-encoded data from current position and stores the result
// in the value pointed to by val. // in the value pointed to by val.
func (self *Decoder) Decode(val interface{}) error { func (self *Decoder) Decode(val interface{}) error {
/* validate json if needed */
if (self.f & (1 << _F_validate_string)) != 0 && !utf8.ValidateString(self.s){
dbuf := utf8.CorrectWith(nil, rt.Str2Mem(self.s), "\ufffd")
self.s = rt.Mem2Str(dbuf)
}
vv := rt.UnpackEface(val) vv := rt.UnpackEface(val)
vp := vv.Value vp := vv.Value
@ -99,7 +130,6 @@ func (self *Decoder) Decode(val interface{}) error {
/* create a new stack, and call the decoder */ /* create a new stack, and call the decoder */
sb, etp := newStack(), rt.PtrElem(vv.Type) sb, etp := newStack(), rt.PtrElem(vv.Type)
nb, err := decodeTypedPointer(self.s, self.i, etp, vp, sb, self.f) nb, err := decodeTypedPointer(self.s, self.i, etp, vp, sb, self.f)
/* return the stack back */ /* return the stack back */
self.i = nb self.i = nb
freeStack(sb) freeStack(sb)

View file

@ -345,7 +345,6 @@ func TestDecoder_Generic(t *testing.T) {
pos, err := decode(TwitterJson, &v, false) pos, err := decode(TwitterJson, &v, false)
assert.NoError(t, err) assert.NoError(t, err)
assert.Equal(t, len(TwitterJson), pos) assert.Equal(t, len(TwitterJson), pos)
spew.Dump(v)
} }
func TestDecoder_Binding(t *testing.T) { func TestDecoder_Binding(t *testing.T) {

View file

@ -34,6 +34,7 @@ import (
`testing` `testing`
`time` `time`
`unsafe` `unsafe`
`strings`
`github.com/bytedance/sonic/encoder` `github.com/bytedance/sonic/encoder`
`github.com/stretchr/testify/assert` `github.com/stretchr/testify/assert`
@ -52,7 +53,6 @@ func TestMain(m *testing.M) {
runtime.GC() runtime.GC()
debug.FreeOSMemory() debug.FreeOSMemory()
} }
println("stop GC looping!")
}() }()
time.Sleep(time.Millisecond) time.Sleep(time.Millisecond)
m.Run() m.Run()
@ -1169,3 +1169,32 @@ func TestMarshalNullNil(t *testing.T) {
assert.Nil(t, e) assert.Nil(t, e)
assert.Equal(t, `{"A":[],"B":{}}`, string(o)) assert.Equal(t, `{"A":[],"B":{}}`, string(o))
} }
// TestEncoder_LongestInvalidUtf8 marshals quoted payloads made solely of the
// byte 0x80, using lengths on and around 4096 plus one much longer input, and
// compares the result with encoding/json through testEncodeInvalidUtf8.
func TestEncoder_LongestInvalidUtf8(t *testing.T) {
	sizes := []int{4096, 4095, 4097, 12345}
	for _, n := range sizes {
		payload := "\"" + strings.Repeat("\x80", n) + "\""
		testEncodeInvalidUtf8(t, []byte(payload))
	}
}
// testEncodeInvalidUtf8 marshals data with both encoding/json and ConfigStd
// and requires the two to agree: either both report an error, or both succeed
// with equal output.
//
// NOTE(review): data is passed as a []byte, which encoding/json encodes as a
// base64 string, so the invalid UTF-8 bytes never reach the string encoder on
// the encoding/json side. Marshaling string(data) looks like the intended
// input — confirm before changing, since it may surface unrelated escaping
// differences between the two encoders.
func testEncodeInvalidUtf8(t *testing.T, data []byte) {
	jgot, jerr := json.Marshal(data)
	sgot, serr := ConfigStd.Marshal(data)
	// encoding/json is the reference implementation, so its result belongs in
	// the "expected" slot; this keeps testify's failure output labelled correctly.
	assert.Equal(t, jerr != nil, serr != nil, "json err: %v, sonic err: %v", jerr, serr)
	if jerr == nil {
		assert.Equal(t, jgot, sgot)
	}
}
func TestEncoder_RandomInvalidUtf8(t *testing.T) {
nums := 1000
maxLen := 1000
for i := 0; i < nums; i++ {
testEncodeInvalidUtf8(t, genRandJsonBytes(maxLen))
testEncodeInvalidUtf8(t, genRandJsonRune(maxLen))
}
}

View file

@ -21,11 +21,11 @@ import (
`encoding/json` `encoding/json`
`reflect` `reflect`
`runtime` `runtime`
`unsafe`
`github.com/bytedance/sonic/internal/native` `github.com/bytedance/sonic/internal/native`
`github.com/bytedance/sonic/internal/native/types` `github.com/bytedance/sonic/internal/native/types`
`github.com/bytedance/sonic/internal/rt` `github.com/bytedance/sonic/internal/rt`
`github.com/bytedance/sonic/utf8`
`github.com/bytedance/sonic/option` `github.com/bytedance/sonic/option`
) )
@ -38,6 +38,7 @@ const (
bitCompactMarshaler bitCompactMarshaler
bitNoQuoteTextMarshaler bitNoQuoteTextMarshaler
bitNoNullSliceOrMap bitNoNullSliceOrMap
bitValidateString
// used for recursive compile // used for recursive compile
bitPointerValue = 63 bitPointerValue = 63
@ -66,6 +67,10 @@ const (
// instead of 'null' // instead of 'null'
NoNullSliceOrMap Options = 1 << bitNoNullSliceOrMap NoNullSliceOrMap Options = 1 << bitNoNullSliceOrMap
// ValidateString indicates that encoder should validate the input string
// before encoding it into JSON.
ValidateString Options = 1 << bitValidateString
// CompatibleWithStd is used to be compatible with std encoder. // CompatibleWithStd is used to be compatible with std encoder.
CompatibleWithStd Options = SortMapKeys | EscapeHTML | CompactMarshaler CompatibleWithStd Options = SortMapKeys | EscapeHTML | CompactMarshaler
) )
@ -100,6 +105,15 @@ func (self *Encoder) SetEscapeHTML(f bool) {
} }
} }
// SetValidateString switches the ValidateString option on this encoder:
// the option bit is set when f is true and cleared when f is false.
func (self *Encoder) SetValidateString(f bool) {
	if !f {
		self.Opts &^= ValidateString
		return
	}
	self.Opts |= ValidateString
}
// SetCompactMarshaler specifies if option CompactMarshaler opens // SetCompactMarshaler specifies if option CompactMarshaler opens
func (self *Encoder) SetCompactMarshaler(f bool) { func (self *Encoder) SetCompactMarshaler(f bool) {
if f { if f {
@ -156,7 +170,7 @@ func Encode(val interface{}, opts Options) ([]byte, error) {
return nil, err return nil, err
} }
if opts & EscapeHTML != 0 { if opts & EscapeHTML != 0 || opts & ValidateString != 0 {
return buf, nil return buf, nil
} }
@ -189,6 +203,12 @@ func EncodeInto(buf *[]byte, val interface{}, opts Options) error {
*buf = dest *buf = dest
} }
if opts & ValidateString != 0 && !utf8.Validate(*buf) {
dest := utf8.CorrectWith(nil, *buf, `\ufffd`)
freeBytes(*buf) // free origin used buffer
*buf = dest
}
/* avoid GC ahead */ /* avoid GC ahead */
runtime.KeepAlive(buf) runtime.KeepAlive(buf)
runtime.KeepAlive(efv) runtime.KeepAlive(efv)
@ -203,38 +223,8 @@ var typeByte = rt.UnpackType(reflect.TypeOf(byte(0)))
// For historical reasons, web browsers don't honor standard HTML // For historical reasons, web browsers don't honor standard HTML
// escaping within <script> tags, so an alternative JSON encoding must // escaping within <script> tags, so an alternative JSON encoding must
// be used. // be used.
func HTMLEscape(dest []byte, src []byte) []byte { func HTMLEscape(dst []byte, src []byte) []byte {
nb := len(src) return htmlEscape(dst, src)
// initilize dest buffer
cap := nb * 6 / 5
if dest == nil {
dest = make([]byte, 0, cap)
}
ds := (*rt.GoSlice)(unsafe.Pointer(&dest))
sp := (*rt.GoSlice)(unsafe.Pointer(&src)).Ptr
ds.Len = 0
if (ds.Cap < cap) {
*ds = growslice(typeByte, *ds, cap)
}
for nb > 0 {
dp := unsafe.Pointer(uintptr(ds.Ptr) + uintptr(ds.Len))
dn := ds.Cap - ds.Len
ret := native.HTMLEscape(sp, nb, dp, &dn)
ds.Len += dn
if ret >= 0 {
break
}
ret = ^ret
nb -= ret
*ds = growslice(typeByte, *ds, ds.Cap * 2)
sp = unsafe.Pointer(uintptr(sp) + uintptr(ret))
}
return dest
} }
// EncodeIndented is like Encode but applies Indent to format the output. // EncodeIndented is like Encode but applies Indent to format the output.
@ -293,6 +283,8 @@ func Pretouch(vt reflect.Type, opts ...option.CompileOption) error {
// Valid validates json and returns first non-blank character position, // Valid validates json and returns first non-blank character position,
// if it is only one valid json value. // if it is only one valid json value.
// Otherwise returns invalid character position using start. // Otherwise returns invalid character position using start.
//
// Note: it does not check for invalid UTF-8 characters.
func Valid(data []byte) (ok bool, start int) { func Valid(data []byte) (ok bool, start int) {
n := len(data) n := len(data)
if n == 0 { if n == 0 {
@ -303,13 +295,17 @@ func Valid(data []byte) (ok bool, start int) {
m := types.NewStateMachine() m := types.NewStateMachine()
ret := native.ValidateOne(&s, &p, m) ret := native.ValidateOne(&s, &p, m)
types.FreeStateMachine(m) types.FreeStateMachine(m)
if ret < 0 { if ret < 0 {
return false, p-1 return false, p-1
} }
/* check for trailing spaces */
for ;p < n; p++ { for ;p < n; p++ {
if (types.SPACE_MASK & (1 << data[p])) == 0 { if (types.SPACE_MASK & (1 << data[p])) == 0 {
return false, p return false, p
} }
} }
return true, ret return true, ret
} }

View file

@ -113,6 +113,39 @@ func encodeTextMarshaler(buf *[]byte, val encoding.TextMarshaler, opt Options) e
} }
} }
// htmlEscape runs native.HTMLEscape over src, writing the output into dst
// starting at dst's current length, and returns the resulting slice. The
// destination is grown and the call retried whenever the native routine
// reports (through a negative return value) that it ran out of room.
//
// NOTE(review): the exact contract of native.HTMLEscape is not visible here;
// the comments below describe only how its results are consumed.
func htmlEscape(dst []byte, src []byte) []byte {
// sidx is the offset into src of the first byte not yet consumed.
var sidx int
// NOTE(review): appending zero elements leaves a nil dst nil; it is the grow
// below that allocates in that case — confirm the intent of this line.
dst = append(dst, src[:0]...) // avoid check nil dst
// View both slices through their raw headers so Ptr/Len/Cap can be read and
// (for dst) rewritten in place.
sbuf := (*rt.GoSlice)(unsafe.Pointer(&src))
dbuf := (*rt.GoSlice)(unsafe.Pointer(&dst))
/* grow dst if it is shorter */
if cap(dst) - len(dst) < len(src) + native.BufPaddingSize {
// NOTE(review): the local name shadows the builtin cap, and the requested
// capacity does not account for len(dst) — verify growslice copes when dst
// already holds more than this many bytes.
cap := len(src) * 3 / 2 + native.BufPaddingSize
*dbuf = growslice(typeByte, *dbuf, cap)
}
for sidx < sbuf.Len {
// sp/sn: the unconsumed tail of src; dp/dn: the free space at the end of dst.
sp := padd(sbuf.Ptr, sidx)
dp := padd(dbuf.Ptr, dbuf.Len)
sn := sbuf.Len - sidx
dn := dbuf.Cap - dbuf.Len
nb := native.HTMLEscape(sp, sn, dp, &dn)
/* check for errors */
// dn is added to the destination length on every pass (presumably the number
// of bytes written); a non-negative nb ends the loop.
if dbuf.Len += dn; nb >= 0 {
break
}
/* not enough space, grow the slice and try again */
// ^nb is taken as the number of source bytes already consumed.
sidx += ^nb
*dbuf = growslice(typeByte, *dbuf, dbuf.Cap * 2)
}
return dst
}
var ( var (
argPtrs = []bool { true, true, true, false } argPtrs = []bool { true, true, true, false }
localPtrs = []bool{} localPtrs = []bool{}

View file

@ -1,12 +1,14 @@
testname := FuzzMain
corpusdir := ./testdata/fuzz/${testname}
fuzz: fuzz:
mkdir -p ./testdata/fuzz/FuzzMain mkdir -p ${corpusdir}
rm -rf ./go-fuzz-corpus rm -rf ./go-fuzz-corpus
git clone https://github.com/dvyukov/go-fuzz-corpus.git ./go-fuzz-corpus/ git clone https://github.com/dvyukov/go-fuzz-corpus.git ./go-fuzz-corpus/
file2fuzz -o ./testdata/fuzz/FuzzMain ./go-fuzz-corpus/json/corpus/* ./corpus/* file2fuzz -o ${corpusdir} ./go-fuzz-corpus/json/corpus/* ./corpus/*
run: run:
GOARCH=amd64 go test -fuzz=Fuzz -v GOARCH=amd64 go test -fuzz=${testname} -v
clean: clean:
rm -rf ./go-fuzz-corpus/ rm -rf ./go-fuzz-corpus/

View file

@ -19,17 +19,21 @@
package sonic_fuzz package sonic_fuzz
import ( import (
`encoding/json` `encoding/json`
`testing` `testing`
`unicode/utf8` _ `unicode/utf8`
`reflect`
`os` `os`
`runtime` `runtime`
`runtime/debug` `runtime/debug`
`time` `time`
`io`
`log`
`strconv`
`github.com/bytedance/sonic` `github.com/bytedance/sonic`
`github.com/stretchr/testify/require` `github.com/stretchr/testify/require`
`github.com/davecgh/go-spew/spew`
`github.com/bytedance/gopkg/util/gctuner`
) )
func FuzzMain(f *testing.F) { func FuzzMain(f *testing.F) {
@ -39,11 +43,18 @@ func FuzzMain(f *testing.F) {
f.Fuzz(fuzzMain) f.Fuzz(fuzzMain)
} }
// TestCorpus replays one hard-coded input through fuzzMain. Edit the literal
// to debug a failed fuzz corpus entry.
func TestCorpus(t *testing.T) {
	input := []byte("[1\x00")
	fuzzMain(t, input)
}
// target is the sonic configuration (sonic.ConfigStd) used by the fuzz checks
// for Marshal and Unmarshal calls.
var target = sonic.ConfigStd
func fuzzMain(t *testing.T, data []byte) { func fuzzMain(t *testing.T, data []byte) {
fuzzValidate(t, data) fuzzValidate(t, data)
fuzzHtmlEscape(t, data) fuzzHtmlEscape(t, data)
// Only fuzz the validate json here, because the default configuration does not have validation in SONIC. // Only fuzz the validate json here.
if !utf8.Valid(data) || !json.Valid(data) { if !json.Valid(data) {
return return
} }
for _, typ := range []func() interface{}{ for _, typ := range []func() interface{}{
@ -54,31 +65,34 @@ func fuzzMain(t *testing.T, data []byte) {
func() interface{} { return new(int64) }, func() interface{} { return new(int64) },
func() interface{} { return new(uint64) }, func() interface{} { return new(uint64) },
func() interface{} { return new(float64) }, func() interface{} { return new(float64) },
func() interface{} { return new(json.Number) }, // func() interface{} { return new(json.Number) },
func() interface{} { return new(S) }, // func() interface{} { return new(S) },
} { } {
sv, jv := typ(), typ() sv, jv := typ(), typ()
serr := sonic.Unmarshal([]byte(data), sv) serr := target.Unmarshal([]byte(data), sv)
jerr := json.Unmarshal([]byte(data), jv) jerr := json.Unmarshal([]byte(data), jv)
require.Equalf(t, serr != nil, jerr != nil, "different error in sonic unmarshal %v", reflect.TypeOf(jv)) require.Equal(t, serr != nil, jerr != nil,
dump(data, jv, jerr, sv, serr))
if jerr != nil { if jerr != nil {
continue continue
} }
require.Equal(t, sv, jv, "different result in sonic unmarshal %v", reflect.TypeOf(jv)) require.Equal(t, sv, jv, dump(data, jv, jerr, sv, serr))
sout, serr := sonic.Marshal(sv)
jout, jerr := json.Marshal(jv) v := jv
require.NoError(t, serr, "error in sonic marshal %v", reflect.TypeOf(jv)) sout, serr := target.Marshal(v)
require.NoError(t, jerr, "error in json marshal %v", reflect.TypeOf(jv)) jout, jerr := json.Marshal(v)
require.NoError(t, serr, dump(v, jout, jerr, sout, serr))
require.NoError(t, jerr, dump(v, jout, jerr, sout, serr))
{ {
sv, jv := typ(), typ() sv, jv := typ(), typ()
serr := sonic.Unmarshal(sout, sv) serr := target.Unmarshal(sout, sv)
jerr := json.Unmarshal(jout, jv) jerr := json.Unmarshal(jout, jv)
require.Equalf(t, serr != nil, jerr != nil, "different error in sonic unmarshal again %v", reflect.TypeOf(jv)) require.Equalf(t, serr != nil, jerr != nil, dump(data, jv, jerr, sv, serr))
if jerr != nil { if jerr != nil {
continue continue
} }
require.Equal(t, sv, jv, "different result in sonic unmarshal again %v", reflect.TypeOf(jv)) require.Equal(t, sv, jv, dump(data, jv, jerr, sv, serr))
} }
if m, ok := sv.(*map[string]interface{}); ok { if m, ok := sv.(*map[string]interface{}); ok {
@ -89,52 +103,54 @@ func fuzzMain(t *testing.T, data []byte) {
fuzzASTGetFromArray(t, jout, *a) fuzzASTGetFromArray(t, jout, *a)
} }
} }
} }
type S struct { type S struct {
A int `json:",omitempty"` A int `json:",omitempty"`
B string `json:"B1,omitempty"` B string `json:"B1,omitempty"`
C float64 C float64
D bool D bool
E uint8 E uint8
// F []byte // unmarshal []byte is different with encoding/json // F []byte // unmarshal []byte is different with encoding/json
G interface{} G interface{}
H map[string]interface{} H map[string]interface{}
I map[string]string I map[string]string
J []interface{} J []interface{}
K []string K []string
L S1 L S1
M *S1 M *S1
N *int N *int
O **int O **int
P int `json:",string"` P int `json:",string"`
Q float64 `json:",string"` Q float64 `json:",string"`
R int `json:"-"` R int `json:"-"`
T struct {} T struct {}
U [2]int U [2]int
V uintptr V uintptr
W json.Number W json.Number
// X json.RawMessage // X json.RawMessage
Y Marshaller Y Marshaller
Z TextMarshaller Z TextMarshaller
} }
type S1 struct { type S1 struct {
A int A int
B string B string
} }
type Marshaller struct { type Marshaller struct {
v string v string
} }
func (m *Marshaller) MarshalJSON() ([]byte, error) { func (m *Marshaller) MarshalJSON() ([]byte, error) {
return json.Marshal(m.v) return json.Marshal(m.v)
} }
func (m *Marshaller) UnmarshalJSON(data []byte) error { func (m *Marshaller) UnmarshalJSON(data []byte) error {
return json.Unmarshal(data, &m.v) return json.Unmarshal(data, &m.v)
} }
type TextMarshaller struct { type TextMarshaller struct {
@ -149,19 +165,54 @@ func (k *TextMarshaller) UnmarshalText(data []byte) error {
return json.Unmarshal(data, &k.v) return json.Unmarshal(data, &k.v)
} }
var debugAsyncGC = os.Getenv("SONIC_NO_ASYNC_GC") == ""
func TestMain(m *testing.M) { func dump(args ...interface{}) string {
return spew.Sdump(args)
}
// fdump writes a spew dump of every argument to w.
func fdump(w io.Writer, args ...interface{}) {
	// expand the variadic slice so each value is dumped on its own instead of
	// being wrapped in a single []interface{}
	spew.Fdump(w, args...)
}
// Environment variable names and byte-size units used by the fuzz test setup.
const (
// MemoryLimitEnv names the environment variable read by setMemLimit; its
// value is parsed as an unsigned number of GB.
MemoryLimitEnv = "SONIC_FUZZ_MEM_LIMIT"
// AsynyncGCEnv holds the name "SONIC_NO_ASYNC_GC" — presumably the switch
// that disables the background GC loop; verify against its readers.
// NOTE(review): the identifier is misspelled ("Asynync"); it is exported, so
// confirm there are no outside users before renaming.
AsynyncGCEnv = "SONIC_NO_ASYNC_GC"
// Byte-size units, each 1024 times the previous one.
KB uint64 = 1024
MB uint64 = 1024 * KB
GB uint64 = 1024 * MB
)
// setMemLimit configures the GC tuner from a memory budget.
//
// limit is the default budget in bytes. When the environment variable named
// by MemoryLimitEnv is set to an unsigned integer, that value is taken as a
// number of GB and replaces limit; an unparsable value is reported and
// ignored. 70% of the effective budget becomes the overall threshold, which
// is divided by runtime.GOMAXPROCS(0) and passed to gctuner.Tuning. The
// resulting figures are logged.
func setMemLimit(limit uint64) {
	if env := os.Getenv(MemoryLimitEnv); env != "" {
		memGB, err := strconv.ParseUint(env, 10, 64)
		if err != nil {
			log.Printf("[%d] ignoring invalid %s=%q: %v\n", os.Getpid(), MemoryLimitEnv, env, err)
		} else {
			limit = memGB * GB
		}
	}

	// Derive the threshold only after the override has been applied;
	// computing it first would make the environment variable change the
	// logged limit but not the actual GC tuning.
	threshold := uint64(float64(limit) * 0.7)
	numWorker := uint64(runtime.GOMAXPROCS(0))
	perWorker := threshold / numWorker

	gctuner.Tuning(perWorker)
	log.Printf("[%d] Memory Limit: %d GB, Memory Threshold: %d MB\n", os.Getpid(), limit/GB, threshold/MB)
	log.Printf("[%d] Memory Threshold Per Worker: %d MB\n", os.Getpid(), perWorker/MB)
}
func enableSyncGC() {
var debugAsyncGC = os.Getenv("AsynyncGCEnv") == ""
go func () { go func () {
if !debugAsyncGC { if !debugAsyncGC {
return return
} }
println("Begin GC looping...") log.Printf("Begin GC looping...")
for { for {
runtime.GC() runtime.GC()
debug.FreeOSMemory() debug.FreeOSMemory()
} }
}() }()
}
func TestMain(m *testing.M) {
// Avoid OOM
setMemLimit(8 * GB)
enableSyncGC()
time.Sleep(time.Millisecond) time.Sleep(time.Millisecond)
m.Run() m.Run()
} }

View file

@ -3,18 +3,19 @@ module github.com/bytedance/sonic/fuzz
go 1.18 go 1.18
require ( require (
github.com/bytedance/sonic v1.0.0 github.com/bytedance/gopkg v0.0.0-20221122125632-68358b8ecec6
github.com/stretchr/testify v1.7.0 github.com/bytedance/sonic v1.5.0
github.com/davecgh/go-spew v1.1.1
github.com/stretchr/testify v1.8.1
) )
require ( require (
github.com/chenzhuoyu/base64x v0.0.0-20211019084208-fb5309c8db06 // indirect github.com/chenzhuoyu/base64x v0.0.0-20221115062448-fe3a3abad311 // indirect
github.com/davecgh/go-spew v1.1.1 // indirect
github.com/klauspost/cpuid/v2 v2.0.9 // indirect github.com/klauspost/cpuid/v2 v2.0.9 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect
github.com/twitchyliquid64/golang-asm v0.15.1 // indirect github.com/twitchyliquid64/golang-asm v0.15.1 // indirect
golang.org/x/arch v0.0.0-20210923205945-b76863e36670 // indirect golang.org/x/arch v0.0.0-20210923205945-b76863e36670 // indirect
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c // indirect gopkg.in/yaml.v3 v3.0.1 // indirect
) )
replace github.com/bytedance/sonic => ../. replace github.com/bytedance/sonic => ../.

View file

@ -1,40 +1,31 @@
github.com/chenzhuoyu/base64x v0.0.0-20211019084208-fb5309c8db06 h1:1sDoSuDPWzhkdzNVxCxtIaKiAe96ESVPv8coGwc1gZ4= github.com/bytedance/gopkg v0.0.0-20221122125632-68358b8ecec6 h1:FCLDGi1EmB7JzjVVYNZiqc/zAJj2BQ5M0lfkVOxbfs8=
github.com/chenzhuoyu/base64x v0.0.0-20211019084208-fb5309c8db06/go.mod h1:DH46F32mSOjUmXrMHnKwZdA8wcEefY7UVqBKYGjpdQY= github.com/bytedance/gopkg v0.0.0-20221122125632-68358b8ecec6/go.mod h1:5FoAH5xUHHCMDvQPy1rnj8moqLkLHFaDVBjHhcFwEi0=
github.com/chenzhuoyu/base64x v0.0.0-20221115062448-fe3a3abad311 h1:qSGYFH7+jGhDF8vLC+iwCD4WpbV1EBDSzWkJODFLams=
github.com/chenzhuoyu/base64x v0.0.0-20221115062448-fe3a3abad311/go.mod h1:b583jCggY9gE99b6G5LEC39OIiVsWj+R97kbl5odCEk=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/goccy/go-json v0.9.4 h1:L8MLKG2mvVXiQu07qB6hmfqeSYQdOnqPot2GhsIwIaI=
github.com/goccy/go-json v0.9.4/go.mod h1:6MelG93GURQebXPDq3khkgXZkazVtN9CRI+MGFi0w8I=
github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM=
github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo=
github.com/klauspost/cpuid/v2 v2.0.9 h1:lgaqFMSdTdQYdZ04uHyN2d/eKdOMyi2YLSvlQIBFYa4= github.com/klauspost/cpuid/v2 v2.0.9 h1:lgaqFMSdTdQYdZ04uHyN2d/eKdOMyi2YLSvlQIBFYa4=
github.com/klauspost/cpuid/v2 v2.0.9/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg= github.com/klauspost/cpuid/v2 v2.0.9/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg=
github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421 h1:ZqeYNhU3OHLH3mGKHDcjJRFFRrJa6eAM5H+CtDdOsPc=
github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M=
github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
github.com/stretchr/testify v1.7.0 h1:nwc3DEeHmmLAfoZucVR881uASk0Mfjw8xYJ99tb5CcY= github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo=
github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/tidwall/gjson v1.12.1/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk= github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/tidwall/gjson v1.13.0 h1:3TFY9yxOQShrvmjdM76K+jc66zJeT6D3/VFFYCGQf7M= github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
github.com/tidwall/gjson v1.13.0/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk= github.com/stretchr/testify v1.8.1 h1:w7B6lhMri9wdJUVmEZPGGhZzrYTPvgJArz7wNPgYKsk=
github.com/tidwall/match v1.1.1 h1:+Ho715JplO36QYgwN9PGYNhgZvoUSc9X2c80KVTi+GA= github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
github.com/tidwall/match v1.1.1/go.mod h1:eRSPERbgtNPcGhD8UCthc6PmLEQXEWd3PRB5JTxsfmM=
github.com/tidwall/pretty v1.2.0 h1:RWIZEg2iJ8/g6fDDYzMpobmaoGh5OLl4AXtGUGPcqCs=
github.com/tidwall/pretty v1.2.0/go.mod h1:ITEVvHYasfjBbM0u2Pg8T2nJnzm8xPwvNhhsoaGGjNU=
github.com/tidwall/sjson v1.2.4 h1:cuiLzLnaMeBhRmEv00Lpk3tkYrcxpmbU81tAY4Dw0tc=
github.com/tidwall/sjson v1.2.4/go.mod h1:098SZ494YoMWPmMO6ct4dcFnqxwj9r/gF0Etp19pSNM=
github.com/twitchyliquid64/golang-asm v0.15.1 h1:SU5vSMR7hnwNxj24w34ZyCi/FmDZTkS4MhqMhdFk5YI= github.com/twitchyliquid64/golang-asm v0.15.1 h1:SU5vSMR7hnwNxj24w34ZyCi/FmDZTkS4MhqMhdFk5YI=
github.com/twitchyliquid64/golang-asm v0.15.1/go.mod h1:a1lVb/DtPvCB8fslRZhAngC2+aY1QWCk3Cedj/Gdt08= github.com/twitchyliquid64/golang-asm v0.15.1/go.mod h1:a1lVb/DtPvCB8fslRZhAngC2+aY1QWCk3Cedj/Gdt08=
golang.org/x/arch v0.0.0-20210923205945-b76863e36670 h1:18EFjUmQOcUvxNYSkA6jO9VAiXCnxFY6NyDX0bHDmkU= golang.org/x/arch v0.0.0-20210923205945-b76863e36670 h1:18EFjUmQOcUvxNYSkA6jO9VAiXCnxFY6NyDX0bHDmkU=
golang.org/x/arch v0.0.0-20210923205945-b76863e36670/go.mod h1:5om86z9Hs0C8fWVUuoMHwpExlXzs5Tkyp9hOrfG7pp8= golang.org/x/arch v0.0.0-20210923205945-b76863e36670/go.mod h1:5om86z9Hs0C8fWVUuoMHwpExlXzs5Tkyp9hOrfG7pp8=
golang.org/x/sync v0.0.0-20210220032951-036812b2e83c/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sys v0.0.0-20221010170243-090e33056c14/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c h1:dUUwHk2QECo/6vqA44rthZ8ie2QXMNeKRTHCNY2nXvo=
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
rsc.io/pdf v0.1.1/go.mod h1:n8OzWcQ6Sp37PL01nO98y4iUCRdTGarVfzxY20ICaU4= rsc.io/pdf v0.1.1/go.mod h1:n8OzWcQ6Sp37PL01nO98y4iUCRdTGarVfzxY20ICaU4=

View file

@ -71,10 +71,10 @@ func generateJSONTag(name string) reflect.StructTag {
name = strings.Split(name, ",")[0] // remove origin "," in tag name name = strings.Split(name, ",")[0] // remove origin "," in tag name
switch int(rand.Int() % 5) { switch int(rand.Int() % 5) {
case 0: return reflect.StructTag(`json:"-"`) // always omitted case 0: return reflect.StructTag(`json:"-"`) // always omitted
case 1: return reflect.StructTag("") // empty tag case 1: opt = "" // empty opt
case 2: opt = "" // empty opt case 2: opt = "omitempty"
case 3: opt = "omitempty" // case 3: opt = "string"
case 4: opt = "string" default: return reflect.StructTag("") // empty tag
} }
return reflect.StructTag(fmt.Sprintf(`json:"%s,%s"`, name, opt)) return reflect.StructTag(fmt.Sprintf(`json:"%s,%s"`, name, opt))
} }
@ -146,7 +146,7 @@ func fuzzDynamicStruct(t *testing.T, data []byte, v map[string]interface{}) {
require.NoErrorf(t, err, "error in sonic pretouch struct %v", typ) require.NoErrorf(t, err, "error in sonic pretouch struct %v", typ)
// Unmarshal fuzz // Unmarshal fuzz
serr := sonic.Unmarshal(data, &sv) serr := target.Unmarshal(data, &sv)
jerr := json.Unmarshal(data, &jv) jerr := json.Unmarshal(data, &jv)
require.Equalf(t, serr != nil, jerr != nil, "different error in sonic unmarshal %v", typ) require.Equalf(t, serr != nil, jerr != nil, "different error in sonic unmarshal %v", typ)
if serr != nil { if serr != nil {
@ -155,7 +155,7 @@ func fuzzDynamicStruct(t *testing.T, data []byte, v map[string]interface{}) {
require.Equal(t, sv, jv, "different result in sonic unmarshal %v", typ) require.Equal(t, sv, jv, "different result in sonic unmarshal %v", typ)
// Marshal fuzz // Marshal fuzz
sout, serr := sonic.Marshal(sv) sout, serr := target.Marshal(sv)
jout, jerr := json.Marshal(jv) jout, jerr := json.Marshal(jv)
require.NoError(t, serr, "error in sonic marshal %v", typ) require.NoError(t, serr, "error in sonic marshal %v", typ)
require.NoError(t, jerr, "error in json marshal %v", typ) require.NoError(t, jerr, "error in json marshal %v", typ)

View file

@ -123,3 +123,13 @@ func __validate_one(s *string, p *int, m *types.StateMachine) (ret int)
//go:noescape //go:noescape
//goland:noinspection GoUnusedParameter //goland:noinspection GoUnusedParameter
func __get_by_path(s *string, p *int, path *[]interface{}) (ret int) func __get_by_path(s *string, p *int, path *[]interface{}) (ret int)
//go:nosplit
//go:noescape
//goland:noinspection GoUnusedParameter
func __validate_utf8(s *string, p *int, m *types.StateMachine) (ret int)
//go:nosplit
//go:noescape
//goland:noinspection GoUnusedParameter
func __validate_utf8_fast(s *string) (ret int)

File diff suppressed because it is too large Load diff

View file

@ -256,15 +256,6 @@ func TestNative_Vstring_ValidUnescapedChars(t *testing.T) {
assert.Equal(t, -int(types.ERR_INVALID_CHAR), int(v.Vt)) assert.Equal(t, -int(types.ERR_INVALID_CHAR), int(v.Vt))
} }
func TestNative_Vstring_ValidUtf8(t *testing.T) {
var v types.JsonState
valid := uint64(types.F_VALIDATE_STRING)
i := 0
s := "test\xff\""
__vstring(&s, &i, &v, valid)
assert.Equal(t, -int(types.ERR_INVALID_CHAR), int(v.Vt))
}
func TestNative_VstringEscapeEOF(t *testing.T) { func TestNative_VstringEscapeEOF(t *testing.T) {
var v types.JsonState var v types.JsonState
i := 0 i := 0
@ -275,51 +266,6 @@ func TestNative_VstringEscapeEOF(t *testing.T) {
assert.Equal(t, int64(0), v.Iv) assert.Equal(t, int64(0), v.Iv)
} }
func TestNative_ValidateOne(t *testing.T) {
{
p := 0
s := "\"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\\n\\r\\b\\f😁ſ景\xef\xbf\xbf\xf4\x8f\xbf\xbf\xc2\x80xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\""
r := __validate_one(&s, &p, &types.StateMachine{})
assert.Equal(t, len(s), p)
assert.Equal(t, 0, r)
}
{
p := 0
s := "\"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\bxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\"x"
r := __validate_one(&s, &p, &types.StateMachine{})
assert.Equal(t, 64, p)
assert.Equal(t, -int(types.ERR_INVALID_CHAR), r)
}
{
p := 0
s := "\"\x00\"x"
r := __validate_one(&s, &p, &types.StateMachine{})
assert.Equal(t, 1, p)
assert.Equal(t, -int(types.ERR_INVALID_CHAR), r)
}
{
p := 0
s := "\"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\x80xxxxxxxxxxxxxxxxxxxxxxxxxxxxx\"x"
r := __validate_one(&s, &p, &types.StateMachine{})
assert.Equal(t, 64, p)
assert.Equal(t, -int(types.ERR_INVALID_CHAR), r)
}
{
p := 0
s := "\"\x80\"x"
r := __validate_one(&s, &p, &types.StateMachine{})
assert.Equal(t, 1, p)
assert.Equal(t, -int(types.ERR_INVALID_CHAR), r)
}
{
p := 0
s := "\"\xed\xbf\xbf\"x"
r := __validate_one(&s, &p, &types.StateMachine{})
assert.Equal(t, 1, p)
assert.Equal(t, -int(types.ERR_INVALID_CHAR), r)
}
}
func TestNative_VstringHangUpOnRandomData(t *testing.T) { func TestNative_VstringHangUpOnRandomData(t *testing.T) {
v, e := hex.DecodeString( v, e := hex.DecodeString(
"228dc61efd54ef80a908fb6026b7f2d5f92a257ba8b347c995f259eb8685376a" + "228dc61efd54ef80a908fb6026b7f2d5f92a257ba8b347c995f259eb8685376a" +

View file

@ -9,32 +9,34 @@ package avx
func __native_entry__() uintptr func __native_entry__() uintptr
var ( var (
_subr__f32toa = __native_entry__() + 28656 _subr__f32toa = __native_entry__() + 29744
_subr__f64toa = __native_entry__() + 496 _subr__f64toa = __native_entry__() + 496
_subr__get_by_path = __native_entry__() + 26848 _subr__get_by_path = __native_entry__() + 27424
_subr__html_escape = __native_entry__() + 10480 _subr__html_escape = __native_entry__() + 9968
_subr__i64toa = __native_entry__() + 4176 _subr__i64toa = __native_entry__() + 4112
_subr__lspace = __native_entry__() + 80 _subr__lspace = __native_entry__() + 80
_subr__quote = __native_entry__() + 5552 _subr__quote = __native_entry__() + 5792
_subr__skip_array = __native_entry__() + 20160 _subr__skip_array = __native_entry__() + 20576
_subr__skip_number = __native_entry__() + 23472 _subr__skip_number = __native_entry__() + 23920
_subr__skip_object = __native_entry__() + 22048 _subr__skip_object = __native_entry__() + 22496
_subr__skip_one = __native_entry__() + 23616 _subr__skip_one = __native_entry__() + 24080
_subr__skip_one_fast = __native_entry__() + 23824 _subr__skip_one_fast = __native_entry__() + 24320
_subr__u64toa = __native_entry__() + 4288 _subr__u64toa = __native_entry__() + 4384
_subr__unquote = __native_entry__() + 7296 _subr__unquote = __native_entry__() + 7488
_subr__validate_one = __native_entry__() + 23648 _subr__validate_one = __native_entry__() + 24144
_subr__value = __native_entry__() + 13728 _subr__validate_utf8 = __native_entry__() + 28464
_subr__vnumber = __native_entry__() + 17904 _subr__validate_utf8_fast = __native_entry__() + 29136
_subr__vsigned = __native_entry__() + 19456 _subr__value = __native_entry__() + 14672
_subr__vstring = __native_entry__() + 15808 _subr__vnumber = __native_entry__() + 18320
_subr__vunsigned = __native_entry__() + 19808 _subr__vsigned = __native_entry__() + 19856
_subr__vstring = __native_entry__() + 16864
_subr__vunsigned = __native_entry__() + 20208
) )
const ( const (
_stack__f32toa = 64 _stack__f32toa = 56
_stack__f64toa = 80 _stack__f64toa = 80
_stack__get_by_path = 296 _stack__get_by_path = 312
_stack__html_escape = 64 _stack__html_escape = 64
_stack__i64toa = 16 _stack__i64toa = 16
_stack__lspace = 8 _stack__lspace = 8
@ -45,10 +47,12 @@ const (
_stack__skip_one = 128 _stack__skip_one = 128
_stack__skip_one_fast = 208 _stack__skip_one_fast = 208
_stack__u64toa = 8 _stack__u64toa = 8
_stack__unquote = 72 _stack__unquote = 128
_stack__validate_one = 128 _stack__validate_one = 128
_stack__value = 336 _stack__validate_utf8 = 48
_stack__vnumber = 248 _stack__validate_utf8_fast = 24
_stack__value = 368
_stack__vnumber = 280
_stack__vsigned = 16 _stack__vsigned = 16
_stack__vstring = 128 _stack__vstring = 128
_stack__vunsigned = 24 _stack__vunsigned = 24
@ -70,6 +74,8 @@ var (
_ = _subr__u64toa _ = _subr__u64toa
_ = _subr__unquote _ = _subr__unquote
_ = _subr__validate_one _ = _subr__validate_one
_ = _subr__validate_utf8
_ = _subr__validate_utf8_fast
_ = _subr__value _ = _subr__value
_ = _subr__vnumber _ = _subr__vnumber
_ = _subr__vsigned _ = _subr__vsigned
@ -93,6 +99,8 @@ const (
_ = _stack__u64toa _ = _stack__u64toa
_ = _stack__unquote _ = _stack__unquote
_ = _stack__validate_one _ = _stack__validate_one
_ = _stack__validate_utf8
_ = _stack__validate_utf8_fast
_ = _stack__value _ = _stack__value
_ = _stack__vnumber _ = _stack__vnumber
_ = _stack__vsigned _ = _stack__vsigned

View file

@ -123,3 +123,13 @@ func __validate_one(s *string, p *int, m *types.StateMachine) (ret int)
//go:noescape //go:noescape
//goland:noinspection GoUnusedParameter //goland:noinspection GoUnusedParameter
func __get_by_path(s *string, p *int, path *[]interface{}) (ret int) func __get_by_path(s *string, p *int, path *[]interface{}) (ret int)
//go:nosplit
//go:noescape
//goland:noinspection GoUnusedParameter
func __validate_utf8(s *string, p *int, m *types.StateMachine) (ret int)
//go:nosplit
//go:noescape
//goland:noinspection GoUnusedParameter
func __validate_utf8_fast(s *string) (ret int)

File diff suppressed because it is too large Load diff

View file

@ -256,15 +256,6 @@ func TestNative_Vstring_ValidUnescapedChars(t *testing.T) {
assert.Equal(t, -int(types.ERR_INVALID_CHAR), int(v.Vt)) assert.Equal(t, -int(types.ERR_INVALID_CHAR), int(v.Vt))
} }
func TestNative_Vstring_ValidUtf8(t *testing.T) {
var v types.JsonState
valid := uint64(types.F_VALIDATE_STRING)
i := 0
s := "test\xff\""
__vstring(&s, &i, &v, valid)
assert.Equal(t, -int(types.ERR_INVALID_CHAR), int(v.Vt))
}
func TestNative_VstringEscapeEOF(t *testing.T) { func TestNative_VstringEscapeEOF(t *testing.T) {
var v types.JsonState var v types.JsonState
i := 0 i := 0
@ -275,51 +266,6 @@ func TestNative_VstringEscapeEOF(t *testing.T) {
assert.Equal(t, int64(0), v.Iv) assert.Equal(t, int64(0), v.Iv)
} }
func TestNative_ValidateOne(t *testing.T) {
{
p := 0
s := "\"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\\n\\r\\b\\f😁ſ景\xef\xbf\xbf\xf4\x8f\xbf\xbf\xc2\x80xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\""
r := __validate_one(&s, &p, &types.StateMachine{})
assert.Equal(t, len(s), p)
assert.Equal(t, 0, r)
}
{
p := 0
s := "\"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\bxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\"x"
r := __validate_one(&s, &p, &types.StateMachine{})
assert.Equal(t, 64, p)
assert.Equal(t, -int(types.ERR_INVALID_CHAR), r)
}
{
p := 0
s := "\"\x00\"x"
r := __validate_one(&s, &p, &types.StateMachine{})
assert.Equal(t, 1, p)
assert.Equal(t, -int(types.ERR_INVALID_CHAR), r)
}
{
p := 0
s := "\"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\x80xxxxxxxxxxxxxxxxxxxxxxxxxxxxx\"x"
r := __validate_one(&s, &p, &types.StateMachine{})
assert.Equal(t, 64, p)
assert.Equal(t, -int(types.ERR_INVALID_CHAR), r)
}
{
p := 0
s := "\"\x80\"x"
r := __validate_one(&s, &p, &types.StateMachine{})
assert.Equal(t, 1, p)
assert.Equal(t, -int(types.ERR_INVALID_CHAR), r)
}
{
p := 0
s := "\"\xed\xbf\xbf\"x"
r := __validate_one(&s, &p, &types.StateMachine{})
assert.Equal(t, 1, p)
assert.Equal(t, -int(types.ERR_INVALID_CHAR), r)
}
}
func TestNative_VstringHangUpOnRandomData(t *testing.T) { func TestNative_VstringHangUpOnRandomData(t *testing.T) {
v, e := hex.DecodeString( v, e := hex.DecodeString(
"228dc61efd54ef80a908fb6026b7f2d5f92a257ba8b347c995f259eb8685376a" + "228dc61efd54ef80a908fb6026b7f2d5f92a257ba8b347c995f259eb8685376a" +

View file

@ -9,48 +9,52 @@ package avx2
func __native_entry__() uintptr func __native_entry__() uintptr
var ( var (
_subr__f32toa = __native_entry__() + 32816 _subr__f32toa = __native_entry__() + 35216
_subr__f64toa = __native_entry__() + 752 _subr__f64toa = __native_entry__() + 752
_subr__get_by_path = __native_entry__() + 30896 _subr__get_by_path = __native_entry__() + 30384
_subr__html_escape = __native_entry__() + 12320 _subr__html_escape = __native_entry__() + 11712
_subr__i64toa = __native_entry__() + 4432 _subr__i64toa = __native_entry__() + 4368
_subr__lspace = __native_entry__() + 224 _subr__lspace = __native_entry__() + 224
_subr__quote = __native_entry__() + 5904 _subr__quote = __native_entry__() + 6160
_subr__skip_array = __native_entry__() + 23472 _subr__skip_array = __native_entry__() + 22864
_subr__skip_number = __native_entry__() + 27440 _subr__skip_number = __native_entry__() + 26928
_subr__skip_object = __native_entry__() + 25392 _subr__skip_object = __native_entry__() + 24864
_subr__skip_one = __native_entry__() + 27584 _subr__skip_one = __native_entry__() + 27088
_subr__skip_one_fast = __native_entry__() + 27984 _subr__skip_one_fast = __native_entry__() + 27504
_subr__u64toa = __native_entry__() + 4544 _subr__u64toa = __native_entry__() + 4640
_subr__unquote = __native_entry__() + 8848 _subr__unquote = __native_entry__() + 8960
_subr__validate_one = __native_entry__() + 27616 _subr__validate_one = __native_entry__() + 27152
_subr__value = __native_entry__() + 16896 _subr__validate_utf8 = __native_entry__() + 31552
_subr__vnumber = __native_entry__() + 21216 _subr__validate_utf8_fast = __native_entry__() + 32496
_subr__vsigned = __native_entry__() + 22768 _subr__value = __native_entry__() + 16816
_subr__vstring = __native_entry__() + 19280 _subr__vnumber = __native_entry__() + 20608
_subr__vunsigned = __native_entry__() + 23120 _subr__vsigned = __native_entry__() + 22144
_subr__vstring = __native_entry__() + 19312
_subr__vunsigned = __native_entry__() + 22496
) )
const ( const (
_stack__f32toa = 64 _stack__f32toa = 56
_stack__f64toa = 80 _stack__f64toa = 80
_stack__get_by_path = 304 _stack__get_by_path = 320
_stack__html_escape = 72 _stack__html_escape = 72
_stack__i64toa = 16 _stack__i64toa = 16
_stack__lspace = 8 _stack__lspace = 8
_stack__quote = 72 _stack__quote = 72
_stack__skip_array = 136 _stack__skip_array = 120
_stack__skip_number = 80 _stack__skip_number = 80
_stack__skip_object = 136 _stack__skip_object = 120
_stack__skip_one = 136 _stack__skip_one = 120
_stack__skip_one_fast = 216 _stack__skip_one_fast = 216
_stack__u64toa = 8 _stack__u64toa = 8
_stack__unquote = 72 _stack__unquote = 128
_stack__validate_one = 136 _stack__validate_one = 120
_stack__value = 336 _stack__validate_utf8 = 48
_stack__vnumber = 248 _stack__validate_utf8_fast = 200
_stack__value = 368
_stack__vnumber = 280
_stack__vsigned = 16 _stack__vsigned = 16
_stack__vstring = 136 _stack__vstring = 104
_stack__vunsigned = 24 _stack__vunsigned = 24
) )
@ -70,6 +74,8 @@ var (
_ = _subr__u64toa _ = _subr__u64toa
_ = _subr__unquote _ = _subr__unquote
_ = _subr__validate_one _ = _subr__validate_one
_ = _subr__validate_utf8
_ = _subr__validate_utf8_fast
_ = _subr__value _ = _subr__value
_ = _subr__vnumber _ = _subr__vnumber
_ = _subr__vsigned _ = _subr__vsigned
@ -93,6 +99,8 @@ const (
_ = _stack__u64toa _ = _stack__u64toa
_ = _stack__unquote _ = _stack__unquote
_ = _stack__validate_one _ = _stack__validate_one
_ = _stack__validate_utf8
_ = _stack__validate_utf8_fast
_ = _stack__value _ = _stack__value
_ = _stack__vnumber _ = _stack__vnumber
_ = _stack__vsigned _ = _stack__vsigned

View file

@ -26,7 +26,10 @@ import (
`github.com/bytedance/sonic/internal/native/types` `github.com/bytedance/sonic/internal/native/types`
) )
const MaxFrameSize uintptr = 400 const (
MaxFrameSize uintptr = 400
BufPaddingSize int = 64
)
var ( var (
S_f64toa uintptr S_f64toa uintptr
@ -113,6 +116,16 @@ func U64toa(out *byte, val uint64) (ret int)
//goland:noinspection GoUnusedParameter //goland:noinspection GoUnusedParameter
func F64toa(out *byte, val float64) (ret int) func F64toa(out *byte, val float64) (ret int)
//go:nosplit
//go:noescape
//goland:noinspection GoUnusedParameter
func ValidateUTF8(s *string, p *int, m *types.StateMachine) (ret int)
//go:nosplit
//go:noescape
//goland:noinspection GoUnusedParameter
func ValidateUTF8Fast(s *string) (ret int)
func useAVX() { func useAVX() {
S_f64toa = avx.S_f64toa S_f64toa = avx.S_f64toa
S_f32toa = avx.S_f32toa S_f32toa = avx.S_f32toa

View file

@ -45,7 +45,6 @@ TEXT ·HTMLEscape(SB), NOSPLIT, $0 - 40
JMP github·combytedancesonicinternalnativeavx·__html_escape(SB) JMP github·combytedancesonicinternalnativeavx·__html_escape(SB)
JMP github·combytedancesonicinternalnativesse·__html_escape(SB) JMP github·combytedancesonicinternalnativesse·__html_escape(SB)
TEXT ·Value(SB), NOSPLIT, $0 - 48 TEXT ·Value(SB), NOSPLIT, $0 - 48
CMPB github·combytedancesonicinternalcpu·HasAVX2(SB), $0 CMPB github·combytedancesonicinternalcpu·HasAVX2(SB), $0
JE 2(PC) JE 2(PC)
@ -81,6 +80,7 @@ TEXT ·GetByPath(SB), NOSPLIT, $0 - 32
JE 2(PC) JE 2(PC)
JMP github·combytedancesonicinternalnativeavx·__get_by_path(SB) JMP github·combytedancesonicinternalnativeavx·__get_by_path(SB)
JMP github·combytedancesonicinternalnativesse·__get_by_path(SB) JMP github·combytedancesonicinternalnativesse·__get_by_path(SB)
TEXT ·ValidateOne(SB), NOSPLIT, $0 - 32 TEXT ·ValidateOne(SB), NOSPLIT, $0 - 32
CMPB github·combytedancesonicinternalcpu·HasAVX2(SB), $0 CMPB github·combytedancesonicinternalcpu·HasAVX2(SB), $0
JE 2(PC) JE 2(PC)
@ -90,6 +90,24 @@ TEXT ·ValidateOne(SB), NOSPLIT, $0 - 32
JMP github·combytedancesonicinternalnativeavx·__validate_one(SB) JMP github·combytedancesonicinternalnativeavx·__validate_one(SB)
JMP github·combytedancesonicinternalnativesse·__validate_one(SB) JMP github·combytedancesonicinternalnativesse·__validate_one(SB)
TEXT ·ValidateUTF8(SB), NOSPLIT, $0 - 40
CMPB github·combytedancesonicinternalcpu·HasAVX2(SB), $0
JE 2(PC)
JMP github·combytedancesonicinternalnativeavx2·__validate_utf8(SB)
CMPB github·combytedancesonicinternalcpu·HasAVX(SB), $0
JE 2(PC)
JMP github·combytedancesonicinternalnativeavx·__validate_utf8(SB)
JMP github·combytedancesonicinternalnativesse·__validate_utf8(SB)
TEXT ·ValidateUTF8Fast(SB), NOSPLIT, $0 - 16
CMPB github·combytedancesonicinternalcpu·HasAVX2(SB), $0
JE 2(PC)
JMP github·combytedancesonicinternalnativeavx2·__validate_utf8_fast(SB)
CMPB github·combytedancesonicinternalcpu·HasAVX(SB), $0
JE 2(PC)
JMP github·combytedancesonicinternalnativeavx·__validate_utf8_fast(SB)
JMP github·combytedancesonicinternalnativesse·__validate_utf8_fast(SB)
TEXT ·I64toa(SB), NOSPLIT, $0 - 32 TEXT ·I64toa(SB), NOSPLIT, $0 - 32
CMPB github·combytedancesonicinternalcpu·HasAVX2(SB), $0 CMPB github·combytedancesonicinternalcpu·HasAVX2(SB), $0
JE 2(PC) JE 2(PC)

View file

@ -121,3 +121,13 @@ func __validate_one(s *string, p *int, m *types.StateMachine) (ret int)
//go:noescape //go:noescape
//goland:noinspection GoUnusedParameter //goland:noinspection GoUnusedParameter
func __get_by_path(s *string, p *int, path *[]interface{}) (ret int) func __get_by_path(s *string, p *int, path *[]interface{}) (ret int)
//go:nosplit
//go:noescape
//goland:noinspection GoUnusedParameter
func __validate_utf8(s *string, p *int, m *types.StateMachine) (ret int)
//go:nosplit
//go:noescape
//goland:noinspection GoUnusedParameter
func __validate_utf8_fast(s *string) (ret int)

View file

@ -254,15 +254,6 @@ func TestNative_Vstring_ValidUnescapedChars(t *testing.T) {
assert.Equal(t, -int(types.ERR_INVALID_CHAR), int(v.Vt)) assert.Equal(t, -int(types.ERR_INVALID_CHAR), int(v.Vt))
} }
func TestNative_Vstring_ValidUtf8(t *testing.T) {
var v types.JsonState
valid := uint64(types.F_VALIDATE_STRING)
i := 0
s := "test\xff\""
__vstring(&s, &i, &v, valid)
assert.Equal(t, -int(types.ERR_INVALID_CHAR), int(v.Vt))
}
func TestNative_VstringEscapeEOF(t *testing.T) { func TestNative_VstringEscapeEOF(t *testing.T) {
var v types.JsonState var v types.JsonState
i := 0 i := 0
@ -273,51 +264,6 @@ func TestNative_VstringEscapeEOF(t *testing.T) {
assert.Equal(t, int64(0), v.Iv) assert.Equal(t, int64(0), v.Iv)
} }
func TestNative_ValidateOne(t *testing.T) {
{
p := 0
s := "\"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\\n\\r\\b\\f😁ſ景\xef\xbf\xbf\xf4\x8f\xbf\xbf\xc2\x80xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\""
r := __validate_one(&s, &p, &types.StateMachine{})
assert.Equal(t, len(s), p)
assert.Equal(t, 0, r)
}
{
p := 0
s := "\"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\bxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\"x"
r := __validate_one(&s, &p, &types.StateMachine{})
assert.Equal(t, 64, p)
assert.Equal(t, -int(types.ERR_INVALID_CHAR), r)
}
{
p := 0
s := "\"\x00\"x"
r := __validate_one(&s, &p, &types.StateMachine{})
assert.Equal(t, 1, p)
assert.Equal(t, -int(types.ERR_INVALID_CHAR), r)
}
{
p := 0
s := "\"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\x80xxxxxxxxxxxxxxxxxxxxxxxxxxxxx\"x"
r := __validate_one(&s, &p, &types.StateMachine{})
assert.Equal(t, 64, p)
assert.Equal(t, -int(types.ERR_INVALID_CHAR), r)
}
{
p := 0
s := "\"\x80\"x"
r := __validate_one(&s, &p, &types.StateMachine{})
assert.Equal(t, 1, p)
assert.Equal(t, -int(types.ERR_INVALID_CHAR), r)
}
{
p := 0
s := "\"\xed\xbf\xbf\"x"
r := __validate_one(&s, &p, &types.StateMachine{})
assert.Equal(t, 1, p)
assert.Equal(t, -int(types.ERR_INVALID_CHAR), r)
}
}
func TestNative_VstringHangUpOnRandomData(t *testing.T) { func TestNative_VstringHangUpOnRandomData(t *testing.T) {
v, e := hex.DecodeString( v, e := hex.DecodeString(
"228dc61efd54ef80a908fb6026b7f2d5f92a257ba8b347c995f259eb8685376a" + "228dc61efd54ef80a908fb6026b7f2d5f92a257ba8b347c995f259eb8685376a" +

View file

@ -123,3 +123,13 @@ func __validate_one(s *string, p *int, m *types.StateMachine) (ret int)
//go:noescape //go:noescape
//goland:noinspection GoUnusedParameter //goland:noinspection GoUnusedParameter
func __get_by_path(s *string, p *int, path *[]interface{}) (ret int) func __get_by_path(s *string, p *int, path *[]interface{}) (ret int)
//go:nosplit
//go:noescape
//goland:noinspection GoUnusedParameter
func __validate_utf8(s *string, p *int, m *types.StateMachine) (ret int)
//go:nosplit
//go:noescape
//goland:noinspection GoUnusedParameter
func __validate_utf8_fast(s *string) (ret int)

File diff suppressed because it is too large Load diff

View file

@ -256,15 +256,6 @@ func TestNative_Vstring_ValidUnescapedChars(t *testing.T) {
assert.Equal(t, -int(types.ERR_INVALID_CHAR), int(v.Vt)) assert.Equal(t, -int(types.ERR_INVALID_CHAR), int(v.Vt))
} }
func TestNative_Vstring_ValidUtf8(t *testing.T) {
var v types.JsonState
valid := uint64(types.F_VALIDATE_STRING)
i := 0
s := "test\xff\""
__vstring(&s, &i, &v, valid)
assert.Equal(t, -int(types.ERR_INVALID_CHAR), int(v.Vt))
}
func TestNative_VstringEscapeEOF(t *testing.T) { func TestNative_VstringEscapeEOF(t *testing.T) {
var v types.JsonState var v types.JsonState
i := 0 i := 0
@ -275,51 +266,6 @@ func TestNative_VstringEscapeEOF(t *testing.T) {
assert.Equal(t, int64(0), v.Iv) assert.Equal(t, int64(0), v.Iv)
} }
func TestNative_ValidateOne(t *testing.T) {
{
p := 0
s := "\"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\\n\\r\\b\\f😁ſ景\xef\xbf\xbf\xf4\x8f\xbf\xbf\xc2\x80xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\""
r := __validate_one(&s, &p, &types.StateMachine{})
assert.Equal(t, len(s), p)
assert.Equal(t, 0, r)
}
{
p := 0
s := "\"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\bxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\"x"
r := __validate_one(&s, &p, &types.StateMachine{})
assert.Equal(t, 64, p)
assert.Equal(t, -int(types.ERR_INVALID_CHAR), r)
}
{
p := 0
s := "\"\x00\"x"
r := __validate_one(&s, &p, &types.StateMachine{})
assert.Equal(t, 1, p)
assert.Equal(t, -int(types.ERR_INVALID_CHAR), r)
}
{
p := 0
s := "\"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\x80xxxxxxxxxxxxxxxxxxxxxxxxxxxxx\"x"
r := __validate_one(&s, &p, &types.StateMachine{})
assert.Equal(t, 64, p)
assert.Equal(t, -int(types.ERR_INVALID_CHAR), r)
}
{
p := 0
s := "\"\x80\"x"
r := __validate_one(&s, &p, &types.StateMachine{})
assert.Equal(t, 1, p)
assert.Equal(t, -int(types.ERR_INVALID_CHAR), r)
}
{
p := 0
s := "\"\xed\xbf\xbf\"x"
r := __validate_one(&s, &p, &types.StateMachine{})
assert.Equal(t, 1, p)
assert.Equal(t, -int(types.ERR_INVALID_CHAR), r)
}
}
func TestNative_VstringHangUpOnRandomData(t *testing.T) { func TestNative_VstringHangUpOnRandomData(t *testing.T) {
v, e := hex.DecodeString( v, e := hex.DecodeString(
"228dc61efd54ef80a908fb6026b7f2d5f92a257ba8b347c995f259eb8685376a" + "228dc61efd54ef80a908fb6026b7f2d5f92a257ba8b347c995f259eb8685376a" +

View file

@ -9,32 +9,34 @@ package sse
func __native_entry__() uintptr func __native_entry__() uintptr
var ( var (
_subr__f32toa = __native_entry__() + 29152 _subr__f32toa = __native_entry__() + 28688
_subr__f64toa = __native_entry__() + 464 _subr__f64toa = __native_entry__() + 464
_subr__get_by_path = __native_entry__() + 27392 _subr__get_by_path = __native_entry__() + 26432
_subr__html_escape = __native_entry__() + 10416 _subr__html_escape = __native_entry__() + 9584
_subr__i64toa = __native_entry__() + 4048 _subr__i64toa = __native_entry__() + 3744
_subr__lspace = __native_entry__() + 80 _subr__lspace = __native_entry__() + 80
_subr__quote = __native_entry__() + 5456 _subr__quote = __native_entry__() + 5472
_subr__skip_array = __native_entry__() + 20144 _subr__skip_array = __native_entry__() + 19184
_subr__skip_number = __native_entry__() + 23488 _subr__skip_number = __native_entry__() + 22528
_subr__skip_object = __native_entry__() + 22032 _subr__skip_object = __native_entry__() + 21088
_subr__skip_one = __native_entry__() + 23632 _subr__skip_one = __native_entry__() + 22688
_subr__skip_one_fast = __native_entry__() + 23840 _subr__skip_one_fast = __native_entry__() + 22912
_subr__u64toa = __native_entry__() + 4176 _subr__u64toa = __native_entry__() + 4016
_subr__unquote = __native_entry__() + 7232 _subr__unquote = __native_entry__() + 7184
_subr__validate_one = __native_entry__() + 23664 _subr__validate_one = __native_entry__() + 22736
_subr__value = __native_entry__() + 13680 _subr__validate_utf8 = __native_entry__() + 27456
_subr__vnumber = __native_entry__() + 17888 _subr__validate_utf8_fast = __native_entry__() + 28128
_subr__vsigned = __native_entry__() + 19440 _subr__value = __native_entry__() + 13216
_subr__vstring = __native_entry__() + 15760 _subr__vnumber = __native_entry__() + 16928
_subr__vunsigned = __native_entry__() + 19792 _subr__vsigned = __native_entry__() + 18464
_subr__vstring = __native_entry__() + 15408
_subr__vunsigned = __native_entry__() + 18816
) )
const ( const (
_stack__f32toa = 64 _stack__f32toa = 56
_stack__f64toa = 80 _stack__f64toa = 80
_stack__get_by_path = 232 _stack__get_by_path = 264
_stack__html_escape = 64 _stack__html_escape = 64
_stack__i64toa = 16 _stack__i64toa = 16
_stack__lspace = 8 _stack__lspace = 8
@ -43,14 +45,16 @@ const (
_stack__skip_number = 72 _stack__skip_number = 72
_stack__skip_object = 128 _stack__skip_object = 128
_stack__skip_one = 128 _stack__skip_one = 128
_stack__skip_one_fast = 144 _stack__skip_one_fast = 160
_stack__u64toa = 8 _stack__u64toa = 8
_stack__unquote = 72 _stack__unquote = 128
_stack__validate_one = 128 _stack__validate_one = 128
_stack__value = 336 _stack__validate_utf8 = 48
_stack__vnumber = 248 _stack__validate_utf8_fast = 24
_stack__value = 368
_stack__vnumber = 280
_stack__vsigned = 16 _stack__vsigned = 16
_stack__vstring = 144 _stack__vstring = 128
_stack__vunsigned = 24 _stack__vunsigned = 24
) )
@ -70,6 +74,8 @@ var (
_ = _subr__u64toa _ = _subr__u64toa
_ = _subr__unquote _ = _subr__unquote
_ = _subr__validate_one _ = _subr__validate_one
_ = _subr__validate_utf8
_ = _subr__validate_utf8_fast
_ = _subr__value _ = _subr__value
_ = _subr__vnumber _ = _subr__vnumber
_ = _subr__vsigned _ = _subr__vsigned
@ -93,6 +99,8 @@ const (
_ = _stack__u64toa _ = _stack__u64toa
_ = _stack__unquote _ = _stack__unquote
_ = _stack__validate_one _ = _stack__validate_one
_ = _stack__validate_utf8
_ = _stack__validate_utf8_fast
_ = _stack__value _ = _stack__value
_ = _stack__vnumber _ = _stack__vnumber
_ = _stack__vsigned _ = _stack__vsigned

View file

@ -25,7 +25,7 @@ type ValueType int
type ParsingError uint type ParsingError uint
type SearchingError uint type SearchingError uint
// !NOT MODIFIED ONLY. // NOTE: !NOT MODIFIED ONLY.
// This definitions are followed in native/types.h. // This definitions are followed in native/types.h.
const ( const (
@ -75,6 +75,7 @@ const (
ERR_RECURSE_EXCEED_MAX ParsingError = 7 ERR_RECURSE_EXCEED_MAX ParsingError = 7
ERR_FLOAT_INFINITY ParsingError = 8 ERR_FLOAT_INFINITY ParsingError = 8
ERR_MISMATCH ParsingError = 9 ERR_MISMATCH ParsingError = 9
ERR_INVALID_UTF8 ParsingError = 10
) )
var _ParsingErrors = []string{ var _ParsingErrors = []string{
@ -88,6 +89,7 @@ var _ParsingErrors = []string{
ERR_RECURSE_EXCEED_MAX : "recursion exceeded max depth", ERR_RECURSE_EXCEED_MAX : "recursion exceeded max depth",
ERR_FLOAT_INFINITY : "float number is infinity", ERR_FLOAT_INFINITY : "float number is infinity",
ERR_MISMATCH : "mismatched type with value", ERR_MISMATCH : "mismatched type with value",
ERR_INVALID_UTF8 : "invalid UTF8",
} }
func (self ParsingError) Error() string { func (self ParsingError) Error() string {

201
licenses/LICENSE-simdjson Normal file
View file

@ -0,0 +1,201 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "{}"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright 2018-2023 The simdjson authors
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

View file

@ -48,13 +48,13 @@ static inline unsigned ctz10_u32(const uint32_t v) {
if (v < 1000000) return 6; if (v < 1000000) return 6;
if (v < 10000000) return 7; if (v < 10000000) return 7;
if (v < 100000000) return 8; if (v < 100000000) return 8;
return 9; else return 9;
} else { } else {
if (v < 10) return 1; if (v < 10) return 1;
if (v < 100) return 2; if (v < 100) return 2;
if (v < 1000) return 3; if (v < 1000) return 3;
if (v < 10000) return 4; if (v < 10000) return 4;
return 5; else return 5;
} }
} }

View file

@ -54,7 +54,7 @@ static inline unsigned ctz10(const uint64_t v) {
if (v < 100000000000000ull) return 14; if (v < 100000000000000ull) return 14;
if (v < 1000000000000000ull) return 15; if (v < 1000000000000000ull) return 15;
if (v < 10000000000000000ull) return 16; if (v < 10000000000000000ull) return 16;
return 17; else return 17;
} }
if (v < 10ull) return 1; if (v < 10ull) return 1;
if (v < 100ull) return 2; if (v < 100ull) return 2;
@ -65,7 +65,7 @@ static inline unsigned ctz10(const uint64_t v) {
if (v < 10000000ull) return 7; if (v < 10000000ull) return 7;
if (v < 100000000ull) return 8; if (v < 100000000ull) return 8;
if (v < 1000000000ull) return 9; if (v < 1000000000ull) return 9;
return 10; else return 10;
} }

View file

@ -14,6 +14,8 @@
* limitations under the License. * limitations under the License.
*/ */
#include "native.h" #include "native.h"
#include "test/xprintf.h"
#include "test/xassert.h"
#include "fastbytes.c" #include "fastbytes.c"
#include "fastfloat.c" #include "fastfloat.c"
#include "fastint.c" #include "fastint.c"

View file

@ -112,8 +112,8 @@ typedef struct {
} JsonState; } JsonState;
typedef struct { typedef struct {
int sp; int64_t sp;
int vt[MAX_RECURSE]; int64_t vt[MAX_RECURSE];
} StateMachine; } StateMachine;
int f64toa(char *out, double val); int f64toa(char *out, double val);
@ -144,9 +144,10 @@ long skip_number(const GoString *src, long *p);
bool atof_eisel_lemire64(uint64_t mant, int exp10, int sgn, double *val); bool atof_eisel_lemire64(uint64_t mant, int exp10, int sgn, double *val);
double atof_native(const char *sp, ssize_t nb, char *dbuf, ssize_t cap); double atof_native(const char *sp, ssize_t nb, char *dbuf, ssize_t cap);
ssize_t utf8_validate(const char *sp, ssize_t nb);
long validate_string(const GoString *src, long *p); long validate_string(const GoString *src, long *p);
long validate_one(const GoString *src, long *p, StateMachine *m); long validate_one(const GoString *src, long *p, StateMachine *m);
long validate_utf8(const GoString *src, long *p, StateMachine *m);
long validate_utf8_fast(const GoString *src);
long skip_one_fast(const GoString *src, long *p); long skip_one_fast(const GoString *src, long *p);
long get_by_path(const GoString *src, long *p, const GoSlice *path); long get_by_path(const GoString *src, long *p, const GoSlice *path);

View file

@ -15,6 +15,7 @@
*/ */
#include "native.h" #include "native.h"
#include "utils.h"
#include <stdint.h> #include <stdint.h>
/** String Quoting **/ /** String Quoting **/
@ -108,27 +109,6 @@ static const quoted_t _HtmlQuoteTab[256] = {
[0xa9] = { .n = 6, .s = "\\u2029" }, [0xa9] = { .n = 6, .s = "\\u2029" },
}; };
static inline void memcpy_p8(char *dp, const char *sp, ssize_t nb) {
if (nb >= 4) { *(uint32_t *)dp = *(const uint32_t *)sp; sp += 4, dp += 4, nb -= 4; }
if (nb >= 2) { *(uint16_t *)dp = *(const uint16_t *)sp; sp += 2, dp += 2, nb -= 2; }
if (nb >= 1) { *dp = *sp; }
}
static inline void memcpy_p16(char *dp, const char *sp, size_t nb) {
if (nb >= 8) { *(uint64_t *)dp = *(const uint64_t *)sp; sp += 8, dp += 8, nb -= 8; }
if (nb >= 4) { *(uint32_t *)dp = *(const uint32_t *)sp; sp += 4, dp += 4, nb -= 4; }
if (nb >= 2) { *(uint16_t *)dp = *(const uint16_t *)sp; sp += 2, dp += 2, nb -= 2; }
if (nb >= 1) { *dp = *sp; }
}
static inline void memcpy_p32(char *dp, const char *sp, size_t nb) {
if (nb >= 16) { _mm_storeu_si128((void *)dp, _mm_loadu_si128((const void *)sp)); sp += 16, dp += 16, nb -= 16; }
if (nb >= 8) { *(uint64_t *)dp = *(const uint64_t *)sp; sp += 8, dp += 8, nb -= 8; }
if (nb >= 4) { *(uint32_t *)dp = *(const uint32_t *)sp; sp += 4, dp += 4, nb -= 4; }
if (nb >= 2) { *(uint16_t *)dp = *(const uint16_t *)sp; sp += 2, dp += 2, nb -= 2; }
if (nb >= 1) { *dp = *sp; }
}
static inline __m128i _mm_find_quote(__m128i vv) { static inline __m128i _mm_find_quote(__m128i vv) {
__m128i e1 = _mm_cmpgt_epi8 (vv, _mm_set1_epi8(-1)); __m128i e1 = _mm_cmpgt_epi8 (vv, _mm_set1_epi8(-1));
__m128i e2 = _mm_cmpgt_epi8 (vv, _mm_set1_epi8(31)); __m128i e2 = _mm_cmpgt_epi8 (vv, _mm_set1_epi8(31));

View file

@ -16,6 +16,7 @@
#include "native.h" #include "native.h"
#include "utf8.h" #include "utf8.h"
#include "utils.h"
static const uint64_t ODD_MASK = 0xaaaaaaaaaaaaaaaa; static const uint64_t ODD_MASK = 0xaaaaaaaaaaaaaaaa;
static const uint64_t EVEN_MASK = 0x5555555555555555; static const uint64_t EVEN_MASK = 0x5555555555555555;
@ -41,7 +42,7 @@ static inline uint64_t add32(uint64_t v1, uint64_t v2, uint64_t *vo) {
} }
static inline uint64_t add64(uint64_t v1, uint64_t v2, uint64_t *vo) { static inline uint64_t add64(uint64_t v1, uint64_t v2, uint64_t *vo) {
uint64_t v; unsigned long long v;
uint64_t c = __builtin_uaddll_overflow(v1, v2, &v); uint64_t c = __builtin_uaddll_overflow(v1, v2, &v);
/* set the carry */ /* set the carry */
@ -107,7 +108,7 @@ static inline int64_t advance_dword(const GoString *src, long *p, long dec, int6
return ret; return ret;
} else { } else {
*p -= dec; *p -= dec;
for (int i = 0; src->buf[*p] == (val & 0xff); i++, ++*p) { val >>= 8; } for (int i = 0; src->buf[*p] == (val & 0xff) && i < 4; i++, ++*p) { val >>= 8; }
return -ERR_INVAL; return -ERR_INVAL;
} }
} }
@ -368,12 +369,11 @@ static inline int _mm_nonascii_mask(__m128i v) {
static inline ssize_t advance_string_validate(const GoString *src, long p, int64_t *ep) { static inline ssize_t advance_string_validate(const GoString *src, long p, int64_t *ep) {
char ch; char ch;
uint64_t m0, m1, m2, m3; uint64_t m0, m1, m2;
uint64_t es, fe, os; uint64_t es, fe, os;
uint64_t cr = 0; uint64_t cr = 0;
long qp = 0; long qp = 0;
long np = 0; long np = 0;
long up = 0;
/* buffer pointers */ /* buffer pointers */
size_t nb = src->len; size_t nb = src->len;
@ -406,7 +406,6 @@ static inline ssize_t advance_string_validate(const GoString *src, long p, int64
uint32_t s0, s1; uint32_t s0, s1;
uint32_t t0, t1; uint32_t t0, t1;
uint32_t c0, c1; uint32_t c0, c1;
uint32_t u0, u1;
#else #else
/* initialize vectors */ /* initialize vectors */
__m128i v0; __m128i v0;
@ -420,7 +419,6 @@ static inline ssize_t advance_string_validate(const GoString *src, long p, int64
uint32_t s0, s1, s2, s3; uint32_t s0, s1, s2, s3;
uint32_t t0, t1, t2, t3; uint32_t t0, t1, t2, t3;
uint32_t c0, c1, c2, c3; uint32_t c0, c1, c2, c3;
uint32_t u0, u1, u2, u3;
#endif #endif
#define m0_mask(add) \ #define m0_mask(add) \
@ -430,7 +428,6 @@ static inline ssize_t advance_string_validate(const GoString *src, long p, int64
es = add(os, m1, &cr) << 1; \ es = add(os, m1, &cr) << 1; \
m0 &= ~(fe & (es ^ EVEN_MASK)); m0 &= ~(fe & (es ^ EVEN_MASK));
simd_advance:
/* 64-byte SIMD loop */ /* 64-byte SIMD loop */
while (likely(nb >= 64)) { while (likely(nb >= 64)) {
#if USE_AVX2 #if USE_AVX2
@ -442,12 +439,9 @@ simd_advance:
t1 = _mm256_get_mask(v1, cx); t1 = _mm256_get_mask(v1, cx);
c0 = _mm256_cchars_mask(v0); c0 = _mm256_cchars_mask(v0);
c1 = _mm256_cchars_mask(v1); c1 = _mm256_cchars_mask(v1);
u0 = _mm256_nonascii_mask(v0);
u1 = _mm256_nonascii_mask(v1);
m0 = ((uint64_t)s1 << 32) | (uint64_t)s0; m0 = ((uint64_t)s1 << 32) | (uint64_t)s0;
m1 = ((uint64_t)t1 << 32) | (uint64_t)t0; m1 = ((uint64_t)t1 << 32) | (uint64_t)t0;
m2 = ((uint64_t)c1 << 32) | (uint64_t)c0; m2 = ((uint64_t)c1 << 32) | (uint64_t)c0;
m3 = ((uint64_t)u1 << 32) | (uint64_t)u0;
#else #else
v0 = _mm_loadu_si128 ((const void *)(sp + 0)); v0 = _mm_loadu_si128 ((const void *)(sp + 0));
v1 = _mm_loadu_si128 ((const void *)(sp + 16)); v1 = _mm_loadu_si128 ((const void *)(sp + 16));
@ -465,14 +459,9 @@ simd_advance:
c1 = _mm_cchars_mask(v1); c1 = _mm_cchars_mask(v1);
c2 = _mm_cchars_mask(v2); c2 = _mm_cchars_mask(v2);
c3 = _mm_cchars_mask(v3); c3 = _mm_cchars_mask(v3);
u0 = _mm_nonascii_mask(v0);
u1 = _mm_nonascii_mask(v1);
u2 = _mm_nonascii_mask(v2);
u3 = _mm_nonascii_mask(v3);
m0 = ((uint64_t)s3 << 48) | ((uint64_t)s2 << 32) | ((uint64_t)s1 << 16) | (uint64_t)s0; m0 = ((uint64_t)s3 << 48) | ((uint64_t)s2 << 32) | ((uint64_t)s1 << 16) | (uint64_t)s0;
m1 = ((uint64_t)t3 << 48) | ((uint64_t)t2 << 32) | ((uint64_t)t1 << 16) | (uint64_t)t0; m1 = ((uint64_t)t3 << 48) | ((uint64_t)t2 << 32) | ((uint64_t)t1 << 16) | (uint64_t)t0;
m2 = ((uint64_t)c3 << 48) | ((uint64_t)c2 << 32) | ((uint64_t)c1 << 16) | (uint64_t)c0; m2 = ((uint64_t)c3 << 48) | ((uint64_t)c2 << 32) | ((uint64_t)c1 << 16) | (uint64_t)c0;
m3 = ((uint64_t)u3 << 48) | ((uint64_t)u2 << 32) | ((uint64_t)u1 << 16) | (uint64_t)u0;
#endif #endif
@ -488,7 +477,6 @@ simd_advance:
qp = m0 ? __builtin_ctzll(m0) : 64; qp = m0 ? __builtin_ctzll(m0) : 64;
np = m2 ? __builtin_ctzll(m2) : 64; np = m2 ? __builtin_ctzll(m2) : 64;
up = m3 ? __builtin_ctzll(m3) : 64;
/* get the position of end quote */ /* get the position of end quote */
if (m0 != 0) { if (m0 != 0) {
@ -498,9 +486,6 @@ simd_advance:
return -ERR_INVAL; return -ERR_INVAL;
} }
if (up < qp) {
goto valid_utf8;
}
return sp - ss + qp + 1; return sp - ss + qp + 1;
} }
@ -511,10 +496,6 @@ simd_advance:
return -ERR_INVAL; return -ERR_INVAL;
} }
if (unlikely(m3 != 0)) {
goto valid_utf8;
}
/* move to the next block */ /* move to the next block */
sp += 64; sp += 64;
nb -= 64; nb -= 64;
@ -527,11 +508,9 @@ simd_advance:
s0 = _mm256_get_mask (v0, cq); s0 = _mm256_get_mask (v0, cq);
t0 = _mm256_get_mask (v0, cx); t0 = _mm256_get_mask (v0, cx);
c0 = _mm256_cchars_mask(v0); c0 = _mm256_cchars_mask(v0);
u0 = _mm256_nonascii_mask(v0);
m0 = (uint64_t)s0; m0 = (uint64_t)s0;
m1 = (uint64_t)t0; m1 = (uint64_t)t0;
m2 = (uint64_t)c0; m2 = (uint64_t)c0;
m3 = (uint64_t)u0;
#else #else
v0 = _mm_loadu_si128 ((const void *)(sp + 0)); v0 = _mm_loadu_si128 ((const void *)(sp + 0));
v1 = _mm_loadu_si128 ((const void *)(sp + 16)); v1 = _mm_loadu_si128 ((const void *)(sp + 16));
@ -541,12 +520,9 @@ simd_advance:
t1 = _mm_get_mask(v1, cx); t1 = _mm_get_mask(v1, cx);
c0 = _mm_cchars_mask(v0); c0 = _mm_cchars_mask(v0);
c1 = _mm_cchars_mask(v1); c1 = _mm_cchars_mask(v1);
u0 = _mm_nonascii_mask(v0);
u1 = _mm_nonascii_mask(v1);
m0 = ((uint64_t)s1 << 16) | (uint64_t)s0; m0 = ((uint64_t)s1 << 16) | (uint64_t)s0;
m1 = ((uint64_t)t1 << 16) | (uint64_t)t0; m1 = ((uint64_t)t1 << 16) | (uint64_t)t0;
m2 = ((uint64_t)c1 << 16) | (uint64_t)c0; m2 = ((uint64_t)c1 << 16) | (uint64_t)c0;
m3 = ((uint64_t)u1 << 16) | (uint64_t)u0;
#endif #endif
/** update first quote position */ /** update first quote position */
@ -560,19 +536,14 @@ simd_advance:
} }
qp = m0 ? __builtin_ctzll(m0) : 64; qp = m0 ? __builtin_ctzll(m0) : 64;
up = m3 ? __builtin_ctzll(m3) : 64;
np = m2 ? __builtin_ctzll(m2) : 64; np = m2 ? __builtin_ctzll(m2) : 64;
/* get the position of end quote */ /* get the position of end quote */
if (m0 != 0) { if (m0 != 0) {
if (unlikely(np < qp)) { if (unlikely(np < qp)) {
ep_seterr(sp - ss + np) ep_seterr(sp - ss + np)
return -ERR_INVAL; return -ERR_INVAL;
} }
if (up < qp) {
goto valid_utf8;
}
return sp - ss + qp + 1; return sp - ss + qp + 1;
} }
@ -582,10 +553,6 @@ simd_advance:
return -ERR_INVAL; return -ERR_INVAL;
} }
if (m3 != 0) {
goto valid_utf8;
}
/* move to the next block */ /* move to the next block */
sp += 32; sp += 32;
nb -= 32; nb -= 32;
@ -601,7 +568,6 @@ simd_advance:
} }
} }
remain:
/* handle the remaining bytes with scalar code */ /* handle the remaining bytes with scalar code */
while (nb > 0) { while (nb > 0) {
ch = *sp; ch = *sp;
@ -626,43 +592,9 @@ remain:
return -ERR_INVAL; return -ERR_INVAL;
} }
/* valid utf8 chars */
if (ch & 0x80) {
uint32_t ubin = nb >= 4 ? *(uint32_t*)sp : less4byte_to_uint32(sp, nb);
if ((up = valid_utf8_4byte(ubin))) {
sp += up, nb -= up;
continue;
}
ep_seterr(sp - ss)
return -ERR_INVAL;
}
sp++, nb--; sp++, nb--;
} }
return -ERR_EOF; return -ERR_EOF;
valid_utf8:
sp += up, nb -= up;
while (likely(nb >= 4)) {
up = valid_utf8_4byte(*(uint32_t*)sp);
if (unlikely(up == 0)) {
ep_seterr(sp - ss)
return -ERR_INVAL;
}
/* check continous utf-8 */
sp += up, nb -= up;
if (nb > 0 && (*(uint8_t*)sp & 0x80)) {
continue;
}
/* clear the last carried bit */
cr = 0;
goto simd_advance;
}
goto remain;
#undef ep_init #undef ep_init
#undef ep_setc #undef ep_setc
#undef ep_setx #undef ep_setx
@ -1640,24 +1572,6 @@ static always_inline long skip_number_fast(const GoString *src, long *p) {
return vi; return vi;
} }
static always_inline void memcpy_p64(char * restrict dp, const char * restrict sp, size_t n) {
long nb = n;
#if USE_AVX2
if (nb >= 32) { _mm256_storeu_si256((void *)dp, _mm256_loadu_si256((const void *)sp)); sp += 32, dp += 32, nb -= 32; }
#endif
while (nb >= 16) { _mm_storeu_si128((void *)dp, _mm_loadu_si128((const void *)sp)); sp += 16, dp += 16, nb -= 16; }
if (nb >= 8) { *(uint64_t *)dp = *(const uint64_t *)sp; sp += 8, dp += 8, nb -= 8; }
if (nb >= 4) { *(uint32_t *)dp = *(const uint32_t *)sp; sp += 4, dp += 4, nb -= 4; }
if (nb >= 2) { *(uint16_t *)dp = *(const uint16_t *)sp; sp += 2, dp += 2, nb -= 2; }
if (nb >= 1) { *dp = *sp; }
}
static always_inline bool vec_cross_page(const void * p, size_t n) {
#define PAGE_SIZE 4096
return (((size_t)(p)) & (PAGE_SIZE - 1)) > (PAGE_SIZE - n);
#undef PAGE_SIZE
}
static always_inline long skip_container_fast(const GoString *src, long *p, char lc, char rc) { static always_inline long skip_container_fast(const GoString *src, long *p, char lc, char rc) {
long nb = src->len - *p; long nb = src->len - *p;
const char *s = src->buf + *p; const char *s = src->buf + *p;
@ -1956,3 +1870,20 @@ err_inval:
*p -= 1; // backward error position *p -= 1; // backward error position
return -ERR_INVAL; return -ERR_INVAL;
} }
// validate_utf8 scans src starting at byte offset *p and records the offset of
// every byte that does not start a valid UTF-8 sequence into m->vt, by
// delegating to validate_utf8_with_errors.
// It returns that helper's result and leaves *p as the helper set it: per the
// helper's code, 0 once the whole input has been scanned, or a nonzero value
// when no more error offsets can be stored in m.
long validate_utf8(const GoString *src, long *p, StateMachine *m) {
// Precondition: *p lies inside src. xassert compiles to a no-op unless DEBUG is defined.
xassert(*p >= 0 && src->len > *p);
return validate_utf8_with_errors(src->buf, src->len, p, m);
}
// validate_utf8_fast checks whether the whole string s is valid UTF-8.
// It returns 0 when s is valid. Otherwise it returns the result of
// validate_utf8_errors, which encodes the byte offset `pos` of the first
// invalid sequence as the negative value -(pos + 1).
long validate_utf8_fast(const GoString *s) {
#if USE_AVX2
/* fast path for valid utf8: only a zero result of the AVX2 validator is used;
 * any other result falls through to the scalar scan, which locates the error */
if (validate_utf8_avx2(s) == 0) {
return 0;
}
#endif
return validate_utf8_errors(s);
}

View file

@ -17,6 +17,7 @@
#ifndef XASSERT_H #ifndef XASSERT_H
#define XASSERT_H #define XASSERT_H
#ifndef DEBUG #ifndef DEBUG
#define xassert(expr) ((void)0) #define xassert(expr) ((void)0)
#else #else

View file

@ -14,11 +14,23 @@
* limitations under the License. * limitations under the License.
*/ */
#ifndef XPRINTF_H #pragma once
#define XPRINTF_H
#include <sys/types.h> #include <sys/types.h>
// Logging helpers.
// When LOG_LEVEL is defined at build time, DEBUG is defined as well and each
// LOG_* macro forwards its arguments to xprintf if the level is high enough:
//   LOG_TRACE prints when LOG_LEVEL >= 0,
//   LOG_DEBUG prints when LOG_LEVEL >= 1,
//   LOG_INFO  prints when LOG_LEVEL >= 2.
// When LOG_LEVEL is not defined, all three expand to a no-op expression.
#ifdef LOG_LEVEL
#define DEBUG
#define LOG_TRACE(_VA_ARGS__...) do { if (LOG_LEVEL >= 0) xprintf(_VA_ARGS__ ); } while (0)
#define LOG_DEBUG(_VA_ARGS__...) do { if (LOG_LEVEL >= 1) xprintf(_VA_ARGS__ ); } while (0)
#define LOG_INFO(_VA_ARGS__...) do { if (LOG_LEVEL >= 2) xprintf(_VA_ARGS__ ); } while (0)
#else
#define LOG_TRACE(_VA_ARGS__...) ((void)0)
#define LOG_DEBUG(_VA_ARGS__...) ((void)0)
#define LOG_INFO(_VA_ARGS__...) ((void)0)
#endif
// Note: this code is cross-compiled, so we can't rely on system-specific predefined macros here.
#if USE_APPLE
static inline void __attribute__((naked)) write_syscall(const char *s, size_t n) static inline void __attribute__((naked)) write_syscall(const char *s, size_t n)
{ {
asm volatile( asm volatile(
@ -35,6 +47,24 @@ static inline void __attribute__((naked)) write_syscall(const char *s, size_t n)
"retq" "retq"
"\n"); "\n");
} }
#else
// write_syscall (non-Apple variant) writes the n bytes at s to file
// descriptor 1 by issuing a raw `syscall` instruction with rax = 1
// (presumably the Linux x86-64 `write` call — confirm against the target ABI).
//
// The function is declared `naked`, so the body consists solely of the asm
// below: it reads its arguments directly from the incoming registers
// (assumes s in %rdi and n in %rsi, as the moves below imply) and returns
// with its own `retq`.
static inline void __attribute__((naked)) write_syscall(const char *s, size_t n)
{
asm volatile(
// rdx = n  (third syscall argument: byte count)
"movq %rsi, %rdx"
"\n"
// rsi = s  (second syscall argument: buffer)
"movq %rdi, %rsi"
"\n"
// rdi = 1  (first syscall argument: file descriptor 1)
"movq $1, %rdi"
"\n"
// rax = 1  (syscall number)
"movq $1, %rax"
"\n"
"syscall"
"\n"
// explicit return: a naked function has no compiler-generated epilogue
"retq"
"\n");
}
#endif
static inline void printch(const char ch) static inline void printch(const char ch)
{ {
@ -115,7 +145,7 @@ static inline void printhex(uintptr_t v)
printstr(p); printstr(p);
} }
#define MAX_BUF_LEN 100 #define MAX_BUF_LEN 1000
static inline void printbytes(GoSlice *s) static inline void printbytes(GoSlice *s)
{ {
@ -150,9 +180,8 @@ static inline void printgostr(GoString *s)
printch('"'); printch('"');
} }
static inline void xprintf(const char *fmt, ...) static inline void do_xprintf(const char *fmt, ...)
{ {
#ifdef DEBUG
__builtin_va_list va; __builtin_va_list va;
char buf[256] = {}; char buf[256] = {};
char *p = buf; char *p = buf;
@ -227,7 +256,26 @@ static inline void xprintf(const char *fmt, ...)
*p = 0; *p = 0;
printstr(buf); printstr(buf);
} }
#endif
} }
#endif // XPRINTF_H #ifdef DEBUG
#define xprintf(_VA_ARGS__...) do_xprintf(_VA_ARGS__)
#else
#define xprintf(_VA_ARGS__...) ((void)0)
#endif
// print_longhex dumps `bytes` bytes starting at `input` in hexadecimal.
// The output is the label `s` followed by " : ", then each byte padded to
// two hex digits, with a '-' after every group of four bytes (but not after
// the final byte), and a terminating newline. Everything goes through
// xprintf, so nothing is printed when xprintf is compiled out.
static always_inline void print_longhex(const void *input, const char* s, int bytes) {
    const uint8_t *data = (const uint8_t *)input;
    int idx = 0;

    xprintf("%s : ", s);
    while (idx < bytes) {
        uintptr_t value = data[idx];

        /* pad single-digit values so every byte occupies two columns */
        if (value < 0x10) {
            xprintf("0");
        }
        xprintf("%x", value);

        idx++;
        /* group separator after each full block of four, except at the very end */
        if (idx < bytes && idx % 4 == 0) {
            xprintf("-");
        }
    }
    xprintf("\n");
}
// psimd(ptr) hex-dumps the object that `ptr` points to (sizeof(*ptr) bytes)
// through print_longhex, labelled with the source text of the argument.
#define psimd(simd) print_longhex((const void *)(simd), #simd, sizeof(*simd))

View file

@ -2,7 +2,7 @@
#ifndef TYPES_H #ifndef TYPES_H
#define TYPES_H #define TYPES_H
// !NOT MODIFIED ONLY. // NOTE: !NOT MODIFIED ONLY.
// This definitions are copied from internal/native/types/types.go. // This definitions are copied from internal/native/types/types.go.
#define V_EOF 1 #define V_EOF 1
@ -34,6 +34,8 @@
#define ERR_NUMBER_FMT 6 #define ERR_NUMBER_FMT 6
#define ERR_RECURSE_MAX 7 #define ERR_RECURSE_MAX 7
#define ERR_FLOAT_INF 8 #define ERR_FLOAT_INF 8
#define ERR_MISMATCH 9
#define ERR_INVAL_UTF8 10
#define MAX_RECURSE 4096 #define MAX_RECURSE 4096

View file

@ -1,5 +1,3 @@
#ifndef UTF8_H
#define UTF8_H
/* /*
* Copyright (C) 2019 Yaoyuan <ibireme@gmail.com>. * Copyright (C) 2019 Yaoyuan <ibireme@gmail.com>.
* *
@ -9,6 +7,20 @@
* *
* http://www.apache.org/licenses/LICENSE-2.0 * http://www.apache.org/licenses/LICENSE-2.0
* *
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Copyright 2018-2023 The simdjson authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software * Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, * distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -19,6 +31,13 @@
* Modifications are Copyright 2022 ByteDance Authors. * Modifications are Copyright 2022 ByteDance Authors.
*/ */
#pragma once
#include "native.h"
#include "utils.h"
#include "test/xassert.h"
#include "test/xprintf.h"
static inline ssize_t valid_utf8_4byte(uint32_t ubin) { static inline ssize_t valid_utf8_4byte(uint32_t ubin) {
/* /*
Each unicode code point is encoded as 1 to 4 bytes in UTF-8 encoding, Each unicode code point is encoded as 1 to 4 bytes in UTF-8 encoding,
@ -104,12 +123,371 @@ static inline ssize_t valid_utf8_4byte(uint32_t ubin) {
return 0; return 0;
} }
static inline uint32_t less4byte_to_uint32(const char* sp, size_t nb) { static always_inline long write_error(int pos, StateMachine *m, size_t msize) {
if (nb == 1) return *(uint8_t*)sp; if (m->sp >= msize) {
if (nb == 2) return *(uint16_t*)sp; return -1;
uint32_t hi_1 = (*(uint8_t*)(sp + 2)); }
uint32_t lo_2 = *(uint16_t*)(sp); m->vt[m->sp++] = pos;
return hi_1 << 16 | lo_2; return 0;
} }
// Scalar UTF-8 validation that records every error instead of stopping at the
// first one.
//
// Scans src[*p .. len). For each byte that does not start a valid UTF-8
// sequence, its offset (relative to src) is stored in m->vt through
// write_error and scanning resumes at the next byte. At most MAX_RECURSE
// (4096) offsets can be recorded.
//
// Returns 0 after the whole input has been scanned (*p == len on exit), or
// the nonzero result of write_error when no more offsets can be stored; in
// that case *p is the offset of the error that could not be recorded.
static always_inline long validate_utf8_with_errors(const char *src, long len, long *p, StateMachine *m) {
    const char* start = src + *p;
    const char* end   = src + len;

    /* Bulk phase: at least 4 readable bytes remain, so a full 32-bit word can
     * be loaded. The bound is expressed as a distance rather than `end - 3`,
     * which would form a pointer before the buffer (undefined behaviour) for
     * inputs shorter than 3 bytes. */
    while (end - start > 3) {
        uint32_t u = (*(uint32_t*)(start));
        if ((unsigned)(*start) < 0x80) { /* single-byte (ASCII) character */
            start += 1;
            continue;
        }
        size_t n = valid_utf8_4byte(u);
        if (n != 0) { // valid utf
            start += n;
            continue;
        }
        long err = write_error(start - src, m, MAX_RECURSE);
        if (err) {
            *p = start - src;
            return err;
        }
        /* skip the offending byte and try to resynchronise */
        start += 1;
    }

    /* Tail phase: fewer than 4 bytes remain, so they are copied into a zeroed
     * word (memcpy_p4 presumably copies exactly end - start bytes) instead of
     * being loaded directly, to avoid reading past the end of the buffer. */
    while (start < end) {
        if ((unsigned)(*start) < 0x80) {
            start += 1;
            continue;
        }
        uint32_t u = 0;
        memcpy_p4(&u, start, end - start);
        size_t n = valid_utf8_4byte(u);
        if (n != 0) { // valid utf
            start += n;
            continue;
        }
        long err = write_error(start - src, m, MAX_RECURSE);
        if (err) {
            *p = start - src;
            return err;
        }
        start += 1;
    }

    *p = start - src;
    return 0;
}
// validate_utf8_errors is the scalar validator that stops at the first error.
//
// Returns 0 if the whole string is valid UTF-8. Otherwise returns the byte
// offset `pos` of the first invalid sequence encoded as -(pos + 1), so that an
// error at offset 0 is still distinguishable from success.
static always_inline long validate_utf8_errors(const GoString* s) {
    const char* start = s->buf;
    const char* end   = s->buf + s->len;

    /* Bulk phase: at least 4 readable bytes remain, so a full 32-bit word can
     * be loaded. The bound is expressed as a distance rather than `end - 3`:
     * for inputs shorter than 3 bytes that expression forms a pointer before
     * the buffer, and for an empty string with a NULL buf it wraps around and
     * would let the loop dereference NULL. */
    while (end - start > 3) {
        uint32_t u = (*(uint32_t*)(start));
        if ((unsigned)(*start) < 0x80) { /* single-byte (ASCII) character */
            start += 1;
            continue;
        }
        size_t n = valid_utf8_4byte(u);
        if (n == 0) { // invalid utf
            return -(start - s->buf) - 1;
        }
        start += n;
    }

    /* Tail phase: fewer than 4 bytes remain, so they are copied into a zeroed
     * word (memcpy_p4 presumably copies exactly end - start bytes) instead of
     * being loaded directly, to avoid reading past the end of the buffer. */
    while (start < end) {
        if ((unsigned)(*start) < 0x80) {
            start += 1;
            continue;
        }
        uint32_t u = 0;
        memcpy_p4(&u, start, end - start);
        size_t n = valid_utf8_4byte(u);
        if (n == 0) { // invalid utf
            return -(start - s->buf) - 1;
        }
        start += n;
    }
    return 0;
}
// SIMD implementation
#if USE_AVX2
// Shifts every byte lane of `input` right by `shift` bits (zero fill).
// The shift is performed on 16-bit lanes; the mask then clears the bits that
// crossed over from the upper byte of each 16-bit lane into the lower one.
static always_inline __m256i simd256_shr(const __m256i input, const int shift) {
    const __m256i keep_bits   = _mm256_set1_epi8(0xFFu >> shift);
    const __m256i wide_shift  = _mm256_srli_epi16(input, shift);
    return _mm256_and_si256(wide_shift, keep_bits);
}
// simd256_prev(input, prev, N): for each byte position of `input`, yields the
// byte located N positions earlier in the stream made of `prev` followed by
// `input`. N must be a compile-time constant. The expansion carries no
// trailing semicolon, so the macro can be used like an ordinary expression.
#define simd256_prev(input, prev, N) _mm256_alignr_epi8((input), _mm256_permute2x128_si256((prev), (input), 0x21), 16 - (N))
// Produces an all-ones byte lane wherever `prev2` holds a byte of the form
// 111_____ or `prev3` holds a byte of the form 1111____, and zero elsewhere.
static always_inline __m256i must_be_2_3_continuation(const __m256i prev2, const __m256i prev3) {
    // Saturating subtraction: only bytes at or above the threshold stay > 0.
    const __m256i below_3byte_lead = _mm256_set1_epi8(0b11100000u-1);
    const __m256i below_4byte_lead = _mm256_set1_epi8(0b11110000u-1);
    const __m256i third_byte  = _mm256_subs_epu8(prev2, below_3byte_lead); // Only 111_____ will be > 0
    const __m256i fourth_byte = _mm256_subs_epu8(prev3, below_4byte_lead); // Only 1111____ will be > 0
    // The caller wants a boolean mask (all 1's). The differences above are
    // small (<= 64), so the signed greater-than comparison is safe.
    const __m256i either = _mm256_or_si256(third_byte, fourth_byte);
    return _mm256_cmpgt_epi8(either, _mm256_set1_epi8(0));
}
// Per-byte table lookup: the 16-entry `table` is replicated into both 128-bit
// halves of a vector, and each byte of `input` selects an entry from it via
// _mm256_shuffle_epi8.
static always_inline __m256i simd256_lookup16(const __m256i input, const uint8_t* table) {
    const __m256i lut = _mm256_setr_epi8(
        table[0],  table[1],  table[2],  table[3],
        table[4],  table[5],  table[6],  table[7],
        table[8],  table[9],  table[10], table[11],
        table[12], table[13], table[14], table[15],
        table[0],  table[1],  table[2],  table[3],
        table[4],  table[5],  table[6],  table[7],
        table[8],  table[9],  table[10], table[11],
        table[12], table[13], table[14], table[15]);
    return _mm256_shuffle_epi8(lut, input);
}
//
// Returns a non-zero vector if the block ends in the middle of a multibyte
// character, e.g. a 4-byte character that starts 3 bytes before the end.
//
static always_inline __m256i is_incomplete(const __m256i input) {
    // Only the last three bytes can be "too short" (cut off by the block end):
    // ... 1111____ 111_____ 11______
    // Every other position gets 255, which the saturating subtraction below
    // always reduces to zero.
    const uint8_t thresholds[32] = {
        255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
        255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
        0b11110000u-1, 0b11100000u-1, 0b11000000u-1};
    const __m256i limit = _mm256_loadu_si256((const __m256i_u *)(&thresholds[0]));
    return _mm256_subs_epu8(input, limit);
}
// check_special_cases classifies each pair (byte of prev1, byte of input) at
// the same lane position. Three 16-entry tables are consulted: one indexed by
// the high nibble of prev1, one by the low nibble of prev1 and one by the high
// nibble of input. The three lookups are ANDed together, so an error bit
// survives only if all three nibbles are compatible with that error class.
// The meaning of each bit is listed below; the result is combined further by
// the caller.
static always_inline __m256i check_special_cases(const __m256i input, const __m256i prev1) {
// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
// Bit 1 = Too Long (ASCII followed by continuation)
// Bit 2 = Overlong 3-byte
// Bit 4 = Surrogate
// Bit 5 = Overlong 2-byte
// Bit 7 = Two Continuations
const uint8_t TOO_SHORT = 1<<0; // 11______ 0_______
// 11______ 11______
const uint8_t TOO_LONG = 1<<1; // 0_______ 10______
const uint8_t OVERLONG_3 = 1<<2; // 11100000 100_____
const uint8_t SURROGATE = 1<<4; // 11101101 101_____
const uint8_t OVERLONG_2 = 1<<5; // 1100000_ 10______
const uint8_t TWO_CONTS = 1<<7; // 10______ 10______
const uint8_t TOO_LARGE = 1<<3; // 11110100 1001____
// 11110100 101_____
// 11110101 1001____
// 11110101 101_____
// 1111011_ 1001____
// 1111011_ 101_____
// 11111___ 1001____
// 11111___ 101_____
// TOO_LARGE_1000 and OVERLONG_4 deliberately share bit 6; the tables below
// tell the two cases apart through the low nibble of byte 1.
const uint8_t TOO_LARGE_1000 = 1<<6;
// 11110101 1000____
// 1111011_ 1000____
// 11111___ 1000____
const uint8_t OVERLONG_4 = 1<<6; // 11110000 1000____
const __m256i prev1_shr4 = simd256_shr(prev1, 4);
// Table 1: indexed by the high nibble of the previous byte (byte 1).
static const uint8_t tab1[16] = {
// 0_______ ________ <ASCII in byte 1>
TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
// 10______ ________ <continuation in byte 1>
TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
// 1100____ ________ <two byte lead in byte 1>
TOO_SHORT | OVERLONG_2,
// 1101____ ________ <two byte lead in byte 1>
TOO_SHORT,
// 1110____ ________ <three byte lead in byte 1>
TOO_SHORT | OVERLONG_3 | SURROGATE,
// 1111____ ________ <four+ byte lead in byte 1>
TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4,
};
__m256i byte_1_high = simd256_lookup16(prev1_shr4, tab1);
const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
__m256i prev1_low = _mm256_and_si256(prev1, _mm256_set1_epi8(0x0F));
// Table 2: indexed by the low nibble of the previous byte (byte 1).
static const uint8_t tab2[16] = {
// ____0000 ________
CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
// ____0001 ________
CARRY | OVERLONG_2,
// ____001_ ________
CARRY,
CARRY,
// ____0100 ________
CARRY | TOO_LARGE,
// ____0101 ________
CARRY | TOO_LARGE | TOO_LARGE_1000,
// ____011_ ________
CARRY | TOO_LARGE | TOO_LARGE_1000,
CARRY | TOO_LARGE | TOO_LARGE_1000,
// ____1___ ________
CARRY | TOO_LARGE | TOO_LARGE_1000,
CARRY | TOO_LARGE | TOO_LARGE_1000,
CARRY | TOO_LARGE | TOO_LARGE_1000,
CARRY | TOO_LARGE | TOO_LARGE_1000,
CARRY | TOO_LARGE | TOO_LARGE_1000,
// ____1101 ________
CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
CARRY | TOO_LARGE | TOO_LARGE_1000,
CARRY | TOO_LARGE | TOO_LARGE_1000
};
__m256i byte_1_low = simd256_lookup16(prev1_low, tab2);
const __m256i input_shr4 = simd256_shr(input, 4);
// Table 3: indexed by the high nibble of the current byte (byte 2).
static const uint8_t tab3[16] = {
// ________ 0_______ <ASCII in byte 2>
TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
// ________ 1000____
TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
// ________ 1001____
TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
// ________ 101_____
TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
// ________ 11______
TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT
};
__m256i byte_2_high = simd256_lookup16(input_shr4, tab3);
// A bit remains set only when all three lookups agree on it.
return _mm256_and_si256(_mm256_and_si256(byte_1_high, byte_1_low), byte_2_high);
}
// Combines the special-case bits `sc` with the positions that
// must_be_2_3_continuation() flags for the bytes two and three positions back.
// Flagged positions contribute bit 0x80, which is XORed into `sc`.
static always_inline __m256i check_multibyte_lengths(const __m256i input, const __m256i prev_input, const __m256i sc) {
    __m256i two_back   = simd256_prev(input, prev_input, 2);
    __m256i three_back = simd256_prev(input, prev_input, 3);
    __m256i required   = must_be_2_3_continuation(two_back, three_back);
    /* keep only the top bit of the boolean mask before mixing it into sc */
    __m256i required_top_bit = _mm256_and_si256(required, _mm256_set1_epi8(0x80));
    return _mm256_xor_si256(required_top_bit, sc);
}
// Checks one 32-byte block for UTF-8 errors, using `prev_input` to look back
// across the block boundary. Returns the combined error vector produced by
// check_special_cases() and check_multibyte_lengths().
static always_inline __m256i check_utf8_bytes(const __m256i input, const __m256i prev_input) {
    // Byte stream shifted by one position, so each lane sees its predecessor.
    __m256i one_back = simd256_prev(input, prev_input, 1);
    __m256i special  = check_special_cases(input, one_back);
    return check_multibyte_lengths(input, prev_input, special);
}
// Returns true when no byte of `input` has its top bit set.
static always_inline bool is_ascii(const __m256i input) {
    const int top_bits = _mm256_movemask_epi8(input);
    return top_bits == 0;
}
// Running state of the AVX2 UTF-8 validator, carried from one block of input
// to the next.
typedef struct {
// If this is nonzero, there has been a UTF-8 error.
// Errors are only ever ORed into this field, so it stays set once raised.
__m256i error;
// The last input we received
__m256i prev_input_block;
// Whether the last input we received was incomplete (used for ASCII fast path)
__m256i prev_incomplete;
} utf8_checker;
// Resets a checker: no error, an all-zero previous block, nothing incomplete.
static always_inline void utf8_checker_init(utf8_checker* checker) {
    const __m256i zero = _mm256_setzero_si256();
    checker->prev_incomplete  = zero;
    checker->prev_input_block = zero;
    checker->error            = zero;
}
// Returns true if any bit of the accumulated error vector is set.
static always_inline bool check_error(utf8_checker* checker) {
    const __m256i err = checker->error;
    /* testz yields 1 only when (err & err) is all zero */
    return _mm256_testz_si256(err, err) == 0;
}
// Runs the full UTF-8 check over the 64 bytes at `start` (two 32-byte
// blocks), accumulates any error into the checker and remembers the second
// block as context for the next call.
static always_inline void check64_utf(utf8_checker* checker, const uint8_t* start) {
    const __m256i lower = _mm256_loadu_si256((__m256i*)start);
    const __m256i upper = _mm256_loadu_si256((__m256i*)(start + 32));

    // check utf-8 chars: the first block looks back into the previous input,
    // the second block looks back into the first.
    __m256i found = check_utf8_bytes(lower, checker->prev_input_block);
    found = _mm256_or_si256(found, check_utf8_bytes(upper, lower));

    checker->error            = _mm256_or_si256(checker->error, found);
    checker->prev_incomplete  = is_incomplete(upper);
    checker->prev_input_block = upper;
}
// Checks the 64 bytes at `start`. Pure-ASCII input takes a fast path that
// only folds a pending "previous block was incomplete" flag into the error;
// anything else goes through check64_utf().
static always_inline void check64(utf8_checker* checker, const uint8_t* start) {
    const __m256i lower = _mm256_loadu_si256((__m256i*)start);
    const __m256i upper = _mm256_loadu_si256((__m256i*)(start + 32));

    // fast path for contiguous ASCII
    if (likely(is_ascii(_mm256_or_si256(lower, upper)))) {
        checker->error = _mm256_or_si256(checker->error, checker->prev_incomplete);
        return;
    }

    // check utf-8
    check64_utf(checker, start);
}
// Checks the 128 bytes at `start`, treated as two 64-byte halves. A half
// that is pure ASCII only folds the pending prev_incomplete flag into the
// error; a half containing non-ASCII bytes is passed to check64_utf().
static always_inline void check128(utf8_checker* checker, const uint8_t* start) {
    const __m256i v0 = _mm256_loadu_si256((__m256i*)start);
    const __m256i v1 = _mm256_loadu_si256((__m256i*)(start + 32));
    const __m256i v2 = _mm256_loadu_si256((__m256i*)(start + 64));
    const __m256i v3 = _mm256_loadu_si256((__m256i*)(start + 96));

    const __m256i first_half  = _mm256_or_si256(v0, v1);
    const __m256i second_half = _mm256_or_si256(v2, v3);

    // fast path: the full 128 bytes are ASCII
    if (likely(is_ascii(_mm256_or_si256(first_half, second_half)))) {
        checker->error = _mm256_or_si256(checker->error, checker->prev_incomplete);
        return;
    }

    // first 64 bytes are ASCII, so the non-ASCII bytes are in the next 64
    if (likely(is_ascii(first_half))) {
        checker->error = _mm256_or_si256(checker->error, checker->prev_incomplete);
        check64_utf(checker, start + 64);
        return;
    }

    // first 64 bytes contain non-ASCII; then deal with the next 64 bytes
    check64_utf(checker, start);
    if (likely(!is_ascii(second_half))) {
        check64_utf(checker, start + 64);
        return;
    }
    checker->error = _mm256_or_si256(checker->error, checker->prev_incomplete);
}
// End of input: a block left incomplete by the last check becomes an error.
static always_inline void check_eof(utf8_checker* checker) {
    const __m256i pending = checker->prev_incomplete;
    checker->error = _mm256_or_si256(checker->error, pending);
}
// Checks the final bytes [start, end) of the input. They are copied into a
// zero-filled 64-byte scratch buffer so that a full-width check64() can run,
// followed by the end-of-input check.
// The range is not bounds-checked here; the scratch buffer holds 64 bytes.
static always_inline void check_remain(utf8_checker* checker, const uint8_t* start, const uint8_t* end) {
    uint8_t padded[64] = {0};
    uint8_t* out = padded;
    for (const uint8_t* in = start; in < end; in++) {
        *out++ = *in;
    }
    check64(checker, padded);
    check_eof(checker);
}
// AVX2 UTF-8 validation of a whole string.
// Returns 0 if `s` is valid UTF-8 (an empty string is valid) and -1 otherwise.
// No error position is reported.
static always_inline long validate_utf8_avx2(const GoString* s) {
    // NOTE(review): as written this only rejects (buf == NULL && len == 0);
    // `s->len == 0` on the right-hand side may have been intended -- confirm.
    xassert(s->buf != NULL || s->len != 0);
    const uint8_t* start = (const uint8_t*)(s->buf);
    const uint8_t* end   = (const uint8_t*)(s->buf + s->len);

    /* check eof */
    if (s->len == 0) {
        return 0;
    }

    utf8_checker checker;
    utf8_checker_init(&checker);

    /* Loop bounds are written as distances (not `end - 128` / `end - 64`) so
     * that no pointer in front of the buffer is formed for short strings. */
    while ((end - start) > 128) {
        check128(&checker, start);
        /* The error vector is only ever ORed into, so once it is set the
         * final answer is already known: stop scanning. */
        if (check_error(&checker)) {
            return -1;
        }
        start += 128;
    }

    while ((end - start) > 64) {
        check64(&checker, start);
        start += 64;
    }

    /* at most 64 bytes are left at this point */
    check_remain(&checker, start, end);
    return check_error(&checker) ? -1 : 0;
}
#endif #endif

78
native/utils.h Normal file
View file

@ -0,0 +1,78 @@
/*
* Copyright 2022 ByteDance Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <immintrin.h>
#include <string.h>
#include "native.h"
// Returns true if an access of `n` bytes starting at `p` would extend beyond
// the end of the 4096-byte page that contains `p`.
//
// The page size is a local constant and not a temporary PAGE_SIZE macro, so
// a PAGE_SIZE definition coming from a system header is neither redefined nor
// removed by including this file.
static always_inline bool vec_cross_page(const void * p, size_t n) {
    const size_t page_size   = 4096;
    const size_t page_offset = ((size_t)(p)) & (page_size - 1);
    return page_offset > (page_size - n);
}
// Copies exactly 4 bytes from sp to dp with a single 32-bit load and store.
static always_inline void memcpy4 (void *__restrict dp, const void *__restrict sp) {
    const uint32_t *from = (const uint32_t *)sp;
    uint32_t *to = (uint32_t *)dp;
    *to = *from;
}
// Copies exactly 8 bytes from sp to dp with a single 64-bit load and store.
static always_inline void memcpy8 (void *__restrict dp, const void *__restrict sp) {
    const uint64_t *from = (const uint64_t *)sp;
    uint64_t *to = (uint64_t *)dp;
    *to = *from;
}
// Copies exactly 16 bytes from sp to dp using unaligned 128-bit load/store.
static always_inline void memcpy16 (void *__restrict dp, const void *__restrict sp) {
    const __m128i chunk = _mm_loadu_si128((const void *)(sp));
    _mm_storeu_si128((void *)(dp), chunk);
}
// Copies exactly 32 bytes from sp to dp: one unaligned 256-bit move when
// USE_AVX2 is set, otherwise two unaligned 128-bit moves.
static always_inline void memcpy32(void *__restrict dp, const void *__restrict sp) {
#if USE_AVX2
    const __m256i chunk = _mm256_loadu_si256((const void *)sp);
    _mm256_storeu_si256((void *)dp, chunk);
#else
    const char *from = (const char *)sp;
    char *to = (char *)dp;
    const __m128i lo = _mm_loadu_si128((const void *)(from));
    _mm_storeu_si128((void *)(to), lo);
    const __m128i hi = _mm_loadu_si128((const void *)(from + 16));
    _mm_storeu_si128((void *)(to + 16), hi);
#endif
}
// Copies exactly 64 bytes from sp to dp as two 32-byte halves, lower half first.
static always_inline void memcpy64(void *__restrict dp, const void *__restrict sp) {
    const char *from = (const char *)sp;
    char *to = (char *)dp;
    memcpy32(to, from);
    memcpy32(to + 32, from + 32);
}
// Partial copy for a remainder below 4 bytes: copies two bytes if nb >= 2,
// then one more byte if at least one is still left. At most 3 bytes are
// written regardless of nb.
static always_inline void memcpy_p4(void *__restrict dp, const void *__restrict sp, size_t nb) {
    const uint8_t *from = (const uint8_t *)sp;
    uint8_t *to = (uint8_t *)dp;
    if (nb >= 2) {
        *(uint16_t *)to = *(const uint16_t *)from;
        from += 2;
        to   += 2;
        nb   -= 2;
    }
    if (nb >= 1) {
        *to = *from;
    }
}
// Partial copy for a remainder below 8 bytes: moves 4 bytes first when
// nb >= 4, then hands what is left to memcpy_p4().
static always_inline void memcpy_p8(void *__restrict dp, const void *__restrict sp, ssize_t nb) {
    const char *from = (const char *)sp;
    char *to = (char *)dp;
    if (nb >= 4) {
        memcpy4(to, from);
        from += 4;
        to   += 4;
        nb   -= 4;
    }
    memcpy_p4(to, from, nb);
}
// Partial copy for a remainder below 16 bytes: moves 8 bytes first when
// nb >= 8, then hands what is left to memcpy_p8().
static always_inline void memcpy_p16(void *__restrict dp, const void *__restrict sp, size_t nb) {
    const char *from = (const char *)sp;
    char *to = (char *)dp;
    if (nb >= 8) {
        memcpy8(to, from);
        from += 8;
        to   += 8;
        nb   -= 8;
    }
    memcpy_p8(to, from, nb);
}
// Partial copy for a remainder below 32 bytes: moves 16 bytes first when
// nb >= 16, then hands what is left to memcpy_p16().
static always_inline void memcpy_p32(void *__restrict dp, const void *__restrict sp, size_t nb) {
    const char *from = (const char *)sp;
    char *to = (char *)dp;
    if (nb >= 16) {
        memcpy16(to, from);
        from += 16;
        to   += 16;
        nb   -= 16;
    }
    memcpy_p16(to, from, nb);
}
// Partial copy for a remainder below 64 bytes: moves 32 bytes first when
// nb >= 32, then hands what is left to memcpy_p32().
static always_inline void memcpy_p64(void *__restrict dp, const void *__restrict sp, size_t nb) {
    const char *from = (const char *)sp;
    char *to = (char *)dp;
    if (nb >= 32) {
        memcpy32(to, from);
        from += 32;
        to   += 32;
        nb   -= 32;
    }
    memcpy_p32(to, from, nb);
}

View file

@ -26,31 +26,9 @@ import (
`github.com/bytedance/sonic/decoder` `github.com/bytedance/sonic/decoder`
`github.com/bytedance/sonic/encoder` `github.com/bytedance/sonic/encoder`
`github.com/bytedance/sonic/option` `github.com/bytedance/sonic/option`
`github.com/bytedance/sonic/internal/native/types`
`github.com/bytedance/sonic/internal/rt` `github.com/bytedance/sonic/internal/rt`
) )
// checkTrailings verifies that only whitespace (as defined by
// types.SPACE_MASK) follows position pos in buf. It returns nil when the rest
// of buf is blank, and a decoder.SyntaxError with code ERR_INVALID_CHAR
// pointing at the first offending position otherwise.
func checkTrailings(buf string, pos int) error {
	end := len(buf)

	/* step over the trailing spaces */
	for pos < end {
		if (types.SPACE_MASK & (1 << buf[pos])) == 0 {
			break
		}
		pos++
	}

	/* anything left over is junk after the JSON value */
	if pos != end {
		return decoder.SyntaxError {
			Src  : buf,
			Pos  : pos,
			Code : types.ERR_INVALID_CHAR,
		}
	}

	/* reached EOF */
	return nil
}
type frozenConfig struct { type frozenConfig struct {
Config Config
encoderOpts encoder.Options encoderOpts encoder.Options
@ -77,6 +55,9 @@ func (cfg Config) Froze() API {
if cfg.NoNullSliceOrMap { if cfg.NoNullSliceOrMap {
api.encoderOpts |= encoder.NoNullSliceOrMap api.encoderOpts |= encoder.NoNullSliceOrMap
} }
if cfg.ValidateString {
api.encoderOpts |= encoder.ValidateString
}
// configure decoder options: // configure decoder options:
if cfg.UseInt64 { if cfg.UseInt64 {
@ -118,13 +99,13 @@ func (cfg frozenConfig) UnmarshalFromString(buf string, val interface{}) error {
dec := decoder.NewDecoder(buf) dec := decoder.NewDecoder(buf)
dec.SetOptions(cfg.decoderOpts) dec.SetOptions(cfg.decoderOpts)
err := dec.Decode(val) err := dec.Decode(val)
pos := dec.Pos()
/* check for errors */ /* check for errors */
if err != nil { if err != nil {
return err return err
} }
return checkTrailings(buf, pos)
return dec.CheckTrailings()
} }
// Unmarshal is implemented by sonic // Unmarshal is implemented by sonic

71
utf8/utf8.go Normal file
View file

@ -0,0 +1,71 @@
/*
* Copyright 2022 ByteDance Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package utf8
import (
`github.com/bytedance/sonic/internal/rt`
`github.com/bytedance/sonic/internal/native/types`
`github.com/bytedance/sonic/internal/native`
)
// CorrectWith appends src to dst, writing repl in place of every byte that
// native.ValidateUTF8 reports as invalid UTF-8, and returns the extended
// slice. Valid bytes are copied unchanged.
func CorrectWith(dst []byte, src []byte, repl string) []byte {
	input := rt.Mem2Str(src)
	total := len(input)

	/* the state machine is used to record the invalid positions */
	sm := types.NewStateMachine()
	sm.Sp = 0 // number of invalid positions recorded so far

	for next := 0; next < total; {
		copied := next
		rc := native.ValidateUTF8(&input, &next, sm)

		/* sanity check: there cannot be more invalid bytes than bytes */
		if sm.Sp != 0 && sm.Sp > total {
			panic("numbers of invalid utf8 exceed the string len!")
		}

		/* copy each valid run, then the replacement for the invalid byte */
		for i := 0; i < sm.Sp; i++ {
			bad := sm.Vt[i]
			dst = append(dst, input[copied:bad]...)
			dst = append(dst, repl...)
			copied = bad + 1
		}

		/* append the valid bytes scanned after the last invalid one */
		dst = append(dst, input[copied:next]...)

		/* the position list filled up: clear it and scan on from next */
		if rc != 0 {
			sm.Sp = 0
		}
	}

	types.FreeStateMachine(sm)
	return dst
}
// Validate is a simd-accelereated drop-in replacement for the standard library's utf8.Valid.
func Validate(src []byte) bool {
return ValidateString(rt.Mem2Str(src))
}
// ValidateString is like Validate, but takes a string.
func ValidateString(src string) bool {
	rc := native.ValidateUTF8Fast(&src)
	return rc == 0
}

138
utf8/utf8_test.go Normal file
View file

@ -0,0 +1,138 @@
/*
* Copyright 2022 ByteDance Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package utf8
import (
`testing`
`strings`
`github.com/stretchr/testify/assert`
`unicode/utf8`
`bytes`
`math/rand`
)
// Byte-level fixtures used to build invalid UTF-8 inputs.
var (
// Single bytes with the bit pattern of a 2-, 3- and 4-byte lead, without
// the continuation bytes that must follow.
_Header_2Bytes = string([]byte{0xC0})
_Header_3Bytes = string([]byte{0xE0})
_Header_4Bytes = string([]byte{0xF0})
// UTF-8-style encodings of lone UTF-16 surrogate code points, which are not
// valid UTF-8.
// NOTE(review): U+D800 is in the high-surrogate range (D800-DBFF) and U+DC00
// in the low-surrogate range (DC00-DFFF), so the two identifiers are named
// the other way round. The tests only need "some surrogate", so results are
// unaffected.
_Low_Surrogate = string([]byte{0xED, 0xA0, 0x80}) // \ud800
_High_Surrogate = string([]byte{0xED, 0xB0, 0x80}) // \udc00
// A lone continuation byte (10______).
_Cont = "\xb0"
)
// TestCorrectWith_InvalidUtf8 feeds CorrectWith valid and invalid inputs and
// checks that every invalid byte is replaced by U+FFFD. errpos is -1 for
// inputs that are valid UTF-8; this is cross-checked against the standard
// library.
func TestCorrectWith_InvalidUtf8(t *testing.T) {
	type testCase struct {
		name   string
		input  string
		expect string
		errpos int
	}
	cases := []testCase{
		{"basic", `abc`, "abc", -1},
		{"long", strings.Repeat("helloα景😊", 1000), strings.Repeat("helloα景😊", 1000), -1},

		// invalid utf8 - single byte
		{"single_Cont", _Cont, "\ufffd", 0},
		{"single_Header_2Bytes", _Header_2Bytes, "\ufffd", 0},
		{"single_Header_3Bytes", _Header_3Bytes, "\ufffd", 0},
		{"single_Header_4Bytes", _Header_4Bytes, "\ufffd", 0},

		// invalid utf8 - two bytes
		{"two_Header_2Bytes + _Cont", _Header_2Bytes + _Cont, "\ufffd\ufffd", 0},
		{`two_Header_4Bytes + _Cont+ "xx"`, _Header_4Bytes + _Cont + "xx", "\ufffd\ufffdxx", 0},
		{`"xx" + three_Header_4Bytes + _Cont + _Cont`, "xx" + _Header_4Bytes + _Cont + _Cont, "xx\ufffd\ufffd\ufffd", 2},

		// invalid utf8 - three bytes
		{`three_Low_Surrogate`, _Low_Surrogate, "\ufffd\ufffd\ufffd", 0},
		{`three__High_Surrogate`, _High_Surrogate, "\ufffd\ufffd\ufffd", 0},

		// invalid utf8 - multi bytes
		{`_High_Surrogate + _Low_Surrogate`, _High_Surrogate + _Low_Surrogate, "\ufffd\ufffd\ufffd\ufffd\ufffd\ufffd", 0},
		{`"\x80\x80\x80\x80"`, "\x80\x80\x80\x80", "\ufffd\ufffd\ufffd\ufffd", 0},
	}

	for _, tc := range cases {
		corrected := CorrectWith(nil, []byte(tc.input), "\ufffd")
		assert.Equal(t, []byte(tc.expect), corrected, tc.name)

		wantValid := tc.errpos == -1
		assert.Equal(t, wantValid, utf8.ValidString(tc.input), tc.name)
	}
}
// genRandBytes returns length bytes, each drawn uniformly from 0x00..0xFF.
func genRandBytes(length int) []byte {
	var out bytes.Buffer
	for remaining := length; remaining > 0; remaining-- {
		b := byte(rand.Intn(0xFF + 1))
		out.WriteByte(b)
	}
	return out.Bytes()
}
// genRandAscii returns length bytes, each drawn uniformly from 0x00..0x7F.
func genRandAscii(length int) []byte {
	var out bytes.Buffer
	for remaining := length; remaining > 0; remaining-- {
		b := byte(rand.Intn(0x7F + 1))
		out.WriteByte(b)
	}
	return out.Bytes()
}
// genRandRune returns the concatenated UTF-8 encoding of length code points,
// each drawn uniformly from 0..0x10FFFF and written with
// bytes.Buffer.WriteRune.
func genRandRune(length int) []byte {
	var out bytes.Buffer
	for remaining := length; remaining > 0; remaining-- {
		r := rune(rand.Intn(0x10FFFF + 1))
		out.WriteRune(r)
	}
	return out.Bytes()
}
// TestValidate_Random checks Validate against the standard library's
// utf8.Valid on randomly generated byte strings and rune sequences.
func TestValidate_Random(t *testing.T) {
	const (
		rounds = 1000
		maxLen = 1000
	)

	// Validate must agree with the stdlib on every input
	sameAsStdlib := func(data []byte) {
		assert.Equal(t, utf8.Valid(data), Validate(data), string(data))
	}

	// random testing
	for round := 0; round < rounds; round++ {
		size := rand.Intn(maxLen)
		sameAsStdlib(genRandBytes(size))
		sameAsStdlib(genRandRune(size))
	}
}
// BenchmarkValidate compares Validate with the standard library's utf8.Valid
// on ASCII, valid multi-byte UTF-8 and random byte inputs of 1000 elements.
// Each data set is first checked for agreement between the two.
func BenchmarkValidate(b *testing.B) {
	type dataset struct {
		name string
		data []byte
	}
	datasets := []dataset{
		{"ValidAscii", genRandAscii(1000)},
		{"ValidUTF8", genRandRune(1000)},
		{"RandomBytes", genRandBytes(1000)},
	}

	for _, ds := range datasets {
		name, data := ds.name, ds.data

		if utf8.Valid(data) != Validate(data) {
			b.Fatalf("sonic utf8 validate wrong for %s string: %v", name, data)
		}

		b.Run("Sonic_" + name, func(b *testing.B) {
			for n := b.N; n > 0; n-- {
				Validate(data)
			}
		})
		b.Run("StdLib_" + name, func(b *testing.B) {
			for n := b.N; n > 0; n-- {
				utf8.Valid(data)
			}
		})
	}
}