mirror of
https://github.com/ii64/sonic.git
synced 2026-06-20 16:45:22 +08:00
feat: repl invalid utf8 in serde by option (#357)
This commit is contained in:
parent
f87d87de7a
commit
02865de676
52 changed files with 23478 additions and 20789 deletions
2
.github/workflows/license-check.yml
vendored
2
.github/workflows/license-check.yml
vendored
|
|
@ -1,6 +1,6 @@
|
|||
name: License Check
|
||||
|
||||
on: push
|
||||
on: pull_request
|
||||
|
||||
jobs:
|
||||
build:
|
||||
|
|
|
|||
2
.github/workflows/push-check-go118.yml
vendored
2
.github/workflows/push-check-go118.yml
vendored
|
|
@ -1,6 +1,6 @@
|
|||
name: Push Check Go1.18-Linux-X64
|
||||
|
||||
on: push
|
||||
on: pull_request
|
||||
|
||||
jobs:
|
||||
build:
|
||||
|
|
|
|||
2
.github/workflows/push-check-linux-arm64.yml
vendored
2
.github/workflows/push-check-linux-arm64.yml
vendored
|
|
@ -1,6 +1,6 @@
|
|||
name: Push Check Linux-ARM
|
||||
|
||||
on: push
|
||||
on: pull_request
|
||||
|
||||
jobs:
|
||||
build:
|
||||
|
|
|
|||
2
.github/workflows/push-check-linux-x64.yml
vendored
2
.github/workflows/push-check-linux-x64.yml
vendored
|
|
@ -1,6 +1,6 @@
|
|||
name: Push Check Linux-X64
|
||||
|
||||
on: push
|
||||
on: pull_request
|
||||
|
||||
jobs:
|
||||
build:
|
||||
|
|
|
|||
2
.github/workflows/push-check-qemu.yml
vendored
2
.github/workflows/push-check-qemu.yml
vendored
|
|
@ -1,6 +1,6 @@
|
|||
name: Push Check Linux-Qemu
|
||||
|
||||
on: push
|
||||
on: pull_request
|
||||
|
||||
jobs:
|
||||
build:
|
||||
|
|
|
|||
2
.github/workflows/push-check-windows.yml
vendored
2
.github/workflows/push-check-windows.yml
vendored
|
|
@ -1,6 +1,6 @@
|
|||
name: Push Check Windows-X64
|
||||
|
||||
on: push
|
||||
on: pull_request
|
||||
|
||||
jobs:
|
||||
build:
|
||||
|
|
|
|||
8
Makefile
8
Makefile
|
|
@ -35,7 +35,7 @@ CC_amd64 := clang
|
|||
ASM2ASM_amd64 := tools/asm2asm/asm2asm.py
|
||||
|
||||
CFLAGS := -mno-red-zone
|
||||
CFLAGS += -arch x86_64
|
||||
CFLAGS += -target x86_64-apple-macos11
|
||||
CFLAGS += -fno-asynchronous-unwind-tables
|
||||
CFLAGS += -fno-builtin
|
||||
CFLAGS += -fno-exceptions
|
||||
|
|
@ -100,8 +100,10 @@ endef
|
|||
all: ${ARCH}
|
||||
|
||||
clean:
|
||||
rm -vfr ${TMP_DIR}/{sse,avx,avx2}
|
||||
rm -vfr ${OUT_DIR}/{sse,avx,avx2}
|
||||
for arch in ${ARCH}; do \
|
||||
rm -vfr ${TMP_DIR}/$${arch}; \
|
||||
rm -vfr ${OUT_DIR}/$${arch}; \
|
||||
done
|
||||
|
||||
$(foreach \
|
||||
arch, \
|
||||
|
|
|
|||
10
api.go
10
api.go
|
|
@ -66,8 +66,8 @@ type Config struct {
|
|||
// CopyString indicates decoder to decode string values by copying instead of referring.
|
||||
CopyString bool
|
||||
|
||||
// ValidateString indicates decoder to valid string values: decoder will return errors when
|
||||
// invalid UTF-8 chars or unescaped control chars(\u0000-\u001f) in the string value of JSON.
|
||||
// ValidateString indicates decoder and encoder to validate string values: decoder will return errors
|
||||
// when there are unescaped control chars(\u0000-\u001f) in the string value of JSON.
|
||||
ValidateString bool
|
||||
}
|
||||
|
||||
|
|
@ -81,6 +81,7 @@ var (
|
|||
SortMapKeys: true,
|
||||
CompactMarshaler: true,
|
||||
CopyString : true,
|
||||
ValidateString : true,
|
||||
}.Froze()
|
||||
|
||||
// ConfigFastest is the fastest config of APIs, aiming at speed.
|
||||
|
|
@ -164,12 +165,15 @@ func UnmarshalString(buf string, val interface{}) error {
|
|||
return ConfigDefault.UnmarshalFromString(buf, val)
|
||||
}
|
||||
|
||||
// Get searches the given path json,
|
||||
// Get searches the given path from json,
|
||||
// and returns its representing ast.Node.
|
||||
//
|
||||
// Each path arg must be integer or string:
|
||||
// - Integer means searching current node as array
|
||||
// - String means searching current node as object
|
||||
//
|
||||
// Note, the api expects the json is well-formed at least,
|
||||
// otherwise it may return unexpected result.
|
||||
func Get(src []byte, path ...interface{}) (ast.Node, error) {
|
||||
return GetFromString(string(src), path...)
|
||||
}
|
||||
|
|
|
|||
|
|
@ -691,7 +691,10 @@ func (self *Node) AddAny(val interface{}) error {
|
|||
}
|
||||
|
||||
// GetByPath load given path on demands,
|
||||
// which only ensure nodes before this path got parsed
|
||||
// which only ensure nodes before this path got parsed.
|
||||
//
|
||||
// Note, the api expects the json is well-formed at least,
|
||||
// otherwise it may return unexpected result.
|
||||
func (self *Node) GetByPath(path ...interface{}) *Node {
|
||||
if !self.Valid() {
|
||||
return self
|
||||
|
|
|
|||
|
|
@ -34,7 +34,6 @@ import (
|
|||
`strings`
|
||||
`testing`
|
||||
`time`
|
||||
`unicode/utf8`
|
||||
`unsafe`
|
||||
|
||||
`github.com/bytedance/sonic/decoder`
|
||||
|
|
@ -1011,8 +1010,8 @@ var unmarshalTests = []unmarshalTest{
|
|||
|
||||
{in: "\"\x00\"", ptr: new(interface{}), err: fmt.Errorf("json: invald char"), validateString: true},
|
||||
{in: "\"\x00\"", ptr: new(string), err: fmt.Errorf("json: invald char"), validateString: true},
|
||||
{in: "\"\xff\"", ptr: new(interface{}), err: fmt.Errorf("json: invald char"), validateString: true},
|
||||
{in: "\"\xff\"", ptr: new(string), err: fmt.Errorf("json: invald char"), validateString: true},
|
||||
{in: "\"\xff\"", ptr: new(interface{}), out: interface{}("\ufffd"), validateString: true},
|
||||
{in: "\"\xff\"", ptr: new(string), out: "\ufffd", validateString: true},
|
||||
{in: "\"\x00\"", ptr: new(interface{}), out: interface{}("\x00"), validateString: false},
|
||||
{in: "\"\x00\"", ptr: new(string), out: "\x00", validateString: false},
|
||||
{in: "\"\xff\"", ptr: new(interface{}), out: interface{}("\xff"), validateString: false},
|
||||
|
|
@ -1147,7 +1146,6 @@ func TestUnmarshal(t *testing.T) {
|
|||
}
|
||||
|
||||
dec := decoder.NewDecoder(tt.in)
|
||||
validUtf8 := true
|
||||
if tt.useNumber {
|
||||
dec.UseNumber()
|
||||
}
|
||||
|
|
@ -1156,10 +1154,9 @@ func TestUnmarshal(t *testing.T) {
|
|||
}
|
||||
if tt.validateString {
|
||||
dec.ValidateString()
|
||||
validUtf8 = utf8.Valid([]byte(tt.in))
|
||||
}
|
||||
if err := dec.Decode(v.Interface()); (err == nil) != (tt.err == nil && validUtf8) {
|
||||
spew.Dump(tt.in)
|
||||
if err := dec.Decode(v.Interface()); (err == nil) != (tt.err == nil) {
|
||||
spew.Dump(tt)
|
||||
t.Fatalf("#%d: %v, want %v", i, err, tt.err)
|
||||
continue
|
||||
} else if err != nil {
|
||||
|
|
@ -2524,3 +2521,68 @@ func TestChangeTool(t *testing.T) {
|
|||
}
|
||||
|
||||
}
|
||||
|
||||
func TestDecoder_LongestInvalidUtf8(t *testing.T) {
|
||||
for _, data := range([]string{
|
||||
"\"" + strings.Repeat("\x80", 4096) + "\"",
|
||||
"\"" + strings.Repeat("\x80", 4095) + "\"",
|
||||
"\"" + strings.Repeat("\x80", 4097) + "\"",
|
||||
"\"" + strings.Repeat("\x80", 12345) + "\"",
|
||||
}) {
|
||||
testDecodeInvalidUtf8(t, []byte(data))
|
||||
}
|
||||
}
|
||||
|
||||
func testDecodeInvalidUtf8(t *testing.T, data []byte) {
|
||||
var sgot, jgot string
|
||||
serr := ConfigStd.Unmarshal(data, &sgot)
|
||||
jerr := json.Unmarshal(data, &jgot)
|
||||
assert.Equal(t, serr != nil, jerr != nil)
|
||||
if jerr == nil {
|
||||
assert.Equal(t, sgot, jgot)
|
||||
}
|
||||
}
|
||||
|
||||
// needEscape reports whether b is a double quote, a backslash, or a byte
// below 0x20.
func needEscape(b byte) bool {
	switch {
	case b < 0x20:
		return true
	case b == '"', b == '\\':
		return true
	default:
		return false
	}
}
|
||||
|
||||
func genRandJsonBytes(length int) []byte {
|
||||
var buf bytes.Buffer
|
||||
buf.WriteByte('"')
|
||||
for j := 0; j < length; j++ {
|
||||
r := rand.Intn(0xff + 1)
|
||||
if needEscape(byte(r)) {
|
||||
buf.WriteByte('\\')
|
||||
}
|
||||
buf.WriteByte(byte(r))
|
||||
}
|
||||
buf.WriteByte('"')
|
||||
return buf.Bytes()
|
||||
}
|
||||
|
||||
func genRandJsonRune(length int) []byte {
|
||||
var buf bytes.Buffer
|
||||
buf.WriteByte('"')
|
||||
for j := 0; j < length; j++ {
|
||||
r := rand.Intn(0x10FFFF + 1)
|
||||
if r < 0x80 && needEscape(byte(r)) {
|
||||
buf.WriteByte('\\')
|
||||
buf.WriteByte(byte(r))
|
||||
} else {
|
||||
buf.WriteRune(rune(r))
|
||||
}
|
||||
}
|
||||
buf.WriteByte('"')
|
||||
return buf.Bytes()
|
||||
}
|
||||
|
||||
func TestDecoder_RandomInvalidUtf8(t *testing.T) {
|
||||
nums := 1000
|
||||
maxLen := 1000
|
||||
for i := 0; i < nums; i++ {
|
||||
length := rand.Intn(maxLen)
|
||||
testDecodeInvalidUtf8(t, genRandJsonBytes(length))
|
||||
testDecodeInvalidUtf8(t, genRandJsonRune(length))
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -25,6 +25,7 @@ import (
|
|||
`github.com/bytedance/sonic/internal/native/types`
|
||||
`github.com/bytedance/sonic/internal/rt`
|
||||
`github.com/bytedance/sonic/option`
|
||||
`github.com/bytedance/sonic/utf8`
|
||||
)
|
||||
|
||||
const (
|
||||
|
|
@ -80,9 +81,39 @@ func (self *Decoder) Reset(s string) {
|
|||
// self.f = 0
|
||||
}
|
||||
|
||||
func (self *Decoder) CheckTrailings() error {
|
||||
pos := self.i
|
||||
buf := self.s
|
||||
/* skip all the trailing spaces */
|
||||
if pos != len(buf) {
|
||||
for pos < len(buf) && (types.SPACE_MASK & (1 << buf[pos])) != 0 {
|
||||
pos++
|
||||
}
|
||||
}
|
||||
|
||||
/* then it must be at EOF */
|
||||
if pos == len(buf) {
|
||||
return nil
|
||||
}
|
||||
|
||||
/* junk after JSON value */
|
||||
return SyntaxError {
|
||||
Src : buf,
|
||||
Pos : pos,
|
||||
Code : types.ERR_INVALID_CHAR,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Decode parses the JSON-encoded data from current position and stores the result
|
||||
// in the value pointed to by val.
|
||||
func (self *Decoder) Decode(val interface{}) error {
|
||||
/* validate json if needed */
|
||||
if (self.f & (1 << _F_validate_string)) != 0 && !utf8.ValidateString(self.s){
|
||||
dbuf := utf8.CorrectWith(nil, rt.Str2Mem(self.s), "\ufffd")
|
||||
self.s = rt.Mem2Str(dbuf)
|
||||
}
|
||||
|
||||
vv := rt.UnpackEface(val)
|
||||
vp := vv.Value
|
||||
|
||||
|
|
@ -99,7 +130,6 @@ func (self *Decoder) Decode(val interface{}) error {
|
|||
/* create a new stack, and call the decoder */
|
||||
sb, etp := newStack(), rt.PtrElem(vv.Type)
|
||||
nb, err := decodeTypedPointer(self.s, self.i, etp, vp, sb, self.f)
|
||||
|
||||
/* return the stack back */
|
||||
self.i = nb
|
||||
freeStack(sb)
|
||||
|
|
|
|||
|
|
@ -345,7 +345,6 @@ func TestDecoder_Generic(t *testing.T) {
|
|||
pos, err := decode(TwitterJson, &v, false)
|
||||
assert.NoError(t, err)
|
||||
assert.Equal(t, len(TwitterJson), pos)
|
||||
spew.Dump(v)
|
||||
}
|
||||
|
||||
func TestDecoder_Binding(t *testing.T) {
|
||||
|
|
|
|||
|
|
@ -34,6 +34,7 @@ import (
|
|||
`testing`
|
||||
`time`
|
||||
`unsafe`
|
||||
`strings`
|
||||
|
||||
`github.com/bytedance/sonic/encoder`
|
||||
`github.com/stretchr/testify/assert`
|
||||
|
|
@ -52,7 +53,6 @@ func TestMain(m *testing.M) {
|
|||
runtime.GC()
|
||||
debug.FreeOSMemory()
|
||||
}
|
||||
println("stop GC looping!")
|
||||
}()
|
||||
time.Sleep(time.Millisecond)
|
||||
m.Run()
|
||||
|
|
@ -1169,3 +1169,32 @@ func TestMarshalNullNil(t *testing.T) {
|
|||
assert.Nil(t, e)
|
||||
assert.Equal(t, `{"A":[],"B":{}}`, string(o))
|
||||
}
|
||||
|
||||
func TestEncoder_LongestInvalidUtf8(t *testing.T) {
|
||||
for _, data := range([]string{
|
||||
"\"" + strings.Repeat("\x80", 4096) + "\"",
|
||||
"\"" + strings.Repeat("\x80", 4095) + "\"",
|
||||
"\"" + strings.Repeat("\x80", 4097) + "\"",
|
||||
"\"" + strings.Repeat("\x80", 12345) + "\"",
|
||||
}) {
|
||||
testEncodeInvalidUtf8(t, []byte(data))
|
||||
}
|
||||
}
|
||||
|
||||
func testEncodeInvalidUtf8(t *testing.T, data []byte) {
|
||||
jgot, jerr := json.Marshal(data)
|
||||
sgot, serr := ConfigStd.Marshal(data)
|
||||
assert.Equal(t, serr != nil, jerr != nil)
|
||||
if jerr == nil {
|
||||
assert.Equal(t, sgot, jgot)
|
||||
}
|
||||
}
|
||||
|
||||
func TestEncoder_RandomInvalidUtf8(t *testing.T) {
|
||||
nums := 1000
|
||||
maxLen := 1000
|
||||
for i := 0; i < nums; i++ {
|
||||
testEncodeInvalidUtf8(t, genRandJsonBytes(maxLen))
|
||||
testEncodeInvalidUtf8(t, genRandJsonRune(maxLen))
|
||||
}
|
||||
}
|
||||
|
|
@ -21,11 +21,11 @@ import (
|
|||
`encoding/json`
|
||||
`reflect`
|
||||
`runtime`
|
||||
`unsafe`
|
||||
|
||||
`github.com/bytedance/sonic/internal/native`
|
||||
`github.com/bytedance/sonic/internal/native/types`
|
||||
`github.com/bytedance/sonic/internal/rt`
|
||||
`github.com/bytedance/sonic/utf8`
|
||||
`github.com/bytedance/sonic/option`
|
||||
)
|
||||
|
||||
|
|
@ -38,6 +38,7 @@ const (
|
|||
bitCompactMarshaler
|
||||
bitNoQuoteTextMarshaler
|
||||
bitNoNullSliceOrMap
|
||||
bitValidateString
|
||||
|
||||
// used for recursive compile
|
||||
bitPointerValue = 63
|
||||
|
|
@ -66,6 +67,10 @@ const (
|
|||
// instead of 'null'
|
||||
NoNullSliceOrMap Options = 1 << bitNoNullSliceOrMap
|
||||
|
||||
// ValidateString indicates that encoder should validate the input string
|
||||
// before encoding it into JSON.
|
||||
ValidateString Options = 1 << bitValidateString
|
||||
|
||||
// CompatibleWithStd is used to be compatible with std encoder.
|
||||
CompatibleWithStd Options = SortMapKeys | EscapeHTML | CompactMarshaler
|
||||
)
|
||||
|
|
@ -100,6 +105,15 @@ func (self *Encoder) SetEscapeHTML(f bool) {
|
|||
}
|
||||
}
|
||||
|
||||
// SetValidateString specifies if option ValidateString opens
|
||||
func (self *Encoder) SetValidateString(f bool) {
|
||||
if f {
|
||||
self.Opts |= ValidateString
|
||||
} else {
|
||||
self.Opts &= ^ValidateString
|
||||
}
|
||||
}
|
||||
|
||||
// SetCompactMarshaler specifies if option CompactMarshaler opens
|
||||
func (self *Encoder) SetCompactMarshaler(f bool) {
|
||||
if f {
|
||||
|
|
@ -156,7 +170,7 @@ func Encode(val interface{}, opts Options) ([]byte, error) {
|
|||
return nil, err
|
||||
}
|
||||
|
||||
if opts & EscapeHTML != 0 {
|
||||
if opts & EscapeHTML != 0 || opts & ValidateString != 0 {
|
||||
return buf, nil
|
||||
}
|
||||
|
||||
|
|
@ -189,6 +203,12 @@ func EncodeInto(buf *[]byte, val interface{}, opts Options) error {
|
|||
*buf = dest
|
||||
}
|
||||
|
||||
if opts & ValidateString != 0 && !utf8.Validate(*buf) {
|
||||
dest := utf8.CorrectWith(nil, *buf, `\ufffd`)
|
||||
freeBytes(*buf) // free origin used buffer
|
||||
*buf = dest
|
||||
}
|
||||
|
||||
/* avoid GC ahead */
|
||||
runtime.KeepAlive(buf)
|
||||
runtime.KeepAlive(efv)
|
||||
|
|
@ -203,38 +223,8 @@ var typeByte = rt.UnpackType(reflect.TypeOf(byte(0)))
|
|||
// For historical reasons, web browsers don't honor standard HTML
|
||||
// escaping within <script> tags, so an alternative JSON encoding must
|
||||
// be used.
|
||||
func HTMLEscape(dest []byte, src []byte) []byte {
|
||||
nb := len(src)
|
||||
|
||||
// initilize dest buffer
|
||||
cap := nb * 6 / 5
|
||||
if dest == nil {
|
||||
dest = make([]byte, 0, cap)
|
||||
}
|
||||
ds := (*rt.GoSlice)(unsafe.Pointer(&dest))
|
||||
sp := (*rt.GoSlice)(unsafe.Pointer(&src)).Ptr
|
||||
ds.Len = 0
|
||||
if (ds.Cap < cap) {
|
||||
*ds = growslice(typeByte, *ds, cap)
|
||||
}
|
||||
|
||||
for nb > 0 {
|
||||
dp := unsafe.Pointer(uintptr(ds.Ptr) + uintptr(ds.Len))
|
||||
dn := ds.Cap - ds.Len
|
||||
|
||||
ret := native.HTMLEscape(sp, nb, dp, &dn)
|
||||
ds.Len += dn
|
||||
|
||||
if ret >= 0 {
|
||||
break
|
||||
}
|
||||
ret = ^ret
|
||||
nb -= ret
|
||||
|
||||
*ds = growslice(typeByte, *ds, ds.Cap * 2)
|
||||
sp = unsafe.Pointer(uintptr(sp) + uintptr(ret))
|
||||
}
|
||||
return dest
|
||||
func HTMLEscape(dst []byte, src []byte) []byte {
|
||||
return htmlEscape(dst, src)
|
||||
}
|
||||
|
||||
// EncodeIndented is like Encode but applies Indent to format the output.
|
||||
|
|
@ -293,6 +283,8 @@ func Pretouch(vt reflect.Type, opts ...option.CompileOption) error {
|
|||
// Valid validates json and returns first non-blank character position,
|
||||
// if it is only one valid json value.
|
||||
// Otherwise returns invalid character position using start.
|
||||
//
|
||||
// Note: it does not check for the invalid UTF-8 characters.
|
||||
func Valid(data []byte) (ok bool, start int) {
|
||||
n := len(data)
|
||||
if n == 0 {
|
||||
|
|
@ -303,13 +295,17 @@ func Valid(data []byte) (ok bool, start int) {
|
|||
m := types.NewStateMachine()
|
||||
ret := native.ValidateOne(&s, &p, m)
|
||||
types.FreeStateMachine(m)
|
||||
|
||||
if ret < 0 {
|
||||
return false, p-1
|
||||
}
|
||||
|
||||
/* check for trailing spaces */
|
||||
for ;p < n; p++ {
|
||||
if (types.SPACE_MASK & (1 << data[p])) == 0 {
|
||||
return false, p
|
||||
}
|
||||
}
|
||||
|
||||
return true, ret
|
||||
}
|
||||
|
|
@ -113,6 +113,39 @@ func encodeTextMarshaler(buf *[]byte, val encoding.TextMarshaler, opt Options) e
|
|||
}
|
||||
}
|
||||
|
||||
func htmlEscape(dst []byte, src []byte) []byte {
|
||||
var sidx int
|
||||
|
||||
dst = append(dst, src[:0]...) // avoid check nil dst
|
||||
sbuf := (*rt.GoSlice)(unsafe.Pointer(&src))
|
||||
dbuf := (*rt.GoSlice)(unsafe.Pointer(&dst))
|
||||
|
||||
/* grow dst if it is shorter */
|
||||
if cap(dst) - len(dst) < len(src) + native.BufPaddingSize {
|
||||
cap := len(src) * 3 / 2 + native.BufPaddingSize
|
||||
*dbuf = growslice(typeByte, *dbuf, cap)
|
||||
}
|
||||
|
||||
for sidx < sbuf.Len {
|
||||
sp := padd(sbuf.Ptr, sidx)
|
||||
dp := padd(dbuf.Ptr, dbuf.Len)
|
||||
|
||||
sn := sbuf.Len - sidx
|
||||
dn := dbuf.Cap - dbuf.Len
|
||||
nb := native.HTMLEscape(sp, sn, dp, &dn)
|
||||
|
||||
/* check for errors */
|
||||
if dbuf.Len += dn; nb >= 0 {
|
||||
break
|
||||
}
|
||||
|
||||
/* not enough space, grow the slice and try again */
|
||||
sidx += ^nb
|
||||
*dbuf = growslice(typeByte, *dbuf, dbuf.Cap * 2)
|
||||
}
|
||||
return dst
|
||||
}
|
||||
|
||||
var (
|
||||
argPtrs = []bool { true, true, true, false }
|
||||
localPtrs = []bool{}
|
||||
|
|
|
|||
|
|
@ -1,12 +1,14 @@
|
|||
testname := FuzzMain
|
||||
corpusdir := ./testdata/fuzz/${testname}
|
||||
|
||||
fuzz:
|
||||
mkdir -p ./testdata/fuzz/FuzzMain
|
||||
mkdir -p ${corpusdir}
|
||||
rm -rf ./go-fuzz-corpus
|
||||
git clone https://github.com/dvyukov/go-fuzz-corpus.git ./go-fuzz-corpus/
|
||||
file2fuzz -o ./testdata/fuzz/FuzzMain ./go-fuzz-corpus/json/corpus/* ./corpus/*
|
||||
file2fuzz -o ${corpusdir} ./go-fuzz-corpus/json/corpus/* ./corpus/*
|
||||
|
||||
run:
|
||||
GOARCH=amd64 go test -fuzz=Fuzz -v
|
||||
GOARCH=amd64 go test -fuzz=${testname} -v
|
||||
|
||||
clean:
|
||||
rm -rf ./go-fuzz-corpus/
|
||||
|
|
|
|||
|
|
@ -19,17 +19,21 @@
|
|||
package sonic_fuzz
|
||||
|
||||
import (
|
||||
`encoding/json`
|
||||
`encoding/json`
|
||||
`testing`
|
||||
`unicode/utf8`
|
||||
`reflect`
|
||||
_ `unicode/utf8`
|
||||
`os`
|
||||
`runtime`
|
||||
`runtime/debug`
|
||||
`time`
|
||||
`io`
|
||||
`log`
|
||||
`strconv`
|
||||
|
||||
`github.com/bytedance/sonic`
|
||||
`github.com/stretchr/testify/require`
|
||||
`github.com/davecgh/go-spew/spew`
|
||||
`github.com/bytedance/gopkg/util/gctuner`
|
||||
)
|
||||
|
||||
func FuzzMain(f *testing.F) {
|
||||
|
|
@ -39,11 +43,18 @@ func FuzzMain(f *testing.F) {
|
|||
f.Fuzz(fuzzMain)
|
||||
}
|
||||
|
||||
// Used for debug falied fuzz corpus
|
||||
func TestCorpus(t *testing.T) {
|
||||
fuzzMain(t, []byte("[1\x00"))
|
||||
}
|
||||
|
||||
var target = sonic.ConfigStd
|
||||
|
||||
func fuzzMain(t *testing.T, data []byte) {
|
||||
fuzzValidate(t, data)
|
||||
fuzzHtmlEscape(t, data)
|
||||
// Only fuzz the validate json here, because the default configuration does not have validation in SONIC.
|
||||
if !utf8.Valid(data) || !json.Valid(data) {
|
||||
// Only fuzz valid JSON here.
|
||||
if !json.Valid(data) {
|
||||
return
|
||||
}
|
||||
for _, typ := range []func() interface{}{
|
||||
|
|
@ -54,31 +65,34 @@ func fuzzMain(t *testing.T, data []byte) {
|
|||
func() interface{} { return new(int64) },
|
||||
func() interface{} { return new(uint64) },
|
||||
func() interface{} { return new(float64) },
|
||||
func() interface{} { return new(json.Number) },
|
||||
func() interface{} { return new(S) },
|
||||
// func() interface{} { return new(json.Number) },
|
||||
// func() interface{} { return new(S) },
|
||||
} {
|
||||
sv, jv := typ(), typ()
|
||||
serr := sonic.Unmarshal([]byte(data), sv)
|
||||
serr := target.Unmarshal([]byte(data), sv)
|
||||
jerr := json.Unmarshal([]byte(data), jv)
|
||||
require.Equalf(t, serr != nil, jerr != nil, "different error in sonic unmarshal %v", reflect.TypeOf(jv))
|
||||
require.Equal(t, serr != nil, jerr != nil,
|
||||
dump(data, jv, jerr, sv, serr))
|
||||
if jerr != nil {
|
||||
continue
|
||||
}
|
||||
require.Equal(t, sv, jv, "different result in sonic unmarshal %v", reflect.TypeOf(jv))
|
||||
sout, serr := sonic.Marshal(sv)
|
||||
jout, jerr := json.Marshal(jv)
|
||||
require.NoError(t, serr, "error in sonic marshal %v", reflect.TypeOf(jv))
|
||||
require.NoError(t, jerr, "error in json marshal %v", reflect.TypeOf(jv))
|
||||
require.Equal(t, sv, jv, dump(data, jv, jerr, sv, serr))
|
||||
|
||||
v := jv
|
||||
sout, serr := target.Marshal(v)
|
||||
jout, jerr := json.Marshal(v)
|
||||
require.NoError(t, serr, dump(v, jout, jerr, sout, serr))
|
||||
require.NoError(t, jerr, dump(v, jout, jerr, sout, serr))
|
||||
|
||||
{
|
||||
sv, jv := typ(), typ()
|
||||
serr := sonic.Unmarshal(sout, sv)
|
||||
serr := target.Unmarshal(sout, sv)
|
||||
jerr := json.Unmarshal(jout, jv)
|
||||
require.Equalf(t, serr != nil, jerr != nil, "different error in sonic unmarshal again %v", reflect.TypeOf(jv))
|
||||
require.Equalf(t, serr != nil, jerr != nil, dump(data, jv, jerr, sv, serr))
|
||||
if jerr != nil {
|
||||
continue
|
||||
}
|
||||
require.Equal(t, sv, jv, "different result in sonic unmarshal again %v", reflect.TypeOf(jv))
|
||||
require.Equal(t, sv, jv, dump(data, jv, jerr, sv, serr))
|
||||
}
|
||||
|
||||
if m, ok := sv.(*map[string]interface{}); ok {
|
||||
|
|
@ -89,52 +103,54 @@ func fuzzMain(t *testing.T, data []byte) {
|
|||
fuzzASTGetFromArray(t, jout, *a)
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
type S struct {
|
||||
A int `json:",omitempty"`
|
||||
B string `json:"B1,omitempty"`
|
||||
C float64
|
||||
D bool
|
||||
E uint8
|
||||
// F []byte // unmarshal []byte is different with encoding/json
|
||||
G interface{}
|
||||
H map[string]interface{}
|
||||
I map[string]string
|
||||
J []interface{}
|
||||
K []string
|
||||
L S1
|
||||
M *S1
|
||||
N *int
|
||||
O **int
|
||||
A int `json:",omitempty"`
|
||||
B string `json:"B1,omitempty"`
|
||||
C float64
|
||||
D bool
|
||||
E uint8
|
||||
// F []byte // unmarshal []byte is different with encoding/json
|
||||
G interface{}
|
||||
H map[string]interface{}
|
||||
I map[string]string
|
||||
J []interface{}
|
||||
K []string
|
||||
L S1
|
||||
M *S1
|
||||
N *int
|
||||
O **int
|
||||
P int `json:",string"`
|
||||
Q float64 `json:",string"`
|
||||
R int `json:"-"`
|
||||
T struct {}
|
||||
U [2]int
|
||||
V uintptr
|
||||
R int `json:"-"`
|
||||
T struct {}
|
||||
U [2]int
|
||||
V uintptr
|
||||
W json.Number
|
||||
// X json.RawMessage
|
||||
Y Marshaller
|
||||
Z TextMarshaller
|
||||
Z TextMarshaller
|
||||
}
|
||||
|
||||
|
||||
type S1 struct {
|
||||
A int
|
||||
B string
|
||||
A int
|
||||
B string
|
||||
}
|
||||
|
||||
type Marshaller struct {
|
||||
v string
|
||||
v string
|
||||
}
|
||||
|
||||
func (m *Marshaller) MarshalJSON() ([]byte, error) {
|
||||
return json.Marshal(m.v)
|
||||
return json.Marshal(m.v)
|
||||
}
|
||||
|
||||
func (m *Marshaller) UnmarshalJSON(data []byte) error {
|
||||
return json.Unmarshal(data, &m.v)
|
||||
return json.Unmarshal(data, &m.v)
|
||||
}
|
||||
|
||||
type TextMarshaller struct {
|
||||
|
|
@ -149,19 +165,54 @@ func (k *TextMarshaller) UnmarshalText(data []byte) error {
|
|||
return json.Unmarshal(data, &k.v)
|
||||
}
|
||||
|
||||
var debugAsyncGC = os.Getenv("SONIC_NO_ASYNC_GC") == ""
|
||||
|
||||
func TestMain(m *testing.M) {
|
||||
func dump(args ...interface{}) string {
|
||||
return spew.Sdump(args)
|
||||
}
|
||||
|
||||
func fdump(w io.Writer, args ...interface{}) {
|
||||
spew.Fdump(w, args)
|
||||
}
|
||||
|
||||
const (
	// MemoryLimitEnv names the environment variable that setMemLimit parses
	// as a whole number of GB.
	MemoryLimitEnv = "SONIC_FUZZ_MEM_LIMIT"
	// AsynyncGCEnv holds the name of the SONIC_NO_ASYNC_GC environment variable.
	// NOTE(review): the identifier looks like a misspelling of "AsyncGCEnv";
	// it is kept as-is so existing references keep compiling.
	AsynyncGCEnv = "SONIC_NO_ASYNC_GC"

	// Binary size units, in bytes.
	KB uint64 = 1 << 10
	MB uint64 = KB << 10
	GB uint64 = MB << 10
)
|
||||
|
||||
func setMemLimit(limit uint64) {
|
||||
threshold := uint64(float64(limit) * 0.7)
|
||||
numWorker := uint64(runtime.GOMAXPROCS(0))
|
||||
if os.Getenv(MemoryLimitEnv) != "" {
|
||||
if memGB, err := strconv.ParseUint(os.Getenv(MemoryLimitEnv), 10, 64); err == nil {
|
||||
limit = memGB * GB
|
||||
}
|
||||
}
|
||||
gctuner.Tuning(threshold / numWorker)
|
||||
log.Printf("[%d] Memory Limit: %d GB, Memory Threshold: %d MB\n", os.Getpid(), limit/GB, threshold/MB)
|
||||
log.Printf("[%d] Memory Threshold Per Worker: %d MB\n", os.Getpid(), threshold/numWorker/MB)
|
||||
}
|
||||
|
||||
func enableSyncGC() {
|
||||
var debugAsyncGC = os.Getenv("AsynyncGCEnv") == ""
|
||||
go func () {
|
||||
if !debugAsyncGC {
|
||||
return
|
||||
}
|
||||
println("Begin GC looping...")
|
||||
log.Printf("Begin GC looping...")
|
||||
for {
|
||||
runtime.GC()
|
||||
debug.FreeOSMemory()
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
func TestMain(m *testing.M) {
|
||||
// Avoid OOM
|
||||
setMemLimit(8 * GB)
|
||||
enableSyncGC()
|
||||
time.Sleep(time.Millisecond)
|
||||
m.Run()
|
||||
}
|
||||
11
fuzz/go.mod
11
fuzz/go.mod
|
|
@ -3,18 +3,19 @@ module github.com/bytedance/sonic/fuzz
|
|||
go 1.18
|
||||
|
||||
require (
|
||||
github.com/bytedance/sonic v1.0.0
|
||||
github.com/stretchr/testify v1.7.0
|
||||
github.com/bytedance/gopkg v0.0.0-20221122125632-68358b8ecec6
|
||||
github.com/bytedance/sonic v1.5.0
|
||||
github.com/davecgh/go-spew v1.1.1
|
||||
github.com/stretchr/testify v1.8.1
|
||||
)
|
||||
|
||||
require (
|
||||
github.com/chenzhuoyu/base64x v0.0.0-20211019084208-fb5309c8db06 // indirect
|
||||
github.com/davecgh/go-spew v1.1.1 // indirect
|
||||
github.com/chenzhuoyu/base64x v0.0.0-20221115062448-fe3a3abad311 // indirect
|
||||
github.com/klauspost/cpuid/v2 v2.0.9 // indirect
|
||||
github.com/pmezard/go-difflib v1.0.0 // indirect
|
||||
github.com/twitchyliquid64/golang-asm v0.15.1 // indirect
|
||||
golang.org/x/arch v0.0.0-20210923205945-b76863e36670 // indirect
|
||||
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c // indirect
|
||||
gopkg.in/yaml.v3 v3.0.1 // indirect
|
||||
)
|
||||
|
||||
replace github.com/bytedance/sonic => ../.
|
||||
|
|
|
|||
37
fuzz/go.sum
37
fuzz/go.sum
|
|
@ -1,40 +1,31 @@
|
|||
github.com/chenzhuoyu/base64x v0.0.0-20211019084208-fb5309c8db06 h1:1sDoSuDPWzhkdzNVxCxtIaKiAe96ESVPv8coGwc1gZ4=
|
||||
github.com/chenzhuoyu/base64x v0.0.0-20211019084208-fb5309c8db06/go.mod h1:DH46F32mSOjUmXrMHnKwZdA8wcEefY7UVqBKYGjpdQY=
|
||||
github.com/bytedance/gopkg v0.0.0-20221122125632-68358b8ecec6 h1:FCLDGi1EmB7JzjVVYNZiqc/zAJj2BQ5M0lfkVOxbfs8=
|
||||
github.com/bytedance/gopkg v0.0.0-20221122125632-68358b8ecec6/go.mod h1:5FoAH5xUHHCMDvQPy1rnj8moqLkLHFaDVBjHhcFwEi0=
|
||||
github.com/chenzhuoyu/base64x v0.0.0-20221115062448-fe3a3abad311 h1:qSGYFH7+jGhDF8vLC+iwCD4WpbV1EBDSzWkJODFLams=
|
||||
github.com/chenzhuoyu/base64x v0.0.0-20221115062448-fe3a3abad311/go.mod h1:b583jCggY9gE99b6G5LEC39OIiVsWj+R97kbl5odCEk=
|
||||
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
|
||||
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
github.com/goccy/go-json v0.9.4 h1:L8MLKG2mvVXiQu07qB6hmfqeSYQdOnqPot2GhsIwIaI=
|
||||
github.com/goccy/go-json v0.9.4/go.mod h1:6MelG93GURQebXPDq3khkgXZkazVtN9CRI+MGFi0w8I=
|
||||
github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
|
||||
github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM=
|
||||
github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo=
|
||||
github.com/klauspost/cpuid/v2 v2.0.9 h1:lgaqFMSdTdQYdZ04uHyN2d/eKdOMyi2YLSvlQIBFYa4=
|
||||
github.com/klauspost/cpuid/v2 v2.0.9/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg=
|
||||
github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421 h1:ZqeYNhU3OHLH3mGKHDcjJRFFRrJa6eAM5H+CtDdOsPc=
|
||||
github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
|
||||
github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M=
|
||||
github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk=
|
||||
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
|
||||
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
||||
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
|
||||
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
|
||||
github.com/stretchr/testify v1.7.0 h1:nwc3DEeHmmLAfoZucVR881uASk0Mfjw8xYJ99tb5CcY=
|
||||
github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
|
||||
github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo=
|
||||
github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
|
||||
github.com/tidwall/gjson v1.12.1/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk=
|
||||
github.com/tidwall/gjson v1.13.0 h1:3TFY9yxOQShrvmjdM76K+jc66zJeT6D3/VFFYCGQf7M=
|
||||
github.com/tidwall/gjson v1.13.0/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk=
|
||||
github.com/tidwall/match v1.1.1 h1:+Ho715JplO36QYgwN9PGYNhgZvoUSc9X2c80KVTi+GA=
|
||||
github.com/tidwall/match v1.1.1/go.mod h1:eRSPERbgtNPcGhD8UCthc6PmLEQXEWd3PRB5JTxsfmM=
|
||||
github.com/tidwall/pretty v1.2.0 h1:RWIZEg2iJ8/g6fDDYzMpobmaoGh5OLl4AXtGUGPcqCs=
|
||||
github.com/tidwall/pretty v1.2.0/go.mod h1:ITEVvHYasfjBbM0u2Pg8T2nJnzm8xPwvNhhsoaGGjNU=
|
||||
github.com/tidwall/sjson v1.2.4 h1:cuiLzLnaMeBhRmEv00Lpk3tkYrcxpmbU81tAY4Dw0tc=
|
||||
github.com/tidwall/sjson v1.2.4/go.mod h1:098SZ494YoMWPmMO6ct4dcFnqxwj9r/gF0Etp19pSNM=
|
||||
github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
|
||||
github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
|
||||
github.com/stretchr/testify v1.8.1 h1:w7B6lhMri9wdJUVmEZPGGhZzrYTPvgJArz7wNPgYKsk=
|
||||
github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
|
||||
github.com/twitchyliquid64/golang-asm v0.15.1 h1:SU5vSMR7hnwNxj24w34ZyCi/FmDZTkS4MhqMhdFk5YI=
|
||||
github.com/twitchyliquid64/golang-asm v0.15.1/go.mod h1:a1lVb/DtPvCB8fslRZhAngC2+aY1QWCk3Cedj/Gdt08=
|
||||
golang.org/x/arch v0.0.0-20210923205945-b76863e36670 h1:18EFjUmQOcUvxNYSkA6jO9VAiXCnxFY6NyDX0bHDmkU=
|
||||
golang.org/x/arch v0.0.0-20210923205945-b76863e36670/go.mod h1:5om86z9Hs0C8fWVUuoMHwpExlXzs5Tkyp9hOrfG7pp8=
|
||||
golang.org/x/sync v0.0.0-20210220032951-036812b2e83c/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
|
||||
golang.org/x/sys v0.0.0-20221010170243-090e33056c14/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
|
||||
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
|
||||
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c h1:dUUwHk2QECo/6vqA44rthZ8ie2QXMNeKRTHCNY2nXvo=
|
||||
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
||||
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
|
||||
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
||||
rsc.io/pdf v0.1.1/go.mod h1:n8OzWcQ6Sp37PL01nO98y4iUCRdTGarVfzxY20ICaU4=
|
||||
|
|
|
|||
|
|
@ -71,10 +71,10 @@ func generateJSONTag(name string) reflect.StructTag {
|
|||
name = strings.Split(name, ",")[0] // remove origin "," in tag name
|
||||
switch int(rand.Int() % 5) {
|
||||
case 0: return reflect.StructTag(`json:"-"`) // always omitted
|
||||
case 1: return reflect.StructTag("") // empty tag
|
||||
case 2: opt = "" // empty opt
|
||||
case 3: opt = "omitempty"
|
||||
case 4: opt = "string"
|
||||
case 1: opt = "" // empty opt
|
||||
case 2: opt = "omitempty"
|
||||
// case 3: opt = "string"
|
||||
default: return reflect.StructTag("") // empty tag
|
||||
}
|
||||
return reflect.StructTag(fmt.Sprintf(`json:"%s,%s"`, name, opt))
|
||||
}
|
||||
|
|
@ -146,7 +146,7 @@ func fuzzDynamicStruct(t *testing.T, data []byte, v map[string]interface{}) {
|
|||
require.NoErrorf(t, err, "error in sonic pretouch struct %v", typ)
|
||||
|
||||
// Unmarshal fuzz
|
||||
serr := sonic.Unmarshal(data, &sv)
|
||||
serr := target.Unmarshal(data, &sv)
|
||||
jerr := json.Unmarshal(data, &jv)
|
||||
require.Equalf(t, serr != nil, jerr != nil, "different error in sonic unmarshal %v", typ)
|
||||
if serr != nil {
|
||||
|
|
@ -155,7 +155,7 @@ func fuzzDynamicStruct(t *testing.T, data []byte, v map[string]interface{}) {
|
|||
require.Equal(t, sv, jv, "different result in sonic unmarshal %v", typ)
|
||||
|
||||
// Marshal fuzz
|
||||
sout, serr := sonic.Marshal(sv)
|
||||
sout, serr := target.Marshal(sv)
|
||||
jout, jerr := json.Marshal(jv)
|
||||
require.NoError(t, serr, "error in sonic marshal %v", typ)
|
||||
require.NoError(t, jerr, "error in json marshal %v", typ)
|
||||
|
|
|
|||
|
|
@ -123,3 +123,13 @@ func __validate_one(s *string, p *int, m *types.StateMachine) (ret int)
|
|||
//go:noescape
|
||||
//goland:noinspection GoUnusedParameter
|
||||
func __get_by_path(s *string, p *int, path *[]interface{}) (ret int)
|
||||
|
||||
//go:nosplit
|
||||
//go:noescape
|
||||
//goland:noinspection GoUnusedParameter
|
||||
func __validate_utf8(s *string, p *int, m *types.StateMachine) (ret int)
|
||||
|
||||
//go:nosplit
|
||||
//go:noescape
|
||||
//goland:noinspection GoUnusedParameter
|
||||
func __validate_utf8_fast(s *string) (ret int)
|
||||
File diff suppressed because it is too large
Load diff
|
|
@ -256,15 +256,6 @@ func TestNative_Vstring_ValidUnescapedChars(t *testing.T) {
|
|||
assert.Equal(t, -int(types.ERR_INVALID_CHAR), int(v.Vt))
|
||||
}
|
||||
|
||||
func TestNative_Vstring_ValidUtf8(t *testing.T) {
|
||||
var v types.JsonState
|
||||
valid := uint64(types.F_VALIDATE_STRING)
|
||||
i := 0
|
||||
s := "test\xff\""
|
||||
__vstring(&s, &i, &v, valid)
|
||||
assert.Equal(t, -int(types.ERR_INVALID_CHAR), int(v.Vt))
|
||||
}
|
||||
|
||||
func TestNative_VstringEscapeEOF(t *testing.T) {
|
||||
var v types.JsonState
|
||||
i := 0
|
||||
|
|
@ -275,51 +266,6 @@ func TestNative_VstringEscapeEOF(t *testing.T) {
|
|||
assert.Equal(t, int64(0), v.Iv)
|
||||
}
|
||||
|
||||
func TestNative_ValidateOne(t *testing.T) {
|
||||
{
|
||||
p := 0
|
||||
s := "\"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\\n\\r\\b\\f😁ſ景\xef\xbf\xbf\xf4\x8f\xbf\xbf\xc2\x80xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\""
|
||||
r := __validate_one(&s, &p, &types.StateMachine{})
|
||||
assert.Equal(t, len(s), p)
|
||||
assert.Equal(t, 0, r)
|
||||
}
|
||||
{
|
||||
p := 0
|
||||
s := "\"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\bxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\"x"
|
||||
r := __validate_one(&s, &p, &types.StateMachine{})
|
||||
assert.Equal(t, 64, p)
|
||||
assert.Equal(t, -int(types.ERR_INVALID_CHAR), r)
|
||||
}
|
||||
{
|
||||
p := 0
|
||||
s := "\"\x00\"x"
|
||||
r := __validate_one(&s, &p, &types.StateMachine{})
|
||||
assert.Equal(t, 1, p)
|
||||
assert.Equal(t, -int(types.ERR_INVALID_CHAR), r)
|
||||
}
|
||||
{
|
||||
p := 0
|
||||
s := "\"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\x80xxxxxxxxxxxxxxxxxxxxxxxxxxxxx\"x"
|
||||
r := __validate_one(&s, &p, &types.StateMachine{})
|
||||
assert.Equal(t, 64, p)
|
||||
assert.Equal(t, -int(types.ERR_INVALID_CHAR), r)
|
||||
}
|
||||
{
|
||||
p := 0
|
||||
s := "\"\x80\"x"
|
||||
r := __validate_one(&s, &p, &types.StateMachine{})
|
||||
assert.Equal(t, 1, p)
|
||||
assert.Equal(t, -int(types.ERR_INVALID_CHAR), r)
|
||||
}
|
||||
{
|
||||
p := 0
|
||||
s := "\"\xed\xbf\xbf\"x"
|
||||
r := __validate_one(&s, &p, &types.StateMachine{})
|
||||
assert.Equal(t, 1, p)
|
||||
assert.Equal(t, -int(types.ERR_INVALID_CHAR), r)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNative_VstringHangUpOnRandomData(t *testing.T) {
|
||||
v, e := hex.DecodeString(
|
||||
"228dc61efd54ef80a908fb6026b7f2d5f92a257ba8b347c995f259eb8685376a" +
|
||||
|
|
|
|||
|
|
@ -9,32 +9,34 @@ package avx
|
|||
func __native_entry__() uintptr
|
||||
|
||||
var (
|
||||
_subr__f32toa = __native_entry__() + 28656
|
||||
_subr__f64toa = __native_entry__() + 496
|
||||
_subr__get_by_path = __native_entry__() + 26848
|
||||
_subr__html_escape = __native_entry__() + 10480
|
||||
_subr__i64toa = __native_entry__() + 4176
|
||||
_subr__lspace = __native_entry__() + 80
|
||||
_subr__quote = __native_entry__() + 5552
|
||||
_subr__skip_array = __native_entry__() + 20160
|
||||
_subr__skip_number = __native_entry__() + 23472
|
||||
_subr__skip_object = __native_entry__() + 22048
|
||||
_subr__skip_one = __native_entry__() + 23616
|
||||
_subr__skip_one_fast = __native_entry__() + 23824
|
||||
_subr__u64toa = __native_entry__() + 4288
|
||||
_subr__unquote = __native_entry__() + 7296
|
||||
_subr__validate_one = __native_entry__() + 23648
|
||||
_subr__value = __native_entry__() + 13728
|
||||
_subr__vnumber = __native_entry__() + 17904
|
||||
_subr__vsigned = __native_entry__() + 19456
|
||||
_subr__vstring = __native_entry__() + 15808
|
||||
_subr__vunsigned = __native_entry__() + 19808
|
||||
_subr__f32toa = __native_entry__() + 29744
|
||||
_subr__f64toa = __native_entry__() + 496
|
||||
_subr__get_by_path = __native_entry__() + 27424
|
||||
_subr__html_escape = __native_entry__() + 9968
|
||||
_subr__i64toa = __native_entry__() + 4112
|
||||
_subr__lspace = __native_entry__() + 80
|
||||
_subr__quote = __native_entry__() + 5792
|
||||
_subr__skip_array = __native_entry__() + 20576
|
||||
_subr__skip_number = __native_entry__() + 23920
|
||||
_subr__skip_object = __native_entry__() + 22496
|
||||
_subr__skip_one = __native_entry__() + 24080
|
||||
_subr__skip_one_fast = __native_entry__() + 24320
|
||||
_subr__u64toa = __native_entry__() + 4384
|
||||
_subr__unquote = __native_entry__() + 7488
|
||||
_subr__validate_one = __native_entry__() + 24144
|
||||
_subr__validate_utf8 = __native_entry__() + 28464
|
||||
_subr__validate_utf8_fast = __native_entry__() + 29136
|
||||
_subr__value = __native_entry__() + 14672
|
||||
_subr__vnumber = __native_entry__() + 18320
|
||||
_subr__vsigned = __native_entry__() + 19856
|
||||
_subr__vstring = __native_entry__() + 16864
|
||||
_subr__vunsigned = __native_entry__() + 20208
|
||||
)
|
||||
|
||||
const (
|
||||
_stack__f32toa = 64
|
||||
_stack__f32toa = 56
|
||||
_stack__f64toa = 80
|
||||
_stack__get_by_path = 296
|
||||
_stack__get_by_path = 312
|
||||
_stack__html_escape = 64
|
||||
_stack__i64toa = 16
|
||||
_stack__lspace = 8
|
||||
|
|
@ -45,10 +47,12 @@ const (
|
|||
_stack__skip_one = 128
|
||||
_stack__skip_one_fast = 208
|
||||
_stack__u64toa = 8
|
||||
_stack__unquote = 72
|
||||
_stack__unquote = 128
|
||||
_stack__validate_one = 128
|
||||
_stack__value = 336
|
||||
_stack__vnumber = 248
|
||||
_stack__validate_utf8 = 48
|
||||
_stack__validate_utf8_fast = 24
|
||||
_stack__value = 368
|
||||
_stack__vnumber = 280
|
||||
_stack__vsigned = 16
|
||||
_stack__vstring = 128
|
||||
_stack__vunsigned = 24
|
||||
|
|
@ -70,6 +74,8 @@ var (
|
|||
_ = _subr__u64toa
|
||||
_ = _subr__unquote
|
||||
_ = _subr__validate_one
|
||||
_ = _subr__validate_utf8
|
||||
_ = _subr__validate_utf8_fast
|
||||
_ = _subr__value
|
||||
_ = _subr__vnumber
|
||||
_ = _subr__vsigned
|
||||
|
|
@ -93,6 +99,8 @@ const (
|
|||
_ = _stack__u64toa
|
||||
_ = _stack__unquote
|
||||
_ = _stack__validate_one
|
||||
_ = _stack__validate_utf8
|
||||
_ = _stack__validate_utf8_fast
|
||||
_ = _stack__value
|
||||
_ = _stack__vnumber
|
||||
_ = _stack__vsigned
|
||||
|
|
|
|||
|
|
@ -123,3 +123,13 @@ func __validate_one(s *string, p *int, m *types.StateMachine) (ret int)
|
|||
//go:noescape
|
||||
//goland:noinspection GoUnusedParameter
|
||||
func __get_by_path(s *string, p *int, path *[]interface{}) (ret int)
|
||||
|
||||
//go:nosplit
|
||||
//go:noescape
|
||||
//goland:noinspection GoUnusedParameter
|
||||
func __validate_utf8(s *string, p *int, m *types.StateMachine) (ret int)
|
||||
|
||||
//go:nosplit
|
||||
//go:noescape
|
||||
//goland:noinspection GoUnusedParameter
|
||||
func __validate_utf8_fast(s *string) (ret int)
|
||||
File diff suppressed because it is too large
Load diff
|
|
@ -256,15 +256,6 @@ func TestNative_Vstring_ValidUnescapedChars(t *testing.T) {
|
|||
assert.Equal(t, -int(types.ERR_INVALID_CHAR), int(v.Vt))
|
||||
}
|
||||
|
||||
func TestNative_Vstring_ValidUtf8(t *testing.T) {
|
||||
var v types.JsonState
|
||||
valid := uint64(types.F_VALIDATE_STRING)
|
||||
i := 0
|
||||
s := "test\xff\""
|
||||
__vstring(&s, &i, &v, valid)
|
||||
assert.Equal(t, -int(types.ERR_INVALID_CHAR), int(v.Vt))
|
||||
}
|
||||
|
||||
func TestNative_VstringEscapeEOF(t *testing.T) {
|
||||
var v types.JsonState
|
||||
i := 0
|
||||
|
|
@ -275,51 +266,6 @@ func TestNative_VstringEscapeEOF(t *testing.T) {
|
|||
assert.Equal(t, int64(0), v.Iv)
|
||||
}
|
||||
|
||||
func TestNative_ValidateOne(t *testing.T) {
|
||||
{
|
||||
p := 0
|
||||
s := "\"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\\n\\r\\b\\f😁ſ景\xef\xbf\xbf\xf4\x8f\xbf\xbf\xc2\x80xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\""
|
||||
r := __validate_one(&s, &p, &types.StateMachine{})
|
||||
assert.Equal(t, len(s), p)
|
||||
assert.Equal(t, 0, r)
|
||||
}
|
||||
{
|
||||
p := 0
|
||||
s := "\"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\bxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\"x"
|
||||
r := __validate_one(&s, &p, &types.StateMachine{})
|
||||
assert.Equal(t, 64, p)
|
||||
assert.Equal(t, -int(types.ERR_INVALID_CHAR), r)
|
||||
}
|
||||
{
|
||||
p := 0
|
||||
s := "\"\x00\"x"
|
||||
r := __validate_one(&s, &p, &types.StateMachine{})
|
||||
assert.Equal(t, 1, p)
|
||||
assert.Equal(t, -int(types.ERR_INVALID_CHAR), r)
|
||||
}
|
||||
{
|
||||
p := 0
|
||||
s := "\"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\x80xxxxxxxxxxxxxxxxxxxxxxxxxxxxx\"x"
|
||||
r := __validate_one(&s, &p, &types.StateMachine{})
|
||||
assert.Equal(t, 64, p)
|
||||
assert.Equal(t, -int(types.ERR_INVALID_CHAR), r)
|
||||
}
|
||||
{
|
||||
p := 0
|
||||
s := "\"\x80\"x"
|
||||
r := __validate_one(&s, &p, &types.StateMachine{})
|
||||
assert.Equal(t, 1, p)
|
||||
assert.Equal(t, -int(types.ERR_INVALID_CHAR), r)
|
||||
}
|
||||
{
|
||||
p := 0
|
||||
s := "\"\xed\xbf\xbf\"x"
|
||||
r := __validate_one(&s, &p, &types.StateMachine{})
|
||||
assert.Equal(t, 1, p)
|
||||
assert.Equal(t, -int(types.ERR_INVALID_CHAR), r)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNative_VstringHangUpOnRandomData(t *testing.T) {
|
||||
v, e := hex.DecodeString(
|
||||
"228dc61efd54ef80a908fb6026b7f2d5f92a257ba8b347c995f259eb8685376a" +
|
||||
|
|
|
|||
|
|
@ -9,48 +9,52 @@ package avx2
|
|||
func __native_entry__() uintptr
|
||||
|
||||
var (
|
||||
_subr__f32toa = __native_entry__() + 32816
|
||||
_subr__f64toa = __native_entry__() + 752
|
||||
_subr__get_by_path = __native_entry__() + 30896
|
||||
_subr__html_escape = __native_entry__() + 12320
|
||||
_subr__i64toa = __native_entry__() + 4432
|
||||
_subr__lspace = __native_entry__() + 224
|
||||
_subr__quote = __native_entry__() + 5904
|
||||
_subr__skip_array = __native_entry__() + 23472
|
||||
_subr__skip_number = __native_entry__() + 27440
|
||||
_subr__skip_object = __native_entry__() + 25392
|
||||
_subr__skip_one = __native_entry__() + 27584
|
||||
_subr__skip_one_fast = __native_entry__() + 27984
|
||||
_subr__u64toa = __native_entry__() + 4544
|
||||
_subr__unquote = __native_entry__() + 8848
|
||||
_subr__validate_one = __native_entry__() + 27616
|
||||
_subr__value = __native_entry__() + 16896
|
||||
_subr__vnumber = __native_entry__() + 21216
|
||||
_subr__vsigned = __native_entry__() + 22768
|
||||
_subr__vstring = __native_entry__() + 19280
|
||||
_subr__vunsigned = __native_entry__() + 23120
|
||||
_subr__f32toa = __native_entry__() + 35216
|
||||
_subr__f64toa = __native_entry__() + 752
|
||||
_subr__get_by_path = __native_entry__() + 30384
|
||||
_subr__html_escape = __native_entry__() + 11712
|
||||
_subr__i64toa = __native_entry__() + 4368
|
||||
_subr__lspace = __native_entry__() + 224
|
||||
_subr__quote = __native_entry__() + 6160
|
||||
_subr__skip_array = __native_entry__() + 22864
|
||||
_subr__skip_number = __native_entry__() + 26928
|
||||
_subr__skip_object = __native_entry__() + 24864
|
||||
_subr__skip_one = __native_entry__() + 27088
|
||||
_subr__skip_one_fast = __native_entry__() + 27504
|
||||
_subr__u64toa = __native_entry__() + 4640
|
||||
_subr__unquote = __native_entry__() + 8960
|
||||
_subr__validate_one = __native_entry__() + 27152
|
||||
_subr__validate_utf8 = __native_entry__() + 31552
|
||||
_subr__validate_utf8_fast = __native_entry__() + 32496
|
||||
_subr__value = __native_entry__() + 16816
|
||||
_subr__vnumber = __native_entry__() + 20608
|
||||
_subr__vsigned = __native_entry__() + 22144
|
||||
_subr__vstring = __native_entry__() + 19312
|
||||
_subr__vunsigned = __native_entry__() + 22496
|
||||
)
|
||||
|
||||
const (
|
||||
_stack__f32toa = 64
|
||||
_stack__f32toa = 56
|
||||
_stack__f64toa = 80
|
||||
_stack__get_by_path = 304
|
||||
_stack__get_by_path = 320
|
||||
_stack__html_escape = 72
|
||||
_stack__i64toa = 16
|
||||
_stack__lspace = 8
|
||||
_stack__quote = 72
|
||||
_stack__skip_array = 136
|
||||
_stack__skip_array = 120
|
||||
_stack__skip_number = 80
|
||||
_stack__skip_object = 136
|
||||
_stack__skip_one = 136
|
||||
_stack__skip_object = 120
|
||||
_stack__skip_one = 120
|
||||
_stack__skip_one_fast = 216
|
||||
_stack__u64toa = 8
|
||||
_stack__unquote = 72
|
||||
_stack__validate_one = 136
|
||||
_stack__value = 336
|
||||
_stack__vnumber = 248
|
||||
_stack__unquote = 128
|
||||
_stack__validate_one = 120
|
||||
_stack__validate_utf8 = 48
|
||||
_stack__validate_utf8_fast = 200
|
||||
_stack__value = 368
|
||||
_stack__vnumber = 280
|
||||
_stack__vsigned = 16
|
||||
_stack__vstring = 136
|
||||
_stack__vstring = 104
|
||||
_stack__vunsigned = 24
|
||||
)
|
||||
|
||||
|
|
@ -70,6 +74,8 @@ var (
|
|||
_ = _subr__u64toa
|
||||
_ = _subr__unquote
|
||||
_ = _subr__validate_one
|
||||
_ = _subr__validate_utf8
|
||||
_ = _subr__validate_utf8_fast
|
||||
_ = _subr__value
|
||||
_ = _subr__vnumber
|
||||
_ = _subr__vsigned
|
||||
|
|
@ -93,6 +99,8 @@ const (
|
|||
_ = _stack__u64toa
|
||||
_ = _stack__unquote
|
||||
_ = _stack__validate_one
|
||||
_ = _stack__validate_utf8
|
||||
_ = _stack__validate_utf8_fast
|
||||
_ = _stack__value
|
||||
_ = _stack__vnumber
|
||||
_ = _stack__vsigned
|
||||
|
|
|
|||
|
|
@ -26,7 +26,10 @@ import (
|
|||
`github.com/bytedance/sonic/internal/native/types`
|
||||
)
|
||||
|
||||
const MaxFrameSize uintptr = 400
|
||||
const (
|
||||
MaxFrameSize uintptr = 400
|
||||
BufPaddingSize int = 64
|
||||
)
|
||||
|
||||
var (
|
||||
S_f64toa uintptr
|
||||
|
|
@ -113,6 +116,16 @@ func U64toa(out *byte, val uint64) (ret int)
|
|||
//goland:noinspection GoUnusedParameter
|
||||
func F64toa(out *byte, val float64) (ret int)
|
||||
|
||||
//go:nosplit
|
||||
//go:noescape
|
||||
//goland:noinspection GoUnusedParameter
|
||||
func ValidateUTF8(s *string, p *int, m *types.StateMachine) (ret int)
|
||||
|
||||
//go:nosplit
|
||||
//go:noescape
|
||||
//goland:noinspection GoUnusedParameter
|
||||
func ValidateUTF8Fast(s *string) (ret int)
|
||||
|
||||
func useAVX() {
|
||||
S_f64toa = avx.S_f64toa
|
||||
S_f32toa = avx.S_f32toa
|
||||
|
|
|
|||
|
|
@ -45,7 +45,6 @@ TEXT ·HTMLEscape(SB), NOSPLIT, $0 - 40
|
|||
JMP github·com∕bytedance∕sonic∕internal∕native∕avx·__html_escape(SB)
|
||||
JMP github·com∕bytedance∕sonic∕internal∕native∕sse·__html_escape(SB)
|
||||
|
||||
|
||||
TEXT ·Value(SB), NOSPLIT, $0 - 48
|
||||
CMPB github·com∕bytedance∕sonic∕internal∕cpu·HasAVX2(SB), $0
|
||||
JE 2(PC)
|
||||
|
|
@ -81,6 +80,7 @@ TEXT ·GetByPath(SB), NOSPLIT, $0 - 32
|
|||
JE 2(PC)
|
||||
JMP github·com∕bytedance∕sonic∕internal∕native∕avx·__get_by_path(SB)
|
||||
JMP github·com∕bytedance∕sonic∕internal∕native∕sse·__get_by_path(SB)
|
||||
|
||||
TEXT ·ValidateOne(SB), NOSPLIT, $0 - 32
|
||||
CMPB github·com∕bytedance∕sonic∕internal∕cpu·HasAVX2(SB), $0
|
||||
JE 2(PC)
|
||||
|
|
@ -90,6 +90,24 @@ TEXT ·ValidateOne(SB), NOSPLIT, $0 - 32
|
|||
JMP github·com∕bytedance∕sonic∕internal∕native∕avx·__validate_one(SB)
|
||||
JMP github·com∕bytedance∕sonic∕internal∕native∕sse·__validate_one(SB)
|
||||
|
||||
TEXT ·ValidateUTF8(SB), NOSPLIT, $0 - 32 // argsize 32 = 3 pointer args + 1 int result (8 bytes each on amd64), same as ·ValidateOne
|
||||
CMPB github·com∕bytedance∕sonic∕internal∕cpu·HasAVX2(SB), $0 // prefer the AVX2 build when the CPU flag is set
|
||||
JE 2(PC)
|
||||
JMP github·com∕bytedance∕sonic∕internal∕native∕avx2·__validate_utf8(SB)
|
||||
CMPB github·com∕bytedance∕sonic∕internal∕cpu·HasAVX(SB), $0 // otherwise try the AVX build
|
||||
JE 2(PC)
|
||||
JMP github·com∕bytedance∕sonic∕internal∕native∕avx·__validate_utf8(SB)
|
||||
JMP github·com∕bytedance∕sonic∕internal∕native∕sse·__validate_utf8(SB) // fallback: SSE build
|
||||
|
||||
TEXT ·ValidateUTF8Fast(SB), NOSPLIT, $0 - 16
|
||||
CMPB github·com∕bytedance∕sonic∕internal∕cpu·HasAVX2(SB), $0
|
||||
JE 2(PC)
|
||||
JMP github·com∕bytedance∕sonic∕internal∕native∕avx2·__validate_utf8_fast(SB)
|
||||
CMPB github·com∕bytedance∕sonic∕internal∕cpu·HasAVX(SB), $0
|
||||
JE 2(PC)
|
||||
JMP github·com∕bytedance∕sonic∕internal∕native∕avx·__validate_utf8_fast(SB)
|
||||
JMP github·com∕bytedance∕sonic∕internal∕native∕sse·__validate_utf8_fast(SB)
|
||||
|
||||
TEXT ·I64toa(SB), NOSPLIT, $0 - 32
|
||||
CMPB github·com∕bytedance∕sonic∕internal∕cpu·HasAVX2(SB), $0
|
||||
JE 2(PC)
|
||||
|
|
|
|||
|
|
@ -121,3 +121,13 @@ func __validate_one(s *string, p *int, m *types.StateMachine) (ret int)
|
|||
//go:noescape
|
||||
//goland:noinspection GoUnusedParameter
|
||||
func __get_by_path(s *string, p *int, path *[]interface{}) (ret int)
|
||||
|
||||
//go:nosplit
|
||||
//go:noescape
|
||||
//goland:noinspection GoUnusedParameter
|
||||
func __validate_utf8(s *string, p *int, m *types.StateMachine) (ret int)
|
||||
|
||||
//go:nosplit
|
||||
//go:noescape
|
||||
//goland:noinspection GoUnusedParameter
|
||||
func __validate_utf8_fast(s *string) (ret int)
|
||||
|
|
@ -254,15 +254,6 @@ func TestNative_Vstring_ValidUnescapedChars(t *testing.T) {
|
|||
assert.Equal(t, -int(types.ERR_INVALID_CHAR), int(v.Vt))
|
||||
}
|
||||
|
||||
func TestNative_Vstring_ValidUtf8(t *testing.T) {
|
||||
var v types.JsonState
|
||||
valid := uint64(types.F_VALIDATE_STRING)
|
||||
i := 0
|
||||
s := "test\xff\""
|
||||
__vstring(&s, &i, &v, valid)
|
||||
assert.Equal(t, -int(types.ERR_INVALID_CHAR), int(v.Vt))
|
||||
}
|
||||
|
||||
func TestNative_VstringEscapeEOF(t *testing.T) {
|
||||
var v types.JsonState
|
||||
i := 0
|
||||
|
|
@ -273,51 +264,6 @@ func TestNative_VstringEscapeEOF(t *testing.T) {
|
|||
assert.Equal(t, int64(0), v.Iv)
|
||||
}
|
||||
|
||||
func TestNative_ValidateOne(t *testing.T) {
|
||||
{
|
||||
p := 0
|
||||
s := "\"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\\n\\r\\b\\f😁ſ景\xef\xbf\xbf\xf4\x8f\xbf\xbf\xc2\x80xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\""
|
||||
r := __validate_one(&s, &p, &types.StateMachine{})
|
||||
assert.Equal(t, len(s), p)
|
||||
assert.Equal(t, 0, r)
|
||||
}
|
||||
{
|
||||
p := 0
|
||||
s := "\"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\bxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\"x"
|
||||
r := __validate_one(&s, &p, &types.StateMachine{})
|
||||
assert.Equal(t, 64, p)
|
||||
assert.Equal(t, -int(types.ERR_INVALID_CHAR), r)
|
||||
}
|
||||
{
|
||||
p := 0
|
||||
s := "\"\x00\"x"
|
||||
r := __validate_one(&s, &p, &types.StateMachine{})
|
||||
assert.Equal(t, 1, p)
|
||||
assert.Equal(t, -int(types.ERR_INVALID_CHAR), r)
|
||||
}
|
||||
{
|
||||
p := 0
|
||||
s := "\"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\x80xxxxxxxxxxxxxxxxxxxxxxxxxxxxx\"x"
|
||||
r := __validate_one(&s, &p, &types.StateMachine{})
|
||||
assert.Equal(t, 64, p)
|
||||
assert.Equal(t, -int(types.ERR_INVALID_CHAR), r)
|
||||
}
|
||||
{
|
||||
p := 0
|
||||
s := "\"\x80\"x"
|
||||
r := __validate_one(&s, &p, &types.StateMachine{})
|
||||
assert.Equal(t, 1, p)
|
||||
assert.Equal(t, -int(types.ERR_INVALID_CHAR), r)
|
||||
}
|
||||
{
|
||||
p := 0
|
||||
s := "\"\xed\xbf\xbf\"x"
|
||||
r := __validate_one(&s, &p, &types.StateMachine{})
|
||||
assert.Equal(t, 1, p)
|
||||
assert.Equal(t, -int(types.ERR_INVALID_CHAR), r)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNative_VstringHangUpOnRandomData(t *testing.T) {
|
||||
v, e := hex.DecodeString(
|
||||
"228dc61efd54ef80a908fb6026b7f2d5f92a257ba8b347c995f259eb8685376a" +
|
||||
|
|
|
|||
|
|
@ -123,3 +123,13 @@ func __validate_one(s *string, p *int, m *types.StateMachine) (ret int)
|
|||
//go:noescape
|
||||
//goland:noinspection GoUnusedParameter
|
||||
func __get_by_path(s *string, p *int, path *[]interface{}) (ret int)
|
||||
|
||||
//go:nosplit
|
||||
//go:noescape
|
||||
//goland:noinspection GoUnusedParameter
|
||||
func __validate_utf8(s *string, p *int, m *types.StateMachine) (ret int)
|
||||
|
||||
//go:nosplit
|
||||
//go:noescape
|
||||
//goland:noinspection GoUnusedParameter
|
||||
func __validate_utf8_fast(s *string) (ret int)
|
||||
File diff suppressed because it is too large
Load diff
|
|
@ -256,15 +256,6 @@ func TestNative_Vstring_ValidUnescapedChars(t *testing.T) {
|
|||
assert.Equal(t, -int(types.ERR_INVALID_CHAR), int(v.Vt))
|
||||
}
|
||||
|
||||
func TestNative_Vstring_ValidUtf8(t *testing.T) {
|
||||
var v types.JsonState
|
||||
valid := uint64(types.F_VALIDATE_STRING)
|
||||
i := 0
|
||||
s := "test\xff\""
|
||||
__vstring(&s, &i, &v, valid)
|
||||
assert.Equal(t, -int(types.ERR_INVALID_CHAR), int(v.Vt))
|
||||
}
|
||||
|
||||
func TestNative_VstringEscapeEOF(t *testing.T) {
|
||||
var v types.JsonState
|
||||
i := 0
|
||||
|
|
@ -275,51 +266,6 @@ func TestNative_VstringEscapeEOF(t *testing.T) {
|
|||
assert.Equal(t, int64(0), v.Iv)
|
||||
}
|
||||
|
||||
func TestNative_ValidateOne(t *testing.T) {
|
||||
{
|
||||
p := 0
|
||||
s := "\"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\\n\\r\\b\\f😁ſ景\xef\xbf\xbf\xf4\x8f\xbf\xbf\xc2\x80xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\""
|
||||
r := __validate_one(&s, &p, &types.StateMachine{})
|
||||
assert.Equal(t, len(s), p)
|
||||
assert.Equal(t, 0, r)
|
||||
}
|
||||
{
|
||||
p := 0
|
||||
s := "\"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\bxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\"x"
|
||||
r := __validate_one(&s, &p, &types.StateMachine{})
|
||||
assert.Equal(t, 64, p)
|
||||
assert.Equal(t, -int(types.ERR_INVALID_CHAR), r)
|
||||
}
|
||||
{
|
||||
p := 0
|
||||
s := "\"\x00\"x"
|
||||
r := __validate_one(&s, &p, &types.StateMachine{})
|
||||
assert.Equal(t, 1, p)
|
||||
assert.Equal(t, -int(types.ERR_INVALID_CHAR), r)
|
||||
}
|
||||
{
|
||||
p := 0
|
||||
s := "\"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\x80xxxxxxxxxxxxxxxxxxxxxxxxxxxxx\"x"
|
||||
r := __validate_one(&s, &p, &types.StateMachine{})
|
||||
assert.Equal(t, 64, p)
|
||||
assert.Equal(t, -int(types.ERR_INVALID_CHAR), r)
|
||||
}
|
||||
{
|
||||
p := 0
|
||||
s := "\"\x80\"x"
|
||||
r := __validate_one(&s, &p, &types.StateMachine{})
|
||||
assert.Equal(t, 1, p)
|
||||
assert.Equal(t, -int(types.ERR_INVALID_CHAR), r)
|
||||
}
|
||||
{
|
||||
p := 0
|
||||
s := "\"\xed\xbf\xbf\"x"
|
||||
r := __validate_one(&s, &p, &types.StateMachine{})
|
||||
assert.Equal(t, 1, p)
|
||||
assert.Equal(t, -int(types.ERR_INVALID_CHAR), r)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNative_VstringHangUpOnRandomData(t *testing.T) {
|
||||
v, e := hex.DecodeString(
|
||||
"228dc61efd54ef80a908fb6026b7f2d5f92a257ba8b347c995f259eb8685376a" +
|
||||
|
|
|
|||
|
|
@ -9,32 +9,34 @@ package sse
|
|||
func __native_entry__() uintptr
|
||||
|
||||
var (
|
||||
_subr__f32toa = __native_entry__() + 29152
|
||||
_subr__f64toa = __native_entry__() + 464
|
||||
_subr__get_by_path = __native_entry__() + 27392
|
||||
_subr__html_escape = __native_entry__() + 10416
|
||||
_subr__i64toa = __native_entry__() + 4048
|
||||
_subr__lspace = __native_entry__() + 80
|
||||
_subr__quote = __native_entry__() + 5456
|
||||
_subr__skip_array = __native_entry__() + 20144
|
||||
_subr__skip_number = __native_entry__() + 23488
|
||||
_subr__skip_object = __native_entry__() + 22032
|
||||
_subr__skip_one = __native_entry__() + 23632
|
||||
_subr__skip_one_fast = __native_entry__() + 23840
|
||||
_subr__u64toa = __native_entry__() + 4176
|
||||
_subr__unquote = __native_entry__() + 7232
|
||||
_subr__validate_one = __native_entry__() + 23664
|
||||
_subr__value = __native_entry__() + 13680
|
||||
_subr__vnumber = __native_entry__() + 17888
|
||||
_subr__vsigned = __native_entry__() + 19440
|
||||
_subr__vstring = __native_entry__() + 15760
|
||||
_subr__vunsigned = __native_entry__() + 19792
|
||||
_subr__f32toa = __native_entry__() + 28688
|
||||
_subr__f64toa = __native_entry__() + 464
|
||||
_subr__get_by_path = __native_entry__() + 26432
|
||||
_subr__html_escape = __native_entry__() + 9584
|
||||
_subr__i64toa = __native_entry__() + 3744
|
||||
_subr__lspace = __native_entry__() + 80
|
||||
_subr__quote = __native_entry__() + 5472
|
||||
_subr__skip_array = __native_entry__() + 19184
|
||||
_subr__skip_number = __native_entry__() + 22528
|
||||
_subr__skip_object = __native_entry__() + 21088
|
||||
_subr__skip_one = __native_entry__() + 22688
|
||||
_subr__skip_one_fast = __native_entry__() + 22912
|
||||
_subr__u64toa = __native_entry__() + 4016
|
||||
_subr__unquote = __native_entry__() + 7184
|
||||
_subr__validate_one = __native_entry__() + 22736
|
||||
_subr__validate_utf8 = __native_entry__() + 27456
|
||||
_subr__validate_utf8_fast = __native_entry__() + 28128
|
||||
_subr__value = __native_entry__() + 13216
|
||||
_subr__vnumber = __native_entry__() + 16928
|
||||
_subr__vsigned = __native_entry__() + 18464
|
||||
_subr__vstring = __native_entry__() + 15408
|
||||
_subr__vunsigned = __native_entry__() + 18816
|
||||
)
|
||||
|
||||
const (
|
||||
_stack__f32toa = 64
|
||||
_stack__f32toa = 56
|
||||
_stack__f64toa = 80
|
||||
_stack__get_by_path = 232
|
||||
_stack__get_by_path = 264
|
||||
_stack__html_escape = 64
|
||||
_stack__i64toa = 16
|
||||
_stack__lspace = 8
|
||||
|
|
@ -43,14 +45,16 @@ const (
|
|||
_stack__skip_number = 72
|
||||
_stack__skip_object = 128
|
||||
_stack__skip_one = 128
|
||||
_stack__skip_one_fast = 144
|
||||
_stack__skip_one_fast = 160
|
||||
_stack__u64toa = 8
|
||||
_stack__unquote = 72
|
||||
_stack__unquote = 128
|
||||
_stack__validate_one = 128
|
||||
_stack__value = 336
|
||||
_stack__vnumber = 248
|
||||
_stack__validate_utf8 = 48
|
||||
_stack__validate_utf8_fast = 24
|
||||
_stack__value = 368
|
||||
_stack__vnumber = 280
|
||||
_stack__vsigned = 16
|
||||
_stack__vstring = 144
|
||||
_stack__vstring = 128
|
||||
_stack__vunsigned = 24
|
||||
)
|
||||
|
||||
|
|
@ -70,6 +74,8 @@ var (
|
|||
_ = _subr__u64toa
|
||||
_ = _subr__unquote
|
||||
_ = _subr__validate_one
|
||||
_ = _subr__validate_utf8
|
||||
_ = _subr__validate_utf8_fast
|
||||
_ = _subr__value
|
||||
_ = _subr__vnumber
|
||||
_ = _subr__vsigned
|
||||
|
|
@ -93,6 +99,8 @@ const (
|
|||
_ = _stack__u64toa
|
||||
_ = _stack__unquote
|
||||
_ = _stack__validate_one
|
||||
_ = _stack__validate_utf8
|
||||
_ = _stack__validate_utf8_fast
|
||||
_ = _stack__value
|
||||
_ = _stack__vnumber
|
||||
_ = _stack__vsigned
|
||||
|
|
|
|||
|
|
@ -25,7 +25,7 @@ type ValueType int
|
|||
type ParsingError uint
|
||||
type SearchingError uint
|
||||
|
||||
// !NOT MODIFIED ONLY.
|
||||
// NOTE: do NOT modify these definitions in isolation.
|
||||
// These definitions mirror those in native/types.h.
|
||||
|
||||
const (
|
||||
|
|
@ -75,6 +75,7 @@ const (
|
|||
ERR_RECURSE_EXCEED_MAX ParsingError = 7
|
||||
ERR_FLOAT_INFINITY ParsingError = 8
|
||||
ERR_MISMATCH ParsingError = 9
|
||||
ERR_INVALID_UTF8 ParsingError = 10
|
||||
)
|
||||
|
||||
var _ParsingErrors = []string{
|
||||
|
|
@ -88,6 +89,7 @@ var _ParsingErrors = []string{
|
|||
ERR_RECURSE_EXCEED_MAX : "recursion exceeded max depth",
|
||||
ERR_FLOAT_INFINITY : "float number is infinity",
|
||||
ERR_MISMATCH : "mismatched type with value",
|
||||
ERR_INVALID_UTF8 : "invalid UTF8",
|
||||
}
|
||||
|
||||
func (self ParsingError) Error() string {
|
||||
|
|
|
|||
201
licenses/LICENSE-simdjson
Normal file
201
licenses/LICENSE-simdjson
Normal file
|
|
@ -0,0 +1,201 @@
|
|||
Apache License
|
||||
Version 2.0, January 2004
|
||||
http://www.apache.org/licenses/
|
||||
|
||||
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
||||
|
||||
1. Definitions.
|
||||
|
||||
"License" shall mean the terms and conditions for use, reproduction,
|
||||
and distribution as defined by Sections 1 through 9 of this document.
|
||||
|
||||
"Licensor" shall mean the copyright owner or entity authorized by
|
||||
the copyright owner that is granting the License.
|
||||
|
||||
"Legal Entity" shall mean the union of the acting entity and all
|
||||
other entities that control, are controlled by, or are under common
|
||||
control with that entity. For the purposes of this definition,
|
||||
"control" means (i) the power, direct or indirect, to cause the
|
||||
direction or management of such entity, whether by contract or
|
||||
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
||||
outstanding shares, or (iii) beneficial ownership of such entity.
|
||||
|
||||
"You" (or "Your") shall mean an individual or Legal Entity
|
||||
exercising permissions granted by this License.
|
||||
|
||||
"Source" form shall mean the preferred form for making modifications,
|
||||
including but not limited to software source code, documentation
|
||||
source, and configuration files.
|
||||
|
||||
"Object" form shall mean any form resulting from mechanical
|
||||
transformation or translation of a Source form, including but
|
||||
not limited to compiled object code, generated documentation,
|
||||
and conversions to other media types.
|
||||
|
||||
"Work" shall mean the work of authorship, whether in Source or
|
||||
Object form, made available under the License, as indicated by a
|
||||
copyright notice that is included in or attached to the work
|
||||
(an example is provided in the Appendix below).
|
||||
|
||||
"Derivative Works" shall mean any work, whether in Source or Object
|
||||
form, that is based on (or derived from) the Work and for which the
|
||||
editorial revisions, annotations, elaborations, or other modifications
|
||||
represent, as a whole, an original work of authorship. For the purposes
|
||||
of this License, Derivative Works shall not include works that remain
|
||||
separable from, or merely link (or bind by name) to the interfaces of,
|
||||
the Work and Derivative Works thereof.
|
||||
|
||||
"Contribution" shall mean any work of authorship, including
|
||||
the original version of the Work and any modifications or additions
|
||||
to that Work or Derivative Works thereof, that is intentionally
|
||||
submitted to Licensor for inclusion in the Work by the copyright owner
|
||||
or by an individual or Legal Entity authorized to submit on behalf of
|
||||
the copyright owner. For the purposes of this definition, "submitted"
|
||||
means any form of electronic, verbal, or written communication sent
|
||||
to the Licensor or its representatives, including but not limited to
|
||||
communication on electronic mailing lists, source code control systems,
|
||||
and issue tracking systems that are managed by, or on behalf of, the
|
||||
Licensor for the purpose of discussing and improving the Work, but
|
||||
excluding communication that is conspicuously marked or otherwise
|
||||
designated in writing by the copyright owner as "Not a Contribution."
|
||||
|
||||
"Contributor" shall mean Licensor and any individual or Legal Entity
|
||||
on behalf of whom a Contribution has been received by Licensor and
|
||||
subsequently incorporated within the Work.
|
||||
|
||||
2. Grant of Copyright License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
copyright license to reproduce, prepare Derivative Works of,
|
||||
publicly display, publicly perform, sublicense, and distribute the
|
||||
Work and such Derivative Works in Source or Object form.
|
||||
|
||||
3. Grant of Patent License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
(except as stated in this section) patent license to make, have made,
|
||||
use, offer to sell, sell, import, and otherwise transfer the Work,
|
||||
where such license applies only to those patent claims licensable
|
||||
by such Contributor that are necessarily infringed by their
|
||||
Contribution(s) alone or by combination of their Contribution(s)
|
||||
with the Work to which such Contribution(s) was submitted. If You
|
||||
institute patent litigation against any entity (including a
|
||||
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
||||
or a Contribution incorporated within the Work constitutes direct
|
||||
or contributory patent infringement, then any patent licenses
|
||||
granted to You under this License for that Work shall terminate
|
||||
as of the date such litigation is filed.
|
||||
|
||||
4. Redistribution. You may reproduce and distribute copies of the
|
||||
Work or Derivative Works thereof in any medium, with or without
|
||||
modifications, and in Source or Object form, provided that You
|
||||
meet the following conditions:
|
||||
|
||||
(a) You must give any other recipients of the Work or
|
||||
Derivative Works a copy of this License; and
|
||||
|
||||
(b) You must cause any modified files to carry prominent notices
|
||||
stating that You changed the files; and
|
||||
|
||||
(c) You must retain, in the Source form of any Derivative Works
|
||||
that You distribute, all copyright, patent, trademark, and
|
||||
attribution notices from the Source form of the Work,
|
||||
excluding those notices that do not pertain to any part of
|
||||
the Derivative Works; and
|
||||
|
||||
(d) If the Work includes a "NOTICE" text file as part of its
|
||||
distribution, then any Derivative Works that You distribute must
|
||||
include a readable copy of the attribution notices contained
|
||||
within such NOTICE file, excluding those notices that do not
|
||||
pertain to any part of the Derivative Works, in at least one
|
||||
of the following places: within a NOTICE text file distributed
|
||||
as part of the Derivative Works; within the Source form or
|
||||
documentation, if provided along with the Derivative Works; or,
|
||||
within a display generated by the Derivative Works, if and
|
||||
wherever such third-party notices normally appear. The contents
|
||||
of the NOTICE file are for informational purposes only and
|
||||
do not modify the License. You may add Your own attribution
|
||||
notices within Derivative Works that You distribute, alongside
|
||||
or as an addendum to the NOTICE text from the Work, provided
|
||||
that such additional attribution notices cannot be construed
|
||||
as modifying the License.
|
||||
|
||||
You may add Your own copyright statement to Your modifications and
|
||||
may provide additional or different license terms and conditions
|
||||
for use, reproduction, or distribution of Your modifications, or
|
||||
for any such Derivative Works as a whole, provided Your use,
|
||||
reproduction, and distribution of the Work otherwise complies with
|
||||
the conditions stated in this License.
|
||||
|
||||
5. Submission of Contributions. Unless You explicitly state otherwise,
|
||||
any Contribution intentionally submitted for inclusion in the Work
|
||||
by You to the Licensor shall be under the terms and conditions of
|
||||
this License, without any additional terms or conditions.
|
||||
Notwithstanding the above, nothing herein shall supersede or modify
|
||||
the terms of any separate license agreement you may have executed
|
||||
with Licensor regarding such Contributions.
|
||||
|
||||
6. Trademarks. This License does not grant permission to use the trade
|
||||
names, trademarks, service marks, or product names of the Licensor,
|
||||
except as required for reasonable and customary use in describing the
|
||||
origin of the Work and reproducing the content of the NOTICE file.
|
||||
|
||||
7. Disclaimer of Warranty. Unless required by applicable law or
|
||||
agreed to in writing, Licensor provides the Work (and each
|
||||
Contributor provides its Contributions) on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||
implied, including, without limitation, any warranties or conditions
|
||||
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
||||
PARTICULAR PURPOSE. You are solely responsible for determining the
|
||||
appropriateness of using or redistributing the Work and assume any
|
||||
risks associated with Your exercise of permissions under this License.
|
||||
|
||||
8. Limitation of Liability. In no event and under no legal theory,
|
||||
whether in tort (including negligence), contract, or otherwise,
|
||||
unless required by applicable law (such as deliberate and grossly
|
||||
negligent acts) or agreed to in writing, shall any Contributor be
|
||||
liable to You for damages, including any direct, indirect, special,
|
||||
incidental, or consequential damages of any character arising as a
|
||||
result of this License or out of the use or inability to use the
|
||||
Work (including but not limited to damages for loss of goodwill,
|
||||
work stoppage, computer failure or malfunction, or any and all
|
||||
other commercial damages or losses), even if such Contributor
|
||||
has been advised of the possibility of such damages.
|
||||
|
||||
9. Accepting Warranty or Additional Liability. While redistributing
|
||||
the Work or Derivative Works thereof, You may choose to offer,
|
||||
and charge a fee for, acceptance of support, warranty, indemnity,
|
||||
or other liability obligations and/or rights consistent with this
|
||||
License. However, in accepting such obligations, You may act only
|
||||
on Your own behalf and on Your sole responsibility, not on behalf
|
||||
of any other Contributor, and only if You agree to indemnify,
|
||||
defend, and hold each Contributor harmless for any liability
|
||||
incurred by, or claims asserted against, such Contributor by reason
|
||||
of your accepting any such warranty or additional liability.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
APPENDIX: How to apply the Apache License to your work.
|
||||
|
||||
To apply the Apache License to your work, attach the following
|
||||
boilerplate notice, with the fields enclosed by brackets "{}"
|
||||
replaced with your own identifying information. (Don't include
|
||||
the brackets!) The text should be enclosed in the appropriate
|
||||
comment syntax for the file format. We also recommend that a
|
||||
file or class name and description of purpose be included on the
|
||||
same "printed page" as the copyright notice for easier
|
||||
identification within third-party archives.
|
||||
|
||||
Copyright 2018-2023 The simdjson authors
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
|
|
@ -48,13 +48,13 @@ static inline unsigned ctz10_u32(const uint32_t v) {
|
|||
if (v < 1000000) return 6;
|
||||
if (v < 10000000) return 7;
|
||||
if (v < 100000000) return 8;
|
||||
return 9;
|
||||
else return 9;
|
||||
} else {
|
||||
if (v < 10) return 1;
|
||||
if (v < 100) return 2;
|
||||
if (v < 1000) return 3;
|
||||
if (v < 10000) return 4;
|
||||
return 5;
|
||||
else return 5;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -54,7 +54,7 @@ static inline unsigned ctz10(const uint64_t v) {
|
|||
if (v < 100000000000000ull) return 14;
|
||||
if (v < 1000000000000000ull) return 15;
|
||||
if (v < 10000000000000000ull) return 16;
|
||||
return 17;
|
||||
else return 17;
|
||||
}
|
||||
if (v < 10ull) return 1;
|
||||
if (v < 100ull) return 2;
|
||||
|
|
@ -65,7 +65,7 @@ static inline unsigned ctz10(const uint64_t v) {
|
|||
if (v < 10000000ull) return 7;
|
||||
if (v < 100000000ull) return 8;
|
||||
if (v < 1000000000ull) return 9;
|
||||
return 10;
|
||||
else return 10;
|
||||
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -14,6 +14,8 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
#include "native.h"
|
||||
#include "test/xprintf.h"
|
||||
#include "test/xassert.h"
|
||||
#include "fastbytes.c"
|
||||
#include "fastfloat.c"
|
||||
#include "fastint.c"
|
||||
|
|
|
|||
|
|
@ -112,8 +112,8 @@ typedef struct {
|
|||
} JsonState;
|
||||
|
||||
typedef struct {
|
||||
int sp;
|
||||
int vt[MAX_RECURSE];
|
||||
int64_t sp;
|
||||
int64_t vt[MAX_RECURSE];
|
||||
} StateMachine;
|
||||
|
||||
int f64toa(char *out, double val);
|
||||
|
|
@ -144,9 +144,10 @@ long skip_number(const GoString *src, long *p);
|
|||
bool atof_eisel_lemire64(uint64_t mant, int exp10, int sgn, double *val);
|
||||
double atof_native(const char *sp, ssize_t nb, char *dbuf, ssize_t cap);
|
||||
|
||||
ssize_t utf8_validate(const char *sp, ssize_t nb);
|
||||
long validate_string(const GoString *src, long *p);
|
||||
long validate_one(const GoString *src, long *p, StateMachine *m);
|
||||
long validate_utf8(const GoString *src, long *p, StateMachine *m);
|
||||
long validate_utf8_fast(const GoString *src);
|
||||
|
||||
long skip_one_fast(const GoString *src, long *p);
|
||||
long get_by_path(const GoString *src, long *p, const GoSlice *path);
|
||||
|
|
|
|||
|
|
@ -15,6 +15,7 @@
|
|||
*/
|
||||
|
||||
#include "native.h"
|
||||
#include "utils.h"
|
||||
#include <stdint.h>
|
||||
|
||||
/** String Quoting **/
|
||||
|
|
@ -108,27 +109,6 @@ static const quoted_t _HtmlQuoteTab[256] = {
|
|||
[0xa9] = { .n = 6, .s = "\\u2029" },
|
||||
};
|
||||
|
||||
static inline void memcpy_p8(char *dp, const char *sp, ssize_t nb) {
|
||||
if (nb >= 4) { *(uint32_t *)dp = *(const uint32_t *)sp; sp += 4, dp += 4, nb -= 4; }
|
||||
if (nb >= 2) { *(uint16_t *)dp = *(const uint16_t *)sp; sp += 2, dp += 2, nb -= 2; }
|
||||
if (nb >= 1) { *dp = *sp; }
|
||||
}
|
||||
|
||||
static inline void memcpy_p16(char *dp, const char *sp, size_t nb) {
|
||||
if (nb >= 8) { *(uint64_t *)dp = *(const uint64_t *)sp; sp += 8, dp += 8, nb -= 8; }
|
||||
if (nb >= 4) { *(uint32_t *)dp = *(const uint32_t *)sp; sp += 4, dp += 4, nb -= 4; }
|
||||
if (nb >= 2) { *(uint16_t *)dp = *(const uint16_t *)sp; sp += 2, dp += 2, nb -= 2; }
|
||||
if (nb >= 1) { *dp = *sp; }
|
||||
}
|
||||
|
||||
static inline void memcpy_p32(char *dp, const char *sp, size_t nb) {
|
||||
if (nb >= 16) { _mm_storeu_si128((void *)dp, _mm_loadu_si128((const void *)sp)); sp += 16, dp += 16, nb -= 16; }
|
||||
if (nb >= 8) { *(uint64_t *)dp = *(const uint64_t *)sp; sp += 8, dp += 8, nb -= 8; }
|
||||
if (nb >= 4) { *(uint32_t *)dp = *(const uint32_t *)sp; sp += 4, dp += 4, nb -= 4; }
|
||||
if (nb >= 2) { *(uint16_t *)dp = *(const uint16_t *)sp; sp += 2, dp += 2, nb -= 2; }
|
||||
if (nb >= 1) { *dp = *sp; }
|
||||
}
|
||||
|
||||
static inline __m128i _mm_find_quote(__m128i vv) {
|
||||
__m128i e1 = _mm_cmpgt_epi8 (vv, _mm_set1_epi8(-1));
|
||||
__m128i e2 = _mm_cmpgt_epi8 (vv, _mm_set1_epi8(31));
|
||||
|
|
|
|||
|
|
@ -16,6 +16,7 @@
|
|||
|
||||
#include "native.h"
|
||||
#include "utf8.h"
|
||||
#include "utils.h"
|
||||
|
||||
static const uint64_t ODD_MASK = 0xaaaaaaaaaaaaaaaa;
|
||||
static const uint64_t EVEN_MASK = 0x5555555555555555;
|
||||
|
|
@ -41,7 +42,7 @@ static inline uint64_t add32(uint64_t v1, uint64_t v2, uint64_t *vo) {
|
|||
}
|
||||
|
||||
static inline uint64_t add64(uint64_t v1, uint64_t v2, uint64_t *vo) {
|
||||
uint64_t v;
|
||||
unsigned long long v;
|
||||
uint64_t c = __builtin_uaddll_overflow(v1, v2, &v);
|
||||
|
||||
/* set the carry */
|
||||
|
|
@ -107,7 +108,7 @@ static inline int64_t advance_dword(const GoString *src, long *p, long dec, int6
|
|||
return ret;
|
||||
} else {
|
||||
*p -= dec;
|
||||
for (int i = 0; src->buf[*p] == (val & 0xff); i++, ++*p) { val >>= 8; }
|
||||
for (int i = 0; src->buf[*p] == (val & 0xff) && i < 4; i++, ++*p) { val >>= 8; }
|
||||
return -ERR_INVAL;
|
||||
}
|
||||
}
|
||||
|
|
@ -368,12 +369,11 @@ static inline int _mm_nonascii_mask(__m128i v) {
|
|||
|
||||
static inline ssize_t advance_string_validate(const GoString *src, long p, int64_t *ep) {
|
||||
char ch;
|
||||
uint64_t m0, m1, m2, m3;
|
||||
uint64_t m0, m1, m2;
|
||||
uint64_t es, fe, os;
|
||||
uint64_t cr = 0;
|
||||
long qp = 0;
|
||||
long np = 0;
|
||||
long up = 0;
|
||||
|
||||
/* buffer pointers */
|
||||
size_t nb = src->len;
|
||||
|
|
@ -406,7 +406,6 @@ static inline ssize_t advance_string_validate(const GoString *src, long p, int64
|
|||
uint32_t s0, s1;
|
||||
uint32_t t0, t1;
|
||||
uint32_t c0, c1;
|
||||
uint32_t u0, u1;
|
||||
#else
|
||||
/* initialize vectors */
|
||||
__m128i v0;
|
||||
|
|
@ -420,7 +419,6 @@ static inline ssize_t advance_string_validate(const GoString *src, long p, int64
|
|||
uint32_t s0, s1, s2, s3;
|
||||
uint32_t t0, t1, t2, t3;
|
||||
uint32_t c0, c1, c2, c3;
|
||||
uint32_t u0, u1, u2, u3;
|
||||
#endif
|
||||
|
||||
#define m0_mask(add) \
|
||||
|
|
@ -430,7 +428,6 @@ static inline ssize_t advance_string_validate(const GoString *src, long p, int64
|
|||
es = add(os, m1, &cr) << 1; \
|
||||
m0 &= ~(fe & (es ^ EVEN_MASK));
|
||||
|
||||
simd_advance:
|
||||
/* 64-byte SIMD loop */
|
||||
while (likely(nb >= 64)) {
|
||||
#if USE_AVX2
|
||||
|
|
@ -442,12 +439,9 @@ simd_advance:
|
|||
t1 = _mm256_get_mask(v1, cx);
|
||||
c0 = _mm256_cchars_mask(v0);
|
||||
c1 = _mm256_cchars_mask(v1);
|
||||
u0 = _mm256_nonascii_mask(v0);
|
||||
u1 = _mm256_nonascii_mask(v1);
|
||||
m0 = ((uint64_t)s1 << 32) | (uint64_t)s0;
|
||||
m1 = ((uint64_t)t1 << 32) | (uint64_t)t0;
|
||||
m2 = ((uint64_t)c1 << 32) | (uint64_t)c0;
|
||||
m3 = ((uint64_t)u1 << 32) | (uint64_t)u0;
|
||||
#else
|
||||
v0 = _mm_loadu_si128 ((const void *)(sp + 0));
|
||||
v1 = _mm_loadu_si128 ((const void *)(sp + 16));
|
||||
|
|
@ -465,14 +459,9 @@ simd_advance:
|
|||
c1 = _mm_cchars_mask(v1);
|
||||
c2 = _mm_cchars_mask(v2);
|
||||
c3 = _mm_cchars_mask(v3);
|
||||
u0 = _mm_nonascii_mask(v0);
|
||||
u1 = _mm_nonascii_mask(v1);
|
||||
u2 = _mm_nonascii_mask(v2);
|
||||
u3 = _mm_nonascii_mask(v3);
|
||||
m0 = ((uint64_t)s3 << 48) | ((uint64_t)s2 << 32) | ((uint64_t)s1 << 16) | (uint64_t)s0;
|
||||
m1 = ((uint64_t)t3 << 48) | ((uint64_t)t2 << 32) | ((uint64_t)t1 << 16) | (uint64_t)t0;
|
||||
m2 = ((uint64_t)c3 << 48) | ((uint64_t)c2 << 32) | ((uint64_t)c1 << 16) | (uint64_t)c0;
|
||||
m3 = ((uint64_t)u3 << 48) | ((uint64_t)u2 << 32) | ((uint64_t)u1 << 16) | (uint64_t)u0;
|
||||
|
||||
#endif
|
||||
|
||||
|
|
@ -488,7 +477,6 @@ simd_advance:
|
|||
|
||||
qp = m0 ? __builtin_ctzll(m0) : 64;
|
||||
np = m2 ? __builtin_ctzll(m2) : 64;
|
||||
up = m3 ? __builtin_ctzll(m3) : 64;
|
||||
|
||||
/* get the position of end quote */
|
||||
if (m0 != 0) {
|
||||
|
|
@ -498,9 +486,6 @@ simd_advance:
|
|||
|
||||
return -ERR_INVAL;
|
||||
}
|
||||
if (up < qp) {
|
||||
goto valid_utf8;
|
||||
}
|
||||
return sp - ss + qp + 1;
|
||||
}
|
||||
|
||||
|
|
@ -511,10 +496,6 @@ simd_advance:
|
|||
return -ERR_INVAL;
|
||||
}
|
||||
|
||||
if (unlikely(m3 != 0)) {
|
||||
goto valid_utf8;
|
||||
}
|
||||
|
||||
/* move to the next block */
|
||||
sp += 64;
|
||||
nb -= 64;
|
||||
|
|
@ -527,11 +508,9 @@ simd_advance:
|
|||
s0 = _mm256_get_mask (v0, cq);
|
||||
t0 = _mm256_get_mask (v0, cx);
|
||||
c0 = _mm256_cchars_mask(v0);
|
||||
u0 = _mm256_nonascii_mask(v0);
|
||||
m0 = (uint64_t)s0;
|
||||
m1 = (uint64_t)t0;
|
||||
m2 = (uint64_t)c0;
|
||||
m3 = (uint64_t)u0;
|
||||
#else
|
||||
v0 = _mm_loadu_si128 ((const void *)(sp + 0));
|
||||
v1 = _mm_loadu_si128 ((const void *)(sp + 16));
|
||||
|
|
@ -541,12 +520,9 @@ simd_advance:
|
|||
t1 = _mm_get_mask(v1, cx);
|
||||
c0 = _mm_cchars_mask(v0);
|
||||
c1 = _mm_cchars_mask(v1);
|
||||
u0 = _mm_nonascii_mask(v0);
|
||||
u1 = _mm_nonascii_mask(v1);
|
||||
m0 = ((uint64_t)s1 << 16) | (uint64_t)s0;
|
||||
m1 = ((uint64_t)t1 << 16) | (uint64_t)t0;
|
||||
m2 = ((uint64_t)c1 << 16) | (uint64_t)c0;
|
||||
m3 = ((uint64_t)u1 << 16) | (uint64_t)u0;
|
||||
#endif
|
||||
|
||||
/** update first quote position */
|
||||
|
|
@ -560,19 +536,14 @@ simd_advance:
|
|||
}
|
||||
|
||||
qp = m0 ? __builtin_ctzll(m0) : 64;
|
||||
up = m3 ? __builtin_ctzll(m3) : 64;
|
||||
np = m2 ? __builtin_ctzll(m2) : 64;
|
||||
|
||||
|
||||
/* get the position of end quote */
|
||||
if (m0 != 0) {
|
||||
if (unlikely(np < qp)) {
|
||||
ep_seterr(sp - ss + np)
|
||||
return -ERR_INVAL;
|
||||
}
|
||||
if (up < qp) {
|
||||
goto valid_utf8;
|
||||
}
|
||||
return sp - ss + qp + 1;
|
||||
}
|
||||
|
||||
|
|
@ -582,10 +553,6 @@ simd_advance:
|
|||
return -ERR_INVAL;
|
||||
}
|
||||
|
||||
if (m3 != 0) {
|
||||
goto valid_utf8;
|
||||
}
|
||||
|
||||
/* move to the next block */
|
||||
sp += 32;
|
||||
nb -= 32;
|
||||
|
|
@ -601,7 +568,6 @@ simd_advance:
|
|||
}
|
||||
}
|
||||
|
||||
remain:
|
||||
/* handle the remaining bytes with scalar code */
|
||||
while (nb > 0) {
|
||||
ch = *sp;
|
||||
|
|
@ -626,43 +592,9 @@ remain:
|
|||
return -ERR_INVAL;
|
||||
}
|
||||
|
||||
/* valid utf8 chars */
|
||||
if (ch & 0x80) {
|
||||
uint32_t ubin = nb >= 4 ? *(uint32_t*)sp : less4byte_to_uint32(sp, nb);
|
||||
if ((up = valid_utf8_4byte(ubin))) {
|
||||
sp += up, nb -= up;
|
||||
continue;
|
||||
}
|
||||
ep_seterr(sp - ss)
|
||||
return -ERR_INVAL;
|
||||
}
|
||||
|
||||
sp++, nb--;
|
||||
}
|
||||
return -ERR_EOF;
|
||||
|
||||
valid_utf8:
|
||||
sp += up, nb -= up;
|
||||
while (likely(nb >= 4)) {
|
||||
up = valid_utf8_4byte(*(uint32_t*)sp);
|
||||
if (unlikely(up == 0)) {
|
||||
ep_seterr(sp - ss)
|
||||
return -ERR_INVAL;
|
||||
}
|
||||
|
||||
/* check continuous utf-8 */
|
||||
sp += up, nb -= up;
|
||||
if (nb > 0 && (*(uint8_t*)sp & 0x80)) {
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
/* clear the last carried bit */
|
||||
cr = 0;
|
||||
goto simd_advance;
|
||||
}
|
||||
goto remain;
|
||||
|
||||
#undef ep_init
|
||||
#undef ep_setc
|
||||
#undef ep_setx
|
||||
|
|
@ -1640,24 +1572,6 @@ static always_inline long skip_number_fast(const GoString *src, long *p) {
|
|||
return vi;
|
||||
}
|
||||
|
||||
static always_inline void memcpy_p64(char * restrict dp, const char * restrict sp, size_t n) {
|
||||
long nb = n;
|
||||
#if USE_AVX2
|
||||
if (nb >= 32) { _mm256_storeu_si256((void *)dp, _mm256_loadu_si256((const void *)sp)); sp += 32, dp += 32, nb -= 32; }
|
||||
#endif
|
||||
while (nb >= 16) { _mm_storeu_si128((void *)dp, _mm_loadu_si128((const void *)sp)); sp += 16, dp += 16, nb -= 16; }
|
||||
if (nb >= 8) { *(uint64_t *)dp = *(const uint64_t *)sp; sp += 8, dp += 8, nb -= 8; }
|
||||
if (nb >= 4) { *(uint32_t *)dp = *(const uint32_t *)sp; sp += 4, dp += 4, nb -= 4; }
|
||||
if (nb >= 2) { *(uint16_t *)dp = *(const uint16_t *)sp; sp += 2, dp += 2, nb -= 2; }
|
||||
if (nb >= 1) { *dp = *sp; }
|
||||
}
|
||||
|
||||
static always_inline bool vec_cross_page(const void * p, size_t n) {
|
||||
#define PAGE_SIZE 4096
|
||||
return (((size_t)(p)) & (PAGE_SIZE - 1)) > (PAGE_SIZE - n);
|
||||
#undef PAGE_SIZE
|
||||
}
|
||||
|
||||
static always_inline long skip_container_fast(const GoString *src, long *p, char lc, char rc) {
|
||||
long nb = src->len - *p;
|
||||
const char *s = src->buf + *p;
|
||||
|
|
@ -1956,3 +1870,20 @@ err_inval:
|
|||
*p -= 1; // backward error position
|
||||
return -ERR_INVAL;
|
||||
}
|
||||
|
||||
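// validate_utf8 scans src starting at *p and records the offset of each invalid UTF-8 sequence in m; returns 0 on completion, or -1 once m already holds MAX_RECURSE positions.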
|
||||
long validate_utf8(const GoString *src, long *p, StateMachine *m) {
|
||||
xassert(*p >= 0 && src->len > *p);
|
||||
return validate_utf8_with_errors(src->buf, src->len, p, m);
|
||||
}
|
||||
|
||||
// validate_utf8_fast returns zero if valid; otherwise it returns -(pos + 1), where pos is the byte offset of the first invalid sequence.
|
||||
long validate_utf8_fast(const GoString *s) {
|
||||
#if USE_AVX2
|
||||
/* fast path for valid utf8 */
|
||||
if (validate_utf8_avx2(s) == 0) {
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
return validate_utf8_errors(s);
|
||||
}
|
||||
|
|
@ -17,6 +17,7 @@
|
|||
#ifndef XASSERT_H
|
||||
#define XASSERT_H
|
||||
|
||||
|
||||
#ifndef DEBUG
|
||||
#define xassert(expr) ((void)0)
|
||||
#else
|
||||
|
|
|
|||
|
|
@ -14,11 +14,23 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef XPRINTF_H
|
||||
#define XPRINTF_H
|
||||
#pragma once
|
||||
|
||||
#include <sys/types.h>
|
||||
|
||||
#ifdef LOG_LEVEL
|
||||
#define DEBUG
|
||||
#define LOG_TRACE(_VA_ARGS__...) do { if (LOG_LEVEL >= 0) xprintf(_VA_ARGS__ ); } while (0)
|
||||
#define LOG_DEBUG(_VA_ARGS__...) do { if (LOG_LEVEL >= 1) xprintf(_VA_ARGS__ ); } while (0)
|
||||
#define LOG_INFO(_VA_ARGS__...) do { if (LOG_LEVEL >= 2) xprintf(_VA_ARGS__ ); } while (0)
|
||||
#else
|
||||
#define LOG_TRACE(_VA_ARGS__...) ((void)0)
|
||||
#define LOG_DEBUG(_VA_ARGS__...) ((void)0)
|
||||
#define LOG_INFO(_VA_ARGS__...) ((void)0)
|
||||
#endif
|
||||
|
||||
// Note: this code is cross-compiled, so we can't use system-specific predefined macros here.
|
||||
#if USE_APPLE
|
||||
static inline void __attribute__((naked)) write_syscall(const char *s, size_t n)
|
||||
{
|
||||
asm volatile(
|
||||
|
|
@ -35,6 +47,24 @@ static inline void __attribute__((naked)) write_syscall(const char *s, size_t n)
|
|||
"retq"
|
||||
"\n");
|
||||
}
|
||||
#else
|
||||
static inline void __attribute__((naked)) write_syscall(const char *s, size_t n)
|
||||
{
|
||||
asm volatile(
|
||||
"movq %rsi, %rdx"
|
||||
"\n"
|
||||
"movq %rdi, %rsi"
|
||||
"\n"
|
||||
"movq $1, %rdi"
|
||||
"\n"
|
||||
"movq $1, %rax"
|
||||
"\n"
|
||||
"syscall"
|
||||
"\n"
|
||||
"retq"
|
||||
"\n");
|
||||
}
|
||||
#endif
|
||||
|
||||
static inline void printch(const char ch)
|
||||
{
|
||||
|
|
@ -115,7 +145,7 @@ static inline void printhex(uintptr_t v)
|
|||
printstr(p);
|
||||
}
|
||||
|
||||
#define MAX_BUF_LEN 100
|
||||
#define MAX_BUF_LEN 1000
|
||||
|
||||
static inline void printbytes(GoSlice *s)
|
||||
{
|
||||
|
|
@ -150,9 +180,8 @@ static inline void printgostr(GoString *s)
|
|||
printch('"');
|
||||
}
|
||||
|
||||
static inline void xprintf(const char *fmt, ...)
|
||||
static inline void do_xprintf(const char *fmt, ...)
|
||||
{
|
||||
#ifdef DEBUG
|
||||
__builtin_va_list va;
|
||||
char buf[256] = {};
|
||||
char *p = buf;
|
||||
|
|
@ -227,7 +256,26 @@ static inline void xprintf(const char *fmt, ...)
|
|||
*p = 0;
|
||||
printstr(buf);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
#endif // XPRINTF_H
|
||||
#ifdef DEBUG
|
||||
#define xprintf(_VA_ARGS__...) do_xprintf(_VA_ARGS__)
|
||||
#else
|
||||
#define xprintf(_VA_ARGS__...) ((void)0)
|
||||
#endif
|
||||
|
||||
static always_inline void print_longhex(const void *input, const char* s, int bytes) {
|
||||
const uint8_t* p = (const uint8_t*)(input);
|
||||
xprintf("%s : ", s);
|
||||
for (int i = 0; i < bytes; i++) {
|
||||
uintptr_t u = p[i];
|
||||
if (u < 0x10) xprintf("0");
|
||||
xprintf("%x", u);
|
||||
if ((i + 1) < bytes && (i + 1) % 4 == 0) {
|
||||
xprintf("-");
|
||||
}
|
||||
}
|
||||
xprintf("\n");
|
||||
}
|
||||
|
||||
#define psimd(simd) print_longhex((const void *)(simd), #simd, sizeof(*simd))
|
||||
|
|
@ -2,7 +2,7 @@
|
|||
#ifndef TYPES_H
|
||||
#define TYPES_H
|
||||
|
||||
// !NOT MODIFIED ONLY.
|
||||
// NOTE: !NOT MODIFIED ONLY.
|
||||
// These definitions are copied from internal/native/types/types.go.
|
||||
|
||||
#define V_EOF 1
|
||||
|
|
@ -34,6 +34,8 @@
|
|||
#define ERR_NUMBER_FMT 6
|
||||
#define ERR_RECURSE_MAX 7
|
||||
#define ERR_FLOAT_INF 8
|
||||
#define ERR_MISMATCH 9
|
||||
#define ERR_INVAL_UTF8 10
|
||||
|
||||
#define MAX_RECURSE 4096
|
||||
|
||||
|
|
|
|||
394
native/utf8.h
394
native/utf8.h
|
|
@ -1,5 +1,3 @@
|
|||
#ifndef UTF8_H
|
||||
#define UTF8_H
|
||||
/*
|
||||
* Copyright (C) 2019 Yaoyuan <ibireme@gmail.com>.
|
||||
*
|
||||
|
|
@ -9,6 +7,20 @@
|
|||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*
|
||||
* Copyright 2018-2023 The simdjson authors
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
|
|
@ -19,6 +31,13 @@
|
|||
* Modifications are Copyright 2022 ByteDance Authors.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "native.h"
|
||||
#include "utils.h"
|
||||
#include "test/xassert.h"
|
||||
#include "test/xprintf.h"
|
||||
|
||||
static inline ssize_t valid_utf8_4byte(uint32_t ubin) {
|
||||
/*
|
||||
Each unicode code point is encoded as 1 to 4 bytes in UTF-8 encoding,
|
||||
|
|
@ -104,12 +123,371 @@ static inline ssize_t valid_utf8_4byte(uint32_t ubin) {
|
|||
return 0;
|
||||
}
|
||||
|
||||
static inline uint32_t less4byte_to_uint32(const char* sp, size_t nb) {
|
||||
if (nb == 1) return *(uint8_t*)sp;
|
||||
if (nb == 2) return *(uint16_t*)sp;
|
||||
uint32_t hi_1 = (*(uint8_t*)(sp + 2));
|
||||
uint32_t lo_2 = *(uint16_t*)(sp);
|
||||
return hi_1 << 16 | lo_2;
|
||||
static always_inline long write_error(int pos, StateMachine *m, size_t msize) {
|
||||
if (m->sp >= msize) {
|
||||
return -1;
|
||||
}
|
||||
m->vt[m->sp++] = pos;
|
||||
return 0;
|
||||
}
|
||||
|
||||
// scalar code; at most MAX_RECURSE (4096) error positions can be recorded
|
||||
static always_inline long validate_utf8_with_errors(const char *src, long len, long *p, StateMachine *m) {
|
||||
const char* start = src + *p;
|
||||
const char* end = src + len;
|
||||
while (start < end - 3) {
|
||||
uint32_t u = (*(uint32_t*)(start));
|
||||
if ((unsigned)(*start) < 0x80) {
|
||||
start += 1;
|
||||
continue;
|
||||
}
|
||||
size_t n = valid_utf8_4byte(u);
|
||||
if (n != 0) { // valid utf
|
||||
start += n;
|
||||
continue;
|
||||
}
|
||||
long err = write_error(start - src, m, MAX_RECURSE);
|
||||
if (err) {
|
||||
*p = start - src;
|
||||
return err;
|
||||
}
|
||||
start += 1;
|
||||
}
|
||||
while (start < end) {
|
||||
if ((unsigned)(*start) < 0x80) {
|
||||
start += 1;
|
||||
continue;
|
||||
}
|
||||
uint32_t u = 0;
|
||||
memcpy_p4(&u, start, end - start);
|
||||
size_t n = valid_utf8_4byte(u);
|
||||
if (n != 0) { // valid utf
|
||||
start += n;
|
||||
continue;
|
||||
}
|
||||
long err = write_error(start - src, m, MAX_RECURSE);
|
||||
if (err) {
|
||||
*p = start - src;
|
||||
return err;
|
||||
}
|
||||
start += 1;
|
||||
}
|
||||
*p = start - src;
|
||||
return 0;
|
||||
}
|
||||
|
||||
// validate_utf8_errors returns zero if valid; otherwise it returns -(pos + 1), where pos is the byte offset of the first invalid sequence.
|
||||
static always_inline long validate_utf8_errors(const GoString* s) {
|
||||
const char* start = s->buf;
|
||||
const char* end = s->buf + s->len;
|
||||
while (start < end - 3) {
|
||||
uint32_t u = (*(uint32_t*)(start));
|
||||
if ((unsigned)(*start) < 0x80) {
|
||||
start += 1;
|
||||
continue;
|
||||
}
|
||||
size_t n = valid_utf8_4byte(u);
|
||||
if (n == 0) { // invalid utf
|
||||
return -(start - s->buf) - 1;
|
||||
}
|
||||
start += n;
|
||||
}
|
||||
while (start < end) {
|
||||
if ((unsigned)(*start) < 0x80) {
|
||||
start += 1;
|
||||
continue;
|
||||
}
|
||||
uint32_t u = 0;
|
||||
memcpy_p4(&u, start, end - start);
|
||||
size_t n = valid_utf8_4byte(u);
|
||||
if (n == 0) { // invalid utf
|
||||
return -(start - s->buf) - 1;
|
||||
}
|
||||
start += n;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
// SIMD implementation
|
||||
#if USE_AVX2
|
||||
|
||||
/*
 * Logical right shift of every byte lane by `shift` bits.
 *
 * AVX2 has no 8-bit shift, so the vector is shifted as 16-bit lanes and the
 * bits that leaked in from the neighbouring byte are masked off.
 */
static always_inline __m256i simd256_shr(const __m256i input, const int shift) {
    const __m256i keep = _mm256_set1_epi8(0xFFu >> shift);
    return _mm256_and_si256(_mm256_srli_epi16(input, shift), keep);
}
|
||||
|
||||
/*
 * simd256_prev(input, prev, N): the byte stream formed by `prev` followed by
 * `input`, shifted so that lane i holds the byte N positions before lane i of
 * `input`. The macro expands to a plain expression (no trailing semicolon),
 * so it can be used as an initializer or nested inside other expressions.
 */
#define simd256_prev(input, prev, N) \
    _mm256_alignr_epi8((input), _mm256_permute2x128_si256((prev), (input), 0x21), 16 - (N))
|
||||
|
||||
/*
 * Build a per-byte mask (all ones / all zeros) marking the positions that
 * must be the 2nd or 3rd continuation byte of a 3- or 4-byte sequence,
 * i.e. where the byte two back is 111_____ or the byte three back is 1111____.
 */
static always_inline __m256i must_be_2_3_continuation(const __m256i prev2, const __m256i prev3) {
    /* saturating subtract leaves a non-zero value only for 111_____ */
    const __m256i third  = _mm256_subs_epu8(prev2, _mm256_set1_epi8(0b11100000u-1));
    /* saturating subtract leaves a non-zero value only for 1111____ */
    const __m256i fourth = _mm256_subs_epu8(prev3, _mm256_set1_epi8(0b11110000u-1));
    /* The differences are small positive values, so a signed "> 0" compare turns them into a bool mask. */
    const __m256i either = _mm256_or_si256(third, fourth);
    return _mm256_cmpgt_epi8(either, _mm256_set1_epi8(0));
}
|
||||
|
||||
/*
 * Per-byte table lookup: each byte of `input` selects one of the 16 entries
 * of `table` via _mm256_shuffle_epi8. The table is replicated into both
 * 128-bit halves because the shuffle works on each half independently.
 */
static always_inline __m256i simd256_lookup16(const __m256i input, const uint8_t* table) {
    const __m256i lut = _mm256_setr_epi8(
        table[0],  table[1],  table[2],  table[3],
        table[4],  table[5],  table[6],  table[7],
        table[8],  table[9],  table[10], table[11],
        table[12], table[13], table[14], table[15],
        table[0],  table[1],  table[2],  table[3],
        table[4],  table[5],  table[6],  table[7],
        table[8],  table[9],  table[10], table[11],
        table[12], table[13], table[14], table[15]);
    return _mm256_shuffle_epi8(lut, input);
}
|
||||
|
||||
//
|
||||
// Return nonzero if there are incomplete multibyte characters at the end of the block:
|
||||
// e.g. if there is a 4-byte character, but it's 3 bytes from the end.
|
||||
//
|
||||
static always_inline __m256i is_incomplete(const __m256i input) {
|
||||
// If the previous input's last 3 bytes match this, they're too short (they ended at EOF):
|
||||
// ... 1111____ 111_____ 11______
|
||||
const uint8_t tab[32] = {
|
||||
255, 255, 255, 255, 255, 255, 255, 255,
|
||||
255, 255, 255, 255, 255, 255, 255, 255,
|
||||
255, 255, 255, 255, 255, 255, 255, 255,
|
||||
255, 255, 255, 255, 255, 0b11110000u-1, 0b11100000u-1, 0b11000000u-1};
|
||||
const __m256i max_value = _mm256_loadu_si256((const __m256i_u *)(&tab[0]));
|
||||
return _mm256_subs_epu8(input, max_value);
|
||||
}
|
||||
|
||||
/*
 * Classify every adjacent byte pair (prev1, input) with three 16-entry
 * nibble lookups and AND the results together. A bit survives only when all
 * three lookups agree that the pair matches that error pattern.
 *
 * Error bits (TOO_LARGE_1000 and OVERLONG_4 deliberately share bit 6):
 *   bit 0  TOO_SHORT       11______ 0_______  /  11______ 11______
 *   bit 1  TOO_LONG        0_______ 10______
 *   bit 2  OVERLONG_3      11100000 100_____
 *   bit 3  TOO_LARGE       11110100 1001____  /  11110100 101_____
 *                          11110101 1001____  /  11110101 101_____
 *                          1111011_ 1001____  /  1111011_ 101_____
 *                          11111___ 1001____  /  11111___ 101_____
 *   bit 4  SURROGATE       11101101 101_____
 *   bit 5  OVERLONG_2      1100000_ 10______
 *   bit 6  TOO_LARGE_1000  11110101 1000____  /  1111011_ 1000____  /  11111___ 1000____
 *   bit 6  OVERLONG_4      11110000 1000____
 *   bit 7  TWO_CONTS       10______ 10______
 */
static always_inline __m256i check_special_cases(const __m256i input, const __m256i prev1) {
    enum {
        TOO_SHORT      = 1 << 0,
        TOO_LONG       = 1 << 1,
        OVERLONG_3     = 1 << 2,
        TOO_LARGE      = 1 << 3,
        SURROGATE      = 1 << 4,
        OVERLONG_2     = 1 << 5,
        TOO_LARGE_1000 = 1 << 6,
        OVERLONG_4     = 1 << 6,
        TWO_CONTS      = 1 << 7,
        /* these patterns put no constraint on the low nibble of byte 1 */
        CARRY          = TOO_SHORT | TOO_LONG | TWO_CONTS
    };

    /* Lookup 1: high nibble of the previous byte. */
    static const uint8_t tab1[16] = {
        /* 0_______ ________  ASCII in byte 1 */
        TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
        TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
        /* 10______ ________  continuation in byte 1 */
        TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
        /* 1100____ ________  two-byte lead in byte 1 */
        TOO_SHORT | OVERLONG_2,
        /* 1101____ ________  two-byte lead in byte 1 */
        TOO_SHORT,
        /* 1110____ ________  three-byte lead in byte 1 */
        TOO_SHORT | OVERLONG_3 | SURROGATE,
        /* 1111____ ________  four-or-more-byte lead in byte 1 */
        TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4,
    };
    const __m256i byte_1_high = simd256_lookup16(simd256_shr(prev1, 4), tab1);

    /* Lookup 2: low nibble of the previous byte. */
    static const uint8_t tab2[16] = {
        /* ____0000 ________ */
        CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
        /* ____0001 ________ */
        CARRY | OVERLONG_2,
        /* ____001_ ________ */
        CARRY,
        CARRY,
        /* ____0100 ________ */
        CARRY | TOO_LARGE,
        /* ____0101 ________ */
        CARRY | TOO_LARGE | TOO_LARGE_1000,
        /* ____011_ ________ */
        CARRY | TOO_LARGE | TOO_LARGE_1000,
        CARRY | TOO_LARGE | TOO_LARGE_1000,
        /* ____1___ ________ */
        CARRY | TOO_LARGE | TOO_LARGE_1000,
        CARRY | TOO_LARGE | TOO_LARGE_1000,
        CARRY | TOO_LARGE | TOO_LARGE_1000,
        CARRY | TOO_LARGE | TOO_LARGE_1000,
        CARRY | TOO_LARGE | TOO_LARGE_1000,
        /* ____1101 ________ */
        CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
        CARRY | TOO_LARGE | TOO_LARGE_1000,
        CARRY | TOO_LARGE | TOO_LARGE_1000
    };
    const __m256i byte_1_low = simd256_lookup16(_mm256_and_si256(prev1, _mm256_set1_epi8(0x0F)), tab2);

    /* Lookup 3: high nibble of the current byte. */
    static const uint8_t tab3[16] = {
        /* ________ 0_______  ASCII in byte 2 */
        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
        /* ________ 1000____ */
        TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
        /* ________ 1001____ */
        TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
        /* ________ 101_____ */
        TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
        TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
        /* ________ 11______ */
        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT
    };
    const __m256i byte_2_high = simd256_lookup16(simd256_shr(input, 4), tab3);

    const __m256i byte_1 = _mm256_and_si256(byte_1_high, byte_1_low);
    return _mm256_and_si256(byte_1, byte_2_high);
}
|
||||
|
||||
/*
 * Combine the pair classification `sc` with the length requirement of 3- and
 * 4-byte sequences.
 *
 * Positions that must hold a 2nd/3rd continuation byte get bit 7 set; XOR-ing
 * that with `sc` cancels the TWO_CONTS flag (also bit 7) where two
 * continuations in a row are required, and raises it where a required
 * continuation is missing. A non-zero lane in the result is an error.
 */
static always_inline __m256i check_multibyte_lengths(const __m256i input, const __m256i prev_input, const __m256i sc) {
    const __m256i back2 = simd256_prev(input, prev_input, 2);
    const __m256i back3 = simd256_prev(input, prev_input, 3);
    const __m256i required = must_be_2_3_continuation(back2, back3);
    const __m256i required_bit7 = _mm256_and_si256(required, _mm256_set1_epi8(0x80));
    return _mm256_xor_si256(required_bit7, sc);
}
|
||||
|
||||
|
||||
// Check whether the current bytes are valid UTF-8.
|
||||
static always_inline __m256i check_utf8_bytes(const __m256i input, const __m256i prev_input) {
|
||||
// Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
|
||||
// (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
|
||||
__m256i prev1 = simd256_prev(input, prev_input, 1);
|
||||
__m256i sc = check_special_cases(input, prev1);
|
||||
__m256i ret = check_multibyte_lengths(input, prev_input, sc);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* True when no byte of `input` has its top bit set, i.e. all 32 bytes are ASCII. */
static always_inline bool is_ascii(const __m256i input) {
    const int high_bits = _mm256_movemask_epi8(input);
    return high_bits == 0;
}
|
||||
|
||||
/* Running state of the vectorised UTF-8 validator. */
typedef struct {
    /* Accumulated error flags; any non-zero lane means invalid UTF-8 was seen. */
    __m256i error;
    /* The most recent 32-byte block handed to the full (non-ASCII) check. */
    __m256i prev_input_block;
    /* Non-zero when that block ended inside a multibyte character (consulted by the ASCII fast path). */
    __m256i prev_incomplete;
} utf8_checker;
|
||||
|
||||
/* Reset a checker to its initial state: no error, no previous block, nothing pending. */
static always_inline void utf8_checker_init(utf8_checker* checker) {
    const __m256i zero = _mm256_setzero_si256();
    checker->error            = zero;
    checker->prev_input_block = zero;
    checker->prev_incomplete  = zero;
}
|
||||
|
||||
/* True when any bit of the accumulated error vector is set. */
static always_inline bool check_error(utf8_checker* checker) {
    const int all_zero = _mm256_testz_si256(checker->error, checker->error);
    return !all_zero;
}
|
||||
|
||||
/*
 * Run the full UTF-8 check over the 64 bytes at `start` (two 32-byte blocks),
 * fold the findings into checker->error, and remember the second block so
 * that the next call can look back across the boundary.
 */
static always_inline void check64_utf(utf8_checker* checker, const uint8_t* start) {
    const __m256i lo = _mm256_loadu_si256((__m256i*)start);
    const __m256i hi = _mm256_loadu_si256((__m256i*)(start + 32));

    const __m256i err_lo = check_utf8_bytes(lo, checker->prev_input_block);
    const __m256i err_hi = check_utf8_bytes(hi, lo);
    const __m256i found  = _mm256_or_si256(err_lo, err_hi);

    checker->error            = _mm256_or_si256(checker->error, found);
    checker->prev_input_block = hi;
    checker->prev_incomplete  = is_incomplete(hi);
}
|
||||
|
||||
/*
 * Validate 64 bytes at `start`, with a fast path for pure ASCII.
 *
 * On the fast path the only possible error is a multibyte character left
 * unfinished by the previous block, so prev_incomplete is merged into error.
 */
static always_inline void check64(utf8_checker* checker, const uint8_t* start) {
    const __m256i lo = _mm256_loadu_si256((__m256i*)start);
    const __m256i hi = _mm256_loadu_si256((__m256i*)(start + 32));

    if (likely(is_ascii(_mm256_or_si256(lo, hi)))) {
        checker->error = _mm256_or_si256(checker->error, checker->prev_incomplete);
    } else {
        check64_utf(checker, start);
    }
}
|
||||
|
||||
/*
 * Validate 128 bytes at `start`, treated as two 64-byte halves, each with an
 * ASCII fast path. Whenever a half is pure ASCII, prev_incomplete is merged
 * into error, because an ASCII byte cannot complete a pending multibyte
 * character.
 */
static always_inline void check128(utf8_checker* checker, const uint8_t* start) {
    const __m256i b0 = _mm256_loadu_si256((__m256i*)start);
    const __m256i b1 = _mm256_loadu_si256((__m256i*)(start + 32));
    const __m256i b2 = _mm256_loadu_si256((__m256i*)(start + 64));
    const __m256i b3 = _mm256_loadu_si256((__m256i*)(start + 96));

    const __m256i first_half  = _mm256_or_si256(b0, b1);
    const __m256i second_half = _mm256_or_si256(b2, b3);
    const __m256i whole       = _mm256_or_si256(first_half, second_half);

    /* all 128 bytes are ASCII */
    if (likely(is_ascii(whole))) {
        checker->error = _mm256_or_si256(checker->error, checker->prev_incomplete);
        return;
    }

    /* first 64 bytes are ASCII, so the non-ASCII data is in the second 64 */
    if (likely(is_ascii(first_half))) {
        checker->error = _mm256_or_si256(checker->error, checker->prev_incomplete);
        check64_utf(checker, start + 64);
        return;
    }

    /* first 64 bytes contain non-ASCII data; the second 64 may or may not */
    check64_utf(checker, start);
    if (unlikely(is_ascii(second_half))) {
        checker->error = _mm256_or_si256(checker->error, checker->prev_incomplete);
    } else {
        check64_utf(checker, start + 64);
    }
}
|
||||
|
||||
/* At end of input, a multibyte character left unfinished by the last block is an error. */
static always_inline void check_eof(utf8_checker* checker) {
    const __m256i pending = checker->prev_incomplete;
    checker->error = _mm256_or_si256(checker->error, pending);
}
|
||||
|
||||
/*
 * Validate the final bytes in [start, end) and apply the end-of-input check.
 *
 * The bytes are copied into a zero-filled 64-byte scratch block so that the
 * 64-byte checker can run on them; the zero padding is ASCII and therefore
 * neutral. The range must not be longer than 64 bytes, which the caller in
 * validate_utf8_avx2 guarantees.
 */
static always_inline void check_remain(utf8_checker* checker, const uint8_t* start, const uint8_t* end) {
    uint8_t padded[64] = {0};
    uint8_t *out = padded;
    for (const uint8_t *in = start; in < end; in++) {
        *out++ = *in;
    }
    check64(checker, padded);
    check_eof(checker);
}
|
||||
|
||||
static always_inline long validate_utf8_avx2(const GoString* s) {
|
||||
xassert(s->buf != NULL || s->len != 0);
|
||||
const uint8_t* start = (const uint8_t*)(s->buf);
|
||||
const uint8_t* end = (const uint8_t*)(s->buf + s->len);
|
||||
/* check eof */
|
||||
if (s->len == 0) {
|
||||
return 0;
|
||||
}
|
||||
utf8_checker checker;
|
||||
utf8_checker_init(&checker);
|
||||
while (start < (end - 128)) {
|
||||
check128(&checker, start);
|
||||
if (check_error(&checker)) {
|
||||
}
|
||||
start += 128;
|
||||
};
|
||||
while (start < end - 64) {
|
||||
check64(&checker, start);
|
||||
start += 64;
|
||||
}
|
||||
check_remain(&checker, start, end);
|
||||
return check_error(&checker) ? -1 : 0;
|
||||
}
|
||||
#endif
|
||||
78
native/utils.h
Normal file
78
native/utils.h
Normal file
|
|
@ -0,0 +1,78 @@
|
|||
/*
|
||||
* Copyright 2022 ByteDance Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <immintrin.h>
|
||||
#include <string.h>
|
||||
#include "native.h"
|
||||
|
||||
/*
 * Report whether an n-byte access starting at `p` would run past the end of
 * the 4096-byte page that contains `p`.
 */
static always_inline bool vec_cross_page(const void * p, size_t n) {
#define PAGE_SIZE 4096
    const size_t in_page = ((size_t)(p)) & (PAGE_SIZE - 1);
    const size_t limit   = (PAGE_SIZE - n);
    return in_page > limit;
#undef PAGE_SIZE
}
|
||||
|
||||
/* Copy exactly 4 bytes from sp to dp with a single 32-bit load and store. */
static always_inline void memcpy4 (void *__restrict dp, const void *__restrict sp) {
    *(uint32_t *)dp = *(const uint32_t *)sp;
}
|
||||
|
||||
/* Copy exactly 8 bytes from sp to dp with a single 64-bit load and store. */
static always_inline void memcpy8 (void *__restrict dp, const void *__restrict sp) {
    *(uint64_t *)dp = *(const uint64_t *)sp;
}
|
||||
|
||||
/* Copy exactly 16 bytes from sp to dp with one unaligned 128-bit load and store. */
static always_inline void memcpy16 (void *__restrict dp, const void *__restrict sp) {
    const __m128i chunk = _mm_loadu_si128((const void *)(sp));
    _mm_storeu_si128((void *)(dp), chunk);
}
|
||||
|
||||
/*
 * Copy exactly 32 bytes from sp to dp: one unaligned 256-bit move when AVX2
 * is enabled, otherwise two unaligned 128-bit moves.
 */
static always_inline void memcpy32(void *__restrict dp, const void *__restrict sp) {
#if USE_AVX2
    const __m256i chunk = _mm256_loadu_si256((const void *)sp);
    _mm256_storeu_si256((void *)dp, chunk);
#else
    const char *src = (const char *)sp;
    char       *dst = (char *)dp;
    const __m128i lo = _mm_loadu_si128((const void *)src);
    _mm_storeu_si128((void *)dst, lo);
    const __m128i hi = _mm_loadu_si128((const void *)(src + 16));
    _mm_storeu_si128((void *)(dst + 16), hi);
#endif
}
|
||||
|
||||
/* Copy exactly 64 bytes from sp to dp as two 32-byte halves. */
static always_inline void memcpy64(void *__restrict dp, const void *__restrict sp) {
    const char *src = (const char *)sp;
    char       *dst = (char *)dp;
    memcpy32(dst, src);
    memcpy32(dst + 32, src + 32);
}
|
||||
|
||||
/*
 * Partial copy for short tails: copies 2 bytes when nb >= 2, then 1 more
 * byte when at least one is left. At most 3 bytes are ever copied, so this
 * is a complete copy only for nb in 0..3.
 */
static always_inline void memcpy_p4(void *__restrict dp, const void *__restrict sp, size_t nb) {
    const uint8_t *src = (const uint8_t *)sp;
    uint8_t       *dst = (uint8_t *)dp;
    if (nb >= 2) {
        *(uint16_t *)dst = *(const uint16_t *)src;
        src += 2;
        dst += 2;
        nb  -= 2;
    }
    if (nb >= 1) {
        *dst = *src;
    }
}
|
||||
|
||||
/*
 * Partial copy: moves 4 bytes when nb >= 4, then hands the rest to
 * memcpy_p4. A complete copy only for nb in 0..7.
 */
static always_inline void memcpy_p8(void *__restrict dp, const void *__restrict sp, ssize_t nb) {
    const char *src = (const char *)sp;
    char       *dst = (char *)dp;
    if (nb >= 4) {
        memcpy4(dst, src);
        src += 4;
        dst += 4;
        nb  -= 4;
    }
    memcpy_p4(dst, src, nb);
}
|
||||
|
||||
/*
 * Partial copy: moves 8 bytes when nb >= 8, then hands the rest to
 * memcpy_p8. A complete copy only for nb in 0..15.
 */
static always_inline void memcpy_p16(void *__restrict dp, const void *__restrict sp, size_t nb) {
    const char *src = (const char *)sp;
    char       *dst = (char *)dp;
    if (nb >= 8) {
        memcpy8(dst, src);
        src += 8;
        dst += 8;
        nb  -= 8;
    }
    memcpy_p8(dst, src, nb);
}
|
||||
|
||||
/*
 * Partial copy: moves 16 bytes when nb >= 16, then hands the rest to
 * memcpy_p16. A complete copy only for nb in 0..31.
 */
static always_inline void memcpy_p32(void *__restrict dp, const void *__restrict sp, size_t nb) {
    const char *src = (const char *)sp;
    char       *dst = (char *)dp;
    if (nb >= 16) {
        memcpy16(dst, src);
        src += 16;
        dst += 16;
        nb  -= 16;
    }
    memcpy_p16(dst, src, nb);
}
|
||||
|
||||
/*
 * Partial copy: moves 32 bytes when nb >= 32, then hands the rest to
 * memcpy_p32. A complete copy only for nb in 0..63.
 */
static always_inline void memcpy_p64(void *__restrict dp, const void *__restrict sp, size_t nb) {
    const char *src = (const char *)sp;
    char       *dst = (char *)dp;
    if (nb >= 32) {
        memcpy32(dst, src);
        src += 32;
        dst += 32;
        nb  -= 32;
    }
    memcpy_p32(dst, src, nb);
}
|
||||
29
sonic.go
29
sonic.go
|
|
@ -26,31 +26,9 @@ import (
|
|||
`github.com/bytedance/sonic/decoder`
|
||||
`github.com/bytedance/sonic/encoder`
|
||||
`github.com/bytedance/sonic/option`
|
||||
`github.com/bytedance/sonic/internal/native/types`
|
||||
`github.com/bytedance/sonic/internal/rt`
|
||||
)
|
||||
|
||||
func checkTrailings(buf string, pos int) error {
|
||||
/* skip all the trailing spaces */
|
||||
if pos != len(buf) {
|
||||
for pos < len(buf) && (types.SPACE_MASK & (1 << buf[pos])) != 0 {
|
||||
pos++
|
||||
}
|
||||
}
|
||||
|
||||
/* then it must be at EOF */
|
||||
if pos == len(buf) {
|
||||
return nil
|
||||
}
|
||||
|
||||
/* junk after JSON value */
|
||||
return decoder.SyntaxError {
|
||||
Src : buf,
|
||||
Pos : pos,
|
||||
Code : types.ERR_INVALID_CHAR,
|
||||
}
|
||||
}
|
||||
|
||||
type frozenConfig struct {
|
||||
Config
|
||||
encoderOpts encoder.Options
|
||||
|
|
@ -77,6 +55,9 @@ func (cfg Config) Froze() API {
|
|||
if cfg.NoNullSliceOrMap {
|
||||
api.encoderOpts |= encoder.NoNullSliceOrMap
|
||||
}
|
||||
if cfg.ValidateString {
|
||||
api.encoderOpts |= encoder.ValidateString
|
||||
}
|
||||
|
||||
// configure decoder options:
|
||||
if cfg.UseInt64 {
|
||||
|
|
@ -118,13 +99,13 @@ func (cfg frozenConfig) UnmarshalFromString(buf string, val interface{}) error {
|
|||
dec := decoder.NewDecoder(buf)
|
||||
dec.SetOptions(cfg.decoderOpts)
|
||||
err := dec.Decode(val)
|
||||
pos := dec.Pos()
|
||||
|
||||
/* check for errors */
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return checkTrailings(buf, pos)
|
||||
|
||||
return dec.CheckTrailings()
|
||||
}
|
||||
|
||||
// Unmarshal is implemented by sonic
|
||||
|
|
|
|||
71
utf8/utf8.go
Normal file
71
utf8/utf8.go
Normal file
|
|
@ -0,0 +1,71 @@
|
|||
/*
|
||||
* Copyright 2022 ByteDance Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package utf8
|
||||
|
||||
import (
|
||||
`github.com/bytedance/sonic/internal/rt`
|
||||
`github.com/bytedance/sonic/internal/native/types`
|
||||
`github.com/bytedance/sonic/internal/native`
|
||||
)
|
||||
|
||||
// CorrectWith corrects the invalid utf8 byte with repl string.
|
||||
func CorrectWith(dst []byte, src []byte, repl string) []byte {
|
||||
sstr := rt.Mem2Str(src)
|
||||
sidx := 0
|
||||
|
||||
/* state machine records the invalid postions */
|
||||
m := types.NewStateMachine()
|
||||
m.Sp = 0 // invalid utf8 numbers
|
||||
|
||||
for sidx < len(sstr) {
|
||||
scur := sidx
|
||||
ecode := native.ValidateUTF8(&sstr, &sidx, m)
|
||||
|
||||
if m.Sp != 0 {
|
||||
if m.Sp > len(sstr) {
|
||||
panic("numbers of invalid utf8 exceed the string len!")
|
||||
}
|
||||
}
|
||||
|
||||
for i := 0; i < m.Sp; i++ {
|
||||
ipos := m.Vt[i] // invalid utf8 position
|
||||
dst = append(dst, sstr[scur:ipos]...)
|
||||
dst = append(dst, repl...)
|
||||
scur = m.Vt[i] + 1
|
||||
}
|
||||
/* append the remained valid utf8 bytes */
|
||||
dst = append(dst, sstr[scur:sidx]...)
|
||||
|
||||
/* not enough space, reset and continue */
|
||||
if ecode != 0 {
|
||||
m.Sp = 0
|
||||
}
|
||||
}
|
||||
|
||||
types.FreeStateMachine(m)
|
||||
return dst
|
||||
}
|
||||
|
||||
// Validate is a simd-accelereated drop-in replacement for the standard library's utf8.Valid.
|
||||
func Validate(src []byte) bool {
|
||||
return ValidateString(rt.Mem2Str(src))
|
||||
}
|
||||
|
||||
// ValidateString as Validate, but for string.
|
||||
func ValidateString(src string) bool {
|
||||
return native.ValidateUTF8Fast(&src) == 0
|
||||
}
|
||||
138
utf8/utf8_test.go
Normal file
138
utf8/utf8_test.go
Normal file
|
|
@ -0,0 +1,138 @@
|
|||
/*
|
||||
* Copyright 2022 ByteDance Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package utf8
|
||||
|
||||
import (
|
||||
`testing`
|
||||
`strings`
|
||||
`github.com/stretchr/testify/assert`
|
||||
`unicode/utf8`
|
||||
`bytes`
|
||||
`math/rand`
|
||||
)
|
||||
|
||||
// Byte sequences that are invalid UTF-8 on their own, used as test inputs.
//
// NOTE(review): U+D800 lies in the Unicode high-surrogate range and U+DC00 in
// the low-surrogate range, so the two surrogate names look swapped; the tests
// only rely on both encodings being invalid.
var (
	_Header_2Bytes  = "\xC0"         // lead byte of a 2-byte sequence
	_Header_3Bytes  = "\xE0"         // lead byte of a 3-byte sequence
	_Header_4Bytes  = "\xF0"         // lead byte of a 4-byte sequence
	_Low_Surrogate  = "\xED\xA0\x80" // \ud800
	_High_Surrogate = "\xED\xB0\x80" // \udc00
	_Cont           = "\xb0"         // lone continuation byte
)
|
||||
|
||||
func TestCorrectWith_InvalidUtf8(t *testing.T) {
|
||||
var tests = []struct {
|
||||
name string
|
||||
input string
|
||||
expect string
|
||||
errpos int
|
||||
} {
|
||||
{"basic", `abc`, "abc", -1},
|
||||
{"long", strings.Repeat("helloα,景😊", 1000), strings.Repeat("helloα,景😊", 1000), -1},
|
||||
|
||||
// invalid utf8 - single byte
|
||||
{"single_Cont", _Cont, "\ufffd", 0},
|
||||
{"single_Header_2Bytes", _Header_2Bytes, "\ufffd", 0},
|
||||
{"single_Header_3Bytes", _Header_3Bytes, "\ufffd", 0},
|
||||
{"single_Header_4Bytes", _Header_4Bytes, "\ufffd", 0},
|
||||
|
||||
// invalid utf8 - two bytes
|
||||
{"two_Header_2Bytes + _Cont", _Header_2Bytes + _Cont, "\ufffd\ufffd", 0},
|
||||
{`two_Header_4Bytes + _Cont+ "xx"`, _Header_4Bytes + _Cont + "xx", "\ufffd\ufffdxx", 0},
|
||||
{ `"xx" + three_Header_4Bytes + _Cont + _Cont`, "xx" + _Header_4Bytes + _Cont + _Cont, "xx\ufffd\ufffd\ufffd", 2},
|
||||
|
||||
// invalid utf8 - three bytes
|
||||
{`three_Low_Surrogate`, _Low_Surrogate, "\ufffd\ufffd\ufffd", 0},
|
||||
{`three__High_Surrogate`, _High_Surrogate, "\ufffd\ufffd\ufffd", 0},
|
||||
|
||||
// invalid utf8 - multi bytes
|
||||
{`_High_Surrogate + _Low_Surrogate`, _High_Surrogate + _Low_Surrogate, "\ufffd\ufffd\ufffd\ufffd\ufffd\ufffd", 0},
|
||||
{`"\x80\x80\x80\x80"`, "\x80\x80\x80\x80", "\ufffd\ufffd\ufffd\ufffd", 0},
|
||||
}
|
||||
for _, test := range tests {
|
||||
got := CorrectWith(nil, []byte(test.input), "\ufffd")
|
||||
assert.Equal(t, []byte(test.expect), got, test.name)
|
||||
assert.Equal(t,test.errpos == -1, utf8.ValidString(test.input), test.name)
|
||||
}
|
||||
}
|
||||
|
||||
// genRandBytes returns length pseudo-random bytes, each drawn from the full
// 0x00..0xFF range using the global math/rand source.
func genRandBytes(length int) []byte {
	var out []byte
	for left := length; left > 0; left-- {
		out = append(out, byte(rand.Intn(0xFF + 1)))
	}
	return out
}
|
||||
|
||||
// genRandAscii returns length pseudo-random ASCII bytes (0x00..0x7F) drawn
// from the global math/rand source.
func genRandAscii(length int) []byte {
	var out []byte
	for left := length; left > 0; left-- {
		out = append(out, byte(rand.Intn(0x7F + 1)))
	}
	return out
}
|
||||
|
||||
// genRandRune returns the UTF-8 encoding of length pseudo-random code points
// drawn from 0..0x10FFFF using the global math/rand source. Code points that
// cannot be encoded (surrogates) come out as U+FFFD, so the result is always
// valid UTF-8.
func genRandRune(length int) []byte {
	var (
		out     []byte
		scratch [utf8.UTFMax]byte
	)
	for left := length; left > 0; left-- {
		r := rune(rand.Intn(0x10FFFF + 1))
		width := utf8.EncodeRune(scratch[:], r)
		out = append(out, scratch[:width]...)
	}
	return out
}
|
||||
|
||||
func TestValidate_Random(t *testing.T) {
|
||||
// compare with stdlib
|
||||
compare := func(t *testing.T, data []byte) {
|
||||
assert.Equal(t, utf8.Valid(data), Validate(data), string(data))
|
||||
}
|
||||
|
||||
// random testing
|
||||
nums := 1000
|
||||
maxLen := 1000
|
||||
for i := 0; i < nums; i++ {
|
||||
length := rand.Intn(maxLen)
|
||||
compare(t, genRandBytes(length))
|
||||
compare(t, genRandRune(length))
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkValidate(b *testing.B) {
|
||||
bench := []struct {
|
||||
name string
|
||||
data []byte
|
||||
} {
|
||||
{"ValidAscii", genRandAscii(1000)},
|
||||
{"ValidUTF8", genRandRune(1000)},
|
||||
{"RandomBytes", genRandBytes(1000)},
|
||||
}
|
||||
|
||||
for _, test := range bench {
|
||||
if utf8.Valid(test.data) != Validate(test.data) {
|
||||
b.Fatalf("sonic utf8 validate wrong for %s string: %v", test.name, test.data)
|
||||
}
|
||||
b.Run("Sonic_" + test.name, func(b *testing.B) {
|
||||
for i := 0; i < b.N; i++ {
|
||||
Validate(test.data)
|
||||
}
|
||||
})
|
||||
b.Run("StdLib_" + test.name, func(b *testing.B) {
|
||||
for i := 0; i < b.N; i++ {
|
||||
utf8.Valid(test.data)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
Loading…
Reference in a new issue