From 67cffb15bd786099af68ac82e545f12f2eedfb16 Mon Sep 17 00:00:00 2001 From: Yi Duan Date: Tue, 3 Jan 2023 19:47:55 +0800 Subject: [PATCH] feat:(ast) add fallback api on `not-amd64` env (#341) * feat:(ast) add fallback api on `not-amd64` env * test: add native `linux-arm64` CI * opt: just skip number chars when `decodeFloat64` * fmt * fix: check EOF --- .github/workflows/benchmark-linux-arm64.yml | 27 ++ ...inux-amd64.yml => benchmark-linux-x64.yml} | 10 +- .github/workflows/push-check-go118.yml | 2 +- .github/workflows/push-check-linux-arm64.yml | 31 ++ ...nux-amd64.yml => push-check-linux-x64.yml} | 2 +- ...push-check-arm.yml => push-check-qemu.yml} | 6 +- .github/workflows/push-check-windows.yml | 2 +- ast/api_amd64.go | 91 ++++ ast/api_amd64_test.go | 34 ++ ast/api_compat.go | 62 +++ ast/decode.go | 430 ++++++++++++++++++ ast/encode.go | 130 +++--- ast/encode_test.go | 17 +- ast/error.go | 98 ++++ ast/node.go | 13 +- ast/node_test.go | 6 +- ast/parser.go | 42 +- ast/search_test.go | 3 +- ast/stubs.go | 42 +- ast/utils.go | 56 --- bench-arm.sh | 14 + internal/native/dispatch_amd64.go | 5 + internal/native/dispatch_amd64.s | 9 + internal/rt/{asm.s => asm_amd64.s} | 0 internal/rt/asm_arm64.s | 10 + internal/rt/fastmem.go | 23 + 26 files changed, 975 insertions(+), 190 deletions(-) create mode 100644 .github/workflows/benchmark-linux-arm64.yml rename .github/workflows/{benchmark-linux-amd64.yml => benchmark-linux-x64.yml} (69%) create mode 100644 .github/workflows/push-check-linux-arm64.yml rename .github/workflows/{push-check-linux-amd64.yml => push-check-linux-x64.yml} (96%) rename .github/workflows/{push-check-arm.yml => push-check-qemu.yml} (90%) create mode 100644 ast/api_amd64.go create mode 100644 ast/api_amd64_test.go create mode 100644 ast/api_compat.go create mode 100644 ast/decode.go create mode 100644 ast/error.go delete mode 100644 ast/utils.go create mode 100644 bench-arm.sh rename internal/rt/{asm.s => asm_amd64.s} (100%) create mode 100644 
internal/rt/asm_arm64.s diff --git a/.github/workflows/benchmark-linux-arm64.yml b/.github/workflows/benchmark-linux-arm64.yml new file mode 100644 index 0000000..0f420b3 --- /dev/null +++ b/.github/workflows/benchmark-linux-arm64.yml @@ -0,0 +1,27 @@ +name: Benchmark Linux-ARM + +on: pull_request + +jobs: + build: + runs-on: [arm] + steps: + - uses: actions/checkout@v2 + + - name: Check Branch + run: ./check_branch_name.sh ${{ github.head_ref }} + + - name: Set up Go + uses: actions/setup-go@v2 + with: + go-version: 1.17.1 + + - uses: actions/cache@v2 + with: + path: ~/go/pkg/mod + key: ${{ runner.os }}-go-${{ hashFiles('**/go.sum') }} + restore-keys: | + ${{ runner.os }}-go- + + - name: Benchmark sonic + run: sh bench-arm.sh \ No newline at end of file diff --git a/.github/workflows/benchmark-linux-amd64.yml b/.github/workflows/benchmark-linux-x64.yml similarity index 69% rename from .github/workflows/benchmark-linux-amd64.yml rename to .github/workflows/benchmark-linux-x64.yml index b6931dd..48dadaf 100644 --- a/.github/workflows/benchmark-linux-amd64.yml +++ b/.github/workflows/benchmark-linux-x64.yml @@ -1,4 +1,4 @@ -name: Pull Request Benchmark +name: Benchmark Linux-X64 on: pull_request @@ -24,10 +24,4 @@ jobs: ${{ runner.os }}-go- - name: Benchmark sonic - run: sh bench.sh - - # - name: Benchmark third-party - # run: go test -benchmem -run=^$ -bench . 
-v ./generic_test - - # - name: Diff - # run: ./bench.py -b '"^Benchmark.*Sonic"' -c + run: sh bench.sh \ No newline at end of file diff --git a/.github/workflows/push-check-go118.yml b/.github/workflows/push-check-go118.yml index 69044a5..e83d077 100644 --- a/.github/workflows/push-check-go118.yml +++ b/.github/workflows/push-check-go118.yml @@ -1,4 +1,4 @@ -name: Push Check Go1.18 +name: Push Check Go1.18-Linux-X64 on: push diff --git a/.github/workflows/push-check-linux-arm64.yml b/.github/workflows/push-check-linux-arm64.yml new file mode 100644 index 0000000..dcb43fe --- /dev/null +++ b/.github/workflows/push-check-linux-arm64.yml @@ -0,0 +1,31 @@ +name: Push Check Linux-ARM + +on: push + +jobs: + build: + strategy: + matrix: + go-version: [1.15.x, 1.19.x] + os: [arm] + runs-on: ${{ matrix.os }} + steps: + - uses: actions/checkout@v2 + + - name: Set up Go + uses: actions/setup-go@v2 + with: + go-version: ${{ matrix.go-version }} + + - uses: actions/cache@v2 + with: + path: ~/go/pkg/mod + key: ${{ runner.os }}-go-${{ hashFiles('**/go.sum') }} + restore-keys: | + ${{ runner.os }}-go- + + - name: Compatibility Test - main + run: GOMAXPROCS=4 go test -v -gcflags=-d=checkptr=0 -race github.com/bytedance/sonic + + - name: Compatibility Test - ast + run: GOMAXPROCS=4 go test -v -gcflags=-d=checkptr=0 -race github.com/bytedance/sonic/ast diff --git a/.github/workflows/push-check-linux-amd64.yml b/.github/workflows/push-check-linux-x64.yml similarity index 96% rename from .github/workflows/push-check-linux-amd64.yml rename to .github/workflows/push-check-linux-x64.yml index 583e7a4..66c317c 100644 --- a/.github/workflows/push-check-linux-amd64.yml +++ b/.github/workflows/push-check-linux-x64.yml @@ -1,4 +1,4 @@ -name: Push Check All +name: Push Check Linux-X64 on: push diff --git a/.github/workflows/push-check-arm.yml b/.github/workflows/push-check-qemu.yml similarity index 90% rename from .github/workflows/push-check-arm.yml rename to 
.github/workflows/push-check-qemu.yml index 35d7a5a..dfe58d7 100644 --- a/.github/workflows/push-check-arm.yml +++ b/.github/workflows/push-check-qemu.yml @@ -1,4 +1,4 @@ -name: Push Check ARM +name: Push Check Linux-Qemu on: push @@ -24,10 +24,10 @@ jobs: restore-keys: | ${{ runner.os }}-go- - - name: Compatibility Test + - name: Compatibility Test - qemu run: | printf ' #!/bin/bash\n if [ ! -x "/usr/bin/qemu-x86_64" ];then\n sudo apt-get update\n sudo apt-get -y install make gcc g++ libglib2.0-dev libpixman-1-dev libfdt-dev python3-pip ninja-build\n sudo pip3 install meson\n wget https://download.qemu.org/qemu-6.2.0.tar.xz\n tar -xvf qemu-6.2.0.tar.xz\n cd qemu-6.2.0\n sudo ./configure\n sudo make -j 4\n sudo make install\n cd ..\n cp /usr/local/bin/qemu-x86_64 /usr/bin/qemu-x86_64\n fi\n' > qemu_install.sh chmod +x qemu_install.sh ./qemu_install.sh GOARCH=amd64 go test -gcflags=-d=checkptr=0 -c . - qemu-x86_64 -cpu max ./sonic.test -test.v + qemu-x86_64 -cpu max ./sonic.test -test.v \ No newline at end of file diff --git a/.github/workflows/push-check-windows.yml b/.github/workflows/push-check-windows.yml index ff7ccf3..453ec9a 100644 --- a/.github/workflows/push-check-windows.yml +++ b/.github/workflows/push-check-windows.yml @@ -1,4 +1,4 @@ -name: Push Check Windows +name: Push Check Windows-X64 on: push diff --git a/ast/api_amd64.go b/ast/api_amd64.go new file mode 100644 index 0000000..caac7a6 --- /dev/null +++ b/ast/api_amd64.go @@ -0,0 +1,91 @@ +//go:build amd64 +// +build amd64 + +package ast + +import ( + `runtime` + `unsafe` + + `github.com/bytedance/sonic/encoder` + `github.com/bytedance/sonic/internal/native` + `github.com/bytedance/sonic/internal/native/types` + `github.com/bytedance/sonic/internal/rt` + uq `github.com/bytedance/sonic/unquote` + `github.com/chenzhuoyu/base64x` +) + +var typeByte = rt.UnpackEface(byte(0)).Type + +func quote(buf *[]byte, val string) { + *buf = append(*buf, '"') + if len(val) == 0 { + *buf = append(*buf, '"') + } + + sp 
:= rt.IndexChar(val, 0) + nb := len(val) + b := (*rt.GoSlice)(unsafe.Pointer(buf)) + + // input buffer + for nb > 0 { + // output buffer + dp := unsafe.Pointer(uintptr(b.Ptr) + uintptr(b.Len)) + dn := b.Cap - b.Len + // call native.Quote, dn is byte count it outputs + ret := native.Quote(sp, nb, dp, &dn, 0) + // update *buf length + b.Len += dn + + // no need more output + if ret >= 0 { + break + } + + // double buf size + *b = growslice(typeByte, *b, b.Cap*2) + // ret is the complement of consumed input + ret = ^ret + // update input buffer + nb -= ret + sp = unsafe.Pointer(uintptr(sp) + uintptr(ret)) + } + + runtime.KeepAlive(buf) + runtime.KeepAlive(sp) + *buf = append(*buf, '"') +} + +func unquote(src string) (string, types.ParsingError) { + return uq.String(src) +} + +func decodeBase64(src string) ([]byte, error) { + return base64x.StdEncoding.DecodeString(src) +} + +func encodeBase64(src []byte) string { + return base64x.StdEncoding.EncodeToString(src) +} + +func (self *Parser) decodeValue() (val types.JsonState) { + sv := (*rt.GoString)(unsafe.Pointer(&self.s)) + self.p = native.Value(sv.Ptr, sv.Len, self.p, &val, 0) + return +} + +func (self *Parser) skip() (int, types.ParsingError) { + fsm := types.NewStateMachine() + start := native.SkipOne(&self.s, &self.p, fsm, 0) + types.FreeStateMachine(fsm) + + if start < 0 { + return self.p, types.ParsingError(-start) + } + return start, 0 +} + +func (self *Node) encodeInterface(buf *[]byte) error { + //WARN: NOT compatible with json.Encoder + return encoder.EncodeInto(buf, self.packAny(), 0) +} \ No newline at end of file diff --git a/ast/api_amd64_test.go b/ast/api_amd64_test.go new file mode 100644 index 0000000..391baaf --- /dev/null +++ b/ast/api_amd64_test.go @@ -0,0 +1,34 @@ +//go:build amd64 +// +build amd64 + +package ast + +import ( + `testing` + + `github.com/bytedance/sonic/encoder` + `github.com/stretchr/testify/assert` +) + +func TestSortNodeTwitter(t *testing.T) {root, err := 
NewSearcher(_TwitterJson).GetByPath() + if err != nil { + t.Fatal(err) + } + obj, err := root.MapUseNumber() + if err != nil { + t.Fatal(err) + } + exp, err := encoder.Encode(obj, encoder.SortMapKeys) + if err != nil { + t.Fatal(err) + } + if err := root.SortKeys(true); err != nil { + t.Fatal(err) + } + act, err := root.MarshalJSON() + if err != nil { + t.Fatal(err) + } + assert.Equal(t, len(exp), len(act)) + assert.Equal(t, string(exp), string(act)) +} \ No newline at end of file diff --git a/ast/api_compat.go b/ast/api_compat.go new file mode 100644 index 0000000..6efe677 --- /dev/null +++ b/ast/api_compat.go @@ -0,0 +1,62 @@ +//go:build !amd64 +// +build !amd64 + +package ast + +import ( + `encoding/base64` + `encoding/json` + + `github.com/bytedance/sonic/internal/native/types` + `github.com/bytedance/sonic/internal/rt` +) + +func quote(buf *[]byte, val string) { + quoteString(buf, val) +} + +func unquote(src string) (string, types.ParsingError) { + sp := rt.IndexChar(src, -1) + out, ok := unquoteBytes(rt.BytesFrom(sp, len(src)+2, len(src)+2)) + if !ok { + return "", types.ERR_INVALID_ESCAPE + } + return rt.Mem2Str(out), 0 +} + + + +func decodeBase64(src string) ([]byte, error) { + return base64.StdEncoding.DecodeString(src) +} + +func encodeBase64(src []byte) string { + return base64.StdEncoding.EncodeToString(src) +} + +func (self *Parser) decodeValue() (val types.JsonState) { + e, v := decodeValue(self.s, self.p) + if e < 0 { + return v + } + self.p = e + return v +} + +func (self *Parser) skip() (int, types.ParsingError) { + e, s := skipValue(self.s, self.p) + if e < 0 { + return self.p, types.ParsingError(-e) + } + self.p = e + return s, 0 +} + +func (self *Node) encodeInterface(buf *[]byte) error { + out, err := json.Marshal(self.packAny()) + if err != nil { + return err + } + *buf = append(*buf, out...) 
+ return nil +} \ No newline at end of file diff --git a/ast/decode.go b/ast/decode.go new file mode 100644 index 0000000..d54e983 --- /dev/null +++ b/ast/decode.go @@ -0,0 +1,430 @@ +package ast + +import ( + `encoding/base64` + `runtime` + `strconv` + `unsafe` + + `github.com/bytedance/sonic/internal/native/types` + `github.com/bytedance/sonic/internal/rt` +) + +const _blankCharsMask = (1 << ' ') | (1 << '\t') | (1 << '\r') | (1 << '\n') + +const ( + bytesNull = "null" + bytesTrue = "true" + bytesFalse = "false" + bytesObject = "{}" + bytesArray = "[]" +) + +func isSpace(c byte) bool { + return (int(1<= se { + return -int(types.ERR_EOF) + } + runtime.KeepAlive(src) + return int(sp - uintptr(rt.IndexChar(src, 0))) +} + +func decodeNull(src string, pos int) (ret int) { + ret = pos + 4 + if ret > len(src) { + return -int(types.ERR_EOF) + } + if src[pos:ret] == bytesNull { + return ret + } else { + return -int(types.ERR_INVALID_CHAR) + } +} + +func decodeTrue(src string, pos int) (ret int) { + ret = pos + 4 + if ret > len(src) { + return -int(types.ERR_EOF) + } + if src[pos:ret] == bytesTrue { + return ret + } else { + return -int(types.ERR_INVALID_CHAR) + } + +} + +func decodeFalse(src string, pos int) (ret int) { + ret = pos + 5 + if ret > len(src) { + return -int(types.ERR_EOF) + } + if src[pos:ret] == bytesFalse { + return ret + } + return -int(types.ERR_INVALID_CHAR) +} + +func decodeString(src string, pos int) (ret int, v string) { + ret, ep := skipString(src, pos) + if ep == -1 { + (*rt.GoString)(unsafe.Pointer(&v)).Ptr = rt.IndexChar(src, pos+1) + (*rt.GoString)(unsafe.Pointer(&v)).Len = ret - pos - 2 + return ret, v + } + + vv, ok := unquoteBytes(rt.Str2Mem(src[pos:ret])) + if !ok { + return -int(types.ERR_INVALID_CHAR), "" + } + + runtime.KeepAlive(src) + return ret, rt.Mem2Str(vv) +} + +func decodeBinary(src string, pos int) (ret int, v []byte) { + var vv string + ret, vv = decodeString(src, pos) + if ret < 0 { + return ret, nil + } + var err error + v, 
err = base64.StdEncoding.DecodeString(vv) + if err != nil { + return -int(types.ERR_INVALID_CHAR), nil + } + return ret, v +} + +func isDigit(c byte) bool { + return c >= '0' && c <= '9' +} + +func decodeInt64(src string, pos int) (ret int, v int64, err error) { + sp := uintptr(rt.IndexChar(src, pos)) + ss := uintptr(sp) + se := uintptr(rt.IndexChar(src, len(src))) + if uintptr(sp) >= se { + return -int(types.ERR_EOF), 0, nil + } + + if c := *(*byte)(unsafe.Pointer(sp)); c == '-' { + sp += 1 + } + if sp == se { + return -int(types.ERR_EOF), 0, nil + } + + for ; sp < se; sp += uintptr(1) { + if !isDigit(*(*byte)(unsafe.Pointer(sp))) { + break + } + } + + if sp < se { + if c := *(*byte)(unsafe.Pointer(sp)); c == '.' || c == 'e' || c == 'E' { + return -int(types.ERR_INVALID_NUMBER_FMT), 0, nil + } + } + + var vv string + ret = int(uintptr(sp) - uintptr((*rt.GoString)(unsafe.Pointer(&src)).Ptr)) + (*rt.GoString)(unsafe.Pointer(&vv)).Ptr = unsafe.Pointer(ss) + (*rt.GoString)(unsafe.Pointer(&vv)).Len = ret - pos + + v, err = strconv.ParseInt(vv, 10, 64) + if err != nil { + //NOTICE: allow overflow here + if err.(*strconv.NumError).Err == strconv.ErrRange { + return ret, 0, err + } + return -int(types.ERR_INVALID_CHAR), 0, err + } + + runtime.KeepAlive(src) + return ret, v, nil +} + +func isNumberChars(c byte) bool { + return (c >= '0' && c <= '9') || c == '+' || c == '-' || c == 'e' || c == 'E' || c == '.' 
+} + +func decodeFloat64(src string, pos int) (ret int, v float64, err error) { + sp := uintptr(rt.IndexChar(src, pos)) + ss := uintptr(sp) + se := uintptr(rt.IndexChar(src, len(src))) + if uintptr(sp) >= se { + return -int(types.ERR_EOF), 0, nil + } + + if c := *(*byte)(unsafe.Pointer(sp)); c == '-' { + sp += 1 + } + if sp == se { + return -int(types.ERR_EOF), 0, nil + } + + for ; sp < se; sp += uintptr(1) { + if !isNumberChars(*(*byte)(unsafe.Pointer(sp))) { + break + } + } + + var vv string + ret = int(uintptr(sp) - uintptr((*rt.GoString)(unsafe.Pointer(&src)).Ptr)) + (*rt.GoString)(unsafe.Pointer(&vv)).Ptr = unsafe.Pointer(ss) + (*rt.GoString)(unsafe.Pointer(&vv)).Len = ret - pos + + v, err = strconv.ParseFloat(vv, 64) + if err != nil { + //NOTICE: allow overflow here + if err.(*strconv.NumError).Err == strconv.ErrRange { + return ret, 0, err + } + return -int(types.ERR_INVALID_CHAR), 0, err + } + + runtime.KeepAlive(src) + return ret, v, nil +} + +func decodeValue(src string, pos int) (ret int, v types.JsonState) { + pos = skipBlank(src, pos) + if pos < 0 { + return pos, types.JsonState{Vt: types.ValueType(pos)} + } + switch c := src[pos]; c { + case 'n': + ret = decodeNull(src, pos) + if ret < 0 { + return ret, types.JsonState{Vt: types.ValueType(ret)} + } + return ret, types.JsonState{Vt: types.V_NULL} + case '"': + var ep int + ret, ep = skipString(src, pos) + if ret < 0 { + return ret, types.JsonState{Vt: types.ValueType(ret)} + } + return ret, types.JsonState{Vt: types.V_STRING, Iv: int64(pos + 1), Ep: ep} + case '{': + return pos + 1, types.JsonState{Vt: types.V_OBJECT} + case '[': + return pos + 1, types.JsonState{Vt: types.V_ARRAY} + case 't': + ret = decodeTrue(src, pos) + if ret < 0 { + return ret, types.JsonState{Vt: types.ValueType(ret)} + } + return ret, types.JsonState{Vt: types.V_TRUE} + case 'f': + ret = decodeFalse(src, pos) + if ret < 0 { + return ret, types.JsonState{Vt: types.ValueType(ret)} + } + return ret, types.JsonState{Vt: 
types.V_FALSE} + case '-', '+', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9': + var iv int64 + ret, iv, _ = decodeInt64(src, pos) + if ret >= 0 { + return ret, types.JsonState{Vt: types.V_INTEGER, Iv: iv, Ep: pos} + } else if ret != -int(types.ERR_INVALID_NUMBER_FMT) { + return ret, types.JsonState{Vt: types.ValueType(ret)} + } + var fv float64 + ret, fv, _ = decodeFloat64(src, pos) + if ret >= 0 { + return ret, types.JsonState{Vt: types.V_DOUBLE, Dv: fv, Ep: pos} + } else { + return ret, types.JsonState{Vt: types.ValueType(ret)} + } + default: + return -int(types.ERR_INVALID_CHAR), types.JsonState{Vt:-types.ValueType(types.ERR_INVALID_CHAR)} + } +} + +func skipNumber(src string, pos int) (ret int) { + sp := uintptr(rt.IndexChar(src, pos)) + se := uintptr(rt.IndexChar(src, len(src))) + if uintptr(sp) >= se { + return -int(types.ERR_EOF) + } + + if c := *(*byte)(unsafe.Pointer(sp)); c == '-' { + sp += 1 + } + ss := sp + + var pointer bool + var exponent bool + var lastIsDigit bool + var nextNeedDigit = true + + for ; sp < se; sp += uintptr(1) { + c := *(*byte)(unsafe.Pointer(sp)) + if isDigit(c) { + lastIsDigit = true + nextNeedDigit = false + continue + } else if nextNeedDigit { + return -int(types.ERR_INVALID_CHAR) + } else if c == '.' 
{ + if !lastIsDigit || pointer || sp == ss { + return -int(types.ERR_INVALID_CHAR) + } + pointer = true + lastIsDigit = false + nextNeedDigit = true + continue + } else if c == 'e' || c == 'E' { + if !lastIsDigit || exponent { + return -int(types.ERR_INVALID_CHAR) + } + if sp == se-1 { + return -int(types.ERR_EOF) + } + exponent = true + lastIsDigit = false + nextNeedDigit = false + continue + } else if c == '-' || c == '+' { + if prev := *(*byte)(unsafe.Pointer(sp - 1)); prev != 'e' && prev != 'E' { + return -int(types.ERR_INVALID_CHAR) + } + lastIsDigit = false + nextNeedDigit = true + continue + } else { + break + } + } + + if nextNeedDigit { + return -int(types.ERR_EOF) + } + + runtime.KeepAlive(src) + return int(uintptr(sp) - uintptr((*rt.GoString)(unsafe.Pointer(&src)).Ptr)) +} + +func skipString(src string, pos int) (ret int, ep int) { + if pos+1 >= len(src) { + return -int(types.ERR_EOF), -1 + } + + sp := uintptr(rt.IndexChar(src, pos)) + se := uintptr(rt.IndexChar(src, len(src))) + + if *(*byte)(unsafe.Pointer(sp)) != '"' { + return -int(types.ERR_INVALID_CHAR), -1 + } + sp += 1 + + ep = -1 + for sp < se { + c := *(*byte)(unsafe.Pointer(sp)) + if c == '\\' { + if ep == -1 { + ep = int(uintptr(sp) - uintptr((*rt.GoString)(unsafe.Pointer(&src)).Ptr)) + } + sp += 2 + continue + } + sp += 1 + if c == '"' { + break + } + } + + if sp > se { + return -int(types.ERR_EOF), -1 + } + + runtime.KeepAlive(src) + return int(uintptr(sp) - uintptr((*rt.GoString)(unsafe.Pointer(&src)).Ptr)), ep +} + +func skipPair(src string, pos int, lchar byte, rchar byte) (ret int) { + if pos+1 >= len(src) { + return -int(types.ERR_EOF) + } + + sp := uintptr(rt.IndexChar(src, pos)) + se := uintptr(rt.IndexChar(src, len(src))) + + if *(*byte)(unsafe.Pointer(sp)) != lchar { + return -int(types.ERR_INVALID_CHAR) + } + + sp += 1 + nbrace := 1 + inquote := false + + for sp < se { + c := *(*byte)(unsafe.Pointer(sp)) + if c == '\\' { + sp += 2 + continue + } else if c == '"' { + inquote = 
!inquote + } else if c == lchar { + if !inquote { + nbrace += 1 + } + } else if c == rchar { + if !inquote { + nbrace -= 1 + if nbrace == 0 { + sp += 1 + break + } + } + } + sp += 1 + } + + if nbrace != 0 { + return -int(types.ERR_INVALID_CHAR) + } + + runtime.KeepAlive(src) + return int(uintptr(sp) - uintptr((*rt.GoString)(unsafe.Pointer(&src)).Ptr)) +} + +func skipValue(src string, pos int) (ret int, start int) { + pos = skipBlank(src, pos) + if pos < 0 { + return pos, -1 + } + switch c := src[pos]; c { + case 'n': + ret = decodeNull(src, pos) + case '"': + ret, _ = skipString(src, pos) + case '{': + ret = skipPair(src, pos, '{', '}') + case '[': + ret = skipPair(src, pos, '[', ']') + case 't': + ret = decodeTrue(src, pos) + case 'f': + ret = decodeFalse(src, pos) + case '-', '+', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9': + ret = skipNumber(src, pos) + default: + ret = -int(types.ERR_INVALID_CHAR) + } + return ret, pos +} diff --git a/ast/encode.go b/ast/encode.go index e07b9cd..b9bcaea 100644 --- a/ast/encode.go +++ b/ast/encode.go @@ -17,26 +17,77 @@ package ast import ( - `reflect` `sync` - `unsafe` - - `github.com/bytedance/sonic/encoder` - `github.com/bytedance/sonic/internal/native` - `github.com/bytedance/sonic/internal/rt` + `unicode/utf8` ) const ( _MaxBuffer = 1024 // 1KB buffer size ) -const ( - bytesNull = "null" - bytesTrue = "true" - bytesFalse = "false" - bytesObject = "{}" - bytesArray = "[]" -) +func quoteString(e *[]byte, s string) { + *e = append(*e, '"') + start := 0 + for i := 0; i < len(s); { + if b := s[i]; b < utf8.RuneSelf { + if safeSet[b] { + i++ + continue + } + if start < i { + *e = append(*e, s[start:i]...) + } + *e = append(*e, '\\') + switch b { + case '\\', '"': + *e = append(*e, b) + case '\n': + *e = append(*e, 'n') + case '\r': + *e = append(*e, 'r') + case '\t': + *e = append(*e, 't') + default: + // This encodes bytes < 0x20 except for \t, \n and \r. 
+ // If escapeHTML is set, it also escapes <, >, and & + // because they can lead to security holes when + // user-controlled strings are rendered into JSON + // and served to some browsers. + *e = append(*e, `u00`...) + *e = append(*e, hex[b>>4]) + *e = append(*e, hex[b&0xF]) + } + i++ + start = i + continue + } + c, size := utf8.DecodeRuneInString(s[i:]) + // if c == utf8.RuneError && size == 1 { + // if start < i { + // e.Write(s[start:i]) + // } + // e.WriteString(`\ufffd`) + // i += size + // start = i + // continue + // } + if c == '\u2028' || c == '\u2029' { + if start < i { + *e = append(*e, s[start:i]...) + } + *e = append(*e, `\u202`...) + *e = append(*e, hex[c&0xF]) + i += size + start = i + continue + } + i += size + } + if start < len(s) { + *e = append(*e, s[start:]...) + } + *e = append(*e, '"') +} var bytesPool = sync.Pool{} @@ -117,46 +168,13 @@ func (self *Node) encodeNumber(buf *[]byte) error { return nil } -var typeByte = rt.UnpackType(reflect.TypeOf(byte(0))) - -func quote(buf *[]byte, sp unsafe.Pointer, nb int) { - b := (*rt.GoSlice)(unsafe.Pointer(buf)) - // input buffer - for nb > 0 { - // output buffer - dp := unsafe.Pointer(uintptr(b.Ptr) + uintptr(b.Len)) - dn := b.Cap - b.Len - // call native.Quote, dn is byte count it outputs - ret := native.Quote(sp, nb, dp, &dn, 0) - // update *buf length - b.Len += dn - - // no need more output - if ret >= 0 { - break - } - - // double buf size - *b = growslice(typeByte, *b, b.Cap * 2) - // ret is the complement of consumed input - ret = ^ret - // update input buffer - nb -= ret - sp = unsafe.Pointer(uintptr(sp) + uintptr(ret)) - } -} - func (self *Node) encodeString(buf *[]byte) error { - *buf = append(*buf, '"') - nb := int(self.v) - if nb == 0 { - *buf = append(*buf, '"') + if self.v == 0 { + *buf = append(*buf, '"', '"') return nil } - quote(buf, self.p, nb) - - *buf = append(*buf, '"') + quote(buf, addr2str(self.p, self.v)) return nil } @@ -194,16 +212,14 @@ func (self *Node) encodeArray(buf 
*[]byte) error { } func (self *Pair) encode(buf *[]byte) error { - *buf = append(*buf, '"') - sptr := (*rt.GoString)(unsafe.Pointer(&self.Key)) - if sptr.Len == 0 { - *buf = append(*buf, '"', ':') + if len(self.Key) == 0 { // empty key: write "" directly instead of calling quote + *buf = append(*buf, '"', '"', ':') return self.Value.encode(buf) } - quote(buf, sptr.Ptr, sptr.Len) + quote(buf, self.Key) + *buf = append(*buf, ':') - *buf = append(*buf, '"', ':') return self.Value.encode(buf) } @@ -238,8 +254,4 @@ func (self *Node) encodeObject(buf *[]byte) error { *buf = append(*buf, '}') return nil -} - -func (self *Node) encodeInterface(buf *[]byte) error { - return encoder.EncodeInto(buf, self.packAny(), 0) } \ No newline at end of file diff --git a/ast/encode_test.go b/ast/encode_test.go index c451834..848600d 100644 --- a/ast/encode_test.go +++ b/ast/encode_test.go @@ -17,12 +17,11 @@ package ast import ( + `encoding/json` `runtime` `sync` `testing` - `github.com/bytedance/sonic/decoder` - `github.com/bytedance/sonic/encoder` `github.com/bytedance/sonic/internal/native/types` `github.com/stretchr/testify/assert` ) @@ -63,14 +62,15 @@ func TestGC_Encode(t *testing.T) { func TestEncodeValue(t *testing.T) { obj := new(_TwitterStruct) - if err := decoder.NewDecoder(_TwitterJson).Decode(obj); err != nil { + if err := json.Unmarshal([]byte(_TwitterJson), obj); err != nil { t.Fatal(err) } - buf, err := encoder.Encode(obj, 0) + // buf, err := encoder.Encode(obj, encoder.EscapeHTML|encoder.SortMapKeys) + buf, err := json.Marshal(obj) if err != nil { t.Fatal(err) } - quote, err := encoder.Encode(_TwitterJson, 0) + quote, err := json.Marshal(_TwitterJson) if err != nil { t.Fatal(err) } @@ -90,16 +90,17 @@ func TestEncodeValue(t *testing.T) { {NewArray([]Node{}), "[]", false}, {NewArray([]Node{NewBool(true), NewString("true"), NewString("\t")}), `[true,"true","\t"]`, false}, {NewObject([]Pair{Pair{"a", NewNull()}, Pair{"b", NewNumber("0")}}), `{"a":null,"b":0}`, false}, - {NewObject([]Pair{Pair{"\ta", NewString("\t")}, Pair{"\bb", 
NewString("\b")}, Pair{"\nb", NewString("\n")}, Pair{"\ra", NewString("\r")}}), `{"\ta":"\t","\u0008b":"\u0008","\nb":"\n","\ra":"\r"}`, false}, + {NewObject([]Pair{Pair{"\ta", NewString("\t")}, Pair{"\bb", NewString("\b")}, Pair{"\nb", NewString("\n")}, Pair{"\ra", NewString("\r")}}),`{"\ta":"\t","\u0008b":"\u0008","\nb":"\n","\ra":"\r"}`, false}, {NewObject([]Pair{}), `{}`, false}, {NewBytes([]byte("hello, world")), `"aGVsbG8sIHdvcmxk"`, false}, {NewAny(obj), string(buf), false}, - {NewRaw(`[{ }]`), "[{ }]", false}, + {NewRaw(`[{ }]`), "[{}]", false}, {Node{}, "", true}, {Node{t: types.ValueType(1)}, "", true}, } for i, c := range input { - buf, err := encoder.Encode(&c.node, 0) + t.Log(i) + buf, err := json.Marshal(&c.node) if c.err { if err == nil { t.Fatal(i) diff --git a/ast/error.go b/ast/error.go new file mode 100644 index 0000000..f4c441a --- /dev/null +++ b/ast/error.go @@ -0,0 +1,98 @@ +package ast + +import ( + `fmt` + `strings` + `unsafe` + + `github.com/bytedance/sonic/internal/native/types` +) + +func (self *Parser) syntaxError(err types.ParsingError) SyntaxError { + return SyntaxError{ + Pos : self.p, + Src : self.s, + Code: err, + } +} + +func newSyntaxError(err SyntaxError) *Node { + msg := err.Description() + return &Node{ + t: V_ERROR, + v: int64(err.Code), + p: unsafe.Pointer(&msg), + } +} + +type SyntaxError struct { + Pos int + Src string + Code types.ParsingError + Msg string +} + +func (self SyntaxError) Error() string { + return fmt.Sprintf("%q", self.Description()) +} + +func (self SyntaxError) Description() string { + return "Syntax error " + self.description() +} + +func (self SyntaxError) description() string { + i := 16 + p := self.Pos - i + q := self.Pos + i + + /* check for empty source */ + if self.Src == "" { + return fmt.Sprintf("no sources available: %#v", self) + } + + /* prevent slicing before the beginning */ + if p < 0 { + p, q, i = 0, q - p, i + p + } + + /* prevent slicing beyond the end */ + if n := len(self.Src); q > n { 
+ n = q - n + q = len(self.Src) + + /* move the left bound if possible */ + if p > n { + i += n + p -= n + } + } + + /* left and right length */ + x := clamp_zero(i) + y := clamp_zero(q - p - i - 1) + + /* compose the error description */ + return fmt.Sprintf( + "at index %d: %s\n\n\t%s\n\t%s^%s\n", + self.Pos, + self.Message(), + self.Src[p:q], + strings.Repeat(".", x), + strings.Repeat(".", y), + ) +} + +func (self SyntaxError) Message() string { + if self.Msg == "" { + return self.Code.Message() + } + return self.Msg +} + +func clamp_zero(v int) int { + if v < 0 { + return 0 + } else { + return v + } +} diff --git a/ast/node.go b/ast/node.go index ae386c9..d9f9c0d 100644 --- a/ast/node.go +++ b/ast/node.go @@ -22,10 +22,8 @@ import ( `strconv` `unsafe` - `github.com/bytedance/sonic/decoder` `github.com/bytedance/sonic/internal/native/types` `github.com/bytedance/sonic/internal/rt` - `github.com/chenzhuoyu/base64x` ) const ( @@ -1566,7 +1564,7 @@ func NewBytes(src []byte) Node { if len(src) == 0 { panic("empty src bytes") } - out := base64x.StdEncoding.EncodeToString(src) + out := encodeBase64(src) return NewString(out) } @@ -1756,15 +1754,6 @@ func newError(err types.ParsingError, msg string) *Node { } } -func newSyntaxError(err *decoder.SyntaxError) *Node { - msg := err.Description() - return &Node{ - t: V_ERROR, - v: int64(err.Code), - p: unsafe.Pointer(&msg), - } -} - var typeJumpTable = [256]types.ValueType{ '"' : types.V_STRING, '-' : _V_NUMBER, diff --git a/ast/node_test.go b/ast/node_test.go index 5a46a38..75ddc17 100644 --- a/ast/node_test.go +++ b/ast/node_test.go @@ -26,7 +26,6 @@ import ( `strconv` `testing` - `github.com/bytedance/sonic/encoder` `github.com/bytedance/sonic/internal/native/types` `github.com/bytedance/sonic/internal/rt` `github.com/stretchr/testify/assert` @@ -34,7 +33,8 @@ import ( func TestNodeSortKeys(t *testing.T) { - root, err := NewSearcher(_TwitterJson).GetByPath() + var src = `{"b":1,"a":2,"c":3}` + root, err := 
NewSearcher(src).GetByPath() if err != nil { t.Fatal(err) } @@ -42,7 +42,7 @@ func TestNodeSortKeys(t *testing.T) { if err != nil { t.Fatal(err) } - exp, err := encoder.Encode(obj, encoder.SortMapKeys) + exp, err := json.Marshal(obj) if err != nil { t.Fatal(err) } diff --git a/ast/parser.go b/ast/parser.go index 9833796..880dbf0 100644 --- a/ast/parser.go +++ b/ast/parser.go @@ -18,13 +18,9 @@ package ast import ( `fmt` - `unsafe` - `github.com/bytedance/sonic/decoder` - `github.com/bytedance/sonic/internal/native` `github.com/bytedance/sonic/internal/native/types` `github.com/bytedance/sonic/internal/rt` - `github.com/bytedance/sonic/unquote` ) const _DEFAULT_NODE_CAP int = 16 @@ -112,12 +108,6 @@ func (self *Parser) lspace(sp int) int { return sp } -func (self *Parser) decodeValue() (val types.JsonState) { - sv := (*rt.GoString)(unsafe.Pointer(&self.s)) - self.p = native.Value(sv.Ptr, sv.Len, self.p, &val, 0) - return -} - func (self *Parser) decodeArray(ret []Node) (Node, types.ParsingError) { sp := self.p ns := len(self.s) @@ -213,7 +203,7 @@ func (self *Parser) decodeObject(ret []Pair) (Node, types.ParsingError) { /* check for escape sequence */ if njs.Ep != -1 { - if key, err = unquote.String(key); err != 0 { + if key, err = unquote(key); err != 0 { return Node{}, err } } @@ -277,14 +267,13 @@ func (self *Parser) decodeString(iv int64, ep int) (Node, types.ParsingError) { } /* unquote the string */ - buf := make([]byte, 0, len(s)) - err := unquote.IntoBytes(s, &buf) + out, err := unquote(s) /* check for errors */ if err != 0 { return Node{}, err } else { - return newBytes(buf), 0 + return newBytes(rt.Str2Mem(out)), 0 } } @@ -317,17 +306,6 @@ func (self *Parser) Parse() (Node, types.ParsingError) { } } -func (self *Parser) skip() (int, types.ParsingError) { - fsm := types.NewStateMachine() - start := native.SkipOne(&self.s, &self.p, fsm, uint64(0)) - types.FreeStateMachine(fsm) - - if start < 0 { - return self.p, types.ParsingError(-start) - } - return start, 
0 -} - func (self *Parser) searchKey(match string) types.ParsingError { ns := len(self.s) if err := self.object(); err != 0 { @@ -361,7 +339,7 @@ func (self *Parser) searchKey(match string) types.ParsingError { /* check for escape sequence */ if njs.Ep != -1 { - if key, err = unquote.String(key); err != 0 { + if key, err = unquote(key); err != 0 { return err } } @@ -542,7 +520,7 @@ func (self *Node) skipNextPair() (*Pair) { /* check for escape sequence */ if njs.Ep != -1 { - if key, err = unquote.String(key); err != 0 { + if key, err = unquote(key); err != 0 { return &Pair{key, *newSyntaxError(parser.syntaxError(err))} } } @@ -633,17 +611,9 @@ func (self *Parser) ExportError(err types.ParsingError) error { if err == _ERR_NOT_FOUND { return ErrNotExist } - return fmt.Errorf("%q", decoder.SyntaxError{ + return fmt.Errorf("%q", SyntaxError{ Pos : self.p, Src : self.s, Code: err, }.Description()) -} - -func (self *Parser) syntaxError(err types.ParsingError) *decoder.SyntaxError { - return &decoder.SyntaxError{ - Pos : self.p, - Src : self.s, - Code: err, - } } \ No newline at end of file diff --git a/ast/search_test.go b/ast/search_test.go index 5919596..57fdf06 100644 --- a/ast/search_test.go +++ b/ast/search_test.go @@ -20,6 +20,7 @@ import ( `math` `runtime` `strconv` + `strings` `sync` `testing` @@ -60,7 +61,7 @@ func TestExportError(t *testing.T) { if err == nil { t.Fatal() } - if err.Error() != `"Syntax error at index 6: invalid char\n\n\t{\"a\":]\n\t......^\n"` { + if strings.Index(err.Error(), `"Syntax error at `) != 0 { t.Fatal(err) } diff --git a/ast/stubs.go b/ast/stubs.go index 5e933c5..002b920 100644 --- a/ast/stubs.go +++ b/ast/stubs.go @@ -19,6 +19,7 @@ package ast import ( `unsafe` `reflect` + `unicode/utf8` `github.com/bytedance/sonic/internal/rt` ) @@ -38,4 +39,43 @@ func unsafe_NewArray(typ *rt.GoType, n int) unsafe.Pointer //go:linkname growslice runtime.growslice //goland:noinspection GoUnusedParameter -func growslice(et *rt.GoType, old rt.GoSlice, 
cap int) rt.GoSlice \ No newline at end of file +func growslice(et *rt.GoType, old rt.GoSlice, cap int) rt.GoSlice + +//go:nosplit +func mem2ptr(s []byte) unsafe.Pointer { + return (*rt.GoSlice)(unsafe.Pointer(&s)).Ptr +} + +//go:nosplit +func ptr2slice(s unsafe.Pointer, l int, c int) unsafe.Pointer { + slice := &rt.GoSlice{ + Ptr: s, + Len: l, + Cap: c, + } + return unsafe.Pointer(slice) +} + +//go:nosplit +func str2ptr(s string) unsafe.Pointer { + return (*rt.GoString)(unsafe.Pointer(&s)).Ptr +} + +//go:nosplit +func addr2str(p unsafe.Pointer, n int64) (s string) { + (*rt.GoString)(unsafe.Pointer(&s)).Ptr = p + (*rt.GoString)(unsafe.Pointer(&s)).Len = int(n) + return +} + + +var ( + //go:linkname safeSet encoding/json.safeSet + safeSet [utf8.RuneSelf]bool + + //go:linkname hex encoding/json.hex + hex string +) + +//go:linkname unquoteBytes encoding/json.unquoteBytes +func unquoteBytes(s []byte) (t []byte, ok bool) \ No newline at end of file diff --git a/ast/utils.go b/ast/utils.go deleted file mode 100644 index 0f99a77..0000000 --- a/ast/utils.go +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Copyright 2021 ByteDance Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package ast - -import ( - `unsafe` - - `github.com/bytedance/sonic/internal/rt` -) - -//go:nosplit -func mem2ptr(s []byte) unsafe.Pointer { - return (*rt.GoSlice)(unsafe.Pointer(&s)).Ptr -} - -//go:nosplit -func ptr2slice(s unsafe.Pointer, l int, c int) unsafe.Pointer { - slice := &rt.GoSlice{ - Ptr: s, - Len: l, - Cap: c, - } - return unsafe.Pointer(slice) -} - -//go:nosplit -func str2ptr(s string) unsafe.Pointer { - return (*rt.GoString)(unsafe.Pointer(&s)).Ptr -} - -//go:nosplit -func addr2str(p unsafe.Pointer, n int64) (s string) { - (*rt.GoString)(unsafe.Pointer(&s)).Ptr = p - (*rt.GoString)(unsafe.Pointer(&s)).Len = int(n) - return -} - -const _SPACE_CHAR_MASK = (1<<' ')|(1<<'\t')|(1<<'\r')|(1<<'\n') - -func isSpace(c byte) bool { - return (int(1<>1 + n + l + if c < 32 { + c = 32 + } + tmp := make([]byte, l, c) + copy(tmp, *buf) + *buf = tmp + } + return +}