From 02865de676f323bd9e074efce1d40f7a42bd54cb Mon Sep 17 00:00:00 2001 From: liu Date: Fri, 10 Feb 2023 18:55:27 +0800 Subject: [PATCH] feat: repl invalid utf8 in serde by option (#357) --- .github/workflows/license-check.yml | 2 +- .github/workflows/push-check-go118.yml | 2 +- .github/workflows/push-check-linux-arm64.yml | 2 +- .github/workflows/push-check-linux-x64.yml | 2 +- .github/workflows/push-check-qemu.yml | 2 +- .github/workflows/push-check-windows.yml | 2 +- Makefile | 8 +- api.go | 10 +- ast/node.go | 5 +- decode_test.go | 76 +- decoder/decoder.go | 32 +- decoder/decoder_test.go | 1 - encode_test.go | 31 +- encoder/encoder.go | 66 +- encoder/primitives.go | 33 + fuzz/Makefile | 8 +- fuzz/fuzz_test.go | 143 +- fuzz/go.mod | 11 +- fuzz/go.sum | 37 +- fuzz/struct_fuzz_test.go | 12 +- internal/native/avx/native_amd64.go | 12 +- internal/native/avx/native_amd64.s | 13868 +++++++-------- internal/native/avx/native_amd64_test.go | 54 - internal/native/avx/native_subr_amd64.go | 58 +- internal/native/avx2/native_amd64.go | 12 +- internal/native/avx2/native_amd64.s | 14810 +++++++++-------- internal/native/avx2/native_amd64_test.go | 54 - internal/native/avx2/native_subr_amd64.go | 68 +- internal/native/dispatch_amd64.go | 15 +- internal/native/dispatch_amd64.s | 20 +- internal/native/native_amd64.tmpl | 12 +- internal/native/native_amd64_test.tmpl | 54 - internal/native/sse/native_amd64.go | 12 +- internal/native/sse/native_amd64.s | 13483 +++++++-------- internal/native/sse/native_amd64_test.go | 54 - internal/native/sse/native_subr_amd64.go | 62 +- internal/native/types/types.go | 4 +- licenses/LICENSE-simdjson | 201 + native/f32toa.c | 4 +- native/fastfloat.c | 4 +- native/native.c | 2 + native/native.h | 7 +- native/parsing.c | 22 +- native/scanning.c | 111 +- native/test/xassert.h | 1 + native/test/xprintf.h | 62 +- native/types.h | 4 +- native/utf8.h | 396 +- native/utils.h | 78 + sonic.go | 29 +- utf8/utf8.go | 71 + utf8/utf8_test.go | 138 + 52 files 
changed, 23478 insertions(+), 20789 deletions(-) create mode 100644 licenses/LICENSE-simdjson create mode 100644 native/utils.h create mode 100644 utf8/utf8.go create mode 100644 utf8/utf8_test.go diff --git a/.github/workflows/license-check.yml b/.github/workflows/license-check.yml index 07ad3aa..5034c2b 100644 --- a/.github/workflows/license-check.yml +++ b/.github/workflows/license-check.yml @@ -1,6 +1,6 @@ name: License Check -on: push +on: pull_request jobs: build: diff --git a/.github/workflows/push-check-go118.yml b/.github/workflows/push-check-go118.yml index 795e140..c2d501c 100644 --- a/.github/workflows/push-check-go118.yml +++ b/.github/workflows/push-check-go118.yml @@ -1,6 +1,6 @@ name: Push Check Go1.18-Linux-X64 -on: push +on: pull_request jobs: build: diff --git a/.github/workflows/push-check-linux-arm64.yml b/.github/workflows/push-check-linux-arm64.yml index 9ee5f80..3a1ee13 100644 --- a/.github/workflows/push-check-linux-arm64.yml +++ b/.github/workflows/push-check-linux-arm64.yml @@ -1,6 +1,6 @@ name: Push Check Linux-ARM -on: push +on: pull_request jobs: build: diff --git a/.github/workflows/push-check-linux-x64.yml b/.github/workflows/push-check-linux-x64.yml index 650dd8a..84fc932 100644 --- a/.github/workflows/push-check-linux-x64.yml +++ b/.github/workflows/push-check-linux-x64.yml @@ -1,6 +1,6 @@ name: Push Check Linux-X64 -on: push +on: pull_request jobs: build: diff --git a/.github/workflows/push-check-qemu.yml b/.github/workflows/push-check-qemu.yml index ffbdf48..9939aa4 100644 --- a/.github/workflows/push-check-qemu.yml +++ b/.github/workflows/push-check-qemu.yml @@ -1,6 +1,6 @@ name: Push Check Linux-Qemu -on: push +on: pull_request jobs: build: diff --git a/.github/workflows/push-check-windows.yml b/.github/workflows/push-check-windows.yml index 77fc6f0..7dfbb47 100644 --- a/.github/workflows/push-check-windows.yml +++ b/.github/workflows/push-check-windows.yml @@ -1,6 +1,6 @@ name: Push Check Windows-X64 -on: push +on: 
pull_request jobs: build: diff --git a/Makefile b/Makefile index d05da4f..8cc0acf 100644 --- a/Makefile +++ b/Makefile @@ -35,7 +35,7 @@ CC_amd64 := clang ASM2ASM_amd64 := tools/asm2asm/asm2asm.py CFLAGS := -mno-red-zone -CFLAGS += -arch x86_64 +CFLAGS += -target x86_64-apple-macos11 CFLAGS += -fno-asynchronous-unwind-tables CFLAGS += -fno-builtin CFLAGS += -fno-exceptions @@ -100,8 +100,10 @@ endef all: ${ARCH} clean: - rm -vfr ${TMP_DIR}/{sse,avx,avx2} - rm -vfr ${OUT_DIR}/{sse,avx,avx2} + for arch in ${ARCH}; do \ + rm -vfr ${TMP_DIR}/$${arch}; \ + rm -vfr ${OUT_DIR}/$${arch}; \ + done $(foreach \ arch, \ diff --git a/api.go b/api.go index a3dba54..a2bc67e 100644 --- a/api.go +++ b/api.go @@ -66,8 +66,8 @@ type Config struct { // CopyString indicates decoder to decode string values by copying instead of referring. CopyString bool - // ValidateString indicates decoder to valid string values: decoder will return errors when - // invalid UTF-8 chars or unescaped control chars(\u0000-\u001f) in the string value of JSON. + // ValidateString indicates decoder and encoder to validate string values: invalid UTF-8 is replaced + // with U+FFFD, and decoder returns errors for unescaped control chars(\u0000-\u001f) in JSON strings. ValidateString bool } @@ -81,6 +81,7 @@ var ( SortMapKeys: true, CompactMarshaler: true, CopyString : true, + ValidateString : true, }.Froze() // ConfigFastest is the fastest config of APIs, aiming at speed. @@ -164,12 +165,15 @@ func UnmarshalString(buf string, val interface{}) error { return ConfigDefault.UnmarshalFromString(buf, val) } -// Get searches the given path json, +// Get searches the given path from json, // and returns its representing ast.Node. // // Each path arg must be integer or string: // - Integer means searching current node as array // - String means searching current node as object +// +// Note, the api expects the json is well-formed at least, +// otherwise it may return unexpected result. 
func Get(src []byte, path ...interface{}) (ast.Node, error) { return GetFromString(string(src), path...) } diff --git a/ast/node.go b/ast/node.go index c8f148f..0d37baf 100644 --- a/ast/node.go +++ b/ast/node.go @@ -691,7 +691,10 @@ func (self *Node) AddAny(val interface{}) error { } // GetByPath load given path on demands, -// which only ensure nodes before this path got parsed +// which only ensure nodes before this path got parsed. +// +// Note, the api expects the json is well-formed at least, +// otherwise it may return unexpected result. func (self *Node) GetByPath(path ...interface{}) *Node { if !self.Valid() { return self diff --git a/decode_test.go b/decode_test.go index cac964d..d14c823 100644 --- a/decode_test.go +++ b/decode_test.go @@ -34,7 +34,6 @@ import ( `strings` `testing` `time` - `unicode/utf8` `unsafe` `github.com/bytedance/sonic/decoder` @@ -1011,8 +1010,8 @@ var unmarshalTests = []unmarshalTest{ {in: "\"\x00\"", ptr: new(interface{}), err: fmt.Errorf("json: invald char"), validateString: true}, {in: "\"\x00\"", ptr: new(string), err: fmt.Errorf("json: invald char"), validateString: true}, - {in: "\"\xff\"", ptr: new(interface{}), err: fmt.Errorf("json: invald char"), validateString: true}, - {in: "\"\xff\"", ptr: new(string), err: fmt.Errorf("json: invald char"), validateString: true}, + {in: "\"\xff\"", ptr: new(interface{}), out: interface{}("\ufffd"), validateString: true}, + {in: "\"\xff\"", ptr: new(string), out: "\ufffd", validateString: true}, {in: "\"\x00\"", ptr: new(interface{}), out: interface{}("\x00"), validateString: false}, {in: "\"\x00\"", ptr: new(string), out: "\x00", validateString: false}, {in: "\"\xff\"", ptr: new(interface{}), out: interface{}("\xff"), validateString: false}, @@ -1147,7 +1146,6 @@ func TestUnmarshal(t *testing.T) { } dec := decoder.NewDecoder(tt.in) - validUtf8 := true if tt.useNumber { dec.UseNumber() } @@ -1156,10 +1154,9 @@ func TestUnmarshal(t *testing.T) { } if tt.validateString { 
dec.ValidateString() - validUtf8 = utf8.Valid([]byte(tt.in)) } - if err := dec.Decode(v.Interface()); (err == nil) != (tt.err == nil && validUtf8) { - spew.Dump(tt.in) + if err := dec.Decode(v.Interface()); (err == nil) != (tt.err == nil) { + spew.Dump(tt) t.Fatalf("#%d: %v, want %v", i, err, tt.err) continue } else if err != nil { @@ -2524,3 +2521,68 @@ func TestChangeTool(t *testing.T) { } } + +func TestDecoder_LongestInvalidUtf8(t *testing.T) { + for _, data := range([]string{ + "\"" + strings.Repeat("\x80", 4096) + "\"", + "\"" + strings.Repeat("\x80", 4095) + "\"", + "\"" + strings.Repeat("\x80", 4097) + "\"", + "\"" + strings.Repeat("\x80", 12345) + "\"", + }) { + testDecodeInvalidUtf8(t, []byte(data)) + } +} + +func testDecodeInvalidUtf8(t *testing.T, data []byte) { + var sgot, jgot string + serr := ConfigStd.Unmarshal(data, &sgot) + jerr := json.Unmarshal(data, &jgot) + assert.Equal(t, serr != nil, jerr != nil) + if jerr == nil { + assert.Equal(t, sgot, jgot) + } +} + +func needEscape(b byte) bool { + return b == '"' || b == '\\' || b < '\x20' +} + +func genRandJsonBytes(length int) []byte { + var buf bytes.Buffer + buf.WriteByte('"') + for j := 0; j < length; j++ { + r := rand.Intn(0xff + 1) + if needEscape(byte(r)) { + buf.WriteByte('\\') + } + buf.WriteByte(byte(r)) + } + buf.WriteByte('"') + return buf.Bytes() +} + +func genRandJsonRune(length int) []byte { + var buf bytes.Buffer + buf.WriteByte('"') + for j := 0; j < length; j++ { + r := rand.Intn(0x10FFFF + 1) + if r < 0x80 && needEscape(byte(r)) { + buf.WriteByte('\\') + buf.WriteByte(byte(r)) + } else { + buf.WriteRune(rune(r)) + } + } + buf.WriteByte('"') + return buf.Bytes() +} + +func TestDecoder_RandomInvalidUtf8(t *testing.T) { + nums := 1000 + maxLen := 1000 + for i := 0; i < nums; i++ { + length := rand.Intn(maxLen) + testDecodeInvalidUtf8(t, genRandJsonBytes(length)) + testDecodeInvalidUtf8(t, genRandJsonRune(length)) + } +} diff --git a/decoder/decoder.go b/decoder/decoder.go index 
190e318..5326f97 100644 --- a/decoder/decoder.go +++ b/decoder/decoder.go @@ -25,6 +25,7 @@ import ( `github.com/bytedance/sonic/internal/native/types` `github.com/bytedance/sonic/internal/rt` `github.com/bytedance/sonic/option` + `github.com/bytedance/sonic/utf8` ) const ( @@ -80,9 +81,39 @@ func (self *Decoder) Reset(s string) { // self.f = 0 } +func (self *Decoder) CheckTrailings() error { + pos := self.i + buf := self.s + /* skip all the trailing spaces */ + if pos != len(buf) { + for pos < len(buf) && (types.SPACE_MASK & (1 << buf[pos])) != 0 { + pos++ + } + } + + /* then it must be at EOF */ + if pos == len(buf) { + return nil + } + + /* junk after JSON value */ + return SyntaxError { + Src : buf, + Pos : pos, + Code : types.ERR_INVALID_CHAR, + } +} + + // Decode parses the JSON-encoded data from current position and stores the result // in the value pointed to by val. func (self *Decoder) Decode(val interface{}) error { + /* validate json if needed */ + if (self.f & (1 << _F_validate_string)) != 0 && !utf8.ValidateString(self.s){ + dbuf := utf8.CorrectWith(nil, rt.Str2Mem(self.s), "\ufffd") + self.s = rt.Mem2Str(dbuf) + } + vv := rt.UnpackEface(val) vp := vv.Value @@ -99,7 +130,6 @@ func (self *Decoder) Decode(val interface{}) error { /* create a new stack, and call the decoder */ sb, etp := newStack(), rt.PtrElem(vv.Type) nb, err := decodeTypedPointer(self.s, self.i, etp, vp, sb, self.f) - /* return the stack back */ self.i = nb freeStack(sb) diff --git a/decoder/decoder_test.go b/decoder/decoder_test.go index 4d95cf2..afcb9c1 100644 --- a/decoder/decoder_test.go +++ b/decoder/decoder_test.go @@ -345,7 +345,6 @@ func TestDecoder_Generic(t *testing.T) { pos, err := decode(TwitterJson, &v, false) assert.NoError(t, err) assert.Equal(t, len(TwitterJson), pos) - spew.Dump(v) } func TestDecoder_Binding(t *testing.T) { diff --git a/encode_test.go b/encode_test.go index b1869b8..6135f0b 100644 --- a/encode_test.go +++ b/encode_test.go @@ -34,6 +34,7 @@ import ( 
`testing` `time` `unsafe` + `strings` `github.com/bytedance/sonic/encoder` `github.com/stretchr/testify/assert` @@ -52,7 +53,6 @@ func TestMain(m *testing.M) { runtime.GC() debug.FreeOSMemory() } - println("stop GC looping!") }() time.Sleep(time.Millisecond) m.Run() @@ -1168,4 +1168,33 @@ func TestMarshalNullNil(t *testing.T) { }.Froze().Marshal(v) assert.Nil(t, e) assert.Equal(t, `{"A":[],"B":{}}`, string(o)) +} + +func TestEncoder_LongestInvalidUtf8(t *testing.T) { + for _, data := range([]string{ + "\"" + strings.Repeat("\x80", 4096) + "\"", + "\"" + strings.Repeat("\x80", 4095) + "\"", + "\"" + strings.Repeat("\x80", 4097) + "\"", + "\"" + strings.Repeat("\x80", 12345) + "\"", + }) { + testEncodeInvalidUtf8(t, []byte(data)) + } +} + +func testEncodeInvalidUtf8(t *testing.T, data []byte) { + jgot, jerr := json.Marshal(data) + sgot, serr := ConfigStd.Marshal(data) + assert.Equal(t, serr != nil, jerr != nil) + if jerr == nil { + assert.Equal(t, sgot, jgot) + } +} + +func TestEncoder_RandomInvalidUtf8(t *testing.T) { + nums := 1000 + maxLen := 1000 + for i := 0; i < nums; i++ { + testEncodeInvalidUtf8(t, genRandJsonBytes(maxLen)) + testEncodeInvalidUtf8(t, genRandJsonRune(maxLen)) + } } \ No newline at end of file diff --git a/encoder/encoder.go b/encoder/encoder.go index 19a35de..7a13301 100644 --- a/encoder/encoder.go +++ b/encoder/encoder.go @@ -21,11 +21,11 @@ import ( `encoding/json` `reflect` `runtime` - `unsafe` `github.com/bytedance/sonic/internal/native` `github.com/bytedance/sonic/internal/native/types` `github.com/bytedance/sonic/internal/rt` + `github.com/bytedance/sonic/utf8` `github.com/bytedance/sonic/option` ) @@ -38,6 +38,7 @@ const ( bitCompactMarshaler bitNoQuoteTextMarshaler bitNoNullSliceOrMap + bitValidateString // used for recursive compile bitPointerValue = 63 @@ -65,6 +66,10 @@ const ( // NoNullSliceOrMap indicates all empty Array or Object are encoded as '[]' or '{}', // instead of 'null' NoNullSliceOrMap Options = 1 << bitNoNullSliceOrMap 
+ + // ValidateString indicates that encoder should validate the input string + // before encoding it into JSON. + ValidateString Options = 1 << bitValidateString // CompatibleWithStd is used to be compatible with std encoder. CompatibleWithStd Options = SortMapKeys | EscapeHTML | CompactMarshaler @@ -100,6 +105,15 @@ func (self *Encoder) SetEscapeHTML(f bool) { } } +// SetValidateString specifies if option ValidateString opens +func (self *Encoder) SetValidateString(f bool) { + if f { + self.Opts |= ValidateString + } else { + self.Opts &= ^ValidateString + } +} + // SetCompactMarshaler specifies if option CompactMarshaler opens func (self *Encoder) SetCompactMarshaler(f bool) { if f { @@ -156,7 +170,7 @@ func Encode(val interface{}, opts Options) ([]byte, error) { return nil, err } - if opts & EscapeHTML != 0 { + if opts & EscapeHTML != 0 || opts & ValidateString != 0 { return buf, nil } @@ -189,6 +203,12 @@ func EncodeInto(buf *[]byte, val interface{}, opts Options) error { *buf = dest } + if opts & ValidateString != 0 && !utf8.Validate(*buf) { + dest := utf8.CorrectWith(nil, *buf, `\ufffd`) + freeBytes(*buf) // free origin used buffer + *buf = dest + } + /* avoid GC ahead */ runtime.KeepAlive(buf) runtime.KeepAlive(efv) @@ -203,38 +223,8 @@ var typeByte = rt.UnpackType(reflect.TypeOf(byte(0))) // For historical reasons, web browsers don't honor standard HTML // escaping within