fix: move large-size local array to _Stack (#162)

* fix: move large-size local array to _Stack * fix: adjust jsonstate and add alwaysinline * fix: initialize types.JsonState's dbuf at prologue * fix: replace `always_inline` with `inline` * fix: remove unused types * feat: update asm2asm * fix: check stack size before calling JIT function Co-authored-by: liuqiang <liuqiang.06@bytedance.com> Co-authored-by: duanyi.aster <duanyi.aster@bytedance.com>
2026-06-20 16:45:22 +08:00 · 2021-12-30 14:51:23 +08:00 · 2021-12-30 14:51:23 +08:00 · 1443eb3bcf
commit 1443eb3bcf
parent c3cb5de704
25 changed files with 2707 additions and 8251 deletions
--- a/.github/workflows/push-check-benchmark.yml
+++ b/.github/workflows/push-check-benchmark.yml
@ -14,7 +14,7 @@ jobs:
      - name: Set up Go
        uses: actions/setup-go@v2
        with:
-          go-version: 1.17
+          go-version: 1.17.1

      - uses: actions/cache@v2
        with:
@ -24,4 +24,4 @@ jobs:
            ${{ runner.os }}-go-

      - name: Benchmark
-        run: GOMAXPROCS=4 go test -bench=. -benchmem -run=none ./...
+        run: sh bench.sh
--- a/decode_float_test.go
+++ b/decode_float_test.go
@ -149,7 +149,7 @@ var atoftests = []atofTest{
 func TestDecodeFloat(t *testing.T) {
 	for i, tt := range atoftests {
 		// default float64
-		var sonicout, stdout interface{}
+		var sonicout, stdout float64
 		sonicerr := decoder.NewDecoder(tt.in).Decode(&sonicout)
 		stderr := json.NewDecoder(strings.NewReader(tt.in)).Decode(&stdout)
 		if !reflect.DeepEqual(sonicout, stdout) {
--- a/decoder/assembler_amd64_go115.go
+++ b/decoder/assembler_amd64_go115.go
@ -70,7 +70,7 @@
     _FP_args   = 96     // 96 bytes to pass arguments and return values for this function
     _FP_fargs  = 80     // 80 bytes for passing arguments to other Go functions
     _FP_saves  = 40     // 40 bytes for saving the registers before CALL instructions
-     _FP_locals = 72     // 72 bytes for local variables
+     _FP_locals = 88     // 88 bytes for local variables
 )
 
 const (
@ -174,14 +174,16 @@
     _VAR_st_Dv = jit.Ptr(_SP, _FP_fargs + _FP_saves + 8)
     _VAR_st_Iv = jit.Ptr(_SP, _FP_fargs + _FP_saves + 16)
     _VAR_st_Ep = jit.Ptr(_SP, _FP_fargs + _FP_saves + 24)
+     _VAR_st_Db = jit.Ptr(_SP, _FP_fargs + _FP_saves + 32)
+     _VAR_st_Dc = jit.Ptr(_SP, _FP_fargs + _FP_saves + 40)
 )
 
 var (
-     _VAR_ss_AX = jit.Ptr(_SP, _FP_fargs + _FP_saves + 32)
-     _VAR_ss_CX = jit.Ptr(_SP, _FP_fargs + _FP_saves + 40)
-     _VAR_ss_SI = jit.Ptr(_SP, _FP_fargs + _FP_saves + 48)
-     _VAR_ss_R8 = jit.Ptr(_SP, _FP_fargs + _FP_saves + 56)
-     _VAR_ss_R9 = jit.Ptr(_SP, _FP_fargs + _FP_saves + 64)
+     _VAR_ss_AX = jit.Ptr(_SP, _FP_fargs + _FP_saves + 48)
+     _VAR_ss_CX = jit.Ptr(_SP, _FP_fargs + _FP_saves + 56)
+     _VAR_ss_SI = jit.Ptr(_SP, _FP_fargs + _FP_saves + 64)
+     _VAR_ss_R8 = jit.Ptr(_SP, _FP_fargs + _FP_saves + 72)
+     _VAR_ss_R9 = jit.Ptr(_SP, _FP_fargs + _FP_saves + 80)
 )
 
 type _Assembler struct {
@ -320,6 +322,10 @@
     self.Emit("MOVQ", _ARG_ic, _IC)                 // MOVQ ic<>+16(FP), IC
     self.Emit("MOVQ", _ARG_vp, _VP)                 // MOVQ vp<>+24(FP), VP
     self.Emit("MOVQ", _ARG_sb, _ST)                 // MOVQ vp<>+32(FP), ST
+     // initialize digital buffer first
+     self.Emit("MOVQ", jit.Imm(_MaxDigitNums), _VAR_st_Dc)    // MOVQ $_MaxDigitNums, ss.Dcap
+     self.Emit("LEAQ", jit.Ptr(_ST, _DbufOffset), _AX)        // LEAQ _DbufOffset(ST), AX
+     self.Emit("MOVQ", _AX, _VAR_st_Db)                       // MOVQ AX, ss.Dbuf
 }
 
 /** Function Calling Helpers **/
@ -578,8 +584,8 @@
 }
 
 func (self *_Assembler) parse_number() {
-     self.call_vf(_F_vnumber)
-     self.check_err()
+    self.call_vf(_F_vnumber)                               // call  vnumber
+    self.check_err()
 }
 
 func (self *_Assembler) parse_signed() {
--- a/decoder/assembler_amd64_go116.go
+++ b/decoder/assembler_amd64_go116.go
@ -71,7 +71,7 @@ const (
    _FP_args   = 96     // 96 bytes to pass arguments and return values for this function
    _FP_fargs  = 80     // 80 bytes for passing arguments to other Go functions
    _FP_saves  = 40     // 40 bytes for saving the registers before CALL instructions
-    _FP_locals = 72    // 72 bytes for local variables
+    _FP_locals = 88    // 88 bytes for local variables
 )

 const (
@ -177,14 +177,16 @@ var (
    _VAR_st_Dv = jit.Ptr(_SP, _FP_fargs + _FP_saves + 8)
    _VAR_st_Iv = jit.Ptr(_SP, _FP_fargs + _FP_saves + 16)
    _VAR_st_Ep = jit.Ptr(_SP, _FP_fargs + _FP_saves + 24)
+    _VAR_st_Db = jit.Ptr(_SP, _FP_fargs + _FP_saves + 32)
+    _VAR_st_Dc = jit.Ptr(_SP, _FP_fargs + _FP_saves + 40)
 )

 var (
-    _VAR_ss_AX = jit.Ptr(_SP, _FP_fargs + _FP_saves + 32)
-    _VAR_ss_CX = jit.Ptr(_SP, _FP_fargs + _FP_saves + 40)
-    _VAR_ss_SI = jit.Ptr(_SP, _FP_fargs + _FP_saves + 48)
-    _VAR_ss_R8 = jit.Ptr(_SP, _FP_fargs + _FP_saves + 56)
-    _VAR_ss_R9 = jit.Ptr(_SP, _FP_fargs + _FP_saves + 64)
+    _VAR_ss_AX = jit.Ptr(_SP, _FP_fargs + _FP_saves + 48)
+    _VAR_ss_CX = jit.Ptr(_SP, _FP_fargs + _FP_saves + 56)
+    _VAR_ss_SI = jit.Ptr(_SP, _FP_fargs + _FP_saves + 64)
+    _VAR_ss_R8 = jit.Ptr(_SP, _FP_fargs + _FP_saves + 72)
+    _VAR_ss_R9 = jit.Ptr(_SP, _FP_fargs + _FP_saves + 80)
 )

 type _Assembler struct {
@ -323,6 +325,10 @@ func (self *_Assembler) prologue() {
    self.Emit("MOVQ", _ARG_ic, _IC)                 // MOVQ ic<>+16(FP), IC
    self.Emit("MOVQ", _ARG_vp, _VP)                 // MOVQ vp<>+24(FP), VP
    self.Emit("MOVQ", _ARG_sb, _ST)                 // MOVQ vp<>+32(FP), ST
+    // initialize digital buffer first
+    self.Emit("MOVQ", jit.Imm(_MaxDigitNums), _VAR_st_Dc)    // MOVQ $_MaxDigitNums, ss.Dcap
+    self.Emit("LEAQ", jit.Ptr(_ST, _DbufOffset), _AX)           // LEAQ _DbufOffset(ST), AX
+    self.Emit("MOVQ", _AX, _VAR_st_Db)                          // MOVQ AX, ss.Dbuf
 }

 /** Function Calling Helpers **/
@ -581,7 +587,7 @@ func (self *_Assembler) parse_string() {
 }

 func (self *_Assembler) parse_number() {
-    self.call_vf(_F_vnumber)
+    self.call_vf(_F_vnumber)                               // call  vnumber
    self.check_err()
 }

--- a/decoder/assembler_amd64_go117.go
+++ b/decoder/assembler_amd64_go117.go
@ -71,7 +71,7 @@ const (
    _FP_args   = 72     // 72 bytes to pass and spill register arguements
    _FP_fargs  = 80     // 80 bytes for passing arguments to other Go functions
    _FP_saves  = 48     // 48 bytes for saving the registers before CALL instructions
-    _FP_locals = 72     // 72 bytes for local variables
+    _FP_locals = 88     // 88 bytes for local variables
 )

 const (
@ -172,14 +172,16 @@ var (
    _VAR_st_Dv = jit.Ptr(_SP, _FP_fargs + _FP_saves + 8)
    _VAR_st_Iv = jit.Ptr(_SP, _FP_fargs + _FP_saves + 16)
    _VAR_st_Ep = jit.Ptr(_SP, _FP_fargs + _FP_saves + 24)
+    _VAR_st_Db = jit.Ptr(_SP, _FP_fargs + _FP_saves + 32)
+    _VAR_st_Dc = jit.Ptr(_SP, _FP_fargs + _FP_saves + 40)
 )

 var (
-    _VAR_ss_AX = jit.Ptr(_SP, _FP_fargs + _FP_saves + 32)
-    _VAR_ss_CX = jit.Ptr(_SP, _FP_fargs + _FP_saves + 40)
-    _VAR_ss_SI = jit.Ptr(_SP, _FP_fargs + _FP_saves + 48)
-    _VAR_ss_R8 = jit.Ptr(_SP, _FP_fargs + _FP_saves + 56)
-    _VAR_ss_R9 = jit.Ptr(_SP, _FP_fargs + _FP_saves + 64)
+    _VAR_ss_AX = jit.Ptr(_SP, _FP_fargs + _FP_saves + 48)
+    _VAR_ss_CX = jit.Ptr(_SP, _FP_fargs + _FP_saves + 56)
+    _VAR_ss_SI = jit.Ptr(_SP, _FP_fargs + _FP_saves + 64)
+    _VAR_ss_R8 = jit.Ptr(_SP, _FP_fargs + _FP_saves + 72)
+    _VAR_ss_R9 = jit.Ptr(_SP, _FP_fargs + _FP_saves + 80)
 )

 type _Assembler struct {
@ -331,6 +333,10 @@ func (self *_Assembler) prologue() {
    self.Emit("MOVQ", jit.Imm(0), _VAR_sv_p)        // MOVQ $0, sv.p<>+48(FP)
    self.Emit("MOVQ", jit.Imm(0), _VAR_sv_n)        // MOVQ $0, sv.n<>+56(FP)
    self.Emit("MOVQ", jit.Imm(0), _VAR_vk)          // MOVQ $0, vk<>+64(FP)
+    // initialize digital buffer first
+    self.Emit("MOVQ", jit.Imm(_MaxDigitNums), _VAR_st_Dc)    // MOVQ $_MaxDigitNums, ss.Dcap
+    self.Emit("LEAQ", jit.Ptr(_ST, _DbufOffset), _AX)        // LEAQ _DbufOffset(ST), AX
+    self.Emit("MOVQ", _AX, _VAR_st_Db)                       // MOVQ AX, ss.Dbuf
 }

 /** Function Calling Helpers **/
--- a/decoder/errors_test.go
+++ b/decoder/errors_test.go
@ -17,7 +17,6 @@
 package decoder

 import (
-    `runtime`
    `testing`

    `github.com/bytedance/sonic/internal/native/types`
@ -61,7 +60,7 @@ func TestErrors_EmptyDescription(t *testing.T) {

 func TestDecoderErrorStackOverflower(t *testing.T) {
    src := `{"a":[]}`
-    N := _MaxStack * runtime.GOMAXPROCS(0)
+    N := _MaxStack
    for i:=0; i<N; i++ {
        var obj map[string]string
        err := NewDecoder(src).Decode(&obj)
--- a/decoder/generic_amd64_go115.go
+++ b/decoder/generic_amd64_go115.go
@ -44,7 +44,7 @@
     _VD_args   = 8      // 8 bytes  for passing arguments to this functions
     _VD_fargs  = 64     // 64 bytes for passing arguments to other Go functions
     _VD_saves  = 40     // 40 bytes for saving the registers before CALL instructions
-     _VD_locals = 40     // 40 bytes for local variables
+     _VD_locals = 56     // 56 bytes for local variables
 )
 
 const (
@ -62,6 +62,8 @@
     _VAR_ss_Dv = jit.Ptr(_SP, _VD_fargs + _VD_saves + 16)
     _VAR_ss_Iv = jit.Ptr(_SP, _VD_fargs + _VD_saves + 24)
     _VAR_ss_Ep = jit.Ptr(_SP, _VD_fargs + _VD_saves + 32)
+     _VAR_ss_Db = jit.Ptr(_SP, _VD_fargs + _VD_saves + 40)
+     _VAR_ss_Dc = jit.Ptr(_SP, _VD_fargs + _VD_saves + 48)
 )
 
 type _ValueDecoder struct {
@ -180,6 +182,11 @@
     /* initialize the state machine */
     self.Emit("XORL", _CX, _CX)                                 // XORL CX, CX
     self.Emit("MOVQ", _DF, _VAR_df)                             // MOVQ DF, df
+     /* initialize digital buffer first */
+     self.Emit("MOVQ", jit.Imm(_MaxDigitNums), _VAR_ss_Dc)       // MOVQ $_MaxDigitNums, ss.Dcap
+     self.Emit("LEAQ", jit.Ptr(_ST, _DbufOffset), _AX)           // LEAQ _DbufOffset(ST), AX
+     self.Emit("MOVQ", _AX, _VAR_ss_Db)                          // MOVQ AX, ss.Dbuf
+     /* add ST offset */
     self.Emit("ADDQ", jit.Imm(_FsmOffset), _ST)                 // ADDQ _FsmOffset, _ST
     self.Emit("MOVQ", _CX, jit.Ptr(_ST, _ST_Sp))                // MOVQ CX, ST.Sp
     self.Emit("MOVQ", _VP, jit.Ptr(_ST, _ST_Vp))                // MOVQ VP, ST.Vp[0]
--- a/decoder/generic_amd64_go116.go
+++ b/decoder/generic_amd64_go116.go
@ -46,7 +46,7 @@ const (
    _VD_args   = 8      // 8 bytes  for passing arguments to this functions
    _VD_fargs  = 64     // 64 bytes for passing arguments to other Go functions
    _VD_saves  = 40     // 40 bytes for saving the registers before CALL instructions
-    _VD_locals = 40     // 40 bytes for local variables
+    _VD_locals = 56     // 56 bytes for local variables
 )

 const (
@ -64,6 +64,8 @@ var (
    _VAR_ss_Dv = jit.Ptr(_SP, _VD_fargs + _VD_saves + 16)
    _VAR_ss_Iv = jit.Ptr(_SP, _VD_fargs + _VD_saves + 24)
    _VAR_ss_Ep = jit.Ptr(_SP, _VD_fargs + _VD_saves + 32)
+    _VAR_ss_Db = jit.Ptr(_SP, _VD_fargs + _VD_saves + 40)
+    _VAR_ss_Dc = jit.Ptr(_SP, _VD_fargs + _VD_saves + 48)
 )

 type _ValueDecoder struct {
@ -182,6 +184,11 @@ func (self *_ValueDecoder) compile() {
    /* initialize the state machine */
    self.Emit("XORL", _CX, _CX)                                 // XORL CX, CX
    self.Emit("MOVQ", _DF, _VAR_df)                             // MOVQ DF, df
+    /* initialize digital buffer first */
+    self.Emit("MOVQ", jit.Imm(_MaxDigitNums), _VAR_ss_Dc)       // MOVQ $_MaxDigitNums, ss.Dcap
+    self.Emit("LEAQ", jit.Ptr(_ST, _DbufOffset), _AX)           // LEAQ _DbufOffset(ST), AX
+    self.Emit("MOVQ", _AX, _VAR_ss_Db)                          // MOVQ AX, ss.Dbuf
+    /* add ST offset */
    self.Emit("ADDQ", jit.Imm(_FsmOffset), _ST)                 // ADDQ _FsmOffset, _ST
    self.Emit("MOVQ", _CX, jit.Ptr(_ST, _ST_Sp))                // MOVQ CX, ST.Sp
    self.WriteRecNotAX(0, _VP, jit.Ptr(_ST, _ST_Vp), false)                // MOVQ VP, ST.Vp[0]
--- a/decoder/generic_amd64_go117.go
+++ b/decoder/generic_amd64_go117.go
@ -46,7 +46,7 @@ const (
    _VD_args   = 8      // 8 bytes  for passing arguments to this functions
    _VD_fargs  = 64     // 64 bytes for passing arguments to other Go functions
    _VD_saves  = 48     // 48 bytes for saving the registers before CALL instructions
-    _VD_locals = 40     // 40 bytes for local variables
+    _VD_locals = 64     // 64 bytes for local variables
 )

 const (
@ -64,10 +64,12 @@ var (
    _VAR_ss_Dv = jit.Ptr(_SP, _VD_fargs + _VD_saves + 16)
    _VAR_ss_Iv = jit.Ptr(_SP, _VD_fargs + _VD_saves + 24)
    _VAR_ss_Ep = jit.Ptr(_SP, _VD_fargs + _VD_saves + 32)
+    _VAR_ss_Db = jit.Ptr(_SP, _VD_fargs + _VD_saves + 40)
+    _VAR_ss_Dc = jit.Ptr(_SP, _VD_fargs + _VD_saves + 48)
 )

 var (
-    _VAR_R9 = jit.Ptr(_SP, _VD_fargs + _VD_saves +40)
+    _VAR_R9 = jit.Ptr(_SP, _VD_fargs + _VD_saves + 56)
 )
 type _ValueDecoder struct {
    jit.BaseAssembler
@ -197,6 +199,11 @@ func (self *_ValueDecoder) compile() {
    /* initialize the state machine */
    self.Emit("XORL", _CX, _CX)                                 // XORL CX, CX
    self.Emit("MOVQ", _DF, _VAR_df)                             // MOVQ DF, df
+    /* initialize digital buffer first */
+    self.Emit("MOVQ", jit.Imm(_MaxDigitNums), _VAR_ss_Dc)       // MOVQ $_MaxDigitNums, ss.Dcap
+    self.Emit("LEAQ", jit.Ptr(_ST, _DbufOffset), _AX)           // LEAQ _DbufOffset(ST), AX
+    self.Emit("MOVQ", _AX, _VAR_ss_Db)                          // MOVQ AX, ss.Dbuf
+    /* add ST offset */
    self.Emit("ADDQ", jit.Imm(_FsmOffset), _ST)                 // ADDQ _FsmOffset, _ST
    self.Emit("MOVQ", _CX, jit.Ptr(_ST, _ST_Sp))                // MOVQ CX, ST.Sp
    self.WriteRecNotAX(0, _VP, jit.Ptr(_ST, _ST_Vp), false)                // MOVQ VP, ST.Vp[0]
--- a/decoder/pools.go
+++ b/decoder/pools.go
@ -30,12 +30,14 @@ const (
    _MinSlice = 16
    _MaxStack = 65536 // 64k slots
    _MaxStackBytes = _MaxStack * _PtrBytes
+    _MaxDigitNums = 800  // used in atof fallback algorithm
 )

 const (
-    _PtrBytes  = _PTR_SIZE / 8
-    _FsmOffset = (_MaxStack + 1) * _PtrBytes
-    _StackSize = unsafe.Sizeof(_Stack{})
+    _PtrBytes   = _PTR_SIZE / 8
+    _FsmOffset  = (_MaxStack + 1) * _PtrBytes
+    _DbufOffset = _FsmOffset + int64(unsafe.Sizeof(types.StateMachine{})) + types.MAX_RECURSE * _PtrBytes
+    _StackSize  = unsafe.Sizeof(_Stack{})
 )

 var (
@ -51,6 +53,7 @@ type _Stack struct {
    sb [_MaxStack]unsafe.Pointer
    mm types.StateMachine
    vp [types.MAX_RECURSE]unsafe.Pointer
+    dp [_MaxDigitNums]byte
 }

 type _Decoder func(
--- a/decoder/primitives.go
+++ b/decoder/primitives.go
@ -17,17 +17,19 @@
 package decoder

 import (
-	`encoding`
-	`encoding/json`
-	`unsafe`
+    `encoding`
+    `encoding/json`
+    `unsafe`

-	`github.com/bytedance/sonic/internal/rt`
+    `github.com/bytedance/sonic/internal/native`
+    `github.com/bytedance/sonic/internal/rt`
 )

 func decodeTypedPointer(s string, i int, vt *rt.GoType, vp unsafe.Pointer, sb *_Stack, fv uint64) (int, error) {
    if fn, err := findOrCompile(vt); err != nil {
        return 0, err
    } else {
+        rt.MoreStack(_FP_size + _VD_size + native.MaxFrameSize)
        return fn(s, i, vp, sb, fv, "", nil)
    }
 }
--- a/encoder/primitives.go
+++ b/encoder/primitives.go
@ -71,8 +71,10 @@ func encodeTypedPointer(buf *[]byte, vt *rt.GoType, vp *unsafe.Pointer, sb *_Sta
    } else if fn, err := findOrCompile(vt); err != nil {
        return err
    } else if (vt.KindFlags & rt.F_direct) == 0 {
+        rt.MoreStack(_FP_size + native.MaxFrameSize)
        return fn(buf, *vp, sb, fv)
    } else {
+        rt.MoreStack(_FP_size + native.MaxFrameSize)
        return fn(buf, unsafe.Pointer(vp), sb, fv)
    }
 }
--- a/internal/native/avx/native_amd64.s
+++ b/internal/native/avx/native_amd64.s
--- a/internal/native/avx/native_subr_amd64.go
+++ b/internal/native/avx/native_subr_amd64.go
@ -14,16 +14,34 @@ var (
    _subr__lspace      = __native_entry__() + 301
    _subr__lzero       = __native_entry__() + 13
    _subr__quote       = __native_entry__() + 4955
-    _subr__skip_array  = __native_entry__() + 17304
-    _subr__skip_object = __native_entry__() + 17339
-    _subr__skip_one    = __native_entry__() + 15525
+    _subr__skip_array  = __native_entry__() + 16074
+    _subr__skip_object = __native_entry__() + 16109
+    _subr__skip_one    = __native_entry__() + 14295
    _subr__u64toa      = __native_entry__() + 3735
    _subr__unquote     = __native_entry__() + 5888
-    _subr__value       = __native_entry__() + 10928
-    _subr__vnumber     = __native_entry__() + 13724
-    _subr__vsigned     = __native_entry__() + 14997
-    _subr__vstring     = __native_entry__() + 12689
-    _subr__vunsigned   = __native_entry__() + 15256
+    _subr__value       = __native_entry__() + 9657
+    _subr__vnumber     = __native_entry__() + 12453
+    _subr__vsigned     = __native_entry__() + 13767
+    _subr__vstring     = __native_entry__() + 11418
+    _subr__vunsigned   = __native_entry__() + 14026
+)
+
+const (
+    _stack__f64toa = 120
+    _stack__i64toa = 24
+    _stack__lspace = 8
+    _stack__lzero = 8
+    _stack__quote = 64
+    _stack__skip_array = 136
+    _stack__skip_object = 136
+    _stack__skip_one = 136
+    _stack__u64toa = 8
+    _stack__unquote = 88
+    _stack__value = 400
+    _stack__vnumber = 312
+    _stack__vsigned = 16
+    _stack__vstring = 128
+    _stack__vunsigned = 8
 )

 var (
@ -43,3 +61,21 @@ var (
    _ = _subr__vstring
    _ = _subr__vunsigned
 )
+
+const (
+    _ = _stack__f64toa
+    _ = _stack__i64toa
+    _ = _stack__lspace
+    _ = _stack__lzero
+    _ = _stack__quote
+    _ = _stack__skip_array
+    _ = _stack__skip_object
+    _ = _stack__skip_one
+    _ = _stack__u64toa
+    _ = _stack__unquote
+    _ = _stack__value
+    _ = _stack__vnumber
+    _ = _stack__vsigned
+    _ = _stack__vstring
+    _ = _stack__vunsigned
+)
--- a/internal/native/avx2/native_amd64.s
+++ b/internal/native/avx2/native_amd64.s
--- a/internal/native/avx2/native_subr_amd64.go
+++ b/internal/native/avx2/native_subr_amd64.go
@ -14,16 +14,34 @@ var (
    _subr__lspace      = __native_entry__() + 429
    _subr__lzero       = __native_entry__() + 13
    _subr__quote       = __native_entry__() + 5328
-    _subr__skip_array  = __native_entry__() + 20330
-    _subr__skip_object = __native_entry__() + 20365
-    _subr__skip_one    = __native_entry__() + 17473
+    _subr__skip_array  = __native_entry__() + 19163
+    _subr__skip_object = __native_entry__() + 19198
+    _subr__skip_one    = __native_entry__() + 16306
    _subr__u64toa      = __native_entry__() + 4008
    _subr__unquote     = __native_entry__() + 7125
-    _subr__value       = __native_entry__() + 13020
-    _subr__vnumber     = __native_entry__() + 15672
-    _subr__vsigned     = __native_entry__() + 16945
-    _subr__vstring     = __native_entry__() + 14795
-    _subr__vunsigned   = __native_entry__() + 17204
+    _subr__value       = __native_entry__() + 11812
+    _subr__vnumber     = __native_entry__() + 14464
+    _subr__vsigned     = __native_entry__() + 15778
+    _subr__vstring     = __native_entry__() + 13587
+    _subr__vunsigned   = __native_entry__() + 16037
+)
+
+const (
+    _stack__f64toa = 120
+    _stack__i64toa = 24
+    _stack__lspace = 8
+    _stack__lzero = 8
+    _stack__quote = 80
+    _stack__skip_array = 128
+    _stack__skip_object = 128
+    _stack__skip_one = 128
+    _stack__u64toa = 8
+    _stack__unquote = 72
+    _stack__value = 392
+    _stack__vnumber = 312
+    _stack__vsigned = 16
+    _stack__vstring = 112
+    _stack__vunsigned = 8
 )

 var (
@ -43,3 +61,21 @@ var (
    _ = _subr__vstring
    _ = _subr__vunsigned
 )
+
+const (
+    _ = _stack__f64toa
+    _ = _stack__i64toa
+    _ = _stack__lspace
+    _ = _stack__lzero
+    _ = _stack__quote
+    _ = _stack__skip_array
+    _ = _stack__skip_object
+    _ = _stack__skip_one
+    _ = _stack__u64toa
+    _ = _stack__unquote
+    _ = _stack__value
+    _ = _stack__vnumber
+    _ = _stack__vsigned
+    _ = _stack__vstring
+    _ = _stack__vunsigned
+)
--- a/internal/native/dispatch_amd64.go
+++ b/internal/native/dispatch_amd64.go
@ -25,6 +25,8 @@ import (
    `github.com/bytedance/sonic/internal/native/types`
 )

+const MaxFrameSize uintptr = 400
+
 var (
    S_f64toa uintptr
    S_i64toa uintptr
--- a/internal/native/types/types.go
+++ b/internal/native/types/types.go
@ -92,9 +92,11 @@ func (self ParsingError) Message() string {

 type JsonState struct {
    Vt ValueType
-    Dv float64
-    Iv int64
-    Ep int
+    Dv   float64
+    Iv   int64
+    Ep   int
+    Dbuf *byte
+    Dcap int
 }

 type StateMachine struct {
--- a/internal/rt/asm.s
+++ b/internal/rt/asm.s
@ -0,0 +1,20 @@
+// +build !noasm !appengine
+// Code generated by asm2asm, DO NOT EDIT.
+
+#include "go_asm.h"
+#include "funcdata.h"
+#include "textflag.h"
+
+TEXT ·MoreStack(SB), NOSPLIT, $0 - 8
+    NO_LOCAL_POINTERS
+_entry:
+    MOVQ (TLS), R14
+    MOVQ size+0(FP), R12
+    NOTQ R12
+    LEAQ (SP)(R12*1), R12
+    CMPQ R12, 16(R14)
+	JBE  _stack_grow
+    RET
+_stack_grow:
+	CALL runtime·morestack_noctxt<>(SB)
+	JMP  _entry
--- a/internal/rt/fastmem.go
+++ b/internal/rt/fastmem.go
@ -49,3 +49,6 @@ func Str2Mem(s string) (v []byte) {
    (*GoSlice)(unsafe.Pointer(&v)).Ptr = (*GoString)(unsafe.Pointer(&s)).Ptr
    return
 }
+
+//go:nosplit
+func MoreStack(size uintptr)
--- a/native/atof_eisel_lemire.c
+++ b/native/atof_eisel_lemire.c
@ -22,25 +22,10 @@ typedef struct u128_output {
    uint64_t lo;
 } u128_output;

-static const uint8_t U8_LEN_TAB[256];
 static const uint64_t POW10_M128_TAB[697][2];

-static inline int count_len_u64(uint64_t val) {
-    if (val >> 32) {
-        return count_len_u64(val >> 32) + 32;
-    }
-    if (val >> 16) {
-        return count_len_u64(val >> 16) + 16;
-    }
-    if (val >> 8) {
-        return count_len_u64(val >> 8) + 8;
-    }
-
-    return U8_LEN_TAB[val];
-}
-
-static inline int count_leading_zeros_u64(uint64_t val) {
-    return 64 - count_len_u64(val);
+static inline int count_leading_zeroes_u64(uint64_t u) {
+    return u ? __builtin_clzl(u) : 64;
 }

 static inline u128_output mul_u64(uint64_t x, uint64_t y) {
@ -60,7 +45,7 @@ bool atof_eisel_lemire64(uint64_t mant, int exp10, int sgn, double *val) {
    }

    /* Calculate the 2-base exponent of float */
-    int clz = count_leading_zeros_u64(mant);
+    int clz = count_leading_zeroes_u64(mant);
    mant <<= clz;
    /* lg10/lg2 ≈ 217706>>16 */
    uint64_t ret_exp2 = ((uint64_t)((217706 * exp10) >> 16) + 64 + 1023) - ((uint64_t)clz);
@ -134,27 +119,6 @@ bool atof_eisel_lemire64(uint64_t mant, int exp10, int sgn, double *val) {
    return true;
 }

-
-/* the len of significant digits for unit8 */
-static const uint8_t U8_LEN_TAB[256] = {
-    0x00, 0x01, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04,
-    0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
-    0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06,
-    0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06,
-    0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07,
-    0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07,
-    0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07,
-    0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07,
-    0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08,
-    0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08,
-    0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08,
-    0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08,
-    0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08,
-    0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08,
-    0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08,
-    0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08,
-};
-    
 /* Including 128-bit mantissa approximations (rounded down) of the powers of 10.
 * For example: 
 * 1e-348 ≈ 0xFA8FD5A0081C0288_1732C869CD60E453 * (2 ** (-348 * lg10 / lg2 - 127)).
--- a/native/atof_native.c
+++ b/native/atof_native.c
@ -16,7 +16,6 @@

 #include "native.h"

-#define DECIMAL_MAX_DNUM 800
 /* decimical shift witout overflow, e.g. 9 << 61 overflow */
 #define MAX_SHIFT 60

@ -26,11 +25,12 @@
 * example 3: 999   {"999", 3, 3, 0}
 */
 typedef struct Decimal {
-    char  d[DECIMAL_MAX_DNUM];
-    int   nd;
-    int   dp;
-    int   neg;
-    int   trunc;
+    char*  d;
+    size_t cap;
+    int    nd;
+    int    dp;
+    int    neg;
+    int    trunc;
 } Decimal;

 /* decimal power of ten to binary power of two.
@ -44,7 +44,7 @@ static const int POW_TAB[9] = {1, 3, 6, 9, 13, 16, 19, 23, 26};
 */
 typedef struct lshift_cheat  {
    int   delta;                             // number of added digits when left shift
-    const char  cutoff[DECIMAL_MAX_DNUM];    // minus one digit if under the half(cutoff).
+    const char  cutoff[100];                 // minus one digit if under the half(cutoff).
 } lshift_cheat;

 /* Look up for the decimal shift information by binary shift bits.
@ -56,8 +56,10 @@ typedef struct lshift_cheat  {
 */
 const static lshift_cheat LSHIFT_TAB[61];

-static inline void decimal_init(Decimal *d) {
-    for (int i = 0; i < DECIMAL_MAX_DNUM; ++i) {
+static inline void decimal_init(Decimal *d, char *dbuf, size_t cap) {
+    d->d = dbuf;
+    d->cap = cap;
+    for (int i = 0; i < d->cap; ++i) {
        d->d[i] = 0;
    }
    d->dp    = 0;
@ -66,10 +68,10 @@ static inline void decimal_init(Decimal *d) {
    d->trunc = 0;
 }

-static inline void decimal_set(Decimal *d, const char *s, int len) {
+static inline void decimal_set(Decimal *d, const char *s, ssize_t len, char *dbuf, ssize_t cap) {
    int i = 0;

-    decimal_init(d);
+    decimal_init(d, dbuf, cap);
    if (s[i] == '-') {
        i++;
        d->neg = 1;
@ -82,7 +84,7 @@ static inline void decimal_set(Decimal *d, const char *s, int len) {
                d->dp--;
                continue;
            }
-            if (d->nd < DECIMAL_MAX_DNUM) {
+            if (d->nd < d->cap) {
                d->d[d->nd] = s[i];
                d->nd++;
            } else if (s[i] != '0') {
@ -173,7 +175,7 @@ static inline void right_shift(Decimal *d, uint32_t k) {
    while (n > 0) {
        dig = n >> k;
        n &= mask;
-        if (w < DECIMAL_MAX_DNUM) {
+        if (w < d->cap) {
            d->d[w] = (char)(dig + '0');
            w++;
        } else if (dig > 0) {
@ -221,7 +223,7 @@ static inline void left_shift(Decimal *d, uint32_t k) {
        quo = n / 10;
        rem = n - 10 * quo;
        w--;
-        if (w < DECIMAL_MAX_DNUM) {
+        if (w < d->cap) {
            d->d[w] = (char)(rem + '0');
        } else if (rem != 0) {
            /* truncated */
@ -235,7 +237,7 @@ static inline void left_shift(Decimal *d, uint32_t k) {
        quo = n / 10;
        rem = n - 10 * quo;
        w--;
-        if (w < DECIMAL_MAX_DNUM) {
+        if (w < d->cap) {
            d->d[w] = (char)(rem + '0');
        } else if (rem != 0) {
            /* truncated */
@ -245,8 +247,8 @@ static inline void left_shift(Decimal *d, uint32_t k) {
    }

    d->nd += delta;
-    if (d->nd >= DECIMAL_MAX_DNUM) {
-        d->nd = DECIMAL_MAX_DNUM;
+    if (d->nd >= d->cap) {
+        d->nd = d->cap;
    }
    d->dp += delta;
    trim(d);
@ -413,15 +415,14 @@ out:
    return 0;
 }

-double atof_native_decimal(const char *buf, int len) {
+double atof_native(const char *sp, ssize_t nb, char* dbuf, ssize_t cap) {
    Decimal d;
    double val = 0;
-    decimal_set(&d, buf, len);
+    decimal_set(&d, sp, nb, dbuf, cap);
    decimal_to_f64(&d, &val);
    return val;
 }

-#undef DECIMAL_MAX_DNUM
 #undef MAX_SHIFT

 const static lshift_cheat LSHIFT_TAB[61] = {
--- a/native/native.h
+++ b/native/native.h
@ -35,6 +35,7 @@
 #define V_ELEM_SEP      11
 #define V_ARRAY_END     12
 #define V_OBJECT_END    13
+#define V_ATOF_NEED_FALLBACK 14

 #define F_DBLUNQ        (1 << 0)
 #define F_UNIREP        (1 << 1)
@ -56,11 +57,14 @@

 #define likely(v)       (__builtin_expect((v), 1))
 #define unlikely(v)     (__builtin_expect((v), 0))
+#define always_inline   inline __attribute__((always_inline)) 

 #define as_m128p(v)     ((__m128i *)(v))
 #define as_m128c(v)     ((const __m128i *)(v))
 #define as_m256c(v)     ((const __m256i *)(v))
 #define as_m128v(v)     (*(const __m128i *)(v))
+#define as_uint64v(p)   (*(uint64_t *)(p))
+#define is_infinity(v)  ((as_uint64v(&v) << 1) == 0xFFE0000000000000)

 typedef struct {
    char * buf;
@ -84,6 +88,8 @@ typedef struct {
    double  dv;
    int64_t iv;
    int64_t ep;
+    char*   dbuf;
+    ssize_t dcap;
 } JsonState;

 typedef struct {
@ -116,6 +122,6 @@ long skip_negative(const GoString *src, long *p);
 long skip_positive(const GoString *src, long *p);

 bool atof_eisel_lemire64(uint64_t mant, int exp10, int sgn, double *val);
-double atof_native_decimal(const char *buf, int len);
+double atof_native(const char *sp, ssize_t nb, char* dbuf, ssize_t cap);

 #endif
--- a/native/scanning.c
+++ b/native/scanning.c
@ -528,27 +528,22 @@ static inline bool is_atof_exact(uint64_t man, int exp, int sgn, double *val) {
    return false;
 }

-static inline double parse_float64(uint64_t man, int exp, int sgn, int trunc, const GoString *src, long idx) {
-    double val    = 0.0;
+static inline double atof_fast(uint64_t man, int exp, int sgn, int trunc, double *val) {
    double val_up = 0.0;

    /* look-up for fast atof if the conversion can be exactly */
-    if (is_atof_exact(man, exp, sgn, &val)) {
-        return val;
+    if (is_atof_exact(man, exp, sgn, val)) {
+        return true;
    }

    /* A fast atof algorithm for high percison */
-    if (atof_eisel_lemire64(man, exp, sgn, &val)) {
-        if (!trunc) {
-            return val;
-        }
-        if (atof_eisel_lemire64(man+1, exp, sgn, &val_up) && val_up == val) {
-            return val;
+    if (atof_eisel_lemire64(man, exp, sgn, val)) {
+        if (!trunc || (atof_eisel_lemire64(man+1, exp, sgn, &val_up) && val_up == *val)) {
+            return true;
        }
    }

-    /* when above algorithms failed, fallback. It is slow. */
-    return atof_native_decimal(src->buf + idx, src->len - idx);
+    return false;
 }

 static bool inline is_overflow(uint64_t man, int sgn, int exp10) {
@ -564,12 +559,14 @@ void vnumber(const GoString *src, long *p, JsonState *ret) {
    int   man_nd = 0; // # digits of mantissa, 10 ^ 19 fits uint64_t
    int    exp10 = 0; // val = sgn * man * 10 ^ exp10
    int    trunc = 0;
+    double   val = 0;

    /* initial buffer pointers */
    long         i = *p;
    size_t       n = src->len;
    const char * s = src->buf;
-    long        si = *p; // record the idx for fall-back when parsing float.
+    char     *dbuf = ret->dbuf;
+    ssize_t   dcap = ret->dcap;

    /* initialize the result, and check for EOF */
    init_ret(V_INTEGER)
@ -600,11 +597,10 @@ void vnumber(const GoString *src, long *p, JsonState *ret) {

    /* skip the leading zeros of 0.000xxxx */
    if (man == 0 && exp10 == 0) {
-        int idx = i;
        while (i < n && s[i] == '0') {
            i++;
+            exp10--;
        }
-        exp10 = idx - i;
        man = 0;
        man_nd = 0;
    }
@ -657,12 +653,18 @@ void vnumber(const GoString *src, long *p, JsonState *ret) {
    }

 parse_float:
-    ret->dv = parse_float64(man, exp10, sgn, trunc, src, si);
-    /* if the float number is infinity */
-    if (((*(uint64_t *)&ret->dv) << 1) == 0xFFE0000000000000) {
+    /* when fast algorithms failed, use slow fallback.*/
+    if(!atof_fast(man, exp10, sgn, trunc, &val)) {
+        val = atof_native(s + *p, i - *p, dbuf, dcap);
+    }
+
+    /* check parsed double val */
+    if (is_infinity(val)) {
        ret->vt = -ERR_FLOAT_INF;
    }
+
    /* update the result */
+    ret->dv = val;
    *p = i;
 }

--- a/tools/asm2asm
+++ b/tools/asm2asm
@ -1 +1 @@
-Subproject commit a9988b2b8191ac9b8bc879ff8db18c650753a067
+Subproject commit 09224ab8c109bdb8da13af04abd7c01cb6e38d87