fix: enhance float parsing as Go encoding/json

2026-06-21 00:46:43 +08:00 · 2021-07-21 18:52:23 +08:00 · 2021-07-21 18:52:23 +08:00 · f9632ab873
commit f9632ab873
parent 6b4022a19f
7 changed files with 3164 additions and 3586 deletions
--- a/decode_float_test.go
+++ b/decode_float_test.go
@ -0,0 +1,158 @@
+/*
+ * Copyright 2021 ByteDance Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package sonic
+
+import (
+	"encoding/json"
+	"reflect"
+	"strings"
+	"testing"
+
+	"github.com/bytedance/sonic/decoder"
+)
+
+// atofTest is one row of the float-parsing table below.
+type atofTest struct {
+	in  string // number text handed to both the sonic and the encoding/json decoder
+	out string // expected formatted value carried over from the strconv table; not read by TestDecodeFloat
+	err error  // expected error carried over from the strconv table; not read by TestDecodeFloat
+}
+
+// Inputs taken from the Go strconv package tests, https://github.com/golang/go/blob/master/src/strconv/atof_test.go
+// Each input is decoded by both sonic and Go encoding/json, and the two results must agree.
+var atoftests = []atofTest{
+	{"1.234e", "1.234", nil},
+	{"1i", "1", nil},
+	{"1", "1", nil},
+	{"1e23", "1e+23", nil},
+	{"1E23", "1e+23", nil},
+	{"100000000000000000000000", "1e+23", nil},
+	{"1e-100", "1e-100", nil},
+	{"123456700", "1.234567e+08", nil},
+	{"99999999999999974834176", "9.999999999999997e+22", nil},
+	{"100000000000000000000001", "1.0000000000000001e+23", nil},
+	{"100000000000000008388608", "1.0000000000000001e+23", nil},
+	{"100000000000000016777215", "1.0000000000000001e+23", nil},
+	{"100000000000000016777216", "1.0000000000000003e+23", nil},
+	{"-1", "-1", nil},
+	{"-0.1", "-0.1", nil},
+	{"-0", "-0", nil},
+	{"1e-20", "1e-20", nil},
+	{"625e-3", "0.625", nil},
+
+	// zeros
+	{"0", "0", nil},
+	{"0e0", "0", nil},
+	{"-0e0", "-0", nil},
+	{"0e-0", "0", nil},
+	{"-0e-0", "-0", nil},
+	{"0e+0", "0", nil},
+	{"-0e+0", "-0", nil},
+	{"0e+01234567890123456789", "0", nil},
+	{"0.00e-01234567890123456789", "0", nil},
+	{"-0e+01234567890123456789", "-0", nil},
+	{"-0.00e-01234567890123456789", "-0", nil},
+
+	{"0e291", "0", nil}, // issue 15364
+	{"0e292", "0", nil}, // issue 15364
+	{"0e347", "0", nil}, // issue 15364
+	{"0e348", "0", nil}, // issue 15364
+	{"-0e291", "-0", nil},
+	{"-0e292", "-0", nil},
+	{"-0e347", "-0", nil},
+	{"-0e348", "-0", nil},
+
+	// largest float64
+	{"1.7976931348623157e308", "1.7976931348623157e+308", nil},
+	{"-1.7976931348623157e308", "-1.7976931348623157e+308", nil},
+
+	// the border is ...158079
+	// borderline - okay
+	{"1.7976931348623158e308", "1.7976931348623157e+308", nil},
+	{"-1.7976931348623158e308", "-1.7976931348623157e+308", nil},
+
+	// large, but still below the largest float64 (about 1.797e308)
+	{"1e308", "1e+308", nil},
+
+	// denormalized
+	{"1e-305", "1e-305", nil},
+	{"1e-306", "1e-306", nil},
+	{"1e-307", "1e-307", nil},
+	{"1e-308", "1e-308", nil},
+	{"1e-309", "1e-309", nil},
+	{"1e-310", "1e-310", nil},
+	{"1e-322", "1e-322", nil},
+	// smallest denormal
+	{"5e-324", "5e-324", nil},
+	{"4e-324", "5e-324", nil},
+	{"3e-324", "5e-324", nil},
+	// too small
+	{"2e-324", "0", nil},
+	// way too small
+	{"1e-350", "0", nil},
+	{"1e-400000", "0", nil},
+
+	// try to overflow exponent
+	{"1e-4294967296", "0", nil},
+	{"1e-18446744073709551616", "0", nil},
+
+	// https://www.exploringbinary.com/java-hangs-when-converting-2-2250738585072012e-308/
+	{"2.2250738585072012e-308", "2.2250738585072014e-308", nil},
+	// https://www.exploringbinary.com/php-hangs-on-numeric-value-2-2250738585072011e-308/
+	{"2.2250738585072011e-308", "2.225073858507201e-308", nil},
+
+	// A very large number (initially wrongly parsed by the fast algorithm).
+	{"4.630813248087435e+307", "4.630813248087435e+307", nil},
+
+	// A different kind of very large number.
+	{"22.222222222222222", "22.22222222222222", nil},
+	{"2." + strings.Repeat("2", 800) + "e+1", "22.22222222222222", nil},
+
+	// Exactly halfway between 1 and math.Nextafter(1, 2).
+	// Round to even (down).
+	{"1.00000000000000011102230246251565404236316680908203125", "1", nil},
+	// Slightly lower; still round down.
+	{"1.00000000000000011102230246251565404236316680908203124", "1", nil},
+	// Slightly higher; round up.
+	{"1.00000000000000011102230246251565404236316680908203126", "1.0000000000000002", nil},
+	// Slightly higher, but you have to read all the way to the end.
+	{"1.00000000000000011102230246251565404236316680908203125" + strings.Repeat("0", 10000) + "1", "1.0000000000000002", nil},
+
+	// Halfway between x := math.Nextafter(1, 2) and math.Nextafter(x, 2)
+	// Round to even (up).
+	{"1.00000000000000033306690738754696212708950042724609375", "1.0000000000000004", nil},
+
+	// Halfway between 1090544144181609278303144771584 and 1090544144181609419040633126912
+	// (15497564393479157p+46, should round to even 15497564393479156p+46, issue 36657)
+	{"1090544144181609348671888949248", "1.0905441441816093e+30", nil},
+	// slightly above, rounds up
+	{"1090544144181609348835077142190", "1.0905441441816094e+30", nil},
+}
+
+// TestDecodeFloat decodes every atoftests input with both sonic and
+// encoding/json and requires the two to agree on whether an error occurred
+// and on the decoded value. encoding/json is the reference: the out and err
+// columns of the table are not consulted.
+func TestDecodeFloat(t *testing.T) {
+	for _, tt := range atoftests {
+		// Fresh targets for every case: a decode that fails without storing a
+		// value must not leave the previous case's result behind, otherwise
+		// both sides would compare equal on stale data.
+		var sonicout, stdout interface{}
+		// interface{} targets: numbers decode as float64 by default.
+		sonicerr := decoder.NewDecoder(tt.in).Decode(&sonicout)
+		stderr := json.NewDecoder(strings.NewReader(tt.in)).Decode(&stdout)
+		// Compare the error outcome first so a mismatch is reported as such
+		// instead of as a confusing value difference.
+		if (sonicerr == nil) != (stderr == nil) {
+			t.Fatalf("Test %#v\ngot:\n   %#v\nexp:\n   %#v\n", tt.in, sonicerr, stderr)
+		}
+		if !reflect.DeepEqual(sonicout, stdout) {
+			t.Fatalf("Test %#v\ngot:\n   %#v\nexp:\n   %#v\n", tt.in, sonicout, stdout)
+		}
+	}
+}
--- a/internal/native/avx/native_amd64.s
+++ b/internal/native/avx/native_amd64.s
--- a/internal/native/avx/native_subr_amd64.go
+++ b/internal/native/avx/native_subr_amd64.go
@ -19,16 +19,16 @@ var (
    _subr__lspace      = **(**uintptr)(unsafe.Pointer(&_func__base)) + 238
    _subr__lzero       = **(**uintptr)(unsafe.Pointer(&_func__base)) + 0
    _subr__quote       = **(**uintptr)(unsafe.Pointer(&_func__base)) + 4951
-    _subr__skip_array  = **(**uintptr)(unsafe.Pointer(&_func__base)) + 16916
-    _subr__skip_object = **(**uintptr)(unsafe.Pointer(&_func__base)) + 16951
-    _subr__skip_one    = **(**uintptr)(unsafe.Pointer(&_func__base)) + 14594
+    _subr__skip_array  = **(**uintptr)(unsafe.Pointer(&_func__base)) + 16081
+    _subr__skip_object = **(**uintptr)(unsafe.Pointer(&_func__base)) + 16116
+    _subr__skip_one    = **(**uintptr)(unsafe.Pointer(&_func__base)) + 13759
    _subr__u64toa      = **(**uintptr)(unsafe.Pointer(&_func__base)) + 3731
    _subr__unquote     = **(**uintptr)(unsafe.Pointer(&_func__base)) + 5972
-    _subr__value       = **(**uintptr)(unsafe.Pointer(&_func__base)) + 10112
-    _subr__vnumber     = **(**uintptr)(unsafe.Pointer(&_func__base)) + 12979
-    _subr__vsigned     = **(**uintptr)(unsafe.Pointer(&_func__base)) + 14044
-    _subr__vstring     = **(**uintptr)(unsafe.Pointer(&_func__base)) + 11718
-    _subr__vunsigned   = **(**uintptr)(unsafe.Pointer(&_func__base)) + 14321
+    _subr__value       = **(**uintptr)(unsafe.Pointer(&_func__base)) + 9426
+    _subr__vnumber     = **(**uintptr)(unsafe.Pointer(&_func__base)) + 11985
+    _subr__vsigned     = **(**uintptr)(unsafe.Pointer(&_func__base)) + 13209
+    _subr__vstring     = **(**uintptr)(unsafe.Pointer(&_func__base)) + 11032
+    _subr__vunsigned   = **(**uintptr)(unsafe.Pointer(&_func__base)) + 13486
 )

 var (
--- a/internal/native/avx2/native_amd64.s
+++ b/internal/native/avx2/native_amd64.s
--- a/internal/native/avx2/native_subr_amd64.go
+++ b/internal/native/avx2/native_subr_amd64.go
@ -19,16 +19,16 @@ var (
    _subr__lspace      = **(**uintptr)(unsafe.Pointer(&_func__base)) + 366
    _subr__lzero       = **(**uintptr)(unsafe.Pointer(&_func__base)) + 0
    _subr__quote       = **(**uintptr)(unsafe.Pointer(&_func__base)) + 5299
-    _subr__skip_array  = **(**uintptr)(unsafe.Pointer(&_func__base)) + 19341
-    _subr__skip_object = **(**uintptr)(unsafe.Pointer(&_func__base)) + 19376
-    _subr__skip_one    = **(**uintptr)(unsafe.Pointer(&_func__base)) + 16541
+    _subr__skip_array  = **(**uintptr)(unsafe.Pointer(&_func__base)) + 18501
+    _subr__skip_object = **(**uintptr)(unsafe.Pointer(&_func__base)) + 18536
+    _subr__skip_one    = **(**uintptr)(unsafe.Pointer(&_func__base)) + 15701
    _subr__u64toa      = **(**uintptr)(unsafe.Pointer(&_func__base)) + 3979
    _subr__unquote     = **(**uintptr)(unsafe.Pointer(&_func__base)) + 7136
-    _subr__value       = **(**uintptr)(unsafe.Pointer(&_func__base)) + 12070
-    _subr__vnumber     = **(**uintptr)(unsafe.Pointer(&_func__base)) + 14926
-    _subr__vsigned     = **(**uintptr)(unsafe.Pointer(&_func__base)) + 15991
-    _subr__vstring     = **(**uintptr)(unsafe.Pointer(&_func__base)) + 13781
-    _subr__vunsigned   = **(**uintptr)(unsafe.Pointer(&_func__base)) + 16268
+    _subr__value       = **(**uintptr)(unsafe.Pointer(&_func__base)) + 11379
+    _subr__vnumber     = **(**uintptr)(unsafe.Pointer(&_func__base)) + 13927
+    _subr__vsigned     = **(**uintptr)(unsafe.Pointer(&_func__base)) + 15151
+    _subr__vstring     = **(**uintptr)(unsafe.Pointer(&_func__base)) + 13090
+    _subr__vunsigned   = **(**uintptr)(unsafe.Pointer(&_func__base)) + 15428
 )

 var (
--- a/native/atof_native.c
+++ b/native/atof_native.c
@ -23,16 +23,17 @@
 /* Decimal represent the integer or float
 * example 1: 1.1   {"11", 2, 1, 0}
 * example 2: -0.1  {"1", 1, 0, 1}
- * example 3: 999   {"999", 3, 3, 0} 
+ * example 3: 999   {"999", 3, 3, 0}
 */
 typedef struct Decimal {
    char  d[DECIMAL_MAX_DNUM];
    int   nd;
    int   dp;
-    int   neg; 
+    int   neg;
+    int   trunc;
 } Decimal;

-/* decimal power of ten to binary power of two. 
+/* decimal power of ten to binary power of two.
 * For example: POW_TAB[1]: 10 ** 1 ~ 2 ** 3
 */
 static const int POW_TAB[9] = {1, 3, 6, 9, 13, 16, 19, 23, 26};
@ -50,7 +51,7 @@ typedef struct lshift_cheat  {
 * idx is shift bits for binary.
 * value is the shift information for decimal.
 * For example, idx is 4, the value is {2, "625"}.
- * That means the binary shift 4 bits left, will cause add 2 digits to the decimal 
+ * That means the binary shift 4 bits left, will cause add 2 digits to the decimal
 * if the prefix of decimal is under "625".
 */
 const static lshift_cheat LSHIFT_TAB[61];
@ -59,9 +60,10 @@ static inline void decimal_init(Decimal *d) {
    for (int i = 0; i < DECIMAL_MAX_DNUM; ++i) {
        d->d[i] = 0;
    }
-    d->dp  = 0;
-    d->nd  = 0;
-    d->neg = 0;
+    d->dp    = 0;
+    d->nd    = 0;
+    d->neg   = 0;
+    d->trunc = 0;
 }

 static inline void decimal_set(Decimal *d, const char *s, int len) {
@ -70,7 +72,7 @@ static inline void decimal_set(Decimal *d, const char *s, int len) {
    decimal_init(d);
    if (s[i] == '+') {
        i++;
-    } 
+    }
    else if (s[i] == '-') {
        i++;
        d->neg = 1;
@ -88,17 +90,18 @@ static inline void decimal_set(Decimal *d, const char *s, int len) {
                d->nd++;
            } else if (s[i] != '0') {
                /* truncate the remaining digits */
+                d->trunc = 1;
            }
        } else if (s[i] == '.') {
            saw_dot = 1;
            d->dp = d->nd;
        } else {
            break;
-        } 
+        }
    }

    /* integer */
-    if (saw_dot == 0) { 
+    if (saw_dot == 0) {
        d->dp = d->nd;
    }

@ -115,7 +118,7 @@ static inline void decimal_set(Decimal *d, const char *s, int len) {
            esgn = -1;
        }

-        for (; i < len && ('0' <= s[i] && s[i] <= '9'); i++) {
+        for (; i < len && ('0' <= s[i] && s[i] <= '9') && exp < 10000; i++) {
                exp = exp * 10 + (s[i] - '0');
        }
        d->dp += exp * esgn;
@ -148,7 +151,7 @@ static inline void right_shift(Decimal *d, uint32_t k) {
                return;
            }
            /* until n has enough bits for right shift */
-            while (n>>k == 0) { 
+            while (n>>k == 0) {
                n *= 10;
                r++;
            }
@ -165,7 +168,7 @@ static inline void right_shift(Decimal *d, uint32_t k) {
    for (; r < d->nd; r++) {
        dig = n >> k;
        n &= mask;
-        d->d[w++] = (char)(dig + '0'); 
+        d->d[w++] = (char)(dig + '0');
        n = n * 10 + d->d[r] - '0';
    }

@ -178,6 +181,7 @@ static inline void right_shift(Decimal *d, uint32_t k) {
            w++;
        } else if (dig > 0) {
            /* truncated */
+            d->trunc = 1;
        }
        n *= 10;
    }
@ -227,6 +231,7 @@ static inline void left_shift(Decimal *d, uint32_t k) {
            w--;
        } else if (rem != 0) {
            /* truncated */
+            d->trunc = 1;
        }
        n = quo;
    }
@ -240,6 +245,7 @@ static inline void left_shift(Decimal *d, uint32_t k) {
            w--;
        } else if (rem != 0) {
            /* truncated */
+            d->trunc = 1;
        }
        n = quo;
    }
@ -276,7 +282,7 @@ static inline void decimal_shift(Decimal *d, int k) {
            right_shift(d, -k);
        }
    }
-    
+
 }

 static inline int should_roundup(Decimal *d, int nd) {
@ -285,7 +291,10 @@ static inline int should_roundup(Decimal *d, int nd) {
    }

    /* Exactly halfway - round to even */
-    if (d->d[nd] == '5' && nd+1 == d->nd) { 
+    if (d->d[nd] == '5' && nd+1 == d->nd) {
+        if (d->trunc) {
+            return 1;
+        }
        return nd > 0 && (d->d[nd-1]-'0')%2 != 0;
    }

@ -344,7 +353,7 @@ int decimal_to_f64(Decimal *d, double *val) {
        } else {
            n = POW_TAB[d->dp];
        }
-        decimal_shift(d, -n); // shift right 
+        decimal_shift(d, -n); // shift right
        exp2 += n;
    }
    while ((d->dp < 0) || (d->dp == 0) && (d->d[0] < '5')) { // d < 0.5
@ -376,12 +385,12 @@ int decimal_to_f64(Decimal *d, double *val) {
    }

    /* Extract 53 bits. */
-    decimal_shift(d, 53);  // shift left 
+    decimal_shift(d, 53);  // shift left
    mant = rounded_integer(d);

    /* Rounding might have added a bit; shift down. */
    if (mant == (((uint64_t)2) << 52)) { // mant has 54 bits
-        mant >>= 1; 
+        mant >>= 1;
        exp2 ++;
        if ((exp2 + 1023) >= 0x7FF) {
            goto overflow;
@ -496,4 +505,4 @@ const static lshift_cheat LSHIFT_TAB[61] = {
    {18, "34694469519536141888238489627838134765625"},  // * 288230376151711744
    {18, "173472347597680709441192448139190673828125"}, // * 576460752303423488
    {19, "867361737988403547205962240695953369140625"}, // * 1152921504606846976
-};
+};
--- a/native/scanning.c
+++ b/native/scanning.c
@ -486,14 +486,14 @@ void vstring(const GoString *src, long *p, JsonState *ret) {
 static inline int is_atof_exact(uint64_t man, int exp, int sgn, double *val) {
    double f = (double)man;

-    if (man >> 52 != 0) { 
+    if (man >> 52 != 0) {
        return 0;
    }

    f *= sgn;
    *val = 0;

-    if (exp == 0) {
+    if (exp == 0 || man == 0) {
        *val = f;
        return 1;
    } else if (exp > 0 && exp <= 15+22) {
@ -515,13 +515,14 @@ static inline int is_atof_exact(uint64_t man, int exp, int sgn, double *val) {
    } else if (exp < 0 && exp >= -22) {
        *val = f / P10_TAB[-exp];
        return 1;
-    } 
+    }

    return 0;
 }

-double parse_float64(uint64_t man, int exp, int sgn, const GoString *src, long idx) {
-    double val = 0.0;
+static inline double parse_float64(uint64_t man, int exp, int sgn, int trunc, const GoString *src, long idx) {
+    double val    = 0.0;
+    double val_up = 0.0;

    /* look-up for fast atof if the conversion can be exactly */
    if (is_atof_exact(man, exp, sgn, &val)) {
@ -530,22 +531,26 @@ double parse_float64(uint64_t man, int exp, int sgn, const GoString *src, long i

    /* A fast atof algorithm for high percison */
    if (atof_eisel_lemire64(man, exp, sgn, &val)) {
-        return val;
+        if (!trunc) {
+            return val;
+        }
+        if (atof_eisel_lemire64(man+1, exp, sgn, &val_up) && val_up == val) {
+            return val;
+        }
    }

    /* when above algorithms failed, fallback. It is slow. */
    return atof_native_decimal(src->buf + idx, src->len - idx);
-} 
+}


 void vnumber(const GoString *src, long *p, JsonState *ret) {
    int      dig;
-    int      ovf = 0;
    int      sgn = 1;
-    double   val = 0;
    uint64_t man = 0; // mantissa for double (float64)
    int   man_nd = 0; // # digits of mantissa, 10^19 fits uint64_t
    int    exp10 = 0; // man * exp10 represents the true value
+    int    trunc = 0;

    /* initial buffer pointers */
    long         i = *p;
@ -559,7 +564,7 @@ void vnumber(const GoString *src, long *p, JsonState *ret) {
    check_sign(sgn = -1)

    /* zero */
-    if (i + 1 == n && s[i] == '0') { 
+    if (i + 1 == n && s[i] == '0') {
        i++;
        goto out;
    }
@ -574,6 +579,10 @@ void vnumber(const GoString *src, long *p, JsonState *ret) {
        i++;
    }

+    if (exp10 > 0) {
+        trunc = 1;
+    }
+
    /* check for decimal points */
    if (i < n && s[i] == '.') {
        i++;
@ -582,8 +591,8 @@ void vnumber(const GoString *src, long *p, JsonState *ret) {
        check_digit()
    }

-    /* skip the leading zeros of 0.000xxxx */ 
-    if (man == 0 && exp10 == 0) { 
+    /* skip the leading zeros of 0.000xxxx */
+    if (man == 0 && exp10 == 0) {
        int idx = i;
        while (i < n && s[i] == '0') {
            i++;
@ -601,6 +610,7 @@ void vnumber(const GoString *src, long *p, JsonState *ret) {

     /* skip the remaining digits */
    while (i < n && is_digit(s[i])) {
+        trunc = 1;
        i++;
    }

@ -616,12 +626,15 @@ void vnumber(const GoString *src, long *p, JsonState *ret) {
        parse_sign(esm)
        check_digit()
        while (i < n && is_digit(s[i])) {
-            exp = exp * 10 + (s[i++] - '0');
+            if (exp < 10000) {
+                exp = exp * 10 + (s[i] - '0');
+            }
+            i++;
        }
        exp10 += exp * esm;
    }
-   
-out:    
+
+out:
    if (ret->vt == V_INTEGER) {
        /* if INT64_MIN <= man * sgn <= INT64_MAX */
        if ( exp10 == 0 && (((man & ((uint64_t)1 << 63)) == 0) || ((man & sgn) == man))) {
@ -633,7 +646,7 @@ out:
    }

    if (ret->vt == V_DOUBLE) {
-        ret->dv = parse_float64(man, exp10, sgn, src, si);
+        ret->dv = parse_float64(man, exp10, sgn, trunc, src, si);
    }

    /* update the result */