mirror of
https://github.com/ii64/sonic.git
synced 2026-06-21 00:46:43 +08:00
Co-authored-by: liuqiang <liuqiang.06@bytedance.com> Co-authored-by: Oxygen <chenzhuoyu@users.noreply.github.com>
487 lines
14 KiB
C
487 lines
14 KiB
C
/* Copyright 2018 Ulf Adams.
|
|
* Modifications copyright 2021 ByteDance Inc.
|
|
*
|
|
* The contents of this file may be used under the terms of the Apache License,
|
|
* Version 2.0.
|
|
*
|
|
* (See accompanying file LICENSE-Apache or copy at
|
|
* http: *www.apache.org/licenses/LICENSE-2.0)
|
|
*
|
|
* Alternatively, the contents of this file may be used under the terms of
|
|
* the Boost Software License, Version 1.0.
|
|
* (See accompanying file LICENSE-Boost or copy at
|
|
* https: *www.boost.org/LICENSE_1_0.txt)
|
|
*
|
|
* Unless required by applicable law or agreed to in writing, this software
|
|
* is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
* KIND, either express or implied.
|
|
*/
|
|
#include "native.h"
|
|
#include "ryu_tab.h"
|
|
|
|
/* use 128-bit type for performance */
|
|
typedef __uint128_t uint128_t;
|
|
|
|
/* Returns e == 0 ? 1 : ceil(log_2(5^e)) */
|
|
static inline int32_t pow5bits(const int32_t e) {
|
|
return (int32_t) (((((uint32_t) e) * 1217359) >> 19) + 1);
|
|
}
|
|
|
|
/* Returns floor(log_10(2^e)) */
|
|
static inline uint32_t log10pow2(const int32_t e) {
|
|
return (((uint32_t) e) * 78913) >> 18;
|
|
}
|
|
|
|
/* Returns floor(log_10(5^e)) */
|
|
static inline uint32_t log10pow5(const int32_t e) {
|
|
return (((uint32_t) e) * 732923) >> 20;
|
|
}
|
|
|
|
static inline uint32_t pow5factor(uint64_t v) {
|
|
uint64_t m_inv5 = 14757395258967641293u; // *5 = 1(mod 2^64)
|
|
uint64_t n_div5 = 3689348814741910323u; // =2^64 / 5
|
|
uint32_t cnt = 0;
|
|
for (;;) {
|
|
v *= m_inv5;
|
|
if (v > n_div5)
|
|
break;
|
|
++cnt;
|
|
}
|
|
return cnt;
|
|
}
|
|
|
|
/* Returns true if value is divisible by 5^p */
|
|
static inline bool ispow5(const uint64_t v, const uint32_t p) {
|
|
return pow5factor(v) >= p;
|
|
}
|
|
|
|
/* Returns true if value is divisible by 2^p */
|
|
static inline bool ispow2(const uint64_t v, const uint32_t p) {
|
|
return (v & ((1ull << p) - 1)) == 0;
|
|
}
|
|
|
|
/* Requires 0 <= v < v < 100000000000000000L */
|
|
static inline uint32_t ctz10(const uint64_t v) {
|
|
if (v < 10) return 1;
|
|
if (v < 100) return 2;
|
|
if (v < 1000) return 3;
|
|
if (v < 10000) return 4;
|
|
if (v < 100000) return 5;
|
|
if (v < 1000000) return 6;
|
|
if (v < 10000000) return 7;
|
|
if (v < 100000000) return 8;
|
|
if (v < 1000000000) return 9;
|
|
if (v < 10000000000) return 10;
|
|
if (v < 100000000000) return 11;
|
|
if (v < 1000000000000) return 12;
|
|
if (v < 10000000000000) return 13;
|
|
if (v < 100000000000000) return 14;
|
|
if (v < 1000000000000000) return 15;
|
|
if (v < 10000000000000000) return 16;
|
|
return 17;
|
|
}
|
|
|
|
/* Best case: use 128-bit type */
|
|
static inline uint64_t mulshift(const uint64_t m, const uint64_t *mul, const int32_t j) {
|
|
uint128_t lo = ((uint128_t) m) * mul[0];
|
|
uint128_t hi = ((uint128_t) m) * mul[1];
|
|
return (uint64_t) (((lo >> 64) + hi) >> (j - 64));
|
|
}
|
|
|
|
#define mul_shift_all(m, mul, j, shift) \
|
|
vp = mulshift(4 * m + 2, mul, j); \
|
|
vm = mulshift(4 * m - 1 - shift, mul, j); \
|
|
vr = mulshift(4 * m, mul, j);
|
|
|
|
#define copy_two_digs(dst, src) \
|
|
*(dst) = *(src); \
|
|
*(dst+1) = *(src+1);
|
|
|
|
/* A floating decimal representing man * 10^exp */
|
|
typedef struct f64_d {
|
|
uint64_t man;
|
|
int32_t exp;
|
|
} f64_d;
|
|
|
|
static inline f64_d f64tod(const uint64_t man,const uint32_t exp) {
|
|
int32_t e2;
|
|
uint64_t m2;
|
|
if (exp == 0) {
|
|
/* subtract 2 so that the bounds computation has 2 additional bits */
|
|
e2 = 1 - 1023 - 52 - 2;
|
|
m2 = man;
|
|
} else {
|
|
e2 = (int32_t) exp - 1023 - 52 - 2;
|
|
m2 = (1ull << 52) | man;
|
|
}
|
|
bool even = (m2 & 1) == 0;
|
|
|
|
/* Step 2: Determine the interval of valid decimal representations */
|
|
uint64_t mv = 4 * m2;
|
|
/* Implicit bool -> int conversion. True is 1, false is 0 */
|
|
uint32_t shift = man != 0 || exp <= 1;
|
|
|
|
/* Step 3: Convert to a decimal power base using 128-bit arithmetic */
|
|
uint64_t vr, vp, vm;
|
|
int32_t e10;
|
|
bool vmzeros = false;
|
|
bool vrzeros = false;
|
|
if (e2 >= 0) {
|
|
|
|
uint32_t q = log10pow2(e2) - (e2 > 3); // max(0, log10pow2(e2) - 1)
|
|
e10 = (int32_t) q;
|
|
int32_t k = DOUBLE_POW5_INV_BITCOUNT + pow5bits((int32_t) q) - 1;
|
|
int32_t i = -e2 + (int32_t) q + k;
|
|
uint64_t *mul = (uint64_t*)DOUBLE_POW5_INV_SPLIT[q];
|
|
/* {vm, vr, vp} * 10^e10 = {mm, mv, mp} * 2^e2
|
|
* mp = 4 * m2 + 2
|
|
* mm = mv - 1 - shift
|
|
*/
|
|
mul_shift_all(m2, mul, i, shift)
|
|
|
|
if (q <= 21) {
|
|
if (mv % 5 == 0) {
|
|
vrzeros = ispow5(mv, q);
|
|
} else if (even) {
|
|
/* Same as min(e2 + (~mm & 1), pow5Factor(mm)) >= q
|
|
* <=> e2 + (~mm & 1) >= q && pow5Factor(mm) >= q
|
|
* <=> true && pow5Factor(mm) >= q, since e2 >= q.
|
|
*/
|
|
vmzeros = ispow5(mv - 1 - shift, q);
|
|
} else {
|
|
/* Same as min(e2 + 1, pow5Factor(mp)) >= q. */
|
|
vp -= ispow5(mv + 2, q);
|
|
}
|
|
}
|
|
} else {
|
|
uint32_t q = log10pow5(-e2) - (-e2 > 1); // max(0, log10pow5(-e2) - 1)
|
|
e10 = (int32_t) q + e2;
|
|
int32_t i = -e2 - (int32_t) q;
|
|
int32_t k = pow5bits(i) - DOUBLE_POW5_BITCOUNT;
|
|
int32_t j = (int32_t) q - k;
|
|
uint64_t *mul = (uint64_t*)DOUBLE_POW5_SPLIT[i];
|
|
/* {vm, vr, vp} * 10^e10 = {mm, mv, mp} * 2^e2 */
|
|
mul_shift_all(m2, mul, j, shift)
|
|
|
|
if (q <= 1) {
|
|
/* {vr,vp,vm} is trailing zeros if {mv,mp,mm} has at least q trailing
|
|
* 0 bits. mv = 4 * m2, so it always has at least two trailing 0 bits.
|
|
*/
|
|
vrzeros = true;
|
|
if (even) {
|
|
/* mm = mv - 1 - shift, so it has 1 trailing 0 bit if shift = 1 */
|
|
vmzeros = shift == 1;
|
|
} else {
|
|
/* mp = mv + 2, so it always has at least one trailing 0 bit */
|
|
--vp;
|
|
}
|
|
} else if (q < 63) {
|
|
vrzeros = ispow2(mv, q);
|
|
}
|
|
}
|
|
|
|
/* Step 4: Find the shortest decimal representation in the interval */
|
|
int32_t removed = 0;
|
|
uint8_t lastrmdig = 0;
|
|
uint64_t dman;
|
|
/* On average, we remove ~2 DIGs */
|
|
if (vmzeros || vrzeros) {
|
|
/* General case, which happens rarely (~0.7%) */
|
|
for (;;) {
|
|
uint64_t vpdiv10 = vp / 10;
|
|
uint64_t vmdiv10 = vm / 10;
|
|
if (vpdiv10 <= vmdiv10) {
|
|
break;
|
|
}
|
|
uint32_t vmmod10 = ((uint32_t) vm) - 10 * ((uint32_t) vmdiv10);
|
|
uint64_t vrdiv10 = vr / 10;
|
|
uint32_t vrmod10 = ((uint32_t) vr) - 10 * ((uint32_t) vrdiv10);
|
|
vmzeros &= vmmod10 == 0;
|
|
vrzeros &= lastrmdig == 0;
|
|
lastrmdig = (uint8_t) vrmod10;
|
|
vr = vrdiv10;
|
|
vp = vpdiv10;
|
|
vm = vmdiv10;
|
|
++removed;
|
|
}
|
|
if (vmzeros) {
|
|
for (;;) {
|
|
uint64_t vmdiv10 = vm / 10;
|
|
uint32_t vmmod10 = ((uint32_t) vm) - 10 * ((uint32_t) vmdiv10);
|
|
if (vmmod10 != 0) {
|
|
break;
|
|
}
|
|
uint64_t vpdiv10 = vp / 10;
|
|
uint64_t vrdiv10 = vr / 10;
|
|
uint32_t vrmod10 = ((uint32_t) vr) - 10 * ((uint32_t) vrdiv10);
|
|
vrzeros &= lastrmdig == 0;
|
|
lastrmdig = (uint8_t) vrmod10;
|
|
vr = vrdiv10;
|
|
vp = vpdiv10;
|
|
vm = vmdiv10;
|
|
++removed;
|
|
}
|
|
}
|
|
if (vrzeros && lastrmdig == 5 && vr % 2 == 0) {
|
|
/* Round even if the exact number is .....50..0 */
|
|
lastrmdig = 4;
|
|
}
|
|
/* We need to take vr + 1 if vr is outside bounds or we need to round up */
|
|
dman = vr + ((vr == vm && (!even || !vmzeros)) || lastrmdig >= 5);
|
|
} else {
|
|
/* Specialized for the common case (~99.3%). Percentages below are relative to this */
|
|
bool roundup= false;
|
|
uint64_t vpdiv100 = vp / 100;
|
|
uint64_t vmdiv100 = vm / 100;
|
|
if (vpdiv100 > vmdiv100) { // Optimization: remove two DIGs at a time (~86.2%).
|
|
uint64_t vrdiv100 = vr / 100;
|
|
uint32_t vrmod100 = ((uint32_t) vr) - 100 * ((uint32_t) vrdiv100);
|
|
roundup = vrmod100 >= 50;
|
|
vr = vrdiv100;
|
|
vp = vpdiv100;
|
|
vm = vmdiv100;
|
|
removed += 2;
|
|
}
|
|
/* Loop iterations below (approximately), without optimization above:
|
|
* 0: 0.03%, 1: 13.8%, 2: 70.6%, 3: 14.0%, 4: 1.40%, 5: 0.14%, 6+: 0.02%
|
|
* Loop iterations below (approximately), with optimization above:
|
|
* 0: 70.6%, 1: 27.8%, 2: 1.40%, 3: 0.14%, 4+: 0.02%
|
|
*/
|
|
for (;;) {
|
|
uint64_t vpdiv10 = vp / 10;
|
|
uint64_t vmdiv10 = vm / 10;
|
|
if (vpdiv10 <= vmdiv10) {
|
|
break;
|
|
}
|
|
uint64_t vrdiv10 = vr / 10;
|
|
uint32_t vrmod10 = ((uint32_t) vr) - 10 * ((uint32_t) vrdiv10);
|
|
roundup = vrmod10 >= 5;
|
|
vr = vrdiv10;
|
|
vp = vpdiv10;
|
|
vm = vmdiv10;
|
|
++removed;
|
|
}
|
|
/* We need to take vr + 1 if vr is outside bounds or we need to round up */
|
|
dman = vr + (vr == vm || roundup);
|
|
}
|
|
f64_d fd = {
|
|
.exp = e10 + removed,
|
|
.man = dman,
|
|
};
|
|
return fd;
|
|
}
|
|
|
|
/* Print the decimal DIGs from mantissa */
|
|
static inline void print_mantissa(uint64_t man, char *out, int mlen) {
|
|
/* We have at most 17 DIGs, and uint32_t can store 9 DIGs.
|
|
* If man doesn't fit into uint32_t, we cut off 8 DIGs,
|
|
* so the rest will fit into uint32_t.
|
|
*/
|
|
char *r = out + mlen;
|
|
if (man < 10) {}
|
|
if ((man >> 32) != 0) {
|
|
/* Expensive 64-bit division */
|
|
uint64_t q = man / 100000000;
|
|
uint32_t man2 = ((uint32_t) man) - 100000000 * ((uint32_t) q);
|
|
man = q;
|
|
|
|
uint32_t c = man2 % 10000;
|
|
man2 /= 10000;
|
|
uint32_t d = man2 % 10000;
|
|
uint32_t c0 = (c % 100) << 1;
|
|
uint32_t c1 = (c / 100) << 1;
|
|
uint32_t d0 = (d % 100) << 1;
|
|
uint32_t d1 = (d / 100) << 1;
|
|
copy_two_digs(r - 2, DIG_TAB + c0)
|
|
copy_two_digs(r - 4, DIG_TAB + c1)
|
|
copy_two_digs(r - 6, DIG_TAB + d0)
|
|
copy_two_digs(r - 8, DIG_TAB + d1)
|
|
r -= 8;
|
|
}
|
|
uint32_t man2 = (uint32_t) man;
|
|
while (man2 >= 10000) {
|
|
#ifdef __clang__ // https://bugs.llvm.org/show_bug.cgi?id=38217
|
|
uint32_t c = man2 - 10000 * (man2 / 10000);
|
|
#else
|
|
uint32_t c = man2 % 10000;
|
|
#endif
|
|
man2 /= 10000;
|
|
uint32_t c0 = (c % 100) << 1;
|
|
uint32_t c1 = (c / 100) << 1;
|
|
copy_two_digs(r - 2, DIG_TAB + c0)
|
|
copy_two_digs(r - 4, DIG_TAB + c1)
|
|
r -= 4;
|
|
}
|
|
if (man2 >= 100) {
|
|
uint32_t c = (man2 % 100) << 1;
|
|
man2 /= 100;
|
|
copy_two_digs(r - 2, DIG_TAB + c)
|
|
r -= 2;
|
|
}
|
|
if (man2 >= 10) {
|
|
uint32_t c = man2 << 1;
|
|
copy_two_digs(r - 2, DIG_TAB + c)
|
|
} else {
|
|
*out = (char) ('0' + man2);
|
|
}
|
|
}
|
|
|
|
static inline int print_exponent(f64_d v, char *out, int mlen) {
|
|
int idx = 0;
|
|
|
|
print_mantissa(v.man, out + idx + 1, mlen);
|
|
|
|
/* Print decimal point if needed */
|
|
out[idx] = out[idx + 1];
|
|
if (mlen > 1) {
|
|
out[idx + 1] = '.';
|
|
idx += mlen + 1;
|
|
} else {
|
|
++idx;
|
|
}
|
|
|
|
/* Print the exponent */
|
|
out[idx++] = 'e';
|
|
int32_t exp = v.exp + (int32_t) mlen - 1;
|
|
if (exp < 0) {
|
|
out[idx++] = '-';
|
|
exp = -exp;
|
|
}
|
|
|
|
if (exp >= 100) {
|
|
int32_t c = exp % 10;
|
|
copy_two_digs(out + idx, DIG_TAB + 2 * (exp / 10))
|
|
out[idx + 2] = (char) ('0' + c);
|
|
idx += 3;
|
|
} else if (exp >= 10) {
|
|
copy_two_digs(out + idx, DIG_TAB + 2 * exp)
|
|
idx += 2;
|
|
} else {
|
|
out[idx++] = (char) ('0' + exp);
|
|
}
|
|
|
|
return idx;
|
|
}
|
|
|
|
static inline int print_decimal(const f64_d v, char *out, int mlen) {
|
|
int idx = 0;
|
|
int lzeros = 0;
|
|
int rzeros = 0;
|
|
int point = 0;
|
|
int exp10 = mlen - 1 + v.exp;
|
|
|
|
/* parse the point idx and additional zeros */
|
|
if (exp10 < 0) {
|
|
lzeros = -exp10;
|
|
point = 1;
|
|
} else if (exp10 < mlen - 1) {
|
|
point = 1 + exp10;
|
|
} else {
|
|
rzeros = exp10 - mlen + 1;
|
|
}
|
|
|
|
int i = 0;
|
|
/* add left zeros */
|
|
if (lzeros) {
|
|
out[idx++] = '0';
|
|
out[idx++] = '.';
|
|
point = 0;
|
|
}
|
|
for (i = 1; i < lzeros; ++i) {
|
|
out[idx++] = '0';
|
|
}
|
|
|
|
/* add the mantissa DIGs */
|
|
print_mantissa(v.man, out + idx, mlen);
|
|
if (point) {
|
|
for (i = idx + mlen; i > idx + point; --i) {
|
|
out[i] = out[i-1];
|
|
}
|
|
out[idx + point] = '.';
|
|
idx += 1;
|
|
}
|
|
|
|
/* add right zeros */
|
|
idx += mlen;
|
|
for (i = 0; i < rzeros; ++i) {
|
|
out[idx++] = '0';
|
|
}
|
|
|
|
return idx;
|
|
}
|
|
|
|
static inline bool f64tod_exct_int(const uint64_t man, const uint32_t exp,
|
|
f64_d* v) {
|
|
uint64_t m2 = (1ull << 52) | man; // implicit 1
|
|
int32_t e2 = (int32_t) exp - 1023 - 52;
|
|
|
|
if (e2 > 0 || e2 < -52) {
|
|
return false;
|
|
}
|
|
|
|
uint64_t mask = (1ull << -e2) - 1;
|
|
if ((m2 & mask) != 0) { // with fraction
|
|
return false;
|
|
}
|
|
|
|
v->man = m2 >> -e2;
|
|
v->exp = 0;
|
|
return true;
|
|
}
|
|
|
|
static int inline ryu(double val, char *out) {
|
|
/* Step 1: Decode the floating-point number */
|
|
uint64_t bits = *(uint64_t *)(&val);
|
|
uint64_t man = bits & ((1ull << 52) - 1);
|
|
uint32_t exp = (uint32_t) ((bits >> 52) & ((1u << 11) - 1));
|
|
|
|
/* Skip when Infinity */
|
|
if (exp == ((1u << 11) - 1u)) {
|
|
return 0;
|
|
}
|
|
|
|
f64_d v;
|
|
/* for integer from [1, 2^53], can be resepensated exactly by double */
|
|
bool is_exact_int = f64tod_exct_int(man, exp, &v);
|
|
if (!is_exact_int){ // find the shortest decimal representation
|
|
v = f64tod(man, exp);
|
|
}
|
|
|
|
/* Step 5: Print the decimal representation */
|
|
int idx = 0;
|
|
uint32_t mlen = ctz10(v.man);
|
|
int exp10 = mlen - 1 + v.exp;
|
|
/* The format as Go encoding/json package */
|
|
bool isexp = exp10 < -6 || exp10 >= 21;
|
|
|
|
if (isexp) // exponent format
|
|
idx += print_exponent(v, out + idx, mlen);
|
|
else // decimal format
|
|
idx += print_decimal(v, out + idx, mlen);
|
|
|
|
/* Terminate the string */
|
|
out[idx] = '\0';
|
|
return idx;
|
|
}
|
|
|
|
int f64toa(char *out, double val) {
|
|
int i = 0;
|
|
char *p = out;
|
|
|
|
/* simple case of 0.0 */
|
|
if (val == 0.0) {
|
|
*p = '0';
|
|
return 1;
|
|
}
|
|
|
|
/* negative numbers */
|
|
if (val < 0.0) {
|
|
i = 1;
|
|
val = -val;
|
|
*p++ = '-';
|
|
}
|
|
|
|
/* print the number with Ryu algorithm */
|
|
int n = ryu(val, p);
|
|
return n + i;
|
|
}
|