2
0
Fork 0
mirror of https://github.com/ii64/sonic.git synced 2026-06-21 00:46:43 +08:00
sonic/native/fastfloat.c
liu 83208b7ac4
feat: optimize float-to-str encoding through the Ryu algorithm (#60)
Co-authored-by: liuqiang <liuqiang.06@bytedance.com>
Co-authored-by: Oxygen <chenzhuoyu@users.noreply.github.com>
2021-07-29 17:03:38 +08:00

487 lines
14 KiB
C

/* Copyright 2018 Ulf Adams.
* Modifications copyright 2021 ByteDance Inc.
*
* The contents of this file may be used under the terms of the Apache License,
* Version 2.0.
*
* (See accompanying file LICENSE-Apache or copy at
* http: *www.apache.org/licenses/LICENSE-2.0)
*
* Alternatively, the contents of this file may be used under the terms of
* the Boost Software License, Version 1.0.
* (See accompanying file LICENSE-Boost or copy at
* https: *www.boost.org/LICENSE_1_0.txt)
*
* Unless required by applicable law or agreed to in writing, this software
* is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied.
*/
#include "native.h"
#include "ryu_tab.h"
/* use 128-bit type for performance */
typedef __uint128_t uint128_t;
/* Returns e == 0 ? 1 : ceil(log_2(5^e)) */
static inline int32_t pow5bits(const int32_t e) {
return (int32_t) (((((uint32_t) e) * 1217359) >> 19) + 1);
}
/* Returns floor(log_10(2^e)) */
static inline uint32_t log10pow2(const int32_t e) {
return (((uint32_t) e) * 78913) >> 18;
}
/* Returns floor(log_10(5^e)) */
static inline uint32_t log10pow5(const int32_t e) {
return (((uint32_t) e) * 732923) >> 20;
}
static inline uint32_t pow5factor(uint64_t v) {
uint64_t m_inv5 = 14757395258967641293u; // *5 = 1(mod 2^64)
uint64_t n_div5 = 3689348814741910323u; // =2^64 / 5
uint32_t cnt = 0;
for (;;) {
v *= m_inv5;
if (v > n_div5)
break;
++cnt;
}
return cnt;
}
/* Returns true if value is divisible by 5^p */
static inline bool ispow5(const uint64_t v, const uint32_t p) {
return pow5factor(v) >= p;
}
/* Returns true if value is divisible by 2^p */
static inline bool ispow2(const uint64_t v, const uint32_t p) {
return (v & ((1ull << p) - 1)) == 0;
}
/* Requires 0 <= v < v < 100000000000000000L */
static inline uint32_t ctz10(const uint64_t v) {
if (v < 10) return 1;
if (v < 100) return 2;
if (v < 1000) return 3;
if (v < 10000) return 4;
if (v < 100000) return 5;
if (v < 1000000) return 6;
if (v < 10000000) return 7;
if (v < 100000000) return 8;
if (v < 1000000000) return 9;
if (v < 10000000000) return 10;
if (v < 100000000000) return 11;
if (v < 1000000000000) return 12;
if (v < 10000000000000) return 13;
if (v < 100000000000000) return 14;
if (v < 1000000000000000) return 15;
if (v < 10000000000000000) return 16;
return 17;
}
/* Best case: use 128-bit type */
static inline uint64_t mulshift(const uint64_t m, const uint64_t *mul, const int32_t j) {
uint128_t lo = ((uint128_t) m) * mul[0];
uint128_t hi = ((uint128_t) m) * mul[1];
return (uint64_t) (((lo >> 64) + hi) >> (j - 64));
}
#define mul_shift_all(m, mul, j, shift) \
vp = mulshift(4 * m + 2, mul, j); \
vm = mulshift(4 * m - 1 - shift, mul, j); \
vr = mulshift(4 * m, mul, j);
#define copy_two_digs(dst, src) \
*(dst) = *(src); \
*(dst+1) = *(src+1);
/* A floating decimal representing man * 10^exp */
typedef struct f64_d {
uint64_t man;
int32_t exp;
} f64_d;
static inline f64_d f64tod(const uint64_t man,const uint32_t exp) {
int32_t e2;
uint64_t m2;
if (exp == 0) {
/* subtract 2 so that the bounds computation has 2 additional bits */
e2 = 1 - 1023 - 52 - 2;
m2 = man;
} else {
e2 = (int32_t) exp - 1023 - 52 - 2;
m2 = (1ull << 52) | man;
}
bool even = (m2 & 1) == 0;
/* Step 2: Determine the interval of valid decimal representations */
uint64_t mv = 4 * m2;
/* Implicit bool -> int conversion. True is 1, false is 0 */
uint32_t shift = man != 0 || exp <= 1;
/* Step 3: Convert to a decimal power base using 128-bit arithmetic */
uint64_t vr, vp, vm;
int32_t e10;
bool vmzeros = false;
bool vrzeros = false;
if (e2 >= 0) {
uint32_t q = log10pow2(e2) - (e2 > 3); // max(0, log10pow2(e2) - 1)
e10 = (int32_t) q;
int32_t k = DOUBLE_POW5_INV_BITCOUNT + pow5bits((int32_t) q) - 1;
int32_t i = -e2 + (int32_t) q + k;
uint64_t *mul = (uint64_t*)DOUBLE_POW5_INV_SPLIT[q];
/* {vm, vr, vp} * 10^e10 = {mm, mv, mp} * 2^e2
* mp = 4 * m2 + 2
* mm = mv - 1 - shift
*/
mul_shift_all(m2, mul, i, shift)
if (q <= 21) {
if (mv % 5 == 0) {
vrzeros = ispow5(mv, q);
} else if (even) {
/* Same as min(e2 + (~mm & 1), pow5Factor(mm)) >= q
* <=> e2 + (~mm & 1) >= q && pow5Factor(mm) >= q
* <=> true && pow5Factor(mm) >= q, since e2 >= q.
*/
vmzeros = ispow5(mv - 1 - shift, q);
} else {
/* Same as min(e2 + 1, pow5Factor(mp)) >= q. */
vp -= ispow5(mv + 2, q);
}
}
} else {
uint32_t q = log10pow5(-e2) - (-e2 > 1); // max(0, log10pow5(-e2) - 1)
e10 = (int32_t) q + e2;
int32_t i = -e2 - (int32_t) q;
int32_t k = pow5bits(i) - DOUBLE_POW5_BITCOUNT;
int32_t j = (int32_t) q - k;
uint64_t *mul = (uint64_t*)DOUBLE_POW5_SPLIT[i];
/* {vm, vr, vp} * 10^e10 = {mm, mv, mp} * 2^e2 */
mul_shift_all(m2, mul, j, shift)
if (q <= 1) {
/* {vr,vp,vm} is trailing zeros if {mv,mp,mm} has at least q trailing
* 0 bits. mv = 4 * m2, so it always has at least two trailing 0 bits.
*/
vrzeros = true;
if (even) {
/* mm = mv - 1 - shift, so it has 1 trailing 0 bit if shift = 1 */
vmzeros = shift == 1;
} else {
/* mp = mv + 2, so it always has at least one trailing 0 bit */
--vp;
}
} else if (q < 63) {
vrzeros = ispow2(mv, q);
}
}
/* Step 4: Find the shortest decimal representation in the interval */
int32_t removed = 0;
uint8_t lastrmdig = 0;
uint64_t dman;
/* On average, we remove ~2 DIGs */
if (vmzeros || vrzeros) {
/* General case, which happens rarely (~0.7%) */
for (;;) {
uint64_t vpdiv10 = vp / 10;
uint64_t vmdiv10 = vm / 10;
if (vpdiv10 <= vmdiv10) {
break;
}
uint32_t vmmod10 = ((uint32_t) vm) - 10 * ((uint32_t) vmdiv10);
uint64_t vrdiv10 = vr / 10;
uint32_t vrmod10 = ((uint32_t) vr) - 10 * ((uint32_t) vrdiv10);
vmzeros &= vmmod10 == 0;
vrzeros &= lastrmdig == 0;
lastrmdig = (uint8_t) vrmod10;
vr = vrdiv10;
vp = vpdiv10;
vm = vmdiv10;
++removed;
}
if (vmzeros) {
for (;;) {
uint64_t vmdiv10 = vm / 10;
uint32_t vmmod10 = ((uint32_t) vm) - 10 * ((uint32_t) vmdiv10);
if (vmmod10 != 0) {
break;
}
uint64_t vpdiv10 = vp / 10;
uint64_t vrdiv10 = vr / 10;
uint32_t vrmod10 = ((uint32_t) vr) - 10 * ((uint32_t) vrdiv10);
vrzeros &= lastrmdig == 0;
lastrmdig = (uint8_t) vrmod10;
vr = vrdiv10;
vp = vpdiv10;
vm = vmdiv10;
++removed;
}
}
if (vrzeros && lastrmdig == 5 && vr % 2 == 0) {
/* Round even if the exact number is .....50..0 */
lastrmdig = 4;
}
/* We need to take vr + 1 if vr is outside bounds or we need to round up */
dman = vr + ((vr == vm && (!even || !vmzeros)) || lastrmdig >= 5);
} else {
/* Specialized for the common case (~99.3%). Percentages below are relative to this */
bool roundup= false;
uint64_t vpdiv100 = vp / 100;
uint64_t vmdiv100 = vm / 100;
if (vpdiv100 > vmdiv100) { // Optimization: remove two DIGs at a time (~86.2%).
uint64_t vrdiv100 = vr / 100;
uint32_t vrmod100 = ((uint32_t) vr) - 100 * ((uint32_t) vrdiv100);
roundup = vrmod100 >= 50;
vr = vrdiv100;
vp = vpdiv100;
vm = vmdiv100;
removed += 2;
}
/* Loop iterations below (approximately), without optimization above:
* 0: 0.03%, 1: 13.8%, 2: 70.6%, 3: 14.0%, 4: 1.40%, 5: 0.14%, 6+: 0.02%
* Loop iterations below (approximately), with optimization above:
* 0: 70.6%, 1: 27.8%, 2: 1.40%, 3: 0.14%, 4+: 0.02%
*/
for (;;) {
uint64_t vpdiv10 = vp / 10;
uint64_t vmdiv10 = vm / 10;
if (vpdiv10 <= vmdiv10) {
break;
}
uint64_t vrdiv10 = vr / 10;
uint32_t vrmod10 = ((uint32_t) vr) - 10 * ((uint32_t) vrdiv10);
roundup = vrmod10 >= 5;
vr = vrdiv10;
vp = vpdiv10;
vm = vmdiv10;
++removed;
}
/* We need to take vr + 1 if vr is outside bounds or we need to round up */
dman = vr + (vr == vm || roundup);
}
f64_d fd = {
.exp = e10 + removed,
.man = dman,
};
return fd;
}
/* Print the decimal DIGs from mantissa */
static inline void print_mantissa(uint64_t man, char *out, int mlen) {
/* We have at most 17 DIGs, and uint32_t can store 9 DIGs.
* If man doesn't fit into uint32_t, we cut off 8 DIGs,
* so the rest will fit into uint32_t.
*/
char *r = out + mlen;
if (man < 10) {}
if ((man >> 32) != 0) {
/* Expensive 64-bit division */
uint64_t q = man / 100000000;
uint32_t man2 = ((uint32_t) man) - 100000000 * ((uint32_t) q);
man = q;
uint32_t c = man2 % 10000;
man2 /= 10000;
uint32_t d = man2 % 10000;
uint32_t c0 = (c % 100) << 1;
uint32_t c1 = (c / 100) << 1;
uint32_t d0 = (d % 100) << 1;
uint32_t d1 = (d / 100) << 1;
copy_two_digs(r - 2, DIG_TAB + c0)
copy_two_digs(r - 4, DIG_TAB + c1)
copy_two_digs(r - 6, DIG_TAB + d0)
copy_two_digs(r - 8, DIG_TAB + d1)
r -= 8;
}
uint32_t man2 = (uint32_t) man;
while (man2 >= 10000) {
#ifdef __clang__ // https://bugs.llvm.org/show_bug.cgi?id=38217
uint32_t c = man2 - 10000 * (man2 / 10000);
#else
uint32_t c = man2 % 10000;
#endif
man2 /= 10000;
uint32_t c0 = (c % 100) << 1;
uint32_t c1 = (c / 100) << 1;
copy_two_digs(r - 2, DIG_TAB + c0)
copy_two_digs(r - 4, DIG_TAB + c1)
r -= 4;
}
if (man2 >= 100) {
uint32_t c = (man2 % 100) << 1;
man2 /= 100;
copy_two_digs(r - 2, DIG_TAB + c)
r -= 2;
}
if (man2 >= 10) {
uint32_t c = man2 << 1;
copy_two_digs(r - 2, DIG_TAB + c)
} else {
*out = (char) ('0' + man2);
}
}
static inline int print_exponent(f64_d v, char *out, int mlen) {
int idx = 0;
print_mantissa(v.man, out + idx + 1, mlen);
/* Print decimal point if needed */
out[idx] = out[idx + 1];
if (mlen > 1) {
out[idx + 1] = '.';
idx += mlen + 1;
} else {
++idx;
}
/* Print the exponent */
out[idx++] = 'e';
int32_t exp = v.exp + (int32_t) mlen - 1;
if (exp < 0) {
out[idx++] = '-';
exp = -exp;
}
if (exp >= 100) {
int32_t c = exp % 10;
copy_two_digs(out + idx, DIG_TAB + 2 * (exp / 10))
out[idx + 2] = (char) ('0' + c);
idx += 3;
} else if (exp >= 10) {
copy_two_digs(out + idx, DIG_TAB + 2 * exp)
idx += 2;
} else {
out[idx++] = (char) ('0' + exp);
}
return idx;
}
static inline int print_decimal(const f64_d v, char *out, int mlen) {
int idx = 0;
int lzeros = 0;
int rzeros = 0;
int point = 0;
int exp10 = mlen - 1 + v.exp;
/* parse the point idx and additional zeros */
if (exp10 < 0) {
lzeros = -exp10;
point = 1;
} else if (exp10 < mlen - 1) {
point = 1 + exp10;
} else {
rzeros = exp10 - mlen + 1;
}
int i = 0;
/* add left zeros */
if (lzeros) {
out[idx++] = '0';
out[idx++] = '.';
point = 0;
}
for (i = 1; i < lzeros; ++i) {
out[idx++] = '0';
}
/* add the mantissa DIGs */
print_mantissa(v.man, out + idx, mlen);
if (point) {
for (i = idx + mlen; i > idx + point; --i) {
out[i] = out[i-1];
}
out[idx + point] = '.';
idx += 1;
}
/* add right zeros */
idx += mlen;
for (i = 0; i < rzeros; ++i) {
out[idx++] = '0';
}
return idx;
}
static inline bool f64tod_exct_int(const uint64_t man, const uint32_t exp,
f64_d* v) {
uint64_t m2 = (1ull << 52) | man; // implicit 1
int32_t e2 = (int32_t) exp - 1023 - 52;
if (e2 > 0 || e2 < -52) {
return false;
}
uint64_t mask = (1ull << -e2) - 1;
if ((m2 & mask) != 0) { // with fraction
return false;
}
v->man = m2 >> -e2;
v->exp = 0;
return true;
}
static int inline ryu(double val, char *out) {
/* Step 1: Decode the floating-point number */
uint64_t bits = *(uint64_t *)(&val);
uint64_t man = bits & ((1ull << 52) - 1);
uint32_t exp = (uint32_t) ((bits >> 52) & ((1u << 11) - 1));
/* Skip when Infinity */
if (exp == ((1u << 11) - 1u)) {
return 0;
}
f64_d v;
/* for integer from [1, 2^53], can be resepensated exactly by double */
bool is_exact_int = f64tod_exct_int(man, exp, &v);
if (!is_exact_int){ // find the shortest decimal representation
v = f64tod(man, exp);
}
/* Step 5: Print the decimal representation */
int idx = 0;
uint32_t mlen = ctz10(v.man);
int exp10 = mlen - 1 + v.exp;
/* The format as Go encoding/json package */
bool isexp = exp10 < -6 || exp10 >= 21;
if (isexp) // exponent format
idx += print_exponent(v, out + idx, mlen);
else // decimal format
idx += print_decimal(v, out + idx, mlen);
/* Terminate the string */
out[idx] = '\0';
return idx;
}
int f64toa(char *out, double val) {
int i = 0;
char *p = out;
/* simple case of 0.0 */
if (val == 0.0) {
*p = '0';
return 1;
}
/* negative numbers */
if (val < 0.0) {
i = 1;
val = -val;
*p++ = '-';
}
/* print the number with Ryu algorithm */
int n = ryu(val, p);
return n + i;
}