/* Copyright 2018 Ulf Adams. * Modifications copyright 2021 ByteDance Inc. * * The contents of this file may be used under the terms of the Apache License, * Version 2.0. * * (See accompanying file LICENSE-Apache or copy at * http: *www.apache.org/licenses/LICENSE-2.0) * * Alternatively, the contents of this file may be used under the terms of * the Boost Software License, Version 1.0. * (See accompanying file LICENSE-Boost or copy at * https: *www.boost.org/LICENSE_1_0.txt) * * Unless required by applicable law or agreed to in writing, this software * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. */ #include "native.h" #include "ryu_tab.h" /* use 128-bit type for performance */ typedef __uint128_t uint128_t; /* Returns e == 0 ? 1 : ceil(log_2(5^e)) */ static inline int32_t pow5bits(const int32_t e) { return (int32_t) (((((uint32_t) e) * 1217359) >> 19) + 1); } /* Returns floor(log_10(2^e)) */ static inline uint32_t log10pow2(const int32_t e) { return (((uint32_t) e) * 78913) >> 18; } /* Returns floor(log_10(5^e)) */ static inline uint32_t log10pow5(const int32_t e) { return (((uint32_t) e) * 732923) >> 20; } static inline uint32_t pow5factor(uint64_t v) { uint64_t m_inv5 = 14757395258967641293u; // *5 = 1(mod 2^64) uint64_t n_div5 = 3689348814741910323u; // =2^64 / 5 uint32_t cnt = 0; for (;;) { v *= m_inv5; if (v > n_div5) break; ++cnt; } return cnt; } /* Returns true if value is divisible by 5^p */ static inline bool ispow5(const uint64_t v, const uint32_t p) { return pow5factor(v) >= p; } /* Returns true if value is divisible by 2^p */ static inline bool ispow2(const uint64_t v, const uint32_t p) { return (v & ((1ull << p) - 1)) == 0; } /* Requires 0 <= v < v < 100000000000000000L */ static inline uint32_t ctz10(const uint64_t v) { if (v < 10) return 1; if (v < 100) return 2; if (v < 1000) return 3; if (v < 10000) return 4; if (v < 100000) return 5; if (v < 1000000) return 6; if (v < 10000000) return 7; if (v < 100000000) return 8; if (v < 1000000000) return 9; if (v < 10000000000) return 10; if (v < 100000000000) return 11; if (v < 1000000000000) return 12; if (v < 10000000000000) return 13; if (v < 100000000000000) return 14; if (v < 1000000000000000) return 15; if (v < 10000000000000000) return 16; return 17; } /* Best case: use 128-bit type */ static inline uint64_t mulshift(const uint64_t m, const uint64_t *mul, const int32_t j) { uint128_t lo = ((uint128_t) m) * mul[0]; uint128_t hi = ((uint128_t) m) * mul[1]; return (uint64_t) (((lo >> 64) + hi) >> (j - 64)); } #define mul_shift_all(m, mul, j, shift) \ vp = mulshift(4 * m + 2, mul, j); \ vm = mulshift(4 * m - 1 - shift, mul, j); \ vr = mulshift(4 * m, mul, j); #define copy_two_digs(dst, src) \ *(dst) = *(src); \ *(dst+1) = *(src+1); /* A floating decimal representing man * 10^exp */ typedef struct f64_d { uint64_t man; int32_t exp; } f64_d; static inline f64_d f64tod(const uint64_t man,const uint32_t exp) { int32_t e2; uint64_t m2; if (exp == 0) { /* subtract 2 so that the bounds computation has 2 additional bits */ e2 = 1 - 1023 - 52 - 2; m2 = man; } else { e2 = (int32_t) exp - 1023 - 52 - 2; m2 = (1ull << 52) | man; } bool even = (m2 & 1) == 0; /* Step 2: Determine the interval of valid decimal representations */ uint64_t mv = 4 * m2; /* Implicit bool -> int conversion. True is 1, false is 0 */ uint32_t shift = man != 0 || exp <= 1; /* Step 3: Convert to a decimal power base using 128-bit arithmetic */ uint64_t vr, vp, vm; int32_t e10; bool vmzeros = false; bool vrzeros = false; if (e2 >= 0) { uint32_t q = log10pow2(e2) - (e2 > 3); // max(0, log10pow2(e2) - 1) e10 = (int32_t) q; int32_t k = DOUBLE_POW5_INV_BITCOUNT + pow5bits((int32_t) q) - 1; int32_t i = -e2 + (int32_t) q + k; uint64_t *mul = (uint64_t*)DOUBLE_POW5_INV_SPLIT[q]; /* {vm, vr, vp} * 10^e10 = {mm, mv, mp} * 2^e2 * mp = 4 * m2 + 2 * mm = mv - 1 - shift */ mul_shift_all(m2, mul, i, shift) if (q <= 21) { if (mv % 5 == 0) { vrzeros = ispow5(mv, q); } else if (even) { /* Same as min(e2 + (~mm & 1), pow5Factor(mm)) >= q * <=> e2 + (~mm & 1) >= q && pow5Factor(mm) >= q * <=> true && pow5Factor(mm) >= q, since e2 >= q. */ vmzeros = ispow5(mv - 1 - shift, q); } else { /* Same as min(e2 + 1, pow5Factor(mp)) >= q. */ vp -= ispow5(mv + 2, q); } } } else { uint32_t q = log10pow5(-e2) - (-e2 > 1); // max(0, log10pow5(-e2) - 1) e10 = (int32_t) q + e2; int32_t i = -e2 - (int32_t) q; int32_t k = pow5bits(i) - DOUBLE_POW5_BITCOUNT; int32_t j = (int32_t) q - k; uint64_t *mul = (uint64_t*)DOUBLE_POW5_SPLIT[i]; /* {vm, vr, vp} * 10^e10 = {mm, mv, mp} * 2^e2 */ mul_shift_all(m2, mul, j, shift) if (q <= 1) { /* {vr,vp,vm} is trailing zeros if {mv,mp,mm} has at least q trailing * 0 bits. mv = 4 * m2, so it always has at least two trailing 0 bits. */ vrzeros = true; if (even) { /* mm = mv - 1 - shift, so it has 1 trailing 0 bit if shift = 1 */ vmzeros = shift == 1; } else { /* mp = mv + 2, so it always has at least one trailing 0 bit */ --vp; } } else if (q < 63) { vrzeros = ispow2(mv, q); } } /* Step 4: Find the shortest decimal representation in the interval */ int32_t removed = 0; uint8_t lastrmdig = 0; uint64_t dman; /* On average, we remove ~2 DIGs */ if (vmzeros || vrzeros) { /* General case, which happens rarely (~0.7%) */ for (;;) { uint64_t vpdiv10 = vp / 10; uint64_t vmdiv10 = vm / 10; if (vpdiv10 <= vmdiv10) { break; } uint32_t vmmod10 = ((uint32_t) vm) - 10 * ((uint32_t) vmdiv10); uint64_t vrdiv10 = vr / 10; uint32_t vrmod10 = ((uint32_t) vr) - 10 * ((uint32_t) vrdiv10); vmzeros &= vmmod10 == 0; vrzeros &= lastrmdig == 0; lastrmdig = (uint8_t) vrmod10; vr = vrdiv10; vp = vpdiv10; vm = vmdiv10; ++removed; } if (vmzeros) { for (;;) { uint64_t vmdiv10 = vm / 10; uint32_t vmmod10 = ((uint32_t) vm) - 10 * ((uint32_t) vmdiv10); if (vmmod10 != 0) { break; } uint64_t vpdiv10 = vp / 10; uint64_t vrdiv10 = vr / 10; uint32_t vrmod10 = ((uint32_t) vr) - 10 * ((uint32_t) vrdiv10); vrzeros &= lastrmdig == 0; lastrmdig = (uint8_t) vrmod10; vr = vrdiv10; vp = vpdiv10; vm = vmdiv10; ++removed; } } if (vrzeros && lastrmdig == 5 && vr % 2 == 0) { /* Round even if the exact number is .....50..0 */ lastrmdig = 4; } /* We need to take vr + 1 if vr is outside bounds or we need to round up */ dman = vr + ((vr == vm && (!even || !vmzeros)) || lastrmdig >= 5); } else { /* Specialized for the common case (~99.3%). Percentages below are relative to this */ bool roundup= false; uint64_t vpdiv100 = vp / 100; uint64_t vmdiv100 = vm / 100; if (vpdiv100 > vmdiv100) { // Optimization: remove two DIGs at a time (~86.2%). uint64_t vrdiv100 = vr / 100; uint32_t vrmod100 = ((uint32_t) vr) - 100 * ((uint32_t) vrdiv100); roundup = vrmod100 >= 50; vr = vrdiv100; vp = vpdiv100; vm = vmdiv100; removed += 2; } /* Loop iterations below (approximately), without optimization above: * 0: 0.03%, 1: 13.8%, 2: 70.6%, 3: 14.0%, 4: 1.40%, 5: 0.14%, 6+: 0.02% * Loop iterations below (approximately), with optimization above: * 0: 70.6%, 1: 27.8%, 2: 1.40%, 3: 0.14%, 4+: 0.02% */ for (;;) { uint64_t vpdiv10 = vp / 10; uint64_t vmdiv10 = vm / 10; if (vpdiv10 <= vmdiv10) { break; } uint64_t vrdiv10 = vr / 10; uint32_t vrmod10 = ((uint32_t) vr) - 10 * ((uint32_t) vrdiv10); roundup = vrmod10 >= 5; vr = vrdiv10; vp = vpdiv10; vm = vmdiv10; ++removed; } /* We need to take vr + 1 if vr is outside bounds or we need to round up */ dman = vr + (vr == vm || roundup); } f64_d fd = { .exp = e10 + removed, .man = dman, }; return fd; } /* Print the decimal DIGs from mantissa */ static inline void print_mantissa(uint64_t man, char *out, int mlen) { /* We have at most 17 DIGs, and uint32_t can store 9 DIGs. * If man doesn't fit into uint32_t, we cut off 8 DIGs, * so the rest will fit into uint32_t. */ char *r = out + mlen; if (man < 10) {} if ((man >> 32) != 0) { /* Expensive 64-bit division */ uint64_t q = man / 100000000; uint32_t man2 = ((uint32_t) man) - 100000000 * ((uint32_t) q); man = q; uint32_t c = man2 % 10000; man2 /= 10000; uint32_t d = man2 % 10000; uint32_t c0 = (c % 100) << 1; uint32_t c1 = (c / 100) << 1; uint32_t d0 = (d % 100) << 1; uint32_t d1 = (d / 100) << 1; copy_two_digs(r - 2, DIG_TAB + c0) copy_two_digs(r - 4, DIG_TAB + c1) copy_two_digs(r - 6, DIG_TAB + d0) copy_two_digs(r - 8, DIG_TAB + d1) r -= 8; } uint32_t man2 = (uint32_t) man; while (man2 >= 10000) { #ifdef __clang__ // https://bugs.llvm.org/show_bug.cgi?id=38217 uint32_t c = man2 - 10000 * (man2 / 10000); #else uint32_t c = man2 % 10000; #endif man2 /= 10000; uint32_t c0 = (c % 100) << 1; uint32_t c1 = (c / 100) << 1; copy_two_digs(r - 2, DIG_TAB + c0) copy_two_digs(r - 4, DIG_TAB + c1) r -= 4; } if (man2 >= 100) { uint32_t c = (man2 % 100) << 1; man2 /= 100; copy_two_digs(r - 2, DIG_TAB + c) r -= 2; } if (man2 >= 10) { uint32_t c = man2 << 1; copy_two_digs(r - 2, DIG_TAB + c) } else { *out = (char) ('0' + man2); } } static inline int print_exponent(f64_d v, char *out, int mlen) { int idx = 0; print_mantissa(v.man, out + idx + 1, mlen); /* Print decimal point if needed */ out[idx] = out[idx + 1]; if (mlen > 1) { out[idx + 1] = '.'; idx += mlen + 1; } else { ++idx; } /* Print the exponent */ out[idx++] = 'e'; int32_t exp = v.exp + (int32_t) mlen - 1; if (exp < 0) { out[idx++] = '-'; exp = -exp; } if (exp >= 100) { int32_t c = exp % 10; copy_two_digs(out + idx, DIG_TAB + 2 * (exp / 10)) out[idx + 2] = (char) ('0' + c); idx += 3; } else if (exp >= 10) { copy_two_digs(out + idx, DIG_TAB + 2 * exp) idx += 2; } else { out[idx++] = (char) ('0' + exp); } return idx; } static inline int print_decimal(const f64_d v, char *out, int mlen) { int idx = 0; int lzeros = 0; int rzeros = 0; int point = 0; int exp10 = mlen - 1 + v.exp; /* parse the point idx and additional zeros */ if (exp10 < 0) { lzeros = -exp10; point = 1; } else if (exp10 < mlen - 1) { point = 1 + exp10; } else { rzeros = exp10 - mlen + 1; } int i = 0; /* add left zeros */ if (lzeros) { out[idx++] = '0'; out[idx++] = '.'; point = 0; } for (i = 1; i < lzeros; ++i) { out[idx++] = '0'; } /* add the mantissa DIGs */ print_mantissa(v.man, out + idx, mlen); if (point) { for (i = idx + mlen; i > idx + point; --i) { out[i] = out[i-1]; } out[idx + point] = '.'; idx += 1; } /* add right zeros */ idx += mlen; for (i = 0; i < rzeros; ++i) { out[idx++] = '0'; } return idx; } static inline bool f64tod_exct_int(const uint64_t man, const uint32_t exp, f64_d* v) { uint64_t m2 = (1ull << 52) | man; // implicit 1 int32_t e2 = (int32_t) exp - 1023 - 52; if (e2 > 0 || e2 < -52) { return false; } uint64_t mask = (1ull << -e2) - 1; if ((m2 & mask) != 0) { // with fraction return false; } v->man = m2 >> -e2; v->exp = 0; return true; } static int inline ryu(double val, char *out) { /* Step 1: Decode the floating-point number */ uint64_t bits = *(uint64_t *)(&val); uint64_t man = bits & ((1ull << 52) - 1); uint32_t exp = (uint32_t) ((bits >> 52) & ((1u << 11) - 1)); /* Skip when Infinity */ if (exp == ((1u << 11) - 1u)) { return 0; } f64_d v; /* for integer from [1, 2^53], can be resepensated exactly by double */ bool is_exact_int = f64tod_exct_int(man, exp, &v); if (!is_exact_int){ // find the shortest decimal representation v = f64tod(man, exp); } /* Step 5: Print the decimal representation */ int idx = 0; uint32_t mlen = ctz10(v.man); int exp10 = mlen - 1 + v.exp; /* The format as Go encoding/json package */ bool isexp = exp10 < -6 || exp10 >= 21; if (isexp) // exponent format idx += print_exponent(v, out + idx, mlen); else // decimal format idx += print_decimal(v, out + idx, mlen); /* Terminate the string */ out[idx] = '\0'; return idx; } int f64toa(char *out, double val) { int i = 0; char *p = out; /* simple case of 0.0 */ if (val == 0.0) { *p = '0'; return 1; } /* negative numbers */ if (val < 0.0) { i = 1; val = -val; *p++ = '-'; } /* print the number with Ryu algorithm */ int n = ryu(val, p); return n + i; }