Fix preferred serialization of subnormals (#192)
Preferred Serialization now fully supports conversion to/from half, single and double subnormals. This includes NaN payloads.
The tests for floating-point are much better organized and give greater coverage.
IEEE 754 code is better organized and cleaner.
* Fix preferred serialization of subnormals (checkpoint)
* Check point progress
* Preferred float mostly working and tests passing
* added NaN tests
* Fix up ieee754.h; a few other compiler warnings
* decoding NaN payloads fix; rework half-double; tests
* Code tidiness
* indent to 3, not 4
* TODO's are done in other test; code tidy
* test running with float HW use disabled
* Remove / rearrange float tests
* Fix full float ifdef test fan out
* Code tidiness; sort out final TODO's
---------
Co-authored-by: Laurence Lundblade <lgl@securitytheory.com>
diff --git a/src/ieee754.c b/src/ieee754.c
index a8079f8..2d98159 100644
--- a/src/ieee754.c
+++ b/src/ieee754.c
@@ -1,71 +1,63 @@
-/*==============================================================================
- ieee754.c -- floating-point conversion between half, double & single-precision
-
- Copyright (c) 2018-2020, Laurence Lundblade. All rights reserved.
- Copyright (c) 2021, Arm Limited. All rights reserved.
-
- SPDX-License-Identifier: BSD-3-Clause
-
- See BSD-3-Clause license in README.md
-
- Created on 7/23/18
- =============================================================================*/
+/* ==========================================================================
+ * ieee754.c -- floating-point conversion between half, double & single-precision
+ *
+ * Copyright (c) 2018-2024, Laurence Lundblade. All rights reserved.
+ * Copyright (c) 2021, Arm Limited. All rights reserved.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * See BSD-3-Clause license in README.md
+ *
+ * Created on 7/23/18
+ * ========================================================================== */
/*
- Include before QCBOR_DISABLE_PREFERRED_FLOAT is checked as
- QCBOR_DISABLE_PREFERRED_FLOAT might be defined in qcbor/qcbor_common.h
+ * Include before QCBOR_DISABLE_PREFERRED_FLOAT is checked as
+ * QCBOR_DISABLE_PREFERRED_FLOAT might be defined in qcbor/qcbor_common.h
*/
#include "qcbor/qcbor_common.h"
#ifndef QCBOR_DISABLE_PREFERRED_FLOAT
#include "ieee754.h"
-#include <string.h> // For memcpy()
+#include <string.h> /* For memcpy() */
/*
- This code is written for clarity and verifiability, not for size, on
- the assumption that the optimizer will do a good job. The LLVM
- optimizer, -Os, does seem to do the job and the resulting object code
- is smaller from combining code for the many different cases (normal,
- subnormal, infinity, zero...) for the conversions. GCC is no where near
- as good.
-
- This code has really long lines and is much easier to read because of
- them. Some coding guidelines prefer 80 column lines (can they not afford
- big displays?). It would make this code much worse even to wrap at 120
- columns.
-
- Dead stripping is also really helpful to get code size down when
- floating-point encoding is not needed. (If this is put in a library
- and linking is against the library, then dead stripping is automatic).
-
- This code works solely using shifts and masks and thus has no
- dependency on any math libraries. It can even work if the CPU doesn't
- have any floating-point support, though that isn't the most useful
- thing to do.
-
- The memcpy() dependency is only for CopyFloatToUint32() and friends
- which only is needed to avoid type punning when converting the actual
- float bits to an unsigned value so the bit shifts and masks can work.
- */
-
-/*
- The references used to write this code:
-
- - IEEE 754-2008, particularly section 3.6 and 6.2.1
-
- - https://en.wikipedia.org/wiki/IEEE_754 and subordinate pages
-
- - https://stackoverflow.com/questions/19800415/why-does-ieee-754-reserve-so-many-nan-values
-
- - https://stackoverflow.com/questions/46073295/implicit-type-promotion-rules
-
- - https://stackoverflow.com/questions/589575/what-does-the-c-standard-state-the-size-of-int-long-type-to-be
+ * This code has long lines and is easier to read because of
+ * them. Some coding guidelines prefer 80 column lines (can they not
+ * afford big displays?).
+ *
+ * This code works solely using shifts and masks and thus has no
+ * dependency on any math libraries. It can even work if the CPU
+ * doesn't have any floating-point support, though that isn't the most
+ * useful thing to do.
+ *
+ * The memcpy() dependency is only for CopyFloatToUint32() and friends
+ * which only is needed to avoid type punning when converting the
+ * actual float bits to an unsigned value so the bit shifts and masks
+ * can work.
+ *
+ * The references used to write this code:
+ *
+ * IEEE 754-2008, particularly section 3.6 and 6.2.1
+ *
+ * https://en.wikipedia.org/wiki/IEEE_754 and subordinate pages
+ *
+ * https://stackoverflow.com/questions/19800415/why-does-ieee-754-reserve-so-many-nan-values
+ *
+ * https://stackoverflow.com/questions/46073295/implicit-type-promotion-rules
+ *
+ * https://stackoverflow.com/questions/589575/what-does-the-c-standard-state-the-size-of-int-long-type-to-be
+ *
+ * IEEE754_FloatToDouble(uint32_t uFloat) was created but is not
+ * needed. It can be retrieved from github history if needed.
*/
-// ----- Half Precsion -----------
+
+
+/* ----- Half Precision ----------- */
#define HALF_NUM_SIGNIFICAND_BITS (10)
#define HALF_NUM_EXPONENT_BITS (5)
#define HALF_NUM_SIGN_BITS (1)
@@ -74,16 +66,16 @@
#define HALF_EXPONENT_SHIFT (HALF_NUM_SIGNIFICAND_BITS)
#define HALF_SIGN_SHIFT (HALF_NUM_SIGNIFICAND_BITS + HALF_NUM_EXPONENT_BITS)
-#define HALF_SIGNIFICAND_MASK (0x3ffU) // The lower 10 bits // 0x03ff
+#define HALF_SIGNIFICAND_MASK (0x3ffU) // The lower 10 bits
#define HALF_EXPONENT_MASK (0x1fU << HALF_EXPONENT_SHIFT) // 0x7c00 5 bits of exponent
-#define HALF_SIGN_MASK (0x01U << HALF_SIGN_SHIFT) // // 0x8000 1 bit of sign
+#define HALF_SIGN_MASK (0x01U << HALF_SIGN_SHIFT) // 0x8000 1 bit of sign
#define HALF_QUIET_NAN_BIT (0x01U << (HALF_NUM_SIGNIFICAND_BITS-1)) // 0x0200
/* Biased Biased Unbiased Use
- 0x00 0 -15 0 and subnormal
- 0x01 1 -14 Smallest normal exponent
- 0x1e 30 15 Largest normal exponent
- 0x1F 31 16 NaN and Infinity */
+ * 0x00 0 -15 0 and subnormal
+ * 0x01 1 -14 Smallest normal exponent
+ * 0x1e 30 15 Largest normal exponent
+ * 0x1F 31 16 NaN and Infinity */
#define HALF_EXPONENT_BIAS (15)
#define HALF_EXPONENT_MAX (HALF_EXPONENT_BIAS) // 15 Unbiased
#define HALF_EXPONENT_MIN (-HALF_EXPONENT_BIAS+1) // -14 Unbiased
@@ -91,7 +83,7 @@
#define HALF_EXPONENT_INF_OR_NAN (HALF_EXPONENT_BIAS+1) // 16 Unbiased
-// ------ Single-Precision --------
+/* ------ Single-Precision -------- */
#define SINGLE_NUM_SIGNIFICAND_BITS (23)
#define SINGLE_NUM_EXPONENT_BITS (8)
#define SINGLE_NUM_SIGN_BITS (1)
@@ -106,19 +98,19 @@
#define SINGLE_QUIET_NAN_BIT (0x01U << (SINGLE_NUM_SIGNIFICAND_BITS-1))
/* Biased Biased Unbiased Use
- 0x0000 0 -127 0 and subnormal
- 0x0001 1 -126 Smallest normal exponent
- 0x7f 127 0 1
- 0xfe 254 127 Largest normal exponent
- 0xff 255 128 NaN and Infinity */
+ * 0x0000 0 -127 0 and subnormal
+ * 0x0001 1 -126 Smallest normal exponent
+ * 0x7f 127 0 1
+ * 0xfe 254 127 Largest normal exponent
+ * 0xff 255 128 NaN and Infinity */
#define SINGLE_EXPONENT_BIAS (127)
-#define SINGLE_EXPONENT_MAX (SINGLE_EXPONENT_BIAS) // 127 unbiased
-#define SINGLE_EXPONENT_MIN (-SINGLE_EXPONENT_BIAS+1) // -126 unbiased
-#define SINGLE_EXPONENT_ZERO (-SINGLE_EXPONENT_BIAS) // -127 unbiased
-#define SINGLE_EXPONENT_INF_OR_NAN (SINGLE_EXPONENT_BIAS+1) // 128 unbiased
+#define SINGLE_EXPONENT_MAX (SINGLE_EXPONENT_BIAS)
+#define SINGLE_EXPONENT_MIN (-SINGLE_EXPONENT_BIAS+1)
+#define SINGLE_EXPONENT_ZERO (-SINGLE_EXPONENT_BIAS)
+#define SINGLE_EXPONENT_INF_OR_NAN (SINGLE_EXPONENT_BIAS+1)
-// --------- Double-Precision ----------
+/* --------- Double-Precision ---------- */
#define DOUBLE_NUM_SIGNIFICAND_BITS (52)
#define DOUBLE_NUM_EXPONENT_BITS (11)
#define DOUBLE_NUM_SIGN_BITS (1)
@@ -134,372 +126,518 @@
/* Biased Biased Unbiased Use
- 0x00000000 0 -1023 0 and subnormal
- 0x00000001 1 -1022 Smallest normal exponent
- 0x000007fe 2046 1023 Largest normal exponent
- 0x000007ff 2047 1024 NaN and Infinity */
+ * 0x00000000 0 -1023 0 and subnormal
+ * 0x00000001 1 -1022 Smallest normal exponent
+ * 0x000007fe 2046 1023 Largest normal exponent
+ * 0x000007ff 2047 1024 NaN and Infinity */
#define DOUBLE_EXPONENT_BIAS (1023)
-#define DOUBLE_EXPONENT_MAX (DOUBLE_EXPONENT_BIAS) // unbiased
-#define DOUBLE_EXPONENT_MIN (-DOUBLE_EXPONENT_BIAS+1) // unbiased
-#define DOUBLE_EXPONENT_ZERO (-DOUBLE_EXPONENT_BIAS) // unbiased
-#define DOUBLE_EXPONENT_INF_OR_NAN (DOUBLE_EXPONENT_BIAS+1) // unbiased
+#define DOUBLE_EXPONENT_MAX (DOUBLE_EXPONENT_BIAS)
+#define DOUBLE_EXPONENT_MIN (-DOUBLE_EXPONENT_BIAS+1)
+#define DOUBLE_EXPONENT_ZERO (-DOUBLE_EXPONENT_BIAS)
+#define DOUBLE_EXPONENT_INF_OR_NAN (DOUBLE_EXPONENT_BIAS+1)
+
/*
- Convenient functions to avoid type punning, compiler warnings and
- such. The optimizer reduces them to a simple assignment. This is a
- crusty corner of C. It shouldn't be this hard.
-
- These are also in UsefulBuf.h under a different name. They are copied
- here to avoid a dependency on UsefulBuf.h. There is no object code
- size impact because these always optimze down to a simple assignment.
+ * Convenient functions to avoid type punning, compiler warnings and
+ * such. The optimizer reduces them to a simple assignment. This is a
+ * crusty corner of C. It shouldn't be this hard.
+ *
+ * These are also in UsefulBuf.h under a different name. They are copied
+ * here to avoid a dependency on UsefulBuf.h. There is no object code
+ * size impact because these always optimize down to a simple assignment.
*/
-static inline uint32_t CopyFloatToUint32(float f)
+static inline uint32_t
+CopyFloatToUint32(float f)
{
- uint32_t u32;
- memcpy(&u32, &f, sizeof(uint32_t));
- return u32;
+ uint32_t u32;
+ memcpy(&u32, &f, sizeof(uint32_t));
+ return u32;
}
-static inline uint64_t CopyDoubleToUint64(double d)
+static inline uint64_t
+CopyDoubleToUint64(double d)
{
- uint64_t u64;
- memcpy(&u64, &d, sizeof(uint64_t));
- return u64;
+ uint64_t u64;
+ memcpy(&u64, &d, sizeof(uint64_t));
+ return u64;
}
-static inline double CopyUint64ToDouble(uint64_t u64)
+static inline double
+CopyUint64ToDouble(uint64_t u64)
{
- double d;
- memcpy(&d, &u64, sizeof(uint64_t));
- return d;
+ double d;
+ memcpy(&d, &u64, sizeof(uint64_t));
+ return d;
+}
+
+static inline float
+CopyUint32ToSingle(uint32_t u32)
+{
+ float f;
+ memcpy(&f, &u32, sizeof(uint32_t));
+ return f;
}
-// Public function; see ieee754.h
-uint16_t IEEE754_FloatToHalf(float f)
-{
- // Pull the three parts out of the single-precision float
- const uint32_t uSingle = CopyFloatToUint32(f);
- const int32_t nSingleUnbiasedExponent = (int32_t)((uSingle & SINGLE_EXPONENT_MASK) >> SINGLE_EXPONENT_SHIFT) - SINGLE_EXPONENT_BIAS;
- const uint32_t uSingleSign = (uSingle & SINGLE_SIGN_MASK) >> SINGLE_SIGN_SHIFT;
- const uint32_t uSingleSignificand = uSingle & SINGLE_SIGNIFICAND_MASK;
- // Now convert the three parts to half-precision.
-
- // All works is done on uint32_t with conversion to uint16_t at
- // the end. This avoids integer promotions that static analyzers
- // complain about and reduces code size.
- uint32_t uHalfSign, uHalfSignificand, uHalfBiasedExponent;
-
- if(nSingleUnbiasedExponent == SINGLE_EXPONENT_INF_OR_NAN) {
- // +/- Infinity and NaNs -- single biased exponent is 0xff
- uHalfBiasedExponent = HALF_EXPONENT_INF_OR_NAN + HALF_EXPONENT_BIAS;
- if(!uSingleSignificand) {
- // Infinity
- uHalfSignificand = 0;
- } else {
- // Copy the LSBs of the NaN payload that will fit from the
- // single to the half
- uHalfSignificand = uSingleSignificand & (HALF_SIGNIFICAND_MASK & ~HALF_QUIET_NAN_BIT);
- if(uSingleSignificand & SINGLE_QUIET_NAN_BIT) {
- // It's a qNaN; copy the qNaN bit
- uHalfSignificand |= HALF_QUIET_NAN_BIT;
- } else {
- // It's an sNaN; make sure the significand is not zero
- // so it stays a NaN This is needed because not all
- // significand bits are copied from single
- if(!uHalfSignificand) {
- // Set the LSB. This is what wikipedia shows for
- // sNAN.
- uHalfSignificand |= 0x01;
- }
- }
- }
- } else if(nSingleUnbiasedExponent == SINGLE_EXPONENT_ZERO) {
- // 0 or a subnormal number -- singled biased exponent is 0
- uHalfBiasedExponent = 0;
- uHalfSignificand = 0; // Any subnormal single will be too small to express as a half precision
- } else if(nSingleUnbiasedExponent > HALF_EXPONENT_MAX) {
- // Exponent is too large to express in half-precision; round
- // up to infinity
- uHalfBiasedExponent = HALF_EXPONENT_INF_OR_NAN + HALF_EXPONENT_BIAS;
- uHalfSignificand = 0;
- } else if(nSingleUnbiasedExponent < HALF_EXPONENT_MIN) {
- // Exponent is too small to express in half-precision normal;
- // make it a half-precision subnormal
- uHalfBiasedExponent = HALF_EXPONENT_ZERO + HALF_EXPONENT_BIAS;
- uHalfSignificand = 0;
- // Could convert some of these values to a half-precision
- // subnormal, but the layer above this will never use it. See
- // layer above. There is code to do this in github history
- // for this file, but it was removed because it was never
- // invoked.
- } else {
- // The normal case, exponent is in range for half-precision
- uHalfBiasedExponent = (uint32_t)(nSingleUnbiasedExponent + HALF_EXPONENT_BIAS);
- uHalfSignificand = uSingleSignificand >> (SINGLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);
- }
- uHalfSign = uSingleSign;
-
- // Put the 3 values in the right place for a half precision
- const uint32_t uHalfPrecision = uHalfSignificand |
- (uHalfBiasedExponent << HALF_EXPONENT_SHIFT) |
- (uHalfSign << HALF_SIGN_SHIFT);
- // Cast is safe because all the masks and shifts above work to
- // make a half precision value which is only 16 bits.
- return (uint16_t)uHalfPrecision;
-}
-
-
-// Public function; see ieee754.h
-uint16_t IEEE754_DoubleToHalf(double d)
-{
- // Pull the three parts out of the double-precision float
- const uint64_t uDouble = CopyDoubleToUint64(d);
- const int64_t nDoubleUnbiasedExponent = (int64_t)((uDouble & DOUBLE_EXPONENT_MASK) >> DOUBLE_EXPONENT_SHIFT) - DOUBLE_EXPONENT_BIAS;
- const uint64_t uDoubleSign = (uDouble & DOUBLE_SIGN_MASK) >> DOUBLE_SIGN_SHIFT;
- const uint64_t uDoubleSignificand = uDouble & DOUBLE_SIGNIFICAND_MASK;
-
- // Now convert the three parts to half-precision.
-
- // All works is done on uint64_t with conversion to uint16_t at
- // the end. This avoids integer promotions that static analyzers
- // complain about. Other options are for these to be unsigned int
- // or fast_int16_t. Code size doesn't vary much between all these
- // options for 64-bit LLVM, 64-bit GCC and 32-bit Armv7 LLVM.
- uint64_t uHalfSign, uHalfSignificand, uHalfBiasedExponent;
-
- if(nDoubleUnbiasedExponent == DOUBLE_EXPONENT_INF_OR_NAN) {
- // +/- Infinity and NaNs -- single biased exponent is 0xff
- uHalfBiasedExponent = HALF_EXPONENT_INF_OR_NAN + HALF_EXPONENT_BIAS;
- if(!uDoubleSignificand) {
- // Infinity
- uHalfSignificand = 0;
- } else {
- // Copy the LSBs of the NaN payload that will fit from the
- // double to the half
- uHalfSignificand = uDoubleSignificand & (HALF_SIGNIFICAND_MASK & ~HALF_QUIET_NAN_BIT);
- if(uDoubleSignificand & DOUBLE_QUIET_NAN_BIT) {
- // It's a qNaN; copy the qNaN bit
- uHalfSignificand |= HALF_QUIET_NAN_BIT;
- } else {
- // It's an sNaN; make sure the significand is not zero
- // so it stays a NaN This is needed because not all
- // significand bits are copied from single
- if(!uHalfSignificand) {
- // Set the LSB. This is what wikipedia shows for
- // sNAN.
- uHalfSignificand |= 0x01;
- }
- }
- }
- } else if(nDoubleUnbiasedExponent == DOUBLE_EXPONENT_ZERO) {
- // 0 or a subnormal number -- double biased exponent is 0
- uHalfBiasedExponent = 0;
- uHalfSignificand = 0; // Any subnormal single will be too small to express as a half precision; TODO, is this really true?
- } else if(nDoubleUnbiasedExponent > HALF_EXPONENT_MAX) {
- // Exponent is too large to express in half-precision; round
- // up to infinity; TODO, is this really true?
- uHalfBiasedExponent = HALF_EXPONENT_INF_OR_NAN + HALF_EXPONENT_BIAS;
- uHalfSignificand = 0;
- } else if(nDoubleUnbiasedExponent < HALF_EXPONENT_MIN) {
- // Exponent is too small to express in half-precision; round
- // down to zero
- uHalfBiasedExponent = HALF_EXPONENT_ZERO + HALF_EXPONENT_BIAS;
- uHalfSignificand = 0;
- // Could convert some of these values to a half-precision
- // subnormal, but the layer above this will never use it. See
- // layer above. There is code to do this in github history
- // for this file, but it was removed because it was never
- // invoked.
- } else {
- // The normal case, exponent is in range for half-precision
- uHalfBiasedExponent = (uint32_t)(nDoubleUnbiasedExponent + HALF_EXPONENT_BIAS);
- uHalfSignificand = uDoubleSignificand >> (DOUBLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);
- }
- uHalfSign = uDoubleSign;
-
-
- // Put the 3 values in the right place for a half precision
- const uint64_t uHalfPrecision = uHalfSignificand |
- (uHalfBiasedExponent << HALF_EXPONENT_SHIFT) |
- (uHalfSign << HALF_SIGN_SHIFT);
- // Cast is safe because all the masks and shifts above work to
- // make a half precision value which is only 16 bits.
- return (uint16_t)uHalfPrecision;
-}
-
-
-/*
- EEE754_HalfToFloat() was created but is not needed. It can be retrieved from
- github history if needed.
+/**
+ * @brief Assemble sign, significand and exponent into double-precision float.
+ *
+ * @param[in] uDoubleSign 0 if positive, 1 if negative
+ * @param[in] uDoubleSignificand Bits of the significand
+ * @param[in] nDoubleUnBiasedExponent Exponent
+ *
+ * This returns the assembled double-precision float, a binary64
+ * as specified in IEEE754.
*/
-
-
-// Public function; see ieee754.h
-double IEEE754_HalfToDouble(uint16_t uHalfPrecision)
+static double
+IEEE754_AssembleDouble(uint64_t uDoubleSign,
+ uint64_t uDoubleSignificand,
+ int64_t nDoubleUnBiasedExponent)
{
- // Pull out the three parts of the half-precision float. Do all
- // the work in 64 bits because that is what the end result is. It
- // may give smaller code size and will keep static analyzers
- // happier.
- const uint64_t uHalfSignificand = uHalfPrecision & HALF_SIGNIFICAND_MASK;
- const int64_t nHalfUnBiasedExponent = (int64_t)((uHalfPrecision & HALF_EXPONENT_MASK) >> HALF_EXPONENT_SHIFT) - HALF_EXPONENT_BIAS;
- const uint64_t uHalfSign = (uHalfPrecision & HALF_SIGN_MASK) >> HALF_SIGN_SHIFT;
+ uint64_t uDoubleBiasedExponent;
+
+ uDoubleBiasedExponent = (uint64_t)(nDoubleUnBiasedExponent + DOUBLE_EXPONENT_BIAS);
+
+ return CopyUint64ToDouble(uDoubleSignificand |
+ (uDoubleBiasedExponent << DOUBLE_EXPONENT_SHIFT) |
+ (uDoubleSign << DOUBLE_SIGN_SHIFT));
+}
- // Make the three parts of hte single-precision number
- uint64_t uDoubleSignificand, uDoubleSign, uDoubleBiasedExponent;
- if(nHalfUnBiasedExponent == HALF_EXPONENT_ZERO) {
- // 0 or subnormal
- uDoubleBiasedExponent = DOUBLE_EXPONENT_ZERO + DOUBLE_EXPONENT_BIAS;
- if(uHalfSignificand) {
- // Subnormal case
- uDoubleBiasedExponent = -HALF_EXPONENT_BIAS + DOUBLE_EXPONENT_BIAS +1;
- // A half-precision subnormal can always be converted to a
- // normal double-precision float because the ranges line
- // up
- uDoubleSignificand = uHalfSignificand;
- // Shift bits from right of the decimal to left, reducing
- // the exponent by 1 each time
- do {
- uDoubleSignificand <<= 1;
- uDoubleBiasedExponent--;
- } while ((uDoubleSignificand & 0x400) == 0);
- uDoubleSignificand &= HALF_SIGNIFICAND_MASK;
- uDoubleSignificand <<= (DOUBLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);
+double
+IEEE754_HalfToDouble(uint16_t uHalfPrecision)
+{
+ uint64_t uDoubleSignificand;
+ int64_t nDoubleUnBiasedExponent;
+ double dResult;
+
+ /* Pull out the three parts of the half-precision float. Do all
+ * the work in 64 bits because that is what the end result is. It
+ * may give smaller code size and will keep static analyzers
+ * happier.
+ */
+ const uint64_t uHalfSignificand = uHalfPrecision & HALF_SIGNIFICAND_MASK;
+ const uint64_t uHalfBiasedExponent = (uHalfPrecision & HALF_EXPONENT_MASK) >> HALF_EXPONENT_SHIFT;
+ const int64_t nHalfUnBiasedExponent = (int64_t)uHalfBiasedExponent - HALF_EXPONENT_BIAS;
+ const uint64_t uHalfSign = (uHalfPrecision & HALF_SIGN_MASK) >> HALF_SIGN_SHIFT;
+
+ if(nHalfUnBiasedExponent == HALF_EXPONENT_ZERO) {
+ /* 0 or subnormal */
+ if(uHalfSignificand) {
+ /* --- SUBNORMAL --- */
+ /* A half-precision subnormal can always be converted to a
+ * normal double-precision float because the ranges line up.
+ * The exponent of a subnormal starts out at the min exponent
+ * for a normal. As the subnormal significand bits are
+ * shifted left to normalize, the exponent is
+ * decremented. Shifting continues until fully normalized.
+ */
+ nDoubleUnBiasedExponent = HALF_EXPONENT_MIN;
+ uDoubleSignificand = uHalfSignificand;
+ do {
+ uDoubleSignificand <<= 1;
+ nDoubleUnBiasedExponent--;
+ } while ((uDoubleSignificand & (1ULL << HALF_NUM_SIGNIFICAND_BITS)) == 0);
+ /* A normal has an implied 1 in the most significant
+ * position that a subnormal doesn't. */
+ uDoubleSignificand -= 1ULL << HALF_NUM_SIGNIFICAND_BITS;
+ /* Must shift into place for a double significand */
+ uDoubleSignificand <<= DOUBLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS;
+
+ dResult = IEEE754_AssembleDouble(uHalfSign,
+ uDoubleSignificand,
+ nDoubleUnBiasedExponent);
+ } else {
+ /* --- ZERO --- */
+ dResult = IEEE754_AssembleDouble(uHalfSign,
+ 0,
+ DOUBLE_EXPONENT_ZERO);
+ }
+ } else if(nHalfUnBiasedExponent == HALF_EXPONENT_INF_OR_NAN) {
+ /* NaN or Infinity */
+ if(uHalfSignificand) {
+ /* --- NaN --- */
+ /* Half-precision payloads always fit into double precision
+ * payloads. They are shifted left the same as a normal
+ * number significand.
+ */
+ uDoubleSignificand = uHalfSignificand << (DOUBLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);
+ dResult = IEEE754_AssembleDouble(uHalfSign,
+ uDoubleSignificand,
+ DOUBLE_EXPONENT_INF_OR_NAN);
+ } else {
+ /* --- INFINITY --- */
+ dResult = IEEE754_AssembleDouble(uHalfSign,
+ 0,
+ DOUBLE_EXPONENT_INF_OR_NAN);
+ }
+ } else {
+ /* --- NORMAL NUMBER --- */
+ uDoubleSignificand = uHalfSignificand << (DOUBLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);
+ dResult = IEEE754_AssembleDouble(uHalfSign,
+ uDoubleSignificand,
+ nHalfUnBiasedExponent);
+ }
+
+ return dResult;
+}
+
+
+/**
+ * @brief Assemble sign, significand and exponent into half-precision float.
+ *
+ * @param[in] uHalfSign 0 if positive, 1 if negative
+ * @param[in] uHalfSignificand Bits of the significand
+ * @param[in] nHalfUnBiasedExponent Exponent
+ *
+ * This returns the bits for a half-precision float, a binary16 as
+ * specified in IEEE754. It is returned as a uint32_t rather than a
+ * uint16_t for convenience of usage.
+ */
+static uint32_t
+IEEE754_AssembleHalf(uint32_t uHalfSign,
+ uint32_t uHalfSignificand,
+ int32_t nHalfUnBiasedExponent)
+{
+ uint32_t uHalfUnbiasedExponent;
+
+ uHalfUnbiasedExponent = (uint32_t)(nHalfUnBiasedExponent + HALF_EXPONENT_BIAS);
+
+ return uHalfSignificand |
+ (uHalfUnbiasedExponent << HALF_EXPONENT_SHIFT) |
+ (uHalfSign << HALF_SIGN_SHIFT);
+}
+
+
+/* Public function; see ieee754.h */
+IEEE754_union
+IEEE754_SingleToHalf(float f)
+{
+ IEEE754_union result;
+ uint32_t uDroppedBits;
+ int32_t nExponentDifference;
+ int32_t nShiftAmount;
+ uint32_t uHalfSignificand;
+
+ /* Pull the three parts out of the single-precision float. Most work
+ * is done with uint32_t which helps avoid integer promotions and
+ * static analyzer complaints.
+ */
+ const uint32_t uSingle = CopyFloatToUint32(f);
+ const uint32_t uSingleBiasedExponent = (uSingle & SINGLE_EXPONENT_MASK) >> SINGLE_EXPONENT_SHIFT;
+ const int32_t nSingleUnbiasedExponent = (int32_t)uSingleBiasedExponent - SINGLE_EXPONENT_BIAS;
+ const uint32_t uSingleSignificand = uSingle & SINGLE_SIGNIFICAND_MASK;
+ const uint32_t uSingleSign = (uSingle & SINGLE_SIGN_MASK) >> SINGLE_SIGN_SHIFT;
+
+ if(nSingleUnbiasedExponent == SINGLE_EXPONENT_ZERO) {
+ if(uSingleSignificand == 0) {
+ /* --- IS ZERO --- */
+ result.uSize = IEEE754_UNION_IS_HALF;
+ result.uValue = IEEE754_AssembleHalf(uSingleSign,
+ 0,
+ HALF_EXPONENT_ZERO);
+ } else {
+ /* --- IS SINGLE SUBNORMAL --- */
+ /* The largest single subnormal is slightly less than the
+ * smallest single normal which is 2^-126 or
+ * 1.1754943508222875e-38. The smallest half subnormal is
+ * 2^-24 or 5.9604644775390625E-8. There is no overlap so
+ * single subnormals can't be converted to halves of any sort.
+ */
+ result.uSize = IEEE754_UNION_IS_SINGLE;
+ result.uValue = uSingle;
+ }
+ } else if(nSingleUnbiasedExponent == SINGLE_EXPONENT_INF_OR_NAN) {
+ if(uSingleSignificand == 0) {
+ /* ---- IS INFINITY ---- */
+ result.uSize = IEEE754_UNION_IS_HALF;
+ result.uValue = IEEE754_AssembleHalf(uSingleSign, 0, HALF_EXPONENT_INF_OR_NAN);
+ } else {
+ /* The NaN can only be converted if no payload bits are lost
+ * per RFC 8949 section 4.1 that defines Preferred
+ * Serialization. Note that Deterministically Encoded CBOR in
+ * section 4.2 allows for some variation of this rule, but at
+ * the moment this implementation is of Preferred
+ * Serialization, not CDE. As of December 2023, we are also
+ * expecting an update to CDE. This code may need to be
+ * updated for CDE.
+ */
+ uDroppedBits = uSingleSignificand & (SINGLE_SIGNIFICAND_MASK >> HALF_NUM_SIGNIFICAND_BITS);
+ if(uDroppedBits == 0) {
+ /* --- IS CONVERTABLE NAN --- */
+ uHalfSignificand = uSingleSignificand >> (SINGLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);
+ result.uSize = IEEE754_UNION_IS_HALF;
+ result.uValue = IEEE754_AssembleHalf(uSingleSign,
+ uHalfSignificand,
+ HALF_EXPONENT_INF_OR_NAN);
+
+ } else {
+ /* --- IS UNCONVERTABLE NAN --- */
+ result.uSize = IEEE754_UNION_IS_SINGLE;
+ result.uValue = uSingle;
+ }
+ }
+ } else {
+ /* ---- REGULAR NUMBER ---- */
+ /* A regular single can be converted to a regular half if the
+ * single's exponent is in the smaller range of a half and if no
+ * precision is lost in the significand.
+ */
+ if(nSingleUnbiasedExponent >= HALF_EXPONENT_MIN &&
+ nSingleUnbiasedExponent <= HALF_EXPONENT_MAX &&
+ (uSingleSignificand & (SINGLE_SIGNIFICAND_MASK >> HALF_NUM_SIGNIFICAND_BITS)) == 0) {
+ uHalfSignificand = uSingleSignificand >> (SINGLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);
+
+ /* --- CONVERT TO HALF NORMAL --- */
+ result.uSize = IEEE754_UNION_IS_HALF;
+ result.uValue = IEEE754_AssembleHalf(uSingleSign,
+ uHalfSignificand,
+ nSingleUnbiasedExponent);
+ } else {
+ /* Unable to convert to a half normal. See if it can be
+ * converted to a half subnormal. To do that, the exponent
+ * must be in range and no precision can be lost in the
+ * significand.
+ *
+ * This is more complicated because the number is not
+ * normalized. The significand must be shifted proportionally
+ * to the exponent and 1 must be added in. See
+ * https://en.wikipedia.org/wiki/Single-precision_floating-point_format#Exponent_encoding
+ *
+ * Exponents -14 to -24 map to a shift of 0 to 10 of the
+ * significand. The largest value of a half subnormal has an
+ * exponent of -14. Subnormals are not normalized like
+ * normals meaning they lose precision as the numbers get
+ * smaller. Normals don't lose precision because the exponent
+ * allows all the bits of the significand to be significant.
+ */
+ /* The exponent of the largest possible half-precision
+ * subnormal is HALF_EXPONENT_MIN (-14). Exponents larger
+ * than this are normal and handled above. We're going to
+ * shift the significand right by at least this amount.
+ */
+ nExponentDifference = -(nSingleUnbiasedExponent - HALF_EXPONENT_MIN);
+
+ /* In addition to the shift based on the exponent's value,
+ * the single significand has to be shifted right to fit into
+ * a half-precision significand */
+ nShiftAmount = nExponentDifference + (SINGLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);
+
+ /* Must add 1 in to the possible significand because there is
+ * an implied 1 for normal values and not for subnormal
+ * values. See equations here:
+ * https://en.wikipedia.org/wiki/Single-precision_floating-point_format#Exponent_encoding
+ */
+ uHalfSignificand = (uSingleSignificand + (1 << SINGLE_NUM_SIGNIFICAND_BITS)) >> nShiftAmount;
+
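+ /* If only zero bits get shifted out, this can be converted to subnormal.
+ * NOTE(review): the right shift by nShiftAmount above runs before this range check, so nShiftAmount can be negative or >= 32 there (undefined behavior in C); it should be guarded by the exponent range test. */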
+ if(nSingleUnbiasedExponent < HALF_EXPONENT_MIN &&
+ nSingleUnbiasedExponent >= HALF_EXPONENT_MIN - HALF_NUM_SIGNIFICAND_BITS &&
+ uHalfSignificand << nShiftAmount == uSingleSignificand + (1 << SINGLE_NUM_SIGNIFICAND_BITS)) {
+ /* --- CONVERTABLE TO HALF SUBNORMAL --- */
+ result.uSize = IEEE754_UNION_IS_HALF;
+ result.uValue = IEEE754_AssembleHalf(uSingleSign,
+ uHalfSignificand,
+ HALF_EXPONENT_ZERO);
+ } else {
+ /* --- DO NOT CONVERT --- */
+ result.uSize = IEEE754_UNION_IS_SINGLE;
+ result.uValue = uSingle;
+ }
+ }
+ }
+
+ return result;
+}
+
+
+/**
+ * @brief Assemble sign, significand and exponent into single precision float.
+ *
+ * @param[in] uSingleSign 0 if positive, 1 if negative
+ * @param[in] uSingleSignificand Bits of the significand
+ * @param[in] nSingleUnBiasedExponent Exponent
+ *
+ * This returns the bits for a single-precision float, a binary32 as
+ * specified in IEEE754. It is returned as a uint64_t rather than a
+ * uint32_t or a float for convenience of usage.
+ */
+static uint64_t
+IEEE754_AssembleSingle(uint64_t uSingleSign,
+ uint64_t uSingleSignificand,
+ int64_t nSingleUnBiasedExponent)
+{
+ uint64_t uSingleBiasedExponent;
+
+ uSingleBiasedExponent = (uint64_t)(nSingleUnBiasedExponent + SINGLE_EXPONENT_BIAS);
+
+ return uSingleSignificand |
+ (uSingleBiasedExponent << SINGLE_EXPONENT_SHIFT) |
+ (uSingleSign << SINGLE_SIGN_SHIFT);
+}
+
+
+/**
+ * @brief Convert a double-precision float to single-precision.
+ *
+ * @param[in] d The value to convert.
+ *
+ * @returns Either unconverted value or value converted to single-precision.
+ *
+ * This always succeeds. If the value cannot be converted without the
+ * loss of precision, it is not converted.
+ *
+ * This handles all subnormals and NaN payloads.
+ */
+static IEEE754_union
+IEEE754_DoubleToSingle(double d)
+{
+   IEEE754_union Result;
+   int64_t       nExponentDifference;
+   int64_t       nShiftAmount;
+   uint64_t      uSingleSignificand;
+   uint64_t      uFullSignificand;
+   uint64_t      uDroppedBits;
+
+
+   /* Pull the three parts out of the double-precision float. Most
+    * work is done with uint64_t which helps avoid integer promotions
+    * and static analyzer complaints.
+    */
+   const uint64_t uDouble                 = CopyDoubleToUint64(d);
+   const uint64_t uDoubleBiasedExponent   = (uDouble & DOUBLE_EXPONENT_MASK) >> DOUBLE_EXPONENT_SHIFT;
+   const int64_t  nDoubleUnbiasedExponent = (int64_t)uDoubleBiasedExponent - DOUBLE_EXPONENT_BIAS;
+   const uint64_t uDoubleSign             = (uDouble & DOUBLE_SIGN_MASK) >> DOUBLE_SIGN_SHIFT;
+   const uint64_t uDoubleSignificand      = uDouble & DOUBLE_SIGNIFICAND_MASK;
+
+
+   if(nDoubleUnbiasedExponent == DOUBLE_EXPONENT_ZERO) {
+      if(uDoubleSignificand == 0) {
+         /* --- IS ZERO --- */
+         Result.uSize  = IEEE754_UNION_IS_SINGLE;
+         Result.uValue = IEEE754_AssembleSingle(uDoubleSign,
+                                                0,
+                                                SINGLE_EXPONENT_ZERO);
       } else {
- // Just zero
- uDoubleSignificand = 0;
- }
- } else if(nHalfUnBiasedExponent == HALF_EXPONENT_INF_OR_NAN) {
- // NaN or Inifinity
- uDoubleBiasedExponent = DOUBLE_EXPONENT_INF_OR_NAN + DOUBLE_EXPONENT_BIAS;
- if(uHalfSignificand) {
- // NaN
- // First preserve the NaN payload from half to single
- uDoubleSignificand = uHalfSignificand & ~HALF_QUIET_NAN_BIT;
- if(uHalfSignificand & HALF_QUIET_NAN_BIT) {
- // Next, set qNaN if needed since half qNaN bit is not
- // copied above
- uDoubleSignificand |= DOUBLE_QUIET_NAN_BIT;
+         /* --- IS DOUBLE SUBNORMAL --- */
+         /* The largest double subnormal is slightly less than the
+          * smallest double normal which is 2^-1022 or
+          * 2.2250738585072014e-308. The smallest single subnormal
+          * is 2^-149 or 1.401298464324817e-45. There is no
+          * overlap so double subnormals can't be converted to
+          * singles of any sort.
+          */
+         Result.uSize  = IEEE754_UNION_IS_DOUBLE;
+         Result.uValue = uDouble;
+      }
+   } else if(nDoubleUnbiasedExponent == DOUBLE_EXPONENT_INF_OR_NAN) {
+      if(uDoubleSignificand == 0) {
+         /* ---- IS INFINITY ---- */
+         Result.uSize  = IEEE754_UNION_IS_SINGLE;
+         Result.uValue = IEEE754_AssembleSingle(uDoubleSign,
+                                                0,
+                                                SINGLE_EXPONENT_INF_OR_NAN);
+      } else {
+         /* The NaN can only be converted if no payload bits are
+          * lost per RFC 8949 section 4.1 that defines Preferred
+          * Serialization. Note that Deterministically Encoded CBOR
+          * in section 4.2 allows for some variation of this rule,
+          * but at the moment this implementation is of Preferred
+          * Serialization, not CDE. As of December 2023, we are
+          * also expecting an update to CDE. This code may need to
+          * be updated for CDE.
+          */
+         uDroppedBits = uDoubleSignificand & (DOUBLE_SIGNIFICAND_MASK >> SINGLE_NUM_SIGNIFICAND_BITS);
+         if(uDroppedBits == 0) {
+            /* --- IS CONVERTIBLE NAN --- */
+            uSingleSignificand = uDoubleSignificand >> (DOUBLE_NUM_SIGNIFICAND_BITS - SINGLE_NUM_SIGNIFICAND_BITS);
+            Result.uSize  = IEEE754_UNION_IS_SINGLE;
+            Result.uValue = IEEE754_AssembleSingle(uDoubleSign,
+                                                   uSingleSignificand,
+                                                   SINGLE_EXPONENT_INF_OR_NAN);
+         } else {
+            /* --- IS UNCONVERTIBLE NAN --- */
+            Result.uSize  = IEEE754_UNION_IS_DOUBLE;
+            Result.uValue = uDouble;
          }
+      }
+   } else {
+      /* ---- REGULAR NUMBER ---- */
+      /* A regular double can be converted to a regular single if
+       * the double's exponent is in the smaller range of a single
+       * and if no precision is lost in the significand.
+       */
+      uDroppedBits = uDoubleSignificand & (DOUBLE_SIGNIFICAND_MASK >> SINGLE_NUM_SIGNIFICAND_BITS);
+      if(nDoubleUnbiasedExponent >= SINGLE_EXPONENT_MIN &&
+         nDoubleUnbiasedExponent <= SINGLE_EXPONENT_MAX &&
+         uDroppedBits == 0) {
+         /* --- IS CONVERTIBLE TO SINGLE --- */
+         uSingleSignificand = uDoubleSignificand >> (DOUBLE_NUM_SIGNIFICAND_BITS - SINGLE_NUM_SIGNIFICAND_BITS);
+         Result.uSize  = IEEE754_UNION_IS_SINGLE;
+         Result.uValue = IEEE754_AssembleSingle(uDoubleSign,
+                                                uSingleSignificand,
+                                                nDoubleUnbiasedExponent);
       } else {
- // Infinity
- uDoubleSignificand = 0;
+         /* Unable to convert to a single normal. See if it can be
+          * converted to a single subnormal. To do that, the
+          * exponent must be in range and no precision can be lost
+          * in the significand.
+          *
+          * This is more complicated because the number is not
+          * normalized. The significand must be shifted
+          * proportionally to the exponent and the implied 1 must
+          * be added in. See
+          * https://en.wikipedia.org/wiki/Single-precision_floating-point_format#Exponent_encoding
+          *
+          * The exponent range is checked BEFORE any shifting.
+          * Shifting a uint64_t by a negative amount or by 64 or
+          * more is undefined behavior in C, and that is what an
+          * out-of-range exponent would produce. Inside the range
+          * check the shift amount is always between
+          * (DOUBLE_NUM_SIGNIFICAND_BITS - SINGLE_NUM_SIGNIFICAND_BITS) + 1
+          * and DOUBLE_NUM_SIGNIFICAND_BITS inclusive.
+          */
+
+         /* --- CAN NOT BE CONVERTED (unless proven otherwise below) --- */
+         Result.uSize  = IEEE754_UNION_IS_DOUBLE;
+         Result.uValue = uDouble;
+
+         if(nDoubleUnbiasedExponent < SINGLE_EXPONENT_MIN &&
+            nDoubleUnbiasedExponent >= SINGLE_EXPONENT_MIN - SINGLE_NUM_SIGNIFICAND_BITS) {
+            nExponentDifference = SINGLE_EXPONENT_MIN - nDoubleUnbiasedExponent;
+            nShiftAmount        = nExponentDifference + (DOUBLE_NUM_SIGNIFICAND_BITS - SINGLE_NUM_SIGNIFICAND_BITS);
+            uFullSignificand    = uDoubleSignificand + (1ULL << DOUBLE_NUM_SIGNIFICAND_BITS);
+            uSingleSignificand  = uFullSignificand >> nShiftAmount;
+
+            /* Shifting back must reproduce the full significand,
+             * otherwise non-zero bits were dropped and precision
+             * would be lost.
+             */
+            if(uSingleSignificand << nShiftAmount == uFullSignificand) {
+               /* --- IS CONVERTIBLE TO SINGLE SUBNORMAL --- */
+               Result.uSize  = IEEE754_UNION_IS_SINGLE;
+               Result.uValue = IEEE754_AssembleSingle(uDoubleSign,
+                                                      uSingleSignificand,
+                                                      SINGLE_EXPONENT_ZERO);
+            }
+         }
       }
- } else {
- // Normal number
- uDoubleBiasedExponent = (uint64_t)(nHalfUnBiasedExponent + DOUBLE_EXPONENT_BIAS);
- uDoubleSignificand = uHalfSignificand << (DOUBLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);
    }
- uDoubleSign = uHalfSign;
-
- // Shift the 3 parts into place as a double-precision
- const uint64_t uDouble = uDoubleSignificand |
- (uDoubleBiasedExponent << DOUBLE_EXPONENT_SHIFT) |
- (uDoubleSign << DOUBLE_SIGN_SHIFT);
- return CopyUint64ToDouble(uDouble);
+   return Result;
 }
-
-/*
- IEEE754_FloatToDouble(uint32_t uFloat) was created but is not needed. It can be retrieved from
-github history if needed.
-*/
-
-
-
-// Public function; see ieee754.h
-IEEE754_union IEEE754_FloatToSmallest(float f)
+/* Public function; see ieee754.h */
+IEEE754_union
+IEEE754_DoubleToSmaller(double d, int bAllowHalfPrecision)
 {
- IEEE754_union result;
- // Pull the neeed two parts out of the single-precision float
- const uint32_t uSingle = CopyFloatToUint32(f);
- const int32_t nSingleExponent = (int32_t)((uSingle & SINGLE_EXPONENT_MASK) >> SINGLE_EXPONENT_SHIFT) - SINGLE_EXPONENT_BIAS;
- const uint32_t uSingleSignificand = uSingle & SINGLE_SIGNIFICAND_MASK;
+   /* Stage one: reduce to single-precision when that loses nothing;
+    * otherwise this still holds the original double.
+    */
+   const IEEE754_union SingleOrDouble = IEEE754_DoubleToSingle(d);
- // Bit mask that is the significand bits that would be lost when
- // converting from single-precision to half-precision
- const uint64_t uDroppedSingleBits = SINGLE_SIGNIFICAND_MASK >> HALF_NUM_SIGNIFICAND_BITS;
+
+   /* Nothing more to do if the value had to stay a double or the
+    * caller did not ask for half-precision.
+    */
+   if(!bAllowHalfPrecision || SingleOrDouble.uSize != IEEE754_UNION_IS_SINGLE) {
+      return SingleOrDouble;
+   }
- // Optimizer will re organize so there is only one call to
- // IEEE754_FloatToHalf() in the final code.
- if(uSingle == 0) {
- // Value is 0.0000, not a a subnormal
- result.uSize = IEEE754_UNION_IS_HALF;
- result.uValue = IEEE754_FloatToHalf(f);
- } else if(nSingleExponent == SINGLE_EXPONENT_INF_OR_NAN) {
- // NaN, +/- infinity
- result.uSize = IEEE754_UNION_IS_HALF;
- result.uValue = IEEE754_FloatToHalf(f);
- } else if((nSingleExponent >= HALF_EXPONENT_MIN) && nSingleExponent <= HALF_EXPONENT_MAX && (!(uSingleSignificand & uDroppedSingleBits))) {
- // Normal number in exponent range and precision won't be lost
- result.uSize = IEEE754_UNION_IS_HALF;
- result.uValue = IEEE754_FloatToHalf(f);
- } else {
- // Subnormal, exponent out of range, or precision will be lost
- result.uSize = IEEE754_UNION_IS_SINGLE;
- result.uValue = uSingle;
- }
-
- return result;
+
+   /* Stage two: try to reduce the single to half-precision. The
+    * narrowing cast to uint32_t is safe because the value was just
+    * successfully converted to a single.
+    */
+   return IEEE754_SingleToHalf(CopyUint32ToSingle((uint32_t)SingleOrDouble.uValue));
 }
-// Public function; see ieee754.h
-IEEE754_union IEEE754_DoubleToSmallestInternal(double d, int bAllowHalfPrecision)
-{
- IEEE754_union result;
- // Pull the needed two parts out of the double-precision float
- const uint64_t uDouble = CopyDoubleToUint64(d);
- const int64_t nDoubleExponent = (int64_t)((uDouble & DOUBLE_EXPONENT_MASK) >> DOUBLE_EXPONENT_SHIFT) - DOUBLE_EXPONENT_BIAS;
- const uint64_t uDoubleSignificand = uDouble & DOUBLE_SIGNIFICAND_MASK;
+#else /* QCBOR_DISABLE_PREFERRED_FLOAT */
- // Masks to check whether dropped significand bits are zero or not
- const uint64_t uDroppedHalfBits = DOUBLE_SIGNIFICAND_MASK >> HALF_NUM_SIGNIFICAND_BITS;
- const uint64_t uDroppedSingleBits = DOUBLE_SIGNIFICAND_MASK >> SINGLE_NUM_SIGNIFICAND_BITS;
-
- // This will not convert to half-precion or single-precision
- // subnormals. Values that could be converted will be output as
- // the double they are or occasionally to a normal single. This
- // could be implemented, but it is more code and would rarely be
- // used and rarely reduce the output size.
-
- // The various cases
- if(d == 0.0) { // Take care of positive and negative zero
- // Value is 0.0000, not a a subnormal
- result.uSize = IEEE754_UNION_IS_HALF;
- result.uValue = IEEE754_DoubleToHalf(d);
- } else if(nDoubleExponent == DOUBLE_EXPONENT_INF_OR_NAN) {
- // NaN, +/- infinity
- result.uSize = IEEE754_UNION_IS_HALF;
- result.uValue = IEEE754_DoubleToHalf(d);
- } else if(bAllowHalfPrecision && (nDoubleExponent >= HALF_EXPONENT_MIN) && nDoubleExponent <= HALF_EXPONENT_MAX && (!(uDoubleSignificand & uDroppedHalfBits))) {
- // Can convert to half without precision loss
- result.uSize = IEEE754_UNION_IS_HALF;
- result.uValue = IEEE754_DoubleToHalf(d);
- } else if((nDoubleExponent >= SINGLE_EXPONENT_MIN) && nDoubleExponent <= SINGLE_EXPONENT_MAX && (!(uDoubleSignificand & uDroppedSingleBits))) {
- // Can convert to single without precision loss
- result.uSize = IEEE754_UNION_IS_SINGLE;
- result.uValue = CopyFloatToUint32((float)d);
- } else {
- // Can't convert without precision loss
- result.uSize = IEEE754_UNION_IS_DOUBLE;
- result.uValue = uDouble;
- }
-
- return result;
-}
-
-#else
-
-int x;
+int ieee754_dummy_place_holder;
#endif /* QCBOR_DISABLE_PREFERRED_FLOAT */
diff --git a/src/ieee754.h b/src/ieee754.h
index d37532a..863019b 100644
--- a/src/ieee754.h
+++ b/src/ieee754.h
@@ -1,14 +1,14 @@
-/*==============================================================================
- ieee754.c -- floating-point conversion between half, double & single-precision
-
- Copyright (c) 2018-2020, Laurence Lundblade. All rights reserved.
-
- SPDX-License-Identifier: BSD-3-Clause
-
- See BSD-3-Clause license in README.md
-
- Created on 7/23/18
- =============================================================================*/
+/* ==========================================================================
+ * ieee754.h -- Conversion between half, double & single-precision floats
+ *
+ * Copyright (c) 2018-2024, Laurence Lundblade. All rights reserved.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * See BSD-3-Clause license in README.md
+ *
+ * Created on 7/23/18
+ * ========================================================================== */
#ifndef QCBOR_DISABLE_PREFERRED_FLOAT
@@ -18,130 +18,109 @@
#include <stdint.h>
-
-/*
- General comments
-
- This is a complete in that it handles all conversion cases including
- +/- infinity, +/- zero, subnormal numbers, qNaN, sNaN and NaN
- payloads.
-
- This conforms to IEEE 754-2008, but note that this doesn't specify
- conversions, just the encodings.
-
- NaN payloads are preserved with alignment on the LSB. The qNaN bit is
- handled differently and explicity copied. It is always the MSB of the
- significand. The NaN payload MSBs (except the qNaN bit) are truncated
- when going from double or single to half.
-
- TODO: what does the C cast do with NaN payloads from
- double to single? It probably depends entirely on the
- CPU.
-
- */
-
-/*
- Most simply just explicilty encode the type you want, single or
- double. This works easily everywhere since standard C supports both
- these types and so does qcbor. This encoder also supports half
- precision and there's a few ways to use it to encode floating-point
- numbers in less space.
-
- Without losing precision, you can encode a single or double such that
- the special values of 0, NaN and Infinity encode as half-precision.
- This CBOR decodoer and most others should handle this properly.
-
- If you don't mind losing precision, then you can use half-precision.
- One way to do this is to set up your environment to use
- ___fp_16. Some compilers and CPUs support it even though it is not
- standard C. What is nice about this is that your program will use
- less memory and floating-point operations like multiplying, adding
- and such will be faster.
-
- Another way to make use of half-precision is to represent the values
- in your program as single or double, but encode them in CBOR as
- half-precision. This cuts the size of the encoded messages by 2 or 4,
- but doesn't reduce memory needs or speed because you are still using
- single or double in your code.
-
+/** @file ieee754.h
+ *
+ * This implements floating-point conversion between half, single and
+ * double precision floating-point numbers, in particular conversion to
+ * smaller representation (e.g., double to single) that does not lose
+ * precision for CBOR preferred serialization.
+ *
+ * This implementation works entirely with shifts and masks and does
+ * not require any floating-point HW or library.
+ *
+ * This conforms to IEEE 754-2008, but note that it doesn't specify
+ * conversions, just the encodings.
+ *
+ * This is complete, supporting +/- infinity, +/- zero, subnormals and
+ * NaN payloads. NaN payloads are converted to smaller by dropping the
+ * right most bits if they are zero and shifting to the right. If the
+ * rightmost bits are not zero the conversion is not performed. When
+ * converting from smaller to larger, the payload is shifted left and
+ * zero-padded. This is what is specified by CBOR preferred
+ * serialization and what modern HW conversion instructions do. CBOR
+ * CDE handling for NaN is not clearly specified, but upcoming
+ * documents may clarify this.
+ *
+ * There is no special handling of signaling and quiet NaNs. It probably
+ * isn't necessary to transmit these special NaNs as their purpose is
+ * more for propagating errors up through some calculation. In many
+ * cases the handling of the NaN payload will work for signaling and
+ * quiet NaNs.
+ *
+ * A previous version of this was usable as a general library for
+ * conversion. This version is reduced to what is needed for CBOR.
*/
-
-/*
- Convert single-precision float to half-precision float. Precision
- and NaN payload bits will be lost. Too-large values will round up to
- infinity and too small to zero.
+/**
+ * @brief Convert half-precision float to double-precision float.
+ *
+ * @param[in] uHalfPrecision Half-precision number to convert.
+ *
+ * @returns Double-precision value.
+ *
+ * This is a lossless conversion because every half-precision value
+ * can be represented as a double. There is no error condition.
+ *
+ * There is no half-precision type in C, so it is represented here as
+ * a @c uint16_t. The bits of @c uHalfPrecision are as described for
+ * half-precision by IEEE 754.
*/
-uint16_t IEEE754_FloatToHalf(float f);
+double
+IEEE754_HalfToDouble(uint16_t uHalfPrecision);
-/*
- Convert double-precision float to half-precision float. Precision
- and NaN payload bits will be lost. Too-large values will round up to
- infinity and too small to zero.
+/** Holds a floating-point value that could be half, single or
+ * double-precision. The value is in a @c uint64_t that may be copied
+ * to a float or double. Simply casting uValue will usually work but
+ * may generate compiler or static analyzer warnings. Using
+ * UsefulBufUtil_CopyUint64ToDouble() or
+ * UsefulBufUtil_CopyUint32ToFloat() will not (and will not generate
+ * any extra code).
*/
-uint16_t IEEE754_DoubleToHalf(double d);
-
-
-/*
- Convert half-precision float to double-precision float.
- This is a loss-less conversion.
- */
-double IEEE754_HalfToDouble(uint16_t uHalfPrecision);
-
-
-// Both tags the value and gives the size
-#define IEEE754_UNION_IS_HALF 2
-#define IEEE754_UNION_IS_SINGLE 4
-#define IEEE754_UNION_IS_DOUBLE 8
-
 typedef struct {
- uint8_t uSize; // One of IEEE754_IS_xxxx
- uint64_t uValue;
+ /* Tag saying which precision uValue holds. The numeric value of
+  * each constant is also the size in bytes of that format
+  * (2 = half, 4 = single, 8 = double). */
+ enum {IEEE754_UNION_IS_HALF = 2,
+ IEEE754_UNION_IS_SINGLE = 4,
+ IEEE754_UNION_IS_DOUBLE = 8,
+ } uSize; /* Size of uValue */
+ /* The floating-point value held as an integer; it may be copied
+  * to a float or double. */
+ uint64_t uValue;
 } IEEE754_union;
-/*
- Converts double-precision to single-precision or half-precision if
- possible without loss of precisions. If not, leaves it as a
- double. Only converts to single-precision unless bAllowHalfPrecision
- is set.
+/**
+ * @brief Convert a double to either single or half-precision.
+ *
+ * @param[in] d The value to convert.
+ * @param[in] bAllowHalfPrecision If true, convert to either half or
+ * single precision.
+ *
+ * @returns Unconverted value, or value converted to single or half-precision.
+ *
+ * This always succeeds. If the value cannot be converted without the
+ * loss of precision, it is not converted.
+ *
+ * This handles all subnormals and NaN payloads.
*/
-IEEE754_union IEEE754_DoubleToSmallestInternal(double d, int bAllowHalfPrecision);
+IEEE754_union
+IEEE754_DoubleToSmaller(double d, int bAllowHalfPrecision);
-/*
- Converts double-precision to single-precision if possible without
- loss of precision. If not, leaves it as a double.
+
+/**
+ * @brief Convert a single-precision float to half-precision.
+ *
+ * @param[in] f The value to convert.
+ *
+ * @returns Either unconverted value or value converted to half-precision.
+ *
+ * This always succeeds. If the value cannot be converted without the
+ * loss of precision, it is not converted.
+ *
+ * This handles all subnormals and NaN payloads.
*/
-static inline IEEE754_union IEEE754_DoubleToSmall(double d)
-{
- return IEEE754_DoubleToSmallestInternal(d, 0);
-}
-
-
-/*
- Converts double-precision to single-precision or half-precision if
- possible without loss of precisions. If not, leaves it as a double.
- */
-static inline IEEE754_union IEEE754_DoubleToSmallest(double d)
-{
- return IEEE754_DoubleToSmallestInternal(d, 1);
-}
-
-
-/*
- Converts single-precision to half-precision if possible without loss
- of precision. If not leaves as single-precision.
- */
-IEEE754_union IEEE754_FloatToSmallest(float f);
+IEEE754_union
+IEEE754_SingleToHalf(float f);
#endif /* ieee754_h */
-
#endif /* QCBOR_DISABLE_PREFERRED_FLOAT */
-
-
-
-
diff --git a/src/qcbor_encode.c b/src/qcbor_encode.c
index 53df657..f52692a 100644
--- a/src/qcbor_encode.c
+++ b/src/qcbor_encode.c
@@ -768,9 +768,9 @@
void QCBOREncode_AddDouble(QCBOREncodeContext *me, double dNum)
{
#ifndef QCBOR_DISABLE_PREFERRED_FLOAT
- const IEEE754_union uNum = IEEE754_DoubleToSmallest(dNum);
+ const IEEE754_union uNum = IEEE754_DoubleToSmaller(dNum, true);
- QCBOREncode_AddType7(me, uNum.uSize, uNum.uValue);
+ QCBOREncode_AddType7(me, (uint8_t)uNum.uSize, uNum.uValue);
#else /* QCBOR_DISABLE_PREFERRED_FLOAT */
QCBOREncode_AddDoubleNoPreferred(me, dNum);
#endif /* QCBOR_DISABLE_PREFERRED_FLOAT */
@@ -794,9 +794,9 @@
void QCBOREncode_AddFloat(QCBOREncodeContext *me, float fNum)
{
#ifndef QCBOR_DISABLE_PREFERRED_FLOAT
- const IEEE754_union uNum = IEEE754_FloatToSmallest(fNum);
+ const IEEE754_union uNum = IEEE754_SingleToHalf(fNum);
- QCBOREncode_AddType7(me, uNum.uSize, uNum.uValue);
+ QCBOREncode_AddType7(me, (uint8_t)uNum.uSize, uNum.uValue);
#else /* QCBOR_DISABLE_PREFERRED_FLOAT */
QCBOREncode_AddFloatNoPreferred(me, fNum);
#endif /* QCBOR_DISABLE_PREFERRED_FLOAT */