make sNaN, qNaN and NaN payloads work right; not tested yet

commit: 8db3d3e86a84b643ee770770065d02ff668dcad9 [log] [tgz]
author: Laurence Lundblade <llundbla@qti.qualcomm.com> Sat Sep 29 11:46:37 2018 -0700
committer: Laurence Lundblade <llundbla@qti.qualcomm.com> Sat Sep 29 11:46:37 2018 -0700
tree: 267b1c7da6e605e76619eedfcd4fb50fc04127cb
parent: d711fb2e196f1599fae4ae14a72f6be712dc39d9 [diff]
diff --git a/src/ieee754.c b/src/ieee754.c
index 12877bd..82ba659 100644
--- a/src/ieee754.c
+++ b/src/ieee754.c

@@ -35,19 +35,32 @@
 #include "ieee754.h"
 #include <string.h> // For memcpy()
 
+
 /*
- 
- https://stackoverflow.com/questions/19800415/why-does-ieee-754-reserve-so-many-nan-values
- 
- These values come from IEEE 754-2008 section 3.6
- 
  This code is written for clarity and verifiability, not for size, on the assumption
  that the optimizer will do a good job. The LLVM optimizer, -Os, does seem to do the
- job and the resulting object code is smaller from combing code for the many different
+ job and the resulting object code is smaller from combining code for the many different
  cases (normal, subnormal, infinity, zero...) for the conversions.
  
  Dead stripping is also really helpful to get code size down.
  
+ This code also works solely using shifts and masks and thus has no dependency on
+ any math libraries. It will even work if the CPU doesn't have any floating
+ point support.
+ 
+ The memcpy() dependency is only for CopyFloatToUint32() and friends which only
+ is needed to avoid type punning when converting the actual float bits to
+ an unsigned value so the bit shifts and masks can work.
+ */
+
+/*
+ The references used to write this code:
+ 
+ - IEEE 754-2008, particularly section 3.6 and 6.2.1
+ 
+ - https://en.wikipedia.org/wiki/IEEE_754 and subordinate pages
+ 
+ - https://stackoverflow.com/questions/19800415/why-does-ieee-754-reserve-so-many-nan-values
  */
 
 
@@ -86,9 +99,10 @@
 #define SINGLE_EXPONENT_SHIFT       (SINGLE_NUM_SIGNIFICAND_BITS)
 #define SINGLE_SIGN_SHIFT           (SINGLE_NUM_SIGNIFICAND_BITS + SINGLE_NUM_EXPONENT_BITS)
 
-#define SINGLE_SIGNIFICAND_MASK     (0x7fffff) // The lower 23 bits
-#define SINGLE_EXPONENT_MASK        (0xff << SINGLE_EXPONENT_SHIFT) // 8 bits of exponent
-#define SINGLE_SIGN_MASK            (0x01 << SINGLE_SIGN_SHIFT) // 1 bit of sign
+#define SINGLE_SIGNIFICAND_MASK     (0x7fffffUL) // The lower 23 bits
+#define SINGLE_EXPONENT_MASK        (0xffUL << SINGLE_EXPONENT_SHIFT) // 8 bits of exponent
+#define SINGLE_SIGN_MASK            (0x01UL << SINGLE_SIGN_SHIFT) // 1 bit of sign
+#define SINGLE_QUIET_NAN_BIT        (0x01UL << (SINGLE_NUM_SIGNIFICAND_BITS-1))
 
 /* Biased  Biased   Unbiased  Use
     0x0000     0     -127      0 and subnormal
@@ -112,9 +126,11 @@
 #define DOUBLE_EXPONENT_SHIFT       (DOUBLE_NUM_SIGNIFICAND_BITS)
 #define DOUBLE_SIGN_SHIFT           (DOUBLE_NUM_SIGNIFICAND_BITS + DOUBLE_NUM_EXPONENT_BITS)
 
-#define DOUBLE_SIGNIFICAND_MASK     (0xfffffffffffffLL) // The lower 52 bits
-#define DOUBLE_EXPONENT_MASK        (0x7ffLL << DOUBLE_EXPONENT_SHIFT) // 11 bits of exponent
-#define DOUBLE_SIGN_MASK            (0x01LL << DOUBLE_SIGN_SHIFT) // 1 bit of sign
+#define DOUBLE_SIGNIFICAND_MASK     (0xfffffffffffffULL) // The lower 52 bits
+#define DOUBLE_EXPONENT_MASK        (0x7ffULL << DOUBLE_EXPONENT_SHIFT) // 11 bits of exponent
+#define DOUBLE_SIGN_MASK            (0x01ULL << DOUBLE_SIGN_SHIFT) // 1 bit of sign
+#define DOUBLE_QUIET_NAN_BIT        (0x01ULL << (DOUBLE_NUM_SIGNIFICAND_BITS-1))
+
 
 /* Biased      Biased   Unbiased  Use
    0x00000000     0     -1023     0 and subnormal
@@ -131,8 +147,13 @@
 
 /*
  Convenient functions to avoid type punning, compiler warnings and such
- The optimizer reduces them to a simple assignment
+ The optimizer reduces them to a simple assignment.
  This is a crusty corner of C. It shouldn't be this hard.
+ 
+ These are also in UsefulBuf.h under a different name. They are copied
+ here because to avoid a dependency on UsefulBuf.h. There is no
+ object code size impact because these always optimze down to a
+ simple assignment.
  */
 static inline uint32_t CopyFloatToUint32(float f)
 {
@@ -183,17 +204,18 @@
             // Infinity
             uHalfSignificand = 0;
         } else {
-            // NaN; significand has to be non-zero
-            if(!(uSingleSignificand & HALF_SIGNIFICAND_MASK)) {
-                // NaN payload bits that can't be carried; convert to a quite NaN
-                // since this has to be non-zero to still be a NaN
-                uHalfSignificand = HALF_QUIET_NAN_BIT; // standard qNaN;
+            // Copy the LBSs of the NaN payload that will fit from the single to the half
+            uHalfSignificand = uSingleSignificand & (HALF_SIGNIFICAND_MASK & ~HALF_QUIET_NAN_BIT);
+            if(uSingleSignificand & SINGLE_QUIET_NAN_BIT) {
+                // It's a qNaN; copy the qNaN bit
+                uHalfSignificand |= HALF_QUIET_NAN_BIT;
             } else {
-                // The LSBs are preserved, but not the MSBs
-                // This preservation allows some limited form of NaN payloads / boxing
-                // Would be good to find out what other implementations do for
-                // this kind of conversion of NaN
-                uHalfSignificand = uSingleSignificand & HALF_SIGNIFICAND_MASK;
+                // It's a sNaN; make sure the significand is not zero so it stays a NaN
+                // This is needed because not all significand bits are copied from single
+                if(!uHalfSignificand) {
+                    // Set the LSB. This is what wikipedia shows for sNAN.
+                    uHalfSignificand |= 0x01;
+                }
             }
         }
     } else if(nSingleUnbiasedExponent == SINGLE_EXPONENT_ZERO) {
@@ -248,17 +270,18 @@
             // Infinity
             uHalfSignificand = 0;
         } else {
-            // NaN; significand has to be non-zero
-            if(!(uDoubleSignificand & HALF_SIGNIFICAND_MASK)) {
-                // NaN payload bits that can't be carried; convert to a quite NaN
-                // since this has to be non-zero to still be a NaN
-                uHalfSignificand = HALF_QUIET_NAN_BIT; // standard qNaN;
+            // Copy the LBSs of the NaN payload that will fit from the double to the half
+            uHalfSignificand = uDoubleSignificand & (HALF_SIGNIFICAND_MASK & ~HALF_QUIET_NAN_BIT);
+            if(uDoubleSignificand & DOUBLE_QUIET_NAN_BIT) {
+                // It's a qNaN; copy the qNaN bit
+                uHalfSignificand |= HALF_QUIET_NAN_BIT;
             } else {
-                // The LSBs are preserved, but not the MSBs
-                // This preservation allows some limited form of NaN payloads / boxing
-                // Would be good to find out what other implementations do for
-                // this kind of conversion of NaN
-                uHalfSignificand = uDoubleSignificand & HALF_SIGNIFICAND_MASK;
+                // It's an sNaN; make sure the significand is not zero so it stays a NaN
+                // This is needed because not all significand bits are copied from single
+                if(!uHalfSignificand) {
+                    // Set the LSB. This is what wikipedia shows for sNAN.
+                    uHalfSignificand |= 0x01;
+                }
             }
         }
     } else if(nDoubleUnbiasedExponent == DOUBLE_EXPONENT_ZERO) {
@@ -329,8 +352,13 @@
         // NaN or Inifinity
         uSingleBiasedExponent = SINGLE_EXPONENT_INF_OR_NAN + SINGLE_EXPONENT_BIAS;
         if(uHalfSignificand) {
-            // Preserve NaN payload for NaN boxing
-            uSingleSignificand = uHalfSignificand;
+            // NaN
+            // First preserve the NaN payload from half to single
+            uSingleSignificand = uHalfSignificand & ~HALF_QUIET_NAN_BIT;
+            if(uHalfSignificand & HALF_QUIET_NAN_BIT) {
+                // Next, set qNaN if needed since half qNaN bit is not copied above
+                uSingleSignificand |= SINGLE_QUIET_NAN_BIT;
+            }
         } else {
             // Infinity
             uSingleSignificand = 0;
@@ -386,8 +414,13 @@
         // NaN or Inifinity
         uDoubleBiasedExponent = DOUBLE_EXPONENT_INF_OR_NAN + DOUBLE_EXPONENT_BIAS;
         if(uHalfSignificand) {
-            // Preserve NaN payload for NaN boxing
-            uDoubleSignificand = uHalfSignificand;
+            // NaN
+            // First preserve the NaN payload from half to single
+            uDoubleSignificand = uHalfSignificand & ~HALF_QUIET_NAN_BIT;
+            if(uHalfSignificand & HALF_QUIET_NAN_BIT) {
+                // Next, set qNaN if needed since half qNaN bit is not copied above
+                uDoubleSignificand |= DOUBLE_QUIET_NAN_BIT;
+            }
         } else {
             // Infinity
             uDoubleSignificand = 0;
@@ -444,7 +477,7 @@
     return result;
 }
 
-
+// Public function; see ieee754.h
 IEEE754_union IEEE754_DoubleToSmallestInternal(double d, int bAllowHalfPrecision)
 {
     IEEE754_union result;

diff --git a/src/ieee754.h b/src/ieee754.h
index ba2b030..f6824ba 100644
--- a/src/ieee754.h
+++ b/src/ieee754.h

@@ -37,6 +37,30 @@
 #include <stdint.h>
 
 
+
+/*
+ General comments
+ 
+ This is a complete in that it handles all conversion cases
+ including +/- infinity, +/- zero, subnormal numbers, qNaN, sNaN
+ and NaN payloads.
+ 
+ This confirms to IEEE 754-2008, but note that this doesn't
+ specify conversions, just the encodings.
+ 
+ NaN payloads are preserved with alignment on the LSB. The
+ qNaN bit is handled differently and explicity copied. It
+ is always the MSB of the significand. The NaN payload MSBs
+ (except the qNaN bit) are truncated when going from
+ double or single to half.
+ 
+ TODO: what does the C cast do with NaN payloads from
+ double to single?
+ 
+ 
+ 
+ */
+
 /*
  Most simply just explicilty encode the type you want, single or double.
  This works easily everywhere since standard C supports both
@@ -78,12 +102,35 @@
  
  */
 
+
+
+/*
+ Convert single precision float to half-precision float.
+ Precision and NaN payload bits will be lost. Too large
+ values will round up to infinity and too small to zero.
+ */
 uint16_t IEEE754_FloatToHalf(float f);
 
+
+/*
+ Convert half precision float to single precision float.
+ This is a loss-less conversion.
+ */
 float IEEE754_HalfToFloat(uint16_t uHalfPrecision);
 
+
+/*
+ Convert double precision float to half-precision float.
+ Precision and NaN payload bits will be lost. Too large
+ values will round up to infinity and too small to zero.
+ */
 uint16_t IEEE754_DoubleToHalf(double d);
 
+
+/*
+ Convert half precision float to double precision float.
+ This is a loss-less conversion.
+ */
 double IEEE754_HalfToDouble(uint16_t uHalfPrecision);
 
 
@@ -103,10 +150,15 @@
 } IEEE754_union;
 
 
+/*
+ Converts double-precision to single-precision or half-precision if possible without
+ loss of precisions. If not, leaves it as a double. Only converts to single-precision
+ unless bAllowHalfPrecision is set.
+ */
 IEEE754_union IEEE754_DoubleToSmallestInternal(double d, int bAllowHalfPrecision);
 
 /*
- Converts double-precision to half- or single-precision if possible without
+ Converts double-precision to single-precision if possible without
  loss of precision. If not, leaves it as a double.
  */
 static inline IEEE754_union IEEE754_DoubleToSmall(double d)
@@ -116,7 +168,7 @@
 
 
 /*
- Converts double-precision to single-precision if possible without
+ Converts double-precision to single-precision or half-precision if possible without
  loss of precisions. If not, leaves it as a double.
  */
 static inline IEEE754_union IEEE754_DoubleToSmallest(double d)
@@ -124,7 +176,6 @@
     return IEEE754_DoubleToSmallestInternal(d, 1);
 }
 
-
 /*
  Converts single-precision to half-precision if possible without
  loss of precision. If not leaves as single-precision.
@@ -132,12 +183,6 @@
 IEEE754_union IEEE754_FloatToSmallest(float f);
 
 
-
-
-
-
-
-
 #endif /* ieee754_h */
 
 

diff --git a/test/half_precision_test.c b/test/half_precision_test.c
index 4038cd6..377ae9d 100644
--- a/test/half_precision_test.c
+++ b/test/half_precision_test.c

@@ -257,7 +257,7 @@
 
 int half_precision_to_float_vs_rfc_test()
 {
-    for(uint32_t uHalfP = 0; uHalfP < 0x1fff; uHalfP += 10) {
+    for(uint32_t uHalfP = 0; uHalfP < 0xffff; uHalfP += 1) {
         unsigned char x[2];
         x[1] = uHalfP & 0xff;
         x[0] = uHalfP >> 8;
commit	8db3d3e86a84b643ee770770065d02ff668dcad9	[log] [tgz]
author	Laurence Lundblade <llundbla@qti.qualcomm.com>	Sat Sep 29 11:46:37 2018 -0700
committer	Laurence Lundblade <llundbla@qti.qualcomm.com>	Sat Sep 29 11:46:37 2018 -0700
tree	267b1c7da6e605e76619eedfcd4fb50fc04127cb
parent	d711fb2e196f1599fae4ae14a72f6be712dc39d9 [diff]