/* ==========================================================================
 * ieee754.h -- Conversion between half, double & single-precision floats
 *
 * Copyright (c) 2018-2024, Laurence Lundblade. All rights reserved.
 *
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * See BSD-3-Clause license in README.md
 *
 * Created on 7/23/18
 * ========================================================================== */

Laurence Lundblade12d32c52018-09-19 11:25:27 -070014#ifndef ieee754_h
15#define ieee754_h
16
Laurence Lundbladeeb3cdef2024-02-17 20:38:55 -080017
Laurence Lundblade12d32c52018-09-19 11:25:27 -070018#include <stdint.h>
19
20
/** @file ieee754.h
 *
 * This implements floating-point conversion between half, single and
 * double precision floating-point numbers, in particular conversion to
 * a smaller representation (e.g., double to single) that does not lose
 * precision, as needed for CBOR preferred serialization.
 *
 * This also implements conversion of floats to whole numbers as
 * is required for dCBOR.
 *
 * This implementation works entirely with shifts and masks and does
 * not require any floating-point HW or library.
 *
 * This conforms to IEEE 754-2008, but note that it doesn't specify
 * conversions, just the encodings.
 *
 * This is complete, supporting +/- infinity, +/- zero, subnormals and
 * NaN payloads. NaN payloads are converted to smaller by dropping the
 * rightmost bits if they are zero and shifting to the right. If the
 * rightmost bits are not zero the conversion is not performed. When
 * converting from smaller to larger, the payload is shifted left and
 * zero-padded. This is what is specified by CBOR preferred
 * serialization and what modern HW conversion instructions do. CBOR
 * CDE handling for NaN is not clearly specified, but upcoming
 * documents may clarify this.
 *
 * There is no special handling of signaling and quiet NaNs. It probably
 * isn't necessary to transmit these special NaNs as their purpose is
 * more for propagating errors up through some calculation. In many
 * cases the handling of the NaN payload will work for signaling and
 * quiet NaNs.
 *
 * A previous version of this was usable as a general library for
 * conversion. This version is reduced to what is needed for CBOR.
 */

Laurence Lundbladeeb3cdef2024-02-17 20:38:55 -080057#ifndef QCBOR_DISABLE_PREFERRED_FLOAT
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -070058
/**
 * @brief Convert half-precision float to double-precision float.
 *
 * @param[in] uHalfPrecision  Half-precision number to convert.
 *
 * @returns Double-precision value.
 *
 * This is a lossless conversion because every half-precision value
 * can be represented as a double. There is no error condition.
 *
 * There is no half-precision type in C, so it is represented here as
 * a @c uint16_t. The bits of @c uHalfPrecision are as described for
 * half-precision by IEEE 754.
 */
double
IEEE754_HalfToDouble(uint16_t uHalfPrecision);

/**
 * Holds a floating-point value that could be half, single or
 * double-precision. The value is in a @c uint64_t that may be copied
 * to a float or double. Simply casting @c uValue will usually work but
 * may generate compiler or static analyzer warnings. Using
 * UsefulBufUtil_CopyUint64ToDouble() or
 * UsefulBufUtil_CopyUint32ToFloat() will not (and will not generate
 * any extra code).
 */
typedef struct {
   /* Size of uValue. The enumerator values 2, 4 and 8 match the byte
    * widths of half, single and double precision respectively. */
   enum {IEEE754_UNION_IS_HALF   = 2,
         IEEE754_UNION_IS_SINGLE = 4,
         IEEE754_UNION_IS_DOUBLE = 8,
   } uSize;
   /* The floating-point value, held as an unsigned integer. */
   uint64_t uValue;
} IEEE754_union;

/**
 * Holds the result of an attempt to convert a floating-point
 * number to an @c int64_t or @c uint64_t.
 */
struct IEEE754_ToInt {
   /* Outcome of the conversion attempt. */
   enum {IEEE754_ToInt_IS_INT,
         IEEE754_ToInt_IS_UINT,
         IEEE754_ToInt_NO_CONVERSION,
         IEEE754_ToInt_NaN
   } type;
   /* The converted integer.
    * NOTE(review): presumably un_signed is the valid member for
    * IEEE754_ToInt_IS_UINT and is_signed for IEEE754_ToInt_IS_INT, with
    * neither meaningful for the other two outcomes -- confirm against
    * the implementation. */
   union {
      uint64_t un_signed;
      int64_t  is_signed;
   } integer;
};

Laurence Lundblade83dbf5c2024-01-07 19:17:52 -0700110/**
111 * @brief Convert a double to either single or half-precision.
112 *
113 * @param[in] d The value to convert.
114 * @param[in] bAllowHalfPrecision If true, convert to either half or
115 * single precision.
116 *
117 * @returns Unconverted value, or value converted to single or half-precision.
118 *
119 * This always succeeds. If the value cannot be converted without the
120 * loss of precision, it is not converted.
121 *
122 * This handles all subnormals and NaN payloads.
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -0700123 */
Laurence Lundblade83dbf5c2024-01-07 19:17:52 -0700124IEEE754_union
Laurence Lundbladeeb3cdef2024-02-17 20:38:55 -0800125IEEE754_DoubleToSmaller(double d, int bAllowHalfPrecision, int bNoNaNPayload);
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700126
Laurence Lundblade83dbf5c2024-01-07 19:17:52 -0700127
128/**
129 * @brief Convert a single-precision float to half-precision.
130 *
131 * @param[in] f The value to convert.
132 *
133 * @returns Either unconverted value or value converted to half-precision.
134 *
135 * This always succeeds. If the value cannot be converted without the
136 * loss of precision, it is not converted.
137 *
138 * This handles all subnormals and NaN payloads.
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700139 */
Laurence Lundblade83dbf5c2024-01-07 19:17:52 -0700140IEEE754_union
Laurence Lundbladeeb3cdef2024-02-17 20:38:55 -0800141IEEE754_SingleToHalf(float f, int bNoNanPayloads);
142
143
/**
 * @brief Convert a double-precision float to an integer if it is a
 *        whole number.
 *
 * @param[in] d  The value to convert.
 *
 * @returns Either the converted number or the conversion status.
 *
 * If the value is a whole number that will fit either in a @c uint64_t
 * or an @c int64_t, it is converted. If it is a NaN, then there is no
 * conversion and the fact that it is a NaN is indicated in the
 * returned structure. If it can't be converted, then that is
 * indicated in the returned structure.
 *
 * This always returns positive numbers as a @c uint64_t even if they
 * will fit in an @c int64_t.
 *
 * This never fails because of precision, but may fail because of range.
 */
struct IEEE754_ToInt
IEEE754_DoubleToInt(double d);

/**
 * @brief Convert a single-precision float to an integer if it is a
 *        whole number.
 *
 * @param[in] f  The value to convert.
 *
 * @returns Either the converted number or the conversion status.
 *
 * If the value is a whole number that will fit either in a @c uint64_t
 * or an @c int64_t, it is converted. If it is a NaN, then there is no
 * conversion and the fact that it is a NaN is indicated in the
 * returned structure. If it can't be converted, then that is
 * indicated in the returned structure.
 *
 * This always returns positive numbers as a @c uint64_t even if they
 * will fit in an @c int64_t.
 *
 * This never fails because of precision, but may fail because of range.
 */
struct IEEE754_ToInt
IEEE754_SingleToInt(float f);

187#endif /* ! QCBOR_DISABLE_PREFERRED_FLOAT */
188
189
/**
 * @brief Tests whether a NaN is "quiet" vs having a payload.
 *
 * @param[in] dNum  Double number to test.
 *
 * @returns 0 if a quiet NaN, 1 if it has a payload.
 *
 * A quiet NaN is usually represented as 0x7ff8000000000000. That is,
 * the significand bits are 0x8000000000000. If the significand bits
 * are other than 0x8000000000000 it is considered to have a NaN
 * payload.
 *
 * Note that 0x7ff8000000000000 is not specified in a standard, but it
 * is commonly implemented and chosen by CBOR as the best way to
 * represent a NaN.
 *
 * NOTE(review): the result for an argument that is not a NaN is not
 * stated here -- confirm against the implementation.
 */
int
IEEE754_IsNotStandardDoubleNaN(double dNum);

/**
 * @brief Tests whether a NaN is "quiet" vs having a payload.
 *
 * @param[in] fNum  Float number to test.
 *
 * @returns 0 if a quiet NaN, 1 if it has a payload.
 *
 * See IEEE754_IsNotStandardDoubleNaN(). A single-precision quiet NaN
 * is 0x7fc00000.
 *
 * NOTE(review): the result for an argument that is not a NaN is not
 * stated here -- confirm against the implementation.
 */
int
IEEE754_IsNotStandardSingleNaN(float fNum);

Laurence Lundblade12d32c52018-09-19 11:25:27 -0700225#endif /* ieee754_h */
226