// quantize(float): builds a 16-bit result from three fields extracted from the
// bit pattern of `value`: a sign (s), a re-biased exponent (e) and a mantissa (m).
// NOTE(review): this listing omits several original lines (the declarations of
// `tmp` and `t`, most branch conditions and all braces). The comments below
// describe only the statements that are visible; anything depending on an
// omitted line is marked as an assumption.
36 inline static uint16_t
quantize(
float value) noexcept
// Copy the raw bytes of `value` into `tmp` (declared on an omitted line) so the
// fields can be picked apart with integer operations.
39 std::memcpy(&tmp, &value,
sizeof(
float));
// Sign: bit 31 of the float moved down to bit 15, the top bit of the 16-bit result.
41 int32_t s = (tmp >> 16) & 0x00008000;
// Exponent: the 8-bit field at bits 23..30, re-biased from 127 to 15.
42 int32_t e = ((tmp >> 23) & 0X000000FF) - (127 - 15);
// Mantissa: the low 23 bits.
43 int32_t m = tmp & 0X007FFFFF;
// Returns the sign bit alone. The guarding condition is not visible here;
// presumably this is the "too small to represent" case yielding a signed zero.
47 return static_cast<uint16_t
>(s);
// `a` is a mask of all ones below bit (t - 1); `b` is bit t of m.
// `t` is defined on an omitted line. Presumably these are the rounding terms
// for the right shift by t that produces a subnormal result - TODO confirm.
52 int32_t a = (1 << (t - 1)) - 1;
53 int32_t b = (m >> t) & 1;
// Returns the sign combined with m and an all-zero exponent field
// (presumably m was shifted/rounded on the omitted lines in between).
57 return static_cast<uint16_t
>(s | m);
// True exactly when the original 8-bit exponent field was 0xFF (all ones),
// i.e. the input is an infinity or a NaN.
60 if (e == 0XFF - (127 - 15)) {
// Sign plus 0x7C00 (bits 10..14 set, mantissa zero): the half-precision
// infinity encoding. The inner condition selecting this return is not visible.
62 return static_cast<uint16_t
>(s | 0X7C00);
// All-ones exponent with m OR-ed in; if m is zero the lowest bit is forced to 1
// so the resulting mantissa is never zero. Presumably m was narrowed to 10 bits
// on an omitted line and this keeps a NaN from collapsing into infinity.
66 return static_cast<uint16_t
>(s | 0X7C00 | m | ((m == 0) ? 1 : 0));
// Rounding ahead of the `m >> 13` below: adds 0x0FFF plus the bit that will
// become the lowest kept bit (bit 13), which rounds to nearest with ties to even.
69 m = m + 0X00000FFF + ((m >> 13) & 1);
// Bit 23 set means the rounding carried out of the 23-bit mantissa.
// The statements that handle the carry are not visible in this listing.
71 if ((m & 0x00800000) != 0) {
// Returns the signed infinity encoding; the condition is not visible
// (presumably the exponent no longer fits in 5 bits).
77 return static_cast<uint16_t
>(s | 0X7C00);
// Normal result: sign | exponent placed at bits 10..14 | top 10 mantissa bits.
79 return static_cast<uint16_t
>(s | (e << 10) | (m >> 13));
// Body fragment of dequantize: rebuilds a 32-bit float bit pattern in `tmp`
// from the fields of the 16-bit `value`, then copies those bits into `result`.
// NOTE(review): the signature, the declarations of `tmp` and `result`, the loop
// body, the branch structure and the final return are on lines omitted from
// this listing; comments describe only the visible statements.
// Low 10 bits of `value`: the half-precision mantissa field.
85 uint32_t mantissa =
static_cast<uint32_t
>(value & 1023);
// 0xFFFFFFF2 is -14 in 32-bit two's complement; adding 127 below wraps to the
// float exponent field value for 2^-14 unless the omitted loop body changes it.
86 uint32_t exponent = 0XFFFFFFF2;
// -33792 is 0xFFFF7C00 as a 32-bit int, so for a 16-bit `value` this tests
// whether the 5-bit exponent field (bits 10..14) is all zero.
88 if ((value & -33792) == 0) {
// Loops until bit 10 of mantissa is set. The loop body is not visible;
// presumably it shifts mantissa left and decrements exponent to normalize a
// subnormal input - TODO confirm.
90 while ((mantissa & 1024) == 0) {
// Clear bit 10: the leading 1 is implicit in the float encoding.
95 mantissa &= 0XFFFFFBFF;
// Assemble: sign moved to bit 31 | exponent re-biased by +127 at bits 23..30
// | 10-bit mantissa moved to the top of the 23-bit mantissa field.
96 tmp = ((
static_cast<uint32_t
>(value) & 0x8000) << 16) | ((exponent + 127) << 23) | (mantissa << 13);
// Sign bit only, i.e. a signed zero. The selecting condition is not visible.
// NOTE(review): unlike lines 96 and 105, the shift here happens before the cast
// to uint32_t, so (assuming `value` is uint16_t) it is evaluated in int after
// integer promotion and moves a 1 into the sign bit - consider casting first.
100 tmp =
static_cast<uint32_t
>((value & 0x8000) << 16);
// Non-zero exponent field: sign to bit 31 | 5-bit exponent re-biased from 15
// to 127 at bits 23..30 | mantissa shifted up by 13.
105 tmp = ((
static_cast<uint32_t
>(value) & 0x8000) << 16) | (((((
static_cast<uint32_t
>(value) >> 10) & 0X1F) - 15) + 127) << 23) | (mantissa << 13);
// Copy the assembled bits into `result` (declared on an omitted line).
109 std::memcpy(&result, &tmp,
sizeof(
float));
Class for quantizing single-precision floats into half-precision.
Definition: half_precision.h:34
static float dequantize(uint16_t value) noexcept
Definition: half_precision.h:82
static uint16_t quantize(float value) noexcept
Definition: half_precision.h:36
Definition: bounded_range.h:28