BitStream
All Classes Namespaces Files Functions Variables Typedefs Macros
half_precision.h
Go to the documentation of this file.
1 #pragma once
2 
3 /*
4  * Copyright (c) 2018 Stanislav Denisov
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to deal
8  * in the Software without restriction, including without limitation the rights
9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10  * copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24 
25 #include <cstdint>
26 #include <cstring>
27 
28 namespace bitstream
29 {
34  {
35  public:
/**
 * Converts a single-precision float into a 16-bit half-precision bit pattern.
 *
 * The float is split into sign, exponent and mantissa fields and re-packed
 * as 1 sign bit, 5 exponent bits and 10 mantissa bits. Mantissa bits that
 * do not fit are rounded to nearest, ties to even.
 *
 * - Magnitudes too small for a half subnormal collapse to a signed zero.
 * - Magnitudes too large for a finite half (including after rounding)
 *   become a signed infinity (0x7C00).
 * - Infinity maps to infinity; NaN stays NaN (the top ten payload bits are
 *   kept and one bit is forced on if they would all be zero).
 *
 * NOTE(review): assumes `float` is a 32-bit IEEE 754 binary32 value.
 *
 * @param value Value to convert.
 * @return The half-precision encoding of @p value.
 */
inline static uint16_t quantize(float value) noexcept
{
    uint32_t bits = 0;
    std::memcpy(&bits, &value, sizeof(float));

    // Sign already moved to its final position (bit 15 of the half).
    const uint32_t sign = (bits >> 16) & 0x8000u;
    // Exponent re-biased from the float bias (127) to the half bias (15).
    int32_t exponent = static_cast<int32_t>((bits >> 23) & 0xFFu) - (127 - 15);
    uint32_t mantissa = bits & 0x007FFFFFu;

    // Float exponent field was all ones: infinity or NaN.
    if (exponent == 0xFF - (127 - 15)) {
        uint32_t payload = mantissa >> 13;

        // A NaN whose upper payload bits are all zero must not turn into infinity.
        if (mantissa != 0 && payload == 0)
            payload = 1;

        return static_cast<uint16_t>(sign | 0x7C00u | payload);
    }

    // Result is a half subnormal or zero.
    if (exponent <= 0) {
        // Below half of the smallest subnormal: signed zero.
        if (exponent < -10)
            return static_cast<uint16_t>(sign);

        // Restore the implicit leading one, then shift into subnormal position
        // while rounding to nearest even.
        const uint32_t significand = mantissa | 0x00800000u;
        const int32_t shift = 14 - exponent;
        const uint32_t bias = (1u << (shift - 1)) - 1u;
        const uint32_t tie = (significand >> shift) & 1u;

        return static_cast<uint16_t>(sign | ((significand + bias + tie) >> shift));
    }

    // Normal number: round the 23-bit mantissa down to 10 bits (nearest even).
    mantissa += 0x00000FFFu + ((mantissa >> 13) & 1u);

    // Rounding carried out of the mantissa: bump the exponent.
    if ((mantissa & 0x00800000u) != 0) {
        mantissa = 0;
        ++exponent;
    }

    // Too large for a finite half: signed infinity.
    if (exponent > 30)
        return static_cast<uint16_t>(sign | 0x7C00u);

    return static_cast<uint16_t>(sign | (static_cast<uint32_t>(exponent) << 10) | (mantissa >> 13));
}
81 
/**
 * Expands a 16-bit half-precision bit pattern into a single-precision float.
 *
 * The half is split into sign (bit 15), exponent (bits 14..10) and mantissa
 * (bits 9..0) and re-packed into the 32-bit float layout:
 *
 * - exponent 0, mantissa 0: signed zero;
 * - exponent 0, mantissa != 0: half subnormal, renormalized into a normal float;
 * - exponent 31: infinity (mantissa 0) or NaN (mantissa != 0, payload kept);
 * - otherwise: normal number, exponent re-biased from 15 to 127.
 *
 * Every half value is exactly representable as a float, so no rounding occurs.
 *
 * NOTE(review): assumes `float` is a 32-bit IEEE 754 binary32 value.
 *
 * @param value Half-precision bit pattern.
 * @return The float holding the same value.
 */
inline static float dequantize(uint16_t value) noexcept
{
    // Widen first so every shift below is done on an unsigned 32-bit value.
    const uint32_t bits = value;
    const uint32_t sign = (bits & 0x8000u) << 16;
    const uint32_t exponent = (bits >> 10) & 0x1Fu;
    uint32_t mantissa = bits & 0x03FFu;
    uint32_t tmp;

    if (exponent == 0) {
        if (mantissa == 0) {
            // Signed zero.
            tmp = sign;
        }
        else
        {
            // Subnormal: shift until the implicit leading one reaches bit 10,
            // lowering the unbiased exponent once per shift.
            int32_t unbiased = -14;

            while ((mantissa & 0x0400u) == 0) {
                --unbiased;
                mantissa <<= 1;
            }

            // Drop the now-implicit leading one.
            mantissa &= 0x03FFu;
            tmp = sign | (static_cast<uint32_t>(unbiased + 127) << 23) | (mantissa << 13);
        }
    }
    else if (exponent == 0x1Fu)
    {
        // Infinity or NaN: the float exponent field must be all ones as well,
        // otherwise these would decode as finite values in 65536..131008.
        tmp = sign | 0x7F800000u | (mantissa << 13);
    }
    else
    {
        // Normal number: re-bias the exponent (127 - 15 = 112).
        tmp = sign | ((exponent + (127u - 15u)) << 23) | (mantissa << 13);
    }

    float result;
    std::memcpy(&result, &tmp, sizeof(float));

    return result;
}
113  };
114 }
Class for quantizing single-precision floats into half-precision.
Definition: half_precision.h:34
static float dequantize(uint16_t value) noexcept
Definition: half_precision.h:82
static uint16_t quantize(float value) noexcept
Definition: half_precision.h:36
Definition: bounded_range.h:28