下面的代码演示了如何将 16 位浮点数编码转换为 32 位浮点数编码,并附带一个测试程序。测试程序需要 Clang 的 `__fp16`
类型,但转换代码本身不需要。对 NaN 有效负载以及信令(signaling)/静默(quiet)语义的处理尚未经过测试。
#include <stdint.h>
// Value with only bit n set. n must be less than 32.
#define Bit(n) ((uint32_t) 1 << (n))
// Value with the n lowest bits set. n must be less than 32.
#define Mask(n) (Bit(n) - 1)

/* Widen an IEEE-754 binary16 (half precision) encoding to the IEEE-754
   binary32 (single precision) encoding of the same value.

   x is the raw 16-bit encoding; the return value is the raw 32-bit encoding.
   Only integer operations are used.

   Zeros, subnormals, normals and infinities map to the exactly equal 32-bit
   value. For NaNs the significand bits are carried over (left-adjusted into
   the wider field) and nothing else is done to them.
*/
uint32_t Float16ToFloat32(uint16_t x)
{
    /* Split the input into its three fields: 1 sign bit at bit 15, 5
       exponent bits at bit 10, and 10 fraction bits at bit 0.
    */
    const uint32_t sign = (uint32_t) x >> 15;
    uint32_t exponent = ((uint32_t) x >> 10) & Mask(5);

    // Fraction bits, left-adjusted into the 23-bit field of the result.
    uint32_t significand = ((uint32_t) x & Mask(10)) << (23 - 10);

    if (exponent == 31)
    {
        // Infinity or NaN: the 32-bit format marks these with code 255.
        exponent = 255;
    }
    else if (exponent != 0)
    {
        // Normal number: rebias from the 16-bit bias to the 32-bit bias.
        exponent += 127 - 15;
    }
    else if (significand != 0)
    {
        /* Subnormal number: it is a normal number in the 32-bit format, so
           it must be normalized. Start from the 32-bit exponent code that
           corresponds to the 16-bit subnormal exponent, then shift the
           leading one bit up to bit 23 (just above the field), lowering the
           exponent once per shift. The field is below Bit(23) on entry, so
           at least one shift is always needed.
        */
        exponent = 1 + (127 - 15);
        do
        {
            significand <<= 1;
            exponent -= 1;
        } while (significand < Bit(23));

        // The leading bit is implicit in a normal encoding; drop it.
        significand &= Mask(23);
    }
    // Otherwise the input is a zero, whose fields need no change.

    // Pack the three fields into the 32-bit encoding.
    return (sign << 31) | (exponent << 23) | significand;
}
#include <inttypes.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
/* Exhaustive self-test for Float16ToFloat32.

   Every one of the 2^16 possible 16-bit encodings is converted, and the
   result is compared with the value the compiler itself assigns to the same
   bits through its __fp16 type. On the first mismatch both encodings and
   values are printed and the program exits with EXIT_FAILURE; if all
   encodings agree it returns EXIT_SUCCESS without printing anything.
*/
int main(void)
{
    // Use unions so we can iterate and manipulate the encodings.
    union { uint16_t enc; __fp16 value; } x;
    union { uint32_t enc; float value; } y;

    // Iterate through all 16-bit encodings.
    for (uint32_t i = 0; i < Bit(16); ++i)
    {
        x.enc = (uint16_t) i;
        y.enc = Float16ToFloat32(x.enc);

        /* isnan is only specified to return a nonzero value for a NaN, not
           necessarily 1, so reduce each result to 0 or 1 before comparing
           them with each other.
        */
        const int xIsNaN = isnan(x.value) != 0;
        const int yIsNaN = isnan(y.value) != 0;

        /* The conversion fails if exactly one side is a NaN, or if neither
           is a NaN and the values differ. (NaNs never compare equal, so
           they cannot be checked by value.)
        */
        if (xIsNaN != yIsNaN || (!xIsNaN && x.value != y.value))
        {
            printf("Failure:\n");
            printf("\tx encoding = 0x%04" PRIx16 ", value = %.99g.\n",
                x.enc, x.value);
            printf("\ty encoding = 0x%08" PRIx32 ", value = %.99g.\n",
                y.enc, y.value);
            exit(EXIT_FAILURE);
        }
    }

    return EXIT_SUCCESS;
}
正如 chtz 指出的那样,我们可以使用 32 位浮点运算来同时处理正常值和次正常值的缩放调整。为此,请将 `Float16ToFloat32`
中位于 `f <<= 23 - 10;` 之后的代码替换为:
// Infinities and NaNs (16-bit exponent code 31): use the 32-bit exponent
// code 255 and pass the already left-adjusted significand field through.
if (e == 31)
return s << 31 | 255 << 23 | f;
/* For finite values, reassemble with shifted fields and using a
floating-point multiply to adjust for the changed exponent bias.
The 16-bit exponent code is stored unchanged in the 32-bit exponent field,
so the float read back from this encoding is too small by the difference
between the two biases, a factor of 2^(127-15) = 2^112. Multiplying by
0x1p112f applies that factor. Inputs with e == 0 are read back as 32-bit
subnormals (or zero) before the multiply.
NOTE(review): this presumably relies on subnormal operands not being flushed
to zero by the floating-point environment -- confirm for the target.
*/
union { uint32_t enc; float value; } y = { .enc = s << 31 | e << 23 | f };
y.value *= 0x1p112f;
return y.enc;