Add conversion from float64 to extended

depp · May 27, 2022 · c2da1b1 · c2da1b1
1 parent bea69ab
commit c2da1b1
Show file tree

Hide file tree

Showing 3 changed files with 112 additions and 0 deletions.
diff --git a/extended.go b/extended.go
@@ -0,0 +1,59 @@
+// Package extended provides conversions to and from 80-bit "extended"
+// floating-point numbers.
+//
+// Note that while NaNs are handled by this package, the distinction between
+// quiet NaN and signaling NaN is not preserved.
+package extended
+
+import (
+	"math"
+	"math/bits"
+)
+
+// An Extended is an 80-bit extended precision floating-point number.
+type Extended struct {
+	// The sign is stored as the high bit; the low 15 bits contain the exponent,
+	// with a bias of 16383.
+	SignExponent uint16
+
+	// The fraction includes a ones place as the high bit. The value in the
+	// ones place may be zero.
+	Fraction uint64
+}
+
+// FromFloat64 converts a 64-bit floating-point number to an 80-bit extended
+// floating-point number. Any NaN becomes a NaN with an all-ones fraction.
+func FromFloat64(x float64) (e Extended) {
+	xbits := math.Float64bits(x)
+	sign := int(xbits>>(63-15)) & 0x8000 // float64 sign bit (bit 63) moved to bit 15
+	exponent := int(xbits>>52) & ((1 << 11) - 1)
+	mantissa := xbits & ((1 << 52) - 1)
+	switch exponent {
+	case 0:
+		// Zero or subnormal.
+		// Number is (-1)^sign * 2^-1022 * 0.mantissa.
+		if mantissa == 0 {
+			return Extended{uint16(sign), 0}
+		}
+		// mantissa < 2^52, so nzero >= 12; shifting left by nzero normalizes it:
+		// 2^-1074 * mantissa = 2^(-1011 - nzero) * 1.fraction
+		// e = 16383 - 1011 - nzero = 16383 - 1022 + 11 - nzero
+		nzero := bits.LeadingZeros64(mantissa)
+		exponent := 16383 - 1022 + 11 - nzero
+		return Extended{uint16(sign | exponent), mantissa << nzero}
+
+	case (1 << 11) - 1:
+		// Infinity (zero mantissa) or NaN (nonzero mantissa; payload is dropped).
+		if mantissa == 0 {
+			return Extended{uint16(sign | 0x7fff), 0}
+		}
+		return Extended{uint16(sign | 0x7fff), ^uint64(0)}
+
+	default:
+		// 2^(e64 - 1023) * 1.fraction = 2^(e80 - 16383) * 1.fraction
+		// e64 - 1023 = e80 - 16383
+		// e80 = e64 + 16383 - 1023
+		exponent := exponent + 16383 - 1023
+		return Extended{uint16(sign | exponent), 1<<63 | mantissa<<11} // explicit ones bit above the 52 fraction bits
+	}
+}
diff --git a/extended_test.go b/extended_test.go
@@ -0,0 +1,50 @@
+package extended
+
+import (
+	"math"
+	"strconv"
+	"testing"
+)
+
+func TestFromF64(t *testing.T) {
+	type testcase struct {
+		name     string  // subtest name passed to t.Run
+		exponent uint16  // expected biased exponent; the sign bit is added per iteration
+		fraction uint64  // expected Fraction, including the ones place in the high bit
+		input    float64 // input as listed; the negated input is checked as well
+	}
+	cases := []testcase{
+		{"basic_one", 16383, 1 << 63, 1.0},
+		{"basic_two", 16384, 1 << 63, 2.0},
+		{"basic_half", 16382, 1 << 63, 0.5},
+		{"small", 16383 - 10, 1 << 63, 0.0009765625},
+		{"smaller", 16383 - 100, 1 << 63, 7.888609052210118e-31},
+		{"after_one", 16383, (1 << 63) + (1 << 11), 1.0000000000000002},
+		{"infinity", 32767, 0, math.Inf(0)},
+		{"zero", 0, 0, 0.0},
+		{"nan", 32767, ^uint64(0), math.NaN()},
+		{"smallest_normal", 15361, 1 << 63, 2.2250738585072014e-308},
+		{"subnormal", 15360, 1 << 63, 1.1125369292536007e-308},
+		{"smallest_subnormal", 15309, 1 << 63, 5e-324},
+	}
+	for _, c := range cases {
+		t.Run(c.name, func(t *testing.T) {
+			for sign := 0; sign < 2; sign++ { // sign 0: input as listed; sign 1: negated input
+				expect := Extended{
+					c.exponent | uint16(sign<<15), // sign goes in the high bit of SignExponent
+					c.fraction,
+				}
+				fin := c.input
+				if sign != 0 {
+					fin = -fin
+				}
+				out := FromFloat64(fin)
+				if out != expect {
+					t.Errorf("FromFloat64(%s) = %04x:%016x, expect %04x:%016x",
+						strconv.FormatFloat(fin, 'g', -1, 64),
+						out.SignExponent, out.Fraction, expect.SignExponent, expect.Fraction)
+				}
+			}
+		})
+	}
+}
diff --git a/go.mod b/go.mod
@@ -0,0 +1,3 @@
+module github.com/depp/extended
+
+go 1.18