pax_global_header00006660000000000000000000000064141420160470014511gustar00rootroot0000000000000052 comment=7af39913ab0ef6cd4fd08f3b2e916457548578ef golang-github-x448-float16-0.8.4/000077500000000000000000000000001414201604700162305ustar00rootroot00000000000000golang-github-x448-float16-0.8.4/.travis.yml000066400000000000000000000003051414201604700203370ustar00rootroot00000000000000language: go go: - 1.11.x env: - GO111MODULE=on script: - go test -short -coverprofile=coverage.txt -covermode=count ./... after_success: - bash <(curl -s https://codecov.io/bash) golang-github-x448-float16-0.8.4/LICENSE000066400000000000000000000021161414201604700172350ustar00rootroot00000000000000MIT License Copyright (c) 2019 Montgomery Edwards⁴⁴⁸ and Faye Amacker Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
golang-github-x448-float16-0.8.4/README.md000066400000000000000000000164101414201604700175110ustar00rootroot00000000000000# Float16 (Binary16) in Go/Golang [![Build Status](https://travis-ci.org/x448/float16.svg?branch=master)](https://travis-ci.org/x448/float16) [![codecov](https://codecov.io/gh/x448/float16/branch/master/graph/badge.svg?v=4)](https://codecov.io/gh/x448/float16) [![Go Report Card](https://goreportcard.com/badge/github.com/x448/float16)](https://goreportcard.com/report/github.com/x448/float16) [![Release](https://img.shields.io/github/release/x448/float16.svg?style=flat-square)](https://github.com/x448/float16/releases) [![License](http://img.shields.io/badge/license-mit-blue.svg?style=flat-square)](https://raw.githubusercontent.com/x448/float16/master/LICENSE) `float16` package provides [IEEE 754 half-precision floating-point format (binary16)](https://en.wikipedia.org/wiki/Half-precision_floating-point_format) with IEEE 754 default rounding for conversions. IEEE 754-2008 refers to this 16-bit floating-point format as binary16. IEEE 754 default rounding ("Round-to-Nearest RoundTiesToEven") is considered the most accurate and statistically unbiased estimate of the true result. All possible 4+ billion floating-point conversions with this library are verified to be correct. Lowercase "float16" refers to IEEE 754 binary16. And capitalized "Float16" refers to exported Go data type provided by this library. ## Features Current features include: * float16 to float32 conversions use lossless conversion. * float32 to float16 conversions use IEEE 754-2008 "Round-to-Nearest RoundTiesToEven". * conversions using pure Go take about 2.65 ns/op on a desktop amd64. * unit tests provide 100% code coverage and check all possible 4+ billion conversions. * other functions include: IsInf(), IsNaN(), IsNormal(), PrecisionFromfloat32(), String(), etc. * all functions in this library use zero allocs except String(). 
## Status This library is used by [fxamacker/cbor](https://github.com/fxamacker/cbor) and is ready for production use on supported platforms. The version number < 1.0 indicates more functions and options are planned but not yet published. Current status: * core API is done and breaking API changes are unlikely. * 100% of unit tests pass: * short mode (`go test -short`) tests around 65765 conversions in 0.005s. * normal mode (`go test`) tests all possible 4+ billion conversions in about 95s. * 100% code coverage with both short mode and normal mode. * tested on amd64 but it should work on all little-endian platforms supported by Go. Roadmap: * add functions for fast batch conversions leveraging SIMD when supported by hardware. * speed up unit test when verifying all possible 4+ billion conversions. * test on additional platforms. ## Float16 to Float32 Conversion Conversions from float16 to float32 are lossless conversions. All 65536 possible float16 to float32 conversions (in pure Go) are confirmed to be correct. Unit tests take a fraction of a second to check all 65536 expected values for float16 to float32 conversions. ## Float32 to Float16 Conversion Conversions from float32 to float16 use IEEE 754 default rounding ("Round-to-Nearest RoundTiesToEven"). All 4294967296 possible float32 to float16 conversions (in pure Go) are confirmed to be correct. Unit tests in normal mode take about 1-2 minutes to check all 4+ billion float32 input values and results for Fromfloat32(), FromNaN32ps(), and PrecisionFromfloat32(). Unit tests in short mode use a small subset (around 229 float32 inputs) and finish in under 0.01 second while still reaching 100% code coverage. ## Usage Install with `go get github.com/x448/float16`. ``` // Convert float32 to float16 pi := float32(math.Pi) pi16 := float16.Fromfloat32(pi) // Convert float16 to float32 pi32 := pi16.Float32() // PrecisionFromfloat32() is faster than the overhead of calling a function. 
// This example only converts if there's no data loss and input is not a subnormal. if float16.PrecisionFromfloat32(pi) == float16.PrecisionExact { pi16 := float16.Fromfloat32(pi) } ``` ## Float16 Type and API Float16 (capitalized) is a Go type with uint16 as the underlying state. There are 6 exported functions and 9 exported methods. ``` package float16 // import "github.com/x448/float16" // Exported types and consts type Float16 uint16 const ErrInvalidNaNValue = float16Error("float16: invalid NaN value, expected IEEE 754 NaN") // Exported functions Fromfloat32(f32 float32) Float16 // Float16 number converted from f32 using IEEE 754 default rounding with identical results to AMD and Intel F16C hardware. NaN inputs are converted with quiet bit always set on, to be like F16C. FromNaN32ps(nan float32) (Float16, error) // Float16 NaN without modifying quiet bit. // The "ps" suffix means "preserve signaling". // Returns sNaN and ErrInvalidNaNValue if nan isn't a NaN. Frombits(b16 uint16) Float16 // Float16 number corresponding to b16 (IEEE 754 binary16 rep.) NaN() Float16 // Float16 of IEEE 754 binary16 not-a-number Inf(sign int) Float16 // Float16 of IEEE 754 binary16 infinity according to sign PrecisionFromfloat32(f32 float32) Precision // quickly indicates exact, ..., overflow, underflow // (inline and < 1 ns/op) // Exported methods (f Float16) Float32() float32 // float32 number converted from f16 using lossless conversion (f Float16) Bits() uint16 // the IEEE 754 binary16 representation of f (f Float16) IsNaN() bool // true if f is not-a-number (NaN) (f Float16) IsQuietNaN() bool // true if f is a quiet not-a-number (NaN) (f Float16) IsInf(sign int) bool // true if f is infinite based on sign (-1=NegInf, 0=any, 1=PosInf) (f Float16) IsFinite() bool // true if f is not infinite or NaN (f Float16) IsNormal() bool // true if f is not zero, infinite, subnormal, or NaN. 
(f Float16) Signbit() bool // true if f is negative or negative zero (f Float16) String() string // string representation of f to satisfy fmt.Stringer interface ``` See [API](https://godoc.org/github.com/x448/float16) at godoc.org for more info. ## Benchmarks Conversions (in pure Go) are around 2.65 ns/op for float16 -> float32 and float32 -> float16 on amd64. Speeds can vary depending on input value. ``` All functions have zero allocations except float16.String(). FromFloat32pi-2 2.59ns ± 0% // speed using Fromfloat32() to convert a float32 of math.Pi to Float16 ToFloat32pi-2 2.69ns ± 0% // speed using Float32() to convert a float16 of math.Pi to float32 Frombits-2 0.29ns ± 5% // speed using Frombits() to cast a uint16 to Float16 PrecisionFromFloat32-2 0.29ns ± 1% // speed using PrecisionFromfloat32() to check for overflows, etc. ``` ## System Requirements * Tested on Go 1.11, 1.12, and 1.13 but it should also work with older versions. * Tested on amd64 but it should also work on all little-endian platforms supported by Go. ## Special Thanks Special thanks to Kathryn Long (starkat99) for creating [half-rs](https://github.com/starkat99/half-rs), a very nice rust implementation of float16. ## License Copyright (c) 2019 Montgomery Edwards⁴⁴⁸ and Faye Amacker Licensed under [MIT License](LICENSE) golang-github-x448-float16-0.8.4/float16.go000066400000000000000000000222351414201604700200370ustar00rootroot00000000000000// Copyright 2019 Montgomery Edwards⁴⁴⁸ and Faye Amacker // // Special thanks to Kathryn Long for her Rust implementation // of float16 at github.com/starkat99/half-rs (MIT license) package float16 import ( "math" "strconv" ) // Float16 represents IEEE 754 half-precision floating-point numbers (binary16). type Float16 uint16 // Precision indicates whether the conversion to Float16 is // exact, subnormal without dropped bits, inexact, underflow, or overflow. 
type Precision int const ( // PrecisionExact is for non-subnormals that don't drop bits during conversion. // All of these can round-trip. Should always convert to float16. PrecisionExact Precision = iota // PrecisionUnknown is for subnormals that don't drop bits during conversion but // not all of these can round-trip so precision is unknown without more effort. // Only 2046 of these can round-trip and the rest cannot round-trip. PrecisionUnknown // PrecisionInexact is for dropped significand bits and cannot round-trip. // Some of these are subnormals. Cannot round-trip float32->float16->float32. PrecisionInexact // PrecisionUnderflow is for Underflows. Cannot round-trip float32->float16->float32. PrecisionUnderflow // PrecisionOverflow is for Overflows. Cannot round-trip float32->float16->float32. PrecisionOverflow ) // PrecisionFromfloat32 returns Precision without performing // the conversion. Conversions from both Infinity and NaN // values will always report PrecisionExact even if NaN payload // or NaN-Quiet-Bit is lost. This function is kept simple to // allow inlining and run < 0.5 ns/op, to serve as a fast filter. 
func PrecisionFromfloat32(f32 float32) Precision { u32 := math.Float32bits(f32) if u32 == 0 || u32 == 0x80000000 { // +- zero will always be exact conversion return PrecisionExact } const COEFMASK uint32 = 0x7fffff // 23 least significant bits const EXPSHIFT uint32 = 23 const EXPBIAS uint32 = 127 const EXPMASK uint32 = uint32(0xff) << EXPSHIFT const DROPMASK uint32 = COEFMASK >> 10 exp := int32(((u32 & EXPMASK) >> EXPSHIFT) - EXPBIAS) coef := u32 & COEFMASK if exp == 128 { // +- infinity or NaN // apps may want to do extra checks for NaN separately return PrecisionExact } // https://en.wikipedia.org/wiki/Half-precision_floating-point_format says, // "Decimals between 2^−24 (minimum positive subnormal) and 2^−14 (maximum subnormal): fixed interval 2^−24" if exp < -24 { return PrecisionUnderflow } if exp > 15 { return PrecisionOverflow } if (coef & DROPMASK) != uint32(0) { // these include subnormals and non-subnormals that dropped bits return PrecisionInexact } if exp < -14 { // Subnormals. Caller may want to test these further. // There are 2046 subnormals that can successfully round-trip f32->f16->f32 // and 20 of those 2046 have 32-bit input coef == 0. // RFC 7049 and 7049bis Draft 12 don't precisely define "preserves value" // so some protocols and libraries will choose to handle subnormals differently // when deciding to encode them to CBOR float32 vs float16. return PrecisionUnknown } return PrecisionExact } // Frombits returns the float16 number corresponding to the IEEE 754 binary16 // representation u16, with the sign bit of u16 and the result in the same bit // position. Frombits(Bits(x)) == x. func Frombits(u16 uint16) Float16 { return Float16(u16) } // Fromfloat32 returns a Float16 value converted from f32. Conversion uses // IEEE default rounding (nearest int, with ties to even). func Fromfloat32(f32 float32) Float16 { return Float16(f32bitsToF16bits(math.Float32bits(f32))) } // ErrInvalidNaNValue indicates a NaN was not received. 
const ErrInvalidNaNValue = float16Error("float16: invalid NaN value, expected IEEE 754 NaN") type float16Error string func (e float16Error) Error() string { return string(e) } // FromNaN32ps converts nan to IEEE binary16 NaN while preserving both // signaling and payload. Unlike Fromfloat32(), which can only return // qNaN because it sets quiet bit = 1, this can return both sNaN and qNaN. // If the result is infinity (sNaN with empty payload), then the // lowest bit of payload is set to make the result a NaN. // Returns ErrInvalidNaNValue and 0x7c01 (sNaN) if nan isn't IEEE 754 NaN. // This function was kept simple to be able to inline. func FromNaN32ps(nan float32) (Float16, error) { const SNAN = Float16(uint16(0x7c01)) // signalling NaN u32 := math.Float32bits(nan) sign := u32 & 0x80000000 exp := u32 & 0x7f800000 coef := u32 & 0x007fffff if (exp != 0x7f800000) || (coef == 0) { return SNAN, ErrInvalidNaNValue } u16 := uint16((sign >> 16) | uint32(0x7c00) | (coef >> 13)) if (u16 & 0x03ff) == 0 { // result became infinity, make it NaN by setting lowest bit in payload u16 = u16 | 0x0001 } return Float16(u16), nil } // NaN returns a Float16 of IEEE 754 binary16 not-a-number (NaN). // Returned NaN value 0x7e01 has all exponent bits = 1 with the // first and last bits = 1 in the significand. This is consistent // with Go's 64-bit math.NaN(). Canonical CBOR in RFC 7049 uses 0x7e00. func NaN() Float16 { return Float16(0x7e01) } // Inf returns a Float16 with an infinity value with the specified sign. // A sign >= returns positive infinity. // A sign < 0 returns negative infinity. func Inf(sign int) Float16 { if sign >= 0 { return Float16(0x7c00) } return Float16(0x8000 | 0x7c00) } // Float32 returns a float32 converted from f (Float16). // This is a lossless conversion. 
func (f Float16) Float32() float32 { u32 := f16bitsToF32bits(uint16(f)) return math.Float32frombits(u32) } // Bits returns the IEEE 754 binary16 representation of f, with the sign bit // of f and the result in the same bit position. Bits(Frombits(x)) == x. func (f Float16) Bits() uint16 { return uint16(f) } // IsNaN reports whether f is an IEEE 754 binary16 “not-a-number” value. func (f Float16) IsNaN() bool { return (f&0x7c00 == 0x7c00) && (f&0x03ff != 0) } // IsQuietNaN reports whether f is a quiet (non-signaling) IEEE 754 binary16 // “not-a-number” value. func (f Float16) IsQuietNaN() bool { return (f&0x7c00 == 0x7c00) && (f&0x03ff != 0) && (f&0x0200 != 0) } // IsInf reports whether f is an infinity (inf). // A sign > 0 reports whether f is positive inf. // A sign < 0 reports whether f is negative inf. // A sign == 0 reports whether f is either inf. func (f Float16) IsInf(sign int) bool { return ((f == 0x7c00) && sign >= 0) || (f == 0xfc00 && sign <= 0) } // IsFinite returns true if f is neither infinite nor NaN. func (f Float16) IsFinite() bool { return (uint16(f) & uint16(0x7c00)) != uint16(0x7c00) } // IsNormal returns true if f is neither zero, infinite, subnormal, or NaN. func (f Float16) IsNormal() bool { exp := uint16(f) & uint16(0x7c00) return (exp != uint16(0x7c00)) && (exp != 0) } // Signbit reports whether f is negative or negative zero. func (f Float16) Signbit() bool { return (uint16(f) & uint16(0x8000)) != 0 } // String satisfies the fmt.Stringer interface. func (f Float16) String() string { return strconv.FormatFloat(float64(f.Float32()), 'f', -1, 32) } // f16bitsToF32bits returns uint32 (float32 bits) converted from specified uint16. func f16bitsToF32bits(in uint16) uint32 { // All 65536 conversions with this were confirmed to be correct // by Montgomery Edwards⁴⁴⁸ (github.com/x448). 
sign := uint32(in&0x8000) << 16 // sign for 32-bit exp := uint32(in&0x7c00) >> 10 // exponenent for 16-bit coef := uint32(in&0x03ff) << 13 // significand for 32-bit if exp == 0x1f { if coef == 0 { // infinity return sign | 0x7f800000 | coef } // NaN return sign | 0x7fc00000 | coef } if exp == 0 { if coef == 0 { // zero return sign } // normalize subnormal numbers exp++ for coef&0x7f800000 == 0 { coef <<= 1 exp-- } coef &= 0x007fffff } return sign | ((exp + (0x7f - 0xf)) << 23) | coef } // f32bitsToF16bits returns uint16 (Float16 bits) converted from the specified float32. // Conversion rounds to nearest integer with ties to even. func f32bitsToF16bits(u32 uint32) uint16 { // Translated from Rust to Go by Montgomery Edwards⁴⁴⁸ (github.com/x448). // All 4294967296 conversions with this were confirmed to be correct by x448. // Original Rust implementation is by Kathryn Long (github.com/starkat99) with MIT license. sign := u32 & 0x80000000 exp := u32 & 0x7f800000 coef := u32 & 0x007fffff if exp == 0x7f800000 { // NaN or Infinity nanBit := uint32(0) if coef != 0 { nanBit = uint32(0x0200) } return uint16((sign >> 16) | uint32(0x7c00) | nanBit | (coef >> 13)) } halfSign := sign >> 16 unbiasedExp := int32(exp>>23) - 127 halfExp := unbiasedExp + 15 if halfExp >= 0x1f { return uint16(halfSign | uint32(0x7c00)) } if halfExp <= 0 { if 14-halfExp > 24 { return uint16(halfSign) } coef := coef | uint32(0x00800000) halfCoef := coef >> uint32(14-halfExp) roundBit := uint32(1) << uint32(13-halfExp) if (coef&roundBit) != 0 && (coef&(3*roundBit-1)) != 0 { halfCoef++ } return uint16(halfSign | halfCoef) } uHalfExp := uint32(halfExp) << 10 halfCoef := coef >> 13 roundBit := uint32(0x00001000) if (coef&roundBit) != 0 && (coef&(3*roundBit-1)) != 0 { return uint16((halfSign | uHalfExp | halfCoef) + 1) } return uint16(halfSign | uHalfExp | halfCoef) } golang-github-x448-float16-0.8.4/float16_bench_test.go000066400000000000000000000034301414201604700222310ustar00rootroot00000000000000// 
Copyright 2019 Montgomery Edwards⁴⁴⁸ and Faye Amacker package float16_test import ( "math" "testing" "github.com/x448/float16" ) // prevent compiler optimizing out code by assigning to these var resultF16 float16.Float16 var resultF32 float32 var resultStr string var pcn float16.Precision func BenchmarkFloat32pi(b *testing.B) { result := float32(0) pi32 := float32(math.Pi) pi16 := float16.Fromfloat32(pi32) for i := 0; i < b.N; i++ { f16 := float16.Frombits(uint16(pi16)) result = f16.Float32() } resultF32 = result } func BenchmarkFrombits(b *testing.B) { result := float16.Float16(0) pi32 := float32(math.Pi) pi16 := float16.Fromfloat32(pi32) for i := 0; i < b.N; i++ { result = float16.Frombits(uint16(pi16)) } resultF16 = result } func BenchmarkFromFloat32pi(b *testing.B) { result := float16.Float16(0) pi := float32(math.Pi) for i := 0; i < b.N; i++ { result = float16.Fromfloat32(pi) } resultF16 = result } func BenchmarkFromFloat32nan(b *testing.B) { result := float16.Float16(0) nan := float32(math.NaN()) for i := 0; i < b.N; i++ { result = float16.Fromfloat32(nan) } resultF16 = result } func BenchmarkFromFloat32subnorm(b *testing.B) { result := float16.Float16(0) subnorm := math.Float32frombits(0x007fffff) for i := 0; i < b.N; i++ { result = float16.Fromfloat32(subnorm) } resultF16 = result } func BenchmarkPrecisionFromFloat32(b *testing.B) { var result float16.Precision //pi := float32(math.Pi) for i := 0; i < b.N; i++ { f32 := float32(0.00001) + float32(0.00001) result = float16.PrecisionFromfloat32(f32) } pcn = result } func BenchmarkString(b *testing.B) { result := "1.5" pi32 := float32(math.Pi) pi16 := float16.Fromfloat32(pi32) for i := 0; i < b.N; i++ { result = pi16.String() } resultStr = result } golang-github-x448-float16-0.8.4/float16_test.go000066400000000000000000001141641414201604700211010ustar00rootroot00000000000000// Copyright 2019 Montgomery Edwards⁴⁴⁸ and Faye Amacker package float16_test import ( "bytes" "crypto/sha512" "encoding/binary" 
"encoding/hex" "fmt" "math" "testing" "github.com/x448/float16" ) // wantF32toF16bits is a tiny subset of expected values var wantF32toF16bits = []struct { in float32 out uint16 }{ // generated to provide 100% code coverage plus additional tests for rounding, etc. {in: math.Float32frombits(0x00000000), out: 0x0000}, // in f32=0.000000, out f16=0 {in: math.Float32frombits(0x00000001), out: 0x0000}, // in f32=0.000000, out f16=0 {in: math.Float32frombits(0x00001fff), out: 0x0000}, // in f32=0.000000, out f16=0 {in: math.Float32frombits(0x00002000), out: 0x0000}, // in f32=0.000000, out f16=0 {in: math.Float32frombits(0x00003fff), out: 0x0000}, // in f32=0.000000, out f16=0 {in: math.Float32frombits(0x00004000), out: 0x0000}, // in f32=0.000000, out f16=0 {in: math.Float32frombits(0x007fffff), out: 0x0000}, // in f32=0.000000, out f16=0 {in: math.Float32frombits(0x00800000), out: 0x0000}, // in f32=0.000000, out f16=0 {in: math.Float32frombits(0x33000000), out: 0x0000}, // in f32=0.000000, out f16=0 {in: math.Float32frombits(0x33000001), out: 0x0001}, // in f32=0.000000, out f16=0.000000059604645 {in: math.Float32frombits(0x33000002), out: 0x0001}, // in f32=0.000000, out f16=0.000000059604645 {in: math.Float32frombits(0x387fc000), out: 0x03ff}, // in f32=0.000061, out f16=0.00006097555 // exp32=-15 (underflows binary16 exp) but round-trips {in: math.Float32frombits(0x387fffff), out: 0x0400}, // in f32=0.000061, out f16=0.000061035156 {in: math.Float32frombits(0x38800000), out: 0x0400}, // in f32=0.000061, out f16=0.000061035156 {in: math.Float32frombits(0x38801fff), out: 0x0401}, // in f32=0.000061, out f16=0.00006109476 {in: math.Float32frombits(0x38802000), out: 0x0401}, // in f32=0.000061, out f16=0.00006109476 {in: math.Float32frombits(0x38803fff), out: 0x0402}, // in f32=0.000061, out f16=0.000061154366 {in: math.Float32frombits(0x38804000), out: 0x0402}, // in f32=0.000061, out f16=0.000061154366 {in: math.Float32frombits(0x33bfffff), out: 0x0001}, // in 
f32=0.000000, out f16=0.000000059604645 {in: math.Float32frombits(0x33c00000), out: 0x0002}, // in f32=0.000000, out f16=0.00000011920929 {in: math.Float32frombits(0x33c00001), out: 0x0002}, // in f32=0.000000, out f16=0.00000011920929 {in: math.Float32frombits(0x477fffff), out: 0x7c00}, // in f32=65535.996094, out f16=+Inf {in: math.Float32frombits(0x47800000), out: 0x7c00}, // in f32=65536.000000, out f16=+Inf {in: math.Float32frombits(0x7f7fffff), out: 0x7c00}, // in f32=340282346638528859811704183484516925440.000000, out f16=+Inf {in: math.Float32frombits(0x7f800000), out: 0x7c00}, // in f32=+Inf, out f16=+Inf {in: math.Float32frombits(0x7f801fff), out: 0x7e00}, // in f32=NaN, out f16=NaN {in: math.Float32frombits(0x7f802000), out: 0x7e01}, // in f32=NaN, out f16=NaN {in: math.Float32frombits(0x7f803fff), out: 0x7e01}, // in f32=NaN, out f16=NaN {in: math.Float32frombits(0x7f804000), out: 0x7e02}, // in f32=NaN, out f16=NaN {in: math.Float32frombits(0x7fffffff), out: 0x7fff}, // in f32=NaN, out f16=NaN {in: math.Float32frombits(0x80000000), out: 0x8000}, // in f32=-0.000000, out f16=-0 {in: math.Float32frombits(0x80001fff), out: 0x8000}, // in f32=-0.000000, out f16=-0 {in: math.Float32frombits(0x80002000), out: 0x8000}, // in f32=-0.000000, out f16=-0 {in: math.Float32frombits(0x80003fff), out: 0x8000}, // in f32=-0.000000, out f16=-0 {in: math.Float32frombits(0x80004000), out: 0x8000}, // in f32=-0.000000, out f16=-0 {in: math.Float32frombits(0x807fffff), out: 0x8000}, // in f32=-0.000000, out f16=-0 {in: math.Float32frombits(0x80800000), out: 0x8000}, // in f32=-0.000000, out f16=-0 {in: math.Float32frombits(0xb87fc000), out: 0x83ff}, // in f32=-0.000061, out f16=-0.00006097555 // exp32=-15 (underflows binary16 exp) but round-trips {in: math.Float32frombits(0xb87fffff), out: 0x8400}, // in f32=-0.000061, out f16=-0.000061035156 {in: math.Float32frombits(0xb8800000), out: 0x8400}, // in f32=-0.000061, out f16=-0.000061035156 {in: 
math.Float32frombits(0xb8801fff), out: 0x8401}, // in f32=-0.000061, out f16=-0.00006109476 {in: math.Float32frombits(0xb8802000), out: 0x8401}, // in f32=-0.000061, out f16=-0.00006109476 {in: math.Float32frombits(0xb8803fff), out: 0x8402}, // in f32=-0.000061, out f16=-0.000061154366 {in: math.Float32frombits(0xb8804000), out: 0x8402}, // in f32=-0.000061, out f16=-0.000061154366 {in: math.Float32frombits(0xc77fffff), out: 0xfc00}, // in f32=-65535.996094, out f16=-Inf {in: math.Float32frombits(0xc7800000), out: 0xfc00}, // in f32=-65536.000000, out f16=-Inf {in: math.Float32frombits(0xff7fffff), out: 0xfc00}, // in f32=-340282346638528859811704183484516925440.000000, out f16=-Inf {in: math.Float32frombits(0xff800000), out: 0xfc00}, // in f32=-Inf, out f16=-Inf {in: math.Float32frombits(0xff801fff), out: 0xfe00}, // in f32=NaN, out f16=NaN {in: math.Float32frombits(0xff802000), out: 0xfe01}, // in f32=NaN, out f16=NaN {in: math.Float32frombits(0xff803fff), out: 0xfe01}, // in f32=NaN, out f16=NaN {in: math.Float32frombits(0xff804000), out: 0xfe02}, // in f32=NaN, out f16=NaN // additional tests {in: math.Float32frombits(0xc77ff000), out: 0xfc00}, // in f32=-65520.000000, out f16=-Inf {in: math.Float32frombits(0xc77fef00), out: 0xfbff}, // in f32=-65519.000000, out f16=-65504 {in: math.Float32frombits(0xc77fee00), out: 0xfbff}, // in f32=-65518.000000, out f16=-65504 {in: math.Float32frombits(0xc5802000), out: 0xec01}, // in f32=-4100.000000, out f16=-4100 {in: math.Float32frombits(0xc5801800), out: 0xec01}, // in f32=-4099.000000, out f16=-4100 {in: math.Float32frombits(0xc5801000), out: 0xec00}, // in f32=-4098.000000, out f16=-4096 {in: math.Float32frombits(0xc5800800), out: 0xec00}, // in f32=-4097.000000, out f16=-4096 {in: math.Float32frombits(0xc5800000), out: 0xec00}, // in f32=-4096.000000, out f16=-4096 {in: math.Float32frombits(0xc57ff000), out: 0xec00}, // in f32=-4095.000000, out f16=-4096 {in: math.Float32frombits(0xc57fe000), out: 0xebff}, // in 
f32=-4094.000000, out f16=-4094 {in: math.Float32frombits(0xc57fd000), out: 0xebfe}, // in f32=-4093.000000, out f16=-4092 {in: math.Float32frombits(0xc5002000), out: 0xe801}, // in f32=-2050.000000, out f16=-2050 {in: math.Float32frombits(0xc5001000), out: 0xe800}, // in f32=-2049.000000, out f16=-2048 {in: math.Float32frombits(0xc5000829), out: 0xe800}, // in f32=-2048.510010, out f16=-2048 {in: math.Float32frombits(0xc5000800), out: 0xe800}, // in f32=-2048.500000, out f16=-2048 {in: math.Float32frombits(0xc50007d7), out: 0xe800}, // in f32=-2048.489990, out f16=-2048 {in: math.Float32frombits(0xc5000000), out: 0xe800}, // in f32=-2048.000000, out f16=-2048 {in: math.Float32frombits(0xc4fff052), out: 0xe800}, // in f32=-2047.510010, out f16=-2048 {in: math.Float32frombits(0xc4fff000), out: 0xe800}, // in f32=-2047.500000, out f16=-2048 {in: math.Float32frombits(0xc4ffefae), out: 0xe7ff}, // in f32=-2047.489990, out f16=-2047 {in: math.Float32frombits(0xc4ffe000), out: 0xe7ff}, // in f32=-2047.000000, out f16=-2047 {in: math.Float32frombits(0xc4ffc000), out: 0xe7fe}, // in f32=-2046.000000, out f16=-2046 {in: math.Float32frombits(0xc4ffa000), out: 0xe7fd}, // in f32=-2045.000000, out f16=-2045 {in: math.Float32frombits(0xbf800000), out: 0xbc00}, // in f32=-1.000000, out f16=-1 {in: math.Float32frombits(0xbf028f5c), out: 0xb814}, // in f32=-0.510000, out f16=-0.5097656 {in: math.Float32frombits(0xbf000000), out: 0xb800}, // in f32=-0.500000, out f16=-0.5 {in: math.Float32frombits(0xbefae148), out: 0xb7d7}, // in f32=-0.490000, out f16=-0.48999023 {in: math.Float32frombits(0x3efae148), out: 0x37d7}, // in f32=0.490000, out f16=0.48999023 {in: math.Float32frombits(0x3f000000), out: 0x3800}, // in f32=0.500000, out f16=0.5 {in: math.Float32frombits(0x3f028f5c), out: 0x3814}, // in f32=0.510000, out f16=0.5097656 {in: math.Float32frombits(0x3f800000), out: 0x3c00}, // in f32=1.000000, out f16=1 {in: math.Float32frombits(0x3fbeb852), out: 0x3df6}, // in f32=1.490000, 
out f16=1.4902344 {in: math.Float32frombits(0x3fc00000), out: 0x3e00}, // in f32=1.500000, out f16=1.5 {in: math.Float32frombits(0x3fc147ae), out: 0x3e0a}, // in f32=1.510000, out f16=1.5097656 {in: math.Float32frombits(0x3fcf1bbd), out: 0x3e79}, // in f32=1.618034, out f16=1.6181641 {in: math.Float32frombits(0x401f5c29), out: 0x40fb}, // in f32=2.490000, out f16=2.4902344 {in: math.Float32frombits(0x40200000), out: 0x4100}, // in f32=2.500000, out f16=2.5 {in: math.Float32frombits(0x4020a3d7), out: 0x4105}, // in f32=2.510000, out f16=2.5097656 {in: math.Float32frombits(0x402df854), out: 0x4170}, // in f32=2.718282, out f16=2.71875 {in: math.Float32frombits(0x40490fdb), out: 0x4248}, // in f32=3.141593, out f16=3.140625 {in: math.Float32frombits(0x40b00000), out: 0x4580}, // in f32=5.500000, out f16=5.5 {in: math.Float32frombits(0x44ffa000), out: 0x67fd}, // in f32=2045.000000, out f16=2045 {in: math.Float32frombits(0x44ffc000), out: 0x67fe}, // in f32=2046.000000, out f16=2046 {in: math.Float32frombits(0x44ffe000), out: 0x67ff}, // in f32=2047.000000, out f16=2047 {in: math.Float32frombits(0x44ffefae), out: 0x67ff}, // in f32=2047.489990, out f16=2047 {in: math.Float32frombits(0x44fff000), out: 0x6800}, // in f32=2047.500000, out f16=2048 {in: math.Float32frombits(0x44fff052), out: 0x6800}, // in f32=2047.510010, out f16=2048 {in: math.Float32frombits(0x45000000), out: 0x6800}, // in f32=2048.000000, out f16=2048 {in: math.Float32frombits(0x450007d7), out: 0x6800}, // in f32=2048.489990, out f16=2048 {in: math.Float32frombits(0x45000800), out: 0x6800}, // in f32=2048.500000, out f16=2048 {in: math.Float32frombits(0x45000829), out: 0x6800}, // in f32=2048.510010, out f16=2048 {in: math.Float32frombits(0x45001000), out: 0x6800}, // in f32=2049.000000, out f16=2048 {in: math.Float32frombits(0x450017d7), out: 0x6801}, // in f32=2049.489990, out f16=2050 {in: math.Float32frombits(0x45001800), out: 0x6801}, // in f32=2049.500000, out f16=2050 {in: 
math.Float32frombits(0x45001829), out: 0x6801}, // in f32=2049.510010, out f16=2050 {in: math.Float32frombits(0x45002000), out: 0x6801}, // in f32=2050.000000, out f16=2050 {in: math.Float32frombits(0x45003000), out: 0x6802}, // in f32=2051.000000, out f16=2052 {in: math.Float32frombits(0x457fd000), out: 0x6bfe}, // in f32=4093.000000, out f16=4092 {in: math.Float32frombits(0x457fe000), out: 0x6bff}, // in f32=4094.000000, out f16=4094 {in: math.Float32frombits(0x457ff000), out: 0x6c00}, // in f32=4095.000000, out f16=4096 {in: math.Float32frombits(0x45800000), out: 0x6c00}, // in f32=4096.000000, out f16=4096 {in: math.Float32frombits(0x45800800), out: 0x6c00}, // in f32=4097.000000, out f16=4096 {in: math.Float32frombits(0x45801000), out: 0x6c00}, // in f32=4098.000000, out f16=4096 {in: math.Float32frombits(0x45801800), out: 0x6c01}, // in f32=4099.000000, out f16=4100 {in: math.Float32frombits(0x45802000), out: 0x6c01}, // in f32=4100.000000, out f16=4100 {in: math.Float32frombits(0x45ad9c00), out: 0x6d6d}, // in f32=5555.500000, out f16=5556 {in: math.Float32frombits(0x45ffe800), out: 0x6fff}, // in f32=8189.000000, out f16=8188 {in: math.Float32frombits(0x45fff000), out: 0x7000}, // in f32=8190.000000, out f16=8192 {in: math.Float32frombits(0x45fff800), out: 0x7000}, // in f32=8191.000000, out f16=8192 {in: math.Float32frombits(0x46000000), out: 0x7000}, // in f32=8192.000000, out f16=8192 {in: math.Float32frombits(0x46000400), out: 0x7000}, // in f32=8193.000000, out f16=8192 {in: math.Float32frombits(0x46000800), out: 0x7000}, // in f32=8194.000000, out f16=8192 {in: math.Float32frombits(0x46000c00), out: 0x7000}, // in f32=8195.000000, out f16=8192 {in: math.Float32frombits(0x46001000), out: 0x7000}, // in f32=8196.000000, out f16=8192 {in: math.Float32frombits(0x46001400), out: 0x7001}, // in f32=8197.000000, out f16=8200 {in: math.Float32frombits(0x46001800), out: 0x7001}, // in f32=8198.000000, out f16=8200 {in: math.Float32frombits(0x46001c00), out: 
0x7001}, // in f32=8199.000000, out f16=8200 {in: math.Float32frombits(0x46002000), out: 0x7001}, // in f32=8200.000000, out f16=8200 {in: math.Float32frombits(0x46002400), out: 0x7001}, // in f32=8201.000000, out f16=8200 {in: math.Float32frombits(0x46002800), out: 0x7001}, // in f32=8202.000000, out f16=8200 {in: math.Float32frombits(0x46002c00), out: 0x7001}, // in f32=8203.000000, out f16=8200 {in: math.Float32frombits(0x46003000), out: 0x7002}, // in f32=8204.000000, out f16=8208 {in: math.Float32frombits(0x467fec00), out: 0x73ff}, // in f32=16379.000000, out f16=16376 {in: math.Float32frombits(0x467ff000), out: 0x7400}, // in f32=16380.000000, out f16=16384 {in: math.Float32frombits(0x467ff400), out: 0x7400}, // in f32=16381.000000, out f16=16384 {in: math.Float32frombits(0x467ff800), out: 0x7400}, // in f32=16382.000000, out f16=16384 {in: math.Float32frombits(0x467ffc00), out: 0x7400}, // in f32=16383.000000, out f16=16384 {in: math.Float32frombits(0x46800000), out: 0x7400}, // in f32=16384.000000, out f16=16384 {in: math.Float32frombits(0x46800200), out: 0x7400}, // in f32=16385.000000, out f16=16384 {in: math.Float32frombits(0x46800400), out: 0x7400}, // in f32=16386.000000, out f16=16384 {in: math.Float32frombits(0x46800600), out: 0x7400}, // in f32=16387.000000, out f16=16384 {in: math.Float32frombits(0x46800800), out: 0x7400}, // in f32=16388.000000, out f16=16384 {in: math.Float32frombits(0x46800a00), out: 0x7400}, // in f32=16389.000000, out f16=16384 {in: math.Float32frombits(0x46800c00), out: 0x7400}, // in f32=16390.000000, out f16=16384 {in: math.Float32frombits(0x46800e00), out: 0x7400}, // in f32=16391.000000, out f16=16384 {in: math.Float32frombits(0x46801000), out: 0x7400}, // in f32=16392.000000, out f16=16384 {in: math.Float32frombits(0x46801200), out: 0x7401}, // in f32=16393.000000, out f16=16400 {in: math.Float32frombits(0x46801400), out: 0x7401}, // in f32=16394.000000, out f16=16400 {in: math.Float32frombits(0x46801600), out: 0x7401}, 
// in f32=16395.000000, out f16=16400 {in: math.Float32frombits(0x46801800), out: 0x7401}, // in f32=16396.000000, out f16=16400 {in: math.Float32frombits(0x46801a00), out: 0x7401}, // in f32=16397.000000, out f16=16400 {in: math.Float32frombits(0x46801c00), out: 0x7401}, // in f32=16398.000000, out f16=16400 {in: math.Float32frombits(0x46801e00), out: 0x7401}, // in f32=16399.000000, out f16=16400 {in: math.Float32frombits(0x46802000), out: 0x7401}, // in f32=16400.000000, out f16=16400 {in: math.Float32frombits(0x46802200), out: 0x7401}, // in f32=16401.000000, out f16=16400 {in: math.Float32frombits(0x46802400), out: 0x7401}, // in f32=16402.000000, out f16=16400 {in: math.Float32frombits(0x46802600), out: 0x7401}, // in f32=16403.000000, out f16=16400 {in: math.Float32frombits(0x46802800), out: 0x7401}, // in f32=16404.000000, out f16=16400 {in: math.Float32frombits(0x46802a00), out: 0x7401}, // in f32=16405.000000, out f16=16400 {in: math.Float32frombits(0x46802c00), out: 0x7401}, // in f32=16406.000000, out f16=16400 {in: math.Float32frombits(0x46802e00), out: 0x7401}, // in f32=16407.000000, out f16=16400 {in: math.Float32frombits(0x46803000), out: 0x7402}, // in f32=16408.000000, out f16=16416 {in: math.Float32frombits(0x46ffee00), out: 0x77ff}, // in f32=32759.000000, out f16=32752 {in: math.Float32frombits(0x46fff000), out: 0x7800}, // in f32=32760.000000, out f16=32768 {in: math.Float32frombits(0x46fff200), out: 0x7800}, // in f32=32761.000000, out f16=32768 {in: math.Float32frombits(0x46fff400), out: 0x7800}, // in f32=32762.000000, out f16=32768 {in: math.Float32frombits(0x46fff600), out: 0x7800}, // in f32=32763.000000, out f16=32768 {in: math.Float32frombits(0x46fff800), out: 0x7800}, // in f32=32764.000000, out f16=32768 {in: math.Float32frombits(0x46fffa00), out: 0x7800}, // in f32=32765.000000, out f16=32768 {in: math.Float32frombits(0x46fffc00), out: 0x7800}, // in f32=32766.000000, out f16=32768 {in: math.Float32frombits(0x46fffe00), out: 
0x7800}, // in f32=32767.000000, out f16=32768 {in: math.Float32frombits(0x47000000), out: 0x7800}, // in f32=32768.000000, out f16=32768 {in: math.Float32frombits(0x47000100), out: 0x7800}, // in f32=32769.000000, out f16=32768 {in: math.Float32frombits(0x47000200), out: 0x7800}, // in f32=32770.000000, out f16=32768 {in: math.Float32frombits(0x47000300), out: 0x7800}, // in f32=32771.000000, out f16=32768 {in: math.Float32frombits(0x47000400), out: 0x7800}, // in f32=32772.000000, out f16=32768 {in: math.Float32frombits(0x47000500), out: 0x7800}, // in f32=32773.000000, out f16=32768 {in: math.Float32frombits(0x47000600), out: 0x7800}, // in f32=32774.000000, out f16=32768 {in: math.Float32frombits(0x47000700), out: 0x7800}, // in f32=32775.000000, out f16=32768 {in: math.Float32frombits(0x47000800), out: 0x7800}, // in f32=32776.000000, out f16=32768 {in: math.Float32frombits(0x47000900), out: 0x7800}, // in f32=32777.000000, out f16=32768 {in: math.Float32frombits(0x47000a00), out: 0x7800}, // in f32=32778.000000, out f16=32768 {in: math.Float32frombits(0x47000b00), out: 0x7800}, // in f32=32779.000000, out f16=32768 {in: math.Float32frombits(0x47000c00), out: 0x7800}, // in f32=32780.000000, out f16=32768 {in: math.Float32frombits(0x47000d00), out: 0x7800}, // in f32=32781.000000, out f16=32768 {in: math.Float32frombits(0x47000e00), out: 0x7800}, // in f32=32782.000000, out f16=32768 {in: math.Float32frombits(0x47000f00), out: 0x7800}, // in f32=32783.000000, out f16=32768 {in: math.Float32frombits(0x47001000), out: 0x7800}, // in f32=32784.000000, out f16=32768 {in: math.Float32frombits(0x47001100), out: 0x7801}, // in f32=32785.000000, out f16=32800 {in: math.Float32frombits(0x47001200), out: 0x7801}, // in f32=32786.000000, out f16=32800 {in: math.Float32frombits(0x47001300), out: 0x7801}, // in f32=32787.000000, out f16=32800 {in: math.Float32frombits(0x47001400), out: 0x7801}, // in f32=32788.000000, out f16=32800 {in: math.Float32frombits(0x47001500), 
out: 0x7801}, // in f32=32789.000000, out f16=32800 {in: math.Float32frombits(0x47001600), out: 0x7801}, // in f32=32790.000000, out f16=32800 {in: math.Float32frombits(0x47001700), out: 0x7801}, // in f32=32791.000000, out f16=32800 {in: math.Float32frombits(0x47001800), out: 0x7801}, // in f32=32792.000000, out f16=32800 {in: math.Float32frombits(0x47001900), out: 0x7801}, // in f32=32793.000000, out f16=32800 {in: math.Float32frombits(0x47001a00), out: 0x7801}, // in f32=32794.000000, out f16=32800 {in: math.Float32frombits(0x47001b00), out: 0x7801}, // in f32=32795.000000, out f16=32800 {in: math.Float32frombits(0x47001c00), out: 0x7801}, // in f32=32796.000000, out f16=32800 {in: math.Float32frombits(0x47001d00), out: 0x7801}, // in f32=32797.000000, out f16=32800 {in: math.Float32frombits(0x47001e00), out: 0x7801}, // in f32=32798.000000, out f16=32800 {in: math.Float32frombits(0x47001f00), out: 0x7801}, // in f32=32799.000000, out f16=32800 {in: math.Float32frombits(0x47002000), out: 0x7801}, // in f32=32800.000000, out f16=32800 {in: math.Float32frombits(0x47002100), out: 0x7801}, // in f32=32801.000000, out f16=32800 {in: math.Float32frombits(0x47002200), out: 0x7801}, // in f32=32802.000000, out f16=32800 {in: math.Float32frombits(0x47002300), out: 0x7801}, // in f32=32803.000000, out f16=32800 {in: math.Float32frombits(0x47002400), out: 0x7801}, // in f32=32804.000000, out f16=32800 {in: math.Float32frombits(0x47002500), out: 0x7801}, // in f32=32805.000000, out f16=32800 {in: math.Float32frombits(0x47002600), out: 0x7801}, // in f32=32806.000000, out f16=32800 {in: math.Float32frombits(0x47002700), out: 0x7801}, // in f32=32807.000000, out f16=32800 {in: math.Float32frombits(0x47002800), out: 0x7801}, // in f32=32808.000000, out f16=32800 {in: math.Float32frombits(0x47002900), out: 0x7801}, // in f32=32809.000000, out f16=32800 {in: math.Float32frombits(0x47002a00), out: 0x7801}, // in f32=32810.000000, out f16=32800 {in: 
math.Float32frombits(0x47002b00), out: 0x7801}, // in f32=32811.000000, out f16=32800
	{in: math.Float32frombits(0x47002c00), out: 0x7801}, // in f32=32812.000000, out f16=32800
	{in: math.Float32frombits(0x47002d00), out: 0x7801}, // in f32=32813.000000, out f16=32800
	{in: math.Float32frombits(0x47002e00), out: 0x7801}, // in f32=32814.000000, out f16=32800
	{in: math.Float32frombits(0x47002f00), out: 0x7801}, // in f32=32815.000000, out f16=32800
	{in: math.Float32frombits(0x47003000), out: 0x7802}, // in f32=32816.000000, out f16=32832
	{in: math.Float32frombits(0x477fe500), out: 0x7bff}, // in f32=65509.000000, out f16=65504
	{in: math.Float32frombits(0x477fe100), out: 0x7bff}, // in f32=65505.000000, out f16=65504
	{in: math.Float32frombits(0x477fee00), out: 0x7bff}, // in f32=65518.000000, out f16=65504
	{in: math.Float32frombits(0x477fef00), out: 0x7bff}, // in f32=65519.000000, out f16=65504
	{in: math.Float32frombits(0x477feffd), out: 0x7bff}, // in f32=65519.988281, out f16=65504
	{in: math.Float32frombits(0x477ff000), out: 0x7c00}, // in f32=65520.000000, out f16=+Inf
}

// TestPrecisionFromfloat32 converts every entry of wantF32toF16bits with
// Fromfloat32, compares the resulting bits with the expected ones, and runs
// checkPrecision on each entry. It then spot-checks PrecisionFromfloat32 on
// hand-picked inputs covering the exact, unknown, inexact, underflow and
// overflow outcomes.
func TestPrecisionFromfloat32(t *testing.T) {
	for i, v := range wantF32toF16bits {
		f16 := float16.Fromfloat32(v.in)
		if u16 := uint16(f16); u16 != v.out {
			t.Errorf("i=%d, in f32bits=0x%08x, wanted=0x%04x, got=0x%04x.", i, math.Float32bits(v.in), v.out, u16)
		}

		checkPrecision(t, v.in, f16, uint64(i))
	}

	// value that doesn't drop any bits in the significand, is within normal exponent range
	f32 := float32(5.5)
	if pre := float16.PrecisionFromfloat32(f32); pre != float16.PrecisionExact {
		t.Errorf("f32bits=0x%08x, wanted=PrecisionExact (%d), got=%d.", math.Float32bits(f32), float16.PrecisionExact, pre)
	}

	// subnormal value with coef = 0 that can round-trip float32->float16->float32
	f32 = math.Float32frombits(0x38000000)
	if pre := float16.PrecisionFromfloat32(f32); pre != float16.PrecisionUnknown {
		t.Errorf("f32bits=0x%08x, wanted=PrecisionUnknown (%d), got=%d.", math.Float32bits(f32), float16.PrecisionUnknown, pre)
	}

	// subnormal value with coef !=0 that can round-trip float32->float16->float32
	f32 = math.Float32frombits(0x387fc000)
	if pre := float16.PrecisionFromfloat32(f32); pre != float16.PrecisionUnknown {
		t.Errorf("f32bits=0x%08x, wanted=PrecisionUnknown (%d), got=%d.", math.Float32bits(f32), float16.PrecisionUnknown, pre)
	}

	// subnormal value with no dropped bits that cannot round-trip float32->float16->float32
	f32 = math.Float32frombits(0x33c00000)
	if pre := float16.PrecisionFromfloat32(f32); pre != float16.PrecisionUnknown {
		t.Errorf("f32bits=0x%08x, wanted=PrecisionUnknown (%d), got=%d.", math.Float32bits(f32), float16.PrecisionUnknown, pre)
	}

	// subnormal value with dropped non-zero bits > 0
	f32 = math.Float32frombits(0x38000001)
	if pre := float16.PrecisionFromfloat32(f32); pre != float16.PrecisionInexact {
		t.Errorf("f32bits=0x%08x, wanted=PrecisionInexact (%d), got=%d.", math.Float32bits(f32), float16.PrecisionInexact, pre)
	}

	// value that cannot "preserve value" because it drops bits in the significand
	f32 = float32(math.Pi)
	if pre := float16.PrecisionFromfloat32(f32); pre != float16.PrecisionInexact {
		t.Errorf("f32bits=0x%08x, wanted=PrecisionInexact (%d), got=%d.", math.Float32bits(f32), float16.PrecisionInexact, pre)
	}

	// value that will underflow
	f32 = math.Float32frombits(0x1)
	if pre := float16.PrecisionFromfloat32(f32); pre != float16.PrecisionUnderflow {
		t.Errorf("f32bits=0x%08x, wanted=PrecisionUnderflow (%d), got=%d.", math.Float32bits(f32), float16.PrecisionUnderflow, pre)
	}

	// value that will underflow
	f32 = math.Float32frombits(0x33000000)
	if pre := float16.PrecisionFromfloat32(f32); pre != float16.PrecisionUnderflow {
		t.Errorf("f32bits=0x%08x, wanted=PrecisionUnderflow (%d), got=%d.", math.Float32bits(f32), float16.PrecisionUnderflow, pre)
	}

	// value that will overflow
	f32 = math.Float32frombits(0x47800000)
	if pre := float16.PrecisionFromfloat32(f32); pre != float16.PrecisionOverflow {
		t.Errorf("f32bits=0x%08x, wanted=PrecisionOverflow (%d), got=%d.", math.Float32bits(f32), float16.PrecisionOverflow, pre)
	}
}

// TestFromNaN32ps converts every entry of wantF32toF16bits with Fromfloat32,
// runs checkFromNaN32ps on each entry, and then verifies how FromNaN32ps
// reports a non-NaN input.
func TestFromNaN32ps(t *testing.T) {
	for i, v := range wantF32toF16bits {
		f16 := float16.Fromfloat32(v.in)
		if u16 := uint16(f16); u16 != v.out {
			t.Errorf("i=%d, in f32bits=0x%08x, wanted=0x%04x, got=0x%04x.", i, math.Float32bits(v.in), v.out, u16)
		}

		checkFromNaN32ps(t, v.in, f16)
	}

	// since checkFromNaN32ps rejects non-NaN input, try one here
	nan, err := float16.FromNaN32ps(float32(math.Pi))
	if err != float16.ErrInvalidNaNValue {
		t.Errorf("FromNaN32ps: in float32(math.Pi) wanted err float16.ErrInvalidNaNValue, got err = %q", err)
	}
	// Only inspect the message of a non-nil error: calling Error on a nil
	// error would panic and hide the failure already reported above.
	if err != nil {
		if msg := err.Error(); msg != "float16: invalid NaN value, expected IEEE 754 NaN" {
			t.Errorf("unexpected string value returned by err.Error() for ErrInvalidNaNValue: %s", msg)
		}
	}
	if uint16(nan) != 0x7c01 { // signalling NaN
		t.Errorf("FromNaN32ps: in float32(math.Pi) wanted nan = 0x7c01, got nan = 0x%04x", uint16(nan))
	}
}

// Test a small subset of possible conversions from float32 to Float16.
// TestSomeFromFloat32 runs in under 1 second while TestAllFromFloat32 takes about 45 seconds.
func TestSomeFromFloat32(t *testing.T) {
	for i, v := range wantF32toF16bits {
		got := uint16(float16.Fromfloat32(v.in))
		if got != v.out {
			t.Errorf("i=%d, in f32bits=0x%08x, wanted=0x%04x, got=0x%04x.", i, math.Float32bits(v.in), v.out, got)
		}
	}
}

// Test all possible 4294967296 float32 input values and results for
// Fromfloat32(), FromNaN32ps(), and PrecisionFromfloat32().
func TestAllFromFloat32(t *testing.T) { if testing.Short() { t.Skip("skipping TestAllFromFloat32 in short mode.") } fmt.Printf("WARNING: TestAllFromFloat32 should take about 1-2 minutes to run on amd64, other platforms may take longer...\n") //const wantBlake2b = "3f310bc5608a087462d361644fe66feeb4c68145f6f18eb6f1439cd7914888b6df9e30ae5350dce0635162cc6a2f23b31b3e4353ca132a3c552bdbd58baa54e6" const wantSHA512 = "08670429a475164d6c4a080969e35231c77ef7069b430b5f38af22e013796b7818bbe8f5942a6ddf26de0e1dfc67d02243f483d85729ebc3762fc2948a5ca1f8" const batchSize uint32 = 16384 results := make([]uint16, batchSize) buf := new(bytes.Buffer) h := sha512.New() for i := uint64(0); i < uint64(0xFFFFFFFF); i += uint64(batchSize) { // fill results for j := uint32(0); j < batchSize; j++ { inF32 := math.Float32frombits(uint32(i) + uint32(j)) f16 := float16.Fromfloat32(inF32) results[j] = uint16(f16) checkPrecision(t, inF32, f16, i) checkFromNaN32ps(t, inF32, f16) } // convert results to []byte err := binary.Write(buf, binary.LittleEndian, results) if err != nil { panic(err) } // update hash with []byte of results h.Write(buf.Bytes()) buf.Reset() } // display hash digest in hex digest := h.Sum(nil) gotSHA512hex := hex.EncodeToString(digest) if gotSHA512hex != wantSHA512 { t.Errorf("gotSHA512hex = %s", gotSHA512hex) } } // Test all 65536 conversions from float16 to float32. // TestAllToFloat32 runs in under 1 second. 
func TestAllToFloat32(t *testing.T) { //const wantBlake2b = "078d8e3fac9480de1493f22c8f9bfc1eb2051537c536f00f621557d70eed1af057a487c3e252f6d593769f5288d5ab66d8e9cd1adba359838802944bdb731f4d" const wantSHA512 = "1a4ccec9fd7b6e83310c6b4958a25778cd95f8d4f88b19950e4b8d6932a955f7fbd96b1c9bd9b2a79c3a9d34d653f55e671f8f86e6a5a876660cd38479001aa6" const batchSize uint32 = 16384 results := make([]float32, batchSize) buf := new(bytes.Buffer) h := sha512.New() for i := uint64(0); i < uint64(0xFFFF); i += uint64(batchSize) { // fill results for j := uint32(0); j < batchSize; j++ { inU16 := uint16((uint16(i) + uint16(j))) f16 := float16.Float16(inU16) results[j] = f16.Float32() } // convert results to []byte err := binary.Write(buf, binary.LittleEndian, results) if err != nil { panic(err) } // update hash with []byte of results h.Write(buf.Bytes()) buf.Reset() } // display hash digest in hex digest := h.Sum(nil) gotSHA512hex := hex.EncodeToString(digest) if gotSHA512hex != wantSHA512 { t.Errorf("Float16toFloat32: gotSHA512hex = %s", gotSHA512hex) } } func TestFrombits(t *testing.T) { x := uint16(0x1234) f16 := float16.Frombits(x) if uint16(f16) != f16.Bits() || uint16(f16) != x { t.Errorf("float16.Frombits(0x7fff) returned %04x, wanted %04x", uint16(f16), x) } } func TestNaN(t *testing.T) { nan := float16.NaN() if !nan.IsNaN() { t.Errorf("nan.IsNaN() returned false, wanted true") } } func TestInf(t *testing.T) { posInf := float16.Inf(0) if uint16(posInf) != 0x7c00 { t.Errorf("float16.Inf(0) returned %04x, wanted %04x", uint16(posInf), 0x7c00) } posInf = float16.Inf(1) if uint16(posInf) != 0x7c00 { t.Errorf("float16.Inf(1) returned %04x, wanted %04x", uint16(posInf), 0x7c00) } negInf := float16.Inf(-1) if uint16(negInf) != 0xfc00 { t.Errorf("float16.Inf(-1) returned %04x, wanted %04x", uint16(negInf), 0xfc00) } } func TestBits(t *testing.T) { x := uint16(0x1234) f16 := float16.Frombits(x) if uint16(f16) != f16.Bits() || f16.Bits() != x { t.Errorf("Bits() returned %04x, wanted 
%04x", uint16(f16), x) } } func TestIsFinite(t *testing.T) { // IsFinite returns true if f is neither infinite nor NaN. finite := float16.Fromfloat32(float32(1.5)) if !finite.IsFinite() { t.Errorf("finite.Infinite() returned false, wanted true") } posInf := float16.Inf(0) if posInf.IsFinite() { t.Errorf("posInf.Infinite() returned true, wanted false") } negInf := float16.Inf(-1) if negInf.IsFinite() { t.Errorf("negInf.Infinite() returned true, wanted false") } nan := float16.NaN() if nan.IsFinite() { t.Errorf("nan.Infinite() returned true, wanted false") } } func TestIsNaN(t *testing.T) { f16 := float16.Float16(0) if f16.IsNaN() { t.Errorf("Float16(0).IsNaN() returned true, wanted false") } f16 = float16.Float16(0x7e00) if !f16.IsNaN() { t.Errorf("Float16(0x7e00).IsNaN() returned false, wanted true") } } func TestIsQuietNaN(t *testing.T) { f16 := float16.Float16(0) if f16.IsQuietNaN() { t.Errorf("Float16(0).IsQuietNaN() returned true, wanted false") } f16 = float16.Float16(0x7e00) if !f16.IsQuietNaN() { t.Errorf("Float16(0x7e00).IsQuietNaN() returned false, wanted true") } f16 = float16.Float16(0x7e00 ^ 0x0200) if f16.IsQuietNaN() { t.Errorf("Float16(0x7e00 ^ 0x0200).IsQuietNaN() returned true, wanted false") } } func TestIsNormal(t *testing.T) { // IsNormal returns true if f is neither zero, infinite, subnormal, or NaN. 
zero := float16.Frombits(0) if zero.IsNormal() { t.Errorf("zero.IsNormal() returned true, wanted false") } posInf := float16.Inf(0) if posInf.IsNormal() { t.Errorf("posInf.IsNormal() returned true, wanted false") } negInf := float16.Inf(-1) if negInf.IsNormal() { t.Errorf("negInf.IsNormal() returned true, wanted false") } nan := float16.NaN() if nan.IsNormal() { t.Errorf("nan.IsNormal() returned true, wanted false") } subnormal := float16.Frombits(0x0001) if subnormal.IsNormal() { t.Errorf("subnormal.IsNormal() returned true, wanted false") } normal := float16.Fromfloat32(float32(1.5)) if !normal.IsNormal() { t.Errorf("normal.IsNormal() returned false, wanted true") } } func TestSignbit(t *testing.T) { f16 := float16.Fromfloat32(float32(0.0)) if f16.Signbit() { t.Errorf("float16.Fromfloat32(float32(0)).Signbit() returned true, wanted false") } f16 = float16.Fromfloat32(float32(2.0)) if f16.Signbit() { t.Errorf("float16.Fromfloat32(float32(2)).Signbit() returned true, wanted false") } f16 = float16.Fromfloat32(float32(-2.0)) if !f16.Signbit() { t.Errorf("float16.Fromfloat32(float32(-2)).Signbit() returned false, wanted true") } } func TestString(t *testing.T) { f16 := float16.Fromfloat32(1.5) s := f16.String() if s != "1.5" { t.Errorf("Float16(1.5).String() returned %s, wanted 1.5", s) } f16 = float16.Fromfloat32(3.141593) s = f16.String() if s != "3.140625" { t.Errorf("Float16(3.141593).String() returned %s, wanted 3.140625", s) } } func TestIsInf(t *testing.T) { f16 := float16.Float16(0) if f16.IsInf(0) { t.Errorf("Float16(0).IsInf(0) returned true, wanted false") } f16 = float16.Float16(0x7c00) if !f16.IsInf(0) { t.Errorf("Float16(0x7c00).IsInf(0) returned false, wanted true") } f16 = float16.Float16(0x7c00) if !f16.IsInf(1) { t.Errorf("Float16(0x7c00).IsInf(1) returned false, wanted true") } f16 = float16.Float16(0x7c00) if f16.IsInf(-1) { t.Errorf("Float16(0x7c00).IsInf(-1) returned true, wanted false") } f16 = float16.Float16(0xfc00) if !f16.IsInf(0) { 
t.Errorf("Float16(0xfc00).IsInf(0) returned false, wanted true") } f16 = float16.Float16(0xfc00) if f16.IsInf(1) { t.Errorf("Float16(0xfc00).IsInf(1) returned true, wanted false") } f16 = float16.Float16(0xfc00) if !f16.IsInf(-1) { t.Errorf("Float16(0xfc00).IsInf(-1) returned false, wanted true") } } func float32parts(f32 float32) (exp int32, coef uint32, dropped uint32) { const COEFMASK uint32 = 0x7fffff // 23 least significant bits const EXPSHIFT uint32 = 23 const EXPBIAS uint32 = 127 const EXPMASK uint32 = uint32(0xff) << EXPSHIFT const DROPMASK uint32 = COEFMASK >> 10 u32 := math.Float32bits(f32) exp = int32(((u32 & EXPMASK) >> EXPSHIFT) - EXPBIAS) coef = u32 & COEFMASK dropped = coef & DROPMASK return exp, coef, dropped } func isNaN32(f32 float32) bool { exp, coef, _ := float32parts(f32) return (exp == 128) && (coef != 0) } func isQuietNaN32(f32 float32) bool { exp, coef, _ := float32parts(f32) return (exp == 128) && (coef != 0) && ((coef & 0x00400000) != 0) } func checkFromNaN32ps(t *testing.T, f32 float32, f16 float16.Float16) { if !isNaN32(f32) { return } u32 := math.Float32bits(f32) nan16, err := float16.FromNaN32ps(f32) if isQuietNaN32(f32) { // result should be the same if err != nil { t.Errorf("FromNaN32ps: qnan = 0x%08x (%f) wanted err = nil, got err = %q", u32, f32, err) } if uint16(nan16) != uint16(f16) { t.Errorf("FromNaN32ps: qnan = 0x%08x (%f) wanted nan16 = %v, got nan16 = %v", u32, f32, f16, nan16) } } else { // result should differ only by the signaling/quiet bit unless payload is empty if err != nil { t.Errorf("FromNaN32ps: snan = 0x%08x (%f) wanted err = nil, got err = %q", u32, f32, err) } coef := uint16(f16) & uint16(0x03ff) payload := uint16(f16) & uint16(0x01ff) diff := uint16(nan16 ^ f16) if payload == 0 { // the lowest bit needed to be set to prevent turning sNaN into infinity, so 2 bits differ if diff != 0x0201 { t.Errorf("FromNaN32ps: snan = 0x%08x (%f) wanted diff == 0x0201, got 0x%04x", u32, f32, diff) } } else { // only the quiet 
bit was restored, so 1 bit differs if diff != 0x0200 { t.Errorf("FromNaN32ps: snan = 0x%08x (%f) wanted diff == 0x0200, got 0x%04x. f16=0x%04x n16=0x%04x coef=0x%04x", u32, f32, diff, uint16(f16), uint16(nan16), coef) } } } } func checkPrecision(t *testing.T, f32 float32, f16 float16.Float16, i uint64) { //TODO rewrite this test when time allows u32 := math.Float32bits(f32) u16 := f16.Bits() f32bis := f16.Float32() u32bis := math.Float32bits(f32bis) pre := float16.PrecisionFromfloat32(f32) roundtripped := u32 == u32bis exp32, coef32, dropped32 := float32parts(f32) if roundtripped { checkRoundTrippedPrecision(t, u32, u16, u32bis, exp32, coef32, dropped32) return } if pre == float16.PrecisionExact { // this should only happen if both input and output are NaN if !(f16.IsNaN() && isNaN32(f32)) { t.Errorf("i=%d, PrecisionFromfloat32 in f32bits=0x%08x (%f), out f16bits=0x%04x, back=0x%08x (%f), got PrecisionExact when roundtrip failed with non-special value", i, u32, f32, u16, u32bis, f32bis) } } else if pre == float16.PrecisionUnknown { if exp32 < -24 { t.Errorf("i=%d, PrecisionFromfloat32 in f32bits=0x%08x (%f), out f16bits=0x%04x, back=0x%08x (%f), got PrecisionUnknown, wanted PrecisionUnderflow", i, u32, f32, u16, u32bis, f32bis) } if dropped32 != 0 { t.Errorf("i=%d, PrecisionFromfloat32 in f32bits=0x%08x (%f), out f16bits=0x%04x, back=0x%08x (%f), got PrecisionUnknown, wanted PrecisionInexact", i, u32, f32, u16, u32bis, f32bis) } } else if pre == float16.PrecisionInexact { checkPrecisionInexact(t, u32, u16, u32bis, exp32, coef32, dropped32) } else if pre == float16.PrecisionUnderflow { if exp32 >= -14 { t.Errorf("i=%d, PrecisionFromfloat32 in f32bits=0x%08x (%f), out f16bits=0x%04x, back=0x%08x (%f), got PrecisionUnderflow when exp32 is >= -14", i, u32, f32, u16, u32bis, f32bis) } } else if pre == float16.PrecisionOverflow { if exp32 <= 15 { t.Errorf("i=%d, PrecisionFromfloat32 in f32bits=0x%08x (%f), out f16bits=0x%04x, back=0x%08x (%f), got PrecisionOverflow when 
exp32 is <= 15", i, u32, f32, u16, u32bis, f32bis) } } } func checkPrecisionInexact(t *testing.T, u32 uint32, u16 uint16, u32bis uint32, exp32 int32, coef32 uint32, dropped32 uint32) { f32 := math.Float32frombits(u32) f32bis := math.Float32frombits(u32bis) if exp32 < -24 { t.Errorf("PrecisionFromfloat32 in f32bits=0x%08x (%f), out f16bits=0x%04x, back=0x%08x (%f), got PrecisionInexact, wanted PrecisionUnderflow", u32, f32, u16, u32bis, f32bis) } if exp32 > 15 { t.Errorf("PrecisionFromfloat32 in f32bits=0x%08x (%f), out f16bits=0x%04x, back=0x%08x (%f), got PrecisionInexact, wanted PrecisionOverflow", u32, f32, u16, u32bis, f32bis) } if coef32 == 0 { t.Errorf("PrecisionFromfloat32 in f32bits=0x%08x (%f), out f16bits=0x%04x, back=0x%08x (%f), got PrecisionInexact when coef32 is 0", u32, f32, u16, u32bis, f32bis) } if dropped32 == 0 { t.Errorf("PrecisionFromfloat32 in f32bits=0x%08x (%f), out f16bits=0x%04x, back=0x%08x (%f), got PrecisionInexact when dropped32 is 0", u32, f32, u16, u32bis, f32bis) } } func checkRoundTrippedPrecision(t *testing.T, u32 uint32, u16 uint16, u32bis uint32, exp32 int32, coef32 uint32, dropped32 uint32) { f32 := math.Float32frombits(u32) f32bis := math.Float32frombits(u32bis) pre := float16.PrecisionFromfloat32(f32) f16 := float16.Frombits(u16) if dropped32 != 0 { t.Errorf("PrecisionFromfloat32 in f32bits=0x%08x (%f), out f16bits=0x%04x, back=0x%08x (%f), dropped32 != 0 with successful roundtrip", u32, f32, u16, u32bis, f32bis) } if pre != float16.PrecisionExact { // there are 2046 values that are subnormal and can round-trip float32->float16->float32 if pre != float16.PrecisionUnknown { t.Errorf("PrecisionFromfloat32 in f32bits=0x%08x (%032b) (%f), out f16bits=0x%04x (%v), back=0x%08x (%f), got %v, wanted PrecisionExact, exp=%d, coef=%d, drpd=%d", u32, u32, f32, u16, f16, u32bis, f32bis, pre, exp32, coef32, dropped32) } } } 
golang-github-x448-float16-0.8.4/go.mod000066400000000000000000000000501414201604700173310ustar00rootroot00000000000000module github.com/x448/float16 go 1.11