pax_global_header00006660000000000000000000000064135336242420014516gustar00rootroot0000000000000052 comment=50ea049a9000a9d51e09929ee39572ff2ba68b59 vecf32-0.9.0/000077500000000000000000000000001353362424200126145ustar00rootroot00000000000000vecf32-0.9.0/.gitignore000066400000000000000000000004331353362424200146040ustar00rootroot00000000000000# Compiled Object files, Static and Dynamic libs (Shared Objects) *.o *.a *.so # Folders _obj _test # Architecture specific extensions/prefixes *.[568vq] [568vq].out *.cgo1.go *.cgo2.c _cgo_defun.c _cgo_gotypes.go _cgo_export.* _testmain.go *.exe *.test *.prof vendor vendor/* vecf32-0.9.0/.travis.yml000066400000000000000000000005141353362424200147250ustar00rootroot00000000000000sudo: false language: go branches: only: - master go: - 1.6.x - 1.7.x - 1.8.x - 1.9.x - 1.10.x - tip env: global: - GOARCH=amd64 - TRAVISTEST=true go_import_path: gorgonia.org/vecf32 before_install: - go get github.com/mattn/goveralls script: - ./test.sh matrix: allow_failures: - go: tipvecf32-0.9.0/CONTRIBUTORS.md000066400000000000000000000004151353362424200150730ustar00rootroot00000000000000# Contributors # The list of contributors to this library is listed in alphabetical order, first by name, failing which (i.e. if a name was not provided), the Github username will be used. * Austin Clements (@aclements) * Naseer Dari (@ndari) * Xuanyi Chew (@chewxy)vecf32-0.9.0/Gopkg.lock000066400000000000000000000015761353362424200145460ustar00rootroot00000000000000# This file is autogenerated, do not edit; changes may be undone by the next 'dep ensure'. [[projects]] name = "github.com/chewxy/math32" packages = ["."] revision = "d1e7b22839c693f54edf7811dd9487623abf2cd2" version = "v1.0.0" [[projects]] name = "github.com/davecgh/go-spew" packages = ["spew"] revision = "346938d642f2ec3594ed81d874461961cd0faa76" version = "v1.1.0" [[projects]] name = "github.com/pmezard/go-difflib" packages = ["difflib"] revision = "792786c7400a136282c1664665ae0a8db921c6c2" version = "v1.0.0" [[projects]] name = "github.com/stretchr/testify" packages = ["assert"] revision = "69483b4bd14f5845b5a1e55bca19e954e827f1d0" version = "v1.1.4" [solve-meta] analyzer-name = "dep" analyzer-version = 1 inputs-digest = "efdc54437fbcbccc06963a55dfe8a20ed1debec74f99044bbdeee793399ab451" solver-name = "gps-cdcl" solver-version = 1 vecf32-0.9.0/Gopkg.toml000066400000000000000000000002221353362424200145540ustar00rootroot00000000000000[[constraint]] name = "github.com/chewxy/math32" version = "1.0.0" [[constraint]] name = "github.com/stretchr/testify" version = "1.1.4" vecf32-0.9.0/LICENSE000066400000000000000000000020471353362424200136240ustar00rootroot00000000000000MIT License Copyright (c) 2017 Chewxy Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. vecf32-0.9.0/README.md000066400000000000000000000044511353362424200140770ustar00rootroot00000000000000# vecf32 [![GoDoc](https://godoc.org/gorgonia.org/vecf32?status.svg)](https://godoc.org/gorgonia.org/vecf32) [![Build Status](https://travis-ci.org/gorgonia/vecf32.svg?branch=master)](https://travis-ci.org/gorgonia/vecf32) [![Coverage Status](https://coveralls.io/repos/github/gorgonia/vecf32/badge.svg?branch=master)](https://coveralls.io/github/gorgonia/vecf32?branch=master) Package vecf32 provides common functions and methods for slices of float32 # Installation `go get -u gorgonia.org/vecf32` ## Dependencies This package [math32](https://github.com/chewxy/math32). For testing this package uses [testify/assert](https://github.com/stretchr/testify), which is licenced with a [MIT/BSD-like licence](https://github.com/stretchr/testify/blob/master/LICENSE) # Build Tags The point of this package is to provide operations that are accelerated by SIMD. However, this pakcage by default does not use SIMD. To use SIMD, build tags must be used. The supported build tags are `sse` and `avx`. Here's an example on how to use them: * [SSE](https://en.wikipedia.org/wiki/Streaming_SIMD_Extensions) - `go build -tags='sse' ... * [AVX](https://en.wikipedia.org/wiki/Advanced_Vector_Extensions) - `go build -tags='avx' ... # FAQ ### Why are there so many `b = b[:len(a)]` lines? This is mainly done to eliminate bounds checking in a loop. The idea is the bounds of the slice is checked early on, and if need be, panics early. Then if everything is normal, there won't be bounds checking while in the loop. This also means that `b` must be at least `len(a)`, otherwise a panic will occur. To check for boundschecking and bounds check elimination (an amazing feature that landed in Go 1.7), compile your programs with `-gcflags='-d=ssa/check_bce/debug=1'`. # Contributing Contributions are welcome. The typical process works like this: 1. File an issue on the topic you want to contribute 2. Fork this repo 3. Add your contribution 4. Make a pull request 5. The pull request will be merged once tests pass, and code reviewed. 6. Add your name (if it hasn't already been added to CONTRIBUTORS.md) ## Pull Requests This package is very well tested. Please ensure tests are written if any new features are added. If bugs are fixed, please add the bugs to the tests as well. # Licence Package vecf32 is licenced under the MIT licence. vecf32-0.9.0/arith.go000066400000000000000000000075031353362424200142570ustar00rootroot00000000000000package vecf32 import "github.com/chewxy/math32" // Pow performs elementwise // a̅ ^ b̅ func Pow(a, b []float32) { b = b[:len(a)] for i, v := range a { switch b[i] { case 0: a[i] = float32(1) case 1: a[i] = v case 2: a[i] = v * v case 3: a[i] = v * v * v default: a[i] = math32.Pow(v, b[i]) } } } func Mod(a, b []float32) { b = b[:len(a)] for i, v := range a { a[i] = math32.Mod(v, b[i]) } } // Scale multiplies all values in the slice by the scalar. It performs elementwise // a̅ * s func Scale(a []float32, s float32) { for i, v := range a { a[i] = v * s } } // ScaleInv divides all values in the slice by the scalar. 
It performs elementwise // a̅ / s func ScaleInv(a []float32, s float32) { Scale(a, 1/s) } /// ScaleInvR divides all numbers in the slice by a scalar // s / a̅ func ScaleInvR(a []float32, s float32) { for i, v := range a { a[i] = s / v } } // Trans adds all the values in the slice by a scalar // a̅ + s func Trans(a []float32, s float32) { for i, v := range a { a[i] = v + s } } // TransInv subtracts all the values in the slice by a scalar // a̅ - s func TransInv(a []float32, s float32) { Trans(a, -s) } // TransInvR subtracts all the numbers in a slice from a scalar // s - a̅ func TransInvR(a []float32, s float32) { for i, v := range a { a[i] = s - v } } // PowOf performs elementwise // a̅ ^ s func PowOf(a []float32, s float32) { for i, v := range a { a[i] = math32.Pow(v, s) } } // PowOfR performs elementwise // s ^ a̅ func PowOfR(a []float32, s float32) { for i, v := range a { a[i] = math32.Pow(s, v) } } // Max takes two slices, a̅ + b̅, and compares them elementwise. The highest value is put into a̅. func Max(a, b []float32) { b = b[:len(a)] for i, v := range a { bv := b[i] if bv > v { a[i] = bv } } } // Min takes two slices, a̅ + b̅ and compares them elementwise. The lowest value is put into a̅. func Min(a, b []float32) { b = b[:len(a)] for i, v := range a { bv := b[i] if bv < v { a[i] = bv } } } /* REDUCTION RELATED */ // Sum sums a slice of float32 and returns a float32 func Sum(a []float32) float32 { return Reduce(add, float32(0), a...) } // MaxOf finds the max of a []float32. it panics if the slice is empty func MaxOf(a []float32) (retVal float32) { if len(a) < 1 { panic("Cannot find the max of an empty slice") } return Reduce(max, a[0], a[1:]...) } // MinOf finds the max of a []float32. it panics if the slice is empty func MinOf(a []float32) (retVal float32) { if len(a) < 1 { panic("Cannot find the min of an empty slice") } return Reduce(min, a[0], a[1:]...) } // Argmax returns the index of the min in a slice func Argmax(a []float32) int { var f float32 var max int var set bool for i, v := range a { if !set { f = v max = i set = true continue } // TODO: Maybe error instead of this? if math32.IsNaN(v) || math32.IsInf(v, 1) { max = i f = v break } if v > f { max = i f = v } } return max } // Argmin returns the index of the min in a slice func Argmin(a []float32) int { var f float32 var min int var set bool for i, v := range a { if !set { f = v min = i set = true continue } // TODO: Maybe error instead of this? 
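// Note: a NaN or -Inf value encountered mid-scan is treated as the minimum on the spot; its index is recorded and the loop stops early.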
if math32.IsNaN(v) || math32.IsInf(v, -1) { min = i f = v break } if v < f { min = i f = v } } return min } /* FUNCTION VARIABLES */ var ( add = func(a, b float32) float32 { return a + b } // sub = func(a, b float32) float32 { return a - b } // mul = func(a, b float32) float32 { return a * b } // div = func(a, b float32) float32 { return a / b } // mod = func(a, b float32) float32 { return math32.Mod(a, b) } min = func(a, b float32) float32 { if a < b { return a } return b } max = func(a, b float32) float32 { if a > b { return a } return b } ) vecf32-0.9.0/arith_bench_test.go000066400000000000000000000044451353362424200164570ustar00rootroot00000000000000// +build !sse,!avx package vecf32 import ( "testing" "github.com/chewxy/math32" ) /* BENCHMARKS */ func _vanillaVecAdd(a, b []float32) { for i := range a { a[i] += b[i] } } func BenchmarkVecAdd(b *testing.B) { x := Range(0, niceprime) y := Range(niceprime, 2*niceprime) for n := 0; n < b.N; n++ { Add(x, y) } } func BenchmarkVanillaVecAdd(b *testing.B) { x := Range(0, niceprime) y := Range(niceprime, 2*niceprime) for n := 0; n < b.N; n++ { _vanillaVecAdd(x, y) } } func _vanillaVecSub(a, b []float32) { for i := range a { a[i] -= b[i] } } func BenchmarkVecSub(b *testing.B) { x := Range(0, niceprime) y := Range(niceprime, 2*niceprime) for n := 0; n < b.N; n++ { Sub(x, y) } } func BenchmarkVanillaVecSub(b *testing.B) { x := Range(0, niceprime) y := Range(niceprime, 2*niceprime) for n := 0; n < b.N; n++ { _vanillaVecSub(x, y) } } func _vanillaVecMul(a, b []float32) { for i := range a { a[i] *= b[i] } } func BenchmarkVecMul(b *testing.B) { x := Range(0, niceprime) y := Range(niceprime, 2*niceprime) for n := 0; n < b.N; n++ { Mul(x, y) } } func BenchmarkVanillaVecMul(b *testing.B) { x := Range(0, niceprime) y := Range(niceprime, 2*niceprime) for n := 0; n < b.N; n++ { _vanillaVecMul(x, y) } } func _vanillaVecDiv(a, b []float32) { for i := range a { a[i] /= b[i] } } func BenchmarkVecDiv(b *testing.B) { x := Range(0, niceprime) y := Range(niceprime, 2*niceprime) for n := 0; n < b.N; n++ { Div(x, y) } } func BenchmarkVanillaVecDiv(b *testing.B) { x := Range(0, niceprime) y := Range(niceprime, 2*niceprime) for n := 0; n < b.N; n++ { _vanillaVecDiv(x, y) } } func _vanillaVecSqrt(a []float32) { for i, v := range a { a[i] = math32.Sqrt(v) } } func BenchmarkVecSqrt(b *testing.B) { x := Range(0, niceprime) for n := 0; n < b.N; n++ { Sqrt(x) } } func BenchmarkVanillaVecSqrt(b *testing.B) { x := Range(0, niceprime) for n := 0; n < b.N; n++ { _vanillaVecSqrt(x) } } func _vanillaVecInverseSqrt(a []float32) { for i, v := range a { a[i] = 1.0 / math32.Sqrt(v) } } func BenchmarkVecInvSqrt(b *testing.B) { x := Range(0, niceprime) for n := 0; n < b.N; n++ { InvSqrt(x) } } func BenchmarkVanillaVecInvSqrt(b *testing.B) { x := Range(0, niceprime) for n := 0; n < b.N; n++ { _vanillaVecInverseSqrt(x) } } vecf32-0.9.0/arith_test.go000066400000000000000000000156371353362424200153250ustar00rootroot00000000000000package vecf32 import ( "testing" "unsafe" "github.com/chewxy/math32" "github.com/stretchr/testify/assert" ) // 1049 is actually a prime, so it cannot be divisible by any other number // This is a good way to test that the remainder part of the Add/Sub/Mul/Div/Pow works const ( // niceprime = 37 // niceprime = 1049 niceprime = 597929 // niceprime = 1299827 // because sometimes I feel like being an idiot ) func TestAdd(t *testing.T) { assert := assert.New(t) a := Range(0, niceprime-1) correct := Range(0, niceprime-1) for i := range correct { correct[i] = correct[i] + 
correct[i] } Add(a, a) assert.Equal(correct, a) b := Range(niceprime, 2*niceprime-1) for i := range correct { correct[i] = a[i] + b[i] } Add(a, b) assert.Equal(correct, a) /* Weird Corner Cases*/ for i := 1; i < 65; i++ { a = Range(0, i) var testAlign bool addr := &a[0] u := uint(uintptr(unsafe.Pointer(addr))) if u&uint(32) != 0 { testAlign = true } if testAlign { b = Range(i, 2*i) correct = make([]float32, i) for j := range correct { correct[j] = b[j] + a[j] } Add(a, b) assert.Equal(correct, a) } } } func TestSub(t *testing.T) { assert := assert.New(t) a := Range(0, niceprime-1) correct := make([]float32, niceprime-1) Sub(a, a) assert.Equal(correct, a) b := Range(niceprime, 2*niceprime-1) for i := range correct { correct[i] = a[i] - b[i] } Sub(a, b) assert.Equal(correct, a) /* Weird Corner Cases*/ for i := 1; i < 65; i++ { a = Range(0, i) var testAlign bool addr := &a[0] u := uint(uintptr(unsafe.Pointer(addr))) if u&uint(32) != 0 { testAlign = true } if testAlign { b = Range(i, 2*i) correct = make([]float32, i) for j := range correct { correct[j] = a[j] - b[j] } Sub(a, b) assert.Equal(correct, a) } } } func TestMul(t *testing.T) { assert := assert.New(t) a := Range(0, niceprime-1) correct := Range(0, niceprime-1) for i := range correct { correct[i] = correct[i] * correct[i] } Mul(a, a) assert.Equal(correct, a) b := Range(niceprime, 2*niceprime-1) for i := range correct { correct[i] = a[i] * b[i] } Mul(a, b) assert.Equal(correct, a) /* Weird Corner Cases*/ for i := 1; i < 65; i++ { a = Range(0, i) var testAlign bool addr := &a[0] u := uint(uintptr(unsafe.Pointer(addr))) if u&uint(32) != 0 { testAlign = true } if testAlign { b = Range(i, 2*i) correct = make([]float32, i) for j := range correct { correct[j] = a[j] * b[j] } Mul(a, b) assert.Equal(correct, a) } } } func TestPow(t *testing.T) { a := []float32{0, 1, 2, 3, 4} b := []float32{0, 1, 2, 3, 4} correct := make([]float32, 5) for i := range correct { correct[i] = math32.Pow(a[i], b[i]) } Pow(a, b) assert.Equal(t, correct, a) } func TestScale(t *testing.T) { a := []float32{0, 1, 2, 3, 4} correct := make([]float32, 5) for i := range correct { correct[i] = a[i] * 5 } Scale(a, 5) assert.Equal(t, correct, a) } func TestScaleInv(t *testing.T) { a := []float32{0, 1, 2, 4, 6} correct := make([]float32, len(a)) for i := range correct { correct[i] = a[i] / 2 } ScaleInv(a, 2) assert.Equal(t, correct, a) } func TestScaleInvR(t *testing.T) { a := []float32{0, 1, 2, 4, 6} correct := make([]float32, len(a)) for i := range correct { correct[i] = 2 / a[i] } ScaleInvR(a, 2) assert.Equal(t, correct, a) } func TestTrans(t *testing.T) { a := []float32{1, 2, 3, 4} correct := make([]float32, 4) for i := range correct { correct[i] = a[i] + float32(1) } Trans(a, 1) assert.Equal(t, correct, a) } func TestTransInv(t *testing.T) { a := []float32{1, 2, 3, 4} correct := make([]float32, 4) for i := range correct { correct[i] = a[i] - float32(1) } TransInv(a, 1) assert.Equal(t, correct, a) } func TestTransInvR(t *testing.T) { a := []float32{1, 2, 3, 4} correct := make([]float32, len(a)) for i := range correct { correct[i] = float32(1) - a[i] } TransInvR(a, 1) assert.Equal(t, correct, a) } func TestPowOf(t *testing.T) { a := []float32{1, 2, 3, 4} correct := make([]float32, len(a)) for i := range correct { correct[i] = math32.Pow(a[i], 5) } PowOf(a, 5) assert.Equal(t, correct, a) } func TestPowOfR(t *testing.T) { a := []float32{1, 2, 3, 4} correct := make([]float32, len(a)) for i := range correct { correct[i] = math32.Pow(5, a[i]) } PowOfR(a, 5) assert.Equal(t, correct, 
a) } func TestMax(t *testing.T) { a := []float32{0, 1, 2, 3, 4} b := []float32{5, 4, 2, 2, 1} correct := []float32{5, 4, 2, 3, 4} Max(a, b) assert.Equal(t, correct, a) b = []float32{2} f := func() { Max(a, b) } assert.Panics(t, f) } func TestMin(t *testing.T) { a := []float32{0, 1, 2, 3, 4} b := []float32{5, 4, 2, 2, 1} correct := []float32{0, 1, 2, 2, 1} Min(a, b) assert.Equal(t, correct, a) b = []float32{2} f := func() { Min(a, b) } assert.Panics(t, f) } func TestSum(t *testing.T) { a := []float32{0, 1, 2, 3, 4} correct := float32(10) got := Sum(a) if correct != got { t.Errorf("Expected %f. Got %v instead", correct, got) } } func TestMaxOf(t *testing.T) { a := []float32{0, 1, 2, 1, 0} correct := float32(2) got := MaxOf(a) if got != correct { t.Errorf("Expected %f. Got %v instead", correct, got) } a = []float32{} f := func() { MaxOf(a) } assert.Panics(t, f, "Expected panic when empty slice passed into MaxOf") } func TestMinOf(t *testing.T) { a := []float32{0, 1, 2, 1, 0} correct := float32(0) got := MinOf(a) if got != correct { t.Errorf("Expected %f. Got %v instead", correct, got) } a = []float32{} f := func() { MinOf(a) } assert.Panics(t, f, "Expected panic when empty slice passed into MinOf") } func TestArgmax(t *testing.T) { a := []float32{0, 1, 2, 34, 5} correct := 3 got := Argmax(a) if got != correct { t.Errorf("Expected argmax to be %v. Got %v instead", correct, got) } a = []float32{math32.Inf(-1), 2, 3, 4} correct = 3 got = Argmax(a) if got != correct { t.Errorf("Expected argmax to be %v. Got %v instead", correct, got) } a = []float32{math32.Inf(1), 2, 3, 4} correct = 0 got = Argmax(a) if got != correct { t.Errorf("Expected argmax to be %v. Got %v instead", correct, got) } a = []float32{1, math32.NaN(), 3, 4} correct = 1 got = Argmax(a) if got != correct { t.Errorf("Expected argmax to be %v. Got %v instead", correct, got) } } func TestArgmin(t *testing.T) { a := []float32{0, 1, 2, -34, 5} correct := 3 got := Argmin(a) if got != correct { t.Errorf("Expected argmin to be %v. Got %v instead", correct, got) } a = []float32{math32.Inf(-1), 2, 3, 4} correct = 0 got = Argmin(a) if got != correct { t.Errorf("Expected argmin to be %v. Got %v instead", correct, got) } a = []float32{math32.Inf(1), 2, 3, 4} correct = 1 got = Argmin(a) if got != correct { t.Errorf("Expected argmin to be %v. Got %v instead", correct, got) } a = []float32{1, math32.NaN(), 3, 4} correct = 1 got = Argmin(a) if got != correct { t.Errorf("Expected argmin to be %v. Got %v instead", correct, got) } } vecf32-0.9.0/asm.go000066400000000000000000000023201353362424200137200ustar00rootroot00000000000000// +build sse avx package vecf32 // Add performs a̅ + b̅. a̅ will be clobbered func Add(a, b []float32) { if len(a) != len(b) { panic("vectors must be the same length") } addAsm(a, b) } func addAsm(a, b []float32) // Sub performs a̅ - b̅. a̅ will be clobbered func Sub(a, b []float32) { if len(a) != len(b) { panic("vectors must be the same length") } subAsm(a, b) } func subAsm(a, b []float32) // Mul performs a̅ × b̅. a̅ will be clobbered func Mul(a, b []float32) { if len(a) != len(b) { panic("vectors must be the same length") } mulAsm(a, b) } func mulAsm(a, b []float32) // Div performs a̅ ÷ b̅. a̅ will be clobbered func Div(a, b []float32) { if len(a) != len(b) { panic("vectors must be the same length") } divAsm(a, b) } func divAsm(a, b []float32) // Sqrt performs √a̅ elementwise. a̅ will be clobbered func Sqrt(a []float32) // InvSqrt performs 1/√a̅ elementwise. 
a̅ will be clobbered func InvSqrt(a []float32) /* func Pow(a, b []float32) */ /* func Scale(s float32, a []float32) func ScaleFrom(s float32, a []float32) func Trans(s float32, a []float32) func TransFrom(s float32, a []float32) func Power(s float32, a []float32) func PowerFrom(s float32, a []float32) */ vecf32-0.9.0/asm_test.go000066400000000000000000000116511353362424200147660ustar00rootroot00000000000000// +build sse avx package vecf32 /* IMPORTANT NOTE: Currently Div does not handle division by zero correctly. It returns a NaN instead of +Inf */ import ( "testing" "unsafe" "github.com/chewxy/math32" "github.com/stretchr/testify/assert" ) // this file is mainly added to facilitate testing of the ASM code, and that it matches up correctly with the expected results func TestDiv(t *testing.T) { assert := assert.New(t) a := Range(0, niceprime-1) correct := Range(0, niceprime-1) for i := range correct { correct[i] = correct[i] / correct[i] } Div(a, a) assert.Equal(correct[1:], a[1:]) assert.Equal(true, math32.IsNaN(a[0]), "a[0] is: %v", a[0]) b := Range(niceprime, 2*niceprime-1) for i := range correct { correct[i] = a[i] / b[i] } Div(a, b) assert.Equal(correct[1:], a[1:]) assert.Equal(true, math32.IsNaN(a[0]), "a[0] is: %v", a[0]) /* Weird Corner Cases*/ for i := 1; i < 65; i++ { a = Range(0, i) var testAlign bool addr := &a[0] u := uint(uintptr(unsafe.Pointer(addr))) if u&uint(32) != 0 { testAlign = true } if testAlign { b = Range(i, 2*i) correct = make([]float32, i) for j := range correct { correct[j] = a[j] / b[j] } Div(a, b) assert.Equal(correct[1:], a[1:]) } } } func TestSqrt(t *testing.T) { assert := assert.New(t) a := Range(0, niceprime-1) correct := Range(0, niceprime-1) for i, v := range correct { correct[i] = math32.Sqrt(v) } Sqrt(a) assert.Equal(correct, a) // negatives a = []float32{-1, -2, -3, -4} Sqrt(a) for _, v := range a { if !math32.IsNaN(v) { t.Error("Expected NaN") } } /* Weird Corner Cases*/ for i := 1; i < 65; i++ { a = Range(0, i) var testAlign bool addr := &a[0] u := uint(uintptr(unsafe.Pointer(addr))) if u&uint(32) != 0 { testAlign = true } if testAlign { correct = make([]float32, i) for j := range correct { correct[j] = math32.Sqrt(a[j]) } Sqrt(a) assert.Equal(correct, a) } } } func TestInvSqrt(t *testing.T) { assert := assert.New(t) a := Range(0, niceprime-1) correct := Range(0, niceprime-1) for i, v := range correct { correct[i] = 1.0 / math32.Sqrt(v) } InvSqrt(a) assert.Equal(correct[1:], a[1:]) if !math32.IsInf(a[0], 0) { t.Error("1/0 should be +Inf or -Inf") } // Weird Corner Cases for i := 1; i < 65; i++ { a = Range(0, i) var testAlign bool addr := &a[0] u := uint(uintptr(unsafe.Pointer(addr))) if u&uint(32) != 0 { testAlign = true } if testAlign { correct = make([]float32, i) for j := range correct { correct[j] = 1.0 / math32.Sqrt(a[j]) } InvSqrt(a) assert.Equal(correct[1:], a[1:], "i = %d, %v", i, Range(0, i)) if !math32.IsInf(a[0], 0) { t.Error("1/0 should be +Inf or -Inf") } } } } /* BENCHMARKS */ func _vanillaVecAdd(a, b []float32) { for i := range a { a[i] += b[i] } } func BenchmarkVecAdd(b *testing.B) { x := Range(0, niceprime) y := Range(niceprime, 2*niceprime) for n := 0; n < b.N; n++ { Add(x, y) } } func BenchmarkVanillaVecAdd(b *testing.B) { x := Range(0, niceprime) y := Range(niceprime, 2*niceprime) for n := 0; n < b.N; n++ { _vanillaVecAdd(x, y) } } func _vanillaVecSub(a, b []float32) { for i := range a { a[i] -= b[i] } } func BenchmarkVecSub(b *testing.B) { x := Range(0, niceprime) y := Range(niceprime, 2*niceprime) for n := 0; n < b.N; n++ { 
Sub(x, y) } } func BenchmarkVanillaVecSub(b *testing.B) { x := Range(0, niceprime) y := Range(niceprime, 2*niceprime) for n := 0; n < b.N; n++ { _vanillaVecSub(x, y) } } func _vanillaVecMul(a, b []float32) { for i := range a { a[i] *= b[i] } } func BenchmarkVecMul(b *testing.B) { x := Range(0, niceprime) y := Range(niceprime, 2*niceprime) for n := 0; n < b.N; n++ { Mul(x, y) } } func BenchmarkVanillaVecMul(b *testing.B) { x := Range(0, niceprime) y := Range(niceprime, 2*niceprime) for n := 0; n < b.N; n++ { _vanillaVecMul(x, y) } } func _vanillaVecDiv(a, b []float32) { for i := range a { a[i] /= b[i] } } func BenchmarkVecDiv(b *testing.B) { x := Range(0, niceprime) y := Range(niceprime, 2*niceprime) for n := 0; n < b.N; n++ { Div(x, y) } } func BenchmarkVanillaVecDiv(b *testing.B) { x := Range(0, niceprime) y := Range(niceprime, 2*niceprime) for n := 0; n < b.N; n++ { _vanillaVecDiv(x, y) } } func _vanillaVecSqrt(a []float32) { for i, v := range a { a[i] = math32.Sqrt(v) } } func BenchmarkVecSqrt(b *testing.B) { x := Range(0, niceprime) for n := 0; n < b.N; n++ { Sqrt(x) } } func BenchmarkVanillaVecSqrt(b *testing.B) { x := Range(0, niceprime) for n := 0; n < b.N; n++ { _vanillaVecSqrt(x) } } func _vanillaVecInverseSqrt(a []float32) { for i, v := range a { a[i] = 1.0 / math32.Sqrt(v) } } func BenchmarkVecInvSqrt(b *testing.B) { x := Range(0, niceprime) for n := 0; n < b.N; n++ { InvSqrt(x) } } func BenchmarkVanillaVecInvSqrt(b *testing.B) { x := Range(0, niceprime) for n := 0; n < b.N; n++ { _vanillaVecInverseSqrt(x) } } vecf32-0.9.0/asm_vecAdd_avx.s000066400000000000000000000065771353362424200157230ustar00rootroot00000000000000// +build avx // +build amd64 /* This function adds two []float32 with some SIMD optimizations using AVX. Instead of doing this: for i := 0; i < len(a); i++ { a[i] += b[i] } Here, I use the term "pairs" to denote an element of `a` and and element of `b` that will be added together. a[i], b[i] is a pair. Using AVX, we can simultaneously add 8 pairs at the same time, which will look something like this: for i := 0; i < len(a); i+=8{ a[i:i+8] += b[i:i+8] // this code won't run. } AVX registers are 256 bits, meaning we can put 8 float32s in there. These are the registers I use to store the relevant information: SI - Used to store the top element of slice A (index 0). This register is incremented every loop DI - used to store the top element of slice B. Incremented every loop AX - len(a) is stored in here. AX is also used as the "working" count of the length that is decremented. Y0, Y1 - YMM registers. X0, X1 - XMM registers. This pseudocode best explains the rather simple assembly: lenA := len(a) i := 0 loop: for { a[i:i+8*4] += b[i:i+8*4] lenA -= 8 i += 8 * 4 // 8 elements, 4 bytes each if lenA < 0{ break } } remainder4head: lenA += 8 if lenA == 0 { return } remainder4: for { a[i:i+4*4] += b[i:i+4*4] lenA -=4 i += 4 * 4 // 4 elements, 4 bytes each if lenA < 0{ break } } remainder1head: lenA += 4 if lenA == 0 { return } remainder1: for { a[i] += b[i] i+=4 // each element is 4 bytes lenA-- } return */ #include "textflag.h" // func addAsm(a, b []float32) TEXT ·addAsm(SB), NOSPLIT, $0 MOVQ a_data+0(FP), SI MOVQ b_data+24(FP), DI // use detination index register for this MOVQ a_len+8(FP), AX // len(a) into AX - +8, because first 8 is pointer, second 8 is length, third 8 is cap // each ymm register can take up to 8 float32s. 
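// For example, with len(a) == 37 the main loop below covers 32 elements (4 passes of 8 via a YMM register),
// remainder4 covers 4 more via an XMM register, and remainder1 handles the final element one float32 at a time.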
SUBQ $8, AX JL remainder loop: // a[0] to a[7] // VMOVUPS 0(%rsi), %ymm0 // VMOVUPS 0(%rdi), %ymm1 // VADDPS %ymm0, %ymm1, %ymm0 // VMOVUPS %ymm0, 0(%rsi) BYTE $0xc5; BYTE $0xfc; BYTE $0x10; BYTE $0x06 // vmovups (%rsi),%ymm0 BYTE $0xc5; BYTE $0xfc; BYTE $0x10; BYTE $0x0f // vmovups (%rdi),%ymm1 BYTE $0xc5; BYTE $0xf4; BYTE $0x58; BYTE $0xc0 // vaddps %ymm0,%ymm1,%ymm0 BYTE $0xc5; BYTE $0xfc; BYTE $0x11; BYTE $0x06 // vmovups %ymm0,(%rsi) ADDQ $32, SI ADDQ $32, DI SUBQ $8, AX JGE loop remainder: ADDQ $8, AX JE done SUBQ $4, AX JL remainder1head remainder4: // VMOVUPS (SI), X0 // VMOVUPS (DI), X1 // VADDPS X0, X1, X0 // VMOVUPS X0, (SI) BYTE $0xc5; BYTE $0xf8; BYTE $0x10; BYTE $0x06 // vmovss (%rsi),%xmm0 BYTE $0xc5; BYTE $0xf8; BYTE $0x10; BYTE $0x0f // vmovss (%rdi),%xmm1 BYTE $0xc5; BYTE $0xf0; BYTE $0x58; BYTE $0xc0 // vaddss %xmm0,%xmm1,%xmm0 BYTE $0xc5; BYTE $0xf8; BYTE $0x11; BYTE $0x06 // vmovss %xmm0,(%rsi) ADDQ $16, SI ADDQ $16, DI SUBQ $4, AX JGE remainder4 remainder1head: ADDQ $4, AX JE done remainder1: // copy into the appropriate registers // VMOVSS (SI), X0 // VMOVSS (DI), X1 // VADDSS X0, X1, X0 // VMOVSS X0, (SI) BYTE $0xc5; BYTE $0xfa; BYTE $0x10; BYTE $0x06 // vmovss (%rsi),%xmm0 BYTE $0xc5; BYTE $0xfa; BYTE $0x10; BYTE $0x0f // vmovss (%rdi),%xmm1 BYTE $0xc5; BYTE $0xf2; BYTE $0x58; BYTE $0xc0 // vaddss %xmm0,%xmm1,%xmm0 BYTE $0xc5; BYTE $0xfa; BYTE $0x11; BYTE $0x06 // vmovss %xmm0,(%rsi) // update pointer to the top of the data ADDQ $4, SI ADDQ $4, DI DECQ AX JNE remainder1 done: RET vecf32-0.9.0/asm_vecAdd_sse.s000066400000000000000000000017671353362424200157130ustar00rootroot00000000000000// +build sse // +build amd64 #include "textflag.h" // func addAsm(a, b []float32) TEXT ·addAsm(SB), NOSPLIT, $0 MOVQ a_data+0(FP), SI MOVQ b_data+24(FP), DI // use destination index register for this MOVQ a_len+8(FP), AX // len(a) into AX // check if there are at least 16 elements SUBQ $16, AX JL remainder loop: // a[0] MOVUPS (SI), X0 MOVUPS (DI), X1 ADDPS X0, X1 MOVUPS X1, (SI) MOVUPS 16(SI), X2 MOVUPS 16(DI), X3 ADDPS X2, X3 MOVUPS X3, 16(SI) MOVUPS 32(SI), X4 MOVUPS 32(DI), X5 ADDPS X4, X5 MOVUPS X5, 32(SI) MOVUPS 48(SI), X6 MOVUPS 48(DI), X7 ADDPS X6, X7 MOVUPS X7, 48(SI) // update pointers. 4 registers, 4 elements each, 4 bytes per element ADDQ $64, SI ADDQ $64, DI // len(a) is now 4*4 elements less SUBQ $16, AX JGE loop remainder: ADDQ $16, AX JE done remainderloop: MOVSS (SI), X0 MOVSS (DI), X1 ADDSS X0, X1 MOVSS X1, (SI) // update pointer to the top of the data ADDQ $4, SI ADDQ $4, DI DECQ AX JNE remainderloop done: RET vecf32-0.9.0/asm_vecDiv_avx.s000066400000000000000000000077071353362424200157510ustar00rootroot00000000000000// +build avx // +build amd64 /* This function adds two []float32 with some SIMD optimizations using AVX. Instead of doing this: for i := 0; i < len(a); i++ { a[i] /= b[i] } Here, I use the term "pairs" to denote an element of `a` and and element of `b` that will be added together. a[i], b[i] is a pair. Using AVX, we can simultaneously add 8 pairs at the same time, which will look something like this: for i := 0; i < len(a); i+=8{ a[i:i+8] /= b[i:i+8] // this code won't run. } AVX registers are 256 bits, meaning we can put 8 float32s in there. These are the registers I use to store the relevant information: SI - Used to store the top element of slice A (index 0). This register is incremented every loop DI - used to store the top element of slice B. Incremented every loop AX - len(a) is stored in here. 
AX is also used as the "working" count of the length that is decremented. Y0, Y1 - YMM registers. X0, X1 - XMM registers. With regards to VDIVPS and VSUBSS, it turns out that the description of these instructions are: VDIVPS ymm1, ymm2, ymm3: Subtract packed double-precision floating-point values in ymm3/mem from ymm2 and stores result in ymm1.[0] The description is written with intel's syntax (in this form: Dest, Src1, Src2). When converting to Go's ASM it becomes: (Src2, Src1, Dest) This pseudocode best explains the rather simple assembly: lenA := len(a) i := 0 loop: for { a[i:i+8*4] /= b[i:i+8*4] lenA -= 8 i += 8 * 4 // 8 elements, 4 bytes each if lenA < 0{ break } } remainder4head: lenA += 8 if lenA == 0 { return } remainder4: for { a[i:i+4*4] /= b[i:i+4*4] lenA -=4 i += 4 * 4 // 4 elements, 4 bytes each if lenA < 0{ break } } remainder1head: lenA += 4 if lenA == 0 { return } remainder1: for { a[i] /= b[i] i+=4 // each element is 4 bytes lenA-- } return Citation ======== [0]http://www.felixcloutier.com/x86/DIVPS.html */ #include "textflag.h" // func divAsm(a, b []float32) TEXT ·divAsm(SB), NOSPLIT, $0 MOVQ a_data+0(FP), SI MOVQ b_data+24(FP), DI // use destination index register for this MOVQ a_len+8(FP), AX // len(a) into AX - +8, because first 8 is pointer, second 8 is length, third 8 is cap SUBQ $8, AX JL remainder // each ymm register can take up to 4 float64s. // There are 8 ymm registers (8 pairs to do addition) available (TODO: check how to access the other 8 ymm registers without fucking things up) // Therefore a total of 16 elements can be processed at a time loop: // a[0] to a[7] // VMOVUPS (SI), Y0 // VMOVUPS (DI), Y1 // VDIVPS Y1, Y0, Y0 // VMOVUPS Y0, (SI) BYTE $0xc5; BYTE $0xfc; BYTE $0x10; BYTE $0x06 // vmovups (%rsi),%ymm0 BYTE $0xc5; BYTE $0xfc; BYTE $0x10; BYTE $0x0f // vmovups (%rdi),%ymm1 BYTE $0xc5; BYTE $0xfc; BYTE $0x5e; BYTE $0xc1 // vdivps %ymm1,%ymm0,%ymm0 BYTE $0xc5; BYTE $0xfc; BYTE $0x11; BYTE $0x06 // vmovups %ymm0,(%rsi) ADDQ $32, SI ADDQ $32, DI SUBQ $8, AX JGE loop remainder: ADDQ $8, AX JE done SUBQ $4, AX JL remainder1head remainder4: // VMOVUPS (SI), X0 // VMOVUPS (DI), X1 // VDIVPS X1, X0, X0 // VMOVUPS X0, (SI) BYTE $0xc5; BYTE $0xf8; BYTE $0x10; BYTE $0x06 // vmovups (%rsi),%xmm0 BYTE $0xc5; BYTE $0xf8; BYTE $0x10; BYTE $0x0f // vmovups (%rdi),%xmm1 BYTE $0xc5; BYTE $0xf8; BYTE $0x5e; BYTE $0xc1 // vdivps %xmm1,%xmm0,%xmm0 BYTE $0xc5; BYTE $0xf8; BYTE $0x11; BYTE $0x06 // vmovups %xmm0,(%rsi) ADDQ $16, SI ADDQ $16, DI SUBQ $4, AX JGE remainder4 remainder1head: ADDQ $4, AX JE done remainder1: // VMOVSS (SI), X0 // VMOVSS (DI), X1 // VDIVSS X1, X0, X0 // VMOVSS X0, (SI) BYTE $0xc5; BYTE $0xfa; BYTE $0x10; BYTE $0x06 // vmovss (%rsi),%xmm0 BYTE $0xc5; BYTE $0xfa; BYTE $0x10; BYTE $0x0f // vmovss (%rdi),%xmm1 BYTE $0xc5; BYTE $0xfa; BYTE $0x5e; BYTE $0xc1 // vdivss %xmm1,%xmm0,%xmm0 BYTE $0xc5; BYTE $0xfa; BYTE $0x11; BYTE $0x06 // vmovss %xmm0,(%rsi) // update pointer to the top of the data ADDQ $4, SI ADDQ $4, DI DECQ AX JNE remainder1 done: RET vecf32-0.9.0/asm_vecDiv_sse.s000066400000000000000000000020601353362424200157300ustar00rootroot00000000000000// +build sse // +build amd64 #include "textflag.h" // func divAsm(a, b []float32) TEXT ·divAsm(SB), NOSPLIT, $0 MOVQ a_data+0(FP), SI MOVQ b_data+24(FP), DI // use destination index register for this MOVQ a_len+8(FP), AX // len(a) into AX // check if there are at least 16 elements SUBQ $16, AX JL remainder loop: // a[0] MOVAPS (SI), X0 MOVAPS (DI), X1 DIVPS X1, X0 MOVAPS X0, (SI) MOVAPS 16(SI), X2 
MOVAPS 16(DI), X3 DIVPS X3, X2 MOVAPS X2, 16(SI) MOVAPS 32(SI), X4 MOVAPS 32(DI), X5 DIVPS X5, X4 MOVAPS X4, 32(SI) MOVAPS 48(SI), X6 MOVAPS 48(DI), X7 DIVPS X7, X6 MOVAPS X6, 48(SI) // update pointers. 4 registers, 4 elements each, 4 bytes per element ADDQ $64, SI ADDQ $64, DI // len(a) is now 4*4 elements less SUBQ $16, AX JGE loop remainder: ADDQ $16, AX JE done remainderloop: // copy into the appropriate registers MOVSS (SI), X0 MOVSS (DI), X1 DIVSS X1, X0 // save it back MOVSS X0, (SI) // update pointer to the top of the data ADDQ $4, SI ADDQ $4, DI DECQ AX JNE remainderloop done: RET vecf32-0.9.0/asm_vecInvSqrt_avx.s000066400000000000000000000037601353362424200166300ustar00rootroot00000000000000// +build avx // +build amd64 // +build !fastmath /* InvSqrt is a function that inverse square roots (1/√x) each element in a []float32 Because of the way VBROADCASTSS works, we first backup the first element of the slice into a register, BX. Meanwhile, we replace the first element with a constant 1.0. This is done so that we can broadcast the constant into the Y1 register. After 1.0 has been broadcasted into Y1, we move the value back into the top of the slice. The following is then performed: Y0 = Sqrt(a[i:i+8]) Y0 = Y1/Y0 And the standard looping thing happens */ #include "textflag.h" #define one 0x3f800000 // func InvSqrt(a []float32) TEXT ·InvSqrt(SB), NOSPLIT, $0 MOVQ a_data+0(FP), SI MOVQ SI, CX MOVQ a_len+8(FP), AX // len(a) into AX - +8, because first 8 is pointer, second 8 is length, third 8 is cap // make sure that len(a) >= 1 XORQ BX, BX CMPQ BX, AX JGE done MOVL $one, DX SUBQ $8, AX JL remainder // store the first element in BX // This is done so that we can move 1.0 into the first element of the slice // because AVX instruction vbroadcastss can only read from memory location not from registers MOVL (SI), BX // load 1.0 into the first element MOVL DX, (SI) // VBROADCASTSS (SI), Y1 BYTE $0xc4; BYTE $0xe2; BYTE $0x7d; BYTE $0x18; BYTE $0x0e // vbroadcastss (%rsi),%ymm1 // now that we're done with the ghastly business of trying to broadcast 1.0 without using any extra memory... // we restore the first element MOVL BX, (SI) loop: // a[0] to a[7] // VSQRTPS (SI), Y0 // VDIVPS Y0, Y1, Y0 // VMOVUPS Y0, (SI) BYTE $0xc5; BYTE $0xfc; BYTE $0x51; BYTE $0x06 // vsqrtpd (%rsi),%ymm0 BYTE $0xc5; BYTE $0xf4; BYTE $0x5e; BYTE $0xc0 // vdivps %ymm0, %ymm1, %ymm0 BYTE $0xc5; BYTE $0xfc; BYTE $0x11; BYTE $0x06 // vmovups %ymm0, (%rsi) ADDQ $32, SI SUBQ $8, AX JGE loop remainder: ADDQ $8, AX JE done remainder1: MOVQ DX, X1 MOVSS (SI), X0 SQRTSS X0, X0 DIVSS X0, X1 MOVSS X1, (SI) ADDQ $4, SI DECQ AX JNE remainder1 done: RET vecf32-0.9.0/asm_vecInvSqrt_sse.s000066400000000000000000000024221353362424200166160ustar00rootroot00000000000000// +build sse // +build amd64 // +build !fastmath /* InvSqrt is a function that inverse square roots (1/√x) each element in a []float32 The SSE version uses SHUFPS to "broadcast" the 1.0 constant to the X1 register. 
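Concretely, SHUFPS $0x00, X1, X1 copies the lowest lane of X1 into all four lanes, so X1 holds {1.0, 1.0, 1.0, 1.0}; a copy is kept in X2 because DIVPS clobbers X1 on every loop iteration.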
The rest proceeds as expected */ #include "textflag.h" #define one 0x3f800000 // func InvSqrt(a []float32) TEXT ·InvSqrt(SB), NOSPLIT, $0 MOVQ a_data+0(FP), SI MOVQ SI, CX MOVQ a_len+8(FP), AX // len(a) into AX // make sure that len(a) >= 1 XORQ BX, BX CMPQ BX, AX JGE done MOVL $one, DX SUBQ $4, AX JL remainder // back up the first element of the slice MOVL (SI), BX MOVL DX, (SI) // broadcast 1.0 to all elements of X1 // 0x00 shuffles the least significant bits of the X1 reg, which means the first element is repeated MOVUPS (SI), X1 SHUFPS $0x00, X1, X1 MOVAPS X1, X2 // backup, because X1 will get clobbered in DIVPS // restore the first element now we're done MOVL BX, (SI) loop: MOVAPS X2, X1 SQRTPS (SI), X0 DIVPS X0, X1 MOVUPS X1, (SI) // we processed 4 elements. Each element is 4 bytes. So jump 16 ahead ADDQ $16, SI SUBQ $4, AX JGE loop remainder: ADDQ $4, AX JE done remainder1: MOVQ DX, X1 MOVSS (SI), X0 SQRTSS X0, X0 DIVSS X0, X1 MOVSS X1, (SI) ADDQ $4, SI DECQ AX JNE remainder1 done: RET vecf32-0.9.0/asm_vecMul_avx.s000066400000000000000000000064421353362424200157570ustar00rootroot00000000000000// +build avx // +build amd64 /* Mul multiplies two []float32 with some SIMD optimizations using AVX. Instead of doing this: for i := 0; i < len(a); i++ { a[i] *= b[i] } Here, I use the term "pairs" to denote an element of `a` and and element of `b` that will be added together. a[i], b[i] is a pair. Using AVX, we can simultaneously add 8 pairs at the same time, which will look something like this: for i := 0; i < len(a); i+=8{ a[i:i+8] *= b[i:i+8] // this code won't run. } AVX registers are 256 bits, meaning we can put 8 float32s in there. These are the registers I use to store the relevant information: SI - Used to store the top element of slice A (index 0). This register is incremented every loop DI - used to store the top element of slice B. Incremented every loop AX - len(a) is stored in here. AX is also used as the "working" count of the length that is decremented. Y0, Y1 - YMM registers. X0, X1 - XMM registers. This pseudocode best explains the rather simple assembly: lenA := len(a) i := 0 loop: for { a[i:i+8*4] *= b[i:i+8*4] lenA -= 8 i += 8 * 4 // 8 elements, 4 bytes each if lenA < 0{ break } } remainder4head: lenA += 8 if lenA == 0 { return } remainder4: for { a[i:i+4*4] *= b[i:i+4*4] lenA -=4 i += 4 * 4 // 4 elements, 4 bytes each if lenA < 0{ break } } remainder1head: lenA += 4 if lenA == 0 { return } remainder1: for { a[i] *= b[i] i+=4 // each element is 4 bytes lenA-- } return */ #include "textflag.h" // func mulAsm(a, b []float32) TEXT ·mulAsm(SB), NOSPLIT, $0 MOVQ a_data+0(FP), SI MOVQ b_data+24(FP), DI // use detination index register for this MOVQ a_len+8(FP), AX // len(a) into AX // each ymm register can take up to 8 float32s. 
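// If len(a) < 8, the SUBQ below leaves AX negative and JL skips straight to the 4-wide (XMM) and scalar remainder paths.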
SUBQ $8, AX JL remainder loop: // a[0] to a[7] // VMOVUPS 0(SI), Y0 // VMOVUPS 0(DI), Y1 // VMULPS Y0, Y1, Y0 // VMOVUPS Y0, 0(SI) BYTE $0xc5; BYTE $0xfc; BYTE $0x10; BYTE $0x06 // vmovups (%rsi),%ymm0 BYTE $0xc5; BYTE $0xfc; BYTE $0x10; BYTE $0x0f // vmovups (%rdi),%ymm1 BYTE $0xc5; BYTE $0xf4; BYTE $0x59; BYTE $0xc0 // vmulps %ymm0,%ymm1,%ymm0 BYTE $0xc5; BYTE $0xfc; BYTE $0x11; BYTE $0x06 // vmovups %ymm0,(%rsi) ADDQ $32, SI ADDQ $32, DI SUBQ $8, AX JGE loop remainder: ADDQ $8, AX JE done SUBQ $4, AX JL remainder1head remainder4: // VMOVUPS (SI), X0 // VMOVUPS (DI), X1 // VMULPS X0, X1, X0 // VMOVUPS X0 (SI) BYTE $0xc5; BYTE $0xf8; BYTE $0x10; BYTE $0x06 // vmovups (%rsi),%xmm0 BYTE $0xc5; BYTE $0xf8; BYTE $0x10; BYTE $0x0f // vmovups (%rdi),%xmm1 BYTE $0xc5; BYTE $0xf0; BYTE $0x59; BYTE $0xc0 // vmulps %xmm0,%xmm1,%xmm0 BYTE $0xc5; BYTE $0xf8; BYTE $0x11; BYTE $0x06 // vmovups %xmm0,(%rsi) ADDQ $16, SI ADDQ $16, DI SUBQ $4, AX JGE remainder4 remainder1head: ADDQ $4, AX JE done remainder1: // copy into the appropriate registers // VMOVSS (SI), X0 // VMOVSS (DI), X1 // VMULSS X0, X1, X0 // VMOVSS X0, (SI) BYTE $0xc5; BYTE $0xfa; BYTE $0x10; BYTE $0x06 // vmovss (%rsi),%xmm0 BYTE $0xc5; BYTE $0xfa; BYTE $0x10; BYTE $0x0f // vmovss (%rdi),%xmm1 BYTE $0xc5; BYTE $0xf2; BYTE $0x59; BYTE $0xc0 // vmulss %xmm0,%xmm1,%xmm0 BYTE $0xc5; BYTE $0xfa; BYTE $0x11; BYTE $0x06 // vmovss %xmm0,(%rsi) // update pointer to the top of the data ADDQ $4, SI ADDQ $4, DI DECQ AX JNE remainder1 done: RET vecf32-0.9.0/asm_vecMul_sse.s000066400000000000000000000017741353362424200157560ustar00rootroot00000000000000// +build sse // +build amd64 #include "textflag.h" // func mulAsm(a, b []float32) TEXT ·mulAsm(SB), NOSPLIT, $0 MOVQ a_data+0(FP), SI MOVQ b_data+24(FP), DI // use destination index register for this MOVQ a_len+8(FP), AX // len(a) into AX // check if there are at least 16 elements SUBQ $16, AX JL remainder loop: // a[0] MOVAPS (SI), X0 MOVAPS (DI), X1 MULPS X0, X1 MOVAPS X1, (SI) MOVAPS 16(SI), X2 MOVAPS 16(DI), X3 MULPS X2, X3 MOVAPS X3, 16(SI) MOVAPS 32(SI), X4 MOVAPS 32(DI), X5 MULPS X4, X5 MOVAPS X5, 32(SI) MOVAPS 48(SI), X6 MOVAPS 48(DI), X7 MULPS X6, X7 MOVAPS X7, 48(SI) // update pointers. 4 registers, 4 elements at once, each element is 4 bytes ADDQ $64, SI ADDQ $64, DI // len(a) is now 4*4 elements less SUBQ $16, AX JGE loop remainder: ADDQ $16, AX JE done remainderloop: MOVSS (SI), X0 MOVSS (DI), X1 MULSS X0, X1 MOVSS X1, (SI) // update pointer to the top of the data ADDQ $4, SI ADDQ $4, DI DECQ AX JNE remainderloop done: RET vecf32-0.9.0/asm_vecSqrt_avx.s000066400000000000000000000021121353362424200161410ustar00rootroot00000000000000// +build avx // +build amd64 // +build !fastmath /* Sqrt takes a []float32 and square roots every element in the slice. 
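The pure-Go fallback in go.go is simply: for i, v := range a { a[i] = math32.Sqrt(v) }. The assembly below does the same work eight float32s at a time with VSQRTPS on a YMM register, then four at a time with the XMM form, and finally one float32 at a time with SQRTSS for any remainder.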
*/ #include "textflag.h" // func Sqrt(a []float32) TEXT ·Sqrt(SB), NOSPLIT, $0 MOVQ a_data+0(FP), SI MOVQ SI, CX MOVQ a_len+8(FP), AX // len(a) into AX - +8, because first 8 is pointer, second 8 is length, third 8 is cap SUBQ $8, AX JL remainder loop: // a[0] to a[7] // VSQRTPS (SI), Y0 // VMOVUPS Y0, (SI) BYTE $0xc5; BYTE $0xfc; BYTE $0x51; BYTE $0x06 // vsqrtps (%rsi),%ymm0 BYTE $0xc5; BYTE $0xfc; BYTE $0x11; BYTE $0x06 // vmovups %ymm0,(%rsi) ADDQ $32, SI SUBQ $8, AX JGE loop remainder: ADDQ $8, AX JE done SUBQ $4, AX JL remainder1head remainder4: // VSQRTPS (SI), X0 // VMOVUPS X0, (SI) BYTE $0xc5; BYTE $0xf8; BYTE $0x51; BYTE $0x06 // vsqrtps (%rsi),%xmm0 BYTE $0xc5; BYTE $0xf8; BYTE $0x11; BYTE $0x06 // vmovups %xmm0,(%rsi) ADDQ $16, SI SUBQ $4, AX JGE remainder4 remainder1head: ADDQ $4, AX JE done remainder1: MOVSS (SI), X0 SQRTSS X0, X0 MOVSS X0, (SI) ADDQ $4, SI DECQ AX JNE remainder1 done: RET vecf32-0.9.0/asm_vecSqrt_sse.s000066400000000000000000000012511353362424200161400ustar00rootroot00000000000000// +build sse // +build amd64 // +build !fastmath /* Sqrt takes a []float32 and square roots every element in the slice. */ #include "textflag.h" // func Sqrt(a []float32) TEXT ·Sqrt(SB), NOSPLIT, $0 MOVQ a_data+0(FP), SI MOVQ SI, CX MOVQ a_len+8(FP), AX // len(a) into AX - +8, because first 8 is pointer, second 8 is length, third 8 is cap SUBQ $4, AX JL remainder loop: SQRTPS (SI), X0 MOVUPS X0, (SI) // we processed 4 elements. Each element is 4 bytes. So jump 16 ahead ADDQ $16, SI SUBQ $4, AX JGE loop remainder: ADDQ $4, AX JE done remainder1: MOVSS (SI), X0 SQRTSS X0, X0 MOVSS X0, (SI) ADDQ $4, SI DECQ AX JNE remainder1 done: RET vecf32-0.9.0/asm_vecSub_avx.s000066400000000000000000000074151353362424200157540ustar00rootroot00000000000000// +build avx // +build amd64 /* Sub subtracts two []float32 with some SIMD optimizations using AVX. Instead of doing this: for i := 0; i < len(a); i++ { a[i] -= b[i] } Here, I use the term "pairs" to denote an element of `a` and and element of `b` that will be added together. a[i], b[i] is a pair. Using AVX, we can simultaneously add 8 pairs at the same time, which will look something like this: for i := 0; i < len(a); i+=8{ a[i:i+8] -= b[i:i+8] // this code won't run. } These are the registers I use to store the relevant information: SI - Used to store the top element of slice A (index 0). This register is incremented every loop DI - used to store the top element of slice B. Incremented every loop AX - len(a) is stored in here. Volatile register. AX is also used as the "working" count of the length that is decremented. AX - len(a) is stored in here. AX is also used as the "working" count of the length that is decremented. Y0, Y1 - YMM registers. X0, X1 - XMM registers. With regards to VSUBPS and VSUBSS, it turns out that the description of these instructions are: VSUBPS ymm1, ymm2, ymm3: Subtract packed double-precision floating-point values in ymm3/mem from ymm2 and stores result in ymm1.[0] The description is written with intel's syntax (in this form: Dest, Src1, Src2). 
When converting to Go's ASM it becomes: (Src2, Src1, Dest) This pseudocode best explains the rather simple assembly: lenA := len(a) i := 0 loop: for { a[i:i+8*4] -= b[i:i+8*4] lenA -= 8 i += 8*4 // 8 elements, 4 bytes each if lenA < 0{ break } } remainder4head: lenA += 8 if lenA == 0 { return } remainder4: for { a[i:i+4*4] -= b[i:i+4*4] lenA -=4 i += 4 * 4 // 4 elements, 4 bytes each if lenA < 0{ break } } remainder1head: lenA += 4 if lenA == 0 { return } remainder1: for { a[i] -= b[i] i+=4 // each element is 4 bytes lenA-- } return Citation ======== [0]http://www.felixcloutier.com/x86/SUBPS.html */ #include "textflag.h" // func subAsm(a, b []float32) TEXT ·subAsm(SB), NOSPLIT, $0 MOVQ a_data+0(FP), SI MOVQ b_data+24(FP), DI // use destination index register for this MOVQ a_len+8(FP), AX // len(a) into AX MOVQ b_len+32(FP), BX // len(b) into BX // each ymm register can take 8 float32s SUBQ $8, AX JL remainder loop: // a[0] to a[7] // VMOVUPS (SI), Y0 // VMOVUPS (DI), Y1 // VSUBPS Y1, Y0, Y0 // VMOVUPS Y0, (SI) BYTE $0xc5; BYTE $0xfc; BYTE $0x10; BYTE $0x06 // vmovups (%rsi),%ymm0 BYTE $0xc5; BYTE $0xfc; BYTE $0x10; BYTE $0x0f // vmovups (%rdi),%ymm1 BYTE $0xc5; BYTE $0xfc; BYTE $0x5c; BYTE $0xc1 // vsubps %ymm1,%ymm0,%ymm0 BYTE $0xc5; BYTE $0xfC; BYTE $0x11; BYTE $0x06 // vmovups %ymm0,(%rsi) // 8 elements processed. Each element is 4 bytes. So jump 32 bytes ahead ADDQ $32, SI ADDQ $32, DI SUBQ $8, AX JGE loop remainder: ADDQ $8, AX JE done SUBQ $4, AX JL remainder1head remainder4: // VMOVUPS (SI), X0 // VMOVUPS (DI), X1 // VSUBPS X1, X0, X0 // VMOVUPS X0, (SI) BYTE $0xc5; BYTE $0xf8; BYTE $0x10; BYTE $0x06 // vmovups (%rsi),%xmm0 BYTE $0xc5; BYTE $0xf8; BYTE $0x10; BYTE $0x0f // vmovups (%rdi),%xmm1 BYTE $0xc5; BYTE $0xf8; BYTE $0x5c; BYTE $0xc1 // vsubps %xmm1,%xmm0,%xmm0 BYTE $0xc5; BYTE $0xf8; BYTE $0x11; BYTE $0x06 // vmovups %xmm0,(%rsi) ADDQ $16, SI ADDQ $16, DI SUBQ $4, AX JGE remainder4 remainder1head: ADDQ $4, AX JE done remainder1: // copy into the appropriate registers // VMOVSS (SI), X0 // VMOVSS (DI), X1 // VSUBSS X1, X0, X0 // VMOVSS X0, (SI) BYTE $0xc5; BYTE $0xfa; BYTE $0x10; BYTE $0x06 BYTE $0xc5; BYTE $0xfa; BYTE $0x10; BYTE $0x0f BYTE $0xc5; BYTE $0xfa; BYTE $0x5c; BYTE $0xc1 BYTE $0xc5; BYTE $0xfa; BYTE $0x11; BYTE $0x06 // update pointer to the top of the data ADDQ $4, SI ADDQ $4, DI DECQ AX JNE remainder1 done: RET vecf32-0.9.0/asm_vecSub_sse.s000066400000000000000000000021741353362424200157450ustar00rootroot00000000000000// +build sse // +build amd64 #include "textflag.h" // func subAsm(a, b []float32) TEXT ·subAsm(SB), NOSPLIT, $0 MOVQ a_data+0(FP), SI MOVQ b_data+24(FP), DI // use destination index register for this MOVQ a_len+8(FP), AX // len(a) into AX - +8, because first 8 is pointer, second 8 is length, third 8 is cap SUBQ $16, AX // at least 16 elements? JL remainder loop: // a[0] MOVAPS (SI), X0 MOVAPS (DI), X1 SUBPS X1, X0 MOVAPS X0, (SI) MOVAPS 16(SI), X2 MOVAPS 16(DI), X3 SUBPS X3, X2 MOVAPS X2, 16(SI) MOVAPS 32(SI), X4 MOVAPS 32(DI), X5 SUBPS X5, X4 MOVAPS X4, 32(SI) MOVAPS 48(SI), X6 MOVAPS 48(DI), X7 SUBPS X7, X6 MOVAPS X6, 48(SI) // update pointers. 4 element per register, 4 registers, 4 bytes per element. 
So jump 4*4*4 bytes ahead ADDQ $64, SI ADDQ $64, DI // len(a) now 4*4 less SUBQ $16, AX JGE loop remainder: ADDQ $16, AX JE done remainderloop: // copy into the appropriate registers MOVSS (SI), X0 MOVSS (DI), X1 SUBSS X1, X0 // save it back MOVSS X0, (SI) // update pointer to the top of the data ADDQ $4, SI ADDQ $4, DI DECQ AX JNE remainderloop done: RET vecf32-0.9.0/bench.sh000077500000000000000000000011321353362424200142270ustar00rootroot00000000000000set -ex benchtime=${1:-1s} go test -bench . -benchtime $benchtime go test -tags='sse' -bench . -benchtime $benchtime go test -tags='avx' -bench . -benchtime $benchtime # travis compiles commands in script and then executes in bash. By adding # set -e we are changing the travis build script's behavior, and the set # -e lives on past the commands we are providing it. Some of the travis # commands are supposed to exit with non zero status, but then continue # executing. set -x makes the travis log files extremely verbose and # difficult to understand. # # see travis-ci/travis-ci#5120 set +ex vecf32-0.9.0/doc.go000066400000000000000000000055401353362424200137140ustar00rootroot00000000000000// Package vecf32 provides common functions and methods for slices of float32. // // Name // // In the days of yore, scientists who computed with computers would use arrays to represent vectors, each value representing // magnitude and/or direction. Then came the C++ Standard Templates Library, which sought to provide this data type in the standard // library. Now, everyone conflates a term "vector" with dynamic arrays. // // In the C++ book, Bjarne Stroustrup has this to say: // One could argue that valarray should have been called vector because is is a traditional mathematical vector // and that vector should have been called array. // However, this is not the way that the terminology evolved. // A valarray is a vector optimized for numeric computation; // a vector is a flexible container designed for holding and manipulating objects of a wide variety of types; // and an array is a low-level built-in type // // Go has a better name for representing dynamically allocated arrays of any type - "slice". However, "slice" is both a noun and verb // and many libraries that I use already use "slice"-as-a-verb as a name, so I had to settle for the second best name: "vector". // // It should be noted that while the names used in this package were definitely mathematically inspired, they bear only little resemblance // the actual mathematical operations performed. // // Naming Convention // // The names of the operations assume you're working with slices of float32s. Hence `Add` performs elementwise addition between two []float32. // // Operations between []float32 and float32 are also supported, however they are differently named. Here are the equivalents: /* +------------------------+--------------------------------------------+ | []float32-[]float32 Op | []float32-float32 Op | +------------------------+--------------------------------------------+ | Add(a, b []float32) | Trans(a float32, b []float32) | | Sub(a, b []float32) | TransInv/TransInvR(a float32, b []float32) | | Mul(a, b []float32) | Scale(a float32, b []float32) | | Div(a, b []float32) | ScaleInv/ScaleInvR(a float32, b []float32) | | Pow(a, b []float32) | PowOf/PowOfR(a float32, b []float32) | +------------------------+--------------------------------------------+ */ // You may note that for the []float64 - float64 binary operations, the scalar (float64) is always the first operand. 
In operations // that are not commutative, an additional function is provided, suffixed with "R" (for reverse) // // Range Check and BCE // // This package does not provide range checking. If indices are out of range, the functions will panic. This package should play well with BCE. // // TODO(anyone): provide SIMD vectorization for Incr and []float32-float64 functions // Pull requests accepted package vecf32 // import "gorgonia.org/vecf32" vecf32-0.9.0/go.go000066400000000000000000000017261353362424200135560ustar00rootroot00000000000000// +build !avx,!sse package vecf32 import "github.com/chewxy/math32" // Add performs a̅ + b̅. a̅ will be clobbered func Add(a, b []float32) { b = b[:len(a)] for i, v := range a { a[i] = v + b[i] } } // Sub performs a̅ - b̅. a̅ will be clobbered func Sub(a, b []float32) { b = b[:len(a)] for i, v := range a { a[i] = v - b[i] } } // Mul performs a̅ × b̅. a̅ will be clobbered func Mul(a, b []float32) { b = b[:len(a)] for i, v := range a { a[i] = v * b[i] } } // Div performs a̅ ÷ b̅. a̅ will be clobbered func Div(a, b []float32) { b = b[:len(a)] for i, v := range a { if b[i] == 0 { a[i] = math32.Inf(0) continue } a[i] = v / b[i] } } // Sqrt performs √a̅ elementwise. a̅ will be clobbered func Sqrt(a []float32) { for i, v := range a { a[i] = math32.Sqrt(v) } } // InvSqrt performs 1/√a̅ elementwise. a̅ will be clobbered func InvSqrt(a []float32) { for i, v := range a { a[i] = float32(1) / math32.Sqrt(v) } } vecf32-0.9.0/go.mod000066400000000000000000000002771353362424200137300ustar00rootroot00000000000000module gorgonia.org/vecf32 go 1.13 require ( github.com/chewxy/math32 v1.0.0 github.com/davecgh/go-spew v1.1.0 github.com/pmezard/go-difflib v1.0.0 github.com/stretchr/testify v1.1.4 ) vecf32-0.9.0/go.sum000066400000000000000000000012601353362424200137460ustar00rootroot00000000000000github.com/chewxy/math32 v1.0.0 h1:RTt2SACA7BTzvbsAKVQJLZpV6zY2MZw4bW9L2HEKkHg= github.com/chewxy/math32 v1.0.0/go.mod h1:Miac6hA1ohdDUTagnvJy/q+aNnEk16qWUdb8ZVhvCN0= github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/stretchr/testify v1.1.4 h1:ToftOQTytwshuOSj6bDSolVUa3GINfJP/fg3OkkOzQQ= github.com/stretchr/testify v1.1.4/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= vecf32-0.9.0/go_test.go000066400000000000000000000051301353362424200146060ustar00rootroot00000000000000// +build !sse // +build !avx package vecf32 /* IMPORTANT NOTE: Currently Div does not handle division by zero correctly. 
It returns a NaN instead of +Inf */ import ( "testing" "unsafe" "github.com/chewxy/math32" "github.com/stretchr/testify/assert" ) func TestDiv(t *testing.T) { assert := assert.New(t) a := Range(0, niceprime-1) correct := Range(0, niceprime-1) for i := range correct { correct[i] = 1 } Div(a, a) assert.Equal(correct[1:], a[1:]) assert.Equal(true, math32.IsInf(a[0], 0), "a[0] is: %v", a[0]) b := Range(niceprime, 2*niceprime-1) for i := range correct { correct[i] = a[i] / b[i] } Div(a, b) assert.Equal(correct[1:], a[1:]) assert.Equal(true, math32.IsInf(a[0], 0), "a[0] is: %v", a[0]) /* Weird Corner Cases*/ for i := 1; i < 65; i++ { a = Range(0, i) var testAlign bool addr := &a[0] u := uint(uintptr(unsafe.Pointer(addr))) if u&uint(32) != 0 { testAlign = true } if testAlign { b = Range(i, 2*i) correct = make([]float32, i) for j := range correct { correct[j] = a[j] / b[j] } Div(a, b) assert.Equal(correct[1:], a[1:]) } } } func TestSqrt(t *testing.T) { assert := assert.New(t) a := Range(0, niceprime-1) correct := Range(0, niceprime-1) for i, v := range correct { correct[i] = math32.Sqrt(v) } Sqrt(a) assert.Equal(correct, a) // negatives a = []float32{-1, -2, -3, -4} Sqrt(a) for _, v := range a { if !math32.IsNaN(v) { t.Error("Expected NaN") } } /* Weird Corner Cases*/ for i := 1; i < 65; i++ { a = Range(0, i) var testAlign bool addr := &a[0] u := uint(uintptr(unsafe.Pointer(addr))) if u&uint(32) != 0 { testAlign = true } if testAlign { correct = make([]float32, i) for j := range correct { correct[j] = math32.Sqrt(a[j]) } Sqrt(a) assert.Equal(correct, a) } } } func TestInvSqrt(t *testing.T) { assert := assert.New(t) a := Range(0, niceprime-1) correct := Range(0, niceprime-1) for i, v := range correct { correct[i] = 1.0 / math32.Sqrt(v) } InvSqrt(a) assert.Equal(correct[1:], a[1:]) if !math32.IsInf(a[0], 0) { t.Error("1/0 should be +Inf or -Inf") } // Weird Corner Cases for i := 1; i < 65; i++ { a = Range(0, i) var testAlign bool addr := &a[0] u := uint(uintptr(unsafe.Pointer(addr))) if u&uint(32) != 0 { testAlign = true } if testAlign { correct = make([]float32, i) for j := range correct { correct[j] = 1.0 / math32.Sqrt(a[j]) } InvSqrt(a) assert.Equal(correct[1:], a[1:], "i = %d, %v", i, Range(0, i)) if !math32.IsInf(a[0], 0) { t.Error("1/0 should be +Inf or -Inf") } } } } vecf32-0.9.0/incr.go000066400000000000000000000062731353362424200141060ustar00rootroot00000000000000package vecf32 import "github.com/chewxy/math32" // IncrAdd performs a̅ + b̅ and then adds it elementwise to the incr slice func IncrAdd(a, b, incr []float32) { b = b[:len(a)] incr = incr[:len(a)] for i, v := range a { incr[i] += v + b[i] } } // IncrSub performs a̅ = b̅ and then adds it elementwise to the incr slice func IncrSub(a, b, incr []float32) { b = b[:len(a)] incr = incr[:len(a)] for i, v := range a { incr[i] += v - b[i] } } // IncrMul performs a̅ × b̅ and then adds it elementwise to the incr slice func IncrMul(a, b, incr []float32) { b = b[:len(a)] incr = incr[:len(a)] for i, v := range a { incr[i] += v * b[i] } } func IncrDiv(a, b, incr []float32) { b = b[:len(a)] incr = incr[:len(a)] for i, v := range a { if b[i] == 0 { incr[i] = math32.Inf(0) continue } incr[i] += v / b[i] } } // IncrDiv performs a̅ ÷ b̅. 
// IncrPow performs a̅ ^ b̅ and then adds it elementwise to the incr slice func IncrPow(a, b, incr []float32) { b = b[:len(a)] incr = incr[:len(a)] for i, v := range a { switch b[i] { case 0: incr[i]++ case 1: incr[i] += v case 2: incr[i] += v * v case 3: incr[i] += v * v * v default: incr[i] += math32.Pow(v, b[i]) } } } // IncrMod performs a̅ % b̅ then adds it to incr func IncrMod(a, b, incr []float32) { b = b[:len(a)] incr = incr[:len(a)] for i, v := range a { incr[i] += math32.Mod(v, b[i]) } } // IncrScale multiplies all values in the slice by the scalar and then increments the incr slice // incr += a̅ * s func IncrScale(a []float32, s float32, incr []float32) { incr = incr[:len(a)] for i, v := range a { incr[i] += v * s } } // IncrScaleInv divides all values in the slice by the scalar and then increments the incr slice // incr += a̅ / s func IncrScaleInv(a []float32, s float32, incr []float32) { IncrScale(a, 1/s, incr) } // IncrScaleInvR divides the scalar by all values in the slice and then increments the incr slice // incr += s / a̅ func IncrScaleInvR(a []float32, s float32, incr []float32) { incr = incr[:len(a)] for i, v := range a { incr[i] += s / v } } // IncrTrans adds a scalar to all the values in the slice and then increments the incr slice // incr += a̅ + s func IncrTrans(a []float32, s float32, incr []float32) { incr = incr[:len(a)] for i, v := range a { incr[i] += v + s } } // IncrTransInv subtracts a scalar from all the values in the slice and then increments the incr slice // incr += a̅ - s func IncrTransInv(a []float32, s float32, incr []float32) { IncrTrans(a, -s, incr) } // IncrTransInvR subtracts all the numbers in a slice from a scalar and then increments the incr slice // incr += s - a̅ func IncrTransInvR(a []float32, s float32, incr []float32) { incr = incr[:len(a)] for i, v := range a { incr[i] += s - v } } // IncrPowOf raises each value in the slice to the power of the scalar and then increments the incr slice // incr += a̅ ^ s func IncrPowOf(a []float32, s float32, incr []float32) { incr = incr[:len(a)] for i, v := range a { incr[i] += math32.Pow(v, s) } } // IncrPowOfR raises the scalar to the power of each value in the slice and then increments the incr slice.
// incr += s ^ a̅ func IncrPowOfR(a []float32, s float32, incr []float32) { incr = incr[:len(a)] for i, v := range a { incr[i] += math32.Pow(s, v) } } vecf32-0.9.0/incr_test.go000066400000000000000000000105651353362424200151440ustar00rootroot00000000000000package vecf32 import ( "testing" "github.com/chewxy/math32" "github.com/stretchr/testify/assert" ) func makeIncr(size int) []float32 { retVal := make([]float32, size) for i := range retVal { retVal[i] = 100 } return retVal } func TestIncrAdd(t *testing.T) { assert := assert.New(t) a := Range(0, niceprime) incr := makeIncr(len(a)) correct := Range(0, niceprime) for i := range correct { correct[i] = correct[i] + correct[i] + incr[i] } IncrAdd(a, a, incr) assert.Equal(correct, incr) b := Range(niceprime, 2*niceprime) for i := range correct { correct[i] = a[i] + b[i] + incr[i] } IncrAdd(a, b, incr) assert.Equal(correct, incr) } func TestIncrSub(t *testing.T) { assert := assert.New(t) a := Range(0, niceprime) incr := makeIncr(len(a)) correct := make([]float32, niceprime) copy(correct, incr) IncrSub(a, a, incr) assert.Equal(correct, incr) b := Range(niceprime, 2*niceprime) for i := range correct { correct[i] = a[i] - b[i] + incr[i] } IncrSub(a, b, incr) assert.Equal(correct, incr) } func TestIncrMul(t *testing.T) { assert := assert.New(t) a := Range(0, niceprime) incr := makeIncr(len(a)) correct := Range(0, niceprime) for i := range correct { correct[i] = correct[i]*correct[i] + incr[i] } IncrMul(a, a, incr) assert.Equal(correct, incr) b := Range(niceprime, 2*niceprime) for i := range correct { correct[i] = a[i]*b[i] + incr[i] } IncrMul(a, b, incr) assert.Equal(correct, incr) } func TestIncrDiv(t *testing.T) { assert := assert.New(t) a := []float32{1, 2, 4, 8, 10} incr := makeIncr(len(a)) correct := make([]float32, len(a)) copy(correct, a) for i := range correct { correct[i] = correct[i]/correct[i] + incr[i] } IncrDiv(a, a, incr) assert.Equal(correct, incr) b := []float32{2, 4, 8, 16, 20} incr = makeIncr(len(a)) for i := range correct { correct[i] = a[i]/b[i] + incr[i] } IncrDiv(a, b, incr) assert.Equal(correct, incr) // division by 0 b = make([]float32, len(a)) IncrDiv(a, b, incr) for _, v := range incr { if !math32.IsInf(v, 0) && !math32.IsNaN(v) { t.Error("Expected Inf or NaN") } } } func TestIncrPow(t *testing.T) { a := []float32{0, 1, 2, 3, 4} b := []float32{0, 1, 2, 3, 4} incr := makeIncr(len(a)) correct := make([]float32, 5) for i := range correct { correct[i] = math32.Pow(a[i], b[i]) + incr[i] } IncrPow(a, b, incr) assert.Equal(t, correct, incr) } func TestIncrScale(t *testing.T) { a := []float32{0, 1, 2, 3, 4} incr := makeIncr(len(a)) correct := make([]float32, 5) for i := range correct { correct[i] = a[i]*5 + incr[i] } IncrScale(a, 5, incr) assert.Equal(t, correct, incr) } func TestIncrScaleInv(t *testing.T) { a := []float32{0, 1, 2, 4, 6} incr := makeIncr(len(a)) correct := make([]float32, len(a)) for i := range correct { correct[i] = a[i]/2 + incr[i] } IncrScaleInv(a, 2, incr) assert.Equal(t, correct, incr) } func TestIncrScaleInvR(t *testing.T) { a := []float32{0, 1, 2, 4, 6} incr := makeIncr(len(a)) correct := make([]float32, len(a)) for i := range correct { correct[i] = 2/a[i] + incr[i] } IncrScaleInvR(a, 2, incr) assert.Equal(t, correct, incr) } func TestIncrTrans(t *testing.T) { a := []float32{1, 2, 3, 4} incr := makeIncr(len(a)) correct := make([]float32, len(a)) for i := range correct { correct[i] = a[i] + float32(1) + incr[i] } IncrTrans(a, 1, incr) assert.Equal(t, correct, incr) } func TestIncrTransInv(t *testing.T) { a := 
[]float32{1, 2, 3, 4} incr := makeIncr(len(a)) correct := make([]float32, len(a)) for i := range correct { correct[i] = a[i] - float32(1) + incr[i] } IncrTransInv(a, 1, incr) assert.Equal(t, correct, incr) } func TestIncrTransInvR(t *testing.T) { a := []float32{1, 2, 3, 4} incr := makeIncr(len(a)) correct := make([]float32, len(a)) for i := range correct { correct[i] = float32(1) - a[i] + incr[i] } IncrTransInvR(a, 1, incr) assert.Equal(t, correct, incr) } func TestIncrPowOf(t *testing.T) { a := []float32{1, 2, 3, 4} incr := makeIncr(len(a)) correct := make([]float32, len(a)) for i := range correct { correct[i] = math32.Pow(a[i], 5) + incr[i] } IncrPowOf(a, 5, incr) assert.Equal(t, correct, incr) } func TestIncrPowOfR(t *testing.T) { a := []float32{1, 2, 3, 4} incr := makeIncr(len(a)) correct := make([]float32, len(a)) for i := range correct { correct[i] = math32.Pow(5, a[i]) + incr[i] } IncrPowOfR(a, 5, incr) assert.Equal(t, correct, incr) } vecf32-0.9.0/test.sh000077500000000000000000000013711353362424200141340ustar00rootroot00000000000000set -ex go env go test -v -a -coverprofile=test.cover go test -tags='sse' -v -a -coverprofile=test.cover.sse go test -tags='avx' -v -a -coverprofile=test.cover.avx echo "mode: set" > final.cover tail -q -n +2 test.cover test.cover.sse test.cover.avx >> ./final.cover goveralls -coverprofile=./final.cover -service=travis-ci # travis compiles commands in script and then executes in bash. By adding # set -e we are changing the travis build script's behavior, and the set # -e lives on past the commands we are providing it. Some of the travis # commands are supposed to exit with non zero status, but then continue # executing. set -x makes the travis log files extremely verbose and # difficult to understand. # # see travis-ci/travis-ci#5120 set +ex vecf32-0.9.0/utils.go000066400000000000000000000013211353362424200143010ustar00rootroot00000000000000package vecf32 // Range is a function to create arithmetic progressions of float32 func Range(start, end int) []float32 { size := end - start incr := true if start > end { incr = false size = start - end } if size < 0 { panic("Cannot create a float range that is negative in size") } r := make([]float32, size) for i, v := 0, float32(start); i < size; i++ { r[i] = v if incr { v++ } else { v-- } } return r } // Reduce takes a function to reduce by, a default value, and a splatted list of float32s func Reduce(f func(a, b float32) float32, def float32, l ...float32) (retVal float32) { retVal = def if len(l) == 0 { return } for _, v := range l { retVal = f(retVal, v) } return }
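// ---------------------------------------------------------------------------
// Editor's illustrative sketch: the two helpers below are NOT part of the
// original vecf32 sources. They show how the package's pieces compose: Range
// builds an arithmetic progression, Reduce folds over it, and the Incr*
// functions accumulate into an existing slice. The names sumRange and
// accumulateMul are hypothetical.

// sumRange sums the arithmetic progression produced by Range(start, end).
func sumRange(start, end int) float32 {
	add := func(a, b float32) float32 { return a + b }
	return Reduce(add, 0, Range(start, end)...)
}

// accumulateMul adds a̅ × b̅ elementwise into acc (acc += a̅ × b̅) via IncrMul;
// a and b are left untouched.
func accumulateMul(a, b, acc []float32) {
	IncrMul(a, b, acc)
}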