You are viewing a plain text version of this content; the canonical HTML version is available in the Apache mailing-list archive.
Posted to commits@arrow.apache.org by ze...@apache.org on 2022/11/07 22:25:20 UTC
[arrow] branch master updated: ARROW-18108: [Go] More scalar binary arithmetic (Multiply and Divide) (#14544)
This is an automated email from the ASF dual-hosted git repository.
zeroshade pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 98943d90da ARROW-18108: [Go] More scalar binary arithmetic (Multiply and Divide) (#14544)
98943d90da is described below
commit 98943d90dacb0311fe0d7a17a8ef10762e1c0ef2
Author: Matt Topol <zo...@gmail.com>
AuthorDate: Mon Nov 7 17:25:14 2022 -0500
ARROW-18108: [Go] More scalar binary arithmetic (Multiply and Divide) (#14544)
Authored-by: Matt Topol <zo...@gmail.com>
Signed-off-by: Matt Topol <zo...@gmail.com>
---
go/arrow/array/compare.go | 8 +-
go/arrow/array/numeric_test.go | 18 +
go/arrow/compute/arithmetic.go | 88 +-
go/arrow/compute/arithmetic_test.go | 503 +-
go/arrow/compute/cast_test.go | 8 +-
.../internal/kernels/_lib/base_arithmetic.cc | 111 +-
.../kernels/_lib/base_arithmetic_avx2_amd64.s | 20336 +++++++++++++------
.../kernels/_lib/base_arithmetic_sse4_amd64.s | 17932 ++++++++++------
go/arrow/compute/internal/kernels/_lib/types.h | 340 +
.../compute/internal/kernels/base_arithmetic.go | 86 +-
.../internal/kernels/base_arithmetic_amd64.go | 6 +-
.../internal/kernels/base_arithmetic_avx2_amd64.s | 19677 ++++++++++++------
.../internal/kernels/base_arithmetic_sse4_amd64.s | 16877 ++++++++++-----
.../compute/internal/kernels/scalar_arithmetic.go | 4 +
go/arrow/decimal128/decimal128.go | 39 +-
go/arrow/decimal256/decimal256.go | 69 +-
go/arrow/scalar/parse.go | 14 +
17 files changed, 52762 insertions(+), 23354 deletions(-)
diff --git a/go/arrow/array/compare.go b/go/arrow/array/compare.go
index ea8ac25203..68143e0086 100644
--- a/go/arrow/array/compare.go
+++ b/go/arrow/array/compare.go
@@ -402,18 +402,18 @@ func (eq equalOption) f32(f1, f2 float32) bool {
v2 := float64(f2)
switch {
case eq.nansEq:
- return math.Abs(v1-v2) <= eq.atol || (math.IsNaN(v1) && math.IsNaN(v2))
+ return v1 == v2 || math.Abs(v1-v2) <= eq.atol || (math.IsNaN(v1) && math.IsNaN(v2))
default:
- return math.Abs(v1-v2) <= eq.atol
+ return v1 == v2 || math.Abs(v1-v2) <= eq.atol
}
}
func (eq equalOption) f64(v1, v2 float64) bool {
switch {
case eq.nansEq:
- return math.Abs(v1-v2) <= eq.atol || (math.IsNaN(v1) && math.IsNaN(v2))
+ return v1 == v2 || math.Abs(v1-v2) <= eq.atol || (math.IsNaN(v1) && math.IsNaN(v2))
default:
- return math.Abs(v1-v2) <= eq.atol
+ return v1 == v2 || math.Abs(v1-v2) <= eq.atol
}
}
diff --git a/go/arrow/array/numeric_test.go b/go/arrow/array/numeric_test.go
index 8efe41f97f..e485ba47f9 100644
--- a/go/arrow/array/numeric_test.go
+++ b/go/arrow/array/numeric_test.go
@@ -17,6 +17,8 @@
package array_test
import (
+ "encoding/json"
+ "math"
"reflect"
"testing"
@@ -135,6 +137,22 @@ func TestFloat64SliceDataWithNull(t *testing.T) {
}
}
+func TestUnmarshalSpecialFloat(t *testing.T) {
+ pool := memory.NewCheckedAllocator(memory.NewGoAllocator())
+ defer pool.AssertSize(t, 0)
+
+ bldr := array.NewFloat32Builder(pool)
+ defer bldr.Release()
+
+ assert.NoError(t, json.Unmarshal([]byte(`[3.4, "Inf", "-Inf"]`), bldr))
+ arr := bldr.NewFloat32Array()
+ defer arr.Release()
+
+ assert.False(t, math.IsInf(float64(arr.Value(0)), 0), arr.Value(0))
+ assert.True(t, math.IsInf(float64(arr.Value(1)), 1), arr.Value(1))
+ assert.True(t, math.IsInf(float64(arr.Value(2)), -1), arr.Value(2))
+}
+
func TestNewTime32Data(t *testing.T) {
data := []arrow.Time32{
arrow.Time32(1),
diff --git a/go/arrow/compute/arithmetic.go b/go/arrow/compute/arithmetic.go
index 865fa5dfe0..4b6f6109a5 100644
--- a/go/arrow/compute/arithmetic.go
+++ b/go/arrow/compute/arithmetic.go
@@ -227,6 +227,53 @@ func RegisterScalarArithmetic(reg FunctionRegistry) {
reg.AddFunction(fn, false)
}
+
+ oplist := []struct {
+ funcName string
+ op kernels.ArithmeticOp
+ decPromote decimalPromotion
+ commutative bool
+ }{
+ {"multiply_unchecked", kernels.OpMul, decPromoteMultiply, true},
+ {"multiply", kernels.OpMulChecked, decPromoteMultiply, true},
+ {"divide_unchecked", kernels.OpDiv, decPromoteDivide, false},
+ {"divide", kernels.OpDivChecked, decPromoteDivide, false},
+ }
+
+ for _, o := range oplist {
+ fn := &arithmeticFunction{*NewScalarFunction(o.funcName, Binary(), addDoc), o.decPromote}
+ for _, k := range append(kernels.GetArithmeticKernels(o.op), kernels.GetDecimalBinaryKernels(o.op)...) {
+ if err := fn.AddKernel(k); err != nil {
+ panic(err)
+ }
+ }
+
+ for _, unit := range arrow.TimeUnitValues {
+ durInput := exec.NewExactInput(&arrow.DurationType{Unit: unit})
+ i64Input := exec.NewExactInput(arrow.PrimitiveTypes.Int64)
+ durOutput := exec.NewOutputType(&arrow.DurationType{Unit: unit})
+ ex := kernels.ArithmeticExec(arrow.DURATION, o.op)
+ err := fn.AddNewKernel([]exec.InputType{durInput, i64Input}, durOutput, ex, nil)
+ if err != nil {
+ panic(err)
+ }
+ if o.commutative {
+ err = fn.AddNewKernel([]exec.InputType{i64Input, durInput}, durOutput, ex, nil)
+ if err != nil {
+ panic(err)
+ }
+ }
+ }
+
+ reg.AddFunction(fn, false)
+ }
+}
+
+func impl(ctx context.Context, fn string, opts ArithmeticOptions, left, right Datum) (Datum, error) {
+ if opts.NoCheckOverflow {
+ fn += "_unchecked"
+ }
+ return CallFunction(ctx, fn, nil, left, right)
}
// Add performs an addition between the passed in arguments (scalar or array)
@@ -235,13 +282,9 @@ func RegisterScalarArithmetic(reg FunctionRegistry) {
//
// ArithmeticOptions specifies whether or not to check for overflows,
// performance is faster if not explicitly checking for overflows but
-// will error on an overflow if CheckOverflow is true.
+// will error on an overflow if NoCheckOverflow is false (default).
func Add(ctx context.Context, opts ArithmeticOptions, left, right Datum) (Datum, error) {
- fn := "add"
- if opts.NoCheckOverflow {
- fn = "add_unchecked"
- }
- return CallFunction(ctx, fn, nil, left, right)
+ return impl(ctx, "add", opts, left, right)
}
// Sub performs a subtraction between the passed in arguments (scalar or array)
@@ -250,11 +293,32 @@ func Add(ctx context.Context, opts ArithmeticOptions, left, right Datum) (Datum,
//
// ArithmeticOptions specifies whether or not to check for overflows,
// performance is faster if not explicitly checking for overflows but
-// will error on an overflow if CheckOverflow is true.
+// will error on an overflow if NoCheckOverflow is false (default).
func Subtract(ctx context.Context, opts ArithmeticOptions, left, right Datum) (Datum, error) {
- fn := "sub"
- if opts.NoCheckOverflow {
- fn = "sub_unchecked"
- }
- return CallFunction(ctx, fn, nil, left, right)
+ return impl(ctx, "sub", opts, left, right)
+}
+
+// Multiply performs a multiplication between the passed in arguments (scalar or array)
+// and returns the result. If one argument is a scalar and the other is an
+// array, the scalar value is multiplied against each value of the array.
+//
+// ArithmeticOptions specifies whether or not to check for overflows,
+// performance is faster if not explicitly checking for overflows but
+// will error on an overflow if NoCheckOverflow is false (default).
+func Multiply(ctx context.Context, opts ArithmeticOptions, left, right Datum) (Datum, error) {
+ return impl(ctx, "multiply", opts, left, right)
+}
+
+// Divide performs a division between the passed in arguments (scalar or array)
+// and returns the result. If one argument is a scalar and the other is an
+// array, the scalar value is used with each value of the array.
+//
+// ArithmeticOptions specifies whether or not to check for overflows,
+// performance is faster if not explicitly checking for overflows but
+// will error on an overflow if NoCheckOverflow is false (default).
+//
+// Will error on divide by zero regardless of whether or not checking for
+// overflows.
+func Divide(ctx context.Context, opts ArithmeticOptions, left, right Datum) (Datum, error) {
+ return impl(ctx, "divide", opts, left, right)
}
diff --git a/go/arrow/compute/arithmetic_test.go b/go/arrow/compute/arithmetic_test.go
index 5396188883..12e837a811 100644
--- a/go/arrow/compute/arithmetic_test.go
+++ b/go/arrow/compute/arithmetic_test.go
@@ -62,15 +62,15 @@ type binaryArithmeticFunc = func(context.Context, compute.ArithmeticOptions, com
type binaryFunc = func(left, right compute.Datum) (compute.Datum, error)
-func assertScalarEquals(t *testing.T, expected, actual scalar.Scalar) {
- assert.Truef(t, scalar.Equals(expected, actual), "expected: %s\ngot: %s", expected, actual)
+func assertScalarEquals(t *testing.T, expected, actual scalar.Scalar, opt ...scalar.EqualOption) {
+ assert.Truef(t, scalar.ApproxEquals(expected, actual, opt...), "expected: %s\ngot: %s", expected, actual)
}
-func assertBinop(t *testing.T, fn binaryFunc, left, right, expected arrow.Array) {
+func assertBinop(t *testing.T, fn binaryFunc, left, right, expected arrow.Array, opt []array.EqualOption, scalarOpt []scalar.EqualOption) {
actual, err := fn(&compute.ArrayDatum{Value: left.Data()}, &compute.ArrayDatum{Value: right.Data()})
require.NoError(t, err)
defer actual.Release()
- assertDatumsEqual(t, &compute.ArrayDatum{Value: expected.Data()}, actual)
+ assertDatumsEqual(t, &compute.ArrayDatum{Value: expected.Data()}, actual, opt...)
// also check (Scalar, Scalar) operations
for i := 0; i < expected.Len(); i++ {
@@ -81,7 +81,7 @@ func assertBinop(t *testing.T, fn binaryFunc, left, right, expected arrow.Array)
actual, err := fn(&compute.ScalarDatum{Value: lhs}, &compute.ScalarDatum{Value: rhs})
assert.NoError(t, err)
- assertScalarEquals(t, s, actual.(*compute.ScalarDatum).Value)
+ assertScalarEquals(t, s, actual.(*compute.ScalarDatum).Value, scalarOpt...)
}
}
@@ -146,14 +146,21 @@ func (b *Float16BinaryFuncTestSuite) TestSub() {
type BinaryArithmeticSuite[T exec.NumericTypes] struct {
BinaryFuncTestSuite
- opts compute.ArithmeticOptions
- min, max T
+ opts compute.ArithmeticOptions
+ min, max T
+ equalOpts []array.EqualOption
+ scalarEqualOpts []scalar.EqualOption
}
func (BinaryArithmeticSuite[T]) DataType() arrow.DataType {
return exec.GetDataType[T]()
}
+func (b *BinaryArithmeticSuite[T]) setNansEqual(val bool) {
+ b.equalOpts = []array.EqualOption{array.WithNaNsEqual(val)}
+ b.scalarEqualOpts = []scalar.EqualOption{scalar.WithNaNsEqual(val)}
+}
+
func (b *BinaryArithmeticSuite[T]) SetupTest() {
b.BinaryFuncTestSuite.SetupTest()
b.opts.NoCheckOverflow = false
@@ -209,7 +216,7 @@ func (b *BinaryArithmeticSuite[T]) assertBinopArrScalar(fn binaryArithmeticFunc,
actual, err := fn(b.ctx, b.opts, &compute.ArrayDatum{Value: left.Data()}, &compute.ScalarDatum{Value: rhs})
b.NoError(err)
defer actual.Release()
- assertDatumsEqual(b.T(), &compute.ArrayDatum{Value: exp.Data()}, actual)
+ assertDatumsEqual(b.T(), &compute.ArrayDatum{Value: exp.Data()}, actual, b.equalOpts...)
}
func (b *BinaryArithmeticSuite[T]) assertBinop(fn binaryArithmeticFunc, lhs, rhs, expected string) {
@@ -222,11 +229,11 @@ func (b *BinaryArithmeticSuite[T]) assertBinop(fn binaryArithmeticFunc, lhs, rhs
assertBinop(b.T(), func(left, right compute.Datum) (compute.Datum, error) {
return fn(b.ctx, b.opts, left, right)
- }, left, right, exp)
+ }, left, right, exp, b.equalOpts, b.scalarEqualOpts)
}
func (b *BinaryArithmeticSuite[T]) setOverflowCheck(value bool) {
- b.opts.NoCheckOverflow = value
+ b.opts.NoCheckOverflow = !value
}
func (b *BinaryArithmeticSuite[T]) assertBinopErr(fn binaryArithmeticFunc, lhs, rhs, expectedMsg string) {
@@ -267,7 +274,7 @@ func (b *BinaryArithmeticSuite[T]) TestAdd() {
b.assertBinopArrScalar(compute.Add, `[1, 2]`, b.makeNullScalar(), `[null, null]`)
b.assertBinopArrScalar(compute.Add, `[null, 2]`, b.makeNullScalar(), `[null, null]`)
- if !arrow.IsFloating(b.DataType().ID()) && !overflow {
+ if !arrow.IsFloating(b.DataType().ID()) && overflow {
val := fmt.Sprintf("[%v]", b.max)
b.assertBinopErr(compute.Add, val, val, "overflow")
}
@@ -303,7 +310,7 @@ func (b *BinaryArithmeticSuite[T]) TestSub() {
b.assertBinopArrScalar(compute.Subtract, `[1, 2]`, b.makeNullScalar(), `[null, null]`)
b.assertBinopArrScalar(compute.Subtract, `[null, 2]`, b.makeNullScalar(), `[null, null]`)
- if !arrow.IsFloating(b.DataType().ID()) && !overflow {
+ if !arrow.IsFloating(b.DataType().ID()) && overflow {
b.assertBinopErr(compute.Subtract, fmt.Sprintf("[%v]", b.min), fmt.Sprintf("[%v]", b.max), "overflow")
}
})
@@ -311,6 +318,92 @@ func (b *BinaryArithmeticSuite[T]) TestSub() {
})
}
+func (b *BinaryArithmeticSuite[T]) TestMultiply() {
+ b.Run(b.DataType().String(), func() {
+ for _, overflow := range []bool{false, true} {
+ b.Run(fmt.Sprintf("no_overflow_check=%t", overflow), func() {
+ b.setOverflowCheck(overflow)
+
+ b.assertBinop(compute.Multiply, `[]`, `[]`, `[]`)
+ b.assertBinop(compute.Multiply, `[3, 2, 6]`, `[1, 0, 2]`, `[3, 0, 12]`)
+ // nulls on one side
+ b.assertBinop(compute.Multiply, `[null, 2, null]`, `[4, 5, 6]`, `[null, 10, null]`)
+ b.assertBinop(compute.Multiply, `[4, 5, 6]`, `[null, 2, null]`, `[null, 10, null]`)
+ // nulls on both sides
+ b.assertBinop(compute.Multiply, `[null, 2, 3]`, `[4, 5, null]`, `[null, 10, null]`)
+ // all nulls
+ b.assertBinop(compute.Multiply, `[null]`, `[null]`, `[null]`)
+
+ // scalar on left
+ b.assertBinopScalarValArr(compute.Multiply, 3, `[4, 5]`, `[12, 15]`)
+ b.assertBinopScalarValArr(compute.Multiply, 3, `[null, 5]`, `[null, 15]`)
+ b.assertBinopScalarArr(compute.Multiply, b.makeNullScalar(), `[1, 2]`, `[null, null]`)
+ b.assertBinopScalarArr(compute.Multiply, b.makeNullScalar(), `[null, 2]`, `[null, null]`)
+ // scalar on right
+ b.assertBinopArrScalarVal(compute.Multiply, `[4, 5]`, 3, `[12, 15]`)
+ b.assertBinopArrScalarVal(compute.Multiply, `[null, 5]`, 3, `[null, 15]`)
+ b.assertBinopArrScalar(compute.Multiply, `[1, 2]`, b.makeNullScalar(), `[null, null]`)
+ b.assertBinopArrScalar(compute.Multiply, `[null, 2]`, b.makeNullScalar(), `[null, null]`)
+ })
+ }
+ })
+}
+
+func (b *BinaryArithmeticSuite[T]) TestDiv() {
+ b.Run(b.DataType().String(), func() {
+ for _, overflow := range []bool{false, true} {
+ b.Run(fmt.Sprintf("no_overflow_check=%t", overflow), func() {
+ b.setOverflowCheck(overflow)
+
+ // empty arrays
+ b.assertBinop(compute.Divide, `[]`, `[]`, `[]`)
+ // ordinary arrays
+ b.assertBinop(compute.Divide, `[3, 2, 6]`, `[1, 1, 2]`, `[3, 2, 3]`)
+ // with nulls
+ b.assertBinop(compute.Divide, `[null, 10, 30, null, 20]`, `[1, 5, 2, 5, 10]`, `[null, 2, 15, null, 2]`)
+ if !arrow.IsFloating(b.DataType().ID()) {
+ // scalar divided by array
+ b.assertBinopScalarValArr(compute.Divide, 33, `[null, 1, 3, null, 2]`, `[null, 33, 11, null, 16]`)
+ // array divided by scalar
+ b.assertBinopArrScalarVal(compute.Divide, `[null, 10, 30, null, 2]`, 3, `[null, 3, 10, null, 0]`)
+ // scalar divided by scalar
+ b.assertBinopScalars(compute.Divide, 16, 7, 2)
+ } else {
+ b.assertBinop(compute.Divide, `[3.4, 0.64, 1.28]`, `[1, 2, 4]`, `[3.4, 0.32, 0.32]`)
+ b.assertBinop(compute.Divide, `[null, 1, 3.3, null, 2]`, `[1, 4, 2, 5, 0.1]`, `[null, 0.25, 1.65, null, 20]`)
+ b.assertBinopScalarValArr(compute.Divide, 10, `[null, 1, 2.5, null, 2, 5]`, `[null, 10, 4, null, 5, 2]`)
+ b.assertBinopArrScalarVal(compute.Divide, `[null, 1, 2.5, null, 2, 5]`, 10, `[null, 0.1, 0.25, null, 0.2, 0.5]`)
+
+ b.assertBinop(compute.Divide, `[3.4, "Inf", "-Inf"]`, `[1, 2, 3]`, `[3.4, "Inf", "-Inf"]`)
+ b.setNansEqual(true)
+ b.assertBinop(compute.Divide, `[3.4, "NaN", 2.0]`, `[1, 2, 2.0]`, `[3.4, "NaN", 1.0]`)
+ b.assertBinopScalars(compute.Divide, 21, 3, 7)
+ }
+ })
+ }
+ })
+}
+
+func (b *BinaryArithmeticSuite[T]) TestDivideByZero() {
+ if !arrow.IsFloating(b.DataType().ID()) {
+ for _, checkOverflow := range []bool{false, true} {
+ b.setOverflowCheck(checkOverflow)
+ b.assertBinopErr(compute.Divide, `[3, 2, 6]`, `[1, 1, 0]`, "divide by zero")
+ }
+ } else {
+ b.setOverflowCheck(true)
+ b.assertBinopErr(compute.Divide, `[3, 2, 6]`, `[1, 1, 0]`, "divide by zero")
+ b.assertBinopErr(compute.Divide, `[3, 2, 0]`, `[1, 1, 0]`, "divide by zero")
+ b.assertBinopErr(compute.Divide, `[3, 2, -6]`, `[1, 1, 0]`, "divide by zero")
+
+ b.setOverflowCheck(false)
+ b.setNansEqual(true)
+ b.assertBinop(compute.Divide, `[3, 2, 6]`, `[1, 1, 0]`, `[3, 2, "Inf"]`)
+ b.assertBinop(compute.Divide, `[3, 2, 0]`, `[1, 1, 0]`, `[3, 2, "NaN"]`)
+ b.assertBinop(compute.Divide, `[3, 2, -6]`, `[1, 1, 0]`, `[3, 2, "-Inf"]`)
+ }
+}
+
func TestBinaryArithmetic(t *testing.T) {
suite.Run(t, &BinaryArithmeticSuite[int8]{min: math.MinInt8, max: math.MaxInt8})
suite.Run(t, &BinaryArithmeticSuite[uint8]{min: 0, max: math.MaxUint8})
@@ -425,66 +518,159 @@ type DecimalBinaryArithmeticSuite struct {
func (ds *DecimalBinaryArithmeticSuite) TestDispatchBest() {
// decimal, floating point
- for _, fn := range []string{"add", "sub"} {
- for _, suffix := range []string{"", "_unchecked"} {
- fn += suffix
-
- CheckDispatchBest(ds.T(), fn, []arrow.DataType{
- &arrow.Decimal128Type{Precision: 1, Scale: 0},
- arrow.PrimitiveTypes.Float32}, []arrow.DataType{
- arrow.PrimitiveTypes.Float32, arrow.PrimitiveTypes.Float32})
- CheckDispatchBest(ds.T(), fn, []arrow.DataType{
- &arrow.Decimal256Type{Precision: 1, Scale: 0}, arrow.PrimitiveTypes.Float64},
- []arrow.DataType{arrow.PrimitiveTypes.Float64, arrow.PrimitiveTypes.Float64})
- CheckDispatchBest(ds.T(), fn, []arrow.DataType{
- arrow.PrimitiveTypes.Float32, &arrow.Decimal256Type{Precision: 1, Scale: 0}},
- []arrow.DataType{arrow.PrimitiveTypes.Float32, arrow.PrimitiveTypes.Float32})
- CheckDispatchBest(ds.T(), fn, []arrow.DataType{
- arrow.PrimitiveTypes.Float64, &arrow.Decimal128Type{Precision: 1, Scale: 0}},
- []arrow.DataType{arrow.PrimitiveTypes.Float64, arrow.PrimitiveTypes.Float64})
+ ds.Run("dec/floatingpoint", func() {
+ for _, fn := range []string{"add", "sub", "multiply", "divide"} {
+ for _, suffix := range []string{"", "_unchecked"} {
+ fn += suffix
+ ds.Run(fn, func() {
+
+ CheckDispatchBest(ds.T(), fn, []arrow.DataType{
+ &arrow.Decimal128Type{Precision: 1, Scale: 0},
+ arrow.PrimitiveTypes.Float32}, []arrow.DataType{
+ arrow.PrimitiveTypes.Float32, arrow.PrimitiveTypes.Float32})
+ CheckDispatchBest(ds.T(), fn, []arrow.DataType{
+ &arrow.Decimal256Type{Precision: 1, Scale: 0}, arrow.PrimitiveTypes.Float64},
+ []arrow.DataType{arrow.PrimitiveTypes.Float64, arrow.PrimitiveTypes.Float64})
+ CheckDispatchBest(ds.T(), fn, []arrow.DataType{
+ arrow.PrimitiveTypes.Float32, &arrow.Decimal256Type{Precision: 1, Scale: 0}},
+ []arrow.DataType{arrow.PrimitiveTypes.Float32, arrow.PrimitiveTypes.Float32})
+ CheckDispatchBest(ds.T(), fn, []arrow.DataType{
+ arrow.PrimitiveTypes.Float64, &arrow.Decimal128Type{Precision: 1, Scale: 0}},
+ []arrow.DataType{arrow.PrimitiveTypes.Float64, arrow.PrimitiveTypes.Float64})
+ })
+ }
}
- }
+ })
// decimal, decimal => decimal
// decimal, integer => decimal
- for _, fn := range []string{"add", "sub"} {
+ ds.Run("dec/dec_int", func() {
+ for _, fn := range []string{"add", "sub"} {
+ for _, suffix := range []string{"", "_unchecked"} {
+ fn += suffix
+ ds.Run(fn, func() {
+ CheckDispatchBest(ds.T(), fn, []arrow.DataType{
+ arrow.PrimitiveTypes.Int64, &arrow.Decimal128Type{Precision: 1, Scale: 0}},
+ []arrow.DataType{&arrow.Decimal128Type{Precision: 19, Scale: 0},
+ &arrow.Decimal128Type{Precision: 1, Scale: 0}})
+ CheckDispatchBest(ds.T(), fn, []arrow.DataType{
+ &arrow.Decimal128Type{Precision: 1, Scale: 0}, arrow.PrimitiveTypes.Int64},
+ []arrow.DataType{&arrow.Decimal128Type{Precision: 1, Scale: 0},
+ &arrow.Decimal128Type{Precision: 19, Scale: 0}})
+
+ CheckDispatchBest(ds.T(), fn, []arrow.DataType{
+ &arrow.Decimal128Type{Precision: 2, Scale: 1}, &arrow.Decimal128Type{Precision: 2, Scale: 1}},
+ []arrow.DataType{&arrow.Decimal128Type{Precision: 2, Scale: 1},
+ &arrow.Decimal128Type{Precision: 2, Scale: 1}})
+ CheckDispatchBest(ds.T(), fn, []arrow.DataType{
+ &arrow.Decimal256Type{Precision: 2, Scale: 1}, &arrow.Decimal256Type{Precision: 2, Scale: 1}},
+ []arrow.DataType{&arrow.Decimal256Type{Precision: 2, Scale: 1},
+ &arrow.Decimal256Type{Precision: 2, Scale: 1}})
+ CheckDispatchBest(ds.T(), fn, []arrow.DataType{
+ &arrow.Decimal128Type{Precision: 2, Scale: 1}, &arrow.Decimal256Type{Precision: 2, Scale: 1}},
+ []arrow.DataType{&arrow.Decimal256Type{Precision: 2, Scale: 1},
+ &arrow.Decimal256Type{Precision: 2, Scale: 1}})
+ CheckDispatchBest(ds.T(), fn, []arrow.DataType{
+ &arrow.Decimal256Type{Precision: 2, Scale: 1}, &arrow.Decimal128Type{Precision: 2, Scale: 1}},
+ []arrow.DataType{&arrow.Decimal256Type{Precision: 2, Scale: 1},
+ &arrow.Decimal256Type{Precision: 2, Scale: 1}})
+
+ CheckDispatchBest(ds.T(), fn, []arrow.DataType{
+ &arrow.Decimal128Type{Precision: 2, Scale: 0}, &arrow.Decimal128Type{Precision: 2, Scale: 1}},
+ []arrow.DataType{&arrow.Decimal128Type{Precision: 3, Scale: 1},
+ &arrow.Decimal128Type{Precision: 2, Scale: 1}})
+ CheckDispatchBest(ds.T(), fn, []arrow.DataType{
+ &arrow.Decimal128Type{Precision: 2, Scale: 1}, &arrow.Decimal128Type{Precision: 2, Scale: 0}},
+ []arrow.DataType{&arrow.Decimal128Type{Precision: 2, Scale: 1},
+ &arrow.Decimal128Type{Precision: 3, Scale: 1}})
+ })
+ }
+ }
+ })
+
+ {
+ fn := "multiply"
for _, suffix := range []string{"", "_unchecked"} {
fn += suffix
+ ds.Run(fn, func() {
+ CheckDispatchBest(ds.T(), fn, []arrow.DataType{
+ arrow.PrimitiveTypes.Int64, &arrow.Decimal128Type{Precision: 1}},
+ []arrow.DataType{&arrow.Decimal128Type{Precision: 19},
+ &arrow.Decimal128Type{Precision: 1}})
+ CheckDispatchBest(ds.T(), fn, []arrow.DataType{
+ &arrow.Decimal128Type{Precision: 1}, arrow.PrimitiveTypes.Int64},
+ []arrow.DataType{&arrow.Decimal128Type{Precision: 1},
+ &arrow.Decimal128Type{Precision: 19}})
+
+ CheckDispatchBest(ds.T(), fn, []arrow.DataType{
+ &arrow.Decimal128Type{Precision: 2, Scale: 1}, &arrow.Decimal128Type{Precision: 2, Scale: 1}},
+ []arrow.DataType{&arrow.Decimal128Type{Precision: 2, Scale: 1},
+ &arrow.Decimal128Type{Precision: 2, Scale: 1}})
+ CheckDispatchBest(ds.T(), fn, []arrow.DataType{
+ &arrow.Decimal256Type{Precision: 2, Scale: 1}, &arrow.Decimal256Type{Precision: 2, Scale: 1}},
+ []arrow.DataType{&arrow.Decimal256Type{Precision: 2, Scale: 1},
+ &arrow.Decimal256Type{Precision: 2, Scale: 1}})
+ CheckDispatchBest(ds.T(), fn, []arrow.DataType{
+ &arrow.Decimal128Type{Precision: 2, Scale: 1}, &arrow.Decimal256Type{Precision: 2, Scale: 1}},
+ []arrow.DataType{&arrow.Decimal256Type{Precision: 2, Scale: 1},
+ &arrow.Decimal256Type{Precision: 2, Scale: 1}})
+ CheckDispatchBest(ds.T(), fn, []arrow.DataType{
+ &arrow.Decimal256Type{Precision: 2, Scale: 1}, &arrow.Decimal128Type{Precision: 2, Scale: 1}},
+ []arrow.DataType{&arrow.Decimal256Type{Precision: 2, Scale: 1},
+ &arrow.Decimal256Type{Precision: 2, Scale: 1}})
+
+ CheckDispatchBest(ds.T(), fn, []arrow.DataType{
+ &arrow.Decimal128Type{Precision: 2, Scale: 0}, &arrow.Decimal128Type{Precision: 2, Scale: 1}},
+ []arrow.DataType{&arrow.Decimal128Type{Precision: 2, Scale: 0},
+ &arrow.Decimal128Type{Precision: 2, Scale: 1}})
+ CheckDispatchBest(ds.T(), fn, []arrow.DataType{
+ &arrow.Decimal128Type{Precision: 2, Scale: 1}, &arrow.Decimal128Type{Precision: 2, Scale: 0}},
+ []arrow.DataType{&arrow.Decimal128Type{Precision: 2, Scale: 1},
+ &arrow.Decimal128Type{Precision: 2, Scale: 0}})
+ })
+ }
+ }
- CheckDispatchBest(ds.T(), fn, []arrow.DataType{
- arrow.PrimitiveTypes.Int64, &arrow.Decimal128Type{Precision: 1, Scale: 0}},
- []arrow.DataType{&arrow.Decimal128Type{Precision: 19, Scale: 0},
- &arrow.Decimal128Type{Precision: 1, Scale: 0}})
- CheckDispatchBest(ds.T(), fn, []arrow.DataType{
- &arrow.Decimal128Type{Precision: 1, Scale: 0}, arrow.PrimitiveTypes.Int64},
- []arrow.DataType{&arrow.Decimal128Type{Precision: 1, Scale: 0},
- &arrow.Decimal128Type{Precision: 19, Scale: 0}})
-
- CheckDispatchBest(ds.T(), fn, []arrow.DataType{
- &arrow.Decimal128Type{Precision: 2, Scale: 1}, &arrow.Decimal128Type{Precision: 2, Scale: 1}},
- []arrow.DataType{&arrow.Decimal128Type{Precision: 2, Scale: 1},
- &arrow.Decimal128Type{Precision: 2, Scale: 1}})
- CheckDispatchBest(ds.T(), fn, []arrow.DataType{
- &arrow.Decimal256Type{Precision: 2, Scale: 1}, &arrow.Decimal256Type{Precision: 2, Scale: 1}},
- []arrow.DataType{&arrow.Decimal256Type{Precision: 2, Scale: 1},
- &arrow.Decimal256Type{Precision: 2, Scale: 1}})
- CheckDispatchBest(ds.T(), fn, []arrow.DataType{
- &arrow.Decimal128Type{Precision: 2, Scale: 1}, &arrow.Decimal256Type{Precision: 2, Scale: 1}},
- []arrow.DataType{&arrow.Decimal256Type{Precision: 2, Scale: 1},
- &arrow.Decimal256Type{Precision: 2, Scale: 1}})
- CheckDispatchBest(ds.T(), fn, []arrow.DataType{
- &arrow.Decimal256Type{Precision: 2, Scale: 1}, &arrow.Decimal128Type{Precision: 2, Scale: 1}},
- []arrow.DataType{&arrow.Decimal256Type{Precision: 2, Scale: 1},
- &arrow.Decimal256Type{Precision: 2, Scale: 1}})
-
- CheckDispatchBest(ds.T(), fn, []arrow.DataType{
- &arrow.Decimal128Type{Precision: 2, Scale: 0}, &arrow.Decimal128Type{Precision: 2, Scale: 1}},
- []arrow.DataType{&arrow.Decimal128Type{Precision: 3, Scale: 1},
- &arrow.Decimal128Type{Precision: 2, Scale: 1}})
- CheckDispatchBest(ds.T(), fn, []arrow.DataType{
- &arrow.Decimal128Type{Precision: 2, Scale: 1}, &arrow.Decimal128Type{Precision: 2, Scale: 0}},
- []arrow.DataType{&arrow.Decimal128Type{Precision: 2, Scale: 1},
- &arrow.Decimal128Type{Precision: 3, Scale: 1}})
+ {
+ fn := "divide"
+ for _, suffix := range []string{"", "_unchecked"} {
+ fn += suffix
+ ds.Run(fn, func() {
+ CheckDispatchBest(ds.T(), fn, []arrow.DataType{
+ arrow.PrimitiveTypes.Int64, &arrow.Decimal128Type{Precision: 1, Scale: 0}},
+ []arrow.DataType{&arrow.Decimal128Type{Precision: 23, Scale: 4},
+ &arrow.Decimal128Type{Precision: 1, Scale: 0}})
+ CheckDispatchBest(ds.T(), fn, []arrow.DataType{
+ &arrow.Decimal128Type{Precision: 1, Scale: 0}, arrow.PrimitiveTypes.Int64},
+ []arrow.DataType{&arrow.Decimal128Type{Precision: 21, Scale: 20},
+ &arrow.Decimal128Type{Precision: 19, Scale: 0}})
+
+ CheckDispatchBest(ds.T(), fn, []arrow.DataType{
+ &arrow.Decimal128Type{Precision: 2, Scale: 1}, &arrow.Decimal128Type{Precision: 2, Scale: 1}},
+ []arrow.DataType{&arrow.Decimal128Type{Precision: 6, Scale: 5},
+ &arrow.Decimal128Type{Precision: 2, Scale: 1}})
+ CheckDispatchBest(ds.T(), fn, []arrow.DataType{
+ &arrow.Decimal256Type{Precision: 2, Scale: 1}, &arrow.Decimal256Type{Precision: 2, Scale: 1}},
+ []arrow.DataType{&arrow.Decimal256Type{Precision: 6, Scale: 5},
+ &arrow.Decimal256Type{Precision: 2, Scale: 1}})
+ CheckDispatchBest(ds.T(), fn, []arrow.DataType{
+ &arrow.Decimal128Type{Precision: 2, Scale: 1}, &arrow.Decimal256Type{Precision: 2, Scale: 1}},
+ []arrow.DataType{&arrow.Decimal256Type{Precision: 6, Scale: 5},
+ &arrow.Decimal256Type{Precision: 2, Scale: 1}})
+ CheckDispatchBest(ds.T(), fn, []arrow.DataType{
+ &arrow.Decimal256Type{Precision: 2, Scale: 1}, &arrow.Decimal128Type{Precision: 2, Scale: 1}},
+ []arrow.DataType{&arrow.Decimal256Type{Precision: 6, Scale: 5},
+ &arrow.Decimal256Type{Precision: 2, Scale: 1}})
+
+ CheckDispatchBest(ds.T(), fn, []arrow.DataType{
+ &arrow.Decimal128Type{Precision: 2, Scale: 0}, &arrow.Decimal128Type{Precision: 2, Scale: 1}},
+ []arrow.DataType{&arrow.Decimal128Type{Precision: 7, Scale: 5},
+ &arrow.Decimal128Type{Precision: 2, Scale: 1}})
+ CheckDispatchBest(ds.T(), fn, []arrow.DataType{
+ &arrow.Decimal128Type{Precision: 2, Scale: 1}, &arrow.Decimal128Type{Precision: 2, Scale: 0}},
+ []arrow.DataType{&arrow.Decimal128Type{Precision: 5, Scale: 4},
+ &arrow.Decimal128Type{Precision: 2, Scale: 0}})
+ })
}
}
}
@@ -537,7 +723,7 @@ func (ds *DecimalBinaryArithmeticSuite) TestAddSubtractDec256() {
strings.NewReader(`[
"-2.00000000000000000001",
"2469135780.24691357800000000000",
- "-9876549999.641975555509876543212",
+ "-9876549999.64197555550987654321",
"-99999999989999999999.99999999990000000001"
]`))
defer subtracted.Release()
@@ -606,6 +792,191 @@ func (ds *DecimalBinaryArithmeticSuite) TestAddSubScalars() {
})
}
+func (ds *DecimalBinaryArithmeticSuite) TestMultiply() {
+ ds.Run("array x array, decimal128", func() {
+ left, _, err := array.FromJSON(ds.mem, &arrow.Decimal128Type{Precision: 20, Scale: 10},
+ strings.NewReader(`["1234567890.1234567890", "-0.0000000001", "-9999999999.9999999999"]`))
+ ds.Require().NoError(err)
+ defer left.Release()
+ right, _, err := array.FromJSON(ds.mem, &arrow.Decimal128Type{Precision: 13, Scale: 3},
+ strings.NewReader(`["1234567890.123", "0.001", "-9999999999.999"]`))
+ ds.Require().NoError(err)
+ defer right.Release()
+ expected, _, err := array.FromJSON(ds.mem, &arrow.Decimal128Type{Precision: 34, Scale: 13},
+ strings.NewReader(`["1524157875323319737.98709039504701", "-0.0000000000001", "99999999999989999999.0000000000001"]`))
+ ds.Require().NoError(err)
+ defer expected.Release()
+
+ checkScalarBinary(ds.T(), "multiply_unchecked", &compute.ArrayDatum{left.Data()}, &compute.ArrayDatum{right.Data()}, &compute.ArrayDatum{expected.Data()}, nil)
+ })
+
+ ds.Run("array x array decimal256", func() {
+ left, _, err := array.FromJSON(ds.mem, &arrow.Decimal256Type{Precision: 30, Scale: 3},
+ strings.NewReader(`["123456789012345678901234567.890", "0.000"]`))
+ ds.Require().NoError(err)
+ defer left.Release()
+ right, _, err := array.FromJSON(ds.mem, &arrow.Decimal256Type{Precision: 20, Scale: 9},
+ strings.NewReader(`["-12345678901.234567890", "99999999999.999999999"]`))
+ ds.Require().NoError(err)
+ defer right.Release()
+ expected, _, err := array.FromJSON(ds.mem, &arrow.Decimal256Type{Precision: 51, Scale: 12},
+ strings.NewReader(`["-1524157875323883675034293577501905199.875019052100", "0.000000000000"]`))
+ ds.Require().NoError(err)
+ defer expected.Release()
+ checkScalarBinary(ds.T(), "multiply_unchecked", &compute.ArrayDatum{left.Data()}, &compute.ArrayDatum{right.Data()}, &compute.ArrayDatum{expected.Data()}, nil)
+ })
+
+ ds.Run("scalar x array", func() {
+ left, err := scalar.ParseScalar(&arrow.Decimal128Type{Precision: 3, Scale: 2}, "3.14")
+ ds.Require().NoError(err)
+ right, _, err := array.FromJSON(ds.mem, &arrow.Decimal128Type{Precision: 1, Scale: 0},
+ strings.NewReader(`["1", "2", "3", "4", "5"]`))
+ ds.Require().NoError(err)
+ defer right.Release()
+ expected, _, err := array.FromJSON(ds.mem, &arrow.Decimal128Type{Precision: 5, Scale: 2},
+ strings.NewReader(`["3.14", "6.28", "9.42", "12.56", "15.70"]`))
+ ds.Require().NoError(err)
+ defer expected.Release()
+
+ leftDatum, rightDatum := &compute.ScalarDatum{left}, &compute.ArrayDatum{right.Data()}
+ expDatum := &compute.ArrayDatum{expected.Data()}
+
+ checkScalarBinary(ds.T(), "multiply_unchecked", leftDatum, rightDatum, expDatum, nil)
+ checkScalarBinary(ds.T(), "multiply_unchecked", rightDatum, leftDatum, expDatum, nil)
+ })
+
+ ds.Run("scalar x scalar", func() {
+ left, err := scalar.ParseScalar(&arrow.Decimal128Type{Precision: 1}, "1")
+ ds.Require().NoError(err)
+ right, err := scalar.ParseScalar(&arrow.Decimal128Type{Precision: 1}, "1")
+ ds.Require().NoError(err)
+ expected, err := scalar.ParseScalar(&arrow.Decimal128Type{Precision: 3}, "1")
+ ds.Require().NoError(err)
+ checkScalarBinary(ds.T(), "multiply_unchecked", compute.NewDatum(left), compute.NewDatum(right), compute.NewDatum(expected), nil)
+ })
+
+ ds.Run("decimal128 x decimal256", func() {
+ left, _ := scalar.ParseScalar(&arrow.Decimal128Type{Precision: 3, Scale: 2}, "6.66")
+ right, _ := scalar.ParseScalar(&arrow.Decimal256Type{Precision: 3, Scale: 1}, "88.8")
+ expected, _ := scalar.ParseScalar(&arrow.Decimal256Type{Precision: 7, Scale: 3}, "591.408")
+ checkScalarBinary(ds.T(), "multiply_unchecked", compute.NewDatum(left), compute.NewDatum(right), compute.NewDatum(expected), nil)
+ checkScalarBinary(ds.T(), "multiply_unchecked", compute.NewDatum(right), compute.NewDatum(left), compute.NewDatum(expected), nil)
+ })
+
+ ds.Run("decimal x float", func() {
+ left, _ := scalar.ParseScalar(&arrow.Decimal128Type{Precision: 3}, "666")
+ right := scalar.MakeScalar(float64(888))
+ expected := scalar.MakeScalar(float64(591408))
+ checkScalarBinary(ds.T(), "multiply_unchecked", compute.NewDatum(left), compute.NewDatum(right), compute.NewDatum(expected), nil)
+ checkScalarBinary(ds.T(), "multiply_unchecked", compute.NewDatum(right), compute.NewDatum(left), compute.NewDatum(expected), nil)
+ })
+
+ ds.Run("decimal x integer", func() {
+ left, _ := scalar.ParseScalar(&arrow.Decimal128Type{Precision: 3}, "666")
+ right := scalar.MakeScalar(int64(888))
+ expected, _ := scalar.ParseScalar(&arrow.Decimal128Type{Precision: 23}, "591408")
+ checkScalarBinary(ds.T(), "multiply_unchecked", compute.NewDatum(left), compute.NewDatum(right), compute.NewDatum(expected), nil)
+ })
+}
+
+func (ds *DecimalBinaryArithmeticSuite) TestDivide() {
+ ds.Run("array / array, decimal128", func() {
+ left, _, err := array.FromJSON(ds.mem, &arrow.Decimal128Type{Precision: 13, Scale: 3},
+ strings.NewReader(`["1234567890.123", "0.001"]`))
+ ds.Require().NoError(err)
+ defer left.Release()
+ right, _, err := array.FromJSON(ds.mem, &arrow.Decimal128Type{Precision: 3, Scale: 0},
+ strings.NewReader(`["-987", "999"]`))
+ ds.Require().NoError(err)
+ defer right.Release()
+ expected, _, err := array.FromJSON(ds.mem, &arrow.Decimal128Type{Precision: 17, Scale: 7},
+ strings.NewReader(`["-1250828.6627386", "0.0000010"]`))
+ ds.Require().NoError(err)
+ defer expected.Release()
+
+ checkScalarBinary(ds.T(), "divide_unchecked", &compute.ArrayDatum{left.Data()}, &compute.ArrayDatum{right.Data()}, &compute.ArrayDatum{expected.Data()}, nil)
+ })
+
+ ds.Run("array / array decimal256", func() {
+ left, _, err := array.FromJSON(ds.mem, &arrow.Decimal256Type{Precision: 20, Scale: 10},
+ strings.NewReader(`["1234567890.1234567890", "9999999999.9999999999"]`))
+ ds.Require().NoError(err)
+ defer left.Release()
+ right, _, err := array.FromJSON(ds.mem, &arrow.Decimal256Type{Precision: 13, Scale: 3},
+ strings.NewReader(`["1234567890.123", "0.001"]`))
+ ds.Require().NoError(err)
+ defer right.Release()
+ expected, _, err := array.FromJSON(ds.mem, &arrow.Decimal256Type{Precision: 34, Scale: 21},
+ strings.NewReader(`["1.000000000000369999093", "9999999999999.999999900000000000000"]`))
+ ds.Require().NoError(err)
+ defer expected.Release()
+ checkScalarBinary(ds.T(), "divide_unchecked", &compute.ArrayDatum{left.Data()}, &compute.ArrayDatum{right.Data()}, &compute.ArrayDatum{expected.Data()}, nil)
+ })
+
+ ds.Run("scalar / array", func() {
+ left, err := scalar.ParseScalar(&arrow.Decimal128Type{Precision: 1, Scale: 0}, "1")
+ ds.Require().NoError(err)
+ right, _, err := array.FromJSON(ds.mem, &arrow.Decimal128Type{Precision: 1, Scale: 0},
+ strings.NewReader(`["1", "2", "3", "4"]`))
+ ds.Require().NoError(err)
+ defer right.Release()
+ leftDivRight, _, err := array.FromJSON(ds.mem, &arrow.Decimal128Type{Precision: 5, Scale: 4},
+ strings.NewReader(`["1.0000", "0.5000", "0.3333", "0.2500"]`))
+ ds.Require().NoError(err)
+ defer leftDivRight.Release()
+ rightDivLeft, _, err := array.FromJSON(ds.mem, &arrow.Decimal128Type{Precision: 5, Scale: 4},
+ strings.NewReader(`["1.0000", "2.0000", "3.0000", "4.0000"]`))
+ ds.Require().NoError(err)
+ defer rightDivLeft.Release()
+
+ leftDatum, rightDatum := &compute.ScalarDatum{left}, &compute.ArrayDatum{right.Data()}
+
+ checkScalarBinary(ds.T(), "divide_unchecked", leftDatum, rightDatum, &compute.ArrayDatum{leftDivRight.Data()}, nil)
+ checkScalarBinary(ds.T(), "divide_unchecked", rightDatum, leftDatum, &compute.ArrayDatum{rightDivLeft.Data()}, nil)
+ })
+
+ ds.Run("scalar / scalar", func() {
+ left, err := scalar.ParseScalar(&arrow.Decimal256Type{Precision: 6, Scale: 5}, "2.71828")
+ ds.Require().NoError(err)
+ right, err := scalar.ParseScalar(&arrow.Decimal256Type{Precision: 6, Scale: 5}, "3.14159")
+ ds.Require().NoError(err)
+ expected, err := scalar.ParseScalar(&arrow.Decimal256Type{Precision: 13, Scale: 7}, "0.8652561")
+ ds.Require().NoError(err)
+ checkScalarBinary(ds.T(), "divide_unchecked", compute.NewDatum(left), compute.NewDatum(right), compute.NewDatum(expected), nil)
+ })
+
+ ds.Run("decimal128 / decimal256", func() {
+ left, err := scalar.ParseScalar(&arrow.Decimal256Type{Precision: 6, Scale: 5}, "2.71828")
+ ds.Require().NoError(err)
+ right, err := scalar.ParseScalar(&arrow.Decimal128Type{Precision: 6, Scale: 5}, "3.14159")
+ ds.Require().NoError(err)
+ leftDivRight, err := scalar.ParseScalar(&arrow.Decimal256Type{Precision: 13, Scale: 7}, "0.8652561")
+ ds.Require().NoError(err)
+ rightDivLeft, err := scalar.ParseScalar(&arrow.Decimal256Type{Precision: 13, Scale: 7}, "1.1557271")
+ ds.Require().NoError(err)
+ checkScalarBinary(ds.T(), "divide_unchecked", compute.NewDatum(left), compute.NewDatum(right), compute.NewDatum(leftDivRight), nil)
+ checkScalarBinary(ds.T(), "divide_unchecked", compute.NewDatum(right), compute.NewDatum(left), compute.NewDatum(rightDivLeft), nil)
+ })
+
+ ds.Run("decimal / float", func() {
+ left, _ := scalar.ParseScalar(&arrow.Decimal128Type{Precision: 3}, "100")
+ right := scalar.MakeScalar(float64(50))
+ leftDivRight := scalar.MakeScalar(float64(2))
+ rightDivLeft := scalar.MakeScalar(float64(0.5))
+ checkScalarBinary(ds.T(), "divide_unchecked", compute.NewDatum(left), compute.NewDatum(right), compute.NewDatum(leftDivRight), nil)
+ checkScalarBinary(ds.T(), "divide_unchecked", compute.NewDatum(right), compute.NewDatum(left), compute.NewDatum(rightDivLeft), nil)
+ })
+
+ ds.Run("decimal / integer", func() {
+ left, _ := scalar.ParseScalar(&arrow.Decimal128Type{Precision: 3}, "100")
+ right := scalar.MakeScalar(int64(50))
+ leftDivRight, _ := scalar.ParseScalar(&arrow.Decimal128Type{Precision: 23, Scale: 20}, "2.0000000000000000000")
+ rightDivLeft, _ := scalar.ParseScalar(&arrow.Decimal128Type{Precision: 23, Scale: 4}, "0.5000")
+ checkScalarBinary(ds.T(), "divide_unchecked", compute.NewDatum(left), compute.NewDatum(right), compute.NewDatum(leftDivRight), nil)
+ checkScalarBinary(ds.T(), "divide_unchecked", compute.NewDatum(right), compute.NewDatum(left), compute.NewDatum(rightDivLeft), nil)
+ })
+}
+
type ScalarBinaryTemporalArithmeticSuite struct {
BinaryFuncTestSuite
}
diff --git a/go/arrow/compute/cast_test.go b/go/arrow/compute/cast_test.go
index 116774bfcd..5b6f17e13b 100644
--- a/go/arrow/compute/cast_test.go
+++ b/go/arrow/compute/cast_test.go
@@ -61,11 +61,11 @@ func getDatums[T any](inputs []T) []compute.Datum {
return out
}
-func assertArraysEqual(t *testing.T, expected, actual arrow.Array) bool {
- return assert.Truef(t, array.Equal(expected, actual), "expected: %s\ngot: %s", expected, actual)
+func assertArraysEqual(t *testing.T, expected, actual arrow.Array, opts ...array.EqualOption) bool {
+ return assert.Truef(t, array.ApproxEqual(expected, actual, opts...), "expected: %s\ngot: %s", expected, actual)
}
-func assertDatumsEqual(t *testing.T, expected, actual compute.Datum) {
+func assertDatumsEqual(t *testing.T, expected, actual compute.Datum, opts ...array.EqualOption) {
require.Equal(t, expected.Kind(), actual.Kind())
switch expected.Kind() {
@@ -76,7 +76,7 @@ func assertDatumsEqual(t *testing.T, expected, actual compute.Datum) {
case compute.KindArray:
want := expected.(*compute.ArrayDatum).MakeArray()
got := actual.(*compute.ArrayDatum).MakeArray()
- assertArraysEqual(t, want, got)
+ assertArraysEqual(t, want, got, opts...)
want.Release()
got.Release()
case compute.KindChunked:
diff --git a/go/arrow/compute/internal/kernels/_lib/base_arithmetic.cc b/go/arrow/compute/internal/kernels/_lib/base_arithmetic.cc
index 7b0093af8a..3a8f6a7e70 100644
--- a/go/arrow/compute/internal/kernels/_lib/base_arithmetic.cc
+++ b/go/arrow/compute/internal/kernels/_lib/base_arithmetic.cc
@@ -29,12 +29,16 @@
// worth the cost.
enum class optype : int8_t {
ADD,
- SUB,
+ SUB,
+ MUL,
+ DIV,
// this impl doesn't actually perform any overflow checks as we need
// to only run overflow checks on non-null entries
ADD_CHECKED,
- SUB_CHECKED,
+ SUB_CHECKED,
+ MUL_CHECKED,
+ DIV_CHECKED,
};
struct Add {
@@ -42,7 +46,7 @@ struct Add {
static constexpr T Call(Arg0 left, Arg1 right) {
if constexpr (is_arithmetic_v<T>)
return left + right;
- }
+ }
};
struct Sub {
@@ -60,18 +64,65 @@ struct AddChecked {
if constexpr(is_arithmetic_v<T>) {
return left + right;
}
- }
+ }
};
-struct SubChecked {
+struct SubChecked {
template <typename T, typename Arg0, typename Arg1>
static constexpr T Call(Arg0 left, Arg1 right) {
static_assert(is_same<T, Arg0>::value && is_same<T, Arg1>::value, "");
- if constexpr(is_arithmetic_v<T>) {
+ if constexpr(is_arithmetic_v<T>) {
return left - right;
}
- }
+ }
+};
+
+template <typename T>
+using maybe_make_unsigned = conditional_t<is_integral_v<T> && !is_same_v<T, bool>, make_unsigned_t<T>, T>;
+
+template <typename T, typename Unsigned = maybe_make_unsigned<T>>
+constexpr Unsigned to_unsigned(T signed_) {
+ return static_cast<Unsigned>(signed_);
+}
+
+struct Multiply {
+ static_assert(is_same_v<decltype(int8_t() * int8_t()), int32_t>, "");
+ static_assert(is_same_v<decltype(uint8_t() * uint8_t()), int32_t>, "");
+ static_assert(is_same_v<decltype(int16_t() * int16_t()), int32_t>, "");
+ static_assert(is_same_v<decltype(uint16_t() * uint16_t()), int32_t>, "");
+ static_assert(is_same_v<decltype(int32_t() * int32_t()), int32_t>, "");
+ static_assert(is_same_v<decltype(uint32_t() * uint32_t()), uint32_t>, "");
+ static_assert(is_same_v<decltype(int64_t() * int64_t()), int64_t>, "");
+ static_assert(is_same_v<decltype(uint64_t() * uint64_t()), uint64_t>, "");
+
+ template <typename T, typename Arg0, typename Arg1>
+ static constexpr T Call(Arg0 left, Arg1 right) {
+ static_assert(is_same_v<T, Arg0> && is_same_v<T, Arg1>, "");
+ if constexpr(is_floating_point_v<T>) {
+ return left * right;
+ } else if constexpr(is_unsigned_v<T> && !is_same_v<T, uint16_t>) {
+ return left * right;
+ } else if constexpr(is_signed_v<T> && !is_same_v<T, int16_t>) {
+ return to_unsigned(left) * to_unsigned(right);
+ } else if constexpr(is_same_v<T, int16_t> || is_same_v<T, uint16_t>) {
+ // multiplication of 16 bit integer types implicitly promotes to
+ // signed 32 bit integer. However, some inputs may overflow (which
+ // triggers undefined behavior). Therefore we first cast to 32 bit
+ // unsigned integers where overflow is well defined.
+ return static_cast<uint32_t>(left) * static_cast<uint32_t>(right);
+ }
+ }
+};
+
+struct MultiplyChecked {
+ template <typename T, typename Arg0, typename Arg1>
+ static constexpr T Call(Arg0 left, Arg1 right) {
+ static_assert(is_same_v<T, Arg0> && is_same_v<T, Arg1>, "");
+ if constexpr(is_arithmetic_v<T>) {
+ return left * right;
+ }
+ }
};
template <typename T, typename Op>
@@ -80,10 +131,10 @@ struct arithmetic_op_arr_arr_impl {
const T* left = reinterpret_cast<const T*>(in_left);
const T* right = reinterpret_cast<const T*>(in_right);
T* output = reinterpret_cast<T*>(out);
-
+
for (int i = 0; i < len; ++i) {
output[i] = Op::template Call<T, T, T>(left[i], right[i]);
- }
+ }
}
};
@@ -93,10 +144,10 @@ struct arithmetic_op_arr_scalar_impl {
const T* left = reinterpret_cast<const T*>(in_left);
const T right = *reinterpret_cast<const T*>(scalar_right);
T* output = reinterpret_cast<T*>(out);
-
+
for (int i = 0; i < len; ++i) {
output[i] = Op::template Call<T, T, T>(left[i], right);
- }
+ }
}
};
@@ -106,7 +157,7 @@ struct arithmetic_op_scalar_arr_impl {
const T left = *reinterpret_cast<const T*>(scalar_left);
const T* right = reinterpret_cast<const T*>(in_right);
T* output = reinterpret_cast<T*>(out);
-
+
for (int i = 0; i < len; ++i) {
output[i] = Op::template Call<T, T, T>(left, right[i]);
}
@@ -120,25 +171,25 @@ static inline void arithmetic_op(const int type, const void* in_left, const void
switch (intype) {
case arrtype::UINT8:
- return Impl<uint8_t, Op>::exec(in_left, in_right, output, len);
+ return Impl<uint8_t, Op>::exec(in_left, in_right, output, len);
case arrtype::INT8:
- return Impl<int8_t, Op>::exec(in_left, in_right, output, len);
+ return Impl<int8_t, Op>::exec(in_left, in_right, output, len);
case arrtype::UINT16:
- return Impl<uint16_t, Op>::exec(in_left, in_right, output, len);
+ return Impl<uint16_t, Op>::exec(in_left, in_right, output, len);
case arrtype::INT16:
- return Impl<int16_t, Op>::exec(in_left, in_right, output, len);
+ return Impl<int16_t, Op>::exec(in_left, in_right, output, len);
case arrtype::UINT32:
- return Impl<uint32_t, Op>::exec(in_left, in_right, output, len);
+ return Impl<uint32_t, Op>::exec(in_left, in_right, output, len);
case arrtype::INT32:
- return Impl<int32_t, Op>::exec(in_left, in_right, output, len);
+ return Impl<int32_t, Op>::exec(in_left, in_right, output, len);
case arrtype::UINT64:
- return Impl<uint64_t, Op>::exec(in_left, in_right, output, len);
+ return Impl<uint64_t, Op>::exec(in_left, in_right, output, len);
case arrtype::INT64:
- return Impl<int64_t, Op>::exec(in_left, in_right, output, len);
+ return Impl<int64_t, Op>::exec(in_left, in_right, output, len);
case arrtype::FLOAT32:
- return Impl<float, Op>::exec(in_left, in_right, output, len);
+ return Impl<float, Op>::exec(in_left, in_right, output, len);
case arrtype::FLOAT64:
- return Impl<double, Op>::exec(in_left, in_right, output, len);
+ return Impl<double, Op>::exec(in_left, in_right, output, len);
default:
break;
}
@@ -150,14 +201,20 @@ static inline void arithmetic_impl(const int type, const int8_t op, const void*
switch (opt) {
case optype::ADD:
- return arithmetic_op<Add, Impl>(type, in_left, in_right, out, len);
+ return arithmetic_op<Add, Impl>(type, in_left, in_right, out, len);
case optype::ADD_CHECKED:
- return arithmetic_op<AddChecked, Impl>(type, in_left, in_right, out, len);
+ return arithmetic_op<AddChecked, Impl>(type, in_left, in_right, out, len);
case optype::SUB:
- return arithmetic_op<Sub, Impl>(type, in_left, in_right, out, len);
+ return arithmetic_op<Sub, Impl>(type, in_left, in_right, out, len);
case optype::SUB_CHECKED:
- return arithmetic_op<SubChecked, Impl>(type, in_left, in_right, out, len);
- default:
+ return arithmetic_op<SubChecked, Impl>(type, in_left, in_right, out, len);
+ case optype::MUL:
+ return arithmetic_op<Multiply, Impl>(type, in_left, in_right, out, len);
+ case optype::MUL_CHECKED:
+ return arithmetic_op<MultiplyChecked, Impl>(type, in_left, in_right, out, len);
+ default:
+ // don't implement divide here as we can only divide on non-null entries
+ // so we can avoid dividing by zero
break;
}
}
diff --git a/go/arrow/compute/internal/kernels/_lib/base_arithmetic_avx2_amd64.s b/go/arrow/compute/internal/kernels/_lib/base_arithmetic_avx2_amd64.s
index 76355712b8..54bc7d754f 100644
--- a/go/arrow/compute/internal/kernels/_lib/base_arithmetic_avx2_amd64.s
+++ b/go/arrow/compute/internal/kernels/_lib/base_arithmetic_avx2_amd64.s
@@ -1,7 +1,27 @@
.text
.intel_syntax noprefix
.file "base_arithmetic.cc"
- .globl arithmetic_avx2 # -- Begin function arithmetic_avx2
+ .section .rodata.cst32,"aM",@progbits,32
+ .p2align 5 # -- Begin function arithmetic_avx2
+.LCPI0_0:
+ .short 255 # 0xff
+ .short 255 # 0xff
+ .short 255 # 0xff
+ .short 255 # 0xff
+ .short 255 # 0xff
+ .short 255 # 0xff
+ .short 255 # 0xff
+ .short 255 # 0xff
+ .short 255 # 0xff
+ .short 255 # 0xff
+ .short 255 # 0xff
+ .short 255 # 0xff
+ .short 255 # 0xff
+ .short 255 # 0xff
+ .short 255 # 0xff
+ .short 255 # 0xff
+ .text
+ .globl arithmetic_avx2
.p2align 4, 0x90
.type arithmetic_avx2,@function
arithmetic_avx2: # @arithmetic_avx2
@@ -9,577 +29,1653 @@ arithmetic_avx2: # @arithmetic_avx2
push rbp
mov rbp, rsp
and rsp, -8
- cmp sil, 1
- jg .LBB0_10
+ cmp sil, 3
+ jg .LBB0_11
# %bb.1:
test sil, sil
- je .LBB0_19
+ je .LBB0_21
# %bb.2:
cmp sil, 1
- jne .LBB0_537
+ je .LBB0_287
# %bb.3:
- cmp edi, 6
- jg .LBB0_291
+ cmp sil, 2
+ jne .LBB0_825
# %bb.4:
+ cmp edi, 6
+ jg .LBB0_559
+# %bb.5:
cmp edi, 3
- jle .LBB0_5
-# %bb.285:
+ jle .LBB0_6
+# %bb.553:
cmp edi, 4
- je .LBB0_324
-# %bb.286:
+ je .LBB0_602
+# %bb.554:
cmp edi, 5
- je .LBB0_336
-# %bb.287:
+ je .LBB0_614
+# %bb.555:
cmp edi, 6
- jne .LBB0_537
-# %bb.288:
+ jne .LBB0_825
+# %bb.556:
test r9d, r9d
- jle .LBB0_537
-# %bb.289:
+ jle .LBB0_825
+# %bb.557:
mov r10d, r9d
cmp r9d, 32
- jae .LBB0_348
-# %bb.290:
+ jae .LBB0_626
+# %bb.558:
xor esi, esi
- jmp .LBB0_353
-.LBB0_10:
- cmp sil, 2
- je .LBB0_152
-# %bb.11:
- cmp sil, 3
- jne .LBB0_537
+.LBB0_631:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rax, r10
+ and rax, 3
+ je .LBB0_633
+.LBB0_632: # =>This Inner Loop Header: Depth=1
+ mov edi, dword ptr [rcx + 4*rsi]
+ imul edi, dword ptr [rdx + 4*rsi]
+ mov dword ptr [r8 + 4*rsi], edi
+ add rsi, 1
+ add rax, -1
+ jne .LBB0_632
+.LBB0_633:
+ cmp r9, 3
+ jb .LBB0_825
+.LBB0_634: # =>This Inner Loop Header: Depth=1
+ mov eax, dword ptr [rcx + 4*rsi]
+ imul eax, dword ptr [rdx + 4*rsi]
+ mov dword ptr [r8 + 4*rsi], eax
+ mov eax, dword ptr [rcx + 4*rsi + 4]
+ imul eax, dword ptr [rdx + 4*rsi + 4]
+ mov dword ptr [r8 + 4*rsi + 4], eax
+ mov eax, dword ptr [rcx + 4*rsi + 8]
+ imul eax, dword ptr [rdx + 4*rsi + 8]
+ mov dword ptr [r8 + 4*rsi + 8], eax
+ mov eax, dword ptr [rcx + 4*rsi + 12]
+ imul eax, dword ptr [rdx + 4*rsi + 12]
+ mov dword ptr [r8 + 4*rsi + 12], eax
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB0_634
+ jmp .LBB0_825
+.LBB0_11:
+ cmp sil, 4
+ je .LBB0_154
# %bb.12:
- cmp edi, 6
- jg .LBB0_417
+ cmp sil, 5
+ je .LBB0_420
# %bb.13:
+ cmp sil, 6
+ jne .LBB0_825
+# %bb.14:
+ cmp edi, 6
+ jg .LBB0_695
+# %bb.15:
cmp edi, 3
- jle .LBB0_14
-# %bb.411:
+ jle .LBB0_16
+# %bb.689:
cmp edi, 4
- je .LBB0_450
-# %bb.412:
+ je .LBB0_738
+# %bb.690:
cmp edi, 5
- je .LBB0_462
-# %bb.413:
+ je .LBB0_750
+# %bb.691:
cmp edi, 6
- jne .LBB0_537
-# %bb.414:
+ jne .LBB0_825
+# %bb.692:
test r9d, r9d
- jle .LBB0_537
-# %bb.415:
+ jle .LBB0_825
+# %bb.693:
mov r10d, r9d
cmp r9d, 32
- jae .LBB0_474
-# %bb.416:
+ jae .LBB0_762
+# %bb.694:
xor esi, esi
- jmp .LBB0_479
-.LBB0_19:
+.LBB0_767:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rax, r10
+ and rax, 3
+ je .LBB0_769
+.LBB0_768: # =>This Inner Loop Header: Depth=1
+ mov edi, dword ptr [rcx + 4*rsi]
+ imul edi, dword ptr [rdx + 4*rsi]
+ mov dword ptr [r8 + 4*rsi], edi
+ add rsi, 1
+ add rax, -1
+ jne .LBB0_768
+.LBB0_769:
+ cmp r9, 3
+ jb .LBB0_825
+.LBB0_770: # =>This Inner Loop Header: Depth=1
+ mov eax, dword ptr [rcx + 4*rsi]
+ imul eax, dword ptr [rdx + 4*rsi]
+ mov dword ptr [r8 + 4*rsi], eax
+ mov eax, dword ptr [rcx + 4*rsi + 4]
+ imul eax, dword ptr [rdx + 4*rsi + 4]
+ mov dword ptr [r8 + 4*rsi + 4], eax
+ mov eax, dword ptr [rcx + 4*rsi + 8]
+ imul eax, dword ptr [rdx + 4*rsi + 8]
+ mov dword ptr [r8 + 4*rsi + 8], eax
+ mov eax, dword ptr [rcx + 4*rsi + 12]
+ imul eax, dword ptr [rdx + 4*rsi + 12]
+ mov dword ptr [r8 + 4*rsi + 12], eax
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB0_770
+ jmp .LBB0_825
+.LBB0_21:
cmp edi, 6
- jg .LBB0_32
-# %bb.20:
+ jg .LBB0_34
+# %bb.22:
cmp edi, 3
- jle .LBB0_21
-# %bb.26:
+ jle .LBB0_23
+# %bb.28:
cmp edi, 4
- je .LBB0_65
-# %bb.27:
+ je .LBB0_67
+# %bb.29:
cmp edi, 5
- je .LBB0_77
-# %bb.28:
+ je .LBB0_79
+# %bb.30:
cmp edi, 6
- jne .LBB0_537
-# %bb.29:
+ jne .LBB0_825
+# %bb.31:
test r9d, r9d
- jle .LBB0_537
-# %bb.30:
+ jle .LBB0_825
+# %bb.32:
mov r10d, r9d
cmp r9d, 32
- jae .LBB0_89
-# %bb.31:
+ jae .LBB0_91
+# %bb.33:
xor esi, esi
- jmp .LBB0_94
-.LBB0_152:
+ jmp .LBB0_96
+.LBB0_287:
cmp edi, 6
- jg .LBB0_165
-# %bb.153:
+ jg .LBB0_300
+# %bb.288:
cmp edi, 3
- jle .LBB0_154
-# %bb.159:
+ jle .LBB0_289
+# %bb.294:
cmp edi, 4
- je .LBB0_198
-# %bb.160:
+ je .LBB0_333
+# %bb.295:
cmp edi, 5
- je .LBB0_210
-# %bb.161:
+ je .LBB0_345
+# %bb.296:
cmp edi, 6
- jne .LBB0_537
-# %bb.162:
+ jne .LBB0_825
+# %bb.297:
test r9d, r9d
- jle .LBB0_537
-# %bb.163:
+ jle .LBB0_825
+# %bb.298:
mov r10d, r9d
cmp r9d, 32
- jae .LBB0_222
+ jae .LBB0_357
+# %bb.299:
+ xor esi, esi
+.LBB0_362:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rax, r10
+ and rax, 3
+ je .LBB0_364
+.LBB0_363: # =>This Inner Loop Header: Depth=1
+ mov edi, dword ptr [rdx + 4*rsi]
+ sub edi, dword ptr [rcx + 4*rsi]
+ mov dword ptr [r8 + 4*rsi], edi
+ add rsi, 1
+ add rax, -1
+ jne .LBB0_363
+.LBB0_364:
+ cmp r9, 3
+ jb .LBB0_825
+.LBB0_365: # =>This Inner Loop Header: Depth=1
+ mov eax, dword ptr [rdx + 4*rsi]
+ sub eax, dword ptr [rcx + 4*rsi]
+ mov dword ptr [r8 + 4*rsi], eax
+ mov eax, dword ptr [rdx + 4*rsi + 4]
+ sub eax, dword ptr [rcx + 4*rsi + 4]
+ mov dword ptr [r8 + 4*rsi + 4], eax
+ mov eax, dword ptr [rdx + 4*rsi + 8]
+ sub eax, dword ptr [rcx + 4*rsi + 8]
+ mov dword ptr [r8 + 4*rsi + 8], eax
+ mov eax, dword ptr [rdx + 4*rsi + 12]
+ sub eax, dword ptr [rcx + 4*rsi + 12]
+ mov dword ptr [r8 + 4*rsi + 12], eax
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB0_365
+ jmp .LBB0_825
+.LBB0_154:
+ cmp edi, 6
+ jg .LBB0_167
+# %bb.155:
+ cmp edi, 3
+ jle .LBB0_156
+# %bb.161:
+ cmp edi, 4
+ je .LBB0_200
+# %bb.162:
+ cmp edi, 5
+ je .LBB0_212
+# %bb.163:
+ cmp edi, 6
+ jne .LBB0_825
# %bb.164:
+ test r9d, r9d
+ jle .LBB0_825
+# %bb.165:
+ mov r10d, r9d
+ cmp r9d, 32
+ jae .LBB0_224
+# %bb.166:
+ xor esi, esi
+ jmp .LBB0_229
+.LBB0_420:
+ cmp edi, 6
+ jg .LBB0_433
+# %bb.421:
+ cmp edi, 3
+ jle .LBB0_422
+# %bb.427:
+ cmp edi, 4
+ je .LBB0_466
+# %bb.428:
+ cmp edi, 5
+ je .LBB0_478
+# %bb.429:
+ cmp edi, 6
+ jne .LBB0_825
+# %bb.430:
+ test r9d, r9d
+ jle .LBB0_825
+# %bb.431:
+ mov r10d, r9d
+ cmp r9d, 32
+ jae .LBB0_490
+# %bb.432:
xor esi, esi
- jmp .LBB0_227
-.LBB0_291:
+.LBB0_495:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rax, r10
+ and rax, 3
+ je .LBB0_497
+.LBB0_496: # =>This Inner Loop Header: Depth=1
+ mov edi, dword ptr [rdx + 4*rsi]
+ sub edi, dword ptr [rcx + 4*rsi]
+ mov dword ptr [r8 + 4*rsi], edi
+ add rsi, 1
+ add rax, -1
+ jne .LBB0_496
+.LBB0_497:
+ cmp r9, 3
+ jb .LBB0_825
+.LBB0_498: # =>This Inner Loop Header: Depth=1
+ mov eax, dword ptr [rdx + 4*rsi]
+ sub eax, dword ptr [rcx + 4*rsi]
+ mov dword ptr [r8 + 4*rsi], eax
+ mov eax, dword ptr [rdx + 4*rsi + 4]
+ sub eax, dword ptr [rcx + 4*rsi + 4]
+ mov dword ptr [r8 + 4*rsi + 4], eax
+ mov eax, dword ptr [rdx + 4*rsi + 8]
+ sub eax, dword ptr [rcx + 4*rsi + 8]
+ mov dword ptr [r8 + 4*rsi + 8], eax
+ mov eax, dword ptr [rdx + 4*rsi + 12]
+ sub eax, dword ptr [rcx + 4*rsi + 12]
+ mov dword ptr [r8 + 4*rsi + 12], eax
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB0_498
+ jmp .LBB0_825
+.LBB0_559:
cmp edi, 8
- jle .LBB0_292
-# %bb.297:
+ jle .LBB0_560
+# %bb.565:
cmp edi, 9
- je .LBB0_378
-# %bb.298:
+ je .LBB0_656
+# %bb.566:
cmp edi, 11
- je .LBB0_390
-# %bb.299:
+ je .LBB0_668
+# %bb.567:
cmp edi, 12
- jne .LBB0_537
-# %bb.300:
+ jne .LBB0_825
+# %bb.568:
test r9d, r9d
- jle .LBB0_537
-# %bb.301:
+ jle .LBB0_825
+# %bb.569:
mov r10d, r9d
cmp r9d, 16
- jae .LBB0_402
-# %bb.302:
+ jae .LBB0_680
+# %bb.570:
xor esi, esi
- jmp .LBB0_407
-.LBB0_417:
+.LBB0_685:
+ mov rdi, rsi
+ not rdi
+ add rdi, r10
+ mov rax, r10
+ and rax, 3
+ je .LBB0_687
+.LBB0_686: # =>This Inner Loop Header: Depth=1
+ vmovsd xmm0, qword ptr [rcx + 8*rsi] # xmm0 = mem[0],zero
+ vmulsd xmm0, xmm0, qword ptr [rdx + 8*rsi]
+ vmovsd qword ptr [r8 + 8*rsi], xmm0
+ add rsi, 1
+ add rax, -1
+ jne .LBB0_686
+.LBB0_687:
+ cmp rdi, 3
+ jb .LBB0_825
+.LBB0_688: # =>This Inner Loop Header: Depth=1
+ vmovsd xmm0, qword ptr [rcx + 8*rsi] # xmm0 = mem[0],zero
+ vmulsd xmm0, xmm0, qword ptr [rdx + 8*rsi]
+ vmovsd qword ptr [r8 + 8*rsi], xmm0
+ vmovsd xmm0, qword ptr [rcx + 8*rsi + 8] # xmm0 = mem[0],zero
+ vmulsd xmm0, xmm0, qword ptr [rdx + 8*rsi + 8]
+ vmovsd qword ptr [r8 + 8*rsi + 8], xmm0
+ vmovsd xmm0, qword ptr [rcx + 8*rsi + 16] # xmm0 = mem[0],zero
+ vmulsd xmm0, xmm0, qword ptr [rdx + 8*rsi + 16]
+ vmovsd qword ptr [r8 + 8*rsi + 16], xmm0
+ vmovsd xmm0, qword ptr [rcx + 8*rsi + 24] # xmm0 = mem[0],zero
+ vmulsd xmm0, xmm0, qword ptr [rdx + 8*rsi + 24]
+ vmovsd qword ptr [r8 + 8*rsi + 24], xmm0
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB0_688
+ jmp .LBB0_825
+.LBB0_695:
cmp edi, 8
- jle .LBB0_418
-# %bb.423:
+ jle .LBB0_696
+# %bb.701:
cmp edi, 9
- je .LBB0_504
-# %bb.424:
+ je .LBB0_792
+# %bb.702:
cmp edi, 11
- je .LBB0_516
-# %bb.425:
+ je .LBB0_804
+# %bb.703:
cmp edi, 12
- jne .LBB0_537
-# %bb.426:
+ jne .LBB0_825
+# %bb.704:
test r9d, r9d
- jle .LBB0_537
-# %bb.427:
+ jle .LBB0_825
+# %bb.705:
mov r10d, r9d
cmp r9d, 16
- jae .LBB0_528
-# %bb.428:
+ jae .LBB0_816
+# %bb.706:
xor esi, esi
- jmp .LBB0_533
-.LBB0_32:
+.LBB0_821:
+ mov rdi, rsi
+ not rdi
+ add rdi, r10
+ mov rax, r10
+ and rax, 3
+ je .LBB0_823
+.LBB0_822: # =>This Inner Loop Header: Depth=1
+ vmovsd xmm0, qword ptr [rcx + 8*rsi] # xmm0 = mem[0],zero
+ vmulsd xmm0, xmm0, qword ptr [rdx + 8*rsi]
+ vmovsd qword ptr [r8 + 8*rsi], xmm0
+ add rsi, 1
+ add rax, -1
+ jne .LBB0_822
+.LBB0_823:
+ cmp rdi, 3
+ jb .LBB0_825
+.LBB0_824: # =>This Inner Loop Header: Depth=1
+ vmovsd xmm0, qword ptr [rcx + 8*rsi] # xmm0 = mem[0],zero
+ vmulsd xmm0, xmm0, qword ptr [rdx + 8*rsi]
+ vmovsd qword ptr [r8 + 8*rsi], xmm0
+ vmovsd xmm0, qword ptr [rcx + 8*rsi + 8] # xmm0 = mem[0],zero
+ vmulsd xmm0, xmm0, qword ptr [rdx + 8*rsi + 8]
+ vmovsd qword ptr [r8 + 8*rsi + 8], xmm0
+ vmovsd xmm0, qword ptr [rcx + 8*rsi + 16] # xmm0 = mem[0],zero
+ vmulsd xmm0, xmm0, qword ptr [rdx + 8*rsi + 16]
+ vmovsd qword ptr [r8 + 8*rsi + 16], xmm0
+ vmovsd xmm0, qword ptr [rcx + 8*rsi + 24] # xmm0 = mem[0],zero
+ vmulsd xmm0, xmm0, qword ptr [rdx + 8*rsi + 24]
+ vmovsd qword ptr [r8 + 8*rsi + 24], xmm0
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB0_824
+ jmp .LBB0_825
+.LBB0_34:
cmp edi, 8
- jle .LBB0_33
-# %bb.38:
+ jle .LBB0_35
+# %bb.40:
cmp edi, 9
- je .LBB0_119
-# %bb.39:
+ je .LBB0_121
+# %bb.41:
cmp edi, 11
- je .LBB0_131
-# %bb.40:
+ je .LBB0_133
+# %bb.42:
cmp edi, 12
- jne .LBB0_537
-# %bb.41:
+ jne .LBB0_825
+# %bb.43:
test r9d, r9d
- jle .LBB0_537
-# %bb.42:
+ jle .LBB0_825
+# %bb.44:
mov r10d, r9d
cmp r9d, 16
- jae .LBB0_143
-# %bb.43:
+ jae .LBB0_145
+# %bb.45:
xor esi, esi
- jmp .LBB0_148
-.LBB0_165:
+ jmp .LBB0_150
+.LBB0_300:
cmp edi, 8
- jle .LBB0_166
-# %bb.171:
+ jle .LBB0_301
+# %bb.306:
cmp edi, 9
- je .LBB0_252
-# %bb.172:
+ je .LBB0_387
+# %bb.307:
cmp edi, 11
- je .LBB0_264
-# %bb.173:
+ je .LBB0_399
+# %bb.308:
cmp edi, 12
- jne .LBB0_537
-# %bb.174:
+ jne .LBB0_825
+# %bb.309:
test r9d, r9d
- jle .LBB0_537
-# %bb.175:
+ jle .LBB0_825
+# %bb.310:
mov r10d, r9d
cmp r9d, 16
- jae .LBB0_276
+ jae .LBB0_411
+# %bb.311:
+ xor esi, esi
+.LBB0_416:
+ mov rdi, rsi
+ not rdi
+ add rdi, r10
+ mov rax, r10
+ and rax, 3
+ je .LBB0_418
+.LBB0_417: # =>This Inner Loop Header: Depth=1
+ vmovsd xmm0, qword ptr [rdx + 8*rsi] # xmm0 = mem[0],zero
+ vsubsd xmm0, xmm0, qword ptr [rcx + 8*rsi]
+ vmovsd qword ptr [r8 + 8*rsi], xmm0
+ add rsi, 1
+ add rax, -1
+ jne .LBB0_417
+.LBB0_418:
+ cmp rdi, 3
+ jb .LBB0_825
+.LBB0_419: # =>This Inner Loop Header: Depth=1
+ vmovsd xmm0, qword ptr [rdx + 8*rsi] # xmm0 = mem[0],zero
+ vsubsd xmm0, xmm0, qword ptr [rcx + 8*rsi]
+ vmovsd qword ptr [r8 + 8*rsi], xmm0
+ vmovsd xmm0, qword ptr [rdx + 8*rsi + 8] # xmm0 = mem[0],zero
+ vsubsd xmm0, xmm0, qword ptr [rcx + 8*rsi + 8]
+ vmovsd qword ptr [r8 + 8*rsi + 8], xmm0
+ vmovsd xmm0, qword ptr [rdx + 8*rsi + 16] # xmm0 = mem[0],zero
+ vsubsd xmm0, xmm0, qword ptr [rcx + 8*rsi + 16]
+ vmovsd qword ptr [r8 + 8*rsi + 16], xmm0
+ vmovsd xmm0, qword ptr [rdx + 8*rsi + 24] # xmm0 = mem[0],zero
+ vsubsd xmm0, xmm0, qword ptr [rcx + 8*rsi + 24]
+ vmovsd qword ptr [r8 + 8*rsi + 24], xmm0
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB0_419
+ jmp .LBB0_825
+.LBB0_167:
+ cmp edi, 8
+ jle .LBB0_168
+# %bb.173:
+ cmp edi, 9
+ je .LBB0_254
+# %bb.174:
+ cmp edi, 11
+ je .LBB0_266
+# %bb.175:
+ cmp edi, 12
+ jne .LBB0_825
# %bb.176:
+ test r9d, r9d
+ jle .LBB0_825
+# %bb.177:
+ mov r10d, r9d
+ cmp r9d, 16
+ jae .LBB0_278
+# %bb.178:
+ xor esi, esi
+ jmp .LBB0_283
+.LBB0_433:
+ cmp edi, 8
+ jle .LBB0_434
+# %bb.439:
+ cmp edi, 9
+ je .LBB0_520
+# %bb.440:
+ cmp edi, 11
+ je .LBB0_532
+# %bb.441:
+ cmp edi, 12
+ jne .LBB0_825
+# %bb.442:
+ test r9d, r9d
+ jle .LBB0_825
+# %bb.443:
+ mov r10d, r9d
+ cmp r9d, 16
+ jae .LBB0_544
+# %bb.444:
xor esi, esi
- jmp .LBB0_281
-.LBB0_5:
+.LBB0_549:
+ mov rdi, rsi
+ not rdi
+ add rdi, r10
+ mov rax, r10
+ and rax, 3
+ je .LBB0_551
+.LBB0_550: # =>This Inner Loop Header: Depth=1
+ vmovsd xmm0, qword ptr [rdx + 8*rsi] # xmm0 = mem[0],zero
+ vsubsd xmm0, xmm0, qword ptr [rcx + 8*rsi]
+ vmovsd qword ptr [r8 + 8*rsi], xmm0
+ add rsi, 1
+ add rax, -1
+ jne .LBB0_550
+.LBB0_551:
+ cmp rdi, 3
+ jb .LBB0_825
+.LBB0_552: # =>This Inner Loop Header: Depth=1
+ vmovsd xmm0, qword ptr [rdx + 8*rsi] # xmm0 = mem[0],zero
+ vsubsd xmm0, xmm0, qword ptr [rcx + 8*rsi]
+ vmovsd qword ptr [r8 + 8*rsi], xmm0
+ vmovsd xmm0, qword ptr [rdx + 8*rsi + 8] # xmm0 = mem[0],zero
+ vsubsd xmm0, xmm0, qword ptr [rcx + 8*rsi + 8]
+ vmovsd qword ptr [r8 + 8*rsi + 8], xmm0
+ vmovsd xmm0, qword ptr [rdx + 8*rsi + 16] # xmm0 = mem[0],zero
+ vsubsd xmm0, xmm0, qword ptr [rcx + 8*rsi + 16]
+ vmovsd qword ptr [r8 + 8*rsi + 16], xmm0
+ vmovsd xmm0, qword ptr [rdx + 8*rsi + 24] # xmm0 = mem[0],zero
+ vsubsd xmm0, xmm0, qword ptr [rcx + 8*rsi + 24]
+ vmovsd qword ptr [r8 + 8*rsi + 24], xmm0
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB0_552
+ jmp .LBB0_825
+.LBB0_6:
cmp edi, 2
- je .LBB0_303
-# %bb.6:
- cmp edi, 3
- jne .LBB0_537
+ je .LBB0_571
# %bb.7:
- test r9d, r9d
- jle .LBB0_537
+ cmp edi, 3
+ jne .LBB0_825
# %bb.8:
+ test r9d, r9d
+ jle .LBB0_825
+# %bb.9:
+ mov r10d, r9d
+ cmp r9d, 32
+ jae .LBB0_588
+# %bb.10:
+ xor edi, edi
+ jmp .LBB0_598
+.LBB0_16:
+ cmp edi, 2
+ je .LBB0_707
+# %bb.17:
+ cmp edi, 3
+ jne .LBB0_825
+# %bb.18:
+ test r9d, r9d
+ jle .LBB0_825
+# %bb.19:
+ mov r10d, r9d
+ cmp r9d, 32
+ jae .LBB0_724
+# %bb.20:
+ xor edi, edi
+ jmp .LBB0_734
+.LBB0_23:
+ cmp edi, 2
+ je .LBB0_46
+# %bb.24:
+ cmp edi, 3
+ jne .LBB0_825
+# %bb.25:
+ test r9d, r9d
+ jle .LBB0_825
+# %bb.26:
mov r10d, r9d
cmp r9d, 128
- jae .LBB0_315
-# %bb.9:
+ jae .LBB0_58
+# %bb.27:
xor esi, esi
- jmp .LBB0_320
-.LBB0_14:
+ jmp .LBB0_63
+.LBB0_289:
cmp edi, 2
- je .LBB0_429
-# %bb.15:
+ je .LBB0_312
+# %bb.290:
cmp edi, 3
- jne .LBB0_537
-# %bb.16:
+ jne .LBB0_825
+# %bb.291:
test r9d, r9d
- jle .LBB0_537
-# %bb.17:
+ jle .LBB0_825
+# %bb.292:
mov r10d, r9d
cmp r9d, 128
- jae .LBB0_441
-# %bb.18:
+ jae .LBB0_324
+# %bb.293:
xor esi, esi
- jmp .LBB0_446
-.LBB0_21:
+ jmp .LBB0_329
+.LBB0_156:
cmp edi, 2
- je .LBB0_44
-# %bb.22:
+ je .LBB0_179
+# %bb.157:
cmp edi, 3
- jne .LBB0_537
-# %bb.23:
+ jne .LBB0_825
+# %bb.158:
test r9d, r9d
- jle .LBB0_537
-# %bb.24:
+ jle .LBB0_825
+# %bb.159:
mov r10d, r9d
cmp r9d, 128
- jae .LBB0_56
-# %bb.25:
+ jae .LBB0_191
+# %bb.160:
xor esi, esi
- jmp .LBB0_61
-.LBB0_154:
+ jmp .LBB0_196
+.LBB0_422:
cmp edi, 2
- je .LBB0_177
-# %bb.155:
+ je .LBB0_445
+# %bb.423:
cmp edi, 3
- jne .LBB0_537
-# %bb.156:
+ jne .LBB0_825
+# %bb.424:
test r9d, r9d
- jle .LBB0_537
-# %bb.157:
+ jle .LBB0_825
+# %bb.425:
mov r10d, r9d
cmp r9d, 128
- jae .LBB0_189
-# %bb.158:
+ jae .LBB0_457
+# %bb.426:
xor esi, esi
- jmp .LBB0_194
-.LBB0_292:
+ jmp .LBB0_462
+.LBB0_560:
cmp edi, 7
- je .LBB0_357
-# %bb.293:
+ je .LBB0_635
+# %bb.561:
cmp edi, 8
- jne .LBB0_537
-# %bb.294:
+ jne .LBB0_825
+# %bb.562:
test r9d, r9d
- jle .LBB0_537
-# %bb.295:
+ jle .LBB0_825
+# %bb.563:
mov r10d, r9d
cmp r9d, 16
- jae .LBB0_369
-# %bb.296:
+ jae .LBB0_647
+# %bb.564:
xor esi, esi
- jmp .LBB0_374
-.LBB0_418:
+ jmp .LBB0_652
+.LBB0_696:
cmp edi, 7
- je .LBB0_483
-# %bb.419:
+ je .LBB0_771
+# %bb.697:
cmp edi, 8
- jne .LBB0_537
-# %bb.420:
+ jne .LBB0_825
+# %bb.698:
test r9d, r9d
- jle .LBB0_537
-# %bb.421:
+ jle .LBB0_825
+# %bb.699:
mov r10d, r9d
cmp r9d, 16
- jae .LBB0_495
-# %bb.422:
+ jae .LBB0_783
+# %bb.700:
xor esi, esi
- jmp .LBB0_500
-.LBB0_33:
+ jmp .LBB0_788
+.LBB0_35:
cmp edi, 7
- je .LBB0_98
-# %bb.34:
+ je .LBB0_100
+# %bb.36:
cmp edi, 8
- jne .LBB0_537
-# %bb.35:
+ jne .LBB0_825
+# %bb.37:
test r9d, r9d
- jle .LBB0_537
-# %bb.36:
+ jle .LBB0_825
+# %bb.38:
mov r10d, r9d
cmp r9d, 16
- jae .LBB0_110
-# %bb.37:
+ jae .LBB0_112
+# %bb.39:
xor esi, esi
- jmp .LBB0_115
-.LBB0_166:
+ jmp .LBB0_117
+.LBB0_301:
cmp edi, 7
- je .LBB0_231
-# %bb.167:
+ je .LBB0_366
+# %bb.302:
cmp edi, 8
- jne .LBB0_537
-# %bb.168:
+ jne .LBB0_825
+# %bb.303:
test r9d, r9d
- jle .LBB0_537
-# %bb.169:
+ jle .LBB0_825
+# %bb.304:
mov r10d, r9d
cmp r9d, 16
- jae .LBB0_243
-# %bb.170:
- xor esi, esi
- jmp .LBB0_248
-.LBB0_324:
- test r9d, r9d
- jle .LBB0_537
-# %bb.325:
- mov r10d, r9d
- cmp r9d, 64
- jae .LBB0_327
-# %bb.326:
+ jae .LBB0_378
+# %bb.305:
xor esi, esi
- jmp .LBB0_332
-.LBB0_336:
+ jmp .LBB0_383
+.LBB0_168:
+ cmp edi, 7
+ je .LBB0_233
+# %bb.169:
+ cmp edi, 8
+ jne .LBB0_825
+# %bb.170:
test r9d, r9d
- jle .LBB0_537
-# %bb.337:
+ jle .LBB0_825
+# %bb.171:
mov r10d, r9d
- cmp r9d, 64
- jae .LBB0_339
-# %bb.338:
+ cmp r9d, 16
+ jae .LBB0_245
+# %bb.172:
xor esi, esi
- jmp .LBB0_344
-.LBB0_450:
+ jmp .LBB0_250
+.LBB0_434:
+ cmp edi, 7
+ je .LBB0_499
+# %bb.435:
+ cmp edi, 8
+ jne .LBB0_825
+# %bb.436:
test r9d, r9d
- jle .LBB0_537
-# %bb.451:
+ jle .LBB0_825
+# %bb.437:
mov r10d, r9d
- cmp r9d, 64
- jae .LBB0_453
-# %bb.452:
+ cmp r9d, 16
+ jae .LBB0_511
+# %bb.438:
xor esi, esi
- jmp .LBB0_458
-.LBB0_462:
+ jmp .LBB0_516
+.LBB0_602:
test r9d, r9d
- jle .LBB0_537
-# %bb.463:
+ jle .LBB0_825
+# %bb.603:
mov r10d, r9d
cmp r9d, 64
- jae .LBB0_465
-# %bb.464:
+ jae .LBB0_605
+# %bb.604:
xor esi, esi
- jmp .LBB0_470
-.LBB0_65:
+.LBB0_610:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rax, r10
+ and rax, 3
+ je .LBB0_612
+.LBB0_611: # =>This Inner Loop Header: Depth=1
+ movzx edi, word ptr [rcx + 2*rsi]
+ imul di, word ptr [rdx + 2*rsi]
+ mov word ptr [r8 + 2*rsi], di
+ add rsi, 1
+ add rax, -1
+ jne .LBB0_611
+.LBB0_612:
+ cmp r9, 3
+ jb .LBB0_825
+.LBB0_613: # =>This Inner Loop Header: Depth=1
+ movzx eax, word ptr [rcx + 2*rsi]
+ imul ax, word ptr [rdx + 2*rsi]
+ mov word ptr [r8 + 2*rsi], ax
+ movzx eax, word ptr [rcx + 2*rsi + 2]
+ imul ax, word ptr [rdx + 2*rsi + 2]
+ mov word ptr [r8 + 2*rsi + 2], ax
+ movzx eax, word ptr [rcx + 2*rsi + 4]
+ imul ax, word ptr [rdx + 2*rsi + 4]
+ mov word ptr [r8 + 2*rsi + 4], ax
+ movzx eax, word ptr [rcx + 2*rsi + 6]
+ imul ax, word ptr [rdx + 2*rsi + 6]
+ mov word ptr [r8 + 2*rsi + 6], ax
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB0_613
+ jmp .LBB0_825
+.LBB0_614:
test r9d, r9d
- jle .LBB0_537
-# %bb.66:
+ jle .LBB0_825
+# %bb.615:
mov r10d, r9d
cmp r9d, 64
- jae .LBB0_68
-# %bb.67:
+ jae .LBB0_617
+# %bb.616:
xor esi, esi
- jmp .LBB0_73
-.LBB0_77:
+.LBB0_622:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rax, r10
+ and rax, 3
+ je .LBB0_624
+.LBB0_623: # =>This Inner Loop Header: Depth=1
+ movzx edi, word ptr [rcx + 2*rsi]
+ imul di, word ptr [rdx + 2*rsi]
+ mov word ptr [r8 + 2*rsi], di
+ add rsi, 1
+ add rax, -1
+ jne .LBB0_623
+.LBB0_624:
+ cmp r9, 3
+ jb .LBB0_825
+.LBB0_625: # =>This Inner Loop Header: Depth=1
+ movzx eax, word ptr [rcx + 2*rsi]
+ imul ax, word ptr [rdx + 2*rsi]
+ mov word ptr [r8 + 2*rsi], ax
+ movzx eax, word ptr [rcx + 2*rsi + 2]
+ imul ax, word ptr [rdx + 2*rsi + 2]
+ mov word ptr [r8 + 2*rsi + 2], ax
+ movzx eax, word ptr [rcx + 2*rsi + 4]
+ imul ax, word ptr [rdx + 2*rsi + 4]
+ mov word ptr [r8 + 2*rsi + 4], ax
+ movzx eax, word ptr [rcx + 2*rsi + 6]
+ imul ax, word ptr [rdx + 2*rsi + 6]
+ mov word ptr [r8 + 2*rsi + 6], ax
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB0_625
+ jmp .LBB0_825
+.LBB0_738:
test r9d, r9d
- jle .LBB0_537
-# %bb.78:
+ jle .LBB0_825
+# %bb.739:
mov r10d, r9d
cmp r9d, 64
- jae .LBB0_80
-# %bb.79:
+ jae .LBB0_741
+# %bb.740:
xor esi, esi
- jmp .LBB0_85
-.LBB0_198:
+.LBB0_746:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rax, r10
+ and rax, 3
+ je .LBB0_748
+.LBB0_747: # =>This Inner Loop Header: Depth=1
+ movzx edi, word ptr [rcx + 2*rsi]
+ imul di, word ptr [rdx + 2*rsi]
+ mov word ptr [r8 + 2*rsi], di
+ add rsi, 1
+ add rax, -1
+ jne .LBB0_747
+.LBB0_748:
+ cmp r9, 3
+ jb .LBB0_825
+.LBB0_749: # =>This Inner Loop Header: Depth=1
+ movzx eax, word ptr [rcx + 2*rsi]
+ imul ax, word ptr [rdx + 2*rsi]
+ mov word ptr [r8 + 2*rsi], ax
+ movzx eax, word ptr [rcx + 2*rsi + 2]
+ imul ax, word ptr [rdx + 2*rsi + 2]
+ mov word ptr [r8 + 2*rsi + 2], ax
+ movzx eax, word ptr [rcx + 2*rsi + 4]
+ imul ax, word ptr [rdx + 2*rsi + 4]
+ mov word ptr [r8 + 2*rsi + 4], ax
+ movzx eax, word ptr [rcx + 2*rsi + 6]
+ imul ax, word ptr [rdx + 2*rsi + 6]
+ mov word ptr [r8 + 2*rsi + 6], ax
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB0_749
+ jmp .LBB0_825
+.LBB0_750:
test r9d, r9d
- jle .LBB0_537
-# %bb.199:
+ jle .LBB0_825
+# %bb.751:
mov r10d, r9d
cmp r9d, 64
- jae .LBB0_201
-# %bb.200:
+ jae .LBB0_753
+# %bb.752:
xor esi, esi
- jmp .LBB0_206
-.LBB0_210:
- test r9d, r9d
- jle .LBB0_537
-# %bb.211:
+.LBB0_758:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rax, r10
+ and rax, 3
+ je .LBB0_760
+.LBB0_759: # =>This Inner Loop Header: Depth=1
+ movzx edi, word ptr [rcx + 2*rsi]
+ imul di, word ptr [rdx + 2*rsi]
+ mov word ptr [r8 + 2*rsi], di
+ add rsi, 1
+ add rax, -1
+ jne .LBB0_759
+.LBB0_760:
+ cmp r9, 3
+ jb .LBB0_825
+.LBB0_761: # =>This Inner Loop Header: Depth=1
+ movzx eax, word ptr [rcx + 2*rsi]
+ imul ax, word ptr [rdx + 2*rsi]
+ mov word ptr [r8 + 2*rsi], ax
+ movzx eax, word ptr [rcx + 2*rsi + 2]
+ imul ax, word ptr [rdx + 2*rsi + 2]
+ mov word ptr [r8 + 2*rsi + 2], ax
+ movzx eax, word ptr [rcx + 2*rsi + 4]
+ imul ax, word ptr [rdx + 2*rsi + 4]
+ mov word ptr [r8 + 2*rsi + 4], ax
+ movzx eax, word ptr [rcx + 2*rsi + 6]
+ imul ax, word ptr [rdx + 2*rsi + 6]
+ mov word ptr [r8 + 2*rsi + 6], ax
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB0_761
+ jmp .LBB0_825
+.LBB0_67:
+ test r9d, r9d
+ jle .LBB0_825
+# %bb.68:
mov r10d, r9d
cmp r9d, 64
- jae .LBB0_213
-# %bb.212:
+ jae .LBB0_70
+# %bb.69:
xor esi, esi
- jmp .LBB0_218
-.LBB0_378:
+ jmp .LBB0_75
+.LBB0_79:
test r9d, r9d
- jle .LBB0_537
-# %bb.379:
+ jle .LBB0_825
+# %bb.80:
mov r10d, r9d
- cmp r9d, 16
- jae .LBB0_381
-# %bb.380:
+ cmp r9d, 64
+ jae .LBB0_82
+# %bb.81:
xor esi, esi
- jmp .LBB0_386
-.LBB0_390:
+ jmp .LBB0_87
+.LBB0_333:
test r9d, r9d
- jle .LBB0_537
-# %bb.391:
+ jle .LBB0_825
+# %bb.334:
mov r10d, r9d
- cmp r9d, 32
- jae .LBB0_393
-# %bb.392:
+ cmp r9d, 64
+ jae .LBB0_336
+# %bb.335:
xor esi, esi
- jmp .LBB0_398
-.LBB0_504:
+.LBB0_341:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rax, r10
+ and rax, 3
+ je .LBB0_343
+.LBB0_342: # =>This Inner Loop Header: Depth=1
+ movzx edi, word ptr [rdx + 2*rsi]
+ sub di, word ptr [rcx + 2*rsi]
+ mov word ptr [r8 + 2*rsi], di
+ add rsi, 1
+ add rax, -1
+ jne .LBB0_342
+.LBB0_343:
+ cmp r9, 3
+ jb .LBB0_825
+.LBB0_344: # =>This Inner Loop Header: Depth=1
+ movzx eax, word ptr [rdx + 2*rsi]
+ sub ax, word ptr [rcx + 2*rsi]
+ mov word ptr [r8 + 2*rsi], ax
+ movzx eax, word ptr [rdx + 2*rsi + 2]
+ sub ax, word ptr [rcx + 2*rsi + 2]
+ mov word ptr [r8 + 2*rsi + 2], ax
+ movzx eax, word ptr [rdx + 2*rsi + 4]
+ sub ax, word ptr [rcx + 2*rsi + 4]
+ mov word ptr [r8 + 2*rsi + 4], ax
+ movzx eax, word ptr [rdx + 2*rsi + 6]
+ sub ax, word ptr [rcx + 2*rsi + 6]
+ mov word ptr [r8 + 2*rsi + 6], ax
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB0_344
+ jmp .LBB0_825
+.LBB0_345:
test r9d, r9d
- jle .LBB0_537
-# %bb.505:
+ jle .LBB0_825
+# %bb.346:
mov r10d, r9d
- cmp r9d, 16
- jae .LBB0_507
-# %bb.506:
+ cmp r9d, 64
+ jae .LBB0_348
+# %bb.347:
xor esi, esi
- jmp .LBB0_512
-.LBB0_516:
+.LBB0_353:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rax, r10
+ and rax, 3
+ je .LBB0_355
+.LBB0_354: # =>This Inner Loop Header: Depth=1
+ movzx edi, word ptr [rdx + 2*rsi]
+ sub di, word ptr [rcx + 2*rsi]
+ mov word ptr [r8 + 2*rsi], di
+ add rsi, 1
+ add rax, -1
+ jne .LBB0_354
+.LBB0_355:
+ cmp r9, 3
+ jb .LBB0_825
+.LBB0_356: # =>This Inner Loop Header: Depth=1
+ movzx eax, word ptr [rdx + 2*rsi]
+ sub ax, word ptr [rcx + 2*rsi]
+ mov word ptr [r8 + 2*rsi], ax
+ movzx eax, word ptr [rdx + 2*rsi + 2]
+ sub ax, word ptr [rcx + 2*rsi + 2]
+ mov word ptr [r8 + 2*rsi + 2], ax
+ movzx eax, word ptr [rdx + 2*rsi + 4]
+ sub ax, word ptr [rcx + 2*rsi + 4]
+ mov word ptr [r8 + 2*rsi + 4], ax
+ movzx eax, word ptr [rdx + 2*rsi + 6]
+ sub ax, word ptr [rcx + 2*rsi + 6]
+ mov word ptr [r8 + 2*rsi + 6], ax
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB0_356
+ jmp .LBB0_825
+.LBB0_200:
test r9d, r9d
- jle .LBB0_537
-# %bb.517:
+ jle .LBB0_825
+# %bb.201:
mov r10d, r9d
- cmp r9d, 32
- jae .LBB0_519
-# %bb.518:
+ cmp r9d, 64
+ jae .LBB0_203
+# %bb.202:
xor esi, esi
- jmp .LBB0_524
-.LBB0_119:
+ jmp .LBB0_208
+.LBB0_212:
test r9d, r9d
- jle .LBB0_537
-# %bb.120:
+ jle .LBB0_825
+# %bb.213:
mov r10d, r9d
- cmp r9d, 16
- jae .LBB0_122
-# %bb.121:
+ cmp r9d, 64
+ jae .LBB0_215
+# %bb.214:
xor esi, esi
- jmp .LBB0_127
-.LBB0_131:
+ jmp .LBB0_220
+.LBB0_466:
test r9d, r9d
- jle .LBB0_537
-# %bb.132:
+ jle .LBB0_825
+# %bb.467:
mov r10d, r9d
- cmp r9d, 32
- jae .LBB0_134
-# %bb.133:
+ cmp r9d, 64
+ jae .LBB0_469
+# %bb.468:
xor esi, esi
- jmp .LBB0_139
-.LBB0_252:
+.LBB0_474:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rax, r10
+ and rax, 3
+ je .LBB0_476
+.LBB0_475: # =>This Inner Loop Header: Depth=1
+ movzx edi, word ptr [rdx + 2*rsi]
+ sub di, word ptr [rcx + 2*rsi]
+ mov word ptr [r8 + 2*rsi], di
+ add rsi, 1
+ add rax, -1
+ jne .LBB0_475
+.LBB0_476:
+ cmp r9, 3
+ jb .LBB0_825
+.LBB0_477: # =>This Inner Loop Header: Depth=1
+ movzx eax, word ptr [rdx + 2*rsi]
+ sub ax, word ptr [rcx + 2*rsi]
+ mov word ptr [r8 + 2*rsi], ax
+ movzx eax, word ptr [rdx + 2*rsi + 2]
+ sub ax, word ptr [rcx + 2*rsi + 2]
+ mov word ptr [r8 + 2*rsi + 2], ax
+ movzx eax, word ptr [rdx + 2*rsi + 4]
+ sub ax, word ptr [rcx + 2*rsi + 4]
+ mov word ptr [r8 + 2*rsi + 4], ax
+ movzx eax, word ptr [rdx + 2*rsi + 6]
+ sub ax, word ptr [rcx + 2*rsi + 6]
+ mov word ptr [r8 + 2*rsi + 6], ax
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB0_477
+ jmp .LBB0_825
+.LBB0_478:
test r9d, r9d
- jle .LBB0_537
-# %bb.253:
+ jle .LBB0_825
+# %bb.479:
mov r10d, r9d
- cmp r9d, 16
- jae .LBB0_255
-# %bb.254:
+ cmp r9d, 64
+ jae .LBB0_481
+# %bb.480:
xor esi, esi
- jmp .LBB0_260
-.LBB0_264:
+.LBB0_486:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rax, r10
+ and rax, 3
+ je .LBB0_488
+.LBB0_487: # =>This Inner Loop Header: Depth=1
+ movzx edi, word ptr [rdx + 2*rsi]
+ sub di, word ptr [rcx + 2*rsi]
+ mov word ptr [r8 + 2*rsi], di
+ add rsi, 1
+ add rax, -1
+ jne .LBB0_487
+.LBB0_488:
+ cmp r9, 3
+ jb .LBB0_825
+.LBB0_489: # =>This Inner Loop Header: Depth=1
+ movzx eax, word ptr [rdx + 2*rsi]
+ sub ax, word ptr [rcx + 2*rsi]
+ mov word ptr [r8 + 2*rsi], ax
+ movzx eax, word ptr [rdx + 2*rsi + 2]
+ sub ax, word ptr [rcx + 2*rsi + 2]
+ mov word ptr [r8 + 2*rsi + 2], ax
+ movzx eax, word ptr [rdx + 2*rsi + 4]
+ sub ax, word ptr [rcx + 2*rsi + 4]
+ mov word ptr [r8 + 2*rsi + 4], ax
+ movzx eax, word ptr [rdx + 2*rsi + 6]
+ sub ax, word ptr [rcx + 2*rsi + 6]
+ mov word ptr [r8 + 2*rsi + 6], ax
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB0_489
+ jmp .LBB0_825
+.LBB0_656:
test r9d, r9d
- jle .LBB0_537
-# %bb.265:
+ jle .LBB0_825
+# %bb.657:
mov r10d, r9d
- cmp r9d, 32
- jae .LBB0_267
-# %bb.266:
+ cmp r9d, 16
+ jae .LBB0_659
+# %bb.658:
xor esi, esi
- jmp .LBB0_272
-.LBB0_303:
+.LBB0_664:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rax, r10
+ and rax, 3
+ je .LBB0_666
+.LBB0_665: # =>This Inner Loop Header: Depth=1
+ mov rdi, qword ptr [rcx + 8*rsi]
+ imul rdi, qword ptr [rdx + 8*rsi]
+ mov qword ptr [r8 + 8*rsi], rdi
+ add rsi, 1
+ add rax, -1
+ jne .LBB0_665
+.LBB0_666:
+ cmp r9, 3
+ jb .LBB0_825
+.LBB0_667: # =>This Inner Loop Header: Depth=1
+ mov rax, qword ptr [rcx + 8*rsi]
+ imul rax, qword ptr [rdx + 8*rsi]
+ mov qword ptr [r8 + 8*rsi], rax
+ mov rax, qword ptr [rcx + 8*rsi + 8]
+ imul rax, qword ptr [rdx + 8*rsi + 8]
+ mov qword ptr [r8 + 8*rsi + 8], rax
+ mov rax, qword ptr [rcx + 8*rsi + 16]
+ imul rax, qword ptr [rdx + 8*rsi + 16]
+ mov qword ptr [r8 + 8*rsi + 16], rax
+ mov rax, qword ptr [rcx + 8*rsi + 24]
+ imul rax, qword ptr [rdx + 8*rsi + 24]
+ mov qword ptr [r8 + 8*rsi + 24], rax
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB0_667
+ jmp .LBB0_825
+.LBB0_668:
test r9d, r9d
- jle .LBB0_537
-# %bb.304:
+ jle .LBB0_825
+# %bb.669:
mov r10d, r9d
- cmp r9d, 128
- jae .LBB0_306
-# %bb.305:
+ cmp r9d, 32
+ jae .LBB0_671
+# %bb.670:
xor esi, esi
- jmp .LBB0_311
-.LBB0_429:
+.LBB0_676:
+ mov rdi, rsi
+ not rdi
+ add rdi, r10
+ mov rax, r10
+ and rax, 3
+ je .LBB0_678
+.LBB0_677: # =>This Inner Loop Header: Depth=1
+ vmovss xmm0, dword ptr [rcx + 4*rsi] # xmm0 = mem[0],zero,zero,zero
+ vmulss xmm0, xmm0, dword ptr [rdx + 4*rsi]
+ vmovss dword ptr [r8 + 4*rsi], xmm0
+ add rsi, 1
+ add rax, -1
+ jne .LBB0_677
+.LBB0_678:
+ cmp rdi, 3
+ jb .LBB0_825
+.LBB0_679: # =>This Inner Loop Header: Depth=1
+ vmovss xmm0, dword ptr [rcx + 4*rsi] # xmm0 = mem[0],zero,zero,zero
+ vmulss xmm0, xmm0, dword ptr [rdx + 4*rsi]
+ vmovss dword ptr [r8 + 4*rsi], xmm0
+ vmovss xmm0, dword ptr [rcx + 4*rsi + 4] # xmm0 = mem[0],zero,zero,zero
+ vmulss xmm0, xmm0, dword ptr [rdx + 4*rsi + 4]
+ vmovss dword ptr [r8 + 4*rsi + 4], xmm0
+ vmovss xmm0, dword ptr [rcx + 4*rsi + 8] # xmm0 = mem[0],zero,zero,zero
+ vmulss xmm0, xmm0, dword ptr [rdx + 4*rsi + 8]
+ vmovss dword ptr [r8 + 4*rsi + 8], xmm0
+ vmovss xmm0, dword ptr [rcx + 4*rsi + 12] # xmm0 = mem[0],zero,zero,zero
+ vmulss xmm0, xmm0, dword ptr [rdx + 4*rsi + 12]
+ vmovss dword ptr [r8 + 4*rsi + 12], xmm0
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB0_679
+ jmp .LBB0_825
+.LBB0_792:
test r9d, r9d
- jle .LBB0_537
-# %bb.430:
+ jle .LBB0_825
+# %bb.793:
mov r10d, r9d
- cmp r9d, 128
- jae .LBB0_432
-# %bb.431:
+ cmp r9d, 16
+ jae .LBB0_795
+# %bb.794:
xor esi, esi
- jmp .LBB0_437
-.LBB0_44:
+.LBB0_800:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rax, r10
+ and rax, 3
+ je .LBB0_802
+.LBB0_801: # =>This Inner Loop Header: Depth=1
+ mov rdi, qword ptr [rcx + 8*rsi]
+ imul rdi, qword ptr [rdx + 8*rsi]
+ mov qword ptr [r8 + 8*rsi], rdi
+ add rsi, 1
+ add rax, -1
+ jne .LBB0_801
+.LBB0_802:
+ cmp r9, 3
+ jb .LBB0_825
+.LBB0_803: # =>This Inner Loop Header: Depth=1
+ mov rax, qword ptr [rcx + 8*rsi]
+ imul rax, qword ptr [rdx + 8*rsi]
+ mov qword ptr [r8 + 8*rsi], rax
+ mov rax, qword ptr [rcx + 8*rsi + 8]
+ imul rax, qword ptr [rdx + 8*rsi + 8]
+ mov qword ptr [r8 + 8*rsi + 8], rax
+ mov rax, qword ptr [rcx + 8*rsi + 16]
+ imul rax, qword ptr [rdx + 8*rsi + 16]
+ mov qword ptr [r8 + 8*rsi + 16], rax
+ mov rax, qword ptr [rcx + 8*rsi + 24]
+ imul rax, qword ptr [rdx + 8*rsi + 24]
+ mov qword ptr [r8 + 8*rsi + 24], rax
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB0_803
+ jmp .LBB0_825
+.LBB0_804:
test r9d, r9d
- jle .LBB0_537
-# %bb.45:
+ jle .LBB0_825
+# %bb.805:
mov r10d, r9d
- cmp r9d, 128
- jae .LBB0_47
-# %bb.46:
+ cmp r9d, 32
+ jae .LBB0_807
+# %bb.806:
xor esi, esi
- jmp .LBB0_52
-.LBB0_177:
- test r9d, r9d
- jle .LBB0_537
-# %bb.178:
- mov r10d, r9d
+.LBB0_812:
+ mov rdi, rsi
+ not rdi
+ add rdi, r10
+ mov rax, r10
+ and rax, 3
+ je .LBB0_814
+.LBB0_813: # =>This Inner Loop Header: Depth=1
+ vmovss xmm0, dword ptr [rcx + 4*rsi] # xmm0 = mem[0],zero,zero,zero
+ vmulss xmm0, xmm0, dword ptr [rdx + 4*rsi]
+ vmovss dword ptr [r8 + 4*rsi], xmm0
+ add rsi, 1
+ add rax, -1
+ jne .LBB0_813
+.LBB0_814:
+ cmp rdi, 3
+ jb .LBB0_825
+.LBB0_815: # =>This Inner Loop Header: Depth=1
+ vmovss xmm0, dword ptr [rcx + 4*rsi] # xmm0 = mem[0],zero,zero,zero
+ vmulss xmm0, xmm0, dword ptr [rdx + 4*rsi]
+ vmovss dword ptr [r8 + 4*rsi], xmm0
+ vmovss xmm0, dword ptr [rcx + 4*rsi + 4] # xmm0 = mem[0],zero,zero,zero
+ vmulss xmm0, xmm0, dword ptr [rdx + 4*rsi + 4]
+ vmovss dword ptr [r8 + 4*rsi + 4], xmm0
+ vmovss xmm0, dword ptr [rcx + 4*rsi + 8] # xmm0 = mem[0],zero,zero,zero
+ vmulss xmm0, xmm0, dword ptr [rdx + 4*rsi + 8]
+ vmovss dword ptr [r8 + 4*rsi + 8], xmm0
+ vmovss xmm0, dword ptr [rcx + 4*rsi + 12] # xmm0 = mem[0],zero,zero,zero
+ vmulss xmm0, xmm0, dword ptr [rdx + 4*rsi + 12]
+ vmovss dword ptr [r8 + 4*rsi + 12], xmm0
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB0_815
+ jmp .LBB0_825
+.LBB0_121:
+ test r9d, r9d
+ jle .LBB0_825
+# %bb.122:
+ mov r10d, r9d
+ cmp r9d, 16
+ jae .LBB0_124
+# %bb.123:
+ xor esi, esi
+ jmp .LBB0_129
+.LBB0_133:
+ test r9d, r9d
+ jle .LBB0_825
+# %bb.134:
+ mov r10d, r9d
+ cmp r9d, 32
+ jae .LBB0_136
+# %bb.135:
+ xor esi, esi
+ jmp .LBB0_141
+.LBB0_387:
+ test r9d, r9d
+ jle .LBB0_825
+# %bb.388:
+ mov r10d, r9d
+ cmp r9d, 16
+ jae .LBB0_390
+# %bb.389:
+ xor esi, esi
+.LBB0_395:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rax, r10
+ and rax, 3
+ je .LBB0_397
+.LBB0_396: # =>This Inner Loop Header: Depth=1
+ mov rdi, qword ptr [rdx + 8*rsi]
+ sub rdi, qword ptr [rcx + 8*rsi]
+ mov qword ptr [r8 + 8*rsi], rdi
+ add rsi, 1
+ add rax, -1
+ jne .LBB0_396
+.LBB0_397:
+ cmp r9, 3
+ jb .LBB0_825
+.LBB0_398: # =>This Inner Loop Header: Depth=1
+ mov rax, qword ptr [rdx + 8*rsi]
+ sub rax, qword ptr [rcx + 8*rsi]
+ mov qword ptr [r8 + 8*rsi], rax
+ mov rax, qword ptr [rdx + 8*rsi + 8]
+ sub rax, qword ptr [rcx + 8*rsi + 8]
+ mov qword ptr [r8 + 8*rsi + 8], rax
+ mov rax, qword ptr [rdx + 8*rsi + 16]
+ sub rax, qword ptr [rcx + 8*rsi + 16]
+ mov qword ptr [r8 + 8*rsi + 16], rax
+ mov rax, qword ptr [rdx + 8*rsi + 24]
+ sub rax, qword ptr [rcx + 8*rsi + 24]
+ mov qword ptr [r8 + 8*rsi + 24], rax
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB0_398
+ jmp .LBB0_825
+.LBB0_399:
+ test r9d, r9d
+ jle .LBB0_825
+# %bb.400:
+ mov r10d, r9d
+ cmp r9d, 32
+ jae .LBB0_402
+# %bb.401:
+ xor esi, esi
+.LBB0_407:
+ mov rdi, rsi
+ not rdi
+ add rdi, r10
+ mov rax, r10
+ and rax, 3
+ je .LBB0_409
+.LBB0_408: # =>This Inner Loop Header: Depth=1
+ vmovss xmm0, dword ptr [rdx + 4*rsi] # xmm0 = mem[0],zero,zero,zero
+ vsubss xmm0, xmm0, dword ptr [rcx + 4*rsi]
+ vmovss dword ptr [r8 + 4*rsi], xmm0
+ add rsi, 1
+ add rax, -1
+ jne .LBB0_408
+.LBB0_409:
+ cmp rdi, 3
+ jb .LBB0_825
+.LBB0_410: # =>This Inner Loop Header: Depth=1
+ vmovss xmm0, dword ptr [rdx + 4*rsi] # xmm0 = mem[0],zero,zero,zero
+ vsubss xmm0, xmm0, dword ptr [rcx + 4*rsi]
+ vmovss dword ptr [r8 + 4*rsi], xmm0
+ vmovss xmm0, dword ptr [rdx + 4*rsi + 4] # xmm0 = mem[0],zero,zero,zero
+ vsubss xmm0, xmm0, dword ptr [rcx + 4*rsi + 4]
+ vmovss dword ptr [r8 + 4*rsi + 4], xmm0
+ vmovss xmm0, dword ptr [rdx + 4*rsi + 8] # xmm0 = mem[0],zero,zero,zero
+ vsubss xmm0, xmm0, dword ptr [rcx + 4*rsi + 8]
+ vmovss dword ptr [r8 + 4*rsi + 8], xmm0
+ vmovss xmm0, dword ptr [rdx + 4*rsi + 12] # xmm0 = mem[0],zero,zero,zero
+ vsubss xmm0, xmm0, dword ptr [rcx + 4*rsi + 12]
+ vmovss dword ptr [r8 + 4*rsi + 12], xmm0
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB0_410
+ jmp .LBB0_825
+.LBB0_254:
+ test r9d, r9d
+ jle .LBB0_825
+# %bb.255:
+ mov r10d, r9d
+ cmp r9d, 16
+ jae .LBB0_257
+# %bb.256:
+ xor esi, esi
+ jmp .LBB0_262
+.LBB0_266:
+ test r9d, r9d
+ jle .LBB0_825
+# %bb.267:
+ mov r10d, r9d
+ cmp r9d, 32
+ jae .LBB0_269
+# %bb.268:
+ xor esi, esi
+ jmp .LBB0_274
+.LBB0_520:
+ test r9d, r9d
+ jle .LBB0_825
+# %bb.521:
+ mov r10d, r9d
+ cmp r9d, 16
+ jae .LBB0_523
+# %bb.522:
+ xor esi, esi
+.LBB0_528:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rax, r10
+ and rax, 3
+ je .LBB0_530
+.LBB0_529: # =>This Inner Loop Header: Depth=1
+ mov rdi, qword ptr [rdx + 8*rsi]
+ sub rdi, qword ptr [rcx + 8*rsi]
+ mov qword ptr [r8 + 8*rsi], rdi
+ add rsi, 1
+ add rax, -1
+ jne .LBB0_529
+.LBB0_530:
+ cmp r9, 3
+ jb .LBB0_825
+.LBB0_531: # =>This Inner Loop Header: Depth=1
+ mov rax, qword ptr [rdx + 8*rsi]
+ sub rax, qword ptr [rcx + 8*rsi]
+ mov qword ptr [r8 + 8*rsi], rax
+ mov rax, qword ptr [rdx + 8*rsi + 8]
+ sub rax, qword ptr [rcx + 8*rsi + 8]
+ mov qword ptr [r8 + 8*rsi + 8], rax
+ mov rax, qword ptr [rdx + 8*rsi + 16]
+ sub rax, qword ptr [rcx + 8*rsi + 16]
+ mov qword ptr [r8 + 8*rsi + 16], rax
+ mov rax, qword ptr [rdx + 8*rsi + 24]
+ sub rax, qword ptr [rcx + 8*rsi + 24]
+ mov qword ptr [r8 + 8*rsi + 24], rax
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB0_531
+ jmp .LBB0_825
+.LBB0_532:
+ test r9d, r9d
+ jle .LBB0_825
+# %bb.533:
+ mov r10d, r9d
+ cmp r9d, 32
+ jae .LBB0_535
+# %bb.534:
+ xor esi, esi
+.LBB0_540:
+ mov rdi, rsi
+ not rdi
+ add rdi, r10
+ mov rax, r10
+ and rax, 3
+ je .LBB0_542
+.LBB0_541: # =>This Inner Loop Header: Depth=1
+ vmovss xmm0, dword ptr [rdx + 4*rsi] # xmm0 = mem[0],zero,zero,zero
+ vsubss xmm0, xmm0, dword ptr [rcx + 4*rsi]
+ vmovss dword ptr [r8 + 4*rsi], xmm0
+ add rsi, 1
+ add rax, -1
+ jne .LBB0_541
+.LBB0_542:
+ cmp rdi, 3
+ jb .LBB0_825
+.LBB0_543: # =>This Inner Loop Header: Depth=1
+ vmovss xmm0, dword ptr [rdx + 4*rsi] # xmm0 = mem[0],zero,zero,zero
+ vsubss xmm0, xmm0, dword ptr [rcx + 4*rsi]
+ vmovss dword ptr [r8 + 4*rsi], xmm0
+ vmovss xmm0, dword ptr [rdx + 4*rsi + 4] # xmm0 = mem[0],zero,zero,zero
+ vsubss xmm0, xmm0, dword ptr [rcx + 4*rsi + 4]
+ vmovss dword ptr [r8 + 4*rsi + 4], xmm0
+ vmovss xmm0, dword ptr [rdx + 4*rsi + 8] # xmm0 = mem[0],zero,zero,zero
+ vsubss xmm0, xmm0, dword ptr [rcx + 4*rsi + 8]
+ vmovss dword ptr [r8 + 4*rsi + 8], xmm0
+ vmovss xmm0, dword ptr [rdx + 4*rsi + 12] # xmm0 = mem[0],zero,zero,zero
+ vsubss xmm0, xmm0, dword ptr [rcx + 4*rsi + 12]
+ vmovss dword ptr [r8 + 4*rsi + 12], xmm0
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB0_543
+ jmp .LBB0_825
+.LBB0_571:
+ test r9d, r9d
+ jle .LBB0_825
+# %bb.572:
+ mov r10d, r9d
+ cmp r9d, 32
+ jae .LBB0_574
+# %bb.573:
+ xor edi, edi
+ jmp .LBB0_584
+.LBB0_707:
+ test r9d, r9d
+ jle .LBB0_825
+# %bb.708:
+ mov r10d, r9d
+ cmp r9d, 32
+ jae .LBB0_710
+# %bb.709:
+ xor edi, edi
+ jmp .LBB0_720
+.LBB0_46:
+ test r9d, r9d
+ jle .LBB0_825
+# %bb.47:
+ mov r10d, r9d
cmp r9d, 128
- jae .LBB0_180
-# %bb.179:
+ jae .LBB0_49
+# %bb.48:
xor esi, esi
- jmp .LBB0_185
-.LBB0_357:
+ jmp .LBB0_54
+.LBB0_312:
test r9d, r9d
- jle .LBB0_537
-# %bb.358:
+ jle .LBB0_825
+# %bb.313:
+ mov r10d, r9d
+ cmp r9d, 128
+ jae .LBB0_315
+# %bb.314:
+ xor esi, esi
+ jmp .LBB0_320
+.LBB0_179:
+ test r9d, r9d
+ jle .LBB0_825
+# %bb.180:
+ mov r10d, r9d
+ cmp r9d, 128
+ jae .LBB0_182
+# %bb.181:
+ xor esi, esi
+ jmp .LBB0_187
+.LBB0_445:
+ test r9d, r9d
+ jle .LBB0_825
+# %bb.446:
+ mov r10d, r9d
+ cmp r9d, 128
+ jae .LBB0_448
+# %bb.447:
+ xor esi, esi
+ jmp .LBB0_453
+.LBB0_635:
+ test r9d, r9d
+ jle .LBB0_825
+# %bb.636:
mov r10d, r9d
cmp r9d, 32
- jae .LBB0_360
-# %bb.359:
+ jae .LBB0_638
+# %bb.637:
xor esi, esi
- jmp .LBB0_365
-.LBB0_483:
+ jmp .LBB0_643
+.LBB0_771:
test r9d, r9d
- jle .LBB0_537
-# %bb.484:
+ jle .LBB0_825
+# %bb.772:
mov r10d, r9d
cmp r9d, 32
- jae .LBB0_486
-# %bb.485:
+ jae .LBB0_774
+# %bb.773:
xor esi, esi
- jmp .LBB0_491
-.LBB0_98:
+ jmp .LBB0_779
+.LBB0_100:
test r9d, r9d
- jle .LBB0_537
-# %bb.99:
+ jle .LBB0_825
+# %bb.101:
mov r10d, r9d
cmp r9d, 32
- jae .LBB0_101
-# %bb.100:
+ jae .LBB0_103
+# %bb.102:
xor esi, esi
- jmp .LBB0_106
-.LBB0_231:
+ jmp .LBB0_108
+.LBB0_366:
test r9d, r9d
- jle .LBB0_537
-# %bb.232:
+ jle .LBB0_825
+# %bb.367:
mov r10d, r9d
cmp r9d, 32
- jae .LBB0_234
-# %bb.233:
+ jae .LBB0_369
+# %bb.368:
xor esi, esi
- jmp .LBB0_239
-.LBB0_348:
+ jmp .LBB0_374
+.LBB0_233:
+ test r9d, r9d
+ jle .LBB0_825
+# %bb.234:
+ mov r10d, r9d
+ cmp r9d, 32
+ jae .LBB0_236
+# %bb.235:
+ xor esi, esi
+ jmp .LBB0_241
+.LBB0_499:
+ test r9d, r9d
+ jle .LBB0_825
+# %bb.500:
+ mov r10d, r9d
+ cmp r9d, 32
+ jae .LBB0_502
+# %bb.501:
+ xor esi, esi
+ jmp .LBB0_507
+.LBB0_91:
lea rsi, [r8 + 4*r10]
lea rax, [rdx + 4*r10]
cmp rax, r8
@@ -593,73 +1689,1326 @@ arithmetic_avx2: # @arithmetic_avx2
seta dil
xor esi, esi
test r9b, r11b
- jne .LBB0_353
-# %bb.349:
+ jne .LBB0_96
+# %bb.92:
and al, dil
- jne .LBB0_353
-# %bb.350:
+ jne .LBB0_96
+# %bb.93:
mov esi, r10d
and esi, -32
xor edi, edi
-.LBB0_351: # =>This Inner Loop Header: Depth=1
- vmovdqu ymm0, ymmword ptr [rdx + 4*rdi]
- vmovdqu ymm1, ymmword ptr [rdx + 4*rdi + 32]
- vmovdqu ymm2, ymmword ptr [rdx + 4*rdi + 64]
- vmovdqu ymm3, ymmword ptr [rdx + 4*rdi + 96]
- vpsubd ymm0, ymm0, ymmword ptr [rcx + 4*rdi]
- vpsubd ymm1, ymm1, ymmword ptr [rcx + 4*rdi + 32]
- vpsubd ymm2, ymm2, ymmword ptr [rcx + 4*rdi + 64]
- vpsubd ymm3, ymm3, ymmword ptr [rcx + 4*rdi + 96]
+.LBB0_94: # =>This Inner Loop Header: Depth=1
+ vmovdqu ymm0, ymmword ptr [rcx + 4*rdi]
+ vmovdqu ymm1, ymmword ptr [rcx + 4*rdi + 32]
+ vmovdqu ymm2, ymmword ptr [rcx + 4*rdi + 64]
+ vmovdqu ymm3, ymmword ptr [rcx + 4*rdi + 96]
+ vpaddd ymm0, ymm0, ymmword ptr [rdx + 4*rdi]
+ vpaddd ymm1, ymm1, ymmword ptr [rdx + 4*rdi + 32]
+ vpaddd ymm2, ymm2, ymmword ptr [rdx + 4*rdi + 64]
+ vpaddd ymm3, ymm3, ymmword ptr [rdx + 4*rdi + 96]
vmovdqu ymmword ptr [r8 + 4*rdi], ymm0
vmovdqu ymmword ptr [r8 + 4*rdi + 32], ymm1
vmovdqu ymmword ptr [r8 + 4*rdi + 64], ymm2
vmovdqu ymmword ptr [r8 + 4*rdi + 96], ymm3
add rdi, 32
cmp rsi, rdi
- jne .LBB0_351
-# %bb.352:
+ jne .LBB0_94
+# %bb.95:
+ cmp rsi, r10
+ je .LBB0_825
+.LBB0_96:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rax, r10
+ and rax, 3
+ je .LBB0_98
+.LBB0_97: # =>This Inner Loop Header: Depth=1
+ mov edi, dword ptr [rcx + 4*rsi]
+ add edi, dword ptr [rdx + 4*rsi]
+ mov dword ptr [r8 + 4*rsi], edi
+ add rsi, 1
+ add rax, -1
+ jne .LBB0_97
+.LBB0_98:
+ cmp r9, 3
+ jb .LBB0_825
+.LBB0_99: # =>This Inner Loop Header: Depth=1
+ mov eax, dword ptr [rcx + 4*rsi]
+ add eax, dword ptr [rdx + 4*rsi]
+ mov dword ptr [r8 + 4*rsi], eax
+ mov eax, dword ptr [rcx + 4*rsi + 4]
+ add eax, dword ptr [rdx + 4*rsi + 4]
+ mov dword ptr [r8 + 4*rsi + 4], eax
+ mov eax, dword ptr [rcx + 4*rsi + 8]
+ add eax, dword ptr [rdx + 4*rsi + 8]
+ mov dword ptr [r8 + 4*rsi + 8], eax
+ mov eax, dword ptr [rcx + 4*rsi + 12]
+ add eax, dword ptr [rdx + 4*rsi + 12]
+ mov dword ptr [r8 + 4*rsi + 12], eax
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB0_99
+ jmp .LBB0_825
+.LBB0_224:
+ lea rsi, [r8 + 4*r10]
+ lea rax, [rdx + 4*r10]
+ cmp rax, r8
+ seta r9b
+ lea rax, [rcx + 4*r10]
+ cmp rsi, rdx
+ seta r11b
+ cmp rax, r8
+ seta al
+ cmp rsi, rcx
+ seta dil
+ xor esi, esi
+ test r9b, r11b
+ jne .LBB0_229
+# %bb.225:
+ and al, dil
+ jne .LBB0_229
+# %bb.226:
+ mov esi, r10d
+ and esi, -32
+ xor edi, edi
+.LBB0_227: # =>This Inner Loop Header: Depth=1
+ vmovdqu ymm0, ymmword ptr [rcx + 4*rdi]
+ vmovdqu ymm1, ymmword ptr [rcx + 4*rdi + 32]
+ vmovdqu ymm2, ymmword ptr [rcx + 4*rdi + 64]
+ vmovdqu ymm3, ymmword ptr [rcx + 4*rdi + 96]
+ vpaddd ymm0, ymm0, ymmword ptr [rdx + 4*rdi]
+ vpaddd ymm1, ymm1, ymmword ptr [rdx + 4*rdi + 32]
+ vpaddd ymm2, ymm2, ymmword ptr [rdx + 4*rdi + 64]
+ vpaddd ymm3, ymm3, ymmword ptr [rdx + 4*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 4*rdi], ymm0
+ vmovdqu ymmword ptr [r8 + 4*rdi + 32], ymm1
+ vmovdqu ymmword ptr [r8 + 4*rdi + 64], ymm2
+ vmovdqu ymmword ptr [r8 + 4*rdi + 96], ymm3
+ add rdi, 32
+ cmp rsi, rdi
+ jne .LBB0_227
+# %bb.228:
+ cmp rsi, r10
+ je .LBB0_825
+.LBB0_229:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rax, r10
+ and rax, 3
+ je .LBB0_231
+.LBB0_230: # =>This Inner Loop Header: Depth=1
+ mov edi, dword ptr [rcx + 4*rsi]
+ add edi, dword ptr [rdx + 4*rsi]
+ mov dword ptr [r8 + 4*rsi], edi
+ add rsi, 1
+ add rax, -1
+ jne .LBB0_230
+.LBB0_231:
+ cmp r9, 3
+ jb .LBB0_825
+.LBB0_232: # =>This Inner Loop Header: Depth=1
+ mov eax, dword ptr [rcx + 4*rsi]
+ add eax, dword ptr [rdx + 4*rsi]
+ mov dword ptr [r8 + 4*rsi], eax
+ mov eax, dword ptr [rcx + 4*rsi + 4]
+ add eax, dword ptr [rdx + 4*rsi + 4]
+ mov dword ptr [r8 + 4*rsi + 4], eax
+ mov eax, dword ptr [rcx + 4*rsi + 8]
+ add eax, dword ptr [rdx + 4*rsi + 8]
+ mov dword ptr [r8 + 4*rsi + 8], eax
+ mov eax, dword ptr [rcx + 4*rsi + 12]
+ add eax, dword ptr [rdx + 4*rsi + 12]
+ mov dword ptr [r8 + 4*rsi + 12], eax
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB0_232
+ jmp .LBB0_825
+.LBB0_145:
+ lea rsi, [r8 + 8*r10]
+ lea rax, [rdx + 8*r10]
+ cmp rax, r8
+ seta r9b
+ lea rax, [rcx + 8*r10]
+ cmp rsi, rdx
+ seta r11b
+ cmp rax, r8
+ seta al
+ cmp rsi, rcx
+ seta dil
+ xor esi, esi
+ test r9b, r11b
+ jne .LBB0_150
+# %bb.146:
+ and al, dil
+ jne .LBB0_150
+# %bb.147:
+ mov esi, r10d
+ and esi, -16
+ xor edi, edi
+.LBB0_148: # =>This Inner Loop Header: Depth=1
+ vmovupd ymm0, ymmword ptr [rcx + 8*rdi]
+ vmovupd ymm1, ymmword ptr [rcx + 8*rdi + 32]
+ vmovupd ymm2, ymmword ptr [rcx + 8*rdi + 64]
+ vmovupd ymm3, ymmword ptr [rcx + 8*rdi + 96]
+ vaddpd ymm0, ymm0, ymmword ptr [rdx + 8*rdi]
+ vaddpd ymm1, ymm1, ymmword ptr [rdx + 8*rdi + 32]
+ vaddpd ymm2, ymm2, ymmword ptr [rdx + 8*rdi + 64]
+ vaddpd ymm3, ymm3, ymmword ptr [rdx + 8*rdi + 96]
+ vmovupd ymmword ptr [r8 + 8*rdi], ymm0
+ vmovupd ymmword ptr [r8 + 8*rdi + 32], ymm1
+ vmovupd ymmword ptr [r8 + 8*rdi + 64], ymm2
+ vmovupd ymmword ptr [r8 + 8*rdi + 96], ymm3
+ add rdi, 16
+ cmp rsi, rdi
+ jne .LBB0_148
+# %bb.149:
+ cmp rsi, r10
+ je .LBB0_825
+.LBB0_150:
+ mov rdi, rsi
+ not rdi
+ add rdi, r10
+ mov rax, r10
+ and rax, 3
+ je .LBB0_152
+.LBB0_151: # =>This Inner Loop Header: Depth=1
+ vmovsd xmm0, qword ptr [rcx + 8*rsi] # xmm0 = mem[0],zero
+ vaddsd xmm0, xmm0, qword ptr [rdx + 8*rsi]
+ vmovsd qword ptr [r8 + 8*rsi], xmm0
+ add rsi, 1
+ add rax, -1
+ jne .LBB0_151
+.LBB0_152:
+ cmp rdi, 3
+ jb .LBB0_825
+.LBB0_153: # =>This Inner Loop Header: Depth=1
+ vmovsd xmm0, qword ptr [rcx + 8*rsi] # xmm0 = mem[0],zero
+ vaddsd xmm0, xmm0, qword ptr [rdx + 8*rsi]
+ vmovsd qword ptr [r8 + 8*rsi], xmm0
+ vmovsd xmm0, qword ptr [rcx + 8*rsi + 8] # xmm0 = mem[0],zero
+ vaddsd xmm0, xmm0, qword ptr [rdx + 8*rsi + 8]
+ vmovsd qword ptr [r8 + 8*rsi + 8], xmm0
+ vmovsd xmm0, qword ptr [rcx + 8*rsi + 16] # xmm0 = mem[0],zero
+ vaddsd xmm0, xmm0, qword ptr [rdx + 8*rsi + 16]
+ vmovsd qword ptr [r8 + 8*rsi + 16], xmm0
+ vmovsd xmm0, qword ptr [rcx + 8*rsi + 24] # xmm0 = mem[0],zero
+ vaddsd xmm0, xmm0, qword ptr [rdx + 8*rsi + 24]
+ vmovsd qword ptr [r8 + 8*rsi + 24], xmm0
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB0_153
+ jmp .LBB0_825
+.LBB0_278:
+ lea rsi, [r8 + 8*r10]
+ lea rax, [rdx + 8*r10]
+ cmp rax, r8
+ seta r9b
+ lea rax, [rcx + 8*r10]
+ cmp rsi, rdx
+ seta r11b
+ cmp rax, r8
+ seta al
+ cmp rsi, rcx
+ seta dil
+ xor esi, esi
+ test r9b, r11b
+ jne .LBB0_283
+# %bb.279:
+ and al, dil
+ jne .LBB0_283
+# %bb.280:
+ mov esi, r10d
+ and esi, -16
+ xor edi, edi
+.LBB0_281: # =>This Inner Loop Header: Depth=1
+ vmovupd ymm0, ymmword ptr [rcx + 8*rdi]
+ vmovupd ymm1, ymmword ptr [rcx + 8*rdi + 32]
+ vmovupd ymm2, ymmword ptr [rcx + 8*rdi + 64]
+ vmovupd ymm3, ymmword ptr [rcx + 8*rdi + 96]
+ vaddpd ymm0, ymm0, ymmword ptr [rdx + 8*rdi]
+ vaddpd ymm1, ymm1, ymmword ptr [rdx + 8*rdi + 32]
+ vaddpd ymm2, ymm2, ymmword ptr [rdx + 8*rdi + 64]
+ vaddpd ymm3, ymm3, ymmword ptr [rdx + 8*rdi + 96]
+ vmovupd ymmword ptr [r8 + 8*rdi], ymm0
+ vmovupd ymmword ptr [r8 + 8*rdi + 32], ymm1
+ vmovupd ymmword ptr [r8 + 8*rdi + 64], ymm2
+ vmovupd ymmword ptr [r8 + 8*rdi + 96], ymm3
+ add rdi, 16
+ cmp rsi, rdi
+ jne .LBB0_281
+# %bb.282:
+ cmp rsi, r10
+ je .LBB0_825
+.LBB0_283:
+ mov rdi, rsi
+ not rdi
+ add rdi, r10
+ mov rax, r10
+ and rax, 3
+ je .LBB0_285
+.LBB0_284: # =>This Inner Loop Header: Depth=1
+ vmovsd xmm0, qword ptr [rcx + 8*rsi] # xmm0 = mem[0],zero
+ vaddsd xmm0, xmm0, qword ptr [rdx + 8*rsi]
+ vmovsd qword ptr [r8 + 8*rsi], xmm0
+ add rsi, 1
+ add rax, -1
+ jne .LBB0_284
+.LBB0_285:
+ cmp rdi, 3
+ jb .LBB0_825
+.LBB0_286: # =>This Inner Loop Header: Depth=1
+ vmovsd xmm0, qword ptr [rcx + 8*rsi] # xmm0 = mem[0],zero
+ vaddsd xmm0, xmm0, qword ptr [rdx + 8*rsi]
+ vmovsd qword ptr [r8 + 8*rsi], xmm0
+ vmovsd xmm0, qword ptr [rcx + 8*rsi + 8] # xmm0 = mem[0],zero
+ vaddsd xmm0, xmm0, qword ptr [rdx + 8*rsi + 8]
+ vmovsd qword ptr [r8 + 8*rsi + 8], xmm0
+ vmovsd xmm0, qword ptr [rcx + 8*rsi + 16] # xmm0 = mem[0],zero
+ vaddsd xmm0, xmm0, qword ptr [rdx + 8*rsi + 16]
+ vmovsd qword ptr [r8 + 8*rsi + 16], xmm0
+ vmovsd xmm0, qword ptr [rcx + 8*rsi + 24] # xmm0 = mem[0],zero
+ vaddsd xmm0, xmm0, qword ptr [rdx + 8*rsi + 24]
+ vmovsd qword ptr [r8 + 8*rsi + 24], xmm0
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB0_286
+ jmp .LBB0_825
+.LBB0_588:
+ lea rsi, [r8 + r10]
+ lea rax, [rdx + r10]
+ cmp rax, r8
+ seta r9b
+ lea rax, [rcx + r10]
+ cmp rsi, rdx
+ seta r11b
+ cmp rax, r8
+ seta al
+ cmp rsi, rcx
+ seta sil
+ xor edi, edi
+ test r9b, r11b
+ jne .LBB0_598
+# %bb.589:
+ and al, sil
+ jne .LBB0_598
+# %bb.590:
+ mov edi, r10d
+ and edi, -32
+ lea rsi, [rdi - 32]
+ mov rax, rsi
+ shr rax, 5
+ add rax, 1
+ mov r9d, eax
+ and r9d, 3
+ cmp rsi, 96
+ jae .LBB0_592
+# %bb.591:
+ xor esi, esi
+ jmp .LBB0_594
+.LBB0_724:
+ lea rsi, [r8 + r10]
+ lea rax, [rdx + r10]
+ cmp rax, r8
+ seta r9b
+ lea rax, [rcx + r10]
+ cmp rsi, rdx
+ seta r11b
+ cmp rax, r8
+ seta al
+ cmp rsi, rcx
+ seta sil
+ xor edi, edi
+ test r9b, r11b
+ jne .LBB0_734
+# %bb.725:
+ and al, sil
+ jne .LBB0_734
+# %bb.726:
+ mov edi, r10d
+ and edi, -32
+ lea rsi, [rdi - 32]
+ mov rax, rsi
+ shr rax, 5
+ add rax, 1
+ mov r9d, eax
+ and r9d, 3
+ cmp rsi, 96
+ jae .LBB0_728
+# %bb.727:
+ xor esi, esi
+ jmp .LBB0_730
+.LBB0_58:
+ lea rsi, [r8 + r10]
+ lea rax, [rdx + r10]
+ cmp rax, r8
+ seta r9b
+ lea rax, [rcx + r10]
+ cmp rsi, rdx
+ seta r11b
+ cmp rax, r8
+ seta al
+ cmp rsi, rcx
+ seta dil
+ xor esi, esi
+ test r9b, r11b
+ jne .LBB0_63
+# %bb.59:
+ and al, dil
+ jne .LBB0_63
+# %bb.60:
+ mov esi, r10d
+ and esi, -128
+ xor edi, edi
+.LBB0_61: # =>This Inner Loop Header: Depth=1
+ vmovdqu ymm0, ymmword ptr [rcx + rdi]
+ vmovdqu ymm1, ymmword ptr [rcx + rdi + 32]
+ vmovdqu ymm2, ymmword ptr [rcx + rdi + 64]
+ vmovdqu ymm3, ymmword ptr [rcx + rdi + 96]
+ vpaddb ymm0, ymm0, ymmword ptr [rdx + rdi]
+ vpaddb ymm1, ymm1, ymmword ptr [rdx + rdi + 32]
+ vpaddb ymm2, ymm2, ymmword ptr [rdx + rdi + 64]
+ vpaddb ymm3, ymm3, ymmword ptr [rdx + rdi + 96]
+ vmovdqu ymmword ptr [r8 + rdi], ymm0
+ vmovdqu ymmword ptr [r8 + rdi + 32], ymm1
+ vmovdqu ymmword ptr [r8 + rdi + 64], ymm2
+ vmovdqu ymmword ptr [r8 + rdi + 96], ymm3
+ sub rdi, -128
+ cmp rsi, rdi
+ jne .LBB0_61
+# %bb.62:
+ cmp rsi, r10
+ je .LBB0_825
+.LBB0_63:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB0_65
+.LBB0_64: # =>This Inner Loop Header: Depth=1
+ movzx eax, byte ptr [rcx + rsi]
+ add al, byte ptr [rdx + rsi]
+ mov byte ptr [r8 + rsi], al
+ add rsi, 1
+ add rdi, -1
+ jne .LBB0_64
+.LBB0_65:
+ cmp r9, 3
+ jb .LBB0_825
+.LBB0_66: # =>This Inner Loop Header: Depth=1
+ movzx eax, byte ptr [rcx + rsi]
+ add al, byte ptr [rdx + rsi]
+ mov byte ptr [r8 + rsi], al
+ movzx eax, byte ptr [rcx + rsi + 1]
+ add al, byte ptr [rdx + rsi + 1]
+ mov byte ptr [r8 + rsi + 1], al
+ movzx eax, byte ptr [rcx + rsi + 2]
+ add al, byte ptr [rdx + rsi + 2]
+ mov byte ptr [r8 + rsi + 2], al
+ movzx eax, byte ptr [rcx + rsi + 3]
+ add al, byte ptr [rdx + rsi + 3]
+ mov byte ptr [r8 + rsi + 3], al
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB0_66
+ jmp .LBB0_825
+.LBB0_324:
+ lea rsi, [r8 + r10]
+ lea rax, [rdx + r10]
+ cmp rax, r8
+ seta r9b
+ lea rax, [rcx + r10]
+ cmp rsi, rdx
+ seta r11b
+ cmp rax, r8
+ seta al
+ cmp rsi, rcx
+ seta dil
+ xor esi, esi
+ test r9b, r11b
+ jne .LBB0_329
+# %bb.325:
+ and al, dil
+ jne .LBB0_329
+# %bb.326:
+ mov esi, r10d
+ and esi, -128
+ xor edi, edi
+.LBB0_327: # =>This Inner Loop Header: Depth=1
+ vmovdqu ymm0, ymmword ptr [rdx + rdi]
+ vmovdqu ymm1, ymmword ptr [rdx + rdi + 32]
+ vmovdqu ymm2, ymmword ptr [rdx + rdi + 64]
+ vmovdqu ymm3, ymmword ptr [rdx + rdi + 96]
+ vpsubb ymm0, ymm0, ymmword ptr [rcx + rdi]
+ vpsubb ymm1, ymm1, ymmword ptr [rcx + rdi + 32]
+ vpsubb ymm2, ymm2, ymmword ptr [rcx + rdi + 64]
+ vpsubb ymm3, ymm3, ymmword ptr [rcx + rdi + 96]
+ vmovdqu ymmword ptr [r8 + rdi], ymm0
+ vmovdqu ymmword ptr [r8 + rdi + 32], ymm1
+ vmovdqu ymmword ptr [r8 + rdi + 64], ymm2
+ vmovdqu ymmword ptr [r8 + rdi + 96], ymm3
+ sub rdi, -128
+ cmp rsi, rdi
+ jne .LBB0_327
+# %bb.328:
+ cmp rsi, r10
+ je .LBB0_825
+.LBB0_329:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB0_331
+.LBB0_330: # =>This Inner Loop Header: Depth=1
+ movzx eax, byte ptr [rdx + rsi]
+ sub al, byte ptr [rcx + rsi]
+ mov byte ptr [r8 + rsi], al
+ add rsi, 1
+ add rdi, -1
+ jne .LBB0_330
+.LBB0_331:
+ cmp r9, 3
+ jb .LBB0_825
+.LBB0_332: # =>This Inner Loop Header: Depth=1
+ movzx eax, byte ptr [rdx + rsi]
+ sub al, byte ptr [rcx + rsi]
+ mov byte ptr [r8 + rsi], al
+ movzx eax, byte ptr [rdx + rsi + 1]
+ sub al, byte ptr [rcx + rsi + 1]
+ mov byte ptr [r8 + rsi + 1], al
+ movzx eax, byte ptr [rdx + rsi + 2]
+ sub al, byte ptr [rcx + rsi + 2]
+ mov byte ptr [r8 + rsi + 2], al
+ movzx eax, byte ptr [rdx + rsi + 3]
+ sub al, byte ptr [rcx + rsi + 3]
+ mov byte ptr [r8 + rsi + 3], al
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB0_332
+ jmp .LBB0_825
+.LBB0_191:
+ lea rsi, [r8 + r10]
+ lea rax, [rdx + r10]
+ cmp rax, r8
+ seta r9b
+ lea rax, [rcx + r10]
+ cmp rsi, rdx
+ seta r11b
+ cmp rax, r8
+ seta al
+ cmp rsi, rcx
+ seta dil
+ xor esi, esi
+ test r9b, r11b
+ jne .LBB0_196
+# %bb.192:
+ and al, dil
+ jne .LBB0_196
+# %bb.193:
+ mov esi, r10d
+ and esi, -128
+ xor edi, edi
+.LBB0_194: # =>This Inner Loop Header: Depth=1
+ vmovdqu ymm0, ymmword ptr [rcx + rdi]
+ vmovdqu ymm1, ymmword ptr [rcx + rdi + 32]
+ vmovdqu ymm2, ymmword ptr [rcx + rdi + 64]
+ vmovdqu ymm3, ymmword ptr [rcx + rdi + 96]
+ vpaddb ymm0, ymm0, ymmword ptr [rdx + rdi]
+ vpaddb ymm1, ymm1, ymmword ptr [rdx + rdi + 32]
+ vpaddb ymm2, ymm2, ymmword ptr [rdx + rdi + 64]
+ vpaddb ymm3, ymm3, ymmword ptr [rdx + rdi + 96]
+ vmovdqu ymmword ptr [r8 + rdi], ymm0
+ vmovdqu ymmword ptr [r8 + rdi + 32], ymm1
+ vmovdqu ymmword ptr [r8 + rdi + 64], ymm2
+ vmovdqu ymmword ptr [r8 + rdi + 96], ymm3
+ sub rdi, -128
+ cmp rsi, rdi
+ jne .LBB0_194
+# %bb.195:
+ cmp rsi, r10
+ je .LBB0_825
+.LBB0_196:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB0_198
+.LBB0_197: # =>This Inner Loop Header: Depth=1
+ movzx eax, byte ptr [rcx + rsi]
+ add al, byte ptr [rdx + rsi]
+ mov byte ptr [r8 + rsi], al
+ add rsi, 1
+ add rdi, -1
+ jne .LBB0_197
+.LBB0_198:
+ cmp r9, 3
+ jb .LBB0_825
+.LBB0_199: # =>This Inner Loop Header: Depth=1
+ movzx eax, byte ptr [rcx + rsi]
+ add al, byte ptr [rdx + rsi]
+ mov byte ptr [r8 + rsi], al
+ movzx eax, byte ptr [rcx + rsi + 1]
+ add al, byte ptr [rdx + rsi + 1]
+ mov byte ptr [r8 + rsi + 1], al
+ movzx eax, byte ptr [rcx + rsi + 2]
+ add al, byte ptr [rdx + rsi + 2]
+ mov byte ptr [r8 + rsi + 2], al
+ movzx eax, byte ptr [rcx + rsi + 3]
+ add al, byte ptr [rdx + rsi + 3]
+ mov byte ptr [r8 + rsi + 3], al
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB0_199
+ jmp .LBB0_825
+.LBB0_457:
+ lea rsi, [r8 + r10]
+ lea rax, [rdx + r10]
+ cmp rax, r8
+ seta r9b
+ lea rax, [rcx + r10]
+ cmp rsi, rdx
+ seta r11b
+ cmp rax, r8
+ seta al
+ cmp rsi, rcx
+ seta dil
+ xor esi, esi
+ test r9b, r11b
+ jne .LBB0_462
+# %bb.458:
+ and al, dil
+ jne .LBB0_462
+# %bb.459:
+ mov esi, r10d
+ and esi, -128
+ xor edi, edi
+.LBB0_460: # =>This Inner Loop Header: Depth=1
+ vmovdqu ymm0, ymmword ptr [rdx + rdi]
+ vmovdqu ymm1, ymmword ptr [rdx + rdi + 32]
+ vmovdqu ymm2, ymmword ptr [rdx + rdi + 64]
+ vmovdqu ymm3, ymmword ptr [rdx + rdi + 96]
+ vpsubb ymm0, ymm0, ymmword ptr [rcx + rdi]
+ vpsubb ymm1, ymm1, ymmword ptr [rcx + rdi + 32]
+ vpsubb ymm2, ymm2, ymmword ptr [rcx + rdi + 64]
+ vpsubb ymm3, ymm3, ymmword ptr [rcx + rdi + 96]
+ vmovdqu ymmword ptr [r8 + rdi], ymm0
+ vmovdqu ymmword ptr [r8 + rdi + 32], ymm1
+ vmovdqu ymmword ptr [r8 + rdi + 64], ymm2
+ vmovdqu ymmword ptr [r8 + rdi + 96], ymm3
+ sub rdi, -128
+ cmp rsi, rdi
+ jne .LBB0_460
+# %bb.461:
+ cmp rsi, r10
+ je .LBB0_825
+.LBB0_462:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB0_464
+.LBB0_463: # =>This Inner Loop Header: Depth=1
+ movzx eax, byte ptr [rdx + rsi]
+ sub al, byte ptr [rcx + rsi]
+ mov byte ptr [r8 + rsi], al
+ add rsi, 1
+ add rdi, -1
+ jne .LBB0_463
+.LBB0_464:
+ cmp r9, 3
+ jb .LBB0_825
+.LBB0_465: # =>This Inner Loop Header: Depth=1
+ movzx eax, byte ptr [rdx + rsi]
+ sub al, byte ptr [rcx + rsi]
+ mov byte ptr [r8 + rsi], al
+ movzx eax, byte ptr [rdx + rsi + 1]
+ sub al, byte ptr [rcx + rsi + 1]
+ mov byte ptr [r8 + rsi + 1], al
+ movzx eax, byte ptr [rdx + rsi + 2]
+ sub al, byte ptr [rcx + rsi + 2]
+ mov byte ptr [r8 + rsi + 2], al
+ movzx eax, byte ptr [rdx + rsi + 3]
+ sub al, byte ptr [rcx + rsi + 3]
+ mov byte ptr [r8 + rsi + 3], al
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB0_465
+ jmp .LBB0_825
+.LBB0_647:
+ lea rsi, [r8 + 8*r10]
+ lea rax, [rdx + 8*r10]
+ cmp rax, r8
+ seta r9b
+ lea rax, [rcx + 8*r10]
+ cmp rsi, rdx
+ seta r11b
+ cmp rax, r8
+ seta al
+ cmp rsi, rcx
+ seta dil
+ xor esi, esi
+ test r9b, r11b
+ jne .LBB0_652
+# %bb.648:
+ and al, dil
+ jne .LBB0_652
+# %bb.649:
+ mov esi, r10d
+ and esi, -16
+ xor edi, edi
+.LBB0_650: # =>This Inner Loop Header: Depth=1
+ vmovdqu ymm1, ymmword ptr [rdx + 8*rdi]
+ vmovdqu ymm2, ymmword ptr [rdx + 8*rdi + 32]
+ vmovdqu ymm3, ymmword ptr [rdx + 8*rdi + 64]
+ vmovdqu ymm0, ymmword ptr [rdx + 8*rdi + 96]
+ vmovdqu ymm4, ymmword ptr [rcx + 8*rdi]
+ vmovdqu ymm5, ymmword ptr [rcx + 8*rdi + 32]
+ vmovdqu ymm6, ymmword ptr [rcx + 8*rdi + 64]
+ vmovdqu ymm7, ymmword ptr [rcx + 8*rdi + 96]
+ vpsrlq ymm8, ymm4, 32
+ vpmuludq ymm8, ymm8, ymm1
+ vpsrlq ymm9, ymm1, 32
+ vpmuludq ymm9, ymm9, ymm4
+ vpaddq ymm8, ymm9, ymm8
+ vpsllq ymm8, ymm8, 32
+ vpmuludq ymm1, ymm4, ymm1
+ vpaddq ymm1, ymm8, ymm1
+ vpsrlq ymm4, ymm5, 32
+ vpmuludq ymm4, ymm4, ymm2
+ vpsrlq ymm8, ymm2, 32
+ vpmuludq ymm8, ymm8, ymm5
+ vpaddq ymm4, ymm8, ymm4
+ vpsllq ymm4, ymm4, 32
+ vpmuludq ymm2, ymm5, ymm2
+ vpaddq ymm2, ymm2, ymm4
+ vpsrlq ymm4, ymm6, 32
+ vpmuludq ymm4, ymm4, ymm3
+ vpsrlq ymm5, ymm3, 32
+ vpmuludq ymm5, ymm6, ymm5
+ vpaddq ymm4, ymm5, ymm4
+ vpsllq ymm4, ymm4, 32
+ vpmuludq ymm3, ymm6, ymm3
+ vpaddq ymm3, ymm3, ymm4
+ vpsrlq ymm4, ymm7, 32
+ vpmuludq ymm4, ymm4, ymm0
+ vpsrlq ymm5, ymm0, 32
+ vpmuludq ymm5, ymm7, ymm5
+ vpaddq ymm4, ymm5, ymm4
+ vpsllq ymm4, ymm4, 32
+ vpmuludq ymm0, ymm7, ymm0
+ vpaddq ymm0, ymm0, ymm4
+ vmovdqu ymmword ptr [r8 + 8*rdi], ymm1
+ vmovdqu ymmword ptr [r8 + 8*rdi + 32], ymm2
+ vmovdqu ymmword ptr [r8 + 8*rdi + 64], ymm3
+ vmovdqu ymmword ptr [r8 + 8*rdi + 96], ymm0
+ add rdi, 16
+ cmp rsi, rdi
+ jne .LBB0_650
+# %bb.651:
+ cmp rsi, r10
+ je .LBB0_825
+.LBB0_652:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rax, r10
+ and rax, 3
+ je .LBB0_654
+.LBB0_653: # =>This Inner Loop Header: Depth=1
+ mov rdi, qword ptr [rcx + 8*rsi]
+ imul rdi, qword ptr [rdx + 8*rsi]
+ mov qword ptr [r8 + 8*rsi], rdi
+ add rsi, 1
+ add rax, -1
+ jne .LBB0_653
+.LBB0_654:
+ cmp r9, 3
+ jb .LBB0_825
+.LBB0_655: # =>This Inner Loop Header: Depth=1
+ mov rax, qword ptr [rcx + 8*rsi]
+ imul rax, qword ptr [rdx + 8*rsi]
+ mov qword ptr [r8 + 8*rsi], rax
+ mov rax, qword ptr [rcx + 8*rsi + 8]
+ imul rax, qword ptr [rdx + 8*rsi + 8]
+ mov qword ptr [r8 + 8*rsi + 8], rax
+ mov rax, qword ptr [rcx + 8*rsi + 16]
+ imul rax, qword ptr [rdx + 8*rsi + 16]
+ mov qword ptr [r8 + 8*rsi + 16], rax
+ mov rax, qword ptr [rcx + 8*rsi + 24]
+ imul rax, qword ptr [rdx + 8*rsi + 24]
+ mov qword ptr [r8 + 8*rsi + 24], rax
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB0_655
+ jmp .LBB0_825
+.LBB0_783:
+ lea rsi, [r8 + 8*r10]
+ lea rax, [rdx + 8*r10]
+ cmp rax, r8
+ seta r9b
+ lea rax, [rcx + 8*r10]
+ cmp rsi, rdx
+ seta r11b
+ cmp rax, r8
+ seta al
+ cmp rsi, rcx
+ seta dil
+ xor esi, esi
+ test r9b, r11b
+ jne .LBB0_788
+# %bb.784:
+ and al, dil
+ jne .LBB0_788
+# %bb.785:
+ mov esi, r10d
+ and esi, -16
+ xor edi, edi
+.LBB0_786: # =>This Inner Loop Header: Depth=1
+ vmovdqu ymm1, ymmword ptr [rdx + 8*rdi]
+ vmovdqu ymm2, ymmword ptr [rdx + 8*rdi + 32]
+ vmovdqu ymm3, ymmword ptr [rdx + 8*rdi + 64]
+ vmovdqu ymm0, ymmword ptr [rdx + 8*rdi + 96]
+ vmovdqu ymm4, ymmword ptr [rcx + 8*rdi]
+ vmovdqu ymm5, ymmword ptr [rcx + 8*rdi + 32]
+ vmovdqu ymm6, ymmword ptr [rcx + 8*rdi + 64]
+ vmovdqu ymm7, ymmword ptr [rcx + 8*rdi + 96]
+ vpsrlq ymm8, ymm4, 32
+ vpmuludq ymm8, ymm8, ymm1
+ vpsrlq ymm9, ymm1, 32
+ vpmuludq ymm9, ymm9, ymm4
+ vpaddq ymm8, ymm9, ymm8
+ vpsllq ymm8, ymm8, 32
+ vpmuludq ymm1, ymm4, ymm1
+ vpaddq ymm1, ymm8, ymm1
+ vpsrlq ymm4, ymm5, 32
+ vpmuludq ymm4, ymm4, ymm2
+ vpsrlq ymm8, ymm2, 32
+ vpmuludq ymm8, ymm8, ymm5
+ vpaddq ymm4, ymm8, ymm4
+ vpsllq ymm4, ymm4, 32
+ vpmuludq ymm2, ymm5, ymm2
+ vpaddq ymm2, ymm2, ymm4
+ vpsrlq ymm4, ymm6, 32
+ vpmuludq ymm4, ymm4, ymm3
+ vpsrlq ymm5, ymm3, 32
+ vpmuludq ymm5, ymm6, ymm5
+ vpaddq ymm4, ymm5, ymm4
+ vpsllq ymm4, ymm4, 32
+ vpmuludq ymm3, ymm6, ymm3
+ vpaddq ymm3, ymm3, ymm4
+ vpsrlq ymm4, ymm7, 32
+ vpmuludq ymm4, ymm4, ymm0
+ vpsrlq ymm5, ymm0, 32
+ vpmuludq ymm5, ymm7, ymm5
+ vpaddq ymm4, ymm5, ymm4
+ vpsllq ymm4, ymm4, 32
+ vpmuludq ymm0, ymm7, ymm0
+ vpaddq ymm0, ymm0, ymm4
+ vmovdqu ymmword ptr [r8 + 8*rdi], ymm1
+ vmovdqu ymmword ptr [r8 + 8*rdi + 32], ymm2
+ vmovdqu ymmword ptr [r8 + 8*rdi + 64], ymm3
+ vmovdqu ymmword ptr [r8 + 8*rdi + 96], ymm0
+ add rdi, 16
+ cmp rsi, rdi
+ jne .LBB0_786
+# %bb.787:
+ cmp rsi, r10
+ je .LBB0_825
+.LBB0_788:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rax, r10
+ and rax, 3
+ je .LBB0_790
+.LBB0_789: # =>This Inner Loop Header: Depth=1
+ mov rdi, qword ptr [rcx + 8*rsi]
+ imul rdi, qword ptr [rdx + 8*rsi]
+ mov qword ptr [r8 + 8*rsi], rdi
+ add rsi, 1
+ add rax, -1
+ jne .LBB0_789
+.LBB0_790:
+ cmp r9, 3
+ jb .LBB0_825
+.LBB0_791: # =>This Inner Loop Header: Depth=1
+ mov rax, qword ptr [rcx + 8*rsi]
+ imul rax, qword ptr [rdx + 8*rsi]
+ mov qword ptr [r8 + 8*rsi], rax
+ mov rax, qword ptr [rcx + 8*rsi + 8]
+ imul rax, qword ptr [rdx + 8*rsi + 8]
+ mov qword ptr [r8 + 8*rsi + 8], rax
+ mov rax, qword ptr [rcx + 8*rsi + 16]
+ imul rax, qword ptr [rdx + 8*rsi + 16]
+ mov qword ptr [r8 + 8*rsi + 16], rax
+ mov rax, qword ptr [rcx + 8*rsi + 24]
+ imul rax, qword ptr [rdx + 8*rsi + 24]
+ mov qword ptr [r8 + 8*rsi + 24], rax
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB0_791
+ jmp .LBB0_825
+.LBB0_112:
+ lea rsi, [r8 + 8*r10]
+ lea rax, [rdx + 8*r10]
+ cmp rax, r8
+ seta r9b
+ lea rax, [rcx + 8*r10]
+ cmp rsi, rdx
+ seta r11b
+ cmp rax, r8
+ seta al
+ cmp rsi, rcx
+ seta dil
+ xor esi, esi
+ test r9b, r11b
+ jne .LBB0_117
+# %bb.113:
+ and al, dil
+ jne .LBB0_117
+# %bb.114:
+ mov esi, r10d
+ and esi, -16
+ xor edi, edi
+.LBB0_115: # =>This Inner Loop Header: Depth=1
+ vmovdqu ymm0, ymmword ptr [rcx + 8*rdi]
+ vmovdqu ymm1, ymmword ptr [rcx + 8*rdi + 32]
+ vmovdqu ymm2, ymmword ptr [rcx + 8*rdi + 64]
+ vmovdqu ymm3, ymmword ptr [rcx + 8*rdi + 96]
+ vpaddq ymm0, ymm0, ymmword ptr [rdx + 8*rdi]
+ vpaddq ymm1, ymm1, ymmword ptr [rdx + 8*rdi + 32]
+ vpaddq ymm2, ymm2, ymmword ptr [rdx + 8*rdi + 64]
+ vpaddq ymm3, ymm3, ymmword ptr [rdx + 8*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 8*rdi], ymm0
+ vmovdqu ymmword ptr [r8 + 8*rdi + 32], ymm1
+ vmovdqu ymmword ptr [r8 + 8*rdi + 64], ymm2
+ vmovdqu ymmword ptr [r8 + 8*rdi + 96], ymm3
+ add rdi, 16
+ cmp rsi, rdi
+ jne .LBB0_115
+# %bb.116:
+ cmp rsi, r10
+ je .LBB0_825
+.LBB0_117:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rax, r10
+ and rax, 3
+ je .LBB0_119
+.LBB0_118: # =>This Inner Loop Header: Depth=1
+ mov rdi, qword ptr [rcx + 8*rsi]
+ add rdi, qword ptr [rdx + 8*rsi]
+ mov qword ptr [r8 + 8*rsi], rdi
+ add rsi, 1
+ add rax, -1
+ jne .LBB0_118
+.LBB0_119:
+ cmp r9, 3
+ jb .LBB0_825
+.LBB0_120: # =>This Inner Loop Header: Depth=1
+ mov rax, qword ptr [rcx + 8*rsi]
+ add rax, qword ptr [rdx + 8*rsi]
+ mov qword ptr [r8 + 8*rsi], rax
+ mov rax, qword ptr [rcx + 8*rsi + 8]
+ add rax, qword ptr [rdx + 8*rsi + 8]
+ mov qword ptr [r8 + 8*rsi + 8], rax
+ mov rax, qword ptr [rcx + 8*rsi + 16]
+ add rax, qword ptr [rdx + 8*rsi + 16]
+ mov qword ptr [r8 + 8*rsi + 16], rax
+ mov rax, qword ptr [rcx + 8*rsi + 24]
+ add rax, qword ptr [rdx + 8*rsi + 24]
+ mov qword ptr [r8 + 8*rsi + 24], rax
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB0_120
+ jmp .LBB0_825
+.LBB0_378:
+ lea rsi, [r8 + 8*r10]
+ lea rax, [rdx + 8*r10]
+ cmp rax, r8
+ seta r9b
+ lea rax, [rcx + 8*r10]
+ cmp rsi, rdx
+ seta r11b
+ cmp rax, r8
+ seta al
+ cmp rsi, rcx
+ seta dil
+ xor esi, esi
+ test r9b, r11b
+ jne .LBB0_383
+# %bb.379:
+ and al, dil
+ jne .LBB0_383
+# %bb.380:
+ mov esi, r10d
+ and esi, -16
+ xor edi, edi
+.LBB0_381: # =>This Inner Loop Header: Depth=1
+ vmovdqu ymm0, ymmword ptr [rdx + 8*rdi]
+ vmovdqu ymm1, ymmword ptr [rdx + 8*rdi + 32]
+ vmovdqu ymm2, ymmword ptr [rdx + 8*rdi + 64]
+ vmovdqu ymm3, ymmword ptr [rdx + 8*rdi + 96]
+ vpsubq ymm0, ymm0, ymmword ptr [rcx + 8*rdi]
+ vpsubq ymm1, ymm1, ymmword ptr [rcx + 8*rdi + 32]
+ vpsubq ymm2, ymm2, ymmword ptr [rcx + 8*rdi + 64]
+ vpsubq ymm3, ymm3, ymmword ptr [rcx + 8*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 8*rdi], ymm0
+ vmovdqu ymmword ptr [r8 + 8*rdi + 32], ymm1
+ vmovdqu ymmword ptr [r8 + 8*rdi + 64], ymm2
+ vmovdqu ymmword ptr [r8 + 8*rdi + 96], ymm3
+ add rdi, 16
+ cmp rsi, rdi
+ jne .LBB0_381
+# %bb.382:
+ cmp rsi, r10
+ je .LBB0_825
+.LBB0_383:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rax, r10
+ and rax, 3
+ je .LBB0_385
+.LBB0_384: # =>This Inner Loop Header: Depth=1
+ mov rdi, qword ptr [rdx + 8*rsi]
+ sub rdi, qword ptr [rcx + 8*rsi]
+ mov qword ptr [r8 + 8*rsi], rdi
+ add rsi, 1
+ add rax, -1
+ jne .LBB0_384
+.LBB0_385:
+ cmp r9, 3
+ jb .LBB0_825
+.LBB0_386: # =>This Inner Loop Header: Depth=1
+ mov rax, qword ptr [rdx + 8*rsi]
+ sub rax, qword ptr [rcx + 8*rsi]
+ mov qword ptr [r8 + 8*rsi], rax
+ mov rax, qword ptr [rdx + 8*rsi + 8]
+ sub rax, qword ptr [rcx + 8*rsi + 8]
+ mov qword ptr [r8 + 8*rsi + 8], rax
+ mov rax, qword ptr [rdx + 8*rsi + 16]
+ sub rax, qword ptr [rcx + 8*rsi + 16]
+ mov qword ptr [r8 + 8*rsi + 16], rax
+ mov rax, qword ptr [rdx + 8*rsi + 24]
+ sub rax, qword ptr [rcx + 8*rsi + 24]
+ mov qword ptr [r8 + 8*rsi + 24], rax
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB0_386
+ jmp .LBB0_825
+.LBB0_245:
+ lea rsi, [r8 + 8*r10]
+ lea rax, [rdx + 8*r10]
+ cmp rax, r8
+ seta r9b
+ lea rax, [rcx + 8*r10]
+ cmp rsi, rdx
+ seta r11b
+ cmp rax, r8
+ seta al
+ cmp rsi, rcx
+ seta dil
+ xor esi, esi
+ test r9b, r11b
+ jne .LBB0_250
+# %bb.246:
+ and al, dil
+ jne .LBB0_250
+# %bb.247:
+ mov esi, r10d
+ and esi, -16
+ xor edi, edi
+.LBB0_248: # =>This Inner Loop Header: Depth=1
+ vmovdqu ymm0, ymmword ptr [rcx + 8*rdi]
+ vmovdqu ymm1, ymmword ptr [rcx + 8*rdi + 32]
+ vmovdqu ymm2, ymmword ptr [rcx + 8*rdi + 64]
+ vmovdqu ymm3, ymmword ptr [rcx + 8*rdi + 96]
+ vpaddq ymm0, ymm0, ymmword ptr [rdx + 8*rdi]
+ vpaddq ymm1, ymm1, ymmword ptr [rdx + 8*rdi + 32]
+ vpaddq ymm2, ymm2, ymmword ptr [rdx + 8*rdi + 64]
+ vpaddq ymm3, ymm3, ymmword ptr [rdx + 8*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 8*rdi], ymm0
+ vmovdqu ymmword ptr [r8 + 8*rdi + 32], ymm1
+ vmovdqu ymmword ptr [r8 + 8*rdi + 64], ymm2
+ vmovdqu ymmword ptr [r8 + 8*rdi + 96], ymm3
+ add rdi, 16
+ cmp rsi, rdi
+ jne .LBB0_248
+# %bb.249:
+ cmp rsi, r10
+ je .LBB0_825
+.LBB0_250:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rax, r10
+ and rax, 3
+ je .LBB0_252
+.LBB0_251: # =>This Inner Loop Header: Depth=1
+ mov rdi, qword ptr [rcx + 8*rsi]
+ add rdi, qword ptr [rdx + 8*rsi]
+ mov qword ptr [r8 + 8*rsi], rdi
+ add rsi, 1
+ add rax, -1
+ jne .LBB0_251
+.LBB0_252:
+ cmp r9, 3
+ jb .LBB0_825
+.LBB0_253: # =>This Inner Loop Header: Depth=1
+ mov rax, qword ptr [rcx + 8*rsi]
+ add rax, qword ptr [rdx + 8*rsi]
+ mov qword ptr [r8 + 8*rsi], rax
+ mov rax, qword ptr [rcx + 8*rsi + 8]
+ add rax, qword ptr [rdx + 8*rsi + 8]
+ mov qword ptr [r8 + 8*rsi + 8], rax
+ mov rax, qword ptr [rcx + 8*rsi + 16]
+ add rax, qword ptr [rdx + 8*rsi + 16]
+ mov qword ptr [r8 + 8*rsi + 16], rax
+ mov rax, qword ptr [rcx + 8*rsi + 24]
+ add rax, qword ptr [rdx + 8*rsi + 24]
+ mov qword ptr [r8 + 8*rsi + 24], rax
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB0_253
+ jmp .LBB0_825
+.LBB0_511:
+ lea rsi, [r8 + 8*r10]
+ lea rax, [rdx + 8*r10]
+ cmp rax, r8
+ seta r9b
+ lea rax, [rcx + 8*r10]
+ cmp rsi, rdx
+ seta r11b
+ cmp rax, r8
+ seta al
+ cmp rsi, rcx
+ seta dil
+ xor esi, esi
+ test r9b, r11b
+ jne .LBB0_516
+# %bb.512:
+ and al, dil
+ jne .LBB0_516
+# %bb.513:
+ mov esi, r10d
+ and esi, -16
+ xor edi, edi
+.LBB0_514: # =>This Inner Loop Header: Depth=1
+ vmovdqu ymm0, ymmword ptr [rdx + 8*rdi]
+ vmovdqu ymm1, ymmword ptr [rdx + 8*rdi + 32]
+ vmovdqu ymm2, ymmword ptr [rdx + 8*rdi + 64]
+ vmovdqu ymm3, ymmword ptr [rdx + 8*rdi + 96]
+ vpsubq ymm0, ymm0, ymmword ptr [rcx + 8*rdi]
+ vpsubq ymm1, ymm1, ymmword ptr [rcx + 8*rdi + 32]
+ vpsubq ymm2, ymm2, ymmword ptr [rcx + 8*rdi + 64]
+ vpsubq ymm3, ymm3, ymmword ptr [rcx + 8*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 8*rdi], ymm0
+ vmovdqu ymmword ptr [r8 + 8*rdi + 32], ymm1
+ vmovdqu ymmword ptr [r8 + 8*rdi + 64], ymm2
+ vmovdqu ymmword ptr [r8 + 8*rdi + 96], ymm3
+ add rdi, 16
+ cmp rsi, rdi
+ jne .LBB0_514
+# %bb.515:
+ cmp rsi, r10
+ je .LBB0_825
+.LBB0_516:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rax, r10
+ and rax, 3
+ je .LBB0_518
+.LBB0_517: # =>This Inner Loop Header: Depth=1
+ mov rdi, qword ptr [rdx + 8*rsi]
+ sub rdi, qword ptr [rcx + 8*rsi]
+ mov qword ptr [r8 + 8*rsi], rdi
+ add rsi, 1
+ add rax, -1
+ jne .LBB0_517
+.LBB0_518:
+ cmp r9, 3
+ jb .LBB0_825
+.LBB0_519: # =>This Inner Loop Header: Depth=1
+ mov rax, qword ptr [rdx + 8*rsi]
+ sub rax, qword ptr [rcx + 8*rsi]
+ mov qword ptr [r8 + 8*rsi], rax
+ mov rax, qword ptr [rdx + 8*rsi + 8]
+ sub rax, qword ptr [rcx + 8*rsi + 8]
+ mov qword ptr [r8 + 8*rsi + 8], rax
+ mov rax, qword ptr [rdx + 8*rsi + 16]
+ sub rax, qword ptr [rcx + 8*rsi + 16]
+ mov qword ptr [r8 + 8*rsi + 16], rax
+ mov rax, qword ptr [rdx + 8*rsi + 24]
+ sub rax, qword ptr [rcx + 8*rsi + 24]
+ mov qword ptr [r8 + 8*rsi + 24], rax
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB0_519
+ jmp .LBB0_825
+.LBB0_70:
+ lea rsi, [r8 + 2*r10]
+ lea rax, [rdx + 2*r10]
+ cmp rax, r8
+ seta r9b
+ lea rax, [rcx + 2*r10]
+ cmp rsi, rdx
+ seta r11b
+ cmp rax, r8
+ seta al
+ cmp rsi, rcx
+ seta dil
+ xor esi, esi
+ test r9b, r11b
+ jne .LBB0_75
+# %bb.71:
+ and al, dil
+ jne .LBB0_75
+# %bb.72:
+ mov esi, r10d
+ and esi, -64
+ xor edi, edi
+.LBB0_73: # =>This Inner Loop Header: Depth=1
+ vmovdqu ymm0, ymmword ptr [rcx + 2*rdi]
+ vmovdqu ymm1, ymmword ptr [rcx + 2*rdi + 32]
+ vmovdqu ymm2, ymmword ptr [rcx + 2*rdi + 64]
+ vmovdqu ymm3, ymmword ptr [rcx + 2*rdi + 96]
+ vpaddw ymm0, ymm0, ymmword ptr [rdx + 2*rdi]
+ vpaddw ymm1, ymm1, ymmword ptr [rdx + 2*rdi + 32]
+ vpaddw ymm2, ymm2, ymmword ptr [rdx + 2*rdi + 64]
+ vpaddw ymm3, ymm3, ymmword ptr [rdx + 2*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 2*rdi], ymm0
+ vmovdqu ymmword ptr [r8 + 2*rdi + 32], ymm1
+ vmovdqu ymmword ptr [r8 + 2*rdi + 64], ymm2
+ vmovdqu ymmword ptr [r8 + 2*rdi + 96], ymm3
+ add rdi, 64
+ cmp rsi, rdi
+ jne .LBB0_73
+# %bb.74:
+ cmp rsi, r10
+ je .LBB0_825
+.LBB0_75:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rax, r10
+ and rax, 3
+ je .LBB0_77
+.LBB0_76: # =>This Inner Loop Header: Depth=1
+ movzx edi, word ptr [rcx + 2*rsi]
+ add di, word ptr [rdx + 2*rsi]
+ mov word ptr [r8 + 2*rsi], di
+ add rsi, 1
+ add rax, -1
+ jne .LBB0_76
+.LBB0_77:
+ cmp r9, 3
+ jb .LBB0_825
+.LBB0_78: # =>This Inner Loop Header: Depth=1
+ movzx eax, word ptr [rcx + 2*rsi]
+ add ax, word ptr [rdx + 2*rsi]
+ mov word ptr [r8 + 2*rsi], ax
+ movzx eax, word ptr [rcx + 2*rsi + 2]
+ add ax, word ptr [rdx + 2*rsi + 2]
+ mov word ptr [r8 + 2*rsi + 2], ax
+ movzx eax, word ptr [rcx + 2*rsi + 4]
+ add ax, word ptr [rdx + 2*rsi + 4]
+ mov word ptr [r8 + 2*rsi + 4], ax
+ movzx eax, word ptr [rcx + 2*rsi + 6]
+ add ax, word ptr [rdx + 2*rsi + 6]
+ mov word ptr [r8 + 2*rsi + 6], ax
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB0_78
+ jmp .LBB0_825
+.LBB0_82:
+ lea rsi, [r8 + 2*r10]
+ lea rax, [rdx + 2*r10]
+ cmp rax, r8
+ seta r9b
+ lea rax, [rcx + 2*r10]
+ cmp rsi, rdx
+ seta r11b
+ cmp rax, r8
+ seta al
+ cmp rsi, rcx
+ seta dil
+ xor esi, esi
+ test r9b, r11b
+ jne .LBB0_87
+# %bb.83:
+ and al, dil
+ jne .LBB0_87
+# %bb.84:
+ mov esi, r10d
+ and esi, -64
+ xor edi, edi
+.LBB0_85: # =>This Inner Loop Header: Depth=1
+ vmovdqu ymm0, ymmword ptr [rcx + 2*rdi]
+ vmovdqu ymm1, ymmword ptr [rcx + 2*rdi + 32]
+ vmovdqu ymm2, ymmword ptr [rcx + 2*rdi + 64]
+ vmovdqu ymm3, ymmword ptr [rcx + 2*rdi + 96]
+ vpaddw ymm0, ymm0, ymmword ptr [rdx + 2*rdi]
+ vpaddw ymm1, ymm1, ymmword ptr [rdx + 2*rdi + 32]
+ vpaddw ymm2, ymm2, ymmword ptr [rdx + 2*rdi + 64]
+ vpaddw ymm3, ymm3, ymmword ptr [rdx + 2*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 2*rdi], ymm0
+ vmovdqu ymmword ptr [r8 + 2*rdi + 32], ymm1
+ vmovdqu ymmword ptr [r8 + 2*rdi + 64], ymm2
+ vmovdqu ymmword ptr [r8 + 2*rdi + 96], ymm3
+ add rdi, 64
+ cmp rsi, rdi
+ jne .LBB0_85
+# %bb.86:
cmp rsi, r10
- je .LBB0_537
-.LBB0_353:
+ je .LBB0_825
+.LBB0_87:
mov r9, rsi
not r9
add r9, r10
mov rax, r10
and rax, 3
- je .LBB0_355
-.LBB0_354: # =>This Inner Loop Header: Depth=1
- mov edi, dword ptr [rdx + 4*rsi]
- sub edi, dword ptr [rcx + 4*rsi]
- mov dword ptr [r8 + 4*rsi], edi
+ je .LBB0_89
+.LBB0_88: # =>This Inner Loop Header: Depth=1
+ movzx edi, word ptr [rcx + 2*rsi]
+ add di, word ptr [rdx + 2*rsi]
+ mov word ptr [r8 + 2*rsi], di
add rsi, 1
add rax, -1
- jne .LBB0_354
-.LBB0_355:
+ jne .LBB0_88
+.LBB0_89:
cmp r9, 3
- jb .LBB0_537
-.LBB0_356: # =>This Inner Loop Header: Depth=1
- mov eax, dword ptr [rdx + 4*rsi]
- sub eax, dword ptr [rcx + 4*rsi]
- mov dword ptr [r8 + 4*rsi], eax
- mov eax, dword ptr [rdx + 4*rsi + 4]
- sub eax, dword ptr [rcx + 4*rsi + 4]
- mov dword ptr [r8 + 4*rsi + 4], eax
- mov eax, dword ptr [rdx + 4*rsi + 8]
- sub eax, dword ptr [rcx + 4*rsi + 8]
- mov dword ptr [r8 + 4*rsi + 8], eax
- mov eax, dword ptr [rdx + 4*rsi + 12]
- sub eax, dword ptr [rcx + 4*rsi + 12]
- mov dword ptr [r8 + 4*rsi + 12], eax
+ jb .LBB0_825
+.LBB0_90: # =>This Inner Loop Header: Depth=1
+ movzx eax, word ptr [rcx + 2*rsi]
+ add ax, word ptr [rdx + 2*rsi]
+ mov word ptr [r8 + 2*rsi], ax
+ movzx eax, word ptr [rcx + 2*rsi + 2]
+ add ax, word ptr [rdx + 2*rsi + 2]
+ mov word ptr [r8 + 2*rsi + 2], ax
+ movzx eax, word ptr [rcx + 2*rsi + 4]
+ add ax, word ptr [rdx + 2*rsi + 4]
+ mov word ptr [r8 + 2*rsi + 4], ax
+ movzx eax, word ptr [rcx + 2*rsi + 6]
+ add ax, word ptr [rdx + 2*rsi + 6]
+ mov word ptr [r8 + 2*rsi + 6], ax
add rsi, 4
cmp r10, rsi
- jne .LBB0_356
- jmp .LBB0_537
-.LBB0_474:
- lea rsi, [r8 + 4*r10]
- lea rax, [rdx + 4*r10]
+ jne .LBB0_90
+ jmp .LBB0_825
+.LBB0_203:
+ lea rsi, [r8 + 2*r10]
+ lea rax, [rdx + 2*r10]
cmp rax, r8
seta r9b
- lea rax, [rcx + 4*r10]
+ lea rax, [rcx + 2*r10]
cmp rsi, rdx
seta r11b
cmp rax, r8
@@ -668,73 +3017,73 @@ arithmetic_avx2: # @arithmetic_avx2
seta dil
xor esi, esi
test r9b, r11b
- jne .LBB0_479
-# %bb.475:
+ jne .LBB0_208
+# %bb.204:
and al, dil
- jne .LBB0_479
-# %bb.476:
+ jne .LBB0_208
+# %bb.205:
mov esi, r10d
- and esi, -32
+ and esi, -64
xor edi, edi
-.LBB0_477: # =>This Inner Loop Header: Depth=1
- vmovdqu ymm0, ymmword ptr [rdx + 4*rdi]
- vmovdqu ymm1, ymmword ptr [rdx + 4*rdi + 32]
- vmovdqu ymm2, ymmword ptr [rdx + 4*rdi + 64]
- vmovdqu ymm3, ymmword ptr [rdx + 4*rdi + 96]
- vpsubd ymm0, ymm0, ymmword ptr [rcx + 4*rdi]
- vpsubd ymm1, ymm1, ymmword ptr [rcx + 4*rdi + 32]
- vpsubd ymm2, ymm2, ymmword ptr [rcx + 4*rdi + 64]
- vpsubd ymm3, ymm3, ymmword ptr [rcx + 4*rdi + 96]
- vmovdqu ymmword ptr [r8 + 4*rdi], ymm0
- vmovdqu ymmword ptr [r8 + 4*rdi + 32], ymm1
- vmovdqu ymmword ptr [r8 + 4*rdi + 64], ymm2
- vmovdqu ymmword ptr [r8 + 4*rdi + 96], ymm3
- add rdi, 32
+.LBB0_206: # =>This Inner Loop Header: Depth=1
+ vmovdqu ymm0, ymmword ptr [rcx + 2*rdi]
+ vmovdqu ymm1, ymmword ptr [rcx + 2*rdi + 32]
+ vmovdqu ymm2, ymmword ptr [rcx + 2*rdi + 64]
+ vmovdqu ymm3, ymmword ptr [rcx + 2*rdi + 96]
+ vpaddw ymm0, ymm0, ymmword ptr [rdx + 2*rdi]
+ vpaddw ymm1, ymm1, ymmword ptr [rdx + 2*rdi + 32]
+ vpaddw ymm2, ymm2, ymmword ptr [rdx + 2*rdi + 64]
+ vpaddw ymm3, ymm3, ymmword ptr [rdx + 2*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 2*rdi], ymm0
+ vmovdqu ymmword ptr [r8 + 2*rdi + 32], ymm1
+ vmovdqu ymmword ptr [r8 + 2*rdi + 64], ymm2
+ vmovdqu ymmword ptr [r8 + 2*rdi + 96], ymm3
+ add rdi, 64
cmp rsi, rdi
- jne .LBB0_477
-# %bb.478:
+ jne .LBB0_206
+# %bb.207:
cmp rsi, r10
- je .LBB0_537
-.LBB0_479:
+ je .LBB0_825
+.LBB0_208:
mov r9, rsi
not r9
add r9, r10
mov rax, r10
and rax, 3
- je .LBB0_481
-.LBB0_480: # =>This Inner Loop Header: Depth=1
- mov edi, dword ptr [rdx + 4*rsi]
- sub edi, dword ptr [rcx + 4*rsi]
- mov dword ptr [r8 + 4*rsi], edi
+ je .LBB0_210
+.LBB0_209: # =>This Inner Loop Header: Depth=1
+ movzx edi, word ptr [rcx + 2*rsi]
+ add di, word ptr [rdx + 2*rsi]
+ mov word ptr [r8 + 2*rsi], di
add rsi, 1
add rax, -1
- jne .LBB0_480
-.LBB0_481:
+ jne .LBB0_209
+.LBB0_210:
cmp r9, 3
- jb .LBB0_537
-.LBB0_482: # =>This Inner Loop Header: Depth=1
- mov eax, dword ptr [rdx + 4*rsi]
- sub eax, dword ptr [rcx + 4*rsi]
- mov dword ptr [r8 + 4*rsi], eax
- mov eax, dword ptr [rdx + 4*rsi + 4]
- sub eax, dword ptr [rcx + 4*rsi + 4]
- mov dword ptr [r8 + 4*rsi + 4], eax
- mov eax, dword ptr [rdx + 4*rsi + 8]
- sub eax, dword ptr [rcx + 4*rsi + 8]
- mov dword ptr [r8 + 4*rsi + 8], eax
- mov eax, dword ptr [rdx + 4*rsi + 12]
- sub eax, dword ptr [rcx + 4*rsi + 12]
- mov dword ptr [r8 + 4*rsi + 12], eax
+ jb .LBB0_825
+.LBB0_211: # =>This Inner Loop Header: Depth=1
+ movzx eax, word ptr [rcx + 2*rsi]
+ add ax, word ptr [rdx + 2*rsi]
+ mov word ptr [r8 + 2*rsi], ax
+ movzx eax, word ptr [rcx + 2*rsi + 2]
+ add ax, word ptr [rdx + 2*rsi + 2]
+ mov word ptr [r8 + 2*rsi + 2], ax
+ movzx eax, word ptr [rcx + 2*rsi + 4]
+ add ax, word ptr [rdx + 2*rsi + 4]
+ mov word ptr [r8 + 2*rsi + 4], ax
+ movzx eax, word ptr [rcx + 2*rsi + 6]
+ add ax, word ptr [rdx + 2*rsi + 6]
+ mov word ptr [r8 + 2*rsi + 6], ax
add rsi, 4
cmp r10, rsi
- jne .LBB0_482
- jmp .LBB0_537
-.LBB0_89:
- lea rsi, [r8 + 4*r10]
- lea rax, [rdx + 4*r10]
+ jne .LBB0_211
+ jmp .LBB0_825
+.LBB0_215:
+ lea rsi, [r8 + 2*r10]
+ lea rax, [rdx + 2*r10]
cmp rax, r8
seta r9b
- lea rax, [rcx + 4*r10]
+ lea rax, [rcx + 2*r10]
cmp rsi, rdx
seta r11b
cmp rax, r8
@@ -743,73 +3092,73 @@ arithmetic_avx2: # @arithmetic_avx2
seta dil
xor esi, esi
test r9b, r11b
- jne .LBB0_94
-# %bb.90:
+ jne .LBB0_220
+# %bb.216:
and al, dil
- jne .LBB0_94
-# %bb.91:
+ jne .LBB0_220
+# %bb.217:
mov esi, r10d
- and esi, -32
+ and esi, -64
xor edi, edi
-.LBB0_92: # =>This Inner Loop Header: Depth=1
- vmovdqu ymm0, ymmword ptr [rcx + 4*rdi]
- vmovdqu ymm1, ymmword ptr [rcx + 4*rdi + 32]
- vmovdqu ymm2, ymmword ptr [rcx + 4*rdi + 64]
- vmovdqu ymm3, ymmword ptr [rcx + 4*rdi + 96]
- vpaddd ymm0, ymm0, ymmword ptr [rdx + 4*rdi]
- vpaddd ymm1, ymm1, ymmword ptr [rdx + 4*rdi + 32]
- vpaddd ymm2, ymm2, ymmword ptr [rdx + 4*rdi + 64]
- vpaddd ymm3, ymm3, ymmword ptr [rdx + 4*rdi + 96]
- vmovdqu ymmword ptr [r8 + 4*rdi], ymm0
- vmovdqu ymmword ptr [r8 + 4*rdi + 32], ymm1
- vmovdqu ymmword ptr [r8 + 4*rdi + 64], ymm2
- vmovdqu ymmword ptr [r8 + 4*rdi + 96], ymm3
- add rdi, 32
+.LBB0_218: # =>This Inner Loop Header: Depth=1
+ vmovdqu ymm0, ymmword ptr [rcx + 2*rdi]
+ vmovdqu ymm1, ymmword ptr [rcx + 2*rdi + 32]
+ vmovdqu ymm2, ymmword ptr [rcx + 2*rdi + 64]
+ vmovdqu ymm3, ymmword ptr [rcx + 2*rdi + 96]
+ vpaddw ymm0, ymm0, ymmword ptr [rdx + 2*rdi]
+ vpaddw ymm1, ymm1, ymmword ptr [rdx + 2*rdi + 32]
+ vpaddw ymm2, ymm2, ymmword ptr [rdx + 2*rdi + 64]
+ vpaddw ymm3, ymm3, ymmword ptr [rdx + 2*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 2*rdi], ymm0
+ vmovdqu ymmword ptr [r8 + 2*rdi + 32], ymm1
+ vmovdqu ymmword ptr [r8 + 2*rdi + 64], ymm2
+ vmovdqu ymmword ptr [r8 + 2*rdi + 96], ymm3
+ add rdi, 64
cmp rsi, rdi
- jne .LBB0_92
-# %bb.93:
+ jne .LBB0_218
+# %bb.219:
cmp rsi, r10
- je .LBB0_537
-.LBB0_94:
+ je .LBB0_825
+.LBB0_220:
mov r9, rsi
not r9
add r9, r10
mov rax, r10
and rax, 3
- je .LBB0_96
-.LBB0_95: # =>This Inner Loop Header: Depth=1
- mov edi, dword ptr [rcx + 4*rsi]
- add edi, dword ptr [rdx + 4*rsi]
- mov dword ptr [r8 + 4*rsi], edi
+ je .LBB0_222
+.LBB0_221: # =>This Inner Loop Header: Depth=1
+ movzx edi, word ptr [rcx + 2*rsi]
+ add di, word ptr [rdx + 2*rsi]
+ mov word ptr [r8 + 2*rsi], di
add rsi, 1
add rax, -1
- jne .LBB0_95
-.LBB0_96:
+ jne .LBB0_221
+.LBB0_222:
cmp r9, 3
- jb .LBB0_537
-.LBB0_97: # =>This Inner Loop Header: Depth=1
- mov eax, dword ptr [rcx + 4*rsi]
- add eax, dword ptr [rdx + 4*rsi]
- mov dword ptr [r8 + 4*rsi], eax
- mov eax, dword ptr [rcx + 4*rsi + 4]
- add eax, dword ptr [rdx + 4*rsi + 4]
- mov dword ptr [r8 + 4*rsi + 4], eax
- mov eax, dword ptr [rcx + 4*rsi + 8]
- add eax, dword ptr [rdx + 4*rsi + 8]
- mov dword ptr [r8 + 4*rsi + 8], eax
- mov eax, dword ptr [rcx + 4*rsi + 12]
- add eax, dword ptr [rdx + 4*rsi + 12]
- mov dword ptr [r8 + 4*rsi + 12], eax
+ jb .LBB0_825
+.LBB0_223: # =>This Inner Loop Header: Depth=1
+ movzx eax, word ptr [rcx + 2*rsi]
+ add ax, word ptr [rdx + 2*rsi]
+ mov word ptr [r8 + 2*rsi], ax
+ movzx eax, word ptr [rcx + 2*rsi + 2]
+ add ax, word ptr [rdx + 2*rsi + 2]
+ mov word ptr [r8 + 2*rsi + 2], ax
+ movzx eax, word ptr [rcx + 2*rsi + 4]
+ add ax, word ptr [rdx + 2*rsi + 4]
+ mov word ptr [r8 + 2*rsi + 4], ax
+ movzx eax, word ptr [rcx + 2*rsi + 6]
+ add ax, word ptr [rdx + 2*rsi + 6]
+ mov word ptr [r8 + 2*rsi + 6], ax
add rsi, 4
cmp r10, rsi
- jne .LBB0_97
- jmp .LBB0_537
-.LBB0_222:
- lea rsi, [r8 + 4*r10]
- lea rax, [rdx + 4*r10]
+ jne .LBB0_223
+ jmp .LBB0_825
+.LBB0_124:
+ lea rsi, [r8 + 8*r10]
+ lea rax, [rdx + 8*r10]
cmp rax, r8
seta r9b
- lea rax, [rcx + 4*r10]
+ lea rax, [rcx + 8*r10]
cmp rsi, rdx
seta r11b
cmp rax, r8
@@ -818,73 +3167,73 @@ arithmetic_avx2: # @arithmetic_avx2
seta dil
xor esi, esi
test r9b, r11b
- jne .LBB0_227
-# %bb.223:
+ jne .LBB0_129
+# %bb.125:
and al, dil
- jne .LBB0_227
-# %bb.224:
+ jne .LBB0_129
+# %bb.126:
mov esi, r10d
- and esi, -32
+ and esi, -16
xor edi, edi
-.LBB0_225: # =>This Inner Loop Header: Depth=1
- vmovdqu ymm0, ymmword ptr [rcx + 4*rdi]
- vmovdqu ymm1, ymmword ptr [rcx + 4*rdi + 32]
- vmovdqu ymm2, ymmword ptr [rcx + 4*rdi + 64]
- vmovdqu ymm3, ymmword ptr [rcx + 4*rdi + 96]
- vpaddd ymm0, ymm0, ymmword ptr [rdx + 4*rdi]
- vpaddd ymm1, ymm1, ymmword ptr [rdx + 4*rdi + 32]
- vpaddd ymm2, ymm2, ymmword ptr [rdx + 4*rdi + 64]
- vpaddd ymm3, ymm3, ymmword ptr [rdx + 4*rdi + 96]
- vmovdqu ymmword ptr [r8 + 4*rdi], ymm0
- vmovdqu ymmword ptr [r8 + 4*rdi + 32], ymm1
- vmovdqu ymmword ptr [r8 + 4*rdi + 64], ymm2
- vmovdqu ymmword ptr [r8 + 4*rdi + 96], ymm3
- add rdi, 32
+.LBB0_127: # =>This Inner Loop Header: Depth=1
+ vmovdqu ymm0, ymmword ptr [rcx + 8*rdi]
+ vmovdqu ymm1, ymmword ptr [rcx + 8*rdi + 32]
+ vmovdqu ymm2, ymmword ptr [rcx + 8*rdi + 64]
+ vmovdqu ymm3, ymmword ptr [rcx + 8*rdi + 96]
+ vpaddq ymm0, ymm0, ymmword ptr [rdx + 8*rdi]
+ vpaddq ymm1, ymm1, ymmword ptr [rdx + 8*rdi + 32]
+ vpaddq ymm2, ymm2, ymmword ptr [rdx + 8*rdi + 64]
+ vpaddq ymm3, ymm3, ymmword ptr [rdx + 8*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 8*rdi], ymm0
+ vmovdqu ymmword ptr [r8 + 8*rdi + 32], ymm1
+ vmovdqu ymmword ptr [r8 + 8*rdi + 64], ymm2
+ vmovdqu ymmword ptr [r8 + 8*rdi + 96], ymm3
+ add rdi, 16
cmp rsi, rdi
- jne .LBB0_225
-# %bb.226:
+ jne .LBB0_127
+# %bb.128:
cmp rsi, r10
- je .LBB0_537
-.LBB0_227:
+ je .LBB0_825
+.LBB0_129:
mov r9, rsi
not r9
add r9, r10
mov rax, r10
and rax, 3
- je .LBB0_229
-.LBB0_228: # =>This Inner Loop Header: Depth=1
- mov edi, dword ptr [rcx + 4*rsi]
- add edi, dword ptr [rdx + 4*rsi]
- mov dword ptr [r8 + 4*rsi], edi
+ je .LBB0_131
+.LBB0_130: # =>This Inner Loop Header: Depth=1
+ mov rdi, qword ptr [rcx + 8*rsi]
+ add rdi, qword ptr [rdx + 8*rsi]
+ mov qword ptr [r8 + 8*rsi], rdi
add rsi, 1
add rax, -1
- jne .LBB0_228
-.LBB0_229:
+ jne .LBB0_130
+.LBB0_131:
cmp r9, 3
- jb .LBB0_537
-.LBB0_230: # =>This Inner Loop Header: Depth=1
- mov eax, dword ptr [rcx + 4*rsi]
- add eax, dword ptr [rdx + 4*rsi]
- mov dword ptr [r8 + 4*rsi], eax
- mov eax, dword ptr [rcx + 4*rsi + 4]
- add eax, dword ptr [rdx + 4*rsi + 4]
- mov dword ptr [r8 + 4*rsi + 4], eax
- mov eax, dword ptr [rcx + 4*rsi + 8]
- add eax, dword ptr [rdx + 4*rsi + 8]
- mov dword ptr [r8 + 4*rsi + 8], eax
- mov eax, dword ptr [rcx + 4*rsi + 12]
- add eax, dword ptr [rdx + 4*rsi + 12]
- mov dword ptr [r8 + 4*rsi + 12], eax
+ jb .LBB0_825
+.LBB0_132: # =>This Inner Loop Header: Depth=1
+ mov rax, qword ptr [rcx + 8*rsi]
+ add rax, qword ptr [rdx + 8*rsi]
+ mov qword ptr [r8 + 8*rsi], rax
+ mov rax, qword ptr [rcx + 8*rsi + 8]
+ add rax, qword ptr [rdx + 8*rsi + 8]
+ mov qword ptr [r8 + 8*rsi + 8], rax
+ mov rax, qword ptr [rcx + 8*rsi + 16]
+ add rax, qword ptr [rdx + 8*rsi + 16]
+ mov qword ptr [r8 + 8*rsi + 16], rax
+ mov rax, qword ptr [rcx + 8*rsi + 24]
+ add rax, qword ptr [rdx + 8*rsi + 24]
+ mov qword ptr [r8 + 8*rsi + 24], rax
add rsi, 4
cmp r10, rsi
- jne .LBB0_230
- jmp .LBB0_537
-.LBB0_402:
- lea rsi, [r8 + 8*r10]
- lea rax, [rdx + 8*r10]
+ jne .LBB0_132
+ jmp .LBB0_825
+.LBB0_136:
+ lea rsi, [r8 + 4*r10]
+ lea rax, [rdx + 4*r10]
cmp rax, r8
seta r9b
- lea rax, [rcx + 8*r10]
+ lea rax, [rcx + 4*r10]
cmp rsi, rdx
seta r11b
cmp rax, r8
@@ -893,68 +3242,68 @@ arithmetic_avx2: # @arithmetic_avx2
seta dil
xor esi, esi
test r9b, r11b
- jne .LBB0_407
-# %bb.403:
+ jne .LBB0_141
+# %bb.137:
and al, dil
- jne .LBB0_407
-# %bb.404:
+ jne .LBB0_141
+# %bb.138:
mov esi, r10d
- and esi, -16
+ and esi, -32
xor edi, edi
-.LBB0_405: # =>This Inner Loop Header: Depth=1
- vmovupd ymm0, ymmword ptr [rdx + 8*rdi]
- vmovupd ymm1, ymmword ptr [rdx + 8*rdi + 32]
- vmovupd ymm2, ymmword ptr [rdx + 8*rdi + 64]
- vmovupd ymm3, ymmword ptr [rdx + 8*rdi + 96]
- vsubpd ymm0, ymm0, ymmword ptr [rcx + 8*rdi]
- vsubpd ymm1, ymm1, ymmword ptr [rcx + 8*rdi + 32]
- vsubpd ymm2, ymm2, ymmword ptr [rcx + 8*rdi + 64]
- vsubpd ymm3, ymm3, ymmword ptr [rcx + 8*rdi + 96]
- vmovupd ymmword ptr [r8 + 8*rdi], ymm0
- vmovupd ymmword ptr [r8 + 8*rdi + 32], ymm1
- vmovupd ymmword ptr [r8 + 8*rdi + 64], ymm2
- vmovupd ymmword ptr [r8 + 8*rdi + 96], ymm3
- add rdi, 16
+.LBB0_139: # =>This Inner Loop Header: Depth=1
+ vmovups ymm0, ymmword ptr [rcx + 4*rdi]
+ vmovups ymm1, ymmword ptr [rcx + 4*rdi + 32]
+ vmovups ymm2, ymmword ptr [rcx + 4*rdi + 64]
+ vmovups ymm3, ymmword ptr [rcx + 4*rdi + 96]
+ vaddps ymm0, ymm0, ymmword ptr [rdx + 4*rdi]
+ vaddps ymm1, ymm1, ymmword ptr [rdx + 4*rdi + 32]
+ vaddps ymm2, ymm2, ymmword ptr [rdx + 4*rdi + 64]
+ vaddps ymm3, ymm3, ymmword ptr [rdx + 4*rdi + 96]
+ vmovups ymmword ptr [r8 + 4*rdi], ymm0
+ vmovups ymmword ptr [r8 + 4*rdi + 32], ymm1
+ vmovups ymmword ptr [r8 + 4*rdi + 64], ymm2
+ vmovups ymmword ptr [r8 + 4*rdi + 96], ymm3
+ add rdi, 32
cmp rsi, rdi
- jne .LBB0_405
-# %bb.406:
+ jne .LBB0_139
+# %bb.140:
cmp rsi, r10
- je .LBB0_537
-.LBB0_407:
+ je .LBB0_825
+.LBB0_141:
mov rdi, rsi
not rdi
add rdi, r10
mov rax, r10
and rax, 3
- je .LBB0_409
-.LBB0_408: # =>This Inner Loop Header: Depth=1
- vmovsd xmm0, qword ptr [rdx + 8*rsi] # xmm0 = mem[0],zero
- vsubsd xmm0, xmm0, qword ptr [rcx + 8*rsi]
- vmovsd qword ptr [r8 + 8*rsi], xmm0
+ je .LBB0_143
+.LBB0_142: # =>This Inner Loop Header: Depth=1
+ vmovss xmm0, dword ptr [rcx + 4*rsi] # xmm0 = mem[0],zero,zero,zero
+ vaddss xmm0, xmm0, dword ptr [rdx + 4*rsi]
+ vmovss dword ptr [r8 + 4*rsi], xmm0
add rsi, 1
add rax, -1
- jne .LBB0_408
-.LBB0_409:
+ jne .LBB0_142
+.LBB0_143:
cmp rdi, 3
- jb .LBB0_537
-.LBB0_410: # =>This Inner Loop Header: Depth=1
- vmovsd xmm0, qword ptr [rdx + 8*rsi] # xmm0 = mem[0],zero
- vsubsd xmm0, xmm0, qword ptr [rcx + 8*rsi]
- vmovsd qword ptr [r8 + 8*rsi], xmm0
- vmovsd xmm0, qword ptr [rdx + 8*rsi + 8] # xmm0 = mem[0],zero
- vsubsd xmm0, xmm0, qword ptr [rcx + 8*rsi + 8]
- vmovsd qword ptr [r8 + 8*rsi + 8], xmm0
- vmovsd xmm0, qword ptr [rdx + 8*rsi + 16] # xmm0 = mem[0],zero
- vsubsd xmm0, xmm0, qword ptr [rcx + 8*rsi + 16]
- vmovsd qword ptr [r8 + 8*rsi + 16], xmm0
- vmovsd xmm0, qword ptr [rdx + 8*rsi + 24] # xmm0 = mem[0],zero
- vsubsd xmm0, xmm0, qword ptr [rcx + 8*rsi + 24]
- vmovsd qword ptr [r8 + 8*rsi + 24], xmm0
+ jb .LBB0_825
+.LBB0_144: # =>This Inner Loop Header: Depth=1
+ vmovss xmm0, dword ptr [rcx + 4*rsi] # xmm0 = mem[0],zero,zero,zero
+ vaddss xmm0, xmm0, dword ptr [rdx + 4*rsi]
+ vmovss dword ptr [r8 + 4*rsi], xmm0
+ vmovss xmm0, dword ptr [rcx + 4*rsi + 4] # xmm0 = mem[0],zero,zero,zero
+ vaddss xmm0, xmm0, dword ptr [rdx + 4*rsi + 4]
+ vmovss dword ptr [r8 + 4*rsi + 4], xmm0
+ vmovss xmm0, dword ptr [rcx + 4*rsi + 8] # xmm0 = mem[0],zero,zero,zero
+ vaddss xmm0, xmm0, dword ptr [rdx + 4*rsi + 8]
+ vmovss dword ptr [r8 + 4*rsi + 8], xmm0
+ vmovss xmm0, dword ptr [rcx + 4*rsi + 12] # xmm0 = mem[0],zero,zero,zero
+ vaddss xmm0, xmm0, dword ptr [rdx + 4*rsi + 12]
+ vmovss dword ptr [r8 + 4*rsi + 12], xmm0
add rsi, 4
cmp r10, rsi
- jne .LBB0_410
- jmp .LBB0_537
-.LBB0_528:
+ jne .LBB0_144
+ jmp .LBB0_825
+.LBB0_257:
lea rsi, [r8 + 8*r10]
lea rax, [rdx + 8*r10]
cmp rax, r8
@@ -968,73 +3317,73 @@ arithmetic_avx2: # @arithmetic_avx2
seta dil
xor esi, esi
test r9b, r11b
- jne .LBB0_533
-# %bb.529:
+ jne .LBB0_262
+# %bb.258:
and al, dil
- jne .LBB0_533
-# %bb.530:
+ jne .LBB0_262
+# %bb.259:
mov esi, r10d
and esi, -16
xor edi, edi
-.LBB0_531: # =>This Inner Loop Header: Depth=1
- vmovupd ymm0, ymmword ptr [rdx + 8*rdi]
- vmovupd ymm1, ymmword ptr [rdx + 8*rdi + 32]
- vmovupd ymm2, ymmword ptr [rdx + 8*rdi + 64]
- vmovupd ymm3, ymmword ptr [rdx + 8*rdi + 96]
- vsubpd ymm0, ymm0, ymmword ptr [rcx + 8*rdi]
- vsubpd ymm1, ymm1, ymmword ptr [rcx + 8*rdi + 32]
- vsubpd ymm2, ymm2, ymmword ptr [rcx + 8*rdi + 64]
- vsubpd ymm3, ymm3, ymmword ptr [rcx + 8*rdi + 96]
- vmovupd ymmword ptr [r8 + 8*rdi], ymm0
- vmovupd ymmword ptr [r8 + 8*rdi + 32], ymm1
- vmovupd ymmword ptr [r8 + 8*rdi + 64], ymm2
- vmovupd ymmword ptr [r8 + 8*rdi + 96], ymm3
+.LBB0_260: # =>This Inner Loop Header: Depth=1
+ vmovdqu ymm0, ymmword ptr [rcx + 8*rdi]
+ vmovdqu ymm1, ymmword ptr [rcx + 8*rdi + 32]
+ vmovdqu ymm2, ymmword ptr [rcx + 8*rdi + 64]
+ vmovdqu ymm3, ymmword ptr [rcx + 8*rdi + 96]
+ vpaddq ymm0, ymm0, ymmword ptr [rdx + 8*rdi]
+ vpaddq ymm1, ymm1, ymmword ptr [rdx + 8*rdi + 32]
+ vpaddq ymm2, ymm2, ymmword ptr [rdx + 8*rdi + 64]
+ vpaddq ymm3, ymm3, ymmword ptr [rdx + 8*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 8*rdi], ymm0
+ vmovdqu ymmword ptr [r8 + 8*rdi + 32], ymm1
+ vmovdqu ymmword ptr [r8 + 8*rdi + 64], ymm2
+ vmovdqu ymmword ptr [r8 + 8*rdi + 96], ymm3
add rdi, 16
cmp rsi, rdi
- jne .LBB0_531
-# %bb.532:
+ jne .LBB0_260
+# %bb.261:
cmp rsi, r10
- je .LBB0_537
-.LBB0_533:
- mov rdi, rsi
- not rdi
- add rdi, r10
+ je .LBB0_825
+.LBB0_262:
+ mov r9, rsi
+ not r9
+ add r9, r10
mov rax, r10
and rax, 3
- je .LBB0_535
-.LBB0_534: # =>This Inner Loop Header: Depth=1
- vmovsd xmm0, qword ptr [rdx + 8*rsi] # xmm0 = mem[0],zero
- vsubsd xmm0, xmm0, qword ptr [rcx + 8*rsi]
- vmovsd qword ptr [r8 + 8*rsi], xmm0
+ je .LBB0_264
+.LBB0_263: # =>This Inner Loop Header: Depth=1
+ mov rdi, qword ptr [rcx + 8*rsi]
+ add rdi, qword ptr [rdx + 8*rsi]
+ mov qword ptr [r8 + 8*rsi], rdi
add rsi, 1
add rax, -1
- jne .LBB0_534
-.LBB0_535:
- cmp rdi, 3
- jb .LBB0_537
-.LBB0_536: # =>This Inner Loop Header: Depth=1
- vmovsd xmm0, qword ptr [rdx + 8*rsi] # xmm0 = mem[0],zero
- vsubsd xmm0, xmm0, qword ptr [rcx + 8*rsi]
- vmovsd qword ptr [r8 + 8*rsi], xmm0
- vmovsd xmm0, qword ptr [rdx + 8*rsi + 8] # xmm0 = mem[0],zero
- vsubsd xmm0, xmm0, qword ptr [rcx + 8*rsi + 8]
- vmovsd qword ptr [r8 + 8*rsi + 8], xmm0
- vmovsd xmm0, qword ptr [rdx + 8*rsi + 16] # xmm0 = mem[0],zero
- vsubsd xmm0, xmm0, qword ptr [rcx + 8*rsi + 16]
- vmovsd qword ptr [r8 + 8*rsi + 16], xmm0
- vmovsd xmm0, qword ptr [rdx + 8*rsi + 24] # xmm0 = mem[0],zero
- vsubsd xmm0, xmm0, qword ptr [rcx + 8*rsi + 24]
- vmovsd qword ptr [r8 + 8*rsi + 24], xmm0
+ jne .LBB0_263
+.LBB0_264:
+ cmp r9, 3
+ jb .LBB0_825
+.LBB0_265: # =>This Inner Loop Header: Depth=1
+ mov rax, qword ptr [rcx + 8*rsi]
+ add rax, qword ptr [rdx + 8*rsi]
+ mov qword ptr [r8 + 8*rsi], rax
+ mov rax, qword ptr [rcx + 8*rsi + 8]
+ add rax, qword ptr [rdx + 8*rsi + 8]
+ mov qword ptr [r8 + 8*rsi + 8], rax
+ mov rax, qword ptr [rcx + 8*rsi + 16]
+ add rax, qword ptr [rdx + 8*rsi + 16]
+ mov qword ptr [r8 + 8*rsi + 16], rax
+ mov rax, qword ptr [rcx + 8*rsi + 24]
+ add rax, qword ptr [rdx + 8*rsi + 24]
+ mov qword ptr [r8 + 8*rsi + 24], rax
add rsi, 4
cmp r10, rsi
- jne .LBB0_536
- jmp .LBB0_537
-.LBB0_143:
- lea rsi, [r8 + 8*r10]
- lea rax, [rdx + 8*r10]
+ jne .LBB0_265
+ jmp .LBB0_825
+.LBB0_269:
+ lea rsi, [r8 + 4*r10]
+ lea rax, [rdx + 4*r10]
cmp rax, r8
seta r9b
- lea rax, [rcx + 8*r10]
+ lea rax, [rcx + 4*r10]
cmp rsi, rdx
seta r11b
cmp rax, r8
@@ -1043,73 +3392,137 @@ arithmetic_avx2: # @arithmetic_avx2
seta dil
xor esi, esi
test r9b, r11b
- jne .LBB0_148
-# %bb.144:
+ jne .LBB0_274
+# %bb.270:
and al, dil
- jne .LBB0_148
-# %bb.145:
+ jne .LBB0_274
+# %bb.271:
mov esi, r10d
- and esi, -16
+ and esi, -32
xor edi, edi
-.LBB0_146: # =>This Inner Loop Header: Depth=1
- vmovupd ymm0, ymmword ptr [rcx + 8*rdi]
- vmovupd ymm1, ymmword ptr [rcx + 8*rdi + 32]
- vmovupd ymm2, ymmword ptr [rcx + 8*rdi + 64]
- vmovupd ymm3, ymmword ptr [rcx + 8*rdi + 96]
- vaddpd ymm0, ymm0, ymmword ptr [rdx + 8*rdi]
- vaddpd ymm1, ymm1, ymmword ptr [rdx + 8*rdi + 32]
- vaddpd ymm2, ymm2, ymmword ptr [rdx + 8*rdi + 64]
- vaddpd ymm3, ymm3, ymmword ptr [rdx + 8*rdi + 96]
- vmovupd ymmword ptr [r8 + 8*rdi], ymm0
- vmovupd ymmword ptr [r8 + 8*rdi + 32], ymm1
- vmovupd ymmword ptr [r8 + 8*rdi + 64], ymm2
- vmovupd ymmword ptr [r8 + 8*rdi + 96], ymm3
- add rdi, 16
+.LBB0_272: # =>This Inner Loop Header: Depth=1
+ vmovups ymm0, ymmword ptr [rcx + 4*rdi]
+ vmovups ymm1, ymmword ptr [rcx + 4*rdi + 32]
+ vmovups ymm2, ymmword ptr [rcx + 4*rdi + 64]
+ vmovups ymm3, ymmword ptr [rcx + 4*rdi + 96]
+ vaddps ymm0, ymm0, ymmword ptr [rdx + 4*rdi]
+ vaddps ymm1, ymm1, ymmword ptr [rdx + 4*rdi + 32]
+ vaddps ymm2, ymm2, ymmword ptr [rdx + 4*rdi + 64]
+ vaddps ymm3, ymm3, ymmword ptr [rdx + 4*rdi + 96]
+ vmovups ymmword ptr [r8 + 4*rdi], ymm0
+ vmovups ymmword ptr [r8 + 4*rdi + 32], ymm1
+ vmovups ymmword ptr [r8 + 4*rdi + 64], ymm2
+ vmovups ymmword ptr [r8 + 4*rdi + 96], ymm3
+ add rdi, 32
cmp rsi, rdi
- jne .LBB0_146
-# %bb.147:
+ jne .LBB0_272
+# %bb.273:
cmp rsi, r10
- je .LBB0_537
-.LBB0_148:
+ je .LBB0_825
+.LBB0_274:
mov rdi, rsi
not rdi
add rdi, r10
mov rax, r10
and rax, 3
- je .LBB0_150
-.LBB0_149: # =>This Inner Loop Header: Depth=1
- vmovsd xmm0, qword ptr [rcx + 8*rsi] # xmm0 = mem[0],zero
- vaddsd xmm0, xmm0, qword ptr [rdx + 8*rsi]
- vmovsd qword ptr [r8 + 8*rsi], xmm0
+ je .LBB0_276
+.LBB0_275: # =>This Inner Loop Header: Depth=1
+ vmovss xmm0, dword ptr [rcx + 4*rsi] # xmm0 = mem[0],zero,zero,zero
+ vaddss xmm0, xmm0, dword ptr [rdx + 4*rsi]
+ vmovss dword ptr [r8 + 4*rsi], xmm0
add rsi, 1
add rax, -1
- jne .LBB0_149
-.LBB0_150:
+ jne .LBB0_275
+.LBB0_276:
cmp rdi, 3
- jb .LBB0_537
-.LBB0_151: # =>This Inner Loop Header: Depth=1
- vmovsd xmm0, qword ptr [rcx + 8*rsi] # xmm0 = mem[0],zero
- vaddsd xmm0, xmm0, qword ptr [rdx + 8*rsi]
- vmovsd qword ptr [r8 + 8*rsi], xmm0
- vmovsd xmm0, qword ptr [rcx + 8*rsi + 8] # xmm0 = mem[0],zero
- vaddsd xmm0, xmm0, qword ptr [rdx + 8*rsi + 8]
- vmovsd qword ptr [r8 + 8*rsi + 8], xmm0
- vmovsd xmm0, qword ptr [rcx + 8*rsi + 16] # xmm0 = mem[0],zero
- vaddsd xmm0, xmm0, qword ptr [rdx + 8*rsi + 16]
- vmovsd qword ptr [r8 + 8*rsi + 16], xmm0
- vmovsd xmm0, qword ptr [rcx + 8*rsi + 24] # xmm0 = mem[0],zero
- vaddsd xmm0, xmm0, qword ptr [rdx + 8*rsi + 24]
- vmovsd qword ptr [r8 + 8*rsi + 24], xmm0
+ jb .LBB0_825
+.LBB0_277: # =>This Inner Loop Header: Depth=1
+ vmovss xmm0, dword ptr [rcx + 4*rsi] # xmm0 = mem[0],zero,zero,zero
+ vaddss xmm0, xmm0, dword ptr [rdx + 4*rsi]
+ vmovss dword ptr [r8 + 4*rsi], xmm0
+ vmovss xmm0, dword ptr [rcx + 4*rsi + 4] # xmm0 = mem[0],zero,zero,zero
+ vaddss xmm0, xmm0, dword ptr [rdx + 4*rsi + 4]
+ vmovss dword ptr [r8 + 4*rsi + 4], xmm0
+ vmovss xmm0, dword ptr [rcx + 4*rsi + 8] # xmm0 = mem[0],zero,zero,zero
+ vaddss xmm0, xmm0, dword ptr [rdx + 4*rsi + 8]
+ vmovss dword ptr [r8 + 4*rsi + 8], xmm0
+ vmovss xmm0, dword ptr [rcx + 4*rsi + 12] # xmm0 = mem[0],zero,zero,zero
+ vaddss xmm0, xmm0, dword ptr [rdx + 4*rsi + 12]
+ vmovss dword ptr [r8 + 4*rsi + 12], xmm0
add rsi, 4
cmp r10, rsi
- jne .LBB0_151
- jmp .LBB0_537
-.LBB0_276:
- lea rsi, [r8 + 8*r10]
- lea rax, [rdx + 8*r10]
+ jne .LBB0_277
+ jmp .LBB0_825
+.LBB0_574:
+ lea rsi, [r8 + r10]
+ lea rax, [rdx + r10]
cmp rax, r8
seta r9b
- lea rax, [rcx + 8*r10]
+ lea rax, [rcx + r10]
+ cmp rsi, rdx
+ seta r11b
+ cmp rax, r8
+ seta al
+ cmp rsi, rcx
+ seta sil
+ xor edi, edi
+ test r9b, r11b
+ jne .LBB0_584
+# %bb.575:
+ and al, sil
+ jne .LBB0_584
+# %bb.576:
+ mov edi, r10d
+ and edi, -32
+ lea rsi, [rdi - 32]
+ mov rax, rsi
+ shr rax, 5
+ add rax, 1
+ mov r9d, eax
+ and r9d, 3
+ cmp rsi, 96
+ jae .LBB0_578
+# %bb.577:
+ xor esi, esi
+ jmp .LBB0_580
+.LBB0_710:
+ lea rsi, [r8 + r10]
+ lea rax, [rdx + r10]
+ cmp rax, r8
+ seta r9b
+ lea rax, [rcx + r10]
+ cmp rsi, rdx
+ seta r11b
+ cmp rax, r8
+ seta al
+ cmp rsi, rcx
+ seta sil
+ xor edi, edi
+ test r9b, r11b
+ jne .LBB0_720
+# %bb.711:
+ and al, sil
+ jne .LBB0_720
+# %bb.712:
+ mov edi, r10d
+ and edi, -32
+ lea rsi, [rdi - 32]
+ mov rax, rsi
+ shr rax, 5
+ add rax, 1
+ mov r9d, eax
+ and r9d, 3
+ cmp rsi, 96
+ jae .LBB0_714
+# %bb.713:
+ xor esi, esi
+ jmp .LBB0_716
+.LBB0_49:
+ lea rsi, [r8 + r10]
+ lea rax, [rdx + r10]
+ cmp rax, r8
+ seta r9b
+ lea rax, [rcx + r10]
cmp rsi, rdx
seta r11b
cmp rax, r8
@@ -1118,67 +3531,67 @@ arithmetic_avx2: # @arithmetic_avx2
seta dil
xor esi, esi
test r9b, r11b
- jne .LBB0_281
-# %bb.277:
+ jne .LBB0_54
+# %bb.50:
and al, dil
- jne .LBB0_281
-# %bb.278:
+ jne .LBB0_54
+# %bb.51:
mov esi, r10d
- and esi, -16
+ and esi, -128
xor edi, edi
-.LBB0_279: # =>This Inner Loop Header: Depth=1
- vmovupd ymm0, ymmword ptr [rcx + 8*rdi]
- vmovupd ymm1, ymmword ptr [rcx + 8*rdi + 32]
- vmovupd ymm2, ymmword ptr [rcx + 8*rdi + 64]
- vmovupd ymm3, ymmword ptr [rcx + 8*rdi + 96]
- vaddpd ymm0, ymm0, ymmword ptr [rdx + 8*rdi]
- vaddpd ymm1, ymm1, ymmword ptr [rdx + 8*rdi + 32]
- vaddpd ymm2, ymm2, ymmword ptr [rdx + 8*rdi + 64]
- vaddpd ymm3, ymm3, ymmword ptr [rdx + 8*rdi + 96]
- vmovupd ymmword ptr [r8 + 8*rdi], ymm0
- vmovupd ymmword ptr [r8 + 8*rdi + 32], ymm1
- vmovupd ymmword ptr [r8 + 8*rdi + 64], ymm2
- vmovupd ymmword ptr [r8 + 8*rdi + 96], ymm3
- add rdi, 16
+.LBB0_52: # =>This Inner Loop Header: Depth=1
+ vmovdqu ymm0, ymmword ptr [rcx + rdi]
+ vmovdqu ymm1, ymmword ptr [rcx + rdi + 32]
+ vmovdqu ymm2, ymmword ptr [rcx + rdi + 64]
+ vmovdqu ymm3, ymmword ptr [rcx + rdi + 96]
+ vpaddb ymm0, ymm0, ymmword ptr [rdx + rdi]
+ vpaddb ymm1, ymm1, ymmword ptr [rdx + rdi + 32]
+ vpaddb ymm2, ymm2, ymmword ptr [rdx + rdi + 64]
+ vpaddb ymm3, ymm3, ymmword ptr [rdx + rdi + 96]
+ vmovdqu ymmword ptr [r8 + rdi], ymm0
+ vmovdqu ymmword ptr [r8 + rdi + 32], ymm1
+ vmovdqu ymmword ptr [r8 + rdi + 64], ymm2
+ vmovdqu ymmword ptr [r8 + rdi + 96], ymm3
+ sub rdi, -128
cmp rsi, rdi
- jne .LBB0_279
-# %bb.280:
+ jne .LBB0_52
+# %bb.53:
cmp rsi, r10
- je .LBB0_537
-.LBB0_281:
- mov rdi, rsi
- not rdi
- add rdi, r10
- mov rax, r10
- and rax, 3
- je .LBB0_283
-.LBB0_282: # =>This Inner Loop Header: Depth=1
- vmovsd xmm0, qword ptr [rcx + 8*rsi] # xmm0 = mem[0],zero
- vaddsd xmm0, xmm0, qword ptr [rdx + 8*rsi]
- vmovsd qword ptr [r8 + 8*rsi], xmm0
+ je .LBB0_825
+.LBB0_54:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB0_56
+.LBB0_55: # =>This Inner Loop Header: Depth=1
+ movzx eax, byte ptr [rcx + rsi]
+ add al, byte ptr [rdx + rsi]
+ mov byte ptr [r8 + rsi], al
add rsi, 1
- add rax, -1
- jne .LBB0_282
-.LBB0_283:
- cmp rdi, 3
- jb .LBB0_537
-.LBB0_284: # =>This Inner Loop Header: Depth=1
- vmovsd xmm0, qword ptr [rcx + 8*rsi] # xmm0 = mem[0],zero
- vaddsd xmm0, xmm0, qword ptr [rdx + 8*rsi]
- vmovsd qword ptr [r8 + 8*rsi], xmm0
- vmovsd xmm0, qword ptr [rcx + 8*rsi + 8] # xmm0 = mem[0],zero
- vaddsd xmm0, xmm0, qword ptr [rdx + 8*rsi + 8]
- vmovsd qword ptr [r8 + 8*rsi + 8], xmm0
- vmovsd xmm0, qword ptr [rcx + 8*rsi + 16] # xmm0 = mem[0],zero
- vaddsd xmm0, xmm0, qword ptr [rdx + 8*rsi + 16]
- vmovsd qword ptr [r8 + 8*rsi + 16], xmm0
- vmovsd xmm0, qword ptr [rcx + 8*rsi + 24] # xmm0 = mem[0],zero
- vaddsd xmm0, xmm0, qword ptr [rdx + 8*rsi + 24]
- vmovsd qword ptr [r8 + 8*rsi + 24], xmm0
+ add rdi, -1
+ jne .LBB0_55
+.LBB0_56:
+ cmp r9, 3
+ jb .LBB0_825
+.LBB0_57: # =>This Inner Loop Header: Depth=1
+ movzx eax, byte ptr [rcx + rsi]
+ add al, byte ptr [rdx + rsi]
+ mov byte ptr [r8 + rsi], al
+ movzx eax, byte ptr [rcx + rsi + 1]
+ add al, byte ptr [rdx + rsi + 1]
+ mov byte ptr [r8 + rsi + 1], al
+ movzx eax, byte ptr [rcx + rsi + 2]
+ add al, byte ptr [rdx + rsi + 2]
+ mov byte ptr [r8 + rsi + 2], al
+ movzx eax, byte ptr [rcx + rsi + 3]
+ add al, byte ptr [rdx + rsi + 3]
+ mov byte ptr [r8 + rsi + 3], al
add rsi, 4
cmp r10, rsi
- jne .LBB0_284
- jmp .LBB0_537
+ jne .LBB0_57
+ jmp .LBB0_825
.LBB0_315:
lea rsi, [r8 + r10]
lea rax, [rdx + r10]
@@ -1219,7 +3632,7 @@ arithmetic_avx2: # @arithmetic_avx2
jne .LBB0_318
# %bb.319:
cmp rsi, r10
- je .LBB0_537
+ je .LBB0_825
.LBB0_320:
mov r9, rsi
not r9
@@ -1236,7 +3649,7 @@ arithmetic_avx2: # @arithmetic_avx2
jne .LBB0_321
.LBB0_322:
cmp r9, 3
- jb .LBB0_537
+ jb .LBB0_825
.LBB0_323: # =>This Inner Loop Header: Depth=1
movzx eax, byte ptr [rdx + rsi]
sub al, byte ptr [rcx + rsi]
@@ -1253,8 +3666,8 @@ arithmetic_avx2: # @arithmetic_avx2
add rsi, 4
cmp r10, rsi
jne .LBB0_323
- jmp .LBB0_537
-.LBB0_441:
+ jmp .LBB0_825
+.LBB0_182:
lea rsi, [r8 + r10]
lea rax, [rdx + r10]
cmp rax, r8
@@ -1268,15 +3681,90 @@ arithmetic_avx2: # @arithmetic_avx2
seta dil
xor esi, esi
test r9b, r11b
- jne .LBB0_446
-# %bb.442:
+ jne .LBB0_187
+# %bb.183:
and al, dil
- jne .LBB0_446
-# %bb.443:
+ jne .LBB0_187
+# %bb.184:
+ mov esi, r10d
+ and esi, -128
+ xor edi, edi
+.LBB0_185: # =>This Inner Loop Header: Depth=1
+ vmovdqu ymm0, ymmword ptr [rcx + rdi]
+ vmovdqu ymm1, ymmword ptr [rcx + rdi + 32]
+ vmovdqu ymm2, ymmword ptr [rcx + rdi + 64]
+ vmovdqu ymm3, ymmword ptr [rcx + rdi + 96]
+ vpaddb ymm0, ymm0, ymmword ptr [rdx + rdi]
+ vpaddb ymm1, ymm1, ymmword ptr [rdx + rdi + 32]
+ vpaddb ymm2, ymm2, ymmword ptr [rdx + rdi + 64]
+ vpaddb ymm3, ymm3, ymmword ptr [rdx + rdi + 96]
+ vmovdqu ymmword ptr [r8 + rdi], ymm0
+ vmovdqu ymmword ptr [r8 + rdi + 32], ymm1
+ vmovdqu ymmword ptr [r8 + rdi + 64], ymm2
+ vmovdqu ymmword ptr [r8 + rdi + 96], ymm3
+ sub rdi, -128
+ cmp rsi, rdi
+ jne .LBB0_185
+# %bb.186:
+ cmp rsi, r10
+ je .LBB0_825
+.LBB0_187:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB0_189
+.LBB0_188: # =>This Inner Loop Header: Depth=1
+ movzx eax, byte ptr [rcx + rsi]
+ add al, byte ptr [rdx + rsi]
+ mov byte ptr [r8 + rsi], al
+ add rsi, 1
+ add rdi, -1
+ jne .LBB0_188
+.LBB0_189:
+ cmp r9, 3
+ jb .LBB0_825
+.LBB0_190: # =>This Inner Loop Header: Depth=1
+ movzx eax, byte ptr [rcx + rsi]
+ add al, byte ptr [rdx + rsi]
+ mov byte ptr [r8 + rsi], al
+ movzx eax, byte ptr [rcx + rsi + 1]
+ add al, byte ptr [rdx + rsi + 1]
+ mov byte ptr [r8 + rsi + 1], al
+ movzx eax, byte ptr [rcx + rsi + 2]
+ add al, byte ptr [rdx + rsi + 2]
+ mov byte ptr [r8 + rsi + 2], al
+ movzx eax, byte ptr [rcx + rsi + 3]
+ add al, byte ptr [rdx + rsi + 3]
+ mov byte ptr [r8 + rsi + 3], al
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB0_190
+ jmp .LBB0_825
+.LBB0_448:
+ lea rsi, [r8 + r10]
+ lea rax, [rdx + r10]
+ cmp rax, r8
+ seta r9b
+ lea rax, [rcx + r10]
+ cmp rsi, rdx
+ seta r11b
+ cmp rax, r8
+ seta al
+ cmp rsi, rcx
+ seta dil
+ xor esi, esi
+ test r9b, r11b
+ jne .LBB0_453
+# %bb.449:
+ and al, dil
+ jne .LBB0_453
+# %bb.450:
mov esi, r10d
and esi, -128
xor edi, edi
-.LBB0_444: # =>This Inner Loop Header: Depth=1
+.LBB0_451: # =>This Inner Loop Header: Depth=1
vmovdqu ymm0, ymmword ptr [rdx + rdi]
vmovdqu ymm1, ymmword ptr [rdx + rdi + 32]
vmovdqu ymm2, ymmword ptr [rdx + rdi + 64]
@@ -1291,28 +3779,28 @@ arithmetic_avx2: # @arithmetic_avx2
vmovdqu ymmword ptr [r8 + rdi + 96], ymm3
sub rdi, -128
cmp rsi, rdi
- jne .LBB0_444
-# %bb.445:
+ jne .LBB0_451
+# %bb.452:
cmp rsi, r10
- je .LBB0_537
-.LBB0_446:
+ je .LBB0_825
+.LBB0_453:
mov r9, rsi
not r9
add r9, r10
mov rdi, r10
and rdi, 3
- je .LBB0_448
-.LBB0_447: # =>This Inner Loop Header: Depth=1
+ je .LBB0_455
+.LBB0_454: # =>This Inner Loop Header: Depth=1
movzx eax, byte ptr [rdx + rsi]
sub al, byte ptr [rcx + rsi]
mov byte ptr [r8 + rsi], al
add rsi, 1
add rdi, -1
- jne .LBB0_447
-.LBB0_448:
+ jne .LBB0_454
+.LBB0_455:
cmp r9, 3
- jb .LBB0_537
-.LBB0_449: # =>This Inner Loop Header: Depth=1
+ jb .LBB0_825
+.LBB0_456: # =>This Inner Loop Header: Depth=1
movzx eax, byte ptr [rdx + rsi]
sub al, byte ptr [rcx + rsi]
mov byte ptr [r8 + rsi], al
@@ -1327,14 +3815,14 @@ arithmetic_avx2: # @arithmetic_avx2
mov byte ptr [r8 + rsi + 3], al
add rsi, 4
cmp r10, rsi
- jne .LBB0_449
- jmp .LBB0_537
-.LBB0_56:
- lea rsi, [r8 + r10]
- lea rax, [rdx + r10]
+ jne .LBB0_456
+ jmp .LBB0_825
+.LBB0_638:
+ lea rsi, [r8 + 4*r10]
+ lea rax, [rdx + 4*r10]
cmp rax, r8
seta r9b
- lea rax, [rcx + r10]
+ lea rax, [rcx + 4*r10]
cmp rsi, rdx
seta r11b
cmp rax, r8
@@ -1343,73 +3831,73 @@ arithmetic_avx2: # @arithmetic_avx2
seta dil
xor esi, esi
test r9b, r11b
- jne .LBB0_61
-# %bb.57:
+ jne .LBB0_643
+# %bb.639:
and al, dil
- jne .LBB0_61
-# %bb.58:
+ jne .LBB0_643
+# %bb.640:
mov esi, r10d
- and esi, -128
+ and esi, -32
xor edi, edi
-.LBB0_59: # =>This Inner Loop Header: Depth=1
- vmovdqu ymm0, ymmword ptr [rcx + rdi]
- vmovdqu ymm1, ymmword ptr [rcx + rdi + 32]
- vmovdqu ymm2, ymmword ptr [rcx + rdi + 64]
- vmovdqu ymm3, ymmword ptr [rcx + rdi + 96]
- vpaddb ymm0, ymm0, ymmword ptr [rdx + rdi]
- vpaddb ymm1, ymm1, ymmword ptr [rdx + rdi + 32]
- vpaddb ymm2, ymm2, ymmword ptr [rdx + rdi + 64]
- vpaddb ymm3, ymm3, ymmword ptr [rdx + rdi + 96]
- vmovdqu ymmword ptr [r8 + rdi], ymm0
- vmovdqu ymmword ptr [r8 + rdi + 32], ymm1
- vmovdqu ymmword ptr [r8 + rdi + 64], ymm2
- vmovdqu ymmword ptr [r8 + rdi + 96], ymm3
- sub rdi, -128
+.LBB0_641: # =>This Inner Loop Header: Depth=1
+ vmovdqu ymm0, ymmword ptr [rcx + 4*rdi]
+ vmovdqu ymm1, ymmword ptr [rcx + 4*rdi + 32]
+ vmovdqu ymm2, ymmword ptr [rcx + 4*rdi + 64]
+ vmovdqu ymm3, ymmword ptr [rcx + 4*rdi + 96]
+ vpmulld ymm0, ymm0, ymmword ptr [rdx + 4*rdi]
+ vpmulld ymm1, ymm1, ymmword ptr [rdx + 4*rdi + 32]
+ vpmulld ymm2, ymm2, ymmword ptr [rdx + 4*rdi + 64]
+ vpmulld ymm3, ymm3, ymmword ptr [rdx + 4*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 4*rdi], ymm0
+ vmovdqu ymmword ptr [r8 + 4*rdi + 32], ymm1
+ vmovdqu ymmword ptr [r8 + 4*rdi + 64], ymm2
+ vmovdqu ymmword ptr [r8 + 4*rdi + 96], ymm3
+ add rdi, 32
cmp rsi, rdi
- jne .LBB0_59
-# %bb.60:
+ jne .LBB0_641
+# %bb.642:
cmp rsi, r10
- je .LBB0_537
-.LBB0_61:
+ je .LBB0_825
+.LBB0_643:
mov r9, rsi
not r9
add r9, r10
- mov rdi, r10
- and rdi, 3
- je .LBB0_63
-.LBB0_62: # =>This Inner Loop Header: Depth=1
- movzx eax, byte ptr [rcx + rsi]
- add al, byte ptr [rdx + rsi]
- mov byte ptr [r8 + rsi], al
+ mov rax, r10
+ and rax, 3
+ je .LBB0_645
+.LBB0_644: # =>This Inner Loop Header: Depth=1
+ mov edi, dword ptr [rcx + 4*rsi]
+ imul edi, dword ptr [rdx + 4*rsi]
+ mov dword ptr [r8 + 4*rsi], edi
add rsi, 1
- add rdi, -1
- jne .LBB0_62
-.LBB0_63:
+ add rax, -1
+ jne .LBB0_644
+.LBB0_645:
cmp r9, 3
- jb .LBB0_537
-.LBB0_64: # =>This Inner Loop Header: Depth=1
- movzx eax, byte ptr [rcx + rsi]
- add al, byte ptr [rdx + rsi]
- mov byte ptr [r8 + rsi], al
- movzx eax, byte ptr [rcx + rsi + 1]
- add al, byte ptr [rdx + rsi + 1]
- mov byte ptr [r8 + rsi + 1], al
- movzx eax, byte ptr [rcx + rsi + 2]
- add al, byte ptr [rdx + rsi + 2]
- mov byte ptr [r8 + rsi + 2], al
- movzx eax, byte ptr [rcx + rsi + 3]
- add al, byte ptr [rdx + rsi + 3]
- mov byte ptr [r8 + rsi + 3], al
+ jb .LBB0_825
+.LBB0_646: # =>This Inner Loop Header: Depth=1
+ mov eax, dword ptr [rcx + 4*rsi]
+ imul eax, dword ptr [rdx + 4*rsi]
+ mov dword ptr [r8 + 4*rsi], eax
+ mov eax, dword ptr [rcx + 4*rsi + 4]
+ imul eax, dword ptr [rdx + 4*rsi + 4]
+ mov dword ptr [r8 + 4*rsi + 4], eax
+ mov eax, dword ptr [rcx + 4*rsi + 8]
+ imul eax, dword ptr [rdx + 4*rsi + 8]
+ mov dword ptr [r8 + 4*rsi + 8], eax
+ mov eax, dword ptr [rcx + 4*rsi + 12]
+ imul eax, dword ptr [rdx + 4*rsi + 12]
+ mov dword ptr [r8 + 4*rsi + 12], eax
add rsi, 4
cmp r10, rsi
- jne .LBB0_64
- jmp .LBB0_537
-.LBB0_189:
- lea rsi, [r8 + r10]
- lea rax, [rdx + r10]
+ jne .LBB0_646
+ jmp .LBB0_825
+.LBB0_774:
+ lea rsi, [r8 + 4*r10]
+ lea rax, [rdx + 4*r10]
cmp rax, r8
seta r9b
- lea rax, [rcx + r10]
+ lea rax, [rcx + 4*r10]
cmp rsi, rdx
seta r11b
cmp rax, r8
@@ -1418,73 +3906,73 @@ arithmetic_avx2: # @arithmetic_avx2
seta dil
xor esi, esi
test r9b, r11b
- jne .LBB0_194
-# %bb.190:
+ jne .LBB0_779
+# %bb.775:
and al, dil
- jne .LBB0_194
-# %bb.191:
+ jne .LBB0_779
+# %bb.776:
mov esi, r10d
- and esi, -128
+ and esi, -32
xor edi, edi
-.LBB0_192: # =>This Inner Loop Header: Depth=1
- vmovdqu ymm0, ymmword ptr [rcx + rdi]
- vmovdqu ymm1, ymmword ptr [rcx + rdi + 32]
- vmovdqu ymm2, ymmword ptr [rcx + rdi + 64]
- vmovdqu ymm3, ymmword ptr [rcx + rdi + 96]
- vpaddb ymm0, ymm0, ymmword ptr [rdx + rdi]
- vpaddb ymm1, ymm1, ymmword ptr [rdx + rdi + 32]
- vpaddb ymm2, ymm2, ymmword ptr [rdx + rdi + 64]
- vpaddb ymm3, ymm3, ymmword ptr [rdx + rdi + 96]
- vmovdqu ymmword ptr [r8 + rdi], ymm0
- vmovdqu ymmword ptr [r8 + rdi + 32], ymm1
- vmovdqu ymmword ptr [r8 + rdi + 64], ymm2
- vmovdqu ymmword ptr [r8 + rdi + 96], ymm3
- sub rdi, -128
+.LBB0_777: # =>This Inner Loop Header: Depth=1
+ vmovdqu ymm0, ymmword ptr [rcx + 4*rdi]
+ vmovdqu ymm1, ymmword ptr [rcx + 4*rdi + 32]
+ vmovdqu ymm2, ymmword ptr [rcx + 4*rdi + 64]
+ vmovdqu ymm3, ymmword ptr [rcx + 4*rdi + 96]
+ vpmulld ymm0, ymm0, ymmword ptr [rdx + 4*rdi]
+ vpmulld ymm1, ymm1, ymmword ptr [rdx + 4*rdi + 32]
+ vpmulld ymm2, ymm2, ymmword ptr [rdx + 4*rdi + 64]
+ vpmulld ymm3, ymm3, ymmword ptr [rdx + 4*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 4*rdi], ymm0
+ vmovdqu ymmword ptr [r8 + 4*rdi + 32], ymm1
+ vmovdqu ymmword ptr [r8 + 4*rdi + 64], ymm2
+ vmovdqu ymmword ptr [r8 + 4*rdi + 96], ymm3
+ add rdi, 32
cmp rsi, rdi
- jne .LBB0_192
-# %bb.193:
+ jne .LBB0_777
+# %bb.778:
cmp rsi, r10
- je .LBB0_537
-.LBB0_194:
+ je .LBB0_825
+.LBB0_779:
mov r9, rsi
not r9
add r9, r10
- mov rdi, r10
- and rdi, 3
- je .LBB0_196
-.LBB0_195: # =>This Inner Loop Header: Depth=1
- movzx eax, byte ptr [rcx + rsi]
- add al, byte ptr [rdx + rsi]
- mov byte ptr [r8 + rsi], al
+ mov rax, r10
+ and rax, 3
+ je .LBB0_781
+.LBB0_780: # =>This Inner Loop Header: Depth=1
+ mov edi, dword ptr [rcx + 4*rsi]
+ imul edi, dword ptr [rdx + 4*rsi]
+ mov dword ptr [r8 + 4*rsi], edi
add rsi, 1
- add rdi, -1
- jne .LBB0_195
-.LBB0_196:
+ add rax, -1
+ jne .LBB0_780
+.LBB0_781:
cmp r9, 3
- jb .LBB0_537
-.LBB0_197: # =>This Inner Loop Header: Depth=1
- movzx eax, byte ptr [rcx + rsi]
- add al, byte ptr [rdx + rsi]
- mov byte ptr [r8 + rsi], al
- movzx eax, byte ptr [rcx + rsi + 1]
- add al, byte ptr [rdx + rsi + 1]
- mov byte ptr [r8 + rsi + 1], al
- movzx eax, byte ptr [rcx + rsi + 2]
- add al, byte ptr [rdx + rsi + 2]
- mov byte ptr [r8 + rsi + 2], al
- movzx eax, byte ptr [rcx + rsi + 3]
- add al, byte ptr [rdx + rsi + 3]
- mov byte ptr [r8 + rsi + 3], al
+ jb .LBB0_825
+.LBB0_782: # =>This Inner Loop Header: Depth=1
+ mov eax, dword ptr [rcx + 4*rsi]
+ imul eax, dword ptr [rdx + 4*rsi]
+ mov dword ptr [r8 + 4*rsi], eax
+ mov eax, dword ptr [rcx + 4*rsi + 4]
+ imul eax, dword ptr [rdx + 4*rsi + 4]
+ mov dword ptr [r8 + 4*rsi + 4], eax
+ mov eax, dword ptr [rcx + 4*rsi + 8]
+ imul eax, dword ptr [rdx + 4*rsi + 8]
+ mov dword ptr [r8 + 4*rsi + 8], eax
+ mov eax, dword ptr [rcx + 4*rsi + 12]
+ imul eax, dword ptr [rdx + 4*rsi + 12]
+ mov dword ptr [r8 + 4*rsi + 12], eax
add rsi, 4
cmp r10, rsi
- jne .LBB0_197
- jmp .LBB0_537
-.LBB0_369:
- lea rsi, [r8 + 8*r10]
- lea rax, [rdx + 8*r10]
+ jne .LBB0_782
+ jmp .LBB0_825
+.LBB0_103:
+ lea rsi, [r8 + 4*r10]
+ lea rax, [rdx + 4*r10]
cmp rax, r8
seta r9b
- lea rax, [rcx + 8*r10]
+ lea rax, [rcx + 4*r10]
cmp rsi, rdx
seta r11b
cmp rax, r8
@@ -1493,73 +3981,73 @@ arithmetic_avx2: # @arithmetic_avx2
seta dil
xor esi, esi
test r9b, r11b
- jne .LBB0_374
-# %bb.370:
+ jne .LBB0_108
+# %bb.104:
and al, dil
- jne .LBB0_374
-# %bb.371:
+ jne .LBB0_108
+# %bb.105:
mov esi, r10d
- and esi, -16
+ and esi, -32
xor edi, edi
-.LBB0_372: # =>This Inner Loop Header: Depth=1
- vmovdqu ymm0, ymmword ptr [rdx + 8*rdi]
- vmovdqu ymm1, ymmword ptr [rdx + 8*rdi + 32]
- vmovdqu ymm2, ymmword ptr [rdx + 8*rdi + 64]
- vmovdqu ymm3, ymmword ptr [rdx + 8*rdi + 96]
- vpsubq ymm0, ymm0, ymmword ptr [rcx + 8*rdi]
- vpsubq ymm1, ymm1, ymmword ptr [rcx + 8*rdi + 32]
- vpsubq ymm2, ymm2, ymmword ptr [rcx + 8*rdi + 64]
- vpsubq ymm3, ymm3, ymmword ptr [rcx + 8*rdi + 96]
- vmovdqu ymmword ptr [r8 + 8*rdi], ymm0
- vmovdqu ymmword ptr [r8 + 8*rdi + 32], ymm1
- vmovdqu ymmword ptr [r8 + 8*rdi + 64], ymm2
- vmovdqu ymmword ptr [r8 + 8*rdi + 96], ymm3
- add rdi, 16
+.LBB0_106: # =>This Inner Loop Header: Depth=1
+ vmovdqu ymm0, ymmword ptr [rcx + 4*rdi]
+ vmovdqu ymm1, ymmword ptr [rcx + 4*rdi + 32]
+ vmovdqu ymm2, ymmword ptr [rcx + 4*rdi + 64]
+ vmovdqu ymm3, ymmword ptr [rcx + 4*rdi + 96]
+ vpaddd ymm0, ymm0, ymmword ptr [rdx + 4*rdi]
+ vpaddd ymm1, ymm1, ymmword ptr [rdx + 4*rdi + 32]
+ vpaddd ymm2, ymm2, ymmword ptr [rdx + 4*rdi + 64]
+ vpaddd ymm3, ymm3, ymmword ptr [rdx + 4*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 4*rdi], ymm0
+ vmovdqu ymmword ptr [r8 + 4*rdi + 32], ymm1
+ vmovdqu ymmword ptr [r8 + 4*rdi + 64], ymm2
+ vmovdqu ymmword ptr [r8 + 4*rdi + 96], ymm3
+ add rdi, 32
cmp rsi, rdi
- jne .LBB0_372
-# %bb.373:
+ jne .LBB0_106
+# %bb.107:
cmp rsi, r10
- je .LBB0_537
-.LBB0_374:
+ je .LBB0_825
+.LBB0_108:
mov r9, rsi
not r9
add r9, r10
mov rax, r10
and rax, 3
- je .LBB0_376
-.LBB0_375: # =>This Inner Loop Header: Depth=1
- mov rdi, qword ptr [rdx + 8*rsi]
- sub rdi, qword ptr [rcx + 8*rsi]
- mov qword ptr [r8 + 8*rsi], rdi
+ je .LBB0_110
+.LBB0_109: # =>This Inner Loop Header: Depth=1
+ mov edi, dword ptr [rcx + 4*rsi]
+ add edi, dword ptr [rdx + 4*rsi]
+ mov dword ptr [r8 + 4*rsi], edi
add rsi, 1
add rax, -1
- jne .LBB0_375
-.LBB0_376:
+ jne .LBB0_109
+.LBB0_110:
cmp r9, 3
- jb .LBB0_537
-.LBB0_377: # =>This Inner Loop Header: Depth=1
- mov rax, qword ptr [rdx + 8*rsi]
- sub rax, qword ptr [rcx + 8*rsi]
- mov qword ptr [r8 + 8*rsi], rax
- mov rax, qword ptr [rdx + 8*rsi + 8]
- sub rax, qword ptr [rcx + 8*rsi + 8]
- mov qword ptr [r8 + 8*rsi + 8], rax
- mov rax, qword ptr [rdx + 8*rsi + 16]
- sub rax, qword ptr [rcx + 8*rsi + 16]
- mov qword ptr [r8 + 8*rsi + 16], rax
- mov rax, qword ptr [rdx + 8*rsi + 24]
- sub rax, qword ptr [rcx + 8*rsi + 24]
- mov qword ptr [r8 + 8*rsi + 24], rax
+ jb .LBB0_825
+.LBB0_111: # =>This Inner Loop Header: Depth=1
+ mov eax, dword ptr [rcx + 4*rsi]
+ add eax, dword ptr [rdx + 4*rsi]
+ mov dword ptr [r8 + 4*rsi], eax
+ mov eax, dword ptr [rcx + 4*rsi + 4]
+ add eax, dword ptr [rdx + 4*rsi + 4]
+ mov dword ptr [r8 + 4*rsi + 4], eax
+ mov eax, dword ptr [rcx + 4*rsi + 8]
+ add eax, dword ptr [rdx + 4*rsi + 8]
+ mov dword ptr [r8 + 4*rsi + 8], eax
+ mov eax, dword ptr [rcx + 4*rsi + 12]
+ add eax, dword ptr [rdx + 4*rsi + 12]
+ mov dword ptr [r8 + 4*rsi + 12], eax
add rsi, 4
cmp r10, rsi
- jne .LBB0_377
- jmp .LBB0_537
-.LBB0_495:
- lea rsi, [r8 + 8*r10]
- lea rax, [rdx + 8*r10]
+ jne .LBB0_111
+ jmp .LBB0_825
+.LBB0_369:
+ lea rsi, [r8 + 4*r10]
+ lea rax, [rdx + 4*r10]
cmp rax, r8
seta r9b
- lea rax, [rcx + 8*r10]
+ lea rax, [rcx + 4*r10]
cmp rsi, rdx
seta r11b
cmp rax, r8
@@ -1568,73 +4056,73 @@ arithmetic_avx2: # @arithmetic_avx2
seta dil
xor esi, esi
test r9b, r11b
- jne .LBB0_500
-# %bb.496:
+ jne .LBB0_374
+# %bb.370:
and al, dil
- jne .LBB0_500
-# %bb.497:
+ jne .LBB0_374
+# %bb.371:
mov esi, r10d
- and esi, -16
+ and esi, -32
xor edi, edi
-.LBB0_498: # =>This Inner Loop Header: Depth=1
- vmovdqu ymm0, ymmword ptr [rdx + 8*rdi]
- vmovdqu ymm1, ymmword ptr [rdx + 8*rdi + 32]
- vmovdqu ymm2, ymmword ptr [rdx + 8*rdi + 64]
- vmovdqu ymm3, ymmword ptr [rdx + 8*rdi + 96]
- vpsubq ymm0, ymm0, ymmword ptr [rcx + 8*rdi]
- vpsubq ymm1, ymm1, ymmword ptr [rcx + 8*rdi + 32]
- vpsubq ymm2, ymm2, ymmword ptr [rcx + 8*rdi + 64]
- vpsubq ymm3, ymm3, ymmword ptr [rcx + 8*rdi + 96]
- vmovdqu ymmword ptr [r8 + 8*rdi], ymm0
- vmovdqu ymmword ptr [r8 + 8*rdi + 32], ymm1
- vmovdqu ymmword ptr [r8 + 8*rdi + 64], ymm2
- vmovdqu ymmword ptr [r8 + 8*rdi + 96], ymm3
- add rdi, 16
+.LBB0_372: # =>This Inner Loop Header: Depth=1
+ vmovdqu ymm0, ymmword ptr [rdx + 4*rdi]
+ vmovdqu ymm1, ymmword ptr [rdx + 4*rdi + 32]
+ vmovdqu ymm2, ymmword ptr [rdx + 4*rdi + 64]
+ vmovdqu ymm3, ymmword ptr [rdx + 4*rdi + 96]
+ vpsubd ymm0, ymm0, ymmword ptr [rcx + 4*rdi]
+ vpsubd ymm1, ymm1, ymmword ptr [rcx + 4*rdi + 32]
+ vpsubd ymm2, ymm2, ymmword ptr [rcx + 4*rdi + 64]
+ vpsubd ymm3, ymm3, ymmword ptr [rcx + 4*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 4*rdi], ymm0
+ vmovdqu ymmword ptr [r8 + 4*rdi + 32], ymm1
+ vmovdqu ymmword ptr [r8 + 4*rdi + 64], ymm2
+ vmovdqu ymmword ptr [r8 + 4*rdi + 96], ymm3
+ add rdi, 32
cmp rsi, rdi
- jne .LBB0_498
-# %bb.499:
+ jne .LBB0_372
+# %bb.373:
cmp rsi, r10
- je .LBB0_537
-.LBB0_500:
+ je .LBB0_825
+.LBB0_374:
mov r9, rsi
not r9
add r9, r10
mov rax, r10
- and rax, 3
- je .LBB0_502
-.LBB0_501: # =>This Inner Loop Header: Depth=1
- mov rdi, qword ptr [rdx + 8*rsi]
- sub rdi, qword ptr [rcx + 8*rsi]
- mov qword ptr [r8 + 8*rsi], rdi
- add rsi, 1
- add rax, -1
- jne .LBB0_501
-.LBB0_502:
- cmp r9, 3
- jb .LBB0_537
-.LBB0_503: # =>This Inner Loop Header: Depth=1
- mov rax, qword ptr [rdx + 8*rsi]
- sub rax, qword ptr [rcx + 8*rsi]
- mov qword ptr [r8 + 8*rsi], rax
- mov rax, qword ptr [rdx + 8*rsi + 8]
- sub rax, qword ptr [rcx + 8*rsi + 8]
- mov qword ptr [r8 + 8*rsi + 8], rax
- mov rax, qword ptr [rdx + 8*rsi + 16]
- sub rax, qword ptr [rcx + 8*rsi + 16]
- mov qword ptr [r8 + 8*rsi + 16], rax
- mov rax, qword ptr [rdx + 8*rsi + 24]
- sub rax, qword ptr [rcx + 8*rsi + 24]
- mov qword ptr [r8 + 8*rsi + 24], rax
+ and rax, 3
+ je .LBB0_376
+.LBB0_375: # =>This Inner Loop Header: Depth=1
+ mov edi, dword ptr [rdx + 4*rsi]
+ sub edi, dword ptr [rcx + 4*rsi]
+ mov dword ptr [r8 + 4*rsi], edi
+ add rsi, 1
+ add rax, -1
+ jne .LBB0_375
+.LBB0_376:
+ cmp r9, 3
+ jb .LBB0_825
+.LBB0_377: # =>This Inner Loop Header: Depth=1
+ mov eax, dword ptr [rdx + 4*rsi]
+ sub eax, dword ptr [rcx + 4*rsi]
+ mov dword ptr [r8 + 4*rsi], eax
+ mov eax, dword ptr [rdx + 4*rsi + 4]
+ sub eax, dword ptr [rcx + 4*rsi + 4]
+ mov dword ptr [r8 + 4*rsi + 4], eax
+ mov eax, dword ptr [rdx + 4*rsi + 8]
+ sub eax, dword ptr [rcx + 4*rsi + 8]
+ mov dword ptr [r8 + 4*rsi + 8], eax
+ mov eax, dword ptr [rdx + 4*rsi + 12]
+ sub eax, dword ptr [rcx + 4*rsi + 12]
+ mov dword ptr [r8 + 4*rsi + 12], eax
add rsi, 4
cmp r10, rsi
- jne .LBB0_503
- jmp .LBB0_537
-.LBB0_110:
- lea rsi, [r8 + 8*r10]
- lea rax, [rdx + 8*r10]
+ jne .LBB0_377
+ jmp .LBB0_825
+.LBB0_236:
+ lea rsi, [r8 + 4*r10]
+ lea rax, [rdx + 4*r10]
cmp rax, r8
seta r9b
- lea rax, [rcx + 8*r10]
+ lea rax, [rcx + 4*r10]
cmp rsi, rdx
seta r11b
cmp rax, r8
@@ -1643,73 +4131,73 @@ arithmetic_avx2: # @arithmetic_avx2
seta dil
xor esi, esi
test r9b, r11b
- jne .LBB0_115
-# %bb.111:
+ jne .LBB0_241
+# %bb.237:
and al, dil
- jne .LBB0_115
-# %bb.112:
+ jne .LBB0_241
+# %bb.238:
mov esi, r10d
- and esi, -16
+ and esi, -32
xor edi, edi
-.LBB0_113: # =>This Inner Loop Header: Depth=1
- vmovdqu ymm0, ymmword ptr [rcx + 8*rdi]
- vmovdqu ymm1, ymmword ptr [rcx + 8*rdi + 32]
- vmovdqu ymm2, ymmword ptr [rcx + 8*rdi + 64]
- vmovdqu ymm3, ymmword ptr [rcx + 8*rdi + 96]
- vpaddq ymm0, ymm0, ymmword ptr [rdx + 8*rdi]
- vpaddq ymm1, ymm1, ymmword ptr [rdx + 8*rdi + 32]
- vpaddq ymm2, ymm2, ymmword ptr [rdx + 8*rdi + 64]
- vpaddq ymm3, ymm3, ymmword ptr [rdx + 8*rdi + 96]
- vmovdqu ymmword ptr [r8 + 8*rdi], ymm0
- vmovdqu ymmword ptr [r8 + 8*rdi + 32], ymm1
- vmovdqu ymmword ptr [r8 + 8*rdi + 64], ymm2
- vmovdqu ymmword ptr [r8 + 8*rdi + 96], ymm3
- add rdi, 16
+.LBB0_239: # =>This Inner Loop Header: Depth=1
+ vmovdqu ymm0, ymmword ptr [rcx + 4*rdi]
+ vmovdqu ymm1, ymmword ptr [rcx + 4*rdi + 32]
+ vmovdqu ymm2, ymmword ptr [rcx + 4*rdi + 64]
+ vmovdqu ymm3, ymmword ptr [rcx + 4*rdi + 96]
+ vpaddd ymm0, ymm0, ymmword ptr [rdx + 4*rdi]
+ vpaddd ymm1, ymm1, ymmword ptr [rdx + 4*rdi + 32]
+ vpaddd ymm2, ymm2, ymmword ptr [rdx + 4*rdi + 64]
+ vpaddd ymm3, ymm3, ymmword ptr [rdx + 4*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 4*rdi], ymm0
+ vmovdqu ymmword ptr [r8 + 4*rdi + 32], ymm1
+ vmovdqu ymmword ptr [r8 + 4*rdi + 64], ymm2
+ vmovdqu ymmword ptr [r8 + 4*rdi + 96], ymm3
+ add rdi, 32
cmp rsi, rdi
- jne .LBB0_113
-# %bb.114:
+ jne .LBB0_239
+# %bb.240:
cmp rsi, r10
- je .LBB0_537
-.LBB0_115:
+ je .LBB0_825
+.LBB0_241:
mov r9, rsi
not r9
add r9, r10
mov rax, r10
and rax, 3
- je .LBB0_117
-.LBB0_116: # =>This Inner Loop Header: Depth=1
- mov rdi, qword ptr [rcx + 8*rsi]
- add rdi, qword ptr [rdx + 8*rsi]
- mov qword ptr [r8 + 8*rsi], rdi
+ je .LBB0_243
+.LBB0_242: # =>This Inner Loop Header: Depth=1
+ mov edi, dword ptr [rcx + 4*rsi]
+ add edi, dword ptr [rdx + 4*rsi]
+ mov dword ptr [r8 + 4*rsi], edi
add rsi, 1
add rax, -1
- jne .LBB0_116
-.LBB0_117:
+ jne .LBB0_242
+.LBB0_243:
cmp r9, 3
- jb .LBB0_537
-.LBB0_118: # =>This Inner Loop Header: Depth=1
- mov rax, qword ptr [rcx + 8*rsi]
- add rax, qword ptr [rdx + 8*rsi]
- mov qword ptr [r8 + 8*rsi], rax
- mov rax, qword ptr [rcx + 8*rsi + 8]
- add rax, qword ptr [rdx + 8*rsi + 8]
- mov qword ptr [r8 + 8*rsi + 8], rax
- mov rax, qword ptr [rcx + 8*rsi + 16]
- add rax, qword ptr [rdx + 8*rsi + 16]
- mov qword ptr [r8 + 8*rsi + 16], rax
- mov rax, qword ptr [rcx + 8*rsi + 24]
- add rax, qword ptr [rdx + 8*rsi + 24]
- mov qword ptr [r8 + 8*rsi + 24], rax
+ jb .LBB0_825
+.LBB0_244: # =>This Inner Loop Header: Depth=1
+ mov eax, dword ptr [rcx + 4*rsi]
+ add eax, dword ptr [rdx + 4*rsi]
+ mov dword ptr [r8 + 4*rsi], eax
+ mov eax, dword ptr [rcx + 4*rsi + 4]
+ add eax, dword ptr [rdx + 4*rsi + 4]
+ mov dword ptr [r8 + 4*rsi + 4], eax
+ mov eax, dword ptr [rcx + 4*rsi + 8]
+ add eax, dword ptr [rdx + 4*rsi + 8]
+ mov dword ptr [r8 + 4*rsi + 8], eax
+ mov eax, dword ptr [rcx + 4*rsi + 12]
+ add eax, dword ptr [rdx + 4*rsi + 12]
+ mov dword ptr [r8 + 4*rsi + 12], eax
add rsi, 4
cmp r10, rsi
- jne .LBB0_118
- jmp .LBB0_537
-.LBB0_243:
- lea rsi, [r8 + 8*r10]
- lea rax, [rdx + 8*r10]
+ jne .LBB0_244
+ jmp .LBB0_825
+.LBB0_502:
+ lea rsi, [r8 + 4*r10]
+ lea rax, [rdx + 4*r10]
cmp rax, r8
seta r9b
- lea rax, [rcx + 8*r10]
+ lea rax, [rcx + 4*r10]
cmp rsi, rdx
seta r11b
cmp rax, r8
@@ -1718,73 +4206,73 @@ arithmetic_avx2: # @arithmetic_avx2
seta dil
xor esi, esi
test r9b, r11b
- jne .LBB0_248
-# %bb.244:
+ jne .LBB0_507
+# %bb.503:
and al, dil
- jne .LBB0_248
-# %bb.245:
+ jne .LBB0_507
+# %bb.504:
mov esi, r10d
- and esi, -16
+ and esi, -32
xor edi, edi
-.LBB0_246: # =>This Inner Loop Header: Depth=1
- vmovdqu ymm0, ymmword ptr [rcx + 8*rdi]
- vmovdqu ymm1, ymmword ptr [rcx + 8*rdi + 32]
- vmovdqu ymm2, ymmword ptr [rcx + 8*rdi + 64]
- vmovdqu ymm3, ymmword ptr [rcx + 8*rdi + 96]
- vpaddq ymm0, ymm0, ymmword ptr [rdx + 8*rdi]
- vpaddq ymm1, ymm1, ymmword ptr [rdx + 8*rdi + 32]
- vpaddq ymm2, ymm2, ymmword ptr [rdx + 8*rdi + 64]
- vpaddq ymm3, ymm3, ymmword ptr [rdx + 8*rdi + 96]
- vmovdqu ymmword ptr [r8 + 8*rdi], ymm0
- vmovdqu ymmword ptr [r8 + 8*rdi + 32], ymm1
- vmovdqu ymmword ptr [r8 + 8*rdi + 64], ymm2
- vmovdqu ymmword ptr [r8 + 8*rdi + 96], ymm3
- add rdi, 16
+.LBB0_505: # =>This Inner Loop Header: Depth=1
+ vmovdqu ymm0, ymmword ptr [rdx + 4*rdi]
+ vmovdqu ymm1, ymmword ptr [rdx + 4*rdi + 32]
+ vmovdqu ymm2, ymmword ptr [rdx + 4*rdi + 64]
+ vmovdqu ymm3, ymmword ptr [rdx + 4*rdi + 96]
+ vpsubd ymm0, ymm0, ymmword ptr [rcx + 4*rdi]
+ vpsubd ymm1, ymm1, ymmword ptr [rcx + 4*rdi + 32]
+ vpsubd ymm2, ymm2, ymmword ptr [rcx + 4*rdi + 64]
+ vpsubd ymm3, ymm3, ymmword ptr [rcx + 4*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 4*rdi], ymm0
+ vmovdqu ymmword ptr [r8 + 4*rdi + 32], ymm1
+ vmovdqu ymmword ptr [r8 + 4*rdi + 64], ymm2
+ vmovdqu ymmword ptr [r8 + 4*rdi + 96], ymm3
+ add rdi, 32
cmp rsi, rdi
- jne .LBB0_246
-# %bb.247:
+ jne .LBB0_505
+# %bb.506:
cmp rsi, r10
- je .LBB0_537
-.LBB0_248:
+ je .LBB0_825
+.LBB0_507:
mov r9, rsi
not r9
add r9, r10
mov rax, r10
and rax, 3
- je .LBB0_250
-.LBB0_249: # =>This Inner Loop Header: Depth=1
- mov rdi, qword ptr [rcx + 8*rsi]
- add rdi, qword ptr [rdx + 8*rsi]
- mov qword ptr [r8 + 8*rsi], rdi
+ je .LBB0_509
+.LBB0_508: # =>This Inner Loop Header: Depth=1
+ mov edi, dword ptr [rdx + 4*rsi]
+ sub edi, dword ptr [rcx + 4*rsi]
+ mov dword ptr [r8 + 4*rsi], edi
add rsi, 1
add rax, -1
- jne .LBB0_249
-.LBB0_250:
+ jne .LBB0_508
+.LBB0_509:
cmp r9, 3
- jb .LBB0_537
-.LBB0_251: # =>This Inner Loop Header: Depth=1
- mov rax, qword ptr [rcx + 8*rsi]
- add rax, qword ptr [rdx + 8*rsi]
- mov qword ptr [r8 + 8*rsi], rax
- mov rax, qword ptr [rcx + 8*rsi + 8]
- add rax, qword ptr [rdx + 8*rsi + 8]
- mov qword ptr [r8 + 8*rsi + 8], rax
- mov rax, qword ptr [rcx + 8*rsi + 16]
- add rax, qword ptr [rdx + 8*rsi + 16]
- mov qword ptr [r8 + 8*rsi + 16], rax
- mov rax, qword ptr [rcx + 8*rsi + 24]
- add rax, qword ptr [rdx + 8*rsi + 24]
- mov qword ptr [r8 + 8*rsi + 24], rax
+ jb .LBB0_825
+.LBB0_510: # =>This Inner Loop Header: Depth=1
+ mov eax, dword ptr [rdx + 4*rsi]
+ sub eax, dword ptr [rcx + 4*rsi]
+ mov dword ptr [r8 + 4*rsi], eax
+ mov eax, dword ptr [rdx + 4*rsi + 4]
+ sub eax, dword ptr [rcx + 4*rsi + 4]
+ mov dword ptr [r8 + 4*rsi + 4], eax
+ mov eax, dword ptr [rdx + 4*rsi + 8]
+ sub eax, dword ptr [rcx + 4*rsi + 8]
+ mov dword ptr [r8 + 4*rsi + 8], eax
+ mov eax, dword ptr [rdx + 4*rsi + 12]
+ sub eax, dword ptr [rcx + 4*rsi + 12]
+ mov dword ptr [r8 + 4*rsi + 12], eax
add rsi, 4
cmp r10, rsi
- jne .LBB0_251
- jmp .LBB0_537
-.LBB0_327:
- lea rsi, [r8 + 2*r10]
- lea rax, [rdx + 2*r10]
+ jne .LBB0_510
+ jmp .LBB0_825
+.LBB0_626:
+ lea rsi, [r8 + 4*r10]
+ lea rax, [rdx + 4*r10]
cmp rax, r8
seta r9b
- lea rax, [rcx + 2*r10]
+ lea rax, [rcx + 4*r10]
cmp rsi, rdx
seta r11b
cmp rax, r8
@@ -1793,73 +4281,82 @@ arithmetic_avx2: # @arithmetic_avx2
seta dil
xor esi, esi
test r9b, r11b
- jne .LBB0_332
-# %bb.328:
+ jne .LBB0_631
+# %bb.627:
and al, dil
- jne .LBB0_332
-# %bb.329:
+ jne .LBB0_631
+# %bb.628:
mov esi, r10d
- and esi, -64
+ and esi, -32
xor edi, edi
-.LBB0_330: # =>This Inner Loop Header: Depth=1
- vmovdqu ymm0, ymmword ptr [rdx + 2*rdi]
- vmovdqu ymm1, ymmword ptr [rdx + 2*rdi + 32]
- vmovdqu ymm2, ymmword ptr [rdx + 2*rdi + 64]
- vmovdqu ymm3, ymmword ptr [rdx + 2*rdi + 96]
- vpsubw ymm0, ymm0, ymmword ptr [rcx + 2*rdi]
- vpsubw ymm1, ymm1, ymmword ptr [rcx + 2*rdi + 32]
- vpsubw ymm2, ymm2, ymmword ptr [rcx + 2*rdi + 64]
- vpsubw ymm3, ymm3, ymmword ptr [rcx + 2*rdi + 96]
- vmovdqu ymmword ptr [r8 + 2*rdi], ymm0
- vmovdqu ymmword ptr [r8 + 2*rdi + 32], ymm1
- vmovdqu ymmword ptr [r8 + 2*rdi + 64], ymm2
- vmovdqu ymmword ptr [r8 + 2*rdi + 96], ymm3
- add rdi, 64
+.LBB0_629: # =>This Inner Loop Header: Depth=1
+ vmovdqu ymm0, ymmword ptr [rcx + 4*rdi]
+ vmovdqu ymm1, ymmword ptr [rcx + 4*rdi + 32]
+ vmovdqu ymm2, ymmword ptr [rcx + 4*rdi + 64]
+ vmovdqu ymm3, ymmword ptr [rcx + 4*rdi + 96]
+ vpmulld ymm0, ymm0, ymmword ptr [rdx + 4*rdi]
+ vpmulld ymm1, ymm1, ymmword ptr [rdx + 4*rdi + 32]
+ vpmulld ymm2, ymm2, ymmword ptr [rdx + 4*rdi + 64]
+ vpmulld ymm3, ymm3, ymmword ptr [rdx + 4*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 4*rdi], ymm0
+ vmovdqu ymmword ptr [r8 + 4*rdi + 32], ymm1
+ vmovdqu ymmword ptr [r8 + 4*rdi + 64], ymm2
+ vmovdqu ymmword ptr [r8 + 4*rdi + 96], ymm3
+ add rdi, 32
cmp rsi, rdi
- jne .LBB0_330
-# %bb.331:
+ jne .LBB0_629
+# %bb.630:
cmp rsi, r10
- je .LBB0_537
-.LBB0_332:
- mov r9, rsi
- not r9
- add r9, r10
- mov rax, r10
- and rax, 3
- je .LBB0_334
-.LBB0_333: # =>This Inner Loop Header: Depth=1
- movzx edi, word ptr [rdx + 2*rsi]
- sub di, word ptr [rcx + 2*rsi]
- mov word ptr [r8 + 2*rsi], di
- add rsi, 1
- add rax, -1
- jne .LBB0_333
-.LBB0_334:
- cmp r9, 3
- jb .LBB0_537
-.LBB0_335: # =>This Inner Loop Header: Depth=1
- movzx eax, word ptr [rdx + 2*rsi]
- sub ax, word ptr [rcx + 2*rsi]
- mov word ptr [r8 + 2*rsi], ax
- movzx eax, word ptr [rdx + 2*rsi + 2]
- sub ax, word ptr [rcx + 2*rsi + 2]
- mov word ptr [r8 + 2*rsi + 2], ax
- movzx eax, word ptr [rdx + 2*rsi + 4]
- sub ax, word ptr [rcx + 2*rsi + 4]
- mov word ptr [r8 + 2*rsi + 4], ax
- movzx eax, word ptr [rdx + 2*rsi + 6]
- sub ax, word ptr [rcx + 2*rsi + 6]
- mov word ptr [r8 + 2*rsi + 6], ax
- add rsi, 4
- cmp r10, rsi
- jne .LBB0_335
- jmp .LBB0_537
-.LBB0_339:
- lea rsi, [r8 + 2*r10]
- lea rax, [rdx + 2*r10]
+ jne .LBB0_631
+ jmp .LBB0_825
+.LBB0_762:
+ lea rsi, [r8 + 4*r10]
+ lea rax, [rdx + 4*r10]
+ cmp rax, r8
+ seta r9b
+ lea rax, [rcx + 4*r10]
+ cmp rsi, rdx
+ seta r11b
+ cmp rax, r8
+ seta al
+ cmp rsi, rcx
+ seta dil
+ xor esi, esi
+ test r9b, r11b
+ jne .LBB0_767
+# %bb.763:
+ and al, dil
+ jne .LBB0_767
+# %bb.764:
+ mov esi, r10d
+ and esi, -32
+ xor edi, edi
+.LBB0_765: # =>This Inner Loop Header: Depth=1
+ vmovdqu ymm0, ymmword ptr [rcx + 4*rdi]
+ vmovdqu ymm1, ymmword ptr [rcx + 4*rdi + 32]
+ vmovdqu ymm2, ymmword ptr [rcx + 4*rdi + 64]
+ vmovdqu ymm3, ymmword ptr [rcx + 4*rdi + 96]
+ vpmulld ymm0, ymm0, ymmword ptr [rdx + 4*rdi]
+ vpmulld ymm1, ymm1, ymmword ptr [rdx + 4*rdi + 32]
+ vpmulld ymm2, ymm2, ymmword ptr [rdx + 4*rdi + 64]
+ vpmulld ymm3, ymm3, ymmword ptr [rdx + 4*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 4*rdi], ymm0
+ vmovdqu ymmword ptr [r8 + 4*rdi + 32], ymm1
+ vmovdqu ymmword ptr [r8 + 4*rdi + 64], ymm2
+ vmovdqu ymmword ptr [r8 + 4*rdi + 96], ymm3
+ add rdi, 32
+ cmp rsi, rdi
+ jne .LBB0_765
+# %bb.766:
+ cmp rsi, r10
+ jne .LBB0_767
+ jmp .LBB0_825
+.LBB0_357:
+ lea rsi, [r8 + 4*r10]
+ lea rax, [rdx + 4*r10]
cmp rax, r8
seta r9b
- lea rax, [rcx + 2*r10]
+ lea rax, [rcx + 4*r10]
cmp rsi, rdx
seta r11b
cmp rax, r8
@@ -1868,73 +4365,40 @@ arithmetic_avx2: # @arithmetic_avx2
seta dil
xor esi, esi
test r9b, r11b
- jne .LBB0_344
-# %bb.340:
+ jne .LBB0_362
+# %bb.358:
and al, dil
- jne .LBB0_344
-# %bb.341:
+ jne .LBB0_362
+# %bb.359:
mov esi, r10d
- and esi, -64
+ and esi, -32
xor edi, edi
-.LBB0_342: # =>This Inner Loop Header: Depth=1
- vmovdqu ymm0, ymmword ptr [rdx + 2*rdi]
- vmovdqu ymm1, ymmword ptr [rdx + 2*rdi + 32]
- vmovdqu ymm2, ymmword ptr [rdx + 2*rdi + 64]
- vmovdqu ymm3, ymmword ptr [rdx + 2*rdi + 96]
- vpsubw ymm0, ymm0, ymmword ptr [rcx + 2*rdi]
- vpsubw ymm1, ymm1, ymmword ptr [rcx + 2*rdi + 32]
- vpsubw ymm2, ymm2, ymmword ptr [rcx + 2*rdi + 64]
- vpsubw ymm3, ymm3, ymmword ptr [rcx + 2*rdi + 96]
- vmovdqu ymmword ptr [r8 + 2*rdi], ymm0
- vmovdqu ymmword ptr [r8 + 2*rdi + 32], ymm1
- vmovdqu ymmword ptr [r8 + 2*rdi + 64], ymm2
- vmovdqu ymmword ptr [r8 + 2*rdi + 96], ymm3
- add rdi, 64
+.LBB0_360: # =>This Inner Loop Header: Depth=1
+ vmovdqu ymm0, ymmword ptr [rdx + 4*rdi]
+ vmovdqu ymm1, ymmword ptr [rdx + 4*rdi + 32]
+ vmovdqu ymm2, ymmword ptr [rdx + 4*rdi + 64]
+ vmovdqu ymm3, ymmword ptr [rdx + 4*rdi + 96]
+ vpsubd ymm0, ymm0, ymmword ptr [rcx + 4*rdi]
+ vpsubd ymm1, ymm1, ymmword ptr [rcx + 4*rdi + 32]
+ vpsubd ymm2, ymm2, ymmword ptr [rcx + 4*rdi + 64]
+ vpsubd ymm3, ymm3, ymmword ptr [rcx + 4*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 4*rdi], ymm0
+ vmovdqu ymmword ptr [r8 + 4*rdi + 32], ymm1
+ vmovdqu ymmword ptr [r8 + 4*rdi + 64], ymm2
+ vmovdqu ymmword ptr [r8 + 4*rdi + 96], ymm3
+ add rdi, 32
cmp rsi, rdi
- jne .LBB0_342
-# %bb.343:
+ jne .LBB0_360
+# %bb.361:
cmp rsi, r10
- je .LBB0_537
-.LBB0_344:
- mov r9, rsi
- not r9
- add r9, r10
- mov rax, r10
- and rax, 3
- je .LBB0_346
-.LBB0_345: # =>This Inner Loop Header: Depth=1
- movzx edi, word ptr [rdx + 2*rsi]
- sub di, word ptr [rcx + 2*rsi]
- mov word ptr [r8 + 2*rsi], di
- add rsi, 1
- add rax, -1
- jne .LBB0_345
-.LBB0_346:
- cmp r9, 3
- jb .LBB0_537
-.LBB0_347: # =>This Inner Loop Header: Depth=1
- movzx eax, word ptr [rdx + 2*rsi]
- sub ax, word ptr [rcx + 2*rsi]
- mov word ptr [r8 + 2*rsi], ax
- movzx eax, word ptr [rdx + 2*rsi + 2]
- sub ax, word ptr [rcx + 2*rsi + 2]
- mov word ptr [r8 + 2*rsi + 2], ax
- movzx eax, word ptr [rdx + 2*rsi + 4]
- sub ax, word ptr [rcx + 2*rsi + 4]
- mov word ptr [r8 + 2*rsi + 4], ax
- movzx eax, word ptr [rdx + 2*rsi + 6]
- sub ax, word ptr [rcx + 2*rsi + 6]
- mov word ptr [r8 + 2*rsi + 6], ax
- add rsi, 4
- cmp r10, rsi
- jne .LBB0_347
- jmp .LBB0_537
-.LBB0_453:
- lea rsi, [r8 + 2*r10]
- lea rax, [rdx + 2*r10]
+ jne .LBB0_362
+ jmp .LBB0_825
+.LBB0_490:
+ lea rsi, [r8 + 4*r10]
+ lea rax, [rdx + 4*r10]
cmp rax, r8
seta r9b
- lea rax, [rcx + 2*r10]
+ lea rax, [rcx + 4*r10]
cmp rsi, rdx
seta r11b
cmp rax, r8
@@ -1943,73 +4407,40 @@ arithmetic_avx2: # @arithmetic_avx2
seta dil
xor esi, esi
test r9b, r11b
- jne .LBB0_458
-# %bb.454:
+ jne .LBB0_495
+# %bb.491:
and al, dil
- jne .LBB0_458
-# %bb.455:
+ jne .LBB0_495
+# %bb.492:
mov esi, r10d
- and esi, -64
+ and esi, -32
xor edi, edi
-.LBB0_456: # =>This Inner Loop Header: Depth=1
- vmovdqu ymm0, ymmword ptr [rdx + 2*rdi]
- vmovdqu ymm1, ymmword ptr [rdx + 2*rdi + 32]
- vmovdqu ymm2, ymmword ptr [rdx + 2*rdi + 64]
- vmovdqu ymm3, ymmword ptr [rdx + 2*rdi + 96]
- vpsubw ymm0, ymm0, ymmword ptr [rcx + 2*rdi]
- vpsubw ymm1, ymm1, ymmword ptr [rcx + 2*rdi + 32]
- vpsubw ymm2, ymm2, ymmword ptr [rcx + 2*rdi + 64]
- vpsubw ymm3, ymm3, ymmword ptr [rcx + 2*rdi + 96]
- vmovdqu ymmword ptr [r8 + 2*rdi], ymm0
- vmovdqu ymmword ptr [r8 + 2*rdi + 32], ymm1
- vmovdqu ymmword ptr [r8 + 2*rdi + 64], ymm2
- vmovdqu ymmword ptr [r8 + 2*rdi + 96], ymm3
- add rdi, 64
+.LBB0_493: # =>This Inner Loop Header: Depth=1
+ vmovdqu ymm0, ymmword ptr [rdx + 4*rdi]
+ vmovdqu ymm1, ymmword ptr [rdx + 4*rdi + 32]
+ vmovdqu ymm2, ymmword ptr [rdx + 4*rdi + 64]
+ vmovdqu ymm3, ymmword ptr [rdx + 4*rdi + 96]
+ vpsubd ymm0, ymm0, ymmword ptr [rcx + 4*rdi]
+ vpsubd ymm1, ymm1, ymmword ptr [rcx + 4*rdi + 32]
+ vpsubd ymm2, ymm2, ymmword ptr [rcx + 4*rdi + 64]
+ vpsubd ymm3, ymm3, ymmword ptr [rcx + 4*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 4*rdi], ymm0
+ vmovdqu ymmword ptr [r8 + 4*rdi + 32], ymm1
+ vmovdqu ymmword ptr [r8 + 4*rdi + 64], ymm2
+ vmovdqu ymmword ptr [r8 + 4*rdi + 96], ymm3
+ add rdi, 32
cmp rsi, rdi
- jne .LBB0_456
-# %bb.457:
+ jne .LBB0_493
+# %bb.494:
cmp rsi, r10
- je .LBB0_537
-.LBB0_458:
- mov r9, rsi
- not r9
- add r9, r10
- mov rax, r10
- and rax, 3
- je .LBB0_460
-.LBB0_459: # =>This Inner Loop Header: Depth=1
- movzx edi, word ptr [rdx + 2*rsi]
- sub di, word ptr [rcx + 2*rsi]
- mov word ptr [r8 + 2*rsi], di
- add rsi, 1
- add rax, -1
- jne .LBB0_459
-.LBB0_460:
- cmp r9, 3
- jb .LBB0_537
-.LBB0_461: # =>This Inner Loop Header: Depth=1
- movzx eax, word ptr [rdx + 2*rsi]
- sub ax, word ptr [rcx + 2*rsi]
- mov word ptr [r8 + 2*rsi], ax
- movzx eax, word ptr [rdx + 2*rsi + 2]
- sub ax, word ptr [rcx + 2*rsi + 2]
- mov word ptr [r8 + 2*rsi + 2], ax
- movzx eax, word ptr [rdx + 2*rsi + 4]
- sub ax, word ptr [rcx + 2*rsi + 4]
- mov word ptr [r8 + 2*rsi + 4], ax
- movzx eax, word ptr [rdx + 2*rsi + 6]
- sub ax, word ptr [rcx + 2*rsi + 6]
- mov word ptr [r8 + 2*rsi + 6], ax
- add rsi, 4
- cmp r10, rsi
- jne .LBB0_461
- jmp .LBB0_537
-.LBB0_465:
- lea rsi, [r8 + 2*r10]
- lea rax, [rdx + 2*r10]
+ jne .LBB0_495
+ jmp .LBB0_825
+.LBB0_680:
+ lea rsi, [r8 + 8*r10]
+ lea rax, [rdx + 8*r10]
cmp rax, r8
seta r9b
- lea rax, [rcx + 2*r10]
+ lea rax, [rcx + 8*r10]
cmp rsi, rdx
seta r11b
cmp rax, r8
@@ -2018,73 +4449,40 @@ arithmetic_avx2: # @arithmetic_avx2
seta dil
xor esi, esi
test r9b, r11b
- jne .LBB0_470
-# %bb.466:
+ jne .LBB0_685
+# %bb.681:
and al, dil
- jne .LBB0_470
-# %bb.467:
+ jne .LBB0_685
+# %bb.682:
mov esi, r10d
- and esi, -64
+ and esi, -16
xor edi, edi
-.LBB0_468: # =>This Inner Loop Header: Depth=1
- vmovdqu ymm0, ymmword ptr [rdx + 2*rdi]
- vmovdqu ymm1, ymmword ptr [rdx + 2*rdi + 32]
- vmovdqu ymm2, ymmword ptr [rdx + 2*rdi + 64]
- vmovdqu ymm3, ymmword ptr [rdx + 2*rdi + 96]
- vpsubw ymm0, ymm0, ymmword ptr [rcx + 2*rdi]
- vpsubw ymm1, ymm1, ymmword ptr [rcx + 2*rdi + 32]
- vpsubw ymm2, ymm2, ymmword ptr [rcx + 2*rdi + 64]
- vpsubw ymm3, ymm3, ymmword ptr [rcx + 2*rdi + 96]
- vmovdqu ymmword ptr [r8 + 2*rdi], ymm0
- vmovdqu ymmword ptr [r8 + 2*rdi + 32], ymm1
- vmovdqu ymmword ptr [r8 + 2*rdi + 64], ymm2
- vmovdqu ymmword ptr [r8 + 2*rdi + 96], ymm3
- add rdi, 64
+.LBB0_683: # =>This Inner Loop Header: Depth=1
+ vmovupd ymm0, ymmword ptr [rcx + 8*rdi]
+ vmovupd ymm1, ymmword ptr [rcx + 8*rdi + 32]
+ vmovupd ymm2, ymmword ptr [rcx + 8*rdi + 64]
+ vmovupd ymm3, ymmword ptr [rcx + 8*rdi + 96]
+ vmulpd ymm0, ymm0, ymmword ptr [rdx + 8*rdi]
+ vmulpd ymm1, ymm1, ymmword ptr [rdx + 8*rdi + 32]
+ vmulpd ymm2, ymm2, ymmword ptr [rdx + 8*rdi + 64]
+ vmulpd ymm3, ymm3, ymmword ptr [rdx + 8*rdi + 96]
+ vmovupd ymmword ptr [r8 + 8*rdi], ymm0
+ vmovupd ymmword ptr [r8 + 8*rdi + 32], ymm1
+ vmovupd ymmword ptr [r8 + 8*rdi + 64], ymm2
+ vmovupd ymmword ptr [r8 + 8*rdi + 96], ymm3
+ add rdi, 16
cmp rsi, rdi
- jne .LBB0_468
-# %bb.469:
+ jne .LBB0_683
+# %bb.684:
cmp rsi, r10
- je .LBB0_537
-.LBB0_470:
- mov r9, rsi
- not r9
- add r9, r10
- mov rax, r10
- and rax, 3
- je .LBB0_472
-.LBB0_471: # =>This Inner Loop Header: Depth=1
- movzx edi, word ptr [rdx + 2*rsi]
- sub di, word ptr [rcx + 2*rsi]
- mov word ptr [r8 + 2*rsi], di
- add rsi, 1
- add rax, -1
- jne .LBB0_471
-.LBB0_472:
- cmp r9, 3
- jb .LBB0_537
-.LBB0_473: # =>This Inner Loop Header: Depth=1
- movzx eax, word ptr [rdx + 2*rsi]
- sub ax, word ptr [rcx + 2*rsi]
- mov word ptr [r8 + 2*rsi], ax
- movzx eax, word ptr [rdx + 2*rsi + 2]
- sub ax, word ptr [rcx + 2*rsi + 2]
- mov word ptr [r8 + 2*rsi + 2], ax
- movzx eax, word ptr [rdx + 2*rsi + 4]
- sub ax, word ptr [rcx + 2*rsi + 4]
- mov word ptr [r8 + 2*rsi + 4], ax
- movzx eax, word ptr [rdx + 2*rsi + 6]
- sub ax, word ptr [rcx + 2*rsi + 6]
- mov word ptr [r8 + 2*rsi + 6], ax
- add rsi, 4
- cmp r10, rsi
- jne .LBB0_473
- jmp .LBB0_537
-.LBB0_68:
- lea rsi, [r8 + 2*r10]
- lea rax, [rdx + 2*r10]
+ jne .LBB0_685
+ jmp .LBB0_825
+.LBB0_816:
+ lea rsi, [r8 + 8*r10]
+ lea rax, [rdx + 8*r10]
cmp rax, r8
seta r9b
- lea rax, [rcx + 2*r10]
+ lea rax, [rcx + 8*r10]
cmp rsi, rdx
seta r11b
cmp rax, r8
@@ -2093,73 +4491,82 @@ arithmetic_avx2: # @arithmetic_avx2
seta dil
xor esi, esi
test r9b, r11b
- jne .LBB0_73
-# %bb.69:
+ jne .LBB0_821
+# %bb.817:
and al, dil
- jne .LBB0_73
-# %bb.70:
+ jne .LBB0_821
+# %bb.818:
mov esi, r10d
- and esi, -64
+ and esi, -16
xor edi, edi
-.LBB0_71: # =>This Inner Loop Header: Depth=1
- vmovdqu ymm0, ymmword ptr [rcx + 2*rdi]
- vmovdqu ymm1, ymmword ptr [rcx + 2*rdi + 32]
- vmovdqu ymm2, ymmword ptr [rcx + 2*rdi + 64]
- vmovdqu ymm3, ymmword ptr [rcx + 2*rdi + 96]
- vpaddw ymm0, ymm0, ymmword ptr [rdx + 2*rdi]
- vpaddw ymm1, ymm1, ymmword ptr [rdx + 2*rdi + 32]
- vpaddw ymm2, ymm2, ymmword ptr [rdx + 2*rdi + 64]
- vpaddw ymm3, ymm3, ymmword ptr [rdx + 2*rdi + 96]
- vmovdqu ymmword ptr [r8 + 2*rdi], ymm0
- vmovdqu ymmword ptr [r8 + 2*rdi + 32], ymm1
- vmovdqu ymmword ptr [r8 + 2*rdi + 64], ymm2
- vmovdqu ymmword ptr [r8 + 2*rdi + 96], ymm3
- add rdi, 64
+.LBB0_819: # =>This Inner Loop Header: Depth=1
+ vmovupd ymm0, ymmword ptr [rcx + 8*rdi]
+ vmovupd ymm1, ymmword ptr [rcx + 8*rdi + 32]
+ vmovupd ymm2, ymmword ptr [rcx + 8*rdi + 64]
+ vmovupd ymm3, ymmword ptr [rcx + 8*rdi + 96]
+ vmulpd ymm0, ymm0, ymmword ptr [rdx + 8*rdi]
+ vmulpd ymm1, ymm1, ymmword ptr [rdx + 8*rdi + 32]
+ vmulpd ymm2, ymm2, ymmword ptr [rdx + 8*rdi + 64]
+ vmulpd ymm3, ymm3, ymmword ptr [rdx + 8*rdi + 96]
+ vmovupd ymmword ptr [r8 + 8*rdi], ymm0
+ vmovupd ymmword ptr [r8 + 8*rdi + 32], ymm1
+ vmovupd ymmword ptr [r8 + 8*rdi + 64], ymm2
+ vmovupd ymmword ptr [r8 + 8*rdi + 96], ymm3
+ add rdi, 16
+ cmp rsi, rdi
+ jne .LBB0_819
+# %bb.820:
+ cmp rsi, r10
+ jne .LBB0_821
+ jmp .LBB0_825
+.LBB0_411:
+ lea rsi, [r8 + 8*r10]
+ lea rax, [rdx + 8*r10]
+ cmp rax, r8
+ seta r9b
+ lea rax, [rcx + 8*r10]
+ cmp rsi, rdx
+ seta r11b
+ cmp rax, r8
+ seta al
+ cmp rsi, rcx
+ seta dil
+ xor esi, esi
+ test r9b, r11b
+ jne .LBB0_416
+# %bb.412:
+ and al, dil
+ jne .LBB0_416
+# %bb.413:
+ mov esi, r10d
+ and esi, -16
+ xor edi, edi
+.LBB0_414: # =>This Inner Loop Header: Depth=1
+ vmovupd ymm0, ymmword ptr [rdx + 8*rdi]
+ vmovupd ymm1, ymmword ptr [rdx + 8*rdi + 32]
+ vmovupd ymm2, ymmword ptr [rdx + 8*rdi + 64]
+ vmovupd ymm3, ymmword ptr [rdx + 8*rdi + 96]
+ vsubpd ymm0, ymm0, ymmword ptr [rcx + 8*rdi]
+ vsubpd ymm1, ymm1, ymmword ptr [rcx + 8*rdi + 32]
+ vsubpd ymm2, ymm2, ymmword ptr [rcx + 8*rdi + 64]
+ vsubpd ymm3, ymm3, ymmword ptr [rcx + 8*rdi + 96]
+ vmovupd ymmword ptr [r8 + 8*rdi], ymm0
+ vmovupd ymmword ptr [r8 + 8*rdi + 32], ymm1
+ vmovupd ymmword ptr [r8 + 8*rdi + 64], ymm2
+ vmovupd ymmword ptr [r8 + 8*rdi + 96], ymm3
+ add rdi, 16
cmp rsi, rdi
- jne .LBB0_71
-# %bb.72:
+ jne .LBB0_414
+# %bb.415:
cmp rsi, r10
- je .LBB0_537
-.LBB0_73:
- mov r9, rsi
- not r9
- add r9, r10
- mov rax, r10
- and rax, 3
- je .LBB0_75
-.LBB0_74: # =>This Inner Loop Header: Depth=1
- movzx edi, word ptr [rcx + 2*rsi]
- add di, word ptr [rdx + 2*rsi]
- mov word ptr [r8 + 2*rsi], di
- add rsi, 1
- add rax, -1
- jne .LBB0_74
-.LBB0_75:
- cmp r9, 3
- jb .LBB0_537
-.LBB0_76: # =>This Inner Loop Header: Depth=1
- movzx eax, word ptr [rcx + 2*rsi]
- add ax, word ptr [rdx + 2*rsi]
- mov word ptr [r8 + 2*rsi], ax
- movzx eax, word ptr [rcx + 2*rsi + 2]
- add ax, word ptr [rdx + 2*rsi + 2]
- mov word ptr [r8 + 2*rsi + 2], ax
- movzx eax, word ptr [rcx + 2*rsi + 4]
- add ax, word ptr [rdx + 2*rsi + 4]
- mov word ptr [r8 + 2*rsi + 4], ax
- movzx eax, word ptr [rcx + 2*rsi + 6]
- add ax, word ptr [rdx + 2*rsi + 6]
- mov word ptr [r8 + 2*rsi + 6], ax
- add rsi, 4
- cmp r10, rsi
- jne .LBB0_76
- jmp .LBB0_537
-.LBB0_80:
- lea rsi, [r8 + 2*r10]
- lea rax, [rdx + 2*r10]
+ jne .LBB0_416
+ jmp .LBB0_825
+.LBB0_544:
+ lea rsi, [r8 + 8*r10]
+ lea rax, [rdx + 8*r10]
cmp rax, r8
seta r9b
- lea rax, [rcx + 2*r10]
+ lea rax, [rcx + 8*r10]
cmp rsi, rdx
seta r11b
cmp rax, r8
@@ -2168,68 +4575,35 @@ arithmetic_avx2: # @arithmetic_avx2
seta dil
xor esi, esi
test r9b, r11b
- jne .LBB0_85
-# %bb.81:
+ jne .LBB0_549
+# %bb.545:
and al, dil
- jne .LBB0_85
-# %bb.82:
+ jne .LBB0_549
+# %bb.546:
mov esi, r10d
- and esi, -64
+ and esi, -16
xor edi, edi
-.LBB0_83: # =>This Inner Loop Header: Depth=1
- vmovdqu ymm0, ymmword ptr [rcx + 2*rdi]
- vmovdqu ymm1, ymmword ptr [rcx + 2*rdi + 32]
- vmovdqu ymm2, ymmword ptr [rcx + 2*rdi + 64]
- vmovdqu ymm3, ymmword ptr [rcx + 2*rdi + 96]
- vpaddw ymm0, ymm0, ymmword ptr [rdx + 2*rdi]
- vpaddw ymm1, ymm1, ymmword ptr [rdx + 2*rdi + 32]
- vpaddw ymm2, ymm2, ymmword ptr [rdx + 2*rdi + 64]
- vpaddw ymm3, ymm3, ymmword ptr [rdx + 2*rdi + 96]
- vmovdqu ymmword ptr [r8 + 2*rdi], ymm0
- vmovdqu ymmword ptr [r8 + 2*rdi + 32], ymm1
- vmovdqu ymmword ptr [r8 + 2*rdi + 64], ymm2
- vmovdqu ymmword ptr [r8 + 2*rdi + 96], ymm3
- add rdi, 64
+.LBB0_547: # =>This Inner Loop Header: Depth=1
+ vmovupd ymm0, ymmword ptr [rdx + 8*rdi]
+ vmovupd ymm1, ymmword ptr [rdx + 8*rdi + 32]
+ vmovupd ymm2, ymmword ptr [rdx + 8*rdi + 64]
+ vmovupd ymm3, ymmword ptr [rdx + 8*rdi + 96]
+ vsubpd ymm0, ymm0, ymmword ptr [rcx + 8*rdi]
+ vsubpd ymm1, ymm1, ymmword ptr [rcx + 8*rdi + 32]
+ vsubpd ymm2, ymm2, ymmword ptr [rcx + 8*rdi + 64]
+ vsubpd ymm3, ymm3, ymmword ptr [rcx + 8*rdi + 96]
+ vmovupd ymmword ptr [r8 + 8*rdi], ymm0
+ vmovupd ymmword ptr [r8 + 8*rdi + 32], ymm1
+ vmovupd ymmword ptr [r8 + 8*rdi + 64], ymm2
+ vmovupd ymmword ptr [r8 + 8*rdi + 96], ymm3
+ add rdi, 16
cmp rsi, rdi
- jne .LBB0_83
-# %bb.84:
+ jne .LBB0_547
+# %bb.548:
cmp rsi, r10
- je .LBB0_537
-.LBB0_85:
- mov r9, rsi
- not r9
- add r9, r10
- mov rax, r10
- and rax, 3
- je .LBB0_87
-.LBB0_86: # =>This Inner Loop Header: Depth=1
- movzx edi, word ptr [rcx + 2*rsi]
- add di, word ptr [rdx + 2*rsi]
- mov word ptr [r8 + 2*rsi], di
- add rsi, 1
- add rax, -1
- jne .LBB0_86
-.LBB0_87:
- cmp r9, 3
- jb .LBB0_537
-.LBB0_88: # =>This Inner Loop Header: Depth=1
- movzx eax, word ptr [rcx + 2*rsi]
- add ax, word ptr [rdx + 2*rsi]
- mov word ptr [r8 + 2*rsi], ax
- movzx eax, word ptr [rcx + 2*rsi + 2]
- add ax, word ptr [rdx + 2*rsi + 2]
- mov word ptr [r8 + 2*rsi + 2], ax
- movzx eax, word ptr [rcx + 2*rsi + 4]
- add ax, word ptr [rdx + 2*rsi + 4]
- mov word ptr [r8 + 2*rsi + 4], ax
- movzx eax, word ptr [rcx + 2*rsi + 6]
- add ax, word ptr [rdx + 2*rsi + 6]
- mov word ptr [r8 + 2*rsi + 6], ax
- add rsi, 4
- cmp r10, rsi
- jne .LBB0_88
- jmp .LBB0_537
-.LBB0_201:
+ jne .LBB0_549
+ jmp .LBB0_825
+.LBB0_605:
lea rsi, [r8 + 2*r10]
lea rax, [rdx + 2*r10]
cmp rax, r8
@@ -2243,68 +4617,35 @@ arithmetic_avx2: # @arithmetic_avx2
seta dil
xor esi, esi
test r9b, r11b
- jne .LBB0_206
-# %bb.202:
+ jne .LBB0_610
+# %bb.606:
and al, dil
- jne .LBB0_206
-# %bb.203:
+ jne .LBB0_610
+# %bb.607:
mov esi, r10d
and esi, -64
xor edi, edi
-.LBB0_204: # =>This Inner Loop Header: Depth=1
+.LBB0_608: # =>This Inner Loop Header: Depth=1
vmovdqu ymm0, ymmword ptr [rcx + 2*rdi]
vmovdqu ymm1, ymmword ptr [rcx + 2*rdi + 32]
vmovdqu ymm2, ymmword ptr [rcx + 2*rdi + 64]
vmovdqu ymm3, ymmword ptr [rcx + 2*rdi + 96]
- vpaddw ymm0, ymm0, ymmword ptr [rdx + 2*rdi]
- vpaddw ymm1, ymm1, ymmword ptr [rdx + 2*rdi + 32]
- vpaddw ymm2, ymm2, ymmword ptr [rdx + 2*rdi + 64]
- vpaddw ymm3, ymm3, ymmword ptr [rdx + 2*rdi + 96]
+ vpmullw ymm0, ymm0, ymmword ptr [rdx + 2*rdi]
+ vpmullw ymm1, ymm1, ymmword ptr [rdx + 2*rdi + 32]
+ vpmullw ymm2, ymm2, ymmword ptr [rdx + 2*rdi + 64]
+ vpmullw ymm3, ymm3, ymmword ptr [rdx + 2*rdi + 96]
vmovdqu ymmword ptr [r8 + 2*rdi], ymm0
vmovdqu ymmword ptr [r8 + 2*rdi + 32], ymm1
vmovdqu ymmword ptr [r8 + 2*rdi + 64], ymm2
vmovdqu ymmword ptr [r8 + 2*rdi + 96], ymm3
add rdi, 64
cmp rsi, rdi
- jne .LBB0_204
-# %bb.205:
+ jne .LBB0_608
+# %bb.609:
cmp rsi, r10
- je .LBB0_537
-.LBB0_206:
- mov r9, rsi
- not r9
- add r9, r10
- mov rax, r10
- and rax, 3
- je .LBB0_208
-.LBB0_207: # =>This Inner Loop Header: Depth=1
- movzx edi, word ptr [rcx + 2*rsi]
- add di, word ptr [rdx + 2*rsi]
- mov word ptr [r8 + 2*rsi], di
- add rsi, 1
- add rax, -1
- jne .LBB0_207
-.LBB0_208:
- cmp r9, 3
- jb .LBB0_537
-.LBB0_209: # =>This Inner Loop Header: Depth=1
- movzx eax, word ptr [rcx + 2*rsi]
- add ax, word ptr [rdx + 2*rsi]
- mov word ptr [r8 + 2*rsi], ax
- movzx eax, word ptr [rcx + 2*rsi + 2]
- add ax, word ptr [rdx + 2*rsi + 2]
- mov word ptr [r8 + 2*rsi + 2], ax
- movzx eax, word ptr [rcx + 2*rsi + 4]
- add ax, word ptr [rdx + 2*rsi + 4]
- mov word ptr [r8 + 2*rsi + 4], ax
- movzx eax, word ptr [rcx + 2*rsi + 6]
- add ax, word ptr [rdx + 2*rsi + 6]
- mov word ptr [r8 + 2*rsi + 6], ax
- add rsi, 4
- cmp r10, rsi
- jne .LBB0_209
- jmp .LBB0_537
-.LBB0_213:
+ jne .LBB0_610
+ jmp .LBB0_825
+.LBB0_617:
lea rsi, [r8 + 2*r10]
lea rax, [rdx + 2*r10]
cmp rax, r8
@@ -2318,148 +4659,82 @@ arithmetic_avx2: # @arithmetic_avx2
seta dil
xor esi, esi
test r9b, r11b
- jne .LBB0_218
-# %bb.214:
+ jne .LBB0_622
+# %bb.618:
and al, dil
- jne .LBB0_218
-# %bb.215:
+ jne .LBB0_622
+# %bb.619:
mov esi, r10d
and esi, -64
xor edi, edi
-.LBB0_216: # =>This Inner Loop Header: Depth=1
+.LBB0_620: # =>This Inner Loop Header: Depth=1
vmovdqu ymm0, ymmword ptr [rcx + 2*rdi]
vmovdqu ymm1, ymmword ptr [rcx + 2*rdi + 32]
vmovdqu ymm2, ymmword ptr [rcx + 2*rdi + 64]
vmovdqu ymm3, ymmword ptr [rcx + 2*rdi + 96]
- vpaddw ymm0, ymm0, ymmword ptr [rdx + 2*rdi]
- vpaddw ymm1, ymm1, ymmword ptr [rdx + 2*rdi + 32]
- vpaddw ymm2, ymm2, ymmword ptr [rdx + 2*rdi + 64]
- vpaddw ymm3, ymm3, ymmword ptr [rdx + 2*rdi + 96]
+ vpmullw ymm0, ymm0, ymmword ptr [rdx + 2*rdi]
+ vpmullw ymm1, ymm1, ymmword ptr [rdx + 2*rdi + 32]
+ vpmullw ymm2, ymm2, ymmword ptr [rdx + 2*rdi + 64]
+ vpmullw ymm3, ymm3, ymmword ptr [rdx + 2*rdi + 96]
vmovdqu ymmword ptr [r8 + 2*rdi], ymm0
vmovdqu ymmword ptr [r8 + 2*rdi + 32], ymm1
vmovdqu ymmword ptr [r8 + 2*rdi + 64], ymm2
vmovdqu ymmword ptr [r8 + 2*rdi + 96], ymm3
add rdi, 64
cmp rsi, rdi
- jne .LBB0_216
-# %bb.217:
+ jne .LBB0_620
+# %bb.621:
cmp rsi, r10
- je .LBB0_537
-.LBB0_218:
- mov r9, rsi
- not r9
- add r9, r10
- mov rax, r10
- and rax, 3
- je .LBB0_220
-.LBB0_219: # =>This Inner Loop Header: Depth=1
- movzx edi, word ptr [rcx + 2*rsi]
- add di, word ptr [rdx + 2*rsi]
- mov word ptr [r8 + 2*rsi], di
- add rsi, 1
- add rax, -1
- jne .LBB0_219
-.LBB0_220:
- cmp r9, 3
- jb .LBB0_537
-.LBB0_221: # =>This Inner Loop Header: Depth=1
- movzx eax, word ptr [rcx + 2*rsi]
- add ax, word ptr [rdx + 2*rsi]
- mov word ptr [r8 + 2*rsi], ax
- movzx eax, word ptr [rcx + 2*rsi + 2]
- add ax, word ptr [rdx + 2*rsi + 2]
- mov word ptr [r8 + 2*rsi + 2], ax
- movzx eax, word ptr [rcx + 2*rsi + 4]
- add ax, word ptr [rdx + 2*rsi + 4]
- mov word ptr [r8 + 2*rsi + 4], ax
- movzx eax, word ptr [rcx + 2*rsi + 6]
- add ax, word ptr [rdx + 2*rsi + 6]
- mov word ptr [r8 + 2*rsi + 6], ax
- add rsi, 4
- cmp r10, rsi
- jne .LBB0_221
- jmp .LBB0_537
-.LBB0_381:
- lea rsi, [r8 + 8*r10]
- lea rax, [rdx + 8*r10]
- cmp rax, r8
- seta r9b
- lea rax, [rcx + 8*r10]
- cmp rsi, rdx
- seta r11b
+ jne .LBB0_622
+ jmp .LBB0_825
+.LBB0_741:
+ lea rsi, [r8 + 2*r10]
+ lea rax, [rdx + 2*r10]
cmp rax, r8
- seta al
- cmp rsi, rcx
- seta dil
- xor esi, esi
- test r9b, r11b
- jne .LBB0_386
-# %bb.382:
- and al, dil
- jne .LBB0_386
-# %bb.383:
- mov esi, r10d
- and esi, -16
- xor edi, edi
-.LBB0_384: # =>This Inner Loop Header: Depth=1
- vmovdqu ymm0, ymmword ptr [rdx + 8*rdi]
- vmovdqu ymm1, ymmword ptr [rdx + 8*rdi + 32]
- vmovdqu ymm2, ymmword ptr [rdx + 8*rdi + 64]
- vmovdqu ymm3, ymmword ptr [rdx + 8*rdi + 96]
- vpsubq ymm0, ymm0, ymmword ptr [rcx + 8*rdi]
- vpsubq ymm1, ymm1, ymmword ptr [rcx + 8*rdi + 32]
- vpsubq ymm2, ymm2, ymmword ptr [rcx + 8*rdi + 64]
- vpsubq ymm3, ymm3, ymmword ptr [rcx + 8*rdi + 96]
- vmovdqu ymmword ptr [r8 + 8*rdi], ymm0
- vmovdqu ymmword ptr [r8 + 8*rdi + 32], ymm1
- vmovdqu ymmword ptr [r8 + 8*rdi + 64], ymm2
- vmovdqu ymmword ptr [r8 + 8*rdi + 96], ymm3
- add rdi, 16
- cmp rsi, rdi
- jne .LBB0_384
-# %bb.385:
- cmp rsi, r10
- je .LBB0_537
-.LBB0_386:
- mov r9, rsi
- not r9
- add r9, r10
- mov rax, r10
- and rax, 3
- je .LBB0_388
-.LBB0_387: # =>This Inner Loop Header: Depth=1
- mov rdi, qword ptr [rdx + 8*rsi]
- sub rdi, qword ptr [rcx + 8*rsi]
- mov qword ptr [r8 + 8*rsi], rdi
- add rsi, 1
- add rax, -1
- jne .LBB0_387
-.LBB0_388:
- cmp r9, 3
- jb .LBB0_537
-.LBB0_389: # =>This Inner Loop Header: Depth=1
- mov rax, qword ptr [rdx + 8*rsi]
- sub rax, qword ptr [rcx + 8*rsi]
- mov qword ptr [r8 + 8*rsi], rax
- mov rax, qword ptr [rdx + 8*rsi + 8]
- sub rax, qword ptr [rcx + 8*rsi + 8]
- mov qword ptr [r8 + 8*rsi + 8], rax
- mov rax, qword ptr [rdx + 8*rsi + 16]
- sub rax, qword ptr [rcx + 8*rsi + 16]
- mov qword ptr [r8 + 8*rsi + 16], rax
- mov rax, qword ptr [rdx + 8*rsi + 24]
- sub rax, qword ptr [rcx + 8*rsi + 24]
- mov qword ptr [r8 + 8*rsi + 24], rax
- add rsi, 4
- cmp r10, rsi
- jne .LBB0_389
- jmp .LBB0_537
-.LBB0_393:
- lea rsi, [r8 + 4*r10]
- lea rax, [rdx + 4*r10]
+ seta r9b
+ lea rax, [rcx + 2*r10]
+ cmp rsi, rdx
+ seta r11b
+ cmp rax, r8
+ seta al
+ cmp rsi, rcx
+ seta dil
+ xor esi, esi
+ test r9b, r11b
+ jne .LBB0_746
+# %bb.742:
+ and al, dil
+ jne .LBB0_746
+# %bb.743:
+ mov esi, r10d
+ and esi, -64
+ xor edi, edi
+.LBB0_744: # =>This Inner Loop Header: Depth=1
+ vmovdqu ymm0, ymmword ptr [rcx + 2*rdi]
+ vmovdqu ymm1, ymmword ptr [rcx + 2*rdi + 32]
+ vmovdqu ymm2, ymmword ptr [rcx + 2*rdi + 64]
+ vmovdqu ymm3, ymmword ptr [rcx + 2*rdi + 96]
+ vpmullw ymm0, ymm0, ymmword ptr [rdx + 2*rdi]
+ vpmullw ymm1, ymm1, ymmword ptr [rdx + 2*rdi + 32]
+ vpmullw ymm2, ymm2, ymmword ptr [rdx + 2*rdi + 64]
+ vpmullw ymm3, ymm3, ymmword ptr [rdx + 2*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 2*rdi], ymm0
+ vmovdqu ymmword ptr [r8 + 2*rdi + 32], ymm1
+ vmovdqu ymmword ptr [r8 + 2*rdi + 64], ymm2
+ vmovdqu ymmword ptr [r8 + 2*rdi + 96], ymm3
+ add rdi, 64
+ cmp rsi, rdi
+ jne .LBB0_744
+# %bb.745:
+ cmp rsi, r10
+ jne .LBB0_746
+ jmp .LBB0_825
+.LBB0_753:
+ lea rsi, [r8 + 2*r10]
+ lea rax, [rdx + 2*r10]
cmp rax, r8
seta r9b
- lea rax, [rcx + 4*r10]
+ lea rax, [rcx + 2*r10]
cmp rsi, rdx
seta r11b
cmp rax, r8
@@ -2468,73 +4743,40 @@ arithmetic_avx2: # @arithmetic_avx2
seta dil
xor esi, esi
test r9b, r11b
- jne .LBB0_398
-# %bb.394:
+ jne .LBB0_758
+# %bb.754:
and al, dil
- jne .LBB0_398
-# %bb.395:
+ jne .LBB0_758
+# %bb.755:
mov esi, r10d
- and esi, -32
+ and esi, -64
xor edi, edi
-.LBB0_396: # =>This Inner Loop Header: Depth=1
- vmovups ymm0, ymmword ptr [rdx + 4*rdi]
- vmovups ymm1, ymmword ptr [rdx + 4*rdi + 32]
- vmovups ymm2, ymmword ptr [rdx + 4*rdi + 64]
- vmovups ymm3, ymmword ptr [rdx + 4*rdi + 96]
- vsubps ymm0, ymm0, ymmword ptr [rcx + 4*rdi]
- vsubps ymm1, ymm1, ymmword ptr [rcx + 4*rdi + 32]
- vsubps ymm2, ymm2, ymmword ptr [rcx + 4*rdi + 64]
- vsubps ymm3, ymm3, ymmword ptr [rcx + 4*rdi + 96]
- vmovups ymmword ptr [r8 + 4*rdi], ymm0
- vmovups ymmword ptr [r8 + 4*rdi + 32], ymm1
- vmovups ymmword ptr [r8 + 4*rdi + 64], ymm2
- vmovups ymmword ptr [r8 + 4*rdi + 96], ymm3
- add rdi, 32
+.LBB0_756: # =>This Inner Loop Header: Depth=1
+ vmovdqu ymm0, ymmword ptr [rcx + 2*rdi]
+ vmovdqu ymm1, ymmword ptr [rcx + 2*rdi + 32]
+ vmovdqu ymm2, ymmword ptr [rcx + 2*rdi + 64]
+ vmovdqu ymm3, ymmword ptr [rcx + 2*rdi + 96]
+ vpmullw ymm0, ymm0, ymmword ptr [rdx + 2*rdi]
+ vpmullw ymm1, ymm1, ymmword ptr [rdx + 2*rdi + 32]
+ vpmullw ymm2, ymm2, ymmword ptr [rdx + 2*rdi + 64]
+ vpmullw ymm3, ymm3, ymmword ptr [rdx + 2*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 2*rdi], ymm0
+ vmovdqu ymmword ptr [r8 + 2*rdi + 32], ymm1
+ vmovdqu ymmword ptr [r8 + 2*rdi + 64], ymm2
+ vmovdqu ymmword ptr [r8 + 2*rdi + 96], ymm3
+ add rdi, 64
cmp rsi, rdi
- jne .LBB0_396
-# %bb.397:
+ jne .LBB0_756
+# %bb.757:
cmp rsi, r10
- je .LBB0_537
-.LBB0_398:
- mov rdi, rsi
- not rdi
- add rdi, r10
- mov rax, r10
- and rax, 3
- je .LBB0_400
-.LBB0_399: # =>This Inner Loop Header: Depth=1
- vmovss xmm0, dword ptr [rdx + 4*rsi] # xmm0 = mem[0],zero,zero,zero
- vsubss xmm0, xmm0, dword ptr [rcx + 4*rsi]
- vmovss dword ptr [r8 + 4*rsi], xmm0
- add rsi, 1
- add rax, -1
- jne .LBB0_399
-.LBB0_400:
- cmp rdi, 3
- jb .LBB0_537
-.LBB0_401: # =>This Inner Loop Header: Depth=1
- vmovss xmm0, dword ptr [rdx + 4*rsi] # xmm0 = mem[0],zero,zero,zero
- vsubss xmm0, xmm0, dword ptr [rcx + 4*rsi]
- vmovss dword ptr [r8 + 4*rsi], xmm0
- vmovss xmm0, dword ptr [rdx + 4*rsi + 4] # xmm0 = mem[0],zero,zero,zero
- vsubss xmm0, xmm0, dword ptr [rcx + 4*rsi + 4]
- vmovss dword ptr [r8 + 4*rsi + 4], xmm0
- vmovss xmm0, dword ptr [rdx + 4*rsi + 8] # xmm0 = mem[0],zero,zero,zero
- vsubss xmm0, xmm0, dword ptr [rcx + 4*rsi + 8]
- vmovss dword ptr [r8 + 4*rsi + 8], xmm0
- vmovss xmm0, dword ptr [rdx + 4*rsi + 12] # xmm0 = mem[0],zero,zero,zero
- vsubss xmm0, xmm0, dword ptr [rcx + 4*rsi + 12]
- vmovss dword ptr [r8 + 4*rsi + 12], xmm0
- add rsi, 4
- cmp r10, rsi
- jne .LBB0_401
- jmp .LBB0_537
-.LBB0_507:
- lea rsi, [r8 + 8*r10]
- lea rax, [rdx + 8*r10]
+ jne .LBB0_758
+ jmp .LBB0_825
+.LBB0_336:
+ lea rsi, [r8 + 2*r10]
+ lea rax, [rdx + 2*r10]
cmp rax, r8
seta r9b
- lea rax, [rcx + 8*r10]
+ lea rax, [rcx + 2*r10]
cmp rsi, rdx
seta r11b
cmp rax, r8
@@ -2543,73 +4785,40 @@ arithmetic_avx2: # @arithmetic_avx2
seta dil
xor esi, esi
test r9b, r11b
- jne .LBB0_512
-# %bb.508:
+ jne .LBB0_341
+# %bb.337:
and al, dil
- jne .LBB0_512
-# %bb.509:
+ jne .LBB0_341
+# %bb.338:
mov esi, r10d
- and esi, -16
+ and esi, -64
xor edi, edi
-.LBB0_510: # =>This Inner Loop Header: Depth=1
- vmovdqu ymm0, ymmword ptr [rdx + 8*rdi]
- vmovdqu ymm1, ymmword ptr [rdx + 8*rdi + 32]
- vmovdqu ymm2, ymmword ptr [rdx + 8*rdi + 64]
- vmovdqu ymm3, ymmword ptr [rdx + 8*rdi + 96]
- vpsubq ymm0, ymm0, ymmword ptr [rcx + 8*rdi]
- vpsubq ymm1, ymm1, ymmword ptr [rcx + 8*rdi + 32]
- vpsubq ymm2, ymm2, ymmword ptr [rcx + 8*rdi + 64]
- vpsubq ymm3, ymm3, ymmword ptr [rcx + 8*rdi + 96]
- vmovdqu ymmword ptr [r8 + 8*rdi], ymm0
- vmovdqu ymmword ptr [r8 + 8*rdi + 32], ymm1
- vmovdqu ymmword ptr [r8 + 8*rdi + 64], ymm2
- vmovdqu ymmword ptr [r8 + 8*rdi + 96], ymm3
- add rdi, 16
+.LBB0_339: # =>This Inner Loop Header: Depth=1
+ vmovdqu ymm0, ymmword ptr [rdx + 2*rdi]
+ vmovdqu ymm1, ymmword ptr [rdx + 2*rdi + 32]
+ vmovdqu ymm2, ymmword ptr [rdx + 2*rdi + 64]
+ vmovdqu ymm3, ymmword ptr [rdx + 2*rdi + 96]
+ vpsubw ymm0, ymm0, ymmword ptr [rcx + 2*rdi]
+ vpsubw ymm1, ymm1, ymmword ptr [rcx + 2*rdi + 32]
+ vpsubw ymm2, ymm2, ymmword ptr [rcx + 2*rdi + 64]
+ vpsubw ymm3, ymm3, ymmword ptr [rcx + 2*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 2*rdi], ymm0
+ vmovdqu ymmword ptr [r8 + 2*rdi + 32], ymm1
+ vmovdqu ymmword ptr [r8 + 2*rdi + 64], ymm2
+ vmovdqu ymmword ptr [r8 + 2*rdi + 96], ymm3
+ add rdi, 64
cmp rsi, rdi
- jne .LBB0_510
-# %bb.511:
+ jne .LBB0_339
+# %bb.340:
cmp rsi, r10
- je .LBB0_537
-.LBB0_512:
- mov r9, rsi
- not r9
- add r9, r10
- mov rax, r10
- and rax, 3
- je .LBB0_514
-.LBB0_513: # =>This Inner Loop Header: Depth=1
- mov rdi, qword ptr [rdx + 8*rsi]
- sub rdi, qword ptr [rcx + 8*rsi]
- mov qword ptr [r8 + 8*rsi], rdi
- add rsi, 1
- add rax, -1
- jne .LBB0_513
-.LBB0_514:
- cmp r9, 3
- jb .LBB0_537
-.LBB0_515: # =>This Inner Loop Header: Depth=1
- mov rax, qword ptr [rdx + 8*rsi]
- sub rax, qword ptr [rcx + 8*rsi]
- mov qword ptr [r8 + 8*rsi], rax
- mov rax, qword ptr [rdx + 8*rsi + 8]
- sub rax, qword ptr [rcx + 8*rsi + 8]
- mov qword ptr [r8 + 8*rsi + 8], rax
- mov rax, qword ptr [rdx + 8*rsi + 16]
- sub rax, qword ptr [rcx + 8*rsi + 16]
- mov qword ptr [r8 + 8*rsi + 16], rax
- mov rax, qword ptr [rdx + 8*rsi + 24]
- sub rax, qword ptr [rcx + 8*rsi + 24]
- mov qword ptr [r8 + 8*rsi + 24], rax
- add rsi, 4
- cmp r10, rsi
- jne .LBB0_515
- jmp .LBB0_537
-.LBB0_519:
- lea rsi, [r8 + 4*r10]
- lea rax, [rdx + 4*r10]
+ jne .LBB0_341
+ jmp .LBB0_825
+.LBB0_348:
+ lea rsi, [r8 + 2*r10]
+ lea rax, [rdx + 2*r10]
cmp rax, r8
seta r9b
- lea rax, [rcx + 4*r10]
+ lea rax, [rcx + 2*r10]
cmp rsi, rdx
seta r11b
cmp rax, r8
@@ -2618,73 +4827,40 @@ arithmetic_avx2: # @arithmetic_avx2
seta dil
xor esi, esi
test r9b, r11b
- jne .LBB0_524
-# %bb.520:
+ jne .LBB0_353
+# %bb.349:
and al, dil
- jne .LBB0_524
-# %bb.521:
+ jne .LBB0_353
+# %bb.350:
mov esi, r10d
- and esi, -32
+ and esi, -64
xor edi, edi
-.LBB0_522: # =>This Inner Loop Header: Depth=1
- vmovups ymm0, ymmword ptr [rdx + 4*rdi]
- vmovups ymm1, ymmword ptr [rdx + 4*rdi + 32]
- vmovups ymm2, ymmword ptr [rdx + 4*rdi + 64]
- vmovups ymm3, ymmword ptr [rdx + 4*rdi + 96]
- vsubps ymm0, ymm0, ymmword ptr [rcx + 4*rdi]
- vsubps ymm1, ymm1, ymmword ptr [rcx + 4*rdi + 32]
- vsubps ymm2, ymm2, ymmword ptr [rcx + 4*rdi + 64]
- vsubps ymm3, ymm3, ymmword ptr [rcx + 4*rdi + 96]
- vmovups ymmword ptr [r8 + 4*rdi], ymm0
- vmovups ymmword ptr [r8 + 4*rdi + 32], ymm1
- vmovups ymmword ptr [r8 + 4*rdi + 64], ymm2
- vmovups ymmword ptr [r8 + 4*rdi + 96], ymm3
- add rdi, 32
+.LBB0_351: # =>This Inner Loop Header: Depth=1
+ vmovdqu ymm0, ymmword ptr [rdx + 2*rdi]
+ vmovdqu ymm1, ymmword ptr [rdx + 2*rdi + 32]
+ vmovdqu ymm2, ymmword ptr [rdx + 2*rdi + 64]
+ vmovdqu ymm3, ymmword ptr [rdx + 2*rdi + 96]
+ vpsubw ymm0, ymm0, ymmword ptr [rcx + 2*rdi]
+ vpsubw ymm1, ymm1, ymmword ptr [rcx + 2*rdi + 32]
+ vpsubw ymm2, ymm2, ymmword ptr [rcx + 2*rdi + 64]
+ vpsubw ymm3, ymm3, ymmword ptr [rcx + 2*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 2*rdi], ymm0
+ vmovdqu ymmword ptr [r8 + 2*rdi + 32], ymm1
+ vmovdqu ymmword ptr [r8 + 2*rdi + 64], ymm2
+ vmovdqu ymmword ptr [r8 + 2*rdi + 96], ymm3
+ add rdi, 64
cmp rsi, rdi
- jne .LBB0_522
-# %bb.523:
+ jne .LBB0_351
+# %bb.352:
cmp rsi, r10
- je .LBB0_537
-.LBB0_524:
- mov rdi, rsi
- not rdi
- add rdi, r10
- mov rax, r10
- and rax, 3
- je .LBB0_526
-.LBB0_525: # =>This Inner Loop Header: Depth=1
- vmovss xmm0, dword ptr [rdx + 4*rsi] # xmm0 = mem[0],zero,zero,zero
- vsubss xmm0, xmm0, dword ptr [rcx + 4*rsi]
- vmovss dword ptr [r8 + 4*rsi], xmm0
- add rsi, 1
- add rax, -1
- jne .LBB0_525
-.LBB0_526:
- cmp rdi, 3
- jb .LBB0_537
-.LBB0_527: # =>This Inner Loop Header: Depth=1
- vmovss xmm0, dword ptr [rdx + 4*rsi] # xmm0 = mem[0],zero,zero,zero
- vsubss xmm0, xmm0, dword ptr [rcx + 4*rsi]
- vmovss dword ptr [r8 + 4*rsi], xmm0
- vmovss xmm0, dword ptr [rdx + 4*rsi + 4] # xmm0 = mem[0],zero,zero,zero
- vsubss xmm0, xmm0, dword ptr [rcx + 4*rsi + 4]
- vmovss dword ptr [r8 + 4*rsi + 4], xmm0
- vmovss xmm0, dword ptr [rdx + 4*rsi + 8] # xmm0 = mem[0],zero,zero,zero
- vsubss xmm0, xmm0, dword ptr [rcx + 4*rsi + 8]
- vmovss dword ptr [r8 + 4*rsi + 8], xmm0
- vmovss xmm0, dword ptr [rdx + 4*rsi + 12] # xmm0 = mem[0],zero,zero,zero
- vsubss xmm0, xmm0, dword ptr [rcx + 4*rsi + 12]
- vmovss dword ptr [r8 + 4*rsi + 12], xmm0
- add rsi, 4
- cmp r10, rsi
- jne .LBB0_527
- jmp .LBB0_537
-.LBB0_122:
- lea rsi, [r8 + 8*r10]
- lea rax, [rdx + 8*r10]
+ jne .LBB0_353
+ jmp .LBB0_825
+.LBB0_469:
+ lea rsi, [r8 + 2*r10]
+ lea rax, [rdx + 2*r10]
cmp rax, r8
seta r9b
- lea rax, [rcx + 8*r10]
+ lea rax, [rcx + 2*r10]
cmp rsi, rdx
seta r11b
cmp rax, r8
@@ -2693,73 +4869,40 @@ arithmetic_avx2: # @arithmetic_avx2
seta dil
xor esi, esi
test r9b, r11b
- jne .LBB0_127
-# %bb.123:
+ jne .LBB0_474
+# %bb.470:
and al, dil
- jne .LBB0_127
-# %bb.124:
+ jne .LBB0_474
+# %bb.471:
mov esi, r10d
- and esi, -16
+ and esi, -64
xor edi, edi
-.LBB0_125: # =>This Inner Loop Header: Depth=1
- vmovdqu ymm0, ymmword ptr [rcx + 8*rdi]
- vmovdqu ymm1, ymmword ptr [rcx + 8*rdi + 32]
- vmovdqu ymm2, ymmword ptr [rcx + 8*rdi + 64]
- vmovdqu ymm3, ymmword ptr [rcx + 8*rdi + 96]
- vpaddq ymm0, ymm0, ymmword ptr [rdx + 8*rdi]
- vpaddq ymm1, ymm1, ymmword ptr [rdx + 8*rdi + 32]
- vpaddq ymm2, ymm2, ymmword ptr [rdx + 8*rdi + 64]
- vpaddq ymm3, ymm3, ymmword ptr [rdx + 8*rdi + 96]
- vmovdqu ymmword ptr [r8 + 8*rdi], ymm0
- vmovdqu ymmword ptr [r8 + 8*rdi + 32], ymm1
- vmovdqu ymmword ptr [r8 + 8*rdi + 64], ymm2
- vmovdqu ymmword ptr [r8 + 8*rdi + 96], ymm3
- add rdi, 16
+.LBB0_472: # =>This Inner Loop Header: Depth=1
+ vmovdqu ymm0, ymmword ptr [rdx + 2*rdi]
+ vmovdqu ymm1, ymmword ptr [rdx + 2*rdi + 32]
+ vmovdqu ymm2, ymmword ptr [rdx + 2*rdi + 64]
+ vmovdqu ymm3, ymmword ptr [rdx + 2*rdi + 96]
+ vpsubw ymm0, ymm0, ymmword ptr [rcx + 2*rdi]
+ vpsubw ymm1, ymm1, ymmword ptr [rcx + 2*rdi + 32]
+ vpsubw ymm2, ymm2, ymmword ptr [rcx + 2*rdi + 64]
+ vpsubw ymm3, ymm3, ymmword ptr [rcx + 2*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 2*rdi], ymm0
+ vmovdqu ymmword ptr [r8 + 2*rdi + 32], ymm1
+ vmovdqu ymmword ptr [r8 + 2*rdi + 64], ymm2
+ vmovdqu ymmword ptr [r8 + 2*rdi + 96], ymm3
+ add rdi, 64
cmp rsi, rdi
- jne .LBB0_125
-# %bb.126:
+ jne .LBB0_472
+# %bb.473:
cmp rsi, r10
- je .LBB0_537
-.LBB0_127:
- mov r9, rsi
- not r9
- add r9, r10
- mov rax, r10
- and rax, 3
- je .LBB0_129
-.LBB0_128: # =>This Inner Loop Header: Depth=1
- mov rdi, qword ptr [rcx + 8*rsi]
- add rdi, qword ptr [rdx + 8*rsi]
- mov qword ptr [r8 + 8*rsi], rdi
- add rsi, 1
- add rax, -1
- jne .LBB0_128
-.LBB0_129:
- cmp r9, 3
- jb .LBB0_537
-.LBB0_130: # =>This Inner Loop Header: Depth=1
- mov rax, qword ptr [rcx + 8*rsi]
- add rax, qword ptr [rdx + 8*rsi]
- mov qword ptr [r8 + 8*rsi], rax
- mov rax, qword ptr [rcx + 8*rsi + 8]
- add rax, qword ptr [rdx + 8*rsi + 8]
- mov qword ptr [r8 + 8*rsi + 8], rax
- mov rax, qword ptr [rcx + 8*rsi + 16]
- add rax, qword ptr [rdx + 8*rsi + 16]
- mov qword ptr [r8 + 8*rsi + 16], rax
- mov rax, qword ptr [rcx + 8*rsi + 24]
- add rax, qword ptr [rdx + 8*rsi + 24]
- mov qword ptr [r8 + 8*rsi + 24], rax
- add rsi, 4
- cmp r10, rsi
- jne .LBB0_130
- jmp .LBB0_537
-.LBB0_134:
- lea rsi, [r8 + 4*r10]
- lea rax, [rdx + 4*r10]
+ jne .LBB0_474
+ jmp .LBB0_825
+.LBB0_481:
+ lea rsi, [r8 + 2*r10]
+ lea rax, [rdx + 2*r10]
cmp rax, r8
seta r9b
- lea rax, [rcx + 4*r10]
+ lea rax, [rcx + 2*r10]
cmp rsi, rdx
seta r11b
cmp rax, r8
@@ -2768,68 +4911,35 @@ arithmetic_avx2: # @arithmetic_avx2
seta dil
xor esi, esi
test r9b, r11b
- jne .LBB0_139
-# %bb.135:
+ jne .LBB0_486
+# %bb.482:
and al, dil
- jne .LBB0_139
-# %bb.136:
+ jne .LBB0_486
+# %bb.483:
mov esi, r10d
- and esi, -32
+ and esi, -64
xor edi, edi
-.LBB0_137: # =>This Inner Loop Header: Depth=1
- vmovups ymm0, ymmword ptr [rcx + 4*rdi]
- vmovups ymm1, ymmword ptr [rcx + 4*rdi + 32]
- vmovups ymm2, ymmword ptr [rcx + 4*rdi + 64]
- vmovups ymm3, ymmword ptr [rcx + 4*rdi + 96]
- vaddps ymm0, ymm0, ymmword ptr [rdx + 4*rdi]
- vaddps ymm1, ymm1, ymmword ptr [rdx + 4*rdi + 32]
- vaddps ymm2, ymm2, ymmword ptr [rdx + 4*rdi + 64]
- vaddps ymm3, ymm3, ymmword ptr [rdx + 4*rdi + 96]
- vmovups ymmword ptr [r8 + 4*rdi], ymm0
- vmovups ymmword ptr [r8 + 4*rdi + 32], ymm1
- vmovups ymmword ptr [r8 + 4*rdi + 64], ymm2
- vmovups ymmword ptr [r8 + 4*rdi + 96], ymm3
- add rdi, 32
+.LBB0_484: # =>This Inner Loop Header: Depth=1
+ vmovdqu ymm0, ymmword ptr [rdx + 2*rdi]
+ vmovdqu ymm1, ymmword ptr [rdx + 2*rdi + 32]
+ vmovdqu ymm2, ymmword ptr [rdx + 2*rdi + 64]
+ vmovdqu ymm3, ymmword ptr [rdx + 2*rdi + 96]
+ vpsubw ymm0, ymm0, ymmword ptr [rcx + 2*rdi]
+ vpsubw ymm1, ymm1, ymmword ptr [rcx + 2*rdi + 32]
+ vpsubw ymm2, ymm2, ymmword ptr [rcx + 2*rdi + 64]
+ vpsubw ymm3, ymm3, ymmword ptr [rcx + 2*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 2*rdi], ymm0
+ vmovdqu ymmword ptr [r8 + 2*rdi + 32], ymm1
+ vmovdqu ymmword ptr [r8 + 2*rdi + 64], ymm2
+ vmovdqu ymmword ptr [r8 + 2*rdi + 96], ymm3
+ add rdi, 64
cmp rsi, rdi
- jne .LBB0_137
-# %bb.138:
+ jne .LBB0_484
+# %bb.485:
cmp rsi, r10
- je .LBB0_537
-.LBB0_139:
- mov rdi, rsi
- not rdi
- add rdi, r10
- mov rax, r10
- and rax, 3
- je .LBB0_141
-.LBB0_140: # =>This Inner Loop Header: Depth=1
- vmovss xmm0, dword ptr [rcx + 4*rsi] # xmm0 = mem[0],zero,zero,zero
- vaddss xmm0, xmm0, dword ptr [rdx + 4*rsi]
- vmovss dword ptr [r8 + 4*rsi], xmm0
- add rsi, 1
- add rax, -1
- jne .LBB0_140
-.LBB0_141:
- cmp rdi, 3
- jb .LBB0_537
-.LBB0_142: # =>This Inner Loop Header: Depth=1
- vmovss xmm0, dword ptr [rcx + 4*rsi] # xmm0 = mem[0],zero,zero,zero
- vaddss xmm0, xmm0, dword ptr [rdx + 4*rsi]
- vmovss dword ptr [r8 + 4*rsi], xmm0
- vmovss xmm0, dword ptr [rcx + 4*rsi + 4] # xmm0 = mem[0],zero,zero,zero
- vaddss xmm0, xmm0, dword ptr [rdx + 4*rsi + 4]
- vmovss dword ptr [r8 + 4*rsi + 4], xmm0
- vmovss xmm0, dword ptr [rcx + 4*rsi + 8] # xmm0 = mem[0],zero,zero,zero
- vaddss xmm0, xmm0, dword ptr [rdx + 4*rsi + 8]
- vmovss dword ptr [r8 + 4*rsi + 8], xmm0
- vmovss xmm0, dword ptr [rcx + 4*rsi + 12] # xmm0 = mem[0],zero,zero,zero
- vaddss xmm0, xmm0, dword ptr [rdx + 4*rsi + 12]
- vmovss dword ptr [r8 + 4*rsi + 12], xmm0
- add rsi, 4
- cmp r10, rsi
- jne .LBB0_142
- jmp .LBB0_537
-.LBB0_255:
+ jne .LBB0_486
+ jmp .LBB0_825
+.LBB0_659:
lea rsi, [r8 + 8*r10]
lea rax, [rdx + 8*r10]
cmp rax, r8
@@ -2843,68 +4953,67 @@ arithmetic_avx2: # @arithmetic_avx2
seta dil
xor esi, esi
test r9b, r11b
- jne .LBB0_260
-# %bb.256:
+ jne .LBB0_664
+# %bb.660:
and al, dil
- jne .LBB0_260
-# %bb.257:
+ jne .LBB0_664
+# %bb.661:
mov esi, r10d
and esi, -16
xor edi, edi
-.LBB0_258: # =>This Inner Loop Header: Depth=1
- vmovdqu ymm0, ymmword ptr [rcx + 8*rdi]
- vmovdqu ymm1, ymmword ptr [rcx + 8*rdi + 32]
- vmovdqu ymm2, ymmword ptr [rcx + 8*rdi + 64]
- vmovdqu ymm3, ymmword ptr [rcx + 8*rdi + 96]
- vpaddq ymm0, ymm0, ymmword ptr [rdx + 8*rdi]
- vpaddq ymm1, ymm1, ymmword ptr [rdx + 8*rdi + 32]
- vpaddq ymm2, ymm2, ymmword ptr [rdx + 8*rdi + 64]
- vpaddq ymm3, ymm3, ymmword ptr [rdx + 8*rdi + 96]
- vmovdqu ymmword ptr [r8 + 8*rdi], ymm0
- vmovdqu ymmword ptr [r8 + 8*rdi + 32], ymm1
- vmovdqu ymmword ptr [r8 + 8*rdi + 64], ymm2
- vmovdqu ymmword ptr [r8 + 8*rdi + 96], ymm3
+.LBB0_662: # =>This Inner Loop Header: Depth=1
+ vmovdqu ymm1, ymmword ptr [rdx + 8*rdi]
+ vmovdqu ymm2, ymmword ptr [rdx + 8*rdi + 32]
+ vmovdqu ymm3, ymmword ptr [rdx + 8*rdi + 64]
+ vmovdqu ymm0, ymmword ptr [rdx + 8*rdi + 96]
+ vmovdqu ymm4, ymmword ptr [rcx + 8*rdi]
+ vmovdqu ymm5, ymmword ptr [rcx + 8*rdi + 32]
+ vmovdqu ymm6, ymmword ptr [rcx + 8*rdi + 64]
+ vmovdqu ymm7, ymmword ptr [rcx + 8*rdi + 96]
+ vpsrlq ymm8, ymm4, 32
+ vpmuludq ymm8, ymm8, ymm1
+ vpsrlq ymm9, ymm1, 32
+ vpmuludq ymm9, ymm9, ymm4
+ vpaddq ymm8, ymm9, ymm8
+ vpsllq ymm8, ymm8, 32
+ vpmuludq ymm1, ymm4, ymm1
+ vpaddq ymm1, ymm8, ymm1
+ vpsrlq ymm4, ymm5, 32
+ vpmuludq ymm4, ymm4, ymm2
+ vpsrlq ymm8, ymm2, 32
+ vpmuludq ymm8, ymm8, ymm5
+ vpaddq ymm4, ymm8, ymm4
+ vpsllq ymm4, ymm4, 32
+ vpmuludq ymm2, ymm5, ymm2
+ vpaddq ymm2, ymm2, ymm4
+ vpsrlq ymm4, ymm6, 32
+ vpmuludq ymm4, ymm4, ymm3
+ vpsrlq ymm5, ymm3, 32
+ vpmuludq ymm5, ymm6, ymm5
+ vpaddq ymm4, ymm5, ymm4
+ vpsllq ymm4, ymm4, 32
+ vpmuludq ymm3, ymm6, ymm3
+ vpaddq ymm3, ymm3, ymm4
+ vpsrlq ymm4, ymm7, 32
+ vpmuludq ymm4, ymm4, ymm0
+ vpsrlq ymm5, ymm0, 32
+ vpmuludq ymm5, ymm7, ymm5
+ vpaddq ymm4, ymm5, ymm4
+ vpsllq ymm4, ymm4, 32
+ vpmuludq ymm0, ymm7, ymm0
+ vpaddq ymm0, ymm0, ymm4
+ vmovdqu ymmword ptr [r8 + 8*rdi], ymm1
+ vmovdqu ymmword ptr [r8 + 8*rdi + 32], ymm2
+ vmovdqu ymmword ptr [r8 + 8*rdi + 64], ymm3
+ vmovdqu ymmword ptr [r8 + 8*rdi + 96], ymm0
add rdi, 16
cmp rsi, rdi
- jne .LBB0_258
-# %bb.259:
+ jne .LBB0_662
+# %bb.663:
cmp rsi, r10
- je .LBB0_537
-.LBB0_260:
- mov r9, rsi
- not r9
- add r9, r10
- mov rax, r10
- and rax, 3
- je .LBB0_262
-.LBB0_261: # =>This Inner Loop Header: Depth=1
- mov rdi, qword ptr [rcx + 8*rsi]
- add rdi, qword ptr [rdx + 8*rsi]
- mov qword ptr [r8 + 8*rsi], rdi
- add rsi, 1
- add rax, -1
- jne .LBB0_261
-.LBB0_262:
- cmp r9, 3
- jb .LBB0_537
-.LBB0_263: # =>This Inner Loop Header: Depth=1
- mov rax, qword ptr [rcx + 8*rsi]
- add rax, qword ptr [rdx + 8*rsi]
- mov qword ptr [r8 + 8*rsi], rax
- mov rax, qword ptr [rcx + 8*rsi + 8]
- add rax, qword ptr [rdx + 8*rsi + 8]
- mov qword ptr [r8 + 8*rsi + 8], rax
- mov rax, qword ptr [rcx + 8*rsi + 16]
- add rax, qword ptr [rdx + 8*rsi + 16]
- mov qword ptr [r8 + 8*rsi + 16], rax
- mov rax, qword ptr [rcx + 8*rsi + 24]
- add rax, qword ptr [rdx + 8*rsi + 24]
- mov qword ptr [r8 + 8*rsi + 24], rax
- add rsi, 4
- cmp r10, rsi
- jne .LBB0_263
- jmp .LBB0_537
-.LBB0_267:
+ jne .LBB0_664
+ jmp .LBB0_825
+.LBB0_671:
lea rsi, [r8 + 4*r10]
lea rax, [rdx + 4*r10]
cmp rax, r8
@@ -2918,148 +5027,40 @@ arithmetic_avx2: # @arithmetic_avx2
seta dil
xor esi, esi
test r9b, r11b
- jne .LBB0_272
-# %bb.268:
+ jne .LBB0_676
+# %bb.672:
and al, dil
- jne .LBB0_272
-# %bb.269:
+ jne .LBB0_676
+# %bb.673:
mov esi, r10d
and esi, -32
xor edi, edi
-.LBB0_270: # =>This Inner Loop Header: Depth=1
+.LBB0_674: # =>This Inner Loop Header: Depth=1
vmovups ymm0, ymmword ptr [rcx + 4*rdi]
vmovups ymm1, ymmword ptr [rcx + 4*rdi + 32]
vmovups ymm2, ymmword ptr [rcx + 4*rdi + 64]
vmovups ymm3, ymmword ptr [rcx + 4*rdi + 96]
- vaddps ymm0, ymm0, ymmword ptr [rdx + 4*rdi]
- vaddps ymm1, ymm1, ymmword ptr [rdx + 4*rdi + 32]
- vaddps ymm2, ymm2, ymmword ptr [rdx + 4*rdi + 64]
- vaddps ymm3, ymm3, ymmword ptr [rdx + 4*rdi + 96]
+ vmulps ymm0, ymm0, ymmword ptr [rdx + 4*rdi]
+ vmulps ymm1, ymm1, ymmword ptr [rdx + 4*rdi + 32]
+ vmulps ymm2, ymm2, ymmword ptr [rdx + 4*rdi + 64]
+ vmulps ymm3, ymm3, ymmword ptr [rdx + 4*rdi + 96]
vmovups ymmword ptr [r8 + 4*rdi], ymm0
vmovups ymmword ptr [r8 + 4*rdi + 32], ymm1
- vmovups ymmword ptr [r8 + 4*rdi + 64], ymm2
- vmovups ymmword ptr [r8 + 4*rdi + 96], ymm3
- add rdi, 32
- cmp rsi, rdi
- jne .LBB0_270
-# %bb.271:
- cmp rsi, r10
- je .LBB0_537
-.LBB0_272:
- mov rdi, rsi
- not rdi
- add rdi, r10
- mov rax, r10
- and rax, 3
- je .LBB0_274
-.LBB0_273: # =>This Inner Loop Header: Depth=1
- vmovss xmm0, dword ptr [rcx + 4*rsi] # xmm0 = mem[0],zero,zero,zero
- vaddss xmm0, xmm0, dword ptr [rdx + 4*rsi]
- vmovss dword ptr [r8 + 4*rsi], xmm0
- add rsi, 1
- add rax, -1
- jne .LBB0_273
-.LBB0_274:
- cmp rdi, 3
- jb .LBB0_537
-.LBB0_275: # =>This Inner Loop Header: Depth=1
- vmovss xmm0, dword ptr [rcx + 4*rsi] # xmm0 = mem[0],zero,zero,zero
- vaddss xmm0, xmm0, dword ptr [rdx + 4*rsi]
- vmovss dword ptr [r8 + 4*rsi], xmm0
- vmovss xmm0, dword ptr [rcx + 4*rsi + 4] # xmm0 = mem[0],zero,zero,zero
- vaddss xmm0, xmm0, dword ptr [rdx + 4*rsi + 4]
- vmovss dword ptr [r8 + 4*rsi + 4], xmm0
- vmovss xmm0, dword ptr [rcx + 4*rsi + 8] # xmm0 = mem[0],zero,zero,zero
- vaddss xmm0, xmm0, dword ptr [rdx + 4*rsi + 8]
- vmovss dword ptr [r8 + 4*rsi + 8], xmm0
- vmovss xmm0, dword ptr [rcx + 4*rsi + 12] # xmm0 = mem[0],zero,zero,zero
- vaddss xmm0, xmm0, dword ptr [rdx + 4*rsi + 12]
- vmovss dword ptr [r8 + 4*rsi + 12], xmm0
- add rsi, 4
- cmp r10, rsi
- jne .LBB0_275
- jmp .LBB0_537
-.LBB0_306:
- lea rsi, [r8 + r10]
- lea rax, [rdx + r10]
- cmp rax, r8
- seta r9b
- lea rax, [rcx + r10]
- cmp rsi, rdx
- seta r11b
- cmp rax, r8
- seta al
- cmp rsi, rcx
- seta dil
- xor esi, esi
- test r9b, r11b
- jne .LBB0_311
-# %bb.307:
- and al, dil
- jne .LBB0_311
-# %bb.308:
- mov esi, r10d
- and esi, -128
- xor edi, edi
-.LBB0_309: # =>This Inner Loop Header: Depth=1
- vmovdqu ymm0, ymmword ptr [rdx + rdi]
- vmovdqu ymm1, ymmword ptr [rdx + rdi + 32]
- vmovdqu ymm2, ymmword ptr [rdx + rdi + 64]
- vmovdqu ymm3, ymmword ptr [rdx + rdi + 96]
- vpsubb ymm0, ymm0, ymmword ptr [rcx + rdi]
- vpsubb ymm1, ymm1, ymmword ptr [rcx + rdi + 32]
- vpsubb ymm2, ymm2, ymmword ptr [rcx + rdi + 64]
- vpsubb ymm3, ymm3, ymmword ptr [rcx + rdi + 96]
- vmovdqu ymmword ptr [r8 + rdi], ymm0
- vmovdqu ymmword ptr [r8 + rdi + 32], ymm1
- vmovdqu ymmword ptr [r8 + rdi + 64], ymm2
- vmovdqu ymmword ptr [r8 + rdi + 96], ymm3
- sub rdi, -128
- cmp rsi, rdi
- jne .LBB0_309
-# %bb.310:
- cmp rsi, r10
- je .LBB0_537
-.LBB0_311:
- mov r9, rsi
- not r9
- add r9, r10
- mov rdi, r10
- and rdi, 3
- je .LBB0_313
-.LBB0_312: # =>This Inner Loop Header: Depth=1
- movzx eax, byte ptr [rdx + rsi]
- sub al, byte ptr [rcx + rsi]
- mov byte ptr [r8 + rsi], al
- add rsi, 1
- add rdi, -1
- jne .LBB0_312
-.LBB0_313:
- cmp r9, 3
- jb .LBB0_537
-.LBB0_314: # =>This Inner Loop Header: Depth=1
- movzx eax, byte ptr [rdx + rsi]
- sub al, byte ptr [rcx + rsi]
- mov byte ptr [r8 + rsi], al
- movzx eax, byte ptr [rdx + rsi + 1]
- sub al, byte ptr [rcx + rsi + 1]
- mov byte ptr [r8 + rsi + 1], al
- movzx eax, byte ptr [rdx + rsi + 2]
- sub al, byte ptr [rcx + rsi + 2]
- mov byte ptr [r8 + rsi + 2], al
- movzx eax, byte ptr [rdx + rsi + 3]
- sub al, byte ptr [rcx + rsi + 3]
- mov byte ptr [r8 + rsi + 3], al
- add rsi, 4
- cmp r10, rsi
- jne .LBB0_314
- jmp .LBB0_537
-.LBB0_432:
- lea rsi, [r8 + r10]
- lea rax, [rdx + r10]
+ vmovups ymmword ptr [r8 + 4*rdi + 64], ymm2
+ vmovups ymmword ptr [r8 + 4*rdi + 96], ymm3
+ add rdi, 32
+ cmp rsi, rdi
+ jne .LBB0_674
+# %bb.675:
+ cmp rsi, r10
+ jne .LBB0_676
+ jmp .LBB0_825
+.LBB0_795:
+ lea rsi, [r8 + 8*r10]
+ lea rax, [rdx + 8*r10]
cmp rax, r8
seta r9b
- lea rax, [rcx + r10]
+ lea rax, [rcx + 8*r10]
cmp rsi, rdx
seta r11b
cmp rax, r8
@@ -3068,73 +5069,72 @@ arithmetic_avx2: # @arithmetic_avx2
seta dil
xor esi, esi
test r9b, r11b
- jne .LBB0_437
-# %bb.433:
+ jne .LBB0_800
+# %bb.796:
and al, dil
- jne .LBB0_437
-# %bb.434:
+ jne .LBB0_800
+# %bb.797:
mov esi, r10d
- and esi, -128
+ and esi, -16
xor edi, edi
-.LBB0_435: # =>This Inner Loop Header: Depth=1
- vmovdqu ymm0, ymmword ptr [rdx + rdi]
- vmovdqu ymm1, ymmword ptr [rdx + rdi + 32]
- vmovdqu ymm2, ymmword ptr [rdx + rdi + 64]
- vmovdqu ymm3, ymmword ptr [rdx + rdi + 96]
- vpsubb ymm0, ymm0, ymmword ptr [rcx + rdi]
- vpsubb ymm1, ymm1, ymmword ptr [rcx + rdi + 32]
- vpsubb ymm2, ymm2, ymmword ptr [rcx + rdi + 64]
- vpsubb ymm3, ymm3, ymmword ptr [rcx + rdi + 96]
- vmovdqu ymmword ptr [r8 + rdi], ymm0
- vmovdqu ymmword ptr [r8 + rdi + 32], ymm1
- vmovdqu ymmword ptr [r8 + rdi + 64], ymm2
- vmovdqu ymmword ptr [r8 + rdi + 96], ymm3
- sub rdi, -128
+.LBB0_798: # =>This Inner Loop Header: Depth=1
+ vmovdqu ymm1, ymmword ptr [rdx + 8*rdi]
+ vmovdqu ymm2, ymmword ptr [rdx + 8*rdi + 32]
+ vmovdqu ymm3, ymmword ptr [rdx + 8*rdi + 64]
+ vmovdqu ymm0, ymmword ptr [rdx + 8*rdi + 96]
+ vmovdqu ymm4, ymmword ptr [rcx + 8*rdi]
+ vmovdqu ymm5, ymmword ptr [rcx + 8*rdi + 32]
+ vmovdqu ymm6, ymmword ptr [rcx + 8*rdi + 64]
+ vmovdqu ymm7, ymmword ptr [rcx + 8*rdi + 96]
+ vpsrlq ymm8, ymm4, 32
+ vpmuludq ymm8, ymm8, ymm1
+ vpsrlq ymm9, ymm1, 32
+ vpmuludq ymm9, ymm9, ymm4
+ vpaddq ymm8, ymm9, ymm8
+ vpsllq ymm8, ymm8, 32
+ vpmuludq ymm1, ymm4, ymm1
+ vpaddq ymm1, ymm8, ymm1
+ vpsrlq ymm4, ymm5, 32
+ vpmuludq ymm4, ymm4, ymm2
+ vpsrlq ymm8, ymm2, 32
+ vpmuludq ymm8, ymm8, ymm5
+ vpaddq ymm4, ymm8, ymm4
+ vpsllq ymm4, ymm4, 32
+ vpmuludq ymm2, ymm5, ymm2
+ vpaddq ymm2, ymm2, ymm4
+ vpsrlq ymm4, ymm6, 32
+ vpmuludq ymm4, ymm4, ymm3
+ vpsrlq ymm5, ymm3, 32
+ vpmuludq ymm5, ymm6, ymm5
+ vpaddq ymm4, ymm5, ymm4
+ vpsllq ymm4, ymm4, 32
+ vpmuludq ymm3, ymm6, ymm3
+ vpaddq ymm3, ymm3, ymm4
+ vpsrlq ymm4, ymm7, 32
+ vpmuludq ymm4, ymm4, ymm0
+ vpsrlq ymm5, ymm0, 32
+ vpmuludq ymm5, ymm7, ymm5
+ vpaddq ymm4, ymm5, ymm4
+ vpsllq ymm4, ymm4, 32
+ vpmuludq ymm0, ymm7, ymm0
+ vpaddq ymm0, ymm0, ymm4
+ vmovdqu ymmword ptr [r8 + 8*rdi], ymm1
+ vmovdqu ymmword ptr [r8 + 8*rdi + 32], ymm2
+ vmovdqu ymmword ptr [r8 + 8*rdi + 64], ymm3
+ vmovdqu ymmword ptr [r8 + 8*rdi + 96], ymm0
+ add rdi, 16
cmp rsi, rdi
- jne .LBB0_435
-# %bb.436:
+ jne .LBB0_798
+# %bb.799:
cmp rsi, r10
- je .LBB0_537
-.LBB0_437:
- mov r9, rsi
- not r9
- add r9, r10
- mov rdi, r10
- and rdi, 3
- je .LBB0_439
-.LBB0_438: # =>This Inner Loop Header: Depth=1
- movzx eax, byte ptr [rdx + rsi]
- sub al, byte ptr [rcx + rsi]
- mov byte ptr [r8 + rsi], al
- add rsi, 1
- add rdi, -1
- jne .LBB0_438
-.LBB0_439:
- cmp r9, 3
- jb .LBB0_537
-.LBB0_440: # =>This Inner Loop Header: Depth=1
- movzx eax, byte ptr [rdx + rsi]
- sub al, byte ptr [rcx + rsi]
- mov byte ptr [r8 + rsi], al
- movzx eax, byte ptr [rdx + rsi + 1]
- sub al, byte ptr [rcx + rsi + 1]
- mov byte ptr [r8 + rsi + 1], al
- movzx eax, byte ptr [rdx + rsi + 2]
- sub al, byte ptr [rcx + rsi + 2]
- mov byte ptr [r8 + rsi + 2], al
- movzx eax, byte ptr [rdx + rsi + 3]
- sub al, byte ptr [rcx + rsi + 3]
- mov byte ptr [r8 + rsi + 3], al
- add rsi, 4
- cmp r10, rsi
- jne .LBB0_440
- jmp .LBB0_537
-.LBB0_47:
- lea rsi, [r8 + r10]
- lea rax, [rdx + r10]
+ jne .LBB0_800
+ jmp .LBB0_825
+.LBB0_807:
+ lea rsi, [r8 + 4*r10]
+ lea rax, [rdx + 4*r10]
cmp rax, r8
seta r9b
- lea rax, [rcx + r10]
+ lea rax, [rcx + 4*r10]
cmp rsi, rdx
seta r11b
cmp rax, r8
@@ -3143,73 +5143,40 @@ arithmetic_avx2: # @arithmetic_avx2
seta dil
xor esi, esi
test r9b, r11b
- jne .LBB0_52
-# %bb.48:
+ jne .LBB0_812
+# %bb.808:
and al, dil
- jne .LBB0_52
-# %bb.49:
+ jne .LBB0_812
+# %bb.809:
mov esi, r10d
- and esi, -128
+ and esi, -32
xor edi, edi
-.LBB0_50: # =>This Inner Loop Header: Depth=1
- vmovdqu ymm0, ymmword ptr [rcx + rdi]
- vmovdqu ymm1, ymmword ptr [rcx + rdi + 32]
- vmovdqu ymm2, ymmword ptr [rcx + rdi + 64]
- vmovdqu ymm3, ymmword ptr [rcx + rdi + 96]
- vpaddb ymm0, ymm0, ymmword ptr [rdx + rdi]
- vpaddb ymm1, ymm1, ymmword ptr [rdx + rdi + 32]
- vpaddb ymm2, ymm2, ymmword ptr [rdx + rdi + 64]
- vpaddb ymm3, ymm3, ymmword ptr [rdx + rdi + 96]
- vmovdqu ymmword ptr [r8 + rdi], ymm0
- vmovdqu ymmword ptr [r8 + rdi + 32], ymm1
- vmovdqu ymmword ptr [r8 + rdi + 64], ymm2
- vmovdqu ymmword ptr [r8 + rdi + 96], ymm3
- sub rdi, -128
+.LBB0_810: # =>This Inner Loop Header: Depth=1
+ vmovups ymm0, ymmword ptr [rcx + 4*rdi]
+ vmovups ymm1, ymmword ptr [rcx + 4*rdi + 32]
+ vmovups ymm2, ymmword ptr [rcx + 4*rdi + 64]
+ vmovups ymm3, ymmword ptr [rcx + 4*rdi + 96]
+ vmulps ymm0, ymm0, ymmword ptr [rdx + 4*rdi]
+ vmulps ymm1, ymm1, ymmword ptr [rdx + 4*rdi + 32]
+ vmulps ymm2, ymm2, ymmword ptr [rdx + 4*rdi + 64]
+ vmulps ymm3, ymm3, ymmword ptr [rdx + 4*rdi + 96]
+ vmovups ymmword ptr [r8 + 4*rdi], ymm0
+ vmovups ymmword ptr [r8 + 4*rdi + 32], ymm1
+ vmovups ymmword ptr [r8 + 4*rdi + 64], ymm2
+ vmovups ymmword ptr [r8 + 4*rdi + 96], ymm3
+ add rdi, 32
cmp rsi, rdi
- jne .LBB0_50
-# %bb.51:
+ jne .LBB0_810
+# %bb.811:
cmp rsi, r10
- je .LBB0_537
-.LBB0_52:
- mov r9, rsi
- not r9
- add r9, r10
- mov rdi, r10
- and rdi, 3
- je .LBB0_54
-.LBB0_53: # =>This Inner Loop Header: Depth=1
- movzx eax, byte ptr [rcx + rsi]
- add al, byte ptr [rdx + rsi]
- mov byte ptr [r8 + rsi], al
- add rsi, 1
- add rdi, -1
- jne .LBB0_53
-.LBB0_54:
- cmp r9, 3
- jb .LBB0_537
-.LBB0_55: # =>This Inner Loop Header: Depth=1
- movzx eax, byte ptr [rcx + rsi]
- add al, byte ptr [rdx + rsi]
- mov byte ptr [r8 + rsi], al
- movzx eax, byte ptr [rcx + rsi + 1]
- add al, byte ptr [rdx + rsi + 1]
- mov byte ptr [r8 + rsi + 1], al
- movzx eax, byte ptr [rcx + rsi + 2]
- add al, byte ptr [rdx + rsi + 2]
- mov byte ptr [r8 + rsi + 2], al
- movzx eax, byte ptr [rcx + rsi + 3]
- add al, byte ptr [rdx + rsi + 3]
- mov byte ptr [r8 + rsi + 3], al
- add rsi, 4
- cmp r10, rsi
- jne .LBB0_55
- jmp .LBB0_537
-.LBB0_180:
- lea rsi, [r8 + r10]
- lea rax, [rdx + r10]
+ jne .LBB0_812
+ jmp .LBB0_825
+.LBB0_390:
+ lea rsi, [r8 + 8*r10]
+ lea rax, [rdx + 8*r10]
cmp rax, r8
seta r9b
- lea rax, [rcx + r10]
+ lea rax, [rcx + 8*r10]
cmp rsi, rdx
seta r11b
cmp rax, r8
@@ -3218,68 +5185,35 @@ arithmetic_avx2: # @arithmetic_avx2
seta dil
xor esi, esi
test r9b, r11b
- jne .LBB0_185
-# %bb.181:
+ jne .LBB0_395
+# %bb.391:
and al, dil
- jne .LBB0_185
-# %bb.182:
+ jne .LBB0_395
+# %bb.392:
mov esi, r10d
- and esi, -128
+ and esi, -16
xor edi, edi
-.LBB0_183: # =>This Inner Loop Header: Depth=1
- vmovdqu ymm0, ymmword ptr [rcx + rdi]
- vmovdqu ymm1, ymmword ptr [rcx + rdi + 32]
- vmovdqu ymm2, ymmword ptr [rcx + rdi + 64]
- vmovdqu ymm3, ymmword ptr [rcx + rdi + 96]
- vpaddb ymm0, ymm0, ymmword ptr [rdx + rdi]
- vpaddb ymm1, ymm1, ymmword ptr [rdx + rdi + 32]
- vpaddb ymm2, ymm2, ymmword ptr [rdx + rdi + 64]
- vpaddb ymm3, ymm3, ymmword ptr [rdx + rdi + 96]
- vmovdqu ymmword ptr [r8 + rdi], ymm0
- vmovdqu ymmword ptr [r8 + rdi + 32], ymm1
- vmovdqu ymmword ptr [r8 + rdi + 64], ymm2
- vmovdqu ymmword ptr [r8 + rdi + 96], ymm3
- sub rdi, -128
+.LBB0_393: # =>This Inner Loop Header: Depth=1
+ vmovdqu ymm0, ymmword ptr [rdx + 8*rdi]
+ vmovdqu ymm1, ymmword ptr [rdx + 8*rdi + 32]
+ vmovdqu ymm2, ymmword ptr [rdx + 8*rdi + 64]
+ vmovdqu ymm3, ymmword ptr [rdx + 8*rdi + 96]
+ vpsubq ymm0, ymm0, ymmword ptr [rcx + 8*rdi]
+ vpsubq ymm1, ymm1, ymmword ptr [rcx + 8*rdi + 32]
+ vpsubq ymm2, ymm2, ymmword ptr [rcx + 8*rdi + 64]
+ vpsubq ymm3, ymm3, ymmword ptr [rcx + 8*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 8*rdi], ymm0
+ vmovdqu ymmword ptr [r8 + 8*rdi + 32], ymm1
+ vmovdqu ymmword ptr [r8 + 8*rdi + 64], ymm2
+ vmovdqu ymmword ptr [r8 + 8*rdi + 96], ymm3
+ add rdi, 16
cmp rsi, rdi
- jne .LBB0_183
-# %bb.184:
+ jne .LBB0_393
+# %bb.394:
cmp rsi, r10
- je .LBB0_537
-.LBB0_185:
- mov r9, rsi
- not r9
- add r9, r10
- mov rdi, r10
- and rdi, 3
- je .LBB0_187
-.LBB0_186: # =>This Inner Loop Header: Depth=1
- movzx eax, byte ptr [rcx + rsi]
- add al, byte ptr [rdx + rsi]
- mov byte ptr [r8 + rsi], al
- add rsi, 1
- add rdi, -1
- jne .LBB0_186
-.LBB0_187:
- cmp r9, 3
- jb .LBB0_537
-.LBB0_188: # =>This Inner Loop Header: Depth=1
- movzx eax, byte ptr [rcx + rsi]
- add al, byte ptr [rdx + rsi]
- mov byte ptr [r8 + rsi], al
- movzx eax, byte ptr [rcx + rsi + 1]
- add al, byte ptr [rdx + rsi + 1]
- mov byte ptr [r8 + rsi + 1], al
- movzx eax, byte ptr [rcx + rsi + 2]
- add al, byte ptr [rdx + rsi + 2]
- mov byte ptr [r8 + rsi + 2], al
- movzx eax, byte ptr [rcx + rsi + 3]
- add al, byte ptr [rdx + rsi + 3]
- mov byte ptr [r8 + rsi + 3], al
- add rsi, 4
- cmp r10, rsi
- jne .LBB0_188
- jmp .LBB0_537
-.LBB0_360:
+ jne .LBB0_395
+ jmp .LBB0_825
+.LBB0_402:
lea rsi, [r8 + 4*r10]
lea rax, [rdx + 4*r10]
cmp rax, r8
@@ -3293,73 +5227,40 @@ arithmetic_avx2: # @arithmetic_avx2
seta dil
xor esi, esi
test r9b, r11b
- jne .LBB0_365
-# %bb.361:
+ jne .LBB0_407
+# %bb.403:
and al, dil
- jne .LBB0_365
-# %bb.362:
+ jne .LBB0_407
+# %bb.404:
mov esi, r10d
and esi, -32
xor edi, edi
-.LBB0_363: # =>This Inner Loop Header: Depth=1
- vmovdqu ymm0, ymmword ptr [rdx + 4*rdi]
- vmovdqu ymm1, ymmword ptr [rdx + 4*rdi + 32]
- vmovdqu ymm2, ymmword ptr [rdx + 4*rdi + 64]
- vmovdqu ymm3, ymmword ptr [rdx + 4*rdi + 96]
- vpsubd ymm0, ymm0, ymmword ptr [rcx + 4*rdi]
- vpsubd ymm1, ymm1, ymmword ptr [rcx + 4*rdi + 32]
- vpsubd ymm2, ymm2, ymmword ptr [rcx + 4*rdi + 64]
- vpsubd ymm3, ymm3, ymmword ptr [rcx + 4*rdi + 96]
- vmovdqu ymmword ptr [r8 + 4*rdi], ymm0
- vmovdqu ymmword ptr [r8 + 4*rdi + 32], ymm1
- vmovdqu ymmword ptr [r8 + 4*rdi + 64], ymm2
- vmovdqu ymmword ptr [r8 + 4*rdi + 96], ymm3
- add rdi, 32
- cmp rsi, rdi
- jne .LBB0_363
-# %bb.364:
- cmp rsi, r10
- je .LBB0_537
-.LBB0_365:
- mov r9, rsi
- not r9
- add r9, r10
- mov rax, r10
- and rax, 3
- je .LBB0_367
-.LBB0_366: # =>This Inner Loop Header: Depth=1
- mov edi, dword ptr [rdx + 4*rsi]
- sub edi, dword ptr [rcx + 4*rsi]
- mov dword ptr [r8 + 4*rsi], edi
- add rsi, 1
- add rax, -1
- jne .LBB0_366
-.LBB0_367:
- cmp r9, 3
- jb .LBB0_537
-.LBB0_368: # =>This Inner Loop Header: Depth=1
- mov eax, dword ptr [rdx + 4*rsi]
- sub eax, dword ptr [rcx + 4*rsi]
- mov dword ptr [r8 + 4*rsi], eax
- mov eax, dword ptr [rdx + 4*rsi + 4]
- sub eax, dword ptr [rcx + 4*rsi + 4]
- mov dword ptr [r8 + 4*rsi + 4], eax
- mov eax, dword ptr [rdx + 4*rsi + 8]
- sub eax, dword ptr [rcx + 4*rsi + 8]
- mov dword ptr [r8 + 4*rsi + 8], eax
- mov eax, dword ptr [rdx + 4*rsi + 12]
- sub eax, dword ptr [rcx + 4*rsi + 12]
- mov dword ptr [r8 + 4*rsi + 12], eax
- add rsi, 4
- cmp r10, rsi
- jne .LBB0_368
- jmp .LBB0_537
-.LBB0_486:
- lea rsi, [r8 + 4*r10]
- lea rax, [rdx + 4*r10]
+.LBB0_405: # =>This Inner Loop Header: Depth=1
+ vmovups ymm0, ymmword ptr [rdx + 4*rdi]
+ vmovups ymm1, ymmword ptr [rdx + 4*rdi + 32]
+ vmovups ymm2, ymmword ptr [rdx + 4*rdi + 64]
+ vmovups ymm3, ymmword ptr [rdx + 4*rdi + 96]
+ vsubps ymm0, ymm0, ymmword ptr [rcx + 4*rdi]
+ vsubps ymm1, ymm1, ymmword ptr [rcx + 4*rdi + 32]
+ vsubps ymm2, ymm2, ymmword ptr [rcx + 4*rdi + 64]
+ vsubps ymm3, ymm3, ymmword ptr [rcx + 4*rdi + 96]
+ vmovups ymmword ptr [r8 + 4*rdi], ymm0
+ vmovups ymmword ptr [r8 + 4*rdi + 32], ymm1
+ vmovups ymmword ptr [r8 + 4*rdi + 64], ymm2
+ vmovups ymmword ptr [r8 + 4*rdi + 96], ymm3
+ add rdi, 32
+ cmp rsi, rdi
+ jne .LBB0_405
+# %bb.406:
+ cmp rsi, r10
+ jne .LBB0_407
+ jmp .LBB0_825
+.LBB0_523:
+ lea rsi, [r8 + 8*r10]
+ lea rax, [rdx + 8*r10]
cmp rax, r8
seta r9b
- lea rax, [rcx + 4*r10]
+ lea rax, [rcx + 8*r10]
cmp rsi, rdx
seta r11b
cmp rax, r8
@@ -3368,68 +5269,35 @@ arithmetic_avx2: # @arithmetic_avx2
seta dil
xor esi, esi
test r9b, r11b
- jne .LBB0_491
-# %bb.487:
+ jne .LBB0_528
+# %bb.524:
and al, dil
- jne .LBB0_491
-# %bb.488:
+ jne .LBB0_528
+# %bb.525:
mov esi, r10d
- and esi, -32
+ and esi, -16
xor edi, edi
-.LBB0_489: # =>This Inner Loop Header: Depth=1
- vmovdqu ymm0, ymmword ptr [rdx + 4*rdi]
- vmovdqu ymm1, ymmword ptr [rdx + 4*rdi + 32]
- vmovdqu ymm2, ymmword ptr [rdx + 4*rdi + 64]
- vmovdqu ymm3, ymmword ptr [rdx + 4*rdi + 96]
- vpsubd ymm0, ymm0, ymmword ptr [rcx + 4*rdi]
- vpsubd ymm1, ymm1, ymmword ptr [rcx + 4*rdi + 32]
- vpsubd ymm2, ymm2, ymmword ptr [rcx + 4*rdi + 64]
- vpsubd ymm3, ymm3, ymmword ptr [rcx + 4*rdi + 96]
- vmovdqu ymmword ptr [r8 + 4*rdi], ymm0
- vmovdqu ymmword ptr [r8 + 4*rdi + 32], ymm1
- vmovdqu ymmword ptr [r8 + 4*rdi + 64], ymm2
- vmovdqu ymmword ptr [r8 + 4*rdi + 96], ymm3
- add rdi, 32
+.LBB0_526: # =>This Inner Loop Header: Depth=1
+ vmovdqu ymm0, ymmword ptr [rdx + 8*rdi]
+ vmovdqu ymm1, ymmword ptr [rdx + 8*rdi + 32]
+ vmovdqu ymm2, ymmword ptr [rdx + 8*rdi + 64]
+ vmovdqu ymm3, ymmword ptr [rdx + 8*rdi + 96]
+ vpsubq ymm0, ymm0, ymmword ptr [rcx + 8*rdi]
+ vpsubq ymm1, ymm1, ymmword ptr [rcx + 8*rdi + 32]
+ vpsubq ymm2, ymm2, ymmword ptr [rcx + 8*rdi + 64]
+ vpsubq ymm3, ymm3, ymmword ptr [rcx + 8*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 8*rdi], ymm0
+ vmovdqu ymmword ptr [r8 + 8*rdi + 32], ymm1
+ vmovdqu ymmword ptr [r8 + 8*rdi + 64], ymm2
+ vmovdqu ymmword ptr [r8 + 8*rdi + 96], ymm3
+ add rdi, 16
cmp rsi, rdi
- jne .LBB0_489
-# %bb.490:
+ jne .LBB0_526
+# %bb.527:
cmp rsi, r10
- je .LBB0_537
-.LBB0_491:
- mov r9, rsi
- not r9
- add r9, r10
- mov rax, r10
- and rax, 3
- je .LBB0_493
-.LBB0_492: # =>This Inner Loop Header: Depth=1
- mov edi, dword ptr [rdx + 4*rsi]
- sub edi, dword ptr [rcx + 4*rsi]
- mov dword ptr [r8 + 4*rsi], edi
- add rsi, 1
- add rax, -1
- jne .LBB0_492
-.LBB0_493:
- cmp r9, 3
- jb .LBB0_537
-.LBB0_494: # =>This Inner Loop Header: Depth=1
- mov eax, dword ptr [rdx + 4*rsi]
- sub eax, dword ptr [rcx + 4*rsi]
- mov dword ptr [r8 + 4*rsi], eax
- mov eax, dword ptr [rdx + 4*rsi + 4]
- sub eax, dword ptr [rcx + 4*rsi + 4]
- mov dword ptr [r8 + 4*rsi + 4], eax
- mov eax, dword ptr [rdx + 4*rsi + 8]
- sub eax, dword ptr [rcx + 4*rsi + 8]
- mov dword ptr [r8 + 4*rsi + 8], eax
- mov eax, dword ptr [rdx + 4*rsi + 12]
- sub eax, dword ptr [rcx + 4*rsi + 12]
- mov dword ptr [r8 + 4*rsi + 12], eax
- add rsi, 4
- cmp r10, rsi
- jne .LBB0_494
- jmp .LBB0_537
-.LBB0_101:
+ jne .LBB0_528
+ jmp .LBB0_825
+.LBB0_535:
lea rsi, [r8 + 4*r10]
lea rax, [rdx + 4*r10]
cmp rax, r8
@@ -3443,289 +5311,805 @@ arithmetic_avx2: # @arithmetic_avx2
seta dil
xor esi, esi
test r9b, r11b
- jne .LBB0_106
-# %bb.102:
+ jne .LBB0_540
+# %bb.536:
and al, dil
- jne .LBB0_106
-# %bb.103:
+ jne .LBB0_540
+# %bb.537:
mov esi, r10d
and esi, -32
xor edi, edi
-.LBB0_104: # =>This Inner Loop Header: Depth=1
- vmovdqu ymm0, ymmword ptr [rcx + 4*rdi]
- vmovdqu ymm1, ymmword ptr [rcx + 4*rdi + 32]
- vmovdqu ymm2, ymmword ptr [rcx + 4*rdi + 64]
- vmovdqu ymm3, ymmword ptr [rcx + 4*rdi + 96]
- vpaddd ymm0, ymm0, ymmword ptr [rdx + 4*rdi]
- vpaddd ymm1, ymm1, ymmword ptr [rdx + 4*rdi + 32]
- vpaddd ymm2, ymm2, ymmword ptr [rdx + 4*rdi + 64]
- vpaddd ymm3, ymm3, ymmword ptr [rdx + 4*rdi + 96]
- vmovdqu ymmword ptr [r8 + 4*rdi], ymm0
- vmovdqu ymmword ptr [r8 + 4*rdi + 32], ymm1
- vmovdqu ymmword ptr [r8 + 4*rdi + 64], ymm2
- vmovdqu ymmword ptr [r8 + 4*rdi + 96], ymm3
+.LBB0_538: # =>This Inner Loop Header: Depth=1
+ vmovups ymm0, ymmword ptr [rdx + 4*rdi]
+ vmovups ymm1, ymmword ptr [rdx + 4*rdi + 32]
+ vmovups ymm2, ymmword ptr [rdx + 4*rdi + 64]
+ vmovups ymm3, ymmword ptr [rdx + 4*rdi + 96]
+ vsubps ymm0, ymm0, ymmword ptr [rcx + 4*rdi]
+ vsubps ymm1, ymm1, ymmword ptr [rcx + 4*rdi + 32]
+ vsubps ymm2, ymm2, ymmword ptr [rcx + 4*rdi + 64]
+ vsubps ymm3, ymm3, ymmword ptr [rcx + 4*rdi + 96]
+ vmovups ymmword ptr [r8 + 4*rdi], ymm0
+ vmovups ymmword ptr [r8 + 4*rdi + 32], ymm1
+ vmovups ymmword ptr [r8 + 4*rdi + 64], ymm2
+ vmovups ymmword ptr [r8 + 4*rdi + 96], ymm3
add rdi, 32
cmp rsi, rdi
- jne .LBB0_104
-# %bb.105:
+ jne .LBB0_538
+# %bb.539:
cmp rsi, r10
- je .LBB0_537
-.LBB0_106:
+ jne .LBB0_540
+ jmp .LBB0_825
+.LBB0_592:
+ and rax, -4
+ neg rax
+ xor esi, esi
+ vmovdqa ymm0, ymmword ptr [rip + .LCPI0_0] # ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+.LBB0_593: # =>This Inner Loop Header: Depth=1
+ vmovdqu ymm1, ymmword ptr [rdx + rsi]
+ vmovdqu ymm2, ymmword ptr [rcx + rsi]
+ vpunpckhbw ymm3, ymm1, ymm1 # ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+ vpunpckhbw ymm4, ymm2, ymm2 # ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+ vpmullw ymm3, ymm4, ymm3
+ vpand ymm3, ymm3, ymm0
+ vpunpcklbw ymm1, ymm1, ymm1 # ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+ vpunpcklbw ymm2, ymm2, ymm2 # ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+ vpmullw ymm1, ymm2, ymm1
+ vpand ymm1, ymm1, ymm0
+ vpackuswb ymm1, ymm1, ymm3
+ vmovdqu ymmword ptr [r8 + rsi], ymm1
+ vmovdqu ymm1, ymmword ptr [rdx + rsi + 32]
+ vmovdqu ymm2, ymmword ptr [rcx + rsi + 32]
+ vpunpckhbw ymm3, ymm1, ymm1 # ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+ vpunpckhbw ymm4, ymm2, ymm2 # ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+ vpmullw ymm3, ymm4, ymm3
+ vpand ymm3, ymm3, ymm0
+ vpunpcklbw ymm1, ymm1, ymm1 # ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+ vpunpcklbw ymm2, ymm2, ymm2 # ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+ vpmullw ymm1, ymm2, ymm1
+ vpand ymm1, ymm1, ymm0
+ vpackuswb ymm1, ymm1, ymm3
+ vmovdqu ymmword ptr [r8 + rsi + 32], ymm1
+ vmovdqu ymm1, ymmword ptr [rdx + rsi + 64]
+ vmovdqu ymm2, ymmword ptr [rcx + rsi + 64]
+ vpunpckhbw ymm3, ymm1, ymm1 # ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+ vpunpckhbw ymm4, ymm2, ymm2 # ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+ vpmullw ymm3, ymm4, ymm3
+ vpand ymm3, ymm3, ymm0
+ vpunpcklbw ymm1, ymm1, ymm1 # ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+ vpunpcklbw ymm2, ymm2, ymm2 # ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+ vpmullw ymm1, ymm2, ymm1
+ vpand ymm1, ymm1, ymm0
+ vpackuswb ymm1, ymm1, ymm3
+ vmovdqu ymmword ptr [r8 + rsi + 64], ymm1
+ vmovdqu ymm1, ymmword ptr [rdx + rsi + 96]
+ vmovdqu ymm2, ymmword ptr [rcx + rsi + 96]
+ vpunpckhbw ymm3, ymm1, ymm1 # ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+ vpunpckhbw ymm4, ymm2, ymm2 # ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+ vpmullw ymm3, ymm4, ymm3
+ vpand ymm3, ymm3, ymm0
+ vpunpcklbw ymm1, ymm1, ymm1 # ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+ vpunpcklbw ymm2, ymm2, ymm2 # ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+ vpmullw ymm1, ymm2, ymm1
+ vpand ymm1, ymm1, ymm0
+ vpackuswb ymm1, ymm1, ymm3
+ vmovdqu ymmword ptr [r8 + rsi + 96], ymm1
+ sub rsi, -128
+ add rax, 4
+ jne .LBB0_593
+.LBB0_594:
+ test r9, r9
+ je .LBB0_597
+# %bb.595:
+ neg r9
+ vmovdqa ymm0, ymmword ptr [rip + .LCPI0_0] # ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+.LBB0_596: # =>This Inner Loop Header: Depth=1
+ vmovdqu ymm1, ymmword ptr [rdx + rsi]
+ vmovdqu ymm2, ymmword ptr [rcx + rsi]
+ vpunpckhbw ymm3, ymm1, ymm1 # ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+ vpunpckhbw ymm4, ymm2, ymm2 # ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+ vpmullw ymm3, ymm4, ymm3
+ vpand ymm3, ymm3, ymm0
+ vpunpcklbw ymm1, ymm1, ymm1 # ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+ vpunpcklbw ymm2, ymm2, ymm2 # ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+ vpmullw ymm1, ymm2, ymm1
+ vpand ymm1, ymm1, ymm0
+ vpackuswb ymm1, ymm1, ymm3
+ vmovdqu ymmword ptr [r8 + rsi], ymm1
+ add rsi, 32
+ inc r9
+ jne .LBB0_596
+.LBB0_597:
+ cmp rdi, r10
+ je .LBB0_825
+.LBB0_598:
+ mov r9, rdi
+ not r9
+ add r9, r10
+ mov rsi, r10
+ and rsi, 3
+ je .LBB0_600
+.LBB0_599: # =>This Inner Loop Header: Depth=1
+ movzx eax, byte ptr [rcx + rdi]
+ mul byte ptr [rdx + rdi]
+ mov byte ptr [r8 + rdi], al
+ add rdi, 1
+ add rsi, -1
+ jne .LBB0_599
+.LBB0_600:
+ cmp r9, 3
+ jb .LBB0_825
+.LBB0_601: # =>This Inner Loop Header: Depth=1
+ movzx eax, byte ptr [rcx + rdi]
+ mul byte ptr [rdx + rdi]
+ mov byte ptr [r8 + rdi], al
+ movzx eax, byte ptr [rcx + rdi + 1]
+ mul byte ptr [rdx + rdi + 1]
+ mov byte ptr [r8 + rdi + 1], al
+ movzx eax, byte ptr [rcx + rdi + 2]
+ mul byte ptr [rdx + rdi + 2]
+ mov byte ptr [r8 + rdi + 2], al
+ movzx eax, byte ptr [rcx + rdi + 3]
+ mul byte ptr [rdx + rdi + 3]
+ mov byte ptr [r8 + rdi + 3], al
+ add rdi, 4
+ cmp r10, rdi
+ jne .LBB0_601
+ jmp .LBB0_825
+.LBB0_728:
+ and rax, -4
+ neg rax
+ xor esi, esi
+ vmovdqa ymm0, ymmword ptr [rip + .LCPI0_0] # ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+.LBB0_729: # =>This Inner Loop Header: Depth=1
+ vmovdqu ymm1, ymmword ptr [rdx + rsi]
+ vmovdqu ymm2, ymmword ptr [rcx + rsi]
+ vpunpckhbw ymm3, ymm1, ymm1 # ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+ vpunpckhbw ymm4, ymm2, ymm2 # ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+ vpmullw ymm3, ymm4, ymm3
+ vpand ymm3, ymm3, ymm0
+ vpunpcklbw ymm1, ymm1, ymm1 # ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+ vpunpcklbw ymm2, ymm2, ymm2 # ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+ vpmullw ymm1, ymm2, ymm1
+ vpand ymm1, ymm1, ymm0
+ vpackuswb ymm1, ymm1, ymm3
+ vmovdqu ymmword ptr [r8 + rsi], ymm1
+ vmovdqu ymm1, ymmword ptr [rdx + rsi + 32]
+ vmovdqu ymm2, ymmword ptr [rcx + rsi + 32]
+ vpunpckhbw ymm3, ymm1, ymm1 # ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+ vpunpckhbw ymm4, ymm2, ymm2 # ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+ vpmullw ymm3, ymm4, ymm3
+ vpand ymm3, ymm3, ymm0
+ vpunpcklbw ymm1, ymm1, ymm1 # ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+ vpunpcklbw ymm2, ymm2, ymm2 # ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+ vpmullw ymm1, ymm2, ymm1
+ vpand ymm1, ymm1, ymm0
+ vpackuswb ymm1, ymm1, ymm3
+ vmovdqu ymmword ptr [r8 + rsi + 32], ymm1
+ vmovdqu ymm1, ymmword ptr [rdx + rsi + 64]
+ vmovdqu ymm2, ymmword ptr [rcx + rsi + 64]
+ vpunpckhbw ymm3, ymm1, ymm1 # ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+ vpunpckhbw ymm4, ymm2, ymm2 # ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+ vpmullw ymm3, ymm4, ymm3
+ vpand ymm3, ymm3, ymm0
+ vpunpcklbw ymm1, ymm1, ymm1 # ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+ vpunpcklbw ymm2, ymm2, ymm2 # ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+ vpmullw ymm1, ymm2, ymm1
+ vpand ymm1, ymm1, ymm0
+ vpackuswb ymm1, ymm1, ymm3
+ vmovdqu ymmword ptr [r8 + rsi + 64], ymm1
+ vmovdqu ymm1, ymmword ptr [rdx + rsi + 96]
+ vmovdqu ymm2, ymmword ptr [rcx + rsi + 96]
+ vpunpckhbw ymm3, ymm1, ymm1 # ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+ vpunpckhbw ymm4, ymm2, ymm2 # ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+ vpmullw ymm3, ymm4, ymm3
+ vpand ymm3, ymm3, ymm0
+ vpunpcklbw ymm1, ymm1, ymm1 # ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+ vpunpcklbw ymm2, ymm2, ymm2 # ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+ vpmullw ymm1, ymm2, ymm1
+ vpand ymm1, ymm1, ymm0
+ vpackuswb ymm1, ymm1, ymm3
+ vmovdqu ymmword ptr [r8 + rsi + 96], ymm1
+ sub rsi, -128
+ add rax, 4
+ jne .LBB0_729
+.LBB0_730:
+ test r9, r9
+ je .LBB0_733
+# %bb.731:
+ neg r9
+ vmovdqa ymm0, ymmword ptr [rip + .LCPI0_0] # ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+.LBB0_732: # =>This Inner Loop Header: Depth=1
+ vmovdqu ymm1, ymmword ptr [rdx + rsi]
+ vmovdqu ymm2, ymmword ptr [rcx + rsi]
+ vpunpckhbw ymm3, ymm1, ymm1 # ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+ vpunpckhbw ymm4, ymm2, ymm2 # ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+ vpmullw ymm3, ymm4, ymm3
+ vpand ymm3, ymm3, ymm0
+ vpunpcklbw ymm1, ymm1, ymm1 # ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+ vpunpcklbw ymm2, ymm2, ymm2 # ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+ vpmullw ymm1, ymm2, ymm1
+ vpand ymm1, ymm1, ymm0
+ vpackuswb ymm1, ymm1, ymm3
+ vmovdqu ymmword ptr [r8 + rsi], ymm1
+ add rsi, 32
+ inc r9
+ jne .LBB0_732
+.LBB0_733:
+ cmp rdi, r10
+ je .LBB0_825
+.LBB0_734:
+ mov r9, rdi
+ not r9
+ add r9, r10
+ mov rsi, r10
+ and rsi, 3
+ je .LBB0_736
+.LBB0_735: # =>This Inner Loop Header: Depth=1
+ movzx eax, byte ptr [rcx + rdi]
+ mul byte ptr [rdx + rdi]
+ mov byte ptr [r8 + rdi], al
+ add rdi, 1
+ add rsi, -1
+ jne .LBB0_735
+.LBB0_736:
+ cmp r9, 3
+ jb .LBB0_825
+.LBB0_737: # =>This Inner Loop Header: Depth=1
+ movzx eax, byte ptr [rcx + rdi]
+ mul byte ptr [rdx + rdi]
+ mov byte ptr [r8 + rdi], al
+ movzx eax, byte ptr [rcx + rdi + 1]
+ mul byte ptr [rdx + rdi + 1]
+ mov byte ptr [r8 + rdi + 1], al
+ movzx eax, byte ptr [rcx + rdi + 2]
+ mul byte ptr [rdx + rdi + 2]
+ mov byte ptr [r8 + rdi + 2], al
+ movzx eax, byte ptr [rcx + rdi + 3]
+ mul byte ptr [rdx + rdi + 3]
+ mov byte ptr [r8 + rdi + 3], al
+ add rdi, 4
+ cmp r10, rdi
+ jne .LBB0_737
+ jmp .LBB0_825
+.LBB0_578:
+ and rax, -4
+ neg rax
+ xor esi, esi
+ vmovdqa ymm0, ymmword ptr [rip + .LCPI0_0] # ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+.LBB0_579: # =>This Inner Loop Header: Depth=1
+ vmovdqu ymm1, ymmword ptr [rdx + rsi]
+ vmovdqu ymm2, ymmword ptr [rcx + rsi]
+ vpunpckhbw ymm3, ymm1, ymm1 # ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+ vpunpckhbw ymm4, ymm2, ymm2 # ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+ vpmullw ymm3, ymm4, ymm3
+ vpand ymm3, ymm3, ymm0
+ vpunpcklbw ymm1, ymm1, ymm1 # ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+ vpunpcklbw ymm2, ymm2, ymm2 # ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+ vpmullw ymm1, ymm2, ymm1
+ vpand ymm1, ymm1, ymm0
+ vpackuswb ymm1, ymm1, ymm3
+ vmovdqu ymmword ptr [r8 + rsi], ymm1
+ vmovdqu ymm1, ymmword ptr [rdx + rsi + 32]
+ vmovdqu ymm2, ymmword ptr [rcx + rsi + 32]
+ vpunpckhbw ymm3, ymm1, ymm1 # ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+ vpunpckhbw ymm4, ymm2, ymm2 # ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+ vpmullw ymm3, ymm4, ymm3
+ vpand ymm3, ymm3, ymm0
+ vpunpcklbw ymm1, ymm1, ymm1 # ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+ vpunpcklbw ymm2, ymm2, ymm2 # ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+ vpmullw ymm1, ymm2, ymm1
+ vpand ymm1, ymm1, ymm0
+ vpackuswb ymm1, ymm1, ymm3
+ vmovdqu ymmword ptr [r8 + rsi + 32], ymm1
+ vmovdqu ymm1, ymmword ptr [rdx + rsi + 64]
+ vmovdqu ymm2, ymmword ptr [rcx + rsi + 64]
+ vpunpckhbw ymm3, ymm1, ymm1 # ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+ vpunpckhbw ymm4, ymm2, ymm2 # ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+ vpmullw ymm3, ymm4, ymm3
+ vpand ymm3, ymm3, ymm0
+ vpunpcklbw ymm1, ymm1, ymm1 # ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+ vpunpcklbw ymm2, ymm2, ymm2 # ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+ vpmullw ymm1, ymm2, ymm1
+ vpand ymm1, ymm1, ymm0
+ vpackuswb ymm1, ymm1, ymm3
+ vmovdqu ymmword ptr [r8 + rsi + 64], ymm1
+ vmovdqu ymm1, ymmword ptr [rdx + rsi + 96]
+ vmovdqu ymm2, ymmword ptr [rcx + rsi + 96]
+ vpunpckhbw ymm3, ymm1, ymm1 # ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+ vpunpckhbw ymm4, ymm2, ymm2 # ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+ vpmullw ymm3, ymm4, ymm3
+ vpand ymm3, ymm3, ymm0
+ vpunpcklbw ymm1, ymm1, ymm1 # ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+ vpunpcklbw ymm2, ymm2, ymm2 # ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+ vpmullw ymm1, ymm2, ymm1
+ vpand ymm1, ymm1, ymm0
+ vpackuswb ymm1, ymm1, ymm3
+ vmovdqu ymmword ptr [r8 + rsi + 96], ymm1
+ sub rsi, -128
+ add rax, 4
+ jne .LBB0_579
+.LBB0_580:
+ test r9, r9
+ je .LBB0_583
+# %bb.581:
+ neg r9
+ vmovdqa ymm0, ymmword ptr [rip + .LCPI0_0] # ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+.LBB0_582: # =>This Inner Loop Header: Depth=1
+ vmovdqu ymm1, ymmword ptr [rdx + rsi]
+ vmovdqu ymm2, ymmword ptr [rcx + rsi]
+ vpunpckhbw ymm3, ymm1, ymm1 # ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+ vpunpckhbw ymm4, ymm2, ymm2 # ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+ vpmullw ymm3, ymm4, ymm3
+ vpand ymm3, ymm3, ymm0
+ vpunpcklbw ymm1, ymm1, ymm1 # ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+ vpunpcklbw ymm2, ymm2, ymm2 # ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+ vpmullw ymm1, ymm2, ymm1
+ vpand ymm1, ymm1, ymm0
+ vpackuswb ymm1, ymm1, ymm3
+ vmovdqu ymmword ptr [r8 + rsi], ymm1
+ add rsi, 32
+ inc r9
+ jne .LBB0_582
+.LBB0_583:
+ cmp rdi, r10
+ je .LBB0_825
+.LBB0_584:
+ mov r9, rdi
+ not r9
+ add r9, r10
+ mov rsi, r10
+ and rsi, 3
+ je .LBB0_586
+.LBB0_585: # =>This Inner Loop Header: Depth=1
+ movzx eax, byte ptr [rcx + rdi]
+ mul byte ptr [rdx + rdi]
+ mov byte ptr [r8 + rdi], al
+ add rdi, 1
+ add rsi, -1
+ jne .LBB0_585
+.LBB0_586:
+ cmp r9, 3
+ jb .LBB0_825
+.LBB0_587: # =>This Inner Loop Header: Depth=1
+ movzx eax, byte ptr [rcx + rdi]
+ mul byte ptr [rdx + rdi]
+ mov byte ptr [r8 + rdi], al
+ movzx eax, byte ptr [rcx + rdi + 1]
+ mul byte ptr [rdx + rdi + 1]
+ mov byte ptr [r8 + rdi + 1], al
+ movzx eax, byte ptr [rcx + rdi + 2]
+ mul byte ptr [rdx + rdi + 2]
+ mov byte ptr [r8 + rdi + 2], al
+ movzx eax, byte ptr [rcx + rdi + 3]
+ mul byte ptr [rdx + rdi + 3]
+ mov byte ptr [r8 + rdi + 3], al
+ add rdi, 4
+ cmp r10, rdi
+ jne .LBB0_587
+ jmp .LBB0_825
+.LBB0_714:
+ and rax, -4
+ neg rax
+ xor esi, esi
+ vmovdqa ymm0, ymmword ptr [rip + .LCPI0_0] # ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+.LBB0_715: # =>This Inner Loop Header: Depth=1
+ vmovdqu ymm1, ymmword ptr [rdx + rsi]
+ vmovdqu ymm2, ymmword ptr [rcx + rsi]
+ vpunpckhbw ymm3, ymm1, ymm1 # ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+ vpunpckhbw ymm4, ymm2, ymm2 # ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+ vpmullw ymm3, ymm4, ymm3
+ vpand ymm3, ymm3, ymm0
+ vpunpcklbw ymm1, ymm1, ymm1 # ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+ vpunpcklbw ymm2, ymm2, ymm2 # ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+ vpmullw ymm1, ymm2, ymm1
+ vpand ymm1, ymm1, ymm0
+ vpackuswb ymm1, ymm1, ymm3
+ vmovdqu ymmword ptr [r8 + rsi], ymm1
+ vmovdqu ymm1, ymmword ptr [rdx + rsi + 32]
+ vmovdqu ymm2, ymmword ptr [rcx + rsi + 32]
+ vpunpckhbw ymm3, ymm1, ymm1 # ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+ vpunpckhbw ymm4, ymm2, ymm2 # ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+ vpmullw ymm3, ymm4, ymm3
+ vpand ymm3, ymm3, ymm0
+ vpunpcklbw ymm1, ymm1, ymm1 # ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+ vpunpcklbw ymm2, ymm2, ymm2 # ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+ vpmullw ymm1, ymm2, ymm1
+ vpand ymm1, ymm1, ymm0
+ vpackuswb ymm1, ymm1, ymm3
+ vmovdqu ymmword ptr [r8 + rsi + 32], ymm1
+ vmovdqu ymm1, ymmword ptr [rdx + rsi + 64]
+ vmovdqu ymm2, ymmword ptr [rcx + rsi + 64]
+ vpunpckhbw ymm3, ymm1, ymm1 # ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+ vpunpckhbw ymm4, ymm2, ymm2 # ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+ vpmullw ymm3, ymm4, ymm3
+ vpand ymm3, ymm3, ymm0
+ vpunpcklbw ymm1, ymm1, ymm1 # ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+ vpunpcklbw ymm2, ymm2, ymm2 # ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+ vpmullw ymm1, ymm2, ymm1
+ vpand ymm1, ymm1, ymm0
+ vpackuswb ymm1, ymm1, ymm3
+ vmovdqu ymmword ptr [r8 + rsi + 64], ymm1
+ vmovdqu ymm1, ymmword ptr [rdx + rsi + 96]
+ vmovdqu ymm2, ymmword ptr [rcx + rsi + 96]
+ vpunpckhbw ymm3, ymm1, ymm1 # ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+ vpunpckhbw ymm4, ymm2, ymm2 # ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+ vpmullw ymm3, ymm4, ymm3
+ vpand ymm3, ymm3, ymm0
+ vpunpcklbw ymm1, ymm1, ymm1 # ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+ vpunpcklbw ymm2, ymm2, ymm2 # ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+ vpmullw ymm1, ymm2, ymm1
+ vpand ymm1, ymm1, ymm0
+ vpackuswb ymm1, ymm1, ymm3
+ vmovdqu ymmword ptr [r8 + rsi + 96], ymm1
+ sub rsi, -128
+ add rax, 4
+ jne .LBB0_715
+.LBB0_716:
+ test r9, r9
+ je .LBB0_719
+# %bb.717:
+ neg r9
+ vmovdqa ymm0, ymmword ptr [rip + .LCPI0_0] # ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+.LBB0_718: # =>This Inner Loop Header: Depth=1
+ vmovdqu ymm1, ymmword ptr [rdx + rsi]
+ vmovdqu ymm2, ymmword ptr [rcx + rsi]
+ vpunpckhbw ymm3, ymm1, ymm1 # ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+ vpunpckhbw ymm4, ymm2, ymm2 # ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+ vpmullw ymm3, ymm4, ymm3
+ vpand ymm3, ymm3, ymm0
+ vpunpcklbw ymm1, ymm1, ymm1 # ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+ vpunpcklbw ymm2, ymm2, ymm2 # ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+ vpmullw ymm1, ymm2, ymm1
+ vpand ymm1, ymm1, ymm0
+ vpackuswb ymm1, ymm1, ymm3
+ vmovdqu ymmword ptr [r8 + rsi], ymm1
+ add rsi, 32
+ inc r9
+ jne .LBB0_718
+.LBB0_719:
+ cmp rdi, r10
+ je .LBB0_825
+.LBB0_720:
+ mov r9, rdi
+ not r9
+ add r9, r10
+ mov rsi, r10
+ and rsi, 3
+ je .LBB0_722
+.LBB0_721: # =>This Inner Loop Header: Depth=1
+ movzx eax, byte ptr [rcx + rdi]
+ mul byte ptr [rdx + rdi]
+ mov byte ptr [r8 + rdi], al
+ add rdi, 1
+ add rsi, -1
+ jne .LBB0_721
+.LBB0_722:
+ cmp r9, 3
+ jb .LBB0_825
+.LBB0_723: # =>This Inner Loop Header: Depth=1
+ movzx eax, byte ptr [rcx + rdi]
+ mul byte ptr [rdx + rdi]
+ mov byte ptr [r8 + rdi], al
+ movzx eax, byte ptr [rcx + rdi + 1]
+ mul byte ptr [rdx + rdi + 1]
+ mov byte ptr [r8 + rdi + 1], al
+ movzx eax, byte ptr [rcx + rdi + 2]
+ mul byte ptr [rdx + rdi + 2]
+ mov byte ptr [r8 + rdi + 2], al
+ movzx eax, byte ptr [rcx + rdi + 3]
+ mul byte ptr [rdx + rdi + 3]
+ mov byte ptr [r8 + rdi + 3], al
+ add rdi, 4
+ cmp r10, rdi
+ jne .LBB0_723
+.LBB0_825:
+ mov rsp, rbp
+ pop rbp
+ vzeroupper
+ ret
+.Lfunc_end0:
+ .size arithmetic_avx2, .Lfunc_end0-arithmetic_avx2
+ # -- End function
+ .section .rodata.cst32,"aM",@progbits,32
+ .p2align 5 # -- Begin function arithmetic_arr_scalar_avx2
+.LCPI1_0:
+ .short 255 # 0xff
+ .short 255 # 0xff
+ .short 255 # 0xff
+ .short 255 # 0xff
+ .short 255 # 0xff
+ .short 255 # 0xff
+ .short 255 # 0xff
+ .short 255 # 0xff
+ .short 255 # 0xff
+ .short 255 # 0xff
+ .short 255 # 0xff
+ .short 255 # 0xff
+ .short 255 # 0xff
+ .short 255 # 0xff
+ .short 255 # 0xff
+ .short 255 # 0xff
+ .text
+ .globl arithmetic_arr_scalar_avx2
+ .p2align 4, 0x90
+ .type arithmetic_arr_scalar_avx2,@function
+arithmetic_arr_scalar_avx2: # @arithmetic_arr_scalar_avx2
+# %bb.0:
+ push rbp
+ mov rbp, rsp
+ and rsp, -8
+ cmp sil, 3
+ jg .LBB1_12
+# %bb.1:
+ test sil, sil
+ je .LBB1_23
+# %bb.2:
+ cmp sil, 1
+ je .LBB1_31
+# %bb.3:
+ cmp sil, 2
+ jne .LBB1_1109
+# %bb.4:
+ cmp edi, 6
+ jg .LBB1_55
+# %bb.5:
+ cmp edi, 3
+ jle .LBB1_97
+# %bb.6:
+ cmp edi, 4
+ je .LBB1_157
+# %bb.7:
+ cmp edi, 5
+ je .LBB1_160
+# %bb.8:
+ cmp edi, 6
+ jne .LBB1_1109
+# %bb.9:
+ test r9d, r9d
+ jle .LBB1_1109
+# %bb.10:
+ mov eax, dword ptr [rcx]
+ mov r10d, r9d
+ cmp r9d, 32
+ jb .LBB1_11
+# %bb.265:
+ lea rcx, [rdx + 4*r10]
+ cmp rcx, r8
+ jbe .LBB1_445
+# %bb.266:
+ lea rcx, [r8 + 4*r10]
+ cmp rcx, rdx
+ jbe .LBB1_445
+.LBB1_11:
+ xor esi, esi
+.LBB1_665:
mov r9, rsi
not r9
add r9, r10
- mov rax, r10
- and rax, 3
- je .LBB0_108
-.LBB0_107: # =>This Inner Loop Header: Depth=1
- mov edi, dword ptr [rcx + 4*rsi]
- add edi, dword ptr [rdx + 4*rsi]
- mov dword ptr [r8 + 4*rsi], edi
+ mov rdi, r10
+ and rdi, 3
+ je .LBB1_667
+.LBB1_666: # =>This Inner Loop Header: Depth=1
+ mov ecx, dword ptr [rdx + 4*rsi]
+ imul ecx, eax
+ mov dword ptr [r8 + 4*rsi], ecx
add rsi, 1
- add rax, -1
- jne .LBB0_107
-.LBB0_108:
+ add rdi, -1
+ jne .LBB1_666
+.LBB1_667:
cmp r9, 3
- jb .LBB0_537
-.LBB0_109: # =>This Inner Loop Header: Depth=1
- mov eax, dword ptr [rcx + 4*rsi]
- add eax, dword ptr [rdx + 4*rsi]
- mov dword ptr [r8 + 4*rsi], eax
- mov eax, dword ptr [rcx + 4*rsi + 4]
- add eax, dword ptr [rdx + 4*rsi + 4]
- mov dword ptr [r8 + 4*rsi + 4], eax
- mov eax, dword ptr [rcx + 4*rsi + 8]
- add eax, dword ptr [rdx + 4*rsi + 8]
- mov dword ptr [r8 + 4*rsi + 8], eax
- mov eax, dword ptr [rcx + 4*rsi + 12]
- add eax, dword ptr [rdx + 4*rsi + 12]
- mov dword ptr [r8 + 4*rsi + 12], eax
+ jb .LBB1_1109
+.LBB1_668: # =>This Inner Loop Header: Depth=1
+ mov ecx, dword ptr [rdx + 4*rsi]
+ imul ecx, eax
+ mov dword ptr [r8 + 4*rsi], ecx
+ mov ecx, dword ptr [rdx + 4*rsi + 4]
+ imul ecx, eax
+ mov dword ptr [r8 + 4*rsi + 4], ecx
+ mov ecx, dword ptr [rdx + 4*rsi + 8]
+ imul ecx, eax
+ mov dword ptr [r8 + 4*rsi + 8], ecx
+ mov ecx, dword ptr [rdx + 4*rsi + 12]
+ imul ecx, eax
+ mov dword ptr [r8 + 4*rsi + 12], ecx
add rsi, 4
cmp r10, rsi
- jne .LBB0_109
- jmp .LBB0_537
-.LBB0_234:
- lea rsi, [r8 + 4*r10]
- lea rax, [rdx + 4*r10]
- cmp rax, r8
- seta r9b
- lea rax, [rcx + 4*r10]
- cmp rsi, rdx
- seta r11b
- cmp rax, r8
- seta al
- cmp rsi, rcx
- seta dil
- xor esi, esi
- test r9b, r11b
- jne .LBB0_239
-# %bb.235:
- and al, dil
- jne .LBB0_239
-# %bb.236:
- mov esi, r10d
- and esi, -32
- xor edi, edi
-.LBB0_237: # =>This Inner Loop Header: Depth=1
- vmovdqu ymm0, ymmword ptr [rcx + 4*rdi]
- vmovdqu ymm1, ymmword ptr [rcx + 4*rdi + 32]
- vmovdqu ymm2, ymmword ptr [rcx + 4*rdi + 64]
- vmovdqu ymm3, ymmword ptr [rcx + 4*rdi + 96]
- vpaddd ymm0, ymm0, ymmword ptr [rdx + 4*rdi]
- vpaddd ymm1, ymm1, ymmword ptr [rdx + 4*rdi + 32]
- vpaddd ymm2, ymm2, ymmword ptr [rdx + 4*rdi + 64]
- vpaddd ymm3, ymm3, ymmword ptr [rdx + 4*rdi + 96]
- vmovdqu ymmword ptr [r8 + 4*rdi], ymm0
- vmovdqu ymmword ptr [r8 + 4*rdi + 32], ymm1
- vmovdqu ymmword ptr [r8 + 4*rdi + 64], ymm2
- vmovdqu ymmword ptr [r8 + 4*rdi + 96], ymm3
- add rdi, 32
- cmp rsi, rdi
- jne .LBB0_237
-# %bb.238:
- cmp rsi, r10
- je .LBB0_537
-.LBB0_239:
+ jne .LBB1_668
+ jmp .LBB1_1109
+.LBB1_12:
+ cmp sil, 4
+ je .LBB1_39
+# %bb.13:
+ cmp sil, 5
+ je .LBB1_47
+# %bb.14:
+ cmp sil, 6
+ jne .LBB1_1109
+# %bb.15:
+ cmp edi, 6
+ jg .LBB1_62
+# %bb.16:
+ cmp edi, 3
+ jle .LBB1_102
+# %bb.17:
+ cmp edi, 4
+ je .LBB1_163
+# %bb.18:
+ cmp edi, 5
+ je .LBB1_166
+# %bb.19:
+ cmp edi, 6
+ jne .LBB1_1109
+# %bb.20:
+ test r9d, r9d
+ jle .LBB1_1109
+# %bb.21:
+ mov eax, dword ptr [rcx]
+ mov r10d, r9d
+ cmp r9d, 32
+ jb .LBB1_22
+# %bb.268:
+ lea rcx, [rdx + 4*r10]
+ cmp rcx, r8
+ jbe .LBB1_448
+# %bb.269:
+ lea rcx, [r8 + 4*r10]
+ cmp rcx, rdx
+ jbe .LBB1_448
+.LBB1_22:
+ xor esi, esi
+.LBB1_673:
mov r9, rsi
not r9
add r9, r10
- mov rax, r10
- and rax, 3
- je .LBB0_241
-.LBB0_240: # =>This Inner Loop Header: Depth=1
- mov edi, dword ptr [rcx + 4*rsi]
- add edi, dword ptr [rdx + 4*rsi]
- mov dword ptr [r8 + 4*rsi], edi
+ mov rdi, r10
+ and rdi, 3
+ je .LBB1_675
+.LBB1_674: # =>This Inner Loop Header: Depth=1
+ mov ecx, dword ptr [rdx + 4*rsi]
+ imul ecx, eax
+ mov dword ptr [r8 + 4*rsi], ecx
add rsi, 1
- add rax, -1
- jne .LBB0_240
-.LBB0_241:
+ add rdi, -1
+ jne .LBB1_674
+.LBB1_675:
cmp r9, 3
- jb .LBB0_537
-.LBB0_242: # =>This Inner Loop Header: Depth=1
- mov eax, dword ptr [rcx + 4*rsi]
- add eax, dword ptr [rdx + 4*rsi]
- mov dword ptr [r8 + 4*rsi], eax
- mov eax, dword ptr [rcx + 4*rsi + 4]
- add eax, dword ptr [rdx + 4*rsi + 4]
- mov dword ptr [r8 + 4*rsi + 4], eax
- mov eax, dword ptr [rcx + 4*rsi + 8]
- add eax, dword ptr [rdx + 4*rsi + 8]
- mov dword ptr [r8 + 4*rsi + 8], eax
- mov eax, dword ptr [rcx + 4*rsi + 12]
- add eax, dword ptr [rdx + 4*rsi + 12]
- mov dword ptr [r8 + 4*rsi + 12], eax
+ jb .LBB1_1109
+.LBB1_676: # =>This Inner Loop Header: Depth=1
+ mov ecx, dword ptr [rdx + 4*rsi]
+ imul ecx, eax
+ mov dword ptr [r8 + 4*rsi], ecx
+ mov ecx, dword ptr [rdx + 4*rsi + 4]
+ imul ecx, eax
+ mov dword ptr [r8 + 4*rsi + 4], ecx
+ mov ecx, dword ptr [rdx + 4*rsi + 8]
+ imul ecx, eax
+ mov dword ptr [r8 + 4*rsi + 8], ecx
+ mov ecx, dword ptr [rdx + 4*rsi + 12]
+ imul ecx, eax
+ mov dword ptr [r8 + 4*rsi + 12], ecx
add rsi, 4
cmp r10, rsi
- jne .LBB0_242
-.LBB0_537:
- mov rsp, rbp
- pop rbp
- vzeroupper
- ret
-.Lfunc_end0:
- .size arithmetic_avx2, .Lfunc_end0-arithmetic_avx2
- # -- End function
- .globl arithmetic_arr_scalar_avx2 # -- Begin function arithmetic_arr_scalar_avx2
- .p2align 4, 0x90
- .type arithmetic_arr_scalar_avx2,@function
-arithmetic_arr_scalar_avx2: # @arithmetic_arr_scalar_avx2
-# %bb.0:
- push rbp
- mov rbp, rsp
- and rsp, -8
- cmp sil, 1
- jg .LBB1_11
-# %bb.1:
- test sil, sil
- je .LBB1_21
-# %bb.2:
- cmp sil, 1
- jne .LBB1_737
-# %bb.3:
+ jne .LBB1_676
+ jmp .LBB1_1109
+.LBB1_23:
cmp edi, 6
- jg .LBB1_37
-# %bb.4:
+ jg .LBB1_69
+# %bb.24:
cmp edi, 3
- jle .LBB1_65
-# %bb.5:
+ jle .LBB1_107
+# %bb.25:
cmp edi, 4
- je .LBB1_105
-# %bb.6:
+ je .LBB1_169
+# %bb.26:
cmp edi, 5
- je .LBB1_108
-# %bb.7:
+ je .LBB1_172
+# %bb.27:
cmp edi, 6
- jne .LBB1_737
-# %bb.8:
+ jne .LBB1_1109
+# %bb.28:
test r9d, r9d
- jle .LBB1_737
-# %bb.9:
+ jle .LBB1_1109
+# %bb.29:
mov eax, dword ptr [rcx]
mov r10d, r9d
cmp r9d, 32
- jb .LBB1_10
-# %bb.177:
+ jb .LBB1_30
+# %bb.271:
lea rcx, [rdx + 4*r10]
cmp rcx, r8
- jbe .LBB1_297
-# %bb.178:
+ jbe .LBB1_451
+# %bb.272:
lea rcx, [r8 + 4*r10]
cmp rcx, rdx
- jbe .LBB1_297
-.LBB1_10:
+ jbe .LBB1_451
+.LBB1_30:
xor esi, esi
-.LBB1_421:
+.LBB1_681:
mov r9, rsi
not r9
add r9, r10
mov rdi, r10
and rdi, 3
- je .LBB1_423
-.LBB1_422: # =>This Inner Loop Header: Depth=1
+ je .LBB1_683
+.LBB1_682: # =>This Inner Loop Header: Depth=1
mov ecx, dword ptr [rdx + 4*rsi]
- sub ecx, eax
+ add ecx, eax
mov dword ptr [r8 + 4*rsi], ecx
add rsi, 1
add rdi, -1
- jne .LBB1_422
-.LBB1_423:
+ jne .LBB1_682
+.LBB1_683:
cmp r9, 3
- jb .LBB1_737
-.LBB1_424: # =>This Inner Loop Header: Depth=1
+ jb .LBB1_1109
+.LBB1_684: # =>This Inner Loop Header: Depth=1
mov ecx, dword ptr [rdx + 4*rsi]
- sub ecx, eax
+ add ecx, eax
mov dword ptr [r8 + 4*rsi], ecx
mov ecx, dword ptr [rdx + 4*rsi + 4]
- sub ecx, eax
+ add ecx, eax
mov dword ptr [r8 + 4*rsi + 4], ecx
mov ecx, dword ptr [rdx + 4*rsi + 8]
- sub ecx, eax
+ add ecx, eax
mov dword ptr [r8 + 4*rsi + 8], ecx
mov ecx, dword ptr [rdx + 4*rsi + 12]
- sub ecx, eax
+ add ecx, eax
mov dword ptr [r8 + 4*rsi + 12], ecx
add rsi, 4
cmp r10, rsi
- jne .LBB1_424
- jmp .LBB1_737
-.LBB1_11:
- cmp sil, 2
- je .LBB1_29
-# %bb.12:
- cmp sil, 3
- jne .LBB1_737
-# %bb.13:
+ jne .LBB1_684
+ jmp .LBB1_1109
+.LBB1_31:
cmp edi, 6
- jg .LBB1_44
-# %bb.14:
+ jg .LBB1_76
+# %bb.32:
cmp edi, 3
- jle .LBB1_70
-# %bb.15:
+ jle .LBB1_112
+# %bb.33:
cmp edi, 4
- je .LBB1_111
-# %bb.16:
+ je .LBB1_175
+# %bb.34:
cmp edi, 5
- je .LBB1_114
-# %bb.17:
+ je .LBB1_178
+# %bb.35:
cmp edi, 6
- jne .LBB1_737
-# %bb.18:
+ jne .LBB1_1109
+# %bb.36:
test r9d, r9d
- jle .LBB1_737
-# %bb.19:
+ jle .LBB1_1109
+# %bb.37:
mov eax, dword ptr [rcx]
mov r10d, r9d
cmp r9d, 32
- jb .LBB1_20
-# %bb.180:
+ jb .LBB1_38
+# %bb.274:
lea rcx, [rdx + 4*r10]
cmp rcx, r8
- jbe .LBB1_300
-# %bb.181:
+ jbe .LBB1_454
+# %bb.275:
lea rcx, [r8 + 4*r10]
cmp rcx, rdx
- jbe .LBB1_300
-.LBB1_20:
+ jbe .LBB1_454
+.LBB1_38:
xor esi, esi
-.LBB1_429:
+.LBB1_689:
mov r9, rsi
not r9
add r9, r10
mov rdi, r10
and rdi, 3
- je .LBB1_431
-.LBB1_430: # =>This Inner Loop Header: Depth=1
+ je .LBB1_691
+.LBB1_690: # =>This Inner Loop Header: Depth=1
mov ecx, dword ptr [rdx + 4*rsi]
sub ecx, eax
mov dword ptr [r8 + 4*rsi], ecx
add rsi, 1
add rdi, -1
- jne .LBB1_430
-.LBB1_431:
+ jne .LBB1_690
+.LBB1_691:
cmp r9, 3
- jb .LBB1_737
-.LBB1_432: # =>This Inner Loop Header: Depth=1
+ jb .LBB1_1109
+.LBB1_692: # =>This Inner Loop Header: Depth=1
mov ecx, dword ptr [rdx + 4*rsi]
sub ecx, eax
mov dword ptr [r8 + 4*rsi], ecx
@@ -3740,59 +6124,59 @@ arithmetic_arr_scalar_avx2: # @arithmetic_arr_scalar_avx2
mov dword ptr [r8 + 4*rsi + 12], ecx
add rsi, 4
cmp r10, rsi
- jne .LBB1_432
- jmp .LBB1_737
-.LBB1_21:
+ jne .LBB1_692
+ jmp .LBB1_1109
+.LBB1_39:
cmp edi, 6
- jg .LBB1_51
-# %bb.22:
+ jg .LBB1_83
+# %bb.40:
cmp edi, 3
- jle .LBB1_75
-# %bb.23:
+ jle .LBB1_117
+# %bb.41:
cmp edi, 4
- je .LBB1_117
-# %bb.24:
+ je .LBB1_181
+# %bb.42:
cmp edi, 5
- je .LBB1_120
-# %bb.25:
+ je .LBB1_184
+# %bb.43:
cmp edi, 6
- jne .LBB1_737
-# %bb.26:
+ jne .LBB1_1109
+# %bb.44:
test r9d, r9d
- jle .LBB1_737
-# %bb.27:
+ jle .LBB1_1109
+# %bb.45:
mov eax, dword ptr [rcx]
mov r10d, r9d
cmp r9d, 32
- jb .LBB1_28
-# %bb.183:
+ jb .LBB1_46
+# %bb.277:
lea rcx, [rdx + 4*r10]
cmp rcx, r8
- jbe .LBB1_303
-# %bb.184:
+ jbe .LBB1_457
+# %bb.278:
lea rcx, [r8 + 4*r10]
cmp rcx, rdx
- jbe .LBB1_303
-.LBB1_28:
+ jbe .LBB1_457
+.LBB1_46:
xor esi, esi
-.LBB1_437:
+.LBB1_697:
mov r9, rsi
not r9
add r9, r10
mov rdi, r10
and rdi, 3
- je .LBB1_439
-.LBB1_438: # =>This Inner Loop Header: Depth=1
+ je .LBB1_699
+.LBB1_698: # =>This Inner Loop Header: Depth=1
mov ecx, dword ptr [rdx + 4*rsi]
add ecx, eax
mov dword ptr [r8 + 4*rsi], ecx
add rsi, 1
add rdi, -1
- jne .LBB1_438
-.LBB1_439:
+ jne .LBB1_698
+.LBB1_699:
cmp r9, 3
- jb .LBB1_737
-.LBB1_440: # =>This Inner Loop Header: Depth=1
+ jb .LBB1_1109
+.LBB1_700: # =>This Inner Loop Header: Depth=1
mov ecx, dword ptr [rdx + 4*rsi]
add ecx, eax
mov dword ptr [r8 + 4*rsi], ecx
@@ -3807,187 +6191,300 @@ arithmetic_arr_scalar_avx2: # @arithmetic_arr_scalar_avx2
mov dword ptr [r8 + 4*rsi + 12], ecx
add rsi, 4
cmp r10, rsi
- jne .LBB1_440
- jmp .LBB1_737
-.LBB1_29:
+ jne .LBB1_700
+ jmp .LBB1_1109
+.LBB1_47:
cmp edi, 6
- jg .LBB1_58
-# %bb.30:
+ jg .LBB1_90
+# %bb.48:
cmp edi, 3
- jle .LBB1_80
-# %bb.31:
+ jle .LBB1_122
+# %bb.49:
cmp edi, 4
- je .LBB1_123
-# %bb.32:
+ je .LBB1_187
+# %bb.50:
cmp edi, 5
- je .LBB1_126
-# %bb.33:
+ je .LBB1_190
+# %bb.51:
cmp edi, 6
- jne .LBB1_737
-# %bb.34:
+ jne .LBB1_1109
+# %bb.52:
test r9d, r9d
- jle .LBB1_737
-# %bb.35:
+ jle .LBB1_1109
+# %bb.53:
mov eax, dword ptr [rcx]
mov r10d, r9d
cmp r9d, 32
- jb .LBB1_36
-# %bb.186:
+ jb .LBB1_54
+# %bb.280:
lea rcx, [rdx + 4*r10]
cmp rcx, r8
- jbe .LBB1_306
-# %bb.187:
+ jbe .LBB1_460
+# %bb.281:
lea rcx, [r8 + 4*r10]
cmp rcx, rdx
- jbe .LBB1_306
-.LBB1_36:
+ jbe .LBB1_460
+.LBB1_54:
xor esi, esi
-.LBB1_445:
+.LBB1_705:
mov r9, rsi
not r9
add r9, r10
mov rdi, r10
and rdi, 3
- je .LBB1_447
-.LBB1_446: # =>This Inner Loop Header: Depth=1
+ je .LBB1_707
+.LBB1_706: # =>This Inner Loop Header: Depth=1
mov ecx, dword ptr [rdx + 4*rsi]
- add ecx, eax
+ sub ecx, eax
mov dword ptr [r8 + 4*rsi], ecx
add rsi, 1
add rdi, -1
- jne .LBB1_446
-.LBB1_447:
+ jne .LBB1_706
+.LBB1_707:
cmp r9, 3
- jb .LBB1_737
-.LBB1_448: # =>This Inner Loop Header: Depth=1
+ jb .LBB1_1109
+.LBB1_708: # =>This Inner Loop Header: Depth=1
mov ecx, dword ptr [rdx + 4*rsi]
- add ecx, eax
+ sub ecx, eax
mov dword ptr [r8 + 4*rsi], ecx
mov ecx, dword ptr [rdx + 4*rsi + 4]
- add ecx, eax
+ sub ecx, eax
mov dword ptr [r8 + 4*rsi + 4], ecx
mov ecx, dword ptr [rdx + 4*rsi + 8]
- add ecx, eax
+ sub ecx, eax
mov dword ptr [r8 + 4*rsi + 8], ecx
mov ecx, dword ptr [rdx + 4*rsi + 12]
- add ecx, eax
+ sub ecx, eax
mov dword ptr [r8 + 4*rsi + 12], ecx
add rsi, 4
cmp r10, rsi
- jne .LBB1_448
- jmp .LBB1_737
-.LBB1_37:
+ jne .LBB1_708
+ jmp .LBB1_1109
+.LBB1_55:
cmp edi, 8
- jle .LBB1_85
-# %bb.38:
+ jle .LBB1_127
+# %bb.56:
cmp edi, 9
- je .LBB1_129
-# %bb.39:
+ je .LBB1_193
+# %bb.57:
cmp edi, 11
- je .LBB1_132
-# %bb.40:
+ je .LBB1_196
+# %bb.58:
cmp edi, 12
- jne .LBB1_737
-# %bb.41:
+ jne .LBB1_1109
+# %bb.59:
test r9d, r9d
- jle .LBB1_737
-# %bb.42:
+ jle .LBB1_1109
+# %bb.60:
vmovsd xmm0, qword ptr [rcx] # xmm0 = mem[0],zero
mov eax, r9d
cmp r9d, 16
- jb .LBB1_43
-# %bb.189:
+ jb .LBB1_61
+# %bb.283:
lea rcx, [rdx + 8*rax]
cmp rcx, r8
- jbe .LBB1_309
-# %bb.190:
+ jbe .LBB1_463
+# %bb.284:
lea rcx, [r8 + 8*rax]
cmp rcx, rdx
- jbe .LBB1_309
-.LBB1_43:
+ jbe .LBB1_463
+.LBB1_61:
xor ecx, ecx
-.LBB1_453:
+.LBB1_713:
mov rsi, rcx
not rsi
add rsi, rax
mov rdi, rax
and rdi, 3
- je .LBB1_455
-.LBB1_454: # =>This Inner Loop Header: Depth=1
- vmovsd xmm1, qword ptr [rdx + 8*rcx] # xmm1 = mem[0],zero
- vsubsd xmm1, xmm1, xmm0
+ je .LBB1_715
+.LBB1_714: # =>This Inner Loop Header: Depth=1
+ vmulsd xmm1, xmm0, qword ptr [rdx + 8*rcx]
vmovsd qword ptr [r8 + 8*rcx], xmm1
add rcx, 1
add rdi, -1
- jne .LBB1_454
-.LBB1_455:
+ jne .LBB1_714
+.LBB1_715:
cmp rsi, 3
- jb .LBB1_737
-.LBB1_456: # =>This Inner Loop Header: Depth=1
- vmovsd xmm1, qword ptr [rdx + 8*rcx] # xmm1 = mem[0],zero
- vsubsd xmm1, xmm1, xmm0
+ jb .LBB1_1109
+.LBB1_716: # =>This Inner Loop Header: Depth=1
+ vmulsd xmm1, xmm0, qword ptr [rdx + 8*rcx]
vmovsd qword ptr [r8 + 8*rcx], xmm1
- vmovsd xmm1, qword ptr [rdx + 8*rcx + 8] # xmm1 = mem[0],zero
- vsubsd xmm1, xmm1, xmm0
+ vmulsd xmm1, xmm0, qword ptr [rdx + 8*rcx + 8]
vmovsd qword ptr [r8 + 8*rcx + 8], xmm1
- vmovsd xmm1, qword ptr [rdx + 8*rcx + 16] # xmm1 = mem[0],zero
- vsubsd xmm1, xmm1, xmm0
+ vmulsd xmm1, xmm0, qword ptr [rdx + 8*rcx + 16]
vmovsd qword ptr [r8 + 8*rcx + 16], xmm1
- vmovsd xmm1, qword ptr [rdx + 8*rcx + 24] # xmm1 = mem[0],zero
- vsubsd xmm1, xmm1, xmm0
+ vmulsd xmm1, xmm0, qword ptr [rdx + 8*rcx + 24]
vmovsd qword ptr [r8 + 8*rcx + 24], xmm1
add rcx, 4
cmp rax, rcx
- jne .LBB1_456
- jmp .LBB1_737
-.LBB1_44:
+ jne .LBB1_716
+ jmp .LBB1_1109
+.LBB1_62:
+ cmp edi, 8
+ jle .LBB1_132
+# %bb.63:
+ cmp edi, 9
+ je .LBB1_199
+# %bb.64:
+ cmp edi, 11
+ je .LBB1_202
+# %bb.65:
+ cmp edi, 12
+ jne .LBB1_1109
+# %bb.66:
+ test r9d, r9d
+ jle .LBB1_1109
+# %bb.67:
+ vmovsd xmm0, qword ptr [rcx] # xmm0 = mem[0],zero
+ mov eax, r9d
+ cmp r9d, 16
+ jb .LBB1_68
+# %bb.286:
+ lea rcx, [rdx + 8*rax]
+ cmp rcx, r8
+ jbe .LBB1_466
+# %bb.287:
+ lea rcx, [r8 + 8*rax]
+ cmp rcx, rdx
+ jbe .LBB1_466
+.LBB1_68:
+ xor ecx, ecx
+.LBB1_721:
+ mov rsi, rcx
+ not rsi
+ add rsi, rax
+ mov rdi, rax
+ and rdi, 3
+ je .LBB1_723
+.LBB1_722: # =>This Inner Loop Header: Depth=1
+ vmulsd xmm1, xmm0, qword ptr [rdx + 8*rcx]
+ vmovsd qword ptr [r8 + 8*rcx], xmm1
+ add rcx, 1
+ add rdi, -1
+ jne .LBB1_722
+.LBB1_723:
+ cmp rsi, 3
+ jb .LBB1_1109
+.LBB1_724: # =>This Inner Loop Header: Depth=1
+ vmulsd xmm1, xmm0, qword ptr [rdx + 8*rcx]
+ vmovsd qword ptr [r8 + 8*rcx], xmm1
+ vmulsd xmm1, xmm0, qword ptr [rdx + 8*rcx + 8]
+ vmovsd qword ptr [r8 + 8*rcx + 8], xmm1
+ vmulsd xmm1, xmm0, qword ptr [rdx + 8*rcx + 16]
+ vmovsd qword ptr [r8 + 8*rcx + 16], xmm1
+ vmulsd xmm1, xmm0, qword ptr [rdx + 8*rcx + 24]
+ vmovsd qword ptr [r8 + 8*rcx + 24], xmm1
+ add rcx, 4
+ cmp rax, rcx
+ jne .LBB1_724
+ jmp .LBB1_1109
+.LBB1_69:
+ cmp edi, 8
+ jle .LBB1_137
+# %bb.70:
+ cmp edi, 9
+ je .LBB1_205
+# %bb.71:
+ cmp edi, 11
+ je .LBB1_208
+# %bb.72:
+ cmp edi, 12
+ jne .LBB1_1109
+# %bb.73:
+ test r9d, r9d
+ jle .LBB1_1109
+# %bb.74:
+ vmovsd xmm0, qword ptr [rcx] # xmm0 = mem[0],zero
+ mov eax, r9d
+ cmp r9d, 16
+ jb .LBB1_75
+# %bb.289:
+ lea rcx, [rdx + 8*rax]
+ cmp rcx, r8
+ jbe .LBB1_469
... 91290 lines suppressed ...