Posted to commits@arrow.apache.org by ze...@apache.org on 2022/11/07 22:25:20 UTC

[arrow] branch master updated: ARROW-18108: [Go] More scalar binary arithmetic (Multiply and Divide) (#14544)

This is an automated email from the ASF dual-hosted git repository.

zeroshade pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 98943d90da ARROW-18108: [Go] More scalar binary arithmetic (Multiply and Divide) (#14544)
98943d90da is described below

commit 98943d90dacb0311fe0d7a17a8ef10762e1c0ef2
Author: Matt Topol <zo...@gmail.com>
AuthorDate: Mon Nov 7 17:25:14 2022 -0500

    ARROW-18108: [Go] More scalar binary arithmetic (Multiply and Divide) (#14544)
    
    Authored-by: Matt Topol <zo...@gmail.com>
    Signed-off-by: Matt Topol <zo...@gmail.com>
---
 go/arrow/array/compare.go                          |     8 +-
 go/arrow/array/numeric_test.go                     |    18 +
 go/arrow/compute/arithmetic.go                     |    88 +-
 go/arrow/compute/arithmetic_test.go                |   503 +-
 go/arrow/compute/cast_test.go                      |     8 +-
 .../internal/kernels/_lib/base_arithmetic.cc       |   111 +-
 .../kernels/_lib/base_arithmetic_avx2_amd64.s      | 20336 +++++++++++++------
 .../kernels/_lib/base_arithmetic_sse4_amd64.s      | 17932 ++++++++++------
 go/arrow/compute/internal/kernels/_lib/types.h     |   340 +
 .../compute/internal/kernels/base_arithmetic.go    |    86 +-
 .../internal/kernels/base_arithmetic_amd64.go      |     6 +-
 .../internal/kernels/base_arithmetic_avx2_amd64.s  | 19677 ++++++++++++------
 .../internal/kernels/base_arithmetic_sse4_amd64.s  | 16877 ++++++++++-----
 .../compute/internal/kernels/scalar_arithmetic.go  |     4 +
 go/arrow/decimal128/decimal128.go                  |    39 +-
 go/arrow/decimal256/decimal256.go                  |    69 +-
 go/arrow/scalar/parse.go                           |    14 +
 17 files changed, 52762 insertions(+), 23354 deletions(-)

diff --git a/go/arrow/array/compare.go b/go/arrow/array/compare.go
index ea8ac25203..68143e0086 100644
--- a/go/arrow/array/compare.go
+++ b/go/arrow/array/compare.go
@@ -402,18 +402,18 @@ func (eq equalOption) f32(f1, f2 float32) bool {
 	v2 := float64(f2)
 	switch {
 	case eq.nansEq:
-		return math.Abs(v1-v2) <= eq.atol || (math.IsNaN(v1) && math.IsNaN(v2))
+		return v1 == v2 || math.Abs(v1-v2) <= eq.atol || (math.IsNaN(v1) && math.IsNaN(v2))
 	default:
-		return math.Abs(v1-v2) <= eq.atol
+		return v1 == v2 || math.Abs(v1-v2) <= eq.atol
 	}
 }
 
 func (eq equalOption) f64(v1, v2 float64) bool {
 	switch {
 	case eq.nansEq:
-		return math.Abs(v1-v2) <= eq.atol || (math.IsNaN(v1) && math.IsNaN(v2))
+		return v1 == v2 || math.Abs(v1-v2) <= eq.atol || (math.IsNaN(v1) && math.IsNaN(v2))
 	default:
-		return math.Abs(v1-v2) <= eq.atol
+		return v1 == v2 || math.Abs(v1-v2) <= eq.atol
 	}
 }
 
diff --git a/go/arrow/array/numeric_test.go b/go/arrow/array/numeric_test.go
index 8efe41f97f..e485ba47f9 100644
--- a/go/arrow/array/numeric_test.go
+++ b/go/arrow/array/numeric_test.go
@@ -17,6 +17,8 @@
 package array_test
 
 import (
+	"encoding/json"
+	"math"
 	"reflect"
 	"testing"
 
@@ -135,6 +137,22 @@ func TestFloat64SliceDataWithNull(t *testing.T) {
 	}
 }
 
+func TestUnmarshalSpecialFloat(t *testing.T) {
+	pool := memory.NewCheckedAllocator(memory.NewGoAllocator())
+	defer pool.AssertSize(t, 0)
+
+	bldr := array.NewFloat32Builder(pool)
+	defer bldr.Release()
+
+	assert.NoError(t, json.Unmarshal([]byte(`[3.4, "Inf", "-Inf"]`), bldr))
+	arr := bldr.NewFloat32Array()
+	defer arr.Release()
+
+	assert.False(t, math.IsInf(float64(arr.Value(0)), 0), arr.Value(0))
+	assert.True(t, math.IsInf(float64(arr.Value(1)), 1), arr.Value(1))
+	assert.True(t, math.IsInf(float64(arr.Value(2)), -1), arr.Value(2))
+}
+
 func TestNewTime32Data(t *testing.T) {
 	data := []arrow.Time32{
 		arrow.Time32(1),
diff --git a/go/arrow/compute/arithmetic.go b/go/arrow/compute/arithmetic.go
index 865fa5dfe0..4b6f6109a5 100644
--- a/go/arrow/compute/arithmetic.go
+++ b/go/arrow/compute/arithmetic.go
@@ -227,6 +227,53 @@ func RegisterScalarArithmetic(reg FunctionRegistry) {
 
 		reg.AddFunction(fn, false)
 	}
+
+	oplist := []struct {
+		funcName    string
+		op          kernels.ArithmeticOp
+		decPromote  decimalPromotion
+		commutative bool
+	}{
+		{"multiply_unchecked", kernels.OpMul, decPromoteMultiply, true},
+		{"multiply", kernels.OpMulChecked, decPromoteMultiply, true},
+		{"divide_unchecked", kernels.OpDiv, decPromoteDivide, false},
+		{"divide", kernels.OpDivChecked, decPromoteDivide, false},
+	}
+
+	for _, o := range oplist {
+		fn := &arithmeticFunction{*NewScalarFunction(o.funcName, Binary(), addDoc), o.decPromote}
+		for _, k := range append(kernels.GetArithmeticKernels(o.op), kernels.GetDecimalBinaryKernels(o.op)...) {
+			if err := fn.AddKernel(k); err != nil {
+				panic(err)
+			}
+		}
+
+		for _, unit := range arrow.TimeUnitValues {
+			durInput := exec.NewExactInput(&arrow.DurationType{Unit: unit})
+			i64Input := exec.NewExactInput(arrow.PrimitiveTypes.Int64)
+			durOutput := exec.NewOutputType(&arrow.DurationType{Unit: unit})
+			ex := kernels.ArithmeticExec(arrow.DURATION, o.op)
+			err := fn.AddNewKernel([]exec.InputType{durInput, i64Input}, durOutput, ex, nil)
+			if err != nil {
+				panic(err)
+			}
+			if o.commutative {
+				err = fn.AddNewKernel([]exec.InputType{i64Input, durInput}, durOutput, ex, nil)
+				if err != nil {
+					panic(err)
+				}
+			}
+		}
+
+		reg.AddFunction(fn, false)
+	}
+}
+
+func impl(ctx context.Context, fn string, opts ArithmeticOptions, left, right Datum) (Datum, error) {
+	if opts.NoCheckOverflow {
+		fn += "_unchecked"
+	}
+	return CallFunction(ctx, fn, nil, left, right)
 }
 
 // Add performs an addition between the passed in arguments (scalar or array)
@@ -235,13 +282,9 @@ func RegisterScalarArithmetic(reg FunctionRegistry) {
 //
 // ArithmeticOptions specifies whether or not to check for overflows,
 // performance is faster if not explicitly checking for overflows but
-// will error on an overflow if CheckOverflow is true.
+// will error on an overflow if NoCheckOverflow is false (default).
 func Add(ctx context.Context, opts ArithmeticOptions, left, right Datum) (Datum, error) {
-	fn := "add"
-	if opts.NoCheckOverflow {
-		fn = "add_unchecked"
-	}
-	return CallFunction(ctx, fn, nil, left, right)
+	return impl(ctx, "add", opts, left, right)
 }
 
 // Sub performs a subtraction between the passed in arguments (scalar or array)
@@ -250,11 +293,32 @@ func Add(ctx context.Context, opts ArithmeticOptions, left, right Datum) (Datum,
 //
 // ArithmeticOptions specifies whether or not to check for overflows,
 // performance is faster if not explicitly checking for overflows but
-// will error on an overflow if CheckOverflow is true.
+// will error on an overflow if NoCheckOverflow is false (default).
 func Subtract(ctx context.Context, opts ArithmeticOptions, left, right Datum) (Datum, error) {
-	fn := "sub"
-	if opts.NoCheckOverflow {
-		fn = "sub_unchecked"
-	}
-	return CallFunction(ctx, fn, nil, left, right)
+	return impl(ctx, "sub", opts, left, right)
+}
+
+// Multiply performs a multiplication between the passed in arguments (scalar or array)
+// and returns the result. If one argument is a scalar and the other is an
+// array, the scalar value is multiplied against each value of the array.
+//
+// ArithmeticOptions specifies whether or not to check for overflows,
+// performance is faster if not explicitly checking for overflows but
+// will error on an overflow if NoCheckOverflow is false (default).
+func Multiply(ctx context.Context, opts ArithmeticOptions, left, right Datum) (Datum, error) {
+	return impl(ctx, "multiply", opts, left, right)
+}
+
+// Divide performs a division between the passed in arguments (scalar or array)
+// and returns the result. If one argument is a scalar and the other is an
+// array, the scalar value is used with each value of the array.
+//
+// ArithmeticOptions specifies whether or not to check for overflows,
+// performance is faster if not explicitly checking for overflows but
+// will error on an overflow if NoCheckOverflow is false (default).
+//
+// Will error on divide by zero regardless of whether or not checking for
+// overflows.
+func Divide(ctx context.Context, opts ArithmeticOptions, left, right Datum) (Datum, error) {
+	return impl(ctx, "divide", opts, left, right)
 }
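
For orientation, here is a minimal usage sketch of the Multiply and Divide entry points added in this file. It is not part of the commit: the module import path (major version suffix) and the sample values are assumptions, and it relies on the compute package's default function registry and execution context.

    package main

    import (
        "context"
        "fmt"

        "github.com/apache/arrow/go/v11/arrow/array" // module path / major version assumed
        "github.com/apache/arrow/go/v11/arrow/compute"
        "github.com/apache/arrow/go/v11/arrow/memory"
        "github.com/apache/arrow/go/v11/arrow/scalar"
    )

    func main() {
        // Build a small int64 array: [3, 2, 6].
        bldr := array.NewInt64Builder(memory.DefaultAllocator)
        defer bldr.Release()
        bldr.AppendValues([]int64{3, 2, 6}, nil)
        arr := bldr.NewInt64Array()
        defer arr.Release()

        ctx := context.Background()
        lhs := &compute.ArrayDatum{Value: arr.Data()}
        rhs := compute.NewDatum(scalar.MakeScalar(int64(2))) // scalar broadcast against the array

        // Checked multiply: NoCheckOverflow is false by default, so overflow returns an error.
        prod, err := compute.Multiply(ctx, compute.ArithmeticOptions{}, lhs, rhs)
        if err != nil {
            panic(err)
        }
        defer prod.Release()
        prodArr := prod.(*compute.ArrayDatum).MakeArray()
        defer prodArr.Release()
        fmt.Println(prodArr) // expect [6 4 12]

        // Unchecked divide: skips overflow checks; integer divide-by-zero still errors.
        quot, err := compute.Divide(ctx, compute.ArithmeticOptions{NoCheckOverflow: true}, lhs, rhs)
        if err != nil {
            panic(err)
        }
        defer quot.Release()
        quotArr := quot.(*compute.ArrayDatum).MakeArray()
        defer quotArr.Release()
        fmt.Println(quotArr) // expect [1 1 3]
    }

Leaving NoCheckOverflow at its zero value dispatches to the checked "multiply"/"divide" kernels; setting it to true selects the "multiply_unchecked"/"divide_unchecked" variants via the impl helper above.
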
diff --git a/go/arrow/compute/arithmetic_test.go b/go/arrow/compute/arithmetic_test.go
index 5396188883..12e837a811 100644
--- a/go/arrow/compute/arithmetic_test.go
+++ b/go/arrow/compute/arithmetic_test.go
@@ -62,15 +62,15 @@ type binaryArithmeticFunc = func(context.Context, compute.ArithmeticOptions, com
 
 type binaryFunc = func(left, right compute.Datum) (compute.Datum, error)
 
-func assertScalarEquals(t *testing.T, expected, actual scalar.Scalar) {
-	assert.Truef(t, scalar.Equals(expected, actual), "expected: %s\ngot: %s", expected, actual)
+func assertScalarEquals(t *testing.T, expected, actual scalar.Scalar, opt ...scalar.EqualOption) {
+	assert.Truef(t, scalar.ApproxEquals(expected, actual, opt...), "expected: %s\ngot: %s", expected, actual)
 }
 
-func assertBinop(t *testing.T, fn binaryFunc, left, right, expected arrow.Array) {
+func assertBinop(t *testing.T, fn binaryFunc, left, right, expected arrow.Array, opt []array.EqualOption, scalarOpt []scalar.EqualOption) {
 	actual, err := fn(&compute.ArrayDatum{Value: left.Data()}, &compute.ArrayDatum{Value: right.Data()})
 	require.NoError(t, err)
 	defer actual.Release()
-	assertDatumsEqual(t, &compute.ArrayDatum{Value: expected.Data()}, actual)
+	assertDatumsEqual(t, &compute.ArrayDatum{Value: expected.Data()}, actual, opt...)
 
 	// also check (Scalar, Scalar) operations
 	for i := 0; i < expected.Len(); i++ {
@@ -81,7 +81,7 @@ func assertBinop(t *testing.T, fn binaryFunc, left, right, expected arrow.Array)
 
 		actual, err := fn(&compute.ScalarDatum{Value: lhs}, &compute.ScalarDatum{Value: rhs})
 		assert.NoError(t, err)
-		assertScalarEquals(t, s, actual.(*compute.ScalarDatum).Value)
+		assertScalarEquals(t, s, actual.(*compute.ScalarDatum).Value, scalarOpt...)
 	}
 }
 
@@ -146,14 +146,21 @@ func (b *Float16BinaryFuncTestSuite) TestSub() {
 type BinaryArithmeticSuite[T exec.NumericTypes] struct {
 	BinaryFuncTestSuite
 
-	opts     compute.ArithmeticOptions
-	min, max T
+	opts            compute.ArithmeticOptions
+	min, max        T
+	equalOpts       []array.EqualOption
+	scalarEqualOpts []scalar.EqualOption
 }
 
 func (BinaryArithmeticSuite[T]) DataType() arrow.DataType {
 	return exec.GetDataType[T]()
 }
 
+func (b *BinaryArithmeticSuite[T]) setNansEqual(val bool) {
+	b.equalOpts = []array.EqualOption{array.WithNaNsEqual(val)}
+	b.scalarEqualOpts = []scalar.EqualOption{scalar.WithNaNsEqual(val)}
+}
+
 func (b *BinaryArithmeticSuite[T]) SetupTest() {
 	b.BinaryFuncTestSuite.SetupTest()
 	b.opts.NoCheckOverflow = false
@@ -209,7 +216,7 @@ func (b *BinaryArithmeticSuite[T]) assertBinopArrScalar(fn binaryArithmeticFunc,
 	actual, err := fn(b.ctx, b.opts, &compute.ArrayDatum{Value: left.Data()}, &compute.ScalarDatum{Value: rhs})
 	b.NoError(err)
 	defer actual.Release()
-	assertDatumsEqual(b.T(), &compute.ArrayDatum{Value: exp.Data()}, actual)
+	assertDatumsEqual(b.T(), &compute.ArrayDatum{Value: exp.Data()}, actual, b.equalOpts...)
 }
 
 func (b *BinaryArithmeticSuite[T]) assertBinop(fn binaryArithmeticFunc, lhs, rhs, expected string) {
@@ -222,11 +229,11 @@ func (b *BinaryArithmeticSuite[T]) assertBinop(fn binaryArithmeticFunc, lhs, rhs
 
 	assertBinop(b.T(), func(left, right compute.Datum) (compute.Datum, error) {
 		return fn(b.ctx, b.opts, left, right)
-	}, left, right, exp)
+	}, left, right, exp, b.equalOpts, b.scalarEqualOpts)
 }
 
 func (b *BinaryArithmeticSuite[T]) setOverflowCheck(value bool) {
-	b.opts.NoCheckOverflow = value
+	b.opts.NoCheckOverflow = !value
 }
 
 func (b *BinaryArithmeticSuite[T]) assertBinopErr(fn binaryArithmeticFunc, lhs, rhs, expectedMsg string) {
@@ -267,7 +274,7 @@ func (b *BinaryArithmeticSuite[T]) TestAdd() {
 				b.assertBinopArrScalar(compute.Add, `[1, 2]`, b.makeNullScalar(), `[null, null]`)
 				b.assertBinopArrScalar(compute.Add, `[null, 2]`, b.makeNullScalar(), `[null, null]`)
 
-				if !arrow.IsFloating(b.DataType().ID()) && !overflow {
+				if !arrow.IsFloating(b.DataType().ID()) && overflow {
 					val := fmt.Sprintf("[%v]", b.max)
 					b.assertBinopErr(compute.Add, val, val, "overflow")
 				}
@@ -303,7 +310,7 @@ func (b *BinaryArithmeticSuite[T]) TestSub() {
 				b.assertBinopArrScalar(compute.Subtract, `[1, 2]`, b.makeNullScalar(), `[null, null]`)
 				b.assertBinopArrScalar(compute.Subtract, `[null, 2]`, b.makeNullScalar(), `[null, null]`)
 
-				if !arrow.IsFloating(b.DataType().ID()) && !overflow {
+				if !arrow.IsFloating(b.DataType().ID()) && overflow {
 					b.assertBinopErr(compute.Subtract, fmt.Sprintf("[%v]", b.min), fmt.Sprintf("[%v]", b.max), "overflow")
 				}
 			})
@@ -311,6 +318,92 @@ func (b *BinaryArithmeticSuite[T]) TestSub() {
 	})
 }
 
+func (b *BinaryArithmeticSuite[T]) TestMultiply() {
+	b.Run(b.DataType().String(), func() {
+		for _, overflow := range []bool{false, true} {
+			b.Run(fmt.Sprintf("no_overflow_check=%t", overflow), func() {
+				b.setOverflowCheck(overflow)
+
+				b.assertBinop(compute.Multiply, `[]`, `[]`, `[]`)
+				b.assertBinop(compute.Multiply, `[3, 2, 6]`, `[1, 0, 2]`, `[3, 0, 12]`)
+				// nulls on one side
+				b.assertBinop(compute.Multiply, `[null, 2, null]`, `[4, 5, 6]`, `[null, 10, null]`)
+				b.assertBinop(compute.Multiply, `[4, 5, 6]`, `[null, 2, null]`, `[null, 10, null]`)
+				// nulls on both sides
+				b.assertBinop(compute.Multiply, `[null, 2, 3]`, `[4, 5, null]`, `[null, 10, null]`)
+				// all nulls
+				b.assertBinop(compute.Multiply, `[null]`, `[null]`, `[null]`)
+
+				// scalar on left
+				b.assertBinopScalarValArr(compute.Multiply, 3, `[4, 5]`, `[12, 15]`)
+				b.assertBinopScalarValArr(compute.Multiply, 3, `[null, 5]`, `[null, 15]`)
+				b.assertBinopScalarArr(compute.Multiply, b.makeNullScalar(), `[1, 2]`, `[null, null]`)
+				b.assertBinopScalarArr(compute.Multiply, b.makeNullScalar(), `[null, 2]`, `[null, null]`)
+				// scalar on right
+				b.assertBinopArrScalarVal(compute.Multiply, `[4, 5]`, 3, `[12, 15]`)
+				b.assertBinopArrScalarVal(compute.Multiply, `[null, 5]`, 3, `[null, 15]`)
+				b.assertBinopArrScalar(compute.Multiply, `[1, 2]`, b.makeNullScalar(), `[null, null]`)
+				b.assertBinopArrScalar(compute.Multiply, `[null, 2]`, b.makeNullScalar(), `[null, null]`)
+			})
+		}
+	})
+}
+
+func (b *BinaryArithmeticSuite[T]) TestDiv() {
+	b.Run(b.DataType().String(), func() {
+		for _, overflow := range []bool{false, true} {
+			b.Run(fmt.Sprintf("no_overflow_check=%t", overflow), func() {
+				b.setOverflowCheck(overflow)
+
+				// empty arrays
+				b.assertBinop(compute.Divide, `[]`, `[]`, `[]`)
+				// ordinary arrays
+				b.assertBinop(compute.Divide, `[3, 2, 6]`, `[1, 1, 2]`, `[3, 2, 3]`)
+				// with nulls
+				b.assertBinop(compute.Divide, `[null, 10, 30, null, 20]`, `[1, 5, 2, 5, 10]`, `[null, 2, 15, null, 2]`)
+				if !arrow.IsFloating(b.DataType().ID()) {
+					// scalar divided by array
+					b.assertBinopScalarValArr(compute.Divide, 33, `[null, 1, 3, null, 2]`, `[null, 33, 11, null, 16]`)
+					// array divided by scalar
+					b.assertBinopArrScalarVal(compute.Divide, `[null, 10, 30, null, 2]`, 3, `[null, 3, 10, null, 0]`)
+					// scalar divided by scalar
+					b.assertBinopScalars(compute.Divide, 16, 7, 2)
+				} else {
+					b.assertBinop(compute.Divide, `[3.4, 0.64, 1.28]`, `[1, 2, 4]`, `[3.4, 0.32, 0.32]`)
+					b.assertBinop(compute.Divide, `[null, 1, 3.3, null, 2]`, `[1, 4, 2, 5, 0.1]`, `[null, 0.25, 1.65, null, 20]`)
+					b.assertBinopScalarValArr(compute.Divide, 10, `[null, 1, 2.5, null, 2, 5]`, `[null, 10, 4, null, 5, 2]`)
+					b.assertBinopArrScalarVal(compute.Divide, `[null, 1, 2.5, null, 2, 5]`, 10, `[null, 0.1, 0.25, null, 0.2, 0.5]`)
+
+					b.assertBinop(compute.Divide, `[3.4, "Inf", "-Inf"]`, `[1, 2, 3]`, `[3.4, "Inf", "-Inf"]`)
+					b.setNansEqual(true)
+					b.assertBinop(compute.Divide, `[3.4, "NaN", 2.0]`, `[1, 2, 2.0]`, `[3.4, "NaN", 1.0]`)
+					b.assertBinopScalars(compute.Divide, 21, 3, 7)
+				}
+			})
+		}
+	})
+}
+
+func (b *BinaryArithmeticSuite[T]) TestDivideByZero() {
+	if !arrow.IsFloating(b.DataType().ID()) {
+		for _, checkOverflow := range []bool{false, true} {
+			b.setOverflowCheck(checkOverflow)
+			b.assertBinopErr(compute.Divide, `[3, 2, 6]`, `[1, 1, 0]`, "divide by zero")
+		}
+	} else {
+		b.setOverflowCheck(true)
+		b.assertBinopErr(compute.Divide, `[3, 2, 6]`, `[1, 1, 0]`, "divide by zero")
+		b.assertBinopErr(compute.Divide, `[3, 2, 0]`, `[1, 1, 0]`, "divide by zero")
+		b.assertBinopErr(compute.Divide, `[3, 2, -6]`, `[1, 1, 0]`, "divide by zero")
+
+		b.setOverflowCheck(false)
+		b.setNansEqual(true)
+		b.assertBinop(compute.Divide, `[3, 2, 6]`, `[1, 1, 0]`, `[3, 2, "Inf"]`)
+		b.assertBinop(compute.Divide, `[3, 2, 0]`, `[1, 1, 0]`, `[3, 2, "NaN"]`)
+		b.assertBinop(compute.Divide, `[3, 2, -6]`, `[1, 1, 0]`, `[3, 2, "-Inf"]`)
+	}
+}
+
 func TestBinaryArithmetic(t *testing.T) {
 	suite.Run(t, &BinaryArithmeticSuite[int8]{min: math.MinInt8, max: math.MaxInt8})
 	suite.Run(t, &BinaryArithmeticSuite[uint8]{min: 0, max: math.MaxUint8})
@@ -425,66 +518,159 @@ type DecimalBinaryArithmeticSuite struct {
 
 func (ds *DecimalBinaryArithmeticSuite) TestDispatchBest() {
 	// decimal, floating point
-	for _, fn := range []string{"add", "sub"} {
-		for _, suffix := range []string{"", "_unchecked"} {
-			fn += suffix
-
-			CheckDispatchBest(ds.T(), fn, []arrow.DataType{
-				&arrow.Decimal128Type{Precision: 1, Scale: 0},
-				arrow.PrimitiveTypes.Float32}, []arrow.DataType{
-				arrow.PrimitiveTypes.Float32, arrow.PrimitiveTypes.Float32})
-			CheckDispatchBest(ds.T(), fn, []arrow.DataType{
-				&arrow.Decimal256Type{Precision: 1, Scale: 0}, arrow.PrimitiveTypes.Float64},
-				[]arrow.DataType{arrow.PrimitiveTypes.Float64, arrow.PrimitiveTypes.Float64})
-			CheckDispatchBest(ds.T(), fn, []arrow.DataType{
-				arrow.PrimitiveTypes.Float32, &arrow.Decimal256Type{Precision: 1, Scale: 0}},
-				[]arrow.DataType{arrow.PrimitiveTypes.Float32, arrow.PrimitiveTypes.Float32})
-			CheckDispatchBest(ds.T(), fn, []arrow.DataType{
-				arrow.PrimitiveTypes.Float64, &arrow.Decimal128Type{Precision: 1, Scale: 0}},
-				[]arrow.DataType{arrow.PrimitiveTypes.Float64, arrow.PrimitiveTypes.Float64})
+	ds.Run("dec/floatingpoint", func() {
+		for _, fn := range []string{"add", "sub", "multiply", "divide"} {
+			for _, suffix := range []string{"", "_unchecked"} {
+				fn += suffix
+				ds.Run(fn, func() {
+
+					CheckDispatchBest(ds.T(), fn, []arrow.DataType{
+						&arrow.Decimal128Type{Precision: 1, Scale: 0},
+						arrow.PrimitiveTypes.Float32}, []arrow.DataType{
+						arrow.PrimitiveTypes.Float32, arrow.PrimitiveTypes.Float32})
+					CheckDispatchBest(ds.T(), fn, []arrow.DataType{
+						&arrow.Decimal256Type{Precision: 1, Scale: 0}, arrow.PrimitiveTypes.Float64},
+						[]arrow.DataType{arrow.PrimitiveTypes.Float64, arrow.PrimitiveTypes.Float64})
+					CheckDispatchBest(ds.T(), fn, []arrow.DataType{
+						arrow.PrimitiveTypes.Float32, &arrow.Decimal256Type{Precision: 1, Scale: 0}},
+						[]arrow.DataType{arrow.PrimitiveTypes.Float32, arrow.PrimitiveTypes.Float32})
+					CheckDispatchBest(ds.T(), fn, []arrow.DataType{
+						arrow.PrimitiveTypes.Float64, &arrow.Decimal128Type{Precision: 1, Scale: 0}},
+						[]arrow.DataType{arrow.PrimitiveTypes.Float64, arrow.PrimitiveTypes.Float64})
+				})
+			}
 		}
-	}
+	})
 
 	// decimal, decimal => decimal
 	// decimal, integer => decimal
-	for _, fn := range []string{"add", "sub"} {
+	ds.Run("dec/dec_int", func() {
+		for _, fn := range []string{"add", "sub"} {
+			for _, suffix := range []string{"", "_unchecked"} {
+				fn += suffix
+				ds.Run(fn, func() {
+					CheckDispatchBest(ds.T(), fn, []arrow.DataType{
+						arrow.PrimitiveTypes.Int64, &arrow.Decimal128Type{Precision: 1, Scale: 0}},
+						[]arrow.DataType{&arrow.Decimal128Type{Precision: 19, Scale: 0},
+							&arrow.Decimal128Type{Precision: 1, Scale: 0}})
+					CheckDispatchBest(ds.T(), fn, []arrow.DataType{
+						&arrow.Decimal128Type{Precision: 1, Scale: 0}, arrow.PrimitiveTypes.Int64},
+						[]arrow.DataType{&arrow.Decimal128Type{Precision: 1, Scale: 0},
+							&arrow.Decimal128Type{Precision: 19, Scale: 0}})
+
+					CheckDispatchBest(ds.T(), fn, []arrow.DataType{
+						&arrow.Decimal128Type{Precision: 2, Scale: 1}, &arrow.Decimal128Type{Precision: 2, Scale: 1}},
+						[]arrow.DataType{&arrow.Decimal128Type{Precision: 2, Scale: 1},
+							&arrow.Decimal128Type{Precision: 2, Scale: 1}})
+					CheckDispatchBest(ds.T(), fn, []arrow.DataType{
+						&arrow.Decimal256Type{Precision: 2, Scale: 1}, &arrow.Decimal256Type{Precision: 2, Scale: 1}},
+						[]arrow.DataType{&arrow.Decimal256Type{Precision: 2, Scale: 1},
+							&arrow.Decimal256Type{Precision: 2, Scale: 1}})
+					CheckDispatchBest(ds.T(), fn, []arrow.DataType{
+						&arrow.Decimal128Type{Precision: 2, Scale: 1}, &arrow.Decimal256Type{Precision: 2, Scale: 1}},
+						[]arrow.DataType{&arrow.Decimal256Type{Precision: 2, Scale: 1},
+							&arrow.Decimal256Type{Precision: 2, Scale: 1}})
+					CheckDispatchBest(ds.T(), fn, []arrow.DataType{
+						&arrow.Decimal256Type{Precision: 2, Scale: 1}, &arrow.Decimal128Type{Precision: 2, Scale: 1}},
+						[]arrow.DataType{&arrow.Decimal256Type{Precision: 2, Scale: 1},
+							&arrow.Decimal256Type{Precision: 2, Scale: 1}})
+
+					CheckDispatchBest(ds.T(), fn, []arrow.DataType{
+						&arrow.Decimal128Type{Precision: 2, Scale: 0}, &arrow.Decimal128Type{Precision: 2, Scale: 1}},
+						[]arrow.DataType{&arrow.Decimal128Type{Precision: 3, Scale: 1},
+							&arrow.Decimal128Type{Precision: 2, Scale: 1}})
+					CheckDispatchBest(ds.T(), fn, []arrow.DataType{
+						&arrow.Decimal128Type{Precision: 2, Scale: 1}, &arrow.Decimal128Type{Precision: 2, Scale: 0}},
+						[]arrow.DataType{&arrow.Decimal128Type{Precision: 2, Scale: 1},
+							&arrow.Decimal128Type{Precision: 3, Scale: 1}})
+				})
+			}
+		}
+	})
+
+	{
+		fn := "multiply"
 		for _, suffix := range []string{"", "_unchecked"} {
 			fn += suffix
+			ds.Run(fn, func() {
+				CheckDispatchBest(ds.T(), fn, []arrow.DataType{
+					arrow.PrimitiveTypes.Int64, &arrow.Decimal128Type{Precision: 1}},
+					[]arrow.DataType{&arrow.Decimal128Type{Precision: 19},
+						&arrow.Decimal128Type{Precision: 1}})
+				CheckDispatchBest(ds.T(), fn, []arrow.DataType{
+					&arrow.Decimal128Type{Precision: 1}, arrow.PrimitiveTypes.Int64},
+					[]arrow.DataType{&arrow.Decimal128Type{Precision: 1},
+						&arrow.Decimal128Type{Precision: 19}})
+
+				CheckDispatchBest(ds.T(), fn, []arrow.DataType{
+					&arrow.Decimal128Type{Precision: 2, Scale: 1}, &arrow.Decimal128Type{Precision: 2, Scale: 1}},
+					[]arrow.DataType{&arrow.Decimal128Type{Precision: 2, Scale: 1},
+						&arrow.Decimal128Type{Precision: 2, Scale: 1}})
+				CheckDispatchBest(ds.T(), fn, []arrow.DataType{
+					&arrow.Decimal256Type{Precision: 2, Scale: 1}, &arrow.Decimal256Type{Precision: 2, Scale: 1}},
+					[]arrow.DataType{&arrow.Decimal256Type{Precision: 2, Scale: 1},
+						&arrow.Decimal256Type{Precision: 2, Scale: 1}})
+				CheckDispatchBest(ds.T(), fn, []arrow.DataType{
+					&arrow.Decimal128Type{Precision: 2, Scale: 1}, &arrow.Decimal256Type{Precision: 2, Scale: 1}},
+					[]arrow.DataType{&arrow.Decimal256Type{Precision: 2, Scale: 1},
+						&arrow.Decimal256Type{Precision: 2, Scale: 1}})
+				CheckDispatchBest(ds.T(), fn, []arrow.DataType{
+					&arrow.Decimal256Type{Precision: 2, Scale: 1}, &arrow.Decimal128Type{Precision: 2, Scale: 1}},
+					[]arrow.DataType{&arrow.Decimal256Type{Precision: 2, Scale: 1},
+						&arrow.Decimal256Type{Precision: 2, Scale: 1}})
+
+				CheckDispatchBest(ds.T(), fn, []arrow.DataType{
+					&arrow.Decimal128Type{Precision: 2, Scale: 0}, &arrow.Decimal128Type{Precision: 2, Scale: 1}},
+					[]arrow.DataType{&arrow.Decimal128Type{Precision: 2, Scale: 0},
+						&arrow.Decimal128Type{Precision: 2, Scale: 1}})
+				CheckDispatchBest(ds.T(), fn, []arrow.DataType{
+					&arrow.Decimal128Type{Precision: 2, Scale: 1}, &arrow.Decimal128Type{Precision: 2, Scale: 0}},
+					[]arrow.DataType{&arrow.Decimal128Type{Precision: 2, Scale: 1},
+						&arrow.Decimal128Type{Precision: 2, Scale: 0}})
+			})
+		}
+	}
 
-			CheckDispatchBest(ds.T(), fn, []arrow.DataType{
-				arrow.PrimitiveTypes.Int64, &arrow.Decimal128Type{Precision: 1, Scale: 0}},
-				[]arrow.DataType{&arrow.Decimal128Type{Precision: 19, Scale: 0},
-					&arrow.Decimal128Type{Precision: 1, Scale: 0}})
-			CheckDispatchBest(ds.T(), fn, []arrow.DataType{
-				&arrow.Decimal128Type{Precision: 1, Scale: 0}, arrow.PrimitiveTypes.Int64},
-				[]arrow.DataType{&arrow.Decimal128Type{Precision: 1, Scale: 0},
-					&arrow.Decimal128Type{Precision: 19, Scale: 0}})
-
-			CheckDispatchBest(ds.T(), fn, []arrow.DataType{
-				&arrow.Decimal128Type{Precision: 2, Scale: 1}, &arrow.Decimal128Type{Precision: 2, Scale: 1}},
-				[]arrow.DataType{&arrow.Decimal128Type{Precision: 2, Scale: 1},
-					&arrow.Decimal128Type{Precision: 2, Scale: 1}})
-			CheckDispatchBest(ds.T(), fn, []arrow.DataType{
-				&arrow.Decimal256Type{Precision: 2, Scale: 1}, &arrow.Decimal256Type{Precision: 2, Scale: 1}},
-				[]arrow.DataType{&arrow.Decimal256Type{Precision: 2, Scale: 1},
-					&arrow.Decimal256Type{Precision: 2, Scale: 1}})
-			CheckDispatchBest(ds.T(), fn, []arrow.DataType{
-				&arrow.Decimal128Type{Precision: 2, Scale: 1}, &arrow.Decimal256Type{Precision: 2, Scale: 1}},
-				[]arrow.DataType{&arrow.Decimal256Type{Precision: 2, Scale: 1},
-					&arrow.Decimal256Type{Precision: 2, Scale: 1}})
-			CheckDispatchBest(ds.T(), fn, []arrow.DataType{
-				&arrow.Decimal256Type{Precision: 2, Scale: 1}, &arrow.Decimal128Type{Precision: 2, Scale: 1}},
-				[]arrow.DataType{&arrow.Decimal256Type{Precision: 2, Scale: 1},
-					&arrow.Decimal256Type{Precision: 2, Scale: 1}})
-
-			CheckDispatchBest(ds.T(), fn, []arrow.DataType{
-				&arrow.Decimal128Type{Precision: 2, Scale: 0}, &arrow.Decimal128Type{Precision: 2, Scale: 1}},
-				[]arrow.DataType{&arrow.Decimal128Type{Precision: 3, Scale: 1},
-					&arrow.Decimal128Type{Precision: 2, Scale: 1}})
-			CheckDispatchBest(ds.T(), fn, []arrow.DataType{
-				&arrow.Decimal128Type{Precision: 2, Scale: 1}, &arrow.Decimal128Type{Precision: 2, Scale: 0}},
-				[]arrow.DataType{&arrow.Decimal128Type{Precision: 2, Scale: 1},
-					&arrow.Decimal128Type{Precision: 3, Scale: 1}})
+	{
+		fn := "divide"
+		for _, suffix := range []string{"", "_unchecked"} {
+			fn += suffix
+			ds.Run(fn, func() {
+				CheckDispatchBest(ds.T(), fn, []arrow.DataType{
+					arrow.PrimitiveTypes.Int64, &arrow.Decimal128Type{Precision: 1, Scale: 0}},
+					[]arrow.DataType{&arrow.Decimal128Type{Precision: 23, Scale: 4},
+						&arrow.Decimal128Type{Precision: 1, Scale: 0}})
+				CheckDispatchBest(ds.T(), fn, []arrow.DataType{
+					&arrow.Decimal128Type{Precision: 1, Scale: 0}, arrow.PrimitiveTypes.Int64},
+					[]arrow.DataType{&arrow.Decimal128Type{Precision: 21, Scale: 20},
+						&arrow.Decimal128Type{Precision: 19, Scale: 0}})
+
+				CheckDispatchBest(ds.T(), fn, []arrow.DataType{
+					&arrow.Decimal128Type{Precision: 2, Scale: 1}, &arrow.Decimal128Type{Precision: 2, Scale: 1}},
+					[]arrow.DataType{&arrow.Decimal128Type{Precision: 6, Scale: 5},
+						&arrow.Decimal128Type{Precision: 2, Scale: 1}})
+				CheckDispatchBest(ds.T(), fn, []arrow.DataType{
+					&arrow.Decimal256Type{Precision: 2, Scale: 1}, &arrow.Decimal256Type{Precision: 2, Scale: 1}},
+					[]arrow.DataType{&arrow.Decimal256Type{Precision: 6, Scale: 5},
+						&arrow.Decimal256Type{Precision: 2, Scale: 1}})
+				CheckDispatchBest(ds.T(), fn, []arrow.DataType{
+					&arrow.Decimal128Type{Precision: 2, Scale: 1}, &arrow.Decimal256Type{Precision: 2, Scale: 1}},
+					[]arrow.DataType{&arrow.Decimal256Type{Precision: 6, Scale: 5},
+						&arrow.Decimal256Type{Precision: 2, Scale: 1}})
+				CheckDispatchBest(ds.T(), fn, []arrow.DataType{
+					&arrow.Decimal256Type{Precision: 2, Scale: 1}, &arrow.Decimal128Type{Precision: 2, Scale: 1}},
+					[]arrow.DataType{&arrow.Decimal256Type{Precision: 6, Scale: 5},
+						&arrow.Decimal256Type{Precision: 2, Scale: 1}})
+
+				CheckDispatchBest(ds.T(), fn, []arrow.DataType{
+					&arrow.Decimal128Type{Precision: 2, Scale: 0}, &arrow.Decimal128Type{Precision: 2, Scale: 1}},
+					[]arrow.DataType{&arrow.Decimal128Type{Precision: 7, Scale: 5},
+						&arrow.Decimal128Type{Precision: 2, Scale: 1}})
+				CheckDispatchBest(ds.T(), fn, []arrow.DataType{
+					&arrow.Decimal128Type{Precision: 2, Scale: 1}, &arrow.Decimal128Type{Precision: 2, Scale: 0}},
+					[]arrow.DataType{&arrow.Decimal128Type{Precision: 5, Scale: 4},
+						&arrow.Decimal128Type{Precision: 2, Scale: 0}})
+			})
 		}
 	}
 }
@@ -537,7 +723,7 @@ func (ds *DecimalBinaryArithmeticSuite) TestAddSubtractDec256() {
 		strings.NewReader(`[
 			"-2.00000000000000000001",
 			"2469135780.24691357800000000000",
-			"-9876549999.641975555509876543212",
+			"-9876549999.64197555550987654321",
 			"-99999999989999999999.99999999990000000001"
 		  ]`))
 	defer subtracted.Release()
@@ -606,6 +792,191 @@ func (ds *DecimalBinaryArithmeticSuite) TestAddSubScalars() {
 	})
 }
 
+func (ds *DecimalBinaryArithmeticSuite) TestMultiply() {
+	ds.Run("array x array, decimal128", func() {
+		left, _, err := array.FromJSON(ds.mem, &arrow.Decimal128Type{Precision: 20, Scale: 10},
+			strings.NewReader(`["1234567890.1234567890", "-0.0000000001", "-9999999999.9999999999"]`))
+		ds.Require().NoError(err)
+		defer left.Release()
+		right, _, err := array.FromJSON(ds.mem, &arrow.Decimal128Type{Precision: 13, Scale: 3},
+			strings.NewReader(`["1234567890.123", "0.001", "-9999999999.999"]`))
+		ds.Require().NoError(err)
+		defer right.Release()
+		expected, _, err := array.FromJSON(ds.mem, &arrow.Decimal128Type{Precision: 34, Scale: 13},
+			strings.NewReader(`["1524157875323319737.98709039504701", "-0.0000000000001", "99999999999989999999.0000000000001"]`))
+		ds.Require().NoError(err)
+		defer expected.Release()
+
+		checkScalarBinary(ds.T(), "multiply_unchecked", &compute.ArrayDatum{left.Data()}, &compute.ArrayDatum{right.Data()}, &compute.ArrayDatum{expected.Data()}, nil)
+	})
+
+	ds.Run("array x array decimal256", func() {
+		left, _, err := array.FromJSON(ds.mem, &arrow.Decimal256Type{Precision: 30, Scale: 3},
+			strings.NewReader(`["123456789012345678901234567.890", "0.000"]`))
+		ds.Require().NoError(err)
+		defer left.Release()
+		right, _, err := array.FromJSON(ds.mem, &arrow.Decimal256Type{Precision: 20, Scale: 9},
+			strings.NewReader(`["-12345678901.234567890", "99999999999.999999999"]`))
+		ds.Require().NoError(err)
+		defer right.Release()
+		expected, _, err := array.FromJSON(ds.mem, &arrow.Decimal256Type{Precision: 51, Scale: 12},
+			strings.NewReader(`["-1524157875323883675034293577501905199.875019052100", "0.000000000000"]`))
+		ds.Require().NoError(err)
+		defer expected.Release()
+		checkScalarBinary(ds.T(), "multiply_unchecked", &compute.ArrayDatum{left.Data()}, &compute.ArrayDatum{right.Data()}, &compute.ArrayDatum{expected.Data()}, nil)
+	})
+
+	ds.Run("scalar x array", func() {
+		left, err := scalar.ParseScalar(&arrow.Decimal128Type{Precision: 3, Scale: 2}, "3.14")
+		ds.Require().NoError(err)
+		right, _, err := array.FromJSON(ds.mem, &arrow.Decimal128Type{Precision: 1, Scale: 0},
+			strings.NewReader(`["1", "2", "3", "4", "5"]`))
+		ds.Require().NoError(err)
+		defer right.Release()
+		expected, _, err := array.FromJSON(ds.mem, &arrow.Decimal128Type{Precision: 5, Scale: 2},
+			strings.NewReader(`["3.14", "6.28", "9.42", "12.56", "15.70"]`))
+		ds.Require().NoError(err)
+		defer expected.Release()
+
+		leftDatum, rightDatum := &compute.ScalarDatum{left}, &compute.ArrayDatum{right.Data()}
+		expDatum := &compute.ArrayDatum{expected.Data()}
+
+		checkScalarBinary(ds.T(), "multiply_unchecked", leftDatum, rightDatum, expDatum, nil)
+		checkScalarBinary(ds.T(), "multiply_unchecked", rightDatum, leftDatum, expDatum, nil)
+	})
+
+	ds.Run("scalar x scalar", func() {
+		left, err := scalar.ParseScalar(&arrow.Decimal128Type{Precision: 1}, "1")
+		ds.Require().NoError(err)
+		right, err := scalar.ParseScalar(&arrow.Decimal128Type{Precision: 1}, "1")
+		ds.Require().NoError(err)
+		expected, err := scalar.ParseScalar(&arrow.Decimal128Type{Precision: 3}, "1")
+		ds.Require().NoError(err)
+		checkScalarBinary(ds.T(), "multiply_unchecked", compute.NewDatum(left), compute.NewDatum(right), compute.NewDatum(expected), nil)
+	})
+
+	ds.Run("decimal128 x decimal256", func() {
+		left, _ := scalar.ParseScalar(&arrow.Decimal128Type{Precision: 3, Scale: 2}, "6.66")
+		right, _ := scalar.ParseScalar(&arrow.Decimal256Type{Precision: 3, Scale: 1}, "88.8")
+		expected, _ := scalar.ParseScalar(&arrow.Decimal256Type{Precision: 7, Scale: 3}, "591.408")
+		checkScalarBinary(ds.T(), "multiply_unchecked", compute.NewDatum(left), compute.NewDatum(right), compute.NewDatum(expected), nil)
+		checkScalarBinary(ds.T(), "multiply_unchecked", compute.NewDatum(right), compute.NewDatum(left), compute.NewDatum(expected), nil)
+	})
+
+	ds.Run("decimal x float", func() {
+		left, _ := scalar.ParseScalar(&arrow.Decimal128Type{Precision: 3}, "666")
+		right := scalar.MakeScalar(float64(888))
+		expected := scalar.MakeScalar(float64(591408))
+		checkScalarBinary(ds.T(), "multiply_unchecked", compute.NewDatum(left), compute.NewDatum(right), compute.NewDatum(expected), nil)
+		checkScalarBinary(ds.T(), "multiply_unchecked", compute.NewDatum(right), compute.NewDatum(left), compute.NewDatum(expected), nil)
+	})
+
+	ds.Run("decimal x integer", func() {
+		left, _ := scalar.ParseScalar(&arrow.Decimal128Type{Precision: 3}, "666")
+		right := scalar.MakeScalar(int64(888))
+		expected, _ := scalar.ParseScalar(&arrow.Decimal128Type{Precision: 23}, "591408")
+		checkScalarBinary(ds.T(), "multiply_unchecked", compute.NewDatum(left), compute.NewDatum(right), compute.NewDatum(expected), nil)
+	})
+}
+
+func (ds *DecimalBinaryArithmeticSuite) TestDivide() {
+	ds.Run("array / array, decimal128", func() {
+		left, _, err := array.FromJSON(ds.mem, &arrow.Decimal128Type{Precision: 13, Scale: 3},
+			strings.NewReader(`["1234567890.123", "0.001"]`))
+		ds.Require().NoError(err)
+		defer left.Release()
+		right, _, err := array.FromJSON(ds.mem, &arrow.Decimal128Type{Precision: 3, Scale: 0},
+			strings.NewReader(`["-987", "999"]`))
+		ds.Require().NoError(err)
+		defer right.Release()
+		expected, _, err := array.FromJSON(ds.mem, &arrow.Decimal128Type{Precision: 17, Scale: 7},
+			strings.NewReader(`["-1250828.6627386", "0.0000010"]`))
+		ds.Require().NoError(err)
+		defer expected.Release()
+
+		checkScalarBinary(ds.T(), "divide_unchecked", &compute.ArrayDatum{left.Data()}, &compute.ArrayDatum{right.Data()}, &compute.ArrayDatum{expected.Data()}, nil)
+	})
+
+	ds.Run("array / array decimal256", func() {
+		left, _, err := array.FromJSON(ds.mem, &arrow.Decimal256Type{Precision: 20, Scale: 10},
+			strings.NewReader(`["1234567890.1234567890", "9999999999.9999999999"]`))
+		ds.Require().NoError(err)
+		defer left.Release()
+		right, _, err := array.FromJSON(ds.mem, &arrow.Decimal256Type{Precision: 13, Scale: 3},
+			strings.NewReader(`["1234567890.123", "0.001"]`))
+		ds.Require().NoError(err)
+		defer right.Release()
+		expected, _, err := array.FromJSON(ds.mem, &arrow.Decimal256Type{Precision: 34, Scale: 21},
+			strings.NewReader(`["1.000000000000369999093", "9999999999999.999999900000000000000"]`))
+		ds.Require().NoError(err)
+		defer expected.Release()
+		checkScalarBinary(ds.T(), "divide_unchecked", &compute.ArrayDatum{left.Data()}, &compute.ArrayDatum{right.Data()}, &compute.ArrayDatum{expected.Data()}, nil)
+	})
+
+	ds.Run("scalar / array", func() {
+		left, err := scalar.ParseScalar(&arrow.Decimal128Type{Precision: 1, Scale: 0}, "1")
+		ds.Require().NoError(err)
+		right, _, err := array.FromJSON(ds.mem, &arrow.Decimal128Type{Precision: 1, Scale: 0},
+			strings.NewReader(`["1", "2", "3", "4"]`))
+		ds.Require().NoError(err)
+		defer right.Release()
+		leftDivRight, _, err := array.FromJSON(ds.mem, &arrow.Decimal128Type{Precision: 5, Scale: 4},
+			strings.NewReader(`["1.0000", "0.5000", "0.3333", "0.2500"]`))
+		ds.Require().NoError(err)
+		defer leftDivRight.Release()
+		rightDivLeft, _, err := array.FromJSON(ds.mem, &arrow.Decimal128Type{Precision: 5, Scale: 4},
+			strings.NewReader(`["1.0000", "2.0000", "3.0000", "4.0000"]`))
+		ds.Require().NoError(err)
+		defer rightDivLeft.Release()
+
+		leftDatum, rightDatum := &compute.ScalarDatum{left}, &compute.ArrayDatum{right.Data()}
+
+		checkScalarBinary(ds.T(), "divide_unchecked", leftDatum, rightDatum, &compute.ArrayDatum{leftDivRight.Data()}, nil)
+		checkScalarBinary(ds.T(), "divide_unchecked", rightDatum, leftDatum, &compute.ArrayDatum{rightDivLeft.Data()}, nil)
+	})
+
+	ds.Run("scalar / scalar", func() {
+		left, err := scalar.ParseScalar(&arrow.Decimal256Type{Precision: 6, Scale: 5}, "2.71828")
+		ds.Require().NoError(err)
+		right, err := scalar.ParseScalar(&arrow.Decimal256Type{Precision: 6, Scale: 5}, "3.14159")
+		ds.Require().NoError(err)
+		expected, err := scalar.ParseScalar(&arrow.Decimal256Type{Precision: 13, Scale: 7}, "0.8652561")
+		ds.Require().NoError(err)
+		checkScalarBinary(ds.T(), "divide_unchecked", compute.NewDatum(left), compute.NewDatum(right), compute.NewDatum(expected), nil)
+	})
+
+	ds.Run("decimal128 / decimal256", func() {
+		left, err := scalar.ParseScalar(&arrow.Decimal256Type{Precision: 6, Scale: 5}, "2.71828")
+		ds.Require().NoError(err)
+		right, err := scalar.ParseScalar(&arrow.Decimal128Type{Precision: 6, Scale: 5}, "3.14159")
+		ds.Require().NoError(err)
+		leftDivRight, err := scalar.ParseScalar(&arrow.Decimal256Type{Precision: 13, Scale: 7}, "0.8652561")
+		ds.Require().NoError(err)
+		rightDivLeft, err := scalar.ParseScalar(&arrow.Decimal256Type{Precision: 13, Scale: 7}, "1.1557271")
+		ds.Require().NoError(err)
+		checkScalarBinary(ds.T(), "divide_unchecked", compute.NewDatum(left), compute.NewDatum(right), compute.NewDatum(leftDivRight), nil)
+		checkScalarBinary(ds.T(), "divide_unchecked", compute.NewDatum(right), compute.NewDatum(left), compute.NewDatum(rightDivLeft), nil)
+	})
+
+	ds.Run("decimal / float", func() {
+		left, _ := scalar.ParseScalar(&arrow.Decimal128Type{Precision: 3}, "100")
+		right := scalar.MakeScalar(float64(50))
+		leftDivRight := scalar.MakeScalar(float64(2))
+		rightDivLeft := scalar.MakeScalar(float64(0.5))
+		checkScalarBinary(ds.T(), "divide_unchecked", compute.NewDatum(left), compute.NewDatum(right), compute.NewDatum(leftDivRight), nil)
+		checkScalarBinary(ds.T(), "divide_unchecked", compute.NewDatum(right), compute.NewDatum(left), compute.NewDatum(rightDivLeft), nil)
+	})
+
+	ds.Run("decimal / integer", func() {
+		left, _ := scalar.ParseScalar(&arrow.Decimal128Type{Precision: 3}, "100")
+		right := scalar.MakeScalar(int64(50))
+		leftDivRight, _ := scalar.ParseScalar(&arrow.Decimal128Type{Precision: 23, Scale: 20}, "2.0000000000000000000")
+		rightDivLeft, _ := scalar.ParseScalar(&arrow.Decimal128Type{Precision: 23, Scale: 4}, "0.5000")
+		checkScalarBinary(ds.T(), "divide_unchecked", compute.NewDatum(left), compute.NewDatum(right), compute.NewDatum(leftDivRight), nil)
+		checkScalarBinary(ds.T(), "divide_unchecked", compute.NewDatum(right), compute.NewDatum(left), compute.NewDatum(rightDivLeft), nil)
+	})
+}
+
 type ScalarBinaryTemporalArithmeticSuite struct {
 	BinaryFuncTestSuite
 }
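
The decimal test cases above go through the same public entry points. Below is a minimal sketch, not part of the commit, assuming the same module path as the earlier example; the input values and expected result type are taken from the "decimal128 x decimal256" multiply case above:

    package main

    import (
        "context"
        "fmt"

        "github.com/apache/arrow/go/v11/arrow" // module path / major version assumed
        "github.com/apache/arrow/go/v11/arrow/compute"
        "github.com/apache/arrow/go/v11/arrow/scalar"
    )

    func main() {
        // decimal128(3, 2) * decimal256(3, 1) promotes to decimal256(7, 3) per the test above.
        left, err := scalar.ParseScalar(&arrow.Decimal128Type{Precision: 3, Scale: 2}, "6.66")
        if err != nil {
            panic(err)
        }
        right, err := scalar.ParseScalar(&arrow.Decimal256Type{Precision: 3, Scale: 1}, "88.8")
        if err != nil {
            panic(err)
        }

        res, err := compute.Multiply(context.Background(), compute.ArithmeticOptions{},
            compute.NewDatum(left), compute.NewDatum(right))
        if err != nil {
            panic(err)
        }
        defer res.Release()

        out := res.(*compute.ScalarDatum).Value
        fmt.Println(out.DataType(), out) // expect a decimal256(7, 3) scalar holding 591.408
    }
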
diff --git a/go/arrow/compute/cast_test.go b/go/arrow/compute/cast_test.go
index 116774bfcd..5b6f17e13b 100644
--- a/go/arrow/compute/cast_test.go
+++ b/go/arrow/compute/cast_test.go
@@ -61,11 +61,11 @@ func getDatums[T any](inputs []T) []compute.Datum {
 	return out
 }
 
-func assertArraysEqual(t *testing.T, expected, actual arrow.Array) bool {
-	return assert.Truef(t, array.Equal(expected, actual), "expected: %s\ngot: %s", expected, actual)
+func assertArraysEqual(t *testing.T, expected, actual arrow.Array, opts ...array.EqualOption) bool {
+	return assert.Truef(t, array.ApproxEqual(expected, actual, opts...), "expected: %s\ngot: %s", expected, actual)
 }
 
-func assertDatumsEqual(t *testing.T, expected, actual compute.Datum) {
+func assertDatumsEqual(t *testing.T, expected, actual compute.Datum, opts ...array.EqualOption) {
 	require.Equal(t, expected.Kind(), actual.Kind())
 
 	switch expected.Kind() {
@@ -76,7 +76,7 @@ func assertDatumsEqual(t *testing.T, expected, actual compute.Datum) {
 	case compute.KindArray:
 		want := expected.(*compute.ArrayDatum).MakeArray()
 		got := actual.(*compute.ArrayDatum).MakeArray()
-		assertArraysEqual(t, want, got)
+		assertArraysEqual(t, want, got, opts...)
 		want.Release()
 		got.Release()
 	case compute.KindChunked:
diff --git a/go/arrow/compute/internal/kernels/_lib/base_arithmetic.cc b/go/arrow/compute/internal/kernels/_lib/base_arithmetic.cc
index 7b0093af8a..3a8f6a7e70 100644
--- a/go/arrow/compute/internal/kernels/_lib/base_arithmetic.cc
+++ b/go/arrow/compute/internal/kernels/_lib/base_arithmetic.cc
@@ -29,12 +29,16 @@
 // worth the cost.
 enum class optype : int8_t {
     ADD,
-    SUB, 
+    SUB,
+    MUL,
+    DIV,
 
     // this impl doesn't actually perform any overflow checks as we need
     // to only run overflow checks on non-null entries
     ADD_CHECKED,
-    SUB_CHECKED, 
+    SUB_CHECKED,
+    MUL_CHECKED,
+    DIV_CHECKED,
 };
 
 struct Add {
@@ -42,7 +46,7 @@ struct Add {
     static constexpr T Call(Arg0 left, Arg1 right) {
         if constexpr (is_arithmetic_v<T>)
             return left + right;
-    }    
+    }
 };
 
 struct Sub {
@@ -60,18 +64,65 @@ struct AddChecked {
         if constexpr(is_arithmetic_v<T>) {
             return left + right;
         }
-    }    
+    }
 };
 
 
-struct SubChecked {    
+struct SubChecked {
     template <typename T, typename Arg0, typename Arg1>
     static constexpr T Call(Arg0 left, Arg1 right) {
         static_assert(is_same<T, Arg0>::value && is_same<T, Arg1>::value, "");
-        if constexpr(is_arithmetic_v<T>) {            
+        if constexpr(is_arithmetic_v<T>) {
             return left - right;
         }
-    }    
+    }
+};
+
+template <typename T>
+using maybe_make_unsigned = conditional_t<is_integral_v<T> && !is_same_v<T, bool>, make_unsigned_t<T>, T>;
+
+template <typename T, typename Unsigned = maybe_make_unsigned<T>>
+constexpr Unsigned to_unsigned(T signed_) {
+    return static_cast<Unsigned>(signed_);
+}
+
+struct Multiply {
+    static_assert(is_same_v<decltype(int8_t() * int8_t()), int32_t>, "");
+    static_assert(is_same_v<decltype(uint8_t() * uint8_t()), int32_t>, "");
+    static_assert(is_same_v<decltype(int16_t() * int16_t()), int32_t>, "");
+    static_assert(is_same_v<decltype(uint16_t() * uint16_t()), int32_t>, "");
+    static_assert(is_same_v<decltype(int32_t() * int32_t()), int32_t>, "");
+    static_assert(is_same_v<decltype(uint32_t() * uint32_t()), uint32_t>, "");
+    static_assert(is_same_v<decltype(int64_t() * int64_t()), int64_t>, "");
+    static_assert(is_same_v<decltype(uint64_t() * uint64_t()), uint64_t>, "");
+
+    template <typename T, typename Arg0, typename Arg1>
+    static constexpr T Call(Arg0 left, Arg1 right) {
+        static_assert(is_same_v<T, Arg0> && is_same_v<T, Arg1>, "");
+        if constexpr(is_floating_point_v<T>) {
+            return left * right;
+        } else if constexpr(is_unsigned_v<T> && !is_same_v<T, uint16_t>) {
+            return left * right;
+        } else if constexpr(is_signed_v<T> && !is_same_v<T, int16_t>) {
+            return to_unsigned(left) * to_unsigned(right);
+        } else if constexpr(is_same_v<T, int16_t> || is_same_v<T, uint16_t>) {
+            // multiplication of 16 bit integer types implicitly promotes to
+            // signed 32 bit integer. However, some inputs may overflow (which
+            // triggers undefined behavior). Therefore we first cast to 32 bit
+            // unsigned integers where overflow is well defined.
+            return static_cast<uint32_t>(left) * static_cast<uint32_t>(right);
+        }
+    }
+};
+
+struct MultiplyChecked {
+    template <typename T, typename Arg0, typename Arg1>
+    static constexpr T Call(Arg0 left, Arg1 right) {
+        static_assert(is_same_v<T, Arg0> && is_same_v<T, Arg1>, "");
+        if constexpr(is_arithmetic_v<T>) {
+            return left * right;
+        }
+    }
 };
 
 template <typename T, typename Op>
@@ -80,10 +131,10 @@ struct arithmetic_op_arr_arr_impl {
         const T* left = reinterpret_cast<const T*>(in_left);
         const T* right = reinterpret_cast<const T*>(in_right);
         T* output = reinterpret_cast<T*>(out);
-        
+
         for (int i = 0; i < len; ++i) {
             output[i] = Op::template Call<T, T, T>(left[i], right[i]);
-        }        
+        }
     }
 };
 
@@ -93,10 +144,10 @@ struct arithmetic_op_arr_scalar_impl {
         const T* left = reinterpret_cast<const T*>(in_left);
         const T right = *reinterpret_cast<const T*>(scalar_right);
         T* output = reinterpret_cast<T*>(out);
-        
+
         for (int i = 0; i < len; ++i) {
             output[i] = Op::template Call<T, T, T>(left[i], right);
-        }        
+        }
     }
 };
 
@@ -106,7 +157,7 @@ struct arithmetic_op_scalar_arr_impl {
         const T left = *reinterpret_cast<const T*>(scalar_left);
         const T* right = reinterpret_cast<const T*>(in_right);
         T* output = reinterpret_cast<T*>(out);
-        
+
         for (int i = 0; i < len; ++i) {
             output[i] = Op::template Call<T, T, T>(left, right[i]);
         }
@@ -120,25 +171,25 @@ static inline void arithmetic_op(const int type, const void* in_left, const void
 
     switch (intype) {
     case arrtype::UINT8:
-        return Impl<uint8_t, Op>::exec(in_left, in_right, output, len);        
+        return Impl<uint8_t, Op>::exec(in_left, in_right, output, len);
     case arrtype::INT8:
-        return Impl<int8_t, Op>::exec(in_left, in_right, output, len);        
+        return Impl<int8_t, Op>::exec(in_left, in_right, output, len);
     case arrtype::UINT16:
-        return Impl<uint16_t, Op>::exec(in_left, in_right, output, len);        
+        return Impl<uint16_t, Op>::exec(in_left, in_right, output, len);
     case arrtype::INT16:
-        return Impl<int16_t, Op>::exec(in_left, in_right, output, len);        
+        return Impl<int16_t, Op>::exec(in_left, in_right, output, len);
     case arrtype::UINT32:
-        return Impl<uint32_t, Op>::exec(in_left, in_right, output, len);        
+        return Impl<uint32_t, Op>::exec(in_left, in_right, output, len);
     case arrtype::INT32:
-        return Impl<int32_t, Op>::exec(in_left, in_right, output, len);        
+        return Impl<int32_t, Op>::exec(in_left, in_right, output, len);
     case arrtype::UINT64:
-        return Impl<uint64_t, Op>::exec(in_left, in_right, output, len);        
+        return Impl<uint64_t, Op>::exec(in_left, in_right, output, len);
     case arrtype::INT64:
-        return Impl<int64_t, Op>::exec(in_left, in_right, output, len);        
+        return Impl<int64_t, Op>::exec(in_left, in_right, output, len);
     case arrtype::FLOAT32:
-        return Impl<float, Op>::exec(in_left, in_right, output, len);        
+        return Impl<float, Op>::exec(in_left, in_right, output, len);
     case arrtype::FLOAT64:
-        return Impl<double, Op>::exec(in_left, in_right, output, len);        
+        return Impl<double, Op>::exec(in_left, in_right, output, len);
     default:
         break;
     }
@@ -150,14 +201,20 @@ static inline void arithmetic_impl(const int type, const int8_t op, const void*
 
     switch (opt) {
     case optype::ADD:
-        return arithmetic_op<Add, Impl>(type, in_left, in_right, out, len);        
+        return arithmetic_op<Add, Impl>(type, in_left, in_right, out, len);
     case optype::ADD_CHECKED:
-        return arithmetic_op<AddChecked, Impl>(type, in_left, in_right, out, len);        
+        return arithmetic_op<AddChecked, Impl>(type, in_left, in_right, out, len);
     case optype::SUB:
-        return arithmetic_op<Sub, Impl>(type, in_left, in_right, out, len);        
+        return arithmetic_op<Sub, Impl>(type, in_left, in_right, out, len);
     case optype::SUB_CHECKED:
-        return arithmetic_op<SubChecked, Impl>(type, in_left, in_right, out, len);        
-    default:
+        return arithmetic_op<SubChecked, Impl>(type, in_left, in_right, out, len);
+    case optype::MUL:
+        return arithmetic_op<Multiply, Impl>(type, in_left, in_right, out, len);
+    case optype::MUL_CHECKED:
+        return arithmetic_op<MultiplyChecked, Impl>(type, in_left, in_right, out, len);    
+    default: 
+        // don't implement divide here as we can only divide on non-null entries
+        // so we can avoid dividing by zero
         break;
     }
 }
diff --git a/go/arrow/compute/internal/kernels/_lib/base_arithmetic_avx2_amd64.s b/go/arrow/compute/internal/kernels/_lib/base_arithmetic_avx2_amd64.s
index 76355712b8..54bc7d754f 100644
--- a/go/arrow/compute/internal/kernels/_lib/base_arithmetic_avx2_amd64.s
+++ b/go/arrow/compute/internal/kernels/_lib/base_arithmetic_avx2_amd64.s
@@ -1,7 +1,27 @@
 	.text
 	.intel_syntax noprefix
 	.file	"base_arithmetic.cc"
-	.globl	arithmetic_avx2                 # -- Begin function arithmetic_avx2
+	.section	.rodata.cst32,"aM",@progbits,32
+	.p2align	5                               # -- Begin function arithmetic_avx2
+.LCPI0_0:
+	.short	255                             # 0xff
+	.short	255                             # 0xff
+	.short	255                             # 0xff
+	.short	255                             # 0xff
+	.short	255                             # 0xff
+	.short	255                             # 0xff
+	.short	255                             # 0xff
+	.short	255                             # 0xff
+	.short	255                             # 0xff
+	.short	255                             # 0xff
+	.short	255                             # 0xff
+	.short	255                             # 0xff
+	.short	255                             # 0xff
+	.short	255                             # 0xff
+	.short	255                             # 0xff
+	.short	255                             # 0xff
+	.text
+	.globl	arithmetic_avx2
 	.p2align	4, 0x90
 	.type	arithmetic_avx2,@function
 arithmetic_avx2:                        # @arithmetic_avx2
@@ -9,577 +29,1653 @@ arithmetic_avx2:                        # @arithmetic_avx2
 	push	rbp
 	mov	rbp, rsp
 	and	rsp, -8
-	cmp	sil, 1
-	jg	.LBB0_10
+	cmp	sil, 3
+	jg	.LBB0_11
 # %bb.1:
 	test	sil, sil
-	je	.LBB0_19
+	je	.LBB0_21
 # %bb.2:
 	cmp	sil, 1
-	jne	.LBB0_537
+	je	.LBB0_287
 # %bb.3:
-	cmp	edi, 6
-	jg	.LBB0_291
+	cmp	sil, 2
+	jne	.LBB0_825
 # %bb.4:
+	cmp	edi, 6
+	jg	.LBB0_559
+# %bb.5:
 	cmp	edi, 3
-	jle	.LBB0_5
-# %bb.285:
+	jle	.LBB0_6
+# %bb.553:
 	cmp	edi, 4
-	je	.LBB0_324
-# %bb.286:
+	je	.LBB0_602
+# %bb.554:
 	cmp	edi, 5
-	je	.LBB0_336
-# %bb.287:
+	je	.LBB0_614
+# %bb.555:
 	cmp	edi, 6
-	jne	.LBB0_537
-# %bb.288:
+	jne	.LBB0_825
+# %bb.556:
 	test	r9d, r9d
-	jle	.LBB0_537
-# %bb.289:
+	jle	.LBB0_825
+# %bb.557:
 	mov	r10d, r9d
 	cmp	r9d, 32
-	jae	.LBB0_348
-# %bb.290:
+	jae	.LBB0_626
+# %bb.558:
 	xor	esi, esi
-	jmp	.LBB0_353
-.LBB0_10:
-	cmp	sil, 2
-	je	.LBB0_152
-# %bb.11:
-	cmp	sil, 3
-	jne	.LBB0_537
+.LBB0_631:
+	mov	r9, rsi
+	not	r9
+	add	r9, r10
+	mov	rax, r10
+	and	rax, 3
+	je	.LBB0_633
+.LBB0_632:                              # =>This Inner Loop Header: Depth=1
+	mov	edi, dword ptr [rcx + 4*rsi]
+	imul	edi, dword ptr [rdx + 4*rsi]
+	mov	dword ptr [r8 + 4*rsi], edi
+	add	rsi, 1
+	add	rax, -1
+	jne	.LBB0_632
+.LBB0_633:
+	cmp	r9, 3
+	jb	.LBB0_825
+.LBB0_634:                              # =>This Inner Loop Header: Depth=1
+	mov	eax, dword ptr [rcx + 4*rsi]
+	imul	eax, dword ptr [rdx + 4*rsi]
+	mov	dword ptr [r8 + 4*rsi], eax
+	mov	eax, dword ptr [rcx + 4*rsi + 4]
+	imul	eax, dword ptr [rdx + 4*rsi + 4]
+	mov	dword ptr [r8 + 4*rsi + 4], eax
+	mov	eax, dword ptr [rcx + 4*rsi + 8]
+	imul	eax, dword ptr [rdx + 4*rsi + 8]
+	mov	dword ptr [r8 + 4*rsi + 8], eax
+	mov	eax, dword ptr [rcx + 4*rsi + 12]
+	imul	eax, dword ptr [rdx + 4*rsi + 12]
+	mov	dword ptr [r8 + 4*rsi + 12], eax
+	add	rsi, 4
+	cmp	r10, rsi
+	jne	.LBB0_634
+	jmp	.LBB0_825
+.LBB0_11:
+	cmp	sil, 4
+	je	.LBB0_154
 # %bb.12:
-	cmp	edi, 6
-	jg	.LBB0_417
+	cmp	sil, 5
+	je	.LBB0_420
 # %bb.13:
+	cmp	sil, 6
+	jne	.LBB0_825
+# %bb.14:
+	cmp	edi, 6
+	jg	.LBB0_695
+# %bb.15:
 	cmp	edi, 3
-	jle	.LBB0_14
-# %bb.411:
+	jle	.LBB0_16
+# %bb.689:
 	cmp	edi, 4
-	je	.LBB0_450
-# %bb.412:
+	je	.LBB0_738
+# %bb.690:
 	cmp	edi, 5
-	je	.LBB0_462
-# %bb.413:
+	je	.LBB0_750
+# %bb.691:
 	cmp	edi, 6
-	jne	.LBB0_537
-# %bb.414:
+	jne	.LBB0_825
+# %bb.692:
 	test	r9d, r9d
-	jle	.LBB0_537
-# %bb.415:
+	jle	.LBB0_825
+# %bb.693:
 	mov	r10d, r9d
 	cmp	r9d, 32
-	jae	.LBB0_474
-# %bb.416:
+	jae	.LBB0_762
+# %bb.694:
 	xor	esi, esi
-	jmp	.LBB0_479
-.LBB0_19:
+.LBB0_767:
+	mov	r9, rsi
+	not	r9
+	add	r9, r10
+	mov	rax, r10
+	and	rax, 3
+	je	.LBB0_769
+.LBB0_768:                              # =>This Inner Loop Header: Depth=1
+	mov	edi, dword ptr [rcx + 4*rsi]
+	imul	edi, dword ptr [rdx + 4*rsi]
+	mov	dword ptr [r8 + 4*rsi], edi
+	add	rsi, 1
+	add	rax, -1
+	jne	.LBB0_768
+.LBB0_769:
+	cmp	r9, 3
+	jb	.LBB0_825
+.LBB0_770:                              # =>This Inner Loop Header: Depth=1
+	mov	eax, dword ptr [rcx + 4*rsi]
+	imul	eax, dword ptr [rdx + 4*rsi]
+	mov	dword ptr [r8 + 4*rsi], eax
+	mov	eax, dword ptr [rcx + 4*rsi + 4]
+	imul	eax, dword ptr [rdx + 4*rsi + 4]
+	mov	dword ptr [r8 + 4*rsi + 4], eax
+	mov	eax, dword ptr [rcx + 4*rsi + 8]
+	imul	eax, dword ptr [rdx + 4*rsi + 8]
+	mov	dword ptr [r8 + 4*rsi + 8], eax
+	mov	eax, dword ptr [rcx + 4*rsi + 12]
+	imul	eax, dword ptr [rdx + 4*rsi + 12]
+	mov	dword ptr [r8 + 4*rsi + 12], eax
+	add	rsi, 4
+	cmp	r10, rsi
+	jne	.LBB0_770
+	jmp	.LBB0_825
+.LBB0_21:
 	cmp	edi, 6
-	jg	.LBB0_32
-# %bb.20:
+	jg	.LBB0_34
+# %bb.22:
 	cmp	edi, 3
-	jle	.LBB0_21
-# %bb.26:
+	jle	.LBB0_23
+# %bb.28:
 	cmp	edi, 4
-	je	.LBB0_65
-# %bb.27:
+	je	.LBB0_67
+# %bb.29:
 	cmp	edi, 5
-	je	.LBB0_77
-# %bb.28:
+	je	.LBB0_79
+# %bb.30:
 	cmp	edi, 6
-	jne	.LBB0_537
-# %bb.29:
+	jne	.LBB0_825
+# %bb.31:
 	test	r9d, r9d
-	jle	.LBB0_537
-# %bb.30:
+	jle	.LBB0_825
+# %bb.32:
 	mov	r10d, r9d
 	cmp	r9d, 32
-	jae	.LBB0_89
-# %bb.31:
+	jae	.LBB0_91
+# %bb.33:
 	xor	esi, esi
-	jmp	.LBB0_94
-.LBB0_152:
+	jmp	.LBB0_96
+.LBB0_287:
 	cmp	edi, 6
-	jg	.LBB0_165
-# %bb.153:
+	jg	.LBB0_300
+# %bb.288:
 	cmp	edi, 3
-	jle	.LBB0_154
-# %bb.159:
+	jle	.LBB0_289
+# %bb.294:
 	cmp	edi, 4
-	je	.LBB0_198
-# %bb.160:
+	je	.LBB0_333
+# %bb.295:
 	cmp	edi, 5
-	je	.LBB0_210
-# %bb.161:
+	je	.LBB0_345
+# %bb.296:
 	cmp	edi, 6
-	jne	.LBB0_537
-# %bb.162:
+	jne	.LBB0_825
+# %bb.297:
 	test	r9d, r9d
-	jle	.LBB0_537
-# %bb.163:
+	jle	.LBB0_825
+# %bb.298:
 	mov	r10d, r9d
 	cmp	r9d, 32
-	jae	.LBB0_222
+	jae	.LBB0_357
+# %bb.299:
+	xor	esi, esi
+.LBB0_362:
+	mov	r9, rsi
+	not	r9
+	add	r9, r10
+	mov	rax, r10
+	and	rax, 3
+	je	.LBB0_364
+.LBB0_363:                              # =>This Inner Loop Header: Depth=1
+	mov	edi, dword ptr [rdx + 4*rsi]
+	sub	edi, dword ptr [rcx + 4*rsi]
+	mov	dword ptr [r8 + 4*rsi], edi
+	add	rsi, 1
+	add	rax, -1
+	jne	.LBB0_363
+.LBB0_364:
+	cmp	r9, 3
+	jb	.LBB0_825
+.LBB0_365:                              # =>This Inner Loop Header: Depth=1
+	mov	eax, dword ptr [rdx + 4*rsi]
+	sub	eax, dword ptr [rcx + 4*rsi]
+	mov	dword ptr [r8 + 4*rsi], eax
+	mov	eax, dword ptr [rdx + 4*rsi + 4]
+	sub	eax, dword ptr [rcx + 4*rsi + 4]
+	mov	dword ptr [r8 + 4*rsi + 4], eax
+	mov	eax, dword ptr [rdx + 4*rsi + 8]
+	sub	eax, dword ptr [rcx + 4*rsi + 8]
+	mov	dword ptr [r8 + 4*rsi + 8], eax
+	mov	eax, dword ptr [rdx + 4*rsi + 12]
+	sub	eax, dword ptr [rcx + 4*rsi + 12]
+	mov	dword ptr [r8 + 4*rsi + 12], eax
+	add	rsi, 4
+	cmp	r10, rsi
+	jne	.LBB0_365
+	jmp	.LBB0_825
+.LBB0_154:
+	cmp	edi, 6
+	jg	.LBB0_167
+# %bb.155:
+	cmp	edi, 3
+	jle	.LBB0_156
+# %bb.161:
+	cmp	edi, 4
+	je	.LBB0_200
+# %bb.162:
+	cmp	edi, 5
+	je	.LBB0_212
+# %bb.163:
+	cmp	edi, 6
+	jne	.LBB0_825
 # %bb.164:
+	test	r9d, r9d
+	jle	.LBB0_825
+# %bb.165:
+	mov	r10d, r9d
+	cmp	r9d, 32
+	jae	.LBB0_224
+# %bb.166:
+	xor	esi, esi
+	jmp	.LBB0_229
+.LBB0_420:
+	cmp	edi, 6
+	jg	.LBB0_433
+# %bb.421:
+	cmp	edi, 3
+	jle	.LBB0_422
+# %bb.427:
+	cmp	edi, 4
+	je	.LBB0_466
+# %bb.428:
+	cmp	edi, 5
+	je	.LBB0_478
+# %bb.429:
+	cmp	edi, 6
+	jne	.LBB0_825
+# %bb.430:
+	test	r9d, r9d
+	jle	.LBB0_825
+# %bb.431:
+	mov	r10d, r9d
+	cmp	r9d, 32
+	jae	.LBB0_490
+# %bb.432:
 	xor	esi, esi
-	jmp	.LBB0_227
-.LBB0_291:
+.LBB0_495:
+	mov	r9, rsi
+	not	r9
+	add	r9, r10
+	mov	rax, r10
+	and	rax, 3
+	je	.LBB0_497
+.LBB0_496:                              # =>This Inner Loop Header: Depth=1
+	mov	edi, dword ptr [rdx + 4*rsi]
+	sub	edi, dword ptr [rcx + 4*rsi]
+	mov	dword ptr [r8 + 4*rsi], edi
+	add	rsi, 1
+	add	rax, -1
+	jne	.LBB0_496
+.LBB0_497:
+	cmp	r9, 3
+	jb	.LBB0_825
+.LBB0_498:                              # =>This Inner Loop Header: Depth=1
+	mov	eax, dword ptr [rdx + 4*rsi]
+	sub	eax, dword ptr [rcx + 4*rsi]
+	mov	dword ptr [r8 + 4*rsi], eax
+	mov	eax, dword ptr [rdx + 4*rsi + 4]
+	sub	eax, dword ptr [rcx + 4*rsi + 4]
+	mov	dword ptr [r8 + 4*rsi + 4], eax
+	mov	eax, dword ptr [rdx + 4*rsi + 8]
+	sub	eax, dword ptr [rcx + 4*rsi + 8]
+	mov	dword ptr [r8 + 4*rsi + 8], eax
+	mov	eax, dword ptr [rdx + 4*rsi + 12]
+	sub	eax, dword ptr [rcx + 4*rsi + 12]
+	mov	dword ptr [r8 + 4*rsi + 12], eax
+	add	rsi, 4
+	cmp	r10, rsi
+	jne	.LBB0_498
+	jmp	.LBB0_825
+.LBB0_559:
 	cmp	edi, 8
-	jle	.LBB0_292
-# %bb.297:
+	jle	.LBB0_560
+# %bb.565:
 	cmp	edi, 9
-	je	.LBB0_378
-# %bb.298:
+	je	.LBB0_656
+# %bb.566:
 	cmp	edi, 11
-	je	.LBB0_390
-# %bb.299:
+	je	.LBB0_668
+# %bb.567:
 	cmp	edi, 12
-	jne	.LBB0_537
-# %bb.300:
+	jne	.LBB0_825
+# %bb.568:
 	test	r9d, r9d
-	jle	.LBB0_537
-# %bb.301:
+	jle	.LBB0_825
+# %bb.569:
 	mov	r10d, r9d
 	cmp	r9d, 16
-	jae	.LBB0_402
-# %bb.302:
+	jae	.LBB0_680
+# %bb.570:
 	xor	esi, esi
-	jmp	.LBB0_407
-.LBB0_417:
+.LBB0_685:
+	mov	rdi, rsi
+	not	rdi
+	add	rdi, r10
+	mov	rax, r10
+	and	rax, 3
+	je	.LBB0_687
+.LBB0_686:                              # =>This Inner Loop Header: Depth=1
+	vmovsd	xmm0, qword ptr [rcx + 8*rsi]   # xmm0 = mem[0],zero
+	vmulsd	xmm0, xmm0, qword ptr [rdx + 8*rsi]
+	vmovsd	qword ptr [r8 + 8*rsi], xmm0
+	add	rsi, 1
+	add	rax, -1
+	jne	.LBB0_686
+.LBB0_687:
+	cmp	rdi, 3
+	jb	.LBB0_825
+.LBB0_688:                              # =>This Inner Loop Header: Depth=1
+	vmovsd	xmm0, qword ptr [rcx + 8*rsi]   # xmm0 = mem[0],zero
+	vmulsd	xmm0, xmm0, qword ptr [rdx + 8*rsi]
+	vmovsd	qword ptr [r8 + 8*rsi], xmm0
+	vmovsd	xmm0, qword ptr [rcx + 8*rsi + 8] # xmm0 = mem[0],zero
+	vmulsd	xmm0, xmm0, qword ptr [rdx + 8*rsi + 8]
+	vmovsd	qword ptr [r8 + 8*rsi + 8], xmm0
+	vmovsd	xmm0, qword ptr [rcx + 8*rsi + 16] # xmm0 = mem[0],zero
+	vmulsd	xmm0, xmm0, qword ptr [rdx + 8*rsi + 16]
+	vmovsd	qword ptr [r8 + 8*rsi + 16], xmm0
+	vmovsd	xmm0, qword ptr [rcx + 8*rsi + 24] # xmm0 = mem[0],zero
+	vmulsd	xmm0, xmm0, qword ptr [rdx + 8*rsi + 24]
+	vmovsd	qword ptr [r8 + 8*rsi + 24], xmm0
+	add	rsi, 4
+	cmp	r10, rsi
+	jne	.LBB0_688
+	jmp	.LBB0_825
+.LBB0_695:
 	cmp	edi, 8
-	jle	.LBB0_418
-# %bb.423:
+	jle	.LBB0_696
+# %bb.701:
 	cmp	edi, 9
-	je	.LBB0_504
-# %bb.424:
+	je	.LBB0_792
+# %bb.702:
 	cmp	edi, 11
-	je	.LBB0_516
-# %bb.425:
+	je	.LBB0_804
+# %bb.703:
 	cmp	edi, 12
-	jne	.LBB0_537
-# %bb.426:
+	jne	.LBB0_825
+# %bb.704:
 	test	r9d, r9d
-	jle	.LBB0_537
-# %bb.427:
+	jle	.LBB0_825
+# %bb.705:
 	mov	r10d, r9d
 	cmp	r9d, 16
-	jae	.LBB0_528
-# %bb.428:
+	jae	.LBB0_816
+# %bb.706:
 	xor	esi, esi
-	jmp	.LBB0_533
-.LBB0_32:
+.LBB0_821:
+	mov	rdi, rsi
+	not	rdi
+	add	rdi, r10
+	mov	rax, r10
+	and	rax, 3
+	je	.LBB0_823
+.LBB0_822:                              # =>This Inner Loop Header: Depth=1
+	vmovsd	xmm0, qword ptr [rcx + 8*rsi]   # xmm0 = mem[0],zero
+	vmulsd	xmm0, xmm0, qword ptr [rdx + 8*rsi]
+	vmovsd	qword ptr [r8 + 8*rsi], xmm0
+	add	rsi, 1
+	add	rax, -1
+	jne	.LBB0_822
+.LBB0_823:
+	cmp	rdi, 3
+	jb	.LBB0_825
+.LBB0_824:                              # =>This Inner Loop Header: Depth=1
+	vmovsd	xmm0, qword ptr [rcx + 8*rsi]   # xmm0 = mem[0],zero
+	vmulsd	xmm0, xmm0, qword ptr [rdx + 8*rsi]
+	vmovsd	qword ptr [r8 + 8*rsi], xmm0
+	vmovsd	xmm0, qword ptr [rcx + 8*rsi + 8] # xmm0 = mem[0],zero
+	vmulsd	xmm0, xmm0, qword ptr [rdx + 8*rsi + 8]
+	vmovsd	qword ptr [r8 + 8*rsi + 8], xmm0
+	vmovsd	xmm0, qword ptr [rcx + 8*rsi + 16] # xmm0 = mem[0],zero
+	vmulsd	xmm0, xmm0, qword ptr [rdx + 8*rsi + 16]
+	vmovsd	qword ptr [r8 + 8*rsi + 16], xmm0
+	vmovsd	xmm0, qword ptr [rcx + 8*rsi + 24] # xmm0 = mem[0],zero
+	vmulsd	xmm0, xmm0, qword ptr [rdx + 8*rsi + 24]
+	vmovsd	qword ptr [r8 + 8*rsi + 24], xmm0
+	add	rsi, 4
+	cmp	r10, rsi
+	jne	.LBB0_824
+	jmp	.LBB0_825
+.LBB0_34:
 	cmp	edi, 8
-	jle	.LBB0_33
-# %bb.38:
+	jle	.LBB0_35
+# %bb.40:
 	cmp	edi, 9
-	je	.LBB0_119
-# %bb.39:
+	je	.LBB0_121
+# %bb.41:
 	cmp	edi, 11
-	je	.LBB0_131
-# %bb.40:
+	je	.LBB0_133
+# %bb.42:
 	cmp	edi, 12
-	jne	.LBB0_537
-# %bb.41:
+	jne	.LBB0_825
+# %bb.43:
 	test	r9d, r9d
-	jle	.LBB0_537
-# %bb.42:
+	jle	.LBB0_825
+# %bb.44:
 	mov	r10d, r9d
 	cmp	r9d, 16
-	jae	.LBB0_143
-# %bb.43:
+	jae	.LBB0_145
+# %bb.45:
 	xor	esi, esi
-	jmp	.LBB0_148
-.LBB0_165:
+	jmp	.LBB0_150
+.LBB0_300:
 	cmp	edi, 8
-	jle	.LBB0_166
-# %bb.171:
+	jle	.LBB0_301
+# %bb.306:
 	cmp	edi, 9
-	je	.LBB0_252
-# %bb.172:
+	je	.LBB0_387
+# %bb.307:
 	cmp	edi, 11
-	je	.LBB0_264
-# %bb.173:
+	je	.LBB0_399
+# %bb.308:
 	cmp	edi, 12
-	jne	.LBB0_537
-# %bb.174:
+	jne	.LBB0_825
+# %bb.309:
 	test	r9d, r9d
-	jle	.LBB0_537
-# %bb.175:
+	jle	.LBB0_825
+# %bb.310:
 	mov	r10d, r9d
 	cmp	r9d, 16
-	jae	.LBB0_276
+	jae	.LBB0_411
+# %bb.311:
+	xor	esi, esi
+.LBB0_416:
+	mov	rdi, rsi
+	not	rdi
+	add	rdi, r10
+	mov	rax, r10
+	and	rax, 3
+	je	.LBB0_418
+.LBB0_417:                              # =>This Inner Loop Header: Depth=1
+	vmovsd	xmm0, qword ptr [rdx + 8*rsi]   # xmm0 = mem[0],zero
+	vsubsd	xmm0, xmm0, qword ptr [rcx + 8*rsi]
+	vmovsd	qword ptr [r8 + 8*rsi], xmm0
+	add	rsi, 1
+	add	rax, -1
+	jne	.LBB0_417
+.LBB0_418:
+	cmp	rdi, 3
+	jb	.LBB0_825
+.LBB0_419:                              # =>This Inner Loop Header: Depth=1
+	vmovsd	xmm0, qword ptr [rdx + 8*rsi]   # xmm0 = mem[0],zero
+	vsubsd	xmm0, xmm0, qword ptr [rcx + 8*rsi]
+	vmovsd	qword ptr [r8 + 8*rsi], xmm0
+	vmovsd	xmm0, qword ptr [rdx + 8*rsi + 8] # xmm0 = mem[0],zero
+	vsubsd	xmm0, xmm0, qword ptr [rcx + 8*rsi + 8]
+	vmovsd	qword ptr [r8 + 8*rsi + 8], xmm0
+	vmovsd	xmm0, qword ptr [rdx + 8*rsi + 16] # xmm0 = mem[0],zero
+	vsubsd	xmm0, xmm0, qword ptr [rcx + 8*rsi + 16]
+	vmovsd	qword ptr [r8 + 8*rsi + 16], xmm0
+	vmovsd	xmm0, qword ptr [rdx + 8*rsi + 24] # xmm0 = mem[0],zero
+	vsubsd	xmm0, xmm0, qword ptr [rcx + 8*rsi + 24]
+	vmovsd	qword ptr [r8 + 8*rsi + 24], xmm0
+	add	rsi, 4
+	cmp	r10, rsi
+	jne	.LBB0_419
+	jmp	.LBB0_825
+.LBB0_167:
+	cmp	edi, 8
+	jle	.LBB0_168
+# %bb.173:
+	cmp	edi, 9
+	je	.LBB0_254
+# %bb.174:
+	cmp	edi, 11
+	je	.LBB0_266
+# %bb.175:
+	cmp	edi, 12
+	jne	.LBB0_825
 # %bb.176:
+	test	r9d, r9d
+	jle	.LBB0_825
+# %bb.177:
+	mov	r10d, r9d
+	cmp	r9d, 16
+	jae	.LBB0_278
+# %bb.178:
+	xor	esi, esi
+	jmp	.LBB0_283
+.LBB0_433:
+	cmp	edi, 8
+	jle	.LBB0_434
+# %bb.439:
+	cmp	edi, 9
+	je	.LBB0_520
+# %bb.440:
+	cmp	edi, 11
+	je	.LBB0_532
+# %bb.441:
+	cmp	edi, 12
+	jne	.LBB0_825
+# %bb.442:
+	test	r9d, r9d
+	jle	.LBB0_825
+# %bb.443:
+	mov	r10d, r9d
+	cmp	r9d, 16
+	jae	.LBB0_544
+# %bb.444:
 	xor	esi, esi
-	jmp	.LBB0_281
-.LBB0_5:
+.LBB0_549:
+	mov	rdi, rsi
+	not	rdi
+	add	rdi, r10
+	mov	rax, r10
+	and	rax, 3
+	je	.LBB0_551
+.LBB0_550:                              # =>This Inner Loop Header: Depth=1
+	vmovsd	xmm0, qword ptr [rdx + 8*rsi]   # xmm0 = mem[0],zero
+	vsubsd	xmm0, xmm0, qword ptr [rcx + 8*rsi]
+	vmovsd	qword ptr [r8 + 8*rsi], xmm0
+	add	rsi, 1
+	add	rax, -1
+	jne	.LBB0_550
+.LBB0_551:
+	cmp	rdi, 3
+	jb	.LBB0_825
+.LBB0_552:                              # =>This Inner Loop Header: Depth=1
+	vmovsd	xmm0, qword ptr [rdx + 8*rsi]   # xmm0 = mem[0],zero
+	vsubsd	xmm0, xmm0, qword ptr [rcx + 8*rsi]
+	vmovsd	qword ptr [r8 + 8*rsi], xmm0
+	vmovsd	xmm0, qword ptr [rdx + 8*rsi + 8] # xmm0 = mem[0],zero
+	vsubsd	xmm0, xmm0, qword ptr [rcx + 8*rsi + 8]
+	vmovsd	qword ptr [r8 + 8*rsi + 8], xmm0
+	vmovsd	xmm0, qword ptr [rdx + 8*rsi + 16] # xmm0 = mem[0],zero
+	vsubsd	xmm0, xmm0, qword ptr [rcx + 8*rsi + 16]
+	vmovsd	qword ptr [r8 + 8*rsi + 16], xmm0
+	vmovsd	xmm0, qword ptr [rdx + 8*rsi + 24] # xmm0 = mem[0],zero
+	vsubsd	xmm0, xmm0, qword ptr [rcx + 8*rsi + 24]
+	vmovsd	qword ptr [r8 + 8*rsi + 24], xmm0
+	add	rsi, 4
+	cmp	r10, rsi
+	jne	.LBB0_552
+	jmp	.LBB0_825
+.LBB0_6:
 	cmp	edi, 2
-	je	.LBB0_303
-# %bb.6:
-	cmp	edi, 3
-	jne	.LBB0_537
+	je	.LBB0_571
 # %bb.7:
-	test	r9d, r9d
-	jle	.LBB0_537
+	cmp	edi, 3
+	jne	.LBB0_825
 # %bb.8:
+	test	r9d, r9d
+	jle	.LBB0_825
+# %bb.9:
+	mov	r10d, r9d
+	cmp	r9d, 32
+	jae	.LBB0_588
+# %bb.10:
+	xor	edi, edi
+	jmp	.LBB0_598
+.LBB0_16:
+	cmp	edi, 2
+	je	.LBB0_707
+# %bb.17:
+	cmp	edi, 3
+	jne	.LBB0_825
+# %bb.18:
+	test	r9d, r9d
+	jle	.LBB0_825
+# %bb.19:
+	mov	r10d, r9d
+	cmp	r9d, 32
+	jae	.LBB0_724
+# %bb.20:
+	xor	edi, edi
+	jmp	.LBB0_734
+.LBB0_23:
+	cmp	edi, 2
+	je	.LBB0_46
+# %bb.24:
+	cmp	edi, 3
+	jne	.LBB0_825
+# %bb.25:
+	test	r9d, r9d
+	jle	.LBB0_825
+# %bb.26:
 	mov	r10d, r9d
 	cmp	r9d, 128
-	jae	.LBB0_315
-# %bb.9:
+	jae	.LBB0_58
+# %bb.27:
 	xor	esi, esi
-	jmp	.LBB0_320
-.LBB0_14:
+	jmp	.LBB0_63
+.LBB0_289:
 	cmp	edi, 2
-	je	.LBB0_429
-# %bb.15:
+	je	.LBB0_312
+# %bb.290:
 	cmp	edi, 3
-	jne	.LBB0_537
-# %bb.16:
+	jne	.LBB0_825
+# %bb.291:
 	test	r9d, r9d
-	jle	.LBB0_537
-# %bb.17:
+	jle	.LBB0_825
+# %bb.292:
 	mov	r10d, r9d
 	cmp	r9d, 128
-	jae	.LBB0_441
-# %bb.18:
+	jae	.LBB0_324
+# %bb.293:
 	xor	esi, esi
-	jmp	.LBB0_446
-.LBB0_21:
+	jmp	.LBB0_329
+.LBB0_156:
 	cmp	edi, 2
-	je	.LBB0_44
-# %bb.22:
+	je	.LBB0_179
+# %bb.157:
 	cmp	edi, 3
-	jne	.LBB0_537
-# %bb.23:
+	jne	.LBB0_825
+# %bb.158:
 	test	r9d, r9d
-	jle	.LBB0_537
-# %bb.24:
+	jle	.LBB0_825
+# %bb.159:
 	mov	r10d, r9d
 	cmp	r9d, 128
-	jae	.LBB0_56
-# %bb.25:
+	jae	.LBB0_191
+# %bb.160:
 	xor	esi, esi
-	jmp	.LBB0_61
-.LBB0_154:
+	jmp	.LBB0_196
+.LBB0_422:
 	cmp	edi, 2
-	je	.LBB0_177
-# %bb.155:
+	je	.LBB0_445
+# %bb.423:
 	cmp	edi, 3
-	jne	.LBB0_537
-# %bb.156:
+	jne	.LBB0_825
+# %bb.424:
 	test	r9d, r9d
-	jle	.LBB0_537
-# %bb.157:
+	jle	.LBB0_825
+# %bb.425:
 	mov	r10d, r9d
 	cmp	r9d, 128
-	jae	.LBB0_189
-# %bb.158:
+	jae	.LBB0_457
+# %bb.426:
 	xor	esi, esi
-	jmp	.LBB0_194
-.LBB0_292:
+	jmp	.LBB0_462
+.LBB0_560:
 	cmp	edi, 7
-	je	.LBB0_357
-# %bb.293:
+	je	.LBB0_635
+# %bb.561:
 	cmp	edi, 8
-	jne	.LBB0_537
-# %bb.294:
+	jne	.LBB0_825
+# %bb.562:
 	test	r9d, r9d
-	jle	.LBB0_537
-# %bb.295:
+	jle	.LBB0_825
+# %bb.563:
 	mov	r10d, r9d
 	cmp	r9d, 16
-	jae	.LBB0_369
-# %bb.296:
+	jae	.LBB0_647
+# %bb.564:
 	xor	esi, esi
-	jmp	.LBB0_374
-.LBB0_418:
+	jmp	.LBB0_652
+.LBB0_696:
 	cmp	edi, 7
-	je	.LBB0_483
-# %bb.419:
+	je	.LBB0_771
+# %bb.697:
 	cmp	edi, 8
-	jne	.LBB0_537
-# %bb.420:
+	jne	.LBB0_825
+# %bb.698:
 	test	r9d, r9d
-	jle	.LBB0_537
-# %bb.421:
+	jle	.LBB0_825
+# %bb.699:
 	mov	r10d, r9d
 	cmp	r9d, 16
-	jae	.LBB0_495
-# %bb.422:
+	jae	.LBB0_783
+# %bb.700:
 	xor	esi, esi
-	jmp	.LBB0_500
-.LBB0_33:
+	jmp	.LBB0_788
+.LBB0_35:
 	cmp	edi, 7
-	je	.LBB0_98
-# %bb.34:
+	je	.LBB0_100
+# %bb.36:
 	cmp	edi, 8
-	jne	.LBB0_537
-# %bb.35:
+	jne	.LBB0_825
+# %bb.37:
 	test	r9d, r9d
-	jle	.LBB0_537
-# %bb.36:
+	jle	.LBB0_825
+# %bb.38:
 	mov	r10d, r9d
 	cmp	r9d, 16
-	jae	.LBB0_110
-# %bb.37:
+	jae	.LBB0_112
+# %bb.39:
 	xor	esi, esi
-	jmp	.LBB0_115
-.LBB0_166:
+	jmp	.LBB0_117
+.LBB0_301:
 	cmp	edi, 7
-	je	.LBB0_231
-# %bb.167:
+	je	.LBB0_366
+# %bb.302:
 	cmp	edi, 8
-	jne	.LBB0_537
-# %bb.168:
+	jne	.LBB0_825
+# %bb.303:
 	test	r9d, r9d
-	jle	.LBB0_537
-# %bb.169:
+	jle	.LBB0_825
+# %bb.304:
 	mov	r10d, r9d
 	cmp	r9d, 16
-	jae	.LBB0_243
-# %bb.170:
-	xor	esi, esi
-	jmp	.LBB0_248
-.LBB0_324:
-	test	r9d, r9d
-	jle	.LBB0_537
-# %bb.325:
-	mov	r10d, r9d
-	cmp	r9d, 64
-	jae	.LBB0_327
-# %bb.326:
+	jae	.LBB0_378
+# %bb.305:
 	xor	esi, esi
-	jmp	.LBB0_332
-.LBB0_336:
+	jmp	.LBB0_383
+.LBB0_168:
+	cmp	edi, 7
+	je	.LBB0_233
+# %bb.169:
+	cmp	edi, 8
+	jne	.LBB0_825
+# %bb.170:
 	test	r9d, r9d
-	jle	.LBB0_537
-# %bb.337:
+	jle	.LBB0_825
+# %bb.171:
 	mov	r10d, r9d
-	cmp	r9d, 64
-	jae	.LBB0_339
-# %bb.338:
+	cmp	r9d, 16
+	jae	.LBB0_245
+# %bb.172:
 	xor	esi, esi
-	jmp	.LBB0_344
-.LBB0_450:
+	jmp	.LBB0_250
+.LBB0_434:
+	cmp	edi, 7
+	je	.LBB0_499
+# %bb.435:
+	cmp	edi, 8
+	jne	.LBB0_825
+# %bb.436:
 	test	r9d, r9d
-	jle	.LBB0_537
-# %bb.451:
+	jle	.LBB0_825
+# %bb.437:
 	mov	r10d, r9d
-	cmp	r9d, 64
-	jae	.LBB0_453
-# %bb.452:
+	cmp	r9d, 16
+	jae	.LBB0_511
+# %bb.438:
 	xor	esi, esi
-	jmp	.LBB0_458
-.LBB0_462:
+	jmp	.LBB0_516
+.LBB0_602:
 	test	r9d, r9d
-	jle	.LBB0_537
-# %bb.463:
+	jle	.LBB0_825
+# %bb.603:
 	mov	r10d, r9d
 	cmp	r9d, 64
-	jae	.LBB0_465
-# %bb.464:
+	jae	.LBB0_605
+# %bb.604:
 	xor	esi, esi
-	jmp	.LBB0_470
-.LBB0_65:
+.LBB0_610:
+	mov	r9, rsi
+	not	r9
+	add	r9, r10
+	mov	rax, r10
+	and	rax, 3
+	je	.LBB0_612
+.LBB0_611:                              # =>This Inner Loop Header: Depth=1
+	movzx	edi, word ptr [rcx + 2*rsi]
+	imul	di, word ptr [rdx + 2*rsi]
+	mov	word ptr [r8 + 2*rsi], di
+	add	rsi, 1
+	add	rax, -1
+	jne	.LBB0_611
+.LBB0_612:
+	cmp	r9, 3
+	jb	.LBB0_825
+.LBB0_613:                              # =>This Inner Loop Header: Depth=1
+	movzx	eax, word ptr [rcx + 2*rsi]
+	imul	ax, word ptr [rdx + 2*rsi]
+	mov	word ptr [r8 + 2*rsi], ax
+	movzx	eax, word ptr [rcx + 2*rsi + 2]
+	imul	ax, word ptr [rdx + 2*rsi + 2]
+	mov	word ptr [r8 + 2*rsi + 2], ax
+	movzx	eax, word ptr [rcx + 2*rsi + 4]
+	imul	ax, word ptr [rdx + 2*rsi + 4]
+	mov	word ptr [r8 + 2*rsi + 4], ax
+	movzx	eax, word ptr [rcx + 2*rsi + 6]
+	imul	ax, word ptr [rdx + 2*rsi + 6]
+	mov	word ptr [r8 + 2*rsi + 6], ax
+	add	rsi, 4
+	cmp	r10, rsi
+	jne	.LBB0_613
+	jmp	.LBB0_825
+.LBB0_614:
 	test	r9d, r9d
-	jle	.LBB0_537
-# %bb.66:
+	jle	.LBB0_825
+# %bb.615:
 	mov	r10d, r9d
 	cmp	r9d, 64
-	jae	.LBB0_68
-# %bb.67:
+	jae	.LBB0_617
+# %bb.616:
 	xor	esi, esi
-	jmp	.LBB0_73
-.LBB0_77:
+.LBB0_622:
+	mov	r9, rsi
+	not	r9
+	add	r9, r10
+	mov	rax, r10
+	and	rax, 3
+	je	.LBB0_624
+.LBB0_623:                              # =>This Inner Loop Header: Depth=1
+	movzx	edi, word ptr [rcx + 2*rsi]
+	imul	di, word ptr [rdx + 2*rsi]
+	mov	word ptr [r8 + 2*rsi], di
+	add	rsi, 1
+	add	rax, -1
+	jne	.LBB0_623
+.LBB0_624:
+	cmp	r9, 3
+	jb	.LBB0_825
+.LBB0_625:                              # =>This Inner Loop Header: Depth=1
+	movzx	eax, word ptr [rcx + 2*rsi]
+	imul	ax, word ptr [rdx + 2*rsi]
+	mov	word ptr [r8 + 2*rsi], ax
+	movzx	eax, word ptr [rcx + 2*rsi + 2]
+	imul	ax, word ptr [rdx + 2*rsi + 2]
+	mov	word ptr [r8 + 2*rsi + 2], ax
+	movzx	eax, word ptr [rcx + 2*rsi + 4]
+	imul	ax, word ptr [rdx + 2*rsi + 4]
+	mov	word ptr [r8 + 2*rsi + 4], ax
+	movzx	eax, word ptr [rcx + 2*rsi + 6]
+	imul	ax, word ptr [rdx + 2*rsi + 6]
+	mov	word ptr [r8 + 2*rsi + 6], ax
+	add	rsi, 4
+	cmp	r10, rsi
+	jne	.LBB0_625
+	jmp	.LBB0_825
+.LBB0_738:
 	test	r9d, r9d
-	jle	.LBB0_537
-# %bb.78:
+	jle	.LBB0_825
+# %bb.739:
 	mov	r10d, r9d
 	cmp	r9d, 64
-	jae	.LBB0_80
-# %bb.79:
+	jae	.LBB0_741
+# %bb.740:
 	xor	esi, esi
-	jmp	.LBB0_85
-.LBB0_198:
+.LBB0_746:
+	mov	r9, rsi
+	not	r9
+	add	r9, r10
+	mov	rax, r10
+	and	rax, 3
+	je	.LBB0_748
+.LBB0_747:                              # =>This Inner Loop Header: Depth=1
+	movzx	edi, word ptr [rcx + 2*rsi]
+	imul	di, word ptr [rdx + 2*rsi]
+	mov	word ptr [r8 + 2*rsi], di
+	add	rsi, 1
+	add	rax, -1
+	jne	.LBB0_747
+.LBB0_748:
+	cmp	r9, 3
+	jb	.LBB0_825
+.LBB0_749:                              # =>This Inner Loop Header: Depth=1
+	movzx	eax, word ptr [rcx + 2*rsi]
+	imul	ax, word ptr [rdx + 2*rsi]
+	mov	word ptr [r8 + 2*rsi], ax
+	movzx	eax, word ptr [rcx + 2*rsi + 2]
+	imul	ax, word ptr [rdx + 2*rsi + 2]
+	mov	word ptr [r8 + 2*rsi + 2], ax
+	movzx	eax, word ptr [rcx + 2*rsi + 4]
+	imul	ax, word ptr [rdx + 2*rsi + 4]
+	mov	word ptr [r8 + 2*rsi + 4], ax
+	movzx	eax, word ptr [rcx + 2*rsi + 6]
+	imul	ax, word ptr [rdx + 2*rsi + 6]
+	mov	word ptr [r8 + 2*rsi + 6], ax
+	add	rsi, 4
+	cmp	r10, rsi
+	jne	.LBB0_749
+	jmp	.LBB0_825
+.LBB0_750:
 	test	r9d, r9d
-	jle	.LBB0_537
-# %bb.199:
+	jle	.LBB0_825
+# %bb.751:
 	mov	r10d, r9d
 	cmp	r9d, 64
-	jae	.LBB0_201
-# %bb.200:
+	jae	.LBB0_753
+# %bb.752:
 	xor	esi, esi
-	jmp	.LBB0_206
-.LBB0_210:
-	test	r9d, r9d
-	jle	.LBB0_537
-# %bb.211:
+.LBB0_758:
+	mov	r9, rsi
+	not	r9
+	add	r9, r10
+	mov	rax, r10
+	and	rax, 3
+	je	.LBB0_760
+.LBB0_759:                              # =>This Inner Loop Header: Depth=1
+	movzx	edi, word ptr [rcx + 2*rsi]
+	imul	di, word ptr [rdx + 2*rsi]
+	mov	word ptr [r8 + 2*rsi], di
+	add	rsi, 1
+	add	rax, -1
+	jne	.LBB0_759
+.LBB0_760:
+	cmp	r9, 3
+	jb	.LBB0_825
+.LBB0_761:                              # =>This Inner Loop Header: Depth=1
+	movzx	eax, word ptr [rcx + 2*rsi]
+	imul	ax, word ptr [rdx + 2*rsi]
+	mov	word ptr [r8 + 2*rsi], ax
+	movzx	eax, word ptr [rcx + 2*rsi + 2]
+	imul	ax, word ptr [rdx + 2*rsi + 2]
+	mov	word ptr [r8 + 2*rsi + 2], ax
+	movzx	eax, word ptr [rcx + 2*rsi + 4]
+	imul	ax, word ptr [rdx + 2*rsi + 4]
+	mov	word ptr [r8 + 2*rsi + 4], ax
+	movzx	eax, word ptr [rcx + 2*rsi + 6]
+	imul	ax, word ptr [rdx + 2*rsi + 6]
+	mov	word ptr [r8 + 2*rsi + 6], ax
+	add	rsi, 4
+	cmp	r10, rsi
+	jne	.LBB0_761
+	jmp	.LBB0_825
+.LBB0_67:
+	test	r9d, r9d
+	jle	.LBB0_825
+# %bb.68:
 	mov	r10d, r9d
 	cmp	r9d, 64
-	jae	.LBB0_213
-# %bb.212:
+	jae	.LBB0_70
+# %bb.69:
 	xor	esi, esi
-	jmp	.LBB0_218
-.LBB0_378:
+	jmp	.LBB0_75
+.LBB0_79:
 	test	r9d, r9d
-	jle	.LBB0_537
-# %bb.379:
+	jle	.LBB0_825
+# %bb.80:
 	mov	r10d, r9d
-	cmp	r9d, 16
-	jae	.LBB0_381
-# %bb.380:
+	cmp	r9d, 64
+	jae	.LBB0_82
+# %bb.81:
 	xor	esi, esi
-	jmp	.LBB0_386
-.LBB0_390:
+	jmp	.LBB0_87
+.LBB0_333:
 	test	r9d, r9d
-	jle	.LBB0_537
-# %bb.391:
+	jle	.LBB0_825
+# %bb.334:
 	mov	r10d, r9d
-	cmp	r9d, 32
-	jae	.LBB0_393
-# %bb.392:
+	cmp	r9d, 64
+	jae	.LBB0_336
+# %bb.335:
 	xor	esi, esi
-	jmp	.LBB0_398
-.LBB0_504:
+.LBB0_341:
+	mov	r9, rsi
+	not	r9
+	add	r9, r10
+	mov	rax, r10
+	and	rax, 3
+	je	.LBB0_343
+.LBB0_342:                              # =>This Inner Loop Header: Depth=1
+	movzx	edi, word ptr [rdx + 2*rsi]
+	sub	di, word ptr [rcx + 2*rsi]
+	mov	word ptr [r8 + 2*rsi], di
+	add	rsi, 1
+	add	rax, -1
+	jne	.LBB0_342
+.LBB0_343:
+	cmp	r9, 3
+	jb	.LBB0_825
+.LBB0_344:                              # =>This Inner Loop Header: Depth=1
+	movzx	eax, word ptr [rdx + 2*rsi]
+	sub	ax, word ptr [rcx + 2*rsi]
+	mov	word ptr [r8 + 2*rsi], ax
+	movzx	eax, word ptr [rdx + 2*rsi + 2]
+	sub	ax, word ptr [rcx + 2*rsi + 2]
+	mov	word ptr [r8 + 2*rsi + 2], ax
+	movzx	eax, word ptr [rdx + 2*rsi + 4]
+	sub	ax, word ptr [rcx + 2*rsi + 4]
+	mov	word ptr [r8 + 2*rsi + 4], ax
+	movzx	eax, word ptr [rdx + 2*rsi + 6]
+	sub	ax, word ptr [rcx + 2*rsi + 6]
+	mov	word ptr [r8 + 2*rsi + 6], ax
+	add	rsi, 4
+	cmp	r10, rsi
+	jne	.LBB0_344
+	jmp	.LBB0_825
+.LBB0_345:
 	test	r9d, r9d
-	jle	.LBB0_537
-# %bb.505:
+	jle	.LBB0_825
+# %bb.346:
 	mov	r10d, r9d
-	cmp	r9d, 16
-	jae	.LBB0_507
-# %bb.506:
+	cmp	r9d, 64
+	jae	.LBB0_348
+# %bb.347:
 	xor	esi, esi
-	jmp	.LBB0_512
-.LBB0_516:
+.LBB0_353:
+	mov	r9, rsi
+	not	r9
+	add	r9, r10
+	mov	rax, r10
+	and	rax, 3
+	je	.LBB0_355
+.LBB0_354:                              # =>This Inner Loop Header: Depth=1
+	movzx	edi, word ptr [rdx + 2*rsi]
+	sub	di, word ptr [rcx + 2*rsi]
+	mov	word ptr [r8 + 2*rsi], di
+	add	rsi, 1
+	add	rax, -1
+	jne	.LBB0_354
+.LBB0_355:
+	cmp	r9, 3
+	jb	.LBB0_825
+.LBB0_356:                              # =>This Inner Loop Header: Depth=1
+	movzx	eax, word ptr [rdx + 2*rsi]
+	sub	ax, word ptr [rcx + 2*rsi]
+	mov	word ptr [r8 + 2*rsi], ax
+	movzx	eax, word ptr [rdx + 2*rsi + 2]
+	sub	ax, word ptr [rcx + 2*rsi + 2]
+	mov	word ptr [r8 + 2*rsi + 2], ax
+	movzx	eax, word ptr [rdx + 2*rsi + 4]
+	sub	ax, word ptr [rcx + 2*rsi + 4]
+	mov	word ptr [r8 + 2*rsi + 4], ax
+	movzx	eax, word ptr [rdx + 2*rsi + 6]
+	sub	ax, word ptr [rcx + 2*rsi + 6]
+	mov	word ptr [r8 + 2*rsi + 6], ax
+	add	rsi, 4
+	cmp	r10, rsi
+	jne	.LBB0_356
+	jmp	.LBB0_825
+.LBB0_200:
 	test	r9d, r9d
-	jle	.LBB0_537
-# %bb.517:
+	jle	.LBB0_825
+# %bb.201:
 	mov	r10d, r9d
-	cmp	r9d, 32
-	jae	.LBB0_519
-# %bb.518:
+	cmp	r9d, 64
+	jae	.LBB0_203
+# %bb.202:
 	xor	esi, esi
-	jmp	.LBB0_524
-.LBB0_119:
+	jmp	.LBB0_208
+.LBB0_212:
 	test	r9d, r9d
-	jle	.LBB0_537
-# %bb.120:
+	jle	.LBB0_825
+# %bb.213:
 	mov	r10d, r9d
-	cmp	r9d, 16
-	jae	.LBB0_122
-# %bb.121:
+	cmp	r9d, 64
+	jae	.LBB0_215
+# %bb.214:
 	xor	esi, esi
-	jmp	.LBB0_127
-.LBB0_131:
+	jmp	.LBB0_220
+.LBB0_466:
 	test	r9d, r9d
-	jle	.LBB0_537
-# %bb.132:
+	jle	.LBB0_825
+# %bb.467:
 	mov	r10d, r9d
-	cmp	r9d, 32
-	jae	.LBB0_134
-# %bb.133:
+	cmp	r9d, 64
+	jae	.LBB0_469
+# %bb.468:
 	xor	esi, esi
-	jmp	.LBB0_139
-.LBB0_252:
+.LBB0_474:
+	mov	r9, rsi
+	not	r9
+	add	r9, r10
+	mov	rax, r10
+	and	rax, 3
+	je	.LBB0_476
+.LBB0_475:                              # =>This Inner Loop Header: Depth=1
+	movzx	edi, word ptr [rdx + 2*rsi]
+	sub	di, word ptr [rcx + 2*rsi]
+	mov	word ptr [r8 + 2*rsi], di
+	add	rsi, 1
+	add	rax, -1
+	jne	.LBB0_475
+.LBB0_476:
+	cmp	r9, 3
+	jb	.LBB0_825
+.LBB0_477:                              # =>This Inner Loop Header: Depth=1
+	movzx	eax, word ptr [rdx + 2*rsi]
+	sub	ax, word ptr [rcx + 2*rsi]
+	mov	word ptr [r8 + 2*rsi], ax
+	movzx	eax, word ptr [rdx + 2*rsi + 2]
+	sub	ax, word ptr [rcx + 2*rsi + 2]
+	mov	word ptr [r8 + 2*rsi + 2], ax
+	movzx	eax, word ptr [rdx + 2*rsi + 4]
+	sub	ax, word ptr [rcx + 2*rsi + 4]
+	mov	word ptr [r8 + 2*rsi + 4], ax
+	movzx	eax, word ptr [rdx + 2*rsi + 6]
+	sub	ax, word ptr [rcx + 2*rsi + 6]
+	mov	word ptr [r8 + 2*rsi + 6], ax
+	add	rsi, 4
+	cmp	r10, rsi
+	jne	.LBB0_477
+	jmp	.LBB0_825
+.LBB0_478:
 	test	r9d, r9d
-	jle	.LBB0_537
-# %bb.253:
+	jle	.LBB0_825
+# %bb.479:
 	mov	r10d, r9d
-	cmp	r9d, 16
-	jae	.LBB0_255
-# %bb.254:
+	cmp	r9d, 64
+	jae	.LBB0_481
+# %bb.480:
 	xor	esi, esi
-	jmp	.LBB0_260
-.LBB0_264:
+.LBB0_486:
+	mov	r9, rsi
+	not	r9
+	add	r9, r10
+	mov	rax, r10
+	and	rax, 3
+	je	.LBB0_488
+.LBB0_487:                              # =>This Inner Loop Header: Depth=1
+	movzx	edi, word ptr [rdx + 2*rsi]
+	sub	di, word ptr [rcx + 2*rsi]
+	mov	word ptr [r8 + 2*rsi], di
+	add	rsi, 1
+	add	rax, -1
+	jne	.LBB0_487
+.LBB0_488:
+	cmp	r9, 3
+	jb	.LBB0_825
+.LBB0_489:                              # =>This Inner Loop Header: Depth=1
+	movzx	eax, word ptr [rdx + 2*rsi]
+	sub	ax, word ptr [rcx + 2*rsi]
+	mov	word ptr [r8 + 2*rsi], ax
+	movzx	eax, word ptr [rdx + 2*rsi + 2]
+	sub	ax, word ptr [rcx + 2*rsi + 2]
+	mov	word ptr [r8 + 2*rsi + 2], ax
+	movzx	eax, word ptr [rdx + 2*rsi + 4]
+	sub	ax, word ptr [rcx + 2*rsi + 4]
+	mov	word ptr [r8 + 2*rsi + 4], ax
+	movzx	eax, word ptr [rdx + 2*rsi + 6]
+	sub	ax, word ptr [rcx + 2*rsi + 6]
+	mov	word ptr [r8 + 2*rsi + 6], ax
+	add	rsi, 4
+	cmp	r10, rsi
+	jne	.LBB0_489
+	jmp	.LBB0_825
+.LBB0_656:
 	test	r9d, r9d
-	jle	.LBB0_537
-# %bb.265:
+	jle	.LBB0_825
+# %bb.657:
 	mov	r10d, r9d
-	cmp	r9d, 32
-	jae	.LBB0_267
-# %bb.266:
+	cmp	r9d, 16
+	jae	.LBB0_659
+# %bb.658:
 	xor	esi, esi
-	jmp	.LBB0_272
-.LBB0_303:
+.LBB0_664:
+	mov	r9, rsi
+	not	r9
+	add	r9, r10
+	mov	rax, r10
+	and	rax, 3
+	je	.LBB0_666
+.LBB0_665:                              # =>This Inner Loop Header: Depth=1
+	mov	rdi, qword ptr [rcx + 8*rsi]
+	imul	rdi, qword ptr [rdx + 8*rsi]
+	mov	qword ptr [r8 + 8*rsi], rdi
+	add	rsi, 1
+	add	rax, -1
+	jne	.LBB0_665
+.LBB0_666:
+	cmp	r9, 3
+	jb	.LBB0_825
+.LBB0_667:                              # =>This Inner Loop Header: Depth=1
+	mov	rax, qword ptr [rcx + 8*rsi]
+	imul	rax, qword ptr [rdx + 8*rsi]
+	mov	qword ptr [r8 + 8*rsi], rax
+	mov	rax, qword ptr [rcx + 8*rsi + 8]
+	imul	rax, qword ptr [rdx + 8*rsi + 8]
+	mov	qword ptr [r8 + 8*rsi + 8], rax
+	mov	rax, qword ptr [rcx + 8*rsi + 16]
+	imul	rax, qword ptr [rdx + 8*rsi + 16]
+	mov	qword ptr [r8 + 8*rsi + 16], rax
+	mov	rax, qword ptr [rcx + 8*rsi + 24]
+	imul	rax, qword ptr [rdx + 8*rsi + 24]
+	mov	qword ptr [r8 + 8*rsi + 24], rax
+	add	rsi, 4
+	cmp	r10, rsi
+	jne	.LBB0_667
+	jmp	.LBB0_825
+.LBB0_668:
 	test	r9d, r9d
-	jle	.LBB0_537
-# %bb.304:
+	jle	.LBB0_825
+# %bb.669:
 	mov	r10d, r9d
-	cmp	r9d, 128
-	jae	.LBB0_306
-# %bb.305:
+	cmp	r9d, 32
+	jae	.LBB0_671
+# %bb.670:
 	xor	esi, esi
-	jmp	.LBB0_311
-.LBB0_429:
+.LBB0_676:
+	mov	rdi, rsi
+	not	rdi
+	add	rdi, r10
+	mov	rax, r10
+	and	rax, 3
+	je	.LBB0_678
+.LBB0_677:                              # =>This Inner Loop Header: Depth=1
+	vmovss	xmm0, dword ptr [rcx + 4*rsi]   # xmm0 = mem[0],zero,zero,zero
+	vmulss	xmm0, xmm0, dword ptr [rdx + 4*rsi]
+	vmovss	dword ptr [r8 + 4*rsi], xmm0
+	add	rsi, 1
+	add	rax, -1
+	jne	.LBB0_677
+.LBB0_678:
+	cmp	rdi, 3
+	jb	.LBB0_825
+.LBB0_679:                              # =>This Inner Loop Header: Depth=1
+	vmovss	xmm0, dword ptr [rcx + 4*rsi]   # xmm0 = mem[0],zero,zero,zero
+	vmulss	xmm0, xmm0, dword ptr [rdx + 4*rsi]
+	vmovss	dword ptr [r8 + 4*rsi], xmm0
+	vmovss	xmm0, dword ptr [rcx + 4*rsi + 4] # xmm0 = mem[0],zero,zero,zero
+	vmulss	xmm0, xmm0, dword ptr [rdx + 4*rsi + 4]
+	vmovss	dword ptr [r8 + 4*rsi + 4], xmm0
+	vmovss	xmm0, dword ptr [rcx + 4*rsi + 8] # xmm0 = mem[0],zero,zero,zero
+	vmulss	xmm0, xmm0, dword ptr [rdx + 4*rsi + 8]
+	vmovss	dword ptr [r8 + 4*rsi + 8], xmm0
+	vmovss	xmm0, dword ptr [rcx + 4*rsi + 12] # xmm0 = mem[0],zero,zero,zero
+	vmulss	xmm0, xmm0, dword ptr [rdx + 4*rsi + 12]
+	vmovss	dword ptr [r8 + 4*rsi + 12], xmm0
+	add	rsi, 4
+	cmp	r10, rsi
+	jne	.LBB0_679
+	jmp	.LBB0_825
+.LBB0_792:
 	test	r9d, r9d
-	jle	.LBB0_537
-# %bb.430:
+	jle	.LBB0_825
+# %bb.793:
 	mov	r10d, r9d
-	cmp	r9d, 128
-	jae	.LBB0_432
-# %bb.431:
+	cmp	r9d, 16
+	jae	.LBB0_795
+# %bb.794:
 	xor	esi, esi
-	jmp	.LBB0_437
-.LBB0_44:
+.LBB0_800:
+	mov	r9, rsi
+	not	r9
+	add	r9, r10
+	mov	rax, r10
+	and	rax, 3
+	je	.LBB0_802
+.LBB0_801:                              # =>This Inner Loop Header: Depth=1
+	mov	rdi, qword ptr [rcx + 8*rsi]
+	imul	rdi, qword ptr [rdx + 8*rsi]
+	mov	qword ptr [r8 + 8*rsi], rdi
+	add	rsi, 1
+	add	rax, -1
+	jne	.LBB0_801
+.LBB0_802:
+	cmp	r9, 3
+	jb	.LBB0_825
+.LBB0_803:                              # =>This Inner Loop Header: Depth=1
+	mov	rax, qword ptr [rcx + 8*rsi]
+	imul	rax, qword ptr [rdx + 8*rsi]
+	mov	qword ptr [r8 + 8*rsi], rax
+	mov	rax, qword ptr [rcx + 8*rsi + 8]
+	imul	rax, qword ptr [rdx + 8*rsi + 8]
+	mov	qword ptr [r8 + 8*rsi + 8], rax
+	mov	rax, qword ptr [rcx + 8*rsi + 16]
+	imul	rax, qword ptr [rdx + 8*rsi + 16]
+	mov	qword ptr [r8 + 8*rsi + 16], rax
+	mov	rax, qword ptr [rcx + 8*rsi + 24]
+	imul	rax, qword ptr [rdx + 8*rsi + 24]
+	mov	qword ptr [r8 + 8*rsi + 24], rax
+	add	rsi, 4
+	cmp	r10, rsi
+	jne	.LBB0_803
+	jmp	.LBB0_825
+.LBB0_804:
 	test	r9d, r9d
-	jle	.LBB0_537
-# %bb.45:
+	jle	.LBB0_825
+# %bb.805:
 	mov	r10d, r9d
-	cmp	r9d, 128
-	jae	.LBB0_47
-# %bb.46:
+	cmp	r9d, 32
+	jae	.LBB0_807
+# %bb.806:
 	xor	esi, esi
-	jmp	.LBB0_52
-.LBB0_177:
-	test	r9d, r9d
-	jle	.LBB0_537
-# %bb.178:
-	mov	r10d, r9d
+.LBB0_812:
+	mov	rdi, rsi
+	not	rdi
+	add	rdi, r10
+	mov	rax, r10
+	and	rax, 3
+	je	.LBB0_814
+.LBB0_813:                              # =>This Inner Loop Header: Depth=1
+	vmovss	xmm0, dword ptr [rcx + 4*rsi]   # xmm0 = mem[0],zero,zero,zero
+	vmulss	xmm0, xmm0, dword ptr [rdx + 4*rsi]
+	vmovss	dword ptr [r8 + 4*rsi], xmm0
+	add	rsi, 1
+	add	rax, -1
+	jne	.LBB0_813
+.LBB0_814:
+	cmp	rdi, 3
+	jb	.LBB0_825
+.LBB0_815:                              # =>This Inner Loop Header: Depth=1
+	vmovss	xmm0, dword ptr [rcx + 4*rsi]   # xmm0 = mem[0],zero,zero,zero
+	vmulss	xmm0, xmm0, dword ptr [rdx + 4*rsi]
+	vmovss	dword ptr [r8 + 4*rsi], xmm0
+	vmovss	xmm0, dword ptr [rcx + 4*rsi + 4] # xmm0 = mem[0],zero,zero,zero
+	vmulss	xmm0, xmm0, dword ptr [rdx + 4*rsi + 4]
+	vmovss	dword ptr [r8 + 4*rsi + 4], xmm0
+	vmovss	xmm0, dword ptr [rcx + 4*rsi + 8] # xmm0 = mem[0],zero,zero,zero
+	vmulss	xmm0, xmm0, dword ptr [rdx + 4*rsi + 8]
+	vmovss	dword ptr [r8 + 4*rsi + 8], xmm0
+	vmovss	xmm0, dword ptr [rcx + 4*rsi + 12] # xmm0 = mem[0],zero,zero,zero
+	vmulss	xmm0, xmm0, dword ptr [rdx + 4*rsi + 12]
+	vmovss	dword ptr [r8 + 4*rsi + 12], xmm0
+	add	rsi, 4
+	cmp	r10, rsi
+	jne	.LBB0_815
+	jmp	.LBB0_825
+.LBB0_121:
+	test	r9d, r9d
+	jle	.LBB0_825
+# %bb.122:
+	mov	r10d, r9d
+	cmp	r9d, 16
+	jae	.LBB0_124
+# %bb.123:
+	xor	esi, esi
+	jmp	.LBB0_129
+.LBB0_133:
+	test	r9d, r9d
+	jle	.LBB0_825
+# %bb.134:
+	mov	r10d, r9d
+	cmp	r9d, 32
+	jae	.LBB0_136
+# %bb.135:
+	xor	esi, esi
+	jmp	.LBB0_141
+.LBB0_387:
+	test	r9d, r9d
+	jle	.LBB0_825
+# %bb.388:
+	mov	r10d, r9d
+	cmp	r9d, 16
+	jae	.LBB0_390
+# %bb.389:
+	xor	esi, esi
+.LBB0_395:
+	mov	r9, rsi
+	not	r9
+	add	r9, r10
+	mov	rax, r10
+	and	rax, 3
+	je	.LBB0_397
+.LBB0_396:                              # =>This Inner Loop Header: Depth=1
+	mov	rdi, qword ptr [rdx + 8*rsi]
+	sub	rdi, qword ptr [rcx + 8*rsi]
+	mov	qword ptr [r8 + 8*rsi], rdi
+	add	rsi, 1
+	add	rax, -1
+	jne	.LBB0_396
+.LBB0_397:
+	cmp	r9, 3
+	jb	.LBB0_825
+.LBB0_398:                              # =>This Inner Loop Header: Depth=1
+	mov	rax, qword ptr [rdx + 8*rsi]
+	sub	rax, qword ptr [rcx + 8*rsi]
+	mov	qword ptr [r8 + 8*rsi], rax
+	mov	rax, qword ptr [rdx + 8*rsi + 8]
+	sub	rax, qword ptr [rcx + 8*rsi + 8]
+	mov	qword ptr [r8 + 8*rsi + 8], rax
+	mov	rax, qword ptr [rdx + 8*rsi + 16]
+	sub	rax, qword ptr [rcx + 8*rsi + 16]
+	mov	qword ptr [r8 + 8*rsi + 16], rax
+	mov	rax, qword ptr [rdx + 8*rsi + 24]
+	sub	rax, qword ptr [rcx + 8*rsi + 24]
+	mov	qword ptr [r8 + 8*rsi + 24], rax
+	add	rsi, 4
+	cmp	r10, rsi
+	jne	.LBB0_398
+	jmp	.LBB0_825
+.LBB0_399:
+	test	r9d, r9d
+	jle	.LBB0_825
+# %bb.400:
+	mov	r10d, r9d
+	cmp	r9d, 32
+	jae	.LBB0_402
+# %bb.401:
+	xor	esi, esi
+.LBB0_407:
+	mov	rdi, rsi
+	not	rdi
+	add	rdi, r10
+	mov	rax, r10
+	and	rax, 3
+	je	.LBB0_409
+.LBB0_408:                              # =>This Inner Loop Header: Depth=1
+	vmovss	xmm0, dword ptr [rdx + 4*rsi]   # xmm0 = mem[0],zero,zero,zero
+	vsubss	xmm0, xmm0, dword ptr [rcx + 4*rsi]
+	vmovss	dword ptr [r8 + 4*rsi], xmm0
+	add	rsi, 1
+	add	rax, -1
+	jne	.LBB0_408
+.LBB0_409:
+	cmp	rdi, 3
+	jb	.LBB0_825
+.LBB0_410:                              # =>This Inner Loop Header: Depth=1
+	vmovss	xmm0, dword ptr [rdx + 4*rsi]   # xmm0 = mem[0],zero,zero,zero
+	vsubss	xmm0, xmm0, dword ptr [rcx + 4*rsi]
+	vmovss	dword ptr [r8 + 4*rsi], xmm0
+	vmovss	xmm0, dword ptr [rdx + 4*rsi + 4] # xmm0 = mem[0],zero,zero,zero
+	vsubss	xmm0, xmm0, dword ptr [rcx + 4*rsi + 4]
+	vmovss	dword ptr [r8 + 4*rsi + 4], xmm0
+	vmovss	xmm0, dword ptr [rdx + 4*rsi + 8] # xmm0 = mem[0],zero,zero,zero
+	vsubss	xmm0, xmm0, dword ptr [rcx + 4*rsi + 8]
+	vmovss	dword ptr [r8 + 4*rsi + 8], xmm0
+	vmovss	xmm0, dword ptr [rdx + 4*rsi + 12] # xmm0 = mem[0],zero,zero,zero
+	vsubss	xmm0, xmm0, dword ptr [rcx + 4*rsi + 12]
+	vmovss	dword ptr [r8 + 4*rsi + 12], xmm0
+	add	rsi, 4
+	cmp	r10, rsi
+	jne	.LBB0_410
+	jmp	.LBB0_825
+.LBB0_254:
+	test	r9d, r9d
+	jle	.LBB0_825
+# %bb.255:
+	mov	r10d, r9d
+	cmp	r9d, 16
+	jae	.LBB0_257
+# %bb.256:
+	xor	esi, esi
+	jmp	.LBB0_262
+.LBB0_266:
+	test	r9d, r9d
+	jle	.LBB0_825
+# %bb.267:
+	mov	r10d, r9d
+	cmp	r9d, 32
+	jae	.LBB0_269
+# %bb.268:
+	xor	esi, esi
+	jmp	.LBB0_274
+.LBB0_520:
+	test	r9d, r9d
+	jle	.LBB0_825
+# %bb.521:
+	mov	r10d, r9d
+	cmp	r9d, 16
+	jae	.LBB0_523
+# %bb.522:
+	xor	esi, esi
+.LBB0_528:
+	mov	r9, rsi
+	not	r9
+	add	r9, r10
+	mov	rax, r10
+	and	rax, 3
+	je	.LBB0_530
+.LBB0_529:                              # =>This Inner Loop Header: Depth=1
+	mov	rdi, qword ptr [rdx + 8*rsi]
+	sub	rdi, qword ptr [rcx + 8*rsi]
+	mov	qword ptr [r8 + 8*rsi], rdi
+	add	rsi, 1
+	add	rax, -1
+	jne	.LBB0_529
+.LBB0_530:
+	cmp	r9, 3
+	jb	.LBB0_825
+.LBB0_531:                              # =>This Inner Loop Header: Depth=1
+	mov	rax, qword ptr [rdx + 8*rsi]
+	sub	rax, qword ptr [rcx + 8*rsi]
+	mov	qword ptr [r8 + 8*rsi], rax
+	mov	rax, qword ptr [rdx + 8*rsi + 8]
+	sub	rax, qword ptr [rcx + 8*rsi + 8]
+	mov	qword ptr [r8 + 8*rsi + 8], rax
+	mov	rax, qword ptr [rdx + 8*rsi + 16]
+	sub	rax, qword ptr [rcx + 8*rsi + 16]
+	mov	qword ptr [r8 + 8*rsi + 16], rax
+	mov	rax, qword ptr [rdx + 8*rsi + 24]
+	sub	rax, qword ptr [rcx + 8*rsi + 24]
+	mov	qword ptr [r8 + 8*rsi + 24], rax
+	add	rsi, 4
+	cmp	r10, rsi
+	jne	.LBB0_531
+	jmp	.LBB0_825
+.LBB0_532:
+	test	r9d, r9d
+	jle	.LBB0_825
+# %bb.533:
+	mov	r10d, r9d
+	cmp	r9d, 32
+	jae	.LBB0_535
+# %bb.534:
+	xor	esi, esi
+.LBB0_540:
+	mov	rdi, rsi
+	not	rdi
+	add	rdi, r10
+	mov	rax, r10
+	and	rax, 3
+	je	.LBB0_542
+.LBB0_541:                              # =>This Inner Loop Header: Depth=1
+	vmovss	xmm0, dword ptr [rdx + 4*rsi]   # xmm0 = mem[0],zero,zero,zero
+	vsubss	xmm0, xmm0, dword ptr [rcx + 4*rsi]
+	vmovss	dword ptr [r8 + 4*rsi], xmm0
+	add	rsi, 1
+	add	rax, -1
+	jne	.LBB0_541
+.LBB0_542:
+	cmp	rdi, 3
+	jb	.LBB0_825
+.LBB0_543:                              # =>This Inner Loop Header: Depth=1
+	vmovss	xmm0, dword ptr [rdx + 4*rsi]   # xmm0 = mem[0],zero,zero,zero
+	vsubss	xmm0, xmm0, dword ptr [rcx + 4*rsi]
+	vmovss	dword ptr [r8 + 4*rsi], xmm0
+	vmovss	xmm0, dword ptr [rdx + 4*rsi + 4] # xmm0 = mem[0],zero,zero,zero
+	vsubss	xmm0, xmm0, dword ptr [rcx + 4*rsi + 4]
+	vmovss	dword ptr [r8 + 4*rsi + 4], xmm0
+	vmovss	xmm0, dword ptr [rdx + 4*rsi + 8] # xmm0 = mem[0],zero,zero,zero
+	vsubss	xmm0, xmm0, dword ptr [rcx + 4*rsi + 8]
+	vmovss	dword ptr [r8 + 4*rsi + 8], xmm0
+	vmovss	xmm0, dword ptr [rdx + 4*rsi + 12] # xmm0 = mem[0],zero,zero,zero
+	vsubss	xmm0, xmm0, dword ptr [rcx + 4*rsi + 12]
+	vmovss	dword ptr [r8 + 4*rsi + 12], xmm0
+	add	rsi, 4
+	cmp	r10, rsi
+	jne	.LBB0_543
+	jmp	.LBB0_825
+.LBB0_571:
+	test	r9d, r9d
+	jle	.LBB0_825
+# %bb.572:
+	mov	r10d, r9d
+	cmp	r9d, 32
+	jae	.LBB0_574
+# %bb.573:
+	xor	edi, edi
+	jmp	.LBB0_584
+.LBB0_707:
+	test	r9d, r9d
+	jle	.LBB0_825
+# %bb.708:
+	mov	r10d, r9d
+	cmp	r9d, 32
+	jae	.LBB0_710
+# %bb.709:
+	xor	edi, edi
+	jmp	.LBB0_720
+.LBB0_46:
+	test	r9d, r9d
+	jle	.LBB0_825
+# %bb.47:
+	mov	r10d, r9d
 	cmp	r9d, 128
-	jae	.LBB0_180
-# %bb.179:
+	jae	.LBB0_49
+# %bb.48:
 	xor	esi, esi
-	jmp	.LBB0_185
-.LBB0_357:
+	jmp	.LBB0_54
+.LBB0_312:
 	test	r9d, r9d
-	jle	.LBB0_537
-# %bb.358:
+	jle	.LBB0_825
+# %bb.313:
+	mov	r10d, r9d
+	cmp	r9d, 128
+	jae	.LBB0_315
+# %bb.314:
+	xor	esi, esi
+	jmp	.LBB0_320
+.LBB0_179:
+	test	r9d, r9d
+	jle	.LBB0_825
+# %bb.180:
+	mov	r10d, r9d
+	cmp	r9d, 128
+	jae	.LBB0_182
+# %bb.181:
+	xor	esi, esi
+	jmp	.LBB0_187
+.LBB0_445:
+	test	r9d, r9d
+	jle	.LBB0_825
+# %bb.446:
+	mov	r10d, r9d
+	cmp	r9d, 128
+	jae	.LBB0_448
+# %bb.447:
+	xor	esi, esi
+	jmp	.LBB0_453
+.LBB0_635:
+	test	r9d, r9d
+	jle	.LBB0_825
+# %bb.636:
 	mov	r10d, r9d
 	cmp	r9d, 32
-	jae	.LBB0_360
-# %bb.359:
+	jae	.LBB0_638
+# %bb.637:
 	xor	esi, esi
-	jmp	.LBB0_365
-.LBB0_483:
+	jmp	.LBB0_643
+.LBB0_771:
 	test	r9d, r9d
-	jle	.LBB0_537
-# %bb.484:
+	jle	.LBB0_825
+# %bb.772:
 	mov	r10d, r9d
 	cmp	r9d, 32
-	jae	.LBB0_486
-# %bb.485:
+	jae	.LBB0_774
+# %bb.773:
 	xor	esi, esi
-	jmp	.LBB0_491
-.LBB0_98:
+	jmp	.LBB0_779
+.LBB0_100:
 	test	r9d, r9d
-	jle	.LBB0_537
-# %bb.99:
+	jle	.LBB0_825
+# %bb.101:
 	mov	r10d, r9d
 	cmp	r9d, 32
-	jae	.LBB0_101
-# %bb.100:
+	jae	.LBB0_103
+# %bb.102:
 	xor	esi, esi
-	jmp	.LBB0_106
-.LBB0_231:
+	jmp	.LBB0_108
+.LBB0_366:
 	test	r9d, r9d
-	jle	.LBB0_537
-# %bb.232:
+	jle	.LBB0_825
+# %bb.367:
 	mov	r10d, r9d
 	cmp	r9d, 32
-	jae	.LBB0_234
-# %bb.233:
+	jae	.LBB0_369
+# %bb.368:
 	xor	esi, esi
-	jmp	.LBB0_239
-.LBB0_348:
+	jmp	.LBB0_374
+.LBB0_233:
+	test	r9d, r9d
+	jle	.LBB0_825
+# %bb.234:
+	mov	r10d, r9d
+	cmp	r9d, 32
+	jae	.LBB0_236
+# %bb.235:
+	xor	esi, esi
+	jmp	.LBB0_241
+.LBB0_499:
+	test	r9d, r9d
+	jle	.LBB0_825
+# %bb.500:
+	mov	r10d, r9d
+	cmp	r9d, 32
+	jae	.LBB0_502
+# %bb.501:
+	xor	esi, esi
+	jmp	.LBB0_507
+.LBB0_91:
 	lea	rsi, [r8 + 4*r10]
 	lea	rax, [rdx + 4*r10]
 	cmp	rax, r8
@@ -593,73 +1689,1326 @@ arithmetic_avx2:                        # @arithmetic_avx2
 	seta	dil
 	xor	esi, esi
 	test	r9b, r11b
-	jne	.LBB0_353
-# %bb.349:
+	jne	.LBB0_96
+# %bb.92:
 	and	al, dil
-	jne	.LBB0_353
-# %bb.350:
+	jne	.LBB0_96
+# %bb.93:
 	mov	esi, r10d
 	and	esi, -32
 	xor	edi, edi
-.LBB0_351:                              # =>This Inner Loop Header: Depth=1
-	vmovdqu	ymm0, ymmword ptr [rdx + 4*rdi]
-	vmovdqu	ymm1, ymmword ptr [rdx + 4*rdi + 32]
-	vmovdqu	ymm2, ymmword ptr [rdx + 4*rdi + 64]
-	vmovdqu	ymm3, ymmword ptr [rdx + 4*rdi + 96]
-	vpsubd	ymm0, ymm0, ymmword ptr [rcx + 4*rdi]
-	vpsubd	ymm1, ymm1, ymmword ptr [rcx + 4*rdi + 32]
-	vpsubd	ymm2, ymm2, ymmword ptr [rcx + 4*rdi + 64]
-	vpsubd	ymm3, ymm3, ymmword ptr [rcx + 4*rdi + 96]
+.LBB0_94:                               # =>This Inner Loop Header: Depth=1
+	vmovdqu	ymm0, ymmword ptr [rcx + 4*rdi]
+	vmovdqu	ymm1, ymmword ptr [rcx + 4*rdi + 32]
+	vmovdqu	ymm2, ymmword ptr [rcx + 4*rdi + 64]
+	vmovdqu	ymm3, ymmword ptr [rcx + 4*rdi + 96]
+	vpaddd	ymm0, ymm0, ymmword ptr [rdx + 4*rdi]
+	vpaddd	ymm1, ymm1, ymmword ptr [rdx + 4*rdi + 32]
+	vpaddd	ymm2, ymm2, ymmword ptr [rdx + 4*rdi + 64]
+	vpaddd	ymm3, ymm3, ymmword ptr [rdx + 4*rdi + 96]
 	vmovdqu	ymmword ptr [r8 + 4*rdi], ymm0
 	vmovdqu	ymmword ptr [r8 + 4*rdi + 32], ymm1
 	vmovdqu	ymmword ptr [r8 + 4*rdi + 64], ymm2
 	vmovdqu	ymmword ptr [r8 + 4*rdi + 96], ymm3
 	add	rdi, 32
 	cmp	rsi, rdi
-	jne	.LBB0_351
-# %bb.352:
+	jne	.LBB0_94
+# %bb.95:
+	cmp	rsi, r10
+	je	.LBB0_825
+.LBB0_96:
+	mov	r9, rsi
+	not	r9
+	add	r9, r10
+	mov	rax, r10
+	and	rax, 3
+	je	.LBB0_98
+.LBB0_97:                               # =>This Inner Loop Header: Depth=1
+	mov	edi, dword ptr [rcx + 4*rsi]
+	add	edi, dword ptr [rdx + 4*rsi]
+	mov	dword ptr [r8 + 4*rsi], edi
+	add	rsi, 1
+	add	rax, -1
+	jne	.LBB0_97
+.LBB0_98:
+	cmp	r9, 3
+	jb	.LBB0_825
+.LBB0_99:                               # =>This Inner Loop Header: Depth=1
+	mov	eax, dword ptr [rcx + 4*rsi]
+	add	eax, dword ptr [rdx + 4*rsi]
+	mov	dword ptr [r8 + 4*rsi], eax
+	mov	eax, dword ptr [rcx + 4*rsi + 4]
+	add	eax, dword ptr [rdx + 4*rsi + 4]
+	mov	dword ptr [r8 + 4*rsi + 4], eax
+	mov	eax, dword ptr [rcx + 4*rsi + 8]
+	add	eax, dword ptr [rdx + 4*rsi + 8]
+	mov	dword ptr [r8 + 4*rsi + 8], eax
+	mov	eax, dword ptr [rcx + 4*rsi + 12]
+	add	eax, dword ptr [rdx + 4*rsi + 12]
+	mov	dword ptr [r8 + 4*rsi + 12], eax
+	add	rsi, 4
+	cmp	r10, rsi
+	jne	.LBB0_99
+	jmp	.LBB0_825
+.LBB0_224:
+	lea	rsi, [r8 + 4*r10]
+	lea	rax, [rdx + 4*r10]
+	cmp	rax, r8
+	seta	r9b
+	lea	rax, [rcx + 4*r10]
+	cmp	rsi, rdx
+	seta	r11b
+	cmp	rax, r8
+	seta	al
+	cmp	rsi, rcx
+	seta	dil
+	xor	esi, esi
+	test	r9b, r11b
+	jne	.LBB0_229
+# %bb.225:
+	and	al, dil
+	jne	.LBB0_229
+# %bb.226:
+	mov	esi, r10d
+	and	esi, -32
+	xor	edi, edi
+.LBB0_227:                              # =>This Inner Loop Header: Depth=1
+	vmovdqu	ymm0, ymmword ptr [rcx + 4*rdi]
+	vmovdqu	ymm1, ymmword ptr [rcx + 4*rdi + 32]
+	vmovdqu	ymm2, ymmword ptr [rcx + 4*rdi + 64]
+	vmovdqu	ymm3, ymmword ptr [rcx + 4*rdi + 96]
+	vpaddd	ymm0, ymm0, ymmword ptr [rdx + 4*rdi]
+	vpaddd	ymm1, ymm1, ymmword ptr [rdx + 4*rdi + 32]
+	vpaddd	ymm2, ymm2, ymmword ptr [rdx + 4*rdi + 64]
+	vpaddd	ymm3, ymm3, ymmword ptr [rdx + 4*rdi + 96]
+	vmovdqu	ymmword ptr [r8 + 4*rdi], ymm0
+	vmovdqu	ymmword ptr [r8 + 4*rdi + 32], ymm1
+	vmovdqu	ymmword ptr [r8 + 4*rdi + 64], ymm2
+	vmovdqu	ymmword ptr [r8 + 4*rdi + 96], ymm3
+	add	rdi, 32
+	cmp	rsi, rdi
+	jne	.LBB0_227
+# %bb.228:
+	cmp	rsi, r10
+	je	.LBB0_825
+.LBB0_229:
+	mov	r9, rsi
+	not	r9
+	add	r9, r10
+	mov	rax, r10
+	and	rax, 3
+	je	.LBB0_231
+.LBB0_230:                              # =>This Inner Loop Header: Depth=1
+	mov	edi, dword ptr [rcx + 4*rsi]
+	add	edi, dword ptr [rdx + 4*rsi]
+	mov	dword ptr [r8 + 4*rsi], edi
+	add	rsi, 1
+	add	rax, -1
+	jne	.LBB0_230
+.LBB0_231:
+	cmp	r9, 3
+	jb	.LBB0_825
+.LBB0_232:                              # =>This Inner Loop Header: Depth=1
+	mov	eax, dword ptr [rcx + 4*rsi]
+	add	eax, dword ptr [rdx + 4*rsi]
+	mov	dword ptr [r8 + 4*rsi], eax
+	mov	eax, dword ptr [rcx + 4*rsi + 4]
+	add	eax, dword ptr [rdx + 4*rsi + 4]
+	mov	dword ptr [r8 + 4*rsi + 4], eax
+	mov	eax, dword ptr [rcx + 4*rsi + 8]
+	add	eax, dword ptr [rdx + 4*rsi + 8]
+	mov	dword ptr [r8 + 4*rsi + 8], eax
+	mov	eax, dword ptr [rcx + 4*rsi + 12]
+	add	eax, dword ptr [rdx + 4*rsi + 12]
+	mov	dword ptr [r8 + 4*rsi + 12], eax
+	add	rsi, 4
+	cmp	r10, rsi
+	jne	.LBB0_232
+	jmp	.LBB0_825
+.LBB0_145:
+	lea	rsi, [r8 + 8*r10]
+	lea	rax, [rdx + 8*r10]
+	cmp	rax, r8
+	seta	r9b
+	lea	rax, [rcx + 8*r10]
+	cmp	rsi, rdx
+	seta	r11b
+	cmp	rax, r8
+	seta	al
+	cmp	rsi, rcx
+	seta	dil
+	xor	esi, esi
+	test	r9b, r11b
+	jne	.LBB0_150
+# %bb.146:
+	and	al, dil
+	jne	.LBB0_150
+# %bb.147:
+	mov	esi, r10d
+	and	esi, -16
+	xor	edi, edi
+.LBB0_148:                              # =>This Inner Loop Header: Depth=1
+	vmovupd	ymm0, ymmword ptr [rcx + 8*rdi]
+	vmovupd	ymm1, ymmword ptr [rcx + 8*rdi + 32]
+	vmovupd	ymm2, ymmword ptr [rcx + 8*rdi + 64]
+	vmovupd	ymm3, ymmword ptr [rcx + 8*rdi + 96]
+	vaddpd	ymm0, ymm0, ymmword ptr [rdx + 8*rdi]
+	vaddpd	ymm1, ymm1, ymmword ptr [rdx + 8*rdi + 32]
+	vaddpd	ymm2, ymm2, ymmword ptr [rdx + 8*rdi + 64]
+	vaddpd	ymm3, ymm3, ymmword ptr [rdx + 8*rdi + 96]
+	vmovupd	ymmword ptr [r8 + 8*rdi], ymm0
+	vmovupd	ymmword ptr [r8 + 8*rdi + 32], ymm1
+	vmovupd	ymmword ptr [r8 + 8*rdi + 64], ymm2
+	vmovupd	ymmword ptr [r8 + 8*rdi + 96], ymm3
+	add	rdi, 16
+	cmp	rsi, rdi
+	jne	.LBB0_148
+# %bb.149:
+	cmp	rsi, r10
+	je	.LBB0_825
+.LBB0_150:
+	mov	rdi, rsi
+	not	rdi
+	add	rdi, r10
+	mov	rax, r10
+	and	rax, 3
+	je	.LBB0_152
+.LBB0_151:                              # =>This Inner Loop Header: Depth=1
+	vmovsd	xmm0, qword ptr [rcx + 8*rsi]   # xmm0 = mem[0],zero
+	vaddsd	xmm0, xmm0, qword ptr [rdx + 8*rsi]
+	vmovsd	qword ptr [r8 + 8*rsi], xmm0
+	add	rsi, 1
+	add	rax, -1
+	jne	.LBB0_151
+.LBB0_152:
+	cmp	rdi, 3
+	jb	.LBB0_825
+.LBB0_153:                              # =>This Inner Loop Header: Depth=1
+	vmovsd	xmm0, qword ptr [rcx + 8*rsi]   # xmm0 = mem[0],zero
+	vaddsd	xmm0, xmm0, qword ptr [rdx + 8*rsi]
+	vmovsd	qword ptr [r8 + 8*rsi], xmm0
+	vmovsd	xmm0, qword ptr [rcx + 8*rsi + 8] # xmm0 = mem[0],zero
+	vaddsd	xmm0, xmm0, qword ptr [rdx + 8*rsi + 8]
+	vmovsd	qword ptr [r8 + 8*rsi + 8], xmm0
+	vmovsd	xmm0, qword ptr [rcx + 8*rsi + 16] # xmm0 = mem[0],zero
+	vaddsd	xmm0, xmm0, qword ptr [rdx + 8*rsi + 16]
+	vmovsd	qword ptr [r8 + 8*rsi + 16], xmm0
+	vmovsd	xmm0, qword ptr [rcx + 8*rsi + 24] # xmm0 = mem[0],zero
+	vaddsd	xmm0, xmm0, qword ptr [rdx + 8*rsi + 24]
+	vmovsd	qword ptr [r8 + 8*rsi + 24], xmm0
+	add	rsi, 4
+	cmp	r10, rsi
+	jne	.LBB0_153
+	jmp	.LBB0_825
+.LBB0_278:
+	lea	rsi, [r8 + 8*r10]
+	lea	rax, [rdx + 8*r10]
+	cmp	rax, r8
+	seta	r9b
+	lea	rax, [rcx + 8*r10]
+	cmp	rsi, rdx
+	seta	r11b
+	cmp	rax, r8
+	seta	al
+	cmp	rsi, rcx
+	seta	dil
+	xor	esi, esi
+	test	r9b, r11b
+	jne	.LBB0_283
+# %bb.279:
+	and	al, dil
+	jne	.LBB0_283
+# %bb.280:
+	mov	esi, r10d
+	and	esi, -16
+	xor	edi, edi
+.LBB0_281:                              # =>This Inner Loop Header: Depth=1
+	vmovupd	ymm0, ymmword ptr [rcx + 8*rdi]
+	vmovupd	ymm1, ymmword ptr [rcx + 8*rdi + 32]
+	vmovupd	ymm2, ymmword ptr [rcx + 8*rdi + 64]
+	vmovupd	ymm3, ymmword ptr [rcx + 8*rdi + 96]
+	vaddpd	ymm0, ymm0, ymmword ptr [rdx + 8*rdi]
+	vaddpd	ymm1, ymm1, ymmword ptr [rdx + 8*rdi + 32]
+	vaddpd	ymm2, ymm2, ymmword ptr [rdx + 8*rdi + 64]
+	vaddpd	ymm3, ymm3, ymmword ptr [rdx + 8*rdi + 96]
+	vmovupd	ymmword ptr [r8 + 8*rdi], ymm0
+	vmovupd	ymmword ptr [r8 + 8*rdi + 32], ymm1
+	vmovupd	ymmword ptr [r8 + 8*rdi + 64], ymm2
+	vmovupd	ymmword ptr [r8 + 8*rdi + 96], ymm3
+	add	rdi, 16
+	cmp	rsi, rdi
+	jne	.LBB0_281
+# %bb.282:
+	cmp	rsi, r10
+	je	.LBB0_825
+.LBB0_283:
+	mov	rdi, rsi
+	not	rdi
+	add	rdi, r10
+	mov	rax, r10
+	and	rax, 3
+	je	.LBB0_285
+.LBB0_284:                              # =>This Inner Loop Header: Depth=1
+	vmovsd	xmm0, qword ptr [rcx + 8*rsi]   # xmm0 = mem[0],zero
+	vaddsd	xmm0, xmm0, qword ptr [rdx + 8*rsi]
+	vmovsd	qword ptr [r8 + 8*rsi], xmm0
+	add	rsi, 1
+	add	rax, -1
+	jne	.LBB0_284
+.LBB0_285:
+	cmp	rdi, 3
+	jb	.LBB0_825
+.LBB0_286:                              # =>This Inner Loop Header: Depth=1
+	vmovsd	xmm0, qword ptr [rcx + 8*rsi]   # xmm0 = mem[0],zero
+	vaddsd	xmm0, xmm0, qword ptr [rdx + 8*rsi]
+	vmovsd	qword ptr [r8 + 8*rsi], xmm0
+	vmovsd	xmm0, qword ptr [rcx + 8*rsi + 8] # xmm0 = mem[0],zero
+	vaddsd	xmm0, xmm0, qword ptr [rdx + 8*rsi + 8]
+	vmovsd	qword ptr [r8 + 8*rsi + 8], xmm0
+	vmovsd	xmm0, qword ptr [rcx + 8*rsi + 16] # xmm0 = mem[0],zero
+	vaddsd	xmm0, xmm0, qword ptr [rdx + 8*rsi + 16]
+	vmovsd	qword ptr [r8 + 8*rsi + 16], xmm0
+	vmovsd	xmm0, qword ptr [rcx + 8*rsi + 24] # xmm0 = mem[0],zero
+	vaddsd	xmm0, xmm0, qword ptr [rdx + 8*rsi + 24]
+	vmovsd	qword ptr [r8 + 8*rsi + 24], xmm0
+	add	rsi, 4
+	cmp	r10, rsi
+	jne	.LBB0_286
+	jmp	.LBB0_825
+.LBB0_588:
+	lea	rsi, [r8 + r10]
+	lea	rax, [rdx + r10]
+	cmp	rax, r8
+	seta	r9b
+	lea	rax, [rcx + r10]
+	cmp	rsi, rdx
+	seta	r11b
+	cmp	rax, r8
+	seta	al
+	cmp	rsi, rcx
+	seta	sil
+	xor	edi, edi
+	test	r9b, r11b
+	jne	.LBB0_598
+# %bb.589:
+	and	al, sil
+	jne	.LBB0_598
+# %bb.590:
+	mov	edi, r10d
+	and	edi, -32
+	lea	rsi, [rdi - 32]
+	mov	rax, rsi
+	shr	rax, 5
+	add	rax, 1
+	mov	r9d, eax
+	and	r9d, 3
+	cmp	rsi, 96
+	jae	.LBB0_592
+# %bb.591:
+	xor	esi, esi
+	jmp	.LBB0_594
+.LBB0_724:
+	lea	rsi, [r8 + r10]
+	lea	rax, [rdx + r10]
+	cmp	rax, r8
+	seta	r9b
+	lea	rax, [rcx + r10]
+	cmp	rsi, rdx
+	seta	r11b
+	cmp	rax, r8
+	seta	al
+	cmp	rsi, rcx
+	seta	sil
+	xor	edi, edi
+	test	r9b, r11b
+	jne	.LBB0_734
+# %bb.725:
+	and	al, sil
+	jne	.LBB0_734
+# %bb.726:
+	mov	edi, r10d
+	and	edi, -32
+	lea	rsi, [rdi - 32]
+	mov	rax, rsi
+	shr	rax, 5
+	add	rax, 1
+	mov	r9d, eax
+	and	r9d, 3
+	cmp	rsi, 96
+	jae	.LBB0_728
+# %bb.727:
+	xor	esi, esi
+	jmp	.LBB0_730
+.LBB0_58:
+	lea	rsi, [r8 + r10]
+	lea	rax, [rdx + r10]
+	cmp	rax, r8
+	seta	r9b
+	lea	rax, [rcx + r10]
+	cmp	rsi, rdx
+	seta	r11b
+	cmp	rax, r8
+	seta	al
+	cmp	rsi, rcx
+	seta	dil
+	xor	esi, esi
+	test	r9b, r11b
+	jne	.LBB0_63
+# %bb.59:
+	and	al, dil
+	jne	.LBB0_63
+# %bb.60:
+	mov	esi, r10d
+	and	esi, -128
+	xor	edi, edi
+.LBB0_61:                               # =>This Inner Loop Header: Depth=1
+	vmovdqu	ymm0, ymmword ptr [rcx + rdi]
+	vmovdqu	ymm1, ymmword ptr [rcx + rdi + 32]
+	vmovdqu	ymm2, ymmword ptr [rcx + rdi + 64]
+	vmovdqu	ymm3, ymmword ptr [rcx + rdi + 96]
+	vpaddb	ymm0, ymm0, ymmword ptr [rdx + rdi]
+	vpaddb	ymm1, ymm1, ymmword ptr [rdx + rdi + 32]
+	vpaddb	ymm2, ymm2, ymmword ptr [rdx + rdi + 64]
+	vpaddb	ymm3, ymm3, ymmword ptr [rdx + rdi + 96]
+	vmovdqu	ymmword ptr [r8 + rdi], ymm0
+	vmovdqu	ymmword ptr [r8 + rdi + 32], ymm1
+	vmovdqu	ymmword ptr [r8 + rdi + 64], ymm2
+	vmovdqu	ymmword ptr [r8 + rdi + 96], ymm3
+	sub	rdi, -128
+	cmp	rsi, rdi
+	jne	.LBB0_61
+# %bb.62:
+	cmp	rsi, r10
+	je	.LBB0_825
+.LBB0_63:
+	mov	r9, rsi
+	not	r9
+	add	r9, r10
+	mov	rdi, r10
+	and	rdi, 3
+	je	.LBB0_65
+.LBB0_64:                               # =>This Inner Loop Header: Depth=1
+	movzx	eax, byte ptr [rcx + rsi]
+	add	al, byte ptr [rdx + rsi]
+	mov	byte ptr [r8 + rsi], al
+	add	rsi, 1
+	add	rdi, -1
+	jne	.LBB0_64
+.LBB0_65:
+	cmp	r9, 3
+	jb	.LBB0_825
+.LBB0_66:                               # =>This Inner Loop Header: Depth=1
+	movzx	eax, byte ptr [rcx + rsi]
+	add	al, byte ptr [rdx + rsi]
+	mov	byte ptr [r8 + rsi], al
+	movzx	eax, byte ptr [rcx + rsi + 1]
+	add	al, byte ptr [rdx + rsi + 1]
+	mov	byte ptr [r8 + rsi + 1], al
+	movzx	eax, byte ptr [rcx + rsi + 2]
+	add	al, byte ptr [rdx + rsi + 2]
+	mov	byte ptr [r8 + rsi + 2], al
+	movzx	eax, byte ptr [rcx + rsi + 3]
+	add	al, byte ptr [rdx + rsi + 3]
+	mov	byte ptr [r8 + rsi + 3], al
+	add	rsi, 4
+	cmp	r10, rsi
+	jne	.LBB0_66
+	jmp	.LBB0_825
+.LBB0_324:
+	lea	rsi, [r8 + r10]
+	lea	rax, [rdx + r10]
+	cmp	rax, r8
+	seta	r9b
+	lea	rax, [rcx + r10]
+	cmp	rsi, rdx
+	seta	r11b
+	cmp	rax, r8
+	seta	al
+	cmp	rsi, rcx
+	seta	dil
+	xor	esi, esi
+	test	r9b, r11b
+	jne	.LBB0_329
+# %bb.325:
+	and	al, dil
+	jne	.LBB0_329
+# %bb.326:
+	mov	esi, r10d
+	and	esi, -128
+	xor	edi, edi
+.LBB0_327:                              # =>This Inner Loop Header: Depth=1
+	vmovdqu	ymm0, ymmword ptr [rdx + rdi]
+	vmovdqu	ymm1, ymmword ptr [rdx + rdi + 32]
+	vmovdqu	ymm2, ymmword ptr [rdx + rdi + 64]
+	vmovdqu	ymm3, ymmword ptr [rdx + rdi + 96]
+	vpsubb	ymm0, ymm0, ymmword ptr [rcx + rdi]
+	vpsubb	ymm1, ymm1, ymmword ptr [rcx + rdi + 32]
+	vpsubb	ymm2, ymm2, ymmword ptr [rcx + rdi + 64]
+	vpsubb	ymm3, ymm3, ymmword ptr [rcx + rdi + 96]
+	vmovdqu	ymmword ptr [r8 + rdi], ymm0
+	vmovdqu	ymmword ptr [r8 + rdi + 32], ymm1
+	vmovdqu	ymmword ptr [r8 + rdi + 64], ymm2
+	vmovdqu	ymmword ptr [r8 + rdi + 96], ymm3
+	sub	rdi, -128
+	cmp	rsi, rdi
+	jne	.LBB0_327
+# %bb.328:
+	cmp	rsi, r10
+	je	.LBB0_825
+.LBB0_329:
+	mov	r9, rsi
+	not	r9
+	add	r9, r10
+	mov	rdi, r10
+	and	rdi, 3
+	je	.LBB0_331
+.LBB0_330:                              # =>This Inner Loop Header: Depth=1
+	movzx	eax, byte ptr [rdx + rsi]
+	sub	al, byte ptr [rcx + rsi]
+	mov	byte ptr [r8 + rsi], al
+	add	rsi, 1
+	add	rdi, -1
+	jne	.LBB0_330
+.LBB0_331:
+	cmp	r9, 3
+	jb	.LBB0_825
+.LBB0_332:                              # =>This Inner Loop Header: Depth=1
+	movzx	eax, byte ptr [rdx + rsi]
+	sub	al, byte ptr [rcx + rsi]
+	mov	byte ptr [r8 + rsi], al
+	movzx	eax, byte ptr [rdx + rsi + 1]
+	sub	al, byte ptr [rcx + rsi + 1]
+	mov	byte ptr [r8 + rsi + 1], al
+	movzx	eax, byte ptr [rdx + rsi + 2]
+	sub	al, byte ptr [rcx + rsi + 2]
+	mov	byte ptr [r8 + rsi + 2], al
+	movzx	eax, byte ptr [rdx + rsi + 3]
+	sub	al, byte ptr [rcx + rsi + 3]
+	mov	byte ptr [r8 + rsi + 3], al
+	add	rsi, 4
+	cmp	r10, rsi
+	jne	.LBB0_332
+	jmp	.LBB0_825
+.LBB0_191:
+	lea	rsi, [r8 + r10]
+	lea	rax, [rdx + r10]
+	cmp	rax, r8
+	seta	r9b
+	lea	rax, [rcx + r10]
+	cmp	rsi, rdx
+	seta	r11b
+	cmp	rax, r8
+	seta	al
+	cmp	rsi, rcx
+	seta	dil
+	xor	esi, esi
+	test	r9b, r11b
+	jne	.LBB0_196
+# %bb.192:
+	and	al, dil
+	jne	.LBB0_196
+# %bb.193:
+	mov	esi, r10d
+	and	esi, -128
+	xor	edi, edi
+.LBB0_194:                              # =>This Inner Loop Header: Depth=1
+	vmovdqu	ymm0, ymmword ptr [rcx + rdi]
+	vmovdqu	ymm1, ymmword ptr [rcx + rdi + 32]
+	vmovdqu	ymm2, ymmword ptr [rcx + rdi + 64]
+	vmovdqu	ymm3, ymmword ptr [rcx + rdi + 96]
+	vpaddb	ymm0, ymm0, ymmword ptr [rdx + rdi]
+	vpaddb	ymm1, ymm1, ymmword ptr [rdx + rdi + 32]
+	vpaddb	ymm2, ymm2, ymmword ptr [rdx + rdi + 64]
+	vpaddb	ymm3, ymm3, ymmword ptr [rdx + rdi + 96]
+	vmovdqu	ymmword ptr [r8 + rdi], ymm0
+	vmovdqu	ymmword ptr [r8 + rdi + 32], ymm1
+	vmovdqu	ymmword ptr [r8 + rdi + 64], ymm2
+	vmovdqu	ymmword ptr [r8 + rdi + 96], ymm3
+	sub	rdi, -128
+	cmp	rsi, rdi
+	jne	.LBB0_194
+# %bb.195:
+	cmp	rsi, r10
+	je	.LBB0_825
+.LBB0_196:
+	mov	r9, rsi
+	not	r9
+	add	r9, r10
+	mov	rdi, r10
+	and	rdi, 3
+	je	.LBB0_198
+.LBB0_197:                              # =>This Inner Loop Header: Depth=1
+	movzx	eax, byte ptr [rcx + rsi]
+	add	al, byte ptr [rdx + rsi]
+	mov	byte ptr [r8 + rsi], al
+	add	rsi, 1
+	add	rdi, -1
+	jne	.LBB0_197
+.LBB0_198:
+	cmp	r9, 3
+	jb	.LBB0_825
+.LBB0_199:                              # =>This Inner Loop Header: Depth=1
+	movzx	eax, byte ptr [rcx + rsi]
+	add	al, byte ptr [rdx + rsi]
+	mov	byte ptr [r8 + rsi], al
+	movzx	eax, byte ptr [rcx + rsi + 1]
+	add	al, byte ptr [rdx + rsi + 1]
+	mov	byte ptr [r8 + rsi + 1], al
+	movzx	eax, byte ptr [rcx + rsi + 2]
+	add	al, byte ptr [rdx + rsi + 2]
+	mov	byte ptr [r8 + rsi + 2], al
+	movzx	eax, byte ptr [rcx + rsi + 3]
+	add	al, byte ptr [rdx + rsi + 3]
+	mov	byte ptr [r8 + rsi + 3], al
+	add	rsi, 4
+	cmp	r10, rsi
+	jne	.LBB0_199
+	jmp	.LBB0_825
+.LBB0_457:
+	lea	rsi, [r8 + r10]
+	lea	rax, [rdx + r10]
+	cmp	rax, r8
+	seta	r9b
+	lea	rax, [rcx + r10]
+	cmp	rsi, rdx
+	seta	r11b
+	cmp	rax, r8
+	seta	al
+	cmp	rsi, rcx
+	seta	dil
+	xor	esi, esi
+	test	r9b, r11b
+	jne	.LBB0_462
+# %bb.458:
+	and	al, dil
+	jne	.LBB0_462
+# %bb.459:
+	mov	esi, r10d
+	and	esi, -128
+	xor	edi, edi
+.LBB0_460:                              # =>This Inner Loop Header: Depth=1
+	vmovdqu	ymm0, ymmword ptr [rdx + rdi]
+	vmovdqu	ymm1, ymmword ptr [rdx + rdi + 32]
+	vmovdqu	ymm2, ymmword ptr [rdx + rdi + 64]
+	vmovdqu	ymm3, ymmword ptr [rdx + rdi + 96]
+	vpsubb	ymm0, ymm0, ymmword ptr [rcx + rdi]
+	vpsubb	ymm1, ymm1, ymmword ptr [rcx + rdi + 32]
+	vpsubb	ymm2, ymm2, ymmword ptr [rcx + rdi + 64]
+	vpsubb	ymm3, ymm3, ymmword ptr [rcx + rdi + 96]
+	vmovdqu	ymmword ptr [r8 + rdi], ymm0
+	vmovdqu	ymmword ptr [r8 + rdi + 32], ymm1
+	vmovdqu	ymmword ptr [r8 + rdi + 64], ymm2
+	vmovdqu	ymmword ptr [r8 + rdi + 96], ymm3
+	sub	rdi, -128
+	cmp	rsi, rdi
+	jne	.LBB0_460
+# %bb.461:
+	cmp	rsi, r10
+	je	.LBB0_825
+.LBB0_462:
+	mov	r9, rsi
+	not	r9
+	add	r9, r10
+	mov	rdi, r10
+	and	rdi, 3
+	je	.LBB0_464
+.LBB0_463:                              # =>This Inner Loop Header: Depth=1
+	movzx	eax, byte ptr [rdx + rsi]
+	sub	al, byte ptr [rcx + rsi]
+	mov	byte ptr [r8 + rsi], al
+	add	rsi, 1
+	add	rdi, -1
+	jne	.LBB0_463
+.LBB0_464:
+	cmp	r9, 3
+	jb	.LBB0_825
+.LBB0_465:                              # =>This Inner Loop Header: Depth=1
+	movzx	eax, byte ptr [rdx + rsi]
+	sub	al, byte ptr [rcx + rsi]
+	mov	byte ptr [r8 + rsi], al
+	movzx	eax, byte ptr [rdx + rsi + 1]
+	sub	al, byte ptr [rcx + rsi + 1]
+	mov	byte ptr [r8 + rsi + 1], al
+	movzx	eax, byte ptr [rdx + rsi + 2]
+	sub	al, byte ptr [rcx + rsi + 2]
+	mov	byte ptr [r8 + rsi + 2], al
+	movzx	eax, byte ptr [rdx + rsi + 3]
+	sub	al, byte ptr [rcx + rsi + 3]
+	mov	byte ptr [r8 + rsi + 3], al
+	add	rsi, 4
+	cmp	r10, rsi
+	jne	.LBB0_465
+	jmp	.LBB0_825
+.LBB0_647:
+	lea	rsi, [r8 + 8*r10]
+	lea	rax, [rdx + 8*r10]
+	cmp	rax, r8
+	seta	r9b
+	lea	rax, [rcx + 8*r10]
+	cmp	rsi, rdx
+	seta	r11b
+	cmp	rax, r8
+	seta	al
+	cmp	rsi, rcx
+	seta	dil
+	xor	esi, esi
+	test	r9b, r11b
+	jne	.LBB0_652
+# %bb.648:
+	and	al, dil
+	jne	.LBB0_652
+# %bb.649:
+	mov	esi, r10d
+	and	esi, -16
+	xor	edi, edi
+.LBB0_650:                              # =>This Inner Loop Header: Depth=1
+	vmovdqu	ymm1, ymmword ptr [rdx + 8*rdi]
+	vmovdqu	ymm2, ymmword ptr [rdx + 8*rdi + 32]
+	vmovdqu	ymm3, ymmword ptr [rdx + 8*rdi + 64]
+	vmovdqu	ymm0, ymmword ptr [rdx + 8*rdi + 96]
+	vmovdqu	ymm4, ymmword ptr [rcx + 8*rdi]
+	vmovdqu	ymm5, ymmword ptr [rcx + 8*rdi + 32]
+	vmovdqu	ymm6, ymmword ptr [rcx + 8*rdi + 64]
+	vmovdqu	ymm7, ymmword ptr [rcx + 8*rdi + 96]
+	vpsrlq	ymm8, ymm4, 32
+	vpmuludq	ymm8, ymm8, ymm1
+	vpsrlq	ymm9, ymm1, 32
+	vpmuludq	ymm9, ymm9, ymm4
+	vpaddq	ymm8, ymm9, ymm8
+	vpsllq	ymm8, ymm8, 32
+	vpmuludq	ymm1, ymm4, ymm1
+	vpaddq	ymm1, ymm8, ymm1
+	vpsrlq	ymm4, ymm5, 32
+	vpmuludq	ymm4, ymm4, ymm2
+	vpsrlq	ymm8, ymm2, 32
+	vpmuludq	ymm8, ymm8, ymm5
+	vpaddq	ymm4, ymm8, ymm4
+	vpsllq	ymm4, ymm4, 32
+	vpmuludq	ymm2, ymm5, ymm2
+	vpaddq	ymm2, ymm2, ymm4
+	vpsrlq	ymm4, ymm6, 32
+	vpmuludq	ymm4, ymm4, ymm3
+	vpsrlq	ymm5, ymm3, 32
+	vpmuludq	ymm5, ymm6, ymm5
+	vpaddq	ymm4, ymm5, ymm4
+	vpsllq	ymm4, ymm4, 32
+	vpmuludq	ymm3, ymm6, ymm3
+	vpaddq	ymm3, ymm3, ymm4
+	vpsrlq	ymm4, ymm7, 32
+	vpmuludq	ymm4, ymm4, ymm0
+	vpsrlq	ymm5, ymm0, 32
+	vpmuludq	ymm5, ymm7, ymm5
+	vpaddq	ymm4, ymm5, ymm4
+	vpsllq	ymm4, ymm4, 32
+	vpmuludq	ymm0, ymm7, ymm0
+	vpaddq	ymm0, ymm0, ymm4
+	vmovdqu	ymmword ptr [r8 + 8*rdi], ymm1
+	vmovdqu	ymmword ptr [r8 + 8*rdi + 32], ymm2
+	vmovdqu	ymmword ptr [r8 + 8*rdi + 64], ymm3
+	vmovdqu	ymmword ptr [r8 + 8*rdi + 96], ymm0
+	add	rdi, 16
+	cmp	rsi, rdi
+	jne	.LBB0_650
+# %bb.651:
+	cmp	rsi, r10
+	je	.LBB0_825
+.LBB0_652:
+	mov	r9, rsi
+	not	r9
+	add	r9, r10
+	mov	rax, r10
+	and	rax, 3
+	je	.LBB0_654
+.LBB0_653:                              # =>This Inner Loop Header: Depth=1
+	mov	rdi, qword ptr [rcx + 8*rsi]
+	imul	rdi, qword ptr [rdx + 8*rsi]
+	mov	qword ptr [r8 + 8*rsi], rdi
+	add	rsi, 1
+	add	rax, -1
+	jne	.LBB0_653
+.LBB0_654:
+	cmp	r9, 3
+	jb	.LBB0_825
+.LBB0_655:                              # =>This Inner Loop Header: Depth=1
+	mov	rax, qword ptr [rcx + 8*rsi]
+	imul	rax, qword ptr [rdx + 8*rsi]
+	mov	qword ptr [r8 + 8*rsi], rax
+	mov	rax, qword ptr [rcx + 8*rsi + 8]
+	imul	rax, qword ptr [rdx + 8*rsi + 8]
+	mov	qword ptr [r8 + 8*rsi + 8], rax
+	mov	rax, qword ptr [rcx + 8*rsi + 16]
+	imul	rax, qword ptr [rdx + 8*rsi + 16]
+	mov	qword ptr [r8 + 8*rsi + 16], rax
+	mov	rax, qword ptr [rcx + 8*rsi + 24]
+	imul	rax, qword ptr [rdx + 8*rsi + 24]
+	mov	qword ptr [r8 + 8*rsi + 24], rax
+	add	rsi, 4
+	cmp	r10, rsi
+	jne	.LBB0_655
+	jmp	.LBB0_825
+.LBB0_783:
+	lea	rsi, [r8 + 8*r10]
+	lea	rax, [rdx + 8*r10]
+	cmp	rax, r8
+	seta	r9b
+	lea	rax, [rcx + 8*r10]
+	cmp	rsi, rdx
+	seta	r11b
+	cmp	rax, r8
+	seta	al
+	cmp	rsi, rcx
+	seta	dil
+	xor	esi, esi
+	test	r9b, r11b
+	jne	.LBB0_788
+# %bb.784:
+	and	al, dil
+	jne	.LBB0_788
+# %bb.785:
+	mov	esi, r10d
+	and	esi, -16
+	xor	edi, edi
+.LBB0_786:                              # =>This Inner Loop Header: Depth=1
+	vmovdqu	ymm1, ymmword ptr [rdx + 8*rdi]
+	vmovdqu	ymm2, ymmword ptr [rdx + 8*rdi + 32]
+	vmovdqu	ymm3, ymmword ptr [rdx + 8*rdi + 64]
+	vmovdqu	ymm0, ymmword ptr [rdx + 8*rdi + 96]
+	vmovdqu	ymm4, ymmword ptr [rcx + 8*rdi]
+	vmovdqu	ymm5, ymmword ptr [rcx + 8*rdi + 32]
+	vmovdqu	ymm6, ymmword ptr [rcx + 8*rdi + 64]
+	vmovdqu	ymm7, ymmword ptr [rcx + 8*rdi + 96]
+	vpsrlq	ymm8, ymm4, 32
+	vpmuludq	ymm8, ymm8, ymm1
+	vpsrlq	ymm9, ymm1, 32
+	vpmuludq	ymm9, ymm9, ymm4
+	vpaddq	ymm8, ymm9, ymm8
+	vpsllq	ymm8, ymm8, 32
+	vpmuludq	ymm1, ymm4, ymm1
+	vpaddq	ymm1, ymm8, ymm1
+	vpsrlq	ymm4, ymm5, 32
+	vpmuludq	ymm4, ymm4, ymm2
+	vpsrlq	ymm8, ymm2, 32
+	vpmuludq	ymm8, ymm8, ymm5
+	vpaddq	ymm4, ymm8, ymm4
+	vpsllq	ymm4, ymm4, 32
+	vpmuludq	ymm2, ymm5, ymm2
+	vpaddq	ymm2, ymm2, ymm4
+	vpsrlq	ymm4, ymm6, 32
+	vpmuludq	ymm4, ymm4, ymm3
+	vpsrlq	ymm5, ymm3, 32
+	vpmuludq	ymm5, ymm6, ymm5
+	vpaddq	ymm4, ymm5, ymm4
+	vpsllq	ymm4, ymm4, 32
+	vpmuludq	ymm3, ymm6, ymm3
+	vpaddq	ymm3, ymm3, ymm4
+	vpsrlq	ymm4, ymm7, 32
+	vpmuludq	ymm4, ymm4, ymm0
+	vpsrlq	ymm5, ymm0, 32
+	vpmuludq	ymm5, ymm7, ymm5
+	vpaddq	ymm4, ymm5, ymm4
+	vpsllq	ymm4, ymm4, 32
+	vpmuludq	ymm0, ymm7, ymm0
+	vpaddq	ymm0, ymm0, ymm4
+	vmovdqu	ymmword ptr [r8 + 8*rdi], ymm1
+	vmovdqu	ymmword ptr [r8 + 8*rdi + 32], ymm2
+	vmovdqu	ymmword ptr [r8 + 8*rdi + 64], ymm3
+	vmovdqu	ymmword ptr [r8 + 8*rdi + 96], ymm0
+	add	rdi, 16
+	cmp	rsi, rdi
+	jne	.LBB0_786
+# %bb.787:
+	cmp	rsi, r10
+	je	.LBB0_825
+.LBB0_788:
+	mov	r9, rsi
+	not	r9
+	add	r9, r10
+	mov	rax, r10
+	and	rax, 3
+	je	.LBB0_790
+.LBB0_789:                              # =>This Inner Loop Header: Depth=1
+	mov	rdi, qword ptr [rcx + 8*rsi]
+	imul	rdi, qword ptr [rdx + 8*rsi]
+	mov	qword ptr [r8 + 8*rsi], rdi
+	add	rsi, 1
+	add	rax, -1
+	jne	.LBB0_789
+.LBB0_790:
+	cmp	r9, 3
+	jb	.LBB0_825
+.LBB0_791:                              # =>This Inner Loop Header: Depth=1
+	mov	rax, qword ptr [rcx + 8*rsi]
+	imul	rax, qword ptr [rdx + 8*rsi]
+	mov	qword ptr [r8 + 8*rsi], rax
+	mov	rax, qword ptr [rcx + 8*rsi + 8]
+	imul	rax, qword ptr [rdx + 8*rsi + 8]
+	mov	qword ptr [r8 + 8*rsi + 8], rax
+	mov	rax, qword ptr [rcx + 8*rsi + 16]
+	imul	rax, qword ptr [rdx + 8*rsi + 16]
+	mov	qword ptr [r8 + 8*rsi + 16], rax
+	mov	rax, qword ptr [rcx + 8*rsi + 24]
+	imul	rax, qword ptr [rdx + 8*rsi + 24]
+	mov	qword ptr [r8 + 8*rsi + 24], rax
+	add	rsi, 4
+	cmp	r10, rsi
+	jne	.LBB0_791
+	jmp	.LBB0_825
+.LBB0_112:
+	lea	rsi, [r8 + 8*r10]
+	lea	rax, [rdx + 8*r10]
+	cmp	rax, r8
+	seta	r9b
+	lea	rax, [rcx + 8*r10]
+	cmp	rsi, rdx
+	seta	r11b
+	cmp	rax, r8
+	seta	al
+	cmp	rsi, rcx
+	seta	dil
+	xor	esi, esi
+	test	r9b, r11b
+	jne	.LBB0_117
+# %bb.113:
+	and	al, dil
+	jne	.LBB0_117
+# %bb.114:
+	mov	esi, r10d
+	and	esi, -16
+	xor	edi, edi
+.LBB0_115:                              # =>This Inner Loop Header: Depth=1
+	vmovdqu	ymm0, ymmword ptr [rcx + 8*rdi]
+	vmovdqu	ymm1, ymmword ptr [rcx + 8*rdi + 32]
+	vmovdqu	ymm2, ymmword ptr [rcx + 8*rdi + 64]
+	vmovdqu	ymm3, ymmword ptr [rcx + 8*rdi + 96]
+	vpaddq	ymm0, ymm0, ymmword ptr [rdx + 8*rdi]
+	vpaddq	ymm1, ymm1, ymmword ptr [rdx + 8*rdi + 32]
+	vpaddq	ymm2, ymm2, ymmword ptr [rdx + 8*rdi + 64]
+	vpaddq	ymm3, ymm3, ymmword ptr [rdx + 8*rdi + 96]
+	vmovdqu	ymmword ptr [r8 + 8*rdi], ymm0
+	vmovdqu	ymmword ptr [r8 + 8*rdi + 32], ymm1
+	vmovdqu	ymmword ptr [r8 + 8*rdi + 64], ymm2
+	vmovdqu	ymmword ptr [r8 + 8*rdi + 96], ymm3
+	add	rdi, 16
+	cmp	rsi, rdi
+	jne	.LBB0_115
+# %bb.116:
+	cmp	rsi, r10
+	je	.LBB0_825
+.LBB0_117:
+	mov	r9, rsi
+	not	r9
+	add	r9, r10
+	mov	rax, r10
+	and	rax, 3
+	je	.LBB0_119
+.LBB0_118:                              # =>This Inner Loop Header: Depth=1
+	mov	rdi, qword ptr [rcx + 8*rsi]
+	add	rdi, qword ptr [rdx + 8*rsi]
+	mov	qword ptr [r8 + 8*rsi], rdi
+	add	rsi, 1
+	add	rax, -1
+	jne	.LBB0_118
+.LBB0_119:
+	cmp	r9, 3
+	jb	.LBB0_825
+.LBB0_120:                              # =>This Inner Loop Header: Depth=1
+	mov	rax, qword ptr [rcx + 8*rsi]
+	add	rax, qword ptr [rdx + 8*rsi]
+	mov	qword ptr [r8 + 8*rsi], rax
+	mov	rax, qword ptr [rcx + 8*rsi + 8]
+	add	rax, qword ptr [rdx + 8*rsi + 8]
+	mov	qword ptr [r8 + 8*rsi + 8], rax
+	mov	rax, qword ptr [rcx + 8*rsi + 16]
+	add	rax, qword ptr [rdx + 8*rsi + 16]
+	mov	qword ptr [r8 + 8*rsi + 16], rax
+	mov	rax, qword ptr [rcx + 8*rsi + 24]
+	add	rax, qword ptr [rdx + 8*rsi + 24]
+	mov	qword ptr [r8 + 8*rsi + 24], rax
+	add	rsi, 4
+	cmp	r10, rsi
+	jne	.LBB0_120
+	jmp	.LBB0_825
+.LBB0_378:
+	lea	rsi, [r8 + 8*r10]
+	lea	rax, [rdx + 8*r10]
+	cmp	rax, r8
+	seta	r9b
+	lea	rax, [rcx + 8*r10]
+	cmp	rsi, rdx
+	seta	r11b
+	cmp	rax, r8
+	seta	al
+	cmp	rsi, rcx
+	seta	dil
+	xor	esi, esi
+	test	r9b, r11b
+	jne	.LBB0_383
+# %bb.379:
+	and	al, dil
+	jne	.LBB0_383
+# %bb.380:
+	mov	esi, r10d
+	and	esi, -16
+	xor	edi, edi
+.LBB0_381:                              # =>This Inner Loop Header: Depth=1
+	vmovdqu	ymm0, ymmword ptr [rdx + 8*rdi]
+	vmovdqu	ymm1, ymmword ptr [rdx + 8*rdi + 32]
+	vmovdqu	ymm2, ymmword ptr [rdx + 8*rdi + 64]
+	vmovdqu	ymm3, ymmword ptr [rdx + 8*rdi + 96]
+	vpsubq	ymm0, ymm0, ymmword ptr [rcx + 8*rdi]
+	vpsubq	ymm1, ymm1, ymmword ptr [rcx + 8*rdi + 32]
+	vpsubq	ymm2, ymm2, ymmword ptr [rcx + 8*rdi + 64]
+	vpsubq	ymm3, ymm3, ymmword ptr [rcx + 8*rdi + 96]
+	vmovdqu	ymmword ptr [r8 + 8*rdi], ymm0
+	vmovdqu	ymmword ptr [r8 + 8*rdi + 32], ymm1
+	vmovdqu	ymmword ptr [r8 + 8*rdi + 64], ymm2
+	vmovdqu	ymmword ptr [r8 + 8*rdi + 96], ymm3
+	add	rdi, 16
+	cmp	rsi, rdi
+	jne	.LBB0_381
+# %bb.382:
+	cmp	rsi, r10
+	je	.LBB0_825
+.LBB0_383:
+	mov	r9, rsi
+	not	r9
+	add	r9, r10
+	mov	rax, r10
+	and	rax, 3
+	je	.LBB0_385
+.LBB0_384:                              # =>This Inner Loop Header: Depth=1
+	mov	rdi, qword ptr [rdx + 8*rsi]
+	sub	rdi, qword ptr [rcx + 8*rsi]
+	mov	qword ptr [r8 + 8*rsi], rdi
+	add	rsi, 1
+	add	rax, -1
+	jne	.LBB0_384
+.LBB0_385:
+	cmp	r9, 3
+	jb	.LBB0_825
+.LBB0_386:                              # =>This Inner Loop Header: Depth=1
+	mov	rax, qword ptr [rdx + 8*rsi]
+	sub	rax, qword ptr [rcx + 8*rsi]
+	mov	qword ptr [r8 + 8*rsi], rax
+	mov	rax, qword ptr [rdx + 8*rsi + 8]
+	sub	rax, qword ptr [rcx + 8*rsi + 8]
+	mov	qword ptr [r8 + 8*rsi + 8], rax
+	mov	rax, qword ptr [rdx + 8*rsi + 16]
+	sub	rax, qword ptr [rcx + 8*rsi + 16]
+	mov	qword ptr [r8 + 8*rsi + 16], rax
+	mov	rax, qword ptr [rdx + 8*rsi + 24]
+	sub	rax, qword ptr [rcx + 8*rsi + 24]
+	mov	qword ptr [r8 + 8*rsi + 24], rax
+	add	rsi, 4
+	cmp	r10, rsi
+	jne	.LBB0_386
+	jmp	.LBB0_825
+.LBB0_245:
+	lea	rsi, [r8 + 8*r10]
+	lea	rax, [rdx + 8*r10]
+	cmp	rax, r8
+	seta	r9b
+	lea	rax, [rcx + 8*r10]
+	cmp	rsi, rdx
+	seta	r11b
+	cmp	rax, r8
+	seta	al
+	cmp	rsi, rcx
+	seta	dil
+	xor	esi, esi
+	test	r9b, r11b
+	jne	.LBB0_250
+# %bb.246:
+	and	al, dil
+	jne	.LBB0_250
+# %bb.247:
+	mov	esi, r10d
+	and	esi, -16
+	xor	edi, edi
+.LBB0_248:                              # =>This Inner Loop Header: Depth=1
+	vmovdqu	ymm0, ymmword ptr [rcx + 8*rdi]
+	vmovdqu	ymm1, ymmword ptr [rcx + 8*rdi + 32]
+	vmovdqu	ymm2, ymmword ptr [rcx + 8*rdi + 64]
+	vmovdqu	ymm3, ymmword ptr [rcx + 8*rdi + 96]
+	vpaddq	ymm0, ymm0, ymmword ptr [rdx + 8*rdi]
+	vpaddq	ymm1, ymm1, ymmword ptr [rdx + 8*rdi + 32]
+	vpaddq	ymm2, ymm2, ymmword ptr [rdx + 8*rdi + 64]
+	vpaddq	ymm3, ymm3, ymmword ptr [rdx + 8*rdi + 96]
+	vmovdqu	ymmword ptr [r8 + 8*rdi], ymm0
+	vmovdqu	ymmword ptr [r8 + 8*rdi + 32], ymm1
+	vmovdqu	ymmword ptr [r8 + 8*rdi + 64], ymm2
+	vmovdqu	ymmword ptr [r8 + 8*rdi + 96], ymm3
+	add	rdi, 16
+	cmp	rsi, rdi
+	jne	.LBB0_248
+# %bb.249:
+	cmp	rsi, r10
+	je	.LBB0_825
+.LBB0_250:
+	mov	r9, rsi
+	not	r9
+	add	r9, r10
+	mov	rax, r10
+	and	rax, 3
+	je	.LBB0_252
+.LBB0_251:                              # =>This Inner Loop Header: Depth=1
+	mov	rdi, qword ptr [rcx + 8*rsi]
+	add	rdi, qword ptr [rdx + 8*rsi]
+	mov	qword ptr [r8 + 8*rsi], rdi
+	add	rsi, 1
+	add	rax, -1
+	jne	.LBB0_251
+.LBB0_252:
+	cmp	r9, 3
+	jb	.LBB0_825
+.LBB0_253:                              # =>This Inner Loop Header: Depth=1
+	mov	rax, qword ptr [rcx + 8*rsi]
+	add	rax, qword ptr [rdx + 8*rsi]
+	mov	qword ptr [r8 + 8*rsi], rax
+	mov	rax, qword ptr [rcx + 8*rsi + 8]
+	add	rax, qword ptr [rdx + 8*rsi + 8]
+	mov	qword ptr [r8 + 8*rsi + 8], rax
+	mov	rax, qword ptr [rcx + 8*rsi + 16]
+	add	rax, qword ptr [rdx + 8*rsi + 16]
+	mov	qword ptr [r8 + 8*rsi + 16], rax
+	mov	rax, qword ptr [rcx + 8*rsi + 24]
+	add	rax, qword ptr [rdx + 8*rsi + 24]
+	mov	qword ptr [r8 + 8*rsi + 24], rax
+	add	rsi, 4
+	cmp	r10, rsi
+	jne	.LBB0_253
+	jmp	.LBB0_825
+.LBB0_511:
+	lea	rsi, [r8 + 8*r10]
+	lea	rax, [rdx + 8*r10]
+	cmp	rax, r8
+	seta	r9b
+	lea	rax, [rcx + 8*r10]
+	cmp	rsi, rdx
+	seta	r11b
+	cmp	rax, r8
+	seta	al
+	cmp	rsi, rcx
+	seta	dil
+	xor	esi, esi
+	test	r9b, r11b
+	jne	.LBB0_516
+# %bb.512:
+	and	al, dil
+	jne	.LBB0_516
+# %bb.513:
+	mov	esi, r10d
+	and	esi, -16
+	xor	edi, edi
+.LBB0_514:                              # =>This Inner Loop Header: Depth=1
+	vmovdqu	ymm0, ymmword ptr [rdx + 8*rdi]
+	vmovdqu	ymm1, ymmword ptr [rdx + 8*rdi + 32]
+	vmovdqu	ymm2, ymmword ptr [rdx + 8*rdi + 64]
+	vmovdqu	ymm3, ymmword ptr [rdx + 8*rdi + 96]
+	vpsubq	ymm0, ymm0, ymmword ptr [rcx + 8*rdi]
+	vpsubq	ymm1, ymm1, ymmword ptr [rcx + 8*rdi + 32]
+	vpsubq	ymm2, ymm2, ymmword ptr [rcx + 8*rdi + 64]
+	vpsubq	ymm3, ymm3, ymmword ptr [rcx + 8*rdi + 96]
+	vmovdqu	ymmword ptr [r8 + 8*rdi], ymm0
+	vmovdqu	ymmword ptr [r8 + 8*rdi + 32], ymm1
+	vmovdqu	ymmword ptr [r8 + 8*rdi + 64], ymm2
+	vmovdqu	ymmword ptr [r8 + 8*rdi + 96], ymm3
+	add	rdi, 16
+	cmp	rsi, rdi
+	jne	.LBB0_514
+# %bb.515:
+	cmp	rsi, r10
+	je	.LBB0_825
+.LBB0_516:
+	mov	r9, rsi
+	not	r9
+	add	r9, r10
+	mov	rax, r10
+	and	rax, 3
+	je	.LBB0_518
+.LBB0_517:                              # =>This Inner Loop Header: Depth=1
+	mov	rdi, qword ptr [rdx + 8*rsi]
+	sub	rdi, qword ptr [rcx + 8*rsi]
+	mov	qword ptr [r8 + 8*rsi], rdi
+	add	rsi, 1
+	add	rax, -1
+	jne	.LBB0_517
+.LBB0_518:
+	cmp	r9, 3
+	jb	.LBB0_825
+.LBB0_519:                              # =>This Inner Loop Header: Depth=1
+	mov	rax, qword ptr [rdx + 8*rsi]
+	sub	rax, qword ptr [rcx + 8*rsi]
+	mov	qword ptr [r8 + 8*rsi], rax
+	mov	rax, qword ptr [rdx + 8*rsi + 8]
+	sub	rax, qword ptr [rcx + 8*rsi + 8]
+	mov	qword ptr [r8 + 8*rsi + 8], rax
+	mov	rax, qword ptr [rdx + 8*rsi + 16]
+	sub	rax, qword ptr [rcx + 8*rsi + 16]
+	mov	qword ptr [r8 + 8*rsi + 16], rax
+	mov	rax, qword ptr [rdx + 8*rsi + 24]
+	sub	rax, qword ptr [rcx + 8*rsi + 24]
+	mov	qword ptr [r8 + 8*rsi + 24], rax
+	add	rsi, 4
+	cmp	r10, rsi
+	jne	.LBB0_519
+	jmp	.LBB0_825
+.LBB0_70:
+	lea	rsi, [r8 + 2*r10]
+	lea	rax, [rdx + 2*r10]
+	cmp	rax, r8
+	seta	r9b
+	lea	rax, [rcx + 2*r10]
+	cmp	rsi, rdx
+	seta	r11b
+	cmp	rax, r8
+	seta	al
+	cmp	rsi, rcx
+	seta	dil
+	xor	esi, esi
+	test	r9b, r11b
+	jne	.LBB0_75
+# %bb.71:
+	and	al, dil
+	jne	.LBB0_75
+# %bb.72:
+	mov	esi, r10d
+	and	esi, -64
+	xor	edi, edi
+.LBB0_73:                               # =>This Inner Loop Header: Depth=1
+	vmovdqu	ymm0, ymmword ptr [rcx + 2*rdi]
+	vmovdqu	ymm1, ymmword ptr [rcx + 2*rdi + 32]
+	vmovdqu	ymm2, ymmword ptr [rcx + 2*rdi + 64]
+	vmovdqu	ymm3, ymmword ptr [rcx + 2*rdi + 96]
+	vpaddw	ymm0, ymm0, ymmword ptr [rdx + 2*rdi]
+	vpaddw	ymm1, ymm1, ymmword ptr [rdx + 2*rdi + 32]
+	vpaddw	ymm2, ymm2, ymmword ptr [rdx + 2*rdi + 64]
+	vpaddw	ymm3, ymm3, ymmword ptr [rdx + 2*rdi + 96]
+	vmovdqu	ymmword ptr [r8 + 2*rdi], ymm0
+	vmovdqu	ymmword ptr [r8 + 2*rdi + 32], ymm1
+	vmovdqu	ymmword ptr [r8 + 2*rdi + 64], ymm2
+	vmovdqu	ymmword ptr [r8 + 2*rdi + 96], ymm3
+	add	rdi, 64
+	cmp	rsi, rdi
+	jne	.LBB0_73
+# %bb.74:
+	cmp	rsi, r10
+	je	.LBB0_825
+.LBB0_75:
+	mov	r9, rsi
+	not	r9
+	add	r9, r10
+	mov	rax, r10
+	and	rax, 3
+	je	.LBB0_77
+.LBB0_76:                               # =>This Inner Loop Header: Depth=1
+	movzx	edi, word ptr [rcx + 2*rsi]
+	add	di, word ptr [rdx + 2*rsi]
+	mov	word ptr [r8 + 2*rsi], di
+	add	rsi, 1
+	add	rax, -1
+	jne	.LBB0_76
+.LBB0_77:
+	cmp	r9, 3
+	jb	.LBB0_825
+.LBB0_78:                               # =>This Inner Loop Header: Depth=1
+	movzx	eax, word ptr [rcx + 2*rsi]
+	add	ax, word ptr [rdx + 2*rsi]
+	mov	word ptr [r8 + 2*rsi], ax
+	movzx	eax, word ptr [rcx + 2*rsi + 2]
+	add	ax, word ptr [rdx + 2*rsi + 2]
+	mov	word ptr [r8 + 2*rsi + 2], ax
+	movzx	eax, word ptr [rcx + 2*rsi + 4]
+	add	ax, word ptr [rdx + 2*rsi + 4]
+	mov	word ptr [r8 + 2*rsi + 4], ax
+	movzx	eax, word ptr [rcx + 2*rsi + 6]
+	add	ax, word ptr [rdx + 2*rsi + 6]
+	mov	word ptr [r8 + 2*rsi + 6], ax
+	add	rsi, 4
+	cmp	r10, rsi
+	jne	.LBB0_78
+	jmp	.LBB0_825
+.LBB0_82:
+	lea	rsi, [r8 + 2*r10]
+	lea	rax, [rdx + 2*r10]
+	cmp	rax, r8
+	seta	r9b
+	lea	rax, [rcx + 2*r10]
+	cmp	rsi, rdx
+	seta	r11b
+	cmp	rax, r8
+	seta	al
+	cmp	rsi, rcx
+	seta	dil
+	xor	esi, esi
+	test	r9b, r11b
+	jne	.LBB0_87
+# %bb.83:
+	and	al, dil
+	jne	.LBB0_87
+# %bb.84:
+	mov	esi, r10d
+	and	esi, -64
+	xor	edi, edi
+.LBB0_85:                               # =>This Inner Loop Header: Depth=1
+	vmovdqu	ymm0, ymmword ptr [rcx + 2*rdi]
+	vmovdqu	ymm1, ymmword ptr [rcx + 2*rdi + 32]
+	vmovdqu	ymm2, ymmword ptr [rcx + 2*rdi + 64]
+	vmovdqu	ymm3, ymmword ptr [rcx + 2*rdi + 96]
+	vpaddw	ymm0, ymm0, ymmword ptr [rdx + 2*rdi]
+	vpaddw	ymm1, ymm1, ymmword ptr [rdx + 2*rdi + 32]
+	vpaddw	ymm2, ymm2, ymmword ptr [rdx + 2*rdi + 64]
+	vpaddw	ymm3, ymm3, ymmword ptr [rdx + 2*rdi + 96]
+	vmovdqu	ymmword ptr [r8 + 2*rdi], ymm0
+	vmovdqu	ymmword ptr [r8 + 2*rdi + 32], ymm1
+	vmovdqu	ymmword ptr [r8 + 2*rdi + 64], ymm2
+	vmovdqu	ymmword ptr [r8 + 2*rdi + 96], ymm3
+	add	rdi, 64
+	cmp	rsi, rdi
+	jne	.LBB0_85
+# %bb.86:
 	cmp	rsi, r10
-	je	.LBB0_537
-.LBB0_353:
+	je	.LBB0_825
+.LBB0_87:
 	mov	r9, rsi
 	not	r9
 	add	r9, r10
 	mov	rax, r10
 	and	rax, 3
-	je	.LBB0_355
-.LBB0_354:                              # =>This Inner Loop Header: Depth=1
-	mov	edi, dword ptr [rdx + 4*rsi]
-	sub	edi, dword ptr [rcx + 4*rsi]
-	mov	dword ptr [r8 + 4*rsi], edi
+	je	.LBB0_89
+.LBB0_88:                               # =>This Inner Loop Header: Depth=1
+	movzx	edi, word ptr [rcx + 2*rsi]
+	add	di, word ptr [rdx + 2*rsi]
+	mov	word ptr [r8 + 2*rsi], di
 	add	rsi, 1
 	add	rax, -1
-	jne	.LBB0_354
-.LBB0_355:
+	jne	.LBB0_88
+.LBB0_89:
 	cmp	r9, 3
-	jb	.LBB0_537
-.LBB0_356:                              # =>This Inner Loop Header: Depth=1
-	mov	eax, dword ptr [rdx + 4*rsi]
-	sub	eax, dword ptr [rcx + 4*rsi]
-	mov	dword ptr [r8 + 4*rsi], eax
-	mov	eax, dword ptr [rdx + 4*rsi + 4]
-	sub	eax, dword ptr [rcx + 4*rsi + 4]
-	mov	dword ptr [r8 + 4*rsi + 4], eax
-	mov	eax, dword ptr [rdx + 4*rsi + 8]
-	sub	eax, dword ptr [rcx + 4*rsi + 8]
-	mov	dword ptr [r8 + 4*rsi + 8], eax
-	mov	eax, dword ptr [rdx + 4*rsi + 12]
-	sub	eax, dword ptr [rcx + 4*rsi + 12]
-	mov	dword ptr [r8 + 4*rsi + 12], eax
+	jb	.LBB0_825
+.LBB0_90:                               # =>This Inner Loop Header: Depth=1
+	movzx	eax, word ptr [rcx + 2*rsi]
+	add	ax, word ptr [rdx + 2*rsi]
+	mov	word ptr [r8 + 2*rsi], ax
+	movzx	eax, word ptr [rcx + 2*rsi + 2]
+	add	ax, word ptr [rdx + 2*rsi + 2]
+	mov	word ptr [r8 + 2*rsi + 2], ax
+	movzx	eax, word ptr [rcx + 2*rsi + 4]
+	add	ax, word ptr [rdx + 2*rsi + 4]
+	mov	word ptr [r8 + 2*rsi + 4], ax
+	movzx	eax, word ptr [rcx + 2*rsi + 6]
+	add	ax, word ptr [rdx + 2*rsi + 6]
+	mov	word ptr [r8 + 2*rsi + 6], ax
 	add	rsi, 4
 	cmp	r10, rsi
-	jne	.LBB0_356
-	jmp	.LBB0_537
-.LBB0_474:
-	lea	rsi, [r8 + 4*r10]
-	lea	rax, [rdx + 4*r10]
+	jne	.LBB0_90
+	jmp	.LBB0_825
+.LBB0_203:
+	lea	rsi, [r8 + 2*r10]
+	lea	rax, [rdx + 2*r10]
 	cmp	rax, r8
 	seta	r9b
-	lea	rax, [rcx + 4*r10]
+	lea	rax, [rcx + 2*r10]
 	cmp	rsi, rdx
 	seta	r11b
 	cmp	rax, r8
@@ -668,73 +3017,73 @@ arithmetic_avx2:                        # @arithmetic_avx2
 	seta	dil
 	xor	esi, esi
 	test	r9b, r11b
-	jne	.LBB0_479
-# %bb.475:
+	jne	.LBB0_208
+# %bb.204:
 	and	al, dil
-	jne	.LBB0_479
-# %bb.476:
+	jne	.LBB0_208
+# %bb.205:
 	mov	esi, r10d
-	and	esi, -32
+	and	esi, -64
 	xor	edi, edi
-.LBB0_477:                              # =>This Inner Loop Header: Depth=1
-	vmovdqu	ymm0, ymmword ptr [rdx + 4*rdi]
-	vmovdqu	ymm1, ymmword ptr [rdx + 4*rdi + 32]
-	vmovdqu	ymm2, ymmword ptr [rdx + 4*rdi + 64]
-	vmovdqu	ymm3, ymmword ptr [rdx + 4*rdi + 96]
-	vpsubd	ymm0, ymm0, ymmword ptr [rcx + 4*rdi]
-	vpsubd	ymm1, ymm1, ymmword ptr [rcx + 4*rdi + 32]
-	vpsubd	ymm2, ymm2, ymmword ptr [rcx + 4*rdi + 64]
-	vpsubd	ymm3, ymm3, ymmword ptr [rcx + 4*rdi + 96]
-	vmovdqu	ymmword ptr [r8 + 4*rdi], ymm0
-	vmovdqu	ymmword ptr [r8 + 4*rdi + 32], ymm1
-	vmovdqu	ymmword ptr [r8 + 4*rdi + 64], ymm2
-	vmovdqu	ymmword ptr [r8 + 4*rdi + 96], ymm3
-	add	rdi, 32
+.LBB0_206:                              # =>This Inner Loop Header: Depth=1
+	vmovdqu	ymm0, ymmword ptr [rcx + 2*rdi]
+	vmovdqu	ymm1, ymmword ptr [rcx + 2*rdi + 32]
+	vmovdqu	ymm2, ymmword ptr [rcx + 2*rdi + 64]
+	vmovdqu	ymm3, ymmword ptr [rcx + 2*rdi + 96]
+	vpaddw	ymm0, ymm0, ymmword ptr [rdx + 2*rdi]
+	vpaddw	ymm1, ymm1, ymmword ptr [rdx + 2*rdi + 32]
+	vpaddw	ymm2, ymm2, ymmword ptr [rdx + 2*rdi + 64]
+	vpaddw	ymm3, ymm3, ymmword ptr [rdx + 2*rdi + 96]
+	vmovdqu	ymmword ptr [r8 + 2*rdi], ymm0
+	vmovdqu	ymmword ptr [r8 + 2*rdi + 32], ymm1
+	vmovdqu	ymmword ptr [r8 + 2*rdi + 64], ymm2
+	vmovdqu	ymmword ptr [r8 + 2*rdi + 96], ymm3
+	add	rdi, 64
 	cmp	rsi, rdi
-	jne	.LBB0_477
-# %bb.478:
+	jne	.LBB0_206
+# %bb.207:
 	cmp	rsi, r10
-	je	.LBB0_537
-.LBB0_479:
+	je	.LBB0_825
+.LBB0_208:
 	mov	r9, rsi
 	not	r9
 	add	r9, r10
 	mov	rax, r10
 	and	rax, 3
-	je	.LBB0_481
-.LBB0_480:                              # =>This Inner Loop Header: Depth=1
-	mov	edi, dword ptr [rdx + 4*rsi]
-	sub	edi, dword ptr [rcx + 4*rsi]
-	mov	dword ptr [r8 + 4*rsi], edi
+	je	.LBB0_210
+.LBB0_209:                              # =>This Inner Loop Header: Depth=1
+	movzx	edi, word ptr [rcx + 2*rsi]
+	add	di, word ptr [rdx + 2*rsi]
+	mov	word ptr [r8 + 2*rsi], di
 	add	rsi, 1
 	add	rax, -1
-	jne	.LBB0_480
-.LBB0_481:
+	jne	.LBB0_209
+.LBB0_210:
 	cmp	r9, 3
-	jb	.LBB0_537
-.LBB0_482:                              # =>This Inner Loop Header: Depth=1
-	mov	eax, dword ptr [rdx + 4*rsi]
-	sub	eax, dword ptr [rcx + 4*rsi]
-	mov	dword ptr [r8 + 4*rsi], eax
-	mov	eax, dword ptr [rdx + 4*rsi + 4]
-	sub	eax, dword ptr [rcx + 4*rsi + 4]
-	mov	dword ptr [r8 + 4*rsi + 4], eax
-	mov	eax, dword ptr [rdx + 4*rsi + 8]
-	sub	eax, dword ptr [rcx + 4*rsi + 8]
-	mov	dword ptr [r8 + 4*rsi + 8], eax
-	mov	eax, dword ptr [rdx + 4*rsi + 12]
-	sub	eax, dword ptr [rcx + 4*rsi + 12]
-	mov	dword ptr [r8 + 4*rsi + 12], eax
+	jb	.LBB0_825
+.LBB0_211:                              # =>This Inner Loop Header: Depth=1
+	movzx	eax, word ptr [rcx + 2*rsi]
+	add	ax, word ptr [rdx + 2*rsi]
+	mov	word ptr [r8 + 2*rsi], ax
+	movzx	eax, word ptr [rcx + 2*rsi + 2]
+	add	ax, word ptr [rdx + 2*rsi + 2]
+	mov	word ptr [r8 + 2*rsi + 2], ax
+	movzx	eax, word ptr [rcx + 2*rsi + 4]
+	add	ax, word ptr [rdx + 2*rsi + 4]
+	mov	word ptr [r8 + 2*rsi + 4], ax
+	movzx	eax, word ptr [rcx + 2*rsi + 6]
+	add	ax, word ptr [rdx + 2*rsi + 6]
+	mov	word ptr [r8 + 2*rsi + 6], ax
 	add	rsi, 4
 	cmp	r10, rsi
-	jne	.LBB0_482
-	jmp	.LBB0_537
-.LBB0_89:
-	lea	rsi, [r8 + 4*r10]
-	lea	rax, [rdx + 4*r10]
+	jne	.LBB0_211
+	jmp	.LBB0_825
+.LBB0_215:
+	lea	rsi, [r8 + 2*r10]
+	lea	rax, [rdx + 2*r10]
 	cmp	rax, r8
 	seta	r9b
-	lea	rax, [rcx + 4*r10]
+	lea	rax, [rcx + 2*r10]
 	cmp	rsi, rdx
 	seta	r11b
 	cmp	rax, r8
@@ -743,73 +3092,73 @@ arithmetic_avx2:                        # @arithmetic_avx2
 	seta	dil
 	xor	esi, esi
 	test	r9b, r11b
-	jne	.LBB0_94
-# %bb.90:
+	jne	.LBB0_220
+# %bb.216:
 	and	al, dil
-	jne	.LBB0_94
-# %bb.91:
+	jne	.LBB0_220
+# %bb.217:
 	mov	esi, r10d
-	and	esi, -32
+	and	esi, -64
 	xor	edi, edi
-.LBB0_92:                               # =>This Inner Loop Header: Depth=1
-	vmovdqu	ymm0, ymmword ptr [rcx + 4*rdi]
-	vmovdqu	ymm1, ymmword ptr [rcx + 4*rdi + 32]
-	vmovdqu	ymm2, ymmword ptr [rcx + 4*rdi + 64]
-	vmovdqu	ymm3, ymmword ptr [rcx + 4*rdi + 96]
-	vpaddd	ymm0, ymm0, ymmword ptr [rdx + 4*rdi]
-	vpaddd	ymm1, ymm1, ymmword ptr [rdx + 4*rdi + 32]
-	vpaddd	ymm2, ymm2, ymmword ptr [rdx + 4*rdi + 64]
-	vpaddd	ymm3, ymm3, ymmword ptr [rdx + 4*rdi + 96]
-	vmovdqu	ymmword ptr [r8 + 4*rdi], ymm0
-	vmovdqu	ymmword ptr [r8 + 4*rdi + 32], ymm1
-	vmovdqu	ymmword ptr [r8 + 4*rdi + 64], ymm2
-	vmovdqu	ymmword ptr [r8 + 4*rdi + 96], ymm3
-	add	rdi, 32
+.LBB0_218:                              # =>This Inner Loop Header: Depth=1
+	vmovdqu	ymm0, ymmword ptr [rcx + 2*rdi]
+	vmovdqu	ymm1, ymmword ptr [rcx + 2*rdi + 32]
+	vmovdqu	ymm2, ymmword ptr [rcx + 2*rdi + 64]
+	vmovdqu	ymm3, ymmword ptr [rcx + 2*rdi + 96]
+	vpaddw	ymm0, ymm0, ymmword ptr [rdx + 2*rdi]
+	vpaddw	ymm1, ymm1, ymmword ptr [rdx + 2*rdi + 32]
+	vpaddw	ymm2, ymm2, ymmword ptr [rdx + 2*rdi + 64]
+	vpaddw	ymm3, ymm3, ymmword ptr [rdx + 2*rdi + 96]
+	vmovdqu	ymmword ptr [r8 + 2*rdi], ymm0
+	vmovdqu	ymmword ptr [r8 + 2*rdi + 32], ymm1
+	vmovdqu	ymmword ptr [r8 + 2*rdi + 64], ymm2
+	vmovdqu	ymmword ptr [r8 + 2*rdi + 96], ymm3
+	add	rdi, 64
 	cmp	rsi, rdi
-	jne	.LBB0_92
-# %bb.93:
+	jne	.LBB0_218
+# %bb.219:
 	cmp	rsi, r10
-	je	.LBB0_537
-.LBB0_94:
+	je	.LBB0_825
+.LBB0_220:
 	mov	r9, rsi
 	not	r9
 	add	r9, r10
 	mov	rax, r10
 	and	rax, 3
-	je	.LBB0_96
-.LBB0_95:                               # =>This Inner Loop Header: Depth=1
-	mov	edi, dword ptr [rcx + 4*rsi]
-	add	edi, dword ptr [rdx + 4*rsi]
-	mov	dword ptr [r8 + 4*rsi], edi
+	je	.LBB0_222
+.LBB0_221:                              # =>This Inner Loop Header: Depth=1
+	movzx	edi, word ptr [rcx + 2*rsi]
+	add	di, word ptr [rdx + 2*rsi]
+	mov	word ptr [r8 + 2*rsi], di
 	add	rsi, 1
 	add	rax, -1
-	jne	.LBB0_95
-.LBB0_96:
+	jne	.LBB0_221
+.LBB0_222:
 	cmp	r9, 3
-	jb	.LBB0_537
-.LBB0_97:                               # =>This Inner Loop Header: Depth=1
-	mov	eax, dword ptr [rcx + 4*rsi]
-	add	eax, dword ptr [rdx + 4*rsi]
-	mov	dword ptr [r8 + 4*rsi], eax
-	mov	eax, dword ptr [rcx + 4*rsi + 4]
-	add	eax, dword ptr [rdx + 4*rsi + 4]
-	mov	dword ptr [r8 + 4*rsi + 4], eax
-	mov	eax, dword ptr [rcx + 4*rsi + 8]
-	add	eax, dword ptr [rdx + 4*rsi + 8]
-	mov	dword ptr [r8 + 4*rsi + 8], eax
-	mov	eax, dword ptr [rcx + 4*rsi + 12]
-	add	eax, dword ptr [rdx + 4*rsi + 12]
-	mov	dword ptr [r8 + 4*rsi + 12], eax
+	jb	.LBB0_825
+.LBB0_223:                              # =>This Inner Loop Header: Depth=1
+	movzx	eax, word ptr [rcx + 2*rsi]
+	add	ax, word ptr [rdx + 2*rsi]
+	mov	word ptr [r8 + 2*rsi], ax
+	movzx	eax, word ptr [rcx + 2*rsi + 2]
+	add	ax, word ptr [rdx + 2*rsi + 2]
+	mov	word ptr [r8 + 2*rsi + 2], ax
+	movzx	eax, word ptr [rcx + 2*rsi + 4]
+	add	ax, word ptr [rdx + 2*rsi + 4]
+	mov	word ptr [r8 + 2*rsi + 4], ax
+	movzx	eax, word ptr [rcx + 2*rsi + 6]
+	add	ax, word ptr [rdx + 2*rsi + 6]
+	mov	word ptr [r8 + 2*rsi + 6], ax
 	add	rsi, 4
 	cmp	r10, rsi
-	jne	.LBB0_97
-	jmp	.LBB0_537
-.LBB0_222:
-	lea	rsi, [r8 + 4*r10]
-	lea	rax, [rdx + 4*r10]
+	jne	.LBB0_223
+	jmp	.LBB0_825
+.LBB0_124:
+	lea	rsi, [r8 + 8*r10]
+	lea	rax, [rdx + 8*r10]
 	cmp	rax, r8
 	seta	r9b
-	lea	rax, [rcx + 4*r10]
+	lea	rax, [rcx + 8*r10]
 	cmp	rsi, rdx
 	seta	r11b
 	cmp	rax, r8
@@ -818,73 +3167,73 @@ arithmetic_avx2:                        # @arithmetic_avx2
 	seta	dil
 	xor	esi, esi
 	test	r9b, r11b
-	jne	.LBB0_227
-# %bb.223:
+	jne	.LBB0_129
+# %bb.125:
 	and	al, dil
-	jne	.LBB0_227
-# %bb.224:
+	jne	.LBB0_129
+# %bb.126:
 	mov	esi, r10d
-	and	esi, -32
+	and	esi, -16
 	xor	edi, edi
-.LBB0_225:                              # =>This Inner Loop Header: Depth=1
-	vmovdqu	ymm0, ymmword ptr [rcx + 4*rdi]
-	vmovdqu	ymm1, ymmword ptr [rcx + 4*rdi + 32]
-	vmovdqu	ymm2, ymmword ptr [rcx + 4*rdi + 64]
-	vmovdqu	ymm3, ymmword ptr [rcx + 4*rdi + 96]
-	vpaddd	ymm0, ymm0, ymmword ptr [rdx + 4*rdi]
-	vpaddd	ymm1, ymm1, ymmword ptr [rdx + 4*rdi + 32]
-	vpaddd	ymm2, ymm2, ymmword ptr [rdx + 4*rdi + 64]
-	vpaddd	ymm3, ymm3, ymmword ptr [rdx + 4*rdi + 96]
-	vmovdqu	ymmword ptr [r8 + 4*rdi], ymm0
-	vmovdqu	ymmword ptr [r8 + 4*rdi + 32], ymm1
-	vmovdqu	ymmword ptr [r8 + 4*rdi + 64], ymm2
-	vmovdqu	ymmword ptr [r8 + 4*rdi + 96], ymm3
-	add	rdi, 32
+.LBB0_127:                              # =>This Inner Loop Header: Depth=1
+	vmovdqu	ymm0, ymmword ptr [rcx + 8*rdi]
+	vmovdqu	ymm1, ymmword ptr [rcx + 8*rdi + 32]
+	vmovdqu	ymm2, ymmword ptr [rcx + 8*rdi + 64]
+	vmovdqu	ymm3, ymmword ptr [rcx + 8*rdi + 96]
+	vpaddq	ymm0, ymm0, ymmword ptr [rdx + 8*rdi]
+	vpaddq	ymm1, ymm1, ymmword ptr [rdx + 8*rdi + 32]
+	vpaddq	ymm2, ymm2, ymmword ptr [rdx + 8*rdi + 64]
+	vpaddq	ymm3, ymm3, ymmword ptr [rdx + 8*rdi + 96]
+	vmovdqu	ymmword ptr [r8 + 8*rdi], ymm0
+	vmovdqu	ymmword ptr [r8 + 8*rdi + 32], ymm1
+	vmovdqu	ymmword ptr [r8 + 8*rdi + 64], ymm2
+	vmovdqu	ymmword ptr [r8 + 8*rdi + 96], ymm3
+	add	rdi, 16
 	cmp	rsi, rdi
-	jne	.LBB0_225
-# %bb.226:
+	jne	.LBB0_127
+# %bb.128:
 	cmp	rsi, r10
-	je	.LBB0_537
-.LBB0_227:
+	je	.LBB0_825
+.LBB0_129:
 	mov	r9, rsi
 	not	r9
 	add	r9, r10
 	mov	rax, r10
 	and	rax, 3
-	je	.LBB0_229
-.LBB0_228:                              # =>This Inner Loop Header: Depth=1
-	mov	edi, dword ptr [rcx + 4*rsi]
-	add	edi, dword ptr [rdx + 4*rsi]
-	mov	dword ptr [r8 + 4*rsi], edi
+	je	.LBB0_131
+.LBB0_130:                              # =>This Inner Loop Header: Depth=1
+	mov	rdi, qword ptr [rcx + 8*rsi]
+	add	rdi, qword ptr [rdx + 8*rsi]
+	mov	qword ptr [r8 + 8*rsi], rdi
 	add	rsi, 1
 	add	rax, -1
-	jne	.LBB0_228
-.LBB0_229:
+	jne	.LBB0_130
+.LBB0_131:
 	cmp	r9, 3
-	jb	.LBB0_537
-.LBB0_230:                              # =>This Inner Loop Header: Depth=1
-	mov	eax, dword ptr [rcx + 4*rsi]
-	add	eax, dword ptr [rdx + 4*rsi]
-	mov	dword ptr [r8 + 4*rsi], eax
-	mov	eax, dword ptr [rcx + 4*rsi + 4]
-	add	eax, dword ptr [rdx + 4*rsi + 4]
-	mov	dword ptr [r8 + 4*rsi + 4], eax
-	mov	eax, dword ptr [rcx + 4*rsi + 8]
-	add	eax, dword ptr [rdx + 4*rsi + 8]
-	mov	dword ptr [r8 + 4*rsi + 8], eax
-	mov	eax, dword ptr [rcx + 4*rsi + 12]
-	add	eax, dword ptr [rdx + 4*rsi + 12]
-	mov	dword ptr [r8 + 4*rsi + 12], eax
+	jb	.LBB0_825
+.LBB0_132:                              # =>This Inner Loop Header: Depth=1
+	mov	rax, qword ptr [rcx + 8*rsi]
+	add	rax, qword ptr [rdx + 8*rsi]
+	mov	qword ptr [r8 + 8*rsi], rax
+	mov	rax, qword ptr [rcx + 8*rsi + 8]
+	add	rax, qword ptr [rdx + 8*rsi + 8]
+	mov	qword ptr [r8 + 8*rsi + 8], rax
+	mov	rax, qword ptr [rcx + 8*rsi + 16]
+	add	rax, qword ptr [rdx + 8*rsi + 16]
+	mov	qword ptr [r8 + 8*rsi + 16], rax
+	mov	rax, qword ptr [rcx + 8*rsi + 24]
+	add	rax, qword ptr [rdx + 8*rsi + 24]
+	mov	qword ptr [r8 + 8*rsi + 24], rax
 	add	rsi, 4
 	cmp	r10, rsi
-	jne	.LBB0_230
-	jmp	.LBB0_537
-.LBB0_402:
-	lea	rsi, [r8 + 8*r10]
-	lea	rax, [rdx + 8*r10]
+	jne	.LBB0_132
+	jmp	.LBB0_825
+.LBB0_136:
+	lea	rsi, [r8 + 4*r10]
+	lea	rax, [rdx + 4*r10]
 	cmp	rax, r8
 	seta	r9b
-	lea	rax, [rcx + 8*r10]
+	lea	rax, [rcx + 4*r10]
 	cmp	rsi, rdx
 	seta	r11b
 	cmp	rax, r8
@@ -893,68 +3242,68 @@ arithmetic_avx2:                        # @arithmetic_avx2
 	seta	dil
 	xor	esi, esi
 	test	r9b, r11b
-	jne	.LBB0_407
-# %bb.403:
+	jne	.LBB0_141
+# %bb.137:
 	and	al, dil
-	jne	.LBB0_407
-# %bb.404:
+	jne	.LBB0_141
+# %bb.138:
 	mov	esi, r10d
-	and	esi, -16
+	and	esi, -32
 	xor	edi, edi
-.LBB0_405:                              # =>This Inner Loop Header: Depth=1
-	vmovupd	ymm0, ymmword ptr [rdx + 8*rdi]
-	vmovupd	ymm1, ymmword ptr [rdx + 8*rdi + 32]
-	vmovupd	ymm2, ymmword ptr [rdx + 8*rdi + 64]
-	vmovupd	ymm3, ymmword ptr [rdx + 8*rdi + 96]
-	vsubpd	ymm0, ymm0, ymmword ptr [rcx + 8*rdi]
-	vsubpd	ymm1, ymm1, ymmword ptr [rcx + 8*rdi + 32]
-	vsubpd	ymm2, ymm2, ymmword ptr [rcx + 8*rdi + 64]
-	vsubpd	ymm3, ymm3, ymmword ptr [rcx + 8*rdi + 96]
-	vmovupd	ymmword ptr [r8 + 8*rdi], ymm0
-	vmovupd	ymmword ptr [r8 + 8*rdi + 32], ymm1
-	vmovupd	ymmword ptr [r8 + 8*rdi + 64], ymm2
-	vmovupd	ymmword ptr [r8 + 8*rdi + 96], ymm3
-	add	rdi, 16
+.LBB0_139:                              # =>This Inner Loop Header: Depth=1
+	vmovups	ymm0, ymmword ptr [rcx + 4*rdi]
+	vmovups	ymm1, ymmword ptr [rcx + 4*rdi + 32]
+	vmovups	ymm2, ymmword ptr [rcx + 4*rdi + 64]
+	vmovups	ymm3, ymmword ptr [rcx + 4*rdi + 96]
+	vaddps	ymm0, ymm0, ymmword ptr [rdx + 4*rdi]
+	vaddps	ymm1, ymm1, ymmword ptr [rdx + 4*rdi + 32]
+	vaddps	ymm2, ymm2, ymmword ptr [rdx + 4*rdi + 64]
+	vaddps	ymm3, ymm3, ymmword ptr [rdx + 4*rdi + 96]
+	vmovups	ymmword ptr [r8 + 4*rdi], ymm0
+	vmovups	ymmword ptr [r8 + 4*rdi + 32], ymm1
+	vmovups	ymmword ptr [r8 + 4*rdi + 64], ymm2
+	vmovups	ymmword ptr [r8 + 4*rdi + 96], ymm3
+	add	rdi, 32
 	cmp	rsi, rdi
-	jne	.LBB0_405
-# %bb.406:
+	jne	.LBB0_139
+# %bb.140:
 	cmp	rsi, r10
-	je	.LBB0_537
-.LBB0_407:
+	je	.LBB0_825
+.LBB0_141:
 	mov	rdi, rsi
 	not	rdi
 	add	rdi, r10
 	mov	rax, r10
 	and	rax, 3
-	je	.LBB0_409
-.LBB0_408:                              # =>This Inner Loop Header: Depth=1
-	vmovsd	xmm0, qword ptr [rdx + 8*rsi]   # xmm0 = mem[0],zero
-	vsubsd	xmm0, xmm0, qword ptr [rcx + 8*rsi]
-	vmovsd	qword ptr [r8 + 8*rsi], xmm0
+	je	.LBB0_143
+.LBB0_142:                              # =>This Inner Loop Header: Depth=1
+	vmovss	xmm0, dword ptr [rcx + 4*rsi]   # xmm0 = mem[0],zero,zero,zero
+	vaddss	xmm0, xmm0, dword ptr [rdx + 4*rsi]
+	vmovss	dword ptr [r8 + 4*rsi], xmm0
 	add	rsi, 1
 	add	rax, -1
-	jne	.LBB0_408
-.LBB0_409:
+	jne	.LBB0_142
+.LBB0_143:
 	cmp	rdi, 3
-	jb	.LBB0_537
-.LBB0_410:                              # =>This Inner Loop Header: Depth=1
-	vmovsd	xmm0, qword ptr [rdx + 8*rsi]   # xmm0 = mem[0],zero
-	vsubsd	xmm0, xmm0, qword ptr [rcx + 8*rsi]
-	vmovsd	qword ptr [r8 + 8*rsi], xmm0
-	vmovsd	xmm0, qword ptr [rdx + 8*rsi + 8] # xmm0 = mem[0],zero
-	vsubsd	xmm0, xmm0, qword ptr [rcx + 8*rsi + 8]
-	vmovsd	qword ptr [r8 + 8*rsi + 8], xmm0
-	vmovsd	xmm0, qword ptr [rdx + 8*rsi + 16] # xmm0 = mem[0],zero
-	vsubsd	xmm0, xmm0, qword ptr [rcx + 8*rsi + 16]
-	vmovsd	qword ptr [r8 + 8*rsi + 16], xmm0
-	vmovsd	xmm0, qword ptr [rdx + 8*rsi + 24] # xmm0 = mem[0],zero
-	vsubsd	xmm0, xmm0, qword ptr [rcx + 8*rsi + 24]
-	vmovsd	qword ptr [r8 + 8*rsi + 24], xmm0
+	jb	.LBB0_825
+.LBB0_144:                              # =>This Inner Loop Header: Depth=1
+	vmovss	xmm0, dword ptr [rcx + 4*rsi]   # xmm0 = mem[0],zero,zero,zero
+	vaddss	xmm0, xmm0, dword ptr [rdx + 4*rsi]
+	vmovss	dword ptr [r8 + 4*rsi], xmm0
+	vmovss	xmm0, dword ptr [rcx + 4*rsi + 4] # xmm0 = mem[0],zero,zero,zero
+	vaddss	xmm0, xmm0, dword ptr [rdx + 4*rsi + 4]
+	vmovss	dword ptr [r8 + 4*rsi + 4], xmm0
+	vmovss	xmm0, dword ptr [rcx + 4*rsi + 8] # xmm0 = mem[0],zero,zero,zero
+	vaddss	xmm0, xmm0, dword ptr [rdx + 4*rsi + 8]
+	vmovss	dword ptr [r8 + 4*rsi + 8], xmm0
+	vmovss	xmm0, dword ptr [rcx + 4*rsi + 12] # xmm0 = mem[0],zero,zero,zero
+	vaddss	xmm0, xmm0, dword ptr [rdx + 4*rsi + 12]
+	vmovss	dword ptr [r8 + 4*rsi + 12], xmm0
 	add	rsi, 4
 	cmp	r10, rsi
-	jne	.LBB0_410
-	jmp	.LBB0_537
-.LBB0_528:
+	jne	.LBB0_144
+	jmp	.LBB0_825
+.LBB0_257:
 	lea	rsi, [r8 + 8*r10]
 	lea	rax, [rdx + 8*r10]
 	cmp	rax, r8
@@ -968,73 +3317,73 @@ arithmetic_avx2:                        # @arithmetic_avx2
 	seta	dil
 	xor	esi, esi
 	test	r9b, r11b
-	jne	.LBB0_533
-# %bb.529:
+	jne	.LBB0_262
+# %bb.258:
 	and	al, dil
-	jne	.LBB0_533
-# %bb.530:
+	jne	.LBB0_262
+# %bb.259:
 	mov	esi, r10d
 	and	esi, -16
 	xor	edi, edi
-.LBB0_531:                              # =>This Inner Loop Header: Depth=1
-	vmovupd	ymm0, ymmword ptr [rdx + 8*rdi]
-	vmovupd	ymm1, ymmword ptr [rdx + 8*rdi + 32]
-	vmovupd	ymm2, ymmword ptr [rdx + 8*rdi + 64]
-	vmovupd	ymm3, ymmword ptr [rdx + 8*rdi + 96]
-	vsubpd	ymm0, ymm0, ymmword ptr [rcx + 8*rdi]
-	vsubpd	ymm1, ymm1, ymmword ptr [rcx + 8*rdi + 32]
-	vsubpd	ymm2, ymm2, ymmword ptr [rcx + 8*rdi + 64]
-	vsubpd	ymm3, ymm3, ymmword ptr [rcx + 8*rdi + 96]
-	vmovupd	ymmword ptr [r8 + 8*rdi], ymm0
-	vmovupd	ymmword ptr [r8 + 8*rdi + 32], ymm1
-	vmovupd	ymmword ptr [r8 + 8*rdi + 64], ymm2
-	vmovupd	ymmword ptr [r8 + 8*rdi + 96], ymm3
+.LBB0_260:                              # =>This Inner Loop Header: Depth=1
+	vmovdqu	ymm0, ymmword ptr [rcx + 8*rdi]
+	vmovdqu	ymm1, ymmword ptr [rcx + 8*rdi + 32]
+	vmovdqu	ymm2, ymmword ptr [rcx + 8*rdi + 64]
+	vmovdqu	ymm3, ymmword ptr [rcx + 8*rdi + 96]
+	vpaddq	ymm0, ymm0, ymmword ptr [rdx + 8*rdi]
+	vpaddq	ymm1, ymm1, ymmword ptr [rdx + 8*rdi + 32]
+	vpaddq	ymm2, ymm2, ymmword ptr [rdx + 8*rdi + 64]
+	vpaddq	ymm3, ymm3, ymmword ptr [rdx + 8*rdi + 96]
+	vmovdqu	ymmword ptr [r8 + 8*rdi], ymm0
+	vmovdqu	ymmword ptr [r8 + 8*rdi + 32], ymm1
+	vmovdqu	ymmword ptr [r8 + 8*rdi + 64], ymm2
+	vmovdqu	ymmword ptr [r8 + 8*rdi + 96], ymm3
 	add	rdi, 16
 	cmp	rsi, rdi
-	jne	.LBB0_531
-# %bb.532:
+	jne	.LBB0_260
+# %bb.261:
 	cmp	rsi, r10
-	je	.LBB0_537
-.LBB0_533:
-	mov	rdi, rsi
-	not	rdi
-	add	rdi, r10
+	je	.LBB0_825
+.LBB0_262:
+	mov	r9, rsi
+	not	r9
+	add	r9, r10
 	mov	rax, r10
 	and	rax, 3
-	je	.LBB0_535
-.LBB0_534:                              # =>This Inner Loop Header: Depth=1
-	vmovsd	xmm0, qword ptr [rdx + 8*rsi]   # xmm0 = mem[0],zero
-	vsubsd	xmm0, xmm0, qword ptr [rcx + 8*rsi]
-	vmovsd	qword ptr [r8 + 8*rsi], xmm0
+	je	.LBB0_264
+.LBB0_263:                              # =>This Inner Loop Header: Depth=1
+	mov	rdi, qword ptr [rcx + 8*rsi]
+	add	rdi, qword ptr [rdx + 8*rsi]
+	mov	qword ptr [r8 + 8*rsi], rdi
 	add	rsi, 1
 	add	rax, -1
-	jne	.LBB0_534
-.LBB0_535:
-	cmp	rdi, 3
-	jb	.LBB0_537
-.LBB0_536:                              # =>This Inner Loop Header: Depth=1
-	vmovsd	xmm0, qword ptr [rdx + 8*rsi]   # xmm0 = mem[0],zero
-	vsubsd	xmm0, xmm0, qword ptr [rcx + 8*rsi]
-	vmovsd	qword ptr [r8 + 8*rsi], xmm0
-	vmovsd	xmm0, qword ptr [rdx + 8*rsi + 8] # xmm0 = mem[0],zero
-	vsubsd	xmm0, xmm0, qword ptr [rcx + 8*rsi + 8]
-	vmovsd	qword ptr [r8 + 8*rsi + 8], xmm0
-	vmovsd	xmm0, qword ptr [rdx + 8*rsi + 16] # xmm0 = mem[0],zero
-	vsubsd	xmm0, xmm0, qword ptr [rcx + 8*rsi + 16]
-	vmovsd	qword ptr [r8 + 8*rsi + 16], xmm0
-	vmovsd	xmm0, qword ptr [rdx + 8*rsi + 24] # xmm0 = mem[0],zero
-	vsubsd	xmm0, xmm0, qword ptr [rcx + 8*rsi + 24]
-	vmovsd	qword ptr [r8 + 8*rsi + 24], xmm0
+	jne	.LBB0_263
+.LBB0_264:
+	cmp	r9, 3
+	jb	.LBB0_825
+.LBB0_265:                              # =>This Inner Loop Header: Depth=1
+	mov	rax, qword ptr [rcx + 8*rsi]
+	add	rax, qword ptr [rdx + 8*rsi]
+	mov	qword ptr [r8 + 8*rsi], rax
+	mov	rax, qword ptr [rcx + 8*rsi + 8]
+	add	rax, qword ptr [rdx + 8*rsi + 8]
+	mov	qword ptr [r8 + 8*rsi + 8], rax
+	mov	rax, qword ptr [rcx + 8*rsi + 16]
+	add	rax, qword ptr [rdx + 8*rsi + 16]
+	mov	qword ptr [r8 + 8*rsi + 16], rax
+	mov	rax, qword ptr [rcx + 8*rsi + 24]
+	add	rax, qword ptr [rdx + 8*rsi + 24]
+	mov	qword ptr [r8 + 8*rsi + 24], rax
 	add	rsi, 4
 	cmp	r10, rsi
-	jne	.LBB0_536
-	jmp	.LBB0_537
-.LBB0_143:
-	lea	rsi, [r8 + 8*r10]
-	lea	rax, [rdx + 8*r10]
+	jne	.LBB0_265
+	jmp	.LBB0_825
+.LBB0_269:
+	lea	rsi, [r8 + 4*r10]
+	lea	rax, [rdx + 4*r10]
 	cmp	rax, r8
 	seta	r9b
-	lea	rax, [rcx + 8*r10]
+	lea	rax, [rcx + 4*r10]
 	cmp	rsi, rdx
 	seta	r11b
 	cmp	rax, r8
@@ -1043,73 +3392,137 @@ arithmetic_avx2:                        # @arithmetic_avx2
 	seta	dil
 	xor	esi, esi
 	test	r9b, r11b
-	jne	.LBB0_148
-# %bb.144:
+	jne	.LBB0_274
+# %bb.270:
 	and	al, dil
-	jne	.LBB0_148
-# %bb.145:
+	jne	.LBB0_274
+# %bb.271:
 	mov	esi, r10d
-	and	esi, -16
+	and	esi, -32
 	xor	edi, edi
-.LBB0_146:                              # =>This Inner Loop Header: Depth=1
-	vmovupd	ymm0, ymmword ptr [rcx + 8*rdi]
-	vmovupd	ymm1, ymmword ptr [rcx + 8*rdi + 32]
-	vmovupd	ymm2, ymmword ptr [rcx + 8*rdi + 64]
-	vmovupd	ymm3, ymmword ptr [rcx + 8*rdi + 96]
-	vaddpd	ymm0, ymm0, ymmword ptr [rdx + 8*rdi]
-	vaddpd	ymm1, ymm1, ymmword ptr [rdx + 8*rdi + 32]
-	vaddpd	ymm2, ymm2, ymmword ptr [rdx + 8*rdi + 64]
-	vaddpd	ymm3, ymm3, ymmword ptr [rdx + 8*rdi + 96]
-	vmovupd	ymmword ptr [r8 + 8*rdi], ymm0
-	vmovupd	ymmword ptr [r8 + 8*rdi + 32], ymm1
-	vmovupd	ymmword ptr [r8 + 8*rdi + 64], ymm2
-	vmovupd	ymmword ptr [r8 + 8*rdi + 96], ymm3
-	add	rdi, 16
+.LBB0_272:                              # =>This Inner Loop Header: Depth=1
+	vmovups	ymm0, ymmword ptr [rcx + 4*rdi]
+	vmovups	ymm1, ymmword ptr [rcx + 4*rdi + 32]
+	vmovups	ymm2, ymmword ptr [rcx + 4*rdi + 64]
+	vmovups	ymm3, ymmword ptr [rcx + 4*rdi + 96]
+	vaddps	ymm0, ymm0, ymmword ptr [rdx + 4*rdi]
+	vaddps	ymm1, ymm1, ymmword ptr [rdx + 4*rdi + 32]
+	vaddps	ymm2, ymm2, ymmword ptr [rdx + 4*rdi + 64]
+	vaddps	ymm3, ymm3, ymmword ptr [rdx + 4*rdi + 96]
+	vmovups	ymmword ptr [r8 + 4*rdi], ymm0
+	vmovups	ymmword ptr [r8 + 4*rdi + 32], ymm1
+	vmovups	ymmword ptr [r8 + 4*rdi + 64], ymm2
+	vmovups	ymmword ptr [r8 + 4*rdi + 96], ymm3
+	add	rdi, 32
 	cmp	rsi, rdi
-	jne	.LBB0_146
-# %bb.147:
+	jne	.LBB0_272
+# %bb.273:
 	cmp	rsi, r10
-	je	.LBB0_537
-.LBB0_148:
+	je	.LBB0_825
+.LBB0_274:
 	mov	rdi, rsi
 	not	rdi
 	add	rdi, r10
 	mov	rax, r10
 	and	rax, 3
-	je	.LBB0_150
-.LBB0_149:                              # =>This Inner Loop Header: Depth=1
-	vmovsd	xmm0, qword ptr [rcx + 8*rsi]   # xmm0 = mem[0],zero
-	vaddsd	xmm0, xmm0, qword ptr [rdx + 8*rsi]
-	vmovsd	qword ptr [r8 + 8*rsi], xmm0
+	je	.LBB0_276
+.LBB0_275:                              # =>This Inner Loop Header: Depth=1
+	vmovss	xmm0, dword ptr [rcx + 4*rsi]   # xmm0 = mem[0],zero,zero,zero
+	vaddss	xmm0, xmm0, dword ptr [rdx + 4*rsi]
+	vmovss	dword ptr [r8 + 4*rsi], xmm0
 	add	rsi, 1
 	add	rax, -1
-	jne	.LBB0_149
-.LBB0_150:
+	jne	.LBB0_275
+.LBB0_276:
 	cmp	rdi, 3
-	jb	.LBB0_537
-.LBB0_151:                              # =>This Inner Loop Header: Depth=1
-	vmovsd	xmm0, qword ptr [rcx + 8*rsi]   # xmm0 = mem[0],zero
-	vaddsd	xmm0, xmm0, qword ptr [rdx + 8*rsi]
-	vmovsd	qword ptr [r8 + 8*rsi], xmm0
-	vmovsd	xmm0, qword ptr [rcx + 8*rsi + 8] # xmm0 = mem[0],zero
-	vaddsd	xmm0, xmm0, qword ptr [rdx + 8*rsi + 8]
-	vmovsd	qword ptr [r8 + 8*rsi + 8], xmm0
-	vmovsd	xmm0, qword ptr [rcx + 8*rsi + 16] # xmm0 = mem[0],zero
-	vaddsd	xmm0, xmm0, qword ptr [rdx + 8*rsi + 16]
-	vmovsd	qword ptr [r8 + 8*rsi + 16], xmm0
-	vmovsd	xmm0, qword ptr [rcx + 8*rsi + 24] # xmm0 = mem[0],zero
-	vaddsd	xmm0, xmm0, qword ptr [rdx + 8*rsi + 24]
-	vmovsd	qword ptr [r8 + 8*rsi + 24], xmm0
+	jb	.LBB0_825
+.LBB0_277:                              # =>This Inner Loop Header: Depth=1
+	vmovss	xmm0, dword ptr [rcx + 4*rsi]   # xmm0 = mem[0],zero,zero,zero
+	vaddss	xmm0, xmm0, dword ptr [rdx + 4*rsi]
+	vmovss	dword ptr [r8 + 4*rsi], xmm0
+	vmovss	xmm0, dword ptr [rcx + 4*rsi + 4] # xmm0 = mem[0],zero,zero,zero
+	vaddss	xmm0, xmm0, dword ptr [rdx + 4*rsi + 4]
+	vmovss	dword ptr [r8 + 4*rsi + 4], xmm0
+	vmovss	xmm0, dword ptr [rcx + 4*rsi + 8] # xmm0 = mem[0],zero,zero,zero
+	vaddss	xmm0, xmm0, dword ptr [rdx + 4*rsi + 8]
+	vmovss	dword ptr [r8 + 4*rsi + 8], xmm0
+	vmovss	xmm0, dword ptr [rcx + 4*rsi + 12] # xmm0 = mem[0],zero,zero,zero
+	vaddss	xmm0, xmm0, dword ptr [rdx + 4*rsi + 12]
+	vmovss	dword ptr [r8 + 4*rsi + 12], xmm0
 	add	rsi, 4
 	cmp	r10, rsi
-	jne	.LBB0_151
-	jmp	.LBB0_537
-.LBB0_276:
-	lea	rsi, [r8 + 8*r10]
-	lea	rax, [rdx + 8*r10]
+	jne	.LBB0_277
+	jmp	.LBB0_825
+.LBB0_574:
+	lea	rsi, [r8 + r10]
+	lea	rax, [rdx + r10]
 	cmp	rax, r8
 	seta	r9b
-	lea	rax, [rcx + 8*r10]
+	lea	rax, [rcx + r10]
+	cmp	rsi, rdx
+	seta	r11b
+	cmp	rax, r8
+	seta	al
+	cmp	rsi, rcx
+	seta	sil
+	xor	edi, edi
+	test	r9b, r11b
+	jne	.LBB0_584
+# %bb.575:
+	and	al, sil
+	jne	.LBB0_584
+# %bb.576:
+	mov	edi, r10d
+	and	edi, -32
+	lea	rsi, [rdi - 32]
+	mov	rax, rsi
+	shr	rax, 5
+	add	rax, 1
+	mov	r9d, eax
+	and	r9d, 3
+	cmp	rsi, 96
+	jae	.LBB0_578
+# %bb.577:
+	xor	esi, esi
+	jmp	.LBB0_580
+.LBB0_710:
+	lea	rsi, [r8 + r10]
+	lea	rax, [rdx + r10]
+	cmp	rax, r8
+	seta	r9b
+	lea	rax, [rcx + r10]
+	cmp	rsi, rdx
+	seta	r11b
+	cmp	rax, r8
+	seta	al
+	cmp	rsi, rcx
+	seta	sil
+	xor	edi, edi
+	test	r9b, r11b
+	jne	.LBB0_720
+# %bb.711:
+	and	al, sil
+	jne	.LBB0_720
+# %bb.712:
+	mov	edi, r10d
+	and	edi, -32
+	lea	rsi, [rdi - 32]
+	mov	rax, rsi
+	shr	rax, 5
+	add	rax, 1
+	mov	r9d, eax
+	and	r9d, 3
+	cmp	rsi, 96
+	jae	.LBB0_714
+# %bb.713:
+	xor	esi, esi
+	jmp	.LBB0_716
+.LBB0_49:
+	lea	rsi, [r8 + r10]
+	lea	rax, [rdx + r10]
+	cmp	rax, r8
+	seta	r9b
+	lea	rax, [rcx + r10]
 	cmp	rsi, rdx
 	seta	r11b
 	cmp	rax, r8
@@ -1118,67 +3531,67 @@ arithmetic_avx2:                        # @arithmetic_avx2
 	seta	dil
 	xor	esi, esi
 	test	r9b, r11b
-	jne	.LBB0_281
-# %bb.277:
+	jne	.LBB0_54
+# %bb.50:
 	and	al, dil
-	jne	.LBB0_281
-# %bb.278:
+	jne	.LBB0_54
+# %bb.51:
 	mov	esi, r10d
-	and	esi, -16
+	and	esi, -128
 	xor	edi, edi
-.LBB0_279:                              # =>This Inner Loop Header: Depth=1
-	vmovupd	ymm0, ymmword ptr [rcx + 8*rdi]
-	vmovupd	ymm1, ymmword ptr [rcx + 8*rdi + 32]
-	vmovupd	ymm2, ymmword ptr [rcx + 8*rdi + 64]
-	vmovupd	ymm3, ymmword ptr [rcx + 8*rdi + 96]
-	vaddpd	ymm0, ymm0, ymmword ptr [rdx + 8*rdi]
-	vaddpd	ymm1, ymm1, ymmword ptr [rdx + 8*rdi + 32]
-	vaddpd	ymm2, ymm2, ymmword ptr [rdx + 8*rdi + 64]
-	vaddpd	ymm3, ymm3, ymmword ptr [rdx + 8*rdi + 96]
-	vmovupd	ymmword ptr [r8 + 8*rdi], ymm0
-	vmovupd	ymmword ptr [r8 + 8*rdi + 32], ymm1
-	vmovupd	ymmword ptr [r8 + 8*rdi + 64], ymm2
-	vmovupd	ymmword ptr [r8 + 8*rdi + 96], ymm3
-	add	rdi, 16
+.LBB0_52:                               # =>This Inner Loop Header: Depth=1
+	vmovdqu	ymm0, ymmword ptr [rcx + rdi]
+	vmovdqu	ymm1, ymmword ptr [rcx + rdi + 32]
+	vmovdqu	ymm2, ymmword ptr [rcx + rdi + 64]
+	vmovdqu	ymm3, ymmword ptr [rcx + rdi + 96]
+	vpaddb	ymm0, ymm0, ymmword ptr [rdx + rdi]
+	vpaddb	ymm1, ymm1, ymmword ptr [rdx + rdi + 32]
+	vpaddb	ymm2, ymm2, ymmword ptr [rdx + rdi + 64]
+	vpaddb	ymm3, ymm3, ymmword ptr [rdx + rdi + 96]
+	vmovdqu	ymmword ptr [r8 + rdi], ymm0
+	vmovdqu	ymmword ptr [r8 + rdi + 32], ymm1
+	vmovdqu	ymmword ptr [r8 + rdi + 64], ymm2
+	vmovdqu	ymmword ptr [r8 + rdi + 96], ymm3
+	sub	rdi, -128
 	cmp	rsi, rdi
-	jne	.LBB0_279
-# %bb.280:
+	jne	.LBB0_52
+# %bb.53:
 	cmp	rsi, r10
-	je	.LBB0_537
-.LBB0_281:
-	mov	rdi, rsi
-	not	rdi
-	add	rdi, r10
-	mov	rax, r10
-	and	rax, 3
-	je	.LBB0_283
-.LBB0_282:                              # =>This Inner Loop Header: Depth=1
-	vmovsd	xmm0, qword ptr [rcx + 8*rsi]   # xmm0 = mem[0],zero
-	vaddsd	xmm0, xmm0, qword ptr [rdx + 8*rsi]
-	vmovsd	qword ptr [r8 + 8*rsi], xmm0
+	je	.LBB0_825
+.LBB0_54:
+	mov	r9, rsi
+	not	r9
+	add	r9, r10
+	mov	rdi, r10
+	and	rdi, 3
+	je	.LBB0_56
+.LBB0_55:                               # =>This Inner Loop Header: Depth=1
+	movzx	eax, byte ptr [rcx + rsi]
+	add	al, byte ptr [rdx + rsi]
+	mov	byte ptr [r8 + rsi], al
 	add	rsi, 1
-	add	rax, -1
-	jne	.LBB0_282
-.LBB0_283:
-	cmp	rdi, 3
-	jb	.LBB0_537
-.LBB0_284:                              # =>This Inner Loop Header: Depth=1
-	vmovsd	xmm0, qword ptr [rcx + 8*rsi]   # xmm0 = mem[0],zero
-	vaddsd	xmm0, xmm0, qword ptr [rdx + 8*rsi]
-	vmovsd	qword ptr [r8 + 8*rsi], xmm0
-	vmovsd	xmm0, qword ptr [rcx + 8*rsi + 8] # xmm0 = mem[0],zero
-	vaddsd	xmm0, xmm0, qword ptr [rdx + 8*rsi + 8]
-	vmovsd	qword ptr [r8 + 8*rsi + 8], xmm0
-	vmovsd	xmm0, qword ptr [rcx + 8*rsi + 16] # xmm0 = mem[0],zero
-	vaddsd	xmm0, xmm0, qword ptr [rdx + 8*rsi + 16]
-	vmovsd	qword ptr [r8 + 8*rsi + 16], xmm0
-	vmovsd	xmm0, qword ptr [rcx + 8*rsi + 24] # xmm0 = mem[0],zero
-	vaddsd	xmm0, xmm0, qword ptr [rdx + 8*rsi + 24]
-	vmovsd	qword ptr [r8 + 8*rsi + 24], xmm0
+	add	rdi, -1
+	jne	.LBB0_55
+.LBB0_56:
+	cmp	r9, 3
+	jb	.LBB0_825
+.LBB0_57:                               # =>This Inner Loop Header: Depth=1
+	movzx	eax, byte ptr [rcx + rsi]
+	add	al, byte ptr [rdx + rsi]
+	mov	byte ptr [r8 + rsi], al
+	movzx	eax, byte ptr [rcx + rsi + 1]
+	add	al, byte ptr [rdx + rsi + 1]
+	mov	byte ptr [r8 + rsi + 1], al
+	movzx	eax, byte ptr [rcx + rsi + 2]
+	add	al, byte ptr [rdx + rsi + 2]
+	mov	byte ptr [r8 + rsi + 2], al
+	movzx	eax, byte ptr [rcx + rsi + 3]
+	add	al, byte ptr [rdx + rsi + 3]
+	mov	byte ptr [r8 + rsi + 3], al
 	add	rsi, 4
 	cmp	r10, rsi
-	jne	.LBB0_284
-	jmp	.LBB0_537
+	jne	.LBB0_57
+	jmp	.LBB0_825
 .LBB0_315:
 	lea	rsi, [r8 + r10]
 	lea	rax, [rdx + r10]
@@ -1219,7 +3632,7 @@ arithmetic_avx2:                        # @arithmetic_avx2
 	jne	.LBB0_318
 # %bb.319:
 	cmp	rsi, r10
-	je	.LBB0_537
+	je	.LBB0_825
 .LBB0_320:
 	mov	r9, rsi
 	not	r9
@@ -1236,7 +3649,7 @@ arithmetic_avx2:                        # @arithmetic_avx2
 	jne	.LBB0_321
 .LBB0_322:
 	cmp	r9, 3
-	jb	.LBB0_537
+	jb	.LBB0_825
 .LBB0_323:                              # =>This Inner Loop Header: Depth=1
 	movzx	eax, byte ptr [rdx + rsi]
 	sub	al, byte ptr [rcx + rsi]
@@ -1253,8 +3666,8 @@ arithmetic_avx2:                        # @arithmetic_avx2
 	add	rsi, 4
 	cmp	r10, rsi
 	jne	.LBB0_323
-	jmp	.LBB0_537
-.LBB0_441:
+	jmp	.LBB0_825
+.LBB0_182:
 	lea	rsi, [r8 + r10]
 	lea	rax, [rdx + r10]
 	cmp	rax, r8
@@ -1268,15 +3681,90 @@ arithmetic_avx2:                        # @arithmetic_avx2
 	seta	dil
 	xor	esi, esi
 	test	r9b, r11b
-	jne	.LBB0_446
-# %bb.442:
+	jne	.LBB0_187
+# %bb.183:
 	and	al, dil
-	jne	.LBB0_446
-# %bb.443:
+	jne	.LBB0_187
+# %bb.184:
+	mov	esi, r10d
+	and	esi, -128
+	xor	edi, edi
+.LBB0_185:                              # =>This Inner Loop Header: Depth=1
+	vmovdqu	ymm0, ymmword ptr [rcx + rdi]
+	vmovdqu	ymm1, ymmword ptr [rcx + rdi + 32]
+	vmovdqu	ymm2, ymmword ptr [rcx + rdi + 64]
+	vmovdqu	ymm3, ymmword ptr [rcx + rdi + 96]
+	vpaddb	ymm0, ymm0, ymmword ptr [rdx + rdi]
+	vpaddb	ymm1, ymm1, ymmword ptr [rdx + rdi + 32]
+	vpaddb	ymm2, ymm2, ymmword ptr [rdx + rdi + 64]
+	vpaddb	ymm3, ymm3, ymmword ptr [rdx + rdi + 96]
+	vmovdqu	ymmword ptr [r8 + rdi], ymm0
+	vmovdqu	ymmword ptr [r8 + rdi + 32], ymm1
+	vmovdqu	ymmword ptr [r8 + rdi + 64], ymm2
+	vmovdqu	ymmword ptr [r8 + rdi + 96], ymm3
+	sub	rdi, -128
+	cmp	rsi, rdi
+	jne	.LBB0_185
+# %bb.186:
+	cmp	rsi, r10
+	je	.LBB0_825
+.LBB0_187:
+	mov	r9, rsi
+	not	r9
+	add	r9, r10
+	mov	rdi, r10
+	and	rdi, 3
+	je	.LBB0_189
+.LBB0_188:                              # =>This Inner Loop Header: Depth=1
+	movzx	eax, byte ptr [rcx + rsi]
+	add	al, byte ptr [rdx + rsi]
+	mov	byte ptr [r8 + rsi], al
+	add	rsi, 1
+	add	rdi, -1
+	jne	.LBB0_188
+.LBB0_189:
+	cmp	r9, 3
+	jb	.LBB0_825
+.LBB0_190:                              # =>This Inner Loop Header: Depth=1
+	movzx	eax, byte ptr [rcx + rsi]
+	add	al, byte ptr [rdx + rsi]
+	mov	byte ptr [r8 + rsi], al
+	movzx	eax, byte ptr [rcx + rsi + 1]
+	add	al, byte ptr [rdx + rsi + 1]
+	mov	byte ptr [r8 + rsi + 1], al
+	movzx	eax, byte ptr [rcx + rsi + 2]
+	add	al, byte ptr [rdx + rsi + 2]
+	mov	byte ptr [r8 + rsi + 2], al
+	movzx	eax, byte ptr [rcx + rsi + 3]
+	add	al, byte ptr [rdx + rsi + 3]
+	mov	byte ptr [r8 + rsi + 3], al
+	add	rsi, 4
+	cmp	r10, rsi
+	jne	.LBB0_190
+	jmp	.LBB0_825
+.LBB0_448:
+	lea	rsi, [r8 + r10]
+	lea	rax, [rdx + r10]
+	cmp	rax, r8
+	seta	r9b
+	lea	rax, [rcx + r10]
+	cmp	rsi, rdx
+	seta	r11b
+	cmp	rax, r8
+	seta	al
+	cmp	rsi, rcx
+	seta	dil
+	xor	esi, esi
+	test	r9b, r11b
+	jne	.LBB0_453
+# %bb.449:
+	and	al, dil
+	jne	.LBB0_453
+# %bb.450:
 	mov	esi, r10d
 	and	esi, -128
 	xor	edi, edi
-.LBB0_444:                              # =>This Inner Loop Header: Depth=1
+.LBB0_451:                              # =>This Inner Loop Header: Depth=1
 	vmovdqu	ymm0, ymmword ptr [rdx + rdi]
 	vmovdqu	ymm1, ymmword ptr [rdx + rdi + 32]
 	vmovdqu	ymm2, ymmword ptr [rdx + rdi + 64]
@@ -1291,28 +3779,28 @@ arithmetic_avx2:                        # @arithmetic_avx2
 	vmovdqu	ymmword ptr [r8 + rdi + 96], ymm3
 	sub	rdi, -128
 	cmp	rsi, rdi
-	jne	.LBB0_444
-# %bb.445:
+	jne	.LBB0_451
+# %bb.452:
 	cmp	rsi, r10
-	je	.LBB0_537
-.LBB0_446:
+	je	.LBB0_825
+.LBB0_453:
 	mov	r9, rsi
 	not	r9
 	add	r9, r10
 	mov	rdi, r10
 	and	rdi, 3
-	je	.LBB0_448
-.LBB0_447:                              # =>This Inner Loop Header: Depth=1
+	je	.LBB0_455
+.LBB0_454:                              # =>This Inner Loop Header: Depth=1
 	movzx	eax, byte ptr [rdx + rsi]
 	sub	al, byte ptr [rcx + rsi]
 	mov	byte ptr [r8 + rsi], al
 	add	rsi, 1
 	add	rdi, -1
-	jne	.LBB0_447
-.LBB0_448:
+	jne	.LBB0_454
+.LBB0_455:
 	cmp	r9, 3
-	jb	.LBB0_537
-.LBB0_449:                              # =>This Inner Loop Header: Depth=1
+	jb	.LBB0_825
+.LBB0_456:                              # =>This Inner Loop Header: Depth=1
 	movzx	eax, byte ptr [rdx + rsi]
 	sub	al, byte ptr [rcx + rsi]
 	mov	byte ptr [r8 + rsi], al
@@ -1327,14 +3815,14 @@ arithmetic_avx2:                        # @arithmetic_avx2
 	mov	byte ptr [r8 + rsi + 3], al
 	add	rsi, 4
 	cmp	r10, rsi
-	jne	.LBB0_449
-	jmp	.LBB0_537
-.LBB0_56:
-	lea	rsi, [r8 + r10]
-	lea	rax, [rdx + r10]
+	jne	.LBB0_456
+	jmp	.LBB0_825
+.LBB0_638:
+	lea	rsi, [r8 + 4*r10]
+	lea	rax, [rdx + 4*r10]
 	cmp	rax, r8
 	seta	r9b
-	lea	rax, [rcx + r10]
+	lea	rax, [rcx + 4*r10]
 	cmp	rsi, rdx
 	seta	r11b
 	cmp	rax, r8
@@ -1343,73 +3831,73 @@ arithmetic_avx2:                        # @arithmetic_avx2
 	seta	dil
 	xor	esi, esi
 	test	r9b, r11b
-	jne	.LBB0_61
-# %bb.57:
+	jne	.LBB0_643
+# %bb.639:
 	and	al, dil
-	jne	.LBB0_61
-# %bb.58:
+	jne	.LBB0_643
+# %bb.640:
 	mov	esi, r10d
-	and	esi, -128
+	and	esi, -32
 	xor	edi, edi
-.LBB0_59:                               # =>This Inner Loop Header: Depth=1
-	vmovdqu	ymm0, ymmword ptr [rcx + rdi]
-	vmovdqu	ymm1, ymmword ptr [rcx + rdi + 32]
-	vmovdqu	ymm2, ymmword ptr [rcx + rdi + 64]
-	vmovdqu	ymm3, ymmword ptr [rcx + rdi + 96]
-	vpaddb	ymm0, ymm0, ymmword ptr [rdx + rdi]
-	vpaddb	ymm1, ymm1, ymmword ptr [rdx + rdi + 32]
-	vpaddb	ymm2, ymm2, ymmword ptr [rdx + rdi + 64]
-	vpaddb	ymm3, ymm3, ymmword ptr [rdx + rdi + 96]
-	vmovdqu	ymmword ptr [r8 + rdi], ymm0
-	vmovdqu	ymmword ptr [r8 + rdi + 32], ymm1
-	vmovdqu	ymmword ptr [r8 + rdi + 64], ymm2
-	vmovdqu	ymmword ptr [r8 + rdi + 96], ymm3
-	sub	rdi, -128
+.LBB0_641:                              # =>This Inner Loop Header: Depth=1
+	vmovdqu	ymm0, ymmword ptr [rcx + 4*rdi]
+	vmovdqu	ymm1, ymmword ptr [rcx + 4*rdi + 32]
+	vmovdqu	ymm2, ymmword ptr [rcx + 4*rdi + 64]
+	vmovdqu	ymm3, ymmword ptr [rcx + 4*rdi + 96]
+	vpmulld	ymm0, ymm0, ymmword ptr [rdx + 4*rdi]
+	vpmulld	ymm1, ymm1, ymmword ptr [rdx + 4*rdi + 32]
+	vpmulld	ymm2, ymm2, ymmword ptr [rdx + 4*rdi + 64]
+	vpmulld	ymm3, ymm3, ymmword ptr [rdx + 4*rdi + 96]
+	vmovdqu	ymmword ptr [r8 + 4*rdi], ymm0
+	vmovdqu	ymmword ptr [r8 + 4*rdi + 32], ymm1
+	vmovdqu	ymmword ptr [r8 + 4*rdi + 64], ymm2
+	vmovdqu	ymmword ptr [r8 + 4*rdi + 96], ymm3
+	add	rdi, 32
 	cmp	rsi, rdi
-	jne	.LBB0_59
-# %bb.60:
+	jne	.LBB0_641
+# %bb.642:
 	cmp	rsi, r10
-	je	.LBB0_537
-.LBB0_61:
+	je	.LBB0_825
+.LBB0_643:
 	mov	r9, rsi
 	not	r9
 	add	r9, r10
-	mov	rdi, r10
-	and	rdi, 3
-	je	.LBB0_63
-.LBB0_62:                               # =>This Inner Loop Header: Depth=1
-	movzx	eax, byte ptr [rcx + rsi]
-	add	al, byte ptr [rdx + rsi]
-	mov	byte ptr [r8 + rsi], al
+	mov	rax, r10
+	and	rax, 3
+	je	.LBB0_645
+.LBB0_644:                              # =>This Inner Loop Header: Depth=1
+	mov	edi, dword ptr [rcx + 4*rsi]
+	imul	edi, dword ptr [rdx + 4*rsi]
+	mov	dword ptr [r8 + 4*rsi], edi
 	add	rsi, 1
-	add	rdi, -1
-	jne	.LBB0_62
-.LBB0_63:
+	add	rax, -1
+	jne	.LBB0_644
+.LBB0_645:
 	cmp	r9, 3
-	jb	.LBB0_537
-.LBB0_64:                               # =>This Inner Loop Header: Depth=1
-	movzx	eax, byte ptr [rcx + rsi]
-	add	al, byte ptr [rdx + rsi]
-	mov	byte ptr [r8 + rsi], al
-	movzx	eax, byte ptr [rcx + rsi + 1]
-	add	al, byte ptr [rdx + rsi + 1]
-	mov	byte ptr [r8 + rsi + 1], al
-	movzx	eax, byte ptr [rcx + rsi + 2]
-	add	al, byte ptr [rdx + rsi + 2]
-	mov	byte ptr [r8 + rsi + 2], al
-	movzx	eax, byte ptr [rcx + rsi + 3]
-	add	al, byte ptr [rdx + rsi + 3]
-	mov	byte ptr [r8 + rsi + 3], al
+	jb	.LBB0_825
+.LBB0_646:                              # =>This Inner Loop Header: Depth=1
+	mov	eax, dword ptr [rcx + 4*rsi]
+	imul	eax, dword ptr [rdx + 4*rsi]
+	mov	dword ptr [r8 + 4*rsi], eax
+	mov	eax, dword ptr [rcx + 4*rsi + 4]
+	imul	eax, dword ptr [rdx + 4*rsi + 4]
+	mov	dword ptr [r8 + 4*rsi + 4], eax
+	mov	eax, dword ptr [rcx + 4*rsi + 8]
+	imul	eax, dword ptr [rdx + 4*rsi + 8]
+	mov	dword ptr [r8 + 4*rsi + 8], eax
+	mov	eax, dword ptr [rcx + 4*rsi + 12]
+	imul	eax, dword ptr [rdx + 4*rsi + 12]
+	mov	dword ptr [r8 + 4*rsi + 12], eax
 	add	rsi, 4
 	cmp	r10, rsi
-	jne	.LBB0_64
-	jmp	.LBB0_537
-.LBB0_189:
-	lea	rsi, [r8 + r10]
-	lea	rax, [rdx + r10]
+	jne	.LBB0_646
+	jmp	.LBB0_825
+.LBB0_774:
+	lea	rsi, [r8 + 4*r10]
+	lea	rax, [rdx + 4*r10]
 	cmp	rax, r8
 	seta	r9b
-	lea	rax, [rcx + r10]
+	lea	rax, [rcx + 4*r10]
 	cmp	rsi, rdx
 	seta	r11b
 	cmp	rax, r8
@@ -1418,73 +3906,73 @@ arithmetic_avx2:                        # @arithmetic_avx2
 	seta	dil
 	xor	esi, esi
 	test	r9b, r11b
-	jne	.LBB0_194
-# %bb.190:
+	jne	.LBB0_779
+# %bb.775:
 	and	al, dil
-	jne	.LBB0_194
-# %bb.191:
+	jne	.LBB0_779
+# %bb.776:
 	mov	esi, r10d
-	and	esi, -128
+	and	esi, -32
 	xor	edi, edi
-.LBB0_192:                              # =>This Inner Loop Header: Depth=1
-	vmovdqu	ymm0, ymmword ptr [rcx + rdi]
-	vmovdqu	ymm1, ymmword ptr [rcx + rdi + 32]
-	vmovdqu	ymm2, ymmword ptr [rcx + rdi + 64]
-	vmovdqu	ymm3, ymmword ptr [rcx + rdi + 96]
-	vpaddb	ymm0, ymm0, ymmword ptr [rdx + rdi]
-	vpaddb	ymm1, ymm1, ymmword ptr [rdx + rdi + 32]
-	vpaddb	ymm2, ymm2, ymmword ptr [rdx + rdi + 64]
-	vpaddb	ymm3, ymm3, ymmword ptr [rdx + rdi + 96]
-	vmovdqu	ymmword ptr [r8 + rdi], ymm0
-	vmovdqu	ymmword ptr [r8 + rdi + 32], ymm1
-	vmovdqu	ymmword ptr [r8 + rdi + 64], ymm2
-	vmovdqu	ymmword ptr [r8 + rdi + 96], ymm3
-	sub	rdi, -128
+.LBB0_777:                              # =>This Inner Loop Header: Depth=1
+	vmovdqu	ymm0, ymmword ptr [rcx + 4*rdi]
+	vmovdqu	ymm1, ymmword ptr [rcx + 4*rdi + 32]
+	vmovdqu	ymm2, ymmword ptr [rcx + 4*rdi + 64]
+	vmovdqu	ymm3, ymmword ptr [rcx + 4*rdi + 96]
+	vpmulld	ymm0, ymm0, ymmword ptr [rdx + 4*rdi]
+	vpmulld	ymm1, ymm1, ymmword ptr [rdx + 4*rdi + 32]
+	vpmulld	ymm2, ymm2, ymmword ptr [rdx + 4*rdi + 64]
+	vpmulld	ymm3, ymm3, ymmword ptr [rdx + 4*rdi + 96]
+	vmovdqu	ymmword ptr [r8 + 4*rdi], ymm0
+	vmovdqu	ymmword ptr [r8 + 4*rdi + 32], ymm1
+	vmovdqu	ymmword ptr [r8 + 4*rdi + 64], ymm2
+	vmovdqu	ymmword ptr [r8 + 4*rdi + 96], ymm3
+	add	rdi, 32
 	cmp	rsi, rdi
-	jne	.LBB0_192
-# %bb.193:
+	jne	.LBB0_777
+# %bb.778:
 	cmp	rsi, r10
-	je	.LBB0_537
-.LBB0_194:
+	je	.LBB0_825
+.LBB0_779:
 	mov	r9, rsi
 	not	r9
 	add	r9, r10
-	mov	rdi, r10
-	and	rdi, 3
-	je	.LBB0_196
-.LBB0_195:                              # =>This Inner Loop Header: Depth=1
-	movzx	eax, byte ptr [rcx + rsi]
-	add	al, byte ptr [rdx + rsi]
-	mov	byte ptr [r8 + rsi], al
+	mov	rax, r10
+	and	rax, 3
+	je	.LBB0_781
+.LBB0_780:                              # =>This Inner Loop Header: Depth=1
+	mov	edi, dword ptr [rcx + 4*rsi]
+	imul	edi, dword ptr [rdx + 4*rsi]
+	mov	dword ptr [r8 + 4*rsi], edi
 	add	rsi, 1
-	add	rdi, -1
-	jne	.LBB0_195
-.LBB0_196:
+	add	rax, -1
+	jne	.LBB0_780
+.LBB0_781:
 	cmp	r9, 3
-	jb	.LBB0_537
-.LBB0_197:                              # =>This Inner Loop Header: Depth=1
-	movzx	eax, byte ptr [rcx + rsi]
-	add	al, byte ptr [rdx + rsi]
-	mov	byte ptr [r8 + rsi], al
-	movzx	eax, byte ptr [rcx + rsi + 1]
-	add	al, byte ptr [rdx + rsi + 1]
-	mov	byte ptr [r8 + rsi + 1], al
-	movzx	eax, byte ptr [rcx + rsi + 2]
-	add	al, byte ptr [rdx + rsi + 2]
-	mov	byte ptr [r8 + rsi + 2], al
-	movzx	eax, byte ptr [rcx + rsi + 3]
-	add	al, byte ptr [rdx + rsi + 3]
-	mov	byte ptr [r8 + rsi + 3], al
+	jb	.LBB0_825
+.LBB0_782:                              # =>This Inner Loop Header: Depth=1
+	mov	eax, dword ptr [rcx + 4*rsi]
+	imul	eax, dword ptr [rdx + 4*rsi]
+	mov	dword ptr [r8 + 4*rsi], eax
+	mov	eax, dword ptr [rcx + 4*rsi + 4]
+	imul	eax, dword ptr [rdx + 4*rsi + 4]
+	mov	dword ptr [r8 + 4*rsi + 4], eax
+	mov	eax, dword ptr [rcx + 4*rsi + 8]
+	imul	eax, dword ptr [rdx + 4*rsi + 8]
+	mov	dword ptr [r8 + 4*rsi + 8], eax
+	mov	eax, dword ptr [rcx + 4*rsi + 12]
+	imul	eax, dword ptr [rdx + 4*rsi + 12]
+	mov	dword ptr [r8 + 4*rsi + 12], eax
 	add	rsi, 4
 	cmp	r10, rsi
-	jne	.LBB0_197
-	jmp	.LBB0_537
-.LBB0_369:
-	lea	rsi, [r8 + 8*r10]
-	lea	rax, [rdx + 8*r10]
+	jne	.LBB0_782
+	jmp	.LBB0_825
+.LBB0_103:
+	lea	rsi, [r8 + 4*r10]
+	lea	rax, [rdx + 4*r10]
 	cmp	rax, r8
 	seta	r9b
-	lea	rax, [rcx + 8*r10]
+	lea	rax, [rcx + 4*r10]
 	cmp	rsi, rdx
 	seta	r11b
 	cmp	rax, r8
@@ -1493,73 +3981,73 @@ arithmetic_avx2:                        # @arithmetic_avx2
 	seta	dil
 	xor	esi, esi
 	test	r9b, r11b
-	jne	.LBB0_374
-# %bb.370:
+	jne	.LBB0_108
+# %bb.104:
 	and	al, dil
-	jne	.LBB0_374
-# %bb.371:
+	jne	.LBB0_108
+# %bb.105:
 	mov	esi, r10d
-	and	esi, -16
+	and	esi, -32
 	xor	edi, edi
-.LBB0_372:                              # =>This Inner Loop Header: Depth=1
-	vmovdqu	ymm0, ymmword ptr [rdx + 8*rdi]
-	vmovdqu	ymm1, ymmword ptr [rdx + 8*rdi + 32]
-	vmovdqu	ymm2, ymmword ptr [rdx + 8*rdi + 64]
-	vmovdqu	ymm3, ymmword ptr [rdx + 8*rdi + 96]
-	vpsubq	ymm0, ymm0, ymmword ptr [rcx + 8*rdi]
-	vpsubq	ymm1, ymm1, ymmword ptr [rcx + 8*rdi + 32]
-	vpsubq	ymm2, ymm2, ymmword ptr [rcx + 8*rdi + 64]
-	vpsubq	ymm3, ymm3, ymmword ptr [rcx + 8*rdi + 96]
-	vmovdqu	ymmword ptr [r8 + 8*rdi], ymm0
-	vmovdqu	ymmword ptr [r8 + 8*rdi + 32], ymm1
-	vmovdqu	ymmword ptr [r8 + 8*rdi + 64], ymm2
-	vmovdqu	ymmword ptr [r8 + 8*rdi + 96], ymm3
-	add	rdi, 16
+.LBB0_106:                              # =>This Inner Loop Header: Depth=1
+	vmovdqu	ymm0, ymmword ptr [rcx + 4*rdi]
+	vmovdqu	ymm1, ymmword ptr [rcx + 4*rdi + 32]
+	vmovdqu	ymm2, ymmword ptr [rcx + 4*rdi + 64]
+	vmovdqu	ymm3, ymmword ptr [rcx + 4*rdi + 96]
+	vpaddd	ymm0, ymm0, ymmword ptr [rdx + 4*rdi]
+	vpaddd	ymm1, ymm1, ymmword ptr [rdx + 4*rdi + 32]
+	vpaddd	ymm2, ymm2, ymmword ptr [rdx + 4*rdi + 64]
+	vpaddd	ymm3, ymm3, ymmword ptr [rdx + 4*rdi + 96]
+	vmovdqu	ymmword ptr [r8 + 4*rdi], ymm0
+	vmovdqu	ymmword ptr [r8 + 4*rdi + 32], ymm1
+	vmovdqu	ymmword ptr [r8 + 4*rdi + 64], ymm2
+	vmovdqu	ymmword ptr [r8 + 4*rdi + 96], ymm3
+	add	rdi, 32
 	cmp	rsi, rdi
-	jne	.LBB0_372
-# %bb.373:
+	jne	.LBB0_106
+# %bb.107:
 	cmp	rsi, r10
-	je	.LBB0_537
-.LBB0_374:
+	je	.LBB0_825
+.LBB0_108:
 	mov	r9, rsi
 	not	r9
 	add	r9, r10
 	mov	rax, r10
 	and	rax, 3
-	je	.LBB0_376
-.LBB0_375:                              # =>This Inner Loop Header: Depth=1
-	mov	rdi, qword ptr [rdx + 8*rsi]
-	sub	rdi, qword ptr [rcx + 8*rsi]
-	mov	qword ptr [r8 + 8*rsi], rdi
+	je	.LBB0_110
+.LBB0_109:                              # =>This Inner Loop Header: Depth=1
+	mov	edi, dword ptr [rcx + 4*rsi]
+	add	edi, dword ptr [rdx + 4*rsi]
+	mov	dword ptr [r8 + 4*rsi], edi
 	add	rsi, 1
 	add	rax, -1
-	jne	.LBB0_375
-.LBB0_376:
+	jne	.LBB0_109
+.LBB0_110:
 	cmp	r9, 3
-	jb	.LBB0_537
-.LBB0_377:                              # =>This Inner Loop Header: Depth=1
-	mov	rax, qword ptr [rdx + 8*rsi]
-	sub	rax, qword ptr [rcx + 8*rsi]
-	mov	qword ptr [r8 + 8*rsi], rax
-	mov	rax, qword ptr [rdx + 8*rsi + 8]
-	sub	rax, qword ptr [rcx + 8*rsi + 8]
-	mov	qword ptr [r8 + 8*rsi + 8], rax
-	mov	rax, qword ptr [rdx + 8*rsi + 16]
-	sub	rax, qword ptr [rcx + 8*rsi + 16]
-	mov	qword ptr [r8 + 8*rsi + 16], rax
-	mov	rax, qword ptr [rdx + 8*rsi + 24]
-	sub	rax, qword ptr [rcx + 8*rsi + 24]
-	mov	qword ptr [r8 + 8*rsi + 24], rax
+	jb	.LBB0_825
+.LBB0_111:                              # =>This Inner Loop Header: Depth=1
+	mov	eax, dword ptr [rcx + 4*rsi]
+	add	eax, dword ptr [rdx + 4*rsi]
+	mov	dword ptr [r8 + 4*rsi], eax
+	mov	eax, dword ptr [rcx + 4*rsi + 4]
+	add	eax, dword ptr [rdx + 4*rsi + 4]
+	mov	dword ptr [r8 + 4*rsi + 4], eax
+	mov	eax, dword ptr [rcx + 4*rsi + 8]
+	add	eax, dword ptr [rdx + 4*rsi + 8]
+	mov	dword ptr [r8 + 4*rsi + 8], eax
+	mov	eax, dword ptr [rcx + 4*rsi + 12]
+	add	eax, dword ptr [rdx + 4*rsi + 12]
+	mov	dword ptr [r8 + 4*rsi + 12], eax
 	add	rsi, 4
 	cmp	r10, rsi
-	jne	.LBB0_377
-	jmp	.LBB0_537
-.LBB0_495:
-	lea	rsi, [r8 + 8*r10]
-	lea	rax, [rdx + 8*r10]
+	jne	.LBB0_111
+	jmp	.LBB0_825
+.LBB0_369:
+	lea	rsi, [r8 + 4*r10]
+	lea	rax, [rdx + 4*r10]
 	cmp	rax, r8
 	seta	r9b
-	lea	rax, [rcx + 8*r10]
+	lea	rax, [rcx + 4*r10]
 	cmp	rsi, rdx
 	seta	r11b
 	cmp	rax, r8
@@ -1568,73 +4056,73 @@ arithmetic_avx2:                        # @arithmetic_avx2
 	seta	dil
 	xor	esi, esi
 	test	r9b, r11b
-	jne	.LBB0_500
-# %bb.496:
+	jne	.LBB0_374
+# %bb.370:
 	and	al, dil
-	jne	.LBB0_500
-# %bb.497:
+	jne	.LBB0_374
+# %bb.371:
 	mov	esi, r10d
-	and	esi, -16
+	and	esi, -32
 	xor	edi, edi
-.LBB0_498:                              # =>This Inner Loop Header: Depth=1
-	vmovdqu	ymm0, ymmword ptr [rdx + 8*rdi]
-	vmovdqu	ymm1, ymmword ptr [rdx + 8*rdi + 32]
-	vmovdqu	ymm2, ymmword ptr [rdx + 8*rdi + 64]
-	vmovdqu	ymm3, ymmword ptr [rdx + 8*rdi + 96]
-	vpsubq	ymm0, ymm0, ymmword ptr [rcx + 8*rdi]
-	vpsubq	ymm1, ymm1, ymmword ptr [rcx + 8*rdi + 32]
-	vpsubq	ymm2, ymm2, ymmword ptr [rcx + 8*rdi + 64]
-	vpsubq	ymm3, ymm3, ymmword ptr [rcx + 8*rdi + 96]
-	vmovdqu	ymmword ptr [r8 + 8*rdi], ymm0
-	vmovdqu	ymmword ptr [r8 + 8*rdi + 32], ymm1
-	vmovdqu	ymmword ptr [r8 + 8*rdi + 64], ymm2
-	vmovdqu	ymmword ptr [r8 + 8*rdi + 96], ymm3
-	add	rdi, 16
+.LBB0_372:                              # =>This Inner Loop Header: Depth=1
+	vmovdqu	ymm0, ymmword ptr [rdx + 4*rdi]
+	vmovdqu	ymm1, ymmword ptr [rdx + 4*rdi + 32]
+	vmovdqu	ymm2, ymmword ptr [rdx + 4*rdi + 64]
+	vmovdqu	ymm3, ymmword ptr [rdx + 4*rdi + 96]
+	vpsubd	ymm0, ymm0, ymmword ptr [rcx + 4*rdi]
+	vpsubd	ymm1, ymm1, ymmword ptr [rcx + 4*rdi + 32]
+	vpsubd	ymm2, ymm2, ymmword ptr [rcx + 4*rdi + 64]
+	vpsubd	ymm3, ymm3, ymmword ptr [rcx + 4*rdi + 96]
+	vmovdqu	ymmword ptr [r8 + 4*rdi], ymm0
+	vmovdqu	ymmword ptr [r8 + 4*rdi + 32], ymm1
+	vmovdqu	ymmword ptr [r8 + 4*rdi + 64], ymm2
+	vmovdqu	ymmword ptr [r8 + 4*rdi + 96], ymm3
+	add	rdi, 32
 	cmp	rsi, rdi
-	jne	.LBB0_498
-# %bb.499:
+	jne	.LBB0_372
+# %bb.373:
 	cmp	rsi, r10
-	je	.LBB0_537
-.LBB0_500:
+	je	.LBB0_825
+.LBB0_374:
 	mov	r9, rsi
 	not	r9
 	add	r9, r10
 	mov	rax, r10
-	and	rax, 3
-	je	.LBB0_502
-.LBB0_501:                              # =>This Inner Loop Header: Depth=1
-	mov	rdi, qword ptr [rdx + 8*rsi]
-	sub	rdi, qword ptr [rcx + 8*rsi]
-	mov	qword ptr [r8 + 8*rsi], rdi
-	add	rsi, 1
-	add	rax, -1
-	jne	.LBB0_501
-.LBB0_502:
-	cmp	r9, 3
-	jb	.LBB0_537
-.LBB0_503:                              # =>This Inner Loop Header: Depth=1
-	mov	rax, qword ptr [rdx + 8*rsi]
-	sub	rax, qword ptr [rcx + 8*rsi]
-	mov	qword ptr [r8 + 8*rsi], rax
-	mov	rax, qword ptr [rdx + 8*rsi + 8]
-	sub	rax, qword ptr [rcx + 8*rsi + 8]
-	mov	qword ptr [r8 + 8*rsi + 8], rax
-	mov	rax, qword ptr [rdx + 8*rsi + 16]
-	sub	rax, qword ptr [rcx + 8*rsi + 16]
-	mov	qword ptr [r8 + 8*rsi + 16], rax
-	mov	rax, qword ptr [rdx + 8*rsi + 24]
-	sub	rax, qword ptr [rcx + 8*rsi + 24]
-	mov	qword ptr [r8 + 8*rsi + 24], rax
+	and	rax, 3
+	je	.LBB0_376
+.LBB0_375:                              # =>This Inner Loop Header: Depth=1
+	mov	edi, dword ptr [rdx + 4*rsi]
+	sub	edi, dword ptr [rcx + 4*rsi]
+	mov	dword ptr [r8 + 4*rsi], edi
+	add	rsi, 1
+	add	rax, -1
+	jne	.LBB0_375
+.LBB0_376:
+	cmp	r9, 3
+	jb	.LBB0_825
+.LBB0_377:                              # =>This Inner Loop Header: Depth=1
+	mov	eax, dword ptr [rdx + 4*rsi]
+	sub	eax, dword ptr [rcx + 4*rsi]
+	mov	dword ptr [r8 + 4*rsi], eax
+	mov	eax, dword ptr [rdx + 4*rsi + 4]
+	sub	eax, dword ptr [rcx + 4*rsi + 4]
+	mov	dword ptr [r8 + 4*rsi + 4], eax
+	mov	eax, dword ptr [rdx + 4*rsi + 8]
+	sub	eax, dword ptr [rcx + 4*rsi + 8]
+	mov	dword ptr [r8 + 4*rsi + 8], eax
+	mov	eax, dword ptr [rdx + 4*rsi + 12]
+	sub	eax, dword ptr [rcx + 4*rsi + 12]
+	mov	dword ptr [r8 + 4*rsi + 12], eax
 	add	rsi, 4
 	cmp	r10, rsi
-	jne	.LBB0_503
-	jmp	.LBB0_537
-.LBB0_110:
-	lea	rsi, [r8 + 8*r10]
-	lea	rax, [rdx + 8*r10]
+	jne	.LBB0_377
+	jmp	.LBB0_825
+.LBB0_236:
+	lea	rsi, [r8 + 4*r10]
+	lea	rax, [rdx + 4*r10]
 	cmp	rax, r8
 	seta	r9b
-	lea	rax, [rcx + 8*r10]
+	lea	rax, [rcx + 4*r10]
 	cmp	rsi, rdx
 	seta	r11b
 	cmp	rax, r8
@@ -1643,73 +4131,73 @@ arithmetic_avx2:                        # @arithmetic_avx2
 	seta	dil
 	xor	esi, esi
 	test	r9b, r11b
-	jne	.LBB0_115
-# %bb.111:
+	jne	.LBB0_241
+# %bb.237:
 	and	al, dil
-	jne	.LBB0_115
-# %bb.112:
+	jne	.LBB0_241
+# %bb.238:
 	mov	esi, r10d
-	and	esi, -16
+	and	esi, -32
 	xor	edi, edi
-.LBB0_113:                              # =>This Inner Loop Header: Depth=1
-	vmovdqu	ymm0, ymmword ptr [rcx + 8*rdi]
-	vmovdqu	ymm1, ymmword ptr [rcx + 8*rdi + 32]
-	vmovdqu	ymm2, ymmword ptr [rcx + 8*rdi + 64]
-	vmovdqu	ymm3, ymmword ptr [rcx + 8*rdi + 96]
-	vpaddq	ymm0, ymm0, ymmword ptr [rdx + 8*rdi]
-	vpaddq	ymm1, ymm1, ymmword ptr [rdx + 8*rdi + 32]
-	vpaddq	ymm2, ymm2, ymmword ptr [rdx + 8*rdi + 64]
-	vpaddq	ymm3, ymm3, ymmword ptr [rdx + 8*rdi + 96]
-	vmovdqu	ymmword ptr [r8 + 8*rdi], ymm0
-	vmovdqu	ymmword ptr [r8 + 8*rdi + 32], ymm1
-	vmovdqu	ymmword ptr [r8 + 8*rdi + 64], ymm2
-	vmovdqu	ymmword ptr [r8 + 8*rdi + 96], ymm3
-	add	rdi, 16
+.LBB0_239:                              # =>This Inner Loop Header: Depth=1
+	vmovdqu	ymm0, ymmword ptr [rcx + 4*rdi]
+	vmovdqu	ymm1, ymmword ptr [rcx + 4*rdi + 32]
+	vmovdqu	ymm2, ymmword ptr [rcx + 4*rdi + 64]
+	vmovdqu	ymm3, ymmword ptr [rcx + 4*rdi + 96]
+	vpaddd	ymm0, ymm0, ymmword ptr [rdx + 4*rdi]
+	vpaddd	ymm1, ymm1, ymmword ptr [rdx + 4*rdi + 32]
+	vpaddd	ymm2, ymm2, ymmword ptr [rdx + 4*rdi + 64]
+	vpaddd	ymm3, ymm3, ymmword ptr [rdx + 4*rdi + 96]
+	vmovdqu	ymmword ptr [r8 + 4*rdi], ymm0
+	vmovdqu	ymmword ptr [r8 + 4*rdi + 32], ymm1
+	vmovdqu	ymmword ptr [r8 + 4*rdi + 64], ymm2
+	vmovdqu	ymmword ptr [r8 + 4*rdi + 96], ymm3
+	add	rdi, 32
 	cmp	rsi, rdi
-	jne	.LBB0_113
-# %bb.114:
+	jne	.LBB0_239
+# %bb.240:
 	cmp	rsi, r10
-	je	.LBB0_537
-.LBB0_115:
+	je	.LBB0_825
+.LBB0_241:
 	mov	r9, rsi
 	not	r9
 	add	r9, r10
 	mov	rax, r10
 	and	rax, 3
-	je	.LBB0_117
-.LBB0_116:                              # =>This Inner Loop Header: Depth=1
-	mov	rdi, qword ptr [rcx + 8*rsi]
-	add	rdi, qword ptr [rdx + 8*rsi]
-	mov	qword ptr [r8 + 8*rsi], rdi
+	je	.LBB0_243
+.LBB0_242:                              # =>This Inner Loop Header: Depth=1
+	mov	edi, dword ptr [rcx + 4*rsi]
+	add	edi, dword ptr [rdx + 4*rsi]
+	mov	dword ptr [r8 + 4*rsi], edi
 	add	rsi, 1
 	add	rax, -1
-	jne	.LBB0_116
-.LBB0_117:
+	jne	.LBB0_242
+.LBB0_243:
 	cmp	r9, 3
-	jb	.LBB0_537
-.LBB0_118:                              # =>This Inner Loop Header: Depth=1
-	mov	rax, qword ptr [rcx + 8*rsi]
-	add	rax, qword ptr [rdx + 8*rsi]
-	mov	qword ptr [r8 + 8*rsi], rax
-	mov	rax, qword ptr [rcx + 8*rsi + 8]
-	add	rax, qword ptr [rdx + 8*rsi + 8]
-	mov	qword ptr [r8 + 8*rsi + 8], rax
-	mov	rax, qword ptr [rcx + 8*rsi + 16]
-	add	rax, qword ptr [rdx + 8*rsi + 16]
-	mov	qword ptr [r8 + 8*rsi + 16], rax
-	mov	rax, qword ptr [rcx + 8*rsi + 24]
-	add	rax, qword ptr [rdx + 8*rsi + 24]
-	mov	qword ptr [r8 + 8*rsi + 24], rax
+	jb	.LBB0_825
+.LBB0_244:                              # =>This Inner Loop Header: Depth=1
+	mov	eax, dword ptr [rcx + 4*rsi]
+	add	eax, dword ptr [rdx + 4*rsi]
+	mov	dword ptr [r8 + 4*rsi], eax
+	mov	eax, dword ptr [rcx + 4*rsi + 4]
+	add	eax, dword ptr [rdx + 4*rsi + 4]
+	mov	dword ptr [r8 + 4*rsi + 4], eax
+	mov	eax, dword ptr [rcx + 4*rsi + 8]
+	add	eax, dword ptr [rdx + 4*rsi + 8]
+	mov	dword ptr [r8 + 4*rsi + 8], eax
+	mov	eax, dword ptr [rcx + 4*rsi + 12]
+	add	eax, dword ptr [rdx + 4*rsi + 12]
+	mov	dword ptr [r8 + 4*rsi + 12], eax
 	add	rsi, 4
 	cmp	r10, rsi
-	jne	.LBB0_118
-	jmp	.LBB0_537
-.LBB0_243:
-	lea	rsi, [r8 + 8*r10]
-	lea	rax, [rdx + 8*r10]
+	jne	.LBB0_244
+	jmp	.LBB0_825
+.LBB0_502:
+	lea	rsi, [r8 + 4*r10]
+	lea	rax, [rdx + 4*r10]
 	cmp	rax, r8
 	seta	r9b
-	lea	rax, [rcx + 8*r10]
+	lea	rax, [rcx + 4*r10]
 	cmp	rsi, rdx
 	seta	r11b
 	cmp	rax, r8
@@ -1718,73 +4206,73 @@ arithmetic_avx2:                        # @arithmetic_avx2
 	seta	dil
 	xor	esi, esi
 	test	r9b, r11b
-	jne	.LBB0_248
-# %bb.244:
+	jne	.LBB0_507
+# %bb.503:
 	and	al, dil
-	jne	.LBB0_248
-# %bb.245:
+	jne	.LBB0_507
+# %bb.504:
 	mov	esi, r10d
-	and	esi, -16
+	and	esi, -32
 	xor	edi, edi
-.LBB0_246:                              # =>This Inner Loop Header: Depth=1
-	vmovdqu	ymm0, ymmword ptr [rcx + 8*rdi]
-	vmovdqu	ymm1, ymmword ptr [rcx + 8*rdi + 32]
-	vmovdqu	ymm2, ymmword ptr [rcx + 8*rdi + 64]
-	vmovdqu	ymm3, ymmword ptr [rcx + 8*rdi + 96]
-	vpaddq	ymm0, ymm0, ymmword ptr [rdx + 8*rdi]
-	vpaddq	ymm1, ymm1, ymmword ptr [rdx + 8*rdi + 32]
-	vpaddq	ymm2, ymm2, ymmword ptr [rdx + 8*rdi + 64]
-	vpaddq	ymm3, ymm3, ymmword ptr [rdx + 8*rdi + 96]
-	vmovdqu	ymmword ptr [r8 + 8*rdi], ymm0
-	vmovdqu	ymmword ptr [r8 + 8*rdi + 32], ymm1
-	vmovdqu	ymmword ptr [r8 + 8*rdi + 64], ymm2
-	vmovdqu	ymmword ptr [r8 + 8*rdi + 96], ymm3
-	add	rdi, 16
+.LBB0_505:                              # =>This Inner Loop Header: Depth=1
+	vmovdqu	ymm0, ymmword ptr [rdx + 4*rdi]
+	vmovdqu	ymm1, ymmword ptr [rdx + 4*rdi + 32]
+	vmovdqu	ymm2, ymmword ptr [rdx + 4*rdi + 64]
+	vmovdqu	ymm3, ymmword ptr [rdx + 4*rdi + 96]
+	vpsubd	ymm0, ymm0, ymmword ptr [rcx + 4*rdi]
+	vpsubd	ymm1, ymm1, ymmword ptr [rcx + 4*rdi + 32]
+	vpsubd	ymm2, ymm2, ymmword ptr [rcx + 4*rdi + 64]
+	vpsubd	ymm3, ymm3, ymmword ptr [rcx + 4*rdi + 96]
+	vmovdqu	ymmword ptr [r8 + 4*rdi], ymm0
+	vmovdqu	ymmword ptr [r8 + 4*rdi + 32], ymm1
+	vmovdqu	ymmword ptr [r8 + 4*rdi + 64], ymm2
+	vmovdqu	ymmword ptr [r8 + 4*rdi + 96], ymm3
+	add	rdi, 32
 	cmp	rsi, rdi
-	jne	.LBB0_246
-# %bb.247:
+	jne	.LBB0_505
+# %bb.506:
 	cmp	rsi, r10
-	je	.LBB0_537
-.LBB0_248:
+	je	.LBB0_825
+.LBB0_507:
 	mov	r9, rsi
 	not	r9
 	add	r9, r10
 	mov	rax, r10
 	and	rax, 3
-	je	.LBB0_250
-.LBB0_249:                              # =>This Inner Loop Header: Depth=1
-	mov	rdi, qword ptr [rcx + 8*rsi]
-	add	rdi, qword ptr [rdx + 8*rsi]
-	mov	qword ptr [r8 + 8*rsi], rdi
+	je	.LBB0_509
+.LBB0_508:                              # =>This Inner Loop Header: Depth=1
+	mov	edi, dword ptr [rdx + 4*rsi]
+	sub	edi, dword ptr [rcx + 4*rsi]
+	mov	dword ptr [r8 + 4*rsi], edi
 	add	rsi, 1
 	add	rax, -1
-	jne	.LBB0_249
-.LBB0_250:
+	jne	.LBB0_508
+.LBB0_509:
 	cmp	r9, 3
-	jb	.LBB0_537
-.LBB0_251:                              # =>This Inner Loop Header: Depth=1
-	mov	rax, qword ptr [rcx + 8*rsi]
-	add	rax, qword ptr [rdx + 8*rsi]
-	mov	qword ptr [r8 + 8*rsi], rax
-	mov	rax, qword ptr [rcx + 8*rsi + 8]
-	add	rax, qword ptr [rdx + 8*rsi + 8]
-	mov	qword ptr [r8 + 8*rsi + 8], rax
-	mov	rax, qword ptr [rcx + 8*rsi + 16]
-	add	rax, qword ptr [rdx + 8*rsi + 16]
-	mov	qword ptr [r8 + 8*rsi + 16], rax
-	mov	rax, qword ptr [rcx + 8*rsi + 24]
-	add	rax, qword ptr [rdx + 8*rsi + 24]
-	mov	qword ptr [r8 + 8*rsi + 24], rax
+	jb	.LBB0_825
+.LBB0_510:                              # =>This Inner Loop Header: Depth=1
+	mov	eax, dword ptr [rdx + 4*rsi]
+	sub	eax, dword ptr [rcx + 4*rsi]
+	mov	dword ptr [r8 + 4*rsi], eax
+	mov	eax, dword ptr [rdx + 4*rsi + 4]
+	sub	eax, dword ptr [rcx + 4*rsi + 4]
+	mov	dword ptr [r8 + 4*rsi + 4], eax
+	mov	eax, dword ptr [rdx + 4*rsi + 8]
+	sub	eax, dword ptr [rcx + 4*rsi + 8]
+	mov	dword ptr [r8 + 4*rsi + 8], eax
+	mov	eax, dword ptr [rdx + 4*rsi + 12]
+	sub	eax, dword ptr [rcx + 4*rsi + 12]
+	mov	dword ptr [r8 + 4*rsi + 12], eax
 	add	rsi, 4
 	cmp	r10, rsi
-	jne	.LBB0_251
-	jmp	.LBB0_537
-.LBB0_327:
-	lea	rsi, [r8 + 2*r10]
-	lea	rax, [rdx + 2*r10]
+	jne	.LBB0_510
+	jmp	.LBB0_825
+.LBB0_626:
+	lea	rsi, [r8 + 4*r10]
+	lea	rax, [rdx + 4*r10]
 	cmp	rax, r8
 	seta	r9b
-	lea	rax, [rcx + 2*r10]
+	lea	rax, [rcx + 4*r10]
 	cmp	rsi, rdx
 	seta	r11b
 	cmp	rax, r8
@@ -1793,73 +4281,82 @@ arithmetic_avx2:                        # @arithmetic_avx2
 	seta	dil
 	xor	esi, esi
 	test	r9b, r11b
-	jne	.LBB0_332
-# %bb.328:
+	jne	.LBB0_631
+# %bb.627:
 	and	al, dil
-	jne	.LBB0_332
-# %bb.329:
+	jne	.LBB0_631
+# %bb.628:
 	mov	esi, r10d
-	and	esi, -64
+	and	esi, -32
 	xor	edi, edi
-.LBB0_330:                              # =>This Inner Loop Header: Depth=1
-	vmovdqu	ymm0, ymmword ptr [rdx + 2*rdi]
-	vmovdqu	ymm1, ymmword ptr [rdx + 2*rdi + 32]
-	vmovdqu	ymm2, ymmword ptr [rdx + 2*rdi + 64]
-	vmovdqu	ymm3, ymmword ptr [rdx + 2*rdi + 96]
-	vpsubw	ymm0, ymm0, ymmword ptr [rcx + 2*rdi]
-	vpsubw	ymm1, ymm1, ymmword ptr [rcx + 2*rdi + 32]
-	vpsubw	ymm2, ymm2, ymmword ptr [rcx + 2*rdi + 64]
-	vpsubw	ymm3, ymm3, ymmword ptr [rcx + 2*rdi + 96]
-	vmovdqu	ymmword ptr [r8 + 2*rdi], ymm0
-	vmovdqu	ymmword ptr [r8 + 2*rdi + 32], ymm1
-	vmovdqu	ymmword ptr [r8 + 2*rdi + 64], ymm2
-	vmovdqu	ymmword ptr [r8 + 2*rdi + 96], ymm3
-	add	rdi, 64
+.LBB0_629:                              # =>This Inner Loop Header: Depth=1
+	vmovdqu	ymm0, ymmword ptr [rcx + 4*rdi]
+	vmovdqu	ymm1, ymmword ptr [rcx + 4*rdi + 32]
+	vmovdqu	ymm2, ymmword ptr [rcx + 4*rdi + 64]
+	vmovdqu	ymm3, ymmword ptr [rcx + 4*rdi + 96]
+	vpmulld	ymm0, ymm0, ymmword ptr [rdx + 4*rdi]
+	vpmulld	ymm1, ymm1, ymmword ptr [rdx + 4*rdi + 32]
+	vpmulld	ymm2, ymm2, ymmword ptr [rdx + 4*rdi + 64]
+	vpmulld	ymm3, ymm3, ymmword ptr [rdx + 4*rdi + 96]
+	vmovdqu	ymmword ptr [r8 + 4*rdi], ymm0
+	vmovdqu	ymmword ptr [r8 + 4*rdi + 32], ymm1
+	vmovdqu	ymmword ptr [r8 + 4*rdi + 64], ymm2
+	vmovdqu	ymmword ptr [r8 + 4*rdi + 96], ymm3
+	add	rdi, 32
 	cmp	rsi, rdi
-	jne	.LBB0_330
-# %bb.331:
+	jne	.LBB0_629
+# %bb.630:
 	cmp	rsi, r10
-	je	.LBB0_537
-.LBB0_332:
-	mov	r9, rsi
-	not	r9
-	add	r9, r10
-	mov	rax, r10
-	and	rax, 3
-	je	.LBB0_334
-.LBB0_333:                              # =>This Inner Loop Header: Depth=1
-	movzx	edi, word ptr [rdx + 2*rsi]
-	sub	di, word ptr [rcx + 2*rsi]
-	mov	word ptr [r8 + 2*rsi], di
-	add	rsi, 1
-	add	rax, -1
-	jne	.LBB0_333
-.LBB0_334:
-	cmp	r9, 3
-	jb	.LBB0_537
-.LBB0_335:                              # =>This Inner Loop Header: Depth=1
-	movzx	eax, word ptr [rdx + 2*rsi]
-	sub	ax, word ptr [rcx + 2*rsi]
-	mov	word ptr [r8 + 2*rsi], ax
-	movzx	eax, word ptr [rdx + 2*rsi + 2]
-	sub	ax, word ptr [rcx + 2*rsi + 2]
-	mov	word ptr [r8 + 2*rsi + 2], ax
-	movzx	eax, word ptr [rdx + 2*rsi + 4]
-	sub	ax, word ptr [rcx + 2*rsi + 4]
-	mov	word ptr [r8 + 2*rsi + 4], ax
-	movzx	eax, word ptr [rdx + 2*rsi + 6]
-	sub	ax, word ptr [rcx + 2*rsi + 6]
-	mov	word ptr [r8 + 2*rsi + 6], ax
-	add	rsi, 4
-	cmp	r10, rsi
-	jne	.LBB0_335
-	jmp	.LBB0_537
-.LBB0_339:
-	lea	rsi, [r8 + 2*r10]
-	lea	rax, [rdx + 2*r10]
+	jne	.LBB0_631
+	jmp	.LBB0_825
+.LBB0_762:
+	lea	rsi, [r8 + 4*r10]
+	lea	rax, [rdx + 4*r10]
+	cmp	rax, r8
+	seta	r9b
+	lea	rax, [rcx + 4*r10]
+	cmp	rsi, rdx
+	seta	r11b
+	cmp	rax, r8
+	seta	al
+	cmp	rsi, rcx
+	seta	dil
+	xor	esi, esi
+	test	r9b, r11b
+	jne	.LBB0_767
+# %bb.763:
+	and	al, dil
+	jne	.LBB0_767
+# %bb.764:
+	mov	esi, r10d
+	and	esi, -32
+	xor	edi, edi
+.LBB0_765:                              # =>This Inner Loop Header: Depth=1
+	vmovdqu	ymm0, ymmword ptr [rcx + 4*rdi]
+	vmovdqu	ymm1, ymmword ptr [rcx + 4*rdi + 32]
+	vmovdqu	ymm2, ymmword ptr [rcx + 4*rdi + 64]
+	vmovdqu	ymm3, ymmword ptr [rcx + 4*rdi + 96]
+	vpmulld	ymm0, ymm0, ymmword ptr [rdx + 4*rdi]
+	vpmulld	ymm1, ymm1, ymmword ptr [rdx + 4*rdi + 32]
+	vpmulld	ymm2, ymm2, ymmword ptr [rdx + 4*rdi + 64]
+	vpmulld	ymm3, ymm3, ymmword ptr [rdx + 4*rdi + 96]
+	vmovdqu	ymmword ptr [r8 + 4*rdi], ymm0
+	vmovdqu	ymmword ptr [r8 + 4*rdi + 32], ymm1
+	vmovdqu	ymmword ptr [r8 + 4*rdi + 64], ymm2
+	vmovdqu	ymmword ptr [r8 + 4*rdi + 96], ymm3
+	add	rdi, 32
+	cmp	rsi, rdi
+	jne	.LBB0_765
+# %bb.766:
+	cmp	rsi, r10
+	jne	.LBB0_767
+	jmp	.LBB0_825
+.LBB0_357:
+	lea	rsi, [r8 + 4*r10]
+	lea	rax, [rdx + 4*r10]
 	cmp	rax, r8
 	seta	r9b
-	lea	rax, [rcx + 2*r10]
+	lea	rax, [rcx + 4*r10]
 	cmp	rsi, rdx
 	seta	r11b
 	cmp	rax, r8
@@ -1868,73 +4365,40 @@ arithmetic_avx2:                        # @arithmetic_avx2
 	seta	dil
 	xor	esi, esi
 	test	r9b, r11b
-	jne	.LBB0_344
-# %bb.340:
+	jne	.LBB0_362
+# %bb.358:
 	and	al, dil
-	jne	.LBB0_344
-# %bb.341:
+	jne	.LBB0_362
+# %bb.359:
 	mov	esi, r10d
-	and	esi, -64
+	and	esi, -32
 	xor	edi, edi
-.LBB0_342:                              # =>This Inner Loop Header: Depth=1
-	vmovdqu	ymm0, ymmword ptr [rdx + 2*rdi]
-	vmovdqu	ymm1, ymmword ptr [rdx + 2*rdi + 32]
-	vmovdqu	ymm2, ymmword ptr [rdx + 2*rdi + 64]
-	vmovdqu	ymm3, ymmword ptr [rdx + 2*rdi + 96]
-	vpsubw	ymm0, ymm0, ymmword ptr [rcx + 2*rdi]
-	vpsubw	ymm1, ymm1, ymmword ptr [rcx + 2*rdi + 32]
-	vpsubw	ymm2, ymm2, ymmword ptr [rcx + 2*rdi + 64]
-	vpsubw	ymm3, ymm3, ymmword ptr [rcx + 2*rdi + 96]
-	vmovdqu	ymmword ptr [r8 + 2*rdi], ymm0
-	vmovdqu	ymmword ptr [r8 + 2*rdi + 32], ymm1
-	vmovdqu	ymmword ptr [r8 + 2*rdi + 64], ymm2
-	vmovdqu	ymmword ptr [r8 + 2*rdi + 96], ymm3
-	add	rdi, 64
+.LBB0_360:                              # =>This Inner Loop Header: Depth=1
+	vmovdqu	ymm0, ymmword ptr [rdx + 4*rdi]
+	vmovdqu	ymm1, ymmword ptr [rdx + 4*rdi + 32]
+	vmovdqu	ymm2, ymmword ptr [rdx + 4*rdi + 64]
+	vmovdqu	ymm3, ymmword ptr [rdx + 4*rdi + 96]
+	vpsubd	ymm0, ymm0, ymmword ptr [rcx + 4*rdi]
+	vpsubd	ymm1, ymm1, ymmword ptr [rcx + 4*rdi + 32]
+	vpsubd	ymm2, ymm2, ymmword ptr [rcx + 4*rdi + 64]
+	vpsubd	ymm3, ymm3, ymmword ptr [rcx + 4*rdi + 96]
+	vmovdqu	ymmword ptr [r8 + 4*rdi], ymm0
+	vmovdqu	ymmword ptr [r8 + 4*rdi + 32], ymm1
+	vmovdqu	ymmword ptr [r8 + 4*rdi + 64], ymm2
+	vmovdqu	ymmword ptr [r8 + 4*rdi + 96], ymm3
+	add	rdi, 32
 	cmp	rsi, rdi
-	jne	.LBB0_342
-# %bb.343:
+	jne	.LBB0_360
+# %bb.361:
 	cmp	rsi, r10
-	je	.LBB0_537
-.LBB0_344:
-	mov	r9, rsi
-	not	r9
-	add	r9, r10
-	mov	rax, r10
-	and	rax, 3
-	je	.LBB0_346
-.LBB0_345:                              # =>This Inner Loop Header: Depth=1
-	movzx	edi, word ptr [rdx + 2*rsi]
-	sub	di, word ptr [rcx + 2*rsi]
-	mov	word ptr [r8 + 2*rsi], di
-	add	rsi, 1
-	add	rax, -1
-	jne	.LBB0_345
-.LBB0_346:
-	cmp	r9, 3
-	jb	.LBB0_537
-.LBB0_347:                              # =>This Inner Loop Header: Depth=1
-	movzx	eax, word ptr [rdx + 2*rsi]
-	sub	ax, word ptr [rcx + 2*rsi]
-	mov	word ptr [r8 + 2*rsi], ax
-	movzx	eax, word ptr [rdx + 2*rsi + 2]
-	sub	ax, word ptr [rcx + 2*rsi + 2]
-	mov	word ptr [r8 + 2*rsi + 2], ax
-	movzx	eax, word ptr [rdx + 2*rsi + 4]
-	sub	ax, word ptr [rcx + 2*rsi + 4]
-	mov	word ptr [r8 + 2*rsi + 4], ax
-	movzx	eax, word ptr [rdx + 2*rsi + 6]
-	sub	ax, word ptr [rcx + 2*rsi + 6]
-	mov	word ptr [r8 + 2*rsi + 6], ax
-	add	rsi, 4
-	cmp	r10, rsi
-	jne	.LBB0_347
-	jmp	.LBB0_537
-.LBB0_453:
-	lea	rsi, [r8 + 2*r10]
-	lea	rax, [rdx + 2*r10]
+	jne	.LBB0_362
+	jmp	.LBB0_825
+.LBB0_490:
+	lea	rsi, [r8 + 4*r10]
+	lea	rax, [rdx + 4*r10]
 	cmp	rax, r8
 	seta	r9b
-	lea	rax, [rcx + 2*r10]
+	lea	rax, [rcx + 4*r10]
 	cmp	rsi, rdx
 	seta	r11b
 	cmp	rax, r8
@@ -1943,73 +4407,40 @@ arithmetic_avx2:                        # @arithmetic_avx2
 	seta	dil
 	xor	esi, esi
 	test	r9b, r11b
-	jne	.LBB0_458
-# %bb.454:
+	jne	.LBB0_495
+# %bb.491:
 	and	al, dil
-	jne	.LBB0_458
-# %bb.455:
+	jne	.LBB0_495
+# %bb.492:
 	mov	esi, r10d
-	and	esi, -64
+	and	esi, -32
 	xor	edi, edi
-.LBB0_456:                              # =>This Inner Loop Header: Depth=1
-	vmovdqu	ymm0, ymmword ptr [rdx + 2*rdi]
-	vmovdqu	ymm1, ymmword ptr [rdx + 2*rdi + 32]
-	vmovdqu	ymm2, ymmword ptr [rdx + 2*rdi + 64]
-	vmovdqu	ymm3, ymmword ptr [rdx + 2*rdi + 96]
-	vpsubw	ymm0, ymm0, ymmword ptr [rcx + 2*rdi]
-	vpsubw	ymm1, ymm1, ymmword ptr [rcx + 2*rdi + 32]
-	vpsubw	ymm2, ymm2, ymmword ptr [rcx + 2*rdi + 64]
-	vpsubw	ymm3, ymm3, ymmword ptr [rcx + 2*rdi + 96]
-	vmovdqu	ymmword ptr [r8 + 2*rdi], ymm0
-	vmovdqu	ymmword ptr [r8 + 2*rdi + 32], ymm1
-	vmovdqu	ymmword ptr [r8 + 2*rdi + 64], ymm2
-	vmovdqu	ymmword ptr [r8 + 2*rdi + 96], ymm3
-	add	rdi, 64
+.LBB0_493:                              # =>This Inner Loop Header: Depth=1
+	vmovdqu	ymm0, ymmword ptr [rdx + 4*rdi]
+	vmovdqu	ymm1, ymmword ptr [rdx + 4*rdi + 32]
+	vmovdqu	ymm2, ymmword ptr [rdx + 4*rdi + 64]
+	vmovdqu	ymm3, ymmword ptr [rdx + 4*rdi + 96]
+	vpsubd	ymm0, ymm0, ymmword ptr [rcx + 4*rdi]
+	vpsubd	ymm1, ymm1, ymmword ptr [rcx + 4*rdi + 32]
+	vpsubd	ymm2, ymm2, ymmword ptr [rcx + 4*rdi + 64]
+	vpsubd	ymm3, ymm3, ymmword ptr [rcx + 4*rdi + 96]
+	vmovdqu	ymmword ptr [r8 + 4*rdi], ymm0
+	vmovdqu	ymmword ptr [r8 + 4*rdi + 32], ymm1
+	vmovdqu	ymmword ptr [r8 + 4*rdi + 64], ymm2
+	vmovdqu	ymmword ptr [r8 + 4*rdi + 96], ymm3
+	add	rdi, 32
 	cmp	rsi, rdi
-	jne	.LBB0_456
-# %bb.457:
+	jne	.LBB0_493
+# %bb.494:
 	cmp	rsi, r10
-	je	.LBB0_537
-.LBB0_458:
-	mov	r9, rsi
-	not	r9
-	add	r9, r10
-	mov	rax, r10
-	and	rax, 3
-	je	.LBB0_460
-.LBB0_459:                              # =>This Inner Loop Header: Depth=1
-	movzx	edi, word ptr [rdx + 2*rsi]
-	sub	di, word ptr [rcx + 2*rsi]
-	mov	word ptr [r8 + 2*rsi], di
-	add	rsi, 1
-	add	rax, -1
-	jne	.LBB0_459
-.LBB0_460:
-	cmp	r9, 3
-	jb	.LBB0_537
-.LBB0_461:                              # =>This Inner Loop Header: Depth=1
-	movzx	eax, word ptr [rdx + 2*rsi]
-	sub	ax, word ptr [rcx + 2*rsi]
-	mov	word ptr [r8 + 2*rsi], ax
-	movzx	eax, word ptr [rdx + 2*rsi + 2]
-	sub	ax, word ptr [rcx + 2*rsi + 2]
-	mov	word ptr [r8 + 2*rsi + 2], ax
-	movzx	eax, word ptr [rdx + 2*rsi + 4]
-	sub	ax, word ptr [rcx + 2*rsi + 4]
-	mov	word ptr [r8 + 2*rsi + 4], ax
-	movzx	eax, word ptr [rdx + 2*rsi + 6]
-	sub	ax, word ptr [rcx + 2*rsi + 6]
-	mov	word ptr [r8 + 2*rsi + 6], ax
-	add	rsi, 4
-	cmp	r10, rsi
-	jne	.LBB0_461
-	jmp	.LBB0_537
-.LBB0_465:
-	lea	rsi, [r8 + 2*r10]
-	lea	rax, [rdx + 2*r10]
+	jne	.LBB0_495
+	jmp	.LBB0_825
+.LBB0_680:
+	lea	rsi, [r8 + 8*r10]
+	lea	rax, [rdx + 8*r10]
 	cmp	rax, r8
 	seta	r9b
-	lea	rax, [rcx + 2*r10]
+	lea	rax, [rcx + 8*r10]
 	cmp	rsi, rdx
 	seta	r11b
 	cmp	rax, r8
@@ -2018,73 +4449,40 @@ arithmetic_avx2:                        # @arithmetic_avx2
 	seta	dil
 	xor	esi, esi
 	test	r9b, r11b
-	jne	.LBB0_470
-# %bb.466:
+	jne	.LBB0_685
+# %bb.681:
 	and	al, dil
-	jne	.LBB0_470
-# %bb.467:
+	jne	.LBB0_685
+# %bb.682:
 	mov	esi, r10d
-	and	esi, -64
+	and	esi, -16
 	xor	edi, edi
-.LBB0_468:                              # =>This Inner Loop Header: Depth=1
-	vmovdqu	ymm0, ymmword ptr [rdx + 2*rdi]
-	vmovdqu	ymm1, ymmword ptr [rdx + 2*rdi + 32]
-	vmovdqu	ymm2, ymmword ptr [rdx + 2*rdi + 64]
-	vmovdqu	ymm3, ymmword ptr [rdx + 2*rdi + 96]
-	vpsubw	ymm0, ymm0, ymmword ptr [rcx + 2*rdi]
-	vpsubw	ymm1, ymm1, ymmword ptr [rcx + 2*rdi + 32]
-	vpsubw	ymm2, ymm2, ymmword ptr [rcx + 2*rdi + 64]
-	vpsubw	ymm3, ymm3, ymmword ptr [rcx + 2*rdi + 96]
-	vmovdqu	ymmword ptr [r8 + 2*rdi], ymm0
-	vmovdqu	ymmword ptr [r8 + 2*rdi + 32], ymm1
-	vmovdqu	ymmword ptr [r8 + 2*rdi + 64], ymm2
-	vmovdqu	ymmword ptr [r8 + 2*rdi + 96], ymm3
-	add	rdi, 64
+.LBB0_683:                              # =>This Inner Loop Header: Depth=1
+	vmovupd	ymm0, ymmword ptr [rcx + 8*rdi]
+	vmovupd	ymm1, ymmword ptr [rcx + 8*rdi + 32]
+	vmovupd	ymm2, ymmword ptr [rcx + 8*rdi + 64]
+	vmovupd	ymm3, ymmword ptr [rcx + 8*rdi + 96]
+	vmulpd	ymm0, ymm0, ymmword ptr [rdx + 8*rdi]
+	vmulpd	ymm1, ymm1, ymmword ptr [rdx + 8*rdi + 32]
+	vmulpd	ymm2, ymm2, ymmword ptr [rdx + 8*rdi + 64]
+	vmulpd	ymm3, ymm3, ymmword ptr [rdx + 8*rdi + 96]
+	vmovupd	ymmword ptr [r8 + 8*rdi], ymm0
+	vmovupd	ymmword ptr [r8 + 8*rdi + 32], ymm1
+	vmovupd	ymmword ptr [r8 + 8*rdi + 64], ymm2
+	vmovupd	ymmword ptr [r8 + 8*rdi + 96], ymm3
+	add	rdi, 16
 	cmp	rsi, rdi
-	jne	.LBB0_468
-# %bb.469:
+	jne	.LBB0_683
+# %bb.684:
 	cmp	rsi, r10
-	je	.LBB0_537
-.LBB0_470:
-	mov	r9, rsi
-	not	r9
-	add	r9, r10
-	mov	rax, r10
-	and	rax, 3
-	je	.LBB0_472
-.LBB0_471:                              # =>This Inner Loop Header: Depth=1
-	movzx	edi, word ptr [rdx + 2*rsi]
-	sub	di, word ptr [rcx + 2*rsi]
-	mov	word ptr [r8 + 2*rsi], di
-	add	rsi, 1
-	add	rax, -1
-	jne	.LBB0_471
-.LBB0_472:
-	cmp	r9, 3
-	jb	.LBB0_537
-.LBB0_473:                              # =>This Inner Loop Header: Depth=1
-	movzx	eax, word ptr [rdx + 2*rsi]
-	sub	ax, word ptr [rcx + 2*rsi]
-	mov	word ptr [r8 + 2*rsi], ax
-	movzx	eax, word ptr [rdx + 2*rsi + 2]
-	sub	ax, word ptr [rcx + 2*rsi + 2]
-	mov	word ptr [r8 + 2*rsi + 2], ax
-	movzx	eax, word ptr [rdx + 2*rsi + 4]
-	sub	ax, word ptr [rcx + 2*rsi + 4]
-	mov	word ptr [r8 + 2*rsi + 4], ax
-	movzx	eax, word ptr [rdx + 2*rsi + 6]
-	sub	ax, word ptr [rcx + 2*rsi + 6]
-	mov	word ptr [r8 + 2*rsi + 6], ax
-	add	rsi, 4
-	cmp	r10, rsi
-	jne	.LBB0_473
-	jmp	.LBB0_537
-.LBB0_68:
-	lea	rsi, [r8 + 2*r10]
-	lea	rax, [rdx + 2*r10]
+	jne	.LBB0_685
+	jmp	.LBB0_825
+.LBB0_816:
+	lea	rsi, [r8 + 8*r10]
+	lea	rax, [rdx + 8*r10]
 	cmp	rax, r8
 	seta	r9b
-	lea	rax, [rcx + 2*r10]
+	lea	rax, [rcx + 8*r10]
 	cmp	rsi, rdx
 	seta	r11b
 	cmp	rax, r8
@@ -2093,73 +4491,82 @@ arithmetic_avx2:                        # @arithmetic_avx2
 	seta	dil
 	xor	esi, esi
 	test	r9b, r11b
-	jne	.LBB0_73
-# %bb.69:
+	jne	.LBB0_821
+# %bb.817:
 	and	al, dil
-	jne	.LBB0_73
-# %bb.70:
+	jne	.LBB0_821
+# %bb.818:
 	mov	esi, r10d
-	and	esi, -64
+	and	esi, -16
 	xor	edi, edi
-.LBB0_71:                               # =>This Inner Loop Header: Depth=1
-	vmovdqu	ymm0, ymmword ptr [rcx + 2*rdi]
-	vmovdqu	ymm1, ymmword ptr [rcx + 2*rdi + 32]
-	vmovdqu	ymm2, ymmword ptr [rcx + 2*rdi + 64]
-	vmovdqu	ymm3, ymmword ptr [rcx + 2*rdi + 96]
-	vpaddw	ymm0, ymm0, ymmword ptr [rdx + 2*rdi]
-	vpaddw	ymm1, ymm1, ymmword ptr [rdx + 2*rdi + 32]
-	vpaddw	ymm2, ymm2, ymmword ptr [rdx + 2*rdi + 64]
-	vpaddw	ymm3, ymm3, ymmword ptr [rdx + 2*rdi + 96]
-	vmovdqu	ymmword ptr [r8 + 2*rdi], ymm0
-	vmovdqu	ymmword ptr [r8 + 2*rdi + 32], ymm1
-	vmovdqu	ymmword ptr [r8 + 2*rdi + 64], ymm2
-	vmovdqu	ymmword ptr [r8 + 2*rdi + 96], ymm3
-	add	rdi, 64
+.LBB0_819:                              # =>This Inner Loop Header: Depth=1
+	vmovupd	ymm0, ymmword ptr [rcx + 8*rdi]
+	vmovupd	ymm1, ymmword ptr [rcx + 8*rdi + 32]
+	vmovupd	ymm2, ymmword ptr [rcx + 8*rdi + 64]
+	vmovupd	ymm3, ymmword ptr [rcx + 8*rdi + 96]
+	vmulpd	ymm0, ymm0, ymmword ptr [rdx + 8*rdi]
+	vmulpd	ymm1, ymm1, ymmword ptr [rdx + 8*rdi + 32]
+	vmulpd	ymm2, ymm2, ymmword ptr [rdx + 8*rdi + 64]
+	vmulpd	ymm3, ymm3, ymmword ptr [rdx + 8*rdi + 96]
+	vmovupd	ymmword ptr [r8 + 8*rdi], ymm0
+	vmovupd	ymmword ptr [r8 + 8*rdi + 32], ymm1
+	vmovupd	ymmword ptr [r8 + 8*rdi + 64], ymm2
+	vmovupd	ymmword ptr [r8 + 8*rdi + 96], ymm3
+	add	rdi, 16
+	cmp	rsi, rdi
+	jne	.LBB0_819
+# %bb.820:
+	cmp	rsi, r10
+	jne	.LBB0_821
+	jmp	.LBB0_825
+.LBB0_411:
+	lea	rsi, [r8 + 8*r10]
+	lea	rax, [rdx + 8*r10]
+	cmp	rax, r8
+	seta	r9b
+	lea	rax, [rcx + 8*r10]
+	cmp	rsi, rdx
+	seta	r11b
+	cmp	rax, r8
+	seta	al
+	cmp	rsi, rcx
+	seta	dil
+	xor	esi, esi
+	test	r9b, r11b
+	jne	.LBB0_416
+# %bb.412:
+	and	al, dil
+	jne	.LBB0_416
+# %bb.413:
+	mov	esi, r10d
+	and	esi, -16
+	xor	edi, edi
+.LBB0_414:                              # =>This Inner Loop Header: Depth=1
+	vmovupd	ymm0, ymmword ptr [rdx + 8*rdi]
+	vmovupd	ymm1, ymmword ptr [rdx + 8*rdi + 32]
+	vmovupd	ymm2, ymmword ptr [rdx + 8*rdi + 64]
+	vmovupd	ymm3, ymmword ptr [rdx + 8*rdi + 96]
+	vsubpd	ymm0, ymm0, ymmword ptr [rcx + 8*rdi]
+	vsubpd	ymm1, ymm1, ymmword ptr [rcx + 8*rdi + 32]
+	vsubpd	ymm2, ymm2, ymmword ptr [rcx + 8*rdi + 64]
+	vsubpd	ymm3, ymm3, ymmword ptr [rcx + 8*rdi + 96]
+	vmovupd	ymmword ptr [r8 + 8*rdi], ymm0
+	vmovupd	ymmword ptr [r8 + 8*rdi + 32], ymm1
+	vmovupd	ymmword ptr [r8 + 8*rdi + 64], ymm2
+	vmovupd	ymmword ptr [r8 + 8*rdi + 96], ymm3
+	add	rdi, 16
 	cmp	rsi, rdi
-	jne	.LBB0_71
-# %bb.72:
+	jne	.LBB0_414
+# %bb.415:
 	cmp	rsi, r10
-	je	.LBB0_537
-.LBB0_73:
-	mov	r9, rsi
-	not	r9
-	add	r9, r10
-	mov	rax, r10
-	and	rax, 3
-	je	.LBB0_75
-.LBB0_74:                               # =>This Inner Loop Header: Depth=1
-	movzx	edi, word ptr [rcx + 2*rsi]
-	add	di, word ptr [rdx + 2*rsi]
-	mov	word ptr [r8 + 2*rsi], di
-	add	rsi, 1
-	add	rax, -1
-	jne	.LBB0_74
-.LBB0_75:
-	cmp	r9, 3
-	jb	.LBB0_537
-.LBB0_76:                               # =>This Inner Loop Header: Depth=1
-	movzx	eax, word ptr [rcx + 2*rsi]
-	add	ax, word ptr [rdx + 2*rsi]
-	mov	word ptr [r8 + 2*rsi], ax
-	movzx	eax, word ptr [rcx + 2*rsi + 2]
-	add	ax, word ptr [rdx + 2*rsi + 2]
-	mov	word ptr [r8 + 2*rsi + 2], ax
-	movzx	eax, word ptr [rcx + 2*rsi + 4]
-	add	ax, word ptr [rdx + 2*rsi + 4]
-	mov	word ptr [r8 + 2*rsi + 4], ax
-	movzx	eax, word ptr [rcx + 2*rsi + 6]
-	add	ax, word ptr [rdx + 2*rsi + 6]
-	mov	word ptr [r8 + 2*rsi + 6], ax
-	add	rsi, 4
-	cmp	r10, rsi
-	jne	.LBB0_76
-	jmp	.LBB0_537
-.LBB0_80:
-	lea	rsi, [r8 + 2*r10]
-	lea	rax, [rdx + 2*r10]
+	jne	.LBB0_416
+	jmp	.LBB0_825
+.LBB0_544:
+	lea	rsi, [r8 + 8*r10]
+	lea	rax, [rdx + 8*r10]
 	cmp	rax, r8
 	seta	r9b
-	lea	rax, [rcx + 2*r10]
+	lea	rax, [rcx + 8*r10]
 	cmp	rsi, rdx
 	seta	r11b
 	cmp	rax, r8
@@ -2168,68 +4575,35 @@ arithmetic_avx2:                        # @arithmetic_avx2
 	seta	dil
 	xor	esi, esi
 	test	r9b, r11b
-	jne	.LBB0_85
-# %bb.81:
+	jne	.LBB0_549
+# %bb.545:
 	and	al, dil
-	jne	.LBB0_85
-# %bb.82:
+	jne	.LBB0_549
+# %bb.546:
 	mov	esi, r10d
-	and	esi, -64
+	and	esi, -16
 	xor	edi, edi
-.LBB0_83:                               # =>This Inner Loop Header: Depth=1
-	vmovdqu	ymm0, ymmword ptr [rcx + 2*rdi]
-	vmovdqu	ymm1, ymmword ptr [rcx + 2*rdi + 32]
-	vmovdqu	ymm2, ymmword ptr [rcx + 2*rdi + 64]
-	vmovdqu	ymm3, ymmword ptr [rcx + 2*rdi + 96]
-	vpaddw	ymm0, ymm0, ymmword ptr [rdx + 2*rdi]
-	vpaddw	ymm1, ymm1, ymmword ptr [rdx + 2*rdi + 32]
-	vpaddw	ymm2, ymm2, ymmword ptr [rdx + 2*rdi + 64]
-	vpaddw	ymm3, ymm3, ymmword ptr [rdx + 2*rdi + 96]
-	vmovdqu	ymmword ptr [r8 + 2*rdi], ymm0
-	vmovdqu	ymmword ptr [r8 + 2*rdi + 32], ymm1
-	vmovdqu	ymmword ptr [r8 + 2*rdi + 64], ymm2
-	vmovdqu	ymmword ptr [r8 + 2*rdi + 96], ymm3
-	add	rdi, 64
+.LBB0_547:                              # =>This Inner Loop Header: Depth=1
+	vmovupd	ymm0, ymmword ptr [rdx + 8*rdi]
+	vmovupd	ymm1, ymmword ptr [rdx + 8*rdi + 32]
+	vmovupd	ymm2, ymmword ptr [rdx + 8*rdi + 64]
+	vmovupd	ymm3, ymmword ptr [rdx + 8*rdi + 96]
+	vsubpd	ymm0, ymm0, ymmword ptr [rcx + 8*rdi]
+	vsubpd	ymm1, ymm1, ymmword ptr [rcx + 8*rdi + 32]
+	vsubpd	ymm2, ymm2, ymmword ptr [rcx + 8*rdi + 64]
+	vsubpd	ymm3, ymm3, ymmword ptr [rcx + 8*rdi + 96]
+	vmovupd	ymmword ptr [r8 + 8*rdi], ymm0
+	vmovupd	ymmword ptr [r8 + 8*rdi + 32], ymm1
+	vmovupd	ymmword ptr [r8 + 8*rdi + 64], ymm2
+	vmovupd	ymmword ptr [r8 + 8*rdi + 96], ymm3
+	add	rdi, 16
 	cmp	rsi, rdi
-	jne	.LBB0_83
-# %bb.84:
+	jne	.LBB0_547
+# %bb.548:
 	cmp	rsi, r10
-	je	.LBB0_537
-.LBB0_85:
-	mov	r9, rsi
-	not	r9
-	add	r9, r10
-	mov	rax, r10
-	and	rax, 3
-	je	.LBB0_87
-.LBB0_86:                               # =>This Inner Loop Header: Depth=1
-	movzx	edi, word ptr [rcx + 2*rsi]
-	add	di, word ptr [rdx + 2*rsi]
-	mov	word ptr [r8 + 2*rsi], di
-	add	rsi, 1
-	add	rax, -1
-	jne	.LBB0_86
-.LBB0_87:
-	cmp	r9, 3
-	jb	.LBB0_537
-.LBB0_88:                               # =>This Inner Loop Header: Depth=1
-	movzx	eax, word ptr [rcx + 2*rsi]
-	add	ax, word ptr [rdx + 2*rsi]
-	mov	word ptr [r8 + 2*rsi], ax
-	movzx	eax, word ptr [rcx + 2*rsi + 2]
-	add	ax, word ptr [rdx + 2*rsi + 2]
-	mov	word ptr [r8 + 2*rsi + 2], ax
-	movzx	eax, word ptr [rcx + 2*rsi + 4]
-	add	ax, word ptr [rdx + 2*rsi + 4]
-	mov	word ptr [r8 + 2*rsi + 4], ax
-	movzx	eax, word ptr [rcx + 2*rsi + 6]
-	add	ax, word ptr [rdx + 2*rsi + 6]
-	mov	word ptr [r8 + 2*rsi + 6], ax
-	add	rsi, 4
-	cmp	r10, rsi
-	jne	.LBB0_88
-	jmp	.LBB0_537
-.LBB0_201:
+	jne	.LBB0_549
+	jmp	.LBB0_825
+.LBB0_605:
 	lea	rsi, [r8 + 2*r10]
 	lea	rax, [rdx + 2*r10]
 	cmp	rax, r8
@@ -2243,68 +4617,35 @@ arithmetic_avx2:                        # @arithmetic_avx2
 	seta	dil
 	xor	esi, esi
 	test	r9b, r11b
-	jne	.LBB0_206
-# %bb.202:
+	jne	.LBB0_610
+# %bb.606:
 	and	al, dil
-	jne	.LBB0_206
-# %bb.203:
+	jne	.LBB0_610
+# %bb.607:
 	mov	esi, r10d
 	and	esi, -64
 	xor	edi, edi
-.LBB0_204:                              # =>This Inner Loop Header: Depth=1
+.LBB0_608:                              # =>This Inner Loop Header: Depth=1
 	vmovdqu	ymm0, ymmword ptr [rcx + 2*rdi]
 	vmovdqu	ymm1, ymmword ptr [rcx + 2*rdi + 32]
 	vmovdqu	ymm2, ymmword ptr [rcx + 2*rdi + 64]
 	vmovdqu	ymm3, ymmword ptr [rcx + 2*rdi + 96]
-	vpaddw	ymm0, ymm0, ymmword ptr [rdx + 2*rdi]
-	vpaddw	ymm1, ymm1, ymmword ptr [rdx + 2*rdi + 32]
-	vpaddw	ymm2, ymm2, ymmword ptr [rdx + 2*rdi + 64]
-	vpaddw	ymm3, ymm3, ymmword ptr [rdx + 2*rdi + 96]
+	vpmullw	ymm0, ymm0, ymmword ptr [rdx + 2*rdi]
+	vpmullw	ymm1, ymm1, ymmword ptr [rdx + 2*rdi + 32]
+	vpmullw	ymm2, ymm2, ymmword ptr [rdx + 2*rdi + 64]
+	vpmullw	ymm3, ymm3, ymmword ptr [rdx + 2*rdi + 96]
 	vmovdqu	ymmword ptr [r8 + 2*rdi], ymm0
 	vmovdqu	ymmword ptr [r8 + 2*rdi + 32], ymm1
 	vmovdqu	ymmword ptr [r8 + 2*rdi + 64], ymm2
 	vmovdqu	ymmword ptr [r8 + 2*rdi + 96], ymm3
 	add	rdi, 64
 	cmp	rsi, rdi
-	jne	.LBB0_204
-# %bb.205:
+	jne	.LBB0_608
+# %bb.609:
 	cmp	rsi, r10
-	je	.LBB0_537
-.LBB0_206:
-	mov	r9, rsi
-	not	r9
-	add	r9, r10
-	mov	rax, r10
-	and	rax, 3
-	je	.LBB0_208
-.LBB0_207:                              # =>This Inner Loop Header: Depth=1
-	movzx	edi, word ptr [rcx + 2*rsi]
-	add	di, word ptr [rdx + 2*rsi]
-	mov	word ptr [r8 + 2*rsi], di
-	add	rsi, 1
-	add	rax, -1
-	jne	.LBB0_207
-.LBB0_208:
-	cmp	r9, 3
-	jb	.LBB0_537
-.LBB0_209:                              # =>This Inner Loop Header: Depth=1
-	movzx	eax, word ptr [rcx + 2*rsi]
-	add	ax, word ptr [rdx + 2*rsi]
-	mov	word ptr [r8 + 2*rsi], ax
-	movzx	eax, word ptr [rcx + 2*rsi + 2]
-	add	ax, word ptr [rdx + 2*rsi + 2]
-	mov	word ptr [r8 + 2*rsi + 2], ax
-	movzx	eax, word ptr [rcx + 2*rsi + 4]
-	add	ax, word ptr [rdx + 2*rsi + 4]
-	mov	word ptr [r8 + 2*rsi + 4], ax
-	movzx	eax, word ptr [rcx + 2*rsi + 6]
-	add	ax, word ptr [rdx + 2*rsi + 6]
-	mov	word ptr [r8 + 2*rsi + 6], ax
-	add	rsi, 4
-	cmp	r10, rsi
-	jne	.LBB0_209
-	jmp	.LBB0_537
-.LBB0_213:
+	jne	.LBB0_610
+	jmp	.LBB0_825
+.LBB0_617:
 	lea	rsi, [r8 + 2*r10]
 	lea	rax, [rdx + 2*r10]
 	cmp	rax, r8
@@ -2318,148 +4659,82 @@ arithmetic_avx2:                        # @arithmetic_avx2
 	seta	dil
 	xor	esi, esi
 	test	r9b, r11b
-	jne	.LBB0_218
-# %bb.214:
+	jne	.LBB0_622
+# %bb.618:
 	and	al, dil
-	jne	.LBB0_218
-# %bb.215:
+	jne	.LBB0_622
+# %bb.619:
 	mov	esi, r10d
 	and	esi, -64
 	xor	edi, edi
-.LBB0_216:                              # =>This Inner Loop Header: Depth=1
+.LBB0_620:                              # =>This Inner Loop Header: Depth=1
 	vmovdqu	ymm0, ymmword ptr [rcx + 2*rdi]
 	vmovdqu	ymm1, ymmword ptr [rcx + 2*rdi + 32]
 	vmovdqu	ymm2, ymmword ptr [rcx + 2*rdi + 64]
 	vmovdqu	ymm3, ymmword ptr [rcx + 2*rdi + 96]
-	vpaddw	ymm0, ymm0, ymmword ptr [rdx + 2*rdi]
-	vpaddw	ymm1, ymm1, ymmword ptr [rdx + 2*rdi + 32]
-	vpaddw	ymm2, ymm2, ymmword ptr [rdx + 2*rdi + 64]
-	vpaddw	ymm3, ymm3, ymmword ptr [rdx + 2*rdi + 96]
+	vpmullw	ymm0, ymm0, ymmword ptr [rdx + 2*rdi]
+	vpmullw	ymm1, ymm1, ymmword ptr [rdx + 2*rdi + 32]
+	vpmullw	ymm2, ymm2, ymmword ptr [rdx + 2*rdi + 64]
+	vpmullw	ymm3, ymm3, ymmword ptr [rdx + 2*rdi + 96]
 	vmovdqu	ymmword ptr [r8 + 2*rdi], ymm0
 	vmovdqu	ymmword ptr [r8 + 2*rdi + 32], ymm1
 	vmovdqu	ymmword ptr [r8 + 2*rdi + 64], ymm2
 	vmovdqu	ymmword ptr [r8 + 2*rdi + 96], ymm3
 	add	rdi, 64
 	cmp	rsi, rdi
-	jne	.LBB0_216
-# %bb.217:
+	jne	.LBB0_620
+# %bb.621:
 	cmp	rsi, r10
-	je	.LBB0_537
-.LBB0_218:
-	mov	r9, rsi
-	not	r9
-	add	r9, r10
-	mov	rax, r10
-	and	rax, 3
-	je	.LBB0_220
-.LBB0_219:                              # =>This Inner Loop Header: Depth=1
-	movzx	edi, word ptr [rcx + 2*rsi]
-	add	di, word ptr [rdx + 2*rsi]
-	mov	word ptr [r8 + 2*rsi], di
-	add	rsi, 1
-	add	rax, -1
-	jne	.LBB0_219
-.LBB0_220:
-	cmp	r9, 3
-	jb	.LBB0_537
-.LBB0_221:                              # =>This Inner Loop Header: Depth=1
-	movzx	eax, word ptr [rcx + 2*rsi]
-	add	ax, word ptr [rdx + 2*rsi]
-	mov	word ptr [r8 + 2*rsi], ax
-	movzx	eax, word ptr [rcx + 2*rsi + 2]
-	add	ax, word ptr [rdx + 2*rsi + 2]
-	mov	word ptr [r8 + 2*rsi + 2], ax
-	movzx	eax, word ptr [rcx + 2*rsi + 4]
-	add	ax, word ptr [rdx + 2*rsi + 4]
-	mov	word ptr [r8 + 2*rsi + 4], ax
-	movzx	eax, word ptr [rcx + 2*rsi + 6]
-	add	ax, word ptr [rdx + 2*rsi + 6]
-	mov	word ptr [r8 + 2*rsi + 6], ax
-	add	rsi, 4
-	cmp	r10, rsi
-	jne	.LBB0_221
-	jmp	.LBB0_537
-.LBB0_381:
-	lea	rsi, [r8 + 8*r10]
-	lea	rax, [rdx + 8*r10]
-	cmp	rax, r8
-	seta	r9b
-	lea	rax, [rcx + 8*r10]
-	cmp	rsi, rdx
-	seta	r11b
+	jne	.LBB0_622
+	jmp	.LBB0_825
+.LBB0_741:
+	lea	rsi, [r8 + 2*r10]
+	lea	rax, [rdx + 2*r10]
 	cmp	rax, r8
-	seta	al
-	cmp	rsi, rcx
-	seta	dil
-	xor	esi, esi
-	test	r9b, r11b
-	jne	.LBB0_386
-# %bb.382:
-	and	al, dil
-	jne	.LBB0_386
-# %bb.383:
-	mov	esi, r10d
-	and	esi, -16
-	xor	edi, edi
-.LBB0_384:                              # =>This Inner Loop Header: Depth=1
-	vmovdqu	ymm0, ymmword ptr [rdx + 8*rdi]
-	vmovdqu	ymm1, ymmword ptr [rdx + 8*rdi + 32]
-	vmovdqu	ymm2, ymmword ptr [rdx + 8*rdi + 64]
-	vmovdqu	ymm3, ymmword ptr [rdx + 8*rdi + 96]
-	vpsubq	ymm0, ymm0, ymmword ptr [rcx + 8*rdi]
-	vpsubq	ymm1, ymm1, ymmword ptr [rcx + 8*rdi + 32]
-	vpsubq	ymm2, ymm2, ymmword ptr [rcx + 8*rdi + 64]
-	vpsubq	ymm3, ymm3, ymmword ptr [rcx + 8*rdi + 96]
-	vmovdqu	ymmword ptr [r8 + 8*rdi], ymm0
-	vmovdqu	ymmword ptr [r8 + 8*rdi + 32], ymm1
-	vmovdqu	ymmword ptr [r8 + 8*rdi + 64], ymm2
-	vmovdqu	ymmword ptr [r8 + 8*rdi + 96], ymm3
-	add	rdi, 16
-	cmp	rsi, rdi
-	jne	.LBB0_384
-# %bb.385:
-	cmp	rsi, r10
-	je	.LBB0_537
-.LBB0_386:
-	mov	r9, rsi
-	not	r9
-	add	r9, r10
-	mov	rax, r10
-	and	rax, 3
-	je	.LBB0_388
-.LBB0_387:                              # =>This Inner Loop Header: Depth=1
-	mov	rdi, qword ptr [rdx + 8*rsi]
-	sub	rdi, qword ptr [rcx + 8*rsi]
-	mov	qword ptr [r8 + 8*rsi], rdi
-	add	rsi, 1
-	add	rax, -1
-	jne	.LBB0_387
-.LBB0_388:
-	cmp	r9, 3
-	jb	.LBB0_537
-.LBB0_389:                              # =>This Inner Loop Header: Depth=1
-	mov	rax, qword ptr [rdx + 8*rsi]
-	sub	rax, qword ptr [rcx + 8*rsi]
-	mov	qword ptr [r8 + 8*rsi], rax
-	mov	rax, qword ptr [rdx + 8*rsi + 8]
-	sub	rax, qword ptr [rcx + 8*rsi + 8]
-	mov	qword ptr [r8 + 8*rsi + 8], rax
-	mov	rax, qword ptr [rdx + 8*rsi + 16]
-	sub	rax, qword ptr [rcx + 8*rsi + 16]
-	mov	qword ptr [r8 + 8*rsi + 16], rax
-	mov	rax, qword ptr [rdx + 8*rsi + 24]
-	sub	rax, qword ptr [rcx + 8*rsi + 24]
-	mov	qword ptr [r8 + 8*rsi + 24], rax
-	add	rsi, 4
-	cmp	r10, rsi
-	jne	.LBB0_389
-	jmp	.LBB0_537
-.LBB0_393:
-	lea	rsi, [r8 + 4*r10]
-	lea	rax, [rdx + 4*r10]
+	seta	r9b
+	lea	rax, [rcx + 2*r10]
+	cmp	rsi, rdx
+	seta	r11b
+	cmp	rax, r8
+	seta	al
+	cmp	rsi, rcx
+	seta	dil
+	xor	esi, esi
+	test	r9b, r11b
+	jne	.LBB0_746
+# %bb.742:
+	and	al, dil
+	jne	.LBB0_746
+# %bb.743:
+	mov	esi, r10d
+	and	esi, -64
+	xor	edi, edi
+.LBB0_744:                              # =>This Inner Loop Header: Depth=1
+	vmovdqu	ymm0, ymmword ptr [rcx + 2*rdi]
+	vmovdqu	ymm1, ymmword ptr [rcx + 2*rdi + 32]
+	vmovdqu	ymm2, ymmword ptr [rcx + 2*rdi + 64]
+	vmovdqu	ymm3, ymmword ptr [rcx + 2*rdi + 96]
+	vpmullw	ymm0, ymm0, ymmword ptr [rdx + 2*rdi]
+	vpmullw	ymm1, ymm1, ymmword ptr [rdx + 2*rdi + 32]
+	vpmullw	ymm2, ymm2, ymmword ptr [rdx + 2*rdi + 64]
+	vpmullw	ymm3, ymm3, ymmword ptr [rdx + 2*rdi + 96]
+	vmovdqu	ymmword ptr [r8 + 2*rdi], ymm0
+	vmovdqu	ymmword ptr [r8 + 2*rdi + 32], ymm1
+	vmovdqu	ymmword ptr [r8 + 2*rdi + 64], ymm2
+	vmovdqu	ymmword ptr [r8 + 2*rdi + 96], ymm3
+	add	rdi, 64
+	cmp	rsi, rdi
+	jne	.LBB0_744
+# %bb.745:
+	cmp	rsi, r10
+	jne	.LBB0_746
+	jmp	.LBB0_825
+.LBB0_753:
+	lea	rsi, [r8 + 2*r10]
+	lea	rax, [rdx + 2*r10]
 	cmp	rax, r8
 	seta	r9b
-	lea	rax, [rcx + 4*r10]
+	lea	rax, [rcx + 2*r10]
 	cmp	rsi, rdx
 	seta	r11b
 	cmp	rax, r8
@@ -2468,73 +4743,40 @@ arithmetic_avx2:                        # @arithmetic_avx2
 	seta	dil
 	xor	esi, esi
 	test	r9b, r11b
-	jne	.LBB0_398
-# %bb.394:
+	jne	.LBB0_758
+# %bb.754:
 	and	al, dil
-	jne	.LBB0_398
-# %bb.395:
+	jne	.LBB0_758
+# %bb.755:
 	mov	esi, r10d
-	and	esi, -32
+	and	esi, -64
 	xor	edi, edi
-.LBB0_396:                              # =>This Inner Loop Header: Depth=1
-	vmovups	ymm0, ymmword ptr [rdx + 4*rdi]
-	vmovups	ymm1, ymmword ptr [rdx + 4*rdi + 32]
-	vmovups	ymm2, ymmword ptr [rdx + 4*rdi + 64]
-	vmovups	ymm3, ymmword ptr [rdx + 4*rdi + 96]
-	vsubps	ymm0, ymm0, ymmword ptr [rcx + 4*rdi]
-	vsubps	ymm1, ymm1, ymmword ptr [rcx + 4*rdi + 32]
-	vsubps	ymm2, ymm2, ymmword ptr [rcx + 4*rdi + 64]
-	vsubps	ymm3, ymm3, ymmword ptr [rcx + 4*rdi + 96]
-	vmovups	ymmword ptr [r8 + 4*rdi], ymm0
-	vmovups	ymmword ptr [r8 + 4*rdi + 32], ymm1
-	vmovups	ymmword ptr [r8 + 4*rdi + 64], ymm2
-	vmovups	ymmword ptr [r8 + 4*rdi + 96], ymm3
-	add	rdi, 32
+.LBB0_756:                              # =>This Inner Loop Header: Depth=1
+	vmovdqu	ymm0, ymmword ptr [rcx + 2*rdi]
+	vmovdqu	ymm1, ymmword ptr [rcx + 2*rdi + 32]
+	vmovdqu	ymm2, ymmword ptr [rcx + 2*rdi + 64]
+	vmovdqu	ymm3, ymmword ptr [rcx + 2*rdi + 96]
+	vpmullw	ymm0, ymm0, ymmword ptr [rdx + 2*rdi]
+	vpmullw	ymm1, ymm1, ymmword ptr [rdx + 2*rdi + 32]
+	vpmullw	ymm2, ymm2, ymmword ptr [rdx + 2*rdi + 64]
+	vpmullw	ymm3, ymm3, ymmword ptr [rdx + 2*rdi + 96]
+	vmovdqu	ymmword ptr [r8 + 2*rdi], ymm0
+	vmovdqu	ymmword ptr [r8 + 2*rdi + 32], ymm1
+	vmovdqu	ymmword ptr [r8 + 2*rdi + 64], ymm2
+	vmovdqu	ymmword ptr [r8 + 2*rdi + 96], ymm3
+	add	rdi, 64
 	cmp	rsi, rdi
-	jne	.LBB0_396
-# %bb.397:
+	jne	.LBB0_756
+# %bb.757:
 	cmp	rsi, r10
-	je	.LBB0_537
-.LBB0_398:
-	mov	rdi, rsi
-	not	rdi
-	add	rdi, r10
-	mov	rax, r10
-	and	rax, 3
-	je	.LBB0_400
-.LBB0_399:                              # =>This Inner Loop Header: Depth=1
-	vmovss	xmm0, dword ptr [rdx + 4*rsi]   # xmm0 = mem[0],zero,zero,zero
-	vsubss	xmm0, xmm0, dword ptr [rcx + 4*rsi]
-	vmovss	dword ptr [r8 + 4*rsi], xmm0
-	add	rsi, 1
-	add	rax, -1
-	jne	.LBB0_399
-.LBB0_400:
-	cmp	rdi, 3
-	jb	.LBB0_537
-.LBB0_401:                              # =>This Inner Loop Header: Depth=1
-	vmovss	xmm0, dword ptr [rdx + 4*rsi]   # xmm0 = mem[0],zero,zero,zero
-	vsubss	xmm0, xmm0, dword ptr [rcx + 4*rsi]
-	vmovss	dword ptr [r8 + 4*rsi], xmm0
-	vmovss	xmm0, dword ptr [rdx + 4*rsi + 4] # xmm0 = mem[0],zero,zero,zero
-	vsubss	xmm0, xmm0, dword ptr [rcx + 4*rsi + 4]
-	vmovss	dword ptr [r8 + 4*rsi + 4], xmm0
-	vmovss	xmm0, dword ptr [rdx + 4*rsi + 8] # xmm0 = mem[0],zero,zero,zero
-	vsubss	xmm0, xmm0, dword ptr [rcx + 4*rsi + 8]
-	vmovss	dword ptr [r8 + 4*rsi + 8], xmm0
-	vmovss	xmm0, dword ptr [rdx + 4*rsi + 12] # xmm0 = mem[0],zero,zero,zero
-	vsubss	xmm0, xmm0, dword ptr [rcx + 4*rsi + 12]
-	vmovss	dword ptr [r8 + 4*rsi + 12], xmm0
-	add	rsi, 4
-	cmp	r10, rsi
-	jne	.LBB0_401
-	jmp	.LBB0_537
-.LBB0_507:
-	lea	rsi, [r8 + 8*r10]
-	lea	rax, [rdx + 8*r10]
+	jne	.LBB0_758
+	jmp	.LBB0_825
+.LBB0_336:
+	lea	rsi, [r8 + 2*r10]
+	lea	rax, [rdx + 2*r10]
 	cmp	rax, r8
 	seta	r9b
-	lea	rax, [rcx + 8*r10]
+	lea	rax, [rcx + 2*r10]
 	cmp	rsi, rdx
 	seta	r11b
 	cmp	rax, r8
@@ -2543,73 +4785,40 @@ arithmetic_avx2:                        # @arithmetic_avx2
 	seta	dil
 	xor	esi, esi
 	test	r9b, r11b
-	jne	.LBB0_512
-# %bb.508:
+	jne	.LBB0_341
+# %bb.337:
 	and	al, dil
-	jne	.LBB0_512
-# %bb.509:
+	jne	.LBB0_341
+# %bb.338:
 	mov	esi, r10d
-	and	esi, -16
+	and	esi, -64
 	xor	edi, edi
-.LBB0_510:                              # =>This Inner Loop Header: Depth=1
-	vmovdqu	ymm0, ymmword ptr [rdx + 8*rdi]
-	vmovdqu	ymm1, ymmword ptr [rdx + 8*rdi + 32]
-	vmovdqu	ymm2, ymmword ptr [rdx + 8*rdi + 64]
-	vmovdqu	ymm3, ymmword ptr [rdx + 8*rdi + 96]
-	vpsubq	ymm0, ymm0, ymmword ptr [rcx + 8*rdi]
-	vpsubq	ymm1, ymm1, ymmword ptr [rcx + 8*rdi + 32]
-	vpsubq	ymm2, ymm2, ymmword ptr [rcx + 8*rdi + 64]
-	vpsubq	ymm3, ymm3, ymmword ptr [rcx + 8*rdi + 96]
-	vmovdqu	ymmword ptr [r8 + 8*rdi], ymm0
-	vmovdqu	ymmword ptr [r8 + 8*rdi + 32], ymm1
-	vmovdqu	ymmword ptr [r8 + 8*rdi + 64], ymm2
-	vmovdqu	ymmword ptr [r8 + 8*rdi + 96], ymm3
-	add	rdi, 16
+.LBB0_339:                              # =>This Inner Loop Header: Depth=1
+	vmovdqu	ymm0, ymmword ptr [rdx + 2*rdi]
+	vmovdqu	ymm1, ymmword ptr [rdx + 2*rdi + 32]
+	vmovdqu	ymm2, ymmword ptr [rdx + 2*rdi + 64]
+	vmovdqu	ymm3, ymmword ptr [rdx + 2*rdi + 96]
+	vpsubw	ymm0, ymm0, ymmword ptr [rcx + 2*rdi]
+	vpsubw	ymm1, ymm1, ymmword ptr [rcx + 2*rdi + 32]
+	vpsubw	ymm2, ymm2, ymmword ptr [rcx + 2*rdi + 64]
+	vpsubw	ymm3, ymm3, ymmword ptr [rcx + 2*rdi + 96]
+	vmovdqu	ymmword ptr [r8 + 2*rdi], ymm0
+	vmovdqu	ymmword ptr [r8 + 2*rdi + 32], ymm1
+	vmovdqu	ymmword ptr [r8 + 2*rdi + 64], ymm2
+	vmovdqu	ymmword ptr [r8 + 2*rdi + 96], ymm3
+	add	rdi, 64
 	cmp	rsi, rdi
-	jne	.LBB0_510
-# %bb.511:
+	jne	.LBB0_339
+# %bb.340:
 	cmp	rsi, r10
-	je	.LBB0_537
-.LBB0_512:
-	mov	r9, rsi
-	not	r9
-	add	r9, r10
-	mov	rax, r10
-	and	rax, 3
-	je	.LBB0_514
-.LBB0_513:                              # =>This Inner Loop Header: Depth=1
-	mov	rdi, qword ptr [rdx + 8*rsi]
-	sub	rdi, qword ptr [rcx + 8*rsi]
-	mov	qword ptr [r8 + 8*rsi], rdi
-	add	rsi, 1
-	add	rax, -1
-	jne	.LBB0_513
-.LBB0_514:
-	cmp	r9, 3
-	jb	.LBB0_537
-.LBB0_515:                              # =>This Inner Loop Header: Depth=1
-	mov	rax, qword ptr [rdx + 8*rsi]
-	sub	rax, qword ptr [rcx + 8*rsi]
-	mov	qword ptr [r8 + 8*rsi], rax
-	mov	rax, qword ptr [rdx + 8*rsi + 8]
-	sub	rax, qword ptr [rcx + 8*rsi + 8]
-	mov	qword ptr [r8 + 8*rsi + 8], rax
-	mov	rax, qword ptr [rdx + 8*rsi + 16]
-	sub	rax, qword ptr [rcx + 8*rsi + 16]
-	mov	qword ptr [r8 + 8*rsi + 16], rax
-	mov	rax, qword ptr [rdx + 8*rsi + 24]
-	sub	rax, qword ptr [rcx + 8*rsi + 24]
-	mov	qword ptr [r8 + 8*rsi + 24], rax
-	add	rsi, 4
-	cmp	r10, rsi
-	jne	.LBB0_515
-	jmp	.LBB0_537
-.LBB0_519:
-	lea	rsi, [r8 + 4*r10]
-	lea	rax, [rdx + 4*r10]
+	jne	.LBB0_341
+	jmp	.LBB0_825
+.LBB0_348:
+	lea	rsi, [r8 + 2*r10]
+	lea	rax, [rdx + 2*r10]
 	cmp	rax, r8
 	seta	r9b
-	lea	rax, [rcx + 4*r10]
+	lea	rax, [rcx + 2*r10]
 	cmp	rsi, rdx
 	seta	r11b
 	cmp	rax, r8
@@ -2618,73 +4827,40 @@ arithmetic_avx2:                        # @arithmetic_avx2
 	seta	dil
 	xor	esi, esi
 	test	r9b, r11b
-	jne	.LBB0_524
-# %bb.520:
+	jne	.LBB0_353
+# %bb.349:
 	and	al, dil
-	jne	.LBB0_524
-# %bb.521:
+	jne	.LBB0_353
+# %bb.350:
 	mov	esi, r10d
-	and	esi, -32
+	and	esi, -64
 	xor	edi, edi
-.LBB0_522:                              # =>This Inner Loop Header: Depth=1
-	vmovups	ymm0, ymmword ptr [rdx + 4*rdi]
-	vmovups	ymm1, ymmword ptr [rdx + 4*rdi + 32]
-	vmovups	ymm2, ymmword ptr [rdx + 4*rdi + 64]
-	vmovups	ymm3, ymmword ptr [rdx + 4*rdi + 96]
-	vsubps	ymm0, ymm0, ymmword ptr [rcx + 4*rdi]
-	vsubps	ymm1, ymm1, ymmword ptr [rcx + 4*rdi + 32]
-	vsubps	ymm2, ymm2, ymmword ptr [rcx + 4*rdi + 64]
-	vsubps	ymm3, ymm3, ymmword ptr [rcx + 4*rdi + 96]
-	vmovups	ymmword ptr [r8 + 4*rdi], ymm0
-	vmovups	ymmword ptr [r8 + 4*rdi + 32], ymm1
-	vmovups	ymmword ptr [r8 + 4*rdi + 64], ymm2
-	vmovups	ymmword ptr [r8 + 4*rdi + 96], ymm3
-	add	rdi, 32
+.LBB0_351:                              # =>This Inner Loop Header: Depth=1
+	vmovdqu	ymm0, ymmword ptr [rdx + 2*rdi]
+	vmovdqu	ymm1, ymmword ptr [rdx + 2*rdi + 32]
+	vmovdqu	ymm2, ymmword ptr [rdx + 2*rdi + 64]
+	vmovdqu	ymm3, ymmword ptr [rdx + 2*rdi + 96]
+	vpsubw	ymm0, ymm0, ymmword ptr [rcx + 2*rdi]
+	vpsubw	ymm1, ymm1, ymmword ptr [rcx + 2*rdi + 32]
+	vpsubw	ymm2, ymm2, ymmword ptr [rcx + 2*rdi + 64]
+	vpsubw	ymm3, ymm3, ymmword ptr [rcx + 2*rdi + 96]
+	vmovdqu	ymmword ptr [r8 + 2*rdi], ymm0
+	vmovdqu	ymmword ptr [r8 + 2*rdi + 32], ymm1
+	vmovdqu	ymmword ptr [r8 + 2*rdi + 64], ymm2
+	vmovdqu	ymmword ptr [r8 + 2*rdi + 96], ymm3
+	add	rdi, 64
 	cmp	rsi, rdi
-	jne	.LBB0_522
-# %bb.523:
+	jne	.LBB0_351
+# %bb.352:
 	cmp	rsi, r10
-	je	.LBB0_537
-.LBB0_524:
-	mov	rdi, rsi
-	not	rdi
-	add	rdi, r10
-	mov	rax, r10
-	and	rax, 3
-	je	.LBB0_526
-.LBB0_525:                              # =>This Inner Loop Header: Depth=1
-	vmovss	xmm0, dword ptr [rdx + 4*rsi]   # xmm0 = mem[0],zero,zero,zero
-	vsubss	xmm0, xmm0, dword ptr [rcx + 4*rsi]
-	vmovss	dword ptr [r8 + 4*rsi], xmm0
-	add	rsi, 1
-	add	rax, -1
-	jne	.LBB0_525
-.LBB0_526:
-	cmp	rdi, 3
-	jb	.LBB0_537
-.LBB0_527:                              # =>This Inner Loop Header: Depth=1
-	vmovss	xmm0, dword ptr [rdx + 4*rsi]   # xmm0 = mem[0],zero,zero,zero
-	vsubss	xmm0, xmm0, dword ptr [rcx + 4*rsi]
-	vmovss	dword ptr [r8 + 4*rsi], xmm0
-	vmovss	xmm0, dword ptr [rdx + 4*rsi + 4] # xmm0 = mem[0],zero,zero,zero
-	vsubss	xmm0, xmm0, dword ptr [rcx + 4*rsi + 4]
-	vmovss	dword ptr [r8 + 4*rsi + 4], xmm0
-	vmovss	xmm0, dword ptr [rdx + 4*rsi + 8] # xmm0 = mem[0],zero,zero,zero
-	vsubss	xmm0, xmm0, dword ptr [rcx + 4*rsi + 8]
-	vmovss	dword ptr [r8 + 4*rsi + 8], xmm0
-	vmovss	xmm0, dword ptr [rdx + 4*rsi + 12] # xmm0 = mem[0],zero,zero,zero
-	vsubss	xmm0, xmm0, dword ptr [rcx + 4*rsi + 12]
-	vmovss	dword ptr [r8 + 4*rsi + 12], xmm0
-	add	rsi, 4
-	cmp	r10, rsi
-	jne	.LBB0_527
-	jmp	.LBB0_537
-.LBB0_122:
-	lea	rsi, [r8 + 8*r10]
-	lea	rax, [rdx + 8*r10]
+	jne	.LBB0_353
+	jmp	.LBB0_825
+.LBB0_469:
+	lea	rsi, [r8 + 2*r10]
+	lea	rax, [rdx + 2*r10]
 	cmp	rax, r8
 	seta	r9b
-	lea	rax, [rcx + 8*r10]
+	lea	rax, [rcx + 2*r10]
 	cmp	rsi, rdx
 	seta	r11b
 	cmp	rax, r8
@@ -2693,73 +4869,40 @@ arithmetic_avx2:                        # @arithmetic_avx2
 	seta	dil
 	xor	esi, esi
 	test	r9b, r11b
-	jne	.LBB0_127
-# %bb.123:
+	jne	.LBB0_474
+# %bb.470:
 	and	al, dil
-	jne	.LBB0_127
-# %bb.124:
+	jne	.LBB0_474
+# %bb.471:
 	mov	esi, r10d
-	and	esi, -16
+	and	esi, -64
 	xor	edi, edi
-.LBB0_125:                              # =>This Inner Loop Header: Depth=1
-	vmovdqu	ymm0, ymmword ptr [rcx + 8*rdi]
-	vmovdqu	ymm1, ymmword ptr [rcx + 8*rdi + 32]
-	vmovdqu	ymm2, ymmword ptr [rcx + 8*rdi + 64]
-	vmovdqu	ymm3, ymmword ptr [rcx + 8*rdi + 96]
-	vpaddq	ymm0, ymm0, ymmword ptr [rdx + 8*rdi]
-	vpaddq	ymm1, ymm1, ymmword ptr [rdx + 8*rdi + 32]
-	vpaddq	ymm2, ymm2, ymmword ptr [rdx + 8*rdi + 64]
-	vpaddq	ymm3, ymm3, ymmword ptr [rdx + 8*rdi + 96]
-	vmovdqu	ymmword ptr [r8 + 8*rdi], ymm0
-	vmovdqu	ymmword ptr [r8 + 8*rdi + 32], ymm1
-	vmovdqu	ymmword ptr [r8 + 8*rdi + 64], ymm2
-	vmovdqu	ymmword ptr [r8 + 8*rdi + 96], ymm3
-	add	rdi, 16
+.LBB0_472:                              # =>This Inner Loop Header: Depth=1
+	vmovdqu	ymm0, ymmword ptr [rdx + 2*rdi]
+	vmovdqu	ymm1, ymmword ptr [rdx + 2*rdi + 32]
+	vmovdqu	ymm2, ymmword ptr [rdx + 2*rdi + 64]
+	vmovdqu	ymm3, ymmword ptr [rdx + 2*rdi + 96]
+	vpsubw	ymm0, ymm0, ymmword ptr [rcx + 2*rdi]
+	vpsubw	ymm1, ymm1, ymmword ptr [rcx + 2*rdi + 32]
+	vpsubw	ymm2, ymm2, ymmword ptr [rcx + 2*rdi + 64]
+	vpsubw	ymm3, ymm3, ymmword ptr [rcx + 2*rdi + 96]
+	vmovdqu	ymmword ptr [r8 + 2*rdi], ymm0
+	vmovdqu	ymmword ptr [r8 + 2*rdi + 32], ymm1
+	vmovdqu	ymmword ptr [r8 + 2*rdi + 64], ymm2
+	vmovdqu	ymmword ptr [r8 + 2*rdi + 96], ymm3
+	add	rdi, 64
 	cmp	rsi, rdi
-	jne	.LBB0_125
-# %bb.126:
+	jne	.LBB0_472
+# %bb.473:
 	cmp	rsi, r10
-	je	.LBB0_537
-.LBB0_127:
-	mov	r9, rsi
-	not	r9
-	add	r9, r10
-	mov	rax, r10
-	and	rax, 3
-	je	.LBB0_129
-.LBB0_128:                              # =>This Inner Loop Header: Depth=1
-	mov	rdi, qword ptr [rcx + 8*rsi]
-	add	rdi, qword ptr [rdx + 8*rsi]
-	mov	qword ptr [r8 + 8*rsi], rdi
-	add	rsi, 1
-	add	rax, -1
-	jne	.LBB0_128
-.LBB0_129:
-	cmp	r9, 3
-	jb	.LBB0_537
-.LBB0_130:                              # =>This Inner Loop Header: Depth=1
-	mov	rax, qword ptr [rcx + 8*rsi]
-	add	rax, qword ptr [rdx + 8*rsi]
-	mov	qword ptr [r8 + 8*rsi], rax
-	mov	rax, qword ptr [rcx + 8*rsi + 8]
-	add	rax, qword ptr [rdx + 8*rsi + 8]
-	mov	qword ptr [r8 + 8*rsi + 8], rax
-	mov	rax, qword ptr [rcx + 8*rsi + 16]
-	add	rax, qword ptr [rdx + 8*rsi + 16]
-	mov	qword ptr [r8 + 8*rsi + 16], rax
-	mov	rax, qword ptr [rcx + 8*rsi + 24]
-	add	rax, qword ptr [rdx + 8*rsi + 24]
-	mov	qword ptr [r8 + 8*rsi + 24], rax
-	add	rsi, 4
-	cmp	r10, rsi
-	jne	.LBB0_130
-	jmp	.LBB0_537
-.LBB0_134:
-	lea	rsi, [r8 + 4*r10]
-	lea	rax, [rdx + 4*r10]
+	jne	.LBB0_474
+	jmp	.LBB0_825
+.LBB0_481:
+	lea	rsi, [r8 + 2*r10]
+	lea	rax, [rdx + 2*r10]
 	cmp	rax, r8
 	seta	r9b
-	lea	rax, [rcx + 4*r10]
+	lea	rax, [rcx + 2*r10]
 	cmp	rsi, rdx
 	seta	r11b
 	cmp	rax, r8
@@ -2768,68 +4911,35 @@ arithmetic_avx2:                        # @arithmetic_avx2
 	seta	dil
 	xor	esi, esi
 	test	r9b, r11b
-	jne	.LBB0_139
-# %bb.135:
+	jne	.LBB0_486
+# %bb.482:
 	and	al, dil
-	jne	.LBB0_139
-# %bb.136:
+	jne	.LBB0_486
+# %bb.483:
 	mov	esi, r10d
-	and	esi, -32
+	and	esi, -64
 	xor	edi, edi
-.LBB0_137:                              # =>This Inner Loop Header: Depth=1
-	vmovups	ymm0, ymmword ptr [rcx + 4*rdi]
-	vmovups	ymm1, ymmword ptr [rcx + 4*rdi + 32]
-	vmovups	ymm2, ymmword ptr [rcx + 4*rdi + 64]
-	vmovups	ymm3, ymmword ptr [rcx + 4*rdi + 96]
-	vaddps	ymm0, ymm0, ymmword ptr [rdx + 4*rdi]
-	vaddps	ymm1, ymm1, ymmword ptr [rdx + 4*rdi + 32]
-	vaddps	ymm2, ymm2, ymmword ptr [rdx + 4*rdi + 64]
-	vaddps	ymm3, ymm3, ymmword ptr [rdx + 4*rdi + 96]
-	vmovups	ymmword ptr [r8 + 4*rdi], ymm0
-	vmovups	ymmword ptr [r8 + 4*rdi + 32], ymm1
-	vmovups	ymmword ptr [r8 + 4*rdi + 64], ymm2
-	vmovups	ymmword ptr [r8 + 4*rdi + 96], ymm3
-	add	rdi, 32
+.LBB0_484:                              # =>This Inner Loop Header: Depth=1
+	vmovdqu	ymm0, ymmword ptr [rdx + 2*rdi]
+	vmovdqu	ymm1, ymmword ptr [rdx + 2*rdi + 32]
+	vmovdqu	ymm2, ymmword ptr [rdx + 2*rdi + 64]
+	vmovdqu	ymm3, ymmword ptr [rdx + 2*rdi + 96]
+	vpsubw	ymm0, ymm0, ymmword ptr [rcx + 2*rdi]
+	vpsubw	ymm1, ymm1, ymmword ptr [rcx + 2*rdi + 32]
+	vpsubw	ymm2, ymm2, ymmword ptr [rcx + 2*rdi + 64]
+	vpsubw	ymm3, ymm3, ymmword ptr [rcx + 2*rdi + 96]
+	vmovdqu	ymmword ptr [r8 + 2*rdi], ymm0
+	vmovdqu	ymmword ptr [r8 + 2*rdi + 32], ymm1
+	vmovdqu	ymmword ptr [r8 + 2*rdi + 64], ymm2
+	vmovdqu	ymmword ptr [r8 + 2*rdi + 96], ymm3
+	add	rdi, 64
 	cmp	rsi, rdi
-	jne	.LBB0_137
-# %bb.138:
+	jne	.LBB0_484
+# %bb.485:
 	cmp	rsi, r10
-	je	.LBB0_537
-.LBB0_139:
-	mov	rdi, rsi
-	not	rdi
-	add	rdi, r10
-	mov	rax, r10
-	and	rax, 3
-	je	.LBB0_141
-.LBB0_140:                              # =>This Inner Loop Header: Depth=1
-	vmovss	xmm0, dword ptr [rcx + 4*rsi]   # xmm0 = mem[0],zero,zero,zero
-	vaddss	xmm0, xmm0, dword ptr [rdx + 4*rsi]
-	vmovss	dword ptr [r8 + 4*rsi], xmm0
-	add	rsi, 1
-	add	rax, -1
-	jne	.LBB0_140
-.LBB0_141:
-	cmp	rdi, 3
-	jb	.LBB0_537
-.LBB0_142:                              # =>This Inner Loop Header: Depth=1
-	vmovss	xmm0, dword ptr [rcx + 4*rsi]   # xmm0 = mem[0],zero,zero,zero
-	vaddss	xmm0, xmm0, dword ptr [rdx + 4*rsi]
-	vmovss	dword ptr [r8 + 4*rsi], xmm0
-	vmovss	xmm0, dword ptr [rcx + 4*rsi + 4] # xmm0 = mem[0],zero,zero,zero
-	vaddss	xmm0, xmm0, dword ptr [rdx + 4*rsi + 4]
-	vmovss	dword ptr [r8 + 4*rsi + 4], xmm0
-	vmovss	xmm0, dword ptr [rcx + 4*rsi + 8] # xmm0 = mem[0],zero,zero,zero
-	vaddss	xmm0, xmm0, dword ptr [rdx + 4*rsi + 8]
-	vmovss	dword ptr [r8 + 4*rsi + 8], xmm0
-	vmovss	xmm0, dword ptr [rcx + 4*rsi + 12] # xmm0 = mem[0],zero,zero,zero
-	vaddss	xmm0, xmm0, dword ptr [rdx + 4*rsi + 12]
-	vmovss	dword ptr [r8 + 4*rsi + 12], xmm0
-	add	rsi, 4
-	cmp	r10, rsi
-	jne	.LBB0_142
-	jmp	.LBB0_537
-.LBB0_255:
+	jne	.LBB0_486
+	jmp	.LBB0_825
+.LBB0_659:
 	lea	rsi, [r8 + 8*r10]
 	lea	rax, [rdx + 8*r10]
 	cmp	rax, r8
@@ -2843,68 +4953,67 @@ arithmetic_avx2:                        # @arithmetic_avx2
 	seta	dil
 	xor	esi, esi
 	test	r9b, r11b
-	jne	.LBB0_260
-# %bb.256:
+	jne	.LBB0_664
+# %bb.660:
 	and	al, dil
-	jne	.LBB0_260
-# %bb.257:
+	jne	.LBB0_664
+# %bb.661:
 	mov	esi, r10d
 	and	esi, -16
 	xor	edi, edi
-.LBB0_258:                              # =>This Inner Loop Header: Depth=1
-	vmovdqu	ymm0, ymmword ptr [rcx + 8*rdi]
-	vmovdqu	ymm1, ymmword ptr [rcx + 8*rdi + 32]
-	vmovdqu	ymm2, ymmword ptr [rcx + 8*rdi + 64]
-	vmovdqu	ymm3, ymmword ptr [rcx + 8*rdi + 96]
-	vpaddq	ymm0, ymm0, ymmword ptr [rdx + 8*rdi]
-	vpaddq	ymm1, ymm1, ymmword ptr [rdx + 8*rdi + 32]
-	vpaddq	ymm2, ymm2, ymmword ptr [rdx + 8*rdi + 64]
-	vpaddq	ymm3, ymm3, ymmword ptr [rdx + 8*rdi + 96]
-	vmovdqu	ymmword ptr [r8 + 8*rdi], ymm0
-	vmovdqu	ymmword ptr [r8 + 8*rdi + 32], ymm1
-	vmovdqu	ymmword ptr [r8 + 8*rdi + 64], ymm2
-	vmovdqu	ymmword ptr [r8 + 8*rdi + 96], ymm3
+.LBB0_662:                              # =>This Inner Loop Header: Depth=1
+	vmovdqu	ymm1, ymmword ptr [rdx + 8*rdi]
+	vmovdqu	ymm2, ymmword ptr [rdx + 8*rdi + 32]
+	vmovdqu	ymm3, ymmword ptr [rdx + 8*rdi + 64]
+	vmovdqu	ymm0, ymmword ptr [rdx + 8*rdi + 96]
+	vmovdqu	ymm4, ymmword ptr [rcx + 8*rdi]
+	vmovdqu	ymm5, ymmword ptr [rcx + 8*rdi + 32]
+	vmovdqu	ymm6, ymmword ptr [rcx + 8*rdi + 64]
+	vmovdqu	ymm7, ymmword ptr [rcx + 8*rdi + 96]
+	vpsrlq	ymm8, ymm4, 32
+	vpmuludq	ymm8, ymm8, ymm1
+	vpsrlq	ymm9, ymm1, 32
+	vpmuludq	ymm9, ymm9, ymm4
+	vpaddq	ymm8, ymm9, ymm8
+	vpsllq	ymm8, ymm8, 32
+	vpmuludq	ymm1, ymm4, ymm1
+	vpaddq	ymm1, ymm8, ymm1
+	vpsrlq	ymm4, ymm5, 32
+	vpmuludq	ymm4, ymm4, ymm2
+	vpsrlq	ymm8, ymm2, 32
+	vpmuludq	ymm8, ymm8, ymm5
+	vpaddq	ymm4, ymm8, ymm4
+	vpsllq	ymm4, ymm4, 32
+	vpmuludq	ymm2, ymm5, ymm2
+	vpaddq	ymm2, ymm2, ymm4
+	vpsrlq	ymm4, ymm6, 32
+	vpmuludq	ymm4, ymm4, ymm3
+	vpsrlq	ymm5, ymm3, 32
+	vpmuludq	ymm5, ymm6, ymm5
+	vpaddq	ymm4, ymm5, ymm4
+	vpsllq	ymm4, ymm4, 32
+	vpmuludq	ymm3, ymm6, ymm3
+	vpaddq	ymm3, ymm3, ymm4
+	vpsrlq	ymm4, ymm7, 32
+	vpmuludq	ymm4, ymm4, ymm0
+	vpsrlq	ymm5, ymm0, 32
+	vpmuludq	ymm5, ymm7, ymm5
+	vpaddq	ymm4, ymm5, ymm4
+	vpsllq	ymm4, ymm4, 32
+	vpmuludq	ymm0, ymm7, ymm0
+	vpaddq	ymm0, ymm0, ymm4
+	vmovdqu	ymmword ptr [r8 + 8*rdi], ymm1
+	vmovdqu	ymmword ptr [r8 + 8*rdi + 32], ymm2
+	vmovdqu	ymmword ptr [r8 + 8*rdi + 64], ymm3
+	vmovdqu	ymmword ptr [r8 + 8*rdi + 96], ymm0
 	add	rdi, 16
 	cmp	rsi, rdi
-	jne	.LBB0_258
-# %bb.259:
+	jne	.LBB0_662
+# %bb.663:
 	cmp	rsi, r10
-	je	.LBB0_537
-.LBB0_260:
-	mov	r9, rsi
-	not	r9
-	add	r9, r10
-	mov	rax, r10
-	and	rax, 3
-	je	.LBB0_262
-.LBB0_261:                              # =>This Inner Loop Header: Depth=1
-	mov	rdi, qword ptr [rcx + 8*rsi]
-	add	rdi, qword ptr [rdx + 8*rsi]
-	mov	qword ptr [r8 + 8*rsi], rdi
-	add	rsi, 1
-	add	rax, -1
-	jne	.LBB0_261
-.LBB0_262:
-	cmp	r9, 3
-	jb	.LBB0_537
-.LBB0_263:                              # =>This Inner Loop Header: Depth=1
-	mov	rax, qword ptr [rcx + 8*rsi]
-	add	rax, qword ptr [rdx + 8*rsi]
-	mov	qword ptr [r8 + 8*rsi], rax
-	mov	rax, qword ptr [rcx + 8*rsi + 8]
-	add	rax, qword ptr [rdx + 8*rsi + 8]
-	mov	qword ptr [r8 + 8*rsi + 8], rax
-	mov	rax, qword ptr [rcx + 8*rsi + 16]
-	add	rax, qword ptr [rdx + 8*rsi + 16]
-	mov	qword ptr [r8 + 8*rsi + 16], rax
-	mov	rax, qword ptr [rcx + 8*rsi + 24]
-	add	rax, qword ptr [rdx + 8*rsi + 24]
-	mov	qword ptr [r8 + 8*rsi + 24], rax
-	add	rsi, 4
-	cmp	r10, rsi
-	jne	.LBB0_263
-	jmp	.LBB0_537
-.LBB0_267:
+	jne	.LBB0_664
+	jmp	.LBB0_825
+.LBB0_671:
 	lea	rsi, [r8 + 4*r10]
 	lea	rax, [rdx + 4*r10]
 	cmp	rax, r8
@@ -2918,148 +5027,40 @@ arithmetic_avx2:                        # @arithmetic_avx2
 	seta	dil
 	xor	esi, esi
 	test	r9b, r11b
-	jne	.LBB0_272
-# %bb.268:
+	jne	.LBB0_676
+# %bb.672:
 	and	al, dil
-	jne	.LBB0_272
-# %bb.269:
+	jne	.LBB0_676
+# %bb.673:
 	mov	esi, r10d
 	and	esi, -32
 	xor	edi, edi
-.LBB0_270:                              # =>This Inner Loop Header: Depth=1
+.LBB0_674:                              # =>This Inner Loop Header: Depth=1
 	vmovups	ymm0, ymmword ptr [rcx + 4*rdi]
 	vmovups	ymm1, ymmword ptr [rcx + 4*rdi + 32]
 	vmovups	ymm2, ymmword ptr [rcx + 4*rdi + 64]
 	vmovups	ymm3, ymmword ptr [rcx + 4*rdi + 96]
-	vaddps	ymm0, ymm0, ymmword ptr [rdx + 4*rdi]
-	vaddps	ymm1, ymm1, ymmword ptr [rdx + 4*rdi + 32]
-	vaddps	ymm2, ymm2, ymmword ptr [rdx + 4*rdi + 64]
-	vaddps	ymm3, ymm3, ymmword ptr [rdx + 4*rdi + 96]
+	vmulps	ymm0, ymm0, ymmword ptr [rdx + 4*rdi]
+	vmulps	ymm1, ymm1, ymmword ptr [rdx + 4*rdi + 32]
+	vmulps	ymm2, ymm2, ymmword ptr [rdx + 4*rdi + 64]
+	vmulps	ymm3, ymm3, ymmword ptr [rdx + 4*rdi + 96]
 	vmovups	ymmword ptr [r8 + 4*rdi], ymm0
 	vmovups	ymmword ptr [r8 + 4*rdi + 32], ymm1
-	vmovups	ymmword ptr [r8 + 4*rdi + 64], ymm2
-	vmovups	ymmword ptr [r8 + 4*rdi + 96], ymm3
-	add	rdi, 32
-	cmp	rsi, rdi
-	jne	.LBB0_270
-# %bb.271:
-	cmp	rsi, r10
-	je	.LBB0_537
-.LBB0_272:
-	mov	rdi, rsi
-	not	rdi
-	add	rdi, r10
-	mov	rax, r10
-	and	rax, 3
-	je	.LBB0_274
-.LBB0_273:                              # =>This Inner Loop Header: Depth=1
-	vmovss	xmm0, dword ptr [rcx + 4*rsi]   # xmm0 = mem[0],zero,zero,zero
-	vaddss	xmm0, xmm0, dword ptr [rdx + 4*rsi]
-	vmovss	dword ptr [r8 + 4*rsi], xmm0
-	add	rsi, 1
-	add	rax, -1
-	jne	.LBB0_273
-.LBB0_274:
-	cmp	rdi, 3
-	jb	.LBB0_537
-.LBB0_275:                              # =>This Inner Loop Header: Depth=1
-	vmovss	xmm0, dword ptr [rcx + 4*rsi]   # xmm0 = mem[0],zero,zero,zero
-	vaddss	xmm0, xmm0, dword ptr [rdx + 4*rsi]
-	vmovss	dword ptr [r8 + 4*rsi], xmm0
-	vmovss	xmm0, dword ptr [rcx + 4*rsi + 4] # xmm0 = mem[0],zero,zero,zero
-	vaddss	xmm0, xmm0, dword ptr [rdx + 4*rsi + 4]
-	vmovss	dword ptr [r8 + 4*rsi + 4], xmm0
-	vmovss	xmm0, dword ptr [rcx + 4*rsi + 8] # xmm0 = mem[0],zero,zero,zero
-	vaddss	xmm0, xmm0, dword ptr [rdx + 4*rsi + 8]
-	vmovss	dword ptr [r8 + 4*rsi + 8], xmm0
-	vmovss	xmm0, dword ptr [rcx + 4*rsi + 12] # xmm0 = mem[0],zero,zero,zero
-	vaddss	xmm0, xmm0, dword ptr [rdx + 4*rsi + 12]
-	vmovss	dword ptr [r8 + 4*rsi + 12], xmm0
-	add	rsi, 4
-	cmp	r10, rsi
-	jne	.LBB0_275
-	jmp	.LBB0_537
-.LBB0_306:
-	lea	rsi, [r8 + r10]
-	lea	rax, [rdx + r10]
-	cmp	rax, r8
-	seta	r9b
-	lea	rax, [rcx + r10]
-	cmp	rsi, rdx
-	seta	r11b
-	cmp	rax, r8
-	seta	al
-	cmp	rsi, rcx
-	seta	dil
-	xor	esi, esi
-	test	r9b, r11b
-	jne	.LBB0_311
-# %bb.307:
-	and	al, dil
-	jne	.LBB0_311
-# %bb.308:
-	mov	esi, r10d
-	and	esi, -128
-	xor	edi, edi
-.LBB0_309:                              # =>This Inner Loop Header: Depth=1
-	vmovdqu	ymm0, ymmword ptr [rdx + rdi]
-	vmovdqu	ymm1, ymmword ptr [rdx + rdi + 32]
-	vmovdqu	ymm2, ymmword ptr [rdx + rdi + 64]
-	vmovdqu	ymm3, ymmword ptr [rdx + rdi + 96]
-	vpsubb	ymm0, ymm0, ymmword ptr [rcx + rdi]
-	vpsubb	ymm1, ymm1, ymmword ptr [rcx + rdi + 32]
-	vpsubb	ymm2, ymm2, ymmword ptr [rcx + rdi + 64]
-	vpsubb	ymm3, ymm3, ymmword ptr [rcx + rdi + 96]
-	vmovdqu	ymmword ptr [r8 + rdi], ymm0
-	vmovdqu	ymmword ptr [r8 + rdi + 32], ymm1
-	vmovdqu	ymmword ptr [r8 + rdi + 64], ymm2
-	vmovdqu	ymmword ptr [r8 + rdi + 96], ymm3
-	sub	rdi, -128
-	cmp	rsi, rdi
-	jne	.LBB0_309
-# %bb.310:
-	cmp	rsi, r10
-	je	.LBB0_537
-.LBB0_311:
-	mov	r9, rsi
-	not	r9
-	add	r9, r10
-	mov	rdi, r10
-	and	rdi, 3
-	je	.LBB0_313
-.LBB0_312:                              # =>This Inner Loop Header: Depth=1
-	movzx	eax, byte ptr [rdx + rsi]
-	sub	al, byte ptr [rcx + rsi]
-	mov	byte ptr [r8 + rsi], al
-	add	rsi, 1
-	add	rdi, -1
-	jne	.LBB0_312
-.LBB0_313:
-	cmp	r9, 3
-	jb	.LBB0_537
-.LBB0_314:                              # =>This Inner Loop Header: Depth=1
-	movzx	eax, byte ptr [rdx + rsi]
-	sub	al, byte ptr [rcx + rsi]
-	mov	byte ptr [r8 + rsi], al
-	movzx	eax, byte ptr [rdx + rsi + 1]
-	sub	al, byte ptr [rcx + rsi + 1]
-	mov	byte ptr [r8 + rsi + 1], al
-	movzx	eax, byte ptr [rdx + rsi + 2]
-	sub	al, byte ptr [rcx + rsi + 2]
-	mov	byte ptr [r8 + rsi + 2], al
-	movzx	eax, byte ptr [rdx + rsi + 3]
-	sub	al, byte ptr [rcx + rsi + 3]
-	mov	byte ptr [r8 + rsi + 3], al
-	add	rsi, 4
-	cmp	r10, rsi
-	jne	.LBB0_314
-	jmp	.LBB0_537
-.LBB0_432:
-	lea	rsi, [r8 + r10]
-	lea	rax, [rdx + r10]
+	vmovups	ymmword ptr [r8 + 4*rdi + 64], ymm2
+	vmovups	ymmword ptr [r8 + 4*rdi + 96], ymm3
+	add	rdi, 32
+	cmp	rsi, rdi
+	jne	.LBB0_674
+# %bb.675:
+	cmp	rsi, r10
+	jne	.LBB0_676
+	jmp	.LBB0_825
+.LBB0_795:
+	lea	rsi, [r8 + 8*r10]
+	lea	rax, [rdx + 8*r10]
 	cmp	rax, r8
 	seta	r9b
-	lea	rax, [rcx + r10]
+	lea	rax, [rcx + 8*r10]
 	cmp	rsi, rdx
 	seta	r11b
 	cmp	rax, r8
@@ -3068,73 +5069,72 @@ arithmetic_avx2:                        # @arithmetic_avx2
 	seta	dil
 	xor	esi, esi
 	test	r9b, r11b
-	jne	.LBB0_437
-# %bb.433:
+	jne	.LBB0_800
+# %bb.796:
 	and	al, dil
-	jne	.LBB0_437
-# %bb.434:
+	jne	.LBB0_800
+# %bb.797:
 	mov	esi, r10d
-	and	esi, -128
+	and	esi, -16
 	xor	edi, edi
-.LBB0_435:                              # =>This Inner Loop Header: Depth=1
-	vmovdqu	ymm0, ymmword ptr [rdx + rdi]
-	vmovdqu	ymm1, ymmword ptr [rdx + rdi + 32]
-	vmovdqu	ymm2, ymmword ptr [rdx + rdi + 64]
-	vmovdqu	ymm3, ymmword ptr [rdx + rdi + 96]
-	vpsubb	ymm0, ymm0, ymmword ptr [rcx + rdi]
-	vpsubb	ymm1, ymm1, ymmword ptr [rcx + rdi + 32]
-	vpsubb	ymm2, ymm2, ymmword ptr [rcx + rdi + 64]
-	vpsubb	ymm3, ymm3, ymmword ptr [rcx + rdi + 96]
-	vmovdqu	ymmword ptr [r8 + rdi], ymm0
-	vmovdqu	ymmword ptr [r8 + rdi + 32], ymm1
-	vmovdqu	ymmword ptr [r8 + rdi + 64], ymm2
-	vmovdqu	ymmword ptr [r8 + rdi + 96], ymm3
-	sub	rdi, -128
+.LBB0_798:                              # =>This Inner Loop Header: Depth=1
+	vmovdqu	ymm1, ymmword ptr [rdx + 8*rdi]
+	vmovdqu	ymm2, ymmword ptr [rdx + 8*rdi + 32]
+	vmovdqu	ymm3, ymmword ptr [rdx + 8*rdi + 64]
+	vmovdqu	ymm0, ymmword ptr [rdx + 8*rdi + 96]
+	vmovdqu	ymm4, ymmword ptr [rcx + 8*rdi]
+	vmovdqu	ymm5, ymmword ptr [rcx + 8*rdi + 32]
+	vmovdqu	ymm6, ymmword ptr [rcx + 8*rdi + 64]
+	vmovdqu	ymm7, ymmword ptr [rcx + 8*rdi + 96]
+	vpsrlq	ymm8, ymm4, 32
+	vpmuludq	ymm8, ymm8, ymm1
+	vpsrlq	ymm9, ymm1, 32
+	vpmuludq	ymm9, ymm9, ymm4
+	vpaddq	ymm8, ymm9, ymm8
+	vpsllq	ymm8, ymm8, 32
+	vpmuludq	ymm1, ymm4, ymm1
+	vpaddq	ymm1, ymm8, ymm1
+	vpsrlq	ymm4, ymm5, 32
+	vpmuludq	ymm4, ymm4, ymm2
+	vpsrlq	ymm8, ymm2, 32
+	vpmuludq	ymm8, ymm8, ymm5
+	vpaddq	ymm4, ymm8, ymm4
+	vpsllq	ymm4, ymm4, 32
+	vpmuludq	ymm2, ymm5, ymm2
+	vpaddq	ymm2, ymm2, ymm4
+	vpsrlq	ymm4, ymm6, 32
+	vpmuludq	ymm4, ymm4, ymm3
+	vpsrlq	ymm5, ymm3, 32
+	vpmuludq	ymm5, ymm6, ymm5
+	vpaddq	ymm4, ymm5, ymm4
+	vpsllq	ymm4, ymm4, 32
+	vpmuludq	ymm3, ymm6, ymm3
+	vpaddq	ymm3, ymm3, ymm4
+	vpsrlq	ymm4, ymm7, 32
+	vpmuludq	ymm4, ymm4, ymm0
+	vpsrlq	ymm5, ymm0, 32
+	vpmuludq	ymm5, ymm7, ymm5
+	vpaddq	ymm4, ymm5, ymm4
+	vpsllq	ymm4, ymm4, 32
+	vpmuludq	ymm0, ymm7, ymm0
+	vpaddq	ymm0, ymm0, ymm4
+	vmovdqu	ymmword ptr [r8 + 8*rdi], ymm1
+	vmovdqu	ymmword ptr [r8 + 8*rdi + 32], ymm2
+	vmovdqu	ymmword ptr [r8 + 8*rdi + 64], ymm3
+	vmovdqu	ymmword ptr [r8 + 8*rdi + 96], ymm0
+	add	rdi, 16
 	cmp	rsi, rdi
-	jne	.LBB0_435
-# %bb.436:
+	jne	.LBB0_798
+# %bb.799:
 	cmp	rsi, r10
-	je	.LBB0_537
-.LBB0_437:
-	mov	r9, rsi
-	not	r9
-	add	r9, r10
-	mov	rdi, r10
-	and	rdi, 3
-	je	.LBB0_439
-.LBB0_438:                              # =>This Inner Loop Header: Depth=1
-	movzx	eax, byte ptr [rdx + rsi]
-	sub	al, byte ptr [rcx + rsi]
-	mov	byte ptr [r8 + rsi], al
-	add	rsi, 1
-	add	rdi, -1
-	jne	.LBB0_438
-.LBB0_439:
-	cmp	r9, 3
-	jb	.LBB0_537
-.LBB0_440:                              # =>This Inner Loop Header: Depth=1
-	movzx	eax, byte ptr [rdx + rsi]
-	sub	al, byte ptr [rcx + rsi]
-	mov	byte ptr [r8 + rsi], al
-	movzx	eax, byte ptr [rdx + rsi + 1]
-	sub	al, byte ptr [rcx + rsi + 1]
-	mov	byte ptr [r8 + rsi + 1], al
-	movzx	eax, byte ptr [rdx + rsi + 2]
-	sub	al, byte ptr [rcx + rsi + 2]
-	mov	byte ptr [r8 + rsi + 2], al
-	movzx	eax, byte ptr [rdx + rsi + 3]
-	sub	al, byte ptr [rcx + rsi + 3]
-	mov	byte ptr [r8 + rsi + 3], al
-	add	rsi, 4
-	cmp	r10, rsi
-	jne	.LBB0_440
-	jmp	.LBB0_537
-.LBB0_47:
-	lea	rsi, [r8 + r10]
-	lea	rax, [rdx + r10]
+	jne	.LBB0_800
+	jmp	.LBB0_825
+.LBB0_807:
+	lea	rsi, [r8 + 4*r10]
+	lea	rax, [rdx + 4*r10]
 	cmp	rax, r8
 	seta	r9b
-	lea	rax, [rcx + r10]
+	lea	rax, [rcx + 4*r10]
 	cmp	rsi, rdx
 	seta	r11b
 	cmp	rax, r8
@@ -3143,73 +5143,40 @@ arithmetic_avx2:                        # @arithmetic_avx2
 	seta	dil
 	xor	esi, esi
 	test	r9b, r11b
-	jne	.LBB0_52
-# %bb.48:
+	jne	.LBB0_812
+# %bb.808:
 	and	al, dil
-	jne	.LBB0_52
-# %bb.49:
+	jne	.LBB0_812
+# %bb.809:
 	mov	esi, r10d
-	and	esi, -128
+	and	esi, -32
 	xor	edi, edi
-.LBB0_50:                               # =>This Inner Loop Header: Depth=1
-	vmovdqu	ymm0, ymmword ptr [rcx + rdi]
-	vmovdqu	ymm1, ymmword ptr [rcx + rdi + 32]
-	vmovdqu	ymm2, ymmword ptr [rcx + rdi + 64]
-	vmovdqu	ymm3, ymmword ptr [rcx + rdi + 96]
-	vpaddb	ymm0, ymm0, ymmword ptr [rdx + rdi]
-	vpaddb	ymm1, ymm1, ymmword ptr [rdx + rdi + 32]
-	vpaddb	ymm2, ymm2, ymmword ptr [rdx + rdi + 64]
-	vpaddb	ymm3, ymm3, ymmword ptr [rdx + rdi + 96]
-	vmovdqu	ymmword ptr [r8 + rdi], ymm0
-	vmovdqu	ymmword ptr [r8 + rdi + 32], ymm1
-	vmovdqu	ymmword ptr [r8 + rdi + 64], ymm2
-	vmovdqu	ymmword ptr [r8 + rdi + 96], ymm3
-	sub	rdi, -128
+.LBB0_810:                              # =>This Inner Loop Header: Depth=1
+	vmovups	ymm0, ymmword ptr [rcx + 4*rdi]
+	vmovups	ymm1, ymmword ptr [rcx + 4*rdi + 32]
+	vmovups	ymm2, ymmword ptr [rcx + 4*rdi + 64]
+	vmovups	ymm3, ymmword ptr [rcx + 4*rdi + 96]
+	vmulps	ymm0, ymm0, ymmword ptr [rdx + 4*rdi]
+	vmulps	ymm1, ymm1, ymmword ptr [rdx + 4*rdi + 32]
+	vmulps	ymm2, ymm2, ymmword ptr [rdx + 4*rdi + 64]
+	vmulps	ymm3, ymm3, ymmword ptr [rdx + 4*rdi + 96]
+	vmovups	ymmword ptr [r8 + 4*rdi], ymm0
+	vmovups	ymmword ptr [r8 + 4*rdi + 32], ymm1
+	vmovups	ymmword ptr [r8 + 4*rdi + 64], ymm2
+	vmovups	ymmword ptr [r8 + 4*rdi + 96], ymm3
+	add	rdi, 32
 	cmp	rsi, rdi
-	jne	.LBB0_50
-# %bb.51:
+	jne	.LBB0_810
+# %bb.811:
 	cmp	rsi, r10
-	je	.LBB0_537
-.LBB0_52:
-	mov	r9, rsi
-	not	r9
-	add	r9, r10
-	mov	rdi, r10
-	and	rdi, 3
-	je	.LBB0_54
-.LBB0_53:                               # =>This Inner Loop Header: Depth=1
-	movzx	eax, byte ptr [rcx + rsi]
-	add	al, byte ptr [rdx + rsi]
-	mov	byte ptr [r8 + rsi], al
-	add	rsi, 1
-	add	rdi, -1
-	jne	.LBB0_53
-.LBB0_54:
-	cmp	r9, 3
-	jb	.LBB0_537
-.LBB0_55:                               # =>This Inner Loop Header: Depth=1
-	movzx	eax, byte ptr [rcx + rsi]
-	add	al, byte ptr [rdx + rsi]
-	mov	byte ptr [r8 + rsi], al
-	movzx	eax, byte ptr [rcx + rsi + 1]
-	add	al, byte ptr [rdx + rsi + 1]
-	mov	byte ptr [r8 + rsi + 1], al
-	movzx	eax, byte ptr [rcx + rsi + 2]
-	add	al, byte ptr [rdx + rsi + 2]
-	mov	byte ptr [r8 + rsi + 2], al
-	movzx	eax, byte ptr [rcx + rsi + 3]
-	add	al, byte ptr [rdx + rsi + 3]
-	mov	byte ptr [r8 + rsi + 3], al
-	add	rsi, 4
-	cmp	r10, rsi
-	jne	.LBB0_55
-	jmp	.LBB0_537
-.LBB0_180:
-	lea	rsi, [r8 + r10]
-	lea	rax, [rdx + r10]
+	jne	.LBB0_812
+	jmp	.LBB0_825
+.LBB0_390:
+	lea	rsi, [r8 + 8*r10]
+	lea	rax, [rdx + 8*r10]
 	cmp	rax, r8
 	seta	r9b
-	lea	rax, [rcx + r10]
+	lea	rax, [rcx + 8*r10]
 	cmp	rsi, rdx
 	seta	r11b
 	cmp	rax, r8
@@ -3218,68 +5185,35 @@ arithmetic_avx2:                        # @arithmetic_avx2
 	seta	dil
 	xor	esi, esi
 	test	r9b, r11b
-	jne	.LBB0_185
-# %bb.181:
+	jne	.LBB0_395
+# %bb.391:
 	and	al, dil
-	jne	.LBB0_185
-# %bb.182:
+	jne	.LBB0_395
+# %bb.392:
 	mov	esi, r10d
-	and	esi, -128
+	and	esi, -16
 	xor	edi, edi
-.LBB0_183:                              # =>This Inner Loop Header: Depth=1
-	vmovdqu	ymm0, ymmword ptr [rcx + rdi]
-	vmovdqu	ymm1, ymmword ptr [rcx + rdi + 32]
-	vmovdqu	ymm2, ymmword ptr [rcx + rdi + 64]
-	vmovdqu	ymm3, ymmword ptr [rcx + rdi + 96]
-	vpaddb	ymm0, ymm0, ymmword ptr [rdx + rdi]
-	vpaddb	ymm1, ymm1, ymmword ptr [rdx + rdi + 32]
-	vpaddb	ymm2, ymm2, ymmword ptr [rdx + rdi + 64]
-	vpaddb	ymm3, ymm3, ymmword ptr [rdx + rdi + 96]
-	vmovdqu	ymmword ptr [r8 + rdi], ymm0
-	vmovdqu	ymmword ptr [r8 + rdi + 32], ymm1
-	vmovdqu	ymmword ptr [r8 + rdi + 64], ymm2
-	vmovdqu	ymmword ptr [r8 + rdi + 96], ymm3
-	sub	rdi, -128
+.LBB0_393:                              # =>This Inner Loop Header: Depth=1
+	vmovdqu	ymm0, ymmword ptr [rdx + 8*rdi]
+	vmovdqu	ymm1, ymmword ptr [rdx + 8*rdi + 32]
+	vmovdqu	ymm2, ymmword ptr [rdx + 8*rdi + 64]
+	vmovdqu	ymm3, ymmword ptr [rdx + 8*rdi + 96]
+	vpsubq	ymm0, ymm0, ymmword ptr [rcx + 8*rdi]
+	vpsubq	ymm1, ymm1, ymmword ptr [rcx + 8*rdi + 32]
+	vpsubq	ymm2, ymm2, ymmword ptr [rcx + 8*rdi + 64]
+	vpsubq	ymm3, ymm3, ymmword ptr [rcx + 8*rdi + 96]
+	vmovdqu	ymmword ptr [r8 + 8*rdi], ymm0
+	vmovdqu	ymmword ptr [r8 + 8*rdi + 32], ymm1
+	vmovdqu	ymmword ptr [r8 + 8*rdi + 64], ymm2
+	vmovdqu	ymmword ptr [r8 + 8*rdi + 96], ymm3
+	add	rdi, 16
 	cmp	rsi, rdi
-	jne	.LBB0_183
-# %bb.184:
+	jne	.LBB0_393
+# %bb.394:
 	cmp	rsi, r10
-	je	.LBB0_537
-.LBB0_185:
-	mov	r9, rsi
-	not	r9
-	add	r9, r10
-	mov	rdi, r10
-	and	rdi, 3
-	je	.LBB0_187
-.LBB0_186:                              # =>This Inner Loop Header: Depth=1
-	movzx	eax, byte ptr [rcx + rsi]
-	add	al, byte ptr [rdx + rsi]
-	mov	byte ptr [r8 + rsi], al
-	add	rsi, 1
-	add	rdi, -1
-	jne	.LBB0_186
-.LBB0_187:
-	cmp	r9, 3
-	jb	.LBB0_537
-.LBB0_188:                              # =>This Inner Loop Header: Depth=1
-	movzx	eax, byte ptr [rcx + rsi]
-	add	al, byte ptr [rdx + rsi]
-	mov	byte ptr [r8 + rsi], al
-	movzx	eax, byte ptr [rcx + rsi + 1]
-	add	al, byte ptr [rdx + rsi + 1]
-	mov	byte ptr [r8 + rsi + 1], al
-	movzx	eax, byte ptr [rcx + rsi + 2]
-	add	al, byte ptr [rdx + rsi + 2]
-	mov	byte ptr [r8 + rsi + 2], al
-	movzx	eax, byte ptr [rcx + rsi + 3]
-	add	al, byte ptr [rdx + rsi + 3]
-	mov	byte ptr [r8 + rsi + 3], al
-	add	rsi, 4
-	cmp	r10, rsi
-	jne	.LBB0_188
-	jmp	.LBB0_537
-.LBB0_360:
+	jne	.LBB0_395
+	jmp	.LBB0_825
+.LBB0_402:
 	lea	rsi, [r8 + 4*r10]
 	lea	rax, [rdx + 4*r10]
 	cmp	rax, r8
@@ -3293,73 +5227,40 @@ arithmetic_avx2:                        # @arithmetic_avx2
 	seta	dil
 	xor	esi, esi
 	test	r9b, r11b
-	jne	.LBB0_365
-# %bb.361:
+	jne	.LBB0_407
+# %bb.403:
 	and	al, dil
-	jne	.LBB0_365
-# %bb.362:
+	jne	.LBB0_407
+# %bb.404:
 	mov	esi, r10d
 	and	esi, -32
 	xor	edi, edi
-.LBB0_363:                              # =>This Inner Loop Header: Depth=1
-	vmovdqu	ymm0, ymmword ptr [rdx + 4*rdi]
-	vmovdqu	ymm1, ymmword ptr [rdx + 4*rdi + 32]
-	vmovdqu	ymm2, ymmword ptr [rdx + 4*rdi + 64]
-	vmovdqu	ymm3, ymmword ptr [rdx + 4*rdi + 96]
-	vpsubd	ymm0, ymm0, ymmword ptr [rcx + 4*rdi]
-	vpsubd	ymm1, ymm1, ymmword ptr [rcx + 4*rdi + 32]
-	vpsubd	ymm2, ymm2, ymmword ptr [rcx + 4*rdi + 64]
-	vpsubd	ymm3, ymm3, ymmword ptr [rcx + 4*rdi + 96]
-	vmovdqu	ymmword ptr [r8 + 4*rdi], ymm0
-	vmovdqu	ymmword ptr [r8 + 4*rdi + 32], ymm1
-	vmovdqu	ymmword ptr [r8 + 4*rdi + 64], ymm2
-	vmovdqu	ymmword ptr [r8 + 4*rdi + 96], ymm3
-	add	rdi, 32
-	cmp	rsi, rdi
-	jne	.LBB0_363
-# %bb.364:
-	cmp	rsi, r10
-	je	.LBB0_537
-.LBB0_365:
-	mov	r9, rsi
-	not	r9
-	add	r9, r10
-	mov	rax, r10
-	and	rax, 3
-	je	.LBB0_367
-.LBB0_366:                              # =>This Inner Loop Header: Depth=1
-	mov	edi, dword ptr [rdx + 4*rsi]
-	sub	edi, dword ptr [rcx + 4*rsi]
-	mov	dword ptr [r8 + 4*rsi], edi
-	add	rsi, 1
-	add	rax, -1
-	jne	.LBB0_366
-.LBB0_367:
-	cmp	r9, 3
-	jb	.LBB0_537
-.LBB0_368:                              # =>This Inner Loop Header: Depth=1
-	mov	eax, dword ptr [rdx + 4*rsi]
-	sub	eax, dword ptr [rcx + 4*rsi]
-	mov	dword ptr [r8 + 4*rsi], eax
-	mov	eax, dword ptr [rdx + 4*rsi + 4]
-	sub	eax, dword ptr [rcx + 4*rsi + 4]
-	mov	dword ptr [r8 + 4*rsi + 4], eax
-	mov	eax, dword ptr [rdx + 4*rsi + 8]
-	sub	eax, dword ptr [rcx + 4*rsi + 8]
-	mov	dword ptr [r8 + 4*rsi + 8], eax
-	mov	eax, dword ptr [rdx + 4*rsi + 12]
-	sub	eax, dword ptr [rcx + 4*rsi + 12]
-	mov	dword ptr [r8 + 4*rsi + 12], eax
-	add	rsi, 4
-	cmp	r10, rsi
-	jne	.LBB0_368
-	jmp	.LBB0_537
-.LBB0_486:
-	lea	rsi, [r8 + 4*r10]
-	lea	rax, [rdx + 4*r10]
+.LBB0_405:                              # =>This Inner Loop Header: Depth=1
+	vmovups	ymm0, ymmword ptr [rdx + 4*rdi]
+	vmovups	ymm1, ymmword ptr [rdx + 4*rdi + 32]
+	vmovups	ymm2, ymmword ptr [rdx + 4*rdi + 64]
+	vmovups	ymm3, ymmword ptr [rdx + 4*rdi + 96]
+	vsubps	ymm0, ymm0, ymmword ptr [rcx + 4*rdi]
+	vsubps	ymm1, ymm1, ymmword ptr [rcx + 4*rdi + 32]
+	vsubps	ymm2, ymm2, ymmword ptr [rcx + 4*rdi + 64]
+	vsubps	ymm3, ymm3, ymmword ptr [rcx + 4*rdi + 96]
+	vmovups	ymmword ptr [r8 + 4*rdi], ymm0
+	vmovups	ymmword ptr [r8 + 4*rdi + 32], ymm1
+	vmovups	ymmword ptr [r8 + 4*rdi + 64], ymm2
+	vmovups	ymmword ptr [r8 + 4*rdi + 96], ymm3
+	add	rdi, 32
+	cmp	rsi, rdi
+	jne	.LBB0_405
+# %bb.406:
+	cmp	rsi, r10
+	jne	.LBB0_407
+	jmp	.LBB0_825
+.LBB0_523:
+	lea	rsi, [r8 + 8*r10]
+	lea	rax, [rdx + 8*r10]
 	cmp	rax, r8
 	seta	r9b
-	lea	rax, [rcx + 4*r10]
+	lea	rax, [rcx + 8*r10]
 	cmp	rsi, rdx
 	seta	r11b
 	cmp	rax, r8
@@ -3368,68 +5269,35 @@ arithmetic_avx2:                        # @arithmetic_avx2
 	seta	dil
 	xor	esi, esi
 	test	r9b, r11b
-	jne	.LBB0_491
-# %bb.487:
+	jne	.LBB0_528
+# %bb.524:
 	and	al, dil
-	jne	.LBB0_491
-# %bb.488:
+	jne	.LBB0_528
+# %bb.525:
 	mov	esi, r10d
-	and	esi, -32
+	and	esi, -16
 	xor	edi, edi
-.LBB0_489:                              # =>This Inner Loop Header: Depth=1
-	vmovdqu	ymm0, ymmword ptr [rdx + 4*rdi]
-	vmovdqu	ymm1, ymmword ptr [rdx + 4*rdi + 32]
-	vmovdqu	ymm2, ymmword ptr [rdx + 4*rdi + 64]
-	vmovdqu	ymm3, ymmword ptr [rdx + 4*rdi + 96]
-	vpsubd	ymm0, ymm0, ymmword ptr [rcx + 4*rdi]
-	vpsubd	ymm1, ymm1, ymmword ptr [rcx + 4*rdi + 32]
-	vpsubd	ymm2, ymm2, ymmword ptr [rcx + 4*rdi + 64]
-	vpsubd	ymm3, ymm3, ymmword ptr [rcx + 4*rdi + 96]
-	vmovdqu	ymmword ptr [r8 + 4*rdi], ymm0
-	vmovdqu	ymmword ptr [r8 + 4*rdi + 32], ymm1
-	vmovdqu	ymmword ptr [r8 + 4*rdi + 64], ymm2
-	vmovdqu	ymmword ptr [r8 + 4*rdi + 96], ymm3
-	add	rdi, 32
+.LBB0_526:                              # =>This Inner Loop Header: Depth=1
+	vmovdqu	ymm0, ymmword ptr [rdx + 8*rdi]
+	vmovdqu	ymm1, ymmword ptr [rdx + 8*rdi + 32]
+	vmovdqu	ymm2, ymmword ptr [rdx + 8*rdi + 64]
+	vmovdqu	ymm3, ymmword ptr [rdx + 8*rdi + 96]
+	vpsubq	ymm0, ymm0, ymmword ptr [rcx + 8*rdi]
+	vpsubq	ymm1, ymm1, ymmword ptr [rcx + 8*rdi + 32]
+	vpsubq	ymm2, ymm2, ymmword ptr [rcx + 8*rdi + 64]
+	vpsubq	ymm3, ymm3, ymmword ptr [rcx + 8*rdi + 96]
+	vmovdqu	ymmword ptr [r8 + 8*rdi], ymm0
+	vmovdqu	ymmword ptr [r8 + 8*rdi + 32], ymm1
+	vmovdqu	ymmword ptr [r8 + 8*rdi + 64], ymm2
+	vmovdqu	ymmword ptr [r8 + 8*rdi + 96], ymm3
+	add	rdi, 16
 	cmp	rsi, rdi
-	jne	.LBB0_489
-# %bb.490:
+	jne	.LBB0_526
+# %bb.527:
 	cmp	rsi, r10
-	je	.LBB0_537
-.LBB0_491:
-	mov	r9, rsi
-	not	r9
-	add	r9, r10
-	mov	rax, r10
-	and	rax, 3
-	je	.LBB0_493
-.LBB0_492:                              # =>This Inner Loop Header: Depth=1
-	mov	edi, dword ptr [rdx + 4*rsi]
-	sub	edi, dword ptr [rcx + 4*rsi]
-	mov	dword ptr [r8 + 4*rsi], edi
-	add	rsi, 1
-	add	rax, -1
-	jne	.LBB0_492
-.LBB0_493:
-	cmp	r9, 3
-	jb	.LBB0_537
-.LBB0_494:                              # =>This Inner Loop Header: Depth=1
-	mov	eax, dword ptr [rdx + 4*rsi]
-	sub	eax, dword ptr [rcx + 4*rsi]
-	mov	dword ptr [r8 + 4*rsi], eax
-	mov	eax, dword ptr [rdx + 4*rsi + 4]
-	sub	eax, dword ptr [rcx + 4*rsi + 4]
-	mov	dword ptr [r8 + 4*rsi + 4], eax
-	mov	eax, dword ptr [rdx + 4*rsi + 8]
-	sub	eax, dword ptr [rcx + 4*rsi + 8]
-	mov	dword ptr [r8 + 4*rsi + 8], eax
-	mov	eax, dword ptr [rdx + 4*rsi + 12]
-	sub	eax, dword ptr [rcx + 4*rsi + 12]
-	mov	dword ptr [r8 + 4*rsi + 12], eax
-	add	rsi, 4
-	cmp	r10, rsi
-	jne	.LBB0_494
-	jmp	.LBB0_537
-.LBB0_101:
+	jne	.LBB0_528
+	jmp	.LBB0_825
+.LBB0_535:
 	lea	rsi, [r8 + 4*r10]
 	lea	rax, [rdx + 4*r10]
 	cmp	rax, r8
@@ -3443,289 +5311,805 @@ arithmetic_avx2:                        # @arithmetic_avx2
 	seta	dil
 	xor	esi, esi
 	test	r9b, r11b
-	jne	.LBB0_106
-# %bb.102:
+	jne	.LBB0_540
+# %bb.536:
 	and	al, dil
-	jne	.LBB0_106
-# %bb.103:
+	jne	.LBB0_540
+# %bb.537:
 	mov	esi, r10d
 	and	esi, -32
 	xor	edi, edi
-.LBB0_104:                              # =>This Inner Loop Header: Depth=1
-	vmovdqu	ymm0, ymmword ptr [rcx + 4*rdi]
-	vmovdqu	ymm1, ymmword ptr [rcx + 4*rdi + 32]
-	vmovdqu	ymm2, ymmword ptr [rcx + 4*rdi + 64]
-	vmovdqu	ymm3, ymmword ptr [rcx + 4*rdi + 96]
-	vpaddd	ymm0, ymm0, ymmword ptr [rdx + 4*rdi]
-	vpaddd	ymm1, ymm1, ymmword ptr [rdx + 4*rdi + 32]
-	vpaddd	ymm2, ymm2, ymmword ptr [rdx + 4*rdi + 64]
-	vpaddd	ymm3, ymm3, ymmword ptr [rdx + 4*rdi + 96]
-	vmovdqu	ymmword ptr [r8 + 4*rdi], ymm0
-	vmovdqu	ymmword ptr [r8 + 4*rdi + 32], ymm1
-	vmovdqu	ymmword ptr [r8 + 4*rdi + 64], ymm2
-	vmovdqu	ymmword ptr [r8 + 4*rdi + 96], ymm3
+.LBB0_538:                              # =>This Inner Loop Header: Depth=1
+	vmovups	ymm0, ymmword ptr [rdx + 4*rdi]
+	vmovups	ymm1, ymmword ptr [rdx + 4*rdi + 32]
+	vmovups	ymm2, ymmword ptr [rdx + 4*rdi + 64]
+	vmovups	ymm3, ymmword ptr [rdx + 4*rdi + 96]
+	vsubps	ymm0, ymm0, ymmword ptr [rcx + 4*rdi]
+	vsubps	ymm1, ymm1, ymmword ptr [rcx + 4*rdi + 32]
+	vsubps	ymm2, ymm2, ymmword ptr [rcx + 4*rdi + 64]
+	vsubps	ymm3, ymm3, ymmword ptr [rcx + 4*rdi + 96]
+	vmovups	ymmword ptr [r8 + 4*rdi], ymm0
+	vmovups	ymmword ptr [r8 + 4*rdi + 32], ymm1
+	vmovups	ymmword ptr [r8 + 4*rdi + 64], ymm2
+	vmovups	ymmword ptr [r8 + 4*rdi + 96], ymm3
 	add	rdi, 32
 	cmp	rsi, rdi
-	jne	.LBB0_104
-# %bb.105:
+	jne	.LBB0_538
+# %bb.539:
 	cmp	rsi, r10
-	je	.LBB0_537
-.LBB0_106:
+	jne	.LBB0_540
+	jmp	.LBB0_825
+.LBB0_592:
+	and	rax, -4
+	neg	rax
+	xor	esi, esi
+	vmovdqa	ymm0, ymmword ptr [rip + .LCPI0_0] # ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+.LBB0_593:                              # =>This Inner Loop Header: Depth=1
+	vmovdqu	ymm1, ymmword ptr [rdx + rsi]
+	vmovdqu	ymm2, ymmword ptr [rcx + rsi]
+	vpunpckhbw	ymm3, ymm1, ymm1        # ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+	vpunpckhbw	ymm4, ymm2, ymm2        # ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+	vpmullw	ymm3, ymm4, ymm3
+	vpand	ymm3, ymm3, ymm0
+	vpunpcklbw	ymm1, ymm1, ymm1        # ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+	vpunpcklbw	ymm2, ymm2, ymm2        # ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+	vpmullw	ymm1, ymm2, ymm1
+	vpand	ymm1, ymm1, ymm0
+	vpackuswb	ymm1, ymm1, ymm3
+	vmovdqu	ymmword ptr [r8 + rsi], ymm1
+	vmovdqu	ymm1, ymmword ptr [rdx + rsi + 32]
+	vmovdqu	ymm2, ymmword ptr [rcx + rsi + 32]
+	vpunpckhbw	ymm3, ymm1, ymm1        # ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+	vpunpckhbw	ymm4, ymm2, ymm2        # ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+	vpmullw	ymm3, ymm4, ymm3
+	vpand	ymm3, ymm3, ymm0
+	vpunpcklbw	ymm1, ymm1, ymm1        # ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+	vpunpcklbw	ymm2, ymm2, ymm2        # ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+	vpmullw	ymm1, ymm2, ymm1
+	vpand	ymm1, ymm1, ymm0
+	vpackuswb	ymm1, ymm1, ymm3
+	vmovdqu	ymmword ptr [r8 + rsi + 32], ymm1
+	vmovdqu	ymm1, ymmword ptr [rdx + rsi + 64]
+	vmovdqu	ymm2, ymmword ptr [rcx + rsi + 64]
+	vpunpckhbw	ymm3, ymm1, ymm1        # ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+	vpunpckhbw	ymm4, ymm2, ymm2        # ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+	vpmullw	ymm3, ymm4, ymm3
+	vpand	ymm3, ymm3, ymm0
+	vpunpcklbw	ymm1, ymm1, ymm1        # ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+	vpunpcklbw	ymm2, ymm2, ymm2        # ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+	vpmullw	ymm1, ymm2, ymm1
+	vpand	ymm1, ymm1, ymm0
+	vpackuswb	ymm1, ymm1, ymm3
+	vmovdqu	ymmword ptr [r8 + rsi + 64], ymm1
+	vmovdqu	ymm1, ymmword ptr [rdx + rsi + 96]
+	vmovdqu	ymm2, ymmword ptr [rcx + rsi + 96]
+	vpunpckhbw	ymm3, ymm1, ymm1        # ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+	vpunpckhbw	ymm4, ymm2, ymm2        # ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+	vpmullw	ymm3, ymm4, ymm3
+	vpand	ymm3, ymm3, ymm0
+	vpunpcklbw	ymm1, ymm1, ymm1        # ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+	vpunpcklbw	ymm2, ymm2, ymm2        # ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+	vpmullw	ymm1, ymm2, ymm1
+	vpand	ymm1, ymm1, ymm0
+	vpackuswb	ymm1, ymm1, ymm3
+	vmovdqu	ymmword ptr [r8 + rsi + 96], ymm1
+	sub	rsi, -128
+	add	rax, 4
+	jne	.LBB0_593
+.LBB0_594:
+	test	r9, r9
+	je	.LBB0_597
+# %bb.595:
+	neg	r9
+	vmovdqa	ymm0, ymmword ptr [rip + .LCPI0_0] # ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+.LBB0_596:                              # =>This Inner Loop Header: Depth=1
+	vmovdqu	ymm1, ymmword ptr [rdx + rsi]
+	vmovdqu	ymm2, ymmword ptr [rcx + rsi]
+	vpunpckhbw	ymm3, ymm1, ymm1        # ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+	vpunpckhbw	ymm4, ymm2, ymm2        # ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+	vpmullw	ymm3, ymm4, ymm3
+	vpand	ymm3, ymm3, ymm0
+	vpunpcklbw	ymm1, ymm1, ymm1        # ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+	vpunpcklbw	ymm2, ymm2, ymm2        # ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+	vpmullw	ymm1, ymm2, ymm1
+	vpand	ymm1, ymm1, ymm0
+	vpackuswb	ymm1, ymm1, ymm3
+	vmovdqu	ymmword ptr [r8 + rsi], ymm1
+	add	rsi, 32
+	inc	r9
+	jne	.LBB0_596
+.LBB0_597:
+	cmp	rdi, r10
+	je	.LBB0_825
+.LBB0_598:
+	mov	r9, rdi
+	not	r9
+	add	r9, r10
+	mov	rsi, r10
+	and	rsi, 3
+	je	.LBB0_600
+.LBB0_599:                              # =>This Inner Loop Header: Depth=1
+	movzx	eax, byte ptr [rcx + rdi]
+	mul	byte ptr [rdx + rdi]
+	mov	byte ptr [r8 + rdi], al
+	add	rdi, 1
+	add	rsi, -1
+	jne	.LBB0_599
+.LBB0_600:
+	cmp	r9, 3
+	jb	.LBB0_825
+.LBB0_601:                              # =>This Inner Loop Header: Depth=1
+	movzx	eax, byte ptr [rcx + rdi]
+	mul	byte ptr [rdx + rdi]
+	mov	byte ptr [r8 + rdi], al
+	movzx	eax, byte ptr [rcx + rdi + 1]
+	mul	byte ptr [rdx + rdi + 1]
+	mov	byte ptr [r8 + rdi + 1], al
+	movzx	eax, byte ptr [rcx + rdi + 2]
+	mul	byte ptr [rdx + rdi + 2]
+	mov	byte ptr [r8 + rdi + 2], al
+	movzx	eax, byte ptr [rcx + rdi + 3]
+	mul	byte ptr [rdx + rdi + 3]
+	mov	byte ptr [r8 + rdi + 3], al
+	add	rdi, 4
+	cmp	r10, rdi
+	jne	.LBB0_601
+	jmp	.LBB0_825
+.LBB0_728:
+	and	rax, -4
+	neg	rax
+	xor	esi, esi
+	vmovdqa	ymm0, ymmword ptr [rip + .LCPI0_0] # ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+.LBB0_729:                              # =>This Inner Loop Header: Depth=1
+	vmovdqu	ymm1, ymmword ptr [rdx + rsi]
+	vmovdqu	ymm2, ymmword ptr [rcx + rsi]
+	vpunpckhbw	ymm3, ymm1, ymm1        # ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+	vpunpckhbw	ymm4, ymm2, ymm2        # ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+	vpmullw	ymm3, ymm4, ymm3
+	vpand	ymm3, ymm3, ymm0
+	vpunpcklbw	ymm1, ymm1, ymm1        # ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+	vpunpcklbw	ymm2, ymm2, ymm2        # ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+	vpmullw	ymm1, ymm2, ymm1
+	vpand	ymm1, ymm1, ymm0
+	vpackuswb	ymm1, ymm1, ymm3
+	vmovdqu	ymmword ptr [r8 + rsi], ymm1
+	vmovdqu	ymm1, ymmword ptr [rdx + rsi + 32]
+	vmovdqu	ymm2, ymmword ptr [rcx + rsi + 32]
+	vpunpckhbw	ymm3, ymm1, ymm1        # ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+	vpunpckhbw	ymm4, ymm2, ymm2        # ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+	vpmullw	ymm3, ymm4, ymm3
+	vpand	ymm3, ymm3, ymm0
+	vpunpcklbw	ymm1, ymm1, ymm1        # ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+	vpunpcklbw	ymm2, ymm2, ymm2        # ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+	vpmullw	ymm1, ymm2, ymm1
+	vpand	ymm1, ymm1, ymm0
+	vpackuswb	ymm1, ymm1, ymm3
+	vmovdqu	ymmword ptr [r8 + rsi + 32], ymm1
+	vmovdqu	ymm1, ymmword ptr [rdx + rsi + 64]
+	vmovdqu	ymm2, ymmword ptr [rcx + rsi + 64]
+	vpunpckhbw	ymm3, ymm1, ymm1        # ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+	vpunpckhbw	ymm4, ymm2, ymm2        # ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+	vpmullw	ymm3, ymm4, ymm3
+	vpand	ymm3, ymm3, ymm0
+	vpunpcklbw	ymm1, ymm1, ymm1        # ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+	vpunpcklbw	ymm2, ymm2, ymm2        # ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+	vpmullw	ymm1, ymm2, ymm1
+	vpand	ymm1, ymm1, ymm0
+	vpackuswb	ymm1, ymm1, ymm3
+	vmovdqu	ymmword ptr [r8 + rsi + 64], ymm1
+	vmovdqu	ymm1, ymmword ptr [rdx + rsi + 96]
+	vmovdqu	ymm2, ymmword ptr [rcx + rsi + 96]
+	vpunpckhbw	ymm3, ymm1, ymm1        # ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+	vpunpckhbw	ymm4, ymm2, ymm2        # ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+	vpmullw	ymm3, ymm4, ymm3
+	vpand	ymm3, ymm3, ymm0
+	vpunpcklbw	ymm1, ymm1, ymm1        # ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+	vpunpcklbw	ymm2, ymm2, ymm2        # ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+	vpmullw	ymm1, ymm2, ymm1
+	vpand	ymm1, ymm1, ymm0
+	vpackuswb	ymm1, ymm1, ymm3
+	vmovdqu	ymmword ptr [r8 + rsi + 96], ymm1
+	sub	rsi, -128
+	add	rax, 4
+	jne	.LBB0_729
+.LBB0_730:
+	test	r9, r9
+	je	.LBB0_733
+# %bb.731:
+	neg	r9
+	vmovdqa	ymm0, ymmword ptr [rip + .LCPI0_0] # ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+.LBB0_732:                              # =>This Inner Loop Header: Depth=1
+	vmovdqu	ymm1, ymmword ptr [rdx + rsi]
+	vmovdqu	ymm2, ymmword ptr [rcx + rsi]
+	vpunpckhbw	ymm3, ymm1, ymm1        # ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+	vpunpckhbw	ymm4, ymm2, ymm2        # ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+	vpmullw	ymm3, ymm4, ymm3
+	vpand	ymm3, ymm3, ymm0
+	vpunpcklbw	ymm1, ymm1, ymm1        # ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+	vpunpcklbw	ymm2, ymm2, ymm2        # ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+	vpmullw	ymm1, ymm2, ymm1
+	vpand	ymm1, ymm1, ymm0
+	vpackuswb	ymm1, ymm1, ymm3
+	vmovdqu	ymmword ptr [r8 + rsi], ymm1
+	add	rsi, 32
+	inc	r9
+	jne	.LBB0_732
+.LBB0_733:
+	cmp	rdi, r10
+	je	.LBB0_825
+.LBB0_734:
+	mov	r9, rdi
+	not	r9
+	add	r9, r10
+	mov	rsi, r10
+	and	rsi, 3
+	je	.LBB0_736
+.LBB0_735:                              # =>This Inner Loop Header: Depth=1
+	movzx	eax, byte ptr [rcx + rdi]
+	mul	byte ptr [rdx + rdi]
+	mov	byte ptr [r8 + rdi], al
+	add	rdi, 1
+	add	rsi, -1
+	jne	.LBB0_735
+.LBB0_736:
+	cmp	r9, 3
+	jb	.LBB0_825
+.LBB0_737:                              # =>This Inner Loop Header: Depth=1
+	movzx	eax, byte ptr [rcx + rdi]
+	mul	byte ptr [rdx + rdi]
+	mov	byte ptr [r8 + rdi], al
+	movzx	eax, byte ptr [rcx + rdi + 1]
+	mul	byte ptr [rdx + rdi + 1]
+	mov	byte ptr [r8 + rdi + 1], al
+	movzx	eax, byte ptr [rcx + rdi + 2]
+	mul	byte ptr [rdx + rdi + 2]
+	mov	byte ptr [r8 + rdi + 2], al
+	movzx	eax, byte ptr [rcx + rdi + 3]
+	mul	byte ptr [rdx + rdi + 3]
+	mov	byte ptr [r8 + rdi + 3], al
+	add	rdi, 4
+	cmp	r10, rdi
+	jne	.LBB0_737
+	jmp	.LBB0_825
+.LBB0_578:
+	and	rax, -4
+	neg	rax
+	xor	esi, esi
+	vmovdqa	ymm0, ymmword ptr [rip + .LCPI0_0] # ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+.LBB0_579:                              # =>This Inner Loop Header: Depth=1
+	vmovdqu	ymm1, ymmword ptr [rdx + rsi]
+	vmovdqu	ymm2, ymmword ptr [rcx + rsi]
+	vpunpckhbw	ymm3, ymm1, ymm1        # ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+	vpunpckhbw	ymm4, ymm2, ymm2        # ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+	vpmullw	ymm3, ymm4, ymm3
+	vpand	ymm3, ymm3, ymm0
+	vpunpcklbw	ymm1, ymm1, ymm1        # ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+	vpunpcklbw	ymm2, ymm2, ymm2        # ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+	vpmullw	ymm1, ymm2, ymm1
+	vpand	ymm1, ymm1, ymm0
+	vpackuswb	ymm1, ymm1, ymm3
+	vmovdqu	ymmword ptr [r8 + rsi], ymm1
+	vmovdqu	ymm1, ymmword ptr [rdx + rsi + 32]
+	vmovdqu	ymm2, ymmword ptr [rcx + rsi + 32]
+	vpunpckhbw	ymm3, ymm1, ymm1        # ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+	vpunpckhbw	ymm4, ymm2, ymm2        # ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+	vpmullw	ymm3, ymm4, ymm3
+	vpand	ymm3, ymm3, ymm0
+	vpunpcklbw	ymm1, ymm1, ymm1        # ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+	vpunpcklbw	ymm2, ymm2, ymm2        # ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+	vpmullw	ymm1, ymm2, ymm1
+	vpand	ymm1, ymm1, ymm0
+	vpackuswb	ymm1, ymm1, ymm3
+	vmovdqu	ymmword ptr [r8 + rsi + 32], ymm1
+	vmovdqu	ymm1, ymmword ptr [rdx + rsi + 64]
+	vmovdqu	ymm2, ymmword ptr [rcx + rsi + 64]
+	vpunpckhbw	ymm3, ymm1, ymm1        # ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+	vpunpckhbw	ymm4, ymm2, ymm2        # ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+	vpmullw	ymm3, ymm4, ymm3
+	vpand	ymm3, ymm3, ymm0
+	vpunpcklbw	ymm1, ymm1, ymm1        # ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+	vpunpcklbw	ymm2, ymm2, ymm2        # ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+	vpmullw	ymm1, ymm2, ymm1
+	vpand	ymm1, ymm1, ymm0
+	vpackuswb	ymm1, ymm1, ymm3
+	vmovdqu	ymmword ptr [r8 + rsi + 64], ymm1
+	vmovdqu	ymm1, ymmword ptr [rdx + rsi + 96]
+	vmovdqu	ymm2, ymmword ptr [rcx + rsi + 96]
+	vpunpckhbw	ymm3, ymm1, ymm1        # ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+	vpunpckhbw	ymm4, ymm2, ymm2        # ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+	vpmullw	ymm3, ymm4, ymm3
+	vpand	ymm3, ymm3, ymm0
+	vpunpcklbw	ymm1, ymm1, ymm1        # ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+	vpunpcklbw	ymm2, ymm2, ymm2        # ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+	vpmullw	ymm1, ymm2, ymm1
+	vpand	ymm1, ymm1, ymm0
+	vpackuswb	ymm1, ymm1, ymm3
+	vmovdqu	ymmword ptr [r8 + rsi + 96], ymm1
+	sub	rsi, -128
+	add	rax, 4
+	jne	.LBB0_579
+.LBB0_580:
+	test	r9, r9
+	je	.LBB0_583
+# %bb.581:
+	neg	r9
+	vmovdqa	ymm0, ymmword ptr [rip + .LCPI0_0] # ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+.LBB0_582:                              # =>This Inner Loop Header: Depth=1
+	vmovdqu	ymm1, ymmword ptr [rdx + rsi]
+	vmovdqu	ymm2, ymmword ptr [rcx + rsi]
+	vpunpckhbw	ymm3, ymm1, ymm1        # ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+	vpunpckhbw	ymm4, ymm2, ymm2        # ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+	vpmullw	ymm3, ymm4, ymm3
+	vpand	ymm3, ymm3, ymm0
+	vpunpcklbw	ymm1, ymm1, ymm1        # ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+	vpunpcklbw	ymm2, ymm2, ymm2        # ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+	vpmullw	ymm1, ymm2, ymm1
+	vpand	ymm1, ymm1, ymm0
+	vpackuswb	ymm1, ymm1, ymm3
+	vmovdqu	ymmword ptr [r8 + rsi], ymm1
+	add	rsi, 32
+	inc	r9
+	jne	.LBB0_582
+.LBB0_583:
+	cmp	rdi, r10
+	je	.LBB0_825
+.LBB0_584:
+	mov	r9, rdi
+	not	r9
+	add	r9, r10
+	mov	rsi, r10
+	and	rsi, 3
+	je	.LBB0_586
+.LBB0_585:                              # =>This Inner Loop Header: Depth=1
+	movzx	eax, byte ptr [rcx + rdi]
+	mul	byte ptr [rdx + rdi]
+	mov	byte ptr [r8 + rdi], al
+	add	rdi, 1
+	add	rsi, -1
+	jne	.LBB0_585
+.LBB0_586:
+	cmp	r9, 3
+	jb	.LBB0_825
+.LBB0_587:                              # =>This Inner Loop Header: Depth=1
+	movzx	eax, byte ptr [rcx + rdi]
+	mul	byte ptr [rdx + rdi]
+	mov	byte ptr [r8 + rdi], al
+	movzx	eax, byte ptr [rcx + rdi + 1]
+	mul	byte ptr [rdx + rdi + 1]
+	mov	byte ptr [r8 + rdi + 1], al
+	movzx	eax, byte ptr [rcx + rdi + 2]
+	mul	byte ptr [rdx + rdi + 2]
+	mov	byte ptr [r8 + rdi + 2], al
+	movzx	eax, byte ptr [rcx + rdi + 3]
+	mul	byte ptr [rdx + rdi + 3]
+	mov	byte ptr [r8 + rdi + 3], al
+	add	rdi, 4
+	cmp	r10, rdi
+	jne	.LBB0_587
+	jmp	.LBB0_825
+.LBB0_714:
+	and	rax, -4
+	neg	rax
+	xor	esi, esi
+	vmovdqa	ymm0, ymmword ptr [rip + .LCPI0_0] # ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+.LBB0_715:                              # =>This Inner Loop Header: Depth=1
+	vmovdqu	ymm1, ymmword ptr [rdx + rsi]
+	vmovdqu	ymm2, ymmword ptr [rcx + rsi]
+	vpunpckhbw	ymm3, ymm1, ymm1        # ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+	vpunpckhbw	ymm4, ymm2, ymm2        # ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+	vpmullw	ymm3, ymm4, ymm3
+	vpand	ymm3, ymm3, ymm0
+	vpunpcklbw	ymm1, ymm1, ymm1        # ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+	vpunpcklbw	ymm2, ymm2, ymm2        # ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+	vpmullw	ymm1, ymm2, ymm1
+	vpand	ymm1, ymm1, ymm0
+	vpackuswb	ymm1, ymm1, ymm3
+	vmovdqu	ymmword ptr [r8 + rsi], ymm1
+	vmovdqu	ymm1, ymmword ptr [rdx + rsi + 32]
+	vmovdqu	ymm2, ymmword ptr [rcx + rsi + 32]
+	vpunpckhbw	ymm3, ymm1, ymm1        # ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+	vpunpckhbw	ymm4, ymm2, ymm2        # ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+	vpmullw	ymm3, ymm4, ymm3
+	vpand	ymm3, ymm3, ymm0
+	vpunpcklbw	ymm1, ymm1, ymm1        # ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+	vpunpcklbw	ymm2, ymm2, ymm2        # ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+	vpmullw	ymm1, ymm2, ymm1
+	vpand	ymm1, ymm1, ymm0
+	vpackuswb	ymm1, ymm1, ymm3
+	vmovdqu	ymmword ptr [r8 + rsi + 32], ymm1
+	vmovdqu	ymm1, ymmword ptr [rdx + rsi + 64]
+	vmovdqu	ymm2, ymmword ptr [rcx + rsi + 64]
+	vpunpckhbw	ymm3, ymm1, ymm1        # ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+	vpunpckhbw	ymm4, ymm2, ymm2        # ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+	vpmullw	ymm3, ymm4, ymm3
+	vpand	ymm3, ymm3, ymm0
+	vpunpcklbw	ymm1, ymm1, ymm1        # ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+	vpunpcklbw	ymm2, ymm2, ymm2        # ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+	vpmullw	ymm1, ymm2, ymm1
+	vpand	ymm1, ymm1, ymm0
+	vpackuswb	ymm1, ymm1, ymm3
+	vmovdqu	ymmword ptr [r8 + rsi + 64], ymm1
+	vmovdqu	ymm1, ymmword ptr [rdx + rsi + 96]
+	vmovdqu	ymm2, ymmword ptr [rcx + rsi + 96]
+	vpunpckhbw	ymm3, ymm1, ymm1        # ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+	vpunpckhbw	ymm4, ymm2, ymm2        # ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+	vpmullw	ymm3, ymm4, ymm3
+	vpand	ymm3, ymm3, ymm0
+	vpunpcklbw	ymm1, ymm1, ymm1        # ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+	vpunpcklbw	ymm2, ymm2, ymm2        # ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+	vpmullw	ymm1, ymm2, ymm1
+	vpand	ymm1, ymm1, ymm0
+	vpackuswb	ymm1, ymm1, ymm3
+	vmovdqu	ymmword ptr [r8 + rsi + 96], ymm1
+	sub	rsi, -128
+	add	rax, 4
+	jne	.LBB0_715
+.LBB0_716:
+	test	r9, r9
+	je	.LBB0_719
+# %bb.717:
+	neg	r9
+	vmovdqa	ymm0, ymmword ptr [rip + .LCPI0_0] # ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+.LBB0_718:                              # =>This Inner Loop Header: Depth=1
+	vmovdqu	ymm1, ymmword ptr [rdx + rsi]
+	vmovdqu	ymm2, ymmword ptr [rcx + rsi]
+	vpunpckhbw	ymm3, ymm1, ymm1        # ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+	vpunpckhbw	ymm4, ymm2, ymm2        # ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+	vpmullw	ymm3, ymm4, ymm3
+	vpand	ymm3, ymm3, ymm0
+	vpunpcklbw	ymm1, ymm1, ymm1        # ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+	vpunpcklbw	ymm2, ymm2, ymm2        # ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+	vpmullw	ymm1, ymm2, ymm1
+	vpand	ymm1, ymm1, ymm0
+	vpackuswb	ymm1, ymm1, ymm3
+	vmovdqu	ymmword ptr [r8 + rsi], ymm1
+	add	rsi, 32
+	inc	r9
+	jne	.LBB0_718
+.LBB0_719:
+	cmp	rdi, r10
+	je	.LBB0_825
+.LBB0_720:
+	mov	r9, rdi
+	not	r9
+	add	r9, r10
+	mov	rsi, r10
+	and	rsi, 3
+	je	.LBB0_722
+.LBB0_721:                              # =>This Inner Loop Header: Depth=1
+	movzx	eax, byte ptr [rcx + rdi]
+	mul	byte ptr [rdx + rdi]
+	mov	byte ptr [r8 + rdi], al
+	add	rdi, 1
+	add	rsi, -1
+	jne	.LBB0_721
+.LBB0_722:
+	cmp	r9, 3
+	jb	.LBB0_825
+.LBB0_723:                              # =>This Inner Loop Header: Depth=1
+	movzx	eax, byte ptr [rcx + rdi]
+	mul	byte ptr [rdx + rdi]
+	mov	byte ptr [r8 + rdi], al
+	movzx	eax, byte ptr [rcx + rdi + 1]
+	mul	byte ptr [rdx + rdi + 1]
+	mov	byte ptr [r8 + rdi + 1], al
+	movzx	eax, byte ptr [rcx + rdi + 2]
+	mul	byte ptr [rdx + rdi + 2]
+	mov	byte ptr [r8 + rdi + 2], al
+	movzx	eax, byte ptr [rcx + rdi + 3]
+	mul	byte ptr [rdx + rdi + 3]
+	mov	byte ptr [r8 + rdi + 3], al
+	add	rdi, 4
+	cmp	r10, rdi
+	jne	.LBB0_723
+.LBB0_825:
+	mov	rsp, rbp
+	pop	rbp
+	vzeroupper
+	ret
+.Lfunc_end0:
+	.size	arithmetic_avx2, .Lfunc_end0-arithmetic_avx2
+                                        # -- End function
+	.section	.rodata.cst32,"aM",@progbits,32
+	.p2align	5                               # -- Begin function arithmetic_arr_scalar_avx2
+.LCPI1_0:
+	.short	255                             # 0xff
+	.short	255                             # 0xff
+	.short	255                             # 0xff
+	.short	255                             # 0xff
+	.short	255                             # 0xff
+	.short	255                             # 0xff
+	.short	255                             # 0xff
+	.short	255                             # 0xff
+	.short	255                             # 0xff
+	.short	255                             # 0xff
+	.short	255                             # 0xff
+	.short	255                             # 0xff
+	.short	255                             # 0xff
+	.short	255                             # 0xff
+	.short	255                             # 0xff
+	.short	255                             # 0xff
+	.text
+	.globl	arithmetic_arr_scalar_avx2
+	.p2align	4, 0x90
+	.type	arithmetic_arr_scalar_avx2,@function
+arithmetic_arr_scalar_avx2:             # @arithmetic_arr_scalar_avx2
+# %bb.0:
+	push	rbp
+	mov	rbp, rsp
+	and	rsp, -8
+	cmp	sil, 3
+	jg	.LBB1_12
+# %bb.1:
+	test	sil, sil
+	je	.LBB1_23
+# %bb.2:
+	cmp	sil, 1
+	je	.LBB1_31
+# %bb.3:
+	cmp	sil, 2
+	jne	.LBB1_1109
+# %bb.4:
+	cmp	edi, 6
+	jg	.LBB1_55
+# %bb.5:
+	cmp	edi, 3
+	jle	.LBB1_97
+# %bb.6:
+	cmp	edi, 4
+	je	.LBB1_157
+# %bb.7:
+	cmp	edi, 5
+	je	.LBB1_160
+# %bb.8:
+	cmp	edi, 6
+	jne	.LBB1_1109
+# %bb.9:
+	test	r9d, r9d
+	jle	.LBB1_1109
+# %bb.10:
+	mov	eax, dword ptr [rcx]
+	mov	r10d, r9d
+	cmp	r9d, 32
+	jb	.LBB1_11
+# %bb.265:
+	lea	rcx, [rdx + 4*r10]
+	cmp	rcx, r8
+	jbe	.LBB1_445
+# %bb.266:
+	lea	rcx, [r8 + 4*r10]
+	cmp	rcx, rdx
+	jbe	.LBB1_445
+.LBB1_11:
+	xor	esi, esi
+.LBB1_665:
 	mov	r9, rsi
 	not	r9
 	add	r9, r10
-	mov	rax, r10
-	and	rax, 3
-	je	.LBB0_108
-.LBB0_107:                              # =>This Inner Loop Header: Depth=1
-	mov	edi, dword ptr [rcx + 4*rsi]
-	add	edi, dword ptr [rdx + 4*rsi]
-	mov	dword ptr [r8 + 4*rsi], edi
+	mov	rdi, r10
+	and	rdi, 3
+	je	.LBB1_667
+.LBB1_666:                              # =>This Inner Loop Header: Depth=1
+	mov	ecx, dword ptr [rdx + 4*rsi]
+	imul	ecx, eax
+	mov	dword ptr [r8 + 4*rsi], ecx
 	add	rsi, 1
-	add	rax, -1
-	jne	.LBB0_107
-.LBB0_108:
+	add	rdi, -1
+	jne	.LBB1_666
+.LBB1_667:
 	cmp	r9, 3
-	jb	.LBB0_537
-.LBB0_109:                              # =>This Inner Loop Header: Depth=1
-	mov	eax, dword ptr [rcx + 4*rsi]
-	add	eax, dword ptr [rdx + 4*rsi]
-	mov	dword ptr [r8 + 4*rsi], eax
-	mov	eax, dword ptr [rcx + 4*rsi + 4]
-	add	eax, dword ptr [rdx + 4*rsi + 4]
-	mov	dword ptr [r8 + 4*rsi + 4], eax
-	mov	eax, dword ptr [rcx + 4*rsi + 8]
-	add	eax, dword ptr [rdx + 4*rsi + 8]
-	mov	dword ptr [r8 + 4*rsi + 8], eax
-	mov	eax, dword ptr [rcx + 4*rsi + 12]
-	add	eax, dword ptr [rdx + 4*rsi + 12]
-	mov	dword ptr [r8 + 4*rsi + 12], eax
+	jb	.LBB1_1109
+.LBB1_668:                              # =>This Inner Loop Header: Depth=1
+	mov	ecx, dword ptr [rdx + 4*rsi]
+	imul	ecx, eax
+	mov	dword ptr [r8 + 4*rsi], ecx
+	mov	ecx, dword ptr [rdx + 4*rsi + 4]
+	imul	ecx, eax
+	mov	dword ptr [r8 + 4*rsi + 4], ecx
+	mov	ecx, dword ptr [rdx + 4*rsi + 8]
+	imul	ecx, eax
+	mov	dword ptr [r8 + 4*rsi + 8], ecx
+	mov	ecx, dword ptr [rdx + 4*rsi + 12]
+	imul	ecx, eax
+	mov	dword ptr [r8 + 4*rsi + 12], ecx
 	add	rsi, 4
 	cmp	r10, rsi
-	jne	.LBB0_109
-	jmp	.LBB0_537
-.LBB0_234:
-	lea	rsi, [r8 + 4*r10]
-	lea	rax, [rdx + 4*r10]
-	cmp	rax, r8
-	seta	r9b
-	lea	rax, [rcx + 4*r10]
-	cmp	rsi, rdx
-	seta	r11b
-	cmp	rax, r8
-	seta	al
-	cmp	rsi, rcx
-	seta	dil
-	xor	esi, esi
-	test	r9b, r11b
-	jne	.LBB0_239
-# %bb.235:
-	and	al, dil
-	jne	.LBB0_239
-# %bb.236:
-	mov	esi, r10d
-	and	esi, -32
-	xor	edi, edi
-.LBB0_237:                              # =>This Inner Loop Header: Depth=1
-	vmovdqu	ymm0, ymmword ptr [rcx + 4*rdi]
-	vmovdqu	ymm1, ymmword ptr [rcx + 4*rdi + 32]
-	vmovdqu	ymm2, ymmword ptr [rcx + 4*rdi + 64]
-	vmovdqu	ymm3, ymmword ptr [rcx + 4*rdi + 96]
-	vpaddd	ymm0, ymm0, ymmword ptr [rdx + 4*rdi]
-	vpaddd	ymm1, ymm1, ymmword ptr [rdx + 4*rdi + 32]
-	vpaddd	ymm2, ymm2, ymmword ptr [rdx + 4*rdi + 64]
-	vpaddd	ymm3, ymm3, ymmword ptr [rdx + 4*rdi + 96]
-	vmovdqu	ymmword ptr [r8 + 4*rdi], ymm0
-	vmovdqu	ymmword ptr [r8 + 4*rdi + 32], ymm1
-	vmovdqu	ymmword ptr [r8 + 4*rdi + 64], ymm2
-	vmovdqu	ymmword ptr [r8 + 4*rdi + 96], ymm3
-	add	rdi, 32
-	cmp	rsi, rdi
-	jne	.LBB0_237
-# %bb.238:
-	cmp	rsi, r10
-	je	.LBB0_537
-.LBB0_239:
+	jne	.LBB1_668
+	jmp	.LBB1_1109
+.LBB1_12:
+	cmp	sil, 4
+	je	.LBB1_39
+# %bb.13:
+	cmp	sil, 5
+	je	.LBB1_47
+# %bb.14:
+	cmp	sil, 6
+	jne	.LBB1_1109
+# %bb.15:
+	cmp	edi, 6
+	jg	.LBB1_62
+# %bb.16:
+	cmp	edi, 3
+	jle	.LBB1_102
+# %bb.17:
+	cmp	edi, 4
+	je	.LBB1_163
+# %bb.18:
+	cmp	edi, 5
+	je	.LBB1_166
+# %bb.19:
+	cmp	edi, 6
+	jne	.LBB1_1109
+# %bb.20:
+	test	r9d, r9d
+	jle	.LBB1_1109
+# %bb.21:
+	mov	eax, dword ptr [rcx]
+	mov	r10d, r9d
+	cmp	r9d, 32
+	jb	.LBB1_22
+# %bb.268:
+	lea	rcx, [rdx + 4*r10]
+	cmp	rcx, r8
+	jbe	.LBB1_448
+# %bb.269:
+	lea	rcx, [r8 + 4*r10]
+	cmp	rcx, rdx
+	jbe	.LBB1_448
+.LBB1_22:
+	xor	esi, esi
+.LBB1_673:
 	mov	r9, rsi
 	not	r9
 	add	r9, r10
-	mov	rax, r10
-	and	rax, 3
-	je	.LBB0_241
-.LBB0_240:                              # =>This Inner Loop Header: Depth=1
-	mov	edi, dword ptr [rcx + 4*rsi]
-	add	edi, dword ptr [rdx + 4*rsi]
-	mov	dword ptr [r8 + 4*rsi], edi
+	mov	rdi, r10
+	and	rdi, 3
+	je	.LBB1_675
+.LBB1_674:                              # =>This Inner Loop Header: Depth=1
+	mov	ecx, dword ptr [rdx + 4*rsi]
+	imul	ecx, eax
+	mov	dword ptr [r8 + 4*rsi], ecx
 	add	rsi, 1
-	add	rax, -1
-	jne	.LBB0_240
-.LBB0_241:
+	add	rdi, -1
+	jne	.LBB1_674
+.LBB1_675:
 	cmp	r9, 3
-	jb	.LBB0_537
-.LBB0_242:                              # =>This Inner Loop Header: Depth=1
-	mov	eax, dword ptr [rcx + 4*rsi]
-	add	eax, dword ptr [rdx + 4*rsi]
-	mov	dword ptr [r8 + 4*rsi], eax
-	mov	eax, dword ptr [rcx + 4*rsi + 4]
-	add	eax, dword ptr [rdx + 4*rsi + 4]
-	mov	dword ptr [r8 + 4*rsi + 4], eax
-	mov	eax, dword ptr [rcx + 4*rsi + 8]
-	add	eax, dword ptr [rdx + 4*rsi + 8]
-	mov	dword ptr [r8 + 4*rsi + 8], eax
-	mov	eax, dword ptr [rcx + 4*rsi + 12]
-	add	eax, dword ptr [rdx + 4*rsi + 12]
-	mov	dword ptr [r8 + 4*rsi + 12], eax
+	jb	.LBB1_1109
+.LBB1_676:                              # =>This Inner Loop Header: Depth=1
+	mov	ecx, dword ptr [rdx + 4*rsi]
+	imul	ecx, eax
+	mov	dword ptr [r8 + 4*rsi], ecx
+	mov	ecx, dword ptr [rdx + 4*rsi + 4]
+	imul	ecx, eax
+	mov	dword ptr [r8 + 4*rsi + 4], ecx
+	mov	ecx, dword ptr [rdx + 4*rsi + 8]
+	imul	ecx, eax
+	mov	dword ptr [r8 + 4*rsi + 8], ecx
+	mov	ecx, dword ptr [rdx + 4*rsi + 12]
+	imul	ecx, eax
+	mov	dword ptr [r8 + 4*rsi + 12], ecx
 	add	rsi, 4
 	cmp	r10, rsi
-	jne	.LBB0_242
-.LBB0_537:
-	mov	rsp, rbp
-	pop	rbp
-	vzeroupper
-	ret
-.Lfunc_end0:
-	.size	arithmetic_avx2, .Lfunc_end0-arithmetic_avx2
-                                        # -- End function
-	.globl	arithmetic_arr_scalar_avx2      # -- Begin function arithmetic_arr_scalar_avx2
-	.p2align	4, 0x90
-	.type	arithmetic_arr_scalar_avx2,@function
-arithmetic_arr_scalar_avx2:             # @arithmetic_arr_scalar_avx2
-# %bb.0:
-	push	rbp
-	mov	rbp, rsp
-	and	rsp, -8
-	cmp	sil, 1
-	jg	.LBB1_11
-# %bb.1:
-	test	sil, sil
-	je	.LBB1_21
-# %bb.2:
-	cmp	sil, 1
-	jne	.LBB1_737
-# %bb.3:
+	jne	.LBB1_676
+	jmp	.LBB1_1109
+.LBB1_23:
 	cmp	edi, 6
-	jg	.LBB1_37
-# %bb.4:
+	jg	.LBB1_69
+# %bb.24:
 	cmp	edi, 3
-	jle	.LBB1_65
-# %bb.5:
+	jle	.LBB1_107
+# %bb.25:
 	cmp	edi, 4
-	je	.LBB1_105
-# %bb.6:
+	je	.LBB1_169
+# %bb.26:
 	cmp	edi, 5
-	je	.LBB1_108
-# %bb.7:
+	je	.LBB1_172
+# %bb.27:
 	cmp	edi, 6
-	jne	.LBB1_737
-# %bb.8:
+	jne	.LBB1_1109
+# %bb.28:
 	test	r9d, r9d
-	jle	.LBB1_737
-# %bb.9:
+	jle	.LBB1_1109
+# %bb.29:
 	mov	eax, dword ptr [rcx]
 	mov	r10d, r9d
 	cmp	r9d, 32
-	jb	.LBB1_10
-# %bb.177:
+	jb	.LBB1_30
+# %bb.271:
 	lea	rcx, [rdx + 4*r10]
 	cmp	rcx, r8
-	jbe	.LBB1_297
-# %bb.178:
+	jbe	.LBB1_451
+# %bb.272:
 	lea	rcx, [r8 + 4*r10]
 	cmp	rcx, rdx
-	jbe	.LBB1_297
-.LBB1_10:
+	jbe	.LBB1_451
+.LBB1_30:
 	xor	esi, esi
-.LBB1_421:
+.LBB1_681:
 	mov	r9, rsi
 	not	r9
 	add	r9, r10
 	mov	rdi, r10
 	and	rdi, 3
-	je	.LBB1_423
-.LBB1_422:                              # =>This Inner Loop Header: Depth=1
+	je	.LBB1_683
+.LBB1_682:                              # =>This Inner Loop Header: Depth=1
 	mov	ecx, dword ptr [rdx + 4*rsi]
-	sub	ecx, eax
+	add	ecx, eax
 	mov	dword ptr [r8 + 4*rsi], ecx
 	add	rsi, 1
 	add	rdi, -1
-	jne	.LBB1_422
-.LBB1_423:
+	jne	.LBB1_682
+.LBB1_683:
 	cmp	r9, 3
-	jb	.LBB1_737
-.LBB1_424:                              # =>This Inner Loop Header: Depth=1
+	jb	.LBB1_1109
+.LBB1_684:                              # =>This Inner Loop Header: Depth=1
 	mov	ecx, dword ptr [rdx + 4*rsi]
-	sub	ecx, eax
+	add	ecx, eax
 	mov	dword ptr [r8 + 4*rsi], ecx
 	mov	ecx, dword ptr [rdx + 4*rsi + 4]
-	sub	ecx, eax
+	add	ecx, eax
 	mov	dword ptr [r8 + 4*rsi + 4], ecx
 	mov	ecx, dword ptr [rdx + 4*rsi + 8]
-	sub	ecx, eax
+	add	ecx, eax
 	mov	dword ptr [r8 + 4*rsi + 8], ecx
 	mov	ecx, dword ptr [rdx + 4*rsi + 12]
-	sub	ecx, eax
+	add	ecx, eax
 	mov	dword ptr [r8 + 4*rsi + 12], ecx
 	add	rsi, 4
 	cmp	r10, rsi
-	jne	.LBB1_424
-	jmp	.LBB1_737
-.LBB1_11:
-	cmp	sil, 2
-	je	.LBB1_29
-# %bb.12:
-	cmp	sil, 3
-	jne	.LBB1_737
-# %bb.13:
+	jne	.LBB1_684
+	jmp	.LBB1_1109
+.LBB1_31:
 	cmp	edi, 6
-	jg	.LBB1_44
-# %bb.14:
+	jg	.LBB1_76
+# %bb.32:
 	cmp	edi, 3
-	jle	.LBB1_70
-# %bb.15:
+	jle	.LBB1_112
+# %bb.33:
 	cmp	edi, 4
-	je	.LBB1_111
-# %bb.16:
+	je	.LBB1_175
+# %bb.34:
 	cmp	edi, 5
-	je	.LBB1_114
-# %bb.17:
+	je	.LBB1_178
+# %bb.35:
 	cmp	edi, 6
-	jne	.LBB1_737
-# %bb.18:
+	jne	.LBB1_1109
+# %bb.36:
 	test	r9d, r9d
-	jle	.LBB1_737
-# %bb.19:
+	jle	.LBB1_1109
+# %bb.37:
 	mov	eax, dword ptr [rcx]
 	mov	r10d, r9d
 	cmp	r9d, 32
-	jb	.LBB1_20
-# %bb.180:
+	jb	.LBB1_38
+# %bb.274:
 	lea	rcx, [rdx + 4*r10]
 	cmp	rcx, r8
-	jbe	.LBB1_300
-# %bb.181:
+	jbe	.LBB1_454
+# %bb.275:
 	lea	rcx, [r8 + 4*r10]
 	cmp	rcx, rdx
-	jbe	.LBB1_300
-.LBB1_20:
+	jbe	.LBB1_454
+.LBB1_38:
 	xor	esi, esi
-.LBB1_429:
+.LBB1_689:
 	mov	r9, rsi
 	not	r9
 	add	r9, r10
 	mov	rdi, r10
 	and	rdi, 3
-	je	.LBB1_431
-.LBB1_430:                              # =>This Inner Loop Header: Depth=1
+	je	.LBB1_691
+.LBB1_690:                              # =>This Inner Loop Header: Depth=1
 	mov	ecx, dword ptr [rdx + 4*rsi]
 	sub	ecx, eax
 	mov	dword ptr [r8 + 4*rsi], ecx
 	add	rsi, 1
 	add	rdi, -1
-	jne	.LBB1_430
-.LBB1_431:
+	jne	.LBB1_690
+.LBB1_691:
 	cmp	r9, 3
-	jb	.LBB1_737
-.LBB1_432:                              # =>This Inner Loop Header: Depth=1
+	jb	.LBB1_1109
+.LBB1_692:                              # =>This Inner Loop Header: Depth=1
 	mov	ecx, dword ptr [rdx + 4*rsi]
 	sub	ecx, eax
 	mov	dword ptr [r8 + 4*rsi], ecx
@@ -3740,59 +6124,59 @@ arithmetic_arr_scalar_avx2:             # @arithmetic_arr_scalar_avx2
 	mov	dword ptr [r8 + 4*rsi + 12], ecx
 	add	rsi, 4
 	cmp	r10, rsi
-	jne	.LBB1_432
-	jmp	.LBB1_737
-.LBB1_21:
+	jne	.LBB1_692
+	jmp	.LBB1_1109
+.LBB1_39:
 	cmp	edi, 6
-	jg	.LBB1_51
-# %bb.22:
+	jg	.LBB1_83
+# %bb.40:
 	cmp	edi, 3
-	jle	.LBB1_75
-# %bb.23:
+	jle	.LBB1_117
+# %bb.41:
 	cmp	edi, 4
-	je	.LBB1_117
-# %bb.24:
+	je	.LBB1_181
+# %bb.42:
 	cmp	edi, 5
-	je	.LBB1_120
-# %bb.25:
+	je	.LBB1_184
+# %bb.43:
 	cmp	edi, 6
-	jne	.LBB1_737
-# %bb.26:
+	jne	.LBB1_1109
+# %bb.44:
 	test	r9d, r9d
-	jle	.LBB1_737
-# %bb.27:
+	jle	.LBB1_1109
+# %bb.45:
 	mov	eax, dword ptr [rcx]
 	mov	r10d, r9d
 	cmp	r9d, 32
-	jb	.LBB1_28
-# %bb.183:
+	jb	.LBB1_46
+# %bb.277:
 	lea	rcx, [rdx + 4*r10]
 	cmp	rcx, r8
-	jbe	.LBB1_303
-# %bb.184:
+	jbe	.LBB1_457
+# %bb.278:
 	lea	rcx, [r8 + 4*r10]
 	cmp	rcx, rdx
-	jbe	.LBB1_303
-.LBB1_28:
+	jbe	.LBB1_457
+.LBB1_46:
 	xor	esi, esi
-.LBB1_437:
+.LBB1_697:
 	mov	r9, rsi
 	not	r9
 	add	r9, r10
 	mov	rdi, r10
 	and	rdi, 3
-	je	.LBB1_439
-.LBB1_438:                              # =>This Inner Loop Header: Depth=1
+	je	.LBB1_699
+.LBB1_698:                              # =>This Inner Loop Header: Depth=1
 	mov	ecx, dword ptr [rdx + 4*rsi]
 	add	ecx, eax
 	mov	dword ptr [r8 + 4*rsi], ecx
 	add	rsi, 1
 	add	rdi, -1
-	jne	.LBB1_438
-.LBB1_439:
+	jne	.LBB1_698
+.LBB1_699:
 	cmp	r9, 3
-	jb	.LBB1_737
-.LBB1_440:                              # =>This Inner Loop Header: Depth=1
+	jb	.LBB1_1109
+.LBB1_700:                              # =>This Inner Loop Header: Depth=1
 	mov	ecx, dword ptr [rdx + 4*rsi]
 	add	ecx, eax
 	mov	dword ptr [r8 + 4*rsi], ecx
@@ -3807,187 +6191,300 @@ arithmetic_arr_scalar_avx2:             # @arithmetic_arr_scalar_avx2
 	mov	dword ptr [r8 + 4*rsi + 12], ecx
 	add	rsi, 4
 	cmp	r10, rsi
-	jne	.LBB1_440
-	jmp	.LBB1_737
-.LBB1_29:
+	jne	.LBB1_700
+	jmp	.LBB1_1109
+.LBB1_47:
 	cmp	edi, 6
-	jg	.LBB1_58
-# %bb.30:
+	jg	.LBB1_90
+# %bb.48:
 	cmp	edi, 3
-	jle	.LBB1_80
-# %bb.31:
+	jle	.LBB1_122
+# %bb.49:
 	cmp	edi, 4
-	je	.LBB1_123
-# %bb.32:
+	je	.LBB1_187
+# %bb.50:
 	cmp	edi, 5
-	je	.LBB1_126
-# %bb.33:
+	je	.LBB1_190
+# %bb.51:
 	cmp	edi, 6
-	jne	.LBB1_737
-# %bb.34:
+	jne	.LBB1_1109
+# %bb.52:
 	test	r9d, r9d
-	jle	.LBB1_737
-# %bb.35:
+	jle	.LBB1_1109
+# %bb.53:
 	mov	eax, dword ptr [rcx]
 	mov	r10d, r9d
 	cmp	r9d, 32
-	jb	.LBB1_36
-# %bb.186:
+	jb	.LBB1_54
+# %bb.280:
 	lea	rcx, [rdx + 4*r10]
 	cmp	rcx, r8
-	jbe	.LBB1_306
-# %bb.187:
+	jbe	.LBB1_460
+# %bb.281:
 	lea	rcx, [r8 + 4*r10]
 	cmp	rcx, rdx
-	jbe	.LBB1_306
-.LBB1_36:
+	jbe	.LBB1_460
+.LBB1_54:
 	xor	esi, esi
-.LBB1_445:
+.LBB1_705:
 	mov	r9, rsi
 	not	r9
 	add	r9, r10
 	mov	rdi, r10
 	and	rdi, 3
-	je	.LBB1_447
-.LBB1_446:                              # =>This Inner Loop Header: Depth=1
+	je	.LBB1_707
+.LBB1_706:                              # =>This Inner Loop Header: Depth=1
 	mov	ecx, dword ptr [rdx + 4*rsi]
-	add	ecx, eax
+	sub	ecx, eax
 	mov	dword ptr [r8 + 4*rsi], ecx
 	add	rsi, 1
 	add	rdi, -1
-	jne	.LBB1_446
-.LBB1_447:
+	jne	.LBB1_706
+.LBB1_707:
 	cmp	r9, 3
-	jb	.LBB1_737
-.LBB1_448:                              # =>This Inner Loop Header: Depth=1
+	jb	.LBB1_1109
+.LBB1_708:                              # =>This Inner Loop Header: Depth=1
 	mov	ecx, dword ptr [rdx + 4*rsi]
-	add	ecx, eax
+	sub	ecx, eax
 	mov	dword ptr [r8 + 4*rsi], ecx
 	mov	ecx, dword ptr [rdx + 4*rsi + 4]
-	add	ecx, eax
+	sub	ecx, eax
 	mov	dword ptr [r8 + 4*rsi + 4], ecx
 	mov	ecx, dword ptr [rdx + 4*rsi + 8]
-	add	ecx, eax
+	sub	ecx, eax
 	mov	dword ptr [r8 + 4*rsi + 8], ecx
 	mov	ecx, dword ptr [rdx + 4*rsi + 12]
-	add	ecx, eax
+	sub	ecx, eax
 	mov	dword ptr [r8 + 4*rsi + 12], ecx
 	add	rsi, 4
 	cmp	r10, rsi
-	jne	.LBB1_448
-	jmp	.LBB1_737
-.LBB1_37:
+	jne	.LBB1_708
+	jmp	.LBB1_1109
+.LBB1_55:
 	cmp	edi, 8
-	jle	.LBB1_85
-# %bb.38:
+	jle	.LBB1_127
+# %bb.56:
 	cmp	edi, 9
-	je	.LBB1_129
-# %bb.39:
+	je	.LBB1_193
+# %bb.57:
 	cmp	edi, 11
-	je	.LBB1_132
-# %bb.40:
+	je	.LBB1_196
+# %bb.58:
 	cmp	edi, 12
-	jne	.LBB1_737
-# %bb.41:
+	jne	.LBB1_1109
+# %bb.59:
 	test	r9d, r9d
-	jle	.LBB1_737
-# %bb.42:
+	jle	.LBB1_1109
+# %bb.60:
 	vmovsd	xmm0, qword ptr [rcx]           # xmm0 = mem[0],zero
 	mov	eax, r9d
 	cmp	r9d, 16
-	jb	.LBB1_43
-# %bb.189:
+	jb	.LBB1_61
+# %bb.283:
 	lea	rcx, [rdx + 8*rax]
 	cmp	rcx, r8
-	jbe	.LBB1_309
-# %bb.190:
+	jbe	.LBB1_463
+# %bb.284:
 	lea	rcx, [r8 + 8*rax]
 	cmp	rcx, rdx
-	jbe	.LBB1_309
-.LBB1_43:
+	jbe	.LBB1_463
+.LBB1_61:
 	xor	ecx, ecx
-.LBB1_453:
+.LBB1_713:
 	mov	rsi, rcx
 	not	rsi
 	add	rsi, rax
 	mov	rdi, rax
 	and	rdi, 3
-	je	.LBB1_455
-.LBB1_454:                              # =>This Inner Loop Header: Depth=1
-	vmovsd	xmm1, qword ptr [rdx + 8*rcx]   # xmm1 = mem[0],zero
-	vsubsd	xmm1, xmm1, xmm0
+	je	.LBB1_715
+.LBB1_714:                              # =>This Inner Loop Header: Depth=1
+	vmulsd	xmm1, xmm0, qword ptr [rdx + 8*rcx]
 	vmovsd	qword ptr [r8 + 8*rcx], xmm1
 	add	rcx, 1
 	add	rdi, -1
-	jne	.LBB1_454
-.LBB1_455:
+	jne	.LBB1_714
+.LBB1_715:
 	cmp	rsi, 3
-	jb	.LBB1_737
-.LBB1_456:                              # =>This Inner Loop Header: Depth=1
-	vmovsd	xmm1, qword ptr [rdx + 8*rcx]   # xmm1 = mem[0],zero
-	vsubsd	xmm1, xmm1, xmm0
+	jb	.LBB1_1109
+.LBB1_716:                              # =>This Inner Loop Header: Depth=1
+	vmulsd	xmm1, xmm0, qword ptr [rdx + 8*rcx]
 	vmovsd	qword ptr [r8 + 8*rcx], xmm1
-	vmovsd	xmm1, qword ptr [rdx + 8*rcx + 8] # xmm1 = mem[0],zero
-	vsubsd	xmm1, xmm1, xmm0
+	vmulsd	xmm1, xmm0, qword ptr [rdx + 8*rcx + 8]
 	vmovsd	qword ptr [r8 + 8*rcx + 8], xmm1
-	vmovsd	xmm1, qword ptr [rdx + 8*rcx + 16] # xmm1 = mem[0],zero
-	vsubsd	xmm1, xmm1, xmm0
+	vmulsd	xmm1, xmm0, qword ptr [rdx + 8*rcx + 16]
 	vmovsd	qword ptr [r8 + 8*rcx + 16], xmm1
-	vmovsd	xmm1, qword ptr [rdx + 8*rcx + 24] # xmm1 = mem[0],zero
-	vsubsd	xmm1, xmm1, xmm0
+	vmulsd	xmm1, xmm0, qword ptr [rdx + 8*rcx + 24]
 	vmovsd	qword ptr [r8 + 8*rcx + 24], xmm1
 	add	rcx, 4
 	cmp	rax, rcx
-	jne	.LBB1_456
-	jmp	.LBB1_737
-.LBB1_44:
+	jne	.LBB1_716
+	jmp	.LBB1_1109
+.LBB1_62:
+	cmp	edi, 8
+	jle	.LBB1_132
+# %bb.63:
+	cmp	edi, 9
+	je	.LBB1_199
+# %bb.64:
+	cmp	edi, 11
+	je	.LBB1_202
+# %bb.65:
+	cmp	edi, 12
+	jne	.LBB1_1109
+# %bb.66:
+	test	r9d, r9d
+	jle	.LBB1_1109
+# %bb.67:
+	vmovsd	xmm0, qword ptr [rcx]           # xmm0 = mem[0],zero
+	mov	eax, r9d
+	cmp	r9d, 16
+	jb	.LBB1_68
+# %bb.286:
+	lea	rcx, [rdx + 8*rax]
+	cmp	rcx, r8
+	jbe	.LBB1_466
+# %bb.287:
+	lea	rcx, [r8 + 8*rax]
+	cmp	rcx, rdx
+	jbe	.LBB1_466
+.LBB1_68:
+	xor	ecx, ecx
+.LBB1_721:
+	mov	rsi, rcx
+	not	rsi
+	add	rsi, rax
+	mov	rdi, rax
+	and	rdi, 3
+	je	.LBB1_723
+.LBB1_722:                              # =>This Inner Loop Header: Depth=1
+	vmulsd	xmm1, xmm0, qword ptr [rdx + 8*rcx]
+	vmovsd	qword ptr [r8 + 8*rcx], xmm1
+	add	rcx, 1
+	add	rdi, -1
+	jne	.LBB1_722
+.LBB1_723:
+	cmp	rsi, 3
+	jb	.LBB1_1109
+.LBB1_724:                              # =>This Inner Loop Header: Depth=1
+	vmulsd	xmm1, xmm0, qword ptr [rdx + 8*rcx]
+	vmovsd	qword ptr [r8 + 8*rcx], xmm1
+	vmulsd	xmm1, xmm0, qword ptr [rdx + 8*rcx + 8]
+	vmovsd	qword ptr [r8 + 8*rcx + 8], xmm1
+	vmulsd	xmm1, xmm0, qword ptr [rdx + 8*rcx + 16]
+	vmovsd	qword ptr [r8 + 8*rcx + 16], xmm1
+	vmulsd	xmm1, xmm0, qword ptr [rdx + 8*rcx + 24]
+	vmovsd	qword ptr [r8 + 8*rcx + 24], xmm1
+	add	rcx, 4
+	cmp	rax, rcx
+	jne	.LBB1_724
+	jmp	.LBB1_1109
+.LBB1_69:
+	cmp	edi, 8
+	jle	.LBB1_137
+# %bb.70:
+	cmp	edi, 9
+	je	.LBB1_205
+# %bb.71:
+	cmp	edi, 11
+	je	.LBB1_208
+# %bb.72:
+	cmp	edi, 12
+	jne	.LBB1_1109
+# %bb.73:
+	test	r9d, r9d
+	jle	.LBB1_1109
+# %bb.74:
+	vmovsd	xmm0, qword ptr [rcx]           # xmm0 = mem[0],zero
+	mov	eax, r9d
+	cmp	r9d, 16
+	jb	.LBB1_75
+# %bb.289:
+	lea	rcx, [rdx + 8*rax]
+	cmp	rcx, r8
+	jbe	.LBB1_469
... 91290 lines suppressed ...
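
For orientation when reading the generated assembly above: the arithmetic_arr_scalar_avx2 blocks are compiler output for plain element-wise "array op scalar" loops. The 4x-unrolled bodies (add/sub/imul on dwords under the edi == 6 branch, vmulsd on qwords under the edi == 12 branch, e.g. .LBB1_714/.LBB1_716) are the scalar tails clang emits alongside the AVX2-vectorized path, and the lea/cmp/jbe sequences before them are runtime overlap checks between the source and destination buffers. The following is a minimal C++ sketch of the kind of loop that produces code of this shape; it is illustrative only, not the actual templated kernel source these .s files are generated from, and the function and parameter names (multiply_array_scalar, in, scalar, out, len) are placeholders.

    // Illustrative sketch only: an element-wise array-by-scalar multiply.
    // Compiled with clang at -O3 with AVX2 enabled, a loop of this shape
    // yields a vectorized body plus a 4x-unrolled scalar tail, matching the
    // imul (int32) and vmulsd (float64) sequences visible in the diff above.
    #include <cstddef>
    #include <cstdint>

    template <typename T>
    void multiply_array_scalar(const T* in, T scalar, T* out, std::size_t len) {
        for (std::size_t i = 0; i < len; ++i) {
            out[i] = in[i] * scalar;
        }
    }

    // Instantiations corresponding to the two dispatch branches shown above:
    // the 32-bit integer path and the float64 path.
    template void multiply_array_scalar<std::int32_t>(
        const std::int32_t*, std::int32_t, std::int32_t*, std::size_t);
    template void multiply_array_scalar<double>(
        const double*, double, double*, std::size_t);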