You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ze...@apache.org on 2022/11/10 15:57:08 UTC
[arrow] branch master updated: ARROW-18109: [Go] Initial Unary Arithmetic (#14605)
This is an automated email from the ASF dual-hosted git repository.
zeroshade pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 6c988db3e4 ARROW-18109: [Go] Initial Unary Arithmetic (#14605)
6c988db3e4 is described below
commit 6c988db3e443de16ca17941898bf320edd63e74f
Author: Matt Topol <zo...@gmail.com>
AuthorDate: Thu Nov 10 10:57:01 2022 -0500
ARROW-18109: [Go] Initial Unary Arithmetic (#14605)
Authored-by: Matt Topol <zo...@gmail.com>
Signed-off-by: Matt Topol <zo...@gmail.com>
---
go/arrow/compute/arithmetic.go | 163 +-
go/arrow/compute/arithmetic_test.go | 528 +
go/arrow/compute/cast_test.go | 8 +-
go/arrow/compute/internal/kernels/Makefile | 2 +-
.../internal/kernels/_lib/base_arithmetic.cc | 261 +-
.../kernels/_lib/base_arithmetic_avx2_amd64.s | 14952 ++++++++++++++-
.../kernels/_lib/base_arithmetic_sse4_amd64.s | 18382 +++++++++++++++++-
.../compute/internal/kernels/base_arithmetic.go | 347 +-
.../internal/kernels/base_arithmetic_amd64.go | 87 +-
.../internal/kernels/base_arithmetic_avx2_amd64.go | 18 +-
.../internal/kernels/base_arithmetic_avx2_amd64.s | 15021 ++++++++++++++-
.../internal/kernels/base_arithmetic_sse4_amd64.go | 18 +-
.../internal/kernels/base_arithmetic_sse4_amd64.s | 18688 ++++++++++++++++++-
.../internal/kernels/basic_arithmetic_noasm.go | 12 +-
.../compute/internal/kernels/scalar_arithmetic.go | 80 +-
go/arrow/decimal128/decimal128.go | 4 +
go/arrow/decimal256/decimal256.go | 4 +
17 files changed, 68410 insertions(+), 165 deletions(-)
diff --git a/go/arrow/compute/arithmetic.go b/go/arrow/compute/arithmetic.go
index 4b6f6109a5..d28d167f89 100644
--- a/go/arrow/compute/arithmetic.go
+++ b/go/arrow/compute/arithmetic.go
@@ -81,6 +81,44 @@ func (fn *arithmeticFunction) DispatchBest(vals ...arrow.DataType) (exec.Kernel,
return fn.DispatchExact(vals...)
}
+// an arithmetic function which promotes integers and decimal
+// arguments to doubles.
+type arithmeticFloatingPointFunc struct {
+ arithmeticFunction
+}
+
+func (fn *arithmeticFloatingPointFunc) Execute(ctx context.Context, opts FunctionOptions, args ...Datum) (Datum, error) {
+ return execInternal(ctx, fn, opts, -1, args...)
+}
+
+func (fn *arithmeticFloatingPointFunc) DispatchBest(vals ...arrow.DataType) (exec.Kernel, error) {
+ if err := fn.checkArity(len(vals)); err != nil {
+ return nil, err
+ }
+
+ if kn, err := fn.DispatchExact(vals...); err == nil {
+ return kn, nil
+ }
+
+ ensureDictionaryDecoded(vals...)
+
+ if len(vals) == 2 {
+ replaceNullWithOtherType(vals...)
+ }
+
+ for i, v := range vals {
+ if arrow.IsInteger(v.ID()) || arrow.IsDecimal(v.ID()) {
+ vals[i] = arrow.PrimitiveTypes.Float64
+ }
+ }
+
+ if dt := commonNumeric(vals...); dt != nil {
+ replaceTypes(dt, vals...)
+ }
+
+ return fn.DispatchExact(vals...)
+}
+
var (
addDoc FunctionDoc
)
@@ -97,7 +135,7 @@ func RegisterScalarArithmetic(reg FunctionRegistry) {
for _, o := range ops {
fn := &arithmeticFunction{*NewScalarFunction(o.funcName, Binary(), addDoc), o.decPromote}
- kns := append(kernels.GetArithmeticKernels(o.op), kernels.GetDecimalBinaryKernels(o.op)...)
+ kns := append(kernels.GetArithmeticBinaryKernels(o.op), kernels.GetDecimalBinaryKernels(o.op)...)
kns = append(kns, kernels.GetArithmeticFunctionTimeDuration(o.op)...)
for _, k := range kns {
if err := fn.AddKernel(k); err != nil {
@@ -108,7 +146,7 @@ func RegisterScalarArithmetic(reg FunctionRegistry) {
for _, unit := range arrow.TimeUnitValues {
inType := exec.NewMatchedInput(exec.TimestampTypeUnit(unit))
inDuration := exec.NewExactInput(&arrow.DurationType{Unit: unit})
- ex := kernels.ArithmeticExec(arrow.TIMESTAMP, o.op)
+ ex := kernels.ArithmeticExecSameType(arrow.TIMESTAMP, o.op)
err := fn.AddNewKernel([]exec.InputType{inType, inDuration}, kernels.OutputFirstType, ex, nil)
if err != nil {
panic(err)
@@ -119,7 +157,7 @@ func RegisterScalarArithmetic(reg FunctionRegistry) {
}
matchDur := exec.NewMatchedInput(exec.DurationTypeUnit(unit))
- ex = kernels.ArithmeticExec(arrow.DURATION, o.op)
+ ex = kernels.ArithmeticExecSameType(arrow.DURATION, o.op)
err = fn.AddNewKernel([]exec.InputType{matchDur, matchDur}, exec.NewOutputType(&arrow.DurationType{Unit: unit}), ex, nil)
if err != nil {
panic(err)
@@ -140,7 +178,7 @@ func RegisterScalarArithmetic(reg FunctionRegistry) {
for _, o := range ops {
fn := &arithmeticFunction{*NewScalarFunction(o.funcName, Binary(), addDoc), o.decPromote}
- kns := append(kernels.GetArithmeticKernels(o.op), kernels.GetDecimalBinaryKernels(o.op)...)
+ kns := append(kernels.GetArithmeticBinaryKernels(o.op), kernels.GetDecimalBinaryKernels(o.op)...)
kns = append(kns, kernels.GetArithmeticFunctionTimeDuration(o.op)...)
for _, k := range kns {
if err := fn.AddKernel(k); err != nil {
@@ -151,7 +189,7 @@ func RegisterScalarArithmetic(reg FunctionRegistry) {
for _, unit := range arrow.TimeUnitValues {
// timestamp - timestamp => duration
inType := exec.NewMatchedInput(exec.TimestampTypeUnit(unit))
- ex := kernels.ArithmeticExec(arrow.TIMESTAMP, o.op)
+ ex := kernels.ArithmeticExecSameType(arrow.TIMESTAMP, o.op)
err := fn.AddNewKernel([]exec.InputType{inType, inType}, kernels.OutputResolveTemporal, ex, nil)
if err != nil {
panic(err)
@@ -159,7 +197,7 @@ func RegisterScalarArithmetic(reg FunctionRegistry) {
// timestamp - duration => timestamp
inDuration := exec.NewExactInput(&arrow.DurationType{Unit: unit})
- ex = kernels.ArithmeticExec(arrow.TIMESTAMP, o.op)
+ ex = kernels.ArithmeticExecSameType(arrow.TIMESTAMP, o.op)
err = fn.AddNewKernel([]exec.InputType{inType, inDuration}, kernels.OutputFirstType, ex, nil)
if err != nil {
panic(err)
@@ -167,7 +205,7 @@ func RegisterScalarArithmetic(reg FunctionRegistry) {
// duration - duration = duration
matchDur := exec.NewMatchedInput(exec.DurationTypeUnit(unit))
- ex = kernels.ArithmeticExec(arrow.DURATION, o.op)
+ ex = kernels.ArithmeticExecSameType(arrow.DURATION, o.op)
err = fn.AddNewKernel([]exec.InputType{matchDur, matchDur}, exec.NewOutputType(&arrow.DurationType{Unit: unit}), ex, nil)
if err != nil {
panic(err)
@@ -177,7 +215,7 @@ func RegisterScalarArithmetic(reg FunctionRegistry) {
// time32 - time32 = duration
for _, unit := range []arrow.TimeUnit{arrow.Second, arrow.Millisecond} {
inType := exec.NewMatchedInput(exec.Time32TypeUnit(unit))
- internalEx := kernels.ArithmeticExec(arrow.TIME32, o.op)
+ internalEx := kernels.ArithmeticExecSameType(arrow.TIME32, o.op)
ex := func(ctx *exec.KernelCtx, batch *exec.ExecSpan, out *exec.ExecResult) error {
if err := internalEx(ctx, batch, out); err != nil {
return err
@@ -204,7 +242,7 @@ func RegisterScalarArithmetic(reg FunctionRegistry) {
// time64 - time64 = duration
for _, unit := range []arrow.TimeUnit{arrow.Microsecond, arrow.Nanosecond} {
inType := exec.NewMatchedInput(exec.Time64TypeUnit(unit))
- ex := kernels.ArithmeticExec(arrow.TIME64, o.op)
+ ex := kernels.ArithmeticExecSameType(arrow.TIME64, o.op)
err := fn.AddNewKernel([]exec.InputType{inType, inType}, exec.NewOutputType(&arrow.DurationType{Unit: unit}), ex, nil)
if err != nil {
panic(err)
@@ -219,7 +257,7 @@ func RegisterScalarArithmetic(reg FunctionRegistry) {
}
inDate64 := exec.NewExactInput(arrow.FixedWidthTypes.Date64)
- ex = kernels.ArithmeticExec(arrow.DATE64, o.op)
+ ex = kernels.ArithmeticExecSameType(arrow.DATE64, o.op)
err = fn.AddNewKernel([]exec.InputType{inDate64, inDate64}, exec.NewOutputType(arrow.FixedWidthTypes.Duration_ms), ex, nil)
if err != nil {
panic(err)
@@ -242,7 +280,7 @@ func RegisterScalarArithmetic(reg FunctionRegistry) {
for _, o := range oplist {
fn := &arithmeticFunction{*NewScalarFunction(o.funcName, Binary(), addDoc), o.decPromote}
- for _, k := range append(kernels.GetArithmeticKernels(o.op), kernels.GetDecimalBinaryKernels(o.op)...) {
+ for _, k := range append(kernels.GetArithmeticBinaryKernels(o.op), kernels.GetDecimalBinaryKernels(o.op)...) {
if err := fn.AddKernel(k); err != nil {
panic(err)
}
@@ -252,7 +290,7 @@ func RegisterScalarArithmetic(reg FunctionRegistry) {
durInput := exec.NewExactInput(&arrow.DurationType{Unit: unit})
i64Input := exec.NewExactInput(arrow.PrimitiveTypes.Int64)
durOutput := exec.NewOutputType(&arrow.DurationType{Unit: unit})
- ex := kernels.ArithmeticExec(arrow.DURATION, o.op)
+ ex := kernels.ArithmeticExecSameType(arrow.DURATION, o.op)
err := fn.AddNewKernel([]exec.InputType{durInput, i64Input}, durOutput, ex, nil)
if err != nil {
panic(err)
@@ -267,6 +305,69 @@ func RegisterScalarArithmetic(reg FunctionRegistry) {
reg.AddFunction(fn, false)
}
+
+ ops = []struct {
+ funcName string
+ op kernels.ArithmeticOp
+ decPromote decimalPromotion
+ }{
+ {"abs_unchecked", kernels.OpAbsoluteValue, decPromoteNone},
+ {"abs", kernels.OpAbsoluteValueChecked, decPromoteNone},
+ {"negate_unchecked", kernels.OpNegate, decPromoteNone},
+ }
+
+ for _, o := range ops {
+ fn := &arithmeticFunction{*NewScalarFunction(o.funcName, Unary(), addDoc), decPromoteNone}
+ kns := append(kernels.GetArithmeticUnaryKernels(o.op), kernels.GetDecimalUnaryKernels(o.op)...)
+ for _, k := range kns {
+ if err := fn.AddKernel(k); err != nil {
+ panic(err)
+ }
+ }
+
+ reg.AddFunction(fn, false)
+ }
+
+ fn := &arithmeticFunction{*NewScalarFunction("negate", Unary(), addDoc), decPromoteNone}
+ kns := append(kernels.GetArithmeticUnarySignedKernels(kernels.OpNegateChecked), kernels.GetDecimalUnaryKernels(kernels.OpNegateChecked)...)
+ for _, k := range kns {
+ if err := fn.AddKernel(k); err != nil {
+ panic(err)
+ }
+ }
+
+ reg.AddFunction(fn, false)
+
+ ops = []struct {
+ funcName string
+ op kernels.ArithmeticOp
+ decPromote decimalPromotion
+ }{
+ {"sqrt_unchecked", kernels.OpSqrt, decPromoteNone},
+ {"sqrt", kernels.OpSqrtChecked, decPromoteNone},
+ }
+
+ for _, o := range ops {
+ fn := &arithmeticFloatingPointFunc{arithmeticFunction{*NewScalarFunction(o.funcName, Unary(), addDoc), decPromoteNone}}
+ kns := kernels.GetArithmeticUnaryFloatingPointKernels(o.op)
+ for _, k := range kns {
+ if err := fn.AddKernel(k); err != nil {
+ panic(err)
+ }
+ }
+
+ reg.AddFunction(fn, false)
+ }
+
+ fn = &arithmeticFunction{*NewScalarFunction("sign", Unary(), addDoc), decPromoteNone}
+ kns = kernels.GetArithmeticUnaryFixedIntOutKernels(arrow.PrimitiveTypes.Int8, kernels.OpSign)
+ for _, k := range kns {
+ if err := fn.AddKernel(k); err != nil {
+ panic(err)
+ }
+ }
+
+ reg.AddFunction(fn, false)
}
func impl(ctx context.Context, fn string, opts ArithmeticOptions, left, right Datum) (Datum, error) {
@@ -322,3 +423,41 @@ func Multiply(ctx context.Context, opts ArithmeticOptions, left, right Datum) (D
func Divide(ctx context.Context, opts ArithmeticOptions, left, right Datum) (Datum, error) {
return impl(ctx, "divide", opts, left, right)
}
+
+// AbsoluteValue returns the AbsoluteValue for each element in the input
+// argument. It accepts either a scalar or an array.
+//
+// ArithmeticOptions specifies whether or not to check for overflows,
+// performance is faster if not explicitly checking for overflows but
+// will error on an overflow if CheckOverflow is true.
+func AbsoluteValue(ctx context.Context, opts ArithmeticOptions, input Datum) (Datum, error) {
+ fn := "abs"
+ if opts.NoCheckOverflow {
+ fn += "_unchecked"
+ }
+ return CallFunction(ctx, fn, nil, input)
+}
+
+// Negate returns a result containing the negation of each element in the
+// input argument. It accepts either a scalar or an array.
+//
+// ArithmeticOptions specifies whether or not to check for overflows,
+// or to throw an error on unsigned types.
+func Negate(ctx context.Context, opts ArithmeticOptions, input Datum) (Datum, error) {
+ fn := "negate"
+ if opts.NoCheckOverflow {
+ fn += "_unchecked"
+ }
+ return CallFunction(ctx, fn, nil, input)
+}
+
+// Sign returns -1, 0, or 1 depending on the sign of each element in the
+// input. For x in the input:
+//
+// if x > 0: 1
+// if x < 0: -1
+// if x == 0: 0
+//
+func Sign(ctx context.Context, input Datum) (Datum, error) {
+ return CallFunction(ctx, "sign", nil, input)
+}
diff --git a/go/arrow/compute/arithmetic_test.go b/go/arrow/compute/arithmetic_test.go
index 12e837a811..d57af69e6f 100644
--- a/go/arrow/compute/arithmetic_test.go
+++ b/go/arrow/compute/arithmetic_test.go
@@ -27,6 +27,7 @@ import (
"github.com/apache/arrow/go/v11/arrow/array"
"github.com/apache/arrow/go/v11/arrow/compute"
"github.com/apache/arrow/go/v11/arrow/compute/internal/exec"
+ "github.com/apache/arrow/go/v11/arrow/compute/internal/kernels"
"github.com/apache/arrow/go/v11/arrow/decimal128"
"github.com/apache/arrow/go/v11/arrow/decimal256"
"github.com/apache/arrow/go/v11/arrow/internal/testing/gen"
@@ -36,6 +37,7 @@ import (
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"github.com/stretchr/testify/suite"
+ "golang.org/x/exp/constraints"
)
var (
@@ -58,6 +60,43 @@ func init() {
}
}
+func assertNullToNull(t *testing.T, ctx context.Context, fn string, mem memory.Allocator) {
+ f, ok := compute.GetFunctionRegistry().GetFunction(fn)
+ require.True(t, ok)
+ nulls := array.MakeArrayOfNull(mem, arrow.Null, 7)
+ defer nulls.Release()
+ n := f.Arity().NArgs
+
+ t.Run("null to null array", func(t *testing.T) {
+ args := make([]compute.Datum, n)
+ for i := 0; i < n; i++ {
+ args[i] = &compute.ArrayDatum{nulls.Data()}
+ }
+
+ result, err := compute.CallFunction(ctx, fn, nil, args...)
+ assert.NoError(t, err)
+ defer result.Release()
+ out := result.(*compute.ArrayDatum).MakeArray()
+ defer out.Release()
+ assertArraysEqual(t, nulls, out)
+ })
+
+ t.Run("null to null scalar", func(t *testing.T) {
+ args := make([]compute.Datum, n)
+ for i := 0; i < n; i++ {
+ args[i] = compute.NewDatum(scalar.ScalarNull)
+ }
+
+ result, err := compute.CallFunction(ctx, fn, nil, args...)
+ assert.NoError(t, err)
+ assertScalarEquals(t, scalar.ScalarNull, result.(*compute.ScalarDatum).Value)
+ })
+}
+
+type unaryArithmeticFunc = func(context.Context, compute.ArithmeticOptions, compute.Datum) (compute.Datum, error)
+
+type unaryFunc = func(compute.Datum) (compute.Datum, error)
+
type binaryArithmeticFunc = func(context.Context, compute.ArithmeticOptions, compute.Datum, compute.Datum) (compute.Datum, error)
type binaryFunc = func(left, right compute.Datum) (compute.Datum, error)
@@ -512,6 +551,12 @@ func (ds *DecimalArithmeticSuite) checkFail(fn string, args []compute.Datum, sub
ds.ErrorContains(err, substr)
}
+func (ds *DecimalArithmeticSuite) decimalArrayFromJSON(ty arrow.DataType, str string) arrow.Array {
+ arr, _, err := array.FromJSON(ds.mem, ty, strings.NewReader(str))
+ ds.Require().NoError(err)
+ return arr
+}
+
type DecimalBinaryArithmeticSuite struct {
DecimalArithmeticSuite
}
@@ -977,6 +1022,133 @@ func (ds *DecimalBinaryArithmeticSuite) TestDivide() {
})
}
+type DecimalUnaryArithmeticSuite struct {
+ DecimalArithmeticSuite
+}
+
+func (ds *DecimalUnaryArithmeticSuite) TestAbsoluteValue() {
+ max128 := decimal128.GetMaxValue(38)
+ max256 := decimal256.GetMaxValue(76)
+ ds.Run("decimal", func() {
+ for _, fn := range []string{"abs_unchecked", "abs"} {
+ ds.Run(fn, func() {
+ for _, ty := range ds.positiveScales() {
+ ds.Run(ty.String(), func() {
+ empty, _, _ := array.FromJSON(ds.mem, ty, strings.NewReader(`[]`))
+ defer empty.Release()
+ in, _, _ := array.FromJSON(ds.mem, ty, strings.NewReader(`["1.00", "-42.15", null]`))
+ defer in.Release()
+ exp, _, _ := array.FromJSON(ds.mem, ty, strings.NewReader(`["1.00", "42.15", null]`))
+ defer exp.Release()
+
+ checkScalar(ds.T(), fn, []compute.Datum{&compute.ArrayDatum{empty.Data()}}, &compute.ArrayDatum{empty.Data()}, nil)
+ checkScalar(ds.T(), fn, []compute.Datum{&compute.ArrayDatum{in.Data()}}, &compute.ArrayDatum{exp.Data()}, nil)
+ })
+ }
+
+ checkScalar(ds.T(), fn, []compute.Datum{compute.NewDatum(scalar.NewDecimal128Scalar(max128.Negate(), &arrow.Decimal128Type{Precision: 38}))},
+ compute.NewDatum(scalar.NewDecimal128Scalar(max128, &arrow.Decimal128Type{Precision: 38})), nil)
+ checkScalar(ds.T(), fn, []compute.Datum{compute.NewDatum(scalar.NewDecimal256Scalar(max256.Negate(), &arrow.Decimal256Type{Precision: 76}))},
+ compute.NewDatum(scalar.NewDecimal256Scalar(max256, &arrow.Decimal256Type{Precision: 76})), nil)
+ for _, ty := range ds.negativeScales() {
+ ds.Run(ty.String(), func() {
+ empty, _, _ := array.FromJSON(ds.mem, ty, strings.NewReader(`[]`))
+ defer empty.Release()
+ in, _, _ := array.FromJSON(ds.mem, ty, strings.NewReader(`["12E2", "-42E2", null]`))
+ defer in.Release()
+ exp, _, _ := array.FromJSON(ds.mem, ty, strings.NewReader(`["12E2", "42E2", null]`))
+ defer exp.Release()
+
+ checkScalar(ds.T(), fn, []compute.Datum{&compute.ArrayDatum{empty.Data()}}, &compute.ArrayDatum{empty.Data()}, nil)
+ checkScalar(ds.T(), fn, []compute.Datum{&compute.ArrayDatum{in.Data()}}, &compute.ArrayDatum{exp.Data()}, nil)
+ })
+ }
+ })
+ }
+ })
+}
+
+func (ds *DecimalUnaryArithmeticSuite) TestNegate() {
+ max128 := decimal128.GetMaxValue(38)
+ max256 := decimal256.GetMaxValue(76)
+
+ for _, fn := range []string{"negate_unchecked", "negate"} {
+ ds.Run(fn, func() {
+ for _, ty := range ds.positiveScales() {
+ empty, _, _ := array.FromJSON(ds.mem, ty, strings.NewReader(`[]`))
+ defer empty.Release()
+ in, _, _ := array.FromJSON(ds.mem, ty, strings.NewReader(`["0.00", "1.00", "-42.15", null]`))
+ defer in.Release()
+ exp, _, _ := array.FromJSON(ds.mem, ty, strings.NewReader(`["0.00", "-1.00", "42.15", null]`))
+ defer exp.Release()
+
+ checkScalar(ds.T(), fn, []compute.Datum{&compute.ArrayDatum{empty.Data()}}, &compute.ArrayDatum{empty.Data()}, nil)
+ checkScalar(ds.T(), fn, []compute.Datum{&compute.ArrayDatum{in.Data()}}, &compute.ArrayDatum{exp.Data()}, nil)
+ }
+
+ checkScalar(ds.T(), fn, []compute.Datum{compute.NewDatum(scalar.NewDecimal128Scalar(max128.Negate(), &arrow.Decimal128Type{Precision: 38}))},
+ compute.NewDatum(scalar.NewDecimal128Scalar(max128, &arrow.Decimal128Type{Precision: 38})), nil)
+ checkScalar(ds.T(), fn, []compute.Datum{compute.NewDatum(scalar.NewDecimal256Scalar(max256.Negate(), &arrow.Decimal256Type{Precision: 76}))},
+ compute.NewDatum(scalar.NewDecimal256Scalar(max256, &arrow.Decimal256Type{Precision: 76})), nil)
+ checkScalar(ds.T(), fn, []compute.Datum{compute.NewDatum(scalar.NewDecimal128Scalar(max128, &arrow.Decimal128Type{Precision: 38}))},
+ compute.NewDatum(scalar.NewDecimal128Scalar(max128.Negate(), &arrow.Decimal128Type{Precision: 38})), nil)
+ checkScalar(ds.T(), fn, []compute.Datum{compute.NewDatum(scalar.NewDecimal256Scalar(max256, &arrow.Decimal256Type{Precision: 76}))},
+ compute.NewDatum(scalar.NewDecimal256Scalar(max256.Negate(), &arrow.Decimal256Type{Precision: 76})), nil)
+ for _, ty := range ds.negativeScales() {
+ ds.Run(ty.String(), func() {
+ empty, _, _ := array.FromJSON(ds.mem, ty, strings.NewReader(`[]`))
+ defer empty.Release()
+ in, _, _ := array.FromJSON(ds.mem, ty, strings.NewReader(`["0", "12E2", "-42E2", null]`))
+ defer in.Release()
+ exp, _, _ := array.FromJSON(ds.mem, ty, strings.NewReader(`["0", "-12E2", "42E2", null]`))
+ defer exp.Release()
+
+ checkScalar(ds.T(), fn, []compute.Datum{&compute.ArrayDatum{empty.Data()}}, &compute.ArrayDatum{empty.Data()}, nil)
+ checkScalar(ds.T(), fn, []compute.Datum{&compute.ArrayDatum{in.Data()}}, &compute.ArrayDatum{exp.Data()}, nil)
+ })
+ }
+ })
+ }
+}
+
+func (ds *DecimalUnaryArithmeticSuite) TestSquareRoot() {
+ for _, fn := range []string{"sqrt_unchecked", "sqrt"} {
+ ds.Run(fn, func() {
+ for _, ty := range ds.positiveScales() {
+ ds.Run(ty.String(), func() {
+ empty := ds.decimalArrayFromJSON(ty, `[]`)
+ defer empty.Release()
+ arr := ds.decimalArrayFromJSON(ty, `["4.00", "16.00", "36.00", null]`)
+ defer arr.Release()
+
+ ds.checkDecimalToFloat(fn, []compute.Datum{&compute.ArrayDatum{Value: empty.Data()}})
+ ds.checkDecimalToFloat(fn, []compute.Datum{&compute.ArrayDatum{Value: arr.Data()}})
+
+ neg := ds.decimalArrayFromJSON(ty, `["-2.00"]`)
+ defer neg.Release()
+ ds.checkFail("sqrt", []compute.Datum{&compute.ArrayDatum{Value: neg.Data()}}, "square root of negative number", nil)
+ })
+ }
+
+ for _, ty := range ds.negativeScales() {
+ ds.Run(ty.String(), func() {
+ empty := ds.decimalArrayFromJSON(ty, `[]`)
+ defer empty.Release()
+ arr := ds.decimalArrayFromJSON(ty, `["400", "1600", "3600", null]`)
+ defer arr.Release()
+
+ ds.checkDecimalToFloat(fn, []compute.Datum{&compute.ArrayDatum{Value: empty.Data()}})
+ ds.checkDecimalToFloat(fn, []compute.Datum{&compute.ArrayDatum{Value: arr.Data()}})
+
+ neg := ds.decimalArrayFromJSON(ty, `["-400"]`)
+ defer neg.Release()
+ ds.checkFail("sqrt", []compute.Datum{&compute.ArrayDatum{Value: neg.Data()}}, "square root of negative number", nil)
+ })
+ }
+ })
+ }
+}
+
type ScalarBinaryTemporalArithmeticSuite struct {
BinaryFuncTestSuite
}
@@ -1076,6 +1248,362 @@ func (s *ScalarBinaryTemporalArithmeticSuite) TestTemporalAddSub() {
}
}
+func TestUnaryDispatchBest(t *testing.T) {
+ for _, fn := range []string{"abs"} {
+ for _, suffix := range []string{"", "_unchecked"} {
+ fn += suffix
+ t.Run(fn, func(t *testing.T) {
+ for _, ty := range numericTypes {
+ t.Run(ty.String(), func(t *testing.T) {
+ CheckDispatchBest(t, fn, []arrow.DataType{ty}, []arrow.DataType{ty})
+ CheckDispatchBest(t, fn, []arrow.DataType{&arrow.DictionaryType{IndexType: arrow.PrimitiveTypes.Int8, ValueType: ty}},
+ []arrow.DataType{ty})
+ })
+ }
+ })
+ }
+ }
+
+ for _, fn := range []string{"negate_unchecked", "sign"} {
+ t.Run(fn, func(t *testing.T) {
+ for _, ty := range numericTypes {
+ t.Run(ty.String(), func(t *testing.T) {
+ CheckDispatchBest(t, fn, []arrow.DataType{ty}, []arrow.DataType{ty})
+ CheckDispatchBest(t, fn, []arrow.DataType{&arrow.DictionaryType{IndexType: arrow.PrimitiveTypes.Int8, ValueType: ty}},
+ []arrow.DataType{ty})
+ })
+ }
+ })
+ }
+
+ for _, fn := range []string{"negate"} {
+ t.Run(fn, func(t *testing.T) {
+ for _, ty := range append(signedIntTypes, floatingTypes...) {
+ t.Run(ty.String(), func(t *testing.T) {
+ CheckDispatchBest(t, fn, []arrow.DataType{ty}, []arrow.DataType{ty})
+ CheckDispatchBest(t, fn, []arrow.DataType{&arrow.DictionaryType{IndexType: arrow.PrimitiveTypes.Int8, ValueType: ty}},
+ []arrow.DataType{ty})
+ })
+ }
+ })
+ }
+}
+
+func TestUnaryArithmeticNull(t *testing.T) {
+ for _, fn := range []string{"abs", "negate"} {
+ for _, suffix := range []string{"", "_unchecked"} {
+ fn += suffix
+ assertNullToNull(t, context.TODO(), fn, memory.DefaultAllocator)
+ }
+ }
+
+ for _, fn := range []string{"sign"} {
+ assertNullToNull(t, context.TODO(), fn, memory.DefaultAllocator)
+ }
+}
+
+type UnaryArithmeticSuite[T exec.NumericTypes] struct {
+ suite.Suite
+
+ mem *memory.CheckedAllocator
+ ctx context.Context
+
+ opts compute.ArithmeticOptions
+}
+
+func (b *UnaryArithmeticSuite[T]) SetupTest() {
+ b.mem = memory.NewCheckedAllocator(memory.DefaultAllocator)
+ b.ctx = compute.WithAllocator(context.TODO(), b.mem)
+ b.opts = compute.ArithmeticOptions{}
+}
+
+func (b *UnaryArithmeticSuite[T]) TearDownTest() {
+ b.mem.AssertSize(b.T(), 0)
+}
+
+func (b *UnaryArithmeticSuite[T]) setOverflowCheck(v bool) {
+ b.opts.NoCheckOverflow = !v
+}
+
+func (*UnaryArithmeticSuite[T]) datatype() arrow.DataType {
+ return exec.GetDataType[T]()
+}
+
+func (us *UnaryArithmeticSuite[T]) makeNullScalar() scalar.Scalar {
+ return scalar.MakeNullScalar(us.datatype())
+}
+
+func (us *UnaryArithmeticSuite[T]) makeScalar(v T) scalar.Scalar {
+ return scalar.MakeScalar(v)
+}
+
+func (us *UnaryArithmeticSuite[T]) assertUnaryOpValError(fn unaryArithmeticFunc, arg T, msg string) {
+ in := us.makeScalar(arg)
+ _, err := fn(us.ctx, us.opts, compute.NewDatum(in))
+ us.ErrorIs(err, arrow.ErrInvalid)
+ us.ErrorContains(err, msg)
+}
+
+func (us *UnaryArithmeticSuite[T]) assertUnaryOpNotImplemented(fn unaryArithmeticFunc, arg T, msg string) {
+ in := us.makeScalar(arg)
+ _, err := fn(us.ctx, us.opts, compute.NewDatum(in))
+ us.ErrorIs(err, arrow.ErrNotImplemented)
+ us.ErrorContains(err, msg)
+}
+
+func (us *UnaryArithmeticSuite[T]) assertUnaryOpVals(fn unaryArithmeticFunc, arg, expected T) {
+ in := us.makeScalar(arg)
+ exp := us.makeScalar(expected)
+
+ actual, err := fn(us.ctx, us.opts, compute.NewDatum(in))
+ us.Require().NoError(err)
+ assertScalarEquals(us.T(), exp, actual.(*compute.ScalarDatum).Value)
+}
+
+func (us *UnaryArithmeticSuite[T]) assertUnaryOpScalars(fn unaryArithmeticFunc, arg, exp scalar.Scalar) {
+ actual, err := fn(us.ctx, us.opts, compute.NewDatum(arg))
+ us.Require().NoError(err)
+ assertScalarEquals(us.T(), exp, actual.(*compute.ScalarDatum).Value)
+}
+
+func (us *UnaryArithmeticSuite[T]) assertUnaryOpArrs(fn unaryArithmeticFunc, arg, exp arrow.Array) {
+ datum := &compute.ArrayDatum{arg.Data()}
+ actual, err := fn(us.ctx, us.opts, datum)
+ us.Require().NoError(err)
+ defer actual.Release()
+ assertDatumsEqual(us.T(), &compute.ArrayDatum{exp.Data()}, actual)
+
+ // also check scalar ops
+ for i := 0; i < arg.Len(); i++ {
+ expScalar, err := scalar.GetScalar(exp, i)
+ us.NoError(err)
+ argScalar, err := scalar.GetScalar(arg, i)
+ us.NoError(err)
+
+ actual, err := fn(us.ctx, us.opts, compute.NewDatum(argScalar))
+ us.Require().NoError(err)
+ assertDatumsEqual(us.T(), compute.NewDatum(expScalar), compute.NewDatum(actual))
+ }
+}
+
+func (us *UnaryArithmeticSuite[T]) assertUnaryOp(fn unaryArithmeticFunc, arg, exp string) {
+ in, _, err := array.FromJSON(us.mem, us.datatype(), strings.NewReader(arg), array.WithUseNumber())
+ us.Require().NoError(err)
+ defer in.Release()
+ expected, _, err := array.FromJSON(us.mem, us.datatype(), strings.NewReader(exp), array.WithUseNumber())
+ us.Require().NoError(err)
+ defer expected.Release()
+
+ us.assertUnaryOpArrs(fn, in, expected)
+}
+
+type UnaryArithmeticSigned[T exec.IntTypes] struct {
+ UnaryArithmeticSuite[T]
+}
+
+func (us *UnaryArithmeticSigned[T]) TestAbsoluteValue() {
+ var (
+ dt = us.datatype()
+ min = kernels.MinOf[T]()
+ max = kernels.MaxOf[T]()
+ )
+
+ fn := func(in, exp string) {
+ us.assertUnaryOp(compute.AbsoluteValue, in, exp)
+ }
+
+ us.Run(dt.String(), func() {
+ for _, checkOverflow := range []bool{true, false} {
+ us.setOverflowCheck(checkOverflow)
+ us.Run(fmt.Sprintf("check_overflow=%t", checkOverflow), func() {
+ // empty array
+ fn(`[]`, `[]`)
+ // scalar/arrays with nulls
+ fn(`[null]`, `[null]`)
+ fn(`[1, null, -10]`, `[1, null, 10]`)
+ us.assertUnaryOpScalars(compute.AbsoluteValue, us.makeNullScalar(), us.makeNullScalar())
+ // scalar/arrays with zeros
+ fn(`[0, -0]`, `[0, 0]`)
+ us.assertUnaryOpVals(compute.AbsoluteValue, -0, 0)
+ us.assertUnaryOpVals(compute.AbsoluteValue, 0, 0)
+ // ordinary scalars/arrays (positive inputs)
+ fn(`[1, 10, 127]`, `[1, 10, 127]`)
+ us.assertUnaryOpVals(compute.AbsoluteValue, 1, 1)
+ // ordinary scalars/arrays (negative inputs)
+ fn(`[-1, -10, -127]`, `[1, 10, 127]`)
+ us.assertUnaryOpVals(compute.AbsoluteValue, -1, 1)
+ // min/max
+ us.assertUnaryOpVals(compute.AbsoluteValue, max, max)
+ if checkOverflow {
+ us.assertUnaryOpValError(compute.AbsoluteValue, min, "overflow")
+ } else {
+ us.assertUnaryOpVals(compute.AbsoluteValue, min, min)
+ }
+ })
+ }
+ })
+}
+
+func (us *UnaryArithmeticSigned[T]) TestNegate() {
+ var (
+ dt = us.datatype()
+ min = kernels.MinOf[T]()
+ max = kernels.MaxOf[T]()
+ )
+
+ fn := func(in, exp string) {
+ us.assertUnaryOp(compute.Negate, in, exp)
+ }
+
+ us.Run(dt.String(), func() {
+ for _, checkOverflow := range []bool{true, false} {
+ us.setOverflowCheck(checkOverflow)
+ us.Run(fmt.Sprintf("check_overflow=%t", checkOverflow), func() {
+ fn(`[]`, `[]`)
+ // scalar/arrays with nulls
+ fn(`[null]`, `[null]`)
+ fn(`[1, null, -10]`, `[-1, null, 10]`)
+ // ordinary scalars/arrays (positive inputs)
+ fn(`[1, 10, 127]`, `[-1, -10, -127]`)
+ us.assertUnaryOpVals(compute.Negate, 1, -1)
+ // ordinary scalars/arrays (negative inputs)
+ fn(`[-1, -10, -127]`, `[1, 10, 127]`)
+ us.assertUnaryOpVals(compute.Negate, -1, 1)
+ // min/max
+ us.assertUnaryOpVals(compute.Negate, min+1, max)
+ us.assertUnaryOpVals(compute.Negate, max, min+1)
+ })
+ }
+ })
+}
+
+type UnaryArithmeticUnsigned[T exec.UintTypes] struct {
+ UnaryArithmeticSuite[T]
+}
+
+func (us *UnaryArithmeticUnsigned[T]) TestAbsoluteValue() {
+ var (
+ min, max T = 0, kernels.MaxOf[T]()
+ )
+
+ fn := func(in, exp string) {
+ us.assertUnaryOp(compute.AbsoluteValue, in, exp)
+ }
+
+ us.Run(us.datatype().String(), func() {
+ for _, checkOverflow := range []bool{true, false} {
+ us.setOverflowCheck(checkOverflow)
+ us.Run(fmt.Sprintf("check_overflow=%t", checkOverflow), func() {
+ fn(`[]`, `[]`)
+ fn(`[null]`, `[null]`)
+ us.assertUnaryOpScalars(compute.AbsoluteValue, us.makeNullScalar(), us.makeNullScalar())
+ fn(`[0, 1, 10, 127]`, `[0, 1, 10, 127]`)
+ us.assertUnaryOpVals(compute.AbsoluteValue, min, min)
+ us.assertUnaryOpVals(compute.AbsoluteValue, max, max)
+ })
+ }
+ })
+}
+
+func (us *UnaryArithmeticUnsigned[T]) TestNegate() {
+ var (
+ dt = us.datatype()
+ )
+
+ fn := func(in, exp string) {
+ us.assertUnaryOp(compute.Negate, in, exp)
+ }
+
+ us.Run(dt.String(), func() {
+ us.setOverflowCheck(true)
+ us.assertUnaryOpNotImplemented(compute.Negate, 1, "no kernel matching input types")
+
+ us.setOverflowCheck(false)
+ fn(`[]`, `[]`)
+ fn(`[null]`, `[null]`)
+ us.assertUnaryOpVals(compute.Negate, 1, ^T(1)+1)
+ })
+}
+
+type UnaryArithmeticFloating[T constraints.Float] struct {
+ UnaryArithmeticSuite[T]
+
+ min, max T
+}
+
+func (us *UnaryArithmeticFloating[T]) TestAbsoluteValue() {
+ fn := func(in, exp string) {
+ us.assertUnaryOp(compute.AbsoluteValue, in, exp)
+ }
+
+ us.Run(us.datatype().String(), func() {
+ for _, checkOverflow := range []bool{true, false} {
+ us.setOverflowCheck(checkOverflow)
+ us.Run(fmt.Sprintf("check_overflow=%t", checkOverflow), func() {
+ fn(`[]`, `[]`)
+ fn(`[null]`, `[null]`)
+ fn(`[1.3, null, -10.80]`, `[1.3, null, 10.80]`)
+ us.assertUnaryOpScalars(compute.AbsoluteValue, us.makeNullScalar(), us.makeNullScalar())
+ fn(`[0.0, -0.0]`, `[0.0, 0.0]`)
+ us.assertUnaryOpVals(compute.AbsoluteValue, T(math.Copysign(0, -1)), 0)
+ us.assertUnaryOpVals(compute.AbsoluteValue, 0, 0)
+ fn(`[1.3, 10.80, 12748.001]`, `[1.3, 10.80, 12748.001]`)
+ us.assertUnaryOpVals(compute.AbsoluteValue, 1.3, 1.3)
+ fn(`[-1.3, -10.80, -12748.001]`, `[1.3, 10.80, 12748.001]`)
+ us.assertUnaryOpVals(compute.AbsoluteValue, -1.3, 1.3)
+ fn(`["Inf", "-Inf"]`, `["Inf", "Inf"]`)
+ us.assertUnaryOpVals(compute.AbsoluteValue, us.min, us.max)
+ us.assertUnaryOpVals(compute.AbsoluteValue, us.max, us.max)
+ })
+ }
+ })
+}
+
+func (us *UnaryArithmeticFloating[T]) TestNegate() {
+ var (
+ dt = us.datatype()
+ )
+
+ fn := func(in, exp string) {
+ us.assertUnaryOp(compute.Negate, in, exp)
+ }
+
+ us.Run(dt.String(), func() {
+ for _, checkOverflow := range []bool{true, false} {
+ us.setOverflowCheck(checkOverflow)
+ us.Run(fmt.Sprintf("check_overflow=%t", checkOverflow), func() {
+ fn(`[]`, `[]`)
+ // scalar/arrays with nulls
+ fn(`[null]`, `[null]`)
+ fn(`[1.5, null, -10.25]`, `[-1.5, null, 10.25]`)
+ // ordinary scalars/arrays (positive inputs)
+ fn(`[0.5, 10.123, 127.321]`, `[-0.5, -10.123, -127.321]`)
+ us.assertUnaryOpVals(compute.Negate, 1.25, -1.25)
+ // ordinary scalars/arrays (negative inputs)
+ fn(`[-0.5, -10.123, -127.321]`, `[0.5, 10.123, 127.321]`)
+ us.assertUnaryOpVals(compute.Negate, -1.25, 1.25)
+ // min/max
+ us.assertUnaryOpVals(compute.Negate, us.min, us.max)
+ us.assertUnaryOpVals(compute.Negate, us.max, us.min)
+ })
+ }
+ })
+}
+
+func TestUnaryArithmetic(t *testing.T) {
+ suite.Run(t, new(UnaryArithmeticSigned[int8]))
+ suite.Run(t, new(UnaryArithmeticSigned[int16]))
+ suite.Run(t, new(UnaryArithmeticSigned[int32]))
+ suite.Run(t, new(UnaryArithmeticSigned[int64]))
+ suite.Run(t, new(UnaryArithmeticUnsigned[uint8]))
+ suite.Run(t, new(UnaryArithmeticUnsigned[uint16]))
+ suite.Run(t, new(UnaryArithmeticUnsigned[uint32]))
+ suite.Run(t, new(UnaryArithmeticUnsigned[uint64]))
+ suite.Run(t, &UnaryArithmeticFloating[float32]{min: -math.MaxFloat32, max: math.MaxFloat32})
+ suite.Run(t, &UnaryArithmeticFloating[float64]{min: -math.MaxFloat64, max: math.MaxFloat64})
+ suite.Run(t, new(DecimalUnaryArithmeticSuite))
+}
+
const seed = 0x94378165
type binaryOp = func(ctx context.Context, left, right compute.Datum) (compute.Datum, error)
diff --git a/go/arrow/compute/cast_test.go b/go/arrow/compute/cast_test.go
index 5b6f17e13b..6a0b77fce0 100644
--- a/go/arrow/compute/cast_test.go
+++ b/go/arrow/compute/cast_test.go
@@ -228,10 +228,12 @@ var (
arrow.PrimitiveTypes.Uint32,
arrow.PrimitiveTypes.Uint64,
}
- integerTypes = append(signedIntTypes, unsignedIntTypes...)
- numericTypes = append(integerTypes,
+ integerTypes = append(signedIntTypes, unsignedIntTypes...)
+ floatingTypes = []arrow.DataType{
arrow.PrimitiveTypes.Float32,
- arrow.PrimitiveTypes.Float64)
+ arrow.PrimitiveTypes.Float64,
+ }
+ numericTypes = append(integerTypes, floatingTypes...)
baseBinaryTypes = []arrow.DataType{
arrow.BinaryTypes.Binary,
arrow.BinaryTypes.LargeBinary,
diff --git a/go/arrow/compute/internal/kernels/Makefile b/go/arrow/compute/internal/kernels/Makefile
index 96238cc9a1..53dda4da43 100644
--- a/go/arrow/compute/internal/kernels/Makefile
+++ b/go/arrow/compute/internal/kernels/Makefile
@@ -48,7 +48,7 @@ INTEL_SOURCES := \
assembly: $(INTEL_SOURCES)
_lib/cast_numeric_avx2_amd64.s: _lib/cast_numeric.cc
- $(CXX) -std=c++17 -S $(C_FLAGS) $(ASM_FLAGS_AVX2) $^ -o $@ ; $(PERL_FIXUP_ROTATE) $@
+ $(CXX) -std=c++17 -S $(C_FLAGS) $(ASM_FLAGS_AVX2) $^ -o $@ ; $(PERL_FIXUP_ROTATE) $@
_lib/cast_numeric_sse4_amd64.s: _lib/cast_numeric.cc
$(CXX) -std=c++17 -S $(C_FLAGS) $(ASM_FLAGS_SSE4) $^ -o $@ ; $(PERL_FIXUP_ROTATE) $@
diff --git a/go/arrow/compute/internal/kernels/_lib/base_arithmetic.cc b/go/arrow/compute/internal/kernels/_lib/base_arithmetic.cc
index 3a8f6a7e70..be7f7ccb25 100644
--- a/go/arrow/compute/internal/kernels/_lib/base_arithmetic.cc
+++ b/go/arrow/compute/internal/kernels/_lib/base_arithmetic.cc
@@ -15,7 +15,9 @@
// limitations under the License.
#include <arch.h>
+#include <math.h>
#include <stdint.h>
+#include <limits.h>
#include "types.h"
#include "vendored/safe-math.h"
@@ -32,6 +34,10 @@ enum class optype : int8_t {
SUB,
MUL,
DIV,
+ ABSOLUTE_VALUE,
+ NEGATE,
+ SQRT,
+ SIGN,
// this impl doesn't actually perform any overflow checks as we need
// to only run overflow checks on non-null entries
@@ -39,6 +45,9 @@ enum class optype : int8_t {
SUB_CHECKED,
MUL_CHECKED,
DIV_CHECKED,
+ ABSOLUTE_VALUE_CHECKED,
+ NEGATE_CHECKED,
+ SQRT_CHECKED,
};
struct Add {
@@ -125,45 +134,132 @@ struct MultiplyChecked {
}
};
-template <typename T, typename Op>
+struct AbsoluteValue {
+ template <typename T, typename Arg>
+ static constexpr T Call(Arg input) {
+ if constexpr(is_same_v<Arg, float>) {
+ *(((int*)&input)+0) &= 0x7fffffff;
+ return input;
+ } else if constexpr(is_same_v<Arg, double>) {
+ *(((int*)&input)+1) &= 0x7fffffff;
+ return input;
+ } else if constexpr(is_unsigned_v<Arg>) {
+ return input;
+ } else {
+ const auto mask = input >> (sizeof(Arg) * CHAR_BIT - 1);
+ return (input + mask) ^ mask;
+ }
+ }
+};
+
+struct AbsoluteValueChecked {
+ template <typename T, typename Arg>
+ static constexpr T Call(Arg input) {
+ if constexpr(is_same_v<Arg, float>) {
+ *(((int*)&input)+0) &= 0x7fffffff;
+ return input;
+ } else if constexpr(is_same_v<Arg, double>) {
+ *(((int*)&input)+1) &= 0x7fffffff;
+ return input;
+ } else if constexpr(is_unsigned_v<Arg>) {
+ return input;
+ } else {
+ const auto mask = input >> (sizeof(Arg) * CHAR_BIT - 1);
+ return (input + mask) ^ mask;
+ }
+ }
+};
+
+struct Negate {
+ template <typename T, typename Arg>
+ static constexpr T Call(Arg input) {
+ if constexpr(is_floating_point_v<Arg>) {
+ return -input;
+ } else if constexpr(is_unsigned_v<Arg>) {
+ return ~input + 1;
+ } else {
+ return -input;
+ }
+ }
+};
+
+struct NegateChecked {
+ template <typename T, typename Arg>
+ static constexpr T Call(Arg input) {
+ static_assert(is_same_v<T, Arg>, "");
+ if constexpr(is_floating_point_v<Arg>) {
+ return -input;
+ } else if constexpr(is_unsigned_v<Arg>) {
+ return 0;
+ } else {
+ return -input;
+ }
+ }
+};
+
+struct Sign {
+ template <typename T, typename Arg>
+ static constexpr T Call(Arg input) {
+ if constexpr(is_floating_point_v<Arg>) {
+ return isnan(input) ? input : ((input == 0) ? 0 : (signbit(input) ? -1 : 1));
+ } else if constexpr(is_unsigned_v<Arg>) {
+ return input > 0 ? 1 : 0;
+ } else if constexpr(is_signed_v<Arg>) {
+ return input > 0 ? 1 : (input ? -1 : 0);
+ }
+ }
+};
+
+template <typename T, typename Op, typename OutT = T>
struct arithmetic_op_arr_arr_impl {
static inline void exec(const void* in_left, const void* in_right, void* out, const int len) {
const T* left = reinterpret_cast<const T*>(in_left);
const T* right = reinterpret_cast<const T*>(in_right);
- T* output = reinterpret_cast<T*>(out);
+ OutT* output = reinterpret_cast<OutT*>(out);
for (int i = 0; i < len; ++i) {
- output[i] = Op::template Call<T, T, T>(left[i], right[i]);
+ output[i] = Op::template Call<OutT, T, T>(left[i], right[i]);
}
}
};
-template <typename T, typename Op>
+template <typename T, typename Op, typename OutT = T>
struct arithmetic_op_arr_scalar_impl {
static inline void exec(const void* in_left, const void* scalar_right, void* out, const int len) {
const T* left = reinterpret_cast<const T*>(in_left);
const T right = *reinterpret_cast<const T*>(scalar_right);
- T* output = reinterpret_cast<T*>(out);
+ OutT* output = reinterpret_cast<OutT*>(out);
for (int i = 0; i < len; ++i) {
- output[i] = Op::template Call<T, T, T>(left[i], right);
+ output[i] = Op::template Call<OutT, T, T>(left[i], right);
}
}
};
-template <typename T, typename Op>
+template <typename T, typename Op, typename OutT = T>
struct arithmetic_op_scalar_arr_impl {
static inline void exec(const void* scalar_left, const void* in_right, void* out, const int len) {
const T left = *reinterpret_cast<const T*>(scalar_left);
const T* right = reinterpret_cast<const T*>(in_right);
- T* output = reinterpret_cast<T*>(out);
+ OutT* output = reinterpret_cast<OutT*>(out);
for (int i = 0; i < len; ++i) {
- output[i] = Op::template Call<T, T, T>(left, right[i]);
+ output[i] = Op::template Call<OutT, T, T>(left, right[i]);
}
}
};
+template <typename T, typename Op, typename OutT = T>
+struct arithmetic_unary_op_impl {
+ static inline void exec(const void* arg, void* out, const int len) {
+ const T* input = reinterpret_cast<const T*>(arg);
+ OutT* output = reinterpret_cast<OutT*>(out);
+
+ for (int i = 0; i < len; ++i) {
+ output[i] = Op::template Call<OutT, T>(input[i]);
+ }
+ }
+};
template <typename Op, template<typename...> typename Impl>
static inline void arithmetic_op(const int type, const void* in_left, const void* in_right, void* output, const int len) {
@@ -195,8 +291,132 @@ static inline void arithmetic_op(const int type, const void* in_left, const void
}
}
+template <typename Op, template <typename...> typename Impl, typename Input>
+static inline void arithmetic_op(const int otype, const void* input, void* output, const int len) {
+ const auto outtype = static_cast<arrtype>(otype);
+
+ switch (outtype) {
+ case arrtype::UINT8:
+ return Impl<Input, Op, uint8_t>::exec(input, output, len);
+ case arrtype::INT8:
+ return Impl<Input, Op, int8_t>::exec(input, output, len);
+ case arrtype::UINT16:
+ return Impl<Input, Op, uint16_t>::exec(input, output, len);
+ case arrtype::INT16:
+ return Impl<Input, Op, int16_t>::exec(input, output, len);
+ case arrtype::UINT32:
+ return Impl<Input, Op, uint32_t>::exec(input, output, len);
+ case arrtype::INT32:
+ return Impl<Input, Op, int32_t>::exec(input, output, len);
+ case arrtype::UINT64:
+ return Impl<Input, Op, uint64_t>::exec(input, output, len);
+ case arrtype::INT64:
+ return Impl<Input, Op, int64_t>::exec(input, output, len);
+ case arrtype::FLOAT32:
+ return Impl<Input, Op, float>::exec(input, output, len);
+ case arrtype::FLOAT64:
+ return Impl<Input, Op, double>::exec(input, output, len);
+ default:
+ break;
+ }
+}
+
+
+template <typename Op, template <typename...> typename Impl>
+static inline void arithmetic_op(const int type, const void* input, void* output, const int len) {
+ const auto intype = static_cast<arrtype>(type);
+
+ switch (intype) {
+ case arrtype::UINT8:
+ return Impl<uint8_t, Op>::exec(input, output, len);
+ case arrtype::INT8:
+ return Impl<int8_t, Op>::exec(input, output, len);
+ case arrtype::UINT16:
+ return Impl<uint16_t, Op>::exec(input, output, len);
+ case arrtype::INT16:
+ return Impl<int16_t, Op>::exec(input, output, len);
+ case arrtype::UINT32:
+ return Impl<uint32_t, Op>::exec(input, output, len);
+ case arrtype::INT32:
+ return Impl<int32_t, Op>::exec(input, output, len);
+ case arrtype::UINT64:
+ return Impl<uint64_t, Op>::exec(input, output, len);
+ case arrtype::INT64:
+ return Impl<int64_t, Op>::exec(input, output, len);
+ case arrtype::FLOAT32:
+ return Impl<float, Op>::exec(input, output, len);
+ case arrtype::FLOAT64:
+ return Impl<double, Op>::exec(input, output, len);
+ default:
+ break;
+ }
+}
+
+template <typename Op, template <typename...> typename Impl>
+static inline void arithmetic_op(const int itype, const int otype, const void* input, void* output, const int len) {
+ const auto intype = static_cast<arrtype>(itype);
+
+ switch (intype) {
+ case arrtype::UINT8:
+ return arithmetic_op<Op, Impl, uint8_t>(otype, input, output, len);
+ case arrtype::INT8:
+ return arithmetic_op<Op, Impl, int8_t>(otype, input, output, len);
+ case arrtype::UINT16:
+ return arithmetic_op<Op, Impl, uint16_t>(otype, input, output, len);
+ case arrtype::INT16:
+ return arithmetic_op<Op, Impl, int16_t>(otype, input, output, len);
+ case arrtype::UINT32:
+ return arithmetic_op<Op, Impl, uint32_t>(otype, input, output, len);
+ case arrtype::INT32:
+ return arithmetic_op<Op, Impl, int32_t>(otype, input, output, len);
+ case arrtype::UINT64:
+ return arithmetic_op<Op, Impl, uint64_t>(otype, input, output, len);
+ case arrtype::INT64:
+ return arithmetic_op<Op, Impl, int64_t>(otype, input, output, len);
+ case arrtype::FLOAT32:
+ return arithmetic_op<Op, Impl, float>(otype, input, output, len);
+ case arrtype::FLOAT64:
+ return arithmetic_op<Op, Impl, double>(otype, input, output, len);
+ default:
+ break;
+ }
+}
+
template <template <typename...> class Impl>
-static inline void arithmetic_impl(const int type, const int8_t op, const void* in_left, const void* in_right, void* out, const int len) {
+static inline void arithmetic_unary_impl_same_types(const int type, const int8_t op, const void* input, void* output, const int len) {
+ const auto opt = static_cast<optype>(op);
+
+ switch (opt) {
+ case optype::ABSOLUTE_VALUE:
+ return arithmetic_op<AbsoluteValue, Impl>(type, input, output, len);
+ case optype::ABSOLUTE_VALUE_CHECKED:
+ return arithmetic_op<AbsoluteValueChecked, Impl>(type, input, output, len);
+ case optype::NEGATE:
+ return arithmetic_op<Negate, Impl>(type, input, output, len);
+ case optype::NEGATE_CHECKED:
+ return arithmetic_op<NegateChecked, Impl>(type, input, output, len);
+ case optype::SIGN:
+ return arithmetic_op<Sign, Impl>(type, input, output, len);
+ default:
+ break;
+ }
+}
+
+
+template <template <typename...> class Impl>
+static inline void arithmetic_unary_impl(const int itype, const int otype, const int8_t op, const void* input, void* output, const int len) {
+ const auto opt = static_cast<optype>(op);
+
+ switch (opt) {
+ case optype::SIGN:
+ return arithmetic_op<Sign, Impl>(itype, otype, input, output, len);
+ default:
+ break;
+ }
+}
+
+template <template <typename...> class Impl>
+static inline void arithmetic_binary_impl(const int type, const int8_t op, const void* in_left, const void* in_right, void* out, const int len) {
const auto opt = static_cast<optype>(op);
switch (opt) {
@@ -211,22 +431,31 @@ static inline void arithmetic_impl(const int type, const int8_t op, const void*
case optype::MUL:
return arithmetic_op<Multiply, Impl>(type, in_left, in_right, out, len);
case optype::MUL_CHECKED:
- return arithmetic_op<MultiplyChecked, Impl>(type, in_left, in_right, out, len);
- default:
+ return arithmetic_op<MultiplyChecked, Impl>(type, in_left, in_right, out, len);
+
+ default:
// don't implement divide here as we can only divide on non-null entries
// so we can avoid dividing by zero
break;
}
}
-extern "C" void FULL_NAME(arithmetic)(const int type, const int8_t op, const void* in_left, const void* in_right, void* out, const int len) {
- arithmetic_impl<arithmetic_op_arr_arr_impl>(type, op, in_left, in_right, out, len);
+extern "C" void FULL_NAME(arithmetic_binary)(const int type, const int8_t op, const void* in_left, const void* in_right, void* out, const int len) {
+ arithmetic_binary_impl<arithmetic_op_arr_arr_impl>(type, op, in_left, in_right, out, len);
}
extern "C" void FULL_NAME(arithmetic_arr_scalar)(const int type, const int8_t op, const void* in_left, const void* in_right, void* out, const int len) {
- arithmetic_impl<arithmetic_op_arr_scalar_impl>(type, op, in_left, in_right, out, len);
+ arithmetic_binary_impl<arithmetic_op_arr_scalar_impl>(type, op, in_left, in_right, out, len);
}
extern "C" void FULL_NAME(arithmetic_scalar_arr)(const int type, const int8_t op, const void* in_left, const void* in_right, void* out, const int len) {
- arithmetic_impl<arithmetic_op_scalar_arr_impl>(type, op, in_left, in_right, out, len);
+ arithmetic_binary_impl<arithmetic_op_scalar_arr_impl>(type, op, in_left, in_right, out, len);
+}
+
+extern "C" void FULL_NAME(arithmetic_unary_same_types)(const int type, const int8_t op, const void* input, void* output, const int len) {
+ arithmetic_unary_impl_same_types<arithmetic_unary_op_impl>(type, op, input, output, len);
+}
+
+extern "C" void FULL_NAME(arithmetic_unary_diff_type)(const int itype, const int otype, const int8_t op, const void* input, void* output, const int len) {
+ arithmetic_unary_impl<arithmetic_unary_op_impl>(itype, otype, op, input, output, len);
}
diff --git a/go/arrow/compute/internal/kernels/_lib/base_arithmetic_avx2_amd64.s b/go/arrow/compute/internal/kernels/_lib/base_arithmetic_avx2_amd64.s
index 54bc7d754f..6f0b4e9a2f 100644
--- a/go/arrow/compute/internal/kernels/_lib/base_arithmetic_avx2_amd64.s
+++ b/go/arrow/compute/internal/kernels/_lib/base_arithmetic_avx2_amd64.s
@@ -2,7 +2,7 @@
.intel_syntax noprefix
.file "base_arithmetic.cc"
.section .rodata.cst32,"aM",@progbits,32
- .p2align 5 # -- Begin function arithmetic_avx2
+ .p2align 5 # -- Begin function arithmetic_binary_avx2
.LCPI0_0:
.short 255 # 0xff
.short 255 # 0xff
@@ -21,15 +21,15 @@
.short 255 # 0xff
.short 255 # 0xff
.text
- .globl arithmetic_avx2
+ .globl arithmetic_binary_avx2
.p2align 4, 0x90
- .type arithmetic_avx2,@function
-arithmetic_avx2: # @arithmetic_avx2
+ .type arithmetic_binary_avx2,@function
+arithmetic_binary_avx2: # @arithmetic_binary_avx2
# %bb.0:
push rbp
mov rbp, rsp
and rsp, -8
- cmp sil, 3
+ cmp sil, 7
jg .LBB0_11
# %bb.1:
test sil, sil
@@ -99,13 +99,13 @@ arithmetic_avx2: # @arithmetic_avx2
jne .LBB0_634
jmp .LBB0_825
.LBB0_11:
- cmp sil, 4
+ cmp sil, 8
je .LBB0_154
# %bb.12:
- cmp sil, 5
+ cmp sil, 9
je .LBB0_420
# %bb.13:
- cmp sil, 6
+ cmp sil, 10
jne .LBB0_825
# %bb.14:
cmp edi, 6
@@ -5808,7 +5808,7 @@ arithmetic_avx2: # @arithmetic_avx2
vzeroupper
ret
.Lfunc_end0:
- .size arithmetic_avx2, .Lfunc_end0-arithmetic_avx2
+ .size arithmetic_binary_avx2, .Lfunc_end0-arithmetic_binary_avx2
# -- End function
.section .rodata.cst32,"aM",@progbits,32
.p2align 5 # -- Begin function arithmetic_arr_scalar_avx2
@@ -5838,7 +5838,7 @@ arithmetic_arr_scalar_avx2: # @arithmetic_arr_scalar_avx2
push rbp
mov rbp, rsp
and rsp, -8
- cmp sil, 3
+ cmp sil, 7
jg .LBB1_12
# %bb.1:
test sil, sil
@@ -5917,13 +5917,13 @@ arithmetic_arr_scalar_avx2: # @arithmetic_arr_scalar_avx2
jne .LBB1_668
jmp .LBB1_1109
.LBB1_12:
- cmp sil, 4
+ cmp sil, 8
je .LBB1_39
# %bb.13:
- cmp sil, 5
+ cmp sil, 9
je .LBB1_47
# %bb.14:
- cmp sil, 6
+ cmp sil, 10
jne .LBB1_1109
# %bb.15:
cmp edi, 6
@@ -13039,7 +13039,7 @@ arithmetic_scalar_arr_avx2: # @arithmetic_scalar_arr_avx2
push rbp
mov rbp, rsp
and rsp, -8
- cmp sil, 3
+ cmp sil, 7
jg .LBB2_12
# %bb.1:
test sil, sil
@@ -13118,13 +13118,13 @@ arithmetic_scalar_arr_avx2: # @arithmetic_scalar_arr_avx2
jne .LBB2_668
jmp .LBB2_1109
.LBB2_12:
- cmp sil, 4
+ cmp sil, 8
je .LBB2_39
# %bb.13:
- cmp sil, 5
+ cmp sil, 9
je .LBB2_47
# %bb.14:
- cmp sil, 6
+ cmp sil, 10
jne .LBB2_1109
# %bb.15:
cmp edi, 6
@@ -19976,6 +19976,14924 @@ arithmetic_scalar_arr_avx2: # @arithmetic_scalar_arr_avx2
.Lfunc_end2:
.size arithmetic_scalar_arr_avx2, .Lfunc_end2-arithmetic_scalar_arr_avx2
# -- End function
+ .section .rodata.cst8,"aM",@progbits,8
+ .p2align 3 # -- Begin function arithmetic_unary_same_types_avx2
+.LCPI3_0:
+ .quad 0x8000000000000000 # double -0
+.LCPI3_1:
+ .quad 0x3ff0000000000000 # double 1
+.LCPI3_4:
+ .quad 1 # 0x1
+.LCPI3_8:
+ .quad 9223372036854775807 # 0x7fffffffffffffff
+ .section .rodata.cst16,"aM",@progbits,16
+ .p2align 4
+.LCPI3_2:
+ .quad 0x8000000000000000 # double -0
+ .quad 0x8000000000000000 # double -0
+.LCPI3_11:
+ .byte 0 # 0x0
+ .byte 4 # 0x4
+ .byte 8 # 0x8
+ .byte 12 # 0xc
+ .zero 1
+ .zero 1
+ .zero 1
+ .zero 1
+ .zero 1
+ .zero 1
+ .zero 1
+ .zero 1
+ .zero 1
+ .zero 1
+ .zero 1
+ .zero 1
+ .section .rodata.cst4,"aM",@progbits,4
+ .p2align 2
+.LCPI3_3:
+ .long 1 # 0x1
+.LCPI3_7:
+ .long 0x80000000 # float -0
+.LCPI3_9:
+ .long 2147483647 # 0x7fffffff
+ .section .rodata.cst32,"aM",@progbits,32
+ .p2align 5
+.LCPI3_5:
+ .short 1 # 0x1
+ .short 1 # 0x1
+ .short 1 # 0x1
+ .short 1 # 0x1
+ .short 1 # 0x1
+ .short 1 # 0x1
+ .short 1 # 0x1
+ .short 1 # 0x1
+ .short 1 # 0x1
+ .short 1 # 0x1
+ .short 1 # 0x1
+ .short 1 # 0x1
+ .short 1 # 0x1
+ .short 1 # 0x1
+ .short 1 # 0x1
+ .short 1 # 0x1
+.LCPI3_6:
+ .zero 32,1
+.LCPI3_10:
+ .byte 0 # 0x0
+ .byte 1 # 0x1
+ .byte 4 # 0x4
+ .byte 5 # 0x5
+ .byte 8 # 0x8
+ .byte 9 # 0x9
+ .byte 12 # 0xc
+ .byte 13 # 0xd
+ .byte 8 # 0x8
+ .byte 9 # 0x9
+ .byte 12 # 0xc
+ .byte 13 # 0xd
+ .byte 12 # 0xc
+ .byte 13 # 0xd
+ .byte 14 # 0xe
+ .byte 15 # 0xf
+ .byte 16 # 0x10
+ .byte 17 # 0x11
+ .byte 20 # 0x14
+ .byte 21 # 0x15
+ .byte 24 # 0x18
+ .byte 25 # 0x19
+ .byte 28 # 0x1c
+ .byte 29 # 0x1d
+ .byte 24 # 0x18
+ .byte 25 # 0x19
+ .byte 28 # 0x1c
+ .byte 29 # 0x1d
+ .byte 28 # 0x1c
+ .byte 29 # 0x1d
+ .byte 30 # 0x1e
+ .byte 31 # 0x1f
+ .text
+ .globl arithmetic_unary_same_types_avx2
+ .p2align 4, 0x90
+ .type arithmetic_unary_same_types_avx2,@function
+arithmetic_unary_same_types_avx2: # @arithmetic_unary_same_types_avx2
+# %bb.0:
+ push rbp
+ mov rbp, rsp
+ and rsp, -8
+ cmp sil, 6
+ jle .LBB3_12
+# %bb.1:
+ cmp sil, 7
+ je .LBB3_22
+# %bb.2:
+ cmp sil, 12
+ je .LBB3_30
+# %bb.3:
+ cmp sil, 13
+ jne .LBB3_865
+# %bb.4:
+ cmp edi, 6
+ jg .LBB3_46
+# %bb.5:
+ cmp edi, 3
+ jle .LBB3_81
+# %bb.6:
+ cmp edi, 4
+ je .LBB3_131
+# %bb.7:
+ cmp edi, 5
+ je .LBB3_134
+# %bb.8:
+ cmp edi, 6
+ jne .LBB3_865
+# %bb.9:
+ test r8d, r8d
+ jle .LBB3_865
+# %bb.10:
+ mov r9d, r8d
+ cmp r8d, 32
+ jae .LBB3_221
+# %bb.11:
+ xor edx, edx
+ jmp .LBB3_373
+.LBB3_12:
+ cmp sil, 4
+ je .LBB3_38
+# %bb.13:
+ cmp sil, 5
+ jne .LBB3_865
+# %bb.14:
+ cmp edi, 6
+ jg .LBB3_53
+# %bb.15:
+ cmp edi, 3
+ jle .LBB3_86
+# %bb.16:
+ cmp edi, 4
+ je .LBB3_137
+# %bb.17:
+ cmp edi, 5
+ je .LBB3_140
+# %bb.18:
+ cmp edi, 6
+ jne .LBB3_865
+# %bb.19:
+ test r8d, r8d
+ jle .LBB3_865
+# %bb.20:
+ mov r9d, r8d
+ cmp r8d, 32
+ jb .LBB3_21
+# %bb.223:
+ lea rax, [rdx + 4*r9]
+ cmp rax, rcx
+ jbe .LBB3_374
+# %bb.224:
+ lea rax, [rcx + 4*r9]
+ cmp rax, rdx
+ jbe .LBB3_374
+.LBB3_21:
+ xor esi, esi
+.LBB3_616:
+ mov r8, rsi
+ not r8
+ add r8, r9
+ mov rdi, r9
+ and rdi, 3
+ je .LBB3_618
+.LBB3_617: # =>This Inner Loop Header: Depth=1
+ xor eax, eax
+ sub eax, dword ptr [rdx + 4*rsi]
+ mov dword ptr [rcx + 4*rsi], eax
+ add rsi, 1
+ add rdi, -1
+ jne .LBB3_617
+.LBB3_618:
+ cmp r8, 3
+ jb .LBB3_865
+.LBB3_619: # =>This Inner Loop Header: Depth=1
+ xor eax, eax
+ sub eax, dword ptr [rdx + 4*rsi]
+ mov dword ptr [rcx + 4*rsi], eax
+ xor eax, eax
+ sub eax, dword ptr [rdx + 4*rsi + 4]
+ mov dword ptr [rcx + 4*rsi + 4], eax
+ xor eax, eax
+ sub eax, dword ptr [rdx + 4*rsi + 8]
+ mov dword ptr [rcx + 4*rsi + 8], eax
+ xor eax, eax
+ sub eax, dword ptr [rdx + 4*rsi + 12]
+ mov dword ptr [rcx + 4*rsi + 12], eax
+ add rsi, 4
+ cmp r9, rsi
+ jne .LBB3_619
+ jmp .LBB3_865
+.LBB3_22:
+ cmp edi, 6
+ jg .LBB3_60
+# %bb.23:
+ cmp edi, 3
+ jle .LBB3_91
+# %bb.24:
+ cmp edi, 4
+ je .LBB3_143
+# %bb.25:
+ cmp edi, 5
+ je .LBB3_146
+# %bb.26:
+ cmp edi, 6
+ jne .LBB3_865
+# %bb.27:
+ test r8d, r8d
+ jle .LBB3_865
+# %bb.28:
+ mov r9d, r8d
+ cmp r8d, 32
+ jb .LBB3_29
+# %bb.226:
+ lea rax, [rdx + 4*r9]
+ cmp rax, rcx
+ jbe .LBB3_377
+# %bb.227:
+ lea rax, [rcx + 4*r9]
+ cmp rax, rdx
+ jbe .LBB3_377
+.LBB3_29:
+ xor esi, esi
+.LBB3_380:
+ mov r8, rsi
+ not r8
+ add r8, r9
+ mov rax, r9
+ and rax, 3
+ je .LBB3_382
+.LBB3_381: # =>This Inner Loop Header: Depth=1
+ xor edi, edi
+ cmp dword ptr [rdx + 4*rsi], 0
+ setne dil
+ mov dword ptr [rcx + 4*rsi], edi
+ add rsi, 1
+ add rax, -1
+ jne .LBB3_381
+.LBB3_382:
+ cmp r8, 3
+ jb .LBB3_865
+.LBB3_383: # =>This Inner Loop Header: Depth=1
+ xor eax, eax
+ cmp dword ptr [rdx + 4*rsi], 0
+ setne al
+ mov dword ptr [rcx + 4*rsi], eax
+ xor eax, eax
+ cmp dword ptr [rdx + 4*rsi + 4], 0
+ setne al
+ mov dword ptr [rcx + 4*rsi + 4], eax
+ xor eax, eax
+ cmp dword ptr [rdx + 4*rsi + 8], 0
+ setne al
+ mov dword ptr [rcx + 4*rsi + 8], eax
+ xor eax, eax
+ cmp dword ptr [rdx + 4*rsi + 12], 0
+ setne al
+ mov dword ptr [rcx + 4*rsi + 12], eax
+ add rsi, 4
+ cmp r9, rsi
+ jne .LBB3_383
+ jmp .LBB3_865
+.LBB3_30:
+ cmp edi, 6
+ jg .LBB3_67
+# %bb.31:
+ cmp edi, 3
+ jle .LBB3_96
+# %bb.32:
+ cmp edi, 4
+ je .LBB3_149
+# %bb.33:
+ cmp edi, 5
+ je .LBB3_152
+# %bb.34:
+ cmp edi, 6
+ jne .LBB3_865
+# %bb.35:
+ test r8d, r8d
+ jle .LBB3_865
+# %bb.36:
+ mov r9d, r8d
+ cmp r8d, 32
+ jb .LBB3_37
+# %bb.229:
+ lea rax, [rdx + 4*r9]
+ cmp rax, rcx
+ jbe .LBB3_384
+# %bb.230:
+ lea rax, [rcx + 4*r9]
+ cmp rax, rdx
+ jbe .LBB3_384
+.LBB3_37:
+ xor esi, esi
+.LBB3_624:
+ mov r8, rsi
+ not r8
+ add r8, r9
+ mov rdi, r9
+ and rdi, 3
+ je .LBB3_626
+.LBB3_625: # =>This Inner Loop Header: Depth=1
+ mov eax, dword ptr [rdx + 4*rsi]
+ mov dword ptr [rcx + 4*rsi], eax
+ add rsi, 1
+ add rdi, -1
+ jne .LBB3_625
+.LBB3_626:
+ cmp r8, 3
+ jb .LBB3_865
+.LBB3_627: # =>This Inner Loop Header: Depth=1
+ mov eax, dword ptr [rdx + 4*rsi]
+ mov dword ptr [rcx + 4*rsi], eax
+ mov eax, dword ptr [rdx + 4*rsi + 4]
+ mov dword ptr [rcx + 4*rsi + 4], eax
+ mov eax, dword ptr [rdx + 4*rsi + 8]
+ mov dword ptr [rcx + 4*rsi + 8], eax
+ mov eax, dword ptr [rdx + 4*rsi + 12]
+ mov dword ptr [rcx + 4*rsi + 12], eax
+ add rsi, 4
+ cmp r9, rsi
+ jne .LBB3_627
+ jmp .LBB3_865
+.LBB3_38:
+ cmp edi, 6
+ jg .LBB3_74
+# %bb.39:
+ cmp edi, 3
+ jle .LBB3_101
+# %bb.40:
+ cmp edi, 4
+ je .LBB3_155
+# %bb.41:
+ cmp edi, 5
+ je .LBB3_158
+# %bb.42:
+ cmp edi, 6
+ jne .LBB3_865
+# %bb.43:
+ test r8d, r8d
+ jle .LBB3_865
+# %bb.44:
+ mov r9d, r8d
+ cmp r8d, 32
+ jb .LBB3_45
+# %bb.232:
+ lea rax, [rdx + 4*r9]
+ cmp rax, rcx
+ jbe .LBB3_387
+# %bb.233:
+ lea rax, [rcx + 4*r9]
+ cmp rax, rdx
+ jbe .LBB3_387
+.LBB3_45:
+ xor esi, esi
+.LBB3_632:
+ mov r8, rsi
+ not r8
+ add r8, r9
+ mov rdi, r9
+ and rdi, 3
+ je .LBB3_634
+.LBB3_633: # =>This Inner Loop Header: Depth=1
+ mov eax, dword ptr [rdx + 4*rsi]
+ mov dword ptr [rcx + 4*rsi], eax
+ add rsi, 1
+ add rdi, -1
+ jne .LBB3_633
+.LBB3_634:
+ cmp r8, 3
+ jb .LBB3_865
+.LBB3_635: # =>This Inner Loop Header: Depth=1
+ mov eax, dword ptr [rdx + 4*rsi]
+ mov dword ptr [rcx + 4*rsi], eax
+ mov eax, dword ptr [rdx + 4*rsi + 4]
+ mov dword ptr [rcx + 4*rsi + 4], eax
+ mov eax, dword ptr [rdx + 4*rsi + 8]
+ mov dword ptr [rcx + 4*rsi + 8], eax
+ mov eax, dword ptr [rdx + 4*rsi + 12]
+ mov dword ptr [rcx + 4*rsi + 12], eax
+ add rsi, 4
+ cmp r9, rsi
+ jne .LBB3_635
+ jmp .LBB3_865
+.LBB3_46:
+ cmp edi, 8
+ jle .LBB3_106
+# %bb.47:
+ cmp edi, 9
+ je .LBB3_161
+# %bb.48:
+ cmp edi, 11
+ je .LBB3_164
+# %bb.49:
+ cmp edi, 12
+ jne .LBB3_865
+# %bb.50:
+ test r8d, r8d
+ jle .LBB3_865
+# %bb.51:
+ mov r9d, r8d
+ cmp r8d, 16
+ jb .LBB3_52
+# %bb.235:
+ lea rax, [rdx + 8*r9]
+ cmp rax, rcx
+ jbe .LBB3_390
+# %bb.236:
+ lea rax, [rcx + 8*r9]
+ cmp rax, rdx
+ jbe .LBB3_390
+.LBB3_52:
+ xor esi, esi
+.LBB3_640:
+ mov rax, rsi
+ not rax
+ add rax, r9
+ mov rdi, r9
+ and rdi, 3
+ je .LBB3_643
+# %bb.641:
+ vmovapd xmm0, xmmword ptr [rip + .LCPI3_2] # xmm0 = [-0.0E+0,-0.0E+0]
+.LBB3_642: # =>This Inner Loop Header: Depth=1
+ vmovsd xmm1, qword ptr [rdx + 8*rsi] # xmm1 = mem[0],zero
+ vxorpd xmm1, xmm1, xmm0
+ vmovlpd qword ptr [rcx + 8*rsi], xmm1
+ add rsi, 1
+ add rdi, -1
+ jne .LBB3_642
+.LBB3_643:
+ cmp rax, 3
+ jb .LBB3_865
+# %bb.644:
+ vmovapd xmm0, xmmword ptr [rip + .LCPI3_2] # xmm0 = [-0.0E+0,-0.0E+0]
+.LBB3_645: # =>This Inner Loop Header: Depth=1
+ vmovsd xmm1, qword ptr [rdx + 8*rsi] # xmm1 = mem[0],zero
+ vxorpd xmm1, xmm1, xmm0
+ vmovlpd qword ptr [rcx + 8*rsi], xmm1
+ vmovsd xmm1, qword ptr [rdx + 8*rsi + 8] # xmm1 = mem[0],zero
+ vxorpd xmm1, xmm1, xmm0
+ vmovlpd qword ptr [rcx + 8*rsi + 8], xmm1
+ vmovsd xmm1, qword ptr [rdx + 8*rsi + 16] # xmm1 = mem[0],zero
+ vxorpd xmm1, xmm1, xmm0
+ vmovlpd qword ptr [rcx + 8*rsi + 16], xmm1
+ vmovsd xmm1, qword ptr [rdx + 8*rsi + 24] # xmm1 = mem[0],zero
+ vxorpd xmm1, xmm1, xmm0
+ vmovlpd qword ptr [rcx + 8*rsi + 24], xmm1
+ add rsi, 4
+ cmp r9, rsi
+ jne .LBB3_645
+ jmp .LBB3_865
+.LBB3_53:
+ cmp edi, 8
+ jle .LBB3_111
+# %bb.54:
+ cmp edi, 9
+ je .LBB3_167
+# %bb.55:
+ cmp edi, 11
+ je .LBB3_170
+# %bb.56:
+ cmp edi, 12
+ jne .LBB3_865
+# %bb.57:
+ test r8d, r8d
+ jle .LBB3_865
+# %bb.58:
+ mov r9d, r8d
+ cmp r8d, 16
+ jb .LBB3_59
+# %bb.238:
+ lea rax, [rdx + 8*r9]
+ cmp rax, rcx
+ jbe .LBB3_393
+# %bb.239:
+ lea rax, [rcx + 8*r9]
+ cmp rax, rdx
+ jbe .LBB3_393
+.LBB3_59:
+ xor esi, esi
+.LBB3_650:
+ mov rax, rsi
+ not rax
+ add rax, r9
+ mov rdi, r9
+ and rdi, 3
+ je .LBB3_653
+# %bb.651:
+ vmovapd xmm0, xmmword ptr [rip + .LCPI3_2] # xmm0 = [-0.0E+0,-0.0E+0]
+.LBB3_652: # =>This Inner Loop Header: Depth=1
+ vmovsd xmm1, qword ptr [rdx + 8*rsi] # xmm1 = mem[0],zero
+ vxorpd xmm1, xmm1, xmm0
+ vmovlpd qword ptr [rcx + 8*rsi], xmm1
+ add rsi, 1
+ add rdi, -1
+ jne .LBB3_652
+.LBB3_653:
+ cmp rax, 3
+ jb .LBB3_865
+# %bb.654:
+ vmovapd xmm0, xmmword ptr [rip + .LCPI3_2] # xmm0 = [-0.0E+0,-0.0E+0]
+.LBB3_655: # =>This Inner Loop Header: Depth=1
+ vmovsd xmm1, qword ptr [rdx + 8*rsi] # xmm1 = mem[0],zero
+ vxorpd xmm1, xmm1, xmm0
+ vmovlpd qword ptr [rcx + 8*rsi], xmm1
+ vmovsd xmm1, qword ptr [rdx + 8*rsi + 8] # xmm1 = mem[0],zero
+ vxorpd xmm1, xmm1, xmm0
+ vmovlpd qword ptr [rcx + 8*rsi + 8], xmm1
+ vmovsd xmm1, qword ptr [rdx + 8*rsi + 16] # xmm1 = mem[0],zero
+ vxorpd xmm1, xmm1, xmm0
+ vmovlpd qword ptr [rcx + 8*rsi + 16], xmm1
+ vmovsd xmm1, qword ptr [rdx + 8*rsi + 24] # xmm1 = mem[0],zero
+ vxorpd xmm1, xmm1, xmm0
+ vmovlpd qword ptr [rcx + 8*rsi + 24], xmm1
+ add rsi, 4
+ cmp r9, rsi
+ jne .LBB3_655
+ jmp .LBB3_865
+.LBB3_60:
+ cmp edi, 8
+ jle .LBB3_116
+# %bb.61:
+ cmp edi, 9
+ je .LBB3_173
+# %bb.62:
+ cmp edi, 11
+ je .LBB3_176
+# %bb.63:
+ cmp edi, 12
+ jne .LBB3_865
+# %bb.64:
+ test r8d, r8d
+ jle .LBB3_865
+# %bb.65:
+ mov eax, r8d
+ cmp r8d, 16
+ jb .LBB3_66
+# %bb.241:
+ lea rsi, [rdx + 8*rax]
+ cmp rsi, rcx
+ jbe .LBB3_396
+# %bb.242:
+ lea rsi, [rcx + 8*rax]
+ cmp rsi, rdx
+ jbe .LBB3_396
+.LBB3_66:
+ xor esi, esi
+.LBB3_399:
+ mov rdi, rsi
+ not rdi
+ test al, 1
+ je .LBB3_401
+# %bb.400:
+ vmovsd xmm0, qword ptr [rdx + 8*rsi] # xmm0 = mem[0],zero
+ vandpd xmm1, xmm0, xmmword ptr [rip + .LCPI3_2]
+ vmovddup xmm2, qword ptr [rip + .LCPI3_1] # xmm2 = [1.0E+0,1.0E+0]
+ # xmm2 = mem[0,0]
+ vorpd xmm1, xmm2, xmm1
+ vxorpd xmm2, xmm2, xmm2
+ vcmpeqsd xmm0, xmm0, xmm2
+ vandnpd xmm0, xmm0, xmm1
+ vmovlpd qword ptr [rcx + 8*rsi], xmm0
+ or rsi, 1
+.LBB3_401:
+ add rdi, rax
+ je .LBB3_865
+# %bb.402:
+ vmovapd xmm0, xmmword ptr [rip + .LCPI3_2] # xmm0 = [-0.0E+0,-0.0E+0]
+ vmovddup xmm1, qword ptr [rip + .LCPI3_1] # xmm1 = [1.0E+0,1.0E+0]
+ # xmm1 = mem[0,0]
+ vxorpd xmm2, xmm2, xmm2
+.LBB3_403: # =>This Inner Loop Header: Depth=1
+ vmovsd xmm3, qword ptr [rdx + 8*rsi] # xmm3 = mem[0],zero
+ vandpd xmm4, xmm3, xmm0
+ vorpd xmm4, xmm1, xmm4
+ vcmpeqsd xmm3, xmm3, xmm2
+ vandnpd xmm3, xmm3, xmm4
+ vmovlpd qword ptr [rcx + 8*rsi], xmm3
+ vmovsd xmm3, qword ptr [rdx + 8*rsi + 8] # xmm3 = mem[0],zero
+ vandpd xmm4, xmm3, xmm0
+ vorpd xmm4, xmm1, xmm4
+ vcmpeqsd xmm3, xmm3, xmm2
+ vandnpd xmm3, xmm3, xmm4
+ vmovlpd qword ptr [rcx + 8*rsi + 8], xmm3
+ add rsi, 2
+ cmp rax, rsi
+ jne .LBB3_403
+ jmp .LBB3_865
+.LBB3_67:
+ cmp edi, 8
+ jle .LBB3_121
+# %bb.68:
+ cmp edi, 9
+ je .LBB3_179
+# %bb.69:
+ cmp edi, 11
+ je .LBB3_182
+# %bb.70:
+ cmp edi, 12
+ jne .LBB3_865
+# %bb.71:
+ test r8d, r8d
+ jle .LBB3_865
+# %bb.72:
+ mov r9d, r8d
+ cmp r8d, 16
+ jb .LBB3_73
+# %bb.244:
+ lea rax, [rdx + 8*r9]
+ cmp rax, rcx
+ jbe .LBB3_404
+# %bb.245:
+ lea rax, [rcx + 8*r9]
+ cmp rax, rdx
+ jbe .LBB3_404
+.LBB3_73:
+ xor esi, esi
+.LBB3_660:
+ movabs r10, 9223372036854775807
+ mov r8, rsi
+ not r8
+ add r8, r9
+ mov rax, r9
+ and rax, 3
+ je .LBB3_662
+.LBB3_661: # =>This Inner Loop Header: Depth=1
+ mov rdi, qword ptr [rdx + 8*rsi]
+ and rdi, r10
+ mov qword ptr [rcx + 8*rsi], rdi
+ add rsi, 1
+ add rax, -1
+ jne .LBB3_661
+.LBB3_662:
+ cmp r8, 3
+ jb .LBB3_865
+.LBB3_663: # =>This Inner Loop Header: Depth=1
+ mov rax, qword ptr [rdx + 8*rsi]
+ and rax, r10
+ mov qword ptr [rcx + 8*rsi], rax
+ mov rax, qword ptr [rdx + 8*rsi + 8]
+ and rax, r10
+ mov qword ptr [rcx + 8*rsi + 8], rax
+ mov rax, qword ptr [rdx + 8*rsi + 16]
+ and rax, r10
+ mov qword ptr [rcx + 8*rsi + 16], rax
+ mov rax, qword ptr [rdx + 8*rsi + 24]
+ and rax, r10
+ mov qword ptr [rcx + 8*rsi + 24], rax
+ add rsi, 4
+ cmp r9, rsi
+ jne .LBB3_663
+ jmp .LBB3_865
+.LBB3_74:
+ cmp edi, 8
+ jle .LBB3_126
+# %bb.75:
+ cmp edi, 9
+ je .LBB3_185
+# %bb.76:
+ cmp edi, 11
+ je .LBB3_188
+# %bb.77:
+ cmp edi, 12
+ jne .LBB3_865
+# %bb.78:
+ test r8d, r8d
+ jle .LBB3_865
+# %bb.79:
+ mov r9d, r8d
+ cmp r8d, 16
+ jb .LBB3_80
+# %bb.247:
+ lea rax, [rdx + 8*r9]
+ cmp rax, rcx
+ jbe .LBB3_407
+# %bb.248:
+ lea rax, [rcx + 8*r9]
+ cmp rax, rdx
+ jbe .LBB3_407
+.LBB3_80:
+ xor esi, esi
+.LBB3_668:
+ movabs r10, 9223372036854775807
+ mov r8, rsi
+ not r8
+ add r8, r9
+ mov rax, r9
+ and rax, 3
+ je .LBB3_670
+.LBB3_669: # =>This Inner Loop Header: Depth=1
+ mov rdi, qword ptr [rdx + 8*rsi]
+ and rdi, r10
+ mov qword ptr [rcx + 8*rsi], rdi
+ add rsi, 1
+ add rax, -1
+ jne .LBB3_669
+.LBB3_670:
+ cmp r8, 3
+ jb .LBB3_865
+.LBB3_671: # =>This Inner Loop Header: Depth=1
+ mov rax, qword ptr [rdx + 8*rsi]
+ and rax, r10
+ mov qword ptr [rcx + 8*rsi], rax
+ mov rax, qword ptr [rdx + 8*rsi + 8]
+ and rax, r10
+ mov qword ptr [rcx + 8*rsi + 8], rax
+ mov rax, qword ptr [rdx + 8*rsi + 16]
+ and rax, r10
+ mov qword ptr [rcx + 8*rsi + 16], rax
+ mov rax, qword ptr [rdx + 8*rsi + 24]
+ and rax, r10
+ mov qword ptr [rcx + 8*rsi + 24], rax
+ add rsi, 4
+ cmp r9, rsi
+ jne .LBB3_671
+ jmp .LBB3_865
+.LBB3_81:
+ cmp edi, 2
+ je .LBB3_191
+# %bb.82:
+ cmp edi, 3
+ jne .LBB3_865
+# %bb.83:
+ test r8d, r8d
+ jle .LBB3_865
+# %bb.84:
+ mov r9d, r8d
+ cmp r8d, 128
+ jb .LBB3_85
+# %bb.250:
+ lea rax, [rdx + r9]
+ cmp rax, rcx
+ jbe .LBB3_410
+# %bb.251:
+ lea rax, [rcx + r9]
+ cmp rax, rdx
+ jbe .LBB3_410
+.LBB3_85:
+ xor esi, esi
+.LBB3_676:
+ mov r8, rsi
+ not r8
+ add r8, r9
+ mov rdi, r9
+ and rdi, 3
+ je .LBB3_678
+.LBB3_677: # =>This Inner Loop Header: Depth=1
+ movzx r10d, byte ptr [rdx + rsi]
+ xor eax, eax
+ sub al, r10b
+ mov byte ptr [rcx + rsi], al
+ add rsi, 1
+ add rdi, -1
+ jne .LBB3_677
+.LBB3_678:
+ cmp r8, 3
+ jb .LBB3_865
+.LBB3_679: # =>This Inner Loop Header: Depth=1
+ xor eax, eax
+ sub al, byte ptr [rdx + rsi]
+ mov byte ptr [rcx + rsi], al
+ xor eax, eax
+ sub al, byte ptr [rdx + rsi + 1]
+ mov byte ptr [rcx + rsi + 1], al
+ xor eax, eax
+ sub al, byte ptr [rdx + rsi + 2]
+ mov byte ptr [rcx + rsi + 2], al
+ movzx eax, byte ptr [rdx + rsi + 3]
+ xor edi, edi
+ sub dil, al
+ mov byte ptr [rcx + rsi + 3], dil
+ add rsi, 4
+ cmp r9, rsi
+ jne .LBB3_679
+ jmp .LBB3_865
+.LBB3_86:
+ cmp edi, 2
+ je .LBB3_194
+# %bb.87:
+ cmp edi, 3
+ jne .LBB3_865
+# %bb.88:
+ test r8d, r8d
+ jle .LBB3_865
+# %bb.89:
+ mov r9d, r8d
+ cmp r8d, 128
+ jb .LBB3_90
+# %bb.253:
+ lea rax, [rdx + r9]
+ cmp rax, rcx
+ jbe .LBB3_413
+# %bb.254:
+ lea rax, [rcx + r9]
+ cmp rax, rdx
+ jbe .LBB3_413
+.LBB3_90:
+ xor esi, esi
+.LBB3_684:
+ mov r8, rsi
+ not r8
+ add r8, r9
+ mov rdi, r9
+ and rdi, 3
+ je .LBB3_686
+.LBB3_685: # =>This Inner Loop Header: Depth=1
+ movzx r10d, byte ptr [rdx + rsi]
+ xor eax, eax
+ sub al, r10b
+ mov byte ptr [rcx + rsi], al
+ add rsi, 1
+ add rdi, -1
+ jne .LBB3_685
+.LBB3_686:
+ cmp r8, 3
+ jb .LBB3_865
+.LBB3_687: # =>This Inner Loop Header: Depth=1
+ xor eax, eax
+ sub al, byte ptr [rdx + rsi]
+ mov byte ptr [rcx + rsi], al
+ xor eax, eax
+ sub al, byte ptr [rdx + rsi + 1]
+ mov byte ptr [rcx + rsi + 1], al
+ xor eax, eax
+ sub al, byte ptr [rdx + rsi + 2]
+ mov byte ptr [rcx + rsi + 2], al
+ movzx eax, byte ptr [rdx + rsi + 3]
+ xor edi, edi
+ sub dil, al
+ mov byte ptr [rcx + rsi + 3], dil
+ add rsi, 4
+ cmp r9, rsi
+ jne .LBB3_687
+ jmp .LBB3_865
+.LBB3_91:
+ cmp edi, 2
+ je .LBB3_197
+# %bb.92:
+ cmp edi, 3
+ jne .LBB3_865
+# %bb.93:
+ test r8d, r8d
+ jle .LBB3_865
+# %bb.94:
+ mov r11d, r8d
+ cmp r8d, 128
+ jb .LBB3_95
+# %bb.256:
+ lea rsi, [rdx + r11]
+ cmp rsi, rcx
+ jbe .LBB3_416
+# %bb.257:
+ lea rsi, [rcx + r11]
+ cmp rsi, rdx
+ jbe .LBB3_416
+.LBB3_95:
+ xor esi, esi
+.LBB3_419:
+ mov r10, rsi
+ not r10
+ test r11b, 1
+ je .LBB3_421
+# %bb.420:
+ mov r8b, byte ptr [rdx + rsi]
+ test r8b, r8b
+ setne r9b
+ neg r9b
+ test r8b, r8b
+ movzx r8d, r9b
+ mov edi, 1
+ cmovle edi, r8d
+ mov byte ptr [rcx + rsi], dil
+ or rsi, 1
+.LBB3_421:
+ add r10, r11
+ je .LBB3_865
+# %bb.422:
+ mov edi, 1
+.LBB3_423: # =>This Inner Loop Header: Depth=1
+ movzx r8d, byte ptr [rdx + rsi]
+ test r8b, r8b
+ setne al
+ neg al
+ test r8b, r8b
+ movzx eax, al
+ cmovg eax, edi
+ mov byte ptr [rcx + rsi], al
+ movzx r8d, byte ptr [rdx + rsi + 1]
+ test r8b, r8b
+ setne al
+ neg al
+ test r8b, r8b
+ movzx eax, al
+ cmovg eax, edi
+ mov byte ptr [rcx + rsi + 1], al
+ add rsi, 2
+ cmp r11, rsi
+ jne .LBB3_423
+ jmp .LBB3_865
+.LBB3_96:
+ cmp edi, 2
+ je .LBB3_200
+# %bb.97:
+ cmp edi, 3
+ jne .LBB3_865
+# %bb.98:
+ test r8d, r8d
+ jle .LBB3_865
+# %bb.99:
+ mov r10d, r8d
+ cmp r8d, 32
+ jb .LBB3_100
+# %bb.259:
+ lea rsi, [rdx + r10]
+ cmp rsi, rcx
+ jbe .LBB3_424
+# %bb.260:
+ lea rsi, [rcx + r10]
+ cmp rsi, rdx
+ jbe .LBB3_424
+.LBB3_100:
+ xor esi, esi
+.LBB3_427:
+ mov r8, rsi
+ not r8
+ test r10b, 1
+ je .LBB3_429
+# %bb.428:
+ movsx edi, byte ptr [rdx + rsi]
+ mov r9d, edi
+ sar r9d, 7
+ add edi, r9d
+ xor edi, r9d
+ mov byte ptr [rcx + rsi], dil
+ or rsi, 1
+.LBB3_429:
+ add r8, r10
+ je .LBB3_865
+.LBB3_430: # =>This Inner Loop Header: Depth=1
+ movsx edi, byte ptr [rdx + rsi]
+ mov eax, edi
+ sar eax, 7
+ add edi, eax
+ xor edi, eax
+ mov byte ptr [rcx + rsi], dil
+ movsx eax, byte ptr [rdx + rsi + 1]
+ mov edi, eax
+ sar edi, 7
+ add eax, edi
+ xor eax, edi
+ mov byte ptr [rcx + rsi + 1], al
+ add rsi, 2
+ cmp r10, rsi
+ jne .LBB3_430
+ jmp .LBB3_865
+.LBB3_101:
+ cmp edi, 2
+ je .LBB3_203
+# %bb.102:
+ cmp edi, 3
+ jne .LBB3_865
+# %bb.103:
+ test r8d, r8d
+ jle .LBB3_865
+# %bb.104:
+ mov r10d, r8d
+ cmp r8d, 32
+ jb .LBB3_105
+# %bb.262:
+ lea rsi, [rdx + r10]
+ cmp rsi, rcx
+ jbe .LBB3_431
+# %bb.263:
+ lea rsi, [rcx + r10]
+ cmp rsi, rdx
+ jbe .LBB3_431
+.LBB3_105:
+ xor esi, esi
+.LBB3_434:
+ mov r8, rsi
+ not r8
+ test r10b, 1
+ je .LBB3_436
+# %bb.435:
+ movsx edi, byte ptr [rdx + rsi]
+ mov r9d, edi
+ sar r9d, 7
+ add edi, r9d
+ xor edi, r9d
+ mov byte ptr [rcx + rsi], dil
+ or rsi, 1
+.LBB3_436:
+ add r8, r10
+ je .LBB3_865
+.LBB3_437: # =>This Inner Loop Header: Depth=1
+ movsx edi, byte ptr [rdx + rsi]
+ mov eax, edi
+ sar eax, 7
+ add edi, eax
+ xor edi, eax
+ mov byte ptr [rcx + rsi], dil
+ movsx eax, byte ptr [rdx + rsi + 1]
+ mov edi, eax
+ sar edi, 7
+ add eax, edi
+ xor eax, edi
+ mov byte ptr [rcx + rsi + 1], al
+ add rsi, 2
+ cmp r10, rsi
+ jne .LBB3_437
+ jmp .LBB3_865
+.LBB3_106:
+ cmp edi, 7
+ je .LBB3_206
+# %bb.107:
+ cmp edi, 8
+ jne .LBB3_865
+# %bb.108:
+ test r8d, r8d
+ jle .LBB3_865
+# %bb.109:
+ mov r9d, r8d
+ cmp r8d, 16
+ jae .LBB3_265
+# %bb.110:
+ xor edx, edx
+ jmp .LBB3_444
+.LBB3_111:
+ cmp edi, 7
+ je .LBB3_209
+# %bb.112:
+ cmp edi, 8
+ jne .LBB3_865
+# %bb.113:
+ test r8d, r8d
+ jle .LBB3_865
+# %bb.114:
+ mov r9d, r8d
+ cmp r8d, 16
+ jb .LBB3_115
+# %bb.267:
+ lea rax, [rdx + 8*r9]
+ cmp rax, rcx
+ jbe .LBB3_445
+# %bb.268:
+ lea rax, [rcx + 8*r9]
+ cmp rax, rdx
+ jbe .LBB3_445
+.LBB3_115:
+ xor esi, esi
+.LBB3_692:
+ mov r8, rsi
+ not r8
+ add r8, r9
+ mov rdi, r9
+ and rdi, 3
+ je .LBB3_694
+.LBB3_693: # =>This Inner Loop Header: Depth=1
+ xor eax, eax
+ sub rax, qword ptr [rdx + 8*rsi]
+ mov qword ptr [rcx + 8*rsi], rax
+ add rsi, 1
+ add rdi, -1
+ jne .LBB3_693
+.LBB3_694:
+ cmp r8, 3
+ jb .LBB3_865
+.LBB3_695: # =>This Inner Loop Header: Depth=1
+ xor eax, eax
+ sub rax, qword ptr [rdx + 8*rsi]
+ mov qword ptr [rcx + 8*rsi], rax
+ xor eax, eax
+ sub rax, qword ptr [rdx + 8*rsi + 8]
+ mov qword ptr [rcx + 8*rsi + 8], rax
+ xor eax, eax
+ sub rax, qword ptr [rdx + 8*rsi + 16]
+ mov qword ptr [rcx + 8*rsi + 16], rax
+ xor eax, eax
+ sub rax, qword ptr [rdx + 8*rsi + 24]
+ mov qword ptr [rcx + 8*rsi + 24], rax
+ add rsi, 4
+ cmp r9, rsi
+ jne .LBB3_695
+ jmp .LBB3_865
+.LBB3_116:
+ cmp edi, 7
+ je .LBB3_212
+# %bb.117:
+ cmp edi, 8
+ jne .LBB3_865
+# %bb.118:
+ test r8d, r8d
+ jle .LBB3_865
+# %bb.119:
+ mov r9d, r8d
+ cmp r8d, 16
+ jb .LBB3_120
+# %bb.270:
+ lea rax, [rdx + 8*r9]
+ cmp rax, rcx
+ jbe .LBB3_448
+# %bb.271:
+ lea rax, [rcx + 8*r9]
+ cmp rax, rdx
+ jbe .LBB3_448
+.LBB3_120:
+ xor esi, esi
+.LBB3_451:
+ mov r8, rsi
+ not r8
+ add r8, r9
+ mov rax, r9
+ and rax, 3
+ je .LBB3_453
+.LBB3_452: # =>This Inner Loop Header: Depth=1
+ xor edi, edi
+ cmp qword ptr [rdx + 8*rsi], 0
+ setne dil
+ mov qword ptr [rcx + 8*rsi], rdi
+ add rsi, 1
+ add rax, -1
+ jne .LBB3_452
+.LBB3_453:
+ cmp r8, 3
+ jb .LBB3_865
+.LBB3_454: # =>This Inner Loop Header: Depth=1
+ xor eax, eax
+ cmp qword ptr [rdx + 8*rsi], 0
+ setne al
+ mov qword ptr [rcx + 8*rsi], rax
+ xor eax, eax
+ cmp qword ptr [rdx + 8*rsi + 8], 0
+ setne al
+ mov qword ptr [rcx + 8*rsi + 8], rax
+ xor eax, eax
+ cmp qword ptr [rdx + 8*rsi + 16], 0
+ setne al
+ mov qword ptr [rcx + 8*rsi + 16], rax
+ xor eax, eax
+ cmp qword ptr [rdx + 8*rsi + 24], 0
+ setne al
+ mov qword ptr [rcx + 8*rsi + 24], rax
+ add rsi, 4
+ cmp r9, rsi
+ jne .LBB3_454
+ jmp .LBB3_865
+.LBB3_121:
+ cmp edi, 7
+ je .LBB3_215
+# %bb.122:
+ cmp edi, 8
+ jne .LBB3_865
+# %bb.123:
+ test r8d, r8d
+ jle .LBB3_865
+# %bb.124:
+ mov r9d, r8d
+ cmp r8d, 16
+ jb .LBB3_125
+# %bb.273:
+ lea rax, [rdx + 8*r9]
+ cmp rax, rcx
+ jbe .LBB3_455
+# %bb.274:
+ lea rax, [rcx + 8*r9]
+ cmp rax, rdx
+ jbe .LBB3_455
+.LBB3_125:
+ xor esi, esi
+.LBB3_700:
+ mov r8, rsi
+ not r8
+ add r8, r9
+ mov rdi, r9
+ and rdi, 3
+ je .LBB3_702
+.LBB3_701: # =>This Inner Loop Header: Depth=1
+ mov rax, qword ptr [rdx + 8*rsi]
+ mov qword ptr [rcx + 8*rsi], rax
+ add rsi, 1
+ add rdi, -1
+ jne .LBB3_701
+.LBB3_702:
+ cmp r8, 3
+ jb .LBB3_865
+.LBB3_703: # =>This Inner Loop Header: Depth=1
+ mov rax, qword ptr [rdx + 8*rsi]
+ mov qword ptr [rcx + 8*rsi], rax
+ mov rax, qword ptr [rdx + 8*rsi + 8]
+ mov qword ptr [rcx + 8*rsi + 8], rax
+ mov rax, qword ptr [rdx + 8*rsi + 16]
+ mov qword ptr [rcx + 8*rsi + 16], rax
+ mov rax, qword ptr [rdx + 8*rsi + 24]
+ mov qword ptr [rcx + 8*rsi + 24], rax
+ add rsi, 4
+ cmp r9, rsi
+ jne .LBB3_703
+ jmp .LBB3_865
+.LBB3_126:
+ cmp edi, 7
+ je .LBB3_218
+# %bb.127:
+ cmp edi, 8
+ jne .LBB3_865
+# %bb.128:
+ test r8d, r8d
+ jle .LBB3_865
+# %bb.129:
+ mov r9d, r8d
+ cmp r8d, 16
+ jb .LBB3_130
+# %bb.276:
+ lea rax, [rdx + 8*r9]
+ cmp rax, rcx
+ jbe .LBB3_458
+# %bb.277:
+ lea rax, [rcx + 8*r9]
+ cmp rax, rdx
+ jbe .LBB3_458
+.LBB3_130:
+ xor esi, esi
+.LBB3_708:
+ mov r8, rsi
+ not r8
+ add r8, r9
+ mov rdi, r9
+ and rdi, 3
+ je .LBB3_710
+.LBB3_709: # =>This Inner Loop Header: Depth=1
+ mov rax, qword ptr [rdx + 8*rsi]
+ mov qword ptr [rcx + 8*rsi], rax
+ add rsi, 1
+ add rdi, -1
+ jne .LBB3_709
+.LBB3_710:
+ cmp r8, 3
+ jb .LBB3_865
+.LBB3_711: # =>This Inner Loop Header: Depth=1
+ mov rax, qword ptr [rdx + 8*rsi]
+ mov qword ptr [rcx + 8*rsi], rax
+ mov rax, qword ptr [rdx + 8*rsi + 8]
+ mov qword ptr [rcx + 8*rsi + 8], rax
+ mov rax, qword ptr [rdx + 8*rsi + 16]
+ mov qword ptr [rcx + 8*rsi + 16], rax
+ mov rax, qword ptr [rdx + 8*rsi + 24]
+ mov qword ptr [rcx + 8*rsi + 24], rax
+ add rsi, 4
+ cmp r9, rsi
+ jne .LBB3_711
+ jmp .LBB3_865
+.LBB3_131:
+ test r8d, r8d
+ jle .LBB3_865
+# %bb.132:
+ mov r9d, r8d
+ cmp r8d, 64
+ jae .LBB3_279
+# %bb.133:
+ xor edx, edx
+ jmp .LBB3_467
+.LBB3_134:
+ test r8d, r8d
+ jle .LBB3_865
+# %bb.135:
+ mov r9d, r8d
+ cmp r8d, 32
+ jb .LBB3_136
+# %bb.281:
+ lea rax, [rdx + 2*r9]
+ cmp rax, rcx
+ jbe .LBB3_468
+# %bb.282:
+ lea rax, [rcx + 2*r9]
+ cmp rax, rdx
+ jbe .LBB3_468
+.LBB3_136:
+ xor esi, esi
+.LBB3_716:
+ mov r8, rsi
+ not r8
+ add r8, r9
+ mov rax, r9
+ and rax, 3
+ je .LBB3_718
+.LBB3_717: # =>This Inner Loop Header: Depth=1
+ xor edi, edi
+ sub di, word ptr [rdx + 2*rsi]
+ mov word ptr [rcx + 2*rsi], di
+ add rsi, 1
+ add rax, -1
+ jne .LBB3_717
+.LBB3_718:
+ cmp r8, 3
+ jb .LBB3_865
+.LBB3_719: # =>This Inner Loop Header: Depth=1
+ xor eax, eax
+ sub ax, word ptr [rdx + 2*rsi]
+ mov word ptr [rcx + 2*rsi], ax
+ xor eax, eax
+ sub ax, word ptr [rdx + 2*rsi + 2]
+ mov word ptr [rcx + 2*rsi + 2], ax
+ xor eax, eax
+ sub ax, word ptr [rdx + 2*rsi + 4]
+ mov word ptr [rcx + 2*rsi + 4], ax
+ xor eax, eax
+ sub ax, word ptr [rdx + 2*rsi + 6]
+ mov word ptr [rcx + 2*rsi + 6], ax
+ add rsi, 4
+ cmp r9, rsi
+ jne .LBB3_719
+ jmp .LBB3_865
+.LBB3_137:
+ test r8d, r8d
+ jle .LBB3_865
+# %bb.138:
+ mov r9d, r8d
+ cmp r8d, 32
+ jb .LBB3_139
+# %bb.284:
+ lea rax, [rdx + 2*r9]
+ cmp rax, rcx
+ jbe .LBB3_471
+# %bb.285:
+ lea rax, [rcx + 2*r9]
+ cmp rax, rdx
+ jbe .LBB3_471
+.LBB3_139:
+ xor esi, esi
+.LBB3_724:
+ mov r8, rsi
+ not r8
+ add r8, r9
+ mov rax, r9
+ and rax, 3
+ je .LBB3_726
+.LBB3_725: # =>This Inner Loop Header: Depth=1
+ xor edi, edi
+ sub di, word ptr [rdx + 2*rsi]
+ mov word ptr [rcx + 2*rsi], di
+ add rsi, 1
+ add rax, -1
+ jne .LBB3_725
+.LBB3_726:
+ cmp r8, 3
+ jb .LBB3_865
+.LBB3_727: # =>This Inner Loop Header: Depth=1
+ xor eax, eax
+ sub ax, word ptr [rdx + 2*rsi]
+ mov word ptr [rcx + 2*rsi], ax
+ xor eax, eax
+ sub ax, word ptr [rdx + 2*rsi + 2]
+ mov word ptr [rcx + 2*rsi + 2], ax
+ xor eax, eax
+ sub ax, word ptr [rdx + 2*rsi + 4]
+ mov word ptr [rcx + 2*rsi + 4], ax
+ xor eax, eax
+ sub ax, word ptr [rdx + 2*rsi + 6]
+ mov word ptr [rcx + 2*rsi + 6], ax
+ add rsi, 4
+ cmp r9, rsi
+ jne .LBB3_727
+ jmp .LBB3_865
+.LBB3_140:
+ test r8d, r8d
+ jle .LBB3_865
+# %bb.141:
+ mov r9d, r8d
+ cmp r8d, 32
+ jb .LBB3_142
+# %bb.287:
+ lea rax, [rdx + 2*r9]
+ cmp rax, rcx
+ jbe .LBB3_474
+# %bb.288:
+ lea rax, [rcx + 2*r9]
+ cmp rax, rdx
+ jbe .LBB3_474
+.LBB3_142:
+ xor esi, esi
+.LBB3_732:
+ mov r8, rsi
+ not r8
+ add r8, r9
+ mov rax, r9
+ and rax, 3
+ je .LBB3_734
+.LBB3_733: # =>This Inner Loop Header: Depth=1
+ xor edi, edi
+ sub di, word ptr [rdx + 2*rsi]
+ mov word ptr [rcx + 2*rsi], di
+ add rsi, 1
+ add rax, -1
+ jne .LBB3_733
+.LBB3_734:
+ cmp r8, 3
+ jb .LBB3_865
+.LBB3_735: # =>This Inner Loop Header: Depth=1
+ xor eax, eax
+ sub ax, word ptr [rdx + 2*rsi]
+ mov word ptr [rcx + 2*rsi], ax
+ xor eax, eax
+ sub ax, word ptr [rdx + 2*rsi + 2]
+ mov word ptr [rcx + 2*rsi + 2], ax
+ xor eax, eax
+ sub ax, word ptr [rdx + 2*rsi + 4]
+ mov word ptr [rcx + 2*rsi + 4], ax
+ xor eax, eax
+ sub ax, word ptr [rdx + 2*rsi + 6]
+ mov word ptr [rcx + 2*rsi + 6], ax
+ add rsi, 4
+ cmp r9, rsi
+ jne .LBB3_735
+ jmp .LBB3_865
+.LBB3_143:
+ test r8d, r8d
+ jle .LBB3_865
+# %bb.144:
+ mov r9d, r8d
+ cmp r8d, 32
+ jb .LBB3_145
+# %bb.290:
+ lea rax, [rdx + 2*r9]
+ cmp rax, rcx
+ jbe .LBB3_477
+# %bb.291:
+ lea rax, [rcx + 2*r9]
+ cmp rax, rdx
+ jbe .LBB3_477
+.LBB3_145:
+ xor esi, esi
+.LBB3_740:
+ mov r8, rsi
+ not r8
+ add r8, r9
+ mov rax, r9
+ and rax, 3
+ je .LBB3_742
+.LBB3_741: # =>This Inner Loop Header: Depth=1
+ xor edi, edi
+ cmp word ptr [rdx + 2*rsi], 0
+ setne dil
+ mov word ptr [rcx + 2*rsi], di
+ add rsi, 1
+ add rax, -1
+ jne .LBB3_741
+.LBB3_742:
+ cmp r8, 3
+ jb .LBB3_865
+.LBB3_743: # =>This Inner Loop Header: Depth=1
+ xor eax, eax
+ cmp word ptr [rdx + 2*rsi], 0
+ setne al
+ mov word ptr [rcx + 2*rsi], ax
+ xor eax, eax
+ cmp word ptr [rdx + 2*rsi + 2], 0
+ setne al
+ mov word ptr [rcx + 2*rsi + 2], ax
+ xor eax, eax
+ cmp word ptr [rdx + 2*rsi + 4], 0
+ setne al
+ mov word ptr [rcx + 2*rsi + 4], ax
+ xor eax, eax
+ cmp word ptr [rdx + 2*rsi + 6], 0
+ setne al
+ mov word ptr [rcx + 2*rsi + 6], ax
+ add rsi, 4
+ cmp r9, rsi
+ jne .LBB3_743
+ jmp .LBB3_865
+.LBB3_146:
+ test r8d, r8d
+ jle .LBB3_865
+# %bb.147:
+ mov r9d, r8d
+ cmp r8d, 32
+ jb .LBB3_148
+# %bb.293:
+ lea rax, [rdx + 2*r9]
+ cmp rax, rcx
+ jbe .LBB3_480
+# %bb.294:
+ lea rax, [rcx + 2*r9]
+ cmp rax, rdx
+ jbe .LBB3_480
+.LBB3_148:
+ xor esi, esi
+.LBB3_748:
+ mov rax, rsi
+ not rax
+ test r9b, 1
+ je .LBB3_750
+# %bb.749:
+ movzx r8d, word ptr [rdx + 2*rsi]
+ xor r10d, r10d
+ test r8w, r8w
+ setne r10b
+ neg r10d
+ test r8w, r8w
+ mov edi, 1
+ cmovle edi, r10d
+ mov word ptr [rcx + 2*rsi], di
+ or rsi, 1
+.LBB3_750:
+ add rax, r9
+ je .LBB3_865
+# %bb.751:
+ mov r8d, 1
+.LBB3_752: # =>This Inner Loop Header: Depth=1
+ movzx edi, word ptr [rdx + 2*rsi]
+ xor eax, eax
+ test di, di
+ setne al
+ neg eax
+ test di, di
+ cmovg eax, r8d
+ mov word ptr [rcx + 2*rsi], ax
+ movzx eax, word ptr [rdx + 2*rsi + 2]
+ xor edi, edi
+ test ax, ax
+ setne dil
+ neg edi
+ test ax, ax
+ cmovg edi, r8d
+ mov word ptr [rcx + 2*rsi + 2], di
+ add rsi, 2
+ cmp r9, rsi
+ jne .LBB3_752
+ jmp .LBB3_865
+.LBB3_149:
+ test r8d, r8d
+ jle .LBB3_865
+# %bb.150:
+ mov r9d, r8d
+ cmp r8d, 32
+ jb .LBB3_151
+# %bb.296:
+ lea rax, [rdx + 2*r9]
+ cmp rax, rcx
+ jbe .LBB3_483
+# %bb.297:
+ lea rax, [rcx + 2*r9]
+ cmp rax, rdx
+ jbe .LBB3_483
+.LBB3_151:
+ xor esi, esi
+.LBB3_598:
+ mov r8, rsi
+ not r8
+ add r8, r9
+ mov rdi, r9
+ and rdi, 3
+ je .LBB3_600
+.LBB3_599: # =>This Inner Loop Header: Depth=1
+ movzx eax, word ptr [rdx + 2*rsi]
+ mov word ptr [rcx + 2*rsi], ax
+ add rsi, 1
+ add rdi, -1
+ jne .LBB3_599
+.LBB3_600:
+ cmp r8, 3
+ jb .LBB3_865
+.LBB3_601: # =>This Inner Loop Header: Depth=1
+ movzx eax, word ptr [rdx + 2*rsi]
+ mov word ptr [rcx + 2*rsi], ax
+ movzx eax, word ptr [rdx + 2*rsi + 2]
+ mov word ptr [rcx + 2*rsi + 2], ax
+ movzx eax, word ptr [rdx + 2*rsi + 4]
+ mov word ptr [rcx + 2*rsi + 4], ax
+ movzx eax, word ptr [rdx + 2*rsi + 6]
+ mov word ptr [rcx + 2*rsi + 6], ax
+ add rsi, 4
+ cmp r9, rsi
+ jne .LBB3_601
+ jmp .LBB3_865
+.LBB3_152:
+ test r8d, r8d
+ jle .LBB3_865
+# %bb.153:
+ mov r9d, r8d
+ cmp r8d, 16
+ jb .LBB3_154
+# %bb.299:
+ lea rax, [rdx + 2*r9]
+ cmp rax, rcx
+ jbe .LBB3_485
+# %bb.300:
+ lea rax, [rcx + 2*r9]
+ cmp rax, rdx
+ jbe .LBB3_485
+.LBB3_154:
+ xor esi, esi
+.LBB3_757:
+ mov rax, rsi
+ not rax
+ test r9b, 1
+ je .LBB3_759
+# %bb.758:
+ movsx edi, word ptr [rdx + 2*rsi]
+ mov r8d, edi
+ sar r8d, 15
+ add edi, r8d
+ xor edi, r8d
+ mov word ptr [rcx + 2*rsi], di
+ or rsi, 1
+.LBB3_759:
+ add rax, r9
+ je .LBB3_865
+.LBB3_760: # =>This Inner Loop Header: Depth=1
+ movsx eax, word ptr [rdx + 2*rsi]
+ mov edi, eax
+ sar edi, 15
+ add eax, edi
+ xor eax, edi
+ mov word ptr [rcx + 2*rsi], ax
+ movsx eax, word ptr [rdx + 2*rsi + 2]
+ mov edi, eax
+ sar edi, 15
+ add eax, edi
+ xor eax, edi
+ mov word ptr [rcx + 2*rsi + 2], ax
+ add rsi, 2
+ cmp r9, rsi
+ jne .LBB3_760
+ jmp .LBB3_865
+.LBB3_155:
+ test r8d, r8d
+ jle .LBB3_865
+# %bb.156:
+ mov r9d, r8d
+ cmp r8d, 32
+ jb .LBB3_157
+# %bb.302:
+ lea rax, [rdx + 2*r9]
+ cmp rax, rcx
+ jbe .LBB3_488
+# %bb.303:
+ lea rax, [rcx + 2*r9]
+ cmp rax, rdx
+ jbe .LBB3_488
+.LBB3_157:
+ xor esi, esi
+.LBB3_608:
+ mov r8, rsi
+ not r8
+ add r8, r9
+ mov rdi, r9
+ and rdi, 3
+ je .LBB3_610
+.LBB3_609: # =>This Inner Loop Header: Depth=1
+ movzx eax, word ptr [rdx + 2*rsi]
+ mov word ptr [rcx + 2*rsi], ax
+ add rsi, 1
+ add rdi, -1
+ jne .LBB3_609
+.LBB3_610:
+ cmp r8, 3
+ jb .LBB3_865
+.LBB3_611: # =>This Inner Loop Header: Depth=1
+ movzx eax, word ptr [rdx + 2*rsi]
+ mov word ptr [rcx + 2*rsi], ax
+ movzx eax, word ptr [rdx + 2*rsi + 2]
+ mov word ptr [rcx + 2*rsi + 2], ax
+ movzx eax, word ptr [rdx + 2*rsi + 4]
+ mov word ptr [rcx + 2*rsi + 4], ax
+ movzx eax, word ptr [rdx + 2*rsi + 6]
+ mov word ptr [rcx + 2*rsi + 6], ax
+ add rsi, 4
+ cmp r9, rsi
+ jne .LBB3_611
+ jmp .LBB3_865
+.LBB3_158:
+ test r8d, r8d
+ jle .LBB3_865
+# %bb.159:
+ mov r9d, r8d
+ cmp r8d, 16
+ jb .LBB3_160
+# %bb.305:
+ lea rax, [rdx + 2*r9]
+ cmp rax, rcx
+ jbe .LBB3_490
+# %bb.306:
+ lea rax, [rcx + 2*r9]
+ cmp rax, rdx
+ jbe .LBB3_490
+.LBB3_160:
+ xor esi, esi
+.LBB3_765:
+ mov rax, rsi
+ not rax
+ test r9b, 1
+ je .LBB3_767
+# %bb.766:
+ movsx edi, word ptr [rdx + 2*rsi]
+ mov r8d, edi
+ sar r8d, 15
+ add edi, r8d
+ xor edi, r8d
+ mov word ptr [rcx + 2*rsi], di
+ or rsi, 1
+.LBB3_767:
+ add rax, r9
+ je .LBB3_865
+.LBB3_768: # =>This Inner Loop Header: Depth=1
+ movsx eax, word ptr [rdx + 2*rsi]
+ mov edi, eax
+ sar edi, 15
+ add eax, edi
+ xor eax, edi
+ mov word ptr [rcx + 2*rsi], ax
+ movsx eax, word ptr [rdx + 2*rsi + 2]
+ mov edi, eax
+ sar edi, 15
+ add eax, edi
+ xor eax, edi
+ mov word ptr [rcx + 2*rsi + 2], ax
+ add rsi, 2
+ cmp r9, rsi
+ jne .LBB3_768
+ jmp .LBB3_865
+.LBB3_161:
+ test r8d, r8d
+ jle .LBB3_865
+# %bb.162:
+ mov r9d, r8d
+ cmp r8d, 16
+ jb .LBB3_163
+# %bb.308:
+ lea rax, [rdx + 8*r9]
+ cmp rax, rcx
+ jbe .LBB3_493
+# %bb.309:
+ lea rax, [rcx + 8*r9]
+ cmp rax, rdx
+ jbe .LBB3_493
+.LBB3_163:
+ xor esi, esi
+.LBB3_773:
+ mov r8, rsi
+ not r8
+ add r8, r9
+ mov rdi, r9
+ and rdi, 3
+ je .LBB3_775
+.LBB3_774: # =>This Inner Loop Header: Depth=1
+ xor eax, eax
+ sub rax, qword ptr [rdx + 8*rsi]
+ mov qword ptr [rcx + 8*rsi], rax
+ add rsi, 1
+ add rdi, -1
+ jne .LBB3_774
+.LBB3_775:
+ cmp r8, 3
+ jb .LBB3_865
+.LBB3_776: # =>This Inner Loop Header: Depth=1
+ xor eax, eax
+ sub rax, qword ptr [rdx + 8*rsi]
+ mov qword ptr [rcx + 8*rsi], rax
+ xor eax, eax
+ sub rax, qword ptr [rdx + 8*rsi + 8]
+ mov qword ptr [rcx + 8*rsi + 8], rax
+ xor eax, eax
+ sub rax, qword ptr [rdx + 8*rsi + 16]
+ mov qword ptr [rcx + 8*rsi + 16], rax
+ xor eax, eax
+ sub rax, qword ptr [rdx + 8*rsi + 24]
+ mov qword ptr [rcx + 8*rsi + 24], rax
+ add rsi, 4
+ cmp r9, rsi
+ jne .LBB3_776
+ jmp .LBB3_865
+.LBB3_164:
+ test r8d, r8d
+ jle .LBB3_865
+# %bb.165:
+ mov r9d, r8d
+ cmp r8d, 32
+ jb .LBB3_166
+# %bb.311:
+ lea rax, [rdx + 4*r9]
+ cmp rax, rcx
+ jbe .LBB3_496
+# %bb.312:
+ lea rax, [rcx + 4*r9]
+ cmp rax, rdx
+ jbe .LBB3_496
+.LBB3_166:
+ xor esi, esi
+.LBB3_781:
+ mov rax, rsi
+ not rax
+ add rax, r9
+ mov rdi, r9
+ and rdi, 3
+ je .LBB3_784
+# %bb.782:
+ vbroadcastss xmm0, dword ptr [rip + .LCPI3_7] # xmm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+.LBB3_783: # =>This Inner Loop Header: Depth=1
+ vmovss xmm1, dword ptr [rdx + 4*rsi] # xmm1 = mem[0],zero,zero,zero
+ vxorpd xmm1, xmm1, xmm0
+ vmovss dword ptr [rcx + 4*rsi], xmm1
+ add rsi, 1
+ add rdi, -1
+ jne .LBB3_783
+.LBB3_784:
+ cmp rax, 3
+ jb .LBB3_865
+# %bb.785:
+ vbroadcastss xmm0, dword ptr [rip + .LCPI3_7] # xmm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+.LBB3_786: # =>This Inner Loop Header: Depth=1
+ vmovss xmm1, dword ptr [rdx + 4*rsi] # xmm1 = mem[0],zero,zero,zero
+ vxorpd xmm1, xmm1, xmm0
+ vmovss dword ptr [rcx + 4*rsi], xmm1
+ vmovss xmm1, dword ptr [rdx + 4*rsi + 4] # xmm1 = mem[0],zero,zero,zero
+ vxorpd xmm1, xmm1, xmm0
+ vmovss dword ptr [rcx + 4*rsi + 4], xmm1
+ vmovss xmm1, dword ptr [rdx + 4*rsi + 8] # xmm1 = mem[0],zero,zero,zero
+ vxorpd xmm1, xmm1, xmm0
+ vmovss dword ptr [rcx + 4*rsi + 8], xmm1
+ vmovss xmm1, dword ptr [rdx + 4*rsi + 12] # xmm1 = mem[0],zero,zero,zero
+ vxorpd xmm1, xmm1, xmm0
+ vmovss dword ptr [rcx + 4*rsi + 12], xmm1
+ add rsi, 4
+ cmp r9, rsi
+ jne .LBB3_786
+ jmp .LBB3_865
+.LBB3_167:
+ test r8d, r8d
+ jle .LBB3_865
+# %bb.168:
+ mov r9d, r8d
+ cmp r8d, 16
+ jb .LBB3_169
+# %bb.314:
+ lea rax, [rdx + 8*r9]
+ cmp rax, rcx
+ jbe .LBB3_499
+# %bb.315:
+ lea rax, [rcx + 8*r9]
+ cmp rax, rdx
+ jbe .LBB3_499
+.LBB3_169:
+ xor esi, esi
+.LBB3_791:
+ mov r8, rsi
+ not r8
+ add r8, r9
+ mov rdi, r9
+ and rdi, 3
+ je .LBB3_793
+.LBB3_792: # =>This Inner Loop Header: Depth=1
+ xor eax, eax
+ sub rax, qword ptr [rdx + 8*rsi]
+ mov qword ptr [rcx + 8*rsi], rax
+ add rsi, 1
+ add rdi, -1
+ jne .LBB3_792
+.LBB3_793:
+ cmp r8, 3
+ jb .LBB3_865
+.LBB3_794: # =>This Inner Loop Header: Depth=1
+ xor eax, eax
+ sub rax, qword ptr [rdx + 8*rsi]
+ mov qword ptr [rcx + 8*rsi], rax
+ xor eax, eax
+ sub rax, qword ptr [rdx + 8*rsi + 8]
+ mov qword ptr [rcx + 8*rsi + 8], rax
+ xor eax, eax
+ sub rax, qword ptr [rdx + 8*rsi + 16]
+ mov qword ptr [rcx + 8*rsi + 16], rax
+ xor eax, eax
+ sub rax, qword ptr [rdx + 8*rsi + 24]
+ mov qword ptr [rcx + 8*rsi + 24], rax
+ add rsi, 4
+ cmp r9, rsi
+ jne .LBB3_794
+ jmp .LBB3_865
+.LBB3_170:
+ test r8d, r8d
+ jle .LBB3_865
+# %bb.171:
+ mov r9d, r8d
+ cmp r8d, 32
+ jb .LBB3_172
+# %bb.317:
+ lea rax, [rdx + 4*r9]
+ cmp rax, rcx
+ jbe .LBB3_502
+# %bb.318:
+ lea rax, [rcx + 4*r9]
+ cmp rax, rdx
+ jbe .LBB3_502
+.LBB3_172:
+ xor esi, esi
+.LBB3_799:
+ mov rax, rsi
+ not rax
+ add rax, r9
+ mov rdi, r9
+ and rdi, 3
+ je .LBB3_802
+# %bb.800:
+ vbroadcastss xmm0, dword ptr [rip + .LCPI3_7] # xmm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+.LBB3_801: # =>This Inner Loop Header: Depth=1
+ vmovss xmm1, dword ptr [rdx + 4*rsi] # xmm1 = mem[0],zero,zero,zero
+ vxorpd xmm1, xmm1, xmm0
+ vmovss dword ptr [rcx + 4*rsi], xmm1
+ add rsi, 1
+ add rdi, -1
+ jne .LBB3_801
+.LBB3_802:
+ cmp rax, 3
+ jb .LBB3_865
+# %bb.803:
+ vbroadcastss xmm0, dword ptr [rip + .LCPI3_7] # xmm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+.LBB3_804: # =>This Inner Loop Header: Depth=1
+ vmovss xmm1, dword ptr [rdx + 4*rsi] # xmm1 = mem[0],zero,zero,zero
+ vxorpd xmm1, xmm1, xmm0
+ vmovss dword ptr [rcx + 4*rsi], xmm1
+ vmovss xmm1, dword ptr [rdx + 4*rsi + 4] # xmm1 = mem[0],zero,zero,zero
+ vxorpd xmm1, xmm1, xmm0
+ vmovss dword ptr [rcx + 4*rsi + 4], xmm1
+ vmovss xmm1, dword ptr [rdx + 4*rsi + 8] # xmm1 = mem[0],zero,zero,zero
+ vxorpd xmm1, xmm1, xmm0
+ vmovss dword ptr [rcx + 4*rsi + 8], xmm1
+ vmovss xmm1, dword ptr [rdx + 4*rsi + 12] # xmm1 = mem[0],zero,zero,zero
+ vxorpd xmm1, xmm1, xmm0
+ vmovss dword ptr [rcx + 4*rsi + 12], xmm1
+ add rsi, 4
+ cmp r9, rsi
+ jne .LBB3_804
+ jmp .LBB3_865
+.LBB3_173:
+ test r8d, r8d
+ jle .LBB3_865
+# %bb.174:
+ mov r11d, r8d
+ cmp r8d, 16
+ jb .LBB3_175
+# %bb.320:
+ lea rsi, [rdx + 8*r11]
+ cmp rsi, rcx
+ jbe .LBB3_505
+# %bb.321:
+ lea rsi, [rcx + 8*r11]
+ cmp rsi, rdx
+ jbe .LBB3_505
+.LBB3_175:
+ xor esi, esi
+.LBB3_508:
+ mov r10, rsi
+ not r10
+ test r11b, 1
+ je .LBB3_510
+# %bb.509:
+ mov r8, qword ptr [rdx + 8*rsi]
+ xor r9d, r9d
+ test r8, r8
+ setne r9b
+ neg r9
+ test r8, r8
+ mov edi, 1
+ cmovle rdi, r9
+ mov qword ptr [rcx + 8*rsi], rdi
+ or rsi, 1
+.LBB3_510:
+ add r10, r11
+ je .LBB3_865
+# %bb.511:
+ mov r8d, 1
+.LBB3_512: # =>This Inner Loop Header: Depth=1
+ mov rax, qword ptr [rdx + 8*rsi]
+ xor edi, edi
+ test rax, rax
+ setne dil
+ neg rdi
+ test rax, rax
+ cmovg rdi, r8
+ mov qword ptr [rcx + 8*rsi], rdi
+ mov rax, qword ptr [rdx + 8*rsi + 8]
+ xor edi, edi
+ test rax, rax
+ setne dil
+ neg rdi
+ test rax, rax
+ cmovg rdi, r8
+ mov qword ptr [rcx + 8*rsi + 8], rdi
+ add rsi, 2
+ cmp r11, rsi
+ jne .LBB3_512
+ jmp .LBB3_865
+.LBB3_176:
+ test r8d, r8d
+ jle .LBB3_865
+# %bb.177:
+ mov eax, r8d
+ cmp r8d, 32
+ jb .LBB3_178
+# %bb.323:
+ lea rsi, [rdx + 4*rax]
+ cmp rsi, rcx
+ jbe .LBB3_513
+# %bb.324:
+ lea rsi, [rcx + 4*rax]
+ cmp rsi, rdx
+ jbe .LBB3_513
+.LBB3_178:
+ xor esi, esi
+.LBB3_516:
+ mov r8, rsi
+ not r8
+ test al, 1
+ je .LBB3_518
+# %bb.517:
+ vmovss xmm0, dword ptr [rdx + 4*rsi] # xmm0 = mem[0],zero,zero,zero
+ vmovmskps edi, xmm0
+ and edi, 1
+ neg edi
+ or edi, 1
+ vcvtsi2ss xmm1, xmm10, edi
+ vxorps xmm2, xmm2, xmm2
+ vcmpeqss xmm0, xmm0, xmm2
+ vandnps xmm0, xmm0, xmm1
+ vmovss dword ptr [rcx + 4*rsi], xmm0
+ or rsi, 1
+.LBB3_518:
+ add r8, rax
+ je .LBB3_865
+# %bb.519:
+ vxorps xmm0, xmm0, xmm0
+.LBB3_520: # =>This Inner Loop Header: Depth=1
+ vmovss xmm1, dword ptr [rdx + 4*rsi] # xmm1 = mem[0],zero,zero,zero
+ vmovmskps edi, xmm1
+ and edi, 1
+ neg edi
+ or edi, 1
+ vcvtsi2ss xmm2, xmm10, edi
+ vcmpeqss xmm1, xmm1, xmm0
+ vandnps xmm1, xmm1, xmm2
+ vmovss dword ptr [rcx + 4*rsi], xmm1
+ vmovss xmm1, dword ptr [rdx + 4*rsi + 4] # xmm1 = mem[0],zero,zero,zero
+ vmovmskps edi, xmm1
+ and edi, 1
+ neg edi
+ or edi, 1
+ vcvtsi2ss xmm2, xmm10, edi
+ vcmpeqss xmm1, xmm1, xmm0
+ vandnps xmm1, xmm1, xmm2
+ vmovss dword ptr [rcx + 4*rsi + 4], xmm1
+ add rsi, 2
+ cmp rax, rsi
+ jne .LBB3_520
+ jmp .LBB3_865
+.LBB3_179:
+ test r8d, r8d
+ jle .LBB3_865
+# %bb.180:
+ mov r10d, r8d
+ cmp r8d, 16
+ jb .LBB3_181
+# %bb.326:
+ lea rsi, [rdx + 8*r10]
+ cmp rsi, rcx
+ jbe .LBB3_521
+# %bb.327:
+ lea rsi, [rcx + 8*r10]
+ cmp rsi, rdx
+ jbe .LBB3_521
+.LBB3_181:
+ xor esi, esi
+.LBB3_524:
+ mov r9, rsi
+ not r9
+ test r10b, 1
+ je .LBB3_526
+# %bb.525:
+ mov r8, qword ptr [rdx + 8*rsi]
+ mov rdi, r8
+ neg rdi
+ cmovl rdi, r8
+ mov qword ptr [rcx + 8*rsi], rdi
+ or rsi, 1
+.LBB3_526:
+ add r9, r10
+ je .LBB3_865
+.LBB3_527: # =>This Inner Loop Header: Depth=1
+ mov rax, qword ptr [rdx + 8*rsi]
+ mov rdi, rax
+ neg rdi
+ cmovl rdi, rax
+ mov qword ptr [rcx + 8*rsi], rdi
+ mov rax, qword ptr [rdx + 8*rsi + 8]
+ mov rdi, rax
+ neg rdi
+ cmovl rdi, rax
+ mov qword ptr [rcx + 8*rsi + 8], rdi
+ add rsi, 2
+ cmp r10, rsi
+ jne .LBB3_527
+ jmp .LBB3_865
+.LBB3_182:
+ test r8d, r8d
+ jle .LBB3_865
+# %bb.183:
+ mov r9d, r8d
+ cmp r8d, 32
+ jb .LBB3_184
+# %bb.329:
+ lea rax, [rdx + 4*r9]
+ cmp rax, rcx
+ jbe .LBB3_528
+# %bb.330:
+ lea rax, [rcx + 4*r9]
+ cmp rax, rdx
+ jbe .LBB3_528
+.LBB3_184:
+ xor esi, esi
+.LBB3_809:
+ mov r8, rsi
+ not r8
+ add r8, r9
+ mov rdi, r9
+ and rdi, 3
+ je .LBB3_812
+# %bb.810:
+ mov r10d, 2147483647
+.LBB3_811: # =>This Inner Loop Header: Depth=1
+ mov eax, dword ptr [rdx + 4*rsi]
+ and eax, r10d
+ mov dword ptr [rcx + 4*rsi], eax
+ add rsi, 1
+ add rdi, -1
+ jne .LBB3_811
+.LBB3_812:
+ cmp r8, 3
+ jb .LBB3_865
+# %bb.813:
+ mov eax, 2147483647
+.LBB3_814: # =>This Inner Loop Header: Depth=1
+ mov edi, dword ptr [rdx + 4*rsi]
+ and edi, eax
+ mov dword ptr [rcx + 4*rsi], edi
+ mov edi, dword ptr [rdx + 4*rsi + 4]
+ and edi, eax
+ mov dword ptr [rcx + 4*rsi + 4], edi
+ mov edi, dword ptr [rdx + 4*rsi + 8]
+ and edi, eax
+ mov dword ptr [rcx + 4*rsi + 8], edi
+ mov edi, dword ptr [rdx + 4*rsi + 12]
+ and edi, eax
+ mov dword ptr [rcx + 4*rsi + 12], edi
+ add rsi, 4
+ cmp r9, rsi
+ jne .LBB3_814
+ jmp .LBB3_865
+.LBB3_185:
+ test r8d, r8d
+ jle .LBB3_865
+# %bb.186:
+ mov r10d, r8d
+ cmp r8d, 16
+ jb .LBB3_187
+# %bb.332:
+ lea rsi, [rdx + 8*r10]
+ cmp rsi, rcx
+ jbe .LBB3_531
+# %bb.333:
+ lea rsi, [rcx + 8*r10]
+ cmp rsi, rdx
+ jbe .LBB3_531
+.LBB3_187:
+ xor esi, esi
+.LBB3_534:
+ mov r9, rsi
+ not r9
+ test r10b, 1
+ je .LBB3_536
+# %bb.535:
+ mov r8, qword ptr [rdx + 8*rsi]
+ mov rdi, r8
+ neg rdi
+ cmovl rdi, r8
+ mov qword ptr [rcx + 8*rsi], rdi
+ or rsi, 1
+.LBB3_536:
+ add r9, r10
+ je .LBB3_865
+.LBB3_537: # =>This Inner Loop Header: Depth=1
+ mov rax, qword ptr [rdx + 8*rsi]
+ mov rdi, rax
+ neg rdi
+ cmovl rdi, rax
+ mov qword ptr [rcx + 8*rsi], rdi
+ mov rax, qword ptr [rdx + 8*rsi + 8]
+ mov rdi, rax
+ neg rdi
+ cmovl rdi, rax
+ mov qword ptr [rcx + 8*rsi + 8], rdi
+ add rsi, 2
+ cmp r10, rsi
+ jne .LBB3_537
+ jmp .LBB3_865
+.LBB3_188:
+ test r8d, r8d
+ jle .LBB3_865
+# %bb.189:
+ mov r9d, r8d
+ cmp r8d, 32
+ jb .LBB3_190
+# %bb.335:
+ lea rax, [rdx + 4*r9]
+ cmp rax, rcx
+ jbe .LBB3_538
+# %bb.336:
+ lea rax, [rcx + 4*r9]
+ cmp rax, rdx
+ jbe .LBB3_538
+.LBB3_190:
+ xor esi, esi
+.LBB3_819:
+ mov r8, rsi
+ not r8
+ add r8, r9
+ mov rdi, r9
+ and rdi, 3
+ je .LBB3_822
+# %bb.820:
+ mov r10d, 2147483647
+.LBB3_821: # =>This Inner Loop Header: Depth=1
+ mov eax, dword ptr [rdx + 4*rsi]
+ and eax, r10d
+ mov dword ptr [rcx + 4*rsi], eax
+ add rsi, 1
+ add rdi, -1
+ jne .LBB3_821
+.LBB3_822:
+ cmp r8, 3
+ jb .LBB3_865
+# %bb.823:
+ mov eax, 2147483647
+.LBB3_824: # =>This Inner Loop Header: Depth=1
+ mov edi, dword ptr [rdx + 4*rsi]
+ and edi, eax
+ mov dword ptr [rcx + 4*rsi], edi
+ mov edi, dword ptr [rdx + 4*rsi + 4]
+ and edi, eax
+ mov dword ptr [rcx + 4*rsi + 4], edi
+ mov edi, dword ptr [rdx + 4*rsi + 8]
+ and edi, eax
+ mov dword ptr [rcx + 4*rsi + 8], edi
+ mov edi, dword ptr [rdx + 4*rsi + 12]
+ and edi, eax
+ mov dword ptr [rcx + 4*rsi + 12], edi
+ add rsi, 4
+ cmp r9, rsi
+ jne .LBB3_824
+ jmp .LBB3_865
+.LBB3_191:
+ test r8d, r8d
+ jle .LBB3_865
+# %bb.192:
+ mov r9d, r8d
+ cmp r8d, 128
+ jae .LBB3_338
+# %bb.193:
+ xor edx, edx
+ jmp .LBB3_547
+.LBB3_194:
+ test r8d, r8d
+ jle .LBB3_865
+# %bb.195:
+ mov r9d, r8d
+ cmp r8d, 128
+ jb .LBB3_196
+# %bb.340:
+ lea rax, [rdx + r9]
+ cmp rax, rcx
+ jbe .LBB3_548
+# %bb.341:
+ lea rax, [rcx + r9]
+ cmp rax, rdx
+ jbe .LBB3_548
+.LBB3_196:
+ xor esi, esi
+.LBB3_829:
+ mov r8, rsi
+ not r8
+ add r8, r9
+ mov rdi, r9
+ and rdi, 3
+ je .LBB3_831
+.LBB3_830: # =>This Inner Loop Header: Depth=1
+ movzx r10d, byte ptr [rdx + rsi]
+ xor eax, eax
+ sub al, r10b
+ mov byte ptr [rcx + rsi], al
+ add rsi, 1
+ add rdi, -1
+ jne .LBB3_830
+.LBB3_831:
+ cmp r8, 3
+ jb .LBB3_865
+.LBB3_832: # =>This Inner Loop Header: Depth=1
+ xor eax, eax
+ sub al, byte ptr [rdx + rsi]
+ mov byte ptr [rcx + rsi], al
+ xor eax, eax
+ sub al, byte ptr [rdx + rsi + 1]
+ mov byte ptr [rcx + rsi + 1], al
+ xor eax, eax
+ sub al, byte ptr [rdx + rsi + 2]
+ mov byte ptr [rcx + rsi + 2], al
+ movzx eax, byte ptr [rdx + rsi + 3]
+ xor edi, edi
+ sub dil, al
+ mov byte ptr [rcx + rsi + 3], dil
+ add rsi, 4
+ cmp r9, rsi
+ jne .LBB3_832
+ jmp .LBB3_865
+.LBB3_197:
+ test r8d, r8d
+ jle .LBB3_865
+# %bb.198:
+ mov r9d, r8d
+ cmp r8d, 128
+ jb .LBB3_199
+# %bb.343:
+ lea rax, [rdx + r9]
+ cmp rax, rcx
+ jbe .LBB3_551
+# %bb.344:
+ lea rax, [rcx + r9]
+ cmp rax, rdx
+ jbe .LBB3_551
+.LBB3_199:
+ xor esi, esi
+.LBB3_554:
+ mov rdi, rsi
+ not rdi
+ add rdi, r9
+ mov rax, r9
+ and rax, 3
+ je .LBB3_556
+.LBB3_555: # =>This Inner Loop Header: Depth=1
+ cmp byte ptr [rdx + rsi], 0
+ setne byte ptr [rcx + rsi]
+ add rsi, 1
+ add rax, -1
+ jne .LBB3_555
+.LBB3_556:
+ cmp rdi, 3
+ jb .LBB3_865
+.LBB3_557: # =>This Inner Loop Header: Depth=1
+ cmp byte ptr [rdx + rsi], 0
+ setne byte ptr [rcx + rsi]
+ cmp byte ptr [rdx + rsi + 1], 0
+ setne byte ptr [rcx + rsi + 1]
+ cmp byte ptr [rdx + rsi + 2], 0
+ setne byte ptr [rcx + rsi + 2]
+ cmp byte ptr [rdx + rsi + 3], 0
+ setne byte ptr [rcx + rsi + 3]
+ add rsi, 4
+ cmp r9, rsi
+ jne .LBB3_557
+ jmp .LBB3_865
+.LBB3_200:
+ test r8d, r8d
+ jle .LBB3_865
+# %bb.201:
+ mov r9d, r8d
+ cmp r8d, 128
+ jb .LBB3_202
+# %bb.346:
+ lea rax, [rdx + r9]
+ cmp rax, rcx
+ jbe .LBB3_558
+# %bb.347:
+ lea rax, [rcx + r9]
+ cmp rax, rdx
+ jbe .LBB3_558
+.LBB3_202:
+ xor esi, esi
+.LBB3_837:
+ mov r8, rsi
+ not r8
+ add r8, r9
+ mov rdi, r9
+ and rdi, 3
+ je .LBB3_839
+.LBB3_838: # =>This Inner Loop Header: Depth=1
+ movzx eax, byte ptr [rdx + rsi]
+ mov byte ptr [rcx + rsi], al
+ add rsi, 1
+ add rdi, -1
+ jne .LBB3_838
+.LBB3_839:
+ cmp r8, 3
+ jb .LBB3_865
+.LBB3_840: # =>This Inner Loop Header: Depth=1
+ movzx eax, byte ptr [rdx + rsi]
+ mov byte ptr [rcx + rsi], al
+ movzx eax, byte ptr [rdx + rsi + 1]
+ mov byte ptr [rcx + rsi + 1], al
+ movzx eax, byte ptr [rdx + rsi + 2]
+ mov byte ptr [rcx + rsi + 2], al
+ movzx eax, byte ptr [rdx + rsi + 3]
+ mov byte ptr [rcx + rsi + 3], al
+ add rsi, 4
+ cmp r9, rsi
+ jne .LBB3_840
+ jmp .LBB3_865
+.LBB3_203:
+ test r8d, r8d
+ jle .LBB3_865
+# %bb.204:
+ mov r9d, r8d
+ cmp r8d, 128
+ jb .LBB3_205
+# %bb.349:
+ lea rax, [rdx + r9]
+ cmp rax, rcx
+ jbe .LBB3_561
+# %bb.350:
+ lea rax, [rcx + r9]
+ cmp rax, rdx
+ jbe .LBB3_561
+.LBB3_205:
+ xor esi, esi
+.LBB3_845:
+ mov r8, rsi
+ not r8
+ add r8, r9
+ mov rdi, r9
+ and rdi, 3
+ je .LBB3_847
+.LBB3_846: # =>This Inner Loop Header: Depth=1
+ movzx eax, byte ptr [rdx + rsi]
+ mov byte ptr [rcx + rsi], al
+ add rsi, 1
+ add rdi, -1
+ jne .LBB3_846
+.LBB3_847:
+ cmp r8, 3
+ jb .LBB3_865
+.LBB3_848: # =>This Inner Loop Header: Depth=1
+ movzx eax, byte ptr [rdx + rsi]
+ mov byte ptr [rcx + rsi], al
+ movzx eax, byte ptr [rdx + rsi + 1]
+ mov byte ptr [rcx + rsi + 1], al
+ movzx eax, byte ptr [rdx + rsi + 2]
+ mov byte ptr [rcx + rsi + 2], al
+ movzx eax, byte ptr [rdx + rsi + 3]
+ mov byte ptr [rcx + rsi + 3], al
+ add rsi, 4
+ cmp r9, rsi
+ jne .LBB3_848
+ jmp .LBB3_865
+.LBB3_206:
+ test r8d, r8d
+ jle .LBB3_865
+# %bb.207:
+ mov r9d, r8d
+ cmp r8d, 32
+ jb .LBB3_208
+# %bb.352:
+ lea rax, [rdx + 4*r9]
+ cmp rax, rcx
+ jbe .LBB3_564
+# %bb.353:
+ lea rax, [rcx + 4*r9]
+ cmp rax, rdx
+ jbe .LBB3_564
+.LBB3_208:
+ xor esi, esi
+.LBB3_853:
+ mov r8, rsi
+ not r8
+ add r8, r9
+ mov rdi, r9
+ and rdi, 3
+ je .LBB3_855
+.LBB3_854: # =>This Inner Loop Header: Depth=1
+ xor eax, eax
+ sub eax, dword ptr [rdx + 4*rsi]
+ mov dword ptr [rcx + 4*rsi], eax
+ add rsi, 1
+ add rdi, -1
+ jne .LBB3_854
+.LBB3_855:
+ cmp r8, 3
+ jb .LBB3_865
+.LBB3_856: # =>This Inner Loop Header: Depth=1
+ xor eax, eax
+ sub eax, dword ptr [rdx + 4*rsi]
+ mov dword ptr [rcx + 4*rsi], eax
+ xor eax, eax
+ sub eax, dword ptr [rdx + 4*rsi + 4]
+ mov dword ptr [rcx + 4*rsi + 4], eax
+ xor eax, eax
+ sub eax, dword ptr [rdx + 4*rsi + 8]
+ mov dword ptr [rcx + 4*rsi + 8], eax
+ xor eax, eax
+ sub eax, dword ptr [rdx + 4*rsi + 12]
+ mov dword ptr [rcx + 4*rsi + 12], eax
+ add rsi, 4
+ cmp r9, rsi
+ jne .LBB3_856
+ jmp .LBB3_865
+.LBB3_209:
+ test r8d, r8d
+ jle .LBB3_865
+# %bb.210:
+ mov r9d, r8d
+ cmp r8d, 32
+ jb .LBB3_211
+# %bb.355:
+ lea rax, [rdx + 4*r9]
+ cmp rax, rcx
+ jbe .LBB3_567
+# %bb.356:
+ lea rax, [rcx + 4*r9]
+ cmp rax, rdx
+ jbe .LBB3_567
+.LBB3_211:
+ xor esi, esi
+.LBB3_861:
+ mov r8, rsi
+ not r8
+ add r8, r9
+ mov rdi, r9
+ and rdi, 3
+ je .LBB3_863
+.LBB3_862: # =>This Inner Loop Header: Depth=1
+ xor eax, eax
+ sub eax, dword ptr [rdx + 4*rsi]
+ mov dword ptr [rcx + 4*rsi], eax
+ add rsi, 1
+ add rdi, -1
+ jne .LBB3_862
+.LBB3_863:
+ cmp r8, 3
+ jb .LBB3_865
+.LBB3_864: # =>This Inner Loop Header: Depth=1
+ xor eax, eax
+ sub eax, dword ptr [rdx + 4*rsi]
+ mov dword ptr [rcx + 4*rsi], eax
+ xor eax, eax
+ sub eax, dword ptr [rdx + 4*rsi + 4]
+ mov dword ptr [rcx + 4*rsi + 4], eax
+ xor eax, eax
+ sub eax, dword ptr [rdx + 4*rsi + 8]
+ mov dword ptr [rcx + 4*rsi + 8], eax
+ xor eax, eax
+ sub eax, dword ptr [rdx + 4*rsi + 12]
+ mov dword ptr [rcx + 4*rsi + 12], eax
+ add rsi, 4
+ cmp r9, rsi
+ jne .LBB3_864
+ jmp .LBB3_865
+.LBB3_212:
+ test r8d, r8d
+ jle .LBB3_865
+# %bb.213:
+ mov r11d, r8d
+ cmp r8d, 32
+ jb .LBB3_214
+# %bb.358:
+ lea rsi, [rdx + 4*r11]
+ cmp rsi, rcx
+ jbe .LBB3_570
+# %bb.359:
+ lea rsi, [rcx + 4*r11]
+ cmp rsi, rdx
+ jbe .LBB3_570
+.LBB3_214:
+ xor esi, esi
+.LBB3_573:
+ mov r10, rsi
+ not r10
+ test r11b, 1
+ je .LBB3_575
+# %bb.574:
+ mov r8d, dword ptr [rdx + 4*rsi]
+ xor r9d, r9d
+ test r8d, r8d
+ setne r9b
+ neg r9d
+ test r8d, r8d
+ mov edi, 1
+ cmovle edi, r9d
+ mov dword ptr [rcx + 4*rsi], edi
+ or rsi, 1
+.LBB3_575:
+ add r10, r11
+ je .LBB3_865
+# %bb.576:
+ mov r8d, 1
+.LBB3_577: # =>This Inner Loop Header: Depth=1
+ mov eax, dword ptr [rdx + 4*rsi]
+ xor edi, edi
+ test eax, eax
+ setne dil
+ neg edi
+ test eax, eax
+ cmovg edi, r8d
+ mov dword ptr [rcx + 4*rsi], edi
+ mov eax, dword ptr [rdx + 4*rsi + 4]
+ xor edi, edi
+ test eax, eax
+ setne dil
+ neg edi
+ test eax, eax
+ cmovg edi, r8d
+ mov dword ptr [rcx + 4*rsi + 4], edi
+ add rsi, 2
+ cmp r11, rsi
+ jne .LBB3_577
+ jmp .LBB3_865
+.LBB3_215:
+ test r8d, r8d
+ jle .LBB3_865
+# %bb.216:
+ mov r10d, r8d
+ cmp r8d, 32
+ jb .LBB3_217
+# %bb.361:
+ lea rsi, [rdx + 4*r10]
+ cmp rsi, rcx
+ jbe .LBB3_578
+# %bb.362:
+ lea rsi, [rcx + 4*r10]
+ cmp rsi, rdx
+ jbe .LBB3_578
+.LBB3_217:
+ xor esi, esi
+.LBB3_581:
+ mov r9, rsi
+ not r9
+ test r10b, 1
+ je .LBB3_583
+# %bb.582:
+ mov r8d, dword ptr [rdx + 4*rsi]
+ mov edi, r8d
+ neg edi
+ cmovl edi, r8d
+ mov dword ptr [rcx + 4*rsi], edi
+ or rsi, 1
+.LBB3_583:
+ add r9, r10
+ je .LBB3_865
+.LBB3_584: # =>This Inner Loop Header: Depth=1
+ mov eax, dword ptr [rdx + 4*rsi]
+ mov edi, eax
+ neg edi
+ cmovl edi, eax
+ mov dword ptr [rcx + 4*rsi], edi
+ mov eax, dword ptr [rdx + 4*rsi + 4]
+ mov edi, eax
+ neg edi
+ cmovl edi, eax
+ mov dword ptr [rcx + 4*rsi + 4], edi
+ add rsi, 2
+ cmp r10, rsi
+ jne .LBB3_584
+ jmp .LBB3_865
+.LBB3_218:
+ test r8d, r8d
+ jle .LBB3_865
+# %bb.219:
+ mov r10d, r8d
+ cmp r8d, 32
+ jb .LBB3_220
+# %bb.364:
+ lea rsi, [rdx + 4*r10]
+ cmp rsi, rcx
+ jbe .LBB3_585
+# %bb.365:
+ lea rsi, [rcx + 4*r10]
+ cmp rsi, rdx
+ jbe .LBB3_585
+.LBB3_220:
+ xor esi, esi
+.LBB3_588:
+ mov r9, rsi
+ not r9
+ test r10b, 1
+ je .LBB3_590
+# %bb.589:
+ mov r8d, dword ptr [rdx + 4*rsi]
+ mov edi, r8d
+ neg edi
+ cmovl edi, r8d
+ mov dword ptr [rcx + 4*rsi], edi
+ or rsi, 1
+.LBB3_590:
+ add r9, r10
+ je .LBB3_865
+.LBB3_591: # =>This Inner Loop Header: Depth=1
+ mov eax, dword ptr [rdx + 4*rsi]
+ mov edi, eax
+ neg edi
+ cmovl edi, eax
+ mov dword ptr [rcx + 4*rsi], edi
+ mov eax, dword ptr [rdx + 4*rsi + 4]
+ mov edi, eax
+ neg edi
+ cmovl edi, eax
+ mov dword ptr [rcx + 4*rsi + 4], edi
+ add rsi, 2
+ cmp r10, rsi
+ jne .LBB3_591
+ jmp .LBB3_865
+.LBB3_221:
+ mov edx, r9d
+ and edx, -32
+ lea rax, [rdx - 32]
+ mov rdi, rax
+ shr rdi, 5
+ add rdi, 1
+ mov esi, edi
+ and esi, 3
+ cmp rax, 96
+ jae .LBB3_367
+# %bb.222:
+ xor eax, eax
+ jmp .LBB3_369
+.LBB3_265:
+ mov edx, r9d
+ and edx, -16
+ lea rax, [rdx - 16]
+ mov rdi, rax
+ shr rdi, 4
+ add rdi, 1
+ mov esi, edi
+ and esi, 3
+ cmp rax, 48
+ jae .LBB3_438
+# %bb.266:
+ xor eax, eax
+ jmp .LBB3_440
+.LBB3_279:
+ mov edx, r9d
+ and edx, -64
+ lea rax, [rdx - 64]
+ mov rdi, rax
+ shr rdi, 6
+ add rdi, 1
+ mov esi, edi
+ and esi, 3
+ cmp rax, 192
+ jae .LBB3_461
+# %bb.280:
+ xor eax, eax
+ jmp .LBB3_463
+.LBB3_338:
+ mov edx, r9d
+ and edx, -128
+ lea rax, [rdx - 128]
+ mov rdi, rax
+ shr rdi, 7
+ add rdi, 1
+ mov esi, edi
+ and esi, 3
+ cmp rax, 384
+ jae .LBB3_541
+# %bb.339:
+ xor eax, eax
+ jmp .LBB3_543
+.LBB3_374:
+ mov esi, r9d
+ and esi, -32
+ lea rax, [rsi - 32]
+ mov r8, rax
+ shr r8, 5
+ add r8, 1
+ test rax, rax
+ je .LBB3_612
+# %bb.375:
+ mov rax, r8
+ and rax, -2
+ neg rax
+ xor edi, edi
+ vpxor xmm0, xmm0, xmm0
+.LBB3_376: # =>This Inner Loop Header: Depth=1
+ vpsubd ymm1, ymm0, ymmword ptr [rdx + 4*rdi]
+ vpsubd ymm2, ymm0, ymmword ptr [rdx + 4*rdi + 32]
+ vpsubd ymm3, ymm0, ymmword ptr [rdx + 4*rdi + 64]
+ vpsubd ymm4, ymm0, ymmword ptr [rdx + 4*rdi + 96]
+ vmovdqu ymmword ptr [rcx + 4*rdi], ymm1
+ vmovdqu ymmword ptr [rcx + 4*rdi + 32], ymm2
+ vmovdqu ymmword ptr [rcx + 4*rdi + 64], ymm3
+ vmovdqu ymmword ptr [rcx + 4*rdi + 96], ymm4
+ vpsubd ymm1, ymm0, ymmword ptr [rdx + 4*rdi + 128]
+ vpsubd ymm2, ymm0, ymmword ptr [rdx + 4*rdi + 160]
+ vpsubd ymm3, ymm0, ymmword ptr [rdx + 4*rdi + 192]
+ vpsubd ymm4, ymm0, ymmword ptr [rdx + 4*rdi + 224]
+ vmovdqu ymmword ptr [rcx + 4*rdi + 128], ymm1
+ vmovdqu ymmword ptr [rcx + 4*rdi + 160], ymm2
+ vmovdqu ymmword ptr [rcx + 4*rdi + 192], ymm3
+ vmovdqu ymmword ptr [rcx + 4*rdi + 224], ymm4
+ add rdi, 64
+ add rax, 2
+ jne .LBB3_376
+ jmp .LBB3_613
+.LBB3_377:
+ mov esi, r9d
+ and esi, -32
+ xor edi, edi
+ vpxor xmm0, xmm0, xmm0
+ vpbroadcastd ymm1, dword ptr [rip + .LCPI3_3] # ymm1 = [1,1,1,1,1,1,1,1]
+.LBB3_378: # =>This Inner Loop Header: Depth=1
+ vpcmpeqd ymm2, ymm0, ymmword ptr [rdx + 4*rdi]
+ vpandn ymm2, ymm2, ymm1
+ vpcmpeqd ymm3, ymm0, ymmword ptr [rdx + 4*rdi + 32]
+ vpandn ymm3, ymm3, ymm1
+ vpcmpeqd ymm4, ymm0, ymmword ptr [rdx + 4*rdi + 64]
+ vpcmpeqd ymm5, ymm0, ymmword ptr [rdx + 4*rdi + 96]
+ vpandn ymm4, ymm4, ymm1
+ vpandn ymm5, ymm5, ymm1
+ vmovdqu ymmword ptr [rcx + 4*rdi], ymm2
+ vmovdqu ymmword ptr [rcx + 4*rdi + 32], ymm3
+ vmovdqu ymmword ptr [rcx + 4*rdi + 64], ymm4
+ vmovdqu ymmword ptr [rcx + 4*rdi + 96], ymm5
+ add rdi, 32
+ cmp rsi, rdi
+ jne .LBB3_378
+# %bb.379:
+ cmp rsi, r9
+ je .LBB3_865
+ jmp .LBB3_380
+.LBB3_384:
+ mov esi, r9d
+ and esi, -32
+ lea rax, [rsi - 32]
+ mov r8, rax
+ shr r8, 5
+ add r8, 1
+ test rax, rax
+ je .LBB3_620
+# %bb.385:
+ mov rax, r8
+ and rax, -2
+ neg rax
+ xor edi, edi
+.LBB3_386: # =>This Inner Loop Header: Depth=1
+ vmovups ymm0, ymmword ptr [rdx + 4*rdi]
+ vmovups ymm1, ymmword ptr [rdx + 4*rdi + 32]
+ vmovups ymm2, ymmword ptr [rdx + 4*rdi + 64]
+ vmovups ymm3, ymmword ptr [rdx + 4*rdi + 96]
+ vmovups ymmword ptr [rcx + 4*rdi], ymm0
+ vmovups ymmword ptr [rcx + 4*rdi + 32], ymm1
+ vmovups ymmword ptr [rcx + 4*rdi + 64], ymm2
+ vmovups ymmword ptr [rcx + 4*rdi + 96], ymm3
+ vmovupd ymm0, ymmword ptr [rdx + 4*rdi + 128]
+ vmovupd ymm1, ymmword ptr [rdx + 4*rdi + 160]
+ vmovupd ymm2, ymmword ptr [rdx + 4*rdi + 192]
+ vmovupd ymm3, ymmword ptr [rdx + 4*rdi + 224]
+ vmovupd ymmword ptr [rcx + 4*rdi + 128], ymm0
+ vmovupd ymmword ptr [rcx + 4*rdi + 160], ymm1
+ vmovupd ymmword ptr [rcx + 4*rdi + 192], ymm2
+ vmovupd ymmword ptr [rcx + 4*rdi + 224], ymm3
+ add rdi, 64
+ add rax, 2
+ jne .LBB3_386
+ jmp .LBB3_621
+.LBB3_387:
+ mov esi, r9d
+ and esi, -32
+ lea rax, [rsi - 32]
+ mov r8, rax
+ shr r8, 5
+ add r8, 1
+ test rax, rax
+ je .LBB3_628
+# %bb.388:
+ mov rax, r8
+ and rax, -2
+ neg rax
+ xor edi, edi
+.LBB3_389: # =>This Inner Loop Header: Depth=1
+ vmovups ymm0, ymmword ptr [rdx + 4*rdi]
+ vmovups ymm1, ymmword ptr [rdx + 4*rdi + 32]
+ vmovups ymm2, ymmword ptr [rdx + 4*rdi + 64]
+ vmovups ymm3, ymmword ptr [rdx + 4*rdi + 96]
+ vmovups ymmword ptr [rcx + 4*rdi], ymm0
+ vmovups ymmword ptr [rcx + 4*rdi + 32], ymm1
+ vmovups ymmword ptr [rcx + 4*rdi + 64], ymm2
+ vmovups ymmword ptr [rcx + 4*rdi + 96], ymm3
+ vmovupd ymm0, ymmword ptr [rdx + 4*rdi + 128]
+ vmovupd ymm1, ymmword ptr [rdx + 4*rdi + 160]
+ vmovupd ymm2, ymmword ptr [rdx + 4*rdi + 192]
+ vmovupd ymm3, ymmword ptr [rdx + 4*rdi + 224]
+ vmovupd ymmword ptr [rcx + 4*rdi + 128], ymm0
+ vmovupd ymmword ptr [rcx + 4*rdi + 160], ymm1
+ vmovupd ymmword ptr [rcx + 4*rdi + 192], ymm2
+ vmovupd ymmword ptr [rcx + 4*rdi + 224], ymm3
+ add rdi, 64
+ add rax, 2
+ jne .LBB3_389
+ jmp .LBB3_629
+.LBB3_390:
+ mov esi, r9d
+ and esi, -16
+ lea rax, [rsi - 16]
+ mov r8, rax
+ shr r8, 4
+ add r8, 1
+ test rax, rax
+ je .LBB3_636
+# %bb.391:
+ mov rax, r8
+ and rax, -2
+ neg rax
+ xor edi, edi
+ vbroadcastsd ymm0, qword ptr [rip + .LCPI3_0] # ymm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+.LBB3_392: # =>This Inner Loop Header: Depth=1
+ vxorpd ymm1, ymm0, ymmword ptr [rdx + 8*rdi]
+ vxorpd ymm2, ymm0, ymmword ptr [rdx + 8*rdi + 32]
+ vxorpd ymm3, ymm0, ymmword ptr [rdx + 8*rdi + 64]
+ vxorpd ymm4, ymm0, ymmword ptr [rdx + 8*rdi + 96]
+ vmovupd ymmword ptr [rcx + 8*rdi], ymm1
+ vmovupd ymmword ptr [rcx + 8*rdi + 32], ymm2
+ vmovupd ymmword ptr [rcx + 8*rdi + 64], ymm3
+ vmovupd ymmword ptr [rcx + 8*rdi + 96], ymm4
+ vxorpd ymm1, ymm0, ymmword ptr [rdx + 8*rdi + 128]
+ vxorpd ymm2, ymm0, ymmword ptr [rdx + 8*rdi + 160]
+ vxorpd ymm3, ymm0, ymmword ptr [rdx + 8*rdi + 192]
+ vxorpd ymm4, ymm0, ymmword ptr [rdx + 8*rdi + 224]
+ vmovupd ymmword ptr [rcx + 8*rdi + 128], ymm1
+ vmovupd ymmword ptr [rcx + 8*rdi + 160], ymm2
+ vmovupd ymmword ptr [rcx + 8*rdi + 192], ymm3
+ vmovupd ymmword ptr [rcx + 8*rdi + 224], ymm4
+ add rdi, 32
+ add rax, 2
+ jne .LBB3_392
+ jmp .LBB3_637
+.LBB3_393:
+ mov esi, r9d
+ and esi, -16
+ lea rax, [rsi - 16]
+ mov r8, rax
+ shr r8, 4
+ add r8, 1
+ test rax, rax
+ je .LBB3_646
+# %bb.394:
+ mov rax, r8
+ and rax, -2
+ neg rax
+ xor edi, edi
+ vbroadcastsd ymm0, qword ptr [rip + .LCPI3_0] # ymm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+.LBB3_395: # =>This Inner Loop Header: Depth=1
+ vxorpd ymm1, ymm0, ymmword ptr [rdx + 8*rdi]
+ vxorpd ymm2, ymm0, ymmword ptr [rdx + 8*rdi + 32]
+ vxorpd ymm3, ymm0, ymmword ptr [rdx + 8*rdi + 64]
+ vxorpd ymm4, ymm0, ymmword ptr [rdx + 8*rdi + 96]
+ vmovupd ymmword ptr [rcx + 8*rdi], ymm1
+ vmovupd ymmword ptr [rcx + 8*rdi + 32], ymm2
+ vmovupd ymmword ptr [rcx + 8*rdi + 64], ymm3
+ vmovupd ymmword ptr [rcx + 8*rdi + 96], ymm4
+ vxorpd ymm1, ymm0, ymmword ptr [rdx + 8*rdi + 128]
+ vxorpd ymm2, ymm0, ymmword ptr [rdx + 8*rdi + 160]
+ vxorpd ymm3, ymm0, ymmword ptr [rdx + 8*rdi + 192]
+ vxorpd ymm4, ymm0, ymmword ptr [rdx + 8*rdi + 224]
+ vmovupd ymmword ptr [rcx + 8*rdi + 128], ymm1
+ vmovupd ymmword ptr [rcx + 8*rdi + 160], ymm2
+ vmovupd ymmword ptr [rcx + 8*rdi + 192], ymm3
+ vmovupd ymmword ptr [rcx + 8*rdi + 224], ymm4
+ add rdi, 32
+ add rax, 2
+ jne .LBB3_395
+ jmp .LBB3_647
+.LBB3_396:
+ mov esi, eax
+ and esi, -16
+ xor edi, edi
+ vxorpd xmm0, xmm0, xmm0
+ vbroadcastsd ymm1, qword ptr [rip + .LCPI3_0] # ymm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+ vbroadcastsd ymm2, qword ptr [rip + .LCPI3_1] # ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+.LBB3_397: # =>This Inner Loop Header: Depth=1
+ vmovupd ymm3, ymmword ptr [rdx + 8*rdi]
+ vmovupd ymm4, ymmword ptr [rdx + 8*rdi + 32]
+ vmovupd ymm5, ymmword ptr [rdx + 8*rdi + 64]
+ vmovupd ymm6, ymmword ptr [rdx + 8*rdi + 96]
+ vandpd ymm7, ymm3, ymm1
+ vorpd ymm7, ymm2, ymm7
+ vandpd ymm8, ymm4, ymm1
+ vorpd ymm8, ymm8, ymm2
+ vandpd ymm9, ymm5, ymm1
+ vorpd ymm9, ymm9, ymm2
+ vandpd ymm10, ymm6, ymm1
+ vorpd ymm10, ymm10, ymm2
+ vcmpneqpd ymm3, ymm3, ymm0
+ vandpd ymm3, ymm3, ymm7
+ vcmpneqpd ymm4, ymm4, ymm0
+ vandpd ymm4, ymm8, ymm4
+ vcmpneqpd ymm5, ymm5, ymm0
+ vandpd ymm5, ymm9, ymm5
+ vcmpneqpd ymm6, ymm6, ymm0
+ vandpd ymm6, ymm10, ymm6
+ vmovupd ymmword ptr [rcx + 8*rdi], ymm3
+ vmovupd ymmword ptr [rcx + 8*rdi + 32], ymm4
+ vmovupd ymmword ptr [rcx + 8*rdi + 64], ymm5
+ vmovupd ymmword ptr [rcx + 8*rdi + 96], ymm6
+ add rdi, 16
+ cmp rsi, rdi
+ jne .LBB3_397
+# %bb.398:
+ cmp rsi, rax
+ je .LBB3_865
+ jmp .LBB3_399
+.LBB3_404:
+ mov esi, r9d
+ and esi, -16
+ lea rax, [rsi - 16]
+ mov r8, rax
+ shr r8, 4
+ add r8, 1
+ test rax, rax
+ je .LBB3_656
+# %bb.405:
+ mov rax, r8
+ and rax, -2
+ neg rax
+ xor edi, edi
+ vbroadcastsd ymm0, qword ptr [rip + .LCPI3_8] # ymm0 = [9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807]
+.LBB3_406: # =>This Inner Loop Header: Depth=1
+ vandpd ymm1, ymm0, ymmword ptr [rdx + 8*rdi]
+ vandpd ymm2, ymm0, ymmword ptr [rdx + 8*rdi + 32]
+ vandpd ymm3, ymm0, ymmword ptr [rdx + 8*rdi + 64]
+ vandpd ymm4, ymm0, ymmword ptr [rdx + 8*rdi + 96]
+ vmovupd ymmword ptr [rcx + 8*rdi], ymm1
+ vmovupd ymmword ptr [rcx + 8*rdi + 32], ymm2
+ vmovupd ymmword ptr [rcx + 8*rdi + 64], ymm3
+ vmovupd ymmword ptr [rcx + 8*rdi + 96], ymm4
+ vandpd ymm1, ymm0, ymmword ptr [rdx + 8*rdi + 128]
+ vandpd ymm2, ymm0, ymmword ptr [rdx + 8*rdi + 160]
+ vandpd ymm3, ymm0, ymmword ptr [rdx + 8*rdi + 192]
+ vandpd ymm4, ymm0, ymmword ptr [rdx + 8*rdi + 224]
+ vmovupd ymmword ptr [rcx + 8*rdi + 128], ymm1
+ vmovupd ymmword ptr [rcx + 8*rdi + 160], ymm2
+ vmovupd ymmword ptr [rcx + 8*rdi + 192], ymm3
+ vmovupd ymmword ptr [rcx + 8*rdi + 224], ymm4
+ add rdi, 32
+ add rax, 2
+ jne .LBB3_406
+ jmp .LBB3_657
+.LBB3_407:
+ mov esi, r9d
+ and esi, -16
+ lea rax, [rsi - 16]
+ mov r8, rax
+ shr r8, 4
+ add r8, 1
+ test rax, rax
+ je .LBB3_664
+# %bb.408:
+ mov rax, r8
+ and rax, -2
+ neg rax
+ xor edi, edi
+ vbroadcastsd ymm0, qword ptr [rip + .LCPI3_8] # ymm0 = [9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807]
+.LBB3_409: # =>This Inner Loop Header: Depth=1
+ vandpd ymm1, ymm0, ymmword ptr [rdx + 8*rdi]
+ vandpd ymm2, ymm0, ymmword ptr [rdx + 8*rdi + 32]
+ vandpd ymm3, ymm0, ymmword ptr [rdx + 8*rdi + 64]
+ vandpd ymm4, ymm0, ymmword ptr [rdx + 8*rdi + 96]
+ vmovupd ymmword ptr [rcx + 8*rdi], ymm1
+ vmovupd ymmword ptr [rcx + 8*rdi + 32], ymm2
+ vmovupd ymmword ptr [rcx + 8*rdi + 64], ymm3
+ vmovupd ymmword ptr [rcx + 8*rdi + 96], ymm4
+ vandpd ymm1, ymm0, ymmword ptr [rdx + 8*rdi + 128]
+ vandpd ymm2, ymm0, ymmword ptr [rdx + 8*rdi + 160]
+ vandpd ymm3, ymm0, ymmword ptr [rdx + 8*rdi + 192]
+ vandpd ymm4, ymm0, ymmword ptr [rdx + 8*rdi + 224]
+ vmovupd ymmword ptr [rcx + 8*rdi + 128], ymm1
+ vmovupd ymmword ptr [rcx + 8*rdi + 160], ymm2
+ vmovupd ymmword ptr [rcx + 8*rdi + 192], ymm3
+ vmovupd ymmword ptr [rcx + 8*rdi + 224], ymm4
+ add rdi, 32
+ add rax, 2
+ jne .LBB3_409
+ jmp .LBB3_665
+.LBB3_410:
+ mov esi, r9d
+ and esi, -128
+ lea rax, [rsi - 128]
+ mov r8, rax
+ shr r8, 7
+ add r8, 1
+ test rax, rax
+ je .LBB3_672
+# %bb.411:
+ mov rax, r8
+ and rax, -2
+ neg rax
+ xor edi, edi
+ vpxor xmm0, xmm0, xmm0
+.LBB3_412: # =>This Inner Loop Header: Depth=1
+ vpsubb ymm1, ymm0, ymmword ptr [rdx + rdi]
+ vpsubb ymm2, ymm0, ymmword ptr [rdx + rdi + 32]
+ vpsubb ymm3, ymm0, ymmword ptr [rdx + rdi + 64]
+ vpsubb ymm4, ymm0, ymmword ptr [rdx + rdi + 96]
+ vmovdqu ymmword ptr [rcx + rdi], ymm1
+ vmovdqu ymmword ptr [rcx + rdi + 32], ymm2
+ vmovdqu ymmword ptr [rcx + rdi + 64], ymm3
+ vmovdqu ymmword ptr [rcx + rdi + 96], ymm4
+ vpsubb ymm1, ymm0, ymmword ptr [rdx + rdi + 128]
+ vpsubb ymm2, ymm0, ymmword ptr [rdx + rdi + 160]
+ vpsubb ymm3, ymm0, ymmword ptr [rdx + rdi + 192]
+ vpsubb ymm4, ymm0, ymmword ptr [rdx + rdi + 224]
+ vmovdqu ymmword ptr [rcx + rdi + 128], ymm1
+ vmovdqu ymmword ptr [rcx + rdi + 160], ymm2
+ vmovdqu ymmword ptr [rcx + rdi + 192], ymm3
+ vmovdqu ymmword ptr [rcx + rdi + 224], ymm4
+ add rdi, 256
+ add rax, 2
+ jne .LBB3_412
+ jmp .LBB3_673
+.LBB3_413:
+ mov esi, r9d
+ and esi, -128
+ lea rax, [rsi - 128]
+ mov r8, rax
+ shr r8, 7
+ add r8, 1
+ test rax, rax
+ je .LBB3_680
+# %bb.414:
+ mov rax, r8
+ and rax, -2
+ neg rax
+ xor edi, edi
+ vpxor xmm0, xmm0, xmm0
+.LBB3_415: # =>This Inner Loop Header: Depth=1
+ vpsubb ymm1, ymm0, ymmword ptr [rdx + rdi]
+ vpsubb ymm2, ymm0, ymmword ptr [rdx + rdi + 32]
+ vpsubb ymm3, ymm0, ymmword ptr [rdx + rdi + 64]
+ vpsubb ymm4, ymm0, ymmword ptr [rdx + rdi + 96]
+ vmovdqu ymmword ptr [rcx + rdi], ymm1
+ vmovdqu ymmword ptr [rcx + rdi + 32], ymm2
+ vmovdqu ymmword ptr [rcx + rdi + 64], ymm3
+ vmovdqu ymmword ptr [rcx + rdi + 96], ymm4
+ vpsubb ymm1, ymm0, ymmword ptr [rdx + rdi + 128]
+ vpsubb ymm2, ymm0, ymmword ptr [rdx + rdi + 160]
+ vpsubb ymm3, ymm0, ymmword ptr [rdx + rdi + 192]
+ vpsubb ymm4, ymm0, ymmword ptr [rdx + rdi + 224]
+ vmovdqu ymmword ptr [rcx + rdi + 128], ymm1
+ vmovdqu ymmword ptr [rcx + rdi + 160], ymm2
+ vmovdqu ymmword ptr [rcx + rdi + 192], ymm3
+ vmovdqu ymmword ptr [rcx + rdi + 224], ymm4
+ add rdi, 256
+ add rax, 2
+ jne .LBB3_415
+ jmp .LBB3_681
+.LBB3_416:
+ mov esi, r11d
+ and esi, -128
+ xor edi, edi
+ vpxor xmm0, xmm0, xmm0
+ vpcmpeqd ymm1, ymm1, ymm1
+ vmovdqa ymm2, ymmword ptr [rip + .LCPI3_6] # ymm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+.LBB3_417: # =>This Inner Loop Header: Depth=1
+ vmovdqu ymm3, ymmword ptr [rdx + rdi]
+ vmovdqu ymm4, ymmword ptr [rdx + rdi + 32]
+ vmovdqu ymm5, ymmword ptr [rdx + rdi + 64]
+ vmovdqu ymm6, ymmword ptr [rdx + rdi + 96]
+ vpcmpeqb ymm7, ymm3, ymm0
+ vpxor ymm7, ymm7, ymm1
+ vpcmpeqb ymm8, ymm4, ymm0
+ vpxor ymm8, ymm8, ymm1
+ vpcmpeqb ymm9, ymm5, ymm0
+ vpxor ymm9, ymm9, ymm1
+ vpcmpeqb ymm10, ymm6, ymm0
+ vpxor ymm10, ymm10, ymm1
+ vpcmpgtb ymm3, ymm2, ymm3
+ vpcmpgtb ymm4, ymm2, ymm4
+ vpcmpgtb ymm5, ymm2, ymm5
+ vpcmpgtb ymm6, ymm2, ymm6
+ vpblendvb ymm3, ymm2, ymm7, ymm3
+ vpblendvb ymm4, ymm2, ymm8, ymm4
+ vpblendvb ymm5, ymm2, ymm9, ymm5
+ vpblendvb ymm6, ymm2, ymm10, ymm6
+ vmovdqu ymmword ptr [rcx + rdi], ymm3
+ vmovdqu ymmword ptr [rcx + rdi + 32], ymm4
+ vmovdqu ymmword ptr [rcx + rdi + 64], ymm5
+ vmovdqu ymmword ptr [rcx + rdi + 96], ymm6
+ sub rdi, -128
+ cmp rsi, rdi
+ jne .LBB3_417
+# %bb.418:
+ cmp rsi, r11
+ je .LBB3_865
+ jmp .LBB3_419
+.LBB3_424:
+ mov esi, r10d
+ and esi, -32
+ xor edi, edi
+ vmovdqa xmm0, xmmword ptr [rip + .LCPI3_11] # xmm0 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+.LBB3_425: # =>This Inner Loop Header: Depth=1
+ vpmovsxbd ymm1, qword ptr [rdx + rdi]
+ vpmovsxbd ymm2, qword ptr [rdx + rdi + 8]
+ vpmovsxbd ymm3, qword ptr [rdx + rdi + 16]
+ vpmovsxbd ymm4, qword ptr [rdx + rdi + 24]
+ vpsrad ymm5, ymm1, 7
+ vpsrad ymm6, ymm2, 7
+ vpsrad ymm7, ymm3, 7
+ vpsrad ymm8, ymm4, 7
+ vpaddd ymm1, ymm5, ymm1
+ vpaddd ymm2, ymm6, ymm2
+ vpaddd ymm3, ymm7, ymm3
+ vpaddd ymm4, ymm8, ymm4
+ vpxor ymm1, ymm1, ymm5
+ vpxor ymm2, ymm2, ymm6
+ vpxor ymm3, ymm3, ymm7
+ vpxor ymm4, ymm8, ymm4
+ vextracti128 xmm5, ymm1, 1
+ vpshufb xmm5, xmm5, xmm0
+ vpshufb xmm1, xmm1, xmm0
+ vpunpckldq xmm1, xmm1, xmm5 # xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1]
+ vextracti128 xmm5, ymm2, 1
+ vpshufb xmm5, xmm5, xmm0
+ vpshufb xmm2, xmm2, xmm0
+ vpunpckldq xmm2, xmm2, xmm5 # xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
+ vextracti128 xmm5, ymm3, 1
+ vpshufb xmm5, xmm5, xmm0
+ vpshufb xmm3, xmm3, xmm0
+ vpunpckldq xmm3, xmm3, xmm5 # xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
+ vextracti128 xmm5, ymm4, 1
+ vpshufb xmm5, xmm5, xmm0
+ vpshufb xmm4, xmm4, xmm0
+ vpunpckldq xmm4, xmm4, xmm5 # xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
+ vinserti128 ymm3, ymm3, xmm4, 1
+ vinserti128 ymm1, ymm1, xmm2, 1
+ vpunpcklqdq ymm1, ymm1, ymm3 # ymm1 = ymm1[0],ymm3[0],ymm1[2],ymm3[2]
+ vpermq ymm1, ymm1, 216 # ymm1 = ymm1[0,2,1,3]
+ vmovdqu ymmword ptr [rcx + rdi], ymm1
+ add rdi, 32
+ cmp rsi, rdi
+ jne .LBB3_425
+# %bb.426:
+ cmp rsi, r10
+ je .LBB3_865
+ jmp .LBB3_427
+.LBB3_431:
+ mov esi, r10d
+ and esi, -32
+ xor edi, edi
+ vmovdqa xmm0, xmmword ptr [rip + .LCPI3_11] # xmm0 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+.LBB3_432: # =>This Inner Loop Header: Depth=1
+ vpmovsxbd ymm1, qword ptr [rdx + rdi]
+ vpmovsxbd ymm2, qword ptr [rdx + rdi + 8]
+ vpmovsxbd ymm3, qword ptr [rdx + rdi + 16]
+ vpmovsxbd ymm4, qword ptr [rdx + rdi + 24]
+ vpsrad ymm5, ymm1, 7
+ vpsrad ymm6, ymm2, 7
+ vpsrad ymm7, ymm3, 7
+ vpsrad ymm8, ymm4, 7
+ vpaddd ymm1, ymm5, ymm1
+ vpaddd ymm2, ymm6, ymm2
+ vpaddd ymm3, ymm7, ymm3
+ vpaddd ymm4, ymm8, ymm4
+ vpxor ymm1, ymm1, ymm5
+ vpxor ymm2, ymm2, ymm6
+ vpxor ymm3, ymm3, ymm7
+ vpxor ymm4, ymm8, ymm4
+ vextracti128 xmm5, ymm1, 1
+ vpshufb xmm5, xmm5, xmm0
+ vpshufb xmm1, xmm1, xmm0
+ vpunpckldq xmm1, xmm1, xmm5 # xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1]
+ vextracti128 xmm5, ymm2, 1
+ vpshufb xmm5, xmm5, xmm0
+ vpshufb xmm2, xmm2, xmm0
+ vpunpckldq xmm2, xmm2, xmm5 # xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
+ vextracti128 xmm5, ymm3, 1
+ vpshufb xmm5, xmm5, xmm0
+ vpshufb xmm3, xmm3, xmm0
+ vpunpckldq xmm3, xmm3, xmm5 # xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
+ vextracti128 xmm5, ymm4, 1
+ vpshufb xmm5, xmm5, xmm0
+ vpshufb xmm4, xmm4, xmm0
+ vpunpckldq xmm4, xmm4, xmm5 # xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
+ vinserti128 ymm3, ymm3, xmm4, 1
+ vinserti128 ymm1, ymm1, xmm2, 1
+ vpunpcklqdq ymm1, ymm1, ymm3 # ymm1 = ymm1[0],ymm3[0],ymm1[2],ymm3[2]
+ vpermq ymm1, ymm1, 216 # ymm1 = ymm1[0,2,1,3]
+ vmovdqu ymmword ptr [rcx + rdi], ymm1
+ add rdi, 32
+ cmp rsi, rdi
+ jne .LBB3_432
+# %bb.433:
+ cmp rsi, r10
+ je .LBB3_865
+ jmp .LBB3_434
+.LBB3_445:
+ mov esi, r9d
+ and esi, -16
+ lea rax, [rsi - 16]
+ mov r8, rax
+ shr r8, 4
+ add r8, 1
+ test rax, rax
+ je .LBB3_688
+# %bb.446:
+ mov rax, r8
+ and rax, -2
+ neg rax
+ xor edi, edi
+ vpxor xmm0, xmm0, xmm0
+.LBB3_447: # =>This Inner Loop Header: Depth=1
+ vpsubq ymm1, ymm0, ymmword ptr [rdx + 8*rdi]
+ vpsubq ymm2, ymm0, ymmword ptr [rdx + 8*rdi + 32]
+ vpsubq ymm3, ymm0, ymmword ptr [rdx + 8*rdi + 64]
+ vpsubq ymm4, ymm0, ymmword ptr [rdx + 8*rdi + 96]
+ vmovdqu ymmword ptr [rcx + 8*rdi], ymm1
+ vmovdqu ymmword ptr [rcx + 8*rdi + 32], ymm2
+ vmovdqu ymmword ptr [rcx + 8*rdi + 64], ymm3
+ vmovdqu ymmword ptr [rcx + 8*rdi + 96], ymm4
+ vpsubq ymm1, ymm0, ymmword ptr [rdx + 8*rdi + 128]
+ vpsubq ymm2, ymm0, ymmword ptr [rdx + 8*rdi + 160]
+ vpsubq ymm3, ymm0, ymmword ptr [rdx + 8*rdi + 192]
+ vpsubq ymm4, ymm0, ymmword ptr [rdx + 8*rdi + 224]
+ vmovdqu ymmword ptr [rcx + 8*rdi + 128], ymm1
+ vmovdqu ymmword ptr [rcx + 8*rdi + 160], ymm2
+ vmovdqu ymmword ptr [rcx + 8*rdi + 192], ymm3
+ vmovdqu ymmword ptr [rcx + 8*rdi + 224], ymm4
+ add rdi, 32
+ add rax, 2
+ jne .LBB3_447
+ jmp .LBB3_689
+.LBB3_448:
+ mov esi, r9d
+ and esi, -16
+ xor edi, edi
+ vpxor xmm0, xmm0, xmm0
+ vpbroadcastq ymm1, qword ptr [rip + .LCPI3_4] # ymm1 = [1,1,1,1]
+.LBB3_449: # =>This Inner Loop Header: Depth=1
+ vpcmpeqq ymm2, ymm0, ymmword ptr [rdx + 8*rdi]
+ vpandn ymm2, ymm2, ymm1
+ vpcmpeqq ymm3, ymm0, ymmword ptr [rdx + 8*rdi + 32]
+ vpandn ymm3, ymm3, ymm1
+ vpcmpeqq ymm4, ymm0, ymmword ptr [rdx + 8*rdi + 64]
+ vpcmpeqq ymm5, ymm0, ymmword ptr [rdx + 8*rdi + 96]
+ vpandn ymm4, ymm4, ymm1
+ vpandn ymm5, ymm5, ymm1
+ vmovdqu ymmword ptr [rcx + 8*rdi], ymm2
+ vmovdqu ymmword ptr [rcx + 8*rdi + 32], ymm3
+ vmovdqu ymmword ptr [rcx + 8*rdi + 64], ymm4
+ vmovdqu ymmword ptr [rcx + 8*rdi + 96], ymm5
+ add rdi, 16
+ cmp rsi, rdi
+ jne .LBB3_449
+# %bb.450:
+ cmp rsi, r9
+ je .LBB3_865
+ jmp .LBB3_451
+.LBB3_455:
+ mov esi, r9d
+ and esi, -16
+ lea rax, [rsi - 16]
+ mov r8, rax
+ shr r8, 4
+ add r8, 1
+ test rax, rax
+ je .LBB3_696
+# %bb.456:
+ mov rax, r8
+ and rax, -2
+ neg rax
+ xor edi, edi
+.LBB3_457: # =>This Inner Loop Header: Depth=1
+ vmovups ymm0, ymmword ptr [rdx + 8*rdi]
+ vmovups ymm1, ymmword ptr [rdx + 8*rdi + 32]
+ vmovups ymm2, ymmword ptr [rdx + 8*rdi + 64]
+ vmovups ymm3, ymmword ptr [rdx + 8*rdi + 96]
+ vmovups ymmword ptr [rcx + 8*rdi], ymm0
+ vmovups ymmword ptr [rcx + 8*rdi + 32], ymm1
+ vmovups ymmword ptr [rcx + 8*rdi + 64], ymm2
+ vmovups ymmword ptr [rcx + 8*rdi + 96], ymm3
+ vmovupd ymm0, ymmword ptr [rdx + 8*rdi + 128]
+ vmovupd ymm1, ymmword ptr [rdx + 8*rdi + 160]
+ vmovupd ymm2, ymmword ptr [rdx + 8*rdi + 192]
+ vmovupd ymm3, ymmword ptr [rdx + 8*rdi + 224]
+ vmovupd ymmword ptr [rcx + 8*rdi + 128], ymm0
+ vmovupd ymmword ptr [rcx + 8*rdi + 160], ymm1
+ vmovupd ymmword ptr [rcx + 8*rdi + 192], ymm2
+ vmovupd ymmword ptr [rcx + 8*rdi + 224], ymm3
+ add rdi, 32
+ add rax, 2
+ jne .LBB3_457
+ jmp .LBB3_697
+.LBB3_458:
+ mov esi, r9d
+ and esi, -16
+ lea rax, [rsi - 16]
+ mov r8, rax
+ shr r8, 4
+ add r8, 1
+ test rax, rax
+ je .LBB3_704
+# %bb.459:
+ mov rax, r8
+ and rax, -2
+ neg rax
+ xor edi, edi
+.LBB3_460: # =>This Inner Loop Header: Depth=1
+ vmovups ymm0, ymmword ptr [rdx + 8*rdi]
+ vmovups ymm1, ymmword ptr [rdx + 8*rdi + 32]
+ vmovups ymm2, ymmword ptr [rdx + 8*rdi + 64]
+ vmovups ymm3, ymmword ptr [rdx + 8*rdi + 96]
+ vmovups ymmword ptr [rcx + 8*rdi], ymm0
+ vmovups ymmword ptr [rcx + 8*rdi + 32], ymm1
+ vmovups ymmword ptr [rcx + 8*rdi + 64], ymm2
+ vmovups ymmword ptr [rcx + 8*rdi + 96], ymm3
+ vmovupd ymm0, ymmword ptr [rdx + 8*rdi + 128]
+ vmovupd ymm1, ymmword ptr [rdx + 8*rdi + 160]
+ vmovupd ymm2, ymmword ptr [rdx + 8*rdi + 192]
+ vmovupd ymm3, ymmword ptr [rdx + 8*rdi + 224]
+ vmovupd ymmword ptr [rcx + 8*rdi + 128], ymm0
+ vmovupd ymmword ptr [rcx + 8*rdi + 160], ymm1
+ vmovupd ymmword ptr [rcx + 8*rdi + 192], ymm2
+ vmovupd ymmword ptr [rcx + 8*rdi + 224], ymm3
+ add rdi, 32
+ add rax, 2
+ jne .LBB3_460
+ jmp .LBB3_705
+.LBB3_468:
+ mov esi, r9d
+ and esi, -32
+ lea rax, [rsi - 32]
+ mov r8, rax
+ shr r8, 5
+ add r8, 1
+ test rax, rax
+ je .LBB3_712
+# %bb.469:
+ mov rax, r8
+ and rax, -2
+ neg rax
+ xor edi, edi
+ vpxor xmm0, xmm0, xmm0
+.LBB3_470: # =>This Inner Loop Header: Depth=1
+ vpsubw ymm1, ymm0, ymmword ptr [rdx + 2*rdi]
+ vpsubw ymm2, ymm0, ymmword ptr [rdx + 2*rdi + 32]
+ vmovdqu ymmword ptr [rcx + 2*rdi], ymm1
+ vmovdqu ymmword ptr [rcx + 2*rdi + 32], ymm2
+ vpsubw ymm1, ymm0, ymmword ptr [rdx + 2*rdi + 64]
+ vpsubw ymm2, ymm0, ymmword ptr [rdx + 2*rdi + 96]
+ vmovdqu ymmword ptr [rcx + 2*rdi + 64], ymm1
+ vmovdqu ymmword ptr [rcx + 2*rdi + 96], ymm2
+ add rdi, 64
+ add rax, 2
+ jne .LBB3_470
+ jmp .LBB3_713
+.LBB3_471:
+ mov esi, r9d
+ and esi, -32
+ lea rax, [rsi - 32]
+ mov r8, rax
+ shr r8, 5
+ add r8, 1
+ test rax, rax
+ je .LBB3_720
+# %bb.472:
+ mov rax, r8
+ and rax, -2
+ neg rax
+ xor edi, edi
+ vpxor xmm0, xmm0, xmm0
+.LBB3_473: # =>This Inner Loop Header: Depth=1
+ vpsubw ymm1, ymm0, ymmword ptr [rdx + 2*rdi]
+ vpsubw ymm2, ymm0, ymmword ptr [rdx + 2*rdi + 32]
+ vmovdqu ymmword ptr [rcx + 2*rdi], ymm1
+ vmovdqu ymmword ptr [rcx + 2*rdi + 32], ymm2
+ vpsubw ymm1, ymm0, ymmword ptr [rdx + 2*rdi + 64]
+ vpsubw ymm2, ymm0, ymmword ptr [rdx + 2*rdi + 96]
+ vmovdqu ymmword ptr [rcx + 2*rdi + 64], ymm1
+ vmovdqu ymmword ptr [rcx + 2*rdi + 96], ymm2
+ add rdi, 64
+ add rax, 2
+ jne .LBB3_473
+ jmp .LBB3_721
+.LBB3_474:
+ mov esi, r9d
+ and esi, -32
+ lea rax, [rsi - 32]
+ mov r8, rax
+ shr r8, 5
+ add r8, 1
+ test rax, rax
+ je .LBB3_728
+# %bb.475:
+ mov rax, r8
+ and rax, -2
+ neg rax
+ xor edi, edi
+ vpxor xmm0, xmm0, xmm0
+.LBB3_476: # =>This Inner Loop Header: Depth=1
+ vpsubw ymm1, ymm0, ymmword ptr [rdx + 2*rdi]
+ vpsubw ymm2, ymm0, ymmword ptr [rdx + 2*rdi + 32]
+ vmovdqu ymmword ptr [rcx + 2*rdi], ymm1
+ vmovdqu ymmword ptr [rcx + 2*rdi + 32], ymm2
+ vpsubw ymm1, ymm0, ymmword ptr [rdx + 2*rdi + 64]
+ vpsubw ymm2, ymm0, ymmword ptr [rdx + 2*rdi + 96]
+ vmovdqu ymmword ptr [rcx + 2*rdi + 64], ymm1
+ vmovdqu ymmword ptr [rcx + 2*rdi + 96], ymm2
+ add rdi, 64
+ add rax, 2
+ jne .LBB3_476
+ jmp .LBB3_729
+.LBB3_477:
+ mov esi, r9d
+ and esi, -32
+ lea rax, [rsi - 32]
+ mov r8, rax
+ shr r8, 5
+ add r8, 1
+ test rax, rax
+ je .LBB3_736
+# %bb.478:
+ mov rax, r8
+ and rax, -2
+ neg rax
+ xor edi, edi
+ vpxor xmm0, xmm0, xmm0
+ vmovdqa ymm1, ymmword ptr [rip + .LCPI3_5] # ymm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+.LBB3_479: # =>This Inner Loop Header: Depth=1
+ vpcmpeqw ymm2, ymm0, ymmword ptr [rdx + 2*rdi]
+ vpandn ymm2, ymm2, ymm1
+ vpcmpeqw ymm3, ymm0, ymmword ptr [rdx + 2*rdi + 32]
+ vpandn ymm3, ymm3, ymm1
+ vmovdqu ymmword ptr [rcx + 2*rdi], ymm2
+ vmovdqu ymmword ptr [rcx + 2*rdi + 32], ymm3
+ vpcmpeqw ymm2, ymm0, ymmword ptr [rdx + 2*rdi + 64]
+ vpandn ymm2, ymm2, ymm1
+ vpcmpeqw ymm3, ymm0, ymmword ptr [rdx + 2*rdi + 96]
+ vpandn ymm3, ymm3, ymm1
+ vmovdqu ymmword ptr [rcx + 2*rdi + 64], ymm2
+ vmovdqu ymmword ptr [rcx + 2*rdi + 96], ymm3
+ add rdi, 64
+ add rax, 2
+ jne .LBB3_479
+ jmp .LBB3_737
+.LBB3_480:
+ mov esi, r9d
+ and esi, -32
+ lea rax, [rsi - 32]
+ mov r8, rax
+ shr r8, 5
+ add r8, 1
+ test rax, rax
+ je .LBB3_744
+# %bb.481:
+ mov rax, r8
+ and rax, -2
+ neg rax
+ xor edi, edi
+ vpxor xmm0, xmm0, xmm0
+ vpcmpeqd ymm1, ymm1, ymm1
+ vmovdqa ymm2, ymmword ptr [rip + .LCPI3_5] # ymm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+.LBB3_482: # =>This Inner Loop Header: Depth=1
+ vmovdqu ymm3, ymmword ptr [rdx + 2*rdi]
+ vmovdqu ymm4, ymmword ptr [rdx + 2*rdi + 32]
+ vpcmpeqw ymm5, ymm3, ymm0
+ vpxor ymm5, ymm5, ymm1
+ vpcmpeqw ymm6, ymm4, ymm0
+ vpxor ymm6, ymm6, ymm1
+ vpcmpgtw ymm3, ymm2, ymm3
+ vpcmpgtw ymm4, ymm2, ymm4
+ vpblendvb ymm3, ymm2, ymm5, ymm3
+ vpblendvb ymm4, ymm2, ymm6, ymm4
+ vmovdqu ymmword ptr [rcx + 2*rdi], ymm3
+ vmovdqu ymmword ptr [rcx + 2*rdi + 32], ymm4
+ vmovdqu ymm3, ymmword ptr [rdx + 2*rdi + 64]
+ vmovdqu ymm4, ymmword ptr [rdx + 2*rdi + 96]
+ vpcmpeqw ymm5, ymm3, ymm0
+ vpxor ymm5, ymm5, ymm1
+ vpcmpeqw ymm6, ymm4, ymm0
+ vpxor ymm6, ymm6, ymm1
+ vpcmpgtw ymm3, ymm2, ymm3
+ vpcmpgtw ymm4, ymm2, ymm4
+ vpblendvb ymm3, ymm2, ymm5, ymm3
+ vpblendvb ymm4, ymm2, ymm6, ymm4
+ vmovdqu ymmword ptr [rcx + 2*rdi + 64], ymm3
+ vmovdqu ymmword ptr [rcx + 2*rdi + 96], ymm4
+ add rdi, 64
+ add rax, 2
+ jne .LBB3_482
+ jmp .LBB3_745
+.LBB3_483:
+ mov esi, r9d
+ and esi, -32
+ lea rax, [rsi - 32]
+ mov rdi, rax
+ shr rdi, 5
+ add rdi, 1
+ mov r8d, edi
+ and r8d, 3
+ cmp rax, 96
+ jae .LBB3_592
+# %bb.484:
+ xor eax, eax
+ jmp .LBB3_594
+.LBB3_485:
+ mov esi, r9d
+ and esi, -16
+ lea rax, [rsi - 16]
+ mov r8, rax
+ shr r8, 4
+ add r8, 1
+ test rax, rax
+ je .LBB3_753
+# %bb.486:
+ mov rax, r8
+ and rax, -2
+ neg rax
+ xor edi, edi
+ vmovdqa ymm0, ymmword ptr [rip + .LCPI3_10] # ymm0 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+.LBB3_487: # =>This Inner Loop Header: Depth=1
+ vpmovsxwd ymm1, xmmword ptr [rdx + 2*rdi]
+ vpmovsxwd ymm2, xmmword ptr [rdx + 2*rdi + 16]
+ vpsrad ymm3, ymm2, 15
+ vpsrad ymm4, ymm1, 15
+ vpaddd ymm1, ymm4, ymm1
+ vpaddd ymm2, ymm3, ymm2
+ vpxor ymm2, ymm2, ymm3
+ vpxor ymm1, ymm1, ymm4
+ vpshufb ymm1, ymm1, ymm0
+ vpermq ymm1, ymm1, 232 # ymm1 = ymm1[0,2,2,3]
+ vpshufb ymm2, ymm2, ymm0
+ vpermq ymm2, ymm2, 232 # ymm2 = ymm2[0,2,2,3]
+ vmovdqu xmmword ptr [rcx + 2*rdi + 16], xmm2
+ vmovdqu xmmword ptr [rcx + 2*rdi], xmm1
+ vpmovsxwd ymm1, xmmword ptr [rdx + 2*rdi + 32]
+ vpmovsxwd ymm2, xmmword ptr [rdx + 2*rdi + 48]
+ vpsrad ymm3, ymm2, 15
+ vpsrad ymm4, ymm1, 15
+ vpaddd ymm1, ymm4, ymm1
+ vpaddd ymm2, ymm3, ymm2
+ vpxor ymm2, ymm2, ymm3
+ vpxor ymm1, ymm1, ymm4
+ vpshufb ymm1, ymm1, ymm0
+ vpermq ymm1, ymm1, 232 # ymm1 = ymm1[0,2,2,3]
+ vpshufb ymm2, ymm2, ymm0
+ vpermq ymm2, ymm2, 232 # ymm2 = ymm2[0,2,2,3]
+ vmovdqu xmmword ptr [rcx + 2*rdi + 48], xmm2
+ vmovdqu xmmword ptr [rcx + 2*rdi + 32], xmm1
+ add rdi, 32
+ add rax, 2
+ jne .LBB3_487
+ jmp .LBB3_754
+.LBB3_488:
+ mov esi, r9d
+ and esi, -32
+ lea rax, [rsi - 32]
+ mov rdi, rax
+ shr rdi, 5
+ add rdi, 1
+ mov r8d, edi
+ and r8d, 3
+ cmp rax, 96
+ jae .LBB3_602
+# %bb.489:
+ xor eax, eax
+ jmp .LBB3_604
+.LBB3_490:
+ mov esi, r9d
+ and esi, -16
+ lea rax, [rsi - 16]
+ mov r8, rax
+ shr r8, 4
+ add r8, 1
+ test rax, rax
+ je .LBB3_761
+# %bb.491:
+ mov rax, r8
+ and rax, -2
+ neg rax
+ xor edi, edi
+ vmovdqa ymm0, ymmword ptr [rip + .LCPI3_10] # ymm0 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+.LBB3_492: # =>This Inner Loop Header: Depth=1
+ vpmovsxwd ymm1, xmmword ptr [rdx + 2*rdi]
+ vpmovsxwd ymm2, xmmword ptr [rdx + 2*rdi + 16]
+ vpsrad ymm3, ymm2, 15
+ vpsrad ymm4, ymm1, 15
+ vpaddd ymm1, ymm4, ymm1
+ vpaddd ymm2, ymm3, ymm2
+ vpxor ymm2, ymm2, ymm3
+ vpxor ymm1, ymm1, ymm4
+ vpshufb ymm1, ymm1, ymm0
+ vpermq ymm1, ymm1, 232 # ymm1 = ymm1[0,2,2,3]
+ vpshufb ymm2, ymm2, ymm0
+ vpermq ymm2, ymm2, 232 # ymm2 = ymm2[0,2,2,3]
+ vmovdqu xmmword ptr [rcx + 2*rdi + 16], xmm2
+ vmovdqu xmmword ptr [rcx + 2*rdi], xmm1
+ vpmovsxwd ymm1, xmmword ptr [rdx + 2*rdi + 32]
+ vpmovsxwd ymm2, xmmword ptr [rdx + 2*rdi + 48]
+ vpsrad ymm3, ymm2, 15
+ vpsrad ymm4, ymm1, 15
+ vpaddd ymm1, ymm4, ymm1
+ vpaddd ymm2, ymm3, ymm2
+ vpxor ymm2, ymm2, ymm3
+ vpxor ymm1, ymm1, ymm4
+ vpshufb ymm1, ymm1, ymm0
+ vpermq ymm1, ymm1, 232 # ymm1 = ymm1[0,2,2,3]
+ vpshufb ymm2, ymm2, ymm0
+ vpermq ymm2, ymm2, 232 # ymm2 = ymm2[0,2,2,3]
+ vmovdqu xmmword ptr [rcx + 2*rdi + 48], xmm2
+ vmovdqu xmmword ptr [rcx + 2*rdi + 32], xmm1
+ add rdi, 32
+ add rax, 2
+ jne .LBB3_492
+ jmp .LBB3_762
+.LBB3_493:
+ mov esi, r9d
+ and esi, -16
+ lea rax, [rsi - 16]
+ mov r8, rax
+ shr r8, 4
+ add r8, 1
+ test rax, rax
+ je .LBB3_769
+# %bb.494:
+ mov rax, r8
+ and rax, -2
+ neg rax
+ xor edi, edi
+ vpxor xmm0, xmm0, xmm0
+.LBB3_495: # =>This Inner Loop Header: Depth=1
+ vpsubq ymm1, ymm0, ymmword ptr [rdx + 8*rdi]
+ vpsubq ymm2, ymm0, ymmword ptr [rdx + 8*rdi + 32]
+ vpsubq ymm3, ymm0, ymmword ptr [rdx + 8*rdi + 64]
+ vpsubq ymm4, ymm0, ymmword ptr [rdx + 8*rdi + 96]
+ vmovdqu ymmword ptr [rcx + 8*rdi], ymm1
+ vmovdqu ymmword ptr [rcx + 8*rdi + 32], ymm2
+ vmovdqu ymmword ptr [rcx + 8*rdi + 64], ymm3
+ vmovdqu ymmword ptr [rcx + 8*rdi + 96], ymm4
+ vpsubq ymm1, ymm0, ymmword ptr [rdx + 8*rdi + 128]
+ vpsubq ymm2, ymm0, ymmword ptr [rdx + 8*rdi + 160]
+ vpsubq ymm3, ymm0, ymmword ptr [rdx + 8*rdi + 192]
+ vpsubq ymm4, ymm0, ymmword ptr [rdx + 8*rdi + 224]
+ vmovdqu ymmword ptr [rcx + 8*rdi + 128], ymm1
+ vmovdqu ymmword ptr [rcx + 8*rdi + 160], ymm2
+ vmovdqu ymmword ptr [rcx + 8*rdi + 192], ymm3
+ vmovdqu ymmword ptr [rcx + 8*rdi + 224], ymm4
+ add rdi, 32
+ add rax, 2
+ jne .LBB3_495
+ jmp .LBB3_770
+.LBB3_496:
+ mov esi, r9d
+ and esi, -32
+ lea rax, [rsi - 32]
+ mov r8, rax
+ shr r8, 5
+ add r8, 1
+ test rax, rax
+ je .LBB3_777
+# %bb.497:
+ mov rax, r8
+ and rax, -2
+ neg rax
+ xor edi, edi
+ vbroadcastss ymm0, dword ptr [rip + .LCPI3_7] # ymm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+.LBB3_498: # =>This Inner Loop Header: Depth=1
+ vxorpd ymm1, ymm0, ymmword ptr [rdx + 4*rdi]
+ vxorpd ymm2, ymm0, ymmword ptr [rdx + 4*rdi + 32]
+ vxorpd ymm3, ymm0, ymmword ptr [rdx + 4*rdi + 64]
+ vxorpd ymm4, ymm0, ymmword ptr [rdx + 4*rdi + 96]
+ vmovupd ymmword ptr [rcx + 4*rdi], ymm1
+ vmovupd ymmword ptr [rcx + 4*rdi + 32], ymm2
+ vmovupd ymmword ptr [rcx + 4*rdi + 64], ymm3
+ vmovupd ymmword ptr [rcx + 4*rdi + 96], ymm4
+ vxorpd ymm1, ymm0, ymmword ptr [rdx + 4*rdi + 128]
+ vxorpd ymm2, ymm0, ymmword ptr [rdx + 4*rdi + 160]
+ vxorpd ymm3, ymm0, ymmword ptr [rdx + 4*rdi + 192]
+ vxorpd ymm4, ymm0, ymmword ptr [rdx + 4*rdi + 224]
+ vmovupd ymmword ptr [rcx + 4*rdi + 128], ymm1
+ vmovupd ymmword ptr [rcx + 4*rdi + 160], ymm2
+ vmovupd ymmword ptr [rcx + 4*rdi + 192], ymm3
+ vmovupd ymmword ptr [rcx + 4*rdi + 224], ymm4
+ add rdi, 64
+ add rax, 2
+ jne .LBB3_498
+ jmp .LBB3_778
+.LBB3_499:
+ mov esi, r9d
+ and esi, -16
+ lea rax, [rsi - 16]
+ mov r8, rax
+ shr r8, 4
+ add r8, 1
+ test rax, rax
+ je .LBB3_787
+# %bb.500:
+ mov rax, r8
+ and rax, -2
+ neg rax
+ xor edi, edi
+ vpxor xmm0, xmm0, xmm0
+.LBB3_501: # =>This Inner Loop Header: Depth=1
+ vpsubq ymm1, ymm0, ymmword ptr [rdx + 8*rdi]
+ vpsubq ymm2, ymm0, ymmword ptr [rdx + 8*rdi + 32]
+ vpsubq ymm3, ymm0, ymmword ptr [rdx + 8*rdi + 64]
+ vpsubq ymm4, ymm0, ymmword ptr [rdx + 8*rdi + 96]
+ vmovdqu ymmword ptr [rcx + 8*rdi], ymm1
+ vmovdqu ymmword ptr [rcx + 8*rdi + 32], ymm2
+ vmovdqu ymmword ptr [rcx + 8*rdi + 64], ymm3
+ vmovdqu ymmword ptr [rcx + 8*rdi + 96], ymm4
+ vpsubq ymm1, ymm0, ymmword ptr [rdx + 8*rdi + 128]
+ vpsubq ymm2, ymm0, ymmword ptr [rdx + 8*rdi + 160]
+ vpsubq ymm3, ymm0, ymmword ptr [rdx + 8*rdi + 192]
+ vpsubq ymm4, ymm0, ymmword ptr [rdx + 8*rdi + 224]
+ vmovdqu ymmword ptr [rcx + 8*rdi + 128], ymm1
+ vmovdqu ymmword ptr [rcx + 8*rdi + 160], ymm2
+ vmovdqu ymmword ptr [rcx + 8*rdi + 192], ymm3
+ vmovdqu ymmword ptr [rcx + 8*rdi + 224], ymm4
+ add rdi, 32
+ add rax, 2
+ jne .LBB3_501
+ jmp .LBB3_788
+.LBB3_502:
+ mov esi, r9d
+ and esi, -32
+ lea rax, [rsi - 32]
+ mov r8, rax
+ shr r8, 5
+ add r8, 1
+ test rax, rax
+ je .LBB3_795
+# %bb.503:
+ mov rax, r8
+ and rax, -2
+ neg rax
+ xor edi, edi
+ vbroadcastss ymm0, dword ptr [rip + .LCPI3_7] # ymm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+.LBB3_504: # =>This Inner Loop Header: Depth=1
+ vxorpd ymm1, ymm0, ymmword ptr [rdx + 4*rdi]
+ vxorpd ymm2, ymm0, ymmword ptr [rdx + 4*rdi + 32]
+ vxorpd ymm3, ymm0, ymmword ptr [rdx + 4*rdi + 64]
+ vxorpd ymm4, ymm0, ymmword ptr [rdx + 4*rdi + 96]
+ vmovupd ymmword ptr [rcx + 4*rdi], ymm1
+ vmovupd ymmword ptr [rcx + 4*rdi + 32], ymm2
+ vmovupd ymmword ptr [rcx + 4*rdi + 64], ymm3
+ vmovupd ymmword ptr [rcx + 4*rdi + 96], ymm4
+ vxorpd ymm1, ymm0, ymmword ptr [rdx + 4*rdi + 128]
+ vxorpd ymm2, ymm0, ymmword ptr [rdx + 4*rdi + 160]
+ vxorpd ymm3, ymm0, ymmword ptr [rdx + 4*rdi + 192]
+ vxorpd ymm4, ymm0, ymmword ptr [rdx + 4*rdi + 224]
+ vmovupd ymmword ptr [rcx + 4*rdi + 128], ymm1
+ vmovupd ymmword ptr [rcx + 4*rdi + 160], ymm2
+ vmovupd ymmword ptr [rcx + 4*rdi + 192], ymm3
+ vmovupd ymmword ptr [rcx + 4*rdi + 224], ymm4
+ add rdi, 64
+ add rax, 2
+ jne .LBB3_504
+ jmp .LBB3_796
+.LBB3_505:
+ mov esi, r11d
+ and esi, -16
+ xor edi, edi
+ vpxor xmm0, xmm0, xmm0
+ vpcmpeqd ymm1, ymm1, ymm1
+ vpbroadcastq ymm2, qword ptr [rip + .LCPI3_4] # ymm2 = [1,1,1,1]
+.LBB3_506: # =>This Inner Loop Header: Depth=1
+ vmovdqu ymm3, ymmword ptr [rdx + 8*rdi]
+ vmovdqu ymm4, ymmword ptr [rdx + 8*rdi + 32]
+ vmovdqu ymm5, ymmword ptr [rdx + 8*rdi + 64]
+ vmovdqu ymm6, ymmword ptr [rdx + 8*rdi + 96]
+ vpcmpeqq ymm7, ymm3, ymm0
+ vpxor ymm7, ymm7, ymm1
+ vpcmpeqq ymm8, ymm4, ymm0
+ vpxor ymm8, ymm8, ymm1
+ vpcmpeqq ymm9, ymm5, ymm0
+ vpxor ymm9, ymm9, ymm1
+ vpcmpeqq ymm10, ymm6, ymm0
+ vpxor ymm10, ymm10, ymm1
+ vpcmpgtq ymm3, ymm2, ymm3
+ vpcmpgtq ymm4, ymm2, ymm4
+ vpcmpgtq ymm5, ymm2, ymm5
+ vpcmpgtq ymm6, ymm2, ymm6
+ vblendvpd ymm3, ymm2, ymm7, ymm3
+ vblendvpd ymm4, ymm2, ymm8, ymm4
+ vblendvpd ymm5, ymm2, ymm9, ymm5
+ vblendvpd ymm6, ymm2, ymm10, ymm6
+ vmovupd ymmword ptr [rcx + 8*rdi], ymm3
+ vmovupd ymmword ptr [rcx + 8*rdi + 32], ymm4
+ vmovupd ymmword ptr [rcx + 8*rdi + 64], ymm5
+ vmovupd ymmword ptr [rcx + 8*rdi + 96], ymm6
+ add rdi, 16
+ cmp rsi, rdi
+ jne .LBB3_506
+# %bb.507:
+ cmp rsi, r11
+ je .LBB3_865
+ jmp .LBB3_508
+.LBB3_513:
+ mov esi, eax
+ and esi, -32
+ xor edi, edi
+ vxorps xmm0, xmm0, xmm0
+ vpbroadcastd ymm1, dword ptr [rip + .LCPI3_3] # ymm1 = [1,1,1,1,1,1,1,1]
+.LBB3_514: # =>This Inner Loop Header: Depth=1
+ vmovdqu ymm2, ymmword ptr [rdx + 4*rdi]
+ vmovdqu ymm3, ymmword ptr [rdx + 4*rdi + 32]
+ vmovdqu ymm4, ymmword ptr [rdx + 4*rdi + 64]
+ vmovdqu ymm5, ymmword ptr [rdx + 4*rdi + 96]
+ vpsrad ymm6, ymm2, 31
+ vpor ymm6, ymm6, ymm1
+ vpsrad ymm7, ymm3, 31
+ vpor ymm7, ymm7, ymm1
+ vpsrad ymm8, ymm4, 31
+ vpor ymm8, ymm8, ymm1
+ vpsrad ymm9, ymm5, 31
+ vpor ymm9, ymm9, ymm1
+ vcvtdq2ps ymm6, ymm6
+ vcvtdq2ps ymm7, ymm7
+ vcvtdq2ps ymm8, ymm8
+ vcvtdq2ps ymm9, ymm9
+ vcmpneqps ymm2, ymm2, ymm0
+ vandps ymm2, ymm2, ymm6
+ vcmpneqps ymm3, ymm3, ymm0
+ vandps ymm3, ymm3, ymm7
+ vcmpneqps ymm4, ymm4, ymm0
+ vandps ymm4, ymm8, ymm4
+ vcmpneqps ymm5, ymm5, ymm0
+ vandps ymm5, ymm9, ymm5
+ vmovups ymmword ptr [rcx + 4*rdi], ymm2
+ vmovups ymmword ptr [rcx + 4*rdi + 32], ymm3
+ vmovups ymmword ptr [rcx + 4*rdi + 64], ymm4
+ vmovups ymmword ptr [rcx + 4*rdi + 96], ymm5
+ add rdi, 32
+ cmp rsi, rdi
+ jne .LBB3_514
+# %bb.515:
+ cmp rsi, rax
+ je .LBB3_865
+ jmp .LBB3_516
+.LBB3_521:
+ mov esi, r10d
+ and esi, -16
+ xor edi, edi
+ vpxor xmm0, xmm0, xmm0
+.LBB3_522: # =>This Inner Loop Header: Depth=1
+ vmovdqu ymm1, ymmword ptr [rdx + 8*rdi]
+ vmovdqu ymm2, ymmword ptr [rdx + 8*rdi + 32]
+ vmovdqu ymm3, ymmword ptr [rdx + 8*rdi + 64]
+ vpsubq ymm4, ymm0, ymm1
+ vblendvpd ymm1, ymm1, ymm4, ymm1
+ vmovdqu ymm4, ymmword ptr [rdx + 8*rdi + 96]
+ vpsubq ymm5, ymm0, ymm2
+ vblendvpd ymm2, ymm2, ymm5, ymm2
+ vpsubq ymm5, ymm0, ymm3
+ vblendvpd ymm3, ymm3, ymm5, ymm3
+ vpsubq ymm5, ymm0, ymm4
+ vblendvpd ymm4, ymm4, ymm5, ymm4
+ vmovupd ymmword ptr [rcx + 8*rdi], ymm1
+ vmovupd ymmword ptr [rcx + 8*rdi + 32], ymm2
+ vmovupd ymmword ptr [rcx + 8*rdi + 64], ymm3
+ vmovupd ymmword ptr [rcx + 8*rdi + 96], ymm4
+ add rdi, 16
+ cmp rsi, rdi
+ jne .LBB3_522
+# %bb.523:
+ cmp rsi, r10
+ je .LBB3_865
+ jmp .LBB3_524
+.LBB3_528:
+ mov esi, r9d
+ and esi, -32
+ lea rax, [rsi - 32]
+ mov r8, rax
+ shr r8, 5
+ add r8, 1
+ test rax, rax
+ je .LBB3_805
+# %bb.529:
+ mov rax, r8
+ and rax, -2
+ neg rax
+ xor edi, edi
+ vbroadcastss ymm0, dword ptr [rip + .LCPI3_9] # ymm0 = [2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647]
+.LBB3_530: # =>This Inner Loop Header: Depth=1
+ vandpd ymm1, ymm0, ymmword ptr [rdx + 4*rdi]
+ vandpd ymm2, ymm0, ymmword ptr [rdx + 4*rdi + 32]
+ vandpd ymm3, ymm0, ymmword ptr [rdx + 4*rdi + 64]
+ vandpd ymm4, ymm0, ymmword ptr [rdx + 4*rdi + 96]
+ vmovupd ymmword ptr [rcx + 4*rdi], ymm1
+ vmovupd ymmword ptr [rcx + 4*rdi + 32], ymm2
+ vmovupd ymmword ptr [rcx + 4*rdi + 64], ymm3
+ vmovupd ymmword ptr [rcx + 4*rdi + 96], ymm4
+ vandpd ymm1, ymm0, ymmword ptr [rdx + 4*rdi + 128]
+ vandpd ymm2, ymm0, ymmword ptr [rdx + 4*rdi + 160]
+ vandpd ymm3, ymm0, ymmword ptr [rdx + 4*rdi + 192]
+ vandpd ymm4, ymm0, ymmword ptr [rdx + 4*rdi + 224]
+ vmovupd ymmword ptr [rcx + 4*rdi + 128], ymm1
+ vmovupd ymmword ptr [rcx + 4*rdi + 160], ymm2
+ vmovupd ymmword ptr [rcx + 4*rdi + 192], ymm3
+ vmovupd ymmword ptr [rcx + 4*rdi + 224], ymm4
+ add rdi, 64
+ add rax, 2
+ jne .LBB3_530
+ jmp .LBB3_806
+.LBB3_531:
+ mov esi, r10d
+ and esi, -16
+ xor edi, edi
+ vpxor xmm0, xmm0, xmm0
+.LBB3_532: # =>This Inner Loop Header: Depth=1
+ vmovdqu ymm1, ymmword ptr [rdx + 8*rdi]
+ vmovdqu ymm2, ymmword ptr [rdx + 8*rdi + 32]
+ vmovdqu ymm3, ymmword ptr [rdx + 8*rdi + 64]
+ vpsubq ymm4, ymm0, ymm1
+ vblendvpd ymm1, ymm1, ymm4, ymm1
+ vmovdqu ymm4, ymmword ptr [rdx + 8*rdi + 96]
+ vpsubq ymm5, ymm0, ymm2
+ vblendvpd ymm2, ymm2, ymm5, ymm2
+ vpsubq ymm5, ymm0, ymm3
+ vblendvpd ymm3, ymm3, ymm5, ymm3
+ vpsubq ymm5, ymm0, ymm4
+ vblendvpd ymm4, ymm4, ymm5, ymm4
+ vmovupd ymmword ptr [rcx + 8*rdi], ymm1
+ vmovupd ymmword ptr [rcx + 8*rdi + 32], ymm2
+ vmovupd ymmword ptr [rcx + 8*rdi + 64], ymm3
+ vmovupd ymmword ptr [rcx + 8*rdi + 96], ymm4
+ add rdi, 16
+ cmp rsi, rdi
+ jne .LBB3_532
+# %bb.533:
+ cmp rsi, r10
+ je .LBB3_865
+ jmp .LBB3_534
+.LBB3_538:
+ mov esi, r9d
+ and esi, -32
+ lea rax, [rsi - 32]
+ mov r8, rax
+ shr r8, 5
+ add r8, 1
+ test rax, rax
+ je .LBB3_815
+# %bb.539:
+ mov rax, r8
+ and rax, -2
+ neg rax
+ xor edi, edi
+ vbroadcastss ymm0, dword ptr [rip + .LCPI3_9] # ymm0 = [2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647]
+.LBB3_540: # =>This Inner Loop Header: Depth=1
+ vandpd ymm1, ymm0, ymmword ptr [rdx + 4*rdi]
+ vandpd ymm2, ymm0, ymmword ptr [rdx + 4*rdi + 32]
+ vandpd ymm3, ymm0, ymmword ptr [rdx + 4*rdi + 64]
+ vandpd ymm4, ymm0, ymmword ptr [rdx + 4*rdi + 96]
+ vmovupd ymmword ptr [rcx + 4*rdi], ymm1
+ vmovupd ymmword ptr [rcx + 4*rdi + 32], ymm2
+ vmovupd ymmword ptr [rcx + 4*rdi + 64], ymm3
+ vmovupd ymmword ptr [rcx + 4*rdi + 96], ymm4
+ vandpd ymm1, ymm0, ymmword ptr [rdx + 4*rdi + 128]
+ vandpd ymm2, ymm0, ymmword ptr [rdx + 4*rdi + 160]
+ vandpd ymm3, ymm0, ymmword ptr [rdx + 4*rdi + 192]
+ vandpd ymm4, ymm0, ymmword ptr [rdx + 4*rdi + 224]
+ vmovupd ymmword ptr [rcx + 4*rdi + 128], ymm1
+ vmovupd ymmword ptr [rcx + 4*rdi + 160], ymm2
+ vmovupd ymmword ptr [rcx + 4*rdi + 192], ymm3
+ vmovupd ymmword ptr [rcx + 4*rdi + 224], ymm4
+ add rdi, 64
+ add rax, 2
+ jne .LBB3_540
+ jmp .LBB3_816
+.LBB3_548:
+ mov esi, r9d
+ and esi, -128
+ lea rax, [rsi - 128]
+ mov r8, rax
+ shr r8, 7
+ add r8, 1
+ test rax, rax
+ je .LBB3_825
+# %bb.549:
+ mov rax, r8
+ and rax, -2
+ neg rax
+ xor edi, edi
+ vpxor xmm0, xmm0, xmm0
+.LBB3_550: # =>This Inner Loop Header: Depth=1
+ vpsubb ymm1, ymm0, ymmword ptr [rdx + rdi]
+ vpsubb ymm2, ymm0, ymmword ptr [rdx + rdi + 32]
+ vpsubb ymm3, ymm0, ymmword ptr [rdx + rdi + 64]
+ vpsubb ymm4, ymm0, ymmword ptr [rdx + rdi + 96]
+ vmovdqu ymmword ptr [rcx + rdi], ymm1
+ vmovdqu ymmword ptr [rcx + rdi + 32], ymm2
+ vmovdqu ymmword ptr [rcx + rdi + 64], ymm3
+ vmovdqu ymmword ptr [rcx + rdi + 96], ymm4
+ vpsubb ymm1, ymm0, ymmword ptr [rdx + rdi + 128]
+ vpsubb ymm2, ymm0, ymmword ptr [rdx + rdi + 160]
+ vpsubb ymm3, ymm0, ymmword ptr [rdx + rdi + 192]
+ vpsubb ymm4, ymm0, ymmword ptr [rdx + rdi + 224]
+ vmovdqu ymmword ptr [rcx + rdi + 128], ymm1
+ vmovdqu ymmword ptr [rcx + rdi + 160], ymm2
+ vmovdqu ymmword ptr [rcx + rdi + 192], ymm3
+ vmovdqu ymmword ptr [rcx + rdi + 224], ymm4
+ add rdi, 256
+ add rax, 2
+ jne .LBB3_550
+ jmp .LBB3_826
+.LBB3_551:
+ mov esi, r9d
+ and esi, -128
+ xor edi, edi
+ vpxor xmm0, xmm0, xmm0
+ vmovdqa ymm1, ymmword ptr [rip + .LCPI3_6] # ymm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+.LBB3_552: # =>This Inner Loop Header: Depth=1
+ vpcmpeqb ymm2, ymm0, ymmword ptr [rdx + rdi]
+ vpandn ymm2, ymm2, ymm1
+ vpcmpeqb ymm3, ymm0, ymmword ptr [rdx + rdi + 32]
+ vpandn ymm3, ymm3, ymm1
+ vpcmpeqb ymm4, ymm0, ymmword ptr [rdx + rdi + 64]
+ vpcmpeqb ymm5, ymm0, ymmword ptr [rdx + rdi + 96]
+ vpandn ymm4, ymm4, ymm1
+ vpandn ymm5, ymm5, ymm1
+ vmovdqu ymmword ptr [rcx + rdi], ymm2
+ vmovdqu ymmword ptr [rcx + rdi + 32], ymm3
+ vmovdqu ymmword ptr [rcx + rdi + 64], ymm4
+ vmovdqu ymmword ptr [rcx + rdi + 96], ymm5
+ sub rdi, -128
+ cmp rsi, rdi
+ jne .LBB3_552
+# %bb.553:
+ cmp rsi, r9
+ je .LBB3_865
+ jmp .LBB3_554
+.LBB3_558:
+ mov esi, r9d
+ and esi, -128
+ lea rax, [rsi - 128]
+ mov r8, rax
+ shr r8, 7
+ add r8, 1
+ test rax, rax
+ je .LBB3_833
+# %bb.559:
+ mov rax, r8
+ and rax, -2
+ neg rax
+ xor edi, edi
+.LBB3_560: # =>This Inner Loop Header: Depth=1
+ vmovups ymm0, ymmword ptr [rdx + rdi]
+ vmovups ymm1, ymmword ptr [rdx + rdi + 32]
+ vmovups ymm2, ymmword ptr [rdx + rdi + 64]
+ vmovups ymm3, ymmword ptr [rdx + rdi + 96]
+ vmovups ymmword ptr [rcx + rdi], ymm0
+ vmovups ymmword ptr [rcx + rdi + 32], ymm1
+ vmovups ymmword ptr [rcx + rdi + 64], ymm2
+ vmovups ymmword ptr [rcx + rdi + 96], ymm3
+ vmovupd ymm0, ymmword ptr [rdx + rdi + 128]
+ vmovupd ymm1, ymmword ptr [rdx + rdi + 160]
+ vmovupd ymm2, ymmword ptr [rdx + rdi + 192]
+ vmovupd ymm3, ymmword ptr [rdx + rdi + 224]
+ vmovupd ymmword ptr [rcx + rdi + 128], ymm0
+ vmovupd ymmword ptr [rcx + rdi + 160], ymm1
+ vmovupd ymmword ptr [rcx + rdi + 192], ymm2
+ vmovupd ymmword ptr [rcx + rdi + 224], ymm3
+ add rdi, 256
+ add rax, 2
+ jne .LBB3_560
+ jmp .LBB3_834
+.LBB3_561:
+ mov esi, r9d
+ and esi, -128
+ lea rax, [rsi - 128]
+ mov r8, rax
+ shr r8, 7
+ add r8, 1
+ test rax, rax
+ je .LBB3_841
+# %bb.562:
+ mov rax, r8
+ and rax, -2
+ neg rax
+ xor edi, edi
+.LBB3_563: # =>This Inner Loop Header: Depth=1
+ vmovups ymm0, ymmword ptr [rdx + rdi]
+ vmovups ymm1, ymmword ptr [rdx + rdi + 32]
+ vmovups ymm2, ymmword ptr [rdx + rdi + 64]
+ vmovups ymm3, ymmword ptr [rdx + rdi + 96]
+ vmovups ymmword ptr [rcx + rdi], ymm0
+ vmovups ymmword ptr [rcx + rdi + 32], ymm1
+ vmovups ymmword ptr [rcx + rdi + 64], ymm2
+ vmovups ymmword ptr [rcx + rdi + 96], ymm3
+ vmovupd ymm0, ymmword ptr [rdx + rdi + 128]
+ vmovupd ymm1, ymmword ptr [rdx + rdi + 160]
+ vmovupd ymm2, ymmword ptr [rdx + rdi + 192]
+ vmovupd ymm3, ymmword ptr [rdx + rdi + 224]
+ vmovupd ymmword ptr [rcx + rdi + 128], ymm0
+ vmovupd ymmword ptr [rcx + rdi + 160], ymm1
+ vmovupd ymmword ptr [rcx + rdi + 192], ymm2
+ vmovupd ymmword ptr [rcx + rdi + 224], ymm3
+ add rdi, 256
+ add rax, 2
+ jne .LBB3_563
+ jmp .LBB3_842
+.LBB3_564:
+ mov esi, r9d
+ and esi, -32
+ lea rax, [rsi - 32]
+ mov r8, rax
+ shr r8, 5
+ add r8, 1
+ test rax, rax
+ je .LBB3_849
+# %bb.565:
+ mov rax, r8
+ and rax, -2
+ neg rax
+ xor edi, edi
+ vpxor xmm0, xmm0, xmm0
+.LBB3_566: # =>This Inner Loop Header: Depth=1
+ vpsubd ymm1, ymm0, ymmword ptr [rdx + 4*rdi]
+ vpsubd ymm2, ymm0, ymmword ptr [rdx + 4*rdi + 32]
+ vpsubd ymm3, ymm0, ymmword ptr [rdx + 4*rdi + 64]
+ vpsubd ymm4, ymm0, ymmword ptr [rdx + 4*rdi + 96]
+ vmovdqu ymmword ptr [rcx + 4*rdi], ymm1
+ vmovdqu ymmword ptr [rcx + 4*rdi + 32], ymm2
+ vmovdqu ymmword ptr [rcx + 4*rdi + 64], ymm3
+ vmovdqu ymmword ptr [rcx + 4*rdi + 96], ymm4
+ vpsubd ymm1, ymm0, ymmword ptr [rdx + 4*rdi + 128]
+ vpsubd ymm2, ymm0, ymmword ptr [rdx + 4*rdi + 160]
+ vpsubd ymm3, ymm0, ymmword ptr [rdx + 4*rdi + 192]
+ vpsubd ymm4, ymm0, ymmword ptr [rdx + 4*rdi + 224]
+ vmovdqu ymmword ptr [rcx + 4*rdi + 128], ymm1
+ vmovdqu ymmword ptr [rcx + 4*rdi + 160], ymm2
+ vmovdqu ymmword ptr [rcx + 4*rdi + 192], ymm3
+ vmovdqu ymmword ptr [rcx + 4*rdi + 224], ymm4
+ add rdi, 64
+ add rax, 2
+ jne .LBB3_566
+ jmp .LBB3_850
+.LBB3_567:
+ mov esi, r9d
+ and esi, -32
+ lea rax, [rsi - 32]
+ mov r8, rax
+ shr r8, 5
+ add r8, 1
+ test rax, rax
+ je .LBB3_857
+# %bb.568:
+ mov rax, r8
+ and rax, -2
+ neg rax
+ xor edi, edi
+ vpxor xmm0, xmm0, xmm0
+.LBB3_569: # =>This Inner Loop Header: Depth=1
+ vpsubd ymm1, ymm0, ymmword ptr [rdx + 4*rdi]
+ vpsubd ymm2, ymm0, ymmword ptr [rdx + 4*rdi + 32]
+ vpsubd ymm3, ymm0, ymmword ptr [rdx + 4*rdi + 64]
+ vpsubd ymm4, ymm0, ymmword ptr [rdx + 4*rdi + 96]
+ vmovdqu ymmword ptr [rcx + 4*rdi], ymm1
+ vmovdqu ymmword ptr [rcx + 4*rdi + 32], ymm2
+ vmovdqu ymmword ptr [rcx + 4*rdi + 64], ymm3
+ vmovdqu ymmword ptr [rcx + 4*rdi + 96], ymm4
+ vpsubd ymm1, ymm0, ymmword ptr [rdx + 4*rdi + 128]
+ vpsubd ymm2, ymm0, ymmword ptr [rdx + 4*rdi + 160]
+ vpsubd ymm3, ymm0, ymmword ptr [rdx + 4*rdi + 192]
+ vpsubd ymm4, ymm0, ymmword ptr [rdx + 4*rdi + 224]
+ vmovdqu ymmword ptr [rcx + 4*rdi + 128], ymm1
+ vmovdqu ymmword ptr [rcx + 4*rdi + 160], ymm2
+ vmovdqu ymmword ptr [rcx + 4*rdi + 192], ymm3
+ vmovdqu ymmword ptr [rcx + 4*rdi + 224], ymm4
+ add rdi, 64
+ add rax, 2
+ jne .LBB3_569
+ jmp .LBB3_858
+.LBB3_570:
+ mov esi, r11d
+ and esi, -32
+ xor edi, edi
+ vpxor xmm0, xmm0, xmm0
+ vpcmpeqd ymm1, ymm1, ymm1
+ vpbroadcastd ymm2, dword ptr [rip + .LCPI3_3] # ymm2 = [1,1,1,1,1,1,1,1]
+.LBB3_571: # =>This Inner Loop Header: Depth=1
+ vmovdqu ymm3, ymmword ptr [rdx + 4*rdi]
+ vmovdqu ymm4, ymmword ptr [rdx + 4*rdi + 32]
+ vmovdqu ymm5, ymmword ptr [rdx + 4*rdi + 64]
+ vmovdqu ymm6, ymmword ptr [rdx + 4*rdi + 96]
+ vpcmpeqd ymm7, ymm3, ymm0
+ vpxor ymm7, ymm7, ymm1
+ vpcmpeqd ymm8, ymm4, ymm0
+ vpxor ymm8, ymm8, ymm1
+ vpcmpeqd ymm9, ymm5, ymm0
+ vpxor ymm9, ymm9, ymm1
+ vpcmpeqd ymm10, ymm6, ymm0
+ vpxor ymm10, ymm10, ymm1
+ vpcmpgtd ymm3, ymm2, ymm3
+ vpcmpgtd ymm4, ymm2, ymm4
+ vpcmpgtd ymm5, ymm2, ymm5
+ vpcmpgtd ymm6, ymm2, ymm6
+ vblendvps ymm3, ymm2, ymm7, ymm3
+ vblendvps ymm4, ymm2, ymm8, ymm4
+ vblendvps ymm5, ymm2, ymm9, ymm5
+ vblendvps ymm6, ymm2, ymm10, ymm6
+ vmovups ymmword ptr [rcx + 4*rdi], ymm3
+ vmovups ymmword ptr [rcx + 4*rdi + 32], ymm4
+ vmovups ymmword ptr [rcx + 4*rdi + 64], ymm5
+ vmovups ymmword ptr [rcx + 4*rdi + 96], ymm6
+ add rdi, 32
+ cmp rsi, rdi
+ jne .LBB3_571
+# %bb.572:
+ cmp rsi, r11
+ je .LBB3_865
+ jmp .LBB3_573
+.LBB3_578:
+ mov esi, r10d
+ and esi, -32
+ xor edi, edi
+.LBB3_579: # =>This Inner Loop Header: Depth=1
+ vpabsd ymm0, ymmword ptr [rdx + 4*rdi]
+ vpabsd ymm1, ymmword ptr [rdx + 4*rdi + 32]
+ vpabsd ymm2, ymmword ptr [rdx + 4*rdi + 64]
+ vpabsd ymm3, ymmword ptr [rdx + 4*rdi + 96]
+ vmovdqu ymmword ptr [rcx + 4*rdi], ymm0
+ vmovdqu ymmword ptr [rcx + 4*rdi + 32], ymm1
+ vmovdqu ymmword ptr [rcx + 4*rdi + 64], ymm2
+ vmovdqu ymmword ptr [rcx + 4*rdi + 96], ymm3
+ add rdi, 32
+ cmp rsi, rdi
+ jne .LBB3_579
+# %bb.580:
+ cmp rsi, r10
+ je .LBB3_865
+ jmp .LBB3_581
+.LBB3_585:
+ mov esi, r10d
+ and esi, -32
+ xor edi, edi
+.LBB3_586: # =>This Inner Loop Header: Depth=1
+ vpabsd ymm0, ymmword ptr [rdx + 4*rdi]
+ vpabsd ymm1, ymmword ptr [rdx + 4*rdi + 32]
+ vpabsd ymm2, ymmword ptr [rdx + 4*rdi + 64]
+ vpabsd ymm3, ymmword ptr [rdx + 4*rdi + 96]
+ vmovdqu ymmword ptr [rcx + 4*rdi], ymm0
+ vmovdqu ymmword ptr [rcx + 4*rdi + 32], ymm1
+ vmovdqu ymmword ptr [rcx + 4*rdi + 64], ymm2
+ vmovdqu ymmword ptr [rcx + 4*rdi + 96], ymm3
+ add rdi, 32
+ cmp rsi, rdi
+ jne .LBB3_586
+# %bb.587:
+ cmp rsi, r10
+ je .LBB3_865
+ jmp .LBB3_588
+.LBB3_367:
+ and rdi, -4
+ neg rdi
+ xor eax, eax
+ vxorpd xmm0, xmm0, xmm0
+.LBB3_368: # =>This Inner Loop Header: Depth=1
+ vmovupd ymmword ptr [rcx + 4*rax], ymm0
+ vmovupd ymmword ptr [rcx + 4*rax + 32], ymm0
+ vmovupd ymmword ptr [rcx + 4*rax + 64], ymm0
+ vmovupd ymmword ptr [rcx + 4*rax + 96], ymm0
+ vmovupd ymmword ptr [rcx + 4*rax + 128], ymm0
+ vmovupd ymmword ptr [rcx + 4*rax + 160], ymm0
+ vmovupd ymmword ptr [rcx + 4*rax + 192], ymm0
+ vmovupd ymmword ptr [rcx + 4*rax + 224], ymm0
+ vmovupd ymmword ptr [rcx + 4*rax + 256], ymm0
+ vmovupd ymmword ptr [rcx + 4*rax + 288], ymm0
+ vmovupd ymmword ptr [rcx + 4*rax + 320], ymm0
+ vmovupd ymmword ptr [rcx + 4*rax + 352], ymm0
+ vmovupd ymmword ptr [rcx + 4*rax + 384], ymm0
+ vmovupd ymmword ptr [rcx + 4*rax + 416], ymm0
+ vmovupd ymmword ptr [rcx + 4*rax + 448], ymm0
+ vmovupd ymmword ptr [rcx + 4*rax + 480], ymm0
+ sub rax, -128
+ add rdi, 4
+ jne .LBB3_368
+.LBB3_369:
+ test rsi, rsi
+ je .LBB3_372
+# %bb.370:
+ lea rax, [rcx + 4*rax]
+ add rax, 96
+ neg rsi
+ vxorpd xmm0, xmm0, xmm0
+.LBB3_371: # =>This Inner Loop Header: Depth=1
+ vmovupd ymmword ptr [rax - 96], ymm0
+ vmovupd ymmword ptr [rax - 64], ymm0
+ vmovupd ymmword ptr [rax - 32], ymm0
+ vmovupd ymmword ptr [rax], ymm0
+ sub rax, -128
+ inc rsi
+ jne .LBB3_371
+.LBB3_372:
+ cmp rdx, r9
+ je .LBB3_865
+ .p2align 4, 0x90
+.LBB3_373: # =>This Inner Loop Header: Depth=1
+ mov dword ptr [rcx + 4*rdx], 0
+ add rdx, 1
+ cmp r9, rdx
+ jne .LBB3_373
+ jmp .LBB3_865
+.LBB3_438:
+ and rdi, -4
+ neg rdi
+ xor eax, eax
+ vxorpd xmm0, xmm0, xmm0
+.LBB3_439: # =>This Inner Loop Header: Depth=1
+ vmovupd ymmword ptr [rcx + 8*rax], ymm0
+ vmovupd ymmword ptr [rcx + 8*rax + 32], ymm0
+ vmovupd ymmword ptr [rcx + 8*rax + 64], ymm0
+ vmovupd ymmword ptr [rcx + 8*rax + 96], ymm0
+ vmovupd ymmword ptr [rcx + 8*rax + 128], ymm0
+ vmovupd ymmword ptr [rcx + 8*rax + 160], ymm0
+ vmovupd ymmword ptr [rcx + 8*rax + 192], ymm0
+ vmovupd ymmword ptr [rcx + 8*rax + 224], ymm0
+ vmovupd ymmword ptr [rcx + 8*rax + 256], ymm0
+ vmovupd ymmword ptr [rcx + 8*rax + 288], ymm0
+ vmovupd ymmword ptr [rcx + 8*rax + 320], ymm0
+ vmovupd ymmword ptr [rcx + 8*rax + 352], ymm0
+ vmovupd ymmword ptr [rcx + 8*rax + 384], ymm0
+ vmovupd ymmword ptr [rcx + 8*rax + 416], ymm0
+ vmovupd ymmword ptr [rcx + 8*rax + 448], ymm0
+ vmovupd ymmword ptr [rcx + 8*rax + 480], ymm0
+ add rax, 64
+ add rdi, 4
+ jne .LBB3_439
+.LBB3_440:
+ test rsi, rsi
+ je .LBB3_443
+# %bb.441:
+ lea rax, [rcx + 8*rax]
+ add rax, 96
+ neg rsi
+ vxorpd xmm0, xmm0, xmm0
+.LBB3_442: # =>This Inner Loop Header: Depth=1
+ vmovupd ymmword ptr [rax - 96], ymm0
+ vmovupd ymmword ptr [rax - 64], ymm0
+ vmovupd ymmword ptr [rax - 32], ymm0
+ vmovupd ymmword ptr [rax], ymm0
+ sub rax, -128
+ inc rsi
+ jne .LBB3_442
+.LBB3_443:
+ cmp rdx, r9
+ je .LBB3_865
+ .p2align 4, 0x90
+.LBB3_444: # =>This Inner Loop Header: Depth=1
+ mov qword ptr [rcx + 8*rdx], 0
+ add rdx, 1
+ cmp r9, rdx
+ jne .LBB3_444
+ jmp .LBB3_865
+.LBB3_461:
+ and rdi, -4
+ neg rdi
+ xor eax, eax
+ vxorpd xmm0, xmm0, xmm0
+.LBB3_462: # =>This Inner Loop Header: Depth=1
+ vmovupd ymmword ptr [rcx + 2*rax], ymm0
+ vmovupd ymmword ptr [rcx + 2*rax + 32], ymm0
+ vmovupd ymmword ptr [rcx + 2*rax + 64], ymm0
+ vmovupd ymmword ptr [rcx + 2*rax + 96], ymm0
+ vmovupd ymmword ptr [rcx + 2*rax + 128], ymm0
+ vmovupd ymmword ptr [rcx + 2*rax + 160], ymm0
+ vmovupd ymmword ptr [rcx + 2*rax + 192], ymm0
+ vmovupd ymmword ptr [rcx + 2*rax + 224], ymm0
+ vmovupd ymmword ptr [rcx + 2*rax + 256], ymm0
+ vmovupd ymmword ptr [rcx + 2*rax + 288], ymm0
+ vmovupd ymmword ptr [rcx + 2*rax + 320], ymm0
+ vmovupd ymmword ptr [rcx + 2*rax + 352], ymm0
+ vmovupd ymmword ptr [rcx + 2*rax + 384], ymm0
+ vmovupd ymmword ptr [rcx + 2*rax + 416], ymm0
+ vmovupd ymmword ptr [rcx + 2*rax + 448], ymm0
+ vmovupd ymmword ptr [rcx + 2*rax + 480], ymm0
+ add rax, 256
+ add rdi, 4
+ jne .LBB3_462
+.LBB3_463:
+ test rsi, rsi
+ je .LBB3_466
+# %bb.464:
+ lea rax, [rcx + 2*rax]
+ add rax, 96
+ neg rsi
+ vxorpd xmm0, xmm0, xmm0
+.LBB3_465: # =>This Inner Loop Header: Depth=1
+ vmovupd ymmword ptr [rax - 96], ymm0
+ vmovupd ymmword ptr [rax - 64], ymm0
+ vmovupd ymmword ptr [rax - 32], ymm0
+ vmovupd ymmword ptr [rax], ymm0
+ sub rax, -128
+ inc rsi
+ jne .LBB3_465
+.LBB3_466:
+ cmp rdx, r9
+ je .LBB3_865
+ .p2align 4, 0x90
+.LBB3_467: # =>This Inner Loop Header: Depth=1
+ mov word ptr [rcx + 2*rdx], 0
+ add rdx, 1
+ cmp r9, rdx
+ jne .LBB3_467
+ jmp .LBB3_865
+.LBB3_541:
+ and rdi, -4
+ neg rdi
+ xor eax, eax
+ vxorpd xmm0, xmm0, xmm0
+.LBB3_542: # =>This Inner Loop Header: Depth=1
+ vmovupd ymmword ptr [rcx + rax], ymm0
+ vmovupd ymmword ptr [rcx + rax + 32], ymm0
+ vmovupd ymmword ptr [rcx + rax + 64], ymm0
+ vmovupd ymmword ptr [rcx + rax + 96], ymm0
+ vmovupd ymmword ptr [rcx + rax + 128], ymm0
+ vmovupd ymmword ptr [rcx + rax + 160], ymm0
+ vmovupd ymmword ptr [rcx + rax + 192], ymm0
+ vmovupd ymmword ptr [rcx + rax + 224], ymm0
+ vmovupd ymmword ptr [rcx + rax + 256], ymm0
+ vmovupd ymmword ptr [rcx + rax + 288], ymm0
+ vmovupd ymmword ptr [rcx + rax + 320], ymm0
+ vmovupd ymmword ptr [rcx + rax + 352], ymm0
+ vmovupd ymmword ptr [rcx + rax + 384], ymm0
+ vmovupd ymmword ptr [rcx + rax + 416], ymm0
+ vmovupd ymmword ptr [rcx + rax + 448], ymm0
+ vmovupd ymmword ptr [rcx + rax + 480], ymm0
+ add rax, 512
+ add rdi, 4
+ jne .LBB3_542
+.LBB3_543:
+ test rsi, rsi
+ je .LBB3_546
+# %bb.544:
+ add rax, rcx
+ add rax, 96
+ neg rsi
+ vxorpd xmm0, xmm0, xmm0
+.LBB3_545: # =>This Inner Loop Header: Depth=1
+ vmovupd ymmword ptr [rax - 96], ymm0
+ vmovupd ymmword ptr [rax - 64], ymm0
+ vmovupd ymmword ptr [rax - 32], ymm0
+ vmovupd ymmword ptr [rax], ymm0
+ sub rax, -128
+ inc rsi
+ jne .LBB3_545
+.LBB3_546:
+ cmp rdx, r9
+ je .LBB3_865
+ .p2align 4, 0x90
+.LBB3_547: # =>This Inner Loop Header: Depth=1
+ mov byte ptr [rcx + rdx], 0
+ add rdx, 1
+ cmp r9, rdx
+ jne .LBB3_547
+.LBB3_865:
+ mov rsp, rbp
+ pop rbp
+ vzeroupper
+ ret
+.LBB3_592:
+ and rdi, -4
+ neg rdi
+ xor eax, eax
+.LBB3_593: # =>This Inner Loop Header: Depth=1
+ vmovups ymm0, ymmword ptr [rdx + 2*rax]
+ vmovups ymm1, ymmword ptr [rdx + 2*rax + 32]
+ vmovups ymmword ptr [rcx + 2*rax], ymm0
+ vmovups ymmword ptr [rcx + 2*rax + 32], ymm1
+ vmovups ymm0, ymmword ptr [rdx + 2*rax + 64]
+ vmovups ymm1, ymmword ptr [rdx + 2*rax + 96]
+ vmovups ymmword ptr [rcx + 2*rax + 64], ymm0
+ vmovups ymmword ptr [rcx + 2*rax + 96], ymm1
+ vmovups ymm0, ymmword ptr [rdx + 2*rax + 128]
+ vmovups ymm1, ymmword ptr [rdx + 2*rax + 160]
+ vmovups ymmword ptr [rcx + 2*rax + 128], ymm0
+ vmovups ymmword ptr [rcx + 2*rax + 160], ymm1
+ vmovupd ymm0, ymmword ptr [rdx + 2*rax + 192]
+ vmovupd ymm1, ymmword ptr [rdx + 2*rax + 224]
+ vmovupd ymmword ptr [rcx + 2*rax + 192], ymm0
+ vmovupd ymmword ptr [rcx + 2*rax + 224], ymm1
+ sub rax, -128
+ add rdi, 4
+ jne .LBB3_593
+.LBB3_594:
+ test r8, r8
+ je .LBB3_597
+# %bb.595:
+ add rax, rax
+ add rax, 32
+ neg r8
+.LBB3_596: # =>This Inner Loop Header: Depth=1
+ vmovupd ymm0, ymmword ptr [rdx + rax - 32]
+ vmovupd ymm1, ymmword ptr [rdx + rax]
+ vmovupd ymmword ptr [rcx + rax - 32], ymm0
+ vmovupd ymmword ptr [rcx + rax], ymm1
+ add rax, 64
+ inc r8
+ jne .LBB3_596
+.LBB3_597:
+ cmp rsi, r9
+ je .LBB3_865
+ jmp .LBB3_598
+.LBB3_602:
+ and rdi, -4
+ neg rdi
+ xor eax, eax
+.LBB3_603: # =>This Inner Loop Header: Depth=1
+ vmovups ymm0, ymmword ptr [rdx + 2*rax]
+ vmovups ymm1, ymmword ptr [rdx + 2*rax + 32]
+ vmovups ymmword ptr [rcx + 2*rax], ymm0
+ vmovups ymmword ptr [rcx + 2*rax + 32], ymm1
+ vmovups ymm0, ymmword ptr [rdx + 2*rax + 64]
+ vmovups ymm1, ymmword ptr [rdx + 2*rax + 96]
+ vmovups ymmword ptr [rcx + 2*rax + 64], ymm0
+ vmovups ymmword ptr [rcx + 2*rax + 96], ymm1
+ vmovups ymm0, ymmword ptr [rdx + 2*rax + 128]
+ vmovups ymm1, ymmword ptr [rdx + 2*rax + 160]
+ vmovups ymmword ptr [rcx + 2*rax + 128], ymm0
+ vmovups ymmword ptr [rcx + 2*rax + 160], ymm1
+ vmovupd ymm0, ymmword ptr [rdx + 2*rax + 192]
+ vmovupd ymm1, ymmword ptr [rdx + 2*rax + 224]
+ vmovupd ymmword ptr [rcx + 2*rax + 192], ymm0
+ vmovupd ymmword ptr [rcx + 2*rax + 224], ymm1
+ sub rax, -128
+ add rdi, 4
+ jne .LBB3_603
+.LBB3_604:
+ test r8, r8
+ je .LBB3_607
+# %bb.605:
+ add rax, rax
+ add rax, 32
+ neg r8
+.LBB3_606: # =>This Inner Loop Header: Depth=1
+ vmovupd ymm0, ymmword ptr [rdx + rax - 32]
+ vmovupd ymm1, ymmword ptr [rdx + rax]
+ vmovupd ymmword ptr [rcx + rax - 32], ymm0
+ vmovupd ymmword ptr [rcx + rax], ymm1
+ add rax, 64
+ inc r8
+ jne .LBB3_606
+.LBB3_607:
+ cmp rsi, r9
+ je .LBB3_865
+ jmp .LBB3_608
+.LBB3_612:
+ xor edi, edi
+.LBB3_613:
+ test r8b, 1
+ je .LBB3_615
+# %bb.614:
+ vpxor xmm0, xmm0, xmm0
+ vpsubd ymm1, ymm0, ymmword ptr [rdx + 4*rdi]
+ vpsubd ymm2, ymm0, ymmword ptr [rdx + 4*rdi + 32]
+ vpsubd ymm3, ymm0, ymmword ptr [rdx + 4*rdi + 64]
+ vpsubd ymm0, ymm0, ymmword ptr [rdx + 4*rdi + 96]
+ vmovdqu ymmword ptr [rcx + 4*rdi], ymm1
+ vmovdqu ymmword ptr [rcx + 4*rdi + 32], ymm2
+ vmovdqu ymmword ptr [rcx + 4*rdi + 64], ymm3
+ vmovdqu ymmword ptr [rcx + 4*rdi + 96], ymm0
+.LBB3_615:
+ cmp rsi, r9
+ je .LBB3_865
+ jmp .LBB3_616
+.LBB3_620:
+ xor edi, edi
+.LBB3_621:
+ test r8b, 1
+ je .LBB3_623
+# %bb.622:
+ vmovupd ymm0, ymmword ptr [rdx + 4*rdi]
+ vmovupd ymm1, ymmword ptr [rdx + 4*rdi + 32]
+ vmovupd ymm2, ymmword ptr [rdx + 4*rdi + 64]
+ vmovupd ymm3, ymmword ptr [rdx + 4*rdi + 96]
+ vmovupd ymmword ptr [rcx + 4*rdi], ymm0
+ vmovupd ymmword ptr [rcx + 4*rdi + 32], ymm1
+ vmovupd ymmword ptr [rcx + 4*rdi + 64], ymm2
+ vmovupd ymmword ptr [rcx + 4*rdi + 96], ymm3
+.LBB3_623:
+ cmp rsi, r9
+ je .LBB3_865
+ jmp .LBB3_624
+.LBB3_628:
+ xor edi, edi
+.LBB3_629:
+ test r8b, 1
+ je .LBB3_631
+# %bb.630:
+ vmovupd ymm0, ymmword ptr [rdx + 4*rdi]
+ vmovupd ymm1, ymmword ptr [rdx + 4*rdi + 32]
+ vmovupd ymm2, ymmword ptr [rdx + 4*rdi + 64]
+ vmovupd ymm3, ymmword ptr [rdx + 4*rdi + 96]
+ vmovupd ymmword ptr [rcx + 4*rdi], ymm0
+ vmovupd ymmword ptr [rcx + 4*rdi + 32], ymm1
+ vmovupd ymmword ptr [rcx + 4*rdi + 64], ymm2
+ vmovupd ymmword ptr [rcx + 4*rdi + 96], ymm3
+.LBB3_631:
+ cmp rsi, r9
+ je .LBB3_865
+ jmp .LBB3_632
+.LBB3_636:
+ xor edi, edi
+.LBB3_637:
+ test r8b, 1
+ je .LBB3_639
+# %bb.638:
+ vbroadcastsd ymm0, qword ptr [rip + .LCPI3_0] # ymm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+ vxorpd ymm1, ymm0, ymmword ptr [rdx + 8*rdi]
+ vxorpd ymm2, ymm0, ymmword ptr [rdx + 8*rdi + 32]
+ vxorpd ymm3, ymm0, ymmword ptr [rdx + 8*rdi + 64]
+ vxorpd ymm0, ymm0, ymmword ptr [rdx + 8*rdi + 96]
+ vmovupd ymmword ptr [rcx + 8*rdi], ymm1
+ vmovupd ymmword ptr [rcx + 8*rdi + 32], ymm2
+ vmovupd ymmword ptr [rcx + 8*rdi + 64], ymm3
+ vmovupd ymmword ptr [rcx + 8*rdi + 96], ymm0
+.LBB3_639:
+ cmp rsi, r9
+ je .LBB3_865
+ jmp .LBB3_640
+.LBB3_646:
+ xor edi, edi
+.LBB3_647:
+ test r8b, 1
+ je .LBB3_649
+# %bb.648:
+ vbroadcastsd ymm0, qword ptr [rip + .LCPI3_0] # ymm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+ vxorpd ymm1, ymm0, ymmword ptr [rdx + 8*rdi]
+ vxorpd ymm2, ymm0, ymmword ptr [rdx + 8*rdi + 32]
+ vxorpd ymm3, ymm0, ymmword ptr [rdx + 8*rdi + 64]
+ vxorpd ymm0, ymm0, ymmword ptr [rdx + 8*rdi + 96]
+ vmovupd ymmword ptr [rcx + 8*rdi], ymm1
+ vmovupd ymmword ptr [rcx + 8*rdi + 32], ymm2
+ vmovupd ymmword ptr [rcx + 8*rdi + 64], ymm3
+ vmovupd ymmword ptr [rcx + 8*rdi + 96], ymm0
+.LBB3_649:
+ cmp rsi, r9
+ je .LBB3_865
+ jmp .LBB3_650
+.LBB3_656:
+ xor edi, edi
+.LBB3_657:
+ test r8b, 1
+ je .LBB3_659
+# %bb.658:
+ vbroadcastsd ymm0, qword ptr [rip + .LCPI3_8] # ymm0 = [9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807]
+ vandpd ymm1, ymm0, ymmword ptr [rdx + 8*rdi]
+ vandpd ymm2, ymm0, ymmword ptr [rdx + 8*rdi + 32]
+ vandpd ymm3, ymm0, ymmword ptr [rdx + 8*rdi + 64]
+ vandpd ymm0, ymm0, ymmword ptr [rdx + 8*rdi + 96]
+ vmovupd ymmword ptr [rcx + 8*rdi], ymm1
+ vmovupd ymmword ptr [rcx + 8*rdi + 32], ymm2
+ vmovupd ymmword ptr [rcx + 8*rdi + 64], ymm3
+ vmovupd ymmword ptr [rcx + 8*rdi + 96], ymm0
+.LBB3_659:
+ cmp rsi, r9
+ je .LBB3_865
+ jmp .LBB3_660
+.LBB3_664:
+ xor edi, edi
+.LBB3_665:
+ test r8b, 1
+ je .LBB3_667
+# %bb.666:
+ vbroadcastsd ymm0, qword ptr [rip + .LCPI3_8] # ymm0 = [9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807]
+ vandpd ymm1, ymm0, ymmword ptr [rdx + 8*rdi]
+ vandpd ymm2, ymm0, ymmword ptr [rdx + 8*rdi + 32]
+ vandpd ymm3, ymm0, ymmword ptr [rdx + 8*rdi + 64]
+ vandpd ymm0, ymm0, ymmword ptr [rdx + 8*rdi + 96]
+ vmovupd ymmword ptr [rcx + 8*rdi], ymm1
+ vmovupd ymmword ptr [rcx + 8*rdi + 32], ymm2
+ vmovupd ymmword ptr [rcx + 8*rdi + 64], ymm3
+ vmovupd ymmword ptr [rcx + 8*rdi + 96], ymm0
+.LBB3_667:
+ cmp rsi, r9
+ je .LBB3_865
+ jmp .LBB3_668
+.LBB3_672:
+ xor edi, edi
+.LBB3_673:
+ test r8b, 1
+ je .LBB3_675
+# %bb.674:
+ vpxor xmm0, xmm0, xmm0
+ vpsubb ymm1, ymm0, ymmword ptr [rdx + rdi]
+ vpsubb ymm2, ymm0, ymmword ptr [rdx + rdi + 32]
+ vpsubb ymm3, ymm0, ymmword ptr [rdx + rdi + 64]
+ vpsubb ymm0, ymm0, ymmword ptr [rdx + rdi + 96]
+ vmovdqu ymmword ptr [rcx + rdi], ymm1
+ vmovdqu ymmword ptr [rcx + rdi + 32], ymm2
+ vmovdqu ymmword ptr [rcx + rdi + 64], ymm3
+ vmovdqu ymmword ptr [rcx + rdi + 96], ymm0
+.LBB3_675:
+ cmp rsi, r9
+ je .LBB3_865
+ jmp .LBB3_676
+.LBB3_680:
+ xor edi, edi
+.LBB3_681:
+ test r8b, 1
+ je .LBB3_683
+# %bb.682:
+ vpxor xmm0, xmm0, xmm0
+ vpsubb ymm1, ymm0, ymmword ptr [rdx + rdi]
+ vpsubb ymm2, ymm0, ymmword ptr [rdx + rdi + 32]
+ vpsubb ymm3, ymm0, ymmword ptr [rdx + rdi + 64]
+ vpsubb ymm0, ymm0, ymmword ptr [rdx + rdi + 96]
+ vmovdqu ymmword ptr [rcx + rdi], ymm1
+ vmovdqu ymmword ptr [rcx + rdi + 32], ymm2
+ vmovdqu ymmword ptr [rcx + rdi + 64], ymm3
+ vmovdqu ymmword ptr [rcx + rdi + 96], ymm0
+.LBB3_683:
+ cmp rsi, r9
+ je .LBB3_865
+ jmp .LBB3_684
+.LBB3_688:
+ xor edi, edi
+.LBB3_689:
+ test r8b, 1
+ je .LBB3_691
+# %bb.690:
+ vpxor xmm0, xmm0, xmm0
+ vpsubq ymm1, ymm0, ymmword ptr [rdx + 8*rdi]
+ vpsubq ymm2, ymm0, ymmword ptr [rdx + 8*rdi + 32]
+ vpsubq ymm3, ymm0, ymmword ptr [rdx + 8*rdi + 64]
+ vpsubq ymm0, ymm0, ymmword ptr [rdx + 8*rdi + 96]
+ vmovdqu ymmword ptr [rcx + 8*rdi], ymm1
+ vmovdqu ymmword ptr [rcx + 8*rdi + 32], ymm2
+ vmovdqu ymmword ptr [rcx + 8*rdi + 64], ymm3
+ vmovdqu ymmword ptr [rcx + 8*rdi + 96], ymm0
+.LBB3_691:
+ cmp rsi, r9
+ je .LBB3_865
+ jmp .LBB3_692
+.LBB3_696:
+ xor edi, edi
+.LBB3_697:
+ test r8b, 1
+ je .LBB3_699
+# %bb.698:
+ vmovupd ymm0, ymmword ptr [rdx + 8*rdi]
+ vmovupd ymm1, ymmword ptr [rdx + 8*rdi + 32]
+ vmovupd ymm2, ymmword ptr [rdx + 8*rdi + 64]
+ vmovupd ymm3, ymmword ptr [rdx + 8*rdi + 96]
+ vmovupd ymmword ptr [rcx + 8*rdi], ymm0
+ vmovupd ymmword ptr [rcx + 8*rdi + 32], ymm1
+ vmovupd ymmword ptr [rcx + 8*rdi + 64], ymm2
+ vmovupd ymmword ptr [rcx + 8*rdi + 96], ymm3
+.LBB3_699:
+ cmp rsi, r9
+ je .LBB3_865
+ jmp .LBB3_700
+.LBB3_704:
+ xor edi, edi
+.LBB3_705:
+ test r8b, 1
+ je .LBB3_707
+# %bb.706:
+ vmovupd ymm0, ymmword ptr [rdx + 8*rdi]
+ vmovupd ymm1, ymmword ptr [rdx + 8*rdi + 32]
+ vmovupd ymm2, ymmword ptr [rdx + 8*rdi + 64]
+ vmovupd ymm3, ymmword ptr [rdx + 8*rdi + 96]
+ vmovupd ymmword ptr [rcx + 8*rdi], ymm0
+ vmovupd ymmword ptr [rcx + 8*rdi + 32], ymm1
+ vmovupd ymmword ptr [rcx + 8*rdi + 64], ymm2
+ vmovupd ymmword ptr [rcx + 8*rdi + 96], ymm3
+.LBB3_707:
+ cmp rsi, r9
+ je .LBB3_865
+ jmp .LBB3_708
+.LBB3_712:
+ xor edi, edi
+.LBB3_713:
+ test r8b, 1
+ je .LBB3_715
+# %bb.714:
+ vpxor xmm0, xmm0, xmm0
+ vpsubw ymm1, ymm0, ymmword ptr [rdx + 2*rdi]
+ vpsubw ymm0, ymm0, ymmword ptr [rdx + 2*rdi + 32]
+ vmovdqu ymmword ptr [rcx + 2*rdi], ymm1
+ vmovdqu ymmword ptr [rcx + 2*rdi + 32], ymm0
+.LBB3_715:
+ cmp rsi, r9
+ je .LBB3_865
+ jmp .LBB3_716
+.LBB3_720:
+ xor edi, edi
+.LBB3_721:
+ test r8b, 1
+ je .LBB3_723
+# %bb.722:
+ vpxor xmm0, xmm0, xmm0
+ vpsubw ymm1, ymm0, ymmword ptr [rdx + 2*rdi]
+ vpsubw ymm0, ymm0, ymmword ptr [rdx + 2*rdi + 32]
+ vmovdqu ymmword ptr [rcx + 2*rdi], ymm1
+ vmovdqu ymmword ptr [rcx + 2*rdi + 32], ymm0
+.LBB3_723:
+ cmp rsi, r9
+ je .LBB3_865
+ jmp .LBB3_724
+.LBB3_728:
+ xor edi, edi
+.LBB3_729:
+ test r8b, 1
+ je .LBB3_731
+# %bb.730:
+ vpxor xmm0, xmm0, xmm0
+ vpsubw ymm1, ymm0, ymmword ptr [rdx + 2*rdi]
+ vpsubw ymm0, ymm0, ymmword ptr [rdx + 2*rdi + 32]
+ vmovdqu ymmword ptr [rcx + 2*rdi], ymm1
+ vmovdqu ymmword ptr [rcx + 2*rdi + 32], ymm0
+.LBB3_731:
+ cmp rsi, r9
+ je .LBB3_865
+ jmp .LBB3_732
+.LBB3_736:
+ xor edi, edi
+.LBB3_737:
+ test r8b, 1
+ je .LBB3_739
+# %bb.738:
+ vpxor xmm0, xmm0, xmm0
+ vpcmpeqw ymm1, ymm0, ymmword ptr [rdx + 2*rdi]
+ vmovdqa ymm2, ymmword ptr [rip + .LCPI3_5] # ymm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+ vpcmpeqw ymm0, ymm0, ymmword ptr [rdx + 2*rdi + 32]
+ vpandn ymm1, ymm1, ymm2
+ vpandn ymm0, ymm0, ymm2
+ vmovdqu ymmword ptr [rcx + 2*rdi], ymm1
+ vmovdqu ymmword ptr [rcx + 2*rdi + 32], ymm0
+.LBB3_739:
+ cmp rsi, r9
+ je .LBB3_865
+ jmp .LBB3_740
+.LBB3_744:
+ xor edi, edi
+.LBB3_745:
+ test r8b, 1
+ je .LBB3_747
+# %bb.746:
+ vmovdqu ymm0, ymmword ptr [rdx + 2*rdi]
+ vmovdqu ymm1, ymmword ptr [rdx + 2*rdi + 32]
+ vpxor xmm2, xmm2, xmm2
+ vpcmpeqw ymm3, ymm0, ymm2
+ vpcmpeqd ymm4, ymm4, ymm4
+ vpxor ymm3, ymm3, ymm4
+ vpcmpeqw ymm2, ymm1, ymm2
+ vpxor ymm2, ymm2, ymm4
+ vmovdqa ymm4, ymmword ptr [rip + .LCPI3_5] # ymm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+ vpcmpgtw ymm0, ymm4, ymm0
+ vpcmpgtw ymm1, ymm4, ymm1
+ vpblendvb ymm0, ymm4, ymm3, ymm0
+ vpblendvb ymm1, ymm4, ymm2, ymm1
+ vmovdqu ymmword ptr [rcx + 2*rdi], ymm0
+ vmovdqu ymmword ptr [rcx + 2*rdi + 32], ymm1
+.LBB3_747:
+ cmp rsi, r9
+ je .LBB3_865
+ jmp .LBB3_748
+.LBB3_753:
+ xor edi, edi
+.LBB3_754:
+ test r8b, 1
+ je .LBB3_756
+# %bb.755:
+ vpmovsxwd ymm0, xmmword ptr [rdx + 2*rdi]
+ vpmovsxwd ymm1, xmmword ptr [rdx + 2*rdi + 16]
+ vpsrad ymm2, ymm1, 15
+ vpsrad ymm3, ymm0, 15
+ vpaddd ymm0, ymm3, ymm0
+ vpaddd ymm1, ymm2, ymm1
+ vpxor ymm1, ymm1, ymm2
+ vpxor ymm0, ymm0, ymm3
+ vmovdqa ymm2, ymmword ptr [rip + .LCPI3_10] # ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+ vpshufb ymm0, ymm0, ymm2
+ vpermq ymm0, ymm0, 232 # ymm0 = ymm0[0,2,2,3]
+ vpshufb ymm1, ymm1, ymm2
+ vpermq ymm1, ymm1, 232 # ymm1 = ymm1[0,2,2,3]
+ vmovdqu xmmword ptr [rcx + 2*rdi + 16], xmm1
+ vmovdqu xmmword ptr [rcx + 2*rdi], xmm0
+.LBB3_756:
+ cmp rsi, r9
+ je .LBB3_865
+ jmp .LBB3_757
+.LBB3_761:
+ xor edi, edi
+.LBB3_762:
+ test r8b, 1
+ je .LBB3_764
+# %bb.763:
+ vpmovsxwd ymm0, xmmword ptr [rdx + 2*rdi]
+ vpmovsxwd ymm1, xmmword ptr [rdx + 2*rdi + 16]
+ vpsrad ymm2, ymm1, 15
+ vpsrad ymm3, ymm0, 15
+ vpaddd ymm0, ymm3, ymm0
+ vpaddd ymm1, ymm2, ymm1
+ vpxor ymm1, ymm1, ymm2
+ vpxor ymm0, ymm0, ymm3
+ vmovdqa ymm2, ymmword ptr [rip + .LCPI3_10] # ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+ vpshufb ymm0, ymm0, ymm2
+ vpermq ymm0, ymm0, 232 # ymm0 = ymm0[0,2,2,3]
+ vpshufb ymm1, ymm1, ymm2
+ vpermq ymm1, ymm1, 232 # ymm1 = ymm1[0,2,2,3]
+ vmovdqu xmmword ptr [rcx + 2*rdi + 16], xmm1
+ vmovdqu xmmword ptr [rcx + 2*rdi], xmm0
+.LBB3_764:
+ cmp rsi, r9
+ je .LBB3_865
+ jmp .LBB3_765
+.LBB3_769:
+ xor edi, edi
+.LBB3_770:
+ test r8b, 1
+ je .LBB3_772
+# %bb.771:
+ vpxor xmm0, xmm0, xmm0
+ vpsubq ymm1, ymm0, ymmword ptr [rdx + 8*rdi]
+ vpsubq ymm2, ymm0, ymmword ptr [rdx + 8*rdi + 32]
+ vpsubq ymm3, ymm0, ymmword ptr [rdx + 8*rdi + 64]
+ vpsubq ymm0, ymm0, ymmword ptr [rdx + 8*rdi + 96]
+ vmovdqu ymmword ptr [rcx + 8*rdi], ymm1
+ vmovdqu ymmword ptr [rcx + 8*rdi + 32], ymm2
+ vmovdqu ymmword ptr [rcx + 8*rdi + 64], ymm3
+ vmovdqu ymmword ptr [rcx + 8*rdi + 96], ymm0
+.LBB3_772:
+ cmp rsi, r9
+ je .LBB3_865
+ jmp .LBB3_773
+.LBB3_777:
+ xor edi, edi
+.LBB3_778:
+ test r8b, 1
+ je .LBB3_780
+# %bb.779:
+ vbroadcastss ymm0, dword ptr [rip + .LCPI3_7] # ymm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+ vxorpd ymm1, ymm0, ymmword ptr [rdx + 4*rdi]
+ vxorpd ymm2, ymm0, ymmword ptr [rdx + 4*rdi + 32]
+ vxorpd ymm3, ymm0, ymmword ptr [rdx + 4*rdi + 64]
+ vxorpd ymm0, ymm0, ymmword ptr [rdx + 4*rdi + 96]
+ vmovupd ymmword ptr [rcx + 4*rdi], ymm1
+ vmovupd ymmword ptr [rcx + 4*rdi + 32], ymm2
+ vmovupd ymmword ptr [rcx + 4*rdi + 64], ymm3
+ vmovupd ymmword ptr [rcx + 4*rdi + 96], ymm0
+.LBB3_780:
+ cmp rsi, r9
+ je .LBB3_865
+ jmp .LBB3_781
+.LBB3_787:
+ xor edi, edi
+.LBB3_788:
+ test r8b, 1
+ je .LBB3_790
+# %bb.789:
+ vpxor xmm0, xmm0, xmm0
+ vpsubq ymm1, ymm0, ymmword ptr [rdx + 8*rdi]
+ vpsubq ymm2, ymm0, ymmword ptr [rdx + 8*rdi + 32]
+ vpsubq ymm3, ymm0, ymmword ptr [rdx + 8*rdi + 64]
+ vpsubq ymm0, ymm0, ymmword ptr [rdx + 8*rdi + 96]
+ vmovdqu ymmword ptr [rcx + 8*rdi], ymm1
+ vmovdqu ymmword ptr [rcx + 8*rdi + 32], ymm2
+ vmovdqu ymmword ptr [rcx + 8*rdi + 64], ymm3
+ vmovdqu ymmword ptr [rcx + 8*rdi + 96], ymm0
+.LBB3_790:
+ cmp rsi, r9
+ je .LBB3_865
+ jmp .LBB3_791
+.LBB3_795:
+ xor edi, edi
+.LBB3_796:
+ test r8b, 1
+ je .LBB3_798
+# %bb.797:
+ vbroadcastss ymm0, dword ptr [rip + .LCPI3_7] # ymm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+ vxorpd ymm1, ymm0, ymmword ptr [rdx + 4*rdi]
+ vxorpd ymm2, ymm0, ymmword ptr [rdx + 4*rdi + 32]
+ vxorpd ymm3, ymm0, ymmword ptr [rdx + 4*rdi + 64]
+ vxorpd ymm0, ymm0, ymmword ptr [rdx + 4*rdi + 96]
+ vmovupd ymmword ptr [rcx + 4*rdi], ymm1
+ vmovupd ymmword ptr [rcx + 4*rdi + 32], ymm2
+ vmovupd ymmword ptr [rcx + 4*rdi + 64], ymm3
+ vmovupd ymmword ptr [rcx + 4*rdi + 96], ymm0
+.LBB3_798:
+ cmp rsi, r9
+ je .LBB3_865
+ jmp .LBB3_799
+.LBB3_805:
+ xor edi, edi
+.LBB3_806:
+ test r8b, 1
+ je .LBB3_808
+# %bb.807:
+ vbroadcastss ymm0, dword ptr [rip + .LCPI3_9] # ymm0 = [2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647]
+ vandpd ymm1, ymm0, ymmword ptr [rdx + 4*rdi]
+ vandpd ymm2, ymm0, ymmword ptr [rdx + 4*rdi + 32]
+ vandpd ymm3, ymm0, ymmword ptr [rdx + 4*rdi + 64]
+ vandpd ymm0, ymm0, ymmword ptr [rdx + 4*rdi + 96]
+ vmovupd ymmword ptr [rcx + 4*rdi], ymm1
+ vmovupd ymmword ptr [rcx + 4*rdi + 32], ymm2
+ vmovupd ymmword ptr [rcx + 4*rdi + 64], ymm3
+ vmovupd ymmword ptr [rcx + 4*rdi + 96], ymm0
+.LBB3_808:
+ cmp rsi, r9
+ je .LBB3_865
+ jmp .LBB3_809
+.LBB3_815:
+ xor edi, edi
+.LBB3_816:
+ test r8b, 1
+ je .LBB3_818
+# %bb.817:
+ vbroadcastss ymm0, dword ptr [rip + .LCPI3_9] # ymm0 = [2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647]
+ vandpd ymm1, ymm0, ymmword ptr [rdx + 4*rdi]
+ vandpd ymm2, ymm0, ymmword ptr [rdx + 4*rdi + 32]
+ vandpd ymm3, ymm0, ymmword ptr [rdx + 4*rdi + 64]
+ vandpd ymm0, ymm0, ymmword ptr [rdx + 4*rdi + 96]
+ vmovupd ymmword ptr [rcx + 4*rdi], ymm1
+ vmovupd ymmword ptr [rcx + 4*rdi + 32], ymm2
+ vmovupd ymmword ptr [rcx + 4*rdi + 64], ymm3
+ vmovupd ymmword ptr [rcx + 4*rdi + 96], ymm0
+.LBB3_818:
+ cmp rsi, r9
+ je .LBB3_865
+ jmp .LBB3_819
+.LBB3_825:
+ xor edi, edi
+.LBB3_826:
+ test r8b, 1
+ je .LBB3_828
+# %bb.827:
+ vpxor xmm0, xmm0, xmm0
+ vpsubb ymm1, ymm0, ymmword ptr [rdx + rdi]
+ vpsubb ymm2, ymm0, ymmword ptr [rdx + rdi + 32]
+ vpsubb ymm3, ymm0, ymmword ptr [rdx + rdi + 64]
+ vpsubb ymm0, ymm0, ymmword ptr [rdx + rdi + 96]
+ vmovdqu ymmword ptr [rcx + rdi], ymm1
+ vmovdqu ymmword ptr [rcx + rdi + 32], ymm2
+ vmovdqu ymmword ptr [rcx + rdi + 64], ymm3
+ vmovdqu ymmword ptr [rcx + rdi + 96], ymm0
+.LBB3_828:
+ cmp rsi, r9
+ je .LBB3_865
+ jmp .LBB3_829
+.LBB3_833:
+ xor edi, edi
+.LBB3_834:
+ test r8b, 1
+ je .LBB3_836
+# %bb.835:
+ vmovupd ymm0, ymmword ptr [rdx + rdi]
+ vmovupd ymm1, ymmword ptr [rdx + rdi + 32]
+ vmovupd ymm2, ymmword ptr [rdx + rdi + 64]
+ vmovupd ymm3, ymmword ptr [rdx + rdi + 96]
+ vmovupd ymmword ptr [rcx + rdi], ymm0
+ vmovupd ymmword ptr [rcx + rdi + 32], ymm1
+ vmovupd ymmword ptr [rcx + rdi + 64], ymm2
+ vmovupd ymmword ptr [rcx + rdi + 96], ymm3
+.LBB3_836:
+ cmp rsi, r9
+ je .LBB3_865
+ jmp .LBB3_837
+.LBB3_841:
+ xor edi, edi
+.LBB3_842:
+ test r8b, 1
+ je .LBB3_844
+# %bb.843:
+ vmovupd ymm0, ymmword ptr [rdx + rdi]
+ vmovupd ymm1, ymmword ptr [rdx + rdi + 32]
+ vmovupd ymm2, ymmword ptr [rdx + rdi + 64]
+ vmovupd ymm3, ymmword ptr [rdx + rdi + 96]
+ vmovupd ymmword ptr [rcx + rdi], ymm0
+ vmovupd ymmword ptr [rcx + rdi + 32], ymm1
+ vmovupd ymmword ptr [rcx + rdi + 64], ymm2
+ vmovupd ymmword ptr [rcx + rdi + 96], ymm3
+.LBB3_844:
+ cmp rsi, r9
+ je .LBB3_865
+ jmp .LBB3_845
+.LBB3_849:
+ xor edi, edi
+.LBB3_850:
+ test r8b, 1
+ je .LBB3_852
+# %bb.851:
+ vpxor xmm0, xmm0, xmm0
+ vpsubd ymm1, ymm0, ymmword ptr [rdx + 4*rdi]
+ vpsubd ymm2, ymm0, ymmword ptr [rdx + 4*rdi + 32]
+ vpsubd ymm3, ymm0, ymmword ptr [rdx + 4*rdi + 64]
+ vpsubd ymm0, ymm0, ymmword ptr [rdx + 4*rdi + 96]
+ vmovdqu ymmword ptr [rcx + 4*rdi], ymm1
+ vmovdqu ymmword ptr [rcx + 4*rdi + 32], ymm2
+ vmovdqu ymmword ptr [rcx + 4*rdi + 64], ymm3
+ vmovdqu ymmword ptr [rcx + 4*rdi + 96], ymm0
+.LBB3_852:
+ cmp rsi, r9
+ je .LBB3_865
+ jmp .LBB3_853
+.LBB3_857:
+ xor edi, edi
+.LBB3_858:
+ test r8b, 1
+ je .LBB3_860
+# %bb.859:
+ vpxor xmm0, xmm0, xmm0
+ vpsubd ymm1, ymm0, ymmword ptr [rdx + 4*rdi]
+ vpsubd ymm2, ymm0, ymmword ptr [rdx + 4*rdi + 32]
+ vpsubd ymm3, ymm0, ymmword ptr [rdx + 4*rdi + 64]
+ vpsubd ymm0, ymm0, ymmword ptr [rdx + 4*rdi + 96]
+ vmovdqu ymmword ptr [rcx + 4*rdi], ymm1
+ vmovdqu ymmword ptr [rcx + 4*rdi + 32], ymm2
+ vmovdqu ymmword ptr [rcx + 4*rdi + 64], ymm3
+ vmovdqu ymmword ptr [rcx + 4*rdi + 96], ymm0
+.LBB3_860:
+ cmp rsi, r9
+ je .LBB3_865
+ jmp .LBB3_861
+.Lfunc_end3:
+ .size arithmetic_unary_same_types_avx2, .Lfunc_end3-arithmetic_unary_same_types_avx2
+ # -- End function
+ .section .rodata.cst8,"aM",@progbits,8
+ .p2align 3 # -- Begin function arithmetic_unary_diff_type_avx2
+.LCPI4_0:
+ .quad 0x8000000000000000 # double -0
+.LCPI4_1:
+ .quad 0x3ff0000000000000 # double 1
+.LCPI4_6:
+ .quad 0x43e0000000000000 # double 9.2233720368547758E+18
+.LCPI4_7:
+ .quad 0x41e0000000000000 # double 2147483648
+.LCPI4_13:
+ .quad 0xbff0000000000000 # double -1
+.LCPI4_15:
+ .quad 1 # 0x1
+ .section .rodata.cst16,"aM",@progbits,16
+ .p2align 4
+.LCPI4_2:
+ .quad 0x8000000000000000 # double -0
+ .quad 0x8000000000000000 # double -0
+.LCPI4_11:
+ .short 1 # 0x1
+ .short 1 # 0x1
+ .short 1 # 0x1
+ .short 1 # 0x1
+ .short 1 # 0x1
+ .short 1 # 0x1
+ .short 1 # 0x1
+ .short 1 # 0x1
+.LCPI4_12:
+ .byte 1 # 0x1
+ .byte 1 # 0x1
+ .byte 1 # 0x1
+ .byte 1 # 0x1
+ .byte 1 # 0x1
+ .byte 1 # 0x1
+ .byte 1 # 0x1
+ .byte 1 # 0x1
+ .zero 1
+ .zero 1
+ .zero 1
+ .zero 1
+ .zero 1
+ .zero 1
+ .zero 1
+ .zero 1
+.LCPI4_16:
+ .short 1 # 0x1
+ .short 1 # 0x1
+ .short 1 # 0x1
+ .short 1 # 0x1
+ .zero 2
+ .zero 2
+ .zero 2
+ .zero 2
+.LCPI4_17:
+ .byte 1 # 0x1
+ .byte 1 # 0x1
+ .byte 1 # 0x1
+ .byte 1 # 0x1
+ .zero 1
+ .zero 1
+ .zero 1
+ .zero 1
+ .zero 1
+ .zero 1
+ .zero 1
+ .zero 1
+ .zero 1
+ .zero 1
+ .zero 1
+ .zero 1
+.LCPI4_19:
+ .zero 16,1
+ .section .rodata.cst4,"aM",@progbits,4
+ .p2align 2
+.LCPI4_3:
+ .long 0x7fffffff # float NaN
+.LCPI4_4:
+ .long 0x80000000 # float -0
+.LCPI4_5:
+ .long 0x3f800000 # float 1
+.LCPI4_8:
+ .long 1 # 0x1
+.LCPI4_9:
+ .long 0x5f000000 # float 9.22337203E+18
+.LCPI4_10:
+ .long 0x4f000000 # float 2.14748365E+9
+.LCPI4_14:
+ .long 0xbf800000 # float -1
+ .section .rodata.cst32,"aM",@progbits,32
+ .p2align 5
+.LCPI4_18:
+ .short 1 # 0x1
+ .short 1 # 0x1
+ .short 1 # 0x1
+ .short 1 # 0x1
+ .short 1 # 0x1
+ .short 1 # 0x1
+ .short 1 # 0x1
+ .short 1 # 0x1
+ .short 1 # 0x1
+ .short 1 # 0x1
+ .short 1 # 0x1
+ .short 1 # 0x1
+ .short 1 # 0x1
+ .short 1 # 0x1
+ .short 1 # 0x1
+ .short 1 # 0x1
+.LCPI4_20:
+ .zero 32,1
+ .text
+ .globl arithmetic_unary_diff_type_avx2
+ .p2align 4, 0x90
+ .type arithmetic_unary_diff_type_avx2,@function
+arithmetic_unary_diff_type_avx2: # @arithmetic_unary_diff_type_avx2
+# %bb.0:
+ push rbp
+ mov rbp, rsp
+ push r14
+ push rbx
+ and rsp, -8
+ cmp dl, 7
+ jne .LBB4_1351
+# %bb.1:
+ cmp edi, 6
+ jg .LBB4_14
+# %bb.2:
+ cmp edi, 3
+ jle .LBB4_26
+# %bb.3:
+ cmp edi, 4
+ je .LBB4_46
+# %bb.4:
+ cmp edi, 5
+ je .LBB4_54
+# %bb.5:
+ cmp edi, 6
+ jne .LBB4_1351
+# %bb.6:
+ cmp esi, 6
+ jg .LBB4_94
+# %bb.7:
+ cmp esi, 3
+ jle .LBB4_164
+# %bb.8:
+ cmp esi, 4
+ je .LBB4_267
+# %bb.9:
+ cmp esi, 5
+ je .LBB4_270
+# %bb.10:
+ cmp esi, 6
+ jne .LBB4_1351
+# %bb.11:
+ test r9d, r9d
+ jle .LBB4_1351
+# %bb.12:
+ mov eax, r9d
+ cmp r9d, 32
+ jb .LBB4_13
+# %bb.447:
+ lea rdx, [rcx + 4*rax]
+ cmp rdx, r8
+ jbe .LBB4_870
+# %bb.448:
+ lea rdx, [r8 + 4*rax]
+ cmp rdx, rcx
+ jbe .LBB4_870
+.LBB4_13:
+ xor edx, edx
+.LBB4_873:
+ mov r9, rdx
+ not r9
+ add r9, rax
+ mov rdi, rax
+ and rdi, 3
+ je .LBB4_875
+.LBB4_874: # =>This Inner Loop Header: Depth=1
+ xor esi, esi
+ cmp dword ptr [rcx + 4*rdx], 0
+ setne sil
+ mov dword ptr [r8 + 4*rdx], esi
+ add rdx, 1
+ add rdi, -1
+ jne .LBB4_874
+.LBB4_875:
+ cmp r9, 3
+ jb .LBB4_1351
+.LBB4_876: # =>This Inner Loop Header: Depth=1
+ xor esi, esi
+ cmp dword ptr [rcx + 4*rdx], 0
+ setne sil
+ mov dword ptr [r8 + 4*rdx], esi
+ xor esi, esi
+ cmp dword ptr [rcx + 4*rdx + 4], 0
+ setne sil
+ mov dword ptr [r8 + 4*rdx + 4], esi
+ xor esi, esi
+ cmp dword ptr [rcx + 4*rdx + 8], 0
+ setne sil
+ mov dword ptr [r8 + 4*rdx + 8], esi
+ xor esi, esi
+ cmp dword ptr [rcx + 4*rdx + 12], 0
+ setne sil
+ mov dword ptr [r8 + 4*rdx + 12], esi
+ add rdx, 4
+ cmp rax, rdx
+ jne .LBB4_876
+ jmp .LBB4_1351
+.LBB4_14:
+ cmp edi, 8
+ jle .LBB4_36
+# %bb.15:
+ cmp edi, 9
+ je .LBB4_62
+# %bb.16:
+ cmp edi, 11
+ je .LBB4_70
+# %bb.17:
+ cmp edi, 12
+ jne .LBB4_1351
+# %bb.18:
+ cmp esi, 6
+ jg .LBB4_101
+# %bb.19:
+ cmp esi, 3
+ jle .LBB4_169
+# %bb.20:
+ cmp esi, 4
+ je .LBB4_273
+# %bb.21:
+ cmp esi, 5
+ je .LBB4_276
+# %bb.22:
+ cmp esi, 6
+ jne .LBB4_1351
+# %bb.23:
+ test r9d, r9d
+ jle .LBB4_1351
+# %bb.24:
+ mov eax, r9d
+ xor r10d, r10d
+ cmp r9d, 4
+ jae .LBB4_450
+# %bb.25:
+ xor esi, esi
+ jmp .LBB4_1292
+.LBB4_26:
+ cmp edi, 2
+ je .LBB4_78
+# %bb.27:
+ cmp edi, 3
+ jne .LBB4_1351
+# %bb.28:
+ cmp esi, 6
+ jg .LBB4_108
+# %bb.29:
+ cmp esi, 3
+ jle .LBB4_174
+# %bb.30:
+ cmp esi, 4
+ je .LBB4_279
+# %bb.31:
+ cmp esi, 5
+ je .LBB4_282
+# %bb.32:
+ cmp esi, 6
+ jne .LBB4_1351
+# %bb.33:
+ test r9d, r9d
+ jle .LBB4_1351
+# %bb.34:
+ mov r11d, r9d
+ cmp r9d, 32
+ jb .LBB4_35
+# %bb.453:
+ lea rdx, [rcx + r11]
+ cmp rdx, r8
+ jbe .LBB4_877
+# %bb.454:
+ lea rdx, [r8 + 4*r11]
+ cmp rdx, rcx
+ jbe .LBB4_877
+.LBB4_35:
+ xor edx, edx
+.LBB4_880:
+ mov rsi, rdx
+ not rsi
+ test r11b, 1
+ je .LBB4_882
+# %bb.881:
+ mov r9b, byte ptr [rcx + rdx]
+ xor r10d, r10d
+ test r9b, r9b
+ setne r10b
+ neg r10d
+ test r9b, r9b
+ mov edi, 1
+ cmovle edi, r10d
+ mov dword ptr [r8 + 4*rdx], edi
+ or rdx, 1
+.LBB4_882:
+ add rsi, r11
+ je .LBB4_1351
+# %bb.883:
+ mov esi, 1
+.LBB4_884: # =>This Inner Loop Header: Depth=1
+ movzx eax, byte ptr [rcx + rdx]
+ xor edi, edi
+ test al, al
+ setne dil
+ neg edi
+ test al, al
+ cmovg edi, esi
+ mov dword ptr [r8 + 4*rdx], edi
+ movzx eax, byte ptr [rcx + rdx + 1]
+ xor edi, edi
+ test al, al
+ setne dil
+ neg edi
+ test al, al
+ cmovg edi, esi
+ mov dword ptr [r8 + 4*rdx + 4], edi
+ add rdx, 2
+ cmp r11, rdx
+ jne .LBB4_884
+ jmp .LBB4_1351
+.LBB4_36:
+ cmp edi, 7
+ je .LBB4_86
+# %bb.37:
+ cmp edi, 8
+ jne .LBB4_1351
+# %bb.38:
+ cmp esi, 6
+ jg .LBB4_115
+# %bb.39:
+ cmp esi, 3
+ jle .LBB4_179
+# %bb.40:
+ cmp esi, 4
+ je .LBB4_285
+# %bb.41:
+ cmp esi, 5
+ je .LBB4_288
+# %bb.42:
+ cmp esi, 6
+ jne .LBB4_1351
+# %bb.43:
+ test r9d, r9d
+ jle .LBB4_1351
+# %bb.44:
+ mov eax, r9d
+ cmp r9d, 16
+ jae .LBB4_456
+# %bb.45:
+ xor edx, edx
+ jmp .LBB4_459
+.LBB4_46:
+ cmp esi, 6
+ jg .LBB4_122
+# %bb.47:
+ cmp esi, 3
+ jle .LBB4_184
+# %bb.48:
+ cmp esi, 4
+ je .LBB4_291
+# %bb.49:
+ cmp esi, 5
+ je .LBB4_294
+# %bb.50:
+ cmp esi, 6
+ jne .LBB4_1351
+# %bb.51:
+ test r9d, r9d
+ jle .LBB4_1351
+# %bb.52:
+ mov eax, r9d
+ cmp r9d, 32
+ jae .LBB4_460
+# %bb.53:
+ xor edx, edx
+ jmp .LBB4_463
+.LBB4_54:
+ cmp esi, 6
+ jg .LBB4_129
+# %bb.55:
+ cmp esi, 3
+ jle .LBB4_189
+# %bb.56:
+ cmp esi, 4
+ je .LBB4_297
+# %bb.57:
+ cmp esi, 5
+ je .LBB4_300
+# %bb.58:
+ cmp esi, 6
+ jne .LBB4_1351
+# %bb.59:
+ test r9d, r9d
+ jle .LBB4_1351
+# %bb.60:
+ mov r10d, r9d
+ cmp r9d, 32
+ jae .LBB4_464
+# %bb.61:
+ xor edx, edx
+ jmp .LBB4_467
+.LBB4_62:
+ cmp esi, 6
+ jg .LBB4_136
+# %bb.63:
+ cmp esi, 3
+ jle .LBB4_194
+# %bb.64:
+ cmp esi, 4
+ je .LBB4_303
+# %bb.65:
+ cmp esi, 5
+ je .LBB4_306
+# %bb.66:
+ cmp esi, 6
+ jne .LBB4_1351
+# %bb.67:
+ test r9d, r9d
+ jle .LBB4_1351
+# %bb.68:
+ mov r10d, r9d
+ cmp r9d, 16
+ jae .LBB4_469
+# %bb.69:
+ xor edx, edx
+ jmp .LBB4_472
+.LBB4_70:
+ cmp esi, 6
+ jg .LBB4_143
+# %bb.71:
+ cmp esi, 3
+ jle .LBB4_199
+# %bb.72:
+ cmp esi, 4
+ je .LBB4_309
+# %bb.73:
+ cmp esi, 5
+ je .LBB4_312
+# %bb.74:
+ cmp esi, 6
+ jne .LBB4_1351
+# %bb.75:
+ test r9d, r9d
+ jle .LBB4_1351
+# %bb.76:
+ mov eax, r9d
+ cmp r9d, 8
+ jae .LBB4_474
+# %bb.77:
+ xor edx, edx
+ jmp .LBB4_1298
+.LBB4_78:
+ cmp esi, 6
+ jg .LBB4_150
+# %bb.79:
+ cmp esi, 3
+ jle .LBB4_204
+# %bb.80:
+ cmp esi, 4
+ je .LBB4_315
+# %bb.81:
+ cmp esi, 5
+ je .LBB4_318
+# %bb.82:
+ cmp esi, 6
+ jne .LBB4_1351
+# %bb.83:
+ test r9d, r9d
+ jle .LBB4_1351
+# %bb.84:
+ mov eax, r9d
+ cmp r9d, 32
+ jb .LBB4_85
+# %bb.477:
+ lea rdx, [rcx + rax]
+ cmp rdx, r8
+ jbe .LBB4_885
+# %bb.478:
+ lea rdx, [r8 + 4*rax]
+ cmp rdx, rcx
+ jbe .LBB4_885
+.LBB4_85:
+ xor edx, edx
+.LBB4_888:
+ mov r9, rdx
+ not r9
+ add r9, rax
+ mov rdi, rax
+ and rdi, 3
+ je .LBB4_890
+.LBB4_889: # =>This Inner Loop Header: Depth=1
+ xor esi, esi
+ cmp byte ptr [rcx + rdx], 0
+ setne sil
+ mov dword ptr [r8 + 4*rdx], esi
+ add rdx, 1
+ add rdi, -1
+ jne .LBB4_889
+.LBB4_890:
+ cmp r9, 3
+ jb .LBB4_1351
+.LBB4_891: # =>This Inner Loop Header: Depth=1
+ xor esi, esi
+ cmp byte ptr [rcx + rdx], 0
+ setne sil
+ mov dword ptr [r8 + 4*rdx], esi
+ xor esi, esi
+ cmp byte ptr [rcx + rdx + 1], 0
+ setne sil
+ mov dword ptr [r8 + 4*rdx + 4], esi
+ xor esi, esi
+ cmp byte ptr [rcx + rdx + 2], 0
+ setne sil
+ mov dword ptr [r8 + 4*rdx + 8], esi
+ xor esi, esi
+ cmp byte ptr [rcx + rdx + 3], 0
+ setne sil
+ mov dword ptr [r8 + 4*rdx + 12], esi
+ add rdx, 4
+ cmp rax, rdx
+ jne .LBB4_891
+ jmp .LBB4_1351
+.LBB4_86:
+ cmp esi, 6
+ jg .LBB4_157
+# %bb.87:
+ cmp esi, 3
+ jle .LBB4_209
+# %bb.88:
+ cmp esi, 4
+ je .LBB4_321
+# %bb.89:
+ cmp esi, 5
+ je .LBB4_324
+# %bb.90:
+ cmp esi, 6
+ jne .LBB4_1351
+# %bb.91:
+ test r9d, r9d
+ jle .LBB4_1351
+# %bb.92:
+ mov r11d, r9d
+ cmp r9d, 32
+ jb .LBB4_93
+# %bb.480:
+ lea rdx, [rcx + 4*r11]
+ cmp rdx, r8
+ jbe .LBB4_892
+# %bb.481:
+ lea rdx, [r8 + 4*r11]
+ cmp rdx, rcx
+ jbe .LBB4_892
+.LBB4_93:
+ xor edx, edx
+.LBB4_895:
+ mov rsi, rdx
+ not rsi
+ test r11b, 1
+ je .LBB4_897
+# %bb.896:
+ mov r9d, dword ptr [rcx + 4*rdx]
+ xor r10d, r10d
+ test r9d, r9d
+ setne r10b
+ neg r10d
+ test r9d, r9d
+ mov edi, 1
+ cmovle edi, r10d
+ mov dword ptr [r8 + 4*rdx], edi
+ or rdx, 1
+.LBB4_897:
+ add rsi, r11
+ je .LBB4_1351
+# %bb.898:
+ mov esi, 1
+.LBB4_899: # =>This Inner Loop Header: Depth=1
+ mov edi, dword ptr [rcx + 4*rdx]
+ xor eax, eax
+ test edi, edi
+ setne al
+ neg eax
+ test edi, edi
+ cmovg eax, esi
+ mov dword ptr [r8 + 4*rdx], eax
+ mov eax, dword ptr [rcx + 4*rdx + 4]
+ xor edi, edi
+ test eax, eax
+ setne dil
+ neg edi
+ test eax, eax
+ cmovg edi, esi
+ mov dword ptr [r8 + 4*rdx + 4], edi
+ add rdx, 2
+ cmp r11, rdx
+ jne .LBB4_899
+ jmp .LBB4_1351
+.LBB4_94:
+ cmp esi, 8
+ jle .LBB4_214
+# %bb.95:
+ cmp esi, 9
+ je .LBB4_327
+# %bb.96:
+ cmp esi, 11
+ je .LBB4_330
+# %bb.97:
+ cmp esi, 12
+ jne .LBB4_1351
+# %bb.98:
+ test r9d, r9d
+ jle .LBB4_1351
+# %bb.99:
+ mov eax, r9d
+ cmp r9d, 16
+ jae .LBB4_483
+# %bb.100:
+ xor edx, edx
+ jmp .LBB4_486
+.LBB4_101:
+ cmp esi, 8
+ jle .LBB4_219
+# %bb.102:
+ cmp esi, 9
+ je .LBB4_333
+# %bb.103:
+ cmp esi, 11
+ je .LBB4_336
+# %bb.104:
+ cmp esi, 12
+ jne .LBB4_1351
+# %bb.105:
+ test r9d, r9d
+ jle .LBB4_1351
+# %bb.106:
+ mov eax, r9d
+ cmp r9d, 16
+ jb .LBB4_107
+# %bb.490:
+ lea rdx, [rcx + 8*rax]
+ cmp rdx, r8
+ jbe .LBB4_900
+# %bb.491:
+ lea rdx, [r8 + 8*rax]
+ cmp rdx, rcx
+ jbe .LBB4_900
+.LBB4_107:
+ xor edx, edx
+.LBB4_903:
+ mov rsi, rdx
+ not rsi
+ test al, 1
+ je .LBB4_905
+# %bb.904:
+ vmovsd xmm0, qword ptr [rcx + 8*rdx] # xmm0 = mem[0],zero
+ vandpd xmm1, xmm0, xmmword ptr [rip + .LCPI4_2]
+ vmovddup xmm2, qword ptr [rip + .LCPI4_1] # xmm2 = [1.0E+0,1.0E+0]
+ # xmm2 = mem[0,0]
+ vorpd xmm1, xmm2, xmm1
+ vxorpd xmm2, xmm2, xmm2
+ vcmpeqsd xmm0, xmm0, xmm2
+ vandnpd xmm0, xmm0, xmm1
+ vmovlpd qword ptr [r8 + 8*rdx], xmm0
+ or rdx, 1
+.LBB4_905:
+ add rsi, rax
+ je .LBB4_1351
+# %bb.906:
+ vmovapd xmm0, xmmword ptr [rip + .LCPI4_2] # xmm0 = [-0.0E+0,-0.0E+0]
+ vmovddup xmm1, qword ptr [rip + .LCPI4_1] # xmm1 = [1.0E+0,1.0E+0]
+ # xmm1 = mem[0,0]
+ vxorpd xmm2, xmm2, xmm2
+.LBB4_907: # =>This Inner Loop Header: Depth=1
+ vmovsd xmm3, qword ptr [rcx + 8*rdx] # xmm3 = mem[0],zero
+ vandpd xmm4, xmm3, xmm0
+ vorpd xmm4, xmm1, xmm4
+ vcmpeqsd xmm3, xmm3, xmm2
+ vandnpd xmm3, xmm3, xmm4
+ vmovlpd qword ptr [r8 + 8*rdx], xmm3
+ vmovsd xmm3, qword ptr [rcx + 8*rdx + 8] # xmm3 = mem[0],zero
+ vandpd xmm4, xmm3, xmm0
+ vorpd xmm4, xmm1, xmm4
+ vcmpeqsd xmm3, xmm3, xmm2
+ vandnpd xmm3, xmm3, xmm4
+ vmovlpd qword ptr [r8 + 8*rdx + 8], xmm3
+ add rdx, 2
+ cmp rax, rdx
+ jne .LBB4_907
+ jmp .LBB4_1351
+.LBB4_108:
+ cmp esi, 8
+ jle .LBB4_224
+# %bb.109:
+ cmp esi, 9
+ je .LBB4_339
+# %bb.110:
+ cmp esi, 11
+ je .LBB4_342
+# %bb.111:
+ cmp esi, 12
+ jne .LBB4_1351
+# %bb.112:
+ test r9d, r9d
+ jle .LBB4_1351
+# %bb.113:
+ mov eax, r9d
+ cmp r9d, 16
+ jb .LBB4_114
+# %bb.493:
+ lea rdx, [rcx + rax]
+ cmp rdx, r8
+ jbe .LBB4_908
+# %bb.494:
+ lea rdx, [r8 + 8*rax]
+ cmp rdx, rcx
+ jbe .LBB4_908
+.LBB4_114:
+ xor edx, edx
+.LBB4_911:
+ mov rsi, rdx
+ not rsi
+ test al, 1
+ je .LBB4_1254
+# %bb.912:
+ cmp byte ptr [rcx + rdx], 0
+ jne .LBB4_1250
+# %bb.913:
+ vpxor xmm0, xmm0, xmm0
+ jmp .LBB4_1251
+.LBB4_115:
+ cmp esi, 8
+ jle .LBB4_229
+# %bb.116:
+ cmp esi, 9
+ je .LBB4_345
+# %bb.117:
+ cmp esi, 11
+ je .LBB4_348
+# %bb.118:
+ cmp esi, 12
+ jne .LBB4_1351
+# %bb.119:
+ test r9d, r9d
+ jle .LBB4_1351
+# %bb.120:
+ mov eax, r9d
+ cmp r9d, 16
+ jae .LBB4_496
+# %bb.121:
+ xor edx, edx
+ jmp .LBB4_499
+.LBB4_122:
+ cmp esi, 8
+ jle .LBB4_234
+# %bb.123:
+ cmp esi, 9
+ je .LBB4_351
+# %bb.124:
+ cmp esi, 11
+ je .LBB4_354
+# %bb.125:
+ cmp esi, 12
+ jne .LBB4_1351
+# %bb.126:
+ test r9d, r9d
+ jle .LBB4_1351
+# %bb.127:
+ mov eax, r9d
+ cmp r9d, 16
+ jae .LBB4_503
+# %bb.128:
+ xor edx, edx
+ jmp .LBB4_506
+.LBB4_129:
+ cmp esi, 8
+ jle .LBB4_239
+# %bb.130:
+ cmp esi, 9
+ je .LBB4_357
+# %bb.131:
+ cmp esi, 11
+ je .LBB4_360
+# %bb.132:
+ cmp esi, 12
+ jne .LBB4_1351
+# %bb.133:
+ test r9d, r9d
+ jle .LBB4_1351
+# %bb.134:
+ mov eax, r9d
+ cmp r9d, 16
+ jae .LBB4_510
+# %bb.135:
+ xor edx, edx
+ jmp .LBB4_513
+.LBB4_136:
+ cmp esi, 8
+ jle .LBB4_244
+# %bb.137:
+ cmp esi, 9
+ je .LBB4_363
+# %bb.138:
+ cmp esi, 11
+ je .LBB4_366
+# %bb.139:
+ cmp esi, 12
+ jne .LBB4_1351
+# %bb.140:
+ test r9d, r9d
+ jle .LBB4_1351
+# %bb.141:
+ mov eax, r9d
+ cmp r9d, 16
+ jae .LBB4_519
+# %bb.142:
+ xor edx, edx
+ jmp .LBB4_522
+.LBB4_143:
+ cmp esi, 8
+ jle .LBB4_249
+# %bb.144:
+ cmp esi, 9
+ je .LBB4_369
+# %bb.145:
+ cmp esi, 11
+ je .LBB4_372
+# %bb.146:
+ cmp esi, 12
+ jne .LBB4_1351
+# %bb.147:
+ test r9d, r9d
+ jle .LBB4_1351
+# %bb.148:
+ mov eax, r9d
+ cmp r9d, 16
+ jae .LBB4_528
+# %bb.149:
+ xor edx, edx
+ jmp .LBB4_531
+.LBB4_150:
+ cmp esi, 8
+ jle .LBB4_257
+# %bb.151:
+ cmp esi, 9
+ je .LBB4_375
+# %bb.152:
+ cmp esi, 11
+ je .LBB4_378
+# %bb.153:
+ cmp esi, 12
+ jne .LBB4_1351
+# %bb.154:
+ test r9d, r9d
+ jle .LBB4_1351
+# %bb.155:
+ mov eax, r9d
+ cmp r9d, 16
+ jb .LBB4_156
+# %bb.535:
+ lea rdx, [rcx + rax]
+ cmp rdx, r8
+ jbe .LBB4_914
+# %bb.536:
+ lea rdx, [r8 + 8*rax]
+ cmp rdx, rcx
+ jbe .LBB4_914
+.LBB4_156:
+ xor edx, edx
+.LBB4_917:
+ mov rsi, rdx
+ not rsi
+ add rsi, rax
+ mov rdi, rax
+ and rdi, 3
+ je .LBB4_922
+# %bb.918:
+ vmovq xmm0, qword ptr [rip + .LCPI4_1] # xmm0 = mem[0],zero
+ jmp .LBB4_920
+.LBB4_919: # in Loop: Header=BB4_920 Depth=1
+ vmovq qword ptr [r8 + 8*rdx], xmm1
+ add rdx, 1
+ add rdi, -1
+ je .LBB4_922
+.LBB4_920: # =>This Inner Loop Header: Depth=1
+ cmp byte ptr [rcx + rdx], 0
+ vmovdqa xmm1, xmm0
+ jne .LBB4_919
+# %bb.921: # in Loop: Header=BB4_920 Depth=1
+ vpxor xmm1, xmm1, xmm1
+ jmp .LBB4_919
+.LBB4_157:
+ cmp esi, 8
+ jle .LBB4_262
+# %bb.158:
+ cmp esi, 9
+ je .LBB4_381
+# %bb.159:
+ cmp esi, 11
+ je .LBB4_384
+# %bb.160:
+ cmp esi, 12
+ jne .LBB4_1351
+# %bb.161:
+ test r9d, r9d
+ jle .LBB4_1351
+# %bb.162:
+ mov eax, r9d
+ cmp r9d, 16
+ jae .LBB4_538
+# %bb.163:
+ xor edx, edx
+ jmp .LBB4_541
+.LBB4_164:
+ cmp esi, 2
+ je .LBB4_387
+# %bb.165:
+ cmp esi, 3
+ jne .LBB4_1351
+# %bb.166:
+ test r9d, r9d
+ jle .LBB4_1351
+# %bb.167:
+ mov eax, r9d
+ cmp r9d, 32
+ jb .LBB4_168
+# %bb.547:
+ lea rdx, [rcx + 4*rax]
+ cmp rdx, r8
+ jbe .LBB4_933
+# %bb.548:
+ lea rdx, [r8 + rax]
+ cmp rdx, rcx
+ jbe .LBB4_933
+.LBB4_168:
+ xor edx, edx
+.LBB4_936:
+ mov rsi, rdx
+ not rsi
+ add rsi, rax
+ mov rdi, rax
+ and rdi, 3
+ je .LBB4_938
+.LBB4_937: # =>This Inner Loop Header: Depth=1
+ cmp dword ptr [rcx + 4*rdx], 0
+ setne byte ptr [r8 + rdx]
+ add rdx, 1
+ add rdi, -1
+ jne .LBB4_937
+.LBB4_938:
+ cmp rsi, 3
+ jb .LBB4_1351
+.LBB4_939: # =>This Inner Loop Header: Depth=1
+ cmp dword ptr [rcx + 4*rdx], 0
+ setne byte ptr [r8 + rdx]
+ cmp dword ptr [rcx + 4*rdx + 4], 0
+ setne byte ptr [r8 + rdx + 1]
+ cmp dword ptr [rcx + 4*rdx + 8], 0
+ setne byte ptr [r8 + rdx + 2]
+ cmp dword ptr [rcx + 4*rdx + 12], 0
+ setne byte ptr [r8 + rdx + 3]
+ add rdx, 4
+ cmp rax, rdx
+ jne .LBB4_939
+ jmp .LBB4_1351
+.LBB4_169:
+ cmp esi, 2
+ je .LBB4_390
+# %bb.170:
+ cmp esi, 3
+ jne .LBB4_1351
+# %bb.171:
+ test r9d, r9d
+ jle .LBB4_1351
+# %bb.172:
+ mov eax, r9d
+ cmp r9d, 16
+ jb .LBB4_173
+# %bb.550:
+ lea rdx, [rcx + 8*rax]
+ cmp rdx, r8
+ jbe .LBB4_940
+# %bb.551:
+ lea rdx, [r8 + rax]
+ cmp rdx, rcx
+ jbe .LBB4_940
+.LBB4_173:
+ xor edx, edx
+.LBB4_943:
+ mov rsi, rdx
+ not rsi
+ test al, 1
+ je .LBB4_945
+# %bb.944:
+ vmovsd xmm0, qword ptr [rcx + 8*rdx] # xmm0 = mem[0],zero
+ xor edi, edi
+ vpxor xmm1, xmm1, xmm1
+ vucomisd xmm1, xmm0
+ vandpd xmm0, xmm0, xmmword ptr [rip + .LCPI4_2]
+ vmovddup xmm1, qword ptr [rip + .LCPI4_1] # xmm1 = [1.0E+0,1.0E+0]
+ # xmm1 = mem[0,0]
+ vorpd xmm0, xmm1, xmm0
+ vcvttsd2si ebx, xmm0
+ cmove ebx, edi
+ mov byte ptr [r8 + rdx], bl
+ or rdx, 1
+.LBB4_945:
+ add rsi, rax
+ je .LBB4_1351
+# %bb.946:
+ xor esi, esi
+ vxorpd xmm0, xmm0, xmm0
+ vmovapd xmm1, xmmword ptr [rip + .LCPI4_2] # xmm1 = [-0.0E+0,-0.0E+0]
+ vmovddup xmm2, qword ptr [rip + .LCPI4_1] # xmm2 = [1.0E+0,1.0E+0]
+ # xmm2 = mem[0,0]
+.LBB4_947: # =>This Inner Loop Header: Depth=1
+ vmovsd xmm3, qword ptr [rcx + 8*rdx] # xmm3 = mem[0],zero
+ vucomisd xmm0, xmm3
+ vandpd xmm3, xmm3, xmm1
+ vorpd xmm3, xmm2, xmm3
+ vcvttsd2si edi, xmm3
+ cmove edi, esi
+ mov byte ptr [r8 + rdx], dil
+ vmovsd xmm3, qword ptr [rcx + 8*rdx + 8] # xmm3 = mem[0],zero
+ vucomisd xmm0, xmm3
+ vandpd xmm3, xmm3, xmm1
+ vorpd xmm3, xmm2, xmm3
+ vcvttsd2si edi, xmm3
+ cmove edi, esi
+ mov byte ptr [r8 + rdx + 1], dil
+ add rdx, 2
+ cmp rax, rdx
+ jne .LBB4_947
+ jmp .LBB4_1351
+.LBB4_174:
+ cmp esi, 2
+ je .LBB4_393
+# %bb.175:
+ cmp esi, 3
+ jne .LBB4_1351
+# %bb.176:
+ test r9d, r9d
+ jle .LBB4_1351
+# %bb.177:
+ mov r10d, r9d
+ cmp r9d, 128
+ jb .LBB4_178
+# %bb.553:
+ lea rdx, [rcx + r10]
+ cmp rdx, r8
+ jbe .LBB4_948
+# %bb.554:
+ lea rdx, [r8 + r10]
+ cmp rdx, rcx
+ jbe .LBB4_948
+.LBB4_178:
+ xor r11d, r11d
+.LBB4_951:
+ mov rsi, r11
+ not rsi
+ test r10b, 1
+ je .LBB4_953
+# %bb.952:
+ mov dil, byte ptr [rcx + r11]
+ test dil, dil
+ setne r9b
+ neg r9b
+ test dil, dil
+ movzx r9d, r9b
+ mov edi, 1
+ cmovle edi, r9d
+ mov byte ptr [r8 + r11], dil
+ or r11, 1
+.LBB4_953:
+ add rsi, r10
+ je .LBB4_1351
+# %bb.954:
+ mov esi, 1
+.LBB4_955: # =>This Inner Loop Header: Depth=1
+ movzx eax, byte ptr [rcx + r11]
+ test al, al
+ setne dl
+ neg dl
+ test al, al
+ movzx eax, dl
+ cmovg eax, esi
+ mov byte ptr [r8 + r11], al
+ movzx eax, byte ptr [rcx + r11 + 1]
+ test al, al
+ setne dl
+ neg dl
+ test al, al
+ movzx eax, dl
+ cmovg eax, esi
+ mov byte ptr [r8 + r11 + 1], al
+ add r11, 2
+ cmp r10, r11
+ jne .LBB4_955
+ jmp .LBB4_1351
+.LBB4_179:
+ cmp esi, 2
+ je .LBB4_396
+# %bb.180:
+ cmp esi, 3
+ jne .LBB4_1351
+# %bb.181:
+ test r9d, r9d
+ jle .LBB4_1351
+# %bb.182:
+ mov eax, r9d
+ cmp r9d, 16
+ jb .LBB4_183
+# %bb.556:
+ lea rdx, [rcx + 8*rax]
+ cmp rdx, r8
+ jbe .LBB4_956
+# %bb.557:
+ lea rdx, [r8 + rax]
+ cmp rdx, rcx
+ jbe .LBB4_956
+.LBB4_183:
+ xor edx, edx
+.LBB4_959:
+ mov rsi, rdx
+ not rsi
+ add rsi, rax
+ mov rdi, rax
+ and rdi, 3
+ je .LBB4_961
+.LBB4_960: # =>This Inner Loop Header: Depth=1
+ cmp qword ptr [rcx + 8*rdx], 0
+ setne byte ptr [r8 + rdx]
+ add rdx, 1
+ add rdi, -1
+ jne .LBB4_960
+.LBB4_961:
+ cmp rsi, 3
+ jb .LBB4_1351
+.LBB4_962: # =>This Inner Loop Header: Depth=1
+ cmp qword ptr [rcx + 8*rdx], 0
+ setne byte ptr [r8 + rdx]
+ cmp qword ptr [rcx + 8*rdx + 8], 0
+ setne byte ptr [r8 + rdx + 1]
+ cmp qword ptr [rcx + 8*rdx + 16], 0
+ setne byte ptr [r8 + rdx + 2]
+ cmp qword ptr [rcx + 8*rdx + 24], 0
+ setne byte ptr [r8 + rdx + 3]
+ add rdx, 4
+ cmp rax, rdx
+ jne .LBB4_962
+ jmp .LBB4_1351
+.LBB4_184:
+ cmp esi, 2
+ je .LBB4_399
+# %bb.185:
+ cmp esi, 3
+ jne .LBB4_1351
+# %bb.186:
+ test r9d, r9d
+ jle .LBB4_1351
+# %bb.187:
+ mov eax, r9d
+ cmp r9d, 64
+ jb .LBB4_188
+# %bb.559:
+ lea rdx, [rcx + 2*rax]
+ cmp rdx, r8
+ jbe .LBB4_963
+# %bb.560:
+ lea rdx, [r8 + rax]
+ cmp rdx, rcx
+ jbe .LBB4_963
+.LBB4_188:
+ xor edx, edx
+.LBB4_966:
+ mov rsi, rdx
+ not rsi
+ add rsi, rax
+ mov rdi, rax
+ and rdi, 3
+ je .LBB4_968
+.LBB4_967: # =>This Inner Loop Header: Depth=1
+ cmp word ptr [rcx + 2*rdx], 0
+ setne byte ptr [r8 + rdx]
+ add rdx, 1
+ add rdi, -1
+ jne .LBB4_967
+.LBB4_968:
+ cmp rsi, 3
+ jb .LBB4_1351
+.LBB4_969: # =>This Inner Loop Header: Depth=1
+ cmp word ptr [rcx + 2*rdx], 0
+ setne byte ptr [r8 + rdx]
+ cmp word ptr [rcx + 2*rdx + 2], 0
+ setne byte ptr [r8 + rdx + 1]
+ cmp word ptr [rcx + 2*rdx + 4], 0
+ setne byte ptr [r8 + rdx + 2]
+ cmp word ptr [rcx + 2*rdx + 6], 0
+ setne byte ptr [r8 + rdx + 3]
+ add rdx, 4
+ cmp rax, rdx
+ jne .LBB4_969
+ jmp .LBB4_1351
+.LBB4_189:
+ cmp esi, 2
+ je .LBB4_402
+# %bb.190:
+ cmp esi, 3
+ jne .LBB4_1351
+# %bb.191:
+ test r9d, r9d
+ jle .LBB4_1351
+# %bb.192:
+ mov r10d, r9d
+ cmp r9d, 64
+ jb .LBB4_193
+# %bb.562:
+ lea rdx, [rcx + 2*r10]
+ cmp rdx, r8
+ jbe .LBB4_970
+# %bb.563:
+ lea rdx, [r8 + r10]
+ cmp rdx, rcx
+ jbe .LBB4_970
+.LBB4_193:
+ xor r11d, r11d
+.LBB4_973:
+ mov rsi, r11
+ not rsi
+ test r10b, 1
+ je .LBB4_975
+# %bb.974:
+ movzx edi, word ptr [rcx + 2*r11]
+ test di, di
+ setne r9b
+ neg r9b
+ test di, di
+ movzx r9d, r9b
+ mov edi, 1
+ cmovle edi, r9d
+ mov byte ptr [r8 + r11], dil
+ or r11, 1
+.LBB4_975:
+ add rsi, r10
+ je .LBB4_1351
+# %bb.976:
+ mov esi, 1
+.LBB4_977: # =>This Inner Loop Header: Depth=1
+ movzx edi, word ptr [rcx + 2*r11]
+ test di, di
+ setne al
+ neg al
+ test di, di
+ movzx eax, al
+ cmovg eax, esi
+ mov byte ptr [r8 + r11], al
+ movzx eax, word ptr [rcx + 2*r11 + 2]
+ test ax, ax
+ setne dl
+ neg dl
+ test ax, ax
+ movzx eax, dl
+ cmovg eax, esi
+ mov byte ptr [r8 + r11 + 1], al
+ add r11, 2
+ cmp r10, r11
+ jne .LBB4_977
+ jmp .LBB4_1351
+.LBB4_194:
+ cmp esi, 2
+ je .LBB4_405
+# %bb.195:
+ cmp esi, 3
+ jne .LBB4_1351
+# %bb.196:
+ test r9d, r9d
+ jle .LBB4_1351
+# %bb.197:
+ mov r10d, r9d
+ cmp r9d, 16
+ jb .LBB4_198
+# %bb.565:
+ lea rdx, [rcx + 8*r10]
+ cmp rdx, r8
+ jbe .LBB4_978
+# %bb.566:
+ lea rdx, [r8 + r10]
+ cmp rdx, rcx
+ jbe .LBB4_978
+.LBB4_198:
+ xor r11d, r11d
+.LBB4_981:
+ mov rsi, r11
+ not rsi
+ test r10b, 1
+ je .LBB4_983
+# %bb.982:
+ mov rdi, qword ptr [rcx + 8*r11]
+ test rdi, rdi
+ setne r9b
+ neg r9b
+ test rdi, rdi
+ movzx r9d, r9b
+ mov edi, 1
+ cmovle edi, r9d
+ mov byte ptr [r8 + r11], dil
+ or r11, 1
+.LBB4_983:
+ add rsi, r10
+ je .LBB4_1351
+# %bb.984:
+ mov esi, 1
+.LBB4_985: # =>This Inner Loop Header: Depth=1
+ mov rdi, qword ptr [rcx + 8*r11]
+ test rdi, rdi
+ setne al
+ neg al
+ test rdi, rdi
+ movzx eax, al
+ cmovg eax, esi
+ mov byte ptr [r8 + r11], al
+ mov rax, qword ptr [rcx + 8*r11 + 8]
+ test rax, rax
+ setne dl
+ neg dl
+ test rax, rax
+ movzx eax, dl
+ cmovg eax, esi
+ mov byte ptr [r8 + r11 + 1], al
+ add r11, 2
+ cmp r10, r11
+ jne .LBB4_985
+ jmp .LBB4_1351
+.LBB4_199:
+ cmp esi, 2
+ je .LBB4_408
+# %bb.200:
+ cmp esi, 3
+ jne .LBB4_1351
+# %bb.201:
+ test r9d, r9d
+ jle .LBB4_1351
+# %bb.202:
+ mov r10d, r9d
+ cmp r9d, 32
+ jb .LBB4_203
+# %bb.568:
+ lea rdx, [rcx + 4*r10]
+ cmp rdx, r8
+ jbe .LBB4_986
+# %bb.569:
+ lea rdx, [r8 + r10]
+ cmp rdx, rcx
+ jbe .LBB4_986
+.LBB4_203:
+ xor edx, edx
+.LBB4_989:
+ mov rsi, rdx
+ not rsi
+ test r10b, 1
+ je .LBB4_991
+# %bb.990:
+ vmovd xmm0, dword ptr [rcx + 4*rdx] # xmm0 = mem[0],zero,zero,zero
+ vmovd edi, xmm0
+ test edi, edi
+ setns dil
+ add dil, dil
+ add dil, -1
+ xor r9d, r9d
+ vpxor xmm1, xmm1, xmm1
+ vucomiss xmm1, xmm0
+ movzx edi, dil
+ cmove edi, r9d
+ mov byte ptr [r8 + rdx], dil
+ or rdx, 1
+.LBB4_991:
+ add rsi, r10
+ je .LBB4_1351
+# %bb.992:
+ xor esi, esi
+ vxorps xmm0, xmm0, xmm0
+.LBB4_993: # =>This Inner Loop Header: Depth=1
+ vmovd xmm1, dword ptr [rcx + 4*rdx] # xmm1 = mem[0],zero,zero,zero
+ vmovd edi, xmm1
+ test edi, edi
+ setns al
+ add al, al
+ add al, -1
+ vucomiss xmm0, xmm1
+ movzx eax, al
+ cmove eax, esi
+ mov byte ptr [r8 + rdx], al
+ vmovd xmm1, dword ptr [rcx + 4*rdx + 4] # xmm1 = mem[0],zero,zero,zero
+ vmovd eax, xmm1
+ test eax, eax
+ setns al
+ add al, al
+ add al, -1
+ vucomiss xmm0, xmm1
+ movzx eax, al
+ cmove eax, esi
+ mov byte ptr [r8 + rdx + 1], al
+ add rdx, 2
+ cmp r10, rdx
+ jne .LBB4_993
+ jmp .LBB4_1351
+.LBB4_204:
+ cmp esi, 2
+ je .LBB4_411
+# %bb.205:
+ cmp esi, 3
+ jne .LBB4_1351
+# %bb.206:
+ test r9d, r9d
+ jle .LBB4_1351
+# %bb.207:
+ mov eax, r9d
+ cmp r9d, 128
+ jb .LBB4_208
+# %bb.571:
+ lea rdx, [rcx + rax]
+ cmp rdx, r8
+ jbe .LBB4_994
+# %bb.572:
+ lea rdx, [r8 + rax]
+ cmp rdx, rcx
+ jbe .LBB4_994
+.LBB4_208:
+ xor edx, edx
+.LBB4_997:
+ mov rsi, rdx
+ not rsi
+ add rsi, rax
+ mov rdi, rax
+ and rdi, 3
+ je .LBB4_999
+.LBB4_998: # =>This Inner Loop Header: Depth=1
+ cmp byte ptr [rcx + rdx], 0
+ setne byte ptr [r8 + rdx]
+ add rdx, 1
+ add rdi, -1
+ jne .LBB4_998
+.LBB4_999:
+ cmp rsi, 3
+ jb .LBB4_1351
+.LBB4_1000: # =>This Inner Loop Header: Depth=1
+ cmp byte ptr [rcx + rdx], 0
+ setne byte ptr [r8 + rdx]
+ cmp byte ptr [rcx + rdx + 1], 0
+ setne byte ptr [r8 + rdx + 1]
+ cmp byte ptr [rcx + rdx + 2], 0
+ setne byte ptr [r8 + rdx + 2]
+ cmp byte ptr [rcx + rdx + 3], 0
+ setne byte ptr [r8 + rdx + 3]
+ add rdx, 4
+ cmp rax, rdx
+ jne .LBB4_1000
+ jmp .LBB4_1351
+.LBB4_209:
+ cmp esi, 2
+ je .LBB4_414
+# %bb.210:
+ cmp esi, 3
+ jne .LBB4_1351
+# %bb.211:
+ test r9d, r9d
+ jle .LBB4_1351
+# %bb.212:
+ mov r10d, r9d
+ cmp r9d, 32
+ jb .LBB4_213
+# %bb.574:
+ lea rdx, [rcx + 4*r10]
+ cmp rdx, r8
+ jbe .LBB4_1001
+# %bb.575:
+ lea rdx, [r8 + r10]
+ cmp rdx, rcx
+ jbe .LBB4_1001
+.LBB4_213:
+ xor r11d, r11d
+.LBB4_1004:
+ mov rsi, r11
+ not rsi
+ test r10b, 1
+ je .LBB4_1006
+# %bb.1005:
+ mov edi, dword ptr [rcx + 4*r11]
+ test edi, edi
+ setne r9b
+ neg r9b
+ test edi, edi
+ movzx r9d, r9b
+ mov edi, 1
+ cmovle edi, r9d
+ mov byte ptr [r8 + r11], dil
+ or r11, 1
+.LBB4_1006:
+ add rsi, r10
+ je .LBB4_1351
+# %bb.1007:
+ mov esi, 1
+.LBB4_1008: # =>This Inner Loop Header: Depth=1
+ mov edi, dword ptr [rcx + 4*r11]
+ test edi, edi
+ setne al
+ neg al
+ test edi, edi
+ movzx eax, al
+ cmovg eax, esi
+ mov byte ptr [r8 + r11], al
+ mov eax, dword ptr [rcx + 4*r11 + 4]
+ test eax, eax
+ setne dl
+ neg dl
+ test eax, eax
+ movzx eax, dl
+ cmovg eax, esi
+ mov byte ptr [r8 + r11 + 1], al
+ add r11, 2
+ cmp r10, r11
+ jne .LBB4_1008
+ jmp .LBB4_1351
+.LBB4_214:
+ cmp esi, 7
+ je .LBB4_417
+# %bb.215:
+ cmp esi, 8
+ jne .LBB4_1351
+# %bb.216:
+ test r9d, r9d
+ jle .LBB4_1351
+# %bb.217:
+ mov eax, r9d
+ cmp r9d, 16
+ jae .LBB4_577
+# %bb.218:
+ xor edx, edx
+ jmp .LBB4_580
+.LBB4_219:
+ cmp esi, 7
+ je .LBB4_420
+# %bb.220:
+ cmp esi, 8
+ jne .LBB4_1351
+# %bb.221:
+ test r9d, r9d
+ jle .LBB4_1351
+# %bb.222:
+ mov r10d, r9d
+ movabs r11, -9223372036854775808
+ cmp r9d, 4
+ jae .LBB4_581
+# %bb.223:
+ xor esi, esi
+ jmp .LBB4_1286
+.LBB4_224:
+ cmp esi, 7
+ je .LBB4_423
+# %bb.225:
+ cmp esi, 8
+ jne .LBB4_1351
+# %bb.226:
+ test r9d, r9d
+ jle .LBB4_1351
+# %bb.227:
+ mov r11d, r9d
+ cmp r9d, 16
+ jb .LBB4_228
+# %bb.584:
+ lea rdx, [rcx + r11]
+ cmp rdx, r8
+ jbe .LBB4_1009
+# %bb.585:
+ lea rdx, [r8 + 8*r11]
+ cmp rdx, rcx
+ jbe .LBB4_1009
+.LBB4_228:
+ xor edx, edx
+.LBB4_1012:
+ mov rsi, rdx
+ not rsi
+ test r11b, 1
+ je .LBB4_1014
+# %bb.1013:
+ mov r9b, byte ptr [rcx + rdx]
+ xor r10d, r10d
+ test r9b, r9b
+ setne r10b
+ neg r10
+ test r9b, r9b
+ mov edi, 1
+ cmovle rdi, r10
+ mov qword ptr [r8 + 8*rdx], rdi
+ or rdx, 1
+.LBB4_1014:
+ add rsi, r11
+ je .LBB4_1351
+# %bb.1015:
+ mov esi, 1
+.LBB4_1016: # =>This Inner Loop Header: Depth=1
+ movzx eax, byte ptr [rcx + rdx]
+ xor edi, edi
+ test al, al
+ setne dil
+ neg rdi
+ test al, al
+ cmovg rdi, rsi
+ mov qword ptr [r8 + 8*rdx], rdi
+ movzx eax, byte ptr [rcx + rdx + 1]
+ xor edi, edi
+ test al, al
+ setne dil
+ neg rdi
+ test al, al
+ cmovg rdi, rsi
+ mov qword ptr [r8 + 8*rdx + 8], rdi
+ add rdx, 2
+ cmp r11, rdx
+ jne .LBB4_1016
+ jmp .LBB4_1351
+.LBB4_229:
+ cmp esi, 7
+ je .LBB4_426
+# %bb.230:
+ cmp esi, 8
+ jne .LBB4_1351
+# %bb.231:
+ test r9d, r9d
+ jle .LBB4_1351
+# %bb.232:
+ mov eax, r9d
+ cmp r9d, 16
+ jb .LBB4_233
+# %bb.587:
+ lea rdx, [rcx + 8*rax]
+ cmp rdx, r8
+ jbe .LBB4_1017
+# %bb.588:
+ lea rdx, [r8 + 8*rax]
+ cmp rdx, rcx
+ jbe .LBB4_1017
+.LBB4_233:
+ xor edx, edx
+.LBB4_1020:
+ mov r9, rdx
+ not r9
+ add r9, rax
+ mov rdi, rax
+ and rdi, 3
+ je .LBB4_1022
+.LBB4_1021: # =>This Inner Loop Header: Depth=1
+ xor esi, esi
+ cmp qword ptr [rcx + 8*rdx], 0
+ setne sil
+ mov qword ptr [r8 + 8*rdx], rsi
+ add rdx, 1
+ add rdi, -1
+ jne .LBB4_1021
+.LBB4_1022:
+ cmp r9, 3
+ jb .LBB4_1351
+.LBB4_1023: # =>This Inner Loop Header: Depth=1
+ xor esi, esi
+ cmp qword ptr [rcx + 8*rdx], 0
+ setne sil
+ mov qword ptr [r8 + 8*rdx], rsi
+ xor esi, esi
+ cmp qword ptr [rcx + 8*rdx + 8], 0
+ setne sil
+ mov qword ptr [r8 + 8*rdx + 8], rsi
+ xor esi, esi
+ cmp qword ptr [rcx + 8*rdx + 16], 0
+ setne sil
+ mov qword ptr [r8 + 8*rdx + 16], rsi
+ xor esi, esi
+ cmp qword ptr [rcx + 8*rdx + 24], 0
+ setne sil
+ mov qword ptr [r8 + 8*rdx + 24], rsi
+ add rdx, 4
+ cmp rax, rdx
+ jne .LBB4_1023
+ jmp .LBB4_1351
+.LBB4_234:
+ cmp esi, 7
+ je .LBB4_429
+# %bb.235:
+ cmp esi, 8
+ jne .LBB4_1351
+# %bb.236:
+ test r9d, r9d
+ jle .LBB4_1351
+# %bb.237:
+ mov eax, r9d
+ cmp r9d, 16
+ jae .LBB4_590
+# %bb.238:
+ xor edx, edx
+ jmp .LBB4_593
+.LBB4_239:
+ cmp esi, 7
+ je .LBB4_432
+# %bb.240:
+ cmp esi, 8
+ jne .LBB4_1351
+# %bb.241:
+ test r9d, r9d
+ jle .LBB4_1351
+# %bb.242:
+ mov r10d, r9d
+ cmp r9d, 16
+ jae .LBB4_594
+# %bb.243:
+ xor edx, edx
+ jmp .LBB4_597
+.LBB4_244:
+ cmp esi, 7
+ je .LBB4_435
+# %bb.245:
+ cmp esi, 8
+ jne .LBB4_1351
+# %bb.246:
+ test r9d, r9d
+ jle .LBB4_1351
+# %bb.247:
+ mov r11d, r9d
+ cmp r9d, 16
+ jb .LBB4_248
+# %bb.599:
+ lea rdx, [rcx + 8*r11]
+ cmp rdx, r8
+ jbe .LBB4_1024
+# %bb.600:
+ lea rdx, [r8 + 8*r11]
+ cmp rdx, rcx
+ jbe .LBB4_1024
+.LBB4_248:
+ xor edx, edx
+.LBB4_1027:
+ mov rsi, rdx
+ not rsi
+ test r11b, 1
+ je .LBB4_1029
+# %bb.1028:
+ mov r9, qword ptr [rcx + 8*rdx]
+ xor r10d, r10d
+ test r9, r9
+ setne r10b
+ neg r10
+ test r9, r9
+ mov edi, 1
+ cmovle rdi, r10
+ mov qword ptr [r8 + 8*rdx], rdi
+ or rdx, 1
+.LBB4_1029:
+ add rsi, r11
+ je .LBB4_1351
+# %bb.1030:
+ mov esi, 1
+.LBB4_1031: # =>This Inner Loop Header: Depth=1
+ mov rdi, qword ptr [rcx + 8*rdx]
+ xor eax, eax
+ test rdi, rdi
+ setne al
+ neg rax
+ test rdi, rdi
+ cmovg rax, rsi
+ mov qword ptr [r8 + 8*rdx], rax
+ mov rax, qword ptr [rcx + 8*rdx + 8]
+ xor edi, edi
+ test rax, rax
+ setne dil
+ neg rdi
+ test rax, rax
+ cmovg rdi, rsi
+ mov qword ptr [r8 + 8*rdx + 8], rdi
+ add rdx, 2
+ cmp r11, rdx
+ jne .LBB4_1031
+ jmp .LBB4_1351
+.LBB4_249:
+ cmp esi, 7
+ je .LBB4_438
+# %bb.250:
+ cmp esi, 8
+ jne .LBB4_1351
+# %bb.251:
+ test r9d, r9d
+ jle .LBB4_1351
+# %bb.252:
+ mov r10d, r9d
+ cmp r9d, 1
+ jne .LBB4_602
+# %bb.253:
+ xor eax, eax
+ jmp .LBB4_254
+.LBB4_257:
+ cmp esi, 7
+ je .LBB4_441
+# %bb.258:
+ cmp esi, 8
+ jne .LBB4_1351
+# %bb.259:
+ test r9d, r9d
+ jle .LBB4_1351
+# %bb.260:
+ mov eax, r9d
+ cmp r9d, 16
+ jb .LBB4_261
+# %bb.610:
+ lea rdx, [rcx + rax]
+ cmp rdx, r8
+ jbe .LBB4_1032
+# %bb.611:
+ lea rdx, [r8 + 8*rax]
+ cmp rdx, rcx
+ jbe .LBB4_1032
+.LBB4_261:
+ xor edx, edx
+.LBB4_1035:
+ mov r9, rdx
+ not r9
+ add r9, rax
+ mov rdi, rax
+ and rdi, 3
+ je .LBB4_1037
+.LBB4_1036: # =>This Inner Loop Header: Depth=1
+ xor esi, esi
+ cmp byte ptr [rcx + rdx], 0
+ setne sil
+ mov qword ptr [r8 + 8*rdx], rsi
+ add rdx, 1
+ add rdi, -1
+ jne .LBB4_1036
+.LBB4_1037:
+ cmp r9, 3
+ jb .LBB4_1351
+.LBB4_1038: # =>This Inner Loop Header: Depth=1
+ xor esi, esi
+ cmp byte ptr [rcx + rdx], 0
+ setne sil
+ mov qword ptr [r8 + 8*rdx], rsi
+ xor esi, esi
+ cmp byte ptr [rcx + rdx + 1], 0
+ setne sil
+ mov qword ptr [r8 + 8*rdx + 8], rsi
+ xor esi, esi
+ cmp byte ptr [rcx + rdx + 2], 0
+ setne sil
+ mov qword ptr [r8 + 8*rdx + 16], rsi
+ xor esi, esi
+ cmp byte ptr [rcx + rdx + 3], 0
+ setne sil
+ mov qword ptr [r8 + 8*rdx + 24], rsi
+ add rdx, 4
+ cmp rax, rdx
+ jne .LBB4_1038
+ jmp .LBB4_1351
+.LBB4_262:
+ cmp esi, 7
+ je .LBB4_444
+# %bb.263:
+ cmp esi, 8
+ jne .LBB4_1351
+# %bb.264:
+ test r9d, r9d
+ jle .LBB4_1351
+# %bb.265:
+ mov r10d, r9d
+ cmp r9d, 16
+ jae .LBB4_613
+# %bb.266:
+ xor edx, edx
+ jmp .LBB4_616
+.LBB4_267:
+ test r9d, r9d
+ jle .LBB4_1351
+# %bb.268:
+ mov eax, r9d
+ cmp r9d, 32
+ jae .LBB4_618
+# %bb.269:
+ xor edx, edx
+ jmp .LBB4_621
+.LBB4_270:
+ test r9d, r9d
+ jle .LBB4_1351
+# %bb.271:
+ mov eax, r9d
+ cmp r9d, 32
+ jae .LBB4_622
+# %bb.272:
+ xor edx, edx
+ jmp .LBB4_625
+.LBB4_273:
+ test r9d, r9d
+ jle .LBB4_1351
+# %bb.274:
+ mov eax, r9d
+ xor edx, edx
+ cmp r9d, 16
+ jae .LBB4_626
+# %bb.275:
+ xor esi, esi
+ jmp .LBB4_629
+.LBB4_276:
+ test r9d, r9d
+ jle .LBB4_1351
+# %bb.277:
+ mov eax, r9d
+ xor edx, edx
+ cmp r9d, 16
+ jae .LBB4_631
+# %bb.278:
+ xor esi, esi
+ jmp .LBB4_634
+.LBB4_279:
+ test r9d, r9d
+ jle .LBB4_1351
+# %bb.280:
+ mov r11d, r9d
+ cmp r9d, 64
+ jb .LBB4_281
+# %bb.636:
+ lea rdx, [rcx + r11]
+ cmp rdx, r8
+ jbe .LBB4_1039
+# %bb.637:
+ lea rdx, [r8 + 2*r11]
+ cmp rdx, rcx
+ jbe .LBB4_1039
+.LBB4_281:
+ xor edx, edx
+.LBB4_1042:
+ mov rsi, rdx
+ not rsi
+ test r11b, 1
+ je .LBB4_1044
+# %bb.1043:
+ mov r9b, byte ptr [rcx + rdx]
+ xor r10d, r10d
+ test r9b, r9b
+ setne r10b
+ neg r10d
+ test r9b, r9b
+ mov edi, 1
+ cmovle edi, r10d
+ mov word ptr [r8 + 2*rdx], di
+ or rdx, 1
+.LBB4_1044:
+ add rsi, r11
+ je .LBB4_1351
+# %bb.1045:
+ mov esi, 1
+.LBB4_1046: # =>This Inner Loop Header: Depth=1
+ movzx eax, byte ptr [rcx + rdx]
+ xor edi, edi
+ test al, al
+ setne dil
+ neg edi
+ test al, al
+ cmovg edi, esi
+ mov word ptr [r8 + 2*rdx], di
+ movzx eax, byte ptr [rcx + rdx + 1]
+ xor edi, edi
+ test al, al
+ setne dil
+ neg edi
+ test al, al
+ cmovg edi, esi
+ mov word ptr [r8 + 2*rdx + 2], di
+ add rdx, 2
+ cmp r11, rdx
+ jne .LBB4_1046
+ jmp .LBB4_1351
+.LBB4_282:
+ test r9d, r9d
+ jle .LBB4_1351
+# %bb.283:
+ mov r11d, r9d
+ cmp r9d, 64
+ jb .LBB4_284
+# %bb.639:
+ lea rdx, [rcx + r11]
+ cmp rdx, r8
+ jbe .LBB4_1047
+# %bb.640:
+ lea rdx, [r8 + 2*r11]
+ cmp rdx, rcx
+ jbe .LBB4_1047
+.LBB4_284:
+ xor edx, edx
+.LBB4_1050:
+ mov rsi, rdx
+ not rsi
+ test r11b, 1
+ je .LBB4_1052
+# %bb.1051:
+ mov r9b, byte ptr [rcx + rdx]
+ xor r10d, r10d
+ test r9b, r9b
+ setne r10b
+ neg r10d
+ test r9b, r9b
+ mov edi, 1
+ cmovle edi, r10d
+ mov word ptr [r8 + 2*rdx], di
+ or rdx, 1
+.LBB4_1052:
+ add rsi, r11
+ je .LBB4_1351
+# %bb.1053:
+ mov esi, 1
+.LBB4_1054: # =>This Inner Loop Header: Depth=1
+ movzx eax, byte ptr [rcx + rdx]
+ xor edi, edi
+ test al, al
+ setne dil
+ neg edi
+ test al, al
+ cmovg edi, esi
+ mov word ptr [r8 + 2*rdx], di
+ movzx eax, byte ptr [rcx + rdx + 1]
+ xor edi, edi
+ test al, al
+ setne dil
+ neg edi
+ test al, al
+ cmovg edi, esi
+ mov word ptr [r8 + 2*rdx + 2], di
+ add rdx, 2
+ cmp r11, rdx
+ jne .LBB4_1054
+ jmp .LBB4_1351
+.LBB4_285:
+ test r9d, r9d
+ jle .LBB4_1351
+# %bb.286:
+ mov eax, r9d
+ cmp r9d, 16
+ jae .LBB4_642
+# %bb.287:
+ xor edx, edx
+ jmp .LBB4_645
+.LBB4_288:
+ test r9d, r9d
+ jle .LBB4_1351
+# %bb.289:
+ mov eax, r9d
+ cmp r9d, 16
+ jae .LBB4_646
+# %bb.290:
+ xor edx, edx
+ jmp .LBB4_649
+.LBB4_291:
+ test r9d, r9d
+ jle .LBB4_1351
+# %bb.292:
+ mov eax, r9d
+ cmp r9d, 32
+ jb .LBB4_293
+# %bb.650:
+ lea rdx, [rcx + 2*rax]
+ cmp rdx, r8
+ jbe .LBB4_1055
+# %bb.651:
+ lea rdx, [r8 + 2*rax]
+ cmp rdx, rcx
+ jbe .LBB4_1055
+.LBB4_293:
+ xor edx, edx
+.LBB4_1321:
+ mov r9, rdx
+ not r9
+ add r9, rax
+ mov rdi, rax
+ and rdi, 3
+ je .LBB4_1323
+.LBB4_1322: # =>This Inner Loop Header: Depth=1
+ xor esi, esi
+ cmp word ptr [rcx + 2*rdx], 0
+ setne sil
+ mov word ptr [r8 + 2*rdx], si
+ add rdx, 1
+ add rdi, -1
+ jne .LBB4_1322
+.LBB4_1323:
+ cmp r9, 3
+ jb .LBB4_1351
+.LBB4_1324: # =>This Inner Loop Header: Depth=1
+ xor esi, esi
+ cmp word ptr [rcx + 2*rdx], 0
+ setne sil
+ mov word ptr [r8 + 2*rdx], si
+ xor esi, esi
+ cmp word ptr [rcx + 2*rdx + 2], 0
+ setne sil
+ mov word ptr [r8 + 2*rdx + 2], si
+ xor esi, esi
+ cmp word ptr [rcx + 2*rdx + 4], 0
+ setne sil
+ mov word ptr [r8 + 2*rdx + 4], si
+ xor esi, esi
+ cmp word ptr [rcx + 2*rdx + 6], 0
+ setne sil
+ mov word ptr [r8 + 2*rdx + 6], si
+ add rdx, 4
+ cmp rax, rdx
+ jne .LBB4_1324
+ jmp .LBB4_1351
+.LBB4_294:
+ test r9d, r9d
+ jle .LBB4_1351
+# %bb.295:
+ mov eax, r9d
+ cmp r9d, 32
+ jb .LBB4_296
+# %bb.653:
+ lea rdx, [rcx + 2*rax]
+ cmp rdx, r8
+ jbe .LBB4_1058
+# %bb.654:
+ lea rdx, [r8 + 2*rax]
+ cmp rdx, rcx
+ jbe .LBB4_1058
+.LBB4_296:
+ xor edx, edx
+.LBB4_1329:
+ mov r9, rdx
+ not r9
+ add r9, rax
+ mov rdi, rax
+ and rdi, 3
+ je .LBB4_1331
+.LBB4_1330: # =>This Inner Loop Header: Depth=1
+ xor esi, esi
+ cmp word ptr [rcx + 2*rdx], 0
+ setne sil
+ mov word ptr [r8 + 2*rdx], si
+ add rdx, 1
+ add rdi, -1
+ jne .LBB4_1330
+.LBB4_1331:
+ cmp r9, 3
+ jb .LBB4_1351
+.LBB4_1332: # =>This Inner Loop Header: Depth=1
+ xor esi, esi
+ cmp word ptr [rcx + 2*rdx], 0
+ setne sil
+ mov word ptr [r8 + 2*rdx], si
+ xor esi, esi
+ cmp word ptr [rcx + 2*rdx + 2], 0
+ setne sil
+ mov word ptr [r8 + 2*rdx + 2], si
+ xor esi, esi
+ cmp word ptr [rcx + 2*rdx + 4], 0
+ setne sil
+ mov word ptr [r8 + 2*rdx + 4], si
+ xor esi, esi
+ cmp word ptr [rcx + 2*rdx + 6], 0
+ setne sil
+ mov word ptr [r8 + 2*rdx + 6], si
+ add rdx, 4
+ cmp rax, rdx
+ jne .LBB4_1332
+ jmp .LBB4_1351
+.LBB4_297:
+ test r9d, r9d
+ jle .LBB4_1351
+# %bb.298:
+ mov r11d, r9d
+ cmp r9d, 32
+ jb .LBB4_299
+# %bb.656:
+ lea rdx, [rcx + 2*r11]
+ cmp rdx, r8
+ jbe .LBB4_1061
+# %bb.657:
+ lea rdx, [r8 + 2*r11]
+ cmp rdx, rcx
+ jbe .LBB4_1061
+.LBB4_299:
+ xor edx, edx
+.LBB4_1337:
+ mov rsi, rdx
+ not rsi
+ test r11b, 1
+ je .LBB4_1339
+# %bb.1338:
+ movzx r9d, word ptr [rcx + 2*rdx]
+ xor r10d, r10d
+ test r9w, r9w
+ setne r10b
+ neg r10d
+ test r9w, r9w
+ mov edi, 1
+ cmovle edi, r10d
+ mov word ptr [r8 + 2*rdx], di
+ or rdx, 1
+.LBB4_1339:
+ add rsi, r11
+ je .LBB4_1351
+# %bb.1340:
+ mov esi, 1
+.LBB4_1341: # =>This Inner Loop Header: Depth=1
+ movzx edi, word ptr [rcx + 2*rdx]
+ xor eax, eax
+ test di, di
+ setne al
+ neg eax
+ test di, di
+ cmovg eax, esi
+ mov word ptr [r8 + 2*rdx], ax
+ movzx eax, word ptr [rcx + 2*rdx + 2]
+ xor edi, edi
+ test ax, ax
+ setne dil
+ neg edi
+ test ax, ax
+ cmovg edi, esi
+ mov word ptr [r8 + 2*rdx + 2], di
+ add rdx, 2
+ cmp r11, rdx
+ jne .LBB4_1341
+ jmp .LBB4_1351
+.LBB4_300:
+ test r9d, r9d
+ jle .LBB4_1351
+# %bb.301:
+ mov r11d, r9d
+ cmp r9d, 32
+ jb .LBB4_302
+# %bb.659:
+ lea rdx, [rcx + 2*r11]
+ cmp rdx, r8
+ jbe .LBB4_1064
+# %bb.660:
+ lea rdx, [r8 + 2*r11]
+ cmp rdx, rcx
+ jbe .LBB4_1064
+.LBB4_302:
+ xor edx, edx
+.LBB4_1346:
+ mov rsi, rdx
+ not rsi
+ test r11b, 1
+ je .LBB4_1348
+# %bb.1347:
+ movzx r9d, word ptr [rcx + 2*rdx]
+ xor r10d, r10d
+ test r9w, r9w
+ setne r10b
+ neg r10d
+ test r9w, r9w
+ mov edi, 1
+ cmovle edi, r10d
+ mov word ptr [r8 + 2*rdx], di
+ or rdx, 1
+.LBB4_1348:
+ add rsi, r11
+ je .LBB4_1351
+# %bb.1349:
+ mov esi, 1
+.LBB4_1350: # =>This Inner Loop Header: Depth=1
+ movzx edi, word ptr [rcx + 2*rdx]
+ xor eax, eax
+ test di, di
+ setne al
+ neg eax
+ test di, di
+ cmovg eax, esi
+ mov word ptr [r8 + 2*rdx], ax
+ movzx eax, word ptr [rcx + 2*rdx + 2]
+ xor edi, edi
+ test ax, ax
+ setne dil
+ neg edi
+ test ax, ax
+ cmovg edi, esi
+ mov word ptr [r8 + 2*rdx + 2], di
+ add rdx, 2
+ cmp r11, rdx
+ jne .LBB4_1350
+ jmp .LBB4_1351
+.LBB4_303:
+ test r9d, r9d
+ jle .LBB4_1351
+# %bb.304:
+ mov r10d, r9d
+ cmp r9d, 16
+ jae .LBB4_662
+# %bb.305:
+ xor edx, edx
+ jmp .LBB4_665
+.LBB4_306:
+ test r9d, r9d
+ jle .LBB4_1351
+# %bb.307:
+ mov r10d, r9d
+ cmp r9d, 16
+ jae .LBB4_667
+# %bb.308:
+ xor edx, edx
+ jmp .LBB4_670
+.LBB4_309:
+ test r9d, r9d
+ jle .LBB4_1351
+# %bb.310:
+ mov eax, r9d
+ xor r10d, r10d
+ cmp r9d, 32
+ jae .LBB4_672
+# %bb.311:
+ xor esi, esi
+ jmp .LBB4_675
+.LBB4_312:
+ test r9d, r9d
+ jle .LBB4_1351
+# %bb.313:
+ mov eax, r9d
+ xor r10d, r10d
+ cmp r9d, 32
+ jae .LBB4_677
+# %bb.314:
+ xor esi, esi
+ jmp .LBB4_680
+.LBB4_315:
+ test r9d, r9d
+ jle .LBB4_1351
+# %bb.316:
+ mov eax, r9d
+ cmp r9d, 64
+ jb .LBB4_317
+# %bb.682:
+ lea rdx, [rcx + rax]
+ cmp rdx, r8
+ jbe .LBB4_1067
+# %bb.683:
+ lea rdx, [r8 + 2*rax]
+ cmp rdx, rcx
+ jbe .LBB4_1067
+.LBB4_317:
+ xor edx, edx
+.LBB4_1070:
+ mov r9, rdx
+ not r9
+ add r9, rax
+ mov rdi, rax
+ and rdi, 3
+ je .LBB4_1072
+.LBB4_1071: # =>This Inner Loop Header: Depth=1
+ xor esi, esi
+ cmp byte ptr [rcx + rdx], 0
+ setne sil
+ mov word ptr [r8 + 2*rdx], si
+ add rdx, 1
+ add rdi, -1
+ jne .LBB4_1071
+.LBB4_1072:
+ cmp r9, 3
+ jb .LBB4_1351
+.LBB4_1073: # =>This Inner Loop Header: Depth=1
+ xor esi, esi
+ cmp byte ptr [rcx + rdx], 0
+ setne sil
+ mov word ptr [r8 + 2*rdx], si
+ xor esi, esi
+ cmp byte ptr [rcx + rdx + 1], 0
+ setne sil
+ mov word ptr [r8 + 2*rdx + 2], si
+ xor esi, esi
+ cmp byte ptr [rcx + rdx + 2], 0
+ setne sil
+ mov word ptr [r8 + 2*rdx + 4], si
+ xor esi, esi
+ cmp byte ptr [rcx + rdx + 3], 0
+ setne sil
+ mov word ptr [r8 + 2*rdx + 6], si
+ add rdx, 4
+ cmp rax, rdx
+ jne .LBB4_1073
+ jmp .LBB4_1351
+.LBB4_318:
+ test r9d, r9d
+ jle .LBB4_1351
+# %bb.319:
+ mov eax, r9d
+ cmp r9d, 64
+ jb .LBB4_320
+# %bb.685:
+ lea rdx, [rcx + rax]
+ cmp rdx, r8
+ jbe .LBB4_1074
+# %bb.686:
+ lea rdx, [r8 + 2*rax]
+ cmp rdx, rcx
+ jbe .LBB4_1074
+.LBB4_320:
+ xor edx, edx
+.LBB4_1077:
+ mov r9, rdx
+ not r9
+ add r9, rax
+ mov rdi, rax
+ and rdi, 3
+ je .LBB4_1079
+.LBB4_1078: # =>This Inner Loop Header: Depth=1
+ xor esi, esi
+ cmp byte ptr [rcx + rdx], 0
+ setne sil
+ mov word ptr [r8 + 2*rdx], si
+ add rdx, 1
+ add rdi, -1
+ jne .LBB4_1078
+.LBB4_1079:
+ cmp r9, 3
+ jb .LBB4_1351
+.LBB4_1080: # =>This Inner Loop Header: Depth=1
+ xor esi, esi
+ cmp byte ptr [rcx + rdx], 0
+ setne sil
+ mov word ptr [r8 + 2*rdx], si
+ xor esi, esi
+ cmp byte ptr [rcx + rdx + 1], 0
+ setne sil
+ mov word ptr [r8 + 2*rdx + 2], si
+ xor esi, esi
+ cmp byte ptr [rcx + rdx + 2], 0
+ setne sil
+ mov word ptr [r8 + 2*rdx + 4], si
+ xor esi, esi
+ cmp byte ptr [rcx + rdx + 3], 0
+ setne sil
+ mov word ptr [r8 + 2*rdx + 6], si
+ add rdx, 4
+ cmp rax, rdx
+ jne .LBB4_1080
+ jmp .LBB4_1351
+.LBB4_321:
+ test r9d, r9d
+ jle .LBB4_1351
+# %bb.322:
+ mov r10d, r9d
+ cmp r9d, 32
+ jae .LBB4_688
+# %bb.323:
+ xor edx, edx
+ jmp .LBB4_691
+.LBB4_324:
+ test r9d, r9d
+ jle .LBB4_1351
+# %bb.325:
+ mov r10d, r9d
+ cmp r9d, 32
+ jae .LBB4_693
+# %bb.326:
+ xor edx, edx
+ jmp .LBB4_696
+.LBB4_327:
+ test r9d, r9d
+ jle .LBB4_1351
+# %bb.328:
+ mov eax, r9d
+ cmp r9d, 16
+ jae .LBB4_698
+# %bb.329:
+ xor edx, edx
+ jmp .LBB4_701
+.LBB4_330:
+ test r9d, r9d
+ jle .LBB4_1351
+# %bb.331:
+ mov eax, r9d
+ cmp r9d, 32
+ jae .LBB4_702
+# %bb.332:
+ xor edx, edx
+ jmp .LBB4_705
+.LBB4_333:
+ test r9d, r9d
+ jle .LBB4_1351
+# %bb.334:
+ mov eax, r9d
+ cmp r9d, 4
+ jae .LBB4_709
+# %bb.335:
+ xor edx, edx
+ jmp .LBB4_1306
+.LBB4_336:
+ test r9d, r9d
+ jle .LBB4_1351
+# %bb.337:
+ mov eax, r9d
+ cmp r9d, 16
+ jae .LBB4_712
+# %bb.338:
+ xor edx, edx
+ jmp .LBB4_715
+.LBB4_339:
+ test r9d, r9d
+ jle .LBB4_1351
+# %bb.340:
+ mov r11d, r9d
+ cmp r9d, 16
+ jb .LBB4_341
+# %bb.719:
+ lea rdx, [rcx + r11]
+ cmp rdx, r8
+ jbe .LBB4_1081
+# %bb.720:
+ lea rdx, [r8 + 8*r11]
+ cmp rdx, rcx
+ jbe .LBB4_1081
+.LBB4_341:
+ xor edx, edx
+.LBB4_1084:
+ mov rsi, rdx
+ not rsi
+ test r11b, 1
+ je .LBB4_1086
+# %bb.1085:
+ mov r9b, byte ptr [rcx + rdx]
+ xor r10d, r10d
+ test r9b, r9b
+ setne r10b
+ neg r10
+ test r9b, r9b
+ mov edi, 1
+ cmovle rdi, r10
+ mov qword ptr [r8 + 8*rdx], rdi
+ or rdx, 1
+.LBB4_1086:
+ add rsi, r11
+ je .LBB4_1351
+# %bb.1087:
+ mov esi, 1
+.LBB4_1088: # =>This Inner Loop Header: Depth=1
+ movzx eax, byte ptr [rcx + rdx]
+ xor edi, edi
+ test al, al
+ setne dil
+ neg rdi
+ test al, al
+ cmovg rdi, rsi
+ mov qword ptr [r8 + 8*rdx], rdi
+ movzx eax, byte ptr [rcx + rdx + 1]
+ xor edi, edi
+ test al, al
+ setne dil
+ neg rdi
+ test al, al
+ cmovg rdi, rsi
+ mov qword ptr [r8 + 8*rdx + 8], rdi
+ add rdx, 2
+ cmp r11, rdx
+ jne .LBB4_1088
+ jmp .LBB4_1351
+.LBB4_342:
+ test r9d, r9d
+ jle .LBB4_1351
+# %bb.343:
+ mov eax, r9d
+ cmp r9d, 32
+ jb .LBB4_344
+# %bb.722:
+ lea rdx, [rcx + rax]
+ cmp rdx, r8
+ jbe .LBB4_1089
+# %bb.723:
+ lea rdx, [r8 + 4*rax]
+ cmp rdx, rcx
+ jbe .LBB4_1089
+.LBB4_344:
+ xor edx, edx
+.LBB4_1092:
+ mov rsi, rdx
+ not rsi
+ test al, 1
+ je .LBB4_1269
+# %bb.1093:
+ cmp byte ptr [rcx + rdx], 0
+ jne .LBB4_1265
+# %bb.1094:
+ vpxor xmm0, xmm0, xmm0
+ jmp .LBB4_1266
+.LBB4_345:
+ test r9d, r9d
+ jle .LBB4_1351
+# %bb.346:
+ mov eax, r9d
+ cmp r9d, 16
+ jb .LBB4_347
+# %bb.725:
+ lea rdx, [rcx + 8*rax]
+ cmp rdx, r8
+ jbe .LBB4_1095
+# %bb.726:
+ lea rdx, [r8 + 8*rax]
+ cmp rdx, rcx
+ jbe .LBB4_1095
+.LBB4_347:
+ xor edx, edx
+.LBB4_1098:
+ mov r9, rdx
+ not r9
+ add r9, rax
+ mov rdi, rax
+ and rdi, 3
+ je .LBB4_1100
+.LBB4_1099: # =>This Inner Loop Header: Depth=1
+ xor esi, esi
+ cmp qword ptr [rcx + 8*rdx], 0
+ setne sil
+ mov qword ptr [r8 + 8*rdx], rsi
+ add rdx, 1
+ add rdi, -1
+ jne .LBB4_1099
+.LBB4_1100:
+ cmp r9, 3
+ jb .LBB4_1351
+.LBB4_1101: # =>This Inner Loop Header: Depth=1
+ xor esi, esi
+ cmp qword ptr [rcx + 8*rdx], 0
+ setne sil
+ mov qword ptr [r8 + 8*rdx], rsi
+ xor esi, esi
+ cmp qword ptr [rcx + 8*rdx + 8], 0
+ setne sil
+ mov qword ptr [r8 + 8*rdx + 8], rsi
+ xor esi, esi
+ cmp qword ptr [rcx + 8*rdx + 16], 0
+ setne sil
+ mov qword ptr [r8 + 8*rdx + 16], rsi
+ xor esi, esi
+ cmp qword ptr [rcx + 8*rdx + 24], 0
+ setne sil
+ mov qword ptr [r8 + 8*rdx + 24], rsi
+ add rdx, 4
+ cmp rax, rdx
+ jne .LBB4_1101
+ jmp .LBB4_1351
+.LBB4_348:
+ test r9d, r9d
+ jle .LBB4_1351
+# %bb.349:
+ mov eax, r9d
+ cmp r9d, 16
+ jae .LBB4_728
+# %bb.350:
+ xor edx, edx
+ jmp .LBB4_731
+.LBB4_351:
+ test r9d, r9d
+ jle .LBB4_1351
+# %bb.352:
+ mov eax, r9d
+ cmp r9d, 16
+ jae .LBB4_735
+# %bb.353:
+ xor edx, edx
+ jmp .LBB4_738
+.LBB4_354:
+ test r9d, r9d
+ jle .LBB4_1351
+# %bb.355:
+ mov eax, r9d
+ cmp r9d, 32
+ jae .LBB4_739
+# %bb.356:
+ xor edx, edx
+ jmp .LBB4_742
+.LBB4_357:
+ test r9d, r9d
+ jle .LBB4_1351
+# %bb.358:
+ mov r10d, r9d
+ cmp r9d, 16
+ jae .LBB4_746
+# %bb.359:
+ xor edx, edx
+ jmp .LBB4_749
+.LBB4_360:
+ test r9d, r9d
+ jle .LBB4_1351
+# %bb.361:
+ mov eax, r9d
+ cmp r9d, 32
+ jae .LBB4_751
+# %bb.362:
+ xor edx, edx
+ jmp .LBB4_754
+.LBB4_363:
+ test r9d, r9d
+ jle .LBB4_1351
+# %bb.364:
+ mov r11d, r9d
+ cmp r9d, 16
+ jb .LBB4_365
+# %bb.760:
+ lea rdx, [rcx + 8*r11]
+ cmp rdx, r8
+ jbe .LBB4_1102
+# %bb.761:
+ lea rdx, [r8 + 8*r11]
+ cmp rdx, rcx
+ jbe .LBB4_1102
+.LBB4_365:
+ xor edx, edx
+.LBB4_1105:
+ mov rsi, rdx
+ not rsi
+ test r11b, 1
+ je .LBB4_1107
+# %bb.1106:
+ mov r9, qword ptr [rcx + 8*rdx]
+ xor r10d, r10d
+ test r9, r9
+ setne r10b
+ neg r10
+ test r9, r9
+ mov edi, 1
+ cmovle rdi, r10
+ mov qword ptr [r8 + 8*rdx], rdi
+ or rdx, 1
+.LBB4_1107:
+ add rsi, r11
+ je .LBB4_1351
+# %bb.1108:
+ mov esi, 1
+.LBB4_1109: # =>This Inner Loop Header: Depth=1
+ mov rdi, qword ptr [rcx + 8*rdx]
+ xor eax, eax
+ test rdi, rdi
+ setne al
+ neg rax
+ test rdi, rdi
+ cmovg rax, rsi
+ mov qword ptr [r8 + 8*rdx], rax
+ mov rax, qword ptr [rcx + 8*rdx + 8]
+ xor edi, edi
+ test rax, rax
+ setne dil
+ neg rdi
+ test rax, rax
+ cmovg rdi, rsi
+ mov qword ptr [r8 + 8*rdx + 8], rdi
+ add rdx, 2
+ cmp r11, rdx
+ jne .LBB4_1109
+ jmp .LBB4_1351
+.LBB4_366:
+ test r9d, r9d
+ jle .LBB4_1351
+# %bb.367:
+ mov eax, r9d
+ cmp r9d, 16
+ jae .LBB4_763
+# %bb.368:
+ xor edx, edx
+ jmp .LBB4_766
+.LBB4_369:
+ test r9d, r9d
+ jle .LBB4_1351
+# %bb.370:
+ mov r10d, r9d
+ cmp r9d, 4
+ jae .LBB4_772
+# %bb.371:
+ xor edx, edx
+ jmp .LBB4_1312
+.LBB4_372:
+ test r9d, r9d
+ jle .LBB4_1351
+# %bb.373:
+ mov eax, r9d
+ cmp r9d, 32
+ jb .LBB4_374
+# %bb.775:
+ lea rdx, [rcx + 4*rax]
+ cmp rdx, r8
+ jbe .LBB4_1110
+# %bb.776:
+ lea rdx, [r8 + 4*rax]
+ cmp rdx, rcx
+ jbe .LBB4_1110
+.LBB4_374:
+ xor edx, edx
+.LBB4_1113:
+ mov rsi, rdx
+ not rsi
+ test al, 1
+ je .LBB4_1115
+# %bb.1114:
+ vmovss xmm0, dword ptr [rcx + 4*rdx] # xmm0 = mem[0],zero,zero,zero
+ vmovmskps edi, xmm0
+ and edi, 1
+ neg edi
+ or edi, 1
+ vcvtsi2ss xmm1, xmm10, edi
+ vxorps xmm2, xmm2, xmm2
+ vcmpeqss xmm0, xmm0, xmm2
+ vandnps xmm0, xmm0, xmm1
+ vmovss dword ptr [r8 + 4*rdx], xmm0
+ or rdx, 1
+.LBB4_1115:
+ add rsi, rax
+ je .LBB4_1351
+# %bb.1116:
+ vxorps xmm0, xmm0, xmm0
+.LBB4_1117: # =>This Inner Loop Header: Depth=1
+ vmovss xmm1, dword ptr [rcx + 4*rdx] # xmm1 = mem[0],zero,zero,zero
+ vmovmskps esi, xmm1
+ and esi, 1
+ neg esi
+ or esi, 1
+ vcvtsi2ss xmm2, xmm10, esi
+ vcmpeqss xmm1, xmm1, xmm0
+ vandnps xmm1, xmm1, xmm2
+ vmovss dword ptr [r8 + 4*rdx], xmm1
+ vmovss xmm1, dword ptr [rcx + 4*rdx + 4] # xmm1 = mem[0],zero,zero,zero
+ vmovmskps esi, xmm1
+ and esi, 1
+ neg esi
+ or esi, 1
+ vcvtsi2ss xmm2, xmm10, esi
+ vcmpeqss xmm1, xmm1, xmm0
+ vandnps xmm1, xmm1, xmm2
+ vmovss dword ptr [r8 + 4*rdx + 4], xmm1
+ add rdx, 2
+ cmp rax, rdx
+ jne .LBB4_1117
+ jmp .LBB4_1351
+.LBB4_375:
+ test r9d, r9d
+ jle .LBB4_1351
+# %bb.376:
+ mov eax, r9d
+ cmp r9d, 16
+ jb .LBB4_377
+# %bb.778:
+ lea rdx, [rcx + rax]
+ cmp rdx, r8
+ jbe .LBB4_1118
+# %bb.779:
+ lea rdx, [r8 + 8*rax]
+ cmp rdx, rcx
+ jbe .LBB4_1118
+.LBB4_377:
+ xor edx, edx
+.LBB4_1121:
+ mov r9, rdx
+ not r9
+ add r9, rax
+ mov rdi, rax
+ and rdi, 3
+ je .LBB4_1123
+.LBB4_1122: # =>This Inner Loop Header: Depth=1
+ xor esi, esi
+ cmp byte ptr [rcx + rdx], 0
+ setne sil
+ mov qword ptr [r8 + 8*rdx], rsi
+ add rdx, 1
+ add rdi, -1
+ jne .LBB4_1122
+.LBB4_1123:
+ cmp r9, 3
+ jb .LBB4_1351
+.LBB4_1124: # =>This Inner Loop Header: Depth=1
+ xor esi, esi
+ cmp byte ptr [rcx + rdx], 0
+ setne sil
+ mov qword ptr [r8 + 8*rdx], rsi
+ xor esi, esi
+ cmp byte ptr [rcx + rdx + 1], 0
+ setne sil
+ mov qword ptr [r8 + 8*rdx + 8], rsi
+ xor esi, esi
+ cmp byte ptr [rcx + rdx + 2], 0
+ setne sil
+ mov qword ptr [r8 + 8*rdx + 16], rsi
+ xor esi, esi
+ cmp byte ptr [rcx + rdx + 3], 0
+ setne sil
+ mov qword ptr [r8 + 8*rdx + 24], rsi
+ add rdx, 4
+ cmp rax, rdx
+ jne .LBB4_1124
+ jmp .LBB4_1351
+.LBB4_378:
+ test r9d, r9d
+ jle .LBB4_1351
+# %bb.379:
+ mov eax, r9d
+ cmp r9d, 32
+ jb .LBB4_380
+# %bb.781:
+ lea rdx, [rcx + rax]
+ cmp rdx, r8
+ jbe .LBB4_1125
+# %bb.782:
+ lea rdx, [r8 + 4*rax]
+ cmp rdx, rcx
+ jbe .LBB4_1125
+.LBB4_380:
+ xor edx, edx
+.LBB4_1128:
+ mov rsi, rdx
+ not rsi
+ add rsi, rax
+ mov rdi, rax
+ and rdi, 3
+ je .LBB4_1133
+# %bb.1129:
+ vmovd xmm0, dword ptr [rip + .LCPI4_5] # xmm0 = mem[0],zero,zero,zero
+ jmp .LBB4_1131
+.LBB4_1130: # in Loop: Header=BB4_1131 Depth=1
+ vmovd dword ptr [r8 + 4*rdx], xmm1
+ add rdx, 1
+ add rdi, -1
+ je .LBB4_1133
+.LBB4_1131: # =>This Inner Loop Header: Depth=1
+ cmp byte ptr [rcx + rdx], 0
+ vmovdqa xmm1, xmm0
+ jne .LBB4_1130
+# %bb.1132: # in Loop: Header=BB4_1131 Depth=1
+ vpxor xmm1, xmm1, xmm1
+ jmp .LBB4_1130
+.LBB4_381:
+ test r9d, r9d
+ jle .LBB4_1351
+# %bb.382:
+ mov r10d, r9d
+ cmp r9d, 16
+ jae .LBB4_784
+# %bb.383:
+ xor edx, edx
+ jmp .LBB4_787
+.LBB4_384:
+ test r9d, r9d
+ jle .LBB4_1351
+# %bb.385:
+ mov eax, r9d
+ cmp r9d, 32
+ jae .LBB4_789
+# %bb.386:
+ xor edx, edx
+ jmp .LBB4_792
+.LBB4_387:
+ test r9d, r9d
+ jle .LBB4_1351
+# %bb.388:
+ mov eax, r9d
+ cmp r9d, 32
+ jb .LBB4_389
+# %bb.798:
+ lea rdx, [rcx + 4*rax]
+ cmp rdx, r8
+ jbe .LBB4_1144
+# %bb.799:
+ lea rdx, [r8 + rax]
+ cmp rdx, rcx
+ jbe .LBB4_1144
+.LBB4_389:
+ xor edx, edx
+.LBB4_1147:
+ mov rsi, rdx
+ not rsi
+ add rsi, rax
+ mov rdi, rax
+ and rdi, 3
+ je .LBB4_1149
+.LBB4_1148: # =>This Inner Loop Header: Depth=1
+ cmp dword ptr [rcx + 4*rdx], 0
+ setne byte ptr [r8 + rdx]
+ add rdx, 1
+ add rdi, -1
+ jne .LBB4_1148
+.LBB4_1149:
+ cmp rsi, 3
+ jb .LBB4_1351
+.LBB4_1150: # =>This Inner Loop Header: Depth=1
+ cmp dword ptr [rcx + 4*rdx], 0
+ setne byte ptr [r8 + rdx]
+ cmp dword ptr [rcx + 4*rdx + 4], 0
+ setne byte ptr [r8 + rdx + 1]
+ cmp dword ptr [rcx + 4*rdx + 8], 0
+ setne byte ptr [r8 + rdx + 2]
+ cmp dword ptr [rcx + 4*rdx + 12], 0
+ setne byte ptr [r8 + rdx + 3]
+ add rdx, 4
+ cmp rax, rdx
+ jne .LBB4_1150
+ jmp .LBB4_1351
+.LBB4_390:
+ test r9d, r9d
+ jle .LBB4_1351
+# %bb.391:
+ mov eax, r9d
+ cmp r9d, 16
+ jb .LBB4_392
+# %bb.801:
+ lea rdx, [rcx + 8*rax]
+ cmp rdx, r8
+ jbe .LBB4_1151
+# %bb.802:
+ lea rdx, [r8 + rax]
+ cmp rdx, rcx
+ jbe .LBB4_1151
+.LBB4_392:
+ xor edx, edx
+.LBB4_1154:
+ mov rsi, rdx
+ not rsi
+ test al, 1
+ je .LBB4_1156
+# %bb.1155:
+ vmovsd xmm0, qword ptr [rcx + 8*rdx] # xmm0 = mem[0],zero
+ xor edi, edi
+ vpxor xmm1, xmm1, xmm1
+ vucomisd xmm1, xmm0
+ vandpd xmm0, xmm0, xmmword ptr [rip + .LCPI4_2]
+ vmovddup xmm1, qword ptr [rip + .LCPI4_1] # xmm1 = [1.0E+0,1.0E+0]
+ # xmm1 = mem[0,0]
+ vorpd xmm0, xmm1, xmm0
+ vcvttsd2si ebx, xmm0
+ cmove ebx, edi
+ mov byte ptr [r8 + rdx], bl
+ or rdx, 1
+.LBB4_1156:
+ add rsi, rax
+ je .LBB4_1351
+# %bb.1157:
+ xor esi, esi
+ vxorpd xmm0, xmm0, xmm0
+ vmovapd xmm1, xmmword ptr [rip + .LCPI4_2] # xmm1 = [-0.0E+0,-0.0E+0]
+ vmovddup xmm2, qword ptr [rip + .LCPI4_1] # xmm2 = [1.0E+0,1.0E+0]
+ # xmm2 = mem[0,0]
+.LBB4_1158: # =>This Inner Loop Header: Depth=1
+ vmovsd xmm3, qword ptr [rcx + 8*rdx] # xmm3 = mem[0],zero
+ vucomisd xmm0, xmm3
+ vandpd xmm3, xmm3, xmm1
+ vorpd xmm3, xmm2, xmm3
+ vcvttsd2si edi, xmm3
+ cmove edi, esi
+ mov byte ptr [r8 + rdx], dil
+ vmovsd xmm3, qword ptr [rcx + 8*rdx + 8] # xmm3 = mem[0],zero
+ vucomisd xmm0, xmm3
+ vandpd xmm3, xmm3, xmm1
+ vorpd xmm3, xmm2, xmm3
+ vcvttsd2si edi, xmm3
+ cmove edi, esi
+ mov byte ptr [r8 + rdx + 1], dil
+ add rdx, 2
+ cmp rax, rdx
+ jne .LBB4_1158
+ jmp .LBB4_1351
+.LBB4_393:
+ test r9d, r9d
+ jle .LBB4_1351
+# %bb.394:
+ mov r10d, r9d
+ cmp r9d, 128
+ jb .LBB4_395
+# %bb.804:
+ lea rdx, [rcx + r10]
+ cmp rdx, r8
+ jbe .LBB4_1159
+# %bb.805:
+ lea rdx, [r8 + r10]
+ cmp rdx, rcx
+ jbe .LBB4_1159
+.LBB4_395:
+ xor r11d, r11d
+.LBB4_1162:
+ mov rsi, r11
+ not rsi
+ test r10b, 1
+ je .LBB4_1164
+# %bb.1163:
+ mov dil, byte ptr [rcx + r11]
+ test dil, dil
+ setne r9b
+ neg r9b
+ test dil, dil
+ movzx r9d, r9b
+ mov edi, 1
+ cmovle edi, r9d
+ mov byte ptr [r8 + r11], dil
+ or r11, 1
+.LBB4_1164:
+ add rsi, r10
+ je .LBB4_1351
+# %bb.1165:
+ mov esi, 1
+.LBB4_1166: # =>This Inner Loop Header: Depth=1
+ movzx eax, byte ptr [rcx + r11]
+ test al, al
+ setne dl
+ neg dl
+ test al, al
+ movzx eax, dl
+ cmovg eax, esi
+ mov byte ptr [r8 + r11], al
+ movzx eax, byte ptr [rcx + r11 + 1]
+ test al, al
+ setne dl
+ neg dl
+ test al, al
+ movzx eax, dl
+ cmovg eax, esi
+ mov byte ptr [r8 + r11 + 1], al
+ add r11, 2
+ cmp r10, r11
+ jne .LBB4_1166
+ jmp .LBB4_1351
+.LBB4_396:
+ test r9d, r9d
+ jle .LBB4_1351
+# %bb.397:
+ mov eax, r9d
+ cmp r9d, 16
+ jb .LBB4_398
+# %bb.807:
+ lea rdx, [rcx + 8*rax]
+ cmp rdx, r8
+ jbe .LBB4_1167
+# %bb.808:
+ lea rdx, [r8 + rax]
+ cmp rdx, rcx
+ jbe .LBB4_1167
+.LBB4_398:
+ xor edx, edx
+.LBB4_1170:
+ mov rsi, rdx
+ not rsi
+ add rsi, rax
+ mov rdi, rax
+ and rdi, 3
+ je .LBB4_1172
+.LBB4_1171: # =>This Inner Loop Header: Depth=1
+ cmp qword ptr [rcx + 8*rdx], 0
+ setne byte ptr [r8 + rdx]
+ add rdx, 1
+ add rdi, -1
+ jne .LBB4_1171
+.LBB4_1172:
+ cmp rsi, 3
+ jb .LBB4_1351
+.LBB4_1173: # =>This Inner Loop Header: Depth=1
+ cmp qword ptr [rcx + 8*rdx], 0
+ setne byte ptr [r8 + rdx]
+ cmp qword ptr [rcx + 8*rdx + 8], 0
+ setne byte ptr [r8 + rdx + 1]
+ cmp qword ptr [rcx + 8*rdx + 16], 0
+ setne byte ptr [r8 + rdx + 2]
+ cmp qword ptr [rcx + 8*rdx + 24], 0
+ setne byte ptr [r8 + rdx + 3]
+ add rdx, 4
+ cmp rax, rdx
+ jne .LBB4_1173
+ jmp .LBB4_1351
+.LBB4_399:
+ test r9d, r9d
+ jle .LBB4_1351
+# %bb.400:
+ mov eax, r9d
+ cmp r9d, 64
+ jb .LBB4_401
+# %bb.810:
+ lea rdx, [rcx + 2*rax]
+ cmp rdx, r8
+ jbe .LBB4_1174
+# %bb.811:
+ lea rdx, [r8 + rax]
+ cmp rdx, rcx
+ jbe .LBB4_1174
+.LBB4_401:
+ xor edx, edx
+.LBB4_1177:
+ mov rsi, rdx
+ not rsi
+ add rsi, rax
+ mov rdi, rax
+ and rdi, 3
+ je .LBB4_1179
+.LBB4_1178: # =>This Inner Loop Header: Depth=1
+ cmp word ptr [rcx + 2*rdx], 0
+ setne byte ptr [r8 + rdx]
+ add rdx, 1
+ add rdi, -1
+ jne .LBB4_1178
+.LBB4_1179:
+ cmp rsi, 3
+ jb .LBB4_1351
+.LBB4_1180: # =>This Inner Loop Header: Depth=1
+ cmp word ptr [rcx + 2*rdx], 0
+ setne byte ptr [r8 + rdx]
+ cmp word ptr [rcx + 2*rdx + 2], 0
+ setne byte ptr [r8 + rdx + 1]
+ cmp word ptr [rcx + 2*rdx + 4], 0
+ setne byte ptr [r8 + rdx + 2]
+ cmp word ptr [rcx + 2*rdx + 6], 0
+ setne byte ptr [r8 + rdx + 3]
+ add rdx, 4
+ cmp rax, rdx
+ jne .LBB4_1180
+ jmp .LBB4_1351
+.LBB4_402:
+ test r9d, r9d
+ jle .LBB4_1351
+# %bb.403:
+ mov r10d, r9d
+ cmp r9d, 64
+ jb .LBB4_404
+# %bb.813:
+ lea rdx, [rcx + 2*r10]
+ cmp rdx, r8
+ jbe .LBB4_1181
+# %bb.814:
+ lea rdx, [r8 + r10]
+ cmp rdx, rcx
+ jbe .LBB4_1181
+.LBB4_404:
+ xor r11d, r11d
+.LBB4_1184:
+ mov rsi, r11
+ not rsi
+ test r10b, 1
+ je .LBB4_1186
+# %bb.1185:
+ movzx edi, word ptr [rcx + 2*r11]
+ test di, di
+ setne r9b
+ neg r9b
+ test di, di
+ movzx r9d, r9b
+ mov edi, 1
+ cmovle edi, r9d
+ mov byte ptr [r8 + r11], dil
+ or r11, 1
+.LBB4_1186:
+ add rsi, r10
+ je .LBB4_1351
+# %bb.1187:
+ mov esi, 1
+.LBB4_1188: # =>This Inner Loop Header: Depth=1
+ movzx edi, word ptr [rcx + 2*r11]
+ test di, di
+ setne al
+ neg al
+ test di, di
+ movzx eax, al
+ cmovg eax, esi
+ mov byte ptr [r8 + r11], al
+ movzx eax, word ptr [rcx + 2*r11 + 2]
+ test ax, ax
+ setne dl
+ neg dl
+ test ax, ax
+ movzx eax, dl
+ cmovg eax, esi
+ mov byte ptr [r8 + r11 + 1], al
+ add r11, 2
+ cmp r10, r11
+ jne .LBB4_1188
+ jmp .LBB4_1351
+.LBB4_405:
+ test r9d, r9d
+ jle .LBB4_1351
+# %bb.406:
+ mov r10d, r9d
+ cmp r9d, 16
+ jb .LBB4_407
+# %bb.816:
+ lea rdx, [rcx + 8*r10]
+ cmp rdx, r8
+ jbe .LBB4_1189
+# %bb.817:
+ lea rdx, [r8 + r10]
+ cmp rdx, rcx
+ jbe .LBB4_1189
+.LBB4_407:
+ xor r11d, r11d
+.LBB4_1192:
+ mov rsi, r11
+ not rsi
+ test r10b, 1
+ je .LBB4_1194
+# %bb.1193:
+ mov rdi, qword ptr [rcx + 8*r11]
+ test rdi, rdi
+ setne r9b
+ neg r9b
+ test rdi, rdi
+ movzx r9d, r9b
+ mov edi, 1
+ cmovle edi, r9d
+ mov byte ptr [r8 + r11], dil
+ or r11, 1
+.LBB4_1194:
+ add rsi, r10
+ je .LBB4_1351
+# %bb.1195:
+ mov esi, 1
+.LBB4_1196: # =>This Inner Loop Header: Depth=1
+ mov rdi, qword ptr [rcx + 8*r11]
+ test rdi, rdi
+ setne al
+ neg al
+ test rdi, rdi
+ movzx eax, al
+ cmovg eax, esi
+ mov byte ptr [r8 + r11], al
+ mov rax, qword ptr [rcx + 8*r11 + 8]
+ test rax, rax
+ setne dl
+ neg dl
+ test rax, rax
+ movzx eax, dl
+ cmovg eax, esi
+ mov byte ptr [r8 + r11 + 1], al
+ add r11, 2
+ cmp r10, r11
+ jne .LBB4_1196
+ jmp .LBB4_1351
+.LBB4_408:
+ test r9d, r9d
+ jle .LBB4_1351
+# %bb.409:
+ mov r10d, r9d
+ cmp r9d, 32
+ jb .LBB4_410
+# %bb.819:
+ lea rdx, [rcx + 4*r10]
+ cmp rdx, r8
+ jbe .LBB4_1197
+# %bb.820:
+ lea rdx, [r8 + r10]
+ cmp rdx, rcx
+ jbe .LBB4_1197
+.LBB4_410:
+ xor edx, edx
+.LBB4_1200:
+ mov rsi, rdx
+ not rsi
+ test r10b, 1
+ je .LBB4_1202
+# %bb.1201:
+ vmovd xmm0, dword ptr [rcx + 4*rdx] # xmm0 = mem[0],zero,zero,zero
+ vmovd edi, xmm0
+ test edi, edi
+ setns dil
+ add dil, dil
+ add dil, -1
+ xor r9d, r9d
+ vpxor xmm1, xmm1, xmm1
+ vucomiss xmm1, xmm0
+ movzx edi, dil
+ cmove edi, r9d
+ mov byte ptr [r8 + rdx], dil
+ or rdx, 1
+.LBB4_1202:
+ add rsi, r10
+ je .LBB4_1351
+# %bb.1203:
+ xor esi, esi
+ vxorps xmm0, xmm0, xmm0
+.LBB4_1204: # =>This Inner Loop Header: Depth=1
+ vmovd xmm1, dword ptr [rcx + 4*rdx] # xmm1 = mem[0],zero,zero,zero
+ vmovd edi, xmm1
+ test edi, edi
+ setns al
+ add al, al
+ add al, -1
+ vucomiss xmm0, xmm1
+ movzx eax, al
+ cmove eax, esi
+ mov byte ptr [r8 + rdx], al
+ vmovd xmm1, dword ptr [rcx + 4*rdx + 4] # xmm1 = mem[0],zero,zero,zero
+ vmovd eax, xmm1
+ test eax, eax
+ setns al
+ add al, al
+ add al, -1
+ vucomiss xmm0, xmm1
... 59437 lines suppressed ...