You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ze...@apache.org on 2022/11/21 15:38:05 UTC
[arrow] branch master updated: ARROW-18110: [Go] Scalar Comparisons (#14669)
This is an automated email from the ASF dual-hosted git repository.
zeroshade pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new b9dd41607c ARROW-18110: [Go] Scalar Comparisons (#14669)
b9dd41607c is described below
commit b9dd41607cb7dd7afd50e3ceb99c68e79e7733a0
Author: Matt Topol <zo...@gmail.com>
AuthorDate: Mon Nov 21 10:37:54 2022 -0500
ARROW-18110: [Go] Scalar Comparisons (#14669)
Authored-by: Matt Topol <zo...@gmail.com>
Signed-off-by: Matt Topol <zo...@gmail.com>
---
dev/release/rat_exclude_files.txt | 1 +
go/arrow/compute/arithmetic_test.go | 6 +
go/arrow/compute/datum.go | 33 +-
go/arrow/compute/executor.go | 1 +
go/arrow/compute/internal/exec/utils.go | 83 +-
go/arrow/compute/internal/kernels/Makefile | 17 +-
.../internal/kernels/_lib/scalar_comparison.cc | 241 +
.../kernels/_lib/scalar_comparison_avx2_amd64.s | 67763 +++++++++++++++++++
.../kernels/_lib/scalar_comparison_sse4_amd64.s | 59819 ++++++++++++++++
.../internal/kernels/compareoperator_string.go | 28 +
go/arrow/compute/internal/kernels/helpers.go | 117 +
.../internal/kernels/scalar_comparison_amd64.go | 110 +
.../kernels/scalar_comparison_avx2_amd64.go | 109 +
.../kernels/scalar_comparison_avx2_amd64.s | 67310 ++++++++++++++++++
.../internal/kernels/scalar_comparison_noasm.go | 23 +
.../kernels/scalar_comparison_sse4_amd64.go | 109 +
.../kernels/scalar_comparison_sse4_amd64.s | 58288 ++++++++++++++++
.../compute/internal/kernels/scalar_comparisons.go | 699 +
go/arrow/compute/internal/kernels/types.go | 14 +-
go/arrow/compute/registry.go | 1 +
go/arrow/compute/scalar_compare.go | 135 +
go/arrow/compute/scalar_compare_test.go | 1487 +
go/arrow/compute/utils.go | 82 +
go/arrow/decimal128/decimal128.go | 6 +-
go/arrow/decimal256/decimal256.go | 9 +-
go/arrow/internal/testing/gen/random_array_gen.go | 42 +-
go/arrow/scalar/parse.go | 83 +
go/internal/bitutils/bitmap_generate.go | 2 +
28 files changed, 256590 insertions(+), 28 deletions(-)
diff --git a/dev/release/rat_exclude_files.txt b/dev/release/rat_exclude_files.txt
index 5cce4195c0..e3eb981842 100644
--- a/dev/release/rat_exclude_files.txt
+++ b/dev/release/rat_exclude_files.txt
@@ -143,6 +143,7 @@ go/arrow/unionmode_string.go
go/arrow/compute/go.sum
go/arrow/compute/datumkind_string.go
go/arrow/compute/funckind_string.go
+go/arrow/compute/internal/kernels/compareoperator_string.go
go/arrow/compute/internal/kernels/_lib/vendored/*
go/*.tmpldata
go/*.s
diff --git a/go/arrow/compute/arithmetic_test.go b/go/arrow/compute/arithmetic_test.go
index d57af69e6f..2549f6c904 100644
--- a/go/arrow/compute/arithmetic_test.go
+++ b/go/arrow/compute/arithmetic_test.go
@@ -146,6 +146,12 @@ func (b *BinaryFuncTestSuite) TearDownTest() {
b.mem.AssertSize(b.T(), 0)
}
+func (b *BinaryFuncTestSuite) getArr(dt arrow.DataType, str string) arrow.Array {
+ arr, _, err := array.FromJSON(b.mem, dt, strings.NewReader(str), array.WithUseNumber())
+ b.Require().NoError(err)
+ return arr
+}
+
type Float16BinaryFuncTestSuite struct {
BinaryFuncTestSuite
}
diff --git a/go/arrow/compute/datum.go b/go/arrow/compute/datum.go
index e02d50a98a..f6a46e3ef4 100644
--- a/go/arrow/compute/datum.go
+++ b/go/arrow/compute/datum.go
@@ -239,8 +239,6 @@ func (d *TableDatum) Equals(other Datum) bool {
return false
}
-// CollectionDatum is a slice of Datums
-
// NewDatum will construct the appropriate Datum type based on what is passed in
// as the argument.
//
@@ -258,23 +256,38 @@ func NewDatum(value interface{}) Datum {
return NewDatum(v.data())
case arrow.Array:
v.Data().Retain()
- return &ArrayDatum{v.Data().(*array.Data)}
- case arrow.ArrayData:
+ return &ArrayDatum{v.Data()}
+ case scalar.Releasable:
v.Retain()
+ return NewDatumWithoutOwning(v)
+ case scalar.Scalar:
+ return &ScalarDatum{v}
+ default:
+ return &ScalarDatum{scalar.MakeScalar(value)}
+ }
+}
+
+// NewDatumWithoutOwning is like NewDatum only it does not call Retain on
+// the passed in value (if applicable). This means that the resulting
+// Datum should not have Release called on it and the original value needs
+// to outlive the Datum.
+//
+// Only use this if you know what you're doing. For the most part this is
+// just a convenience function.
+
+func NewDatumWithoutOwning(value interface{}) Datum {
+ switch v := value.(type) {
+ case arrow.Array:
+ return &ArrayDatum{v.Data()}
+ case arrow.ArrayData:
return &ArrayDatum{v}
case *arrow.Chunked:
- v.Retain()
return &ChunkedDatum{v}
case arrow.Record:
- v.Retain()
return &RecordDatum{v}
case arrow.Table:
- v.Retain()
return &TableDatum{v}
case scalar.Scalar:
- if ls, ok := v.(scalar.Releasable); ok {
- ls.Retain()
- }
return &ScalarDatum{v}
default:
return &ScalarDatum{scalar.MakeScalar(value)}
diff --git a/go/arrow/compute/executor.go b/go/arrow/compute/executor.go
index 80adcbd1e9..f6a2661abd 100644
--- a/go/arrow/compute/executor.go
+++ b/go/arrow/compute/executor.go
@@ -613,6 +613,7 @@ func (s *scalarExecutor) executeSpans(data chan<- Datum) (err error) {
output = *s.prepareOutput(int(input.Len))
if err = s.executeSingleSpan(&input, &output); err != nil {
+ output.Release()
return
}
err = s.emitResult(&output, data)
diff --git a/go/arrow/compute/internal/exec/utils.go b/go/arrow/compute/internal/exec/utils.go
index b57cb5990a..61a1854a60 100644
--- a/go/arrow/compute/internal/exec/utils.go
+++ b/go/arrow/compute/internal/exec/utils.go
@@ -25,6 +25,7 @@ import (
"github.com/apache/arrow/go/v11/arrow"
"github.com/apache/arrow/go/v11/arrow/array"
+ "github.com/apache/arrow/go/v11/arrow/bitutil"
"github.com/apache/arrow/go/v11/arrow/decimal128"
"github.com/apache/arrow/go/v11/arrow/decimal256"
"github.com/apache/arrow/go/v11/arrow/float16"
@@ -99,7 +100,7 @@ func GetValues[T FixedWidthTypes](data arrow.ArrayData, i int) []T {
return ret[data.Offset():]
}
-// GetSpanValues returns a properly typed slice bye reinterpreting
+// GetSpanValues returns a properly typed slice by reinterpreting
// the buffer at index i using unsafe.Slice. This will take into account
// the offset of the given ArraySpan.
func GetSpanValues[T FixedWidthTypes](span *ArraySpan, i int) []T {
@@ -158,6 +159,7 @@ func OptionsInit[T any](_ *KernelCtx, args KernelInitArgs) (KernelState, error)
}
var typMap = map[reflect.Type]arrow.DataType{
+ reflect.TypeOf(false): arrow.FixedWidthTypes.Boolean,
reflect.TypeOf(int8(0)): arrow.PrimitiveTypes.Int8,
reflect.TypeOf(int16(0)): arrow.PrimitiveTypes.Int16,
reflect.TypeOf(int32(0)): arrow.PrimitiveTypes.Int32,
@@ -192,13 +194,13 @@ func GetType[T NumericTypes | bool | string]() arrow.Type {
return typMap[reflect.TypeOf(z)].ID()
}
-type arrayBuilder[T NumericTypes] interface {
+type arrayBuilder[T NumericTypes | bool] interface {
array.Builder
Append(T)
AppendValues([]T, []bool)
}
-func ArrayFromSlice[T NumericTypes](mem memory.Allocator, data []T) arrow.Array {
+func ArrayFromSlice[T NumericTypes | bool](mem memory.Allocator, data []T) arrow.Array {
bldr := array.NewBuilder(mem, typMap[reflect.TypeOf(data).Elem()]).(arrayBuilder[T])
defer bldr.Release()
@@ -303,3 +305,78 @@ func (c *ChunkResolver) Resolve(idx int64) (chunk, index int64) {
atomic.StoreInt64(&c.cached, chunk)
return
}
+
+type arrayTypes interface {
+ FixedWidthTypes | TemporalTypes | bool | string | []byte
+}
+
+type ArrayIter[T arrayTypes] interface {
+ Next() T
+}
+
+type BoolIter struct {
+ Rdr *bitutil.BitmapReader
+}
+
+func NewBoolIter(arr *ArraySpan) ArrayIter[bool] {
+ return &BoolIter{
+ Rdr: bitutil.NewBitmapReader(arr.Buffers[1].Buf, int(arr.Offset), int(arr.Len))}
+}
+
+func (b *BoolIter) Next() (out bool) {
+ out = b.Rdr.Set()
+ b.Rdr.Next()
+ return
+}
+
+type PrimitiveIter[T FixedWidthTypes] struct {
+ Values []T
+}
+
+func NewPrimitiveIter[T FixedWidthTypes](arr *ArraySpan) ArrayIter[T] {
+ return &PrimitiveIter[T]{Values: GetSpanValues[T](arr, 1)}
+}
+
+func (p *PrimitiveIter[T]) Next() (v T) {
+ v = p.Values[0]
+ p.Values = p.Values[1:]
+ return
+}
+
+type VarBinaryIter[OffsetT int32 | int64] struct {
+ Offsets []OffsetT
+ Data []byte
+ Pos int64
+}
+
+func NewVarBinaryIter[OffsetT int32 | int64](arr *ArraySpan) ArrayIter[[]byte] {
+ return &VarBinaryIter[OffsetT]{
+ Offsets: GetSpanOffsets[OffsetT](arr, 1),
+ Data: arr.Buffers[2].Buf,
+ }
+}
+
+func (v *VarBinaryIter[OffsetT]) Next() []byte {
+ cur := v.Pos
+ v.Pos++
+ return v.Data[v.Offsets[cur]:v.Offsets[v.Pos]]
+}
+
+type FSBIter struct {
+ Data []byte
+ Width int
+ Pos int64
+}
+
+func NewFSBIter(arr *ArraySpan) ArrayIter[[]byte] {
+ return &FSBIter{
+ Data: arr.Buffers[1].Buf,
+ Width: arr.Type.(arrow.FixedWidthDataType).Bytes(),
+ }
+}
+
+func (f *FSBIter) Next() []byte {
+ start := f.Width * int(f.Pos)
+ f.Pos++
+ return f.Data[start : start+f.Width]
+}
diff --git a/go/arrow/compute/internal/kernels/Makefile b/go/arrow/compute/internal/kernels/Makefile
index 53dda4da43..ac00bd837c 100644
--- a/go/arrow/compute/internal/kernels/Makefile
+++ b/go/arrow/compute/internal/kernels/Makefile
@@ -20,7 +20,7 @@ PERL_FIXUP_ROTATE=perl -i -pe 's/(ro[rl]\s+\w{2,3})$$/\1, 1/'
C2GOASM=c2goasm
CC=clang-11
CXX=clang++-11
-C_FLAGS=-target x86_64-unknown-none -masm=intel -mno-red-zone -mstackrealign -mllvm -inline-threshold=1000 \
+C_FLAGS=-target x86_64-unknown-none -masm=intel -mno-red-zone -mstackrealign -mllvm -inline-threshold=5000 \
-fno-asynchronous-unwind-tables -fno-exceptions -fno-rtti -O3 -fno-builtin -ffast-math -fno-jump-tables -I_lib -I../../../../internal/utils/_lib
ASM_FLAGS_AVX2=-mavx2 -mfma
ASM_FLAGS_SSE4=-msse4
@@ -37,7 +37,8 @@ ALL_SOURCES := $(shell find . -path ./_lib -prune -o -name '*.go' -name '*.s' -n
INTEL_SOURCES := \
cast_numeric_avx2_amd64.s cast_numeric_sse4_amd64.s constant_factor_avx2_amd64.s \
- constant_factor_sse4_amd64.s base_arithmetic_avx2_amd64.s base_arithmetic_sse4_amd64.s
+ constant_factor_sse4_amd64.s base_arithmetic_avx2_amd64.s base_arithmetic_sse4_amd64.s \
+ scalar_comparison_avx2_amd64.s scalar_comparison_sse4_amd64.s
#
# ARROW-15336: DO NOT add the assembly target for Arm64 (ARM_SOURCES) until c2goasm added the Arm64 support.
@@ -62,6 +63,12 @@ _lib/base_arithmetic_avx2_amd64.s: _lib/base_arithmetic.cc
_lib/base_arithmetic_sse4_amd64.s: _lib/base_arithmetic.cc
$(CXX) -std=c++17 -S $(C_FLAGS) $(ASM_FLAGS_SSE4) $^ -o $@ ; $(PERL_FIXUP_ROTATE) $@
+_lib/scalar_comparison_avx2_amd64.s: _lib/scalar_comparison.cc
+ $(CXX) -std=c++17 -S $(C_FLAGS) $(ASM_FLAGS_AVX2) $^ -o $@ ; $(PERL_FIXUP_ROTATE) $@
+
+_lib/scalar_comparison_sse4_amd64.s: _lib/scalar_comparison.cc
+ $(CXX) -std=c++17 -S $(C_FLAGS) $(ASM_FLAGS_SSE4) $^ -o $@ ; $(PERL_FIXUP_ROTATE) $@
+
_lib/base_arithmetic_neon.s: _lib/base_arithmetic.cc
$(CXX) -std=c++17 -S $(C_FLAGS_NEON) $^ -o $@ ; $(PERL_FIXUP_ROTATE) $@
@@ -92,6 +99,12 @@ base_arithmetic_avx2_amd64.s: _lib/base_arithmetic_avx2_amd64.s
base_arithmetic_sse4_amd64.s: _lib/base_arithmetic_sse4_amd64.s
$(C2GOASM) -a -f $^ $@
+scalar_comparison_avx2_amd64.s: _lib/scalar_comparison_avx2_amd64.s
+ $(C2GOASM) -a -f $^ $@
+
+scalar_comparison_sse4_amd64.s: _lib/scalar_comparison_sse4_amd64.s
+ $(C2GOASM) -a -f $^ $@
+
clean:
rm -f $(INTEL_SOURCES)
rm -f $(addprefix _lib/,$(INTEL_SOURCES))
diff --git a/go/arrow/compute/internal/kernels/_lib/scalar_comparison.cc b/go/arrow/compute/internal/kernels/_lib/scalar_comparison.cc
new file mode 100644
index 0000000000..09540f3679
--- /dev/null
+++ b/go/arrow/compute/internal/kernels/_lib/scalar_comparison.cc
@@ -0,0 +1,241 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <arch.h>
+#include <stdint.h>
+#include "types.h"
+
+// pack integers into a bitmap in batches of 8
+template <int batch_size>
+inline void pack_bits(const uint32_t* values, uint8_t* out) {
+ for (int i = 0; i < batch_size / 8; ++i) {
+ *out++ = (values[0] | values[1]<<1 | values[2]<<2 | values[3]<<3 |
+ values[4]<<4 | values[5]<<5 | values[6]<<6 | values[7]<<7);
+ values += 8;
+ }
+}
+
+struct Equal {
+ template <typename T>
+ static constexpr bool Call(const T& left, const T& right) {
+ return left == right;
+ }
+};
+
+struct NotEqual {
+ template <typename T>
+ static constexpr bool Call(const T& left, const T& right) {
+ return left != right;
+ }
+};
+
+struct Greater {
+ template <typename T>
+ static constexpr bool Call(const T& left, const T& right) {
+ return left > right;
+ }
+};
+
+struct GreaterEqual {
+ template <typename T>
+ static constexpr bool Call(const T& left, const T& right) {
+ return left >= right;
+ }
+};
+
+static inline void set_bit_to(uint8_t* bits, int64_t i, bool bit_is_set) {
+ bits[i/8] ^= static_cast<uint8_t>(-static_cast<uint8_t>(bit_is_set) ^ bits[i / 8]) & static_cast<uint8_t>(1 << (i % 8));
+}
+
+template <typename T, typename Op>
+struct compare_primitive_arr_arr {
+ static inline void Exec(const void* left_void, const void* right_void, int64_t length, void* out_void, const int offset) {
+ const T* left = reinterpret_cast<const T*>(left_void);
+ const T* right = reinterpret_cast<const T*>(right_void);
+ uint8_t* out_bitmap = reinterpret_cast<uint8_t*>(out_void);
+ static constexpr int kBatchSize = 32;
+ int64_t num_batches = length / kBatchSize;
+ uint32_t temp_output[kBatchSize];
+
+ if (int prefix = offset % 8) {
+ for (int i = prefix; i < 8; ++i) {
+ set_bit_to(out_bitmap, i, Op::template Call<T>(*left++, *right++));
+ }
+ out_bitmap++;
+ }
+
+ for (int64_t j = 0; j < num_batches; ++j) {
+ for (int i = 0; i < kBatchSize; ++i) {
+ temp_output[i] = Op::template Call<T>(*left++, *right++);
+ }
+ pack_bits<kBatchSize>(temp_output, out_bitmap);
+ out_bitmap += kBatchSize / 8;
+ }
+ int64_t bit_index = 0;
+ for (int64_t j = kBatchSize * num_batches; j < length; ++j) {
+ set_bit_to(out_bitmap, bit_index++, Op::template Call<T>(*left++, *right++));
+ }
+ }
+};
+
+template <typename T, typename Op>
+struct compare_primitive_arr_scalar {
+ static inline void Exec(const void* left_void, const void* right_void, int64_t length, void* out_void, const int offset) {
+ const T* left = reinterpret_cast<const T*>(left_void);
+ const T right = *reinterpret_cast<const T*>(right_void);
+ uint8_t* out_bitmap = reinterpret_cast<uint8_t*>(out_void);
+ static constexpr int kBatchSize = 32;
+ int64_t num_batches = length / kBatchSize;
+ uint32_t temp_output[kBatchSize];
+
+ if (int prefix = offset % 8) {
+ for (int i = prefix; i < 8; ++i) {
+ set_bit_to(out_bitmap, i, Op::template Call<T>(*left++, right));
+ }
+ out_bitmap++;
+ }
+
+ for (int64_t j = 0; j < num_batches; ++j) {
+ for (int i = 0; i < kBatchSize; ++i) {
+ temp_output[i] = Op::template Call<T>(*left++, right);
+ }
+ pack_bits<kBatchSize>(temp_output, out_bitmap);
+ out_bitmap += kBatchSize / 8;
+ }
+ int64_t bit_index = 0;
+ for (int64_t j = kBatchSize * num_batches; j < length; ++j) {
+ set_bit_to(out_bitmap, bit_index++, Op::template Call<T>(*left++, right));
+ }
+ }
+};
+
+template <typename T, typename Op>
+struct compare_primitive_scalar_arr {
+ static inline void Exec(const void* left_void, const void* right_void, int64_t length, void* out_void, const int offset) {
+ const T left = *reinterpret_cast<const T*>(left_void);
+ const T* right = reinterpret_cast<const T*>(right_void);
+ uint8_t* out_bitmap = reinterpret_cast<uint8_t*>(out_void);
+ static constexpr int kBatchSize = 32;
+ int64_t num_batches = length / kBatchSize;
+ uint32_t temp_output[kBatchSize];
+
+ if (int prefix = offset % 8) {
+ for (int i = prefix; i < 8; ++i) {
+ set_bit_to(out_bitmap, i, Op::template Call<T>(left, *right++));
+ }
+ out_bitmap++;
+ }
+
+ for (int64_t j = 0; j < num_batches; ++j) {
+ for (int i = 0; i < kBatchSize; ++i) {
+ temp_output[i] = Op::template Call<T>(left, *right++);
+ }
+ pack_bits<kBatchSize>(temp_output, out_bitmap);
+ out_bitmap += kBatchSize / 8;
+ }
+ int64_t bit_index = 0;
+ for (int64_t j = kBatchSize * num_batches; j < length; ++j) {
+ set_bit_to(out_bitmap, bit_index++, Op::template Call<T>(left, *right++));
+ }
+ }
+};
+
+enum class cmpop : int8_t {
+ EQUAL,
+ NOT_EQUAL,
+ GREATER,
+ GREATER_EQUAL,
+ // LESS and LESS_EQUAL are handled by doing flipped
+ // versions of GREATER and GREATER_EQUAL
+};
+
+template <typename Op, template <typename...> typename Impl>
+static inline void comparison_exec(const int type, const void* left, const void* right, void* out, const int64_t length, const int offset) {
+ const auto ty = static_cast<arrtype>(type);
+
+ switch (ty) {
+ case arrtype::UINT8:
+ return Impl<uint8_t, Op>::Exec(left, right, length, out, offset);
+ case arrtype::INT8:
+ return Impl<int8_t, Op>::Exec(left, right, length, out, offset);
+ case arrtype::UINT16:
+ return Impl<uint16_t, Op>::Exec(left, right, length, out, offset);
+ case arrtype::INT16:
+ return Impl<int16_t, Op>::Exec(left, right, length, out, offset);
+ case arrtype::UINT32:
+ return Impl<uint32_t, Op>::Exec(left, right, length, out, offset);
+ case arrtype::INT32:
+ return Impl<int32_t, Op>::Exec(left, right, length, out, offset);
+ case arrtype::UINT64:
+ return Impl<uint64_t, Op>::Exec(left, right, length, out, offset);
+ case arrtype::INT64:
+ return Impl<int64_t, Op>::Exec(left, right, length, out, offset);
+ case arrtype::FLOAT32:
+ return Impl<float, Op>::Exec(left, right, length, out, offset);
+ case arrtype::FLOAT64:
+ return Impl<double, Op>::Exec(left, right, length, out, offset);
+ default:
+ break;
+ }
+}
+
+extern "C" void FULL_NAME(comparison_equal_arr_arr)(const int type, const void* left, const void* right, void* out, const int64_t length, const int offset) {
+ comparison_exec<Equal, compare_primitive_arr_arr>(type, left, right, out, length, offset);
+}
+
+extern "C" void FULL_NAME(comparison_equal_arr_scalar)(const int type, const void* left, const void* right, void* out, const int64_t length, const int offset) {
+ comparison_exec<Equal, compare_primitive_arr_scalar>(type, left, right, out, length, offset);
+}
+
+extern "C" void FULL_NAME(comparison_equal_scalar_arr)(const int type, const void* left, const void* right, void* out, const int64_t length, const int offset) {
+ comparison_exec<Equal, compare_primitive_scalar_arr>(type, left, right, out, length, offset);
+}
+
+extern "C" void FULL_NAME(comparison_not_equal_arr_arr)(const int type, const void* left, const void* right, void* out, const int64_t length, const int offset) {
+ comparison_exec<NotEqual, compare_primitive_arr_arr>(type, left, right, out, length, offset);
+}
+
+extern "C" void FULL_NAME(comparison_not_equal_arr_scalar)(const int type, const void* left, const void* right, void* out, const int64_t length, const int offset) {
+ comparison_exec<NotEqual, compare_primitive_arr_scalar>(type, left, right, out, length, offset);
+}
+
+extern "C" void FULL_NAME(comparison_not_equal_scalar_arr)(const int type, const void* left, const void* right, void* out, const int64_t length, const int offset) {
+ comparison_exec<NotEqual, compare_primitive_scalar_arr>(type, left, right, out, length, offset);
+}
+
+extern "C" void FULL_NAME(comparison_greater_arr_arr)(const int type, const void* left, const void* right, void* out, const int64_t length, const int offset) {
+ comparison_exec<Greater, compare_primitive_arr_arr>(type, left, right, out, length, offset);
+}
+
+extern "C" void FULL_NAME(comparison_greater_arr_scalar)(const int type, const void* left, const void* right, void* out, const int64_t length, const int offset) {
+ comparison_exec<Greater, compare_primitive_arr_scalar>(type, left, right, out, length, offset);
+}
+
+extern "C" void FULL_NAME(comparison_greater_scalar_arr)(const int type, const void* left, const void* right, void* out, const int64_t length, const int offset) {
+ comparison_exec<Greater, compare_primitive_scalar_arr>(type, left, right, out, length, offset);
+}
+
+extern "C" void FULL_NAME(comparison_greater_equal_arr_arr)(const int type, const void* left, const void* right, void* out, const int64_t length, const int offset) {
+ comparison_exec<GreaterEqual, compare_primitive_arr_arr>(type, left, right, out, length, offset);
+}
+
+extern "C" void FULL_NAME(comparison_greater_equal_arr_scalar)(const int type, const void* left, const void* right, void* out, const int64_t length, const int offset) {
+ comparison_exec<GreaterEqual, compare_primitive_arr_scalar>(type, left, right, out, length, offset);
+}
+
+extern "C" void FULL_NAME(comparison_greater_equal_scalar_arr)(const int type, const void* left, const void* right, void* out, const int64_t length, const int offset) {
+ comparison_exec<GreaterEqual, compare_primitive_scalar_arr>(type, left, right, out, length, offset);
+}
diff --git a/go/arrow/compute/internal/kernels/_lib/scalar_comparison_avx2_amd64.s b/go/arrow/compute/internal/kernels/_lib/scalar_comparison_avx2_amd64.s
new file mode 100644
index 0000000000..b29d6694a1
--- /dev/null
+++ b/go/arrow/compute/internal/kernels/_lib/scalar_comparison_avx2_amd64.s
@@ -0,0 +1,67763 @@
+ .text
+ .intel_syntax noprefix
+ .file "scalar_comparison.cc"
+ .globl comparison_equal_arr_arr_avx2 # -- Begin function comparison_equal_arr_arr_avx2
+ .p2align 4, 0x90
+ .type comparison_equal_arr_arr_avx2,@function
+comparison_equal_arr_arr_avx2: # @comparison_equal_arr_arr_avx2
+# %bb.0:
+ push rbp
+ mov rbp, rsp
+ push r15
+ push r14
+ push r13
+ push r12
+ push rbx
+ and rsp, -8
+ sub rsp, 72
+ # kill: def $r9d killed $r9d def $r9
+ mov r11, r8
+ mov r14, rcx
+ cmp edi, 6
+ jg .LBB0_29
+# %bb.1:
+ cmp edi, 3
+ jle .LBB0_2
+# %bb.15:
+ cmp edi, 4
+ je .LBB0_68
+# %bb.16:
+ cmp edi, 5
+ je .LBB0_79
+# %bb.17:
+ cmp edi, 6
+ jne .LBB0_123
+# %bb.18:
+ lea r15, [r11 + 31]
+ test r11, r11
+ cmovns r15, r11
+ lea eax, [r9 + 7]
+ test r9d, r9d
+ cmovns eax, r9d
+ and eax, -8
+ sub r9d, eax
+ je .LBB0_22
+# %bb.19:
+ movsxd rax, r9d
+ .p2align 4, 0x90
+.LBB0_20: # =>This Inner Loop Header: Depth=1
+ mov ecx, dword ptr [rsi]
+ add rsi, 4
+ cmp ecx, dword ptr [rdx]
+ lea rdx, [rdx + 4]
+ sete r10b
+ neg r10b
+ lea rdi, [rax + 7]
+ test rax, rax
+ cmovns rdi, rax
+ sar rdi, 3
+ movzx r8d, byte ptr [r14 + rdi]
+ xor r10b, r8b
+ lea r9d, [8*rdi]
+ mov ecx, eax
+ sub ecx, r9d
+ mov ebx, 1
+ # kill: def $cl killed $cl killed $ecx
+ shl ebx, cl
+ and bl, r10b
+ xor bl, r8b
+ mov byte ptr [r14 + rdi], bl
+ add rax, 1
+ cmp rax, 8
+ jne .LBB0_20
+# %bb.21:
+ add r14, 1
+.LBB0_22:
+ sar r15, 5
+ cmp r11, 32
+ jl .LBB0_26
+# %bb.23:
+ mov qword ptr [rsp + 24], r11 # 8-byte Spill
+ mov qword ptr [rsp + 64], r15 # 8-byte Spill
+ mov qword ptr [rsp + 56], r15 # 8-byte Spill
+ .p2align 4, 0x90
+.LBB0_24: # =>This Inner Loop Header: Depth=1
+ mov qword ptr [rsp + 48], r14 # 8-byte Spill
+ mov eax, dword ptr [rsi]
+ mov ecx, dword ptr [rsi + 4]
+ cmp eax, dword ptr [rdx]
+ sete byte ptr [rsp + 40] # 1-byte Folded Spill
+ cmp ecx, dword ptr [rdx + 4]
+ sete byte ptr [rsp + 32] # 1-byte Folded Spill
+ mov eax, dword ptr [rsi + 8]
+ cmp eax, dword ptr [rdx + 8]
+ sete byte ptr [rsp + 20] # 1-byte Folded Spill
+ mov eax, dword ptr [rsi + 12]
+ cmp eax, dword ptr [rdx + 12]
+ sete byte ptr [rsp + 21] # 1-byte Folded Spill
+ mov eax, dword ptr [rsi + 16]
+ cmp eax, dword ptr [rdx + 16]
+ sete byte ptr [rsp + 22] # 1-byte Folded Spill
+ mov eax, dword ptr [rsi + 20]
+ cmp eax, dword ptr [rdx + 20]
+ sete byte ptr [rsp + 23] # 1-byte Folded Spill
+ mov eax, dword ptr [rsi + 24]
+ cmp eax, dword ptr [rdx + 24]
+ sete byte ptr [rsp + 4] # 1-byte Folded Spill
+ mov eax, dword ptr [rsi + 28]
+ cmp eax, dword ptr [rdx + 28]
+ sete r13b
+ mov eax, dword ptr [rsi + 32]
+ cmp eax, dword ptr [rdx + 32]
+ sete byte ptr [rsp + 9] # 1-byte Folded Spill
+ mov eax, dword ptr [rsi + 36]
+ cmp eax, dword ptr [rdx + 36]
+ sete r8b
+ mov eax, dword ptr [rsi + 40]
+ cmp eax, dword ptr [rdx + 40]
+ sete r11b
+ mov eax, dword ptr [rsi + 44]
+ cmp eax, dword ptr [rdx + 44]
+ sete r15b
+ mov eax, dword ptr [rsi + 48]
+ cmp eax, dword ptr [rdx + 48]
+ sete byte ptr [rsp + 5] # 1-byte Folded Spill
+ mov eax, dword ptr [rsi + 52]
+ cmp eax, dword ptr [rdx + 52]
+ sete byte ptr [rsp + 6] # 1-byte Folded Spill
+ mov eax, dword ptr [rsi + 56]
+ cmp eax, dword ptr [rdx + 56]
+ sete byte ptr [rsp + 7] # 1-byte Folded Spill
+ mov eax, dword ptr [rsi + 60]
+ cmp eax, dword ptr [rdx + 60]
+ sete bl
+ mov eax, dword ptr [rsi + 64]
+ mov ecx, dword ptr [rsi + 68]
+ cmp eax, dword ptr [rdx + 64]
+ mov eax, dword ptr [rsi + 72]
+ sete byte ptr [rsp + 10] # 1-byte Folded Spill
+ cmp ecx, dword ptr [rdx + 68]
+ mov ecx, dword ptr [rsi + 76]
+ sete r10b
+ cmp eax, dword ptr [rdx + 72]
+ mov eax, dword ptr [rsi + 80]
+ sete r14b
+ cmp ecx, dword ptr [rdx + 76]
+ mov ecx, dword ptr [rsi + 84]
+ sete r12b
+ cmp eax, dword ptr [rdx + 80]
+ sete byte ptr [rsp + 8] # 1-byte Folded Spill
+ cmp ecx, dword ptr [rdx + 84]
+ mov eax, dword ptr [rsi + 88]
+ sete byte ptr [rsp + 11] # 1-byte Folded Spill
+ cmp eax, dword ptr [rdx + 88]
+ mov eax, dword ptr [rsi + 92]
+ sete byte ptr [rsp + 12] # 1-byte Folded Spill
+ cmp eax, dword ptr [rdx + 92]
+ mov eax, dword ptr [rsi + 96]
+ sete r9b
+ cmp eax, dword ptr [rdx + 96]
+ mov eax, dword ptr [rsi + 100]
+ sete byte ptr [rsp + 19] # 1-byte Folded Spill
+ cmp eax, dword ptr [rdx + 100]
+ mov eax, dword ptr [rsi + 104]
+ sete byte ptr [rsp + 13] # 1-byte Folded Spill
+ cmp eax, dword ptr [rdx + 104]
+ mov eax, dword ptr [rsi + 108]
+ sete byte ptr [rsp + 14] # 1-byte Folded Spill
+ cmp eax, dword ptr [rdx + 108]
+ mov eax, dword ptr [rsi + 112]
+ sete byte ptr [rsp + 15] # 1-byte Folded Spill
+ cmp eax, dword ptr [rdx + 112]
+ mov eax, dword ptr [rsi + 116]
+ sete byte ptr [rsp + 16] # 1-byte Folded Spill
+ cmp eax, dword ptr [rdx + 116]
+ mov eax, dword ptr [rsi + 120]
+ sete byte ptr [rsp + 18] # 1-byte Folded Spill
+ cmp eax, dword ptr [rdx + 120]
+ mov eax, dword ptr [rsi + 124]
+ sete byte ptr [rsp + 17] # 1-byte Folded Spill
+ sub rsi, -128
+ cmp eax, dword ptr [rdx + 124]
+ sete dil
+ movzx eax, byte ptr [rsp + 32] # 1-byte Folded Reload
+ add al, al
+ add al, byte ptr [rsp + 40] # 1-byte Folded Reload
+ mov ecx, eax
+ movzx eax, byte ptr [rsp + 4] # 1-byte Folded Reload
+ shl al, 6
+ shl r13b, 7
+ or r13b, al
+ movzx eax, byte ptr [rsp + 20] # 1-byte Folded Reload
+ shl al, 2
+ or al, cl
+ add r8b, r8b
+ add r8b, byte ptr [rsp + 9] # 1-byte Folded Reload
+ movzx ecx, byte ptr [rsp + 21] # 1-byte Folded Reload
+ shl cl, 3
+ or cl, al
+ mov eax, ecx
+ shl r11b, 2
+ or r11b, r8b
+ movzx ecx, byte ptr [rsp + 22] # 1-byte Folded Reload
+ shl cl, 4
+ or cl, al
+ mov r8d, ecx
+ shl r15b, 3
+ or r15b, r11b
+ movzx ecx, byte ptr [rsp + 23] # 1-byte Folded Reload
+ shl cl, 5
+ or cl, r8b
+ movzx eax, byte ptr [rsp + 5] # 1-byte Folded Reload
+ shl al, 4
+ or al, r15b
+ mov r8d, eax
+ movzx eax, byte ptr [rsp + 6] # 1-byte Folded Reload
+ shl al, 5
+ or al, r8b
+ movzx r8d, byte ptr [rsp + 7] # 1-byte Folded Reload
+ shl r8b, 6
+ shl bl, 7
+ or bl, r8b
+ or r13b, cl
+ or bl, al
+ add r10b, r10b
+ add r10b, byte ptr [rsp + 10] # 1-byte Folded Reload
+ shl r14b, 2
+ or r14b, r10b
+ shl r12b, 3
+ or r12b, r14b
+ movzx eax, byte ptr [rsp + 8] # 1-byte Folded Reload
+ shl al, 4
+ or al, r12b
+ mov ecx, eax
+ mov r14, qword ptr [rsp + 48] # 8-byte Reload
+ movzx eax, byte ptr [rsp + 11] # 1-byte Folded Reload
+ shl al, 5
+ or al, cl
+ mov byte ptr [r14], r13b
+ movzx ecx, byte ptr [rsp + 12] # 1-byte Folded Reload
+ shl cl, 6
+ shl r9b, 7
+ or r9b, cl
+ mov byte ptr [r14 + 1], bl
+ or r9b, al
+ movzx eax, byte ptr [rsp + 13] # 1-byte Folded Reload
+ add al, al
+ add al, byte ptr [rsp + 19] # 1-byte Folded Reload
+ mov ecx, eax
+ movzx eax, byte ptr [rsp + 14] # 1-byte Folded Reload
+ shl al, 2
+ or al, cl
+ mov ecx, eax
+ movzx eax, byte ptr [rsp + 15] # 1-byte Folded Reload
+ shl al, 3
+ or al, cl
+ mov ecx, eax
+ movzx eax, byte ptr [rsp + 16] # 1-byte Folded Reload
+ shl al, 4
+ or al, cl
+ mov ecx, eax
+ movzx eax, byte ptr [rsp + 18] # 1-byte Folded Reload
+ shl al, 5
+ or al, cl
+ movzx ecx, byte ptr [rsp + 17] # 1-byte Folded Reload
+ shl cl, 6
+ shl dil, 7
+ or dil, cl
+ or dil, al
+ mov byte ptr [r14 + 2], r9b
+ mov byte ptr [r14 + 3], dil
+ add rdx, 128
+ add r14, 4
+ add qword ptr [rsp + 56], -1 # 8-byte Folded Spill
+ jne .LBB0_24
+# %bb.25:
+ mov r11, qword ptr [rsp + 24] # 8-byte Reload
+ mov r15, qword ptr [rsp + 64] # 8-byte Reload
+.LBB0_26:
+ shl r15, 5
+ cmp r15, r11
+ jge .LBB0_123
+# %bb.27:
+ sub r11, r15
+ xor ecx, ecx
+ .p2align 4, 0x90
+.LBB0_28: # =>This Inner Loop Header: Depth=1
+ lea r8, [rcx + 1]
+ mov edi, dword ptr [rsi + 4*rcx]
+ cmp edi, dword ptr [rdx + 4*rcx]
+ sete bl
+ neg bl
+ mov rdi, rcx
+ shr rdi, 3
+ movzx r9d, byte ptr [r14 + rdi]
+ xor bl, r9b
+ and cl, 7
+ mov al, 1
+ # kill: def $cl killed $cl killed $rcx
+ shl al, cl
+ and al, bl
+ xor al, r9b
+ mov byte ptr [r14 + rdi], al
+ mov rcx, r8
+ cmp r11, r8
+ jne .LBB0_28
+ jmp .LBB0_123
+.LBB0_29:
+ cmp edi, 8
+ jle .LBB0_30
+# %bb.43:
+ cmp edi, 9
+ je .LBB0_101
+# %bb.44:
+ cmp edi, 11
+ je .LBB0_112
+# %bb.45:
+ cmp edi, 12
+ jne .LBB0_123
+# %bb.46:
+ lea r15, [r11 + 31]
+ test r11, r11
+ cmovns r15, r11
+ lea eax, [r9 + 7]
+ test r9d, r9d
+ cmovns eax, r9d
+ and eax, -8
+ sub r9d, eax
+ je .LBB0_50
+# %bb.47:
+ movsxd rax, r9d
+ .p2align 4, 0x90
+.LBB0_48: # =>This Inner Loop Header: Depth=1
+ vmovsd xmm0, qword ptr [rsi] # xmm0 = mem[0],zero
+ add rsi, 8
+ vucomisd xmm0, qword ptr [rdx]
+ lea rdx, [rdx + 8]
+ sete r10b
+ neg r10b
+ lea rdi, [rax + 7]
+ test rax, rax
+ cmovns rdi, rax
+ sar rdi, 3
+ movzx r8d, byte ptr [r14 + rdi]
+ xor r10b, r8b
+ lea r9d, [8*rdi]
+ mov ecx, eax
+ sub ecx, r9d
+ mov ebx, 1
+ # kill: def $cl killed $cl killed $ecx
+ shl ebx, cl
+ and bl, r10b
+ xor bl, r8b
+ mov byte ptr [r14 + rdi], bl
+ add rax, 1
+ cmp rax, 8
+ jne .LBB0_48
+# %bb.49:
+ add r14, 1
+.LBB0_50:
+ sar r15, 5
+ cmp r11, 32
+ jl .LBB0_54
+# %bb.51:
+ mov qword ptr [rsp + 24], r11 # 8-byte Spill
+ mov qword ptr [rsp + 32], r15 # 8-byte Spill
+ mov qword ptr [rsp + 40], r15 # 8-byte Spill
+ .p2align 4, 0x90
+.LBB0_52: # =>This Inner Loop Header: Depth=1
+ mov qword ptr [rsp + 48], r14 # 8-byte Spill
+ vmovsd xmm0, qword ptr [rsi] # xmm0 = mem[0],zero
+ vmovsd xmm1, qword ptr [rsi + 8] # xmm1 = mem[0],zero
+ vucomisd xmm0, qword ptr [rdx]
+ sete byte ptr [rsp + 4] # 1-byte Folded Spill
+ vucomisd xmm1, qword ptr [rdx + 8]
+ sete al
+ vmovsd xmm0, qword ptr [rsi + 16] # xmm0 = mem[0],zero
+ vucomisd xmm0, qword ptr [rdx + 16]
+ vmovsd xmm0, qword ptr [rsi + 24] # xmm0 = mem[0],zero
+ sete byte ptr [rsp + 5] # 1-byte Folded Spill
+ vucomisd xmm0, qword ptr [rdx + 24]
+ sete byte ptr [rsp + 22] # 1-byte Folded Spill
+ vmovsd xmm0, qword ptr [rsi + 32] # xmm0 = mem[0],zero
+ vucomisd xmm0, qword ptr [rdx + 32]
+ vmovsd xmm0, qword ptr [rsi + 40] # xmm0 = mem[0],zero
+ sete byte ptr [rsp + 21] # 1-byte Folded Spill
+ vucomisd xmm0, qword ptr [rdx + 40]
+ sete byte ptr [rsp + 23] # 1-byte Folded Spill
+ vmovsd xmm0, qword ptr [rsi + 48] # xmm0 = mem[0],zero
+ vucomisd xmm0, qword ptr [rdx + 48]
+ vmovsd xmm0, qword ptr [rsi + 56] # xmm0 = mem[0],zero
+ sete r13b
+ vucomisd xmm0, qword ptr [rdx + 56]
+ sete r15b
+ vmovsd xmm0, qword ptr [rsi + 64] # xmm0 = mem[0],zero
+ vucomisd xmm0, qword ptr [rdx + 64]
+ vmovsd xmm0, qword ptr [rsi + 72] # xmm0 = mem[0],zero
+ sete byte ptr [rsp + 8] # 1-byte Folded Spill
+ vucomisd xmm0, qword ptr [rdx + 72]
+ sete cl
+ vmovsd xmm0, qword ptr [rsi + 80] # xmm0 = mem[0],zero
+ vucomisd xmm0, qword ptr [rdx + 80]
+ vmovsd xmm0, qword ptr [rsi + 88] # xmm0 = mem[0],zero
+ sete r9b
+ vucomisd xmm0, qword ptr [rdx + 88]
+ sete r11b
+ vmovsd xmm0, qword ptr [rsi + 96] # xmm0 = mem[0],zero
+ vucomisd xmm0, qword ptr [rdx + 96]
+ vmovsd xmm0, qword ptr [rsi + 104] # xmm0 = mem[0],zero
+ sete r10b
+ vucomisd xmm0, qword ptr [rdx + 104]
+ sete byte ptr [rsp + 7] # 1-byte Folded Spill
+ vmovsd xmm0, qword ptr [rsi + 112] # xmm0 = mem[0],zero
+ vucomisd xmm0, qword ptr [rdx + 112]
+ vmovsd xmm0, qword ptr [rsi + 120] # xmm0 = mem[0],zero
+ sete byte ptr [rsp + 6] # 1-byte Folded Spill
+ vucomisd xmm0, qword ptr [rdx + 120]
+ sete bl
+ vmovsd xmm0, qword ptr [rsi + 128] # xmm0 = mem[0],zero
+ vucomisd xmm0, qword ptr [rdx + 128]
+ vmovsd xmm0, qword ptr [rsi + 136] # xmm0 = mem[0],zero
+ sete byte ptr [rsp + 14] # 1-byte Folded Spill
+ vucomisd xmm0, qword ptr [rdx + 136]
+ vmovsd xmm0, qword ptr [rsi + 144] # xmm0 = mem[0],zero
+ sete r14b
+ vucomisd xmm0, qword ptr [rdx + 144]
+ vmovsd xmm0, qword ptr [rsi + 152] # xmm0 = mem[0],zero
+ sete r12b
+ vucomisd xmm0, qword ptr [rdx + 152]
+ vmovsd xmm0, qword ptr [rsi + 160] # xmm0 = mem[0],zero
+ sete byte ptr [rsp + 9] # 1-byte Folded Spill
+ vucomisd xmm0, qword ptr [rdx + 160]
+ vmovsd xmm0, qword ptr [rsi + 168] # xmm0 = mem[0],zero
+ sete byte ptr [rsp + 10] # 1-byte Folded Spill
+ vucomisd xmm0, qword ptr [rdx + 168]
+ vmovsd xmm0, qword ptr [rsi + 176] # xmm0 = mem[0],zero
+ sete byte ptr [rsp + 11] # 1-byte Folded Spill
+ vucomisd xmm0, qword ptr [rdx + 176]
+ vmovsd xmm0, qword ptr [rsi + 184] # xmm0 = mem[0],zero
+ sete byte ptr [rsp + 12] # 1-byte Folded Spill
+ vucomisd xmm0, qword ptr [rdx + 184]
+ vmovsd xmm0, qword ptr [rsi + 192] # xmm0 = mem[0],zero
+ sete r8b
+ vucomisd xmm0, qword ptr [rdx + 192]
+ vmovsd xmm0, qword ptr [rsi + 200] # xmm0 = mem[0],zero
+ sete byte ptr [rsp + 20] # 1-byte Folded Spill
+ vucomisd xmm0, qword ptr [rdx + 200]
+ vmovsd xmm0, qword ptr [rsi + 208] # xmm0 = mem[0],zero
+ sete byte ptr [rsp + 13] # 1-byte Folded Spill
+ vucomisd xmm0, qword ptr [rdx + 208]
+ vmovsd xmm0, qword ptr [rsi + 216] # xmm0 = mem[0],zero
+ sete byte ptr [rsp + 15] # 1-byte Folded Spill
+ vucomisd xmm0, qword ptr [rdx + 216]
+ vmovsd xmm0, qword ptr [rsi + 224] # xmm0 = mem[0],zero
+ sete byte ptr [rsp + 16] # 1-byte Folded Spill
+ vucomisd xmm0, qword ptr [rdx + 224]
+ vmovsd xmm0, qword ptr [rsi + 232] # xmm0 = mem[0],zero
+ sete byte ptr [rsp + 17] # 1-byte Folded Spill
+ vucomisd xmm0, qword ptr [rdx + 232]
+ vmovsd xmm0, qword ptr [rsi + 240] # xmm0 = mem[0],zero
+ sete byte ptr [rsp + 19] # 1-byte Folded Spill
+ vucomisd xmm0, qword ptr [rdx + 240]
+ vmovsd xmm0, qword ptr [rsi + 248] # xmm0 = mem[0],zero
+ sete byte ptr [rsp + 18] # 1-byte Folded Spill
+ add rsi, 256
+ vucomisd xmm0, qword ptr [rdx + 248]
+ sete dil
+ add al, al
+ add al, byte ptr [rsp + 4] # 1-byte Folded Reload
+ shl r13b, 6
+ shl r15b, 7
+ or r15b, r13b
+ movzx r13d, byte ptr [rsp + 5] # 1-byte Folded Reload
+ shl r13b, 2
+ or r13b, al
+ mov eax, r13d
+ add cl, cl
+ add cl, byte ptr [rsp + 8] # 1-byte Folded Reload
+ movzx r13d, byte ptr [rsp + 22] # 1-byte Folded Reload
+ shl r13b, 3
+ or r13b, al
+ shl r9b, 2
+ or r9b, cl
+ movzx ecx, byte ptr [rsp + 21] # 1-byte Folded Reload
+ shl cl, 4
+ or cl, r13b
+ mov r13d, ecx
+ shl r11b, 3
+ or r11b, r9b
+ movzx ecx, byte ptr [rsp + 23] # 1-byte Folded Reload
+ shl cl, 5
+ or cl, r13b
+ shl r10b, 4
+ or r10b, r11b
+ movzx eax, byte ptr [rsp + 7] # 1-byte Folded Reload
+ shl al, 5
+ or al, r10b
+ movzx r9d, byte ptr [rsp + 6] # 1-byte Folded Reload
+ shl r9b, 6
+ shl bl, 7
+ or bl, r9b
+ or r15b, cl
+ or bl, al
+ add r14b, r14b
+ add r14b, byte ptr [rsp + 14] # 1-byte Folded Reload
+ shl r12b, 2
+ or r12b, r14b
+ mov r14, qword ptr [rsp + 48] # 8-byte Reload
+ movzx eax, byte ptr [rsp + 9] # 1-byte Folded Reload
+ shl al, 3
+ or al, r12b
+ mov ecx, eax
+ movzx eax, byte ptr [rsp + 10] # 1-byte Folded Reload
+ shl al, 4
+ or al, cl
+ mov ecx, eax
+ movzx eax, byte ptr [rsp + 11] # 1-byte Folded Reload
+ shl al, 5
+ or al, cl
+ mov byte ptr [r14], r15b
+ movzx ecx, byte ptr [rsp + 12] # 1-byte Folded Reload
+ shl cl, 6
+ shl r8b, 7
+ or r8b, cl
+ mov byte ptr [r14 + 1], bl
+ or r8b, al
+ movzx eax, byte ptr [rsp + 13] # 1-byte Folded Reload
+ add al, al
+ add al, byte ptr [rsp + 20] # 1-byte Folded Reload
+ mov ecx, eax
+ movzx eax, byte ptr [rsp + 15] # 1-byte Folded Reload
+ shl al, 2
+ or al, cl
+ mov ecx, eax
+ movzx eax, byte ptr [rsp + 16] # 1-byte Folded Reload
+ shl al, 3
+ or al, cl
+ mov ecx, eax
+ movzx eax, byte ptr [rsp + 17] # 1-byte Folded Reload
+ shl al, 4
+ or al, cl
+ movzx ecx, byte ptr [rsp + 19] # 1-byte Folded Reload
+ shl cl, 5
+ or cl, al
+ movzx eax, byte ptr [rsp + 18] # 1-byte Folded Reload
+ shl al, 6
+ shl dil, 7
+ or dil, al
+ or dil, cl
+ mov byte ptr [r14 + 2], r8b
+ mov byte ptr [r14 + 3], dil
+ add rdx, 256
+ add r14, 4
+ add qword ptr [rsp + 40], -1 # 8-byte Folded Spill
+ jne .LBB0_52
+# %bb.53:
+ mov r11, qword ptr [rsp + 24] # 8-byte Reload
+ mov r15, qword ptr [rsp + 32] # 8-byte Reload
+.LBB0_54:
+ shl r15, 5
+ cmp r15, r11
+ jge .LBB0_123
+# %bb.55:
+ sub r11, r15
+ xor ecx, ecx
+ .p2align 4, 0x90
+.LBB0_56: # =>This Inner Loop Header: Depth=1
+ vmovsd xmm0, qword ptr [rsi + 8*rcx] # xmm0 = mem[0],zero
+ vucomisd xmm0, qword ptr [rdx + 8*rcx]
+ lea r8, [rcx + 1]
+ sete bl
+ neg bl
+ mov rdi, rcx
+ shr rdi, 3
+ movzx r9d, byte ptr [r14 + rdi]
+ xor bl, r9b
+ and cl, 7
+ mov al, 1
+ # kill: def $cl killed $cl killed $rcx
+ shl al, cl
+ and al, bl
+ xor al, r9b
+ mov byte ptr [r14 + rdi], al
+ mov rcx, r8
+ cmp r11, r8
+ jne .LBB0_56
+ jmp .LBB0_123
+.LBB0_2:
+ cmp edi, 2
+ je .LBB0_57
+# %bb.3:
+ cmp edi, 3
+ jne .LBB0_123
+# %bb.4:
+ lea r15, [r11 + 31]
+ test r11, r11
+ cmovns r15, r11
+ lea eax, [r9 + 7]
+ test r9d, r9d
+ cmovns eax, r9d
+ and eax, -8
+ sub r9d, eax
+ je .LBB0_8
+# %bb.5:
+ movsxd rax, r9d
+ .p2align 4, 0x90
+.LBB0_6: # =>This Inner Loop Header: Depth=1
+ movzx ecx, byte ptr [rsi]
+ add rsi, 1
+ cmp cl, byte ptr [rdx]
+ lea rdx, [rdx + 1]
+ sete r10b
+ neg r10b
+ lea rdi, [rax + 7]
+ test rax, rax
+ cmovns rdi, rax
+ sar rdi, 3
+ movzx r8d, byte ptr [r14 + rdi]
+ xor r10b, r8b
+ lea r9d, [8*rdi]
+ mov ecx, eax
+ sub ecx, r9d
+ mov ebx, 1
+ # kill: def $cl killed $cl killed $ecx
+ shl ebx, cl
+ and bl, r10b
+ xor bl, r8b
+ mov byte ptr [r14 + rdi], bl
+ add rax, 1
+ cmp rax, 8
+ jne .LBB0_6
+# %bb.7:
+ add r14, 1
+.LBB0_8:
+ sar r15, 5
+ cmp r11, 32
+ jl .LBB0_12
+# %bb.9:
+ mov qword ptr [rsp + 24], r11 # 8-byte Spill
+ mov qword ptr [rsp + 56], r15 # 8-byte Spill
+ mov qword ptr [rsp + 32], r15 # 8-byte Spill
+ .p2align 4, 0x90
+.LBB0_10: # =>This Inner Loop Header: Depth=1
+ mov qword ptr [rsp + 48], r14 # 8-byte Spill
+ movzx eax, byte ptr [rsi]
+ movzx ecx, byte ptr [rsi + 1]
+ cmp al, byte ptr [rdx]
+ sete byte ptr [rsp + 40] # 1-byte Folded Spill
+ cmp cl, byte ptr [rdx + 1]
+ sete cl
+ movzx eax, byte ptr [rsi + 2]
+ cmp al, byte ptr [rdx + 2]
+ sete byte ptr [rsp + 20] # 1-byte Folded Spill
+ movzx eax, byte ptr [rsi + 3]
+ cmp al, byte ptr [rdx + 3]
+ sete byte ptr [rsp + 21] # 1-byte Folded Spill
+ movzx eax, byte ptr [rsi + 4]
+ cmp al, byte ptr [rdx + 4]
+ sete byte ptr [rsp + 22] # 1-byte Folded Spill
+ movzx eax, byte ptr [rsi + 5]
+ cmp al, byte ptr [rdx + 5]
+ sete byte ptr [rsp + 23] # 1-byte Folded Spill
+ movzx eax, byte ptr [rsi + 6]
+ cmp al, byte ptr [rdx + 6]
+ sete byte ptr [rsp + 4] # 1-byte Folded Spill
+ movzx eax, byte ptr [rsi + 7]
+ cmp al, byte ptr [rdx + 7]
+ sete r15b
+ movzx eax, byte ptr [rsi + 8]
+ cmp al, byte ptr [rdx + 8]
+ sete byte ptr [rsp + 7] # 1-byte Folded Spill
+ movzx eax, byte ptr [rsi + 9]
+ cmp al, byte ptr [rdx + 9]
+ sete dil
+ movzx eax, byte ptr [rsi + 10]
+ cmp al, byte ptr [rdx + 10]
+ sete r10b
+ movzx eax, byte ptr [rsi + 11]
+ cmp al, byte ptr [rdx + 11]
+ sete r11b
+ movzx eax, byte ptr [rsi + 12]
+ cmp al, byte ptr [rdx + 12]
+ sete r14b
+ movzx eax, byte ptr [rsi + 13]
+ cmp al, byte ptr [rdx + 13]
+ sete byte ptr [rsp + 5] # 1-byte Folded Spill
+ movzx eax, byte ptr [rsi + 14]
+ cmp al, byte ptr [rdx + 14]
+ sete byte ptr [rsp + 6] # 1-byte Folded Spill
+ movzx eax, byte ptr [rsi + 15]
+ cmp al, byte ptr [rdx + 15]
+ sete bl
+ movzx eax, byte ptr [rsi + 16]
+ cmp al, byte ptr [rdx + 16]
+ sete byte ptr [rsp + 13] # 1-byte Folded Spill
+ movzx eax, byte ptr [rsi + 17]
+ cmp al, byte ptr [rdx + 17]
+ sete r12b
+ movzx eax, byte ptr [rsi + 18]
+ cmp al, byte ptr [rdx + 18]
+ sete r13b
+ movzx eax, byte ptr [rsi + 19]
+ cmp al, byte ptr [rdx + 19]
+ sete byte ptr [rsp + 8] # 1-byte Folded Spill
+ movzx eax, byte ptr [rsi + 20]
+ cmp al, byte ptr [rdx + 20]
+ sete byte ptr [rsp + 9] # 1-byte Folded Spill
+ movzx eax, byte ptr [rsi + 21]
+ cmp al, byte ptr [rdx + 21]
+ sete byte ptr [rsp + 10] # 1-byte Folded Spill
+ movzx eax, byte ptr [rsi + 22]
+ cmp al, byte ptr [rdx + 22]
+ sete byte ptr [rsp + 11] # 1-byte Folded Spill
+ movzx eax, byte ptr [rsi + 23]
+ cmp al, byte ptr [rdx + 23]
+ sete r9b
+ movzx eax, byte ptr [rsi + 24]
+ cmp al, byte ptr [rdx + 24]
+ sete byte ptr [rsp + 19] # 1-byte Folded Spill
+ movzx eax, byte ptr [rsi + 25]
+ cmp al, byte ptr [rdx + 25]
+ sete byte ptr [rsp + 12] # 1-byte Folded Spill
+ movzx eax, byte ptr [rsi + 26]
+ cmp al, byte ptr [rdx + 26]
+ sete byte ptr [rsp + 14] # 1-byte Folded Spill
+ movzx eax, byte ptr [rsi + 27]
+ cmp al, byte ptr [rdx + 27]
+ sete byte ptr [rsp + 15] # 1-byte Folded Spill
+ movzx eax, byte ptr [rsi + 28]
+ cmp al, byte ptr [rdx + 28]
+ sete byte ptr [rsp + 16] # 1-byte Folded Spill
+ movzx eax, byte ptr [rsi + 29]
+ cmp al, byte ptr [rdx + 29]
+ sete byte ptr [rsp + 17] # 1-byte Folded Spill
+ movzx eax, byte ptr [rsi + 30]
+ cmp al, byte ptr [rdx + 30]
+ sete byte ptr [rsp + 18] # 1-byte Folded Spill
+ movzx eax, byte ptr [rsi + 31]
+ add rsi, 32
+ cmp al, byte ptr [rdx + 31]
+ sete r8b
+ add cl, cl
+ add cl, byte ptr [rsp + 40] # 1-byte Folded Reload
+ mov eax, ecx
+ movzx ecx, byte ptr [rsp + 4] # 1-byte Folded Reload
+ shl cl, 6
+ shl r15b, 7
+ or r15b, cl
+ movzx ecx, byte ptr [rsp + 20] # 1-byte Folded Reload
+ shl cl, 2
+ or cl, al
+ mov eax, ecx
+ add dil, dil
+ add dil, byte ptr [rsp + 7] # 1-byte Folded Reload
+ movzx ecx, byte ptr [rsp + 21] # 1-byte Folded Reload
+ shl cl, 3
+ or cl, al
+ mov eax, ecx
+ shl r10b, 2
+ or r10b, dil
+ movzx ecx, byte ptr [rsp + 22] # 1-byte Folded Reload
+ shl cl, 4
+ or cl, al
+ mov edi, ecx
+ shl r11b, 3
+ or r11b, r10b
+ movzx ecx, byte ptr [rsp + 23] # 1-byte Folded Reload
+ shl cl, 5
+ or cl, dil
+ shl r14b, 4
+ or r14b, r11b
+ movzx eax, byte ptr [rsp + 5] # 1-byte Folded Reload
+ shl al, 5
+ or al, r14b
+ movzx edi, byte ptr [rsp + 6] # 1-byte Folded Reload
+ shl dil, 6
+ shl bl, 7
+ or bl, dil
+ or r15b, cl
+ or bl, al
+ add r12b, r12b
+ add r12b, byte ptr [rsp + 13] # 1-byte Folded Reload
+ shl r13b, 2
+ or r13b, r12b
+ mov r14, qword ptr [rsp + 48] # 8-byte Reload
+ movzx eax, byte ptr [rsp + 8] # 1-byte Folded Reload
+ shl al, 3
+ or al, r13b
+ mov ecx, eax
+ movzx eax, byte ptr [rsp + 9] # 1-byte Folded Reload
+ shl al, 4
+ or al, cl
+ mov ecx, eax
+ movzx eax, byte ptr [rsp + 10] # 1-byte Folded Reload
+ shl al, 5
+ or al, cl
+ mov byte ptr [r14], r15b
+ movzx ecx, byte ptr [rsp + 11] # 1-byte Folded Reload
+ shl cl, 6
+ shl r9b, 7
+ or r9b, cl
+ mov byte ptr [r14 + 1], bl
+ or r9b, al
+ movzx eax, byte ptr [rsp + 12] # 1-byte Folded Reload
+ add al, al
+ add al, byte ptr [rsp + 19] # 1-byte Folded Reload
+ mov ecx, eax
+ movzx eax, byte ptr [rsp + 14] # 1-byte Folded Reload
+ shl al, 2
+ or al, cl
+ mov ecx, eax
+ movzx eax, byte ptr [rsp + 15] # 1-byte Folded Reload
+ shl al, 3
+ or al, cl
+ mov ecx, eax
+ movzx eax, byte ptr [rsp + 16] # 1-byte Folded Reload
+ shl al, 4
+ or al, cl
+ mov ecx, eax
+ movzx eax, byte ptr [rsp + 17] # 1-byte Folded Reload
+ shl al, 5
+ or al, cl
+ movzx ecx, byte ptr [rsp + 18] # 1-byte Folded Reload
+ shl cl, 6
+ shl r8b, 7
+ or r8b, cl
+ or r8b, al
+ mov byte ptr [r14 + 2], r9b
+ mov byte ptr [r14 + 3], r8b
+ add rdx, 32
+ add r14, 4
+ add qword ptr [rsp + 32], -1 # 8-byte Folded Spill
+ jne .LBB0_10
+# %bb.11:
+ mov r11, qword ptr [rsp + 24] # 8-byte Reload
+ mov r15, qword ptr [rsp + 56] # 8-byte Reload
+.LBB0_12:
+ shl r15, 5
+ cmp r15, r11
+ jge .LBB0_123
+# %bb.13:
+ sub r11, r15
+ xor ecx, ecx
+ .p2align 4, 0x90
+.LBB0_14: # =>This Inner Loop Header: Depth=1
+ lea r8, [rcx + 1]
+ movzx ebx, byte ptr [rsi + rcx]
+ cmp bl, byte ptr [rdx + rcx]
+ sete bl
+ neg bl
+ mov rdi, rcx
+ shr rdi, 3
+ movzx r9d, byte ptr [r14 + rdi]
+ xor bl, r9b
+ and cl, 7
+ mov al, 1
+ # kill: def $cl killed $cl killed $rcx
+ shl al, cl
+ and al, bl
+ xor al, r9b
+ mov byte ptr [r14 + rdi], al
+ mov rcx, r8
+ cmp r11, r8
+ jne .LBB0_14
+ jmp .LBB0_123
+.LBB0_30:
+ cmp edi, 7
+ je .LBB0_90
+# %bb.31:
+ cmp edi, 8
+ jne .LBB0_123
+# %bb.32:
+ lea r15, [r11 + 31]
+ test r11, r11
+ cmovns r15, r11
+ lea eax, [r9 + 7]
+ test r9d, r9d
+ cmovns eax, r9d
+ and eax, -8
+ sub r9d, eax
+ je .LBB0_36
+# %bb.33:
+ movsxd rax, r9d
+ .p2align 4, 0x90
+.LBB0_34: # =>This Inner Loop Header: Depth=1
+ mov rcx, qword ptr [rsi]
+ add rsi, 8
+ cmp rcx, qword ptr [rdx]
+ lea rdx, [rdx + 8]
+ sete r10b
+ neg r10b
+ lea rdi, [rax + 7]
+ test rax, rax
+ cmovns rdi, rax
+ sar rdi, 3
+ movzx r8d, byte ptr [r14 + rdi]
+ xor r10b, r8b
+ lea r9d, [8*rdi]
+ mov ecx, eax
+ sub ecx, r9d
+ mov ebx, 1
+ # kill: def $cl killed $cl killed $ecx
+ shl ebx, cl
+ and bl, r10b
+ xor bl, r8b
+ mov byte ptr [r14 + rdi], bl
+ add rax, 1
+ cmp rax, 8
+ jne .LBB0_34
+# %bb.35:
+ add r14, 1
+.LBB0_36:
+ sar r15, 5
+ cmp r11, 32
+ jl .LBB0_40
+# %bb.37:
+ mov qword ptr [rsp + 24], r11 # 8-byte Spill
+ mov qword ptr [rsp + 64], r15 # 8-byte Spill
+ mov qword ptr [rsp + 56], r15 # 8-byte Spill
+ .p2align 4, 0x90
+.LBB0_38: # =>This Inner Loop Header: Depth=1
+ mov qword ptr [rsp + 48], r14 # 8-byte Spill
+ mov rax, qword ptr [rsi]
+ mov rcx, qword ptr [rsi + 8]
+ cmp rax, qword ptr [rdx]
+ sete byte ptr [rsp + 40] # 1-byte Folded Spill
+ cmp rcx, qword ptr [rdx + 8]
+ sete byte ptr [rsp + 32] # 1-byte Folded Spill
+ mov rax, qword ptr [rsi + 16]
+ cmp rax, qword ptr [rdx + 16]
+ sete byte ptr [rsp + 20] # 1-byte Folded Spill
+ mov rax, qword ptr [rsi + 24]
+ cmp rax, qword ptr [rdx + 24]
+ sete byte ptr [rsp + 21] # 1-byte Folded Spill
+ mov rax, qword ptr [rsi + 32]
+ cmp rax, qword ptr [rdx + 32]
+ sete byte ptr [rsp + 22] # 1-byte Folded Spill
+ mov rax, qword ptr [rsi + 40]
+ cmp rax, qword ptr [rdx + 40]
+ sete byte ptr [rsp + 23] # 1-byte Folded Spill
+ mov rax, qword ptr [rsi + 48]
+ cmp rax, qword ptr [rdx + 48]
+ sete byte ptr [rsp + 4] # 1-byte Folded Spill
+ mov rax, qword ptr [rsi + 56]
+ cmp rax, qword ptr [rdx + 56]
+ sete r13b
+ mov rax, qword ptr [rsi + 64]
+ cmp rax, qword ptr [rdx + 64]
+ sete byte ptr [rsp + 9] # 1-byte Folded Spill
+ mov rax, qword ptr [rsi + 72]
+ cmp rax, qword ptr [rdx + 72]
+ sete r8b
+ mov rax, qword ptr [rsi + 80]
+ cmp rax, qword ptr [rdx + 80]
+ sete r11b
+ mov rax, qword ptr [rsi + 88]
+ cmp rax, qword ptr [rdx + 88]
+ sete r15b
+ mov rax, qword ptr [rsi + 96]
+ cmp rax, qword ptr [rdx + 96]
+ sete byte ptr [rsp + 5] # 1-byte Folded Spill
+ mov rax, qword ptr [rsi + 104]
+ cmp rax, qword ptr [rdx + 104]
+ sete byte ptr [rsp + 6] # 1-byte Folded Spill
+ mov rax, qword ptr [rsi + 112]
+ cmp rax, qword ptr [rdx + 112]
+ sete byte ptr [rsp + 7] # 1-byte Folded Spill
+ mov rax, qword ptr [rsi + 120]
+ cmp rax, qword ptr [rdx + 120]
+ sete bl
+ mov rax, qword ptr [rsi + 128]
+ mov rcx, qword ptr [rsi + 136]
+ cmp rax, qword ptr [rdx + 128]
+ mov rax, qword ptr [rsi + 144]
+ sete byte ptr [rsp + 10] # 1-byte Folded Spill
+ cmp rcx, qword ptr [rdx + 136]
+ mov rcx, qword ptr [rsi + 152]
+ sete r10b
+ cmp rax, qword ptr [rdx + 144]
+ mov rax, qword ptr [rsi + 160]
+ sete r14b
+ cmp rcx, qword ptr [rdx + 152]
+ mov rcx, qword ptr [rsi + 168]
+ sete r12b
+ cmp rax, qword ptr [rdx + 160]
+ sete byte ptr [rsp + 8] # 1-byte Folded Spill
+ cmp rcx, qword ptr [rdx + 168]
+ mov rax, qword ptr [rsi + 176]
+ sete byte ptr [rsp + 11] # 1-byte Folded Spill
+ cmp rax, qword ptr [rdx + 176]
+ mov rax, qword ptr [rsi + 184]
+ sete byte ptr [rsp + 12] # 1-byte Folded Spill
+ cmp rax, qword ptr [rdx + 184]
+ mov rax, qword ptr [rsi + 192]
+ sete r9b
+ cmp rax, qword ptr [rdx + 192]
+ mov rax, qword ptr [rsi + 200]
+ sete byte ptr [rsp + 19] # 1-byte Folded Spill
+ cmp rax, qword ptr [rdx + 200]
+ mov rax, qword ptr [rsi + 208]
+ sete byte ptr [rsp + 13] # 1-byte Folded Spill
+ cmp rax, qword ptr [rdx + 208]
+ mov rax, qword ptr [rsi + 216]
+ sete byte ptr [rsp + 14] # 1-byte Folded Spill
+ cmp rax, qword ptr [rdx + 216]
+ mov rax, qword ptr [rsi + 224]
+ sete byte ptr [rsp + 15] # 1-byte Folded Spill
+ cmp rax, qword ptr [rdx + 224]
+ mov rax, qword ptr [rsi + 232]
+ sete byte ptr [rsp + 16] # 1-byte Folded Spill
+ cmp rax, qword ptr [rdx + 232]
+ mov rax, qword ptr [rsi + 240]
+ sete byte ptr [rsp + 18] # 1-byte Folded Spill
+ cmp rax, qword ptr [rdx + 240]
+ mov rax, qword ptr [rsi + 248]
+ sete byte ptr [rsp + 17] # 1-byte Folded Spill
+ add rsi, 256
+ cmp rax, qword ptr [rdx + 248]
+ sete dil
+ movzx eax, byte ptr [rsp + 32] # 1-byte Folded Reload
+ add al, al
+ add al, byte ptr [rsp + 40] # 1-byte Folded Reload
+ mov ecx, eax
+ movzx eax, byte ptr [rsp + 4] # 1-byte Folded Reload
+ shl al, 6
+ shl r13b, 7
+ or r13b, al
+ movzx eax, byte ptr [rsp + 20] # 1-byte Folded Reload
+ shl al, 2
+ or al, cl
+ add r8b, r8b
+ add r8b, byte ptr [rsp + 9] # 1-byte Folded Reload
+ movzx ecx, byte ptr [rsp + 21] # 1-byte Folded Reload
+ shl cl, 3
+ or cl, al
+ mov eax, ecx
+ shl r11b, 2
+ or r11b, r8b
+ movzx ecx, byte ptr [rsp + 22] # 1-byte Folded Reload
+ shl cl, 4
+ or cl, al
+ mov r8d, ecx
+ shl r15b, 3
+ or r15b, r11b
+ movzx ecx, byte ptr [rsp + 23] # 1-byte Folded Reload
+ shl cl, 5
+ or cl, r8b
+ movzx eax, byte ptr [rsp + 5] # 1-byte Folded Reload
+ shl al, 4
+ or al, r15b
+ mov r8d, eax
+ movzx eax, byte ptr [rsp + 6] # 1-byte Folded Reload
+ shl al, 5
+ or al, r8b
+ movzx r8d, byte ptr [rsp + 7] # 1-byte Folded Reload
+ shl r8b, 6
+ shl bl, 7
+ or bl, r8b
+ or r13b, cl
+ or bl, al
+ add r10b, r10b
+ add r10b, byte ptr [rsp + 10] # 1-byte Folded Reload
+ shl r14b, 2
+ or r14b, r10b
+ shl r12b, 3
+ or r12b, r14b
+ movzx eax, byte ptr [rsp + 8] # 1-byte Folded Reload
+ shl al, 4
+ or al, r12b
+ mov ecx, eax
+ mov r14, qword ptr [rsp + 48] # 8-byte Reload
+ movzx eax, byte ptr [rsp + 11] # 1-byte Folded Reload
+ shl al, 5
+ or al, cl
+ mov byte ptr [r14], r13b
+ movzx ecx, byte ptr [rsp + 12] # 1-byte Folded Reload
+ shl cl, 6
+ shl r9b, 7
+ or r9b, cl
+ mov byte ptr [r14 + 1], bl
+ or r9b, al
+ movzx eax, byte ptr [rsp + 13] # 1-byte Folded Reload
+ add al, al
+ add al, byte ptr [rsp + 19] # 1-byte Folded Reload
+ mov ecx, eax
+ movzx eax, byte ptr [rsp + 14] # 1-byte Folded Reload
+ shl al, 2
+ or al, cl
+ mov ecx, eax
+ movzx eax, byte ptr [rsp + 15] # 1-byte Folded Reload
+ shl al, 3
+ or al, cl
+ mov ecx, eax
+ movzx eax, byte ptr [rsp + 16] # 1-byte Folded Reload
+ shl al, 4
+ or al, cl
+ mov ecx, eax
+ movzx eax, byte ptr [rsp + 18] # 1-byte Folded Reload
+ shl al, 5
+ or al, cl
+ movzx ecx, byte ptr [rsp + 17] # 1-byte Folded Reload
+ shl cl, 6
+ shl dil, 7
+ or dil, cl
+ or dil, al
+ mov byte ptr [r14 + 2], r9b
+ mov byte ptr [r14 + 3], dil
+ add rdx, 256
+ add r14, 4
+ add qword ptr [rsp + 56], -1 # 8-byte Folded Spill
+ jne .LBB0_38
+# %bb.39:
+ mov r11, qword ptr [rsp + 24] # 8-byte Reload
+ mov r15, qword ptr [rsp + 64] # 8-byte Reload
+.LBB0_40:
+ shl r15, 5
+ cmp r15, r11
+ jge .LBB0_123
+# %bb.41:
+ sub r11, r15
+ xor ecx, ecx
+ .p2align 4, 0x90
+.LBB0_42: # =>This Inner Loop Header: Depth=1
+ lea r8, [rcx + 1]
+ mov rdi, qword ptr [rsi + 8*rcx]
+ cmp rdi, qword ptr [rdx + 8*rcx]
+ sete bl
+ neg bl
+ mov rdi, rcx
+ shr rdi, 3
+ movzx r9d, byte ptr [r14 + rdi]
+ xor bl, r9b
+ and cl, 7
+ mov al, 1
+ # kill: def $cl killed $cl killed $rcx
+ shl al, cl
+ and al, bl
+ xor al, r9b
+ mov byte ptr [r14 + rdi], al
+ mov rcx, r8
+ cmp r11, r8
+ jne .LBB0_42
+ jmp .LBB0_123
+.LBB0_68:
+ lea r15, [r11 + 31]
+ test r11, r11
+ cmovns r15, r11
+ lea eax, [r9 + 7]
+ test r9d, r9d
+ cmovns eax, r9d
+ and eax, -8
+ sub r9d, eax
+ je .LBB0_72
+# %bb.69:
+ movsxd rax, r9d
+ .p2align 4, 0x90
+.LBB0_70: # =>This Inner Loop Header: Depth=1
+ movzx ecx, word ptr [rsi]
+ add rsi, 2
+ cmp cx, word ptr [rdx]
+ lea rdx, [rdx + 2]
+ sete r10b
+ neg r10b
+ lea rdi, [rax + 7]
+ test rax, rax
+ cmovns rdi, rax
+ sar rdi, 3
+ movzx r8d, byte ptr [r14 + rdi]
+ xor r10b, r8b
+ lea r9d, [8*rdi]
+ mov ecx, eax
+ sub ecx, r9d
+ mov ebx, 1
+ # kill: def $cl killed $cl killed $ecx
+ shl ebx, cl
+ and bl, r10b
+ xor bl, r8b
+ mov byte ptr [r14 + rdi], bl
+ add rax, 1
+ cmp rax, 8
+ jne .LBB0_70
+# %bb.71:
+ add r14, 1
+.LBB0_72:
+ sar r15, 5
+ cmp r11, 32
+ jl .LBB0_76
+# %bb.73:
+ mov qword ptr [rsp + 24], r11 # 8-byte Spill
+ mov qword ptr [rsp + 64], r15 # 8-byte Spill
+ mov qword ptr [rsp + 56], r15 # 8-byte Spill
+ .p2align 4, 0x90
+.LBB0_74: # =>This Inner Loop Header: Depth=1
+ mov qword ptr [rsp + 48], r14 # 8-byte Spill
+ movzx eax, word ptr [rsi]
+ movzx ecx, word ptr [rsi + 2]
+ cmp ax, word ptr [rdx]
+ sete byte ptr [rsp + 40] # 1-byte Folded Spill
+ cmp cx, word ptr [rdx + 2]
+ sete byte ptr [rsp + 32] # 1-byte Folded Spill
+ movzx eax, word ptr [rsi + 4]
+ cmp ax, word ptr [rdx + 4]
+ sete byte ptr [rsp + 20] # 1-byte Folded Spill
+ movzx eax, word ptr [rsi + 6]
+ cmp ax, word ptr [rdx + 6]
+ sete byte ptr [rsp + 21] # 1-byte Folded Spill
+ movzx eax, word ptr [rsi + 8]
+ cmp ax, word ptr [rdx + 8]
+ sete byte ptr [rsp + 22] # 1-byte Folded Spill
+ movzx eax, word ptr [rsi + 10]
+ cmp ax, word ptr [rdx + 10]
+ sete byte ptr [rsp + 23] # 1-byte Folded Spill
+ movzx eax, word ptr [rsi + 12]
+ cmp ax, word ptr [rdx + 12]
+ sete byte ptr [rsp + 4] # 1-byte Folded Spill
+ movzx eax, word ptr [rsi + 14]
+ cmp ax, word ptr [rdx + 14]
+ sete r13b
+ movzx eax, word ptr [rsi + 16]
+ cmp ax, word ptr [rdx + 16]
+ sete byte ptr [rsp + 9] # 1-byte Folded Spill
+ movzx eax, word ptr [rsi + 18]
+ cmp ax, word ptr [rdx + 18]
+ sete r8b
+ movzx eax, word ptr [rsi + 20]
+ cmp ax, word ptr [rdx + 20]
+ sete r11b
+ movzx eax, word ptr [rsi + 22]
+ cmp ax, word ptr [rdx + 22]
+ sete r15b
+ movzx eax, word ptr [rsi + 24]
+ cmp ax, word ptr [rdx + 24]
+ sete byte ptr [rsp + 5] # 1-byte Folded Spill
+ movzx eax, word ptr [rsi + 26]
+ cmp ax, word ptr [rdx + 26]
+ sete byte ptr [rsp + 6] # 1-byte Folded Spill
+ movzx eax, word ptr [rsi + 28]
+ cmp ax, word ptr [rdx + 28]
+ sete byte ptr [rsp + 7] # 1-byte Folded Spill
+ movzx eax, word ptr [rsi + 30]
+ cmp ax, word ptr [rdx + 30]
+ sete bl
+ movzx eax, word ptr [rsi + 32]
+ movzx ecx, word ptr [rsi + 34]
+ cmp ax, word ptr [rdx + 32]
+ movzx eax, word ptr [rsi + 36]
+ sete byte ptr [rsp + 10] # 1-byte Folded Spill
+ cmp cx, word ptr [rdx + 34]
+ movzx ecx, word ptr [rsi + 38]
+ sete r10b
+ cmp ax, word ptr [rdx + 36]
+ movzx eax, word ptr [rsi + 40]
+ sete r14b
+ cmp cx, word ptr [rdx + 38]
+ movzx ecx, word ptr [rsi + 42]
+ sete r12b
+ cmp ax, word ptr [rdx + 40]
+ sete byte ptr [rsp + 8] # 1-byte Folded Spill
+ cmp cx, word ptr [rdx + 42]
+ movzx eax, word ptr [rsi + 44]
+ sete byte ptr [rsp + 11] # 1-byte Folded Spill
+ cmp ax, word ptr [rdx + 44]
+ movzx eax, word ptr [rsi + 46]
+ sete byte ptr [rsp + 12] # 1-byte Folded Spill
+ cmp ax, word ptr [rdx + 46]
+ movzx eax, word ptr [rsi + 48]
+ sete r9b
+ cmp ax, word ptr [rdx + 48]
+ movzx eax, word ptr [rsi + 50]
+ sete byte ptr [rsp + 19] # 1-byte Folded Spill
+ cmp ax, word ptr [rdx + 50]
+ movzx eax, word ptr [rsi + 52]
+ sete byte ptr [rsp + 13] # 1-byte Folded Spill
+ cmp ax, word ptr [rdx + 52]
+ movzx eax, word ptr [rsi + 54]
+ sete byte ptr [rsp + 14] # 1-byte Folded Spill
+ cmp ax, word ptr [rdx + 54]
+ movzx eax, word ptr [rsi + 56]
+ sete byte ptr [rsp + 15] # 1-byte Folded Spill
+ cmp ax, word ptr [rdx + 56]
+ movzx eax, word ptr [rsi + 58]
+ sete byte ptr [rsp + 16] # 1-byte Folded Spill
+ cmp ax, word ptr [rdx + 58]
+ movzx eax, word ptr [rsi + 60]
+ sete byte ptr [rsp + 18] # 1-byte Folded Spill
+ cmp ax, word ptr [rdx + 60]
+ movzx eax, word ptr [rsi + 62]
+ sete byte ptr [rsp + 17] # 1-byte Folded Spill
+ add rsi, 64
+ cmp ax, word ptr [rdx + 62]
+ sete dil
+ movzx eax, byte ptr [rsp + 32] # 1-byte Folded Reload
+ add al, al
+ add al, byte ptr [rsp + 40] # 1-byte Folded Reload
+ mov ecx, eax
+ movzx eax, byte ptr [rsp + 4] # 1-byte Folded Reload
+ shl al, 6
+ shl r13b, 7
+ or r13b, al
+ movzx eax, byte ptr [rsp + 20] # 1-byte Folded Reload
+ shl al, 2
+ or al, cl
+ add r8b, r8b
+ add r8b, byte ptr [rsp + 9] # 1-byte Folded Reload
+ movzx ecx, byte ptr [rsp + 21] # 1-byte Folded Reload
+ shl cl, 3
+ or cl, al
+ mov eax, ecx
+ shl r11b, 2
+ or r11b, r8b
+ movzx ecx, byte ptr [rsp + 22] # 1-byte Folded Reload
+ shl cl, 4
+ or cl, al
+ mov r8d, ecx
+ shl r15b, 3
+ or r15b, r11b
+ movzx ecx, byte ptr [rsp + 23] # 1-byte Folded Reload
+ shl cl, 5
+ or cl, r8b
+ movzx eax, byte ptr [rsp + 5] # 1-byte Folded Reload
+ shl al, 4
+ or al, r15b
+ mov r8d, eax
+ movzx eax, byte ptr [rsp + 6] # 1-byte Folded Reload
+ shl al, 5
+ or al, r8b
+ movzx r8d, byte ptr [rsp + 7] # 1-byte Folded Reload
+ shl r8b, 6
+ shl bl, 7
+ or bl, r8b
+ or r13b, cl
+ or bl, al
+ add r10b, r10b
+ add r10b, byte ptr [rsp + 10] # 1-byte Folded Reload
+ shl r14b, 2
+ or r14b, r10b
+ shl r12b, 3
+ or r12b, r14b
+ movzx eax, byte ptr [rsp + 8] # 1-byte Folded Reload
+ shl al, 4
+ or al, r12b
+ mov ecx, eax
+ mov r14, qword ptr [rsp + 48] # 8-byte Reload
+ movzx eax, byte ptr [rsp + 11] # 1-byte Folded Reload
+ shl al, 5
+ or al, cl
+ mov byte ptr [r14], r13b
+ movzx ecx, byte ptr [rsp + 12] # 1-byte Folded Reload
+ shl cl, 6
+ shl r9b, 7
+ or r9b, cl
+ mov byte ptr [r14 + 1], bl
+ or r9b, al
+ movzx eax, byte ptr [rsp + 13] # 1-byte Folded Reload
+ add al, al
+ add al, byte ptr [rsp + 19] # 1-byte Folded Reload
+ mov ecx, eax
+ movzx eax, byte ptr [rsp + 14] # 1-byte Folded Reload
+ shl al, 2
+ or al, cl
+ mov ecx, eax
+ movzx eax, byte ptr [rsp + 15] # 1-byte Folded Reload
+ shl al, 3
+ or al, cl
+ mov ecx, eax
+ movzx eax, byte ptr [rsp + 16] # 1-byte Folded Reload
+ shl al, 4
+ or al, cl
+ mov ecx, eax
+ movzx eax, byte ptr [rsp + 18] # 1-byte Folded Reload
+ shl al, 5
+ or al, cl
+ movzx ecx, byte ptr [rsp + 17] # 1-byte Folded Reload
+ shl cl, 6
+ shl dil, 7
+ or dil, cl
+ or dil, al
+ mov byte ptr [r14 + 2], r9b
+ mov byte ptr [r14 + 3], dil
+ add rdx, 64
+ add r14, 4
+ add qword ptr [rsp + 56], -1 # 8-byte Folded Spill
+ jne .LBB0_74
+# %bb.75:
+ mov r11, qword ptr [rsp + 24] # 8-byte Reload
+ mov r15, qword ptr [rsp + 64] # 8-byte Reload
+.LBB0_76:
+ shl r15, 5
+ cmp r15, r11
+ jge .LBB0_123
+# %bb.77:
+ sub r11, r15
+ xor ecx, ecx
+ .p2align 4, 0x90
+.LBB0_78: # =>This Inner Loop Header: Depth=1
+ lea r8, [rcx + 1]
+ movzx edi, word ptr [rsi + 2*rcx]
+ cmp di, word ptr [rdx + 2*rcx]
+ sete bl
+ neg bl
+ mov rdi, rcx
+ shr rdi, 3
+ movzx r9d, byte ptr [r14 + rdi]
+ xor bl, r9b
+ and cl, 7
+ mov al, 1
+ # kill: def $cl killed $cl killed $rcx
+ shl al, cl
+ and al, bl
+ xor al, r9b
+ mov byte ptr [r14 + rdi], al
+ mov rcx, r8
+ cmp r11, r8
+ jne .LBB0_78
+ jmp .LBB0_123
+.LBB0_79:
+ lea r15, [r11 + 31]
+ test r11, r11
+ cmovns r15, r11
+ lea eax, [r9 + 7]
+ test r9d, r9d
+ cmovns eax, r9d
+ and eax, -8
+ sub r9d, eax
+ je .LBB0_83
+# %bb.80:
+ movsxd rax, r9d
+ .p2align 4, 0x90
+.LBB0_81: # =>This Inner Loop Header: Depth=1
+ movzx ecx, word ptr [rsi]
+ add rsi, 2
+ cmp cx, word ptr [rdx]
+ lea rdx, [rdx + 2]
+ sete r10b
+ neg r10b
+ lea rdi, [rax + 7]
+ test rax, rax
+ cmovns rdi, rax
+ sar rdi, 3
+ movzx r8d, byte ptr [r14 + rdi]
+ xor r10b, r8b
+ lea r9d, [8*rdi]
+ mov ecx, eax
+ sub ecx, r9d
+ mov ebx, 1
+ # kill: def $cl killed $cl killed $ecx
+ shl ebx, cl
+ and bl, r10b
+ xor bl, r8b
+ mov byte ptr [r14 + rdi], bl
+ add rax, 1
+ cmp rax, 8
+ jne .LBB0_81
+# %bb.82:
+ add r14, 1
+.LBB0_83:
+ sar r15, 5
+ cmp r11, 32
+ jl .LBB0_87
+# %bb.84:
+ mov qword ptr [rsp + 24], r11 # 8-byte Spill
+ mov qword ptr [rsp + 64], r15 # 8-byte Spill
+ mov qword ptr [rsp + 56], r15 # 8-byte Spill
+ .p2align 4, 0x90
+.LBB0_85: # =>This Inner Loop Header: Depth=1
+ mov qword ptr [rsp + 48], r14 # 8-byte Spill
+ movzx eax, word ptr [rsi]
+ movzx ecx, word ptr [rsi + 2]
+ cmp ax, word ptr [rdx]
+ sete byte ptr [rsp + 40] # 1-byte Folded Spill
+ cmp cx, word ptr [rdx + 2]
+ sete byte ptr [rsp + 32] # 1-byte Folded Spill
+ movzx eax, word ptr [rsi + 4]
+ cmp ax, word ptr [rdx + 4]
+ sete byte ptr [rsp + 20] # 1-byte Folded Spill
+ movzx eax, word ptr [rsi + 6]
+ cmp ax, word ptr [rdx + 6]
+ sete byte ptr [rsp + 21] # 1-byte Folded Spill
+ movzx eax, word ptr [rsi + 8]
+ cmp ax, word ptr [rdx + 8]
+ sete byte ptr [rsp + 22] # 1-byte Folded Spill
+ movzx eax, word ptr [rsi + 10]
+ cmp ax, word ptr [rdx + 10]
+ sete byte ptr [rsp + 23] # 1-byte Folded Spill
+ movzx eax, word ptr [rsi + 12]
+ cmp ax, word ptr [rdx + 12]
+ sete byte ptr [rsp + 4] # 1-byte Folded Spill
+ movzx eax, word ptr [rsi + 14]
+ cmp ax, word ptr [rdx + 14]
+ sete r13b
+ movzx eax, word ptr [rsi + 16]
+ cmp ax, word ptr [rdx + 16]
+ sete byte ptr [rsp + 9] # 1-byte Folded Spill
+ movzx eax, word ptr [rsi + 18]
+ cmp ax, word ptr [rdx + 18]
+ sete r8b
+ movzx eax, word ptr [rsi + 20]
+ cmp ax, word ptr [rdx + 20]
+ sete r11b
+ movzx eax, word ptr [rsi + 22]
+ cmp ax, word ptr [rdx + 22]
+ sete r15b
+ movzx eax, word ptr [rsi + 24]
+ cmp ax, word ptr [rdx + 24]
+ sete byte ptr [rsp + 5] # 1-byte Folded Spill
+ movzx eax, word ptr [rsi + 26]
+ cmp ax, word ptr [rdx + 26]
+ sete byte ptr [rsp + 6] # 1-byte Folded Spill
+ movzx eax, word ptr [rsi + 28]
+ cmp ax, word ptr [rdx + 28]
+ sete byte ptr [rsp + 7] # 1-byte Folded Spill
+ movzx eax, word ptr [rsi + 30]
+ cmp ax, word ptr [rdx + 30]
+ sete bl
+ movzx eax, word ptr [rsi + 32]
+ movzx ecx, word ptr [rsi + 34]
+ cmp ax, word ptr [rdx + 32]
+ movzx eax, word ptr [rsi + 36]
+ sete byte ptr [rsp + 10] # 1-byte Folded Spill
+ cmp cx, word ptr [rdx + 34]
+ movzx ecx, word ptr [rsi + 38]
+ sete r10b
+ cmp ax, word ptr [rdx + 36]
+ movzx eax, word ptr [rsi + 40]
+ sete r14b
+ cmp cx, word ptr [rdx + 38]
+ movzx ecx, word ptr [rsi + 42]
+ sete r12b
+ cmp ax, word ptr [rdx + 40]
+ sete byte ptr [rsp + 8] # 1-byte Folded Spill
+ cmp cx, word ptr [rdx + 42]
+ movzx eax, word ptr [rsi + 44]
+ sete byte ptr [rsp + 11] # 1-byte Folded Spill
+ cmp ax, word ptr [rdx + 44]
+ movzx eax, word ptr [rsi + 46]
+ sete byte ptr [rsp + 12] # 1-byte Folded Spill
+ cmp ax, word ptr [rdx + 46]
+ movzx eax, word ptr [rsi + 48]
+ sete r9b
+ cmp ax, word ptr [rdx + 48]
+ movzx eax, word ptr [rsi + 50]
+ sete byte ptr [rsp + 19] # 1-byte Folded Spill
+ cmp ax, word ptr [rdx + 50]
+ movzx eax, word ptr [rsi + 52]
+ sete byte ptr [rsp + 13] # 1-byte Folded Spill
+ cmp ax, word ptr [rdx + 52]
+ movzx eax, word ptr [rsi + 54]
+ sete byte ptr [rsp + 14] # 1-byte Folded Spill
+ cmp ax, word ptr [rdx + 54]
+ movzx eax, word ptr [rsi + 56]
+ sete byte ptr [rsp + 15] # 1-byte Folded Spill
+ cmp ax, word ptr [rdx + 56]
+ movzx eax, word ptr [rsi + 58]
+ sete byte ptr [rsp + 16] # 1-byte Folded Spill
+ cmp ax, word ptr [rdx + 58]
+ movzx eax, word ptr [rsi + 60]
+ sete byte ptr [rsp + 18] # 1-byte Folded Spill
+ cmp ax, word ptr [rdx + 60]
+ movzx eax, word ptr [rsi + 62]
+ sete byte ptr [rsp + 17] # 1-byte Folded Spill
+ add rsi, 64
+ cmp ax, word ptr [rdx + 62]
+ sete dil
+ movzx eax, byte ptr [rsp + 32] # 1-byte Folded Reload
+ add al, al
+ add al, byte ptr [rsp + 40] # 1-byte Folded Reload
+ mov ecx, eax
+ movzx eax, byte ptr [rsp + 4] # 1-byte Folded Reload
+ shl al, 6
+ shl r13b, 7
+ or r13b, al
+ movzx eax, byte ptr [rsp + 20] # 1-byte Folded Reload
+ shl al, 2
+ or al, cl
+ add r8b, r8b
+ add r8b, byte ptr [rsp + 9] # 1-byte Folded Reload
+ movzx ecx, byte ptr [rsp + 21] # 1-byte Folded Reload
+ shl cl, 3
+ or cl, al
+ mov eax, ecx
+ shl r11b, 2
+ or r11b, r8b
+ movzx ecx, byte ptr [rsp + 22] # 1-byte Folded Reload
+ shl cl, 4
+ or cl, al
+ mov r8d, ecx
+ shl r15b, 3
+ or r15b, r11b
+ movzx ecx, byte ptr [rsp + 23] # 1-byte Folded Reload
+ shl cl, 5
+ or cl, r8b
+ movzx eax, byte ptr [rsp + 5] # 1-byte Folded Reload
+ shl al, 4
+ or al, r15b
+ mov r8d, eax
+ movzx eax, byte ptr [rsp + 6] # 1-byte Folded Reload
+ shl al, 5
+ or al, r8b
+ movzx r8d, byte ptr [rsp + 7] # 1-byte Folded Reload
+ shl r8b, 6
+ shl bl, 7
+ or bl, r8b
+ or r13b, cl
+ or bl, al
+ add r10b, r10b
+ add r10b, byte ptr [rsp + 10] # 1-byte Folded Reload
+ shl r14b, 2
+ or r14b, r10b
+ shl r12b, 3
+ or r12b, r14b
+ movzx eax, byte ptr [rsp + 8] # 1-byte Folded Reload
+ shl al, 4
+ or al, r12b
+ mov ecx, eax
+ mov r14, qword ptr [rsp + 48] # 8-byte Reload
+ movzx eax, byte ptr [rsp + 11] # 1-byte Folded Reload
+ shl al, 5
+ or al, cl
+ mov byte ptr [r14], r13b
+ movzx ecx, byte ptr [rsp + 12] # 1-byte Folded Reload
+ shl cl, 6
+ shl r9b, 7
+ or r9b, cl
+ mov byte ptr [r14 + 1], bl
+ or r9b, al
+ movzx eax, byte ptr [rsp + 13] # 1-byte Folded Reload
+ add al, al
+ add al, byte ptr [rsp + 19] # 1-byte Folded Reload
+ mov ecx, eax
+ movzx eax, byte ptr [rsp + 14] # 1-byte Folded Reload
+ shl al, 2
+ or al, cl
+ mov ecx, eax
+ movzx eax, byte ptr [rsp + 15] # 1-byte Folded Reload
+ shl al, 3
+ or al, cl
+ mov ecx, eax
+ movzx eax, byte ptr [rsp + 16] # 1-byte Folded Reload
+ shl al, 4
+ or al, cl
+ mov ecx, eax
+ movzx eax, byte ptr [rsp + 18] # 1-byte Folded Reload
+ shl al, 5
+ or al, cl
+ movzx ecx, byte ptr [rsp + 17] # 1-byte Folded Reload
+ shl cl, 6
+ shl dil, 7
+ or dil, cl
+ or dil, al
+ mov byte ptr [r14 + 2], r9b
+ mov byte ptr [r14 + 3], dil
+ add rdx, 64
+ add r14, 4
+ add qword ptr [rsp + 56], -1 # 8-byte Folded Spill
+ jne .LBB0_85
+# %bb.86:
+ mov r11, qword ptr [rsp + 24] # 8-byte Reload
+ mov r15, qword ptr [rsp + 64] # 8-byte Reload
+.LBB0_87:
+ shl r15, 5
+ cmp r15, r11
+ jge .LBB0_123
+# %bb.88:
+ sub r11, r15
+ xor ecx, ecx
+ .p2align 4, 0x90
+.LBB0_89: # =>This Inner Loop Header: Depth=1
+ lea r8, [rcx + 1]
+ movzx edi, word ptr [rsi + 2*rcx]
+ cmp di, word ptr [rdx + 2*rcx]
+ sete bl
+ neg bl
+ mov rdi, rcx
+ shr rdi, 3
+ movzx r9d, byte ptr [r14 + rdi]
+ xor bl, r9b
+ and cl, 7
+ mov al, 1
+ # kill: def $cl killed $cl killed $rcx
+ shl al, cl
+ and al, bl
+ xor al, r9b
+ mov byte ptr [r14 + rdi], al
+ mov rcx, r8
+ cmp r11, r8
+ jne .LBB0_89
+ jmp .LBB0_123
+.LBB0_101:
+ lea r15, [r11 + 31]
+ test r11, r11
+ cmovns r15, r11
+ lea eax, [r9 + 7]
+ test r9d, r9d
+ cmovns eax, r9d
+ and eax, -8
+ sub r9d, eax
+ je .LBB0_105
+# %bb.102:
+ movsxd rax, r9d
+ .p2align 4, 0x90
+.LBB0_103: # =>This Inner Loop Header: Depth=1
+ mov rcx, qword ptr [rsi]
+ add rsi, 8
+ cmp rcx, qword ptr [rdx]
+ lea rdx, [rdx + 8]
+ sete r10b
+ neg r10b
+ lea rdi, [rax + 7]
+ test rax, rax
+ cmovns rdi, rax
+ sar rdi, 3
+ movzx r8d, byte ptr [r14 + rdi]
+ xor r10b, r8b
+ lea r9d, [8*rdi]
+ mov ecx, eax
+ sub ecx, r9d
+ mov ebx, 1
+ # kill: def $cl killed $cl killed $ecx
+ shl ebx, cl
+ and bl, r10b
+ xor bl, r8b
+ mov byte ptr [r14 + rdi], bl
+ add rax, 1
+ cmp rax, 8
+ jne .LBB0_103
+# %bb.104:
+ add r14, 1
+.LBB0_105:
+ sar r15, 5
+ cmp r11, 32
+ jl .LBB0_109
+# %bb.106:
+ mov qword ptr [rsp + 24], r11 # 8-byte Spill
+ mov qword ptr [rsp + 64], r15 # 8-byte Spill
+ mov qword ptr [rsp + 56], r15 # 8-byte Spill
+ .p2align 4, 0x90
+.LBB0_107: # =>This Inner Loop Header: Depth=1
+ mov qword ptr [rsp + 48], r14 # 8-byte Spill
+ mov rax, qword ptr [rsi]
+ mov rcx, qword ptr [rsi + 8]
+ cmp rax, qword ptr [rdx]
+ sete byte ptr [rsp + 40] # 1-byte Folded Spill
+ cmp rcx, qword ptr [rdx + 8]
+ sete byte ptr [rsp + 32] # 1-byte Folded Spill
+ mov rax, qword ptr [rsi + 16]
+ cmp rax, qword ptr [rdx + 16]
+ sete byte ptr [rsp + 20] # 1-byte Folded Spill
+ mov rax, qword ptr [rsi + 24]
+ cmp rax, qword ptr [rdx + 24]
+ sete byte ptr [rsp + 21] # 1-byte Folded Spill
+ mov rax, qword ptr [rsi + 32]
+ cmp rax, qword ptr [rdx + 32]
+ sete byte ptr [rsp + 22] # 1-byte Folded Spill
+ mov rax, qword ptr [rsi + 40]
+ cmp rax, qword ptr [rdx + 40]
+ sete byte ptr [rsp + 23] # 1-byte Folded Spill
+ mov rax, qword ptr [rsi + 48]
+ cmp rax, qword ptr [rdx + 48]
+ sete byte ptr [rsp + 4] # 1-byte Folded Spill
+ mov rax, qword ptr [rsi + 56]
+ cmp rax, qword ptr [rdx + 56]
+ sete r13b
+ mov rax, qword ptr [rsi + 64]
+ cmp rax, qword ptr [rdx + 64]
+ sete byte ptr [rsp + 9] # 1-byte Folded Spill
+ mov rax, qword ptr [rsi + 72]
+ cmp rax, qword ptr [rdx + 72]
+ sete r8b
+ mov rax, qword ptr [rsi + 80]
+ cmp rax, qword ptr [rdx + 80]
+ sete r11b
+ mov rax, qword ptr [rsi + 88]
+ cmp rax, qword ptr [rdx + 88]
+ sete r15b
+ mov rax, qword ptr [rsi + 96]
+ cmp rax, qword ptr [rdx + 96]
+ sete byte ptr [rsp + 5] # 1-byte Folded Spill
+ mov rax, qword ptr [rsi + 104]
+ cmp rax, qword ptr [rdx + 104]
+ sete byte ptr [rsp + 6] # 1-byte Folded Spill
+ mov rax, qword ptr [rsi + 112]
+ cmp rax, qword ptr [rdx + 112]
+ sete byte ptr [rsp + 7] # 1-byte Folded Spill
+ mov rax, qword ptr [rsi + 120]
+ cmp rax, qword ptr [rdx + 120]
+ sete bl
+ mov rax, qword ptr [rsi + 128]
+ mov rcx, qword ptr [rsi + 136]
+ cmp rax, qword ptr [rdx + 128]
+ mov rax, qword ptr [rsi + 144]
+ sete byte ptr [rsp + 10] # 1-byte Folded Spill
+ cmp rcx, qword ptr [rdx + 136]
+ mov rcx, qword ptr [rsi + 152]
+ sete r10b
+ cmp rax, qword ptr [rdx + 144]
+ mov rax, qword ptr [rsi + 160]
+ sete r14b
+ cmp rcx, qword ptr [rdx + 152]
+ mov rcx, qword ptr [rsi + 168]
+ sete r12b
+ cmp rax, qword ptr [rdx + 160]
+ sete byte ptr [rsp + 8] # 1-byte Folded Spill
+ cmp rcx, qword ptr [rdx + 168]
+ mov rax, qword ptr [rsi + 176]
+ sete byte ptr [rsp + 11] # 1-byte Folded Spill
+ cmp rax, qword ptr [rdx + 176]
+ mov rax, qword ptr [rsi + 184]
+ sete byte ptr [rsp + 12] # 1-byte Folded Spill
+ cmp rax, qword ptr [rdx + 184]
+ mov rax, qword ptr [rsi + 192]
+ sete r9b
+ cmp rax, qword ptr [rdx + 192]
+ mov rax, qword ptr [rsi + 200]
+ sete byte ptr [rsp + 19] # 1-byte Folded Spill
+ cmp rax, qword ptr [rdx + 200]
+ mov rax, qword ptr [rsi + 208]
+ sete byte ptr [rsp + 13] # 1-byte Folded Spill
+ cmp rax, qword ptr [rdx + 208]
+ mov rax, qword ptr [rsi + 216]
+ sete byte ptr [rsp + 14] # 1-byte Folded Spill
+ cmp rax, qword ptr [rdx + 216]
+ mov rax, qword ptr [rsi + 224]
+ sete byte ptr [rsp + 15] # 1-byte Folded Spill
+ cmp rax, qword ptr [rdx + 224]
+ mov rax, qword ptr [rsi + 232]
+ sete byte ptr [rsp + 16] # 1-byte Folded Spill
+ cmp rax, qword ptr [rdx + 232]
+ mov rax, qword ptr [rsi + 240]
+ sete byte ptr [rsp + 18] # 1-byte Folded Spill
+ cmp rax, qword ptr [rdx + 240]
+ mov rax, qword ptr [rsi + 248]
+ sete byte ptr [rsp + 17] # 1-byte Folded Spill
+ add rsi, 256
+ cmp rax, qword ptr [rdx + 248]
+ sete dil
+ movzx eax, byte ptr [rsp + 32] # 1-byte Folded Reload
+ add al, al
+ add al, byte ptr [rsp + 40] # 1-byte Folded Reload
+ mov ecx, eax
+ movzx eax, byte ptr [rsp + 4] # 1-byte Folded Reload
+ shl al, 6
+ shl r13b, 7
+ or r13b, al
+ movzx eax, byte ptr [rsp + 20] # 1-byte Folded Reload
+ shl al, 2
+ or al, cl
+ add r8b, r8b
+ add r8b, byte ptr [rsp + 9] # 1-byte Folded Reload
+ movzx ecx, byte ptr [rsp + 21] # 1-byte Folded Reload
+ shl cl, 3
+ or cl, al
+ mov eax, ecx
+ shl r11b, 2
+ or r11b, r8b
+ movzx ecx, byte ptr [rsp + 22] # 1-byte Folded Reload
+ shl cl, 4
+ or cl, al
+ mov r8d, ecx
+ shl r15b, 3
+ or r15b, r11b
+ movzx ecx, byte ptr [rsp + 23] # 1-byte Folded Reload
+ shl cl, 5
+ or cl, r8b
+ movzx eax, byte ptr [rsp + 5] # 1-byte Folded Reload
+ shl al, 4
+ or al, r15b
+ mov r8d, eax
+ movzx eax, byte ptr [rsp + 6] # 1-byte Folded Reload
+ shl al, 5
+ or al, r8b
+ movzx r8d, byte ptr [rsp + 7] # 1-byte Folded Reload
+ shl r8b, 6
+ shl bl, 7
+ or bl, r8b
+ or r13b, cl
+ or bl, al
+ add r10b, r10b
+ add r10b, byte ptr [rsp + 10] # 1-byte Folded Reload
+ shl r14b, 2
+ or r14b, r10b
+ shl r12b, 3
+ or r12b, r14b
+ movzx eax, byte ptr [rsp + 8] # 1-byte Folded Reload
+ shl al, 4
+ or al, r12b
+ mov ecx, eax
+ mov r14, qword ptr [rsp + 48] # 8-byte Reload
+ movzx eax, byte ptr [rsp + 11] # 1-byte Folded Reload
+ shl al, 5
+ or al, cl
+ mov byte ptr [r14], r13b
+ movzx ecx, byte ptr [rsp + 12] # 1-byte Folded Reload
+ shl cl, 6
+ shl r9b, 7
+ or r9b, cl
+ mov byte ptr [r14 + 1], bl
+ or r9b, al
+ movzx eax, byte ptr [rsp + 13] # 1-byte Folded Reload
+ add al, al
+ add al, byte ptr [rsp + 19] # 1-byte Folded Reload
+ mov ecx, eax
+ movzx eax, byte ptr [rsp + 14] # 1-byte Folded Reload
+ shl al, 2
+ or al, cl
+ mov ecx, eax
+ movzx eax, byte ptr [rsp + 15] # 1-byte Folded Reload
+ shl al, 3
+ or al, cl
+ mov ecx, eax
+ movzx eax, byte ptr [rsp + 16] # 1-byte Folded Reload
+ shl al, 4
+ or al, cl
+ mov ecx, eax
+ movzx eax, byte ptr [rsp + 18] # 1-byte Folded Reload
+ shl al, 5
+ or al, cl
+ movzx ecx, byte ptr [rsp + 17] # 1-byte Folded Reload
+ shl cl, 6
+ shl dil, 7
+ or dil, cl
+ or dil, al
+ mov byte ptr [r14 + 2], r9b
+ mov byte ptr [r14 + 3], dil
+ add rdx, 256
+ add r14, 4
+ add qword ptr [rsp + 56], -1 # 8-byte Folded Spill
+ jne .LBB0_107
+# %bb.108:
+ mov r11, qword ptr [rsp + 24] # 8-byte Reload
+ mov r15, qword ptr [rsp + 64] # 8-byte Reload
+.LBB0_109:
+ shl r15, 5
+ cmp r15, r11
+ jge .LBB0_123
+# %bb.110:
+ sub r11, r15
+ xor ecx, ecx
+ .p2align 4, 0x90
+.LBB0_111: # =>This Inner Loop Header: Depth=1
+ lea r8, [rcx + 1]
+ mov rdi, qword ptr [rsi + 8*rcx]
+ cmp rdi, qword ptr [rdx + 8*rcx]
+ sete bl
+ neg bl
+ mov rdi, rcx
+ shr rdi, 3
+ movzx r9d, byte ptr [r14 + rdi]
+ xor bl, r9b
+ and cl, 7
+ mov al, 1
+ # kill: def $cl killed $cl killed $rcx
+ shl al, cl
+ and al, bl
+ xor al, r9b
+ mov byte ptr [r14 + rdi], al
+ mov rcx, r8
+ cmp r11, r8
+ jne .LBB0_111
+ jmp .LBB0_123
+.LBB0_112:
+ lea r15, [r11 + 31]
+ test r11, r11
+ cmovns r15, r11
+ lea eax, [r9 + 7]
+ test r9d, r9d
+ cmovns eax, r9d
+ and eax, -8
+ sub r9d, eax
+ je .LBB0_116
+# %bb.113:
+ movsxd rax, r9d
+ .p2align 4, 0x90
+.LBB0_114: # =>This Inner Loop Header: Depth=1
+ vmovss xmm0, dword ptr [rsi] # xmm0 = mem[0],zero,zero,zero
+ add rsi, 4
+ vucomiss xmm0, dword ptr [rdx]
+ lea rdx, [rdx + 4]
+ sete r10b
+ neg r10b
+ lea rdi, [rax + 7]
+ test rax, rax
+ cmovns rdi, rax
+ sar rdi, 3
+ movzx r8d, byte ptr [r14 + rdi]
+ xor r10b, r8b
+ lea r9d, [8*rdi]
+ mov ecx, eax
+ sub ecx, r9d
+ mov ebx, 1
+ # kill: def $cl killed $cl killed $ecx
+ shl ebx, cl
+ and bl, r10b
+ xor bl, r8b
+ mov byte ptr [r14 + rdi], bl
+ add rax, 1
+ cmp rax, 8
+ jne .LBB0_114
+# %bb.115:
+ add r14, 1
+.LBB0_116:
+ sar r15, 5
+ cmp r11, 32
+ jl .LBB0_120
+# %bb.117:
+ mov qword ptr [rsp + 24], r11 # 8-byte Spill
+ mov qword ptr [rsp + 32], r15 # 8-byte Spill
+ mov qword ptr [rsp + 40], r15 # 8-byte Spill
+ .p2align 4, 0x90
+.LBB0_118: # =>This Inner Loop Header: Depth=1
+ mov qword ptr [rsp + 48], r14 # 8-byte Spill
+ vmovss xmm0, dword ptr [rsi] # xmm0 = mem[0],zero,zero,zero
+ vmovss xmm1, dword ptr [rsi + 4] # xmm1 = mem[0],zero,zero,zero
+ vucomiss xmm0, dword ptr [rdx]
+ sete byte ptr [rsp + 4] # 1-byte Folded Spill
+ vucomiss xmm1, dword ptr [rdx + 4]
+ sete al
+ vmovss xmm0, dword ptr [rsi + 8] # xmm0 = mem[0],zero,zero,zero
+ vucomiss xmm0, dword ptr [rdx + 8]
+ vmovss xmm0, dword ptr [rsi + 12] # xmm0 = mem[0],zero,zero,zero
+ sete byte ptr [rsp + 5] # 1-byte Folded Spill
+ vucomiss xmm0, dword ptr [rdx + 12]
+ sete byte ptr [rsp + 22] # 1-byte Folded Spill
+ vmovss xmm0, dword ptr [rsi + 16] # xmm0 = mem[0],zero,zero,zero
+ vucomiss xmm0, dword ptr [rdx + 16]
+ vmovss xmm0, dword ptr [rsi + 20] # xmm0 = mem[0],zero,zero,zero
+ sete byte ptr [rsp + 21] # 1-byte Folded Spill
+ vucomiss xmm0, dword ptr [rdx + 20]
+ sete byte ptr [rsp + 23] # 1-byte Folded Spill
+ vmovss xmm0, dword ptr [rsi + 24] # xmm0 = mem[0],zero,zero,zero
+ vucomiss xmm0, dword ptr [rdx + 24]
+ vmovss xmm0, dword ptr [rsi + 28] # xmm0 = mem[0],zero,zero,zero
+ sete r13b
+ vucomiss xmm0, dword ptr [rdx + 28]
+ sete r15b
+ vmovss xmm0, dword ptr [rsi + 32] # xmm0 = mem[0],zero,zero,zero
+ vucomiss xmm0, dword ptr [rdx + 32]
+ vmovss xmm0, dword ptr [rsi + 36] # xmm0 = mem[0],zero,zero,zero
+ sete byte ptr [rsp + 8] # 1-byte Folded Spill
+ vucomiss xmm0, dword ptr [rdx + 36]
+ sete cl
+ vmovss xmm0, dword ptr [rsi + 40] # xmm0 = mem[0],zero,zero,zero
+ vucomiss xmm0, dword ptr [rdx + 40]
+ vmovss xmm0, dword ptr [rsi + 44] # xmm0 = mem[0],zero,zero,zero
+ sete r9b
+ vucomiss xmm0, dword ptr [rdx + 44]
+ sete r11b
+ vmovss xmm0, dword ptr [rsi + 48] # xmm0 = mem[0],zero,zero,zero
+ vucomiss xmm0, dword ptr [rdx + 48]
+ vmovss xmm0, dword ptr [rsi + 52] # xmm0 = mem[0],zero,zero,zero
+ sete r10b
+ vucomiss xmm0, dword ptr [rdx + 52]
+ sete byte ptr [rsp + 7] # 1-byte Folded Spill
+ vmovss xmm0, dword ptr [rsi + 56] # xmm0 = mem[0],zero,zero,zero
+ vucomiss xmm0, dword ptr [rdx + 56]
+ vmovss xmm0, dword ptr [rsi + 60] # xmm0 = mem[0],zero,zero,zero
+ sete byte ptr [rsp + 6] # 1-byte Folded Spill
+ vucomiss xmm0, dword ptr [rdx + 60]
+ sete bl
+ vmovss xmm0, dword ptr [rsi + 64] # xmm0 = mem[0],zero,zero,zero
+ vucomiss xmm0, dword ptr [rdx + 64]
+ vmovss xmm0, dword ptr [rsi + 68] # xmm0 = mem[0],zero,zero,zero
+ sete byte ptr [rsp + 14] # 1-byte Folded Spill
+ vucomiss xmm0, dword ptr [rdx + 68]
+ vmovss xmm0, dword ptr [rsi + 72] # xmm0 = mem[0],zero,zero,zero
+ sete r14b
+ vucomiss xmm0, dword ptr [rdx + 72]
+ vmovss xmm0, dword ptr [rsi + 76] # xmm0 = mem[0],zero,zero,zero
+ sete r12b
+ vucomiss xmm0, dword ptr [rdx + 76]
+ vmovss xmm0, dword ptr [rsi + 80] # xmm0 = mem[0],zero,zero,zero
+ sete byte ptr [rsp + 9] # 1-byte Folded Spill
+ vucomiss xmm0, dword ptr [rdx + 80]
+ vmovss xmm0, dword ptr [rsi + 84] # xmm0 = mem[0],zero,zero,zero
+ sete byte ptr [rsp + 10] # 1-byte Folded Spill
+ vucomiss xmm0, dword ptr [rdx + 84]
+ vmovss xmm0, dword ptr [rsi + 88] # xmm0 = mem[0],zero,zero,zero
+ sete byte ptr [rsp + 11] # 1-byte Folded Spill
+ vucomiss xmm0, dword ptr [rdx + 88]
+ vmovss xmm0, dword ptr [rsi + 92] # xmm0 = mem[0],zero,zero,zero
+ sete byte ptr [rsp + 12] # 1-byte Folded Spill
+ vucomiss xmm0, dword ptr [rdx + 92]
+ vmovss xmm0, dword ptr [rsi + 96] # xmm0 = mem[0],zero,zero,zero
+ sete r8b
+ vucomiss xmm0, dword ptr [rdx + 96]
+ vmovss xmm0, dword ptr [rsi + 100] # xmm0 = mem[0],zero,zero,zero
+ sete byte ptr [rsp + 20] # 1-byte Folded Spill
+ vucomiss xmm0, dword ptr [rdx + 100]
+ vmovss xmm0, dword ptr [rsi + 104] # xmm0 = mem[0],zero,zero,zero
+ sete byte ptr [rsp + 13] # 1-byte Folded Spill
+ vucomiss xmm0, dword ptr [rdx + 104]
+ vmovss xmm0, dword ptr [rsi + 108] # xmm0 = mem[0],zero,zero,zero
+ sete byte ptr [rsp + 15] # 1-byte Folded Spill
+ vucomiss xmm0, dword ptr [rdx + 108]
+ vmovss xmm0, dword ptr [rsi + 112] # xmm0 = mem[0],zero,zero,zero
+ sete byte ptr [rsp + 16] # 1-byte Folded Spill
+ vucomiss xmm0, dword ptr [rdx + 112]
+ vmovss xmm0, dword ptr [rsi + 116] # xmm0 = mem[0],zero,zero,zero
+ sete byte ptr [rsp + 17] # 1-byte Folded Spill
+ vucomiss xmm0, dword ptr [rdx + 116]
+ vmovss xmm0, dword ptr [rsi + 120] # xmm0 = mem[0],zero,zero,zero
+ sete byte ptr [rsp + 19] # 1-byte Folded Spill
+ vucomiss xmm0, dword ptr [rdx + 120]
+ vmovss xmm0, dword ptr [rsi + 124] # xmm0 = mem[0],zero,zero,zero
+ sete byte ptr [rsp + 18] # 1-byte Folded Spill
+ sub rsi, -128
+ vucomiss xmm0, dword ptr [rdx + 124]
+ sete dil
+ add al, al
+ add al, byte ptr [rsp + 4] # 1-byte Folded Reload
+ shl r13b, 6
+ shl r15b, 7
+ or r15b, r13b
+ movzx r13d, byte ptr [rsp + 5] # 1-byte Folded Reload
+ shl r13b, 2
+ or r13b, al
+ mov eax, r13d
+ add cl, cl
+ add cl, byte ptr [rsp + 8] # 1-byte Folded Reload
+ movzx r13d, byte ptr [rsp + 22] # 1-byte Folded Reload
+ shl r13b, 3
+ or r13b, al
+ shl r9b, 2
+ or r9b, cl
+ movzx ecx, byte ptr [rsp + 21] # 1-byte Folded Reload
+ shl cl, 4
+ or cl, r13b
+ mov r13d, ecx
+ shl r11b, 3
+ or r11b, r9b
+ movzx ecx, byte ptr [rsp + 23] # 1-byte Folded Reload
+ shl cl, 5
+ or cl, r13b
+ shl r10b, 4
+ or r10b, r11b
+ movzx eax, byte ptr [rsp + 7] # 1-byte Folded Reload
+ shl al, 5
+ or al, r10b
+ movzx r9d, byte ptr [rsp + 6] # 1-byte Folded Reload
+ shl r9b, 6
+ shl bl, 7
+ or bl, r9b
+ or r15b, cl
+ or bl, al
+ add r14b, r14b
+ add r14b, byte ptr [rsp + 14] # 1-byte Folded Reload
+ shl r12b, 2
+ or r12b, r14b
+ mov r14, qword ptr [rsp + 48] # 8-byte Reload
+ movzx eax, byte ptr [rsp + 9] # 1-byte Folded Reload
+ shl al, 3
+ or al, r12b
+ mov ecx, eax
+ movzx eax, byte ptr [rsp + 10] # 1-byte Folded Reload
+ shl al, 4
+ or al, cl
+ mov ecx, eax
+ movzx eax, byte ptr [rsp + 11] # 1-byte Folded Reload
+ shl al, 5
+ or al, cl
+ mov byte ptr [r14], r15b
+ movzx ecx, byte ptr [rsp + 12] # 1-byte Folded Reload
+ shl cl, 6
+ shl r8b, 7
+ or r8b, cl
+ mov byte ptr [r14 + 1], bl
+ or r8b, al
+ movzx eax, byte ptr [rsp + 13] # 1-byte Folded Reload
+ add al, al
+ add al, byte ptr [rsp + 20] # 1-byte Folded Reload
+ mov ecx, eax
+ movzx eax, byte ptr [rsp + 15] # 1-byte Folded Reload
+ shl al, 2
+ or al, cl
+ mov ecx, eax
+ movzx eax, byte ptr [rsp + 16] # 1-byte Folded Reload
+ shl al, 3
+ or al, cl
+ mov ecx, eax
+ movzx eax, byte ptr [rsp + 17] # 1-byte Folded Reload
+ shl al, 4
+ or al, cl
+ movzx ecx, byte ptr [rsp + 19] # 1-byte Folded Reload
+ shl cl, 5
+ or cl, al
+ movzx eax, byte ptr [rsp + 18] # 1-byte Folded Reload
+ shl al, 6
+ shl dil, 7
+ or dil, al
+ or dil, cl
+ mov byte ptr [r14 + 2], r8b
+ mov byte ptr [r14 + 3], dil
+ add rdx, 128
+ add r14, 4
+ add qword ptr [rsp + 40], -1 # 8-byte Folded Spill
+ jne .LBB0_118
+# %bb.119:
+ mov r11, qword ptr [rsp + 24] # 8-byte Reload
+ mov r15, qword ptr [rsp + 32] # 8-byte Reload
+.LBB0_120:
+ shl r15, 5
+ cmp r15, r11
+ jge .LBB0_123
+# %bb.121:
+ sub r11, r15
+ xor ecx, ecx
+ .p2align 4, 0x90
+.LBB0_122: # =>This Inner Loop Header: Depth=1
+ vmovss xmm0, dword ptr [rsi + 4*rcx] # xmm0 = mem[0],zero,zero,zero
+ vucomiss xmm0, dword ptr [rdx + 4*rcx]
+ lea r8, [rcx + 1]
+ sete bl
+ neg bl
+ mov rdi, rcx
+ shr rdi, 3
+ movzx r9d, byte ptr [r14 + rdi]
+ xor bl, r9b
+ and cl, 7
+ mov al, 1
+ # kill: def $cl killed $cl killed $rcx
+ shl al, cl
+ and al, bl
+ xor al, r9b
+ mov byte ptr [r14 + rdi], al
+ mov rcx, r8
+ cmp r11, r8
+ jne .LBB0_122
+ jmp .LBB0_123
+.LBB0_57:
+ lea r15, [r11 + 31]
+ test r11, r11
+ cmovns r15, r11
+ lea eax, [r9 + 7]
+ test r9d, r9d
+ cmovns eax, r9d
+ and eax, -8
+ sub r9d, eax
+ je .LBB0_61
+# %bb.58:
+ movsxd rax, r9d
+ .p2align 4, 0x90
+.LBB0_59: # =>This Inner Loop Header: Depth=1
+ movzx ecx, byte ptr [rsi]
+ add rsi, 1
+ cmp cl, byte ptr [rdx]
+ lea rdx, [rdx + 1]
+ sete r10b
+ neg r10b
+ lea rdi, [rax + 7]
+ test rax, rax
+ cmovns rdi, rax
+ sar rdi, 3
+ movzx r8d, byte ptr [r14 + rdi]
+ xor r10b, r8b
+ lea r9d, [8*rdi]
+ mov ecx, eax
+ sub ecx, r9d
+ mov ebx, 1
+ # kill: def $cl killed $cl killed $ecx
+ shl ebx, cl
+ and bl, r10b
+ xor bl, r8b
+ mov byte ptr [r14 + rdi], bl
+ add rax, 1
+ cmp rax, 8
+ jne .LBB0_59
+# %bb.60:
+ add r14, 1
+.LBB0_61:
+ sar r15, 5
+ cmp r11, 32
+ jl .LBB0_65
+# %bb.62:
+ mov qword ptr [rsp + 24], r11 # 8-byte Spill
+ mov qword ptr [rsp + 56], r15 # 8-byte Spill
+ mov qword ptr [rsp + 32], r15 # 8-byte Spill
+ .p2align 4, 0x90
+.LBB0_63: # =>This Inner Loop Header: Depth=1
+ mov qword ptr [rsp + 48], r14 # 8-byte Spill
+ movzx eax, byte ptr [rsi]
+ movzx ecx, byte ptr [rsi + 1]
+ cmp al, byte ptr [rdx]
+ sete byte ptr [rsp + 40] # 1-byte Folded Spill
+ cmp cl, byte ptr [rdx + 1]
+ sete cl
+ movzx eax, byte ptr [rsi + 2]
+ cmp al, byte ptr [rdx + 2]
+ sete byte ptr [rsp + 20] # 1-byte Folded Spill
+ movzx eax, byte ptr [rsi + 3]
+ cmp al, byte ptr [rdx + 3]
+ sete byte ptr [rsp + 21] # 1-byte Folded Spill
+ movzx eax, byte ptr [rsi + 4]
+ cmp al, byte ptr [rdx + 4]
+ sete byte ptr [rsp + 22] # 1-byte Folded Spill
+ movzx eax, byte ptr [rsi + 5]
+ cmp al, byte ptr [rdx + 5]
+ sete byte ptr [rsp + 23] # 1-byte Folded Spill
+ movzx eax, byte ptr [rsi + 6]
+ cmp al, byte ptr [rdx + 6]
+ sete byte ptr [rsp + 4] # 1-byte Folded Spill
+ movzx eax, byte ptr [rsi + 7]
+ cmp al, byte ptr [rdx + 7]
+ sete r15b
+ movzx eax, byte ptr [rsi + 8]
+ cmp al, byte ptr [rdx + 8]
+ sete byte ptr [rsp + 7] # 1-byte Folded Spill
+ movzx eax, byte ptr [rsi + 9]
+ cmp al, byte ptr [rdx + 9]
+ sete dil
+ movzx eax, byte ptr [rsi + 10]
+ cmp al, byte ptr [rdx + 10]
+ sete r10b
+ movzx eax, byte ptr [rsi + 11]
+ cmp al, byte ptr [rdx + 11]
+ sete r11b
+ movzx eax, byte ptr [rsi + 12]
+ cmp al, byte ptr [rdx + 12]
+ sete r14b
+ movzx eax, byte ptr [rsi + 13]
+ cmp al, byte ptr [rdx + 13]
+ sete byte ptr [rsp + 5] # 1-byte Folded Spill
+ movzx eax, byte ptr [rsi + 14]
+ cmp al, byte ptr [rdx + 14]
+ sete byte ptr [rsp + 6] # 1-byte Folded Spill
+ movzx eax, byte ptr [rsi + 15]
+ cmp al, byte ptr [rdx + 15]
+ sete bl
+ movzx eax, byte ptr [rsi + 16]
+ cmp al, byte ptr [rdx + 16]
+ sete byte ptr [rsp + 13] # 1-byte Folded Spill
+ movzx eax, byte ptr [rsi + 17]
+ cmp al, byte ptr [rdx + 17]
+ sete r12b
+ movzx eax, byte ptr [rsi + 18]
+ cmp al, byte ptr [rdx + 18]
+ sete r13b
+ movzx eax, byte ptr [rsi + 19]
+ cmp al, byte ptr [rdx + 19]
+ sete byte ptr [rsp + 8] # 1-byte Folded Spill
+ movzx eax, byte ptr [rsi + 20]
+ cmp al, byte ptr [rdx + 20]
+ sete byte ptr [rsp + 9] # 1-byte Folded Spill
+ movzx eax, byte ptr [rsi + 21]
+ cmp al, byte ptr [rdx + 21]
+ sete byte ptr [rsp + 10] # 1-byte Folded Spill
+ movzx eax, byte ptr [rsi + 22]
+ cmp al, byte ptr [rdx + 22]
+ sete byte ptr [rsp + 11] # 1-byte Folded Spill
+ movzx eax, byte ptr [rsi + 23]
+ cmp al, byte ptr [rdx + 23]
+ sete r9b
+ movzx eax, byte ptr [rsi + 24]
+ cmp al, byte ptr [rdx + 24]
+ sete byte ptr [rsp + 19] # 1-byte Folded Spill
+ movzx eax, byte ptr [rsi + 25]
+ cmp al, byte ptr [rdx + 25]
+ sete byte ptr [rsp + 12] # 1-byte Folded Spill
+ movzx eax, byte ptr [rsi + 26]
+ cmp al, byte ptr [rdx + 26]
+ sete byte ptr [rsp + 14] # 1-byte Folded Spill
+ movzx eax, byte ptr [rsi + 27]
+ cmp al, byte ptr [rdx + 27]
+ sete byte ptr [rsp + 15] # 1-byte Folded Spill
+ movzx eax, byte ptr [rsi + 28]
+ cmp al, byte ptr [rdx + 28]
+ sete byte ptr [rsp + 16] # 1-byte Folded Spill
+ movzx eax, byte ptr [rsi + 29]
+ cmp al, byte ptr [rdx + 29]
+ sete byte ptr [rsp + 17] # 1-byte Folded Spill
+ movzx eax, byte ptr [rsi + 30]
+ cmp al, byte ptr [rdx + 30]
+ sete byte ptr [rsp + 18] # 1-byte Folded Spill
+ movzx eax, byte ptr [rsi + 31]
+ add rsi, 32
+ cmp al, byte ptr [rdx + 31]
+ sete r8b
+ add cl, cl
+ add cl, byte ptr [rsp + 40] # 1-byte Folded Reload
+ mov eax, ecx
+ movzx ecx, byte ptr [rsp + 4] # 1-byte Folded Reload
+ shl cl, 6
+ shl r15b, 7
+ or r15b, cl
+ movzx ecx, byte ptr [rsp + 20] # 1-byte Folded Reload
+ shl cl, 2
+ or cl, al
+ mov eax, ecx
+ add dil, dil
+ add dil, byte ptr [rsp + 7] # 1-byte Folded Reload
+ movzx ecx, byte ptr [rsp + 21] # 1-byte Folded Reload
+ shl cl, 3
+ or cl, al
+ mov eax, ecx
+ shl r10b, 2
+ or r10b, dil
+ movzx ecx, byte ptr [rsp + 22] # 1-byte Folded Reload
+ shl cl, 4
+ or cl, al
+ mov edi, ecx
+ shl r11b, 3
+ or r11b, r10b
+ movzx ecx, byte ptr [rsp + 23] # 1-byte Folded Reload
+ shl cl, 5
+ or cl, dil
+ shl r14b, 4
+ or r14b, r11b
+ movzx eax, byte ptr [rsp + 5] # 1-byte Folded Reload
+ shl al, 5
+ or al, r14b
+ movzx edi, byte ptr [rsp + 6] # 1-byte Folded Reload
+ shl dil, 6
+ shl bl, 7
+ or bl, dil
+ or r15b, cl
+ or bl, al
+ add r12b, r12b
+ add r12b, byte ptr [rsp + 13] # 1-byte Folded Reload
+ shl r13b, 2
+ or r13b, r12b
+ mov r14, qword ptr [rsp + 48] # 8-byte Reload
+ movzx eax, byte ptr [rsp + 8] # 1-byte Folded Reload
+ shl al, 3
+ or al, r13b
+ mov ecx, eax
+ movzx eax, byte ptr [rsp + 9] # 1-byte Folded Reload
+ shl al, 4
+ or al, cl
+ mov ecx, eax
+ movzx eax, byte ptr [rsp + 10] # 1-byte Folded Reload
+ shl al, 5
+ or al, cl
+ mov byte ptr [r14], r15b
+ movzx ecx, byte ptr [rsp + 11] # 1-byte Folded Reload
+ shl cl, 6
+ shl r9b, 7
+ or r9b, cl
+ mov byte ptr [r14 + 1], bl
+ or r9b, al
+ movzx eax, byte ptr [rsp + 12] # 1-byte Folded Reload
+ add al, al
+ add al, byte ptr [rsp + 19] # 1-byte Folded Reload
+ mov ecx, eax
+ movzx eax, byte ptr [rsp + 14] # 1-byte Folded Reload
+ shl al, 2
+ or al, cl
+ mov ecx, eax
+ movzx eax, byte ptr [rsp + 15] # 1-byte Folded Reload
+ shl al, 3
+ or al, cl
+ mov ecx, eax
+ movzx eax, byte ptr [rsp + 16] # 1-byte Folded Reload
+ shl al, 4
+ or al, cl
+ mov ecx, eax
+ movzx eax, byte ptr [rsp + 17] # 1-byte Folded Reload
+ shl al, 5
+ or al, cl
+ movzx ecx, byte ptr [rsp + 18] # 1-byte Folded Reload
+ shl cl, 6
+ shl r8b, 7
+ or r8b, cl
+ or r8b, al
+ mov byte ptr [r14 + 2], r9b
+ mov byte ptr [r14 + 3], r8b
+ add rdx, 32
+ add r14, 4
+ add qword ptr [rsp + 32], -1 # 8-byte Folded Spill
+ jne .LBB0_63
+# %bb.64:
+ mov r11, qword ptr [rsp + 24] # 8-byte Reload
+ mov r15, qword ptr [rsp + 56] # 8-byte Reload
+.LBB0_65:
+ shl r15, 5
+ cmp r15, r11
+ jge .LBB0_123
+# %bb.66:
+ sub r11, r15
+ xor ecx, ecx
+ .p2align 4, 0x90
+.LBB0_67: # =>This Inner Loop Header: Depth=1
+ lea r8, [rcx + 1]
+ movzx ebx, byte ptr [rsi + rcx]
+ cmp bl, byte ptr [rdx + rcx]
+ sete bl
+ neg bl
+ mov rdi, rcx
+ shr rdi, 3
+ movzx r9d, byte ptr [r14 + rdi]
+ xor bl, r9b
+ and cl, 7
+ mov al, 1
+ # kill: def $cl killed $cl killed $rcx
+ shl al, cl
+ and al, bl
+ xor al, r9b
+ mov byte ptr [r14 + rdi], al
+ mov rcx, r8
+ cmp r11, r8
+ jne .LBB0_67
+ jmp .LBB0_123
+.LBB0_90:
+ lea r15, [r11 + 31]
+ test r11, r11
+ cmovns r15, r11
+ lea eax, [r9 + 7]
+ test r9d, r9d
+ cmovns eax, r9d
+ and eax, -8
+ sub r9d, eax
+ je .LBB0_94
+# %bb.91:
+ movsxd rax, r9d
+ .p2align 4, 0x90
+.LBB0_92: # =>This Inner Loop Header: Depth=1
+ mov ecx, dword ptr [rsi]
+ add rsi, 4
+ cmp ecx, dword ptr [rdx]
+ lea rdx, [rdx + 4]
+ sete r10b
+ neg r10b
+ lea rdi, [rax + 7]
+ test rax, rax
+ cmovns rdi, rax
+ sar rdi, 3
+ movzx r8d, byte ptr [r14 + rdi]
+ xor r10b, r8b
+ lea r9d, [8*rdi]
+ mov ecx, eax
+ sub ecx, r9d
+ mov ebx, 1
+ # kill: def $cl killed $cl killed $ecx
+ shl ebx, cl
+ and bl, r10b
+ xor bl, r8b
+ mov byte ptr [r14 + rdi], bl
+ add rax, 1
+ cmp rax, 8
+ jne .LBB0_92
+# %bb.93:
+ add r14, 1
+.LBB0_94:
+ sar r15, 5
+ cmp r11, 32
+ jl .LBB0_98
+# %bb.95:
+ mov qword ptr [rsp + 24], r11 # 8-byte Spill
+ mov qword ptr [rsp + 64], r15 # 8-byte Spill
+ mov qword ptr [rsp + 56], r15 # 8-byte Spill
+ .p2align 4, 0x90
+.LBB0_96: # =>This Inner Loop Header: Depth=1
+ mov qword ptr [rsp + 48], r14 # 8-byte Spill
+ mov eax, dword ptr [rsi]
+ mov ecx, dword ptr [rsi + 4]
+ cmp eax, dword ptr [rdx]
+ sete byte ptr [rsp + 40] # 1-byte Folded Spill
+ cmp ecx, dword ptr [rdx + 4]
+ sete byte ptr [rsp + 32] # 1-byte Folded Spill
+ mov eax, dword ptr [rsi + 8]
+ cmp eax, dword ptr [rdx + 8]
+ sete byte ptr [rsp + 20] # 1-byte Folded Spill
+ mov eax, dword ptr [rsi + 12]
+ cmp eax, dword ptr [rdx + 12]
+ sete byte ptr [rsp + 21] # 1-byte Folded Spill
+ mov eax, dword ptr [rsi + 16]
+ cmp eax, dword ptr [rdx + 16]
+ sete byte ptr [rsp + 22] # 1-byte Folded Spill
+ mov eax, dword ptr [rsi + 20]
+ cmp eax, dword ptr [rdx + 20]
+ sete byte ptr [rsp + 23] # 1-byte Folded Spill
+ mov eax, dword ptr [rsi + 24]
+ cmp eax, dword ptr [rdx + 24]
+ sete byte ptr [rsp + 4] # 1-byte Folded Spill
+ mov eax, dword ptr [rsi + 28]
+ cmp eax, dword ptr [rdx + 28]
+ sete r13b
+ mov eax, dword ptr [rsi + 32]
+ cmp eax, dword ptr [rdx + 32]
+ sete byte ptr [rsp + 9] # 1-byte Folded Spill
+ mov eax, dword ptr [rsi + 36]
+ cmp eax, dword ptr [rdx + 36]
+ sete r8b
+ mov eax, dword ptr [rsi + 40]
+ cmp eax, dword ptr [rdx + 40]
+ sete r11b
+ mov eax, dword ptr [rsi + 44]
+ cmp eax, dword ptr [rdx + 44]
+ sete r15b
+ mov eax, dword ptr [rsi + 48]
+ cmp eax, dword ptr [rdx + 48]
+ sete byte ptr [rsp + 5] # 1-byte Folded Spill
+ mov eax, dword ptr [rsi + 52]
+ cmp eax, dword ptr [rdx + 52]
+ sete byte ptr [rsp + 6] # 1-byte Folded Spill
+ mov eax, dword ptr [rsi + 56]
+ cmp eax, dword ptr [rdx + 56]
+ sete byte ptr [rsp + 7] # 1-byte Folded Spill
+ mov eax, dword ptr [rsi + 60]
+ cmp eax, dword ptr [rdx + 60]
+ sete bl
+ mov eax, dword ptr [rsi + 64]
+ mov ecx, dword ptr [rsi + 68]
+ cmp eax, dword ptr [rdx + 64]
+ mov eax, dword ptr [rsi + 72]
+ sete byte ptr [rsp + 10] # 1-byte Folded Spill
+ cmp ecx, dword ptr [rdx + 68]
+ mov ecx, dword ptr [rsi + 76]
+ sete r10b
+ cmp eax, dword ptr [rdx + 72]
+ mov eax, dword ptr [rsi + 80]
+ sete r14b
+ cmp ecx, dword ptr [rdx + 76]
+ mov ecx, dword ptr [rsi + 84]
+ sete r12b
+ cmp eax, dword ptr [rdx + 80]
+ sete byte ptr [rsp + 8] # 1-byte Folded Spill
+ cmp ecx, dword ptr [rdx + 84]
+ mov eax, dword ptr [rsi + 88]
+ sete byte ptr [rsp + 11] # 1-byte Folded Spill
+ cmp eax, dword ptr [rdx + 88]
+ mov eax, dword ptr [rsi + 92]
+ sete byte ptr [rsp + 12] # 1-byte Folded Spill
+ cmp eax, dword ptr [rdx + 92]
+ mov eax, dword ptr [rsi + 96]
+ sete r9b
+ cmp eax, dword ptr [rdx + 96]
+ mov eax, dword ptr [rsi + 100]
+ sete byte ptr [rsp + 19] # 1-byte Folded Spill
+ cmp eax, dword ptr [rdx + 100]
+ mov eax, dword ptr [rsi + 104]
+ sete byte ptr [rsp + 13] # 1-byte Folded Spill
+ cmp eax, dword ptr [rdx + 104]
+ mov eax, dword ptr [rsi + 108]
+ sete byte ptr [rsp + 14] # 1-byte Folded Spill
+ cmp eax, dword ptr [rdx + 108]
+ mov eax, dword ptr [rsi + 112]
+ sete byte ptr [rsp + 15] # 1-byte Folded Spill
+ cmp eax, dword ptr [rdx + 112]
+ mov eax, dword ptr [rsi + 116]
+ sete byte ptr [rsp + 16] # 1-byte Folded Spill
+ cmp eax, dword ptr [rdx + 116]
+ mov eax, dword ptr [rsi + 120]
+ sete byte ptr [rsp + 18] # 1-byte Folded Spill
+ cmp eax, dword ptr [rdx + 120]
+ mov eax, dword ptr [rsi + 124]
+ sete byte ptr [rsp + 17] # 1-byte Folded Spill
+ sub rsi, -128
+ cmp eax, dword ptr [rdx + 124]
+ sete dil
+ movzx eax, byte ptr [rsp + 32] # 1-byte Folded Reload
+ add al, al
+ add al, byte ptr [rsp + 40] # 1-byte Folded Reload
+ mov ecx, eax
+ movzx eax, byte ptr [rsp + 4] # 1-byte Folded Reload
+ shl al, 6
+ shl r13b, 7
+ or r13b, al
+ movzx eax, byte ptr [rsp + 20] # 1-byte Folded Reload
+ shl al, 2
+ or al, cl
+ add r8b, r8b
+ add r8b, byte ptr [rsp + 9] # 1-byte Folded Reload
+ movzx ecx, byte ptr [rsp + 21] # 1-byte Folded Reload
+ shl cl, 3
+ or cl, al
+ mov eax, ecx
+ shl r11b, 2
+ or r11b, r8b
+ movzx ecx, byte ptr [rsp + 22] # 1-byte Folded Reload
+ shl cl, 4
+ or cl, al
+ mov r8d, ecx
+ shl r15b, 3
+ or r15b, r11b
+ movzx ecx, byte ptr [rsp + 23] # 1-byte Folded Reload
+ shl cl, 5
+ or cl, r8b
+ movzx eax, byte ptr [rsp + 5] # 1-byte Folded Reload
+ shl al, 4
+ or al, r15b
+ mov r8d, eax
+ movzx eax, byte ptr [rsp + 6] # 1-byte Folded Reload
+ shl al, 5
+ or al, r8b
+ movzx r8d, byte ptr [rsp + 7] # 1-byte Folded Reload
+ shl r8b, 6
+ shl bl, 7
+ or bl, r8b
+ or r13b, cl
+ or bl, al
+ add r10b, r10b
+ add r10b, byte ptr [rsp + 10] # 1-byte Folded Reload
+ shl r14b, 2
+ or r14b, r10b
+ shl r12b, 3
+ or r12b, r14b
+ movzx eax, byte ptr [rsp + 8] # 1-byte Folded Reload
+ shl al, 4
+ or al, r12b
+ mov ecx, eax
+ mov r14, qword ptr [rsp + 48] # 8-byte Reload
+ movzx eax, byte ptr [rsp + 11] # 1-byte Folded Reload
+ shl al, 5
+ or al, cl
+ mov byte ptr [r14], r13b
+ movzx ecx, byte ptr [rsp + 12] # 1-byte Folded Reload
+ shl cl, 6
+ shl r9b, 7
+ or r9b, cl
+ mov byte ptr [r14 + 1], bl
+ or r9b, al
+ movzx eax, byte ptr [rsp + 13] # 1-byte Folded Reload
+ add al, al
+ add al, byte ptr [rsp + 19] # 1-byte Folded Reload
+ mov ecx, eax
+ movzx eax, byte ptr [rsp + 14] # 1-byte Folded Reload
+ shl al, 2
+ or al, cl
+ mov ecx, eax
+ movzx eax, byte ptr [rsp + 15] # 1-byte Folded Reload
+ shl al, 3
+ or al, cl
+ mov ecx, eax
+ movzx eax, byte ptr [rsp + 16] # 1-byte Folded Reload
+ shl al, 4
+ or al, cl
+ mov ecx, eax
+ movzx eax, byte ptr [rsp + 18] # 1-byte Folded Reload
+ shl al, 5
+ or al, cl
+ movzx ecx, byte ptr [rsp + 17] # 1-byte Folded Reload
+ shl cl, 6
+ shl dil, 7
+ or dil, cl
+ or dil, al
+ mov byte ptr [r14 + 2], r9b
+ mov byte ptr [r14 + 3], dil
+ add rdx, 128
+ add r14, 4
+ add qword ptr [rsp + 56], -1 # 8-byte Folded Spill
+ jne .LBB0_96
+# %bb.97:
+ mov r11, qword ptr [rsp + 24] # 8-byte Reload
+ mov r15, qword ptr [rsp + 64] # 8-byte Reload
+.LBB0_98:
+ shl r15, 5
+ cmp r15, r11
+ jge .LBB0_123
+# %bb.99:
+ sub r11, r15
+ xor ecx, ecx
+ .p2align 4, 0x90
+.LBB0_100: # =>This Inner Loop Header: Depth=1
+ lea r8, [rcx + 1]
+ mov edi, dword ptr [rsi + 4*rcx]
+ cmp edi, dword ptr [rdx + 4*rcx]
+ sete bl
+ neg bl
+ mov rdi, rcx
+ shr rdi, 3
+ movzx r9d, byte ptr [r14 + rdi]
+ xor bl, r9b
+ and cl, 7
+ mov al, 1
+ # kill: def $cl killed $cl killed $rcx
+ shl al, cl
+ and al, bl
+ xor al, r9b
+ mov byte ptr [r14 + rdi], al
+ mov rcx, r8
+ cmp r11, r8
+ jne .LBB0_100
+.LBB0_123:
+ lea rsp, [rbp - 40]
+ pop rbx
+ pop r12
+ pop r13
+ pop r14
+ pop r15
+ pop rbp
+ ret
+.Lfunc_end0:
+ .size comparison_equal_arr_arr_avx2, .Lfunc_end0-comparison_equal_arr_arr_avx2
+ # -- End function
+ .section .rodata.cst32,"aM",@progbits,32
+ .p2align 5 # -- Begin function comparison_equal_arr_scalar_avx2
+.LCPI1_0:
+ .zero 32,1
+.LCPI1_1:
+ .zero 32,4
+.LCPI1_2:
+ .zero 32,8
+.LCPI1_3:
+ .zero 32,16
+.LCPI1_4:
+ .zero 32,32
+.LCPI1_5:
+ .zero 32,64
+.LCPI1_6:
+ .zero 32,128
+ .text
+ .globl comparison_equal_arr_scalar_avx2
+ .p2align 4, 0x90
+ .type comparison_equal_arr_scalar_avx2,@function
+comparison_equal_arr_scalar_avx2: # @comparison_equal_arr_scalar_avx2
+# %bb.0:
+ push rbp
+ mov rbp, rsp
+ push r15
+ push r14
+ push r13
+ push r12
+ push rbx
+ and rsp, -32
+ sub rsp, 1280
+ # kill: def $r9d killed $r9d def $r9
+ mov r10, r8
+ mov r11, rcx
+ cmp edi, 6
+ jg .LBB1_13
+# %bb.1:
+ cmp edi, 3
+ jle .LBB1_25
+# %bb.2:
+ cmp edi, 4
+ je .LBB1_49
+# %bb.3:
+ cmp edi, 5
+ je .LBB1_57
+# %bb.4:
+ cmp edi, 6
+ jne .LBB1_164
+# %bb.5:
+ mov r13d, dword ptr [rdx]
+ lea r15, [r10 + 31]
+ test r10, r10
+ cmovns r15, r10
+ lea eax, [r9 + 7]
+ test r9d, r9d
+ cmovns eax, r9d
+ and eax, -8
+ sub r9d, eax
+ je .LBB1_9
+# %bb.6:
+ movsxd rax, r9d
+ .p2align 4, 0x90
+.LBB1_7: # =>This Inner Loop Header: Depth=1
+ cmp dword ptr [rsi], r13d
+ lea rsi, [rsi + 4]
+ sete dl
+ neg dl
+ lea rbx, [rax + 7]
+ test rax, rax
+ cmovns rbx, rax
+ sar rbx, 3
+ movzx r8d, byte ptr [r11 + rbx]
+ xor dl, r8b
+ lea edi, [8*rbx]
+ mov ecx, eax
+ sub ecx, edi
+ mov edi, 1
+ # kill: def $cl killed $cl killed $ecx
+ shl edi, cl
+ and dil, dl
+ xor dil, r8b
+ mov byte ptr [r11 + rbx], dil
+ add rax, 1
+ cmp rax, 8
+ jne .LBB1_7
+# %bb.8:
+ add r11, 1
+.LBB1_9:
+ sar r15, 5
+ cmp r10, 32
+ jl .LBB1_101
+# %bb.10:
+ mov qword ptr [rsp + 280], r10 # 8-byte Spill
+ mov qword ptr [rsp + 176], r15 # 8-byte Spill
+ mov qword ptr [rsp + 168], r15 # 8-byte Spill
+ mov qword ptr [rsp + 272], r11 # 8-byte Spill
+ .p2align 4, 0x90
+.LBB1_11: # =>This Inner Loop Header: Depth=1
+ cmp dword ptr [rsi], r13d
+ sete byte ptr [rsp + 152] # 1-byte Folded Spill
+ cmp dword ptr [rsi + 4], r13d
+ sete dil
+ cmp dword ptr [rsi + 8], r13d
+ sete r14b
+ cmp dword ptr [rsi + 12], r13d
+ sete byte ptr [rsp + 160] # 1-byte Folded Spill
+ cmp dword ptr [rsi + 16], r13d
+ sete byte ptr [rsp + 136] # 1-byte Folded Spill
+ cmp dword ptr [rsi + 20], r13d
+ sete byte ptr [rsp + 88] # 1-byte Folded Spill
+ cmp dword ptr [rsi + 24], r13d
+ sete al
+ cmp dword ptr [rsi + 28], r13d
+ sete bl
+ cmp dword ptr [rsi + 32], r13d
+ sete byte ptr [rsp + 104] # 1-byte Folded Spill
+ cmp dword ptr [rsi + 36], r13d
+ sete dl
+ cmp dword ptr [rsi + 40], r13d
+ sete r9b
+ cmp dword ptr [rsi + 44], r13d
+ sete r10b
+ cmp dword ptr [rsi + 48], r13d
+ sete r11b
+ cmp dword ptr [rsi + 52], r13d
+ sete r12b
+ cmp dword ptr [rsi + 56], r13d
+ sete byte ptr [rsp + 112] # 1-byte Folded Spill
+ cmp dword ptr [rsi + 60], r13d
+ sete cl
+ cmp dword ptr [rsi + 64], r13d
+ sete byte ptr [rsp + 72] # 1-byte Folded Spill
+ cmp dword ptr [rsi + 68], r13d
+ sete byte ptr [rsp + 120] # 1-byte Folded Spill
+ cmp dword ptr [rsi + 72], r13d
+ sete byte ptr [rsp + 128] # 1-byte Folded Spill
+ cmp dword ptr [rsi + 76], r13d
+ sete byte ptr [rsp + 144] # 1-byte Folded Spill
+ cmp dword ptr [rsi + 80], r13d
+ sete byte ptr [rsp + 80] # 1-byte Folded Spill
+ cmp dword ptr [rsi + 84], r13d
+ sete byte ptr [rsp + 96] # 1-byte Folded Spill
+ cmp dword ptr [rsi + 88], r13d
+ sete byte ptr [rsp + 64] # 1-byte Folded Spill
+ cmp dword ptr [rsi + 92], r13d
+ sete r15b
+ cmp dword ptr [rsi + 96], r13d
+ sete byte ptr [rsp + 32] # 1-byte Folded Spill
+ cmp dword ptr [rsi + 100], r13d
+ sete byte ptr [rsp + 48] # 1-byte Folded Spill
+ cmp dword ptr [rsi + 104], r13d
+ sete byte ptr [rsp + 56] # 1-byte Folded Spill
+ cmp dword ptr [rsi + 108], r13d
+ sete byte ptr [rsp + 40] # 1-byte Folded Spill
+ cmp dword ptr [rsi + 112], r13d
+ sete byte ptr [rsp + 320] # 1-byte Folded Spill
+ cmp dword ptr [rsi + 116], r13d
+ sete byte ptr [rsp + 288] # 1-byte Folded Spill
+ cmp dword ptr [rsi + 120], r13d
+ sete byte ptr [rsp + 28] # 1-byte Folded Spill
+ cmp dword ptr [rsi + 124], r13d
+ sete r8b
+ add dil, dil
+ add dil, byte ptr [rsp + 152] # 1-byte Folded Reload
+ shl al, 6
+ shl bl, 7
+ or bl, al
+ shl r14b, 2
+ or r14b, dil
+ add dl, dl
+ add dl, byte ptr [rsp + 104] # 1-byte Folded Reload
+ movzx eax, byte ptr [rsp + 160] # 1-byte Folded Reload
+ shl al, 3
+ or al, r14b
+ shl r9b, 2
+ or r9b, dl
+ movzx edx, byte ptr [rsp + 136] # 1-byte Folded Reload
+ shl dl, 4
+ or dl, al
+ mov edi, edx
+ shl r10b, 3
+ or r10b, r9b
+ movzx edx, byte ptr [rsp + 88] # 1-byte Folded Reload
+ shl dl, 5
+ or dl, dil
+ shl r11b, 4
+ or r11b, r10b
+ shl r12b, 5
+ or r12b, r11b
+ movzx edi, byte ptr [rsp + 112] # 1-byte Folded Reload
+ shl dil, 6
+ shl cl, 7
+ or cl, dil
+ or bl, dl
+ or cl, r12b
+ movzx edx, byte ptr [rsp + 120] # 1-byte Folded Reload
+ add dl, dl
+ add dl, byte ptr [rsp + 72] # 1-byte Folded Reload
+ mov edi, edx
+ movzx edx, byte ptr [rsp + 128] # 1-byte Folded Reload
+ shl dl, 2
+ or dl, dil
+ mov edi, edx
+ movzx edx, byte ptr [rsp + 144] # 1-byte Folded Reload
+ shl dl, 3
+ or dl, dil
+ mov edi, edx
+ movzx edx, byte ptr [rsp + 80] # 1-byte Folded Reload
+ shl dl, 4
+ or dl, dil
+ mov edi, edx
+ movzx edx, byte ptr [rsp + 96] # 1-byte Folded Reload
+ shl dl, 5
+ or dl, dil
+ mov edi, edx
+ mov rdx, qword ptr [rsp + 272] # 8-byte Reload
+ mov byte ptr [rdx], bl
+ movzx ebx, byte ptr [rsp + 64] # 1-byte Folded Reload
+ shl bl, 6
+ shl r15b, 7
+ or r15b, bl
+ mov byte ptr [rdx + 1], cl
+ or r15b, dil
+ movzx ecx, byte ptr [rsp + 48] # 1-byte Folded Reload
+ add cl, cl
+ add cl, byte ptr [rsp + 32] # 1-byte Folded Reload
+ mov ebx, ecx
+ movzx ecx, byte ptr [rsp + 56] # 1-byte Folded Reload
+ shl cl, 2
+ or cl, bl
+ mov ebx, ecx
+ movzx ecx, byte ptr [rsp + 40] # 1-byte Folded Reload
+ shl cl, 3
+ or cl, bl
+ mov ebx, ecx
+ movzx ecx, byte ptr [rsp + 320] # 1-byte Folded Reload
+ shl cl, 4
+ or cl, bl
+ mov ebx, ecx
+ movzx ecx, byte ptr [rsp + 288] # 1-byte Folded Reload
+ shl cl, 5
+ or cl, bl
+ movzx ebx, byte ptr [rsp + 28] # 1-byte Folded Reload
+ shl bl, 6
+ shl r8b, 7
+ or r8b, bl
+ or r8b, cl
+ mov byte ptr [rdx + 2], r15b
+ mov byte ptr [rdx + 3], r8b
+ add rsi, 128
+ add rdx, 4
+ mov qword ptr [rsp + 272], rdx # 8-byte Spill
+ add qword ptr [rsp + 168], -1 # 8-byte Folded Spill
+ jne .LBB1_11
+# %bb.12:
+ mov r14, qword ptr [rsp + 272] # 8-byte Reload
+ mov r10, qword ptr [rsp + 280] # 8-byte Reload
+ mov r15, qword ptr [rsp + 176] # 8-byte Reload
+ shl r15, 5
+ cmp r15, r10
+ jl .LBB1_102
+ jmp .LBB1_164
+.LBB1_13:
+ cmp edi, 8
+ jle .LBB1_39
+# %bb.14:
+ cmp edi, 9
+ je .LBB1_65
+# %bb.15:
+ cmp edi, 11
+ je .LBB1_73
+# %bb.16:
+ cmp edi, 12
+ jne .LBB1_164
+# %bb.17:
+ lea r15, [r10 + 31]
+ test r10, r10
+ cmovns r15, r10
+ lea eax, [r9 + 7]
+ test r9d, r9d
+ cmovns eax, r9d
+ and eax, -8
+ vmovsd xmm0, qword ptr [rdx] # xmm0 = mem[0],zero
+ sub r9d, eax
+ je .LBB1_21
+# %bb.18:
+ movsxd rax, r9d
+ .p2align 4, 0x90
+.LBB1_19: # =>This Inner Loop Header: Depth=1
+ vucomisd xmm0, qword ptr [rsi]
+ lea rsi, [rsi + 8]
+ sete dl
+ neg dl
+ lea rdi, [rax + 7]
+ test rax, rax
+ cmovns rdi, rax
+ sar rdi, 3
+ movzx r9d, byte ptr [r11 + rdi]
+ xor dl, r9b
+ lea r8d, [8*rdi]
+ mov ecx, eax
+ sub ecx, r8d
+ mov ebx, 1
+ # kill: def $cl killed $cl killed $ecx
+ shl ebx, cl
+ and bl, dl
+ xor bl, r9b
+ mov byte ptr [r11 + rdi], bl
+ add rax, 1
+ cmp rax, 8
+ jne .LBB1_19
+# %bb.20:
+ add r11, 1
+.LBB1_21:
+ sar r15, 5
+ cmp r10, 32
+ jl .LBB1_105
+# %bb.22:
+ mov qword ptr [rsp + 280], r10 # 8-byte Spill
+ mov qword ptr [rsp + 168], r15 # 8-byte Spill
+ mov qword ptr [rsp + 152], r15 # 8-byte Spill
+ mov qword ptr [rsp + 272], r11 # 8-byte Spill
+ .p2align 4, 0x90
+.LBB1_23: # =>This Inner Loop Header: Depth=1
+ vucomisd xmm0, qword ptr [rsi]
+ sete byte ptr [rsp + 160] # 1-byte Folded Spill
+ vucomisd xmm0, qword ptr [rsi + 8]
+ sete r9b
+ vucomisd xmm0, qword ptr [rsi + 16]
+ sete r14b
+ vucomisd xmm0, qword ptr [rsi + 24]
+ sete r13b
+ vucomisd xmm0, qword ptr [rsi + 32]
+ sete byte ptr [rsp + 136] # 1-byte Folded Spill
+ vucomisd xmm0, qword ptr [rsi + 40]
+ sete byte ptr [rsp + 88] # 1-byte Folded Spill
+ vucomisd xmm0, qword ptr [rsi + 48]
+ sete al
+ vucomisd xmm0, qword ptr [rsi + 56]
+ sete bl
+ vucomisd xmm0, qword ptr [rsi + 64]
+ sete byte ptr [rsp + 112] # 1-byte Folded Spill
+ vucomisd xmm0, qword ptr [rsi + 72]
+ sete dl
+ vucomisd xmm0, qword ptr [rsi + 80]
+ sete dil
+ vucomisd xmm0, qword ptr [rsi + 88]
+ sete r10b
+ vucomisd xmm0, qword ptr [rsi + 96]
+ sete r11b
+ vucomisd xmm0, qword ptr [rsi + 104]
+ sete r12b
+ vucomisd xmm0, qword ptr [rsi + 112]
+ sete byte ptr [rsp + 120] # 1-byte Folded Spill
+ vucomisd xmm0, qword ptr [rsi + 120]
+ sete cl
+ vucomisd xmm0, qword ptr [rsi + 128]
+ sete byte ptr [rsp + 72] # 1-byte Folded Spill
+ vucomisd xmm0, qword ptr [rsi + 136]
+ sete byte ptr [rsp + 104] # 1-byte Folded Spill
+ vucomisd xmm0, qword ptr [rsi + 144]
+ sete byte ptr [rsp + 128] # 1-byte Folded Spill
+ vucomisd xmm0, qword ptr [rsi + 152]
+ sete byte ptr [rsp + 144] # 1-byte Folded Spill
+ vucomisd xmm0, qword ptr [rsi + 160]
+ sete byte ptr [rsp + 80] # 1-byte Folded Spill
+ vucomisd xmm0, qword ptr [rsi + 168]
+ sete byte ptr [rsp + 96] # 1-byte Folded Spill
+ vucomisd xmm0, qword ptr [rsi + 176]
+ sete byte ptr [rsp + 64] # 1-byte Folded Spill
+ vucomisd xmm0, qword ptr [rsi + 184]
+ sete r15b
+ vucomisd xmm0, qword ptr [rsi + 192]
+ sete byte ptr [rsp + 32] # 1-byte Folded Spill
+ vucomisd xmm0, qword ptr [rsi + 200]
+ sete byte ptr [rsp + 48] # 1-byte Folded Spill
+ vucomisd xmm0, qword ptr [rsi + 208]
+ sete byte ptr [rsp + 56] # 1-byte Folded Spill
+ vucomisd xmm0, qword ptr [rsi + 216]
+ sete byte ptr [rsp + 40] # 1-byte Folded Spill
+ vucomisd xmm0, qword ptr [rsi + 224]
+ sete byte ptr [rsp + 320] # 1-byte Folded Spill
+ vucomisd xmm0, qword ptr [rsi + 232]
+ sete byte ptr [rsp + 288] # 1-byte Folded Spill
+ vucomisd xmm0, qword ptr [rsi + 240]
+ sete byte ptr [rsp + 28] # 1-byte Folded Spill
+ vucomisd xmm0, qword ptr [rsi + 248]
+ sete r8b
+ add r9b, r9b
+ add r9b, byte ptr [rsp + 160] # 1-byte Folded Reload
+ shl al, 6
+ shl bl, 7
+ or bl, al
+ shl r14b, 2
+ or r14b, r9b
+ add dl, dl
+ add dl, byte ptr [rsp + 112] # 1-byte Folded Reload
+ shl r13b, 3
+ or r13b, r14b
+ shl dil, 2
+ or dil, dl
+ movzx edx, byte ptr [rsp + 136] # 1-byte Folded Reload
+ shl dl, 4
+ or dl, r13b
+ mov r9d, edx
+ shl r10b, 3
+ or r10b, dil
+ movzx edx, byte ptr [rsp + 88] # 1-byte Folded Reload
+ shl dl, 5
+ or dl, r9b
+ shl r11b, 4
+ or r11b, r10b
+ shl r12b, 5
+ or r12b, r11b
+ movzx edi, byte ptr [rsp + 120] # 1-byte Folded Reload
+ shl dil, 6
+ shl cl, 7
+ or cl, dil
+ or bl, dl
+ or cl, r12b
+ movzx eax, byte ptr [rsp + 104] # 1-byte Folded Reload
+ add al, al
+ add al, byte ptr [rsp + 72] # 1-byte Folded Reload
+ movzx edx, byte ptr [rsp + 128] # 1-byte Folded Reload
+ shl dl, 2
+ or dl, al
+ mov edi, edx
+ movzx edx, byte ptr [rsp + 144] # 1-byte Folded Reload
+ shl dl, 3
+ or dl, dil
+ mov edi, edx
+ movzx edx, byte ptr [rsp + 80] # 1-byte Folded Reload
+ shl dl, 4
+ or dl, dil
+ mov edi, edx
+ movzx edx, byte ptr [rsp + 96] # 1-byte Folded Reload
+ shl dl, 5
+ or dl, dil
+ mov edi, edx
+ mov rdx, qword ptr [rsp + 272] # 8-byte Reload
+ mov byte ptr [rdx], bl
+ movzx ebx, byte ptr [rsp + 64] # 1-byte Folded Reload
+ shl bl, 6
+ shl r15b, 7
+ or r15b, bl
+ mov byte ptr [rdx + 1], cl
+ or r15b, dil
+ movzx ecx, byte ptr [rsp + 48] # 1-byte Folded Reload
+ add cl, cl
+ add cl, byte ptr [rsp + 32] # 1-byte Folded Reload
+ mov ebx, ecx
+ movzx ecx, byte ptr [rsp + 56] # 1-byte Folded Reload
+ shl cl, 2
+ or cl, bl
+ mov ebx, ecx
+ movzx ecx, byte ptr [rsp + 40] # 1-byte Folded Reload
+ shl cl, 3
+ or cl, bl
+ mov ebx, ecx
+ movzx ecx, byte ptr [rsp + 320] # 1-byte Folded Reload
+ shl cl, 4
+ or cl, bl
+ mov ebx, ecx
+ movzx ecx, byte ptr [rsp + 288] # 1-byte Folded Reload
+ shl cl, 5
+ or cl, bl
+ movzx ebx, byte ptr [rsp + 28] # 1-byte Folded Reload
+ shl bl, 6
+ shl r8b, 7
+ or r8b, bl
+ or r8b, cl
+ mov byte ptr [rdx + 2], r15b
+ mov byte ptr [rdx + 3], r8b
+ add rsi, 256
+ add rdx, 4
+ mov qword ptr [rsp + 272], rdx # 8-byte Spill
+ add qword ptr [rsp + 152], -1 # 8-byte Folded Spill
+ jne .LBB1_23
+# %bb.24:
+ mov r14, qword ptr [rsp + 272] # 8-byte Reload
+ mov r10, qword ptr [rsp + 280] # 8-byte Reload
+ mov r15, qword ptr [rsp + 168] # 8-byte Reload
+ shl r15, 5
+ cmp r15, r10
+ jl .LBB1_106
+ jmp .LBB1_164
+.LBB1_25:
+ cmp edi, 2
+ je .LBB1_81
+# %bb.26:
+ cmp edi, 3
+ jne .LBB1_164
+# %bb.27:
+ mov r14b, byte ptr [rdx]
+ lea r13, [r10 + 31]
+ test r10, r10
+ mov r15, r10
+ cmovns r13, r10
+ lea eax, [r9 + 7]
+ test r9d, r9d
+ cmovns eax, r9d
+ and eax, -8
+ sub r9d, eax
+ je .LBB1_31
+# %bb.28:
+ movsxd rax, r9d
+ .p2align 4, 0x90
+.LBB1_29: # =>This Inner Loop Header: Depth=1
+ cmp byte ptr [rsi], r14b
+ lea rsi, [rsi + 1]
+ sete dl
+ neg dl
+ lea rdi, [rax + 7]
+ test rax, rax
+ cmovns rdi, rax
+ sar rdi, 3
+ movzx r9d, byte ptr [r11 + rdi]
+ xor dl, r9b
+ lea r8d, [8*rdi]
+ mov ecx, eax
+ sub ecx, r8d
+ mov ebx, 1
+ # kill: def $cl killed $cl killed $ecx
+ shl ebx, cl
+ and bl, dl
+ xor bl, r9b
+ mov byte ptr [r11 + rdi], bl
+ add rax, 1
+ cmp rax, 8
+ jne .LBB1_29
+# %bb.30:
+ add r11, 1
+.LBB1_31:
+ sar r13, 5
+ cmp r15, 32
+ jl .LBB1_108
+# %bb.32:
+ cmp r13, 32
+ mov dword ptr [rsp + 28], r14d # 4-byte Spill
+ mov qword ptr [rsp + 280], r15 # 8-byte Spill
+ mov qword ptr [rsp + 392], r13 # 8-byte Spill
+ jb .LBB1_35
+# %bb.33:
+ mov rax, r13
+ shl rax, 5
+ add rax, rsi
+ cmp r11, rax
+ jae .LBB1_165
+# %bb.34:
+ lea rax, [r11 + 4*r13]
+ cmp rsi, rax
+ jae .LBB1_165
+.LBB1_35:
+ xor eax, eax
+ mov qword ptr [rsp + 384], rax # 8-byte Spill
+ mov r12, rsi
+ mov qword ptr [rsp + 376], r11 # 8-byte Spill
+.LBB1_36:
+ sub r13, qword ptr [rsp + 384] # 8-byte Folded Reload
+ mov qword ptr [rsp + 152], r13 # 8-byte Spill
+ .p2align 4, 0x90
+.LBB1_37: # =>This Inner Loop Header: Depth=1
+ mov rcx, r12
+ cmp byte ptr [r12], r14b
+ sete byte ptr [rsp + 48] # 1-byte Folded Spill
+ cmp byte ptr [r12 + 1], r14b
+ sete r8b
+ cmp byte ptr [r12 + 2], r14b
+ sete r15b
+ cmp byte ptr [r12 + 3], r14b
+ sete r13b
+ cmp byte ptr [r12 + 4], r14b
+ sete byte ptr [rsp + 160] # 1-byte Folded Spill
+ cmp byte ptr [r12 + 5], r14b
+ sete byte ptr [rsp + 112] # 1-byte Folded Spill
+ cmp byte ptr [r12 + 6], r14b
+ sete al
+ cmp byte ptr [r12 + 7], r14b
+ sete r11b
+ cmp byte ptr [r12 + 8], r14b
+ sete byte ptr [rsp + 320] # 1-byte Folded Spill
+ cmp byte ptr [r12 + 9], r14b
+ sete dl
+ cmp byte ptr [r12 + 10], r14b
+ sete sil
+ cmp byte ptr [r12 + 11], r14b
+ sete dil
+ cmp byte ptr [r12 + 12], r14b
+ sete r10b
+ cmp byte ptr [r12 + 13], r14b
+ sete r12b
+ cmp byte ptr [rcx + 14], r14b
+ sete byte ptr [rsp + 104] # 1-byte Folded Spill
+ cmp byte ptr [rcx + 15], r14b
+ sete r9b
+ cmp byte ptr [rcx + 16], r14b
+ sete byte ptr [rsp + 288] # 1-byte Folded Spill
+ cmp byte ptr [rcx + 17], r14b
+ sete byte ptr [rsp + 128] # 1-byte Folded Spill
+ cmp byte ptr [rcx + 18], r14b
+ sete byte ptr [rsp + 120] # 1-byte Folded Spill
+ cmp byte ptr [rcx + 19], r14b
+ sete byte ptr [rsp + 136] # 1-byte Folded Spill
+ cmp byte ptr [rcx + 20], r14b
+ sete byte ptr [rsp + 144] # 1-byte Folded Spill
+ cmp byte ptr [rcx + 21], r14b
+ sete byte ptr [rsp + 72] # 1-byte Folded Spill
+ cmp byte ptr [rcx + 22], r14b
+ sete byte ptr [rsp + 88] # 1-byte Folded Spill
+ cmp byte ptr [rcx + 23], r14b
+ sete r14b
+ mov ebx, dword ptr [rsp + 28] # 4-byte Reload
+ cmp byte ptr [rcx + 24], bl
+ sete byte ptr [rsp + 272] # 1-byte Folded Spill
+ mov ebx, dword ptr [rsp + 28] # 4-byte Reload
+ cmp byte ptr [rcx + 25], bl
+ sete byte ptr [rsp + 80] # 1-byte Folded Spill
+ mov ebx, dword ptr [rsp + 28] # 4-byte Reload
+ cmp byte ptr [rcx + 26], bl
+ sete byte ptr [rsp + 96] # 1-byte Folded Spill
+ mov ebx, dword ptr [rsp + 28] # 4-byte Reload
+ cmp byte ptr [rcx + 27], bl
+ sete byte ptr [rsp + 64] # 1-byte Folded Spill
+ mov ebx, dword ptr [rsp + 28] # 4-byte Reload
+ cmp byte ptr [rcx + 28], bl
+ sete byte ptr [rsp + 56] # 1-byte Folded Spill
+ mov ebx, dword ptr [rsp + 28] # 4-byte Reload
+ cmp byte ptr [rcx + 29], bl
+ sete byte ptr [rsp + 40] # 1-byte Folded Spill
+ mov ebx, dword ptr [rsp + 28] # 4-byte Reload
+ cmp byte ptr [rcx + 30], bl
+ sete byte ptr [rsp + 32] # 1-byte Folded Spill
+ mov ebx, dword ptr [rsp + 28] # 4-byte Reload
+ cmp byte ptr [rcx + 31], bl
+ sete bl
+ add r8b, r8b
+ add r8b, byte ptr [rsp + 48] # 1-byte Folded Reload
+ shl al, 6
+ shl r11b, 7
+ or r11b, al
+ shl r15b, 2
+ or r15b, r8b
+ add dl, dl
+ add dl, byte ptr [rsp + 320] # 1-byte Folded Reload
+ shl r13b, 3
+ or r13b, r15b
+ shl sil, 2
+ or sil, dl
+ movzx edx, byte ptr [rsp + 160] # 1-byte Folded Reload
+ shl dl, 4
+ or dl, r13b
+ mov r8d, edx
+ shl dil, 3
+ or dil, sil
+ movzx edx, byte ptr [rsp + 112] # 1-byte Folded Reload
+ shl dl, 5
+ or dl, r8b
+ shl r10b, 4
+ or r10b, dil
+ shl r12b, 5
+ or r12b, r10b
+ movzx esi, byte ptr [rsp + 104] # 1-byte Folded Reload
+ shl sil, 6
+ shl r9b, 7
+ or r9b, sil
+ or r11b, dl
+ or r9b, r12b
+ movzx eax, byte ptr [rsp + 128] # 1-byte Folded Reload
+ add al, al
+ add al, byte ptr [rsp + 288] # 1-byte Folded Reload
+ movzx edx, byte ptr [rsp + 120] # 1-byte Folded Reload
+ shl dl, 2
+ or dl, al
+ mov esi, edx
+ movzx edx, byte ptr [rsp + 136] # 1-byte Folded Reload
+ shl dl, 3
+ or dl, sil
+ mov esi, edx
+ movzx edx, byte ptr [rsp + 144] # 1-byte Folded Reload
+ shl dl, 4
+ or dl, sil
+ mov esi, edx
+ movzx edx, byte ptr [rsp + 72] # 1-byte Folded Reload
+ shl dl, 5
+ or dl, sil
+ mov rsi, qword ptr [rsp + 376] # 8-byte Reload
+ mov byte ptr [rsi], r11b
+ movzx edi, byte ptr [rsp + 88] # 1-byte Folded Reload
+ shl dil, 6
+ shl r14b, 7
+ or r14b, dil
+ mov byte ptr [rsi + 1], r9b
+ or r14b, dl
+ movzx eax, byte ptr [rsp + 80] # 1-byte Folded Reload
+ add al, al
+ add al, byte ptr [rsp + 272] # 1-byte Folded Reload
+ mov edx, eax
+ movzx eax, byte ptr [rsp + 96] # 1-byte Folded Reload
+ shl al, 2
+ or al, dl
+ mov edx, eax
+ movzx eax, byte ptr [rsp + 64] # 1-byte Folded Reload
+ shl al, 3
+ or al, dl
+ mov edx, eax
+ movzx eax, byte ptr [rsp + 56] # 1-byte Folded Reload
+ shl al, 4
+ or al, dl
+ mov edx, eax
+ movzx eax, byte ptr [rsp + 40] # 1-byte Folded Reload
+ shl al, 5
+ or al, dl
+ movzx edx, byte ptr [rsp + 32] # 1-byte Folded Reload
+ shl dl, 6
+ shl bl, 7
+ or bl, dl
+ or bl, al
+ mov byte ptr [rsi + 2], r14b
+ mov r14d, dword ptr [rsp + 28] # 4-byte Reload
+ mov byte ptr [rsi + 3], bl
+ lea r12, [rcx + 32]
+ add rsi, 4
+ mov qword ptr [rsp + 376], rsi # 8-byte Spill
+ add qword ptr [rsp + 152], -1 # 8-byte Folded Spill
+ jne .LBB1_37
+# %bb.38:
+ mov r15, qword ptr [rsp + 280] # 8-byte Reload
+ mov r13, qword ptr [rsp + 392] # 8-byte Reload
+ jmp .LBB1_109
+.LBB1_39:
+ cmp edi, 7
+ je .LBB1_93
+# %bb.40:
+ cmp edi, 8
+ jne .LBB1_164
+# %bb.41:
+ mov r13, qword ptr [rdx]
+ lea r15, [r10 + 31]
+ test r10, r10
+ cmovns r15, r10
+ lea eax, [r9 + 7]
+ test r9d, r9d
+ cmovns eax, r9d
+ and eax, -8
+ sub r9d, eax
+ je .LBB1_45
+# %bb.42:
+ movsxd rax, r9d
+ .p2align 4, 0x90
+.LBB1_43: # =>This Inner Loop Header: Depth=1
+ cmp qword ptr [rsi], r13
+ lea rsi, [rsi + 8]
+ sete dl
+ neg dl
+ lea rbx, [rax + 7]
+ test rax, rax
+ cmovns rbx, rax
+ sar rbx, 3
+ movzx r8d, byte ptr [r11 + rbx]
+ xor dl, r8b
+ lea edi, [8*rbx]
+ mov ecx, eax
+ sub ecx, edi
+ mov edi, 1
+ # kill: def $cl killed $cl killed $ecx
+ shl edi, cl
+ and dil, dl
+ xor dil, r8b
+ mov byte ptr [r11 + rbx], dil
+ add rax, 1
+ cmp rax, 8
+ jne .LBB1_43
+# %bb.44:
+ add r11, 1
+.LBB1_45:
+ sar r15, 5
+ cmp r10, 32
+ jl .LBB1_112
+# %bb.46:
+ mov qword ptr [rsp + 280], r10 # 8-byte Spill
+ mov qword ptr [rsp + 176], r15 # 8-byte Spill
+ mov qword ptr [rsp + 168], r15 # 8-byte Spill
+ .p2align 4, 0x90
+.LBB1_47: # =>This Inner Loop Header: Depth=1
+ mov qword ptr [rsp + 272], r11 # 8-byte Spill
+ cmp qword ptr [rsi], r13
+ sete byte ptr [rsp + 152] # 1-byte Folded Spill
+ cmp qword ptr [rsi + 8], r13
+ sete dil
+ cmp qword ptr [rsi + 16], r13
+ sete r14b
+ cmp qword ptr [rsi + 24], r13
+ sete byte ptr [rsp + 160] # 1-byte Folded Spill
+ cmp qword ptr [rsi + 32], r13
+ sete byte ptr [rsp + 136] # 1-byte Folded Spill
+ cmp qword ptr [rsi + 40], r13
+ sete byte ptr [rsp + 88] # 1-byte Folded Spill
+ cmp qword ptr [rsi + 48], r13
+ sete al
+ cmp qword ptr [rsi + 56], r13
+ sete bl
+ cmp qword ptr [rsi + 64], r13
+ sete byte ptr [rsp + 104] # 1-byte Folded Spill
+ cmp qword ptr [rsi + 72], r13
+ sete dl
+ cmp qword ptr [rsi + 80], r13
+ sete r9b
+ cmp qword ptr [rsi + 88], r13
+ sete r10b
+ cmp qword ptr [rsi + 96], r13
+ sete r11b
+ cmp qword ptr [rsi + 104], r13
+ sete r12b
+ cmp qword ptr [rsi + 112], r13
+ sete byte ptr [rsp + 112] # 1-byte Folded Spill
+ cmp qword ptr [rsi + 120], r13
+ sete cl
+ cmp qword ptr [rsi + 128], r13
+ sete byte ptr [rsp + 72] # 1-byte Folded Spill
+ cmp qword ptr [rsi + 136], r13
+ sete byte ptr [rsp + 120] # 1-byte Folded Spill
+ cmp qword ptr [rsi + 144], r13
+ sete byte ptr [rsp + 128] # 1-byte Folded Spill
+ cmp qword ptr [rsi + 152], r13
+ sete byte ptr [rsp + 144] # 1-byte Folded Spill
+ cmp qword ptr [rsi + 160], r13
+ sete byte ptr [rsp + 80] # 1-byte Folded Spill
+ cmp qword ptr [rsi + 168], r13
+ sete byte ptr [rsp + 96] # 1-byte Folded Spill
+ cmp qword ptr [rsi + 176], r13
+ sete byte ptr [rsp + 64] # 1-byte Folded Spill
+ cmp qword ptr [rsi + 184], r13
+ sete r15b
+ cmp qword ptr [rsi + 192], r13
+ sete byte ptr [rsp + 32] # 1-byte Folded Spill
+ cmp qword ptr [rsi + 200], r13
+ sete byte ptr [rsp + 48] # 1-byte Folded Spill
+ cmp qword ptr [rsi + 208], r13
+ sete byte ptr [rsp + 56] # 1-byte Folded Spill
+ cmp qword ptr [rsi + 216], r13
+ sete byte ptr [rsp + 40] # 1-byte Folded Spill
+ cmp qword ptr [rsi + 224], r13
+ sete byte ptr [rsp + 320] # 1-byte Folded Spill
+ cmp qword ptr [rsi + 232], r13
+ sete byte ptr [rsp + 288] # 1-byte Folded Spill
+ cmp qword ptr [rsi + 240], r13
+ sete byte ptr [rsp + 28] # 1-byte Folded Spill
+ cmp qword ptr [rsi + 248], r13
+ sete r8b
+ add dil, dil
+ add dil, byte ptr [rsp + 152] # 1-byte Folded Reload
+ shl al, 6
+ shl bl, 7
+ or bl, al
+ shl r14b, 2
+ or r14b, dil
+ add dl, dl
+ add dl, byte ptr [rsp + 104] # 1-byte Folded Reload
+ movzx eax, byte ptr [rsp + 160] # 1-byte Folded Reload
+ shl al, 3
+ or al, r14b
+ shl r9b, 2
+ or r9b, dl
+ movzx edx, byte ptr [rsp + 136] # 1-byte Folded Reload
+ shl dl, 4
+ or dl, al
+ mov edi, edx
+ shl r10b, 3
+ or r10b, r9b
+ movzx edx, byte ptr [rsp + 88] # 1-byte Folded Reload
+ shl dl, 5
+ or dl, dil
+ shl r11b, 4
+ or r11b, r10b
+ shl r12b, 5
+ or r12b, r11b
+ mov r11, qword ptr [rsp + 272] # 8-byte Reload
+ movzx edi, byte ptr [rsp + 112] # 1-byte Folded Reload
+ shl dil, 6
+ shl cl, 7
+ or cl, dil
+ or bl, dl
+ or cl, r12b
+ movzx edx, byte ptr [rsp + 120] # 1-byte Folded Reload
+ add dl, dl
+ add dl, byte ptr [rsp + 72] # 1-byte Folded Reload
+ mov edi, edx
+ movzx edx, byte ptr [rsp + 128] # 1-byte Folded Reload
+ shl dl, 2
+ or dl, dil
+ mov edi, edx
+ movzx edx, byte ptr [rsp + 144] # 1-byte Folded Reload
+ shl dl, 3
+ or dl, dil
+ mov edi, edx
+ movzx edx, byte ptr [rsp + 80] # 1-byte Folded Reload
+ shl dl, 4
+ or dl, dil
+ mov edi, edx
+ movzx edx, byte ptr [rsp + 96] # 1-byte Folded Reload
+ shl dl, 5
+ or dl, dil
+ mov byte ptr [r11], bl
+ movzx ebx, byte ptr [rsp + 64] # 1-byte Folded Reload
+ shl bl, 6
+ shl r15b, 7
+ or r15b, bl
+ mov byte ptr [r11 + 1], cl
+ or r15b, dl
+ movzx ecx, byte ptr [rsp + 48] # 1-byte Folded Reload
+ add cl, cl
+ add cl, byte ptr [rsp + 32] # 1-byte Folded Reload
+ mov edx, ecx
+ movzx ecx, byte ptr [rsp + 56] # 1-byte Folded Reload
+ shl cl, 2
+ or cl, dl
+ mov edx, ecx
+ movzx ecx, byte ptr [rsp + 40] # 1-byte Folded Reload
+ shl cl, 3
+ or cl, dl
+ mov edx, ecx
+ movzx ecx, byte ptr [rsp + 320] # 1-byte Folded Reload
+ shl cl, 4
+ or cl, dl
+ mov edx, ecx
+ movzx ecx, byte ptr [rsp + 288] # 1-byte Folded Reload
+ shl cl, 5
+ or cl, dl
+ movzx edx, byte ptr [rsp + 28] # 1-byte Folded Reload
+ shl dl, 6
+ shl r8b, 7
+ or r8b, dl
+ or r8b, cl
+ mov byte ptr [r11 + 2], r15b
+ mov byte ptr [r11 + 3], r8b
+ add rsi, 256
+ add r11, 4
+ add qword ptr [rsp + 168], -1 # 8-byte Folded Spill
+ jne .LBB1_47
+# %bb.48:
+ mov r14, r11
+ mov r10, qword ptr [rsp + 280] # 8-byte Reload
+ mov r15, qword ptr [rsp + 176] # 8-byte Reload
+ shl r15, 5
+ cmp r15, r10
+ jl .LBB1_113
+ jmp .LBB1_164
+.LBB1_49:
+ movzx r13d, word ptr [rdx]
+ lea r15, [r10 + 31]
+ test r10, r10
+ cmovns r15, r10
+ lea eax, [r9 + 7]
+ test r9d, r9d
+ cmovns eax, r9d
+ and eax, -8
+ sub r9d, eax
+ je .LBB1_53
+# %bb.50:
+ movsxd rax, r9d
+ .p2align 4, 0x90
+.LBB1_51: # =>This Inner Loop Header: Depth=1
+ cmp word ptr [rsi], r13w
+ lea rsi, [rsi + 2]
+ sete dl
+ neg dl
+ lea rbx, [rax + 7]
+ test rax, rax
+ cmovns rbx, rax
+ sar rbx, 3
+ movzx r8d, byte ptr [r11 + rbx]
+ xor dl, r8b
+ lea edi, [8*rbx]
+ mov ecx, eax
+ sub ecx, edi
+ mov edi, 1
+ # kill: def $cl killed $cl killed $ecx
+ shl edi, cl
+ and dil, dl
+ xor dil, r8b
+ mov byte ptr [r11 + rbx], dil
+ add rax, 1
+ cmp rax, 8
+ jne .LBB1_51
+# %bb.52:
+ add r11, 1
+.LBB1_53:
+ sar r15, 5
+ cmp r10, 32
+ jl .LBB1_116
+# %bb.54:
+ mov qword ptr [rsp + 280], r10 # 8-byte Spill
+ mov qword ptr [rsp + 176], r15 # 8-byte Spill
+ mov qword ptr [rsp + 168], r15 # 8-byte Spill
+ mov qword ptr [rsp + 272], r11 # 8-byte Spill
+ .p2align 4, 0x90
+.LBB1_55: # =>This Inner Loop Header: Depth=1
+ cmp word ptr [rsi], r13w
+ sete al
+ cmp word ptr [rsi + 2], r13w
+ sete dil
+ cmp word ptr [rsi + 4], r13w
+ sete r14b
+ cmp word ptr [rsi + 6], r13w
+ sete byte ptr [rsp + 160] # 1-byte Folded Spill
+ cmp word ptr [rsi + 8], r13w
+ sete byte ptr [rsp + 136] # 1-byte Folded Spill
+ cmp word ptr [rsi + 10], r13w
+ sete byte ptr [rsp + 88] # 1-byte Folded Spill
+ cmp word ptr [rsi + 12], r13w
+ sete byte ptr [rsp + 152] # 1-byte Folded Spill
+ cmp word ptr [rsi + 14], r13w
+ sete bl
+ cmp word ptr [rsi + 16], r13w
+ sete byte ptr [rsp + 112] # 1-byte Folded Spill
+ cmp word ptr [rsi + 18], r13w
+ sete dl
+ cmp word ptr [rsi + 20], r13w
+ sete r9b
+ cmp word ptr [rsi + 22], r13w
+ sete r10b
+ cmp word ptr [rsi + 24], r13w
+ sete r11b
+ cmp word ptr [rsi + 26], r13w
+ sete r12b
+ cmp word ptr [rsi + 28], r13w
+ sete byte ptr [rsp + 104] # 1-byte Folded Spill
+ cmp word ptr [rsi + 30], r13w
+ sete cl
+ cmp word ptr [rsi + 32], r13w
+ sete byte ptr [rsp + 72] # 1-byte Folded Spill
+ cmp word ptr [rsi + 34], r13w
+ sete byte ptr [rsp + 120] # 1-byte Folded Spill
+ cmp word ptr [rsi + 36], r13w
+ sete byte ptr [rsp + 128] # 1-byte Folded Spill
+ cmp word ptr [rsi + 38], r13w
+ sete byte ptr [rsp + 144] # 1-byte Folded Spill
+ cmp word ptr [rsi + 40], r13w
+ sete byte ptr [rsp + 80] # 1-byte Folded Spill
+ cmp word ptr [rsi + 42], r13w
+ sete byte ptr [rsp + 96] # 1-byte Folded Spill
+ cmp word ptr [rsi + 44], r13w
+ sete byte ptr [rsp + 64] # 1-byte Folded Spill
+ cmp word ptr [rsi + 46], r13w
+ sete r15b
+ cmp word ptr [rsi + 48], r13w
+ sete byte ptr [rsp + 32] # 1-byte Folded Spill
+ cmp word ptr [rsi + 50], r13w
+ sete byte ptr [rsp + 48] # 1-byte Folded Spill
+ cmp word ptr [rsi + 52], r13w
+ sete byte ptr [rsp + 56] # 1-byte Folded Spill
+ cmp word ptr [rsi + 54], r13w
+ sete byte ptr [rsp + 40] # 1-byte Folded Spill
+ cmp word ptr [rsi + 56], r13w
+ sete byte ptr [rsp + 320] # 1-byte Folded Spill
+ cmp word ptr [rsi + 58], r13w
+ sete byte ptr [rsp + 288] # 1-byte Folded Spill
+ cmp word ptr [rsi + 60], r13w
+ sete byte ptr [rsp + 28] # 1-byte Folded Spill
+ cmp word ptr [rsi + 62], r13w
+ sete r8b
+ add dil, dil
+ or dil, al
+ movzx eax, byte ptr [rsp + 152] # 1-byte Folded Reload
+ shl al, 6
+ shl bl, 7
+ or bl, al
+ shl r14b, 2
+ or r14b, dil
+ add dl, dl
+ add dl, byte ptr [rsp + 112] # 1-byte Folded Reload
+ movzx eax, byte ptr [rsp + 160] # 1-byte Folded Reload
+ shl al, 3
+ or al, r14b
+ shl r9b, 2
+ or r9b, dl
+ movzx edx, byte ptr [rsp + 136] # 1-byte Folded Reload
+ shl dl, 4
+ or dl, al
+ mov edi, edx
+ shl r10b, 3
+ or r10b, r9b
+ movzx edx, byte ptr [rsp + 88] # 1-byte Folded Reload
+ shl dl, 5
+ or dl, dil
+ shl r11b, 4
+ or r11b, r10b
+ shl r12b, 5
+ or r12b, r11b
+ movzx edi, byte ptr [rsp + 104] # 1-byte Folded Reload
+ shl dil, 6
+ shl cl, 7
+ or cl, dil
+ or bl, dl
+ or cl, r12b
+ movzx edx, byte ptr [rsp + 120] # 1-byte Folded Reload
+ add dl, dl
+ add dl, byte ptr [rsp + 72] # 1-byte Folded Reload
+ mov edi, edx
+ movzx edx, byte ptr [rsp + 128] # 1-byte Folded Reload
+ shl dl, 2
+ or dl, dil
+ mov edi, edx
+ movzx edx, byte ptr [rsp + 144] # 1-byte Folded Reload
+ shl dl, 3
+ or dl, dil
+ mov edi, edx
+ movzx edx, byte ptr [rsp + 80] # 1-byte Folded Reload
+ shl dl, 4
+ or dl, dil
+ mov edi, edx
+ movzx edx, byte ptr [rsp + 96] # 1-byte Folded Reload
+ shl dl, 5
+ or dl, dil
+ mov edi, edx
+ mov rdx, qword ptr [rsp + 272] # 8-byte Reload
+ mov byte ptr [rdx], bl
+ movzx ebx, byte ptr [rsp + 64] # 1-byte Folded Reload
+ shl bl, 6
+ shl r15b, 7
+ or r15b, bl
+ mov byte ptr [rdx + 1], cl
+ or r15b, dil
+ movzx ecx, byte ptr [rsp + 48] # 1-byte Folded Reload
+ add cl, cl
+ add cl, byte ptr [rsp + 32] # 1-byte Folded Reload
+ mov ebx, ecx
+ movzx ecx, byte ptr [rsp + 56] # 1-byte Folded Reload
+ shl cl, 2
+ or cl, bl
+ mov ebx, ecx
+ movzx ecx, byte ptr [rsp + 40] # 1-byte Folded Reload
+ shl cl, 3
+ or cl, bl
+ mov ebx, ecx
+ movzx ecx, byte ptr [rsp + 320] # 1-byte Folded Reload
+ shl cl, 4
+ or cl, bl
+ mov ebx, ecx
+ movzx ecx, byte ptr [rsp + 288] # 1-byte Folded Reload
+ shl cl, 5
+ or cl, bl
+ movzx ebx, byte ptr [rsp + 28] # 1-byte Folded Reload
+ shl bl, 6
+ shl r8b, 7
+ or r8b, bl
+ or r8b, cl
+ mov byte ptr [rdx + 2], r15b
+ mov byte ptr [rdx + 3], r8b
+ add rsi, 64
+ add rdx, 4
+ mov qword ptr [rsp + 272], rdx # 8-byte Spill
+ add qword ptr [rsp + 168], -1 # 8-byte Folded Spill
+ jne .LBB1_55
+# %bb.56:
+ mov r14, qword ptr [rsp + 272] # 8-byte Reload
+ mov r10, qword ptr [rsp + 280] # 8-byte Reload
+ mov r15, qword ptr [rsp + 176] # 8-byte Reload
+ shl r15, 5
+ cmp r15, r10
+ jl .LBB1_117
+ jmp .LBB1_164
+.LBB1_57:
+ movzx r13d, word ptr [rdx]
+ lea r15, [r10 + 31]
+ test r10, r10
+ cmovns r15, r10
+ lea eax, [r9 + 7]
+ test r9d, r9d
+ cmovns eax, r9d
+ and eax, -8
+ sub r9d, eax
+ je .LBB1_61
+# %bb.58:
+ movsxd rax, r9d
+ .p2align 4, 0x90
+.LBB1_59: # =>This Inner Loop Header: Depth=1
+ cmp word ptr [rsi], r13w
+ lea rsi, [rsi + 2]
+ sete dl
+ neg dl
+ lea rbx, [rax + 7]
+ test rax, rax
+ cmovns rbx, rax
+ sar rbx, 3
+ movzx r8d, byte ptr [r11 + rbx]
+ xor dl, r8b
+ lea edi, [8*rbx]
+ mov ecx, eax
+ sub ecx, edi
+ mov edi, 1
+ # kill: def $cl killed $cl killed $ecx
+ shl edi, cl
+ and dil, dl
+ xor dil, r8b
+ mov byte ptr [r11 + rbx], dil
+ add rax, 1
+ cmp rax, 8
+ jne .LBB1_59
+# %bb.60:
+ add r11, 1
+.LBB1_61:
+ sar r15, 5
+ cmp r10, 32
+ jl .LBB1_120
+# %bb.62:
+ mov qword ptr [rsp + 280], r10 # 8-byte Spill
+ mov qword ptr [rsp + 176], r15 # 8-byte Spill
+ mov qword ptr [rsp + 168], r15 # 8-byte Spill
+ mov qword ptr [rsp + 272], r11 # 8-byte Spill
+ .p2align 4, 0x90
+.LBB1_63: # =>This Inner Loop Header: Depth=1
+ cmp word ptr [rsi], r13w
+ sete byte ptr [rsp + 152] # 1-byte Folded Spill
+ cmp word ptr [rsi + 2], r13w
+ sete dil
+ cmp word ptr [rsi + 4], r13w
+ sete r14b
+ cmp word ptr [rsi + 6], r13w
+ sete byte ptr [rsp + 160] # 1-byte Folded Spill
+ cmp word ptr [rsi + 8], r13w
+ sete byte ptr [rsp + 136] # 1-byte Folded Spill
+ cmp word ptr [rsi + 10], r13w
+ sete byte ptr [rsp + 88] # 1-byte Folded Spill
+ cmp word ptr [rsi + 12], r13w
+ sete al
+ cmp word ptr [rsi + 14], r13w
+ sete bl
+ cmp word ptr [rsi + 16], r13w
+ sete byte ptr [rsp + 104] # 1-byte Folded Spill
+ cmp word ptr [rsi + 18], r13w
+ sete dl
+ cmp word ptr [rsi + 20], r13w
+ sete r9b
+ cmp word ptr [rsi + 22], r13w
+ sete r10b
+ cmp word ptr [rsi + 24], r13w
+ sete r11b
+ cmp word ptr [rsi + 26], r13w
+ sete r12b
+ cmp word ptr [rsi + 28], r13w
+ sete byte ptr [rsp + 112] # 1-byte Folded Spill
+ cmp word ptr [rsi + 30], r13w
+ sete cl
+ cmp word ptr [rsi + 32], r13w
+ sete byte ptr [rsp + 72] # 1-byte Folded Spill
+ cmp word ptr [rsi + 34], r13w
+ sete byte ptr [rsp + 120] # 1-byte Folded Spill
+ cmp word ptr [rsi + 36], r13w
+ sete byte ptr [rsp + 128] # 1-byte Folded Spill
+ cmp word ptr [rsi + 38], r13w
+ sete byte ptr [rsp + 144] # 1-byte Folded Spill
+ cmp word ptr [rsi + 40], r13w
+ sete byte ptr [rsp + 80] # 1-byte Folded Spill
+ cmp word ptr [rsi + 42], r13w
+ sete byte ptr [rsp + 96] # 1-byte Folded Spill
+ cmp word ptr [rsi + 44], r13w
+ sete byte ptr [rsp + 64] # 1-byte Folded Spill
+ cmp word ptr [rsi + 46], r13w
+ sete r15b
+ cmp word ptr [rsi + 48], r13w
+ sete byte ptr [rsp + 32] # 1-byte Folded Spill
+ cmp word ptr [rsi + 50], r13w
+ sete byte ptr [rsp + 48] # 1-byte Folded Spill
+ cmp word ptr [rsi + 52], r13w
+ sete byte ptr [rsp + 56] # 1-byte Folded Spill
+ cmp word ptr [rsi + 54], r13w
+ sete byte ptr [rsp + 40] # 1-byte Folded Spill
+ cmp word ptr [rsi + 56], r13w
+ sete byte ptr [rsp + 320] # 1-byte Folded Spill
+ cmp word ptr [rsi + 58], r13w
+ sete byte ptr [rsp + 288] # 1-byte Folded Spill
+ cmp word ptr [rsi + 60], r13w
+ sete byte ptr [rsp + 28] # 1-byte Folded Spill
+ cmp word ptr [rsi + 62], r13w
+ sete r8b
+ add dil, dil
+ add dil, byte ptr [rsp + 152] # 1-byte Folded Reload
+ shl al, 6
+ shl bl, 7
+ or bl, al
+ shl r14b, 2
+ or r14b, dil
+ add dl, dl
+ add dl, byte ptr [rsp + 104] # 1-byte Folded Reload
+ movzx eax, byte ptr [rsp + 160] # 1-byte Folded Reload
+ shl al, 3
+ or al, r14b
+ shl r9b, 2
+ or r9b, dl
+ movzx edx, byte ptr [rsp + 136] # 1-byte Folded Reload
+ shl dl, 4
+ or dl, al
+ mov edi, edx
+ shl r10b, 3
+ or r10b, r9b
+ movzx edx, byte ptr [rsp + 88] # 1-byte Folded Reload
+ shl dl, 5
+ or dl, dil
+ shl r11b, 4
+ or r11b, r10b
+ shl r12b, 5
+ or r12b, r11b
+ movzx edi, byte ptr [rsp + 112] # 1-byte Folded Reload
+ shl dil, 6
+ shl cl, 7
+ or cl, dil
+ or bl, dl
+ or cl, r12b
+ movzx edx, byte ptr [rsp + 120] # 1-byte Folded Reload
+ add dl, dl
+ add dl, byte ptr [rsp + 72] # 1-byte Folded Reload
+ mov edi, edx
+ movzx edx, byte ptr [rsp + 128] # 1-byte Folded Reload
+ shl dl, 2
+ or dl, dil
+ mov edi, edx
+ movzx edx, byte ptr [rsp + 144] # 1-byte Folded Reload
+ shl dl, 3
+ or dl, dil
+ mov edi, edx
+ movzx edx, byte ptr [rsp + 80] # 1-byte Folded Reload
+ shl dl, 4
+ or dl, dil
+ mov edi, edx
+ movzx edx, byte ptr [rsp + 96] # 1-byte Folded Reload
+ shl dl, 5
+ or dl, dil
+ mov edi, edx
+ mov rdx, qword ptr [rsp + 272] # 8-byte Reload
+ mov byte ptr [rdx], bl
+ movzx ebx, byte ptr [rsp + 64] # 1-byte Folded Reload
+ shl bl, 6
+ shl r15b, 7
+ or r15b, bl
+ mov byte ptr [rdx + 1], cl
+ or r15b, dil
+ movzx ecx, byte ptr [rsp + 48] # 1-byte Folded Reload
+ add cl, cl
+ add cl, byte ptr [rsp + 32] # 1-byte Folded Reload
+ mov ebx, ecx
+ movzx ecx, byte ptr [rsp + 56] # 1-byte Folded Reload
+ shl cl, 2
+ or cl, bl
+ mov ebx, ecx
+ movzx ecx, byte ptr [rsp + 40] # 1-byte Folded Reload
+ shl cl, 3
+ or cl, bl
+ mov ebx, ecx
+ movzx ecx, byte ptr [rsp + 320] # 1-byte Folded Reload
+ shl cl, 4
+ or cl, bl
+ mov ebx, ecx
+ movzx ecx, byte ptr [rsp + 288] # 1-byte Folded Reload
+ shl cl, 5
+ or cl, bl
+ movzx ebx, byte ptr [rsp + 28] # 1-byte Folded Reload
+ shl bl, 6
+ shl r8b, 7
+ or r8b, bl
+ or r8b, cl
+ mov byte ptr [rdx + 2], r15b
+ mov byte ptr [rdx + 3], r8b
+ add rsi, 64
+ add rdx, 4
+ mov qword ptr [rsp + 272], rdx # 8-byte Spill
+ add qword ptr [rsp + 168], -1 # 8-byte Folded Spill
+ jne .LBB1_63
+# %bb.64:
+ mov r14, qword ptr [rsp + 272] # 8-byte Reload
+ mov r10, qword ptr [rsp + 280] # 8-byte Reload
+ mov r15, qword ptr [rsp + 176] # 8-byte Reload
+ shl r15, 5
+ cmp r15, r10
+ jl .LBB1_121
+ jmp .LBB1_164
+.LBB1_65:
+ mov r13, qword ptr [rdx]
+ lea r15, [r10 + 31]
+ test r10, r10
+ cmovns r15, r10
+ lea eax, [r9 + 7]
+ test r9d, r9d
+ cmovns eax, r9d
+ and eax, -8
+ sub r9d, eax
+ je .LBB1_69
+# %bb.66:
+ movsxd rax, r9d
+ .p2align 4, 0x90
+.LBB1_67: # =>This Inner Loop Header: Depth=1
+ cmp qword ptr [rsi], r13
+ lea rsi, [rsi + 8]
+ sete dl
+ neg dl
+ lea rbx, [rax + 7]
+ test rax, rax
+ cmovns rbx, rax
+ sar rbx, 3
+ movzx r8d, byte ptr [r11 + rbx]
+ xor dl, r8b
+ lea edi, [8*rbx]
+ mov ecx, eax
+ sub ecx, edi
+ mov edi, 1
+ # kill: def $cl killed $cl killed $ecx
+ shl edi, cl
+ and dil, dl
+ xor dil, r8b
+ mov byte ptr [r11 + rbx], dil
+ add rax, 1
+ cmp rax, 8
+ jne .LBB1_67
+# %bb.68:
+ add r11, 1
+.LBB1_69:
+ sar r15, 5
+ cmp r10, 32
+ jl .LBB1_123
+# %bb.70:
+ mov qword ptr [rsp + 280], r10 # 8-byte Spill
+ mov qword ptr [rsp + 176], r15 # 8-byte Spill
+ mov qword ptr [rsp + 168], r15 # 8-byte Spill
+ mov qword ptr [rsp + 272], r11 # 8-byte Spill
+ .p2align 4, 0x90
+.LBB1_71: # =>This Inner Loop Header: Depth=1
+ cmp qword ptr [rsi], r13
+ sete byte ptr [rsp + 152] # 1-byte Folded Spill
+ cmp qword ptr [rsi + 8], r13
+ sete dil
+ cmp qword ptr [rsi + 16], r13
+ sete r14b
+ cmp qword ptr [rsi + 24], r13
+ sete byte ptr [rsp + 160] # 1-byte Folded Spill
+ cmp qword ptr [rsi + 32], r13
+ sete byte ptr [rsp + 136] # 1-byte Folded Spill
+ cmp qword ptr [rsi + 40], r13
+ sete byte ptr [rsp + 88] # 1-byte Folded Spill
+ cmp qword ptr [rsi + 48], r13
+ sete al
+ cmp qword ptr [rsi + 56], r13
+ sete bl
+ cmp qword ptr [rsi + 64], r13
+ sete byte ptr [rsp + 104] # 1-byte Folded Spill
+ cmp qword ptr [rsi + 72], r13
+ sete dl
+ cmp qword ptr [rsi + 80], r13
+ sete r9b
+ cmp qword ptr [rsi + 88], r13
+ sete r10b
+ cmp qword ptr [rsi + 96], r13
+ sete r11b
+ cmp qword ptr [rsi + 104], r13
+ sete r12b
+ cmp qword ptr [rsi + 112], r13
+ sete byte ptr [rsp + 112] # 1-byte Folded Spill
+ cmp qword ptr [rsi + 120], r13
+ sete cl
+ cmp qword ptr [rsi + 128], r13
+ sete byte ptr [rsp + 72] # 1-byte Folded Spill
+ cmp qword ptr [rsi + 136], r13
+ sete byte ptr [rsp + 120] # 1-byte Folded Spill
+ cmp qword ptr [rsi + 144], r13
+ sete byte ptr [rsp + 128] # 1-byte Folded Spill
+ cmp qword ptr [rsi + 152], r13
+ sete byte ptr [rsp + 144] # 1-byte Folded Spill
+ cmp qword ptr [rsi + 160], r13
+ sete byte ptr [rsp + 80] # 1-byte Folded Spill
+ cmp qword ptr [rsi + 168], r13
+ sete byte ptr [rsp + 96] # 1-byte Folded Spill
+ cmp qword ptr [rsi + 176], r13
+ sete byte ptr [rsp + 64] # 1-byte Folded Spill
+ cmp qword ptr [rsi + 184], r13
+ sete r15b
+ cmp qword ptr [rsi + 192], r13
+ sete byte ptr [rsp + 32] # 1-byte Folded Spill
+ cmp qword ptr [rsi + 200], r13
+ sete byte ptr [rsp + 48] # 1-byte Folded Spill
+ cmp qword ptr [rsi + 208], r13
+ sete byte ptr [rsp + 56] # 1-byte Folded Spill
+ cmp qword ptr [rsi + 216], r13
+ sete byte ptr [rsp + 40] # 1-byte Folded Spill
+ cmp qword ptr [rsi + 224], r13
+ sete byte ptr [rsp + 320] # 1-byte Folded Spill
+ cmp qword ptr [rsi + 232], r13
+ sete byte ptr [rsp + 288] # 1-byte Folded Spill
+ cmp qword ptr [rsi + 240], r13
+ sete byte ptr [rsp + 28] # 1-byte Folded Spill
+ cmp qword ptr [rsi + 248], r13
+ sete r8b
+ add dil, dil
+ add dil, byte ptr [rsp + 152] # 1-byte Folded Reload
+ shl al, 6
+ shl bl, 7
+ or bl, al
+ shl r14b, 2
+ or r14b, dil
+ add dl, dl
+ add dl, byte ptr [rsp + 104] # 1-byte Folded Reload
+ movzx eax, byte ptr [rsp + 160] # 1-byte Folded Reload
+ shl al, 3
+ or al, r14b
+ shl r9b, 2
+ or r9b, dl
+ movzx edx, byte ptr [rsp + 136] # 1-byte Folded Reload
+ shl dl, 4
+ or dl, al
+ mov edi, edx
+ shl r10b, 3
+ or r10b, r9b
+ movzx edx, byte ptr [rsp + 88] # 1-byte Folded Reload
+ shl dl, 5
+ or dl, dil
+ shl r11b, 4
+ or r11b, r10b
+ shl r12b, 5
+ or r12b, r11b
+ movzx edi, byte ptr [rsp + 112] # 1-byte Folded Reload
+ shl dil, 6
+ shl cl, 7
+ or cl, dil
+ or bl, dl
+ or cl, r12b
+ movzx edx, byte ptr [rsp + 120] # 1-byte Folded Reload
+ add dl, dl
+ add dl, byte ptr [rsp + 72] # 1-byte Folded Reload
+ mov edi, edx
+ movzx edx, byte ptr [rsp + 128] # 1-byte Folded Reload
+ shl dl, 2
+ or dl, dil
+ mov edi, edx
+ movzx edx, byte ptr [rsp + 144] # 1-byte Folded Reload
+ shl dl, 3
+ or dl, dil
+ mov edi, edx
+ movzx edx, byte ptr [rsp + 80] # 1-byte Folded Reload
+ shl dl, 4
+ or dl, dil
+ mov edi, edx
+ movzx edx, byte ptr [rsp + 96] # 1-byte Folded Reload
+ shl dl, 5
+ or dl, dil
+ mov edi, edx
+ mov rdx, qword ptr [rsp + 272] # 8-byte Reload
+ mov byte ptr [rdx], bl
+ movzx ebx, byte ptr [rsp + 64] # 1-byte Folded Reload
+ shl bl, 6
+ shl r15b, 7
+ or r15b, bl
+ mov byte ptr [rdx + 1], cl
+ or r15b, dil
+ movzx ecx, byte ptr [rsp + 48] # 1-byte Folded Reload
+ add cl, cl
+ add cl, byte ptr [rsp + 32] # 1-byte Folded Reload
+ mov ebx, ecx
+ movzx ecx, byte ptr [rsp + 56] # 1-byte Folded Reload
+ shl cl, 2
+ or cl, bl
+ mov ebx, ecx
+ movzx ecx, byte ptr [rsp + 40] # 1-byte Folded Reload
+ shl cl, 3
+ or cl, bl
+ mov ebx, ecx
+ movzx ecx, byte ptr [rsp + 320] # 1-byte Folded Reload
+ shl cl, 4
+ or cl, bl
+ mov ebx, ecx
+ movzx ecx, byte ptr [rsp + 288] # 1-byte Folded Reload
+ shl cl, 5
+ or cl, bl
+ movzx ebx, byte ptr [rsp + 28] # 1-byte Folded Reload
+ shl bl, 6
+ shl r8b, 7
+ or r8b, bl
+ or r8b, cl
+ mov byte ptr [rdx + 2], r15b
+ mov byte ptr [rdx + 3], r8b
+ add rsi, 256
+ add rdx, 4
+ mov qword ptr [rsp + 272], rdx # 8-byte Spill
+ add qword ptr [rsp + 168], -1 # 8-byte Folded Spill
+ jne .LBB1_71
+# %bb.72:
+ mov r14, qword ptr [rsp + 272] # 8-byte Reload
+ mov r10, qword ptr [rsp + 280] # 8-byte Reload
+ mov r15, qword ptr [rsp + 176] # 8-byte Reload
+ shl r15, 5
+ cmp r15, r10
+ jl .LBB1_124
+ jmp .LBB1_164
+.LBB1_73:
+ lea r15, [r10 + 31]
+ test r10, r10
+ cmovns r15, r10
+ lea eax, [r9 + 7]
+ test r9d, r9d
+ cmovns eax, r9d
+ and eax, -8
+ vmovss xmm0, dword ptr [rdx] # xmm0 = mem[0],zero,zero,zero
+ sub r9d, eax
+ je .LBB1_77
+# %bb.74:
+ movsxd rax, r9d
+ .p2align 4, 0x90
+.LBB1_75: # =>This Inner Loop Header: Depth=1
+ vucomiss xmm0, dword ptr [rsi]
+ lea rsi, [rsi + 4]
+ sete dl
+ neg dl
+ lea rdi, [rax + 7]
+ test rax, rax
+ cmovns rdi, rax
+ sar rdi, 3
+ movzx r9d, byte ptr [r11 + rdi]
+ xor dl, r9b
+ lea r8d, [8*rdi]
+ mov ecx, eax
+ sub ecx, r8d
+ mov ebx, 1
+ # kill: def $cl killed $cl killed $ecx
+ shl ebx, cl
+ and bl, dl
+ xor bl, r9b
+ mov byte ptr [r11 + rdi], bl
+ add rax, 1
+ cmp rax, 8
+ jne .LBB1_75
+# %bb.76:
+ add r11, 1
+.LBB1_77:
+ sar r15, 5
+ cmp r10, 32
+ jl .LBB1_126
+# %bb.78:
+ mov qword ptr [rsp + 280], r10 # 8-byte Spill
+ mov qword ptr [rsp + 168], r15 # 8-byte Spill
+ mov qword ptr [rsp + 152], r15 # 8-byte Spill
+ mov qword ptr [rsp + 272], r11 # 8-byte Spill
+ .p2align 4, 0x90
+.LBB1_79: # =>This Inner Loop Header: Depth=1
+ vucomiss xmm0, dword ptr [rsi]
+ sete byte ptr [rsp + 160] # 1-byte Folded Spill
+ vucomiss xmm0, dword ptr [rsi + 4]
+ sete r9b
+ vucomiss xmm0, dword ptr [rsi + 8]
+ sete r14b
+ vucomiss xmm0, dword ptr [rsi + 12]
+ sete r13b
+ vucomiss xmm0, dword ptr [rsi + 16]
+ sete byte ptr [rsp + 136] # 1-byte Folded Spill
+ vucomiss xmm0, dword ptr [rsi + 20]
+ sete byte ptr [rsp + 88] # 1-byte Folded Spill
+ vucomiss xmm0, dword ptr [rsi + 24]
+ sete al
+ vucomiss xmm0, dword ptr [rsi + 28]
+ sete bl
+ vucomiss xmm0, dword ptr [rsi + 32]
+ sete byte ptr [rsp + 112] # 1-byte Folded Spill
+ vucomiss xmm0, dword ptr [rsi + 36]
+ sete dl
+ vucomiss xmm0, dword ptr [rsi + 40]
+ sete dil
+ vucomiss xmm0, dword ptr [rsi + 44]
+ sete r10b
+ vucomiss xmm0, dword ptr [rsi + 48]
+ sete r11b
+ vucomiss xmm0, dword ptr [rsi + 52]
+ sete r12b
+ vucomiss xmm0, dword ptr [rsi + 56]
+ sete byte ptr [rsp + 120] # 1-byte Folded Spill
+ vucomiss xmm0, dword ptr [rsi + 60]
+ sete cl
+ vucomiss xmm0, dword ptr [rsi + 64]
+ sete byte ptr [rsp + 72] # 1-byte Folded Spill
+ vucomiss xmm0, dword ptr [rsi + 68]
+ sete byte ptr [rsp + 104] # 1-byte Folded Spill
+ vucomiss xmm0, dword ptr [rsi + 72]
+ sete byte ptr [rsp + 128] # 1-byte Folded Spill
+ vucomiss xmm0, dword ptr [rsi + 76]
+ sete byte ptr [rsp + 144] # 1-byte Folded Spill
+ vucomiss xmm0, dword ptr [rsi + 80]
+ sete byte ptr [rsp + 80] # 1-byte Folded Spill
+ vucomiss xmm0, dword ptr [rsi + 84]
+ sete byte ptr [rsp + 96] # 1-byte Folded Spill
+ vucomiss xmm0, dword ptr [rsi + 88]
+ sete byte ptr [rsp + 64] # 1-byte Folded Spill
+ vucomiss xmm0, dword ptr [rsi + 92]
+ sete r15b
+ vucomiss xmm0, dword ptr [rsi + 96]
+ sete byte ptr [rsp + 32] # 1-byte Folded Spill
+ vucomiss xmm0, dword ptr [rsi + 100]
+ sete byte ptr [rsp + 48] # 1-byte Folded Spill
+ vucomiss xmm0, dword ptr [rsi + 104]
+ sete byte ptr [rsp + 56] # 1-byte Folded Spill
+ vucomiss xmm0, dword ptr [rsi + 108]
+ sete byte ptr [rsp + 40] # 1-byte Folded Spill
+ vucomiss xmm0, dword ptr [rsi + 112]
+ sete byte ptr [rsp + 320] # 1-byte Folded Spill
+ vucomiss xmm0, dword ptr [rsi + 116]
+ sete byte ptr [rsp + 288] # 1-byte Folded Spill
+ vucomiss xmm0, dword ptr [rsi + 120]
+ sete byte ptr [rsp + 28] # 1-byte Folded Spill
+ vucomiss xmm0, dword ptr [rsi + 124]
+ sete r8b
+ add r9b, r9b
+ add r9b, byte ptr [rsp + 160] # 1-byte Folded Reload
+ shl al, 6
+ shl bl, 7
+ or bl, al
+ shl r14b, 2
+ or r14b, r9b
+ add dl, dl
+ add dl, byte ptr [rsp + 112] # 1-byte Folded Reload
+ shl r13b, 3
+ or r13b, r14b
+ shl dil, 2
+ or dil, dl
+ movzx edx, byte ptr [rsp + 136] # 1-byte Folded Reload
+ shl dl, 4
+ or dl, r13b
+ mov r9d, edx
+ shl r10b, 3
+ or r10b, dil
+ movzx edx, byte ptr [rsp + 88] # 1-byte Folded Reload
+ shl dl, 5
+ or dl, r9b
+ shl r11b, 4
+ or r11b, r10b
+ shl r12b, 5
+ or r12b, r11b
+ movzx edi, byte ptr [rsp + 120] # 1-byte Folded Reload
+ shl dil, 6
+ shl cl, 7
+ or cl, dil
+ or bl, dl
+ or cl, r12b
+ movzx eax, byte ptr [rsp + 104] # 1-byte Folded Reload
+ add al, al
+ add al, byte ptr [rsp + 72] # 1-byte Folded Reload
+ movzx edx, byte ptr [rsp + 128] # 1-byte Folded Reload
+ shl dl, 2
+ or dl, al
+ mov edi, edx
+ movzx edx, byte ptr [rsp + 144] # 1-byte Folded Reload
+ shl dl, 3
+ or dl, dil
+ mov edi, edx
+ movzx edx, byte ptr [rsp + 80] # 1-byte Folded Reload
+ shl dl, 4
+ or dl, dil
+ mov edi, edx
+ movzx edx, byte ptr [rsp + 96] # 1-byte Folded Reload
+ shl dl, 5
+ or dl, dil
+ mov edi, edx
+ mov rdx, qword ptr [rsp + 272] # 8-byte Reload
+ mov byte ptr [rdx], bl
+ movzx ebx, byte ptr [rsp + 64] # 1-byte Folded Reload
+ shl bl, 6
+ shl r15b, 7
+ or r15b, bl
+ mov byte ptr [rdx + 1], cl
+ or r15b, dil
+ movzx ecx, byte ptr [rsp + 48] # 1-byte Folded Reload
+ add cl, cl
+ add cl, byte ptr [rsp + 32] # 1-byte Folded Reload
+ mov ebx, ecx
+ movzx ecx, byte ptr [rsp + 56] # 1-byte Folded Reload
+ shl cl, 2
+ or cl, bl
+ mov ebx, ecx
+ movzx ecx, byte ptr [rsp + 40] # 1-byte Folded Reload
+ shl cl, 3
+ or cl, bl
+ mov ebx, ecx
+ movzx ecx, byte ptr [rsp + 320] # 1-byte Folded Reload
+ shl cl, 4
+ or cl, bl
+ mov ebx, ecx
+ movzx ecx, byte ptr [rsp + 288] # 1-byte Folded Reload
+ shl cl, 5
+ or cl, bl
+ movzx ebx, byte ptr [rsp + 28] # 1-byte Folded Reload
+ shl bl, 6
+ shl r8b, 7
+ or r8b, bl
+ or r8b, cl
+ mov byte ptr [rdx + 2], r15b
+ mov byte ptr [rdx + 3], r8b
+ add rsi, 128
+ add rdx, 4
+ mov qword ptr [rsp + 272], rdx # 8-byte Spill
+ add qword ptr [rsp + 152], -1 # 8-byte Folded Spill
+ jne .LBB1_79
+# %bb.80:
+ mov r14, qword ptr [rsp + 272] # 8-byte Reload
+ mov r10, qword ptr [rsp + 280] # 8-byte Reload
+ mov r15, qword ptr [rsp + 168] # 8-byte Reload
+ shl r15, 5
+ cmp r15, r10
+ jl .LBB1_127
+ jmp .LBB1_164
+.LBB1_81:
+ mov r14b, byte ptr [rdx]
+ lea r15, [r10 + 31]
+ test r10, r10
+ cmovns r15, r10
+ lea eax, [r9 + 7]
+ test r9d, r9d
+ cmovns eax, r9d
+ and eax, -8
+ sub r9d, eax
+ je .LBB1_85
+# %bb.82:
+ movsxd rax, r9d
+ .p2align 4, 0x90
+.LBB1_83: # =>This Inner Loop Header: Depth=1
+ cmp byte ptr [rsi], r14b
+ lea rsi, [rsi + 1]
+ sete dl
+ neg dl
+ lea rdi, [rax + 7]
+ test rax, rax
+ cmovns rdi, rax
+ sar rdi, 3
+ movzx r9d, byte ptr [r11 + rdi]
+ xor dl, r9b
+ lea r8d, [8*rdi]
+ mov ecx, eax
+ sub ecx, r8d
+ mov ebx, 1
+ # kill: def $cl killed $cl killed $ecx
+ shl ebx, cl
+ and bl, dl
+ xor bl, r9b
+ mov byte ptr [r11 + rdi], bl
+ add rax, 1
+ cmp rax, 8
+ jne .LBB1_83
+# %bb.84:
+ add r11, 1
+.LBB1_85:
+ sar r15, 5
+ cmp r10, 32
+ jl .LBB1_129
+# %bb.86:
+ cmp r15, 32
+ mov dword ptr [rsp + 28], r14d # 4-byte Spill
+ mov qword ptr [rsp + 280], r10 # 8-byte Spill
+ mov qword ptr [rsp + 392], r15 # 8-byte Spill
+ jb .LBB1_89
+# %bb.87:
+ mov rax, r15
+ shl rax, 5
+ add rax, rsi
+ cmp r11, rax
+ jae .LBB1_168
+# %bb.88:
+ lea rax, [r11 + 4*r15]
+ cmp rsi, rax
+ jae .LBB1_168
+.LBB1_89:
+ xor eax, eax
+ mov qword ptr [rsp + 384], rax # 8-byte Spill
+ mov r12, rsi
+ mov qword ptr [rsp + 376], r11 # 8-byte Spill
+.LBB1_90:
+ sub r15, qword ptr [rsp + 384] # 8-byte Folded Reload
+ mov qword ptr [rsp + 152], r15 # 8-byte Spill
+ .p2align 4, 0x90
+.LBB1_91: # =>This Inner Loop Header: Depth=1
+ mov rcx, r12
+ cmp byte ptr [r12], r14b
+ sete byte ptr [rsp + 32] # 1-byte Folded Spill
+ cmp byte ptr [r12 + 1], r14b
+ sete r8b
+ cmp byte ptr [r12 + 2], r14b
+ sete r15b
+ cmp byte ptr [r12 + 3], r14b
+ sete r13b
+ cmp byte ptr [r12 + 4], r14b
+ sete byte ptr [rsp + 160] # 1-byte Folded Spill
+ cmp byte ptr [r12 + 5], r14b
+ sete byte ptr [rsp + 112] # 1-byte Folded Spill
+ cmp byte ptr [r12 + 6], r14b
+ sete al
+ cmp byte ptr [r12 + 7], r14b
+ sete r11b
+ cmp byte ptr [r12 + 8], r14b
+ sete byte ptr [rsp + 320] # 1-byte Folded Spill
+ cmp byte ptr [r12 + 9], r14b
+ sete dl
+ cmp byte ptr [r12 + 10], r14b
+ sete sil
+ cmp byte ptr [r12 + 11], r14b
+ sete dil
+ cmp byte ptr [r12 + 12], r14b
+ sete r10b
+ cmp byte ptr [r12 + 13], r14b
+ sete r12b
+ cmp byte ptr [rcx + 14], r14b
+ sete byte ptr [rsp + 104] # 1-byte Folded Spill
+ cmp byte ptr [rcx + 15], r14b
+ sete r9b
+ cmp byte ptr [rcx + 16], r14b
+ sete byte ptr [rsp + 288] # 1-byte Folded Spill
+ cmp byte ptr [rcx + 17], r14b
+ sete byte ptr [rsp + 136] # 1-byte Folded Spill
+ cmp byte ptr [rcx + 18], r14b
+ sete byte ptr [rsp + 120] # 1-byte Folded Spill
+ cmp byte ptr [rcx + 19], r14b
+ sete byte ptr [rsp + 128] # 1-byte Folded Spill
+ cmp byte ptr [rcx + 20], r14b
+ sete byte ptr [rsp + 144] # 1-byte Folded Spill
+ cmp byte ptr [rcx + 21], r14b
+ sete byte ptr [rsp + 72] # 1-byte Folded Spill
+ cmp byte ptr [rcx + 22], r14b
+ sete byte ptr [rsp + 80] # 1-byte Folded Spill
+ cmp byte ptr [rcx + 23], r14b
+ sete r14b
+ mov ebx, dword ptr [rsp + 28] # 4-byte Reload
+ cmp byte ptr [rcx + 24], bl
+ sete byte ptr [rsp + 272] # 1-byte Folded Spill
+ mov ebx, dword ptr [rsp + 28] # 4-byte Reload
+ cmp byte ptr [rcx + 25], bl
+ sete byte ptr [rsp + 88] # 1-byte Folded Spill
+ mov ebx, dword ptr [rsp + 28] # 4-byte Reload
+ cmp byte ptr [rcx + 26], bl
+ sete byte ptr [rsp + 96] # 1-byte Folded Spill
+ mov ebx, dword ptr [rsp + 28] # 4-byte Reload
+ cmp byte ptr [rcx + 27], bl
+ sete byte ptr [rsp + 64] # 1-byte Folded Spill
+ mov ebx, dword ptr [rsp + 28] # 4-byte Reload
+ cmp byte ptr [rcx + 28], bl
+ sete byte ptr [rsp + 48] # 1-byte Folded Spill
+ mov ebx, dword ptr [rsp + 28] # 4-byte Reload
+ cmp byte ptr [rcx + 29], bl
+ sete byte ptr [rsp + 56] # 1-byte Folded Spill
+ mov ebx, dword ptr [rsp + 28] # 4-byte Reload
+ cmp byte ptr [rcx + 30], bl
+ sete byte ptr [rsp + 40] # 1-byte Folded Spill
+ mov ebx, dword ptr [rsp + 28] # 4-byte Reload
+ cmp byte ptr [rcx + 31], bl
+ sete bl
+ add r8b, r8b
+ add r8b, byte ptr [rsp + 32] # 1-byte Folded Reload
+ shl al, 6
+ shl r11b, 7
+ or r11b, al
+ shl r15b, 2
+ or r15b, r8b
+ add dl, dl
+ add dl, byte ptr [rsp + 320] # 1-byte Folded Reload
+ shl r13b, 3
+ or r13b, r15b
+ shl sil, 2
+ or sil, dl
+ movzx edx, byte ptr [rsp + 160] # 1-byte Folded Reload
+ shl dl, 4
+ or dl, r13b
+ mov r8d, edx
+ shl dil, 3
+ or dil, sil
+ movzx edx, byte ptr [rsp + 112] # 1-byte Folded Reload
+ shl dl, 5
+ or dl, r8b
+ shl r10b, 4
+ or r10b, dil
+ shl r12b, 5
+ or r12b, r10b
+ movzx esi, byte ptr [rsp + 104] # 1-byte Folded Reload
+ shl sil, 6
+ shl r9b, 7
+ or r9b, sil
+ or r11b, dl
+ or r9b, r12b
+ movzx eax, byte ptr [rsp + 136] # 1-byte Folded Reload
+ add al, al
+ add al, byte ptr [rsp + 288] # 1-byte Folded Reload
+ movzx edx, byte ptr [rsp + 120] # 1-byte Folded Reload
+ shl dl, 2
+ or dl, al
+ mov esi, edx
+ movzx edx, byte ptr [rsp + 128] # 1-byte Folded Reload
+ shl dl, 3
+ or dl, sil
+ mov esi, edx
+ movzx edx, byte ptr [rsp + 144] # 1-byte Folded Reload
+ shl dl, 4
+ or dl, sil
+ mov esi, edx
+ movzx edx, byte ptr [rsp + 72] # 1-byte Folded Reload
+ shl dl, 5
+ or dl, sil
+ mov rsi, qword ptr [rsp + 376] # 8-byte Reload
+ mov byte ptr [rsi], r11b
+ movzx edi, byte ptr [rsp + 80] # 1-byte Folded Reload
+ shl dil, 6
+ shl r14b, 7
+ or r14b, dil
+ mov byte ptr [rsi + 1], r9b
+ or r14b, dl
+ movzx eax, byte ptr [rsp + 88] # 1-byte Folded Reload
+ add al, al
+ add al, byte ptr [rsp + 272] # 1-byte Folded Reload
+ mov edx, eax
+ movzx eax, byte ptr [rsp + 96] # 1-byte Folded Reload
+ shl al, 2
+ or al, dl
+ mov edx, eax
+ movzx eax, byte ptr [rsp + 64] # 1-byte Folded Reload
+ shl al, 3
+ or al, dl
+ mov edx, eax
+ movzx eax, byte ptr [rsp + 48] # 1-byte Folded Reload
+ shl al, 4
+ or al, dl
+ mov edx, eax
+ movzx eax, byte ptr [rsp + 56] # 1-byte Folded Reload
+ shl al, 5
+ or al, dl
+ movzx edx, byte ptr [rsp + 40] # 1-byte Folded Reload
+ shl dl, 6
+ shl bl, 7
+ or bl, dl
+ or bl, al
+ mov byte ptr [rsi + 2], r14b
+ mov r14d, dword ptr [rsp + 28] # 4-byte Reload
+ mov byte ptr [rsi + 3], bl
+ lea r12, [rcx + 32]
+ add rsi, 4
+ mov qword ptr [rsp + 376], rsi # 8-byte Spill
+ add qword ptr [rsp + 152], -1 # 8-byte Folded Spill
+ jne .LBB1_91
+# %bb.92:
+ mov r10, qword ptr [rsp + 280] # 8-byte Reload
+ mov r15, qword ptr [rsp + 392] # 8-byte Reload
+ jmp .LBB1_130
+.LBB1_93:
+ mov r13d, dword ptr [rdx]
+ lea r15, [r10 + 31]
+ test r10, r10
+ cmovns r15, r10
+ lea eax, [r9 + 7]
+ test r9d, r9d
+ cmovns eax, r9d
+ and eax, -8
+ sub r9d, eax
+ je .LBB1_97
+# %bb.94:
+ movsxd rax, r9d
+ .p2align 4, 0x90
+.LBB1_95: # =>This Inner Loop Header: Depth=1
+ cmp dword ptr [rsi], r13d
+ lea rsi, [rsi + 4]
+ sete dl
+ neg dl
+ lea rbx, [rax + 7]
+ test rax, rax
+ cmovns rbx, rax
+ sar rbx, 3
+ movzx r8d, byte ptr [r11 + rbx]
+ xor dl, r8b
+ lea edi, [8*rbx]
+ mov ecx, eax
+ sub ecx, edi
+ mov edi, 1
+ # kill: def $cl killed $cl killed $ecx
+ shl edi, cl
+ and dil, dl
+ xor dil, r8b
+ mov byte ptr [r11 + rbx], dil
+ add rax, 1
+ cmp rax, 8
+ jne .LBB1_95
+# %bb.96:
+ add r11, 1
+.LBB1_97:
+ sar r15, 5
+ cmp r10, 32
+ jl .LBB1_133
+# %bb.98:
+ mov qword ptr [rsp + 280], r10 # 8-byte Spill
+ mov qword ptr [rsp + 176], r15 # 8-byte Spill
+ mov qword ptr [rsp + 168], r15 # 8-byte Spill
+ .p2align 4, 0x90
+.LBB1_99: # =>This Inner Loop Header: Depth=1
+ mov qword ptr [rsp + 272], r11 # 8-byte Spill
+ cmp dword ptr [rsi], r13d
+ sete byte ptr [rsp + 152] # 1-byte Folded Spill
+ cmp dword ptr [rsi + 4], r13d
+ sete dil
+ cmp dword ptr [rsi + 8], r13d
+ sete r14b
+ cmp dword ptr [rsi + 12], r13d
+ sete byte ptr [rsp + 160] # 1-byte Folded Spill
+ cmp dword ptr [rsi + 16], r13d
+ sete byte ptr [rsp + 136] # 1-byte Folded Spill
+ cmp dword ptr [rsi + 20], r13d
+ sete byte ptr [rsp + 88] # 1-byte Folded Spill
+ cmp dword ptr [rsi + 24], r13d
+ sete al
+ cmp dword ptr [rsi + 28], r13d
+ sete bl
+ cmp dword ptr [rsi + 32], r13d
+ sete byte ptr [rsp + 104] # 1-byte Folded Spill
+ cmp dword ptr [rsi + 36], r13d
+ sete dl
+ cmp dword ptr [rsi + 40], r13d
+ sete r9b
+ cmp dword ptr [rsi + 44], r13d
+ sete r10b
+ cmp dword ptr [rsi + 48], r13d
+ sete r11b
+ cmp dword ptr [rsi + 52], r13d
+ sete r12b
+ cmp dword ptr [rsi + 56], r13d
+ sete byte ptr [rsp + 112] # 1-byte Folded Spill
+ cmp dword ptr [rsi + 60], r13d
+ sete cl
+ cmp dword ptr [rsi + 64], r13d
+ sete byte ptr [rsp + 72] # 1-byte Folded Spill
+ cmp dword ptr [rsi + 68], r13d
+ sete byte ptr [rsp + 120] # 1-byte Folded Spill
+ cmp dword ptr [rsi + 72], r13d
+ sete byte ptr [rsp + 128] # 1-byte Folded Spill
+ cmp dword ptr [rsi + 76], r13d
+ sete byte ptr [rsp + 144] # 1-byte Folded Spill
+ cmp dword ptr [rsi + 80], r13d
+ sete byte ptr [rsp + 80] # 1-byte Folded Spill
+ cmp dword ptr [rsi + 84], r13d
+ sete byte ptr [rsp + 96] # 1-byte Folded Spill
+ cmp dword ptr [rsi + 88], r13d
+ sete byte ptr [rsp + 64] # 1-byte Folded Spill
+ cmp dword ptr [rsi + 92], r13d
+ sete r15b
+ cmp dword ptr [rsi + 96], r13d
+ sete byte ptr [rsp + 32] # 1-byte Folded Spill
+ cmp dword ptr [rsi + 100], r13d
+ sete byte ptr [rsp + 48] # 1-byte Folded Spill
+ cmp dword ptr [rsi + 104], r13d
+ sete byte ptr [rsp + 56] # 1-byte Folded Spill
+ cmp dword ptr [rsi + 108], r13d
+ sete byte ptr [rsp + 40] # 1-byte Folded Spill
+ cmp dword ptr [rsi + 112], r13d
+ sete byte ptr [rsp + 320] # 1-byte Folded Spill
+ cmp dword ptr [rsi + 116], r13d
+ sete byte ptr [rsp + 288] # 1-byte Folded Spill
+ cmp dword ptr [rsi + 120], r13d
+ sete byte ptr [rsp + 28] # 1-byte Folded Spill
+ cmp dword ptr [rsi + 124], r13d
+ sete r8b
+ add dil, dil
+ add dil, byte ptr [rsp + 152] # 1-byte Folded Reload
+ shl al, 6
+ shl bl, 7
+ or bl, al
+ shl r14b, 2
+ or r14b, dil
+ add dl, dl
+ add dl, byte ptr [rsp + 104] # 1-byte Folded Reload
+ movzx eax, byte ptr [rsp + 160] # 1-byte Folded Reload
+ shl al, 3
+ or al, r14b
+ shl r9b, 2
+ or r9b, dl
+ movzx edx, byte ptr [rsp + 136] # 1-byte Folded Reload
+ shl dl, 4
+ or dl, al
+ mov edi, edx
+ shl r10b, 3
+ or r10b, r9b
+ movzx edx, byte ptr [rsp + 88] # 1-byte Folded Reload
+ shl dl, 5
+ or dl, dil
+ shl r11b, 4
+ or r11b, r10b
+ shl r12b, 5
+ or r12b, r11b
+ mov r11, qword ptr [rsp + 272] # 8-byte Reload
+ movzx edi, byte ptr [rsp + 112] # 1-byte Folded Reload
+ shl dil, 6
+ shl cl, 7
+ or cl, dil
+ or bl, dl
+ or cl, r12b
+ movzx edx, byte ptr [rsp + 120] # 1-byte Folded Reload
+ add dl, dl
+ add dl, byte ptr [rsp + 72] # 1-byte Folded Reload
+ mov edi, edx
+ movzx edx, byte ptr [rsp + 128] # 1-byte Folded Reload
+ shl dl, 2
+ or dl, dil
+ mov edi, edx
+ movzx edx, byte ptr [rsp + 144] # 1-byte Folded Reload
+ shl dl, 3
+ or dl, dil
+ mov edi, edx
+ movzx edx, byte ptr [rsp + 80] # 1-byte Folded Reload
+ shl dl, 4
+ or dl, dil
+ mov edi, edx
+ movzx edx, byte ptr [rsp + 96] # 1-byte Folded Reload
+ shl dl, 5
+ or dl, dil
+ mov byte ptr [r11], bl
+ movzx ebx, byte ptr [rsp + 64] # 1-byte Folded Reload
+ shl bl, 6
+ shl r15b, 7
+ or r15b, bl
+ mov byte ptr [r11 + 1], cl
+ or r15b, dl
+ movzx ecx, byte ptr [rsp + 48] # 1-byte Folded Reload
+ add cl, cl
+ add cl, byte ptr [rsp + 32] # 1-byte Folded Reload
+ mov edx, ecx
+ movzx ecx, byte ptr [rsp + 56] # 1-byte Folded Reload
+ shl cl, 2
+ or cl, dl
+ mov edx, ecx
+ movzx ecx, byte ptr [rsp + 40] # 1-byte Folded Reload
+ shl cl, 3
+ or cl, dl
+ mov edx, ecx
+ movzx ecx, byte ptr [rsp + 320] # 1-byte Folded Reload
+ shl cl, 4
+ or cl, dl
+ mov edx, ecx
+ movzx ecx, byte ptr [rsp + 288] # 1-byte Folded Reload
+ shl cl, 5
+ or cl, dl
+ movzx edx, byte ptr [rsp + 28] # 1-byte Folded Reload
+ shl dl, 6
+ shl r8b, 7
+ or r8b, dl
+ or r8b, cl
+ mov byte ptr [r11 + 2], r15b
+ mov byte ptr [r11 + 3], r8b
+ add rsi, 128
+ add r11, 4
+ add qword ptr [rsp + 168], -1 # 8-byte Folded Spill
+ jne .LBB1_99
+# %bb.100:
+ mov r14, r11
+ mov r10, qword ptr [rsp + 280] # 8-byte Reload
+ mov r15, qword ptr [rsp + 176] # 8-byte Reload
+ shl r15, 5
+ cmp r15, r10
+ jl .LBB1_134
+ jmp .LBB1_164
+.LBB1_101:
+ mov r14, r11
+ shl r15, 5
+ cmp r15, r10
+ jge .LBB1_164
+.LBB1_102:
+ mov r8, r10
+ sub r8, r15
+ not r15
+ add r15, r10
+ je .LBB1_135
+# %bb.103:
+ mov r10, r8
+ and r10, -2
+ xor r11d, r11d
+ mov r15, r14
+ .p2align 4, 0x90
+.LBB1_104: # =>This Inner Loop Header: Depth=1
+ cmp dword ptr [rsi], r13d
+ sete al
+ neg al
+ mov rdi, r11
+ shr rdi, 3
+ movzx r9d, byte ptr [r15 + rdi]
+ mov ecx, r11d
+ and cl, 6
+ mov bl, 1
+ shl bl, cl
+ xor al, r9b
+ and bl, al
+ xor bl, r9b
+ mov byte ptr [r15 + rdi], bl
+ add r11, 2
+ cmp dword ptr [rsi + 4], r13d
+ lea rsi, [rsi + 8]
+ sete al
+ neg al
+ xor al, bl
+ or cl, 1
+ mov dl, 1
+ shl dl, cl
+ and dl, al
+ xor dl, bl
+ mov byte ptr [r15 + rdi], dl
+ cmp r10, r11
+ jne .LBB1_104
+ jmp .LBB1_161
+.LBB1_105:
+ mov r14, r11
+ shl r15, 5
+ cmp r15, r10
+ jge .LBB1_164
+.LBB1_106:
+ mov r8, r10
+ sub r8, r15
+ not r15
+ add r15, r10
+ jne .LBB1_136
+# %bb.107:
+ xor r11d, r11d
+ jmp .LBB1_138
+.LBB1_108:
+ mov qword ptr [rsp + 376], r11 # 8-byte Spill
+ mov r12, rsi
+.LBB1_109:
+ shl r13, 5
+ cmp r13, r15
+ jge .LBB1_164
+# %bb.110:
+ mov r8, r15
+ sub r8, r13
+ not r13
+ add r13, r15
+ je .LBB1_132
+# %bb.140:
+ mov r10, r8
+ and r10, -2
+ xor esi, esi
+ mov r11, qword ptr [rsp + 376] # 8-byte Reload
+ .p2align 4, 0x90
+.LBB1_141: # =>This Inner Loop Header: Depth=1
+ cmp byte ptr [r12 + rsi], r14b
+ sete bl
+ neg bl
+ mov rdi, rsi
+ shr rdi, 3
+ mov ecx, esi
+ and cl, 6
+ mov dl, 1
+ shl dl, cl
+ movzx r9d, byte ptr [r11 + rdi]
+ xor bl, r9b
+ and dl, bl
+ xor dl, r9b
+ mov byte ptr [r11 + rdi], dl
+ cmp byte ptr [r12 + rsi + 1], r14b
+ lea rsi, [rsi + 2]
+ sete bl
+ neg bl
+ xor bl, dl
+ or cl, 1
+ mov al, 1
+ shl al, cl
+ and al, bl
+ xor al, dl
+ mov byte ptr [r11 + rdi], al
+ cmp r10, rsi
+ jne .LBB1_141
+ jmp .LBB1_156
+.LBB1_112:
+ mov r14, r11
+ shl r15, 5
+ cmp r15, r10
+ jge .LBB1_164
+.LBB1_113:
+ mov r8, r10
+ sub r8, r15
+ not r15
+ add r15, r10
+ je .LBB1_125
+# %bb.114:
+ mov r10, r8
+ and r10, -2
+ xor r11d, r11d
+ mov r15, r14
+ .p2align 4, 0x90
+.LBB1_115: # =>This Inner Loop Header: Depth=1
+ cmp qword ptr [rsi], r13
+ sete al
+ neg al
+ mov rdi, r11
+ shr rdi, 3
+ movzx r9d, byte ptr [r15 + rdi]
+ mov ecx, r11d
+ and cl, 6
+ mov bl, 1
+ shl bl, cl
+ xor al, r9b
+ and bl, al
+ xor bl, r9b
+ mov byte ptr [r15 + rdi], bl
+ add r11, 2
+ cmp qword ptr [rsi + 8], r13
+ lea rsi, [rsi + 16]
+ sete al
+ neg al
+ xor al, bl
+ or cl, 1
+ mov dl, 1
+ shl dl, cl
+ and dl, al
+ xor dl, bl
+ mov byte ptr [r15 + rdi], dl
+ cmp r10, r11
+ jne .LBB1_115
+ jmp .LBB1_148
+.LBB1_116:
+ mov r14, r11
+ shl r15, 5
+ cmp r15, r10
+ jge .LBB1_164
+.LBB1_117:
+ mov r8, r10
+ sub r8, r15
+ not r15
+ add r15, r10
+ je .LBB1_122
+# %bb.118:
+ mov r10, r8
+ and r10, -2
+ xor r11d, r11d
+ mov r15, r14
+ .p2align 4, 0x90
+.LBB1_119: # =>This Inner Loop Header: Depth=1
+ cmp word ptr [rsi], r13w
+ sete al
+ neg al
+ mov rdi, r11
+ shr rdi, 3
+ movzx r9d, byte ptr [r15 + rdi]
+ mov ecx, r11d
+ and cl, 6
+ mov bl, 1
+ shl bl, cl
+ xor al, r9b
+ and bl, al
+ xor bl, r9b
+ mov byte ptr [r15 + rdi], bl
+ add r11, 2
+ cmp word ptr [rsi + 2], r13w
+ lea rsi, [rsi + 4]
+ sete al
+ neg al
+ xor al, bl
+ or cl, 1
+ mov dl, 1
+ shl dl, cl
+ and dl, al
+ xor dl, bl
+ mov byte ptr [r15 + rdi], dl
+ cmp r10, r11
+ jne .LBB1_119
+ jmp .LBB1_144
+.LBB1_120:
+ mov r14, r11
+ shl r15, 5
+ cmp r15, r10
+ jge .LBB1_164
+.LBB1_121:
+ mov r8, r10
+ sub r8, r15
+ not r15
+ add r15, r10
+ jne .LBB1_142
+.LBB1_122:
+ xor r11d, r11d
+ jmp .LBB1_144
+.LBB1_123:
+ mov r14, r11
+ shl r15, 5
+ cmp r15, r10
+ jge .LBB1_164
+.LBB1_124:
+ mov r8, r10
+ sub r8, r15
+ not r15
+ add r15, r10
+ jne .LBB1_146
+.LBB1_125:
+ xor r11d, r11d
+ jmp .LBB1_148
+.LBB1_126:
+ mov r14, r11
+ shl r15, 5
+ cmp r15, r10
+ jge .LBB1_164
+.LBB1_127:
+ mov r8, r10
+ sub r8, r15
+ not r15
+ add r15, r10
+ jne .LBB1_150
+# %bb.128:
+ xor r11d, r11d
+ jmp .LBB1_152
+.LBB1_129:
+ mov qword ptr [rsp + 376], r11 # 8-byte Spill
+ mov r12, rsi
+.LBB1_130:
+ shl r15, 5
+ cmp r15, r10
+ jge .LBB1_164
+# %bb.131:
+ mov r8, r10
+ sub r8, r15
+ not r15
+ add r15, r10
+ jne .LBB1_154
+.LBB1_132:
+ xor esi, esi
+ jmp .LBB1_157
+.LBB1_133:
+ mov r14, r11
+ shl r15, 5
+ cmp r15, r10
+ jge .LBB1_164
+.LBB1_134:
+ mov r8, r10
+ sub r8, r15
+ not r15
+ add r15, r10
+ jne .LBB1_159
+.LBB1_135:
+ xor r11d, r11d
+ jmp .LBB1_161
+.LBB1_136:
+ mov r10, r8
+ and r10, -2
+ xor r11d, r11d
+ mov r15, r14
+ .p2align 4, 0x90
+.LBB1_137: # =>This Inner Loop Header: Depth=1
+ vucomisd xmm0, qword ptr [rsi]
+ sete al
+ neg al
+ mov rdi, r11
+ shr rdi, 3
+ movzx r9d, byte ptr [r15 + rdi]
+ xor al, r9b
+ mov ecx, r11d
+ and cl, 6
+ mov bl, 1
+ shl bl, cl
+ and bl, al
+ xor bl, r9b
+ mov byte ptr [r15 + rdi], bl
+ add r11, 2
+ vucomisd xmm0, qword ptr [rsi + 8]
+ lea rsi, [rsi + 16]
+ sete al
+ neg al
+ xor al, bl
+ or cl, 1
+ mov dl, 1
+ shl dl, cl
+ and dl, al
+ xor dl, bl
+ mov byte ptr [r15 + rdi], dl
+ cmp r10, r11
+ jne .LBB1_137
+.LBB1_138:
+ test r8b, 1
+ je .LBB1_164
+# %bb.139:
+ vucomisd xmm0, qword ptr [rsi]
+ jmp .LBB1_163
+.LBB1_142:
+ mov r10, r8
+ and r10, -2
+ xor r11d, r11d
+ mov r15, r14
+ .p2align 4, 0x90
+.LBB1_143: # =>This Inner Loop Header: Depth=1
+ cmp word ptr [rsi], r13w
+ sete al
+ neg al
+ mov rdi, r11
+ shr rdi, 3
+ movzx r9d, byte ptr [r15 + rdi]
+ mov ecx, r11d
+ and cl, 6
+ mov bl, 1
+ shl bl, cl
+ xor al, r9b
+ and bl, al
+ xor bl, r9b
+ mov byte ptr [r15 + rdi], bl
+ add r11, 2
+ cmp word ptr [rsi + 2], r13w
+ lea rsi, [rsi + 4]
+ sete al
+ neg al
+ xor al, bl
+ or cl, 1
+ mov dl, 1
+ shl dl, cl
+ and dl, al
+ xor dl, bl
+ mov byte ptr [r15 + rdi], dl
+ cmp r10, r11
+ jne .LBB1_143
+.LBB1_144:
+ test r8b, 1
+ je .LBB1_164
+# %bb.145:
+ cmp word ptr [rsi], r13w
+ jmp .LBB1_163
+.LBB1_146:
+ mov r10, r8
+ and r10, -2
+ xor r11d, r11d
+ mov r15, r14
+ .p2align 4, 0x90
+.LBB1_147: # =>This Inner Loop Header: Depth=1
+ cmp qword ptr [rsi], r13
+ sete al
+ neg al
+ mov rdi, r11
+ shr rdi, 3
+ movzx r9d, byte ptr [r15 + rdi]
+ mov ecx, r11d
+ and cl, 6
+ mov bl, 1
+ shl bl, cl
+ xor al, r9b
+ and bl, al
+ xor bl, r9b
+ mov byte ptr [r15 + rdi], bl
+ add r11, 2
+ cmp qword ptr [rsi + 8], r13
+ lea rsi, [rsi + 16]
+ sete al
+ neg al
+ xor al, bl
+ or cl, 1
+ mov dl, 1
+ shl dl, cl
+ and dl, al
+ xor dl, bl
+ mov byte ptr [r15 + rdi], dl
+ cmp r10, r11
+ jne .LBB1_147
+.LBB1_148:
+ test r8b, 1
+ je .LBB1_164
+# %bb.149:
+ cmp qword ptr [rsi], r13
+ jmp .LBB1_163
+.LBB1_150:
+ mov r10, r8
+ and r10, -2
+ xor r11d, r11d
+ mov r15, r14
+ .p2align 4, 0x90
+.LBB1_151: # =>This Inner Loop Header: Depth=1
+ vucomiss xmm0, dword ptr [rsi]
+ sete al
+ neg al
+ mov rdi, r11
+ shr rdi, 3
+ movzx r9d, byte ptr [r15 + rdi]
+ xor al, r9b
+ mov ecx, r11d
+ and cl, 6
+ mov bl, 1
+ shl bl, cl
+ and bl, al
+ xor bl, r9b
+ mov byte ptr [r15 + rdi], bl
+ add r11, 2
+ vucomiss xmm0, dword ptr [rsi + 4]
+ lea rsi, [rsi + 8]
+ sete al
+ neg al
+ xor al, bl
+ or cl, 1
+ mov dl, 1
+ shl dl, cl
+ and dl, al
+ xor dl, bl
+ mov byte ptr [r15 + rdi], dl
+ cmp r10, r11
+ jne .LBB1_151
+.LBB1_152:
+ test r8b, 1
+ je .LBB1_164
+# %bb.153:
+ vucomiss xmm0, dword ptr [rsi]
+ jmp .LBB1_163
+.LBB1_154:
+ mov r10, r8
+ and r10, -2
+ xor esi, esi
+ mov r11, qword ptr [rsp + 376] # 8-byte Reload
+ .p2align 4, 0x90
+.LBB1_155: # =>This Inner Loop Header: Depth=1
+ cmp byte ptr [r12 + rsi], r14b
+ sete bl
+ neg bl
+ mov rdi, rsi
+ shr rdi, 3
+ mov ecx, esi
+ and cl, 6
+ mov dl, 1
+ shl dl, cl
+ movzx r9d, byte ptr [r11 + rdi]
+ xor bl, r9b
+ and dl, bl
+ xor dl, r9b
+ mov byte ptr [r11 + rdi], dl
+ cmp byte ptr [r12 + rsi + 1], r14b
+ lea rsi, [rsi + 2]
+ sete bl
+ neg bl
+ xor bl, dl
+ or cl, 1
+ mov al, 1
+ shl al, cl
+ and al, bl
+ xor al, dl
+ mov byte ptr [r11 + rdi], al
+ cmp r10, rsi
+ jne .LBB1_155
+.LBB1_156:
+ add r12, rsi
+.LBB1_157:
+ test r8b, 1
+ je .LBB1_164
+# %bb.158:
+ cmp byte ptr [r12], r14b
+ sete al
+ neg al
+ mov rdx, rsi
+ shr rdx, 3
+ mov r8, qword ptr [rsp + 376] # 8-byte Reload
+ mov dil, byte ptr [r8 + rdx]
+ and sil, 7
+ mov bl, 1
+ mov ecx, esi
+ shl bl, cl
+ xor al, dil
+ and bl, al
+ xor bl, dil
+ mov byte ptr [r8 + rdx], bl
+ jmp .LBB1_164
+.LBB1_159:
+ mov r10, r8
+ and r10, -2
+ xor r11d, r11d
+ mov r15, r14
+ .p2align 4, 0x90
+.LBB1_160: # =>This Inner Loop Header: Depth=1
+ cmp dword ptr [rsi], r13d
+ sete al
+ neg al
+ mov rdi, r11
+ shr rdi, 3
+ movzx r9d, byte ptr [r15 + rdi]
+ mov ecx, r11d
+ and cl, 6
+ mov bl, 1
+ shl bl, cl
+ xor al, r9b
+ and bl, al
+ xor bl, r9b
+ mov byte ptr [r15 + rdi], bl
+ add r11, 2
+ cmp dword ptr [rsi + 4], r13d
+ lea rsi, [rsi + 8]
+ sete al
+ neg al
+ xor al, bl
+ or cl, 1
+ mov dl, 1
+ shl dl, cl
+ and dl, al
+ xor dl, bl
+ mov byte ptr [r15 + rdi], dl
+ cmp r10, r11
+ jne .LBB1_160
+.LBB1_161:
+ test r8b, 1
+ je .LBB1_164
+# %bb.162:
+ cmp dword ptr [rsi], r13d
+.LBB1_163:
+ sete al
+ neg al
+ mov rdx, r11
+ shr rdx, 3
+ mov sil, byte ptr [r14 + rdx]
+ and r11b, 7
+ mov bl, 1
+ mov ecx, r11d
+ shl bl, cl
+ xor al, sil
+ and bl, al
+ xor bl, sil
+ mov byte ptr [r14 + rdx], bl
+.LBB1_164:
+ lea rsp, [rbp - 40]
+ pop rbx
+ pop r12
+ pop r13
+ pop r14
+ pop r15
+ pop rbp
+ vzeroupper
+ ret
+.LBB1_165:
+ and r13, -32
+ mov rax, r13
+ shl rax, 5
+ add rax, rsi
+ mov qword ptr [rsp + 400], rax # 8-byte Spill
+ mov qword ptr [rsp + 384], r13 # 8-byte Spill
+ lea rax, [r11 + 4*r13]
+ mov qword ptr [rsp + 376], rax # 8-byte Spill
+ vmovd xmm0, r14d
+ vpbroadcastb ymm0, xmm0
+ vmovdqa ymmword ptr [rsp + 512], ymm0 # 32-byte Spill
+ xor eax, eax
+ mov qword ptr [rsp + 272], r11 # 8-byte Spill
+ .p2align 4, 0x90
+.LBB1_166: # =>This Inner Loop Header: Depth=1
+ mov rbx, rax
+ mov qword ptr [rsp + 408], rax # 8-byte Spill
+ shl rbx, 5
+ mov rax, rbx
+ or rax, 32
+ mov qword ptr [rsp + 208], rax # 8-byte Spill
+ mov rax, rbx
+ or rax, 64
+ mov qword ptr [rsp + 88], rax # 8-byte Spill
+ mov rax, rbx
+ or rax, 96
+ mov qword ptr [rsp + 64], rax # 8-byte Spill
+ mov rax, rbx
+ or rax, 128
+ mov qword ptr [rsp + 160], rax # 8-byte Spill
+ mov rax, rbx
+ or rax, 160
+ mov qword ptr [rsp + 320], rax # 8-byte Spill
+ mov rax, rbx
+ or rax, 192
+ mov qword ptr [rsp + 144], rax # 8-byte Spill
+ mov rax, rbx
+ or rax, 224
+ mov qword ptr [rsp + 224], rax # 8-byte Spill
+ mov rax, rbx
+ or rax, 256
+ mov qword ptr [rsp + 264], rax # 8-byte Spill
+ mov rax, rbx
+ or rax, 288
+ mov qword ptr [rsp + 96], rax # 8-byte Spill
+ mov rax, rbx
+ or rax, 320
+ mov qword ptr [rsp + 136], rax # 8-byte Spill
+ mov rax, rbx
+ or rax, 512
+ mov rcx, rax
+ movzx eax, byte ptr [rsi + rax]
+ mov rdx, rcx
+ vmovd xmm0, eax
+ mov rcx, rbx
+ movzx eax, byte ptr [rsi + rbx]
+ vmovd xmm3, eax
+ movzx eax, byte ptr [rsi + rdx + 1]
+ vmovd xmm4, eax
+ movzx eax, byte ptr [rsi + rbx + 1]
+ vmovd xmm10, eax
+ movzx eax, byte ptr [rsi + rdx + 2]
+ vmovd xmm1, eax
+ vmovdqa xmmword ptr [rsp + 480], xmm1 # 16-byte Spill
+ movzx eax, byte ptr [rsi + rbx + 2]
+ vmovd xmm1, eax
+ vmovdqa xmmword ptr [rsp + 448], xmm1 # 16-byte Spill
+ movzx eax, byte ptr [rsi + rdx + 3]
+ vmovd xmm11, eax
+ movzx eax, byte ptr [rsi + rbx + 3]
+ vmovd xmm8, eax
+ movzx eax, byte ptr [rsi + rdx + 4]
+ vmovd xmm1, eax
+ vmovdqa xmmword ptr [rsp + 416], xmm1 # 16-byte Spill
+ movzx eax, byte ptr [rsi + rbx + 4]
+ vmovd xmm13, eax
+ movzx eax, byte ptr [rsi + rdx + 5]
+ vmovd xmm14, eax
+ movzx eax, byte ptr [rsi + rbx + 5]
+ vmovd xmm6, eax
+ movzx eax, byte ptr [rsi + rdx + 6]
+ mov qword ptr [rsp + 240], rdx # 8-byte Spill
+ vmovd xmm12, eax
+ movzx eax, byte ptr [rsi + rbx + 6]
+ vmovd xmm7, eax
+ movzx eax, byte ptr [rsi + rdx + 7]
+ vmovd xmm2, eax
+ movzx eax, byte ptr [rsi + rbx + 7]
+ vmovd xmm1, eax
+ mov rax, rbx
+ or rax, 352
+ mov qword ptr [rsp + 128], rax # 8-byte Spill
+ mov rax, rbx
+ or rax, 384
+ mov qword ptr [rsp + 120], rax # 8-byte Spill
+ mov rax, rbx
+ or rax, 416
+ mov qword ptr [rsp + 32], rax # 8-byte Spill
+ mov rax, rbx
+ or rax, 448
+ mov qword ptr [rsp + 288], rax # 8-byte Spill
+ mov rax, rbx
+ or rax, 480
+ mov qword ptr [rsp + 48], rax # 8-byte Spill
+ mov rax, rbx
+ or rax, 544
+ mov qword ptr [rsp + 152], rax # 8-byte Spill
+ mov rax, rbx
+ or rax, 576
+ mov qword ptr [rsp + 232], rax # 8-byte Spill
+ mov rax, rbx
+ or rax, 608
+ mov qword ptr [rsp + 104], rax # 8-byte Spill
+ mov r15, rbx
+ or r15, 640
+ mov qword ptr [rsp + 176], r15 # 8-byte Spill
+ mov r11, rbx
+ or r11, 672
+ mov qword ptr [rsp + 200], r11 # 8-byte Spill
+ mov r8, rbx
+ or r8, 704
+ mov qword ptr [rsp + 168], r8 # 8-byte Spill
+ mov rdx, rbx
+ or rdx, 736
+ mov qword ptr [rsp + 192], rdx # 8-byte Spill
+ mov r12, rbx
+ or r12, 768
+ mov qword ptr [rsp + 216], r12 # 8-byte Spill
+ mov r14, rbx
+ or r14, 800
+ mov qword ptr [rsp + 184], r14 # 8-byte Spill
+ mov r10, rbx
+ or r10, 832
+ mov qword ptr [rsp + 80], r10 # 8-byte Spill
+ mov r9, rbx
+ or r9, 864
+ mov qword ptr [rsp + 72], r9 # 8-byte Spill
+ mov rax, rbx
+ or rax, 896
+ mov qword ptr [rsp + 248], rax # 8-byte Spill
+ mov rdi, rbx
+ or rdi, 928
+ mov qword ptr [rsp + 112], rdi # 8-byte Spill
+ mov rax, rbx
+ mov qword ptr [rsp + 256], rbx # 8-byte Spill
+ or rax, 960
+ mov qword ptr [rsp + 40], rax # 8-byte Spill
+ or rcx, 992
+ mov qword ptr [rsp + 56], rcx # 8-byte Spill
+ mov r13, qword ptr [rsp + 152] # 8-byte Reload
+ vpinsrb xmm9, xmm0, byte ptr [rsi + r13], 1
+ mov rbx, qword ptr [rsp + 232] # 8-byte Reload
+ vpinsrb xmm0, xmm9, byte ptr [rsi + rbx], 2
+ mov rbx, qword ptr [rsp + 104] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rbx], 3
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r15], 4
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r11], 5
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r8], 6
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rdx], 7
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r12], 8
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r14], 9
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r10], 10
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r9], 11
+ mov r13, qword ptr [rsp + 248] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r13], 12
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rdi], 13
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax], 14
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rcx], 15
+ mov r14, qword ptr [rsp + 208] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r14], 1
+ mov r12, qword ptr [rsp + 88] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r12], 2
+ mov r10, qword ptr [rsp + 64] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r10], 3
+ mov r11, qword ptr [rsp + 160] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r11], 4
+ mov r8, qword ptr [rsp + 320] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r8], 5
+ mov r9, qword ptr [rsp + 144] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r9], 6
+ mov r15, qword ptr [rsp + 224] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r15], 7
+ mov rdi, qword ptr [rsp + 264] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rdi], 8
+ mov rax, qword ptr [rsp + 96] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rax], 9
+ mov rbx, qword ptr [rsp + 136] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rbx], 10
+ mov rcx, qword ptr [rsp + 128] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rcx], 11
+ mov rdx, qword ptr [rsp + 120] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rdx], 12
+ mov rdx, qword ptr [rsp + 32] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rdx], 13
+ mov rdx, qword ptr [rsp + 288] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rdx], 14
+ mov rdx, qword ptr [rsp + 48] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rdx], 15
+ mov rdx, qword ptr [rsp + 152] # 8-byte Reload
+ vpinsrb xmm4, xmm4, byte ptr [rsi + rdx + 1], 1
+ mov rdx, qword ptr [rsp + 232] # 8-byte Reload
+ vpinsrb xmm4, xmm4, byte ptr [rsi + rdx + 1], 2
+ mov rdx, qword ptr [rsp + 104] # 8-byte Reload
+ vpinsrb xmm4, xmm4, byte ptr [rsi + rdx + 1], 3
+ mov rdx, qword ptr [rsp + 176] # 8-byte Reload
+ vpinsrb xmm4, xmm4, byte ptr [rsi + rdx + 1], 4
+ mov rdx, qword ptr [rsp + 200] # 8-byte Reload
+ vpinsrb xmm4, xmm4, byte ptr [rsi + rdx + 1], 5
+ mov rdx, qword ptr [rsp + 168] # 8-byte Reload
+ vpinsrb xmm4, xmm4, byte ptr [rsi + rdx + 1], 6
+ mov rdx, qword ptr [rsp + 192] # 8-byte Reload
+ vpinsrb xmm4, xmm4, byte ptr [rsi + rdx + 1], 7
+ mov rdx, qword ptr [rsp + 216] # 8-byte Reload
+ vpinsrb xmm4, xmm4, byte ptr [rsi + rdx + 1], 8
+ mov rdx, qword ptr [rsp + 184] # 8-byte Reload
+ vpinsrb xmm4, xmm4, byte ptr [rsi + rdx + 1], 9
+ mov rdx, qword ptr [rsp + 80] # 8-byte Reload
+ vpinsrb xmm4, xmm4, byte ptr [rsi + rdx + 1], 10
+ mov rdx, qword ptr [rsp + 72] # 8-byte Reload
+ vpinsrb xmm4, xmm4, byte ptr [rsi + rdx + 1], 11
+ vpinsrb xmm4, xmm4, byte ptr [rsi + r13 + 1], 12
+ mov rdx, qword ptr [rsp + 112] # 8-byte Reload
+ vpinsrb xmm4, xmm4, byte ptr [rsi + rdx + 1], 13
+ mov r13, qword ptr [rsp + 40] # 8-byte Reload
+ vpinsrb xmm4, xmm4, byte ptr [rsi + r13 + 1], 14
+ mov rdx, qword ptr [rsp + 56] # 8-byte Reload
+ vpinsrb xmm4, xmm4, byte ptr [rsi + rdx + 1], 15
+ vpinsrb xmm5, xmm10, byte ptr [rsi + r14 + 1], 1
+ vpinsrb xmm5, xmm5, byte ptr [rsi + r12 + 1], 2
+ vpinsrb xmm5, xmm5, byte ptr [rsi + r10 + 1], 3
+ vpinsrb xmm5, xmm5, byte ptr [rsi + r11 + 1], 4
+ vpinsrb xmm5, xmm5, byte ptr [rsi + r8 + 1], 5
+ vpinsrb xmm5, xmm5, byte ptr [rsi + r9 + 1], 6
+ mov r8, r9
+ vpinsrb xmm5, xmm5, byte ptr [rsi + r15 + 1], 7
+ vpinsrb xmm5, xmm5, byte ptr [rsi + rdi + 1], 8
+ vpinsrb xmm5, xmm5, byte ptr [rsi + rax + 1], 9
+ vpinsrb xmm5, xmm5, byte ptr [rsi + rbx + 1], 10
+ vpinsrb xmm5, xmm5, byte ptr [rsi + rcx + 1], 11
+ mov rax, qword ptr [rsp + 120] # 8-byte Reload
+ vpinsrb xmm5, xmm5, byte ptr [rsi + rax + 1], 12
+ mov rax, qword ptr [rsp + 32] # 8-byte Reload
+ vpinsrb xmm5, xmm5, byte ptr [rsi + rax + 1], 13
+ mov rax, qword ptr [rsp + 288] # 8-byte Reload
+ vpinsrb xmm5, xmm5, byte ptr [rsi + rax + 1], 14
+ vinserti128 ymm15, ymm3, xmm0, 1
+ mov rax, qword ptr [rsp + 48] # 8-byte Reload
+ vpinsrb xmm0, xmm5, byte ptr [rsi + rax + 1], 15
+ mov rax, qword ptr [rsp + 240] # 8-byte Reload
+ movzx edi, byte ptr [rsi + rax + 8]
+ vmovd xmm9, edi
+ vinserti128 ymm0, ymm0, xmm4, 1
+ vmovdqa ymmword ptr [rsp + 1216], ymm0 # 32-byte Spill
+ mov rax, qword ptr [rsp + 256] # 8-byte Reload
+ movzx edi, byte ptr [rsi + rax + 8]
+ vmovd xmm10, edi
+ mov rdx, qword ptr [rsp + 152] # 8-byte Reload
+ vmovdqa xmm0, xmmword ptr [rsp + 480] # 16-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rdx + 2], 1
+ mov rcx, qword ptr [rsp + 232] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rcx + 2], 2
+ mov r10, qword ptr [rsp + 104] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r10 + 2], 3
+ mov rax, qword ptr [rsp + 176] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 2], 4
+ mov rax, qword ptr [rsp + 200] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 2], 5
+ mov rax, qword ptr [rsp + 168] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 2], 6
+ mov rax, qword ptr [rsp + 192] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 2], 7
+ mov r12, qword ptr [rsp + 216] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r12 + 2], 8
+ mov r13, qword ptr [rsp + 184] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r13 + 2], 9
+ mov r9, qword ptr [rsp + 80] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r9 + 2], 10
+ mov r11, qword ptr [rsp + 72] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r11 + 2], 11
+ mov r14, qword ptr [rsp + 248] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r14 + 2], 12
+ mov r15, qword ptr [rsp + 112] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r15 + 2], 13
+ mov rax, qword ptr [rsp + 40] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 2], 14
+ mov rax, qword ptr [rsp + 56] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 2], 15
+ mov rax, qword ptr [rsp + 208] # 8-byte Reload
+ vmovdqa xmm3, xmmword ptr [rsp + 448] # 16-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rax + 2], 1
+ mov rdi, qword ptr [rsp + 88] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rdi + 2], 2
+ mov rdi, qword ptr [rsp + 64] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rdi + 2], 3
+ mov rdi, qword ptr [rsp + 160] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rdi + 2], 4
+ mov rdi, qword ptr [rsp + 320] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rdi + 2], 5
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r8 + 2], 6
+ mov rdi, qword ptr [rsp + 224] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rdi + 2], 7
+ mov rbx, qword ptr [rsp + 264] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rbx + 2], 8
+ mov rbx, qword ptr [rsp + 96] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rbx + 2], 9
+ mov rbx, qword ptr [rsp + 136] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rbx + 2], 10
+ mov r8, qword ptr [rsp + 128] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r8 + 2], 11
+ mov rbx, qword ptr [rsp + 120] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rbx + 2], 12
+ mov rbx, qword ptr [rsp + 32] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rbx + 2], 13
+ mov rbx, qword ptr [rsp + 288] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rbx + 2], 14
+ mov rbx, qword ptr [rsp + 48] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rbx + 2], 15
+ vpinsrb xmm4, xmm11, byte ptr [rsi + rdx + 3], 1
+ vpinsrb xmm4, xmm4, byte ptr [rsi + rcx + 3], 2
+ vpinsrb xmm4, xmm4, byte ptr [rsi + r10 + 3], 3
+ mov rcx, qword ptr [rsp + 176] # 8-byte Reload
+ vpinsrb xmm4, xmm4, byte ptr [rsi + rcx + 3], 4
+ mov rcx, qword ptr [rsp + 200] # 8-byte Reload
+ vpinsrb xmm4, xmm4, byte ptr [rsi + rcx + 3], 5
+ mov rcx, qword ptr [rsp + 168] # 8-byte Reload
+ vpinsrb xmm4, xmm4, byte ptr [rsi + rcx + 3], 6
+ mov r8, qword ptr [rsp + 192] # 8-byte Reload
+ vpinsrb xmm4, xmm4, byte ptr [rsi + r8 + 3], 7
+ vpinsrb xmm4, xmm4, byte ptr [rsi + r12 + 3], 8
+ vpinsrb xmm4, xmm4, byte ptr [rsi + r13 + 3], 9
+ vpinsrb xmm4, xmm4, byte ptr [rsi + r9 + 3], 10
+ vpinsrb xmm4, xmm4, byte ptr [rsi + r11 + 3], 11
+ vpinsrb xmm4, xmm4, byte ptr [rsi + r14 + 3], 12
+ vpinsrb xmm4, xmm4, byte ptr [rsi + r15 + 3], 13
+ mov r14, r15
+ mov rdx, qword ptr [rsp + 40] # 8-byte Reload
+ vpinsrb xmm4, xmm4, byte ptr [rsi + rdx + 3], 14
+ mov rcx, qword ptr [rsp + 56] # 8-byte Reload
+ vpinsrb xmm4, xmm4, byte ptr [rsi + rcx + 3], 15
+ vpinsrb xmm5, xmm8, byte ptr [rsi + rax + 3], 1
+ mov rax, qword ptr [rsp + 88] # 8-byte Reload
+ vpinsrb xmm5, xmm5, byte ptr [rsi + rax + 3], 2
+ mov r10, qword ptr [rsp + 64] # 8-byte Reload
+ vpinsrb xmm5, xmm5, byte ptr [rsi + r10 + 3], 3
+ mov r15, qword ptr [rsp + 160] # 8-byte Reload
+ vpinsrb xmm5, xmm5, byte ptr [rsi + r15 + 3], 4
+ mov rax, qword ptr [rsp + 320] # 8-byte Reload
+ vpinsrb xmm5, xmm5, byte ptr [rsi + rax + 3], 5
+ mov rax, qword ptr [rsp + 144] # 8-byte Reload
+ vpinsrb xmm5, xmm5, byte ptr [rsi + rax + 3], 6
+ vpinsrb xmm5, xmm5, byte ptr [rsi + rdi + 3], 7
+ mov r11, qword ptr [rsp + 264] # 8-byte Reload
+ vpinsrb xmm5, xmm5, byte ptr [rsi + r11 + 3], 8
+ mov rax, qword ptr [rsp + 96] # 8-byte Reload
+ vpinsrb xmm5, xmm5, byte ptr [rsi + rax + 3], 9
+ mov rax, qword ptr [rsp + 136] # 8-byte Reload
+ vpinsrb xmm5, xmm5, byte ptr [rsi + rax + 3], 10
+ mov rax, qword ptr [rsp + 128] # 8-byte Reload
+ vpinsrb xmm5, xmm5, byte ptr [rsi + rax + 3], 11
+ mov rax, qword ptr [rsp + 120] # 8-byte Reload
+ vpinsrb xmm5, xmm5, byte ptr [rsi + rax + 3], 12
+ mov rax, qword ptr [rsp + 32] # 8-byte Reload
+ vpinsrb xmm5, xmm5, byte ptr [rsi + rax + 3], 13
+ vinserti128 ymm0, ymm3, xmm0, 1
+ vmovdqa ymmword ptr [rsp + 480], ymm0 # 32-byte Spill
+ mov rax, qword ptr [rsp + 288] # 8-byte Reload
+ vpinsrb xmm0, xmm5, byte ptr [rsi + rax + 3], 14
+ mov rax, qword ptr [rsp + 240] # 8-byte Reload
+ movzx edi, byte ptr [rsi + rax + 9]
+ vmovd xmm8, edi
+ mov r9, rbx
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rbx + 3], 15
+ vinserti128 ymm0, ymm0, xmm4, 1
+ vmovdqa ymmword ptr [rsp + 448], ymm0 # 32-byte Spill
+ mov rax, qword ptr [rsp + 256] # 8-byte Reload
+ movzx edi, byte ptr [rsi + rax + 9]
+ vmovd xmm11, edi
+ vmovdqa xmm0, xmmword ptr [rsp + 416] # 16-byte Reload
+ mov rax, qword ptr [rsp + 152] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 4], 1
+ mov rax, qword ptr [rsp + 232] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 4], 2
+ mov rax, qword ptr [rsp + 104] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 4], 3
+ mov r13, qword ptr [rsp + 176] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r13 + 4], 4
+ mov rcx, qword ptr [rsp + 200] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rcx + 4], 5
+ mov rax, qword ptr [rsp + 168] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 4], 6
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r8 + 4], 7
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r12 + 4], 8
+ mov rax, qword ptr [rsp + 184] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 4], 9
+ mov rax, qword ptr [rsp + 80] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 4], 10
+ mov rax, qword ptr [rsp + 72] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 4], 11
+ mov rax, qword ptr [rsp + 248] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 4], 12
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r14 + 4], 13
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rdx + 4], 14
+ mov r12, qword ptr [rsp + 56] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r12 + 4], 15
+ mov rax, qword ptr [rsp + 208] # 8-byte Reload
+ vpinsrb xmm3, xmm13, byte ptr [rsi + rax + 4], 1
+ mov rdx, qword ptr [rsp + 88] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rdx + 4], 2
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r10 + 4], 3
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r15 + 4], 4
+ mov r10, qword ptr [rsp + 320] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r10 + 4], 5
+ mov rdi, qword ptr [rsp + 144] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rdi + 4], 6
+ mov r14, qword ptr [rsp + 224] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r14 + 4], 7
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r11 + 4], 8
+ mov rbx, qword ptr [rsp + 96] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rbx + 4], 9
+ mov rax, qword ptr [rsp + 136] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rax + 4], 10
+ mov r11, qword ptr [rsp + 128] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r11 + 4], 11
+ mov r8, qword ptr [rsp + 120] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r8 + 4], 12
+ mov rax, qword ptr [rsp + 32] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rax + 4], 13
+ mov r15, qword ptr [rsp + 288] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r15 + 4], 14
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r9 + 4], 15
+ mov r9, qword ptr [rsp + 152] # 8-byte Reload
+ vpinsrb xmm4, xmm14, byte ptr [rsi + r9 + 5], 1
+ mov r15, qword ptr [rsp + 232] # 8-byte Reload
+ vpinsrb xmm4, xmm4, byte ptr [rsi + r15 + 5], 2
+ mov r9, qword ptr [rsp + 104] # 8-byte Reload
+ vpinsrb xmm4, xmm4, byte ptr [rsi + r9 + 5], 3
+ vpinsrb xmm4, xmm4, byte ptr [rsi + r13 + 5], 4
+ vpinsrb xmm4, xmm4, byte ptr [rsi + rcx + 5], 5
+ mov rcx, qword ptr [rsp + 168] # 8-byte Reload
+ vpinsrb xmm4, xmm4, byte ptr [rsi + rcx + 5], 6
+ mov rcx, qword ptr [rsp + 192] # 8-byte Reload
+ vpinsrb xmm4, xmm4, byte ptr [rsi + rcx + 5], 7
+ mov r13, rcx
+ mov rax, qword ptr [rsp + 216] # 8-byte Reload
+ vpinsrb xmm4, xmm4, byte ptr [rsi + rax + 5], 8
+ mov rax, qword ptr [rsp + 184] # 8-byte Reload
+ vpinsrb xmm4, xmm4, byte ptr [rsi + rax + 5], 9
+ mov rax, qword ptr [rsp + 80] # 8-byte Reload
+ vpinsrb xmm4, xmm4, byte ptr [rsi + rax + 5], 10
+ mov rax, qword ptr [rsp + 72] # 8-byte Reload
+ vpinsrb xmm4, xmm4, byte ptr [rsi + rax + 5], 11
+ mov rax, qword ptr [rsp + 248] # 8-byte Reload
+ vpinsrb xmm4, xmm4, byte ptr [rsi + rax + 5], 12
+ mov rcx, qword ptr [rsp + 112] # 8-byte Reload
+ vpinsrb xmm4, xmm4, byte ptr [rsi + rcx + 5], 13
+ mov rax, qword ptr [rsp + 40] # 8-byte Reload
+ vpinsrb xmm4, xmm4, byte ptr [rsi + rax + 5], 14
+ vpinsrb xmm4, xmm4, byte ptr [rsi + r12 + 5], 15
+ mov rax, qword ptr [rsp + 208] # 8-byte Reload
+ vpinsrb xmm5, xmm6, byte ptr [rsi + rax + 5], 1
+ vpinsrb xmm5, xmm5, byte ptr [rsi + rdx + 5], 2
+ mov rax, qword ptr [rsp + 64] # 8-byte Reload
+ vpinsrb xmm5, xmm5, byte ptr [rsi + rax + 5], 3
+ mov rdx, qword ptr [rsp + 160] # 8-byte Reload
+ vpinsrb xmm5, xmm5, byte ptr [rsi + rdx + 5], 4
+ vpinsrb xmm5, xmm5, byte ptr [rsi + r10 + 5], 5
+ vpinsrb xmm5, xmm5, byte ptr [rsi + rdi + 5], 6
+ mov r10, rdi
+ vpinsrb xmm5, xmm5, byte ptr [rsi + r14 + 5], 7
+ mov r14, qword ptr [rsp + 264] # 8-byte Reload
+ vpinsrb xmm5, xmm5, byte ptr [rsi + r14 + 5], 8
+ vpinsrb xmm5, xmm5, byte ptr [rsi + rbx + 5], 9
+ mov rax, qword ptr [rsp + 136] # 8-byte Reload
+ vpinsrb xmm5, xmm5, byte ptr [rsi + rax + 5], 10
+ vpinsrb xmm5, xmm5, byte ptr [rsi + r11 + 5], 11
+ vpinsrb xmm5, xmm5, byte ptr [rsi + r8 + 5], 12
+ mov rax, qword ptr [rsp + 32] # 8-byte Reload
+ vpinsrb xmm5, xmm5, byte ptr [rsi + rax + 5], 13
+ mov rax, qword ptr [rsp + 288] # 8-byte Reload
+ vpinsrb xmm5, xmm5, byte ptr [rsi + rax + 5], 14
+ vinserti128 ymm14, ymm3, xmm0, 1
+ mov rax, qword ptr [rsp + 48] # 8-byte Reload
+ vpinsrb xmm0, xmm5, byte ptr [rsi + rax + 5], 15
+ mov rax, qword ptr [rsp + 240] # 8-byte Reload
+ movzx edi, byte ptr [rsi + rax + 10]
+ vmovd xmm3, edi
+ vinserti128 ymm0, ymm0, xmm4, 1
+ vmovdqa ymmword ptr [rsp + 416], ymm0 # 32-byte Spill
+ mov rax, qword ptr [rsp + 256] # 8-byte Reload
+ movzx edi, byte ptr [rsi + rax + 10]
+ vmovd xmm4, edi
+ mov rax, qword ptr [rsp + 152] # 8-byte Reload
+ vpinsrb xmm0, xmm12, byte ptr [rsi + rax + 6], 1
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r15 + 6], 2
+ mov r9, qword ptr [rsp + 104] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r9 + 6], 3
+ mov rbx, qword ptr [rsp + 176] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rbx + 6], 4
+ mov rdx, qword ptr [rsp + 200] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rdx + 6], 5
+ mov r11, qword ptr [rsp + 168] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r11 + 6], 6
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r13 + 6], 7
+ mov rcx, qword ptr [rsp + 216] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rcx + 6], 8
+ mov r13, qword ptr [rsp + 184] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r13 + 6], 9
+ mov rcx, qword ptr [rsp + 80] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rcx + 6], 10
+ mov rcx, qword ptr [rsp + 72] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rcx + 6], 11
+ mov r12, qword ptr [rsp + 248] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r12 + 6], 12
+ mov r8, qword ptr [rsp + 112] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r8 + 6], 13
+ mov rcx, qword ptr [rsp + 40] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rcx + 6], 14
+ mov rcx, qword ptr [rsp + 56] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rcx + 6], 15
+ mov rcx, qword ptr [rsp + 208] # 8-byte Reload
+ vpinsrb xmm5, xmm7, byte ptr [rsi + rcx + 6], 1
+ mov rcx, qword ptr [rsp + 88] # 8-byte Reload
+ vpinsrb xmm5, xmm5, byte ptr [rsi + rcx + 6], 2
+ mov rcx, qword ptr [rsp + 64] # 8-byte Reload
+ vpinsrb xmm5, xmm5, byte ptr [rsi + rcx + 6], 3
+ mov rcx, qword ptr [rsp + 160] # 8-byte Reload
+ vpinsrb xmm5, xmm5, byte ptr [rsi + rcx + 6], 4
+ mov rdi, qword ptr [rsp + 320] # 8-byte Reload
+ vpinsrb xmm5, xmm5, byte ptr [rsi + rdi + 6], 5
+ vpinsrb xmm5, xmm5, byte ptr [rsi + r10 + 6], 6
+ mov rcx, qword ptr [rsp + 224] # 8-byte Reload
+ vpinsrb xmm5, xmm5, byte ptr [rsi + rcx + 6], 7
+ mov r10, r14
+ vpinsrb xmm5, xmm5, byte ptr [rsi + r14 + 6], 8
+ mov rcx, qword ptr [rsp + 96] # 8-byte Reload
+ vpinsrb xmm5, xmm5, byte ptr [rsi + rcx + 6], 9
+ mov rcx, qword ptr [rsp + 136] # 8-byte Reload
+ vpinsrb xmm5, xmm5, byte ptr [rsi + rcx + 6], 10
+ mov rdx, qword ptr [rsp + 128] # 8-byte Reload
+ vpinsrb xmm5, xmm5, byte ptr [rsi + rdx + 6], 11
+ mov rdx, qword ptr [rsp + 120] # 8-byte Reload
+ vpinsrb xmm5, xmm5, byte ptr [rsi + rdx + 6], 12
+ mov rdx, qword ptr [rsp + 32] # 8-byte Reload
+ vpinsrb xmm5, xmm5, byte ptr [rsi + rdx + 6], 13
+ mov rdx, qword ptr [rsp + 288] # 8-byte Reload
+ vpinsrb xmm5, xmm5, byte ptr [rsi + rdx + 6], 14
+ mov r14, qword ptr [rsp + 48] # 8-byte Reload
+ vpinsrb xmm5, xmm5, byte ptr [rsi + r14 + 6], 15
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rax + 7], 1
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r15 + 7], 2
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r9 + 7], 3
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rbx + 7], 4
+ mov r14, qword ptr [rsp + 200] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r14 + 7], 5
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r11 + 7], 6
+ mov rdx, qword ptr [rsp + 192] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rdx + 7], 7
+ mov r11, qword ptr [rsp + 216] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r11 + 7], 8
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r13 + 7], 9
+ mov rdx, qword ptr [rsp + 80] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rdx + 7], 10
+ mov r9, qword ptr [rsp + 72] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r9 + 7], 11
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r12 + 7], 12
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r8 + 7], 13
+ mov rdx, qword ptr [rsp + 40] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rdx + 7], 14
+ mov rdx, qword ptr [rsp + 56] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rdx + 7], 15
+ mov rdx, qword ptr [rsp + 208] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rdx + 7], 1
+ mov rdx, qword ptr [rsp + 88] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rdx + 7], 2
+ mov rdx, qword ptr [rsp + 64] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rdx + 7], 3
+ mov rdx, qword ptr [rsp + 160] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rdx + 7], 4
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rdi + 7], 5
+ mov rdx, qword ptr [rsp + 144] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rdx + 7], 6
+ mov r15, qword ptr [rsp + 224] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r15 + 7], 7
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r10 + 7], 8
+ mov rdx, qword ptr [rsp + 96] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rdx + 7], 9
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rcx + 7], 10
+ mov r13, qword ptr [rsp + 128] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r13 + 7], 11
+ mov rcx, qword ptr [rsp + 120] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rcx + 7], 12
+ mov rcx, qword ptr [rsp + 32] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rcx + 7], 13
+ vinserti128 ymm0, ymm5, xmm0, 1
+ vmovdqa ymmword ptr [rsp + 1184], ymm0 # 32-byte Spill
+ mov rcx, qword ptr [rsp + 288] # 8-byte Reload
+ vpinsrb xmm0, xmm1, byte ptr [rsi + rcx + 7], 14
+ mov rcx, qword ptr [rsp + 240] # 8-byte Reload
+ movzx edi, byte ptr [rsi + rcx + 11]
+ vmovd xmm1, edi
+ mov rcx, qword ptr [rsp + 48] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rcx + 7], 15
+ vinserti128 ymm0, ymm0, xmm2, 1
+ vmovdqa ymmword ptr [rsp + 1152], ymm0 # 32-byte Spill
+ mov rcx, qword ptr [rsp + 256] # 8-byte Reload
+ movzx edi, byte ptr [rsi + rcx + 11]
+ vmovd xmm2, edi
+ vpinsrb xmm0, xmm9, byte ptr [rsi + rax + 8], 1
+ mov r12, qword ptr [rsp + 232] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r12 + 8], 2
+ mov rcx, qword ptr [rsp + 104] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rcx + 8], 3
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rbx + 8], 4
+ mov rbx, r14
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r14 + 8], 5
+ mov rax, qword ptr [rsp + 168] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 8], 6
+ mov rdx, qword ptr [rsp + 192] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rdx + 8], 7
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r11 + 8], 8
+ mov r8, qword ptr [rsp + 184] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r8 + 8], 9
+ mov rdx, qword ptr [rsp + 80] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rdx + 8], 10
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r9 + 8], 11
+ mov r11, qword ptr [rsp + 248] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r11 + 8], 12
+ mov rdi, qword ptr [rsp + 112] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rdi + 8], 13
+ mov rdx, qword ptr [rsp + 40] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rdx + 8], 14
+ mov rdi, qword ptr [rsp + 56] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rdi + 8], 15
+ mov rdx, qword ptr [rsp + 208] # 8-byte Reload
+ vpinsrb xmm5, xmm10, byte ptr [rsi + rdx + 8], 1
+ mov r14, qword ptr [rsp + 88] # 8-byte Reload
+ vpinsrb xmm5, xmm5, byte ptr [rsi + r14 + 8], 2
+ mov rdi, qword ptr [rsp + 64] # 8-byte Reload
+ vpinsrb xmm5, xmm5, byte ptr [rsi + rdi + 8], 3
+ mov rdi, qword ptr [rsp + 160] # 8-byte Reload
+ vpinsrb xmm5, xmm5, byte ptr [rsi + rdi + 8], 4
+ mov rdx, qword ptr [rsp + 320] # 8-byte Reload
+ vpinsrb xmm5, xmm5, byte ptr [rsi + rdx + 8], 5
+ mov rdx, qword ptr [rsp + 144] # 8-byte Reload
+ vpinsrb xmm5, xmm5, byte ptr [rsi + rdx + 8], 6
+ vpinsrb xmm5, xmm5, byte ptr [rsi + r15 + 8], 7
+ vpinsrb xmm5, xmm5, byte ptr [rsi + r10 + 8], 8
+ mov r9, qword ptr [rsp + 96] # 8-byte Reload
+ vpinsrb xmm5, xmm5, byte ptr [rsi + r9 + 8], 9
+ mov r15, qword ptr [rsp + 136] # 8-byte Reload
+ vpinsrb xmm5, xmm5, byte ptr [rsi + r15 + 8], 10
+ vpinsrb xmm5, xmm5, byte ptr [rsi + r13 + 8], 11
+ mov rdx, qword ptr [rsp + 120] # 8-byte Reload
+ vpinsrb xmm5, xmm5, byte ptr [rsi + rdx + 8], 12
+ mov rdx, qword ptr [rsp + 32] # 8-byte Reload
+ vpinsrb xmm5, xmm5, byte ptr [rsi + rdx + 8], 13
+ mov rdx, qword ptr [rsp + 288] # 8-byte Reload
+ vpinsrb xmm5, xmm5, byte ptr [rsi + rdx + 8], 14
+ mov r13, qword ptr [rsp + 48] # 8-byte Reload
+ vpinsrb xmm5, xmm5, byte ptr [rsi + r13 + 8], 15
+ mov r13, qword ptr [rsp + 152] # 8-byte Reload
+ vpinsrb xmm6, xmm8, byte ptr [rsi + r13 + 9], 1
+ vpinsrb xmm6, xmm6, byte ptr [rsi + r12 + 9], 2
+ vpinsrb xmm6, xmm6, byte ptr [rsi + rcx + 9], 3
+ mov rcx, qword ptr [rsp + 176] # 8-byte Reload
+ vpinsrb xmm6, xmm6, byte ptr [rsi + rcx + 9], 4
+ vpinsrb xmm6, xmm6, byte ptr [rsi + rbx + 9], 5
+ vpinsrb xmm6, xmm6, byte ptr [rsi + rax + 9], 6
+ mov rdx, qword ptr [rsp + 192] # 8-byte Reload
+ vpinsrb xmm6, xmm6, byte ptr [rsi + rdx + 9], 7
+ mov rax, qword ptr [rsp + 216] # 8-byte Reload
+ vpinsrb xmm6, xmm6, byte ptr [rsi + rax + 9], 8
+ vpinsrb xmm6, xmm6, byte ptr [rsi + r8 + 9], 9
+ mov rax, qword ptr [rsp + 80] # 8-byte Reload
+ vpinsrb xmm6, xmm6, byte ptr [rsi + rax + 9], 10
+ mov rcx, qword ptr [rsp + 72] # 8-byte Reload
+ vpinsrb xmm6, xmm6, byte ptr [rsi + rcx + 9], 11
+ vpinsrb xmm6, xmm6, byte ptr [rsi + r11 + 9], 12
+ mov rax, qword ptr [rsp + 112] # 8-byte Reload
+ vpinsrb xmm6, xmm6, byte ptr [rsi + rax + 9], 13
+ mov rax, qword ptr [rsp + 40] # 8-byte Reload
+ vpinsrb xmm6, xmm6, byte ptr [rsi + rax + 9], 14
+ mov rax, qword ptr [rsp + 56] # 8-byte Reload
+ vpinsrb xmm6, xmm6, byte ptr [rsi + rax + 9], 15
+ mov rax, qword ptr [rsp + 208] # 8-byte Reload
+ vpinsrb xmm7, xmm11, byte ptr [rsi + rax + 9], 1
+ vpinsrb xmm7, xmm7, byte ptr [rsi + r14 + 9], 2
+ mov rax, qword ptr [rsp + 64] # 8-byte Reload
+ vpinsrb xmm7, xmm7, byte ptr [rsi + rax + 9], 3
+ vpinsrb xmm7, xmm7, byte ptr [rsi + rdi + 9], 4
+ mov r14, rdi
+ mov rax, qword ptr [rsp + 320] # 8-byte Reload
+ vpinsrb xmm7, xmm7, byte ptr [rsi + rax + 9], 5
+ mov rax, qword ptr [rsp + 144] # 8-byte Reload
+ vpinsrb xmm7, xmm7, byte ptr [rsi + rax + 9], 6
+ mov rax, qword ptr [rsp + 224] # 8-byte Reload
+ vpinsrb xmm7, xmm7, byte ptr [rsi + rax + 9], 7
+ vpinsrb xmm7, xmm7, byte ptr [rsi + r10 + 9], 8
+ vpinsrb xmm7, xmm7, byte ptr [rsi + r9 + 9], 9
+ vpinsrb xmm7, xmm7, byte ptr [rsi + r15 + 9], 10
+ mov rax, qword ptr [rsp + 128] # 8-byte Reload
+ vpinsrb xmm7, xmm7, byte ptr [rsi + rax + 9], 11
+ mov rax, qword ptr [rsp + 120] # 8-byte Reload
+ vpinsrb xmm7, xmm7, byte ptr [rsi + rax + 9], 12
+ mov r15, qword ptr [rsp + 32] # 8-byte Reload
+ vpinsrb xmm7, xmm7, byte ptr [rsi + r15 + 9], 13
+ mov rax, qword ptr [rsp + 288] # 8-byte Reload
+ vpinsrb xmm7, xmm7, byte ptr [rsi + rax + 9], 14
+ vinserti128 ymm0, ymm5, xmm0, 1
+ vmovdqa ymmword ptr [rsp + 1120], ymm0 # 32-byte Spill
+ mov rax, qword ptr [rsp + 48] # 8-byte Reload
+ vpinsrb xmm5, xmm7, byte ptr [rsi + rax + 9], 15
+ mov rax, qword ptr [rsp + 240] # 8-byte Reload
+ movzx edi, byte ptr [rsi + rax + 12]
+ vmovd xmm0, edi
+ vinserti128 ymm5, ymm5, xmm6, 1
+ vmovdqa ymmword ptr [rsp + 1088], ymm5 # 32-byte Spill
+ mov rax, qword ptr [rsp + 256] # 8-byte Reload
+ movzx edi, byte ptr [rsi + rax + 12]
+ vmovd xmm5, edi
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r13 + 10], 1
+ mov rbx, qword ptr [rsp + 232] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rbx + 10], 2
+ mov rax, qword ptr [rsp + 104] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rax + 10], 3
+ mov r9, qword ptr [rsp + 176] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r9 + 10], 4
+ mov rax, qword ptr [rsp + 200] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rax + 10], 5
+ mov rax, qword ptr [rsp + 168] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rax + 10], 6
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rdx + 10], 7
+ mov r8, qword ptr [rsp + 216] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r8 + 10], 8
+ mov r12, qword ptr [rsp + 184] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r12 + 10], 9
+ mov rax, qword ptr [rsp + 80] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rax + 10], 10
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rcx + 10], 11
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r11 + 10], 12
+ mov rcx, qword ptr [rsp + 112] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rcx + 10], 13
+ mov rdi, qword ptr [rsp + 40] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rdi + 10], 14
+ mov rdi, qword ptr [rsp + 56] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rdi + 10], 15
+ mov r11, qword ptr [rsp + 208] # 8-byte Reload
+ vpinsrb xmm4, xmm4, byte ptr [rsi + r11 + 10], 1
+ mov rdi, qword ptr [rsp + 88] # 8-byte Reload
+ vpinsrb xmm4, xmm4, byte ptr [rsi + rdi + 10], 2
+ mov rdi, qword ptr [rsp + 64] # 8-byte Reload
+ vpinsrb xmm4, xmm4, byte ptr [rsi + rdi + 10], 3
+ vpinsrb xmm4, xmm4, byte ptr [rsi + r14 + 10], 4
+ mov rax, qword ptr [rsp + 320] # 8-byte Reload
+ vpinsrb xmm4, xmm4, byte ptr [rsi + rax + 10], 5
+ mov r14, qword ptr [rsp + 144] # 8-byte Reload
+ vpinsrb xmm4, xmm4, byte ptr [rsi + r14 + 10], 6
+ mov rax, qword ptr [rsp + 224] # 8-byte Reload
+ vpinsrb xmm4, xmm4, byte ptr [rsi + rax + 10], 7
+ vpinsrb xmm4, xmm4, byte ptr [rsi + r10 + 10], 8
+ mov rax, qword ptr [rsp + 96] # 8-byte Reload
+ vpinsrb xmm4, xmm4, byte ptr [rsi + rax + 10], 9
+ mov r14, qword ptr [rsp + 136] # 8-byte Reload
+ vpinsrb xmm4, xmm4, byte ptr [rsi + r14 + 10], 10
+ mov rax, qword ptr [rsp + 128] # 8-byte Reload
+ vpinsrb xmm4, xmm4, byte ptr [rsi + rax + 10], 11
+ mov r10, qword ptr [rsp + 120] # 8-byte Reload
+ vpinsrb xmm4, xmm4, byte ptr [rsi + r10 + 10], 12
+ vpinsrb xmm4, xmm4, byte ptr [rsi + r15 + 10], 13
+ mov r15, qword ptr [rsp + 288] # 8-byte Reload
+ vpinsrb xmm4, xmm4, byte ptr [rsi + r15 + 10], 14
+ mov rax, qword ptr [rsp + 48] # 8-byte Reload
+ vpinsrb xmm4, xmm4, byte ptr [rsi + rax + 10], 15
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r13 + 11], 1
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rbx + 11], 2
+ mov rax, qword ptr [rsp + 104] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rax + 11], 3
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r9 + 11], 4
+ mov rbx, qword ptr [rsp + 200] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rbx + 11], 5
+ mov r13, qword ptr [rsp + 168] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r13 + 11], 6
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rdx + 11], 7
+ mov r9, rdx
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r8 + 11], 8
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r12 + 11], 9
+ mov rax, qword ptr [rsp + 80] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rax + 11], 10
+ mov rax, qword ptr [rsp + 72] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rax + 11], 11
+ mov rax, qword ptr [rsp + 248] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rax + 11], 12
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rcx + 11], 13
+ mov rax, qword ptr [rsp + 40] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rax + 11], 14
+ mov r15, qword ptr [rsp + 56] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r15 + 11], 15
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r11 + 11], 1
+ mov rax, qword ptr [rsp + 88] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rax + 11], 2
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rdi + 11], 3
+ mov rax, qword ptr [rsp + 160] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rax + 11], 4
+ mov rcx, qword ptr [rsp + 320] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rcx + 11], 5
+ mov rcx, qword ptr [rsp + 144] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rcx + 11], 6
+ mov rcx, qword ptr [rsp + 224] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rcx + 11], 7
+ mov r12, qword ptr [rsp + 264] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r12 + 11], 8
+ mov rcx, qword ptr [rsp + 96] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rcx + 11], 9
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r14 + 11], 10
+ mov rcx, qword ptr [rsp + 128] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rcx + 11], 11
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r10 + 11], 12
+ mov rcx, qword ptr [rsp + 32] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rcx + 11], 13
+ vinserti128 ymm3, ymm4, xmm3, 1
+ vmovdqa ymmword ptr [rsp + 1056], ymm3 # 32-byte Spill
+ mov rcx, qword ptr [rsp + 288] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rcx + 11], 14
+ mov rcx, qword ptr [rsp + 240] # 8-byte Reload
+ movzx edi, byte ptr [rsi + rcx + 13]
+ vmovd xmm3, edi
+ mov rcx, qword ptr [rsp + 48] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rcx + 11], 15
+ vinserti128 ymm1, ymm2, xmm1, 1
+ vmovdqa ymmword ptr [rsp + 1024], ymm1 # 32-byte Spill
+ mov rcx, qword ptr [rsp + 256] # 8-byte Reload
+ movzx edi, byte ptr [rsi + rcx + 13]
+ vmovd xmm1, edi
+ mov rcx, qword ptr [rsp + 152] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rcx + 12], 1
+ mov rcx, qword ptr [rsp + 232] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rcx + 12], 2
+ mov rcx, qword ptr [rsp + 104] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rcx + 12], 3
+ mov rdx, qword ptr [rsp + 176] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rdx + 12], 4
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rbx + 12], 5
+ mov rbx, r13
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r13 + 12], 6
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r9 + 12], 7
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r8 + 12], 8
+ mov r9, qword ptr [rsp + 184] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r9 + 12], 9
+ mov rcx, qword ptr [rsp + 80] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rcx + 12], 10
+ mov rcx, qword ptr [rsp + 72] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rcx + 12], 11
+ mov r11, qword ptr [rsp + 248] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r11 + 12], 12
+ mov rcx, qword ptr [rsp + 112] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rcx + 12], 13
+ mov r14, qword ptr [rsp + 40] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r14 + 12], 14
+ mov r10, r15
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r15 + 12], 15
+ mov r13, qword ptr [rsp + 208] # 8-byte Reload
+ vpinsrb xmm2, xmm5, byte ptr [rsi + r13 + 12], 1
+ mov rdi, qword ptr [rsp + 88] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rdi + 12], 2
+ mov rcx, qword ptr [rsp + 64] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rcx + 12], 3
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rax + 12], 4
+ mov rax, qword ptr [rsp + 320] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rax + 12], 5
+ mov rax, qword ptr [rsp + 144] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rax + 12], 6
+ mov r15, qword ptr [rsp + 224] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r15 + 12], 7
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r12 + 12], 8
+ mov rax, qword ptr [rsp + 96] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rax + 12], 9
+ mov rcx, qword ptr [rsp + 136] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rcx + 12], 10
+ mov r12, qword ptr [rsp + 128] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r12 + 12], 11
+ mov rcx, qword ptr [rsp + 120] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rcx + 12], 12
+ mov rcx, qword ptr [rsp + 32] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rcx + 12], 13
+ mov rcx, qword ptr [rsp + 288] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rcx + 12], 14
+ mov rcx, qword ptr [rsp + 48] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rcx + 12], 15
+ mov rcx, qword ptr [rsp + 152] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rcx + 13], 1
+ mov rcx, qword ptr [rsp + 232] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rcx + 13], 2
+ mov rcx, qword ptr [rsp + 104] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rcx + 13], 3
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rdx + 13], 4
+ mov rcx, qword ptr [rsp + 200] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rcx + 13], 5
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rbx + 13], 6
+ mov rcx, qword ptr [rsp + 192] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rcx + 13], 7
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r8 + 13], 8
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r9 + 13], 9
+ mov r9, qword ptr [rsp + 80] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r9 + 13], 10
+ mov rcx, qword ptr [rsp + 72] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rcx + 13], 11
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r11 + 13], 12
+ mov r11, qword ptr [rsp + 112] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r11 + 13], 13
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r14 + 13], 14
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r10 + 13], 15
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r13 + 13], 1
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rdi + 13], 2
+ mov rcx, qword ptr [rsp + 64] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rcx + 13], 3
+ mov rcx, qword ptr [rsp + 160] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rcx + 13], 4
+ mov rcx, qword ptr [rsp + 320] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rcx + 13], 5
+ mov rcx, qword ptr [rsp + 144] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rcx + 13], 6
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r15 + 13], 7
+ mov rcx, qword ptr [rsp + 264] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rcx + 13], 8
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rax + 13], 9
+ mov rax, qword ptr [rsp + 136] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rax + 13], 10
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r12 + 13], 11
+ mov r10, qword ptr [rsp + 120] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r10 + 13], 12
+ mov rax, qword ptr [rsp + 32] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rax + 13], 13
+ mov rax, qword ptr [rsp + 288] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rax + 13], 14
+ vinserti128 ymm0, ymm2, xmm0, 1
+ vmovdqa ymmword ptr [rsp + 992], ymm0 # 32-byte Spill
+ mov rax, qword ptr [rsp + 48] # 8-byte Reload
+ vpinsrb xmm0, xmm1, byte ptr [rsi + rax + 13], 15
+ mov r13, qword ptr [rsp + 240] # 8-byte Reload
+ movzx edi, byte ptr [rsi + r13 + 14]
+ vmovd xmm1, edi
+ vinserti128 ymm0, ymm0, xmm3, 1
+ vmovdqa ymmword ptr [rsp + 960], ymm0 # 32-byte Spill
+ mov r14, qword ptr [rsp + 256] # 8-byte Reload
+ movzx edi, byte ptr [rsi + r14 + 14]
+ vmovd xmm0, edi
+ mov rax, qword ptr [rsp + 152] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rax + 14], 1
+ mov rdx, qword ptr [rsp + 232] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rdx + 14], 2
+ mov rbx, qword ptr [rsp + 104] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rbx + 14], 3
+ mov r8, qword ptr [rsp + 176] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r8 + 14], 4
+ mov rax, qword ptr [rsp + 200] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rax + 14], 5
+ mov rcx, qword ptr [rsp + 168] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rcx + 14], 6
+ mov rax, qword ptr [rsp + 192] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rax + 14], 7
+ mov rax, qword ptr [rsp + 216] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rax + 14], 8
+ mov rax, qword ptr [rsp + 184] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rax + 14], 9
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r9 + 14], 10
+ mov r15, qword ptr [rsp + 72] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r15 + 14], 11
+ mov rax, qword ptr [rsp + 248] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rax + 14], 12
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r11 + 14], 13
+ mov rax, qword ptr [rsp + 40] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rax + 14], 14
+ mov rax, qword ptr [rsp + 56] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rax + 14], 15
+ mov r9, qword ptr [rsp + 208] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r9 + 14], 1
+ mov rax, qword ptr [rsp + 88] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 14], 2
+ mov rax, qword ptr [rsp + 64] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 14], 3
+ mov r11, qword ptr [rsp + 160] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r11 + 14], 4
+ mov rax, qword ptr [rsp + 320] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 14], 5
+ mov r12, qword ptr [rsp + 144] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r12 + 14], 6
+ mov rax, qword ptr [rsp + 224] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 14], 7
+ mov rax, qword ptr [rsp + 264] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 14], 8
+ mov rdi, qword ptr [rsp + 96] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rdi + 14], 9
+ mov rdi, qword ptr [rsp + 136] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rdi + 14], 10
+ mov rdi, qword ptr [rsp + 128] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rdi + 14], 11
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r10 + 14], 12
+ mov rdi, qword ptr [rsp + 32] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rdi + 14], 13
+ mov rdi, qword ptr [rsp + 288] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rdi + 14], 14
+ mov rdi, qword ptr [rsp + 48] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rdi + 14], 15
+ movzx edi, byte ptr [rsi + r13 + 15]
+ vmovd xmm2, edi
+ mov rdi, qword ptr [rsp + 152] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rdi + 15], 1
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rdx + 15], 2
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rbx + 15], 3
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r8 + 15], 4
+ mov rbx, qword ptr [rsp + 200] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rbx + 15], 5
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rcx + 15], 6
+ mov rdx, qword ptr [rsp + 192] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rdx + 15], 7
+ mov rcx, qword ptr [rsp + 216] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rcx + 15], 8
+ mov rcx, qword ptr [rsp + 184] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rcx + 15], 9
+ mov rcx, qword ptr [rsp + 80] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rcx + 15], 10
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r15 + 15], 11
+ mov r13, qword ptr [rsp + 248] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r13 + 15], 12
+ mov rcx, qword ptr [rsp + 112] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rcx + 15], 13
+ mov r8, qword ptr [rsp + 40] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r8 + 15], 14
+ mov rcx, qword ptr [rsp + 56] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rcx + 15], 15
+ movzx edi, byte ptr [rsi + r14 + 15]
+ vmovd xmm3, edi
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r9 + 15], 1
+ mov rcx, qword ptr [rsp + 88] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rcx + 15], 2
+ mov rcx, qword ptr [rsp + 64] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rcx + 15], 3
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r11 + 15], 4
+ mov rcx, qword ptr [rsp + 320] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rcx + 15], 5
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r12 + 15], 6
+ mov r10, qword ptr [rsp + 224] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r10 + 15], 7
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rax + 15], 8
+ mov rax, qword ptr [rsp + 96] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rax + 15], 9
+ mov rax, qword ptr [rsp + 136] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rax + 15], 10
+ mov r12, qword ptr [rsp + 128] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r12 + 15], 11
+ mov rax, qword ptr [rsp + 120] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rax + 15], 12
+ mov rax, qword ptr [rsp + 32] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rax + 15], 13
+ mov rax, qword ptr [rsp + 288] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rax + 15], 14
+ mov r15, qword ptr [rsp + 48] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r15 + 15], 15
+ vinserti128 ymm0, ymm0, xmm1, 1
+ vmovdqa ymmword ptr [rsp + 896], ymm0 # 32-byte Spill
+ vinserti128 ymm0, ymm3, xmm2, 1
+ vmovdqa ymmword ptr [rsp + 928], ymm0 # 32-byte Spill
+ mov rax, qword ptr [rsp + 240] # 8-byte Reload
+ movzx edi, byte ptr [rsi + rax + 16]
+ vmovd xmm0, edi
+ mov r14, qword ptr [rsp + 152] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r14 + 16], 1
+ mov rax, qword ptr [rsp + 232] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 16], 2
+ mov r11, qword ptr [rsp + 104] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r11 + 16], 3
+ mov rax, qword ptr [rsp + 176] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 16], 4
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rbx + 16], 5
+ mov r9, qword ptr [rsp + 168] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r9 + 16], 6
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rdx + 16], 7
+ mov rdx, qword ptr [rsp + 216] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rdx + 16], 8
+ mov rax, qword ptr [rsp + 184] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 16], 9
+ mov rbx, qword ptr [rsp + 80] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rbx + 16], 10
+ mov rax, qword ptr [rsp + 72] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 16], 11
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r13 + 16], 12
+ mov rax, qword ptr [rsp + 112] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 16], 13
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r8 + 16], 14
+ mov rax, qword ptr [rsp + 56] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 16], 15
+ mov rdi, qword ptr [rsp + 256] # 8-byte Reload
+ movzx edi, byte ptr [rsi + rdi + 16]
+ vmovd xmm1, edi
+ mov rdi, qword ptr [rsp + 208] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rdi + 16], 1
+ mov rdi, qword ptr [rsp + 88] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rdi + 16], 2
+ mov rdi, qword ptr [rsp + 64] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rdi + 16], 3
+ mov rdi, qword ptr [rsp + 160] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rdi + 16], 4
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rcx + 16], 5
+ mov rcx, qword ptr [rsp + 144] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rcx + 16], 6
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r10 + 16], 7
+ mov rcx, qword ptr [rsp + 264] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rcx + 16], 8
+ mov rdi, qword ptr [rsp + 96] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rdi + 16], 9
+ mov rdi, qword ptr [rsp + 136] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rdi + 16], 10
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r12 + 16], 11
+ mov rdi, qword ptr [rsp + 120] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rdi + 16], 12
+ mov r12, qword ptr [rsp + 32] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r12 + 16], 13
+ mov r13, qword ptr [rsp + 288] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r13 + 16], 14
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r15 + 16], 15
+ mov rdi, qword ptr [rsp + 240] # 8-byte Reload
+ movzx edi, byte ptr [rsi + rdi + 17]
+ vmovd xmm2, edi
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r14 + 17], 1
+ mov r8, qword ptr [rsp + 232] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r8 + 17], 2
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r11 + 17], 3
+ mov r10, qword ptr [rsp + 176] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r10 + 17], 4
+ mov rdi, qword ptr [rsp + 200] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rdi + 17], 5
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r9 + 17], 6
+ mov rdi, qword ptr [rsp + 192] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rdi + 17], 7
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rdx + 17], 8
+ mov rdx, qword ptr [rsp + 184] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rdx + 17], 9
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rbx + 17], 10
+ mov r11, qword ptr [rsp + 72] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r11 + 17], 11
+ mov rdx, qword ptr [rsp + 248] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rdx + 17], 12
+ mov rdx, qword ptr [rsp + 112] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rdx + 17], 13
+ mov rdx, qword ptr [rsp + 40] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rdx + 17], 14
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rax + 17], 15
+ mov rax, qword ptr [rsp + 256] # 8-byte Reload
+ movzx edi, byte ptr [rsi + rax + 17]
+ vmovd xmm3, edi
+ mov r14, qword ptr [rsp + 208] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r14 + 17], 1
+ mov r15, qword ptr [rsp + 88] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r15 + 17], 2
+ mov r9, qword ptr [rsp + 64] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r9 + 17], 3
+ mov rdx, qword ptr [rsp + 160] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rdx + 17], 4
+ mov rax, qword ptr [rsp + 320] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rax + 17], 5
+ mov rax, qword ptr [rsp + 144] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rax + 17], 6
+ mov rbx, qword ptr [rsp + 224] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rbx + 17], 7
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rcx + 17], 8
+ mov rcx, qword ptr [rsp + 96] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rcx + 17], 9
+ mov rax, qword ptr [rsp + 136] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rax + 17], 10
+ mov rax, qword ptr [rsp + 128] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rax + 17], 11
+ mov rax, qword ptr [rsp + 120] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rax + 17], 12
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r12 + 17], 13
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r13 + 17], 14
+ vinserti128 ymm0, ymm1, xmm0, 1
+ vmovdqa ymmword ptr [rsp + 864], ymm0 # 32-byte Spill
+ mov rax, qword ptr [rsp + 48] # 8-byte Reload
+ vpinsrb xmm0, xmm3, byte ptr [rsi + rax + 17], 15
+ vinserti128 ymm0, ymm0, xmm2, 1
+ vmovdqa ymmword ptr [rsp + 832], ymm0 # 32-byte Spill
+ mov rax, qword ptr [rsp + 240] # 8-byte Reload
+ movzx edi, byte ptr [rsi + rax + 18]
+ vmovd xmm0, edi
+ mov rax, qword ptr [rsp + 152] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 18], 1
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r8 + 18], 2
+ mov rax, qword ptr [rsp + 104] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 18], 3
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r10 + 18], 4
+ mov rax, qword ptr [rsp + 200] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 18], 5
+ mov r8, qword ptr [rsp + 168] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r8 + 18], 6
+ mov rax, qword ptr [rsp + 192] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 18], 7
+ mov rax, qword ptr [rsp + 216] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 18], 8
+ mov rax, qword ptr [rsp + 184] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 18], 9
+ mov r10, qword ptr [rsp + 80] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r10 + 18], 10
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r11 + 18], 11
+ mov r12, qword ptr [rsp + 248] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r12 + 18], 12
+ mov r11, qword ptr [rsp + 112] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r11 + 18], 13
+ mov rdi, qword ptr [rsp + 40] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rdi + 18], 14
+ mov rdi, qword ptr [rsp + 56] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rdi + 18], 15
+ mov rdi, qword ptr [rsp + 256] # 8-byte Reload
+ movzx edi, byte ptr [rsi + rdi + 18]
+ vmovd xmm1, edi
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r14 + 18], 1
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r15 + 18], 2
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r9 + 18], 3
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rdx + 18], 4
+ mov rdx, qword ptr [rsp + 320] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rdx + 18], 5
+ mov r14, qword ptr [rsp + 144] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r14 + 18], 6
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rbx + 18], 7
+ mov rdx, qword ptr [rsp + 264] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rdx + 18], 8
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rcx + 18], 9
+ mov rcx, qword ptr [rsp + 136] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rcx + 18], 10
+ mov r15, qword ptr [rsp + 128] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r15 + 18], 11
+ mov rdx, qword ptr [rsp + 120] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rdx + 18], 12
+ mov rdi, qword ptr [rsp + 32] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rdi + 18], 13
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r13 + 18], 14
+ mov rbx, qword ptr [rsp + 48] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rbx + 18], 15
+ mov rdi, qword ptr [rsp + 240] # 8-byte Reload
+ movzx edi, byte ptr [rsi + rdi + 19]
+ vmovd xmm2, edi
+ mov rdi, qword ptr [rsp + 152] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rdi + 19], 1
+ mov rdi, qword ptr [rsp + 232] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rdi + 19], 2
+ mov rdi, qword ptr [rsp + 104] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rdi + 19], 3
+ mov rdi, qword ptr [rsp + 176] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rdi + 19], 4
+ mov r13, qword ptr [rsp + 200] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r13 + 19], 5
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r8 + 19], 6
+ mov rdi, qword ptr [rsp + 192] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rdi + 19], 7
+ mov rdi, qword ptr [rsp + 216] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rdi + 19], 8
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rax + 19], 9
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r10 + 19], 10
+ mov rax, qword ptr [rsp + 72] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rax + 19], 11
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r12 + 19], 12
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r11 + 19], 13
+ mov r9, qword ptr [rsp + 40] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r9 + 19], 14
+ mov rax, qword ptr [rsp + 56] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rax + 19], 15
+ mov r10, qword ptr [rsp + 256] # 8-byte Reload
+ movzx edi, byte ptr [rsi + r10 + 19]
+ vmovd xmm3, edi
+ mov rax, qword ptr [rsp + 208] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rax + 19], 1
+ mov r8, qword ptr [rsp + 88] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r8 + 19], 2
+ mov rax, qword ptr [rsp + 64] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rax + 19], 3
+ mov rax, qword ptr [rsp + 160] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rax + 19], 4
+ mov rax, qword ptr [rsp + 320] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rax + 19], 5
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r14 + 19], 6
+ mov rax, qword ptr [rsp + 224] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rax + 19], 7
+ mov rax, qword ptr [rsp + 264] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rax + 19], 8
+ mov rax, qword ptr [rsp + 96] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rax + 19], 9
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rcx + 19], 10
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r15 + 19], 11
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rdx + 19], 12
+ mov rax, qword ptr [rsp + 32] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rax + 19], 13
+ mov rax, qword ptr [rsp + 288] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rax + 19], 14
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rbx + 19], 15
+ vinserti128 ymm0, ymm1, xmm0, 1
+ vmovdqa ymmword ptr [rsp + 768], ymm0 # 32-byte Spill
+ vinserti128 ymm0, ymm3, xmm2, 1
+ vmovdqa ymmword ptr [rsp + 800], ymm0 # 32-byte Spill
+ mov r11, qword ptr [rsp + 240] # 8-byte Reload
+ movzx edi, byte ptr [rsi + r11 + 20]
+ vmovd xmm0, edi
+ mov rax, qword ptr [rsp + 152] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 20], 1
+ mov rdx, qword ptr [rsp + 232] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rdx + 20], 2
+ mov rdi, qword ptr [rsp + 104] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rdi + 20], 3
+ mov rdi, qword ptr [rsp + 176] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rdi + 20], 4
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r13 + 20], 5
+ mov rdi, qword ptr [rsp + 168] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rdi + 20], 6
+ mov r13, qword ptr [rsp + 192] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r13 + 20], 7
+ mov r15, qword ptr [rsp + 216] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r15 + 20], 8
+ mov rdi, qword ptr [rsp + 184] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rdi + 20], 9
+ mov r12, qword ptr [rsp + 80] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r12 + 20], 10
+ mov rcx, qword ptr [rsp + 72] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rcx + 20], 11
+ mov rdi, qword ptr [rsp + 248] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rdi + 20], 12
+ mov rdi, qword ptr [rsp + 112] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rdi + 20], 13
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r9 + 20], 14
+ mov r14, qword ptr [rsp + 56] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r14 + 20], 15
+ movzx edi, byte ptr [rsi + r10 + 20]
+ vmovd xmm1, edi
+ mov rbx, qword ptr [rsp + 208] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rbx + 20], 1
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r8 + 20], 2
+ mov r8, qword ptr [rsp + 64] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r8 + 20], 3
+ mov rdi, qword ptr [rsp + 160] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rdi + 20], 4
+ mov rdi, qword ptr [rsp + 320] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rdi + 20], 5
+ mov rdi, qword ptr [rsp + 144] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rdi + 20], 6
+ mov r10, qword ptr [rsp + 224] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r10 + 20], 7
+ mov rdi, qword ptr [rsp + 264] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rdi + 20], 8
+ mov rdi, qword ptr [rsp + 96] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rdi + 20], 9
+ mov rdi, qword ptr [rsp + 136] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rdi + 20], 10
+ mov rdi, qword ptr [rsp + 128] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rdi + 20], 11
+ mov r9, qword ptr [rsp + 120] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r9 + 20], 12
+ mov rdi, qword ptr [rsp + 32] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rdi + 20], 13
+ mov rdi, qword ptr [rsp + 288] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rdi + 20], 14
+ mov rdi, qword ptr [rsp + 48] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rdi + 20], 15
+ movzx edi, byte ptr [rsi + r11 + 21]
+ vmovd xmm2, edi
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rax + 21], 1
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rdx + 21], 2
+ mov rdx, qword ptr [rsp + 104] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rdx + 21], 3
+ mov rax, qword ptr [rsp + 176] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rax + 21], 4
+ mov rax, qword ptr [rsp + 200] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rax + 21], 5
+ mov rax, qword ptr [rsp + 168] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rax + 21], 6
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r13 + 21], 7
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r15 + 21], 8
+ mov rax, qword ptr [rsp + 184] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rax + 21], 9
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r12 + 21], 10
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rcx + 21], 11
+ mov r12, qword ptr [rsp + 248] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r12 + 21], 12
+ mov rax, qword ptr [rsp + 112] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rax + 21], 13
+ mov rax, qword ptr [rsp + 40] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rax + 21], 14
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r14 + 21], 15
+ mov r11, qword ptr [rsp + 256] # 8-byte Reload
+ movzx edi, byte ptr [rsi + r11 + 21]
+ vmovd xmm3, edi
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rbx + 21], 1
+ mov rax, qword ptr [rsp + 88] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rax + 21], 2
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r8 + 21], 3
+ mov rcx, qword ptr [rsp + 160] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rcx + 21], 4
+ mov rax, qword ptr [rsp + 320] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rax + 21], 5
+ mov rax, qword ptr [rsp + 144] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rax + 21], 6
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r10 + 21], 7
+ mov r13, qword ptr [rsp + 264] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r13 + 21], 8
+ mov rdi, qword ptr [rsp + 96] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rdi + 21], 9
+ mov r15, qword ptr [rsp + 136] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r15 + 21], 10
+ mov rbx, qword ptr [rsp + 128] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rbx + 21], 11
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r9 + 21], 12
+ mov r8, qword ptr [rsp + 32] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r8 + 21], 13
+ mov rdi, qword ptr [rsp + 288] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rdi + 21], 14
+ vinserti128 ymm0, ymm1, xmm0, 1
+ vmovdqa ymmword ptr [rsp + 704], ymm0 # 32-byte Spill
+ mov r10, qword ptr [rsp + 48] # 8-byte Reload
+ vpinsrb xmm0, xmm3, byte ptr [rsi + r10 + 21], 15
+ vinserti128 ymm0, ymm0, xmm2, 1
+ vmovdqa ymmword ptr [rsp + 736], ymm0 # 32-byte Spill
+ mov rdi, qword ptr [rsp + 240] # 8-byte Reload
+ movzx edi, byte ptr [rsi + rdi + 22]
+ vmovd xmm0, edi
+ mov rdi, qword ptr [rsp + 152] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rdi + 22], 1
+ mov rdi, qword ptr [rsp + 232] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rdi + 22], 2
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rdx + 22], 3
+ mov rdx, qword ptr [rsp + 176] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rdx + 22], 4
+ mov rdx, qword ptr [rsp + 200] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rdx + 22], 5
+ mov rdx, qword ptr [rsp + 168] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rdx + 22], 6
+ mov rdx, qword ptr [rsp + 192] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rdx + 22], 7
+ mov rdx, qword ptr [rsp + 216] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rdx + 22], 8
+ mov rdx, qword ptr [rsp + 184] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rdx + 22], 9
+ mov r14, qword ptr [rsp + 80] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r14 + 22], 10
+ mov rdi, qword ptr [rsp + 72] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rdi + 22], 11
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r12 + 22], 12
+ mov rdi, qword ptr [rsp + 112] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rdi + 22], 13
+ mov r9, qword ptr [rsp + 40] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r9 + 22], 14
+ mov rdi, qword ptr [rsp + 56] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rdi + 22], 15
+ movzx edi, byte ptr [rsi + r11 + 22]
+ vmovd xmm1, edi
+ mov rdi, qword ptr [rsp + 208] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rdi + 22], 1
+ mov rdi, qword ptr [rsp + 88] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rdi + 22], 2
+ mov r12, qword ptr [rsp + 64] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r12 + 22], 3
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rcx + 22], 4
+ mov rcx, qword ptr [rsp + 320] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rcx + 22], 5
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rax + 22], 6
+ mov r11, qword ptr [rsp + 224] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r11 + 22], 7
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r13 + 22], 8
+ mov rax, qword ptr [rsp + 96] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rax + 22], 9
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r15 + 22], 10
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rbx + 22], 11
+ mov r15, qword ptr [rsp + 120] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r15 + 22], 12
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r8 + 22], 13
+ mov rcx, qword ptr [rsp + 288] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rcx + 22], 14
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r10 + 22], 15
+ mov rdi, qword ptr [rsp + 240] # 8-byte Reload
+ movzx edi, byte ptr [rsi + rdi + 23]
+ vmovd xmm2, edi
+ mov r10, qword ptr [rsp + 152] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r10 + 23], 1
+ mov r8, qword ptr [rsp + 232] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r8 + 23], 2
+ mov rdi, qword ptr [rsp + 104] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rdi + 23], 3
+ mov rdi, qword ptr [rsp + 176] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rdi + 23], 4
+ mov rdi, qword ptr [rsp + 200] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rdi + 23], 5
+ mov r13, qword ptr [rsp + 168] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r13 + 23], 6
+ mov rdi, qword ptr [rsp + 192] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rdi + 23], 7
+ mov rdi, qword ptr [rsp + 216] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rdi + 23], 8
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rdx + 23], 9
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r14 + 23], 10
+ mov rdx, qword ptr [rsp + 72] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rdx + 23], 11
+ mov r14, qword ptr [rsp + 248] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r14 + 23], 12
+ mov rbx, qword ptr [rsp + 112] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rbx + 23], 13
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r9 + 23], 14
+ mov rdx, qword ptr [rsp + 56] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rdx + 23], 15
+ mov rdi, qword ptr [rsp + 256] # 8-byte Reload
+ movzx edi, byte ptr [rsi + rdi + 23]
+ vmovd xmm3, edi
+ mov r9, qword ptr [rsp + 208] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r9 + 23], 1
+ mov rdi, qword ptr [rsp + 88] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rdi + 23], 2
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r12 + 23], 3
+ mov rdi, qword ptr [rsp + 160] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rdi + 23], 4
+ mov rdi, qword ptr [rsp + 320] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rdi + 23], 5
+ mov rdi, qword ptr [rsp + 144] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rdi + 23], 6
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r11 + 23], 7
+ mov r12, qword ptr [rsp + 264] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r12 + 23], 8
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rax + 23], 9
+ mov r11, qword ptr [rsp + 136] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r11 + 23], 10
+ mov rax, qword ptr [rsp + 128] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rax + 23], 11
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r15 + 23], 12
+ mov rax, qword ptr [rsp + 32] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rax + 23], 13
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rcx + 23], 14
+ mov rcx, qword ptr [rsp + 48] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rcx + 23], 15
+ vinserti128 ymm10, ymm1, xmm0, 1
+ vinserti128 ymm0, ymm3, xmm2, 1
+ vmovdqa ymmword ptr [rsp + 672], ymm0 # 32-byte Spill
+ mov rcx, qword ptr [rsp + 240] # 8-byte Reload
+ movzx edi, byte ptr [rsi + rcx + 24]
+ vmovd xmm0, edi
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r10 + 24], 1
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r8 + 24], 2
+ mov r10, qword ptr [rsp + 104] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r10 + 24], 3
+ mov rdi, qword ptr [rsp + 176] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rdi + 24], 4
+ mov rdi, qword ptr [rsp + 200] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rdi + 24], 5
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r13 + 24], 6
+ mov r8, qword ptr [rsp + 192] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r8 + 24], 7
+ mov rdi, qword ptr [rsp + 216] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rdi + 24], 8
+ mov r13, qword ptr [rsp + 184] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r13 + 24], 9
+ mov rdi, qword ptr [rsp + 80] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rdi + 24], 10
+ mov rdi, qword ptr [rsp + 72] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rdi + 24], 11
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r14 + 24], 12
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rbx + 24], 13
+ mov rdi, qword ptr [rsp + 40] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rdi + 24], 14
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rdx + 24], 15
+ mov rdx, qword ptr [rsp + 256] # 8-byte Reload
+ movzx edi, byte ptr [rsi + rdx + 24]
+ vmovd xmm1, edi
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r9 + 24], 1
+ mov r9, qword ptr [rsp + 88] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r9 + 24], 2
+ mov rdi, qword ptr [rsp + 64] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rdi + 24], 3
+ mov rdi, qword ptr [rsp + 160] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rdi + 24], 4
+ mov rdi, qword ptr [rsp + 320] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rdi + 24], 5
+ mov rdi, qword ptr [rsp + 144] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rdi + 24], 6
+ mov rdi, qword ptr [rsp + 224] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rdi + 24], 7
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r12 + 24], 8
+ mov rdi, qword ptr [rsp + 96] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rdi + 24], 9
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r11 + 24], 10
+ mov rdi, qword ptr [rsp + 128] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rdi + 24], 11
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r15 + 24], 12
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rax + 24], 13
+ mov rax, qword ptr [rsp + 288] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rax + 24], 14
+ mov r11, qword ptr [rsp + 48] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r11 + 24], 15
+ movzx edi, byte ptr [rsi + rcx + 25]
+ vmovd xmm2, edi
+ mov rax, qword ptr [rsp + 152] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rax + 25], 1
+ mov rbx, qword ptr [rsp + 232] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rbx + 25], 2
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r10 + 25], 3
+ mov rax, qword ptr [rsp + 176] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rax + 25], 4
+ mov r14, qword ptr [rsp + 200] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r14 + 25], 5
+ mov rax, qword ptr [rsp + 168] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rax + 25], 6
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r8 + 25], 7
+ mov rcx, qword ptr [rsp + 216] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rcx + 25], 8
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r13 + 25], 9
+ mov r15, qword ptr [rsp + 80] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r15 + 25], 10
+ mov r8, qword ptr [rsp + 72] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r8 + 25], 11
+ mov rax, qword ptr [rsp + 248] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rax + 25], 12
+ mov rax, qword ptr [rsp + 112] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rax + 25], 13
+ mov rax, qword ptr [rsp + 40] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rax + 25], 14
+ mov rax, qword ptr [rsp + 56] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rax + 25], 15
+ movzx edi, byte ptr [rsi + rdx + 25]
+ vmovd xmm3, edi
+ mov r12, qword ptr [rsp + 208] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r12 + 25], 1
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r9 + 25], 2
+ mov rax, qword ptr [rsp + 64] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rax + 25], 3
+ mov rax, qword ptr [rsp + 160] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rax + 25], 4
+ mov rax, qword ptr [rsp + 320] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rax + 25], 5
+ mov r13, qword ptr [rsp + 144] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r13 + 25], 6
+ mov rax, qword ptr [rsp + 224] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rax + 25], 7
+ mov rax, qword ptr [rsp + 264] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rax + 25], 8
+ mov r10, qword ptr [rsp + 96] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r10 + 25], 9
+ mov rax, qword ptr [rsp + 136] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rax + 25], 10
+ mov rax, qword ptr [rsp + 128] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rax + 25], 11
+ mov rdx, qword ptr [rsp + 120] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rdx + 25], 12
+ mov rdx, qword ptr [rsp + 32] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rdx + 25], 13
+ mov rdx, qword ptr [rsp + 288] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rdx + 25], 14
+ vinserti128 ymm9, ymm1, xmm0, 1
+ vpinsrb xmm0, xmm3, byte ptr [rsi + r11 + 25], 15
+ vinserti128 ymm8, ymm0, xmm2, 1
+ mov r11, qword ptr [rsp + 240] # 8-byte Reload
+ movzx edi, byte ptr [rsi + r11 + 26]
+ vmovd xmm0, edi
+ mov rdx, qword ptr [rsp + 152] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rdx + 26], 1
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rbx + 26], 2
+ mov rbx, qword ptr [rsp + 104] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rbx + 26], 3
+ mov rdx, qword ptr [rsp + 176] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rdx + 26], 4
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r14 + 26], 5
+ mov r9, qword ptr [rsp + 168] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r9 + 26], 6
+ mov rdi, qword ptr [rsp + 192] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rdi + 26], 7
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rcx + 26], 8
+ mov rcx, qword ptr [rsp + 184] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rcx + 26], 9
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r15 + 26], 10
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r8 + 26], 11
+ mov r14, qword ptr [rsp + 248] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r14 + 26], 12
+ mov r15, qword ptr [rsp + 112] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r15 + 26], 13
+ mov rcx, qword ptr [rsp + 40] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rcx + 26], 14
+ mov rcx, qword ptr [rsp + 56] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rcx + 26], 15
+ mov rcx, qword ptr [rsp + 256] # 8-byte Reload
+ movzx edi, byte ptr [rsi + rcx + 26]
+ vmovd xmm1, edi
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r12 + 26], 1
+ mov rcx, qword ptr [rsp + 88] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rcx + 26], 2
+ mov rcx, qword ptr [rsp + 64] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rcx + 26], 3
+ mov rcx, qword ptr [rsp + 160] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rcx + 26], 4
+ mov r12, qword ptr [rsp + 320] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r12 + 26], 5
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r13 + 26], 6
+ mov rcx, qword ptr [rsp + 224] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rcx + 26], 7
+ mov r13, qword ptr [rsp + 264] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r13 + 26], 8
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r10 + 26], 9
+ mov rcx, qword ptr [rsp + 136] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rcx + 26], 10
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rax + 26], 11
+ mov rax, qword ptr [rsp + 120] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rax + 26], 12
+ mov r10, qword ptr [rsp + 32] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r10 + 26], 13
+ mov rax, qword ptr [rsp + 288] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rax + 26], 14
+ mov rax, qword ptr [rsp + 48] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rax + 26], 15
+ movzx edi, byte ptr [rsi + r11 + 27]
+ vmovd xmm2, edi
+ mov r11, qword ptr [rsp + 152] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r11 + 27], 1
+ mov rax, qword ptr [rsp + 232] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rax + 27], 2
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rbx + 27], 3
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rdx + 27], 4
+ mov r8, qword ptr [rsp + 200] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r8 + 27], 5
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r9 + 27], 6
+ mov rax, qword ptr [rsp + 192] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rax + 27], 7
+ mov rdx, qword ptr [rsp + 216] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rdx + 27], 8
+ mov rbx, qword ptr [rsp + 184] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rbx + 27], 9
+ mov rdx, qword ptr [rsp + 80] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rdx + 27], 10
+ mov rdx, qword ptr [rsp + 72] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rdx + 27], 11
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r14 + 27], 12
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r15 + 27], 13
+ mov rdx, qword ptr [rsp + 40] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rdx + 27], 14
+ mov rdx, qword ptr [rsp + 56] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rdx + 27], 15
+ mov rdx, qword ptr [rsp + 256] # 8-byte Reload
+ movzx edi, byte ptr [rsi + rdx + 27]
+ vmovd xmm3, edi
+ mov rdx, qword ptr [rsp + 208] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rdx + 27], 1
+ mov r9, qword ptr [rsp + 88] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r9 + 27], 2
+ mov rdx, qword ptr [rsp + 64] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rdx + 27], 3
+ mov r14, qword ptr [rsp + 160] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r14 + 27], 4
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r12 + 27], 5
+ mov r15, qword ptr [rsp + 144] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r15 + 27], 6
+ mov rdi, qword ptr [rsp + 224] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rdi + 27], 7
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r13 + 27], 8
+ mov rdi, qword ptr [rsp + 96] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rdi + 27], 9
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rcx + 27], 10
+ mov rcx, qword ptr [rsp + 128] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rcx + 27], 11
+ mov rcx, qword ptr [rsp + 120] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rcx + 27], 12
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r10 + 27], 13
+ mov r13, qword ptr [rsp + 288] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r13 + 27], 14
+ mov rcx, qword ptr [rsp + 48] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rcx + 27], 15
+ vinserti128 ymm0, ymm1, xmm0, 1
+ vmovdqa ymmword ptr [rsp + 544], ymm0 # 32-byte Spill
+ vinserti128 ymm0, ymm3, xmm2, 1
+ vmovdqa ymmword ptr [rsp + 576], ymm0 # 32-byte Spill
+ mov rcx, qword ptr [rsp + 240] # 8-byte Reload
+ movzx edi, byte ptr [rsi + rcx + 28]
+ vmovd xmm0, edi
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r11 + 28], 1
+ mov rcx, qword ptr [rsp + 232] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rcx + 28], 2
+ mov rdi, qword ptr [rsp + 104] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rdi + 28], 3
+ mov r11, qword ptr [rsp + 176] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r11 + 28], 4
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r8 + 28], 5
+ mov rdi, qword ptr [rsp + 168] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rdi + 28], 6
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 28], 7
+ mov rax, qword ptr [rsp + 216] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 28], 8
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rbx + 28], 9
+ mov rax, qword ptr [rsp + 80] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 28], 10
+ mov rax, qword ptr [rsp + 72] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 28], 11
+ mov rbx, qword ptr [rsp + 248] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rbx + 28], 12
+ mov rax, qword ptr [rsp + 112] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 28], 13
+ mov rdi, qword ptr [rsp + 40] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rdi + 28], 14
+ mov rdi, qword ptr [rsp + 56] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rdi + 28], 15
+ mov r12, qword ptr [rsp + 256] # 8-byte Reload
+ movzx edi, byte ptr [rsi + r12 + 28]
+ vmovd xmm1, edi
+ mov rax, qword ptr [rsp + 208] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rax + 28], 1
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r9 + 28], 2
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rdx + 28], 3
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r14 + 28], 4
+ mov r9, qword ptr [rsp + 320] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r9 + 28], 5
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r15 + 28], 6
+ mov r15, qword ptr [rsp + 224] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r15 + 28], 7
+ mov r8, qword ptr [rsp + 264] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r8 + 28], 8
+ mov rax, qword ptr [rsp + 96] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rax + 28], 9
+ mov r14, qword ptr [rsp + 136] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r14 + 28], 10
+ mov r10, qword ptr [rsp + 128] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r10 + 28], 11
+ mov rdx, qword ptr [rsp + 120] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rdx + 28], 12
+ mov rax, qword ptr [rsp + 32] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rax + 28], 13
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r13 + 28], 14
+ mov rdi, qword ptr [rsp + 48] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rdi + 28], 15
+ mov rdi, qword ptr [rsp + 240] # 8-byte Reload
+ movzx edi, byte ptr [rsi + rdi + 29]
+ vmovd xmm2, edi
+ mov r13, qword ptr [rsp + 152] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r13 + 29], 1
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rcx + 29], 2
+ mov rcx, qword ptr [rsp + 104] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rcx + 29], 3
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r11 + 29], 4
+ mov r11, qword ptr [rsp + 200] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r11 + 29], 5
+ mov rdi, qword ptr [rsp + 168] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rdi + 29], 6
+ mov rdi, qword ptr [rsp + 192] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rdi + 29], 7
+ mov rdi, qword ptr [rsp + 216] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rdi + 29], 8
+ mov rdi, qword ptr [rsp + 184] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rdi + 29], 9
+ mov rdi, qword ptr [rsp + 80] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rdi + 29], 10
+ mov rdi, qword ptr [rsp + 72] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rdi + 29], 11
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rbx + 29], 12
+ mov rdi, qword ptr [rsp + 112] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rdi + 29], 13
+ mov rdi, qword ptr [rsp + 40] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rdi + 29], 14
+ mov rdi, qword ptr [rsp + 56] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rdi + 29], 15
+ movzx edi, byte ptr [rsi + r12 + 29]
+ vmovd xmm3, edi
+ mov rbx, qword ptr [rsp + 208] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rbx + 29], 1
+ mov rdi, qword ptr [rsp + 88] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rdi + 29], 2
+ mov rdi, qword ptr [rsp + 64] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rdi + 29], 3
+ mov r12, qword ptr [rsp + 160] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r12 + 29], 4
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r9 + 29], 5
+ mov rdi, qword ptr [rsp + 144] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rdi + 29], 6
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r15 + 29], 7
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r8 + 29], 8
+ mov r9, qword ptr [rsp + 96] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r9 + 29], 9
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r14 + 29], 10
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r10 + 29], 11
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rdx + 29], 12
+ mov r14, rdx
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rax + 29], 13
+ mov r10, qword ptr [rsp + 288] # 8-byte Reload
+ vpinsrb xmm4, xmm3, byte ptr [rsi + r10 + 29], 14
+ vinserti128 ymm0, ymm1, xmm0, 1
+ vmovdqa ymmword ptr [rsp + 640], ymm0 # 32-byte Spill
+ mov rdx, qword ptr [rsp + 48] # 8-byte Reload
+ vpinsrb xmm0, xmm4, byte ptr [rsi + rdx + 29], 15
+ vinserti128 ymm0, ymm0, xmm2, 1
+ vmovdqa ymmword ptr [rsp + 608], ymm0 # 32-byte Spill
+ mov r8, qword ptr [rsp + 240] # 8-byte Reload
+ movzx edi, byte ptr [rsi + r8 + 30]
+ vmovd xmm0, edi
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r13 + 30], 1
+ movzx edi, byte ptr [rsi + r8 + 31]
+ vmovd xmm1, edi
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r13 + 31], 1
+ mov rax, qword ptr [rsp + 232] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 30], 2
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rax + 31], 2
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rcx + 30], 3
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rcx + 31], 3
+ mov rax, qword ptr [rsp + 176] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 30], 4
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rax + 31], 4
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r11 + 30], 5
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r11 + 31], 5
+ mov rax, qword ptr [rsp + 168] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 30], 6
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rax + 31], 6
+ mov r11, qword ptr [rsp + 272] # 8-byte Reload
+ mov rax, qword ptr [rsp + 192] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 30], 7
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rax + 31], 7
+ mov rax, qword ptr [rsp + 216] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 30], 8
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rax + 31], 8
+ mov rax, qword ptr [rsp + 184] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 30], 9
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rax + 31], 9
+ mov rax, qword ptr [rsp + 80] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 30], 10
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rax + 31], 10
+ mov rax, qword ptr [rsp + 72] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 30], 11
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rax + 31], 11
+ mov rax, qword ptr [rsp + 248] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 30], 12
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rax + 31], 12
+ mov rax, qword ptr [rsp + 112] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 30], 13
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rax + 31], 13
+ mov rax, qword ptr [rsp + 40] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 30], 14
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rax + 31], 14
+ mov rax, qword ptr [rsp + 56] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 30], 15
+ vpinsrb xmm2, xmm1, byte ptr [rsi + rax + 31], 15
+ mov rcx, qword ptr [rsp + 256] # 8-byte Reload
+ movzx eax, byte ptr [rsi + rcx + 30]
+ vmovd xmm1, eax
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rbx + 30], 1
+ movzx eax, byte ptr [rsi + rcx + 31]
+ vmovd xmm7, eax
+ vpinsrb xmm7, xmm7, byte ptr [rsi + rbx + 31], 1
+ mov rax, qword ptr [rsp + 88] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rax + 30], 2
+ vpinsrb xmm7, xmm7, byte ptr [rsi + rax + 31], 2
+ mov rax, qword ptr [rsp + 64] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rax + 30], 3
+ vpinsrb xmm7, xmm7, byte ptr [rsi + rax + 31], 3
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r12 + 30], 4
+ vpinsrb xmm7, xmm7, byte ptr [rsi + r12 + 31], 4
+ mov rax, qword ptr [rsp + 320] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rax + 30], 5
+ vpinsrb xmm7, xmm7, byte ptr [rsi + rax + 31], 5
+ mov rax, qword ptr [rsp + 144] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rax + 30], 6
+ vpinsrb xmm7, xmm7, byte ptr [rsi + rax + 31], 6
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r15 + 30], 7
+ vpinsrb xmm7, xmm7, byte ptr [rsi + r15 + 31], 7
+ mov rax, qword ptr [rsp + 264] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rax + 30], 8
+ vpinsrb xmm7, xmm7, byte ptr [rsi + rax + 31], 8
+ mov rax, r9
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r9 + 30], 9
+ vpinsrb xmm7, xmm7, byte ptr [rsi + r9 + 31], 9
+ mov rax, qword ptr [rsp + 136] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rax + 30], 10
+ vpinsrb xmm7, xmm7, byte ptr [rsi + rax + 31], 10
+ mov rax, qword ptr [rsp + 128] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rax + 30], 11
+ vpinsrb xmm7, xmm7, byte ptr [rsi + rax + 31], 11
+ mov rax, r14
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r14 + 30], 12
+ vpinsrb xmm7, xmm7, byte ptr [rsi + r14 + 31], 12
+ mov rax, qword ptr [rsp + 32] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rax + 30], 13
+ vpinsrb xmm7, xmm7, byte ptr [rsi + rax + 31], 13
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r10 + 30], 14
+ vpinsrb xmm7, xmm7, byte ptr [rsi + r10 + 31], 14
+ mov rax, rdx
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rdx + 30], 15
+ vpinsrb xmm7, xmm7, byte ptr [rsi + rdx + 31], 15
+ vinserti128 ymm0, ymm1, xmm0, 1
+ vmovdqa ymmword ptr [rsp + 320], ymm0 # 32-byte Spill
+ vinserti128 ymm0, ymm7, xmm2, 1
+ vmovdqa ymmword ptr [rsp + 288], ymm0 # 32-byte Spill
+ vmovdqa ymm0, ymmword ptr [rsp + 512] # 32-byte Reload
+ vpcmpeqb ymm2, ymm0, ymmword ptr [rsp + 1216] # 32-byte Folded Reload
+ vmovdqa ymm1, ymmword ptr [rip + .LCPI1_0] # ymm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+ vpand ymm7, ymm2, ymm1
+ vpsubb ymm11, ymm7, ymm2
+ vpcmpeqb ymm7, ymm15, ymm0
+ vpand ymm7, ymm7, ymm1
+ vpcmpeqb ymm12, ymm0, ymmword ptr [rsp + 480] # 32-byte Folded Reload
+ vmovdqa ymm6, ymmword ptr [rip + .LCPI1_1] # ymm6 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+ vpand ymm12, ymm12, ymm6
+ vpor ymm7, ymm12, ymm7
+ vpor ymm11, ymm11, ymm7
+ vpcmpeqb ymm7, ymm0, ymmword ptr [rsp + 448] # 32-byte Folded Reload
+ vmovdqa ymm2, ymmword ptr [rip + .LCPI1_2] # ymm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+ vpand ymm7, ymm7, ymm2
+ vpcmpeqb ymm12, ymm14, ymm0
+ vmovdqa ymm4, ymmword ptr [rip + .LCPI1_3] # ymm4 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+ vpand ymm12, ymm12, ymm4
+ vpor ymm7, ymm12, ymm7
+ vpcmpeqb ymm12, ymm0, ymmword ptr [rsp + 416] # 32-byte Folded Reload
+ vmovdqa ymm13, ymmword ptr [rip + .LCPI1_4] # ymm13 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+ vpand ymm12, ymm12, ymm13
+ vmovdqa ymm14, ymm13
+ vpor ymm7, ymm12, ymm7
+ vpor ymm11, ymm11, ymm7
+ vpcmpeqb ymm7, ymm0, ymmword ptr [rsp + 1184] # 32-byte Folded Reload
+ vmovdqa ymm5, ymmword ptr [rip + .LCPI1_5] # ymm5 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+ vpand ymm7, ymm7, ymm5
+ vpcmpeqb ymm12, ymm0, ymmword ptr [rsp + 1152] # 32-byte Folded Reload
+ vpsllw ymm12, ymm12, 7
+ vmovdqa ymm15, ymmword ptr [rip + .LCPI1_6] # ymm15 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+ vpand ymm12, ymm12, ymm15
+ vpor ymm7, ymm12, ymm7
+ vpor ymm13, ymm11, ymm7
+ vpcmpeqb ymm7, ymm0, ymmword ptr [rsp + 1088] # 32-byte Folded Reload
+ vpand ymm12, ymm7, ymm1
+ vpsubb ymm7, ymm12, ymm7
+ vpcmpeqb ymm12, ymm0, ymmword ptr [rsp + 1120] # 32-byte Folded Reload
+ vpand ymm12, ymm12, ymm1
+ vpcmpeqb ymm11, ymm0, ymmword ptr [rsp + 1056] # 32-byte Folded Reload
+ vpand ymm11, ymm11, ymm6
+ vpor ymm11, ymm12, ymm11
+ vpor ymm7, ymm11, ymm7
+ vpcmpeqb ymm11, ymm0, ymmword ptr [rsp + 1024] # 32-byte Folded Reload
+ vpand ymm11, ymm11, ymm2
+ vpcmpeqb ymm12, ymm0, ymmword ptr [rsp + 992] # 32-byte Folded Reload
+ vpand ymm12, ymm12, ymm4
+ vpor ymm11, ymm11, ymm12
+ vpcmpeqb ymm12, ymm0, ymmword ptr [rsp + 960] # 32-byte Folded Reload
+ vpand ymm12, ymm12, ymm14
+ vmovdqa ymm3, ymm14
+ vpor ymm11, ymm11, ymm12
+ vpor ymm7, ymm11, ymm7
+ vpcmpeqb ymm11, ymm0, ymmword ptr [rsp + 896] # 32-byte Folded Reload
+ vpand ymm11, ymm11, ymm5
+ vpcmpeqb ymm12, ymm0, ymmword ptr [rsp + 928] # 32-byte Folded Reload
+ vpsllw ymm12, ymm12, 7
+ vpand ymm12, ymm12, ymm15
+ vpor ymm11, ymm11, ymm12
+ vpor ymm12, ymm11, ymm7
+ vpcmpeqb ymm7, ymm0, ymmword ptr [rsp + 832] # 32-byte Folded Reload
+ vpand ymm11, ymm7, ymm1
+ vpsubb ymm7, ymm11, ymm7
+ vpcmpeqb ymm11, ymm0, ymmword ptr [rsp + 864] # 32-byte Folded Reload
+ vpand ymm11, ymm11, ymm1
+ vpcmpeqb ymm14, ymm0, ymmword ptr [rsp + 768] # 32-byte Folded Reload
+ vpand ymm14, ymm14, ymm6
+ vpor ymm11, ymm11, ymm14
+ vpor ymm7, ymm11, ymm7
+ vpcmpeqb ymm11, ymm0, ymmword ptr [rsp + 800] # 32-byte Folded Reload
+ vpand ymm11, ymm11, ymm2
+ vpcmpeqb ymm14, ymm0, ymmword ptr [rsp + 704] # 32-byte Folded Reload
+ vpand ymm14, ymm14, ymm4
+ vpor ymm11, ymm11, ymm14
+ vpcmpeqb ymm14, ymm0, ymmword ptr [rsp + 736] # 32-byte Folded Reload
+ vpand ymm14, ymm14, ymm3
+ vpor ymm11, ymm11, ymm14
+ vpor ymm7, ymm11, ymm7
+ vpcmpeqb ymm10, ymm10, ymm0
+ vmovdqa ymm14, ymm5
+ vpand ymm10, ymm10, ymm5
+ vpcmpeqb ymm11, ymm0, ymmword ptr [rsp + 672] # 32-byte Folded Reload
+ vpsllw ymm11, ymm11, 7
+ vpand ymm11, ymm11, ymm15
+ vpor ymm10, ymm10, ymm11
+ vpor ymm7, ymm10, ymm7
+ vpcmpeqb ymm8, ymm8, ymm0
+ vpand ymm10, ymm8, ymm1
+ vpsubb ymm8, ymm10, ymm8
+ vpcmpeqb ymm9, ymm9, ymm0
+ vpand ymm9, ymm9, ymm1
+ vpcmpeqb ymm5, ymm0, ymmword ptr [rsp + 544] # 32-byte Folded Reload
+ vpand ymm5, ymm5, ymm6
+ vpor ymm5, ymm9, ymm5
+ vpor ymm5, ymm8, ymm5
+ vpcmpeqb ymm6, ymm0, ymmword ptr [rsp + 576] # 32-byte Folded Reload
+ vpand ymm6, ymm6, ymm2
+ vpcmpeqb ymm3, ymm0, ymmword ptr [rsp + 640] # 32-byte Folded Reload
+ vpand ymm3, ymm3, ymm4
+ vpor ymm3, ymm6, ymm3
+ vpcmpeqb ymm4, ymm0, ymmword ptr [rsp + 608] # 32-byte Folded Reload
+ vpand ymm4, ymm4, ymmword ptr [rip + .LCPI1_4]
+ vpor ymm3, ymm3, ymm4
+ vpor ymm3, ymm5, ymm3
+ vpcmpeqb ymm1, ymm0, ymmword ptr [rsp + 320] # 32-byte Folded Reload
+ vpand ymm1, ymm14, ymm1
+ vpcmpeqb ymm2, ymm0, ymmword ptr [rsp + 288] # 32-byte Folded Reload
+ vpsllw ymm2, ymm2, 7
+ vpand ymm2, ymm15, ymm2
+ vpor ymm1, ymm1, ymm2
+ vpor ymm1, ymm3, ymm1
+ vpunpcklbw ymm2, ymm13, ymm12 # ymm2 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[4],ymm12[4],ymm13[5],ymm12[5],ymm13[6],ymm12[6],ymm13[7],ymm12[7],ymm13[16],ymm12[16],ymm13[17],ymm12[17],ymm13[18],ymm12[18],ymm13[19],ymm12[19],ymm13[20],ymm12[20],ymm13[21],ymm12[21],ymm13[22],ymm12[22],ymm13[23],ymm12[23]
+ vpunpckhbw ymm0, ymm13, ymm12 # ymm0 = ymm13[8],ymm12[8],ymm13[9],ymm12[9],ymm13[10],ymm12[10],ymm13[11],ymm12[11],ymm13[12],ymm12[12],ymm13[13],ymm12[13],ymm13[14],ymm12[14],ymm13[15],ymm12[15],ymm13[24],ymm12[24],ymm13[25],ymm12[25],ymm13[26],ymm12[26],ymm13[27],ymm12[27],ymm13[28],ymm12[28],ymm13[29],ymm12[29],ymm13[30],ymm12[30],ymm13[31],ymm12[31]
+ vpunpcklbw ymm3, ymm7, ymm1 # ymm3 = ymm7[0],ymm1[0],ymm7[1],ymm1[1],ymm7[2],ymm1[2],ymm7[3],ymm1[3],ymm7[4],ymm1[4],ymm7[5],ymm1[5],ymm7[6],ymm1[6],ymm7[7],ymm1[7],ymm7[16],ymm1[16],ymm7[17],ymm1[17],ymm7[18],ymm1[18],ymm7[19],ymm1[19],ymm7[20],ymm1[20],ymm7[21],ymm1[21],ymm7[22],ymm1[22],ymm7[23],ymm1[23]
+ vpunpckhbw ymm1, ymm7, ymm1 # ymm1 = ymm7[8],ymm1[8],ymm7[9],ymm1[9],ymm7[10],ymm1[10],ymm7[11],ymm1[11],ymm7[12],ymm1[12],ymm7[13],ymm1[13],ymm7[14],ymm1[14],ymm7[15],ymm1[15],ymm7[24],ymm1[24],ymm7[25],ymm1[25],ymm7[26],ymm1[26],ymm7[27],ymm1[27],ymm7[28],ymm1[28],ymm7[29],ymm1[29],ymm7[30],ymm1[30],ymm7[31],ymm1[31]
+ vpunpcklwd ymm4, ymm2, ymm3 # ymm4 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11]
+ vpunpckhwd ymm2, ymm2, ymm3 # ymm2 = ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15]
+ vpunpcklwd ymm3, ymm0, ymm1 # ymm3 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
+ vpunpckhwd ymm0, ymm0, ymm1 # ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
+ vinserti128 ymm1, ymm4, xmm2, 1
+ vperm2i128 ymm2, ymm4, ymm2, 49 # ymm2 = ymm4[2,3],ymm2[2,3]
+ vinserti128 ymm4, ymm3, xmm0, 1
+ vperm2i128 ymm0, ymm3, ymm0, 49 # ymm0 = ymm3[2,3],ymm0[2,3]
+ mov rcx, qword ptr [rsp + 408] # 8-byte Reload
+ vmovdqu ymmword ptr [r11 + 4*rcx + 96], ymm0
+ vmovdqu ymmword ptr [r11 + 4*rcx + 64], ymm2
+ vmovdqu ymmword ptr [r11 + 4*rcx + 32], ymm4
+ vmovdqu ymmword ptr [r11 + 4*rcx], ymm1
+ add rcx, 32
+ mov rax, rcx
+ cmp rcx, qword ptr [rsp + 384] # 8-byte Folded Reload
+ jne .LBB1_166
+# %bb.167:
+ mov r13, qword ptr [rsp + 392] # 8-byte Reload
+ cmp r13, qword ptr [rsp + 384] # 8-byte Folded Reload
+ mov r15, qword ptr [rsp + 280] # 8-byte Reload
+ mov r14d, dword ptr [rsp + 28] # 4-byte Reload
+ mov r12, qword ptr [rsp + 400] # 8-byte Reload
+ jne .LBB1_36
+ jmp .LBB1_109
+.LBB1_168:
+ and r15, -32
+ mov rax, r15
+ shl rax, 5
+ add rax, rsi
+ mov qword ptr [rsp + 400], rax # 8-byte Spill
+ mov qword ptr [rsp + 384], r15 # 8-byte Spill
+ lea rax, [r11 + 4*r15]
+ mov qword ptr [rsp + 376], rax # 8-byte Spill
+ vmovd xmm0, r14d
+ vpbroadcastb ymm0, xmm0
+ vmovdqa ymmword ptr [rsp + 512], ymm0 # 32-byte Spill
+ xor eax, eax
+ mov qword ptr [rsp + 272], r11 # 8-byte Spill
+ .p2align 4, 0x90
+.LBB1_169: # =>This Inner Loop Header: Depth=1
+ mov rbx, rax
+ mov qword ptr [rsp + 408], rax # 8-byte Spill
+ shl rbx, 5
+ mov rax, rbx
+ or rax, 32
+ mov qword ptr [rsp + 104], rax # 8-byte Spill
+ mov rax, rbx
+ or rax, 64
+ mov qword ptr [rsp + 152], rax # 8-byte Spill
+ mov rax, rbx
+ or rax, 96
+ mov qword ptr [rsp + 176], rax # 8-byte Spill
+ mov rax, rbx
+ or rax, 128
+ mov qword ptr [rsp + 120], rax # 8-byte Spill
+ mov rax, rbx
+ or rax, 160
+ mov qword ptr [rsp + 168], rax # 8-byte Spill
+ mov rax, rbx
+ or rax, 192
+ mov qword ptr [rsp + 232], rax # 8-byte Spill
+ mov rax, rbx
+ or rax, 224
+ mov qword ptr [rsp + 216], rax # 8-byte Spill
+ mov rax, rbx
+ or rax, 256
+ mov qword ptr [rsp + 56], rax # 8-byte Spill
+ mov rax, rbx
+ or rax, 288
+ mov qword ptr [rsp + 64], rax # 8-byte Spill
+ mov rax, rbx
+ or rax, 320
+ mov qword ptr [rsp + 40], rax # 8-byte Spill
+ mov rax, rbx
+ or rax, 512
+ mov rcx, rax
+ movzx eax, byte ptr [rsi + rax]
+ vmovd xmm0, eax
+ movzx eax, byte ptr [rsi + rbx]
+ vmovd xmm3, eax
+ movzx eax, byte ptr [rsi + rcx + 1]
+ vmovd xmm4, eax
+ movzx eax, byte ptr [rsi + rbx + 1]
+ vmovd xmm10, eax
+ movzx eax, byte ptr [rsi + rcx + 2]
+ mov rdx, rcx
+ vmovd xmm1, eax
+ vmovdqa xmmword ptr [rsp + 480], xmm1 # 16-byte Spill
+ mov rcx, rbx
+ movzx eax, byte ptr [rsi + rbx + 2]
+ vmovd xmm1, eax
+ vmovdqa xmmword ptr [rsp + 448], xmm1 # 16-byte Spill
+ movzx eax, byte ptr [rsi + rdx + 3]
+ vmovd xmm11, eax
+ movzx eax, byte ptr [rsi + rbx + 3]
+ vmovd xmm8, eax
+ movzx eax, byte ptr [rsi + rdx + 4]
+ vmovd xmm1, eax
+ vmovdqa xmmword ptr [rsp + 416], xmm1 # 16-byte Spill
+ movzx eax, byte ptr [rsi + rbx + 4]
+ vmovd xmm13, eax
+ movzx eax, byte ptr [rsi + rdx + 5]
+ vmovd xmm14, eax
+ movzx eax, byte ptr [rsi + rbx + 5]
+ vmovd xmm6, eax
+ movzx eax, byte ptr [rsi + rdx + 6]
+ mov qword ptr [rsp + 248], rdx # 8-byte Spill
+ vmovd xmm12, eax
+ movzx eax, byte ptr [rsi + rbx + 6]
+ vmovd xmm7, eax
+ movzx eax, byte ptr [rsi + rdx + 7]
+ vmovd xmm2, eax
+ movzx eax, byte ptr [rsi + rbx + 7]
+ vmovd xmm1, eax
+ mov rax, rbx
+ or rax, 352
+ mov qword ptr [rsp + 200], rax # 8-byte Spill
+ mov rax, rbx
+ or rax, 384
+ mov qword ptr [rsp + 320], rax # 8-byte Spill
+ mov rax, rbx
+ or rax, 416
+ mov qword ptr [rsp + 32], rax # 8-byte Spill
+ mov rax, rbx
+ or rax, 448
+ mov qword ptr [rsp + 96], rax # 8-byte Spill
+ mov rax, rbx
+ or rax, 480
+ mov qword ptr [rsp + 288], rax # 8-byte Spill
+ mov rax, rbx
+ or rax, 544
+ mov qword ptr [rsp + 136], rax # 8-byte Spill
+ or rbx, 576
+ mov qword ptr [rsp + 256], rbx # 8-byte Spill
+ mov rax, rcx
+ or rax, 608
+ mov qword ptr [rsp + 48], rax # 8-byte Spill
+ mov r12, rcx
+ or r12, 640
+ mov qword ptr [rsp + 208], r12 # 8-byte Spill
+ mov r14, rcx
+ or r14, 672
+ mov qword ptr [rsp + 144], r14 # 8-byte Spill
+ mov rax, rcx
+ or rax, 704
+ mov qword ptr [rsp + 184], rax # 8-byte Spill
+ mov rdi, rcx
+ or rdi, 736
+ mov r9, rcx
+ or r9, 768
+ mov qword ptr [rsp + 224], r9 # 8-byte Spill
+ mov r15, rcx
+ or r15, 800
+ mov qword ptr [rsp + 112], r15 # 8-byte Spill
+ mov r11, rcx
+ or r11, 832
+ mov qword ptr [rsp + 192], r11 # 8-byte Spill
+ mov r10, rcx
+ or r10, 864
+ mov qword ptr [rsp + 88], r10 # 8-byte Spill
+ mov r8, rcx
+ or r8, 896
+ mov qword ptr [rsp + 128], r8 # 8-byte Spill
+ mov rdx, rcx
+ or rdx, 928
+ mov qword ptr [rsp + 240], rdx # 8-byte Spill
+ mov rax, rcx
+ mov qword ptr [rsp + 264], rcx # 8-byte Spill
+ or rax, 960
+ mov qword ptr [rsp + 72], rax # 8-byte Spill
+ or rcx, 992
+ mov qword ptr [rsp + 80], rcx # 8-byte Spill
+ mov r13, qword ptr [rsp + 136] # 8-byte Reload
+ vpinsrb xmm9, xmm0, byte ptr [rsi + r13], 1
+ vpinsrb xmm0, xmm9, byte ptr [rsi + rbx], 2
+ mov rbx, qword ptr [rsp + 48] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rbx], 3
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r12], 4
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r14], 5
+ mov rbx, qword ptr [rsp + 184] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rbx], 6
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rdi], 7
+ mov r13, rdi
+ mov qword ptr [rsp + 160], rdi # 8-byte Spill
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r9], 8
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r15], 9
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r11], 10
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r10], 11
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r8], 12
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rdx], 13
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax], 14
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rcx], 15
+ mov r14, qword ptr [rsp + 104] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r14], 1
+ mov r10, qword ptr [rsp + 152] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r10], 2
+ mov r12, qword ptr [rsp + 176] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r12], 3
+ mov r8, qword ptr [rsp + 120] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r8], 4
+ mov r11, qword ptr [rsp + 168] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r11], 5
+ mov r9, qword ptr [rsp + 232] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r9], 6
+ mov r15, qword ptr [rsp + 216] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r15], 7
+ mov rdi, qword ptr [rsp + 56] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rdi], 8
+ mov rax, qword ptr [rsp + 64] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rax], 9
+ mov rbx, qword ptr [rsp + 40] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rbx], 10
+ mov rcx, qword ptr [rsp + 200] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rcx], 11
+ mov rdx, qword ptr [rsp + 320] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rdx], 12
+ mov rdx, qword ptr [rsp + 32] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rdx], 13
+ mov rdx, qword ptr [rsp + 96] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rdx], 14
+ mov rdx, qword ptr [rsp + 288] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rdx], 15
+ mov rdx, qword ptr [rsp + 136] # 8-byte Reload
+ vpinsrb xmm4, xmm4, byte ptr [rsi + rdx + 1], 1
+ mov rdx, qword ptr [rsp + 256] # 8-byte Reload
+ vpinsrb xmm4, xmm4, byte ptr [rsi + rdx + 1], 2
+ mov rdx, qword ptr [rsp + 48] # 8-byte Reload
+ vpinsrb xmm4, xmm4, byte ptr [rsi + rdx + 1], 3
+ mov rdx, qword ptr [rsp + 208] # 8-byte Reload
+ vpinsrb xmm4, xmm4, byte ptr [rsi + rdx + 1], 4
+ mov rdx, qword ptr [rsp + 144] # 8-byte Reload
+ vpinsrb xmm4, xmm4, byte ptr [rsi + rdx + 1], 5
+ mov rdx, qword ptr [rsp + 184] # 8-byte Reload
+ vpinsrb xmm4, xmm4, byte ptr [rsi + rdx + 1], 6
+ vpinsrb xmm4, xmm4, byte ptr [rsi + r13 + 1], 7
+ mov r13, qword ptr [rsp + 224] # 8-byte Reload
+ vpinsrb xmm4, xmm4, byte ptr [rsi + r13 + 1], 8
+ mov r13, qword ptr [rsp + 112] # 8-byte Reload
+ vpinsrb xmm4, xmm4, byte ptr [rsi + r13 + 1], 9
+ mov rdx, qword ptr [rsp + 192] # 8-byte Reload
+ vpinsrb xmm4, xmm4, byte ptr [rsi + rdx + 1], 10
+ mov rdx, qword ptr [rsp + 88] # 8-byte Reload
+ vpinsrb xmm4, xmm4, byte ptr [rsi + rdx + 1], 11
+ mov rdx, qword ptr [rsp + 128] # 8-byte Reload
+ vpinsrb xmm4, xmm4, byte ptr [rsi + rdx + 1], 12
+ mov rdx, qword ptr [rsp + 240] # 8-byte Reload
+ vpinsrb xmm4, xmm4, byte ptr [rsi + rdx + 1], 13
+ mov rdx, qword ptr [rsp + 72] # 8-byte Reload
+ vpinsrb xmm4, xmm4, byte ptr [rsi + rdx + 1], 14
+ mov rdx, qword ptr [rsp + 80] # 8-byte Reload
+ vpinsrb xmm4, xmm4, byte ptr [rsi + rdx + 1], 15
+ vpinsrb xmm5, xmm10, byte ptr [rsi + r14 + 1], 1
+ vpinsrb xmm5, xmm5, byte ptr [rsi + r10 + 1], 2
+ vpinsrb xmm5, xmm5, byte ptr [rsi + r12 + 1], 3
+ vpinsrb xmm5, xmm5, byte ptr [rsi + r8 + 1], 4
+ vpinsrb xmm5, xmm5, byte ptr [rsi + r11 + 1], 5
+ vpinsrb xmm5, xmm5, byte ptr [rsi + r9 + 1], 6
+ vpinsrb xmm5, xmm5, byte ptr [rsi + r15 + 1], 7
+ vpinsrb xmm5, xmm5, byte ptr [rsi + rdi + 1], 8
+ vpinsrb xmm5, xmm5, byte ptr [rsi + rax + 1], 9
+ vpinsrb xmm5, xmm5, byte ptr [rsi + rbx + 1], 10
+ vpinsrb xmm5, xmm5, byte ptr [rsi + rcx + 1], 11
+ mov rax, qword ptr [rsp + 320] # 8-byte Reload
+ vpinsrb xmm5, xmm5, byte ptr [rsi + rax + 1], 12
+ mov rax, qword ptr [rsp + 32] # 8-byte Reload
+ vpinsrb xmm5, xmm5, byte ptr [rsi + rax + 1], 13
+ mov rax, qword ptr [rsp + 96] # 8-byte Reload
+ vpinsrb xmm5, xmm5, byte ptr [rsi + rax + 1], 14
+ vinserti128 ymm15, ymm3, xmm0, 1
+ mov rax, qword ptr [rsp + 288] # 8-byte Reload
+ vpinsrb xmm0, xmm5, byte ptr [rsi + rax + 1], 15
+ mov rax, qword ptr [rsp + 248] # 8-byte Reload
+ movzx edi, byte ptr [rsi + rax + 8]
+ vmovd xmm9, edi
+ vinserti128 ymm0, ymm0, xmm4, 1
+ vmovdqa ymmword ptr [rsp + 1216], ymm0 # 32-byte Spill
+ mov rax, qword ptr [rsp + 264] # 8-byte Reload
+ movzx edi, byte ptr [rsi + rax + 8]
+ vmovd xmm10, edi
+ mov r8, qword ptr [rsp + 136] # 8-byte Reload
+ vmovdqa xmm0, xmmword ptr [rsp + 480] # 16-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r8 + 2], 1
+ mov rcx, qword ptr [rsp + 256] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rcx + 2], 2
+ mov r10, qword ptr [rsp + 48] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r10 + 2], 3
+ mov rax, qword ptr [rsp + 208] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 2], 4
+ mov rax, qword ptr [rsp + 144] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 2], 5
+ mov r9, qword ptr [rsp + 184] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r9 + 2], 6
+ mov rdx, qword ptr [rsp + 160] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rdx + 2], 7
+ mov rax, qword ptr [rsp + 224] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 2], 8
+ mov r12, r13
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r13 + 2], 9
+ mov r13, qword ptr [rsp + 192] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r13 + 2], 10
+ mov r11, qword ptr [rsp + 88] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r11 + 2], 11
+ mov r14, qword ptr [rsp + 128] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r14 + 2], 12
+ mov r15, qword ptr [rsp + 240] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r15 + 2], 13
+ mov rax, qword ptr [rsp + 72] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 2], 14
+ mov rax, qword ptr [rsp + 80] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 2], 15
+ mov rax, qword ptr [rsp + 104] # 8-byte Reload
+ vmovdqa xmm3, xmmword ptr [rsp + 448] # 16-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rax + 2], 1
+ mov rdi, qword ptr [rsp + 152] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rdi + 2], 2
+ mov rdi, qword ptr [rsp + 176] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rdi + 2], 3
+ mov rdi, qword ptr [rsp + 120] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rdi + 2], 4
+ mov rdi, qword ptr [rsp + 168] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rdi + 2], 5
+ mov rdi, qword ptr [rsp + 232] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rdi + 2], 6
+ mov rdi, qword ptr [rsp + 216] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rdi + 2], 7
+ mov rbx, qword ptr [rsp + 56] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rbx + 2], 8
+ mov rbx, qword ptr [rsp + 64] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rbx + 2], 9
+ mov rbx, qword ptr [rsp + 40] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rbx + 2], 10
+ mov rbx, qword ptr [rsp + 200] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rbx + 2], 11
+ mov rbx, qword ptr [rsp + 320] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rbx + 2], 12
+ mov rbx, qword ptr [rsp + 32] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rbx + 2], 13
+ mov rbx, qword ptr [rsp + 96] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rbx + 2], 14
+ mov rbx, qword ptr [rsp + 288] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rbx + 2], 15
+ vpinsrb xmm4, xmm11, byte ptr [rsi + r8 + 3], 1
+ vpinsrb xmm4, xmm4, byte ptr [rsi + rcx + 3], 2
+ vpinsrb xmm4, xmm4, byte ptr [rsi + r10 + 3], 3
+ mov rbx, qword ptr [rsp + 208] # 8-byte Reload
+ vpinsrb xmm4, xmm4, byte ptr [rsi + rbx + 3], 4
+ mov rcx, qword ptr [rsp + 144] # 8-byte Reload
+ vpinsrb xmm4, xmm4, byte ptr [rsi + rcx + 3], 5
+ vpinsrb xmm4, xmm4, byte ptr [rsi + r9 + 3], 6
+ vpinsrb xmm4, xmm4, byte ptr [rsi + rdx + 3], 7
+ mov rdx, qword ptr [rsp + 224] # 8-byte Reload
+ vpinsrb xmm4, xmm4, byte ptr [rsi + rdx + 3], 8
+ vpinsrb xmm4, xmm4, byte ptr [rsi + r12 + 3], 9
+ vpinsrb xmm4, xmm4, byte ptr [rsi + r13 + 3], 10
+ vpinsrb xmm4, xmm4, byte ptr [rsi + r11 + 3], 11
+ vpinsrb xmm4, xmm4, byte ptr [rsi + r14 + 3], 12
+ vpinsrb xmm4, xmm4, byte ptr [rsi + r15 + 3], 13
+ mov r9, qword ptr [rsp + 72] # 8-byte Reload
+ vpinsrb xmm4, xmm4, byte ptr [rsi + r9 + 3], 14
+ mov r11, qword ptr [rsp + 80] # 8-byte Reload
+ vpinsrb xmm4, xmm4, byte ptr [rsi + r11 + 3], 15
+ vpinsrb xmm5, xmm8, byte ptr [rsi + rax + 3], 1
+ mov rbx, qword ptr [rsp + 152] # 8-byte Reload
+ vpinsrb xmm5, xmm5, byte ptr [rsi + rbx + 3], 2
+ mov rax, qword ptr [rsp + 176] # 8-byte Reload
+ vpinsrb xmm5, xmm5, byte ptr [rsi + rax + 3], 3
+ mov rax, qword ptr [rsp + 120] # 8-byte Reload
+ vpinsrb xmm5, xmm5, byte ptr [rsi + rax + 3], 4
+ mov r10, qword ptr [rsp + 168] # 8-byte Reload
+ vpinsrb xmm5, xmm5, byte ptr [rsi + r10 + 3], 5
+ mov r14, qword ptr [rsp + 232] # 8-byte Reload
+ vpinsrb xmm5, xmm5, byte ptr [rsi + r14 + 3], 6
+ vpinsrb xmm5, xmm5, byte ptr [rsi + rdi + 3], 7
+ mov rax, qword ptr [rsp + 56] # 8-byte Reload
+ vpinsrb xmm5, xmm5, byte ptr [rsi + rax + 3], 8
+ mov r15, qword ptr [rsp + 64] # 8-byte Reload
+ vpinsrb xmm5, xmm5, byte ptr [rsi + r15 + 3], 9
+ mov rax, qword ptr [rsp + 40] # 8-byte Reload
+ vpinsrb xmm5, xmm5, byte ptr [rsi + rax + 3], 10
+ mov rax, qword ptr [rsp + 200] # 8-byte Reload
+ vpinsrb xmm5, xmm5, byte ptr [rsi + rax + 3], 11
+ mov rax, qword ptr [rsp + 320] # 8-byte Reload
+ vpinsrb xmm5, xmm5, byte ptr [rsi + rax + 3], 12
+ mov rax, qword ptr [rsp + 32] # 8-byte Reload
+ vpinsrb xmm5, xmm5, byte ptr [rsi + rax + 3], 13
+ vinserti128 ymm0, ymm3, xmm0, 1
+ vmovdqa ymmword ptr [rsp + 480], ymm0 # 32-byte Spill
+ mov rax, qword ptr [rsp + 96] # 8-byte Reload
+ vpinsrb xmm0, xmm5, byte ptr [rsi + rax + 3], 14
+ mov rax, qword ptr [rsp + 248] # 8-byte Reload
+ movzx edi, byte ptr [rsi + rax + 9]
+ vmovd xmm8, edi
+ mov r12, qword ptr [rsp + 288] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r12 + 3], 15
+ vinserti128 ymm0, ymm0, xmm4, 1
+ vmovdqa ymmword ptr [rsp + 448], ymm0 # 32-byte Spill
+ mov rax, qword ptr [rsp + 264] # 8-byte Reload
+ movzx edi, byte ptr [rsi + rax + 9]
+ vmovd xmm11, edi
+ vmovdqa xmm0, xmmword ptr [rsp + 416] # 16-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r8 + 4], 1
+ mov rax, qword ptr [rsp + 256] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 4], 2
+ mov rax, qword ptr [rsp + 48] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 4], 3
+ mov r13, qword ptr [rsp + 208] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r13 + 4], 4
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rcx + 4], 5
+ mov rax, qword ptr [rsp + 184] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 4], 6
+ mov rax, qword ptr [rsp + 160] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 4], 7
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rdx + 4], 8
+ mov rax, qword ptr [rsp + 112] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 4], 9
+ mov rax, qword ptr [rsp + 192] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 4], 10
+ mov rax, qword ptr [rsp + 88] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 4], 11
+ mov rax, qword ptr [rsp + 128] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 4], 12
+ mov rax, qword ptr [rsp + 240] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 4], 13
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r9 + 4], 14
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r11 + 4], 15
+ mov rax, qword ptr [rsp + 104] # 8-byte Reload
+ vpinsrb xmm3, xmm13, byte ptr [rsi + rax + 4], 1
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rbx + 4], 2
+ mov r11, qword ptr [rsp + 176] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r11 + 4], 3
+ mov rax, qword ptr [rsp + 120] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rax + 4], 4
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r10 + 4], 5
+ mov rdi, r14
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r14 + 4], 6
+ mov r10, qword ptr [rsp + 216] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r10 + 4], 7
+ mov r9, qword ptr [rsp + 56] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r9 + 4], 8
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r15 + 4], 9
+ mov rbx, qword ptr [rsp + 40] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rbx + 4], 10
+ mov r14, qword ptr [rsp + 200] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r14 + 4], 11
+ mov rbx, qword ptr [rsp + 320] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rbx + 4], 12
+ mov rbx, qword ptr [rsp + 32] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rbx + 4], 13
+ mov r15, qword ptr [rsp + 96] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r15 + 4], 14
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r12 + 4], 15
+ vpinsrb xmm4, xmm14, byte ptr [rsi + r8 + 5], 1
+ mov r15, qword ptr [rsp + 256] # 8-byte Reload
+ vpinsrb xmm4, xmm4, byte ptr [rsi + r15 + 5], 2
+ mov rbx, qword ptr [rsp + 48] # 8-byte Reload
+ vpinsrb xmm4, xmm4, byte ptr [rsi + rbx + 5], 3
+ vpinsrb xmm4, xmm4, byte ptr [rsi + r13 + 5], 4
+ vpinsrb xmm4, xmm4, byte ptr [rsi + rcx + 5], 5
+ mov r13, qword ptr [rsp + 184] # 8-byte Reload
+ vpinsrb xmm4, xmm4, byte ptr [rsi + r13 + 5], 6
+ mov rcx, qword ptr [rsp + 160] # 8-byte Reload
+ vpinsrb xmm4, xmm4, byte ptr [rsi + rcx + 5], 7
+ vpinsrb xmm4, xmm4, byte ptr [rsi + rdx + 5], 8
+ mov rcx, qword ptr [rsp + 112] # 8-byte Reload
+ vpinsrb xmm4, xmm4, byte ptr [rsi + rcx + 5], 9
+ mov rcx, qword ptr [rsp + 192] # 8-byte Reload
+ vpinsrb xmm4, xmm4, byte ptr [rsi + rcx + 5], 10
+ mov rdx, qword ptr [rsp + 88] # 8-byte Reload
+ vpinsrb xmm4, xmm4, byte ptr [rsi + rdx + 5], 11
+ mov rcx, qword ptr [rsp + 128] # 8-byte Reload
+ vpinsrb xmm4, xmm4, byte ptr [rsi + rcx + 5], 12
+ mov r8, qword ptr [rsp + 240] # 8-byte Reload
+ vpinsrb xmm4, xmm4, byte ptr [rsi + r8 + 5], 13
+ mov rcx, qword ptr [rsp + 72] # 8-byte Reload
+ vpinsrb xmm4, xmm4, byte ptr [rsi + rcx + 5], 14
+ mov rcx, qword ptr [rsp + 80] # 8-byte Reload
+ vpinsrb xmm4, xmm4, byte ptr [rsi + rcx + 5], 15
+ mov r12, qword ptr [rsp + 104] # 8-byte Reload
+ vpinsrb xmm5, xmm6, byte ptr [rsi + r12 + 5], 1
+ mov rdx, qword ptr [rsp + 152] # 8-byte Reload
+ vpinsrb xmm5, xmm5, byte ptr [rsi + rdx + 5], 2
+ vpinsrb xmm5, xmm5, byte ptr [rsi + r11 + 5], 3
+ vpinsrb xmm5, xmm5, byte ptr [rsi + rax + 5], 4
+ mov rax, qword ptr [rsp + 168] # 8-byte Reload
+ vpinsrb xmm5, xmm5, byte ptr [rsi + rax + 5], 5
+ vpinsrb xmm5, xmm5, byte ptr [rsi + rdi + 5], 6
+ vpinsrb xmm5, xmm5, byte ptr [rsi + r10 + 5], 7
+ vpinsrb xmm5, xmm5, byte ptr [rsi + r9 + 5], 8
+ mov r9, qword ptr [rsp + 64] # 8-byte Reload
+ vpinsrb xmm5, xmm5, byte ptr [rsi + r9 + 5], 9
+ mov rax, qword ptr [rsp + 40] # 8-byte Reload
+ vpinsrb xmm5, xmm5, byte ptr [rsi + rax + 5], 10
+ vpinsrb xmm5, xmm5, byte ptr [rsi + r14 + 5], 11
+ mov rax, qword ptr [rsp + 320] # 8-byte Reload
+ vpinsrb xmm5, xmm5, byte ptr [rsi + rax + 5], 12
+ mov rax, qword ptr [rsp + 32] # 8-byte Reload
+ vpinsrb xmm5, xmm5, byte ptr [rsi + rax + 5], 13
+ mov rax, qword ptr [rsp + 96] # 8-byte Reload
+ vpinsrb xmm5, xmm5, byte ptr [rsi + rax + 5], 14
+ vinserti128 ymm14, ymm3, xmm0, 1
+ mov rax, qword ptr [rsp + 288] # 8-byte Reload
+ vpinsrb xmm0, xmm5, byte ptr [rsi + rax + 5], 15
+ mov rax, qword ptr [rsp + 248] # 8-byte Reload
+ movzx edi, byte ptr [rsi + rax + 10]
+ vmovd xmm3, edi
+ vinserti128 ymm0, ymm0, xmm4, 1
+ vmovdqa ymmword ptr [rsp + 416], ymm0 # 32-byte Spill
+ mov rax, qword ptr [rsp + 264] # 8-byte Reload
+ movzx edi, byte ptr [rsi + rax + 10]
+ vmovd xmm4, edi
+ mov r11, qword ptr [rsp + 136] # 8-byte Reload
+ vpinsrb xmm0, xmm12, byte ptr [rsi + r11 + 6], 1
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r15 + 6], 2
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rbx + 6], 3
+ mov rax, qword ptr [rsp + 208] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 6], 4
+ mov rax, qword ptr [rsp + 144] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 6], 5
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r13 + 6], 6
+ mov rdx, qword ptr [rsp + 160] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rdx + 6], 7
+ mov rax, qword ptr [rsp + 224] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 6], 8
+ mov rax, qword ptr [rsp + 112] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 6], 9
+ mov r14, qword ptr [rsp + 192] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r14 + 6], 10
+ mov r10, qword ptr [rsp + 88] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r10 + 6], 11
+ mov rax, qword ptr [rsp + 128] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 6], 12
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r8 + 6], 13
+ mov rdi, qword ptr [rsp + 72] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rdi + 6], 14
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rcx + 6], 15
+ vpinsrb xmm5, xmm7, byte ptr [rsi + r12 + 6], 1
+ mov rcx, qword ptr [rsp + 152] # 8-byte Reload
+ vpinsrb xmm5, xmm5, byte ptr [rsi + rcx + 6], 2
+ mov rcx, qword ptr [rsp + 176] # 8-byte Reload
+ vpinsrb xmm5, xmm5, byte ptr [rsi + rcx + 6], 3
+ mov rcx, qword ptr [rsp + 120] # 8-byte Reload
+ vpinsrb xmm5, xmm5, byte ptr [rsi + rcx + 6], 4
+ mov rdi, qword ptr [rsp + 168] # 8-byte Reload
+ vpinsrb xmm5, xmm5, byte ptr [rsi + rdi + 6], 5
+ mov rbx, qword ptr [rsp + 232] # 8-byte Reload
+ vpinsrb xmm5, xmm5, byte ptr [rsi + rbx + 6], 6
+ mov rcx, qword ptr [rsp + 216] # 8-byte Reload
+ vpinsrb xmm5, xmm5, byte ptr [rsi + rcx + 6], 7
+ mov r12, qword ptr [rsp + 56] # 8-byte Reload
+ vpinsrb xmm5, xmm5, byte ptr [rsi + r12 + 6], 8
+ vpinsrb xmm5, xmm5, byte ptr [rsi + r9 + 6], 9
+ mov rcx, qword ptr [rsp + 40] # 8-byte Reload
+ vpinsrb xmm5, xmm5, byte ptr [rsi + rcx + 6], 10
+ mov r9, qword ptr [rsp + 200] # 8-byte Reload
+ vpinsrb xmm5, xmm5, byte ptr [rsi + r9 + 6], 11
+ mov r8, qword ptr [rsp + 320] # 8-byte Reload
+ vpinsrb xmm5, xmm5, byte ptr [rsi + r8 + 6], 12
+ mov r13, qword ptr [rsp + 32] # 8-byte Reload
+ vpinsrb xmm5, xmm5, byte ptr [rsi + r13 + 6], 13
+ mov rcx, qword ptr [rsp + 96] # 8-byte Reload
+ vpinsrb xmm5, xmm5, byte ptr [rsi + rcx + 6], 14
+ mov rcx, qword ptr [rsp + 288] # 8-byte Reload
+ vpinsrb xmm5, xmm5, byte ptr [rsi + rcx + 6], 15
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r11 + 7], 1
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r15 + 7], 2
+ mov rcx, qword ptr [rsp + 48] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rcx + 7], 3
+ mov r11, qword ptr [rsp + 208] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r11 + 7], 4
+ mov rcx, qword ptr [rsp + 144] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rcx + 7], 5
+ mov rcx, qword ptr [rsp + 184] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rcx + 7], 6
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rdx + 7], 7
+ mov rdx, qword ptr [rsp + 224] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rdx + 7], 8
+ mov rcx, qword ptr [rsp + 112] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rcx + 7], 9
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r14 + 7], 10
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r10 + 7], 11
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rax + 7], 12
+ mov rax, qword ptr [rsp + 240] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rax + 7], 13
+ mov r15, qword ptr [rsp + 72] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r15 + 7], 14
+ mov rcx, qword ptr [rsp + 80] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rcx + 7], 15
+ mov rcx, qword ptr [rsp + 104] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rcx + 7], 1
+ mov rcx, qword ptr [rsp + 152] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rcx + 7], 2
+ mov rcx, qword ptr [rsp + 176] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rcx + 7], 3
+ mov rdx, qword ptr [rsp + 120] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rdx + 7], 4
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rdi + 7], 5
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rbx + 7], 6
+ mov r13, qword ptr [rsp + 216] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r13 + 7], 7
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r12 + 7], 8
+ mov rdx, qword ptr [rsp + 64] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rdx + 7], 9
+ mov rcx, qword ptr [rsp + 40] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rcx + 7], 10
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r9 + 7], 11
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r8 + 7], 12
+ mov rcx, qword ptr [rsp + 32] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rcx + 7], 13
+ vinserti128 ymm0, ymm5, xmm0, 1
+ vmovdqa ymmword ptr [rsp + 1184], ymm0 # 32-byte Spill
+ mov rcx, qword ptr [rsp + 96] # 8-byte Reload
+ vpinsrb xmm0, xmm1, byte ptr [rsi + rcx + 7], 14
+ mov rcx, qword ptr [rsp + 248] # 8-byte Reload
+ movzx edi, byte ptr [rsi + rcx + 11]
+ vmovd xmm1, edi
+ mov rcx, qword ptr [rsp + 288] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rcx + 7], 15
+ vinserti128 ymm0, ymm0, xmm2, 1
+ vmovdqa ymmword ptr [rsp + 1152], ymm0 # 32-byte Spill
+ mov rcx, qword ptr [rsp + 264] # 8-byte Reload
+ movzx edi, byte ptr [rsi + rcx + 11]
+ vmovd xmm2, edi
+ mov rcx, qword ptr [rsp + 136] # 8-byte Reload
+ vpinsrb xmm0, xmm9, byte ptr [rsi + rcx + 8], 1
+ mov r8, qword ptr [rsp + 256] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r8 + 8], 2
+ mov rcx, qword ptr [rsp + 48] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rcx + 8], 3
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r11 + 8], 4
+ mov rcx, qword ptr [rsp + 144] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rcx + 8], 5
+ mov rcx, qword ptr [rsp + 184] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rcx + 8], 6
+ mov rdx, qword ptr [rsp + 160] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rdx + 8], 7
+ mov r14, qword ptr [rsp + 224] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r14 + 8], 8
+ mov r10, qword ptr [rsp + 112] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r10 + 8], 9
+ mov rbx, qword ptr [rsp + 192] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rbx + 8], 10
+ mov rdx, qword ptr [rsp + 88] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rdx + 8], 11
+ mov r12, qword ptr [rsp + 128] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r12 + 8], 12
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 8], 13
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r15 + 8], 14
+ mov rax, qword ptr [rsp + 80] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 8], 15
+ mov rax, qword ptr [rsp + 104] # 8-byte Reload
+ vpinsrb xmm5, xmm10, byte ptr [rsi + rax + 8], 1
+ mov r9, qword ptr [rsp + 152] # 8-byte Reload
+ vpinsrb xmm5, xmm5, byte ptr [rsi + r9 + 8], 2
+ mov rax, qword ptr [rsp + 176] # 8-byte Reload
+ vpinsrb xmm5, xmm5, byte ptr [rsi + rax + 8], 3
+ mov rdi, qword ptr [rsp + 120] # 8-byte Reload
+ vpinsrb xmm5, xmm5, byte ptr [rsi + rdi + 8], 4
+ mov rax, qword ptr [rsp + 168] # 8-byte Reload
+ vpinsrb xmm5, xmm5, byte ptr [rsi + rax + 8], 5
+ mov r15, qword ptr [rsp + 232] # 8-byte Reload
+ vpinsrb xmm5, xmm5, byte ptr [rsi + r15 + 8], 6
+ vpinsrb xmm5, xmm5, byte ptr [rsi + r13 + 8], 7
+ mov rax, qword ptr [rsp + 56] # 8-byte Reload
+ vpinsrb xmm5, xmm5, byte ptr [rsi + rax + 8], 8
+ mov rax, qword ptr [rsp + 64] # 8-byte Reload
+ vpinsrb xmm5, xmm5, byte ptr [rsi + rax + 8], 9
+ mov rax, qword ptr [rsp + 40] # 8-byte Reload
+ vpinsrb xmm5, xmm5, byte ptr [rsi + rax + 8], 10
+ mov rax, qword ptr [rsp + 200] # 8-byte Reload
+ vpinsrb xmm5, xmm5, byte ptr [rsi + rax + 8], 11
+ mov rdx, qword ptr [rsp + 320] # 8-byte Reload
+ vpinsrb xmm5, xmm5, byte ptr [rsi + rdx + 8], 12
+ mov rdx, qword ptr [rsp + 32] # 8-byte Reload
+ vpinsrb xmm5, xmm5, byte ptr [rsi + rdx + 8], 13
+ mov rdx, qword ptr [rsp + 96] # 8-byte Reload
+ vpinsrb xmm5, xmm5, byte ptr [rsi + rdx + 8], 14
+ mov rdx, qword ptr [rsp + 288] # 8-byte Reload
+ vpinsrb xmm5, xmm5, byte ptr [rsi + rdx + 8], 15
+ mov r13, qword ptr [rsp + 136] # 8-byte Reload
+ vpinsrb xmm6, xmm8, byte ptr [rsi + r13 + 9], 1
+ vpinsrb xmm6, xmm6, byte ptr [rsi + r8 + 9], 2
+ mov rdx, qword ptr [rsp + 48] # 8-byte Reload
+ vpinsrb xmm6, xmm6, byte ptr [rsi + rdx + 9], 3
+ vpinsrb xmm6, xmm6, byte ptr [rsi + r11 + 9], 4
+ mov rdx, qword ptr [rsp + 144] # 8-byte Reload
+ vpinsrb xmm6, xmm6, byte ptr [rsi + rdx + 9], 5
+ vpinsrb xmm6, xmm6, byte ptr [rsi + rcx + 9], 6
+ mov rcx, qword ptr [rsp + 160] # 8-byte Reload
+ vpinsrb xmm6, xmm6, byte ptr [rsi + rcx + 9], 7
+ vpinsrb xmm6, xmm6, byte ptr [rsi + r14 + 9], 8
+ vpinsrb xmm6, xmm6, byte ptr [rsi + r10 + 9], 9
+ vpinsrb xmm6, xmm6, byte ptr [rsi + rbx + 9], 10
+ mov rcx, qword ptr [rsp + 88] # 8-byte Reload
+ vpinsrb xmm6, xmm6, byte ptr [rsi + rcx + 9], 11
+ vpinsrb xmm6, xmm6, byte ptr [rsi + r12 + 9], 12
+ mov rcx, qword ptr [rsp + 240] # 8-byte Reload
+ vpinsrb xmm6, xmm6, byte ptr [rsi + rcx + 9], 13
+ mov rcx, qword ptr [rsp + 72] # 8-byte Reload
+ vpinsrb xmm6, xmm6, byte ptr [rsi + rcx + 9], 14
+ mov r13, qword ptr [rsp + 80] # 8-byte Reload
+ vpinsrb xmm6, xmm6, byte ptr [rsi + r13 + 9], 15
+ mov rcx, qword ptr [rsp + 104] # 8-byte Reload
+ vpinsrb xmm7, xmm11, byte ptr [rsi + rcx + 9], 1
+ vpinsrb xmm7, xmm7, byte ptr [rsi + r9 + 9], 2
+ mov rcx, qword ptr [rsp + 176] # 8-byte Reload
+ vpinsrb xmm7, xmm7, byte ptr [rsi + rcx + 9], 3
+ vpinsrb xmm7, xmm7, byte ptr [rsi + rdi + 9], 4
+ mov r11, qword ptr [rsp + 168] # 8-byte Reload
+ vpinsrb xmm7, xmm7, byte ptr [rsi + r11 + 9], 5
+ vpinsrb xmm7, xmm7, byte ptr [rsi + r15 + 9], 6
+ mov rbx, qword ptr [rsp + 216] # 8-byte Reload
+ vpinsrb xmm7, xmm7, byte ptr [rsi + rbx + 9], 7
+ mov rcx, qword ptr [rsp + 56] # 8-byte Reload
+ vpinsrb xmm7, xmm7, byte ptr [rsi + rcx + 9], 8
+ mov rcx, qword ptr [rsp + 64] # 8-byte Reload
+ vpinsrb xmm7, xmm7, byte ptr [rsi + rcx + 9], 9
+ mov rcx, qword ptr [rsp + 40] # 8-byte Reload
+ vpinsrb xmm7, xmm7, byte ptr [rsi + rcx + 9], 10
+ vpinsrb xmm7, xmm7, byte ptr [rsi + rax + 9], 11
+ mov rax, qword ptr [rsp + 320] # 8-byte Reload
+ vpinsrb xmm7, xmm7, byte ptr [rsi + rax + 9], 12
+ mov rax, qword ptr [rsp + 32] # 8-byte Reload
+ vpinsrb xmm7, xmm7, byte ptr [rsi + rax + 9], 13
+ mov rax, qword ptr [rsp + 96] # 8-byte Reload
+ vpinsrb xmm7, xmm7, byte ptr [rsi + rax + 9], 14
+ vinserti128 ymm0, ymm5, xmm0, 1
+ vmovdqa ymmword ptr [rsp + 1120], ymm0 # 32-byte Spill
+ mov rax, qword ptr [rsp + 288] # 8-byte Reload
+ vpinsrb xmm5, xmm7, byte ptr [rsi + rax + 9], 15
+ mov rax, qword ptr [rsp + 248] # 8-byte Reload
+ movzx edi, byte ptr [rsi + rax + 12]
+ vmovd xmm0, edi
+ vinserti128 ymm5, ymm5, xmm6, 1
+ vmovdqa ymmword ptr [rsp + 1088], ymm5 # 32-byte Spill
+ mov rax, qword ptr [rsp + 264] # 8-byte Reload
+ movzx edi, byte ptr [rsi + rax + 12]
+ vmovd xmm5, edi
+ mov rdx, qword ptr [rsp + 136] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rdx + 10], 1
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r8 + 10], 2
+ mov rcx, qword ptr [rsp + 48] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rcx + 10], 3
+ mov rax, qword ptr [rsp + 208] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rax + 10], 4
+ mov r12, qword ptr [rsp + 144] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r12 + 10], 5
+ mov rax, qword ptr [rsp + 184] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rax + 10], 6
+ mov r9, qword ptr [rsp + 160] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r9 + 10], 7
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r14 + 10], 8
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r10 + 10], 9
+ mov rax, qword ptr [rsp + 192] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rax + 10], 10
+ mov rax, qword ptr [rsp + 88] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rax + 10], 11
+ mov rax, qword ptr [rsp + 128] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rax + 10], 12
+ mov r10, qword ptr [rsp + 240] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r10 + 10], 13
+ mov r15, qword ptr [rsp + 72] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r15 + 10], 14
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r13 + 10], 15
+ mov rax, qword ptr [rsp + 104] # 8-byte Reload
+ vpinsrb xmm4, xmm4, byte ptr [rsi + rax + 10], 1
+ mov rax, qword ptr [rsp + 152] # 8-byte Reload
+ vpinsrb xmm4, xmm4, byte ptr [rsi + rax + 10], 2
+ mov rdi, qword ptr [rsp + 176] # 8-byte Reload
+ vpinsrb xmm4, xmm4, byte ptr [rsi + rdi + 10], 3
+ mov rax, qword ptr [rsp + 120] # 8-byte Reload
+ vpinsrb xmm4, xmm4, byte ptr [rsi + rax + 10], 4
+ vpinsrb xmm4, xmm4, byte ptr [rsi + r11 + 10], 5
+ mov r11, qword ptr [rsp + 232] # 8-byte Reload
+ vpinsrb xmm4, xmm4, byte ptr [rsi + r11 + 10], 6
+ vpinsrb xmm4, xmm4, byte ptr [rsi + rbx + 10], 7
+ mov rax, qword ptr [rsp + 56] # 8-byte Reload
+ vpinsrb xmm4, xmm4, byte ptr [rsi + rax + 10], 8
+ mov rax, qword ptr [rsp + 64] # 8-byte Reload
+ vpinsrb xmm4, xmm4, byte ptr [rsi + rax + 10], 9
+ mov rbx, qword ptr [rsp + 40] # 8-byte Reload
+ vpinsrb xmm4, xmm4, byte ptr [rsi + rbx + 10], 10
+ mov rbx, qword ptr [rsp + 200] # 8-byte Reload
+ vpinsrb xmm4, xmm4, byte ptr [rsi + rbx + 10], 11
+ mov rbx, qword ptr [rsp + 320] # 8-byte Reload
+ vpinsrb xmm4, xmm4, byte ptr [rsi + rbx + 10], 12
+ mov rbx, qword ptr [rsp + 32] # 8-byte Reload
+ vpinsrb xmm4, xmm4, byte ptr [rsi + rbx + 10], 13
+ mov rbx, qword ptr [rsp + 96] # 8-byte Reload
+ vpinsrb xmm4, xmm4, byte ptr [rsi + rbx + 10], 14
+ mov r13, qword ptr [rsp + 288] # 8-byte Reload
+ vpinsrb xmm4, xmm4, byte ptr [rsi + r13 + 10], 15
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rdx + 11], 1
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r8 + 11], 2
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rcx + 11], 3
+ mov rcx, qword ptr [rsp + 208] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rcx + 11], 4
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r12 + 11], 5
+ mov r8, qword ptr [rsp + 184] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r8 + 11], 6
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r9 + 11], 7
+ mov r12, r9
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r14 + 11], 8
+ mov rdx, qword ptr [rsp + 112] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rdx + 11], 9
+ mov rdx, qword ptr [rsp + 192] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rdx + 11], 10
+ mov rdx, qword ptr [rsp + 88] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rdx + 11], 11
+ mov rdx, qword ptr [rsp + 128] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rdx + 11], 12
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r10 + 11], 13
+ mov r13, r10
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r15 + 11], 14
+ mov rdx, qword ptr [rsp + 80] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rdx + 11], 15
+ mov rdx, qword ptr [rsp + 104] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rdx + 11], 1
+ mov r14, qword ptr [rsp + 152] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r14 + 11], 2
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rdi + 11], 3
+ mov rdi, qword ptr [rsp + 120] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rdi + 11], 4
+ mov rdi, qword ptr [rsp + 168] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rdi + 11], 5
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r11 + 11], 6
+ mov r9, qword ptr [rsp + 216] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r9 + 11], 7
+ mov rdi, qword ptr [rsp + 56] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rdi + 11], 8
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rax + 11], 9
+ mov rax, qword ptr [rsp + 40] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rax + 11], 10
+ mov rax, qword ptr [rsp + 200] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rax + 11], 11
+ mov rax, qword ptr [rsp + 320] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rax + 11], 12
+ mov rax, qword ptr [rsp + 32] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rax + 11], 13
+ vinserti128 ymm3, ymm4, xmm3, 1
+ vmovdqa ymmword ptr [rsp + 1056], ymm3 # 32-byte Spill
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rbx + 11], 14
+ mov rax, qword ptr [rsp + 248] # 8-byte Reload
+ movzx edi, byte ptr [rsi + rax + 13]
+ vmovd xmm3, edi
+ mov rax, qword ptr [rsp + 288] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rax + 11], 15
+ vinserti128 ymm1, ymm2, xmm1, 1
+ vmovdqa ymmword ptr [rsp + 1024], ymm1 # 32-byte Spill
+ mov rax, qword ptr [rsp + 264] # 8-byte Reload
+ movzx edi, byte ptr [rsi + rax + 13]
+ vmovd xmm1, edi
+ mov rax, qword ptr [rsp + 136] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 12], 1
+ mov rax, qword ptr [rsp + 256] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 12], 2
+ mov r15, qword ptr [rsp + 48] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r15 + 12], 3
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rcx + 12], 4
+ mov r10, qword ptr [rsp + 144] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r10 + 12], 5
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r8 + 12], 6
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r12 + 12], 7
+ mov r12, qword ptr [rsp + 224] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r12 + 12], 8
+ mov rax, qword ptr [rsp + 112] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 12], 9
+ mov rax, qword ptr [rsp + 192] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 12], 10
+ mov rax, qword ptr [rsp + 88] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 12], 11
+ mov rcx, qword ptr [rsp + 128] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rcx + 12], 12
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r13 + 12], 13
+ mov r13, qword ptr [rsp + 72] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r13 + 12], 14
+ mov rax, qword ptr [rsp + 80] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 12], 15
+ vpinsrb xmm2, xmm5, byte ptr [rsi + rdx + 12], 1
+ mov rdi, r14
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r14 + 12], 2
+ mov r11, qword ptr [rsp + 176] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r11 + 12], 3
+ mov rdx, qword ptr [rsp + 120] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rdx + 12], 4
+ mov r14, qword ptr [rsp + 168] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r14 + 12], 5
+ mov rax, qword ptr [rsp + 232] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rax + 12], 6
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r9 + 12], 7
+ mov rax, qword ptr [rsp + 56] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rax + 12], 8
+ mov rbx, qword ptr [rsp + 64] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rbx + 12], 9
+ mov rbx, qword ptr [rsp + 40] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rbx + 12], 10
+ mov rbx, qword ptr [rsp + 200] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rbx + 12], 11
+ mov rbx, qword ptr [rsp + 320] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rbx + 12], 12
+ mov r9, qword ptr [rsp + 32] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r9 + 12], 13
+ mov r8, qword ptr [rsp + 96] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r8 + 12], 14
+ mov rbx, qword ptr [rsp + 288] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rbx + 12], 15
+ mov rbx, qword ptr [rsp + 136] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rbx + 13], 1
+ mov rbx, qword ptr [rsp + 256] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rbx + 13], 2
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r15 + 13], 3
+ mov rbx, qword ptr [rsp + 208] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rbx + 13], 4
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r10 + 13], 5
+ mov r15, qword ptr [rsp + 184] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r15 + 13], 6
+ mov rbx, qword ptr [rsp + 160] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rbx + 13], 7
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r12 + 13], 8
+ mov r12, qword ptr [rsp + 112] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r12 + 13], 9
+ mov rbx, qword ptr [rsp + 192] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rbx + 13], 10
+ mov rbx, qword ptr [rsp + 88] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rbx + 13], 11
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rcx + 13], 12
+ mov r10, qword ptr [rsp + 240] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r10 + 13], 13
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r13 + 13], 14
+ mov rcx, qword ptr [rsp + 80] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rcx + 13], 15
+ mov rcx, qword ptr [rsp + 104] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rcx + 13], 1
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rdi + 13], 2
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r11 + 13], 3
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rdx + 13], 4
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r14 + 13], 5
+ mov rcx, qword ptr [rsp + 232] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rcx + 13], 6
+ mov rcx, qword ptr [rsp + 216] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rcx + 13], 7
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rax + 13], 8
+ mov rax, qword ptr [rsp + 64] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rax + 13], 9
+ mov rax, qword ptr [rsp + 40] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rax + 13], 10
+ mov rax, qword ptr [rsp + 200] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rax + 13], 11
+ mov rax, qword ptr [rsp + 320] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rax + 13], 12
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r9 + 13], 13
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r8 + 13], 14
+ vinserti128 ymm0, ymm2, xmm0, 1
+ vmovdqa ymmword ptr [rsp + 992], ymm0 # 32-byte Spill
+ mov rax, qword ptr [rsp + 288] # 8-byte Reload
+ vpinsrb xmm0, xmm1, byte ptr [rsi + rax + 13], 15
+ mov r13, qword ptr [rsp + 248] # 8-byte Reload
+ movzx edi, byte ptr [rsi + r13 + 14]
+ vmovd xmm1, edi
+ vinserti128 ymm0, ymm0, xmm3, 1
+ vmovdqa ymmword ptr [rsp + 960], ymm0 # 32-byte Spill
+ mov rax, qword ptr [rsp + 264] # 8-byte Reload
+ movzx edi, byte ptr [rsi + rax + 14]
+ vmovd xmm0, edi
+ mov rax, qword ptr [rsp + 136] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rax + 14], 1
+ mov r11, qword ptr [rsp + 256] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r11 + 14], 2
+ mov rcx, qword ptr [rsp + 48] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rcx + 14], 3
+ mov rbx, qword ptr [rsp + 208] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rbx + 14], 4
+ mov r8, qword ptr [rsp + 144] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r8 + 14], 5
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r15 + 14], 6
+ mov rcx, qword ptr [rsp + 160] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rcx + 14], 7
+ mov rcx, qword ptr [rsp + 224] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rcx + 14], 8
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r12 + 14], 9
+ mov rcx, qword ptr [rsp + 192] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rcx + 14], 10
+ mov rcx, qword ptr [rsp + 88] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rcx + 14], 11
+ mov rdi, qword ptr [rsp + 128] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rdi + 14], 12
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r10 + 14], 13
+ mov rdx, qword ptr [rsp + 72] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rdx + 14], 14
+ mov r9, qword ptr [rsp + 80] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r9 + 14], 15
+ mov rdx, qword ptr [rsp + 104] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rdx + 14], 1
+ mov rdi, qword ptr [rsp + 152] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rdi + 14], 2
+ mov r12, qword ptr [rsp + 176] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r12 + 14], 3
+ mov rdi, qword ptr [rsp + 120] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rdi + 14], 4
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r14 + 14], 5
+ mov rdi, qword ptr [rsp + 232] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rdi + 14], 6
+ mov r10, qword ptr [rsp + 216] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r10 + 14], 7
+ mov rdi, qword ptr [rsp + 56] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rdi + 14], 8
+ mov r14, qword ptr [rsp + 64] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r14 + 14], 9
+ mov rdi, qword ptr [rsp + 40] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rdi + 14], 10
+ mov rdi, qword ptr [rsp + 200] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rdi + 14], 11
+ mov rdi, qword ptr [rsp + 320] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rdi + 14], 12
+ mov rdi, qword ptr [rsp + 32] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rdi + 14], 13
+ mov rdi, qword ptr [rsp + 96] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rdi + 14], 14
+ mov rdi, qword ptr [rsp + 288] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rdi + 14], 15
+ movzx edi, byte ptr [rsi + r13 + 15]
+ vmovd xmm2, edi
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rax + 15], 1
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r11 + 15], 2
+ mov rax, qword ptr [rsp + 48] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rax + 15], 3
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rbx + 15], 4
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r8 + 15], 5
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r15 + 15], 6
+ mov r15, qword ptr [rsp + 160] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r15 + 15], 7
+ mov r8, qword ptr [rsp + 224] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r8 + 15], 8
+ mov rax, qword ptr [rsp + 112] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rax + 15], 9
+ mov rbx, qword ptr [rsp + 192] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rbx + 15], 10
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rcx + 15], 11
+ mov rax, qword ptr [rsp + 128] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rax + 15], 12
+ mov rax, qword ptr [rsp + 240] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rax + 15], 13
+ mov r13, qword ptr [rsp + 72] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r13 + 15], 14
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r9 + 15], 15
+ mov rax, qword ptr [rsp + 264] # 8-byte Reload
+ movzx edi, byte ptr [rsi + rax + 15]
+ vmovd xmm3, edi
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rdx + 15], 1
+ mov rax, qword ptr [rsp + 152] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rax + 15], 2
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r12 + 15], 3
+ mov rax, qword ptr [rsp + 120] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rax + 15], 4
+ mov rdx, qword ptr [rsp + 168] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rdx + 15], 5
+ mov rax, qword ptr [rsp + 232] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rax + 15], 6
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r10 + 15], 7
+ mov rax, qword ptr [rsp + 56] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rax + 15], 8
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r14 + 15], 9
+ mov rax, qword ptr [rsp + 40] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rax + 15], 10
+ mov r14, qword ptr [rsp + 200] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r14 + 15], 11
+ mov rax, qword ptr [rsp + 320] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rax + 15], 12
+ mov rcx, qword ptr [rsp + 32] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rcx + 15], 13
+ mov r11, qword ptr [rsp + 96] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r11 + 15], 14
+ mov r12, qword ptr [rsp + 288] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r12 + 15], 15
+ vinserti128 ymm0, ymm0, xmm1, 1
+ vmovdqa ymmword ptr [rsp + 896], ymm0 # 32-byte Spill
+ vinserti128 ymm0, ymm3, xmm2, 1
+ vmovdqa ymmword ptr [rsp + 928], ymm0 # 32-byte Spill
+ mov r10, qword ptr [rsp + 248] # 8-byte Reload
+ movzx edi, byte ptr [rsi + r10 + 16]
+ vmovd xmm0, edi
+ mov rcx, qword ptr [rsp + 136] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rcx + 16], 1
+ mov rcx, qword ptr [rsp + 256] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rcx + 16], 2
+ mov rcx, qword ptr [rsp + 48] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rcx + 16], 3
+ mov r9, qword ptr [rsp + 208] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r9 + 16], 4
+ mov rcx, qword ptr [rsp + 144] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rcx + 16], 5
+ mov rcx, qword ptr [rsp + 184] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rcx + 16], 6
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r15 + 16], 7
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r8 + 16], 8
+ mov rcx, qword ptr [rsp + 112] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rcx + 16], 9
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rbx + 16], 10
+ mov rdi, qword ptr [rsp + 88] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rdi + 16], 11
+ mov rdi, qword ptr [rsp + 128] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rdi + 16], 12
+ mov r8, qword ptr [rsp + 240] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r8 + 16], 13
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r13 + 16], 14
+ mov rdi, qword ptr [rsp + 80] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rdi + 16], 15
+ mov rbx, qword ptr [rsp + 264] # 8-byte Reload
+ movzx edi, byte ptr [rsi + rbx + 16]
+ vmovd xmm1, edi
+ mov rdi, qword ptr [rsp + 104] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rdi + 16], 1
+ mov rdi, qword ptr [rsp + 152] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rdi + 16], 2
+ mov rdi, qword ptr [rsp + 176] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rdi + 16], 3
+ mov r13, qword ptr [rsp + 120] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r13 + 16], 4
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rdx + 16], 5
+ mov rdx, qword ptr [rsp + 232] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rdx + 16], 6
+ mov rdx, qword ptr [rsp + 216] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rdx + 16], 7
+ mov rdx, qword ptr [rsp + 56] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rdx + 16], 8
+ mov rdx, qword ptr [rsp + 64] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rdx + 16], 9
+ mov rdx, qword ptr [rsp + 40] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rdx + 16], 10
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r14 + 16], 11
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rax + 16], 12
+ mov r15, qword ptr [rsp + 32] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r15 + 16], 13
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r11 + 16], 14
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r12 + 16], 15
+ movzx edi, byte ptr [rsi + r10 + 17]
+ vmovd xmm2, edi
+ mov rax, qword ptr [rsp + 136] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rax + 17], 1
+ mov rdx, qword ptr [rsp + 256] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rdx + 17], 2
+ mov r10, qword ptr [rsp + 48] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r10 + 17], 3
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r9 + 17], 4
+ mov rax, qword ptr [rsp + 144] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rax + 17], 5
+ mov r11, qword ptr [rsp + 184] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r11 + 17], 6
+ mov rax, qword ptr [rsp + 160] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rax + 17], 7
+ mov r14, qword ptr [rsp + 224] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r14 + 17], 8
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rcx + 17], 9
+ mov rax, qword ptr [rsp + 192] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rax + 17], 10
+ mov rax, qword ptr [rsp + 88] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rax + 17], 11
+ mov r12, qword ptr [rsp + 128] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r12 + 17], 12
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r8 + 17], 13
+ mov rcx, qword ptr [rsp + 72] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rcx + 17], 14
+ mov rcx, qword ptr [rsp + 80] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rcx + 17], 15
+ movzx edi, byte ptr [rsi + rbx + 17]
+ vmovd xmm3, edi
+ mov rcx, qword ptr [rsp + 104] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rcx + 17], 1
+ mov rcx, qword ptr [rsp + 152] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rcx + 17], 2
+ mov r8, qword ptr [rsp + 176] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r8 + 17], 3
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r13 + 17], 4
+ mov rdi, qword ptr [rsp + 168] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rdi + 17], 5
+ mov rdi, qword ptr [rsp + 232] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rdi + 17], 6
+ mov rdi, qword ptr [rsp + 216] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rdi + 17], 7
+ mov rdi, qword ptr [rsp + 56] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rdi + 17], 8
+ mov rdi, qword ptr [rsp + 64] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rdi + 17], 9
+ mov r9, qword ptr [rsp + 40] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r9 + 17], 10
+ mov rdi, qword ptr [rsp + 200] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rdi + 17], 11
+ mov rdi, qword ptr [rsp + 320] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rdi + 17], 12
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r15 + 17], 13
+ mov rdi, qword ptr [rsp + 96] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rdi + 17], 14
+ vinserti128 ymm0, ymm1, xmm0, 1
+ vmovdqa ymmword ptr [rsp + 864], ymm0 # 32-byte Spill
+ mov rdi, qword ptr [rsp + 288] # 8-byte Reload
+ vpinsrb xmm0, xmm3, byte ptr [rsi + rdi + 17], 15
+ vinserti128 ymm0, ymm0, xmm2, 1
+ vmovdqa ymmword ptr [rsp + 832], ymm0 # 32-byte Spill
+ mov rdi, qword ptr [rsp + 248] # 8-byte Reload
+ movzx edi, byte ptr [rsi + rdi + 18]
+ vmovd xmm0, edi
+ mov rdi, qword ptr [rsp + 136] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rdi + 18], 1
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rdx + 18], 2
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r10 + 18], 3
+ mov r15, qword ptr [rsp + 208] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r15 + 18], 4
+ mov rdi, qword ptr [rsp + 144] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rdi + 18], 5
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r11 + 18], 6
+ mov rdx, qword ptr [rsp + 160] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rdx + 18], 7
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r14 + 18], 8
+ mov rdx, qword ptr [rsp + 112] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rdx + 18], 9
+ mov rdx, qword ptr [rsp + 192] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rdx + 18], 10
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 18], 11
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r12 + 18], 12
+ mov rax, qword ptr [rsp + 240] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 18], 13
+ mov rax, qword ptr [rsp + 72] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 18], 14
+ mov rax, qword ptr [rsp + 80] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 18], 15
+ movzx edi, byte ptr [rsi + rbx + 18]
+ vmovd xmm1, edi
+ mov r13, qword ptr [rsp + 104] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r13 + 18], 1
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rcx + 18], 2
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r8 + 18], 3
+ mov r10, qword ptr [rsp + 120] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r10 + 18], 4
+ mov r14, qword ptr [rsp + 168] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r14 + 18], 5
+ mov rax, qword ptr [rsp + 232] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rax + 18], 6
+ mov r11, qword ptr [rsp + 216] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r11 + 18], 7
+ mov rcx, qword ptr [rsp + 56] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rcx + 18], 8
+ mov rax, qword ptr [rsp + 64] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rax + 18], 9
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r9 + 18], 10
+ mov rdi, qword ptr [rsp + 200] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rdi + 18], 11
+ mov rdi, qword ptr [rsp + 320] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rdi + 18], 12
+ mov rdi, qword ptr [rsp + 32] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rdi + 18], 13
+ mov rdi, qword ptr [rsp + 96] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rdi + 18], 14
+ mov rdi, qword ptr [rsp + 288] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rdi + 18], 15
+ mov rdi, qword ptr [rsp + 248] # 8-byte Reload
+ movzx edi, byte ptr [rsi + rdi + 19]
+ vmovd xmm2, edi
+ mov rdi, qword ptr [rsp + 136] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rdi + 19], 1
+ mov r12, qword ptr [rsp + 256] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r12 + 19], 2
+ mov rdi, qword ptr [rsp + 48] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rdi + 19], 3
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r15 + 19], 4
+ mov rdi, qword ptr [rsp + 144] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rdi + 19], 5
+ mov rdi, qword ptr [rsp + 184] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rdi + 19], 6
+ mov rdi, qword ptr [rsp + 160] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rdi + 19], 7
+ mov rdi, qword ptr [rsp + 224] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rdi + 19], 8
+ mov rdi, qword ptr [rsp + 112] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rdi + 19], 9
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rdx + 19], 10
+ mov rdi, qword ptr [rsp + 88] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rdi + 19], 11
+ mov rdi, qword ptr [rsp + 128] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rdi + 19], 12
+ mov rdi, qword ptr [rsp + 240] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rdi + 19], 13
+ mov r9, qword ptr [rsp + 72] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r9 + 19], 14
+ mov r8, qword ptr [rsp + 80] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r8 + 19], 15
+ movzx edi, byte ptr [rsi + rbx + 19]
+ vmovd xmm3, edi
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r13 + 19], 1
+ mov rdx, qword ptr [rsp + 152] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rdx + 19], 2
+ mov rdx, qword ptr [rsp + 176] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rdx + 19], 3
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r10 + 19], 4
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r14 + 19], 5
+ mov rbx, qword ptr [rsp + 232] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rbx + 19], 6
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r11 + 19], 7
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rcx + 19], 8
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rax + 19], 9
+ mov r11, qword ptr [rsp + 40] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r11 + 19], 10
+ mov rax, qword ptr [rsp + 200] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rax + 19], 11
+ mov r13, qword ptr [rsp + 320] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r13 + 19], 12
+ mov rax, qword ptr [rsp + 32] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rax + 19], 13
+ mov rax, qword ptr [rsp + 96] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rax + 19], 14
+ mov r14, qword ptr [rsp + 288] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r14 + 19], 15
+ vinserti128 ymm0, ymm1, xmm0, 1
+ vmovdqa ymmword ptr [rsp + 768], ymm0 # 32-byte Spill
+ vinserti128 ymm0, ymm3, xmm2, 1
+ vmovdqa ymmword ptr [rsp + 800], ymm0 # 32-byte Spill
+ mov r15, qword ptr [rsp + 248] # 8-byte Reload
+ movzx edi, byte ptr [rsi + r15 + 20]
+ vmovd xmm0, edi
+ mov rcx, qword ptr [rsp + 136] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rcx + 20], 1
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r12 + 20], 2
+ mov rax, qword ptr [rsp + 48] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 20], 3
+ mov r12, qword ptr [rsp + 208] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r12 + 20], 4
+ mov r10, qword ptr [rsp + 144] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r10 + 20], 5
+ mov rax, qword ptr [rsp + 184] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 20], 6
+ mov rax, qword ptr [rsp + 160] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 20], 7
+ mov rax, qword ptr [rsp + 224] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 20], 8
+ mov rax, qword ptr [rsp + 112] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 20], 9
+ mov rax, qword ptr [rsp + 192] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 20], 10
+ mov rax, qword ptr [rsp + 88] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 20], 11
+ mov rax, qword ptr [rsp + 128] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 20], 12
+ mov rax, qword ptr [rsp + 240] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 20], 13
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r9 + 20], 14
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r8 + 20], 15
+ mov rax, qword ptr [rsp + 264] # 8-byte Reload
+ movzx edi, byte ptr [rsi + rax + 20]
+ vmovd xmm1, edi
+ mov rax, qword ptr [rsp + 104] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rax + 20], 1
+ mov r8, qword ptr [rsp + 152] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r8 + 20], 2
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rdx + 20], 3
+ mov rax, qword ptr [rsp + 120] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rax + 20], 4
+ mov rdx, qword ptr [rsp + 168] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rdx + 20], 5
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rbx + 20], 6
+ mov rdx, qword ptr [rsp + 216] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rdx + 20], 7
+ mov rdx, qword ptr [rsp + 56] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rdx + 20], 8
+ mov rdx, qword ptr [rsp + 64] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rdx + 20], 9
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r11 + 20], 10
+ mov r11, qword ptr [rsp + 200] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r11 + 20], 11
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r13 + 20], 12
+ mov r13, qword ptr [rsp + 32] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r13 + 20], 13
+ mov r9, qword ptr [rsp + 96] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r9 + 20], 14
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r14 + 20], 15
+ movzx edi, byte ptr [rsi + r15 + 21]
+ vmovd xmm2, edi
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rcx + 21], 1
+ mov rcx, qword ptr [rsp + 256] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rcx + 21], 2
+ mov rcx, qword ptr [rsp + 48] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rcx + 21], 3
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r12 + 21], 4
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r10 + 21], 5
+ mov rdx, qword ptr [rsp + 184] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rdx + 21], 6
+ mov r12, qword ptr [rsp + 160] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r12 + 21], 7
+ mov rcx, qword ptr [rsp + 224] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rcx + 21], 8
+ mov r10, qword ptr [rsp + 112] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r10 + 21], 9
+ mov rdi, qword ptr [rsp + 192] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rdi + 21], 10
+ mov r14, qword ptr [rsp + 88] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r14 + 21], 11
+ mov rdi, qword ptr [rsp + 128] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rdi + 21], 12
+ mov rbx, qword ptr [rsp + 240] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rbx + 21], 13
+ mov rdi, qword ptr [rsp + 72] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rdi + 21], 14
+ mov rdi, qword ptr [rsp + 80] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rdi + 21], 15
+ mov rdi, qword ptr [rsp + 264] # 8-byte Reload
+ movzx edi, byte ptr [rsi + rdi + 21]
+ vmovd xmm3, edi
+ mov rdi, qword ptr [rsp + 104] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rdi + 21], 1
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r8 + 21], 2
+ mov rdi, qword ptr [rsp + 176] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rdi + 21], 3
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rax + 21], 4
+ mov rax, qword ptr [rsp + 168] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rax + 21], 5
+ mov r8, qword ptr [rsp + 232] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r8 + 21], 6
+ mov r15, qword ptr [rsp + 216] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r15 + 21], 7
+ mov rax, qword ptr [rsp + 56] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rax + 21], 8
+ mov rax, qword ptr [rsp + 64] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rax + 21], 9
+ mov rax, qword ptr [rsp + 40] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rax + 21], 10
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r11 + 21], 11
+ mov rax, qword ptr [rsp + 320] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rax + 21], 12
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r13 + 21], 13
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r9 + 21], 14
+ vinserti128 ymm0, ymm1, xmm0, 1
+ vmovdqa ymmword ptr [rsp + 704], ymm0 # 32-byte Spill
+ mov rax, qword ptr [rsp + 288] # 8-byte Reload
+ vpinsrb xmm0, xmm3, byte ptr [rsi + rax + 21], 15
+ vinserti128 ymm0, ymm0, xmm2, 1
+ vmovdqa ymmword ptr [rsp + 736], ymm0 # 32-byte Spill
+ mov rax, qword ptr [rsp + 248] # 8-byte Reload
+ movzx edi, byte ptr [rsi + rax + 22]
+ vmovd xmm0, edi
+ mov rdi, qword ptr [rsp + 136] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rdi + 22], 1
+ mov rdi, qword ptr [rsp + 256] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rdi + 22], 2
+ mov rdi, qword ptr [rsp + 48] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rdi + 22], 3
+ mov rdi, qword ptr [rsp + 208] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rdi + 22], 4
+ mov r13, qword ptr [rsp + 144] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r13 + 22], 5
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rdx + 22], 6
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r12 + 22], 7
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rcx + 22], 8
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r10 + 22], 9
+ mov r12, qword ptr [rsp + 192] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r12 + 22], 10
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r14 + 22], 11
+ mov r11, qword ptr [rsp + 128] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r11 + 22], 12
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rbx + 22], 13
+ mov rcx, qword ptr [rsp + 72] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rcx + 22], 14
+ mov rcx, qword ptr [rsp + 80] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rcx + 22], 15
+ mov rbx, qword ptr [rsp + 264] # 8-byte Reload
+ movzx edi, byte ptr [rsi + rbx + 22]
+ vmovd xmm1, edi
+ mov r10, qword ptr [rsp + 104] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r10 + 22], 1
+ mov rcx, qword ptr [rsp + 152] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rcx + 22], 2
+ mov rdx, qword ptr [rsp + 176] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rdx + 22], 3
+ mov rdx, qword ptr [rsp + 120] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rdx + 22], 4
+ mov rdx, qword ptr [rsp + 168] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rdx + 22], 5
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r8 + 22], 6
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r15 + 22], 7
+ mov rdx, qword ptr [rsp + 56] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rdx + 22], 8
+ mov r14, qword ptr [rsp + 64] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r14 + 22], 9
+ mov rdx, qword ptr [rsp + 40] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rdx + 22], 10
+ mov r9, qword ptr [rsp + 200] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r9 + 22], 11
+ mov rdx, qword ptr [rsp + 320] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rdx + 22], 12
+ mov rdx, qword ptr [rsp + 32] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rdx + 22], 13
+ mov r15, qword ptr [rsp + 96] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r15 + 22], 14
+ mov rdx, qword ptr [rsp + 288] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rdx + 22], 15
+ movzx edi, byte ptr [rsi + rax + 23]
+ vmovd xmm2, edi
+ mov rax, qword ptr [rsp + 136] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rax + 23], 1
+ mov rax, qword ptr [rsp + 256] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rax + 23], 2
+ mov rdx, qword ptr [rsp + 48] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rdx + 23], 3
+ mov rax, qword ptr [rsp + 208] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rax + 23], 4
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r13 + 23], 5
+ mov rdi, qword ptr [rsp + 184] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rdi + 23], 6
+ mov rdi, qword ptr [rsp + 160] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rdi + 23], 7
+ mov rdi, qword ptr [rsp + 224] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rdi + 23], 8
+ mov rdi, qword ptr [rsp + 112] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rdi + 23], 9
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r12 + 23], 10
+ mov rdi, qword ptr [rsp + 88] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rdi + 23], 11
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r11 + 23], 12
+ mov rdi, qword ptr [rsp + 240] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rdi + 23], 13
+ mov rdi, qword ptr [rsp + 72] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rdi + 23], 14
+ mov r13, qword ptr [rsp + 80] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r13 + 23], 15
+ movzx edi, byte ptr [rsi + rbx + 23]
+ vmovd xmm3, edi
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r10 + 23], 1
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rcx + 23], 2
+ mov rcx, qword ptr [rsp + 176] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rcx + 23], 3
+ mov rbx, qword ptr [rsp + 120] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rbx + 23], 4
+ mov r10, qword ptr [rsp + 168] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r10 + 23], 5
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r8 + 23], 6
+ mov rdi, qword ptr [rsp + 216] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rdi + 23], 7
+ mov r8, qword ptr [rsp + 56] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r8 + 23], 8
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r14 + 23], 9
+ mov r12, qword ptr [rsp + 40] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r12 + 23], 10
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r9 + 23], 11
+ mov r11, qword ptr [rsp + 320] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r11 + 23], 12
+ mov r14, qword ptr [rsp + 32] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r14 + 23], 13
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r15 + 23], 14
+ mov r9, qword ptr [rsp + 288] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r9 + 23], 15
+ vinserti128 ymm10, ymm1, xmm0, 1
+ vinserti128 ymm0, ymm3, xmm2, 1
+ vmovdqa ymmword ptr [rsp + 672], ymm0 # 32-byte Spill
+ mov rdi, qword ptr [rsp + 248] # 8-byte Reload
+ movzx edi, byte ptr [rsi + rdi + 24]
+ vmovd xmm0, edi
+ mov rdi, qword ptr [rsp + 136] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rdi + 24], 1
+ mov r15, qword ptr [rsp + 256] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r15 + 24], 2
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rdx + 24], 3
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 24], 4
+ mov rax, qword ptr [rsp + 144] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 24], 5
+ mov rax, qword ptr [rsp + 184] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 24], 6
+ mov rdx, qword ptr [rsp + 160] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rdx + 24], 7
+ mov rax, qword ptr [rsp + 224] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 24], 8
+ mov rax, qword ptr [rsp + 112] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 24], 9
+ mov rax, qword ptr [rsp + 192] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 24], 10
+ mov rax, qword ptr [rsp + 88] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 24], 11
+ mov rax, qword ptr [rsp + 128] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 24], 12
+ mov rax, qword ptr [rsp + 240] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 24], 13
+ mov rax, qword ptr [rsp + 72] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 24], 14
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r13 + 24], 15
+ mov rax, qword ptr [rsp + 264] # 8-byte Reload
+ movzx edi, byte ptr [rsi + rax + 24]
+ vmovd xmm1, edi
+ mov rax, qword ptr [rsp + 104] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rax + 24], 1
+ mov rax, qword ptr [rsp + 152] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rax + 24], 2
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rcx + 24], 3
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rbx + 24], 4
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r10 + 24], 5
+ mov r10, qword ptr [rsp + 232] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r10 + 24], 6
+ mov rax, qword ptr [rsp + 216] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rax + 24], 7
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r8 + 24], 8
+ mov r13, qword ptr [rsp + 64] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r13 + 24], 9
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r12 + 24], 10
+ mov r12, qword ptr [rsp + 200] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r12 + 24], 11
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r11 + 24], 12
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r14 + 24], 13
+ mov rax, qword ptr [rsp + 96] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rax + 24], 14
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r9 + 24], 15
+ mov r8, qword ptr [rsp + 248] # 8-byte Reload
+ movzx edi, byte ptr [rsi + r8 + 25]
+ vmovd xmm2, edi
+ mov rcx, qword ptr [rsp + 136] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rcx + 25], 1
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r15 + 25], 2
+ mov rcx, qword ptr [rsp + 48] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rcx + 25], 3
+ mov r15, qword ptr [rsp + 208] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r15 + 25], 4
+ mov rcx, qword ptr [rsp + 144] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rcx + 25], 5
+ mov rcx, qword ptr [rsp + 184] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rcx + 25], 6
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rdx + 25], 7
+ mov r9, qword ptr [rsp + 224] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r9 + 25], 8
+ mov rdx, qword ptr [rsp + 112] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rdx + 25], 9
+ mov r11, qword ptr [rsp + 192] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r11 + 25], 10
+ mov rbx, qword ptr [rsp + 88] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rbx + 25], 11
+ mov rdx, qword ptr [rsp + 128] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rdx + 25], 12
+ mov r14, qword ptr [rsp + 240] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r14 + 25], 13
+ mov rdx, qword ptr [rsp + 72] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rdx + 25], 14
+ mov rdx, qword ptr [rsp + 80] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rdx + 25], 15
+ mov rdx, qword ptr [rsp + 264] # 8-byte Reload
+ movzx edi, byte ptr [rsi + rdx + 25]
+ vmovd xmm3, edi
+ mov rdx, qword ptr [rsp + 104] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rdx + 25], 1
+ mov rdi, qword ptr [rsp + 152] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rdi + 25], 2
+ mov rdi, qword ptr [rsp + 176] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rdi + 25], 3
+ mov rdi, qword ptr [rsp + 120] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rdi + 25], 4
+ mov rdi, qword ptr [rsp + 168] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rdi + 25], 5
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r10 + 25], 6
+ mov rdi, qword ptr [rsp + 216] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rdi + 25], 7
+ mov rdi, qword ptr [rsp + 56] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rdi + 25], 8
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r13 + 25], 9
+ mov r13, qword ptr [rsp + 40] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r13 + 25], 10
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r12 + 25], 11
+ mov rdi, qword ptr [rsp + 320] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rdi + 25], 12
+ mov r10, qword ptr [rsp + 32] # 8-byte Reload
+ vpinsrb xmm3, xmm3, byte ptr [rsi + r10 + 25], 13
+ vpinsrb xmm3, xmm3, byte ptr [rsi + rax + 25], 14
+ vinserti128 ymm9, ymm1, xmm0, 1
+ mov rdi, qword ptr [rsp + 288] # 8-byte Reload
+ vpinsrb xmm0, xmm3, byte ptr [rsi + rdi + 25], 15
+ vinserti128 ymm8, ymm0, xmm2, 1
+ movzx edi, byte ptr [rsi + r8 + 26]
+ vmovd xmm0, edi
+ mov r8, qword ptr [rsp + 136] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r8 + 26], 1
+ mov rax, qword ptr [rsp + 256] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 26], 2
+ mov rax, qword ptr [rsp + 48] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 26], 3
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r15 + 26], 4
+ mov rax, qword ptr [rsp + 144] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 26], 5
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rcx + 26], 6
+ mov rax, qword ptr [rsp + 160] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 26], 7
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r9 + 26], 8
+ mov rax, qword ptr [rsp + 112] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 26], 9
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r11 + 26], 10
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rbx + 26], 11
+ mov rbx, qword ptr [rsp + 128] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rbx + 26], 12
+ vpinsrb xmm0, xmm0, byte ptr [rsi + r14 + 26], 13
+ mov rcx, r14
+ mov rax, qword ptr [rsp + 72] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 26], 14
+ mov rax, qword ptr [rsp + 80] # 8-byte Reload
+ vpinsrb xmm0, xmm0, byte ptr [rsi + rax + 26], 15
+ mov r12, qword ptr [rsp + 264] # 8-byte Reload
+ movzx edi, byte ptr [rsi + r12 + 26]
+ vmovd xmm1, edi
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rdx + 26], 1
+ mov r14, qword ptr [rsp + 152] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r14 + 26], 2
+ mov r15, qword ptr [rsp + 176] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r15 + 26], 3
+ mov rax, qword ptr [rsp + 120] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rax + 26], 4
+ mov rax, qword ptr [rsp + 168] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rax + 26], 5
+ mov rax, qword ptr [rsp + 232] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rax + 26], 6
+ mov rax, qword ptr [rsp + 216] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rax + 26], 7
+ mov rax, qword ptr [rsp + 56] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rax + 26], 8
+ mov rax, qword ptr [rsp + 64] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rax + 26], 9
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r13 + 26], 10
+ mov r11, qword ptr [rsp + 200] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r11 + 26], 11
+ mov rax, qword ptr [rsp + 320] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rax + 26], 12
+ vpinsrb xmm1, xmm1, byte ptr [rsi + r10 + 26], 13
+ mov rax, qword ptr [rsp + 96] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rax + 26], 14
+ mov rdx, qword ptr [rsp + 288] # 8-byte Reload
+ vpinsrb xmm1, xmm1, byte ptr [rsi + rdx + 26], 15
+ mov rdx, qword ptr [rsp + 248] # 8-byte Reload
+ movzx edi, byte ptr [rsi + rdx + 27]
+ vmovd xmm2, edi
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r8 + 27], 1
+ mov r8, qword ptr [rsp + 256] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r8 + 27], 2
+ mov rdx, qword ptr [rsp + 48] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rdx + 27], 3
+ mov rdi, qword ptr [rsp + 208] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rdi + 27], 4
+ mov r9, qword ptr [rsp + 144] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r9 + 27], 5
+ mov r13, qword ptr [rsp + 184] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + r13 + 27], 6
+ mov rdi, qword ptr [rsp + 160] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rdi + 27], 7
+ mov rdi, qword ptr [rsp + 224] # 8-byte Reload
+ vpinsrb xmm2, xmm2, byte ptr [rsi + rdi + 27], 8
... 247016 lines suppressed ...