You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ze...@apache.org on 2022/11/21 15:38:05 UTC

[arrow] branch master updated: ARROW-18110: [Go] Scalar Comparisons (#14669)

This is an automated email from the ASF dual-hosted git repository.

zeroshade pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new b9dd41607c ARROW-18110: [Go] Scalar Comparisons (#14669)
b9dd41607c is described below

commit b9dd41607cb7dd7afd50e3ceb99c68e79e7733a0
Author: Matt Topol <zo...@gmail.com>
AuthorDate: Mon Nov 21 10:37:54 2022 -0500

    ARROW-18110: [Go] Scalar Comparisons (#14669)
    
    Authored-by: Matt Topol <zo...@gmail.com>
    Signed-off-by: Matt Topol <zo...@gmail.com>
---
 dev/release/rat_exclude_files.txt                  |     1 +
 go/arrow/compute/arithmetic_test.go                |     6 +
 go/arrow/compute/datum.go                          |    33 +-
 go/arrow/compute/executor.go                       |     1 +
 go/arrow/compute/internal/exec/utils.go            |    83 +-
 go/arrow/compute/internal/kernels/Makefile         |    17 +-
 .../internal/kernels/_lib/scalar_comparison.cc     |   241 +
 .../kernels/_lib/scalar_comparison_avx2_amd64.s    | 67763 +++++++++++++++++++
 .../kernels/_lib/scalar_comparison_sse4_amd64.s    | 59819 ++++++++++++++++
 .../internal/kernels/compareoperator_string.go     |    28 +
 go/arrow/compute/internal/kernels/helpers.go       |   117 +
 .../internal/kernels/scalar_comparison_amd64.go    |   110 +
 .../kernels/scalar_comparison_avx2_amd64.go        |   109 +
 .../kernels/scalar_comparison_avx2_amd64.s         | 67310 ++++++++++++++++++
 .../internal/kernels/scalar_comparison_noasm.go    |    23 +
 .../kernels/scalar_comparison_sse4_amd64.go        |   109 +
 .../kernels/scalar_comparison_sse4_amd64.s         | 58288 ++++++++++++++++
 .../compute/internal/kernels/scalar_comparisons.go |   699 +
 go/arrow/compute/internal/kernels/types.go         |    14 +-
 go/arrow/compute/registry.go                       |     1 +
 go/arrow/compute/scalar_compare.go                 |   135 +
 go/arrow/compute/scalar_compare_test.go            |  1487 +
 go/arrow/compute/utils.go                          |    82 +
 go/arrow/decimal128/decimal128.go                  |     6 +-
 go/arrow/decimal256/decimal256.go                  |     9 +-
 go/arrow/internal/testing/gen/random_array_gen.go  |    42 +-
 go/arrow/scalar/parse.go                           |    83 +
 go/internal/bitutils/bitmap_generate.go            |     2 +
 28 files changed, 256590 insertions(+), 28 deletions(-)

diff --git a/dev/release/rat_exclude_files.txt b/dev/release/rat_exclude_files.txt
index 5cce4195c0..e3eb981842 100644
--- a/dev/release/rat_exclude_files.txt
+++ b/dev/release/rat_exclude_files.txt
@@ -143,6 +143,7 @@ go/arrow/unionmode_string.go
 go/arrow/compute/go.sum
 go/arrow/compute/datumkind_string.go
 go/arrow/compute/funckind_string.go
+go/arrow/compute/internal/kernels/compareoperator_string.go
 go/arrow/compute/internal/kernels/_lib/vendored/*
 go/*.tmpldata
 go/*.s
diff --git a/go/arrow/compute/arithmetic_test.go b/go/arrow/compute/arithmetic_test.go
index d57af69e6f..2549f6c904 100644
--- a/go/arrow/compute/arithmetic_test.go
+++ b/go/arrow/compute/arithmetic_test.go
@@ -146,6 +146,12 @@ func (b *BinaryFuncTestSuite) TearDownTest() {
 	b.mem.AssertSize(b.T(), 0)
 }
 
+func (b *BinaryFuncTestSuite) getArr(dt arrow.DataType, str string) arrow.Array {
+	arr, _, err := array.FromJSON(b.mem, dt, strings.NewReader(str), array.WithUseNumber())
+	b.Require().NoError(err)
+	return arr
+}
+
 type Float16BinaryFuncTestSuite struct {
 	BinaryFuncTestSuite
 }
diff --git a/go/arrow/compute/datum.go b/go/arrow/compute/datum.go
index e02d50a98a..f6a46e3ef4 100644
--- a/go/arrow/compute/datum.go
+++ b/go/arrow/compute/datum.go
@@ -239,8 +239,6 @@ func (d *TableDatum) Equals(other Datum) bool {
 	return false
 }
 
-// CollectionDatum is a slice of Datums
-
 // NewDatum will construct the appropriate Datum type based on what is passed in
 // as the argument.
 //
@@ -258,23 +256,38 @@ func NewDatum(value interface{}) Datum {
 		return NewDatum(v.data())
 	case arrow.Array:
 		v.Data().Retain()
-		return &ArrayDatum{v.Data().(*array.Data)}
-	case arrow.ArrayData:
+		return &ArrayDatum{v.Data()}
+	case scalar.Releasable:
 		v.Retain()
+		return NewDatumWithoutOwning(v)
+	case scalar.Scalar:
+		return &ScalarDatum{v}
+	default:
+		return &ScalarDatum{scalar.MakeScalar(value)}
+	}
+}
+
+// NewDatumWithoutOwning is like NewDatum only it does not call Retain on
+// the passed in value (if applicable). This means that the resulting
+// Datum should not have Release called on it, and the original value
+// needs to outlive the Datum.
+//
+// Only use this if you know what you're doing. For the most part this is
+// just a convenience function.
+
+func NewDatumWithoutOwning(value interface{}) Datum {
+	switch v := value.(type) {
+	case arrow.Array:
+		return &ArrayDatum{v.Data()}
+	case arrow.ArrayData:
 		return &ArrayDatum{v}
 	case *arrow.Chunked:
-		v.Retain()
 		return &ChunkedDatum{v}
 	case arrow.Record:
-		v.Retain()
 		return &RecordDatum{v}
 	case arrow.Table:
-		v.Retain()
 		return &TableDatum{v}
 	case scalar.Scalar:
-		if ls, ok := v.(scalar.Releasable); ok {
-			ls.Retain()
-		}
 		return &ScalarDatum{v}
 	default:
 		return &ScalarDatum{scalar.MakeScalar(value)}
diff --git a/go/arrow/compute/executor.go b/go/arrow/compute/executor.go
index 80adcbd1e9..f6a2661abd 100644
--- a/go/arrow/compute/executor.go
+++ b/go/arrow/compute/executor.go
@@ -613,6 +613,7 @@ func (s *scalarExecutor) executeSpans(data chan<- Datum) (err error) {
 
 		output = *s.prepareOutput(int(input.Len))
 		if err = s.executeSingleSpan(&input, &output); err != nil {
+			output.Release()
 			return
 		}
 		err = s.emitResult(&output, data)
diff --git a/go/arrow/compute/internal/exec/utils.go b/go/arrow/compute/internal/exec/utils.go
index b57cb5990a..61a1854a60 100644
--- a/go/arrow/compute/internal/exec/utils.go
+++ b/go/arrow/compute/internal/exec/utils.go
@@ -25,6 +25,7 @@ import (
 
 	"github.com/apache/arrow/go/v11/arrow"
 	"github.com/apache/arrow/go/v11/arrow/array"
+	"github.com/apache/arrow/go/v11/arrow/bitutil"
 	"github.com/apache/arrow/go/v11/arrow/decimal128"
 	"github.com/apache/arrow/go/v11/arrow/decimal256"
 	"github.com/apache/arrow/go/v11/arrow/float16"
@@ -99,7 +100,7 @@ func GetValues[T FixedWidthTypes](data arrow.ArrayData, i int) []T {
 	return ret[data.Offset():]
 }
 
-// GetSpanValues returns a properly typed slice bye reinterpreting
+// GetSpanValues returns a properly typed slice by reinterpreting
 // the buffer at index i using unsafe.Slice. This will take into account
 // the offset of the given ArraySpan.
 func GetSpanValues[T FixedWidthTypes](span *ArraySpan, i int) []T {
@@ -158,6 +159,7 @@ func OptionsInit[T any](_ *KernelCtx, args KernelInitArgs) (KernelState, error)
 }
 
 var typMap = map[reflect.Type]arrow.DataType{
+	reflect.TypeOf(false):           arrow.FixedWidthTypes.Boolean,
 	reflect.TypeOf(int8(0)):         arrow.PrimitiveTypes.Int8,
 	reflect.TypeOf(int16(0)):        arrow.PrimitiveTypes.Int16,
 	reflect.TypeOf(int32(0)):        arrow.PrimitiveTypes.Int32,
@@ -192,13 +194,13 @@ func GetType[T NumericTypes | bool | string]() arrow.Type {
 	return typMap[reflect.TypeOf(z)].ID()
 }
 
-type arrayBuilder[T NumericTypes] interface {
+type arrayBuilder[T NumericTypes | bool] interface {
 	array.Builder
 	Append(T)
 	AppendValues([]T, []bool)
 }
 
-func ArrayFromSlice[T NumericTypes](mem memory.Allocator, data []T) arrow.Array {
+func ArrayFromSlice[T NumericTypes | bool](mem memory.Allocator, data []T) arrow.Array {
 	bldr := array.NewBuilder(mem, typMap[reflect.TypeOf(data).Elem()]).(arrayBuilder[T])
 	defer bldr.Release()
 
@@ -303,3 +305,78 @@ func (c *ChunkResolver) Resolve(idx int64) (chunk, index int64) {
 	atomic.StoreInt64(&c.cached, chunk)
 	return
 }
+
+type arrayTypes interface {
+	FixedWidthTypes | TemporalTypes | bool | string | []byte
+}
+
+type ArrayIter[T arrayTypes] interface {
+	Next() T
+}
+
+type BoolIter struct {
+	Rdr *bitutil.BitmapReader
+}
+
+func NewBoolIter(arr *ArraySpan) ArrayIter[bool] {
+	return &BoolIter{
+		Rdr: bitutil.NewBitmapReader(arr.Buffers[1].Buf, int(arr.Offset), int(arr.Len))}
+}
+
+func (b *BoolIter) Next() (out bool) {
+	out = b.Rdr.Set()
+	b.Rdr.Next()
+	return
+}
+
+type PrimitiveIter[T FixedWidthTypes] struct {
+	Values []T
+}
+
+func NewPrimitiveIter[T FixedWidthTypes](arr *ArraySpan) ArrayIter[T] {
+	return &PrimitiveIter[T]{Values: GetSpanValues[T](arr, 1)}
+}
+
+func (p *PrimitiveIter[T]) Next() (v T) {
+	v = p.Values[0]
+	p.Values = p.Values[1:]
+	return
+}
+
+type VarBinaryIter[OffsetT int32 | int64] struct {
+	Offsets []OffsetT
+	Data    []byte
+	Pos     int64
+}
+
+func NewVarBinaryIter[OffsetT int32 | int64](arr *ArraySpan) ArrayIter[[]byte] {
+	return &VarBinaryIter[OffsetT]{
+		Offsets: GetSpanOffsets[OffsetT](arr, 1),
+		Data:    arr.Buffers[2].Buf,
+	}
+}
+
+func (v *VarBinaryIter[OffsetT]) Next() []byte {
+	cur := v.Pos
+	v.Pos++
+	return v.Data[v.Offsets[cur]:v.Offsets[v.Pos]]
+}
+
+type FSBIter struct {
+	Data  []byte
+	Width int
+	Pos   int64
+}
+
+func NewFSBIter(arr *ArraySpan) ArrayIter[[]byte] {
+	return &FSBIter{
+		Data:  arr.Buffers[1].Buf,
+		Width: arr.Type.(arrow.FixedWidthDataType).Bytes(),
+	}
+}
+
+func (f *FSBIter) Next() []byte {
+	start := f.Width * int(f.Pos)
+	f.Pos++
+	return f.Data[start : start+f.Width]
+}
diff --git a/go/arrow/compute/internal/kernels/Makefile b/go/arrow/compute/internal/kernels/Makefile
index 53dda4da43..ac00bd837c 100644
--- a/go/arrow/compute/internal/kernels/Makefile
+++ b/go/arrow/compute/internal/kernels/Makefile
@@ -20,7 +20,7 @@ PERL_FIXUP_ROTATE=perl -i -pe 's/(ro[rl]\s+\w{2,3})$$/\1, 1/'
 C2GOASM=c2goasm
 CC=clang-11
 CXX=clang++-11
-C_FLAGS=-target x86_64-unknown-none -masm=intel -mno-red-zone -mstackrealign -mllvm -inline-threshold=1000 \
+C_FLAGS=-target x86_64-unknown-none -masm=intel -mno-red-zone -mstackrealign -mllvm -inline-threshold=5000 \
 				-fno-asynchronous-unwind-tables -fno-exceptions -fno-rtti -O3 -fno-builtin -ffast-math -fno-jump-tables -I_lib -I../../../../internal/utils/_lib
 ASM_FLAGS_AVX2=-mavx2 -mfma
 ASM_FLAGS_SSE4=-msse4
@@ -37,7 +37,8 @@ ALL_SOURCES := $(shell find . -path ./_lib -prune -o -name '*.go' -name '*.s' -n
 
 INTEL_SOURCES := \
 	cast_numeric_avx2_amd64.s cast_numeric_sse4_amd64.s constant_factor_avx2_amd64.s \
-	constant_factor_sse4_amd64.s base_arithmetic_avx2_amd64.s base_arithmetic_sse4_amd64.s
+	constant_factor_sse4_amd64.s base_arithmetic_avx2_amd64.s base_arithmetic_sse4_amd64.s \
+	scalar_comparison_avx2_amd64.s scalar_comparison_sse4_amd64.s
 
 #
 # ARROW-15336: DO NOT add the assembly target for Arm64 (ARM_SOURCES) until c2goasm added the Arm64 support.
@@ -62,6 +63,12 @@ _lib/base_arithmetic_avx2_amd64.s: _lib/base_arithmetic.cc
 _lib/base_arithmetic_sse4_amd64.s: _lib/base_arithmetic.cc
 	$(CXX) -std=c++17 -S $(C_FLAGS) $(ASM_FLAGS_SSE4) $^ -o $@ ; $(PERL_FIXUP_ROTATE) $@
 
+_lib/scalar_comparison_avx2_amd64.s: _lib/scalar_comparison.cc
+	$(CXX) -std=c++17 -S $(C_FLAGS) $(ASM_FLAGS_AVX2) $^ -o $@ ; $(PERL_FIXUP_ROTATE) $@	
+
+_lib/scalar_comparison_sse4_amd64.s: _lib/scalar_comparison.cc
+	$(CXX) -std=c++17 -S $(C_FLAGS) $(ASM_FLAGS_SSE4) $^ -o $@ ; $(PERL_FIXUP_ROTATE) $@
+
 _lib/base_arithmetic_neon.s: _lib/base_arithmetic.cc
 	$(CXX) -std=c++17 -S $(C_FLAGS_NEON) $^ -o $@ ; $(PERL_FIXUP_ROTATE) $@
 
@@ -92,6 +99,12 @@ base_arithmetic_avx2_amd64.s: _lib/base_arithmetic_avx2_amd64.s
 base_arithmetic_sse4_amd64.s: _lib/base_arithmetic_sse4_amd64.s
 	$(C2GOASM) -a -f $^ $@
 
+scalar_comparison_avx2_amd64.s: _lib/scalar_comparison_avx2_amd64.s	
+	$(C2GOASM) -a -f $^ $@
+
+scalar_comparison_sse4_amd64.s: _lib/scalar_comparison_sse4_amd64.s
+	$(C2GOASM) -a -f $^ $@
+
 clean:
 	rm -f $(INTEL_SOURCES)
 	rm -f $(addprefix _lib/,$(INTEL_SOURCES))
diff --git a/go/arrow/compute/internal/kernels/_lib/scalar_comparison.cc b/go/arrow/compute/internal/kernels/_lib/scalar_comparison.cc
new file mode 100644
index 0000000000..09540f3679
--- /dev/null
+++ b/go/arrow/compute/internal/kernels/_lib/scalar_comparison.cc
@@ -0,0 +1,241 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <arch.h>
+#include <stdint.h>
+#include "types.h"
+
+// pack integers into a bitmap in batches of 8
+template <int batch_size>
+inline void pack_bits(const uint32_t* values, uint8_t* out) {
+    for (int i = 0; i < batch_size / 8; ++i) {
+        *out++ = (values[0] | values[1]<<1 | values[2]<<2 | values[3]<<3 |
+                values[4]<<4 | values[5]<<5 | values[6]<<6 | values[7]<<7);
+        values += 8;
+    }
+}
+
+struct Equal {
+    template <typename T>
+    static constexpr bool Call(const T& left, const T& right) {
+        return left == right;
+    }
+};
+
+struct NotEqual {
+    template <typename T>
+    static constexpr bool Call(const T& left, const T& right) {
+        return left != right;
+    }
+};
+
+struct Greater {
+    template <typename T>
+    static constexpr bool Call(const T& left, const T& right) {
+        return left > right;
+    }
+};
+
+struct GreaterEqual {
+    template <typename T>
+    static constexpr bool Call(const T& left, const T& right) {
+        return left >= right;
+    }
+};
+
+static inline void set_bit_to(uint8_t* bits, int64_t i, bool bit_is_set) {
+    bits[i/8] ^= static_cast<uint8_t>(-static_cast<uint8_t>(bit_is_set) ^ bits[i / 8]) & static_cast<uint8_t>(1 << (i % 8));
+}
+
+template <typename T, typename Op>
+struct compare_primitive_arr_arr {
+    static inline void Exec(const void* left_void, const void* right_void, int64_t length, void* out_void, const int offset) {
+        const T* left = reinterpret_cast<const T*>(left_void);
+        const T* right = reinterpret_cast<const T*>(right_void);
+        uint8_t* out_bitmap = reinterpret_cast<uint8_t*>(out_void);
+        static constexpr int kBatchSize = 32;
+        int64_t num_batches = length / kBatchSize;
+        uint32_t temp_output[kBatchSize];
+
+        if (int prefix = offset % 8) {
+            for (int i = prefix; i < 8; ++i) {
+                set_bit_to(out_bitmap, i, Op::template Call<T>(*left++, *right++));
+            }
+            out_bitmap++;
+        }
+
+        for (int64_t j = 0; j < num_batches; ++j) {
+            for (int i = 0; i < kBatchSize; ++i) {
+                temp_output[i] = Op::template Call<T>(*left++, *right++);
+            }
+            pack_bits<kBatchSize>(temp_output, out_bitmap);
+            out_bitmap += kBatchSize / 8;
+        }
+        int64_t bit_index = 0;
+        for (int64_t j = kBatchSize * num_batches; j < length; ++j) {
+            set_bit_to(out_bitmap, bit_index++, Op::template Call<T>(*left++, *right++));
+        }
+    }
+};
+
+template <typename T, typename Op>
+struct compare_primitive_arr_scalar {
+    static inline void Exec(const void* left_void, const void* right_void, int64_t length, void* out_void, const int offset) {
+        const T* left = reinterpret_cast<const T*>(left_void);
+        const T right = *reinterpret_cast<const T*>(right_void);
+        uint8_t* out_bitmap = reinterpret_cast<uint8_t*>(out_void);
+        static constexpr int kBatchSize = 32;
+        int64_t num_batches = length / kBatchSize;
+        uint32_t temp_output[kBatchSize];
+
+        if (int prefix = offset % 8) {
+            for (int i = prefix; i < 8; ++i) {
+                set_bit_to(out_bitmap, i, Op::template Call<T>(*left++, right));
+            }
+            out_bitmap++;
+        }
+
+        for (int64_t j = 0; j < num_batches; ++j) {
+            for (int i = 0; i < kBatchSize; ++i) {
+                temp_output[i] = Op::template Call<T>(*left++, right);
+            }
+            pack_bits<kBatchSize>(temp_output, out_bitmap);
+            out_bitmap += kBatchSize / 8;
+        }
+        int64_t bit_index = 0;
+        for (int64_t j = kBatchSize * num_batches; j < length; ++j) {
+            set_bit_to(out_bitmap, bit_index++, Op::template Call<T>(*left++, right));
+        }
+    }
+};
+
+template <typename T, typename Op>
+struct compare_primitive_scalar_arr {
+    static inline void Exec(const void* left_void, const void* right_void, int64_t length, void* out_void, const int offset) {
+        const T left = *reinterpret_cast<const T*>(left_void);
+        const T* right = reinterpret_cast<const T*>(right_void);
+        uint8_t* out_bitmap = reinterpret_cast<uint8_t*>(out_void);
+        static constexpr int kBatchSize = 32;
+        int64_t num_batches = length / kBatchSize;
+        uint32_t temp_output[kBatchSize];
+
+        if (int prefix = offset % 8) {
+            for (int i = prefix; i < 8; ++i) {
+                set_bit_to(out_bitmap, i, Op::template Call<T>(left, *right++));
+            }
+            out_bitmap++;
+        }
+
+        for (int64_t j = 0; j < num_batches; ++j) {
+            for (int i = 0; i < kBatchSize; ++i) {
+                temp_output[i] = Op::template Call<T>(left, *right++);
+            }
+            pack_bits<kBatchSize>(temp_output, out_bitmap);
+            out_bitmap += kBatchSize / 8;
+        }
+        int64_t bit_index = 0;
+        for (int64_t j = kBatchSize * num_batches; j < length; ++j) {
+            set_bit_to(out_bitmap, bit_index++, Op::template Call<T>(left, *right++));
+        }
+    }
+};
+
+enum class cmpop : int8_t {
+    EQUAL,
+    NOT_EQUAL,
+    GREATER,
+    GREATER_EQUAL,
+    // LESS and LESS_EQUAL are handled by doing flipped
+    // versions of GREATER and GREATER_EQUAL
+};
+
+template <typename Op, template <typename...> typename Impl>
+static inline void comparison_exec(const int type, const void* left, const void* right, void* out, const int64_t length, const int offset) {
+    const auto ty = static_cast<arrtype>(type);
+
+    switch (ty) {
+    case arrtype::UINT8:
+        return Impl<uint8_t, Op>::Exec(left, right, length, out, offset);
+    case arrtype::INT8:
+        return Impl<int8_t, Op>::Exec(left, right, length, out, offset);
+    case arrtype::UINT16:
+        return Impl<uint16_t, Op>::Exec(left, right, length, out, offset);
+    case arrtype::INT16:
+        return Impl<int16_t, Op>::Exec(left, right, length, out, offset);
+    case arrtype::UINT32:
+        return Impl<uint32_t, Op>::Exec(left, right, length, out, offset);
+    case arrtype::INT32:
+        return Impl<int32_t, Op>::Exec(left, right, length, out, offset);
+    case arrtype::UINT64:
+        return Impl<uint64_t, Op>::Exec(left, right, length, out, offset);
+    case arrtype::INT64:
+        return Impl<int64_t, Op>::Exec(left, right, length, out, offset);
+    case arrtype::FLOAT32:
+        return Impl<float, Op>::Exec(left, right, length, out, offset);
+    case arrtype::FLOAT64:
+        return Impl<double, Op>::Exec(left, right, length, out, offset);
+    default:
+        break;
+    }
+}
+
+extern "C" void FULL_NAME(comparison_equal_arr_arr)(const int type, const void* left, const void* right, void* out, const int64_t length, const int offset) {
+    comparison_exec<Equal, compare_primitive_arr_arr>(type, left, right, out, length, offset);
+}
+
+extern "C" void FULL_NAME(comparison_equal_arr_scalar)(const int type, const void* left, const void* right, void* out, const int64_t length, const int offset) {
+    comparison_exec<Equal, compare_primitive_arr_scalar>(type, left, right, out, length, offset);
+}
+
+extern "C" void FULL_NAME(comparison_equal_scalar_arr)(const int type, const void* left, const void* right, void* out, const int64_t length, const int offset) {
+    comparison_exec<Equal, compare_primitive_scalar_arr>(type, left, right, out, length, offset);
+}
+
+extern "C" void FULL_NAME(comparison_not_equal_arr_arr)(const int type, const void* left, const void* right, void* out, const int64_t length, const int offset) {
+    comparison_exec<NotEqual, compare_primitive_arr_arr>(type, left, right, out, length, offset);
+}
+
+extern "C" void FULL_NAME(comparison_not_equal_arr_scalar)(const int type, const void* left, const void* right, void* out, const int64_t length, const int offset) {
+    comparison_exec<NotEqual, compare_primitive_arr_scalar>(type, left, right, out, length, offset);
+}
+
+extern "C" void FULL_NAME(comparison_not_equal_scalar_arr)(const int type, const void* left, const void* right, void* out, const int64_t length, const int offset) {
+    comparison_exec<NotEqual, compare_primitive_scalar_arr>(type, left, right, out, length, offset);
+}
+
+extern "C" void FULL_NAME(comparison_greater_arr_arr)(const int type, const void* left, const void* right, void* out, const int64_t length, const int offset) {
+    comparison_exec<Greater, compare_primitive_arr_arr>(type, left, right, out, length, offset);
+}
+
+extern "C" void FULL_NAME(comparison_greater_arr_scalar)(const int type, const void* left, const void* right, void* out, const int64_t length, const int offset) {
+    comparison_exec<Greater, compare_primitive_arr_scalar>(type, left, right, out, length, offset);
+}
+
+extern "C" void FULL_NAME(comparison_greater_scalar_arr)(const int type, const void* left, const void* right, void* out, const int64_t length, const int offset) {
+    comparison_exec<Greater, compare_primitive_scalar_arr>(type, left, right, out, length, offset);
+}
+
+extern "C" void FULL_NAME(comparison_greater_equal_arr_arr)(const int type, const void* left, const void* right, void* out, const int64_t length, const int offset) {
+    comparison_exec<GreaterEqual, compare_primitive_arr_arr>(type, left, right, out, length, offset);
+}
+
+extern "C" void FULL_NAME(comparison_greater_equal_arr_scalar)(const int type, const void* left, const void* right, void* out, const int64_t length, const int offset) {
+    comparison_exec<GreaterEqual, compare_primitive_arr_scalar>(type, left, right, out, length, offset);
+}
+
+extern "C" void FULL_NAME(comparison_greater_equal_scalar_arr)(const int type, const void* left, const void* right, void* out, const int64_t length, const int offset) {
+    comparison_exec<GreaterEqual, compare_primitive_scalar_arr>(type, left, right, out, length, offset);
+}
diff --git a/go/arrow/compute/internal/kernels/_lib/scalar_comparison_avx2_amd64.s b/go/arrow/compute/internal/kernels/_lib/scalar_comparison_avx2_amd64.s
new file mode 100644
index 0000000000..b29d6694a1
--- /dev/null
+++ b/go/arrow/compute/internal/kernels/_lib/scalar_comparison_avx2_amd64.s
@@ -0,0 +1,67763 @@
+	.text
+	.intel_syntax noprefix
+	.file	"scalar_comparison.cc"
+	.globl	comparison_equal_arr_arr_avx2   # -- Begin function comparison_equal_arr_arr_avx2
+	.p2align	4, 0x90
+	.type	comparison_equal_arr_arr_avx2,@function
+comparison_equal_arr_arr_avx2:          # @comparison_equal_arr_arr_avx2
+# %bb.0:
+	push	rbp
+	mov	rbp, rsp
+	push	r15
+	push	r14
+	push	r13
+	push	r12
+	push	rbx
+	and	rsp, -8
+	sub	rsp, 72
+                                        # kill: def $r9d killed $r9d def $r9
+	mov	r11, r8
+	mov	r14, rcx
+	cmp	edi, 6
+	jg	.LBB0_29
+# %bb.1:
+	cmp	edi, 3
+	jle	.LBB0_2
+# %bb.15:
+	cmp	edi, 4
+	je	.LBB0_68
+# %bb.16:
+	cmp	edi, 5
+	je	.LBB0_79
+# %bb.17:
+	cmp	edi, 6
+	jne	.LBB0_123
+# %bb.18:
+	lea	r15, [r11 + 31]
+	test	r11, r11
+	cmovns	r15, r11
+	lea	eax, [r9 + 7]
+	test	r9d, r9d
+	cmovns	eax, r9d
+	and	eax, -8
+	sub	r9d, eax
+	je	.LBB0_22
+# %bb.19:
+	movsxd	rax, r9d
+	.p2align	4, 0x90
+.LBB0_20:                               # =>This Inner Loop Header: Depth=1
+	mov	ecx, dword ptr [rsi]
+	add	rsi, 4
+	cmp	ecx, dword ptr [rdx]
+	lea	rdx, [rdx + 4]
+	sete	r10b
+	neg	r10b
+	lea	rdi, [rax + 7]
+	test	rax, rax
+	cmovns	rdi, rax
+	sar	rdi, 3
+	movzx	r8d, byte ptr [r14 + rdi]
+	xor	r10b, r8b
+	lea	r9d, [8*rdi]
+	mov	ecx, eax
+	sub	ecx, r9d
+	mov	ebx, 1
+                                        # kill: def $cl killed $cl killed $ecx
+	shl	ebx, cl
+	and	bl, r10b
+	xor	bl, r8b
+	mov	byte ptr [r14 + rdi], bl
+	add	rax, 1
+	cmp	rax, 8
+	jne	.LBB0_20
+# %bb.21:
+	add	r14, 1
+.LBB0_22:
+	sar	r15, 5
+	cmp	r11, 32
+	jl	.LBB0_26
+# %bb.23:
+	mov	qword ptr [rsp + 24], r11       # 8-byte Spill
+	mov	qword ptr [rsp + 64], r15       # 8-byte Spill
+	mov	qword ptr [rsp + 56], r15       # 8-byte Spill
+	.p2align	4, 0x90
+.LBB0_24:                               # =>This Inner Loop Header: Depth=1
+	mov	qword ptr [rsp + 48], r14       # 8-byte Spill
+	mov	eax, dword ptr [rsi]
+	mov	ecx, dword ptr [rsi + 4]
+	cmp	eax, dword ptr [rdx]
+	sete	byte ptr [rsp + 40]             # 1-byte Folded Spill
+	cmp	ecx, dword ptr [rdx + 4]
+	sete	byte ptr [rsp + 32]             # 1-byte Folded Spill
+	mov	eax, dword ptr [rsi + 8]
+	cmp	eax, dword ptr [rdx + 8]
+	sete	byte ptr [rsp + 20]             # 1-byte Folded Spill
+	mov	eax, dword ptr [rsi + 12]
+	cmp	eax, dword ptr [rdx + 12]
+	sete	byte ptr [rsp + 21]             # 1-byte Folded Spill
+	mov	eax, dword ptr [rsi + 16]
+	cmp	eax, dword ptr [rdx + 16]
+	sete	byte ptr [rsp + 22]             # 1-byte Folded Spill
+	mov	eax, dword ptr [rsi + 20]
+	cmp	eax, dword ptr [rdx + 20]
+	sete	byte ptr [rsp + 23]             # 1-byte Folded Spill
+	mov	eax, dword ptr [rsi + 24]
+	cmp	eax, dword ptr [rdx + 24]
+	sete	byte ptr [rsp + 4]              # 1-byte Folded Spill
+	mov	eax, dword ptr [rsi + 28]
+	cmp	eax, dword ptr [rdx + 28]
+	sete	r13b
+	mov	eax, dword ptr [rsi + 32]
+	cmp	eax, dword ptr [rdx + 32]
+	sete	byte ptr [rsp + 9]              # 1-byte Folded Spill
+	mov	eax, dword ptr [rsi + 36]
+	cmp	eax, dword ptr [rdx + 36]
+	sete	r8b
+	mov	eax, dword ptr [rsi + 40]
+	cmp	eax, dword ptr [rdx + 40]
+	sete	r11b
+	mov	eax, dword ptr [rsi + 44]
+	cmp	eax, dword ptr [rdx + 44]
+	sete	r15b
+	mov	eax, dword ptr [rsi + 48]
+	cmp	eax, dword ptr [rdx + 48]
+	sete	byte ptr [rsp + 5]              # 1-byte Folded Spill
+	mov	eax, dword ptr [rsi + 52]
+	cmp	eax, dword ptr [rdx + 52]
+	sete	byte ptr [rsp + 6]              # 1-byte Folded Spill
+	mov	eax, dword ptr [rsi + 56]
+	cmp	eax, dword ptr [rdx + 56]
+	sete	byte ptr [rsp + 7]              # 1-byte Folded Spill
+	mov	eax, dword ptr [rsi + 60]
+	cmp	eax, dword ptr [rdx + 60]
+	sete	bl
+	mov	eax, dword ptr [rsi + 64]
+	mov	ecx, dword ptr [rsi + 68]
+	cmp	eax, dword ptr [rdx + 64]
+	mov	eax, dword ptr [rsi + 72]
+	sete	byte ptr [rsp + 10]             # 1-byte Folded Spill
+	cmp	ecx, dword ptr [rdx + 68]
+	mov	ecx, dword ptr [rsi + 76]
+	sete	r10b
+	cmp	eax, dword ptr [rdx + 72]
+	mov	eax, dword ptr [rsi + 80]
+	sete	r14b
+	cmp	ecx, dword ptr [rdx + 76]
+	mov	ecx, dword ptr [rsi + 84]
+	sete	r12b
+	cmp	eax, dword ptr [rdx + 80]
+	sete	byte ptr [rsp + 8]              # 1-byte Folded Spill
+	cmp	ecx, dword ptr [rdx + 84]
+	mov	eax, dword ptr [rsi + 88]
+	sete	byte ptr [rsp + 11]             # 1-byte Folded Spill
+	cmp	eax, dword ptr [rdx + 88]
+	mov	eax, dword ptr [rsi + 92]
+	sete	byte ptr [rsp + 12]             # 1-byte Folded Spill
+	cmp	eax, dword ptr [rdx + 92]
+	mov	eax, dword ptr [rsi + 96]
+	sete	r9b
+	cmp	eax, dword ptr [rdx + 96]
+	mov	eax, dword ptr [rsi + 100]
+	sete	byte ptr [rsp + 19]             # 1-byte Folded Spill
+	cmp	eax, dword ptr [rdx + 100]
+	mov	eax, dword ptr [rsi + 104]
+	sete	byte ptr [rsp + 13]             # 1-byte Folded Spill
+	cmp	eax, dword ptr [rdx + 104]
+	mov	eax, dword ptr [rsi + 108]
+	sete	byte ptr [rsp + 14]             # 1-byte Folded Spill
+	cmp	eax, dword ptr [rdx + 108]
+	mov	eax, dword ptr [rsi + 112]
+	sete	byte ptr [rsp + 15]             # 1-byte Folded Spill
+	cmp	eax, dword ptr [rdx + 112]
+	mov	eax, dword ptr [rsi + 116]
+	sete	byte ptr [rsp + 16]             # 1-byte Folded Spill
+	cmp	eax, dword ptr [rdx + 116]
+	mov	eax, dword ptr [rsi + 120]
+	sete	byte ptr [rsp + 18]             # 1-byte Folded Spill
+	cmp	eax, dword ptr [rdx + 120]
+	mov	eax, dword ptr [rsi + 124]
+	sete	byte ptr [rsp + 17]             # 1-byte Folded Spill
+	sub	rsi, -128
+	cmp	eax, dword ptr [rdx + 124]
+	sete	dil
+	movzx	eax, byte ptr [rsp + 32]        # 1-byte Folded Reload
+	add	al, al
+	add	al, byte ptr [rsp + 40]         # 1-byte Folded Reload
+	mov	ecx, eax
+	movzx	eax, byte ptr [rsp + 4]         # 1-byte Folded Reload
+	shl	al, 6
+	shl	r13b, 7
+	or	r13b, al
+	movzx	eax, byte ptr [rsp + 20]        # 1-byte Folded Reload
+	shl	al, 2
+	or	al, cl
+	add	r8b, r8b
+	add	r8b, byte ptr [rsp + 9]         # 1-byte Folded Reload
+	movzx	ecx, byte ptr [rsp + 21]        # 1-byte Folded Reload
+	shl	cl, 3
+	or	cl, al
+	mov	eax, ecx
+	shl	r11b, 2
+	or	r11b, r8b
+	movzx	ecx, byte ptr [rsp + 22]        # 1-byte Folded Reload
+	shl	cl, 4
+	or	cl, al
+	mov	r8d, ecx
+	shl	r15b, 3
+	or	r15b, r11b
+	movzx	ecx, byte ptr [rsp + 23]        # 1-byte Folded Reload
+	shl	cl, 5
+	or	cl, r8b
+	movzx	eax, byte ptr [rsp + 5]         # 1-byte Folded Reload
+	shl	al, 4
+	or	al, r15b
+	mov	r8d, eax
+	movzx	eax, byte ptr [rsp + 6]         # 1-byte Folded Reload
+	shl	al, 5
+	or	al, r8b
+	movzx	r8d, byte ptr [rsp + 7]         # 1-byte Folded Reload
+	shl	r8b, 6
+	shl	bl, 7
+	or	bl, r8b
+	or	r13b, cl
+	or	bl, al
+	add	r10b, r10b
+	add	r10b, byte ptr [rsp + 10]       # 1-byte Folded Reload
+	shl	r14b, 2
+	or	r14b, r10b
+	shl	r12b, 3
+	or	r12b, r14b
+	movzx	eax, byte ptr [rsp + 8]         # 1-byte Folded Reload
+	shl	al, 4
+	or	al, r12b
+	mov	ecx, eax
+	mov	r14, qword ptr [rsp + 48]       # 8-byte Reload
+	movzx	eax, byte ptr [rsp + 11]        # 1-byte Folded Reload
+	shl	al, 5
+	or	al, cl
+	mov	byte ptr [r14], r13b
+	movzx	ecx, byte ptr [rsp + 12]        # 1-byte Folded Reload
+	shl	cl, 6
+	shl	r9b, 7
+	or	r9b, cl
+	mov	byte ptr [r14 + 1], bl
+	or	r9b, al
+	movzx	eax, byte ptr [rsp + 13]        # 1-byte Folded Reload
+	add	al, al
+	add	al, byte ptr [rsp + 19]         # 1-byte Folded Reload
+	mov	ecx, eax
+	movzx	eax, byte ptr [rsp + 14]        # 1-byte Folded Reload
+	shl	al, 2
+	or	al, cl
+	mov	ecx, eax
+	movzx	eax, byte ptr [rsp + 15]        # 1-byte Folded Reload
+	shl	al, 3
+	or	al, cl
+	mov	ecx, eax
+	movzx	eax, byte ptr [rsp + 16]        # 1-byte Folded Reload
+	shl	al, 4
+	or	al, cl
+	mov	ecx, eax
+	movzx	eax, byte ptr [rsp + 18]        # 1-byte Folded Reload
+	shl	al, 5
+	or	al, cl
+	movzx	ecx, byte ptr [rsp + 17]        # 1-byte Folded Reload
+	shl	cl, 6
+	shl	dil, 7
+	or	dil, cl
+	or	dil, al
+	mov	byte ptr [r14 + 2], r9b
+	mov	byte ptr [r14 + 3], dil
+	add	rdx, 128
+	add	r14, 4
+	add	qword ptr [rsp + 56], -1        # 8-byte Folded Spill
+	jne	.LBB0_24
+# %bb.25:
+	mov	r11, qword ptr [rsp + 24]       # 8-byte Reload
+	mov	r15, qword ptr [rsp + 64]       # 8-byte Reload
+.LBB0_26:
+	shl	r15, 5
+	cmp	r15, r11
+	jge	.LBB0_123
+# %bb.27:
+	sub	r11, r15
+	xor	ecx, ecx
+	.p2align	4, 0x90
+.LBB0_28:                               # =>This Inner Loop Header: Depth=1
+	lea	r8, [rcx + 1]
+	mov	edi, dword ptr [rsi + 4*rcx]
+	cmp	edi, dword ptr [rdx + 4*rcx]
+	sete	bl
+	neg	bl
+	mov	rdi, rcx
+	shr	rdi, 3
+	movzx	r9d, byte ptr [r14 + rdi]
+	xor	bl, r9b
+	and	cl, 7
+	mov	al, 1
+                                        # kill: def $cl killed $cl killed $rcx
+	shl	al, cl
+	and	al, bl
+	xor	al, r9b
+	mov	byte ptr [r14 + rdi], al
+	mov	rcx, r8
+	cmp	r11, r8
+	jne	.LBB0_28
+	jmp	.LBB0_123
+.LBB0_29:
+	cmp	edi, 8
+	jle	.LBB0_30
+# %bb.43:
+	cmp	edi, 9
+	je	.LBB0_101
+# %bb.44:
+	cmp	edi, 11
+	je	.LBB0_112
+# %bb.45:
+	cmp	edi, 12
+	jne	.LBB0_123
+# %bb.46:
+	lea	r15, [r11 + 31]
+	test	r11, r11
+	cmovns	r15, r11
+	lea	eax, [r9 + 7]
+	test	r9d, r9d
+	cmovns	eax, r9d
+	and	eax, -8
+	sub	r9d, eax
+	je	.LBB0_50
+# %bb.47:
+	movsxd	rax, r9d
+	.p2align	4, 0x90
+.LBB0_48:                               # =>This Inner Loop Header: Depth=1
+	vmovsd	xmm0, qword ptr [rsi]           # xmm0 = mem[0],zero
+	add	rsi, 8
+	vucomisd	xmm0, qword ptr [rdx]
+	lea	rdx, [rdx + 8]
+	sete	r10b
+	neg	r10b
+	lea	rdi, [rax + 7]
+	test	rax, rax
+	cmovns	rdi, rax
+	sar	rdi, 3
+	movzx	r8d, byte ptr [r14 + rdi]
+	xor	r10b, r8b
+	lea	r9d, [8*rdi]
+	mov	ecx, eax
+	sub	ecx, r9d
+	mov	ebx, 1
+                                        # kill: def $cl killed $cl killed $ecx
+	shl	ebx, cl
+	and	bl, r10b
+	xor	bl, r8b
+	mov	byte ptr [r14 + rdi], bl
+	add	rax, 1
+	cmp	rax, 8
+	jne	.LBB0_48
+# %bb.49:
+	add	r14, 1
+.LBB0_50:
+	sar	r15, 5
+	cmp	r11, 32
+	jl	.LBB0_54
+# %bb.51:
+	mov	qword ptr [rsp + 24], r11       # 8-byte Spill
+	mov	qword ptr [rsp + 32], r15       # 8-byte Spill
+	mov	qword ptr [rsp + 40], r15       # 8-byte Spill
+	.p2align	4, 0x90
+.LBB0_52:                               # =>This Inner Loop Header: Depth=1
+	mov	qword ptr [rsp + 48], r14       # 8-byte Spill
+	vmovsd	xmm0, qword ptr [rsi]           # xmm0 = mem[0],zero
+	vmovsd	xmm1, qword ptr [rsi + 8]       # xmm1 = mem[0],zero
+	vucomisd	xmm0, qword ptr [rdx]
+	sete	byte ptr [rsp + 4]              # 1-byte Folded Spill
+	vucomisd	xmm1, qword ptr [rdx + 8]
+	sete	al
+	vmovsd	xmm0, qword ptr [rsi + 16]      # xmm0 = mem[0],zero
+	vucomisd	xmm0, qword ptr [rdx + 16]
+	vmovsd	xmm0, qword ptr [rsi + 24]      # xmm0 = mem[0],zero
+	sete	byte ptr [rsp + 5]              # 1-byte Folded Spill
+	vucomisd	xmm0, qword ptr [rdx + 24]
+	sete	byte ptr [rsp + 22]             # 1-byte Folded Spill
+	vmovsd	xmm0, qword ptr [rsi + 32]      # xmm0 = mem[0],zero
+	vucomisd	xmm0, qword ptr [rdx + 32]
+	vmovsd	xmm0, qword ptr [rsi + 40]      # xmm0 = mem[0],zero
+	sete	byte ptr [rsp + 21]             # 1-byte Folded Spill
+	vucomisd	xmm0, qword ptr [rdx + 40]
+	sete	byte ptr [rsp + 23]             # 1-byte Folded Spill
+	vmovsd	xmm0, qword ptr [rsi + 48]      # xmm0 = mem[0],zero
+	vucomisd	xmm0, qword ptr [rdx + 48]
+	vmovsd	xmm0, qword ptr [rsi + 56]      # xmm0 = mem[0],zero
+	sete	r13b
+	vucomisd	xmm0, qword ptr [rdx + 56]
+	sete	r15b
+	vmovsd	xmm0, qword ptr [rsi + 64]      # xmm0 = mem[0],zero
+	vucomisd	xmm0, qword ptr [rdx + 64]
+	vmovsd	xmm0, qword ptr [rsi + 72]      # xmm0 = mem[0],zero
+	sete	byte ptr [rsp + 8]              # 1-byte Folded Spill
+	vucomisd	xmm0, qword ptr [rdx + 72]
+	sete	cl
+	vmovsd	xmm0, qword ptr [rsi + 80]      # xmm0 = mem[0],zero
+	vucomisd	xmm0, qword ptr [rdx + 80]
+	vmovsd	xmm0, qword ptr [rsi + 88]      # xmm0 = mem[0],zero
+	sete	r9b
+	vucomisd	xmm0, qword ptr [rdx + 88]
+	sete	r11b
+	vmovsd	xmm0, qword ptr [rsi + 96]      # xmm0 = mem[0],zero
+	vucomisd	xmm0, qword ptr [rdx + 96]
+	vmovsd	xmm0, qword ptr [rsi + 104]     # xmm0 = mem[0],zero
+	sete	r10b
+	vucomisd	xmm0, qword ptr [rdx + 104]
+	sete	byte ptr [rsp + 7]              # 1-byte Folded Spill
+	vmovsd	xmm0, qword ptr [rsi + 112]     # xmm0 = mem[0],zero
+	vucomisd	xmm0, qword ptr [rdx + 112]
+	vmovsd	xmm0, qword ptr [rsi + 120]     # xmm0 = mem[0],zero
+	sete	byte ptr [rsp + 6]              # 1-byte Folded Spill
+	vucomisd	xmm0, qword ptr [rdx + 120]
+	sete	bl
+	vmovsd	xmm0, qword ptr [rsi + 128]     # xmm0 = mem[0],zero
+	vucomisd	xmm0, qword ptr [rdx + 128]
+	vmovsd	xmm0, qword ptr [rsi + 136]     # xmm0 = mem[0],zero
+	sete	byte ptr [rsp + 14]             # 1-byte Folded Spill
+	vucomisd	xmm0, qword ptr [rdx + 136]
+	vmovsd	xmm0, qword ptr [rsi + 144]     # xmm0 = mem[0],zero
+	sete	r14b
+	vucomisd	xmm0, qword ptr [rdx + 144]
+	vmovsd	xmm0, qword ptr [rsi + 152]     # xmm0 = mem[0],zero
+	sete	r12b
+	vucomisd	xmm0, qword ptr [rdx + 152]
+	vmovsd	xmm0, qword ptr [rsi + 160]     # xmm0 = mem[0],zero
+	sete	byte ptr [rsp + 9]              # 1-byte Folded Spill
+	vucomisd	xmm0, qword ptr [rdx + 160]
+	vmovsd	xmm0, qword ptr [rsi + 168]     # xmm0 = mem[0],zero
+	sete	byte ptr [rsp + 10]             # 1-byte Folded Spill
+	vucomisd	xmm0, qword ptr [rdx + 168]
+	vmovsd	xmm0, qword ptr [rsi + 176]     # xmm0 = mem[0],zero
+	sete	byte ptr [rsp + 11]             # 1-byte Folded Spill
+	vucomisd	xmm0, qword ptr [rdx + 176]
+	vmovsd	xmm0, qword ptr [rsi + 184]     # xmm0 = mem[0],zero
+	sete	byte ptr [rsp + 12]             # 1-byte Folded Spill
+	vucomisd	xmm0, qword ptr [rdx + 184]
+	vmovsd	xmm0, qword ptr [rsi + 192]     # xmm0 = mem[0],zero
+	sete	r8b
+	vucomisd	xmm0, qword ptr [rdx + 192]
+	vmovsd	xmm0, qword ptr [rsi + 200]     # xmm0 = mem[0],zero
+	sete	byte ptr [rsp + 20]             # 1-byte Folded Spill
+	vucomisd	xmm0, qword ptr [rdx + 200]
+	vmovsd	xmm0, qword ptr [rsi + 208]     # xmm0 = mem[0],zero
+	sete	byte ptr [rsp + 13]             # 1-byte Folded Spill
+	vucomisd	xmm0, qword ptr [rdx + 208]
+	vmovsd	xmm0, qword ptr [rsi + 216]     # xmm0 = mem[0],zero
+	sete	byte ptr [rsp + 15]             # 1-byte Folded Spill
+	vucomisd	xmm0, qword ptr [rdx + 216]
+	vmovsd	xmm0, qword ptr [rsi + 224]     # xmm0 = mem[0],zero
+	sete	byte ptr [rsp + 16]             # 1-byte Folded Spill
+	vucomisd	xmm0, qword ptr [rdx + 224]
+	vmovsd	xmm0, qword ptr [rsi + 232]     # xmm0 = mem[0],zero
+	sete	byte ptr [rsp + 17]             # 1-byte Folded Spill
+	vucomisd	xmm0, qword ptr [rdx + 232]
+	vmovsd	xmm0, qword ptr [rsi + 240]     # xmm0 = mem[0],zero
+	sete	byte ptr [rsp + 19]             # 1-byte Folded Spill
+	vucomisd	xmm0, qword ptr [rdx + 240]
+	vmovsd	xmm0, qword ptr [rsi + 248]     # xmm0 = mem[0],zero
+	sete	byte ptr [rsp + 18]             # 1-byte Folded Spill
+	add	rsi, 256
+	vucomisd	xmm0, qword ptr [rdx + 248]
+	sete	dil
+	add	al, al
+	add	al, byte ptr [rsp + 4]          # 1-byte Folded Reload
+	shl	r13b, 6
+	shl	r15b, 7
+	or	r15b, r13b
+	movzx	r13d, byte ptr [rsp + 5]        # 1-byte Folded Reload
+	shl	r13b, 2
+	or	r13b, al
+	mov	eax, r13d
+	add	cl, cl
+	add	cl, byte ptr [rsp + 8]          # 1-byte Folded Reload
+	movzx	r13d, byte ptr [rsp + 22]       # 1-byte Folded Reload
+	shl	r13b, 3
+	or	r13b, al
+	shl	r9b, 2
+	or	r9b, cl
+	movzx	ecx, byte ptr [rsp + 21]        # 1-byte Folded Reload
+	shl	cl, 4
+	or	cl, r13b
+	mov	r13d, ecx
+	shl	r11b, 3
+	or	r11b, r9b
+	movzx	ecx, byte ptr [rsp + 23]        # 1-byte Folded Reload
+	shl	cl, 5
+	or	cl, r13b
+	shl	r10b, 4
+	or	r10b, r11b
+	movzx	eax, byte ptr [rsp + 7]         # 1-byte Folded Reload
+	shl	al, 5
+	or	al, r10b
+	movzx	r9d, byte ptr [rsp + 6]         # 1-byte Folded Reload
+	shl	r9b, 6
+	shl	bl, 7
+	or	bl, r9b
+	or	r15b, cl
+	or	bl, al
+	add	r14b, r14b
+	add	r14b, byte ptr [rsp + 14]       # 1-byte Folded Reload
+	shl	r12b, 2
+	or	r12b, r14b
+	mov	r14, qword ptr [rsp + 48]       # 8-byte Reload
+	movzx	eax, byte ptr [rsp + 9]         # 1-byte Folded Reload
+	shl	al, 3
+	or	al, r12b
+	mov	ecx, eax
+	movzx	eax, byte ptr [rsp + 10]        # 1-byte Folded Reload
+	shl	al, 4
+	or	al, cl
+	mov	ecx, eax
+	movzx	eax, byte ptr [rsp + 11]        # 1-byte Folded Reload
+	shl	al, 5
+	or	al, cl
+	mov	byte ptr [r14], r15b
+	movzx	ecx, byte ptr [rsp + 12]        # 1-byte Folded Reload
+	shl	cl, 6
+	shl	r8b, 7
+	or	r8b, cl
+	mov	byte ptr [r14 + 1], bl
+	or	r8b, al
+	movzx	eax, byte ptr [rsp + 13]        # 1-byte Folded Reload
+	add	al, al
+	add	al, byte ptr [rsp + 20]         # 1-byte Folded Reload
+	mov	ecx, eax
+	movzx	eax, byte ptr [rsp + 15]        # 1-byte Folded Reload
+	shl	al, 2
+	or	al, cl
+	mov	ecx, eax
+	movzx	eax, byte ptr [rsp + 16]        # 1-byte Folded Reload
+	shl	al, 3
+	or	al, cl
+	mov	ecx, eax
+	movzx	eax, byte ptr [rsp + 17]        # 1-byte Folded Reload
+	shl	al, 4
+	or	al, cl
+	movzx	ecx, byte ptr [rsp + 19]        # 1-byte Folded Reload
+	shl	cl, 5
+	or	cl, al
+	movzx	eax, byte ptr [rsp + 18]        # 1-byte Folded Reload
+	shl	al, 6
+	shl	dil, 7
+	or	dil, al
+	or	dil, cl
+	mov	byte ptr [r14 + 2], r8b
+	mov	byte ptr [r14 + 3], dil
+	add	rdx, 256
+	add	r14, 4
+	add	qword ptr [rsp + 40], -1        # 8-byte Folded Spill
+	jne	.LBB0_52
+# %bb.53:
+	mov	r11, qword ptr [rsp + 24]       # 8-byte Reload
+	mov	r15, qword ptr [rsp + 32]       # 8-byte Reload
+.LBB0_54:
+	shl	r15, 5
+	cmp	r15, r11
+	jge	.LBB0_123
+# %bb.55:
+	sub	r11, r15
+	xor	ecx, ecx
+	.p2align	4, 0x90
+.LBB0_56:                               # =>This Inner Loop Header: Depth=1
+	vmovsd	xmm0, qword ptr [rsi + 8*rcx]   # xmm0 = mem[0],zero
+	vucomisd	xmm0, qword ptr [rdx + 8*rcx]
+	lea	r8, [rcx + 1]
+	sete	bl
+	neg	bl
+	mov	rdi, rcx
+	shr	rdi, 3
+	movzx	r9d, byte ptr [r14 + rdi]
+	xor	bl, r9b
+	and	cl, 7
+	mov	al, 1
+                                        # kill: def $cl killed $cl killed $rcx
+	shl	al, cl
+	and	al, bl
+	xor	al, r9b
+	mov	byte ptr [r14 + rdi], al
+	mov	rcx, r8
+	cmp	r11, r8
+	jne	.LBB0_56
+	jmp	.LBB0_123
+.LBB0_2:
+	cmp	edi, 2
+	je	.LBB0_57
+# %bb.3:
+	cmp	edi, 3
+	jne	.LBB0_123
+# %bb.4:
+	lea	r15, [r11 + 31]
+	test	r11, r11
+	cmovns	r15, r11
+	lea	eax, [r9 + 7]
+	test	r9d, r9d
+	cmovns	eax, r9d
+	and	eax, -8
+	sub	r9d, eax
+	je	.LBB0_8
+# %bb.5:
+	movsxd	rax, r9d
+	.p2align	4, 0x90
+.LBB0_6:                                # =>This Inner Loop Header: Depth=1
+	movzx	ecx, byte ptr [rsi]
+	add	rsi, 1
+	cmp	cl, byte ptr [rdx]
+	lea	rdx, [rdx + 1]
+	sete	r10b
+	neg	r10b
+	lea	rdi, [rax + 7]
+	test	rax, rax
+	cmovns	rdi, rax
+	sar	rdi, 3
+	movzx	r8d, byte ptr [r14 + rdi]
+	xor	r10b, r8b
+	lea	r9d, [8*rdi]
+	mov	ecx, eax
+	sub	ecx, r9d
+	mov	ebx, 1
+                                        # kill: def $cl killed $cl killed $ecx
+	shl	ebx, cl
+	and	bl, r10b
+	xor	bl, r8b
+	mov	byte ptr [r14 + rdi], bl
+	add	rax, 1
+	cmp	rax, 8
+	jne	.LBB0_6
+# %bb.7:
+	add	r14, 1
+.LBB0_8:
+	sar	r15, 5
+	cmp	r11, 32
+	jl	.LBB0_12
+# %bb.9:
+	mov	qword ptr [rsp + 24], r11       # 8-byte Spill
+	mov	qword ptr [rsp + 56], r15       # 8-byte Spill
+	mov	qword ptr [rsp + 32], r15       # 8-byte Spill
+	.p2align	4, 0x90
+.LBB0_10:                               # =>This Inner Loop Header: Depth=1
+	mov	qword ptr [rsp + 48], r14       # 8-byte Spill
+	movzx	eax, byte ptr [rsi]
+	movzx	ecx, byte ptr [rsi + 1]
+	cmp	al, byte ptr [rdx]
+	sete	byte ptr [rsp + 40]             # 1-byte Folded Spill
+	cmp	cl, byte ptr [rdx + 1]
+	sete	cl
+	movzx	eax, byte ptr [rsi + 2]
+	cmp	al, byte ptr [rdx + 2]
+	sete	byte ptr [rsp + 20]             # 1-byte Folded Spill
+	movzx	eax, byte ptr [rsi + 3]
+	cmp	al, byte ptr [rdx + 3]
+	sete	byte ptr [rsp + 21]             # 1-byte Folded Spill
+	movzx	eax, byte ptr [rsi + 4]
+	cmp	al, byte ptr [rdx + 4]
+	sete	byte ptr [rsp + 22]             # 1-byte Folded Spill
+	movzx	eax, byte ptr [rsi + 5]
+	cmp	al, byte ptr [rdx + 5]
+	sete	byte ptr [rsp + 23]             # 1-byte Folded Spill
+	movzx	eax, byte ptr [rsi + 6]
+	cmp	al, byte ptr [rdx + 6]
+	sete	byte ptr [rsp + 4]              # 1-byte Folded Spill
+	movzx	eax, byte ptr [rsi + 7]
+	cmp	al, byte ptr [rdx + 7]
+	sete	r15b
+	movzx	eax, byte ptr [rsi + 8]
+	cmp	al, byte ptr [rdx + 8]
+	sete	byte ptr [rsp + 7]              # 1-byte Folded Spill
+	movzx	eax, byte ptr [rsi + 9]
+	cmp	al, byte ptr [rdx + 9]
+	sete	dil
+	movzx	eax, byte ptr [rsi + 10]
+	cmp	al, byte ptr [rdx + 10]
+	sete	r10b
+	movzx	eax, byte ptr [rsi + 11]
+	cmp	al, byte ptr [rdx + 11]
+	sete	r11b
+	movzx	eax, byte ptr [rsi + 12]
+	cmp	al, byte ptr [rdx + 12]
+	sete	r14b
+	movzx	eax, byte ptr [rsi + 13]
+	cmp	al, byte ptr [rdx + 13]
+	sete	byte ptr [rsp + 5]              # 1-byte Folded Spill
+	movzx	eax, byte ptr [rsi + 14]
+	cmp	al, byte ptr [rdx + 14]
+	sete	byte ptr [rsp + 6]              # 1-byte Folded Spill
+	movzx	eax, byte ptr [rsi + 15]
+	cmp	al, byte ptr [rdx + 15]
+	sete	bl
+	movzx	eax, byte ptr [rsi + 16]
+	cmp	al, byte ptr [rdx + 16]
+	sete	byte ptr [rsp + 13]             # 1-byte Folded Spill
+	movzx	eax, byte ptr [rsi + 17]
+	cmp	al, byte ptr [rdx + 17]
+	sete	r12b
+	movzx	eax, byte ptr [rsi + 18]
+	cmp	al, byte ptr [rdx + 18]
+	sete	r13b
+	movzx	eax, byte ptr [rsi + 19]
+	cmp	al, byte ptr [rdx + 19]
+	sete	byte ptr [rsp + 8]              # 1-byte Folded Spill
+	movzx	eax, byte ptr [rsi + 20]
+	cmp	al, byte ptr [rdx + 20]
+	sete	byte ptr [rsp + 9]              # 1-byte Folded Spill
+	movzx	eax, byte ptr [rsi + 21]
+	cmp	al, byte ptr [rdx + 21]
+	sete	byte ptr [rsp + 10]             # 1-byte Folded Spill
+	movzx	eax, byte ptr [rsi + 22]
+	cmp	al, byte ptr [rdx + 22]
+	sete	byte ptr [rsp + 11]             # 1-byte Folded Spill
+	movzx	eax, byte ptr [rsi + 23]
+	cmp	al, byte ptr [rdx + 23]
+	sete	r9b
+	movzx	eax, byte ptr [rsi + 24]
+	cmp	al, byte ptr [rdx + 24]
+	sete	byte ptr [rsp + 19]             # 1-byte Folded Spill
+	movzx	eax, byte ptr [rsi + 25]
+	cmp	al, byte ptr [rdx + 25]
+	sete	byte ptr [rsp + 12]             # 1-byte Folded Spill
+	movzx	eax, byte ptr [rsi + 26]
+	cmp	al, byte ptr [rdx + 26]
+	sete	byte ptr [rsp + 14]             # 1-byte Folded Spill
+	movzx	eax, byte ptr [rsi + 27]
+	cmp	al, byte ptr [rdx + 27]
+	sete	byte ptr [rsp + 15]             # 1-byte Folded Spill
+	movzx	eax, byte ptr [rsi + 28]
+	cmp	al, byte ptr [rdx + 28]
+	sete	byte ptr [rsp + 16]             # 1-byte Folded Spill
+	movzx	eax, byte ptr [rsi + 29]
+	cmp	al, byte ptr [rdx + 29]
+	sete	byte ptr [rsp + 17]             # 1-byte Folded Spill
+	movzx	eax, byte ptr [rsi + 30]
+	cmp	al, byte ptr [rdx + 30]
+	sete	byte ptr [rsp + 18]             # 1-byte Folded Spill
+	movzx	eax, byte ptr [rsi + 31]
+	add	rsi, 32
+	cmp	al, byte ptr [rdx + 31]
+	sete	r8b
+	add	cl, cl
+	add	cl, byte ptr [rsp + 40]         # 1-byte Folded Reload
+	mov	eax, ecx
+	movzx	ecx, byte ptr [rsp + 4]         # 1-byte Folded Reload
+	shl	cl, 6
+	shl	r15b, 7
+	or	r15b, cl
+	movzx	ecx, byte ptr [rsp + 20]        # 1-byte Folded Reload
+	shl	cl, 2
+	or	cl, al
+	mov	eax, ecx
+	add	dil, dil
+	add	dil, byte ptr [rsp + 7]         # 1-byte Folded Reload
+	movzx	ecx, byte ptr [rsp + 21]        # 1-byte Folded Reload
+	shl	cl, 3
+	or	cl, al
+	mov	eax, ecx
+	shl	r10b, 2
+	or	r10b, dil
+	movzx	ecx, byte ptr [rsp + 22]        # 1-byte Folded Reload
+	shl	cl, 4
+	or	cl, al
+	mov	edi, ecx
+	shl	r11b, 3
+	or	r11b, r10b
+	movzx	ecx, byte ptr [rsp + 23]        # 1-byte Folded Reload
+	shl	cl, 5
+	or	cl, dil
+	shl	r14b, 4
+	or	r14b, r11b
+	movzx	eax, byte ptr [rsp + 5]         # 1-byte Folded Reload
+	shl	al, 5
+	or	al, r14b
+	movzx	edi, byte ptr [rsp + 6]         # 1-byte Folded Reload
+	shl	dil, 6
+	shl	bl, 7
+	or	bl, dil
+	or	r15b, cl
+	or	bl, al
+	add	r12b, r12b
+	add	r12b, byte ptr [rsp + 13]       # 1-byte Folded Reload
+	shl	r13b, 2
+	or	r13b, r12b
+	mov	r14, qword ptr [rsp + 48]       # 8-byte Reload
+	movzx	eax, byte ptr [rsp + 8]         # 1-byte Folded Reload
+	shl	al, 3
+	or	al, r13b
+	mov	ecx, eax
+	movzx	eax, byte ptr [rsp + 9]         # 1-byte Folded Reload
+	shl	al, 4
+	or	al, cl
+	mov	ecx, eax
+	movzx	eax, byte ptr [rsp + 10]        # 1-byte Folded Reload
+	shl	al, 5
+	or	al, cl
+	mov	byte ptr [r14], r15b
+	movzx	ecx, byte ptr [rsp + 11]        # 1-byte Folded Reload
+	shl	cl, 6
+	shl	r9b, 7
+	or	r9b, cl
+	mov	byte ptr [r14 + 1], bl
+	or	r9b, al
+	movzx	eax, byte ptr [rsp + 12]        # 1-byte Folded Reload
+	add	al, al
+	add	al, byte ptr [rsp + 19]         # 1-byte Folded Reload
+	mov	ecx, eax
+	movzx	eax, byte ptr [rsp + 14]        # 1-byte Folded Reload
+	shl	al, 2
+	or	al, cl
+	mov	ecx, eax
+	movzx	eax, byte ptr [rsp + 15]        # 1-byte Folded Reload
+	shl	al, 3
+	or	al, cl
+	mov	ecx, eax
+	movzx	eax, byte ptr [rsp + 16]        # 1-byte Folded Reload
+	shl	al, 4
+	or	al, cl
+	mov	ecx, eax
+	movzx	eax, byte ptr [rsp + 17]        # 1-byte Folded Reload
+	shl	al, 5
+	or	al, cl
+	movzx	ecx, byte ptr [rsp + 18]        # 1-byte Folded Reload
+	shl	cl, 6
+	shl	r8b, 7
+	or	r8b, cl
+	or	r8b, al
+	mov	byte ptr [r14 + 2], r9b
+	mov	byte ptr [r14 + 3], r8b
+	add	rdx, 32
+	add	r14, 4
+	add	qword ptr [rsp + 32], -1        # 8-byte Folded Spill
+	jne	.LBB0_10
+# %bb.11:
+	mov	r11, qword ptr [rsp + 24]       # 8-byte Reload
+	mov	r15, qword ptr [rsp + 56]       # 8-byte Reload
+.LBB0_12:
+	shl	r15, 5
+	cmp	r15, r11
+	jge	.LBB0_123
+# %bb.13:
+	sub	r11, r15
+	xor	ecx, ecx
+	.p2align	4, 0x90
+.LBB0_14:                               # =>This Inner Loop Header: Depth=1
+	lea	r8, [rcx + 1]
+	movzx	ebx, byte ptr [rsi + rcx]
+	cmp	bl, byte ptr [rdx + rcx]
+	sete	bl
+	neg	bl
+	mov	rdi, rcx
+	shr	rdi, 3
+	movzx	r9d, byte ptr [r14 + rdi]
+	xor	bl, r9b
+	and	cl, 7
+	mov	al, 1
+                                        # kill: def $cl killed $cl killed $rcx
+	shl	al, cl
+	and	al, bl
+	xor	al, r9b
+	mov	byte ptr [r14 + rdi], al
+	mov	rcx, r8
+	cmp	r11, r8
+	jne	.LBB0_14
+	jmp	.LBB0_123
+.LBB0_30:
+	cmp	edi, 7
+	je	.LBB0_90
+# %bb.31:
+	cmp	edi, 8
+	jne	.LBB0_123
+# %bb.32:
+	lea	r15, [r11 + 31]
+	test	r11, r11
+	cmovns	r15, r11
+	lea	eax, [r9 + 7]
+	test	r9d, r9d
+	cmovns	eax, r9d
+	and	eax, -8
+	sub	r9d, eax
+	je	.LBB0_36
+# %bb.33:
+	movsxd	rax, r9d
+	.p2align	4, 0x90
+.LBB0_34:                               # =>This Inner Loop Header: Depth=1
+	mov	rcx, qword ptr [rsi]
+	add	rsi, 8
+	cmp	rcx, qword ptr [rdx]
+	lea	rdx, [rdx + 8]
+	sete	r10b
+	neg	r10b
+	lea	rdi, [rax + 7]
+	test	rax, rax
+	cmovns	rdi, rax
+	sar	rdi, 3
+	movzx	r8d, byte ptr [r14 + rdi]
+	xor	r10b, r8b
+	lea	r9d, [8*rdi]
+	mov	ecx, eax
+	sub	ecx, r9d
+	mov	ebx, 1
+                                        # kill: def $cl killed $cl killed $ecx
+	shl	ebx, cl
+	and	bl, r10b
+	xor	bl, r8b
+	mov	byte ptr [r14 + rdi], bl
+	add	rax, 1
+	cmp	rax, 8
+	jne	.LBB0_34
+# %bb.35:
+	add	r14, 1
+.LBB0_36:
+	sar	r15, 5
+	cmp	r11, 32
+	jl	.LBB0_40
+# %bb.37:
+	mov	qword ptr [rsp + 24], r11       # 8-byte Spill
+	mov	qword ptr [rsp + 64], r15       # 8-byte Spill
+	mov	qword ptr [rsp + 56], r15       # 8-byte Spill
+	.p2align	4, 0x90
+.LBB0_38:                               # =>This Inner Loop Header: Depth=1
+	mov	qword ptr [rsp + 48], r14       # 8-byte Spill
+	mov	rax, qword ptr [rsi]
+	mov	rcx, qword ptr [rsi + 8]
+	cmp	rax, qword ptr [rdx]
+	sete	byte ptr [rsp + 40]             # 1-byte Folded Spill
+	cmp	rcx, qword ptr [rdx + 8]
+	sete	byte ptr [rsp + 32]             # 1-byte Folded Spill
+	mov	rax, qword ptr [rsi + 16]
+	cmp	rax, qword ptr [rdx + 16]
+	sete	byte ptr [rsp + 20]             # 1-byte Folded Spill
+	mov	rax, qword ptr [rsi + 24]
+	cmp	rax, qword ptr [rdx + 24]
+	sete	byte ptr [rsp + 21]             # 1-byte Folded Spill
+	mov	rax, qword ptr [rsi + 32]
+	cmp	rax, qword ptr [rdx + 32]
+	sete	byte ptr [rsp + 22]             # 1-byte Folded Spill
+	mov	rax, qword ptr [rsi + 40]
+	cmp	rax, qword ptr [rdx + 40]
+	sete	byte ptr [rsp + 23]             # 1-byte Folded Spill
+	mov	rax, qword ptr [rsi + 48]
+	cmp	rax, qword ptr [rdx + 48]
+	sete	byte ptr [rsp + 4]              # 1-byte Folded Spill
+	mov	rax, qword ptr [rsi + 56]
+	cmp	rax, qword ptr [rdx + 56]
+	sete	r13b
+	mov	rax, qword ptr [rsi + 64]
+	cmp	rax, qword ptr [rdx + 64]
+	sete	byte ptr [rsp + 9]              # 1-byte Folded Spill
+	mov	rax, qword ptr [rsi + 72]
+	cmp	rax, qword ptr [rdx + 72]
+	sete	r8b
+	mov	rax, qword ptr [rsi + 80]
+	cmp	rax, qword ptr [rdx + 80]
+	sete	r11b
+	mov	rax, qword ptr [rsi + 88]
+	cmp	rax, qword ptr [rdx + 88]
+	sete	r15b
+	mov	rax, qword ptr [rsi + 96]
+	cmp	rax, qword ptr [rdx + 96]
+	sete	byte ptr [rsp + 5]              # 1-byte Folded Spill
+	mov	rax, qword ptr [rsi + 104]
+	cmp	rax, qword ptr [rdx + 104]
+	sete	byte ptr [rsp + 6]              # 1-byte Folded Spill
+	mov	rax, qword ptr [rsi + 112]
+	cmp	rax, qword ptr [rdx + 112]
+	sete	byte ptr [rsp + 7]              # 1-byte Folded Spill
+	mov	rax, qword ptr [rsi + 120]
+	cmp	rax, qword ptr [rdx + 120]
+	sete	bl
+	mov	rax, qword ptr [rsi + 128]
+	mov	rcx, qword ptr [rsi + 136]
+	cmp	rax, qword ptr [rdx + 128]
+	mov	rax, qword ptr [rsi + 144]
+	sete	byte ptr [rsp + 10]             # 1-byte Folded Spill
+	cmp	rcx, qword ptr [rdx + 136]
+	mov	rcx, qword ptr [rsi + 152]
+	sete	r10b
+	cmp	rax, qword ptr [rdx + 144]
+	mov	rax, qword ptr [rsi + 160]
+	sete	r14b
+	cmp	rcx, qword ptr [rdx + 152]
+	mov	rcx, qword ptr [rsi + 168]
+	sete	r12b
+	cmp	rax, qword ptr [rdx + 160]
+	sete	byte ptr [rsp + 8]              # 1-byte Folded Spill
+	cmp	rcx, qword ptr [rdx + 168]
+	mov	rax, qword ptr [rsi + 176]
+	sete	byte ptr [rsp + 11]             # 1-byte Folded Spill
+	cmp	rax, qword ptr [rdx + 176]
+	mov	rax, qword ptr [rsi + 184]
+	sete	byte ptr [rsp + 12]             # 1-byte Folded Spill
+	cmp	rax, qword ptr [rdx + 184]
+	mov	rax, qword ptr [rsi + 192]
+	sete	r9b
+	cmp	rax, qword ptr [rdx + 192]
+	mov	rax, qword ptr [rsi + 200]
+	sete	byte ptr [rsp + 19]             # 1-byte Folded Spill
+	cmp	rax, qword ptr [rdx + 200]
+	mov	rax, qword ptr [rsi + 208]
+	sete	byte ptr [rsp + 13]             # 1-byte Folded Spill
+	cmp	rax, qword ptr [rdx + 208]
+	mov	rax, qword ptr [rsi + 216]
+	sete	byte ptr [rsp + 14]             # 1-byte Folded Spill
+	cmp	rax, qword ptr [rdx + 216]
+	mov	rax, qword ptr [rsi + 224]
+	sete	byte ptr [rsp + 15]             # 1-byte Folded Spill
+	cmp	rax, qword ptr [rdx + 224]
+	mov	rax, qword ptr [rsi + 232]
+	sete	byte ptr [rsp + 16]             # 1-byte Folded Spill
+	cmp	rax, qword ptr [rdx + 232]
+	mov	rax, qword ptr [rsi + 240]
+	sete	byte ptr [rsp + 18]             # 1-byte Folded Spill
+	cmp	rax, qword ptr [rdx + 240]
+	mov	rax, qword ptr [rsi + 248]
+	sete	byte ptr [rsp + 17]             # 1-byte Folded Spill
+	add	rsi, 256
+	cmp	rax, qword ptr [rdx + 248]
+	sete	dil
+	movzx	eax, byte ptr [rsp + 32]        # 1-byte Folded Reload
+	add	al, al
+	add	al, byte ptr [rsp + 40]         # 1-byte Folded Reload
+	mov	ecx, eax
+	movzx	eax, byte ptr [rsp + 4]         # 1-byte Folded Reload
+	shl	al, 6
+	shl	r13b, 7
+	or	r13b, al
+	movzx	eax, byte ptr [rsp + 20]        # 1-byte Folded Reload
+	shl	al, 2
+	or	al, cl
+	add	r8b, r8b
+	add	r8b, byte ptr [rsp + 9]         # 1-byte Folded Reload
+	movzx	ecx, byte ptr [rsp + 21]        # 1-byte Folded Reload
+	shl	cl, 3
+	or	cl, al
+	mov	eax, ecx
+	shl	r11b, 2
+	or	r11b, r8b
+	movzx	ecx, byte ptr [rsp + 22]        # 1-byte Folded Reload
+	shl	cl, 4
+	or	cl, al
+	mov	r8d, ecx
+	shl	r15b, 3
+	or	r15b, r11b
+	movzx	ecx, byte ptr [rsp + 23]        # 1-byte Folded Reload
+	shl	cl, 5
+	or	cl, r8b
+	movzx	eax, byte ptr [rsp + 5]         # 1-byte Folded Reload
+	shl	al, 4
+	or	al, r15b
+	mov	r8d, eax
+	movzx	eax, byte ptr [rsp + 6]         # 1-byte Folded Reload
+	shl	al, 5
+	or	al, r8b
+	movzx	r8d, byte ptr [rsp + 7]         # 1-byte Folded Reload
+	shl	r8b, 6
+	shl	bl, 7
+	or	bl, r8b
+	or	r13b, cl
+	or	bl, al
+	add	r10b, r10b
+	add	r10b, byte ptr [rsp + 10]       # 1-byte Folded Reload
+	shl	r14b, 2
+	or	r14b, r10b
+	shl	r12b, 3
+	or	r12b, r14b
+	movzx	eax, byte ptr [rsp + 8]         # 1-byte Folded Reload
+	shl	al, 4
+	or	al, r12b
+	mov	ecx, eax
+	mov	r14, qword ptr [rsp + 48]       # 8-byte Reload
+	movzx	eax, byte ptr [rsp + 11]        # 1-byte Folded Reload
+	shl	al, 5
+	or	al, cl
+	mov	byte ptr [r14], r13b
+	movzx	ecx, byte ptr [rsp + 12]        # 1-byte Folded Reload
+	shl	cl, 6
+	shl	r9b, 7
+	or	r9b, cl
+	mov	byte ptr [r14 + 1], bl
+	or	r9b, al
+	movzx	eax, byte ptr [rsp + 13]        # 1-byte Folded Reload
+	add	al, al
+	add	al, byte ptr [rsp + 19]         # 1-byte Folded Reload
+	mov	ecx, eax
+	movzx	eax, byte ptr [rsp + 14]        # 1-byte Folded Reload
+	shl	al, 2
+	or	al, cl
+	mov	ecx, eax
+	movzx	eax, byte ptr [rsp + 15]        # 1-byte Folded Reload
+	shl	al, 3
+	or	al, cl
+	mov	ecx, eax
+	movzx	eax, byte ptr [rsp + 16]        # 1-byte Folded Reload
+	shl	al, 4
+	or	al, cl
+	mov	ecx, eax
+	movzx	eax, byte ptr [rsp + 18]        # 1-byte Folded Reload
+	shl	al, 5
+	or	al, cl
+	movzx	ecx, byte ptr [rsp + 17]        # 1-byte Folded Reload
+	shl	cl, 6
+	shl	dil, 7
+	or	dil, cl
+	or	dil, al
+	mov	byte ptr [r14 + 2], r9b
+	mov	byte ptr [r14 + 3], dil
+	add	rdx, 256
+	add	r14, 4
+	add	qword ptr [rsp + 56], -1        # 8-byte Folded Spill
+	jne	.LBB0_38
+# %bb.39:
+	mov	r11, qword ptr [rsp + 24]       # 8-byte Reload
+	mov	r15, qword ptr [rsp + 64]       # 8-byte Reload
+.LBB0_40:
+	shl	r15, 5
+	cmp	r15, r11
+	jge	.LBB0_123
+# %bb.41:
+	sub	r11, r15
+	xor	ecx, ecx
+	.p2align	4, 0x90
+.LBB0_42:                               # =>This Inner Loop Header: Depth=1
+	lea	r8, [rcx + 1]
+	mov	rdi, qword ptr [rsi + 8*rcx]
+	cmp	rdi, qword ptr [rdx + 8*rcx]
+	sete	bl
+	neg	bl
+	mov	rdi, rcx
+	shr	rdi, 3
+	movzx	r9d, byte ptr [r14 + rdi]
+	xor	bl, r9b
+	and	cl, 7
+	mov	al, 1
+                                        # kill: def $cl killed $cl killed $rcx
+	shl	al, cl
+	and	al, bl
+	xor	al, r9b
+	mov	byte ptr [r14 + rdi], al
+	mov	rcx, r8
+	cmp	r11, r8
+	jne	.LBB0_42
+	jmp	.LBB0_123
+.LBB0_68:
+	lea	r15, [r11 + 31]
+	test	r11, r11
+	cmovns	r15, r11
+	lea	eax, [r9 + 7]
+	test	r9d, r9d
+	cmovns	eax, r9d
+	and	eax, -8
+	sub	r9d, eax
+	je	.LBB0_72
+# %bb.69:
+	movsxd	rax, r9d
+	.p2align	4, 0x90
+.LBB0_70:                               # =>This Inner Loop Header: Depth=1
+	movzx	ecx, word ptr [rsi]
+	add	rsi, 2
+	cmp	cx, word ptr [rdx]
+	lea	rdx, [rdx + 2]
+	sete	r10b
+	neg	r10b
+	lea	rdi, [rax + 7]
+	test	rax, rax
+	cmovns	rdi, rax
+	sar	rdi, 3
+	movzx	r8d, byte ptr [r14 + rdi]
+	xor	r10b, r8b
+	lea	r9d, [8*rdi]
+	mov	ecx, eax
+	sub	ecx, r9d
+	mov	ebx, 1
+                                        # kill: def $cl killed $cl killed $ecx
+	shl	ebx, cl
+	and	bl, r10b
+	xor	bl, r8b
+	mov	byte ptr [r14 + rdi], bl
+	add	rax, 1
+	cmp	rax, 8
+	jne	.LBB0_70
+# %bb.71:
+	add	r14, 1
+.LBB0_72:
+	sar	r15, 5
+	cmp	r11, 32
+	jl	.LBB0_76
+# %bb.73:
+	mov	qword ptr [rsp + 24], r11       # 8-byte Spill
+	mov	qword ptr [rsp + 64], r15       # 8-byte Spill
+	mov	qword ptr [rsp + 56], r15       # 8-byte Spill
+	.p2align	4, 0x90
+.LBB0_74:                               # =>This Inner Loop Header: Depth=1
+	mov	qword ptr [rsp + 48], r14       # 8-byte Spill
+	movzx	eax, word ptr [rsi]
+	movzx	ecx, word ptr [rsi + 2]
+	cmp	ax, word ptr [rdx]
+	sete	byte ptr [rsp + 40]             # 1-byte Folded Spill
+	cmp	cx, word ptr [rdx + 2]
+	sete	byte ptr [rsp + 32]             # 1-byte Folded Spill
+	movzx	eax, word ptr [rsi + 4]
+	cmp	ax, word ptr [rdx + 4]
+	sete	byte ptr [rsp + 20]             # 1-byte Folded Spill
+	movzx	eax, word ptr [rsi + 6]
+	cmp	ax, word ptr [rdx + 6]
+	sete	byte ptr [rsp + 21]             # 1-byte Folded Spill
+	movzx	eax, word ptr [rsi + 8]
+	cmp	ax, word ptr [rdx + 8]
+	sete	byte ptr [rsp + 22]             # 1-byte Folded Spill
+	movzx	eax, word ptr [rsi + 10]
+	cmp	ax, word ptr [rdx + 10]
+	sete	byte ptr [rsp + 23]             # 1-byte Folded Spill
+	movzx	eax, word ptr [rsi + 12]
+	cmp	ax, word ptr [rdx + 12]
+	sete	byte ptr [rsp + 4]              # 1-byte Folded Spill
+	movzx	eax, word ptr [rsi + 14]
+	cmp	ax, word ptr [rdx + 14]
+	sete	r13b
+	movzx	eax, word ptr [rsi + 16]
+	cmp	ax, word ptr [rdx + 16]
+	sete	byte ptr [rsp + 9]              # 1-byte Folded Spill
+	movzx	eax, word ptr [rsi + 18]
+	cmp	ax, word ptr [rdx + 18]
+	sete	r8b
+	movzx	eax, word ptr [rsi + 20]
+	cmp	ax, word ptr [rdx + 20]
+	sete	r11b
+	movzx	eax, word ptr [rsi + 22]
+	cmp	ax, word ptr [rdx + 22]
+	sete	r15b
+	movzx	eax, word ptr [rsi + 24]
+	cmp	ax, word ptr [rdx + 24]
+	sete	byte ptr [rsp + 5]              # 1-byte Folded Spill
+	movzx	eax, word ptr [rsi + 26]
+	cmp	ax, word ptr [rdx + 26]
+	sete	byte ptr [rsp + 6]              # 1-byte Folded Spill
+	movzx	eax, word ptr [rsi + 28]
+	cmp	ax, word ptr [rdx + 28]
+	sete	byte ptr [rsp + 7]              # 1-byte Folded Spill
+	movzx	eax, word ptr [rsi + 30]
+	cmp	ax, word ptr [rdx + 30]
+	sete	bl
+	movzx	eax, word ptr [rsi + 32]
+	movzx	ecx, word ptr [rsi + 34]
+	cmp	ax, word ptr [rdx + 32]
+	movzx	eax, word ptr [rsi + 36]
+	sete	byte ptr [rsp + 10]             # 1-byte Folded Spill
+	cmp	cx, word ptr [rdx + 34]
+	movzx	ecx, word ptr [rsi + 38]
+	sete	r10b
+	cmp	ax, word ptr [rdx + 36]
+	movzx	eax, word ptr [rsi + 40]
+	sete	r14b
+	cmp	cx, word ptr [rdx + 38]
+	movzx	ecx, word ptr [rsi + 42]
+	sete	r12b
+	cmp	ax, word ptr [rdx + 40]
+	sete	byte ptr [rsp + 8]              # 1-byte Folded Spill
+	cmp	cx, word ptr [rdx + 42]
+	movzx	eax, word ptr [rsi + 44]
+	sete	byte ptr [rsp + 11]             # 1-byte Folded Spill
+	cmp	ax, word ptr [rdx + 44]
+	movzx	eax, word ptr [rsi + 46]
+	sete	byte ptr [rsp + 12]             # 1-byte Folded Spill
+	cmp	ax, word ptr [rdx + 46]
+	movzx	eax, word ptr [rsi + 48]
+	sete	r9b
+	cmp	ax, word ptr [rdx + 48]
+	movzx	eax, word ptr [rsi + 50]
+	sete	byte ptr [rsp + 19]             # 1-byte Folded Spill
+	cmp	ax, word ptr [rdx + 50]
+	movzx	eax, word ptr [rsi + 52]
+	sete	byte ptr [rsp + 13]             # 1-byte Folded Spill
+	cmp	ax, word ptr [rdx + 52]
+	movzx	eax, word ptr [rsi + 54]
+	sete	byte ptr [rsp + 14]             # 1-byte Folded Spill
+	cmp	ax, word ptr [rdx + 54]
+	movzx	eax, word ptr [rsi + 56]
+	sete	byte ptr [rsp + 15]             # 1-byte Folded Spill
+	cmp	ax, word ptr [rdx + 56]
+	movzx	eax, word ptr [rsi + 58]
+	sete	byte ptr [rsp + 16]             # 1-byte Folded Spill
+	cmp	ax, word ptr [rdx + 58]
+	movzx	eax, word ptr [rsi + 60]
+	sete	byte ptr [rsp + 18]             # 1-byte Folded Spill
+	cmp	ax, word ptr [rdx + 60]
+	movzx	eax, word ptr [rsi + 62]
+	sete	byte ptr [rsp + 17]             # 1-byte Folded Spill
+	add	rsi, 64
+	cmp	ax, word ptr [rdx + 62]
+	sete	dil
+	movzx	eax, byte ptr [rsp + 32]        # 1-byte Folded Reload
+	add	al, al
+	add	al, byte ptr [rsp + 40]         # 1-byte Folded Reload
+	mov	ecx, eax
+	movzx	eax, byte ptr [rsp + 4]         # 1-byte Folded Reload
+	shl	al, 6
+	shl	r13b, 7
+	or	r13b, al
+	movzx	eax, byte ptr [rsp + 20]        # 1-byte Folded Reload
+	shl	al, 2
+	or	al, cl
+	add	r8b, r8b
+	add	r8b, byte ptr [rsp + 9]         # 1-byte Folded Reload
+	movzx	ecx, byte ptr [rsp + 21]        # 1-byte Folded Reload
+	shl	cl, 3
+	or	cl, al
+	mov	eax, ecx
+	shl	r11b, 2
+	or	r11b, r8b
+	movzx	ecx, byte ptr [rsp + 22]        # 1-byte Folded Reload
+	shl	cl, 4
+	or	cl, al
+	mov	r8d, ecx
+	shl	r15b, 3
+	or	r15b, r11b
+	movzx	ecx, byte ptr [rsp + 23]        # 1-byte Folded Reload
+	shl	cl, 5
+	or	cl, r8b
+	movzx	eax, byte ptr [rsp + 5]         # 1-byte Folded Reload
+	shl	al, 4
+	or	al, r15b
+	mov	r8d, eax
+	movzx	eax, byte ptr [rsp + 6]         # 1-byte Folded Reload
+	shl	al, 5
+	or	al, r8b
+	movzx	r8d, byte ptr [rsp + 7]         # 1-byte Folded Reload
+	shl	r8b, 6
+	shl	bl, 7
+	or	bl, r8b
+	or	r13b, cl
+	or	bl, al
+	add	r10b, r10b
+	add	r10b, byte ptr [rsp + 10]       # 1-byte Folded Reload
+	shl	r14b, 2
+	or	r14b, r10b
+	shl	r12b, 3
+	or	r12b, r14b
+	movzx	eax, byte ptr [rsp + 8]         # 1-byte Folded Reload
+	shl	al, 4
+	or	al, r12b
+	mov	ecx, eax
+	mov	r14, qword ptr [rsp + 48]       # 8-byte Reload
+	movzx	eax, byte ptr [rsp + 11]        # 1-byte Folded Reload
+	shl	al, 5
+	or	al, cl
+	mov	byte ptr [r14], r13b
+	movzx	ecx, byte ptr [rsp + 12]        # 1-byte Folded Reload
+	shl	cl, 6
+	shl	r9b, 7
+	or	r9b, cl
+	mov	byte ptr [r14 + 1], bl
+	or	r9b, al
+	movzx	eax, byte ptr [rsp + 13]        # 1-byte Folded Reload
+	add	al, al
+	add	al, byte ptr [rsp + 19]         # 1-byte Folded Reload
+	mov	ecx, eax
+	movzx	eax, byte ptr [rsp + 14]        # 1-byte Folded Reload
+	shl	al, 2
+	or	al, cl
+	mov	ecx, eax
+	movzx	eax, byte ptr [rsp + 15]        # 1-byte Folded Reload
+	shl	al, 3
+	or	al, cl
+	mov	ecx, eax
+	movzx	eax, byte ptr [rsp + 16]        # 1-byte Folded Reload
+	shl	al, 4
+	or	al, cl
+	mov	ecx, eax
+	movzx	eax, byte ptr [rsp + 18]        # 1-byte Folded Reload
+	shl	al, 5
+	or	al, cl
+	movzx	ecx, byte ptr [rsp + 17]        # 1-byte Folded Reload
+	shl	cl, 6
+	shl	dil, 7
+	or	dil, cl
+	or	dil, al
+	mov	byte ptr [r14 + 2], r9b
+	mov	byte ptr [r14 + 3], dil
+	add	rdx, 64
+	add	r14, 4
+	add	qword ptr [rsp + 56], -1        # 8-byte Folded Spill
+	jne	.LBB0_74
+# %bb.75:
+	mov	r11, qword ptr [rsp + 24]       # 8-byte Reload
+	mov	r15, qword ptr [rsp + 64]       # 8-byte Reload
+.LBB0_76:
+	shl	r15, 5
+	cmp	r15, r11
+	jge	.LBB0_123
+# %bb.77:
+	sub	r11, r15
+	xor	ecx, ecx
+	.p2align	4, 0x90
+.LBB0_78:                               # =>This Inner Loop Header: Depth=1
+	lea	r8, [rcx + 1]
+	movzx	edi, word ptr [rsi + 2*rcx]
+	cmp	di, word ptr [rdx + 2*rcx]
+	sete	bl
+	neg	bl
+	mov	rdi, rcx
+	shr	rdi, 3
+	movzx	r9d, byte ptr [r14 + rdi]
+	xor	bl, r9b
+	and	cl, 7
+	mov	al, 1
+                                        # kill: def $cl killed $cl killed $rcx
+	shl	al, cl
+	and	al, bl
+	xor	al, r9b
+	mov	byte ptr [r14 + rdi], al
+	mov	rcx, r8
+	cmp	r11, r8
+	jne	.LBB0_78
+	jmp	.LBB0_123
+.LBB0_79:
+	lea	r15, [r11 + 31]
+	test	r11, r11
+	cmovns	r15, r11
+	lea	eax, [r9 + 7]
+	test	r9d, r9d
+	cmovns	eax, r9d
+	and	eax, -8
+	sub	r9d, eax
+	je	.LBB0_83
+# %bb.80:
+	movsxd	rax, r9d
+	.p2align	4, 0x90
+.LBB0_81:                               # =>This Inner Loop Header: Depth=1
+	movzx	ecx, word ptr [rsi]
+	add	rsi, 2
+	cmp	cx, word ptr [rdx]
+	lea	rdx, [rdx + 2]
+	sete	r10b
+	neg	r10b
+	lea	rdi, [rax + 7]
+	test	rax, rax
+	cmovns	rdi, rax
+	sar	rdi, 3
+	movzx	r8d, byte ptr [r14 + rdi]
+	xor	r10b, r8b
+	lea	r9d, [8*rdi]
+	mov	ecx, eax
+	sub	ecx, r9d
+	mov	ebx, 1
+                                        # kill: def $cl killed $cl killed $ecx
+	shl	ebx, cl
+	and	bl, r10b
+	xor	bl, r8b
+	mov	byte ptr [r14 + rdi], bl
+	add	rax, 1
+	cmp	rax, 8
+	jne	.LBB0_81
+# %bb.82:
+	add	r14, 1
+.LBB0_83:
+	sar	r15, 5
+	cmp	r11, 32
+	jl	.LBB0_87
+# %bb.84:
+	mov	qword ptr [rsp + 24], r11       # 8-byte Spill
+	mov	qword ptr [rsp + 64], r15       # 8-byte Spill
+	mov	qword ptr [rsp + 56], r15       # 8-byte Spill
+	.p2align	4, 0x90
+.LBB0_85:                               # =>This Inner Loop Header: Depth=1
+	mov	qword ptr [rsp + 48], r14       # 8-byte Spill
+	movzx	eax, word ptr [rsi]
+	movzx	ecx, word ptr [rsi + 2]
+	cmp	ax, word ptr [rdx]
+	sete	byte ptr [rsp + 40]             # 1-byte Folded Spill
+	cmp	cx, word ptr [rdx + 2]
+	sete	byte ptr [rsp + 32]             # 1-byte Folded Spill
+	movzx	eax, word ptr [rsi + 4]
+	cmp	ax, word ptr [rdx + 4]
+	sete	byte ptr [rsp + 20]             # 1-byte Folded Spill
+	movzx	eax, word ptr [rsi + 6]
+	cmp	ax, word ptr [rdx + 6]
+	sete	byte ptr [rsp + 21]             # 1-byte Folded Spill
+	movzx	eax, word ptr [rsi + 8]
+	cmp	ax, word ptr [rdx + 8]
+	sete	byte ptr [rsp + 22]             # 1-byte Folded Spill
+	movzx	eax, word ptr [rsi + 10]
+	cmp	ax, word ptr [rdx + 10]
+	sete	byte ptr [rsp + 23]             # 1-byte Folded Spill
+	movzx	eax, word ptr [rsi + 12]
+	cmp	ax, word ptr [rdx + 12]
+	sete	byte ptr [rsp + 4]              # 1-byte Folded Spill
+	movzx	eax, word ptr [rsi + 14]
+	cmp	ax, word ptr [rdx + 14]
+	sete	r13b
+	movzx	eax, word ptr [rsi + 16]
+	cmp	ax, word ptr [rdx + 16]
+	sete	byte ptr [rsp + 9]              # 1-byte Folded Spill
+	movzx	eax, word ptr [rsi + 18]
+	cmp	ax, word ptr [rdx + 18]
+	sete	r8b
+	movzx	eax, word ptr [rsi + 20]
+	cmp	ax, word ptr [rdx + 20]
+	sete	r11b
+	movzx	eax, word ptr [rsi + 22]
+	cmp	ax, word ptr [rdx + 22]
+	sete	r15b
+	movzx	eax, word ptr [rsi + 24]
+	cmp	ax, word ptr [rdx + 24]
+	sete	byte ptr [rsp + 5]              # 1-byte Folded Spill
+	movzx	eax, word ptr [rsi + 26]
+	cmp	ax, word ptr [rdx + 26]
+	sete	byte ptr [rsp + 6]              # 1-byte Folded Spill
+	movzx	eax, word ptr [rsi + 28]
+	cmp	ax, word ptr [rdx + 28]
+	sete	byte ptr [rsp + 7]              # 1-byte Folded Spill
+	movzx	eax, word ptr [rsi + 30]
+	cmp	ax, word ptr [rdx + 30]
+	sete	bl
+	movzx	eax, word ptr [rsi + 32]
+	movzx	ecx, word ptr [rsi + 34]
+	cmp	ax, word ptr [rdx + 32]
+	movzx	eax, word ptr [rsi + 36]
+	sete	byte ptr [rsp + 10]             # 1-byte Folded Spill
+	cmp	cx, word ptr [rdx + 34]
+	movzx	ecx, word ptr [rsi + 38]
+	sete	r10b
+	cmp	ax, word ptr [rdx + 36]
+	movzx	eax, word ptr [rsi + 40]
+	sete	r14b
+	cmp	cx, word ptr [rdx + 38]
+	movzx	ecx, word ptr [rsi + 42]
+	sete	r12b
+	cmp	ax, word ptr [rdx + 40]
+	sete	byte ptr [rsp + 8]              # 1-byte Folded Spill
+	cmp	cx, word ptr [rdx + 42]
+	movzx	eax, word ptr [rsi + 44]
+	sete	byte ptr [rsp + 11]             # 1-byte Folded Spill
+	cmp	ax, word ptr [rdx + 44]
+	movzx	eax, word ptr [rsi + 46]
+	sete	byte ptr [rsp + 12]             # 1-byte Folded Spill
+	cmp	ax, word ptr [rdx + 46]
+	movzx	eax, word ptr [rsi + 48]
+	sete	r9b
+	cmp	ax, word ptr [rdx + 48]
+	movzx	eax, word ptr [rsi + 50]
+	sete	byte ptr [rsp + 19]             # 1-byte Folded Spill
+	cmp	ax, word ptr [rdx + 50]
+	movzx	eax, word ptr [rsi + 52]
+	sete	byte ptr [rsp + 13]             # 1-byte Folded Spill
+	cmp	ax, word ptr [rdx + 52]
+	movzx	eax, word ptr [rsi + 54]
+	sete	byte ptr [rsp + 14]             # 1-byte Folded Spill
+	cmp	ax, word ptr [rdx + 54]
+	movzx	eax, word ptr [rsi + 56]
+	sete	byte ptr [rsp + 15]             # 1-byte Folded Spill
+	cmp	ax, word ptr [rdx + 56]
+	movzx	eax, word ptr [rsi + 58]
+	sete	byte ptr [rsp + 16]             # 1-byte Folded Spill
+	cmp	ax, word ptr [rdx + 58]
+	movzx	eax, word ptr [rsi + 60]
+	sete	byte ptr [rsp + 18]             # 1-byte Folded Spill
+	cmp	ax, word ptr [rdx + 60]
+	movzx	eax, word ptr [rsi + 62]
+	sete	byte ptr [rsp + 17]             # 1-byte Folded Spill
+	add	rsi, 64
+	cmp	ax, word ptr [rdx + 62]
+	sete	dil
+	movzx	eax, byte ptr [rsp + 32]        # 1-byte Folded Reload
+	add	al, al
+	add	al, byte ptr [rsp + 40]         # 1-byte Folded Reload
+	mov	ecx, eax
+	movzx	eax, byte ptr [rsp + 4]         # 1-byte Folded Reload
+	shl	al, 6
+	shl	r13b, 7
+	or	r13b, al
+	movzx	eax, byte ptr [rsp + 20]        # 1-byte Folded Reload
+	shl	al, 2
+	or	al, cl
+	add	r8b, r8b
+	add	r8b, byte ptr [rsp + 9]         # 1-byte Folded Reload
+	movzx	ecx, byte ptr [rsp + 21]        # 1-byte Folded Reload
+	shl	cl, 3
+	or	cl, al
+	mov	eax, ecx
+	shl	r11b, 2
+	or	r11b, r8b
+	movzx	ecx, byte ptr [rsp + 22]        # 1-byte Folded Reload
+	shl	cl, 4
+	or	cl, al
+	mov	r8d, ecx
+	shl	r15b, 3
+	or	r15b, r11b
+	movzx	ecx, byte ptr [rsp + 23]        # 1-byte Folded Reload
+	shl	cl, 5
+	or	cl, r8b
+	movzx	eax, byte ptr [rsp + 5]         # 1-byte Folded Reload
+	shl	al, 4
+	or	al, r15b
+	mov	r8d, eax
+	movzx	eax, byte ptr [rsp + 6]         # 1-byte Folded Reload
+	shl	al, 5
+	or	al, r8b
+	movzx	r8d, byte ptr [rsp + 7]         # 1-byte Folded Reload
+	shl	r8b, 6
+	shl	bl, 7
+	or	bl, r8b
+	or	r13b, cl
+	or	bl, al
+	add	r10b, r10b
+	add	r10b, byte ptr [rsp + 10]       # 1-byte Folded Reload
+	shl	r14b, 2
+	or	r14b, r10b
+	shl	r12b, 3
+	or	r12b, r14b
+	movzx	eax, byte ptr [rsp + 8]         # 1-byte Folded Reload
+	shl	al, 4
+	or	al, r12b
+	mov	ecx, eax
+	mov	r14, qword ptr [rsp + 48]       # 8-byte Reload
+	movzx	eax, byte ptr [rsp + 11]        # 1-byte Folded Reload
+	shl	al, 5
+	or	al, cl
+	mov	byte ptr [r14], r13b
+	movzx	ecx, byte ptr [rsp + 12]        # 1-byte Folded Reload
+	shl	cl, 6
+	shl	r9b, 7
+	or	r9b, cl
+	mov	byte ptr [r14 + 1], bl
+	or	r9b, al
+	movzx	eax, byte ptr [rsp + 13]        # 1-byte Folded Reload
+	add	al, al
+	add	al, byte ptr [rsp + 19]         # 1-byte Folded Reload
+	mov	ecx, eax
+	movzx	eax, byte ptr [rsp + 14]        # 1-byte Folded Reload
+	shl	al, 2
+	or	al, cl
+	mov	ecx, eax
+	movzx	eax, byte ptr [rsp + 15]        # 1-byte Folded Reload
+	shl	al, 3
+	or	al, cl
+	mov	ecx, eax
+	movzx	eax, byte ptr [rsp + 16]        # 1-byte Folded Reload
+	shl	al, 4
+	or	al, cl
+	mov	ecx, eax
+	movzx	eax, byte ptr [rsp + 18]        # 1-byte Folded Reload
+	shl	al, 5
+	or	al, cl
+	movzx	ecx, byte ptr [rsp + 17]        # 1-byte Folded Reload
+	shl	cl, 6
+	shl	dil, 7
+	or	dil, cl
+	or	dil, al
+	mov	byte ptr [r14 + 2], r9b
+	mov	byte ptr [r14 + 3], dil
+	add	rdx, 64
+	add	r14, 4
+	add	qword ptr [rsp + 56], -1        # 8-byte Folded Spill
+	jne	.LBB0_85
+# %bb.86:
+	mov	r11, qword ptr [rsp + 24]       # 8-byte Reload
+	mov	r15, qword ptr [rsp + 64]       # 8-byte Reload
+.LBB0_87:
+	shl	r15, 5
+	cmp	r15, r11
+	jge	.LBB0_123
+# %bb.88:
+	sub	r11, r15
+	xor	ecx, ecx
+	.p2align	4, 0x90
+.LBB0_89:                               # =>This Inner Loop Header: Depth=1
+	lea	r8, [rcx + 1]
+	movzx	edi, word ptr [rsi + 2*rcx]
+	cmp	di, word ptr [rdx + 2*rcx]
+	sete	bl
+	neg	bl
+	mov	rdi, rcx
+	shr	rdi, 3
+	movzx	r9d, byte ptr [r14 + rdi]
+	xor	bl, r9b
+	and	cl, 7
+	mov	al, 1
+                                        # kill: def $cl killed $cl killed $rcx
+	shl	al, cl
+	and	al, bl
+	xor	al, r9b
+	mov	byte ptr [r14 + rdi], al
+	mov	rcx, r8
+	cmp	r11, r8
+	jne	.LBB0_89
+	jmp	.LBB0_123
+.LBB0_101:
+	lea	r15, [r11 + 31]
+	test	r11, r11
+	cmovns	r15, r11
+	lea	eax, [r9 + 7]
+	test	r9d, r9d
+	cmovns	eax, r9d
+	and	eax, -8
+	sub	r9d, eax
+	je	.LBB0_105
+# %bb.102:
+	movsxd	rax, r9d
+	.p2align	4, 0x90
+.LBB0_103:                              # =>This Inner Loop Header: Depth=1
+	mov	rcx, qword ptr [rsi]
+	add	rsi, 8
+	cmp	rcx, qword ptr [rdx]
+	lea	rdx, [rdx + 8]
+	sete	r10b
+	neg	r10b
+	lea	rdi, [rax + 7]
+	test	rax, rax
+	cmovns	rdi, rax
+	sar	rdi, 3
+	movzx	r8d, byte ptr [r14 + rdi]
+	xor	r10b, r8b
+	lea	r9d, [8*rdi]
+	mov	ecx, eax
+	sub	ecx, r9d
+	mov	ebx, 1
+                                        # kill: def $cl killed $cl killed $ecx
+	shl	ebx, cl
+	and	bl, r10b
+	xor	bl, r8b
+	mov	byte ptr [r14 + rdi], bl
+	add	rax, 1
+	cmp	rax, 8
+	jne	.LBB0_103
+# %bb.104:
+	add	r14, 1
+.LBB0_105:
+	sar	r15, 5
+	cmp	r11, 32
+	jl	.LBB0_109
+# %bb.106:
+	mov	qword ptr [rsp + 24], r11       # 8-byte Spill
+	mov	qword ptr [rsp + 64], r15       # 8-byte Spill
+	mov	qword ptr [rsp + 56], r15       # 8-byte Spill
+	.p2align	4, 0x90
+.LBB0_107:                              # =>This Inner Loop Header: Depth=1
+	mov	qword ptr [rsp + 48], r14       # 8-byte Spill
+	mov	rax, qword ptr [rsi]
+	mov	rcx, qword ptr [rsi + 8]
+	cmp	rax, qword ptr [rdx]
+	sete	byte ptr [rsp + 40]             # 1-byte Folded Spill
+	cmp	rcx, qword ptr [rdx + 8]
+	sete	byte ptr [rsp + 32]             # 1-byte Folded Spill
+	mov	rax, qword ptr [rsi + 16]
+	cmp	rax, qword ptr [rdx + 16]
+	sete	byte ptr [rsp + 20]             # 1-byte Folded Spill
+	mov	rax, qword ptr [rsi + 24]
+	cmp	rax, qword ptr [rdx + 24]
+	sete	byte ptr [rsp + 21]             # 1-byte Folded Spill
+	mov	rax, qword ptr [rsi + 32]
+	cmp	rax, qword ptr [rdx + 32]
+	sete	byte ptr [rsp + 22]             # 1-byte Folded Spill
+	mov	rax, qword ptr [rsi + 40]
+	cmp	rax, qword ptr [rdx + 40]
+	sete	byte ptr [rsp + 23]             # 1-byte Folded Spill
+	mov	rax, qword ptr [rsi + 48]
+	cmp	rax, qword ptr [rdx + 48]
+	sete	byte ptr [rsp + 4]              # 1-byte Folded Spill
+	mov	rax, qword ptr [rsi + 56]
+	cmp	rax, qword ptr [rdx + 56]
+	sete	r13b
+	mov	rax, qword ptr [rsi + 64]
+	cmp	rax, qword ptr [rdx + 64]
+	sete	byte ptr [rsp + 9]              # 1-byte Folded Spill
+	mov	rax, qword ptr [rsi + 72]
+	cmp	rax, qword ptr [rdx + 72]
+	sete	r8b
+	mov	rax, qword ptr [rsi + 80]
+	cmp	rax, qword ptr [rdx + 80]
+	sete	r11b
+	mov	rax, qword ptr [rsi + 88]
+	cmp	rax, qword ptr [rdx + 88]
+	sete	r15b
+	mov	rax, qword ptr [rsi + 96]
+	cmp	rax, qword ptr [rdx + 96]
+	sete	byte ptr [rsp + 5]              # 1-byte Folded Spill
+	mov	rax, qword ptr [rsi + 104]
+	cmp	rax, qword ptr [rdx + 104]
+	sete	byte ptr [rsp + 6]              # 1-byte Folded Spill
+	mov	rax, qword ptr [rsi + 112]
+	cmp	rax, qword ptr [rdx + 112]
+	sete	byte ptr [rsp + 7]              # 1-byte Folded Spill
+	mov	rax, qword ptr [rsi + 120]
+	cmp	rax, qword ptr [rdx + 120]
+	sete	bl
+	mov	rax, qword ptr [rsi + 128]
+	mov	rcx, qword ptr [rsi + 136]
+	cmp	rax, qword ptr [rdx + 128]
+	mov	rax, qword ptr [rsi + 144]
+	sete	byte ptr [rsp + 10]             # 1-byte Folded Spill
+	cmp	rcx, qword ptr [rdx + 136]
+	mov	rcx, qword ptr [rsi + 152]
+	sete	r10b
+	cmp	rax, qword ptr [rdx + 144]
+	mov	rax, qword ptr [rsi + 160]
+	sete	r14b
+	cmp	rcx, qword ptr [rdx + 152]
+	mov	rcx, qword ptr [rsi + 168]
+	sete	r12b
+	cmp	rax, qword ptr [rdx + 160]
+	sete	byte ptr [rsp + 8]              # 1-byte Folded Spill
+	cmp	rcx, qword ptr [rdx + 168]
+	mov	rax, qword ptr [rsi + 176]
+	sete	byte ptr [rsp + 11]             # 1-byte Folded Spill
+	cmp	rax, qword ptr [rdx + 176]
+	mov	rax, qword ptr [rsi + 184]
+	sete	byte ptr [rsp + 12]             # 1-byte Folded Spill
+	cmp	rax, qword ptr [rdx + 184]
+	mov	rax, qword ptr [rsi + 192]
+	sete	r9b
+	cmp	rax, qword ptr [rdx + 192]
+	mov	rax, qword ptr [rsi + 200]
+	sete	byte ptr [rsp + 19]             # 1-byte Folded Spill
+	cmp	rax, qword ptr [rdx + 200]
+	mov	rax, qword ptr [rsi + 208]
+	sete	byte ptr [rsp + 13]             # 1-byte Folded Spill
+	cmp	rax, qword ptr [rdx + 208]
+	mov	rax, qword ptr [rsi + 216]
+	sete	byte ptr [rsp + 14]             # 1-byte Folded Spill
+	cmp	rax, qword ptr [rdx + 216]
+	mov	rax, qword ptr [rsi + 224]
+	sete	byte ptr [rsp + 15]             # 1-byte Folded Spill
+	cmp	rax, qword ptr [rdx + 224]
+	mov	rax, qword ptr [rsi + 232]
+	sete	byte ptr [rsp + 16]             # 1-byte Folded Spill
+	cmp	rax, qword ptr [rdx + 232]
+	mov	rax, qword ptr [rsi + 240]
+	sete	byte ptr [rsp + 18]             # 1-byte Folded Spill
+	cmp	rax, qword ptr [rdx + 240]
+	mov	rax, qword ptr [rsi + 248]
+	sete	byte ptr [rsp + 17]             # 1-byte Folded Spill
+	add	rsi, 256
+	cmp	rax, qword ptr [rdx + 248]
+	sete	dil
+	movzx	eax, byte ptr [rsp + 32]        # 1-byte Folded Reload
+	add	al, al
+	add	al, byte ptr [rsp + 40]         # 1-byte Folded Reload
+	mov	ecx, eax
+	movzx	eax, byte ptr [rsp + 4]         # 1-byte Folded Reload
+	shl	al, 6
+	shl	r13b, 7
+	or	r13b, al
+	movzx	eax, byte ptr [rsp + 20]        # 1-byte Folded Reload
+	shl	al, 2
+	or	al, cl
+	add	r8b, r8b
+	add	r8b, byte ptr [rsp + 9]         # 1-byte Folded Reload
+	movzx	ecx, byte ptr [rsp + 21]        # 1-byte Folded Reload
+	shl	cl, 3
+	or	cl, al
+	mov	eax, ecx
+	shl	r11b, 2
+	or	r11b, r8b
+	movzx	ecx, byte ptr [rsp + 22]        # 1-byte Folded Reload
+	shl	cl, 4
+	or	cl, al
+	mov	r8d, ecx
+	shl	r15b, 3
+	or	r15b, r11b
+	movzx	ecx, byte ptr [rsp + 23]        # 1-byte Folded Reload
+	shl	cl, 5
+	or	cl, r8b
+	movzx	eax, byte ptr [rsp + 5]         # 1-byte Folded Reload
+	shl	al, 4
+	or	al, r15b
+	mov	r8d, eax
+	movzx	eax, byte ptr [rsp + 6]         # 1-byte Folded Reload
+	shl	al, 5
+	or	al, r8b
+	movzx	r8d, byte ptr [rsp + 7]         # 1-byte Folded Reload
+	shl	r8b, 6
+	shl	bl, 7
+	or	bl, r8b
+	or	r13b, cl
+	or	bl, al
+	add	r10b, r10b
+	add	r10b, byte ptr [rsp + 10]       # 1-byte Folded Reload
+	shl	r14b, 2
+	or	r14b, r10b
+	shl	r12b, 3
+	or	r12b, r14b
+	movzx	eax, byte ptr [rsp + 8]         # 1-byte Folded Reload
+	shl	al, 4
+	or	al, r12b
+	mov	ecx, eax
+	mov	r14, qword ptr [rsp + 48]       # 8-byte Reload
+	movzx	eax, byte ptr [rsp + 11]        # 1-byte Folded Reload
+	shl	al, 5
+	or	al, cl
+	mov	byte ptr [r14], r13b
+	movzx	ecx, byte ptr [rsp + 12]        # 1-byte Folded Reload
+	shl	cl, 6
+	shl	r9b, 7
+	or	r9b, cl
+	mov	byte ptr [r14 + 1], bl
+	or	r9b, al
+	movzx	eax, byte ptr [rsp + 13]        # 1-byte Folded Reload
+	add	al, al
+	add	al, byte ptr [rsp + 19]         # 1-byte Folded Reload
+	mov	ecx, eax
+	movzx	eax, byte ptr [rsp + 14]        # 1-byte Folded Reload
+	shl	al, 2
+	or	al, cl
+	mov	ecx, eax
+	movzx	eax, byte ptr [rsp + 15]        # 1-byte Folded Reload
+	shl	al, 3
+	or	al, cl
+	mov	ecx, eax
+	movzx	eax, byte ptr [rsp + 16]        # 1-byte Folded Reload
+	shl	al, 4
+	or	al, cl
+	mov	ecx, eax
+	movzx	eax, byte ptr [rsp + 18]        # 1-byte Folded Reload
+	shl	al, 5
+	or	al, cl
+	movzx	ecx, byte ptr [rsp + 17]        # 1-byte Folded Reload
+	shl	cl, 6
+	shl	dil, 7
+	or	dil, cl
+	or	dil, al
+	mov	byte ptr [r14 + 2], r9b
+	mov	byte ptr [r14 + 3], dil
+	add	rdx, 256
+	add	r14, 4
+	add	qword ptr [rsp + 56], -1        # 8-byte Folded Spill
+	jne	.LBB0_107
+# %bb.108:
+	mov	r11, qword ptr [rsp + 24]       # 8-byte Reload
+	mov	r15, qword ptr [rsp + 64]       # 8-byte Reload
+.LBB0_109:
+	shl	r15, 5
+	cmp	r15, r11
+	jge	.LBB0_123
+# %bb.110:
+	sub	r11, r15
+	xor	ecx, ecx
+	.p2align	4, 0x90
+.LBB0_111:                              # =>This Inner Loop Header: Depth=1
+	lea	r8, [rcx + 1]
+	mov	rdi, qword ptr [rsi + 8*rcx]
+	cmp	rdi, qword ptr [rdx + 8*rcx]
+	sete	bl
+	neg	bl
+	mov	rdi, rcx
+	shr	rdi, 3
+	movzx	r9d, byte ptr [r14 + rdi]
+	xor	bl, r9b
+	and	cl, 7
+	mov	al, 1
+                                        # kill: def $cl killed $cl killed $rcx
+	shl	al, cl
+	and	al, bl
+	xor	al, r9b
+	mov	byte ptr [r14 + rdi], al
+	mov	rcx, r8
+	cmp	r11, r8
+	jne	.LBB0_111
+	jmp	.LBB0_123
+.LBB0_112:
+	lea	r15, [r11 + 31]
+	test	r11, r11
+	cmovns	r15, r11
+	lea	eax, [r9 + 7]
+	test	r9d, r9d
+	cmovns	eax, r9d
+	and	eax, -8
+	sub	r9d, eax
+	je	.LBB0_116
+# %bb.113:
+	movsxd	rax, r9d
+	.p2align	4, 0x90
+.LBB0_114:                              # =>This Inner Loop Header: Depth=1
+	vmovss	xmm0, dword ptr [rsi]           # xmm0 = mem[0],zero,zero,zero
+	add	rsi, 4
+	vucomiss	xmm0, dword ptr [rdx]
+	lea	rdx, [rdx + 4]
+	sete	r10b
+	neg	r10b
+	lea	rdi, [rax + 7]
+	test	rax, rax
+	cmovns	rdi, rax
+	sar	rdi, 3
+	movzx	r8d, byte ptr [r14 + rdi]
+	xor	r10b, r8b
+	lea	r9d, [8*rdi]
+	mov	ecx, eax
+	sub	ecx, r9d
+	mov	ebx, 1
+                                        # kill: def $cl killed $cl killed $ecx
+	shl	ebx, cl
+	and	bl, r10b
+	xor	bl, r8b
+	mov	byte ptr [r14 + rdi], bl
+	add	rax, 1
+	cmp	rax, 8
+	jne	.LBB0_114
+# %bb.115:
+	add	r14, 1
+.LBB0_116:
+	sar	r15, 5
+	cmp	r11, 32
+	jl	.LBB0_120
+# %bb.117:
+	mov	qword ptr [rsp + 24], r11       # 8-byte Spill
+	mov	qword ptr [rsp + 32], r15       # 8-byte Spill
+	mov	qword ptr [rsp + 40], r15       # 8-byte Spill
+	.p2align	4, 0x90
+.LBB0_118:                              # =>This Inner Loop Header: Depth=1
+	mov	qword ptr [rsp + 48], r14       # 8-byte Spill
+	vmovss	xmm0, dword ptr [rsi]           # xmm0 = mem[0],zero,zero,zero
+	vmovss	xmm1, dword ptr [rsi + 4]       # xmm1 = mem[0],zero,zero,zero
+	vucomiss	xmm0, dword ptr [rdx]
+	sete	byte ptr [rsp + 4]              # 1-byte Folded Spill
+	vucomiss	xmm1, dword ptr [rdx + 4]
+	sete	al
+	vmovss	xmm0, dword ptr [rsi + 8]       # xmm0 = mem[0],zero,zero,zero
+	vucomiss	xmm0, dword ptr [rdx + 8]
+	vmovss	xmm0, dword ptr [rsi + 12]      # xmm0 = mem[0],zero,zero,zero
+	sete	byte ptr [rsp + 5]              # 1-byte Folded Spill
+	vucomiss	xmm0, dword ptr [rdx + 12]
+	sete	byte ptr [rsp + 22]             # 1-byte Folded Spill
+	vmovss	xmm0, dword ptr [rsi + 16]      # xmm0 = mem[0],zero,zero,zero
+	vucomiss	xmm0, dword ptr [rdx + 16]
+	vmovss	xmm0, dword ptr [rsi + 20]      # xmm0 = mem[0],zero,zero,zero
+	sete	byte ptr [rsp + 21]             # 1-byte Folded Spill
+	vucomiss	xmm0, dword ptr [rdx + 20]
+	sete	byte ptr [rsp + 23]             # 1-byte Folded Spill
+	vmovss	xmm0, dword ptr [rsi + 24]      # xmm0 = mem[0],zero,zero,zero
+	vucomiss	xmm0, dword ptr [rdx + 24]
+	vmovss	xmm0, dword ptr [rsi + 28]      # xmm0 = mem[0],zero,zero,zero
+	sete	r13b
+	vucomiss	xmm0, dword ptr [rdx + 28]
+	sete	r15b
+	vmovss	xmm0, dword ptr [rsi + 32]      # xmm0 = mem[0],zero,zero,zero
+	vucomiss	xmm0, dword ptr [rdx + 32]
+	vmovss	xmm0, dword ptr [rsi + 36]      # xmm0 = mem[0],zero,zero,zero
+	sete	byte ptr [rsp + 8]              # 1-byte Folded Spill
+	vucomiss	xmm0, dword ptr [rdx + 36]
+	sete	cl
+	vmovss	xmm0, dword ptr [rsi + 40]      # xmm0 = mem[0],zero,zero,zero
+	vucomiss	xmm0, dword ptr [rdx + 40]
+	vmovss	xmm0, dword ptr [rsi + 44]      # xmm0 = mem[0],zero,zero,zero
+	sete	r9b
+	vucomiss	xmm0, dword ptr [rdx + 44]
+	sete	r11b
+	vmovss	xmm0, dword ptr [rsi + 48]      # xmm0 = mem[0],zero,zero,zero
+	vucomiss	xmm0, dword ptr [rdx + 48]
+	vmovss	xmm0, dword ptr [rsi + 52]      # xmm0 = mem[0],zero,zero,zero
+	sete	r10b
+	vucomiss	xmm0, dword ptr [rdx + 52]
+	sete	byte ptr [rsp + 7]              # 1-byte Folded Spill
+	vmovss	xmm0, dword ptr [rsi + 56]      # xmm0 = mem[0],zero,zero,zero
+	vucomiss	xmm0, dword ptr [rdx + 56]
+	vmovss	xmm0, dword ptr [rsi + 60]      # xmm0 = mem[0],zero,zero,zero
+	sete	byte ptr [rsp + 6]              # 1-byte Folded Spill
+	vucomiss	xmm0, dword ptr [rdx + 60]
+	sete	bl
+	vmovss	xmm0, dword ptr [rsi + 64]      # xmm0 = mem[0],zero,zero,zero
+	vucomiss	xmm0, dword ptr [rdx + 64]
+	vmovss	xmm0, dword ptr [rsi + 68]      # xmm0 = mem[0],zero,zero,zero
+	sete	byte ptr [rsp + 14]             # 1-byte Folded Spill
+	vucomiss	xmm0, dword ptr [rdx + 68]
+	vmovss	xmm0, dword ptr [rsi + 72]      # xmm0 = mem[0],zero,zero,zero
+	sete	r14b
+	vucomiss	xmm0, dword ptr [rdx + 72]
+	vmovss	xmm0, dword ptr [rsi + 76]      # xmm0 = mem[0],zero,zero,zero
+	sete	r12b
+	vucomiss	xmm0, dword ptr [rdx + 76]
+	vmovss	xmm0, dword ptr [rsi + 80]      # xmm0 = mem[0],zero,zero,zero
+	sete	byte ptr [rsp + 9]              # 1-byte Folded Spill
+	vucomiss	xmm0, dword ptr [rdx + 80]
+	vmovss	xmm0, dword ptr [rsi + 84]      # xmm0 = mem[0],zero,zero,zero
+	sete	byte ptr [rsp + 10]             # 1-byte Folded Spill
+	vucomiss	xmm0, dword ptr [rdx + 84]
+	vmovss	xmm0, dword ptr [rsi + 88]      # xmm0 = mem[0],zero,zero,zero
+	sete	byte ptr [rsp + 11]             # 1-byte Folded Spill
+	vucomiss	xmm0, dword ptr [rdx + 88]
+	vmovss	xmm0, dword ptr [rsi + 92]      # xmm0 = mem[0],zero,zero,zero
+	sete	byte ptr [rsp + 12]             # 1-byte Folded Spill
+	vucomiss	xmm0, dword ptr [rdx + 92]
+	vmovss	xmm0, dword ptr [rsi + 96]      # xmm0 = mem[0],zero,zero,zero
+	sete	r8b
+	vucomiss	xmm0, dword ptr [rdx + 96]
+	vmovss	xmm0, dword ptr [rsi + 100]     # xmm0 = mem[0],zero,zero,zero
+	sete	byte ptr [rsp + 20]             # 1-byte Folded Spill
+	vucomiss	xmm0, dword ptr [rdx + 100]
+	vmovss	xmm0, dword ptr [rsi + 104]     # xmm0 = mem[0],zero,zero,zero
+	sete	byte ptr [rsp + 13]             # 1-byte Folded Spill
+	vucomiss	xmm0, dword ptr [rdx + 104]
+	vmovss	xmm0, dword ptr [rsi + 108]     # xmm0 = mem[0],zero,zero,zero
+	sete	byte ptr [rsp + 15]             # 1-byte Folded Spill
+	vucomiss	xmm0, dword ptr [rdx + 108]
+	vmovss	xmm0, dword ptr [rsi + 112]     # xmm0 = mem[0],zero,zero,zero
+	sete	byte ptr [rsp + 16]             # 1-byte Folded Spill
+	vucomiss	xmm0, dword ptr [rdx + 112]
+	vmovss	xmm0, dword ptr [rsi + 116]     # xmm0 = mem[0],zero,zero,zero
+	sete	byte ptr [rsp + 17]             # 1-byte Folded Spill
+	vucomiss	xmm0, dword ptr [rdx + 116]
+	vmovss	xmm0, dword ptr [rsi + 120]     # xmm0 = mem[0],zero,zero,zero
+	sete	byte ptr [rsp + 19]             # 1-byte Folded Spill
+	vucomiss	xmm0, dword ptr [rdx + 120]
+	vmovss	xmm0, dword ptr [rsi + 124]     # xmm0 = mem[0],zero,zero,zero
+	sete	byte ptr [rsp + 18]             # 1-byte Folded Spill
+	sub	rsi, -128
+	vucomiss	xmm0, dword ptr [rdx + 124]
+	sete	dil
+	add	al, al
+	add	al, byte ptr [rsp + 4]          # 1-byte Folded Reload
+	shl	r13b, 6
+	shl	r15b, 7
+	or	r15b, r13b
+	movzx	r13d, byte ptr [rsp + 5]        # 1-byte Folded Reload
+	shl	r13b, 2
+	or	r13b, al
+	mov	eax, r13d
+	add	cl, cl
+	add	cl, byte ptr [rsp + 8]          # 1-byte Folded Reload
+	movzx	r13d, byte ptr [rsp + 22]       # 1-byte Folded Reload
+	shl	r13b, 3
+	or	r13b, al
+	shl	r9b, 2
+	or	r9b, cl
+	movzx	ecx, byte ptr [rsp + 21]        # 1-byte Folded Reload
+	shl	cl, 4
+	or	cl, r13b
+	mov	r13d, ecx
+	shl	r11b, 3
+	or	r11b, r9b
+	movzx	ecx, byte ptr [rsp + 23]        # 1-byte Folded Reload
+	shl	cl, 5
+	or	cl, r13b
+	shl	r10b, 4
+	or	r10b, r11b
+	movzx	eax, byte ptr [rsp + 7]         # 1-byte Folded Reload
+	shl	al, 5
+	or	al, r10b
+	movzx	r9d, byte ptr [rsp + 6]         # 1-byte Folded Reload
+	shl	r9b, 6
+	shl	bl, 7
+	or	bl, r9b
+	or	r15b, cl
+	or	bl, al
+	add	r14b, r14b
+	add	r14b, byte ptr [rsp + 14]       # 1-byte Folded Reload
+	shl	r12b, 2
+	or	r12b, r14b
+	mov	r14, qword ptr [rsp + 48]       # 8-byte Reload
+	movzx	eax, byte ptr [rsp + 9]         # 1-byte Folded Reload
+	shl	al, 3
+	or	al, r12b
+	mov	ecx, eax
+	movzx	eax, byte ptr [rsp + 10]        # 1-byte Folded Reload
+	shl	al, 4
+	or	al, cl
+	mov	ecx, eax
+	movzx	eax, byte ptr [rsp + 11]        # 1-byte Folded Reload
+	shl	al, 5
+	or	al, cl
+	mov	byte ptr [r14], r15b
+	movzx	ecx, byte ptr [rsp + 12]        # 1-byte Folded Reload
+	shl	cl, 6
+	shl	r8b, 7
+	or	r8b, cl
+	mov	byte ptr [r14 + 1], bl
+	or	r8b, al
+	movzx	eax, byte ptr [rsp + 13]        # 1-byte Folded Reload
+	add	al, al
+	add	al, byte ptr [rsp + 20]         # 1-byte Folded Reload
+	mov	ecx, eax
+	movzx	eax, byte ptr [rsp + 15]        # 1-byte Folded Reload
+	shl	al, 2
+	or	al, cl
+	mov	ecx, eax
+	movzx	eax, byte ptr [rsp + 16]        # 1-byte Folded Reload
+	shl	al, 3
+	or	al, cl
+	mov	ecx, eax
+	movzx	eax, byte ptr [rsp + 17]        # 1-byte Folded Reload
+	shl	al, 4
+	or	al, cl
+	movzx	ecx, byte ptr [rsp + 19]        # 1-byte Folded Reload
+	shl	cl, 5
+	or	cl, al
+	movzx	eax, byte ptr [rsp + 18]        # 1-byte Folded Reload
+	shl	al, 6
+	shl	dil, 7
+	or	dil, al
+	or	dil, cl
+	mov	byte ptr [r14 + 2], r8b
+	mov	byte ptr [r14 + 3], dil
+	add	rdx, 128
+	add	r14, 4
+	add	qword ptr [rsp + 40], -1        # 8-byte Folded Spill
+	jne	.LBB0_118
+# %bb.119:
+	mov	r11, qword ptr [rsp + 24]       # 8-byte Reload
+	mov	r15, qword ptr [rsp + 32]       # 8-byte Reload
+.LBB0_120:
+	shl	r15, 5
+	cmp	r15, r11
+	jge	.LBB0_123
+# %bb.121:
+	sub	r11, r15
+	xor	ecx, ecx
+	.p2align	4, 0x90
+.LBB0_122:                              # =>This Inner Loop Header: Depth=1
+	vmovss	xmm0, dword ptr [rsi + 4*rcx]   # xmm0 = mem[0],zero,zero,zero
+	vucomiss	xmm0, dword ptr [rdx + 4*rcx]
+	lea	r8, [rcx + 1]
+	sete	bl
+	neg	bl
+	mov	rdi, rcx
+	shr	rdi, 3
+	movzx	r9d, byte ptr [r14 + rdi]
+	xor	bl, r9b
+	and	cl, 7
+	mov	al, 1
+                                        # kill: def $cl killed $cl killed $rcx
+	shl	al, cl
+	and	al, bl
+	xor	al, r9b
+	mov	byte ptr [r14 + rdi], al
+	mov	rcx, r8
+	cmp	r11, r8
+	jne	.LBB0_122
+	jmp	.LBB0_123
+.LBB0_57:
+	lea	r15, [r11 + 31]
+	test	r11, r11
+	cmovns	r15, r11
+	lea	eax, [r9 + 7]
+	test	r9d, r9d
+	cmovns	eax, r9d
+	and	eax, -8
+	sub	r9d, eax
+	je	.LBB0_61
+# %bb.58:
+	movsxd	rax, r9d
+	.p2align	4, 0x90
+.LBB0_59:                               # =>This Inner Loop Header: Depth=1
+	movzx	ecx, byte ptr [rsi]
+	add	rsi, 1
+	cmp	cl, byte ptr [rdx]
+	lea	rdx, [rdx + 1]
+	sete	r10b
+	neg	r10b
+	lea	rdi, [rax + 7]
+	test	rax, rax
+	cmovns	rdi, rax
+	sar	rdi, 3
+	movzx	r8d, byte ptr [r14 + rdi]
+	xor	r10b, r8b
+	lea	r9d, [8*rdi]
+	mov	ecx, eax
+	sub	ecx, r9d
+	mov	ebx, 1
+                                        # kill: def $cl killed $cl killed $ecx
+	shl	ebx, cl
+	and	bl, r10b
+	xor	bl, r8b
+	mov	byte ptr [r14 + rdi], bl
+	add	rax, 1
+	cmp	rax, 8
+	jne	.LBB0_59
+# %bb.60:
+	add	r14, 1
+.LBB0_61:
+	sar	r15, 5
+	cmp	r11, 32
+	jl	.LBB0_65
+# %bb.62:
+	mov	qword ptr [rsp + 24], r11       # 8-byte Spill
+	mov	qword ptr [rsp + 56], r15       # 8-byte Spill
+	mov	qword ptr [rsp + 32], r15       # 8-byte Spill
+	.p2align	4, 0x90
+.LBB0_63:                               # =>This Inner Loop Header: Depth=1
+	mov	qword ptr [rsp + 48], r14       # 8-byte Spill
+	movzx	eax, byte ptr [rsi]
+	movzx	ecx, byte ptr [rsi + 1]
+	cmp	al, byte ptr [rdx]
+	sete	byte ptr [rsp + 40]             # 1-byte Folded Spill
+	cmp	cl, byte ptr [rdx + 1]
+	sete	cl
+	movzx	eax, byte ptr [rsi + 2]
+	cmp	al, byte ptr [rdx + 2]
+	sete	byte ptr [rsp + 20]             # 1-byte Folded Spill
+	movzx	eax, byte ptr [rsi + 3]
+	cmp	al, byte ptr [rdx + 3]
+	sete	byte ptr [rsp + 21]             # 1-byte Folded Spill
+	movzx	eax, byte ptr [rsi + 4]
+	cmp	al, byte ptr [rdx + 4]
+	sete	byte ptr [rsp + 22]             # 1-byte Folded Spill
+	movzx	eax, byte ptr [rsi + 5]
+	cmp	al, byte ptr [rdx + 5]
+	sete	byte ptr [rsp + 23]             # 1-byte Folded Spill
+	movzx	eax, byte ptr [rsi + 6]
+	cmp	al, byte ptr [rdx + 6]
+	sete	byte ptr [rsp + 4]              # 1-byte Folded Spill
+	movzx	eax, byte ptr [rsi + 7]
+	cmp	al, byte ptr [rdx + 7]
+	sete	r15b
+	movzx	eax, byte ptr [rsi + 8]
+	cmp	al, byte ptr [rdx + 8]
+	sete	byte ptr [rsp + 7]              # 1-byte Folded Spill
+	movzx	eax, byte ptr [rsi + 9]
+	cmp	al, byte ptr [rdx + 9]
+	sete	dil
+	movzx	eax, byte ptr [rsi + 10]
+	cmp	al, byte ptr [rdx + 10]
+	sete	r10b
+	movzx	eax, byte ptr [rsi + 11]
+	cmp	al, byte ptr [rdx + 11]
+	sete	r11b
+	movzx	eax, byte ptr [rsi + 12]
+	cmp	al, byte ptr [rdx + 12]
+	sete	r14b
+	movzx	eax, byte ptr [rsi + 13]
+	cmp	al, byte ptr [rdx + 13]
+	sete	byte ptr [rsp + 5]              # 1-byte Folded Spill
+	movzx	eax, byte ptr [rsi + 14]
+	cmp	al, byte ptr [rdx + 14]
+	sete	byte ptr [rsp + 6]              # 1-byte Folded Spill
+	movzx	eax, byte ptr [rsi + 15]
+	cmp	al, byte ptr [rdx + 15]
+	sete	bl
+	movzx	eax, byte ptr [rsi + 16]
+	cmp	al, byte ptr [rdx + 16]
+	sete	byte ptr [rsp + 13]             # 1-byte Folded Spill
+	movzx	eax, byte ptr [rsi + 17]
+	cmp	al, byte ptr [rdx + 17]
+	sete	r12b
+	movzx	eax, byte ptr [rsi + 18]
+	cmp	al, byte ptr [rdx + 18]
+	sete	r13b
+	movzx	eax, byte ptr [rsi + 19]
+	cmp	al, byte ptr [rdx + 19]
+	sete	byte ptr [rsp + 8]              # 1-byte Folded Spill
+	movzx	eax, byte ptr [rsi + 20]
+	cmp	al, byte ptr [rdx + 20]
+	sete	byte ptr [rsp + 9]              # 1-byte Folded Spill
+	movzx	eax, byte ptr [rsi + 21]
+	cmp	al, byte ptr [rdx + 21]
+	sete	byte ptr [rsp + 10]             # 1-byte Folded Spill
+	movzx	eax, byte ptr [rsi + 22]
+	cmp	al, byte ptr [rdx + 22]
+	sete	byte ptr [rsp + 11]             # 1-byte Folded Spill
+	movzx	eax, byte ptr [rsi + 23]
+	cmp	al, byte ptr [rdx + 23]
+	sete	r9b
+	movzx	eax, byte ptr [rsi + 24]
+	cmp	al, byte ptr [rdx + 24]
+	sete	byte ptr [rsp + 19]             # 1-byte Folded Spill
+	movzx	eax, byte ptr [rsi + 25]
+	cmp	al, byte ptr [rdx + 25]
+	sete	byte ptr [rsp + 12]             # 1-byte Folded Spill
+	movzx	eax, byte ptr [rsi + 26]
+	cmp	al, byte ptr [rdx + 26]
+	sete	byte ptr [rsp + 14]             # 1-byte Folded Spill
+	movzx	eax, byte ptr [rsi + 27]
+	cmp	al, byte ptr [rdx + 27]
+	sete	byte ptr [rsp + 15]             # 1-byte Folded Spill
+	movzx	eax, byte ptr [rsi + 28]
+	cmp	al, byte ptr [rdx + 28]
+	sete	byte ptr [rsp + 16]             # 1-byte Folded Spill
+	movzx	eax, byte ptr [rsi + 29]
+	cmp	al, byte ptr [rdx + 29]
+	sete	byte ptr [rsp + 17]             # 1-byte Folded Spill
+	movzx	eax, byte ptr [rsi + 30]
+	cmp	al, byte ptr [rdx + 30]
+	sete	byte ptr [rsp + 18]             # 1-byte Folded Spill
+	movzx	eax, byte ptr [rsi + 31]
+	add	rsi, 32
+	cmp	al, byte ptr [rdx + 31]
+	sete	r8b
+	add	cl, cl
+	add	cl, byte ptr [rsp + 40]         # 1-byte Folded Reload
+	mov	eax, ecx
+	movzx	ecx, byte ptr [rsp + 4]         # 1-byte Folded Reload
+	shl	cl, 6
+	shl	r15b, 7
+	or	r15b, cl
+	movzx	ecx, byte ptr [rsp + 20]        # 1-byte Folded Reload
+	shl	cl, 2
+	or	cl, al
+	mov	eax, ecx
+	add	dil, dil
+	add	dil, byte ptr [rsp + 7]         # 1-byte Folded Reload
+	movzx	ecx, byte ptr [rsp + 21]        # 1-byte Folded Reload
+	shl	cl, 3
+	or	cl, al
+	mov	eax, ecx
+	shl	r10b, 2
+	or	r10b, dil
+	movzx	ecx, byte ptr [rsp + 22]        # 1-byte Folded Reload
+	shl	cl, 4
+	or	cl, al
+	mov	edi, ecx
+	shl	r11b, 3
+	or	r11b, r10b
+	movzx	ecx, byte ptr [rsp + 23]        # 1-byte Folded Reload
+	shl	cl, 5
+	or	cl, dil
+	shl	r14b, 4
+	or	r14b, r11b
+	movzx	eax, byte ptr [rsp + 5]         # 1-byte Folded Reload
+	shl	al, 5
+	or	al, r14b
+	movzx	edi, byte ptr [rsp + 6]         # 1-byte Folded Reload
+	shl	dil, 6
+	shl	bl, 7
+	or	bl, dil
+	or	r15b, cl
+	or	bl, al
+	add	r12b, r12b
+	add	r12b, byte ptr [rsp + 13]       # 1-byte Folded Reload
+	shl	r13b, 2
+	or	r13b, r12b
+	mov	r14, qword ptr [rsp + 48]       # 8-byte Reload
+	movzx	eax, byte ptr [rsp + 8]         # 1-byte Folded Reload
+	shl	al, 3
+	or	al, r13b
+	mov	ecx, eax
+	movzx	eax, byte ptr [rsp + 9]         # 1-byte Folded Reload
+	shl	al, 4
+	or	al, cl
+	mov	ecx, eax
+	movzx	eax, byte ptr [rsp + 10]        # 1-byte Folded Reload
+	shl	al, 5
+	or	al, cl
+	mov	byte ptr [r14], r15b
+	movzx	ecx, byte ptr [rsp + 11]        # 1-byte Folded Reload
+	shl	cl, 6
+	shl	r9b, 7
+	or	r9b, cl
+	mov	byte ptr [r14 + 1], bl
+	or	r9b, al
+	movzx	eax, byte ptr [rsp + 12]        # 1-byte Folded Reload
+	add	al, al
+	add	al, byte ptr [rsp + 19]         # 1-byte Folded Reload
+	mov	ecx, eax
+	movzx	eax, byte ptr [rsp + 14]        # 1-byte Folded Reload
+	shl	al, 2
+	or	al, cl
+	mov	ecx, eax
+	movzx	eax, byte ptr [rsp + 15]        # 1-byte Folded Reload
+	shl	al, 3
+	or	al, cl
+	mov	ecx, eax
+	movzx	eax, byte ptr [rsp + 16]        # 1-byte Folded Reload
+	shl	al, 4
+	or	al, cl
+	mov	ecx, eax
+	movzx	eax, byte ptr [rsp + 17]        # 1-byte Folded Reload
+	shl	al, 5
+	or	al, cl
+	movzx	ecx, byte ptr [rsp + 18]        # 1-byte Folded Reload
+	shl	cl, 6
+	shl	r8b, 7
+	or	r8b, cl
+	or	r8b, al
+	mov	byte ptr [r14 + 2], r9b
+	mov	byte ptr [r14 + 3], r8b
+	add	rdx, 32
+	add	r14, 4
+	add	qword ptr [rsp + 32], -1        # 8-byte Folded Spill
+	jne	.LBB0_63
+# %bb.64:
+	mov	r11, qword ptr [rsp + 24]       # 8-byte Reload
+	mov	r15, qword ptr [rsp + 56]       # 8-byte Reload
+.LBB0_65:
+	shl	r15, 5
+	cmp	r15, r11
+	jge	.LBB0_123
+# %bb.66:
+	sub	r11, r15
+	xor	ecx, ecx
+	.p2align	4, 0x90
+.LBB0_67:                               # =>This Inner Loop Header: Depth=1
+	lea	r8, [rcx + 1]
+	movzx	ebx, byte ptr [rsi + rcx]
+	cmp	bl, byte ptr [rdx + rcx]
+	sete	bl
+	neg	bl
+	mov	rdi, rcx
+	shr	rdi, 3
+	movzx	r9d, byte ptr [r14 + rdi]
+	xor	bl, r9b
+	and	cl, 7
+	mov	al, 1
+                                        # kill: def $cl killed $cl killed $rcx
+	shl	al, cl
+	and	al, bl
+	xor	al, r9b
+	mov	byte ptr [r14 + rdi], al
+	mov	rcx, r8
+	cmp	r11, r8
+	jne	.LBB0_67
+	jmp	.LBB0_123
+.LBB0_90:
+	lea	r15, [r11 + 31]
+	test	r11, r11
+	cmovns	r15, r11
+	lea	eax, [r9 + 7]
+	test	r9d, r9d
+	cmovns	eax, r9d
+	and	eax, -8
+	sub	r9d, eax
+	je	.LBB0_94
+# %bb.91:
+	movsxd	rax, r9d
+	.p2align	4, 0x90
+.LBB0_92:                               # =>This Inner Loop Header: Depth=1
+	mov	ecx, dword ptr [rsi]
+	add	rsi, 4
+	cmp	ecx, dword ptr [rdx]
+	lea	rdx, [rdx + 4]
+	sete	r10b
+	neg	r10b
+	lea	rdi, [rax + 7]
+	test	rax, rax
+	cmovns	rdi, rax
+	sar	rdi, 3
+	movzx	r8d, byte ptr [r14 + rdi]
+	xor	r10b, r8b
+	lea	r9d, [8*rdi]
+	mov	ecx, eax
+	sub	ecx, r9d
+	mov	ebx, 1
+                                        # kill: def $cl killed $cl killed $ecx
+	shl	ebx, cl
+	and	bl, r10b
+	xor	bl, r8b
+	mov	byte ptr [r14 + rdi], bl
+	add	rax, 1
+	cmp	rax, 8
+	jne	.LBB0_92
+# %bb.93:
+	add	r14, 1
+.LBB0_94:
+	sar	r15, 5
+	cmp	r11, 32
+	jl	.LBB0_98
+# %bb.95:
+	mov	qword ptr [rsp + 24], r11       # 8-byte Spill
+	mov	qword ptr [rsp + 64], r15       # 8-byte Spill
+	mov	qword ptr [rsp + 56], r15       # 8-byte Spill
+	.p2align	4, 0x90
+.LBB0_96:                               # =>This Inner Loop Header: Depth=1
+	mov	qword ptr [rsp + 48], r14       # 8-byte Spill
+	mov	eax, dword ptr [rsi]
+	mov	ecx, dword ptr [rsi + 4]
+	cmp	eax, dword ptr [rdx]
+	sete	byte ptr [rsp + 40]             # 1-byte Folded Spill
+	cmp	ecx, dword ptr [rdx + 4]
+	sete	byte ptr [rsp + 32]             # 1-byte Folded Spill
+	mov	eax, dword ptr [rsi + 8]
+	cmp	eax, dword ptr [rdx + 8]
+	sete	byte ptr [rsp + 20]             # 1-byte Folded Spill
+	mov	eax, dword ptr [rsi + 12]
+	cmp	eax, dword ptr [rdx + 12]
+	sete	byte ptr [rsp + 21]             # 1-byte Folded Spill
+	mov	eax, dword ptr [rsi + 16]
+	cmp	eax, dword ptr [rdx + 16]
+	sete	byte ptr [rsp + 22]             # 1-byte Folded Spill
+	mov	eax, dword ptr [rsi + 20]
+	cmp	eax, dword ptr [rdx + 20]
+	sete	byte ptr [rsp + 23]             # 1-byte Folded Spill
+	mov	eax, dword ptr [rsi + 24]
+	cmp	eax, dword ptr [rdx + 24]
+	sete	byte ptr [rsp + 4]              # 1-byte Folded Spill
+	mov	eax, dword ptr [rsi + 28]
+	cmp	eax, dword ptr [rdx + 28]
+	sete	r13b
+	mov	eax, dword ptr [rsi + 32]
+	cmp	eax, dword ptr [rdx + 32]
+	sete	byte ptr [rsp + 9]              # 1-byte Folded Spill
+	mov	eax, dword ptr [rsi + 36]
+	cmp	eax, dword ptr [rdx + 36]
+	sete	r8b
+	mov	eax, dword ptr [rsi + 40]
+	cmp	eax, dword ptr [rdx + 40]
+	sete	r11b
+	mov	eax, dword ptr [rsi + 44]
+	cmp	eax, dword ptr [rdx + 44]
+	sete	r15b
+	mov	eax, dword ptr [rsi + 48]
+	cmp	eax, dword ptr [rdx + 48]
+	sete	byte ptr [rsp + 5]              # 1-byte Folded Spill
+	mov	eax, dword ptr [rsi + 52]
+	cmp	eax, dword ptr [rdx + 52]
+	sete	byte ptr [rsp + 6]              # 1-byte Folded Spill
+	mov	eax, dword ptr [rsi + 56]
+	cmp	eax, dword ptr [rdx + 56]
+	sete	byte ptr [rsp + 7]              # 1-byte Folded Spill
+	mov	eax, dword ptr [rsi + 60]
+	cmp	eax, dword ptr [rdx + 60]
+	sete	bl
+	mov	eax, dword ptr [rsi + 64]
+	mov	ecx, dword ptr [rsi + 68]
+	cmp	eax, dword ptr [rdx + 64]
+	mov	eax, dword ptr [rsi + 72]
+	sete	byte ptr [rsp + 10]             # 1-byte Folded Spill
+	cmp	ecx, dword ptr [rdx + 68]
+	mov	ecx, dword ptr [rsi + 76]
+	sete	r10b
+	cmp	eax, dword ptr [rdx + 72]
+	mov	eax, dword ptr [rsi + 80]
+	sete	r14b
+	cmp	ecx, dword ptr [rdx + 76]
+	mov	ecx, dword ptr [rsi + 84]
+	sete	r12b
+	cmp	eax, dword ptr [rdx + 80]
+	sete	byte ptr [rsp + 8]              # 1-byte Folded Spill
+	cmp	ecx, dword ptr [rdx + 84]
+	mov	eax, dword ptr [rsi + 88]
+	sete	byte ptr [rsp + 11]             # 1-byte Folded Spill
+	cmp	eax, dword ptr [rdx + 88]
+	mov	eax, dword ptr [rsi + 92]
+	sete	byte ptr [rsp + 12]             # 1-byte Folded Spill
+	cmp	eax, dword ptr [rdx + 92]
+	mov	eax, dword ptr [rsi + 96]
+	sete	r9b
+	cmp	eax, dword ptr [rdx + 96]
+	mov	eax, dword ptr [rsi + 100]
+	sete	byte ptr [rsp + 19]             # 1-byte Folded Spill
+	cmp	eax, dword ptr [rdx + 100]
+	mov	eax, dword ptr [rsi + 104]
+	sete	byte ptr [rsp + 13]             # 1-byte Folded Spill
+	cmp	eax, dword ptr [rdx + 104]
+	mov	eax, dword ptr [rsi + 108]
+	sete	byte ptr [rsp + 14]             # 1-byte Folded Spill
+	cmp	eax, dword ptr [rdx + 108]
+	mov	eax, dword ptr [rsi + 112]
+	sete	byte ptr [rsp + 15]             # 1-byte Folded Spill
+	cmp	eax, dword ptr [rdx + 112]
+	mov	eax, dword ptr [rsi + 116]
+	sete	byte ptr [rsp + 16]             # 1-byte Folded Spill
+	cmp	eax, dword ptr [rdx + 116]
+	mov	eax, dword ptr [rsi + 120]
+	sete	byte ptr [rsp + 18]             # 1-byte Folded Spill
+	cmp	eax, dword ptr [rdx + 120]
+	mov	eax, dword ptr [rsi + 124]
+	sete	byte ptr [rsp + 17]             # 1-byte Folded Spill
+	sub	rsi, -128
+	cmp	eax, dword ptr [rdx + 124]
+	sete	dil
+	movzx	eax, byte ptr [rsp + 32]        # 1-byte Folded Reload
+	add	al, al
+	add	al, byte ptr [rsp + 40]         # 1-byte Folded Reload
+	mov	ecx, eax
+	movzx	eax, byte ptr [rsp + 4]         # 1-byte Folded Reload
+	shl	al, 6
+	shl	r13b, 7
+	or	r13b, al
+	movzx	eax, byte ptr [rsp + 20]        # 1-byte Folded Reload
+	shl	al, 2
+	or	al, cl
+	add	r8b, r8b
+	add	r8b, byte ptr [rsp + 9]         # 1-byte Folded Reload
+	movzx	ecx, byte ptr [rsp + 21]        # 1-byte Folded Reload
+	shl	cl, 3
+	or	cl, al
+	mov	eax, ecx
+	shl	r11b, 2
+	or	r11b, r8b
+	movzx	ecx, byte ptr [rsp + 22]        # 1-byte Folded Reload
+	shl	cl, 4
+	or	cl, al
+	mov	r8d, ecx
+	shl	r15b, 3
+	or	r15b, r11b
+	movzx	ecx, byte ptr [rsp + 23]        # 1-byte Folded Reload
+	shl	cl, 5
+	or	cl, r8b
+	movzx	eax, byte ptr [rsp + 5]         # 1-byte Folded Reload
+	shl	al, 4
+	or	al, r15b
+	mov	r8d, eax
+	movzx	eax, byte ptr [rsp + 6]         # 1-byte Folded Reload
+	shl	al, 5
+	or	al, r8b
+	movzx	r8d, byte ptr [rsp + 7]         # 1-byte Folded Reload
+	shl	r8b, 6
+	shl	bl, 7
+	or	bl, r8b
+	or	r13b, cl
+	or	bl, al
+	add	r10b, r10b
+	add	r10b, byte ptr [rsp + 10]       # 1-byte Folded Reload
+	shl	r14b, 2
+	or	r14b, r10b
+	shl	r12b, 3
+	or	r12b, r14b
+	movzx	eax, byte ptr [rsp + 8]         # 1-byte Folded Reload
+	shl	al, 4
+	or	al, r12b
+	mov	ecx, eax
+	mov	r14, qword ptr [rsp + 48]       # 8-byte Reload
+	movzx	eax, byte ptr [rsp + 11]        # 1-byte Folded Reload
+	shl	al, 5
+	or	al, cl
+	mov	byte ptr [r14], r13b
+	movzx	ecx, byte ptr [rsp + 12]        # 1-byte Folded Reload
+	shl	cl, 6
+	shl	r9b, 7
+	or	r9b, cl
+	mov	byte ptr [r14 + 1], bl
+	or	r9b, al
+	movzx	eax, byte ptr [rsp + 13]        # 1-byte Folded Reload
+	add	al, al
+	add	al, byte ptr [rsp + 19]         # 1-byte Folded Reload
+	mov	ecx, eax
+	movzx	eax, byte ptr [rsp + 14]        # 1-byte Folded Reload
+	shl	al, 2
+	or	al, cl
+	mov	ecx, eax
+	movzx	eax, byte ptr [rsp + 15]        # 1-byte Folded Reload
+	shl	al, 3
+	or	al, cl
+	mov	ecx, eax
+	movzx	eax, byte ptr [rsp + 16]        # 1-byte Folded Reload
+	shl	al, 4
+	or	al, cl
+	mov	ecx, eax
+	movzx	eax, byte ptr [rsp + 18]        # 1-byte Folded Reload
+	shl	al, 5
+	or	al, cl
+	movzx	ecx, byte ptr [rsp + 17]        # 1-byte Folded Reload
+	shl	cl, 6
+	shl	dil, 7
+	or	dil, cl
+	or	dil, al
+	mov	byte ptr [r14 + 2], r9b
+	mov	byte ptr [r14 + 3], dil
+	add	rdx, 128
+	add	r14, 4
+	add	qword ptr [rsp + 56], -1        # 8-byte Folded Spill
+	jne	.LBB0_96
+# %bb.97:
+	mov	r11, qword ptr [rsp + 24]       # 8-byte Reload
+	mov	r15, qword ptr [rsp + 64]       # 8-byte Reload
+.LBB0_98:
+	shl	r15, 5
+	cmp	r15, r11
+	jge	.LBB0_123
+# %bb.99:
+	sub	r11, r15
+	xor	ecx, ecx
+	.p2align	4, 0x90
+.LBB0_100:                              # =>This Inner Loop Header: Depth=1
+	lea	r8, [rcx + 1]
+	mov	edi, dword ptr [rsi + 4*rcx]
+	cmp	edi, dword ptr [rdx + 4*rcx]
+	sete	bl
+	neg	bl
+	mov	rdi, rcx
+	shr	rdi, 3
+	movzx	r9d, byte ptr [r14 + rdi]
+	xor	bl, r9b
+	and	cl, 7
+	mov	al, 1
+                                        # kill: def $cl killed $cl killed $rcx
+	shl	al, cl
+	and	al, bl
+	xor	al, r9b
+	mov	byte ptr [r14 + rdi], al
+	mov	rcx, r8
+	cmp	r11, r8
+	jne	.LBB0_100
+.LBB0_123:
+	lea	rsp, [rbp - 40]
+	pop	rbx
+	pop	r12
+	pop	r13
+	pop	r14
+	pop	r15
+	pop	rbp
+	ret
+.Lfunc_end0:
+	.size	comparison_equal_arr_arr_avx2, .Lfunc_end0-comparison_equal_arr_arr_avx2
+                                        # -- End function
+	.section	.rodata.cst32,"aM",@progbits,32
+	.p2align	5                               # -- Begin function comparison_equal_arr_scalar_avx2
+.LCPI1_0:
+	.zero	32,1
+.LCPI1_1:
+	.zero	32,4
+.LCPI1_2:
+	.zero	32,8
+.LCPI1_3:
+	.zero	32,16
+.LCPI1_4:
+	.zero	32,32
+.LCPI1_5:
+	.zero	32,64
+.LCPI1_6:
+	.zero	32,128
+	.text
+	.globl	comparison_equal_arr_scalar_avx2
+	.p2align	4, 0x90
+	.type	comparison_equal_arr_scalar_avx2,@function
+comparison_equal_arr_scalar_avx2:       # @comparison_equal_arr_scalar_avx2
+# %bb.0:
+	push	rbp
+	mov	rbp, rsp
+	push	r15
+	push	r14
+	push	r13
+	push	r12
+	push	rbx
+	and	rsp, -32
+	sub	rsp, 1280
+                                        # kill: def $r9d killed $r9d def $r9
+	mov	r10, r8
+	mov	r11, rcx
+	cmp	edi, 6
+	jg	.LBB1_13
+# %bb.1:
+	cmp	edi, 3
+	jle	.LBB1_25
+# %bb.2:
+	cmp	edi, 4
+	je	.LBB1_49
+# %bb.3:
+	cmp	edi, 5
+	je	.LBB1_57
+# %bb.4:
+	cmp	edi, 6
+	jne	.LBB1_164
+# %bb.5:
+	mov	r13d, dword ptr [rdx]
+	lea	r15, [r10 + 31]
+	test	r10, r10
+	cmovns	r15, r10
+	lea	eax, [r9 + 7]
+	test	r9d, r9d
+	cmovns	eax, r9d
+	and	eax, -8
+	sub	r9d, eax
+	je	.LBB1_9
+# %bb.6:
+	movsxd	rax, r9d
+	.p2align	4, 0x90
+.LBB1_7:                                # =>This Inner Loop Header: Depth=1
+	cmp	dword ptr [rsi], r13d
+	lea	rsi, [rsi + 4]
+	sete	dl
+	neg	dl
+	lea	rbx, [rax + 7]
+	test	rax, rax
+	cmovns	rbx, rax
+	sar	rbx, 3
+	movzx	r8d, byte ptr [r11 + rbx]
+	xor	dl, r8b
+	lea	edi, [8*rbx]
+	mov	ecx, eax
+	sub	ecx, edi
+	mov	edi, 1
+                                        # kill: def $cl killed $cl killed $ecx
+	shl	edi, cl
+	and	dil, dl
+	xor	dil, r8b
+	mov	byte ptr [r11 + rbx], dil
+	add	rax, 1
+	cmp	rax, 8
+	jne	.LBB1_7
+# %bb.8:
+	add	r11, 1
+.LBB1_9:
+	sar	r15, 5
+	cmp	r10, 32
+	jl	.LBB1_101
+# %bb.10:
+	mov	qword ptr [rsp + 280], r10      # 8-byte Spill
+	mov	qword ptr [rsp + 176], r15      # 8-byte Spill
+	mov	qword ptr [rsp + 168], r15      # 8-byte Spill
+	mov	qword ptr [rsp + 272], r11      # 8-byte Spill
+	.p2align	4, 0x90
+.LBB1_11:                               # =>This Inner Loop Header: Depth=1
+	cmp	dword ptr [rsi], r13d
+	sete	byte ptr [rsp + 152]            # 1-byte Folded Spill
+	cmp	dword ptr [rsi + 4], r13d
+	sete	dil
+	cmp	dword ptr [rsi + 8], r13d
+	sete	r14b
+	cmp	dword ptr [rsi + 12], r13d
+	sete	byte ptr [rsp + 160]            # 1-byte Folded Spill
+	cmp	dword ptr [rsi + 16], r13d
+	sete	byte ptr [rsp + 136]            # 1-byte Folded Spill
+	cmp	dword ptr [rsi + 20], r13d
+	sete	byte ptr [rsp + 88]             # 1-byte Folded Spill
+	cmp	dword ptr [rsi + 24], r13d
+	sete	al
+	cmp	dword ptr [rsi + 28], r13d
+	sete	bl
+	cmp	dword ptr [rsi + 32], r13d
+	sete	byte ptr [rsp + 104]            # 1-byte Folded Spill
+	cmp	dword ptr [rsi + 36], r13d
+	sete	dl
+	cmp	dword ptr [rsi + 40], r13d
+	sete	r9b
+	cmp	dword ptr [rsi + 44], r13d
+	sete	r10b
+	cmp	dword ptr [rsi + 48], r13d
+	sete	r11b
+	cmp	dword ptr [rsi + 52], r13d
+	sete	r12b
+	cmp	dword ptr [rsi + 56], r13d
+	sete	byte ptr [rsp + 112]            # 1-byte Folded Spill
+	cmp	dword ptr [rsi + 60], r13d
+	sete	cl
+	cmp	dword ptr [rsi + 64], r13d
+	sete	byte ptr [rsp + 72]             # 1-byte Folded Spill
+	cmp	dword ptr [rsi + 68], r13d
+	sete	byte ptr [rsp + 120]            # 1-byte Folded Spill
+	cmp	dword ptr [rsi + 72], r13d
+	sete	byte ptr [rsp + 128]            # 1-byte Folded Spill
+	cmp	dword ptr [rsi + 76], r13d
+	sete	byte ptr [rsp + 144]            # 1-byte Folded Spill
+	cmp	dword ptr [rsi + 80], r13d
+	sete	byte ptr [rsp + 80]             # 1-byte Folded Spill
+	cmp	dword ptr [rsi + 84], r13d
+	sete	byte ptr [rsp + 96]             # 1-byte Folded Spill
+	cmp	dword ptr [rsi + 88], r13d
+	sete	byte ptr [rsp + 64]             # 1-byte Folded Spill
+	cmp	dword ptr [rsi + 92], r13d
+	sete	r15b
+	cmp	dword ptr [rsi + 96], r13d
+	sete	byte ptr [rsp + 32]             # 1-byte Folded Spill
+	cmp	dword ptr [rsi + 100], r13d
+	sete	byte ptr [rsp + 48]             # 1-byte Folded Spill
+	cmp	dword ptr [rsi + 104], r13d
+	sete	byte ptr [rsp + 56]             # 1-byte Folded Spill
+	cmp	dword ptr [rsi + 108], r13d
+	sete	byte ptr [rsp + 40]             # 1-byte Folded Spill
+	cmp	dword ptr [rsi + 112], r13d
+	sete	byte ptr [rsp + 320]            # 1-byte Folded Spill
+	cmp	dword ptr [rsi + 116], r13d
+	sete	byte ptr [rsp + 288]            # 1-byte Folded Spill
+	cmp	dword ptr [rsi + 120], r13d
+	sete	byte ptr [rsp + 28]             # 1-byte Folded Spill
+	cmp	dword ptr [rsi + 124], r13d
+	sete	r8b
+	add	dil, dil
+	add	dil, byte ptr [rsp + 152]       # 1-byte Folded Reload
+	shl	al, 6
+	shl	bl, 7
+	or	bl, al
+	shl	r14b, 2
+	or	r14b, dil
+	add	dl, dl
+	add	dl, byte ptr [rsp + 104]        # 1-byte Folded Reload
+	movzx	eax, byte ptr [rsp + 160]       # 1-byte Folded Reload
+	shl	al, 3
+	or	al, r14b
+	shl	r9b, 2
+	or	r9b, dl
+	movzx	edx, byte ptr [rsp + 136]       # 1-byte Folded Reload
+	shl	dl, 4
+	or	dl, al
+	mov	edi, edx
+	shl	r10b, 3
+	or	r10b, r9b
+	movzx	edx, byte ptr [rsp + 88]        # 1-byte Folded Reload
+	shl	dl, 5
+	or	dl, dil
+	shl	r11b, 4
+	or	r11b, r10b
+	shl	r12b, 5
+	or	r12b, r11b
+	movzx	edi, byte ptr [rsp + 112]       # 1-byte Folded Reload
+	shl	dil, 6
+	shl	cl, 7
+	or	cl, dil
+	or	bl, dl
+	or	cl, r12b
+	movzx	edx, byte ptr [rsp + 120]       # 1-byte Folded Reload
+	add	dl, dl
+	add	dl, byte ptr [rsp + 72]         # 1-byte Folded Reload
+	mov	edi, edx
+	movzx	edx, byte ptr [rsp + 128]       # 1-byte Folded Reload
+	shl	dl, 2
+	or	dl, dil
+	mov	edi, edx
+	movzx	edx, byte ptr [rsp + 144]       # 1-byte Folded Reload
+	shl	dl, 3
+	or	dl, dil
+	mov	edi, edx
+	movzx	edx, byte ptr [rsp + 80]        # 1-byte Folded Reload
+	shl	dl, 4
+	or	dl, dil
+	mov	edi, edx
+	movzx	edx, byte ptr [rsp + 96]        # 1-byte Folded Reload
+	shl	dl, 5
+	or	dl, dil
+	mov	edi, edx
+	mov	rdx, qword ptr [rsp + 272]      # 8-byte Reload
+	mov	byte ptr [rdx], bl
+	movzx	ebx, byte ptr [rsp + 64]        # 1-byte Folded Reload
+	shl	bl, 6
+	shl	r15b, 7
+	or	r15b, bl
+	mov	byte ptr [rdx + 1], cl
+	or	r15b, dil
+	movzx	ecx, byte ptr [rsp + 48]        # 1-byte Folded Reload
+	add	cl, cl
+	add	cl, byte ptr [rsp + 32]         # 1-byte Folded Reload
+	mov	ebx, ecx
+	movzx	ecx, byte ptr [rsp + 56]        # 1-byte Folded Reload
+	shl	cl, 2
+	or	cl, bl
+	mov	ebx, ecx
+	movzx	ecx, byte ptr [rsp + 40]        # 1-byte Folded Reload
+	shl	cl, 3
+	or	cl, bl
+	mov	ebx, ecx
+	movzx	ecx, byte ptr [rsp + 320]       # 1-byte Folded Reload
+	shl	cl, 4
+	or	cl, bl
+	mov	ebx, ecx
+	movzx	ecx, byte ptr [rsp + 288]       # 1-byte Folded Reload
+	shl	cl, 5
+	or	cl, bl
+	movzx	ebx, byte ptr [rsp + 28]        # 1-byte Folded Reload
+	shl	bl, 6
+	shl	r8b, 7
+	or	r8b, bl
+	or	r8b, cl
+	mov	byte ptr [rdx + 2], r15b
+	mov	byte ptr [rdx + 3], r8b
+	add	rsi, 128
+	add	rdx, 4
+	mov	qword ptr [rsp + 272], rdx      # 8-byte Spill
+	add	qword ptr [rsp + 168], -1       # 8-byte Folded Spill
+	jne	.LBB1_11
+# %bb.12:
+	mov	r14, qword ptr [rsp + 272]      # 8-byte Reload
+	mov	r10, qword ptr [rsp + 280]      # 8-byte Reload
+	mov	r15, qword ptr [rsp + 176]      # 8-byte Reload
+	shl	r15, 5
+	cmp	r15, r10
+	jl	.LBB1_102
+	jmp	.LBB1_164
+.LBB1_13:
+	cmp	edi, 8
+	jle	.LBB1_39
+# %bb.14:
+	cmp	edi, 9
+	je	.LBB1_65
+# %bb.15:
+	cmp	edi, 11
+	je	.LBB1_73
+# %bb.16:
+	cmp	edi, 12
+	jne	.LBB1_164
+# %bb.17:
+	lea	r15, [r10 + 31]
+	test	r10, r10
+	cmovns	r15, r10
+	lea	eax, [r9 + 7]
+	test	r9d, r9d
+	cmovns	eax, r9d
+	and	eax, -8
+	vmovsd	xmm0, qword ptr [rdx]           # xmm0 = mem[0],zero
+	sub	r9d, eax
+	je	.LBB1_21
+# %bb.18:
+	movsxd	rax, r9d
+	.p2align	4, 0x90
+.LBB1_19:                               # =>This Inner Loop Header: Depth=1
+	vucomisd	xmm0, qword ptr [rsi]
+	lea	rsi, [rsi + 8]
+	sete	dl
+	neg	dl
+	lea	rdi, [rax + 7]
+	test	rax, rax
+	cmovns	rdi, rax
+	sar	rdi, 3
+	movzx	r9d, byte ptr [r11 + rdi]
+	xor	dl, r9b
+	lea	r8d, [8*rdi]
+	mov	ecx, eax
+	sub	ecx, r8d
+	mov	ebx, 1
+                                        # kill: def $cl killed $cl killed $ecx
+	shl	ebx, cl
+	and	bl, dl
+	xor	bl, r9b
+	mov	byte ptr [r11 + rdi], bl
+	add	rax, 1
+	cmp	rax, 8
+	jne	.LBB1_19
+# %bb.20:
+	add	r11, 1
+.LBB1_21:
+	sar	r15, 5
+	cmp	r10, 32
+	jl	.LBB1_105
+# %bb.22:
+	mov	qword ptr [rsp + 280], r10      # 8-byte Spill
+	mov	qword ptr [rsp + 168], r15      # 8-byte Spill
+	mov	qword ptr [rsp + 152], r15      # 8-byte Spill
+	mov	qword ptr [rsp + 272], r11      # 8-byte Spill
+	.p2align	4, 0x90
+.LBB1_23:                               # =>This Inner Loop Header: Depth=1
+	vucomisd	xmm0, qword ptr [rsi]
+	sete	byte ptr [rsp + 160]            # 1-byte Folded Spill
+	vucomisd	xmm0, qword ptr [rsi + 8]
+	sete	r9b
+	vucomisd	xmm0, qword ptr [rsi + 16]
+	sete	r14b
+	vucomisd	xmm0, qword ptr [rsi + 24]
+	sete	r13b
+	vucomisd	xmm0, qword ptr [rsi + 32]
+	sete	byte ptr [rsp + 136]            # 1-byte Folded Spill
+	vucomisd	xmm0, qword ptr [rsi + 40]
+	sete	byte ptr [rsp + 88]             # 1-byte Folded Spill
+	vucomisd	xmm0, qword ptr [rsi + 48]
+	sete	al
+	vucomisd	xmm0, qword ptr [rsi + 56]
+	sete	bl
+	vucomisd	xmm0, qword ptr [rsi + 64]
+	sete	byte ptr [rsp + 112]            # 1-byte Folded Spill
+	vucomisd	xmm0, qword ptr [rsi + 72]
+	sete	dl
+	vucomisd	xmm0, qword ptr [rsi + 80]
+	sete	dil
+	vucomisd	xmm0, qword ptr [rsi + 88]
+	sete	r10b
+	vucomisd	xmm0, qword ptr [rsi + 96]
+	sete	r11b
+	vucomisd	xmm0, qword ptr [rsi + 104]
+	sete	r12b
+	vucomisd	xmm0, qword ptr [rsi + 112]
+	sete	byte ptr [rsp + 120]            # 1-byte Folded Spill
+	vucomisd	xmm0, qword ptr [rsi + 120]
+	sete	cl
+	vucomisd	xmm0, qword ptr [rsi + 128]
+	sete	byte ptr [rsp + 72]             # 1-byte Folded Spill
+	vucomisd	xmm0, qword ptr [rsi + 136]
+	sete	byte ptr [rsp + 104]            # 1-byte Folded Spill
+	vucomisd	xmm0, qword ptr [rsi + 144]
+	sete	byte ptr [rsp + 128]            # 1-byte Folded Spill
+	vucomisd	xmm0, qword ptr [rsi + 152]
+	sete	byte ptr [rsp + 144]            # 1-byte Folded Spill
+	vucomisd	xmm0, qword ptr [rsi + 160]
+	sete	byte ptr [rsp + 80]             # 1-byte Folded Spill
+	vucomisd	xmm0, qword ptr [rsi + 168]
+	sete	byte ptr [rsp + 96]             # 1-byte Folded Spill
+	vucomisd	xmm0, qword ptr [rsi + 176]
+	sete	byte ptr [rsp + 64]             # 1-byte Folded Spill
+	vucomisd	xmm0, qword ptr [rsi + 184]
+	sete	r15b
+	vucomisd	xmm0, qword ptr [rsi + 192]
+	sete	byte ptr [rsp + 32]             # 1-byte Folded Spill
+	vucomisd	xmm0, qword ptr [rsi + 200]
+	sete	byte ptr [rsp + 48]             # 1-byte Folded Spill
+	vucomisd	xmm0, qword ptr [rsi + 208]
+	sete	byte ptr [rsp + 56]             # 1-byte Folded Spill
+	vucomisd	xmm0, qword ptr [rsi + 216]
+	sete	byte ptr [rsp + 40]             # 1-byte Folded Spill
+	vucomisd	xmm0, qword ptr [rsi + 224]
+	sete	byte ptr [rsp + 320]            # 1-byte Folded Spill
+	vucomisd	xmm0, qword ptr [rsi + 232]
+	sete	byte ptr [rsp + 288]            # 1-byte Folded Spill
+	vucomisd	xmm0, qword ptr [rsi + 240]
+	sete	byte ptr [rsp + 28]             # 1-byte Folded Spill
+	vucomisd	xmm0, qword ptr [rsi + 248]
+	sete	r8b
+	add	r9b, r9b
+	add	r9b, byte ptr [rsp + 160]       # 1-byte Folded Reload
+	shl	al, 6
+	shl	bl, 7
+	or	bl, al
+	shl	r14b, 2
+	or	r14b, r9b
+	add	dl, dl
+	add	dl, byte ptr [rsp + 112]        # 1-byte Folded Reload
+	shl	r13b, 3
+	or	r13b, r14b
+	shl	dil, 2
+	or	dil, dl
+	movzx	edx, byte ptr [rsp + 136]       # 1-byte Folded Reload
+	shl	dl, 4
+	or	dl, r13b
+	mov	r9d, edx
+	shl	r10b, 3
+	or	r10b, dil
+	movzx	edx, byte ptr [rsp + 88]        # 1-byte Folded Reload
+	shl	dl, 5
+	or	dl, r9b
+	shl	r11b, 4
+	or	r11b, r10b
+	shl	r12b, 5
+	or	r12b, r11b
+	movzx	edi, byte ptr [rsp + 120]       # 1-byte Folded Reload
+	shl	dil, 6
+	shl	cl, 7
+	or	cl, dil
+	or	bl, dl
+	or	cl, r12b
+	movzx	eax, byte ptr [rsp + 104]       # 1-byte Folded Reload
+	add	al, al
+	add	al, byte ptr [rsp + 72]         # 1-byte Folded Reload
+	movzx	edx, byte ptr [rsp + 128]       # 1-byte Folded Reload
+	shl	dl, 2
+	or	dl, al
+	mov	edi, edx
+	movzx	edx, byte ptr [rsp + 144]       # 1-byte Folded Reload
+	shl	dl, 3
+	or	dl, dil
+	mov	edi, edx
+	movzx	edx, byte ptr [rsp + 80]        # 1-byte Folded Reload
+	shl	dl, 4
+	or	dl, dil
+	mov	edi, edx
+	movzx	edx, byte ptr [rsp + 96]        # 1-byte Folded Reload
+	shl	dl, 5
+	or	dl, dil
+	mov	edi, edx
+	mov	rdx, qword ptr [rsp + 272]      # 8-byte Reload
+	mov	byte ptr [rdx], bl
+	movzx	ebx, byte ptr [rsp + 64]        # 1-byte Folded Reload
+	shl	bl, 6
+	shl	r15b, 7
+	or	r15b, bl
+	mov	byte ptr [rdx + 1], cl
+	or	r15b, dil
+	movzx	ecx, byte ptr [rsp + 48]        # 1-byte Folded Reload
+	add	cl, cl
+	add	cl, byte ptr [rsp + 32]         # 1-byte Folded Reload
+	mov	ebx, ecx
+	movzx	ecx, byte ptr [rsp + 56]        # 1-byte Folded Reload
+	shl	cl, 2
+	or	cl, bl
+	mov	ebx, ecx
+	movzx	ecx, byte ptr [rsp + 40]        # 1-byte Folded Reload
+	shl	cl, 3
+	or	cl, bl
+	mov	ebx, ecx
+	movzx	ecx, byte ptr [rsp + 320]       # 1-byte Folded Reload
+	shl	cl, 4
+	or	cl, bl
+	mov	ebx, ecx
+	movzx	ecx, byte ptr [rsp + 288]       # 1-byte Folded Reload
+	shl	cl, 5
+	or	cl, bl
+	movzx	ebx, byte ptr [rsp + 28]        # 1-byte Folded Reload
+	shl	bl, 6
+	shl	r8b, 7
+	or	r8b, bl
+	or	r8b, cl
+	mov	byte ptr [rdx + 2], r15b
+	mov	byte ptr [rdx + 3], r8b
+	add	rsi, 256
+	add	rdx, 4
+	mov	qword ptr [rsp + 272], rdx      # 8-byte Spill
+	add	qword ptr [rsp + 152], -1       # 8-byte Folded Spill
+	jne	.LBB1_23
+# %bb.24:
+	mov	r14, qword ptr [rsp + 272]      # 8-byte Reload
+	mov	r10, qword ptr [rsp + 280]      # 8-byte Reload
+	mov	r15, qword ptr [rsp + 168]      # 8-byte Reload
+	shl	r15, 5
+	cmp	r15, r10
+	jl	.LBB1_106
+	jmp	.LBB1_164
+.LBB1_25:
+	cmp	edi, 2
+	je	.LBB1_81
+# %bb.26:
+	cmp	edi, 3
+	jne	.LBB1_164
+# %bb.27:
+	mov	r14b, byte ptr [rdx]
+	lea	r13, [r10 + 31]
+	test	r10, r10
+	mov	r15, r10
+	cmovns	r13, r10
+	lea	eax, [r9 + 7]
+	test	r9d, r9d
+	cmovns	eax, r9d
+	and	eax, -8
+	sub	r9d, eax
+	je	.LBB1_31
+# %bb.28:
+	movsxd	rax, r9d
+	.p2align	4, 0x90
+.LBB1_29:                               # =>This Inner Loop Header: Depth=1
+	cmp	byte ptr [rsi], r14b
+	lea	rsi, [rsi + 1]
+	sete	dl
+	neg	dl
+	lea	rdi, [rax + 7]
+	test	rax, rax
+	cmovns	rdi, rax
+	sar	rdi, 3
+	movzx	r9d, byte ptr [r11 + rdi]
+	xor	dl, r9b
+	lea	r8d, [8*rdi]
+	mov	ecx, eax
+	sub	ecx, r8d
+	mov	ebx, 1
+                                        # kill: def $cl killed $cl killed $ecx
+	shl	ebx, cl
+	and	bl, dl
+	xor	bl, r9b
+	mov	byte ptr [r11 + rdi], bl
+	add	rax, 1
+	cmp	rax, 8
+	jne	.LBB1_29
+# %bb.30:
+	add	r11, 1
+.LBB1_31:
+	sar	r13, 5
+	cmp	r15, 32
+	jl	.LBB1_108
+# %bb.32:
+	cmp	r13, 32
+	mov	dword ptr [rsp + 28], r14d      # 4-byte Spill
+	mov	qword ptr [rsp + 280], r15      # 8-byte Spill
+	mov	qword ptr [rsp + 392], r13      # 8-byte Spill
+	jb	.LBB1_35
+# %bb.33:
+	mov	rax, r13
+	shl	rax, 5
+	add	rax, rsi
+	cmp	r11, rax
+	jae	.LBB1_165
+# %bb.34:
+	lea	rax, [r11 + 4*r13]
+	cmp	rsi, rax
+	jae	.LBB1_165
+.LBB1_35:
+	xor	eax, eax
+	mov	qword ptr [rsp + 384], rax      # 8-byte Spill
+	mov	r12, rsi
+	mov	qword ptr [rsp + 376], r11      # 8-byte Spill
+.LBB1_36:
+	sub	r13, qword ptr [rsp + 384]      # 8-byte Folded Reload
+	mov	qword ptr [rsp + 152], r13      # 8-byte Spill
+	.p2align	4, 0x90
+.LBB1_37:                               # =>This Inner Loop Header: Depth=1
+	mov	rcx, r12
+	cmp	byte ptr [r12], r14b
+	sete	byte ptr [rsp + 48]             # 1-byte Folded Spill
+	cmp	byte ptr [r12 + 1], r14b
+	sete	r8b
+	cmp	byte ptr [r12 + 2], r14b
+	sete	r15b
+	cmp	byte ptr [r12 + 3], r14b
+	sete	r13b
+	cmp	byte ptr [r12 + 4], r14b
+	sete	byte ptr [rsp + 160]            # 1-byte Folded Spill
+	cmp	byte ptr [r12 + 5], r14b
+	sete	byte ptr [rsp + 112]            # 1-byte Folded Spill
+	cmp	byte ptr [r12 + 6], r14b
+	sete	al
+	cmp	byte ptr [r12 + 7], r14b
+	sete	r11b
+	cmp	byte ptr [r12 + 8], r14b
+	sete	byte ptr [rsp + 320]            # 1-byte Folded Spill
+	cmp	byte ptr [r12 + 9], r14b
+	sete	dl
+	cmp	byte ptr [r12 + 10], r14b
+	sete	sil
+	cmp	byte ptr [r12 + 11], r14b
+	sete	dil
+	cmp	byte ptr [r12 + 12], r14b
+	sete	r10b
+	cmp	byte ptr [r12 + 13], r14b
+	sete	r12b
+	cmp	byte ptr [rcx + 14], r14b
+	sete	byte ptr [rsp + 104]            # 1-byte Folded Spill
+	cmp	byte ptr [rcx + 15], r14b
+	sete	r9b
+	cmp	byte ptr [rcx + 16], r14b
+	sete	byte ptr [rsp + 288]            # 1-byte Folded Spill
+	cmp	byte ptr [rcx + 17], r14b
+	sete	byte ptr [rsp + 128]            # 1-byte Folded Spill
+	cmp	byte ptr [rcx + 18], r14b
+	sete	byte ptr [rsp + 120]            # 1-byte Folded Spill
+	cmp	byte ptr [rcx + 19], r14b
+	sete	byte ptr [rsp + 136]            # 1-byte Folded Spill
+	cmp	byte ptr [rcx + 20], r14b
+	sete	byte ptr [rsp + 144]            # 1-byte Folded Spill
+	cmp	byte ptr [rcx + 21], r14b
+	sete	byte ptr [rsp + 72]             # 1-byte Folded Spill
+	cmp	byte ptr [rcx + 22], r14b
+	sete	byte ptr [rsp + 88]             # 1-byte Folded Spill
+	cmp	byte ptr [rcx + 23], r14b
+	sete	r14b
+	mov	ebx, dword ptr [rsp + 28]       # 4-byte Reload
+	cmp	byte ptr [rcx + 24], bl
+	sete	byte ptr [rsp + 272]            # 1-byte Folded Spill
+	mov	ebx, dword ptr [rsp + 28]       # 4-byte Reload
+	cmp	byte ptr [rcx + 25], bl
+	sete	byte ptr [rsp + 80]             # 1-byte Folded Spill
+	mov	ebx, dword ptr [rsp + 28]       # 4-byte Reload
+	cmp	byte ptr [rcx + 26], bl
+	sete	byte ptr [rsp + 96]             # 1-byte Folded Spill
+	mov	ebx, dword ptr [rsp + 28]       # 4-byte Reload
+	cmp	byte ptr [rcx + 27], bl
+	sete	byte ptr [rsp + 64]             # 1-byte Folded Spill
+	mov	ebx, dword ptr [rsp + 28]       # 4-byte Reload
+	cmp	byte ptr [rcx + 28], bl
+	sete	byte ptr [rsp + 56]             # 1-byte Folded Spill
+	mov	ebx, dword ptr [rsp + 28]       # 4-byte Reload
+	cmp	byte ptr [rcx + 29], bl
+	sete	byte ptr [rsp + 40]             # 1-byte Folded Spill
+	mov	ebx, dword ptr [rsp + 28]       # 4-byte Reload
+	cmp	byte ptr [rcx + 30], bl
+	sete	byte ptr [rsp + 32]             # 1-byte Folded Spill
+	mov	ebx, dword ptr [rsp + 28]       # 4-byte Reload
+	cmp	byte ptr [rcx + 31], bl
+	sete	bl
+	add	r8b, r8b
+	add	r8b, byte ptr [rsp + 48]        # 1-byte Folded Reload
+	shl	al, 6
+	shl	r11b, 7
+	or	r11b, al
+	shl	r15b, 2
+	or	r15b, r8b
+	add	dl, dl
+	add	dl, byte ptr [rsp + 320]        # 1-byte Folded Reload
+	shl	r13b, 3
+	or	r13b, r15b
+	shl	sil, 2
+	or	sil, dl
+	movzx	edx, byte ptr [rsp + 160]       # 1-byte Folded Reload
+	shl	dl, 4
+	or	dl, r13b
+	mov	r8d, edx
+	shl	dil, 3
+	or	dil, sil
+	movzx	edx, byte ptr [rsp + 112]       # 1-byte Folded Reload
+	shl	dl, 5
+	or	dl, r8b
+	shl	r10b, 4
+	or	r10b, dil
+	shl	r12b, 5
+	or	r12b, r10b
+	movzx	esi, byte ptr [rsp + 104]       # 1-byte Folded Reload
+	shl	sil, 6
+	shl	r9b, 7
+	or	r9b, sil
+	or	r11b, dl
+	or	r9b, r12b
+	movzx	eax, byte ptr [rsp + 128]       # 1-byte Folded Reload
+	add	al, al
+	add	al, byte ptr [rsp + 288]        # 1-byte Folded Reload
+	movzx	edx, byte ptr [rsp + 120]       # 1-byte Folded Reload
+	shl	dl, 2
+	or	dl, al
+	mov	esi, edx
+	movzx	edx, byte ptr [rsp + 136]       # 1-byte Folded Reload
+	shl	dl, 3
+	or	dl, sil
+	mov	esi, edx
+	movzx	edx, byte ptr [rsp + 144]       # 1-byte Folded Reload
+	shl	dl, 4
+	or	dl, sil
+	mov	esi, edx
+	movzx	edx, byte ptr [rsp + 72]        # 1-byte Folded Reload
+	shl	dl, 5
+	or	dl, sil
+	mov	rsi, qword ptr [rsp + 376]      # 8-byte Reload
+	mov	byte ptr [rsi], r11b
+	movzx	edi, byte ptr [rsp + 88]        # 1-byte Folded Reload
+	shl	dil, 6
+	shl	r14b, 7
+	or	r14b, dil
+	mov	byte ptr [rsi + 1], r9b
+	or	r14b, dl
+	movzx	eax, byte ptr [rsp + 80]        # 1-byte Folded Reload
+	add	al, al
+	add	al, byte ptr [rsp + 272]        # 1-byte Folded Reload
+	mov	edx, eax
+	movzx	eax, byte ptr [rsp + 96]        # 1-byte Folded Reload
+	shl	al, 2
+	or	al, dl
+	mov	edx, eax
+	movzx	eax, byte ptr [rsp + 64]        # 1-byte Folded Reload
+	shl	al, 3
+	or	al, dl
+	mov	edx, eax
+	movzx	eax, byte ptr [rsp + 56]        # 1-byte Folded Reload
+	shl	al, 4
+	or	al, dl
+	mov	edx, eax
+	movzx	eax, byte ptr [rsp + 40]        # 1-byte Folded Reload
+	shl	al, 5
+	or	al, dl
+	movzx	edx, byte ptr [rsp + 32]        # 1-byte Folded Reload
+	shl	dl, 6
+	shl	bl, 7
+	or	bl, dl
+	or	bl, al
+	mov	byte ptr [rsi + 2], r14b
+	mov	r14d, dword ptr [rsp + 28]      # 4-byte Reload
+	mov	byte ptr [rsi + 3], bl
+	lea	r12, [rcx + 32]
+	add	rsi, 4
+	mov	qword ptr [rsp + 376], rsi      # 8-byte Spill
+	add	qword ptr [rsp + 152], -1       # 8-byte Folded Spill
+	jne	.LBB1_37
+# %bb.38:
+	mov	r15, qword ptr [rsp + 280]      # 8-byte Reload
+	mov	r13, qword ptr [rsp + 392]      # 8-byte Reload
+	jmp	.LBB1_109
+.LBB1_39:
+	cmp	edi, 7
+	je	.LBB1_93
+# %bb.40:
+	cmp	edi, 8
+	jne	.LBB1_164
+# %bb.41:
+	mov	r13, qword ptr [rdx]
+	lea	r15, [r10 + 31]
+	test	r10, r10
+	cmovns	r15, r10
+	lea	eax, [r9 + 7]
+	test	r9d, r9d
+	cmovns	eax, r9d
+	and	eax, -8
+	sub	r9d, eax
+	je	.LBB1_45
+# %bb.42:
+	movsxd	rax, r9d
+	.p2align	4, 0x90
+.LBB1_43:                               # =>This Inner Loop Header: Depth=1
+	cmp	qword ptr [rsi], r13
+	lea	rsi, [rsi + 8]
+	sete	dl
+	neg	dl
+	lea	rbx, [rax + 7]
+	test	rax, rax
+	cmovns	rbx, rax
+	sar	rbx, 3
+	movzx	r8d, byte ptr [r11 + rbx]
+	xor	dl, r8b
+	lea	edi, [8*rbx]
+	mov	ecx, eax
+	sub	ecx, edi
+	mov	edi, 1
+                                        # kill: def $cl killed $cl killed $ecx
+	shl	edi, cl
+	and	dil, dl
+	xor	dil, r8b
+	mov	byte ptr [r11 + rbx], dil
+	add	rax, 1
+	cmp	rax, 8
+	jne	.LBB1_43
+# %bb.44:
+	add	r11, 1
+.LBB1_45:
+	sar	r15, 5
+	cmp	r10, 32
+	jl	.LBB1_112
+# %bb.46:
+	mov	qword ptr [rsp + 280], r10      # 8-byte Spill
+	mov	qword ptr [rsp + 176], r15      # 8-byte Spill
+	mov	qword ptr [rsp + 168], r15      # 8-byte Spill
+	.p2align	4, 0x90
+.LBB1_47:                               # =>This Inner Loop Header: Depth=1
+	mov	qword ptr [rsp + 272], r11      # 8-byte Spill
+	cmp	qword ptr [rsi], r13
+	sete	byte ptr [rsp + 152]            # 1-byte Folded Spill
+	cmp	qword ptr [rsi + 8], r13
+	sete	dil
+	cmp	qword ptr [rsi + 16], r13
+	sete	r14b
+	cmp	qword ptr [rsi + 24], r13
+	sete	byte ptr [rsp + 160]            # 1-byte Folded Spill
+	cmp	qword ptr [rsi + 32], r13
+	sete	byte ptr [rsp + 136]            # 1-byte Folded Spill
+	cmp	qword ptr [rsi + 40], r13
+	sete	byte ptr [rsp + 88]             # 1-byte Folded Spill
+	cmp	qword ptr [rsi + 48], r13
+	sete	al
+	cmp	qword ptr [rsi + 56], r13
+	sete	bl
+	cmp	qword ptr [rsi + 64], r13
+	sete	byte ptr [rsp + 104]            # 1-byte Folded Spill
+	cmp	qword ptr [rsi + 72], r13
+	sete	dl
+	cmp	qword ptr [rsi + 80], r13
+	sete	r9b
+	cmp	qword ptr [rsi + 88], r13
+	sete	r10b
+	cmp	qword ptr [rsi + 96], r13
+	sete	r11b
+	cmp	qword ptr [rsi + 104], r13
+	sete	r12b
+	cmp	qword ptr [rsi + 112], r13
+	sete	byte ptr [rsp + 112]            # 1-byte Folded Spill
+	cmp	qword ptr [rsi + 120], r13
+	sete	cl
+	cmp	qword ptr [rsi + 128], r13
+	sete	byte ptr [rsp + 72]             # 1-byte Folded Spill
+	cmp	qword ptr [rsi + 136], r13
+	sete	byte ptr [rsp + 120]            # 1-byte Folded Spill
+	cmp	qword ptr [rsi + 144], r13
+	sete	byte ptr [rsp + 128]            # 1-byte Folded Spill
+	cmp	qword ptr [rsi + 152], r13
+	sete	byte ptr [rsp + 144]            # 1-byte Folded Spill
+	cmp	qword ptr [rsi + 160], r13
+	sete	byte ptr [rsp + 80]             # 1-byte Folded Spill
+	cmp	qword ptr [rsi + 168], r13
+	sete	byte ptr [rsp + 96]             # 1-byte Folded Spill
+	cmp	qword ptr [rsi + 176], r13
+	sete	byte ptr [rsp + 64]             # 1-byte Folded Spill
+	cmp	qword ptr [rsi + 184], r13
+	sete	r15b
+	cmp	qword ptr [rsi + 192], r13
+	sete	byte ptr [rsp + 32]             # 1-byte Folded Spill
+	cmp	qword ptr [rsi + 200], r13
+	sete	byte ptr [rsp + 48]             # 1-byte Folded Spill
+	cmp	qword ptr [rsi + 208], r13
+	sete	byte ptr [rsp + 56]             # 1-byte Folded Spill
+	cmp	qword ptr [rsi + 216], r13
+	sete	byte ptr [rsp + 40]             # 1-byte Folded Spill
+	cmp	qword ptr [rsi + 224], r13
+	sete	byte ptr [rsp + 320]            # 1-byte Folded Spill
+	cmp	qword ptr [rsi + 232], r13
+	sete	byte ptr [rsp + 288]            # 1-byte Folded Spill
+	cmp	qword ptr [rsi + 240], r13
+	sete	byte ptr [rsp + 28]             # 1-byte Folded Spill
+	cmp	qword ptr [rsi + 248], r13
+	sete	r8b
+	add	dil, dil
+	add	dil, byte ptr [rsp + 152]       # 1-byte Folded Reload
+	shl	al, 6
+	shl	bl, 7
+	or	bl, al
+	shl	r14b, 2
+	or	r14b, dil
+	add	dl, dl
+	add	dl, byte ptr [rsp + 104]        # 1-byte Folded Reload
+	movzx	eax, byte ptr [rsp + 160]       # 1-byte Folded Reload
+	shl	al, 3
+	or	al, r14b
+	shl	r9b, 2
+	or	r9b, dl
+	movzx	edx, byte ptr [rsp + 136]       # 1-byte Folded Reload
+	shl	dl, 4
+	or	dl, al
+	mov	edi, edx
+	shl	r10b, 3
+	or	r10b, r9b
+	movzx	edx, byte ptr [rsp + 88]        # 1-byte Folded Reload
+	shl	dl, 5
+	or	dl, dil
+	shl	r11b, 4
+	or	r11b, r10b
+	shl	r12b, 5
+	or	r12b, r11b
+	mov	r11, qword ptr [rsp + 272]      # 8-byte Reload
+	movzx	edi, byte ptr [rsp + 112]       # 1-byte Folded Reload
+	shl	dil, 6
+	shl	cl, 7
+	or	cl, dil
+	or	bl, dl
+	or	cl, r12b
+	movzx	edx, byte ptr [rsp + 120]       # 1-byte Folded Reload
+	add	dl, dl
+	add	dl, byte ptr [rsp + 72]         # 1-byte Folded Reload
+	mov	edi, edx
+	movzx	edx, byte ptr [rsp + 128]       # 1-byte Folded Reload
+	shl	dl, 2
+	or	dl, dil
+	mov	edi, edx
+	movzx	edx, byte ptr [rsp + 144]       # 1-byte Folded Reload
+	shl	dl, 3
+	or	dl, dil
+	mov	edi, edx
+	movzx	edx, byte ptr [rsp + 80]        # 1-byte Folded Reload
+	shl	dl, 4
+	or	dl, dil
+	mov	edi, edx
+	movzx	edx, byte ptr [rsp + 96]        # 1-byte Folded Reload
+	shl	dl, 5
+	or	dl, dil
+	mov	byte ptr [r11], bl
+	movzx	ebx, byte ptr [rsp + 64]        # 1-byte Folded Reload
+	shl	bl, 6
+	shl	r15b, 7
+	or	r15b, bl
+	mov	byte ptr [r11 + 1], cl
+	or	r15b, dl
+	movzx	ecx, byte ptr [rsp + 48]        # 1-byte Folded Reload
+	add	cl, cl
+	add	cl, byte ptr [rsp + 32]         # 1-byte Folded Reload
+	mov	edx, ecx
+	movzx	ecx, byte ptr [rsp + 56]        # 1-byte Folded Reload
+	shl	cl, 2
+	or	cl, dl
+	mov	edx, ecx
+	movzx	ecx, byte ptr [rsp + 40]        # 1-byte Folded Reload
+	shl	cl, 3
+	or	cl, dl
+	mov	edx, ecx
+	movzx	ecx, byte ptr [rsp + 320]       # 1-byte Folded Reload
+	shl	cl, 4
+	or	cl, dl
+	mov	edx, ecx
+	movzx	ecx, byte ptr [rsp + 288]       # 1-byte Folded Reload
+	shl	cl, 5
+	or	cl, dl
+	movzx	edx, byte ptr [rsp + 28]        # 1-byte Folded Reload
+	shl	dl, 6
+	shl	r8b, 7
+	or	r8b, dl
+	or	r8b, cl
+	mov	byte ptr [r11 + 2], r15b
+	mov	byte ptr [r11 + 3], r8b
+	add	rsi, 256
+	add	r11, 4
+	add	qword ptr [rsp + 168], -1       # 8-byte Folded Spill
+	jne	.LBB1_47
+# %bb.48:
+	mov	r14, r11
+	mov	r10, qword ptr [rsp + 280]      # 8-byte Reload
+	mov	r15, qword ptr [rsp + 176]      # 8-byte Reload
+	shl	r15, 5
+	cmp	r15, r10
+	jl	.LBB1_113
+	jmp	.LBB1_164
+.LBB1_49:
+	movzx	r13d, word ptr [rdx]
+	lea	r15, [r10 + 31]
+	test	r10, r10
+	cmovns	r15, r10
+	lea	eax, [r9 + 7]
+	test	r9d, r9d
+	cmovns	eax, r9d
+	and	eax, -8
+	sub	r9d, eax
+	je	.LBB1_53
+# %bb.50:
+	movsxd	rax, r9d
+	.p2align	4, 0x90
+.LBB1_51:                               # =>This Inner Loop Header: Depth=1
+	cmp	word ptr [rsi], r13w
+	lea	rsi, [rsi + 2]
+	sete	dl
+	neg	dl
+	lea	rbx, [rax + 7]
+	test	rax, rax
+	cmovns	rbx, rax
+	sar	rbx, 3
+	movzx	r8d, byte ptr [r11 + rbx]
+	xor	dl, r8b
+	lea	edi, [8*rbx]
+	mov	ecx, eax
+	sub	ecx, edi
+	mov	edi, 1
+                                        # kill: def $cl killed $cl killed $ecx
+	shl	edi, cl
+	and	dil, dl
+	xor	dil, r8b
+	mov	byte ptr [r11 + rbx], dil
+	add	rax, 1
+	cmp	rax, 8
+	jne	.LBB1_51
+# %bb.52:
+	add	r11, 1
+.LBB1_53:
+	sar	r15, 5
+	cmp	r10, 32
+	jl	.LBB1_116
+# %bb.54:
+	mov	qword ptr [rsp + 280], r10      # 8-byte Spill
+	mov	qword ptr [rsp + 176], r15      # 8-byte Spill
+	mov	qword ptr [rsp + 168], r15      # 8-byte Spill
+	mov	qword ptr [rsp + 272], r11      # 8-byte Spill
+	.p2align	4, 0x90
+.LBB1_55:                               # =>This Inner Loop Header: Depth=1
+	cmp	word ptr [rsi], r13w
+	sete	al
+	cmp	word ptr [rsi + 2], r13w
+	sete	dil
+	cmp	word ptr [rsi + 4], r13w
+	sete	r14b
+	cmp	word ptr [rsi + 6], r13w
+	sete	byte ptr [rsp + 160]            # 1-byte Folded Spill
+	cmp	word ptr [rsi + 8], r13w
+	sete	byte ptr [rsp + 136]            # 1-byte Folded Spill
+	cmp	word ptr [rsi + 10], r13w
+	sete	byte ptr [rsp + 88]             # 1-byte Folded Spill
+	cmp	word ptr [rsi + 12], r13w
+	sete	byte ptr [rsp + 152]            # 1-byte Folded Spill
+	cmp	word ptr [rsi + 14], r13w
+	sete	bl
+	cmp	word ptr [rsi + 16], r13w
+	sete	byte ptr [rsp + 112]            # 1-byte Folded Spill
+	cmp	word ptr [rsi + 18], r13w
+	sete	dl
+	cmp	word ptr [rsi + 20], r13w
+	sete	r9b
+	cmp	word ptr [rsi + 22], r13w
+	sete	r10b
+	cmp	word ptr [rsi + 24], r13w
+	sete	r11b
+	cmp	word ptr [rsi + 26], r13w
+	sete	r12b
+	cmp	word ptr [rsi + 28], r13w
+	sete	byte ptr [rsp + 104]            # 1-byte Folded Spill
+	cmp	word ptr [rsi + 30], r13w
+	sete	cl
+	cmp	word ptr [rsi + 32], r13w
+	sete	byte ptr [rsp + 72]             # 1-byte Folded Spill
+	cmp	word ptr [rsi + 34], r13w
+	sete	byte ptr [rsp + 120]            # 1-byte Folded Spill
+	cmp	word ptr [rsi + 36], r13w
+	sete	byte ptr [rsp + 128]            # 1-byte Folded Spill
+	cmp	word ptr [rsi + 38], r13w
+	sete	byte ptr [rsp + 144]            # 1-byte Folded Spill
+	cmp	word ptr [rsi + 40], r13w
+	sete	byte ptr [rsp + 80]             # 1-byte Folded Spill
+	cmp	word ptr [rsi + 42], r13w
+	sete	byte ptr [rsp + 96]             # 1-byte Folded Spill
+	cmp	word ptr [rsi + 44], r13w
+	sete	byte ptr [rsp + 64]             # 1-byte Folded Spill
+	cmp	word ptr [rsi + 46], r13w
+	sete	r15b
+	cmp	word ptr [rsi + 48], r13w
+	sete	byte ptr [rsp + 32]             # 1-byte Folded Spill
+	cmp	word ptr [rsi + 50], r13w
+	sete	byte ptr [rsp + 48]             # 1-byte Folded Spill
+	cmp	word ptr [rsi + 52], r13w
+	sete	byte ptr [rsp + 56]             # 1-byte Folded Spill
+	cmp	word ptr [rsi + 54], r13w
+	sete	byte ptr [rsp + 40]             # 1-byte Folded Spill
+	cmp	word ptr [rsi + 56], r13w
+	sete	byte ptr [rsp + 320]            # 1-byte Folded Spill
+	cmp	word ptr [rsi + 58], r13w
+	sete	byte ptr [rsp + 288]            # 1-byte Folded Spill
+	cmp	word ptr [rsi + 60], r13w
+	sete	byte ptr [rsp + 28]             # 1-byte Folded Spill
+	cmp	word ptr [rsi + 62], r13w
+	sete	r8b
+	add	dil, dil
+	or	dil, al
+	movzx	eax, byte ptr [rsp + 152]       # 1-byte Folded Reload
+	shl	al, 6
+	shl	bl, 7
+	or	bl, al
+	shl	r14b, 2
+	or	r14b, dil
+	add	dl, dl
+	add	dl, byte ptr [rsp + 112]        # 1-byte Folded Reload
+	movzx	eax, byte ptr [rsp + 160]       # 1-byte Folded Reload
+	shl	al, 3
+	or	al, r14b
+	shl	r9b, 2
+	or	r9b, dl
+	movzx	edx, byte ptr [rsp + 136]       # 1-byte Folded Reload
+	shl	dl, 4
+	or	dl, al
+	mov	edi, edx
+	shl	r10b, 3
+	or	r10b, r9b
+	movzx	edx, byte ptr [rsp + 88]        # 1-byte Folded Reload
+	shl	dl, 5
+	or	dl, dil
+	shl	r11b, 4
+	or	r11b, r10b
+	shl	r12b, 5
+	or	r12b, r11b
+	movzx	edi, byte ptr [rsp + 104]       # 1-byte Folded Reload
+	shl	dil, 6
+	shl	cl, 7
+	or	cl, dil
+	or	bl, dl
+	or	cl, r12b
+	movzx	edx, byte ptr [rsp + 120]       # 1-byte Folded Reload
+	add	dl, dl
+	add	dl, byte ptr [rsp + 72]         # 1-byte Folded Reload
+	mov	edi, edx
+	movzx	edx, byte ptr [rsp + 128]       # 1-byte Folded Reload
+	shl	dl, 2
+	or	dl, dil
+	mov	edi, edx
+	movzx	edx, byte ptr [rsp + 144]       # 1-byte Folded Reload
+	shl	dl, 3
+	or	dl, dil
+	mov	edi, edx
+	movzx	edx, byte ptr [rsp + 80]        # 1-byte Folded Reload
+	shl	dl, 4
+	or	dl, dil
+	mov	edi, edx
+	movzx	edx, byte ptr [rsp + 96]        # 1-byte Folded Reload
+	shl	dl, 5
+	or	dl, dil
+	mov	edi, edx
+	mov	rdx, qword ptr [rsp + 272]      # 8-byte Reload
+	mov	byte ptr [rdx], bl
+	movzx	ebx, byte ptr [rsp + 64]        # 1-byte Folded Reload
+	shl	bl, 6
+	shl	r15b, 7
+	or	r15b, bl
+	mov	byte ptr [rdx + 1], cl
+	or	r15b, dil
+	movzx	ecx, byte ptr [rsp + 48]        # 1-byte Folded Reload
+	add	cl, cl
+	add	cl, byte ptr [rsp + 32]         # 1-byte Folded Reload
+	mov	ebx, ecx
+	movzx	ecx, byte ptr [rsp + 56]        # 1-byte Folded Reload
+	shl	cl, 2
+	or	cl, bl
+	mov	ebx, ecx
+	movzx	ecx, byte ptr [rsp + 40]        # 1-byte Folded Reload
+	shl	cl, 3
+	or	cl, bl
+	mov	ebx, ecx
+	movzx	ecx, byte ptr [rsp + 320]       # 1-byte Folded Reload
+	shl	cl, 4
+	or	cl, bl
+	mov	ebx, ecx
+	movzx	ecx, byte ptr [rsp + 288]       # 1-byte Folded Reload
+	shl	cl, 5
+	or	cl, bl
+	movzx	ebx, byte ptr [rsp + 28]        # 1-byte Folded Reload
+	shl	bl, 6
+	shl	r8b, 7
+	or	r8b, bl
+	or	r8b, cl
+	mov	byte ptr [rdx + 2], r15b
+	mov	byte ptr [rdx + 3], r8b
+	add	rsi, 64
+	add	rdx, 4
+	mov	qword ptr [rsp + 272], rdx      # 8-byte Spill
+	add	qword ptr [rsp + 168], -1       # 8-byte Folded Spill
+	jne	.LBB1_55
+# %bb.56:
+	mov	r14, qword ptr [rsp + 272]      # 8-byte Reload
+	mov	r10, qword ptr [rsp + 280]      # 8-byte Reload
+	mov	r15, qword ptr [rsp + 176]      # 8-byte Reload
+	shl	r15, 5
+	cmp	r15, r10
+	jl	.LBB1_117
+	jmp	.LBB1_164
+.LBB1_57:
+	movzx	r13d, word ptr [rdx]
+	lea	r15, [r10 + 31]
+	test	r10, r10
+	cmovns	r15, r10
+	lea	eax, [r9 + 7]
+	test	r9d, r9d
+	cmovns	eax, r9d
+	and	eax, -8
+	sub	r9d, eax
+	je	.LBB1_61
+# %bb.58:
+	movsxd	rax, r9d
+	.p2align	4, 0x90
+.LBB1_59:                               # =>This Inner Loop Header: Depth=1
+	cmp	word ptr [rsi], r13w
+	lea	rsi, [rsi + 2]
+	sete	dl
+	neg	dl
+	lea	rbx, [rax + 7]
+	test	rax, rax
+	cmovns	rbx, rax
+	sar	rbx, 3
+	movzx	r8d, byte ptr [r11 + rbx]
+	xor	dl, r8b
+	lea	edi, [8*rbx]
+	mov	ecx, eax
+	sub	ecx, edi
+	mov	edi, 1
+                                        # kill: def $cl killed $cl killed $ecx
+	shl	edi, cl
+	and	dil, dl
+	xor	dil, r8b
+	mov	byte ptr [r11 + rbx], dil
+	add	rax, 1
+	cmp	rax, 8
+	jne	.LBB1_59
+# %bb.60:
+	add	r11, 1
+.LBB1_61:
+	sar	r15, 5
+	cmp	r10, 32
+	jl	.LBB1_120
+# %bb.62:
+	mov	qword ptr [rsp + 280], r10      # 8-byte Spill
+	mov	qword ptr [rsp + 176], r15      # 8-byte Spill
+	mov	qword ptr [rsp + 168], r15      # 8-byte Spill
+	mov	qword ptr [rsp + 272], r11      # 8-byte Spill
+	.p2align	4, 0x90
+.LBB1_63:                               # =>This Inner Loop Header: Depth=1
+	cmp	word ptr [rsi], r13w
+	sete	byte ptr [rsp + 152]            # 1-byte Folded Spill
+	cmp	word ptr [rsi + 2], r13w
+	sete	dil
+	cmp	word ptr [rsi + 4], r13w
+	sete	r14b
+	cmp	word ptr [rsi + 6], r13w
+	sete	byte ptr [rsp + 160]            # 1-byte Folded Spill
+	cmp	word ptr [rsi + 8], r13w
+	sete	byte ptr [rsp + 136]            # 1-byte Folded Spill
+	cmp	word ptr [rsi + 10], r13w
+	sete	byte ptr [rsp + 88]             # 1-byte Folded Spill
+	cmp	word ptr [rsi + 12], r13w
+	sete	al
+	cmp	word ptr [rsi + 14], r13w
+	sete	bl
+	cmp	word ptr [rsi + 16], r13w
+	sete	byte ptr [rsp + 104]            # 1-byte Folded Spill
+	cmp	word ptr [rsi + 18], r13w
+	sete	dl
+	cmp	word ptr [rsi + 20], r13w
+	sete	r9b
+	cmp	word ptr [rsi + 22], r13w
+	sete	r10b
+	cmp	word ptr [rsi + 24], r13w
+	sete	r11b
+	cmp	word ptr [rsi + 26], r13w
+	sete	r12b
+	cmp	word ptr [rsi + 28], r13w
+	sete	byte ptr [rsp + 112]            # 1-byte Folded Spill
+	cmp	word ptr [rsi + 30], r13w
+	sete	cl
+	cmp	word ptr [rsi + 32], r13w
+	sete	byte ptr [rsp + 72]             # 1-byte Folded Spill
+	cmp	word ptr [rsi + 34], r13w
+	sete	byte ptr [rsp + 120]            # 1-byte Folded Spill
+	cmp	word ptr [rsi + 36], r13w
+	sete	byte ptr [rsp + 128]            # 1-byte Folded Spill
+	cmp	word ptr [rsi + 38], r13w
+	sete	byte ptr [rsp + 144]            # 1-byte Folded Spill
+	cmp	word ptr [rsi + 40], r13w
+	sete	byte ptr [rsp + 80]             # 1-byte Folded Spill
+	cmp	word ptr [rsi + 42], r13w
+	sete	byte ptr [rsp + 96]             # 1-byte Folded Spill
+	cmp	word ptr [rsi + 44], r13w
+	sete	byte ptr [rsp + 64]             # 1-byte Folded Spill
+	cmp	word ptr [rsi + 46], r13w
+	sete	r15b
+	cmp	word ptr [rsi + 48], r13w
+	sete	byte ptr [rsp + 32]             # 1-byte Folded Spill
+	cmp	word ptr [rsi + 50], r13w
+	sete	byte ptr [rsp + 48]             # 1-byte Folded Spill
+	cmp	word ptr [rsi + 52], r13w
+	sete	byte ptr [rsp + 56]             # 1-byte Folded Spill
+	cmp	word ptr [rsi + 54], r13w
+	sete	byte ptr [rsp + 40]             # 1-byte Folded Spill
+	cmp	word ptr [rsi + 56], r13w
+	sete	byte ptr [rsp + 320]            # 1-byte Folded Spill
+	cmp	word ptr [rsi + 58], r13w
+	sete	byte ptr [rsp + 288]            # 1-byte Folded Spill
+	cmp	word ptr [rsi + 60], r13w
+	sete	byte ptr [rsp + 28]             # 1-byte Folded Spill
+	cmp	word ptr [rsi + 62], r13w
+	sete	r8b
+	add	dil, dil
+	add	dil, byte ptr [rsp + 152]       # 1-byte Folded Reload
+	shl	al, 6
+	shl	bl, 7
+	or	bl, al
+	shl	r14b, 2
+	or	r14b, dil
+	add	dl, dl
+	add	dl, byte ptr [rsp + 104]        # 1-byte Folded Reload
+	movzx	eax, byte ptr [rsp + 160]       # 1-byte Folded Reload
+	shl	al, 3
+	or	al, r14b
+	shl	r9b, 2
+	or	r9b, dl
+	movzx	edx, byte ptr [rsp + 136]       # 1-byte Folded Reload
+	shl	dl, 4
+	or	dl, al
+	mov	edi, edx
+	shl	r10b, 3
+	or	r10b, r9b
+	movzx	edx, byte ptr [rsp + 88]        # 1-byte Folded Reload
+	shl	dl, 5
+	or	dl, dil
+	shl	r11b, 4
+	or	r11b, r10b
+	shl	r12b, 5
+	or	r12b, r11b
+	movzx	edi, byte ptr [rsp + 112]       # 1-byte Folded Reload
+	shl	dil, 6
+	shl	cl, 7
+	or	cl, dil
+	or	bl, dl
+	or	cl, r12b
+	movzx	edx, byte ptr [rsp + 120]       # 1-byte Folded Reload
+	add	dl, dl
+	add	dl, byte ptr [rsp + 72]         # 1-byte Folded Reload
+	mov	edi, edx
+	movzx	edx, byte ptr [rsp + 128]       # 1-byte Folded Reload
+	shl	dl, 2
+	or	dl, dil
+	mov	edi, edx
+	movzx	edx, byte ptr [rsp + 144]       # 1-byte Folded Reload
+	shl	dl, 3
+	or	dl, dil
+	mov	edi, edx
+	movzx	edx, byte ptr [rsp + 80]        # 1-byte Folded Reload
+	shl	dl, 4
+	or	dl, dil
+	mov	edi, edx
+	movzx	edx, byte ptr [rsp + 96]        # 1-byte Folded Reload
+	shl	dl, 5
+	or	dl, dil
+	mov	edi, edx
+	mov	rdx, qword ptr [rsp + 272]      # 8-byte Reload
+	mov	byte ptr [rdx], bl
+	movzx	ebx, byte ptr [rsp + 64]        # 1-byte Folded Reload
+	shl	bl, 6
+	shl	r15b, 7
+	or	r15b, bl
+	mov	byte ptr [rdx + 1], cl
+	or	r15b, dil
+	movzx	ecx, byte ptr [rsp + 48]        # 1-byte Folded Reload
+	add	cl, cl
+	add	cl, byte ptr [rsp + 32]         # 1-byte Folded Reload
+	mov	ebx, ecx
+	movzx	ecx, byte ptr [rsp + 56]        # 1-byte Folded Reload
+	shl	cl, 2
+	or	cl, bl
+	mov	ebx, ecx
+	movzx	ecx, byte ptr [rsp + 40]        # 1-byte Folded Reload
+	shl	cl, 3
+	or	cl, bl
+	mov	ebx, ecx
+	movzx	ecx, byte ptr [rsp + 320]       # 1-byte Folded Reload
+	shl	cl, 4
+	or	cl, bl
+	mov	ebx, ecx
+	movzx	ecx, byte ptr [rsp + 288]       # 1-byte Folded Reload
+	shl	cl, 5
+	or	cl, bl
+	movzx	ebx, byte ptr [rsp + 28]        # 1-byte Folded Reload
+	shl	bl, 6
+	shl	r8b, 7
+	or	r8b, bl
+	or	r8b, cl
+	mov	byte ptr [rdx + 2], r15b
+	mov	byte ptr [rdx + 3], r8b
+	add	rsi, 64
+	add	rdx, 4
+	mov	qword ptr [rsp + 272], rdx      # 8-byte Spill
+	add	qword ptr [rsp + 168], -1       # 8-byte Folded Spill
+	jne	.LBB1_63
+# %bb.64:
+	mov	r14, qword ptr [rsp + 272]      # 8-byte Reload
+	mov	r10, qword ptr [rsp + 280]      # 8-byte Reload
+	mov	r15, qword ptr [rsp + 176]      # 8-byte Reload
+	shl	r15, 5
+	cmp	r15, r10
+	jl	.LBB1_121
+	jmp	.LBB1_164
+.LBB1_65:
+	mov	r13, qword ptr [rdx]
+	lea	r15, [r10 + 31]
+	test	r10, r10
+	cmovns	r15, r10
+	lea	eax, [r9 + 7]
+	test	r9d, r9d
+	cmovns	eax, r9d
+	and	eax, -8
+	sub	r9d, eax
+	je	.LBB1_69
+# %bb.66:
+	movsxd	rax, r9d
+	.p2align	4, 0x90
+.LBB1_67:                               # =>This Inner Loop Header: Depth=1
+	cmp	qword ptr [rsi], r13
+	lea	rsi, [rsi + 8]
+	sete	dl
+	neg	dl
+	lea	rbx, [rax + 7]
+	test	rax, rax
+	cmovns	rbx, rax
+	sar	rbx, 3
+	movzx	r8d, byte ptr [r11 + rbx]
+	xor	dl, r8b
+	lea	edi, [8*rbx]
+	mov	ecx, eax
+	sub	ecx, edi
+	mov	edi, 1
+                                        # kill: def $cl killed $cl killed $ecx
+	shl	edi, cl
+	and	dil, dl
+	xor	dil, r8b
+	mov	byte ptr [r11 + rbx], dil
+	add	rax, 1
+	cmp	rax, 8
+	jne	.LBB1_67
+# %bb.68:
+	add	r11, 1
+.LBB1_69:
+	sar	r15, 5
+	cmp	r10, 32
+	jl	.LBB1_123
+# %bb.70:
+	mov	qword ptr [rsp + 280], r10      # 8-byte Spill
+	mov	qword ptr [rsp + 176], r15      # 8-byte Spill
+	mov	qword ptr [rsp + 168], r15      # 8-byte Spill
+	mov	qword ptr [rsp + 272], r11      # 8-byte Spill
+	.p2align	4, 0x90
+.LBB1_71:                               # =>This Inner Loop Header: Depth=1
+	cmp	qword ptr [rsi], r13
+	sete	byte ptr [rsp + 152]            # 1-byte Folded Spill
+	cmp	qword ptr [rsi + 8], r13
+	sete	dil
+	cmp	qword ptr [rsi + 16], r13
+	sete	r14b
+	cmp	qword ptr [rsi + 24], r13
+	sete	byte ptr [rsp + 160]            # 1-byte Folded Spill
+	cmp	qword ptr [rsi + 32], r13
+	sete	byte ptr [rsp + 136]            # 1-byte Folded Spill
+	cmp	qword ptr [rsi + 40], r13
+	sete	byte ptr [rsp + 88]             # 1-byte Folded Spill
+	cmp	qword ptr [rsi + 48], r13
+	sete	al
+	cmp	qword ptr [rsi + 56], r13
+	sete	bl
+	cmp	qword ptr [rsi + 64], r13
+	sete	byte ptr [rsp + 104]            # 1-byte Folded Spill
+	cmp	qword ptr [rsi + 72], r13
+	sete	dl
+	cmp	qword ptr [rsi + 80], r13
+	sete	r9b
+	cmp	qword ptr [rsi + 88], r13
+	sete	r10b
+	cmp	qword ptr [rsi + 96], r13
+	sete	r11b
+	cmp	qword ptr [rsi + 104], r13
+	sete	r12b
+	cmp	qword ptr [rsi + 112], r13
+	sete	byte ptr [rsp + 112]            # 1-byte Folded Spill
+	cmp	qword ptr [rsi + 120], r13
+	sete	cl
+	cmp	qword ptr [rsi + 128], r13
+	sete	byte ptr [rsp + 72]             # 1-byte Folded Spill
+	cmp	qword ptr [rsi + 136], r13
+	sete	byte ptr [rsp + 120]            # 1-byte Folded Spill
+	cmp	qword ptr [rsi + 144], r13
+	sete	byte ptr [rsp + 128]            # 1-byte Folded Spill
+	cmp	qword ptr [rsi + 152], r13
+	sete	byte ptr [rsp + 144]            # 1-byte Folded Spill
+	cmp	qword ptr [rsi + 160], r13
+	sete	byte ptr [rsp + 80]             # 1-byte Folded Spill
+	cmp	qword ptr [rsi + 168], r13
+	sete	byte ptr [rsp + 96]             # 1-byte Folded Spill
+	cmp	qword ptr [rsi + 176], r13
+	sete	byte ptr [rsp + 64]             # 1-byte Folded Spill
+	cmp	qword ptr [rsi + 184], r13
+	sete	r15b
+	cmp	qword ptr [rsi + 192], r13
+	sete	byte ptr [rsp + 32]             # 1-byte Folded Spill
+	cmp	qword ptr [rsi + 200], r13
+	sete	byte ptr [rsp + 48]             # 1-byte Folded Spill
+	cmp	qword ptr [rsi + 208], r13
+	sete	byte ptr [rsp + 56]             # 1-byte Folded Spill
+	cmp	qword ptr [rsi + 216], r13
+	sete	byte ptr [rsp + 40]             # 1-byte Folded Spill
+	cmp	qword ptr [rsi + 224], r13
+	sete	byte ptr [rsp + 320]            # 1-byte Folded Spill
+	cmp	qword ptr [rsi + 232], r13
+	sete	byte ptr [rsp + 288]            # 1-byte Folded Spill
+	cmp	qword ptr [rsi + 240], r13
+	sete	byte ptr [rsp + 28]             # 1-byte Folded Spill
+	cmp	qword ptr [rsi + 248], r13
+	sete	r8b
+	add	dil, dil
+	add	dil, byte ptr [rsp + 152]       # 1-byte Folded Reload
+	shl	al, 6
+	shl	bl, 7
+	or	bl, al
+	shl	r14b, 2
+	or	r14b, dil
+	add	dl, dl
+	add	dl, byte ptr [rsp + 104]        # 1-byte Folded Reload
+	movzx	eax, byte ptr [rsp + 160]       # 1-byte Folded Reload
+	shl	al, 3
+	or	al, r14b
+	shl	r9b, 2
+	or	r9b, dl
+	movzx	edx, byte ptr [rsp + 136]       # 1-byte Folded Reload
+	shl	dl, 4
+	or	dl, al
+	mov	edi, edx
+	shl	r10b, 3
+	or	r10b, r9b
+	movzx	edx, byte ptr [rsp + 88]        # 1-byte Folded Reload
+	shl	dl, 5
+	or	dl, dil
+	shl	r11b, 4
+	or	r11b, r10b
+	shl	r12b, 5
+	or	r12b, r11b
+	movzx	edi, byte ptr [rsp + 112]       # 1-byte Folded Reload
+	shl	dil, 6
+	shl	cl, 7
+	or	cl, dil
+	or	bl, dl
+	or	cl, r12b
+	movzx	edx, byte ptr [rsp + 120]       # 1-byte Folded Reload
+	add	dl, dl
+	add	dl, byte ptr [rsp + 72]         # 1-byte Folded Reload
+	mov	edi, edx
+	movzx	edx, byte ptr [rsp + 128]       # 1-byte Folded Reload
+	shl	dl, 2
+	or	dl, dil
+	mov	edi, edx
+	movzx	edx, byte ptr [rsp + 144]       # 1-byte Folded Reload
+	shl	dl, 3
+	or	dl, dil
+	mov	edi, edx
+	movzx	edx, byte ptr [rsp + 80]        # 1-byte Folded Reload
+	shl	dl, 4
+	or	dl, dil
+	mov	edi, edx
+	movzx	edx, byte ptr [rsp + 96]        # 1-byte Folded Reload
+	shl	dl, 5
+	or	dl, dil
+	mov	edi, edx
+	mov	rdx, qword ptr [rsp + 272]      # 8-byte Reload
+	mov	byte ptr [rdx], bl
+	movzx	ebx, byte ptr [rsp + 64]        # 1-byte Folded Reload
+	shl	bl, 6
+	shl	r15b, 7
+	or	r15b, bl
+	mov	byte ptr [rdx + 1], cl
+	or	r15b, dil
+	movzx	ecx, byte ptr [rsp + 48]        # 1-byte Folded Reload
+	add	cl, cl
+	add	cl, byte ptr [rsp + 32]         # 1-byte Folded Reload
+	mov	ebx, ecx
+	movzx	ecx, byte ptr [rsp + 56]        # 1-byte Folded Reload
+	shl	cl, 2
+	or	cl, bl
+	mov	ebx, ecx
+	movzx	ecx, byte ptr [rsp + 40]        # 1-byte Folded Reload
+	shl	cl, 3
+	or	cl, bl
+	mov	ebx, ecx
+	movzx	ecx, byte ptr [rsp + 320]       # 1-byte Folded Reload
+	shl	cl, 4
+	or	cl, bl
+	mov	ebx, ecx
+	movzx	ecx, byte ptr [rsp + 288]       # 1-byte Folded Reload
+	shl	cl, 5
+	or	cl, bl
+	movzx	ebx, byte ptr [rsp + 28]        # 1-byte Folded Reload
+	shl	bl, 6
+	shl	r8b, 7
+	or	r8b, bl
+	or	r8b, cl
+	mov	byte ptr [rdx + 2], r15b
+	mov	byte ptr [rdx + 3], r8b
+	add	rsi, 256
+	add	rdx, 4
+	mov	qword ptr [rsp + 272], rdx      # 8-byte Spill
+	add	qword ptr [rsp + 168], -1       # 8-byte Folded Spill
+	jne	.LBB1_71
+# %bb.72:
+	mov	r14, qword ptr [rsp + 272]      # 8-byte Reload
+	mov	r10, qword ptr [rsp + 280]      # 8-byte Reload
+	mov	r15, qword ptr [rsp + 176]      # 8-byte Reload
+	shl	r15, 5
+	cmp	r15, r10
+	jl	.LBB1_124
+	jmp	.LBB1_164
+.LBB1_73:
+	lea	r15, [r10 + 31]
+	test	r10, r10
+	cmovns	r15, r10
+	lea	eax, [r9 + 7]
+	test	r9d, r9d
+	cmovns	eax, r9d
+	and	eax, -8
+	vmovss	xmm0, dword ptr [rdx]           # xmm0 = mem[0],zero,zero,zero
+	sub	r9d, eax
+	je	.LBB1_77
+# %bb.74:
+	movsxd	rax, r9d
+	.p2align	4, 0x90
+.LBB1_75:                               # =>This Inner Loop Header: Depth=1
+	vucomiss	xmm0, dword ptr [rsi]
+	lea	rsi, [rsi + 4]
+	sete	dl
+	neg	dl
+	lea	rdi, [rax + 7]
+	test	rax, rax
+	cmovns	rdi, rax
+	sar	rdi, 3
+	movzx	r9d, byte ptr [r11 + rdi]
+	xor	dl, r9b
+	lea	r8d, [8*rdi]
+	mov	ecx, eax
+	sub	ecx, r8d
+	mov	ebx, 1
+                                        # kill: def $cl killed $cl killed $ecx
+	shl	ebx, cl
+	and	bl, dl
+	xor	bl, r9b
+	mov	byte ptr [r11 + rdi], bl
+	add	rax, 1
+	cmp	rax, 8
+	jne	.LBB1_75
+# %bb.76:
+	add	r11, 1
+.LBB1_77:
+	sar	r15, 5
+	cmp	r10, 32
+	jl	.LBB1_126
+# %bb.78:
+	mov	qword ptr [rsp + 280], r10      # 8-byte Spill
+	mov	qword ptr [rsp + 168], r15      # 8-byte Spill
+	mov	qword ptr [rsp + 152], r15      # 8-byte Spill
+	mov	qword ptr [rsp + 272], r11      # 8-byte Spill
+	.p2align	4, 0x90
+.LBB1_79:                               # =>This Inner Loop Header: Depth=1
+	vucomiss	xmm0, dword ptr [rsi]
+	sete	byte ptr [rsp + 160]            # 1-byte Folded Spill
+	vucomiss	xmm0, dword ptr [rsi + 4]
+	sete	r9b
+	vucomiss	xmm0, dword ptr [rsi + 8]
+	sete	r14b
+	vucomiss	xmm0, dword ptr [rsi + 12]
+	sete	r13b
+	vucomiss	xmm0, dword ptr [rsi + 16]
+	sete	byte ptr [rsp + 136]            # 1-byte Folded Spill
+	vucomiss	xmm0, dword ptr [rsi + 20]
+	sete	byte ptr [rsp + 88]             # 1-byte Folded Spill
+	vucomiss	xmm0, dword ptr [rsi + 24]
+	sete	al
+	vucomiss	xmm0, dword ptr [rsi + 28]
+	sete	bl
+	vucomiss	xmm0, dword ptr [rsi + 32]
+	sete	byte ptr [rsp + 112]            # 1-byte Folded Spill
+	vucomiss	xmm0, dword ptr [rsi + 36]
+	sete	dl
+	vucomiss	xmm0, dword ptr [rsi + 40]
+	sete	dil
+	vucomiss	xmm0, dword ptr [rsi + 44]
+	sete	r10b
+	vucomiss	xmm0, dword ptr [rsi + 48]
+	sete	r11b
+	vucomiss	xmm0, dword ptr [rsi + 52]
+	sete	r12b
+	vucomiss	xmm0, dword ptr [rsi + 56]
+	sete	byte ptr [rsp + 120]            # 1-byte Folded Spill
+	vucomiss	xmm0, dword ptr [rsi + 60]
+	sete	cl
+	vucomiss	xmm0, dword ptr [rsi + 64]
+	sete	byte ptr [rsp + 72]             # 1-byte Folded Spill
+	vucomiss	xmm0, dword ptr [rsi + 68]
+	sete	byte ptr [rsp + 104]            # 1-byte Folded Spill
+	vucomiss	xmm0, dword ptr [rsi + 72]
+	sete	byte ptr [rsp + 128]            # 1-byte Folded Spill
+	vucomiss	xmm0, dword ptr [rsi + 76]
+	sete	byte ptr [rsp + 144]            # 1-byte Folded Spill
+	vucomiss	xmm0, dword ptr [rsi + 80]
+	sete	byte ptr [rsp + 80]             # 1-byte Folded Spill
+	vucomiss	xmm0, dword ptr [rsi + 84]
+	sete	byte ptr [rsp + 96]             # 1-byte Folded Spill
+	vucomiss	xmm0, dword ptr [rsi + 88]
+	sete	byte ptr [rsp + 64]             # 1-byte Folded Spill
+	vucomiss	xmm0, dword ptr [rsi + 92]
+	sete	r15b
+	vucomiss	xmm0, dword ptr [rsi + 96]
+	sete	byte ptr [rsp + 32]             # 1-byte Folded Spill
+	vucomiss	xmm0, dword ptr [rsi + 100]
+	sete	byte ptr [rsp + 48]             # 1-byte Folded Spill
+	vucomiss	xmm0, dword ptr [rsi + 104]
+	sete	byte ptr [rsp + 56]             # 1-byte Folded Spill
+	vucomiss	xmm0, dword ptr [rsi + 108]
+	sete	byte ptr [rsp + 40]             # 1-byte Folded Spill
+	vucomiss	xmm0, dword ptr [rsi + 112]
+	sete	byte ptr [rsp + 320]            # 1-byte Folded Spill
+	vucomiss	xmm0, dword ptr [rsi + 116]
+	sete	byte ptr [rsp + 288]            # 1-byte Folded Spill
+	vucomiss	xmm0, dword ptr [rsi + 120]
+	sete	byte ptr [rsp + 28]             # 1-byte Folded Spill
+	vucomiss	xmm0, dword ptr [rsi + 124]
+	sete	r8b
+	add	r9b, r9b
+	add	r9b, byte ptr [rsp + 160]       # 1-byte Folded Reload
+	shl	al, 6
+	shl	bl, 7
+	or	bl, al
+	shl	r14b, 2
+	or	r14b, r9b
+	add	dl, dl
+	add	dl, byte ptr [rsp + 112]        # 1-byte Folded Reload
+	shl	r13b, 3
+	or	r13b, r14b
+	shl	dil, 2
+	or	dil, dl
+	movzx	edx, byte ptr [rsp + 136]       # 1-byte Folded Reload
+	shl	dl, 4
+	or	dl, r13b
+	mov	r9d, edx
+	shl	r10b, 3
+	or	r10b, dil
+	movzx	edx, byte ptr [rsp + 88]        # 1-byte Folded Reload
+	shl	dl, 5
+	or	dl, r9b
+	shl	r11b, 4
+	or	r11b, r10b
+	shl	r12b, 5
+	or	r12b, r11b
+	movzx	edi, byte ptr [rsp + 120]       # 1-byte Folded Reload
+	shl	dil, 6
+	shl	cl, 7
+	or	cl, dil
+	or	bl, dl
+	or	cl, r12b
+	movzx	eax, byte ptr [rsp + 104]       # 1-byte Folded Reload
+	add	al, al
+	add	al, byte ptr [rsp + 72]         # 1-byte Folded Reload
+	movzx	edx, byte ptr [rsp + 128]       # 1-byte Folded Reload
+	shl	dl, 2
+	or	dl, al
+	mov	edi, edx
+	movzx	edx, byte ptr [rsp + 144]       # 1-byte Folded Reload
+	shl	dl, 3
+	or	dl, dil
+	mov	edi, edx
+	movzx	edx, byte ptr [rsp + 80]        # 1-byte Folded Reload
+	shl	dl, 4
+	or	dl, dil
+	mov	edi, edx
+	movzx	edx, byte ptr [rsp + 96]        # 1-byte Folded Reload
+	shl	dl, 5
+	or	dl, dil
+	mov	edi, edx
+	mov	rdx, qword ptr [rsp + 272]      # 8-byte Reload
+	mov	byte ptr [rdx], bl
+	movzx	ebx, byte ptr [rsp + 64]        # 1-byte Folded Reload
+	shl	bl, 6
+	shl	r15b, 7
+	or	r15b, bl
+	mov	byte ptr [rdx + 1], cl
+	or	r15b, dil
+	movzx	ecx, byte ptr [rsp + 48]        # 1-byte Folded Reload
+	add	cl, cl
+	add	cl, byte ptr [rsp + 32]         # 1-byte Folded Reload
+	mov	ebx, ecx
+	movzx	ecx, byte ptr [rsp + 56]        # 1-byte Folded Reload
+	shl	cl, 2
+	or	cl, bl
+	mov	ebx, ecx
+	movzx	ecx, byte ptr [rsp + 40]        # 1-byte Folded Reload
+	shl	cl, 3
+	or	cl, bl
+	mov	ebx, ecx
+	movzx	ecx, byte ptr [rsp + 320]       # 1-byte Folded Reload
+	shl	cl, 4
+	or	cl, bl
+	mov	ebx, ecx
+	movzx	ecx, byte ptr [rsp + 288]       # 1-byte Folded Reload
+	shl	cl, 5
+	or	cl, bl
+	movzx	ebx, byte ptr [rsp + 28]        # 1-byte Folded Reload
+	shl	bl, 6
+	shl	r8b, 7
+	or	r8b, bl
+	or	r8b, cl
+	mov	byte ptr [rdx + 2], r15b
+	mov	byte ptr [rdx + 3], r8b
+	add	rsi, 128
+	add	rdx, 4
+	mov	qword ptr [rsp + 272], rdx      # 8-byte Spill
+	add	qword ptr [rsp + 152], -1       # 8-byte Folded Spill
+	jne	.LBB1_79
+# %bb.80:
+	mov	r14, qword ptr [rsp + 272]      # 8-byte Reload
+	mov	r10, qword ptr [rsp + 280]      # 8-byte Reload
+	mov	r15, qword ptr [rsp + 168]      # 8-byte Reload
+	shl	r15, 5
+	cmp	r15, r10
+	jl	.LBB1_127
+	jmp	.LBB1_164
+.LBB1_81:
+	mov	r14b, byte ptr [rdx]
+	lea	r15, [r10 + 31]
+	test	r10, r10
+	cmovns	r15, r10
+	lea	eax, [r9 + 7]
+	test	r9d, r9d
+	cmovns	eax, r9d
+	and	eax, -8
+	sub	r9d, eax
+	je	.LBB1_85
+# %bb.82:
+	movsxd	rax, r9d
+	.p2align	4, 0x90
+.LBB1_83:                               # =>This Inner Loop Header: Depth=1
+	cmp	byte ptr [rsi], r14b
+	lea	rsi, [rsi + 1]
+	sete	dl
+	neg	dl
+	lea	rdi, [rax + 7]
+	test	rax, rax
+	cmovns	rdi, rax
+	sar	rdi, 3
+	movzx	r9d, byte ptr [r11 + rdi]
+	xor	dl, r9b
+	lea	r8d, [8*rdi]
+	mov	ecx, eax
+	sub	ecx, r8d
+	mov	ebx, 1
+                                        # kill: def $cl killed $cl killed $ecx
+	shl	ebx, cl
+	and	bl, dl
+	xor	bl, r9b
+	mov	byte ptr [r11 + rdi], bl
+	add	rax, 1
+	cmp	rax, 8
+	jne	.LBB1_83
+# %bb.84:
+	add	r11, 1
+.LBB1_85:
+	sar	r15, 5
+	cmp	r10, 32
+	jl	.LBB1_129
+# %bb.86:
+	cmp	r15, 32
+	mov	dword ptr [rsp + 28], r14d      # 4-byte Spill
+	mov	qword ptr [rsp + 280], r10      # 8-byte Spill
+	mov	qword ptr [rsp + 392], r15      # 8-byte Spill
+	jb	.LBB1_89
+# %bb.87:
+	mov	rax, r15
+	shl	rax, 5
+	add	rax, rsi
+	cmp	r11, rax
+	jae	.LBB1_168
+# %bb.88:
+	lea	rax, [r11 + 4*r15]
+	cmp	rsi, rax
+	jae	.LBB1_168
+.LBB1_89:
+	xor	eax, eax
+	mov	qword ptr [rsp + 384], rax      # 8-byte Spill
+	mov	r12, rsi
+	mov	qword ptr [rsp + 376], r11      # 8-byte Spill
+.LBB1_90:
+	sub	r15, qword ptr [rsp + 384]      # 8-byte Folded Reload
+	mov	qword ptr [rsp + 152], r15      # 8-byte Spill
+	.p2align	4, 0x90
+.LBB1_91:                               # =>This Inner Loop Header: Depth=1
+	mov	rcx, r12
+	cmp	byte ptr [r12], r14b
+	sete	byte ptr [rsp + 32]             # 1-byte Folded Spill
+	cmp	byte ptr [r12 + 1], r14b
+	sete	r8b
+	cmp	byte ptr [r12 + 2], r14b
+	sete	r15b
+	cmp	byte ptr [r12 + 3], r14b
+	sete	r13b
+	cmp	byte ptr [r12 + 4], r14b
+	sete	byte ptr [rsp + 160]            # 1-byte Folded Spill
+	cmp	byte ptr [r12 + 5], r14b
+	sete	byte ptr [rsp + 112]            # 1-byte Folded Spill
+	cmp	byte ptr [r12 + 6], r14b
+	sete	al
+	cmp	byte ptr [r12 + 7], r14b
+	sete	r11b
+	cmp	byte ptr [r12 + 8], r14b
+	sete	byte ptr [rsp + 320]            # 1-byte Folded Spill
+	cmp	byte ptr [r12 + 9], r14b
+	sete	dl
+	cmp	byte ptr [r12 + 10], r14b
+	sete	sil
+	cmp	byte ptr [r12 + 11], r14b
+	sete	dil
+	cmp	byte ptr [r12 + 12], r14b
+	sete	r10b
+	cmp	byte ptr [r12 + 13], r14b
+	sete	r12b
+	cmp	byte ptr [rcx + 14], r14b
+	sete	byte ptr [rsp + 104]            # 1-byte Folded Spill
+	cmp	byte ptr [rcx + 15], r14b
+	sete	r9b
+	cmp	byte ptr [rcx + 16], r14b
+	sete	byte ptr [rsp + 288]            # 1-byte Folded Spill
+	cmp	byte ptr [rcx + 17], r14b
+	sete	byte ptr [rsp + 136]            # 1-byte Folded Spill
+	cmp	byte ptr [rcx + 18], r14b
+	sete	byte ptr [rsp + 120]            # 1-byte Folded Spill
+	cmp	byte ptr [rcx + 19], r14b
+	sete	byte ptr [rsp + 128]            # 1-byte Folded Spill
+	cmp	byte ptr [rcx + 20], r14b
+	sete	byte ptr [rsp + 144]            # 1-byte Folded Spill
+	cmp	byte ptr [rcx + 21], r14b
+	sete	byte ptr [rsp + 72]             # 1-byte Folded Spill
+	cmp	byte ptr [rcx + 22], r14b
+	sete	byte ptr [rsp + 80]             # 1-byte Folded Spill
+	cmp	byte ptr [rcx + 23], r14b
+	sete	r14b
+	mov	ebx, dword ptr [rsp + 28]       # 4-byte Reload
+	cmp	byte ptr [rcx + 24], bl
+	sete	byte ptr [rsp + 272]            # 1-byte Folded Spill
+	mov	ebx, dword ptr [rsp + 28]       # 4-byte Reload
+	cmp	byte ptr [rcx + 25], bl
+	sete	byte ptr [rsp + 88]             # 1-byte Folded Spill
+	mov	ebx, dword ptr [rsp + 28]       # 4-byte Reload
+	cmp	byte ptr [rcx + 26], bl
+	sete	byte ptr [rsp + 96]             # 1-byte Folded Spill
+	mov	ebx, dword ptr [rsp + 28]       # 4-byte Reload
+	cmp	byte ptr [rcx + 27], bl
+	sete	byte ptr [rsp + 64]             # 1-byte Folded Spill
+	mov	ebx, dword ptr [rsp + 28]       # 4-byte Reload
+	cmp	byte ptr [rcx + 28], bl
+	sete	byte ptr [rsp + 48]             # 1-byte Folded Spill
+	mov	ebx, dword ptr [rsp + 28]       # 4-byte Reload
+	cmp	byte ptr [rcx + 29], bl
+	sete	byte ptr [rsp + 56]             # 1-byte Folded Spill
+	mov	ebx, dword ptr [rsp + 28]       # 4-byte Reload
+	cmp	byte ptr [rcx + 30], bl
+	sete	byte ptr [rsp + 40]             # 1-byte Folded Spill
+	mov	ebx, dword ptr [rsp + 28]       # 4-byte Reload
+	cmp	byte ptr [rcx + 31], bl
+	sete	bl
+	add	r8b, r8b
+	add	r8b, byte ptr [rsp + 32]        # 1-byte Folded Reload
+	shl	al, 6
+	shl	r11b, 7
+	or	r11b, al
+	shl	r15b, 2
+	or	r15b, r8b
+	add	dl, dl
+	add	dl, byte ptr [rsp + 320]        # 1-byte Folded Reload
+	shl	r13b, 3
+	or	r13b, r15b
+	shl	sil, 2
+	or	sil, dl
+	movzx	edx, byte ptr [rsp + 160]       # 1-byte Folded Reload
+	shl	dl, 4
+	or	dl, r13b
+	mov	r8d, edx
+	shl	dil, 3
+	or	dil, sil
+	movzx	edx, byte ptr [rsp + 112]       # 1-byte Folded Reload
+	shl	dl, 5
+	or	dl, r8b
+	shl	r10b, 4
+	or	r10b, dil
+	shl	r12b, 5
+	or	r12b, r10b
+	movzx	esi, byte ptr [rsp + 104]       # 1-byte Folded Reload
+	shl	sil, 6
+	shl	r9b, 7
+	or	r9b, sil
+	or	r11b, dl
+	or	r9b, r12b
+	movzx	eax, byte ptr [rsp + 136]       # 1-byte Folded Reload
+	add	al, al
+	add	al, byte ptr [rsp + 288]        # 1-byte Folded Reload
+	movzx	edx, byte ptr [rsp + 120]       # 1-byte Folded Reload
+	shl	dl, 2
+	or	dl, al
+	mov	esi, edx
+	movzx	edx, byte ptr [rsp + 128]       # 1-byte Folded Reload
+	shl	dl, 3
+	or	dl, sil
+	mov	esi, edx
+	movzx	edx, byte ptr [rsp + 144]       # 1-byte Folded Reload
+	shl	dl, 4
+	or	dl, sil
+	mov	esi, edx
+	movzx	edx, byte ptr [rsp + 72]        # 1-byte Folded Reload
+	shl	dl, 5
+	or	dl, sil
+	mov	rsi, qword ptr [rsp + 376]      # 8-byte Reload
+	mov	byte ptr [rsi], r11b
+	movzx	edi, byte ptr [rsp + 80]        # 1-byte Folded Reload
+	shl	dil, 6
+	shl	r14b, 7
+	or	r14b, dil
+	mov	byte ptr [rsi + 1], r9b
+	or	r14b, dl
+	movzx	eax, byte ptr [rsp + 88]        # 1-byte Folded Reload
+	add	al, al
+	add	al, byte ptr [rsp + 272]        # 1-byte Folded Reload
+	mov	edx, eax
+	movzx	eax, byte ptr [rsp + 96]        # 1-byte Folded Reload
+	shl	al, 2
+	or	al, dl
+	mov	edx, eax
+	movzx	eax, byte ptr [rsp + 64]        # 1-byte Folded Reload
+	shl	al, 3
+	or	al, dl
+	mov	edx, eax
+	movzx	eax, byte ptr [rsp + 48]        # 1-byte Folded Reload
+	shl	al, 4
+	or	al, dl
+	mov	edx, eax
+	movzx	eax, byte ptr [rsp + 56]        # 1-byte Folded Reload
+	shl	al, 5
+	or	al, dl
+	movzx	edx, byte ptr [rsp + 40]        # 1-byte Folded Reload
+	shl	dl, 6
+	shl	bl, 7
+	or	bl, dl
+	or	bl, al
+	mov	byte ptr [rsi + 2], r14b
+	mov	r14d, dword ptr [rsp + 28]      # 4-byte Reload
+	mov	byte ptr [rsi + 3], bl
+	lea	r12, [rcx + 32]
+	add	rsi, 4
+	mov	qword ptr [rsp + 376], rsi      # 8-byte Spill
+	add	qword ptr [rsp + 152], -1       # 8-byte Folded Spill
+	jne	.LBB1_91
+# %bb.92:
+	mov	r10, qword ptr [rsp + 280]      # 8-byte Reload
+	mov	r15, qword ptr [rsp + 392]      # 8-byte Reload
+	jmp	.LBB1_130
+.LBB1_93:
+	mov	r13d, dword ptr [rdx]
+	lea	r15, [r10 + 31]
+	test	r10, r10
+	cmovns	r15, r10
+	lea	eax, [r9 + 7]
+	test	r9d, r9d
+	cmovns	eax, r9d
+	and	eax, -8
+	sub	r9d, eax
+	je	.LBB1_97
+# %bb.94:
+	movsxd	rax, r9d
+	.p2align	4, 0x90
+.LBB1_95:                               # =>This Inner Loop Header: Depth=1
+	cmp	dword ptr [rsi], r13d
+	lea	rsi, [rsi + 4]
+	sete	dl
+	neg	dl
+	lea	rbx, [rax + 7]
+	test	rax, rax
+	cmovns	rbx, rax
+	sar	rbx, 3
+	movzx	r8d, byte ptr [r11 + rbx]
+	xor	dl, r8b
+	lea	edi, [8*rbx]
+	mov	ecx, eax
+	sub	ecx, edi
+	mov	edi, 1
+                                        # kill: def $cl killed $cl killed $ecx
+	shl	edi, cl
+	and	dil, dl
+	xor	dil, r8b
+	mov	byte ptr [r11 + rbx], dil
+	add	rax, 1
+	cmp	rax, 8
+	jne	.LBB1_95
+# %bb.96:
+	add	r11, 1
+.LBB1_97:
+	sar	r15, 5
+	cmp	r10, 32
+	jl	.LBB1_133
+# %bb.98:
+	mov	qword ptr [rsp + 280], r10      # 8-byte Spill
+	mov	qword ptr [rsp + 176], r15      # 8-byte Spill
+	mov	qword ptr [rsp + 168], r15      # 8-byte Spill
+	.p2align	4, 0x90
+.LBB1_99:                               # =>This Inner Loop Header: Depth=1
+	mov	qword ptr [rsp + 272], r11      # 8-byte Spill
+	cmp	dword ptr [rsi], r13d
+	sete	byte ptr [rsp + 152]            # 1-byte Folded Spill
+	cmp	dword ptr [rsi + 4], r13d
+	sete	dil
+	cmp	dword ptr [rsi + 8], r13d
+	sete	r14b
+	cmp	dword ptr [rsi + 12], r13d
+	sete	byte ptr [rsp + 160]            # 1-byte Folded Spill
+	cmp	dword ptr [rsi + 16], r13d
+	sete	byte ptr [rsp + 136]            # 1-byte Folded Spill
+	cmp	dword ptr [rsi + 20], r13d
+	sete	byte ptr [rsp + 88]             # 1-byte Folded Spill
+	cmp	dword ptr [rsi + 24], r13d
+	sete	al
+	cmp	dword ptr [rsi + 28], r13d
+	sete	bl
+	cmp	dword ptr [rsi + 32], r13d
+	sete	byte ptr [rsp + 104]            # 1-byte Folded Spill
+	cmp	dword ptr [rsi + 36], r13d
+	sete	dl
+	cmp	dword ptr [rsi + 40], r13d
+	sete	r9b
+	cmp	dword ptr [rsi + 44], r13d
+	sete	r10b
+	cmp	dword ptr [rsi + 48], r13d
+	sete	r11b
+	cmp	dword ptr [rsi + 52], r13d
+	sete	r12b
+	cmp	dword ptr [rsi + 56], r13d
+	sete	byte ptr [rsp + 112]            # 1-byte Folded Spill
+	cmp	dword ptr [rsi + 60], r13d
+	sete	cl
+	cmp	dword ptr [rsi + 64], r13d
+	sete	byte ptr [rsp + 72]             # 1-byte Folded Spill
+	cmp	dword ptr [rsi + 68], r13d
+	sete	byte ptr [rsp + 120]            # 1-byte Folded Spill
+	cmp	dword ptr [rsi + 72], r13d
+	sete	byte ptr [rsp + 128]            # 1-byte Folded Spill
+	cmp	dword ptr [rsi + 76], r13d
+	sete	byte ptr [rsp + 144]            # 1-byte Folded Spill
+	cmp	dword ptr [rsi + 80], r13d
+	sete	byte ptr [rsp + 80]             # 1-byte Folded Spill
+	cmp	dword ptr [rsi + 84], r13d
+	sete	byte ptr [rsp + 96]             # 1-byte Folded Spill
+	cmp	dword ptr [rsi + 88], r13d
+	sete	byte ptr [rsp + 64]             # 1-byte Folded Spill
+	cmp	dword ptr [rsi + 92], r13d
+	sete	r15b
+	cmp	dword ptr [rsi + 96], r13d
+	sete	byte ptr [rsp + 32]             # 1-byte Folded Spill
+	cmp	dword ptr [rsi + 100], r13d
+	sete	byte ptr [rsp + 48]             # 1-byte Folded Spill
+	cmp	dword ptr [rsi + 104], r13d
+	sete	byte ptr [rsp + 56]             # 1-byte Folded Spill
+	cmp	dword ptr [rsi + 108], r13d
+	sete	byte ptr [rsp + 40]             # 1-byte Folded Spill
+	cmp	dword ptr [rsi + 112], r13d
+	sete	byte ptr [rsp + 320]            # 1-byte Folded Spill
+	cmp	dword ptr [rsi + 116], r13d
+	sete	byte ptr [rsp + 288]            # 1-byte Folded Spill
+	cmp	dword ptr [rsi + 120], r13d
+	sete	byte ptr [rsp + 28]             # 1-byte Folded Spill
+	cmp	dword ptr [rsi + 124], r13d
+	sete	r8b
+	add	dil, dil
+	add	dil, byte ptr [rsp + 152]       # 1-byte Folded Reload
+	shl	al, 6
+	shl	bl, 7
+	or	bl, al
+	shl	r14b, 2
+	or	r14b, dil
+	add	dl, dl
+	add	dl, byte ptr [rsp + 104]        # 1-byte Folded Reload
+	movzx	eax, byte ptr [rsp + 160]       # 1-byte Folded Reload
+	shl	al, 3
+	or	al, r14b
+	shl	r9b, 2
+	or	r9b, dl
+	movzx	edx, byte ptr [rsp + 136]       # 1-byte Folded Reload
+	shl	dl, 4
+	or	dl, al
+	mov	edi, edx
+	shl	r10b, 3
+	or	r10b, r9b
+	movzx	edx, byte ptr [rsp + 88]        # 1-byte Folded Reload
+	shl	dl, 5
+	or	dl, dil
+	shl	r11b, 4
+	or	r11b, r10b
+	shl	r12b, 5
+	or	r12b, r11b
+	mov	r11, qword ptr [rsp + 272]      # 8-byte Reload
+	movzx	edi, byte ptr [rsp + 112]       # 1-byte Folded Reload
+	shl	dil, 6
+	shl	cl, 7
+	or	cl, dil
+	or	bl, dl
+	or	cl, r12b
+	movzx	edx, byte ptr [rsp + 120]       # 1-byte Folded Reload
+	add	dl, dl
+	add	dl, byte ptr [rsp + 72]         # 1-byte Folded Reload
+	mov	edi, edx
+	movzx	edx, byte ptr [rsp + 128]       # 1-byte Folded Reload
+	shl	dl, 2
+	or	dl, dil
+	mov	edi, edx
+	movzx	edx, byte ptr [rsp + 144]       # 1-byte Folded Reload
+	shl	dl, 3
+	or	dl, dil
+	mov	edi, edx
+	movzx	edx, byte ptr [rsp + 80]        # 1-byte Folded Reload
+	shl	dl, 4
+	or	dl, dil
+	mov	edi, edx
+	movzx	edx, byte ptr [rsp + 96]        # 1-byte Folded Reload
+	shl	dl, 5
+	or	dl, dil
+	mov	byte ptr [r11], bl
+	movzx	ebx, byte ptr [rsp + 64]        # 1-byte Folded Reload
+	shl	bl, 6
+	shl	r15b, 7
+	or	r15b, bl
+	mov	byte ptr [r11 + 1], cl
+	or	r15b, dl
+	movzx	ecx, byte ptr [rsp + 48]        # 1-byte Folded Reload
+	add	cl, cl
+	add	cl, byte ptr [rsp + 32]         # 1-byte Folded Reload
+	mov	edx, ecx
+	movzx	ecx, byte ptr [rsp + 56]        # 1-byte Folded Reload
+	shl	cl, 2
+	or	cl, dl
+	mov	edx, ecx
+	movzx	ecx, byte ptr [rsp + 40]        # 1-byte Folded Reload
+	shl	cl, 3
+	or	cl, dl
+	mov	edx, ecx
+	movzx	ecx, byte ptr [rsp + 320]       # 1-byte Folded Reload
+	shl	cl, 4
+	or	cl, dl
+	mov	edx, ecx
+	movzx	ecx, byte ptr [rsp + 288]       # 1-byte Folded Reload
+	shl	cl, 5
+	or	cl, dl
+	movzx	edx, byte ptr [rsp + 28]        # 1-byte Folded Reload
+	shl	dl, 6
+	shl	r8b, 7
+	or	r8b, dl
+	or	r8b, cl
+	mov	byte ptr [r11 + 2], r15b
+	mov	byte ptr [r11 + 3], r8b
+	add	rsi, 128
+	add	r11, 4
+	add	qword ptr [rsp + 168], -1       # 8-byte Folded Spill
+	jne	.LBB1_99
+# %bb.100:
+	mov	r14, r11
+	mov	r10, qword ptr [rsp + 280]      # 8-byte Reload
+	mov	r15, qword ptr [rsp + 176]      # 8-byte Reload
+	shl	r15, 5
+	cmp	r15, r10
+	jl	.LBB1_134
+	jmp	.LBB1_164
+.LBB1_101:
+	mov	r14, r11
+	shl	r15, 5
+	cmp	r15, r10
+	jge	.LBB1_164
+.LBB1_102:
+	mov	r8, r10
+	sub	r8, r15
+	not	r15
+	add	r15, r10
+	je	.LBB1_135
+# %bb.103:
+	mov	r10, r8
+	and	r10, -2
+	xor	r11d, r11d
+	mov	r15, r14
+	.p2align	4, 0x90
+.LBB1_104:                              # =>This Inner Loop Header: Depth=1
+	cmp	dword ptr [rsi], r13d
+	sete	al
+	neg	al
+	mov	rdi, r11
+	shr	rdi, 3
+	movzx	r9d, byte ptr [r15 + rdi]
+	mov	ecx, r11d
+	and	cl, 6
+	mov	bl, 1
+	shl	bl, cl
+	xor	al, r9b
+	and	bl, al
+	xor	bl, r9b
+	mov	byte ptr [r15 + rdi], bl
+	add	r11, 2
+	cmp	dword ptr [rsi + 4], r13d
+	lea	rsi, [rsi + 8]
+	sete	al
+	neg	al
+	xor	al, bl
+	or	cl, 1
+	mov	dl, 1
+	shl	dl, cl
+	and	dl, al
+	xor	dl, bl
+	mov	byte ptr [r15 + rdi], dl
+	cmp	r10, r11
+	jne	.LBB1_104
+	jmp	.LBB1_161
+.LBB1_105:
+	mov	r14, r11
+	shl	r15, 5
+	cmp	r15, r10
+	jge	.LBB1_164
+.LBB1_106:
+	mov	r8, r10
+	sub	r8, r15
+	not	r15
+	add	r15, r10
+	jne	.LBB1_136
+# %bb.107:
+	xor	r11d, r11d
+	jmp	.LBB1_138
+.LBB1_108:
+	mov	qword ptr [rsp + 376], r11      # 8-byte Spill
+	mov	r12, rsi
+.LBB1_109:
+	shl	r13, 5
+	cmp	r13, r15
+	jge	.LBB1_164
+# %bb.110:
+	mov	r8, r15
+	sub	r8, r13
+	not	r13
+	add	r13, r15
+	je	.LBB1_132
+# %bb.140:
+	mov	r10, r8
+	and	r10, -2
+	xor	esi, esi
+	mov	r11, qword ptr [rsp + 376]      # 8-byte Reload
+	.p2align	4, 0x90
+.LBB1_141:                              # =>This Inner Loop Header: Depth=1
+	cmp	byte ptr [r12 + rsi], r14b
+	sete	bl
+	neg	bl
+	mov	rdi, rsi
+	shr	rdi, 3
+	mov	ecx, esi
+	and	cl, 6
+	mov	dl, 1
+	shl	dl, cl
+	movzx	r9d, byte ptr [r11 + rdi]
+	xor	bl, r9b
+	and	dl, bl
+	xor	dl, r9b
+	mov	byte ptr [r11 + rdi], dl
+	cmp	byte ptr [r12 + rsi + 1], r14b
+	lea	rsi, [rsi + 2]
+	sete	bl
+	neg	bl
+	xor	bl, dl
+	or	cl, 1
+	mov	al, 1
+	shl	al, cl
+	and	al, bl
+	xor	al, dl
+	mov	byte ptr [r11 + rdi], al
+	cmp	r10, rsi
+	jne	.LBB1_141
+	jmp	.LBB1_156
+.LBB1_112:
+	mov	r14, r11
+	shl	r15, 5
+	cmp	r15, r10
+	jge	.LBB1_164
+.LBB1_113:
+	mov	r8, r10
+	sub	r8, r15
+	not	r15
+	add	r15, r10
+	je	.LBB1_125
+# %bb.114:
+	mov	r10, r8
+	and	r10, -2
+	xor	r11d, r11d
+	mov	r15, r14
+	.p2align	4, 0x90
+.LBB1_115:                              # =>This Inner Loop Header: Depth=1
+	cmp	qword ptr [rsi], r13
+	sete	al
+	neg	al
+	mov	rdi, r11
+	shr	rdi, 3
+	movzx	r9d, byte ptr [r15 + rdi]
+	mov	ecx, r11d
+	and	cl, 6
+	mov	bl, 1
+	shl	bl, cl
+	xor	al, r9b
+	and	bl, al
+	xor	bl, r9b
+	mov	byte ptr [r15 + rdi], bl
+	add	r11, 2
+	cmp	qword ptr [rsi + 8], r13
+	lea	rsi, [rsi + 16]
+	sete	al
+	neg	al
+	xor	al, bl
+	or	cl, 1
+	mov	dl, 1
+	shl	dl, cl
+	and	dl, al
+	xor	dl, bl
+	mov	byte ptr [r15 + rdi], dl
+	cmp	r10, r11
+	jne	.LBB1_115
+	jmp	.LBB1_148
+.LBB1_116:
+	mov	r14, r11
+	shl	r15, 5
+	cmp	r15, r10
+	jge	.LBB1_164
+.LBB1_117:
+	mov	r8, r10
+	sub	r8, r15
+	not	r15
+	add	r15, r10
+	je	.LBB1_122
+# %bb.118:
+	mov	r10, r8
+	and	r10, -2
+	xor	r11d, r11d
+	mov	r15, r14
+	.p2align	4, 0x90
+.LBB1_119:                              # =>This Inner Loop Header: Depth=1
+	cmp	word ptr [rsi], r13w
+	sete	al
+	neg	al
+	mov	rdi, r11
+	shr	rdi, 3
+	movzx	r9d, byte ptr [r15 + rdi]
+	mov	ecx, r11d
+	and	cl, 6
+	mov	bl, 1
+	shl	bl, cl
+	xor	al, r9b
+	and	bl, al
+	xor	bl, r9b
+	mov	byte ptr [r15 + rdi], bl
+	add	r11, 2
+	cmp	word ptr [rsi + 2], r13w
+	lea	rsi, [rsi + 4]
+	sete	al
+	neg	al
+	xor	al, bl
+	or	cl, 1
+	mov	dl, 1
+	shl	dl, cl
+	and	dl, al
+	xor	dl, bl
+	mov	byte ptr [r15 + rdi], dl
+	cmp	r10, r11
+	jne	.LBB1_119
+	jmp	.LBB1_144
+.LBB1_120:
+	mov	r14, r11
+	shl	r15, 5
+	cmp	r15, r10
+	jge	.LBB1_164
+.LBB1_121:
+	mov	r8, r10
+	sub	r8, r15
+	not	r15
+	add	r15, r10
+	jne	.LBB1_142
+.LBB1_122:
+	xor	r11d, r11d
+	jmp	.LBB1_144
+.LBB1_123:
+	mov	r14, r11
+	shl	r15, 5
+	cmp	r15, r10
+	jge	.LBB1_164
+.LBB1_124:
+	mov	r8, r10
+	sub	r8, r15
+	not	r15
+	add	r15, r10
+	jne	.LBB1_146
+.LBB1_125:
+	xor	r11d, r11d
+	jmp	.LBB1_148
+.LBB1_126:
+	mov	r14, r11
+	shl	r15, 5
+	cmp	r15, r10
+	jge	.LBB1_164
+.LBB1_127:
+	mov	r8, r10
+	sub	r8, r15
+	not	r15
+	add	r15, r10
+	jne	.LBB1_150
+# %bb.128:
+	xor	r11d, r11d
+	jmp	.LBB1_152
+.LBB1_129:
+	mov	qword ptr [rsp + 376], r11      # 8-byte Spill
+	mov	r12, rsi
+.LBB1_130:
+	shl	r15, 5
+	cmp	r15, r10
+	jge	.LBB1_164
+# %bb.131:
+	mov	r8, r10
+	sub	r8, r15
+	not	r15
+	add	r15, r10
+	jne	.LBB1_154
+.LBB1_132:
+	xor	esi, esi
+	jmp	.LBB1_157
+.LBB1_133:
+	mov	r14, r11
+	shl	r15, 5
+	cmp	r15, r10
+	jge	.LBB1_164
+.LBB1_134:
+	mov	r8, r10
+	sub	r8, r15
+	not	r15
+	add	r15, r10
+	jne	.LBB1_159
+.LBB1_135:
+	xor	r11d, r11d
+	jmp	.LBB1_161
+.LBB1_136:
+	mov	r10, r8
+	and	r10, -2
+	xor	r11d, r11d
+	mov	r15, r14
+	.p2align	4, 0x90
+.LBB1_137:                              # =>This Inner Loop Header: Depth=1
+	vucomisd	xmm0, qword ptr [rsi]
+	sete	al
+	neg	al
+	mov	rdi, r11
+	shr	rdi, 3
+	movzx	r9d, byte ptr [r15 + rdi]
+	xor	al, r9b
+	mov	ecx, r11d
+	and	cl, 6
+	mov	bl, 1
+	shl	bl, cl
+	and	bl, al
+	xor	bl, r9b
+	mov	byte ptr [r15 + rdi], bl
+	add	r11, 2
+	vucomisd	xmm0, qword ptr [rsi + 8]
+	lea	rsi, [rsi + 16]
+	sete	al
+	neg	al
+	xor	al, bl
+	or	cl, 1
+	mov	dl, 1
+	shl	dl, cl
+	and	dl, al
+	xor	dl, bl
+	mov	byte ptr [r15 + rdi], dl
+	cmp	r10, r11
+	jne	.LBB1_137
+.LBB1_138:
+	test	r8b, 1
+	je	.LBB1_164
+# %bb.139:
+	vucomisd	xmm0, qword ptr [rsi]
+	jmp	.LBB1_163
+.LBB1_142:
+	mov	r10, r8
+	and	r10, -2
+	xor	r11d, r11d
+	mov	r15, r14
+	.p2align	4, 0x90
+.LBB1_143:                              # =>This Inner Loop Header: Depth=1
+	cmp	word ptr [rsi], r13w
+	sete	al
+	neg	al
+	mov	rdi, r11
+	shr	rdi, 3
+	movzx	r9d, byte ptr [r15 + rdi]
+	mov	ecx, r11d
+	and	cl, 6
+	mov	bl, 1
+	shl	bl, cl
+	xor	al, r9b
+	and	bl, al
+	xor	bl, r9b
+	mov	byte ptr [r15 + rdi], bl
+	add	r11, 2
+	cmp	word ptr [rsi + 2], r13w
+	lea	rsi, [rsi + 4]
+	sete	al
+	neg	al
+	xor	al, bl
+	or	cl, 1
+	mov	dl, 1
+	shl	dl, cl
+	and	dl, al
+	xor	dl, bl
+	mov	byte ptr [r15 + rdi], dl
+	cmp	r10, r11
+	jne	.LBB1_143
+.LBB1_144:
+	test	r8b, 1
+	je	.LBB1_164
+# %bb.145:
+	cmp	word ptr [rsi], r13w
+	jmp	.LBB1_163
+.LBB1_146:
+	mov	r10, r8
+	and	r10, -2
+	xor	r11d, r11d
+	mov	r15, r14
+	.p2align	4, 0x90
+.LBB1_147:                              # =>This Inner Loop Header: Depth=1
+	cmp	qword ptr [rsi], r13
+	sete	al
+	neg	al
+	mov	rdi, r11
+	shr	rdi, 3
+	movzx	r9d, byte ptr [r15 + rdi]
+	mov	ecx, r11d
+	and	cl, 6
+	mov	bl, 1
+	shl	bl, cl
+	xor	al, r9b
+	and	bl, al
+	xor	bl, r9b
+	mov	byte ptr [r15 + rdi], bl
+	add	r11, 2
+	cmp	qword ptr [rsi + 8], r13
+	lea	rsi, [rsi + 16]
+	sete	al
+	neg	al
+	xor	al, bl
+	or	cl, 1
+	mov	dl, 1
+	shl	dl, cl
+	and	dl, al
+	xor	dl, bl
+	mov	byte ptr [r15 + rdi], dl
+	cmp	r10, r11
+	jne	.LBB1_147
+.LBB1_148:
+	test	r8b, 1
+	je	.LBB1_164
+# %bb.149:
+	cmp	qword ptr [rsi], r13
+	jmp	.LBB1_163
+.LBB1_150:
+	mov	r10, r8
+	and	r10, -2
+	xor	r11d, r11d
+	mov	r15, r14
+	.p2align	4, 0x90
+.LBB1_151:                              # =>This Inner Loop Header: Depth=1
+	vucomiss	xmm0, dword ptr [rsi]
+	sete	al
+	neg	al
+	mov	rdi, r11
+	shr	rdi, 3
+	movzx	r9d, byte ptr [r15 + rdi]
+	xor	al, r9b
+	mov	ecx, r11d
+	and	cl, 6
+	mov	bl, 1
+	shl	bl, cl
+	and	bl, al
+	xor	bl, r9b
+	mov	byte ptr [r15 + rdi], bl
+	add	r11, 2
+	vucomiss	xmm0, dword ptr [rsi + 4]
+	lea	rsi, [rsi + 8]
+	sete	al
+	neg	al
+	xor	al, bl
+	or	cl, 1
+	mov	dl, 1
+	shl	dl, cl
+	and	dl, al
+	xor	dl, bl
+	mov	byte ptr [r15 + rdi], dl
+	cmp	r10, r11
+	jne	.LBB1_151
+.LBB1_152:
+	test	r8b, 1
+	je	.LBB1_164
+# %bb.153:
+	vucomiss	xmm0, dword ptr [rsi]
+	jmp	.LBB1_163
+.LBB1_154:
+	mov	r10, r8
+	and	r10, -2
+	xor	esi, esi
+	mov	r11, qword ptr [rsp + 376]      # 8-byte Reload
+	.p2align	4, 0x90
+.LBB1_155:                              # =>This Inner Loop Header: Depth=1
+	cmp	byte ptr [r12 + rsi], r14b
+	sete	bl
+	neg	bl
+	mov	rdi, rsi
+	shr	rdi, 3
+	mov	ecx, esi
+	and	cl, 6
+	mov	dl, 1
+	shl	dl, cl
+	movzx	r9d, byte ptr [r11 + rdi]
+	xor	bl, r9b
+	and	dl, bl
+	xor	dl, r9b
+	mov	byte ptr [r11 + rdi], dl
+	cmp	byte ptr [r12 + rsi + 1], r14b
+	lea	rsi, [rsi + 2]
+	sete	bl
+	neg	bl
+	xor	bl, dl
+	or	cl, 1
+	mov	al, 1
+	shl	al, cl
+	and	al, bl
+	xor	al, dl
+	mov	byte ptr [r11 + rdi], al
+	cmp	r10, rsi
+	jne	.LBB1_155
+.LBB1_156:
+	add	r12, rsi
+.LBB1_157:
+	test	r8b, 1
+	je	.LBB1_164
+# %bb.158:
+	cmp	byte ptr [r12], r14b
+	sete	al
+	neg	al
+	mov	rdx, rsi
+	shr	rdx, 3
+	mov	r8, qword ptr [rsp + 376]       # 8-byte Reload
+	mov	dil, byte ptr [r8 + rdx]
+	and	sil, 7
+	mov	bl, 1
+	mov	ecx, esi
+	shl	bl, cl
+	xor	al, dil
+	and	bl, al
+	xor	bl, dil
+	mov	byte ptr [r8 + rdx], bl
+	jmp	.LBB1_164
+.LBB1_159:
+	mov	r10, r8
+	and	r10, -2
+	xor	r11d, r11d
+	mov	r15, r14
+	.p2align	4, 0x90
+.LBB1_160:                              # =>This Inner Loop Header: Depth=1
+	cmp	dword ptr [rsi], r13d
+	sete	al
+	neg	al
+	mov	rdi, r11
+	shr	rdi, 3
+	movzx	r9d, byte ptr [r15 + rdi]
+	mov	ecx, r11d
+	and	cl, 6
+	mov	bl, 1
+	shl	bl, cl
+	xor	al, r9b
+	and	bl, al
+	xor	bl, r9b
+	mov	byte ptr [r15 + rdi], bl
+	add	r11, 2
+	cmp	dword ptr [rsi + 4], r13d
+	lea	rsi, [rsi + 8]
+	sete	al
+	neg	al
+	xor	al, bl
+	or	cl, 1
+	mov	dl, 1
+	shl	dl, cl
+	and	dl, al
+	xor	dl, bl
+	mov	byte ptr [r15 + rdi], dl
+	cmp	r10, r11
+	jne	.LBB1_160
+.LBB1_161:
+	test	r8b, 1
+	je	.LBB1_164
+# %bb.162:
+	cmp	dword ptr [rsi], r13d
+.LBB1_163:
+	sete	al
+	neg	al
+	mov	rdx, r11
+	shr	rdx, 3
+	mov	sil, byte ptr [r14 + rdx]
+	and	r11b, 7
+	mov	bl, 1
+	mov	ecx, r11d
+	shl	bl, cl
+	xor	al, sil
+	and	bl, al
+	xor	bl, sil
+	mov	byte ptr [r14 + rdx], bl
+.LBB1_164:
+	lea	rsp, [rbp - 40]
+	pop	rbx
+	pop	r12
+	pop	r13
+	pop	r14
+	pop	r15
+	pop	rbp
+	vzeroupper
+	ret
+.LBB1_165:
+	and	r13, -32
+	mov	rax, r13
+	shl	rax, 5
+	add	rax, rsi
+	mov	qword ptr [rsp + 400], rax      # 8-byte Spill
+	mov	qword ptr [rsp + 384], r13      # 8-byte Spill
+	lea	rax, [r11 + 4*r13]
+	mov	qword ptr [rsp + 376], rax      # 8-byte Spill
+	vmovd	xmm0, r14d
+	vpbroadcastb	ymm0, xmm0
+	vmovdqa	ymmword ptr [rsp + 512], ymm0   # 32-byte Spill
+	xor	eax, eax
+	mov	qword ptr [rsp + 272], r11      # 8-byte Spill
+	.p2align	4, 0x90
+.LBB1_166:                              # =>This Inner Loop Header: Depth=1
+	mov	rbx, rax
+	mov	qword ptr [rsp + 408], rax      # 8-byte Spill
+	shl	rbx, 5
+	mov	rax, rbx
+	or	rax, 32
+	mov	qword ptr [rsp + 208], rax      # 8-byte Spill
+	mov	rax, rbx
+	or	rax, 64
+	mov	qword ptr [rsp + 88], rax       # 8-byte Spill
+	mov	rax, rbx
+	or	rax, 96
+	mov	qword ptr [rsp + 64], rax       # 8-byte Spill
+	mov	rax, rbx
+	or	rax, 128
+	mov	qword ptr [rsp + 160], rax      # 8-byte Spill
+	mov	rax, rbx
+	or	rax, 160
+	mov	qword ptr [rsp + 320], rax      # 8-byte Spill
+	mov	rax, rbx
+	or	rax, 192
+	mov	qword ptr [rsp + 144], rax      # 8-byte Spill
+	mov	rax, rbx
+	or	rax, 224
+	mov	qword ptr [rsp + 224], rax      # 8-byte Spill
+	mov	rax, rbx
+	or	rax, 256
+	mov	qword ptr [rsp + 264], rax      # 8-byte Spill
+	mov	rax, rbx
+	or	rax, 288
+	mov	qword ptr [rsp + 96], rax       # 8-byte Spill
+	mov	rax, rbx
+	or	rax, 320
+	mov	qword ptr [rsp + 136], rax      # 8-byte Spill
+	mov	rax, rbx
+	or	rax, 512
+	mov	rcx, rax
+	movzx	eax, byte ptr [rsi + rax]
+	mov	rdx, rcx
+	vmovd	xmm0, eax
+	mov	rcx, rbx
+	movzx	eax, byte ptr [rsi + rbx]
+	vmovd	xmm3, eax
+	movzx	eax, byte ptr [rsi + rdx + 1]
+	vmovd	xmm4, eax
+	movzx	eax, byte ptr [rsi + rbx + 1]
+	vmovd	xmm10, eax
+	movzx	eax, byte ptr [rsi + rdx + 2]
+	vmovd	xmm1, eax
+	vmovdqa	xmmword ptr [rsp + 480], xmm1   # 16-byte Spill
+	movzx	eax, byte ptr [rsi + rbx + 2]
+	vmovd	xmm1, eax
+	vmovdqa	xmmword ptr [rsp + 448], xmm1   # 16-byte Spill
+	movzx	eax, byte ptr [rsi + rdx + 3]
+	vmovd	xmm11, eax
+	movzx	eax, byte ptr [rsi + rbx + 3]
+	vmovd	xmm8, eax
+	movzx	eax, byte ptr [rsi + rdx + 4]
+	vmovd	xmm1, eax
+	vmovdqa	xmmword ptr [rsp + 416], xmm1   # 16-byte Spill
+	movzx	eax, byte ptr [rsi + rbx + 4]
+	vmovd	xmm13, eax
+	movzx	eax, byte ptr [rsi + rdx + 5]
+	vmovd	xmm14, eax
+	movzx	eax, byte ptr [rsi + rbx + 5]
+	vmovd	xmm6, eax
+	movzx	eax, byte ptr [rsi + rdx + 6]
+	mov	qword ptr [rsp + 240], rdx      # 8-byte Spill
+	vmovd	xmm12, eax
+	movzx	eax, byte ptr [rsi + rbx + 6]
+	vmovd	xmm7, eax
+	movzx	eax, byte ptr [rsi + rdx + 7]
+	vmovd	xmm2, eax
+	movzx	eax, byte ptr [rsi + rbx + 7]
+	vmovd	xmm1, eax
+	mov	rax, rbx
+	or	rax, 352
+	mov	qword ptr [rsp + 128], rax      # 8-byte Spill
+	mov	rax, rbx
+	or	rax, 384
+	mov	qword ptr [rsp + 120], rax      # 8-byte Spill
+	mov	rax, rbx
+	or	rax, 416
+	mov	qword ptr [rsp + 32], rax       # 8-byte Spill
+	mov	rax, rbx
+	or	rax, 448
+	mov	qword ptr [rsp + 288], rax      # 8-byte Spill
+	mov	rax, rbx
+	or	rax, 480
+	mov	qword ptr [rsp + 48], rax       # 8-byte Spill
+	mov	rax, rbx
+	or	rax, 544
+	mov	qword ptr [rsp + 152], rax      # 8-byte Spill
+	mov	rax, rbx
+	or	rax, 576
+	mov	qword ptr [rsp + 232], rax      # 8-byte Spill
+	mov	rax, rbx
+	or	rax, 608
+	mov	qword ptr [rsp + 104], rax      # 8-byte Spill
+	mov	r15, rbx
+	or	r15, 640
+	mov	qword ptr [rsp + 176], r15      # 8-byte Spill
+	mov	r11, rbx
+	or	r11, 672
+	mov	qword ptr [rsp + 200], r11      # 8-byte Spill
+	mov	r8, rbx
+	or	r8, 704
+	mov	qword ptr [rsp + 168], r8       # 8-byte Spill
+	mov	rdx, rbx
+	or	rdx, 736
+	mov	qword ptr [rsp + 192], rdx      # 8-byte Spill
+	mov	r12, rbx
+	or	r12, 768
+	mov	qword ptr [rsp + 216], r12      # 8-byte Spill
+	mov	r14, rbx
+	or	r14, 800
+	mov	qword ptr [rsp + 184], r14      # 8-byte Spill
+	mov	r10, rbx
+	or	r10, 832
+	mov	qword ptr [rsp + 80], r10       # 8-byte Spill
+	mov	r9, rbx
+	or	r9, 864
+	mov	qword ptr [rsp + 72], r9        # 8-byte Spill
+	mov	rax, rbx
+	or	rax, 896
+	mov	qword ptr [rsp + 248], rax      # 8-byte Spill
+	mov	rdi, rbx
+	or	rdi, 928
+	mov	qword ptr [rsp + 112], rdi      # 8-byte Spill
+	mov	rax, rbx
+	mov	qword ptr [rsp + 256], rbx      # 8-byte Spill
+	or	rax, 960
+	mov	qword ptr [rsp + 40], rax       # 8-byte Spill
+	or	rcx, 992
+	mov	qword ptr [rsp + 56], rcx       # 8-byte Spill
+	mov	r13, qword ptr [rsp + 152]      # 8-byte Reload
+	vpinsrb	xmm9, xmm0, byte ptr [rsi + r13], 1
+	mov	rbx, qword ptr [rsp + 232]      # 8-byte Reload
+	vpinsrb	xmm0, xmm9, byte ptr [rsi + rbx], 2
+	mov	rbx, qword ptr [rsp + 104]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rbx], 3
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r15], 4
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r11], 5
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r8], 6
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rdx], 7
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r12], 8
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r14], 9
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r10], 10
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r9], 11
+	mov	r13, qword ptr [rsp + 248]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r13], 12
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rdi], 13
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax], 14
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rcx], 15
+	mov	r14, qword ptr [rsp + 208]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r14], 1
+	mov	r12, qword ptr [rsp + 88]       # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r12], 2
+	mov	r10, qword ptr [rsp + 64]       # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r10], 3
+	mov	r11, qword ptr [rsp + 160]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r11], 4
+	mov	r8, qword ptr [rsp + 320]       # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r8], 5
+	mov	r9, qword ptr [rsp + 144]       # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r9], 6
+	mov	r15, qword ptr [rsp + 224]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r15], 7
+	mov	rdi, qword ptr [rsp + 264]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rdi], 8
+	mov	rax, qword ptr [rsp + 96]       # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rax], 9
+	mov	rbx, qword ptr [rsp + 136]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rbx], 10
+	mov	rcx, qword ptr [rsp + 128]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rcx], 11
+	mov	rdx, qword ptr [rsp + 120]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rdx], 12
+	mov	rdx, qword ptr [rsp + 32]       # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rdx], 13
+	mov	rdx, qword ptr [rsp + 288]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rdx], 14
+	mov	rdx, qword ptr [rsp + 48]       # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rdx], 15
+	mov	rdx, qword ptr [rsp + 152]      # 8-byte Reload
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + rdx + 1], 1
+	mov	rdx, qword ptr [rsp + 232]      # 8-byte Reload
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + rdx + 1], 2
+	mov	rdx, qword ptr [rsp + 104]      # 8-byte Reload
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + rdx + 1], 3
+	mov	rdx, qword ptr [rsp + 176]      # 8-byte Reload
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + rdx + 1], 4
+	mov	rdx, qword ptr [rsp + 200]      # 8-byte Reload
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + rdx + 1], 5
+	mov	rdx, qword ptr [rsp + 168]      # 8-byte Reload
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + rdx + 1], 6
+	mov	rdx, qword ptr [rsp + 192]      # 8-byte Reload
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + rdx + 1], 7
+	mov	rdx, qword ptr [rsp + 216]      # 8-byte Reload
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + rdx + 1], 8
+	mov	rdx, qword ptr [rsp + 184]      # 8-byte Reload
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + rdx + 1], 9
+	mov	rdx, qword ptr [rsp + 80]       # 8-byte Reload
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + rdx + 1], 10
+	mov	rdx, qword ptr [rsp + 72]       # 8-byte Reload
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + rdx + 1], 11
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + r13 + 1], 12
+	mov	rdx, qword ptr [rsp + 112]      # 8-byte Reload
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + rdx + 1], 13
+	mov	r13, qword ptr [rsp + 40]       # 8-byte Reload
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + r13 + 1], 14
+	mov	rdx, qword ptr [rsp + 56]       # 8-byte Reload
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + rdx + 1], 15
+	vpinsrb	xmm5, xmm10, byte ptr [rsi + r14 + 1], 1
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + r12 + 1], 2
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + r10 + 1], 3
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + r11 + 1], 4
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + r8 + 1], 5
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + r9 + 1], 6
+	mov	r8, r9
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + r15 + 1], 7
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + rdi + 1], 8
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + rax + 1], 9
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + rbx + 1], 10
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + rcx + 1], 11
+	mov	rax, qword ptr [rsp + 120]      # 8-byte Reload
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + rax + 1], 12
+	mov	rax, qword ptr [rsp + 32]       # 8-byte Reload
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + rax + 1], 13
+	mov	rax, qword ptr [rsp + 288]      # 8-byte Reload
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + rax + 1], 14
+	vinserti128	ymm15, ymm3, xmm0, 1
+	mov	rax, qword ptr [rsp + 48]       # 8-byte Reload
+	vpinsrb	xmm0, xmm5, byte ptr [rsi + rax + 1], 15
+	mov	rax, qword ptr [rsp + 240]      # 8-byte Reload
+	movzx	edi, byte ptr [rsi + rax + 8]
+	vmovd	xmm9, edi
+	vinserti128	ymm0, ymm0, xmm4, 1
+	vmovdqa	ymmword ptr [rsp + 1216], ymm0  # 32-byte Spill
+	mov	rax, qword ptr [rsp + 256]      # 8-byte Reload
+	movzx	edi, byte ptr [rsi + rax + 8]
+	vmovd	xmm10, edi
+	mov	rdx, qword ptr [rsp + 152]      # 8-byte Reload
+	vmovdqa	xmm0, xmmword ptr [rsp + 480]   # 16-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rdx + 2], 1
+	mov	rcx, qword ptr [rsp + 232]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rcx + 2], 2
+	mov	r10, qword ptr [rsp + 104]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r10 + 2], 3
+	mov	rax, qword ptr [rsp + 176]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 2], 4
+	mov	rax, qword ptr [rsp + 200]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 2], 5
+	mov	rax, qword ptr [rsp + 168]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 2], 6
+	mov	rax, qword ptr [rsp + 192]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 2], 7
+	mov	r12, qword ptr [rsp + 216]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r12 + 2], 8
+	mov	r13, qword ptr [rsp + 184]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r13 + 2], 9
+	mov	r9, qword ptr [rsp + 80]        # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r9 + 2], 10
+	mov	r11, qword ptr [rsp + 72]       # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r11 + 2], 11
+	mov	r14, qword ptr [rsp + 248]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r14 + 2], 12
+	mov	r15, qword ptr [rsp + 112]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r15 + 2], 13
+	mov	rax, qword ptr [rsp + 40]       # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 2], 14
+	mov	rax, qword ptr [rsp + 56]       # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 2], 15
+	mov	rax, qword ptr [rsp + 208]      # 8-byte Reload
+	vmovdqa	xmm3, xmmword ptr [rsp + 448]   # 16-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rax + 2], 1
+	mov	rdi, qword ptr [rsp + 88]       # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rdi + 2], 2
+	mov	rdi, qword ptr [rsp + 64]       # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rdi + 2], 3
+	mov	rdi, qword ptr [rsp + 160]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rdi + 2], 4
+	mov	rdi, qword ptr [rsp + 320]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rdi + 2], 5
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r8 + 2], 6
+	mov	rdi, qword ptr [rsp + 224]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rdi + 2], 7
+	mov	rbx, qword ptr [rsp + 264]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rbx + 2], 8
+	mov	rbx, qword ptr [rsp + 96]       # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rbx + 2], 9
+	mov	rbx, qword ptr [rsp + 136]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rbx + 2], 10
+	mov	r8, qword ptr [rsp + 128]       # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r8 + 2], 11
+	mov	rbx, qword ptr [rsp + 120]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rbx + 2], 12
+	mov	rbx, qword ptr [rsp + 32]       # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rbx + 2], 13
+	mov	rbx, qword ptr [rsp + 288]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rbx + 2], 14
+	mov	rbx, qword ptr [rsp + 48]       # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rbx + 2], 15
+	vpinsrb	xmm4, xmm11, byte ptr [rsi + rdx + 3], 1
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + rcx + 3], 2
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + r10 + 3], 3
+	mov	rcx, qword ptr [rsp + 176]      # 8-byte Reload
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + rcx + 3], 4
+	mov	rcx, qword ptr [rsp + 200]      # 8-byte Reload
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + rcx + 3], 5
+	mov	rcx, qword ptr [rsp + 168]      # 8-byte Reload
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + rcx + 3], 6
+	mov	r8, qword ptr [rsp + 192]       # 8-byte Reload
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + r8 + 3], 7
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + r12 + 3], 8
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + r13 + 3], 9
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + r9 + 3], 10
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + r11 + 3], 11
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + r14 + 3], 12
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + r15 + 3], 13
+	mov	r14, r15
+	mov	rdx, qword ptr [rsp + 40]       # 8-byte Reload
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + rdx + 3], 14
+	mov	rcx, qword ptr [rsp + 56]       # 8-byte Reload
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + rcx + 3], 15
+	vpinsrb	xmm5, xmm8, byte ptr [rsi + rax + 3], 1
+	mov	rax, qword ptr [rsp + 88]       # 8-byte Reload
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + rax + 3], 2
+	mov	r10, qword ptr [rsp + 64]       # 8-byte Reload
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + r10 + 3], 3
+	mov	r15, qword ptr [rsp + 160]      # 8-byte Reload
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + r15 + 3], 4
+	mov	rax, qword ptr [rsp + 320]      # 8-byte Reload
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + rax + 3], 5
+	mov	rax, qword ptr [rsp + 144]      # 8-byte Reload
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + rax + 3], 6
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + rdi + 3], 7
+	mov	r11, qword ptr [rsp + 264]      # 8-byte Reload
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + r11 + 3], 8
+	mov	rax, qword ptr [rsp + 96]       # 8-byte Reload
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + rax + 3], 9
+	mov	rax, qword ptr [rsp + 136]      # 8-byte Reload
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + rax + 3], 10
+	mov	rax, qword ptr [rsp + 128]      # 8-byte Reload
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + rax + 3], 11
+	mov	rax, qword ptr [rsp + 120]      # 8-byte Reload
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + rax + 3], 12
+	mov	rax, qword ptr [rsp + 32]       # 8-byte Reload
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + rax + 3], 13
+	vinserti128	ymm0, ymm3, xmm0, 1
+	vmovdqa	ymmword ptr [rsp + 480], ymm0   # 32-byte Spill
+	mov	rax, qword ptr [rsp + 288]      # 8-byte Reload
+	vpinsrb	xmm0, xmm5, byte ptr [rsi + rax + 3], 14
+	mov	rax, qword ptr [rsp + 240]      # 8-byte Reload
+	movzx	edi, byte ptr [rsi + rax + 9]
+	vmovd	xmm8, edi
+	mov	r9, rbx
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rbx + 3], 15
+	vinserti128	ymm0, ymm0, xmm4, 1
+	vmovdqa	ymmword ptr [rsp + 448], ymm0   # 32-byte Spill
+	mov	rax, qword ptr [rsp + 256]      # 8-byte Reload
+	movzx	edi, byte ptr [rsi + rax + 9]
+	vmovd	xmm11, edi
+	vmovdqa	xmm0, xmmword ptr [rsp + 416]   # 16-byte Reload
+	mov	rax, qword ptr [rsp + 152]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 4], 1
+	mov	rax, qword ptr [rsp + 232]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 4], 2
+	mov	rax, qword ptr [rsp + 104]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 4], 3
+	mov	r13, qword ptr [rsp + 176]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r13 + 4], 4
+	mov	rcx, qword ptr [rsp + 200]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rcx + 4], 5
+	mov	rax, qword ptr [rsp + 168]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 4], 6
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r8 + 4], 7
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r12 + 4], 8
+	mov	rax, qword ptr [rsp + 184]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 4], 9
+	mov	rax, qword ptr [rsp + 80]       # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 4], 10
+	mov	rax, qword ptr [rsp + 72]       # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 4], 11
+	mov	rax, qword ptr [rsp + 248]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 4], 12
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r14 + 4], 13
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rdx + 4], 14
+	mov	r12, qword ptr [rsp + 56]       # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r12 + 4], 15
+	mov	rax, qword ptr [rsp + 208]      # 8-byte Reload
+	vpinsrb	xmm3, xmm13, byte ptr [rsi + rax + 4], 1
+	mov	rdx, qword ptr [rsp + 88]       # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rdx + 4], 2
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r10 + 4], 3
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r15 + 4], 4
+	mov	r10, qword ptr [rsp + 320]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r10 + 4], 5
+	mov	rdi, qword ptr [rsp + 144]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rdi + 4], 6
+	mov	r14, qword ptr [rsp + 224]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r14 + 4], 7
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r11 + 4], 8
+	mov	rbx, qword ptr [rsp + 96]       # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rbx + 4], 9
+	mov	rax, qword ptr [rsp + 136]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rax + 4], 10
+	mov	r11, qword ptr [rsp + 128]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r11 + 4], 11
+	mov	r8, qword ptr [rsp + 120]       # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r8 + 4], 12
+	mov	rax, qword ptr [rsp + 32]       # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rax + 4], 13
+	mov	r15, qword ptr [rsp + 288]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r15 + 4], 14
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r9 + 4], 15
+	mov	r9, qword ptr [rsp + 152]       # 8-byte Reload
+	vpinsrb	xmm4, xmm14, byte ptr [rsi + r9 + 5], 1
+	mov	r15, qword ptr [rsp + 232]      # 8-byte Reload
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + r15 + 5], 2
+	mov	r9, qword ptr [rsp + 104]       # 8-byte Reload
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + r9 + 5], 3
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + r13 + 5], 4
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + rcx + 5], 5
+	mov	rcx, qword ptr [rsp + 168]      # 8-byte Reload
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + rcx + 5], 6
+	mov	rcx, qword ptr [rsp + 192]      # 8-byte Reload
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + rcx + 5], 7
+	mov	r13, rcx
+	mov	rax, qword ptr [rsp + 216]      # 8-byte Reload
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + rax + 5], 8
+	mov	rax, qword ptr [rsp + 184]      # 8-byte Reload
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + rax + 5], 9
+	mov	rax, qword ptr [rsp + 80]       # 8-byte Reload
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + rax + 5], 10
+	mov	rax, qword ptr [rsp + 72]       # 8-byte Reload
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + rax + 5], 11
+	mov	rax, qword ptr [rsp + 248]      # 8-byte Reload
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + rax + 5], 12
+	mov	rcx, qword ptr [rsp + 112]      # 8-byte Reload
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + rcx + 5], 13
+	mov	rax, qword ptr [rsp + 40]       # 8-byte Reload
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + rax + 5], 14
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + r12 + 5], 15
+	mov	rax, qword ptr [rsp + 208]      # 8-byte Reload
+	vpinsrb	xmm5, xmm6, byte ptr [rsi + rax + 5], 1
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + rdx + 5], 2
+	mov	rax, qword ptr [rsp + 64]       # 8-byte Reload
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + rax + 5], 3
+	mov	rdx, qword ptr [rsp + 160]      # 8-byte Reload
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + rdx + 5], 4
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + r10 + 5], 5
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + rdi + 5], 6
+	mov	r10, rdi
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + r14 + 5], 7
+	mov	r14, qword ptr [rsp + 264]      # 8-byte Reload
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + r14 + 5], 8
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + rbx + 5], 9
+	mov	rax, qword ptr [rsp + 136]      # 8-byte Reload
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + rax + 5], 10
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + r11 + 5], 11
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + r8 + 5], 12
+	mov	rax, qword ptr [rsp + 32]       # 8-byte Reload
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + rax + 5], 13
+	mov	rax, qword ptr [rsp + 288]      # 8-byte Reload
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + rax + 5], 14
+	vinserti128	ymm14, ymm3, xmm0, 1
+	mov	rax, qword ptr [rsp + 48]       # 8-byte Reload
+	vpinsrb	xmm0, xmm5, byte ptr [rsi + rax + 5], 15
+	mov	rax, qword ptr [rsp + 240]      # 8-byte Reload
+	movzx	edi, byte ptr [rsi + rax + 10]
+	vmovd	xmm3, edi
+	vinserti128	ymm0, ymm0, xmm4, 1
+	vmovdqa	ymmword ptr [rsp + 416], ymm0   # 32-byte Spill
+	mov	rax, qword ptr [rsp + 256]      # 8-byte Reload
+	movzx	edi, byte ptr [rsi + rax + 10]
+	vmovd	xmm4, edi
+	mov	rax, qword ptr [rsp + 152]      # 8-byte Reload
+	vpinsrb	xmm0, xmm12, byte ptr [rsi + rax + 6], 1
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r15 + 6], 2
+	mov	r9, qword ptr [rsp + 104]       # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r9 + 6], 3
+	mov	rbx, qword ptr [rsp + 176]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rbx + 6], 4
+	mov	rdx, qword ptr [rsp + 200]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rdx + 6], 5
+	mov	r11, qword ptr [rsp + 168]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r11 + 6], 6
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r13 + 6], 7
+	mov	rcx, qword ptr [rsp + 216]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rcx + 6], 8
+	mov	r13, qword ptr [rsp + 184]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r13 + 6], 9
+	mov	rcx, qword ptr [rsp + 80]       # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rcx + 6], 10
+	mov	rcx, qword ptr [rsp + 72]       # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rcx + 6], 11
+	mov	r12, qword ptr [rsp + 248]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r12 + 6], 12
+	mov	r8, qword ptr [rsp + 112]       # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r8 + 6], 13
+	mov	rcx, qword ptr [rsp + 40]       # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rcx + 6], 14
+	mov	rcx, qword ptr [rsp + 56]       # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rcx + 6], 15
+	mov	rcx, qword ptr [rsp + 208]      # 8-byte Reload
+	vpinsrb	xmm5, xmm7, byte ptr [rsi + rcx + 6], 1
+	mov	rcx, qword ptr [rsp + 88]       # 8-byte Reload
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + rcx + 6], 2
+	mov	rcx, qword ptr [rsp + 64]       # 8-byte Reload
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + rcx + 6], 3
+	mov	rcx, qword ptr [rsp + 160]      # 8-byte Reload
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + rcx + 6], 4
+	mov	rdi, qword ptr [rsp + 320]      # 8-byte Reload
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + rdi + 6], 5
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + r10 + 6], 6
+	mov	rcx, qword ptr [rsp + 224]      # 8-byte Reload
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + rcx + 6], 7
+	mov	r10, r14
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + r14 + 6], 8
+	mov	rcx, qword ptr [rsp + 96]       # 8-byte Reload
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + rcx + 6], 9
+	mov	rcx, qword ptr [rsp + 136]      # 8-byte Reload
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + rcx + 6], 10
+	mov	rdx, qword ptr [rsp + 128]      # 8-byte Reload
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + rdx + 6], 11
+	mov	rdx, qword ptr [rsp + 120]      # 8-byte Reload
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + rdx + 6], 12
+	mov	rdx, qword ptr [rsp + 32]       # 8-byte Reload
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + rdx + 6], 13
+	mov	rdx, qword ptr [rsp + 288]      # 8-byte Reload
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + rdx + 6], 14
+	mov	r14, qword ptr [rsp + 48]       # 8-byte Reload
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + r14 + 6], 15
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rax + 7], 1
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r15 + 7], 2
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r9 + 7], 3
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rbx + 7], 4
+	mov	r14, qword ptr [rsp + 200]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r14 + 7], 5
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r11 + 7], 6
+	mov	rdx, qword ptr [rsp + 192]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rdx + 7], 7
+	mov	r11, qword ptr [rsp + 216]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r11 + 7], 8
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r13 + 7], 9
+	mov	rdx, qword ptr [rsp + 80]       # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rdx + 7], 10
+	mov	r9, qword ptr [rsp + 72]        # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r9 + 7], 11
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r12 + 7], 12
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r8 + 7], 13
+	mov	rdx, qword ptr [rsp + 40]       # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rdx + 7], 14
+	mov	rdx, qword ptr [rsp + 56]       # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rdx + 7], 15
+	mov	rdx, qword ptr [rsp + 208]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rdx + 7], 1
+	mov	rdx, qword ptr [rsp + 88]       # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rdx + 7], 2
+	mov	rdx, qword ptr [rsp + 64]       # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rdx + 7], 3
+	mov	rdx, qword ptr [rsp + 160]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rdx + 7], 4
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rdi + 7], 5
+	mov	rdx, qword ptr [rsp + 144]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rdx + 7], 6
+	mov	r15, qword ptr [rsp + 224]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r15 + 7], 7
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r10 + 7], 8
+	mov	rdx, qword ptr [rsp + 96]       # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rdx + 7], 9
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rcx + 7], 10
+	mov	r13, qword ptr [rsp + 128]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r13 + 7], 11
+	mov	rcx, qword ptr [rsp + 120]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rcx + 7], 12
+	mov	rcx, qword ptr [rsp + 32]       # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rcx + 7], 13
+	vinserti128	ymm0, ymm5, xmm0, 1
+	vmovdqa	ymmword ptr [rsp + 1184], ymm0  # 32-byte Spill
+	mov	rcx, qword ptr [rsp + 288]      # 8-byte Reload
+	vpinsrb	xmm0, xmm1, byte ptr [rsi + rcx + 7], 14
+	mov	rcx, qword ptr [rsp + 240]      # 8-byte Reload
+	movzx	edi, byte ptr [rsi + rcx + 11]
+	vmovd	xmm1, edi
+	mov	rcx, qword ptr [rsp + 48]       # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rcx + 7], 15
+	vinserti128	ymm0, ymm0, xmm2, 1
+	vmovdqa	ymmword ptr [rsp + 1152], ymm0  # 32-byte Spill
+	mov	rcx, qword ptr [rsp + 256]      # 8-byte Reload
+	movzx	edi, byte ptr [rsi + rcx + 11]
+	vmovd	xmm2, edi
+	vpinsrb	xmm0, xmm9, byte ptr [rsi + rax + 8], 1
+	mov	r12, qword ptr [rsp + 232]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r12 + 8], 2
+	mov	rcx, qword ptr [rsp + 104]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rcx + 8], 3
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rbx + 8], 4
+	mov	rbx, r14
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r14 + 8], 5
+	mov	rax, qword ptr [rsp + 168]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 8], 6
+	mov	rdx, qword ptr [rsp + 192]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rdx + 8], 7
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r11 + 8], 8
+	mov	r8, qword ptr [rsp + 184]       # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r8 + 8], 9
+	mov	rdx, qword ptr [rsp + 80]       # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rdx + 8], 10
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r9 + 8], 11
+	mov	r11, qword ptr [rsp + 248]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r11 + 8], 12
+	mov	rdi, qword ptr [rsp + 112]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rdi + 8], 13
+	mov	rdx, qword ptr [rsp + 40]       # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rdx + 8], 14
+	mov	rdi, qword ptr [rsp + 56]       # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rdi + 8], 15
+	mov	rdx, qword ptr [rsp + 208]      # 8-byte Reload
+	vpinsrb	xmm5, xmm10, byte ptr [rsi + rdx + 8], 1
+	mov	r14, qword ptr [rsp + 88]       # 8-byte Reload
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + r14 + 8], 2
+	mov	rdi, qword ptr [rsp + 64]       # 8-byte Reload
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + rdi + 8], 3
+	mov	rdi, qword ptr [rsp + 160]      # 8-byte Reload
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + rdi + 8], 4
+	mov	rdx, qword ptr [rsp + 320]      # 8-byte Reload
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + rdx + 8], 5
+	mov	rdx, qword ptr [rsp + 144]      # 8-byte Reload
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + rdx + 8], 6
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + r15 + 8], 7
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + r10 + 8], 8
+	mov	r9, qword ptr [rsp + 96]        # 8-byte Reload
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + r9 + 8], 9
+	mov	r15, qword ptr [rsp + 136]      # 8-byte Reload
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + r15 + 8], 10
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + r13 + 8], 11
+	mov	rdx, qword ptr [rsp + 120]      # 8-byte Reload
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + rdx + 8], 12
+	mov	rdx, qword ptr [rsp + 32]       # 8-byte Reload
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + rdx + 8], 13
+	mov	rdx, qword ptr [rsp + 288]      # 8-byte Reload
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + rdx + 8], 14
+	mov	r13, qword ptr [rsp + 48]       # 8-byte Reload
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + r13 + 8], 15
+	mov	r13, qword ptr [rsp + 152]      # 8-byte Reload
+	vpinsrb	xmm6, xmm8, byte ptr [rsi + r13 + 9], 1
+	vpinsrb	xmm6, xmm6, byte ptr [rsi + r12 + 9], 2
+	vpinsrb	xmm6, xmm6, byte ptr [rsi + rcx + 9], 3
+	mov	rcx, qword ptr [rsp + 176]      # 8-byte Reload
+	vpinsrb	xmm6, xmm6, byte ptr [rsi + rcx + 9], 4
+	vpinsrb	xmm6, xmm6, byte ptr [rsi + rbx + 9], 5
+	vpinsrb	xmm6, xmm6, byte ptr [rsi + rax + 9], 6
+	mov	rdx, qword ptr [rsp + 192]      # 8-byte Reload
+	vpinsrb	xmm6, xmm6, byte ptr [rsi + rdx + 9], 7
+	mov	rax, qword ptr [rsp + 216]      # 8-byte Reload
+	vpinsrb	xmm6, xmm6, byte ptr [rsi + rax + 9], 8
+	vpinsrb	xmm6, xmm6, byte ptr [rsi + r8 + 9], 9
+	mov	rax, qword ptr [rsp + 80]       # 8-byte Reload
+	vpinsrb	xmm6, xmm6, byte ptr [rsi + rax + 9], 10
+	mov	rcx, qword ptr [rsp + 72]       # 8-byte Reload
+	vpinsrb	xmm6, xmm6, byte ptr [rsi + rcx + 9], 11
+	vpinsrb	xmm6, xmm6, byte ptr [rsi + r11 + 9], 12
+	mov	rax, qword ptr [rsp + 112]      # 8-byte Reload
+	vpinsrb	xmm6, xmm6, byte ptr [rsi + rax + 9], 13
+	mov	rax, qword ptr [rsp + 40]       # 8-byte Reload
+	vpinsrb	xmm6, xmm6, byte ptr [rsi + rax + 9], 14
+	mov	rax, qword ptr [rsp + 56]       # 8-byte Reload
+	vpinsrb	xmm6, xmm6, byte ptr [rsi + rax + 9], 15
+	mov	rax, qword ptr [rsp + 208]      # 8-byte Reload
+	vpinsrb	xmm7, xmm11, byte ptr [rsi + rax + 9], 1
+	vpinsrb	xmm7, xmm7, byte ptr [rsi + r14 + 9], 2
+	mov	rax, qword ptr [rsp + 64]       # 8-byte Reload
+	vpinsrb	xmm7, xmm7, byte ptr [rsi + rax + 9], 3
+	vpinsrb	xmm7, xmm7, byte ptr [rsi + rdi + 9], 4
+	mov	r14, rdi
+	mov	rax, qword ptr [rsp + 320]      # 8-byte Reload
+	vpinsrb	xmm7, xmm7, byte ptr [rsi + rax + 9], 5
+	mov	rax, qword ptr [rsp + 144]      # 8-byte Reload
+	vpinsrb	xmm7, xmm7, byte ptr [rsi + rax + 9], 6
+	mov	rax, qword ptr [rsp + 224]      # 8-byte Reload
+	vpinsrb	xmm7, xmm7, byte ptr [rsi + rax + 9], 7
+	vpinsrb	xmm7, xmm7, byte ptr [rsi + r10 + 9], 8
+	vpinsrb	xmm7, xmm7, byte ptr [rsi + r9 + 9], 9
+	vpinsrb	xmm7, xmm7, byte ptr [rsi + r15 + 9], 10
+	mov	rax, qword ptr [rsp + 128]      # 8-byte Reload
+	vpinsrb	xmm7, xmm7, byte ptr [rsi + rax + 9], 11
+	mov	rax, qword ptr [rsp + 120]      # 8-byte Reload
+	vpinsrb	xmm7, xmm7, byte ptr [rsi + rax + 9], 12
+	mov	r15, qword ptr [rsp + 32]       # 8-byte Reload
+	vpinsrb	xmm7, xmm7, byte ptr [rsi + r15 + 9], 13
+	mov	rax, qword ptr [rsp + 288]      # 8-byte Reload
+	vpinsrb	xmm7, xmm7, byte ptr [rsi + rax + 9], 14
+	vinserti128	ymm0, ymm5, xmm0, 1
+	vmovdqa	ymmword ptr [rsp + 1120], ymm0  # 32-byte Spill
+	mov	rax, qword ptr [rsp + 48]       # 8-byte Reload
+	vpinsrb	xmm5, xmm7, byte ptr [rsi + rax + 9], 15
+	mov	rax, qword ptr [rsp + 240]      # 8-byte Reload
+	movzx	edi, byte ptr [rsi + rax + 12]
+	vmovd	xmm0, edi
+	vinserti128	ymm5, ymm5, xmm6, 1
+	vmovdqa	ymmword ptr [rsp + 1088], ymm5  # 32-byte Spill
+	mov	rax, qword ptr [rsp + 256]      # 8-byte Reload
+	movzx	edi, byte ptr [rsi + rax + 12]
+	vmovd	xmm5, edi
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r13 + 10], 1
+	mov	rbx, qword ptr [rsp + 232]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rbx + 10], 2
+	mov	rax, qword ptr [rsp + 104]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rax + 10], 3
+	mov	r9, qword ptr [rsp + 176]       # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r9 + 10], 4
+	mov	rax, qword ptr [rsp + 200]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rax + 10], 5
+	mov	rax, qword ptr [rsp + 168]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rax + 10], 6
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rdx + 10], 7
+	mov	r8, qword ptr [rsp + 216]       # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r8 + 10], 8
+	mov	r12, qword ptr [rsp + 184]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r12 + 10], 9
+	mov	rax, qword ptr [rsp + 80]       # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rax + 10], 10
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rcx + 10], 11
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r11 + 10], 12
+	mov	rcx, qword ptr [rsp + 112]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rcx + 10], 13
+	mov	rdi, qword ptr [rsp + 40]       # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rdi + 10], 14
+	mov	rdi, qword ptr [rsp + 56]       # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rdi + 10], 15
+	mov	r11, qword ptr [rsp + 208]      # 8-byte Reload
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + r11 + 10], 1
+	mov	rdi, qword ptr [rsp + 88]       # 8-byte Reload
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + rdi + 10], 2
+	mov	rdi, qword ptr [rsp + 64]       # 8-byte Reload
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + rdi + 10], 3
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + r14 + 10], 4
+	mov	rax, qword ptr [rsp + 320]      # 8-byte Reload
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + rax + 10], 5
+	mov	r14, qword ptr [rsp + 144]      # 8-byte Reload
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + r14 + 10], 6
+	mov	rax, qword ptr [rsp + 224]      # 8-byte Reload
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + rax + 10], 7
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + r10 + 10], 8
+	mov	rax, qword ptr [rsp + 96]       # 8-byte Reload
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + rax + 10], 9
+	mov	r14, qword ptr [rsp + 136]      # 8-byte Reload
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + r14 + 10], 10
+	mov	rax, qword ptr [rsp + 128]      # 8-byte Reload
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + rax + 10], 11
+	mov	r10, qword ptr [rsp + 120]      # 8-byte Reload
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + r10 + 10], 12
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + r15 + 10], 13
+	mov	r15, qword ptr [rsp + 288]      # 8-byte Reload
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + r15 + 10], 14
+	mov	rax, qword ptr [rsp + 48]       # 8-byte Reload
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + rax + 10], 15
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r13 + 11], 1
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rbx + 11], 2
+	mov	rax, qword ptr [rsp + 104]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rax + 11], 3
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r9 + 11], 4
+	mov	rbx, qword ptr [rsp + 200]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rbx + 11], 5
+	mov	r13, qword ptr [rsp + 168]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r13 + 11], 6
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rdx + 11], 7
+	mov	r9, rdx
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r8 + 11], 8
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r12 + 11], 9
+	mov	rax, qword ptr [rsp + 80]       # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rax + 11], 10
+	mov	rax, qword ptr [rsp + 72]       # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rax + 11], 11
+	mov	rax, qword ptr [rsp + 248]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rax + 11], 12
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rcx + 11], 13
+	mov	rax, qword ptr [rsp + 40]       # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rax + 11], 14
+	mov	r15, qword ptr [rsp + 56]       # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r15 + 11], 15
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r11 + 11], 1
+	mov	rax, qword ptr [rsp + 88]       # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rax + 11], 2
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rdi + 11], 3
+	mov	rax, qword ptr [rsp + 160]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rax + 11], 4
+	mov	rcx, qword ptr [rsp + 320]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rcx + 11], 5
+	mov	rcx, qword ptr [rsp + 144]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rcx + 11], 6
+	mov	rcx, qword ptr [rsp + 224]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rcx + 11], 7
+	mov	r12, qword ptr [rsp + 264]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r12 + 11], 8
+	mov	rcx, qword ptr [rsp + 96]       # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rcx + 11], 9
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r14 + 11], 10
+	mov	rcx, qword ptr [rsp + 128]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rcx + 11], 11
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r10 + 11], 12
+	mov	rcx, qword ptr [rsp + 32]       # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rcx + 11], 13
+	vinserti128	ymm3, ymm4, xmm3, 1
+	vmovdqa	ymmword ptr [rsp + 1056], ymm3  # 32-byte Spill
+	mov	rcx, qword ptr [rsp + 288]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rcx + 11], 14
+	mov	rcx, qword ptr [rsp + 240]      # 8-byte Reload
+	movzx	edi, byte ptr [rsi + rcx + 13]
+	vmovd	xmm3, edi
+	mov	rcx, qword ptr [rsp + 48]       # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rcx + 11], 15
+	vinserti128	ymm1, ymm2, xmm1, 1
+	vmovdqa	ymmword ptr [rsp + 1024], ymm1  # 32-byte Spill
+	mov	rcx, qword ptr [rsp + 256]      # 8-byte Reload
+	movzx	edi, byte ptr [rsi + rcx + 13]
+	vmovd	xmm1, edi
+	mov	rcx, qword ptr [rsp + 152]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rcx + 12], 1
+	mov	rcx, qword ptr [rsp + 232]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rcx + 12], 2
+	mov	rcx, qword ptr [rsp + 104]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rcx + 12], 3
+	mov	rdx, qword ptr [rsp + 176]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rdx + 12], 4
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rbx + 12], 5
+	mov	rbx, r13
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r13 + 12], 6
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r9 + 12], 7
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r8 + 12], 8
+	mov	r9, qword ptr [rsp + 184]       # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r9 + 12], 9
+	mov	rcx, qword ptr [rsp + 80]       # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rcx + 12], 10
+	mov	rcx, qword ptr [rsp + 72]       # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rcx + 12], 11
+	mov	r11, qword ptr [rsp + 248]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r11 + 12], 12
+	mov	rcx, qword ptr [rsp + 112]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rcx + 12], 13
+	mov	r14, qword ptr [rsp + 40]       # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r14 + 12], 14
+	mov	r10, r15
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r15 + 12], 15
+	mov	r13, qword ptr [rsp + 208]      # 8-byte Reload
+	vpinsrb	xmm2, xmm5, byte ptr [rsi + r13 + 12], 1
+	mov	rdi, qword ptr [rsp + 88]       # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rdi + 12], 2
+	mov	rcx, qword ptr [rsp + 64]       # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rcx + 12], 3
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rax + 12], 4
+	mov	rax, qword ptr [rsp + 320]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rax + 12], 5
+	mov	rax, qword ptr [rsp + 144]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rax + 12], 6
+	mov	r15, qword ptr [rsp + 224]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r15 + 12], 7
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r12 + 12], 8
+	mov	rax, qword ptr [rsp + 96]       # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rax + 12], 9
+	mov	rcx, qword ptr [rsp + 136]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rcx + 12], 10
+	mov	r12, qword ptr [rsp + 128]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r12 + 12], 11
+	mov	rcx, qword ptr [rsp + 120]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rcx + 12], 12
+	mov	rcx, qword ptr [rsp + 32]       # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rcx + 12], 13
+	mov	rcx, qword ptr [rsp + 288]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rcx + 12], 14
+	mov	rcx, qword ptr [rsp + 48]       # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rcx + 12], 15
+	mov	rcx, qword ptr [rsp + 152]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rcx + 13], 1
+	mov	rcx, qword ptr [rsp + 232]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rcx + 13], 2
+	mov	rcx, qword ptr [rsp + 104]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rcx + 13], 3
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rdx + 13], 4
+	mov	rcx, qword ptr [rsp + 200]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rcx + 13], 5
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rbx + 13], 6
+	mov	rcx, qword ptr [rsp + 192]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rcx + 13], 7
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r8 + 13], 8
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r9 + 13], 9
+	mov	r9, qword ptr [rsp + 80]        # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r9 + 13], 10
+	mov	rcx, qword ptr [rsp + 72]       # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rcx + 13], 11
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r11 + 13], 12
+	mov	r11, qword ptr [rsp + 112]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r11 + 13], 13
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r14 + 13], 14
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r10 + 13], 15
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r13 + 13], 1
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rdi + 13], 2
+	mov	rcx, qword ptr [rsp + 64]       # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rcx + 13], 3
+	mov	rcx, qword ptr [rsp + 160]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rcx + 13], 4
+	mov	rcx, qword ptr [rsp + 320]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rcx + 13], 5
+	mov	rcx, qword ptr [rsp + 144]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rcx + 13], 6
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r15 + 13], 7
+	mov	rcx, qword ptr [rsp + 264]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rcx + 13], 8
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rax + 13], 9
+	mov	rax, qword ptr [rsp + 136]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rax + 13], 10
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r12 + 13], 11
+	mov	r10, qword ptr [rsp + 120]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r10 + 13], 12
+	mov	rax, qword ptr [rsp + 32]       # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rax + 13], 13
+	mov	rax, qword ptr [rsp + 288]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rax + 13], 14
+	vinserti128	ymm0, ymm2, xmm0, 1
+	vmovdqa	ymmword ptr [rsp + 992], ymm0   # 32-byte Spill
+	mov	rax, qword ptr [rsp + 48]       # 8-byte Reload
+	vpinsrb	xmm0, xmm1, byte ptr [rsi + rax + 13], 15
+	mov	r13, qword ptr [rsp + 240]      # 8-byte Reload
+	movzx	edi, byte ptr [rsi + r13 + 14]
+	vmovd	xmm1, edi
+	vinserti128	ymm0, ymm0, xmm3, 1
+	vmovdqa	ymmword ptr [rsp + 960], ymm0   # 32-byte Spill
+	mov	r14, qword ptr [rsp + 256]      # 8-byte Reload
+	movzx	edi, byte ptr [rsi + r14 + 14]
+	vmovd	xmm0, edi
+	mov	rax, qword ptr [rsp + 152]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rax + 14], 1
+	mov	rdx, qword ptr [rsp + 232]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rdx + 14], 2
+	mov	rbx, qword ptr [rsp + 104]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rbx + 14], 3
+	mov	r8, qword ptr [rsp + 176]       # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r8 + 14], 4
+	mov	rax, qword ptr [rsp + 200]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rax + 14], 5
+	mov	rcx, qword ptr [rsp + 168]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rcx + 14], 6
+	mov	rax, qword ptr [rsp + 192]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rax + 14], 7
+	mov	rax, qword ptr [rsp + 216]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rax + 14], 8
+	mov	rax, qword ptr [rsp + 184]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rax + 14], 9
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r9 + 14], 10
+	mov	r15, qword ptr [rsp + 72]       # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r15 + 14], 11
+	mov	rax, qword ptr [rsp + 248]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rax + 14], 12
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r11 + 14], 13
+	mov	rax, qword ptr [rsp + 40]       # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rax + 14], 14
+	mov	rax, qword ptr [rsp + 56]       # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rax + 14], 15
+	mov	r9, qword ptr [rsp + 208]       # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r9 + 14], 1
+	mov	rax, qword ptr [rsp + 88]       # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 14], 2
+	mov	rax, qword ptr [rsp + 64]       # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 14], 3
+	mov	r11, qword ptr [rsp + 160]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r11 + 14], 4
+	mov	rax, qword ptr [rsp + 320]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 14], 5
+	mov	r12, qword ptr [rsp + 144]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r12 + 14], 6
+	mov	rax, qword ptr [rsp + 224]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 14], 7
+	mov	rax, qword ptr [rsp + 264]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 14], 8
+	mov	rdi, qword ptr [rsp + 96]       # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rdi + 14], 9
+	mov	rdi, qword ptr [rsp + 136]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rdi + 14], 10
+	mov	rdi, qword ptr [rsp + 128]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rdi + 14], 11
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r10 + 14], 12
+	mov	rdi, qword ptr [rsp + 32]       # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rdi + 14], 13
+	mov	rdi, qword ptr [rsp + 288]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rdi + 14], 14
+	mov	rdi, qword ptr [rsp + 48]       # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rdi + 14], 15
+	movzx	edi, byte ptr [rsi + r13 + 15]
+	vmovd	xmm2, edi
+	mov	rdi, qword ptr [rsp + 152]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rdi + 15], 1
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rdx + 15], 2
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rbx + 15], 3
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r8 + 15], 4
+	mov	rbx, qword ptr [rsp + 200]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rbx + 15], 5
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rcx + 15], 6
+	mov	rdx, qword ptr [rsp + 192]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rdx + 15], 7
+	mov	rcx, qword ptr [rsp + 216]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rcx + 15], 8
+	mov	rcx, qword ptr [rsp + 184]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rcx + 15], 9
+	mov	rcx, qword ptr [rsp + 80]       # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rcx + 15], 10
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r15 + 15], 11
+	mov	r13, qword ptr [rsp + 248]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r13 + 15], 12
+	mov	rcx, qword ptr [rsp + 112]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rcx + 15], 13
+	mov	r8, qword ptr [rsp + 40]        # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r8 + 15], 14
+	mov	rcx, qword ptr [rsp + 56]       # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rcx + 15], 15
+	movzx	edi, byte ptr [rsi + r14 + 15]
+	vmovd	xmm3, edi
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r9 + 15], 1
+	mov	rcx, qword ptr [rsp + 88]       # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rcx + 15], 2
+	mov	rcx, qword ptr [rsp + 64]       # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rcx + 15], 3
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r11 + 15], 4
+	mov	rcx, qword ptr [rsp + 320]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rcx + 15], 5
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r12 + 15], 6
+	mov	r10, qword ptr [rsp + 224]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r10 + 15], 7
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rax + 15], 8
+	mov	rax, qword ptr [rsp + 96]       # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rax + 15], 9
+	mov	rax, qword ptr [rsp + 136]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rax + 15], 10
+	mov	r12, qword ptr [rsp + 128]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r12 + 15], 11
+	mov	rax, qword ptr [rsp + 120]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rax + 15], 12
+	mov	rax, qword ptr [rsp + 32]       # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rax + 15], 13
+	mov	rax, qword ptr [rsp + 288]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rax + 15], 14
+	mov	r15, qword ptr [rsp + 48]       # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r15 + 15], 15
+	vinserti128	ymm0, ymm0, xmm1, 1
+	vmovdqa	ymmword ptr [rsp + 896], ymm0   # 32-byte Spill
+	vinserti128	ymm0, ymm3, xmm2, 1
+	vmovdqa	ymmword ptr [rsp + 928], ymm0   # 32-byte Spill
+	mov	rax, qword ptr [rsp + 240]      # 8-byte Reload
+	movzx	edi, byte ptr [rsi + rax + 16]
+	vmovd	xmm0, edi
+	mov	r14, qword ptr [rsp + 152]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r14 + 16], 1
+	mov	rax, qword ptr [rsp + 232]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 16], 2
+	mov	r11, qword ptr [rsp + 104]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r11 + 16], 3
+	mov	rax, qword ptr [rsp + 176]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 16], 4
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rbx + 16], 5
+	mov	r9, qword ptr [rsp + 168]       # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r9 + 16], 6
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rdx + 16], 7
+	mov	rdx, qword ptr [rsp + 216]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rdx + 16], 8
+	mov	rax, qword ptr [rsp + 184]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 16], 9
+	mov	rbx, qword ptr [rsp + 80]       # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rbx + 16], 10
+	mov	rax, qword ptr [rsp + 72]       # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 16], 11
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r13 + 16], 12
+	mov	rax, qword ptr [rsp + 112]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 16], 13
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r8 + 16], 14
+	mov	rax, qword ptr [rsp + 56]       # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 16], 15
+	mov	rdi, qword ptr [rsp + 256]      # 8-byte Reload
+	movzx	edi, byte ptr [rsi + rdi + 16]
+	vmovd	xmm1, edi
+	mov	rdi, qword ptr [rsp + 208]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rdi + 16], 1
+	mov	rdi, qword ptr [rsp + 88]       # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rdi + 16], 2
+	mov	rdi, qword ptr [rsp + 64]       # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rdi + 16], 3
+	mov	rdi, qword ptr [rsp + 160]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rdi + 16], 4
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rcx + 16], 5
+	mov	rcx, qword ptr [rsp + 144]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rcx + 16], 6
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r10 + 16], 7
+	mov	rcx, qword ptr [rsp + 264]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rcx + 16], 8
+	mov	rdi, qword ptr [rsp + 96]       # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rdi + 16], 9
+	mov	rdi, qword ptr [rsp + 136]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rdi + 16], 10
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r12 + 16], 11
+	mov	rdi, qword ptr [rsp + 120]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rdi + 16], 12
+	mov	r12, qword ptr [rsp + 32]       # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r12 + 16], 13
+	mov	r13, qword ptr [rsp + 288]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r13 + 16], 14
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r15 + 16], 15
+	mov	rdi, qword ptr [rsp + 240]      # 8-byte Reload
+	movzx	edi, byte ptr [rsi + rdi + 17]
+	vmovd	xmm2, edi
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r14 + 17], 1
+	mov	r8, qword ptr [rsp + 232]       # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r8 + 17], 2
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r11 + 17], 3
+	mov	r10, qword ptr [rsp + 176]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r10 + 17], 4
+	mov	rdi, qword ptr [rsp + 200]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rdi + 17], 5
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r9 + 17], 6
+	mov	rdi, qword ptr [rsp + 192]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rdi + 17], 7
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rdx + 17], 8
+	mov	rdx, qword ptr [rsp + 184]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rdx + 17], 9
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rbx + 17], 10
+	mov	r11, qword ptr [rsp + 72]       # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r11 + 17], 11
+	mov	rdx, qword ptr [rsp + 248]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rdx + 17], 12
+	mov	rdx, qword ptr [rsp + 112]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rdx + 17], 13
+	mov	rdx, qword ptr [rsp + 40]       # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rdx + 17], 14
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rax + 17], 15
+	mov	rax, qword ptr [rsp + 256]      # 8-byte Reload
+	movzx	edi, byte ptr [rsi + rax + 17]
+	vmovd	xmm3, edi
+	mov	r14, qword ptr [rsp + 208]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r14 + 17], 1
+	mov	r15, qword ptr [rsp + 88]       # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r15 + 17], 2
+	mov	r9, qword ptr [rsp + 64]        # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r9 + 17], 3
+	mov	rdx, qword ptr [rsp + 160]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rdx + 17], 4
+	mov	rax, qword ptr [rsp + 320]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rax + 17], 5
+	mov	rax, qword ptr [rsp + 144]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rax + 17], 6
+	mov	rbx, qword ptr [rsp + 224]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rbx + 17], 7
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rcx + 17], 8
+	mov	rcx, qword ptr [rsp + 96]       # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rcx + 17], 9
+	mov	rax, qword ptr [rsp + 136]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rax + 17], 10
+	mov	rax, qword ptr [rsp + 128]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rax + 17], 11
+	mov	rax, qword ptr [rsp + 120]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rax + 17], 12
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r12 + 17], 13
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r13 + 17], 14
+	vinserti128	ymm0, ymm1, xmm0, 1
+	vmovdqa	ymmword ptr [rsp + 864], ymm0   # 32-byte Spill
+	mov	rax, qword ptr [rsp + 48]       # 8-byte Reload
+	vpinsrb	xmm0, xmm3, byte ptr [rsi + rax + 17], 15
+	vinserti128	ymm0, ymm0, xmm2, 1
+	vmovdqa	ymmword ptr [rsp + 832], ymm0   # 32-byte Spill
+	mov	rax, qword ptr [rsp + 240]      # 8-byte Reload
+	movzx	edi, byte ptr [rsi + rax + 18]
+	vmovd	xmm0, edi
+	mov	rax, qword ptr [rsp + 152]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 18], 1
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r8 + 18], 2
+	mov	rax, qword ptr [rsp + 104]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 18], 3
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r10 + 18], 4
+	mov	rax, qword ptr [rsp + 200]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 18], 5
+	mov	r8, qword ptr [rsp + 168]       # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r8 + 18], 6
+	mov	rax, qword ptr [rsp + 192]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 18], 7
+	mov	rax, qword ptr [rsp + 216]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 18], 8
+	mov	rax, qword ptr [rsp + 184]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 18], 9
+	mov	r10, qword ptr [rsp + 80]       # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r10 + 18], 10
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r11 + 18], 11
+	mov	r12, qword ptr [rsp + 248]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r12 + 18], 12
+	mov	r11, qword ptr [rsp + 112]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r11 + 18], 13
+	mov	rdi, qword ptr [rsp + 40]       # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rdi + 18], 14
+	mov	rdi, qword ptr [rsp + 56]       # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rdi + 18], 15
+	mov	rdi, qword ptr [rsp + 256]      # 8-byte Reload
+	movzx	edi, byte ptr [rsi + rdi + 18]
+	vmovd	xmm1, edi
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r14 + 18], 1
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r15 + 18], 2
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r9 + 18], 3
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rdx + 18], 4
+	mov	rdx, qword ptr [rsp + 320]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rdx + 18], 5
+	mov	r14, qword ptr [rsp + 144]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r14 + 18], 6
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rbx + 18], 7
+	mov	rdx, qword ptr [rsp + 264]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rdx + 18], 8
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rcx + 18], 9
+	mov	rcx, qword ptr [rsp + 136]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rcx + 18], 10
+	mov	r15, qword ptr [rsp + 128]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r15 + 18], 11
+	mov	rdx, qword ptr [rsp + 120]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rdx + 18], 12
+	mov	rdi, qword ptr [rsp + 32]       # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rdi + 18], 13
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r13 + 18], 14
+	mov	rbx, qword ptr [rsp + 48]       # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rbx + 18], 15
+	mov	rdi, qword ptr [rsp + 240]      # 8-byte Reload
+	movzx	edi, byte ptr [rsi + rdi + 19]
+	vmovd	xmm2, edi
+	mov	rdi, qword ptr [rsp + 152]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rdi + 19], 1
+	mov	rdi, qword ptr [rsp + 232]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rdi + 19], 2
+	mov	rdi, qword ptr [rsp + 104]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rdi + 19], 3
+	mov	rdi, qword ptr [rsp + 176]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rdi + 19], 4
+	mov	r13, qword ptr [rsp + 200]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r13 + 19], 5
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r8 + 19], 6
+	mov	rdi, qword ptr [rsp + 192]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rdi + 19], 7
+	mov	rdi, qword ptr [rsp + 216]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rdi + 19], 8
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rax + 19], 9
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r10 + 19], 10
+	mov	rax, qword ptr [rsp + 72]       # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rax + 19], 11
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r12 + 19], 12
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r11 + 19], 13
+	mov	r9, qword ptr [rsp + 40]        # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r9 + 19], 14
+	mov	rax, qword ptr [rsp + 56]       # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rax + 19], 15
+	mov	r10, qword ptr [rsp + 256]      # 8-byte Reload
+	movzx	edi, byte ptr [rsi + r10 + 19]
+	vmovd	xmm3, edi
+	mov	rax, qword ptr [rsp + 208]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rax + 19], 1
+	mov	r8, qword ptr [rsp + 88]        # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r8 + 19], 2
+	mov	rax, qword ptr [rsp + 64]       # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rax + 19], 3
+	mov	rax, qword ptr [rsp + 160]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rax + 19], 4
+	mov	rax, qword ptr [rsp + 320]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rax + 19], 5
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r14 + 19], 6
+	mov	rax, qword ptr [rsp + 224]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rax + 19], 7
+	mov	rax, qword ptr [rsp + 264]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rax + 19], 8
+	mov	rax, qword ptr [rsp + 96]       # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rax + 19], 9
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rcx + 19], 10
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r15 + 19], 11
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rdx + 19], 12
+	mov	rax, qword ptr [rsp + 32]       # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rax + 19], 13
+	mov	rax, qword ptr [rsp + 288]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rax + 19], 14
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rbx + 19], 15
+	vinserti128	ymm0, ymm1, xmm0, 1
+	vmovdqa	ymmword ptr [rsp + 768], ymm0   # 32-byte Spill
+	vinserti128	ymm0, ymm3, xmm2, 1
+	vmovdqa	ymmword ptr [rsp + 800], ymm0   # 32-byte Spill
+	mov	r11, qword ptr [rsp + 240]      # 8-byte Reload
+	movzx	edi, byte ptr [rsi + r11 + 20]
+	vmovd	xmm0, edi
+	mov	rax, qword ptr [rsp + 152]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 20], 1
+	mov	rdx, qword ptr [rsp + 232]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rdx + 20], 2
+	mov	rdi, qword ptr [rsp + 104]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rdi + 20], 3
+	mov	rdi, qword ptr [rsp + 176]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rdi + 20], 4
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r13 + 20], 5
+	mov	rdi, qword ptr [rsp + 168]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rdi + 20], 6
+	mov	r13, qword ptr [rsp + 192]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r13 + 20], 7
+	mov	r15, qword ptr [rsp + 216]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r15 + 20], 8
+	mov	rdi, qword ptr [rsp + 184]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rdi + 20], 9
+	mov	r12, qword ptr [rsp + 80]       # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r12 + 20], 10
+	mov	rcx, qword ptr [rsp + 72]       # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rcx + 20], 11
+	mov	rdi, qword ptr [rsp + 248]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rdi + 20], 12
+	mov	rdi, qword ptr [rsp + 112]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rdi + 20], 13
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r9 + 20], 14
+	mov	r14, qword ptr [rsp + 56]       # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r14 + 20], 15
+	movzx	edi, byte ptr [rsi + r10 + 20]
+	vmovd	xmm1, edi
+	mov	rbx, qword ptr [rsp + 208]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rbx + 20], 1
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r8 + 20], 2
+	mov	r8, qword ptr [rsp + 64]        # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r8 + 20], 3
+	mov	rdi, qword ptr [rsp + 160]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rdi + 20], 4
+	mov	rdi, qword ptr [rsp + 320]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rdi + 20], 5
+	mov	rdi, qword ptr [rsp + 144]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rdi + 20], 6
+	mov	r10, qword ptr [rsp + 224]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r10 + 20], 7
+	mov	rdi, qword ptr [rsp + 264]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rdi + 20], 8
+	mov	rdi, qword ptr [rsp + 96]       # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rdi + 20], 9
+	mov	rdi, qword ptr [rsp + 136]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rdi + 20], 10
+	mov	rdi, qword ptr [rsp + 128]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rdi + 20], 11
+	mov	r9, qword ptr [rsp + 120]       # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r9 + 20], 12
+	mov	rdi, qword ptr [rsp + 32]       # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rdi + 20], 13
+	mov	rdi, qword ptr [rsp + 288]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rdi + 20], 14
+	mov	rdi, qword ptr [rsp + 48]       # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rdi + 20], 15
+	movzx	edi, byte ptr [rsi + r11 + 21]
+	vmovd	xmm2, edi
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rax + 21], 1
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rdx + 21], 2
+	mov	rdx, qword ptr [rsp + 104]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rdx + 21], 3
+	mov	rax, qword ptr [rsp + 176]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rax + 21], 4
+	mov	rax, qword ptr [rsp + 200]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rax + 21], 5
+	mov	rax, qword ptr [rsp + 168]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rax + 21], 6
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r13 + 21], 7
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r15 + 21], 8
+	mov	rax, qword ptr [rsp + 184]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rax + 21], 9
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r12 + 21], 10
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rcx + 21], 11
+	mov	r12, qword ptr [rsp + 248]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r12 + 21], 12
+	mov	rax, qword ptr [rsp + 112]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rax + 21], 13
+	mov	rax, qword ptr [rsp + 40]       # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rax + 21], 14
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r14 + 21], 15
+	mov	r11, qword ptr [rsp + 256]      # 8-byte Reload
+	movzx	edi, byte ptr [rsi + r11 + 21]
+	vmovd	xmm3, edi
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rbx + 21], 1
+	mov	rax, qword ptr [rsp + 88]       # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rax + 21], 2
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r8 + 21], 3
+	mov	rcx, qword ptr [rsp + 160]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rcx + 21], 4
+	mov	rax, qword ptr [rsp + 320]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rax + 21], 5
+	mov	rax, qword ptr [rsp + 144]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rax + 21], 6
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r10 + 21], 7
+	mov	r13, qword ptr [rsp + 264]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r13 + 21], 8
+	mov	rdi, qword ptr [rsp + 96]       # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rdi + 21], 9
+	mov	r15, qword ptr [rsp + 136]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r15 + 21], 10
+	mov	rbx, qword ptr [rsp + 128]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rbx + 21], 11
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r9 + 21], 12
+	mov	r8, qword ptr [rsp + 32]        # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r8 + 21], 13
+	mov	rdi, qword ptr [rsp + 288]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rdi + 21], 14
+	vinserti128	ymm0, ymm1, xmm0, 1
+	vmovdqa	ymmword ptr [rsp + 704], ymm0   # 32-byte Spill
+	mov	r10, qword ptr [rsp + 48]       # 8-byte Reload
+	vpinsrb	xmm0, xmm3, byte ptr [rsi + r10 + 21], 15
+	vinserti128	ymm0, ymm0, xmm2, 1
+	vmovdqa	ymmword ptr [rsp + 736], ymm0   # 32-byte Spill
+	mov	rdi, qword ptr [rsp + 240]      # 8-byte Reload
+	movzx	edi, byte ptr [rsi + rdi + 22]
+	vmovd	xmm0, edi
+	mov	rdi, qword ptr [rsp + 152]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rdi + 22], 1
+	mov	rdi, qword ptr [rsp + 232]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rdi + 22], 2
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rdx + 22], 3
+	mov	rdx, qword ptr [rsp + 176]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rdx + 22], 4
+	mov	rdx, qword ptr [rsp + 200]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rdx + 22], 5
+	mov	rdx, qword ptr [rsp + 168]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rdx + 22], 6
+	mov	rdx, qword ptr [rsp + 192]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rdx + 22], 7
+	mov	rdx, qword ptr [rsp + 216]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rdx + 22], 8
+	mov	rdx, qword ptr [rsp + 184]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rdx + 22], 9
+	mov	r14, qword ptr [rsp + 80]       # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r14 + 22], 10
+	mov	rdi, qword ptr [rsp + 72]       # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rdi + 22], 11
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r12 + 22], 12
+	mov	rdi, qword ptr [rsp + 112]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rdi + 22], 13
+	mov	r9, qword ptr [rsp + 40]        # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r9 + 22], 14
+	mov	rdi, qword ptr [rsp + 56]       # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rdi + 22], 15
+	movzx	edi, byte ptr [rsi + r11 + 22]
+	vmovd	xmm1, edi
+	mov	rdi, qword ptr [rsp + 208]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rdi + 22], 1
+	mov	rdi, qword ptr [rsp + 88]       # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rdi + 22], 2
+	mov	r12, qword ptr [rsp + 64]       # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r12 + 22], 3
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rcx + 22], 4
+	mov	rcx, qword ptr [rsp + 320]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rcx + 22], 5
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rax + 22], 6
+	mov	r11, qword ptr [rsp + 224]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r11 + 22], 7
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r13 + 22], 8
+	mov	rax, qword ptr [rsp + 96]       # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rax + 22], 9
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r15 + 22], 10
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rbx + 22], 11
+	mov	r15, qword ptr [rsp + 120]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r15 + 22], 12
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r8 + 22], 13
+	mov	rcx, qword ptr [rsp + 288]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rcx + 22], 14
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r10 + 22], 15
+	mov	rdi, qword ptr [rsp + 240]      # 8-byte Reload
+	movzx	edi, byte ptr [rsi + rdi + 23]
+	vmovd	xmm2, edi
+	mov	r10, qword ptr [rsp + 152]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r10 + 23], 1
+	mov	r8, qword ptr [rsp + 232]       # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r8 + 23], 2
+	mov	rdi, qword ptr [rsp + 104]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rdi + 23], 3
+	mov	rdi, qword ptr [rsp + 176]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rdi + 23], 4
+	mov	rdi, qword ptr [rsp + 200]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rdi + 23], 5
+	mov	r13, qword ptr [rsp + 168]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r13 + 23], 6
+	mov	rdi, qword ptr [rsp + 192]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rdi + 23], 7
+	mov	rdi, qword ptr [rsp + 216]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rdi + 23], 8
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rdx + 23], 9
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r14 + 23], 10
+	mov	rdx, qword ptr [rsp + 72]       # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rdx + 23], 11
+	mov	r14, qword ptr [rsp + 248]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r14 + 23], 12
+	mov	rbx, qword ptr [rsp + 112]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rbx + 23], 13
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r9 + 23], 14
+	mov	rdx, qword ptr [rsp + 56]       # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rdx + 23], 15
+	mov	rdi, qword ptr [rsp + 256]      # 8-byte Reload
+	movzx	edi, byte ptr [rsi + rdi + 23]
+	vmovd	xmm3, edi
+	mov	r9, qword ptr [rsp + 208]       # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r9 + 23], 1
+	mov	rdi, qword ptr [rsp + 88]       # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rdi + 23], 2
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r12 + 23], 3
+	mov	rdi, qword ptr [rsp + 160]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rdi + 23], 4
+	mov	rdi, qword ptr [rsp + 320]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rdi + 23], 5
+	mov	rdi, qword ptr [rsp + 144]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rdi + 23], 6
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r11 + 23], 7
+	mov	r12, qword ptr [rsp + 264]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r12 + 23], 8
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rax + 23], 9
+	mov	r11, qword ptr [rsp + 136]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r11 + 23], 10
+	mov	rax, qword ptr [rsp + 128]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rax + 23], 11
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r15 + 23], 12
+	mov	rax, qword ptr [rsp + 32]       # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rax + 23], 13
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rcx + 23], 14
+	mov	rcx, qword ptr [rsp + 48]       # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rcx + 23], 15
+	vinserti128	ymm10, ymm1, xmm0, 1
+	vinserti128	ymm0, ymm3, xmm2, 1
+	vmovdqa	ymmword ptr [rsp + 672], ymm0   # 32-byte Spill
+	mov	rcx, qword ptr [rsp + 240]      # 8-byte Reload
+	movzx	edi, byte ptr [rsi + rcx + 24]
+	vmovd	xmm0, edi
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r10 + 24], 1
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r8 + 24], 2
+	mov	r10, qword ptr [rsp + 104]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r10 + 24], 3
+	mov	rdi, qword ptr [rsp + 176]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rdi + 24], 4
+	mov	rdi, qword ptr [rsp + 200]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rdi + 24], 5
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r13 + 24], 6
+	mov	r8, qword ptr [rsp + 192]       # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r8 + 24], 7
+	mov	rdi, qword ptr [rsp + 216]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rdi + 24], 8
+	mov	r13, qword ptr [rsp + 184]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r13 + 24], 9
+	mov	rdi, qword ptr [rsp + 80]       # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rdi + 24], 10
+	mov	rdi, qword ptr [rsp + 72]       # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rdi + 24], 11
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r14 + 24], 12
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rbx + 24], 13
+	mov	rdi, qword ptr [rsp + 40]       # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rdi + 24], 14
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rdx + 24], 15
+	mov	rdx, qword ptr [rsp + 256]      # 8-byte Reload
+	movzx	edi, byte ptr [rsi + rdx + 24]
+	vmovd	xmm1, edi
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r9 + 24], 1
+	mov	r9, qword ptr [rsp + 88]        # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r9 + 24], 2
+	mov	rdi, qword ptr [rsp + 64]       # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rdi + 24], 3
+	mov	rdi, qword ptr [rsp + 160]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rdi + 24], 4
+	mov	rdi, qword ptr [rsp + 320]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rdi + 24], 5
+	mov	rdi, qword ptr [rsp + 144]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rdi + 24], 6
+	mov	rdi, qword ptr [rsp + 224]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rdi + 24], 7
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r12 + 24], 8
+	mov	rdi, qword ptr [rsp + 96]       # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rdi + 24], 9
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r11 + 24], 10
+	mov	rdi, qword ptr [rsp + 128]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rdi + 24], 11
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r15 + 24], 12
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rax + 24], 13
+	mov	rax, qword ptr [rsp + 288]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rax + 24], 14
+	mov	r11, qword ptr [rsp + 48]       # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r11 + 24], 15
+	movzx	edi, byte ptr [rsi + rcx + 25]
+	vmovd	xmm2, edi
+	mov	rax, qword ptr [rsp + 152]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rax + 25], 1
+	mov	rbx, qword ptr [rsp + 232]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rbx + 25], 2
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r10 + 25], 3
+	mov	rax, qword ptr [rsp + 176]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rax + 25], 4
+	mov	r14, qword ptr [rsp + 200]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r14 + 25], 5
+	mov	rax, qword ptr [rsp + 168]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rax + 25], 6
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r8 + 25], 7
+	mov	rcx, qword ptr [rsp + 216]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rcx + 25], 8
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r13 + 25], 9
+	mov	r15, qword ptr [rsp + 80]       # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r15 + 25], 10
+	mov	r8, qword ptr [rsp + 72]        # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r8 + 25], 11
+	mov	rax, qword ptr [rsp + 248]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rax + 25], 12
+	mov	rax, qword ptr [rsp + 112]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rax + 25], 13
+	mov	rax, qword ptr [rsp + 40]       # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rax + 25], 14
+	mov	rax, qword ptr [rsp + 56]       # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rax + 25], 15
+	movzx	edi, byte ptr [rsi + rdx + 25]
+	vmovd	xmm3, edi
+	mov	r12, qword ptr [rsp + 208]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r12 + 25], 1
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r9 + 25], 2
+	mov	rax, qword ptr [rsp + 64]       # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rax + 25], 3
+	mov	rax, qword ptr [rsp + 160]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rax + 25], 4
+	mov	rax, qword ptr [rsp + 320]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rax + 25], 5
+	mov	r13, qword ptr [rsp + 144]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r13 + 25], 6
+	mov	rax, qword ptr [rsp + 224]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rax + 25], 7
+	mov	rax, qword ptr [rsp + 264]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rax + 25], 8
+	mov	r10, qword ptr [rsp + 96]       # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r10 + 25], 9
+	mov	rax, qword ptr [rsp + 136]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rax + 25], 10
+	mov	rax, qword ptr [rsp + 128]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rax + 25], 11
+	mov	rdx, qword ptr [rsp + 120]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rdx + 25], 12
+	mov	rdx, qword ptr [rsp + 32]       # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rdx + 25], 13
+	mov	rdx, qword ptr [rsp + 288]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rdx + 25], 14
+	vinserti128	ymm9, ymm1, xmm0, 1
+	vpinsrb	xmm0, xmm3, byte ptr [rsi + r11 + 25], 15
+	vinserti128	ymm8, ymm0, xmm2, 1
+	mov	r11, qword ptr [rsp + 240]      # 8-byte Reload
+	movzx	edi, byte ptr [rsi + r11 + 26]
+	vmovd	xmm0, edi
+	mov	rdx, qword ptr [rsp + 152]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rdx + 26], 1
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rbx + 26], 2
+	mov	rbx, qword ptr [rsp + 104]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rbx + 26], 3
+	mov	rdx, qword ptr [rsp + 176]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rdx + 26], 4
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r14 + 26], 5
+	mov	r9, qword ptr [rsp + 168]       # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r9 + 26], 6
+	mov	rdi, qword ptr [rsp + 192]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rdi + 26], 7
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rcx + 26], 8
+	mov	rcx, qword ptr [rsp + 184]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rcx + 26], 9
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r15 + 26], 10
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r8 + 26], 11
+	mov	r14, qword ptr [rsp + 248]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r14 + 26], 12
+	mov	r15, qword ptr [rsp + 112]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r15 + 26], 13
+	mov	rcx, qword ptr [rsp + 40]       # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rcx + 26], 14
+	mov	rcx, qword ptr [rsp + 56]       # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rcx + 26], 15
+	mov	rcx, qword ptr [rsp + 256]      # 8-byte Reload
+	movzx	edi, byte ptr [rsi + rcx + 26]
+	vmovd	xmm1, edi
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r12 + 26], 1
+	mov	rcx, qword ptr [rsp + 88]       # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rcx + 26], 2
+	mov	rcx, qword ptr [rsp + 64]       # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rcx + 26], 3
+	mov	rcx, qword ptr [rsp + 160]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rcx + 26], 4
+	mov	r12, qword ptr [rsp + 320]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r12 + 26], 5
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r13 + 26], 6
+	mov	rcx, qword ptr [rsp + 224]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rcx + 26], 7
+	mov	r13, qword ptr [rsp + 264]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r13 + 26], 8
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r10 + 26], 9
+	mov	rcx, qword ptr [rsp + 136]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rcx + 26], 10
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rax + 26], 11
+	mov	rax, qword ptr [rsp + 120]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rax + 26], 12
+	mov	r10, qword ptr [rsp + 32]       # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r10 + 26], 13
+	mov	rax, qword ptr [rsp + 288]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rax + 26], 14
+	mov	rax, qword ptr [rsp + 48]       # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rax + 26], 15
+	movzx	edi, byte ptr [rsi + r11 + 27]
+	vmovd	xmm2, edi
+	mov	r11, qword ptr [rsp + 152]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r11 + 27], 1
+	mov	rax, qword ptr [rsp + 232]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rax + 27], 2
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rbx + 27], 3
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rdx + 27], 4
+	mov	r8, qword ptr [rsp + 200]       # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r8 + 27], 5
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r9 + 27], 6
+	mov	rax, qword ptr [rsp + 192]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rax + 27], 7
+	mov	rdx, qword ptr [rsp + 216]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rdx + 27], 8
+	mov	rbx, qword ptr [rsp + 184]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rbx + 27], 9
+	mov	rdx, qword ptr [rsp + 80]       # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rdx + 27], 10
+	mov	rdx, qword ptr [rsp + 72]       # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rdx + 27], 11
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r14 + 27], 12
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r15 + 27], 13
+	mov	rdx, qword ptr [rsp + 40]       # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rdx + 27], 14
+	mov	rdx, qword ptr [rsp + 56]       # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rdx + 27], 15
+	mov	rdx, qword ptr [rsp + 256]      # 8-byte Reload
+	movzx	edi, byte ptr [rsi + rdx + 27]
+	vmovd	xmm3, edi
+	mov	rdx, qword ptr [rsp + 208]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rdx + 27], 1
+	mov	r9, qword ptr [rsp + 88]        # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r9 + 27], 2
+	mov	rdx, qword ptr [rsp + 64]       # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rdx + 27], 3
+	mov	r14, qword ptr [rsp + 160]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r14 + 27], 4
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r12 + 27], 5
+	mov	r15, qword ptr [rsp + 144]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r15 + 27], 6
+	mov	rdi, qword ptr [rsp + 224]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rdi + 27], 7
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r13 + 27], 8
+	mov	rdi, qword ptr [rsp + 96]       # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rdi + 27], 9
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rcx + 27], 10
+	mov	rcx, qword ptr [rsp + 128]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rcx + 27], 11
+	mov	rcx, qword ptr [rsp + 120]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rcx + 27], 12
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r10 + 27], 13
+	mov	r13, qword ptr [rsp + 288]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r13 + 27], 14
+	mov	rcx, qword ptr [rsp + 48]       # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rcx + 27], 15
+	vinserti128	ymm0, ymm1, xmm0, 1
+	vmovdqa	ymmword ptr [rsp + 544], ymm0   # 32-byte Spill
+	vinserti128	ymm0, ymm3, xmm2, 1
+	vmovdqa	ymmword ptr [rsp + 576], ymm0   # 32-byte Spill
+	mov	rcx, qword ptr [rsp + 240]      # 8-byte Reload
+	movzx	edi, byte ptr [rsi + rcx + 28]
+	vmovd	xmm0, edi
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r11 + 28], 1
+	mov	rcx, qword ptr [rsp + 232]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rcx + 28], 2
+	mov	rdi, qword ptr [rsp + 104]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rdi + 28], 3
+	mov	r11, qword ptr [rsp + 176]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r11 + 28], 4
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r8 + 28], 5
+	mov	rdi, qword ptr [rsp + 168]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rdi + 28], 6
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 28], 7
+	mov	rax, qword ptr [rsp + 216]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 28], 8
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rbx + 28], 9
+	mov	rax, qword ptr [rsp + 80]       # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 28], 10
+	mov	rax, qword ptr [rsp + 72]       # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 28], 11
+	mov	rbx, qword ptr [rsp + 248]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rbx + 28], 12
+	mov	rax, qword ptr [rsp + 112]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 28], 13
+	mov	rdi, qword ptr [rsp + 40]       # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rdi + 28], 14
+	mov	rdi, qword ptr [rsp + 56]       # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rdi + 28], 15
+	mov	r12, qword ptr [rsp + 256]      # 8-byte Reload
+	movzx	edi, byte ptr [rsi + r12 + 28]
+	vmovd	xmm1, edi
+	mov	rax, qword ptr [rsp + 208]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rax + 28], 1
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r9 + 28], 2
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rdx + 28], 3
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r14 + 28], 4
+	mov	r9, qword ptr [rsp + 320]       # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r9 + 28], 5
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r15 + 28], 6
+	mov	r15, qword ptr [rsp + 224]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r15 + 28], 7
+	mov	r8, qword ptr [rsp + 264]       # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r8 + 28], 8
+	mov	rax, qword ptr [rsp + 96]       # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rax + 28], 9
+	mov	r14, qword ptr [rsp + 136]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r14 + 28], 10
+	mov	r10, qword ptr [rsp + 128]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r10 + 28], 11
+	mov	rdx, qword ptr [rsp + 120]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rdx + 28], 12
+	mov	rax, qword ptr [rsp + 32]       # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rax + 28], 13
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r13 + 28], 14
+	mov	rdi, qword ptr [rsp + 48]       # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rdi + 28], 15
+	mov	rdi, qword ptr [rsp + 240]      # 8-byte Reload
+	movzx	edi, byte ptr [rsi + rdi + 29]
+	vmovd	xmm2, edi
+	mov	r13, qword ptr [rsp + 152]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r13 + 29], 1
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rcx + 29], 2
+	mov	rcx, qword ptr [rsp + 104]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rcx + 29], 3
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r11 + 29], 4
+	mov	r11, qword ptr [rsp + 200]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r11 + 29], 5
+	mov	rdi, qword ptr [rsp + 168]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rdi + 29], 6
+	mov	rdi, qword ptr [rsp + 192]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rdi + 29], 7
+	mov	rdi, qword ptr [rsp + 216]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rdi + 29], 8
+	mov	rdi, qword ptr [rsp + 184]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rdi + 29], 9
+	mov	rdi, qword ptr [rsp + 80]       # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rdi + 29], 10
+	mov	rdi, qword ptr [rsp + 72]       # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rdi + 29], 11
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rbx + 29], 12
+	mov	rdi, qword ptr [rsp + 112]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rdi + 29], 13
+	mov	rdi, qword ptr [rsp + 40]       # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rdi + 29], 14
+	mov	rdi, qword ptr [rsp + 56]       # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rdi + 29], 15
+	movzx	edi, byte ptr [rsi + r12 + 29]
+	vmovd	xmm3, edi
+	mov	rbx, qword ptr [rsp + 208]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rbx + 29], 1
+	mov	rdi, qword ptr [rsp + 88]       # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rdi + 29], 2
+	mov	rdi, qword ptr [rsp + 64]       # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rdi + 29], 3
+	mov	r12, qword ptr [rsp + 160]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r12 + 29], 4
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r9 + 29], 5
+	mov	rdi, qword ptr [rsp + 144]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rdi + 29], 6
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r15 + 29], 7
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r8 + 29], 8
+	mov	r9, qword ptr [rsp + 96]        # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r9 + 29], 9
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r14 + 29], 10
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r10 + 29], 11
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rdx + 29], 12
+	mov	r14, rdx
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rax + 29], 13
+	mov	r10, qword ptr [rsp + 288]      # 8-byte Reload
+	vpinsrb	xmm4, xmm3, byte ptr [rsi + r10 + 29], 14
+	vinserti128	ymm0, ymm1, xmm0, 1
+	vmovdqa	ymmword ptr [rsp + 640], ymm0   # 32-byte Spill
+	mov	rdx, qword ptr [rsp + 48]       # 8-byte Reload
+	vpinsrb	xmm0, xmm4, byte ptr [rsi + rdx + 29], 15
+	vinserti128	ymm0, ymm0, xmm2, 1
+	vmovdqa	ymmword ptr [rsp + 608], ymm0   # 32-byte Spill
+	mov	r8, qword ptr [rsp + 240]       # 8-byte Reload
+	movzx	edi, byte ptr [rsi + r8 + 30]
+	vmovd	xmm0, edi
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r13 + 30], 1
+	movzx	edi, byte ptr [rsi + r8 + 31]
+	vmovd	xmm1, edi
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r13 + 31], 1
+	mov	rax, qword ptr [rsp + 232]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 30], 2
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rax + 31], 2
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rcx + 30], 3
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rcx + 31], 3
+	mov	rax, qword ptr [rsp + 176]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 30], 4
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rax + 31], 4
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r11 + 30], 5
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r11 + 31], 5
+	mov	rax, qword ptr [rsp + 168]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 30], 6
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rax + 31], 6
+	mov	r11, qword ptr [rsp + 272]      # 8-byte Reload
+	mov	rax, qword ptr [rsp + 192]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 30], 7
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rax + 31], 7
+	mov	rax, qword ptr [rsp + 216]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 30], 8
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rax + 31], 8
+	mov	rax, qword ptr [rsp + 184]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 30], 9
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rax + 31], 9
+	mov	rax, qword ptr [rsp + 80]       # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 30], 10
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rax + 31], 10
+	mov	rax, qword ptr [rsp + 72]       # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 30], 11
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rax + 31], 11
+	mov	rax, qword ptr [rsp + 248]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 30], 12
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rax + 31], 12
+	mov	rax, qword ptr [rsp + 112]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 30], 13
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rax + 31], 13
+	mov	rax, qword ptr [rsp + 40]       # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 30], 14
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rax + 31], 14
+	mov	rax, qword ptr [rsp + 56]       # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 30], 15
+	vpinsrb	xmm2, xmm1, byte ptr [rsi + rax + 31], 15
+	mov	rcx, qword ptr [rsp + 256]      # 8-byte Reload
+	movzx	eax, byte ptr [rsi + rcx + 30]
+	vmovd	xmm1, eax
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rbx + 30], 1
+	movzx	eax, byte ptr [rsi + rcx + 31]
+	vmovd	xmm7, eax
+	vpinsrb	xmm7, xmm7, byte ptr [rsi + rbx + 31], 1
+	mov	rax, qword ptr [rsp + 88]       # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rax + 30], 2
+	vpinsrb	xmm7, xmm7, byte ptr [rsi + rax + 31], 2
+	mov	rax, qword ptr [rsp + 64]       # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rax + 30], 3
+	vpinsrb	xmm7, xmm7, byte ptr [rsi + rax + 31], 3
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r12 + 30], 4
+	vpinsrb	xmm7, xmm7, byte ptr [rsi + r12 + 31], 4
+	mov	rax, qword ptr [rsp + 320]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rax + 30], 5
+	vpinsrb	xmm7, xmm7, byte ptr [rsi + rax + 31], 5
+	mov	rax, qword ptr [rsp + 144]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rax + 30], 6
+	vpinsrb	xmm7, xmm7, byte ptr [rsi + rax + 31], 6
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r15 + 30], 7
+	vpinsrb	xmm7, xmm7, byte ptr [rsi + r15 + 31], 7
+	mov	rax, qword ptr [rsp + 264]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rax + 30], 8
+	vpinsrb	xmm7, xmm7, byte ptr [rsi + rax + 31], 8
+	mov	rax, r9
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r9 + 30], 9
+	vpinsrb	xmm7, xmm7, byte ptr [rsi + r9 + 31], 9
+	mov	rax, qword ptr [rsp + 136]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rax + 30], 10
+	vpinsrb	xmm7, xmm7, byte ptr [rsi + rax + 31], 10
+	mov	rax, qword ptr [rsp + 128]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rax + 30], 11
+	vpinsrb	xmm7, xmm7, byte ptr [rsi + rax + 31], 11
+	mov	rax, r14
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r14 + 30], 12
+	vpinsrb	xmm7, xmm7, byte ptr [rsi + r14 + 31], 12
+	mov	rax, qword ptr [rsp + 32]       # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rax + 30], 13
+	vpinsrb	xmm7, xmm7, byte ptr [rsi + rax + 31], 13
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r10 + 30], 14
+	vpinsrb	xmm7, xmm7, byte ptr [rsi + r10 + 31], 14
+	mov	rax, rdx
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rdx + 30], 15
+	vpinsrb	xmm7, xmm7, byte ptr [rsi + rdx + 31], 15
+	vinserti128	ymm0, ymm1, xmm0, 1
+	vmovdqa	ymmword ptr [rsp + 320], ymm0   # 32-byte Spill
+	vinserti128	ymm0, ymm7, xmm2, 1
+	vmovdqa	ymmword ptr [rsp + 288], ymm0   # 32-byte Spill
+	vmovdqa	ymm0, ymmword ptr [rsp + 512]   # 32-byte Reload
+	vpcmpeqb	ymm2, ymm0, ymmword ptr [rsp + 1216] # 32-byte Folded Reload
+	vmovdqa	ymm1, ymmword ptr [rip + .LCPI1_0] # ymm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+	vpand	ymm7, ymm2, ymm1
+	vpsubb	ymm11, ymm7, ymm2
+	vpcmpeqb	ymm7, ymm15, ymm0
+	vpand	ymm7, ymm7, ymm1
+	vpcmpeqb	ymm12, ymm0, ymmword ptr [rsp + 480] # 32-byte Folded Reload
+	vmovdqa	ymm6, ymmword ptr [rip + .LCPI1_1] # ymm6 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+	vpand	ymm12, ymm12, ymm6
+	vpor	ymm7, ymm12, ymm7
+	vpor	ymm11, ymm11, ymm7
+	vpcmpeqb	ymm7, ymm0, ymmword ptr [rsp + 448] # 32-byte Folded Reload
+	vmovdqa	ymm2, ymmword ptr [rip + .LCPI1_2] # ymm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+	vpand	ymm7, ymm7, ymm2
+	vpcmpeqb	ymm12, ymm14, ymm0
+	vmovdqa	ymm4, ymmword ptr [rip + .LCPI1_3] # ymm4 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+	vpand	ymm12, ymm12, ymm4
+	vpor	ymm7, ymm12, ymm7
+	vpcmpeqb	ymm12, ymm0, ymmword ptr [rsp + 416] # 32-byte Folded Reload
+	vmovdqa	ymm13, ymmword ptr [rip + .LCPI1_4] # ymm13 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+	vpand	ymm12, ymm12, ymm13
+	vmovdqa	ymm14, ymm13
+	vpor	ymm7, ymm12, ymm7
+	vpor	ymm11, ymm11, ymm7
+	vpcmpeqb	ymm7, ymm0, ymmword ptr [rsp + 1184] # 32-byte Folded Reload
+	vmovdqa	ymm5, ymmword ptr [rip + .LCPI1_5] # ymm5 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+	vpand	ymm7, ymm7, ymm5
+	vpcmpeqb	ymm12, ymm0, ymmword ptr [rsp + 1152] # 32-byte Folded Reload
+	vpsllw	ymm12, ymm12, 7
+	vmovdqa	ymm15, ymmword ptr [rip + .LCPI1_6] # ymm15 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+	vpand	ymm12, ymm12, ymm15
+	vpor	ymm7, ymm12, ymm7
+	vpor	ymm13, ymm11, ymm7
+	vpcmpeqb	ymm7, ymm0, ymmword ptr [rsp + 1088] # 32-byte Folded Reload
+	vpand	ymm12, ymm7, ymm1
+	vpsubb	ymm7, ymm12, ymm7
+	vpcmpeqb	ymm12, ymm0, ymmword ptr [rsp + 1120] # 32-byte Folded Reload
+	vpand	ymm12, ymm12, ymm1
+	vpcmpeqb	ymm11, ymm0, ymmword ptr [rsp + 1056] # 32-byte Folded Reload
+	vpand	ymm11, ymm11, ymm6
+	vpor	ymm11, ymm12, ymm11
+	vpor	ymm7, ymm11, ymm7
+	vpcmpeqb	ymm11, ymm0, ymmword ptr [rsp + 1024] # 32-byte Folded Reload
+	vpand	ymm11, ymm11, ymm2
+	vpcmpeqb	ymm12, ymm0, ymmword ptr [rsp + 992] # 32-byte Folded Reload
+	vpand	ymm12, ymm12, ymm4
+	vpor	ymm11, ymm11, ymm12
+	vpcmpeqb	ymm12, ymm0, ymmword ptr [rsp + 960] # 32-byte Folded Reload
+	vpand	ymm12, ymm12, ymm14
+	vmovdqa	ymm3, ymm14
+	vpor	ymm11, ymm11, ymm12
+	vpor	ymm7, ymm11, ymm7
+	vpcmpeqb	ymm11, ymm0, ymmword ptr [rsp + 896] # 32-byte Folded Reload
+	vpand	ymm11, ymm11, ymm5
+	vpcmpeqb	ymm12, ymm0, ymmword ptr [rsp + 928] # 32-byte Folded Reload
+	vpsllw	ymm12, ymm12, 7
+	vpand	ymm12, ymm12, ymm15
+	vpor	ymm11, ymm11, ymm12
+	vpor	ymm12, ymm11, ymm7
+	vpcmpeqb	ymm7, ymm0, ymmword ptr [rsp + 832] # 32-byte Folded Reload
+	vpand	ymm11, ymm7, ymm1
+	vpsubb	ymm7, ymm11, ymm7
+	vpcmpeqb	ymm11, ymm0, ymmword ptr [rsp + 864] # 32-byte Folded Reload
+	vpand	ymm11, ymm11, ymm1
+	vpcmpeqb	ymm14, ymm0, ymmword ptr [rsp + 768] # 32-byte Folded Reload
+	vpand	ymm14, ymm14, ymm6
+	vpor	ymm11, ymm11, ymm14
+	vpor	ymm7, ymm11, ymm7
+	vpcmpeqb	ymm11, ymm0, ymmword ptr [rsp + 800] # 32-byte Folded Reload
+	vpand	ymm11, ymm11, ymm2
+	vpcmpeqb	ymm14, ymm0, ymmword ptr [rsp + 704] # 32-byte Folded Reload
+	vpand	ymm14, ymm14, ymm4
+	vpor	ymm11, ymm11, ymm14
+	vpcmpeqb	ymm14, ymm0, ymmword ptr [rsp + 736] # 32-byte Folded Reload
+	vpand	ymm14, ymm14, ymm3
+	vpor	ymm11, ymm11, ymm14
+	vpor	ymm7, ymm11, ymm7
+	vpcmpeqb	ymm10, ymm10, ymm0
+	vmovdqa	ymm14, ymm5
+	vpand	ymm10, ymm10, ymm5
+	vpcmpeqb	ymm11, ymm0, ymmword ptr [rsp + 672] # 32-byte Folded Reload
+	vpsllw	ymm11, ymm11, 7
+	vpand	ymm11, ymm11, ymm15
+	vpor	ymm10, ymm10, ymm11
+	vpor	ymm7, ymm10, ymm7
+	vpcmpeqb	ymm8, ymm8, ymm0
+	vpand	ymm10, ymm8, ymm1
+	vpsubb	ymm8, ymm10, ymm8
+	vpcmpeqb	ymm9, ymm9, ymm0
+	vpand	ymm9, ymm9, ymm1
+	vpcmpeqb	ymm5, ymm0, ymmword ptr [rsp + 544] # 32-byte Folded Reload
+	vpand	ymm5, ymm5, ymm6
+	vpor	ymm5, ymm9, ymm5
+	vpor	ymm5, ymm8, ymm5
+	vpcmpeqb	ymm6, ymm0, ymmword ptr [rsp + 576] # 32-byte Folded Reload
+	vpand	ymm6, ymm6, ymm2
+	vpcmpeqb	ymm3, ymm0, ymmword ptr [rsp + 640] # 32-byte Folded Reload
+	vpand	ymm3, ymm3, ymm4
+	vpor	ymm3, ymm6, ymm3
+	vpcmpeqb	ymm4, ymm0, ymmword ptr [rsp + 608] # 32-byte Folded Reload
+	vpand	ymm4, ymm4, ymmword ptr [rip + .LCPI1_4]
+	vpor	ymm3, ymm3, ymm4
+	vpor	ymm3, ymm5, ymm3
+	vpcmpeqb	ymm1, ymm0, ymmword ptr [rsp + 320] # 32-byte Folded Reload
+	vpand	ymm1, ymm14, ymm1
+	vpcmpeqb	ymm2, ymm0, ymmword ptr [rsp + 288] # 32-byte Folded Reload
+	vpsllw	ymm2, ymm2, 7
+	vpand	ymm2, ymm15, ymm2
+	vpor	ymm1, ymm1, ymm2
+	vpor	ymm1, ymm3, ymm1
+	vpunpcklbw	ymm2, ymm13, ymm12      # ymm2 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[4],ymm12[4],ymm13[5],ymm12[5],ymm13[6],ymm12[6],ymm13[7],ymm12[7],ymm13[16],ymm12[16],ymm13[17],ymm12[17],ymm13[18],ymm12[18],ymm13[19],ymm12[19],ymm13[20],ymm12[20],ymm13[21],ymm12[21],ymm13[22],ymm12[22],ymm13[23],ymm12[23]
+	vpunpckhbw	ymm0, ymm13, ymm12      # ymm0 = ymm13[8],ymm12[8],ymm13[9],ymm12[9],ymm13[10],ymm12[10],ymm13[11],ymm12[11],ymm13[12],ymm12[12],ymm13[13],ymm12[13],ymm13[14],ymm12[14],ymm13[15],ymm12[15],ymm13[24],ymm12[24],ymm13[25],ymm12[25],ymm13[26],ymm12[26],ymm13[27],ymm12[27],ymm13[28],ymm12[28],ymm13[29],ymm12[29],ymm13[30],ymm12[30],ymm13[31],ymm12[31]
+	vpunpcklbw	ymm3, ymm7, ymm1        # ymm3 = ymm7[0],ymm1[0],ymm7[1],ymm1[1],ymm7[2],ymm1[2],ymm7[3],ymm1[3],ymm7[4],ymm1[4],ymm7[5],ymm1[5],ymm7[6],ymm1[6],ymm7[7],ymm1[7],ymm7[16],ymm1[16],ymm7[17],ymm1[17],ymm7[18],ymm1[18],ymm7[19],ymm1[19],ymm7[20],ymm1[20],ymm7[21],ymm1[21],ymm7[22],ymm1[22],ymm7[23],ymm1[23]
+	vpunpckhbw	ymm1, ymm7, ymm1        # ymm1 = ymm7[8],ymm1[8],ymm7[9],ymm1[9],ymm7[10],ymm1[10],ymm7[11],ymm1[11],ymm7[12],ymm1[12],ymm7[13],ymm1[13],ymm7[14],ymm1[14],ymm7[15],ymm1[15],ymm7[24],ymm1[24],ymm7[25],ymm1[25],ymm7[26],ymm1[26],ymm7[27],ymm1[27],ymm7[28],ymm1[28],ymm7[29],ymm1[29],ymm7[30],ymm1[30],ymm7[31],ymm1[31]
+	vpunpcklwd	ymm4, ymm2, ymm3        # ymm4 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11]
+	vpunpckhwd	ymm2, ymm2, ymm3        # ymm2 = ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15]
+	vpunpcklwd	ymm3, ymm0, ymm1        # ymm3 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
+	vpunpckhwd	ymm0, ymm0, ymm1        # ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
+	vinserti128	ymm1, ymm4, xmm2, 1
+	vperm2i128	ymm2, ymm4, ymm2, 49    # ymm2 = ymm4[2,3],ymm2[2,3]
+	vinserti128	ymm4, ymm3, xmm0, 1
+	vperm2i128	ymm0, ymm3, ymm0, 49    # ymm0 = ymm3[2,3],ymm0[2,3]
+	mov	rcx, qword ptr [rsp + 408]      # 8-byte Reload
+	vmovdqu	ymmword ptr [r11 + 4*rcx + 96], ymm0
+	vmovdqu	ymmword ptr [r11 + 4*rcx + 64], ymm2
+	vmovdqu	ymmword ptr [r11 + 4*rcx + 32], ymm4
+	vmovdqu	ymmword ptr [r11 + 4*rcx], ymm1
+	add	rcx, 32
+	mov	rax, rcx
+	cmp	rcx, qword ptr [rsp + 384]      # 8-byte Folded Reload
+	jne	.LBB1_166
+# %bb.167:
+	mov	r13, qword ptr [rsp + 392]      # 8-byte Reload
+	cmp	r13, qword ptr [rsp + 384]      # 8-byte Folded Reload
+	mov	r15, qword ptr [rsp + 280]      # 8-byte Reload
+	mov	r14d, dword ptr [rsp + 28]      # 4-byte Reload
+	mov	r12, qword ptr [rsp + 400]      # 8-byte Reload
+	jne	.LBB1_36
+	jmp	.LBB1_109
+.LBB1_168:
+	and	r15, -32
+	mov	rax, r15
+	shl	rax, 5
+	add	rax, rsi
+	mov	qword ptr [rsp + 400], rax      # 8-byte Spill
+	mov	qword ptr [rsp + 384], r15      # 8-byte Spill
+	lea	rax, [r11 + 4*r15]
+	mov	qword ptr [rsp + 376], rax      # 8-byte Spill
+	vmovd	xmm0, r14d
+	vpbroadcastb	ymm0, xmm0
+	vmovdqa	ymmword ptr [rsp + 512], ymm0   # 32-byte Spill
+	xor	eax, eax
+	mov	qword ptr [rsp + 272], r11      # 8-byte Spill
+	.p2align	4, 0x90
+.LBB1_169:                              # =>This Inner Loop Header: Depth=1
+	mov	rbx, rax
+	mov	qword ptr [rsp + 408], rax      # 8-byte Spill
+	shl	rbx, 5
+	mov	rax, rbx
+	or	rax, 32
+	mov	qword ptr [rsp + 104], rax      # 8-byte Spill
+	mov	rax, rbx
+	or	rax, 64
+	mov	qword ptr [rsp + 152], rax      # 8-byte Spill
+	mov	rax, rbx
+	or	rax, 96
+	mov	qword ptr [rsp + 176], rax      # 8-byte Spill
+	mov	rax, rbx
+	or	rax, 128
+	mov	qword ptr [rsp + 120], rax      # 8-byte Spill
+	mov	rax, rbx
+	or	rax, 160
+	mov	qword ptr [rsp + 168], rax      # 8-byte Spill
+	mov	rax, rbx
+	or	rax, 192
+	mov	qword ptr [rsp + 232], rax      # 8-byte Spill
+	mov	rax, rbx
+	or	rax, 224
+	mov	qword ptr [rsp + 216], rax      # 8-byte Spill
+	mov	rax, rbx
+	or	rax, 256
+	mov	qword ptr [rsp + 56], rax       # 8-byte Spill
+	mov	rax, rbx
+	or	rax, 288
+	mov	qword ptr [rsp + 64], rax       # 8-byte Spill
+	mov	rax, rbx
+	or	rax, 320
+	mov	qword ptr [rsp + 40], rax       # 8-byte Spill
+	mov	rax, rbx
+	or	rax, 512
+	mov	rcx, rax
+	movzx	eax, byte ptr [rsi + rax]
+	vmovd	xmm0, eax
+	movzx	eax, byte ptr [rsi + rbx]
+	vmovd	xmm3, eax
+	movzx	eax, byte ptr [rsi + rcx + 1]
+	vmovd	xmm4, eax
+	movzx	eax, byte ptr [rsi + rbx + 1]
+	vmovd	xmm10, eax
+	movzx	eax, byte ptr [rsi + rcx + 2]
+	mov	rdx, rcx
+	vmovd	xmm1, eax
+	vmovdqa	xmmword ptr [rsp + 480], xmm1   # 16-byte Spill
+	mov	rcx, rbx
+	movzx	eax, byte ptr [rsi + rbx + 2]
+	vmovd	xmm1, eax
+	vmovdqa	xmmword ptr [rsp + 448], xmm1   # 16-byte Spill
+	movzx	eax, byte ptr [rsi + rdx + 3]
+	vmovd	xmm11, eax
+	movzx	eax, byte ptr [rsi + rbx + 3]
+	vmovd	xmm8, eax
+	movzx	eax, byte ptr [rsi + rdx + 4]
+	vmovd	xmm1, eax
+	vmovdqa	xmmword ptr [rsp + 416], xmm1   # 16-byte Spill
+	movzx	eax, byte ptr [rsi + rbx + 4]
+	vmovd	xmm13, eax
+	movzx	eax, byte ptr [rsi + rdx + 5]
+	vmovd	xmm14, eax
+	movzx	eax, byte ptr [rsi + rbx + 5]
+	vmovd	xmm6, eax
+	movzx	eax, byte ptr [rsi + rdx + 6]
+	mov	qword ptr [rsp + 248], rdx      # 8-byte Spill
+	vmovd	xmm12, eax
+	movzx	eax, byte ptr [rsi + rbx + 6]
+	vmovd	xmm7, eax
+	movzx	eax, byte ptr [rsi + rdx + 7]
+	vmovd	xmm2, eax
+	movzx	eax, byte ptr [rsi + rbx + 7]
+	vmovd	xmm1, eax
+	mov	rax, rbx
+	or	rax, 352
+	mov	qword ptr [rsp + 200], rax      # 8-byte Spill
+	mov	rax, rbx
+	or	rax, 384
+	mov	qword ptr [rsp + 320], rax      # 8-byte Spill
+	mov	rax, rbx
+	or	rax, 416
+	mov	qword ptr [rsp + 32], rax       # 8-byte Spill
+	mov	rax, rbx
+	or	rax, 448
+	mov	qword ptr [rsp + 96], rax       # 8-byte Spill
+	mov	rax, rbx
+	or	rax, 480
+	mov	qword ptr [rsp + 288], rax      # 8-byte Spill
+	mov	rax, rbx
+	or	rax, 544
+	mov	qword ptr [rsp + 136], rax      # 8-byte Spill
+	or	rbx, 576
+	mov	qword ptr [rsp + 256], rbx      # 8-byte Spill
+	mov	rax, rcx
+	or	rax, 608
+	mov	qword ptr [rsp + 48], rax       # 8-byte Spill
+	mov	r12, rcx
+	or	r12, 640
+	mov	qword ptr [rsp + 208], r12      # 8-byte Spill
+	mov	r14, rcx
+	or	r14, 672
+	mov	qword ptr [rsp + 144], r14      # 8-byte Spill
+	mov	rax, rcx
+	or	rax, 704
+	mov	qword ptr [rsp + 184], rax      # 8-byte Spill
+	mov	rdi, rcx
+	or	rdi, 736
+	mov	r9, rcx
+	or	r9, 768
+	mov	qword ptr [rsp + 224], r9       # 8-byte Spill
+	mov	r15, rcx
+	or	r15, 800
+	mov	qword ptr [rsp + 112], r15      # 8-byte Spill
+	mov	r11, rcx
+	or	r11, 832
+	mov	qword ptr [rsp + 192], r11      # 8-byte Spill
+	mov	r10, rcx
+	or	r10, 864
+	mov	qword ptr [rsp + 88], r10       # 8-byte Spill
+	mov	r8, rcx
+	or	r8, 896
+	mov	qword ptr [rsp + 128], r8       # 8-byte Spill
+	mov	rdx, rcx
+	or	rdx, 928
+	mov	qword ptr [rsp + 240], rdx      # 8-byte Spill
+	mov	rax, rcx
+	mov	qword ptr [rsp + 264], rcx      # 8-byte Spill
+	or	rax, 960
+	mov	qword ptr [rsp + 72], rax       # 8-byte Spill
+	or	rcx, 992
+	mov	qword ptr [rsp + 80], rcx       # 8-byte Spill
+	mov	r13, qword ptr [rsp + 136]      # 8-byte Reload
+	vpinsrb	xmm9, xmm0, byte ptr [rsi + r13], 1
+	vpinsrb	xmm0, xmm9, byte ptr [rsi + rbx], 2
+	mov	rbx, qword ptr [rsp + 48]       # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rbx], 3
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r12], 4
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r14], 5
+	mov	rbx, qword ptr [rsp + 184]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rbx], 6
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rdi], 7
+	mov	r13, rdi
+	mov	qword ptr [rsp + 160], rdi      # 8-byte Spill
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r9], 8
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r15], 9
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r11], 10
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r10], 11
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r8], 12
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rdx], 13
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax], 14
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rcx], 15
+	mov	r14, qword ptr [rsp + 104]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r14], 1
+	mov	r10, qword ptr [rsp + 152]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r10], 2
+	mov	r12, qword ptr [rsp + 176]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r12], 3
+	mov	r8, qword ptr [rsp + 120]       # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r8], 4
+	mov	r11, qword ptr [rsp + 168]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r11], 5
+	mov	r9, qword ptr [rsp + 232]       # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r9], 6
+	mov	r15, qword ptr [rsp + 216]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r15], 7
+	mov	rdi, qword ptr [rsp + 56]       # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rdi], 8
+	mov	rax, qword ptr [rsp + 64]       # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rax], 9
+	mov	rbx, qword ptr [rsp + 40]       # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rbx], 10
+	mov	rcx, qword ptr [rsp + 200]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rcx], 11
+	mov	rdx, qword ptr [rsp + 320]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rdx], 12
+	mov	rdx, qword ptr [rsp + 32]       # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rdx], 13
+	mov	rdx, qword ptr [rsp + 96]       # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rdx], 14
+	mov	rdx, qword ptr [rsp + 288]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rdx], 15
+	mov	rdx, qword ptr [rsp + 136]      # 8-byte Reload
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + rdx + 1], 1
+	mov	rdx, qword ptr [rsp + 256]      # 8-byte Reload
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + rdx + 1], 2
+	mov	rdx, qword ptr [rsp + 48]       # 8-byte Reload
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + rdx + 1], 3
+	mov	rdx, qword ptr [rsp + 208]      # 8-byte Reload
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + rdx + 1], 4
+	mov	rdx, qword ptr [rsp + 144]      # 8-byte Reload
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + rdx + 1], 5
+	mov	rdx, qword ptr [rsp + 184]      # 8-byte Reload
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + rdx + 1], 6
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + r13 + 1], 7
+	mov	r13, qword ptr [rsp + 224]      # 8-byte Reload
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + r13 + 1], 8
+	mov	r13, qword ptr [rsp + 112]      # 8-byte Reload
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + r13 + 1], 9
+	mov	rdx, qword ptr [rsp + 192]      # 8-byte Reload
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + rdx + 1], 10
+	mov	rdx, qword ptr [rsp + 88]       # 8-byte Reload
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + rdx + 1], 11
+	mov	rdx, qword ptr [rsp + 128]      # 8-byte Reload
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + rdx + 1], 12
+	mov	rdx, qword ptr [rsp + 240]      # 8-byte Reload
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + rdx + 1], 13
+	mov	rdx, qword ptr [rsp + 72]       # 8-byte Reload
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + rdx + 1], 14
+	mov	rdx, qword ptr [rsp + 80]       # 8-byte Reload
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + rdx + 1], 15
+	vpinsrb	xmm5, xmm10, byte ptr [rsi + r14 + 1], 1
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + r10 + 1], 2
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + r12 + 1], 3
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + r8 + 1], 4
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + r11 + 1], 5
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + r9 + 1], 6
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + r15 + 1], 7
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + rdi + 1], 8
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + rax + 1], 9
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + rbx + 1], 10
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + rcx + 1], 11
+	mov	rax, qword ptr [rsp + 320]      # 8-byte Reload
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + rax + 1], 12
+	mov	rax, qword ptr [rsp + 32]       # 8-byte Reload
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + rax + 1], 13
+	mov	rax, qword ptr [rsp + 96]       # 8-byte Reload
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + rax + 1], 14
+	vinserti128	ymm15, ymm3, xmm0, 1
+	mov	rax, qword ptr [rsp + 288]      # 8-byte Reload
+	vpinsrb	xmm0, xmm5, byte ptr [rsi + rax + 1], 15
+	mov	rax, qword ptr [rsp + 248]      # 8-byte Reload
+	movzx	edi, byte ptr [rsi + rax + 8]
+	vmovd	xmm9, edi
+	vinserti128	ymm0, ymm0, xmm4, 1
+	vmovdqa	ymmword ptr [rsp + 1216], ymm0  # 32-byte Spill
+	mov	rax, qword ptr [rsp + 264]      # 8-byte Reload
+	movzx	edi, byte ptr [rsi + rax + 8]
+	vmovd	xmm10, edi
+	mov	r8, qword ptr [rsp + 136]       # 8-byte Reload
+	vmovdqa	xmm0, xmmword ptr [rsp + 480]   # 16-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r8 + 2], 1
+	mov	rcx, qword ptr [rsp + 256]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rcx + 2], 2
+	mov	r10, qword ptr [rsp + 48]       # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r10 + 2], 3
+	mov	rax, qword ptr [rsp + 208]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 2], 4
+	mov	rax, qword ptr [rsp + 144]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 2], 5
+	mov	r9, qword ptr [rsp + 184]       # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r9 + 2], 6
+	mov	rdx, qword ptr [rsp + 160]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rdx + 2], 7
+	mov	rax, qword ptr [rsp + 224]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 2], 8
+	mov	r12, r13
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r13 + 2], 9
+	mov	r13, qword ptr [rsp + 192]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r13 + 2], 10
+	mov	r11, qword ptr [rsp + 88]       # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r11 + 2], 11
+	mov	r14, qword ptr [rsp + 128]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r14 + 2], 12
+	mov	r15, qword ptr [rsp + 240]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r15 + 2], 13
+	mov	rax, qword ptr [rsp + 72]       # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 2], 14
+	mov	rax, qword ptr [rsp + 80]       # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 2], 15
+	mov	rax, qword ptr [rsp + 104]      # 8-byte Reload
+	vmovdqa	xmm3, xmmword ptr [rsp + 448]   # 16-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rax + 2], 1
+	mov	rdi, qword ptr [rsp + 152]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rdi + 2], 2
+	mov	rdi, qword ptr [rsp + 176]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rdi + 2], 3
+	mov	rdi, qword ptr [rsp + 120]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rdi + 2], 4
+	mov	rdi, qword ptr [rsp + 168]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rdi + 2], 5
+	mov	rdi, qword ptr [rsp + 232]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rdi + 2], 6
+	mov	rdi, qword ptr [rsp + 216]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rdi + 2], 7
+	mov	rbx, qword ptr [rsp + 56]       # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rbx + 2], 8
+	mov	rbx, qword ptr [rsp + 64]       # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rbx + 2], 9
+	mov	rbx, qword ptr [rsp + 40]       # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rbx + 2], 10
+	mov	rbx, qword ptr [rsp + 200]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rbx + 2], 11
+	mov	rbx, qword ptr [rsp + 320]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rbx + 2], 12
+	mov	rbx, qword ptr [rsp + 32]       # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rbx + 2], 13
+	mov	rbx, qword ptr [rsp + 96]       # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rbx + 2], 14
+	mov	rbx, qword ptr [rsp + 288]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rbx + 2], 15
+	vpinsrb	xmm4, xmm11, byte ptr [rsi + r8 + 3], 1
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + rcx + 3], 2
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + r10 + 3], 3
+	mov	rbx, qword ptr [rsp + 208]      # 8-byte Reload
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + rbx + 3], 4
+	mov	rcx, qword ptr [rsp + 144]      # 8-byte Reload
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + rcx + 3], 5
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + r9 + 3], 6
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + rdx + 3], 7
+	mov	rdx, qword ptr [rsp + 224]      # 8-byte Reload
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + rdx + 3], 8
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + r12 + 3], 9
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + r13 + 3], 10
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + r11 + 3], 11
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + r14 + 3], 12
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + r15 + 3], 13
+	mov	r9, qword ptr [rsp + 72]        # 8-byte Reload
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + r9 + 3], 14
+	mov	r11, qword ptr [rsp + 80]       # 8-byte Reload
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + r11 + 3], 15
+	vpinsrb	xmm5, xmm8, byte ptr [rsi + rax + 3], 1
+	mov	rbx, qword ptr [rsp + 152]      # 8-byte Reload
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + rbx + 3], 2
+	mov	rax, qword ptr [rsp + 176]      # 8-byte Reload
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + rax + 3], 3
+	mov	rax, qword ptr [rsp + 120]      # 8-byte Reload
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + rax + 3], 4
+	mov	r10, qword ptr [rsp + 168]      # 8-byte Reload
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + r10 + 3], 5
+	mov	r14, qword ptr [rsp + 232]      # 8-byte Reload
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + r14 + 3], 6
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + rdi + 3], 7
+	mov	rax, qword ptr [rsp + 56]       # 8-byte Reload
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + rax + 3], 8
+	mov	r15, qword ptr [rsp + 64]       # 8-byte Reload
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + r15 + 3], 9
+	mov	rax, qword ptr [rsp + 40]       # 8-byte Reload
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + rax + 3], 10
+	mov	rax, qword ptr [rsp + 200]      # 8-byte Reload
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + rax + 3], 11
+	mov	rax, qword ptr [rsp + 320]      # 8-byte Reload
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + rax + 3], 12
+	mov	rax, qword ptr [rsp + 32]       # 8-byte Reload
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + rax + 3], 13
+	vinserti128	ymm0, ymm3, xmm0, 1
+	vmovdqa	ymmword ptr [rsp + 480], ymm0   # 32-byte Spill
+	mov	rax, qword ptr [rsp + 96]       # 8-byte Reload
+	vpinsrb	xmm0, xmm5, byte ptr [rsi + rax + 3], 14
+	mov	rax, qword ptr [rsp + 248]      # 8-byte Reload
+	movzx	edi, byte ptr [rsi + rax + 9]
+	vmovd	xmm8, edi
+	mov	r12, qword ptr [rsp + 288]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r12 + 3], 15
+	vinserti128	ymm0, ymm0, xmm4, 1
+	vmovdqa	ymmword ptr [rsp + 448], ymm0   # 32-byte Spill
+	mov	rax, qword ptr [rsp + 264]      # 8-byte Reload
+	movzx	edi, byte ptr [rsi + rax + 9]
+	vmovd	xmm11, edi
+	vmovdqa	xmm0, xmmword ptr [rsp + 416]   # 16-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r8 + 4], 1
+	mov	rax, qword ptr [rsp + 256]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 4], 2
+	mov	rax, qword ptr [rsp + 48]       # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 4], 3
+	mov	r13, qword ptr [rsp + 208]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r13 + 4], 4
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rcx + 4], 5
+	mov	rax, qword ptr [rsp + 184]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 4], 6
+	mov	rax, qword ptr [rsp + 160]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 4], 7
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rdx + 4], 8
+	mov	rax, qword ptr [rsp + 112]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 4], 9
+	mov	rax, qword ptr [rsp + 192]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 4], 10
+	mov	rax, qword ptr [rsp + 88]       # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 4], 11
+	mov	rax, qword ptr [rsp + 128]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 4], 12
+	mov	rax, qword ptr [rsp + 240]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 4], 13
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r9 + 4], 14
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r11 + 4], 15
+	mov	rax, qword ptr [rsp + 104]      # 8-byte Reload
+	vpinsrb	xmm3, xmm13, byte ptr [rsi + rax + 4], 1
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rbx + 4], 2
+	mov	r11, qword ptr [rsp + 176]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r11 + 4], 3
+	mov	rax, qword ptr [rsp + 120]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rax + 4], 4
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r10 + 4], 5
+	mov	rdi, r14
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r14 + 4], 6
+	mov	r10, qword ptr [rsp + 216]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r10 + 4], 7
+	mov	r9, qword ptr [rsp + 56]        # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r9 + 4], 8
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r15 + 4], 9
+	mov	rbx, qword ptr [rsp + 40]       # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rbx + 4], 10
+	mov	r14, qword ptr [rsp + 200]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r14 + 4], 11
+	mov	rbx, qword ptr [rsp + 320]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rbx + 4], 12
+	mov	rbx, qword ptr [rsp + 32]       # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rbx + 4], 13
+	mov	r15, qword ptr [rsp + 96]       # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r15 + 4], 14
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r12 + 4], 15
+	vpinsrb	xmm4, xmm14, byte ptr [rsi + r8 + 5], 1
+	mov	r15, qword ptr [rsp + 256]      # 8-byte Reload
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + r15 + 5], 2
+	mov	rbx, qword ptr [rsp + 48]       # 8-byte Reload
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + rbx + 5], 3
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + r13 + 5], 4
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + rcx + 5], 5
+	mov	r13, qword ptr [rsp + 184]      # 8-byte Reload
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + r13 + 5], 6
+	mov	rcx, qword ptr [rsp + 160]      # 8-byte Reload
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + rcx + 5], 7
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + rdx + 5], 8
+	mov	rcx, qword ptr [rsp + 112]      # 8-byte Reload
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + rcx + 5], 9
+	mov	rcx, qword ptr [rsp + 192]      # 8-byte Reload
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + rcx + 5], 10
+	mov	rdx, qword ptr [rsp + 88]       # 8-byte Reload
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + rdx + 5], 11
+	mov	rcx, qword ptr [rsp + 128]      # 8-byte Reload
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + rcx + 5], 12
+	mov	r8, qword ptr [rsp + 240]       # 8-byte Reload
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + r8 + 5], 13
+	mov	rcx, qword ptr [rsp + 72]       # 8-byte Reload
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + rcx + 5], 14
+	mov	rcx, qword ptr [rsp + 80]       # 8-byte Reload
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + rcx + 5], 15
+	mov	r12, qword ptr [rsp + 104]      # 8-byte Reload
+	vpinsrb	xmm5, xmm6, byte ptr [rsi + r12 + 5], 1
+	mov	rdx, qword ptr [rsp + 152]      # 8-byte Reload
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + rdx + 5], 2
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + r11 + 5], 3
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + rax + 5], 4
+	mov	rax, qword ptr [rsp + 168]      # 8-byte Reload
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + rax + 5], 5
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + rdi + 5], 6
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + r10 + 5], 7
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + r9 + 5], 8
+	mov	r9, qword ptr [rsp + 64]        # 8-byte Reload
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + r9 + 5], 9
+	mov	rax, qword ptr [rsp + 40]       # 8-byte Reload
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + rax + 5], 10
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + r14 + 5], 11
+	mov	rax, qword ptr [rsp + 320]      # 8-byte Reload
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + rax + 5], 12
+	mov	rax, qword ptr [rsp + 32]       # 8-byte Reload
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + rax + 5], 13
+	mov	rax, qword ptr [rsp + 96]       # 8-byte Reload
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + rax + 5], 14
+	vinserti128	ymm14, ymm3, xmm0, 1
+	mov	rax, qword ptr [rsp + 288]      # 8-byte Reload
+	vpinsrb	xmm0, xmm5, byte ptr [rsi + rax + 5], 15
+	mov	rax, qword ptr [rsp + 248]      # 8-byte Reload
+	movzx	edi, byte ptr [rsi + rax + 10]
+	vmovd	xmm3, edi
+	vinserti128	ymm0, ymm0, xmm4, 1
+	vmovdqa	ymmword ptr [rsp + 416], ymm0   # 32-byte Spill
+	mov	rax, qword ptr [rsp + 264]      # 8-byte Reload
+	movzx	edi, byte ptr [rsi + rax + 10]
+	vmovd	xmm4, edi
+	mov	r11, qword ptr [rsp + 136]      # 8-byte Reload
+	vpinsrb	xmm0, xmm12, byte ptr [rsi + r11 + 6], 1
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r15 + 6], 2
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rbx + 6], 3
+	mov	rax, qword ptr [rsp + 208]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 6], 4
+	mov	rax, qword ptr [rsp + 144]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 6], 5
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r13 + 6], 6
+	mov	rdx, qword ptr [rsp + 160]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rdx + 6], 7
+	mov	rax, qword ptr [rsp + 224]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 6], 8
+	mov	rax, qword ptr [rsp + 112]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 6], 9
+	mov	r14, qword ptr [rsp + 192]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r14 + 6], 10
+	mov	r10, qword ptr [rsp + 88]       # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r10 + 6], 11
+	mov	rax, qword ptr [rsp + 128]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 6], 12
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r8 + 6], 13
+	mov	rdi, qword ptr [rsp + 72]       # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rdi + 6], 14
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rcx + 6], 15
+	vpinsrb	xmm5, xmm7, byte ptr [rsi + r12 + 6], 1
+	mov	rcx, qword ptr [rsp + 152]      # 8-byte Reload
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + rcx + 6], 2
+	mov	rcx, qword ptr [rsp + 176]      # 8-byte Reload
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + rcx + 6], 3
+	mov	rcx, qword ptr [rsp + 120]      # 8-byte Reload
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + rcx + 6], 4
+	mov	rdi, qword ptr [rsp + 168]      # 8-byte Reload
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + rdi + 6], 5
+	mov	rbx, qword ptr [rsp + 232]      # 8-byte Reload
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + rbx + 6], 6
+	mov	rcx, qword ptr [rsp + 216]      # 8-byte Reload
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + rcx + 6], 7
+	mov	r12, qword ptr [rsp + 56]       # 8-byte Reload
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + r12 + 6], 8
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + r9 + 6], 9
+	mov	rcx, qword ptr [rsp + 40]       # 8-byte Reload
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + rcx + 6], 10
+	mov	r9, qword ptr [rsp + 200]       # 8-byte Reload
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + r9 + 6], 11
+	mov	r8, qword ptr [rsp + 320]       # 8-byte Reload
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + r8 + 6], 12
+	mov	r13, qword ptr [rsp + 32]       # 8-byte Reload
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + r13 + 6], 13
+	mov	rcx, qword ptr [rsp + 96]       # 8-byte Reload
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + rcx + 6], 14
+	mov	rcx, qword ptr [rsp + 288]      # 8-byte Reload
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + rcx + 6], 15
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r11 + 7], 1
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r15 + 7], 2
+	mov	rcx, qword ptr [rsp + 48]       # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rcx + 7], 3
+	mov	r11, qword ptr [rsp + 208]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r11 + 7], 4
+	mov	rcx, qword ptr [rsp + 144]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rcx + 7], 5
+	mov	rcx, qword ptr [rsp + 184]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rcx + 7], 6
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rdx + 7], 7
+	mov	rdx, qword ptr [rsp + 224]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rdx + 7], 8
+	mov	rcx, qword ptr [rsp + 112]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rcx + 7], 9
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r14 + 7], 10
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r10 + 7], 11
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rax + 7], 12
+	mov	rax, qword ptr [rsp + 240]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rax + 7], 13
+	mov	r15, qword ptr [rsp + 72]       # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r15 + 7], 14
+	mov	rcx, qword ptr [rsp + 80]       # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rcx + 7], 15
+	mov	rcx, qword ptr [rsp + 104]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rcx + 7], 1
+	mov	rcx, qword ptr [rsp + 152]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rcx + 7], 2
+	mov	rcx, qword ptr [rsp + 176]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rcx + 7], 3
+	mov	rdx, qword ptr [rsp + 120]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rdx + 7], 4
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rdi + 7], 5
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rbx + 7], 6
+	mov	r13, qword ptr [rsp + 216]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r13 + 7], 7
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r12 + 7], 8
+	mov	rdx, qword ptr [rsp + 64]       # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rdx + 7], 9
+	mov	rcx, qword ptr [rsp + 40]       # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rcx + 7], 10
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r9 + 7], 11
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r8 + 7], 12
+	mov	rcx, qword ptr [rsp + 32]       # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rcx + 7], 13
+	vinserti128	ymm0, ymm5, xmm0, 1
+	vmovdqa	ymmword ptr [rsp + 1184], ymm0  # 32-byte Spill
+	mov	rcx, qword ptr [rsp + 96]       # 8-byte Reload
+	vpinsrb	xmm0, xmm1, byte ptr [rsi + rcx + 7], 14
+	mov	rcx, qword ptr [rsp + 248]      # 8-byte Reload
+	movzx	edi, byte ptr [rsi + rcx + 11]
+	vmovd	xmm1, edi
+	mov	rcx, qword ptr [rsp + 288]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rcx + 7], 15
+	vinserti128	ymm0, ymm0, xmm2, 1
+	vmovdqa	ymmword ptr [rsp + 1152], ymm0  # 32-byte Spill
+	mov	rcx, qword ptr [rsp + 264]      # 8-byte Reload
+	movzx	edi, byte ptr [rsi + rcx + 11]
+	vmovd	xmm2, edi
+	mov	rcx, qword ptr [rsp + 136]      # 8-byte Reload
+	vpinsrb	xmm0, xmm9, byte ptr [rsi + rcx + 8], 1
+	mov	r8, qword ptr [rsp + 256]       # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r8 + 8], 2
+	mov	rcx, qword ptr [rsp + 48]       # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rcx + 8], 3
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r11 + 8], 4
+	mov	rcx, qword ptr [rsp + 144]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rcx + 8], 5
+	mov	rcx, qword ptr [rsp + 184]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rcx + 8], 6
+	mov	rdx, qword ptr [rsp + 160]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rdx + 8], 7
+	mov	r14, qword ptr [rsp + 224]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r14 + 8], 8
+	mov	r10, qword ptr [rsp + 112]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r10 + 8], 9
+	mov	rbx, qword ptr [rsp + 192]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rbx + 8], 10
+	mov	rdx, qword ptr [rsp + 88]       # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rdx + 8], 11
+	mov	r12, qword ptr [rsp + 128]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r12 + 8], 12
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 8], 13
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r15 + 8], 14
+	mov	rax, qword ptr [rsp + 80]       # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 8], 15
+	mov	rax, qword ptr [rsp + 104]      # 8-byte Reload
+	vpinsrb	xmm5, xmm10, byte ptr [rsi + rax + 8], 1
+	mov	r9, qword ptr [rsp + 152]       # 8-byte Reload
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + r9 + 8], 2
+	mov	rax, qword ptr [rsp + 176]      # 8-byte Reload
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + rax + 8], 3
+	mov	rdi, qword ptr [rsp + 120]      # 8-byte Reload
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + rdi + 8], 4
+	mov	rax, qword ptr [rsp + 168]      # 8-byte Reload
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + rax + 8], 5
+	mov	r15, qword ptr [rsp + 232]      # 8-byte Reload
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + r15 + 8], 6
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + r13 + 8], 7
+	mov	rax, qword ptr [rsp + 56]       # 8-byte Reload
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + rax + 8], 8
+	mov	rax, qword ptr [rsp + 64]       # 8-byte Reload
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + rax + 8], 9
+	mov	rax, qword ptr [rsp + 40]       # 8-byte Reload
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + rax + 8], 10
+	mov	rax, qword ptr [rsp + 200]      # 8-byte Reload
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + rax + 8], 11
+	mov	rdx, qword ptr [rsp + 320]      # 8-byte Reload
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + rdx + 8], 12
+	mov	rdx, qword ptr [rsp + 32]       # 8-byte Reload
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + rdx + 8], 13
+	mov	rdx, qword ptr [rsp + 96]       # 8-byte Reload
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + rdx + 8], 14
+	mov	rdx, qword ptr [rsp + 288]      # 8-byte Reload
+	vpinsrb	xmm5, xmm5, byte ptr [rsi + rdx + 8], 15
+	mov	r13, qword ptr [rsp + 136]      # 8-byte Reload
+	vpinsrb	xmm6, xmm8, byte ptr [rsi + r13 + 9], 1
+	vpinsrb	xmm6, xmm6, byte ptr [rsi + r8 + 9], 2
+	mov	rdx, qword ptr [rsp + 48]       # 8-byte Reload
+	vpinsrb	xmm6, xmm6, byte ptr [rsi + rdx + 9], 3
+	vpinsrb	xmm6, xmm6, byte ptr [rsi + r11 + 9], 4
+	mov	rdx, qword ptr [rsp + 144]      # 8-byte Reload
+	vpinsrb	xmm6, xmm6, byte ptr [rsi + rdx + 9], 5
+	vpinsrb	xmm6, xmm6, byte ptr [rsi + rcx + 9], 6
+	mov	rcx, qword ptr [rsp + 160]      # 8-byte Reload
+	vpinsrb	xmm6, xmm6, byte ptr [rsi + rcx + 9], 7
+	vpinsrb	xmm6, xmm6, byte ptr [rsi + r14 + 9], 8
+	vpinsrb	xmm6, xmm6, byte ptr [rsi + r10 + 9], 9
+	vpinsrb	xmm6, xmm6, byte ptr [rsi + rbx + 9], 10
+	mov	rcx, qword ptr [rsp + 88]       # 8-byte Reload
+	vpinsrb	xmm6, xmm6, byte ptr [rsi + rcx + 9], 11
+	vpinsrb	xmm6, xmm6, byte ptr [rsi + r12 + 9], 12
+	mov	rcx, qword ptr [rsp + 240]      # 8-byte Reload
+	vpinsrb	xmm6, xmm6, byte ptr [rsi + rcx + 9], 13
+	mov	rcx, qword ptr [rsp + 72]       # 8-byte Reload
+	vpinsrb	xmm6, xmm6, byte ptr [rsi + rcx + 9], 14
+	mov	r13, qword ptr [rsp + 80]       # 8-byte Reload
+	vpinsrb	xmm6, xmm6, byte ptr [rsi + r13 + 9], 15
+	mov	rcx, qword ptr [rsp + 104]      # 8-byte Reload
+	vpinsrb	xmm7, xmm11, byte ptr [rsi + rcx + 9], 1
+	vpinsrb	xmm7, xmm7, byte ptr [rsi + r9 + 9], 2
+	mov	rcx, qword ptr [rsp + 176]      # 8-byte Reload
+	vpinsrb	xmm7, xmm7, byte ptr [rsi + rcx + 9], 3
+	vpinsrb	xmm7, xmm7, byte ptr [rsi + rdi + 9], 4
+	mov	r11, qword ptr [rsp + 168]      # 8-byte Reload
+	vpinsrb	xmm7, xmm7, byte ptr [rsi + r11 + 9], 5
+	vpinsrb	xmm7, xmm7, byte ptr [rsi + r15 + 9], 6
+	mov	rbx, qword ptr [rsp + 216]      # 8-byte Reload
+	vpinsrb	xmm7, xmm7, byte ptr [rsi + rbx + 9], 7
+	mov	rcx, qword ptr [rsp + 56]       # 8-byte Reload
+	vpinsrb	xmm7, xmm7, byte ptr [rsi + rcx + 9], 8
+	mov	rcx, qword ptr [rsp + 64]       # 8-byte Reload
+	vpinsrb	xmm7, xmm7, byte ptr [rsi + rcx + 9], 9
+	mov	rcx, qword ptr [rsp + 40]       # 8-byte Reload
+	vpinsrb	xmm7, xmm7, byte ptr [rsi + rcx + 9], 10
+	vpinsrb	xmm7, xmm7, byte ptr [rsi + rax + 9], 11
+	mov	rax, qword ptr [rsp + 320]      # 8-byte Reload
+	vpinsrb	xmm7, xmm7, byte ptr [rsi + rax + 9], 12
+	mov	rax, qword ptr [rsp + 32]       # 8-byte Reload
+	vpinsrb	xmm7, xmm7, byte ptr [rsi + rax + 9], 13
+	mov	rax, qword ptr [rsp + 96]       # 8-byte Reload
+	vpinsrb	xmm7, xmm7, byte ptr [rsi + rax + 9], 14
+	vinserti128	ymm0, ymm5, xmm0, 1
+	vmovdqa	ymmword ptr [rsp + 1120], ymm0  # 32-byte Spill
+	mov	rax, qword ptr [rsp + 288]      # 8-byte Reload
+	vpinsrb	xmm5, xmm7, byte ptr [rsi + rax + 9], 15
+	mov	rax, qword ptr [rsp + 248]      # 8-byte Reload
+	movzx	edi, byte ptr [rsi + rax + 12]
+	vmovd	xmm0, edi
+	vinserti128	ymm5, ymm5, xmm6, 1
+	vmovdqa	ymmword ptr [rsp + 1088], ymm5  # 32-byte Spill
+	mov	rax, qword ptr [rsp + 264]      # 8-byte Reload
+	movzx	edi, byte ptr [rsi + rax + 12]
+	vmovd	xmm5, edi
+	mov	rdx, qword ptr [rsp + 136]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rdx + 10], 1
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r8 + 10], 2
+	mov	rcx, qword ptr [rsp + 48]       # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rcx + 10], 3
+	mov	rax, qword ptr [rsp + 208]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rax + 10], 4
+	mov	r12, qword ptr [rsp + 144]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r12 + 10], 5
+	mov	rax, qword ptr [rsp + 184]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rax + 10], 6
+	mov	r9, qword ptr [rsp + 160]       # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r9 + 10], 7
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r14 + 10], 8
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r10 + 10], 9
+	mov	rax, qword ptr [rsp + 192]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rax + 10], 10
+	mov	rax, qword ptr [rsp + 88]       # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rax + 10], 11
+	mov	rax, qword ptr [rsp + 128]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rax + 10], 12
+	mov	r10, qword ptr [rsp + 240]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r10 + 10], 13
+	mov	r15, qword ptr [rsp + 72]       # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r15 + 10], 14
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r13 + 10], 15
+	mov	rax, qword ptr [rsp + 104]      # 8-byte Reload
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + rax + 10], 1
+	mov	rax, qword ptr [rsp + 152]      # 8-byte Reload
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + rax + 10], 2
+	mov	rdi, qword ptr [rsp + 176]      # 8-byte Reload
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + rdi + 10], 3
+	mov	rax, qword ptr [rsp + 120]      # 8-byte Reload
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + rax + 10], 4
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + r11 + 10], 5
+	mov	r11, qword ptr [rsp + 232]      # 8-byte Reload
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + r11 + 10], 6
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + rbx + 10], 7
+	mov	rax, qword ptr [rsp + 56]       # 8-byte Reload
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + rax + 10], 8
+	mov	rax, qword ptr [rsp + 64]       # 8-byte Reload
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + rax + 10], 9
+	mov	rbx, qword ptr [rsp + 40]       # 8-byte Reload
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + rbx + 10], 10
+	mov	rbx, qword ptr [rsp + 200]      # 8-byte Reload
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + rbx + 10], 11
+	mov	rbx, qword ptr [rsp + 320]      # 8-byte Reload
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + rbx + 10], 12
+	mov	rbx, qword ptr [rsp + 32]       # 8-byte Reload
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + rbx + 10], 13
+	mov	rbx, qword ptr [rsp + 96]       # 8-byte Reload
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + rbx + 10], 14
+	mov	r13, qword ptr [rsp + 288]      # 8-byte Reload
+	vpinsrb	xmm4, xmm4, byte ptr [rsi + r13 + 10], 15
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rdx + 11], 1
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r8 + 11], 2
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rcx + 11], 3
+	mov	rcx, qword ptr [rsp + 208]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rcx + 11], 4
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r12 + 11], 5
+	mov	r8, qword ptr [rsp + 184]       # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r8 + 11], 6
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r9 + 11], 7
+	mov	r12, r9
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r14 + 11], 8
+	mov	rdx, qword ptr [rsp + 112]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rdx + 11], 9
+	mov	rdx, qword ptr [rsp + 192]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rdx + 11], 10
+	mov	rdx, qword ptr [rsp + 88]       # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rdx + 11], 11
+	mov	rdx, qword ptr [rsp + 128]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rdx + 11], 12
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r10 + 11], 13
+	mov	r13, r10
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r15 + 11], 14
+	mov	rdx, qword ptr [rsp + 80]       # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rdx + 11], 15
+	mov	rdx, qword ptr [rsp + 104]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rdx + 11], 1
+	mov	r14, qword ptr [rsp + 152]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r14 + 11], 2
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rdi + 11], 3
+	mov	rdi, qword ptr [rsp + 120]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rdi + 11], 4
+	mov	rdi, qword ptr [rsp + 168]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rdi + 11], 5
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r11 + 11], 6
+	mov	r9, qword ptr [rsp + 216]       # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r9 + 11], 7
+	mov	rdi, qword ptr [rsp + 56]       # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rdi + 11], 8
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rax + 11], 9
+	mov	rax, qword ptr [rsp + 40]       # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rax + 11], 10
+	mov	rax, qword ptr [rsp + 200]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rax + 11], 11
+	mov	rax, qword ptr [rsp + 320]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rax + 11], 12
+	mov	rax, qword ptr [rsp + 32]       # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rax + 11], 13
+	vinserti128	ymm3, ymm4, xmm3, 1
+	vmovdqa	ymmword ptr [rsp + 1056], ymm3  # 32-byte Spill
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rbx + 11], 14
+	mov	rax, qword ptr [rsp + 248]      # 8-byte Reload
+	movzx	edi, byte ptr [rsi + rax + 13]
+	vmovd	xmm3, edi
+	mov	rax, qword ptr [rsp + 288]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rax + 11], 15
+	vinserti128	ymm1, ymm2, xmm1, 1
+	vmovdqa	ymmword ptr [rsp + 1024], ymm1  # 32-byte Spill
+	mov	rax, qword ptr [rsp + 264]      # 8-byte Reload
+	movzx	edi, byte ptr [rsi + rax + 13]
+	vmovd	xmm1, edi
+	mov	rax, qword ptr [rsp + 136]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 12], 1
+	mov	rax, qword ptr [rsp + 256]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 12], 2
+	mov	r15, qword ptr [rsp + 48]       # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r15 + 12], 3
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rcx + 12], 4
+	mov	r10, qword ptr [rsp + 144]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r10 + 12], 5
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r8 + 12], 6
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r12 + 12], 7
+	mov	r12, qword ptr [rsp + 224]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r12 + 12], 8
+	mov	rax, qword ptr [rsp + 112]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 12], 9
+	mov	rax, qword ptr [rsp + 192]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 12], 10
+	mov	rax, qword ptr [rsp + 88]       # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 12], 11
+	mov	rcx, qword ptr [rsp + 128]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rcx + 12], 12
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r13 + 12], 13
+	mov	r13, qword ptr [rsp + 72]       # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r13 + 12], 14
+	mov	rax, qword ptr [rsp + 80]       # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 12], 15
+	vpinsrb	xmm2, xmm5, byte ptr [rsi + rdx + 12], 1
+	mov	rdi, r14
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r14 + 12], 2
+	mov	r11, qword ptr [rsp + 176]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r11 + 12], 3
+	mov	rdx, qword ptr [rsp + 120]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rdx + 12], 4
+	mov	r14, qword ptr [rsp + 168]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r14 + 12], 5
+	mov	rax, qword ptr [rsp + 232]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rax + 12], 6
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r9 + 12], 7
+	mov	rax, qword ptr [rsp + 56]       # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rax + 12], 8
+	mov	rbx, qword ptr [rsp + 64]       # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rbx + 12], 9
+	mov	rbx, qword ptr [rsp + 40]       # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rbx + 12], 10
+	mov	rbx, qword ptr [rsp + 200]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rbx + 12], 11
+	mov	rbx, qword ptr [rsp + 320]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rbx + 12], 12
+	mov	r9, qword ptr [rsp + 32]        # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r9 + 12], 13
+	mov	r8, qword ptr [rsp + 96]        # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r8 + 12], 14
+	mov	rbx, qword ptr [rsp + 288]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rbx + 12], 15
+	mov	rbx, qword ptr [rsp + 136]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rbx + 13], 1
+	mov	rbx, qword ptr [rsp + 256]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rbx + 13], 2
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r15 + 13], 3
+	mov	rbx, qword ptr [rsp + 208]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rbx + 13], 4
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r10 + 13], 5
+	mov	r15, qword ptr [rsp + 184]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r15 + 13], 6
+	mov	rbx, qword ptr [rsp + 160]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rbx + 13], 7
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r12 + 13], 8
+	mov	r12, qword ptr [rsp + 112]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r12 + 13], 9
+	mov	rbx, qword ptr [rsp + 192]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rbx + 13], 10
+	mov	rbx, qword ptr [rsp + 88]       # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rbx + 13], 11
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rcx + 13], 12
+	mov	r10, qword ptr [rsp + 240]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r10 + 13], 13
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r13 + 13], 14
+	mov	rcx, qword ptr [rsp + 80]       # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rcx + 13], 15
+	mov	rcx, qword ptr [rsp + 104]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rcx + 13], 1
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rdi + 13], 2
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r11 + 13], 3
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rdx + 13], 4
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r14 + 13], 5
+	mov	rcx, qword ptr [rsp + 232]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rcx + 13], 6
+	mov	rcx, qword ptr [rsp + 216]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rcx + 13], 7
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rax + 13], 8
+	mov	rax, qword ptr [rsp + 64]       # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rax + 13], 9
+	mov	rax, qword ptr [rsp + 40]       # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rax + 13], 10
+	mov	rax, qword ptr [rsp + 200]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rax + 13], 11
+	mov	rax, qword ptr [rsp + 320]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rax + 13], 12
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r9 + 13], 13
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r8 + 13], 14
+	vinserti128	ymm0, ymm2, xmm0, 1
+	vmovdqa	ymmword ptr [rsp + 992], ymm0   # 32-byte Spill
+	mov	rax, qword ptr [rsp + 288]      # 8-byte Reload
+	vpinsrb	xmm0, xmm1, byte ptr [rsi + rax + 13], 15
+	mov	r13, qword ptr [rsp + 248]      # 8-byte Reload
+	movzx	edi, byte ptr [rsi + r13 + 14]
+	vmovd	xmm1, edi
+	vinserti128	ymm0, ymm0, xmm3, 1
+	vmovdqa	ymmword ptr [rsp + 960], ymm0   # 32-byte Spill
+	mov	rax, qword ptr [rsp + 264]      # 8-byte Reload
+	movzx	edi, byte ptr [rsi + rax + 14]
+	vmovd	xmm0, edi
+	mov	rax, qword ptr [rsp + 136]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rax + 14], 1
+	mov	r11, qword ptr [rsp + 256]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r11 + 14], 2
+	mov	rcx, qword ptr [rsp + 48]       # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rcx + 14], 3
+	mov	rbx, qword ptr [rsp + 208]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rbx + 14], 4
+	mov	r8, qword ptr [rsp + 144]       # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r8 + 14], 5
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r15 + 14], 6
+	mov	rcx, qword ptr [rsp + 160]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rcx + 14], 7
+	mov	rcx, qword ptr [rsp + 224]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rcx + 14], 8
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r12 + 14], 9
+	mov	rcx, qword ptr [rsp + 192]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rcx + 14], 10
+	mov	rcx, qword ptr [rsp + 88]       # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rcx + 14], 11
+	mov	rdi, qword ptr [rsp + 128]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rdi + 14], 12
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r10 + 14], 13
+	mov	rdx, qword ptr [rsp + 72]       # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rdx + 14], 14
+	mov	r9, qword ptr [rsp + 80]        # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r9 + 14], 15
+	mov	rdx, qword ptr [rsp + 104]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rdx + 14], 1
+	mov	rdi, qword ptr [rsp + 152]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rdi + 14], 2
+	mov	r12, qword ptr [rsp + 176]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r12 + 14], 3
+	mov	rdi, qword ptr [rsp + 120]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rdi + 14], 4
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r14 + 14], 5
+	mov	rdi, qword ptr [rsp + 232]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rdi + 14], 6
+	mov	r10, qword ptr [rsp + 216]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r10 + 14], 7
+	mov	rdi, qword ptr [rsp + 56]       # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rdi + 14], 8
+	mov	r14, qword ptr [rsp + 64]       # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r14 + 14], 9
+	mov	rdi, qword ptr [rsp + 40]       # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rdi + 14], 10
+	mov	rdi, qword ptr [rsp + 200]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rdi + 14], 11
+	mov	rdi, qword ptr [rsp + 320]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rdi + 14], 12
+	mov	rdi, qword ptr [rsp + 32]       # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rdi + 14], 13
+	mov	rdi, qword ptr [rsp + 96]       # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rdi + 14], 14
+	mov	rdi, qword ptr [rsp + 288]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rdi + 14], 15
+	movzx	edi, byte ptr [rsi + r13 + 15]
+	vmovd	xmm2, edi
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rax + 15], 1
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r11 + 15], 2
+	mov	rax, qword ptr [rsp + 48]       # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rax + 15], 3
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rbx + 15], 4
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r8 + 15], 5
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r15 + 15], 6
+	mov	r15, qword ptr [rsp + 160]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r15 + 15], 7
+	mov	r8, qword ptr [rsp + 224]       # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r8 + 15], 8
+	mov	rax, qword ptr [rsp + 112]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rax + 15], 9
+	mov	rbx, qword ptr [rsp + 192]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rbx + 15], 10
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rcx + 15], 11
+	mov	rax, qword ptr [rsp + 128]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rax + 15], 12
+	mov	rax, qword ptr [rsp + 240]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rax + 15], 13
+	mov	r13, qword ptr [rsp + 72]       # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r13 + 15], 14
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r9 + 15], 15
+	mov	rax, qword ptr [rsp + 264]      # 8-byte Reload
+	movzx	edi, byte ptr [rsi + rax + 15]
+	vmovd	xmm3, edi
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rdx + 15], 1
+	mov	rax, qword ptr [rsp + 152]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rax + 15], 2
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r12 + 15], 3
+	mov	rax, qword ptr [rsp + 120]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rax + 15], 4
+	mov	rdx, qword ptr [rsp + 168]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rdx + 15], 5
+	mov	rax, qword ptr [rsp + 232]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rax + 15], 6
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r10 + 15], 7
+	mov	rax, qword ptr [rsp + 56]       # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rax + 15], 8
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r14 + 15], 9
+	mov	rax, qword ptr [rsp + 40]       # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rax + 15], 10
+	mov	r14, qword ptr [rsp + 200]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r14 + 15], 11
+	mov	rax, qword ptr [rsp + 320]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rax + 15], 12
+	mov	rcx, qword ptr [rsp + 32]       # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rcx + 15], 13
+	mov	r11, qword ptr [rsp + 96]       # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r11 + 15], 14
+	mov	r12, qword ptr [rsp + 288]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r12 + 15], 15
+	vinserti128	ymm0, ymm0, xmm1, 1
+	vmovdqa	ymmword ptr [rsp + 896], ymm0   # 32-byte Spill
+	vinserti128	ymm0, ymm3, xmm2, 1
+	vmovdqa	ymmword ptr [rsp + 928], ymm0   # 32-byte Spill
+	mov	r10, qword ptr [rsp + 248]      # 8-byte Reload
+	movzx	edi, byte ptr [rsi + r10 + 16]
+	vmovd	xmm0, edi
+	mov	rcx, qword ptr [rsp + 136]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rcx + 16], 1
+	mov	rcx, qword ptr [rsp + 256]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rcx + 16], 2
+	mov	rcx, qword ptr [rsp + 48]       # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rcx + 16], 3
+	mov	r9, qword ptr [rsp + 208]       # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r9 + 16], 4
+	mov	rcx, qword ptr [rsp + 144]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rcx + 16], 5
+	mov	rcx, qword ptr [rsp + 184]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rcx + 16], 6
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r15 + 16], 7
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r8 + 16], 8
+	mov	rcx, qword ptr [rsp + 112]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rcx + 16], 9
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rbx + 16], 10
+	mov	rdi, qword ptr [rsp + 88]       # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rdi + 16], 11
+	mov	rdi, qword ptr [rsp + 128]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rdi + 16], 12
+	mov	r8, qword ptr [rsp + 240]       # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r8 + 16], 13
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r13 + 16], 14
+	mov	rdi, qword ptr [rsp + 80]       # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rdi + 16], 15
+	mov	rbx, qword ptr [rsp + 264]      # 8-byte Reload
+	movzx	edi, byte ptr [rsi + rbx + 16]
+	vmovd	xmm1, edi
+	mov	rdi, qword ptr [rsp + 104]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rdi + 16], 1
+	mov	rdi, qword ptr [rsp + 152]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rdi + 16], 2
+	mov	rdi, qword ptr [rsp + 176]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rdi + 16], 3
+	mov	r13, qword ptr [rsp + 120]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r13 + 16], 4
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rdx + 16], 5
+	mov	rdx, qword ptr [rsp + 232]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rdx + 16], 6
+	mov	rdx, qword ptr [rsp + 216]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rdx + 16], 7
+	mov	rdx, qword ptr [rsp + 56]       # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rdx + 16], 8
+	mov	rdx, qword ptr [rsp + 64]       # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rdx + 16], 9
+	mov	rdx, qword ptr [rsp + 40]       # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rdx + 16], 10
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r14 + 16], 11
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rax + 16], 12
+	mov	r15, qword ptr [rsp + 32]       # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r15 + 16], 13
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r11 + 16], 14
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r12 + 16], 15
+	movzx	edi, byte ptr [rsi + r10 + 17]
+	vmovd	xmm2, edi
+	mov	rax, qword ptr [rsp + 136]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rax + 17], 1
+	mov	rdx, qword ptr [rsp + 256]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rdx + 17], 2
+	mov	r10, qword ptr [rsp + 48]       # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r10 + 17], 3
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r9 + 17], 4
+	mov	rax, qword ptr [rsp + 144]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rax + 17], 5
+	mov	r11, qword ptr [rsp + 184]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r11 + 17], 6
+	mov	rax, qword ptr [rsp + 160]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rax + 17], 7
+	mov	r14, qword ptr [rsp + 224]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r14 + 17], 8
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rcx + 17], 9
+	mov	rax, qword ptr [rsp + 192]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rax + 17], 10
+	mov	rax, qword ptr [rsp + 88]       # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rax + 17], 11
+	mov	r12, qword ptr [rsp + 128]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r12 + 17], 12
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r8 + 17], 13
+	mov	rcx, qword ptr [rsp + 72]       # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rcx + 17], 14
+	mov	rcx, qword ptr [rsp + 80]       # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rcx + 17], 15
+	movzx	edi, byte ptr [rsi + rbx + 17]
+	vmovd	xmm3, edi
+	mov	rcx, qword ptr [rsp + 104]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rcx + 17], 1
+	mov	rcx, qword ptr [rsp + 152]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rcx + 17], 2
+	mov	r8, qword ptr [rsp + 176]       # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r8 + 17], 3
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r13 + 17], 4
+	mov	rdi, qword ptr [rsp + 168]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rdi + 17], 5
+	mov	rdi, qword ptr [rsp + 232]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rdi + 17], 6
+	mov	rdi, qword ptr [rsp + 216]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rdi + 17], 7
+	mov	rdi, qword ptr [rsp + 56]       # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rdi + 17], 8
+	mov	rdi, qword ptr [rsp + 64]       # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rdi + 17], 9
+	mov	r9, qword ptr [rsp + 40]        # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r9 + 17], 10
+	mov	rdi, qword ptr [rsp + 200]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rdi + 17], 11
+	mov	rdi, qword ptr [rsp + 320]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rdi + 17], 12
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r15 + 17], 13
+	mov	rdi, qword ptr [rsp + 96]       # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rdi + 17], 14
+	vinserti128	ymm0, ymm1, xmm0, 1
+	vmovdqa	ymmword ptr [rsp + 864], ymm0   # 32-byte Spill
+	mov	rdi, qword ptr [rsp + 288]      # 8-byte Reload
+	vpinsrb	xmm0, xmm3, byte ptr [rsi + rdi + 17], 15
+	vinserti128	ymm0, ymm0, xmm2, 1
+	vmovdqa	ymmword ptr [rsp + 832], ymm0   # 32-byte Spill
+	mov	rdi, qword ptr [rsp + 248]      # 8-byte Reload
+	movzx	edi, byte ptr [rsi + rdi + 18]
+	vmovd	xmm0, edi
+	mov	rdi, qword ptr [rsp + 136]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rdi + 18], 1
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rdx + 18], 2
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r10 + 18], 3
+	mov	r15, qword ptr [rsp + 208]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r15 + 18], 4
+	mov	rdi, qword ptr [rsp + 144]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rdi + 18], 5
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r11 + 18], 6
+	mov	rdx, qword ptr [rsp + 160]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rdx + 18], 7
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r14 + 18], 8
+	mov	rdx, qword ptr [rsp + 112]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rdx + 18], 9
+	mov	rdx, qword ptr [rsp + 192]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rdx + 18], 10
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 18], 11
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r12 + 18], 12
+	mov	rax, qword ptr [rsp + 240]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 18], 13
+	mov	rax, qword ptr [rsp + 72]       # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 18], 14
+	mov	rax, qword ptr [rsp + 80]       # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 18], 15
+	movzx	edi, byte ptr [rsi + rbx + 18]
+	vmovd	xmm1, edi
+	mov	r13, qword ptr [rsp + 104]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r13 + 18], 1
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rcx + 18], 2
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r8 + 18], 3
+	mov	r10, qword ptr [rsp + 120]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r10 + 18], 4
+	mov	r14, qword ptr [rsp + 168]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r14 + 18], 5
+	mov	rax, qword ptr [rsp + 232]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rax + 18], 6
+	mov	r11, qword ptr [rsp + 216]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r11 + 18], 7
+	mov	rcx, qword ptr [rsp + 56]       # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rcx + 18], 8
+	mov	rax, qword ptr [rsp + 64]       # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rax + 18], 9
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r9 + 18], 10
+	mov	rdi, qword ptr [rsp + 200]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rdi + 18], 11
+	mov	rdi, qword ptr [rsp + 320]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rdi + 18], 12
+	mov	rdi, qword ptr [rsp + 32]       # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rdi + 18], 13
+	mov	rdi, qword ptr [rsp + 96]       # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rdi + 18], 14
+	mov	rdi, qword ptr [rsp + 288]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rdi + 18], 15
+	mov	rdi, qword ptr [rsp + 248]      # 8-byte Reload
+	movzx	edi, byte ptr [rsi + rdi + 19]
+	vmovd	xmm2, edi
+	mov	rdi, qword ptr [rsp + 136]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rdi + 19], 1
+	mov	r12, qword ptr [rsp + 256]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r12 + 19], 2
+	mov	rdi, qword ptr [rsp + 48]       # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rdi + 19], 3
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r15 + 19], 4
+	mov	rdi, qword ptr [rsp + 144]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rdi + 19], 5
+	mov	rdi, qword ptr [rsp + 184]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rdi + 19], 6
+	mov	rdi, qword ptr [rsp + 160]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rdi + 19], 7
+	mov	rdi, qword ptr [rsp + 224]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rdi + 19], 8
+	mov	rdi, qword ptr [rsp + 112]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rdi + 19], 9
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rdx + 19], 10
+	mov	rdi, qword ptr [rsp + 88]       # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rdi + 19], 11
+	mov	rdi, qword ptr [rsp + 128]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rdi + 19], 12
+	mov	rdi, qword ptr [rsp + 240]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rdi + 19], 13
+	mov	r9, qword ptr [rsp + 72]        # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r9 + 19], 14
+	mov	r8, qword ptr [rsp + 80]        # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r8 + 19], 15
+	movzx	edi, byte ptr [rsi + rbx + 19]
+	vmovd	xmm3, edi
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r13 + 19], 1
+	mov	rdx, qword ptr [rsp + 152]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rdx + 19], 2
+	mov	rdx, qword ptr [rsp + 176]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rdx + 19], 3
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r10 + 19], 4
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r14 + 19], 5
+	mov	rbx, qword ptr [rsp + 232]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rbx + 19], 6
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r11 + 19], 7
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rcx + 19], 8
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rax + 19], 9
+	mov	r11, qword ptr [rsp + 40]       # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r11 + 19], 10
+	mov	rax, qword ptr [rsp + 200]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rax + 19], 11
+	mov	r13, qword ptr [rsp + 320]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r13 + 19], 12
+	mov	rax, qword ptr [rsp + 32]       # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rax + 19], 13
+	mov	rax, qword ptr [rsp + 96]       # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rax + 19], 14
+	mov	r14, qword ptr [rsp + 288]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r14 + 19], 15
+	vinserti128	ymm0, ymm1, xmm0, 1
+	vmovdqa	ymmword ptr [rsp + 768], ymm0   # 32-byte Spill
+	vinserti128	ymm0, ymm3, xmm2, 1
+	vmovdqa	ymmword ptr [rsp + 800], ymm0   # 32-byte Spill
+	mov	r15, qword ptr [rsp + 248]      # 8-byte Reload
+	movzx	edi, byte ptr [rsi + r15 + 20]
+	vmovd	xmm0, edi
+	mov	rcx, qword ptr [rsp + 136]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rcx + 20], 1
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r12 + 20], 2
+	mov	rax, qword ptr [rsp + 48]       # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 20], 3
+	mov	r12, qword ptr [rsp + 208]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r12 + 20], 4
+	mov	r10, qword ptr [rsp + 144]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r10 + 20], 5
+	mov	rax, qword ptr [rsp + 184]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 20], 6
+	mov	rax, qword ptr [rsp + 160]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 20], 7
+	mov	rax, qword ptr [rsp + 224]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 20], 8
+	mov	rax, qword ptr [rsp + 112]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 20], 9
+	mov	rax, qword ptr [rsp + 192]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 20], 10
+	mov	rax, qword ptr [rsp + 88]       # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 20], 11
+	mov	rax, qword ptr [rsp + 128]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 20], 12
+	mov	rax, qword ptr [rsp + 240]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 20], 13
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r9 + 20], 14
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r8 + 20], 15
+	mov	rax, qword ptr [rsp + 264]      # 8-byte Reload
+	movzx	edi, byte ptr [rsi + rax + 20]
+	vmovd	xmm1, edi
+	mov	rax, qword ptr [rsp + 104]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rax + 20], 1
+	mov	r8, qword ptr [rsp + 152]       # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r8 + 20], 2
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rdx + 20], 3
+	mov	rax, qword ptr [rsp + 120]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rax + 20], 4
+	mov	rdx, qword ptr [rsp + 168]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rdx + 20], 5
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rbx + 20], 6
+	mov	rdx, qword ptr [rsp + 216]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rdx + 20], 7
+	mov	rdx, qword ptr [rsp + 56]       # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rdx + 20], 8
+	mov	rdx, qword ptr [rsp + 64]       # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rdx + 20], 9
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r11 + 20], 10
+	mov	r11, qword ptr [rsp + 200]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r11 + 20], 11
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r13 + 20], 12
+	mov	r13, qword ptr [rsp + 32]       # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r13 + 20], 13
+	mov	r9, qword ptr [rsp + 96]        # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r9 + 20], 14
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r14 + 20], 15
+	movzx	edi, byte ptr [rsi + r15 + 21]
+	vmovd	xmm2, edi
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rcx + 21], 1
+	mov	rcx, qword ptr [rsp + 256]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rcx + 21], 2
+	mov	rcx, qword ptr [rsp + 48]       # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rcx + 21], 3
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r12 + 21], 4
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r10 + 21], 5
+	mov	rdx, qword ptr [rsp + 184]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rdx + 21], 6
+	mov	r12, qword ptr [rsp + 160]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r12 + 21], 7
+	mov	rcx, qword ptr [rsp + 224]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rcx + 21], 8
+	mov	r10, qword ptr [rsp + 112]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r10 + 21], 9
+	mov	rdi, qword ptr [rsp + 192]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rdi + 21], 10
+	mov	r14, qword ptr [rsp + 88]       # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r14 + 21], 11
+	mov	rdi, qword ptr [rsp + 128]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rdi + 21], 12
+	mov	rbx, qword ptr [rsp + 240]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rbx + 21], 13
+	mov	rdi, qword ptr [rsp + 72]       # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rdi + 21], 14
+	mov	rdi, qword ptr [rsp + 80]       # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rdi + 21], 15
+	mov	rdi, qword ptr [rsp + 264]      # 8-byte Reload
+	movzx	edi, byte ptr [rsi + rdi + 21]
+	vmovd	xmm3, edi
+	mov	rdi, qword ptr [rsp + 104]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rdi + 21], 1
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r8 + 21], 2
+	mov	rdi, qword ptr [rsp + 176]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rdi + 21], 3
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rax + 21], 4
+	mov	rax, qword ptr [rsp + 168]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rax + 21], 5
+	mov	r8, qword ptr [rsp + 232]       # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r8 + 21], 6
+	mov	r15, qword ptr [rsp + 216]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r15 + 21], 7
+	mov	rax, qword ptr [rsp + 56]       # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rax + 21], 8
+	mov	rax, qword ptr [rsp + 64]       # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rax + 21], 9
+	mov	rax, qword ptr [rsp + 40]       # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rax + 21], 10
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r11 + 21], 11
+	mov	rax, qword ptr [rsp + 320]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rax + 21], 12
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r13 + 21], 13
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r9 + 21], 14
+	vinserti128	ymm0, ymm1, xmm0, 1
+	vmovdqa	ymmword ptr [rsp + 704], ymm0   # 32-byte Spill
+	mov	rax, qword ptr [rsp + 288]      # 8-byte Reload
+	vpinsrb	xmm0, xmm3, byte ptr [rsi + rax + 21], 15
+	vinserti128	ymm0, ymm0, xmm2, 1
+	vmovdqa	ymmword ptr [rsp + 736], ymm0   # 32-byte Spill
+	mov	rax, qword ptr [rsp + 248]      # 8-byte Reload
+	movzx	edi, byte ptr [rsi + rax + 22]
+	vmovd	xmm0, edi
+	mov	rdi, qword ptr [rsp + 136]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rdi + 22], 1
+	mov	rdi, qword ptr [rsp + 256]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rdi + 22], 2
+	mov	rdi, qword ptr [rsp + 48]       # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rdi + 22], 3
+	mov	rdi, qword ptr [rsp + 208]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rdi + 22], 4
+	mov	r13, qword ptr [rsp + 144]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r13 + 22], 5
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rdx + 22], 6
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r12 + 22], 7
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rcx + 22], 8
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r10 + 22], 9
+	mov	r12, qword ptr [rsp + 192]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r12 + 22], 10
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r14 + 22], 11
+	mov	r11, qword ptr [rsp + 128]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r11 + 22], 12
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rbx + 22], 13
+	mov	rcx, qword ptr [rsp + 72]       # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rcx + 22], 14
+	mov	rcx, qword ptr [rsp + 80]       # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rcx + 22], 15
+	mov	rbx, qword ptr [rsp + 264]      # 8-byte Reload
+	movzx	edi, byte ptr [rsi + rbx + 22]
+	vmovd	xmm1, edi
+	mov	r10, qword ptr [rsp + 104]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r10 + 22], 1
+	mov	rcx, qword ptr [rsp + 152]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rcx + 22], 2
+	mov	rdx, qword ptr [rsp + 176]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rdx + 22], 3
+	mov	rdx, qword ptr [rsp + 120]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rdx + 22], 4
+	mov	rdx, qword ptr [rsp + 168]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rdx + 22], 5
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r8 + 22], 6
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r15 + 22], 7
+	mov	rdx, qword ptr [rsp + 56]       # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rdx + 22], 8
+	mov	r14, qword ptr [rsp + 64]       # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r14 + 22], 9
+	mov	rdx, qword ptr [rsp + 40]       # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rdx + 22], 10
+	mov	r9, qword ptr [rsp + 200]       # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r9 + 22], 11
+	mov	rdx, qword ptr [rsp + 320]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rdx + 22], 12
+	mov	rdx, qword ptr [rsp + 32]       # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rdx + 22], 13
+	mov	r15, qword ptr [rsp + 96]       # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r15 + 22], 14
+	mov	rdx, qword ptr [rsp + 288]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rdx + 22], 15
+	movzx	edi, byte ptr [rsi + rax + 23]
+	vmovd	xmm2, edi
+	mov	rax, qword ptr [rsp + 136]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rax + 23], 1
+	mov	rax, qword ptr [rsp + 256]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rax + 23], 2
+	mov	rdx, qword ptr [rsp + 48]       # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rdx + 23], 3
+	mov	rax, qword ptr [rsp + 208]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rax + 23], 4
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r13 + 23], 5
+	mov	rdi, qword ptr [rsp + 184]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rdi + 23], 6
+	mov	rdi, qword ptr [rsp + 160]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rdi + 23], 7
+	mov	rdi, qword ptr [rsp + 224]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rdi + 23], 8
+	mov	rdi, qword ptr [rsp + 112]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rdi + 23], 9
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r12 + 23], 10
+	mov	rdi, qword ptr [rsp + 88]       # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rdi + 23], 11
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r11 + 23], 12
+	mov	rdi, qword ptr [rsp + 240]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rdi + 23], 13
+	mov	rdi, qword ptr [rsp + 72]       # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rdi + 23], 14
+	mov	r13, qword ptr [rsp + 80]       # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r13 + 23], 15
+	movzx	edi, byte ptr [rsi + rbx + 23]
+	vmovd	xmm3, edi
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r10 + 23], 1
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rcx + 23], 2
+	mov	rcx, qword ptr [rsp + 176]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rcx + 23], 3
+	mov	rbx, qword ptr [rsp + 120]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rbx + 23], 4
+	mov	r10, qword ptr [rsp + 168]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r10 + 23], 5
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r8 + 23], 6
+	mov	rdi, qword ptr [rsp + 216]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rdi + 23], 7
+	mov	r8, qword ptr [rsp + 56]        # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r8 + 23], 8
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r14 + 23], 9
+	mov	r12, qword ptr [rsp + 40]       # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r12 + 23], 10
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r9 + 23], 11
+	mov	r11, qword ptr [rsp + 320]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r11 + 23], 12
+	mov	r14, qword ptr [rsp + 32]       # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r14 + 23], 13
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r15 + 23], 14
+	mov	r9, qword ptr [rsp + 288]       # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r9 + 23], 15
+	vinserti128	ymm10, ymm1, xmm0, 1
+	vinserti128	ymm0, ymm3, xmm2, 1
+	vmovdqa	ymmword ptr [rsp + 672], ymm0   # 32-byte Spill
+	mov	rdi, qword ptr [rsp + 248]      # 8-byte Reload
+	movzx	edi, byte ptr [rsi + rdi + 24]
+	vmovd	xmm0, edi
+	mov	rdi, qword ptr [rsp + 136]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rdi + 24], 1
+	mov	r15, qword ptr [rsp + 256]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r15 + 24], 2
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rdx + 24], 3
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 24], 4
+	mov	rax, qword ptr [rsp + 144]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 24], 5
+	mov	rax, qword ptr [rsp + 184]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 24], 6
+	mov	rdx, qword ptr [rsp + 160]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rdx + 24], 7
+	mov	rax, qword ptr [rsp + 224]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 24], 8
+	mov	rax, qword ptr [rsp + 112]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 24], 9
+	mov	rax, qword ptr [rsp + 192]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 24], 10
+	mov	rax, qword ptr [rsp + 88]       # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 24], 11
+	mov	rax, qword ptr [rsp + 128]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 24], 12
+	mov	rax, qword ptr [rsp + 240]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 24], 13
+	mov	rax, qword ptr [rsp + 72]       # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 24], 14
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r13 + 24], 15
+	mov	rax, qword ptr [rsp + 264]      # 8-byte Reload
+	movzx	edi, byte ptr [rsi + rax + 24]
+	vmovd	xmm1, edi
+	mov	rax, qword ptr [rsp + 104]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rax + 24], 1
+	mov	rax, qword ptr [rsp + 152]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rax + 24], 2
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rcx + 24], 3
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rbx + 24], 4
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r10 + 24], 5
+	mov	r10, qword ptr [rsp + 232]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r10 + 24], 6
+	mov	rax, qword ptr [rsp + 216]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rax + 24], 7
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r8 + 24], 8
+	mov	r13, qword ptr [rsp + 64]       # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r13 + 24], 9
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r12 + 24], 10
+	mov	r12, qword ptr [rsp + 200]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r12 + 24], 11
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r11 + 24], 12
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r14 + 24], 13
+	mov	rax, qword ptr [rsp + 96]       # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rax + 24], 14
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r9 + 24], 15
+	mov	r8, qword ptr [rsp + 248]       # 8-byte Reload
+	movzx	edi, byte ptr [rsi + r8 + 25]
+	vmovd	xmm2, edi
+	mov	rcx, qword ptr [rsp + 136]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rcx + 25], 1
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r15 + 25], 2
+	mov	rcx, qword ptr [rsp + 48]       # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rcx + 25], 3
+	mov	r15, qword ptr [rsp + 208]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r15 + 25], 4
+	mov	rcx, qword ptr [rsp + 144]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rcx + 25], 5
+	mov	rcx, qword ptr [rsp + 184]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rcx + 25], 6
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rdx + 25], 7
+	mov	r9, qword ptr [rsp + 224]       # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r9 + 25], 8
+	mov	rdx, qword ptr [rsp + 112]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rdx + 25], 9
+	mov	r11, qword ptr [rsp + 192]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r11 + 25], 10
+	mov	rbx, qword ptr [rsp + 88]       # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rbx + 25], 11
+	mov	rdx, qword ptr [rsp + 128]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rdx + 25], 12
+	mov	r14, qword ptr [rsp + 240]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r14 + 25], 13
+	mov	rdx, qword ptr [rsp + 72]       # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rdx + 25], 14
+	mov	rdx, qword ptr [rsp + 80]       # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rdx + 25], 15
+	mov	rdx, qword ptr [rsp + 264]      # 8-byte Reload
+	movzx	edi, byte ptr [rsi + rdx + 25]
+	vmovd	xmm3, edi
+	mov	rdx, qword ptr [rsp + 104]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rdx + 25], 1
+	mov	rdi, qword ptr [rsp + 152]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rdi + 25], 2
+	mov	rdi, qword ptr [rsp + 176]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rdi + 25], 3
+	mov	rdi, qword ptr [rsp + 120]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rdi + 25], 4
+	mov	rdi, qword ptr [rsp + 168]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rdi + 25], 5
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r10 + 25], 6
+	mov	rdi, qword ptr [rsp + 216]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rdi + 25], 7
+	mov	rdi, qword ptr [rsp + 56]       # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rdi + 25], 8
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r13 + 25], 9
+	mov	r13, qword ptr [rsp + 40]       # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r13 + 25], 10
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r12 + 25], 11
+	mov	rdi, qword ptr [rsp + 320]      # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rdi + 25], 12
+	mov	r10, qword ptr [rsp + 32]       # 8-byte Reload
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + r10 + 25], 13
+	vpinsrb	xmm3, xmm3, byte ptr [rsi + rax + 25], 14
+	vinserti128	ymm9, ymm1, xmm0, 1
+	mov	rdi, qword ptr [rsp + 288]      # 8-byte Reload
+	vpinsrb	xmm0, xmm3, byte ptr [rsi + rdi + 25], 15
+	vinserti128	ymm8, ymm0, xmm2, 1
+	movzx	edi, byte ptr [rsi + r8 + 26]
+	vmovd	xmm0, edi
+	mov	r8, qword ptr [rsp + 136]       # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r8 + 26], 1
+	mov	rax, qword ptr [rsp + 256]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 26], 2
+	mov	rax, qword ptr [rsp + 48]       # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 26], 3
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r15 + 26], 4
+	mov	rax, qword ptr [rsp + 144]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 26], 5
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rcx + 26], 6
+	mov	rax, qword ptr [rsp + 160]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 26], 7
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r9 + 26], 8
+	mov	rax, qword ptr [rsp + 112]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 26], 9
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r11 + 26], 10
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rbx + 26], 11
+	mov	rbx, qword ptr [rsp + 128]      # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rbx + 26], 12
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + r14 + 26], 13
+	mov	rcx, r14
+	mov	rax, qword ptr [rsp + 72]       # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 26], 14
+	mov	rax, qword ptr [rsp + 80]       # 8-byte Reload
+	vpinsrb	xmm0, xmm0, byte ptr [rsi + rax + 26], 15
+	mov	r12, qword ptr [rsp + 264]      # 8-byte Reload
+	movzx	edi, byte ptr [rsi + r12 + 26]
+	vmovd	xmm1, edi
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rdx + 26], 1
+	mov	r14, qword ptr [rsp + 152]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r14 + 26], 2
+	mov	r15, qword ptr [rsp + 176]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r15 + 26], 3
+	mov	rax, qword ptr [rsp + 120]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rax + 26], 4
+	mov	rax, qword ptr [rsp + 168]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rax + 26], 5
+	mov	rax, qword ptr [rsp + 232]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rax + 26], 6
+	mov	rax, qword ptr [rsp + 216]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rax + 26], 7
+	mov	rax, qword ptr [rsp + 56]       # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rax + 26], 8
+	mov	rax, qword ptr [rsp + 64]       # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rax + 26], 9
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r13 + 26], 10
+	mov	r11, qword ptr [rsp + 200]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r11 + 26], 11
+	mov	rax, qword ptr [rsp + 320]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rax + 26], 12
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + r10 + 26], 13
+	mov	rax, qword ptr [rsp + 96]       # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rax + 26], 14
+	mov	rdx, qword ptr [rsp + 288]      # 8-byte Reload
+	vpinsrb	xmm1, xmm1, byte ptr [rsi + rdx + 26], 15
+	mov	rdx, qword ptr [rsp + 248]      # 8-byte Reload
+	movzx	edi, byte ptr [rsi + rdx + 27]
+	vmovd	xmm2, edi
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r8 + 27], 1
+	mov	r8, qword ptr [rsp + 256]       # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r8 + 27], 2
+	mov	rdx, qword ptr [rsp + 48]       # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rdx + 27], 3
+	mov	rdi, qword ptr [rsp + 208]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rdi + 27], 4
+	mov	r9, qword ptr [rsp + 144]       # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r9 + 27], 5
+	mov	r13, qword ptr [rsp + 184]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + r13 + 27], 6
+	mov	rdi, qword ptr [rsp + 160]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rdi + 27], 7
+	mov	rdi, qword ptr [rsp + 224]      # 8-byte Reload
+	vpinsrb	xmm2, xmm2, byte ptr [rsi + rdi + 27], 8
... 247016 lines suppressed ...