You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@orc.apache.org by "stiga-huang (via GitHub)" <gi...@apache.org> on 2023/04/10 14:06:19 UTC

[GitHub] [orc] stiga-huang commented on a diff in pull request #1375: ORC-1356: [C++] Use Intel AVX-512 instructions to accelerate the Rle-bit-packing decode

stiga-huang commented on code in PR #1375:
URL: https://github.com/apache/orc/pull/1375#discussion_r1161720614


##########
c++/src/BpackingAvx512.cc:
##########
@@ -0,0 +1,2724 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "BpackingAvx512.hh"
+#include "BitUnpackerAvx512.hh"
+#include "CpuInfoUtil.hh"
+#include "RLEv2.hh"
+
+namespace orc {
+  UnpackAvx512::UnpackAvx512(RleDecoderV2* dec) : decoder(dec), unpackDefault(UnpackDefault(dec)) {
+    // PASS
+  }
+
+  UnpackAvx512::~UnpackAvx512() {
+    // PASS
+  }
+
+  inline void UnpackAvx512::alignHeaderBoundary(uint64_t& startBit, uint64_t& bufMoveByteLen,
+                                                uint64_t& bufRestByteLen, uint64_t& len,
+                                                uint32_t& bitWidth, uint64_t& tailBitLen,
+                                                uint32_t& backupByteLen, uint64_t& numElements,
+                                                bool& resetBuf, const uint8_t*& srcPtr,
+                                                int64_t*& dstPtr, uint32_t bitMaxSize) {

Review Comment:
   Some suggestions on this long parameter list:
   - Several parameters describe a length. Rename `len` to something more meaningful, e.g. `remainingNumElements`.
   - `bitWidth` is effectively a const argument. Pass it by value as `uint32_t bitWidth`, or as `const uint32_t bitWidth`, instead of by non-const reference, so that it cannot be modified unintentionally.
   - Put input parameters (`bitWidth`, `bitMaxSize`) before output parameters, as recommended by the Google C++ Style Guide:
   https://google.github.io/styleguide/cppguide.html#Inputs_and_Outputs
   
   



##########
c++/src/BpackingAvx512.cc:
##########
@@ -0,0 +1,2724 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "BpackingAvx512.hh"
+#include "BitUnpackerAvx512.hh"
+#include "CpuInfoUtil.hh"
+#include "RLEv2.hh"
+
+namespace orc {
+  UnpackAvx512::UnpackAvx512(RleDecoderV2* dec) : decoder(dec), unpackDefault(UnpackDefault(dec)) {
+    // PASS
+  }
+
+  UnpackAvx512::~UnpackAvx512() {
+    // PASS
+  }
+
+  inline void UnpackAvx512::alignHeaderBoundary(uint64_t& startBit, uint64_t& bufMoveByteLen,
+                                                uint64_t& bufRestByteLen, uint64_t& len,
+                                                uint32_t& bitWidth, uint64_t& tailBitLen,
+                                                uint32_t& backupByteLen, uint64_t& numElements,
+                                                bool& resetBuf, const uint8_t*& srcPtr,
+                                                int64_t*& dstPtr, uint32_t bitMaxSize) {
+    if (startBit != 0) {
+      bufMoveByteLen +=
+          moveLen(len * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH);
+    } else {
+      bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH);
+    }
+
+    if (bufMoveByteLen <= bufRestByteLen) {
+      numElements = len;
+      resetBuf = false;
+      len -= numElements;
+    } else {
+      if (startBit != 0) {
+        numElements =
+            (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / bitWidth;
+        len -= numElements;
+        tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit,
+                          bitWidth);
+        resetBuf = true;
+      } else {
+        numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth;
+        len -= numElements;
+        tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth);
+        resetBuf = true;
+      }

Review Comment:
   These two branches are nearly identical. We can simplify them to:
   ```cpp
         uint64_t leadingBits = 0;
         if (startBit != 0) leadingBits = ORC_VECTOR_BYTE_WIDTH - startBit;
         uint64_t bufRestBitLen = bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + leadingBits;
         numElements = bufRestBitLen / bitWidth;
         len -= numElements;
         tailBitLen = fmod(bufRestBitLen, bitWidth);
         resetBuf = true;
   ```



##########
c++/src/BpackingAvx512.cc:
##########
@@ -0,0 +1,2724 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "BpackingAvx512.hh"
+#include "BitUnpackerAvx512.hh"
+#include "CpuInfoUtil.hh"
+#include "RLEv2.hh"
+
+namespace orc {
+  UnpackAvx512::UnpackAvx512(RleDecoderV2* dec) : decoder(dec), unpackDefault(UnpackDefault(dec)) {
+    // PASS
+  }
+
+  UnpackAvx512::~UnpackAvx512() {
+    // PASS
+  }
+
+  inline void UnpackAvx512::alignHeaderBoundary(uint64_t& startBit, uint64_t& bufMoveByteLen,
+                                                uint64_t& bufRestByteLen, uint64_t& len,
+                                                uint32_t& bitWidth, uint64_t& tailBitLen,
+                                                uint32_t& backupByteLen, uint64_t& numElements,
+                                                bool& resetBuf, const uint8_t*& srcPtr,
+                                                int64_t*& dstPtr, uint32_t bitMaxSize) {
+    if (startBit != 0) {
+      bufMoveByteLen +=
+          moveLen(len * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH);
+    } else {
+      bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH);
+    }
+
+    if (bufMoveByteLen <= bufRestByteLen) {
+      numElements = len;
+      resetBuf = false;
+      len -= numElements;

Review Comment:
   Since `numElements` was just assigned `len`, `len -= numElements` always yields 0, so `len` can be set to 0 directly.



##########
c++/src/BitUnpackerAvx512.hh:
##########
@@ -0,0 +1,488 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ORC_BIT_UNPACKER_AVX512_HH
+#define ORC_BIT_UNPACKER_AVX512_HH
+
+// Mingw-w64 defines strcasecmp in string.h
+#if defined(_WIN32) && !defined(strcasecmp)
+#include <string.h>
+#define strcasecmp stricmp
+#else
+#include <strings.h>
+#endif
+
+#include <immintrin.h>
+#include <cstdint>
+#include <vector>
+
+namespace orc {
+#define ORC_VECTOR_BITS_2_BYTE(x) \
+  (((x) + 7u) >> 3u) /**< Convert a number of bits to a number of bytes */
+#define ORC_VECTOR_ONE_64U (1ULL)
+#define ORC_VECTOR_MAX_16U 0xFFFF     /**< Max value for uint16_t */
+#define ORC_VECTOR_MAX_32U 0xFFFFFFFF /**< Max value for uint32_t */
+#define ORC_VECTOR_BYTE_WIDTH 8u      /**< Byte width in bits */
+#define ORC_VECTOR_WORD_WIDTH 16u     /**< Word width in bits */
+#define ORC_VECTOR_DWORD_WIDTH 32u    /**< Dword width in bits */
+#define ORC_VECTOR_QWORD_WIDTH 64u    /**< Qword width in bits */
+#define ORC_VECTOR_BIT_MASK(x) \
+  ((ORC_VECTOR_ONE_64U << (x)) - 1u) /**< Bit mask below bit position */
+
+#define ORC_VECTOR_BITS_2_WORD(x) \
+  (((x) + 15u) >> 4u) /**< Convert a number of bits to a number of words */
+#define ORC_VECTOR_BITS_2_DWORD(x) \
+  (((x) + 31u) >> 5u) /**< Convert a number of bits to a number of double words */
+
+  // ------------------------------------ 3u -----------------------------------------
+  static uint8_t shuffleIdxTable3u_0[64] = {
+      1u, 0u, 1u, 0u, 2u, 1u, 3u, 2u, 4u, 3u, 4u, 3u, 5u, 4u, 6u, 5u, 1u, 0u, 1u, 0u, 2u, 1u,
+      3u, 2u, 4u, 3u, 4u, 3u, 5u, 4u, 6u, 5u, 1u, 0u, 1u, 0u, 2u, 1u, 3u, 2u, 4u, 3u, 4u, 3u,
+      5u, 4u, 6u, 5u, 1u, 0u, 1u, 0u, 2u, 1u, 3u, 2u, 4u, 3u, 4u, 3u, 5u, 4u, 6u, 5u};
+  static uint8_t shuffleIdxTable3u_1[64] = {
+      0u, 0u, 1u, 0u, 2u, 1u, 3u, 2u, 3u, 2u, 4u, 3u, 5u, 4u, 6u, 5u, 0u, 0u, 1u, 0u, 2u, 1u,
+      3u, 2u, 3u, 2u, 4u, 3u, 5u, 4u, 6u, 5u, 0u, 0u, 1u, 0u, 2u, 1u, 3u, 2u, 3u, 2u, 4u, 3u,
+      5u, 4u, 6u, 5u, 0u, 0u, 1u, 0u, 2u, 1u, 3u, 2u, 3u, 2u, 4u, 3u, 5u, 4u, 6u, 5u};
+  static uint16_t shiftTable3u_0[32] = {13u, 7u,  9u,  11u, 13u, 7u,  9u,  11u, 13u, 7u,  9u,
+                                        11u, 13u, 7u,  9u,  11u, 13u, 7u,  9u,  11u, 13u, 7u,
+                                        9u,  11u, 13u, 7u,  9u,  11u, 13u, 7u,  9u,  11u};
+  static uint16_t shiftTable3u_1[32] = {6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u,
+                                        0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u,
+                                        2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u};
+  static uint16_t permutexIdxTable3u[32] = {0u,  1u,  2u,  0x0, 0x0, 0x0, 0x0, 0x0, 3u,  4u,  5u,
+                                            0x0, 0x0, 0x0, 0x0, 0x0, 6u,  7u,  8u,  0x0, 0x0, 0x0,
+                                            0x0, 0x0, 9u,  10u, 11u, 0x0, 0x0, 0x0, 0x0, 0x0};
+
+  // ------------------------------------ 5u -----------------------------------------
+  static uint8_t shuffleIdxTable5u_0[64] = {
+      1u, 0u, 2u, 1u, 3u, 2u, 4u, 3u, 6u, 5u, 7u, 6u, 8u, 7u, 9u, 8u, 1u, 0u, 2u, 1u, 3u, 2u,
+      4u, 3u, 6u, 5u, 7u, 6u, 8u, 7u, 9u, 8u, 1u, 0u, 2u, 1u, 3u, 2u, 4u, 3u, 6u, 5u, 7u, 6u,
+      8u, 7u, 9u, 8u, 1u, 0u, 2u, 1u, 3u, 2u, 4u, 3u, 6u, 5u, 7u, 6u, 8u, 7u, 9u, 8u};
+  static uint8_t shuffleIdxTable5u_1[64] = {
+      1u, 0u, 2u,  1u, 3u, 2u, 5u, 4u, 6u,  5u, 7u, 6u, 8u, 7u, 10u, 9u, 1u, 0u, 2u,  1u, 3u, 2u,
+      5u, 4u, 6u,  5u, 7u, 6u, 8u, 7u, 10u, 9u, 1u, 0u, 2u, 1u, 3u,  2u, 5u, 4u, 6u,  5u, 7u, 6u,
+      8u, 7u, 10u, 9u, 1u, 0u, 2u, 1u, 3u,  2u, 5u, 4u, 6u, 5u, 7u,  6u, 8u, 7u, 10u, 9u};
+  static uint16_t shiftTable5u_0[32] = {11u, 9u,  7u,  5u, 11u, 9u,  7u,  5u, 11u, 9u,  7u,
+                                        5u,  11u, 9u,  7u, 5u,  11u, 9u,  7u, 5u,  11u, 9u,
+                                        7u,  5u,  11u, 9u, 7u,  5u,  11u, 9u, 7u,  5u};
+  static uint16_t shiftTable5u_1[32] = {2u, 4u, 6u, 0u, 2u, 4u, 6u, 0u, 2u, 4u, 6u,
+                                        0u, 2u, 4u, 6u, 0u, 2u, 4u, 6u, 0u, 2u, 4u,
+                                        6u, 0u, 2u, 4u, 6u, 0u, 2u, 4u, 6u, 0u};
+  static uint16_t permutexIdxTable5u[32] = {0u,  1u,  2u,  3u,  4u,  0x0, 0x0, 0x0, 5u,  6u,  7u,
+                                            8u,  9u,  0x0, 0x0, 0x0, 10u, 11u, 12u, 13u, 14u, 0x0,
+                                            0x0, 0x0, 15u, 16u, 17u, 18u, 19u, 0x0, 0x0, 0x0};
+
+  // ------------------------------------ 6u -----------------------------------------
+  static uint8_t shuffleIdxTable6u_0[64] = {
+      1u, 0u, 2u, 1u, 4u, 3u, 5u, 4u, 7u, 6u, 8u, 7u, 10u, 9u, 11u, 10u,
+      1u, 0u, 2u, 1u, 4u, 3u, 5u, 4u, 7u, 6u, 8u, 7u, 10u, 9u, 11u, 10u,
+      1u, 0u, 2u, 1u, 4u, 3u, 5u, 4u, 7u, 6u, 8u, 7u, 10u, 9u, 11u, 10u,
+      1u, 0u, 2u, 1u, 4u, 3u, 5u, 4u, 7u, 6u, 8u, 7u, 10u, 9u, 11u, 10u};
+  static uint8_t shuffleIdxTable6u_1[64] = {
+      1u, 0u, 3u, 2u, 4u, 3u, 6u, 5u, 7u, 6u, 9u, 8u, 10u, 9u, 12u, 11u,
+      1u, 0u, 3u, 2u, 4u, 3u, 6u, 5u, 7u, 6u, 9u, 8u, 10u, 9u, 12u, 11u,
+      1u, 0u, 3u, 2u, 4u, 3u, 6u, 5u, 7u, 6u, 9u, 8u, 10u, 9u, 12u, 11u,
+      1u, 0u, 3u, 2u, 4u, 3u, 6u, 5u, 7u, 6u, 9u, 8u, 10u, 9u, 12u, 11u};
+  static uint16_t shiftTable6u_0[32] = {10u, 6u,  10u, 6u,  10u, 6u,  10u, 6u,  10u, 6u,  10u,
+                                        6u,  10u, 6u,  10u, 6u,  10u, 6u,  10u, 6u,  10u, 6u,
+                                        10u, 6u,  10u, 6u,  10u, 6u,  10u, 6u,  10u, 6u};
+  static uint16_t shiftTable6u_1[32] = {4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u,
+                                        0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u,
+                                        4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u};
+  static uint32_t permutexIdxTable6u[16] = {0u, 1u, 2u, 0x0, 3u, 4u,  5u,  0x0,
+                                            6u, 7u, 8u, 0x0, 9u, 10u, 11u, 0x0};
+
+  // ------------------------------------ 7u -----------------------------------------
+  static uint8_t shuffleIdxTable7u_0[64] = {
+      1u, 0u, 2u, 1u, 4u, 3u, 6u, 5u, 8u, 7u, 9u, 8u, 11u, 10u, 13u, 12u,
+      1u, 0u, 2u, 1u, 4u, 3u, 6u, 5u, 8u, 7u, 9u, 8u, 11u, 10u, 13u, 12u,
+      1u, 0u, 2u, 1u, 4u, 3u, 6u, 5u, 8u, 7u, 9u, 8u, 11u, 10u, 13u, 12u,
+      1u, 0u, 2u, 1u, 4u, 3u, 6u, 5u, 8u, 7u, 9u, 8u, 11u, 10u, 13u, 12u};
+  static uint8_t shuffleIdxTable7u_1[64] = {
+      1u, 0u, 3u, 2u, 5u, 4u, 7u, 6u, 8u, 7u, 10u, 9u, 12u, 11u, 14u, 13u,
+      1u, 0u, 3u, 2u, 5u, 4u, 7u, 6u, 8u, 7u, 10u, 9u, 12u, 11u, 14u, 13u,
+      1u, 0u, 3u, 2u, 5u, 4u, 7u, 6u, 8u, 7u, 10u, 9u, 12u, 11u, 14u, 13u,
+      1u, 0u, 3u, 2u, 5u, 4u, 7u, 6u, 8u, 7u, 10u, 9u, 12u, 11u, 14u, 13u};
+  static uint16_t shiftTable7u_0[32] = {9u, 3u, 5u, 7u, 9u, 3u, 5u, 7u, 9u, 3u, 5u,
+                                        7u, 9u, 3u, 5u, 7u, 9u, 3u, 5u, 7u, 9u, 3u,
+                                        5u, 7u, 9u, 3u, 5u, 7u, 9u, 3u, 5u, 7u};
+  static uint16_t shiftTable7u_1[32] = {6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u,
+                                        0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u,
+                                        2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u};
+  static uint16_t permutexIdxTable7u[32] = {0u,  1u,  2u,  3u,  4u,  5u,  6u,  0x0, 7u,  8u,  9u,
+                                            10u, 11u, 12u, 13u, 0x0, 14u, 15u, 16u, 17u, 18u, 19u,
+                                            20u, 0x0, 21u, 22u, 23u, 24u, 25u, 26u, 27u, 0x0};
+
+  // ------------------------------------ 9u -----------------------------------------
+  static uint16_t permutexIdxTable9u_0[32] = {0u,  1u,  1u,  2u,  2u,  3u,  3u,  4u,  4u,  5u,  5u,
+                                              6u,  6u,  7u,  7u,  8u,  9u,  10u, 10u, 11u, 11u, 12u,
+                                              12u, 13u, 13u, 14u, 14u, 15u, 15u, 16u, 16u, 17u};
+  static uint16_t permutexIdxTable9u_1[32] = {0u,  1u,  1u,  2u,  2u,  3u,  3u,  4u,  5u,  6u,  6u,
+                                              7u,  7u,  8u,  8u,  9u,  9u,  10u, 10u, 11u, 11u, 12u,
+                                              12u, 13u, 14u, 15u, 15u, 16u, 16u, 17u, 17u, 18u};
+  static uint32_t shiftTable9u_0[16] = {0u, 2u, 4u, 6u, 8u, 10u, 12u, 14u,
+                                        0u, 2u, 4u, 6u, 8u, 10u, 12u, 14u};
+  static uint32_t shiftTable9u_1[16] = {7u, 5u, 3u, 1u, 15u, 13u, 11u, 9u,
+                                        7u, 5u, 3u, 1u, 15u, 13u, 11u, 9u};
+
+  static uint8_t shuffleIdxTable9u_0[64] = {
+      1u, 0u, 2u, 1u, 3u, 2u, 4u, 3u, 5u, 4u, 6u, 5u, 7u, 6u, 8u, 7u, 1u, 0u, 2u, 1u, 3u, 2u,
+      4u, 3u, 5u, 4u, 6u, 5u, 7u, 6u, 8u, 7u, 1u, 0u, 2u, 1u, 3u, 2u, 4u, 3u, 5u, 4u, 6u, 5u,
+      7u, 6u, 8u, 7u, 1u, 0u, 2u, 1u, 3u, 2u, 4u, 3u, 5u, 4u, 6u, 5u, 7u, 6u, 8u, 7u};
+  static uint16_t shiftTable9u_2[32] = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u, 7u, 6u, 5u,
+                                        4u, 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 3u, 2u,
+                                        1u, 0u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
+  static uint64_t gatherIdxTable9u[8] = {0u, 8u, 9u, 17u, 18u, 26u, 27u, 35u};
+
+  // ------------------------------------ 10u -----------------------------------------
+  static uint8_t shuffleIdxTable10u_0[64] = {
+      1u, 0u, 2u, 1u, 3u, 2u, 4u, 3u, 6u, 5u, 7u, 6u, 8u, 7u, 9u, 8u, 1u, 0u, 2u, 1u, 3u, 2u,
+      4u, 3u, 6u, 5u, 7u, 6u, 8u, 7u, 9u, 8u, 1u, 0u, 2u, 1u, 3u, 2u, 4u, 3u, 6u, 5u, 7u, 6u,
+      8u, 7u, 9u, 8u, 1u, 0u, 2u, 1u, 3u, 2u, 4u, 3u, 6u, 5u, 7u, 6u, 8u, 7u, 9u, 8u};
+  static uint16_t shiftTable10u[32] = {6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u,
+                                       0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u,
+                                       2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u};
+  static uint16_t permutexIdxTable10u[32] = {0u,  1u,  2u,  3u,  4u,  0x0, 0x0, 0x0, 5u,  6u,  7u,
+                                             8u,  9u,  0x0, 0x0, 0x0, 10u, 11u, 12u, 13u, 14u, 0x0,
+                                             0x0, 0x0, 15u, 16u, 17u, 18u, 19u, 0x0, 0x0, 0x0};
+
+  // ------------------------------------ 11u -----------------------------------------
+  static uint16_t permutexIdxTable11u_0[32] = {
+      0u,  1u,  1u,  2u,  2u,  3u,  4u,  5u,  5u,  6u,  6u,  7u,  8u,  9u,  9u,  10u,
+      11u, 12u, 12u, 13u, 13u, 14u, 15u, 16u, 16u, 17u, 17u, 18u, 19u, 20u, 20u, 21u};
+  static uint16_t permutexIdxTable11u_1[32] = {
+      0u,  1u,  2u,  3u,  3u,  4u,  4u,  5u,  6u,  7u,  7u,  8u,  8u,  9u,  10u, 11u,
+      11u, 12u, 13u, 14u, 14u, 15u, 15u, 16u, 17u, 18u, 18u, 19u, 19u, 20u, 21u, 22u};
+  static uint32_t shiftTable11u_0[16] = {0u, 6u, 12u, 2u, 8u, 14u, 4u, 10u,
+                                         0u, 6u, 12u, 2u, 8u, 14u, 4u, 10u};
+  static uint32_t shiftTable11u_1[16] = {5u, 15u, 9u, 3u, 13u, 7u, 1u, 11u,
+                                         5u, 15u, 9u, 3u, 13u, 7u, 1u, 11u};
+
+  static uint8_t shuffleIdxTable11u_0[64] = {
+      3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 11u, 10u, 9u, 8u,
+      3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 11u, 10u, 9u, 8u,
+      3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 11u, 10u, 9u, 8u,
+      3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 11u, 10u, 9u, 8u};
+  static uint8_t shuffleIdxTable11u_1[64] = {
+      3u, 2u, 1u, 0u, 6u, 5u, 4u, 0u, 8u, 7u, 6u, 0u, 11u, 10u, 9u, 0u,
+      3u, 2u, 1u, 0u, 6u, 5u, 4u, 0u, 8u, 7u, 6u, 0u, 11u, 10u, 9u, 0u,
+      3u, 2u, 1u, 0u, 6u, 5u, 4u, 0u, 8u, 7u, 6u, 0u, 11u, 10u, 9u, 0u,
+      3u, 2u, 1u, 0u, 6u, 5u, 4u, 0u, 8u, 7u, 6u, 0u, 11u, 10u, 9u, 0u};
+  static uint32_t shiftTable11u_2[16] = {21u, 15u, 17u, 19u, 21u, 15u, 17u, 19u,
+                                         21u, 15u, 17u, 19u, 21u, 15u, 17u, 19u};
+  static uint32_t shiftTable11u_3[16] = {6u, 4u, 10u, 8u, 6u, 4u, 10u, 8u,
+                                         6u, 4u, 10u, 8u, 6u, 4u, 10u, 8u};
+  static uint64_t gatherIdxTable11u[8] = {0u, 8u, 11u, 19u, 22u, 30u, 33u, 41u};
+
+  // ------------------------------------ 12u -----------------------------------------
+  static uint8_t shuffleIdxTable12u_0[64] = {
+      1u, 0u, 2u, 1u, 4u, 3u, 5u, 4u, 7u, 6u, 8u, 7u, 10u, 9u, 11u, 10u,
+      1u, 0u, 2u, 1u, 4u, 3u, 5u, 4u, 7u, 6u, 8u, 7u, 10u, 9u, 11u, 10u,
+      1u, 0u, 2u, 1u, 4u, 3u, 5u, 4u, 7u, 6u, 8u, 7u, 10u, 9u, 11u, 10u,
+      1u, 0u, 2u, 1u, 4u, 3u, 5u, 4u, 7u, 6u, 8u, 7u, 10u, 9u, 11u, 10u};
+  static uint16_t shiftTable12u[32] = {4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u,
+                                       0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u,
+                                       4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u};
+  static uint32_t permutexIdxTable12u[16] = {0u, 1u, 2u, 0x0, 3u, 4u,  5u,  0x0,
+                                             6u, 7u, 8u, 0x0, 9u, 10u, 11u, 0x0};
+
+  // ------------------------------------ 13u -----------------------------------------
+  static uint16_t permutexIdxTable13u_0[32] = {
+      0u,  1u,  1u,  2u,  3u,  4u,  4u,  5u,  6u,  7u,  8u,  9u,  9u,  10u, 11u, 12u,
+      13u, 14u, 14u, 15u, 16u, 17u, 17u, 18u, 19u, 20u, 21u, 22u, 22u, 23u, 24u, 25u};
+  static uint16_t permutexIdxTable13u_1[32] = {
+      0u,  1u,  2u,  3u,  4u,  5u,  5u,  6u,  7u,  8u,  8u,  9u,  10u, 11u, 12u, 13u,
+      13u, 14u, 15u, 16u, 17u, 18u, 18u, 19u, 20u, 21u, 21u, 22u, 23u, 24u, 25u, 26u};
+  static uint32_t shiftTable13u_0[16] = {0u, 10u, 4u, 14u, 8u, 2u, 12u, 6u,
+                                         0u, 10u, 4u, 14u, 8u, 2u, 12u, 6u};
+  static uint32_t shiftTable13u_1[16] = {3u, 9u, 15u, 5u, 11u, 1u, 7u, 13u,
+                                         3u, 9u, 15u, 5u, 11u, 1u, 7u, 13u};
+
+  static uint8_t shuffleIdxTable13u_0[64] = {
+      3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 9u, 8u, 7u, 6u, 12u, 11u, 10u, 9u,
+      3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 9u, 8u, 7u, 6u, 12u, 11u, 10u, 9u,
+      3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 9u, 8u, 7u, 6u, 12u, 11u, 10u, 9u,
+      3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 9u, 8u, 7u, 6u, 12u, 11u, 10u, 9u};
+  static uint8_t shuffleIdxTable13u_1[64] = {
+      3u, 2u, 1u, 0u, 6u, 5u, 4u, 0u, 10u, 9u, 8u, 0u, 13u, 12u, 11u, 0u,
+      3u, 2u, 1u, 0u, 6u, 5u, 4u, 0u, 10u, 9u, 8u, 0u, 13u, 12u, 11u, 0u,
+      3u, 2u, 1u, 0u, 6u, 5u, 4u, 0u, 10u, 9u, 8u, 0u, 13u, 12u, 11u, 0u,
+      3u, 2u, 1u, 0u, 6u, 5u, 4u, 0u, 10u, 9u, 8u, 0u, 13u, 12u, 11u, 0u};
+  static uint32_t shiftTable13u_2[16] = {19u, 17u, 15u, 13u, 19u, 17u, 15u, 13u,
+                                         19u, 17u, 15u, 13u, 19u, 17u, 15u, 13u};
+  static uint32_t shiftTable13u_3[16] = {10u, 12u, 6u, 8u, 10u, 12u, 6u, 8u,
+                                         10u, 12u, 6u, 8u, 10u, 12u, 6u, 8u};
+  static uint64_t gatherIdxTable13u[8] = {0u, 8u, 13u, 21u, 26u, 34u, 39u, 47u};
+
+  // ------------------------------------ 14u -----------------------------------------
+  static uint8_t shuffleIdxTable14u_0[64] = {
+      3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 10u, 9u, 8u, 7u, 13u, 12u, 11u, 10u,
+      3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 10u, 9u, 8u, 7u, 13u, 12u, 11u, 10u,
+      3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 10u, 9u, 8u, 7u, 13u, 12u, 11u, 10u,
+      3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 10u, 9u, 8u, 7u, 13u, 12u, 11u, 10u};
+  static uint8_t shuffleIdxTable14u_1[64] = {
+      3u, 2u, 1u, 0u, 7u, 6u, 5u, 0u, 10u, 9u, 8u, 0u, 14u, 13u, 12u, 0u,
+      3u, 2u, 1u, 0u, 7u, 6u, 5u, 0u, 10u, 9u, 8u, 0u, 14u, 13u, 12u, 0u,
+      3u, 2u, 1u, 0u, 7u, 6u, 5u, 0u, 10u, 9u, 8u, 0u, 14u, 13u, 12u, 0u,
+      3u, 2u, 1u, 0u, 7u, 6u, 5u, 0u, 10u, 9u, 8u, 0u, 14u, 13u, 12u, 0u};
+  static uint32_t shiftTable14u_0[16] = {18u, 14u, 18u, 14u, 18u, 14u, 18u, 14u,
+                                         18u, 14u, 18u, 14u, 18u, 14u, 18u, 14u};
+  static uint32_t shiftTable14u_1[16] = {12u, 8u, 12u, 8u, 12u, 8u, 12u, 8u,
+                                         12u, 8u, 12u, 8u, 12u, 8u, 12u, 8u};
+  static uint16_t permutexIdxTable14u[32] = {0u,  1u,  2u,  3u,  4u,  5u,  6u,  0x0, 7u,  8u,  9u,
+                                             10u, 11u, 12u, 13u, 0x0, 14u, 15u, 16u, 17u, 18u, 19u,
+                                             20u, 0x0, 21u, 22u, 23u, 24u, 25u, 26u, 27u, 0x0};
+
+  // ------------------------------------ 15u -----------------------------------------
+  static uint16_t permutexIdxTable15u_0[32] = {
+      0u,  1u,  1u,  2u,  3u,  4u,  5u,  6u,  7u,  8u,  9u,  10u, 11u, 12u, 13u, 14u,
+      15u, 16u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 26u, 27u, 28u, 29u};
+  static uint16_t permutexIdxTable15u_1[32] = {
+      0u,  1u,  2u,  3u,  4u,  5u,  6u,  7u,  8u,  9u,  10u, 11u, 12u, 13u, 14u, 15u,
+      15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 26u, 27u, 28u, 29u, 30u};
+  static uint32_t shiftTable15u_0[16] = {0u, 14u, 12u, 10u, 8u, 6u, 4u, 2u,
+                                         0u, 14u, 12u, 10u, 8u, 6u, 4u, 2u};
+  static uint32_t shiftTable15u_1[16] = {1u, 3u, 5u, 7u, 9u, 11u, 13u, 15u,
+                                         1u, 3u, 5u, 7u, 9u, 11u, 13u, 15u};
+
+  static uint8_t shuffleIdxTable15u_0[64] = {
+      3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 10u, 9u, 8u, 7u, 14u, 13u, 12u, 11u,
+      3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 10u, 9u, 8u, 7u, 14u, 13u, 12u, 11u,
+      3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 10u, 9u, 8u, 7u, 14u, 13u, 12u, 11u,
+      3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 10u, 9u, 8u, 7u, 14u, 13u, 12u, 11u};
+  static uint8_t shuffleIdxTable15u_1[64] = {
+      3u, 2u, 1u, 0u, 7u, 6u, 5u, 0u, 11u, 10u, 9u, 0u, 15u, 14u, 13u, 0u,
+      3u, 2u, 1u, 0u, 7u, 6u, 5u, 0u, 11u, 10u, 9u, 0u, 15u, 14u, 13u, 0u,
+      3u, 2u, 1u, 0u, 7u, 6u, 5u, 0u, 11u, 10u, 9u, 0u, 15u, 14u, 13u, 0u,
+      3u, 2u, 1u, 0u, 7u, 6u, 5u, 0u, 11u, 10u, 9u, 0u, 15u, 14u, 13u, 0u};
+  static uint32_t shiftTable15u_2[16] = {17u, 11u, 13u, 15u, 17u, 11u, 13u, 15u,
+                                         17u, 11u, 13u, 15u, 17u, 11u, 13u, 15u};
+  static uint32_t shiftTable15u_3[16] = {14u, 12u, 10u, 8u, 14u, 12u, 10u, 8u,
+                                         14u, 12u, 10u, 8u, 14u, 12u, 10u, 8u};
+  static uint64_t gatherIdxTable15u[8] = {0u, 8u, 15u, 23u, 30u, 38u, 45u, 53u};
+
+  // ------------------------------------ 17u -----------------------------------------
+  static uint32_t permutexIdxTable17u_0[16] = {0u, 1u, 1u, 2u, 2u, 3u, 3u, 4u,
+                                               4u, 5u, 5u, 6u, 6u, 7u, 7u, 8u};
+  static uint32_t permutexIdxTable17u_1[16] = {0u, 1u, 1u, 2u, 2u, 3u, 3u, 4u,
+                                               4u, 5u, 5u, 6u, 6u, 7u, 7u, 8u};
+  static uint64_t shiftTable17u_0[8] = {0u, 2u, 4u, 6u, 8u, 10u, 12u, 14u};
+  static uint64_t shiftTable17u_1[8] = {15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u};
+
+  static uint8_t shuffleIdxTable17u_0[64] = {
+      3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 7u, 6u, 5u, 4u, 9u, 8u, 7u, 6u, 3u, 2u, 1u, 0u, 5u, 4u,
+      3u, 2u, 7u, 6u, 5u, 4u, 9u, 8u, 7u, 6u, 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 7u, 6u, 5u, 4u,
+      9u, 8u, 7u, 6u, 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 7u, 6u, 5u, 4u, 9u, 8u, 7u, 6u};
+  static uint32_t shiftTable17u_2[16] = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u,
+                                         15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u};
+  static uint64_t gatherIdxTable17u[8] = {0u, 8u, 8u, 16u, 17u, 25u, 25u, 33u};
+
+  // ------------------------------------ 18u -----------------------------------------
+  static uint32_t permutexIdxTable18u_0[16] = {0u, 1u, 1u, 2u, 2u, 3u, 3u, 4u,
+                                               4u, 5u, 5u, 6u, 6u, 7u, 7u, 8u};
+  static uint32_t permutexIdxTable18u_1[16] = {0u, 1u, 1u, 2u, 2u, 3u, 3u, 4u,
+                                               5u, 6u, 6u, 7u, 7u, 8u, 8u, 9u};
+  static uint64_t shiftTable18u_0[8] = {0u, 4u, 8u, 12u, 16u, 20u, 24u, 28u};
+  static uint64_t shiftTable18u_1[8] = {14u, 10u, 6u, 2u, 30u, 26u, 22u, 18u};
+
+  static uint8_t shuffleIdxTable18u_0[64] = {
+      3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 7u, 6u, 5u, 4u, 9u, 8u, 7u, 6u, 3u, 2u, 1u, 0u, 5u, 4u,
+      3u, 2u, 7u, 6u, 5u, 4u, 9u, 8u, 7u, 6u, 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 7u, 6u, 5u, 4u,
+      9u, 8u, 7u, 6u, 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 7u, 6u, 5u, 4u, 9u, 8u, 7u, 6u};
+  static uint32_t shiftTable18u_2[16] = {14u, 12u, 10u, 8u, 14u, 12u, 10u, 8u,
+                                         14u, 12u, 10u, 8u, 14u, 12u, 10u, 8u};
+  static uint64_t gatherIdxTable18u[8] = {0u, 8u, 9u, 17u, 18u, 26u, 27u, 35u};
+
+  // ------------------------------------ 19u -----------------------------------------
+  static uint32_t permutexIdxTable19u_0[16] = {0u, 1u, 1u, 2u, 2u, 3u, 3u, 4u,
+                                               4u, 5u, 5u, 6u, 7u, 8u, 8u, 9u};
+  static uint32_t permutexIdxTable19u_1[16] = {0u, 1u, 1u, 2u, 2u, 3u, 4u, 5u,
+                                               5u, 6u, 6u, 7u, 7u, 8u, 8u, 9u};
+  static uint64_t shiftTable19u_0[8] = {0u, 6u, 12u, 18u, 24u, 30u, 4u, 10u};
+  static uint64_t shiftTable19u_1[8] = {13u, 7u, 1u, 27u, 21u, 15u, 9u, 3u};
+
+  static uint8_t shuffleIdxTable19u_0[64] = {
+      3u,  2u, 1u, 0u, 5u, 4u, 3u,  2u, 7u, 6u, 5u, 4u, 10u, 9u, 8u, 7u, 3u,  2u, 1u, 0u, 5u, 4u,
+      3u,  2u, 8u, 7u, 6u, 5u, 10u, 9u, 8u, 7u, 3u, 2u, 1u,  0u, 5u, 4u, 3u,  2u, 7u, 6u, 5u, 4u,
+      10u, 9u, 8u, 7u, 3u, 2u, 1u,  0u, 5u, 4u, 3u, 2u, 8u,  7u, 6u, 5u, 10u, 9u, 8u, 7u};
+  static uint32_t shiftTable19u_2[16] = {13u, 10u, 7u, 12u, 9u, 6u, 11u, 8u,
+                                         13u, 10u, 7u, 12u, 9u, 6u, 11u, 8u};
+  static uint64_t gatherIdxTable19u[8] = {0u, 8u, 9u, 17u, 19u, 27u, 28u, 36u};
+
+  // ------------------------------------ 20u -----------------------------------------
+  static uint8_t shuffleIdxTable20u_0[64] = {
+      3u,  2u, 1u, 0u, 5u, 4u, 3u,  2u, 8u, 7u, 6u, 5u, 10u, 9u, 8u, 7u, 3u,  2u, 1u, 0u, 5u, 4u,
+      3u,  2u, 8u, 7u, 6u, 5u, 10u, 9u, 8u, 7u, 3u, 2u, 1u,  0u, 5u, 4u, 3u,  2u, 8u, 7u, 6u, 5u,
+      10u, 9u, 8u, 7u, 3u, 2u, 1u,  0u, 5u, 4u, 3u, 2u, 8u,  7u, 6u, 5u, 10u, 9u, 8u, 7u};
+  static uint32_t shiftTable20u[16] = {12u, 8u, 12u, 8u, 12u, 8u, 12u, 8u,
+                                       12u, 8u, 12u, 8u, 12u, 8u, 12u, 8u};
+  static uint16_t permutexIdxTable20u[32] = {0u,  1u,  2u,  3u,  4u,  0x0, 0x0, 0x0, 5u,  6u,  7u,
+                                             8u,  9u,  0x0, 0x0, 0x0, 10u, 11u, 12u, 13u, 14u, 0x0,
+                                             0x0, 0x0, 15u, 16u, 17u, 18u, 19u, 0x0, 0x0, 0x0};
+
+  // ------------------------------------ 21u -----------------------------------------
+  // Constant tables for the 21-bit case.
+  // NOTE(review): going by the names, these are permute indexes, shift counts, byte-shuffle
+  // indexes and gather offsets for the 21-bit unpack path -- confirm at the use sites before
+  // changing any value.
+  static uint32_t permutexIdxTable21u_0[16] = {0u, 1u, 1u, 2u, 2u, 3u, 3u, 4u,
+                                               5u, 6u, 6u, 7u, 7u, 8u, 9u, 10u};
+  static uint32_t permutexIdxTable21u_1[16] = {0u, 1u, 1u, 2u, 3u, 4u, 4u, 5u,
+                                               5u, 6u, 7u, 8u, 8u, 9u, 9u, 10u};
+  static uint64_t shiftTable21u_0[8] = {0u, 10u, 20u, 30u, 8u, 18u, 28u, 6u};
+  static uint64_t shiftTable21u_1[8] = {11u, 1u, 23u, 13u, 3u, 25u, 15u, 5u};
+
+  // Four 16-entry groups: groups 0 and 2 match, groups 1 and 3 match.
+  static uint8_t shuffleIdxTable21u_0[64] = {
+      3u,  2u, 1u, 0u, 5u, 4u, 3u,  2u,  8u, 7u, 6u, 5u, 10u, 9u, 8u, 7u, 3u,  2u,  1u, 0u, 6u, 5u,
+      4u,  3u, 8u, 7u, 6u, 5u, 11u, 10u, 9u, 8u, 3u, 2u, 1u,  0u, 5u, 4u, 3u,  2u,  8u, 7u, 6u, 5u,
+      10u, 9u, 8u, 7u, 3u, 2u, 1u,  0u,  6u, 5u, 4u, 3u, 8u,  7u, 6u, 5u, 11u, 10u, 9u, 8u};
+  // Entries 8..15 repeat entries 0..7; the first entry, 11, equals 32 - 21.
+  static uint32_t shiftTable21u_2[16] = {11u, 6u, 9u, 4u, 7u, 10u, 5u, 8u,
+                                         11u, 6u, 9u, 4u, 7u, 10u, 5u, 8u};
+  static uint64_t gatherIdxTable21u[8] = {0u, 8u, 10u, 18u, 21u, 29u, 31u, 39u};
+
+  // ------------------------------------ 22u -----------------------------------------
+  // Constant tables for the 22-bit case.
+  // NOTE(review): going by the names, these are permute indexes, shift counts, byte-shuffle
+  // indexes and gather offsets for the 22-bit unpack path -- confirm at the use sites before
+  // changing any value.
+  static uint32_t permutexIdxTable22u_0[16] = {0u, 1u, 1u, 2u, 2u, 3u, 4u, 5u,
+                                               5u, 6u, 6u, 7u, 8u, 9u, 9u, 10u};
+  static uint32_t permutexIdxTable22u_1[16] = {0u, 1u, 2u, 3u, 3u, 4u, 4u,  5u,
+                                               6u, 7u, 7u, 8u, 8u, 9u, 10u, 11u};
+  static uint64_t shiftTable22u_0[8] = {0u, 12u, 24u, 4u, 16u, 28u, 8u, 20u};
+  static uint64_t shiftTable22u_1[8] = {10u, 30u, 18u, 6u, 26u, 14u, 2u, 22u};
+
+  // The same 16-entry row repeated four times.
+  static uint8_t shuffleIdxTable22u_0[64] = {
+      3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 11u, 10u, 9u, 8u,
+      3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 11u, 10u, 9u, 8u,
+      3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 11u, 10u, 9u, 8u,
+      3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 11u, 10u, 9u, 8u};
+  // The 4-entry pattern {10, 4, 6, 8} repeated four times; 10 equals 32 - 22.
+  static uint32_t shiftTable22u_2[16] = {10u, 4u, 6u, 8u, 10u, 4u, 6u, 8u,
+                                         10u, 4u, 6u, 8u, 10u, 4u, 6u, 8u};
+  static uint64_t gatherIdxTable22u[8] = {0u, 8u, 11u, 19u, 22u, 30u, 33u, 41u};
+
+  // ------------------------------------ 23u -----------------------------------------
+  // Constant tables for the 23-bit case.
+  // NOTE(review): going by the names, these are permute indexes, shift counts, byte-shuffle
+  // indexes and gather offsets for the 23-bit unpack path -- confirm at the use sites before
+  // changing any value.
+  static uint32_t permutexIdxTable23u_0[16] = {0u, 1u, 1u, 2u, 2u, 3u, 4u,  5u,
+                                               5u, 6u, 7u, 8u, 8u, 9u, 10u, 11u};
+  static uint32_t permutexIdxTable23u_1[16] = {0u, 1u, 2u, 3u, 3u, 4u,  5u,  6u,
+                                               6u, 7u, 7u, 8u, 9u, 10u, 10u, 11u};
+  static uint64_t shiftTable23u_0[8] = {0u, 14u, 28u, 10u, 24u, 6u, 20u, 2u};
+  static uint64_t shiftTable23u_1[8] = {9u, 27u, 13u, 31u, 17u, 3u, 21u, 7u};
+
+  // Four 16-entry rows: rows 0 and 2 match, rows 1 and 3 match.
+  static uint8_t shuffleIdxTable23u_0[64] = {
+      3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 11u, 10u, 9u,  8u,
+      3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 9u, 8u, 7u, 6u, 12u, 11u, 10u, 9u,
+      3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 11u, 10u, 9u,  8u,
+      3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 9u, 8u, 7u, 6u, 12u, 11u, 10u, 9u};
+  // Entries 8..15 repeat entries 0..7; the first entry, 9, equals 32 - 23.
+  static uint32_t shiftTable23u_2[16] = {9u, 2u, 3u, 4u, 5u, 6u, 7u, 8u,
+                                         9u, 2u, 3u, 4u, 5u, 6u, 7u, 8u};
+  static uint64_t gatherIdxTable23u[8] = {0u, 8u, 11u, 19u, 23u, 31u, 34u, 42u};
+
+  // ------------------------------------ 24u -----------------------------------------
+  // Constant tables for the 24-bit case.
+  // NOTE(review): going by the names, these are byte-shuffle and permute indexes for the 24-bit
+  // unpack path -- confirm at the use sites before changing any value.
+  // The same 16-entry row repeated four times: three descending byte indexes, then 0xFF.
+  // NOTE(review): 0xFF presumably marks a destination byte to be zeroed (high bit set in a
+  // byte-shuffle control) -- confirm against the shuffle instruction used.
+  static uint8_t shuffleIdxTable24u_0[64] = {
+      2u, 1u, 0u, 0xFF, 5u, 4u, 3u, 0xFF, 8u, 7u, 6u, 0xFF, 11u, 10u, 9u, 0xFF,
+      2u, 1u, 0u, 0xFF, 5u, 4u, 3u, 0xFF, 8u, 7u, 6u, 0xFF, 11u, 10u, 9u, 0xFF,
+      2u, 1u, 0u, 0xFF, 5u, 4u, 3u, 0xFF, 8u, 7u, 6u, 0xFF, 11u, 10u, 9u, 0xFF,
+      2u, 1u, 0u, 0xFF, 5u, 4u, 3u, 0xFF, 8u, 7u, 6u, 0xFF, 11u, 10u, 9u, 0xFF};
+  // Each group of four 32-bit entries holds three consecutive indexes followed by a zero.
+  static uint32_t permutexIdxTable24u[16] = {0u, 1u, 2u, 0x0, 3u, 4u,  5u,  0x0,
+                                             6u, 7u, 8u, 0x0, 9u, 10u, 11u, 0x0};
+
+  // ------------------------------------ 26u -----------------------------------------
+  // Constant tables for the 26-bit case.
+  // NOTE(review): going by the names, these are permute indexes, shift counts, byte-shuffle
+  // indexes and gather offsets for the 26-bit unpack path -- confirm at the use sites before
+  // changing any value.
+  static uint32_t permutexIdxTable26u_0[16] = {0u, 1u, 1u, 2u, 3u, 4u,  4u,  5u,
+                                               6u, 7u, 8u, 9u, 9u, 10u, 11u, 12u};
+  static uint32_t permutexIdxTable26u_1[16] = {0u, 1u, 2u, 3u, 4u,  5u,  5u,  6u,
+                                               7u, 8u, 8u, 9u, 10u, 11u, 12u, 13u};
+  static uint64_t shiftTable26u_0[8] = {0u, 20u, 8u, 28u, 16u, 4u, 24u, 12u};
+  static uint64_t shiftTable26u_1[8] = {6u, 18u, 30u, 10u, 22u, 2u, 14u, 26u};
+
+  // The same 16-entry row repeated four times.
+  static uint8_t shuffleIdxTable26u_0[64] = {
+      3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 9u, 8u, 7u, 6u, 12u, 11u, 10u, 9u,
+      3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 9u, 8u, 7u, 6u, 12u, 11u, 10u, 9u,
+      3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 9u, 8u, 7u, 6u, 12u, 11u, 10u, 9u,
+      3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 9u, 8u, 7u, 6u, 12u, 11u, 10u, 9u};
+  // The 4-entry pattern {6, 4, 2, 0} repeated four times; 6 equals 32 - 26.
+  static uint32_t shiftTable26u_2[16] = {6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u,
+                                         6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u};
+  static uint64_t gatherIdxTable26u[8] = {0u, 8u, 13u, 21u, 26u, 34u, 39u, 47u};
+
+  // ------------------------------------ 28u -----------------------------------------
+  // Constant tables for the 28-bit case.
+  // NOTE(review): going by the names, these are byte-shuffle indexes, shift counts and permute
+  // indexes for the 28-bit unpack path -- confirm at the use sites before changing any value.
+  // The same 16-entry row repeated four times.
+  static uint8_t shuffleIdxTable28u_0[64] = {
+      3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 10u, 9u, 8u, 7u, 13u, 12u, 11u, 10u,
+      3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 10u, 9u, 8u, 7u, 13u, 12u, 11u, 10u,
+      3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 10u, 9u, 8u, 7u, 13u, 12u, 11u, 10u,
+      3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 10u, 9u, 8u, 7u, 13u, 12u, 11u, 10u};
+  // Alternates 4 and 0; 4 equals 32 - 28.
+  static uint32_t shiftTable28u[16] = {4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u,
+                                       4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u};
+  // Each group of eight 16-bit entries holds seven consecutive indexes followed by a zero.
+  static uint16_t permutexIdxTable28u[32] = {0u,  1u,  2u,  3u,  4u,  5u,  6u,  0x0, 7u,  8u,  9u,
+                                             10u, 11u, 12u, 13u, 0x0, 14u, 15u, 16u, 17u, 18u, 19u,
+                                             20u, 0x0, 21u, 22u, 23u, 24u, 25u, 26u, 27u, 0x0};
+
+  // ------------------------------------ 30u -----------------------------------------
+  // Constant tables for the 30-bit case.
+  // NOTE(review): going by the names, these are permute indexes, shift counts, byte-shuffle
+  // indexes and gather offsets for the 30-bit unpack path -- confirm at the use sites before
+  // changing any value.
+  static uint32_t permutexIdxTable30u_0[16] = {0u, 1u, 1u, 2u,  3u,  4u,  5u,  6u,
+                                               7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u};
+  static uint32_t permutexIdxTable30u_1[16] = {0u, 1u, 2u,  3u,  4u,  5u,  6u,  7u,
+                                               8u, 9u, 10u, 11u, 12u, 13u, 14u, 15u};
+  static uint64_t shiftTable30u_0[8] = {0u, 28u, 24u, 20u, 16u, 12u, 8u, 4u};
+  static uint64_t shiftTable30u_1[8] = {2u, 6u, 10u, 14u, 18u, 22u, 26u, 30u};
+
+  // Both shuffle tables below repeat a single 16-entry row four times.
+  static uint8_t shuffleIdxTable30u_0[64] = {
+      0u, 0u, 0u, 4u, 3u, 2u, 1u, 0u, 0u, 0u, 0u, 11u, 10u, 9u, 8u, 7u,
+      0u, 0u, 0u, 4u, 3u, 2u, 1u, 0u, 0u, 0u, 0u, 11u, 10u, 9u, 8u, 7u,
+      0u, 0u, 0u, 4u, 3u, 2u, 1u, 0u, 0u, 0u, 0u, 11u, 10u, 9u, 8u, 7u,
+      0u, 0u, 0u, 4u, 3u, 2u, 1u, 0u, 0u, 0u, 0u, 11u, 10u, 9u, 8u, 7u};
+  static uint8_t shuffleIdxTable30u_1[64] = {
+      7u, 6u, 5u, 4u, 3u, 0u, 0u, 0u, 15u, 14u, 13u, 12u, 11u, 0u, 0u, 0u,
+      7u, 6u, 5u, 4u, 3u, 0u, 0u, 0u, 15u, 14u, 13u, 12u, 11u, 0u, 0u, 0u,
+      7u, 6u, 5u, 4u, 3u, 0u, 0u, 0u, 15u, 14u, 13u, 12u, 11u, 0u, 0u, 0u,
+      7u, 6u, 5u, 4u, 3u, 0u, 0u, 0u, 15u, 14u, 13u, 12u, 11u, 0u, 0u, 0u};
+  // Unlike the narrower widths, the two shift tables below hold 64-bit counts.
+  static uint64_t shiftTable30u_2[8] = {34u, 30u, 34u, 30u, 34u, 30u, 34u, 30u};
+  static uint64_t shiftTable30u_3[8] = {28u, 24u, 28u, 24u, 28u, 24u, 28u, 24u};
+  static uint64_t gatherIdxTable30u[8] = {0u, 8u, 15u, 23u, 30u, 38u, 45u, 53u};
+
+  // Read as little-endian bytes, each 16-byte pair of entries is a lookup table that maps a
+  // nibble value i (0..15) to i with its four bits reversed (0x0->0x0, 0x1->0x8, 0x2->0x4, ...).
+  // The pair is repeated four times to fill 64 bytes.
+  // NOTE(review): presumably used as an in-lane byte-shuffle table -- confirm at the use site.
+  static uint64_t nibbleReverseTable[8] = {
+      0x0E060A020C040800, 0x0F070B030D050901, 0x0E060A020C040800, 0x0F070B030D050901,
+      0x0E060A020C040800, 0x0F070B030D050901, 0x0E060A020C040800, 0x0F070B030D050901};
+
+  // Read as little-endian bytes, these are the byte indexes 0x00..0x3F with every group of eight
+  // in descending order (7..0, 15..8, ...), i.e. a byte reversal within each 64-bit element.
+  // NOTE(review): presumably a byte-permute control for the 1-bit path -- confirm at the use site.
+  static uint64_t reverseMaskTable1u[8] = {
+      0x0001020304050607, 0x08090A0B0C0D0E0F, 0x1011121314151617, 0x18191A1B1C1D1E1F,
+      0x2021222324252627, 0x28292A2B2C2D2E2F, 0x3031323334353637, 0x38393A3B3C3D3E3F};
+
+  // Read as little-endian bytes, adjacent byte indexes are swapped pairwise (1,0,3,2,...),
+  // i.e. a byte swap within each 16-bit element.
+  static uint64_t reverseMaskTable16u[8] = {
+      0x0607040502030001, 0x0E0F0C0D0A0B0809, 0x1617141512131011, 0x1E1F1C1D1A1B1819,
+      0x2627242522232021, 0x2E2F2C2D2A2B2829, 0x3637343532333031, 0x3E3F3C3D3A3B3839};
+
+  // Read as little-endian bytes, every group of four byte indexes is in descending order
+  // (3,2,1,0,7,6,5,4,...), i.e. a byte reversal within each 32-bit element.
+  static uint64_t reverseMaskTable32u[8] = {
+      0x0405060700010203, 0x0C0D0E0F08090A0B, 0x1415161710111213, 0x1C1D1E1F18191A1B,
+      0x2425262720212223, 0x2C2D2E2F28292A2B, 0x3435363730313233, 0x3C3D3E3F38393A3B};
+
+  /**
+   * Returns the smallest i in [0, bitSize) for which (i * base) % bitSize equals
+   * bitSize - startBit, or 0xFFFFFFFF when no such i exists.
+   *
+   * With startBit == 0 the target equals bitSize, which a value reduced modulo bitSize can
+   * never reach, so the sentinel 0xFFFFFFFF is returned.
+   *
+   * NOTE(review): presumably the number of base-bit values to decode before the read position
+   * lands on a bitSize-bit boundary -- confirm against the callers.
+   */
+  inline uint32_t getAlign(uint32_t startBit, uint32_t base, uint32_t bitSize) {
+    const uint32_t target = bitSize - startBit;
+    uint32_t count = 0u;
+    while (count < bitSize) {
+      if ((count * base) % bitSize == target) {
+        return count;
+      }
+      ++count;
+    }
+    return 0xFFFFFFFF;
+  }
+
+  inline uint64_t moveLen(uint64_t x, uint64_t y) {

Review Comment:
   It's hard to understand the meaning of this method at a glance. Can we rename the parameters or add some comments? E.g. rename `x` to `numBits`, rename `moveLen` to `moveByteLen`?
   
   It seems `y` is always `ORC_VECTOR_BYTE_WIDTH`. Maybe we can drop this parameter and use the constant directly?
   
   The code can also be simplified to a plain round-up division:
   ```
     inline uint64_t moveLen(uint64_t x, uint64_t y) {
       uint64_t result = x / y;
       if (x % y != 0) ++result;
       return result;
     }
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscribe@orc.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org