You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by ji...@apache.org on 2023/06/13 02:49:23 UTC

[doris-thirdparty] branch clucene updated: [Fix](PFOR) revert TurboPFOR to last version, and fix some build issue (#88)

This is an automated email from the ASF dual-hosted git repository.

jianliangqi pushed a commit to branch clucene
in repository https://gitbox.apache.org/repos/asf/doris-thirdparty.git


The following commit(s) were added to refs/heads/clucene by this push:
     new dae2b5d8 [Fix](PFOR) revert TurboPFOR to last version, and fix some build issue (#88)
dae2b5d8 is described below

commit dae2b5d830a942e2e9692e2f3c0f609eff767d5a
Author: airborne12 <ai...@gmail.com>
AuthorDate: Tue Jun 13 10:49:18 2023 +0800

    [Fix](PFOR) revert TurboPFOR to last version, and fix some build issue (#88)
---
 src/core/CLucene/index/SegmentTermDocs.cpp      |     4 +
 src/core/CMakeLists.txt                         |     2 +-
 src/ext/for/CMakeLists.txt                      |    59 +-
 src/ext/for/README.md                           |   585 +
 src/ext/for/bic.c                               |   201 -
 src/ext/for/bitpack.c                           |   339 +-
 src/ext/for/{include_ => }/bitpack.h            |   136 +-
 src/ext/for/bitpack_.h                          |  1041 +-
 src/ext/for/bitunpack.c                         |   818 +-
 src/ext/for/bitunpack_.h                        |  1032 +-
 src/ext/for/bitutil.c                           |   770 +-
 src/ext/for/{include_/bitutil_.h => bitutil.h}  |   350 +-
 src/ext/for/{include_ => }/conf.h               |   230 +-
 src/ext/for/eliasfano.c                         |   213 -
 src/ext/for/ext/OPT_PFD/main.cpp                |   101 -
 src/ext/for/ext/OPT_PFD/opt_p4.h                |    54 -
 src/ext/for/ext/OPT_PFD/pf.h                    |   158 -
 src/ext/for/ext/OPT_PFD/s16head.h               |   251 -
 src/ext/for/ext/OPT_PFD/unpack.h                |   773 --
 src/ext/for/ext/SPDP_10.c                       |   238 -
 src/ext/for/ext/bg/bg.c                         |   185 -
 src/ext/for/ext/bg/bg.h                         |   109 -
 src/ext/for/ext/bg/defines.h                    |    54 -
 src/ext/for/ext/fastpfor.cc                     |   121 -
 src/ext/for/ext/fastpfor.h                      |    20 -
 src/ext/for/ext/gb.c                            |   151 -
 src/ext/for/ext/gov2.png                        |   Bin 33041 -> 0 bytes
 src/ext/for/ext/libdroundfast.c                 |    61 -
 src/ext/for/ext/polycom/optp4.c                 |    22 -
 src/ext/for/ext/polycom/optp4.h                 |    11 -
 src/ext/for/ext/polycom/optpfd.c                |    26 -
 src/ext/for/ext/polycom/optpfd.h                |    11 -
 src/ext/for/ext/polycom/polyvbyte.c             |    14 -
 src/ext/for/ext/polycom/polyvbyte.h             |    10 -
 src/ext/for/ext/polycom/vbyte_poly.h            |    46 -
 src/ext/for/ext/rc.c                            |  1809 ---
 src/ext/for/ext/rc.h                            |     8 -
 src/ext/for/ext/simdcomp_/simdfor.c             | 14501 ----------------------
 src/ext/for/ext/simple8b.c                      |   330 -
 src/ext/for/ext/simple8b.h                      |     9 -
 src/ext/for/ext/vabyte.h                        |    99 -
 src/ext/for/ext/varintg8iu.c                    |   184 -
 src/ext/for/ext/varintg8iu.h                    |     5 -
 src/ext/for/ext/vas16c.h                        |    36 -
 src/ext/for/ext/vas16d.h                        |   403 -
 src/ext/for/fp.c                                |   954 --
 src/ext/for/{include_ => }/fp.h                 |    65 +-
 src/ext/for/icapp.c                             |  2326 ----
 src/ext/for/iccodec.c                           |   813 --
 src/ext/for/idx.h                               |    53 -
 src/ext/for/idxcr.c                             |   175 -
 src/ext/for/idxqry.c                            |   682 -
 src/ext/for/idxseg.c                            |   133 -
 src/ext/for/include_/bic.h                      |    66 -
 src/ext/for/include_/bitiobe.h                  |    42 -
 src/ext/for/include_/bitutil.h                  |   160 -
 src/ext/for/include_/eliasfano.h                |    36 -
 src/ext/for/include_/iccodec.h                  |   109 -
 src/ext/for/include_/transpose.h                |   231 -
 src/ext/for/include_/vbit.h                     |    29 -
 src/ext/for/include_/vint.h                     |   249 -
 src/ext/for/include_/vlcbit.h                   |   117 -
 src/ext/for/include_/vlcbyte.h                  |   170 -
 src/ext/for/jic.c                               |   175 -
 src/ext/for/jic.h                               |   693 --
 src/ext/for/libext.mak                          |   327 -
 src/ext/for/makefile                            |   156 +
 src/ext/for/makefile.vs                         |    78 +
 src/ext/for/{include_ => }/sse_neon.h           |   217 +-
 src/ext/for/{include_ => }/time_.h              |   159 +-
 src/ext/for/transpose.c                         |  1171 +-
 src/ext/for/transpose.h                         |   113 +
 src/ext/for/transpose_.c                        |   472 -
 src/ext/for/trle.c                              |   125 -
 src/ext/for/{include_ => }/trle.h               |    32 +-
 src/ext/for/trle_.h                             |     8 +-
 src/ext/for/trlec.c                             |    87 +-
 src/ext/for/trled.c                             |    71 +-
 src/ext/for/v8.c                                |   449 +-
 src/ext/for/v8pack.c                            |   203 -
 src/ext/for/vbit.c                              |   304 -
 src/ext/for/vint.c                              |   153 +-
 src/ext/for/vint.h                              |   401 +
 src/ext/for/{include_ => }/vp4.h                |   107 +-
 src/ext/for/vp4c.c                              |   132 +-
 src/ext/for/vp4d.c                              |   116 +-
 src/ext/for/vs/bitpack_avx2.c                   |     2 +
 src/ext/for/vs/bitpack_sse.c                    |     2 +
 src/ext/for/vs/bitunpack_avx2.c                 |     2 +
 src/ext/for/vs/bitunpack_sse.c                  |     2 +
 src/ext/for/vs/getopt.c                         |   562 +
 src/ext/for/vs/getopt.h                         |    97 +
 src/ext/for/vs/inttypes.h                       |   306 +
 src/ext/for/vs/stdint.h                         |   259 +
 src/ext/for/vs/transpose_avx2.c                 |     2 +
 src/ext/for/vs/transpose_sse.c                  |     2 +
 src/ext/for/vs/vp4c_avx2.c                      |     2 +
 src/ext/for/vs/vp4c_sse.c                       |     2 +
 src/ext/for/vs/vp4d_avx2.c                      |     2 +
 src/ext/for/vs/vp4d_sse.c                       |     2 +
 src/ext/for/vs/vs2017/TurboPFor.sln             |    41 +
 src/ext/for/vs/vs2017/TurboPFor.vcxproj         |   226 +
 src/ext/for/vs/vs2017/TurboPFor.vcxproj.filters |   101 +
 src/ext/for/vs/vs2017/icapp.vcxproj             |   175 +
 src/ext/for/vs/vs2017/icapp.vcxproj.filters     |    21 +
 src/ext/for/vsimple.c                           |    43 +-
 src/ext/for/{include_ => }/vsimple.h            |    31 +-
 107 files changed, 6690 insertions(+), 32931 deletions(-)

diff --git a/src/core/CLucene/index/SegmentTermDocs.cpp b/src/core/CLucene/index/SegmentTermDocs.cpp
index f64256d8..0fe90357 100644
--- a/src/core/CLucene/index/SegmentTermDocs.cpp
+++ b/src/core/CLucene/index/SegmentTermDocs.cpp
@@ -162,6 +162,10 @@ int32_t SegmentTermDocs::read(int32_t *docs, int32_t *freqs, int32_t length) {
                 }
             }
         } else {
+            // NOTE: Pad arraySize from 511 to 512 for alignment since the first block size is 511, and add one more extra space to prevent overflow.
+            auto paddingSize = (arraySize / PFOR_BLOCK_SIZE) * PFOR_BLOCK_SIZE + PFOR_BLOCK_SIZE;
+            _docs.resize(paddingSize + 1);
+            _freqs.resize(paddingSize + 1);
             {
                 uint32_t SerializedSize = freqStream->readVInt();
                 std::vector<uint8_t> buf(SerializedSize + PFOR_BLOCK_SIZE);
diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt
index 8f9422fa..ce1dfcb5 100644
--- a/src/core/CMakeLists.txt
+++ b/src/core/CMakeLists.txt
@@ -209,7 +209,7 @@ file(GLOB_RECURSE HEADERS ${clucene-core_SOURCE_DIR}/*.h )
 
 #create the libraries
 INCLUDE_DIRECTORIES(${clucene_SOURCE_DIR}/src/core)
-INCLUDE_DIRECTORIES(${clucene_SOURCE_DIR}/src/ext/for/include_)
+INCLUDE_DIRECTORIES(${clucene_SOURCE_DIR}/src/ext/for)
 
 IF (BUILD_SHARED_LIBRARIES)
   add_library(clucene-core SHARED
diff --git a/src/ext/for/CMakeLists.txt b/src/ext/for/CMakeLists.txt
index e14dcd48..3b14781f 100644
--- a/src/ext/for/CMakeLists.txt
+++ b/src/ext/for/CMakeLists.txt
@@ -1,37 +1,28 @@
 cmake_minimum_required(VERSION 3.10)
 project(powturbo)
 
-#INCLUDE (DefineOptions)
-#DEFINE_OPTIONS(EXTRA_OPTIONS EXTRA_LIBS)
-
 set(CMAKE_CXX_STANDARD 11)
 set(CMAKE_C_STANDARD 99)
 
-# Compiler options
 if(NOT CMAKE_BUILD_TYPE)
     set(CMAKE_BUILD_TYPE Release)
 endif()
 
-#set(CMAKE_C_FLAGS_DEBUG "-DDEBUG -g")
-set(CMAKE_C_FLAGS "-DNDEBUG -s -O3")
+set(DEBUG "-DNDEBUG -g -O3")
 set(OPT "-w -Wall -fstrict-aliasing -falign-loops -Wno-int-conversion")
 
-set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OPT}")
-
-
-# Architecture-specific settings
 if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64")
     set(SSE "-march=corei7-avx -mtune=corei7-avx")
     set(AVX2 "-march=haswell")
+    set(CMAKE_C_FLAGS ${SSE})
 elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
     set(SSE "-march=armv8-a")
+    set(CMAKE_C_FLAGS "-march=armv8-a")
 elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc64le")
     set(SSE "-D__SSSE3__")
     set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mcpu=power9 -mtune=power9")
 endif()
 
-set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SSE}")
-
 if(FLOAT16)
     set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DUSE_FLOAT16")
 endif()
@@ -39,58 +30,62 @@ endif()
 if(STATIC)
     set(CMAKE_EXE_LINKER_FLAGS "-static")
 endif()
-
+set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${DEBUG} ${OPT}")
+       
+separate_arguments(avx2_c_flags_list UNIX_COMMAND "${DEBUG} ${OPT}")
 separate_arguments(c_flags_list UNIX_COMMAND "${CMAKE_C_FLAGS}")
-# Include directories
+
 include_directories(${CMAKE_CURRENT_SOURCE_DIR})
 
-# Define base source files
 set(SRC_FILES
-	${CMAKE_CURRENT_SOURCE_DIR}/bitutil.c
-	${CMAKE_CURRENT_SOURCE_DIR}/bitpack.c
-	${CMAKE_CURRENT_SOURCE_DIR}/bitunpack.c
-	${CMAKE_CURRENT_SOURCE_DIR}/vp4c.c
-	${CMAKE_CURRENT_SOURCE_DIR}/vp4d.c
-	${CMAKE_CURRENT_SOURCE_DIR}/transpose.c
+        ${CMAKE_CURRENT_SOURCE_DIR}/bitpack.c
+        ${CMAKE_CURRENT_SOURCE_DIR}/bitunpack.c
+        ${CMAKE_CURRENT_SOURCE_DIR}/vp4c.c
+        ${CMAKE_CURRENT_SOURCE_DIR}/vp4d.c
+        ${CMAKE_CURRENT_SOURCE_DIR}/transpose.c
+        ${CMAKE_CURRENT_SOURCE_DIR}/bitutil.c
 )
 
-# Add base source files to library
 add_library(ic STATIC
-    fp.c
     v8.c
     vint.c
     trlec.c
     trled.c
-    vsimple.c
-    eliasfano.c
+    vsimple.c 
+    bitutil.c
+    bitpack.c
+    bitunpack.c
+    vp4c.c
+    vp4d.c
+    transpose.c
 )
 
-# Add custom commands to generate SSE and AVX2 versions of source files
 foreach(SRC_FILE ${SRC_FILES})
     get_filename_component(SRC_NAME ${SRC_FILE} NAME_WE)
-    #set(SSE_OUTPUT ${SRC_NAME}_sse.o)
+    set(SSE_OUTPUT ${SRC_NAME}_sse.o)
     set(AVX2_OUTPUT ${SRC_NAME}_avx2.o)
-    set(OUTPUT ${SRC_NAME}.o)
 
     add_custom_command(
-        OUTPUT ${OUTPUT}
-	COMMAND ${CMAKE_C_COMPILER} -c -o ${OUTPUT} ${SRC_FILE} ${c_flags_list}
+        OUTPUT ${SSE_OUTPUT}
+        COMMAND ${CMAKE_C_COMPILER} -DSSE2_ON ${c_flags_list} -c -o ${SSE_OUTPUT} ${SRC_FILE}
         DEPENDS ${SRC_FILE}
     )
 
-    target_sources(ic PRIVATE ${OUTPUT})
     target_sources(ic PRIVATE ${SSE_OUTPUT})
 
     if(USE_AVX2)
         add_custom_command(
             OUTPUT ${AVX2_OUTPUT}
-	    COMMAND ${CMAKE_C_COMPILER} -c -o ${AVX2_OUTPUT} ${SRC_FILE} ${c_flags_list} ${AVX2}
+	    COMMAND ${CMAKE_C_COMPILER} ${AVX2} -DAVX2_ON ${avx2_c_flags_list} -c -o ${AVX2_OUTPUT} ${SRC_FILE}
             DEPENDS ${SRC_FILE}
         )
 
         target_sources(ic PRIVATE ${AVX2_OUTPUT})
     endif()
 endforeach()
+
+set(LIB_DESTINATION ../)
+
 install(TARGETS ic
       DESTINATION ${LIB_DESTINATION}
       COMPONENT ext)
diff --git a/src/ext/for/README.md b/src/ext/for/README.md
new file mode 100644
index 00000000..31e8f25e
--- /dev/null
+++ b/src/ext/for/README.md
@@ -0,0 +1,585 @@
+TurboPFor: Fastest Integer Compression 
+
+[//]: # ([![Build Status][travisBadge]][travisLink])
+[//]: # ([travisBadge]: https://api.travis-ci.com/powturbo/TurboPFor-Integer-Compression.svg?branch=master)
+[//]: # ([travisLink]: https://app.travis-ci.com/powturbo/TurboPFor-Integer-Compression)
+======================================
+* **TurboPFor: The synonym for "integer compression"**
+  * **ALL** functions available for **AMD/Intel**, **64 bits ARMv8 NEON** Linux+MacOS/M1 & **Power9 Altivec**
+  * 100% C (C++ headers), as simple as memcpy. OS:Linux amd64, arm64, Power9, MacOs (Amd/intel + Apple M1),
+  * :+1: **Java** Critical Natives/JNI. Access TurboPFor **incl. SIMD/AVX2!** from Java as fast as calling from C
+  * :sparkles: **FULL** range 8/16/32/64 bits scalar + 16/32/64 bits SIMD functions
+  * No other "Integer Compression" compress/decompress faster
+  * :sparkles: Direct Access, **integrated** (SIMD/AVX2) FOR/delta/Delta of Delta/Zigzag for sorted/unsorted arrays
+  * **16 bits** + **64 bits** SIMD integrated functions
+* **For/PFor/PForDelta**
+  * **Novel TurboPFor** (PFor/PForDelta) scheme w./ **direct access** + **SIMD/AVX2**. **+RLE**
+  * Outstanding compression/speed. More efficient than **ANY** other fast "integer compression" scheme.
+  * Compress 70 times faster and decompress up to 4 times faster than OptPFD
+* **Bit Packing**
+  * Fastest and most efficient **"SIMD Bit Packing"** **15 Billions integers/sec (60Gb/s!)**
+  * Scalar **"Bit Packing"** decoding nearly as fast as SIMD-Packing in realistic (No "pure cache") scenarios
+  * **Direct/Random Access** : Access any single bit packed entry with **zero decompression**
+* **Variable byte**
+  * Scalar **"Variable Byte"** faster and more efficient than **ANY** other implementation
+  * SIMD **TurboByte** fastest group varint (16+32 bits) incl. integrated delta,zigzag,...
+  * **TurboByte+TurboPackV** novel hybrid scheme combining the fastest SIMD codecs.
+* **Simple family**
+  * **Novel** **"Variable Simple"** (incl. **RLE**) faster and more efficient than simple16, simple-8b
+* **Elias Fano**
+  * Fastest **"Elias Fano"** implementation w/ or w/o SIMD/AVX2
++ **Transform**
+  * Scalar & SIMD Transform: Delta, Zigzag, Zigzag of delta, XOR, Transpose/Shuffle, 
+  * **lossy** floating point compression with *TurboPFor* or [TurboTranspose](https://github.com/powturbo/TurboTranspose)+lz77
+* **Floating Point Compression**
+  * Delta/Zigzag + improved gorilla style + (Differential) Finite Context Method FCM/DFCM floating point compression
+  * Using **TurboPFor**, unsurpassed compression and more than 5 GB/s throughput
+  * Point wise relative error bound **lossy** floating point compression
+  * **TurboFloat** novel efficient floating point compression using TurboPFor
+* **Time Series Compression**
+  * **Fastest Gorilla** 16/32/64 bits style compression (**zigzag of delta** + **RLE**).
+  * can compress time series to only 0.01%. Speed > 10 GB/s compression and > 13 GB/s decompression.
+* **Inverted Index ...do less, go fast!**
+  * Direct Access to compressed *frequency* and *position* data w/ zero decompression
+  * **Novel** **"Intersection w/ skip intervals"**, decompress the minimum necessary blocks (**~10-15%)!**. 
+  * **Novel** Implicit skips with zero extra overhead
+  * **Novel** Efficient **Bidirectional** Inverted Index Architecture (forward/backwards traversal) incl. "integer compression".
+  * more than **2000! queries per second** on GOV2 dataset (25 millions documents) on a **SINGLE** core
+  * :sparkles: Revolutionary Parallel Query Processing on Multicores **> 7000!!! queries/sec** on a simple quad core PC.<br>
+   **...forget** ~~Map Reduce, Hadoop, multi-node clusters,~~ ...
+
+![Promo video](turbopfor.jpg?raw=true)
+
+### Integer Compression Benchmark (single thread):
+- Download [IcApp](https://sites.google.com/site/powturbo/downloads) a new benchmark for TurboPFor<br>
+  for testing almost all integer and floating point file types.
+- Practical (No **PURE** cache) "integer compression" benchmark w/ **large** arrays.
+- [Benchmark Intel CPU: Skylake i7-6700 3.4GHz gcc 9.2](https://github.com/powturbo/TurboPFor/issues/47)
+- [Benchmark ARM: ARMv8 A73-ODROID-N2 1.8GHz](https://github.com/powturbo/TurboPFor/issues/49)
+
+##### - Synthetic data:
+ - Generate and test (zipfian) skewed distribution (100.000.000 integers, Block size=128/256)<br>
+   Note: Unlike general purpose compression, a small fixed size (ex. 128 integers) is in general used in "integer compression".
+   When large blocks are used, every block involved while processing queries (inverted index, search engines, databases, graphs, in memory computing,...) needs to be entirely decoded.
+
+        ./icbench -a1.5 -m0 -M255 -n100M ZIPF
+
+|C Size|ratio%|Bits/Integer|C MB/s|D MB/s|Name  2019.11|
+|--------:|-----:|--------:|----------:|----------:|--------------|
+|62,939,886| 15.7| 5.04|**2369**|**10950**|**TurboPFor256**|
+|63,392,759| 15.8| 5.07|1359|7803|**TurboPFor128**|
+|63,392,801| 15.8| 5.07|1328|924|**TurboPForDA**|
+|65,060,504| 16.3| 5.20|60|2748|[FP_SIMDOptPFor](#FastPFor)|
+|65,359,916|16.3| 5.23| 32|2436|PC_OptPFD|
+|73,477,088|18.4| 5.88|408|2484|PC_Simple16|
+|73,481,096| 18.4| 5.88|624|8748|[FP_SimdFastPFor](#FastPFor) 64Ki *|
+|76,345,136| 19.1| 6.11|1072|2878|**VSimple**|
+|91,947,533| 23.0| 7.36|284|11737|[QMX](#QMX) 64k *|
+|93,285,864| 23.3| 7.46|1568|10232|[FP_GroupSimple](#FastPFor) 64Ki *|
+|95,915,096|24.0| 7.67|  848|3832|Simple-8b|
+|99,910,930| 25.0| 7.99|**17298**|**12408**|**TurboByte+TurboPack**|
+|99,910,930| 25.0| 7.99|**17357**|**12363**|**TurboPackV** sse|
+|99,910,930| 25.0| 7.99|11694|10138|**TurboPack** scalar|
+|99,910,930| 25.0| 7.99|8420|8876|**TurboFor**|
+|100,332,929| 25.1| 8.03|17077|11170|**TurboPack256V** avx2|
+|101,015,650| 25.3| 8.08|11191|10333|**TurboVByte**|
+|102,074,663| 25.5| 8.17|6689|9524|[MaskedVByte](#MaskedVByte)|
+|102,074,663| 25.5| 8.17|2260|4208|[PC_Vbyte](#PolyCom)|
+|102,083,036| 25.5| 8.17|5200|4268|[FP_VByte](#FastPFor)|
+|112,500,000| 28.1| 9.00|1528|12140|[VarintG8IU](#VarintG8IU)|
+|125,000,000| 31.2|10.00|13039|12366|**TurboByte**|
+|125,000,000| 31.2|10.00|11197|11984|[StreamVbyte 2019](#StreamVByte)|
+|400,000,000|	100.00|	32.00| 8960|8948|Copy|
+|         |      |     |   N/A  | N/A   |EliasFano|
+
+(*) codecs inefficient for small block sizes are tested with 64Ki integers/block.
+
+- MB/s: 1.000.000 bytes/second. **1000 MB/s = 1 GB/s**<br> 
+- **#BOLD** = pareto frontier.<br>
+- FP=FastPFor SC:simdcomp PC:Polycom<br>
+- TurboPForDA,TurboForDA: Direct Access is normally used when accessing few individual values.<br>
+- Eliasfano can be directly used only for increasing sequences
+------------------------------------------------------------------------
+##### - Data files:
+ - gov2.sorted from [DocId data set](#DocId) Block size=128/Delta coding
+
+        ./icbench -fS -r gov2.sorted
+
+![Speed/Ratio](ext/gov2.png "Speed/Ratio: Decompression")
+
+|Size |Ratio %|Bits/Integer|C Time MB/s|D Time MB/s|Function 2019.11|
+|-----------:|------:|-----:|-------:|-------:|---------------------|
+| 3,321,663,893| 13.9| 4.44|**1320**|**6088**|**TurboPFor**| 
+| 3,339,730,557| 14.0| 4.47|  32| 2144|PC.OptPFD|
+| 3,350,717,959| 14.0| 4.48|**1536**|**7128**|**TurboPFor256**| 
+| 3,501,671,314| 14.6| 4.68|  56| 2840|**VSimple**|
+| 3,768,146,467| 15.8| 5.04|**3228**| 3652|**EliasFanoV**|
+| 3,822,161,885| 16.0| 5.11| 572| 2444|PC_Simple16|
+| 4,411,714,936| 18.4| 5.90|**9304**|**10444**|**TurboByte+TurboPack**|
+| 4,521,326,518| 18.9| 6.05| 836| 3296|Simple-8b|
+| 4,649,671,427| 19.4| 6.22|3084| 3848|**TurboVbyte**|
+| 4,955,740,045| 20.7| 6.63|7064|10268|**TurboPackV**|
+| 4,955,740,045| 20.7| 6.63|5724| 8020|**TurboPack**|
+| 5,205,324,760| 21.8| 6.96|6952| 9488|SC_SIMDPack128|
+| 5,393,769,503| 22.5| 7.21|**14466**|**11902**|**TurboPackV256**|
+| 6,221,886,390| 26.0| 8.32|6668| 6952|**TurboFor**|
+| 6,221,886,390| 26.0| 8.32|6644| 2260|**TurboForDA**|
+| 6,699,519,000| 28.0| 8.96|1888| 1980|FP_Vbyte|
+| 6,700,989,563| 28.0| 8.96|2740| 3384|MaskedVByte|
+| 7,622,896,878| 31.9|10.20| 836| 4792|VarintG8IU|
+| 8,060,125,035| 33.7|11.50|8456| 9476|Streamvbyte 2019|
+| 8,594,342,216| 35.9|11.50|5228| 6376|libfor|
+|23,918,861,764|100.0|32.00|5824| 5924|Copy|
+
+Block size: 64Ki = 256k bytes. Ki=1024 Integers
+
+|Size |Ratio %|Bits/Integer|C Time MB/s|D Time MB/s|Function |
+|----------:|-----:|----:|------:|------:|---------------------|
+| 3,164,940,562| 13.2|**4.23**|**1344**|**6004**|**TurboPFor 64Ki**|
+| 3,273,213,464| 13.7| 4.38|**1496**|**7008**|**TurboPFor256 64Ki**|
+| 3,965,982,954| 16.6| 5.30|**1520**| 2452|[lz4](#lz4)+DT 64Ki|
+| 4,234,154,427| 17.7| 5.66| 436| 5672|qmx 64Ki| 
+| 6,074,995,117| 25.4| 8.13| 1976| 2916|[blosc_lz4](#blosc) 64Ki| 
+| 8,773,150,644| 36.7|11.74| 2548|5204|blosc_lz 64Ki|
+
+"lz4+DT 64Ki" = Delta+Transpose from TurboPFor + lz4<br>
+"blosc_lz4" internal lz4 compressor+vectorized shuffle
+
+##### - Time Series:
+- Test file  [Timestamps: ts.txt(sorted)](https://github.com/zhenjl/encoding/tree/master/benchmark/data)
+
+        ./icapp -Ft ts.txt -I15 -J15
+
+|Function        |C MB/s|   size  |ratio%| D MB/s|Text
+|----------------|-----:|--------:|------:|------:|--------------------|
+|bvzenc32        |**10632**|45,909|0.008|**12823**|ZigZag|
+|bvzzenc32       |**8914**|56,713|0.010|**13499**|ZigZag Delta of delta|
+|vsenc32         |**12294**|140,400| 0.024 |12877 |Variable Simple|
+|p4nzenc256v32   | 1932| 596,018|  0.10 |13326 |TurboPFor256 ZigZag|
+|p4ndenc256v32   | 1961| 596,018|  0.10 |13339 |TurboPFor256 Delta| 
+|bitndpack256v32 |**12564**|909,189|  0.16 |13505 |TurboPackV256 Delta|
+|p4nzenc32       | 1810|  1,159,633|  0.20 | 8502 |TurboPFor ZigZag|
+|p4nzenc128v32   | 1795|  1,159,633|  0.20 |13338 |TurboPFor ZigZag| 
+|bitnzpack256v32 | 9651|  1,254,757|  0.22 |**13503**|TurboPackV256 ZigZag| 
+|bitnzpack128v32 |10155|  1,472,804|  0.26 |13380 |TurboPackV ZigZag| 
+|vbddenc32       | 6198| 18,057,296|  3.13 |10982 |TurboVByte Delta of delta|
+|memcpy          |13397|577,141,992|100.00||
+
+##### - Transpose/Shuffle (no compression)
+        ./icbench -eTRANSFORM ZIPF
+
+|Size |C Time MB/s|D Time MB/s|Function|
+|----------:|------:|------:|-----------------------------------|
+|100,000,000|**9400**|**9132**|**TPbyte 4** TurboPFor Byte Transpose/shuffle AVX2|
+|100,000,000|8784|8860|**TPbyte 4** TurboPFor Byte Transpose/shuffle SSE|
+|100,000,000|7688|7656|Blosc_Shuffle AVX2|
+|100,000,000|**5204**|**7460**|**TPnibble 4** TurboPFor Nibble Transpose/shuffle SSE|
+|100,000,000|6620|6284|Blosc shuffle SSE|
+|100,000,000|3156|3372|Bitshuffle AVX2|
+|100,000,000|2100|2176|Bitshuffle SSE|
+
+##### - (Lossy) Floating point compression: 
+        ./icapp -Fd file          " 64 bits floating point raw file 
+        ./icapp -Ff file          " 32 bits floating point raw file 
+        ./icapp -Fcf file         " text file with multiple entries (ex.  8.657,56.8,4.5 ...)
+        ./icapp -Ftf file         " text file (1 entry per line)
+        ./icapp -Ftf file -v5     " + display the first entries read
+        ./icapp -Ftf file.csv -K3 " but 3rd column in a csv file (ex. number,Text,456.5 -> 456.5)
+        ./icapp -Ftf file -g.001  " lossy compression with allowed pointwise relative error 0.001
+
+- see also [TurboTranspose](https://github.com/powturbo/TurboTranspose)
+
+##### - Compressed Inverted Index Intersections with GOV2<br />
+   GOV2: 426GB, 25 Millions documents, average doc. size=18k.
+
+   + Aol query log: 18.000 queries<br />
+     **~1300** queries per second (single core)<br />
+     **~5000** queries per second (quad core)<br />
+     Ratio = 14.37% Decoded/Total Integers.
+
+   + TREC Million Query Track (1MQT):<br />
+     **~1100** queries per second (Single core)<br /> 
+     **~4500** queries per second (Quad core CPU)<br />
+     Ratio = 11.59% Decoded/Total Integers.
+
+- Benchmarking intersections (Single core, AOL query log)
+
+| max.docid/q|Time s| q/s | ms/q | % docid found|
+|-----------------:|---:|----:|-----:|-------:|
+|1.000|7.88|2283.1|0.438|81|
+|10.000|10.54|1708.5|0.585|84|
+| ALL |13.96|1289.0|0.776|100|
+q/s: queries/second, ms/q:milliseconds/query
+
+- Benchmarking Parallel Query Processing (Quad core, AOL query log)
+
+| max.docid/q|Time s| q/s | ms/q | % docids found|
+|-----------------:|----:|----:|-----:|-------:|
+|1.000|2.66|6772.6|0.148|81|
+|10.000|3.39|5307.5|0.188|84|
+|ALL|3.57|5036.5|0.199|100|
+
+###### Notes:
+- Search engines are spending 90% of the time in intersections when processing queries. 
+- Most search engines are using pruning strategies, caching popular queries,... to reduce the time for intersections and query processing.
+- As an indication, Google is processing [40.000 queries per second](http://www.internetlivestats.com/google-search-statistics/),
+using [900.000 multicore servers](https://www.cloudyn.com/blog/10-facts-didnt-know-server-farms/) for searching [8 billions web pages](http://searchenginewatch.com/sew/study/2063479/coincidentally-googles-index-size-jumps) (320 X size of GOV2).
+- Recent "integer compression" GOV2 experiments (best paper at ECIR 2014) [On Inverted Index Compression for Search Engine Efficiency](http://www.dcs.gla.ac.uk/~craigm/publications/catena14compression.pdf) using 8-core Xeon PC are reporting 1.2 seconds per query (for 1.000 Top-k docids).
+
+### Compile:
+        Download or clone TurboPFor
+		git clone git://github.com/powturbo/TurboPFor.git
+		cd TurboPFor
+		make
+        
+
+        To benchmark external libraries + lz77 compression:
+		git clone --recursive git://github.com/powturbo/TurboPFor.git
+		cd TurboPFor
+        make CODEC1=1 CODEC2=1 LZ=1
+
+###### Windows visual c++
+		nmake /f makefile.vs
+
+###### Windows visual studio c++
+        project files under vs/vs2017
+  
+### Testing:
+##### - Synthetic data (use ZIPF parameter):
+  + benchmark groups of "integer compression" functions <br />
+
+        ./icbench -eBENCH -a1.2 -m0 -M255 -n100M ZIPF
+        ./icbench -eBITPACK/VBYTE -a1.2 -m0 -M255 -n100M ZIPF
+
+   >*Type "icbench -l1" for a list*
+
+   >*-zipfian distribution alpha = 1.2 (Ex. -a1.0=uniform -a1.5=skewed distribution)<br />
+     -number of integers = 100.000.000<br />
+     -integer range from 0 to 255<br />*
+  
+  + Unsorted lists: individual function test (ex. Copy TurboPack TurboPFor)<br />
+
+        ./icbench -a1.5 -m0 -M255 -ecopy/turbopack/turbopfor/turbopack256v ZIPF
+
+  + Unsorted lists: Zigzag encoding w/ option **-fz** or FOR encoding<br />
+
+        ./icbench -fz -eturbovbyte/turbopfor/turbopackv ZIPF
+        ./icbench -eturboforv ZIPF
+
+  + Sorted lists: differential coding w/ option **-fs** (increasing) or **-fS** (strictly increasing)<br />
+
+        ./icbench -fs -eturbopack/turbopfor/turbopfor256v ZIPF
+
+  + Generate interactive "file.html" plot for browsing
+  
+        ./icbench -p2 -S2 -Q3 file.tbb
+		
+  + Unit test: test function from bit size 0 to 32
+  
+        ./icbench -m0 -M32 -eturbpfor -fu 
+        ./icbench -m0 -M8 -eturbopack -fs -n1M 
+
+##### - Data files:
+  - Raw 32 bits binary data file [Test data](https://github.com/ot/partitioned_elias_fano/tree/master/test/test_data)
+
+        ./icbench file
+        ./icapp file           
+        ./icapp -Fs file         "16 bits raw binary file
+        ./icapp -Fu file         "32 bits raw binary file
+        ./icapp -Fl file         "64 bits raw binary file
+        ./icapp -Ff file         "32 bits raw floating point binary file
+        ./icapp -Fd file         "64 bits raw floating point binary file
+
+  - Text file: 1 entry per line. [Test data: ts.txt(sorted) and lat.txt(unsorted)](https://github.com/zhenjl/encoding/tree/master/benchmark/data)
+
+        ./icbench -eBENCH -fts ts.txt
+        ./icbench -eBENCH -ft  lat.txt
+
+        ./icapp -Fts data.txt            "text file, one 16 bits integer per line
+        ./icapp -Ftu ts.txt              "text file, one 32 bits integer per line
+        ./icapp -Ftl ts.txt              "text file, one 64 bits integer per line
+        ./icapp -Ftf file                "text file, one 32 bits floating point (ex. 8.32456) per line
+        ./icapp -Ftd file                "text file, one 64 bits floating point (ex. 8.324567789) per line
+        ./icapp -Ftd file -v5            "like prev., display the first 100 values read
+        ./icapp -Ftd file -v5 -g.00001   "like prev., error bound lossy floating point compression
+        ./icapp -Ftt file                "text file, timestamp in seconds iso-8601 -> 32 bits integer (ex. 2018-03-12T04:31:06)
+        ./icapp -FtT file                "text file, timestamp in milliseconds iso-8601 -> 64 bits integer (ex. 2018-03-12T04:31:06.345)
+        ./icapp -Ftl -D2 -H file         "skip 1st line, convert numbers with 2 decimal digits to 64 bits integers (ex. 456.23 -> 45623)
+        ./icapp -Ftl -D2 -H -K3 file.csv  "like prev., use the 3rd number in the line (ex. label=3245, text=99 usage=456.23 -> 456.23 )
+        ./icapp -Ftl -D2 -H -K3 -k| file.csv "like prev., use '|' as separator
+
+  - Text file: multiple numbers separated by non-digits (0..9,-,.) characters (ex. 134534,-45678,98788,4345, )
+
+        ./icapp -Fc data.txt         "text file, 32 bits integers (ex. 56789,3245,23,678 ) 
+        ./icapp -Fcd data.txt        "text file, 64 bits floating-point numbers (ex. 34.7689,5.20,45.789 )
+
+  - Multiblocks of 32 bits binary file. (Example gov2 from [DocId data set](#DocId))<br />
+    Block format: [n1: #of Ids][Id1] [Id2]...[IdN] [n2: #of Ids][Id1][Id2]...[IdN]...
+
+        ./icbench -fS -r gov2.sorted
+
+
+##### - Intersections:
+  1 - Download Gov2 (or ClueWeb09) + query files (Ex. "1mq.txt") from [DocId data set](#DocId)<br />
+   8GB RAM required (16GB recommended for benchmarking "clueweb09" files).
+
+  2 - Create index file
+
+
+        ./idxcr gov2.sorted .
+
+
+   >*create inverted index file "gov2.sorted.i" in the current directory*
+
+  3 - Test intersections
+
+
+        ./idxqry gov2.sorted.i 1mq.txt
+
+
+  >*run queries in file "1mq.txt" over the index of gov2 file*
+
+##### - Parallel Query Processing:
+  1 - Create partitions
+
+  
+        ./idxseg gov2.sorted . -26m -s8
+
+  
+ >*create 8 (CPU hardware threads) partitions for a total of ~26 millions document ids*
+  
+  2 - Create index file for each partition
+
+
+      ./idxcr gov2.sorted.s*
+
+
+  >*create inverted index file for all partitions "gov2.sorted.s00 - gov2.sorted.s07" in the current directory*
+
+  3 - Intersections:
+  
+  delete "idxqry.o" file and then type "make para" to compile "idxqry" w. multithreading
+
+
+      ./idxqry gov2.sorted.s*.i 1mq.txt
+
+  >*run queries in file "1mq.txt" over the index of all gov2 partitions "gov2.sorted.s00.i - gov2.sorted.s07.i".*
+
+### Function usage:
+See benchmark "icbench" program for "integer compression" usage examples.
+In general encoding/decoding functions are of the form:
+
+  >**char *endptr = encode( unsigned *in, unsigned n, char *out, [unsigned start], [int b])**<br />
+  endptr : set by encode to the next character in "out" after the encoded buffer<br />
+  in     : input integer array<br />
+  n      : number of elements<br />
+  out    : pointer to output buffer<br />
+  b      : number of bits. Only for bit packing functions<br />
+  start  : previous value. Only for integrated delta encoding functions
+
+   
+  >**char *endptr = decode( char *in, unsigned n, unsigned *out, [unsigned start], [int b])**<br />
+  endptr : set by decode to the next character in "in" after the decoded buffer<br />
+  in     : pointer to input buffer<br />
+  n      : number of elements<br />
+  out    : output integer array<br />
+  b      : number of bits. Only for bit unpacking functions<br />
+  start  : previous value. Only for integrated delta decoding functions
+
+  **Simple high level functions:** 
+  >**size_t compressed_size = encode( unsigned *in, size_t n, char *out)**<br />
+  compressed_size : number of bytes written into compressed output buffer out<br />
+   
+  >**size_t compressed_size = decode( char *in, size_t n, unsigned *out)**<br />
+  compressed_size : number of bytes read from compressed input buffer in<br />
+
+### Function syntax:
+ - {vb | p4 | bit | vs}[n][d | d1 | f | fm | z ]{enc/dec | pack/unpack}[| 128V | 256V][8 | 16 | 32 | 64]:<br />
+   vb:  variable byte<br />
+   p4:  turbopfor<br />
+   vs:  variable simple<br />
+   bit: bit packing<br />
+   n :  high level array functions for large arrays.
+	
+   ''  : encoding for unsorted integer lists<br />
+   'd' : delta encoding for increasing integer lists (sorted w/ duplicate)<br />
+   'd1': delta encoding for strictly increasing integer lists (sorted unique)<br />
+   'f' : FOR encoding for sorted integer lists<br />
+   'z' :  ZigZag encoding for unsorted integer lists<br />
+   
+   'enc' or 'pack'  : encode or bitpack<br />
+   'dec' or 'unpack': decode or bitunpack<br />
+   'NN'             : integer size (8/16/32/64)<br />
+   
+header files to use with documentation:<br />
+
+| c/c++ header file|Integer Compression functions| examples |
+|------------|-----------------------------|-----------------|
+|vint.h|variable byte| vbenc32/vbdec32 vbdenc32/vbddec32 vbzenc32/vbzdec32 |
+|vsimple.h|variable simple| vsenc64/vsdec64 |
+|vp4.h|TurboPFor|  p4enc32/p4dec32 p4denc32/p4ddec32 p4zenc32/p4zdec32 |
+|bitpack.h|Bit Packing, For, +Direct Access| bitpack256v32/bitunpack256v32 bitforenc64/bitfordec64|
+|eliasfano.h|Elias Fano| efanoenc256v32/efanodec256v32 |
+
+Note: Some low level functions (like p4enc32) are limited to 128/256 (SSE/AVX2) integers per call.
+
+### Environment:
+###### OS/Compiler (64 bits):
+- Windows: MinGW-w64 makefile
+- Windows: Visual c++ (>=VS2008) - makefile.vs (for nmake)
+- Windows: Visual Studio project file - vs/vs2017 - Thanks to [PavelP](https://github.com/pps83)
+- Linux amd64: GNU GCC (>=4.6)
+- Linux amd64: Clang (>=3.2) 
+- Linux arm64: 64 bits aarch64 ARMv8:  gcc (>=6.3)
+- Linux arm64: 64 bits aarch64 ARMv8:  clang
+- macOS: XCode (>=9)
+- macOS: Apple M1 (Clang)
+- PowerPC ppc64le (incl. SIMD): gcc (>=8.0)
+
+###### Multithreading:
+- All TurboPFor integer compression functions are thread safe
+
+### References:
+
+*   [TurboPFor: an analysis](https://michael.stapelberg.ch/posts/2019-02-05-turbopfor-analysis/)
+
+* **Applications:**
+  * [Debian Code Search](https://github.com/Debian/dcs/)<br />
+    [Debian Code Search: positional index, TurboPFor-compressed](https://michael.stapelberg.ch/posts/2019-09-29-dcs-positional-turbopfor-index/)
+  * [Graph500](https://github.com/julianromera/graph500)
+  * [Small Polygon Compression](https://arxiv.org/abs/1509.05505) + [Poster](http://abhinavjauhri.me/publications/dcc_poster_2016.pdf) + [code](https://github.com/ajauhri/bignum_compression)
+  * [Parallel Graph Analysis (Lecture 18)](http://www.cs.rpi.edu/~slotag/classes/FA16/) + [code](http://www.cs.rpi.edu/~slotag/classes/FA16/handson/lec18-comp2.cpp)
+
+* **Benchmark references:**
+  * <a name="FastPFor"></a>[FastPFor](https://github.com/lemire/FastPFor) + [Simdcomp](https://github.com/lemire/simdcomp): SIMDPack FPF, Vbyte FPF, VarintG8IU, StreamVbyte, GroupSimple
+  * <a name="OptPFD"></a><a name="Simple16"></a>[Optimized Pfor-delta compression code](http://jinruhe.com): OptPFD/OptP4, Simple16 (limited to 28 bits integers)
+  * <a name="MaskedVByte"></a>[MaskedVByte](http://maskedvbyte.org/). See also: [Vectorized VByte Decoding](http://engineering.indeed.com/blog/2015/03/vectorized-vbyte-decoding-high-performance-vector-instructions/)
+  * <a name="Streamvbyte"></a>[Streamvbyte](https://github.com/lemire/streamvbyte). 
+  * <a name="Simple-8b"></a>[Index Compression Using 64-Bit Words](http://people.eng.unimelb.edu.au/ammoffat/abstracts/am10spe.html): Simple-8b (speed optimized version tested)
+  * <a name="libfor"></a>[libfor](https://github.com/cruppstahl/for)
+  * <a name="QMX"></a>[Compression, SIMD, and Postings Lists](http://www.cs.otago.ac.nz/homepages/andrew/papers/) QMX integer compression from the "simple family"
+  * <a name="lz4"></a>[lz4](https://github.com/Cyan4973/lz4). included w. block size 64K as indication. Tested after preprocessing w. delta+transpose
+  * <a name="blosc"></a>[blosc](https://github.com/Blosc/c-blosc). blosc is like transpose/shuffle+lz77. Tested blosc+lz4 and blosclz incl. vectorized shuffle.<br>
+  * <a name="DocId"></a>[Document identifier data set](http://lemire.me/data/integercompression2014.html)
+
+* **Integer compression publications:**
+  * :green_book:[Evaluating Lightweight Integer Compression Algorithms in Column-Oriented In-Memory DBMS](http://www.adms-conf.org/2021-camera-ready/heinzl_adms21.pdf)
+  * :green_book:[In Vacuo and In Situ Evaluation of SIMD Codecs (TurboPackV,TurboPFor/QMX)](http://dl.acm.org/citation.cfm?id=3015023) + [paper](http://www.cs.otago.ac.nz/homepages/andrew/papers/)
+  * :green_book:[SIMD Compression and the Intersection of Sorted Integers](http://arxiv.org/abs/1401.6399)
+  * :green_book:[Partitioned Elias-Fano Indexes](http://www.di.unipi.it/~ottavian/files/elias_fano_sigir14.pdf)
+  * :green_book:[On Inverted Index Compression for Search Engine Efficiency](http://www.dcs.gla.ac.uk/~craigm/publications/catena14compression.pdf)
+  * :green_book:[Google's Group Varint Encoding](http://static.googleusercontent.com/media/research.google.com/de//people/jeff/WSDM09-keynote.pdf)
+  * :green_book:[Integer Compression tweets](https://twitter.com/search?q=%23integercompression&src=typd)
+  * :green_book:[Efficient Compression of Scientific Floating-Point Data and An Application in Structural Analysis](https://www.jstage.jst.go.jp/article/jsces/2017/0/2017_20170002/_article)
+  * :green_book:[SPDP is a compression/decompression algorithm for binary IEEE 754 32/64 bits floating-point data](http://cs.txstate.edu/~burtscher/research/SPDPcompressor/)<br />
+    :green_book:[ SPDP - An Automatically Synthesized Lossless Compression Algorithm for Floating-Point Data](http://cs.txstate.edu/~mb92/papers/dcc18.pdf) + [DCC 2018](http://www.cs.brandeis.edu//~dcc/Programs/Program2018.pdf)
+
+Last update:  13 Nov 2021
+
+## APPENDIX: icbench Integer Compression Benchmark
+
+##### TurboPFor + external libraries
+<pre>
+TurboPFor               	https://github.com/powturbo/TurboPFor
+FastPFor (FP)              	https://github.com/lemire/FastPFor
+lz4				https://github.com/Cyan4973/lz4
+LittleIntPacker (LI)       	https://github.com/lemire/LittleIntPacker
+MaskedVbyte             	http://maskedvbyte.org
+Polycom (PC)               	https://github.com/encode84/bcm
+simdcomp (SC)              	https://github.com/lemire/simdcomp
+Simple-8b optimized     	https://github.com/powturbo/TurboPFor
+Streamvbyte             	https://github.com/lemire/streamvbyte
+VarintG8IU              	https://github.com/lemire/FastPFor
+</pre>
+
+##### Functions integrated into 'icbench' for benchmarking
+<pre>
+Codec group:
+TURBOPFOR        TurboPFor library TurboPFor256V/TurboPack256V/TurboPFor256N/TurboPFor/TurboPackV/TurboVByte/TurboPack/TurboForDA/EliasFano/VSimple/TurboPForN/TurboPackN/TurboPForDI
+DEFAULT          Default TurboPFor/TurboPackV/TurboVByte/TurboPack/TurboFor/TurboPForN/TurboPackN/TurboPForDI/TurboPFor256V/TurboPack256V/TurboPFor256N
+BENCH            Benchmark TurboPFor/TurboPackV/TurboVByte/TurboPack/QMX/FP.SimdFastPfor/FP.SimdOptPFor/MaskedVbyte/StreamVbyte
+EFFICIENT        Efficient TurboPFor/vsimple/turbovbyte
+TRANSFORM        transpose/shufle,delta,zigzag tpbyte4s/tpbyte,4/tpnibble,4/ZigZag_32/Delta_32/BitShuffle,4
+BITPACK          Bit Packing TurboPack256V/TurboPackV/TurboPackH/TurboPack/SC.SimdPack128/SC.SimdPack256
+VBYTE            Variable byte TurboVByte/FP.VByte/PC.Vbyte/VarintG8IU/MaskedVbyte/StreamVbyte
+SIMPLE           Simple Family simple8b/simple16/vsimple/qmx
+LZ4              lz4+bitshufle/transpose 4,8 lz4_bitshufle/lz4_tp4/lz4_tp8
+LI               Little Integer LI_Pack/LI_TurboPack/LI_SuperPack/LI_HorPack
+
+
+Function         Description                                      level
+
+--------         -----------                                      -----
+TurboPFor        PFor (SSE2)
+TurboPForN       PFor (SSE2) large blocks
+TurboPFor256     PFor (AVX2)
+TurboPFor256N    PFor (AVX2) large blocks
+TurboPForDA      PFor direct access
+TurboPForDI      PFord min
+TurboPForZZ      PFor zigzag of delta
+TurboFor         FOR
+TurboForV        FOR (SIMD)
+TurboFor256V     FOR (AVX2)
+TurboForDA       FOR direct access
+TurboPackDA      Bit packing direct access
+TurboPack        Bit packing (scalar)
+TurboPackN       Bit packing (scalar) large blocks
+TurboPackV       Bit packing (SSE2 Vertical)
+TurboPackH       Bit packing (SSE2 Horizontal)
+TurboPackVN      Bit packing (SSE2 large block)
+TurboPack256V    Bit packing (AVX2 Vertical)
+TurboPack256N    Bit packing (AVX2 large block)
+TurboVByte       Variable byte (scalar)
+VSimple          Variable simple (scalar)
+EliasFano        Elias fano (scalar)
+EliasFanoV       Eliasfano  (SSE2)
+EliasFano256V    Elias fano (AVX2)
+memcpy           memcpy
+copy             Integer copy
+tpbyte4s         Byte Transpose (scalar)
+tpbyte           Byte transpose (simd)  2,4,8
+tpnibble         Nibble transpose (simd)  2,4,8
+ZigZag32         ZigZag encoding (sse2)
+Delta32          Delta encoding (sse2)
+DDelta32         Delta of delta encoding (sse2)
+Xor32            Xor encoding (sse2)
+FP_PREV64        Floating point PFOR
+FP_FCM64         Floating point PFOR (FCM)
+FP_DFCM64        Floating point PFOR (DFCM)
+TurboPFor64      PFOR 64
+TurboPFor64V     PFOR 64
+Simple8b         64 bits Simple family (instable)
+PC_Simple16      Simple 16. limited to 28 bits
+PC_OptPFD        OptPFD. limited to 28 bits
+PC_Vbyte         Variable byte
+PC_Rice          Rice coding (instable)
+VarintG8IU       Variable byte SIMD
+MaskedVbyte      Variable byte SIMD
+StreamVbyte      Variable byte SIMD
+FP_FastPFor      PFor scalar (inefficient for small blocks)
+FP_SimdFastPFor  PFor SIMD (inefficient for small blocks)
+FP_OptPFor       OptPFor scalar 
+FP_SIMDOptPFor   OptPFor SIMD
+FP_VByte         Variable byte
+FP_Simple8bRLE   Simple-8b + rle
+FP_GROUPSIMPLE   Group Simple
+SC_SIMDPack128   Bit packing (SSE4.1)
+SC_SIMDPack256   Bit packing (SSE4.1)
+SC_For           For (SSE4.1)
+SC_ForDA         For direct access (SSE4.1)
+LibFor_For       For
+LibFor_ForDA     For direct access
+LI_Pack          Bit packing (scalar)
+LI_TurboPack     Bit packing (scalar)
+LI_SuperPack     Bit packing (scalar)
+LI_HorPack       Bit packing (sse4.1 horizontal) 
+LI_BMIPack256    Bit packing (avx2)
+lz4              lz4
+lz4_bit          Bitshuffle + [delta]+lz4 2,4,8
+lz4_nibble       TurboPFor's [delta]+nibble transpose + lz4 2,4,8
+lz4_bitxor       Bitshuffle + [xor]+lz4 2,4,8
+lz4_nibblexor    TurboPFor's [xor]+nibble transpose + lz4 2,4,8
+lz4_byte         TurboPFor's [delta]+byte transpose + lz4 2,4,8
+BitShuffle       Bit shuffle (simd) 2,4,8
+</pre>
+
diff --git a/src/ext/for/bic.c b/src/ext/for/bic.c
deleted file mode 100644
index 16c532b8..00000000
--- a/src/ext/for/bic.c
+++ /dev/null
@@ -1,201 +0,0 @@
-/**
-    Copyright (C) powturbo 2019-2023
-    SPDX-License-Identifier: GPL v2 License
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    - email    : powturbo [AT] gmail.com
-    - github   : https://github.com/powturbo
-    - homepage : https://sites.google.com/site/powturbo/
-    - twitter  : https://twitter.com/powturbo
-**/
-// Binary Interpolative Coding
-// Reference: "On Implementing the Binary Interpolative Coding Algorithm" GIULIO ERMANNO PIBIRI, ISTI-CNS http://pages.di.unipi.it/pibiri/papers/BIC.pdf
-//            "Techniques for Inverted Index Compression" GIULIO ERMANNO PIBIRI, ROSSANO VENTURINI, University of Pisa https://arxiv.org/abs/1908.10598
-
-#ifndef USIZE //---------- implementation --------------------------------------------------------------------------------------------------------------------------------------
-#include "include_/conf.h"
-#include "include_/bic.h"
-
-#include "include_/bitutil_.h"
-
-static ALWAYS_INLINE unsigned pow2next(unsigned x) { return x<2?1:(1ull << (__bsr32((x)-1)+1)); }
-
-size_t bicbound16(size_t n) { return n*2+4; }
-size_t bicbound32(size_t n) { return n*4+4; }
-//-- Simple binary ----------------------------------------------------------------------
-#define bicput(bw,br, _u_, _x_, _usize_) bitput(  bw,br, T2(__bsr,_usize_)(_u_) + 1, _x_)   /*AS(_u_ > 0, "Fatal bicput"); AS(_x_ <= _u_, "Fatal bicput2");*/
-#define bicget(bw,br, _u_, _x_, _usize_) bitget57(bw,br, T2(__bsr,_usize_)(_u_) + 1, _x_)
-
-//------------------------------------------
-#define BICENC_ bicbenc_
-#define BICDEC_ bicbdec_
-#define BICENC  bicbenc
-#define BICDEC  bicbdec
-
-//---- 16 bits ----------
-#define USIZE 16
-#define uint_t  uint16_t
-#include "bic.c"
-
-//---- 32 bits ----------
-#define USIZE 32
-#define uint_t  uint32_t
-#include "bic.c"
-#undef bicput
-#undef bicget
-#undef BICENC_
-#undef BICDEC_
-#undef BICENC
-#undef BICDEC
-
-// -- Leftmost minimal ---------------------------------------------------------------------
-#define bicput(bw,br, _u_, _x_, _usize_) { \
-  unsigned _x = _x_, _u = _u_, _b = T2(__bsr,_usize_)(_u), hi = (1ull << (_b + 1)) - _u - 1;\
-  if(_x < hi)      bitput(bw,br, _b,   _x);\
-  else { _x += hi; bitput(bw,br, _b+1, (_x&1)<<_b | _x >> 1); }\
-}
-
-#define bicget(bw,br, _u_, _x_, _usize_) {\
-  unsigned _u = _u_;\
-  unsigned _b  = T2(__bsr,_usize_)(_u);\
-  uint_t   _hi = (1ull << (_b + 1)) - _u - 1;\
-  if((_x_ = bitpeek57(bw,br,_b)) < _hi) bitrmv(bw,br,_b);\
-  else { \
-    unsigned _y = (bitbw(bw,br)>>_b)&1;\
-    bitrmv(bw,br,_b+1);\
-	_x_= (_x_<<1) + _y - _hi;\
-  }\
-}
-
-//--------------------------------------------
-#define BICENC_ bicenc_
-#define BICDEC_ bicdec_
-#define BICENC  bicenc
-#define BICDEC  bicdec
-
-//---- 16 bits ----------
-#define USIZE 16
-#define uint_t  uint16_t
-#include "bic.c"
-
-//---- 32 bits ----------
-#define USIZE 32
-#define uint_t  uint32_t
-#include "bic.c"
-#undef bicput
-#undef bicget
-#undef BICENC_
-#undef BICDEC_
-#undef BICENC
-#undef BICDEC
-
-//-- Center Minimal -----------------------------------------------------
-#define bicput(bw,br, _u_, _x_, _usize_) { \
-  unsigned _x = _x_, _u = _u_, _b = T2(__bsr,_usize_)(_u); \
-  uint64_t _c = (1ull << (_b + 1)) - _u - 1; \
-  unsigned _c2 = _c >> 1, _r2 = _u >> 1, _lo = _r2-_c2, _hi = _r2+_c2+1;\
-  if(!(_u & 1)) _lo -= 1; \
-  _b += (_x <= _lo || _x >= _hi);\
-  bitput(bw,br, _b, _x);\
-}
-
-#define bicget(bw,br, _u_, _x_, _usize_) { \
-  unsigned _u = _u_, _b = T2(__bsr,_usize_)(_u);\
-  uint64_t  _c = (1ull << (_b + 1)) - _u - 1;\
-  unsigned _c2 = _c>>1, _r2 = _u>>1, _lo = _r2 - _c2;\
-  _lo -= ((_u & 1) == 0);\
-  if((_x_ = bitpeek57(bw,br,_b)) > _lo) bitrmv(bw,br,_b);\
-  else bitget57(bw,br, _b+1, _x_);\
-}
-
-//--------------------------------------------
-#define BICENC_ bicmenc_
-#define BICDEC_ bicmdec_
-#define BICENC  bicmenc
-#define BICDEC  bicmdec
-
-//---- 16 bits ----------
-#define USIZE 16
-#define uint_t  uint16_t
-#include "bic.c"
-
-//---- 32 bits ----------
-#define USIZE 32
-#define uint_t  uint32_t
-#include "bic.c"
-
-
-#else //-------------------- Template functions ----------------------------------------------------------------------------------------------------------
-
-static void T2(BICENC_,USIZE)(uint_t *in, unsigned n, unsigned char **_op, unsigned lo, unsigned hi, unsigned h, uint64_t *bw, unsigned *br) {
-  while(n)
-    if(hi - lo + 1 != n) { 												//AC(lo <= hi,"bicenc fatal lo=%d>hi=%d n=%d\n", lo, hi, n); AS(hi - lo >= n - 1, "bicenc_32 fatal hi-lo>n-1\n");
-      unsigned x = in[h];
-	  bicput(*bw, *br, hi-n-lo+1, x-lo-h, USIZE);  bitenorm(*bw,*br,*_op);
-      T2(BICENC_,USIZE)( in, h, _op, lo, x-1, h>>1, bw,br);
-      in += h+1; n -= h+1; lo = x+1; h = n >> 1;
-	} else break;
-}
-
-#define RE(a) //a  // recursion : RE(a) a
-#define RD(a) a    // recursion : RD(a)
-static void T2(BICDEC_,USIZE)(unsigned char **_ip, unsigned n, uint_t *out, unsigned lo, unsigned hi, unsigned h, uint64_t *bw, unsigned *br) {
-  RE(if(!n) return);
-  RD(do) {
-    if(likely(hi - lo + 1 != n)) {						    //AS(lo <= hi, "bicdec fatal");
-      unsigned x;
-	  bitdnorm(*bw,*br,*_ip); bicget(*bw,*br, hi-lo+1-n, x, USIZE);
-      out[h] = (x += lo + h);
-      if(n != 1) {
-           T2(BICDEC_,USIZE)(_ip,   h,   out,         lo,  x-1,       h>>1, bw,br);
-        RE(T2(BICDEC_,USIZE)(_ip,n- h-1, out+ h+1,    x+1, hi,  (n-h-1)>>1, bw,br));
-		RD(                      n-=h+1; out+=h+1; lo=x+1;        h = n>>1);
-	  } RD(else break);
-    } else {
-	  BITFORSET_(out, n, lo, 1); 					//for(unsigned i = 0; i != n; ++i) out[i] = lo+i; //
-	  RD(break);
-    }
-  } RD(while(n));
-}
-
-unsigned T2(BICENC,USIZE)(uint_t *in, unsigned n, unsigned char *out) {
-  if(!n) return 0; 						//for(unsigned i = 1; i < n; i++) { AC(in[i]>in[i-1], "bicenc32: Not sorted at=%u,count=%d\n", i, n);  } //printf("n=%u ", n);printf("%u,", in[i]);
-  bitdef(bw,br);
-  unsigned char *op = out;
-  unsigned      x = in[n-1];
-
-  ctou32(op) = x; op += 4;
-  T2(BICENC_,USIZE)(in, n-1, &op, 0, x, pow2next(n)>>1, &bw,&br);
-  bitflush(bw,br,op);
-  return op - out;
-}
-
-unsigned T2(BICDEC,USIZE)(unsigned char *in, unsigned n, uint_t *out) {
-  if(!n) return 0;
-  bitdef(bw,br);
-  unsigned char *ip = in;
-  unsigned      x = ctou32(ip);
-
-  ip       += 4;
-  out[n-1]  = x;
-  T2(BICDEC_,USIZE)(&ip, n-1, out, 0, x, pow2next(n)>>1, &bw,&br);
-  bitalign(bw,br,ip);
-  return ip - in;
-}
-
-#undef USIZE
-#undef uint_t
-#endif
diff --git a/src/ext/for/bitpack.c b/src/ext/for/bitpack.c
index d894b0df..666b1030 100644
--- a/src/ext/for/bitpack.c
+++ b/src/ext/for/bitpack.c
@@ -1,6 +1,6 @@
 /**
-    Copyright (C) powturbo 2013-2023
-    SPDX-License-Identifier: GPL v2 License
+    Copyright (C) powturbo 2013-2019
+    GPL v2 License
 
     This program is free software; you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
@@ -23,19 +23,13 @@
 **/
 //  "Integer Compression" bit packing
 
-#pragma warning( disable : 4005)
-#pragma warning( disable : 4090)
-#pragma warning( disable : 4068)
-
 #include <stdio.h>
-#include <string.h>
-#include "include_/conf.h"
-#include "include_/bitpack.h"
-#include "include_/bitutil.h"
-
-#include "include_/vlcbyte.h"
-#include "include_/bitutil_.h"
-
+#define BITUTIL_IN
+#define VINT_IN
+#include "conf.h"
+#include "bitutil.h"
+#include "vint.h"
+#include "bitpack.h"
 #define PAD8(_x_) ( (((_x_)+8-1)/8) )
 
   #ifdef __ARM_NEON
@@ -44,28 +38,13 @@
 #define PREFETCH(_ip_,_rw_) __builtin_prefetch(_ip_,_rw_)
   #endif
 
+#pragma warning( disable : 4005)
+#pragma warning( disable : 4090)
+#pragma warning( disable : 4068)
 #pragma clang diagnostic push
 #pragma clang diagnostic ignored "-Wunsequenced"
 
-#ifndef __AVX2__ 
-#define BITNBOUND(_n_, _esize_, _csize_) ((_esize_*_n_) + ((_n_+_csize_-1)/_csize_))
-
-size_t bitnbound8(     size_t n){ return BITNBOUND(n, 1, 128); }
-size_t bitnbound16(    size_t n){ return BITNBOUND(n, 2, 128); }
-size_t bitnbound32(    size_t n){ return BITNBOUND(n, 4, 128); }
-size_t bitnbound64(    size_t n){ return BITNBOUND(n, 8, 128); }
-                                
-size_t bitnbound128v8( size_t n){ return BITNBOUND(n, 1, 128); }
-size_t bitnbound128v16(size_t n){ return BITNBOUND(n, 2, 128); }
-size_t bitnbound128v32(size_t n){ return BITNBOUND(n, 4, 128); }
-size_t bitnbound128v64(size_t n){ return BITNBOUND(n, 8, 128); }
-                                
-size_t bitnbound256v8( size_t n){ return BITNBOUND(n, 1, 256); }
-size_t bitnbound256v16(size_t n){ return BITNBOUND(n, 2, 256); }
-size_t bitnbound256v32(size_t n){ return BITNBOUND(n, 4, 256); }
-size_t bitnbound256v64(size_t n){ return BITNBOUND(n, 8, 128); }
-
-//---------------------------------------------- Plain -----------------------------------------------------------------------
+#if !defined(SSE2_ON) && !defined(AVX2_ON) //----------------------------------- Plain -----------------------------------------------------------------------
 typedef unsigned char *(*BITPACK_F8)( uint8_t  *__restrict out, unsigned n, const unsigned char *__restrict in);
 typedef unsigned char *(*BITPACK_D8)( uint8_t  *__restrict out, unsigned n, const unsigned char *__restrict in, uint8_t start);
 typedef unsigned char *(*BITPACK_F16)(uint16_t *__restrict out, unsigned n, const unsigned char *__restrict in);
@@ -75,25 +54,24 @@ typedef unsigned char *(*BITPACK_D32)(uint32_t *__restrict out, unsigned n, cons
 typedef unsigned char *(*BITPACK_F64)(uint64_t *__restrict out, unsigned n, const unsigned char *__restrict in);
 typedef unsigned char *(*BITPACK_D64)(uint64_t *__restrict out, unsigned n, const unsigned char *__restrict in, uint64_t start);
 
-  #if 1 //def _MSC_VER
+#if 1 //def _MSC_VER
 #define VX (v=x)
 #define V  x
-  #else
+#else
 #define VX v
 #define V  v
-  #endif
+#endif
 
-  #if 0
+#if 0
 #define IP0(_ip_,_x_) *_ip_
 #define IP( _ip_,_x_) *_ip_++
 #define IPI(_ip_)
-  #else
+#else
 #define IP0(_ip_,_x_) _ip_[_x_]
 #define IP( _ip_,_x_) _ip_[_x_]
 #define IPI(_ip_) _ip_ += 32
-  #endif
+#endif
 
-//---- bitpack ---------------
 #define IP9(_ip_,_x_, _parm_)
 #define IPW(_ip_,_x_)           VX
 #define IPX(_ip_,_x_)           (V = IP(_ip_,_x_))
@@ -111,12 +89,11 @@ typedef unsigned char *(*BITPACK_D64)(uint64_t *__restrict out, unsigned n, cons
 #undef IP32
 #undef IP64
 
-//----- bitpack delta --------------
 #define DELTA
 
-#define IP9(_ip_,_x_, _parm_)    V = IP0(_ip_,_x_) - start; start = IP(_ip_,_x_)
-#define IPV(_ip_,_x_)            VX
-#define IPX(_ip_,_x_)            (V = IP(_ip_,_x_) - start)
+#define IP9(_ip_,_x_, _parm_)   V = IP0(_ip_,_x_) - start; start = IP(_ip_,_x_)
+#define IPV(_ip_,_x_)           VX
+#define IPX(_ip_,_x_)          (V = IP(_ip_,_x_) - start)
 #define IP16(_ip_,_x_, _parm_)   start = IP(_ip_,_x_)
 #define IP32(_ip_,_x_, _parm_)   start = IP(_ip_,_x_)
 #define IP64(_ip_,_x_, _parm_)   start = IP(_ip_,_x_)
@@ -129,9 +106,8 @@ typedef unsigned char *(*BITPACK_D64)(uint64_t *__restrict out, unsigned n, cons
 #undef IP32
 #undef IP64
 
-//----- bitpack FOR ---------------
 #define IP9(_ip_,_x_, _parm_)
-#define IPV(_ip_,_x_)           (IP(_ip_,_x_) - start)
+#define IPV(_ip_,_x_)           IP(_ip_,_x_) - start
 #define IPX(_ip_,_x_)           (V = IP(_ip_,_x_) - start)
 #define IP16(_ip_,_x_, _parm_)
 #define IP32(_ip_,_x_, _parm_)
@@ -145,10 +121,9 @@ typedef unsigned char *(*BITPACK_D64)(uint64_t *__restrict out, unsigned n, cons
 #undef IP32
 #undef IP64
 
-//----- bitpack delta 1 -----------
-#define IP9( _ip_,_x_, _parm_)   V = IP0(_ip_,_x_) - start - 1; start = IP(_ip_,_x_)
-#define IPV( _ip_,_x_)           VX
-#define IPX(_ip_,_x_)            (V = IP(_ip_,_x_) - start - 1)
+#define IP9( _ip_,_x_, _parm_)  V = IP0(_ip_,_x_) - start - 1; start = IP(_ip_,_x_)
+#define IPV( _ip_,_x_)          VX
+#define IPX(_ip_,_x_)          (V = IP(_ip_,_x_) - start - 1)
 #define IP16(_ip_,_x_, _parm_)   start = IP(_ip_,_x_)
 #define IP32(_ip_,_x_, _parm_)   start = IP(_ip_,_x_)
 #define IP64(_ip_,_x_, _parm_)   start = IP(_ip_,_x_)
@@ -168,10 +143,9 @@ typedef unsigned char *(*BITPACK_D64)(uint64_t *__restrict out, unsigned n, cons
 #define _BITPACK_ bitepack
 #include "bitpack_.h"*/
 
-//------ bitpack zigzag --------------------
-#define IP9(_ip_,_x_, _parm_)   V = T2(zigzagenc, USIZE)(IP(_ip_,_x_) - start); start = IP(_ip_,_x_)
+#define IP9(_ip_,_x_, _parm_)   V = TEMPLATE2(zigzagenc, USIZE)(IP(_ip_,_x_) - start); start = IP(_ip_,_x_)
 #define IPV(_ip_,_x_)           VX
-#define IPX(_ip_,_x_)           (V = T2(zigzagenc, USIZE)(IP(_ip_,_x_) - start))
+#define IPX(_ip_,_x_)          (V = TEMPLATE2(zigzagenc, USIZE)(IP(_ip_,_x_) - start))
 #define IP16(_ip_,_x_, _parm_)  start = IP(_ip_,_x_)
 #define IP32(_ip_,_x_, _parm_)  start = IP(_ip_,_x_)
 #define IP64(_ip_,_x_, _parm_)  start = IP(_ip_,_x_)
@@ -184,23 +158,6 @@ typedef unsigned char *(*BITPACK_D64)(uint64_t *__restrict out, unsigned n, cons
 #undef IP32
 #undef IP64
 
-//------ bitpack xor --------------------
-#define IP9(_ip_,_x_, _parm_)   V = IP(_ip_,_x_) ^ start; start = IP(_ip_,_x_)
-#define IPV(_ip_,_x_)           VX
-#define IPX(_ip_,_x_)           (V = IP(_ip_,_x_) ^ start)
-#define IP16(_ip_,_x_, _parm_)  start = IP(_ip_,_x_)
-#define IP32(_ip_,_x_, _parm_)  start = IP(_ip_,_x_)
-#define IP64(_ip_,_x_, _parm_)  start = IP(_ip_,_x_)
-#define _BITPACK_ bitxpack
-#include "bitpack_.h"
-#undef IP9
-#undef IPV
-#undef IPX
-#undef IP16
-#undef IP32
-#undef IP64
-
-//----- bitpack FOR 1 ---------------------
 #define IPI(_ip_) _ip_ += 32; start += 32
 #define IP9(_ip_,_x_, _parm_)
 #define IPV(_ip_,_x_)           (IP(_ip_,_x_) - start - (_x_) - 1)
@@ -218,14 +175,14 @@ typedef unsigned char *(*BITPACK_D64)(uint64_t *__restrict out, unsigned n, cons
 #undef IP32
 #undef IP64
 
-//----------------------------------- bitnpack ----------------------------------------------------
 #define BITNPACK(in, n, out, _csize_, _usize_) { unsigned char *op = out;\
   for(ip = in, in += n; ip < in;) { \
-    T3(uint, _usize_, _t) o,x;\
+    TEMPLATE3(uint, _usize_, _t) o,x;\
     unsigned iplen = in - ip,b; \
-    if(iplen > _csize_) iplen = _csize_;                                        PREFETCH(ip+512,0);\
-    o = T2(bit,_usize_)(ip, iplen, &x); b = T2(bsr,_usize_)(o);\
-    *op++ = b; op = T2(bitpacka, _usize_)[b](ip, iplen, op);\
+    if(iplen > _csize_) iplen = _csize_; \
+	  PREFETCH(ip+512,0);\
+    o = TEMPLATE2(bit,_usize_)(ip, iplen, &x); b = TEMPLATE2(bsr,_usize_)(o);\
+    *op++ = b; op = TEMPLATE2(bitpacka, _usize_)[b](ip, iplen, op);\
     ip += iplen;\
   }\
   return op - out;\
@@ -233,15 +190,14 @@ typedef unsigned char *(*BITPACK_D64)(uint64_t *__restrict out, unsigned n, cons
 
 #define BITNDPACK(in, n, out, _csize_, _usize_, _bitd_, _bitpacka_) { if(!n) return 0;\
   unsigned char *op = out; \
-  T3(uint, _usize_, _t) o,x;\
+  TEMPLATE3(uint, _usize_, _t) o,x;\
   start = *in++; \
-  T2(vbxput, _usize_)(op, start);\
-  for(n--,ip = in; ip != in + (n&~(_csize_-1)); ) { \
-    unsigned b;                                                                 PREFETCH(ip+512,0);\
-    o = T2(_bitd_, _usize_)(ip, _csize_, &x, start); b = T2(bsr,_usize_)(o); *op++ = b; op = T2(_bitpacka_,_usize_)[b](ip, _csize_, op, start); ip += _csize_; start = ip[-1];\
+  TEMPLATE2(vbxput, _usize_)(op, start);\
+  for(n--,ip = in; ip != in + (n&~(_csize_-1)); ) { unsigned b;     PREFETCH(ip+512,0);\
+    o = TEMPLATE2(_bitd_, _usize_)(ip, _csize_, &x, start); b = TEMPLATE2(bsr,_usize_)(o); *op++ = b; op = TEMPLATE2(_bitpacka_,_usize_)[b](ip, _csize_, op, start); ip += _csize_; start = ip[-1];\
   }\
   if(n&=(_csize_-1)) { unsigned b;\
-    o = T2(_bitd_, _usize_)(ip, n,       &x, start); b = T2(bsr,_usize_)(o); *op++ = b; op = T2(_bitpacka_,_usize_)[b](ip, n,       op, start);\
+    o = TEMPLATE2(_bitd_, _usize_)(ip, n,       &x, start); b = TEMPLATE2(bsr,_usize_)(o); *op++ = b; op = TEMPLATE2(_bitpacka_,_usize_)[b](ip, n,       op, start);\
   }\
   return op - out;\
 }
@@ -266,34 +222,28 @@ size_t bitnzpack16( uint16_t *__restrict in, size_t n, unsigned char *__restrict
 size_t bitnzpack32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out) { uint32_t *ip,start; BITNDPACK(in, n, out, 128, 32, bitz, bitzpacka); }
 size_t bitnzpack64( uint64_t *__restrict in, size_t n, unsigned char *__restrict out) { uint64_t *ip,start; BITNDPACK(in, n, out, 128, 64, bitz, bitzpacka); }
 
-size_t bitnxpack8(  uint8_t  *__restrict in, size_t n, unsigned char *__restrict out) { uint8_t  *ip,start; BITNDPACK(in, n, out, 128,  8, bitx, bitxpacka); }
-size_t bitnxpack16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out) { uint16_t *ip,start; BITNDPACK(in, n, out, 128, 16, bitx, bitxpacka); }
-size_t bitnxpack32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out) { uint32_t *ip,start; BITNDPACK(in, n, out, 128, 32, bitx, bitxpacka); }
-size_t bitnxpack64( uint64_t *__restrict in, size_t n, unsigned char *__restrict out) { uint64_t *ip,start; BITNDPACK(in, n, out, 128, 64, bitx, bitxpacka); }
-
 size_t bitnfpack8(  uint8_t  *__restrict in, size_t n, unsigned char *__restrict out) { uint8_t  *ip,start; BITNDPACK(in, n, out, 128,  8, bitf, bitfpacka); }
 size_t bitnfpack16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out) { uint16_t *ip,start; BITNDPACK(in, n, out, 128, 16, bitf, bitfpacka); }
 size_t bitnfpack32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out) { uint32_t *ip,start; BITNDPACK(in, n, out, 128, 32, bitf, bitfpacka); }
 size_t bitnfpack64( uint64_t *__restrict in, size_t n, unsigned char *__restrict out) { uint64_t *ip,start; BITNDPACK(in, n, out, 128, 64, bitf, bitfpacka); }
-  #endif // ifndef AVX2
 
-//--------------------------------------- SIMD ----------------------------------------------------------------------------------------------
+#else //--------------------------------------- SIMD ----------------------------------------------------------------------------------------------
 
 #define _BITNPACKV(in, n, out, _csize_, _usize_, _bitpackv_) {\
-  unsigned char *op = out; T3(uint, _usize_, _t) _o,_x;\
+  unsigned char *op = out; TEMPLATE3(uint, _usize_, _t) _o,_x;\
   for(ip = in; ip != in + (n&~(_csize_-1)); ip += _csize_) {        PREFETCH(ip+512,0);\
-                         unsigned _b; _o = T2(bit,_usize_)(ip, _csize_, &_x); _b = T2(bsr,_usize_)(_o); *op++ = _b; op = T2(_bitpackv_, _usize_)(ip, _csize_, op, _b);\
-  } if(n&=(_csize_-1)) { unsigned _b; _o = T2(bit,_usize_)(ip, n,       &_x); _b = T2(bsr,_usize_)(_o); *op++ = _b; op = T2(bitpack,    _usize_)(ip, n,       op, _b); }\
+                         unsigned _b; _o = TEMPLATE2(bit,_usize_)(ip, _csize_, &_x); _b = TEMPLATE2(bsr,_usize_)(_o); *op++ = _b; op = TEMPLATE2(_bitpackv_, _usize_)(ip, _csize_, op, _b);\
+  } if(n&=(_csize_-1)) { unsigned _b; _o = TEMPLATE2(bit,_usize_)(ip, n,       &_x); _b = TEMPLATE2(bsr,_usize_)(_o); *op++ = _b; op = TEMPLATE2(bitpack,    _usize_)(ip, n,       op, _b); }\
   return op - out;\
 }
 
 #define _BITNDPACKV(in, n, out, _csize_, _usize_, _bitdv_, _bitpackv_,  _bitd_, _bitpack_) { if(!n) return 0;\
-  unsigned char *op = out; T3(uint, _usize_, _t) _o,_x;\
+  unsigned char *op = out; TEMPLATE3(uint, _usize_, _t) _o,_x;\
   start = *in++; \
-  T2(vbxput, _usize_)(op, start);\
+  TEMPLATE2(vbxput, _usize_)(op, start);\
   for(n--,ip = in; ip != in + (n&~(_csize_-1)); ) { PREFETCH(ip+512,0);\
-                         unsigned _b; _o = T2(_bitdv_, _usize_)(ip, _csize_, &_x, start); _b = T2(bsr,_usize_)(_o); *op++ = _b; op = T2(_bitpackv_, _usize_)(ip, _csize_, op, start, _b); ip += _csize_; start = ip[-1];\
-  } if(n&=(_csize_-1)) { unsigned _b; _o = T2(_bitd_,  _usize_)(ip, n,       &_x, start); _b = T2(bsr,_usize_)(_o); *op++ = _b; op = T2(_bitpack_,  _usize_)(ip, n,       op, start, _b); }\
+                         unsigned _b; _o = TEMPLATE2(_bitdv_, _usize_)(ip, _csize_, &_x, start); _b = TEMPLATE2(bsr,_usize_)(_o); *op++ = _b; op = TEMPLATE2(_bitpackv_, _usize_)(ip, _csize_, op, start, _b); ip += _csize_; start = ip[-1];\
+  } if(n&=(_csize_-1)) { unsigned _b; _o = TEMPLATE2(_bitd_,  _usize_)(ip, n,       &_x, start); _b = TEMPLATE2(bsr,_usize_)(_o); *op++ = _b; op = TEMPLATE2(_bitpack_,  _usize_)(ip, n,       op, start, _b); }\
   return op - out;\
 }
 
@@ -308,88 +258,63 @@ size_t bitnfpack64( uint64_t *__restrict in, size_t n, unsigned char *__restrict
 #define OPPE(__op)
 #define IPPE(__op)
 
-//--- bitpack ---------------
 #define VI32(ip, i, iv, parm)
 #define IP32(ip, i, iv) _mm256_loadu_si256(ip++)
 
-unsigned char *bitpack256v32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned b) { unsigned char *pout = out+PAD8(256*b); BITPACK256V32(in, b, out, 0); return pout; }
+unsigned char *bitpack256v32(unsigned       *__restrict in, unsigned n, unsigned char *__restrict out, unsigned b) { unsigned char *pout = out+PAD8(256*b); BITPACK256V32(in, b, out, 0); return pout; }
 #undef VI32
 #undef IP32
 
-//-- bipack FOR --------------------------------------------------------------------------------------------------------------
+
 #define VI32(_ip_, _i_, _iv_, _sv_) _iv_ = _mm256_sub_epi32(_mm256_loadu_si256(_ip_++),sv)
 #define IP32(_ip_, i, _iv_)             _iv_
 #include "bitpack_.h"
-unsigned char *bitfpack256v32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b) { 
-  unsigned char *pout = out+PAD8(256*b);
-  __m256i sv = _mm256_set1_epi32(start), v;
+unsigned char *bitfpack256v32(unsigned       *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b) { unsigned char *pout = out+PAD8(256*b);
+  __m256i v, sv = _mm256_set1_epi32(start);
   BITPACK256V32(in, b, out, sv);
   return pout;
 }
 
 #define VI32(_ip_, _i_, _iv_, _sv_) _iv_ = _mm256_sub_epi32(_mm256_loadu_si256(_ip_++),_sv_); _sv_ = _mm256_add_epi32(_sv_,cv);
 #define IP32(ip, i, _iv_) _iv_
-unsigned char *bitf1pack256v32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b) { 
-  unsigned char *pout = out+PAD8(256*b);
-  __m256i v, sv = _mm256_set_epi32(start+8,start+7,start+6,start+5,start+4,start+3,start+2,start+1), 
-          cv = _mm256_set1_epi32(8);
-  BITPACK256V32(in, b, out, sv); 
-  return pout;
+unsigned char *bitf1pack256v32(unsigned       *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b) { unsigned char *pout = out+PAD8(256*b);
+  __m256i v, sv = _mm256_set_epi32(start+8,start+7,start+6,start+5,start+4,start+3,start+2,start+1), cv = _mm256_set1_epi32(8);
+  BITPACK256V32(in, b, out, sv); return pout;
 }
 
-//-- bitpack delta -------------------------------------------------------------------------------------------------------------
 #define VI32(_ip_, _i_, _iv_, _sv_) v = _mm256_loadu_si256(_ip_++); _iv_ = mm256_delta_epi32(v,_sv_); _sv_ = v
 #define IP32(ip, i, _iv_) _iv_
 #include "bitpack_.h"
-unsigned char *bitdpack256v32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b) { 
-  unsigned char *pout = out+PAD8(256*b);
+unsigned char *bitdpack256v32(unsigned       *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b) { unsigned char *pout = out+PAD8(256*b);
   __m256i v,sv = _mm256_set1_epi32(start);
   BITPACK256V32(in, b, out, sv);
   return pout;
 }
 
-//-- bitpack delta 1 ---------------------------------------------------------------------------------------------------------------
 #define VI32(_ip_, _i_, _iv_, _sv_) v = _mm256_loadu_si256(_ip_++); _iv_ = _mm256_sub_epi32(mm256_delta_epi32(v,_sv_),cv); _sv_ = v
-unsigned char *bitd1pack256v32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b) { 
-  unsigned char *pout = out+PAD8(256*b);
-  __m256i sv = _mm256_set1_epi32(start), v,
-          cv = _mm256_set1_epi32(1);
+unsigned char *bitd1pack256v32(unsigned       *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b) { unsigned char *pout = out+PAD8(256*b);
+  __m256i v, sv = _mm256_set1_epi32(start), cv = _mm256_set1_epi32(1);
   BITPACK256V32(in, b, out, sv);
   return pout;
 }
 
-//-- bitpack zigzag -------------------------------------------------------------------------------------------------------------------------
 #define VI32(_ip_, _i_, _iv_, _sv_) v = _mm256_loadu_si256(_ip_++); _iv_ = mm256_delta_epi32(v,_sv_); _sv_ = v; _iv_ = mm256_zzage_epi32(_iv_)
-unsigned char *bitzpack256v32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b) { 
-  unsigned char *pout = out+PAD8(256*b);
-  __m256i sv = _mm256_set1_epi32(start), v,
-		  cv = _mm256_set1_epi32(1);
-  BITPACK256V32(in, b, out, sv);
-  return pout;
-}
-
-//-- bitpack xor --------------------------------------------------------------------------------------------------------------
-#define VI32(_ip_, _i_, _iv_, _sv_) v = _mm256_loadu_si256(_ip_++); _iv_ = mm256_xore_epi32(v,_sv_); _sv_ = v; 
-unsigned char *bitxpack256v32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b) { 
-  unsigned char *pout = out+PAD8(256*b);
-  __m256i sv = _mm256_set1_epi32(start), v;
+unsigned char *bitzpack256v32(unsigned       *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b) { unsigned char *pout = out+PAD8(256*b);
+  __m256i v, sv = _mm256_set1_epi32(start), cv = _mm256_set1_epi32(1);
   BITPACK256V32(in, b, out, sv);
   return pout;
 }
 
-//--------------------------------------------------- bitnpack --------------------------------------------------------------------------------------------------
-size_t bitnpack256v32(  uint32_t *__restrict in, size_t n, unsigned char *__restrict out) { uint32_t *ip;       _BITNPACKV( in, n, out, 256, 32,            bitpack256v); }
-size_t bitndpack256v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out) { uint32_t *ip,start; _BITNDPACKV(in, n, out, 256, 32, bitd256v,  bitdpack256v, bitd,  bitdpack); }
-size_t bitnd1pack256v32(uint32_t *__restrict in, size_t n, unsigned char *__restrict out) { uint32_t *ip,start; _BITNDPACKV(in, n, out, 256, 32, bitd1256v, bitd1pack256v,bitd1, bitd1pack); }
-size_t bitnzpack256v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out) { uint32_t *ip,start; _BITNDPACKV(in, n, out, 256, 32, bitz256v,  bitzpack256v, bitz,  bitzpack); }
-size_t bitnfpack256v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out) { uint32_t *ip,start; _BITNDPACKV(in, n, out, 256, 32, bitf,      bitfpack256v, bitf,  bitfpack); }
-size_t bitnxpack256v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out) { uint32_t *ip,start; _BITNDPACKV(in, n, out, 256, 32, bitx256v,  bitxpack256v, bitx,  bitxpack); }
+size_t bitnpack256v32(  uint32_t *__restrict in, size_t n, unsigned char *__restrict out) { uint32_t *ip;       _BITNPACKV( in, n, out, 256, 32, bitpack256v); }
+size_t bitndpack256v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out) { uint32_t *ip,start; _BITNDPACKV(in, n, out, 256, 32, bitd,  bitdpack256v, bitd, bitdpack); }
+size_t bitnd1pack256v32(uint32_t *__restrict in, size_t n, unsigned char *__restrict out) { uint32_t *ip,start; _BITNDPACKV(in, n, out, 256, 32, bitd1, bitd1pack256v,bitd1, bitd1pack); }
+size_t bitnzpack256v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out) { uint32_t *ip,start; _BITNDPACKV(in, n, out, 256, 32, bitz,  bitzpack256v, bitz, bitzpack); }
+size_t bitnfpack256v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out) { uint32_t *ip,start; _BITNDPACKV(in, n, out, 256, 32, bitf,  bitfpack256v, bitf, bitfpack); }
 
-  #elif defined(__SSE3__) || defined(__ARM_NEON) //----------------------------- SSE / AVX ---------------------------------------------------------------
+  #elif defined(__SSE2__) || defined(__ARM_NEON) //----------------------------- SSE ---------------------------------------------------------------
 #define OPPE(__op)
 #define IPPE(__op)
 
-//-- bitpack  --------------------------------------------------------------------------------
 #define VI16(ip, i, iv, parm)
 #define VI32(ip, i, iv, parm)
 #define IP16(_ip_, i, iv) _mm_loadu_si128(_ip_++)
@@ -397,15 +322,8 @@ size_t bitnxpack256v32( uint32_t *__restrict in, size_t n, unsigned char *__rest
 #include "bitpack_.h"
 unsigned char *bitpack128v16(unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned b) { unsigned char *pout = out+PAD8(128*b); BITPACK128V16(in, b, out, 0); return pout; }
 unsigned char *bitpack128v32(unsigned       *__restrict in, unsigned n, unsigned char *__restrict out, unsigned b) { unsigned char *pout = out+PAD8(128*b); BITPACK128V32(in, b, out, 0); return pout; }
-unsigned char *bitpack256w32(unsigned       *__restrict in, unsigned n, unsigned char *__restrict out, unsigned b) { 
-  unsigned char *_out = out; 
-  unsigned      *_in  = in;
-  BITPACK128V32(in, b, out, 0); 
-  in = _in+128; 
-  out = _out+PAD8(128*b); 
-  BITPACK128V32(in, b, out, 0); 
-  return _out+PAD8(256*b); 
-}
+unsigned char *bitpack256w32(unsigned       *__restrict in, unsigned n, unsigned char *__restrict out, unsigned b) { unsigned char *_out=out; unsigned *_in=in;
+BITPACK128V32(in, b, out, 0); in = _in+128; out = _out+PAD8(128*b); BITPACK128V32(in, b, out, 0); return _out+PAD8(256*b); }
 
 #ifdef __ARM_NEON
 //#define IP32(_ip_, i, iv)     _mm_or_si128(_mm_shuffle_epi32(    _mm_loadu_si128(_ip_++),_MM_SHUFFLE(3, 1, 2, 0)), _mm_shuffle_epi32(     _mm_loadu_si128(_ip_++),_MM_SHUFFLE(2, 0, 3, 1)) )
@@ -415,148 +333,75 @@ unsigned char *bitpack256w32(unsigned       *__restrict in, unsigned n, unsigned
 #endif
 #include "bitpack_.h"
 unsigned char *bitpack128v64(uint64_t       *__restrict in, unsigned n, unsigned char *__restrict out, unsigned b) {
-  if(b <= 32) { 
-    unsigned char *pout = out+PAD8(128*b); 
-	BITPACK128V32(in, b, out, 0); 
-	return pout; 
-  } else return bitpack64(in,n,out,b);
+  if(b<=32) { unsigned char *pout = out+PAD8(128*b); BITPACK128V32(in, b, out, 0); return pout; } else return bitpack64(in,n,out,b);
 }
 
-//-- bitpack delta -----------------------------------------------------------------------------------------------------------------------
 #define VI16(_ip_, _i_, _iv_, _sv_) v = _mm_loadu_si128(_ip_++); _iv_ = mm_delta_epi16(v,_sv_); _sv_ = v
 #define VI32(_ip_, _i_, _iv_, _sv_) v = _mm_loadu_si128(_ip_++); _iv_ = mm_delta_epi32(v,_sv_); _sv_ = v
 #define IP16(ip, i, _iv_) _iv_
 #define IP32(ip, i, _iv_) _iv_
 #include "bitpack_.h"
-unsigned char *bitdpack128v16(unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start, unsigned b) { 
-  unsigned char *pout = out+PAD8(128*b);
-  __m128i sv = _mm_set1_epi16(start), v; 
-  BITPACK128V16(in, b, out, sv); 
-  return pout;
+unsigned char *bitdpack128v16(unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start, unsigned b) { unsigned char *pout = out+PAD8(128*b);
+  __m128i v,sv = _mm_set1_epi16(start); BITPACK128V16(in, b, out, sv); return pout;
 }
-unsigned char *bitdpack128v32(unsigned       *__restrict in, unsigned n, unsigned char *__restrict out, unsigned       start, unsigned b) { 
-  unsigned char *pout = out+PAD8(128*b);
-  __m128i sv = _mm_set1_epi32(start), v; 
-  BITPACK128V32(in, b, out, sv); 
-  return pout;
+unsigned char *bitdpack128v32(unsigned       *__restrict in, unsigned n, unsigned char *__restrict out, unsigned       start, unsigned b) { unsigned char *pout = out+PAD8(128*b);
+  __m128i v,sv = _mm_set1_epi32(start); BITPACK128V32(in, b, out, sv); return pout;
 }
 
-//-- bitpack FOR ---------------------------------------------------------------------------------------------------------------------------
 #define VI16(_ip_, _i_, _iv_, _sv_)
 #define VI32(_ip_, _i_, _iv_, _sv_)
 #define IP16(_ip_, i, _iv_)             _mm_sub_epi16(_mm_loadu_si128(_ip_++),sv)
 #define IP32(_ip_, i, _iv_)             _mm_sub_epi32(_mm_loadu_si128(_ip_++),sv)
 #include "bitpack_.h"
-unsigned char *bitfpack128v16(unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start, unsigned b) { 
-  unsigned char *pout = out+PAD8(128*b);
-  __m128i sv = _mm_set1_epi16(start), v; 
-  BITPACK128V16(in, b, out, sv);  
-  return pout;
+unsigned char *bitfpack128v16(unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start, unsigned b) { unsigned char *pout = out+PAD8(128*b);
+  __m128i v, sv = _mm_set1_epi16(start); BITPACK128V16(in, b, out, sv);  return pout;
 }
-unsigned char *bitfpack128v32(unsigned       *__restrict in, unsigned n, unsigned char *__restrict out, unsigned       start, unsigned b) { 
-  unsigned char *pout = out+PAD8(128*b);
-  __m128i sv = _mm_set1_epi32(start), v; 
-  BITPACK128V32(in, b, out, sv);  
-  return pout;
+unsigned char *bitfpack128v32(unsigned       *__restrict in, unsigned n, unsigned char *__restrict out, unsigned       start, unsigned b) { unsigned char *pout = out+PAD8(128*b);
+  __m128i v, sv = _mm_set1_epi32(start); BITPACK128V32(in, b, out, sv);  return pout;
 }
 
-//-- bitpack delta 1 -----------------------------------------------------------------------------------------------------------------------
 #define VI16(_ip_, _i_, _iv_, _sv_) v = _mm_loadu_si128(_ip_++); _iv_ = _mm_sub_epi16(mm_delta_epi16(v,_sv_),cv); _sv_ = v
 #define VI32(_ip_, _i_, _iv_, _sv_) v = _mm_loadu_si128(_ip_++); _iv_ = _mm_sub_epi32(mm_delta_epi32(v,_sv_),cv); _sv_ = v
 #define IP16(ip, i, _iv_) _iv_
 #define IP32(ip, i, _iv_) _iv_
-unsigned char *bitd1pack128v16(unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start, unsigned b) { 
-  unsigned char *pout = out+PAD8(128*b);
-  __m128i sv = _mm_set1_epi16(start), 
-          cv = _mm_set1_epi16(1), v; 
-  BITPACK128V16(in, b, out, sv); 
-  return pout;
+unsigned char *bitd1pack128v16(unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start, unsigned b) { unsigned char *pout = out+PAD8(128*b);
+  __m128i sv = _mm_set1_epi16(start), cv = _mm_set1_epi16(1), v; BITPACK128V16(in, b, out, sv); return pout;
 }
-
-unsigned char *bitd1pack128v32(unsigned       *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b) { 
-  unsigned char *pout = out+PAD8(128*b);
-  __m128i sv = _mm_set1_epi32(start), v,
-          cv = _mm_set1_epi32(1); 
-  BITPACK128V32(in, b, out, sv); 
-  return pout;
+unsigned char *bitd1pack128v32(unsigned       *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b) { unsigned char *pout = out+PAD8(128*b);
+  __m128i v, sv = _mm_set1_epi32(start), cv = _mm_set1_epi32(1); BITPACK128V32(in, b, out, sv); return pout;
 }
 
-//-- bitpack sub -----------------------------------------------------------------------------------------------------------------------------
 #define VI16(_ip_, _i_, _iv_, _sv_) v = _mm_loadu_si128(_ip_++); _iv_ = _mm_sub_epi16(SUBI16x8(v,_sv_),cv); _sv_ = v
 #define VI32(_ip_, _i_, _iv_, _sv_) v = _mm_loadu_si128(_ip_++); _iv_ = _mm_sub_epi32(SUBI32x4(v,_sv_),cv); _sv_ = v
 #define IP16(ip, i, _iv_) _iv_
 #define IP32(ip, i, _iv_) _iv_
-unsigned char *bits1pack128v16(unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start, unsigned b) { 
-  unsigned char *pout = out+PAD8(128*b);
-  __m128i sv = _mm_set1_epi16(start), v,
-          cv = _mm_set1_epi16(8); 
-  BITPACK128V16(in, b, out, sv); 
-  return pout;
+unsigned char *bits1pack128v16(unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start, unsigned b) { unsigned char *pout = out+PAD8(128*b);
+  __m128i v, sv = _mm_set1_epi16(start), cv = _mm_set1_epi16(8); BITPACK128V16(in, b, out, sv); return pout;
 }
-unsigned char *bits1pack128v32(unsigned       *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b) { 
-  unsigned char *pout = out+PAD8(128*b);
-  __m128i sv = _mm_set1_epi32(start), v,
-          cv = _mm_set1_epi32(4); 
-  BITPACK128V32(in, b, out, sv); 
-  return pout;
+unsigned char *bits1pack128v32(unsigned       *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b) { unsigned char *pout = out+PAD8(128*b);
+  __m128i v, sv = _mm_set1_epi32(start), cv = _mm_set1_epi32(4); BITPACK128V32(in, b, out, sv); return pout;
 }
 
-//-- bitpack FOR 1 -------------------------------------------------------------------------------------------------------------------------
 #define VI16(_ip_, _i_, _iv_, _sv_) _iv_ = _mm_sub_epi16(_mm_loadu_si128(_ip_++),_sv_); _sv_ = _mm_add_epi16(_sv_,cv);
 #define VI32(_ip_, _i_, _iv_, _sv_) _iv_ = _mm_sub_epi32(_mm_loadu_si128(_ip_++),_sv_); _sv_ = _mm_add_epi32(_sv_,cv);
 #define IP16(ip, i, _iv_) _iv_
 #define IP32(ip, i, _iv_) _iv_
-unsigned char *bitf1pack128v16(unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start, unsigned b) { 
-  unsigned char *pout = out+PAD8(128*b);
-  __m128i sv = _mm_set_epi16(start+8,start+7,start+6,start+5,start+4,start+3,start+2,start+1), v,
-          cv = _mm_set1_epi16(8); 
-  BITPACK128V16(in, b, out, sv); 
-  return pout;
+unsigned char *bitf1pack128v16(unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start, unsigned b) { unsigned char *pout = out+PAD8(128*b);
+  __m128i v, sv = _mm_set_epi16(start+8,start+7,start+6,start+5,start+4,start+3,start+2,start+1), cv = _mm_set1_epi16(8); BITPACK128V16(in, b, out, sv); return pout;
 }
-unsigned char *bitf1pack128v32(unsigned       *__restrict in, unsigned n, unsigned char *__restrict out, unsigned       start, unsigned b) { 
-  unsigned char *pout = out+PAD8(128*b);
-  __m128i sv = _mm_set_epi32( start+4,start+3,start+2,start+1), v,
-          cv = _mm_set1_epi32(4); BITPACK128V32(in, b, out, sv); 
-  return pout;
+unsigned char *bitf1pack128v32(unsigned       *__restrict in, unsigned n, unsigned char *__restrict out, unsigned       start, unsigned b) { unsigned char *pout = out+PAD8(128*b);
+  __m128i v, sv = _mm_set_epi32(                                start+4,start+3,start+2,start+1), cv = _mm_set1_epi32(4); BITPACK128V32(in, b, out, sv); return pout;
 }
 
-//-- bitpack zigzag ----------------------------------------------------------------------------------------------------------------------
 #define VI16(_ip_, _i_, _iv_, _sv_) v = _mm_loadu_si128(_ip_++); _iv_ = mm_delta_epi16(v,_sv_); _sv_ = v; _iv_ = mm_zzage_epi16(_iv_)
-unsigned char *bitzpack128v16(unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start, unsigned b) { 
-  unsigned char *pout = out+PAD8(128*b);
-  __m128i sv = _mm_set1_epi16(start), v,
-          cv = _mm_set1_epi16(1); 
-  BITPACK128V16(in, b, out, sv); 
-  return pout;
+unsigned char *bitzpack128v16(unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start, unsigned b) { unsigned char *pout = out+PAD8(128*b);
+  __m128i v, sv = _mm_set1_epi16(start), cv = _mm_set1_epi16(1); BITPACK128V16(in, b, out, sv); return pout;
 }
-
 #define VI32(_ip_, _i_, _iv_, _sv_) v = _mm_loadu_si128(_ip_++); _iv_ = mm_delta_epi32(v,_sv_); _sv_ = v; _iv_ = mm_zzage_epi32(_iv_)
-unsigned char *bitzpack128v32(unsigned       *__restrict in, unsigned n, unsigned char *__restrict out, unsigned       start, unsigned b) { 
-  unsigned char *pout = out+PAD8(128*b);
-  __m128i sv = _mm_set1_epi32(start), v,
-          cv = _mm_set1_epi32(1); 
-  BITPACK128V32(in, b, out, sv); 
-  return pout;
+unsigned char *bitzpack128v32(unsigned       *__restrict in, unsigned n, unsigned char *__restrict out, unsigned       start, unsigned b) { unsigned char *pout = out+PAD8(128*b);
+  __m128i v, sv = _mm_set1_epi32(start), cv = _mm_set1_epi32(1); BITPACK128V32(in, b, out, sv); return pout;
 }
 
-//-- bitpack xor --------------------------------------------------------------------------------------------------------------------------
-#define VI16(_ip_, _i_, _iv_, _sv_) v = _mm_loadu_si128(_ip_++); _iv_ = mm_xore_epi16(v,_sv_); _sv_ = v;
-unsigned char *bitxpack128v16(unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start, unsigned b) { 
-  unsigned char *pout = out+PAD8(128*b);
-  __m128i sv = _mm_set1_epi16(start), v;
-  BITPACK128V16(in, b, out, sv); 
-  return pout;
-}
-
-#define VI32(_ip_, _i_, _iv_, _sv_) v = _mm_loadu_si128(_ip_++); _iv_ = mm_xore_epi32(v,_sv_); _sv_ = v;
-unsigned char *bitxpack128v32(unsigned       *__restrict in, unsigned n, unsigned char *__restrict out, unsigned       start, unsigned b) { 
-  unsigned char *pout = out+PAD8(128*b);
-  __m128i sv = _mm_set1_epi32(start), v;
-  BITPACK128V32(in, b, out, sv); 
-  return pout;
-}
-
-//---------------------------- bitpack --------------------------------------------------------------------------------------------------------------------------
 size_t bitnpack128v16(  uint16_t *__restrict in, size_t n, unsigned char *__restrict out) { uint16_t *ip;       _BITNPACKV( in, n, out, 128, 16, bitpack128v); }
 size_t bitnpack128v32(  uint32_t *__restrict in, size_t n, unsigned char *__restrict out) { uint32_t *ip;       _BITNPACKV( in, n, out, 128, 32, bitpack128v); }
 size_t bitnpack128v64(  uint64_t *__restrict in, size_t n, unsigned char *__restrict out) { uint64_t *ip;       _BITNPACKV( in, n, out, 128, 64, bitpack128v); }
@@ -574,11 +419,9 @@ size_t bitns1pack128v32(uint32_t *__restrict in, size_t n, unsigned char *__rest
 size_t bitnzpack128v16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out) { uint16_t *ip,start; _BITNDPACKV(in, n, out, 128, 16, bitz,  bitzpack128v, bitz, bitzpack); }
 size_t bitnzpack128v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out) { uint32_t *ip,start; _BITNDPACKV(in, n, out, 128, 32, bitz,  bitzpack128v, bitz, bitzpack); }
 
-size_t bitnxpack128v16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out) { uint16_t *ip,start; _BITNDPACKV(in, n, out, 128, 16, bitx,  bitxpack128v, bitx, bitxpack); }
-size_t bitnxpack128v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out) { uint32_t *ip,start; _BITNDPACKV(in, n, out, 128, 32, bitx,  bitxpack128v, bitx, bitxpack); }
-
 size_t bitnfpack128v16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out) { uint16_t *ip,start; _BITNDPACKV(in, n, out, 128, 16, bitf,  bitfpack128v, bitf, bitfpack); }
 size_t bitnfpack128v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out) { uint32_t *ip,start; _BITNDPACKV(in, n, out, 128, 32, bitf,  bitfpack128v, bitf, bitfpack); }
   #endif // SSE
+#endif // Plain
 
 #pragma clang diagnostic pop
diff --git a/src/ext/for/include_/bitpack.h b/src/ext/for/bitpack.h
similarity index 81%
rename from src/ext/for/include_/bitpack.h
rename to src/ext/for/bitpack.h
index a4aa1f5a..b0b9e022 100644
--- a/src/ext/for/include_/bitpack.h
+++ b/src/ext/for/bitpack.h
@@ -1,24 +1,41 @@
-//-- bitpack -------------------------------------------------------------------------------------------------------
+/**
+    Copyright (C) powturbo 2013-2019
+    GPL v2 License
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    - homepage : https://sites.google.com/site/powturbo/
+    - github   : https://github.com/powturbo
+    - twitter  : https://twitter.com/powturbo
+    - email    : powturbo [_AT_] gmail [_DOT_] com
+**/
+//     bitpack.h - "Integer Compression" Binary Packing header file
+#ifndef BITPACK_H_
+#define BITPACK_H_
+#if defined(_MSC_VER) && _MSC_VER < 1600
+#include "vs/stdint.h"
+#else
+#include <stdint.h>
+#endif
+#include <stddef.h>
+
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-size_t bitnbound8(     size_t n);
-size_t bitnbound16(    size_t n);
-size_t bitnbound32(    size_t n);
-size_t bitnbound64(    size_t n);
-
-size_t bitnbound128v8( size_t n);
-size_t bitnbound128v16(size_t n);
-size_t bitnbound128v32(size_t n);
-size_t bitnbound128v64(size_t n);
-
-size_t bitnbound256v8( size_t n);
-size_t bitnbound256v16(size_t n);
-size_t bitnbound256v32(size_t n);
-size_t bitnbound256v64(size_t n);
-
-//******************** Bit Packing High Level API - n unlimited ****************************
+//******************** Bit Packing High Level API - n unlimited ***************************************************
 size_t bitnpack8(         uint8_t  *__restrict in, size_t n, unsigned char *__restrict out);
 size_t bitnpack16(        uint16_t *__restrict in, size_t n, unsigned char *__restrict out);
 size_t bitnpack32(        uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
@@ -52,14 +69,6 @@ size_t bitnzpack128v16(   uint16_t *__restrict in, size_t n, unsigned char *__re
 size_t bitnzpack128v32(   uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
 size_t bitnzpack256v32(   uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
 
-size_t bitnxpack8(        uint8_t  *__restrict in, size_t n, unsigned char *__restrict out);
-size_t bitnxpack16(       uint16_t *__restrict in, size_t n, unsigned char *__restrict out);
-size_t bitnxpack32(       uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
-size_t bitnxpack64(       uint64_t *__restrict in, size_t n, unsigned char *__restrict out);
-size_t bitnxpack128v16(   uint16_t *__restrict in, size_t n, unsigned char *__restrict out);
-size_t bitnxpack128v32(   uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
-size_t bitnxpack256v32(   uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
-
 size_t bitnfpack8(        uint8_t  *__restrict in, size_t n, unsigned char *__restrict out);
 size_t bitnfpack16(       uint16_t *__restrict in, size_t n, unsigned char *__restrict out);
 size_t bitnfpack32(       uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
@@ -101,14 +110,6 @@ size_t bitnzunpack128v16( unsigned char *__restrict in, size_t n, uint16_t *__re
 size_t bitnzunpack128v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
 size_t bitnzunpack256v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
 
-size_t bitnxunpack8(      unsigned char *__restrict in, size_t n, uint8_t  *__restrict out);
-size_t bitnxunpack16(     unsigned char *__restrict in, size_t n, uint16_t *__restrict out);
-size_t bitnxunpack32(     unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
-size_t bitnxunpack64(     unsigned char *__restrict in, size_t n, uint64_t *__restrict out);
-size_t bitnxunpack128v16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out);
-size_t bitnxunpack128v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
-size_t bitnxunpack256v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
-
 size_t bitnfunpack8(      unsigned char *__restrict in, size_t n, uint8_t  *__restrict out);
 size_t bitnfunpack16(     unsigned char *__restrict in, size_t n, uint16_t *__restrict out);
 size_t bitnfunpack32(     unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
@@ -116,13 +117,6 @@ size_t bitnfunpack64(     unsigned char *__restrict in, size_t n, uint64_t *__re
 size_t bitnfunpack128v16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out);
 size_t bitnfunpack128v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
 size_t bitnfunpack256v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
-
-size_t bitns1pack128v16(  uint16_t *__restrict in, size_t n, unsigned char *__restrict out);
-size_t bitns1pack128v32(  uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
-size_t bitns1unpack128v16(unsigned char *__restrict in, size_t n, uint16_t *__restrict out);
-size_t bitns1unpack128v32(unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
-
-
 //******** Bit Packing Low level API ****************************************************************
 // bipackNN: Pack array with n unsigned (NN bits in[n]) values to the buffer out using nbits per value. Return value = end of compressed buffer out
 unsigned char *bitpack8(    uint8_t  *__restrict in, unsigned n, const unsigned char *__restrict out , unsigned b);
@@ -158,12 +152,6 @@ unsigned char *bitzpack16(  uint16_t *__restrict in, unsigned n, const unsigned
 unsigned char *bitzpack32(  uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out, uint32_t start, unsigned b);
 unsigned char *bitzpack64(  uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out, uint64_t start, unsigned b);
 
-// xor : unsorted integer array
-unsigned char *bitxpack8(   uint8_t  *__restrict in, unsigned n, const unsigned char *__restrict out, uint8_t  start, unsigned b);
-unsigned char *bitxpack16(  uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out, uint16_t start, unsigned b);
-unsigned char *bitxpack32(  uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out, uint32_t start, unsigned b);
-unsigned char *bitxpack64(  uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out, uint64_t start, unsigned b);
-
 //-------------------------------------- SIMD ------------------------------------------------------------------------------------------
 // Pack array with 128 unsigned (32 bits in[n]) values to the buffer out using nbits per value. Return value = end of compressed buffer out
 unsigned char *bitpack128v16(  unsigned short    *__restrict in, unsigned n, unsigned char *__restrict out                      , unsigned b);
@@ -172,31 +160,24 @@ unsigned char *bitd1pack128v16(unsigned short    *__restrict in, unsigned n, uns
 unsigned char *bitfpack128v16( unsigned short    *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start, unsigned b);
 unsigned char *bitf1pack128v16(unsigned short    *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start, unsigned b);
 unsigned char *bitzpack128v16( unsigned short    *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start, unsigned b);
-unsigned char *bitxpack128v16( unsigned short    *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start, unsigned b);
 
-unsigned char *bitpack128v32(  unsigned          *__restrict in, unsigned n, unsigned char *__restrict out,                       unsigned b);
-unsigned char *bitdpack128v32( unsigned          *__restrict in, unsigned n, unsigned char *__restrict out, unsigned       start, unsigned b);
-unsigned char *bitd1pack128v32(unsigned          *__restrict in, unsigned n, unsigned char *__restrict out, unsigned       start, unsigned b);
-unsigned char *bitfpack128v32( unsigned          *__restrict in, unsigned n, unsigned char *__restrict out, unsigned       start, unsigned b);
-unsigned char *bitf1pack128v32(unsigned          *__restrict in, unsigned n, unsigned char *__restrict out, unsigned       start, unsigned b);
-unsigned char *bitzpack128v32( unsigned          *__restrict in, unsigned n, unsigned char *__restrict out, unsigned       start, unsigned b);
-unsigned char *bitxpack128v32( unsigned          *__restrict in, unsigned n, unsigned char *__restrict out, unsigned       start, unsigned b);
+unsigned char *bitpack128v32(  unsigned          *__restrict in, unsigned n, unsigned char *__restrict out                , unsigned b);
+unsigned char *bitdpack128v32( unsigned          *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b);
+unsigned char *bitd1pack128v32(unsigned          *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b);
+unsigned char *bitfpack128v32( unsigned          *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b);
+unsigned char *bitf1pack128v32(unsigned          *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b);
+unsigned char *bitzpack128v32( unsigned          *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b);
 
 //unsigned char *bitpack256w32(  unsigned          *__restrict in, unsigned n, unsigned char *__restrict out                  , unsigned b);
-unsigned char *bitpack128v64(  uint64_t          *__restrict in, unsigned n, unsigned char *__restrict out,                       unsigned b);
-
-unsigned char *bitpack256v32(  unsigned          *__restrict in, unsigned n, unsigned char *__restrict out,                       unsigned b);
-unsigned char *bitdpack256v32( unsigned          *__restrict in, unsigned n, unsigned char *__restrict out, unsigned       start, unsigned b);
-unsigned char *bitd1pack256v32(unsigned          *__restrict in, unsigned n, unsigned char *__restrict out, unsigned       start, unsigned b);
-unsigned char *bitfpack256v32( unsigned          *__restrict in, unsigned n, unsigned char *__restrict out, unsigned       start, unsigned b);
-unsigned char *bitf1pack256v32(unsigned          *__restrict in, unsigned n, unsigned char *__restrict out, unsigned       start, unsigned b);
-unsigned char *bitzpack256v32( unsigned          *__restrict in, unsigned n, unsigned char *__restrict out, unsigned       start, unsigned b);
-unsigned char *bitxpack256v32( unsigned          *__restrict in, unsigned n, unsigned char *__restrict out, unsigned       start, unsigned b);
-
-unsigned char *bits1pack128v16(unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start, unsigned b);
-unsigned char *bits1pack128v32(unsigned       *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b);
-unsigned char *bits1unpack128v16( const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out, uint16_t start, unsigned b);
-unsigned char *bits1unpack128v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b);
+unsigned char *bitpack128v64(  uint64_t          *__restrict in, unsigned n, unsigned char *__restrict out                , unsigned b);
+
+unsigned char *bitpack256v32(  unsigned          *__restrict in, unsigned n, unsigned char *__restrict out                , unsigned b);
+unsigned char *bitdpack256v32( unsigned          *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b);
+unsigned char *bitd1pack256v32(unsigned          *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b);
+unsigned char *bitfpack256v32( unsigned          *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b);
+unsigned char *bitf1pack256v32(unsigned          *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b);
+unsigned char *bitzpack256v32( unsigned          *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b);
+
 //********************************** Bit Packing : Unpack ****************************************************************
 
 // ---------------- Unpack a b-bits packed integer array -------------------------------------------------------------------------------
@@ -208,6 +189,15 @@ unsigned char *bitunpack64( const unsigned char *__restrict in, unsigned n, uint
 
 // ---------------- Direct Access to a single packed integer array entry --------------------------------------------------------------
   #ifdef TURBOPFOR_DAC
+    #ifdef __BMI2__   // _bzhi_u32/_bzhi_u64 are BMI2 intrinsics; AVX2 being enabled does not imply BMI2 is
+#include <immintrin.h>
+#define bzhi64(_u_, _b_) _bzhi_u64(_u_, _b_)
+#define bzhi32(_u_, _b_) _bzhi_u32(_u_, _b_)
+    #else             // portable fallback: keep the low _b_ bits of _u_; the shift is undefined once _b_ reaches the operand width
+#define bzhi64(_u_, _b_) ((_u_) & ((1ull<<(_b_))-1))
+#define bzhi32(_u_, _b_) ((_u_) & ((1u  <<(_b_))-1))
+    #endif
+
 #include "conf.h"
 
 static ALWAYS_INLINE unsigned  bitgetx32(const unsigned char *__restrict in, unsigned  idx, unsigned b) { unsigned bidx = b*idx; return bzhi64( ctou64((uint32_t *)in+(bidx>>5)) >> (bidx&0x1f), b ); }
@@ -247,12 +237,6 @@ unsigned char *bitzunpack16( const unsigned char *__restrict in, unsigned n, uin
 unsigned char *bitzunpack32( const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, uint32_t start, unsigned b);
 unsigned char *bitzunpack64( const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, uint64_t start, unsigned b);
 
-// ---------------- Xor : integrated bitpacking, for xor packed unsorted
-unsigned char *bitxunpack8(  const unsigned char *__restrict in, unsigned n, uint8_t  *__restrict out, uint8_t  start, unsigned b);
-unsigned char *bitxunpack16( const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out, uint16_t start, unsigned b);
-unsigned char *bitxunpack32( const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, uint32_t start, unsigned b);
-unsigned char *bitxunpack64( const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, uint64_t start, unsigned b);
-
 // ---------------- For : Direct Access for packed SORTED array  --------------------------------------------
 // out[i] = start + in[i] + i
 unsigned char *bitfunpack8(  const unsigned char *__restrict in, unsigned n, uint8_t  *__restrict out, uint8_t  start, unsigned b);
@@ -270,7 +254,6 @@ unsigned char *bitf1unpack64(const unsigned char *__restrict in, unsigned n, uin
 // SIMD unpack a 128/256 bitpacked integer array. Return value = end of packed buffer in
 unsigned char *bitunpack128v16(  const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out,                       unsigned b);
 unsigned char *bitzunpack128v16( const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned short start, unsigned b);
-unsigned char *bitxunpack128v16( const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned short start, unsigned b);
 unsigned char *bitdunpack128v16( const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned short start, unsigned b);
 unsigned char *bitd1unpack128v16(const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned short start, unsigned b);
 unsigned char *bitfunpack128v16( const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned short start, unsigned b);
@@ -278,7 +261,6 @@ unsigned char *bitf1unpack128v16(const unsigned char *__restrict in, unsigned n,
 
 unsigned char *bitunpack128v32(  const unsigned char *__restrict in, unsigned n, unsigned *__restrict out,                 unsigned b);
 unsigned char *bitzunpack128v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b);
-unsigned char *bitxunpack128v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b);
 unsigned char *bitdunpack128v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b);
 unsigned char *bitd1unpack128v32(const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b);
 unsigned char *bitfunpack128v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b);
@@ -289,7 +271,6 @@ unsigned char *bitunpack128v64(  const unsigned char *__restrict in, unsigned n,
 
 unsigned char *bitunpack256v32(  const unsigned char *__restrict in, unsigned n, unsigned *__restrict out,                 unsigned b);
 unsigned char *bitzunpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b);
-unsigned char *bitxunpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b);
 unsigned char *bitdunpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b);
 unsigned char *bitd1unpack256v32(const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b);
 unsigned char *bitfunpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b);
@@ -325,4 +306,5 @@ unsigned char *_bitzunpack256v32( const unsigned char *__restrict in, unsigned n
 #ifdef __cplusplus
 }
 #endif
+#endif
 
diff --git a/src/ext/for/bitpack_.h b/src/ext/for/bitpack_.h
index 6480df87..d20cf9f0 100644
--- a/src/ext/for/bitpack_.h
+++ b/src/ext/for/bitpack_.h
@@ -1,6 +1,6 @@
 /**
-  Copyright (C) powturbo 2013-2023
-  SPDX-License-Identifier: GPL v2 License
+  Copyright (C) powturbo 2013-2017
+  GPL v2 License
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
@@ -2258,556 +2258,543 @@
   BITBLK64_64(ip, 31, op, parm);  IPI(ip); op += 64*4/sizeof(op[0]);\
 }
 
-#define BP(_b_,_usize_) unsigned char *out_=out+PAD8(n*_b_),*op, bout[PAD8(64*_b_)]; T3(uint,_usize_,_t) bin[64],*ip,*in_=in+n, v,x; \
-  do { ip = in+32; op = out+PAD8(32*_b_); if(ip > in_) { memcpy(bin, in, (in_-in)*(_usize_/8)); in = bin; out = bout; } \
-    T2(BITPACK64_,_b_)(in, out, start); in = ip; out = op; PREFETCH(in+384,0);\
-  } while(in<in_); if(in>in_) { out -= PAD8(32*_b_); memcpy(out,bout,PAD8((in_-(in-32))*_b_)); }  return out_
-
 #ifndef DELTA
 #define USIZE 8
-unsigned char *T2(_BITPACK_,8_0)(uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { return out; }
-unsigned char *T2(_BITPACK_,8_1)(uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP(1,8);}
-unsigned char *T2(_BITPACK_,8_2)(uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP(2,8);}
-unsigned char *T2(_BITPACK_,8_3)(uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP(3,8);}
-unsigned char *T2(_BITPACK_,8_4)(uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP(4,8);}
-unsigned char *T2(_BITPACK_,8_5)(uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP(5,8);}
-unsigned char *T2(_BITPACK_,8_6)(uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP(6,8);}
-unsigned char *T2(_BITPACK_,8_7)(uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP(7,8);}
-unsigned char *T2(_BITPACK_,8_8)(uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP(8,8);}
-BITPACK_F8 T2(_BITPACK_,a8)[] = {
-  &T2(_BITPACK_,8_0),
-  &T2(_BITPACK_,8_1),
-  &T2(_BITPACK_,8_2),
-  &T2(_BITPACK_,8_3),
-  &T2(_BITPACK_,8_4),
-  &T2(_BITPACK_,8_5),
-  &T2(_BITPACK_,8_6),
-  &T2(_BITPACK_,8_7),
-  &T2(_BITPACK_,8_8)
+unsigned char *TEMPLATE2(_BITPACK_,8_0)( uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { return out; }
+unsigned char *TEMPLATE2(_BITPACK_,8_1)(uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*1); uint8_t v,x;do { BITPACK64_1( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,8_2)(uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*2); uint8_t v,x;do { BITPACK64_2( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,8_3)(uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*3); uint8_t v,x;do { BITPACK64_3( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,8_4)(uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*4); uint8_t v,x;do { BITPACK64_4( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,8_5)(uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*5); uint8_t v,x;do { BITPACK64_5( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,8_6)(uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*6); uint8_t v,x;do { BITPACK64_6( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,8_7)(uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*7); uint8_t v,x;do { BITPACK64_7( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,8_8)(uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*8); uint8_t v,x;do { BITPACK64_8( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+BITPACK_F8 TEMPLATE2(_BITPACK_,a8)[] = {
+  &TEMPLATE2(_BITPACK_,8_0),
+  &TEMPLATE2(_BITPACK_,8_1),
+  &TEMPLATE2(_BITPACK_,8_2),
+  &TEMPLATE2(_BITPACK_,8_3),
+  &TEMPLATE2(_BITPACK_,8_4),
+  &TEMPLATE2(_BITPACK_,8_5),
+  &TEMPLATE2(_BITPACK_,8_6),
+  &TEMPLATE2(_BITPACK_,8_7),
+  &TEMPLATE2(_BITPACK_,8_8)
 };
-unsigned char *T2(_BITPACK_,8)( uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out , unsigned b) { return T2(_BITPACK_,a8)[ b](in, n, out); }
-#undef USIZE
+unsigned char *TEMPLATE2(_BITPACK_,8)( uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out , unsigned b) { return TEMPLATE2(_BITPACK_,a8)[ b](in, n, out); }
 
 #define USIZE 16
-unsigned char *T2(_BITPACK_,16_0 )(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { return out; }
-unsigned char *T2(_BITPACK_,16_1 )(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP(1,16);}
-unsigned char *T2(_BITPACK_,16_2 )(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP(2,16);}
-unsigned char *T2(_BITPACK_,16_3 )(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP(3,16);}
-unsigned char *T2(_BITPACK_,16_4 )(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP(4,16);}
-unsigned char *T2(_BITPACK_,16_5 )(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP(5,16);}
-unsigned char *T2(_BITPACK_,16_6 )(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP(6,16);}
-unsigned char *T2(_BITPACK_,16_7 )(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP(7,16);}
-unsigned char *T2(_BITPACK_,16_8 )(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP(8,16);}
-unsigned char *T2(_BITPACK_,16_9 )(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP(9,16);}
-unsigned char *T2(_BITPACK_,16_10)(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP(10,16);}
-unsigned char *T2(_BITPACK_,16_11)(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP(11,16);}
-unsigned char *T2(_BITPACK_,16_12)(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP(12,16);}
-unsigned char *T2(_BITPACK_,16_13)(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP(13,16);}
-unsigned char *T2(_BITPACK_,16_14)(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP(14,16);}
-unsigned char *T2(_BITPACK_,16_15)(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP(15,16);}
-unsigned char *T2(_BITPACK_,16_16)(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP(16,16);}
-BITPACK_F16 T2(_BITPACK_,a16)[] = {
-  &T2(_BITPACK_,16_0),
-  &T2(_BITPACK_,16_1),
-  &T2(_BITPACK_,16_2),
-  &T2(_BITPACK_,16_3),
-  &T2(_BITPACK_,16_4),
-  &T2(_BITPACK_,16_5),
-  &T2(_BITPACK_,16_6),
-  &T2(_BITPACK_,16_7),
-  &T2(_BITPACK_,16_8),
-  &T2(_BITPACK_,16_9),
-  &T2(_BITPACK_,16_10),
-  &T2(_BITPACK_,16_11),
-  &T2(_BITPACK_,16_12),
-  &T2(_BITPACK_,16_13),
-  &T2(_BITPACK_,16_14),
-  &T2(_BITPACK_,16_15),
-  &T2(_BITPACK_,16_16)
+unsigned char *TEMPLATE2(_BITPACK_,16_0)( uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { return out; }
+unsigned char *TEMPLATE2(_BITPACK_,16_1)(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*1); uint16_t v,x;do { BITPACK64_1( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,16_2)(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*2); uint16_t v,x;do { BITPACK64_2( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,16_3)(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*3); uint16_t v,x;do { BITPACK64_3( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,16_4)(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*4); uint16_t v,x;do { BITPACK64_4( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,16_5)(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*5); uint16_t v,x;do { BITPACK64_5( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,16_6)(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*6); uint16_t v,x;do { BITPACK64_6( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,16_7)(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*7); uint16_t v,x;do { BITPACK64_7( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,16_8)(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*8); uint16_t v,x;do { BITPACK64_8( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,16_9)(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*9); uint16_t v,x;do { BITPACK64_9( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,16_10)(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*10); uint16_t v,x;do { BITPACK64_10( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,16_11)(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*11); uint16_t v,x;do { BITPACK64_11( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,16_12)(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*12); uint16_t v,x;do { BITPACK64_12( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,16_13)(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*13); uint16_t v,x;do { BITPACK64_13( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,16_14)(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*14); uint16_t v,x;do { BITPACK64_14( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,16_15)(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*15); uint16_t v,x;do { BITPACK64_15( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,16_16)(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*16); uint16_t v,x;do { BITPACK64_16( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+BITPACK_F16 TEMPLATE2(_BITPACK_,a16)[] = {
+  &TEMPLATE2(_BITPACK_,16_0),
+  &TEMPLATE2(_BITPACK_,16_1),
+  &TEMPLATE2(_BITPACK_,16_2),
+  &TEMPLATE2(_BITPACK_,16_3),
+  &TEMPLATE2(_BITPACK_,16_4),
+  &TEMPLATE2(_BITPACK_,16_5),
+  &TEMPLATE2(_BITPACK_,16_6),
+  &TEMPLATE2(_BITPACK_,16_7),
+  &TEMPLATE2(_BITPACK_,16_8),
+  &TEMPLATE2(_BITPACK_,16_9),
+  &TEMPLATE2(_BITPACK_,16_10),
+  &TEMPLATE2(_BITPACK_,16_11),
+  &TEMPLATE2(_BITPACK_,16_12),
+  &TEMPLATE2(_BITPACK_,16_13),
+  &TEMPLATE2(_BITPACK_,16_14),
+  &TEMPLATE2(_BITPACK_,16_15),
+  &TEMPLATE2(_BITPACK_,16_16)
 };
-unsigned char *T2(_BITPACK_,16)( uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out , unsigned b) { return T2(_BITPACK_,a16)[ b](in, n, out); }
-#undef USIZE
+unsigned char *TEMPLATE2(_BITPACK_,16)( uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out , unsigned b) { return TEMPLATE2(_BITPACK_,a16)[ b](in, n, out); }
 
 #define USIZE 32
-unsigned char *T2(_BITPACK_,32_0 )( uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { return out; }
-unsigned char *T2(_BITPACK_,32_1 )(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP( 1,32);}
-unsigned char *T2(_BITPACK_,32_2 )(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP( 2,32);}
-unsigned char *T2(_BITPACK_,32_3 )(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP( 3,32);}
-unsigned char *T2(_BITPACK_,32_4 )(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP( 4,32);}
-unsigned char *T2(_BITPACK_,32_5 )(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP( 5,32);}
-unsigned char *T2(_BITPACK_,32_6 )(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP( 6,32);}
-unsigned char *T2(_BITPACK_,32_7 )(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP( 7,32);}
-unsigned char *T2(_BITPACK_,32_8 )(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP( 8,32);}
-unsigned char *T2(_BITPACK_,32_9 )(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP( 9,32);}
-unsigned char *T2(_BITPACK_,32_10)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP(10,32);}
-unsigned char *T2(_BITPACK_,32_11)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP(11,32);}
-unsigned char *T2(_BITPACK_,32_12)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP(12,32);}
-unsigned char *T2(_BITPACK_,32_13)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP(13,32);}
-unsigned char *T2(_BITPACK_,32_14)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP(14,32);}
-unsigned char *T2(_BITPACK_,32_15)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP(15,32);}
-unsigned char *T2(_BITPACK_,32_16)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP(16,32);}
-unsigned char *T2(_BITPACK_,32_17)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP(17,32);}
-unsigned char *T2(_BITPACK_,32_18)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP(18,32);}
-unsigned char *T2(_BITPACK_,32_19)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP(19,32);}
-unsigned char *T2(_BITPACK_,32_20)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP(20,32);}
-unsigned char *T2(_BITPACK_,32_21)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP(21,32);}
-unsigned char *T2(_BITPACK_,32_22)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP(22,32);}
-unsigned char *T2(_BITPACK_,32_23)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP(23,32);}
-unsigned char *T2(_BITPACK_,32_24)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP(24,32);}
-unsigned char *T2(_BITPACK_,32_25)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP(25,32);}
-unsigned char *T2(_BITPACK_,32_26)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP(26,32);}
-unsigned char *T2(_BITPACK_,32_27)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP(27,32);}
-unsigned char *T2(_BITPACK_,32_28)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP(28,32);}
-unsigned char *T2(_BITPACK_,32_29)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP(29,32);}
-unsigned char *T2(_BITPACK_,32_30)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP(30,32);}
-unsigned char *T2(_BITPACK_,32_31)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP(31,32);}
-unsigned char *T2(_BITPACK_,32_32)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP(32,32);}
-BITPACK_F32 T2(_BITPACK_,a32)[] = {
-  &T2(_BITPACK_,32_0),
-  &T2(_BITPACK_,32_1),
-  &T2(_BITPACK_,32_2),
-  &T2(_BITPACK_,32_3),
-  &T2(_BITPACK_,32_4),
-  &T2(_BITPACK_,32_5),
-  &T2(_BITPACK_,32_6),
-  &T2(_BITPACK_,32_7),
-  &T2(_BITPACK_,32_8),
-  &T2(_BITPACK_,32_9),
-  &T2(_BITPACK_,32_10),
-  &T2(_BITPACK_,32_11),
-  &T2(_BITPACK_,32_12),
-  &T2(_BITPACK_,32_13),
-  &T2(_BITPACK_,32_14),
-  &T2(_BITPACK_,32_15),
-  &T2(_BITPACK_,32_16),
-  &T2(_BITPACK_,32_17),
-  &T2(_BITPACK_,32_18),
-  &T2(_BITPACK_,32_19),
-  &T2(_BITPACK_,32_20),
-  &T2(_BITPACK_,32_21),
-  &T2(_BITPACK_,32_22),
-  &T2(_BITPACK_,32_23),
-  &T2(_BITPACK_,32_24),
-  &T2(_BITPACK_,32_25),
-  &T2(_BITPACK_,32_26),
-  &T2(_BITPACK_,32_27),
-  &T2(_BITPACK_,32_28),
-  &T2(_BITPACK_,32_29),
-  &T2(_BITPACK_,32_30),
-  &T2(_BITPACK_,32_31),
-  &T2(_BITPACK_,32_32)
+unsigned char *TEMPLATE2(_BITPACK_,32_0)( uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { return out; }
+unsigned char *TEMPLATE2(_BITPACK_,32_1)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*1); uint32_t v,x;do { BITPACK64_1( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,32_2)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*2); uint32_t v,x;do { BITPACK64_2( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,32_3)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*3); uint32_t v,x;do { BITPACK64_3( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,32_4)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*4); uint32_t v,x;do { BITPACK64_4( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,32_5)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*5); uint32_t v,x;do { BITPACK64_5( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,32_6)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*6); uint32_t v,x;do { BITPACK64_6( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,32_7)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*7); uint32_t v,x;do { BITPACK64_7( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,32_8)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*8); uint32_t v,x;do { BITPACK64_8( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,32_9)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*9); uint32_t v,x;do { BITPACK64_9( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,32_10)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*10); uint32_t v,x;do { BITPACK64_10( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,32_11)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*11); uint32_t v,x;do { BITPACK64_11( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,32_12)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*12); uint32_t v,x;do { BITPACK64_12( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,32_13)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*13); uint32_t v,x;do { BITPACK64_13( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,32_14)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*14); uint32_t v,x;do { BITPACK64_14( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } // Runs BITPACK64_14 on the uint32_t input at least once (do/while), repeating while out<out_ where out_=out+PAD8(n*14); returns out_. NOTE(review): v, x and 'start' have no other visible use - presumably consumed by the macro; confirm.
+unsigned char *TEMPLATE2(_BITPACK_,32_15)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*15); uint32_t v,x;do { BITPACK64_15( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } // Runs BITPACK64_15 on the uint32_t input at least once (do/while), repeating while out<out_ where out_=out+PAD8(n*15); returns out_. NOTE(review): v, x and 'start' have no other visible use - presumably consumed by the macro; confirm.
+unsigned char *TEMPLATE2(_BITPACK_,32_16)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*16); uint32_t v,x;do { BITPACK64_16( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } // Runs BITPACK64_16 on the uint32_t input at least once (do/while), repeating while out<out_ where out_=out+PAD8(n*16); returns out_. NOTE(review): v, x and 'start' have no other visible use - presumably consumed by the macro; confirm.
+unsigned char *TEMPLATE2(_BITPACK_,32_17)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*17); uint32_t v,x;do { BITPACK64_17( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } // Runs BITPACK64_17 on the uint32_t input at least once (do/while), repeating while out<out_ where out_=out+PAD8(n*17); returns out_. NOTE(review): v, x and 'start' have no other visible use - presumably consumed by the macro; confirm.
+unsigned char *TEMPLATE2(_BITPACK_,32_18)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*18); uint32_t v,x;do { BITPACK64_18( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } // Runs BITPACK64_18 on the uint32_t input at least once (do/while), repeating while out<out_ where out_=out+PAD8(n*18); returns out_. NOTE(review): v, x and 'start' have no other visible use - presumably consumed by the macro; confirm.
+unsigned char *TEMPLATE2(_BITPACK_,32_19)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*19); uint32_t v,x;do { BITPACK64_19( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } // Runs BITPACK64_19 on the uint32_t input at least once (do/while), repeating while out<out_ where out_=out+PAD8(n*19); returns out_. NOTE(review): v, x and 'start' have no other visible use - presumably consumed by the macro; confirm.
+unsigned char *TEMPLATE2(_BITPACK_,32_20)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*20); uint32_t v,x;do { BITPACK64_20( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } // Runs BITPACK64_20 on the uint32_t input at least once (do/while), repeating while out<out_ where out_=out+PAD8(n*20); returns out_. NOTE(review): v, x and 'start' have no other visible use - presumably consumed by the macro; confirm.
+unsigned char *TEMPLATE2(_BITPACK_,32_21)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*21); uint32_t v,x;do { BITPACK64_21( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } // Runs BITPACK64_21 on the uint32_t input at least once (do/while), repeating while out<out_ where out_=out+PAD8(n*21); returns out_. NOTE(review): v, x and 'start' have no other visible use - presumably consumed by the macro; confirm.
+unsigned char *TEMPLATE2(_BITPACK_,32_22)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*22); uint32_t v,x;do { BITPACK64_22( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } // Runs BITPACK64_22 on the uint32_t input at least once (do/while), repeating while out<out_ where out_=out+PAD8(n*22); returns out_. NOTE(review): v, x and 'start' have no other visible use - presumably consumed by the macro; confirm.
+unsigned char *TEMPLATE2(_BITPACK_,32_23)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*23); uint32_t v,x;do { BITPACK64_23( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } // Runs BITPACK64_23 on the uint32_t input at least once (do/while), repeating while out<out_ where out_=out+PAD8(n*23); returns out_. NOTE(review): v, x and 'start' have no other visible use - presumably consumed by the macro; confirm.
+unsigned char *TEMPLATE2(_BITPACK_,32_24)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*24); uint32_t v,x;do { BITPACK64_24( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } // Runs BITPACK64_24 on the uint32_t input at least once (do/while), repeating while out<out_ where out_=out+PAD8(n*24); returns out_. NOTE(review): v, x and 'start' have no other visible use - presumably consumed by the macro; confirm.
+unsigned char *TEMPLATE2(_BITPACK_,32_25)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*25); uint32_t v,x;do { BITPACK64_25( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } // Runs BITPACK64_25 on the uint32_t input at least once (do/while), repeating while out<out_ where out_=out+PAD8(n*25); returns out_. NOTE(review): v, x and 'start' have no other visible use - presumably consumed by the macro; confirm.
+unsigned char *TEMPLATE2(_BITPACK_,32_26)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*26); uint32_t v,x;do { BITPACK64_26( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } // Runs BITPACK64_26 on the uint32_t input at least once (do/while), repeating while out<out_ where out_=out+PAD8(n*26); returns out_. NOTE(review): v, x and 'start' have no other visible use - presumably consumed by the macro; confirm.
+unsigned char *TEMPLATE2(_BITPACK_,32_27)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*27); uint32_t v,x;do { BITPACK64_27( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } // Runs BITPACK64_27 on the uint32_t input at least once (do/while), repeating while out<out_ where out_=out+PAD8(n*27); returns out_. NOTE(review): v, x and 'start' have no other visible use - presumably consumed by the macro; confirm.
+unsigned char *TEMPLATE2(_BITPACK_,32_28)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*28); uint32_t v,x;do { BITPACK64_28( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } // Runs BITPACK64_28 on the uint32_t input at least once (do/while), repeating while out<out_ where out_=out+PAD8(n*28); returns out_. NOTE(review): v, x and 'start' have no other visible use - presumably consumed by the macro; confirm.
+unsigned char *TEMPLATE2(_BITPACK_,32_29)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*29); uint32_t v,x;do { BITPACK64_29( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } // Runs BITPACK64_29 on the uint32_t input at least once (do/while), repeating while out<out_ where out_=out+PAD8(n*29); returns out_. NOTE(review): v, x and 'start' have no other visible use - presumably consumed by the macro; confirm.
+unsigned char *TEMPLATE2(_BITPACK_,32_30)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*30); uint32_t v,x;do { BITPACK64_30( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } // Runs BITPACK64_30 on the uint32_t input at least once (do/while), repeating while out<out_ where out_=out+PAD8(n*30); returns out_. NOTE(review): v, x and 'start' have no other visible use - presumably consumed by the macro; confirm.
+unsigned char *TEMPLATE2(_BITPACK_,32_31)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*31); uint32_t v,x;do { BITPACK64_31( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } // Runs BITPACK64_31 on the uint32_t input at least once (do/while), repeating while out<out_ where out_=out+PAD8(n*31); returns out_. NOTE(review): v, x and 'start' have no other visible use - presumably consumed by the macro; confirm.
+unsigned char *TEMPLATE2(_BITPACK_,32_32)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*32); uint32_t v,x;do { BITPACK64_32( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } // Runs BITPACK64_32 on the uint32_t input at least once (do/while), repeating while out<out_ where out_=out+PAD8(n*32); returns out_. NOTE(review): v, x and 'start' have no other visible use - presumably consumed by the macro; confirm.
+BITPACK_F32 TEMPLATE2(_BITPACK_,a32)[] = { // Table of the 32-bit pack routines in ascending suffix order: entry i holds the address of the 32_i function (i = 0..32, 33 entries).
+  &TEMPLATE2(_BITPACK_,32_0),
+  &TEMPLATE2(_BITPACK_,32_1),
+  &TEMPLATE2(_BITPACK_,32_2),
+  &TEMPLATE2(_BITPACK_,32_3),
+  &TEMPLATE2(_BITPACK_,32_4),
+  &TEMPLATE2(_BITPACK_,32_5),
+  &TEMPLATE2(_BITPACK_,32_6),
+  &TEMPLATE2(_BITPACK_,32_7),
+  &TEMPLATE2(_BITPACK_,32_8),
+  &TEMPLATE2(_BITPACK_,32_9),
+  &TEMPLATE2(_BITPACK_,32_10),
+  &TEMPLATE2(_BITPACK_,32_11),
+  &TEMPLATE2(_BITPACK_,32_12),
+  &TEMPLATE2(_BITPACK_,32_13),
+  &TEMPLATE2(_BITPACK_,32_14),
+  &TEMPLATE2(_BITPACK_,32_15),
+  &TEMPLATE2(_BITPACK_,32_16),
+  &TEMPLATE2(_BITPACK_,32_17),
+  &TEMPLATE2(_BITPACK_,32_18),
+  &TEMPLATE2(_BITPACK_,32_19),
+  &TEMPLATE2(_BITPACK_,32_20),
+  &TEMPLATE2(_BITPACK_,32_21),
+  &TEMPLATE2(_BITPACK_,32_22),
+  &TEMPLATE2(_BITPACK_,32_23),
+  &TEMPLATE2(_BITPACK_,32_24),
+  &TEMPLATE2(_BITPACK_,32_25),
+  &TEMPLATE2(_BITPACK_,32_26),
+  &TEMPLATE2(_BITPACK_,32_27),
+  &TEMPLATE2(_BITPACK_,32_28),
+  &TEMPLATE2(_BITPACK_,32_29),
+  &TEMPLATE2(_BITPACK_,32_30),
+  &TEMPLATE2(_BITPACK_,32_31),
+  &TEMPLATE2(_BITPACK_,32_32)
 };
-unsigned char *T2(_BITPACK_,32)( uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , unsigned b) { return T2(_BITPACK_,a32)[ b](in, n, out); }
-#undef USIZE
+unsigned char *TEMPLATE2(_BITPACK_,32)( uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , unsigned b) { return TEMPLATE2(_BITPACK_,a32)[ b](in, n, out); } // Dispatcher: calls entry b of the a32 table with (in, n, out) and returns its result. b is not range-checked here; the table holds entries 0..32.
 
 #define USIZE 64
-unsigned char *T2(_BITPACK_,64_0 )(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { return out; }
-unsigned char *T2(_BITPACK_,64_1 )(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP( 1,64);}
-unsigned char *T2(_BITPACK_,64_2 )(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP( 2,64);}
-unsigned char *T2(_BITPACK_,64_3 )(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP( 3,64);}
-unsigned char *T2(_BITPACK_,64_4 )(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP( 4,64);}
-unsigned char *T2(_BITPACK_,64_5 )(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP( 5,64);}
-unsigned char *T2(_BITPACK_,64_6 )(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP( 6,64);}
-unsigned char *T2(_BITPACK_,64_7 )(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP( 7,64);}
-unsigned char *T2(_BITPACK_,64_8 )(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP( 8,64);}
-unsigned char *T2(_BITPACK_,64_9 )(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP( 9,64);}
-unsigned char *T2(_BITPACK_,64_10)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP(10,64);}
-unsigned char *T2(_BITPACK_,64_11)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP(11,64);}
-unsigned char *T2(_BITPACK_,64_12)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP(12,64);}
-unsigned char *T2(_BITPACK_,64_13)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP(13,64);}
-unsigned char *T2(_BITPACK_,64_14)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP(14,64);}
-unsigned char *T2(_BITPACK_,64_15)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP(15,64);}
-unsigned char *T2(_BITPACK_,64_16)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP(16,64);}
-unsigned char *T2(_BITPACK_,64_17)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP(17,64);}
-unsigned char *T2(_BITPACK_,64_18)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP(18,64);}
-unsigned char *T2(_BITPACK_,64_19)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP(19,64);}
-unsigned char *T2(_BITPACK_,64_20)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP(20,64);}
-unsigned char *T2(_BITPACK_,64_21)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP(21,64);}
-unsigned char *T2(_BITPACK_,64_22)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP(22,64);}
-unsigned char *T2(_BITPACK_,64_23)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP(23,64);}
-unsigned char *T2(_BITPACK_,64_24)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP(24,64);}
-unsigned char *T2(_BITPACK_,64_25)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP(25,64);}
-unsigned char *T2(_BITPACK_,64_26)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP(26,64);}
-unsigned char *T2(_BITPACK_,64_27)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP(27,64);}
-unsigned char *T2(_BITPACK_,64_28)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP(28,64);}
-unsigned char *T2(_BITPACK_,64_29)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP(29,64);}
-unsigned char *T2(_BITPACK_,64_30)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP(30,64);}
-unsigned char *T2(_BITPACK_,64_31)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP(31,64);}
-unsigned char *T2(_BITPACK_,64_32)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP(32,64);}
-unsigned char *T2(_BITPACK_,64_33)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP(33,64);}
-unsigned char *T2(_BITPACK_,64_34)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP(34,64);}
-unsigned char *T2(_BITPACK_,64_35)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP(35,64);}
-unsigned char *T2(_BITPACK_,64_36)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP(36,64);}
-unsigned char *T2(_BITPACK_,64_37)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP(37,64);}
-unsigned char *T2(_BITPACK_,64_38)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP(38,64);}
-unsigned char *T2(_BITPACK_,64_39)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP(39,64);}
-unsigned char *T2(_BITPACK_,64_40)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP(40,64);}
-unsigned char *T2(_BITPACK_,64_41)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP(41,64);}
-unsigned char *T2(_BITPACK_,64_42)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP(42,64);}
-unsigned char *T2(_BITPACK_,64_43)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP(43,64);}
-unsigned char *T2(_BITPACK_,64_44)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP(44,64);}
-unsigned char *T2(_BITPACK_,64_45)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP(45,64);}
-unsigned char *T2(_BITPACK_,64_46)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP(46,64);}
-unsigned char *T2(_BITPACK_,64_47)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP(47,64);}
-unsigned char *T2(_BITPACK_,64_48)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP(48,64);}
-unsigned char *T2(_BITPACK_,64_49)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP(49,64);}
-unsigned char *T2(_BITPACK_,64_50)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP(50,64);}
-unsigned char *T2(_BITPACK_,64_51)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP(51,64);}
-unsigned char *T2(_BITPACK_,64_52)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP(52,64);}
-unsigned char *T2(_BITPACK_,64_53)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP(53,64);}
-unsigned char *T2(_BITPACK_,64_54)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP(54,64);}
-unsigned char *T2(_BITPACK_,64_55)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP(55,64);}
-unsigned char *T2(_BITPACK_,64_56)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP(56,64);}
-unsigned char *T2(_BITPACK_,64_57)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP(57,64);}
-unsigned char *T2(_BITPACK_,64_58)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP(58,64);}
-unsigned char *T2(_BITPACK_,64_59)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP(59,64);}
-unsigned char *T2(_BITPACK_,64_60)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP(60,64);}
-unsigned char *T2(_BITPACK_,64_61)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP(61,64);}
-unsigned char *T2(_BITPACK_,64_62)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP(62,64);}
-unsigned char *T2(_BITPACK_,64_63)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP(63,64);}
-unsigned char *T2(_BITPACK_,64_64)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { BP(64,64);}
-BITPACK_F64 T2(_BITPACK_,a64)[] = {
-  &T2(_BITPACK_,64_0),
-  &T2(_BITPACK_,64_1),
-  &T2(_BITPACK_,64_2),
-  &T2(_BITPACK_,64_3),
-  &T2(_BITPACK_,64_4),
-  &T2(_BITPACK_,64_5),
-  &T2(_BITPACK_,64_6),
-  &T2(_BITPACK_,64_7),
-  &T2(_BITPACK_,64_8),
-  &T2(_BITPACK_,64_9),
-  &T2(_BITPACK_,64_10),
-  &T2(_BITPACK_,64_11),
-  &T2(_BITPACK_,64_12),
-  &T2(_BITPACK_,64_13),
-  &T2(_BITPACK_,64_14),
-  &T2(_BITPACK_,64_15),
-  &T2(_BITPACK_,64_16),
-  &T2(_BITPACK_,64_17),
-  &T2(_BITPACK_,64_18),
-  &T2(_BITPACK_,64_19),
-  &T2(_BITPACK_,64_20),
-  &T2(_BITPACK_,64_21),
-  &T2(_BITPACK_,64_22),
-  &T2(_BITPACK_,64_23),
-  &T2(_BITPACK_,64_24),
-  &T2(_BITPACK_,64_25),
-  &T2(_BITPACK_,64_26),
-  &T2(_BITPACK_,64_27),
-  &T2(_BITPACK_,64_28),
-  &T2(_BITPACK_,64_29),
-  &T2(_BITPACK_,64_30),
-  &T2(_BITPACK_,64_31),
-  &T2(_BITPACK_,64_32),
-  &T2(_BITPACK_,64_33),
-  &T2(_BITPACK_,64_34),
-  &T2(_BITPACK_,64_35),
-  &T2(_BITPACK_,64_36),
-  &T2(_BITPACK_,64_37),
-  &T2(_BITPACK_,64_38),
-  &T2(_BITPACK_,64_39),
-  &T2(_BITPACK_,64_40),
-  &T2(_BITPACK_,64_41),
-  &T2(_BITPACK_,64_42),
-  &T2(_BITPACK_,64_43),
-  &T2(_BITPACK_,64_44),
-  &T2(_BITPACK_,64_45),
-  &T2(_BITPACK_,64_46),
-  &T2(_BITPACK_,64_47),
-  &T2(_BITPACK_,64_48),
-  &T2(_BITPACK_,64_49),
-  &T2(_BITPACK_,64_50),
-  &T2(_BITPACK_,64_51),
-  &T2(_BITPACK_,64_52),
-  &T2(_BITPACK_,64_53),
-  &T2(_BITPACK_,64_54),
-  &T2(_BITPACK_,64_55),
-  &T2(_BITPACK_,64_56),
-  &T2(_BITPACK_,64_57),
-  &T2(_BITPACK_,64_58),
-  &T2(_BITPACK_,64_59),
-  &T2(_BITPACK_,64_60),
-  &T2(_BITPACK_,64_61),
-  &T2(_BITPACK_,64_62),
-  &T2(_BITPACK_,64_63),
-  &T2(_BITPACK_,64_64)
+unsigned char *TEMPLATE2(_BITPACK_,64_0)( uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { return out; } // Zero-suffix case: reads neither in nor n and returns out unchanged. NOTE(review): out is const-qualified but the return type is not, so the qualifier is dropped here.
+unsigned char *TEMPLATE2(_BITPACK_,64_1)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*1); uint64_t v,x;do { BITPACK64_1( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } // Runs BITPACK64_1 on the uint64_t input at least once (do/while), repeating while out<out_ where out_=out+PAD8(n*1); returns out_. NOTE(review): v, x and 'start' have no other visible use - presumably consumed by the macro; confirm.
+unsigned char *TEMPLATE2(_BITPACK_,64_2)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*2); uint64_t v,x;do { BITPACK64_2( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } // Runs BITPACK64_2 on the uint64_t input at least once (do/while), repeating while out<out_ where out_=out+PAD8(n*2); returns out_. NOTE(review): v, x and 'start' have no other visible use - presumably consumed by the macro; confirm.
+unsigned char *TEMPLATE2(_BITPACK_,64_3)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*3); uint64_t v,x;do { BITPACK64_3( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } // Runs BITPACK64_3 on the uint64_t input at least once (do/while), repeating while out<out_ where out_=out+PAD8(n*3); returns out_. NOTE(review): v, x and 'start' have no other visible use - presumably consumed by the macro; confirm.
+unsigned char *TEMPLATE2(_BITPACK_,64_4)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*4); uint64_t v,x;do { BITPACK64_4( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } // Runs BITPACK64_4 on the uint64_t input at least once (do/while), repeating while out<out_ where out_=out+PAD8(n*4); returns out_. NOTE(review): v, x and 'start' have no other visible use - presumably consumed by the macro; confirm.
+unsigned char *TEMPLATE2(_BITPACK_,64_5)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*5); uint64_t v,x;do { BITPACK64_5( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } // Runs BITPACK64_5 on the uint64_t input at least once (do/while), repeating while out<out_ where out_=out+PAD8(n*5); returns out_. NOTE(review): v, x and 'start' have no other visible use - presumably consumed by the macro; confirm.
+unsigned char *TEMPLATE2(_BITPACK_,64_6)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*6); uint64_t v,x;do { BITPACK64_6( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } // Runs BITPACK64_6 on the uint64_t input at least once (do/while), repeating while out<out_ where out_=out+PAD8(n*6); returns out_. NOTE(review): v, x and 'start' have no other visible use - presumably consumed by the macro; confirm.
+unsigned char *TEMPLATE2(_BITPACK_,64_7)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*7); uint64_t v,x;do { BITPACK64_7( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } // Runs BITPACK64_7 on the uint64_t input at least once (do/while), repeating while out<out_ where out_=out+PAD8(n*7); returns out_. NOTE(review): v, x and 'start' have no other visible use - presumably consumed by the macro; confirm.
+unsigned char *TEMPLATE2(_BITPACK_,64_8)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*8); uint64_t v,x;do { BITPACK64_8( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } // Runs BITPACK64_8 on the uint64_t input at least once (do/while), repeating while out<out_ where out_=out+PAD8(n*8); returns out_. NOTE(review): v, x and 'start' have no other visible use - presumably consumed by the macro; confirm.
+unsigned char *TEMPLATE2(_BITPACK_,64_9)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*9); uint64_t v,x;do { BITPACK64_9( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } // Runs BITPACK64_9 on the uint64_t input at least once (do/while), repeating while out<out_ where out_=out+PAD8(n*9); returns out_. NOTE(review): v, x and 'start' have no other visible use - presumably consumed by the macro; confirm.
+unsigned char *TEMPLATE2(_BITPACK_,64_10)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*10); uint64_t v,x;do { BITPACK64_10( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } // Runs BITPACK64_10 on the uint64_t input at least once (do/while), repeating while out<out_ where out_=out+PAD8(n*10); returns out_. NOTE(review): v, x and 'start' have no other visible use - presumably consumed by the macro; confirm.
+unsigned char *TEMPLATE2(_BITPACK_,64_11)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*11); uint64_t v,x;do { BITPACK64_11( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } // Runs BITPACK64_11 on the uint64_t input at least once (do/while), repeating while out<out_ where out_=out+PAD8(n*11); returns out_. NOTE(review): v, x and 'start' have no other visible use - presumably consumed by the macro; confirm.
+unsigned char *TEMPLATE2(_BITPACK_,64_12)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*12); uint64_t v,x;do { BITPACK64_12( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } // Runs BITPACK64_12 on the uint64_t input at least once (do/while), repeating while out<out_ where out_=out+PAD8(n*12); returns out_. NOTE(review): v, x and 'start' have no other visible use - presumably consumed by the macro; confirm.
+unsigned char *TEMPLATE2(_BITPACK_,64_13)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*13); uint64_t v,x;do { BITPACK64_13( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } // Runs BITPACK64_13 on the uint64_t input at least once (do/while), repeating while out<out_ where out_=out+PAD8(n*13); returns out_. NOTE(review): v, x and 'start' have no other visible use - presumably consumed by the macro; confirm.
+unsigned char *TEMPLATE2(_BITPACK_,64_14)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*14); uint64_t v,x;do { BITPACK64_14( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } // Runs BITPACK64_14 on the uint64_t input at least once (do/while), repeating while out<out_ where out_=out+PAD8(n*14); returns out_. NOTE(review): v, x and 'start' have no other visible use - presumably consumed by the macro; confirm.
+unsigned char *TEMPLATE2(_BITPACK_,64_15)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*15); uint64_t v,x;do { BITPACK64_15( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } // Runs BITPACK64_15 on the uint64_t input at least once (do/while), repeating while out<out_ where out_=out+PAD8(n*15); returns out_. NOTE(review): v, x and 'start' have no other visible use - presumably consumed by the macro; confirm.
+unsigned char *TEMPLATE2(_BITPACK_,64_16)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*16); uint64_t v,x;do { BITPACK64_16( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } // Runs BITPACK64_16 on the uint64_t input at least once (do/while), repeating while out<out_ where out_=out+PAD8(n*16); returns out_. NOTE(review): v, x and 'start' have no other visible use - presumably consumed by the macro; confirm.
+unsigned char *TEMPLATE2(_BITPACK_,64_17)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*17); uint64_t v,x;do { BITPACK64_17( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } // Runs BITPACK64_17 on the uint64_t input at least once (do/while), repeating while out<out_ where out_=out+PAD8(n*17); returns out_. NOTE(review): v, x and 'start' have no other visible use - presumably consumed by the macro; confirm.
+unsigned char *TEMPLATE2(_BITPACK_,64_18)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*18); uint64_t v,x;do { BITPACK64_18( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } // Runs BITPACK64_18 on the uint64_t input at least once (do/while), repeating while out<out_ where out_=out+PAD8(n*18); returns out_. NOTE(review): v, x and 'start' have no other visible use - presumably consumed by the macro; confirm.
+unsigned char *TEMPLATE2(_BITPACK_,64_19)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*19); uint64_t v,x;do { BITPACK64_19( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } // Runs BITPACK64_19 on the uint64_t input at least once (do/while), repeating while out<out_ where out_=out+PAD8(n*19); returns out_. NOTE(review): v, x and 'start' have no other visible use - presumably consumed by the macro; confirm.
+unsigned char *TEMPLATE2(_BITPACK_,64_20)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*20); uint64_t v,x;do { BITPACK64_20( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } // Runs BITPACK64_20 on the uint64_t input at least once (do/while), repeating while out<out_ where out_=out+PAD8(n*20); returns out_. NOTE(review): v, x and 'start' have no other visible use - presumably consumed by the macro; confirm.
+unsigned char *TEMPLATE2(_BITPACK_,64_21)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*21); uint64_t v,x;do { BITPACK64_21( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } // Runs BITPACK64_21 on the uint64_t input at least once (do/while), repeating while out<out_ where out_=out+PAD8(n*21); returns out_. NOTE(review): v, x and 'start' have no other visible use - presumably consumed by the macro; confirm.
+unsigned char *TEMPLATE2(_BITPACK_,64_22)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*22); uint64_t v,x;do { BITPACK64_22( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } // Runs BITPACK64_22 on the uint64_t input at least once (do/while), repeating while out<out_ where out_=out+PAD8(n*22); returns out_. NOTE(review): v, x and 'start' have no other visible use - presumably consumed by the macro; confirm.
+unsigned char *TEMPLATE2(_BITPACK_,64_23)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*23); uint64_t v,x;do { BITPACK64_23( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } // Runs BITPACK64_23 on the uint64_t input at least once (do/while), repeating while out<out_ where out_=out+PAD8(n*23); returns out_. NOTE(review): v, x and 'start' have no other visible use - presumably consumed by the macro; confirm.
+unsigned char *TEMPLATE2(_BITPACK_,64_24)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*24); uint64_t v,x;do { BITPACK64_24( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } // Runs BITPACK64_24 on the uint64_t input at least once (do/while), repeating while out<out_ where out_=out+PAD8(n*24); returns out_. NOTE(review): v, x and 'start' have no other visible use - presumably consumed by the macro; confirm.
+unsigned char *TEMPLATE2(_BITPACK_,64_25)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*25); uint64_t v,x;do { BITPACK64_25( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } // Runs BITPACK64_25 on the uint64_t input at least once (do/while), repeating while out<out_ where out_=out+PAD8(n*25); returns out_. NOTE(review): v, x and 'start' have no other visible use - presumably consumed by the macro; confirm.
+unsigned char *TEMPLATE2(_BITPACK_,64_26)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*26); uint64_t v,x;do { BITPACK64_26( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } // Runs BITPACK64_26 on the uint64_t input at least once (do/while), repeating while out<out_ where out_=out+PAD8(n*26); returns out_. NOTE(review): v, x and 'start' have no other visible use - presumably consumed by the macro; confirm.
+unsigned char *TEMPLATE2(_BITPACK_,64_27)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*27); uint64_t v,x;do { BITPACK64_27( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } // Runs BITPACK64_27 on the uint64_t input at least once (do/while), repeating while out<out_ where out_=out+PAD8(n*27); returns out_. NOTE(review): v, x and 'start' have no other visible use - presumably consumed by the macro; confirm.
+unsigned char *TEMPLATE2(_BITPACK_,64_28)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*28); uint64_t v,x;do { BITPACK64_28( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } // Runs BITPACK64_28 on the uint64_t input at least once (do/while), repeating while out<out_ where out_=out+PAD8(n*28); returns out_. NOTE(review): v, x and 'start' have no other visible use - presumably consumed by the macro; confirm.
+unsigned char *TEMPLATE2(_BITPACK_,64_29)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*29); uint64_t v,x;do { BITPACK64_29( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } // Runs BITPACK64_29 on the uint64_t input at least once (do/while), repeating while out<out_ where out_=out+PAD8(n*29); returns out_. NOTE(review): v, x and 'start' have no other visible use - presumably consumed by the macro; confirm.
+unsigned char *TEMPLATE2(_BITPACK_,64_30)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*30); uint64_t v,x;do { BITPACK64_30( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } // Runs BITPACK64_30 on the uint64_t input at least once (do/while), repeating while out<out_ where out_=out+PAD8(n*30); returns out_. NOTE(review): v, x and 'start' have no other visible use - presumably consumed by the macro; confirm.
+unsigned char *TEMPLATE2(_BITPACK_,64_31)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*31); uint64_t v,x;do { BITPACK64_31( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } // Runs BITPACK64_31 on the uint64_t input at least once (do/while), repeating while out<out_ where out_=out+PAD8(n*31); returns out_. NOTE(review): v, x and 'start' have no other visible use - presumably consumed by the macro; confirm.
+unsigned char *TEMPLATE2(_BITPACK_,64_32)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*32); uint64_t v,x;do { BITPACK64_32( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } // Runs BITPACK64_32 on the uint64_t input at least once (do/while), repeating while out<out_ where out_=out+PAD8(n*32); returns out_. NOTE(review): v, x and 'start' have no other visible use - presumably consumed by the macro; confirm.
+unsigned char *TEMPLATE2(_BITPACK_,64_33)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*33); uint64_t v,x;do { BITPACK64_33( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } // Runs BITPACK64_33 on the uint64_t input at least once (do/while), repeating while out<out_ where out_=out+PAD8(n*33); returns out_. NOTE(review): v, x and 'start' have no other visible use - presumably consumed by the macro; confirm.
+unsigned char *TEMPLATE2(_BITPACK_,64_34)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*34); uint64_t v,x;do { BITPACK64_34( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } // Runs BITPACK64_34 on the uint64_t input at least once (do/while), repeating while out<out_ where out_=out+PAD8(n*34); returns out_. NOTE(review): v, x and 'start' have no other visible use - presumably consumed by the macro; confirm.
+unsigned char *TEMPLATE2(_BITPACK_,64_35)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*35); uint64_t v,x;do { BITPACK64_35( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } // Runs BITPACK64_35 on the uint64_t input at least once (do/while), repeating while out<out_ where out_=out+PAD8(n*35); returns out_. NOTE(review): v, x and 'start' have no other visible use - presumably consumed by the macro; confirm.
+unsigned char *TEMPLATE2(_BITPACK_,64_36)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*36); uint64_t v,x;do { BITPACK64_36( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } // Runs BITPACK64_36 on the uint64_t input at least once (do/while), repeating while out<out_ where out_=out+PAD8(n*36); returns out_. NOTE(review): v, x and 'start' have no other visible use - presumably consumed by the macro; confirm.
+unsigned char *TEMPLATE2(_BITPACK_,64_37)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*37); uint64_t v,x;do { BITPACK64_37( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } // Runs BITPACK64_37 on the uint64_t input at least once (do/while), repeating while out<out_ where out_=out+PAD8(n*37); returns out_. NOTE(review): v, x and 'start' have no other visible use - presumably consumed by the macro; confirm.
+unsigned char *TEMPLATE2(_BITPACK_,64_38)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*38); uint64_t v,x;do { BITPACK64_38( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } // Runs BITPACK64_38 on the uint64_t input at least once (do/while), repeating while out<out_ where out_=out+PAD8(n*38); returns out_. NOTE(review): v, x and 'start' have no other visible use - presumably consumed by the macro; confirm.
+unsigned char *TEMPLATE2(_BITPACK_,64_39)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*39); uint64_t v,x;do { BITPACK64_39( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } // Runs BITPACK64_39 on the uint64_t input at least once (do/while), repeating while out<out_ where out_=out+PAD8(n*39); returns out_. NOTE(review): v, x and 'start' have no other visible use - presumably consumed by the macro; confirm.
+unsigned char *TEMPLATE2(_BITPACK_,64_40)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*40); uint64_t v,x;do { BITPACK64_40( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } // Runs BITPACK64_40 on the uint64_t input at least once (do/while), repeating while out<out_ where out_=out+PAD8(n*40); returns out_. NOTE(review): v, x and 'start' have no other visible use - presumably consumed by the macro; confirm.
+unsigned char *TEMPLATE2(_BITPACK_,64_41)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*41); uint64_t v,x;do { BITPACK64_41( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } // Runs BITPACK64_41 on the uint64_t input at least once (do/while), repeating while out<out_ where out_=out+PAD8(n*41); returns out_. NOTE(review): v, x and 'start' have no other visible use - presumably consumed by the macro; confirm.
+unsigned char *TEMPLATE2(_BITPACK_,64_42)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*42); uint64_t v,x;do { BITPACK64_42( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } // Runs BITPACK64_42 on the uint64_t input at least once (do/while), repeating while out<out_ where out_=out+PAD8(n*42); returns out_. NOTE(review): v, x and 'start' have no other visible use - presumably consumed by the macro; confirm.
+unsigned char *TEMPLATE2(_BITPACK_,64_43)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*43); uint64_t v,x;do { BITPACK64_43( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } // Runs BITPACK64_43 on the uint64_t input at least once (do/while), repeating while out<out_ where out_=out+PAD8(n*43); returns out_. NOTE(review): v, x and 'start' have no other visible use - presumably consumed by the macro; confirm.
+unsigned char *TEMPLATE2(_BITPACK_,64_44)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*44); uint64_t v,x;do { BITPACK64_44( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } // Runs BITPACK64_44 on the uint64_t input at least once (do/while), repeating while out<out_ where out_=out+PAD8(n*44); returns out_. NOTE(review): v, x and 'start' have no other visible use - presumably consumed by the macro; confirm.
+unsigned char *TEMPLATE2(_BITPACK_,64_45)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*45); uint64_t v,x;do { BITPACK64_45( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } // Runs BITPACK64_45 on the uint64_t input at least once (do/while), repeating while out<out_ where out_=out+PAD8(n*45); returns out_. NOTE(review): v, x and 'start' have no other visible use - presumably consumed by the macro; confirm.
+unsigned char *TEMPLATE2(_BITPACK_,64_46)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*46); uint64_t v,x;do { BITPACK64_46( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } // Runs BITPACK64_46 on the uint64_t input at least once (do/while), repeating while out<out_ where out_=out+PAD8(n*46); returns out_. NOTE(review): v, x and 'start' have no other visible use - presumably consumed by the macro; confirm.
+unsigned char *TEMPLATE2(_BITPACK_,64_47)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*47); uint64_t v,x;do { BITPACK64_47( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } // Runs BITPACK64_47 on the uint64_t input at least once (do/while), repeating while out<out_ where out_=out+PAD8(n*47); returns out_. NOTE(review): v, x and 'start' have no other visible use - presumably consumed by the macro; confirm.
+unsigned char *TEMPLATE2(_BITPACK_,64_48)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*48); uint64_t v,x;do { BITPACK64_48( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } // Runs BITPACK64_48 on the uint64_t input at least once (do/while), repeating while out<out_ where out_=out+PAD8(n*48); returns out_. NOTE(review): v, x and 'start' have no other visible use - presumably consumed by the macro; confirm.
+unsigned char *TEMPLATE2(_BITPACK_,64_49)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*49); uint64_t v,x;do { BITPACK64_49( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } // Runs BITPACK64_49 on the uint64_t input at least once (do/while), repeating while out<out_ where out_=out+PAD8(n*49); returns out_. NOTE(review): v, x and 'start' have no other visible use - presumably consumed by the macro; confirm.
+unsigned char *TEMPLATE2(_BITPACK_,64_50)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*50); uint64_t v,x;do { BITPACK64_50( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } // Runs BITPACK64_50 on the uint64_t input at least once (do/while), repeating while out<out_ where out_=out+PAD8(n*50); returns out_. NOTE(review): v, x and 'start' have no other visible use - presumably consumed by the macro; confirm.
+unsigned char *TEMPLATE2(_BITPACK_,64_51)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*51); uint64_t v,x;do { BITPACK64_51( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } // Runs BITPACK64_51 on the uint64_t input at least once (do/while), repeating while out<out_ where out_=out+PAD8(n*51); returns out_. NOTE(review): v, x and 'start' have no other visible use - presumably consumed by the macro; confirm.
+unsigned char *TEMPLATE2(_BITPACK_,64_52)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*52); uint64_t v,x;do { BITPACK64_52( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } // Runs BITPACK64_52 on the uint64_t input at least once (do/while), repeating while out<out_ where out_=out+PAD8(n*52); returns out_. NOTE(review): v, x and 'start' have no other visible use - presumably consumed by the macro; confirm.
+unsigned char *TEMPLATE2(_BITPACK_,64_53)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*53); uint64_t v,x;do { BITPACK64_53( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } // Runs BITPACK64_53 on the uint64_t input at least once (do/while), repeating while out<out_ where out_=out+PAD8(n*53); returns out_. NOTE(review): v, x and 'start' have no other visible use - presumably consumed by the macro; confirm.
+unsigned char *TEMPLATE2(_BITPACK_,64_54)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*54); uint64_t v,x;do { BITPACK64_54( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } // Runs BITPACK64_54 on the uint64_t input at least once (do/while), repeating while out<out_ where out_=out+PAD8(n*54); returns out_. NOTE(review): v, x and 'start' have no other visible use - presumably consumed by the macro; confirm.
+unsigned char *TEMPLATE2(_BITPACK_,64_55)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*55); uint64_t v,x;do { BITPACK64_55( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } // Runs BITPACK64_55 on the uint64_t input at least once (do/while), repeating while out<out_ where out_=out+PAD8(n*55); returns out_. NOTE(review): v, x and 'start' have no other visible use - presumably consumed by the macro; confirm.
+unsigned char *TEMPLATE2(_BITPACK_,64_56)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*56); uint64_t v,x;do { BITPACK64_56( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } // Runs BITPACK64_56 on the uint64_t input at least once (do/while), repeating while out<out_ where out_=out+PAD8(n*56); returns out_. NOTE(review): v, x and 'start' have no other visible use - presumably consumed by the macro; confirm.
+unsigned char *TEMPLATE2(_BITPACK_,64_57)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*57); uint64_t v,x;do { BITPACK64_57( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } // Runs BITPACK64_57 on the uint64_t input at least once (do/while), repeating while out<out_ where out_=out+PAD8(n*57); returns out_. NOTE(review): v, x and 'start' have no other visible use - presumably consumed by the macro; confirm.
+unsigned char *TEMPLATE2(_BITPACK_,64_58)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*58); uint64_t v,x;do { BITPACK64_58( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } // Runs BITPACK64_58 on the uint64_t input at least once (do/while), repeating while out<out_ where out_=out+PAD8(n*58); returns out_. NOTE(review): v, x and 'start' have no other visible use - presumably consumed by the macro; confirm.
+unsigned char *TEMPLATE2(_BITPACK_,64_59)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*59); uint64_t v,x;do { BITPACK64_59( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } // Runs BITPACK64_59 on the uint64_t input at least once (do/while), repeating while out<out_ where out_=out+PAD8(n*59); returns out_. NOTE(review): v, x and 'start' have no other visible use - presumably consumed by the macro; confirm.
+unsigned char *TEMPLATE2(_BITPACK_,64_60)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*60); uint64_t v,x;do { BITPACK64_60( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } // Runs BITPACK64_60 on the uint64_t input at least once (do/while), repeating while out<out_ where out_=out+PAD8(n*60); returns out_. NOTE(review): v, x and 'start' have no other visible use - presumably consumed by the macro; confirm.
+unsigned char *TEMPLATE2(_BITPACK_,64_61)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*61); uint64_t v,x;do { BITPACK64_61( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } // Runs BITPACK64_61 on the uint64_t input at least once (do/while), repeating while out<out_ where out_=out+PAD8(n*61); returns out_. NOTE(review): v, x and 'start' have no other visible use - presumably consumed by the macro; confirm.
+unsigned char *TEMPLATE2(_BITPACK_,64_62)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*62); uint64_t v,x;do { BITPACK64_62( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_63)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*63); uint64_t v,x;do { BITPACK64_63( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_64)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out  ) { unsigned char *out_=out+PAD8(n*64); uint64_t v,x;do { BITPACK64_64( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+BITPACK_F64 TEMPLATE2(_BITPACK_,a64)[] = {
+  &TEMPLATE2(_BITPACK_,64_0),
+  &TEMPLATE2(_BITPACK_,64_1),
+  &TEMPLATE2(_BITPACK_,64_2),
+  &TEMPLATE2(_BITPACK_,64_3),
+  &TEMPLATE2(_BITPACK_,64_4),
+  &TEMPLATE2(_BITPACK_,64_5),
+  &TEMPLATE2(_BITPACK_,64_6),
+  &TEMPLATE2(_BITPACK_,64_7),
+  &TEMPLATE2(_BITPACK_,64_8),
+  &TEMPLATE2(_BITPACK_,64_9),
+  &TEMPLATE2(_BITPACK_,64_10),
+  &TEMPLATE2(_BITPACK_,64_11),
+  &TEMPLATE2(_BITPACK_,64_12),
+  &TEMPLATE2(_BITPACK_,64_13),
+  &TEMPLATE2(_BITPACK_,64_14),
+  &TEMPLATE2(_BITPACK_,64_15),
+  &TEMPLATE2(_BITPACK_,64_16),
+  &TEMPLATE2(_BITPACK_,64_17),
+  &TEMPLATE2(_BITPACK_,64_18),
+  &TEMPLATE2(_BITPACK_,64_19),
+  &TEMPLATE2(_BITPACK_,64_20),
+  &TEMPLATE2(_BITPACK_,64_21),
+  &TEMPLATE2(_BITPACK_,64_22),
+  &TEMPLATE2(_BITPACK_,64_23),
+  &TEMPLATE2(_BITPACK_,64_24),
+  &TEMPLATE2(_BITPACK_,64_25),
+  &TEMPLATE2(_BITPACK_,64_26),
+  &TEMPLATE2(_BITPACK_,64_27),
+  &TEMPLATE2(_BITPACK_,64_28),
+  &TEMPLATE2(_BITPACK_,64_29),
+  &TEMPLATE2(_BITPACK_,64_30),
+  &TEMPLATE2(_BITPACK_,64_31),
+  &TEMPLATE2(_BITPACK_,64_32),
+  &TEMPLATE2(_BITPACK_,64_33),
+  &TEMPLATE2(_BITPACK_,64_34),
+  &TEMPLATE2(_BITPACK_,64_35),
+  &TEMPLATE2(_BITPACK_,64_36),
+  &TEMPLATE2(_BITPACK_,64_37),
+  &TEMPLATE2(_BITPACK_,64_38),
+  &TEMPLATE2(_BITPACK_,64_39),
+  &TEMPLATE2(_BITPACK_,64_40),
+  &TEMPLATE2(_BITPACK_,64_41),
+  &TEMPLATE2(_BITPACK_,64_42),
+  &TEMPLATE2(_BITPACK_,64_43),
+  &TEMPLATE2(_BITPACK_,64_44),
+  &TEMPLATE2(_BITPACK_,64_45),
+  &TEMPLATE2(_BITPACK_,64_46),
+  &TEMPLATE2(_BITPACK_,64_47),
+  &TEMPLATE2(_BITPACK_,64_48),
+  &TEMPLATE2(_BITPACK_,64_49),
+  &TEMPLATE2(_BITPACK_,64_50),
+  &TEMPLATE2(_BITPACK_,64_51),
+  &TEMPLATE2(_BITPACK_,64_52),
+  &TEMPLATE2(_BITPACK_,64_53),
+  &TEMPLATE2(_BITPACK_,64_54),
+  &TEMPLATE2(_BITPACK_,64_55),
+  &TEMPLATE2(_BITPACK_,64_56),
+  &TEMPLATE2(_BITPACK_,64_57),
+  &TEMPLATE2(_BITPACK_,64_58),
+  &TEMPLATE2(_BITPACK_,64_59),
+  &TEMPLATE2(_BITPACK_,64_60),
+  &TEMPLATE2(_BITPACK_,64_61),
+  &TEMPLATE2(_BITPACK_,64_62),
+  &TEMPLATE2(_BITPACK_,64_63),
+  &TEMPLATE2(_BITPACK_,64_64)
 };
-unsigned char *T2(_BITPACK_,64)( uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , unsigned b) { return T2(_BITPACK_,a64)[ b](in, n, out); }
-#undef USIZE
+unsigned char *TEMPLATE2(_BITPACK_,64)( uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , unsigned b) { return TEMPLATE2(_BITPACK_,a64)[ b](in, n, out); }
 
 #else
 #define USIZE 8
-unsigned char *T2(_BITPACK_,8_0)( uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint8_t start ) { return out; }
-unsigned char *T2(_BITPACK_,8_1)(uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint8_t start ) { BP(1,8);}
-unsigned char *T2(_BITPACK_,8_2)(uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint8_t start ) { BP(2,8);}
-unsigned char *T2(_BITPACK_,8_3)(uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint8_t start ) { BP(3,8);}
-unsigned char *T2(_BITPACK_,8_4)(uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint8_t start ) { BP(4,8);}
-unsigned char *T2(_BITPACK_,8_5)(uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint8_t start ) { BP(5,8);}
-unsigned char *T2(_BITPACK_,8_6)(uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint8_t start ) { BP(6,8);}
-unsigned char *T2(_BITPACK_,8_7)(uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint8_t start ) { BP(7,8);}
-unsigned char *T2(_BITPACK_,8_8)(uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint8_t start ) { BP(8,8);}
-BITPACK_D8 T2(_BITPACK_,a8)[] = {
-  &T2(_BITPACK_,8_0),
-  &T2(_BITPACK_,8_1),
-  &T2(_BITPACK_,8_2),
-  &T2(_BITPACK_,8_3),
-  &T2(_BITPACK_,8_4),
-  &T2(_BITPACK_,8_5),
-  &T2(_BITPACK_,8_6),
-  &T2(_BITPACK_,8_7),
-  &T2(_BITPACK_,8_8)
+unsigned char *TEMPLATE2(_BITPACK_,8_0)( uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint8_t start ) { return out; }
+unsigned char *TEMPLATE2(_BITPACK_,8_1)(uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint8_t start ) { unsigned char *out_=out+PAD8(n*1); uint8_t v,x=0;do { BITPACK64_1( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,8_2)(uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint8_t start ) { unsigned char *out_=out+PAD8(n*2); uint8_t v,x=0;do { BITPACK64_2( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,8_3)(uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint8_t start ) { unsigned char *out_=out+PAD8(n*3); uint8_t v,x=0;do { BITPACK64_3( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,8_4)(uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint8_t start ) { unsigned char *out_=out+PAD8(n*4); uint8_t v,x=0;do { BITPACK64_4( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,8_5)(uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint8_t start ) { unsigned char *out_=out+PAD8(n*5); uint8_t v,x=0;do { BITPACK64_5( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,8_6)(uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint8_t start ) { unsigned char *out_=out+PAD8(n*6); uint8_t v,x=0;do { BITPACK64_6( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,8_7)(uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint8_t start ) { unsigned char *out_=out+PAD8(n*7); uint8_t v,x=0;do { BITPACK64_7( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,8_8)(uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint8_t start ) { unsigned char *out_=out+PAD8(n*8); uint8_t v,x=0;do { BITPACK64_8( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+BITPACK_D8 TEMPLATE2(_BITPACK_,a8)[] = {
+  &TEMPLATE2(_BITPACK_,8_0),
+  &TEMPLATE2(_BITPACK_,8_1),
+  &TEMPLATE2(_BITPACK_,8_2),
+  &TEMPLATE2(_BITPACK_,8_3),
+  &TEMPLATE2(_BITPACK_,8_4),
+  &TEMPLATE2(_BITPACK_,8_5),
+  &TEMPLATE2(_BITPACK_,8_6),
+  &TEMPLATE2(_BITPACK_,8_7),
+  &TEMPLATE2(_BITPACK_,8_8)
 };
-unsigned char *T2(_BITPACK_,8)( uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint8_t start, unsigned b) { return T2(_BITPACK_,a8)[ b](in, n, out, start); }
-#undef USIZE
+unsigned char *TEMPLATE2(_BITPACK_,8)( uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint8_t start, unsigned b) { return TEMPLATE2(_BITPACK_,a8)[ b](in, n, out, start); }
 
 #define USIZE 16
-unsigned char *T2(_BITPACK_,16_0 )(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint16_t start ) { return out; }
-unsigned char *T2(_BITPACK_,16_1 )(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint16_t start ) { BP( 1,16);}
-unsigned char *T2(_BITPACK_,16_2 )(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint16_t start ) { BP( 2,16);}
-unsigned char *T2(_BITPACK_,16_3 )(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint16_t start ) { BP( 3,16);}
-unsigned char *T2(_BITPACK_,16_4 )(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint16_t start ) { BP( 4,16);}
-unsigned char *T2(_BITPACK_,16_5 )(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint16_t start ) { BP( 5,16);}
-unsigned char *T2(_BITPACK_,16_6 )(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint16_t start ) { BP( 6,16);}
-unsigned char *T2(_BITPACK_,16_7 )(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint16_t start ) { BP( 7,16);}
-unsigned char *T2(_BITPACK_,16_8 )(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint16_t start ) { BP( 8,16);}
-unsigned char *T2(_BITPACK_,16_9 )(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint16_t start ) { BP( 9,16);}
-unsigned char *T2(_BITPACK_,16_10)(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint16_t start ) { BP(10,16);}
-unsigned char *T2(_BITPACK_,16_11)(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint16_t start ) { BP(11,16);}
-unsigned char *T2(_BITPACK_,16_12)(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint16_t start ) { BP(12,16);}
-unsigned char *T2(_BITPACK_,16_13)(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint16_t start ) { BP(13,16);}
-unsigned char *T2(_BITPACK_,16_14)(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint16_t start ) { BP(14,16);}
-unsigned char *T2(_BITPACK_,16_15)(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint16_t start ) { BP(15,16);}
-unsigned char *T2(_BITPACK_,16_16)(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint16_t start ) { BP(16,16);}
-BITPACK_D16 T2(_BITPACK_,a16)[] = {
-  &T2(_BITPACK_,16_0),
-  &T2(_BITPACK_,16_1),
-  &T2(_BITPACK_,16_2),
-  &T2(_BITPACK_,16_3),
-  &T2(_BITPACK_,16_4),
-  &T2(_BITPACK_,16_5),
-  &T2(_BITPACK_,16_6),
-  &T2(_BITPACK_,16_7),
-  &T2(_BITPACK_,16_8),
-  &T2(_BITPACK_,16_9),
-  &T2(_BITPACK_,16_10),
-  &T2(_BITPACK_,16_11),
-  &T2(_BITPACK_,16_12),
-  &T2(_BITPACK_,16_13),
-  &T2(_BITPACK_,16_14),
-  &T2(_BITPACK_,16_15),
-  &T2(_BITPACK_,16_16)
+unsigned char *TEMPLATE2(_BITPACK_,16_0)( uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint16_t start ) { return out; }
+unsigned char *TEMPLATE2(_BITPACK_,16_1)(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint16_t start ) { unsigned char *out_=out+PAD8(n*1); uint16_t v,x=0;do { BITPACK64_1( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,16_2)(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint16_t start ) { unsigned char *out_=out+PAD8(n*2); uint16_t v,x=0;do { BITPACK64_2( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,16_3)(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint16_t start ) { unsigned char *out_=out+PAD8(n*3); uint16_t v,x=0;do { BITPACK64_3( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,16_4)(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint16_t start ) { unsigned char *out_=out+PAD8(n*4); uint16_t v,x=0;do { BITPACK64_4( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,16_5)(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint16_t start ) { unsigned char *out_=out+PAD8(n*5); uint16_t v,x=0;do { BITPACK64_5( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,16_6)(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint16_t start ) { unsigned char *out_=out+PAD8(n*6); uint16_t v,x=0;do { BITPACK64_6( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,16_7)(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint16_t start ) { unsigned char *out_=out+PAD8(n*7); uint16_t v,x=0;do { BITPACK64_7( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,16_8)(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint16_t start ) { unsigned char *out_=out+PAD8(n*8); uint16_t v,x=0;do { BITPACK64_8( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,16_9)(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint16_t start ) { unsigned char *out_=out+PAD8(n*9); uint16_t v,x=0;do { BITPACK64_9( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,16_10)(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint16_t start ) { unsigned char *out_=out+PAD8(n*10); uint16_t v,x=0;do { BITPACK64_10( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,16_11)(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint16_t start ) { unsigned char *out_=out+PAD8(n*11); uint16_t v,x=0;do { BITPACK64_11( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,16_12)(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint16_t start ) { unsigned char *out_=out+PAD8(n*12); uint16_t v,x=0;do { BITPACK64_12( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,16_13)(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint16_t start ) { unsigned char *out_=out+PAD8(n*13); uint16_t v,x=0;do { BITPACK64_13( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,16_14)(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint16_t start ) { unsigned char *out_=out+PAD8(n*14); uint16_t v,x=0;do { BITPACK64_14( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,16_15)(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint16_t start ) { unsigned char *out_=out+PAD8(n*15); uint16_t v,x=0;do { BITPACK64_15( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,16_16)(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint16_t start ) { unsigned char *out_=out+PAD8(n*16); uint16_t v,x=0;do { BITPACK64_16( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+BITPACK_D16 TEMPLATE2(_BITPACK_,a16)[] = {
+  &TEMPLATE2(_BITPACK_,16_0),
+  &TEMPLATE2(_BITPACK_,16_1),
+  &TEMPLATE2(_BITPACK_,16_2),
+  &TEMPLATE2(_BITPACK_,16_3),
+  &TEMPLATE2(_BITPACK_,16_4),
+  &TEMPLATE2(_BITPACK_,16_5),
+  &TEMPLATE2(_BITPACK_,16_6),
+  &TEMPLATE2(_BITPACK_,16_7),
+  &TEMPLATE2(_BITPACK_,16_8),
+  &TEMPLATE2(_BITPACK_,16_9),
+  &TEMPLATE2(_BITPACK_,16_10),
+  &TEMPLATE2(_BITPACK_,16_11),
+  &TEMPLATE2(_BITPACK_,16_12),
+  &TEMPLATE2(_BITPACK_,16_13),
+  &TEMPLATE2(_BITPACK_,16_14),
+  &TEMPLATE2(_BITPACK_,16_15),
+  &TEMPLATE2(_BITPACK_,16_16)
 };
-unsigned char *T2(_BITPACK_,16)( uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint16_t start, unsigned b) { return T2(_BITPACK_,a16)[ b](in, n, out, start); }
-#undef USIZE
+unsigned char *TEMPLATE2(_BITPACK_,16)( uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint16_t start, unsigned b) { return TEMPLATE2(_BITPACK_,a16)[ b](in, n, out, start); }
 
 #define USIZE 32
-unsigned char *T2(_BITPACK_,32_0 )( uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { return out; }
-unsigned char *T2(_BITPACK_,32_1 )(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { BP( 1,32);}
-unsigned char *T2(_BITPACK_,32_2 )(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { BP( 2,32);}
-unsigned char *T2(_BITPACK_,32_3 )(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { BP( 3,32);}
-unsigned char *T2(_BITPACK_,32_4 )(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { BP( 4,32);}
-unsigned char *T2(_BITPACK_,32_5 )(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { BP( 5,32);}
-unsigned char *T2(_BITPACK_,32_6 )(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { BP( 6,32);}
-unsigned char *T2(_BITPACK_,32_7 )(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { BP( 7,32);}
-unsigned char *T2(_BITPACK_,32_8 )(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { BP( 8,32);}
-unsigned char *T2(_BITPACK_,32_9 )(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { BP( 9,32);}
-unsigned char *T2(_BITPACK_,32_10)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { BP(10,32);}
-unsigned char *T2(_BITPACK_,32_11)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { BP(11,32);}
-unsigned char *T2(_BITPACK_,32_12)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { BP(12,32);}
-unsigned char *T2(_BITPACK_,32_13)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { BP(13,32);}
-unsigned char *T2(_BITPACK_,32_14)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { BP(14,32);}
-unsigned char *T2(_BITPACK_,32_15)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { BP(15,32);}
-unsigned char *T2(_BITPACK_,32_16)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { BP(16,32);}
-unsigned char *T2(_BITPACK_,32_17)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { BP(17,32);}
-unsigned char *T2(_BITPACK_,32_18)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { BP(18,32);}
-unsigned char *T2(_BITPACK_,32_19)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { BP(19,32);}
-unsigned char *T2(_BITPACK_,32_20)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { BP(20,32);}
-unsigned char *T2(_BITPACK_,32_21)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { BP(21,32);}
-unsigned char *T2(_BITPACK_,32_22)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { BP(22,32);}
-unsigned char *T2(_BITPACK_,32_23)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { BP(23,32);}
-unsigned char *T2(_BITPACK_,32_24)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { BP(24,32);}
-unsigned char *T2(_BITPACK_,32_25)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { BP(25,32);}
-unsigned char *T2(_BITPACK_,32_26)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { BP(26,32);}
-unsigned char *T2(_BITPACK_,32_27)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { BP(27,32);}
-unsigned char *T2(_BITPACK_,32_28)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { BP(28,32);}
-unsigned char *T2(_BITPACK_,32_29)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { BP(29,32);}
-unsigned char *T2(_BITPACK_,32_30)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { BP(30,32);}
-unsigned char *T2(_BITPACK_,32_31)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { BP(31,32);}
-unsigned char *T2(_BITPACK_,32_32)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { BP(32,32);}
-BITPACK_D32 T2(_BITPACK_,a32)[] = {
-  &T2(_BITPACK_,32_0),
-  &T2(_BITPACK_,32_1),
-  &T2(_BITPACK_,32_2),
-  &T2(_BITPACK_,32_3),
-  &T2(_BITPACK_,32_4),
-  &T2(_BITPACK_,32_5),
-  &T2(_BITPACK_,32_6),
-  &T2(_BITPACK_,32_7),
-  &T2(_BITPACK_,32_8),
-  &T2(_BITPACK_,32_9),
-  &T2(_BITPACK_,32_10),
-  &T2(_BITPACK_,32_11),
-  &T2(_BITPACK_,32_12),
-  &T2(_BITPACK_,32_13),
-  &T2(_BITPACK_,32_14),
-  &T2(_BITPACK_,32_15),
-  &T2(_BITPACK_,32_16),
-  &T2(_BITPACK_,32_17),
-  &T2(_BITPACK_,32_18),
-  &T2(_BITPACK_,32_19),
-  &T2(_BITPACK_,32_20),
-  &T2(_BITPACK_,32_21),
-  &T2(_BITPACK_,32_22),
-  &T2(_BITPACK_,32_23),
-  &T2(_BITPACK_,32_24),
-  &T2(_BITPACK_,32_25),
-  &T2(_BITPACK_,32_26),
-  &T2(_BITPACK_,32_27),
-  &T2(_BITPACK_,32_28),
-  &T2(_BITPACK_,32_29),
-  &T2(_BITPACK_,32_30),
-  &T2(_BITPACK_,32_31),
-  &T2(_BITPACK_,32_32)
+unsigned char *TEMPLATE2(_BITPACK_,32_0)( uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { return out; }
+unsigned char *TEMPLATE2(_BITPACK_,32_1)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { unsigned char *out_=out+PAD8(n*1); uint32_t v,x=0;do { BITPACK64_1( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,32_2)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { unsigned char *out_=out+PAD8(n*2); uint32_t v,x=0;do { BITPACK64_2( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,32_3)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { unsigned char *out_=out+PAD8(n*3); uint32_t v,x=0;do { BITPACK64_3( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,32_4)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { unsigned char *out_=out+PAD8(n*4); uint32_t v,x=0;do { BITPACK64_4( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,32_5)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { unsigned char *out_=out+PAD8(n*5); uint32_t v,x=0;do { BITPACK64_5( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,32_6)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { unsigned char *out_=out+PAD8(n*6); uint32_t v,x=0;do { BITPACK64_6( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,32_7)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { unsigned char *out_=out+PAD8(n*7); uint32_t v,x=0;do { BITPACK64_7( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,32_8)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { unsigned char *out_=out+PAD8(n*8); uint32_t v,x=0;do { BITPACK64_8( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,32_9)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { unsigned char *out_=out+PAD8(n*9); uint32_t v,x=0;do { BITPACK64_9( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,32_10)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { unsigned char *out_=out+PAD8(n*10); uint32_t v,x=0;do { BITPACK64_10( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,32_11)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { unsigned char *out_=out+PAD8(n*11); uint32_t v,x=0;do { BITPACK64_11( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,32_12)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { unsigned char *out_=out+PAD8(n*12); uint32_t v,x=0;do { BITPACK64_12( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,32_13)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { unsigned char *out_=out+PAD8(n*13); uint32_t v,x=0;do { BITPACK64_13( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,32_14)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { unsigned char *out_=out+PAD8(n*14); uint32_t v,x=0;do { BITPACK64_14( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,32_15)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { unsigned char *out_=out+PAD8(n*15); uint32_t v,x=0;do { BITPACK64_15( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,32_16)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { unsigned char *out_=out+PAD8(n*16); uint32_t v,x=0;do { BITPACK64_16( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,32_17)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { unsigned char *out_=out+PAD8(n*17); uint32_t v,x=0;do { BITPACK64_17( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,32_18)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { unsigned char *out_=out+PAD8(n*18); uint32_t v,x=0;do { BITPACK64_18( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,32_19)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { unsigned char *out_=out+PAD8(n*19); uint32_t v,x=0;do { BITPACK64_19( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,32_20)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { unsigned char *out_=out+PAD8(n*20); uint32_t v,x=0;do { BITPACK64_20( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,32_21)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { unsigned char *out_=out+PAD8(n*21); uint32_t v,x=0;do { BITPACK64_21( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,32_22)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { unsigned char *out_=out+PAD8(n*22); uint32_t v,x=0;do { BITPACK64_22( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,32_23)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { unsigned char *out_=out+PAD8(n*23); uint32_t v,x=0;do { BITPACK64_23( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,32_24)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { unsigned char *out_=out+PAD8(n*24); uint32_t v,x=0;do { BITPACK64_24( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,32_25)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { unsigned char *out_=out+PAD8(n*25); uint32_t v,x=0;do { BITPACK64_25( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,32_26)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { unsigned char *out_=out+PAD8(n*26); uint32_t v,x=0;do { BITPACK64_26( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,32_27)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { unsigned char *out_=out+PAD8(n*27); uint32_t v,x=0;do { BITPACK64_27( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,32_28)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { unsigned char *out_=out+PAD8(n*28); uint32_t v,x=0;do { BITPACK64_28( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,32_29)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { unsigned char *out_=out+PAD8(n*29); uint32_t v,x=0;do { BITPACK64_29( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,32_30)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { unsigned char *out_=out+PAD8(n*30); uint32_t v,x=0;do { BITPACK64_30( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,32_31)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { unsigned char *out_=out+PAD8(n*31); uint32_t v,x=0;do { BITPACK64_31( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,32_32)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { unsigned char *out_=out+PAD8(n*32); uint32_t v,x=0;do { BITPACK64_32( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+BITPACK_D32 TEMPLATE2(_BITPACK_,a32)[] = {
+  &TEMPLATE2(_BITPACK_,32_0),
+  &TEMPLATE2(_BITPACK_,32_1),
+  &TEMPLATE2(_BITPACK_,32_2),
+  &TEMPLATE2(_BITPACK_,32_3),
+  &TEMPLATE2(_BITPACK_,32_4),
+  &TEMPLATE2(_BITPACK_,32_5),
+  &TEMPLATE2(_BITPACK_,32_6),
+  &TEMPLATE2(_BITPACK_,32_7),
+  &TEMPLATE2(_BITPACK_,32_8),
+  &TEMPLATE2(_BITPACK_,32_9),
+  &TEMPLATE2(_BITPACK_,32_10),
+  &TEMPLATE2(_BITPACK_,32_11),
+  &TEMPLATE2(_BITPACK_,32_12),
+  &TEMPLATE2(_BITPACK_,32_13),
+  &TEMPLATE2(_BITPACK_,32_14),
+  &TEMPLATE2(_BITPACK_,32_15),
+  &TEMPLATE2(_BITPACK_,32_16),
+  &TEMPLATE2(_BITPACK_,32_17),
+  &TEMPLATE2(_BITPACK_,32_18),
+  &TEMPLATE2(_BITPACK_,32_19),
+  &TEMPLATE2(_BITPACK_,32_20),
+  &TEMPLATE2(_BITPACK_,32_21),
+  &TEMPLATE2(_BITPACK_,32_22),
+  &TEMPLATE2(_BITPACK_,32_23),
+  &TEMPLATE2(_BITPACK_,32_24),
+  &TEMPLATE2(_BITPACK_,32_25),
+  &TEMPLATE2(_BITPACK_,32_26),
+  &TEMPLATE2(_BITPACK_,32_27),
+  &TEMPLATE2(_BITPACK_,32_28),
+  &TEMPLATE2(_BITPACK_,32_29),
+  &TEMPLATE2(_BITPACK_,32_30),
+  &TEMPLATE2(_BITPACK_,32_31),
+  &TEMPLATE2(_BITPACK_,32_32)
 };
-unsigned char *T2(_BITPACK_,32)( uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start, unsigned b) { return T2(_BITPACK_,a32)[ b](in, n, out, start); }
-#undef USIZE
+unsigned char *TEMPLATE2(_BITPACK_,32)( uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start, unsigned b) { return TEMPLATE2(_BITPACK_,a32)[ b](in, n, out, start); }
 
 #define USIZE 64
-unsigned char *T2(_BITPACK_,64_0 )(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { return out; }
-unsigned char *T2(_BITPACK_,64_1 )(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP( 1,64);}
-unsigned char *T2(_BITPACK_,64_2 )(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP( 2,64);}
-unsigned char *T2(_BITPACK_,64_3 )(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP( 3,64);}
-unsigned char *T2(_BITPACK_,64_4 )(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP( 4,64);}
-unsigned char *T2(_BITPACK_,64_5 )(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP( 5,64);}
-unsigned char *T2(_BITPACK_,64_6 )(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP( 6,64);}
-unsigned char *T2(_BITPACK_,64_7 )(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP( 7,64);}
-unsigned char *T2(_BITPACK_,64_8 )(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP( 8,64);}
-unsigned char *T2(_BITPACK_,64_9 )(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP( 9,64);}
-unsigned char *T2(_BITPACK_,64_10)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(10,64);}
-unsigned char *T2(_BITPACK_,64_11)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(11,64);}
-unsigned char *T2(_BITPACK_,64_12)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(12,64);}
-unsigned char *T2(_BITPACK_,64_13)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(13,64);}
-unsigned char *T2(_BITPACK_,64_14)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(14,64);}
-unsigned char *T2(_BITPACK_,64_15)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(15,64);}
-unsigned char *T2(_BITPACK_,64_16)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(16,64);}
-unsigned char *T2(_BITPACK_,64_17)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(17,64);}
-unsigned char *T2(_BITPACK_,64_18)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(18,64);}
-unsigned char *T2(_BITPACK_,64_19)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(19,64);}
-unsigned char *T2(_BITPACK_,64_20)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(20,64);}
-unsigned char *T2(_BITPACK_,64_21)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(21,64);}
-unsigned char *T2(_BITPACK_,64_22)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(22,64);}
-unsigned char *T2(_BITPACK_,64_23)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(23,64);}
-unsigned char *T2(_BITPACK_,64_24)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(24,64);}
-unsigned char *T2(_BITPACK_,64_25)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(25,64);}
-unsigned char *T2(_BITPACK_,64_26)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(26,64);}
-unsigned char *T2(_BITPACK_,64_27)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(27,64);}
-unsigned char *T2(_BITPACK_,64_28)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(28,64);}
-unsigned char *T2(_BITPACK_,64_29)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(29,64);}
-unsigned char *T2(_BITPACK_,64_30)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(30,64);}
-unsigned char *T2(_BITPACK_,64_31)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(31,64);}
-unsigned char *T2(_BITPACK_,64_32)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(32,64);}
-unsigned char *T2(_BITPACK_,64_33)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(33,64);}
-unsigned char *T2(_BITPACK_,64_34)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(34,64);}
-unsigned char *T2(_BITPACK_,64_35)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(35,64);}
-unsigned char *T2(_BITPACK_,64_36)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(36,64);}
-unsigned char *T2(_BITPACK_,64_37)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(37,64);}
-unsigned char *T2(_BITPACK_,64_38)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(38,64);}
-unsigned char *T2(_BITPACK_,64_39)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(39,64);}
-unsigned char *T2(_BITPACK_,64_40)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(40,64);}
-unsigned char *T2(_BITPACK_,64_41)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(41,64);}
-unsigned char *T2(_BITPACK_,64_42)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(42,64);}
-unsigned char *T2(_BITPACK_,64_43)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(43,64);}
-unsigned char *T2(_BITPACK_,64_44)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(44,64);}
-unsigned char *T2(_BITPACK_,64_45)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(45,64);}
-unsigned char *T2(_BITPACK_,64_46)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(46,64);}
-unsigned char *T2(_BITPACK_,64_47)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(47,64);}
-unsigned char *T2(_BITPACK_,64_48)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(48,64);}
-unsigned char *T2(_BITPACK_,64_49)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(49,64);}
-unsigned char *T2(_BITPACK_,64_50)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(50,64);}
-unsigned char *T2(_BITPACK_,64_51)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(51,64);}
-unsigned char *T2(_BITPACK_,64_52)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(52,64);}
-unsigned char *T2(_BITPACK_,64_53)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(53,64);}
-unsigned char *T2(_BITPACK_,64_54)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(54,64);}
-unsigned char *T2(_BITPACK_,64_55)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(55,64);}
-unsigned char *T2(_BITPACK_,64_56)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(56,64);}
-unsigned char *T2(_BITPACK_,64_57)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(57,64);}
-unsigned char *T2(_BITPACK_,64_58)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(58,64);}
-unsigned char *T2(_BITPACK_,64_59)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(59,64);}
-unsigned char *T2(_BITPACK_,64_60)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(60,64);}
-unsigned char *T2(_BITPACK_,64_61)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(61,64);}
-unsigned char *T2(_BITPACK_,64_62)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(62,64);}
-unsigned char *T2(_BITPACK_,64_63)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(63,64);}
-unsigned char *T2(_BITPACK_,64_64)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(64,64);}
-BITPACK_D64 T2(_BITPACK_,a64)[] = {
-  &T2(_BITPACK_,64_0),
-  &T2(_BITPACK_,64_1),
-  &T2(_BITPACK_,64_2),
-  &T2(_BITPACK_,64_3),
-  &T2(_BITPACK_,64_4),
-  &T2(_BITPACK_,64_5),
-  &T2(_BITPACK_,64_6),
-  &T2(_BITPACK_,64_7),
-  &T2(_BITPACK_,64_8),
-  &T2(_BITPACK_,64_9),
-  &T2(_BITPACK_,64_10),
-  &T2(_BITPACK_,64_11),
-  &T2(_BITPACK_,64_12),
-  &T2(_BITPACK_,64_13),
-  &T2(_BITPACK_,64_14),
-  &T2(_BITPACK_,64_15),
-  &T2(_BITPACK_,64_16),
-  &T2(_BITPACK_,64_17),
-  &T2(_BITPACK_,64_18),
-  &T2(_BITPACK_,64_19),
-  &T2(_BITPACK_,64_20),
-  &T2(_BITPACK_,64_21),
-  &T2(_BITPACK_,64_22),
-  &T2(_BITPACK_,64_23),
-  &T2(_BITPACK_,64_24),
-  &T2(_BITPACK_,64_25),
-  &T2(_BITPACK_,64_26),
-  &T2(_BITPACK_,64_27),
-  &T2(_BITPACK_,64_28),
-  &T2(_BITPACK_,64_29),
-  &T2(_BITPACK_,64_30),
-  &T2(_BITPACK_,64_31),
-  &T2(_BITPACK_,64_32),
-  &T2(_BITPACK_,64_33),
-  &T2(_BITPACK_,64_34),
-  &T2(_BITPACK_,64_35),
-  &T2(_BITPACK_,64_36),
-  &T2(_BITPACK_,64_37),
-  &T2(_BITPACK_,64_38),
-  &T2(_BITPACK_,64_39),
-  &T2(_BITPACK_,64_40),
-  &T2(_BITPACK_,64_41),
-  &T2(_BITPACK_,64_42),
-  &T2(_BITPACK_,64_43),
-  &T2(_BITPACK_,64_44),
-  &T2(_BITPACK_,64_45),
-  &T2(_BITPACK_,64_46),
-  &T2(_BITPACK_,64_47),
-  &T2(_BITPACK_,64_48),
-  &T2(_BITPACK_,64_49),
-  &T2(_BITPACK_,64_50),
-  &T2(_BITPACK_,64_51),
-  &T2(_BITPACK_,64_52),
-  &T2(_BITPACK_,64_53),
-  &T2(_BITPACK_,64_54),
-  &T2(_BITPACK_,64_55),
-  &T2(_BITPACK_,64_56),
-  &T2(_BITPACK_,64_57),
-  &T2(_BITPACK_,64_58),
-  &T2(_BITPACK_,64_59),
-  &T2(_BITPACK_,64_60),
-  &T2(_BITPACK_,64_61),
-  &T2(_BITPACK_,64_62),
-  &T2(_BITPACK_,64_63),
-  &T2(_BITPACK_,64_64)
+unsigned char *TEMPLATE2(_BITPACK_,64_0)( uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { return out; }
+unsigned char *TEMPLATE2(_BITPACK_,64_1)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*1); uint64_t v,x=0;do { BITPACK64_1( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_2)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*2); uint64_t v,x=0;do { BITPACK64_2( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_3)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*3); uint64_t v,x=0;do { BITPACK64_3( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_4)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*4); uint64_t v,x=0;do { BITPACK64_4( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_5)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*5); uint64_t v,x=0;do { BITPACK64_5( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_6)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*6); uint64_t v,x=0;do { BITPACK64_6( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_7)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*7); uint64_t v,x=0;do { BITPACK64_7( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_8)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*8); uint64_t v,x=0;do { BITPACK64_8( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_9)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*9); uint64_t v,x=0;do { BITPACK64_9( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_10)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*10); uint64_t v,x=0;do { BITPACK64_10( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_11)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*11); uint64_t v,x=0;do { BITPACK64_11( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_12)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*12); uint64_t v,x=0;do { BITPACK64_12( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_13)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*13); uint64_t v,x=0;do { BITPACK64_13( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_14)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*14); uint64_t v,x=0;do { BITPACK64_14( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_15)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*15); uint64_t v,x=0;do { BITPACK64_15( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_16)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*16); uint64_t v,x=0;do { BITPACK64_16( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_17)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*17); uint64_t v,x=0;do { BITPACK64_17( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_18)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*18); uint64_t v,x=0;do { BITPACK64_18( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_19)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*19); uint64_t v,x=0;do { BITPACK64_19( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_20)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*20); uint64_t v,x=0;do { BITPACK64_20( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_21)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*21); uint64_t v,x=0;do { BITPACK64_21( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_22)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*22); uint64_t v,x=0;do { BITPACK64_22( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_23)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*23); uint64_t v,x=0;do { BITPACK64_23( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_24)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*24); uint64_t v,x=0;do { BITPACK64_24( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_25)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*25); uint64_t v,x=0;do { BITPACK64_25( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_26)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*26); uint64_t v,x=0;do { BITPACK64_26( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_27)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*27); uint64_t v,x=0;do { BITPACK64_27( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_28)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*28); uint64_t v,x=0;do { BITPACK64_28( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_29)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*29); uint64_t v,x=0;do { BITPACK64_29( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_30)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*30); uint64_t v,x=0;do { BITPACK64_30( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_31)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*31); uint64_t v,x=0;do { BITPACK64_31( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_32)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*32); uint64_t v,x=0;do { BITPACK64_32( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_33)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*33); uint64_t v,x=0;do { BITPACK64_33( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_34)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*34); uint64_t v,x=0;do { BITPACK64_34( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_35)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*35); uint64_t v,x=0;do { BITPACK64_35( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_36)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*36); uint64_t v,x=0;do { BITPACK64_36( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_37)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*37); uint64_t v,x=0;do { BITPACK64_37( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_38)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*38); uint64_t v,x=0;do { BITPACK64_38( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_39)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*39); uint64_t v,x=0;do { BITPACK64_39( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_40)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*40); uint64_t v,x=0;do { BITPACK64_40( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_41)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*41); uint64_t v,x=0;do { BITPACK64_41( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_42)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*42); uint64_t v,x=0;do { BITPACK64_42( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_43)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*43); uint64_t v,x=0;do { BITPACK64_43( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_44)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*44); uint64_t v,x=0;do { BITPACK64_44( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_45)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*45); uint64_t v,x=0;do { BITPACK64_45( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_46)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*46); uint64_t v,x=0;do { BITPACK64_46( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_47)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*47); uint64_t v,x=0;do { BITPACK64_47( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_48)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*48); uint64_t v,x=0;do { BITPACK64_48( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_49)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*49); uint64_t v,x=0;do { BITPACK64_49( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_50)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*50); uint64_t v,x=0;do { BITPACK64_50( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_51)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*51); uint64_t v,x=0;do { BITPACK64_51( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_52)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*52); uint64_t v,x=0;do { BITPACK64_52( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_53)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*53); uint64_t v,x=0;do { BITPACK64_53( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_54)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*54); uint64_t v,x=0;do { BITPACK64_54( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_55)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*55); uint64_t v,x=0;do { BITPACK64_55( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_56)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*56); uint64_t v,x=0;do { BITPACK64_56( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_57)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*57); uint64_t v,x=0;do { BITPACK64_57( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_58)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*58); uint64_t v,x=0;do { BITPACK64_58( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_59)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*59); uint64_t v,x=0;do { BITPACK64_59( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_60)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*60); uint64_t v,x=0;do { BITPACK64_60( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_61)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*61); uint64_t v,x=0;do { BITPACK64_61( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_62)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*62); uint64_t v,x=0;do { BITPACK64_62( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_63)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*63); uint64_t v,x=0;do { BITPACK64_63( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_64)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*64); uint64_t v,x=0;do { BITPACK64_64( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+BITPACK_D64 TEMPLATE2(_BITPACK_,a64)[] = {
+  &TEMPLATE2(_BITPACK_,64_0),
+  &TEMPLATE2(_BITPACK_,64_1),
+  &TEMPLATE2(_BITPACK_,64_2),
+  &TEMPLATE2(_BITPACK_,64_3),
+  &TEMPLATE2(_BITPACK_,64_4),
+  &TEMPLATE2(_BITPACK_,64_5),
+  &TEMPLATE2(_BITPACK_,64_6),
+  &TEMPLATE2(_BITPACK_,64_7),
+  &TEMPLATE2(_BITPACK_,64_8),
+  &TEMPLATE2(_BITPACK_,64_9),
+  &TEMPLATE2(_BITPACK_,64_10),
+  &TEMPLATE2(_BITPACK_,64_11),
+  &TEMPLATE2(_BITPACK_,64_12),
+  &TEMPLATE2(_BITPACK_,64_13),
+  &TEMPLATE2(_BITPACK_,64_14),
+  &TEMPLATE2(_BITPACK_,64_15),
+  &TEMPLATE2(_BITPACK_,64_16),
+  &TEMPLATE2(_BITPACK_,64_17),
+  &TEMPLATE2(_BITPACK_,64_18),
+  &TEMPLATE2(_BITPACK_,64_19),
+  &TEMPLATE2(_BITPACK_,64_20),
+  &TEMPLATE2(_BITPACK_,64_21),
+  &TEMPLATE2(_BITPACK_,64_22),
+  &TEMPLATE2(_BITPACK_,64_23),
+  &TEMPLATE2(_BITPACK_,64_24),
+  &TEMPLATE2(_BITPACK_,64_25),
+  &TEMPLATE2(_BITPACK_,64_26),
+  &TEMPLATE2(_BITPACK_,64_27),
+  &TEMPLATE2(_BITPACK_,64_28),
+  &TEMPLATE2(_BITPACK_,64_29),
+  &TEMPLATE2(_BITPACK_,64_30),
+  &TEMPLATE2(_BITPACK_,64_31),
+  &TEMPLATE2(_BITPACK_,64_32),
+  &TEMPLATE2(_BITPACK_,64_33),
+  &TEMPLATE2(_BITPACK_,64_34),
+  &TEMPLATE2(_BITPACK_,64_35),
+  &TEMPLATE2(_BITPACK_,64_36),
+  &TEMPLATE2(_BITPACK_,64_37),
+  &TEMPLATE2(_BITPACK_,64_38),
+  &TEMPLATE2(_BITPACK_,64_39),
+  &TEMPLATE2(_BITPACK_,64_40),
+  &TEMPLATE2(_BITPACK_,64_41),
+  &TEMPLATE2(_BITPACK_,64_42),
+  &TEMPLATE2(_BITPACK_,64_43),
+  &TEMPLATE2(_BITPACK_,64_44),
+  &TEMPLATE2(_BITPACK_,64_45),
+  &TEMPLATE2(_BITPACK_,64_46),
+  &TEMPLATE2(_BITPACK_,64_47),
+  &TEMPLATE2(_BITPACK_,64_48),
+  &TEMPLATE2(_BITPACK_,64_49),
+  &TEMPLATE2(_BITPACK_,64_50),
+  &TEMPLATE2(_BITPACK_,64_51),
+  &TEMPLATE2(_BITPACK_,64_52),
+  &TEMPLATE2(_BITPACK_,64_53),
+  &TEMPLATE2(_BITPACK_,64_54),
+  &TEMPLATE2(_BITPACK_,64_55),
+  &TEMPLATE2(_BITPACK_,64_56),
+  &TEMPLATE2(_BITPACK_,64_57),
+  &TEMPLATE2(_BITPACK_,64_58),
+  &TEMPLATE2(_BITPACK_,64_59),
+  &TEMPLATE2(_BITPACK_,64_60),
+  &TEMPLATE2(_BITPACK_,64_61),
+  &TEMPLATE2(_BITPACK_,64_62),
+  &TEMPLATE2(_BITPACK_,64_63),
+  &TEMPLATE2(_BITPACK_,64_64)
 };
-unsigned char *T2(_BITPACK_,64)( uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start, unsigned b) { return T2(_BITPACK_,a64)[ b](in, n, out, start); }
-#undef USIZE
+unsigned char *TEMPLATE2(_BITPACK_,64)( uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start, unsigned b) { return TEMPLATE2(_BITPACK_,a64)[ b](in, n, out, start); }
 
 #endif
 #endif //IP9
diff --git a/src/ext/for/bitunpack.c b/src/ext/for/bitunpack.c
index 45060084..1dd78003 100644
--- a/src/ext/for/bitunpack.c
+++ b/src/ext/for/bitunpack.c
@@ -1,6 +1,6 @@
 /**
-    Copyright (C) powturbo 2013-2023
-    SPDX-License-Identifier: GPL v2 License
+    Copyright (C) powturbo 2013-2019
+    GPL v2 License
 
     This program is free software; you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
@@ -22,26 +22,25 @@
     - email    : powturbo [_AT_] gmail [_DOT_] com
 **/
 //   "Integer Compression" Bit Packing
+#define BITUTIL_IN
+#define VINT_IN
+#include "conf.h"
+#include "bitutil.h"
+#include "bitpack.h"
+#include "vint.h"
+
+#define PAD8(_x_) (((_x_)+7)/8)
+
 #pragma warning( disable : 4005)
 #pragma warning( disable : 4090)
 #pragma warning( disable : 4068)
 
-#include <string.h>
-#include "include_/conf.h"
-#include "include_/bitpack.h"
-#include "include_/bitutil.h"
-#include "include_/vlcbyte.h"
-
-#include "include_/bitutil_.h"
-
-#define PAD8(_x_) (((_x_)+7)/8)
-
 #pragma GCC push_options
 #pragma GCC optimize ("align-functions=16")
 #pragma clang diagnostic push
 #pragma clang diagnostic ignored "-Wunsequenced"
 
-#ifndef __AVX2__ //----------------------------------- Plain -------------------------------------------------------------------------------------------
+#if !defined(SSE2_ON) && !defined(AVX2_ON) //----------------------------------- Plain -------------------------------------------------------------------------------------------
 typedef unsigned char *(*BITUNPACK_F8)( const unsigned char *__restrict in, unsigned n, uint8_t  *__restrict out);
 typedef unsigned char *(*BITUNPACK_D8)( const unsigned char *__restrict in, unsigned n, uint8_t  *__restrict out, uint8_t start);
 typedef unsigned char *(*BITUNPACK_F16)(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out);
@@ -59,51 +58,39 @@ typedef unsigned char *(*BITUNPACK_D64)(const unsigned char *__restrict in, unsi
 #define OPX(_op_)     _op_ += 32
   #endif
 
-//-- bitpack -------------
 #define OPI(_op_,_nb_,_parm_) OPX(_op_)
 #define OUT( _op_, _x_, _w_, _nb_,_parm_) OP(_op_,_x_) = _w_
 #define _BITUNPACK_ bitunpack
 #include "bitunpack_.h"
 
-
 #define DELTA
-//-- bitunpack delta -------------
+
 #define OUT( _op_, _x_, _w_, _nb_,_parm_) OP(_op_,_x_) = (_parm_ += (_w_))
 #define _BITUNPACK_ bitdunpack  // delta + 0
 #include "bitunpack_.h"
 
-//-- bitunpack zigzag -----------
-#define OUT( _op_, _x_, _w_, _nb_,_parm_) OP(_op_,_x_) = (_parm_ += T2(zigzagdec, USIZE)(_w_))
+#define OUT( _op_, _x_, _w_, _nb_,_parm_) OP(_op_,_x_) = (_parm_ += TEMPLATE2(zigzagdec, USIZE)(_w_))
 #define _BITUNPACK_ bitzunpack  // zigzag
 #include "bitunpack_.h"
 
-//-- bitunpack xor -----------
-#define OUT( _op_, _x_, _w_, _nb_,_parm_) OP(_op_,_x_) = (_parm_ ^= (_w_))
-#define _BITUNPACK_ bitxunpack  // xor
-#include "bitunpack_.h"
-
-//-- bitunpack FOR -----------
 #define OUT( _op_, _x_, _w_, _nb_,_parm_) OP(_op_,_x_) = (_parm_ + (_w_))
 #define _BITUNPACK_ bitfunpack  // for
 #include "bitunpack_.h"
 
-//-- bitunpack delta 1 -----------
 #define OPI(_op_,_nb_,_parm_) OPX(_op_); _parm_ += 32
 #define OUT( _op_, _x_, _w_, _nb_,_parm_) OP(_op_,_x_) = (_parm_ += (_w_)) + (_x_+1)
 #define _BITUNPACK_ bitd1unpack  // delta + 1
 #include "bitunpack_.h"
 
-//-- bitunpack FOR 1 -----------
 #define OUT( _op_, _x_, _w_, _nb_,_parm_) OP(_op_,_x_) = _parm_ + (_w_)+(_x_+1)
 #define _BITUNPACK_ bitf1unpack  // for + 1
 #include "bitunpack_.h"
 #undef OPI
 
-//------------------------------------------------------- bitnunpack ----------------------------------------------------------
 #define BITNUNPACK(in, n, out, _csize_, _usize_) {\
   unsigned char *ip = in;\
-  for(op = out,out+=n; op < out;) { unsigned oplen = out - op,b; if(oplen > _csize_) oplen = _csize_;       /*PREFETCH(ip+512,0);*/\
-    b = *ip++; ip = T2(bitunpacka, _usize_)[b](ip, oplen, op);\
+  for(op = out,out+=n; op < out;) { unsigned oplen = out - op,b; if(oplen > _csize_) oplen = _csize_;       PREFETCH(ip+512,0);\
+    b = *ip++; ip = TEMPLATE2(bitunpacka, _usize_)[b](ip, oplen, op);\
     op += oplen;\
   } \
   return ip - in;\
@@ -111,10 +98,10 @@ typedef unsigned char *(*BITUNPACK_D64)(const unsigned char *__restrict in, unsi
 
 #define BITNDUNPACK(in, n, out, _csize_, _usize_, _bitunpacka_) { if(!n) return 0;\
   unsigned char *ip = in;\
-  T2(vbxget, _usize_)(ip, start);\
-  for(*out++ = start,--n,op = out; op != out+(n&~(_csize_-1)); ) {                              /*PREFETCH(ip+512,0);*/\
-                         unsigned b = *ip++; ip = T2(_bitunpacka_, _usize_)[b](ip, _csize_, op, start); op += _csize_; start = op[-1];\
-  } if(n&=(_csize_-1)) { unsigned b = *ip++; ip = T2(_bitunpacka_, _usize_)[b](ip, n,       op, start); }\
+  TEMPLATE2(vbxget, _usize_)(ip, start);\
+  for(*out++ = start,--n,op = out; op != out+(n&~(_csize_-1)); ) {                              PREFETCH(ip+512,0);\
+                         unsigned b = *ip++; ip = TEMPLATE2(_bitunpacka_, _usize_)[b](ip, _csize_, op, start); op += _csize_; start = op[-1];\
+  } if(n&=(_csize_-1)) { unsigned b = *ip++; ip = TEMPLATE2(_bitunpacka_, _usize_)[b](ip, n,       op, start); }\
   return ip - in;\
 }
 
@@ -138,33 +125,28 @@ size_t bitnzunpack16( unsigned char *__restrict in, size_t n, uint16_t *__restri
 size_t bitnzunpack32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out) { uint32_t *op,start; BITNDUNPACK(in, n, out, 128, 32, bitzunpacka); }
 size_t bitnzunpack64( unsigned char *__restrict in, size_t n, uint64_t *__restrict out) { uint64_t *op,start; BITNDUNPACK(in, n, out, 128, 64, bitzunpacka); }
 
-size_t bitnxunpack8(  unsigned char *__restrict in, size_t n, uint8_t  *__restrict out) { uint8_t  *op,start; BITNDUNPACK(in, n, out, 128,  8, bitxunpacka); }
-size_t bitnxunpack16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out) { uint16_t *op,start; BITNDUNPACK(in, n, out, 128, 16, bitxunpacka); }
-size_t bitnxunpack32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out) { uint32_t *op,start; BITNDUNPACK(in, n, out, 128, 32, bitxunpacka); }
-size_t bitnxunpack64( unsigned char *__restrict in, size_t n, uint64_t *__restrict out) { uint64_t *op,start; BITNDUNPACK(in, n, out, 128, 64, bitxunpacka); }
-
 size_t bitnfunpack8(  unsigned char *__restrict in, size_t n, uint8_t  *__restrict out) { uint8_t  *op,start; BITNDUNPACK(in, n, out, 128,  8, bitfunpacka); }
 size_t bitnfunpack16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out) { uint16_t *op,start; BITNDUNPACK(in, n, out, 128, 16, bitfunpacka); }
 size_t bitnfunpack32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out) { uint32_t *op,start; BITNDUNPACK(in, n, out, 128, 32, bitfunpacka); }
 size_t bitnfunpack64( unsigned char *__restrict in, size_t n, uint64_t *__restrict out) { uint64_t *op,start; BITNDUNPACK(in, n, out, 128, 64, bitfunpacka); }
-  #endif
-//#else //-------------------------------------------- SSE/AVX2 ---------------------------------------------------------------------------------------
+
+#else //-------------------------------------------- SSE/AVX2 ---------------------------------------------------------------------------------------
 
 #define _BITNUNPACKV(in, n, out, _csize_, _usize_, _bitunpackv_) {\
   unsigned char *ip = in;\
-  for(op = out; op != out+(n&~(_csize_-1)); op += _csize_) {                                                    /*PREFETCH(in+512,0);*/\
-                         unsigned b = *ip++; ip = T2(_bitunpackv_, _usize_)(ip, _csize_, op,b);\
-  } if(n&=(_csize_-1)) { unsigned b = *ip++; ip = T2(bitunpack,    _usize_)(ip, n,       op,b); }\
+  for(op = out; op != out+(n&~(_csize_-1)); op += _csize_) {                                                    PREFETCH(in+512,0);\
+                         unsigned b = *ip++; ip = TEMPLATE2(_bitunpackv_, _usize_)(ip, _csize_, op,b);\
+  } if(n&=(_csize_-1)) { unsigned b = *ip++; ip = TEMPLATE2(bitunpack,    _usize_)(ip, n,       op,b); }\
   return ip - in;\
 }
 
 #define _BITNDUNPACKV(in, n, out, _csize_, _usize_, _bitunpackv_, _bitunpack_) { if(!n) return 0;\
   unsigned char *ip = in;\
-  T2(vbxget, _usize_)(ip, start); \
+  TEMPLATE2(vbxget, _usize_)(ip, start); \
   *out++ = start;\
-  for(--n,op = out; op != out+(n&~(_csize_-1)); ) {                                 /*PREFETCH(ip+512,0);*/\
-                         unsigned b = *ip++; ip = T2(_bitunpackv_, _usize_)(ip, _csize_, op, start,b); op += _csize_; start = op[-1];\
-  } if(n&=(_csize_-1)) { unsigned b = *ip++; ip = T2(_bitunpack_,  _usize_)(ip, n,     op, start,b); }\
+  for(--n,op = out; op != out+(n&~(_csize_-1)); ) {                                 PREFETCH(ip+512,0);\
+                         unsigned b = *ip++; ip = TEMPLATE2(_bitunpackv_, _usize_)(ip, _csize_, op, start,b); op += _csize_; start = op[-1];\
+  } if(n&=(_csize_-1)) { unsigned b = *ip++; ip = TEMPLATE2(_bitunpack_,  _usize_)(ip, n,     op, start,b); }\
   return ip - in;\
 }
   #ifdef __AVX2__ //-------------------------------- AVX2 ----------------------------------------------------------------------------
@@ -178,262 +160,262 @@ size_t bitnfunpack64( unsigned char *__restrict in, size_t n, uint64_t *__restri
 static inline __m128i _mm_cvtsi64_si128(__int64 a) {  return _mm_loadl_epi64((__m128i*)&a); }
     #endif
 static ALIGNED(unsigned char, permv[256][8], 32) = {
-{0,0,0,0,0,0,0,0},
-{0,1,1,1,1,1,1,1},
-{1,0,1,1,1,1,1,1},
-{0,1,2,2,2,2,2,2},
-{1,1,0,1,1,1,1,1},
-{0,2,1,2,2,2,2,2},
-{2,0,1,2,2,2,2,2},
-{0,1,2,3,3,3,3,3},
-{1,1,1,0,1,1,1,1},
-{0,2,2,1,2,2,2,2},
-{2,0,2,1,2,2,2,2},
-{0,1,3,2,3,3,3,3},
-{2,2,0,1,2,2,2,2},
-{0,3,1,2,3,3,3,3},
-{3,0,1,2,3,3,3,3},
-{0,1,2,3,4,4,4,4},
-{1,1,1,1,0,1,1,1},
-{0,2,2,2,1,2,2,2},
-{2,0,2,2,1,2,2,2},
-{0,1,3,3,2,3,3,3},
-{2,2,0,2,1,2,2,2},
-{0,3,1,3,2,3,3,3},
-{3,0,1,3,2,3,3,3},
-{0,1,2,4,3,4,4,4},
-{2,2,2,0,1,2,2,2},
-{0,3,3,1,2,3,3,3},
-{3,0,3,1,2,3,3,3},
-{0,1,4,2,3,4,4,4},
-{3,3,0,1,2,3,3,3},
-{0,4,1,2,3,4,4,4},
-{4,0,1,2,3,4,4,4},
-{0,1,2,3,4,5,5,5},
-{1,1,1,1,1,0,1,1},
-{0,2,2,2,2,1,2,2},
-{2,0,2,2,2,1,2,2},
-{0,1,3,3,3,2,3,3},
-{2,2,0,2,2,1,2,2},
-{0,3,1,3,3,2,3,3},
-{3,0,1,3,3,2,3,3},
-{0,1,2,4,4,3,4,4},
-{2,2,2,0,2,1,2,2},
-{0,3,3,1,3,2,3,3},
-{3,0,3,1,3,2,3,3},
-{0,1,4,2,4,3,4,4},
-{3,3,0,1,3,2,3,3},
-{0,4,1,2,4,3,4,4},
-{4,0,1,2,4,3,4,4},
-{0,1,2,3,5,4,5,5},
-{2,2,2,2,0,1,2,2},
-{0,3,3,3,1,2,3,3},
-{3,0,3,3,1,2,3,3},
-{0,1,4,4,2,3,4,4},
-{3,3,0,3,1,2,3,3},
-{0,4,1,4,2,3,4,4},
-{4,0,1,4,2,3,4,4},
-{0,1,2,5,3,4,5,5},
-{3,3,3,0,1,2,3,3},
-{0,4,4,1,2,3,4,4},
-{4,0,4,1,2,3,4,4},
-{0,1,5,2,3,4,5,5},
-{4,4,0,1,2,3,4,4},
-{0,5,1,2,3,4,5,5},
-{5,0,1,2,3,4,5,5},
-{0,1,2,3,4,5,6,6},
-{1,1,1,1,1,1,0,1},
-{0,2,2,2,2,2,1,2},
-{2,0,2,2,2,2,1,2},
-{0,1,3,3,3,3,2,3},
-{2,2,0,2,2,2,1,2},
-{0,3,1,3,3,3,2,3},
-{3,0,1,3,3,3,2,3},
-{0,1,2,4,4,4,3,4},
-{2,2,2,0,2,2,1,2},
-{0,3,3,1,3,3,2,3},
-{3,0,3,1,3,3,2,3},
-{0,1,4,2,4,4,3,4},
-{3,3,0,1,3,3,2,3},
-{0,4,1,2,4,4,3,4},
-{4,0,1,2,4,4,3,4},
-{0,1,2,3,5,5,4,5},
-{2,2,2,2,0,2,1,2},
-{0,3,3,3,1,3,2,3},
-{3,0,3,3,1,3,2,3},
-{0,1,4,4,2,4,3,4},
-{3,3,0,3,1,3,2,3},
-{0,4,1,4,2,4,3,4},
-{4,0,1,4,2,4,3,4},
-{0,1,2,5,3,5,4,5},
-{3,3,3,0,1,3,2,3},
-{0,4,4,1,2,4,3,4},
-{4,0,4,1,2,4,3,4},
-{0,1,5,2,3,5,4,5},
-{4,4,0,1,2,4,3,4},
-{0,5,1,2,3,5,4,5},
-{5,0,1,2,3,5,4,5},
-{0,1,2,3,4,6,5,6},
-{2,2,2,2,2,0,1,2},
-{0,3,3,3,3,1,2,3},
-{3,0,3,3,3,1,2,3},
-{0,1,4,4,4,2,3,4},
-{3,3,0,3,3,1,2,3},
-{0,4,1,4,4,2,3,4},
-{4,0,1,4,4,2,3,4},
-{0,1,2,5,5,3,4,5},
-{3,3,3,0,3,1,2,3},
-{0,4,4,1,4,2,3,4},
-{4,0,4,1,4,2,3,4},
-{0,1,5,2,5,3,4,5},
-{4,4,0,1,4,2,3,4},
-{0,5,1,2,5,3,4,5},
-{5,0,1,2,5,3,4,5},
-{0,1,2,3,6,4,5,6},
-{3,3,3,3,0,1,2,3},
-{0,4,4,4,1,2,3,4},
-{4,0,4,4,1,2,3,4},
-{0,1,5,5,2,3,4,5},
-{4,4,0,4,1,2,3,4},
-{0,5,1,5,2,3,4,5},
-{5,0,1,5,2,3,4,5},
-{0,1,2,6,3,4,5,6},
-{4,4,4,0,1,2,3,4},
-{0,5,5,1,2,3,4,5},
-{5,0,5,1,2,3,4,5},
-{0,1,6,2,3,4,5,6},
-{5,5,0,1,2,3,4,5},
-{0,6,1,2,3,4,5,6},
-{6,0,1,2,3,4,5,6},
-{0,1,2,3,4,5,6,7},
-{1,1,1,1,1,1,1,0},
-{0,2,2,2,2,2,2,1},
-{2,0,2,2,2,2,2,1},
-{0,1,3,3,3,3,3,2},
-{2,2,0,2,2,2,2,1},
-{0,3,1,3,3,3,3,2},
-{3,0,1,3,3,3,3,2},
-{0,1,2,4,4,4,4,3},
-{2,2,2,0,2,2,2,1},
-{0,3,3,1,3,3,3,2},
-{3,0,3,1,3,3,3,2},
-{0,1,4,2,4,4,4,3},
-{3,3,0,1,3,3,3,2},
-{0,4,1,2,4,4,4,3},
-{4,0,1,2,4,4,4,3},
-{0,1,2,3,5,5,5,4},
-{2,2,2,2,0,2,2,1},
-{0,3,3,3,1,3,3,2},
-{3,0,3,3,1,3,3,2},
-{0,1,4,4,2,4,4,3},
-{3,3,0,3,1,3,3,2},
-{0,4,1,4,2,4,4,3},
-{4,0,1,4,2,4,4,3},
-{0,1,2,5,3,5,5,4},
-{3,3,3,0,1,3,3,2},
-{0,4,4,1,2,4,4,3},
-{4,0,4,1,2,4,4,3},
-{0,1,5,2,3,5,5,4},
-{4,4,0,1,2,4,4,3},
-{0,5,1,2,3,5,5,4},
-{5,0,1,2,3,5,5,4},
-{0,1,2,3,4,6,6,5},
-{2,2,2,2,2,0,2,1},
-{0,3,3,3,3,1,3,2},
-{3,0,3,3,3,1,3,2},
-{0,1,4,4,4,2,4,3},
-{3,3,0,3,3,1,3,2},
-{0,4,1,4,4,2,4,3},
-{4,0,1,4,4,2,4,3},
-{0,1,2,5,5,3,5,4},
-{3,3,3,0,3,1,3,2},
-{0,4,4,1,4,2,4,3},
-{4,0,4,1,4,2,4,3},
-{0,1,5,2,5,3,5,4},
-{4,4,0,1,4,2,4,3},
-{0,5,1,2,5,3,5,4},
-{5,0,1,2,5,3,5,4},
-{0,1,2,3,6,4,6,5},
-{3,3,3,3,0,1,3,2},
-{0,4,4,4,1,2,4,3},
-{4,0,4,4,1,2,4,3},
-{0,1,5,5,2,3,5,4},
-{4,4,0,4,1,2,4,3},
-{0,5,1,5,2,3,5,4},
-{5,0,1,5,2,3,5,4},
-{0,1,2,6,3,4,6,5},
-{4,4,4,0,1,2,4,3},
-{0,5,5,1,2,3,5,4},
-{5,0,5,1,2,3,5,4},
-{0,1,6,2,3,4,6,5},
-{5,5,0,1,2,3,5,4},
-{0,6,1,2,3,4,6,5},
-{6,0,1,2,3,4,6,5},
-{0,1,2,3,4,5,7,6},
-{2,2,2,2,2,2,0,1},
-{0,3,3,3,3,3,1,2},
-{3,0,3,3,3,3,1,2},
-{0,1,4,4,4,4,2,3},
-{3,3,0,3,3,3,1,2},
-{0,4,1,4,4,4,2,3},
-{4,0,1,4,4,4,2,3},
-{0,1,2,5,5,5,3,4},
-{3,3,3,0,3,3,1,2},
-{0,4,4,1,4,4,2,3},
-{4,0,4,1,4,4,2,3},
-{0,1,5,2,5,5,3,4},
-{4,4,0,1,4,4,2,3},
-{0,5,1,2,5,5,3,4},
-{5,0,1,2,5,5,3,4},
-{0,1,2,3,6,6,4,5},
-{3,3,3,3,0,3,1,2},
-{0,4,4,4,1,4,2,3},
-{4,0,4,4,1,4,2,3},
-{0,1,5,5,2,5,3,4},
-{4,4,0,4,1,4,2,3},
-{0,5,1,5,2,5,3,4},
-{5,0,1,5,2,5,3,4},
-{0,1,2,6,3,6,4,5},
-{4,4,4,0,1,4,2,3},
-{0,5,5,1,2,5,3,4},
-{5,0,5,1,2,5,3,4},
-{0,1,6,2,3,6,4,5},
-{5,5,0,1,2,5,3,4},
-{0,6,1,2,3,6,4,5},
-{6,0,1,2,3,6,4,5},
-{0,1,2,3,4,7,5,6},
-{3,3,3,3,3,0,1,2},
-{0,4,4,4,4,1,2,3},
-{4,0,4,4,4,1,2,3},
-{0,1,5,5,5,2,3,4},
-{4,4,0,4,4,1,2,3},
-{0,5,1,5,5,2,3,4},
-{5,0,1,5,5,2,3,4},
-{0,1,2,6,6,3,4,5},
-{4,4,4,0,4,1,2,3},
-{0,5,5,1,5,2,3,4},
-{5,0,5,1,5,2,3,4},
-{0,1,6,2,6,3,4,5},
-{5,5,0,1,5,2,3,4},
-{0,6,1,2,6,3,4,5},
-{6,0,1,2,6,3,4,5},
-{0,1,2,3,7,4,5,6},
-{4,4,4,4,0,1,2,3},
-{0,5,5,5,1,2,3,4},
-{5,0,5,5,1,2,3,4},
-{0,1,6,6,2,3,4,5},
-{5,5,0,5,1,2,3,4},
-{0,6,1,6,2,3,4,5},
-{6,0,1,6,2,3,4,5},
-{0,1,2,7,3,4,5,6},
-{5,5,5,0,1,2,3,4},
-{0,6,6,1,2,3,4,5},
-{6,0,6,1,2,3,4,5},
-{0,1,7,2,3,4,5,6},
-{6,6,0,1,2,3,4,5},
-{0,7,1,2,3,4,5,6},
-{7,0,1,2,3,4,5,6},
-{0,1,2,3,4,5,6,7}
+0,0,0,0,0,0,0,0,
+0,1,1,1,1,1,1,1,
+1,0,1,1,1,1,1,1,
+0,1,2,2,2,2,2,2,
+1,1,0,1,1,1,1,1,
+0,2,1,2,2,2,2,2,
+2,0,1,2,2,2,2,2,
+0,1,2,3,3,3,3,3,
+1,1,1,0,1,1,1,1,
+0,2,2,1,2,2,2,2,
+2,0,2,1,2,2,2,2,
+0,1,3,2,3,3,3,3,
+2,2,0,1,2,2,2,2,
+0,3,1,2,3,3,3,3,
+3,0,1,2,3,3,3,3,
+0,1,2,3,4,4,4,4,
+1,1,1,1,0,1,1,1,
+0,2,2,2,1,2,2,2,
+2,0,2,2,1,2,2,2,
+0,1,3,3,2,3,3,3,
+2,2,0,2,1,2,2,2,
+0,3,1,3,2,3,3,3,
+3,0,1,3,2,3,3,3,
+0,1,2,4,3,4,4,4,
+2,2,2,0,1,2,2,2,
+0,3,3,1,2,3,3,3,
+3,0,3,1,2,3,3,3,
+0,1,4,2,3,4,4,4,
+3,3,0,1,2,3,3,3,
+0,4,1,2,3,4,4,4,
+4,0,1,2,3,4,4,4,
+0,1,2,3,4,5,5,5,
+1,1,1,1,1,0,1,1,
+0,2,2,2,2,1,2,2,
+2,0,2,2,2,1,2,2,
+0,1,3,3,3,2,3,3,
+2,2,0,2,2,1,2,2,
+0,3,1,3,3,2,3,3,
+3,0,1,3,3,2,3,3,
+0,1,2,4,4,3,4,4,
+2,2,2,0,2,1,2,2,
+0,3,3,1,3,2,3,3,
+3,0,3,1,3,2,3,3,
+0,1,4,2,4,3,4,4,
+3,3,0,1,3,2,3,3,
+0,4,1,2,4,3,4,4,
+4,0,1,2,4,3,4,4,
+0,1,2,3,5,4,5,5,
+2,2,2,2,0,1,2,2,
+0,3,3,3,1,2,3,3,
+3,0,3,3,1,2,3,3,
+0,1,4,4,2,3,4,4,
+3,3,0,3,1,2,3,3,
+0,4,1,4,2,3,4,4,
+4,0,1,4,2,3,4,4,
+0,1,2,5,3,4,5,5,
+3,3,3,0,1,2,3,3,
+0,4,4,1,2,3,4,4,
+4,0,4,1,2,3,4,4,
+0,1,5,2,3,4,5,5,
+4,4,0,1,2,3,4,4,
+0,5,1,2,3,4,5,5,
+5,0,1,2,3,4,5,5,
+0,1,2,3,4,5,6,6,
+1,1,1,1,1,1,0,1,
+0,2,2,2,2,2,1,2,
+2,0,2,2,2,2,1,2,
+0,1,3,3,3,3,2,3,
+2,2,0,2,2,2,1,2,
+0,3,1,3,3,3,2,3,
+3,0,1,3,3,3,2,3,
+0,1,2,4,4,4,3,4,
+2,2,2,0,2,2,1,2,
+0,3,3,1,3,3,2,3,
+3,0,3,1,3,3,2,3,
+0,1,4,2,4,4,3,4,
+3,3,0,1,3,3,2,3,
+0,4,1,2,4,4,3,4,
+4,0,1,2,4,4,3,4,
+0,1,2,3,5,5,4,5,
+2,2,2,2,0,2,1,2,
+0,3,3,3,1,3,2,3,
+3,0,3,3,1,3,2,3,
+0,1,4,4,2,4,3,4,
+3,3,0,3,1,3,2,3,
+0,4,1,4,2,4,3,4,
+4,0,1,4,2,4,3,4,
+0,1,2,5,3,5,4,5,
+3,3,3,0,1,3,2,3,
+0,4,4,1,2,4,3,4,
+4,0,4,1,2,4,3,4,
+0,1,5,2,3,5,4,5,
+4,4,0,1,2,4,3,4,
+0,5,1,2,3,5,4,5,
+5,0,1,2,3,5,4,5,
+0,1,2,3,4,6,5,6,
+2,2,2,2,2,0,1,2,
+0,3,3,3,3,1,2,3,
+3,0,3,3,3,1,2,3,
+0,1,4,4,4,2,3,4,
+3,3,0,3,3,1,2,3,
+0,4,1,4,4,2,3,4,
+4,0,1,4,4,2,3,4,
+0,1,2,5,5,3,4,5,
+3,3,3,0,3,1,2,3,
+0,4,4,1,4,2,3,4,
+4,0,4,1,4,2,3,4,
+0,1,5,2,5,3,4,5,
+4,4,0,1,4,2,3,4,
+0,5,1,2,5,3,4,5,
+5,0,1,2,5,3,4,5,
+0,1,2,3,6,4,5,6,
+3,3,3,3,0,1,2,3,
+0,4,4,4,1,2,3,4,
+4,0,4,4,1,2,3,4,
+0,1,5,5,2,3,4,5,
+4,4,0,4,1,2,3,4,
+0,5,1,5,2,3,4,5,
+5,0,1,5,2,3,4,5,
+0,1,2,6,3,4,5,6,
+4,4,4,0,1,2,3,4,
+0,5,5,1,2,3,4,5,
+5,0,5,1,2,3,4,5,
+0,1,6,2,3,4,5,6,
+5,5,0,1,2,3,4,5,
+0,6,1,2,3,4,5,6,
+6,0,1,2,3,4,5,6,
+0,1,2,3,4,5,6,7,
+1,1,1,1,1,1,1,0,
+0,2,2,2,2,2,2,1,
+2,0,2,2,2,2,2,1,
+0,1,3,3,3,3,3,2,
+2,2,0,2,2,2,2,1,
+0,3,1,3,3,3,3,2,
+3,0,1,3,3,3,3,2,
+0,1,2,4,4,4,4,3,
+2,2,2,0,2,2,2,1,
+0,3,3,1,3,3,3,2,
+3,0,3,1,3,3,3,2,
+0,1,4,2,4,4,4,3,
+3,3,0,1,3,3,3,2,
+0,4,1,2,4,4,4,3,
+4,0,1,2,4,4,4,3,
+0,1,2,3,5,5,5,4,
+2,2,2,2,0,2,2,1,
+0,3,3,3,1,3,3,2,
+3,0,3,3,1,3,3,2,
+0,1,4,4,2,4,4,3,
+3,3,0,3,1,3,3,2,
+0,4,1,4,2,4,4,3,
+4,0,1,4,2,4,4,3,
+0,1,2,5,3,5,5,4,
+3,3,3,0,1,3,3,2,
+0,4,4,1,2,4,4,3,
+4,0,4,1,2,4,4,3,
+0,1,5,2,3,5,5,4,
+4,4,0,1,2,4,4,3,
+0,5,1,2,3,5,5,4,
+5,0,1,2,3,5,5,4,
+0,1,2,3,4,6,6,5,
+2,2,2,2,2,0,2,1,
+0,3,3,3,3,1,3,2,
+3,0,3,3,3,1,3,2,
+0,1,4,4,4,2,4,3,
+3,3,0,3,3,1,3,2,
+0,4,1,4,4,2,4,3,
+4,0,1,4,4,2,4,3,
+0,1,2,5,5,3,5,4,
+3,3,3,0,3,1,3,2,
+0,4,4,1,4,2,4,3,
+4,0,4,1,4,2,4,3,
+0,1,5,2,5,3,5,4,
+4,4,0,1,4,2,4,3,
+0,5,1,2,5,3,5,4,
+5,0,1,2,5,3,5,4,
+0,1,2,3,6,4,6,5,
+3,3,3,3,0,1,3,2,
+0,4,4,4,1,2,4,3,
+4,0,4,4,1,2,4,3,
+0,1,5,5,2,3,5,4,
+4,4,0,4,1,2,4,3,
+0,5,1,5,2,3,5,4,
+5,0,1,5,2,3,5,4,
+0,1,2,6,3,4,6,5,
+4,4,4,0,1,2,4,3,
+0,5,5,1,2,3,5,4,
+5,0,5,1,2,3,5,4,
+0,1,6,2,3,4,6,5,
+5,5,0,1,2,3,5,4,
+0,6,1,2,3,4,6,5,
+6,0,1,2,3,4,6,5,
+0,1,2,3,4,5,7,6,
+2,2,2,2,2,2,0,1,
+0,3,3,3,3,3,1,2,
+3,0,3,3,3,3,1,2,
+0,1,4,4,4,4,2,3,
+3,3,0,3,3,3,1,2,
+0,4,1,4,4,4,2,3,
+4,0,1,4,4,4,2,3,
+0,1,2,5,5,5,3,4,
+3,3,3,0,3,3,1,2,
+0,4,4,1,4,4,2,3,
+4,0,4,1,4,4,2,3,
+0,1,5,2,5,5,3,4,
+4,4,0,1,4,4,2,3,
+0,5,1,2,5,5,3,4,
+5,0,1,2,5,5,3,4,
+0,1,2,3,6,6,4,5,
+3,3,3,3,0,3,1,2,
+0,4,4,4,1,4,2,3,
+4,0,4,4,1,4,2,3,
+0,1,5,5,2,5,3,4,
+4,4,0,4,1,4,2,3,
+0,5,1,5,2,5,3,4,
+5,0,1,5,2,5,3,4,
+0,1,2,6,3,6,4,5,
+4,4,4,0,1,4,2,3,
+0,5,5,1,2,5,3,4,
+5,0,5,1,2,5,3,4,
+0,1,6,2,3,6,4,5,
+5,5,0,1,2,5,3,4,
+0,6,1,2,3,6,4,5,
+6,0,1,2,3,6,4,5,
+0,1,2,3,4,7,5,6,
+3,3,3,3,3,0,1,2,
+0,4,4,4,4,1,2,3,
+4,0,4,4,4,1,2,3,
+0,1,5,5,5,2,3,4,
+4,4,0,4,4,1,2,3,
+0,5,1,5,5,2,3,4,
+5,0,1,5,5,2,3,4,
+0,1,2,6,6,3,4,5,
+4,4,4,0,4,1,2,3,
+0,5,5,1,5,2,3,4,
+5,0,5,1,5,2,3,4,
+0,1,6,2,6,3,4,5,
+5,5,0,1,5,2,3,4,
+0,6,1,2,6,3,4,5,
+6,0,1,2,6,3,4,5,
+0,1,2,3,7,4,5,6,
+4,4,4,4,0,1,2,3,
+0,5,5,5,1,2,3,4,
+5,0,5,5,1,2,3,4,
+0,1,6,6,2,3,4,5,
+5,5,0,5,1,2,3,4,
+0,6,1,6,2,3,4,5,
+6,0,1,6,2,3,4,5,
+0,1,2,7,3,4,5,6,
+5,5,5,0,1,2,3,4,
+0,6,6,1,2,3,4,5,
+6,0,6,1,2,3,4,5,
+0,1,7,2,3,4,5,6,
+6,6,0,1,2,3,4,5,
+0,7,1,2,3,4,5,6,
+7,0,1,2,3,4,5,6,
+0,1,2,3,4,5,6,7
 };
 #define u2vmask(_m_,_tv_)                  _mm256_sllv_epi32(_mm256_set1_epi8(_m_), _tv_)
 #define mm256_maskz_expand_epi32(_m_, _v_) _mm256_permutevar8x32_epi32(_v_,  _mm256_cvtepu8_epi32(_mm_cvtsi64_si128(ctou64(permv[_m_]))) )
@@ -489,57 +471,38 @@ unsigned char *bitunpack256v32( const unsigned char *__restrict in, unsigned n,
 }
 
 //--------------------------------------- zeromask unpack for TurboPFor vp4d.c --------------------------------------
-
-//-- bitunpack used in vp4d.c ---------
 #define VO32(_op_, _i_, _ov_, _nb_,_parm_)  xm = *bb++; _mm256_storeu_si256(_op_++, _mm256_add_epi32(_ov_, _mm256_slli_epi32(mm256_maskz_loadu_epi32(xm,(__m256i*)pex), _nb_) )); pex += popcnt32(xm)
 #define VOZ32(_op_, _i_, _ov_, _nb_,_parm_) xm = *bb++; _mm256_storeu_si256(_op_++,                                          mm256_maskz_loadu_epi32(xm,(__m256i*)pex) );         pex += popcnt32(xm)
 #define BITUNPACK0(_parm_)
 #include "bitunpack_.h"
 unsigned char *_bitunpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned b, unsigned *__restrict pex, unsigned char *bb) {
-  const unsigned char *ip = in+PAD8(256*b); 
-             unsigned xm; 
-              __m256i zv = _mm256_setzero_si256(), sv,
-                      tv = _mm256_set_epi32(0,1,2,3,4,5,6,7);
+  const unsigned char *ip = in+PAD8(256*b); unsigned xm; __m256i sv, zv = _mm256_setzero_si256(), tv = _mm256_set_epi32(0,1,2,3,4,5,6,7);
   BITUNPACK256V32(in, b, out, sv);
   return (unsigned char *)ip;
 }
 
-//-- bitunpack zigzag ---------
 #define VOZ32(_op_, _i_, ov, _nb_,_parm_) _mm256_storeu_si256(_op_++, _parm_)
 #define VO32(_op_, i, _ov_, _nb_,_sv_) _ov_ = mm256_zzagd_epi32(_ov_); _sv_ = mm256_scan_epi32(_ov_,_sv_); _mm256_storeu_si256(_op_++, _sv_)
 #include "bitunpack_.h"
 #define BITUNPACK0(_parm_)
 unsigned char *bitzunpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b) {
   const unsigned char *ip = in+PAD8(256*b);
-  __m256i sv = _mm256_set1_epi32(start);          //, zv = _mm256_setzero_si256();
+  __m256i sv = _mm256_set1_epi32(start);//, zv = _mm256_setzero_si256();
   BITUNPACK256V32(in, b, out, sv);
   return (unsigned char *)ip;
 }
 
-//-- bitunpack xor ---------
-#define VOZ32(_op_, _i_, ov, _nb_,_parm_) _mm256_storeu_si256(_op_++, _parm_)
-#define VO32(_op_, i, _ov_, _nb_,_sv_)    _sv_ = mm256_xord_epi32(_ov_,_sv_); _mm256_storeu_si256(_op_++, _sv_)
-#include "bitunpack_.h"
-#define BITUNPACK0(_parm_)
-unsigned char *bitxunpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b) {
-  const unsigned char *ip = in+PAD8(256*b);
-  __m256i sv = _mm256_set1_epi32(start);     
-  BITUNPACK256V32(in, b, out, sv);
-  return (unsigned char *)ip;
-}
 
-//-- bitunpack delta ---------
 #define VO32(_op_, i, _ov_, _nb_,_sv_) _sv_ = mm256_scan_epi32(_ov_,_sv_); _mm256_storeu_si256(_op_++, _sv_)
 #include "bitunpack_.h"
 #define BITUNPACK0(_parm_)
 unsigned char *bitdunpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b) {
   const unsigned char *ip = in+PAD8(256*b);
-  __m256i sv = _mm256_set1_epi32(start);                // zv = _mm256_setzero_si256();
+  __m256i sv = _mm256_set1_epi32(start);// zv = _mm256_setzero_si256();
   BITUNPACK256V32(in, b, out, sv);
   return (unsigned char *)ip;
 }
 
-//-- bitunpack FOR ---------
 #define VO32( _op_, _i_, _ov_, _nb_,_parm_) _mm256_storeu_si256(_op_++, _mm256_add_epi32(_ov_, sv))
 #include "bitunpack_.h"
 #define BITUNPACK0(_parm_)
@@ -549,8 +512,7 @@ unsigned char *bitfunpack256v32( const unsigned char *__restrict in, unsigned n,
   BITUNPACK256V32(in, b, out, sv);
   return (unsigned char *)ip;
 }
-
-//-- bitunpack delta used in vp4d.c ---------
+//-----------------------------------------------------------------------------
 #define VX32(_i_,  _nb_,_ov_) xm = *bb++; _ov_ = _mm256_add_epi32(_ov_, _mm256_slli_epi32(mm256_maskz_loadu_epi32(xm,(__m256i*)pex), _nb_) ); pex += popcnt32(xm)
 #define VXZ32(_i_, _nb_,_ov_) xm = *bb++; _ov_ =                                          mm256_maskz_loadu_epi32(xm,(__m256i*)pex);       pex += popcnt32(xm)
 
@@ -565,7 +527,6 @@ unsigned char *_bitdunpack256v32( const unsigned char *__restrict in, unsigned n
   return (unsigned char *)ip;
 }
 
-//-- bitunpack zigag used in vp4d.c ---------
 #define VX32(_i_, _nb_,_ov_)  xm = *bb++; _ov_ = _mm256_add_epi32(_ov_, _mm256_slli_epi32(mm256_maskz_loadu_epi32(xm,(__m256i*)pex), _nb_) ); pex += popcnt32(xm)
 #define VXZ32(_i_, _nb_,_ov_) xm = *bb++; _ov_ =                                          mm256_maskz_loadu_epi32(xm,(__m256i*)pex);          pex += popcnt32(xm)
 
@@ -592,14 +553,12 @@ unsigned char *_bitdunpack256v32( const unsigned char *__restrict in, unsigned n
 unsigned char *_bitzunpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b, unsigned *__restrict pex, unsigned char *bb) {
   const unsigned char *ip = in+PAD8(256*b); 
         unsigned xm; 
-  const __m256i zv = _mm256_setzero_si256(), 
-                tv = _mm256_set_epi32(0,1,2,3,4,5,6,7); 
+  const __m256i zv = _mm256_setzero_si256(), tv = _mm256_set_epi32(0,1,2,3,4,5,6,7); 
         __m256i sv = _mm256_set1_epi32(start);
   BITUNPACK256V32(in, b, out, sv); 
   return (unsigned char *)ip;
 }
 
-//-- bitunpack delta 1 -----------------------------
 #define VO32(_op_, i, _ov_, _nb_,_sv_)    _sv_ = mm256_scani_epi32(_ov_,_sv_,cv); _mm256_storeu_si256(_op_++, _sv_);
 #define VOZ32(_op_, _i_, ov, _nb_,_parm_) _mm256_storeu_si256(_op_++, _parm_); _parm_ = _mm256_add_epi32(_parm_, cv)
 #include "bitunpack_.h"
@@ -607,13 +566,11 @@ unsigned char *_bitzunpack256v32( const unsigned char *__restrict in, unsigned n
 unsigned char *bitd1unpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b) {
   const unsigned char *ip = in+PAD8(256*b);
   const __m256i zv = _mm256_setzero_si256();
-        __m256i sv = _mm256_set1_epi32(start), 
-		        cv = _mm256_set_epi32(8,7,6,5,4,3,2,1);
+        __m256i sv = _mm256_set1_epi32(start), cv = _mm256_set_epi32(8,7,6,5,4,3,2,1);
   BITUNPACK256V32(in, b, out, sv);
   return (unsigned char *)ip;
 }
 
-//-- bitunpack FOR 1 -----------------------------
 #define VO32( _op_, _i_, _ov_, _nb_,_sv_) _mm256_storeu_si256(_op_++, _mm256_add_epi32(_ov_, _sv_)); _sv_ = _mm256_add_epi32(_sv_, cv)
 #define VOZ32(_op_, _i_, ov, _nb_,_sv_)   _mm256_storeu_si256(_op_++, _sv_);                         _sv_ = _mm256_add_epi32(_sv_, cv);
 #include "bitunpack_.h"
@@ -626,31 +583,24 @@ unsigned char *bitf1unpack256v32( const unsigned char *__restrict in, unsigned n
   return (unsigned char *)ip;
 }
 
-//-- bitunpack delta 1 for vp4d.c -----------------------------
 #define VO32( _op_, _i_, _ov_, _nb_,_sv_)   VX32( _i_, _nb_,_ov_); _sv_ = mm256_scani_epi32(_ov_,_sv_,cv); _mm256_storeu_si256(_op_++, _sv_);
 #define VOZ32(_op_, _i_, _ov_, _nb_,_sv_)   VXZ32(_i_, _nb_,_ov_); _sv_ = mm256_scani_epi32(_ov_,_sv_,cv); _mm256_storeu_si256(_op_++, _sv_);
 #include "bitunpack_.h"
 #define BITUNPACK0(_parm_) mv = _mm256_set1_epi32(0) //_parm_ = _mm_setzero_si128()
 unsigned char *_bitd1unpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b, unsigned *__restrict pex, unsigned char *bb) {
-  const unsigned char *ip = in+PAD8(256*b); 
-  unsigned xm;
-  const __m256i cv = _mm256_set_epi32(8,7,6,5,4,3,2,1), 
-                zv = _mm256_setzero_si256(), 
-				tv = _mm256_set_epi32(0,1,2,3,4,5,6,7);
+  const unsigned char *ip = in+PAD8(256*b); unsigned xm;
+  const __m256i cv = _mm256_set_epi32(8,7,6,5,4,3,2,1), zv = _mm256_setzero_si256(), tv = _mm256_set_epi32(0,1,2,3,4,5,6,7);
         __m256i sv = _mm256_set1_epi32(start);
   BITUNPACK256V32(in, b, out, sv);
   return (unsigned char *)ip;
 }
 
-//---------------------------------------------------- bitnunpack ---------------------------------------------------------------------
 size_t bitnunpack256v32(  unsigned char *__restrict in, size_t n, uint32_t *__restrict out) { uint32_t *op;       _BITNUNPACKV( in, n, out, 256, 32, bitunpack256v); }
 size_t bitndunpack256v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out) { uint32_t *op,start; _BITNDUNPACKV(in, n, out, 256, 32, bitdunpack256v,  bitdunpack); }
 size_t bitnd1unpack256v32(unsigned char *__restrict in, size_t n, uint32_t *__restrict out) { uint32_t *op,start; _BITNDUNPACKV(in, n, out, 256, 32, bitd1unpack256v, bitd1unpack); }
 //size_t bitns1unpack256v32(unsigned char *__restrict in, size_t n, uint32_t *__restrict out) { uint32_t *op,start; _BITNDUNPACKV(in, n, out, 256, 32, bits1unpack256v, bitd1unpack); }
 size_t bitnzunpack256v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out) { uint32_t *op,start; _BITNDUNPACKV(in, n, out, 256, 32, bitzunpack256v,  bitzunpack); }
-size_t bitnxunpack256v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out) { uint32_t *op,start; _BITNDUNPACKV(in, n, out, 256, 32, bitxunpack256v,  bitxunpack); }
 size_t bitnfunpack256v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out) { uint32_t *op,start; _BITNDUNPACKV(in, n, out, 256, 32, bitfunpack256v,  bitfunpack); }
-
   #elif defined(__SSE2__) || defined(__ARM_NEON) //------------------------------ SSE2/SSSE3 ---------------------------------------------------------
 #define BITMAX16 16
 #define BITMAX32 32
@@ -1051,30 +1001,15 @@ ALIGNED(char, _shuffle_16[256][16],16) = {
 #define BITUNPACK0(_parm_) //_parm_ = _mm_setzero_si128()
 #include "bitunpack_.h"
 
-//--- bitunpack for vp4d.c ------------------------------
 unsigned char *_bitunpack128v16( const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned b, unsigned short *__restrict pex, unsigned char *bb) {
-  const unsigned char *ip = in+PAD8(128*b); 
-        unsigned m; 
-  __m128i sv; 
-  BITUNPACK128V16(in, b, out, sv); 
-  return (unsigned char *)ip;
+  const unsigned char *ip = in+PAD8(128*b); unsigned m; __m128i sv; BITUNPACK128V16(in, b, out, sv); return (unsigned char *)ip;
 }
-
 unsigned char *_bitunpack128v32( const unsigned char *__restrict in, unsigned n, unsigned       *__restrict out, unsigned b, unsigned *__restrict pex, unsigned char *bb) {
-  const unsigned char *ip = in+PAD8(128*b); 
-  unsigned m; 
-  __m128i sv; 
-  BITUNPACK128V32(in, b, out, sv); 
-  return (unsigned char *)ip;
+  const unsigned char *ip = in+PAD8(128*b); unsigned m; __m128i sv; BITUNPACK128V32(in, b, out, sv); return (unsigned char *)ip;
 }
-
 unsigned char *_bitunpack256w32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned b, unsigned *__restrict pex, unsigned char *bb) {
-  const unsigned char *_in=in; 
-  unsigned *_out=out, m; 
-  __m128i sv;
-  BITUNPACK128V32(in, b, out, sv); 
-  out = _out+128; 
-  in  = _in+PAD8(128*b);
+  const unsigned char *_in=in; unsigned *_out=out, m; __m128i sv;
+  BITUNPACK128V32(in, b, out, sv); out = _out+128; in=_in+PAD8(128*b);
   BITUNPACK128V32(in, b, out, sv);
   return (unsigned char *)_in+PAD8(256*b);
 }
@@ -1088,23 +1023,19 @@ unsigned char *_bitunpack256w32( const unsigned char *__restrict in, unsigned n,
 
 #include "bitunpack_.h"
 unsigned char *_bitunpack128v64( const unsigned char *__restrict in, unsigned n, uint64_t       *__restrict out, unsigned b, uint32_t *__restrict pex, unsigned char *bb) {
-  const unsigned char *ip = in+PAD8(128*b); 
-  unsigned m; 
-  __m128i zv = _mm_setzero_si128(); 
-  BITUNPACK128V32(in, b, out, 0); 
-  return (unsigned char *)ip;
+  const unsigned char *ip = in+PAD8(128*b); unsigned m; __m128i zv = _mm_setzero_si128(); BITUNPACK128V32(in, b, out, 0); return (unsigned char *)ip;
 }
 
+#define BITMAX16 16
+#define BITMAX32 32
+
 #undef VO32
 #undef VOZ32
 #undef VO16
 #undef VOZ16
 #undef BITUNPACK0
-//--------------------------------------------------------------------------------------------------------------------------------------------
-#define BITMAX16 16
-#define BITMAX32 32
 
-//--- bitunpack zigzag --------------------
+//-------------------------------------------------------------------
 #define VOZ16(_op_, _i_, _ov_, _nb_,_parm_) _mm_storeu_si128(_op_++, _parm_)
 #define VOZ32(_op_, _i_, _ov_, _nb_,_parm_) _mm_storeu_si128(_op_++, _parm_)
 #define VO16( _op_, _i_, _ov_, _nb_,_sv_) _ov_ = mm_zzagd_epi16(_ov_); _sv_ = mm_scan_epi16(_ov_,_sv_); _mm_storeu_si128(_op_++, _sv_)
@@ -1112,77 +1043,38 @@ unsigned char *_bitunpack128v64( const unsigned char *__restrict in, unsigned n,
 #include "bitunpack_.h"
 #define BITUNPACK0(_parm_)
 unsigned char *bitzunpack128v16( const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned short start, unsigned b) {
-  const unsigned char *ip = in+PAD8(128*b); 
-  __m128i sv = _mm_set1_epi16(start); 
-  BITUNPACK128V16(in, b, out, sv); 
-  return (unsigned char *)ip;
+  const unsigned char *ip = in+PAD8(128*b); __m128i sv = _mm_set1_epi16(start); BITUNPACK128V16(in, b, out, sv); return (unsigned char *)ip;
 }
-
 unsigned char *bitzunpack128v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b) {
-  const unsigned char *ip = in+PAD8(128*b); 
-              __m128i sv = _mm_set1_epi32(start); 
-  BITUNPACK128V32(in, b, out, sv); 
-  return (unsigned char *)ip;
-}
-
-#define VO16( _op_, _i_, _ov_, _nb_,_sv_) _sv_ = mm_xord_epi16(_ov_,_sv_); _mm_storeu_si128(_op_++, _sv_)
-#define VO32( _op_, _i_, _ov_, _nb_,_sv_) _sv_ = mm_xord_epi32(_ov_,_sv_); _mm_storeu_si128(_op_++, _sv_)
-#include "bitunpack_.h"
-#define BITUNPACK0(_parm_)
-unsigned char *bitxunpack128v16( const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned short start, unsigned b) {
-  const unsigned char *ip = in+PAD8(128*b); 
-  __m128i sv = _mm_set1_epi16(start); 
-  BITUNPACK128V16(in, b, out, sv); 
-  return (unsigned char *)ip;
-}
-
-unsigned char *bitxunpack128v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b) {
-  const unsigned char *ip = in+PAD8(128*b); 
-              __m128i sv = _mm_set1_epi32(start); 
-  BITUNPACK128V32(in, b, out, sv); 
-  return (unsigned char *)ip;
+  const unsigned char *ip = in+PAD8(128*b); __m128i sv = _mm_set1_epi32(start); BITUNPACK128V32(in, b, out, sv); return (unsigned char *)ip;
 }
 
-//-- bitunpack delta ------------------------------
 #define VO32(_op_, i, _ov_, _nb_,_sv_) _sv_ = mm_scan_epi32(_ov_,_sv_); _mm_storeu_si128(_op_++, _sv_)
 #define VO16(_op_, i, _ov_, _nb_,_sv_) _sv_ = mm_scan_epi16(_ov_,_sv_); _mm_storeu_si128(_op_++, _sv_)
 #include "bitunpack_.h"
 #define BITUNPACK0(_parm_)
 unsigned char *bitdunpack128v16( const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out, uint16_t start, unsigned b) {
-  const unsigned char *ip = in+PAD8(128*b); 
-              __m128i sv = _mm_set1_epi16(start); 
-  BITUNPACK128V16(in, b, out, sv); 
-  return (unsigned char *)ip;
+  const unsigned char *ip = in+PAD8(128*b); __m128i sv = _mm_set1_epi16(start); BITUNPACK128V16(in, b, out, sv); return (unsigned char *)ip;
 }
-
 unsigned char *bitdunpack128v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b) {
   const unsigned char *ip = in+PAD8(128*b); __m128i sv = _mm_set1_epi32(start); BITUNPACK128V32(in, b, out, sv); return (unsigned char *)ip;
 }
 
-//-- bitunpack FOR ----------------------------
 #define VO32( _op_, _i_, _ov_, _nb_,_parm_) _mm_storeu_si128(_op_++, _mm_add_epi32(_ov_, sv))
 #define VO16( _op_, _i_, _ov_, _nb_,_parm_) _mm_storeu_si128(_op_++, _mm_add_epi16(_ov_, sv))
 #include "bitunpack_.h"
 #define BITUNPACK0(_parm_)
 unsigned char *bitfunpack128v16( const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out, uint16_t start, unsigned b) {
-  const unsigned char *ip = in+PAD8(128*b); 
-  __m128i sv = _mm_set1_epi16(start); 
-  BITUNPACK128V16(in, b, out, sv); 
-  return (unsigned char *)ip;
+  const unsigned char *ip = in+PAD8(128*b); __m128i sv = _mm_set1_epi16(start); BITUNPACK128V16(in, b, out, sv); return (unsigned char *)ip;
 }
-
 unsigned char *bitfunpack128v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b) {
-  const unsigned char *ip = in+PAD8(128*b); 
-  __m128i sv = _mm_set1_epi32(start); 
-  BITUNPACK128V32(in, b, out, sv); 
-  return (unsigned char *)ip;
+  const unsigned char *ip = in+PAD8(128*b); __m128i sv = _mm_set1_epi32(start); BITUNPACK128V32(in, b, out, sv); return (unsigned char *)ip;
 }
 
     #if defined(__SSSE3__) || defined(__ARM_NEON)
 #define BITMAX16 15
 #define BITMAX32 31
 
-//-- bitunpack delta used in vp4d.c ---------
 #define VX32(_i_, _nb_,_ov_)         if(!((_i_) & 1)) m = (*bb) & 0xf;else m = (*bb++) >> 4; _ov_ = _mm_add_epi32(_ov_, _mm_shuffle_epi8( mm_slli_epi32(_mm_loadu_si128((__m128i*)pex), _nb_), _mm_loadu_si128((__m128i*)_shuffle_32[m]))); pex += popcnt32(m)
 #define VXZ32(_i_, _nb_,_ov_)        if(!((_i_) & 1)) m = (*bb) & 0xf;else m = (*bb++) >> 4; _ov_ =                     _mm_shuffle_epi8(               _mm_loadu_si128((__m128i*)pex),        _mm_loadu_si128((__m128i*)_shuffle_32[m]));  pex += popcnt32(m)
 #define VO32( _op_, _i_, _ov_, _nb_,_sv_)   VX32( _i_, _nb_,_ov_); _sv_ = mm_scan_epi32(_ov_,_sv_); _mm_storeu_si128(_op_++, _sv_);
@@ -1195,18 +1087,10 @@ unsigned char *bitfunpack128v32( const unsigned char *__restrict in, unsigned n,
 #include "bitunpack_.h"
 #define BITUNPACK0(_parm_)
 unsigned char *_bitdunpack128v16( const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned short start, unsigned b, unsigned short *__restrict pex, unsigned char *bb) {
-  const unsigned char *ip = in+PAD8(128*b); 
-  unsigned m; 
-  __m128i sv = _mm_set1_epi16(start); 
-  BITUNPACK128V16(in, b, out, sv); 
-  return (unsigned char *)ip;
+  const unsigned char *ip = in+PAD8(128*b); unsigned m; __m128i sv = _mm_set1_epi16(start); BITUNPACK128V16(in, b, out, sv); return (unsigned char *)ip;
 }
-
 unsigned char *_bitdunpack128v32( const unsigned char *__restrict in, unsigned n, unsigned       *__restrict out, unsigned       start, unsigned b, unsigned       *__restrict pex, unsigned char *bb) {
-  const unsigned char *ip = in+PAD8(128*b); 
-  unsigned m; __m128i sv = _mm_set1_epi32(start); 
-  BITUNPACK128V32(in, b, out, sv); 
-  return (unsigned char *)ip;
+  const unsigned char *ip = in+PAD8(128*b); unsigned m; __m128i sv = _mm_set1_epi32(start); BITUNPACK128V32(in, b, out, sv); return (unsigned char *)ip;
 }
 
 /*
@@ -1216,7 +1100,6 @@ unsigned char *_bitdunpack128v64( const unsigned char *__restrict in, unsigned n
   const unsigned char *ip = in+PAD8(128*b); unsigned m; __m128i sv = _mm_set1_epi32(start),zv = _mm_setzero_si128(); BITUNPACK128V32(in, b, out, sv); return (unsigned char *)ip;
 }*/
 
-//-- bitunpack zigzag used in vp4d.c --------------------------
 #define VX16(_i_, _nb_,_ov_)              m = *bb++; _ov_ = _mm_add_epi16(_ov_, _mm_shuffle_epi8( mm_slli_epi16(_mm_loadu_si128((__m128i*)pex), _nb_), _mm_loadu_si128((__m128i*)_shuffle_16[m]) ) ); pex += popcnt32(m)
 #define VXZ16(_i_, _nb_,_ov_)             m = *bb++; _ov_ =                     _mm_shuffle_epi8(               _mm_loadu_si128((__m128i*)pex),        _mm_loadu_si128((__m128i*)_shuffle_16[m]) );   pex += popcnt32(m)
 #define VO16( _op_, _i_, _ov_, _nb_,_sv_) VX16( _i_, _nb_,_ov_);  _ov_ = mm_zzagd_epi16(_ov_); _sv_ = mm_scan_epi16(_ov_,_sv_); _mm_storeu_si128(_op_++, _sv_);
@@ -1230,47 +1113,29 @@ unsigned char *_bitdunpack128v64( const unsigned char *__restrict in, unsigned n
 #include "bitunpack_.h"
 #define BITUNPACK0(_parm_)
 unsigned char *_bitzunpack128v16( const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned short start, unsigned b, unsigned short *__restrict pex, unsigned char *bb) {
-  const unsigned char *ip = in+PAD8(128*b); 
-  unsigned m; __m128i sv = _mm_set1_epi16(start); 
-  BITUNPACK128V16(in, b, out, sv); 
-  return (unsigned char *)ip;
+  const unsigned char *ip = in+PAD8(128*b); unsigned m; __m128i sv = _mm_set1_epi16(start); BITUNPACK128V16(in, b, out, sv); return (unsigned char *)ip;
 }
 unsigned char *_bitzunpack128v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b, unsigned *__restrict pex, unsigned char *bb) {
-  const unsigned char *ip = in+PAD8(128*b); 
-  unsigned m; __m128i sv = _mm_set1_epi32(start); 
-  BITUNPACK128V32(in, b, out, sv); 
-  return (unsigned char *)ip;
+  const unsigned char *ip = in+PAD8(128*b); unsigned m; __m128i sv = _mm_set1_epi32(start); BITUNPACK128V32(in, b, out, sv); return (unsigned char *)ip;
 }
 #define BITMAX16 16
 #define BITMAX32 32
     #endif
 
-//-- bitunpack delta 1 ------------------------------
 #define VO16(_op_, i, _ov_, _nb_,_sv_) _sv_ = mm_scani_epi16(_ov_,_sv_,cv); _mm_storeu_si128(_op_++, _sv_);
 #define VO32(_op_, i, _ov_, _nb_,_sv_) _sv_ = mm_scani_epi32(_ov_,_sv_,cv); _mm_storeu_si128(_op_++, _sv_);
 #define VOZ16(_op_, _i_, ov, _nb_,_parm_) _mm_storeu_si128(_op_++, _parm_); _parm_ = _mm_add_epi16(_parm_, cv)
 #define VOZ32(_op_, _i_, ov, _nb_,_parm_) _mm_storeu_si128(_op_++, _parm_); _parm_ = _mm_add_epi32(_parm_, cv)
 #include "bitunpack_.h"
 #define BITUNPACK0(_parm_) _parm_ = _mm_add_epi16(_parm_, cv); cv = _mm_set1_epi16(8)
-
 unsigned char *bitd1unpack128v16( const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out, uint16_t start, unsigned b) {
-  const unsigned char *ip = in+PAD8(128*b); 
-  __m128i sv = _mm_set1_epi16(start), 
-          cv = _mm_set_epi16(8,7,6,5,4,3,2,1); 
-  BITUNPACK128V16(in, b, out, sv); 
-  return (unsigned char *)ip;
+  const unsigned char *ip = in+PAD8(128*b); __m128i sv = _mm_set1_epi16(start), cv = _mm_set_epi16(8,7,6,5,4,3,2,1); BITUNPACK128V16(in, b, out, sv); return (unsigned char *)ip;
 }
-
 #define BITUNPACK0(_parm_) _parm_ = _mm_add_epi32(_parm_, cv); cv = _mm_set1_epi32(4)
 unsigned char *bitd1unpack128v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b) {
-  const unsigned char *ip = in+PAD8(128*b); 
-              __m128i sv  = _mm_set1_epi32(start), 
-                      cv  = _mm_set_epi32(4,3,2,1); 
-  BITUNPACK128V32(in, b, out, sv); 
-  return (unsigned char *)ip;
+  const unsigned char *ip = in+PAD8(128*b); __m128i sv = _mm_set1_epi32(start), cv = _mm_set_epi32(4,3,2,1); BITUNPACK128V32(in, b, out, sv); return (unsigned char *)ip;
 }
 
-//-- bitunpack sub 1 ------------------------------
 #define VO16(_op_, i, _ov_, _nb_,_sv_) ADDI16x8(_ov_,_sv_,cv); _mm_storeu_si128(_op_++, _sv_);
 #define VO32(_op_, i, _ov_, _nb_,_sv_) ADDI32x4(_ov_,_sv_,cv); _mm_storeu_si128(_op_++, _sv_);
 #define VOZ16(_op_, _i_, ov, _nb_,_parm_) _mm_storeu_si128(_op_++, _parm_); _parm_ = _mm_add_epi16(_parm_, cv)
@@ -1278,48 +1143,29 @@ unsigned char *bitd1unpack128v32( const unsigned char *__restrict in, unsigned n
 #include "bitunpack_.h"
 #define BITUNPACK0(_parm_) _parm_ = _mm_add_epi16(_parm_, cv); cv = _mm_set1_epi16(8)
 unsigned char *bits1unpack128v16( const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out, uint16_t start, unsigned b) {
-  const unsigned char *ip = in+PAD8(128*b); 
-  __m128i sv = _mm_set1_epi16(start), 
-          cv = _mm_set1_epi16(8); 
-  BITUNPACK128V16(in, b, out, sv); 
-  return (unsigned char *)ip;
+  const unsigned char *ip = in+PAD8(128*b); __m128i sv = _mm_set1_epi16(start), cv = _mm_set1_epi16(8); BITUNPACK128V16(in, b, out, sv); return (unsigned char *)ip;
 }
-
 #define BITUNPACK0(_parm_) _parm_ = _mm_add_epi32(_parm_, cv); cv = _mm_set1_epi32(4)
 unsigned char *bits1unpack128v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b) {
-  const unsigned char *ip = in+PAD8(128*b); 
-  __m128i sv = _mm_set1_epi32(start), 
-          cv = _mm_set1_epi32(4); 
-  BITUNPACK128V32(in, b, out, sv); 
-  return (unsigned char *)ip;
+  const unsigned char *ip = in+PAD8(128*b); __m128i sv = _mm_set1_epi32(start), cv = _mm_set1_epi32(4); BITUNPACK128V32(in, b, out, sv); return (unsigned char *)ip;
 }
 
-//-- bitunpack FOR 1 ------------------
 #define VO16( _op_, _i_, _ov_, _nb_,_sv_) _mm_storeu_si128(_op_++, _mm_add_epi16(_ov_, _sv_)); _sv_ = _mm_add_epi16(_sv_, cv)
 #define VO32( _op_, _i_, _ov_, _nb_,_sv_) _mm_storeu_si128(_op_++, _mm_add_epi32(_ov_, _sv_)); _sv_ = _mm_add_epi32(_sv_, cv)
 #define VOZ32(_op_, _i_, _ov_, _nb_,_sv_) _mm_storeu_si128(_op_++, _sv_);                      _sv_ = _mm_add_epi32(_sv_, cv);
 #include "bitunpack_.h"
 #define BITUNPACK0(_parm_)
 unsigned char *bitf1unpack128v16( const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned short start, unsigned b) {
-  const unsigned char *ip = in+PAD8(128*b); 
-  __m128i sv = _mm_set_epi16(start+8,start+7,start+6,start+5,start+4,start+3,start+2,start+1), 
-          cv = _mm_set1_epi16(8); 
-  BITUNPACK128V16(in, b, out, sv); 
-  return (unsigned char *)ip;
+  const unsigned char *ip = in+PAD8(128*b); __m128i sv = _mm_set_epi16(start+8,start+7,start+6,start+5,start+4,start+3,start+2,start+1), cv = _mm_set1_epi16(8); BITUNPACK128V16(in, b, out, sv); return (unsigned char *)ip;
 }
 unsigned char *bitf1unpack128v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b) {
-  const unsigned char *ip = in+PAD8(128*b); 
-  __m128i sv = _mm_set_epi32(start+4,start+3,start+2,start+1),           
-          cv = _mm_set1_epi32(4); 
-  BITUNPACK128V32(in, b, out, sv); 
-  return (unsigned char *)ip;
+  const unsigned char *ip = in+PAD8(128*b); __m128i sv = _mm_set_epi32(start+4,start+3,start+2,start+1),                                 cv = _mm_set1_epi32(4); BITUNPACK128V32(in, b, out, sv); return (unsigned char *)ip;
 }
 
     #if defined(__SSSE3__) || defined(__ARM_NEON)
 #define BITMAX16 15
 #define BITMAX32 31
 
-//-- bitunpack delta 1 for vp4d.c -----------------------
 #define VX16(_i_, _nb_,_ov_)                                                    m =  *bb++;       _ov_ = _mm_add_epi16(_ov_, _mm_shuffle_epi8( mm_slli_epi16(_mm_loadu_si128((__m128i*)pex), _nb_), _mm_loadu_si128((__m128i*)_shuffle_16[m]))); pex += popcnt32(m)
 #define VX32(_i_, _nb_,_ov_)              if(!((_i_) & 1)) m = (*bb) & 0xf;else m = (*bb++) >> 4; _ov_ = _mm_add_epi32(_ov_, _mm_shuffle_epi8( mm_slli_epi32(_mm_loadu_si128((__m128i*)pex), _nb_), _mm_loadu_si128((__m128i*)_shuffle_32[m]))); pex += popcnt32(m)
 #define VXZ16(_i_, _nb_,_ov_)                                                   m =  *bb++;       _ov_ =                     _mm_shuffle_epi8(               _mm_loadu_si128((__m128i*)pex),        _mm_loadu_si128((__m128i*)_shuffle_16[m]));  pex += popcnt32(m)
@@ -1333,20 +1179,13 @@ unsigned char *bitf1unpack128v32( const unsigned char *__restrict in, unsigned n
 #include "bitunpack_.h"
 #define BITUNPACK0(_parm_) mv = _mm_setzero_si128() //_parm_ = _mm_setzero_si128()
 unsigned char *_bitd1unpack128v16( const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned short start, unsigned b, unsigned short *__restrict pex, unsigned char *bb) {
-  const unsigned char *ip = in+PAD8(128*b); 
-             unsigned m; 
-			 __m128i sv = _mm_set1_epi16(start), 
-			         cv = _mm_set_epi16(8,7,6,5,4,3,2,1); 
-  BITUNPACK128V16(in, b, out, sv); 
-  return (unsigned char *)ip;
+  const unsigned char *ip = in+PAD8(128*b); unsigned m; __m128i sv = _mm_set1_epi16(start), cv = _mm_set_epi16(8,7,6,5,4,3,2,1); BITUNPACK128V16(in, b, out, sv); return (unsigned char *)ip;
 }
-
 #define BITUNPACK0(_parm_) mv = _mm_setzero_si128()
 unsigned char *_bitd1unpack128v32( const unsigned char *__restrict in, unsigned n, unsigned       *__restrict out, unsigned       start, unsigned b, unsigned       *__restrict pex, unsigned char *bb) {
   const unsigned char *ip = in+PAD8(128*b); unsigned m; __m128i sv = _mm_set1_epi32(start), cv = _mm_set_epi32(        4,3,2,1); BITUNPACK128V32(in, b, out, sv); return (unsigned char *)ip;
 }
 
-//-- bitunpack sub 1 -----------------------
 #define VO16( _op_, _i_, _ov_, _nb_,_sv_) VX16( _i_, _nb_,_ov_);  ADDI16x8(_ov_,_sv_,cv); _mm_storeu_si128(_op_++, _sv_);
 #define VOZ16(_op_, _i_, _ov_, _nb_,_sv_) VXZ16( _i_, _nb_,_ov_); ADDI16x8(_ov_,_sv_,cv); _mm_storeu_si128(_op_++, _sv_);
 #define VO32( _op_, _i_, _ov_, _nb_,_sv_) VX32( _i_, _nb_,_ov_);  ADDI32x4(_ov_,_sv_,cv); _mm_storeu_si128(_op_++, _sv_);
@@ -1355,28 +1194,16 @@ unsigned char *_bitd1unpack128v32( const unsigned char *__restrict in, unsigned
 #include "bitunpack_.h"
 #define BITUNPACK0(_parm_) mv = _mm_setzero_si128() //_parm_ = _mm_setzero_si128()
 unsigned char *_bits1unpack128v16( const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned short start, unsigned b, unsigned short *__restrict pex, unsigned char *bb) {
-  const unsigned char *ip = in+PAD8(128*b); 
-             unsigned m; 
-			  __m128i sv = _mm_set1_epi16(start), 
-			          cv = _mm_set1_epi16(8); 
-  BITUNPACK128V16(in, b, out, sv); 
-  return (unsigned char *)ip;
+  const unsigned char *ip = in+PAD8(128*b); unsigned m; __m128i sv = _mm_set1_epi16(start), cv = _mm_set1_epi16(8); BITUNPACK128V16(in, b, out, sv); return (unsigned char *)ip;
 }
-
 #define BITUNPACK0(_parm_) mv = _mm_setzero_si128()
 unsigned char *_bits1unpack128v32( const unsigned char *__restrict in, unsigned n, unsigned       *__restrict out, unsigned       start, unsigned b, unsigned       *__restrict pex, unsigned char *bb) {
-  const unsigned char *ip = in+PAD8(128*b); 
-             unsigned m; 
-			 __m128i  sv = _mm_set1_epi32(start), 
-			          cv = _mm_set1_epi32(4); 
-  BITUNPACK128V32(in, b, out, sv); 
-  return (unsigned char *)ip;
+  const unsigned char *ip = in+PAD8(128*b); unsigned m; __m128i sv = _mm_set1_epi32(start), cv = _mm_set1_epi32(4); BITUNPACK128V32(in, b, out, sv); return (unsigned char *)ip;
 }
 #define BITMAX16 16
 #define BITMAX32 32
     #endif
 
-//--------------------------------------------------- bitnunpack ------------------------------------------------------------------------------------------------------------------
 size_t bitnunpack128v16(  unsigned char *__restrict in, size_t n, uint16_t *__restrict out) { uint16_t *op;       _BITNUNPACKV( in, n, out, 128, 16, bitunpack128v); }
 size_t bitnunpack128v32(  unsigned char *__restrict in, size_t n, uint32_t *__restrict out) { uint32_t *op;       _BITNUNPACKV( in, n, out, 128, 32, bitunpack128v); }
 size_t bitnunpack128v64(  unsigned char *__restrict in, size_t n, uint64_t *__restrict out) { uint64_t *op;       _BITNUNPACKV( in, n, out, 128, 64, bitunpack128v); }
@@ -1394,14 +1221,11 @@ size_t bitns1unpack128v32(unsigned char *__restrict in, size_t n, uint32_t *__re
 size_t bitnzunpack128v16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out) { uint16_t *op,start; _BITNDUNPACKV(in, n, out, 128, 16, bitzunpack128v, bitzunpack); }
 size_t bitnzunpack128v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out) { uint32_t *op,start; _BITNDUNPACKV(in, n, out, 128, 32, bitzunpack128v, bitzunpack); }
 
-size_t bitnxunpack128v16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out) { uint16_t *op,start; _BITNDUNPACKV(in, n, out, 128, 16, bitxunpack128v, bitxunpack); }
-size_t bitnxunpack128v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out) { uint32_t *op,start; _BITNDUNPACKV(in, n, out, 128, 32, bitxunpack128v, bitxunpack); }
-
 size_t bitnfunpack128v16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out) { uint16_t *op,start; _BITNDUNPACKV(in, n, out, 128, 16, bitfunpack128v, bitfunpack); }
 size_t bitnfunpack128v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out) { uint32_t *op,start; _BITNDUNPACKV(in, n, out, 128, 32, bitfunpack128v, bitfunpack); }
 
 #endif
-//#endif
+#endif
 
 #pragma clang diagnostic pop
 #pragma GCC pop_options
diff --git a/src/ext/for/bitunpack_.h b/src/ext/for/bitunpack_.h
index 1e70ab40..cebbbe9f 100644
--- a/src/ext/for/bitunpack_.h
+++ b/src/ext/for/bitunpack_.h
@@ -1,6 +1,6 @@
 /**
-  Copyright (C) powturbo 2013-2023
-  SPDX-License-Identifier: GPL v2 License
+  Copyright (C) powturbo 2013-2017
+  GPL v2 License
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
@@ -3103,547 +3103,543 @@
   BITUNBLK64_64(ip, 31, op, nb,parm);  OPI(op, nb,parm); ip += 64*4/sizeof(ip[0]);\
 }
 
-#define BU(_b_,_usize_) unsigned char *in_=in+PAD8(n*_b_),*ip, bin[PAD8(64*_b_)+1]; T3(uint,_usize_,_t) *out_=out+n,bout[64],*op; \
-  do { ip=in+PAD8(32*_b_); op = out+32; if(op > out_) { memcpy(bin, in, in_-in); ip = NULL; in = bin; out = bout; } T2(BITUNPACK64_,_b_)(in, out, _b_,start); PREFETCH(in+384,0); in = ip; out = op; \
-} while(out < out_); if(!ip) { op-=32; memcpy(op,bout,(out_-op)*(_usize_/8)); } return in_
-
 #ifndef DELTA
 #define USIZE 8
-unsigned char *T2(_BITUNPACK_,8_0)(const unsigned char *__restrict in, unsigned n, uint8_t *__restrict out  ) { BU(0,8); }
-unsigned char *T2(_BITUNPACK_,8_1)(const unsigned char *__restrict in, unsigned n, uint8_t *__restrict out  ) { BU(1,8); }
-unsigned char *T2(_BITUNPACK_,8_2)(const unsigned char *__restrict in, unsigned n, uint8_t *__restrict out  ) { BU(2,8); }
-unsigned char *T2(_BITUNPACK_,8_3)(const unsigned char *__restrict in, unsigned n, uint8_t *__restrict out  ) { BU(3,8); }
-unsigned char *T2(_BITUNPACK_,8_4)(const unsigned char *__restrict in, unsigned n, uint8_t *__restrict out  ) { BU(4,8); }
-unsigned char *T2(_BITUNPACK_,8_5)(const unsigned char *__restrict in, unsigned n, uint8_t *__restrict out  ) { BU(5,8); }
-unsigned char *T2(_BITUNPACK_,8_6)(const unsigned char *__restrict in, unsigned n, uint8_t *__restrict out  ) { BU(6,8); }
-unsigned char *T2(_BITUNPACK_,8_7)(const unsigned char *__restrict in, unsigned n, uint8_t *__restrict out  ) { BU(7,8); }
-unsigned char *T2(_BITUNPACK_,8_8)(const unsigned char *__restrict in, unsigned n, uint8_t *__restrict out  ) { BU(8,8); }
-BITUNPACK_F8 T2(_BITUNPACK_,a8)[] = {
-  &T2(_BITUNPACK_,8_0),
-  &T2(_BITUNPACK_,8_1),
-  &T2(_BITUNPACK_,8_2),
-  &T2(_BITUNPACK_,8_3),
-  &T2(_BITUNPACK_,8_4),
-  &T2(_BITUNPACK_,8_5),
-  &T2(_BITUNPACK_,8_6),
-  &T2(_BITUNPACK_,8_7),
-  &T2(_BITUNPACK_,8_8)
+unsigned char *TEMPLATE2(_BITUNPACK_,8_0)(const unsigned char *__restrict in, unsigned n, uint8_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*0); const uint8_t *out_ = out+n; do { BITUNPACK64_0( in, out, 0,start); PREFETCH(in+512,0); } while(out<out_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,8_1)(const unsigned char *__restrict in, unsigned n, uint8_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*1); do { BITUNPACK64_1( in, out, 1,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,8_2)(const unsigned char *__restrict in, unsigned n, uint8_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*2); do { BITUNPACK64_2( in, out, 2,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,8_3)(const unsigned char *__restrict in, unsigned n, uint8_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*3); do { BITUNPACK64_3( in, out, 3,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,8_4)(const unsigned char *__restrict in, unsigned n, uint8_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*4); do { BITUNPACK64_4( in, out, 4,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,8_5)(const unsigned char *__restrict in, unsigned n, uint8_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*5); do { BITUNPACK64_5( in, out, 5,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,8_6)(const unsigned char *__restrict in, unsigned n, uint8_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*6); do { BITUNPACK64_6( in, out, 6,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,8_7)(const unsigned char *__restrict in, unsigned n, uint8_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*7); do { BITUNPACK64_7( in, out, 7,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,8_8)(const unsigned char *__restrict in, unsigned n, uint8_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*8); do { BITUNPACK64_8( in, out, 8,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+BITUNPACK_F8 TEMPLATE2(_BITUNPACK_,a8)[] = {
+  &TEMPLATE2(_BITUNPACK_,8_0),
+  &TEMPLATE2(_BITUNPACK_,8_1),
+  &TEMPLATE2(_BITUNPACK_,8_2),
+  &TEMPLATE2(_BITUNPACK_,8_3),
+  &TEMPLATE2(_BITUNPACK_,8_4),
+  &TEMPLATE2(_BITUNPACK_,8_5),
+  &TEMPLATE2(_BITUNPACK_,8_6),
+  &TEMPLATE2(_BITUNPACK_,8_7),
+  &TEMPLATE2(_BITUNPACK_,8_8)
 };
-unsigned char *T2(_BITUNPACK_,8)( const unsigned char *__restrict in, unsigned n, uint8_t  *__restrict out , unsigned b) { return T2(_BITUNPACK_,a8)[ b](in, n, out); }
+unsigned char *TEMPLATE2(_BITUNPACK_,8)( const unsigned char *__restrict in, unsigned n, uint8_t  *__restrict out , unsigned b) { return TEMPLATE2(_BITUNPACK_,a8)[ b](in, n, out); }
 
 #define USIZE 16
-unsigned char *T2(_BITUNPACK_,16_0 )(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out  ) { BU( 0,16); }
-unsigned char *T2(_BITUNPACK_,16_1 )(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out  ) { BU( 1,16); }
-unsigned char *T2(_BITUNPACK_,16_2 )(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out  ) { BU( 2,16); }
-unsigned char *T2(_BITUNPACK_,16_3 )(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out  ) { BU( 3,16); }
-unsigned char *T2(_BITUNPACK_,16_4 )(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out  ) { BU( 4,16); }
-unsigned char *T2(_BITUNPACK_,16_5 )(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out  ) { BU( 5,16); }
-unsigned char *T2(_BITUNPACK_,16_6 )(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out  ) { BU( 6,16); }
-unsigned char *T2(_BITUNPACK_,16_7 )(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out  ) { BU( 7,16); }
-unsigned char *T2(_BITUNPACK_,16_8 )(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out  ) { BU( 8,16); }
-unsigned char *T2(_BITUNPACK_,16_9 )(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out  ) { BU( 9,16); }
-unsigned char *T2(_BITUNPACK_,16_10)(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out  ) { BU(10,16); }
-unsigned char *T2(_BITUNPACK_,16_11)(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out  ) { BU(11,16); }
-unsigned char *T2(_BITUNPACK_,16_12)(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out  ) { BU(12,16); }
-unsigned char *T2(_BITUNPACK_,16_13)(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out  ) { BU(13,16); }
-unsigned char *T2(_BITUNPACK_,16_14)(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out  ) { BU(14,16); }
-unsigned char *T2(_BITUNPACK_,16_15)(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out  ) { BU(15,16); }
-unsigned char *T2(_BITUNPACK_,16_16)(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out  ) { BU(16,16); }
-BITUNPACK_F16 T2(_BITUNPACK_,a16)[] = {
-  &T2(_BITUNPACK_,16_0),
-  &T2(_BITUNPACK_,16_1),
-  &T2(_BITUNPACK_,16_2),
-  &T2(_BITUNPACK_,16_3),
-  &T2(_BITUNPACK_,16_4),
-  &T2(_BITUNPACK_,16_5),
-  &T2(_BITUNPACK_,16_6),
-  &T2(_BITUNPACK_,16_7),
-  &T2(_BITUNPACK_,16_8),
-  &T2(_BITUNPACK_,16_9),
-  &T2(_BITUNPACK_,16_10),
-  &T2(_BITUNPACK_,16_11),
-  &T2(_BITUNPACK_,16_12),
-  &T2(_BITUNPACK_,16_13),
-  &T2(_BITUNPACK_,16_14),
-  &T2(_BITUNPACK_,16_15),
-  &T2(_BITUNPACK_,16_16)
+unsigned char *TEMPLATE2(_BITUNPACK_,16_0)(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*0); const uint16_t *out_ = out+n; do { BITUNPACK64_0( in, out, 0,start); PREFETCH(in+512,0); } while(out<out_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,16_1)(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*1); do { BITUNPACK64_1( in, out, 1,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,16_2)(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*2); do { BITUNPACK64_2( in, out, 2,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,16_3)(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*3); do { BITUNPACK64_3( in, out, 3,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,16_4)(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*4); do { BITUNPACK64_4( in, out, 4,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,16_5)(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*5); do { BITUNPACK64_5( in, out, 5,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,16_6)(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*6); do { BITUNPACK64_6( in, out, 6,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,16_7)(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*7); do { BITUNPACK64_7( in, out, 7,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,16_8)(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*8); do { BITUNPACK64_8( in, out, 8,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,16_9)(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*9); do { BITUNPACK64_9( in, out, 9,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,16_10)(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*10); do { BITUNPACK64_10( in, out, 10,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,16_11)(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*11); do { BITUNPACK64_11( in, out, 11,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,16_12)(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*12); do { BITUNPACK64_12( in, out, 12,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,16_13)(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*13); do { BITUNPACK64_13( in, out, 13,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,16_14)(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*14); do { BITUNPACK64_14( in, out, 14,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,16_15)(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*15); do { BITUNPACK64_15( in, out, 15,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,16_16)(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*16); do { BITUNPACK64_16( in, out, 16,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+BITUNPACK_F16 TEMPLATE2(_BITUNPACK_,a16)[] = {
+  &TEMPLATE2(_BITUNPACK_,16_0),
+  &TEMPLATE2(_BITUNPACK_,16_1),
+  &TEMPLATE2(_BITUNPACK_,16_2),
+  &TEMPLATE2(_BITUNPACK_,16_3),
+  &TEMPLATE2(_BITUNPACK_,16_4),
+  &TEMPLATE2(_BITUNPACK_,16_5),
+  &TEMPLATE2(_BITUNPACK_,16_6),
+  &TEMPLATE2(_BITUNPACK_,16_7),
+  &TEMPLATE2(_BITUNPACK_,16_8),
+  &TEMPLATE2(_BITUNPACK_,16_9),
+  &TEMPLATE2(_BITUNPACK_,16_10),
+  &TEMPLATE2(_BITUNPACK_,16_11),
+  &TEMPLATE2(_BITUNPACK_,16_12),
+  &TEMPLATE2(_BITUNPACK_,16_13),
+  &TEMPLATE2(_BITUNPACK_,16_14),
+  &TEMPLATE2(_BITUNPACK_,16_15),
+  &TEMPLATE2(_BITUNPACK_,16_16)
 };
-unsigned char *T2(_BITUNPACK_,16)( const unsigned char *__restrict in, unsigned n, uint16_t  *__restrict out , unsigned b) { return T2(_BITUNPACK_,a16)[ b](in, n, out); }
+unsigned char *TEMPLATE2(_BITUNPACK_,16)( const unsigned char *__restrict in, unsigned n, uint16_t  *__restrict out , unsigned b) { return TEMPLATE2(_BITUNPACK_,a16)[ b](in, n, out); }
 
 #define USIZE 32
-unsigned char *T2(_BITUNPACK_,32_0 )(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out  ) { BU( 0,32); }
-unsigned char *T2(_BITUNPACK_,32_1 )(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out  ) { BU( 1,32); }
-unsigned char *T2(_BITUNPACK_,32_2 )(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out  ) { BU( 2,32); }
-unsigned char *T2(_BITUNPACK_,32_3 )(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out  ) { BU( 3,32); }
-unsigned char *T2(_BITUNPACK_,32_4 )(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out  ) { BU( 4,32); }
-unsigned char *T2(_BITUNPACK_,32_5 )(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out  ) { BU( 5,32); }
-unsigned char *T2(_BITUNPACK_,32_6 )(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out  ) { BU( 6,32); }
-unsigned char *T2(_BITUNPACK_,32_7 )(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out  ) { BU( 7,32); }
-unsigned char *T2(_BITUNPACK_,32_8 )(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out  ) { BU( 8,32); }
-unsigned char *T2(_BITUNPACK_,32_9 )(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out  ) { BU( 9,32); }
-unsigned char *T2(_BITUNPACK_,32_10)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out  ) { BU(10,32); }
-unsigned char *T2(_BITUNPACK_,32_11)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out  ) { BU(11,32); }
-unsigned char *T2(_BITUNPACK_,32_12)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out  ) { BU(12,32); }
-unsigned char *T2(_BITUNPACK_,32_13)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out  ) { BU(13,32); }
-unsigned char *T2(_BITUNPACK_,32_14)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out  ) { BU(14,32); }
-unsigned char *T2(_BITUNPACK_,32_15)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out  ) { BU(15,32); }
-unsigned char *T2(_BITUNPACK_,32_16)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out  ) { BU(16,32); }
-unsigned char *T2(_BITUNPACK_,32_17)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out  ) { BU(17,32); }
-unsigned char *T2(_BITUNPACK_,32_18)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out  ) { BU(18,32); }
-unsigned char *T2(_BITUNPACK_,32_19)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out  ) { BU(19,32); }
-unsigned char *T2(_BITUNPACK_,32_20)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out  ) { BU(20,32); }
-unsigned char *T2(_BITUNPACK_,32_21)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out  ) { BU(21,32); }
-unsigned char *T2(_BITUNPACK_,32_22)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out  ) { BU(22,32); }
-unsigned char *T2(_BITUNPACK_,32_23)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out  ) { BU(23,32); }
-unsigned char *T2(_BITUNPACK_,32_24)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out  ) { BU(24,32); }
-unsigned char *T2(_BITUNPACK_,32_25)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out  ) { BU(25,32); }
-unsigned char *T2(_BITUNPACK_,32_26)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out  ) { BU(26,32); }
-unsigned char *T2(_BITUNPACK_,32_27)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out  ) { BU(27,32); }
-unsigned char *T2(_BITUNPACK_,32_28)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out  ) { BU(28,32); }
-unsigned char *T2(_BITUNPACK_,32_29)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out  ) { BU(29,32); }
-unsigned char *T2(_BITUNPACK_,32_30)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out  ) { BU(30,32); }
-unsigned char *T2(_BITUNPACK_,32_31)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out  ) { BU(31,32); }
-unsigned char *T2(_BITUNPACK_,32_32)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out  ) { BU(32,32); }
-BITUNPACK_F32 T2(_BITUNPACK_,a32)[] = {
-  &T2(_BITUNPACK_,32_0),
-  &T2(_BITUNPACK_,32_1),
-  &T2(_BITUNPACK_,32_2),
-  &T2(_BITUNPACK_,32_3),
-  &T2(_BITUNPACK_,32_4),
-  &T2(_BITUNPACK_,32_5),
-  &T2(_BITUNPACK_,32_6),
-  &T2(_BITUNPACK_,32_7),
-  &T2(_BITUNPACK_,32_8),
-  &T2(_BITUNPACK_,32_9),
-  &T2(_BITUNPACK_,32_10),
-  &T2(_BITUNPACK_,32_11),
-  &T2(_BITUNPACK_,32_12),
-  &T2(_BITUNPACK_,32_13),
-  &T2(_BITUNPACK_,32_14),
-  &T2(_BITUNPACK_,32_15),
-  &T2(_BITUNPACK_,32_16),
-  &T2(_BITUNPACK_,32_17),
-  &T2(_BITUNPACK_,32_18),
-  &T2(_BITUNPACK_,32_19),
-  &T2(_BITUNPACK_,32_20),
-  &T2(_BITUNPACK_,32_21),
-  &T2(_BITUNPACK_,32_22),
-  &T2(_BITUNPACK_,32_23),
-  &T2(_BITUNPACK_,32_24),
-  &T2(_BITUNPACK_,32_25),
-  &T2(_BITUNPACK_,32_26),
-  &T2(_BITUNPACK_,32_27),
-  &T2(_BITUNPACK_,32_28),
-  &T2(_BITUNPACK_,32_29),
-  &T2(_BITUNPACK_,32_30),
-  &T2(_BITUNPACK_,32_31),
-  &T2(_BITUNPACK_,32_32)
+unsigned char *TEMPLATE2(_BITUNPACK_,32_0)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*0); const uint32_t *out_ = out+n; do { BITUNPACK64_0( in, out, 0,start); PREFETCH(in+512,0); } while(out<out_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_1)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*1); do { BITUNPACK64_1( in, out, 1,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_2)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*2); do { BITUNPACK64_2( in, out, 2,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_3)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*3); do { BITUNPACK64_3( in, out, 3,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_4)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*4); do { BITUNPACK64_4( in, out, 4,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_5)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*5); do { BITUNPACK64_5( in, out, 5,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_6)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*6); do { BITUNPACK64_6( in, out, 6,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_7)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*7); do { BITUNPACK64_7( in, out, 7,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_8)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*8); do { BITUNPACK64_8( in, out, 8,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_9)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*9); do { BITUNPACK64_9( in, out, 9,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_10)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*10); do { BITUNPACK64_10( in, out, 10,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_11)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*11); do { BITUNPACK64_11( in, out, 11,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_12)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*12); do { BITUNPACK64_12( in, out, 12,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_13)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*13); do { BITUNPACK64_13( in, out, 13,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_14)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*14); do { BITUNPACK64_14( in, out, 14,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_15)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*15); do { BITUNPACK64_15( in, out, 15,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_16)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*16); do { BITUNPACK64_16( in, out, 16,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_17)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*17); do { BITUNPACK64_17( in, out, 17,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_18)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*18); do { BITUNPACK64_18( in, out, 18,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_19)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*19); do { BITUNPACK64_19( in, out, 19,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_20)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*20); do { BITUNPACK64_20( in, out, 20,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_21)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*21); do { BITUNPACK64_21( in, out, 21,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_22)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*22); do { BITUNPACK64_22( in, out, 22,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_23)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*23); do { BITUNPACK64_23( in, out, 23,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_24)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*24); do { BITUNPACK64_24( in, out, 24,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_25)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*25); do { BITUNPACK64_25( in, out, 25,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_26)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*26); do { BITUNPACK64_26( in, out, 26,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_27)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*27); do { BITUNPACK64_27( in, out, 27,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_28)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*28); do { BITUNPACK64_28( in, out, 28,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_29)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*29); do { BITUNPACK64_29( in, out, 29,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_30)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*30); do { BITUNPACK64_30( in, out, 30,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_31)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*31); do { BITUNPACK64_31( in, out, 31,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_32)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*32); do { BITUNPACK64_32( in, out, 32,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+BITUNPACK_F32 TEMPLATE2(_BITUNPACK_,a32)[] = {
+  &TEMPLATE2(_BITUNPACK_,32_0),
+  &TEMPLATE2(_BITUNPACK_,32_1),
+  &TEMPLATE2(_BITUNPACK_,32_2),
+  &TEMPLATE2(_BITUNPACK_,32_3),
+  &TEMPLATE2(_BITUNPACK_,32_4),
+  &TEMPLATE2(_BITUNPACK_,32_5),
+  &TEMPLATE2(_BITUNPACK_,32_6),
+  &TEMPLATE2(_BITUNPACK_,32_7),
+  &TEMPLATE2(_BITUNPACK_,32_8),
+  &TEMPLATE2(_BITUNPACK_,32_9),
+  &TEMPLATE2(_BITUNPACK_,32_10),
+  &TEMPLATE2(_BITUNPACK_,32_11),
+  &TEMPLATE2(_BITUNPACK_,32_12),
+  &TEMPLATE2(_BITUNPACK_,32_13),
+  &TEMPLATE2(_BITUNPACK_,32_14),
+  &TEMPLATE2(_BITUNPACK_,32_15),
+  &TEMPLATE2(_BITUNPACK_,32_16),
+  &TEMPLATE2(_BITUNPACK_,32_17),
+  &TEMPLATE2(_BITUNPACK_,32_18),
+  &TEMPLATE2(_BITUNPACK_,32_19),
+  &TEMPLATE2(_BITUNPACK_,32_20),
+  &TEMPLATE2(_BITUNPACK_,32_21),
+  &TEMPLATE2(_BITUNPACK_,32_22),
+  &TEMPLATE2(_BITUNPACK_,32_23),
+  &TEMPLATE2(_BITUNPACK_,32_24),
+  &TEMPLATE2(_BITUNPACK_,32_25),
+  &TEMPLATE2(_BITUNPACK_,32_26),
+  &TEMPLATE2(_BITUNPACK_,32_27),
+  &TEMPLATE2(_BITUNPACK_,32_28),
+  &TEMPLATE2(_BITUNPACK_,32_29),
+  &TEMPLATE2(_BITUNPACK_,32_30),
+  &TEMPLATE2(_BITUNPACK_,32_31),
+  &TEMPLATE2(_BITUNPACK_,32_32)
 };
-unsigned char *T2(_BITUNPACK_,32)( const unsigned char *__restrict in, unsigned n, uint32_t  *__restrict out , unsigned b) { return T2(_BITUNPACK_,a32)[ b](in, n, out); }
+unsigned char *TEMPLATE2(_BITUNPACK_,32)( const unsigned char *__restrict in, unsigned n, uint32_t  *__restrict out , unsigned b) { return TEMPLATE2(_BITUNPACK_,a32)[ b](in, n, out); }
 
 #define USIZE 64
-unsigned char *T2(_BITUNPACK_,64_0 )(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { BU( 0,64); }
-unsigned char *T2(_BITUNPACK_,64_1 )(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { BU( 1,64); }
-unsigned char *T2(_BITUNPACK_,64_2 )(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { BU( 2,64); }
-unsigned char *T2(_BITUNPACK_,64_3 )(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { BU( 3,64); }
-unsigned char *T2(_BITUNPACK_,64_4 )(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { BU( 4,64); }
-unsigned char *T2(_BITUNPACK_,64_5 )(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { BU( 5,64); }
-unsigned char *T2(_BITUNPACK_,64_6 )(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { BU( 6,64); }
-unsigned char *T2(_BITUNPACK_,64_7 )(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { BU( 7,64); }
-unsigned char *T2(_BITUNPACK_,64_8 )(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { BU( 8,64); }
-unsigned char *T2(_BITUNPACK_,64_9 )(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { BU( 9,64); }
-unsigned char *T2(_BITUNPACK_,64_10)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { BU(10,64); }
-unsigned char *T2(_BITUNPACK_,64_11)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { BU(11,64); }
-unsigned char *T2(_BITUNPACK_,64_12)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { BU(12,64); }
-unsigned char *T2(_BITUNPACK_,64_13)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { BU(13,64); }
-unsigned char *T2(_BITUNPACK_,64_14)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { BU(14,64); }
-unsigned char *T2(_BITUNPACK_,64_15)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { BU(15,64); }
-unsigned char *T2(_BITUNPACK_,64_16)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { BU(16,64); }
-unsigned char *T2(_BITUNPACK_,64_17)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { BU(17,64); }
-unsigned char *T2(_BITUNPACK_,64_18)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { BU(18,64); }
-unsigned char *T2(_BITUNPACK_,64_19)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { BU(19,64); }
-unsigned char *T2(_BITUNPACK_,64_20)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { BU(20,64); }
-unsigned char *T2(_BITUNPACK_,64_21)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { BU(21,64); }
-unsigned char *T2(_BITUNPACK_,64_22)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { BU(22,64); }
-unsigned char *T2(_BITUNPACK_,64_23)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { BU(23,64); }
-unsigned char *T2(_BITUNPACK_,64_24)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { BU(24,64); }
-unsigned char *T2(_BITUNPACK_,64_25)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { BU(25,64); }
-unsigned char *T2(_BITUNPACK_,64_26)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { BU(26,64); }
-unsigned char *T2(_BITUNPACK_,64_27)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { BU(27,64); }
-unsigned char *T2(_BITUNPACK_,64_28)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { BU(28,64); }
-unsigned char *T2(_BITUNPACK_,64_29)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { BU(29,64); }
-unsigned char *T2(_BITUNPACK_,64_30)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { BU(30,64); }
-unsigned char *T2(_BITUNPACK_,64_31)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { BU(31,64); }
-unsigned char *T2(_BITUNPACK_,64_32)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { BU(32,64); }
-unsigned char *T2(_BITUNPACK_,64_33)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { BU(33,64); }
-unsigned char *T2(_BITUNPACK_,64_34)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { BU(34,64); }
-unsigned char *T2(_BITUNPACK_,64_35)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { BU(35,64); }
-unsigned char *T2(_BITUNPACK_,64_36)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { BU(36,64); }
-unsigned char *T2(_BITUNPACK_,64_37)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { BU(37,64); }
-unsigned char *T2(_BITUNPACK_,64_38)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { BU(38,64); }
-unsigned char *T2(_BITUNPACK_,64_39)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { BU(39,64); }
-unsigned char *T2(_BITUNPACK_,64_40)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { BU(40,64); }
-unsigned char *T2(_BITUNPACK_,64_41)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { BU(41,64); }
-unsigned char *T2(_BITUNPACK_,64_42)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { BU(42,64); }
-unsigned char *T2(_BITUNPACK_,64_43)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { BU(43,64); }
-unsigned char *T2(_BITUNPACK_,64_44)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { BU(44,64); }
-unsigned char *T2(_BITUNPACK_,64_45)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { BU(45,64); }
-unsigned char *T2(_BITUNPACK_,64_46)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { BU(46,64); }
-unsigned char *T2(_BITUNPACK_,64_47)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { BU(47,64); }
-unsigned char *T2(_BITUNPACK_,64_48)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { BU(48,64); }
-unsigned char *T2(_BITUNPACK_,64_49)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { BU(49,64); }
-unsigned char *T2(_BITUNPACK_,64_50)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { BU(50,64); }
-unsigned char *T2(_BITUNPACK_,64_51)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { BU(51,64); }
-unsigned char *T2(_BITUNPACK_,64_52)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { BU(52,64); }
-unsigned char *T2(_BITUNPACK_,64_53)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { BU(53,64); }
-unsigned char *T2(_BITUNPACK_,64_54)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { BU(54,64); }
-unsigned char *T2(_BITUNPACK_,64_55)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { BU(55,64); }
-unsigned char *T2(_BITUNPACK_,64_56)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { BU(56,64); }
-unsigned char *T2(_BITUNPACK_,64_57)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { BU(57,64); }
-unsigned char *T2(_BITUNPACK_,64_58)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { BU(58,64); }
-unsigned char *T2(_BITUNPACK_,64_59)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { BU(59,64); }
-unsigned char *T2(_BITUNPACK_,64_60)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { BU(60,64); }
-unsigned char *T2(_BITUNPACK_,64_61)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { BU(61,64); }
-unsigned char *T2(_BITUNPACK_,64_62)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { BU(62,64); }
-unsigned char *T2(_BITUNPACK_,64_63)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { BU(63,64); }
-unsigned char *T2(_BITUNPACK_,64_64)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { BU(64,64); }
-BITUNPACK_F64 T2(_BITUNPACK_,a64)[] = {
-  &T2(_BITUNPACK_,64_0),
-  &T2(_BITUNPACK_,64_1),
-  &T2(_BITUNPACK_,64_2),
-  &T2(_BITUNPACK_,64_3),
-  &T2(_BITUNPACK_,64_4),
-  &T2(_BITUNPACK_,64_5),
-  &T2(_BITUNPACK_,64_6),
-  &T2(_BITUNPACK_,64_7),
-  &T2(_BITUNPACK_,64_8),
-  &T2(_BITUNPACK_,64_9),
-  &T2(_BITUNPACK_,64_10),
-  &T2(_BITUNPACK_,64_11),
-  &T2(_BITUNPACK_,64_12),
-  &T2(_BITUNPACK_,64_13),
-  &T2(_BITUNPACK_,64_14),
-  &T2(_BITUNPACK_,64_15),
-  &T2(_BITUNPACK_,64_16),
-  &T2(_BITUNPACK_,64_17),
-  &T2(_BITUNPACK_,64_18),
-  &T2(_BITUNPACK_,64_19),
-  &T2(_BITUNPACK_,64_20),
-  &T2(_BITUNPACK_,64_21),
-  &T2(_BITUNPACK_,64_22),
-  &T2(_BITUNPACK_,64_23),
-  &T2(_BITUNPACK_,64_24),
-  &T2(_BITUNPACK_,64_25),
-  &T2(_BITUNPACK_,64_26),
-  &T2(_BITUNPACK_,64_27),
-  &T2(_BITUNPACK_,64_28),
-  &T2(_BITUNPACK_,64_29),
-  &T2(_BITUNPACK_,64_30),
-  &T2(_BITUNPACK_,64_31),
-  &T2(_BITUNPACK_,64_32),
-  &T2(_BITUNPACK_,64_33),
-  &T2(_BITUNPACK_,64_34),
-  &T2(_BITUNPACK_,64_35),
-  &T2(_BITUNPACK_,64_36),
-  &T2(_BITUNPACK_,64_37),
-  &T2(_BITUNPACK_,64_38),
-  &T2(_BITUNPACK_,64_39),
-  &T2(_BITUNPACK_,64_40),
-  &T2(_BITUNPACK_,64_41),
-  &T2(_BITUNPACK_,64_42),
-  &T2(_BITUNPACK_,64_43),
-  &T2(_BITUNPACK_,64_44),
-  &T2(_BITUNPACK_,64_45),
-  &T2(_BITUNPACK_,64_46),
-  &T2(_BITUNPACK_,64_47),
-  &T2(_BITUNPACK_,64_48),
-  &T2(_BITUNPACK_,64_49),
-  &T2(_BITUNPACK_,64_50),
-  &T2(_BITUNPACK_,64_51),
-  &T2(_BITUNPACK_,64_52),
-  &T2(_BITUNPACK_,64_53),
-  &T2(_BITUNPACK_,64_54),
-  &T2(_BITUNPACK_,64_55),
-  &T2(_BITUNPACK_,64_56),
-  &T2(_BITUNPACK_,64_57),
-  &T2(_BITUNPACK_,64_58),
-  &T2(_BITUNPACK_,64_59),
-  &T2(_BITUNPACK_,64_60),
-  &T2(_BITUNPACK_,64_61),
-  &T2(_BITUNPACK_,64_62),
-  &T2(_BITUNPACK_,64_63),
-  &T2(_BITUNPACK_,64_64)
+unsigned char *TEMPLATE2(_BITUNPACK_,64_0)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*0); const uint64_t *out_ = out+n; do { BITUNPACK64_0( in, out, 0,start); PREFETCH(in+512,0); } while(out<out_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_1)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*1); do { BITUNPACK64_1( in, out, 1,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_2)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*2); do { BITUNPACK64_2( in, out, 2,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_3)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*3); do { BITUNPACK64_3( in, out, 3,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_4)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*4); do { BITUNPACK64_4( in, out, 4,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_5)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*5); do { BITUNPACK64_5( in, out, 5,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_6)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*6); do { BITUNPACK64_6( in, out, 6,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_7)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*7); do { BITUNPACK64_7( in, out, 7,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_8)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*8); do { BITUNPACK64_8( in, out, 8,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_9)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*9); do { BITUNPACK64_9( in, out, 9,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_10)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*10); do { BITUNPACK64_10( in, out, 10,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_11)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*11); do { BITUNPACK64_11( in, out, 11,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_12)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*12); do { BITUNPACK64_12( in, out, 12,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_13)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*13); do { BITUNPACK64_13( in, out, 13,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_14)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*14); do { BITUNPACK64_14( in, out, 14,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_15)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*15); do { BITUNPACK64_15( in, out, 15,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_16)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*16); do { BITUNPACK64_16( in, out, 16,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_17)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*17); do { BITUNPACK64_17( in, out, 17,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_18)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*18); do { BITUNPACK64_18( in, out, 18,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_19)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*19); do { BITUNPACK64_19( in, out, 19,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_20)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*20); do { BITUNPACK64_20( in, out, 20,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_21)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*21); do { BITUNPACK64_21( in, out, 21,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_22)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*22); do { BITUNPACK64_22( in, out, 22,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_23)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*23); do { BITUNPACK64_23( in, out, 23,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_24)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*24); do { BITUNPACK64_24( in, out, 24,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_25)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*25); do { BITUNPACK64_25( in, out, 25,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_26)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*26); do { BITUNPACK64_26( in, out, 26,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_27)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*27); do { BITUNPACK64_27( in, out, 27,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_28)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*28); do { BITUNPACK64_28( in, out, 28,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_29)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*29); do { BITUNPACK64_29( in, out, 29,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_30)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*30); do { BITUNPACK64_30( in, out, 30,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_31)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*31); do { BITUNPACK64_31( in, out, 31,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_32)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*32); do { BITUNPACK64_32( in, out, 32,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_33)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*33); do { BITUNPACK64_33( in, out, 33,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_34)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*34); do { BITUNPACK64_34( in, out, 34,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_35)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*35); do { BITUNPACK64_35( in, out, 35,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_36)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*36); do { BITUNPACK64_36( in, out, 36,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_37)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*37); do { BITUNPACK64_37( in, out, 37,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_38)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*38); do { BITUNPACK64_38( in, out, 38,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_39)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*39); do { BITUNPACK64_39( in, out, 39,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_40)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*40); do { BITUNPACK64_40( in, out, 40,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_41)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*41); do { BITUNPACK64_41( in, out, 41,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_42)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*42); do { BITUNPACK64_42( in, out, 42,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_43)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*43); do { BITUNPACK64_43( in, out, 43,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_44)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*44); do { BITUNPACK64_44( in, out, 44,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_45)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*45); do { BITUNPACK64_45( in, out, 45,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_46)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*46); do { BITUNPACK64_46( in, out, 46,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_47)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*47); do { BITUNPACK64_47( in, out, 47,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_48)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*48); do { BITUNPACK64_48( in, out, 48,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_49)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*49); do { BITUNPACK64_49( in, out, 49,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_50)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*50); do { BITUNPACK64_50( in, out, 50,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_51)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*51); do { BITUNPACK64_51( in, out, 51,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_52)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*52); do { BITUNPACK64_52( in, out, 52,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_53)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*53); do { BITUNPACK64_53( in, out, 53,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_54)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*54); do { BITUNPACK64_54( in, out, 54,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_55)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*55); do { BITUNPACK64_55( in, out, 55,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_56)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*56); do { BITUNPACK64_56( in, out, 56,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_57)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*57); do { BITUNPACK64_57( in, out, 57,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_58)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*58); do { BITUNPACK64_58( in, out, 58,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_59)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*59); do { BITUNPACK64_59( in, out, 59,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_60)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*60); do { BITUNPACK64_60( in, out, 60,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_61)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*61); do { BITUNPACK64_61( in, out, 61,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_62)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*62); do { BITUNPACK64_62( in, out, 62,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_63)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*63); do { BITUNPACK64_63( in, out, 63,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_64)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out  ) { unsigned char *in_=in+PAD8(n*64); do { BITUNPACK64_64( in, out, 64,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+BITUNPACK_F64 TEMPLATE2(_BITUNPACK_,a64)[] = {
+  &TEMPLATE2(_BITUNPACK_,64_0),
+  &TEMPLATE2(_BITUNPACK_,64_1),
+  &TEMPLATE2(_BITUNPACK_,64_2),
+  &TEMPLATE2(_BITUNPACK_,64_3),
+  &TEMPLATE2(_BITUNPACK_,64_4),
+  &TEMPLATE2(_BITUNPACK_,64_5),
+  &TEMPLATE2(_BITUNPACK_,64_6),
+  &TEMPLATE2(_BITUNPACK_,64_7),
+  &TEMPLATE2(_BITUNPACK_,64_8),
+  &TEMPLATE2(_BITUNPACK_,64_9),
+  &TEMPLATE2(_BITUNPACK_,64_10),
+  &TEMPLATE2(_BITUNPACK_,64_11),
+  &TEMPLATE2(_BITUNPACK_,64_12),
+  &TEMPLATE2(_BITUNPACK_,64_13),
+  &TEMPLATE2(_BITUNPACK_,64_14),
+  &TEMPLATE2(_BITUNPACK_,64_15),
+  &TEMPLATE2(_BITUNPACK_,64_16),
+  &TEMPLATE2(_BITUNPACK_,64_17),
+  &TEMPLATE2(_BITUNPACK_,64_18),
+  &TEMPLATE2(_BITUNPACK_,64_19),
+  &TEMPLATE2(_BITUNPACK_,64_20),
+  &TEMPLATE2(_BITUNPACK_,64_21),
+  &TEMPLATE2(_BITUNPACK_,64_22),
+  &TEMPLATE2(_BITUNPACK_,64_23),
+  &TEMPLATE2(_BITUNPACK_,64_24),
+  &TEMPLATE2(_BITUNPACK_,64_25),
+  &TEMPLATE2(_BITUNPACK_,64_26),
+  &TEMPLATE2(_BITUNPACK_,64_27),
+  &TEMPLATE2(_BITUNPACK_,64_28),
+  &TEMPLATE2(_BITUNPACK_,64_29),
+  &TEMPLATE2(_BITUNPACK_,64_30),
+  &TEMPLATE2(_BITUNPACK_,64_31),
+  &TEMPLATE2(_BITUNPACK_,64_32),
+  &TEMPLATE2(_BITUNPACK_,64_33),
+  &TEMPLATE2(_BITUNPACK_,64_34),
+  &TEMPLATE2(_BITUNPACK_,64_35),
+  &TEMPLATE2(_BITUNPACK_,64_36),
+  &TEMPLATE2(_BITUNPACK_,64_37),
+  &TEMPLATE2(_BITUNPACK_,64_38),
+  &TEMPLATE2(_BITUNPACK_,64_39),
+  &TEMPLATE2(_BITUNPACK_,64_40),
+  &TEMPLATE2(_BITUNPACK_,64_41),
+  &TEMPLATE2(_BITUNPACK_,64_42),
+  &TEMPLATE2(_BITUNPACK_,64_43),
+  &TEMPLATE2(_BITUNPACK_,64_44),
+  &TEMPLATE2(_BITUNPACK_,64_45),
+  &TEMPLATE2(_BITUNPACK_,64_46),
+  &TEMPLATE2(_BITUNPACK_,64_47),
+  &TEMPLATE2(_BITUNPACK_,64_48),
+  &TEMPLATE2(_BITUNPACK_,64_49),
+  &TEMPLATE2(_BITUNPACK_,64_50),
+  &TEMPLATE2(_BITUNPACK_,64_51),
+  &TEMPLATE2(_BITUNPACK_,64_52),
+  &TEMPLATE2(_BITUNPACK_,64_53),
+  &TEMPLATE2(_BITUNPACK_,64_54),
+  &TEMPLATE2(_BITUNPACK_,64_55),
+  &TEMPLATE2(_BITUNPACK_,64_56),
+  &TEMPLATE2(_BITUNPACK_,64_57),
+  &TEMPLATE2(_BITUNPACK_,64_58),
+  &TEMPLATE2(_BITUNPACK_,64_59),
+  &TEMPLATE2(_BITUNPACK_,64_60),
+  &TEMPLATE2(_BITUNPACK_,64_61),
+  &TEMPLATE2(_BITUNPACK_,64_62),
+  &TEMPLATE2(_BITUNPACK_,64_63),
+  &TEMPLATE2(_BITUNPACK_,64_64)
 };
-unsigned char *T2(_BITUNPACK_,64)( const unsigned char *__restrict in, unsigned n, uint64_t  *__restrict out , unsigned b) { return T2(_BITUNPACK_,a64)[ b](in, n, out); }
+unsigned char *TEMPLATE2(_BITUNPACK_,64)( const unsigned char *__restrict in, unsigned n, uint64_t  *__restrict out , unsigned b) { return TEMPLATE2(_BITUNPACK_,a64)[ b](in, n, out); }
 
 #else
 #define USIZE 8
-unsigned char *T2(_BITUNPACK_,8_0)(const unsigned char *__restrict in, unsigned n, uint8_t *__restrict out , uint8_t start ) { BU(0,8); }
-unsigned char *T2(_BITUNPACK_,8_1)(const unsigned char *__restrict in, unsigned n, uint8_t *__restrict out , uint8_t start ) { BU(1,8); }
-unsigned char *T2(_BITUNPACK_,8_2)(const unsigned char *__restrict in, unsigned n, uint8_t *__restrict out , uint8_t start ) { BU(2,8); }
-unsigned char *T2(_BITUNPACK_,8_3)(const unsigned char *__restrict in, unsigned n, uint8_t *__restrict out , uint8_t start ) { BU(3,8); }
-unsigned char *T2(_BITUNPACK_,8_4)(const unsigned char *__restrict in, unsigned n, uint8_t *__restrict out , uint8_t start ) { BU(4,8); }
-unsigned char *T2(_BITUNPACK_,8_5)(const unsigned char *__restrict in, unsigned n, uint8_t *__restrict out , uint8_t start ) { BU(5,8); }
-unsigned char *T2(_BITUNPACK_,8_6)(const unsigned char *__restrict in, unsigned n, uint8_t *__restrict out , uint8_t start ) { BU(6,8); }
-unsigned char *T2(_BITUNPACK_,8_7)(const unsigned char *__restrict in, unsigned n, uint8_t *__restrict out , uint8_t start ) { BU(7,8); }
-unsigned char *T2(_BITUNPACK_,8_8)(const unsigned char *__restrict in, unsigned n, uint8_t *__restrict out , uint8_t start ) { BU(8,8); }
-BITUNPACK_D8 T2(_BITUNPACK_,a8)[] = {
-  &T2(_BITUNPACK_,8_0),
-  &T2(_BITUNPACK_,8_1),
-  &T2(_BITUNPACK_,8_2),
-  &T2(_BITUNPACK_,8_3),
-  &T2(_BITUNPACK_,8_4),
-  &T2(_BITUNPACK_,8_5),
-  &T2(_BITUNPACK_,8_6),
-  &T2(_BITUNPACK_,8_7),
-  &T2(_BITUNPACK_,8_8)
+unsigned char *TEMPLATE2(_BITUNPACK_,8_0)(const unsigned char *__restrict in, unsigned n, uint8_t *__restrict out , uint8_t start ) { unsigned char *in_=in+PAD8(n*0),x=0; const uint8_t *out_ = out+n; do { BITUNPACK64_0( in, out, 0,start); PREFETCH(in+512,0); } while(out<out_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,8_1)(const unsigned char *__restrict in, unsigned n, uint8_t *__restrict out , uint8_t start ) { unsigned char *in_=in+PAD8(n*1),x=0; do { BITUNPACK64_1( in, out, 1,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,8_2)(const unsigned char *__restrict in, unsigned n, uint8_t *__restrict out , uint8_t start ) { unsigned char *in_=in+PAD8(n*2),x=0; do { BITUNPACK64_2( in, out, 2,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,8_3)(const unsigned char *__restrict in, unsigned n, uint8_t *__restrict out , uint8_t start ) { unsigned char *in_=in+PAD8(n*3),x=0; do { BITUNPACK64_3( in, out, 3,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,8_4)(const unsigned char *__restrict in, unsigned n, uint8_t *__restrict out , uint8_t start ) { unsigned char *in_=in+PAD8(n*4),x=0; do { BITUNPACK64_4( in, out, 4,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,8_5)(const unsigned char *__restrict in, unsigned n, uint8_t *__restrict out , uint8_t start ) { unsigned char *in_=in+PAD8(n*5),x=0; do { BITUNPACK64_5( in, out, 5,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,8_6)(const unsigned char *__restrict in, unsigned n, uint8_t *__restrict out , uint8_t start ) { unsigned char *in_=in+PAD8(n*6),x=0; do { BITUNPACK64_6( in, out, 6,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,8_7)(const unsigned char *__restrict in, unsigned n, uint8_t *__restrict out , uint8_t start ) { unsigned char *in_=in+PAD8(n*7),x=0; do { BITUNPACK64_7( in, out, 7,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,8_8)(const unsigned char *__restrict in, unsigned n, uint8_t *__restrict out , uint8_t start ) { unsigned char *in_=in+PAD8(n*8),x=0; do { BITUNPACK64_8( in, out, 8,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+BITUNPACK_D8 TEMPLATE2(_BITUNPACK_,a8)[] = {
+  &TEMPLATE2(_BITUNPACK_,8_0),
+  &TEMPLATE2(_BITUNPACK_,8_1),
+  &TEMPLATE2(_BITUNPACK_,8_2),
+  &TEMPLATE2(_BITUNPACK_,8_3),
+  &TEMPLATE2(_BITUNPACK_,8_4),
+  &TEMPLATE2(_BITUNPACK_,8_5),
+  &TEMPLATE2(_BITUNPACK_,8_6),
+  &TEMPLATE2(_BITUNPACK_,8_7),
+  &TEMPLATE2(_BITUNPACK_,8_8)
 };
-unsigned char *T2(_BITUNPACK_,8)( const unsigned char *__restrict in, unsigned n, uint8_t  *__restrict out , uint8_t start, unsigned b) { return T2(_BITUNPACK_,a8)[ b](in, n, out, start); }
+unsigned char *TEMPLATE2(_BITUNPACK_,8)( const unsigned char *__restrict in, unsigned n, uint8_t  *__restrict out , uint8_t start, unsigned b) { return TEMPLATE2(_BITUNPACK_,a8)[ b](in, n, out, start); }
 
 #define USIZE 16
-unsigned char *T2(_BITUNPACK_,16_0 )(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out , uint16_t start ) { BU( 0,16); }
-unsigned char *T2(_BITUNPACK_,16_1 )(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out , uint16_t start ) { BU( 1,16); }
-unsigned char *T2(_BITUNPACK_,16_2 )(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out , uint16_t start ) { BU( 2,16); }
-unsigned char *T2(_BITUNPACK_,16_3 )(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out , uint16_t start ) { BU( 3,16); }
-unsigned char *T2(_BITUNPACK_,16_4 )(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out , uint16_t start ) { BU( 4,16); }
-unsigned char *T2(_BITUNPACK_,16_5 )(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out , uint16_t start ) { BU( 5,16); }
-unsigned char *T2(_BITUNPACK_,16_6 )(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out , uint16_t start ) { BU( 6,16); }
-unsigned char *T2(_BITUNPACK_,16_7 )(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out , uint16_t start ) { BU( 7,16); }
-unsigned char *T2(_BITUNPACK_,16_8 )(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out , uint16_t start ) { BU( 8,16); }
-unsigned char *T2(_BITUNPACK_,16_9 )(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out , uint16_t start ) { BU( 9,16); }
-unsigned char *T2(_BITUNPACK_,16_10)(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out , uint16_t start ) { BU(10,16); }
-unsigned char *T2(_BITUNPACK_,16_11)(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out , uint16_t start ) { BU(11,16); }
-unsigned char *T2(_BITUNPACK_,16_12)(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out , uint16_t start ) { BU(12,16); }
-unsigned char *T2(_BITUNPACK_,16_13)(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out , uint16_t start ) { BU(13,16); }
-unsigned char *T2(_BITUNPACK_,16_14)(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out , uint16_t start ) { BU(14,16); }
-unsigned char *T2(_BITUNPACK_,16_15)(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out , uint16_t start ) { BU(15,16); }
-unsigned char *T2(_BITUNPACK_,16_16)(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out , uint16_t start ) { BU(16,16); }
-BITUNPACK_D16 T2(_BITUNPACK_,a16)[] = {
-  &T2(_BITUNPACK_,16_0),
-  &T2(_BITUNPACK_,16_1),
-  &T2(_BITUNPACK_,16_2),
-  &T2(_BITUNPACK_,16_3),
-  &T2(_BITUNPACK_,16_4),
-  &T2(_BITUNPACK_,16_5),
-  &T2(_BITUNPACK_,16_6),
-  &T2(_BITUNPACK_,16_7),
-  &T2(_BITUNPACK_,16_8),
-  &T2(_BITUNPACK_,16_9),
-  &T2(_BITUNPACK_,16_10),
-  &T2(_BITUNPACK_,16_11),
-  &T2(_BITUNPACK_,16_12),
-  &T2(_BITUNPACK_,16_13),
-  &T2(_BITUNPACK_,16_14),
-  &T2(_BITUNPACK_,16_15),
-  &T2(_BITUNPACK_,16_16)
+unsigned char *TEMPLATE2(_BITUNPACK_,16_0)(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out , uint16_t start ) { unsigned char *in_=in+PAD8(n*0),x=0; const uint16_t *out_ = out+n; do { BITUNPACK64_0( in, out, 0,start); PREFETCH(in+512,0); } while(out<out_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,16_1)(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out , uint16_t start ) { unsigned char *in_=in+PAD8(n*1),x=0; do { BITUNPACK64_1( in, out, 1,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,16_2)(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out , uint16_t start ) { unsigned char *in_=in+PAD8(n*2),x=0; do { BITUNPACK64_2( in, out, 2,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,16_3)(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out , uint16_t start ) { unsigned char *in_=in+PAD8(n*3),x=0; do { BITUNPACK64_3( in, out, 3,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,16_4)(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out , uint16_t start ) { unsigned char *in_=in+PAD8(n*4),x=0; do { BITUNPACK64_4( in, out, 4,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,16_5)(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out , uint16_t start ) { unsigned char *in_=in+PAD8(n*5),x=0; do { BITUNPACK64_5( in, out, 5,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,16_6)(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out , uint16_t start ) { unsigned char *in_=in+PAD8(n*6),x=0; do { BITUNPACK64_6( in, out, 6,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,16_7)(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out , uint16_t start ) { unsigned char *in_=in+PAD8(n*7),x=0; do { BITUNPACK64_7( in, out, 7,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,16_8)(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out , uint16_t start ) { unsigned char *in_=in+PAD8(n*8),x=0; do { BITUNPACK64_8( in, out, 8,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,16_9)(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out , uint16_t start ) { unsigned char *in_=in+PAD8(n*9),x=0; do { BITUNPACK64_9( in, out, 9,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,16_10)(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out , uint16_t start ) { unsigned char *in_=in+PAD8(n*10),x=0; do { BITUNPACK64_10( in, out, 10,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,16_11)(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out , uint16_t start ) { unsigned char *in_=in+PAD8(n*11),x=0; do { BITUNPACK64_11( in, out, 11,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,16_12)(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out , uint16_t start ) { unsigned char *in_=in+PAD8(n*12),x=0; do { BITUNPACK64_12( in, out, 12,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,16_13)(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out , uint16_t start ) { unsigned char *in_=in+PAD8(n*13),x=0; do { BITUNPACK64_13( in, out, 13,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,16_14)(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out , uint16_t start ) { unsigned char *in_=in+PAD8(n*14),x=0; do { BITUNPACK64_14( in, out, 14,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,16_15)(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out , uint16_t start ) { unsigned char *in_=in+PAD8(n*15),x=0; do { BITUNPACK64_15( in, out, 15,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,16_16)(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out , uint16_t start ) { unsigned char *in_=in+PAD8(n*16),x=0; do { BITUNPACK64_16( in, out, 16,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+BITUNPACK_D16 TEMPLATE2(_BITUNPACK_,a16)[] = {
+  &TEMPLATE2(_BITUNPACK_,16_0),
+  &TEMPLATE2(_BITUNPACK_,16_1),
+  &TEMPLATE2(_BITUNPACK_,16_2),
+  &TEMPLATE2(_BITUNPACK_,16_3),
+  &TEMPLATE2(_BITUNPACK_,16_4),
+  &TEMPLATE2(_BITUNPACK_,16_5),
+  &TEMPLATE2(_BITUNPACK_,16_6),
+  &TEMPLATE2(_BITUNPACK_,16_7),
+  &TEMPLATE2(_BITUNPACK_,16_8),
+  &TEMPLATE2(_BITUNPACK_,16_9),
+  &TEMPLATE2(_BITUNPACK_,16_10),
+  &TEMPLATE2(_BITUNPACK_,16_11),
+  &TEMPLATE2(_BITUNPACK_,16_12),
+  &TEMPLATE2(_BITUNPACK_,16_13),
+  &TEMPLATE2(_BITUNPACK_,16_14),
+  &TEMPLATE2(_BITUNPACK_,16_15),
+  &TEMPLATE2(_BITUNPACK_,16_16)
 };
-unsigned char *T2(_BITUNPACK_,16)( const unsigned char *__restrict in, unsigned n, uint16_t  *__restrict out , uint16_t start, unsigned b) { return T2(_BITUNPACK_,a16)[ b](in, n, out, start); }
+unsigned char *TEMPLATE2(_BITUNPACK_,16)( const unsigned char *__restrict in, unsigned n, uint16_t  *__restrict out , uint16_t start, unsigned b) { return TEMPLATE2(_BITUNPACK_,a16)[ b](in, n, out, start); }
 
 #define USIZE 32
-unsigned char *T2(_BITUNPACK_,32_0 )(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { BU( 0,32); }
-unsigned char *T2(_BITUNPACK_,32_1 )(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { BU( 1,32); }
-unsigned char *T2(_BITUNPACK_,32_2 )(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { BU( 2,32); }
-unsigned char *T2(_BITUNPACK_,32_3 )(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { BU( 3,32); }
-unsigned char *T2(_BITUNPACK_,32_4 )(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { BU( 4,32); }
-unsigned char *T2(_BITUNPACK_,32_5 )(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { BU( 5,32); }
-unsigned char *T2(_BITUNPACK_,32_6 )(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { BU( 6,32); }
-unsigned char *T2(_BITUNPACK_,32_7 )(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { BU( 7,32); }
-unsigned char *T2(_BITUNPACK_,32_8 )(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { BU( 8,32); }
-unsigned char *T2(_BITUNPACK_,32_9 )(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { BU( 9,32); }
-unsigned char *T2(_BITUNPACK_,32_10)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { BU(10,32); }
-unsigned char *T2(_BITUNPACK_,32_11)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { BU(11,32); }
-unsigned char *T2(_BITUNPACK_,32_12)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { BU(12,32); }
-unsigned char *T2(_BITUNPACK_,32_13)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { BU(13,32); }
-unsigned char *T2(_BITUNPACK_,32_14)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { BU(14,32); }
-unsigned char *T2(_BITUNPACK_,32_15)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { BU(15,32); }
-unsigned char *T2(_BITUNPACK_,32_16)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { BU(16,32); }
-unsigned char *T2(_BITUNPACK_,32_17)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { BU(17,32); }
-unsigned char *T2(_BITUNPACK_,32_18)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { BU(18,32); }
-unsigned char *T2(_BITUNPACK_,32_19)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { BU(19,32); }
-unsigned char *T2(_BITUNPACK_,32_20)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { BU(20,32); }
-unsigned char *T2(_BITUNPACK_,32_21)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { BU(21,32); }
-unsigned char *T2(_BITUNPACK_,32_22)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { BU(22,32); }
-unsigned char *T2(_BITUNPACK_,32_23)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { BU(23,32); }
-unsigned char *T2(_BITUNPACK_,32_24)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { BU(24,32); }
-unsigned char *T2(_BITUNPACK_,32_25)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { BU(25,32); }
-unsigned char *T2(_BITUNPACK_,32_26)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { BU(26,32); }
-unsigned char *T2(_BITUNPACK_,32_27)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { BU(27,32); }
-unsigned char *T2(_BITUNPACK_,32_28)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { BU(28,32); }
-unsigned char *T2(_BITUNPACK_,32_29)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { BU(29,32); }
-unsigned char *T2(_BITUNPACK_,32_30)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { BU(30,32); }
-unsigned char *T2(_BITUNPACK_,32_31)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { BU(31,32); }
-unsigned char *T2(_BITUNPACK_,32_32)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { BU(32,32); }
-BITUNPACK_D32 T2(_BITUNPACK_,a32)[] = {
-  &T2(_BITUNPACK_,32_0),
-  &T2(_BITUNPACK_,32_1),
-  &T2(_BITUNPACK_,32_2),
-  &T2(_BITUNPACK_,32_3),
-  &T2(_BITUNPACK_,32_4),
-  &T2(_BITUNPACK_,32_5),
-  &T2(_BITUNPACK_,32_6),
-  &T2(_BITUNPACK_,32_7),
-  &T2(_BITUNPACK_,32_8),
-  &T2(_BITUNPACK_,32_9),
-  &T2(_BITUNPACK_,32_10),
-  &T2(_BITUNPACK_,32_11),
-  &T2(_BITUNPACK_,32_12),
-  &T2(_BITUNPACK_,32_13),
-  &T2(_BITUNPACK_,32_14),
-  &T2(_BITUNPACK_,32_15),
-  &T2(_BITUNPACK_,32_16),
-  &T2(_BITUNPACK_,32_17),
-  &T2(_BITUNPACK_,32_18),
-  &T2(_BITUNPACK_,32_19),
-  &T2(_BITUNPACK_,32_20),
-  &T2(_BITUNPACK_,32_21),
-  &T2(_BITUNPACK_,32_22),
-  &T2(_BITUNPACK_,32_23),
-  &T2(_BITUNPACK_,32_24),
-  &T2(_BITUNPACK_,32_25),
-  &T2(_BITUNPACK_,32_26),
-  &T2(_BITUNPACK_,32_27),
-  &T2(_BITUNPACK_,32_28),
-  &T2(_BITUNPACK_,32_29),
-  &T2(_BITUNPACK_,32_30),
-  &T2(_BITUNPACK_,32_31),
-  &T2(_BITUNPACK_,32_32)
+unsigned char *TEMPLATE2(_BITUNPACK_,32_0)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { unsigned char *in_=in+PAD8(n*0),x=0; const uint32_t *out_ = out+n; do { BITUNPACK64_0( in, out, 0,start); PREFETCH(in+512,0); } while(out<out_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_1)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { unsigned char *in_=in+PAD8(n*1),x=0; do { BITUNPACK64_1( in, out, 1,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_2)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { unsigned char *in_=in+PAD8(n*2),x=0; do { BITUNPACK64_2( in, out, 2,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_3)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { unsigned char *in_=in+PAD8(n*3),x=0; do { BITUNPACK64_3( in, out, 3,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_4)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { unsigned char *in_=in+PAD8(n*4),x=0; do { BITUNPACK64_4( in, out, 4,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_5)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { unsigned char *in_=in+PAD8(n*5),x=0; do { BITUNPACK64_5( in, out, 5,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_6)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { unsigned char *in_=in+PAD8(n*6),x=0; do { BITUNPACK64_6( in, out, 6,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_7)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { unsigned char *in_=in+PAD8(n*7),x=0; do { BITUNPACK64_7( in, out, 7,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_8)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { unsigned char *in_=in+PAD8(n*8),x=0; do { BITUNPACK64_8( in, out, 8,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_9)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { unsigned char *in_=in+PAD8(n*9),x=0; do { BITUNPACK64_9( in, out, 9,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_10)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { unsigned char *in_=in+PAD8(n*10),x=0; do { BITUNPACK64_10( in, out, 10,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_11)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { unsigned char *in_=in+PAD8(n*11),x=0; do { BITUNPACK64_11( in, out, 11,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_12)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { unsigned char *in_=in+PAD8(n*12),x=0; do { BITUNPACK64_12( in, out, 12,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_13)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { unsigned char *in_=in+PAD8(n*13),x=0; do { BITUNPACK64_13( in, out, 13,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_14)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { unsigned char *in_=in+PAD8(n*14),x=0; do { BITUNPACK64_14( in, out, 14,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_15)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { unsigned char *in_=in+PAD8(n*15),x=0; do { BITUNPACK64_15( in, out, 15,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_16)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { unsigned char *in_=in+PAD8(n*16),x=0; do { BITUNPACK64_16( in, out, 16,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_17)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { unsigned char *in_=in+PAD8(n*17),x=0; do { BITUNPACK64_17( in, out, 17,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_18)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { unsigned char *in_=in+PAD8(n*18),x=0; do { BITUNPACK64_18( in, out, 18,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_19)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { unsigned char *in_=in+PAD8(n*19),x=0; do { BITUNPACK64_19( in, out, 19,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_20)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { unsigned char *in_=in+PAD8(n*20),x=0; do { BITUNPACK64_20( in, out, 20,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_21)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { unsigned char *in_=in+PAD8(n*21),x=0; do { BITUNPACK64_21( in, out, 21,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_22)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { unsigned char *in_=in+PAD8(n*22),x=0; do { BITUNPACK64_22( in, out, 22,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_23)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { unsigned char *in_=in+PAD8(n*23),x=0; do { BITUNPACK64_23( in, out, 23,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_24)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { unsigned char *in_=in+PAD8(n*24),x=0; do { BITUNPACK64_24( in, out, 24,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_25)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { unsigned char *in_=in+PAD8(n*25),x=0; do { BITUNPACK64_25( in, out, 25,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_26)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { unsigned char *in_=in+PAD8(n*26),x=0; do { BITUNPACK64_26( in, out, 26,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_27)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { unsigned char *in_=in+PAD8(n*27),x=0; do { BITUNPACK64_27( in, out, 27,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_28)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { unsigned char *in_=in+PAD8(n*28),x=0; do { BITUNPACK64_28( in, out, 28,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_29)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { unsigned char *in_=in+PAD8(n*29),x=0; do { BITUNPACK64_29( in, out, 29,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_30)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { unsigned char *in_=in+PAD8(n*30),x=0; do { BITUNPACK64_30( in, out, 30,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_31)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { unsigned char *in_=in+PAD8(n*31),x=0; do { BITUNPACK64_31( in, out, 31,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_32)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { unsigned char *in_=in+PAD8(n*32),x=0; do { BITUNPACK64_32( in, out, 32,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+BITUNPACK_D32 TEMPLATE2(_BITUNPACK_,a32)[] = {
+  &TEMPLATE2(_BITUNPACK_,32_0),
+  &TEMPLATE2(_BITUNPACK_,32_1),
+  &TEMPLATE2(_BITUNPACK_,32_2),
+  &TEMPLATE2(_BITUNPACK_,32_3),
+  &TEMPLATE2(_BITUNPACK_,32_4),
+  &TEMPLATE2(_BITUNPACK_,32_5),
+  &TEMPLATE2(_BITUNPACK_,32_6),
+  &TEMPLATE2(_BITUNPACK_,32_7),
+  &TEMPLATE2(_BITUNPACK_,32_8),
+  &TEMPLATE2(_BITUNPACK_,32_9),
+  &TEMPLATE2(_BITUNPACK_,32_10),
+  &TEMPLATE2(_BITUNPACK_,32_11),
+  &TEMPLATE2(_BITUNPACK_,32_12),
+  &TEMPLATE2(_BITUNPACK_,32_13),
+  &TEMPLATE2(_BITUNPACK_,32_14),
+  &TEMPLATE2(_BITUNPACK_,32_15),
+  &TEMPLATE2(_BITUNPACK_,32_16),
+  &TEMPLATE2(_BITUNPACK_,32_17),
+  &TEMPLATE2(_BITUNPACK_,32_18),
+  &TEMPLATE2(_BITUNPACK_,32_19),
+  &TEMPLATE2(_BITUNPACK_,32_20),
+  &TEMPLATE2(_BITUNPACK_,32_21),
+  &TEMPLATE2(_BITUNPACK_,32_22),
+  &TEMPLATE2(_BITUNPACK_,32_23),
+  &TEMPLATE2(_BITUNPACK_,32_24),
+  &TEMPLATE2(_BITUNPACK_,32_25),
+  &TEMPLATE2(_BITUNPACK_,32_26),
+  &TEMPLATE2(_BITUNPACK_,32_27),
+  &TEMPLATE2(_BITUNPACK_,32_28),
+  &TEMPLATE2(_BITUNPACK_,32_29),
+  &TEMPLATE2(_BITUNPACK_,32_30),
+  &TEMPLATE2(_BITUNPACK_,32_31),
+  &TEMPLATE2(_BITUNPACK_,32_32)
 };
-unsigned char *T2(_BITUNPACK_,32)( const unsigned char *__restrict in, unsigned n, uint32_t  *__restrict out , uint32_t start, unsigned b) { return T2(_BITUNPACK_,a32)[ b](in, n, out, start); }
+unsigned char *TEMPLATE2(_BITUNPACK_,32)( const unsigned char *__restrict in, unsigned n, uint32_t  *__restrict out , uint32_t start, unsigned b) { return TEMPLATE2(_BITUNPACK_,a32)[ b](in, n, out, start); }
 
 #define USIZE 64
-unsigned char *T2(_BITUNPACK_,64_0 )(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU( 0,64); }
-unsigned char *T2(_BITUNPACK_,64_1 )(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU( 1,64); }
-unsigned char *T2(_BITUNPACK_,64_2 )(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU( 2,64); }
-unsigned char *T2(_BITUNPACK_,64_3 )(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU( 3,64); }
-unsigned char *T2(_BITUNPACK_,64_4 )(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU( 4,64); }
-unsigned char *T2(_BITUNPACK_,64_5 )(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU( 5,64); }
-unsigned char *T2(_BITUNPACK_,64_6 )(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU( 6,64); }
-unsigned char *T2(_BITUNPACK_,64_7 )(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU( 7,64); }
-unsigned char *T2(_BITUNPACK_,64_8 )(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU( 8,64); }
-unsigned char *T2(_BITUNPACK_,64_9 )(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU( 9,64); }
-unsigned char *T2(_BITUNPACK_,64_10)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(10,64); }
-unsigned char *T2(_BITUNPACK_,64_11)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(11,64); }
-unsigned char *T2(_BITUNPACK_,64_12)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(12,64); }
-unsigned char *T2(_BITUNPACK_,64_13)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(13,64); }
-unsigned char *T2(_BITUNPACK_,64_14)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(14,64); }
-unsigned char *T2(_BITUNPACK_,64_15)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(15,64); }
-unsigned char *T2(_BITUNPACK_,64_16)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(16,64); }
-unsigned char *T2(_BITUNPACK_,64_17)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(17,64); }
-unsigned char *T2(_BITUNPACK_,64_18)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(18,64); }
-unsigned char *T2(_BITUNPACK_,64_19)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(19,64); }
-unsigned char *T2(_BITUNPACK_,64_20)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(20,64); }
-unsigned char *T2(_BITUNPACK_,64_21)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(21,64); }
-unsigned char *T2(_BITUNPACK_,64_22)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(22,64); }
-unsigned char *T2(_BITUNPACK_,64_23)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(23,64); }
-unsigned char *T2(_BITUNPACK_,64_24)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(24,64); }
-unsigned char *T2(_BITUNPACK_,64_25)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(25,64); }
-unsigned char *T2(_BITUNPACK_,64_26)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(26,64); }
-unsigned char *T2(_BITUNPACK_,64_27)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(27,64); }
-unsigned char *T2(_BITUNPACK_,64_28)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(28,64); }
-unsigned char *T2(_BITUNPACK_,64_29)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(29,64); }
-unsigned char *T2(_BITUNPACK_,64_30)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(30,64); }
-unsigned char *T2(_BITUNPACK_,64_31)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(31,64); }
-unsigned char *T2(_BITUNPACK_,64_32)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(32,64); }
-unsigned char *T2(_BITUNPACK_,64_33)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(33,64); }
-unsigned char *T2(_BITUNPACK_,64_34)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(34,64); }
-unsigned char *T2(_BITUNPACK_,64_35)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(35,64); }
-unsigned char *T2(_BITUNPACK_,64_36)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(36,64); }
-unsigned char *T2(_BITUNPACK_,64_37)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(37,64); }
-unsigned char *T2(_BITUNPACK_,64_38)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(38,64); }
-unsigned char *T2(_BITUNPACK_,64_39)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(39,64); }
-unsigned char *T2(_BITUNPACK_,64_40)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(40,64); }
-unsigned char *T2(_BITUNPACK_,64_41)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(41,64); }
-unsigned char *T2(_BITUNPACK_,64_42)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(42,64); }
-unsigned char *T2(_BITUNPACK_,64_43)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(43,64); }
-unsigned char *T2(_BITUNPACK_,64_44)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(44,64); }
-unsigned char *T2(_BITUNPACK_,64_45)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(45,64); }
-unsigned char *T2(_BITUNPACK_,64_46)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(46,64); }
-unsigned char *T2(_BITUNPACK_,64_47)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(47,64); }
-unsigned char *T2(_BITUNPACK_,64_48)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(48,64); }
-unsigned char *T2(_BITUNPACK_,64_49)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(49,64); }
-unsigned char *T2(_BITUNPACK_,64_50)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(50,64); }
-unsigned char *T2(_BITUNPACK_,64_51)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(51,64); }
-unsigned char *T2(_BITUNPACK_,64_52)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(52,64); }
-unsigned char *T2(_BITUNPACK_,64_53)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(53,64); }
-unsigned char *T2(_BITUNPACK_,64_54)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(54,64); }
-unsigned char *T2(_BITUNPACK_,64_55)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(55,64); }
-unsigned char *T2(_BITUNPACK_,64_56)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(56,64); }
-unsigned char *T2(_BITUNPACK_,64_57)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(57,64); }
-unsigned char *T2(_BITUNPACK_,64_58)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(58,64); }
-unsigned char *T2(_BITUNPACK_,64_59)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(59,64); }
-unsigned char *T2(_BITUNPACK_,64_60)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(60,64); }
-unsigned char *T2(_BITUNPACK_,64_61)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(61,64); }
-unsigned char *T2(_BITUNPACK_,64_62)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(62,64); }
-unsigned char *T2(_BITUNPACK_,64_63)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(63,64); }
-unsigned char *T2(_BITUNPACK_,64_64)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(64,64); }
-BITUNPACK_D64 T2(_BITUNPACK_,a64)[] = {
-  &T2(_BITUNPACK_,64_0),
-  &T2(_BITUNPACK_,64_1),
-  &T2(_BITUNPACK_,64_2),
-  &T2(_BITUNPACK_,64_3),
-  &T2(_BITUNPACK_,64_4),
-  &T2(_BITUNPACK_,64_5),
-  &T2(_BITUNPACK_,64_6),
-  &T2(_BITUNPACK_,64_7),
-  &T2(_BITUNPACK_,64_8),
-  &T2(_BITUNPACK_,64_9),
-  &T2(_BITUNPACK_,64_10),
-  &T2(_BITUNPACK_,64_11),
-  &T2(_BITUNPACK_,64_12),
-  &T2(_BITUNPACK_,64_13),
-  &T2(_BITUNPACK_,64_14),
-  &T2(_BITUNPACK_,64_15),
-  &T2(_BITUNPACK_,64_16),
-  &T2(_BITUNPACK_,64_17),
-  &T2(_BITUNPACK_,64_18),
-  &T2(_BITUNPACK_,64_19),
-  &T2(_BITUNPACK_,64_20),
-  &T2(_BITUNPACK_,64_21),
-  &T2(_BITUNPACK_,64_22),
-  &T2(_BITUNPACK_,64_23),
-  &T2(_BITUNPACK_,64_24),
-  &T2(_BITUNPACK_,64_25),
-  &T2(_BITUNPACK_,64_26),
-  &T2(_BITUNPACK_,64_27),
-  &T2(_BITUNPACK_,64_28),
-  &T2(_BITUNPACK_,64_29),
-  &T2(_BITUNPACK_,64_30),
-  &T2(_BITUNPACK_,64_31),
-  &T2(_BITUNPACK_,64_32),
-  &T2(_BITUNPACK_,64_33),
-  &T2(_BITUNPACK_,64_34),
-  &T2(_BITUNPACK_,64_35),
-  &T2(_BITUNPACK_,64_36),
-  &T2(_BITUNPACK_,64_37),
-  &T2(_BITUNPACK_,64_38),
-  &T2(_BITUNPACK_,64_39),
-  &T2(_BITUNPACK_,64_40),
-  &T2(_BITUNPACK_,64_41),
-  &T2(_BITUNPACK_,64_42),
-  &T2(_BITUNPACK_,64_43),
-  &T2(_BITUNPACK_,64_44),
-  &T2(_BITUNPACK_,64_45),
-  &T2(_BITUNPACK_,64_46),
-  &T2(_BITUNPACK_,64_47),
-  &T2(_BITUNPACK_,64_48),
-  &T2(_BITUNPACK_,64_49),
-  &T2(_BITUNPACK_,64_50),
-  &T2(_BITUNPACK_,64_51),
-  &T2(_BITUNPACK_,64_52),
-  &T2(_BITUNPACK_,64_53),
-  &T2(_BITUNPACK_,64_54),
-  &T2(_BITUNPACK_,64_55),
-  &T2(_BITUNPACK_,64_56),
-  &T2(_BITUNPACK_,64_57),
-  &T2(_BITUNPACK_,64_58),
-  &T2(_BITUNPACK_,64_59),
-  &T2(_BITUNPACK_,64_60),
-  &T2(_BITUNPACK_,64_61),
-  &T2(_BITUNPACK_,64_62),
-  &T2(_BITUNPACK_,64_63),
-  &T2(_BITUNPACK_,64_64)
+unsigned char *TEMPLATE2(_BITUNPACK_,64_0)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*0),x=0; const uint64_t *out_ = out+n; do { BITUNPACK64_0( in, out, 0,start); PREFETCH(in+512,0); } while(out<out_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_1)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*1),x=0; do { BITUNPACK64_1( in, out, 1,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_2)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*2),x=0; do { BITUNPACK64_2( in, out, 2,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_3)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*3),x=0; do { BITUNPACK64_3( in, out, 3,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_4)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*4),x=0; do { BITUNPACK64_4( in, out, 4,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_5)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*5),x=0; do { BITUNPACK64_5( in, out, 5,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_6)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*6),x=0; do { BITUNPACK64_6( in, out, 6,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_7)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*7),x=0; do { BITUNPACK64_7( in, out, 7,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_8)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*8),x=0; do { BITUNPACK64_8( in, out, 8,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_9)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*9),x=0; do { BITUNPACK64_9( in, out, 9,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_10)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*10),x=0; do { BITUNPACK64_10( in, out, 10,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_11)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*11),x=0; do { BITUNPACK64_11( in, out, 11,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_12)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*12),x=0; do { BITUNPACK64_12( in, out, 12,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_13)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*13),x=0; do { BITUNPACK64_13( in, out, 13,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_14)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*14),x=0; do { BITUNPACK64_14( in, out, 14,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_15)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*15),x=0; do { BITUNPACK64_15( in, out, 15,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_16)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*16),x=0; do { BITUNPACK64_16( in, out, 16,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_17)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*17),x=0; do { BITUNPACK64_17( in, out, 17,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_18)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*18),x=0; do { BITUNPACK64_18( in, out, 18,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_19)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*19),x=0; do { BITUNPACK64_19( in, out, 19,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_20)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*20),x=0; do { BITUNPACK64_20( in, out, 20,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_21)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*21),x=0; do { BITUNPACK64_21( in, out, 21,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_22)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*22),x=0; do { BITUNPACK64_22( in, out, 22,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_23)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*23),x=0; do { BITUNPACK64_23( in, out, 23,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_24)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*24),x=0; do { BITUNPACK64_24( in, out, 24,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_25)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*25),x=0; do { BITUNPACK64_25( in, out, 25,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_26)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*26),x=0; do { BITUNPACK64_26( in, out, 26,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_27)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*27),x=0; do { BITUNPACK64_27( in, out, 27,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_28)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*28),x=0; do { BITUNPACK64_28( in, out, 28,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_29)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*29),x=0; do { BITUNPACK64_29( in, out, 29,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_30)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*30),x=0; do { BITUNPACK64_30( in, out, 30,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_31)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*31),x=0; do { BITUNPACK64_31( in, out, 31,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_32)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*32),x=0; do { BITUNPACK64_32( in, out, 32,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_33)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*33),x=0; do { BITUNPACK64_33( in, out, 33,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_34)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*34),x=0; do { BITUNPACK64_34( in, out, 34,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_35)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*35),x=0; do { BITUNPACK64_35( in, out, 35,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_36)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*36),x=0; do { BITUNPACK64_36( in, out, 36,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_37)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*37),x=0; do { BITUNPACK64_37( in, out, 37,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_38)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*38),x=0; do { BITUNPACK64_38( in, out, 38,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_39)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*39),x=0; do { BITUNPACK64_39( in, out, 39,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_40)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*40),x=0; do { BITUNPACK64_40( in, out, 40,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_41)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*41),x=0; do { BITUNPACK64_41( in, out, 41,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_42)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*42),x=0; do { BITUNPACK64_42( in, out, 42,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_43)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*43),x=0; do { BITUNPACK64_43( in, out, 43,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_44)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*44),x=0; do { BITUNPACK64_44( in, out, 44,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_45)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*45),x=0; do { BITUNPACK64_45( in, out, 45,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_46)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*46),x=0; do { BITUNPACK64_46( in, out, 46,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_47)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*47),x=0; do { BITUNPACK64_47( in, out, 47,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_48)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*48),x=0; do { BITUNPACK64_48( in, out, 48,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_49)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*49),x=0; do { BITUNPACK64_49( in, out, 49,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_50)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*50),x=0; do { BITUNPACK64_50( in, out, 50,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_51)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*51),x=0; do { BITUNPACK64_51( in, out, 51,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_52)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*52),x=0; do { BITUNPACK64_52( in, out, 52,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_53)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*53),x=0; do { BITUNPACK64_53( in, out, 53,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_54)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*54),x=0; do { BITUNPACK64_54( in, out, 54,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_55)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*55),x=0; do { BITUNPACK64_55( in, out, 55,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_56)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*56),x=0; do { BITUNPACK64_56( in, out, 56,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_57)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*57),x=0; do { BITUNPACK64_57( in, out, 57,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_58)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*58),x=0; do { BITUNPACK64_58( in, out, 58,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_59)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*59),x=0; do { BITUNPACK64_59( in, out, 59,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_60)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*60),x=0; do { BITUNPACK64_60( in, out, 60,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_61)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*61),x=0; do { BITUNPACK64_61( in, out, 61,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_62)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*62),x=0; do { BITUNPACK64_62( in, out, 62,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_63)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*63),x=0; do { BITUNPACK64_63( in, out, 63,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_64)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*64),x=0; do { BITUNPACK64_64( in, out, 64,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+BITUNPACK_D64 TEMPLATE2(_BITUNPACK_,a64)[] = {
+  &TEMPLATE2(_BITUNPACK_,64_0),
+  &TEMPLATE2(_BITUNPACK_,64_1),
+  &TEMPLATE2(_BITUNPACK_,64_2),
+  &TEMPLATE2(_BITUNPACK_,64_3),
+  &TEMPLATE2(_BITUNPACK_,64_4),
+  &TEMPLATE2(_BITUNPACK_,64_5),
+  &TEMPLATE2(_BITUNPACK_,64_6),
+  &TEMPLATE2(_BITUNPACK_,64_7),
+  &TEMPLATE2(_BITUNPACK_,64_8),
+  &TEMPLATE2(_BITUNPACK_,64_9),
+  &TEMPLATE2(_BITUNPACK_,64_10),
+  &TEMPLATE2(_BITUNPACK_,64_11),
+  &TEMPLATE2(_BITUNPACK_,64_12),
+  &TEMPLATE2(_BITUNPACK_,64_13),
+  &TEMPLATE2(_BITUNPACK_,64_14),
+  &TEMPLATE2(_BITUNPACK_,64_15),
+  &TEMPLATE2(_BITUNPACK_,64_16),
+  &TEMPLATE2(_BITUNPACK_,64_17),
+  &TEMPLATE2(_BITUNPACK_,64_18),
+  &TEMPLATE2(_BITUNPACK_,64_19),
+  &TEMPLATE2(_BITUNPACK_,64_20),
+  &TEMPLATE2(_BITUNPACK_,64_21),
+  &TEMPLATE2(_BITUNPACK_,64_22),
+  &TEMPLATE2(_BITUNPACK_,64_23),
+  &TEMPLATE2(_BITUNPACK_,64_24),
+  &TEMPLATE2(_BITUNPACK_,64_25),
+  &TEMPLATE2(_BITUNPACK_,64_26),
+  &TEMPLATE2(_BITUNPACK_,64_27),
+  &TEMPLATE2(_BITUNPACK_,64_28),
+  &TEMPLATE2(_BITUNPACK_,64_29),
+  &TEMPLATE2(_BITUNPACK_,64_30),
+  &TEMPLATE2(_BITUNPACK_,64_31),
+  &TEMPLATE2(_BITUNPACK_,64_32),
+  &TEMPLATE2(_BITUNPACK_,64_33),
+  &TEMPLATE2(_BITUNPACK_,64_34),
+  &TEMPLATE2(_BITUNPACK_,64_35),
+  &TEMPLATE2(_BITUNPACK_,64_36),
+  &TEMPLATE2(_BITUNPACK_,64_37),
+  &TEMPLATE2(_BITUNPACK_,64_38),
+  &TEMPLATE2(_BITUNPACK_,64_39),
+  &TEMPLATE2(_BITUNPACK_,64_40),
+  &TEMPLATE2(_BITUNPACK_,64_41),
+  &TEMPLATE2(_BITUNPACK_,64_42),
+  &TEMPLATE2(_BITUNPACK_,64_43),
+  &TEMPLATE2(_BITUNPACK_,64_44),
+  &TEMPLATE2(_BITUNPACK_,64_45),
+  &TEMPLATE2(_BITUNPACK_,64_46),
+  &TEMPLATE2(_BITUNPACK_,64_47),
+  &TEMPLATE2(_BITUNPACK_,64_48),
+  &TEMPLATE2(_BITUNPACK_,64_49),
+  &TEMPLATE2(_BITUNPACK_,64_50),
+  &TEMPLATE2(_BITUNPACK_,64_51),
+  &TEMPLATE2(_BITUNPACK_,64_52),
+  &TEMPLATE2(_BITUNPACK_,64_53),
+  &TEMPLATE2(_BITUNPACK_,64_54),
+  &TEMPLATE2(_BITUNPACK_,64_55),
+  &TEMPLATE2(_BITUNPACK_,64_56),
+  &TEMPLATE2(_BITUNPACK_,64_57),
+  &TEMPLATE2(_BITUNPACK_,64_58),
+  &TEMPLATE2(_BITUNPACK_,64_59),
+  &TEMPLATE2(_BITUNPACK_,64_60),
+  &TEMPLATE2(_BITUNPACK_,64_61),
+  &TEMPLATE2(_BITUNPACK_,64_62),
+  &TEMPLATE2(_BITUNPACK_,64_63),
+  &TEMPLATE2(_BITUNPACK_,64_64)
 };
-unsigned char *T2(_BITUNPACK_,64)( const unsigned char *__restrict in, unsigned n, uint64_t  *__restrict out , uint64_t start, unsigned b) { return T2(_BITUNPACK_,a64)[ b](in, n, out, start); }
+unsigned char *TEMPLATE2(_BITUNPACK_,64)( const unsigned char *__restrict in, unsigned n, uint64_t  *__restrict out , uint64_t start, unsigned b) { return TEMPLATE2(_BITUNPACK_,a64)[ b](in, n, out, start); }
 
 #endif
 #endif //OPI
diff --git a/src/ext/for/bitutil.c b/src/ext/for/bitutil.c
index 075a5727..5edca0a0 100644
--- a/src/ext/for/bitutil.c
+++ b/src/ext/for/bitutil.c
@@ -1,6 +1,6 @@
 /**
-    Copyright (C) powturbo 2013-2023
-    SPDX-License-Identifier: GPL v2 License
+    Copyright (C) powturbo 2013-2019
+    GPL v2 License
 
     This program is free software; you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
@@ -21,198 +21,14 @@
     - twitter  : https://twitter.com/powturbo
     - email    : powturbo [_AT_] gmail [_DOT_] com
 **/
-//   "Integer Compression" utility - delta, for, zigzag / Floating point compression
-#pragma warning( disable : 4005)
-#pragma warning( disable : 4090)
-#pragma warning( disable : 4068)
-
+//    "Integer Compression" utility - delta, for, zigzag / Floating point compression
 #include <math.h> //nan
-#include "include_/conf.h"
-#include "include_/bitutil.h"
-
-#include "include_/bitutil_.h"
-
-#define BT(_i_) { o |= ip[_i_]; x |= ip[_i_] ^ u0; }
-
-#ifdef __AVX2__
-
-uint32_t bit256v32(uint32_t *in, unsigned n, uint32_t *px) {
-  uint32_t o = 0,x,u0 = in[0], *ip = in;
-  __m256i vb0 = _mm256_set1_epi32(*in), 
-          vo0 = _mm256_setzero_si256(), vx0 = _mm256_setzero_si256(),
-          vo1 = _mm256_setzero_si256(), vx1 = _mm256_setzero_si256();
-  for(; ip != in+(n&~(16-1)); ip += 16) {                                PREFETCH(ip+512,0);
-    __m256i v0 = _mm256_loadu_si256((__m256i *) ip);
-    __m256i v1 = _mm256_loadu_si256((__m256i *)(ip+8));
-    vo0 = _mm256_or_si256(vo0, v0);
-    vo1 = _mm256_or_si256(vo1, v1);
-    vx0 = _mm256_or_si256(vx0, _mm256_xor_si256(v0, vb0));
-    vx1 = _mm256_or_si256(vx1, _mm256_xor_si256(v1, vb0));
-  }
-  vo0 = _mm256_or_si256(vo0, vo1); o = mm256_hor_epi32(vo0);
-  vx0 = _mm256_or_si256(vx0, vx1); x = mm256_hor_epi32(vx0);
-  for(; ip != in+n; ip++) BT(0);
-  if(px) *px = x;
-  return o;
-}
-
-// delta ---------------------------------------------------------------------------------------------------------------
-uint32_t bitd256v32(uint32_t *in, unsigned n, uint32_t *px, uint32_t start) { 
-  uint32_t o = 0, x, *ip = in, u0 = in[0] - start;
-  __m256i vb0 = _mm256_set1_epi32(u0),
-          vo0 = _mm256_setzero_si256(), vx0 = _mm256_setzero_si256(),
-          vo1 = _mm256_setzero_si256(), vx1 = _mm256_setzero_si256();           __m256i vs = _mm256_set1_epi32(start);
-  for(; ip != in+(n&~(16-1)); ip += 16) {                                PREFETCH(ip+512,0);
-    __m256i vi0 = _mm256_loadu_si256((__m256i *) ip);
-    __m256i vi1 = _mm256_loadu_si256((__m256i *)(ip+8));                        __m256i v0 = mm256_delta_epi32(vi0,vs); vs = vi0;
-                                                                                __m256i v1 = mm256_delta_epi32(vi1,vs); vs = vi1;
-    vo0 = _mm256_or_si256(vo0, v0);
-    vo1 = _mm256_or_si256(vo1, v1);
-    vx0 = _mm256_or_si256(vx0, _mm256_xor_si256(v0, vb0));
-    vx1 = _mm256_or_si256(vx1, _mm256_xor_si256(v1, vb0));
-  }                                                                             start = (unsigned)_mm256_extract_epi32(vs, 7);
-  vo0 = _mm256_or_si256(vo0, vo1); o = mm256_hor_epi32(vo0);
-  vx0 = _mm256_or_si256(vx0, vx1); x = mm256_hor_epi32(vx0);
-
-  for(;ip != in+n; ip++) {
-    uint32_t u = *ip - start; start = *ip;
-    o |= u;
-    x |= u ^ u0;
-  }
-  if(px) *px = x;
-  return o;
-}
-
-void bitddec256v32(uint32_t *in, unsigned n, unsigned start) {
-  unsigned *ip = in;
-  __m256i vs = _mm256_set1_epi32(start);
-  for(; ip != in+(n&~(8-1)); ip += 8) {
-    __m256i v =  _mm256_loadu_si256((__m256i *)ip);
-    vs = mm256_scan_epi32(v,vs);
-    _mm256_storeu_si256((__m256i *)ip, vs);
-  }
-  start = (unsigned)_mm256_extract_epi32(vs, 7);
-  while(ip != in+n) {
-    *ip = (start += (*ip));
-    ip++;
-  }
-}
-
-//-- delta 1 --------------------------------------------------------------------------------------------------------------------------------------
-uint32_t bitd1256v32(uint32_t *in, unsigned n, uint32_t *px, uint32_t start) {
-  uint32_t o, x, *ip = in, u0 = in[0]-start-1;
-   __m256i vb0 = _mm256_set1_epi32(u0),
-           vo0 = _mm256_setzero_si256(), vx0 = _mm256_setzero_si256(),
-           vo1 = _mm256_setzero_si256(), vx1 = _mm256_setzero_si256();          __m256i vs = _mm256_set1_epi32(start), cv = _mm256_set1_epi32(1);
-  for(; ip != in+(n&~(16-1)); ip += 16) {                                       PREFETCH(ip+512,0);
-    __m256i vi0 = _mm256_loadu_si256((__m256i *)ip);
-    __m256i vi1 = _mm256_loadu_si256((__m256i *)(ip+8));                        __m256i v0 = _mm256_sub_epi32(mm256_delta_epi32(vi0,vs),cv); vs = vi0;
-                                                                                __m256i v1 = _mm256_sub_epi32(mm256_delta_epi32(vi1,vs),cv); vs = vi1;
-    vo0 = _mm256_or_si256(vo0, v0);
-    vo1 = _mm256_or_si256(vo1, v1);
-    vx0 = _mm256_or_si256(vx0, _mm256_xor_si256(v0, vb0));
-    vx1 = _mm256_or_si256(vx1, _mm256_xor_si256(v1, vb0));
-  }                                                                             start = (unsigned)_mm256_extract_epi32(vs, 7);
-  vo0 = _mm256_or_si256(vo0, vo1); o = mm256_hor_epi32(vo0);
-  vx0 = _mm256_or_si256(vx0, vx1); x = mm256_hor_epi32(vx0);
-  for(;ip != in+n; ip++) {
-    uint32_t u = ip[0] - start-1; start = *ip;
-    o |= u;
-    x |= u ^ u0;
-}
-  if(px) *px = x;
-  return o;
-}
-
-void bitd1dec256v32(uint32_t *in, unsigned n, uint32_t start) {
-  __m256i vs = _mm256_set1_epi32(start),zv = _mm256_setzero_si256(), cv = _mm256_set_epi32(8,7,6,5,4,3,2,1);
-  unsigned *ip = in;
-  for(; ip != in+(n&~(8-1)); ip += 8) {
-    __m256i v =  _mm256_loadu_si256((__m256i *)ip);                             vs = mm256_scani_epi32(v, vs, cv);
-    _mm256_storeu_si256((__m256i *)ip, vs);
-  }
-                                                                                start = (unsigned)_mm256_extract_epi32(vs, 7);
-  while(ip != in+n) {
-    *ip = (start += (*ip) + 1);
-    ip++;
-  }
-}
-
-//--  Xor ----------------------------------------------------------------------------------------------------------------------
-uint32_t bitx256v32(unsigned *in, unsigned n, uint32_t *px, unsigned start) {
-  uint32_t o = 0, *ip = in;
-  __m256i vo0 = _mm256_setzero_si256(),
-          vo1 = _mm256_setzero_si256(), 
-		   vs = _mm256_set1_epi32(start);
-		   
-  for(ip = in; ip != in+(n&~(16-1)); ip += 16) {                                //PREFETCH(ip+512,0);
-    __m256i vi0 = _mm256_loadu_si256((__m256i *) ip);
-    __m256i vi1 = _mm256_loadu_si256((__m256i *)(ip+8));                        __m256i v0 = mm256_xore_epi32(vi0,vs); vs = vi0; 
-                                                                                __m256i v1 = mm256_xore_epi32(vi1,vs); vs = vi1; 
-    vo0 = _mm256_or_si256(vo0, v0);
-    vo1 = _mm256_or_si256(vo1, v1);
-  }                                                                             start = (unsigned)_mm256_extract_epi32(vs, 7);
-  vo0 = _mm256_or_si256(vo0, vo1); o = mm256_hor_epi32(vo0);
-  for(;ip != in+n; ip++) {
-    o |= ip[0] ^ start; start = ip[0];
-  }
-  if(px) *px = o;
-  return o; 
-}
-
-//-- zigzag ------------------------------------------------------------------------------------------------------------------------------------------------
-uint32_t bitz256v32(unsigned *in, unsigned n, uint32_t *px, unsigned start) {
-  uint32_t o, x, *ip; uint32_t u0 = zigzagenc32((int)in[0] - (int)start);
-  __m256i vb0 = _mm256_set1_epi32(u0), 
-          vo0 = _mm256_setzero_si256(), vx0 = _mm256_setzero_si256(),
-          vo1 = _mm256_setzero_si256(), vx1 = _mm256_setzero_si256(),
-		   vs = _mm256_set1_epi32(start);
-		   
-  for(ip = in; ip != in+(n&~(16-1)); ip += 16) {                                //PREFETCH(ip+512,0);
-    __m256i vi0 = _mm256_loadu_si256((__m256i *) ip);
-    __m256i vi1 = _mm256_loadu_si256((__m256i *)(ip+8));                        __m256i v0 = mm256_delta_epi32(vi0,vs); vs = vi0; v0 = mm256_zzage_epi32(v0);
-                                                                                __m256i v1 = mm256_delta_epi32(vi1,vs); vs = vi1; v1 = mm256_zzage_epi32(v1);
-    vo0 = _mm256_or_si256(vo0, v0);
-    vo1 = _mm256_or_si256(vo1, v1);
-    vx0 = _mm256_or_si256(vx0, _mm256_xor_si256(v0, vb0));
-    vx1 = _mm256_or_si256(vx1, _mm256_xor_si256(v1, vb0));
-  }                                                                             start = (unsigned)_mm256_extract_epi32(vs, 7);
-  vo0 = _mm256_or_si256(vo0, vo1); o = mm256_hor_epi32(vo0);
-  vx0 = _mm256_or_si256(vx0, vx1); x = mm256_hor_epi32(vx0);
+#include "conf.h"
+#define BITUTIL_IN
+#include "bitutil.h"
 
-  for(;ip != in+n; ip++) {
-    uint32_t u = zigzagenc32((int)ip[0] - (int)start); start = *ip; //((int)(*ip) - (int)start);    //i = (i << 1) ^ (i >> 31);
-    o |= u;
-    x |= u ^ u0;
-  }
-  if(px) *px = x;
-  return o; 
-}
-
-/* slower than SSE
-void bitzdec256v32(unsigned *in, unsigned n, unsigned start) {
-  __m256i vs = _mm256_set1_epi32(start);
-  unsigned *ip = in;
-  for(; ip != in+(n&~(16-1)); ip += 16) {
-    __m256i iv0 = _mm256_loadu_si256((__m256i *)ip),
-            iv1 = _mm256_loadu_si256((__m256i *)(ip+8));
-    iv0 = mm256_zzagd_epi32(iv0);
-    iv1 = mm256_zzagd_epi32(iv1);
-    vs = mm256_scan_epi32(iv0, vs);
-	//__m256i _vs = vs;
-    _mm256_storeu_si256((__m256i *)ip, vs);
-    vs = mm256_scan_epi32(iv1, vs);
-    _mm256_storeu_si256((__m256i *)(ip+8), vs);
-  }
-  start = (unsigned)_mm256_extract_epi32(_mm256_srli_si256(vs,12), 4);
-  while(ip != in+n) {
-    unsigned z = *ip;
-    *ip++ = (start += (z >> 1 ^ -(z & 1)));
-  }
-}*/
-
-#else // avx2
 //------------ 'or' for bitsize + 'xor' for all duplicate ------------------
+#define BT(_i_) { o |= ip[_i_]; x |= ip[_i_] ^ u0; }
 #define BIT(_in_, _n_, _usize_) {\
   u0 = _in_[0]; o = x = 0;\
   for(ip = _in_; ip != _in_+(_n_&~(4-1)); ip += 4) { BT(0); BT(1); BT(2); BT(3); }\
@@ -223,13 +39,11 @@ uint8_t  bit8( uint8_t  *in, unsigned n, uint8_t  *px) { uint8_t  o,x,u0,*ip; BI
 uint64_t bit64(uint64_t *in, unsigned n, uint64_t *px) { uint64_t o,x,u0,*ip; BIT(in, n, 64); if(px) *px = x; return o; }
 
 uint16_t bit16(uint16_t *in, unsigned n, uint16_t *px) {
-  uint16_t o, x, u0 = in[0], *ip = in;
-  
+  uint16_t o, x, u0 = in[0], *ip;
     #if defined(__SSE2__) || defined(__ARM_NEON)
-  __m128i vo0 = _mm_setzero_si128(), vx0 = _mm_setzero_si128(),
-          vo1 = _mm_setzero_si128(), vx1 = _mm_setzero_si128(), vb0 = _mm_set1_epi16(u0);
-									
-  for(; ip != in+(n&~(16-1)); ip += 16) {                                PREFETCH(ip+512,0);
+  __m128i vb0 = _mm_set1_epi16(u0), vo0 = _mm_setzero_si128(), vx0 = _mm_setzero_si128(),
+                                     vo1 = _mm_setzero_si128(), vx1 = _mm_setzero_si128();
+  for(ip = in; ip != in+(n&~(16-1)); ip += 16) {                                PREFETCH(ip+512,0);
     __m128i v0 = _mm_loadu_si128((__m128i *) ip);
     __m128i v1 = _mm_loadu_si128((__m128i *)(ip+8));
     vo0 = _mm_or_si128( vo0, v0);
@@ -240,22 +54,32 @@ uint16_t bit16(uint16_t *in, unsigned n, uint16_t *px) {
   vo0 = _mm_or_si128(vo0, vo1); o = mm_hor_epi16(vo0);
   vx0 = _mm_or_si128(vx0, vx1); x = mm_hor_epi16(vx0);
     #else
-  ip = in; o = x = 0;
+  ip = in; o = x = 0; //BIT( in, n, 16);
     #endif
-	
   for(; ip != in+n; ip++) BT(0);
   if(px) *px = x;
   return o;
 }
 
 uint32_t bit32(uint32_t *in, unsigned n, uint32_t *px) {
-  uint32_t o,x,u0 = in[0], *ip = in;
-  
-    #if defined(__SSE2__) || defined(__ARM_NEON)
-  __m128i vo0 = _mm_setzero_si128(), vx0 = _mm_setzero_si128(),
-          vo1 = _mm_setzero_si128(), vx1 = _mm_setzero_si128(), vb0 = _mm_set1_epi32(u0);
-									
-  for(; ip != in+(n&~(8-1)); ip += 8) {                                  PREFETCH(ip+512,0);
+  uint32_t o,x,u0 = in[0], *ip;
+    #ifdef __AVX2__
+  __m256i vb0 = _mm256_set1_epi32(*in), vo0 = _mm256_setzero_si256(), vx0 = _mm256_setzero_si256(),
+                                        vo1 = _mm256_setzero_si256(), vx1 = _mm256_setzero_si256();
+  for(ip = in; ip != in+(n&~(16-1)); ip += 16) {                                PREFETCH(ip+512,0);
+    __m256i v0 = _mm256_loadu_si256((__m256i *) ip);
+    __m256i v1 = _mm256_loadu_si256((__m256i *)(ip+8));
+    vo0 = _mm256_or_si256(vo0, v0);
+    vo1 = _mm256_or_si256(vo1, v1);
+    vx0 = _mm256_or_si256(vx0, _mm256_xor_si256(v0, vb0));
+    vx1 = _mm256_or_si256(vx1, _mm256_xor_si256(v1, vb0));
+  }
+  vo0 = _mm256_or_si256(vo0, vo1); o = mm256_hor_epi32(vo0);
+  vx0 = _mm256_or_si256(vx0, vx1); x = mm256_hor_epi32(vx0);
+    #elif defined(__SSE2__) || defined(__ARM_NEON)
+  __m128i vb0 = _mm_set1_epi32(u0), vo0 = _mm_setzero_si128(), vx0 = _mm_setzero_si128(),
+                                     vo1 = _mm_setzero_si128(), vx1 = _mm_setzero_si128();
+  for(ip = in; ip != in+(n&~(8-1)); ip += 8) {                                  PREFETCH(ip+512,0);
     __m128i v0 = _mm_loadu_si128((__m128i *) ip);
     __m128i v1 = _mm_loadu_si128((__m128i *)(ip+4));
     vo0 = _mm_or_si128(vo0, v0);
@@ -266,9 +90,8 @@ uint32_t bit32(uint32_t *in, unsigned n, uint32_t *px) {
   vo0 = _mm_or_si128(vo0, vo1); o = mm_hor_epi32(vo0);
   vx0 = _mm_or_si128(vx0, vx1); x = mm_hor_epi32(vx0);
     #else
-  ip = in; o = x = 0;
+  ip = in; o = x = 0; //BIT( in, n, 32);
     #endif
-	
   for(; ip != in+n; ip++) BT(0);
   if(px) *px = x;
   return o;
@@ -286,12 +109,12 @@ uint8_t   bitd8( uint8_t  *in, unsigned n, uint8_t  *px, uint8_t  start) { uint8
 uint64_t  bitd64(uint64_t *in, unsigned n, uint64_t *px, uint64_t start) { uint64_t u, u0 = in[0]-start, o, x; BITDE(uint64_t, in, n, 0, o |= u; x |= u^u0); if(px) *px = x; return o; }
 
 uint16_t bitd16(uint16_t *in, unsigned n, uint16_t *px, uint16_t start) {
-  uint16_t o, x, *ip = in, u0 = in[0] - start;
+  uint16_t o, x, *ip, u0 = in[0]-start;
     #if defined(__SSE2__) || defined(__ARM_NEON)
   __m128i vb0 = _mm_set1_epi16(u0),
           vo0 = _mm_setzero_si128(), vx0 = _mm_setzero_si128(),
           vo1 = _mm_setzero_si128(), vx1 = _mm_setzero_si128();                 __m128i vs = _mm_set1_epi16(start);
-  for(; ip != in+(n&~(16-1)); ip += 16) {                                PREFETCH(ip+512,0);
+  for(ip = in; ip != in+(n&~(16-1)); ip += 16) {                                PREFETCH(ip+512,0);
     __m128i vi0 = _mm_loadu_si128((__m128i *) ip);
     __m128i vi1 = _mm_loadu_si128((__m128i *)(ip+8));                           __m128i v0 = mm_delta_epi16(vi0,vs); vs = vi0;
                                                                                 __m128i v1 = mm_delta_epi16(vi1,vs); vs = vi1;
@@ -305,7 +128,6 @@ uint16_t bitd16(uint16_t *in, unsigned n, uint16_t *px, uint16_t start) {
     #else
   ip = in; o = x = 0;
     #endif
-	
   for(;ip != in+n; ip++) {
     uint16_t u = *ip - start; start = *ip;
     o |= u;
@@ -316,13 +138,27 @@ uint16_t bitd16(uint16_t *in, unsigned n, uint16_t *px, uint16_t start) {
 }
 
 uint32_t bitd32(uint32_t *in, unsigned n, uint32_t *px, uint32_t start) {
-  uint32_t o = 0, x=0, *ip = in, u0 = in[0] - start;
-  
-    #if defined(__SSE2__) || defined(__ARM_NEON)
+  uint32_t o, x, *ip, u0 = in[0] - start;
+    #ifdef __AVX2__
+  __m256i vb0 = _mm256_set1_epi32(u0),
+          vo0 = _mm256_setzero_si256(), vx0 = _mm256_setzero_si256(),
+          vo1 = _mm256_setzero_si256(), vx1 = _mm256_setzero_si256();           __m256i vs = _mm256_set1_epi32(start);
+  for(ip = in; ip != in+(n&~(16-1)); ip += 16) {                                PREFETCH(ip+512,0);
+    __m256i vi0 = _mm256_loadu_si256((__m256i *) ip);
+    __m256i vi1 = _mm256_loadu_si256((__m256i *)(ip+8));                        __m256i v0 = mm256_delta_epi32(vi0,vs); vs = vi0;
+                                                                                __m256i v1 = mm256_delta_epi32(vi1,vs); vs = vi1;
+    vo0 = _mm256_or_si256(vo0, v0);
+    vo1 = _mm256_or_si256(vo1, v1);
+    vx0 = _mm256_or_si256(vx0, _mm256_xor_si256(v0, vb0));
+    vx1 = _mm256_or_si256(vx1, _mm256_xor_si256(v1, vb0));
+  }                                                                             start = (unsigned)_mm256_extract_epi32(vs, 7);
+  vo0 = _mm256_or_si256(vo0, vo1); o = mm256_hor_epi32(vo0);
+  vx0 = _mm256_or_si256(vx0, vx1); x = mm256_hor_epi32(vx0);
+    #elif defined(__SSE2__) || defined(__ARM_NEON)
   __m128i vb0 = _mm_set1_epi32(u0),
           vo0 = _mm_setzero_si128(), vx0 = _mm_setzero_si128(),
           vo1 = _mm_setzero_si128(), vx1 = _mm_setzero_si128();                 __m128i vs = _mm_set1_epi32(start);
-  for(; ip != in+(n&~(8-1)); ip += 8) {                                  PREFETCH(ip+512,0);
+  for(ip = in; ip != in+(n&~(8-1)); ip += 8) {                                  PREFETCH(ip+512,0);
     __m128i vi0 = _mm_loadu_si128((__m128i *)ip);
     __m128i vi1 = _mm_loadu_si128((__m128i *)(ip+4));                           __m128i v0 = mm_delta_epi32(vi0,vs); vs = vi0;
                                                                                 __m128i v1 = mm_delta_epi32(vi1,vs); vs = vi1;
@@ -336,7 +172,6 @@ uint32_t bitd32(uint32_t *in, unsigned n, uint32_t *px, uint32_t start) {
     #else
   ip = in; o = x = 0;
     #endif
-	
   for(;ip != in+n; ip++) {
     uint32_t u = *ip - start; start = *ip;
     o |= u;
@@ -348,39 +183,48 @@ uint32_t bitd32(uint32_t *in, unsigned n, uint32_t *px, uint32_t start) {
 
 //----- Undelta: In-place prefix sum (min. Delta = 0) -------------------
 #define DD(i) _ip[i] = (start += _ip[i] + _md);
-#define BITDD(_t_, _in_, _n_, _md_) { _t_ *_ip; const unsigned _md = _md_;\
+#define BITDD(_t_, _in_, _n_, _md_) { _t_ *_ip; const int _md = _md_;\
   for(_ip = _in_; _ip != _in_+(_n_&~(4-1)); _ip += 4) { DD(0); DD(1); DD(2); DD(3); }\
   for(;_ip != _in_+_n_; _ip++) DD(0);\
 }
 
-void bitddec8( uint8_t  *in, unsigned n, uint8_t  start) { BITDD(uint8_t,  in, n, 0); }
-void bitddec16(uint16_t *in, unsigned n, uint16_t start) { BITDD(uint16_t, in, n, 0); }
-void bitddec64(uint64_t *in, unsigned n, uint64_t start) { BITDD(uint64_t, in, n, 0); }
-
-void bitddec32(uint32_t *in, unsigned n, unsigned start) {
-    #if defined(__SSSE3__) || defined(__ARM_NEON)
+void bitddec8( uint8_t  *p, unsigned n, uint8_t  start) { BITDD(uint8_t,  p, n, 0); }
+void bitddec16(uint16_t *p, unsigned n, uint16_t start) { BITDD(uint16_t, p, n, 0); }
+void bitddec64(uint64_t *p, unsigned n, uint64_t start) { BITDD(uint64_t, p, n, 0); }
+void bitddec32(uint32_t *p, unsigned n, unsigned start) {
+    #ifdef __AVX2__
+  __m256i vs = _mm256_set1_epi32(start);
+  unsigned *ip;
+  for(ip = p; ip != p+(n&~(8-1)); ip += 8) {
+    __m256i v =  _mm256_loadu_si256((__m256i *)ip);
+    vs = mm256_scan_epi32(v,vs);
+    _mm256_storeu_si256((__m256i *)ip, vs);
+  }
+  start = (unsigned)_mm256_extract_epi32(vs, 7);
+  while(ip != p+n) {
+    *ip = (start += (*ip));
+    ip++;
+  }
+    #elif defined(__SSE2__) || defined(__ARM_NEON)
   __m128i vs = _mm_set1_epi32(start);
-  unsigned *ip = in;
-  for(; ip != in+(n&~(8-1)); ip += 8) {
-    __m128i v0 =  _mm_loadu_si128((__m128i *)ip);
-    __m128i v1 =  _mm_loadu_si128((__m128i *)(ip+4));
-    vs = mm_scan_epi32(v0, vs);
+  unsigned *ip;
+  for(ip = p; ip != p+(n&~(4-1)); ip += 4) {
+    __m128i v =  _mm_loadu_si128((__m128i *)ip);
+    vs = mm_scan_epi32(v, vs);
     _mm_storeu_si128((__m128i *)ip, vs);
-    vs = mm_scan_epi32(v1, vs);
-    _mm_storeu_si128((__m128i *)(ip+4), vs);
   }
   start = (unsigned)_mm_cvtsi128_si32(_mm_srli_si128(vs,12));
-  while(ip != in+n) {
+  while(ip != p+n) {
     *ip = (start += (*ip));
     ip++;
   }
     #else
-  BITDD(uint32_t, in, n, 0);
+  BITDD(uint32_t, p, n, 0);
     #endif
 }
 
-//----------- Zigzag Delta ----------------------------------------------------------------------------------------------------------------------------------------------------------------
-#define ZDE(i, _usize_) d = (_ip[i]-start)-_md; u = T2(zigzagenc, _usize_)(d - startd); startd = d; start = _ip[i]
+//----------- Zigzag of Delta --------------------------
+#define ZDE(i, _usize_) d = (_ip[i]-start)-_md; u = TEMPLATE2(zigzagenc, _usize_)(d - startd); startd = d; start = _ip[i]
 #define BITZDE(_t_, _in_, _n_, _md_, _usize_, _act_) { _t_ *_ip, _md = _md_;\
   for(_ip = _in_; _ip != _in_+(_n_&~(4-1)); _ip += 4) { ZDE(0, _usize_);_act_; ZDE(1, _usize_);_act_; ZDE(2, _usize_);_act_; ZDE(3, _usize_);_act_; }\
   for(;_ip != _in_+_n_;_ip++) { ZDE(0, _usize_); _act_; }\
@@ -390,35 +234,48 @@ uint8_t  bitzz8( uint8_t  *in, unsigned n, uint8_t  *px, uint8_t  start) { uint8
 uint16_t bitzz16(uint16_t *in, unsigned n, uint16_t *px, uint16_t start) { uint16_t o=0, x=0,d,startd=0,u; BITZDE(uint16_t, in, n, 1, 16, o |= u; x |= u ^ in[0]); if(px) *px = x; return o; }
 uint32_t bitzz32(uint32_t *in, unsigned n, uint32_t *px, uint32_t start) { uint64_t o=0, x=0,d,startd=0,u; BITZDE(uint32_t, in, n, 1, 32, o |= u; x |= u ^ in[0]); if(px) *px = x; return o; }
 uint64_t bitzz64(uint64_t *in, unsigned n, uint64_t *px, uint64_t start) { uint64_t o=0, x=0,d,startd=0,u; BITZDE(uint64_t, in, n, 1, 64, o |= u; x |= u ^ in[0]); if(px) *px = x; return o; }
-
 uint8_t  bitzzenc8( uint8_t  *in, unsigned n, uint8_t  *out, uint8_t  start, uint8_t  mindelta) { uint8_t  o=0,*op = out,u,d,startd=0; BITZDE(uint8_t,  in, n, mindelta,  8,o |= u;*op++ = u); return o;}
 uint16_t bitzzenc16(uint16_t *in, unsigned n, uint16_t *out, uint16_t start, uint16_t mindelta) { uint16_t o=0,*op = out,u,d,startd=0; BITZDE(uint16_t, in, n, mindelta, 16,o |= u;*op++ = u); return o;}
 uint32_t bitzzenc32(uint32_t *in, unsigned n, uint32_t *out, uint32_t start, uint32_t mindelta) { uint32_t o=0,*op = out,u,d,startd=0; BITZDE(uint32_t, in, n, mindelta, 32,o |= u;*op++ = u); return o;}
 uint64_t bitzzenc64(uint64_t *in, unsigned n, uint64_t *out, uint64_t start, uint64_t mindelta) { uint64_t o=0,*op = out,u,d,startd=0; BITZDE(uint64_t, in, n, mindelta, 64,o |= u;*op++ = u); return o;}
 
 #define ZDD(i) u = _ip[i]; d = u - start; _ip[i] = zigzagdec64(u)+(int64_t)startd+_md; startd = d; start = u
-#define BITZDD(_t_, _in_, _n_, _md_) { _t_ *_ip, startd=0,d,u; const unsigned _md = _md_;\
+#define BITZDD(_t_, _in_, _n_, _md_) { _t_ *_ip, startd=0,d,u; const int _md = _md_;\
   for(_ip = _in_; _ip != _in_+(_n_&~(4-1)); _ip += 4) { ZDD(0); ZDD(1); ZDD(2); ZDD(3); }\
   for(;_ip != _in_+_n_; _ip++) ZDD(0);\
 }
-void bitzzdec8( uint8_t  *in, unsigned n, uint8_t  start) { BITZDD(uint8_t,  in, n, 1); }
-void bitzzdec16(uint16_t *in, unsigned n, uint16_t start) { BITZDD(uint16_t, in, n, 1); }
-void bitzzdec64(uint64_t *in, unsigned n, uint64_t start) { BITZDD(uint64_t, in, n, 1); }
-void bitzzdec32(uint32_t *in, unsigned n, uint32_t start) { BITZDD(uint32_t, in, n, 1); }
+void bitzzdec8( uint8_t  *p, unsigned n, uint8_t  start) { BITZDD(uint8_t,  p, n, 1); }
+void bitzzdec16(uint16_t *p, unsigned n, uint16_t start) { BITZDD(uint16_t, p, n, 1); }
+void bitzzdec64(uint64_t *p, unsigned n, uint64_t start) { BITZDD(uint64_t, p, n, 1); }
+void bitzzdec32(uint32_t *p, unsigned n, uint32_t start) { BITZDD(uint32_t, p, n, 1); }
 
 //-----Undelta: In-place prefix sum (min. Delta = 1) -------------------
-uint8_t  bitd18( uint8_t  *in, unsigned n, uint8_t  *px, uint8_t  start) { uint8_t  o=0,x=0,u; BITDE(uint8_t,  in, n, 1, o |= u; x |= u ^ in[0]); if(px) *px = x; return o; }
-uint16_t bitd116(uint16_t *in, unsigned n, uint16_t *px, uint16_t start) { uint16_t o=0,x=0,u; BITDE(uint16_t, in, n, 1, o |= u; x |= u ^ in[0]); if(px) *px = x; return o; }
-uint64_t bitd164(uint64_t *in, unsigned n, uint64_t *px, uint64_t start) { uint64_t o=0,x=0,u; BITDE(uint64_t, in, n, 1, o |= u; x |= u ^ in[0]); if(px) *px = x; return o; }
+uint8_t  bitd18( uint8_t  *in, unsigned n, uint8_t  *px, uint8_t  start) { uint8_t  o=0,x=0,u,*ip; BITDE(uint8_t,  in, n, 1, o |= u; x |= u ^ in[0]); if(px) *px = x; return o; }
+uint16_t bitd116(uint16_t *in, unsigned n, uint16_t *px, uint16_t start) { uint16_t o=0,x=0,u,*ip; BITDE(uint16_t, in, n, 1, o |= u; x |= u ^ in[0]); if(px) *px = x; return o; }
+uint64_t bitd164(uint64_t *in, unsigned n, uint64_t *px, uint64_t start) { uint64_t o=0,x=0,u,*ip; BITDE(uint64_t, in, n, 1, o |= u; x |= u ^ in[0]); if(px) *px = x; return o; }
 
 uint32_t bitd132(uint32_t *in, unsigned n, uint32_t *px, uint32_t start) {
-  uint32_t o = 0, x=0, *ip = in, u0 = in[0]-start-1;
-  
-   #if defined(__SSE2__) || defined(__ARM_NEON)
+  uint32_t o, x, *ip, u0 = in[0]-start-1;
+    #ifdef __AVX2__
+   __m256i vb0 = _mm256_set1_epi32(u0),
+           vo0 = _mm256_setzero_si256(), vx0 = _mm256_setzero_si256(),
+           vo1 = _mm256_setzero_si256(), vx1 = _mm256_setzero_si256();          __m256i vs = _mm256_set1_epi32(start), cv = _mm256_set1_epi32(1);
+  for(ip = in; ip != in+(n&~(16-1)); ip += 16) {                                PREFETCH(ip+512,0);
+    __m256i vi0 = _mm256_loadu_si256((__m256i *)ip);
+    __m256i vi1 = _mm256_loadu_si256((__m256i *)(ip+8));                        __m256i v0 = _mm256_sub_epi32(mm256_delta_epi32(vi0,vs),cv); vs = vi0;
+                                                                                __m256i v1 = _mm256_sub_epi32(mm256_delta_epi32(vi1,vs),cv); vs = vi1;
+    vo0 = _mm256_or_si256(vo0, v0);
+    vo1 = _mm256_or_si256(vo1, v1);
+    vx0 = _mm256_or_si256(vx0, _mm256_xor_si256(v0, vb0));
+    vx1 = _mm256_or_si256(vx1, _mm256_xor_si256(v1, vb0));
+  }                                                                             start = (unsigned)_mm256_extract_epi32(vs, 7);
+  vo0 = _mm256_or_si256(vo0, vo1); o = mm256_hor_epi32(vo0);
+  vx0 = _mm256_or_si256(vx0, vx1); x = mm256_hor_epi32(vx0);
+   #elif defined(__SSE2__) || defined(__ARM_NEON)
   __m128i vb0 = _mm_set1_epi32(u0),
           vo0 = _mm_setzero_si128(), vx0 = _mm_setzero_si128(),
           vo1 = _mm_setzero_si128(), vx1 = _mm_setzero_si128();                 __m128i vs = _mm_set1_epi32(start), cv = _mm_set1_epi32(1);
-  for(; ip != in+(n&~(8-1)); ip += 8) {                                  PREFETCH(ip+512,0);
+  for(ip = in; ip != in+(n&~(8-1)); ip += 8) {                                  PREFETCH(ip+512,0);
     __m128i vi0 = _mm_loadu_si128((__m128i *)ip);
     __m128i vi1 = _mm_loadu_si128((__m128i *)(ip+4));                           __m128i v0 = _mm_sub_epi32(mm_delta_epi32(vi0,vs),cv); vs = vi0;
                                                                                 __m128i v1 = _mm_sub_epi32(mm_delta_epi32(vi1,vs),cv); vs = vi1;
@@ -432,7 +289,6 @@ uint32_t bitd132(uint32_t *in, unsigned n, uint32_t *px, uint32_t start) {
     #else
   ip = in; o = x = 0;
     #endif
-	
   for(;ip != in+n; ip++) {
     uint32_t u = ip[0] - start-1; start = *ip;
     o |= u;
@@ -444,8 +300,8 @@ uint32_t bitd132(uint32_t *in, unsigned n, uint32_t *px, uint32_t start) {
 
 uint16_t bits128v16(uint16_t *in, unsigned n, uint16_t *px, uint16_t start) {
     #if defined(__SSE2__) || defined(__ARM_NEON)
-  uint16_t *ip = in,b; __m128i bv = _mm_setzero_si128(), vs = _mm_set1_epi16(start), cv = _mm_set1_epi16(8);
-  for(; ip != in+(n&~(8-1)); ip += 8) {
+  unsigned *ip,b; __m128i bv = _mm_setzero_si128(), vs = _mm_set1_epi16(start), cv = _mm_set1_epi16(8);
+  for(ip = in; ip != in+(n&~(4-1)); ip += 4) {
     __m128i iv = _mm_loadu_si128((__m128i *)ip);
     bv = _mm_or_si128(bv,_mm_sub_epi16(SUBI16x8(iv,vs),cv));
     vs = iv;
@@ -459,8 +315,8 @@ uint16_t bits128v16(uint16_t *in, unsigned n, uint16_t *px, uint16_t start) {
 
 unsigned bits128v32(uint32_t *in, unsigned n, uint32_t *px, uint32_t start) {
     #if defined(__SSE2__) || defined(__ARM_NEON)
-  unsigned *ip = in,b; __m128i bv = _mm_setzero_si128(), vs = _mm_set1_epi32(start), cv = _mm_set1_epi32(4);
-  for(; ip != in+(n&~(4-1)); ip += 4) {
+  unsigned *ip,b; __m128i bv = _mm_setzero_si128(), vs = _mm_set1_epi32(start), cv = _mm_set1_epi32(4);
+  for(ip = in; ip != in+(n&~(4-1)); ip += 4) {
     __m128i iv = _mm_loadu_si128((__m128i *)ip);
     bv = _mm_or_si128(bv,_mm_sub_epi32(SUBI32x4(iv,vs),cv));
     vs = iv;
@@ -472,26 +328,37 @@ unsigned bits128v32(uint32_t *in, unsigned n, uint32_t *px, uint32_t start) {
     #endif
 }
 
-void bitd1dec8( uint8_t  *in, unsigned n, uint8_t  start) { BITDD(uint8_t,  in, n, 1); }
-void bitd1dec16(uint16_t *in, unsigned n, uint16_t start) { BITDD(uint16_t, in, n, 1); }
-void bitd1dec64(uint64_t *in, unsigned n, uint64_t start) { BITDD(uint64_t, in, n, 1); }
-
-void bitd1dec32(uint32_t *in, unsigned n, uint32_t start) {
-    #if defined(__SSSE3__) || defined(__ARM_NEON)
+void bitd1dec8( uint8_t  *p, unsigned n, uint8_t  start) { BITDD(uint8_t,  p, n, 1); }
+void bitd1dec16(uint16_t *p, unsigned n, uint16_t start) { BITDD(uint16_t, p, n, 1); }
+void bitd1dec64(uint64_t *p, unsigned n, uint64_t start) { BITDD(uint64_t, p, n, 1); }
+void bitd1dec32(uint32_t *p, unsigned n, uint32_t start) {
+    #ifdef __AVX2__
+  __m256i vs = _mm256_set1_epi32(start),zv = _mm256_setzero_si256(), cv = _mm256_set_epi32(8,7,6,5,4,3,2,1);
+  unsigned *ip;
+  for(ip = p; ip != p+(n&~(8-1)); ip += 8) {
+    __m256i v =  _mm256_loadu_si256((__m256i *)ip);                             vs = mm256_scani_epi32(v, vs, cv);
+    _mm256_storeu_si256((__m256i *)ip, vs);
+  }
+                                                                                start = (unsigned)_mm256_extract_epi32(vs, 7);
+  while(ip != p+n) {
+    *ip = (start += (*ip) + 1);
+    ip++;
+  }
+    #elif defined(__SSE2__) || defined(__ARM_NEON)
   __m128i vs = _mm_set1_epi32(start), cv = _mm_set_epi32(4,3,2,1);
-  unsigned *ip = in;
-  for(; ip != in+(n&~(4-1)); ip += 4) {
+  unsigned *ip;
+  for(ip = p; ip != p+(n&~(4-1)); ip += 4) {
     __m128i v =  _mm_loadu_si128((__m128i *)ip);
     vs = mm_scani_epi32(v, vs, cv);
     _mm_storeu_si128((__m128i *)ip, vs);
   }
   start = (unsigned)_mm_cvtsi128_si32(_mm_srli_si128(vs,12));
-  while(ip != in+n) {
+  while(ip != p+n) {
     *ip = (start += (*ip) + 1);
     ip++;
   }
     #else
-  BITDD(uint32_t, in, n, 1);
+  BITDD(uint32_t, p, n, 1);
     #endif
 }
 
@@ -508,14 +375,14 @@ uint16_t bitdi16(uint16_t *in, unsigned n, uint16_t *px, uint16_t start) { uint1
 uint32_t bitdi32(uint32_t *in, unsigned n, uint32_t *px, uint32_t start) { uint32_t mindelta,u,*_ip; BITDIE(in, n); if(px) *px = 0; return mindelta; }
 uint64_t bitdi64(uint64_t *in, unsigned n, uint64_t *px, uint64_t start) { uint64_t mindelta,u,*_ip; BITDIE(in, n); if(px) *px = 0; return mindelta; }
 
-uint8_t  bitdienc8( uint8_t  *in, unsigned n, uint8_t  *out, uint8_t  start, uint8_t  mindelta) { uint8_t  o=0,x=0,*op = out,u; BITDE(uint8_t,  in, n, mindelta, o |= u; x |= u ^ in[0]; *op++ = u); return o; }
-uint16_t bitdienc16(uint16_t *in, unsigned n, uint16_t *out, uint16_t start, uint16_t mindelta) { uint16_t o=0,x=0,*op = out,u; BITDE(uint16_t, in, n, mindelta, o |= u; x |= u ^ in[0]; *op++ = u); return o; }
-uint64_t bitdienc64(uint64_t *in, unsigned n, uint64_t *out, uint64_t start, uint64_t mindelta) { uint64_t o=0,x=0,*op = out,u; BITDE(uint64_t, in, n, mindelta, o |= u; x |= u ^ in[0]; *op++ = u); return o; }
+uint8_t  bitdienc8( uint8_t  *in, unsigned n, uint8_t  *out, uint8_t  start, uint8_t  mindelta) { uint8_t  o=0,x=0,*op = out,u,*ip; BITDE(uint8_t,  in, n, mindelta, o |= u; x |= u ^ in[0]; *op++ = u); return o; }
+uint16_t bitdienc16(uint16_t *in, unsigned n, uint16_t *out, uint16_t start, uint16_t mindelta) { uint16_t o=0,x=0,*op = out,u,*ip; BITDE(uint16_t, in, n, mindelta, o |= u; x |= u ^ in[0]; *op++ = u); return o; }
+uint64_t bitdienc64(uint64_t *in, unsigned n, uint64_t *out, uint64_t start, uint64_t mindelta) { uint64_t o=0,x=0,*op = out,u,*ip; BITDE(uint64_t, in, n, mindelta, o |= u; x |= u ^ in[0]; *op++ = u); return o; }
 uint32_t bitdienc32(uint32_t *in, unsigned n, uint32_t *out, uint32_t start, uint32_t mindelta) {
     #if defined(__SSE2__) || defined(__ARM_NEON)
-  unsigned *ip = in,b,*op = out;
+  unsigned *ip,b,*op = out;
   __m128i bv = _mm_setzero_si128(), vs = _mm_set1_epi32(start), cv = _mm_set1_epi32(mindelta), dv;
-  for(; ip != in+(n&~(4-1)); ip += 4,op += 4) {
+  for(ip = in; ip != in+(n&~(4-1)); ip += 4,op += 4) {
     __m128i iv = _mm_loadu_si128((__m128i *)ip);
     bv = _mm_or_si128(bv, dv = _mm_sub_epi32(mm_delta_epi32(iv,vs),cv));
     vs = iv;
@@ -536,12 +403,12 @@ uint32_t bitdienc32(uint32_t *in, unsigned n, uint32_t *out, uint32_t start, uin
   return b;
 }
 
-void bitdidec8(  uint8_t  *in, unsigned n, uint8_t  start, uint8_t  mindelta) { BITDD(uint8_t,  in, n, mindelta); }
-void bitdidec16( uint16_t *in, unsigned n, uint16_t start, uint16_t mindelta) { BITDD(uint16_t, in, n, mindelta); }
-void bitdidec32( uint32_t *in, unsigned n, uint32_t start, uint32_t mindelta) { BITDD(uint32_t, in, n, mindelta); }
-void bitdidec64( uint64_t *in, unsigned n, uint64_t start, uint64_t mindelta) { BITDD(uint64_t, in, n, mindelta); }
+void bitdidec8(  uint8_t  *p, unsigned n, uint8_t  start, uint8_t  mindelta) { BITDD(uint8_t,  p, n, mindelta); }
+void bitdidec16( uint16_t *p, unsigned n, uint16_t start, uint16_t mindelta) { BITDD(uint16_t, p, n, mindelta); }
+void bitdidec32( uint32_t *p, unsigned n, uint32_t start, uint32_t mindelta) { BITDD(uint32_t, p, n, mindelta); }
+void bitdidec64( uint64_t *p, unsigned n, uint64_t start, uint64_t mindelta) { BITDD(uint64_t, p, n, mindelta); }
 
-//------------------- For ---------------------------------------------------------------------------------------------------
+//------------------- For ------------------------------
 uint8_t  bitf8(  uint8_t  *in, unsigned n, uint8_t  *px, uint8_t  start) { if(px) *px = 0; return n?in[n-1] - start    :0; }
 uint8_t  bitf18( uint8_t  *in, unsigned n, uint8_t  *px, uint8_t  start) { if(px) *px = 0; return n?in[n-1] - start - n:0; }
 uint16_t bitf16( uint16_t *in, unsigned n, uint16_t *px, uint16_t start) { if(px) *px = 0; return n?in[n-1] - start    :0; }
@@ -551,25 +418,24 @@ uint32_t bitf132(uint32_t *in, unsigned n, uint32_t *px, uint32_t start) { if(px
 uint64_t bitf64( uint64_t *in, unsigned n, uint64_t *px, uint64_t start) { if(px) *px = 0; return n?in[n-1] - start    :0; }
 uint64_t bitf164(uint64_t *in, unsigned n, uint64_t *px, uint64_t start) { if(px) *px = 0; return n?in[n-1] - start - n:0; }
 
-//------------------- Zigzag -------------------------------------------------------------------------------------------------------------------------------------
-#define ZE(i,_it_,_usize_) u = T2(zigzagenc, _usize_)((_it_)_ip[i]-(_it_)start); start = _ip[i]
-#define BITZENC(_ut_, _it_, _usize_, _in_,_n_, _act_) { _ut_ *_ip; x = -1;\
+//------------------- Zigzag ---------------------------
+#define ZE(i,_it_,_usize_) u = TEMPLATE2(zigzagenc, _usize_)((_it_)_ip[i]-(_it_)start); start = _ip[i]
+#define BITZENC(_ut_, _it_, _usize_, _in_,_n_, _act_) { _ut_ *_ip; o = 0; x = -1;\
   for(_ip = _in_; _ip != _in_+(_n_&~(4-1)); _ip += 4) { ZE(0,_it_,_usize_);_act_; ZE(1,_it_,_usize_);_act_; ZE(2,_it_,_usize_);_act_; ZE(3,_it_,_usize_);_act_; }\
   for(;_ip != _in_+_n_; _ip++) { ZE(0,_it_,_usize_); _act_; }\
 }
 
 // 'or' bits for zigzag encoding
-uint8_t  bitz8( uint8_t  *in, unsigned n, uint8_t  *px, uint8_t  start) { uint8_t  o=0, u,x; BITZENC(uint8_t,  int8_t, 8, in, n, o |= x); if(px) *px = 0; return o; }
-uint64_t bitz64(uint64_t *in, unsigned n, uint64_t *px, uint64_t start) { uint64_t o=0, u,x; BITZENC(uint64_t, int64_t,64,in, n, o |= x); if(px) *px = 0; return o; }
+uint8_t  bitz8( uint8_t  *in, unsigned n, uint8_t  *px, uint8_t  start) { uint8_t  o, u,x; BITZENC(uint8_t,  int8_t, 8, in, n, o |= x); if(px) *px = 0; return o; }
+uint64_t bitz64(uint64_t *in, unsigned n, uint64_t *px, uint64_t start) { uint64_t o, u,x; BITZENC(uint64_t, int64_t,64,in, n, o |= x); if(px) *px = 0; return o; }
 
 uint16_t bitz16(uint16_t *in, unsigned n, uint16_t *px, uint16_t start) {
-  uint16_t o, x, *ip = in; 
-  uint32_t u0 = zigzagenc16((int)in[0] - (int)start);
+  uint16_t o, x, *ip; uint32_t u0 = zigzagenc16((int)in[0] - (int)start);
 
     #if defined(__SSE2__) || defined(__ARM_NEON)
   __m128i vb0 = _mm_set1_epi16(u0), vo0 = _mm_setzero_si128(), vx0 = _mm_setzero_si128(),
                                     vo1 = _mm_setzero_si128(), vx1 = _mm_setzero_si128(); __m128i vs = _mm_set1_epi16(start);
-  for(; ip != in+(n&~(16-1)); ip += 16) {            PREFETCH(ip+512,0);
+  for(ip = in; ip != in+(n&~(16-1)); ip += 16) {            PREFETCH(ip+512,0);
     __m128i vi0 = _mm_loadu_si128((__m128i *) ip);
     __m128i vi1 = _mm_loadu_si128((__m128i *)(ip+8));                                      __m128i v0 = mm_delta_epi16(vi0,vs); vs = vi0; v0 = mm_zzage_epi16(v0);
                                                                                            __m128i v1 = mm_delta_epi16(vi1,vs); vs = vi1; v1 = mm_zzage_epi16(v1);
@@ -594,13 +460,27 @@ uint16_t bitz16(uint16_t *in, unsigned n, uint16_t *px, uint16_t start) {
 }
 
 uint32_t bitz32(unsigned *in, unsigned n, uint32_t *px, unsigned start) {
-  uint32_t o, x, *ip=in,
-           u0 = zigzagenc32((int)in[0] - (int)start);
-    #if defined(__SSE2__) || defined(__ARM_NEON)
+  uint32_t o, x, *ip; uint32_t u0 = zigzagenc32((int)in[0] - (int)start);
+    #ifdef __AVX2__
+   __m256i vb0 = _mm256_set1_epi32(u0), vo0 = _mm256_setzero_si256(), vx0 = _mm256_setzero_si256(),
+                                         vo1 = _mm256_setzero_si256(), vx1 = _mm256_setzero_si256(); __m256i vs = _mm256_set1_epi32(start);
+  for(ip = in; ip != in+(n&~(16-1)); ip += 16) {                                PREFETCH(ip+512,0);
+    __m256i vi0 = _mm256_loadu_si256((__m256i *) ip);
+    __m256i vi1 = _mm256_loadu_si256((__m256i *)(ip+8));                        __m256i v0 = mm256_delta_epi32(vi0,vs); vs = vi0; v0 = mm256_zzage_epi32(v0);
+                                                                                __m256i v1 = mm256_delta_epi32(vi1,vs); vs = vi1; v1 = mm256_zzage_epi32(v1);
+    vo0 = _mm256_or_si256(vo0, v0);
+    vo1 = _mm256_or_si256(vo1, v1);
+    vx0 = _mm256_or_si256(vx0, _mm256_xor_si256(v0, vb0));
+    vx1 = _mm256_or_si256(vx1, _mm256_xor_si256(v1, vb0));
+  }                                                                             start = (unsigned)_mm256_extract_epi32(vs, 7);
+  vo0 = _mm256_or_si256(vo0, vo1); o = mm256_hor_epi32(vo0);
+  vx0 = _mm256_or_si256(vx0, vx1); x = mm256_hor_epi32(vx0);
+
+    #elif defined(__SSE2__) || defined(__ARM_NEON)
    __m128i vb0 = _mm_set1_epi32(u0),
            vo0 = _mm_setzero_si128(), vx0 = _mm_setzero_si128(),
            vo1 = _mm_setzero_si128(), vx1 = _mm_setzero_si128();                __m128i vs = _mm_set1_epi32(start);
-  for(; ip != in+(n&~(8-1)); ip += 8) {                                  //PREFETCH(ip+512,0);
+  for(ip = in; ip != in+(n&~(8-1)); ip += 8) {                                  PREFETCH(ip+512,0);
     __m128i vi0 = _mm_loadu_si128((__m128i *) ip);
     __m128i vi1 = _mm_loadu_si128((__m128i *)(ip+4));                           __m128i v0 = mm_delta_epi32(vi0,vs); vs = vi0; v0 = mm_zzage_epi32(v0);
                                                                                 __m128i v1 = mm_delta_epi32(vi1,vs); vs = vi1; v1 = mm_zzage_epi32(v1);
@@ -608,7 +488,7 @@ uint32_t bitz32(unsigned *in, unsigned n, uint32_t *px, unsigned start) {
     vo1 = _mm_or_si128(vo1, v1);
     vx0 = _mm_or_si128(vx0, _mm_xor_si128(v0, vb0));
     vx1 = _mm_or_si128(vx1, _mm_xor_si128(v1, vb0));
-  }                                                                             start = _mm_cvtsi128_si32(_mm_srli_si128(vs,12));
+  }                                                                             start = mm_cvtsi128_si16(_mm_srli_si128(vs,12));
   vo0 = _mm_or_si128(vo0, vo1); o = mm_hor_epi32(vo0);
   vx0 = _mm_or_si128(vx0, vx1); x = mm_hor_epi32(vx0);
     #else
@@ -623,25 +503,19 @@ uint32_t bitz32(unsigned *in, unsigned n, uint32_t *px, unsigned start) {
   return o;
 }
 
-uint8_t  bitzenc8( uint8_t  *in, unsigned n, uint8_t  *out, uint8_t  start, uint8_t  mindelta) { uint8_t  o=0,x,u,*op = out; BITZENC(uint8_t,  int8_t,  8,in, n, o |= u; *op++ = u); return o; }
-uint16_t bitzenc16(uint16_t *in, unsigned n, uint16_t *out, uint16_t start, uint16_t mindelta) { uint16_t o=0,x,u,*op = out; BITZENC(uint16_t, int16_t,16,in, n, o |= u; *op++ = u); return o; }
-uint64_t bitzenc64(uint64_t *in, unsigned n, uint64_t *out, uint64_t start, uint64_t mindelta) { uint64_t o=0,x,u,*op = out; BITZENC(uint64_t, int64_t,64,in, n, o |= u; *op++ = u); return o; }
-
+uint8_t  bitzenc8( uint8_t  *in, unsigned n, uint8_t  *out, uint8_t  start, uint8_t  mindelta) { uint8_t  o,x,u,*op = out; BITZENC(uint8_t,  int8_t,  8,in, n, o |= u; *op++ = u); return o; }
+uint16_t bitzenc16(uint16_t *in, unsigned n, uint16_t *out, uint16_t start, uint16_t mindelta) { uint16_t o,x,u,*op = out; BITZENC(uint16_t, int16_t,16,in, n, o |= u; *op++ = u); return o; }
+uint64_t bitzenc64(uint64_t *in, unsigned n, uint64_t *out, uint64_t start, uint64_t mindelta) { uint64_t o,x,u,*op = out; BITZENC(uint64_t, int64_t,64,in, n, o |= u; *op++ = u); return o; }
 uint32_t bitzenc32(uint32_t *in, unsigned n, uint32_t *out, uint32_t start, uint32_t mindelta) {
     #if defined(__SSE2__) || defined(__ARM_NEON)
-  unsigned *ip = in,b,*op = out;
-  __m128i bv = _mm_setzero_si128(), vs = _mm_set1_epi32(start);
-  for(; ip != in+(n&~(8-1)); ip += 8,op += 8) {
-    __m128i iv0 = _mm_loadu_si128((__m128i *)ip),     dv0;
-	__m128i iv1 = _mm_loadu_si128((__m128i *)(ip+4)), dv1;
-    dv0 = mm_delta_epi32(iv0,vs); vs = iv0;
-    dv0 = mm_zzage_epi32(dv0);
-    bv = _mm_or_si128(bv, dv0);
-    dv1 = mm_delta_epi32(iv1,vs); vs = iv1;
-    dv1 = mm_zzage_epi32(dv1);
-    bv = _mm_or_si128(bv, dv1);
-    _mm_storeu_si128((__m128i *)op, dv0);
-    _mm_storeu_si128((__m128i *)(op+4), dv1);
+  unsigned *ip,b,*op = out;
+  __m128i bv = _mm_setzero_si128(), vs = _mm_set1_epi32(start), dv;
+  for(ip = in; ip != in+(n&~(4-1)); ip += 4,op += 4) {
+    __m128i iv = _mm_loadu_si128((__m128i *)ip);
+    dv = mm_delta_epi32(iv,vs); vs = iv;
+    dv = mm_zzage_epi32(dv);
+    bv = _mm_or_si128(bv, dv);
+    _mm_storeu_si128((__m128i *)op, dv);
   }
   start = (unsigned)_mm_cvtsi128_si32(_mm_srli_si128(vs,12));
   b = mm_hor_epi32(bv);
@@ -653,122 +527,81 @@ uint32_t bitzenc32(uint32_t *in, unsigned n, uint32_t *out, uint32_t start, uint
     *op++ = x;
   }
     #else
-  uint32_t b = 0, *op = out,x, u;
+  uint32_t b = 0, *op = out,x;
   BITZENC(uint32_t, int32_t, 32,in, n, b |= x; *op++ = x);
     #endif
   return bsr32(b);
 }
 
-#define ZD(_t_, _usize_, i) { _t_ _z = _ip[i]; _ip[i] = (start += T2(zigzagdec, _usize_)(_z)); }
+#define ZD(_t_, _usize_, i) { _t_ _z = _ip[i]; _ip[i] = (start += TEMPLATE2(zigzagdec, _usize_)(_z)); }
 #define BITZDEC(_t_, _usize_, _in_, _n_) { _t_ *_ip;\
   for(_ip = _in_; _ip != _in_+(_n_&~(4-1)); _ip += 4) { ZD(_t_, _usize_, 0); ZD(_t_, _usize_, 1); ZD(_t_, _usize_, 2); ZD(_t_, _usize_, 3); }\
   for(;_ip != _in_+_n_;_ip++) ZD(_t_, _usize_, 0);\
 }
 
-void bitzdec8( uint8_t  *in, unsigned n, uint8_t  start) { BITZDEC(uint8_t,  8, in, n); }
-void bitzdec64(uint64_t *in, unsigned n, uint64_t start) { BITZDEC(uint64_t, 64,in, n); }
+void bitzdec8( uint8_t  *p, unsigned n, uint8_t  start) { BITZDEC(uint8_t,  8, p, n); }
+void bitzdec64(uint64_t *p, unsigned n, uint64_t start) { BITZDEC(uint64_t, 64,p, n); }
 
-void bitzdec16(uint16_t *in, unsigned n, uint16_t start) {
+void bitzdec16(uint16_t *p, unsigned n, uint16_t start) {
     #if defined(__SSSE3__) || defined(__ARM_NEON)
   __m128i vs = _mm_set1_epi16(start); //, c1 = _mm_set1_epi32(1), cz = _mm_setzero_si128();
-  uint16_t *ip = in;
-  for(; ip != in+(n&~(8-1)); ip += 8) {
-    __m128i iv =  _mm_loadu_si128((__m128i *)ip);                   
-	iv = mm_zzagd_epi16(iv);
+  uint16_t *ip;
+  for(ip = p; ip != p+(n&~(8-1)); ip += 8) {
+    __m128i iv =  _mm_loadu_si128((__m128i *)ip);
+    iv = mm_zzagd_epi16(iv);
     vs = mm_scan_epi16(iv, vs);
     _mm_storeu_si128((__m128i *)ip, vs);
   }
   start = (uint16_t)_mm_cvtsi128_si32(_mm_srli_si128(vs,14));
-  while(ip != in+n) {
+  while(ip != p+n) {
     uint16_t z = *ip;
     *ip++ = (start += (z >> 1 ^ -(z & 1)));
   }
     #else
-  BITZDEC(uint16_t, 16, in, n);
+  BITZDEC(uint16_t, 16, p, n);
     #endif
 }
 
-void bitzdec32(unsigned *in, unsigned n, unsigned start) {
-    #if defined(__SSSE3__) || defined(__ARM_NEON)
-  __m128i vs = _mm_set1_epi32(start);
-  unsigned *ip = in;
-  for(; ip != in+(n&~(8-1)); ip += 8) {
-    __m128i iv0 = _mm_loadu_si128((__m128i *)ip),
-            iv1 = _mm_loadu_si128((__m128i *)(ip+4));
-    iv0 = mm_zzagd_epi32(iv0);
-    iv1 = mm_zzagd_epi32(iv1);
-    vs = mm_scan_epi32(iv0, vs);
-	__m128i _vs = vs;
-    vs = mm_scan_epi32(iv1, vs);
-    _mm_storeu_si128((__m128i *)ip, _vs);
-    _mm_storeu_si128((__m128i *)(ip+4), vs);
+void bitzdec32(unsigned *p, unsigned n, unsigned start) {
+    #ifdef __AVX2__
+  __m256i vs = _mm256_set1_epi32(start); //, zv = _mm256_setzero_si256()*/; //, c1 = _mm_set1_epi32(1), cz = _mm_setzero_si128();
+  unsigned *ip;
+  for(ip = p; ip != p+(n&~(8-1)); ip += 8) {
+    __m256i iv =  _mm256_loadu_si256((__m256i *)ip);
+    iv = mm256_zzagd_epi32(iv);
+    vs = mm256_scan_epi32(iv,vs);
+    _mm256_storeu_si256((__m256i *)ip, vs);
+  }
+  start = (unsigned)_mm256_extract_epi32(_mm256_srli_si256(vs,12), 4);
+  while(ip != p+n) {
+    unsigned z = *ip;
+    *ip++ = (start += (z >> 1 ^ -(z & 1)));
+  }
+    #elif defined(__SSE2__) || defined(__ARM_NEON)
+  __m128i vs = _mm_set1_epi32(start); //, c1 = _mm_set1_epi32(1), cz = _mm_setzero_si128();
+  unsigned *ip;
+  for(ip = p; ip != p+(n&~(4-1)); ip += 4) {
+    __m128i iv =  _mm_loadu_si128((__m128i *)ip);
+    iv = mm_zzagd_epi32(iv);
+    vs = mm_scan_epi32(iv, vs);
+    _mm_storeu_si128((__m128i *)ip, vs);
   }
   start = (unsigned)_mm_cvtsi128_si32(_mm_srli_si128(vs,12));
-  while(ip != in+n) {
+  while(ip != p+n) {
     unsigned z = *ip;
     *ip++ = (start += zigzagdec32(z));
   }
     #else
-  BITZDEC(uint32_t, 32, in, n);
+  BITZDEC(uint32_t, 32, p, n);
     #endif
 }
 
-//----------------------- XOR ------------------------------------------------------------------------------------------------------
+//----------------------- XOR : return max. bits ---------------------------------
 #define XE(i) x = _ip[i] ^ start; start = _ip[i]
 #define BITXENC(_t_, _in_, _n_, _act_) { _t_ *_ip;\
   for(_ip = _in_; _ip != _in_+(_n_&~(4-1)); _ip += 4) { XE(0);_act_; XE(1);_act_; XE(2);_act_; XE(3);_act_; }\
   for(        ; _ip != _in_+ _n_;         _ip++   ) { XE(0);_act_; }\
 }
-
-uint8_t  bitx8( uint8_t  *in, unsigned n, uint8_t  *px, uint8_t  start) { uint8_t  o=0, u=0,x; BITXENC(uint8_t,  in, n, o |= x); if(px) *px = 0; return o; }
-uint64_t bitx64(uint64_t *in, unsigned n, uint64_t *px, uint64_t start) { uint64_t o=0, u=0,x; BITXENC(uint64_t, in, n, o |= x); if(px) *px = 0; return o; }
-
-uint16_t bitx16(uint16_t *in, unsigned n, uint16_t *px, uint16_t start) {
-  uint16_t o = 0, *ip = in;
-
-    #if defined(__SSE2__) || defined(__ARM_NEON)
-  __m128i vo0 = _mm_setzero_si128(),
-          vo1 = _mm_setzero_si128(),
-           vs = _mm_set1_epi16(start);
-  for(; ip != in+(n&~(16-1)); ip += 16) {            //PREFETCH(ip+512,0);
-    __m128i vi0 = _mm_loadu_si128((__m128i *) ip);
-    __m128i vi1 = _mm_loadu_si128((__m128i *)(ip+8));                                      __m128i v0 = mm_xore_epi16(vi0,vs); vs = vi0; 
-                                                                                           __m128i v1 = mm_xore_epi16(vi1,vs); vs = vi1; 
-    vo0 = _mm_or_si128(vo0, v0);
-    vo1 = _mm_or_si128(vo1, v1);
-  }                                                                                         start = mm_cvtsi128_si16(_mm_srli_si128(vs,14));
-  vo0 = _mm_or_si128(vo0, vo1); o = mm_hor_epi16(vo0);
-    #endif
-  for(;ip != in+n; ip++) {
-    o |= ip[0] ^ start; start = ip[0];
-  }
-  if(px) *px = o;
-  return o;
-}
- 
-uint32_t bitx32(unsigned *in, unsigned n, uint32_t *px, uint32_t start) {
-  uint32_t o = 0, *ip = in;
-  
-    #if defined(__SSE2__) || defined(__ARM_NEON)
-  __m128i vo0 = _mm_setzero_si128(),
-          vo1 = _mm_setzero_si128(),                                          
-		   vs = _mm_set1_epi32(start);
-  for(; ip != in+(n&~(8-1)); ip += 8) {                                  //PREFETCH(ip+512,0);
-    __m128i vi0 = _mm_loadu_si128((__m128i *) ip);
-    __m128i vi1 = _mm_loadu_si128((__m128i *)(ip+4));                           __m128i v0 = mm_xore_epi32(vi0,vs); vs = vi0; 
-                                                                                __m128i v1 = mm_xore_epi32(vi1,vs); vs = vi1;
-    vo0 = _mm_or_si128(vo0, v0);
-    vo1 = _mm_or_si128(vo1, v1);
-  }                                                                             start = _mm_cvtsi128_si32(_mm_srli_si128(vs,12));
-  vo0 = _mm_or_si128(vo0, vo1); o = mm_hor_epi32(vo0);
-    #endif
-  for(;ip != in+n; ip++) {
-    o |= ip[0] ^ start; start = ip[0];
-  }
-  if(px) *px = o;
-  return o;
-}
-
 uint8_t  bitxenc8( uint8_t  *in, unsigned n, uint8_t  *out, uint8_t  start) { uint8_t  b = 0,*op = out,x; BITXENC(uint8_t,  in, n, b |= x; *op++ = x); return b; }
 uint16_t bitxenc16(uint16_t *in, unsigned n, uint16_t *out, uint16_t start) { uint16_t b = 0,*op = out,x; BITXENC(uint16_t, in, n, b |= x; *op++ = x); return b; }
 uint32_t bitxenc32(uint32_t *in, unsigned n, uint32_t *out, uint32_t start) { uint32_t b = 0,*op = out,x; BITXENC(uint32_t, in, n, b |= x; *op++ = x); return b; }
@@ -780,50 +613,10 @@ uint64_t bitxenc64(uint64_t *in, unsigned n, uint64_t *out, uint64_t start) { ui
   for(        ;_ip != _in_+ _n_        ; _ip++   )   XD(0);\
 }
 
-void bitxdec8( uint8_t  *in, unsigned n, uint8_t  start) { BITXDEC(uint8_t,  in, n); }
-void bitxdec64(uint64_t *in, unsigned n, uint64_t start) { BITXDEC(uint64_t, in, n); }
-
-void bitxdec16(uint16_t *in, unsigned n, uint16_t start) {
-    #if defined(__SSSE3__) || defined(__ARM_NEON)
-  __m128i vs = _mm_set1_epi16(start);
-  uint16_t *ip = in;
-  for(; ip != in+(n&~(8-1)); ip += 8) {
-    __m128i iv =  _mm_loadu_si128((__m128i *)ip);                   
-	vs = mm_xord_epi16(iv, vs);
-    _mm_storeu_si128((__m128i *)ip, vs);
-  }
-  start = (uint16_t)_mm_cvtsi128_si32(_mm_srli_si128(vs,14));
-  while(ip != in+n) {
-    uint16_t z = *ip;
-    *ip++ = (start ^= z);
-  }
-    #else
-  BITXDEC(uint16_t, in, n);
-    #endif
-}
-
-void bitxdec32(unsigned *in, unsigned n, unsigned start) {
-    #if defined(__SSSE3__) || defined(__ARM_NEON)
-  __m128i vs = _mm_set1_epi32(start);
-  unsigned *ip = in;
-  for(; ip != in+(n&~(8-1)); ip += 8) {
-    __m128i iv0 = _mm_loadu_si128((__m128i *)ip),
-            iv1 = _mm_loadu_si128((__m128i *)(ip+4));
-    vs = mm_xord_epi32(iv0, vs);
-	__m128i _vs = vs;
-    vs = mm_xord_epi32(iv1, vs);
-    _mm_storeu_si128((__m128i *)ip,    _vs);
-    _mm_storeu_si128((__m128i *)(ip+4), vs);
-  }
-  start = (unsigned)_mm_cvtsi128_si32(_mm_srli_si128(vs,12));
-  while(ip != in+n) {
-    unsigned z = *ip;
-    *ip++ = (start ^= z);
-  }
-    #else
-  BITXDEC(uint32_t, 32, in, n);
-    #endif
-}
+void bitxdec8( uint8_t  *p, unsigned n, uint8_t  start) { BITXDEC(uint8_t,  p, n); }
+void bitxdec16(uint16_t *p, unsigned n, uint16_t start) { BITXDEC(uint16_t, p, n); }
+void bitxdec32(uint32_t *p, unsigned n, uint32_t start) { BITXDEC(uint32_t, p, n); }
+void bitxdec64(uint64_t *p, unsigned n, uint64_t start) { BITXDEC(uint64_t, p, n); }
 
 //-------------- For : calc max. bits, min,max value ------------------------
 #define FM(i) mi = _ip[i] < mi?_ip[i]:mi; mx = _ip[i] > mx?_ip[i]:mx
@@ -837,125 +630,60 @@ uint16_t bitfm16(uint16_t *in, unsigned n, uint16_t  *px, uint16_t *pmin) { uint
 uint32_t bitfm32(uint32_t *in, unsigned n, uint32_t  *px, uint32_t *pmin) { uint32_t mi,mx; BITFM(uint32_t, in, n); *pmin = mi; if(px) *px = 0; return mx - mi; }
 uint64_t bitfm64(uint64_t *in, unsigned n, uint64_t  *px, uint64_t *pmin) { uint64_t mi,mx; BITFM(uint64_t, in, n); *pmin = mi; if(px) *px = 0; return mx - mi; }
 
-//---------------------- any esize ----------------------------------
-void bitxenc(unsigned char *in, unsigned n, unsigned char *out, unsigned esize) {
-  switch(esize) {
-    case 2 : bitxenc16(in, n/2, out, 0); break;
-    case 4 : bitxenc32(in, n/4, out, 0); break;
-    case 8 : bitxenc64(in, n/8, out, 0); break;
-    default: bitxenc8( in, n/1, out, 0); break;
-  }
-}
-
-void bitxdec(unsigned char *in, unsigned n, unsigned esize) {
-  switch(esize) {
-    case 2 : bitxdec16(in, n/2, 0);break;
-    case 4 : bitxdec32(in, n/4, 0);break;
-    case 8 : bitxdec64(in, n/8, 0);break;
-    default: bitxdec8( in, n/1, 0);break;
-  }
-}
-
-void bitzenc(unsigned char *in, unsigned n, unsigned char *out, unsigned esize) {
-  switch(esize) {
-    case 2 : bitzenc16(in, n/2, out, 0, 0); break;
-    case 4 : bitzenc32(in, n/4, out, 0, 0); break;
-    case 8 : bitzenc64(in, n/8, out, 0, 0); break;
-    default: bitzenc8( in, n/1, out, 0, 0); break;
-  }
-}
-
-void bitzdec(unsigned char *in, unsigned n, unsigned esize) {
-  switch(esize) {
-    case 2 : bitzdec16(in, n/2, 0);break;
-    case 4 : bitzdec32(in, n/4, 0);break;
-    case 8 : bitzdec64(in, n/8, 0);break;
-    default: bitzdec8( in, n/1, 0);break;
-  }
-}
-
 //----------- Lossy floating point conversion: pad the trailing mantissa bits with zero bits according to the relative error e (ex. 0.00001)  ----------
 
   #ifdef USE_FLOAT16
 // https://clang.llvm.org/docs/LanguageExtensions.html#half-precision-floating-point
 #define ctof16(_cp_) (*(_Float16 *)(_cp_))
 
-_Float16 _fprazor16(_Float16 d, float e, int lg2e) {
-  uint16_t du = ctou16(&d), sign, u;
-  int      b  = (du>>10 & 0x1f) - 15; // mantissa=10 bits, exponent=5bits, bias=15
-  _Float16 ed;
-  if ((b = 12 - b - lg2e) <= 0) 
-	return d;
-  b    = b > 10?10:b;
-  sign = du & (1<<15);
-  du  &= 0x7fff;
-  for(d = ctof16(&du), ed = e * d;;) {
-    u = du & (~((1u<<(--b))-1)); if(d - ctof16(&u) <= ed) break;
-    u = du & (~((1u<<(--b))-1)); if(d - ctof16(&u) <= ed) break;
-  }
-  u |= sign;
+static inline _Float16 _fppad16(_Float16 d, float e, int lg2e) {
+  uint16_t u, du = ctou16(&d);
+  int b = (du>>10 & 0x1f)-15; // mantissa=10 bits, exponent=5bits, bias=15
+  if ((b = 12 - b - lg2e) <= 0) return d;
+  b = (b > 10) ? 10 : b;
+  do { u = du & (~((1u<<(--b))-1)); } while (fabs((ctof16(&u) - d)/d) > e);
   return ctof16(&u);
 }
 
-void fprazor16(_Float16 *in, unsigned n, _Float16 *out, float e) { 
-  int lg2e = -log(e)/log(2.0); _Float16 *ip; 
-  
-  for (ip = in; ip < in+n; ip++,out++)
-    *out = _fprazor16(*ip, e, lg2e); 
-}
+void fppad16(_Float16 *in, size_t n, _Float16 *out, float e) { int lg2e = -log(e)/log(2.0); _Float16 *ip; for (ip = in; ip < in+n; ip++,out++) *out = _fppad16(*ip, e, lg2e); }
   #endif
 
-float _fprazor32(float d, float e, int lg2e) {
-  uint32_t du = ctou32(&d), sign, u;
-  int      b  = (du>>23 & 0xff) - 0x7e;
-  float    ed;
- 
+//do u = du & (~((1u<<(--b))-1)); while(fabsf((ctof32(&u) - d)/d) > e);
+#define OP(t,s) sign = du & ((t)1<<(s-1)); du &= ~((t)1<<(s-1));  d = TEMPLATE2(ctof,s)(&du);\
+  do u = du & (~(((t)1<<(--b))-1)); while(d - TEMPLATE2(ctof,s)(&u) > e*d);\
+  u |= sign;\
+  return TEMPLATE2(ctof,s)(&u);
+
+static inline float _fppad32(float d, float e, int lg2e) {
+  uint32_t u, du = ctou32(&d), sign;
+  int      b = (du>>23 & 0xff)-0x7e;
   if((b = 25 - b - lg2e) <= 0)
-    return d;                                         AS(!isnan(d), "_fprazor32: isnan");
+    return d;
   b    = b > 23?23:b;
   sign = du & (1<<31);
   du  &= 0x7fffffffu;
-  
-  for(d = ctof32(&du), ed = e * d;;) {
-    u = du & (~((1u<<(--b))-1)); if(d - ctof32(&u) <= ed) break;
-    u = du & (~((1u<<(--b))-1)); if(d - ctof32(&u) <= ed) break;
-    u = du & (~((1u<<(--b))-1)); if(d - ctof32(&u) <= ed) break;
-  }
+  d    = ctof32(&du);
+  do u = du & (~((1u<<(--b))-1)); while(d - ctof32(&u) > e*d);
   u |= sign;
   return ctof32(&u);
 }
 
-void fprazor32(float *in, unsigned n, float *out, float e) { 
-  int   lg2e = -log(e)/log(2.0); 
-  float *ip; 
-  for(ip = in; ip < in+n; ip++,out++) 
-	*out = _fprazor32(*ip, e, lg2e); 
-}
+void fppad32(float *in, size_t n, float *out, float e) { int lg2e = -log(e)/log(2.0); float *ip; for(ip = in; ip < in+n; ip++,out++) *out = _fppad32(*ip, e, lg2e); }
 
-double _fprazor64(double d, double e, int lg2e) { //if(isnan(d)) return d;
-  uint64_t du = ctou64(&d), sign, u;
-  int      b  = (du>>52 & 0x7ff) - 0x3fe;
-  double   ed;
-  
+static inline double _fppad64(double d, double e, int lg2e) { if(isnan(d)) return d;
+  union    r { uint64_t u; double d; } u,du; du.d = d; //if((du.u>>52)==0xfff)
+  uint64_t sign;
+  int      b = (du.u>>52 & 0x7ff)-0x3fe;
   if((b = 54 - b - lg2e) <= 0)
     return d;
-  b     = b > 52?52:b;
-  sign  = du & (1ull<<63); 
-  du   &= 0x7fffffffffffffffull;
-
-  for(d = ctof64(&du), ed = e * d;;) {
-    u = du & (~((1ull<<(--b))-1)); if(d - ctof64(&u) <= ed) break;
-    u = du & (~((1ull<<(--b))-1)); if(d - ctof64(&u) <= ed) break;
-  }
-  u |= sign; 
+  b = b > 52?52:b;
+  sign = du.u & (1ull<<63); du.u &= 0x7fffffffffffffffull; d = du.d; // work on |d| (as _fppad32 does) so the error tests below also hold for negative inputs; sign is restored at the end
+  int _b = b;
+  for(;;) { if((_b -= 8) <= 0) break; u.u = du.u & (~((1ull<<_b)-1)); if(d - u.d <= e*d) break; b = _b; } // coarse search: shrink the zeroed low-bit count 8 bits at a time while the error still exceeds e*d
+  do u.u = du.u & (~((1ull<<(--b))-1)); while(d - u.d > e*d); // fine search: one bit at a time until the truncation error is within e*d
+  u.u |= sign;
   return ctof64(&u);
 }
 
-void fprazor64(double *in, unsigned n, double *out, double e) { 
-  int    lg2e = -log(e)/log(2.0); 
-  double *ip; 
-  
-  for(ip = in; ip < in+n; ip++,out++) 
-	*out = _fprazor64(*ip, e, lg2e); 
-}
-#endif
+void fppad64(double *in, size_t n, double *out, double e) { int lg2e = -log(e)/log(2.0); double *ip; for(ip = in; ip < in+n; ip++,out++) *out = _fppad64(*ip, e, lg2e); }
+
diff --git a/src/ext/for/include_/bitutil_.h b/src/ext/for/bitutil.h
similarity index 55%
rename from src/ext/for/include_/bitutil_.h
rename to src/ext/for/bitutil.h
index 69850433..e311b414 100644
--- a/src/ext/for/include_/bitutil_.h
+++ b/src/ext/for/bitutil.h
@@ -1,5 +1,5 @@
 /**
-    Copyright (C) powturbo 2013-2023
+    Copyright (C) powturbo 2013-2019
     GPL v2 License
 
     This program is free software; you can redistribute it and/or modify
@@ -22,6 +22,8 @@
     - email    : powturbo [_AT_] gmail [_DOT_] com
 **/
 //     "Integer Compression: max.bits, delta, zigzag, xor"
+
+#ifdef BITUTIL_IN
   #ifdef __AVX2__
 #include <immintrin.h>
   #elif defined(__AVX__)
@@ -46,15 +48,14 @@
   #else
 #include <stdint.h>
   #endif
-#include "../include_/sse_neon.h"
+#include "sse_neon.h"
 
   #ifdef __ARM_NEON
 #define PREFETCH(_ip_,_rw_)
   #else
-#define PREFETCH(_ip_,_rw_) //__builtin_prefetch(_ip_,_rw_)
+#define PREFETCH(_ip_,_rw_) __builtin_prefetch(_ip_,_rw_)
   #endif
-  
-//------------------------ zigzag encoding ----------------------------------------------
+//------------------------ zigzag encoding -------------------------------------------------------------
 static inline unsigned char  zigzagenc8( signed char    x) { return x << 1 ^   x >> 7;  }
 static inline          char  zigzagdec8( unsigned char  x) { return x >> 1 ^ -(x &  1); }
 
@@ -67,137 +68,128 @@ static inline int            zigzagdec32(unsigned x)       { return x >> 1 ^ -(x
 static inline uint64_t       zigzagenc64(int64_t  x)       { return x << 1 ^ x >> 63;  }
 static inline  int64_t       zigzagdec64(uint64_t x)       { return x >> 1 ^ -(x & 1); }
 
+  #if defined(__SSE2__) || defined(__ARM_NEON)
+static ALWAYS_INLINE __m128i mm_zzage_epi16(__m128i v) { return _mm_xor_si128( mm_slli_epi16(v,1),  mm_srai_epi16(v,15)); }
+static ALWAYS_INLINE __m128i mm_zzage_epi32(__m128i v) { return _mm_xor_si128( mm_slli_epi32(v,1),  mm_srai_epi32(v,31)); }
+//static ALWAYS_INLINE __m128i mm_zzage_epi64(__m128i v) { return _mm_xor_si128( mm_slli_epi64(v,1), _mm_srai_epi64(v,63)); }
+
+static ALWAYS_INLINE __m128i mm_zzagd_epi16(__m128i v) { return _mm_xor_si128( mm_srli_epi16(v,1),  mm_srai_epi16( mm_slli_epi16(v,15),15) ); }
+static ALWAYS_INLINE __m128i mm_zzagd_epi32(__m128i v) { return _mm_xor_si128( mm_srli_epi32(v,1),  mm_srai_epi32( mm_slli_epi32(v,31),31) ); }
+//static ALWAYS_INLINE __m128i mm_zzagd_epi64(__m128i v) { return _mm_xor_si128(mm_srli_epi64(v,1), _mm_srai_epi64( mm_slli_epi64(v,63),63) ); }
+
+  #endif
   #ifdef __AVX2__
-#define mm256_srai_epi64_63(v, s) _mm256_srai_epi32(_mm256_shuffle_epi32(v, _MM_SHUFFLE(3, 3, 1, 1)), 31)
-static ALWAYS_INLINE __m256i mm256_zzage_epi32(__m256i v) { return _mm256_xor_si256(_mm256_slli_epi32(v,1), _mm256_srai_epi32(   v,31)); }
-static ALWAYS_INLINE __m256i mm256_zzage_epi64(__m256i v) { return _mm256_xor_si256(_mm256_slli_epi64(v,1),  mm256_srai_epi64_63(v,63)); }
-static ALWAYS_INLINE __m256i mm256_zzagd_epi32(__m256i v) { return _mm256_xor_si256(_mm256_srli_epi32(v,1), _mm256_srai_epi32(   _mm256_slli_epi32(v,31),31) ); }
-static ALWAYS_INLINE __m256i mm256_zzagd_epi64(__m256i v) { return _mm256_xor_si256(_mm256_srli_epi64(v,1),  mm256_srai_epi64_63(_mm256_slli_epi64(v,63),63) ); }
+static ALWAYS_INLINE __m256i mm256_zzage_epi32(__m256i v) { return _mm256_xor_si256(_mm256_slli_epi32(v,1), _mm256_srai_epi32(v,31)); }
+static ALWAYS_INLINE __m256i mm256_zzagd_epi32(__m256i v) { return _mm256_xor_si256(_mm256_srli_epi32(v,1), _mm256_srai_epi32(_mm256_slli_epi32(v,31),31) ); }
+  #endif
 
-//-- AVX2 delta <-> prefix sum (scan) / xor encode <-> xor decode ---------------------------------------------------------------------------------------
+//-------------- AVX2 delta + prefix sum (scan) / xor encode/decode ---------------------------------------------------------------------------------------
+  #ifdef __AVX2__
 static ALWAYS_INLINE __m256i mm256_delta_epi32(__m256i v, __m256i sv) { return _mm256_sub_epi32(v, _mm256_alignr_epi8(v, _mm256_permute2f128_si256(sv, v, _MM_SHUFFLE(0, 2, 0, 1)), 12)); }
 static ALWAYS_INLINE __m256i mm256_delta_epi64(__m256i v, __m256i sv) { return _mm256_sub_epi64(v, _mm256_alignr_epi8(v, _mm256_permute2f128_si256(sv, v, _MM_SHUFFLE(0, 2, 0, 1)),  8)); }
-
 static ALWAYS_INLINE __m256i mm256_xore_epi32( __m256i v, __m256i sv) { return _mm256_xor_si256(v, _mm256_alignr_epi8(v, _mm256_permute2f128_si256(sv, v, _MM_SHUFFLE(0, 2, 0, 1)), 12)); }
 static ALWAYS_INLINE __m256i mm256_xore_epi64( __m256i v, __m256i sv) { return _mm256_xor_si256(v, _mm256_alignr_epi8(v, _mm256_permute2f128_si256(sv, v, _MM_SHUFFLE(0, 2, 0, 1)),  8)); }
 
-#define MM256_HDEC_EPI32(_v_,_sv_,_ho_) {\
-  _v_  = _ho_(_v_, _mm256_slli_si256(_v_, 4));\
-  _v_  = _ho_(_v_, _mm256_slli_si256(_v_, 8));\
-  return _ho_(       _mm256_permute2x128_si256(                       _mm256_shuffle_epi32(_sv_,_MM_SHUFFLE(3, 3, 3, 3)), _sv_, 0x11),\
-           _ho_(_v_, _mm256_permute2x128_si256(_mm256_setzero_si256(),_mm256_shuffle_epi32(_v_, _MM_SHUFFLE(3, 3, 3, 3)),       0x20)));\
+static ALWAYS_INLINE __m256i mm256_scan_epi32(__m256i v, __m256i sv) {
+  v  = _mm256_add_epi32(v, _mm256_slli_si256(v, 4));
+  v  = _mm256_add_epi32(v, _mm256_slli_si256(v, 8));
+  return _mm256_add_epi32(     _mm256_permute2x128_si256(                       _mm256_shuffle_epi32(sv,_MM_SHUFFLE(3, 3, 3, 3)), sv, 0x11),
+           _mm256_add_epi32(v, _mm256_permute2x128_si256(_mm256_setzero_si256(),_mm256_shuffle_epi32(v, _MM_SHUFFLE(3, 3, 3, 3)),     0x20)));
 }
-static ALWAYS_INLINE __m256i mm256_scan_epi32(__m256i v, __m256i sv) { MM256_HDEC_EPI32(v,sv,_mm256_add_epi32); }
-static ALWAYS_INLINE __m256i mm256_xord_epi32(__m256i v, __m256i sv) { MM256_HDEC_EPI32(v,sv,_mm256_xor_si256); }
-
-#define MM256_HDEC_EPI64(_v_,_sv_,_ho_) {\
-  _v_ = _ho_(_v_, _mm256_alignr_epi8(_v_, _mm256_permute2x128_si256(_v_, _v_, _MM_SHUFFLE(0, 0, 2, 0)), 8));\
-  return _ho_(_mm256_permute4x64_epi64(_sv_, _MM_SHUFFLE(3, 3, 3, 3)), _ho_(_mm256_permute2x128_si256(_v_, _v_, _MM_SHUFFLE(0, 0, 2, 0)), _v_) );\
+static ALWAYS_INLINE __m256i mm256_xord_epi32(__m256i v, __m256i sv) {
+  v  = _mm256_xor_si256(v, _mm256_slli_si256(v, 4));
+  v  = _mm256_xor_si256(v, _mm256_slli_si256(v, 8));
+  return _mm256_xor_si256(     _mm256_permute2x128_si256(                       _mm256_shuffle_epi32(sv,_MM_SHUFFLE(3, 3, 3, 3)), sv, 0x11),
+           _mm256_xor_si256(v, _mm256_permute2x128_si256(_mm256_setzero_si256(),_mm256_shuffle_epi32(v, _MM_SHUFFLE(3, 3, 3, 3)),     0x20)));
 }
-static ALWAYS_INLINE __m256i mm256_scan_epi64(__m256i v, __m256i sv) { MM256_HDEC_EPI64(v,sv,_mm256_add_epi64); }
-static ALWAYS_INLINE __m256i mm256_xord_epi64(__m256i v, __m256i sv) { MM256_HDEC_EPI64(v,sv,_mm256_xor_si256); }
-
-static ALWAYS_INLINE __m256i mm256_scani_epi32(__m256i v, __m256i sv, __m256i vi) { return _mm256_add_epi32(mm256_scan_epi32(v, sv), vi); }
 
-//-- Horizontal OR ---------------------------------------
-static ALWAYS_INLINE unsigned mm256_hor_epi32(__m256i v) {
-  v = _mm256_or_si256(v, _mm256_srli_si256(v, 8));
-  v = _mm256_or_si256(v, _mm256_srli_si256(v, 4));
-  return _mm256_extract_epi32(v,0) | _mm256_extract_epi32(v, 4);
+static ALWAYS_INLINE __m256i mm256_scan_epi64(__m256i v, __m256i sv) {
+  v = _mm256_add_epi64(v, _mm256_alignr_epi8(v, _mm256_permute2x128_si256(v, v, _MM_SHUFFLE(0, 0, 2, 0)), 8));
+  return _mm256_add_epi64(_mm256_permute4x64_epi64(sv, _MM_SHUFFLE(3, 3, 3, 3)), _mm256_add_epi64(_mm256_permute2x128_si256(v, v, _MM_SHUFFLE(0, 0, 2, 0)), v) );
 }
-
-static ALWAYS_INLINE uint64_t mm256_hor_epi64(__m256i v) {
-  v = _mm256_or_si256(v, _mm256_permute2x128_si256(v, v, _MM_SHUFFLE(2, 0, 0, 1)));
-  return _mm256_extract_epi64(v, 1) | _mm256_extract_epi64(v,0);
+static ALWAYS_INLINE __m256i mm256_xord_epi64(__m256i v, __m256i sv) {
+  v = _mm256_xor_si256(v, _mm256_alignr_epi8(v, _mm256_permute2x128_si256(v, v, _MM_SHUFFLE(0, 0, 2, 0)), 8));
+  return _mm256_xor_si256(_mm256_permute4x64_epi64(sv, _MM_SHUFFLE(3, 3, 3, 3)), _mm256_xor_si256(_mm256_permute2x128_si256(v, v, _MM_SHUFFLE(0, 0, 2, 0)), v) );
 }
-  #endif
- 
-  #if defined(__SSSE3__) || defined(__ARM_NEON)
-#define mm_srai_epi64_63(_v_, _s_) _mm_srai_epi32(_mm_shuffle_epi32(_v_, _MM_SHUFFLE(3, 3, 1, 1)), 31)
-  
-static ALWAYS_INLINE __m128i mm_zzage_epi16(__m128i v) { return _mm_xor_si128( mm_slli_epi16(v,1),  mm_srai_epi16(   v,15)); }
-static ALWAYS_INLINE __m128i mm_zzage_epi32(__m128i v) { return _mm_xor_si128( mm_slli_epi32(v,1),  mm_srai_epi32(   v,31)); }
-static ALWAYS_INLINE __m128i mm_zzage_epi64(__m128i v) { return _mm_xor_si128( mm_slli_epi64(v,1),  mm_srai_epi64_63(v,63)); }
 
-static ALWAYS_INLINE __m128i mm_zzagd_epi16(__m128i v) { return _mm_xor_si128( mm_srli_epi16(v,1),  mm_srai_epi16(    mm_slli_epi16(v,15),15)); }
-static ALWAYS_INLINE __m128i mm_zzagd_epi32(__m128i v) { return _mm_xor_si128( mm_srli_epi32(v,1),  mm_srai_epi32(    mm_slli_epi32(v,31),31)); }
-static ALWAYS_INLINE __m128i mm_zzagd_epi64(__m128i v) { return _mm_xor_si128( mm_srli_epi64(v,1),  mm_srai_epi64_63( mm_slli_epi64(v,63),63)); }
+static ALWAYS_INLINE __m256i mm256_scani_epi32(__m256i v, __m256i sv, __m256i vi) { return _mm256_add_epi32(mm256_scan_epi32(v, sv), vi); }
+  #endif
 
+  #if defined(__SSSE3__) || defined(__ARM_NEON)
 static ALWAYS_INLINE __m128i mm_delta_epi16(__m128i v, __m128i sv) { return _mm_sub_epi16(v, _mm_alignr_epi8(v, sv, 14)); }
 static ALWAYS_INLINE __m128i mm_delta_epi32(__m128i v, __m128i sv) { return _mm_sub_epi32(v, _mm_alignr_epi8(v, sv, 12)); }
-static ALWAYS_INLINE __m128i mm_delta_epi64(__m128i v, __m128i sv) { return _mm_sub_epi64(v, _mm_alignr_epi8(v, sv,  8)); }
-
 static ALWAYS_INLINE __m128i mm_xore_epi16( __m128i v, __m128i sv) { return _mm_xor_si128(v, _mm_alignr_epi8(v, sv, 14)); }
 static ALWAYS_INLINE __m128i mm_xore_epi32( __m128i v, __m128i sv) { return _mm_xor_si128(v, _mm_alignr_epi8(v, sv, 12)); }
-static ALWAYS_INLINE __m128i mm_xore_epi64( __m128i v, __m128i sv) { return _mm_xor_si128(v, _mm_alignr_epi8(v, sv,  8)); }
 
-#define MM_HDEC_EPI32(_v_,_sv_,_ho_) { \
-  _v_ = _ho_(_v_, _mm_slli_si128(_v_, 4)); \
-  _v_ = _ho_(mm_shuffle_nnnn_epi32(_sv_, 3), _ho_(_mm_slli_si128(_v_, 8), _v_));\
-}
+#define MM_HDEC_EPI32(_v_,_sv_,_hop_) { _v_ = _hop_(_v_, _mm_slli_si128(_v_, 4)); _v_ = _hop_(mm_shuffle_nnnn_epi32(_sv_, 3), _hop_(_mm_slli_si128(_v_, 8), _v_)); }
 static ALWAYS_INLINE __m128i mm_scan_epi32(__m128i v, __m128i sv) { MM_HDEC_EPI32(v,sv,_mm_add_epi32); return v; }
 static ALWAYS_INLINE __m128i mm_xord_epi32(__m128i v, __m128i sv) { MM_HDEC_EPI32(v,sv,_mm_xor_si128); return v; }
 
-#define MM_HDEC_EPI64(_v_,_sv_,_ho_) { \
-  _v_ = _ho_(_v_, _mm_slli_si128(_v_, 8)); \
-  _v_ = _ho_(_mm_shuffle_epi8(_sv_, _mm_set_epi8(15,14,13,12,11,10,9,8, 15,14,13,12,11,10,9,8)), _v_);\
+#define MM_HDEC_EPI16(_v_,_sv_,_hop_) {\
+  _v_  = _hop_(      _v_, _mm_slli_si128(_v_, 2));\
+  _v_  = _hop_(      _v_, _mm_slli_si128(_v_, 4));\
+  _v_  = _hop_(_hop_(_v_, _mm_slli_si128(_v_, 8)), _mm_shuffle_epi8(_sv_, _mm_set1_epi16(0x0f0e)));\
 }
-static ALWAYS_INLINE __m128i mm_scan_epi64(__m128i v, __m128i sv) { MM_HDEC_EPI64(v,sv,_mm_add_epi64); return v; }
-static ALWAYS_INLINE __m128i mm_xord_epi64(__m128i v, __m128i sv) { MM_HDEC_EPI64(v,sv,_mm_xor_si128); return v; }
 
-#define MM_HDEC_EPI16(_v_,_sv_,_ho_) {\
-  _v_  = _ho_(      _v_, _mm_slli_si128(_v_, 2));\
-  _v_  = _ho_(      _v_, _mm_slli_si128(_v_, 4));\
-  _v_  = _ho_(_ho_(_v_, _mm_slli_si128(_v_, 8)), _mm_shuffle_epi8(_sv_, _mm_set1_epi16(0x0f0e)));\
-}
 static ALWAYS_INLINE __m128i mm_scan_epi16(__m128i v, __m128i sv) { MM_HDEC_EPI16(v,sv,_mm_add_epi16); return v; }
 static ALWAYS_INLINE __m128i mm_xord_epi16(__m128i v, __m128i sv) { MM_HDEC_EPI16(v,sv,_mm_xor_si128); return v; }
-
-#define MM_HDEC_EPI8(_v_,_sv_,_ho_) {\
-  _v_  = _ho_(      _v_, _mm_slli_si128(_v_, 1));\
-  _v_  = _ho_(      _v_, _mm_slli_si128(_v_, 2));\
-  _v_  = _ho_(      _v_, _mm_slli_si128(_v_, 4));\
-  _v_  = _ho_(_ho_(_v_, _mm_slli_si128(_v_, 8)), _mm_shuffle_epi8(_sv_, _mm_set1_epi8(0xfe)));/*TODO: test*/\
-}
-static ALWAYS_INLINE __m128i mm_scan_epi8(__m128i v, __m128i sv) { MM_HDEC_EPI8(v,sv,_mm_add_epi8);  return v; }
-static ALWAYS_INLINE __m128i mm_xord_epi8(__m128i v, __m128i sv) { MM_HDEC_EPI8(v,sv,_mm_xor_si128); return v; }
-
 //-------- scan with vi delta > 0 -----------------------------
 static ALWAYS_INLINE __m128i mm_scani_epi16(__m128i v, __m128i sv, __m128i vi) { return _mm_add_epi16(mm_scan_epi16(v, sv), vi); }
 static ALWAYS_INLINE __m128i mm_scani_epi32(__m128i v, __m128i sv, __m128i vi) { return _mm_add_epi32(mm_scan_epi32(v, sv), vi); }
 
-#define MM_HOZ_EPI16(v,_ho_) {\
-  v = _ho_(v, _mm_srli_si128(v, 8));\
-  v = _ho_(v, _mm_srli_si128(v, 6));\
-  v = _ho_(v, _mm_srli_si128(v, 4));\
-  v = _ho_(v, _mm_srli_si128(v, 2));\
+  #elif defined(__SSE2__)
+static ALWAYS_INLINE __m128i mm_delta_epi16(__m128i v, __m128i sv) { return _mm_sub_epi16(v, _mm_or_si128(_mm_srli_si128(sv, 14), _mm_slli_si128(v, 2))); }
+static ALWAYS_INLINE __m128i mm_xore_epi16( __m128i v, __m128i sv) { return _mm_xor_si128(v, _mm_or_si128(_mm_srli_si128(sv, 14), _mm_slli_si128(v, 2))); }
+static ALWAYS_INLINE __m128i mm_delta_epi32(__m128i v, __m128i sv) { return _mm_sub_epi32(v, _mm_or_si128(_mm_srli_si128(sv, 12), _mm_slli_si128(v, 4))); }
+static ALWAYS_INLINE __m128i mm_xore_epi32( __m128i v, __m128i sv) { return _mm_xor_si128(v, _mm_or_si128(_mm_srli_si128(sv, 12), _mm_slli_si128(v, 4))); }
+  #endif
+
+#if !defined(_M_X64) && !defined(__x86_64__) && defined(__AVX__)
+#define _mm256_extract_epi64(v, index) ((__int64)((uint64_t)(uint32_t)_mm256_extract_epi32((v), (index) * 2) | (((uint64_t)(uint32_t)_mm256_extract_epi32((v), (index) * 2 + 1)) << 32)))
+#endif
+
+//------------------ Horizontal OR -----------------------------------------------
+  #ifdef __AVX2__
+static ALWAYS_INLINE unsigned mm256_hor_epi32(__m256i v) {
+  v = _mm256_or_si256(v, _mm256_srli_si256(v, 8));
+  v = _mm256_or_si256(v, _mm256_srli_si256(v, 4));
+  return _mm256_extract_epi32(v,0) | _mm256_extract_epi32(v, 4);
+}
+
+static ALWAYS_INLINE uint64_t mm256_hor_epi64(__m256i v) {
+  v = _mm256_or_si256(v, _mm256_permute2x128_si256(v, v, _MM_SHUFFLE(2, 0, 0, 1)));
+  return _mm256_extract_epi64(v, 1) | _mm256_extract_epi64(v,0);
+}
+  #endif
+
+  #if defined(__SSE2__) || defined(__ARM_NEON)
+#define MM_HOZ_EPI16(v,_hop_) {\
+  v = _hop_(v, _mm_srli_si128(v, 8));\
+  v = _hop_(v, _mm_srli_si128(v, 6));\
+  v = _hop_(v, _mm_srli_si128(v, 4));\
+  v = _hop_(v, _mm_srli_si128(v, 2));\
 }
 
-#define MM_HOZ_EPI32(v,_ho_) {\
-  v = _ho_(v, _mm_srli_si128(v, 8));\
-  v = _ho_(v, _mm_srli_si128(v, 4));\
+#define MM_HOZ_EPI32(v,_hop_) {\
+  v = _hop_(v, _mm_srli_si128(v, 8));\
+  v = _hop_(v, _mm_srli_si128(v, 4));\
 }
 
 static ALWAYS_INLINE uint16_t mm_hor_epi16( __m128i v) { MM_HOZ_EPI16(v,_mm_or_si128);               return (unsigned short)_mm_cvtsi128_si32(v); }
 static ALWAYS_INLINE uint32_t mm_hor_epi32( __m128i v) { MM_HOZ_EPI32(v,_mm_or_si128);               return (unsigned      )_mm_cvtsi128_si32(v); }
 static ALWAYS_INLINE uint64_t mm_hor_epi64( __m128i v) { v = _mm_or_si128( v, _mm_srli_si128(v, 8)); return (uint64_t      )_mm_cvtsi128_si64(v); }
-  
+  #endif
+
 //----------------- sub / add ----------------------------------------------------------
+  #if defined(__SSE2__) || defined(__ARM_NEON)
 #define SUBI16x8(_v_, _sv_)       _mm_sub_epi16(_v_, _sv_)
 #define SUBI32x4(_v_, _sv_)       _mm_sub_epi32(_v_, _sv_)
 #define ADDI16x8(_v_, _sv_, _vi_) _sv_ = _mm_add_epi16(_mm_add_epi16(_sv_, _vi_),_v_)
 #define ADDI32x4(_v_, _sv_, _vi_) _sv_ = _mm_add_epi32(_mm_add_epi32(_sv_, _vi_),_v_)
 
-//---------------- Convert _mm_cvtsi128_siXX -------------------------------------------
-static ALWAYS_INLINE uint8_t   mm_cvtsi128_si8 (__m128i v) { return (uint8_t )_mm_cvtsi128_si32(v); }
-static ALWAYS_INLINE uint16_t  mm_cvtsi128_si16(__m128i v) { return (uint16_t)_mm_cvtsi128_si32(v); }
-#define mm_cvtsi128_si32(_v_) _mm_cvtsi128_si32(_v_)
-
-  #elif defined(__SSE2__)
-static ALWAYS_INLINE __m128i mm_delta_epi16(__m128i v, __m128i sv) { return _mm_sub_epi16(v, _mm_or_si128(_mm_srli_si128(sv, 14), _mm_slli_si128(v, 2))); }
-static ALWAYS_INLINE __m128i mm_xore_epi16( __m128i v, __m128i sv) { return _mm_xor_si128(v, _mm_or_si128(_mm_srli_si128(sv, 14), _mm_slli_si128(v, 2))); }
-static ALWAYS_INLINE __m128i mm_delta_epi32(__m128i v, __m128i sv) { return _mm_sub_epi32(v, _mm_or_si128(_mm_srli_si128(sv, 12), _mm_slli_si128(v, 4))); }
-static ALWAYS_INLINE __m128i mm_xore_epi32( __m128i v, __m128i sv) { return _mm_xor_si128(v, _mm_or_si128(_mm_srli_si128(sv, 12), _mm_slli_si128(v, 4))); }
+//---------------- Convert mm_cvtsi128_siXX -------------------------------------------
+static ALWAYS_INLINE uint8_t  mm_cvtsi128_si8 (__m128i v) { return (uint8_t )_mm_cvtsi128_si32(v); }
+static ALWAYS_INLINE uint16_t mm_cvtsi128_si16(__m128i v) { return (uint16_t)_mm_cvtsi128_si32(v); }
   #endif
 
 //--------- memset -----------------------------------------
@@ -251,7 +243,6 @@ static ALWAYS_INLINE __m128i mm_xore_epi32( __m128i v, __m128i sv) { return _mm_
 #define BITFORZERO32(_out_, _n_, _start_, _mindelta_) BITFORSET_(_out_, _n_, _start_, _mindelta_)
 #define BITZERO32(   _out_, _n_, _start_)             BITFORSET_(_out_, _n_, _start_, 0)
   #endif
-#define BITZERO16(   _out_, _n_, _start_)             BITFORSET_(_out_, _n_, _start_, 0)
 
 #define DELTR( _in_, _n_, _start_, _mindelta_,      _out_) { unsigned _v; for(      _v = 0; _v < _n_; _v++) _out_[_v] = _in_[_v] - (_start_) - _v*(_mindelta_) - (_mindelta_); }
 #define DELTRB(_in_, _n_, _start_, _mindelta_, _b_, _out_) { unsigned _v; for(_b_=0,_v = 0; _v < _n_; _v++) _out_[_v] = _in_[_v] - (_start_) - _v*(_mindelta_) - (_mindelta_), _b_ |= _out_[_v]; _b_ = bsr32(_b_); }
@@ -348,7 +339,28 @@ static ALWAYS_INLINE __m256i mm256_rbit_epi64(__m256i v) { return mm256_rbit_epi
 static ALWAYS_INLINE __m256i mm256_rbit_si128(__m256i v) { return mm256_rbit_epi8(mm256_rev_si128(v)); }
   #endif
 
-// ------------------ bitio general macros ---------------------------
+// ------------------ bitio general macros ---------------------------
+  #ifdef __AVX2__
+    #if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
+#include <intrin.h>
+    #else
+#include <x86intrin.h>
+    #endif
+#define bzhi_u32(_u_, _b_)               _bzhi_u32(_u_, _b_)
+
+    #if !(defined(_M_X64) || defined(__amd64__)) && (defined(__i386__) || defined(_M_IX86))
+#define bzhi_u64(_u_, _b_)               ((_u_) & ((1ull<<(_b_))-1))
+    #else
+#define bzhi_u64(_u_, _b_)               _bzhi_u64(_u_, _b_)
+    #endif
+  #else
+#define bzhi_u64(_u_, _b_)               ((_u_) & ((1ull<<(_b_))-1))
+#define bzhi_u32(_u_, _b_)               ((_u_) & ((1u  <<(_b_))-1))
+  #endif
+
+#define BZHI64(_u_, _b_)                 (_b_ == 64?0xffffffffffffffffull:((_u_) & ((1ull<<(_b_))-1)))
+#define BZHI32(_u_, _b_)                 (_b_ == 32?        0xffffffffu  :((_u_) & ((1u  <<(_b_))-1)))
+
 #define bitdef(     _bw_,_br_)           uint64_t _bw_=0; unsigned _br_=0
 #define bitini(     _bw_,_br_)           _bw_=_br_=0
 //-- bitput ---------
@@ -367,9 +379,9 @@ static ALWAYS_INLINE __m256i mm256_rbit_si128(__m256i v) { return mm256_rbit_epi
 #define BITPEEK64(  _bw_,_br_,_nb_)      BZHI64(bitbw(_bw_,_br_), _nb_)
 #define BITGET64(   _bw_,_br_,_nb_,_x_)  _x_ = BITPEEK64(_bw_, _br_, _nb_), bitrmv(_bw_, _br_, _nb_)
 
-#define bitpeek57(  _bw_,_br_,_nb_)      bzhi64(bitbw(_bw_,_br_), _nb_)
+#define bitpeek57(  _bw_,_br_,_nb_)      bzhi_u64(bitbw(_bw_,_br_), _nb_)
 #define bitget57(   _bw_,_br_,_nb_,_x_)  _x_ = bitpeek57(_bw_, _br_, _nb_), bitrmv(_bw_, _br_, _nb_)
-#define bitpeek31(  _bw_,_br_,_nb_)      bzhi32(bitbw(_bw_,_br_), _nb_)
+#define bitpeek31(  _bw_,_br_,_nb_)      bzhi_u32(bitbw(_bw_,_br_), _nb_)
 #define bitget31(   _bw_,_br_,_nb_,_x_)  _x_ = bitpeek31(_bw_, _br_, _nb_), bitrmv(_bw_, _br_, _nb_)
 //------------------ templates -----------------------------------
 #define bitput8( _bw_,_br_,_b_,_x_,_op_) bitput(_bw_,_br_,_b_,_x_)
@@ -381,11 +393,155 @@ static ALWAYS_INLINE __m256i mm256_rbit_si128(__m256i v) { return mm256_rbit_epi
 #define bitget16(_bw_,_br_,_b_,_x_,_ip_) bitget31(_bw_,_br_,_b_,_x_)
 #define bitget32(_bw_,_br_,_b_,_x_,_ip_) bitget57(_bw_,_br_,_b_,_x_)
 #define bitget64(_bw_,_br_,_b_,_x_,_ip_) if((_b_)>45) { unsigned _v; bitget57(_bw_,_br_,(_b_)-32,_x_); bitdnorm(_bw_,_br_,_ip_); BITGET64(_bw_,_br_,32,_v); _x_ = _x_<<32|_v; } else bitget57(_bw_,_br_,_b_,_x_)  
+#endif
+
+//---------- max. bit length + transform for sorted/unsorted arrays, delta,delta 1, delta > 1, zigzag, zigzag of delta, xor, FOR,----------------
+#ifdef __cplusplus
+extern "C" {
+#endif
+//------ ORed array, used to determine the maximum bit length of the elements in an unsorted integer array ---------------------
+uint8_t  bit8( uint8_t  *in, unsigned n, uint8_t  *px);
+uint16_t bit16(uint16_t *in, unsigned n, uint16_t *px);
+uint32_t bit32(uint32_t *in, unsigned n, uint32_t *px);
+uint64_t bit64(uint64_t *in, unsigned n, uint64_t *px);
+
+//-------------- delta = 0: Sorted integer array w/ mindelta = 0 ----------------------------------------------
+//-- ORed array, maximum bit length of the non decreasing integer array. out[i] = in[i] - in[i-1]
+uint8_t  bitd8( uint8_t  *in, unsigned n, uint8_t  *px, uint8_t  start);
+uint16_t bitd16(uint16_t *in, unsigned n, uint16_t *px, uint16_t start);
+uint32_t bitd32(uint32_t *in, unsigned n, uint32_t *px, uint32_t start);
+uint64_t bitd64(uint64_t *in, unsigned n, uint64_t *px, uint64_t start);
+
+//-- in-place reverse delta 0
+void bitddec8(  uint8_t  *p,  unsigned n, uint8_t  start); // non decreasing (out[i] = in[i] - in[i-1])
+void bitddec16( uint16_t *p,  unsigned n, uint16_t start);
+void bitddec32( uint32_t *p,  unsigned n, uint32_t start);
+void bitddec64( uint64_t *p,  unsigned n, uint64_t start);
+
+//-- vectorized fast delta4 one: out[0] = in[4]-in[0], out[1]=in[5]-in[1], out[2]=in[6]-in[2], out[3]=in[7]-in[3],...
+uint16_t bits128v16(   uint16_t *in, unsigned n, uint16_t *px, uint16_t start);
+uint32_t bits128v32(   uint32_t *in, unsigned n, uint32_t *px, uint32_t start);
+
+//------------- delta = 1: Sorted integer array w/ mindelta = 1 ---------------------------------------------
+//-- get delta maximum bit length of the strictly increasing integer array. out[i] = in[i] - in[i-1] - 1
+uint8_t  bitd18( uint8_t  *in, unsigned n, uint8_t  *px, uint8_t  start);
+uint16_t bitd116(uint16_t *in, unsigned n, uint16_t *px, uint16_t start);
+uint32_t bitd132(uint32_t *in, unsigned n, uint32_t *px, uint32_t start);
+uint64_t bitd164(uint64_t *in, unsigned n, uint64_t *px, uint64_t start);
+
+//-- in-place reverse delta one
+void bitd1dec8(     uint8_t  *p,  unsigned n, uint8_t  start); // non strictly decreasing (out[i] = in[i] - in[i-1] - 1)
+void bitd1dec16(    uint16_t *p,  unsigned n, uint16_t start);
+void bitd1dec32(    uint32_t *p,  unsigned n, uint32_t start);
+void bitd1dec64(    uint64_t *p,  unsigned n, uint64_t start);
+
+//------------- delta > 1: Sorted integer array w/ mindelta > 1 ---------------------------------------------
+//-- ORed array, for max. bit length get min. delta ()
+uint8_t  bitdi8(    uint8_t  *in, unsigned n, uint8_t  *px,  uint8_t  start);
+uint16_t bitdi16(   uint16_t *in, unsigned n, uint16_t *px,  uint16_t start);
+uint32_t bitdi32(   uint32_t *in, unsigned n, uint32_t *px,  uint32_t start);
+uint64_t bitdi64(   uint64_t *in, unsigned n, uint64_t *px,  uint64_t start);
+//-- transform sorted integer array to delta array: out[i] = in[i] - in[i-1] - mindelta
+uint8_t  bitdienc8( uint8_t  *in, unsigned n, uint8_t  *out, uint8_t  start, uint8_t  mindelta);
+uint16_t bitdienc16(uint16_t *in, unsigned n, uint16_t *out, uint16_t start, uint16_t mindelta);
+uint32_t bitdienc32(uint32_t *in, unsigned n, uint32_t *out, uint32_t start, uint32_t mindelta);
+uint64_t bitdienc64(uint64_t *in, unsigned n, uint64_t *out, uint64_t start, uint64_t mindelta);
+//-- in-place reverse delta
+void     bitdidec8( uint8_t  *in, unsigned n,                uint8_t  start, uint8_t  mindelta);
+void     bitdidec16(uint16_t *in, unsigned n,                uint16_t start, uint16_t mindelta);
+void     bitdidec32(uint32_t *in, unsigned n,                uint32_t start, uint32_t mindelta);
+void     bitdidec64(uint64_t *in, unsigned n,                uint64_t start, uint64_t mindelta);
+
+//------------- FOR : array bit length: ---------------------------------------------------------------------
+//------ ORed array, for max. bit length of the non decreasing integer array.  out[i] = in[i] - start
+uint8_t  bitf8( uint8_t  *in, unsigned n, uint8_t  *px, uint8_t  start);
+uint16_t bitf16(uint16_t *in, unsigned n, uint16_t *px, uint16_t start);
+uint32_t bitf32(uint32_t *in, unsigned n, uint32_t *px, uint32_t start);
+uint64_t bitf64(uint64_t *in, unsigned n, uint64_t *px, uint64_t start);
+
+//------ ORed array, for max. bit length of the non strictly decreasing integer array out[i] = in[i] - 1 - start
+uint8_t  bitf18( uint8_t  *in, unsigned n, uint8_t  *px, uint8_t  start);
+uint16_t bitf116(uint16_t *in, unsigned n, uint16_t *px, uint16_t start);
+uint32_t bitf132(uint32_t *in, unsigned n, uint32_t *px, uint32_t start);
+uint64_t bitf164(uint64_t *in, unsigned n, uint64_t *px, uint64_t start);
+
+//------ ORed array, for max. bit length for unsorted array
+uint8_t  bitfm8( uint8_t  *in, unsigned n, uint8_t  *px, uint8_t  *pmin);  // unsorted
+uint16_t bitfm16(uint16_t *in, unsigned n, uint16_t *px, uint16_t *pmin);
+uint32_t bitfm32(uint32_t *in, unsigned n, uint32_t *px, uint32_t *pmin);
+uint64_t bitfm64(uint64_t *in, unsigned n, uint64_t *px, uint64_t *pmin);
+
+//------------- Zigzag encoding for unsorted integer lists: out[i] = in[i] - in[i-1] ------------------------
+//-- ORed array, to get maximum zigzag bit length integer array
+uint8_t  bitz8(    uint8_t  *in, unsigned n, uint8_t  *px, uint8_t  start);
+uint16_t bitz16(   uint16_t *in, unsigned n, uint16_t *px, uint16_t start);
+uint32_t bitz32(   uint32_t *in, unsigned n, uint32_t *px, uint32_t start);
+uint64_t bitz64(   uint64_t *in, unsigned n, uint64_t *px, uint64_t start);
+//-- Zigzag transform
+uint8_t  bitzenc8( uint8_t  *in, unsigned n, uint8_t  *out, uint8_t  start, uint8_t  mindelta);
+uint16_t bitzenc16(uint16_t *in, unsigned n, uint16_t *out, uint16_t start, uint16_t mindelta);
+uint32_t bitzenc32(uint32_t *in, unsigned n, uint32_t *out, uint32_t start, uint32_t mindelta);
+uint64_t bitzenc64(uint64_t *in, unsigned n, uint64_t *out, uint64_t start, uint64_t mindelta);
+//-- in-place zigzag reverse transform
+void bitzdec8(     uint8_t  *in, unsigned n,                uint8_t  start);
+void bitzdec16(    uint16_t *in, unsigned n,                uint16_t start);
+void bitzdec32(    uint32_t *in, unsigned n,                uint32_t start);
+void bitzdec64(    uint64_t *in, unsigned n,                uint64_t start);
+
+//------------- Zigzag of zigzag/delta : unsorted/sorted integer array ----------------------------------------------------
+//-- get delta maximum bit length of the non strictly decreasing integer array. out[i] = in[i] - in[i-1] - 1
+uint8_t  bitzz8(    uint8_t  *in, unsigned n, uint8_t  *px, uint8_t  start);
+uint16_t bitzz16(   uint16_t *in, unsigned n, uint16_t *px, uint16_t start);
+uint32_t bitzz32(   uint32_t *in, unsigned n, uint32_t *px, uint32_t start);
+uint64_t bitzz64(   uint64_t *in, unsigned n, uint64_t *px, uint64_t start);
+
+uint8_t  bitzzenc8( uint8_t  *in, unsigned n, uint8_t  *out, uint8_t  start, uint8_t  mindelta);
+uint16_t bitzzenc16(uint16_t *in, unsigned n, uint16_t *out, uint16_t start, uint16_t mindelta);
+uint32_t bitzzenc32(uint32_t *in, unsigned n, uint32_t *out, uint32_t start, uint32_t mindelta);
+uint64_t bitzzenc64(uint64_t *in, unsigned n, uint64_t *out, uint64_t start, uint64_t mindelta);
+
+//-- in-place reverse zigzag of delta (encoded w/ bitdiencNN and parameter mindelta = 1)
+void bitzzdec8(     uint8_t  *in,  unsigned n, uint8_t  start); // non strictly decreasing (out[i] = in[i] - in[i-1] - 1)
+void bitzzdec16(    uint16_t *in,  unsigned n, uint16_t start);
+void bitzzdec32(    uint32_t *in,  unsigned n, uint32_t start);
+void bitzzdec64(    uint64_t *in,  unsigned n, uint64_t start);
+
+//------------- XOR encoding for unsorted integer lists: out[i] = in[i] ^ in[i-1] -------------
+//-- ORed array, to get maximum zigzag bit length integer array
+uint8_t  bitx8(    uint8_t  *in, unsigned n, uint8_t  *px, uint8_t  start);
+uint16_t bitx16(   uint16_t *in, unsigned n, uint16_t *px, uint16_t start);
+uint32_t bitx32(   uint32_t *in, unsigned n, uint32_t *px, uint32_t start);
+uint64_t bitx64(   uint64_t *in, unsigned n, uint64_t *px, uint64_t start);
+
+//-- XOR transform
+uint8_t  bitxenc8(  uint8_t  *in, unsigned n, uint8_t  *out, uint8_t  start);
+uint16_t bitxenc16( uint16_t *in, unsigned n, uint16_t *out, uint16_t start);
+uint32_t bitxenc32( uint32_t *in, unsigned n, uint32_t *out, uint32_t start);
+uint64_t bitxenc64( uint64_t *in, unsigned n, uint64_t *out, uint64_t start);
+
+//-- XOR in-place reverse transform
+void bitxdec8(      uint8_t  *p,  unsigned n, uint8_t  start);
+void bitxdec16(     uint16_t *p,  unsigned n, uint16_t start);
+void bitxdec32(     uint32_t *p,  unsigned n, uint32_t start);
+void bitxdec64(     uint64_t *p,  unsigned n, uint64_t start);
+
+//------- Lossy floating point transform: pad the trailing mantissa bits with zeros according to the error e (ex. e=0.00001)
+  #ifdef USE_FLOAT16
+void fppad16(_Float16 *in, size_t n, _Float16  *out, float  e);
+  #endif
+void fppad32(float  *in, size_t n, float  *out, float  e);
+void fppad64(double *in, size_t n, double *out, double e);
+
+#ifdef __cplusplus
+}
+#endif
 
 //---- Floating point to Integer decomposition ---------------------------------
 // seeeeeeee21098765432109876543210 (s:sign, e:exponent, 0-9:mantissa)
+  #ifdef BITUTIL_IN
 #define MANTF32    23
 #define MANTF64    52
 
 #define BITFENC(_u_, _sgn_, _expo_, _mant_,      _mantbits_, _one_) _sgn_ = _u_ >> (sizeof(_u_)*8-1); _expo_ = ((_u_ >> (_mantbits_)) & ( (_one_<<(sizeof(_u_)*8 - 1 - _mantbits_)) -1)); _mant_ = _u_ & ((_one_<<_mantbits_)-1);
 #define BITFDEC(     _sgn_, _expo_, _mant_, _u_, _mantbits_)        _u_ = (_sgn_) << (sizeof(_u_)*8-1) | (_expo_) << _mantbits_ | (_mant_)
+  #endif
diff --git a/src/ext/for/include_/conf.h b/src/ext/for/conf.h
similarity index 50%
rename from src/ext/for/include_/conf.h
rename to src/ext/for/conf.h
index d04bb8cc..be6face4 100644
--- a/src/ext/for/include_/conf.h
+++ b/src/ext/for/conf.h
@@ -1,10 +1,10 @@
 /**
-    Copyright (C) powturbo 2016-2023
-    GPL v3 License
+    Copyright (C) powturbo 2013-2019
+    GPL v2 License
 
     This program is free software; you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 3 of the License, or
+    the Free Software Foundation; either version 2 of the License, or
     (at your option) any later version.
 
     This program is distributed in the hope that it will be useful,
@@ -23,20 +23,8 @@
 **/
 
 // conf.h - config & common
-#ifndef CONF_H_
-#define CONF_H_
-#if defined(_MSC_VER) && (_MSC_VER < 1600)
-  #if !defined(_STDINT) && !defined(_MSC_STDINT_H_)
-typedef unsigned char      uint8_t;
-typedef unsigned short     uint16_t;
-typedef unsigned int       uint32_t;
-typedef unsigned long long uint64_t;
-  #endif
-#else
-#include <stdint.h>
-#endif
-#include <stddef.h>
-
+#ifndef CONF_H
+#define CONF_H
 //------------------------- Compiler ------------------------------------------
   #if defined(__GNUC__)
 #include <stdint.h>
@@ -47,40 +35,30 @@ typedef unsigned long long uint64_t;
 #define likely(x)       __builtin_expect((x),1)
 #define unlikely(x)     __builtin_expect((x),0)
 
-//#define bswap8(x)    (x)
-    #if __GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ >= 8
-#define bswap16(x) __builtin_bswap16(x)
-    #else
-static ALWAYS_INLINE unsigned short bswap16(unsigned short x) { return __builtin_bswap32(x << 16); }
-    #endif
-#define bswap32(x) __builtin_bswap32(x)
-#define bswap64(x) __builtin_bswap64(x)
-
 #define popcnt32(_x_)   __builtin_popcount(_x_)
 #define popcnt64(_x_)   __builtin_popcountll(_x_)
 
     #if defined(__i386__) || defined(__x86_64__)
-//x,__bsr32:     1:0,2:1,3:1,4:2,5:2,6:2,7:2,8:3,9:3,10:3,11:3,12:3,13:3,14:3,15:3,16:4,17:4,18:4,19:4,20:4,21:4,22:4,23:4,24:4,25:4,26:4,27:4,28:4,29:4,30:4,31:4,32:5,...
-//x,  bsr32: 0:0,1:1,2:2,3:2,4:3,5:3,6:3,7:3,8:4,9:4,10:4,11:4,12:4,13:4,14:4,15:4,16:5,17:5,18:5,19:5,20:5,21:5,22:5,23:5,24:5,25:5,26:5,27:5,28:5,29:5,30:5,31:5,32:6,...
-static ALWAYS_INLINE int    __bsr32(               int x) {             asm("bsr  %1,%0" : "=r" (x) : "rm" (x) ); return x; }
-static ALWAYS_INLINE int      bsr32(               int x) { int b = -1; asm("bsrl %1,%0" : "+r" (b) : "rm" (x) ); return b + 1; }
-static ALWAYS_INLINE int      bsr64(uint64_t x          ) { return x?64 - __builtin_clzll(x):0; }
-static ALWAYS_INLINE int    __bsr64(uint64_t x          ) { return   63 - __builtin_clzll(x);   }
-
-static ALWAYS_INLINE unsigned rol32(unsigned x, int s) { asm ("roll %%cl,%0" :"=r" (x) :"0" (x),"c" (s)); return x; }
-static ALWAYS_INLINE unsigned ror32(unsigned x, int s) { asm ("rorl %%cl,%0" :"=r" (x) :"0" (x),"c" (s)); return x; }
-static ALWAYS_INLINE uint64_t rol64(uint64_t x, int s) { asm ("rolq %%cl,%0" :"=r" (x) :"0" (x),"c" (s)); return x; }
-static ALWAYS_INLINE uint64_t ror64(uint64_t x, int s) { asm ("rorq %%cl,%0" :"=r" (x) :"0" (x),"c" (s)); return x; }
+//x,__bsr32:     1:0,2:1,3:1,4:2,5:2,6:2,7:2,8:3,9:3,10:3,11:3,12:3,13:3,14:3,15:3,16:4,17:4,18:4,19:4,20:4,21:4,22:4,23:4,24:4,25:4,26:4,27:4,28:4,29:4,30:4,31:4,32:5
+//  x,bsr32: 0:0,1:1,2:2,3:2,4:3,5:3,6:3,7:3,8:4,9:4,10:4,11:4,12:4,13:4,14:4,15:4,16:5,17:5,18:5,19:5,20:5,21:5,22:5,23:5,24:5,25:5,26:5,27:5,28:5,29:5,30:5,31:5,32:6,
+static inline int    __bsr32(               int x) {             asm("bsr  %1,%0" : "=r" (x) : "rm" (x) ); return x; }
+static inline int      bsr32(               int x) { int b = -1; asm("bsrl %1,%0" : "+r" (b) : "rm" (x) ); return b + 1; }
+static inline int      bsr64(uint64_t x          ) { return x?64 - __builtin_clzll(x):0; }
+static inline int    __bsr64(uint64_t x          ) { return   63 - __builtin_clzll(x);   }
+
+static inline unsigned rol32(unsigned x, int s) { asm ("roll %%cl,%0" :"=r" (x) :"0" (x),"c" (s)); return x; }
+static inline unsigned ror32(unsigned x, int s) { asm ("rorl %%cl,%0" :"=r" (x) :"0" (x),"c" (s)); return x; }
+static inline uint64_t rol64(uint64_t x, int s) { asm ("rolq %%cl,%0" :"=r" (x) :"0" (x),"c" (s)); return x; }
+static inline uint64_t ror64(uint64_t x, int s) { asm ("rorq %%cl,%0" :"=r" (x) :"0" (x),"c" (s)); return x; }
     #else
-static ALWAYS_INLINE int    __bsr32(unsigned x          ) { return   31 - __builtin_clz(  x); }
-static ALWAYS_INLINE int      bsr32(int x               ) { return x?32 - __builtin_clz(  x):0; }
-static ALWAYS_INLINE int      bsr64(uint64_t x) { return x?64 - __builtin_clzll(x):0; }
-static ALWAYS_INLINE int    __bsr64(uint64_t x          ) { return   63 - __builtin_clzll(x);   }
-
-static ALWAYS_INLINE unsigned rol32(unsigned x, int s) { return x << s | x >> (32 - s); }
-static ALWAYS_INLINE unsigned ror32(unsigned x, int s) { return x >> s | x << (32 - s); }
-static ALWAYS_INLINE unsigned rol64(unsigned x, int s) { return x << s | x >> (64 - s); }
-static ALWAYS_INLINE unsigned ror64(unsigned x, int s) { return x >> s | x << (64 - s); }
+static inline int    __bsr32(unsigned x          ) { return   31 - __builtin_clz(  x); }
+static inline int      bsr32(int x               ) { return x?32 - __builtin_clz(  x):0; }
+static inline int      bsr64(uint64_t x) { return x?64 - __builtin_clzll(x):0; }
+
+static inline unsigned rol32(unsigned x, int s) { return x << s | x >> (32 - s); }
+static inline unsigned ror32(unsigned x, int s) { return x >> s | x << (32 - s); }
+static inline unsigned rol64(unsigned x, int s) { return x << s | x >> (64 - s); }
+static inline unsigned ror64(unsigned x, int s) { return x >> s | x << (64 - s); }
     #endif
 
 #define ctz64(_x_) __builtin_ctzll(_x_)
@@ -88,6 +66,15 @@ static ALWAYS_INLINE unsigned ror64(unsigned x, int s) { return x >> s | x << (6
 #define clz64(_x_) __builtin_clzll(_x_)
 #define clz32(_x_) __builtin_clz(_x_)    // 00000000 00000000 00000000 01000000 = 25
 
+//#define bswap8(x)    (x)
+    #if __GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ >= 8
+#define bswap16(x) __builtin_bswap16(x)
+    #else
+static inline unsigned short bswap16(unsigned short x) { return __builtin_bswap32(x << 16); }
+    #endif
+#define bswap32(x) __builtin_bswap32(x)
+#define bswap64(x) __builtin_bswap64(x)
+
   #elif _MSC_VER //----------------------------------------------------
 #include <windows.h>
 #include <intrin.h>
@@ -107,12 +94,12 @@ static ALWAYS_INLINE unsigned ror64(unsigned x, int s) { return x >> s | x << (6
 #define likely(x)       (x)
 #define unlikely(x)     (x)
 
-static ALWAYS_INLINE int __bsr32(unsigned x) { unsigned long z=0; _BitScanReverse(&z, x); return z; }
-static ALWAYS_INLINE int bsr32(  unsigned x) { unsigned long z;   _BitScanReverse(&z, x); return x?z+1:0; }
-static ALWAYS_INLINE int ctz32(  unsigned x) { unsigned long z;   _BitScanForward(&z, x); return x?z:32; }
-static ALWAYS_INLINE int clz32(  unsigned x) { unsigned long z;   _BitScanReverse(&z, x); return x?31-z:32; }
+static inline int __bsr32(unsigned x) { unsigned long z=0; _BitScanReverse(&z, x); return z; }
+static inline int bsr32(  unsigned x) { unsigned long z;   _BitScanReverse(&z, x); return x?z+1:0; }
+static inline int ctz32(  unsigned x) { unsigned long z;   _BitScanForward(&z, x); return x?z:32; }
+static inline int clz32(  unsigned x) { unsigned long z;   _BitScanReverse(&z, x); return x?31-z:32; }
   #if !defined(_M_ARM64) && !defined(_M_X64)
-static ALWAYS_INLINE unsigned char _BitScanForward64(unsigned long* ret, uint64_t x) {
+static inline unsigned char _BitScanForward64(unsigned long* ret, uint64_t x) {
   unsigned long x0 = (unsigned long)x, top, bottom;         _BitScanForward(&top, (unsigned long)(x >> 32)); _BitScanForward(&bottom, x0);
   *ret = x0 ? bottom : 32 + top;  return x != 0;
 }
@@ -121,10 +108,9 @@ static unsigned char _BitScanReverse64(unsigned long* ret, uint64_t x) {
   *ret = x1 ? top + 32 : bottom;  return x != 0;
 }
   #endif
-static ALWAYS_INLINE int __bsr64(uint64_t x) { unsigned long z = 0; _BitScanReverse64(&z, x); return z; }
-static ALWAYS_INLINE int bsr64(uint64_t x) { unsigned long z=0; _BitScanReverse64(&z, x); return x?z+1:0; }
-static ALWAYS_INLINE int ctz64(uint64_t x) { unsigned long z;   _BitScanForward64(&z, x); return x?z:64; }
-static ALWAYS_INLINE int clz64(uint64_t x) { unsigned long z;   _BitScanReverse64(&z, x); return x?63-z:64; }
+static inline int bsr64(uint64_t x) { unsigned long z=0; _BitScanReverse64(&z, x); return x?z+1:0; }
+static inline int ctz64(uint64_t x) { unsigned long z;   _BitScanForward64(&z, x); return x?z:64; }
+static inline int clz64(uint64_t x) { unsigned long z;   _BitScanReverse64(&z, x); return x?63-z:64; }
 
 #define rol32(x,s) _lrotl(x, s)
 #define ror32(x,s) _lrotr(x, s)
@@ -140,47 +126,43 @@ static ALWAYS_INLINE int clz64(uint64_t x) { unsigned long z;   _BitScanReverse6
 #define popcnt64(x) (popcnt32(x) + popcnt32(x>>32))
   #endif
 
-#define sleep(x)     Sleep(x/1000)
-#define fseeko       _fseeki64
-#define ftello       _ftelli64
-#define strcasecmp   _stricmp
-#define strncasecmp  _strnicmp
-#define strtoull     _strtoui64
-static ALWAYS_INLINE double round(double num) { return (num > 0.0) ? floor(num + 0.5) : ceil(num - 0.5); }
+#define sleep(x)    Sleep(x/1000)
+#define fseeko      _fseeki64
+#define ftello      _ftelli64
+#define strcasecmp  _stricmp
+#define strncasecmp _strnicmp
+#define strtoull    _strtoui64
+static inline double round(double num) { return (num > 0.0) ? floor(num + 0.5) : ceil(num - 0.5); }
   #endif
 
 #define __bsr8(_x_)  __bsr32(_x_)
 #define __bsr16(_x_) __bsr32(_x_)
-#define bsr8(_x_)    bsr32(_x_)
-#define bsr16(_x_)   bsr32(_x_)
-#define ctz8(_x_)    ctz32((_x_)+(1<< 8))
-#define ctz16(_x_)   ctz32((_x_)+(1<<16))
-#define clz8(_x_)    (clz32(_x_)-24)
-#define clz16(_x_)   (clz32(_x_)-16)
+#define bsr8(_x_)  bsr32(_x_)
+#define bsr16(_x_) bsr32(_x_)
+#define ctz8(_x_)  ctz32(_x_)
+#define ctz16(_x_) ctz32(_x_)
+#define clz8(_x_)  (clz32(_x_)-24)
+#define clz16(_x_) (clz32(_x_)-16)
 
-#define popcnt8(x)   popcnt32(x)
-#define popcnt16(x)  popcnt32(x)
+#define popcnt8(x)  popcnt32(x)
+#define popcnt16(x) popcnt32(x)
 
 //--------------- Unaligned memory access -------------------------------------
   #ifdef UA_MEMCPY
 #include <string.h>
-static ALWAYS_INLINE unsigned short     ctou16(const void *cp) { unsigned short     x; memcpy(&x, cp, sizeof(x)); return x; } // ua read
-static ALWAYS_INLINE unsigned           ctou32(const void *cp) { unsigned           x; memcpy(&x, cp, sizeof(x)); return x; }
-static ALWAYS_INLINE unsigned long long ctou64(const void *cp) { unsigned long long x; memcpy(&x, cp, sizeof(x)); return x; }
-static ALWAYS_INLINE size_t             ctousz(const void *cp) { size_t             x; memcpy(&x, cp, sizeof(x)); return x; }
-static ALWAYS_INLINE float              ctof32(const void *cp) { float              x; memcpy(&x, cp, sizeof(x)); return x; }
-static ALWAYS_INLINE double             ctof64(const void *cp) { double             x; memcpy(&x, cp, sizeof(x)); return x; }
-
-static ALWAYS_INLINE void               stou16(      void *cp, unsigned short     x) { memcpy(cp, &x, sizeof(x)); } // ua write
-static ALWAYS_INLINE void               stou32(      void *cp, unsigned           x) { memcpy(cp, &x, sizeof(x)); }
-static ALWAYS_INLINE void               stou64(      void *cp, unsigned long long x) { memcpy(cp, &x, sizeof(x)); }
-static ALWAYS_INLINE void               stousz(      void *cp, size_t             x) { memcpy(cp, &x, sizeof(x)); }
-static ALWAYS_INLINE void               stof32(      void *cp, float              x) { memcpy(cp, &x, sizeof(x)); }
-static ALWAYS_INLINE void               stof64(      void *cp, double             x) { memcpy(cp, &x, sizeof(x)); }
-
-static ALWAYS_INLINE void               ltou32(unsigned           *x, const void *cp) { memcpy(x, cp, sizeof(*x)); } // ua read into ptr 
-static ALWAYS_INLINE void               ltou64(unsigned long long *x, const void *cp) { memcpy(x, cp, sizeof(*x)); }
-
+static inline unsigned short     ctou16(const void *cp) { unsigned short     x; memcpy(&x, cp, sizeof(x)); return x; }
+static inline unsigned           ctou32(const void *cp) { unsigned           x; memcpy(&x, cp, sizeof(x)); return x; }
+static inline unsigned long long ctou64(const void *cp) { unsigned long long x; memcpy(&x, cp, sizeof(x)); return x; }
+static inline size_t             ctousz(const void *cp) { size_t             x; memcpy(&x, cp, sizeof(x)); return x; }
+static inline float              ctof32(const void *cp) { float              x; memcpy(&x, cp, sizeof(x)); return x; }
+static inline double             ctof64(const void *cp) { double             x; memcpy(&x, cp, sizeof(x)); return x; }
+
+static inline void               stou16(      void *cp, unsigned short     x) { memcpy(cp, &x, sizeof(x)); }
+static inline void               stou32(      void *cp, unsigned           x) { memcpy(cp, &x, sizeof(x)); }
+static inline void               stou64(      void *cp, unsigned long long x) { memcpy(cp, &x, sizeof(x)); }
+static inline void               stousz(      void *cp, size_t             x) { memcpy(cp, &x, sizeof(x)); }
+static inline void               stof32(      void *cp, float              x) { memcpy(cp, &x, sizeof(x)); }
+static inline void               stof64(      void *cp, double             x) { memcpy(cp, &x, sizeof(x)); }
   #elif defined(__i386__) || defined(__x86_64__) || \
     defined(_M_IX86) || defined(_M_AMD64) || _MSC_VER ||\
     defined(__powerpc__) || defined(__s390__) ||\
@@ -192,30 +174,14 @@ static ALWAYS_INLINE void               ltou64(unsigned long long *x, const void
 #define ctou32(_cp_) (*(unsigned       *)(_cp_))
 #define ctof32(_cp_) (*(float          *)(_cp_))
 
-#define stou16(_cp_, _x_)  (*(unsigned short *)(_cp_) = _x_)
-#define stou32(_cp_, _x_)  (*(unsigned       *)(_cp_) = _x_)
-#define stof32(_cp_, _x_)  (*(float          *)(_cp_) = _x_)
-
-#define ltou32(_px_, _cp_) *(_px_) = *(unsigned *)(_cp_)
-
     #if defined(__i386__) || defined(__x86_64__) || defined(__powerpc__) || defined(__s390__) || defined(_MSC_VER)
 #define ctou64(_cp_)       (*(uint64_t *)(_cp_))
 #define ctof64(_cp_)       (*(double   *)(_cp_))
-
-#define stou64(_cp_, _x_)  (*(uint64_t *)(_cp_) = _x_)
-#define stof64(_cp_, _x_)  (*(double   *)(_cp_) = _x_)
-
-#define ltou64(_px_, _cp_) *(_px_) = *(uint64_t *)(_cp_)
-
     #elif defined(__ARM_FEATURE_UNALIGNED)
 struct _PACKED longu     { uint64_t l; };
 struct _PACKED doubleu   { double   d; };
 #define ctou64(_cp_) ((struct longu     *)(_cp_))->l
 #define ctof64(_cp_) ((struct doubleu   *)(_cp_))->d
-
-#define stou64(_cp_) ((struct longu     *)(_cp_))->l = _x_
-#define stof64(_cp_) ((struct doubleu   *)(_cp_))->d = _x_
-#define ltou64(_px_, _cp_) *(_px_) = ((struct longu *)(_cp_))->l
     #endif
 
   #elif defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7M__) || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7S__)
@@ -230,15 +196,6 @@ struct _PACKED doubleu   { double             d; };
 #define ctou64(_cp_) ((struct longu     *)(_cp_))->l
 #define ctof32(_cp_) ((struct floatu    *)(_cp_))->f
 #define ctof64(_cp_) ((struct doubleu   *)(_cp_))->d
-
-#define stou16(_cp_, _x_) ((struct shortu    *)(_cp_))->s = _x_
-#define stou32(_cp_, _x_) ((struct unsignedu *)(_cp_))->u = _x_
-#define stou64(_cp_, _x_) ((struct longu     *)(_cp_))->l = _x_
-#define stof32(_cp_, _x_) ((struct floatu    *)(_cp_))->f = _x_
-#define stof64(_cp_, _x_) ((struct doubleu   *)(_cp_))->d = _x_
-
-#define ltou32(_cp_) *(_px_) = ((struct unsignedu *)(_cp_))->u
-#define ltou64(_cp_) *(_px_) = ((struct longu *)(_cp_))->l
   #else
 #error "unknown cpu"
   #endif
@@ -261,16 +218,12 @@ struct _PACKED doubleu   { double             d; };
 #endif
 
 //---------------------misc ---------------------------------------------------
-#define BZMASK64(_b_)                    (~(~0ull << (_b_)))
-#define BZMASK32(_b_)                    (~(~0u   << (_b_)))
-#define BZMASK16(_b_)                    BZMASK32(_b_)
-#define BZMASK8( _b_)                    BZMASK32(_b_)
-
-#define BZHI64(_u_, _b_)                 ((_u_) & BZMASK64(_b_))  // b Constant
-#define BZHI32(_u_, _b_)                 ((_u_) & BZMASK32(_b_)) 
-#define BZHI16(_u_, _b_)                 BZHI32(_u_, _b_)
-#define BZHI8( _u_, _b_)                 BZHI32(_u_, _b_)
-#define BEXTR32(x,start,len)             (((x) >> (start)) & ((1u << (len)) - 1)) //Bit field extract (with register)
+#define BZHI64F(_u_, _b_) 				 ((_u_) & ((1ull<<(_b_))-1))  // _b_ < 64
+#define BZHI32F(_u_, _b_)                ((_u_) & ((1u  <<(_b_))-1))  // _b_ < 32
+#define BZHI64( _u_, _b_)                (_b_ == 64?0xffffffffffffffffull:((_u_) & ((1ull<<(_b_))-1)))  // Constant
+#define BZHI32( _u_, _b_)                (_b_ == 32?        0xffffffffu  :((_u_) & ((1u  <<(_b_))-1)))
+#define BZHI16( _u_, _b_)                BZHI32(_u_, _b_)
+#define BZHI8(  _u_, _b_)                BZHI32(_u_, _b_)
 
     #ifdef __AVX2__
       #if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
@@ -278,31 +231,26 @@ struct _PACKED doubleu   { double             d; };
       #else
 #include <x86intrin.h>
       #endif
-#define bzhi32(_u_, _b_)                 _bzhi_u32(_u_, _b_)  // b variable
-#define bextr32(x,start,len)             _bextr_u32(x,start,len)  
+#define bzhi32(_u_, _b_)                 _bzhi_u32(_u_, _b_)
 
       #if !(defined(_M_X64) || defined(__amd64__)) && (defined(__i386__) || defined(_M_IX86))
-#define bzhi64(_u_, _b_)                 BZHI64(_u_, _b_)
+#define bzhi64(_u_, _b_)                 ((_u_) & ((1ull<<(_b_))-1))
       #else
 #define bzhi64(_u_, _b_)                 _bzhi_u64(_u_, _b_)
       #endif
     #else
-#define bzhi64(_u_, _b_)                 BZHI64(_u_, _b_) 
-#define bzhi32(_u_, _b_)                 BZHI32(_u_, _b_)
-#define bextr32(x,start,len)             (((x) >> (start)) & ((1u << (len)) - 1)) //Bit field extract (with register)
+#define bzhi_u64(_u_, _b_)               BZHI64(_u_, _b_) 
+#define bzhi_u32(_u_, _b_)               BZHI32(_u_, _b_) 
     #endif
 
-#define bzhi16(_u_, _b_)                 bzhi32(_u_, _b_)
-#define bzhi8( _u_, _b_)                 bzhi32(_u_, _b_)
-
 #define SIZE_ROUNDUP(_n_, _a_) (((size_t)(_n_) + (size_t)((_a_) - 1)) & ~(size_t)((_a_) - 1))
 #define ALIGN_DOWN(__ptr, __a) ((void *)((uintptr_t)(__ptr) & ~(uintptr_t)((__a) - 1)))
 
-#define T2_(_x_, _y_) _x_##_y_
-#define T2(_x_, _y_) T2_(_x_,_y_)
+#define TEMPLATE2_(_x_, _y_) _x_##_y_
+#define TEMPLATE2(_x_, _y_) TEMPLATE2_(_x_,_y_)
 
-#define T3_(_x_,_y_,_z_) _x_##_y_##_z_
-#define T3(_x_,_y_,_z_) T3_(_x_, _y_, _z_)
+#define TEMPLATE3_(_x_,_y_,_z_) _x_##_y_##_z_
+#define TEMPLATE3(_x_,_y_,_z_) TEMPLATE3_(_x_, _y_, _z_)
 
 #define CACHE_LINE_SIZE     64
 #define PREFETCH_DISTANCE   (CACHE_LINE_SIZE*4)
@@ -314,21 +262,21 @@ struct _PACKED doubleu   { double             d; };
   #ifdef _MSC_VER
     #ifdef NDEBUG
 #define AS(expr, fmt, ...)
-#define AC(expr, fmt, ...) do { if(!(expr)) { fprintf(stderr, fmt, ##__VA_ARGS__ ); fflush(stderr); exit(-1); } } while(0)
+#define AC(expr, fmt, ...) do { if(!(expr)) { fprintf(stderr, fmt, ##__VA_ARGS__ ); fflush(stderr); abort(); } } while(0)
 #define die(fmt, ...) do { fprintf(stderr, fmt, ##__VA_ARGS__ ); fflush(stderr); exit(-1); } while(0)
     #else
-#define AS(expr, fmt, ...) do { if(!(expr)) { fflush(stdout);fprintf(stderr, "%s:%s:%d:", __FILE__, __FUNCTION__, __LINE__); fprintf(stderr, fmt, ##__VA_ARGS__ ); fflush(stderr); exit(-1); } } while(0)
-#define AC(expr, fmt, ...) do { if(!(expr)) { fflush(stdout);fprintf(stderr, "%s:%s:%d:", __FILE__, __FUNCTION__, __LINE__); fprintf(stderr, fmt, ##__VA_ARGS__ ); fflush(stderr); exit(-1); } } while(0)
+#define AS(expr, fmt, ...) do { if(!(expr)) { fflush(stdout);fprintf(stderr, "%s:%s:%d:", __FILE__, __FUNCTION__, __LINE__); fprintf(stderr, fmt, ##__VA_ARGS__ ); fflush(stderr); abort(); } } while(0)
+#define AC(expr, fmt, ...) do { if(!(expr)) { fflush(stdout);fprintf(stderr, "%s:%s:%d:", __FILE__, __FUNCTION__, __LINE__); fprintf(stderr, fmt, ##__VA_ARGS__ ); fflush(stderr); abort(); } } while(0)
 #define die(fmt, ...) do { fprintf(stderr, "%s:%s:%d:", __FILE__, __FUNCTION__, __LINE__); fprintf(stderr, fmt, ##__VA_ARGS__ ); fflush(stderr); exit(-1); } while(0)
     #endif
   #else
     #ifdef NDEBUG
 #define AS(expr, fmt,args...)
-#define AC(expr, fmt,args...) do { if(!(expr)) { fprintf(stderr, fmt, ## args ); fflush(stderr); exit(-1); } } while(0)
+#define AC(expr, fmt,args...) do { if(!(expr)) { fprintf(stderr, fmt, ## args ); fflush(stderr); abort(); } } while(0)
 #define die(fmt,args...) do { fprintf(stderr, fmt, ## args ); fflush(stderr); exit(-1); } while(0)
     #else
-#define AS(expr, fmt,args...) do { if(!(expr)) { fflush(stdout);fprintf(stderr, "%s:%s:%d:", __FILE__, __FUNCTION__, __LINE__); fprintf(stderr, fmt, ## args ); fflush(stderr); exit(-1); } } while(0)
-#define AC(expr, fmt,args...) do { if(!(expr)) { fflush(stdout);fprintf(stderr, "%s:%s:%d:", __FILE__, __FUNCTION__, __LINE__); fprintf(stderr, fmt, ## args ); fflush(stderr); exit(-1); } } while(0)
+#define AS(expr, fmt,args...) do { if(!(expr)) { fflush(stdout);fprintf(stderr, "%s:%s:%d:", __FILE__, __FUNCTION__, __LINE__); fprintf(stderr, fmt, ## args ); fflush(stderr); abort(); } } while(0)
+#define AC(expr, fmt,args...) do { if(!(expr)) { fflush(stdout);fprintf(stderr, "%s:%s:%d:", __FILE__, __FUNCTION__, __LINE__); fprintf(stderr, fmt, ## args ); fflush(stderr); abort(); } } while(0)
 #define die(fmt,args...) do { fprintf(stderr, "%s:%s:%d:", __FILE__, __FUNCTION__, __LINE__); fprintf(stderr, fmt, ## args ); fflush(stderr); exit(-1); } while(0)
     #endif
   #endif
diff --git a/src/ext/for/eliasfano.c b/src/ext/for/eliasfano.c
deleted file mode 100644
index 730d0919..00000000
--- a/src/ext/for/eliasfano.c
+++ /dev/null
@@ -1,213 +0,0 @@
-/**
-    Copyright (C) powturbo 2013-2023
-    SPDX-License-Identifier: GPL v2 License
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    - homepage : https://sites.google.com/site/powturbo/
-    - github   : https://github.com/powturbo
-    - twitter  : https://twitter.com/powturbo
-    - email    : powturbo [_AT_] gmail [_DOT_] com
-**/
-
-//   eliasfano.c - "Integer Compression" Elias Fano
-#ifndef USIZE
-#include <stdlib.h>
-#include <string.h>
-#include "include_/conf.h"
-#include "include_/bitpack.h"
-#include "include_/bitutil.h"
-#include "include_/eliasfano.h"
-
-#include "include_/bitutil_.h"
-
-#pragma warning( disable : 4005)
-#pragma warning( disable : 4090)
-#pragma warning( disable : 4068)
-
-
-#define PAD8(__x) ( (((__x)+8-1)/8) )
-
-  #ifdef __SSE42__
-#include <nmmintrin.h>
-#define bslr32(x) _blsr_u32(x)
-#define bslr64(x) _blsr_u64(x)
-  #else
-//static inline unsigned long long blsr(unsigned long long x) { return x & (x - 1); }
-#define blsr32(_x_) ((_x_) & ((_x_) - 1))
-#define blsr64(_x_) ((_x_) & ((_x_) - 1))
-  #endif
-#define blsr8(_x_)  blsr32(_x_)
-#define blsr16(_x_) blsr32(_x_)
-
-#define EFE(__x,__i,__start) ((__x[__i] - __start)-(__i)*EF_INC)
-
-#define BITPACK bitpack
-#define BITUNPACK bitunpack
-#define EF_INC 1
-#define EFANOENC efano1enc
-#define EFANODEC efano1dec
-
-#define USIZE 32
-#include "eliasfano.c"
-#undef USIZE
-
-/*#define USIZE 16
-#include "eliasfano.c"
-#undef USIZE*/
-
-#undef EF_INC
-#undef EFANOENC
-#undef EFANODEC
-
-//----------
-#define EF_INC 0
-#define EFANOENC efanoenc
-#define EFANODEC efanodec
-
-#define USIZE 32
-#include "eliasfano.c"
-#undef USIZE
-
-#define USIZE 64
-#include "eliasfano.c"
-#undef USIZE
-
-/*#define USIZE 16
-#include "eliasfano.c"
-#undef USIZE*/
-
-#undef BITPACK
-#undef BITUNPACK
-
-#undef EF_INC
-#undef EFANOENC
-#undef EFANODEC
-
-//----------------------
-  #if defined(__SSE2__) || defined(__ARM_NEON)
-#define VSIZE 128
-
-#define BITPACK   bitpack128v
-#define BITUNPACK bitunpack128v
-#define EF_INC 1
-#define EFANOENC  efano1enc128v
-#define EFANODEC  efano1dec128v
-
-#define USIZE 32
-#include "eliasfano.c"
-#undef EF_INC
-#undef EFANOENC
-#undef EFANODEC
-
-#define EF_INC 0
-#define EFANOENC  efanoenc128v
-#define EFANODEC  efanodec128v
-#include "eliasfano.c"
-  #endif
-
-  #ifdef __AVX2__
-#define VSIZE 256
-#define BITPACK bitpack256v
-#define BITUNPACK bitunpack256v
-#define EF_INC 1
-#define EFANOENC efano1enc256v
-#define EFANODEC efano1dec256v
-#include "eliasfano.c"
-
-#define EF_INC 0
-#define EFANOENC efanoenc256v
-#define EFANODEC efanodec256v
-#include "eliasfano.c"
-  #endif
-
-#else //--------------------------------------------- implementation ---------------------------------------------------------------
-#define uint_t T3(uint, USIZE, _t)
-
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Wparentheses"
-
-unsigned char *T2(EFANOENC, USIZE)(uint_t *__restrict in, unsigned n, unsigned char *__restrict out, uint_t start) {
-  uint_t *ip, e,x,hl,i;
-  unsigned char *op;
-  unsigned lb;
-  uint_t _pa[1024+64],*pa=_pa;
-  if(!n) return out;
-  if(n > 1024) pa = malloc(sizeof(pa[0])*(n+64));    if(!pa) die("efanoenc:malloc error size=%d ", n);
-  e = EFE(in,n-1,start);
-  if(!e) { out[0] = 0; if(pa != _pa) free(pa);return out+1; }
-
-  lb = T2(bsr, USIZE)(e/n);
-  x = ((uint_t)1 << lb)-1; hl = PAD8((e>>lb)+n);
-
-  for(i = 0; i != n&~3;) {
-    pa[i] = EFE(in,i,start) & x; ++i;
-    pa[i] = EFE(in,i,start) & x; ++i;
-    pa[i] = EFE(in,i,start) & x; ++i;
-    pa[i] = EFE(in,i,start) & x; ++i;
-  }
-  while(i < n) pa[i] = EFE(in,i,start) & x, ++i;
-  *out = lb+1;
-  op = T2(BITPACK,USIZE)(pa, n, out+1, lb);
-
-  memset(op, 0, hl);
-  for(i = 0; i != n&~3; ) {
-    x = i + (EFE(in,i,start) >> lb), op[x >> 3] |= (uint_t)1 << (x & 7); ++i;
-    x = i + (EFE(in,i,start) >> lb), op[x >> 3] |= (uint_t)1 << (x & 7); ++i;
-    x = i + (EFE(in,i,start) >> lb), op[x >> 3] |= (uint_t)1 << (x & 7); ++i;
-    x = i + (EFE(in,i,start) >> lb), op[x >> 3] |= (uint_t)1 << (x & 7); ++i;
-  }
-  while(i < n) x = i + (EFE(in,i,start) >> lb), op[x >> 3] |= (uint_t)1 << (x & 7),++i;
-  if(pa != _pa) free(pa);
-  return op+hl;
-}
-
-unsigned char *T2(EFANODEC, USIZE)(unsigned char *__restrict in, unsigned n, uint_t *__restrict out, uint_t start) {
-  unsigned char *ip = in;
-  uint_t        i,j,lb = *ip++;
-  uint64_t      b,x;
-  if(!n)
-    return in;
-
-  if(!lb) {
-      #if (defined(__SSE2__) || defined(__ARM_NEON)) && USIZE == 32
-        #if EF_INC == 1
-    BITFORZERO32(out, n, start, 1);
-        #else
-    BITZERO32( out, n, start);
-        #endif
-      #else
-    BITFORSET_(out, n, start, EF_INC);
-      #endif
-    return ip;
-  }
-
-  ip = T2(BITUNPACK,USIZE)(ip, n, out, --lb);
-  #define EFD(i) if(!b) break; out[i] += ((uint_t)(j+ctz64(b)-i) << lb) + start+i*EF_INC; b = blsr64(b); ++i;
-
-  for(i=j=0;; j += sizeof(uint64_t)*8) {                                            //PREFETCH(ip+256,0);
-    for(b = ctou64(ip+(j>>3)); ; ) {
-      EFD(i); EFD(i); EFD(i); EFD(i);
-      if(!b) break; out[i] += ((uint_t)(j+ctz64(b)-i) << lb) + start+i*EF_INC;
-      if(unlikely(++i >= n))
-        goto e;
-      b = blsr64(b);
-    }
-  }
-  e:return ip + PAD8((EFE(out,n-1,start)>>lb)+n);
-}
-
-#pragma clang diagnostic pop
-#endif
diff --git a/src/ext/for/ext/OPT_PFD/main.cpp b/src/ext/for/ext/OPT_PFD/main.cpp
deleted file mode 100644
index 2c0ec066..00000000
--- a/src/ext/for/ext/OPT_PFD/main.cpp
+++ /dev/null
@@ -1,101 +0,0 @@
-/*
- *  test for OPT-pfd
- *
- *      Author: sding
- *
- *
- */
-
-
-
-#include<iostream>
-#include <stdlib.h>
-#include <stdio.h>
-
-#include "opt_p4.h"
-
-using namespace std;
-
-char PATH[128] = "/usr/home/shuai/dumplist/wordlist_Excite";		// for reading list
-
-int get_list(char *term, unsigned int *doc_id, unsigned int *freq, unsigned int *maxc)
-{
-	char fpath[128];
-	sprintf(fpath,"%s/%s",PATH,term);
-	FILE *fdd = fopen(fpath,"r");
-	if(fdd==NULL)	return 0;
-
-	int nread, npos;
-
-	nread = fread(&npos, sizeof(unsigned), 1, fdd);
-	npos = 0;
-
-	while (nread > 0)
-	{
-		nread = fread(&doc_id[npos], sizeof(unsigned), 1, fdd);
-		if (nread <= 0)  break;
-		fread(&freq[npos], sizeof(unsigned), 1, fdd);
-		npos++;
-	}
-	fclose(fdd);
-
-	int i;
-
-    /* fill out the max values */
-	for (i = 0; i < npos; i += BS)
-		maxc[(i/BS)] = doc_id[i+BS-1];
-
-    /* take the gap for doc_id */
-	for (i = npos-1; i > 0; i--)
-	{	
-		doc_id[i] -= doc_id[i-1];
-		doc_id[i] --;
-	}
-
-	for (i = 0; i < npos; i++)
-		freq[i]--;
-	return npos;
-}
-
-int main()	// just for testing
-{
-	int MAX_NDOC = 25205179;
-	unsigned int *docid = new unsigned int[MAX_NDOC];
-	unsigned int *docid_check = new unsigned int[MAX_NDOC ];
-
-	unsigned int *fre = new unsigned int[MAX_NDOC];
-	unsigned int *maxc = new unsigned int[MAX_NDOC/BS];
-	unsigned int *aux = new unsigned int[MAX_NDOC];
-	unsigned int * all_array = new unsigned int[2048];		// extra array for coding
-	
-
-	int listSize = get_list("information", docid, fre, maxc);
-	cout<<"list size is "<<listSize<<endl;
-	for(int i=0;i<listSize; i++)
-		docid_check[i] = docid[i];
-
-	int cSize = OPT4(docid,listSize,aux);
-	cout<<"Compressed size is "<<cSize<<" byte"<<endl;
-
-	unsigned int *_ww = aux;
-	for (int i = 0; i*BS < listSize; i++)
-	{
-		/* _ww = detailed_p4_decode(docid_check, _ww, all_array); */		// this is fast
-		_ww = 	detailed_p4_decode(docid_check + BS * i, _ww,  all_array);	// this is slow
-	}
-	// check correctness
-	for(int i=0;i<listSize;i++)
-	{
-		if( docid_check[i]!= docid[i] )
-		{
-			cout<<"Exceptions happen"<<endl;
-			exit (1);
-		}
-	}
-	delete []docid;
-	delete []docid_check;
-	delete []fre;
-	delete []maxc;
-	delete []aux;
-	delete []all_array;
-}
diff --git a/src/ext/for/ext/OPT_PFD/opt_p4.h b/src/ext/for/ext/OPT_PFD/opt_p4.h
deleted file mode 100644
index 24038fa0..00000000
--- a/src/ext/for/ext/OPT_PFD/opt_p4.h
+++ /dev/null
@@ -1,54 +0,0 @@
-#include "pf.h"
-#define BS 128
-//using namespace std;
-//file "OPT_PFD.zip" form: http://jinruhe.com/  
-//int dnum[17] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,16,20,32};
-
-void p4_encode(unsigned int *doc_id, int npos, int b,unsigned int *buf , int *size, int *ex_n)
-{
-	int i = 0;
-	unsigned int *ww = buf;
-	detailed_p4_encode(&ww, &(doc_id[i]), b, size,ex_n);
-}
-
-/*
-*	when list_size is too small, not good to use this function
-*/
-int OPT4(unsigned int *doc_id,unsigned int list_size,unsigned int *aux)
-{
-	int i,j,l;
-	for(i=0; i<2*BS; i++)
-	{
-		doc_id[i+list_size] = 0 ;		// pack the input, avoid garbage data in the end
-	}
-	int size = 0;
-	int ex_n = 0;
-	int csize = 0; 	// compressed size in bytes
-			
-	int chunk_size = 0;																					
-	int b = -1, temp_en = 0;
-	int offset = 0;
-	for(j=0;j<list_size;j+=BS)				// for each chunk
-	{
-		chunk_size = 999999999;
-		b = -1;
-		// get the smallest chunk size by trying all b's
-		for(l=0;l<16;l++)
-				{
-					p4_encode(doc_id+j, BS, l, aux+offset, &size, &ex_n);
-					if(chunk_size > size * 4)			// int bytes
-					{
-						chunk_size = size *4;
-						b = l;
-						temp_en = ex_n;
-					}		
-				}	
-
-				csize += chunk_size;
-				//printf("encode:%u\n", b);
-				p4_encode(doc_id + j, BS, b, aux + offset, &size, &ex_n);
-				offset += size;
-	}
-	
-	return csize;	
-}
diff --git a/src/ext/for/ext/OPT_PFD/pf.h b/src/ext/for/ext/OPT_PFD/pf.h
deleted file mode 100644
index 788f8cca..00000000
--- a/src/ext/for/ext/OPT_PFD/pf.h
+++ /dev/null
@@ -1,158 +0,0 @@
-#include "s16head.h"
-#include "unpack.h"
-
-
-#define BS 128 
-#define FRAC 0.10 
-#define S 16 
-#define PCHUNK 128
-
-void pack(unsigned int *v, unsigned int b, unsigned int n, unsigned int *w);
-
-
-int detailed_p4_encode(unsigned int **w, unsigned int* p, int num , int *chunk_size, int * exception_n)
-{
-	int i, j, t, s;
-
-	unsigned int b = cnum[num];
-	int bb_e;
-	int bb_p;
-	int p_low;
-	unsigned int e_n = 0;
-	int max_p = 0;
-	int max_e = 0;
- 
-	unsigned int* out = (unsigned*)malloc(sizeof(unsigned)*PCHUNK*2);
-	unsigned int* ex = (unsigned*)malloc(sizeof(unsigned)*PCHUNK*2);
-	unsigned int* po = (unsigned*)malloc(sizeof(unsigned)*PCHUNK*2);
-
-	unsigned int* tp = NULL;
-	unsigned int *_pp, *_ww;
-
-	if (b == 32)
-	{
-		(*w)[0] = ((b<<10)) + (0);
-		*w +=1;	
-		for (i = 0; i < PCHUNK ; i++)  (*w)[i] = p[i];
-		*w += (PCHUNK);
-		(*chunk_size) = 1 + BS;
-
-		free(out);
-		free(ex);
-		free(po);
-		return 0;
-	}
-
-	for (i = 0; i < PCHUNK ; i++)
-	{
-		if ( p[i] >= (1<<b) )		//exception
-		{
-			p_low = p[i] & ((1<<b)-1);
-			out[i] = p_low;
-			ex[e_n] = (p[i] >> b);
-			po[(e_n++)] = i;               //          
-		}
-		else
-			out[i] = p[i];
-	}
-
-	if (1)		// force to pass every time
-	{
-		/*get the gap of position*/
-		for(j = e_n-1;j>0;j--)
-		{
-			po[j] = po[j] - po[j-1] ; 
-			po[j] --;
-		}
- 	
-		s = ((b * PCHUNK)>>5);
-		tp = (*w);
-		(*w)[0] = ((num<<10))+e_n;			// record b and number of exceptions into this value, in the other version we pick this value out and did not count it
-		(*w) += 1;		
-		for (i = 0; i < s; i++)  (*w)[i] = 0;
-		pack(out, b, PCHUNK , *w);
-		*w += s;
-
-		unsigned int *all_array = (unsigned*)malloc(sizeof(unsigned)*PCHUNK*4) ;
-		for(j=0;j<e_n;j++)		
-		{
-			all_array[j] = po[j];
-			all_array[e_n+j] =ex[j];
-		}
-		for (_pp = all_array, _ww = (*w); _pp < &(all_array[2*e_n]); )
-			s16_encode(&_ww, &_pp, &(all_array[2*e_n]) - _pp);
-
-		(*chunk_size) = 1 + s + (_ww - (*w)) ;
-
-		(*w) += (_ww - (*w)) ;
-		
-		(*exception_n) = e_n;
-
-		free(out);
-		free(ex);
-		free(po);
-		free(all_array);
-		return (e_n);
-
-	}
-}
-
-
-void pack(unsigned int *v, unsigned int b, unsigned int n, unsigned int *w)
-{
-  int i, bp, wp, s;
-
-  for (bp = 0, i = 0; i < n; i++, bp += b)
-  {
-    wp = bp>>5;
-    s = 32 - b - (bp & 31);
-    if (s >= 0)
-      w[wp] |= (v[i]<<s);
-    else
-    {
-      s = -s;
-      w[wp] |= (v[i]>>s);
-      w[wp+1] = (v[i]<<(32-s));
-    }
-  }   
-}
-
-/*modified p4decode */
-unsigned int *detailed_p4_decode(unsigned int *_p, unsigned int *_w,  unsigned int * all_array)
-{
-
-  int i, s;
-  unsigned int x;
-  int flag = _w[0];
-  (_w)++;
-  
-  unsigned int *_ww,*_pp;
-  unsigned int b = ((flag>>10) & 31);
-  unsigned int e_n = (flag & 1023) ;
-
-  (unpack[b])(_p, _w);
-
-  b = cnum[b];
-  _w += ((b * BS)>>5);
-  unsigned int _k = 0;
-  unsigned int psum = 0;
-  if(e_n != 0 )
-  {
-	  for (_pp = all_array, _ww = (unsigned int *)(_w); _pp < &(all_array[e_n*2]);)
-	  {
-		  S16_DECODE(_ww, _pp);
-	  }
-
-	  _w += (_ww - _w);
-	  psum = all_array[0];
-
-	  for(i=0;i<e_n;i++)
-	  {
-		  _p[psum] += (all_array[e_n+i]<<b);
-		  psum += all_array[ i + 1] + 1;
-	  }
-  }
-  return(_w);
-}
-
-
diff --git a/src/ext/for/ext/OPT_PFD/s16head.h b/src/ext/for/ext/OPT_PFD/s16head.h
deleted file mode 100644
index 99ae4ff1..00000000
--- a/src/ext/for/ext/OPT_PFD/s16head.h
+++ /dev/null
@@ -1,251 +0,0 @@
-
-void s16_encode(unsigned int **_w, unsigned int **_p, unsigned int m)
-{ 
-int cnum[16] = {28, 21, 21, 21, 14, 9, 8, 7, 6, 6, 5, 5, 4, 3, 2, 1};
-int cbits[16][28] = { {1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1},
-   {2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0},
-   {1,1,1,1,1,1,1,2,2,2,2,2,2,2,1,1,1,1,1,1,1,0,0,0,0,0,0,0},
-   {1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,0,0,0,0,0,0,0},
-   {2,2,2,2,2,2,2,2,2,2,2,2,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
-   {4,3,3,3,3,3,3,3,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
-   {3,4,4,4,4,3,3,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
-   {4,4,4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
-   {5,5,5,5,4,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
-   {4,4,5,5,5,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
-   {6,6,6,5,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
-   {5,5,6,6,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
-   {7,7,7,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
-   {10,9,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
-   {14,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
-   {28,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0} };
-
-  unsigned int _k, _j, _m, _o;
-
-  for (_k = 0; _k < 16; _k++) 
-  { 
-    (**_w) = _k<<28; 
-    _m = (cnum[_k] < m)? cnum[_k]:m; 
-    for (_j = 0, _o = 0; (_j < _m) && (*((*_p)+_j) < (1<<cbits[_k][_j])); ) 
-    { 
-      (**_w) += ((*((*_p)+_j))<<_o); 
-      _o += cbits[_k][_j]; 
-      _j++;
-    } 
-    if (_j == _m) 
-    { 
-      (*_p) += _m; 
-      (*_w)++; 
-      break; 
-    } 
-  } 
-}
-
-
-/* more optimized handcoded edition */
-
-#define S16_DECODE(_w, _p)	\
-{ \
-  _k = (*_w)>>28; \
-  switch(_k) \
-  { \
-    case 0: \
-      *_p = (*_w) & 1;     _p++; \
-      *_p = (*_w>>1) & 1;  _p++; \
-      *_p = (*_w>>2) & 1;  _p++; \
-      *_p = (*_w>>3) & 1;  _p++; \
-      *_p = (*_w>>4) & 1;  _p++; \
-      *_p = (*_w>>5) & 1;  _p++; \
-      *_p = (*_w>>6) & 1;  _p++; \
-      *_p = (*_w>>7) & 1;  _p++; \
-      *_p = (*_w>>8) & 1;  _p++; \
-      *_p = (*_w>>9) & 1;  _p++; \
-      *_p = (*_w>>10) & 1;  _p++; \
-      *_p = (*_w>>11) & 1;  _p++; \
-      *_p = (*_w>>12) & 1;  _p++; \
-      *_p = (*_w>>13) & 1;  _p++; \
-      *_p = (*_w>>14) & 1;  _p++; \
-      *_p = (*_w>>15) & 1;  _p++; \
-      *_p = (*_w>>16) & 1;  _p++; \
-      *_p = (*_w>>17) & 1;  _p++; \
-      *_p = (*_w>>18) & 1;  _p++; \
-      *_p = (*_w>>19) & 1;  _p++; \
-      *_p = (*_w>>20) & 1;  _p++; \
-      *_p = (*_w>>21) & 1;  _p++; \
-      *_p = (*_w>>22) & 1;  _p++; \
-      *_p = (*_w>>23) & 1;  _p++; \
-      *_p = (*_w>>24) & 1;  _p++; \
-      *_p = (*_w>>25) & 1;  _p++; \
-      *_p = (*_w>>26) & 1;  _p++; \
-      *_p = (*_w>>27) & 1;  _p++; \
-      break; \
-    case 1: \
-      *_p = (*_w) & 3;     _p++; \
-      *_p = (*_w>>2) & 3;  _p++; \
-      *_p = (*_w>>4) & 3;  _p++; \
-      *_p = (*_w>>6) & 3;  _p++; \
-      *_p = (*_w>>8) & 3;  _p++; \
-      *_p = (*_w>>10) & 3;  _p++; \
-      *_p = (*_w>>12) & 3;  _p++; \
-      *_p = (*_w>>14) & 1;  _p++; \
-      *_p = (*_w>>15) & 1;  _p++; \
-      *_p = (*_w>>16) & 1;  _p++; \
-      *_p = (*_w>>17) & 1;  _p++; \
-      *_p = (*_w>>18) & 1;  _p++; \
-      *_p = (*_w>>19) & 1;  _p++; \
-      *_p = (*_w>>20) & 1;  _p++; \
-      *_p = (*_w>>21) & 1;  _p++; \
-      *_p = (*_w>>22) & 1;  _p++; \
-      *_p = (*_w>>23) & 1;  _p++; \
-      *_p = (*_w>>24) & 1;  _p++; \
-      *_p = (*_w>>25) & 1;  _p++; \
-      *_p = (*_w>>26) & 1;  _p++; \
-      *_p = (*_w>>27) & 1;  _p++; \
-      break; \
-    case 2: \
-      *_p = (*_w) & 1;     _p++; \
-      *_p = (*_w>>1) & 1;  _p++; \
-      *_p = (*_w>>2) & 1;  _p++; \
-      *_p = (*_w>>3) & 1;  _p++; \
-      *_p = (*_w>>4) & 1;  _p++; \
-      *_p = (*_w>>5) & 1;  _p++; \
-      *_p = (*_w>>6) & 1;  _p++; \
-      *_p = (*_w>>7) & 3;  _p++; \
-      *_p = (*_w>>9) & 3;  _p++; \
-      *_p = (*_w>>11) & 3;  _p++; \
-      *_p = (*_w>>13) & 3;  _p++; \
-      *_p = (*_w>>15) & 3;  _p++; \
-      *_p = (*_w>>17) & 3;  _p++; \
-      *_p = (*_w>>19) & 3;  _p++; \
-      *_p = (*_w>>21) & 1;  _p++; \
-      *_p = (*_w>>22) & 1;  _p++; \
-      *_p = (*_w>>23) & 1;  _p++; \
-      *_p = (*_w>>24) & 1;  _p++; \
-      *_p = (*_w>>25) & 1;  _p++; \
-      *_p = (*_w>>26) & 1;  _p++; \
-      *_p = (*_w>>27) & 1;  _p++; \
-      break; \
-    case 3: \
-      *_p = (*_w) & 1;     _p++; \
-      *_p = (*_w>>1) & 1;  _p++; \
-      *_p = (*_w>>2) & 1;  _p++; \
-      *_p = (*_w>>3) & 1;  _p++; \
-      *_p = (*_w>>4) & 1;  _p++; \
-      *_p = (*_w>>5) & 1;  _p++; \
-      *_p = (*_w>>6) & 1;  _p++; \
-      *_p = (*_w>>7) & 1;  _p++; \
-      *_p = (*_w>>8) & 1;  _p++; \
-      *_p = (*_w>>9) & 1;  _p++; \
-      *_p = (*_w>>10) & 1;  _p++; \
-      *_p = (*_w>>11) & 1;  _p++; \
-      *_p = (*_w>>12) & 1;  _p++; \
-      *_p = (*_w>>13) & 1;  _p++; \
-      *_p = (*_w>>14) & 3;  _p++; \
-      *_p = (*_w>>16) & 3;  _p++; \
-      *_p = (*_w>>18) & 3;  _p++; \
-      *_p = (*_w>>20) & 3;  _p++; \
-      *_p = (*_w>>22) & 3;  _p++; \
-      *_p = (*_w>>24) & 3;  _p++; \
-      *_p = (*_w>>26) & 3;  _p++; \
-      break; \
-    case 4: \
-      *_p = (*_w) & 3;     _p++; \
-      *_p = (*_w>>2) & 3;  _p++; \
-      *_p = (*_w>>4) & 3;  _p++; \
-      *_p = (*_w>>6) & 3;  _p++; \
-      *_p = (*_w>>8) & 3;  _p++; \
-      *_p = (*_w>>10) & 3;  _p++; \
-      *_p = (*_w>>12) & 3;  _p++; \
-      *_p = (*_w>>14) & 3;  _p++; \
-      *_p = (*_w>>16) & 3;  _p++; \
-      *_p = (*_w>>18) & 3;  _p++; \
-      *_p = (*_w>>20) & 3;  _p++; \
-      *_p = (*_w>>22) & 3;  _p++; \
-      *_p = (*_w>>24) & 3;  _p++; \
-      *_p = (*_w>>26) & 3;  _p++; \
-      break; \
-    case 5: \
-      *_p = (*_w) & 15;     _p++; \
-      *_p = (*_w>>4) & 7;  _p++; \
-      *_p = (*_w>>7) & 7;  _p++; \
-      *_p = (*_w>>10) & 7;  _p++; \
-      *_p = (*_w>>13) & 7;  _p++; \
-      *_p = (*_w>>16) & 7;  _p++; \
-      *_p = (*_w>>19) & 7;  _p++; \
-      *_p = (*_w>>22) & 7;  _p++; \
-      *_p = (*_w>>25) & 7;  _p++; \
-      break; \
-    case 6: \
-      *_p = (*_w) & 7;     _p++; \
-      *_p = (*_w>>3) & 15;  _p++; \
-      *_p = (*_w>>7) & 15;  _p++; \
-      *_p = (*_w>>11) & 15;  _p++; \
-      *_p = (*_w>>15) & 15;  _p++; \
-      *_p = (*_w>>19) & 7;  _p++; \
-      *_p = (*_w>>22) & 7;  _p++; \
-      *_p = (*_w>>25) & 7;  _p++; \
-      break; \
-    case 7: \
-      *_p = (*_w) & 15;     _p++; \
-      *_p = (*_w>>4) & 15;  _p++; \
-      *_p = (*_w>>8) & 15;  _p++; \
-      *_p = (*_w>>12) & 15;  _p++; \
-      *_p = (*_w>>16) & 15;  _p++; \
-      *_p = (*_w>>20) & 15;  _p++; \
-      *_p = (*_w>>24) & 15;  _p++; \
-      break; \
-    case 8: \
-      *_p = (*_w) & 31;     _p++; \
-      *_p = (*_w>>5) & 31;  _p++; \
-      *_p = (*_w>>10) & 31;  _p++; \
-      *_p = (*_w>>15) & 31;  _p++; \
-      *_p = (*_w>>20) & 15;  _p++; \
-      *_p = (*_w>>24) & 15;  _p++; \
-      break; \
-    case 9: \
-      *_p = (*_w) & 15;     _p++; \
-      *_p = (*_w>>4) & 15;  _p++; \
-      *_p = (*_w>>8) & 31;  _p++; \
-      *_p = (*_w>>13) & 31;  _p++; \
-      *_p = (*_w>>18) & 31;  _p++; \
-      *_p = (*_w>>23) & 31;  _p++; \
-      break; \
-    case 10: \
-      *_p = (*_w) & 63;     _p++; \
-      *_p = (*_w>>6) & 63;  _p++; \
-      *_p = (*_w>>12) & 63;  _p++; \
-      *_p = (*_w>>18) & 31;  _p++; \
-      *_p = (*_w>>23) & 31;  _p++; \
-      break; \
-    case 11: \
-      *_p = (*_w) & 31;     _p++; \
-      *_p = (*_w>>5) & 31;  _p++; \
-      *_p = (*_w>>10) & 63;  _p++; \
-      *_p = (*_w>>16) & 63;  _p++; \
-      *_p = (*_w>>22) & 63;  _p++; \
-      break; \
-    case 12: \
-      *_p = (*_w) & 127;     _p++; \
-      *_p = (*_w>>7) & 127;  _p++; \
-      *_p = (*_w>>14) & 127;  _p++; \
-      *_p = (*_w>>21) & 127;  _p++; \
-      break; \
-    case 13: \
-      *_p = (*_w) & 1023;     _p++; \
-      *_p = (*_w>>10) & 511;  _p++; \
-      *_p = (*_w>>19) & 511;  _p++; \
-      break; \
-    case 14: \
-      *_p = (*_w) & 16383;     _p++; \
-      *_p = (*_w>>14) & 16383;  _p++; \
-      break; \
-    case 15: \
-      *_p = (*_w) & ((1<<28)-1);     _p++; \
-      break; \
-  }\
-  _w++; \
-}
-
-
-
-
-
diff --git a/src/ext/for/ext/OPT_PFD/unpack.h b/src/ext/for/ext/OPT_PFD/unpack.h
deleted file mode 100644
index abb225cd..00000000
--- a/src/ext/for/ext/OPT_PFD/unpack.h
+++ /dev/null
@@ -1,773 +0,0 @@
-
-/*************************************************************/
-/* macros for fast unpacking of integers of fixed bit length */
-/*************************************************************/
-
-#define BS 128 
-
-/* supported bit lengths */
-int cnum[17] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,16,20,32};
-
-void unpack0(unsigned int *p, unsigned int *w)
-{
-  int i;
-
-  for (i = 0; i < BS; i++)  p[i] = 0;
-}
-
-
-void unpack1(unsigned int *p, unsigned int *w)
-{
-  int i;
-
-  for (i = 0; i < BS; i += 32, p += 32, w += 1)
-  {
-    p[0] = (w[0] >> 31);
-    p[1] = (w[0] >> 30) & 1;
-    p[2] = (w[0] >> 29) & 1;
-    p[3] = (w[0] >> 28) & 1;
-    p[4] = (w[0] >> 27) & 1;
-    p[5] = (w[0] >> 26) & 1;
-    p[6] = (w[0] >> 25) & 1;
-    p[7] = (w[0] >> 24) & 1;
-    p[8] = (w[0] >> 23) & 1;
-    p[9] = (w[0] >> 22) & 1;
-    p[10] = (w[0] >> 21) & 1;
-    p[11] = (w[0] >> 20) & 1;
-    p[12] = (w[0] >> 19) & 1;
-    p[13] = (w[0] >> 18) & 1;
-    p[14] = (w[0] >> 17) & 1;
-    p[15] = (w[0] >> 16) & 1;
-    p[16] = (w[0] >> 15) & 1;
-    p[17] = (w[0] >> 14) & 1;
-    p[18] = (w[0] >> 13) & 1;
-    p[19] = (w[0] >> 12) & 1;
-    p[20] = (w[0] >> 11) & 1;
-    p[21] = (w[0] >> 10) & 1;
-    p[22] = (w[0] >> 9) & 1;
-    p[23] = (w[0] >> 8) & 1;
-    p[24] = (w[0] >> 7) & 1;
-    p[25] = (w[0] >> 6) & 1;
-    p[26] = (w[0] >> 5) & 1;
-    p[27] = (w[0] >> 4) & 1;
-    p[28] = (w[0] >> 3) & 1;
-    p[29] = (w[0] >> 2) & 1;
-    p[30] = (w[0] >> 1) & 1;
-    p[31] = (w[0]) & 1;
-  }
-}
-
-
-void unpack2(unsigned int *p, unsigned int *w)
-{
-  int i;
-
-  for (i = 0; i < BS; i += 32, p += 32, w += 2)
-  {
-    p[0] = (w[0] >> 30);
-    p[1] = (w[0] >> 28) & 3;
-    p[2] = (w[0] >> 26) & 3;
-    p[3] = (w[0] >> 24) & 3;
-    p[4] = (w[0] >> 22) & 3;
-    p[5] = (w[0] >> 20) & 3;
-    p[6] = (w[0] >> 18) & 3;
-    p[7] = (w[0] >> 16) & 3;
-    p[8] = (w[0] >> 14) & 3;
-    p[9] = (w[0] >> 12) & 3;
-    p[10] = (w[0] >> 10) & 3;
-    p[11] = (w[0] >> 8) & 3;
-    p[12] = (w[0] >> 6) & 3;
-    p[13] = (w[0] >> 4) & 3;
-    p[14] = (w[0] >> 2) & 3;
-    p[15] = (w[0]) & 3;
-    p[16] = (w[1] >> 30);
-    p[17] = (w[1] >> 28) & 3;
-    p[18] = (w[1] >> 26) & 3;
-    p[19] = (w[1] >> 24) & 3;
-    p[20] = (w[1] >> 22) & 3;
-    p[21] = (w[1] >> 20) & 3;
-    p[22] = (w[1] >> 18) & 3;
-    p[23] = (w[1] >> 16) & 3;
-    p[24] = (w[1] >> 14) & 3;
-    p[25] = (w[1] >> 12) & 3;
-    p[26] = (w[1] >> 10) & 3;
-    p[27] = (w[1] >> 8) & 3;
-    p[28] = (w[1] >> 6) & 3;
-    p[29] = (w[1] >> 4) & 3;
-    p[30] = (w[1] >> 2) & 3;
-    p[31] = (w[1]) & 3;
-  }
-}
-
-
-void unpack3(unsigned int *p, unsigned int *w)
-{
-  int i;
-
-  for (i = 0; i < BS; i += 32, p += 32, w += 3) 
-  { 
-    p[0] = (w[0] >> 29);
-    p[1] = (w[0] >> 26) & 7;
-    p[2] = (w[0] >> 23) & 7;
-    p[3] = (w[0] >> 20) & 7;
-    p[4] = (w[0] >> 17) & 7;
-    p[5] = (w[0] >> 14) & 7;
-    p[6] = (w[0] >> 11) & 7;
-    p[7] = (w[0] >> 8) & 7;
-    p[8] = (w[0] >> 5) & 7;
-    p[9] = (w[0] >> 2) & 7;
-    p[10] = (w[0] << 1) & 7;
-    p[10] |= (w[1] >> 31);
-    p[11] = (w[1] >> 28) & 7;
-    p[12] = (w[1] >> 25) & 7;
-    p[13] = (w[1] >> 22) & 7;
-    p[14] = (w[1] >> 19) & 7;
-    p[15] = (w[1] >> 16) & 7;
-    p[16] = (w[1] >> 13) & 7;
-    p[17] = (w[1] >> 10) & 7;
-    p[18] = (w[1] >> 7) & 7;
-    p[19] = (w[1] >> 4) & 7;
-    p[20] = (w[1] >> 1) & 7;
-    p[21] = (w[1] << 2) & 7;
-    p[21] |= (w[2] >> 30);
-    p[22] = (w[2] >> 27) & 7;
-    p[23] = (w[2] >> 24) & 7;
-    p[24] = (w[2] >> 21) & 7;
-    p[25] = (w[2] >> 18) & 7;
-    p[26] = (w[2] >> 15) & 7;
-    p[27] = (w[2] >> 12) & 7;
-    p[28] = (w[2] >> 9) & 7;
-    p[29] = (w[2] >> 6) & 7;
-    p[30] = (w[2] >> 3) & 7;
-    p[31] = (w[2]) & 7;
-  }
-}
-
-
-void unpack4(unsigned int *p, unsigned int *w)
-{
-  int i;
-
-  for (i = 0; i < BS; i += 32, p += 32, w += 4) 
-  { 
-    p[0] = (w[0] >> 28);
-    p[1] = (w[0] >> 24) & 15;
-    p[2] = (w[0] >> 20) & 15;
-    p[3] = (w[0] >> 16) & 15;
-    p[4] = (w[0] >> 12) & 15;
-    p[5] = (w[0] >> 8) & 15;
-    p[6] = (w[0] >> 4) & 15;
-    p[7] = (w[0]) & 15;
-    p[8] = (w[1] >> 28);
-    p[9] = (w[1] >> 24) & 15;
-    p[10] = (w[1] >> 20) & 15;
-    p[11] = (w[1] >> 16) & 15;
-    p[12] = (w[1] >> 12) & 15;
-    p[13] = (w[1] >> 8) & 15;
-    p[14] = (w[1] >> 4) & 15;
-    p[15] = (w[1]) & 15;
-    p[16] = (w[2] >> 28);
-    p[17] = (w[2] >> 24) & 15;
-    p[18] = (w[2] >> 20) & 15;
-    p[19] = (w[2] >> 16) & 15;
-    p[20] = (w[2] >> 12) & 15;
-    p[21] = (w[2] >> 8) & 15;
-    p[22] = (w[2] >> 4) & 15;
-    p[23] = (w[2]) & 15;
-    p[24] = (w[3] >> 28);
-    p[25] = (w[3] >> 24) & 15;
-    p[26] = (w[3] >> 20) & 15;
-    p[27] = (w[3] >> 16) & 15;
-    p[28] = (w[3] >> 12) & 15;
-    p[29] = (w[3] >> 8) & 15;
-    p[30] = (w[3] >> 4) & 15;
-    p[31] = (w[3]) & 15;
-  }
-}
-
-
-void unpack5(unsigned int *p, unsigned int *w)
-{
-  int i;
-
-  for (i = 0; i < BS; i += 32, p += 32, w += 5) 
-  { 
-    p[0] = (w[0] >> 27);
-    p[1] = (w[0] >> 22) & 31;
-    p[2] = (w[0] >> 17) & 31;
-    p[3] = (w[0] >> 12) & 31;
-    p[4] = (w[0] >> 7) & 31;
-    p[5] = (w[0] >> 2) & 31;
-    p[6] = (w[0] << 3) & 31;
-    p[6] |= (w[1] >> 29);
-    p[7] = (w[1] >> 24) & 31;
-    p[8] = (w[1] >> 19) & 31;
-    p[9] = (w[1] >> 14) & 31;
-    p[10] = (w[1] >> 9) & 31;
-    p[11] = (w[1] >> 4) & 31;
-    p[12] = (w[1] << 1) & 31;
-    p[12] |= (w[2] >> 31);
-    p[13] = (w[2] >> 26) & 31;
-    p[14] = (w[2] >> 21) & 31;
-    p[15] = (w[2] >> 16) & 31;
-    p[16] = (w[2] >> 11) & 31;
-    p[17] = (w[2] >> 6) & 31;
-    p[18] = (w[2] >> 1) & 31;
-    p[19] = (w[2] << 4) & 31;
-    p[19] |= (w[3] >> 28);
-    p[20] = (w[3] >> 23) & 31;
-    p[21] = (w[3] >> 18) & 31;
-    p[22] = (w[3] >> 13) & 31;
-    p[23] = (w[3] >> 8) & 31;
-    p[24] = (w[3] >> 3) & 31;
-    p[25] = (w[3] << 2) & 31;
-    p[25] |= (w[4] >> 30);
-    p[26] = (w[4] >> 25) & 31;
-    p[27] = (w[4] >> 20) & 31;
-    p[28] = (w[4] >> 15) & 31;
-    p[29] = (w[4] >> 10) & 31;
-    p[30] = (w[4] >> 5) & 31;
-    p[31] = (w[4]) & 31;
-  }
-}
-
-
-void unpack6(unsigned int *p, unsigned int *w)
-{
-  int i;
-
-  for (i = 0; i < BS; i += 32, p += 32, w += 6) 
-  { 
-    p[0] = (w[0] >> 26);
-    p[1] = (w[0] >> 20) & 63;
-    p[2] = (w[0] >> 14) & 63;
-    p[3] = (w[0] >> 8) & 63;
-    p[4] = (w[0] >> 2) & 63;
-    p[5] = (w[0] << 4) & 63;
-    p[5] |= (w[1] >> 28);
-    p[6] = (w[1] >> 22) & 63;
-    p[7] = (w[1] >> 16) & 63;
-    p[8] = (w[1] >> 10) & 63;
-    p[9] = (w[1] >> 4) & 63;
-    p[10] = (w[1] << 2) & 63;
-    p[10] |= (w[2] >> 30);
-    p[11] = (w[2] >> 24) & 63;
-    p[12] = (w[2] >> 18) & 63;
-    p[13] = (w[2] >> 12) & 63;
-    p[14] = (w[2] >> 6) & 63;
-    p[15] = (w[2]) & 63;
-    p[16] = (w[3] >> 26);
-    p[17] = (w[3] >> 20) & 63;
-    p[18] = (w[3] >> 14) & 63;
-    p[19] = (w[3] >> 8) & 63;
-    p[20] = (w[3] >> 2) & 63;
-    p[21] = (w[3] << 4) & 63;
-    p[21] |= (w[4] >> 28);
-    p[22] = (w[4] >> 22) & 63;
-    p[23] = (w[4] >> 16) & 63;
-    p[24] = (w[4] >> 10) & 63;
-    p[25] = (w[4] >> 4) & 63;
-    p[26] = (w[4] << 2) & 63;
-    p[26] |= (w[5] >> 30);
-    p[27] = (w[5] >> 24) & 63;
-    p[28] = (w[5] >> 18) & 63;
-    p[29] = (w[5] >> 12) & 63;
-    p[30] = (w[5] >> 6) & 63;
-    p[31] = (w[5]) & 63;
-  }
-}
-
-
-void unpack7(unsigned int *p, unsigned int *w)
-{
-  int i;
-
-  for (i = 0; i < BS; i += 32, p += 32, w += 7) 
-  { 
-    p[0] = (w[0] >> 25);
-    p[1] = (w[0] >> 18) & 127;
-    p[2] = (w[0] >> 11) & 127;
-    p[3] = (w[0] >> 4) & 127;
-    p[4] = (w[0] << 3) & 127;
-    p[4] |= (w[1] >> 29);
-    p[5] = (w[1] >> 22) & 127;
-    p[6] = (w[1] >> 15) & 127;
-    p[7] = (w[1] >> 8) & 127;
-    p[8] = (w[1] >> 1) & 127;
-    p[9] = (w[1] << 6) & 127;
-    p[9] |= (w[2] >> 26);
-    p[10] = (w[2] >> 19) & 127;
-    p[11] = (w[2] >> 12) & 127;
-    p[12] = (w[2] >> 5) & 127;
-    p[13] = (w[2] << 2) & 127;
-    p[13] |= (w[3] >> 30);
-    p[14] = (w[3] >> 23) & 127;
-    p[15] = (w[3] >> 16) & 127;
-    p[16] = (w[3] >> 9) & 127;
-    p[17] = (w[3] >> 2) & 127;
-    p[18] = (w[3] << 5) & 127;
-    p[18] |= (w[4] >> 27);
-    p[19] = (w[4] >> 20) & 127;
-    p[20] = (w[4] >> 13) & 127;
-    p[21] = (w[4] >> 6) & 127;
-    p[22] = (w[4] << 1) & 127;
-    p[22] |= (w[5] >> 31);
-    p[23] = (w[5] >> 24) & 127;
-    p[24] = (w[5] >> 17) & 127;
-    p[25] = (w[5] >> 10) & 127;
-    p[26] = (w[5] >> 3) & 127;
-    p[27] = (w[5] << 4) & 127;
-    p[27] |= (w[6] >> 28);
-    p[28] = (w[6] >> 21) & 127;
-    p[29] = (w[6] >> 14) & 127;
-    p[30] = (w[6] >> 7) & 127;
-    p[31] = (w[6]) & 127;
-  }
-}
-
-
-void unpack8(unsigned int *p, unsigned int *w)
-{
-  int i;
-
-  for (i = 0; i < BS; i += 32, p += 32, w += 8) 
-  { 
-    p[0] = (w[0] >> 24);
-    p[1] = (w[0] >> 16) & 255;
-    p[2] = (w[0] >> 8) & 255;
-    p[3] = (w[0]) & 255;
-    p[4] = (w[1] >> 24);
-    p[5] = (w[1] >> 16) & 255;
-    p[6] = (w[1] >> 8) & 255;
-    p[7] = (w[1]) & 255;
-    p[8] = (w[2] >> 24);
-    p[9] = (w[2] >> 16) & 255;
-    p[10] = (w[2] >> 8) & 255;
-    p[11] = (w[2]) & 255;
-    p[12] = (w[3] >> 24);
-    p[13] = (w[3] >> 16) & 255;
-    p[14] = (w[3] >> 8) & 255;
-    p[15] = (w[3]) & 255;
-    p[16] = (w[4] >> 24);
-    p[17] = (w[4] >> 16) & 255;
-    p[18] = (w[4] >> 8) & 255;
-    p[19] = (w[4]) & 255;
-    p[20] = (w[5] >> 24);
-    p[21] = (w[5] >> 16) & 255;
-    p[22] = (w[5] >> 8) & 255;
-    p[23] = (w[5]) & 255;
-    p[24] = (w[6] >> 24);
-    p[25] = (w[6] >> 16) & 255;
-    p[26] = (w[6] >> 8) & 255;
-    p[27] = (w[6]) & 255;
-    p[28] = (w[7] >> 24);
-    p[29] = (w[7] >> 16) & 255;
-    p[30] = (w[7] >> 8) & 255;
-    p[31] = (w[7]) & 255;
-  }
-}
-
-
-void unpack9(unsigned int *p, unsigned int *w)
-{
-  int i;
-
-  for (i = 0; i < BS; i += 32, p += 32, w += 9)
-  {
-    p[0] = (w[0] >> 23);
-    p[1] = (w[0] >> 14) & 511;
-    p[2] = (w[0] >> 5) & 511;
-    p[3] = (w[0] << 4) & 511;
-    p[3] |= (w[1] >> 28);
-    p[4] = (w[1] >> 19) & 511;
-    p[5] = (w[1] >> 10) & 511;
-    p[6] = (w[1] >> 1) & 511;
-    p[7] = (w[1] << 8) & 511;
-    p[7] |= (w[2] >> 24);
-    p[8] = (w[2] >> 15) & 511;
-    p[9] = (w[2] >> 6) & 511;
-    p[10] = (w[2] << 3) & 511;
-    p[10] |= (w[3] >> 29);
-    p[11] = (w[3] >> 20) & 511;
-    p[12] = (w[3] >> 11) & 511;
-    p[13] = (w[3] >> 2) & 511;
-    p[14] = (w[3] << 7) & 511;
-    p[14] |= (w[4] >> 25);
-    p[15] = (w[4] >> 16) & 511;
-    p[16] = (w[4] >> 7) & 511;
-    p[17] = (w[4] << 2) & 511;
-    p[17] |= (w[5] >> 30);
-    p[18] = (w[5] >> 21) & 511;
-    p[19] = (w[5] >> 12) & 511;
-    p[20] = (w[5] >> 3) & 511;
-    p[21] = (w[5] << 6) & 511;
-    p[21] |= (w[6] >> 26);
-    p[22] = (w[6] >> 17) & 511;
-    p[23] = (w[6] >> 8) & 511;
-    p[24] = (w[6] << 1) & 511;
-    p[24] |= (w[7] >> 31);
-    p[25] = (w[7] >> 22) & 511;
-    p[26] = (w[7] >> 13) & 511;
-    p[27] = (w[7] >> 4) & 511;
-    p[28] = (w[7] << 5) & 511;
-    p[28] |= (w[8] >> 27);
-    p[29] = (w[8] >> 18) & 511;
-    p[30] = (w[8] >> 9) & 511;
-    p[31] = (w[8]) & 511;
-  }
-}
-
-
-void unpack10(unsigned int *p, unsigned int *w)
-{
-  int i;
-
-  for (i = 0; i < BS; i += 32, p += 32, w += 10) 
-  { 
-    p[0] = (w[0] >> 22);
-    p[1] = (w[0] >> 12) & 1023;
-    p[2] = (w[0] >> 2) & 1023;
-    p[3] = (w[0] << 8) & 1023;
-    p[3] |= (w[1] >> 24);
-    p[4] = (w[1] >> 14) & 1023;
-    p[5] = (w[1] >> 4) & 1023;
-    p[6] = (w[1] << 6) & 1023;
-    p[6] |= (w[2] >> 26);
-    p[7] = (w[2] >> 16) & 1023;
-    p[8] = (w[2] >> 6) & 1023;
-    p[9] = (w[2] << 4) & 1023;
-    p[9] |= (w[3] >> 28);
-    p[10] = (w[3] >> 18) & 1023;
-    p[11] = (w[3] >> 8) & 1023;
-    p[12] = (w[3] << 2) & 1023;
-    p[12] |= (w[4] >> 30);
-    p[13] = (w[4] >> 20) & 1023;
-    p[14] = (w[4] >> 10) & 1023;
-    p[15] = (w[4]) & 1023;
-    p[16] = (w[5] >> 22);
-    p[17] = (w[5] >> 12) & 1023;
-    p[18] = (w[5] >> 2) & 1023;
-    p[19] = (w[5] << 8) & 1023;
-    p[19] |= (w[6] >> 24);
-    p[20] = (w[6] >> 14) & 1023;
-    p[21] = (w[6] >> 4) & 1023;
-    p[22] = (w[6] << 6) & 1023;
-    p[22] |= (w[7] >> 26);
-    p[23] = (w[7] >> 16) & 1023;
-    p[24] = (w[7] >> 6) & 1023;
-    p[25] = (w[7] << 4) & 1023;
-    p[25] |= (w[8] >> 28);
-    p[26] = (w[8] >> 18) & 1023;
-    p[27] = (w[8] >> 8) & 1023;
-    p[28] = (w[8] << 2) & 1023;
-    p[28] |= (w[9] >> 30);
-    p[29] = (w[9] >> 20) & 1023;
-    p[30] = (w[9] >> 10) & 1023;
-    p[31] = (w[9]) & 1023;
-  }
-}
-
-
-void unpack11(unsigned int *p, unsigned int *w)
-{
-  int i;
-
-  for (i = 0; i < BS; i += 32, p += 32, w += 11) 
-  { 
-    p[0] = (w[0] >> 21);
-    p[1] = (w[0] >> 10) & 2047;
-    p[2] = (w[0] << 1) & 2047;
-    p[2] |= (w[1] >> 31);
-    p[3] = (w[1] >> 20) & 2047;
-    p[4] = (w[1] >> 9) & 2047;
-    p[5] = (w[1] << 2) & 2047;
-    p[5] |= (w[2] >> 30);
-    p[6] = (w[2] >> 19) & 2047;
-    p[7] = (w[2] >> 8) & 2047;
-    p[8] = (w[2] << 3) & 2047;
-    p[8] |= (w[3] >> 29);
-    p[9] = (w[3] >> 18) & 2047;
-    p[10] = (w[3] >> 7) & 2047;
-    p[11] = (w[3] << 4) & 2047;
-    p[11] |= (w[4] >> 28);
-    p[12] = (w[4] >> 17) & 2047;
-    p[13] = (w[4] >> 6) & 2047;
-    p[14] = (w[4] << 5) & 2047;
-    p[14] |= (w[5] >> 27);
-    p[15] = (w[5] >> 16) & 2047;
-    p[16] = (w[5] >> 5) & 2047;
-    p[17] = (w[5] << 6) & 2047;
-    p[17] |= (w[6] >> 26);
-    p[18] = (w[6] >> 15) & 2047;
-    p[19] = (w[6] >> 4) & 2047;
-    p[20] = (w[6] << 7) & 2047;
-    p[20] |= (w[7] >> 25);
-    p[21] = (w[7] >> 14) & 2047;
-    p[22] = (w[7] >> 3) & 2047;
-    p[23] = (w[7] << 8) & 2047;
-    p[23] |= (w[8] >> 24);
-    p[24] = (w[8] >> 13) & 2047;
-    p[25] = (w[8] >> 2) & 2047;
-    p[26] = (w[8] << 9) & 2047;
-    p[26] |= (w[9] >> 23);
-    p[27] = (w[9] >> 12) & 2047;
-    p[28] = (w[9] >> 1) & 2047;
-    p[29] = (w[9] << 10) & 2047;
-    p[29] |= (w[10] >> 22);
-    p[30] = (w[10] >> 11) & 2047;
-    p[31] = (w[10]) & 2047;
-  }
-}
-
-
-void unpack12(unsigned int *p, unsigned int *w)
-{
-  int i;
-
-  for (i = 0; i < BS; i += 32, p += 32, w += 12) 
-  { 
-    p[0] = (w[0] >> 20);
-    p[1] = (w[0] >> 8) & 4095;
-    p[2] = (w[0] << 4) & 4095;
-    p[2] |= (w[1] >> 28);
-    p[3] = (w[1] >> 16) & 4095;
-    p[4] = (w[1] >> 4) & 4095;
-    p[5] = (w[1] << 8) & 4095;
-    p[5] |= (w[2] >> 24);
-    p[6] = (w[2] >> 12) & 4095;
-    p[7] = (w[2]) & 4095;
-    p[8] = (w[3] >> 20);
-    p[9] = (w[3] >> 8) & 4095;
-    p[10] = (w[3] << 4) & 4095;
-    p[10] |= (w[4] >> 28);
-    p[11] = (w[4] >> 16) & 4095;
-    p[12] = (w[4] >> 4) & 4095;
-    p[13] = (w[4] << 8) & 4095;
-    p[13] |= (w[5] >> 24);
-    p[14] = (w[5] >> 12) & 4095;
-    p[15] = (w[5]) & 4095;
-    p[16] = (w[6] >> 20);
-    p[17] = (w[6] >> 8) & 4095;
-    p[18] = (w[6] << 4) & 4095;
-    p[18] |= (w[7] >> 28);
-    p[19] = (w[7] >> 16) & 4095;
-    p[20] = (w[7] >> 4) & 4095;
-    p[21] = (w[7] << 8) & 4095;
-    p[21] |= (w[8] >> 24);
-    p[22] = (w[8] >> 12) & 4095;
-    p[23] = (w[8]) & 4095;
-    p[24] = (w[9] >> 20);
-    p[25] = (w[9] >> 8) & 4095;
-    p[26] = (w[9] << 4) & 4095;
-    p[26] |= (w[10] >> 28);
-    p[27] = (w[10] >> 16) & 4095;
-    p[28] = (w[10] >> 4) & 4095;
-    p[29] = (w[10] << 8) & 4095;
-    p[29] |= (w[11] >> 24);
-    p[30] = (w[11] >> 12) & 4095;
-    p[31] = (w[11]) & 4095;
-  }
-}
-
-
-void unpack13(unsigned int *p, unsigned int *w)
-{
-  int i;
-
-  for (i = 0; i < BS; i += 32, p += 32, w += 13) 
-  { 
-    p[0] = (w[0] >> 19);
-    p[1] = (w[0] >> 6) & 8191;
-    p[2] = (w[0] << 7) & 8191;
-    p[2] |= (w[1] >> 25);
-    p[3] = (w[1] >> 12) & 8191;
-    p[4] = (w[1] << 1) & 8191;
-    p[4] |= (w[2] >> 31);
-    p[5] = (w[2] >> 18) & 8191;
-    p[6] = (w[2] >> 5) & 8191;
-    p[7] = (w[2] << 8) & 8191;
-    p[7] |= (w[3] >> 24);
-    p[8] = (w[3] >> 11) & 8191;
-    p[9] = (w[3] << 2) & 8191;
-    p[9] |= (w[4] >> 30);
-    p[10] = (w[4] >> 17) & 8191;
-    p[11] = (w[4] >> 4) & 8191;
-    p[12] = (w[4] << 9) & 8191;
-    p[12] |= (w[5] >> 23);
-    p[13] = (w[5] >> 10) & 8191;
-    p[14] = (w[5] << 3) & 8191;
-    p[14] |= (w[6] >> 29);
-    p[15] = (w[6] >> 16) & 8191;
-    p[16] = (w[6] >> 3) & 8191;
-    p[17] = (w[6] << 10) & 8191;
-    p[17] |= (w[7] >> 22);
-    p[18] = (w[7] >> 9) & 8191;
-    p[19] = (w[7] << 4) & 8191;
-    p[19] |= (w[8] >> 28);
-    p[20] = (w[8] >> 15) & 8191;
-    p[21] = (w[8] >> 2) & 8191;
-    p[22] = (w[8] << 11) & 8191;
-    p[22] |= (w[9] >> 21);
-    p[23] = (w[9] >> 8) & 8191;
-    p[24] = (w[9] << 5) & 8191;
-    p[24] |= (w[10] >> 27);
-    p[25] = (w[10] >> 14) & 8191;
-    p[26] = (w[10] >> 1) & 8191;
-    p[27] = (w[10] << 12) & 8191;
-    p[27] |= (w[11] >> 20);
-    p[28] = (w[11] >> 7) & 8191;
-    p[29] = (w[11] << 6) & 8191;
-    p[29] |= (w[12] >> 26);
-    p[30] = (w[12] >> 13) & 8191;
-    p[31] = (w[12]) & 8191;
-  }
-}
-
-
-void unpack16(unsigned int *p, unsigned int *w)
-{
-  int i;
-
-  for (i = 0; i < BS; i += 32, p += 32, w += 16) 
-  { 
-    p[0] = (w[0] >> 16);
-    p[1] = (w[0]) & 65535;
-    p[2] = (w[1] >> 16);
-    p[3] = (w[1]) & 65535;
-    p[4] = (w[2] >> 16);
-    p[5] = (w[2]) & 65535;
-    p[6] = (w[3] >> 16);
-    p[7] = (w[3]) & 65535;
-    p[8] = (w[4] >> 16);
-    p[9] = (w[4]) & 65535;
-    p[10] = (w[5] >> 16);
-    p[11] = (w[5]) & 65535;
-    p[12] = (w[6] >> 16);
-    p[13] = (w[6]) & 65535;
-    p[14] = (w[7] >> 16);
-    p[15] = (w[7]) & 65535;
-    p[16] = (w[8] >> 16);
-    p[17] = (w[8]) & 65535;
-    p[18] = (w[9] >> 16);
-    p[19] = (w[9]) & 65535;
-    p[20] = (w[10] >> 16);
-    p[21] = (w[10]) & 65535;
-    p[22] = (w[11] >> 16);
-    p[23] = (w[11]) & 65535;
-    p[24] = (w[12] >> 16);
-    p[25] = (w[12]) & 65535;
-    p[26] = (w[13] >> 16);
-    p[27] = (w[13]) & 65535;
-    p[28] = (w[14] >> 16);
-    p[29] = (w[14]) & 65535;
-    p[30] = (w[15] >> 16);
-    p[31] = (w[15]) & 65535;
-  }
-}
-
-
-void unpack20(unsigned int *p, unsigned int *w)
-{
-  int i;
-
-  for (i = 0; i < BS; i += 32, p += 32, w += 20) 
-  { 
-    p[0] = (w[0] >> 12);
-    p[1] = (w[0] << 8) & ((1<<20)-1);
-    p[1] |= (w[1] >> 24);
-    p[2] = (w[1] >> 4) & ((1<<20)-1);
-    p[3] = (w[1] << 16) & ((1<<20)-1);
-    p[3] |= (w[2] >> 16);
-    p[4] = (w[2] << 4) & ((1<<20)-1);
-    p[4] |= (w[3] >> 28);
-    p[5] = (w[3] >> 8) & ((1<<20)-1);
-    p[6] = (w[3] << 12) & ((1<<20)-1);
-    p[6] |= (w[4] >> 20);
-    p[7] = (w[4]) & ((1<<20)-1);
-    p[8] = (w[5] >> 12);
-    p[9] = (w[5] << 8) & ((1<<20)-1);
-    p[9] |= (w[6] >> 24);
-    p[10] = (w[6] >> 4) & ((1<<20)-1);
-    p[11] = (w[6] << 16) & ((1<<20)-1);
-    p[11] |= (w[7] >> 16);
-    p[12] = (w[7] << 4) & ((1<<20)-1);
-    p[12] |= (w[8] >> 28);
-    p[13] = (w[8] >> 8) & ((1<<20)-1);
-    p[14] = (w[8] << 12) & ((1<<20)-1);
-    p[14] |= (w[9] >> 20);
-    p[15] = (w[9]) & ((1<<20)-1);
-    p[16] = (w[10] >> 12);
-    p[17] = (w[10] << 8) & ((1<<20)-1);
-    p[17] |= (w[11] >> 24);
-    p[18] = (w[11] >> 4) & ((1<<20)-1);
-    p[19] = (w[11] << 16) & ((1<<20)-1);
-    p[19] |= (w[12] >> 16);
-    p[20] = (w[12] << 4) & ((1<<20)-1);
-    p[20] |= (w[13] >> 28);
-    p[21] = (w[13] >> 8) & ((1<<20)-1);
-    p[22] = (w[13] << 12) & ((1<<20)-1);
-    p[22] |= (w[14] >> 20);
-    p[23] = (w[14]) & ((1<<20)-1);
-    p[24] = (w[15] >> 12);
-    p[25] = (w[15] << 8) & ((1<<20)-1);
-    p[25] |= (w[16] >> 24);
-    p[26] = (w[16] >> 4) & ((1<<20)-1);
-    p[27] = (w[16] << 16) & ((1<<20)-1);
-    p[27] |= (w[17] >> 16);
-    p[28] = (w[17] << 4) & ((1<<20)-1);
-    p[28] |= (w[18] >> 28);
-    p[29] = (w[18] >> 8) & ((1<<20)-1);
-    p[30] = (w[18] << 12) & ((1<<20)-1);
-    p[30] |= (w[19] >> 20);
-    p[31] = (w[19]) & ((1<<20)-1);
-  }
-}
-
-
-static void unpack32(unsigned int *p, unsigned int *w)
-{
-  int i;
-
-  for (i = 0; i < BS; i += 32, p += 32, w += 32) 
-  { 
-    p[0] = w[0];
-    p[1] = w[1];
-    p[2] = w[2];
-    p[3] = w[3];
-    p[4] = w[4];
-    p[5] = w[5];
-    p[6] = w[6];
-    p[7] = w[7];
-    p[8] = w[8];
-    p[9] = w[9];
-    p[10] = w[10];
-    p[11] = w[11];
-    p[12] = w[12];
-    p[13] = w[13];
-    p[14] = w[14];
-    p[15] = w[15];
-    p[16] = w[16];
-    p[17] = w[17];
-    p[18] = w[18];
-    p[19] = w[19];
-    p[20] = w[20];
-    p[21] = w[21];
-    p[22] = w[22];
-    p[23] = w[23];
-    p[24] = w[24];
-    p[25] = w[25];
-    p[26] = w[26];
-    p[27] = w[27];
-    p[28] = w[28];
-    p[29] = w[29];
-    p[30] = w[30];
-    p[31] = w[31];
-  }
-}
-
-
-typedef void (*pf)(unsigned int *p, unsigned int *w);
-pf unpack[17] = {unpack0, unpack1, unpack2, unpack3, unpack4, unpack5, 
-                 unpack6, unpack7, unpack8, unpack9, unpack10, unpack11, 
-                 unpack12, unpack13, unpack16, unpack20, unpack32};
-
diff --git a/src/ext/for/ext/SPDP_10.c b/src/ext/for/ext/SPDP_10.c
deleted file mode 100644
index e8256954..00000000
--- a/src/ext/for/ext/SPDP_10.c
+++ /dev/null
@@ -1,238 +0,0 @@
-/*
-SPDP code: SPDP is a unified compression/decompression algorithm that works
-well on both binary 32-bit single-precision (float) and binary 64-bit double-
-precision (double) floating-point data.
-
-Copyright (c) 2016, Texas State University. All rights reserved.
-
-Redistribution and use in source and binary forms, with or without modification,
-are permitted for academic, research, experimental, or personal use provided
-that the following conditions are met:
-
-   * Redistributions of source code must retain the above copyright notice,
-     this list of conditions, and the following disclaimer.
-   * Redistributions in binary form must reproduce the above copyright notice,
-     this list of conditions, and the following disclaimer in the documentation
-     and/or other materials provided with the distribution.
-   * Neither the name of Texas State University nor the names of its
-     contributors may be used to endorse or promote products derived from this
-     software without specific prior written permission.
-
-For all other uses, please contact the Office for Commercialization and Industry
-Relations at Texas State University <http://www.txstate.edu/ocir/>.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
-ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
-ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-Authors: Martin Burtscher and Steven Claggett
-*/
-
-
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-
-#define MAX_TABLE_SIZE (1 << 18)
-
-typedef unsigned char byte_t;
-typedef unsigned int word_t;
-
-
-static size_t spdp_compress(const byte_t level, const size_t length, byte_t* const buf1, byte_t* const buf2)
-{
-  word_t* in = (word_t*)buf1;
-  word_t* out = (word_t*)buf2;
-  size_t len = length / sizeof(word_t);
-
-  word_t prev2 = 0;
-  word_t prev1 = 0;
-  size_t pos;
-  for (pos = 0; pos < len; pos++) {
-    word_t curr = in[pos];
-    out[pos] = curr - prev2;
-    prev2 = prev1;
-    prev1 = curr;
-  }
-
-  for (pos = len * sizeof(word_t); pos < length; pos++) {
-    buf2[pos] = buf1[pos];
-  }
-
-  byte_t prev = 0;
-  size_t wpos = 0;
-  size_t d;
-  for (d = 0; d < 8; d++) {
-    size_t rpos;
-    for (rpos = d; rpos < length; rpos += 8) {
-      byte_t curr = buf2[rpos];
-      buf1[wpos] = curr - prev;
-      prev = curr;
-      wpos++;
-    }
-  }
-
-  size_t predtabsize = 1 << (level + 9);
-  if (predtabsize > MAX_TABLE_SIZE) predtabsize = MAX_TABLE_SIZE;
-  const size_t predtabsizem1 = predtabsize - 1;
-
-  unsigned int lastpos[MAX_TABLE_SIZE];
-  memset(lastpos, 0, predtabsize * sizeof(unsigned int));
-
-  size_t rpos = 0;
-  wpos = 0;
-  unsigned int hist = 0;
-  while (rpos < length) {
-    byte_t val = buf1[rpos];
-    unsigned int lpos = lastpos[hist];
-    if (lpos >= 6) {
-      if ((buf1[lpos - 6] == buf1[rpos - 6]) && (buf1[lpos - 5] == buf1[rpos - 5]) &&
-          (buf1[lpos - 4] == buf1[rpos - 4]) && (buf1[lpos - 3] == buf1[rpos - 3]) &&
-          (buf1[lpos - 2] == buf1[rpos - 2]) && (buf1[lpos - 1] == buf1[rpos - 1])) {
-        byte_t cnt = 0;
-        while ((val == buf1[lpos]) && (cnt < 255) && (rpos < (length - 1))) {
-          lastpos[hist] = rpos;
-          hist = ((hist << 2) ^ val) & predtabsizem1;
-          rpos++;
-          lpos++;
-          cnt++;
-          val = buf1[rpos];
-        }
-        buf2[wpos] = cnt;
-        wpos++;
-      }
-    }
-    buf2[wpos] = val;
-    wpos++;
-    lastpos[hist] = rpos;
-    hist = ((hist << 2) ^ val) & predtabsizem1;
-    rpos++;
-  }
-
-  return wpos;
-}
-
-static void spdp_decompress(const byte_t level, const size_t length, byte_t* const buf2, byte_t* const buf1)
-{
-  unsigned int predtabsize = 1 << (level + 9);
-  if (predtabsize > MAX_TABLE_SIZE) predtabsize = MAX_TABLE_SIZE;
-  const unsigned int predtabsizem1 = predtabsize - 1;
-
-  unsigned int lastpos[MAX_TABLE_SIZE];
-  memset(lastpos, 0, predtabsize * sizeof(unsigned int));
-
-  size_t rpos = 0;
-  size_t wpos = 0;
-  unsigned int hist = 0;
-  while (rpos < length) {
-    unsigned int lpos = lastpos[hist];
-    if (lpos >= 6) {
-      if ((buf1[lpos - 6] == buf1[wpos - 6]) && (buf1[lpos - 5] == buf1[wpos - 5]) &&
-          (buf1[lpos - 4] == buf1[wpos - 4]) && (buf1[lpos - 3] == buf1[wpos - 3]) &&
-          (buf1[lpos - 2] == buf1[wpos - 2]) && (buf1[lpos - 1] == buf1[wpos - 1])) {
-        byte_t cnt = buf2[rpos];
-        rpos++;
-        byte_t j;
-        for (j = 0; j < cnt; j++) {
-          byte_t val = buf1[wpos] = buf1[lpos];
-          lastpos[hist] = wpos;
-          hist = ((hist << 2) ^ val) & predtabsizem1;
-          wpos++;
-          lpos++;
-        }
-      }
-    }
-    byte_t val = buf1[wpos] = buf2[rpos];
-    lastpos[hist] = wpos;
-    hist = ((hist << 2) ^ val) & predtabsizem1;
-    wpos++;
-    rpos++;
-  }
-  const size_t usize = wpos;
-
-  byte_t val = 0;
-  rpos = 0;
-  size_t d;
-  for (d = 0; d < 8; d++) {
-    size_t wpos;
-    for (wpos = d; wpos < usize; wpos += 8) {
-      val += buf1[rpos];
-      buf2[wpos] = val;
-      rpos++;
-    }
-  }
-
-  word_t* in = (word_t*)buf2;
-  word_t* out = (word_t*)buf1;
-  const size_t len = usize / sizeof(word_t);
-
-  word_t prev2 = 0;
-  word_t prev1 = 0;
-  size_t pos;
-  for (pos = 0; pos < len; pos++) {
-    word_t curr = in[pos] + prev2;
-    out[pos] = curr;
-    prev2 = prev1;
-    prev1 = curr;
-  }
-  for (pos = len * sizeof(word_t); pos < usize; pos++) {
-    buf1[pos] = buf2[pos];
-  }
-}
-#ifndef NMAIN
-#define BUFFER_SIZE (1 << 23)
-static byte_t buffer1[BUFFER_SIZE];
-static byte_t buffer2[BUFFER_SIZE * 2 + 9];
-int main(int argc, char *argv[])
-{
-  fprintf(stderr, "SPDP Floating-Point Compressor v1.0\n");
-  fprintf(stderr, "Copyright (c) 2016 Texas State University\n\n");
-
-  if ((argc != 1) && (argc != 2)) {
-    fprintf(stderr, "compression usage: %s level < uncompressed_file > compressed_file\n", argv[0]);
-    fprintf(stderr, "decompression usage: %s < compressed_file > decompressed_file\n", argv[0]);
-    return -1;
-  }
-
-  if (argc == 2) {  // compression
-    byte_t level = atoi(argv[1]);
-    if (level < 0) level = 0;
-    if (level > 9) level = 9;
-    fwrite(&level, sizeof(byte_t), 1, stdout);
-
-    int length = fread(buffer1, sizeof(byte_t), BUFFER_SIZE, stdin);
-    while (length > 0) {
-      fwrite(&length, sizeof(int), 1, stdout);
-      int csize = compress(level, length, buffer1, buffer2);
-      fwrite(&csize, sizeof(int), 1, stdout);
-      fwrite(buffer2, sizeof(byte_t), csize, stdout);
-      length = fread(buffer1, sizeof(byte_t), BUFFER_SIZE, stdin);
-    }
-  } else {  // decompression
-    byte_t level = 10;
-    fread(&level, sizeof(byte_t), 1, stdin);
-    if ((level < 0) || (level > 9)) {
-      fprintf(stderr, "incorrect input file type\n");
-      return -2;
-    }
-
-    int length;
-    while (fread(&length, sizeof(int), 1, stdin) > 0) {
-      int csize;
-      fread(&csize, sizeof(int), 1, stdin);
-      fread(buffer2, sizeof(byte_t), csize, stdin);
-      decompress(level, csize, buffer2, buffer1);
-      fwrite(buffer1, sizeof(byte_t), length, stdout);
-    }
-  }
-
-  return 0;
-}
-#endif
diff --git a/src/ext/for/ext/bg/bg.c b/src/ext/for/ext/bg/bg.c
deleted file mode 100644
index dc5a714b..00000000
--- a/src/ext/for/ext/bg/bg.c
+++ /dev/null
@@ -1,185 +0,0 @@
-#include "bg.h"
-#include <math.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-int versionNumber[4] = {BG_VER_MAJOR,BG_VER_MINOR,BG_VER_BUILD,BG_VER_REVISION};
-//int BG_SIZE_TYPE = 8;
-
-int dataEndianType = LITTLE_ENDIAN_DATA; //*endian type of the data read from disk
-int sysEndianType; //*sysEndianType is actually set automatically.
-
-//the confparams should be separate between compression and decopmression, in case of mutual-affection when calling compression/decompression alternatively
-bg_params *confparams_cpr = NULL; //used for compression
-bg_params *confparams_dec = NULL; //used for decompression 
-
-bg_exedata *exe_params = NULL;
-
-int bgMode_libpressio = BITGROOM;
-int errorControlMode_libpressio = BG_NSD;
-int nsd_libpressio = 5;
-int dsd_libpressio = 5;
-
-#if 0
-unsigned char *BG_compress(int dataType, void *data, size_t *outSize, size_t nbEle)
-{
-	return BG_compress_args(dataType, data, outSize, confparams_cpr->bgMode, confparams_cpr->errorControlMode, confparams_cpr->NSD, confparams_cpr->DSD, nbEle);
-}
-#endif
-
-unsigned char* BG_compress_args(int dataType, void *data, size_t *outSize, int bgMode, int errorControlMode, int nsd, int dsd, size_t nbEle, unsigned char *data_)
-{
-
-	int dataTypeLen = dataType==BG_FLOAT?sizeof(float):sizeof(double);	
-
-	size_t bufferSize = dataTypeLen*nbEle;
-
-	const double bit_per_dcm_dgt_prc=M_LN10/M_LN2; /* 3.32 [frc] Bits per decimal digit of precision */
-	//const double dcm_per_bit_dgt_prc=M_LN2/M_LN10; /* 0.301 [frc] Bits per decimal digit of precision */
-	const int bit_xpl_nbr_sgn_flt=23; /* [nbr] Bits 0-22 of SP significands are explicit. Bit 23 is implicitly 1. */
-	const int bit_xpl_nbr_sgn_dbl=53; /* [nbr] Bits 0-52 of DP significands are explicit. Bit 53 is implicitly 1. */
-	//const int ieee_xpn_fst_flt=127; /* [nbr] IEEE "exponent bias" = actual exponent minus stored exponent */  	
-
-	double prc_bnr_xct; /* [nbr] Binary digits of precision, exact */
-
-	int bit_xpl_nbr_sgn=int_CEWI; /* [nbr] Number of explicit bits in significand */
-	int bit_xpl_nbr_zro; /* [nbr] Number of explicit bits to zero */
-
-	long idx;
-
-	unsigned int *u32_ptr;
-	unsigned int msk_f32_u32_zro;
-	unsigned int msk_f32_u32_one;
-	//unsigned int msk_f32_u32_hshv;
-	unsigned long long *u64_ptr;
-	unsigned long long msk_f64_u64_zro;
-	unsigned long long msk_f64_u64_one;
-	//unsigned long int msk_f64_u64_hshv;
-	unsigned short prc_bnr_ceil; /* [nbr] Exact binary digits of precision rounded-up */
-	unsigned short prc_bnr_xpl_rqr; /* [nbr] Explicitly represented binary digits required to retain */
-
-	if(errorControlMode == BG_NSD && (nsd < 0 || nsd >16))
-	{
-		printf("Error: wrong nsd input\n");
-		return NULL;
-	}
-
-	/* How many bits to preserve? */
-	prc_bnr_xct=nsd*bit_per_dcm_dgt_prc;
-	/* Be conservative, round upwards */
-	prc_bnr_ceil=(unsigned short)ceil(prc_bnr_xct);
-	/* First bit is implicit not explicit but corner cases prevent our taking advantage of this */
-	//prc_bnr_xpl_rqr=prc_bnr_ceil-1;
-	//prc_bnr_xpl_rqr=prc_bnr_ceil;
-	prc_bnr_xpl_rqr=prc_bnr_ceil+1;
-
-	//unsigned char* data_ = (unsigned char*)malloc(bufferSize);		
-	memcpy(data_, data, bufferSize);
-
-	if(dataType == BG_DOUBLE) prc_bnr_xpl_rqr++; /* Seems necessary for double-precision ppc=array(1.234567,1.0e-6,$dmn) */  
-	
-	if(!(dataType == BG_FLOAT  && prc_bnr_xpl_rqr >= bit_xpl_nbr_sgn_flt) || (dataType == BG_DOUBLE && prc_bnr_xpl_rqr >= bit_xpl_nbr_sgn_dbl)) //required # bits is greater than the full length of bits	
-	{	
-		if(dataType==BG_FLOAT)
-		{
-			bit_xpl_nbr_sgn=bit_xpl_nbr_sgn_flt;
-			bit_xpl_nbr_zro=bit_xpl_nbr_sgn-prc_bnr_xpl_rqr;		
-			if(bit_xpl_nbr_zro > bit_xpl_nbr_sgn-NCO_PPC_BIT_XPL_NBR_MIN)
-			{
-				printf("Error: bit_xpl_nbr_zro > bit_xpl_nbr_sgn-NCO_PPC_BIT_XPL_NBR_MIN\n");
-				return NULL;
-			}	
-			
-			u32_ptr = (unsigned int*)data_;
-			/* Create mask */
-			msk_f32_u32_zro=0u; /* Zero all bits */
-			msk_f32_u32_zro=~msk_f32_u32_zro; /* Turn all bits to ones */
-			/* Bit Shave mask for AND: Left shift zeros into bits to be rounded, leave ones in untouched bits */
-			msk_f32_u32_zro <<= bit_xpl_nbr_zro;
-			/* Bit Set   mask for OR:  Put ones into bits to be set, zeros in untouched bits */
-			msk_f32_u32_one=~msk_f32_u32_zro;
-			//msk_f32_u32_hshv=msk_f32_u32_one & (msk_f32_u32_zro >> 1); /* Set one bit: the MSB of LSBs */	
-			switch(bgMode)
-			{
-			case BITGROOM:
-				for(idx=0L;idx<nbEle;idx+=2L) u32_ptr[idx]&=msk_f32_u32_zro;
-				
-				for(idx=1L;idx<nbEle;idx+=2L)
-					if(u32_ptr[idx] != 0U) /* Never quantize upwards floating point values of zero */
-						u32_ptr[idx]|=msk_f32_u32_one;		
-				break;
-			case BITSHAVE:
-				for(idx=0L;idx<nbEle;idx++) u32_ptr[idx]&=msk_f32_u32_zro;
-				break;
-			case BITSET:
-				for(idx=0L;idx<nbEle;idx++)
-					if(u32_ptr[idx] != 0U) /* Never quantize upwards floating point values of zero */
-						u32_ptr[idx]|=msk_f32_u32_one;		
-				break;
-			}	
-		}
-		else //BG_DOUBLE
-		{
-			bit_xpl_nbr_sgn=bit_xpl_nbr_sgn_dbl;
-			bit_xpl_nbr_zro=bit_xpl_nbr_sgn-prc_bnr_xpl_rqr;
-			if(bit_xpl_nbr_zro > bit_xpl_nbr_sgn-NCO_PPC_BIT_XPL_NBR_MIN)
-			{
-				printf("Error: bit_xpl_nbr_zro > bit_xpl_nbr_sgn-NCO_PPC_BIT_XPL_NBR_MIN\n");
-				return NULL;
-			}	
-			
-			u64_ptr=(unsigned long int*)data_;
-			/* Create mask */
-			msk_f64_u64_zro=0ul; /* Zero all bits */
-			msk_f64_u64_zro=~msk_f64_u64_zro; /* Turn all bits to ones */
-			/* Bit Shave mask for AND: Left shift zeros into bits to be rounded, leave ones in untouched bits */
-			msk_f64_u64_zro <<= bit_xpl_nbr_zro;
-			/* Bit Set   mask for OR:  Put ones into bits to be set, zeros in untouched bits */
-			msk_f64_u64_one=~msk_f64_u64_zro;
-			//msk_f64_u64_hshv=msk_f64_u64_one & (msk_f64_u64_zro >> 1); /* Set one bit: the MSB of LSBs */		
-			switch(bgMode)
-			{
-			case BITGROOM:
-			for(idx=0L;idx<nbEle;idx+=2L) u64_ptr[idx]&=msk_f64_u64_zro;
-				for(idx=1L;idx<nbEle;idx+=2L)
-					if(u64_ptr[idx] != 0UL) /* Never quantize upwards floating point values of zero */
-						u64_ptr[idx]|=msk_f64_u64_one;			
-				break;
-			case BITSHAVE:
-				for(idx=0L;idx<nbEle;idx++) u64_ptr[idx]&=msk_f64_u64_zro;
-				break;
-			case BITSET:
-				for(idx=0L;idx<nbEle;idx++)
-					if(u64_ptr[idx] != 0UL) /* Never quantize upwards floating point values of zero */
-						u64_ptr[idx]|=msk_f64_u64_one;				
-				break;
-			}	
-		}	
-	}
-	
-	//perform DEFLATE algorithm by Zlib
-	//unsigned char* outBytes = NULL;
-	//*outSize = zlib_compress5(data_, bufferSize, &outBytes, 1);
-	
-	//free(data_);
-	return data_; //outBytes;
-
-}
-#if 0
-void *BG_decompress(int dataType, unsigned char *bytes, size_t byteLength, size_t nbEle)
-{	
-	if(dataType==BG_FLOAT)
-	{
-		unsigned char* decompressedData;
-		zlib_uncompress5(bytes, byteLength, &decompressedData, nbEle*sizeof(float));
-		return decompressedData;
-	}
-	else //BG_DOUBLE
-	{
-		unsigned char* decompressedData;
-		zlib_uncompress5(bytes, byteLength, &decompressedData, nbEle*sizeof(double));
-		return decompressedData;
-	}
-}
-#endif
diff --git a/src/ext/for/ext/bg/bg.h b/src/ext/for/ext/bg/bg.h
deleted file mode 100644
index 29a64918..00000000
--- a/src/ext/for/ext/bg/bg.h
+++ /dev/null
@@ -1,109 +0,0 @@
-//#include "defines.h"
-//#include <rw.h>
-#include <stdio.h>
-#include "defines.h"
-//#include "callZlib.h"
-#include <stdint.h>
-//#include <time.h>
-//#include <sys/time.h>
-//#include "conf.h"
-
-#ifndef _BG_H
-#define _BG_H
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-# define M_LN10      2.30258509299404568401799145468436421   /* loge(10)       */
-# define M_LN2       0.693147180559945309417232121458176568  /* loge(2)        */
-#define int_CEWI 0
-#define NCO_PPC_BIT_XPL_NBR_MIN 2
-
-typedef union lint16
-{
-	unsigned short usvalue;
-	short svalue;
-	unsigned char byte[2];
-} lint16;
-
-typedef union lint32
-{
-	int ivalue;
-	unsigned int uivalue;
-	unsigned char byte[4];
-} lint32;
-
-typedef union lint64
-{
-	long lvalue;
-	unsigned long ulvalue;
-	unsigned char byte[8];
-} lint64;
-
-typedef union ldouble
-{
-    double value;
-    unsigned long lvalue;
-    unsigned char byte[8];
-} ldouble;
-
-typedef union lfloat
-{
-    float value;
-    unsigned int ivalue;
-    unsigned char byte[4];
-} lfloat;
-
-typedef struct bg_params
-{
-	int dataType;
-
-	int sol_ID;// GB
-	int zlibMode; //* four options: Z_NO_COMPRESSION, or Z_BEST_SPEED, Z_BEST_COMPRESSION, Z_DEFAULT_COMPRESSION
-	int bgMode;  //BITGROOM, BITSHAVE or BITSET
-	int errorControlMode;
-	int NSD;
-	int DSD;
-	
-	float fmin, fmax;
-	double dmin, dmax;
-
-} bg_params;
-
-
-typedef struct bg_exedata
-{
-	unsigned int BG_SIZE_TYPE; //the length (# bytes) of the size_t in the system at runtime //4 or 8: sizeof(size_t) 
-} bg_exedata;
-
-
-extern int versionNumber[4];
-
-//-------------------key global variables--------------
-extern int dataEndianType; //*endian type of the data read from disk
-extern int sysEndianType; //*sysEndianType is actually set automatically.
-
-extern bg_params *confparams_cpr;
-extern bg_params *confparams_dec;
-extern bg_exedata *exe_params;
-
-
-//for libpressio
-extern int bgMode_libpressio;
-extern int errorControlMode_libpressio;
-extern int nsd_libpressio;
-extern int dsd_libpressio;
-
-
-unsigned char *BG_compress(int dataType, void *data, size_t *outSize, size_t nbEle);
-
-unsigned char* BG_compress_args(int dataType, void *data, size_t *outSize, int bgMode, int errorControlMode, int nsd, int dsd, size_t nbEle, unsigned char *data_);
-
-//void *BG_decompress(int dataType, unsigned char *bytes, size_t byteLength, size_t nbEle);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* ----- #ifndef _BG_H  ----- */
diff --git a/src/ext/for/ext/bg/defines.h b/src/ext/for/ext/bg/defines.h
deleted file mode 100644
index 2ed6cda7..00000000
--- a/src/ext/for/ext/bg/defines.h
+++ /dev/null
@@ -1,54 +0,0 @@
-/**
- *  @file defines.h
- *  @author Sheng Di
- *  @date July, 2019
- *  @brief Header file for the dataCompression.c.
- *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
- *      See COPYRIGHT in top-level directory.
- */
-
-#ifndef _BG_DEFINES_H
-#define _BG_DEFINES_H
-
-#define BG_VER_MAJOR 2
-#define BG_VER_MINOR 1
-#define BG_VER_BUILD 9
-#define BG_VER_REVISION 0
-
-#define BG 105
-#define BITGROOM 0
-#define BITSHAVE 1
-#define BITSET 2
-
-#define BG_FLOAT 0
-#define BG_DOUBLE 1
-#define BG_UINT8 2
-#define BG_INT8 3
-#define BG_UINT16 4
-#define BG_INT16 5
-#define BG_UINT32 6
-#define BG_INT32 7
-#define BG_UINT64 8
-#define BG_INT64 9
-
-#define BG_NSD 0
-#define BG_DSD 1
-
-#define LITTLE_ENDIAN_DATA 0 //refers to the endian type of the data read from the disk
-#define BIG_ENDIAN_DATA 1 //big_endian (ppc, max, etc.) ; little_endian (x86, x64, etc.)
-
-#define LITTLE_ENDIAN_SYSTEM 0 //refers to the endian type of the system
-#define BIG_ENDIAN_SYSTEM 1
-
-#define DynArrayInitLen 1024
-
-//SUCCESS returning status
-#define BG_SCES 0  //successful
-#define BG_NSCS -1 //Not successful
-#define BG_FERR -2 //Failed to open input file
-#define BG_TERR -3 //wrong data type (should be only float or double)
-#define BG_DERR -4 //dimension error
-#define BG_MERR -5 //sz_mode error
-#define BG_BERR -6 //bound-mode error (should be only ABS, REL, ABS_AND_REL, ABS_OR_REL, or PW_REL)
-
-#endif /* _BG_DEFINES_H */
diff --git a/src/ext/for/ext/fastpfor.cc b/src/ext/for/ext/fastpfor.cc
deleted file mode 100644
index 9d7ef55e..00000000
--- a/src/ext/for/ext/fastpfor.cc
+++ /dev/null
@@ -1,121 +0,0 @@
-#if defined(_MSC_VER) && _MSC_VER < 1600
-#include "../vs/stdint.h"
-#else
-#include <stdint.h>
-#endif
-
-#include "fastpfor.h"
-#include "FastPFor/headers/variablebyte.h"
-#include "FastPFor/headers/simple16.h"
-//#include "FastPFor/headers/simple8b_rle.h"
-#include "FastPFor/headers/fastpfor.h"
-
-#include "FastPFor/headers/simdfastpfor.h"
-#include "FastPFor/headers/optpfor.h"
-#include "FastPFor/headers/simdoptpfor.h"
-#include "FastPFor/headers/simdgroupsimple.h"
-#include "FastPFor/headers/compositecodec.h"
-
-#define ctou32(_cp_) (*(unsigned *)(_cp_))
-
-unsigned FastPFore32(const uint32_t *in, unsigned n, unsigned char *out, unsigned outsize) {
-  size_t nvalue = outsize/4;
-  FastPForLib::FastPFor<4> ic; 
-  ic.encodeArray((const uint32_t *)in, n & (~127), (uint32_t *)(out+4), nvalue);
-  if(n & 127) {
-    size_t nvalue2 = outsize/4 - nvalue;
-    FastPForLib::VariableByte vc; 
-	vc.encodeArray((const uint32_t *)(in + (n & (~127))), n & 127, (uint32_t *)(out + 4 + nvalue*4), nvalue2);
-    nvalue += nvalue2;
-  }
-  ctou32(out) = nvalue;
-  return 4+nvalue*4;
-}
-
-unsigned FastPFord32(const unsigned char *in, unsigned n, uint32_t *out) {
-  size_t nvalue = n;
-  FastPForLib::FastPFor<4> ic; 
-  const uint32_t *ip = ic.decodeArray((const uint32_t *)(in+4), ctou32(in), out, nvalue);
-  if(n & 127) { 
-    nvalue = n - nvalue;
-	FastPForLib::VariableByte vc;
-	ip = vc.decodeArray(ip, (const uint32_t *)in+1+ctou32(in) - ip, out + (n&(~127)), nvalue);
-  }
-  return ctou32(ip);
-}
-
-/*unsigned FastPFore64(const uint64_t *in, unsigned n, unsigned char *out, unsigned outsize) {
-  size_t nvalue = outsize/8;
-  FastPForLib::FastPFor<4> ic; 
-  ic.encodeArray(in, (size_t)(n & (~127)), (uint32_t *)(out+4), nvalue);
-  if(n & 127) {
-    size_t nvalue2 = outsize/8 - nvalue;
-    FastPForLib::VariableByte vc; 
-	
-	vc.encodeArray((const uint64_t *)(in + (n & (~127))), n & 127, (uint32_t *)(out + 4 + nvalue*4), nvalue2);
-    nvalue += nvalue2;
-  }
-  ctou32(out) = nvalue;
-  return 4+nvalue*4;
-}
-
-unsigned FastPFord64(const unsigned char *in, unsigned n, uint64_t *out) {
-  size_t nvalue = n;
-  FastPForLib::FastPFor<4> ic; 
-  const uint32_t *ip = ic.decodeArray((const uint32_t *)(in+4), ctou32(in), (uint64_t *)out, nvalue);
-  if(n & 127) {
-    nvalue = n - nvalue;
-	FastPForLib::VariableByte vc;
-	ip = vc.decodeArray(ip, (const uint32_t *)in+1+ctou32(in) - ip, out + (n&(~127)), nvalue);	  
-  }
-  return ctou32(ip);
-}*/
-
-unsigned FastPFore128v32(const uint32_t *in, unsigned n, unsigned char *out, unsigned outsize) {
-  size_t nvalue = outsize/4;
-  FastPForLib::SIMDFastPFor<4> ic; 
-  ic.encodeArray(in, n & (~127), (uint32_t *)(out+4), nvalue);
-  if(n & 127) {
-    size_t nvalue2 = outsize/4 - nvalue;
-    FastPForLib::VariableByte vc; vc.encodeArray((const uint32_t *)(in + (n & (~127))), n & 127, (uint32_t *)(out + 4 + nvalue*4), nvalue2);
-    nvalue += nvalue2;
-  }
-  ctou32(out) = nvalue;
-  return 4+nvalue*4;
-}
-
-unsigned FastPFord128v32(const unsigned char *in, unsigned n, uint32_t *out) {
-  size_t nvalue = n;
-  FastPForLib::SIMDFastPFor<4> ic; 
-  const uint32_t *ip = ic.decodeArray((const uint32_t *)(in+4), *(uint32_t *)in, out, nvalue);
-  if(n & 127) { 
-    nvalue = n - nvalue;
-	FastPForLib::VariableByte vc;
-	ip = vc.decodeArray(ip, (const uint32_t *)in+1+ctou32(in) - ip, out + (n&(~127)), nvalue);	  //return vbdec32((unsigned char *)ip, n & 127, out + mynvalue1);
-  }
-  return (unsigned char *)ip - (unsigned char *)in; 
-}
-
-unsigned OptPFore128v32(const uint32_t *in, unsigned n, unsigned char *out, unsigned outsize) {
-  size_t nvalue = outsize/4;
-  FastPForLib::SIMDOPTPFor<4> ic; ic.encodeArray((const uint32_t *)in, n & (~127), (uint32_t *)(out+4), nvalue);
-  if(n & 127) {
-    size_t nvalue2 = outsize/4 - nvalue;
-    FastPForLib::VariableByte vc; vc.encodeArray((const uint32_t *)(in + (n & (~127))), n & 127, (uint32_t *)(out + 4 + nvalue*4), nvalue2);
-    nvalue += nvalue2;
-  }
-  ctou32(out) = nvalue;
-  return 4+nvalue*4;
-}
-
-unsigned OptPFord128v32(const unsigned char *in, unsigned n, uint32_t *out) {
-  size_t nvalue = n;
-  FastPForLib::SIMDOPTPFor<4> ic; 
-  const uint32_t *ip = ic.decodeArray((const uint32_t *)(in+4), ctou32(in), out, nvalue);
-  if(n & 127) { 
-    nvalue = n - nvalue;
-	FastPForLib::VariableByte vc;
-	ip = vc.decodeArray(ip, (const uint32_t *)in+1+ctou32(in) - ip, out + (n&(~127)), nvalue);	  //return vbdec32((unsigned char *)ip, n & 127, out + mynvalue1);
-  }
-  return (unsigned char *)ip-in; 
-}
diff --git a/src/ext/for/ext/fastpfor.h b/src/ext/for/ext/fastpfor.h
deleted file mode 100644
index 8bbfdc40..00000000
--- a/src/ext/for/ext/fastpfor.h
+++ /dev/null
@@ -1,20 +0,0 @@
-#if defined(_MSC_VER) && _MSC_VER < 1600
-#include "vs/stdint.h"
-#else 
-#include <stdint.h>
-#endif
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-unsigned FastPFore32(    const uint32_t      *in, unsigned n, unsigned char *out, unsigned outsize);
-unsigned FastPFord32(    const unsigned char *in, unsigned n, uint32_t      *out);
-
-unsigned FastPFore128v32(const uint32_t      *in, unsigned n, unsigned char *out, unsigned outsize);
-unsigned FastPFord128v32(const unsigned char *in, unsigned n, uint32_t *out);
-
-unsigned OptPFore128v32( const uint32_t      *in, unsigned n, unsigned char *out, unsigned outsize);
-unsigned OptPFord128v32( const unsigned char *in, unsigned n, uint32_t *out);
-#ifdef __cplusplus
-}
-#endif
diff --git a/src/ext/for/ext/gb.c b/src/ext/for/ext/gb.c
deleted file mode 100644
index 72480692..00000000
--- a/src/ext/for/ext/gb.c
+++ /dev/null
@@ -1,151 +0,0 @@
-// copy from https://github.com/ccr/ccr/tree/master/hdf5_plugins for benchmarking purpose 
-# define NC_FLOAT 5
-# define NC_DOUBLE 6
-# define NC_FILL_FLOAT   (9.9692099683868690e+36f) /* near 15 * 2^119 */
-# define NC_FILL_DOUBLE  (9.9692099683868690e+36)
-
-/* Minimum number of explicit significand bits to preserve when zeroing/bit-masking floating point values
-   Codes will preserve at least two explicit bits, IEEE significand representation contains one implicit bit
-   Thus preserve a least three bits which is approximately one sigificant decimal digit
-   Used in nco_ppc_bitmask() and nco_ppc_bitmask_scl() */
-#define NCO_PPC_BIT_XPL_NBR_MIN 2
-
-/* Pointer union for floating point and bitmask types */
-typedef union{ /* ptr_unn */
-  float *fp;
-  double *dp;
-  unsigned int *ui32p;
-  unsigned long long *ui64p;
-  void *vp;
-} ptr_unn;
-
-void ccr_gbr 			/* [fnc] Granular BitRound buffer of float values */
-(const int nsd, 		/* I [nbr] Number of decimal significant digits to quantize to */
- const int type, 		/* I [enm] netCDF type of operand */
- const size_t sz, 		/* I [nbr] Size (in elements) of buffer to quantize */
- const int has_mss_val, /* I [flg] Flag for missing values */
- ptr_unn mss_val, 		/* I [val] Value of missing value */
- void *op1) 			/* I/O [frc] Values to quantize */
-{
-  const char fnc_nm[] = "ccr_gbr()"; /* [sng] Function name */
-
-  /* Prefer constants defined in math.h, however, ...
-     20201002 GCC environments can have hard time defining M_LN10/M_LN2 despite finding math.h */
-#ifndef M_LN10
-# define M_LN10         2.30258509299404568402  /* log_e 10 */
-#endif /* M_LN10 */
-#ifndef M_LN2
-# define M_LN2          0.69314718055994530942  /* log_e 2 */
-#endif /* M_LN2 */
-  const double bit_per_dgt=M_LN10/M_LN2; /* 3.32 [frc] Bits per decimal digit of precision = log2(10) */
-  const double dgt_per_bit=M_LN2/M_LN10; /* 0.301 [frc] Decimal digits per bit of precision = log10(2) */
-  
-  const int bit_xpl_nbr_sgn_flt=23; /* [nbr] Bits 0-22 of SP significands are explicit. Bit 23 is implicitly 1. */
-  const int bit_xpl_nbr_sgn_dbl=52; /* [nbr] Bits 0-51 of DP significands are explicit. Bit 52 is implicitly 1. */
-  
-  double mnt; /* [frc] Mantissa, 0.5 <= mnt < 1.0 */
-  double mnt_fabs; /* [frc] fabs(mantissa) */
-  double mnt_log10_fabs; /* [frc] log10(fabs(mantissa))) */
-  double val; /* [frc] Copy of input value to avoid indirection */
-  
-  double prc_bnr_xct=0.0; /* [nbr] Binary digits of precision, exact */
-  double mss_val_cmp_dbl; /* Missing value for comparison to double precision values */
-
-  float mss_val_cmp_flt; /* Missing value for comparison to single precision values */
-  
-  int bit_xpl_nbr_sgn=-1; /* [nbr] Number of explicit bits in significand */
-  int bit_xpl_nbr_zro; /* [nbr] Number of explicit bits to zero */
-
-  int dgt_nbr; /* [nbr] Number of digits before decimal point */
-  int qnt_pwr; /* [nbr] Power of two in quantization mask: qnt_msk = 2^qnt_pwr */
-  int xpn_bs2; /* [nbr] Binary exponent xpn_bs2 in val = sign(val) * 2^xpn_bs2 * mnt, 0.5 < mnt <= 1.0 */
-
-  size_t idx;
-
-  unsigned int *u32_ptr;
-  unsigned int msk_f32_u32_zro;
-  unsigned int msk_f32_u32_one;
-  unsigned int msk_f32_u32_hshv;
-  unsigned long long int *u64_ptr;
-  unsigned long long int msk_f64_u64_zro;
-  unsigned long long int msk_f64_u64_one;
-  unsigned long long int msk_f64_u64_hshv;
-  unsigned short prc_bnr_ceil=0; /* [nbr] Exact binary digits of precision rounded-up */
-  unsigned short prc_bnr_xpl_rqr=0; /* [nbr] Explicitly represented binary digits required to retain */
-
-  /* Disallow unreasonable quantization */
-  //assert(nsd > 0);
-  //assert(nsd <= 16);
-
-  if(type == NC_FLOAT  && prc_bnr_xpl_rqr >= bit_xpl_nbr_sgn_flt) return;
-  if(type == NC_DOUBLE && prc_bnr_xpl_rqr >= bit_xpl_nbr_sgn_dbl) return;
-
-  switch(type){
-  case NC_FLOAT:
-    /* Missing value for comparison is _FillValue (if any) otherwise default NC_FILL_FLOAT/DOUBLE */
-    if(has_mss_val) mss_val_cmp_flt=*mss_val.fp; else mss_val_cmp_flt=NC_FILL_FLOAT;
-    bit_xpl_nbr_sgn=bit_xpl_nbr_sgn_flt;
-    u32_ptr=op1; //.ui32p;
-	float *fp = op1;
-
-    for(idx=0L;idx<sz;idx++){
-      if((val=fp[idx]) != mss_val_cmp_flt && u32_ptr[idx] != 0U){
-	mnt=frexp(val,&xpn_bs2); /* DGG19 p. 4102 (8) */
-	mnt_fabs=fabs(mnt);
-	mnt_log10_fabs=log10(mnt_fabs);
-	/* 20211003 Continuous determination of dgt_nbr improves CR by ~10% */
-	dgt_nbr=(int)floor(xpn_bs2*dgt_per_bit+mnt_log10_fabs)+1; /* DGG19 p. 4102 (8.67) */
-	qnt_pwr=(int)floor(bit_per_dgt*(dgt_nbr-nsd)); /* DGG19 p. 4101 (7) */
-	prc_bnr_xpl_rqr= mnt_fabs == 0.0 ? 0 : abs((int)floor(xpn_bs2-bit_per_dgt*mnt_log10_fabs)-qnt_pwr); /* Protect against mnt = -0.0 */
-	prc_bnr_xpl_rqr--; /* 20211003 Reduce formula result by 1 bit: Passes all tests, improves CR by ~10% */
-
-	bit_xpl_nbr_zro=bit_xpl_nbr_sgn-prc_bnr_xpl_rqr;
-	msk_f32_u32_zro=0u; /* Zero all bits */
-	msk_f32_u32_zro=~msk_f32_u32_zro; /* Turn all bits to ones */
-	/* Bit Shave mask for AND: Left shift zeros into bits to be rounded, leave ones in untouched bits */
-	msk_f32_u32_zro <<= bit_xpl_nbr_zro;
-	/* Bit Set   mask for OR:  Put ones into bits to be set, zeros in untouched bits */
-	msk_f32_u32_one=~msk_f32_u32_zro;
-	msk_f32_u32_hshv=msk_f32_u32_one & (msk_f32_u32_zro >> 1); /* Set one bit: the MSB of LSBs */
-	u32_ptr[idx]+=msk_f32_u32_hshv; /* Add 1 to the MSB of LSBs, carry 1 to mantissa or even exponent */
-	u32_ptr[idx]&=msk_f32_u32_zro; /* Shave it */
-      } /* !mss_val_cmp_flt */
-    } /* !idx */
-    break; /* !NC_FLOAT */
-  case NC_DOUBLE:
-    /* Missing value for comparison is _FillValue (if any) otherwise default NC_FILL_FLOAT/DOUBLE */
-    if(has_mss_val) mss_val_cmp_dbl=*mss_val.dp; else mss_val_cmp_dbl=NC_FILL_FLOAT;
-    bit_xpl_nbr_sgn=bit_xpl_nbr_sgn_dbl;
-    u64_ptr=op1; 
-	double *dp = op1;
-
-    for(idx=0L;idx<sz;idx++){
-      if((val=dp[idx]) != mss_val_cmp_dbl && u64_ptr[idx] != 0U){
-	mnt=frexp(val,&xpn_bs2); /* DGG19 p. 4102 (8) */
-	mnt_fabs=fabs(mnt);
-	mnt_log10_fabs=log10(mnt_fabs);
-	/* 20211003 Continuous determination of dgt_nbr improves CR by ~10% */
-	dgt_nbr=(int)floor(xpn_bs2*dgt_per_bit+mnt_log10_fabs)+1; /* DGG19 p. 4102 (8.67) */
-	qnt_pwr=(int)floor(bit_per_dgt*(dgt_nbr-nsd)); /* DGG19 p. 4101 (7) */
-	prc_bnr_xpl_rqr= mnt_fabs == 0.0 ? 0 : abs((int)floor(xpn_bs2-bit_per_dgt*mnt_log10_fabs)-qnt_pwr); /* Protect against mnt = -0.0 */
-	prc_bnr_xpl_rqr--; /* 20211003 Reduce formula result by 1 bit: Passes all tests, improves CR by ~10% */
-
-	bit_xpl_nbr_zro=bit_xpl_nbr_sgn-prc_bnr_xpl_rqr;
-	msk_f64_u64_zro=0u; /* Zero all bits */
-	msk_f64_u64_zro=~msk_f64_u64_zro; /* Turn all bits to ones */
-	/* Bit Shave mask for AND: Left shift zeros into bits to be rounded, leave ones in untouched bits */
-	msk_f64_u64_zro <<= bit_xpl_nbr_zro;
-	/* Bit Set   mask for OR:  Put ones into bits to be set, zeros in untouched bits */
-	msk_f64_u64_one=~msk_f64_u64_zro;
-	msk_f64_u64_hshv=msk_f64_u64_one & (msk_f64_u64_zro >> 1); /* Set one bit: the MSB of LSBs */
-	u64_ptr[idx]+=msk_f64_u64_hshv; /* Add 1 to the MSB of LSBs, carry 1 to mantissa or even exponent */
-	u64_ptr[idx]&=msk_f64_u64_zro; /* Shave it */
-      } /* !mss_val_cmp_dbl */
-    } /* !idx */
-    break; /* !NC_DOUBLE */
-  default: 
-    (void)fprintf(stderr,"ERROR: %s reports datum size = %d B is invalid for %s filter\n",fnc_nm,type,""/*CCR_FLT_NAME*/);
-    break;
-  } /* !type */
-  
-} /* ccr_gbr() */
diff --git a/src/ext/for/ext/gov2.png b/src/ext/for/ext/gov2.png
deleted file mode 100644
index 423dcf38..00000000
Binary files a/src/ext/for/ext/gov2.png and /dev/null differ
diff --git a/src/ext/for/ext/libdroundfast.c b/src/ext/for/ext/libdroundfast.c
deleted file mode 100644
index 740c82f6..00000000
--- a/src/ext/for/ext/libdroundfast.c
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
- * Copyright (c) 2019, CNES.
- *
- * This source code is licensed under MIT-style license (found in the
- * COPYING file in the root directory of this source tree).
- */
-//https://github.com/CNES/Digit_Rounding/blob/master/libdround/src/libdroundfast.c
-
-#include <math.h>
-
-#define LOG2_10		3.321928095		// log2(10)
-#define LOG10_2		0.301029996		// log10(2)
-
-#define SIGN(x)		( (x<0) ? -1 : 1 )
-
-const float TABLE[5][2] = {
-  {0.6, -LOG10_2},
-  {0.7,-0.221848749},
-  {0.8,-0.154901959},
-  {0.9,-0.096910013},
-  {1.0,-0.045757490},
-};
-
-/*
- * Round the float value keeping nsd significant digits.
- * Fast method that does not uses log10() function.
- */
-double droundFast(double v, int nsd)
-{
-	// compute the number of digits before the decimal point of the input floating-point value v
-	// The value v is interpreted as v = 10^d + eps = 2^e + m
-	// with 0 <= m < 0.5
-	int e;
-	double m = frexp(v, &e);	// return the binary exponent e of the input value v = 2^e + m with 0 <= m < 0.5
-
-	// =============
-	// --- tabulated method ---
-	// tabulate the LOG10(m)
-	int i = 0;
-	while (TABLE[i][0] < m)
-	  {
-	    i++;
-	  }
-	float log10m = TABLE[i][1];
-	
-	// --- low precision method ---
-	// float log10m = -LOG10_2;
-	// =============
-
-	// convert the binary exponent to a number of digits: d = floor(e*log10(2) + log10(m)) + 1
-	int d = (int) floor(e*LOG10_2 + log10m) + 1;
-
-	// compute the power of the quantization step: q = 2^p
-	int p = (int) floor(LOG2_10 * (d - nsd));
-	// compute quantization step: q = 2^p
-	double q = ldexp(1, p);
-
-	// apply the quantization step depending on the bias
-	return SIGN(v) * (floor(fabs(v) / q) + 0.5) * q;
-}
-
diff --git a/src/ext/for/ext/polycom/optp4.c b/src/ext/for/ext/polycom/optp4.c
deleted file mode 100644
index 414985e1..00000000
--- a/src/ext/for/ext/polycom/optp4.c
+++ /dev/null
@@ -1,22 +0,0 @@
-#include "../OPT_PFD/opt_p4.h" 							// OptPFD
-
-unsigned char *optpfdenc32(unsigned *__restrict in, int n, unsigned *__restrict out) {
-  if(n < 128) 
-    out = vbyteenc(in, n, (unsigned *)out);
-  else { 
-    unsigned tmp[OPTPFDMAX]; 
-    for(i = 0; i < n; i++) tmp[i] = in[i]; 
-    return out += OPT4(tmp, n, (unsigned *)out); 
-  }  
-  return out;
-}
-
-unsigned char *optpfddec32(unsigned *__restrict in, int n, unsigned *__restrict out) { 
-  if(n < 128) 
-    in = vbytedec(in, n, out); 
-  else { 
-    unsigned all_array[OPTPFDMAX]; 
-    return (unsigned char *)detailed_p4_decode(out, (unsigned *)in, all_array); 
-  }
-}
-
diff --git a/src/ext/for/ext/polycom/optp4.h b/src/ext/for/ext/polycom/optp4.h
deleted file mode 100644
index 651513fe..00000000
--- a/src/ext/for/ext/polycom/optp4.h
+++ /dev/null
@@ -1,11 +0,0 @@
-#ifdef __cplusplus
-extern "C" {
-#endif
-#define OPTPFDMAX 2048
-unsigned char *optpfdenc32(unsigned *__restrict in, int n, unsigned *__restrict out);
-unsigned char *optpfddec32(unsigned *__restrict in, int n, unsigned *__restrict out);
-
-#ifdef __cplusplus
-}
-#endif
-
diff --git a/src/ext/for/ext/polycom/optpfd.c b/src/ext/for/ext/polycom/optpfd.c
deleted file mode 100644
index 84a4ac59..00000000
--- a/src/ext/for/ext/polycom/optpfd.c
+++ /dev/null
@@ -1,26 +0,0 @@
-#include <stdlib.h>
-#include "../OPT_PFD/opt_p4.h" 							// OptPFD
-
-#include "optpfd.h"
-#include "polyvbyte.h"
-unsigned char *optpfdenc32(unsigned *in, int n, unsigned char *out) {
-  if(n < 128) 
-    out = vbpolyenc(in, n, out);
-  else { 
-    unsigned tmp[OPTPFDMAX],i; 
-    for(i = 0; i < n; i++) tmp[i] = in[i]; 
-    return out += OPT4(tmp, n, (unsigned *)out); 
-  }  
-  return out;
-}
-
-unsigned char *optpfddec32(unsigned char *in, int n, unsigned *out) { 
-  if(n < 128) 
-    in = vbpolydec(in, n, out); 
-  else { 
-    unsigned all_array[OPTPFDMAX]; 
-    in = (unsigned char *)detailed_p4_decode(out, (unsigned *)in, all_array); 
-  }
-  return in;
-}
-
diff --git a/src/ext/for/ext/polycom/optpfd.h b/src/ext/for/ext/polycom/optpfd.h
deleted file mode 100644
index 9ff838e7..00000000
--- a/src/ext/for/ext/polycom/optpfd.h
+++ /dev/null
@@ -1,11 +0,0 @@
-#ifdef __cplusplus
-extern "C" {
-#endif
-#define OPTPFDMAX 2048
-unsigned char *optpfdenc32(unsigned      *in, int n, unsigned char *out);
-unsigned char *optpfddec32(unsigned char *in, int n, unsigned      *out);
-
-#ifdef __cplusplus
-}
-#endif
-
diff --git a/src/ext/for/ext/polycom/polyvbyte.c b/src/ext/for/ext/polycom/polyvbyte.c
deleted file mode 100644
index 480180c5..00000000
--- a/src/ext/for/ext/polycom/polyvbyte.c
+++ /dev/null
@@ -1,14 +0,0 @@
-#include "vbyte_poly.h"
-#include "polyvbyte.h"
-
-unsigned char *vbpolyenc(unsigned *in, unsigned n, unsigned char *out) {
-  unsigned i; 
-  for(i = 0; i < n; i++) { unsigned x = in[i]; VBYTE_ENC(out, x); } 
-  return out;
-}
-unsigned char *vbpolydec(unsigned char *in, unsigned n, unsigned *out) {
-  unsigned i; 
-  for(i = 0; i < n; i++) { unsigned x; VBYTE_DEC(in, x); out[i] = x; } 
-  return in;
-}
-
diff --git a/src/ext/for/ext/polycom/polyvbyte.h b/src/ext/for/ext/polycom/polyvbyte.h
deleted file mode 100644
index f8b3a998..00000000
--- a/src/ext/for/ext/polycom/polyvbyte.h
+++ /dev/null
@@ -1,10 +0,0 @@
-#ifdef __cplusplus
-extern "C" {
-#endif
-unsigned char *vbpolyenc(unsigned      *in, unsigned n, unsigned char *out);
-unsigned char *vbpolydec(unsigned char *in, unsigned n, unsigned *out);
-#ifdef __cplusplus
-}
-#endif
-
-
diff --git a/src/ext/for/ext/polycom/vbyte_poly.h b/src/ext/for/ext/polycom/vbyte_poly.h
deleted file mode 100644
index 3c2668d0..00000000
--- a/src/ext/for/ext/polycom/vbyte_poly.h
+++ /dev/null
@@ -1,46 +0,0 @@
-// 
-#define VBYTE_ENC(_v, _n)	\
-{\
-	unsigned _num;				\
-	unsigned char _barray[5];	\
-	unsigned _i, _started = 0;	\
- 	_num = _n;					\
-	for (_i = 0; _i < 5; _i++)	\
-	{							\
-		_barray[_i] = ((_num%128)<<1);	\
-		_num = _num/128;		\
-	}							\
-	for (_i = 4; _i > 0; _i--)	\
-	{							\
-		if ((_barray[_i] != 0) || (_started == 1))	\
-		{						\
-			_started = 1;		\
-			*_v = _barray[_i]|0x1;	\
-			_v++;				\
-		}						\
-	}							\
-	*_v = _barray[0]|0x0;		\
-	_v++;						\
-}
-
-#define VBYTE_DEC(_v, _n)	\
-{\
-	_n = ((*_v>>1));						\
-	if ((*_v&0x1) != 0)		\
-        {							\
-          _v++;				\
-	  _n = (_n<<7) + ((*_v>>1));	\
-	  if ((*_v&0x1)!= 0)		\
-          {						\
-            _v++;				\
-	    _n = (_n<<7) + ((*_v>>1));	\
-	    if ((*_v&0x1) != 0)		\
-            {						\
-              _v++;				\
-	      _n = (_n<<7) + ((*_v>>1));	\
-	    }\
-	  }\
-	}\
-        _v++;				\
-}
-
diff --git a/src/ext/for/ext/rc.c b/src/ext/for/ext/rc.c
deleted file mode 100644
index d7088fac..00000000
--- a/src/ext/for/ext/rc.c
+++ /dev/null
@@ -1,1809 +0,0 @@
-// Copyright (c) 2008, WEST, Polytechnic Institute of NYU.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-//     * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//     * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-//     * Neither the name of WEST, Polytechnic Institute of NYU. nor the names 
-// of its contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Author: Torsten Suel, Jiangong Zhang, Jinru He
-// 
-// If you have any questions or problems about our codes, please contact:
-// jhe@cis.poly.edu
-// 
-// 
-
-//#include "rice_coding2.h"
-//#include <stdio.h>
-
-/*rc_rice_coding2() {
-	// TODO Auto-generated constructor stub
-	cnum[0] = 0;
-	cnum[1] = 1;
-	cnum[2] = 2;
-	cnum[3] = 3;
-	cnum[4] = 4;
-	cnum[5] = 5;
-	cnum[6] = 6;
-	cnum[7] = 7;
-	cnum[8] = 8;
-	cnum[9] = 9;
-	cnum[10] = 10;
-	cnum[11] = 11;
-	cnum[12] = 12;
-	cnum[13] = 13;
-	cnum[14] = 16;
-	cnum[15] = 20;
-	cnum[16] = 32;
-}*/
-#define	coding_type 3
-#define block_size 128
-
-static int cnum[] = { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,16,20,32 };
-
-/*rc_~rice_coding2() {
-	// TODO Auto-generated destructor stub
-}*/
-
-/*int rc_get_type()
-{
-	return coding_type;
-}
-
-void rc_set_size(int size)
-{
-	this->block_size = size;
-}*/
-//void pack(unsigned int *v, unsigned int b, unsigned int n, unsigned int *w);
-#include "../bitpack.h"
-#include "rc.h"
-
-	void setBit(unsigned char *buf, unsigned int *bp, unsigned int val)
-	{                                     
-	  unsigned int bPtr;
-	  unsigned int w;
-
-	  bPtr = (*bp)&7;
-	  if (bPtr == 0)  buf[(*bp)>>3] = 0;
-	  if (val == 1)  buf[(*bp)>>3] |= (1<<bPtr);
-	  (*bp)++;
-	}
-
-/**********************
- * w: output buffer
- * buf: input buffer
- * bits: b value;
- * BS: block size
- */
-unsigned char *rc_turbo_rice_encode(unsigned *w, unsigned int **buf, unsigned int bits)
-{
-  unsigned int bp;
-  unsigned int val;
-  unsigned int i;
-  int s;
-  unsigned int out[block_size];
-
-  if (bits > 0)
-  {
-    s = ((bits * block_size)>>5);
-    for (i = 0; i < s; i++)  w[i] = 0;
-    for (i = 0; i < block_size; i++)  out[i] = (*buf)[i] & ((1u<<bits)-1);// MASK[bits];
-    //pack(out, bits, block_size, *w);
... 33309 lines suppressed ...


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org