You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by ji...@apache.org on 2023/06/13 02:49:23 UTC
[doris-thirdparty] branch clucene updated: [Fix](PFOR) revert TurboPFOR to last version, and fix some build issue (#88)
This is an automated email from the ASF dual-hosted git repository.
jianliangqi pushed a commit to branch clucene
in repository https://gitbox.apache.org/repos/asf/doris-thirdparty.git
The following commit(s) were added to refs/heads/clucene by this push:
new dae2b5d8 [Fix](PFOR) revert TurboPFOR to last version, and fix some build issue (#88)
dae2b5d8 is described below
commit dae2b5d830a942e2e9692e2f3c0f609eff767d5a
Author: airborne12 <ai...@gmail.com>
AuthorDate: Tue Jun 13 10:49:18 2023 +0800
[Fix](PFOR) revert TurboPFOR to last version, and fix some build issue (#88)
---
src/core/CLucene/index/SegmentTermDocs.cpp | 4 +
src/core/CMakeLists.txt | 2 +-
src/ext/for/CMakeLists.txt | 59 +-
src/ext/for/README.md | 585 +
src/ext/for/bic.c | 201 -
src/ext/for/bitpack.c | 339 +-
src/ext/for/{include_ => }/bitpack.h | 136 +-
src/ext/for/bitpack_.h | 1041 +-
src/ext/for/bitunpack.c | 818 +-
src/ext/for/bitunpack_.h | 1032 +-
src/ext/for/bitutil.c | 770 +-
src/ext/for/{include_/bitutil_.h => bitutil.h} | 350 +-
src/ext/for/{include_ => }/conf.h | 230 +-
src/ext/for/eliasfano.c | 213 -
src/ext/for/ext/OPT_PFD/main.cpp | 101 -
src/ext/for/ext/OPT_PFD/opt_p4.h | 54 -
src/ext/for/ext/OPT_PFD/pf.h | 158 -
src/ext/for/ext/OPT_PFD/s16head.h | 251 -
src/ext/for/ext/OPT_PFD/unpack.h | 773 --
src/ext/for/ext/SPDP_10.c | 238 -
src/ext/for/ext/bg/bg.c | 185 -
src/ext/for/ext/bg/bg.h | 109 -
src/ext/for/ext/bg/defines.h | 54 -
src/ext/for/ext/fastpfor.cc | 121 -
src/ext/for/ext/fastpfor.h | 20 -
src/ext/for/ext/gb.c | 151 -
src/ext/for/ext/gov2.png | Bin 33041 -> 0 bytes
src/ext/for/ext/libdroundfast.c | 61 -
src/ext/for/ext/polycom/optp4.c | 22 -
src/ext/for/ext/polycom/optp4.h | 11 -
src/ext/for/ext/polycom/optpfd.c | 26 -
src/ext/for/ext/polycom/optpfd.h | 11 -
src/ext/for/ext/polycom/polyvbyte.c | 14 -
src/ext/for/ext/polycom/polyvbyte.h | 10 -
src/ext/for/ext/polycom/vbyte_poly.h | 46 -
src/ext/for/ext/rc.c | 1809 ---
src/ext/for/ext/rc.h | 8 -
src/ext/for/ext/simdcomp_/simdfor.c | 14501 ----------------------
src/ext/for/ext/simple8b.c | 330 -
src/ext/for/ext/simple8b.h | 9 -
src/ext/for/ext/vabyte.h | 99 -
src/ext/for/ext/varintg8iu.c | 184 -
src/ext/for/ext/varintg8iu.h | 5 -
src/ext/for/ext/vas16c.h | 36 -
src/ext/for/ext/vas16d.h | 403 -
src/ext/for/fp.c | 954 --
src/ext/for/{include_ => }/fp.h | 65 +-
src/ext/for/icapp.c | 2326 ----
src/ext/for/iccodec.c | 813 --
src/ext/for/idx.h | 53 -
src/ext/for/idxcr.c | 175 -
src/ext/for/idxqry.c | 682 -
src/ext/for/idxseg.c | 133 -
src/ext/for/include_/bic.h | 66 -
src/ext/for/include_/bitiobe.h | 42 -
src/ext/for/include_/bitutil.h | 160 -
src/ext/for/include_/eliasfano.h | 36 -
src/ext/for/include_/iccodec.h | 109 -
src/ext/for/include_/transpose.h | 231 -
src/ext/for/include_/vbit.h | 29 -
src/ext/for/include_/vint.h | 249 -
src/ext/for/include_/vlcbit.h | 117 -
src/ext/for/include_/vlcbyte.h | 170 -
src/ext/for/jic.c | 175 -
src/ext/for/jic.h | 693 --
src/ext/for/libext.mak | 327 -
src/ext/for/makefile | 156 +
src/ext/for/makefile.vs | 78 +
src/ext/for/{include_ => }/sse_neon.h | 217 +-
src/ext/for/{include_ => }/time_.h | 159 +-
src/ext/for/transpose.c | 1171 +-
src/ext/for/transpose.h | 113 +
src/ext/for/transpose_.c | 472 -
src/ext/for/trle.c | 125 -
src/ext/for/{include_ => }/trle.h | 32 +-
src/ext/for/trle_.h | 8 +-
src/ext/for/trlec.c | 87 +-
src/ext/for/trled.c | 71 +-
src/ext/for/v8.c | 449 +-
src/ext/for/v8pack.c | 203 -
src/ext/for/vbit.c | 304 -
src/ext/for/vint.c | 153 +-
src/ext/for/vint.h | 401 +
src/ext/for/{include_ => }/vp4.h | 107 +-
src/ext/for/vp4c.c | 132 +-
src/ext/for/vp4d.c | 116 +-
src/ext/for/vs/bitpack_avx2.c | 2 +
src/ext/for/vs/bitpack_sse.c | 2 +
src/ext/for/vs/bitunpack_avx2.c | 2 +
src/ext/for/vs/bitunpack_sse.c | 2 +
src/ext/for/vs/getopt.c | 562 +
src/ext/for/vs/getopt.h | 97 +
src/ext/for/vs/inttypes.h | 306 +
src/ext/for/vs/stdint.h | 259 +
src/ext/for/vs/transpose_avx2.c | 2 +
src/ext/for/vs/transpose_sse.c | 2 +
src/ext/for/vs/vp4c_avx2.c | 2 +
src/ext/for/vs/vp4c_sse.c | 2 +
src/ext/for/vs/vp4d_avx2.c | 2 +
src/ext/for/vs/vp4d_sse.c | 2 +
src/ext/for/vs/vs2017/TurboPFor.sln | 41 +
src/ext/for/vs/vs2017/TurboPFor.vcxproj | 226 +
src/ext/for/vs/vs2017/TurboPFor.vcxproj.filters | 101 +
src/ext/for/vs/vs2017/icapp.vcxproj | 175 +
src/ext/for/vs/vs2017/icapp.vcxproj.filters | 21 +
src/ext/for/vsimple.c | 43 +-
src/ext/for/{include_ => }/vsimple.h | 31 +-
107 files changed, 6690 insertions(+), 32931 deletions(-)
diff --git a/src/core/CLucene/index/SegmentTermDocs.cpp b/src/core/CLucene/index/SegmentTermDocs.cpp
index f64256d8..0fe90357 100644
--- a/src/core/CLucene/index/SegmentTermDocs.cpp
+++ b/src/core/CLucene/index/SegmentTermDocs.cpp
@@ -162,6 +162,10 @@ int32_t SegmentTermDocs::read(int32_t *docs, int32_t *freqs, int32_t length) {
}
}
} else {
+ // NOTE: Pad arraySize from 511 to 512 for alignment since the first block size is 511, and add one more extra space to prevent overflow.
+ auto paddingSize = (arraySize / PFOR_BLOCK_SIZE) * PFOR_BLOCK_SIZE + PFOR_BLOCK_SIZE;
+ _docs.resize(paddingSize + 1);
+ _freqs.resize(paddingSize + 1);
{
uint32_t SerializedSize = freqStream->readVInt();
std::vector<uint8_t> buf(SerializedSize + PFOR_BLOCK_SIZE);
diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt
index 8f9422fa..ce1dfcb5 100644
--- a/src/core/CMakeLists.txt
+++ b/src/core/CMakeLists.txt
@@ -209,7 +209,7 @@ file(GLOB_RECURSE HEADERS ${clucene-core_SOURCE_DIR}/*.h )
#create the libraries
INCLUDE_DIRECTORIES(${clucene_SOURCE_DIR}/src/core)
-INCLUDE_DIRECTORIES(${clucene_SOURCE_DIR}/src/ext/for/include_)
+INCLUDE_DIRECTORIES(${clucene_SOURCE_DIR}/src/ext/for)
IF (BUILD_SHARED_LIBRARIES)
add_library(clucene-core SHARED
diff --git a/src/ext/for/CMakeLists.txt b/src/ext/for/CMakeLists.txt
index e14dcd48..3b14781f 100644
--- a/src/ext/for/CMakeLists.txt
+++ b/src/ext/for/CMakeLists.txt
@@ -1,37 +1,28 @@
cmake_minimum_required(VERSION 3.10)
project(powturbo)
-#INCLUDE (DefineOptions)
-#DEFINE_OPTIONS(EXTRA_OPTIONS EXTRA_LIBS)
-
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_C_STANDARD 99)
-# Compiler options
if(NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE Release)
endif()
-#set(CMAKE_C_FLAGS_DEBUG "-DDEBUG -g")
-set(CMAKE_C_FLAGS "-DNDEBUG -s -O3")
+set(DEBUG "-DNDEBUG -g -O3")
set(OPT "-w -Wall -fstrict-aliasing -falign-loops -Wno-int-conversion")
-set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OPT}")
-
-
-# Architecture-specific settings
if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64")
set(SSE "-march=corei7-avx -mtune=corei7-avx")
set(AVX2 "-march=haswell")
+ set(CMAKE_C_FLAGS ${SSE})
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
set(SSE "-march=armv8-a")
+ set(CMAKE_C_FLAGS "-march=armv8-a")
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc64le")
set(SSE "-D__SSSE3__")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mcpu=power9 -mtune=power9")
endif()
-set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SSE}")
-
if(FLOAT16)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DUSE_FLOAT16")
endif()
@@ -39,58 +30,62 @@ endif()
if(STATIC)
set(CMAKE_EXE_LINKER_FLAGS "-static")
endif()
-
+set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${DEBUG} ${OPT}")
+
+separate_arguments(avx2_c_flags_list UNIX_COMMAND "${DEBUG} ${OPT}")
separate_arguments(c_flags_list UNIX_COMMAND "${CMAKE_C_FLAGS}")
-# Include directories
+
include_directories(${CMAKE_CURRENT_SOURCE_DIR})
-# Define base source files
set(SRC_FILES
- ${CMAKE_CURRENT_SOURCE_DIR}/bitutil.c
- ${CMAKE_CURRENT_SOURCE_DIR}/bitpack.c
- ${CMAKE_CURRENT_SOURCE_DIR}/bitunpack.c
- ${CMAKE_CURRENT_SOURCE_DIR}/vp4c.c
- ${CMAKE_CURRENT_SOURCE_DIR}/vp4d.c
- ${CMAKE_CURRENT_SOURCE_DIR}/transpose.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/bitpack.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/bitunpack.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/vp4c.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/vp4d.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/transpose.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/bitutil.c
)
-# Add base source files to library
add_library(ic STATIC
- fp.c
v8.c
vint.c
trlec.c
trled.c
- vsimple.c
- eliasfano.c
+ vsimple.c
+ bitutil.c
+ bitpack.c
+ bitunpack.c
+ vp4c.c
+ vp4d.c
+ transpose.c
)
-# Add custom commands to generate SSE and AVX2 versions of source files
foreach(SRC_FILE ${SRC_FILES})
get_filename_component(SRC_NAME ${SRC_FILE} NAME_WE)
- #set(SSE_OUTPUT ${SRC_NAME}_sse.o)
+ set(SSE_OUTPUT ${SRC_NAME}_sse.o)
set(AVX2_OUTPUT ${SRC_NAME}_avx2.o)
- set(OUTPUT ${SRC_NAME}.o)
add_custom_command(
- OUTPUT ${OUTPUT}
- COMMAND ${CMAKE_C_COMPILER} -c -o ${OUTPUT} ${SRC_FILE} ${c_flags_list}
+ OUTPUT ${SSE_OUTPUT}
+ COMMAND ${CMAKE_C_COMPILER} -DSSE2_ON ${c_flags_list} -c -o ${SSE_OUTPUT} ${SRC_FILE}
DEPENDS ${SRC_FILE}
)
- target_sources(ic PRIVATE ${OUTPUT})
target_sources(ic PRIVATE ${SSE_OUTPUT})
if(USE_AVX2)
add_custom_command(
OUTPUT ${AVX2_OUTPUT}
- COMMAND ${CMAKE_C_COMPILER} -c -o ${AVX2_OUTPUT} ${SRC_FILE} ${c_flags_list} ${AVX2}
+ COMMAND ${CMAKE_C_COMPILER} ${AVX2} -DAVX2_ON ${avx2_c_flags_list} -c -o ${AVX2_OUTPUT} ${SRC_FILE}
DEPENDS ${SRC_FILE}
)
target_sources(ic PRIVATE ${AVX2_OUTPUT})
endif()
endforeach()
+
+set(LIB_DESTINATION ../)
+
install(TARGETS ic
DESTINATION ${LIB_DESTINATION}
COMPONENT ext)
diff --git a/src/ext/for/README.md b/src/ext/for/README.md
new file mode 100644
index 00000000..31e8f25e
--- /dev/null
+++ b/src/ext/for/README.md
@@ -0,0 +1,585 @@
+TurboPFor: Fastest Integer Compression
+
+[//]: # ([![Build Status][travisBadge]][travisLink])
+[//]: # ([travisBadge]: https://api.travis-ci.com/powturbo/TurboPFor-Integer-Compression.svg?branch=master)
+[//]: # ([travisLink]: https://app.travis-ci.com/powturbo/TurboPFor-Integer-Compression)
+======================================
+* **TurboPFor: The synonym for "integer compression"**
+ * **ALL** functions available for **AMD/Intel**, **64 bits ARMv8 NEON** Linux+MacOS/M1 & **Power9 Altivec**
+ * 100% C (C++ headers), as simple as memcpy. OS:Linux amd64, arm64, Power9, MacOs (Amd/intel + Apple M1),
+ * :+1: **Java** Critical Natives/JNI. Access TurboPFor **incl. SIMD/AVX2!** from Java as fast as calling from C
+ * :sparkles: **FULL** range 8/16/32/64 bits scalar + 16/32/64 bits SIMD functions
+ * No other "Integer Compression" compress/decompress faster
+ * :sparkles: Direct Access, **integrated** (SIMD/AVX2) FOR/delta/Delta of Delta/Zigzag for sorted/unsorted arrays
+ * **16 bits** + **64 bits** SIMD integrated functions
+* **For/PFor/PForDelta**
+ * **Novel TurboPFor** (PFor/PForDelta) scheme w./ **direct access** + **SIMD/AVX2**. **+RLE**
+ * Outstanding compression/speed. More efficient than **ANY** other fast "integer compression" scheme.
+ * Compress 70 times faster and decompress up to 4 times faster than OptPFD
+* **Bit Packing**
+ * Fastest and most efficient **"SIMD Bit Packing"** **15 Billions integers/sec (60Gb/s!)**
+ * Scalar **"Bit Packing"** decoding nearly as fast as SIMD-Packing in realistic (No "pure cache") scenarios
+ * **Direct/Random Access** : Access any single bit packed entry with **zero decompression**
+* **Variable byte**
+ * Scalar **"Variable Byte"** faster and more efficient than **ANY** other implementation
+ * SIMD **TurboByte** fastest group varint (16+32 bits) incl. integrated delta,zigzag,...
+ * **TurboByte+TurboPackV** novel hybrid scheme combining the fastest SIMD codecs.
+* **Simple family**
+ * **Novel** **"Variable Simple"** (incl. **RLE**) faster and more efficient than simple16, simple-8b
+* **Elias fano**
+ * Fastest **"Elias Fano"** implementation w/ or w/o SIMD/AVX2
+* **Transform**
+ * Scalar & SIMD Transform: Delta, Zigzag, Zigzag of delta, XOR, Transpose/Shuffle,
+ * **lossy** floating point compression with *TurboPFor* or [TurboTranspose](https://github.com/powturbo/TurboTranspose)+lz77
+* **Floating Point Compression**
+ * Delta/Zigzag + improved gorilla style + (Differential) Finite Context Method FCM/DFCM floating point compression
+ * Using **TurboPFor**, unsurpassed compression and more than 5 GB/s throughput
+ * Point wise relative error bound **lossy** floating point compression
+ * **TurboFloat** novel efficient floating point compression using TurboPFor
+* **Time Series Compression**
+ * **Fastest Gorilla** 16/32/64 bits style compression (**zigzag of delta** + **RLE**).
+ * Can compress time series to only 0.01%. Speed > 10 GB/s compression and > 13 GB/s decompression.
+* **Inverted Index ...do less, go fast!**
+ * Direct Access to compressed *frequency* and *position* data w/ zero decompression
+ * **Novel** **"Intersection w/ skip intervals"**, decompress the minimum necessary blocks (**~10-15%)!**.
+ * **Novel** Implicit skips with zero extra overhead
+ * **Novel** Efficient **Bidirectional** Inverted Index Architecture (forward/backwards traversal) incl. "integer compression".
+ * more than **2000! queries per second** on GOV2 dataset (25 million documents) on a **SINGLE** core
+ * :sparkles: Revolutionary Parallel Query Processing on Multicores **> 7000!!! queries/sec** on a simple quad core PC.<br>
+ **...forget** ~~Map Reduce, Hadoop, multi-node clusters,~~ ...
+
+![Promo video](turbopfor.jpg?raw=true)
+
+### Integer Compression Benchmark (single thread):
+- Download [IcApp](https://sites.google.com/site/powturbo/downloads) a new benchmark for TurboPFor<br>
+ for testing almost all integer and floating point file types.
+- Practical (No **PURE** cache) "integer compression" benchmark w/ **large** arrays.
+- [Benchmark Intel CPU: Skylake i7-6700 3.4GHz gcc 9.2](https://github.com/powturbo/TurboPFor/issues/47)
+- [Benchmark ARM: ARMv8 A73-ODROID-N2 1.8GHz](https://github.com/powturbo/TurboPFor/issues/49)
+
+##### - Synthetic data:
+ - Generate and test (zipfian) skewed distribution (100.000.000 integers, Block size=128/256)<br>
+ Note: Unlike general purpose compression, a small fixed size (ex. 128 integers) is in general used in "integer compression".
+ Large blocks involved, while processing queries (inverted index, search engines, databases, graphs, in memory computing,...) need to be entirely decoded.
+
+ ./icbench -a1.5 -m0 -M255 -n100M ZIPF
+
+|C Size|ratio%|Bits/Integer|C MB/s|D MB/s|Name 2019.11|
+|--------:|-----:|--------:|----------:|----------:|--------------|
+|62,939,886| 15.7| 5.04|**2369**|**10950**|**TurboPFor256**|
+|63,392,759| 15.8| 5.07|1359|7803|**TurboPFor128**|
+|63,392,801| 15.8| 5.07|1328|924|**TurboPForDA**|
+|65,060,504| 16.3| 5.20|60|2748|[FP_SIMDOptPFor](#FastPFor)|
+|65,359,916|16.3| 5.23| 32|2436|PC_OptPFD|
+|73,477,088|18.4| 5.88|408|2484|PC_Simple16|
+|73,481,096| 18.4| 5.88|624|8748|[FP_SimdFastPFor](#FastPFor) 64Ki *|
+|76,345,136| 19.1| 6.11|1072|2878|**VSimple**|
+|91,947,533| 23.0| 7.36|284|11737|[QMX](#QMX) 64k *|
+|93,285,864| 23.3| 7.46|1568|10232|[FP_GroupSimple](#FastPFor) 64Ki *|
+|95,915,096|24.0| 7.67| 848|3832|Simple-8b|
+|99,910,930| 25.0| 7.99|**17298**|**12408**|**TurboByte+TurboPack**|
+|99,910,930| 25.0| 7.99|**17357**|**12363**|**TurboPackV** sse|
+|99,910,930| 25.0| 7.99|11694|10138|**TurboPack** scalar|
+|99,910,930| 25.0| 7.99|8420|8876|**TurboFor**|
+|100,332,929| 25.1| 8.03|17077|11170|**TurboPack256V** avx2|
+|101,015,650| 25.3| 8.08|11191|10333|**TurboVByte**|
+|102,074,663| 25.5| 8.17|6689|9524|[MaskedVByte](#MaskedVByte)|
+|102,074,663| 25.5| 8.17|2260|4208|[PC_Vbyte](#PolyCom)|
+|102,083,036| 25.5| 8.17|5200|4268|[FP_VByte](#FastPFor)|
+|112,500,000| 28.1| 9.00|1528|12140|[VarintG8IU](#VarintG8IU)|
+|125,000,000| 31.2|10.00|13039|12366|**TurboByte**|
+|125,000,000| 31.2|10.00|11197|11984|[StreamVbyte 2019](#StreamVByte)|
+|400,000,000| 100.00| 32.00| 8960|8948|Copy|
+| | | | N/A | N/A |EliasFano|
+
+(*) codecs inefficient for small block sizes are tested with 64Ki integers/block.
+
+- MB/s: 1.000.000 bytes/second. **1000 MB/s = 1 GB/s**<br>
+- **#BOLD** = pareto frontier.<br>
+- FP=FastPFor SC:simdcomp PC:Polycom<br>
+- TurboPForDA,TurboForDA: Direct Access is normally used when accessing few individual values.<br>
+- Eliasfano can be directly used only for increasing sequences
+------------------------------------------------------------------------
+##### - Data files:
+ - gov2.sorted from [DocId data set](#DocId) Block size=128/Delta coding
+
+ ./icbench -fS -r gov2.sorted
+
+![Speed/Ratio](ext/gov2.png "Speed/Ratio: Decompression")
+
+|Size |Ratio %|Bits/Integer|C Time MB/s|D Time MB/s|Function 2019.11|
+|-----------:|------:|-----:|-------:|-------:|---------------------|
+| 3,321,663,893| 13.9| 4.44|**1320**|**6088**|**TurboPFor**|
+| 3,339,730,557| 14.0| 4.47| 32| 2144|PC.OptPFD|
+| 3,350,717,959| 14.0| 4.48|**1536**|**7128**|**TurboPFor256**|
+| 3,501,671,314| 14.6| 4.68| 56| 2840|**VSimple**|
+| 3,768,146,467| 15.8| 5.04|**3228**| 3652|**EliasFanoV**|
+| 3,822,161,885| 16.0| 5.11| 572| 2444|PC_Simple16|
+| 4,411,714,936| 18.4| 5.90|**9304**|**10444**|**TurboByte+TurboPack**|
+| 4,521,326,518| 18.9| 6.05| 836| 3296|Simple-8b|
+| 4,649,671,427| 19.4| 6.22|3084| 3848|**TurboVbyte**|
+| 4,955,740,045| 20.7| 6.63|7064|10268|**TurboPackV**|
+| 4,955,740,045| 20.7| 6.63|5724| 8020|**TurboPack**|
+| 5,205,324,760| 21.8| 6.96|6952| 9488|SC_SIMDPack128|
+| 5,393,769,503| 22.5| 7.21|**14466**|**11902**|**TurboPackV256**|
+| 6,221,886,390| 26.0| 8.32|6668| 6952|**TurboFor**|
+| 6,221,886,390| 26.0| 8.32|6644| 2260|**TurboForDA**|
+| 6,699,519,000| 28.0| 8.96|1888| 1980|FP_Vbyte|
+| 6,700,989,563| 28.0| 8.96|2740| 3384|MaskedVByte|
+| 7,622,896,878| 31.9|10.20| 836| 4792|VarintG8IU|
+| 8,060,125,035| 33.7|11.50|8456| 9476|Streamvbyte 2019|
+| 8,594,342,216| 35.9|11.50|5228| 6376|libfor|
+|23,918,861,764|100.0|32.00|5824| 5924|Copy|
+
+Block size: 64Ki = 256k bytes. Ki=1024 Integers
+
+|Size |Ratio %|Bits/Integer|C Time MB/s|D Time MB/s|Function |
+|----------:|-----:|----:|------:|------:|---------------------|
+| 3,164,940,562| 13.2|**4.23**|**1344**|**6004**|**TurboPFor 64Ki**|
+| 3,273,213,464| 13.7| 4.38|**1496**|**7008**|**TurboPFor256 64Ki**|
+| 3,965,982,954| 16.6| 5.30|**1520**| 2452|[lz4](#lz4)+DT 64Ki|
+| 4,234,154,427| 17.7| 5.66| 436| 5672|qmx 64Ki|
+| 6,074,995,117| 25.4| 8.13| 1976| 2916|[blosc_lz4](#blosc) 64Ki|
+| 8,773,150,644| 36.7|11.74| 2548|5204|blosc_lz 64Ki|
+
+"lz4+DT 64Ki" = Delta+Transpose from TurboPFor + lz4<br>
+"blosc_lz4" internal lz4 compressor+vectorized shuffle
+
+##### - Time Series:
+- Test file [Timestamps: ts.txt(sorted)](https://github.com/zhenjl/encoding/tree/master/benchmark/data)
+
+ ./icapp -Ft ts.txt -I15 -J15
+
+|Function |C MB/s| size |ratio%| D MB/s|Text
+|----------------|-----:|--------:|------:|------:|--------------------|
+|bvzenc32 |**10632**|45,909|0.008|**12823**|ZigZag|
+|bvzzenc32 |**8914**|56,713|0.010|**13499**|ZigZag Delta of delta|
+|vsenc32 |**12294**|140,400| 0.024 |12877 |Variable Simple|
+|p4nzenc256v32 | 1932| 596,018| 0.10 |13326 |TurboPFor256 ZigZag|
+|p4ndenc256v32 | 1961| 596,018| 0.10 |13339 |TurboPFor256 Delta|
+|bitndpack256v32 |**12564**|909,189| 0.16 |13505 |TurboPackV256 Delta|
+|p4nzenc32 | 1810| 1,159,633| 0.20 | 8502 |TurboPFor ZigZag|
+|p4nzenc128v32 | 1795| 1,159,633| 0.20 |13338 |TurboPFor ZigZag|
+|bitnzpack256v32 | 9651| 1,254,757| 0.22 |**13503**|TurboPackV256 ZigZag|
+|bitnzpack128v32 |10155| 1,472,804| 0.26 |13380 |TurboPackV ZigZag|
+|vbddenc32 | 6198| 18,057,296| 3.13 |10982 |TurboVByte Delta of delta|
+|memcpy |13397|577,141,992|100.00||
+
+##### - Transpose/Shuffle (no compression)
+ ./icbench -eTRANSFORM ZIPF
+
+|Size |C Time MB/s|D Time MB/s|Function|
+|----------:|------:|------:|-----------------------------------|
+|100,000,000|**9400**|**9132**|**TPbyte 4** TurboPFor Byte Transpose/shuffle AVX2|
+|100,000,000|8784|8860|**TPbyte 4** TurboPFor Byte Transpose/shuffle SSE|
+|100,000,000|7688|7656|Blosc_Shuffle AVX2|
+|100,000,000|**5204**|**7460**|**TPnibble 4** TurboPFor Nibble Transpose/shuffle SSE|
+|100,000,000|6620|6284|Blosc shuffle SSE|
+|100,000,000|3156|3372|Bitshuffle AVX2|
+|100,000,000|2100|2176|Bitshuffle SSE|
+
+##### - (Lossy) Floating point compression:
+ ./icapp -Fd file " 64 bits floating point raw file
+ ./icapp -Ff file " 32 bits floating point raw file
+ ./icapp -Fcf file " text file with multiple entries (ex. 8.657,56.8,4.5 ...)
+ ./icapp -Ftf file " text file (1 entry per line)
+ ./icapp -Ftf file -v5 " + display the first entries read
+ ./icapp -Ftf file.csv -K3 " but use the 3rd column in a csv file (ex. number,Text,456.5 -> 456.5)
+ ./icapp -Ftf file -g.001 " lossy compression with allowed pointwise relative error 0.001
+
+- see also [TurboTranspose](https://github.com/powturbo/TurboTranspose)
+
+##### - Compressed Inverted Index Intersections with GOV2<br />
+ GOV2: 426GB, 25 Millions documents, average doc. size=18k.
+
+ + Aol query log: 18.000 queries<br />
+ **~1300** queries per second (single core)<br />
+ **~5000** queries per second (quad core)<br />
+ Ratio = 14.37% Decoded/Total Integers.
+
+ + TREC Million Query Track (1MQT):<br />
+ **~1100** queries per second (Single core)<br />
+ **~4500** queries per second (Quad core CPU)<br />
+ Ratio = 11.59% Decoded/Total Integers.
+
+- Benchmarking intersections (Single core, AOL query log)
+
+| max.docid/q|Time s| q/s | ms/q | % docid found|
+|-----------------:|---:|----:|-----:|-------:|
+|1.000|7.88|2283.1|0.438|81|
+|10.000|10.54|1708.5|0.585|84|
+| ALL |13.96|1289.0|0.776|100|
+q/s: queries/second, ms/q:milliseconds/query
+
+- Benchmarking Parallel Query Processing (Quad core, AOL query log)
+
+| max.docid/q|Time s| q/s | ms/q | % docids found|
+|-----------------:|----:|----:|-----:|-------:|
+|1.000|2.66|6772.6|0.148|81|
+|10.000|3.39|5307.5|0.188|84|
+|ALL|3.57|5036.5|0.199|100|
+
+###### Notes:
+- Search engines are spending 90% of the time in intersections when processing queries.
+- Most search engines are using pruning strategies, caching popular queries,... to reduce the time for intersections and query processing.
+- As an indication, Google is processing [40.000 queries per second](http://www.internetlivestats.com/google-search-statistics/),
+using [900.000 multicore servers](https://www.cloudyn.com/blog/10-facts-didnt-know-server-farms/) for searching [8 billions web pages](http://searchenginewatch.com/sew/study/2063479/coincidentally-googles-index-size-jumps) (320 X size of GOV2).
+- Recent "integer compression" GOV2 experiments (best paper at ECIR 2014) [On Inverted Index Compression for Search Engine Efficiency](http://www.dcs.gla.ac.uk/~craigm/publications/catena14compression.pdf) using 8-core Xeon PC are reporting 1.2 seconds per query (for 1.000 Top-k docids).
+
+### Compile:
+ Download or clone TurboPFor
+ git clone git://github.com/powturbo/TurboPFor.git
+ cd TurboPFor
+ make
+
+
+ To benchmark external libraries + lz77 compression:
+ git clone --recursive git://github.com/powturbo/TurboPFor.git
+ cd TurboPFor
+ make CODEC1=1 CODEC2=1 LZ=1
+
+###### Windows visual c++
+ nmake /f makefile.vs
+
+###### Windows visual studio c++
+ project files under vs/vs2017
+
+### Testing:
+##### - Synthetic data (use ZIPF parameter):
+ + benchmark groups of "integer compression" functions <br />
+
+ ./icbench -eBENCH -a1.2 -m0 -M255 -n100M ZIPF
+ ./icbench -eBITPACK/VBYTE -a1.2 -m0 -M255 -n100M ZIPF
+
+ >*Type "icbench -l1" for a list*
+
+ >*-zipfian distribution alpha = 1.2 (Ex. -a1.0=uniform -a1.5=skewed distribution)<br />
+ -number of integers = 100.000.000<br />
+ -integer range from 0 to 255<br />*
+
+ + Unsorted lists: individual function test (ex. Copy TurboPack TurboPFor)<br />
+
+ ./icbench -a1.5 -m0 -M255 -ecopy/turbopack/turbopfor/turbopack256v ZIPF
+
+ + Unsorted lists: Zigzag encoding w/ option **-fz** or FOR encoding<br />
+
+ ./icbench -fz -eturbovbyte/turbopfor/turbopackv ZIPF
+ ./icbench -eturboforv ZIPF
+
+ + Sorted lists: differential coding w/ option **-fs** (increasing) or **-fS** (strictly increasing)<br />
+
+ ./icbench -fs -eturbopack/turbopfor/turbopfor256v ZIPF
+
+ + Generate interactive "file.html" plot for browsing
+
+ ./icbench -p2 -S2 -Q3 file.tbb
+
+ + Unit test: test function from bit size 0 to 32
+
+ ./icbench -m0 -M32 -eturbopfor -fu
+ ./icbench -m0 -M8 -eturbopack -fs -n1M
+
+##### - Data files:
+ - Raw 32 bits binary data file [Test data](https://github.com/ot/partitioned_elias_fano/tree/master/test/test_data)
+
+ ./icbench file
+ ./icapp file
+ ./icapp -Fs file "16 bits raw binary file
+ ./icapp -Fu file "32 bits raw binary file
+ ./icapp -Fl file "64 bits raw binary file
+ ./icapp -Ff file "32 bits raw floating point binary file
+ ./icapp -Fd file "64 bits raw floating point binary file
+
+ - Text file: 1 entry per line. [Test data: ts.txt(sorted) and lat.txt(unsorted)](https://github.com/zhenjl/encoding/tree/master/benchmark/data)
+
+ ./icbench -eBENCH -fts ts.txt
+ ./icbench -eBENCH -ft lat.txt
+
+ ./icapp -Fts data.txt "text file, one 16 bits integer per line
+ ./icapp -Ftu ts.txt "text file, one 32 bits integer per line
+ ./icapp -Ftl ts.txt "text file, one 64 bits integer per line
+ ./icapp -Ftf file "text file, one 32 bits floating point (ex. 8.32456) per line
+ ./icapp -Ftd file "text file, one 64 bits floating point (ex. 8.324567789) per line
+ ./icapp -Ftd file -v5 "like prev., display the first 100 values read
+ ./icapp -Ftd file -v5 -g.00001 "like prev., error bound lossy floating point compression
+ ./icapp -Ftt file "text file, timestamp in seconds iso-8601 -> 32 bits integer (ex. 2018-03-12T04:31:06)
+ ./icapp -FtT file "text file, timestamp in milliseconds iso-8601 -> 64 bits integer (ex. 2018-03-12T04:31:06.345)
+ ./icapp -Ftl -D2 -H file "skip 1st line, convert numbers with 2 decimal digits to 64 bits integers (ex. 456.23 -> 45623)
+ ./icapp -Ftl -D2 -H -K3 file.csv "like prev., use the 3rd number in the line (ex. label=3245, text=99 usage=456.23 -> 456.23 )
+ ./icapp -Ftl -D2 -H -K3 -k| file.csv "like prev., use '|' as separator
+
+ - Text file: multiple numbers separated by non-digits (0..9,-,.) characters (ex. 134534,-45678,98788,4345, )
+
+ ./icapp -Fc data.txt "text file, 32 bits integers (ex. 56789,3245,23,678 )
+ ./icapp -Fcd data.txt "text file, 64 bits floating-point numbers (ex. 34.7689,5.20,45.789 )
+
+ - Multiblocks of 32 bits binary file. (Example gov2 from [DocId data set](#DocId))<br />
+ Block format: [n1: #of Ids][Id1] [Id2]...[IdN] [n2: #of Ids][Id1][Id2]...[IdN]...
+
+ ./icbench -fS -r gov2.sorted
+
+
+##### - Intersections:
+ 1 - Download Gov2 (or ClueWeb09) + query files (Ex. "1mq.txt") from [DocId data set](#DocId)<br />
+ 8GB RAM required (16GB recommended for benchmarking "clueweb09" files).
+
+ 2 - Create index file
+
+
+ ./idxcr gov2.sorted .
+
+
+ >*create inverted index file "gov2.sorted.i" in the current directory*
+
+ 3 - Test intersections
+
+
+ ./idxqry gov2.sorted.i 1mq.txt
+
+
+ >*run queries in file "1mq.txt" over the index of gov2 file*
+
+##### - Parallel Query Processing:
+ 1 - Create partitions
+
+
+ ./idxseg gov2.sorted . -26m -s8
+
+
+ >*create 8 (CPU hardware threads) partitions for a total of ~26 millions document ids*
+
+ 2 - Create index file for each partition
+
+
+ ./idxcr gov2.sorted.s*
+
+
+ >*create inverted index file for all partitions "gov2.sorted.s00 - gov2.sorted.s07" in the current directory*
+
+ 3 - Intersections:
+
+ delete "idxqry.o" file and then type "make para" to compile "idxqry" w. multithreading
+
+
+ ./idxqry gov2.sorted.s*.i 1mq.txt
+
+ >*run queries in file "1mq.txt" over the index of all gov2 partitions "gov2.sorted.s00.i - gov2.sorted.s07.i".*
+
+### Function usage:
+See benchmark "icbench" program for "integer compression" usage examples.
+In general encoding/decoding functions are of the form:
+
+ >**char *endptr = encode( unsigned *in, unsigned n, char *out, [unsigned start], [int b])**<br />
+ endptr : set by encode to the next character in "out" after the encoded buffer<br />
+ in : input integer array<br />
+ n : number of elements<br />
+ out : pointer to output buffer<br />
+ b : number of bits. Only for bit packing functions<br />
+ start : previous value. Only for integrated delta encoding functions
+
+
+ >**char *endptr = decode( char *in, unsigned n, unsigned *out, [unsigned start], [int b])**<br />
+ endptr : set by decode to the next character in "in" after the decoded buffer<br />
+ in : pointer to input buffer<br />
+ n : number of elements<br />
+ out : output integer array<br />
+ b : number of bits. Only for bit unpacking functions<br />
+ start : previous value. Only for integrated delta decoding functions
+
+ **Simple high level functions:**
+ >**size_t compressed_size = encode( unsigned *in, size_t n, char *out)**<br />
+ compressed_size : number of bytes written into compressed output buffer out<br />
+
+ >**size_t compressed_size = decode( char *in, size_t n, unsigned *out)**<br />
+ compressed_size : number of bytes read from compressed input buffer in<br />
+
+### Function syntax:
+ - {vb | p4 | bit | vs}[n][d | d1 | f | fm | z ]{enc/dec | pack/unpack}[| 128V | 256V][8 | 16 | 32 | 64]:<br />
+ vb: variable byte<br />
+ p4: turbopfor<br />
+ vs: variable simple<br />
+ bit: bit packing<br />
+ n : high level array functions for large arrays.
+
+ '' : encoding for unsorted integer lists<br />
+ 'd' : delta encoding for increasing integer lists (sorted w/ duplicate)<br />
+ 'd1': delta encoding for strictly increasing integer lists (sorted unique)<br />
+ 'f' : FOR encoding for sorted integer lists<br />
+ 'z' : ZigZag encoding for unsorted integer lists<br />
+
+ 'enc' or 'pack' : encode or bitpack<br />
+ 'dec' or 'unpack': decode or bitunpack<br />
+ 'NN' : integer size (8/16/32/64)<br />
+
+header files to use with documentation:<br />
+
+| c/c++ header file|Integer Compression functions| examples |
+|------------|-----------------------------|-----------------|
+|vint.h|variable byte| vbenc32/vbdec32 vbdenc32/vbddec32 vbzenc32/vbzdec32 |
+|vsimple.h|variable simple| vsenc64/vsdec64 |
+|vp4.h|TurboPFor| p4enc32/p4dec32 p4denc32/p4ddec32 p4zenc32/p4zdec32 |
+|bitpack.h|Bit Packing, For, +Direct Access| bitpack256v32/bitunpack256v32 bitforenc64/bitfordec64|
+|eliasfano.h|Elias Fano| efanoenc256v32/efanodec256v32 |
+
+Note: Some low level functions (like p4enc32) are limited to 128/256 (SSE/AVX2) integers per call.
+
+### Environment:
+###### OS/Compiler (64 bits):
+- Windows: MinGW-w64 makefile
+- Windows: Visual c++ (>=VS2008) - makefile.vs (for nmake)
+- Windows: Visual Studio project file - vs/vs2017 - Thanks to [PavelP](https://github.com/pps83)
+- Linux amd64: GNU GCC (>=4.6)
+- Linux amd64: Clang (>=3.2)
+- Linux arm64: 64 bits aarch64 ARMv8: gcc (>=6.3)
+- Linux arm64: 64 bits aarch64 ARMv8: clang
+- macOS: XCode (>=9)
+- macOS: Apple M1 (Clang)
+- PowerPC ppc64le (incl. SIMD): gcc (>=8.0)
+
+###### Multithreading:
+- All TurboPFor integer compression functions are thread safe
+
+### References:
+
+* [TurboPFor: an analysis](https://michael.stapelberg.ch/posts/2019-02-05-turbopfor-analysis/)
+
+* **Applications:**
+ * [Debian Code Search](https://github.com/Debian/dcs/)</br>
+ [Debian Code Search: positional index, TurboPFor-compressed](https://michael.stapelberg.ch/posts/2019-09-29-dcs-positional-turbopfor-index/)
+ * [Graph500](https://github.com/julianromera/graph500)
+ * [Small Polygon Compression](https://arxiv.org/abs/1509.05505) + [Poster](http://abhinavjauhri.me/publications/dcc_poster_2016.pdf) + [code](https://github.com/ajauhri/bignum_compression)
+ * [Parallel Graph Analysis (Lecture 18)](http://www.cs.rpi.edu/~slotag/classes/FA16/) + [code](http://www.cs.rpi.edu/~slotag/classes/FA16/handson/lec18-comp2.cpp)
+
+* **Benchmark references:**
+ * <a name="FastPFor"></a>[FastPFor](https://github.com/lemire/FastPFor) + [Simdcomp](https://github.com/lemire/simdcomp): SIMDPack FPF, Vbyte FPF, VarintG8IU, StreamVbyte, GroupSimple
+ * <a name="OptPFD"></a><a name="Simple16"></a>[Optimized Pfor-delta compression code](http://jinruhe.com): OptPFD/OptP4, Simple16 (limited to 28 bits integers)
+ * <a name="MaskedVByte"></a>[MaskedVByte](http://maskedvbyte.org/). See also: [Vectorized VByte Decoding](http://engineering.indeed.com/blog/2015/03/vectorized-vbyte-decoding-high-performance-vector-instructions/)
+ * <a name="Streamvbyte"></a>[Streamvbyte](https://github.com/lemire/streamvbyte).
+ * <a name="Simple-8b"></a>[Index Compression Using 64-Bit Words](http://people.eng.unimelb.edu.au/ammoffat/abstracts/am10spe.html): Simple-8b (speed optimized version tested)
+ * <a name="libfor"></a>[libfor](https://github.com/cruppstahl/for)
+ * <a name="QMX"></a>[Compression, SIMD, and Postings Lists](http://www.cs.otago.ac.nz/homepages/andrew/papers/) QMX integer compression from the "simple family"
+ * <a name="lz4"></a>[lz4](https://github.com/Cyan4973/lz4). Included with block size 64K as an indication. Tested after preprocessing with delta+transpose
+ * <a name="blosc"></a>[blosc](https://github.com/Blosc/c-blosc). blosc is like transpose/shuffle+lz77. Tested blosc+lz4 and blosclz incl. vectorized shuffle.<br>
+ * <a name="DocId"></a>[Document identifier data set](http://lemire.me/data/integercompression2014.html)
+
+* **Integer compression publications:**
+ * :green_book:[Evaluating Lightweight Integer Compression Algorithms in Column-Oriented In-Memory DBMS](http://www.adms-conf.org/2021-camera-ready/heinzl_adms21.pdf)
+ * :green_book:[In Vacuo and In Situ Evaluation of SIMD Codecs (TurboPackV,TurboPFor/QMX)](http://dl.acm.org/citation.cfm?id=3015023) + [paper](http://www.cs.otago.ac.nz/homepages/andrew/papers/)
+ * :green_book:[SIMD Compression and the Intersection of Sorted Integers](http://arxiv.org/abs/1401.6399)
+ * :green_book:[Partitioned Elias-Fano Indexes](http://www.di.unipi.it/~ottavian/files/elias_fano_sigir14.pdf)
+ * :green_book:[On Inverted Index Compression for Search Engine Efficiency](http://www.dcs.gla.ac.uk/~craigm/publications/catena14compression.pdf)
+ * :green_book:[Google's Group Varint Encoding](http://static.googleusercontent.com/media/research.google.com/de//people/jeff/WSDM09-keynote.pdf)
+ * :green_book:[Integer Compression tweets](https://twitter.com/search?q=%23integercompression&src=typd)
+ * :green_book:[Efficient Compression of Scientific Floating-Point Data and An Application in Structural Analysis](https://www.jstage.jst.go.jp/article/jsces/2017/0/2017_20170002/_article)
+ * :green_book:[SPDP is a compression/decompression algorithm for binary IEEE 754 32/64 bits floating-point data](http://cs.txstate.edu/~burtscher/research/SPDPcompressor/)<br />
+ :green_book:[ SPDP - An Automatically Synthesized Lossless Compression Algorithm for Floating-Point Data](http://cs.txstate.edu/~mb92/papers/dcc18.pdf) + [DCC 2018](http://www.cs.brandeis.edu//~dcc/Programs/Program2018.pdf)
+
+Last update: 13 Nov 2021
+
+## APPENDIX: icbench Integer Compression Benchmark
+
+##### TurboPFor + external libraries
+<pre>
+TurboPFor https://github.com/powturbo/TurboPFor
+FastPFor (FP) https://github.com/lemire/FastPFor
+lz4 https://github.com/Cyan4973/lz4
+LittleIntPacker (LI) https://github.com/lemire/LittleIntPacker
+MaskedVbyte http://maskedvbyte.org
+Polycom (PC) https://github.com/encode84/bcm
+simdcomp (SC) https://github.com/lemire/simdcomp
+Simple-8b optimized https://github.com/powturbo/TurboPFor
+Streamvbyte https://github.com/lemire/streamvbyte
+VarintG8IU https://github.com/lemire/FastPFor
+</pre>
+
+##### Functions integrated into 'icbench' for benchmarking
+<pre>
+Codec group:
+TURBOPFOR TurboPFor library TurboPFor256V/TurboPack256V/TurboPFor256N/TurboPFor/TurboPackV/TurboVByte/TurboPack/TurboForDA/EliasFano/VSimple/TurboPForN/TurboPackN/TurboPForDI
+DEFAULT Default TurboPFor/TurboPackV/TurboVByte/TurboPack/TurboFor/TurboPForN/TurboPackN/TurboPForDI/TurboPFor256V/TurboPack256V/TurboPFor256N
+BENCH Benchmark TurboPFor/TurboPackV/TurboVByte/TurboPack/QMX/FP.SimdFastPfor/FP.SimdOptPFor/MaskedVbyte/StreamVbyte
+EFFICIENT Efficient TurboPFor/vsimple/turbovbyte
+TRANSFORM transpose/shuffle,delta,zigzag tpbyte4s/tpbyte,4/tpnibble,4/ZigZag_32/Delta_32/BitShuffle,4
+BITPACK Bit Packing TurboPack256V/TurboPackV/TurboPackH/TurboPack/SC.SimdPack128/SC.SimdPack256
+VBYTE Variable byte TurboVByte/FP.VByte/PC.Vbyte/VarintG8IU/MaskedVbyte/StreamVbyte
+SIMPLE Simple Family simple8b/simple16/vsimple/qmx
+LZ4 lz4+bitshufle/transpose 4,8 lz4_bitshufle/lz4_tp4/lz4_tp8
+LI Little Integer LI_Pack/LI_TurboPack/LI_SuperPack/LI_HorPack
+
+
+Function Description level
+
+-------- ----------- -----
+TurboPFor PFor (SSE2)
+TurboPForN PFor (SSE2) large blocks
+TurboPFor256 PFor (AVX2)
+TurboPFor256N PFor (AVX2) large blocks
+TurboPForDA PFor direct access
+TurboPForDI PFord min
+TurboPForZZ PFor zigzag of delta
+TurboFor FOR
+TurboForV FOR (SIMD)
+TurboFor256V FOR (AVX2)
+TurboForDA FOR direct access
+TurboPackDA Bit packing direct access
+TurboPack Bit packing (scalar)
+TurboPackN Bit packing (scalar) large blocks
+TurboPackV Bit packing (SSE2 Vertical)
+TurboPackH Bit packing (SSE2 Horizontal)
+TurboPackVN Bit packing (SSE2 large block)
+TurboPack256V Bit packing (AVX2 Vertical)
+TurboPack256N Bit packing (AVX2 large block)
+TurboVByte Variable byte (scalar)
+VSimple Variable simple (scalar)
+EliasFano Elias fano (scalar)
+EliasFanoV Elias fano (SSE2)
+EliasFano256V Elias fano (AVX2)
+memcpy memcpy
+copy Integer copy
+tpbyte4s Byte Transpose (scalar)
+tpbyte Byte transpose (simd) 2,4,8
+tpnibble Nibble transpose (simd) 2,4,8
+ZigZag32 ZigZag encoding (sse2)
+Delta32 Delta encoding (sse2)
+DDelta32 Delta of delta encoding (sse2)
+Xor32 Xor encoding (sse2)
+FP_PREV64 Floating point PFOR
+FP_FCM64 Floating point PFOR (FCM)
+FP_DFCM64 Floating point PFOR (DFCM)
+TurboPFor64 PFOR 64
+TurboPFor64V PFOR 64
+Simple8b 64 bits Simple family (unstable)
+PC_Simple16 Simple 16. limited to 28 bits
+PC_OptPFD OptPFD. limited to 28 bits
+PC_Vbyte Variable byte
+PC_Rice Rice coding (unstable)
+VarintG8IU Variable byte SIMD
+MaskedVbyte Variable byte SIMD
+StreamVbyte Variable byte SIMD
+FP_FastPFor PFor scalar (inefficient for small blocks)
+FP_SimdFastPFor PFor SIMD (inefficient for small blocks)
+FP_OptPFor OptPFor scalar
+FP_SIMDOptPFor OptPFor SIMD
+FP_VByte Variable byte
+FP_Simple8bRLE Simple-8b + rle
+FP_GROUPSIMPLE Group Simple
+SC_SIMDPack128 Bit packing (SSE4.1)
+SC_SIMDPack256 Bit packing (SSE4.1)
+SC_For For (SSE4.1)
+SC_ForDA For direct access (SSE4.1)
+LibFor_For For
+LibFor_ForDA For direct access
+LI_Pack Bit packing (scalar)
+LI_TurboPack Bit packing (scalar)
+LI_SuperPack Bit packing (scalar)
+LI_HorPack Bit packing (sse4.1 horizontal)
+LI_BMIPack256 Bit packing (avx2)
+lz4 lz4
+lz4_bit Bitshuffle + [delta]+lz4 2,4,8
+lz4_nibble TurboPFor's [delta]+nibble transpose + lz4 2,4,8
+lz4_bitxor Bitshuffle + [xor]+lz4 2,4,8
+lz4_nibblexor TurboPFor's [xor]+nibble transpose + lz4 2,4,8
+lz4_byte TurboPFor's [delta]+byte transpose + lz4 2,4,8
+BitShuffle Bit shuffle (simd) 2,4,8
+</pre>
+
diff --git a/src/ext/for/bic.c b/src/ext/for/bic.c
deleted file mode 100644
index 16c532b8..00000000
--- a/src/ext/for/bic.c
+++ /dev/null
@@ -1,201 +0,0 @@
-/**
- Copyright (C) powturbo 2019-2023
- SPDX-License-Identifier: GPL v2 License
-
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 2 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License along
- with this program; if not, write to the Free Software Foundation, Inc.,
- 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
- - email : powturbo [AT] gmail.com
- - github : https://github.com/powturbo
- - homepage : https://sites.google.com/site/powturbo/
- - twitter : https://twitter.com/powturbo
-**/
-// Binary Interpolative Coding
-// Reference: "On Implementing the Binary Interpolative Coding Algorithm" GIULIO ERMANNO PIBIRI, ISTI-CNS http://pages.di.unipi.it/pibiri/papers/BIC.pdf
-// "Techniques for Inverted Index Compression" GIULIO ERMANNO PIBIRI, ROSSANO VENTURINI, University of Pisa https://arxiv.org/abs/1908.10598
-
-#ifndef USIZE //---------- implementation --------------------------------------------------------------------------------------------------------------------------------------
-#include "include_/conf.h"
-#include "include_/bic.h"
-
-#include "include_/bitutil_.h"
-
-static ALWAYS_INLINE unsigned pow2next(unsigned x) { return x<2?1:(1ull << (__bsr32((x)-1)+1)); }
-
-size_t bicbound16(size_t n) { return n*2+4; }
-size_t bicbound32(size_t n) { return n*4+4; }
-//-- Simple binary ----------------------------------------------------------------------
-#define bicput(bw,br, _u_, _x_, _usize_) bitput( bw,br, T2(__bsr,_usize_)(_u_) + 1, _x_) /*AS(_u_ > 0, "Fatal bicput"); AS(_x_ <= _u_, "Fatal bicput2");*/
-#define bicget(bw,br, _u_, _x_, _usize_) bitget57(bw,br, T2(__bsr,_usize_)(_u_) + 1, _x_)
-
-//------------------------------------------
-#define BICENC_ bicbenc_
-#define BICDEC_ bicbdec_
-#define BICENC bicbenc
-#define BICDEC bicbdec
-
-//---- 16 bits ----------
-#define USIZE 16
-#define uint_t uint16_t
-#include "bic.c"
-
-//---- 32 bits ----------
-#define USIZE 32
-#define uint_t uint32_t
-#include "bic.c"
-#undef bicput
-#undef bicget
-#undef BICENC_
-#undef BICDEC_
-#undef BICENC
-#undef BICDEC
-
-// -- Leftmost minimal ---------------------------------------------------------------------
-#define bicput(bw,br, _u_, _x_, _usize_) { \
- unsigned _x = _x_, _u = _u_, _b = T2(__bsr,_usize_)(_u), hi = (1ull << (_b + 1)) - _u - 1;\
- if(_x < hi) bitput(bw,br, _b, _x);\
- else { _x += hi; bitput(bw,br, _b+1, (_x&1)<<_b | _x >> 1); }\
-}
-
-#define bicget(bw,br, _u_, _x_, _usize_) {\
- unsigned _u = _u_;\
- unsigned _b = T2(__bsr,_usize_)(_u);\
- uint_t _hi = (1ull << (_b + 1)) - _u - 1;\
- if((_x_ = bitpeek57(bw,br,_b)) < _hi) bitrmv(bw,br,_b);\
- else { \
- unsigned _y = (bitbw(bw,br)>>_b)&1;\
- bitrmv(bw,br,_b+1);\
- _x_= (_x_<<1) + _y - _hi;\
- }\
-}
-
-//--------------------------------------------
-#define BICENC_ bicenc_
-#define BICDEC_ bicdec_
-#define BICENC bicenc
-#define BICDEC bicdec
-
-//---- 16 bits ----------
-#define USIZE 16
-#define uint_t uint16_t
-#include "bic.c"
-
-//---- 32 bits ----------
-#define USIZE 32
-#define uint_t uint32_t
-#include "bic.c"
-#undef bicput
-#undef bicget
-#undef BICENC_
-#undef BICDEC_
-#undef BICENC
-#undef BICDEC
-
-//-- Center Minimal -----------------------------------------------------
-#define bicput(bw,br, _u_, _x_, _usize_) { \
- unsigned _x = _x_, _u = _u_, _b = T2(__bsr,_usize_)(_u); \
- uint64_t _c = (1ull << (_b + 1)) - _u - 1; \
- unsigned _c2 = _c >> 1, _r2 = _u >> 1, _lo = _r2-_c2, _hi = _r2+_c2+1;\
- if(!(_u & 1)) _lo -= 1; \
- _b += (_x <= _lo || _x >= _hi);\
- bitput(bw,br, _b, _x);\
-}
-
-#define bicget(bw,br, _u_, _x_, _usize_) { \
- unsigned _u = _u_, _b = T2(__bsr,_usize_)(_u);\
- uint64_t _c = (1ull << (_b + 1)) - _u - 1;\
- unsigned _c2 = _c>>1, _r2 = _u>>1, _lo = _r2 - _c2;\
- _lo -= ((_u & 1) == 0);\
- if((_x_ = bitpeek57(bw,br,_b)) > _lo) bitrmv(bw,br,_b);\
- else bitget57(bw,br, _b+1, _x_);\
-}
-
-//--------------------------------------------
-#define BICENC_ bicmenc_
-#define BICDEC_ bicmdec_
-#define BICENC bicmenc
-#define BICDEC bicmdec
-
-//---- 16 bits ----------
-#define USIZE 16
-#define uint_t uint16_t
-#include "bic.c"
-
-//---- 32 bits ----------
-#define USIZE 32
-#define uint_t uint32_t
-#include "bic.c"
-
-
-#else //-------------------- Template functions ----------------------------------------------------------------------------------------------------------
-
-static void T2(BICENC_,USIZE)(uint_t *in, unsigned n, unsigned char **_op, unsigned lo, unsigned hi, unsigned h, uint64_t *bw, unsigned *br) {
- while(n)
- if(hi - lo + 1 != n) { //AC(lo <= hi,"bicenc fatal lo=%d>hi=%d n=%d\n", lo, hi, n); AS(hi - lo >= n - 1, "bicenc_32 fatal hi-lo>n-1\n");
- unsigned x = in[h];
- bicput(*bw, *br, hi-n-lo+1, x-lo-h, USIZE); bitenorm(*bw,*br,*_op);
- T2(BICENC_,USIZE)( in, h, _op, lo, x-1, h>>1, bw,br);
- in += h+1; n -= h+1; lo = x+1; h = n >> 1;
- } else break;
-}
-
-#define RE(a) //a // recursion : RE(a) a
-#define RD(a) a // recursion : RD(a)
-static void T2(BICDEC_,USIZE)(unsigned char **_ip, unsigned n, uint_t *out, unsigned lo, unsigned hi, unsigned h, uint64_t *bw, unsigned *br) {
- RE(if(!n) return);
- RD(do) {
- if(likely(hi - lo + 1 != n)) { //AS(lo <= hi, "bicdec fatal");
- unsigned x;
- bitdnorm(*bw,*br,*_ip); bicget(*bw,*br, hi-lo+1-n, x, USIZE);
- out[h] = (x += lo + h);
- if(n != 1) {
- T2(BICDEC_,USIZE)(_ip, h, out, lo, x-1, h>>1, bw,br);
- RE(T2(BICDEC_,USIZE)(_ip,n- h-1, out+ h+1, x+1, hi, (n-h-1)>>1, bw,br));
- RD( n-=h+1; out+=h+1; lo=x+1; h = n>>1);
- } RD(else break);
- } else {
- BITFORSET_(out, n, lo, 1); //for(unsigned i = 0; i != n; ++i) out[i] = lo+i; //
- RD(break);
- }
- } RD(while(n));
-}
-
-unsigned T2(BICENC,USIZE)(uint_t *in, unsigned n, unsigned char *out) {
- if(!n) return 0; //for(unsigned i = 1; i < n; i++) { AC(in[i]>in[i-1], "bicenc32: Not sorted at=%u,count=%d\n", i, n); } //printf("n=%u ", n);printf("%u,", in[i]);
- bitdef(bw,br);
- unsigned char *op = out;
- unsigned x = in[n-1];
-
- ctou32(op) = x; op += 4;
- T2(BICENC_,USIZE)(in, n-1, &op, 0, x, pow2next(n)>>1, &bw,&br);
- bitflush(bw,br,op);
- return op - out;
-}
-
-unsigned T2(BICDEC,USIZE)(unsigned char *in, unsigned n, uint_t *out) {
- if(!n) return 0;
- bitdef(bw,br);
- unsigned char *ip = in;
- unsigned x = ctou32(ip);
-
- ip += 4;
- out[n-1] = x;
- T2(BICDEC_,USIZE)(&ip, n-1, out, 0, x, pow2next(n)>>1, &bw,&br);
- bitalign(bw,br,ip);
- return ip - in;
-}
-
-#undef USIZE
-#undef uint_t
-#endif
diff --git a/src/ext/for/bitpack.c b/src/ext/for/bitpack.c
index d894b0df..666b1030 100644
--- a/src/ext/for/bitpack.c
+++ b/src/ext/for/bitpack.c
@@ -1,6 +1,6 @@
/**
- Copyright (C) powturbo 2013-2023
- SPDX-License-Identifier: GPL v2 License
+ Copyright (C) powturbo 2013-2019
+ GPL v2 License
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -23,19 +23,13 @@
**/
// "Integer Compression" bit packing
-#pragma warning( disable : 4005)
-#pragma warning( disable : 4090)
-#pragma warning( disable : 4068)
-
#include <stdio.h>
-#include <string.h>
-#include "include_/conf.h"
-#include "include_/bitpack.h"
-#include "include_/bitutil.h"
-
-#include "include_/vlcbyte.h"
-#include "include_/bitutil_.h"
-
+#define BITUTIL_IN
+#define VINT_IN
+#include "conf.h"
+#include "bitutil.h"
+#include "vint.h"
+#include "bitpack.h"
#define PAD8(_x_) ( (((_x_)+8-1)/8) )
#ifdef __ARM_NEON
@@ -44,28 +38,13 @@
#define PREFETCH(_ip_,_rw_) __builtin_prefetch(_ip_,_rw_)
#endif
+#pragma warning( disable : 4005)
+#pragma warning( disable : 4090)
+#pragma warning( disable : 4068)
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wunsequenced"
-#ifndef __AVX2__
-#define BITNBOUND(_n_, _esize_, _csize_) ((_esize_*_n_) + ((_n_+_csize_-1)/_csize_))
-
-size_t bitnbound8( size_t n){ return BITNBOUND(n, 1, 128); }
-size_t bitnbound16( size_t n){ return BITNBOUND(n, 2, 128); }
-size_t bitnbound32( size_t n){ return BITNBOUND(n, 4, 128); }
-size_t bitnbound64( size_t n){ return BITNBOUND(n, 8, 128); }
-
-size_t bitnbound128v8( size_t n){ return BITNBOUND(n, 1, 128); }
-size_t bitnbound128v16(size_t n){ return BITNBOUND(n, 2, 128); }
-size_t bitnbound128v32(size_t n){ return BITNBOUND(n, 4, 128); }
-size_t bitnbound128v64(size_t n){ return BITNBOUND(n, 8, 128); }
-
-size_t bitnbound256v8( size_t n){ return BITNBOUND(n, 1, 256); }
-size_t bitnbound256v16(size_t n){ return BITNBOUND(n, 2, 256); }
-size_t bitnbound256v32(size_t n){ return BITNBOUND(n, 4, 256); }
-size_t bitnbound256v64(size_t n){ return BITNBOUND(n, 8, 128); }
-
-//---------------------------------------------- Plain -----------------------------------------------------------------------
+#if !defined(SSE2_ON) && !defined(AVX2_ON) //----------------------------------- Plain -----------------------------------------------------------------------
typedef unsigned char *(*BITPACK_F8)( uint8_t *__restrict out, unsigned n, const unsigned char *__restrict in);
typedef unsigned char *(*BITPACK_D8)( uint8_t *__restrict out, unsigned n, const unsigned char *__restrict in, uint8_t start);
typedef unsigned char *(*BITPACK_F16)(uint16_t *__restrict out, unsigned n, const unsigned char *__restrict in);
@@ -75,25 +54,24 @@ typedef unsigned char *(*BITPACK_D32)(uint32_t *__restrict out, unsigned n, cons
typedef unsigned char *(*BITPACK_F64)(uint64_t *__restrict out, unsigned n, const unsigned char *__restrict in);
typedef unsigned char *(*BITPACK_D64)(uint64_t *__restrict out, unsigned n, const unsigned char *__restrict in, uint64_t start);
- #if 1 //def _MSC_VER
+#if 1 //def _MSC_VER
#define VX (v=x)
#define V x
- #else
+#else
#define VX v
#define V v
- #endif
+#endif
- #if 0
+#if 0
#define IP0(_ip_,_x_) *_ip_
#define IP( _ip_,_x_) *_ip_++
#define IPI(_ip_)
- #else
+#else
#define IP0(_ip_,_x_) _ip_[_x_]
#define IP( _ip_,_x_) _ip_[_x_]
#define IPI(_ip_) _ip_ += 32
- #endif
+#endif
-//---- bitpack ---------------
#define IP9(_ip_,_x_, _parm_)
#define IPW(_ip_,_x_) VX
#define IPX(_ip_,_x_) (V = IP(_ip_,_x_))
@@ -111,12 +89,11 @@ typedef unsigned char *(*BITPACK_D64)(uint64_t *__restrict out, unsigned n, cons
#undef IP32
#undef IP64
-//----- bitpack delta --------------
#define DELTA
-#define IP9(_ip_,_x_, _parm_) V = IP0(_ip_,_x_) - start; start = IP(_ip_,_x_)
-#define IPV(_ip_,_x_) VX
-#define IPX(_ip_,_x_) (V = IP(_ip_,_x_) - start)
+#define IP9(_ip_,_x_, _parm_) V = IP0(_ip_,_x_) - start; start = IP(_ip_,_x_)
+#define IPV(_ip_,_x_) VX
+#define IPX(_ip_,_x_) (V = IP(_ip_,_x_) - start)
#define IP16(_ip_,_x_, _parm_) start = IP(_ip_,_x_)
#define IP32(_ip_,_x_, _parm_) start = IP(_ip_,_x_)
#define IP64(_ip_,_x_, _parm_) start = IP(_ip_,_x_)
@@ -129,9 +106,8 @@ typedef unsigned char *(*BITPACK_D64)(uint64_t *__restrict out, unsigned n, cons
#undef IP32
#undef IP64
-//----- bitpack FOR ---------------
#define IP9(_ip_,_x_, _parm_)
-#define IPV(_ip_,_x_) (IP(_ip_,_x_) - start)
+#define IPV(_ip_,_x_) IP(_ip_,_x_) - start
#define IPX(_ip_,_x_) (V = IP(_ip_,_x_) - start)
#define IP16(_ip_,_x_, _parm_)
#define IP32(_ip_,_x_, _parm_)
@@ -145,10 +121,9 @@ typedef unsigned char *(*BITPACK_D64)(uint64_t *__restrict out, unsigned n, cons
#undef IP32
#undef IP64
-//----- bitpack delta 1 -----------
-#define IP9( _ip_,_x_, _parm_) V = IP0(_ip_,_x_) - start - 1; start = IP(_ip_,_x_)
-#define IPV( _ip_,_x_) VX
-#define IPX(_ip_,_x_) (V = IP(_ip_,_x_) - start - 1)
+#define IP9( _ip_,_x_, _parm_) V = IP0(_ip_,_x_) - start - 1; start = IP(_ip_,_x_)
+#define IPV( _ip_,_x_) VX
+#define IPX(_ip_,_x_) (V = IP(_ip_,_x_) - start - 1)
#define IP16(_ip_,_x_, _parm_) start = IP(_ip_,_x_)
#define IP32(_ip_,_x_, _parm_) start = IP(_ip_,_x_)
#define IP64(_ip_,_x_, _parm_) start = IP(_ip_,_x_)
@@ -168,10 +143,9 @@ typedef unsigned char *(*BITPACK_D64)(uint64_t *__restrict out, unsigned n, cons
#define _BITPACK_ bitepack
#include "bitpack_.h"*/
-//------ bitpack zigzag --------------------
-#define IP9(_ip_,_x_, _parm_) V = T2(zigzagenc, USIZE)(IP(_ip_,_x_) - start); start = IP(_ip_,_x_)
+#define IP9(_ip_,_x_, _parm_) V = TEMPLATE2(zigzagenc, USIZE)(IP(_ip_,_x_) - start); start = IP(_ip_,_x_)
#define IPV(_ip_,_x_) VX
-#define IPX(_ip_,_x_) (V = T2(zigzagenc, USIZE)(IP(_ip_,_x_) - start))
+#define IPX(_ip_,_x_) (V = TEMPLATE2(zigzagenc, USIZE)(IP(_ip_,_x_) - start))
#define IP16(_ip_,_x_, _parm_) start = IP(_ip_,_x_)
#define IP32(_ip_,_x_, _parm_) start = IP(_ip_,_x_)
#define IP64(_ip_,_x_, _parm_) start = IP(_ip_,_x_)
@@ -184,23 +158,6 @@ typedef unsigned char *(*BITPACK_D64)(uint64_t *__restrict out, unsigned n, cons
#undef IP32
#undef IP64
-//------ bitpack xor --------------------
-#define IP9(_ip_,_x_, _parm_) V = IP(_ip_,_x_) ^ start; start = IP(_ip_,_x_)
-#define IPV(_ip_,_x_) VX
-#define IPX(_ip_,_x_) (V = IP(_ip_,_x_) ^ start)
-#define IP16(_ip_,_x_, _parm_) start = IP(_ip_,_x_)
-#define IP32(_ip_,_x_, _parm_) start = IP(_ip_,_x_)
-#define IP64(_ip_,_x_, _parm_) start = IP(_ip_,_x_)
-#define _BITPACK_ bitxpack
-#include "bitpack_.h"
-#undef IP9
-#undef IPV
-#undef IPX
-#undef IP16
-#undef IP32
-#undef IP64
-
-//----- bitpack FOR 1 ---------------------
#define IPI(_ip_) _ip_ += 32; start += 32
#define IP9(_ip_,_x_, _parm_)
#define IPV(_ip_,_x_) (IP(_ip_,_x_) - start - (_x_) - 1)
@@ -218,14 +175,14 @@ typedef unsigned char *(*BITPACK_D64)(uint64_t *__restrict out, unsigned n, cons
#undef IP32
#undef IP64
-//----------------------------------- bitnpack ----------------------------------------------------
#define BITNPACK(in, n, out, _csize_, _usize_) { unsigned char *op = out;\
for(ip = in, in += n; ip < in;) { \
- T3(uint, _usize_, _t) o,x;\
+ TEMPLATE3(uint, _usize_, _t) o,x;\
unsigned iplen = in - ip,b; \
- if(iplen > _csize_) iplen = _csize_; PREFETCH(ip+512,0);\
- o = T2(bit,_usize_)(ip, iplen, &x); b = T2(bsr,_usize_)(o);\
- *op++ = b; op = T2(bitpacka, _usize_)[b](ip, iplen, op);\
+ if(iplen > _csize_) iplen = _csize_; \
+ PREFETCH(ip+512,0);\
+ o = TEMPLATE2(bit,_usize_)(ip, iplen, &x); b = TEMPLATE2(bsr,_usize_)(o);\
+ *op++ = b; op = TEMPLATE2(bitpacka, _usize_)[b](ip, iplen, op);\
ip += iplen;\
}\
return op - out;\
@@ -233,15 +190,14 @@ typedef unsigned char *(*BITPACK_D64)(uint64_t *__restrict out, unsigned n, cons
#define BITNDPACK(in, n, out, _csize_, _usize_, _bitd_, _bitpacka_) { if(!n) return 0;\
unsigned char *op = out; \
- T3(uint, _usize_, _t) o,x;\
+ TEMPLATE3(uint, _usize_, _t) o,x;\
start = *in++; \
- T2(vbxput, _usize_)(op, start);\
- for(n--,ip = in; ip != in + (n&~(_csize_-1)); ) { \
- unsigned b; PREFETCH(ip+512,0);\
- o = T2(_bitd_, _usize_)(ip, _csize_, &x, start); b = T2(bsr,_usize_)(o); *op++ = b; op = T2(_bitpacka_,_usize_)[b](ip, _csize_, op, start); ip += _csize_; start = ip[-1];\
+ TEMPLATE2(vbxput, _usize_)(op, start);\
+ for(n--,ip = in; ip != in + (n&~(_csize_-1)); ) { unsigned b; PREFETCH(ip+512,0);\
+ o = TEMPLATE2(_bitd_, _usize_)(ip, _csize_, &x, start); b = TEMPLATE2(bsr,_usize_)(o); *op++ = b; op = TEMPLATE2(_bitpacka_,_usize_)[b](ip, _csize_, op, start); ip += _csize_; start = ip[-1];\
}\
if(n&=(_csize_-1)) { unsigned b;\
- o = T2(_bitd_, _usize_)(ip, n, &x, start); b = T2(bsr,_usize_)(o); *op++ = b; op = T2(_bitpacka_,_usize_)[b](ip, n, op, start);\
+ o = TEMPLATE2(_bitd_, _usize_)(ip, n, &x, start); b = TEMPLATE2(bsr,_usize_)(o); *op++ = b; op = TEMPLATE2(_bitpacka_,_usize_)[b](ip, n, op, start);\
}\
return op - out;\
}
@@ -266,34 +222,28 @@ size_t bitnzpack16( uint16_t *__restrict in, size_t n, unsigned char *__restrict
size_t bitnzpack32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out) { uint32_t *ip,start; BITNDPACK(in, n, out, 128, 32, bitz, bitzpacka); }
size_t bitnzpack64( uint64_t *__restrict in, size_t n, unsigned char *__restrict out) { uint64_t *ip,start; BITNDPACK(in, n, out, 128, 64, bitz, bitzpacka); }
-size_t bitnxpack8( uint8_t *__restrict in, size_t n, unsigned char *__restrict out) { uint8_t *ip,start; BITNDPACK(in, n, out, 128, 8, bitx, bitxpacka); }
-size_t bitnxpack16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out) { uint16_t *ip,start; BITNDPACK(in, n, out, 128, 16, bitx, bitxpacka); }
-size_t bitnxpack32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out) { uint32_t *ip,start; BITNDPACK(in, n, out, 128, 32, bitx, bitxpacka); }
-size_t bitnxpack64( uint64_t *__restrict in, size_t n, unsigned char *__restrict out) { uint64_t *ip,start; BITNDPACK(in, n, out, 128, 64, bitx, bitxpacka); }
-
size_t bitnfpack8( uint8_t *__restrict in, size_t n, unsigned char *__restrict out) { uint8_t *ip,start; BITNDPACK(in, n, out, 128, 8, bitf, bitfpacka); }
size_t bitnfpack16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out) { uint16_t *ip,start; BITNDPACK(in, n, out, 128, 16, bitf, bitfpacka); }
size_t bitnfpack32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out) { uint32_t *ip,start; BITNDPACK(in, n, out, 128, 32, bitf, bitfpacka); }
size_t bitnfpack64( uint64_t *__restrict in, size_t n, unsigned char *__restrict out) { uint64_t *ip,start; BITNDPACK(in, n, out, 128, 64, bitf, bitfpacka); }
- #endif // ifndef AVX2
-//--------------------------------------- SIMD ----------------------------------------------------------------------------------------------
+#else //--------------------------------------- SIMD ----------------------------------------------------------------------------------------------
#define _BITNPACKV(in, n, out, _csize_, _usize_, _bitpackv_) {\
- unsigned char *op = out; T3(uint, _usize_, _t) _o,_x;\
+ unsigned char *op = out; TEMPLATE3(uint, _usize_, _t) _o,_x;\
for(ip = in; ip != in + (n&~(_csize_-1)); ip += _csize_) { PREFETCH(ip+512,0);\
- unsigned _b; _o = T2(bit,_usize_)(ip, _csize_, &_x); _b = T2(bsr,_usize_)(_o); *op++ = _b; op = T2(_bitpackv_, _usize_)(ip, _csize_, op, _b);\
- } if(n&=(_csize_-1)) { unsigned _b; _o = T2(bit,_usize_)(ip, n, &_x); _b = T2(bsr,_usize_)(_o); *op++ = _b; op = T2(bitpack, _usize_)(ip, n, op, _b); }\
+ unsigned _b; _o = TEMPLATE2(bit,_usize_)(ip, _csize_, &_x); _b = TEMPLATE2(bsr,_usize_)(_o); *op++ = _b; op = TEMPLATE2(_bitpackv_, _usize_)(ip, _csize_, op, _b);\
+ } if(n&=(_csize_-1)) { unsigned _b; _o = TEMPLATE2(bit,_usize_)(ip, n, &_x); _b = TEMPLATE2(bsr,_usize_)(_o); *op++ = _b; op = TEMPLATE2(bitpack, _usize_)(ip, n, op, _b); }\
return op - out;\
}
#define _BITNDPACKV(in, n, out, _csize_, _usize_, _bitdv_, _bitpackv_, _bitd_, _bitpack_) { if(!n) return 0;\
- unsigned char *op = out; T3(uint, _usize_, _t) _o,_x;\
+ unsigned char *op = out; TEMPLATE3(uint, _usize_, _t) _o,_x;\
start = *in++; \
- T2(vbxput, _usize_)(op, start);\
+ TEMPLATE2(vbxput, _usize_)(op, start);\
for(n--,ip = in; ip != in + (n&~(_csize_-1)); ) { PREFETCH(ip+512,0);\
- unsigned _b; _o = T2(_bitdv_, _usize_)(ip, _csize_, &_x, start); _b = T2(bsr,_usize_)(_o); *op++ = _b; op = T2(_bitpackv_, _usize_)(ip, _csize_, op, start, _b); ip += _csize_; start = ip[-1];\
- } if(n&=(_csize_-1)) { unsigned _b; _o = T2(_bitd_, _usize_)(ip, n, &_x, start); _b = T2(bsr,_usize_)(_o); *op++ = _b; op = T2(_bitpack_, _usize_)(ip, n, op, start, _b); }\
+ unsigned _b; _o = TEMPLATE2(_bitdv_, _usize_)(ip, _csize_, &_x, start); _b = TEMPLATE2(bsr,_usize_)(_o); *op++ = _b; op = TEMPLATE2(_bitpackv_, _usize_)(ip, _csize_, op, start, _b); ip += _csize_; start = ip[-1];\
+ } if(n&=(_csize_-1)) { unsigned _b; _o = TEMPLATE2(_bitd_, _usize_)(ip, n, &_x, start); _b = TEMPLATE2(bsr,_usize_)(_o); *op++ = _b; op = TEMPLATE2(_bitpack_, _usize_)(ip, n, op, start, _b); }\
return op - out;\
}
@@ -308,88 +258,63 @@ size_t bitnfpack64( uint64_t *__restrict in, size_t n, unsigned char *__restrict
#define OPPE(__op)
#define IPPE(__op)
-//--- bitpack ---------------
#define VI32(ip, i, iv, parm)
#define IP32(ip, i, iv) _mm256_loadu_si256(ip++)
-unsigned char *bitpack256v32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned b) { unsigned char *pout = out+PAD8(256*b); BITPACK256V32(in, b, out, 0); return pout; }
+unsigned char *bitpack256v32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned b) { unsigned char *pout = out+PAD8(256*b); BITPACK256V32(in, b, out, 0); return pout; }
#undef VI32
#undef IP32
-//-- bipack FOR --------------------------------------------------------------------------------------------------------------
+
#define VI32(_ip_, _i_, _iv_, _sv_) _iv_ = _mm256_sub_epi32(_mm256_loadu_si256(_ip_++),sv)
#define IP32(_ip_, i, _iv_) _iv_
#include "bitpack_.h"
-unsigned char *bitfpack256v32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b) {
- unsigned char *pout = out+PAD8(256*b);
- __m256i sv = _mm256_set1_epi32(start), v;
+unsigned char *bitfpack256v32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b) { unsigned char *pout = out+PAD8(256*b);
+ __m256i v, sv = _mm256_set1_epi32(start);
BITPACK256V32(in, b, out, sv);
return pout;
}
#define VI32(_ip_, _i_, _iv_, _sv_) _iv_ = _mm256_sub_epi32(_mm256_loadu_si256(_ip_++),_sv_); _sv_ = _mm256_add_epi32(_sv_,cv);
#define IP32(ip, i, _iv_) _iv_
-unsigned char *bitf1pack256v32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b) {
- unsigned char *pout = out+PAD8(256*b);
- __m256i v, sv = _mm256_set_epi32(start+8,start+7,start+6,start+5,start+4,start+3,start+2,start+1),
- cv = _mm256_set1_epi32(8);
- BITPACK256V32(in, b, out, sv);
- return pout;
+unsigned char *bitf1pack256v32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b) { unsigned char *pout = out+PAD8(256*b);
+ __m256i v, sv = _mm256_set_epi32(start+8,start+7,start+6,start+5,start+4,start+3,start+2,start+1), cv = _mm256_set1_epi32(8);
+ BITPACK256V32(in, b, out, sv); return pout;
}
-//-- bitpack delta -------------------------------------------------------------------------------------------------------------
#define VI32(_ip_, _i_, _iv_, _sv_) v = _mm256_loadu_si256(_ip_++); _iv_ = mm256_delta_epi32(v,_sv_); _sv_ = v
#define IP32(ip, i, _iv_) _iv_
#include "bitpack_.h"
-unsigned char *bitdpack256v32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b) {
- unsigned char *pout = out+PAD8(256*b);
+unsigned char *bitdpack256v32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b) { unsigned char *pout = out+PAD8(256*b);
__m256i v,sv = _mm256_set1_epi32(start);
BITPACK256V32(in, b, out, sv);
return pout;
}
-//-- bitpack delta 1 ---------------------------------------------------------------------------------------------------------------
#define VI32(_ip_, _i_, _iv_, _sv_) v = _mm256_loadu_si256(_ip_++); _iv_ = _mm256_sub_epi32(mm256_delta_epi32(v,_sv_),cv); _sv_ = v
-unsigned char *bitd1pack256v32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b) {
- unsigned char *pout = out+PAD8(256*b);
- __m256i sv = _mm256_set1_epi32(start), v,
- cv = _mm256_set1_epi32(1);
+unsigned char *bitd1pack256v32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b) { unsigned char *pout = out+PAD8(256*b);
+ __m256i v, sv = _mm256_set1_epi32(start), cv = _mm256_set1_epi32(1);
BITPACK256V32(in, b, out, sv);
return pout;
}
-//-- bitpack zigzag -------------------------------------------------------------------------------------------------------------------------
#define VI32(_ip_, _i_, _iv_, _sv_) v = _mm256_loadu_si256(_ip_++); _iv_ = mm256_delta_epi32(v,_sv_); _sv_ = v; _iv_ = mm256_zzage_epi32(_iv_)
-unsigned char *bitzpack256v32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b) {
- unsigned char *pout = out+PAD8(256*b);
- __m256i sv = _mm256_set1_epi32(start), v,
- cv = _mm256_set1_epi32(1);
- BITPACK256V32(in, b, out, sv);
- return pout;
-}
-
-//-- bitpack xor --------------------------------------------------------------------------------------------------------------
-#define VI32(_ip_, _i_, _iv_, _sv_) v = _mm256_loadu_si256(_ip_++); _iv_ = mm256_xore_epi32(v,_sv_); _sv_ = v;
-unsigned char *bitxpack256v32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b) {
- unsigned char *pout = out+PAD8(256*b);
- __m256i sv = _mm256_set1_epi32(start), v;
+unsigned char *bitzpack256v32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b) { unsigned char *pout = out+PAD8(256*b);
+ __m256i v, sv = _mm256_set1_epi32(start), cv = _mm256_set1_epi32(1);
BITPACK256V32(in, b, out, sv);
return pout;
}
-//--------------------------------------------------- bitnpack --------------------------------------------------------------------------------------------------
-size_t bitnpack256v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out) { uint32_t *ip; _BITNPACKV( in, n, out, 256, 32, bitpack256v); }
-size_t bitndpack256v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out) { uint32_t *ip,start; _BITNDPACKV(in, n, out, 256, 32, bitd256v, bitdpack256v, bitd, bitdpack); }
-size_t bitnd1pack256v32(uint32_t *__restrict in, size_t n, unsigned char *__restrict out) { uint32_t *ip,start; _BITNDPACKV(in, n, out, 256, 32, bitd1256v, bitd1pack256v,bitd1, bitd1pack); }
-size_t bitnzpack256v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out) { uint32_t *ip,start; _BITNDPACKV(in, n, out, 256, 32, bitz256v, bitzpack256v, bitz, bitzpack); }
-size_t bitnfpack256v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out) { uint32_t *ip,start; _BITNDPACKV(in, n, out, 256, 32, bitf, bitfpack256v, bitf, bitfpack); }
-size_t bitnxpack256v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out) { uint32_t *ip,start; _BITNDPACKV(in, n, out, 256, 32, bitx256v, bitxpack256v, bitx, bitxpack); }
+size_t bitnpack256v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out) { uint32_t *ip; _BITNPACKV( in, n, out, 256, 32, bitpack256v); }
+size_t bitndpack256v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out) { uint32_t *ip,start; _BITNDPACKV(in, n, out, 256, 32, bitd, bitdpack256v, bitd, bitdpack); }
+size_t bitnd1pack256v32(uint32_t *__restrict in, size_t n, unsigned char *__restrict out) { uint32_t *ip,start; _BITNDPACKV(in, n, out, 256, 32, bitd1, bitd1pack256v,bitd1, bitd1pack); }
+size_t bitnzpack256v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out) { uint32_t *ip,start; _BITNDPACKV(in, n, out, 256, 32, bitz, bitzpack256v, bitz, bitzpack); }
+size_t bitnfpack256v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out) { uint32_t *ip,start; _BITNDPACKV(in, n, out, 256, 32, bitf, bitfpack256v, bitf, bitfpack); }
- #elif defined(__SSE3__) || defined(__ARM_NEON) //----------------------------- SSE / AVX ---------------------------------------------------------------
+ #elif defined(__SSE2__) || defined(__ARM_NEON) //----------------------------- SSE ---------------------------------------------------------------
#define OPPE(__op)
#define IPPE(__op)
-//-- bitpack --------------------------------------------------------------------------------
#define VI16(ip, i, iv, parm)
#define VI32(ip, i, iv, parm)
#define IP16(_ip_, i, iv) _mm_loadu_si128(_ip_++)
@@ -397,15 +322,8 @@ size_t bitnxpack256v32( uint32_t *__restrict in, size_t n, unsigned char *__rest
#include "bitpack_.h"
unsigned char *bitpack128v16(unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned b) { unsigned char *pout = out+PAD8(128*b); BITPACK128V16(in, b, out, 0); return pout; }
unsigned char *bitpack128v32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned b) { unsigned char *pout = out+PAD8(128*b); BITPACK128V32(in, b, out, 0); return pout; }
-unsigned char *bitpack256w32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned b) {
- unsigned char *_out = out;
- unsigned *_in = in;
- BITPACK128V32(in, b, out, 0);
- in = _in+128;
- out = _out+PAD8(128*b);
- BITPACK128V32(in, b, out, 0);
- return _out+PAD8(256*b);
-}
+unsigned char *bitpack256w32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned b) { unsigned char *_out=out; unsigned *_in=in;
+BITPACK128V32(in, b, out, 0); in = _in+128; out = _out+PAD8(128*b); BITPACK128V32(in, b, out, 0); return _out+PAD8(256*b); }
#ifdef __ARM_NEON
//#define IP32(_ip_, i, iv) _mm_or_si128(_mm_shuffle_epi32( _mm_loadu_si128(_ip_++),_MM_SHUFFLE(3, 1, 2, 0)), _mm_shuffle_epi32( _mm_loadu_si128(_ip_++),_MM_SHUFFLE(2, 0, 3, 1)) )
@@ -415,148 +333,75 @@ unsigned char *bitpack256w32(unsigned *__restrict in, unsigned n, unsigned
#endif
#include "bitpack_.h"
unsigned char *bitpack128v64(uint64_t *__restrict in, unsigned n, unsigned char *__restrict out, unsigned b) {
- if(b <= 32) {
- unsigned char *pout = out+PAD8(128*b);
- BITPACK128V32(in, b, out, 0);
- return pout;
- } else return bitpack64(in,n,out,b);
+ if(b<=32) { unsigned char *pout = out+PAD8(128*b); BITPACK128V32(in, b, out, 0); return pout; } else return bitpack64(in,n,out,b);
}
-//-- bitpack delta -----------------------------------------------------------------------------------------------------------------------
#define VI16(_ip_, _i_, _iv_, _sv_) v = _mm_loadu_si128(_ip_++); _iv_ = mm_delta_epi16(v,_sv_); _sv_ = v
#define VI32(_ip_, _i_, _iv_, _sv_) v = _mm_loadu_si128(_ip_++); _iv_ = mm_delta_epi32(v,_sv_); _sv_ = v
#define IP16(ip, i, _iv_) _iv_
#define IP32(ip, i, _iv_) _iv_
#include "bitpack_.h"
-unsigned char *bitdpack128v16(unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start, unsigned b) {
- unsigned char *pout = out+PAD8(128*b);
- __m128i sv = _mm_set1_epi16(start), v;
- BITPACK128V16(in, b, out, sv);
- return pout;
+unsigned char *bitdpack128v16(unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start, unsigned b) { unsigned char *pout = out+PAD8(128*b);
+ __m128i v,sv = _mm_set1_epi16(start); BITPACK128V16(in, b, out, sv); return pout;
}
-unsigned char *bitdpack128v32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b) {
- unsigned char *pout = out+PAD8(128*b);
- __m128i sv = _mm_set1_epi32(start), v;
- BITPACK128V32(in, b, out, sv);
- return pout;
+unsigned char *bitdpack128v32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b) { unsigned char *pout = out+PAD8(128*b);
+ __m128i v,sv = _mm_set1_epi32(start); BITPACK128V32(in, b, out, sv); return pout;
}
-//-- bitpack FOR ---------------------------------------------------------------------------------------------------------------------------
#define VI16(_ip_, _i_, _iv_, _sv_)
#define VI32(_ip_, _i_, _iv_, _sv_)
#define IP16(_ip_, i, _iv_) _mm_sub_epi16(_mm_loadu_si128(_ip_++),sv)
#define IP32(_ip_, i, _iv_) _mm_sub_epi32(_mm_loadu_si128(_ip_++),sv)
#include "bitpack_.h"
-unsigned char *bitfpack128v16(unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start, unsigned b) {
- unsigned char *pout = out+PAD8(128*b);
- __m128i sv = _mm_set1_epi16(start), v;
- BITPACK128V16(in, b, out, sv);
- return pout;
+unsigned char *bitfpack128v16(unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start, unsigned b) { unsigned char *pout = out+PAD8(128*b);
+ __m128i v, sv = _mm_set1_epi16(start); BITPACK128V16(in, b, out, sv); return pout;
}
-unsigned char *bitfpack128v32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b) {
- unsigned char *pout = out+PAD8(128*b);
- __m128i sv = _mm_set1_epi32(start), v;
- BITPACK128V32(in, b, out, sv);
- return pout;
+unsigned char *bitfpack128v32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b) { unsigned char *pout = out+PAD8(128*b);
+ __m128i v, sv = _mm_set1_epi32(start); BITPACK128V32(in, b, out, sv); return pout;
}
-//-- bitpack delta 1 -----------------------------------------------------------------------------------------------------------------------
#define VI16(_ip_, _i_, _iv_, _sv_) v = _mm_loadu_si128(_ip_++); _iv_ = _mm_sub_epi16(mm_delta_epi16(v,_sv_),cv); _sv_ = v
#define VI32(_ip_, _i_, _iv_, _sv_) v = _mm_loadu_si128(_ip_++); _iv_ = _mm_sub_epi32(mm_delta_epi32(v,_sv_),cv); _sv_ = v
#define IP16(ip, i, _iv_) _iv_
#define IP32(ip, i, _iv_) _iv_
-unsigned char *bitd1pack128v16(unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start, unsigned b) {
- unsigned char *pout = out+PAD8(128*b);
- __m128i sv = _mm_set1_epi16(start),
- cv = _mm_set1_epi16(1), v;
- BITPACK128V16(in, b, out, sv);
- return pout;
+unsigned char *bitd1pack128v16(unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start, unsigned b) { unsigned char *pout = out+PAD8(128*b);
+ __m128i sv = _mm_set1_epi16(start), cv = _mm_set1_epi16(1), v; BITPACK128V16(in, b, out, sv); return pout;
}
-
-unsigned char *bitd1pack128v32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b) {
- unsigned char *pout = out+PAD8(128*b);
- __m128i sv = _mm_set1_epi32(start), v,
- cv = _mm_set1_epi32(1);
- BITPACK128V32(in, b, out, sv);
- return pout;
+unsigned char *bitd1pack128v32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b) { unsigned char *pout = out+PAD8(128*b);
+ __m128i v, sv = _mm_set1_epi32(start), cv = _mm_set1_epi32(1); BITPACK128V32(in, b, out, sv); return pout;
}
-//-- bitpack sub -----------------------------------------------------------------------------------------------------------------------------
#define VI16(_ip_, _i_, _iv_, _sv_) v = _mm_loadu_si128(_ip_++); _iv_ = _mm_sub_epi16(SUBI16x8(v,_sv_),cv); _sv_ = v
#define VI32(_ip_, _i_, _iv_, _sv_) v = _mm_loadu_si128(_ip_++); _iv_ = _mm_sub_epi32(SUBI32x4(v,_sv_),cv); _sv_ = v
#define IP16(ip, i, _iv_) _iv_
#define IP32(ip, i, _iv_) _iv_
-unsigned char *bits1pack128v16(unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start, unsigned b) {
- unsigned char *pout = out+PAD8(128*b);
- __m128i sv = _mm_set1_epi16(start), v,
- cv = _mm_set1_epi16(8);
- BITPACK128V16(in, b, out, sv);
- return pout;
+unsigned char *bits1pack128v16(unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start, unsigned b) { unsigned char *pout = out+PAD8(128*b);
+ __m128i v, sv = _mm_set1_epi16(start), cv = _mm_set1_epi16(8); BITPACK128V16(in, b, out, sv); return pout;
}
-unsigned char *bits1pack128v32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b) {
- unsigned char *pout = out+PAD8(128*b);
- __m128i sv = _mm_set1_epi32(start), v,
- cv = _mm_set1_epi32(4);
- BITPACK128V32(in, b, out, sv);
- return pout;
+unsigned char *bits1pack128v32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b) { unsigned char *pout = out+PAD8(128*b);
+ __m128i v, sv = _mm_set1_epi32(start), cv = _mm_set1_epi32(4); BITPACK128V32(in, b, out, sv); return pout;
}
-//-- bitpack FOR 1 -------------------------------------------------------------------------------------------------------------------------
#define VI16(_ip_, _i_, _iv_, _sv_) _iv_ = _mm_sub_epi16(_mm_loadu_si128(_ip_++),_sv_); _sv_ = _mm_add_epi16(_sv_,cv);
#define VI32(_ip_, _i_, _iv_, _sv_) _iv_ = _mm_sub_epi32(_mm_loadu_si128(_ip_++),_sv_); _sv_ = _mm_add_epi32(_sv_,cv);
#define IP16(ip, i, _iv_) _iv_
#define IP32(ip, i, _iv_) _iv_
-unsigned char *bitf1pack128v16(unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start, unsigned b) {
- unsigned char *pout = out+PAD8(128*b);
- __m128i sv = _mm_set_epi16(start+8,start+7,start+6,start+5,start+4,start+3,start+2,start+1), v,
- cv = _mm_set1_epi16(8);
- BITPACK128V16(in, b, out, sv);
- return pout;
+unsigned char *bitf1pack128v16(unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start, unsigned b) { unsigned char *pout = out+PAD8(128*b);
+ __m128i v, sv = _mm_set_epi16(start+8,start+7,start+6,start+5,start+4,start+3,start+2,start+1), cv = _mm_set1_epi16(8); BITPACK128V16(in, b, out, sv); return pout;
}
-unsigned char *bitf1pack128v32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b) {
- unsigned char *pout = out+PAD8(128*b);
- __m128i sv = _mm_set_epi32( start+4,start+3,start+2,start+1), v,
- cv = _mm_set1_epi32(4); BITPACK128V32(in, b, out, sv);
- return pout;
+unsigned char *bitf1pack128v32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b) { unsigned char *pout = out+PAD8(128*b);
+ __m128i v, sv = _mm_set_epi32( start+4,start+3,start+2,start+1), cv = _mm_set1_epi32(4); BITPACK128V32(in, b, out, sv); return pout;
}
-//-- bitpack zigzag ----------------------------------------------------------------------------------------------------------------------
#define VI16(_ip_, _i_, _iv_, _sv_) v = _mm_loadu_si128(_ip_++); _iv_ = mm_delta_epi16(v,_sv_); _sv_ = v; _iv_ = mm_zzage_epi16(_iv_)
-unsigned char *bitzpack128v16(unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start, unsigned b) {
- unsigned char *pout = out+PAD8(128*b);
- __m128i sv = _mm_set1_epi16(start), v,
- cv = _mm_set1_epi16(1);
- BITPACK128V16(in, b, out, sv);
- return pout;
+unsigned char *bitzpack128v16(unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start, unsigned b) { unsigned char *pout = out+PAD8(128*b);
+ __m128i v, sv = _mm_set1_epi16(start), cv = _mm_set1_epi16(1); BITPACK128V16(in, b, out, sv); return pout;
}
-
#define VI32(_ip_, _i_, _iv_, _sv_) v = _mm_loadu_si128(_ip_++); _iv_ = mm_delta_epi32(v,_sv_); _sv_ = v; _iv_ = mm_zzage_epi32(_iv_)
-unsigned char *bitzpack128v32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b) {
- unsigned char *pout = out+PAD8(128*b);
- __m128i sv = _mm_set1_epi32(start), v,
- cv = _mm_set1_epi32(1);
- BITPACK128V32(in, b, out, sv);
- return pout;
+unsigned char *bitzpack128v32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b) { unsigned char *pout = out+PAD8(128*b);
+ __m128i v, sv = _mm_set1_epi32(start), cv = _mm_set1_epi32(1); BITPACK128V32(in, b, out, sv); return pout;
}
-//-- bitpack xor --------------------------------------------------------------------------------------------------------------------------
-#define VI16(_ip_, _i_, _iv_, _sv_) v = _mm_loadu_si128(_ip_++); _iv_ = mm_xore_epi16(v,_sv_); _sv_ = v;
-unsigned char *bitxpack128v16(unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start, unsigned b) {
- unsigned char *pout = out+PAD8(128*b);
- __m128i sv = _mm_set1_epi16(start), v;
- BITPACK128V16(in, b, out, sv);
- return pout;
-}
-
-#define VI32(_ip_, _i_, _iv_, _sv_) v = _mm_loadu_si128(_ip_++); _iv_ = mm_xore_epi32(v,_sv_); _sv_ = v;
-unsigned char *bitxpack128v32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b) {
- unsigned char *pout = out+PAD8(128*b);
- __m128i sv = _mm_set1_epi32(start), v;
- BITPACK128V32(in, b, out, sv);
- return pout;
-}
-
-//---------------------------- bitpack --------------------------------------------------------------------------------------------------------------------------
size_t bitnpack128v16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out) { uint16_t *ip; _BITNPACKV( in, n, out, 128, 16, bitpack128v); }
size_t bitnpack128v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out) { uint32_t *ip; _BITNPACKV( in, n, out, 128, 32, bitpack128v); }
size_t bitnpack128v64( uint64_t *__restrict in, size_t n, unsigned char *__restrict out) { uint64_t *ip; _BITNPACKV( in, n, out, 128, 64, bitpack128v); }
@@ -574,11 +419,9 @@ size_t bitns1pack128v32(uint32_t *__restrict in, size_t n, unsigned char *__rest
size_t bitnzpack128v16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out) { uint16_t *ip,start; _BITNDPACKV(in, n, out, 128, 16, bitz, bitzpack128v, bitz, bitzpack); }
size_t bitnzpack128v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out) { uint32_t *ip,start; _BITNDPACKV(in, n, out, 128, 32, bitz, bitzpack128v, bitz, bitzpack); }
-size_t bitnxpack128v16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out) { uint16_t *ip,start; _BITNDPACKV(in, n, out, 128, 16, bitx, bitxpack128v, bitx, bitxpack); }
-size_t bitnxpack128v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out) { uint32_t *ip,start; _BITNDPACKV(in, n, out, 128, 32, bitx, bitxpack128v, bitx, bitxpack); }
-
size_t bitnfpack128v16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out) { uint16_t *ip,start; _BITNDPACKV(in, n, out, 128, 16, bitf, bitfpack128v, bitf, bitfpack); }
size_t bitnfpack128v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out) { uint32_t *ip,start; _BITNDPACKV(in, n, out, 128, 32, bitf, bitfpack128v, bitf, bitfpack); }
#endif // SSE
+#endif // Plain
#pragma clang diagnostic pop
diff --git a/src/ext/for/include_/bitpack.h b/src/ext/for/bitpack.h
similarity index 81%
rename from src/ext/for/include_/bitpack.h
rename to src/ext/for/bitpack.h
index a4aa1f5a..b0b9e022 100644
--- a/src/ext/for/include_/bitpack.h
+++ b/src/ext/for/bitpack.h
@@ -1,24 +1,41 @@
-//-- bitpack -------------------------------------------------------------------------------------------------------
+/**
+ Copyright (C) powturbo 2013-2019
+ GPL v2 License
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License along
+ with this program; if not, write to the Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+ - homepage : https://sites.google.com/site/powturbo/
+ - github : https://github.com/powturbo
+ - twitter : https://twitter.com/powturbo
+ - email : powturbo [_AT_] gmail [_DOT_] com
+**/
+// bitpack.h - "Integer Compression" Binary Packing header file
+#ifndef BITPACK_H_
+#define BITPACK_H_
+#if defined(_MSC_VER) && _MSC_VER < 1600
+#include "vs/stdint.h"
+#else
+#include <stdint.h>
+#endif
+#include <stddef.h>
+
#ifdef __cplusplus
extern "C" {
#endif
-size_t bitnbound8( size_t n);
-size_t bitnbound16( size_t n);
-size_t bitnbound32( size_t n);
-size_t bitnbound64( size_t n);
-
-size_t bitnbound128v8( size_t n);
-size_t bitnbound128v16(size_t n);
-size_t bitnbound128v32(size_t n);
-size_t bitnbound128v64(size_t n);
-
-size_t bitnbound256v8( size_t n);
-size_t bitnbound256v16(size_t n);
-size_t bitnbound256v32(size_t n);
-size_t bitnbound256v64(size_t n);
-
-//******************** Bit Packing High Level API - n unlimited ****************************
+//******************** Bit Packing High Level API - n unlimited ***************************************************
size_t bitnpack8( uint8_t *__restrict in, size_t n, unsigned char *__restrict out);
size_t bitnpack16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out);
size_t bitnpack32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
@@ -52,14 +69,6 @@ size_t bitnzpack128v16( uint16_t *__restrict in, size_t n, unsigned char *__re
size_t bitnzpack128v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
size_t bitnzpack256v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
-size_t bitnxpack8( uint8_t *__restrict in, size_t n, unsigned char *__restrict out);
-size_t bitnxpack16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out);
-size_t bitnxpack32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
-size_t bitnxpack64( uint64_t *__restrict in, size_t n, unsigned char *__restrict out);
-size_t bitnxpack128v16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out);
-size_t bitnxpack128v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
-size_t bitnxpack256v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
-
size_t bitnfpack8( uint8_t *__restrict in, size_t n, unsigned char *__restrict out);
size_t bitnfpack16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out);
size_t bitnfpack32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
@@ -101,14 +110,6 @@ size_t bitnzunpack128v16( unsigned char *__restrict in, size_t n, uint16_t *__re
size_t bitnzunpack128v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
size_t bitnzunpack256v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
-size_t bitnxunpack8( unsigned char *__restrict in, size_t n, uint8_t *__restrict out);
-size_t bitnxunpack16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out);
-size_t bitnxunpack32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
-size_t bitnxunpack64( unsigned char *__restrict in, size_t n, uint64_t *__restrict out);
-size_t bitnxunpack128v16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out);
-size_t bitnxunpack128v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
-size_t bitnxunpack256v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
-
size_t bitnfunpack8( unsigned char *__restrict in, size_t n, uint8_t *__restrict out);
size_t bitnfunpack16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out);
size_t bitnfunpack32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
@@ -116,13 +117,6 @@ size_t bitnfunpack64( unsigned char *__restrict in, size_t n, uint64_t *__re
size_t bitnfunpack128v16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out);
size_t bitnfunpack128v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
size_t bitnfunpack256v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
-
-size_t bitns1pack128v16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out);
-size_t bitns1pack128v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
-size_t bitns1unpack128v16(unsigned char *__restrict in, size_t n, uint16_t *__restrict out);
-size_t bitns1unpack128v32(unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
-
-
//******** Bit Packing Low level API ****************************************************************
// bipackNN: Pack array with n unsigned (NN bits in[n]) values to the buffer out using nbits per value. Return value = end of compressed buffer out
unsigned char *bitpack8( uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out , unsigned b);
@@ -158,12 +152,6 @@ unsigned char *bitzpack16( uint16_t *__restrict in, unsigned n, const unsigned
unsigned char *bitzpack32( uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out, uint32_t start, unsigned b);
unsigned char *bitzpack64( uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out, uint64_t start, unsigned b);
-// xor : unsorted integer array
-unsigned char *bitxpack8( uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out, uint8_t start, unsigned b);
-unsigned char *bitxpack16( uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out, uint16_t start, unsigned b);
-unsigned char *bitxpack32( uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out, uint32_t start, unsigned b);
-unsigned char *bitxpack64( uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out, uint64_t start, unsigned b);
-
//-------------------------------------- SIMD ------------------------------------------------------------------------------------------
// Pack array with 128 unsigned (32 bits in[n]) values to the buffer out using nbits per value. Return value = end of compressed buffer out
unsigned char *bitpack128v16( unsigned short *__restrict in, unsigned n, unsigned char *__restrict out , unsigned b);
@@ -172,31 +160,24 @@ unsigned char *bitd1pack128v16(unsigned short *__restrict in, unsigned n, uns
unsigned char *bitfpack128v16( unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start, unsigned b);
unsigned char *bitf1pack128v16(unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start, unsigned b);
unsigned char *bitzpack128v16( unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start, unsigned b);
-unsigned char *bitxpack128v16( unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start, unsigned b);
-unsigned char *bitpack128v32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned b);
-unsigned char *bitdpack128v32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b);
-unsigned char *bitd1pack128v32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b);
-unsigned char *bitfpack128v32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b);
-unsigned char *bitf1pack128v32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b);
-unsigned char *bitzpack128v32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b);
-unsigned char *bitxpack128v32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b);
+unsigned char *bitpack128v32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out , unsigned b);
+unsigned char *bitdpack128v32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b);
+unsigned char *bitd1pack128v32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b);
+unsigned char *bitfpack128v32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b);
+unsigned char *bitf1pack128v32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b);
+unsigned char *bitzpack128v32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b);
//unsigned char *bitpack256w32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out , unsigned b);
-unsigned char *bitpack128v64( uint64_t *__restrict in, unsigned n, unsigned char *__restrict out, unsigned b);
-
-unsigned char *bitpack256v32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned b);
-unsigned char *bitdpack256v32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b);
-unsigned char *bitd1pack256v32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b);
-unsigned char *bitfpack256v32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b);
-unsigned char *bitf1pack256v32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b);
-unsigned char *bitzpack256v32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b);
-unsigned char *bitxpack256v32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b);
-
-unsigned char *bits1pack128v16(unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start, unsigned b);
-unsigned char *bits1pack128v32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b);
-unsigned char *bits1unpack128v16( const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out, uint16_t start, unsigned b);
-unsigned char *bits1unpack128v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b);
+unsigned char *bitpack128v64( uint64_t *__restrict in, unsigned n, unsigned char *__restrict out , unsigned b);
+
+unsigned char *bitpack256v32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out , unsigned b);
+unsigned char *bitdpack256v32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b);
+unsigned char *bitd1pack256v32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b);
+unsigned char *bitfpack256v32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b);
+unsigned char *bitf1pack256v32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b);
+unsigned char *bitzpack256v32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b);
+
//********************************** Bit Packing : Unpack ****************************************************************
// ---------------- Unpack a b-bits packed integer array -------------------------------------------------------------------------------
@@ -208,6 +189,15 @@ unsigned char *bitunpack64( const unsigned char *__restrict in, unsigned n, uint
// ---------------- Direct Access to a single packed integer array entry --------------------------------------------------------------
#ifdef TURBOPFOR_DAC
+ #ifdef __BMI2__ // BZHI belongs to BMI2; GCC/Clang do not enable it with -mavx2 alone
+#include <immintrin.h>
+#define bzhi64(_u_, _b_) _bzhi_u64(_u_, _b_)
+#define bzhi32(_u_, _b_) _bzhi_u32(_u_, _b_)
+ #else // portable fallback: keep the low _b_ bits of _u_; _b_ >= width keeps the whole width (a full-width shift is undefined); _b_ is evaluated twice
+#define bzhi64(_u_, _b_) ((_b_) >= 64 ? (_u_) : ((_u_) & ((1ull<<(_b_))-1)))
+#define bzhi32(_u_, _b_) ((_b_) >= 32 ? ((_u_) & 0xffffffffu) : ((_u_) & ((1u <<(_b_))-1)))
+ #endif
+
#include "conf.h"
static ALWAYS_INLINE unsigned bitgetx32(const unsigned char *__restrict in, unsigned idx, unsigned b) { unsigned bidx = b*idx; return bzhi64( ctou64((uint32_t *)in+(bidx>>5)) >> (bidx&0x1f), b ); }
@@ -247,12 +237,6 @@ unsigned char *bitzunpack16( const unsigned char *__restrict in, unsigned n, uin
unsigned char *bitzunpack32( const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, uint32_t start, unsigned b);
unsigned char *bitzunpack64( const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, uint64_t start, unsigned b);
-// ---------------- Xor : integrated bitpacking, for xor packed unsorted
-unsigned char *bitxunpack8( const unsigned char *__restrict in, unsigned n, uint8_t *__restrict out, uint8_t start, unsigned b);
-unsigned char *bitxunpack16( const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out, uint16_t start, unsigned b);
-unsigned char *bitxunpack32( const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, uint32_t start, unsigned b);
-unsigned char *bitxunpack64( const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, uint64_t start, unsigned b);
-
// ---------------- For : Direct Access for packed SORTED array --------------------------------------------
// out[i] = start + in[i] + i
unsigned char *bitfunpack8( const unsigned char *__restrict in, unsigned n, uint8_t *__restrict out, uint8_t start, unsigned b);
@@ -270,7 +254,6 @@ unsigned char *bitf1unpack64(const unsigned char *__restrict in, unsigned n, uin
// SIMD unpack a 128/256 bitpacked integer array. Return value = end of packed buffer in
unsigned char *bitunpack128v16( const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned b);
unsigned char *bitzunpack128v16( const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned short start, unsigned b);
-unsigned char *bitxunpack128v16( const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned short start, unsigned b);
unsigned char *bitdunpack128v16( const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned short start, unsigned b);
unsigned char *bitd1unpack128v16(const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned short start, unsigned b);
unsigned char *bitfunpack128v16( const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned short start, unsigned b);
@@ -278,7 +261,6 @@ unsigned char *bitf1unpack128v16(const unsigned char *__restrict in, unsigned n,
unsigned char *bitunpack128v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned b);
unsigned char *bitzunpack128v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b);
-unsigned char *bitxunpack128v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b);
unsigned char *bitdunpack128v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b);
unsigned char *bitd1unpack128v32(const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b);
unsigned char *bitfunpack128v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b);
@@ -289,7 +271,6 @@ unsigned char *bitunpack128v64( const unsigned char *__restrict in, unsigned n,
unsigned char *bitunpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned b);
unsigned char *bitzunpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b);
-unsigned char *bitxunpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b);
unsigned char *bitdunpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b);
unsigned char *bitd1unpack256v32(const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b);
unsigned char *bitfunpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b);
@@ -325,4 +306,5 @@ unsigned char *_bitzunpack256v32( const unsigned char *__restrict in, unsigned n
#ifdef __cplusplus
}
#endif
+#endif
diff --git a/src/ext/for/bitpack_.h b/src/ext/for/bitpack_.h
index 6480df87..d20cf9f0 100644
--- a/src/ext/for/bitpack_.h
+++ b/src/ext/for/bitpack_.h
@@ -1,6 +1,6 @@
/**
- Copyright (C) powturbo 2013-2023
- SPDX-License-Identifier: GPL v2 License
+ Copyright (C) powturbo 2013-2017
+ GPL v2 License
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -2258,556 +2258,543 @@
BITBLK64_64(ip, 31, op, parm); IPI(ip); op += 64*4/sizeof(op[0]);\
}
-#define BP(_b_,_usize_) unsigned char *out_=out+PAD8(n*_b_),*op, bout[PAD8(64*_b_)]; T3(uint,_usize_,_t) bin[64],*ip,*in_=in+n, v,x; \
- do { ip = in+32; op = out+PAD8(32*_b_); if(ip > in_) { memcpy(bin, in, (in_-in)*(_usize_/8)); in = bin; out = bout; } \
- T2(BITPACK64_,_b_)(in, out, start); in = ip; out = op; PREFETCH(in+384,0);\
- } while(in<in_); if(in>in_) { out -= PAD8(32*_b_); memcpy(out,bout,PAD8((in_-(in-32))*_b_)); } return out_
-
#ifndef DELTA
#define USIZE 8
-unsigned char *T2(_BITPACK_,8_0)(uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { return out; }
-unsigned char *T2(_BITPACK_,8_1)(uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP(1,8);}
-unsigned char *T2(_BITPACK_,8_2)(uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP(2,8);}
-unsigned char *T2(_BITPACK_,8_3)(uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP(3,8);}
-unsigned char *T2(_BITPACK_,8_4)(uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP(4,8);}
-unsigned char *T2(_BITPACK_,8_5)(uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP(5,8);}
-unsigned char *T2(_BITPACK_,8_6)(uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP(6,8);}
-unsigned char *T2(_BITPACK_,8_7)(uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP(7,8);}
-unsigned char *T2(_BITPACK_,8_8)(uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP(8,8);}
-BITPACK_F8 T2(_BITPACK_,a8)[] = {
- &T2(_BITPACK_,8_0),
- &T2(_BITPACK_,8_1),
- &T2(_BITPACK_,8_2),
- &T2(_BITPACK_,8_3),
- &T2(_BITPACK_,8_4),
- &T2(_BITPACK_,8_5),
- &T2(_BITPACK_,8_6),
- &T2(_BITPACK_,8_7),
- &T2(_BITPACK_,8_8)
+unsigned char *TEMPLATE2(_BITPACK_,8_0)( uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { return out; }
+unsigned char *TEMPLATE2(_BITPACK_,8_1)(uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*1); uint8_t v,x;do { BITPACK64_1( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,8_2)(uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*2); uint8_t v,x;do { BITPACK64_2( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,8_3)(uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*3); uint8_t v,x;do { BITPACK64_3( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,8_4)(uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*4); uint8_t v,x;do { BITPACK64_4( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,8_5)(uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*5); uint8_t v,x;do { BITPACK64_5( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,8_6)(uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*6); uint8_t v,x;do { BITPACK64_6( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,8_7)(uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*7); uint8_t v,x;do { BITPACK64_7( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,8_8)(uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*8); uint8_t v,x;do { BITPACK64_8( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+BITPACK_F8 TEMPLATE2(_BITPACK_,a8)[] = {
+ &TEMPLATE2(_BITPACK_,8_0),
+ &TEMPLATE2(_BITPACK_,8_1),
+ &TEMPLATE2(_BITPACK_,8_2),
+ &TEMPLATE2(_BITPACK_,8_3),
+ &TEMPLATE2(_BITPACK_,8_4),
+ &TEMPLATE2(_BITPACK_,8_5),
+ &TEMPLATE2(_BITPACK_,8_6),
+ &TEMPLATE2(_BITPACK_,8_7),
+ &TEMPLATE2(_BITPACK_,8_8)
};
-unsigned char *T2(_BITPACK_,8)( uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out , unsigned b) { return T2(_BITPACK_,a8)[ b](in, n, out); }
-#undef USIZE
+unsigned char *TEMPLATE2(_BITPACK_,8)( uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out , unsigned b) { return TEMPLATE2(_BITPACK_,a8)[ b](in, n, out); }
#define USIZE 16
-unsigned char *T2(_BITPACK_,16_0 )(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { return out; }
-unsigned char *T2(_BITPACK_,16_1 )(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP(1,16);}
-unsigned char *T2(_BITPACK_,16_2 )(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP(2,16);}
-unsigned char *T2(_BITPACK_,16_3 )(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP(3,16);}
-unsigned char *T2(_BITPACK_,16_4 )(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP(4,16);}
-unsigned char *T2(_BITPACK_,16_5 )(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP(5,16);}
-unsigned char *T2(_BITPACK_,16_6 )(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP(6,16);}
-unsigned char *T2(_BITPACK_,16_7 )(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP(7,16);}
-unsigned char *T2(_BITPACK_,16_8 )(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP(8,16);}
-unsigned char *T2(_BITPACK_,16_9 )(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP(9,16);}
-unsigned char *T2(_BITPACK_,16_10)(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP(10,16);}
-unsigned char *T2(_BITPACK_,16_11)(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP(11,16);}
-unsigned char *T2(_BITPACK_,16_12)(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP(12,16);}
-unsigned char *T2(_BITPACK_,16_13)(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP(13,16);}
-unsigned char *T2(_BITPACK_,16_14)(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP(14,16);}
-unsigned char *T2(_BITPACK_,16_15)(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP(15,16);}
-unsigned char *T2(_BITPACK_,16_16)(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP(16,16);}
-BITPACK_F16 T2(_BITPACK_,a16)[] = {
- &T2(_BITPACK_,16_0),
- &T2(_BITPACK_,16_1),
- &T2(_BITPACK_,16_2),
- &T2(_BITPACK_,16_3),
- &T2(_BITPACK_,16_4),
- &T2(_BITPACK_,16_5),
- &T2(_BITPACK_,16_6),
- &T2(_BITPACK_,16_7),
- &T2(_BITPACK_,16_8),
- &T2(_BITPACK_,16_9),
- &T2(_BITPACK_,16_10),
- &T2(_BITPACK_,16_11),
- &T2(_BITPACK_,16_12),
- &T2(_BITPACK_,16_13),
- &T2(_BITPACK_,16_14),
- &T2(_BITPACK_,16_15),
- &T2(_BITPACK_,16_16)
+unsigned char *TEMPLATE2(_BITPACK_,16_0)( uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { return out; }
+unsigned char *TEMPLATE2(_BITPACK_,16_1)(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*1); uint16_t v,x;do { BITPACK64_1( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,16_2)(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*2); uint16_t v,x;do { BITPACK64_2( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,16_3)(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*3); uint16_t v,x;do { BITPACK64_3( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,16_4)(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*4); uint16_t v,x;do { BITPACK64_4( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,16_5)(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*5); uint16_t v,x;do { BITPACK64_5( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,16_6)(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*6); uint16_t v,x;do { BITPACK64_6( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,16_7)(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*7); uint16_t v,x;do { BITPACK64_7( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,16_8)(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*8); uint16_t v,x;do { BITPACK64_8( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,16_9)(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*9); uint16_t v,x;do { BITPACK64_9( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,16_10)(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*10); uint16_t v,x;do { BITPACK64_10( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,16_11)(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*11); uint16_t v,x;do { BITPACK64_11( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,16_12)(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*12); uint16_t v,x;do { BITPACK64_12( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,16_13)(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*13); uint16_t v,x;do { BITPACK64_13( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,16_14)(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*14); uint16_t v,x;do { BITPACK64_14( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,16_15)(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*15); uint16_t v,x;do { BITPACK64_15( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,16_16)(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*16); uint16_t v,x;do { BITPACK64_16( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+BITPACK_F16 TEMPLATE2(_BITPACK_,a16)[] = {
+ &TEMPLATE2(_BITPACK_,16_0),
+ &TEMPLATE2(_BITPACK_,16_1),
+ &TEMPLATE2(_BITPACK_,16_2),
+ &TEMPLATE2(_BITPACK_,16_3),
+ &TEMPLATE2(_BITPACK_,16_4),
+ &TEMPLATE2(_BITPACK_,16_5),
+ &TEMPLATE2(_BITPACK_,16_6),
+ &TEMPLATE2(_BITPACK_,16_7),
+ &TEMPLATE2(_BITPACK_,16_8),
+ &TEMPLATE2(_BITPACK_,16_9),
+ &TEMPLATE2(_BITPACK_,16_10),
+ &TEMPLATE2(_BITPACK_,16_11),
+ &TEMPLATE2(_BITPACK_,16_12),
+ &TEMPLATE2(_BITPACK_,16_13),
+ &TEMPLATE2(_BITPACK_,16_14),
+ &TEMPLATE2(_BITPACK_,16_15),
+ &TEMPLATE2(_BITPACK_,16_16)
};
-unsigned char *T2(_BITPACK_,16)( uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out , unsigned b) { return T2(_BITPACK_,a16)[ b](in, n, out); }
-#undef USIZE
+unsigned char *TEMPLATE2(_BITPACK_,16)( uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out , unsigned b) { return TEMPLATE2(_BITPACK_,a16)[ b](in, n, out); }
#define USIZE 32
-unsigned char *T2(_BITPACK_,32_0 )( uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { return out; }
-unsigned char *T2(_BITPACK_,32_1 )(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP( 1,32);}
-unsigned char *T2(_BITPACK_,32_2 )(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP( 2,32);}
-unsigned char *T2(_BITPACK_,32_3 )(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP( 3,32);}
-unsigned char *T2(_BITPACK_,32_4 )(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP( 4,32);}
-unsigned char *T2(_BITPACK_,32_5 )(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP( 5,32);}
-unsigned char *T2(_BITPACK_,32_6 )(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP( 6,32);}
-unsigned char *T2(_BITPACK_,32_7 )(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP( 7,32);}
-unsigned char *T2(_BITPACK_,32_8 )(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP( 8,32);}
-unsigned char *T2(_BITPACK_,32_9 )(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP( 9,32);}
-unsigned char *T2(_BITPACK_,32_10)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP(10,32);}
-unsigned char *T2(_BITPACK_,32_11)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP(11,32);}
-unsigned char *T2(_BITPACK_,32_12)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP(12,32);}
-unsigned char *T2(_BITPACK_,32_13)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP(13,32);}
-unsigned char *T2(_BITPACK_,32_14)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP(14,32);}
-unsigned char *T2(_BITPACK_,32_15)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP(15,32);}
-unsigned char *T2(_BITPACK_,32_16)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP(16,32);}
-unsigned char *T2(_BITPACK_,32_17)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP(17,32);}
-unsigned char *T2(_BITPACK_,32_18)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP(18,32);}
-unsigned char *T2(_BITPACK_,32_19)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP(19,32);}
-unsigned char *T2(_BITPACK_,32_20)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP(20,32);}
-unsigned char *T2(_BITPACK_,32_21)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP(21,32);}
-unsigned char *T2(_BITPACK_,32_22)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP(22,32);}
-unsigned char *T2(_BITPACK_,32_23)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP(23,32);}
-unsigned char *T2(_BITPACK_,32_24)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP(24,32);}
-unsigned char *T2(_BITPACK_,32_25)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP(25,32);}
-unsigned char *T2(_BITPACK_,32_26)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP(26,32);}
-unsigned char *T2(_BITPACK_,32_27)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP(27,32);}
-unsigned char *T2(_BITPACK_,32_28)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP(28,32);}
-unsigned char *T2(_BITPACK_,32_29)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP(29,32);}
-unsigned char *T2(_BITPACK_,32_30)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP(30,32);}
-unsigned char *T2(_BITPACK_,32_31)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP(31,32);}
-unsigned char *T2(_BITPACK_,32_32)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP(32,32);}
-BITPACK_F32 T2(_BITPACK_,a32)[] = {
- &T2(_BITPACK_,32_0),
- &T2(_BITPACK_,32_1),
- &T2(_BITPACK_,32_2),
- &T2(_BITPACK_,32_3),
- &T2(_BITPACK_,32_4),
- &T2(_BITPACK_,32_5),
- &T2(_BITPACK_,32_6),
- &T2(_BITPACK_,32_7),
- &T2(_BITPACK_,32_8),
- &T2(_BITPACK_,32_9),
- &T2(_BITPACK_,32_10),
- &T2(_BITPACK_,32_11),
- &T2(_BITPACK_,32_12),
- &T2(_BITPACK_,32_13),
- &T2(_BITPACK_,32_14),
- &T2(_BITPACK_,32_15),
- &T2(_BITPACK_,32_16),
- &T2(_BITPACK_,32_17),
- &T2(_BITPACK_,32_18),
- &T2(_BITPACK_,32_19),
- &T2(_BITPACK_,32_20),
- &T2(_BITPACK_,32_21),
- &T2(_BITPACK_,32_22),
- &T2(_BITPACK_,32_23),
- &T2(_BITPACK_,32_24),
- &T2(_BITPACK_,32_25),
- &T2(_BITPACK_,32_26),
- &T2(_BITPACK_,32_27),
- &T2(_BITPACK_,32_28),
- &T2(_BITPACK_,32_29),
- &T2(_BITPACK_,32_30),
- &T2(_BITPACK_,32_31),
- &T2(_BITPACK_,32_32)
+unsigned char *TEMPLATE2(_BITPACK_,32_0)( uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { return out; }
+unsigned char *TEMPLATE2(_BITPACK_,32_1)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*1); uint32_t v,x;do { BITPACK64_1( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,32_2)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*2); uint32_t v,x;do { BITPACK64_2( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,32_3)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*3); uint32_t v,x;do { BITPACK64_3( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,32_4)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*4); uint32_t v,x;do { BITPACK64_4( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,32_5)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*5); uint32_t v,x;do { BITPACK64_5( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,32_6)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*6); uint32_t v,x;do { BITPACK64_6( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,32_7)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*7); uint32_t v,x;do { BITPACK64_7( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,32_8)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*8); uint32_t v,x;do { BITPACK64_8( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,32_9)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*9); uint32_t v,x;do { BITPACK64_9( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,32_10)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*10); uint32_t v,x;do { BITPACK64_10( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,32_11)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*11); uint32_t v,x;do { BITPACK64_11( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,32_12)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*12); uint32_t v,x;do { BITPACK64_12( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,32_13)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*13); uint32_t v,x;do { BITPACK64_13( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,32_14)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*14); uint32_t v,x;do { BITPACK64_14( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,32_15)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*15); uint32_t v,x;do { BITPACK64_15( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,32_16)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*16); uint32_t v,x;do { BITPACK64_16( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,32_17)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*17); uint32_t v,x;do { BITPACK64_17( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,32_18)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*18); uint32_t v,x;do { BITPACK64_18( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,32_19)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*19); uint32_t v,x;do { BITPACK64_19( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,32_20)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*20); uint32_t v,x;do { BITPACK64_20( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,32_21)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*21); uint32_t v,x;do { BITPACK64_21( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,32_22)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*22); uint32_t v,x;do { BITPACK64_22( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,32_23)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*23); uint32_t v,x;do { BITPACK64_23( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,32_24)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*24); uint32_t v,x;do { BITPACK64_24( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,32_25)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*25); uint32_t v,x;do { BITPACK64_25( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,32_26)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*26); uint32_t v,x;do { BITPACK64_26( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,32_27)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*27); uint32_t v,x;do { BITPACK64_27( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,32_28)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*28); uint32_t v,x;do { BITPACK64_28( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,32_29)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*29); uint32_t v,x;do { BITPACK64_29( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,32_30)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*30); uint32_t v,x;do { BITPACK64_30( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,32_31)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*31); uint32_t v,x;do { BITPACK64_31( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,32_32)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*32); uint32_t v,x;do { BITPACK64_32( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+BITPACK_F32 TEMPLATE2(_BITPACK_,a32)[] = {
+ &TEMPLATE2(_BITPACK_,32_0),
+ &TEMPLATE2(_BITPACK_,32_1),
+ &TEMPLATE2(_BITPACK_,32_2),
+ &TEMPLATE2(_BITPACK_,32_3),
+ &TEMPLATE2(_BITPACK_,32_4),
+ &TEMPLATE2(_BITPACK_,32_5),
+ &TEMPLATE2(_BITPACK_,32_6),
+ &TEMPLATE2(_BITPACK_,32_7),
+ &TEMPLATE2(_BITPACK_,32_8),
+ &TEMPLATE2(_BITPACK_,32_9),
+ &TEMPLATE2(_BITPACK_,32_10),
+ &TEMPLATE2(_BITPACK_,32_11),
+ &TEMPLATE2(_BITPACK_,32_12),
+ &TEMPLATE2(_BITPACK_,32_13),
+ &TEMPLATE2(_BITPACK_,32_14),
+ &TEMPLATE2(_BITPACK_,32_15),
+ &TEMPLATE2(_BITPACK_,32_16),
+ &TEMPLATE2(_BITPACK_,32_17),
+ &TEMPLATE2(_BITPACK_,32_18),
+ &TEMPLATE2(_BITPACK_,32_19),
+ &TEMPLATE2(_BITPACK_,32_20),
+ &TEMPLATE2(_BITPACK_,32_21),
+ &TEMPLATE2(_BITPACK_,32_22),
+ &TEMPLATE2(_BITPACK_,32_23),
+ &TEMPLATE2(_BITPACK_,32_24),
+ &TEMPLATE2(_BITPACK_,32_25),
+ &TEMPLATE2(_BITPACK_,32_26),
+ &TEMPLATE2(_BITPACK_,32_27),
+ &TEMPLATE2(_BITPACK_,32_28),
+ &TEMPLATE2(_BITPACK_,32_29),
+ &TEMPLATE2(_BITPACK_,32_30),
+ &TEMPLATE2(_BITPACK_,32_31),
+ &TEMPLATE2(_BITPACK_,32_32)
};
-unsigned char *T2(_BITPACK_,32)( uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , unsigned b) { return T2(_BITPACK_,a32)[ b](in, n, out); }
-#undef USIZE
+unsigned char *TEMPLATE2(_BITPACK_,32)( uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , unsigned b) { return TEMPLATE2(_BITPACK_,a32)[ b](in, n, out); }
#define USIZE 64
-unsigned char *T2(_BITPACK_,64_0 )(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { return out; }
-unsigned char *T2(_BITPACK_,64_1 )(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP( 1,64);}
-unsigned char *T2(_BITPACK_,64_2 )(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP( 2,64);}
-unsigned char *T2(_BITPACK_,64_3 )(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP( 3,64);}
-unsigned char *T2(_BITPACK_,64_4 )(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP( 4,64);}
-unsigned char *T2(_BITPACK_,64_5 )(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP( 5,64);}
-unsigned char *T2(_BITPACK_,64_6 )(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP( 6,64);}
-unsigned char *T2(_BITPACK_,64_7 )(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP( 7,64);}
-unsigned char *T2(_BITPACK_,64_8 )(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP( 8,64);}
-unsigned char *T2(_BITPACK_,64_9 )(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP( 9,64);}
-unsigned char *T2(_BITPACK_,64_10)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP(10,64);}
-unsigned char *T2(_BITPACK_,64_11)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP(11,64);}
-unsigned char *T2(_BITPACK_,64_12)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP(12,64);}
-unsigned char *T2(_BITPACK_,64_13)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP(13,64);}
-unsigned char *T2(_BITPACK_,64_14)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP(14,64);}
-unsigned char *T2(_BITPACK_,64_15)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP(15,64);}
-unsigned char *T2(_BITPACK_,64_16)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP(16,64);}
-unsigned char *T2(_BITPACK_,64_17)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP(17,64);}
-unsigned char *T2(_BITPACK_,64_18)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP(18,64);}
-unsigned char *T2(_BITPACK_,64_19)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP(19,64);}
-unsigned char *T2(_BITPACK_,64_20)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP(20,64);}
-unsigned char *T2(_BITPACK_,64_21)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP(21,64);}
-unsigned char *T2(_BITPACK_,64_22)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP(22,64);}
-unsigned char *T2(_BITPACK_,64_23)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP(23,64);}
-unsigned char *T2(_BITPACK_,64_24)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP(24,64);}
-unsigned char *T2(_BITPACK_,64_25)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP(25,64);}
-unsigned char *T2(_BITPACK_,64_26)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP(26,64);}
-unsigned char *T2(_BITPACK_,64_27)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP(27,64);}
-unsigned char *T2(_BITPACK_,64_28)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP(28,64);}
-unsigned char *T2(_BITPACK_,64_29)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP(29,64);}
-unsigned char *T2(_BITPACK_,64_30)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP(30,64);}
-unsigned char *T2(_BITPACK_,64_31)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP(31,64);}
-unsigned char *T2(_BITPACK_,64_32)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP(32,64);}
-unsigned char *T2(_BITPACK_,64_33)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP(33,64);}
-unsigned char *T2(_BITPACK_,64_34)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP(34,64);}
-unsigned char *T2(_BITPACK_,64_35)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP(35,64);}
-unsigned char *T2(_BITPACK_,64_36)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP(36,64);}
-unsigned char *T2(_BITPACK_,64_37)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP(37,64);}
-unsigned char *T2(_BITPACK_,64_38)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP(38,64);}
-unsigned char *T2(_BITPACK_,64_39)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP(39,64);}
-unsigned char *T2(_BITPACK_,64_40)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP(40,64);}
-unsigned char *T2(_BITPACK_,64_41)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP(41,64);}
-unsigned char *T2(_BITPACK_,64_42)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP(42,64);}
-unsigned char *T2(_BITPACK_,64_43)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP(43,64);}
-unsigned char *T2(_BITPACK_,64_44)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP(44,64);}
-unsigned char *T2(_BITPACK_,64_45)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP(45,64);}
-unsigned char *T2(_BITPACK_,64_46)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP(46,64);}
-unsigned char *T2(_BITPACK_,64_47)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP(47,64);}
-unsigned char *T2(_BITPACK_,64_48)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP(48,64);}
-unsigned char *T2(_BITPACK_,64_49)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP(49,64);}
-unsigned char *T2(_BITPACK_,64_50)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP(50,64);}
-unsigned char *T2(_BITPACK_,64_51)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP(51,64);}
-unsigned char *T2(_BITPACK_,64_52)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP(52,64);}
-unsigned char *T2(_BITPACK_,64_53)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP(53,64);}
-unsigned char *T2(_BITPACK_,64_54)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP(54,64);}
-unsigned char *T2(_BITPACK_,64_55)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP(55,64);}
-unsigned char *T2(_BITPACK_,64_56)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP(56,64);}
-unsigned char *T2(_BITPACK_,64_57)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP(57,64);}
-unsigned char *T2(_BITPACK_,64_58)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP(58,64);}
-unsigned char *T2(_BITPACK_,64_59)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP(59,64);}
-unsigned char *T2(_BITPACK_,64_60)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP(60,64);}
-unsigned char *T2(_BITPACK_,64_61)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP(61,64);}
-unsigned char *T2(_BITPACK_,64_62)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP(62,64);}
-unsigned char *T2(_BITPACK_,64_63)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP(63,64);}
-unsigned char *T2(_BITPACK_,64_64)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { BP(64,64);}
-BITPACK_F64 T2(_BITPACK_,a64)[] = {
- &T2(_BITPACK_,64_0),
- &T2(_BITPACK_,64_1),
- &T2(_BITPACK_,64_2),
- &T2(_BITPACK_,64_3),
- &T2(_BITPACK_,64_4),
- &T2(_BITPACK_,64_5),
- &T2(_BITPACK_,64_6),
- &T2(_BITPACK_,64_7),
- &T2(_BITPACK_,64_8),
- &T2(_BITPACK_,64_9),
- &T2(_BITPACK_,64_10),
- &T2(_BITPACK_,64_11),
- &T2(_BITPACK_,64_12),
- &T2(_BITPACK_,64_13),
- &T2(_BITPACK_,64_14),
- &T2(_BITPACK_,64_15),
- &T2(_BITPACK_,64_16),
- &T2(_BITPACK_,64_17),
- &T2(_BITPACK_,64_18),
- &T2(_BITPACK_,64_19),
- &T2(_BITPACK_,64_20),
- &T2(_BITPACK_,64_21),
- &T2(_BITPACK_,64_22),
- &T2(_BITPACK_,64_23),
- &T2(_BITPACK_,64_24),
- &T2(_BITPACK_,64_25),
- &T2(_BITPACK_,64_26),
- &T2(_BITPACK_,64_27),
- &T2(_BITPACK_,64_28),
- &T2(_BITPACK_,64_29),
- &T2(_BITPACK_,64_30),
- &T2(_BITPACK_,64_31),
- &T2(_BITPACK_,64_32),
- &T2(_BITPACK_,64_33),
- &T2(_BITPACK_,64_34),
- &T2(_BITPACK_,64_35),
- &T2(_BITPACK_,64_36),
- &T2(_BITPACK_,64_37),
- &T2(_BITPACK_,64_38),
- &T2(_BITPACK_,64_39),
- &T2(_BITPACK_,64_40),
- &T2(_BITPACK_,64_41),
- &T2(_BITPACK_,64_42),
- &T2(_BITPACK_,64_43),
- &T2(_BITPACK_,64_44),
- &T2(_BITPACK_,64_45),
- &T2(_BITPACK_,64_46),
- &T2(_BITPACK_,64_47),
- &T2(_BITPACK_,64_48),
- &T2(_BITPACK_,64_49),
- &T2(_BITPACK_,64_50),
- &T2(_BITPACK_,64_51),
- &T2(_BITPACK_,64_52),
- &T2(_BITPACK_,64_53),
- &T2(_BITPACK_,64_54),
- &T2(_BITPACK_,64_55),
- &T2(_BITPACK_,64_56),
- &T2(_BITPACK_,64_57),
- &T2(_BITPACK_,64_58),
- &T2(_BITPACK_,64_59),
- &T2(_BITPACK_,64_60),
- &T2(_BITPACK_,64_61),
- &T2(_BITPACK_,64_62),
- &T2(_BITPACK_,64_63),
- &T2(_BITPACK_,64_64)
+unsigned char *TEMPLATE2(_BITPACK_,64_0)( uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { return out; }
+unsigned char *TEMPLATE2(_BITPACK_,64_1)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*1); uint64_t v,x;do { BITPACK64_1( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_2)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*2); uint64_t v,x;do { BITPACK64_2( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_3)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*3); uint64_t v,x;do { BITPACK64_3( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_4)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*4); uint64_t v,x;do { BITPACK64_4( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_5)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*5); uint64_t v,x;do { BITPACK64_5( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_6)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*6); uint64_t v,x;do { BITPACK64_6( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_7)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*7); uint64_t v,x;do { BITPACK64_7( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_8)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*8); uint64_t v,x;do { BITPACK64_8( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_9)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*9); uint64_t v,x;do { BITPACK64_9( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_10)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*10); uint64_t v,x;do { BITPACK64_10( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_11)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*11); uint64_t v,x;do { BITPACK64_11( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_12)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*12); uint64_t v,x;do { BITPACK64_12( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_13)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*13); uint64_t v,x;do { BITPACK64_13( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_14)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*14); uint64_t v,x;do { BITPACK64_14( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_15)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*15); uint64_t v,x;do { BITPACK64_15( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_16)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*16); uint64_t v,x;do { BITPACK64_16( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_17)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*17); uint64_t v,x;do { BITPACK64_17( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_18)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*18); uint64_t v,x;do { BITPACK64_18( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_19)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*19); uint64_t v,x;do { BITPACK64_19( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_20)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*20); uint64_t v,x;do { BITPACK64_20( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_21)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*21); uint64_t v,x;do { BITPACK64_21( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_22)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*22); uint64_t v,x;do { BITPACK64_22( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_23)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*23); uint64_t v,x;do { BITPACK64_23( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_24)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*24); uint64_t v,x;do { BITPACK64_24( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_25)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*25); uint64_t v,x;do { BITPACK64_25( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_26)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*26); uint64_t v,x;do { BITPACK64_26( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_27)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*27); uint64_t v,x;do { BITPACK64_27( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_28)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*28); uint64_t v,x;do { BITPACK64_28( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_29)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*29); uint64_t v,x;do { BITPACK64_29( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_30)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*30); uint64_t v,x;do { BITPACK64_30( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_31)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*31); uint64_t v,x;do { BITPACK64_31( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_32)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*32); uint64_t v,x;do { BITPACK64_32( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_33)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*33); uint64_t v,x;do { BITPACK64_33( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_34)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*34); uint64_t v,x;do { BITPACK64_34( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_35)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*35); uint64_t v,x;do { BITPACK64_35( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_36)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*36); uint64_t v,x;do { BITPACK64_36( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_37)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*37); uint64_t v,x;do { BITPACK64_37( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_38)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*38); uint64_t v,x;do { BITPACK64_38( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_39)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*39); uint64_t v,x;do { BITPACK64_39( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_40)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*40); uint64_t v,x;do { BITPACK64_40( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_41)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*41); uint64_t v,x;do { BITPACK64_41( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_42)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*42); uint64_t v,x;do { BITPACK64_42( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_43)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*43); uint64_t v,x;do { BITPACK64_43( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_44)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*44); uint64_t v,x;do { BITPACK64_44( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_45)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*45); uint64_t v,x;do { BITPACK64_45( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_46)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*46); uint64_t v,x;do { BITPACK64_46( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_47)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*47); uint64_t v,x;do { BITPACK64_47( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_48)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*48); uint64_t v,x;do { BITPACK64_48( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_49)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*49); uint64_t v,x;do { BITPACK64_49( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_50)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*50); uint64_t v,x;do { BITPACK64_50( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_51)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*51); uint64_t v,x;do { BITPACK64_51( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_52)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*52); uint64_t v,x;do { BITPACK64_52( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_53)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*53); uint64_t v,x;do { BITPACK64_53( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_54)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*54); uint64_t v,x;do { BITPACK64_54( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_55)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*55); uint64_t v,x;do { BITPACK64_55( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_56)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*56); uint64_t v,x;do { BITPACK64_56( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_57)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*57); uint64_t v,x;do { BITPACK64_57( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_58)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*58); uint64_t v,x;do { BITPACK64_58( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_59)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*59); uint64_t v,x;do { BITPACK64_59( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_60)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*60); uint64_t v,x;do { BITPACK64_60( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_61)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*61); uint64_t v,x;do { BITPACK64_61( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_62)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*62); uint64_t v,x;do { BITPACK64_62( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_63)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*63); uint64_t v,x;do { BITPACK64_63( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_64)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out ) { unsigned char *out_=out+PAD8(n*64); uint64_t v,x;do { BITPACK64_64( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+BITPACK_F64 TEMPLATE2(_BITPACK_,a64)[] = {
+ &TEMPLATE2(_BITPACK_,64_0),
+ &TEMPLATE2(_BITPACK_,64_1),
+ &TEMPLATE2(_BITPACK_,64_2),
+ &TEMPLATE2(_BITPACK_,64_3),
+ &TEMPLATE2(_BITPACK_,64_4),
+ &TEMPLATE2(_BITPACK_,64_5),
+ &TEMPLATE2(_BITPACK_,64_6),
+ &TEMPLATE2(_BITPACK_,64_7),
+ &TEMPLATE2(_BITPACK_,64_8),
+ &TEMPLATE2(_BITPACK_,64_9),
+ &TEMPLATE2(_BITPACK_,64_10),
+ &TEMPLATE2(_BITPACK_,64_11),
+ &TEMPLATE2(_BITPACK_,64_12),
+ &TEMPLATE2(_BITPACK_,64_13),
+ &TEMPLATE2(_BITPACK_,64_14),
+ &TEMPLATE2(_BITPACK_,64_15),
+ &TEMPLATE2(_BITPACK_,64_16),
+ &TEMPLATE2(_BITPACK_,64_17),
+ &TEMPLATE2(_BITPACK_,64_18),
+ &TEMPLATE2(_BITPACK_,64_19),
+ &TEMPLATE2(_BITPACK_,64_20),
+ &TEMPLATE2(_BITPACK_,64_21),
+ &TEMPLATE2(_BITPACK_,64_22),
+ &TEMPLATE2(_BITPACK_,64_23),
+ &TEMPLATE2(_BITPACK_,64_24),
+ &TEMPLATE2(_BITPACK_,64_25),
+ &TEMPLATE2(_BITPACK_,64_26),
+ &TEMPLATE2(_BITPACK_,64_27),
+ &TEMPLATE2(_BITPACK_,64_28),
+ &TEMPLATE2(_BITPACK_,64_29),
+ &TEMPLATE2(_BITPACK_,64_30),
+ &TEMPLATE2(_BITPACK_,64_31),
+ &TEMPLATE2(_BITPACK_,64_32),
+ &TEMPLATE2(_BITPACK_,64_33),
+ &TEMPLATE2(_BITPACK_,64_34),
+ &TEMPLATE2(_BITPACK_,64_35),
+ &TEMPLATE2(_BITPACK_,64_36),
+ &TEMPLATE2(_BITPACK_,64_37),
+ &TEMPLATE2(_BITPACK_,64_38),
+ &TEMPLATE2(_BITPACK_,64_39),
+ &TEMPLATE2(_BITPACK_,64_40),
+ &TEMPLATE2(_BITPACK_,64_41),
+ &TEMPLATE2(_BITPACK_,64_42),
+ &TEMPLATE2(_BITPACK_,64_43),
+ &TEMPLATE2(_BITPACK_,64_44),
+ &TEMPLATE2(_BITPACK_,64_45),
+ &TEMPLATE2(_BITPACK_,64_46),
+ &TEMPLATE2(_BITPACK_,64_47),
+ &TEMPLATE2(_BITPACK_,64_48),
+ &TEMPLATE2(_BITPACK_,64_49),
+ &TEMPLATE2(_BITPACK_,64_50),
+ &TEMPLATE2(_BITPACK_,64_51),
+ &TEMPLATE2(_BITPACK_,64_52),
+ &TEMPLATE2(_BITPACK_,64_53),
+ &TEMPLATE2(_BITPACK_,64_54),
+ &TEMPLATE2(_BITPACK_,64_55),
+ &TEMPLATE2(_BITPACK_,64_56),
+ &TEMPLATE2(_BITPACK_,64_57),
+ &TEMPLATE2(_BITPACK_,64_58),
+ &TEMPLATE2(_BITPACK_,64_59),
+ &TEMPLATE2(_BITPACK_,64_60),
+ &TEMPLATE2(_BITPACK_,64_61),
+ &TEMPLATE2(_BITPACK_,64_62),
+ &TEMPLATE2(_BITPACK_,64_63),
+ &TEMPLATE2(_BITPACK_,64_64)
};
-unsigned char *T2(_BITPACK_,64)( uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , unsigned b) { return T2(_BITPACK_,a64)[ b](in, n, out); }
-#undef USIZE
+unsigned char *TEMPLATE2(_BITPACK_,64)( uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , unsigned b) { return TEMPLATE2(_BITPACK_,a64)[ b](in, n, out); }
#else
#define USIZE 8
-unsigned char *T2(_BITPACK_,8_0)( uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint8_t start ) { return out; }
-unsigned char *T2(_BITPACK_,8_1)(uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint8_t start ) { BP(1,8);}
-unsigned char *T2(_BITPACK_,8_2)(uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint8_t start ) { BP(2,8);}
-unsigned char *T2(_BITPACK_,8_3)(uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint8_t start ) { BP(3,8);}
-unsigned char *T2(_BITPACK_,8_4)(uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint8_t start ) { BP(4,8);}
-unsigned char *T2(_BITPACK_,8_5)(uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint8_t start ) { BP(5,8);}
-unsigned char *T2(_BITPACK_,8_6)(uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint8_t start ) { BP(6,8);}
-unsigned char *T2(_BITPACK_,8_7)(uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint8_t start ) { BP(7,8);}
-unsigned char *T2(_BITPACK_,8_8)(uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint8_t start ) { BP(8,8);}
-BITPACK_D8 T2(_BITPACK_,a8)[] = {
- &T2(_BITPACK_,8_0),
- &T2(_BITPACK_,8_1),
- &T2(_BITPACK_,8_2),
- &T2(_BITPACK_,8_3),
- &T2(_BITPACK_,8_4),
- &T2(_BITPACK_,8_5),
- &T2(_BITPACK_,8_6),
- &T2(_BITPACK_,8_7),
- &T2(_BITPACK_,8_8)
+unsigned char *TEMPLATE2(_BITPACK_,8_0)( uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint8_t start ) { return out; }
+unsigned char *TEMPLATE2(_BITPACK_,8_1)(uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint8_t start ) { unsigned char *out_=out+PAD8(n*1); uint8_t v,x=0;do { BITPACK64_1( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,8_2)(uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint8_t start ) { unsigned char *out_=out+PAD8(n*2); uint8_t v,x=0;do { BITPACK64_2( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,8_3)(uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint8_t start ) { unsigned char *out_=out+PAD8(n*3); uint8_t v,x=0;do { BITPACK64_3( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,8_4)(uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint8_t start ) { unsigned char *out_=out+PAD8(n*4); uint8_t v,x=0;do { BITPACK64_4( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,8_5)(uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint8_t start ) { unsigned char *out_=out+PAD8(n*5); uint8_t v,x=0;do { BITPACK64_5( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,8_6)(uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint8_t start ) { unsigned char *out_=out+PAD8(n*6); uint8_t v,x=0;do { BITPACK64_6( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,8_7)(uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint8_t start ) { unsigned char *out_=out+PAD8(n*7); uint8_t v,x=0;do { BITPACK64_7( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,8_8)(uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint8_t start ) { unsigned char *out_=out+PAD8(n*8); uint8_t v,x=0;do { BITPACK64_8( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+BITPACK_D8 TEMPLATE2(_BITPACK_,a8)[] = {
+ &TEMPLATE2(_BITPACK_,8_0),
+ &TEMPLATE2(_BITPACK_,8_1),
+ &TEMPLATE2(_BITPACK_,8_2),
+ &TEMPLATE2(_BITPACK_,8_3),
+ &TEMPLATE2(_BITPACK_,8_4),
+ &TEMPLATE2(_BITPACK_,8_5),
+ &TEMPLATE2(_BITPACK_,8_6),
+ &TEMPLATE2(_BITPACK_,8_7),
+ &TEMPLATE2(_BITPACK_,8_8)
};
-unsigned char *T2(_BITPACK_,8)( uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint8_t start, unsigned b) { return T2(_BITPACK_,a8)[ b](in, n, out, start); }
-#undef USIZE
+unsigned char *TEMPLATE2(_BITPACK_,8)( uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint8_t start, unsigned b) { return TEMPLATE2(_BITPACK_,a8)[ b](in, n, out, start); }
#define USIZE 16
-unsigned char *T2(_BITPACK_,16_0 )(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint16_t start ) { return out; }
-unsigned char *T2(_BITPACK_,16_1 )(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint16_t start ) { BP( 1,16);}
-unsigned char *T2(_BITPACK_,16_2 )(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint16_t start ) { BP( 2,16);}
-unsigned char *T2(_BITPACK_,16_3 )(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint16_t start ) { BP( 3,16);}
-unsigned char *T2(_BITPACK_,16_4 )(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint16_t start ) { BP( 4,16);}
-unsigned char *T2(_BITPACK_,16_5 )(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint16_t start ) { BP( 5,16);}
-unsigned char *T2(_BITPACK_,16_6 )(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint16_t start ) { BP( 6,16);}
-unsigned char *T2(_BITPACK_,16_7 )(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint16_t start ) { BP( 7,16);}
-unsigned char *T2(_BITPACK_,16_8 )(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint16_t start ) { BP( 8,16);}
-unsigned char *T2(_BITPACK_,16_9 )(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint16_t start ) { BP( 9,16);}
-unsigned char *T2(_BITPACK_,16_10)(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint16_t start ) { BP(10,16);}
-unsigned char *T2(_BITPACK_,16_11)(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint16_t start ) { BP(11,16);}
-unsigned char *T2(_BITPACK_,16_12)(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint16_t start ) { BP(12,16);}
-unsigned char *T2(_BITPACK_,16_13)(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint16_t start ) { BP(13,16);}
-unsigned char *T2(_BITPACK_,16_14)(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint16_t start ) { BP(14,16);}
-unsigned char *T2(_BITPACK_,16_15)(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint16_t start ) { BP(15,16);}
-unsigned char *T2(_BITPACK_,16_16)(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint16_t start ) { BP(16,16);}
-BITPACK_D16 T2(_BITPACK_,a16)[] = {
- &T2(_BITPACK_,16_0),
- &T2(_BITPACK_,16_1),
- &T2(_BITPACK_,16_2),
- &T2(_BITPACK_,16_3),
- &T2(_BITPACK_,16_4),
- &T2(_BITPACK_,16_5),
- &T2(_BITPACK_,16_6),
- &T2(_BITPACK_,16_7),
- &T2(_BITPACK_,16_8),
- &T2(_BITPACK_,16_9),
- &T2(_BITPACK_,16_10),
- &T2(_BITPACK_,16_11),
- &T2(_BITPACK_,16_12),
- &T2(_BITPACK_,16_13),
- &T2(_BITPACK_,16_14),
- &T2(_BITPACK_,16_15),
- &T2(_BITPACK_,16_16)
+unsigned char *TEMPLATE2(_BITPACK_,16_0)( uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint16_t start ) { return out; } /* Index-0 case: does no work and returns out unchanged; in, n and start are unused. */
+unsigned char *TEMPLATE2(_BITPACK_,16_1)(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint16_t start ) { unsigned char *out_=out+PAD8(n*1); uint16_t v,x=0;do { BITPACK64_1( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } /* Runs BITPACK64_1 in a do-while (so at least once, even when n==0) until out reaches out_ = initial out + PAD8(n*1); returns out_. NOTE(review): presumably the macro advances in/out - it is defined elsewhere. */
+unsigned char *TEMPLATE2(_BITPACK_,16_2)(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint16_t start ) { unsigned char *out_=out+PAD8(n*2); uint16_t v,x=0;do { BITPACK64_2( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } /* Runs BITPACK64_2 in a do-while (so at least once, even when n==0) until out reaches out_ = initial out + PAD8(n*2); returns out_. NOTE(review): presumably the macro advances in/out - it is defined elsewhere. */
+unsigned char *TEMPLATE2(_BITPACK_,16_3)(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint16_t start ) { unsigned char *out_=out+PAD8(n*3); uint16_t v,x=0;do { BITPACK64_3( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } /* Runs BITPACK64_3 in a do-while (so at least once, even when n==0) until out reaches out_ = initial out + PAD8(n*3); returns out_. NOTE(review): presumably the macro advances in/out - it is defined elsewhere. */
+unsigned char *TEMPLATE2(_BITPACK_,16_4)(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint16_t start ) { unsigned char *out_=out+PAD8(n*4); uint16_t v,x=0;do { BITPACK64_4( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } /* Runs BITPACK64_4 in a do-while (so at least once, even when n==0) until out reaches out_ = initial out + PAD8(n*4); returns out_. NOTE(review): presumably the macro advances in/out - it is defined elsewhere. */
+unsigned char *TEMPLATE2(_BITPACK_,16_5)(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint16_t start ) { unsigned char *out_=out+PAD8(n*5); uint16_t v,x=0;do { BITPACK64_5( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } /* Runs BITPACK64_5 in a do-while (so at least once, even when n==0) until out reaches out_ = initial out + PAD8(n*5); returns out_. NOTE(review): presumably the macro advances in/out - it is defined elsewhere. */
+unsigned char *TEMPLATE2(_BITPACK_,16_6)(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint16_t start ) { unsigned char *out_=out+PAD8(n*6); uint16_t v,x=0;do { BITPACK64_6( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } /* Runs BITPACK64_6 in a do-while (so at least once, even when n==0) until out reaches out_ = initial out + PAD8(n*6); returns out_. NOTE(review): presumably the macro advances in/out - it is defined elsewhere. */
+unsigned char *TEMPLATE2(_BITPACK_,16_7)(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint16_t start ) { unsigned char *out_=out+PAD8(n*7); uint16_t v,x=0;do { BITPACK64_7( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } /* Runs BITPACK64_7 in a do-while (so at least once, even when n==0) until out reaches out_ = initial out + PAD8(n*7); returns out_. NOTE(review): presumably the macro advances in/out - it is defined elsewhere. */
+unsigned char *TEMPLATE2(_BITPACK_,16_8)(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint16_t start ) { unsigned char *out_=out+PAD8(n*8); uint16_t v,x=0;do { BITPACK64_8( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } /* Runs BITPACK64_8 in a do-while (so at least once, even when n==0) until out reaches out_ = initial out + PAD8(n*8); returns out_. NOTE(review): presumably the macro advances in/out - it is defined elsewhere. */
+unsigned char *TEMPLATE2(_BITPACK_,16_9)(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint16_t start ) { unsigned char *out_=out+PAD8(n*9); uint16_t v,x=0;do { BITPACK64_9( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } /* Runs BITPACK64_9 in a do-while (so at least once, even when n==0) until out reaches out_ = initial out + PAD8(n*9); returns out_. NOTE(review): presumably the macro advances in/out - it is defined elsewhere. */
+unsigned char *TEMPLATE2(_BITPACK_,16_10)(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint16_t start ) { unsigned char *out_=out+PAD8(n*10); uint16_t v,x=0;do { BITPACK64_10( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } /* Runs BITPACK64_10 in a do-while (so at least once, even when n==0) until out reaches out_ = initial out + PAD8(n*10); returns out_. NOTE(review): presumably the macro advances in/out - it is defined elsewhere. */
+unsigned char *TEMPLATE2(_BITPACK_,16_11)(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint16_t start ) { unsigned char *out_=out+PAD8(n*11); uint16_t v,x=0;do { BITPACK64_11( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } /* Runs BITPACK64_11 in a do-while (so at least once, even when n==0) until out reaches out_ = initial out + PAD8(n*11); returns out_. NOTE(review): presumably the macro advances in/out - it is defined elsewhere. */
+unsigned char *TEMPLATE2(_BITPACK_,16_12)(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint16_t start ) { unsigned char *out_=out+PAD8(n*12); uint16_t v,x=0;do { BITPACK64_12( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } /* Runs BITPACK64_12 in a do-while (so at least once, even when n==0) until out reaches out_ = initial out + PAD8(n*12); returns out_. NOTE(review): presumably the macro advances in/out - it is defined elsewhere. */
+unsigned char *TEMPLATE2(_BITPACK_,16_13)(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint16_t start ) { unsigned char *out_=out+PAD8(n*13); uint16_t v,x=0;do { BITPACK64_13( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } /* Runs BITPACK64_13 in a do-while (so at least once, even when n==0) until out reaches out_ = initial out + PAD8(n*13); returns out_. NOTE(review): presumably the macro advances in/out - it is defined elsewhere. */
+unsigned char *TEMPLATE2(_BITPACK_,16_14)(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint16_t start ) { unsigned char *out_=out+PAD8(n*14); uint16_t v,x=0;do { BITPACK64_14( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } /* Runs BITPACK64_14 in a do-while (so at least once, even when n==0) until out reaches out_ = initial out + PAD8(n*14); returns out_. NOTE(review): presumably the macro advances in/out - it is defined elsewhere. */
+unsigned char *TEMPLATE2(_BITPACK_,16_15)(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint16_t start ) { unsigned char *out_=out+PAD8(n*15); uint16_t v,x=0;do { BITPACK64_15( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } /* Runs BITPACK64_15 in a do-while (so at least once, even when n==0) until out reaches out_ = initial out + PAD8(n*15); returns out_. NOTE(review): presumably the macro advances in/out - it is defined elsewhere. */
+unsigned char *TEMPLATE2(_BITPACK_,16_16)(uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint16_t start ) { unsigned char *out_=out+PAD8(n*16); uint16_t v,x=0;do { BITPACK64_16( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } /* Runs BITPACK64_16 in a do-while (so at least once, even when n==0) until out reaches out_ = initial out + PAD8(n*16); returns out_. NOTE(review): presumably the macro advances in/out - it is defined elsewhere. */
+BITPACK_D16 TEMPLATE2(_BITPACK_,a16)[] = { /* function-pointer table for the uint16_t routines: entry i is the 16_i routine, i = 0..16 (17 entries) */
+ &TEMPLATE2(_BITPACK_,16_0),
+ &TEMPLATE2(_BITPACK_,16_1),
+ &TEMPLATE2(_BITPACK_,16_2),
+ &TEMPLATE2(_BITPACK_,16_3),
+ &TEMPLATE2(_BITPACK_,16_4),
+ &TEMPLATE2(_BITPACK_,16_5),
+ &TEMPLATE2(_BITPACK_,16_6),
+ &TEMPLATE2(_BITPACK_,16_7),
+ &TEMPLATE2(_BITPACK_,16_8),
+ &TEMPLATE2(_BITPACK_,16_9),
+ &TEMPLATE2(_BITPACK_,16_10),
+ &TEMPLATE2(_BITPACK_,16_11),
+ &TEMPLATE2(_BITPACK_,16_12),
+ &TEMPLATE2(_BITPACK_,16_13),
+ &TEMPLATE2(_BITPACK_,16_14),
+ &TEMPLATE2(_BITPACK_,16_15),
+ &TEMPLATE2(_BITPACK_,16_16)
};
-unsigned char *T2(_BITPACK_,16)( uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint16_t start, unsigned b) { return T2(_BITPACK_,a16)[ b](in, n, out, start); }
-#undef USIZE
+unsigned char *TEMPLATE2(_BITPACK_,16)( uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint16_t start, unsigned b) { return TEMPLATE2(_BITPACK_,a16)[ b](in, n, out, start); } /* Forwards to entry b of the a16 table and returns its result. b is not range-checked here; the table has 17 entries, so callers must pass b <= 16. */
#define USIZE 32
-unsigned char *T2(_BITPACK_,32_0 )( uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { return out; }
-unsigned char *T2(_BITPACK_,32_1 )(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { BP( 1,32);}
-unsigned char *T2(_BITPACK_,32_2 )(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { BP( 2,32);}
-unsigned char *T2(_BITPACK_,32_3 )(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { BP( 3,32);}
-unsigned char *T2(_BITPACK_,32_4 )(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { BP( 4,32);}
-unsigned char *T2(_BITPACK_,32_5 )(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { BP( 5,32);}
-unsigned char *T2(_BITPACK_,32_6 )(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { BP( 6,32);}
-unsigned char *T2(_BITPACK_,32_7 )(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { BP( 7,32);}
-unsigned char *T2(_BITPACK_,32_8 )(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { BP( 8,32);}
-unsigned char *T2(_BITPACK_,32_9 )(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { BP( 9,32);}
-unsigned char *T2(_BITPACK_,32_10)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { BP(10,32);}
-unsigned char *T2(_BITPACK_,32_11)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { BP(11,32);}
-unsigned char *T2(_BITPACK_,32_12)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { BP(12,32);}
-unsigned char *T2(_BITPACK_,32_13)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { BP(13,32);}
-unsigned char *T2(_BITPACK_,32_14)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { BP(14,32);}
-unsigned char *T2(_BITPACK_,32_15)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { BP(15,32);}
-unsigned char *T2(_BITPACK_,32_16)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { BP(16,32);}
-unsigned char *T2(_BITPACK_,32_17)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { BP(17,32);}
-unsigned char *T2(_BITPACK_,32_18)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { BP(18,32);}
-unsigned char *T2(_BITPACK_,32_19)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { BP(19,32);}
-unsigned char *T2(_BITPACK_,32_20)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { BP(20,32);}
-unsigned char *T2(_BITPACK_,32_21)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { BP(21,32);}
-unsigned char *T2(_BITPACK_,32_22)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { BP(22,32);}
-unsigned char *T2(_BITPACK_,32_23)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { BP(23,32);}
-unsigned char *T2(_BITPACK_,32_24)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { BP(24,32);}
-unsigned char *T2(_BITPACK_,32_25)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { BP(25,32);}
-unsigned char *T2(_BITPACK_,32_26)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { BP(26,32);}
-unsigned char *T2(_BITPACK_,32_27)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { BP(27,32);}
-unsigned char *T2(_BITPACK_,32_28)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { BP(28,32);}
-unsigned char *T2(_BITPACK_,32_29)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { BP(29,32);}
-unsigned char *T2(_BITPACK_,32_30)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { BP(30,32);}
-unsigned char *T2(_BITPACK_,32_31)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { BP(31,32);}
-unsigned char *T2(_BITPACK_,32_32)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { BP(32,32);}
-BITPACK_D32 T2(_BITPACK_,a32)[] = {
- &T2(_BITPACK_,32_0),
- &T2(_BITPACK_,32_1),
- &T2(_BITPACK_,32_2),
- &T2(_BITPACK_,32_3),
- &T2(_BITPACK_,32_4),
- &T2(_BITPACK_,32_5),
- &T2(_BITPACK_,32_6),
- &T2(_BITPACK_,32_7),
- &T2(_BITPACK_,32_8),
- &T2(_BITPACK_,32_9),
- &T2(_BITPACK_,32_10),
- &T2(_BITPACK_,32_11),
- &T2(_BITPACK_,32_12),
- &T2(_BITPACK_,32_13),
- &T2(_BITPACK_,32_14),
- &T2(_BITPACK_,32_15),
- &T2(_BITPACK_,32_16),
- &T2(_BITPACK_,32_17),
- &T2(_BITPACK_,32_18),
- &T2(_BITPACK_,32_19),
- &T2(_BITPACK_,32_20),
- &T2(_BITPACK_,32_21),
- &T2(_BITPACK_,32_22),
- &T2(_BITPACK_,32_23),
- &T2(_BITPACK_,32_24),
- &T2(_BITPACK_,32_25),
- &T2(_BITPACK_,32_26),
- &T2(_BITPACK_,32_27),
- &T2(_BITPACK_,32_28),
- &T2(_BITPACK_,32_29),
- &T2(_BITPACK_,32_30),
- &T2(_BITPACK_,32_31),
- &T2(_BITPACK_,32_32)
+unsigned char *TEMPLATE2(_BITPACK_,32_0)( uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { return out; } /* Index-0 case: does no work and returns out unchanged; in, n and start are unused. */
+unsigned char *TEMPLATE2(_BITPACK_,32_1)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { unsigned char *out_=out+PAD8(n*1); uint32_t v,x=0;do { BITPACK64_1( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } /* Runs BITPACK64_1 in a do-while (so at least once, even when n==0) until out reaches out_ = initial out + PAD8(n*1); returns out_. NOTE(review): presumably the macro advances in/out - it is defined elsewhere. */
+unsigned char *TEMPLATE2(_BITPACK_,32_2)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { unsigned char *out_=out+PAD8(n*2); uint32_t v,x=0;do { BITPACK64_2( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } /* Runs BITPACK64_2 in a do-while (so at least once, even when n==0) until out reaches out_ = initial out + PAD8(n*2); returns out_. NOTE(review): presumably the macro advances in/out - it is defined elsewhere. */
+unsigned char *TEMPLATE2(_BITPACK_,32_3)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { unsigned char *out_=out+PAD8(n*3); uint32_t v,x=0;do { BITPACK64_3( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } /* Runs BITPACK64_3 in a do-while (so at least once, even when n==0) until out reaches out_ = initial out + PAD8(n*3); returns out_. NOTE(review): presumably the macro advances in/out - it is defined elsewhere. */
+unsigned char *TEMPLATE2(_BITPACK_,32_4)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { unsigned char *out_=out+PAD8(n*4); uint32_t v,x=0;do { BITPACK64_4( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } /* Runs BITPACK64_4 in a do-while (so at least once, even when n==0) until out reaches out_ = initial out + PAD8(n*4); returns out_. NOTE(review): presumably the macro advances in/out - it is defined elsewhere. */
+unsigned char *TEMPLATE2(_BITPACK_,32_5)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { unsigned char *out_=out+PAD8(n*5); uint32_t v,x=0;do { BITPACK64_5( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } /* Runs BITPACK64_5 in a do-while (so at least once, even when n==0) until out reaches out_ = initial out + PAD8(n*5); returns out_. NOTE(review): presumably the macro advances in/out - it is defined elsewhere. */
+unsigned char *TEMPLATE2(_BITPACK_,32_6)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { unsigned char *out_=out+PAD8(n*6); uint32_t v,x=0;do { BITPACK64_6( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } /* Runs BITPACK64_6 in a do-while (so at least once, even when n==0) until out reaches out_ = initial out + PAD8(n*6); returns out_. NOTE(review): presumably the macro advances in/out - it is defined elsewhere. */
+unsigned char *TEMPLATE2(_BITPACK_,32_7)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { unsigned char *out_=out+PAD8(n*7); uint32_t v,x=0;do { BITPACK64_7( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } /* Runs BITPACK64_7 in a do-while (so at least once, even when n==0) until out reaches out_ = initial out + PAD8(n*7); returns out_. NOTE(review): presumably the macro advances in/out - it is defined elsewhere. */
+unsigned char *TEMPLATE2(_BITPACK_,32_8)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { unsigned char *out_=out+PAD8(n*8); uint32_t v,x=0;do { BITPACK64_8( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } /* Runs BITPACK64_8 in a do-while (so at least once, even when n==0) until out reaches out_ = initial out + PAD8(n*8); returns out_. NOTE(review): presumably the macro advances in/out - it is defined elsewhere. */
+unsigned char *TEMPLATE2(_BITPACK_,32_9)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { unsigned char *out_=out+PAD8(n*9); uint32_t v,x=0;do { BITPACK64_9( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } /* Runs BITPACK64_9 in a do-while (so at least once, even when n==0) until out reaches out_ = initial out + PAD8(n*9); returns out_. NOTE(review): presumably the macro advances in/out - it is defined elsewhere. */
+unsigned char *TEMPLATE2(_BITPACK_,32_10)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { unsigned char *out_=out+PAD8(n*10); uint32_t v,x=0;do { BITPACK64_10( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } /* Runs BITPACK64_10 in a do-while (so at least once, even when n==0) until out reaches out_ = initial out + PAD8(n*10); returns out_. NOTE(review): presumably the macro advances in/out - it is defined elsewhere. */
+unsigned char *TEMPLATE2(_BITPACK_,32_11)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { unsigned char *out_=out+PAD8(n*11); uint32_t v,x=0;do { BITPACK64_11( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } /* Runs BITPACK64_11 in a do-while (so at least once, even when n==0) until out reaches out_ = initial out + PAD8(n*11); returns out_. NOTE(review): presumably the macro advances in/out - it is defined elsewhere. */
+unsigned char *TEMPLATE2(_BITPACK_,32_12)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { unsigned char *out_=out+PAD8(n*12); uint32_t v,x=0;do { BITPACK64_12( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } /* Runs BITPACK64_12 in a do-while (so at least once, even when n==0) until out reaches out_ = initial out + PAD8(n*12); returns out_. NOTE(review): presumably the macro advances in/out - it is defined elsewhere. */
+unsigned char *TEMPLATE2(_BITPACK_,32_13)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { unsigned char *out_=out+PAD8(n*13); uint32_t v,x=0;do { BITPACK64_13( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } /* Runs BITPACK64_13 in a do-while (so at least once, even when n==0) until out reaches out_ = initial out + PAD8(n*13); returns out_. NOTE(review): presumably the macro advances in/out - it is defined elsewhere. */
+unsigned char *TEMPLATE2(_BITPACK_,32_14)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { unsigned char *out_=out+PAD8(n*14); uint32_t v,x=0;do { BITPACK64_14( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } /* Runs BITPACK64_14 in a do-while (so at least once, even when n==0) until out reaches out_ = initial out + PAD8(n*14); returns out_. NOTE(review): presumably the macro advances in/out - it is defined elsewhere. */
+unsigned char *TEMPLATE2(_BITPACK_,32_15)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { unsigned char *out_=out+PAD8(n*15); uint32_t v,x=0;do { BITPACK64_15( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } /* Runs BITPACK64_15 in a do-while (so at least once, even when n==0) until out reaches out_ = initial out + PAD8(n*15); returns out_. NOTE(review): presumably the macro advances in/out - it is defined elsewhere. */
+unsigned char *TEMPLATE2(_BITPACK_,32_16)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { unsigned char *out_=out+PAD8(n*16); uint32_t v,x=0;do { BITPACK64_16( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } /* Runs BITPACK64_16 in a do-while (so at least once, even when n==0) until out reaches out_ = initial out + PAD8(n*16); returns out_. NOTE(review): presumably the macro advances in/out - it is defined elsewhere. */
+unsigned char *TEMPLATE2(_BITPACK_,32_17)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { unsigned char *out_=out+PAD8(n*17); uint32_t v,x=0;do { BITPACK64_17( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } /* Runs BITPACK64_17 in a do-while (so at least once, even when n==0) until out reaches out_ = initial out + PAD8(n*17); returns out_. NOTE(review): presumably the macro advances in/out - it is defined elsewhere. */
+unsigned char *TEMPLATE2(_BITPACK_,32_18)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { unsigned char *out_=out+PAD8(n*18); uint32_t v,x=0;do { BITPACK64_18( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } /* Runs BITPACK64_18 in a do-while (so at least once, even when n==0) until out reaches out_ = initial out + PAD8(n*18); returns out_. NOTE(review): presumably the macro advances in/out - it is defined elsewhere. */
+unsigned char *TEMPLATE2(_BITPACK_,32_19)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { unsigned char *out_=out+PAD8(n*19); uint32_t v,x=0;do { BITPACK64_19( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } /* Runs BITPACK64_19 in a do-while (so at least once, even when n==0) until out reaches out_ = initial out + PAD8(n*19); returns out_. NOTE(review): presumably the macro advances in/out - it is defined elsewhere. */
+unsigned char *TEMPLATE2(_BITPACK_,32_20)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { unsigned char *out_=out+PAD8(n*20); uint32_t v,x=0;do { BITPACK64_20( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } /* Runs BITPACK64_20 in a do-while (so at least once, even when n==0) until out reaches out_ = initial out + PAD8(n*20); returns out_. NOTE(review): presumably the macro advances in/out - it is defined elsewhere. */
+unsigned char *TEMPLATE2(_BITPACK_,32_21)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { unsigned char *out_=out+PAD8(n*21); uint32_t v,x=0;do { BITPACK64_21( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } /* Runs BITPACK64_21 in a do-while (so at least once, even when n==0) until out reaches out_ = initial out + PAD8(n*21); returns out_. NOTE(review): presumably the macro advances in/out - it is defined elsewhere. */
+unsigned char *TEMPLATE2(_BITPACK_,32_22)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { unsigned char *out_=out+PAD8(n*22); uint32_t v,x=0;do { BITPACK64_22( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } /* Runs BITPACK64_22 in a do-while (so at least once, even when n==0) until out reaches out_ = initial out + PAD8(n*22); returns out_. NOTE(review): presumably the macro advances in/out - it is defined elsewhere. */
+unsigned char *TEMPLATE2(_BITPACK_,32_23)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { unsigned char *out_=out+PAD8(n*23); uint32_t v,x=0;do { BITPACK64_23( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } /* Runs BITPACK64_23 in a do-while (so at least once, even when n==0) until out reaches out_ = initial out + PAD8(n*23); returns out_. NOTE(review): presumably the macro advances in/out - it is defined elsewhere. */
+unsigned char *TEMPLATE2(_BITPACK_,32_24)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { unsigned char *out_=out+PAD8(n*24); uint32_t v,x=0;do { BITPACK64_24( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } /* Runs BITPACK64_24 in a do-while (so at least once, even when n==0) until out reaches out_ = initial out + PAD8(n*24); returns out_. NOTE(review): presumably the macro advances in/out - it is defined elsewhere. */
+unsigned char *TEMPLATE2(_BITPACK_,32_25)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { unsigned char *out_=out+PAD8(n*25); uint32_t v,x=0;do { BITPACK64_25( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } /* Runs BITPACK64_25 in a do-while (so at least once, even when n==0) until out reaches out_ = initial out + PAD8(n*25); returns out_. NOTE(review): presumably the macro advances in/out - it is defined elsewhere. */
+unsigned char *TEMPLATE2(_BITPACK_,32_26)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { unsigned char *out_=out+PAD8(n*26); uint32_t v,x=0;do { BITPACK64_26( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } /* Runs BITPACK64_26 in a do-while (so at least once, even when n==0) until out reaches out_ = initial out + PAD8(n*26); returns out_. NOTE(review): presumably the macro advances in/out - it is defined elsewhere. */
+unsigned char *TEMPLATE2(_BITPACK_,32_27)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { unsigned char *out_=out+PAD8(n*27); uint32_t v,x=0;do { BITPACK64_27( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } /* Runs BITPACK64_27 in a do-while (so at least once, even when n==0) until out reaches out_ = initial out + PAD8(n*27); returns out_. NOTE(review): presumably the macro advances in/out - it is defined elsewhere. */
+unsigned char *TEMPLATE2(_BITPACK_,32_28)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { unsigned char *out_=out+PAD8(n*28); uint32_t v,x=0;do { BITPACK64_28( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } /* Runs BITPACK64_28 in a do-while (so at least once, even when n==0) until out reaches out_ = initial out + PAD8(n*28); returns out_. NOTE(review): presumably the macro advances in/out - it is defined elsewhere. */
+unsigned char *TEMPLATE2(_BITPACK_,32_29)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { unsigned char *out_=out+PAD8(n*29); uint32_t v,x=0;do { BITPACK64_29( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } /* Runs BITPACK64_29 in a do-while (so at least once, even when n==0) until out reaches out_ = initial out + PAD8(n*29); returns out_. NOTE(review): presumably the macro advances in/out - it is defined elsewhere. */
+unsigned char *TEMPLATE2(_BITPACK_,32_30)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { unsigned char *out_=out+PAD8(n*30); uint32_t v,x=0;do { BITPACK64_30( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } /* Runs BITPACK64_30 in a do-while (so at least once, even when n==0) until out reaches out_ = initial out + PAD8(n*30); returns out_. NOTE(review): presumably the macro advances in/out - it is defined elsewhere. */
+unsigned char *TEMPLATE2(_BITPACK_,32_31)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { unsigned char *out_=out+PAD8(n*31); uint32_t v,x=0;do { BITPACK64_31( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } /* Runs BITPACK64_31 in a do-while (so at least once, even when n==0) until out reaches out_ = initial out + PAD8(n*31); returns out_. NOTE(review): presumably the macro advances in/out - it is defined elsewhere. */
+unsigned char *TEMPLATE2(_BITPACK_,32_32)(uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start ) { unsigned char *out_=out+PAD8(n*32); uint32_t v,x=0;do { BITPACK64_32( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; } /* Runs BITPACK64_32 in a do-while (so at least once, even when n==0) until out reaches out_ = initial out + PAD8(n*32); returns out_. NOTE(review): presumably the macro advances in/out - it is defined elsewhere. */
+BITPACK_D32 TEMPLATE2(_BITPACK_,a32)[] = { /* function-pointer table for the uint32_t routines: entry i is the 32_i routine, i = 0..32 (33 entries) */
+ &TEMPLATE2(_BITPACK_,32_0),
+ &TEMPLATE2(_BITPACK_,32_1),
+ &TEMPLATE2(_BITPACK_,32_2),
+ &TEMPLATE2(_BITPACK_,32_3),
+ &TEMPLATE2(_BITPACK_,32_4),
+ &TEMPLATE2(_BITPACK_,32_5),
+ &TEMPLATE2(_BITPACK_,32_6),
+ &TEMPLATE2(_BITPACK_,32_7),
+ &TEMPLATE2(_BITPACK_,32_8),
+ &TEMPLATE2(_BITPACK_,32_9),
+ &TEMPLATE2(_BITPACK_,32_10),
+ &TEMPLATE2(_BITPACK_,32_11),
+ &TEMPLATE2(_BITPACK_,32_12),
+ &TEMPLATE2(_BITPACK_,32_13),
+ &TEMPLATE2(_BITPACK_,32_14),
+ &TEMPLATE2(_BITPACK_,32_15),
+ &TEMPLATE2(_BITPACK_,32_16),
+ &TEMPLATE2(_BITPACK_,32_17),
+ &TEMPLATE2(_BITPACK_,32_18),
+ &TEMPLATE2(_BITPACK_,32_19),
+ &TEMPLATE2(_BITPACK_,32_20),
+ &TEMPLATE2(_BITPACK_,32_21),
+ &TEMPLATE2(_BITPACK_,32_22),
+ &TEMPLATE2(_BITPACK_,32_23),
+ &TEMPLATE2(_BITPACK_,32_24),
+ &TEMPLATE2(_BITPACK_,32_25),
+ &TEMPLATE2(_BITPACK_,32_26),
+ &TEMPLATE2(_BITPACK_,32_27),
+ &TEMPLATE2(_BITPACK_,32_28),
+ &TEMPLATE2(_BITPACK_,32_29),
+ &TEMPLATE2(_BITPACK_,32_30),
+ &TEMPLATE2(_BITPACK_,32_31),
+ &TEMPLATE2(_BITPACK_,32_32)
};
-unsigned char *T2(_BITPACK_,32)( uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start, unsigned b) { return T2(_BITPACK_,a32)[ b](in, n, out, start); }
-#undef USIZE
+unsigned char *TEMPLATE2(_BITPACK_,32)( uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint32_t start, unsigned b) { return TEMPLATE2(_BITPACK_,a32)[ b](in, n, out, start); } /* Forwards to entry b of the a32 table and returns its result. b is not range-checked here; the table has 33 entries, so callers must pass b <= 32. */
#define USIZE 64
-unsigned char *T2(_BITPACK_,64_0 )(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { return out; }
-unsigned char *T2(_BITPACK_,64_1 )(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP( 1,64);}
-unsigned char *T2(_BITPACK_,64_2 )(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP( 2,64);}
-unsigned char *T2(_BITPACK_,64_3 )(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP( 3,64);}
-unsigned char *T2(_BITPACK_,64_4 )(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP( 4,64);}
-unsigned char *T2(_BITPACK_,64_5 )(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP( 5,64);}
-unsigned char *T2(_BITPACK_,64_6 )(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP( 6,64);}
-unsigned char *T2(_BITPACK_,64_7 )(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP( 7,64);}
-unsigned char *T2(_BITPACK_,64_8 )(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP( 8,64);}
-unsigned char *T2(_BITPACK_,64_9 )(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP( 9,64);}
-unsigned char *T2(_BITPACK_,64_10)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(10,64);}
-unsigned char *T2(_BITPACK_,64_11)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(11,64);}
-unsigned char *T2(_BITPACK_,64_12)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(12,64);}
-unsigned char *T2(_BITPACK_,64_13)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(13,64);}
-unsigned char *T2(_BITPACK_,64_14)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(14,64);}
-unsigned char *T2(_BITPACK_,64_15)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(15,64);}
-unsigned char *T2(_BITPACK_,64_16)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(16,64);}
-unsigned char *T2(_BITPACK_,64_17)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(17,64);}
-unsigned char *T2(_BITPACK_,64_18)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(18,64);}
-unsigned char *T2(_BITPACK_,64_19)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(19,64);}
-unsigned char *T2(_BITPACK_,64_20)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(20,64);}
-unsigned char *T2(_BITPACK_,64_21)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(21,64);}
-unsigned char *T2(_BITPACK_,64_22)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(22,64);}
-unsigned char *T2(_BITPACK_,64_23)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(23,64);}
-unsigned char *T2(_BITPACK_,64_24)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(24,64);}
-unsigned char *T2(_BITPACK_,64_25)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(25,64);}
-unsigned char *T2(_BITPACK_,64_26)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(26,64);}
-unsigned char *T2(_BITPACK_,64_27)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(27,64);}
-unsigned char *T2(_BITPACK_,64_28)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(28,64);}
-unsigned char *T2(_BITPACK_,64_29)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(29,64);}
-unsigned char *T2(_BITPACK_,64_30)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(30,64);}
-unsigned char *T2(_BITPACK_,64_31)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(31,64);}
-unsigned char *T2(_BITPACK_,64_32)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(32,64);}
-unsigned char *T2(_BITPACK_,64_33)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(33,64);}
-unsigned char *T2(_BITPACK_,64_34)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(34,64);}
-unsigned char *T2(_BITPACK_,64_35)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(35,64);}
-unsigned char *T2(_BITPACK_,64_36)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(36,64);}
-unsigned char *T2(_BITPACK_,64_37)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(37,64);}
-unsigned char *T2(_BITPACK_,64_38)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(38,64);}
-unsigned char *T2(_BITPACK_,64_39)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(39,64);}
-unsigned char *T2(_BITPACK_,64_40)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(40,64);}
-unsigned char *T2(_BITPACK_,64_41)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(41,64);}
-unsigned char *T2(_BITPACK_,64_42)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(42,64);}
-unsigned char *T2(_BITPACK_,64_43)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(43,64);}
-unsigned char *T2(_BITPACK_,64_44)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(44,64);}
-unsigned char *T2(_BITPACK_,64_45)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(45,64);}
-unsigned char *T2(_BITPACK_,64_46)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(46,64);}
-unsigned char *T2(_BITPACK_,64_47)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(47,64);}
-unsigned char *T2(_BITPACK_,64_48)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(48,64);}
-unsigned char *T2(_BITPACK_,64_49)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(49,64);}
-unsigned char *T2(_BITPACK_,64_50)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(50,64);}
-unsigned char *T2(_BITPACK_,64_51)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(51,64);}
-unsigned char *T2(_BITPACK_,64_52)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(52,64);}
-unsigned char *T2(_BITPACK_,64_53)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(53,64);}
-unsigned char *T2(_BITPACK_,64_54)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(54,64);}
-unsigned char *T2(_BITPACK_,64_55)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(55,64);}
-unsigned char *T2(_BITPACK_,64_56)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(56,64);}
-unsigned char *T2(_BITPACK_,64_57)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(57,64);}
-unsigned char *T2(_BITPACK_,64_58)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(58,64);}
-unsigned char *T2(_BITPACK_,64_59)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(59,64);}
-unsigned char *T2(_BITPACK_,64_60)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(60,64);}
-unsigned char *T2(_BITPACK_,64_61)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(61,64);}
-unsigned char *T2(_BITPACK_,64_62)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(62,64);}
-unsigned char *T2(_BITPACK_,64_63)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(63,64);}
-unsigned char *T2(_BITPACK_,64_64)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { BP(64,64);}
-BITPACK_D64 T2(_BITPACK_,a64)[] = {
- &T2(_BITPACK_,64_0),
- &T2(_BITPACK_,64_1),
- &T2(_BITPACK_,64_2),
- &T2(_BITPACK_,64_3),
- &T2(_BITPACK_,64_4),
- &T2(_BITPACK_,64_5),
- &T2(_BITPACK_,64_6),
- &T2(_BITPACK_,64_7),
- &T2(_BITPACK_,64_8),
- &T2(_BITPACK_,64_9),
- &T2(_BITPACK_,64_10),
- &T2(_BITPACK_,64_11),
- &T2(_BITPACK_,64_12),
- &T2(_BITPACK_,64_13),
- &T2(_BITPACK_,64_14),
- &T2(_BITPACK_,64_15),
- &T2(_BITPACK_,64_16),
- &T2(_BITPACK_,64_17),
- &T2(_BITPACK_,64_18),
- &T2(_BITPACK_,64_19),
- &T2(_BITPACK_,64_20),
- &T2(_BITPACK_,64_21),
- &T2(_BITPACK_,64_22),
- &T2(_BITPACK_,64_23),
- &T2(_BITPACK_,64_24),
- &T2(_BITPACK_,64_25),
- &T2(_BITPACK_,64_26),
- &T2(_BITPACK_,64_27),
- &T2(_BITPACK_,64_28),
- &T2(_BITPACK_,64_29),
- &T2(_BITPACK_,64_30),
- &T2(_BITPACK_,64_31),
- &T2(_BITPACK_,64_32),
- &T2(_BITPACK_,64_33),
- &T2(_BITPACK_,64_34),
- &T2(_BITPACK_,64_35),
- &T2(_BITPACK_,64_36),
- &T2(_BITPACK_,64_37),
- &T2(_BITPACK_,64_38),
- &T2(_BITPACK_,64_39),
- &T2(_BITPACK_,64_40),
- &T2(_BITPACK_,64_41),
- &T2(_BITPACK_,64_42),
- &T2(_BITPACK_,64_43),
- &T2(_BITPACK_,64_44),
- &T2(_BITPACK_,64_45),
- &T2(_BITPACK_,64_46),
- &T2(_BITPACK_,64_47),
- &T2(_BITPACK_,64_48),
- &T2(_BITPACK_,64_49),
- &T2(_BITPACK_,64_50),
- &T2(_BITPACK_,64_51),
- &T2(_BITPACK_,64_52),
- &T2(_BITPACK_,64_53),
- &T2(_BITPACK_,64_54),
- &T2(_BITPACK_,64_55),
- &T2(_BITPACK_,64_56),
- &T2(_BITPACK_,64_57),
- &T2(_BITPACK_,64_58),
- &T2(_BITPACK_,64_59),
- &T2(_BITPACK_,64_60),
- &T2(_BITPACK_,64_61),
- &T2(_BITPACK_,64_62),
- &T2(_BITPACK_,64_63),
- &T2(_BITPACK_,64_64)
+unsigned char *TEMPLATE2(_BITPACK_,64_0)( uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { return out; }
+unsigned char *TEMPLATE2(_BITPACK_,64_1)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*1); uint64_t v,x=0;do { BITPACK64_1( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_2)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*2); uint64_t v,x=0;do { BITPACK64_2( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_3)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*3); uint64_t v,x=0;do { BITPACK64_3( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_4)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*4); uint64_t v,x=0;do { BITPACK64_4( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_5)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*5); uint64_t v,x=0;do { BITPACK64_5( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_6)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*6); uint64_t v,x=0;do { BITPACK64_6( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_7)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*7); uint64_t v,x=0;do { BITPACK64_7( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_8)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*8); uint64_t v,x=0;do { BITPACK64_8( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_9)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*9); uint64_t v,x=0;do { BITPACK64_9( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_10)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*10); uint64_t v,x=0;do { BITPACK64_10( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_11)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*11); uint64_t v,x=0;do { BITPACK64_11( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_12)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*12); uint64_t v,x=0;do { BITPACK64_12( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_13)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*13); uint64_t v,x=0;do { BITPACK64_13( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_14)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*14); uint64_t v,x=0;do { BITPACK64_14( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_15)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*15); uint64_t v,x=0;do { BITPACK64_15( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_16)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*16); uint64_t v,x=0;do { BITPACK64_16( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_17)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*17); uint64_t v,x=0;do { BITPACK64_17( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_18)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*18); uint64_t v,x=0;do { BITPACK64_18( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_19)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*19); uint64_t v,x=0;do { BITPACK64_19( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_20)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*20); uint64_t v,x=0;do { BITPACK64_20( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_21)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*21); uint64_t v,x=0;do { BITPACK64_21( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_22)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*22); uint64_t v,x=0;do { BITPACK64_22( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_23)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*23); uint64_t v,x=0;do { BITPACK64_23( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_24)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*24); uint64_t v,x=0;do { BITPACK64_24( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_25)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*25); uint64_t v,x=0;do { BITPACK64_25( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_26)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*26); uint64_t v,x=0;do { BITPACK64_26( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_27)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*27); uint64_t v,x=0;do { BITPACK64_27( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_28)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*28); uint64_t v,x=0;do { BITPACK64_28( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_29)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*29); uint64_t v,x=0;do { BITPACK64_29( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_30)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*30); uint64_t v,x=0;do { BITPACK64_30( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_31)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*31); uint64_t v,x=0;do { BITPACK64_31( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_32)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*32); uint64_t v,x=0;do { BITPACK64_32( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_33)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*33); uint64_t v,x=0;do { BITPACK64_33( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_34)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*34); uint64_t v,x=0;do { BITPACK64_34( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_35)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*35); uint64_t v,x=0;do { BITPACK64_35( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_36)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*36); uint64_t v,x=0;do { BITPACK64_36( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_37)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*37); uint64_t v,x=0;do { BITPACK64_37( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_38)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*38); uint64_t v,x=0;do { BITPACK64_38( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_39)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*39); uint64_t v,x=0;do { BITPACK64_39( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_40)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*40); uint64_t v,x=0;do { BITPACK64_40( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_41)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*41); uint64_t v,x=0;do { BITPACK64_41( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_42)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*42); uint64_t v,x=0;do { BITPACK64_42( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_43)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*43); uint64_t v,x=0;do { BITPACK64_43( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_44)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*44); uint64_t v,x=0;do { BITPACK64_44( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_45)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*45); uint64_t v,x=0;do { BITPACK64_45( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_46)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*46); uint64_t v,x=0;do { BITPACK64_46( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_47)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*47); uint64_t v,x=0;do { BITPACK64_47( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_48)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*48); uint64_t v,x=0;do { BITPACK64_48( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_49)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*49); uint64_t v,x=0;do { BITPACK64_49( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_50)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*50); uint64_t v,x=0;do { BITPACK64_50( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_51)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*51); uint64_t v,x=0;do { BITPACK64_51( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_52)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*52); uint64_t v,x=0;do { BITPACK64_52( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_53)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*53); uint64_t v,x=0;do { BITPACK64_53( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_54)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*54); uint64_t v,x=0;do { BITPACK64_54( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_55)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*55); uint64_t v,x=0;do { BITPACK64_55( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_56)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*56); uint64_t v,x=0;do { BITPACK64_56( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_57)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*57); uint64_t v,x=0;do { BITPACK64_57( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_58)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*58); uint64_t v,x=0;do { BITPACK64_58( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_59)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*59); uint64_t v,x=0;do { BITPACK64_59( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_60)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*60); uint64_t v,x=0;do { BITPACK64_60( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_61)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*61); uint64_t v,x=0;do { BITPACK64_61( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_62)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*62); uint64_t v,x=0;do { BITPACK64_62( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_63)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*63); uint64_t v,x=0;do { BITPACK64_63( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+unsigned char *TEMPLATE2(_BITPACK_,64_64)(uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start ) { unsigned char *out_=out+PAD8(n*64); uint64_t v,x=0;do { BITPACK64_64( in, out, start); PREFETCH(in+512,0); } while(out<out_); return out_; }
+BITPACK_D64 TEMPLATE2(_BITPACK_,a64)[] = {
+ &TEMPLATE2(_BITPACK_,64_0),
+ &TEMPLATE2(_BITPACK_,64_1),
+ &TEMPLATE2(_BITPACK_,64_2),
+ &TEMPLATE2(_BITPACK_,64_3),
+ &TEMPLATE2(_BITPACK_,64_4),
+ &TEMPLATE2(_BITPACK_,64_5),
+ &TEMPLATE2(_BITPACK_,64_6),
+ &TEMPLATE2(_BITPACK_,64_7),
+ &TEMPLATE2(_BITPACK_,64_8),
+ &TEMPLATE2(_BITPACK_,64_9),
+ &TEMPLATE2(_BITPACK_,64_10),
+ &TEMPLATE2(_BITPACK_,64_11),
+ &TEMPLATE2(_BITPACK_,64_12),
+ &TEMPLATE2(_BITPACK_,64_13),
+ &TEMPLATE2(_BITPACK_,64_14),
+ &TEMPLATE2(_BITPACK_,64_15),
+ &TEMPLATE2(_BITPACK_,64_16),
+ &TEMPLATE2(_BITPACK_,64_17),
+ &TEMPLATE2(_BITPACK_,64_18),
+ &TEMPLATE2(_BITPACK_,64_19),
+ &TEMPLATE2(_BITPACK_,64_20),
+ &TEMPLATE2(_BITPACK_,64_21),
+ &TEMPLATE2(_BITPACK_,64_22),
+ &TEMPLATE2(_BITPACK_,64_23),
+ &TEMPLATE2(_BITPACK_,64_24),
+ &TEMPLATE2(_BITPACK_,64_25),
+ &TEMPLATE2(_BITPACK_,64_26),
+ &TEMPLATE2(_BITPACK_,64_27),
+ &TEMPLATE2(_BITPACK_,64_28),
+ &TEMPLATE2(_BITPACK_,64_29),
+ &TEMPLATE2(_BITPACK_,64_30),
+ &TEMPLATE2(_BITPACK_,64_31),
+ &TEMPLATE2(_BITPACK_,64_32),
+ &TEMPLATE2(_BITPACK_,64_33),
+ &TEMPLATE2(_BITPACK_,64_34),
+ &TEMPLATE2(_BITPACK_,64_35),
+ &TEMPLATE2(_BITPACK_,64_36),
+ &TEMPLATE2(_BITPACK_,64_37),
+ &TEMPLATE2(_BITPACK_,64_38),
+ &TEMPLATE2(_BITPACK_,64_39),
+ &TEMPLATE2(_BITPACK_,64_40),
+ &TEMPLATE2(_BITPACK_,64_41),
+ &TEMPLATE2(_BITPACK_,64_42),
+ &TEMPLATE2(_BITPACK_,64_43),
+ &TEMPLATE2(_BITPACK_,64_44),
+ &TEMPLATE2(_BITPACK_,64_45),
+ &TEMPLATE2(_BITPACK_,64_46),
+ &TEMPLATE2(_BITPACK_,64_47),
+ &TEMPLATE2(_BITPACK_,64_48),
+ &TEMPLATE2(_BITPACK_,64_49),
+ &TEMPLATE2(_BITPACK_,64_50),
+ &TEMPLATE2(_BITPACK_,64_51),
+ &TEMPLATE2(_BITPACK_,64_52),
+ &TEMPLATE2(_BITPACK_,64_53),
+ &TEMPLATE2(_BITPACK_,64_54),
+ &TEMPLATE2(_BITPACK_,64_55),
+ &TEMPLATE2(_BITPACK_,64_56),
+ &TEMPLATE2(_BITPACK_,64_57),
+ &TEMPLATE2(_BITPACK_,64_58),
+ &TEMPLATE2(_BITPACK_,64_59),
+ &TEMPLATE2(_BITPACK_,64_60),
+ &TEMPLATE2(_BITPACK_,64_61),
+ &TEMPLATE2(_BITPACK_,64_62),
+ &TEMPLATE2(_BITPACK_,64_63),
+ &TEMPLATE2(_BITPACK_,64_64)
};
-unsigned char *T2(_BITPACK_,64)( uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start, unsigned b) { return T2(_BITPACK_,a64)[ b](in, n, out, start); }
-#undef USIZE
+unsigned char *TEMPLATE2(_BITPACK_,64)( uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , uint64_t start, unsigned b) { return TEMPLATE2(_BITPACK_,a64)[ b](in, n, out, start); }
#endif
#endif //IP9
diff --git a/src/ext/for/bitunpack.c b/src/ext/for/bitunpack.c
index 45060084..1dd78003 100644
--- a/src/ext/for/bitunpack.c
+++ b/src/ext/for/bitunpack.c
@@ -1,6 +1,6 @@
/**
- Copyright (C) powturbo 2013-2023
- SPDX-License-Identifier: GPL v2 License
+ Copyright (C) powturbo 2013-2019
+ GPL v2 License
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -22,26 +22,25 @@
- email : powturbo [_AT_] gmail [_DOT_] com
**/
// "Integer Compression" Bit Packing
+#define BITUTIL_IN
+#define VINT_IN
+#include "conf.h"
+#include "bitutil.h"
+#include "bitpack.h"
+#include "vint.h"
+
+#define PAD8(_x_) (((_x_)+7)/8)
+
#pragma warning( disable : 4005)
#pragma warning( disable : 4090)
#pragma warning( disable : 4068)
-#include <string.h>
-#include "include_/conf.h"
-#include "include_/bitpack.h"
-#include "include_/bitutil.h"
-#include "include_/vlcbyte.h"
-
-#include "include_/bitutil_.h"
-
-#define PAD8(_x_) (((_x_)+7)/8)
-
#pragma GCC push_options
#pragma GCC optimize ("align-functions=16")
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wunsequenced"
-#ifndef __AVX2__ //----------------------------------- Plain -------------------------------------------------------------------------------------------
+#if !defined(SSE2_ON) && !defined(AVX2_ON) //----------------------------------- Plain -------------------------------------------------------------------------------------------
typedef unsigned char *(*BITUNPACK_F8)( const unsigned char *__restrict in, unsigned n, uint8_t *__restrict out);
typedef unsigned char *(*BITUNPACK_D8)( const unsigned char *__restrict in, unsigned n, uint8_t *__restrict out, uint8_t start);
typedef unsigned char *(*BITUNPACK_F16)(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out);
@@ -59,51 +58,39 @@ typedef unsigned char *(*BITUNPACK_D64)(const unsigned char *__restrict in, unsi
#define OPX(_op_) _op_ += 32
#endif
-//-- bitpack -------------
#define OPI(_op_,_nb_,_parm_) OPX(_op_)
#define OUT( _op_, _x_, _w_, _nb_,_parm_) OP(_op_,_x_) = _w_
#define _BITUNPACK_ bitunpack
#include "bitunpack_.h"
-
#define DELTA
-//-- bitunpack delta -------------
+
#define OUT( _op_, _x_, _w_, _nb_,_parm_) OP(_op_,_x_) = (_parm_ += (_w_))
#define _BITUNPACK_ bitdunpack // delta + 0
#include "bitunpack_.h"
-//-- bitunpack zigzag -----------
-#define OUT( _op_, _x_, _w_, _nb_,_parm_) OP(_op_,_x_) = (_parm_ += T2(zigzagdec, USIZE)(_w_))
+#define OUT( _op_, _x_, _w_, _nb_,_parm_) OP(_op_,_x_) = (_parm_ += TEMPLATE2(zigzagdec, USIZE)(_w_))
#define _BITUNPACK_ bitzunpack // zigzag
#include "bitunpack_.h"
-//-- bitunpack xor -----------
-#define OUT( _op_, _x_, _w_, _nb_,_parm_) OP(_op_,_x_) = (_parm_ ^= (_w_))
-#define _BITUNPACK_ bitxunpack // xor
-#include "bitunpack_.h"
-
-//-- bitunpack FOR -----------
#define OUT( _op_, _x_, _w_, _nb_,_parm_) OP(_op_,_x_) = (_parm_ + (_w_))
#define _BITUNPACK_ bitfunpack // for
#include "bitunpack_.h"
-//-- bitunpack delta 1 -----------
#define OPI(_op_,_nb_,_parm_) OPX(_op_); _parm_ += 32
#define OUT( _op_, _x_, _w_, _nb_,_parm_) OP(_op_,_x_) = (_parm_ += (_w_)) + (_x_+1)
#define _BITUNPACK_ bitd1unpack // delta + 1
#include "bitunpack_.h"
-//-- bitunpack FOR 1 -----------
#define OUT( _op_, _x_, _w_, _nb_,_parm_) OP(_op_,_x_) = _parm_ + (_w_)+(_x_+1)
#define _BITUNPACK_ bitf1unpack // for + 1
#include "bitunpack_.h"
#undef OPI
-//------------------------------------------------------- bitnunpack ----------------------------------------------------------
#define BITNUNPACK(in, n, out, _csize_, _usize_) {\
unsigned char *ip = in;\
- for(op = out,out+=n; op < out;) { unsigned oplen = out - op,b; if(oplen > _csize_) oplen = _csize_; /*PREFETCH(ip+512,0);*/\
- b = *ip++; ip = T2(bitunpacka, _usize_)[b](ip, oplen, op);\
+ for(op = out,out+=n; op < out;) { unsigned oplen = out - op,b; if(oplen > _csize_) oplen = _csize_; PREFETCH(ip+512,0);\
+ b = *ip++; ip = TEMPLATE2(bitunpacka, _usize_)[b](ip, oplen, op);\
op += oplen;\
} \
return ip - in;\
@@ -111,10 +98,10 @@ typedef unsigned char *(*BITUNPACK_D64)(const unsigned char *__restrict in, unsi
#define BITNDUNPACK(in, n, out, _csize_, _usize_, _bitunpacka_) { if(!n) return 0;\
unsigned char *ip = in;\
- T2(vbxget, _usize_)(ip, start);\
- for(*out++ = start,--n,op = out; op != out+(n&~(_csize_-1)); ) { /*PREFETCH(ip+512,0);*/\
- unsigned b = *ip++; ip = T2(_bitunpacka_, _usize_)[b](ip, _csize_, op, start); op += _csize_; start = op[-1];\
- } if(n&=(_csize_-1)) { unsigned b = *ip++; ip = T2(_bitunpacka_, _usize_)[b](ip, n, op, start); }\
+ TEMPLATE2(vbxget, _usize_)(ip, start);\
+ for(*out++ = start,--n,op = out; op != out+(n&~(_csize_-1)); ) { PREFETCH(ip+512,0);\
+ unsigned b = *ip++; ip = TEMPLATE2(_bitunpacka_, _usize_)[b](ip, _csize_, op, start); op += _csize_; start = op[-1];\
+ } if(n&=(_csize_-1)) { unsigned b = *ip++; ip = TEMPLATE2(_bitunpacka_, _usize_)[b](ip, n, op, start); }\
return ip - in;\
}
@@ -138,33 +125,28 @@ size_t bitnzunpack16( unsigned char *__restrict in, size_t n, uint16_t *__restri
size_t bitnzunpack32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out) { uint32_t *op,start; BITNDUNPACK(in, n, out, 128, 32, bitzunpacka); }
size_t bitnzunpack64( unsigned char *__restrict in, size_t n, uint64_t *__restrict out) { uint64_t *op,start; BITNDUNPACK(in, n, out, 128, 64, bitzunpacka); }
-size_t bitnxunpack8( unsigned char *__restrict in, size_t n, uint8_t *__restrict out) { uint8_t *op,start; BITNDUNPACK(in, n, out, 128, 8, bitxunpacka); }
-size_t bitnxunpack16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out) { uint16_t *op,start; BITNDUNPACK(in, n, out, 128, 16, bitxunpacka); }
-size_t bitnxunpack32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out) { uint32_t *op,start; BITNDUNPACK(in, n, out, 128, 32, bitxunpacka); }
-size_t bitnxunpack64( unsigned char *__restrict in, size_t n, uint64_t *__restrict out) { uint64_t *op,start; BITNDUNPACK(in, n, out, 128, 64, bitxunpacka); }
-
size_t bitnfunpack8( unsigned char *__restrict in, size_t n, uint8_t *__restrict out) { uint8_t *op,start; BITNDUNPACK(in, n, out, 128, 8, bitfunpacka); }
size_t bitnfunpack16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out) { uint16_t *op,start; BITNDUNPACK(in, n, out, 128, 16, bitfunpacka); }
size_t bitnfunpack32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out) { uint32_t *op,start; BITNDUNPACK(in, n, out, 128, 32, bitfunpacka); }
size_t bitnfunpack64( unsigned char *__restrict in, size_t n, uint64_t *__restrict out) { uint64_t *op,start; BITNDUNPACK(in, n, out, 128, 64, bitfunpacka); }
- #endif
-//#else //-------------------------------------------- SSE/AVX2 ---------------------------------------------------------------------------------------
+
+#else //-------------------------------------------- SSE/AVX2 ---------------------------------------------------------------------------------------
#define _BITNUNPACKV(in, n, out, _csize_, _usize_, _bitunpackv_) {\
unsigned char *ip = in;\
- for(op = out; op != out+(n&~(_csize_-1)); op += _csize_) { /*PREFETCH(in+512,0);*/\
- unsigned b = *ip++; ip = T2(_bitunpackv_, _usize_)(ip, _csize_, op,b);\
- } if(n&=(_csize_-1)) { unsigned b = *ip++; ip = T2(bitunpack, _usize_)(ip, n, op,b); }\
+ for(op = out; op != out+(n&~(_csize_-1)); op += _csize_) { PREFETCH(in+512,0);\
+ unsigned b = *ip++; ip = TEMPLATE2(_bitunpackv_, _usize_)(ip, _csize_, op,b);\
+ } if(n&=(_csize_-1)) { unsigned b = *ip++; ip = TEMPLATE2(bitunpack, _usize_)(ip, n, op,b); }\
return ip - in;\
}
#define _BITNDUNPACKV(in, n, out, _csize_, _usize_, _bitunpackv_, _bitunpack_) { if(!n) return 0;\
unsigned char *ip = in;\
- T2(vbxget, _usize_)(ip, start); \
+ TEMPLATE2(vbxget, _usize_)(ip, start); \
*out++ = start;\
- for(--n,op = out; op != out+(n&~(_csize_-1)); ) { /*PREFETCH(ip+512,0);*/\
- unsigned b = *ip++; ip = T2(_bitunpackv_, _usize_)(ip, _csize_, op, start,b); op += _csize_; start = op[-1];\
- } if(n&=(_csize_-1)) { unsigned b = *ip++; ip = T2(_bitunpack_, _usize_)(ip, n, op, start,b); }\
+ for(--n,op = out; op != out+(n&~(_csize_-1)); ) { PREFETCH(ip+512,0);\
+ unsigned b = *ip++; ip = TEMPLATE2(_bitunpackv_, _usize_)(ip, _csize_, op, start,b); op += _csize_; start = op[-1];\
+ } if(n&=(_csize_-1)) { unsigned b = *ip++; ip = TEMPLATE2(_bitunpack_, _usize_)(ip, n, op, start,b); }\
return ip - in;\
}
#ifdef __AVX2__ //-------------------------------- AVX2 ----------------------------------------------------------------------------
@@ -178,262 +160,262 @@ size_t bitnfunpack64( unsigned char *__restrict in, size_t n, uint64_t *__restri
static inline __m128i _mm_cvtsi64_si128(__int64 a) { return _mm_loadl_epi64((__m128i*)&a); }
#endif
static ALIGNED(unsigned char, permv[256][8], 32) = {
-{0,0,0,0,0,0,0,0},
-{0,1,1,1,1,1,1,1},
-{1,0,1,1,1,1,1,1},
-{0,1,2,2,2,2,2,2},
-{1,1,0,1,1,1,1,1},
-{0,2,1,2,2,2,2,2},
-{2,0,1,2,2,2,2,2},
-{0,1,2,3,3,3,3,3},
-{1,1,1,0,1,1,1,1},
-{0,2,2,1,2,2,2,2},
-{2,0,2,1,2,2,2,2},
-{0,1,3,2,3,3,3,3},
-{2,2,0,1,2,2,2,2},
-{0,3,1,2,3,3,3,3},
-{3,0,1,2,3,3,3,3},
-{0,1,2,3,4,4,4,4},
-{1,1,1,1,0,1,1,1},
-{0,2,2,2,1,2,2,2},
-{2,0,2,2,1,2,2,2},
-{0,1,3,3,2,3,3,3},
-{2,2,0,2,1,2,2,2},
-{0,3,1,3,2,3,3,3},
-{3,0,1,3,2,3,3,3},
-{0,1,2,4,3,4,4,4},
-{2,2,2,0,1,2,2,2},
-{0,3,3,1,2,3,3,3},
-{3,0,3,1,2,3,3,3},
-{0,1,4,2,3,4,4,4},
-{3,3,0,1,2,3,3,3},
-{0,4,1,2,3,4,4,4},
-{4,0,1,2,3,4,4,4},
-{0,1,2,3,4,5,5,5},
-{1,1,1,1,1,0,1,1},
-{0,2,2,2,2,1,2,2},
-{2,0,2,2,2,1,2,2},
-{0,1,3,3,3,2,3,3},
-{2,2,0,2,2,1,2,2},
-{0,3,1,3,3,2,3,3},
-{3,0,1,3,3,2,3,3},
-{0,1,2,4,4,3,4,4},
-{2,2,2,0,2,1,2,2},
-{0,3,3,1,3,2,3,3},
-{3,0,3,1,3,2,3,3},
-{0,1,4,2,4,3,4,4},
-{3,3,0,1,3,2,3,3},
-{0,4,1,2,4,3,4,4},
-{4,0,1,2,4,3,4,4},
-{0,1,2,3,5,4,5,5},
-{2,2,2,2,0,1,2,2},
-{0,3,3,3,1,2,3,3},
-{3,0,3,3,1,2,3,3},
-{0,1,4,4,2,3,4,4},
-{3,3,0,3,1,2,3,3},
-{0,4,1,4,2,3,4,4},
-{4,0,1,4,2,3,4,4},
-{0,1,2,5,3,4,5,5},
-{3,3,3,0,1,2,3,3},
-{0,4,4,1,2,3,4,4},
-{4,0,4,1,2,3,4,4},
-{0,1,5,2,3,4,5,5},
-{4,4,0,1,2,3,4,4},
-{0,5,1,2,3,4,5,5},
-{5,0,1,2,3,4,5,5},
-{0,1,2,3,4,5,6,6},
-{1,1,1,1,1,1,0,1},
-{0,2,2,2,2,2,1,2},
-{2,0,2,2,2,2,1,2},
-{0,1,3,3,3,3,2,3},
-{2,2,0,2,2,2,1,2},
-{0,3,1,3,3,3,2,3},
-{3,0,1,3,3,3,2,3},
-{0,1,2,4,4,4,3,4},
-{2,2,2,0,2,2,1,2},
-{0,3,3,1,3,3,2,3},
-{3,0,3,1,3,3,2,3},
-{0,1,4,2,4,4,3,4},
-{3,3,0,1,3,3,2,3},
-{0,4,1,2,4,4,3,4},
-{4,0,1,2,4,4,3,4},
-{0,1,2,3,5,5,4,5},
-{2,2,2,2,0,2,1,2},
-{0,3,3,3,1,3,2,3},
-{3,0,3,3,1,3,2,3},
-{0,1,4,4,2,4,3,4},
-{3,3,0,3,1,3,2,3},
-{0,4,1,4,2,4,3,4},
-{4,0,1,4,2,4,3,4},
-{0,1,2,5,3,5,4,5},
-{3,3,3,0,1,3,2,3},
-{0,4,4,1,2,4,3,4},
-{4,0,4,1,2,4,3,4},
-{0,1,5,2,3,5,4,5},
-{4,4,0,1,2,4,3,4},
-{0,5,1,2,3,5,4,5},
-{5,0,1,2,3,5,4,5},
-{0,1,2,3,4,6,5,6},
-{2,2,2,2,2,0,1,2},
-{0,3,3,3,3,1,2,3},
-{3,0,3,3,3,1,2,3},
-{0,1,4,4,4,2,3,4},
-{3,3,0,3,3,1,2,3},
-{0,4,1,4,4,2,3,4},
-{4,0,1,4,4,2,3,4},
-{0,1,2,5,5,3,4,5},
-{3,3,3,0,3,1,2,3},
-{0,4,4,1,4,2,3,4},
-{4,0,4,1,4,2,3,4},
-{0,1,5,2,5,3,4,5},
-{4,4,0,1,4,2,3,4},
-{0,5,1,2,5,3,4,5},
-{5,0,1,2,5,3,4,5},
-{0,1,2,3,6,4,5,6},
-{3,3,3,3,0,1,2,3},
-{0,4,4,4,1,2,3,4},
-{4,0,4,4,1,2,3,4},
-{0,1,5,5,2,3,4,5},
-{4,4,0,4,1,2,3,4},
-{0,5,1,5,2,3,4,5},
-{5,0,1,5,2,3,4,5},
-{0,1,2,6,3,4,5,6},
-{4,4,4,0,1,2,3,4},
-{0,5,5,1,2,3,4,5},
-{5,0,5,1,2,3,4,5},
-{0,1,6,2,3,4,5,6},
-{5,5,0,1,2,3,4,5},
-{0,6,1,2,3,4,5,6},
-{6,0,1,2,3,4,5,6},
-{0,1,2,3,4,5,6,7},
-{1,1,1,1,1,1,1,0},
-{0,2,2,2,2,2,2,1},
-{2,0,2,2,2,2,2,1},
-{0,1,3,3,3,3,3,2},
-{2,2,0,2,2,2,2,1},
-{0,3,1,3,3,3,3,2},
-{3,0,1,3,3,3,3,2},
-{0,1,2,4,4,4,4,3},
-{2,2,2,0,2,2,2,1},
-{0,3,3,1,3,3,3,2},
-{3,0,3,1,3,3,3,2},
-{0,1,4,2,4,4,4,3},
-{3,3,0,1,3,3,3,2},
-{0,4,1,2,4,4,4,3},
-{4,0,1,2,4,4,4,3},
-{0,1,2,3,5,5,5,4},
-{2,2,2,2,0,2,2,1},
-{0,3,3,3,1,3,3,2},
-{3,0,3,3,1,3,3,2},
-{0,1,4,4,2,4,4,3},
-{3,3,0,3,1,3,3,2},
-{0,4,1,4,2,4,4,3},
-{4,0,1,4,2,4,4,3},
-{0,1,2,5,3,5,5,4},
-{3,3,3,0,1,3,3,2},
-{0,4,4,1,2,4,4,3},
-{4,0,4,1,2,4,4,3},
-{0,1,5,2,3,5,5,4},
-{4,4,0,1,2,4,4,3},
-{0,5,1,2,3,5,5,4},
-{5,0,1,2,3,5,5,4},
-{0,1,2,3,4,6,6,5},
-{2,2,2,2,2,0,2,1},
-{0,3,3,3,3,1,3,2},
-{3,0,3,3,3,1,3,2},
-{0,1,4,4,4,2,4,3},
-{3,3,0,3,3,1,3,2},
-{0,4,1,4,4,2,4,3},
-{4,0,1,4,4,2,4,3},
-{0,1,2,5,5,3,5,4},
-{3,3,3,0,3,1,3,2},
-{0,4,4,1,4,2,4,3},
-{4,0,4,1,4,2,4,3},
-{0,1,5,2,5,3,5,4},
-{4,4,0,1,4,2,4,3},
-{0,5,1,2,5,3,5,4},
-{5,0,1,2,5,3,5,4},
-{0,1,2,3,6,4,6,5},
-{3,3,3,3,0,1,3,2},
-{0,4,4,4,1,2,4,3},
-{4,0,4,4,1,2,4,3},
-{0,1,5,5,2,3,5,4},
-{4,4,0,4,1,2,4,3},
-{0,5,1,5,2,3,5,4},
-{5,0,1,5,2,3,5,4},
-{0,1,2,6,3,4,6,5},
-{4,4,4,0,1,2,4,3},
-{0,5,5,1,2,3,5,4},
-{5,0,5,1,2,3,5,4},
-{0,1,6,2,3,4,6,5},
-{5,5,0,1,2,3,5,4},
-{0,6,1,2,3,4,6,5},
-{6,0,1,2,3,4,6,5},
-{0,1,2,3,4,5,7,6},
-{2,2,2,2,2,2,0,1},
-{0,3,3,3,3,3,1,2},
-{3,0,3,3,3,3,1,2},
-{0,1,4,4,4,4,2,3},
-{3,3,0,3,3,3,1,2},
-{0,4,1,4,4,4,2,3},
-{4,0,1,4,4,4,2,3},
-{0,1,2,5,5,5,3,4},
-{3,3,3,0,3,3,1,2},
-{0,4,4,1,4,4,2,3},
-{4,0,4,1,4,4,2,3},
-{0,1,5,2,5,5,3,4},
-{4,4,0,1,4,4,2,3},
-{0,5,1,2,5,5,3,4},
-{5,0,1,2,5,5,3,4},
-{0,1,2,3,6,6,4,5},
-{3,3,3,3,0,3,1,2},
-{0,4,4,4,1,4,2,3},
-{4,0,4,4,1,4,2,3},
-{0,1,5,5,2,5,3,4},
-{4,4,0,4,1,4,2,3},
-{0,5,1,5,2,5,3,4},
-{5,0,1,5,2,5,3,4},
-{0,1,2,6,3,6,4,5},
-{4,4,4,0,1,4,2,3},
-{0,5,5,1,2,5,3,4},
-{5,0,5,1,2,5,3,4},
-{0,1,6,2,3,6,4,5},
-{5,5,0,1,2,5,3,4},
-{0,6,1,2,3,6,4,5},
-{6,0,1,2,3,6,4,5},
-{0,1,2,3,4,7,5,6},
-{3,3,3,3,3,0,1,2},
-{0,4,4,4,4,1,2,3},
-{4,0,4,4,4,1,2,3},
-{0,1,5,5,5,2,3,4},
-{4,4,0,4,4,1,2,3},
-{0,5,1,5,5,2,3,4},
-{5,0,1,5,5,2,3,4},
-{0,1,2,6,6,3,4,5},
-{4,4,4,0,4,1,2,3},
-{0,5,5,1,5,2,3,4},
-{5,0,5,1,5,2,3,4},
-{0,1,6,2,6,3,4,5},
-{5,5,0,1,5,2,3,4},
-{0,6,1,2,6,3,4,5},
-{6,0,1,2,6,3,4,5},
-{0,1,2,3,7,4,5,6},
-{4,4,4,4,0,1,2,3},
-{0,5,5,5,1,2,3,4},
-{5,0,5,5,1,2,3,4},
-{0,1,6,6,2,3,4,5},
-{5,5,0,5,1,2,3,4},
-{0,6,1,6,2,3,4,5},
-{6,0,1,6,2,3,4,5},
-{0,1,2,7,3,4,5,6},
-{5,5,5,0,1,2,3,4},
-{0,6,6,1,2,3,4,5},
-{6,0,6,1,2,3,4,5},
-{0,1,7,2,3,4,5,6},
-{6,6,0,1,2,3,4,5},
-{0,7,1,2,3,4,5,6},
-{7,0,1,2,3,4,5,6},
-{0,1,2,3,4,5,6,7}
+0,0,0,0,0,0,0,0,
+0,1,1,1,1,1,1,1,
+1,0,1,1,1,1,1,1,
+0,1,2,2,2,2,2,2,
+1,1,0,1,1,1,1,1,
+0,2,1,2,2,2,2,2,
+2,0,1,2,2,2,2,2,
+0,1,2,3,3,3,3,3,
+1,1,1,0,1,1,1,1,
+0,2,2,1,2,2,2,2,
+2,0,2,1,2,2,2,2,
+0,1,3,2,3,3,3,3,
+2,2,0,1,2,2,2,2,
+0,3,1,2,3,3,3,3,
+3,0,1,2,3,3,3,3,
+0,1,2,3,4,4,4,4,
+1,1,1,1,0,1,1,1,
+0,2,2,2,1,2,2,2,
+2,0,2,2,1,2,2,2,
+0,1,3,3,2,3,3,3,
+2,2,0,2,1,2,2,2,
+0,3,1,3,2,3,3,3,
+3,0,1,3,2,3,3,3,
+0,1,2,4,3,4,4,4,
+2,2,2,0,1,2,2,2,
+0,3,3,1,2,3,3,3,
+3,0,3,1,2,3,3,3,
+0,1,4,2,3,4,4,4,
+3,3,0,1,2,3,3,3,
+0,4,1,2,3,4,4,4,
+4,0,1,2,3,4,4,4,
+0,1,2,3,4,5,5,5,
+1,1,1,1,1,0,1,1,
+0,2,2,2,2,1,2,2,
+2,0,2,2,2,1,2,2,
+0,1,3,3,3,2,3,3,
+2,2,0,2,2,1,2,2,
+0,3,1,3,3,2,3,3,
+3,0,1,3,3,2,3,3,
+0,1,2,4,4,3,4,4,
+2,2,2,0,2,1,2,2,
+0,3,3,1,3,2,3,3,
+3,0,3,1,3,2,3,3,
+0,1,4,2,4,3,4,4,
+3,3,0,1,3,2,3,3,
+0,4,1,2,4,3,4,4,
+4,0,1,2,4,3,4,4,
+0,1,2,3,5,4,5,5,
+2,2,2,2,0,1,2,2,
+0,3,3,3,1,2,3,3,
+3,0,3,3,1,2,3,3,
+0,1,4,4,2,3,4,4,
+3,3,0,3,1,2,3,3,
+0,4,1,4,2,3,4,4,
+4,0,1,4,2,3,4,4,
+0,1,2,5,3,4,5,5,
+3,3,3,0,1,2,3,3,
+0,4,4,1,2,3,4,4,
+4,0,4,1,2,3,4,4,
+0,1,5,2,3,4,5,5,
+4,4,0,1,2,3,4,4,
+0,5,1,2,3,4,5,5,
+5,0,1,2,3,4,5,5,
+0,1,2,3,4,5,6,6,
+1,1,1,1,1,1,0,1,
+0,2,2,2,2,2,1,2,
+2,0,2,2,2,2,1,2,
+0,1,3,3,3,3,2,3,
+2,2,0,2,2,2,1,2,
+0,3,1,3,3,3,2,3,
+3,0,1,3,3,3,2,3,
+0,1,2,4,4,4,3,4,
+2,2,2,0,2,2,1,2,
+0,3,3,1,3,3,2,3,
+3,0,3,1,3,3,2,3,
+0,1,4,2,4,4,3,4,
+3,3,0,1,3,3,2,3,
+0,4,1,2,4,4,3,4,
+4,0,1,2,4,4,3,4,
+0,1,2,3,5,5,4,5,
+2,2,2,2,0,2,1,2,
+0,3,3,3,1,3,2,3,
+3,0,3,3,1,3,2,3,
+0,1,4,4,2,4,3,4,
+3,3,0,3,1,3,2,3,
+0,4,1,4,2,4,3,4,
+4,0,1,4,2,4,3,4,
+0,1,2,5,3,5,4,5,
+3,3,3,0,1,3,2,3,
+0,4,4,1,2,4,3,4,
+4,0,4,1,2,4,3,4,
+0,1,5,2,3,5,4,5,
+4,4,0,1,2,4,3,4,
+0,5,1,2,3,5,4,5,
+5,0,1,2,3,5,4,5,
+0,1,2,3,4,6,5,6,
+2,2,2,2,2,0,1,2,
+0,3,3,3,3,1,2,3,
+3,0,3,3,3,1,2,3,
+0,1,4,4,4,2,3,4,
+3,3,0,3,3,1,2,3,
+0,4,1,4,4,2,3,4,
+4,0,1,4,4,2,3,4,
+0,1,2,5,5,3,4,5,
+3,3,3,0,3,1,2,3,
+0,4,4,1,4,2,3,4,
+4,0,4,1,4,2,3,4,
+0,1,5,2,5,3,4,5,
+4,4,0,1,4,2,3,4,
+0,5,1,2,5,3,4,5,
+5,0,1,2,5,3,4,5,
+0,1,2,3,6,4,5,6,
+3,3,3,3,0,1,2,3,
+0,4,4,4,1,2,3,4,
+4,0,4,4,1,2,3,4,
+0,1,5,5,2,3,4,5,
+4,4,0,4,1,2,3,4,
+0,5,1,5,2,3,4,5,
+5,0,1,5,2,3,4,5,
+0,1,2,6,3,4,5,6,
+4,4,4,0,1,2,3,4,
+0,5,5,1,2,3,4,5,
+5,0,5,1,2,3,4,5,
+0,1,6,2,3,4,5,6,
+5,5,0,1,2,3,4,5,
+0,6,1,2,3,4,5,6,
+6,0,1,2,3,4,5,6,
+0,1,2,3,4,5,6,7,
+1,1,1,1,1,1,1,0,
+0,2,2,2,2,2,2,1,
+2,0,2,2,2,2,2,1,
+0,1,3,3,3,3,3,2,
+2,2,0,2,2,2,2,1,
+0,3,1,3,3,3,3,2,
+3,0,1,3,3,3,3,2,
+0,1,2,4,4,4,4,3,
+2,2,2,0,2,2,2,1,
+0,3,3,1,3,3,3,2,
+3,0,3,1,3,3,3,2,
+0,1,4,2,4,4,4,3,
+3,3,0,1,3,3,3,2,
+0,4,1,2,4,4,4,3,
+4,0,1,2,4,4,4,3,
+0,1,2,3,5,5,5,4,
+2,2,2,2,0,2,2,1,
+0,3,3,3,1,3,3,2,
+3,0,3,3,1,3,3,2,
+0,1,4,4,2,4,4,3,
+3,3,0,3,1,3,3,2,
+0,4,1,4,2,4,4,3,
+4,0,1,4,2,4,4,3,
+0,1,2,5,3,5,5,4,
+3,3,3,0,1,3,3,2,
+0,4,4,1,2,4,4,3,
+4,0,4,1,2,4,4,3,
+0,1,5,2,3,5,5,4,
+4,4,0,1,2,4,4,3,
+0,5,1,2,3,5,5,4,
+5,0,1,2,3,5,5,4,
+0,1,2,3,4,6,6,5,
+2,2,2,2,2,0,2,1,
+0,3,3,3,3,1,3,2,
+3,0,3,3,3,1,3,2,
+0,1,4,4,4,2,4,3,
+3,3,0,3,3,1,3,2,
+0,4,1,4,4,2,4,3,
+4,0,1,4,4,2,4,3,
+0,1,2,5,5,3,5,4,
+3,3,3,0,3,1,3,2,
+0,4,4,1,4,2,4,3,
+4,0,4,1,4,2,4,3,
+0,1,5,2,5,3,5,4,
+4,4,0,1,4,2,4,3,
+0,5,1,2,5,3,5,4,
+5,0,1,2,5,3,5,4,
+0,1,2,3,6,4,6,5,
+3,3,3,3,0,1,3,2,
+0,4,4,4,1,2,4,3,
+4,0,4,4,1,2,4,3,
+0,1,5,5,2,3,5,4,
+4,4,0,4,1,2,4,3,
+0,5,1,5,2,3,5,4,
+5,0,1,5,2,3,5,4,
+0,1,2,6,3,4,6,5,
+4,4,4,0,1,2,4,3,
+0,5,5,1,2,3,5,4,
+5,0,5,1,2,3,5,4,
+0,1,6,2,3,4,6,5,
+5,5,0,1,2,3,5,4,
+0,6,1,2,3,4,6,5,
+6,0,1,2,3,4,6,5,
+0,1,2,3,4,5,7,6,
+2,2,2,2,2,2,0,1,
+0,3,3,3,3,3,1,2,
+3,0,3,3,3,3,1,2,
+0,1,4,4,4,4,2,3,
+3,3,0,3,3,3,1,2,
+0,4,1,4,4,4,2,3,
+4,0,1,4,4,4,2,3,
+0,1,2,5,5,5,3,4,
+3,3,3,0,3,3,1,2,
+0,4,4,1,4,4,2,3,
+4,0,4,1,4,4,2,3,
+0,1,5,2,5,5,3,4,
+4,4,0,1,4,4,2,3,
+0,5,1,2,5,5,3,4,
+5,0,1,2,5,5,3,4,
+0,1,2,3,6,6,4,5,
+3,3,3,3,0,3,1,2,
+0,4,4,4,1,4,2,3,
+4,0,4,4,1,4,2,3,
+0,1,5,5,2,5,3,4,
+4,4,0,4,1,4,2,3,
+0,5,1,5,2,5,3,4,
+5,0,1,5,2,5,3,4,
+0,1,2,6,3,6,4,5,
+4,4,4,0,1,4,2,3,
+0,5,5,1,2,5,3,4,
+5,0,5,1,2,5,3,4,
+0,1,6,2,3,6,4,5,
+5,5,0,1,2,5,3,4,
+0,6,1,2,3,6,4,5,
+6,0,1,2,3,6,4,5,
+0,1,2,3,4,7,5,6,
+3,3,3,3,3,0,1,2,
+0,4,4,4,4,1,2,3,
+4,0,4,4,4,1,2,3,
+0,1,5,5,5,2,3,4,
+4,4,0,4,4,1,2,3,
+0,5,1,5,5,2,3,4,
+5,0,1,5,5,2,3,4,
+0,1,2,6,6,3,4,5,
+4,4,4,0,4,1,2,3,
+0,5,5,1,5,2,3,4,
+5,0,5,1,5,2,3,4,
+0,1,6,2,6,3,4,5,
+5,5,0,1,5,2,3,4,
+0,6,1,2,6,3,4,5,
+6,0,1,2,6,3,4,5,
+0,1,2,3,7,4,5,6,
+4,4,4,4,0,1,2,3,
+0,5,5,5,1,2,3,4,
+5,0,5,5,1,2,3,4,
+0,1,6,6,2,3,4,5,
+5,5,0,5,1,2,3,4,
+0,6,1,6,2,3,4,5,
+6,0,1,6,2,3,4,5,
+0,1,2,7,3,4,5,6,
+5,5,5,0,1,2,3,4,
+0,6,6,1,2,3,4,5,
+6,0,6,1,2,3,4,5,
+0,1,7,2,3,4,5,6,
+6,6,0,1,2,3,4,5,
+0,7,1,2,3,4,5,6,
+7,0,1,2,3,4,5,6,
+0,1,2,3,4,5,6,7
};
#define u2vmask(_m_,_tv_) _mm256_sllv_epi32(_mm256_set1_epi8(_m_), _tv_)
#define mm256_maskz_expand_epi32(_m_, _v_) _mm256_permutevar8x32_epi32(_v_, _mm256_cvtepu8_epi32(_mm_cvtsi64_si128(ctou64(permv[_m_]))) )
@@ -489,57 +471,38 @@ unsigned char *bitunpack256v32( const unsigned char *__restrict in, unsigned n,
}
//--------------------------------------- zeromask unpack for TurboPFor vp4d.c --------------------------------------
-
-//-- bitunpack used in vp4d.c ---------
#define VO32(_op_, _i_, _ov_, _nb_,_parm_) xm = *bb++; _mm256_storeu_si256(_op_++, _mm256_add_epi32(_ov_, _mm256_slli_epi32(mm256_maskz_loadu_epi32(xm,(__m256i*)pex), _nb_) )); pex += popcnt32(xm)
#define VOZ32(_op_, _i_, _ov_, _nb_,_parm_) xm = *bb++; _mm256_storeu_si256(_op_++, mm256_maskz_loadu_epi32(xm,(__m256i*)pex) ); pex += popcnt32(xm)
#define BITUNPACK0(_parm_)
#include "bitunpack_.h"
unsigned char *_bitunpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned b, unsigned *__restrict pex, unsigned char *bb) {
- const unsigned char *ip = in+PAD8(256*b);
- unsigned xm;
- __m256i zv = _mm256_setzero_si256(), sv,
- tv = _mm256_set_epi32(0,1,2,3,4,5,6,7);
+ const unsigned char *ip = in+PAD8(256*b); unsigned xm; __m256i sv, zv = _mm256_setzero_si256(), tv = _mm256_set_epi32(0,1,2,3,4,5,6,7);
BITUNPACK256V32(in, b, out, sv);
return (unsigned char *)ip;
}
-//-- bitunpack zigzag ---------
#define VOZ32(_op_, _i_, ov, _nb_,_parm_) _mm256_storeu_si256(_op_++, _parm_)
#define VO32(_op_, i, _ov_, _nb_,_sv_) _ov_ = mm256_zzagd_epi32(_ov_); _sv_ = mm256_scan_epi32(_ov_,_sv_); _mm256_storeu_si256(_op_++, _sv_)
#include "bitunpack_.h"
#define BITUNPACK0(_parm_)
unsigned char *bitzunpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b) {
const unsigned char *ip = in+PAD8(256*b);
- __m256i sv = _mm256_set1_epi32(start); //, zv = _mm256_setzero_si256();
+ __m256i sv = _mm256_set1_epi32(start);//, zv = _mm256_setzero_si256();
BITUNPACK256V32(in, b, out, sv);
return (unsigned char *)ip;
}
-//-- bitunpack xor ---------
-#define VOZ32(_op_, _i_, ov, _nb_,_parm_) _mm256_storeu_si256(_op_++, _parm_)
-#define VO32(_op_, i, _ov_, _nb_,_sv_) _sv_ = mm256_xord_epi32(_ov_,_sv_); _mm256_storeu_si256(_op_++, _sv_)
-#include "bitunpack_.h"
-#define BITUNPACK0(_parm_)
-unsigned char *bitxunpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b) {
- const unsigned char *ip = in+PAD8(256*b);
- __m256i sv = _mm256_set1_epi32(start);
- BITUNPACK256V32(in, b, out, sv);
- return (unsigned char *)ip;
-}
-//-- bitunpack delta ---------
#define VO32(_op_, i, _ov_, _nb_,_sv_) _sv_ = mm256_scan_epi32(_ov_,_sv_); _mm256_storeu_si256(_op_++, _sv_)
#include "bitunpack_.h"
#define BITUNPACK0(_parm_)
unsigned char *bitdunpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b) {
const unsigned char *ip = in+PAD8(256*b);
- __m256i sv = _mm256_set1_epi32(start); // zv = _mm256_setzero_si256();
+ __m256i sv = _mm256_set1_epi32(start);// zv = _mm256_setzero_si256();
BITUNPACK256V32(in, b, out, sv);
return (unsigned char *)ip;
}
-//-- bitunpack FOR ---------
#define VO32( _op_, _i_, _ov_, _nb_,_parm_) _mm256_storeu_si256(_op_++, _mm256_add_epi32(_ov_, sv))
#include "bitunpack_.h"
#define BITUNPACK0(_parm_)
@@ -549,8 +512,7 @@ unsigned char *bitfunpack256v32( const unsigned char *__restrict in, unsigned n,
BITUNPACK256V32(in, b, out, sv);
return (unsigned char *)ip;
}
-
-//-- bitunpack delta used in vp4d.c ---------
+//-----------------------------------------------------------------------------
#define VX32(_i_, _nb_,_ov_) xm = *bb++; _ov_ = _mm256_add_epi32(_ov_, _mm256_slli_epi32(mm256_maskz_loadu_epi32(xm,(__m256i*)pex), _nb_) ); pex += popcnt32(xm)
#define VXZ32(_i_, _nb_,_ov_) xm = *bb++; _ov_ = mm256_maskz_loadu_epi32(xm,(__m256i*)pex); pex += popcnt32(xm)
@@ -565,7 +527,6 @@ unsigned char *_bitdunpack256v32( const unsigned char *__restrict in, unsigned n
return (unsigned char *)ip;
}
-//-- bitunpack zigag used in vp4d.c ---------
#define VX32(_i_, _nb_,_ov_) xm = *bb++; _ov_ = _mm256_add_epi32(_ov_, _mm256_slli_epi32(mm256_maskz_loadu_epi32(xm,(__m256i*)pex), _nb_) ); pex += popcnt32(xm)
#define VXZ32(_i_, _nb_,_ov_) xm = *bb++; _ov_ = mm256_maskz_loadu_epi32(xm,(__m256i*)pex); pex += popcnt32(xm)
@@ -592,14 +553,12 @@ unsigned char *_bitdunpack256v32( const unsigned char *__restrict in, unsigned n
unsigned char *_bitzunpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b, unsigned *__restrict pex, unsigned char *bb) {
const unsigned char *ip = in+PAD8(256*b);
unsigned xm;
- const __m256i zv = _mm256_setzero_si256(),
- tv = _mm256_set_epi32(0,1,2,3,4,5,6,7);
+ const __m256i zv = _mm256_setzero_si256(), tv = _mm256_set_epi32(0,1,2,3,4,5,6,7);
__m256i sv = _mm256_set1_epi32(start);
BITUNPACK256V32(in, b, out, sv);
return (unsigned char *)ip;
}
-//-- bitunpack delta 1 -----------------------------
#define VO32(_op_, i, _ov_, _nb_,_sv_) _sv_ = mm256_scani_epi32(_ov_,_sv_,cv); _mm256_storeu_si256(_op_++, _sv_);
#define VOZ32(_op_, _i_, ov, _nb_,_parm_) _mm256_storeu_si256(_op_++, _parm_); _parm_ = _mm256_add_epi32(_parm_, cv)
#include "bitunpack_.h"
@@ -607,13 +566,11 @@ unsigned char *_bitzunpack256v32( const unsigned char *__restrict in, unsigned n
unsigned char *bitd1unpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b) {
const unsigned char *ip = in+PAD8(256*b);
const __m256i zv = _mm256_setzero_si256();
- __m256i sv = _mm256_set1_epi32(start),
- cv = _mm256_set_epi32(8,7,6,5,4,3,2,1);
+ __m256i sv = _mm256_set1_epi32(start), cv = _mm256_set_epi32(8,7,6,5,4,3,2,1);
BITUNPACK256V32(in, b, out, sv);
return (unsigned char *)ip;
}
-//-- bitunpack FOR 1 -----------------------------
#define VO32( _op_, _i_, _ov_, _nb_,_sv_) _mm256_storeu_si256(_op_++, _mm256_add_epi32(_ov_, _sv_)); _sv_ = _mm256_add_epi32(_sv_, cv)
#define VOZ32(_op_, _i_, ov, _nb_,_sv_) _mm256_storeu_si256(_op_++, _sv_); _sv_ = _mm256_add_epi32(_sv_, cv);
#include "bitunpack_.h"
@@ -626,31 +583,24 @@ unsigned char *bitf1unpack256v32( const unsigned char *__restrict in, unsigned n
return (unsigned char *)ip;
}
-//-- bitunpack delta 1 for vp4d.c -----------------------------
#define VO32( _op_, _i_, _ov_, _nb_,_sv_) VX32( _i_, _nb_,_ov_); _sv_ = mm256_scani_epi32(_ov_,_sv_,cv); _mm256_storeu_si256(_op_++, _sv_);
#define VOZ32(_op_, _i_, _ov_, _nb_,_sv_) VXZ32(_i_, _nb_,_ov_); _sv_ = mm256_scani_epi32(_ov_,_sv_,cv); _mm256_storeu_si256(_op_++, _sv_);
#include "bitunpack_.h"
#define BITUNPACK0(_parm_) mv = _mm256_set1_epi32(0) //_parm_ = _mm_setzero_si128()
unsigned char *_bitd1unpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b, unsigned *__restrict pex, unsigned char *bb) {
- const unsigned char *ip = in+PAD8(256*b);
- unsigned xm;
- const __m256i cv = _mm256_set_epi32(8,7,6,5,4,3,2,1),
- zv = _mm256_setzero_si256(),
- tv = _mm256_set_epi32(0,1,2,3,4,5,6,7);
+ const unsigned char *ip = in+PAD8(256*b); unsigned xm;
+ const __m256i cv = _mm256_set_epi32(8,7,6,5,4,3,2,1), zv = _mm256_setzero_si256(), tv = _mm256_set_epi32(0,1,2,3,4,5,6,7);
__m256i sv = _mm256_set1_epi32(start);
BITUNPACK256V32(in, b, out, sv);
return (unsigned char *)ip;
}
-//---------------------------------------------------- bitnunpack ---------------------------------------------------------------------
size_t bitnunpack256v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out) { uint32_t *op; _BITNUNPACKV( in, n, out, 256, 32, bitunpack256v); }
size_t bitndunpack256v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out) { uint32_t *op,start; _BITNDUNPACKV(in, n, out, 256, 32, bitdunpack256v, bitdunpack); }
size_t bitnd1unpack256v32(unsigned char *__restrict in, size_t n, uint32_t *__restrict out) { uint32_t *op,start; _BITNDUNPACKV(in, n, out, 256, 32, bitd1unpack256v, bitd1unpack); }
//size_t bitns1unpack256v32(unsigned char *__restrict in, size_t n, uint32_t *__restrict out) { uint32_t *op,start; _BITNDUNPACKV(in, n, out, 256, 32, bits1unpack256v, bitd1unpack); }
size_t bitnzunpack256v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out) { uint32_t *op,start; _BITNDUNPACKV(in, n, out, 256, 32, bitzunpack256v, bitzunpack); }
-size_t bitnxunpack256v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out) { uint32_t *op,start; _BITNDUNPACKV(in, n, out, 256, 32, bitxunpack256v, bitxunpack); }
size_t bitnfunpack256v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out) { uint32_t *op,start; _BITNDUNPACKV(in, n, out, 256, 32, bitfunpack256v, bitfunpack); }
-
#elif defined(__SSE2__) || defined(__ARM_NEON) //------------------------------ SSE2/SSSE3 ---------------------------------------------------------
#define BITMAX16 16
#define BITMAX32 32
@@ -1051,30 +1001,15 @@ ALIGNED(char, _shuffle_16[256][16],16) = {
#define BITUNPACK0(_parm_) //_parm_ = _mm_setzero_si128()
#include "bitunpack_.h"
-//--- bitunpack for vp4d.c ------------------------------
unsigned char *_bitunpack128v16( const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned b, unsigned short *__restrict pex, unsigned char *bb) {
- const unsigned char *ip = in+PAD8(128*b);
- unsigned m;
- __m128i sv;
- BITUNPACK128V16(in, b, out, sv);
- return (unsigned char *)ip;
+ const unsigned char *ip = in+PAD8(128*b); unsigned m; __m128i sv; BITUNPACK128V16(in, b, out, sv); return (unsigned char *)ip;
}
-
unsigned char *_bitunpack128v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned b, unsigned *__restrict pex, unsigned char *bb) {
- const unsigned char *ip = in+PAD8(128*b);
- unsigned m;
- __m128i sv;
- BITUNPACK128V32(in, b, out, sv);
- return (unsigned char *)ip;
+ const unsigned char *ip = in+PAD8(128*b); unsigned m; __m128i sv; BITUNPACK128V32(in, b, out, sv); return (unsigned char *)ip;
}
-
unsigned char *_bitunpack256w32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned b, unsigned *__restrict pex, unsigned char *bb) {
- const unsigned char *_in=in;
- unsigned *_out=out, m;
- __m128i sv;
- BITUNPACK128V32(in, b, out, sv);
- out = _out+128;
- in = _in+PAD8(128*b);
+ const unsigned char *_in=in; unsigned *_out=out, m; __m128i sv;
+ BITUNPACK128V32(in, b, out, sv); out = _out+128; in=_in+PAD8(128*b);
BITUNPACK128V32(in, b, out, sv);
return (unsigned char *)_in+PAD8(256*b);
}
@@ -1088,23 +1023,19 @@ unsigned char *_bitunpack256w32( const unsigned char *__restrict in, unsigned n,
#include "bitunpack_.h"
unsigned char *_bitunpack128v64( const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, unsigned b, uint32_t *__restrict pex, unsigned char *bb) {
- const unsigned char *ip = in+PAD8(128*b);
- unsigned m;
- __m128i zv = _mm_setzero_si128();
- BITUNPACK128V32(in, b, out, 0);
- return (unsigned char *)ip;
+ const unsigned char *ip = in+PAD8(128*b); unsigned m; __m128i zv = _mm_setzero_si128(); BITUNPACK128V32(in, b, out, 0); return (unsigned char *)ip;
}
+#define BITMAX16 16
+#define BITMAX32 32
+
#undef VO32
#undef VOZ32
#undef VO16
#undef VOZ16
#undef BITUNPACK0
-//--------------------------------------------------------------------------------------------------------------------------------------------
-#define BITMAX16 16
-#define BITMAX32 32
-//--- bitunpack zigzag --------------------
+//-------------------------------------------------------------------
#define VOZ16(_op_, _i_, _ov_, _nb_,_parm_) _mm_storeu_si128(_op_++, _parm_)
#define VOZ32(_op_, _i_, _ov_, _nb_,_parm_) _mm_storeu_si128(_op_++, _parm_)
#define VO16( _op_, _i_, _ov_, _nb_,_sv_) _ov_ = mm_zzagd_epi16(_ov_); _sv_ = mm_scan_epi16(_ov_,_sv_); _mm_storeu_si128(_op_++, _sv_)
@@ -1112,77 +1043,38 @@ unsigned char *_bitunpack128v64( const unsigned char *__restrict in, unsigned n,
#include "bitunpack_.h"
#define BITUNPACK0(_parm_)
unsigned char *bitzunpack128v16( const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned short start, unsigned b) {
- const unsigned char *ip = in+PAD8(128*b);
- __m128i sv = _mm_set1_epi16(start);
- BITUNPACK128V16(in, b, out, sv);
- return (unsigned char *)ip;
+ const unsigned char *ip = in+PAD8(128*b); __m128i sv = _mm_set1_epi16(start); BITUNPACK128V16(in, b, out, sv); return (unsigned char *)ip;
}
-
unsigned char *bitzunpack128v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b) {
- const unsigned char *ip = in+PAD8(128*b);
- __m128i sv = _mm_set1_epi32(start);
- BITUNPACK128V32(in, b, out, sv);
- return (unsigned char *)ip;
-}
-
-#define VO16( _op_, _i_, _ov_, _nb_,_sv_) _sv_ = mm_xord_epi16(_ov_,_sv_); _mm_storeu_si128(_op_++, _sv_)
-#define VO32( _op_, _i_, _ov_, _nb_,_sv_) _sv_ = mm_xord_epi32(_ov_,_sv_); _mm_storeu_si128(_op_++, _sv_)
-#include "bitunpack_.h"
-#define BITUNPACK0(_parm_)
-unsigned char *bitxunpack128v16( const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned short start, unsigned b) {
- const unsigned char *ip = in+PAD8(128*b);
- __m128i sv = _mm_set1_epi16(start);
- BITUNPACK128V16(in, b, out, sv);
- return (unsigned char *)ip;
-}
-
-unsigned char *bitxunpack128v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b) {
- const unsigned char *ip = in+PAD8(128*b);
- __m128i sv = _mm_set1_epi32(start);
- BITUNPACK128V32(in, b, out, sv);
- return (unsigned char *)ip;
+ const unsigned char *ip = in+PAD8(128*b); __m128i sv = _mm_set1_epi32(start); BITUNPACK128V32(in, b, out, sv); return (unsigned char *)ip;
}
-//-- bitunpack delta ------------------------------
#define VO32(_op_, i, _ov_, _nb_,_sv_) _sv_ = mm_scan_epi32(_ov_,_sv_); _mm_storeu_si128(_op_++, _sv_)
#define VO16(_op_, i, _ov_, _nb_,_sv_) _sv_ = mm_scan_epi16(_ov_,_sv_); _mm_storeu_si128(_op_++, _sv_)
#include "bitunpack_.h"
#define BITUNPACK0(_parm_)
unsigned char *bitdunpack128v16( const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out, uint16_t start, unsigned b) {
- const unsigned char *ip = in+PAD8(128*b);
- __m128i sv = _mm_set1_epi16(start);
- BITUNPACK128V16(in, b, out, sv);
- return (unsigned char *)ip;
+ const unsigned char *ip = in+PAD8(128*b); __m128i sv = _mm_set1_epi16(start); BITUNPACK128V16(in, b, out, sv); return (unsigned char *)ip;
}
-
unsigned char *bitdunpack128v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b) {
const unsigned char *ip = in+PAD8(128*b); __m128i sv = _mm_set1_epi32(start); BITUNPACK128V32(in, b, out, sv); return (unsigned char *)ip;
}
-//-- bitunpack FOR ----------------------------
#define VO32( _op_, _i_, _ov_, _nb_,_parm_) _mm_storeu_si128(_op_++, _mm_add_epi32(_ov_, sv))
#define VO16( _op_, _i_, _ov_, _nb_,_parm_) _mm_storeu_si128(_op_++, _mm_add_epi16(_ov_, sv))
#include "bitunpack_.h"
#define BITUNPACK0(_parm_)
unsigned char *bitfunpack128v16( const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out, uint16_t start, unsigned b) {
- const unsigned char *ip = in+PAD8(128*b);
- __m128i sv = _mm_set1_epi16(start);
- BITUNPACK128V16(in, b, out, sv);
- return (unsigned char *)ip;
+ const unsigned char *ip = in+PAD8(128*b); __m128i sv = _mm_set1_epi16(start); BITUNPACK128V16(in, b, out, sv); return (unsigned char *)ip;
}
-
unsigned char *bitfunpack128v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b) {
- const unsigned char *ip = in+PAD8(128*b);
- __m128i sv = _mm_set1_epi32(start);
- BITUNPACK128V32(in, b, out, sv);
- return (unsigned char *)ip;
+ const unsigned char *ip = in+PAD8(128*b); __m128i sv = _mm_set1_epi32(start); BITUNPACK128V32(in, b, out, sv); return (unsigned char *)ip;
}
#if defined(__SSSE3__) || defined(__ARM_NEON)
#define BITMAX16 15
#define BITMAX32 31
-//-- bitunpack delta used in vp4d.c ---------
#define VX32(_i_, _nb_,_ov_) if(!((_i_) & 1)) m = (*bb) & 0xf;else m = (*bb++) >> 4; _ov_ = _mm_add_epi32(_ov_, _mm_shuffle_epi8( mm_slli_epi32(_mm_loadu_si128((__m128i*)pex), _nb_), _mm_loadu_si128((__m128i*)_shuffle_32[m]))); pex += popcnt32(m)
#define VXZ32(_i_, _nb_,_ov_) if(!((_i_) & 1)) m = (*bb) & 0xf;else m = (*bb++) >> 4; _ov_ = _mm_shuffle_epi8( _mm_loadu_si128((__m128i*)pex), _mm_loadu_si128((__m128i*)_shuffle_32[m])); pex += popcnt32(m)
#define VO32( _op_, _i_, _ov_, _nb_,_sv_) VX32( _i_, _nb_,_ov_); _sv_ = mm_scan_epi32(_ov_,_sv_); _mm_storeu_si128(_op_++, _sv_);
@@ -1195,18 +1087,10 @@ unsigned char *bitfunpack128v32( const unsigned char *__restrict in, unsigned n,
#include "bitunpack_.h"
#define BITUNPACK0(_parm_)
unsigned char *_bitdunpack128v16( const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned short start, unsigned b, unsigned short *__restrict pex, unsigned char *bb) {
- const unsigned char *ip = in+PAD8(128*b);
- unsigned m;
- __m128i sv = _mm_set1_epi16(start);
- BITUNPACK128V16(in, b, out, sv);
- return (unsigned char *)ip;
+ const unsigned char *ip = in+PAD8(128*b); unsigned m; __m128i sv = _mm_set1_epi16(start); BITUNPACK128V16(in, b, out, sv); return (unsigned char *)ip;
}
-
unsigned char *_bitdunpack128v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b, unsigned *__restrict pex, unsigned char *bb) {
- const unsigned char *ip = in+PAD8(128*b);
- unsigned m; __m128i sv = _mm_set1_epi32(start);
- BITUNPACK128V32(in, b, out, sv);
- return (unsigned char *)ip;
+ const unsigned char *ip = in+PAD8(128*b); unsigned m; __m128i sv = _mm_set1_epi32(start); BITUNPACK128V32(in, b, out, sv); return (unsigned char *)ip;
}
/*
@@ -1216,7 +1100,6 @@ unsigned char *_bitdunpack128v64( const unsigned char *__restrict in, unsigned n
const unsigned char *ip = in+PAD8(128*b); unsigned m; __m128i sv = _mm_set1_epi32(start),zv = _mm_setzero_si128(); BITUNPACK128V32(in, b, out, sv); return (unsigned char *)ip;
}*/
-//-- bitunpack zigzag used in vp4d.c --------------------------
#define VX16(_i_, _nb_,_ov_) m = *bb++; _ov_ = _mm_add_epi16(_ov_, _mm_shuffle_epi8( mm_slli_epi16(_mm_loadu_si128((__m128i*)pex), _nb_), _mm_loadu_si128((__m128i*)_shuffle_16[m]) ) ); pex += popcnt32(m)
#define VXZ16(_i_, _nb_,_ov_) m = *bb++; _ov_ = _mm_shuffle_epi8( _mm_loadu_si128((__m128i*)pex), _mm_loadu_si128((__m128i*)_shuffle_16[m]) ); pex += popcnt32(m)
#define VO16( _op_, _i_, _ov_, _nb_,_sv_) VX16( _i_, _nb_,_ov_); _ov_ = mm_zzagd_epi16(_ov_); _sv_ = mm_scan_epi16(_ov_,_sv_); _mm_storeu_si128(_op_++, _sv_);
@@ -1230,47 +1113,29 @@ unsigned char *_bitdunpack128v64( const unsigned char *__restrict in, unsigned n
#include "bitunpack_.h"
#define BITUNPACK0(_parm_)
unsigned char *_bitzunpack128v16( const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned short start, unsigned b, unsigned short *__restrict pex, unsigned char *bb) {
- const unsigned char *ip = in+PAD8(128*b);
- unsigned m; __m128i sv = _mm_set1_epi16(start);
- BITUNPACK128V16(in, b, out, sv);
- return (unsigned char *)ip;
+ const unsigned char *ip = in+PAD8(128*b); unsigned m; __m128i sv = _mm_set1_epi16(start); BITUNPACK128V16(in, b, out, sv); return (unsigned char *)ip;
}
unsigned char *_bitzunpack128v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b, unsigned *__restrict pex, unsigned char *bb) {
- const unsigned char *ip = in+PAD8(128*b);
- unsigned m; __m128i sv = _mm_set1_epi32(start);
- BITUNPACK128V32(in, b, out, sv);
- return (unsigned char *)ip;
+ const unsigned char *ip = in+PAD8(128*b); unsigned m; __m128i sv = _mm_set1_epi32(start); BITUNPACK128V32(in, b, out, sv); return (unsigned char *)ip;
}
#define BITMAX16 16
#define BITMAX32 32
#endif
-//-- bitunpack delta 1 ------------------------------
#define VO16(_op_, i, _ov_, _nb_,_sv_) _sv_ = mm_scani_epi16(_ov_,_sv_,cv); _mm_storeu_si128(_op_++, _sv_);
#define VO32(_op_, i, _ov_, _nb_,_sv_) _sv_ = mm_scani_epi32(_ov_,_sv_,cv); _mm_storeu_si128(_op_++, _sv_);
#define VOZ16(_op_, _i_, ov, _nb_,_parm_) _mm_storeu_si128(_op_++, _parm_); _parm_ = _mm_add_epi16(_parm_, cv)
#define VOZ32(_op_, _i_, ov, _nb_,_parm_) _mm_storeu_si128(_op_++, _parm_); _parm_ = _mm_add_epi32(_parm_, cv)
#include "bitunpack_.h"
#define BITUNPACK0(_parm_) _parm_ = _mm_add_epi16(_parm_, cv); cv = _mm_set1_epi16(8)
-
unsigned char *bitd1unpack128v16( const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out, uint16_t start, unsigned b) {
- const unsigned char *ip = in+PAD8(128*b);
- __m128i sv = _mm_set1_epi16(start),
- cv = _mm_set_epi16(8,7,6,5,4,3,2,1);
- BITUNPACK128V16(in, b, out, sv);
- return (unsigned char *)ip;
+ const unsigned char *ip = in+PAD8(128*b); __m128i sv = _mm_set1_epi16(start), cv = _mm_set_epi16(8,7,6,5,4,3,2,1); BITUNPACK128V16(in, b, out, sv); return (unsigned char *)ip;
}
-
#define BITUNPACK0(_parm_) _parm_ = _mm_add_epi32(_parm_, cv); cv = _mm_set1_epi32(4)
unsigned char *bitd1unpack128v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b) {
- const unsigned char *ip = in+PAD8(128*b);
- __m128i sv = _mm_set1_epi32(start),
- cv = _mm_set_epi32(4,3,2,1);
- BITUNPACK128V32(in, b, out, sv);
- return (unsigned char *)ip;
+ const unsigned char *ip = in+PAD8(128*b); __m128i sv = _mm_set1_epi32(start), cv = _mm_set_epi32(4,3,2,1); BITUNPACK128V32(in, b, out, sv); return (unsigned char *)ip;
}
-//-- bitunpack sub 1 ------------------------------
#define VO16(_op_, i, _ov_, _nb_,_sv_) ADDI16x8(_ov_,_sv_,cv); _mm_storeu_si128(_op_++, _sv_);
#define VO32(_op_, i, _ov_, _nb_,_sv_) ADDI32x4(_ov_,_sv_,cv); _mm_storeu_si128(_op_++, _sv_);
#define VOZ16(_op_, _i_, ov, _nb_,_parm_) _mm_storeu_si128(_op_++, _parm_); _parm_ = _mm_add_epi16(_parm_, cv)
@@ -1278,48 +1143,29 @@ unsigned char *bitd1unpack128v32( const unsigned char *__restrict in, unsigned n
#include "bitunpack_.h"
#define BITUNPACK0(_parm_) _parm_ = _mm_add_epi16(_parm_, cv); cv = _mm_set1_epi16(8)
unsigned char *bits1unpack128v16( const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out, uint16_t start, unsigned b) {
- const unsigned char *ip = in+PAD8(128*b);
- __m128i sv = _mm_set1_epi16(start),
- cv = _mm_set1_epi16(8);
- BITUNPACK128V16(in, b, out, sv);
- return (unsigned char *)ip;
+ const unsigned char *ip = in+PAD8(128*b); __m128i sv = _mm_set1_epi16(start), cv = _mm_set1_epi16(8); BITUNPACK128V16(in, b, out, sv); return (unsigned char *)ip;
}
-
#define BITUNPACK0(_parm_) _parm_ = _mm_add_epi32(_parm_, cv); cv = _mm_set1_epi32(4)
unsigned char *bits1unpack128v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b) {
- const unsigned char *ip = in+PAD8(128*b);
- __m128i sv = _mm_set1_epi32(start),
- cv = _mm_set1_epi32(4);
- BITUNPACK128V32(in, b, out, sv);
- return (unsigned char *)ip;
+ const unsigned char *ip = in+PAD8(128*b); __m128i sv = _mm_set1_epi32(start), cv = _mm_set1_epi32(4); BITUNPACK128V32(in, b, out, sv); return (unsigned char *)ip;
}
-//-- bitunpack FOR 1 ------------------
#define VO16( _op_, _i_, _ov_, _nb_,_sv_) _mm_storeu_si128(_op_++, _mm_add_epi16(_ov_, _sv_)); _sv_ = _mm_add_epi16(_sv_, cv)
#define VO32( _op_, _i_, _ov_, _nb_,_sv_) _mm_storeu_si128(_op_++, _mm_add_epi32(_ov_, _sv_)); _sv_ = _mm_add_epi32(_sv_, cv)
#define VOZ32(_op_, _i_, _ov_, _nb_,_sv_) _mm_storeu_si128(_op_++, _sv_); _sv_ = _mm_add_epi32(_sv_, cv);
#include "bitunpack_.h"
#define BITUNPACK0(_parm_)
unsigned char *bitf1unpack128v16( const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned short start, unsigned b) {
- const unsigned char *ip = in+PAD8(128*b);
- __m128i sv = _mm_set_epi16(start+8,start+7,start+6,start+5,start+4,start+3,start+2,start+1),
- cv = _mm_set1_epi16(8);
- BITUNPACK128V16(in, b, out, sv);
- return (unsigned char *)ip;
+ const unsigned char *ip = in+PAD8(128*b); __m128i sv = _mm_set_epi16(start+8,start+7,start+6,start+5,start+4,start+3,start+2,start+1), cv = _mm_set1_epi16(8); BITUNPACK128V16(in, b, out, sv); return (unsigned char *)ip;
}
unsigned char *bitf1unpack128v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b) {
- const unsigned char *ip = in+PAD8(128*b);
- __m128i sv = _mm_set_epi32(start+4,start+3,start+2,start+1),
- cv = _mm_set1_epi32(4);
- BITUNPACK128V32(in, b, out, sv);
- return (unsigned char *)ip;
+ const unsigned char *ip = in+PAD8(128*b); __m128i sv = _mm_set_epi32(start+4,start+3,start+2,start+1), cv = _mm_set1_epi32(4); BITUNPACK128V32(in, b, out, sv); return (unsigned char *)ip;
}
#if defined(__SSSE3__) || defined(__ARM_NEON)
#define BITMAX16 15
#define BITMAX32 31
-//-- bitunpack delta 1 for vp4d.c -----------------------
#define VX16(_i_, _nb_,_ov_) m = *bb++; _ov_ = _mm_add_epi16(_ov_, _mm_shuffle_epi8( mm_slli_epi16(_mm_loadu_si128((__m128i*)pex), _nb_), _mm_loadu_si128((__m128i*)_shuffle_16[m]))); pex += popcnt32(m)
#define VX32(_i_, _nb_,_ov_) if(!((_i_) & 1)) m = (*bb) & 0xf;else m = (*bb++) >> 4; _ov_ = _mm_add_epi32(_ov_, _mm_shuffle_epi8( mm_slli_epi32(_mm_loadu_si128((__m128i*)pex), _nb_), _mm_loadu_si128((__m128i*)_shuffle_32[m]))); pex += popcnt32(m)
#define VXZ16(_i_, _nb_,_ov_) m = *bb++; _ov_ = _mm_shuffle_epi8( _mm_loadu_si128((__m128i*)pex), _mm_loadu_si128((__m128i*)_shuffle_16[m])); pex += popcnt32(m)
@@ -1333,20 +1179,13 @@ unsigned char *bitf1unpack128v32( const unsigned char *__restrict in, unsigned n
#include "bitunpack_.h"
#define BITUNPACK0(_parm_) mv = _mm_setzero_si128() //_parm_ = _mm_setzero_si128()
unsigned char *_bitd1unpack128v16( const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned short start, unsigned b, unsigned short *__restrict pex, unsigned char *bb) {
- const unsigned char *ip = in+PAD8(128*b);
- unsigned m;
- __m128i sv = _mm_set1_epi16(start),
- cv = _mm_set_epi16(8,7,6,5,4,3,2,1);
- BITUNPACK128V16(in, b, out, sv);
- return (unsigned char *)ip;
+ const unsigned char *ip = in+PAD8(128*b); unsigned m; __m128i sv = _mm_set1_epi16(start), cv = _mm_set_epi16(8,7,6,5,4,3,2,1); BITUNPACK128V16(in, b, out, sv); return (unsigned char *)ip;
}
-
#define BITUNPACK0(_parm_) mv = _mm_setzero_si128()
unsigned char *_bitd1unpack128v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b, unsigned *__restrict pex, unsigned char *bb) {
const unsigned char *ip = in+PAD8(128*b); unsigned m; __m128i sv = _mm_set1_epi32(start), cv = _mm_set_epi32( 4,3,2,1); BITUNPACK128V32(in, b, out, sv); return (unsigned char *)ip;
}
-//-- bitunpack sub 1 -----------------------
#define VO16( _op_, _i_, _ov_, _nb_,_sv_) VX16( _i_, _nb_,_ov_); ADDI16x8(_ov_,_sv_,cv); _mm_storeu_si128(_op_++, _sv_);
#define VOZ16(_op_, _i_, _ov_, _nb_,_sv_) VXZ16( _i_, _nb_,_ov_); ADDI16x8(_ov_,_sv_,cv); _mm_storeu_si128(_op_++, _sv_);
#define VO32( _op_, _i_, _ov_, _nb_,_sv_) VX32( _i_, _nb_,_ov_); ADDI32x4(_ov_,_sv_,cv); _mm_storeu_si128(_op_++, _sv_);
@@ -1355,28 +1194,16 @@ unsigned char *_bitd1unpack128v32( const unsigned char *__restrict in, unsigned
#include "bitunpack_.h"
#define BITUNPACK0(_parm_) mv = _mm_setzero_si128() //_parm_ = _mm_setzero_si128()
unsigned char *_bits1unpack128v16( const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned short start, unsigned b, unsigned short *__restrict pex, unsigned char *bb) {
- const unsigned char *ip = in+PAD8(128*b);
- unsigned m;
- __m128i sv = _mm_set1_epi16(start),
- cv = _mm_set1_epi16(8);
- BITUNPACK128V16(in, b, out, sv);
- return (unsigned char *)ip;
+ const unsigned char *ip = in+PAD8(128*b); unsigned m; __m128i sv = _mm_set1_epi16(start), cv = _mm_set1_epi16(8); BITUNPACK128V16(in, b, out, sv); return (unsigned char *)ip;
}
-
#define BITUNPACK0(_parm_) mv = _mm_setzero_si128()
unsigned char *_bits1unpack128v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b, unsigned *__restrict pex, unsigned char *bb) {
- const unsigned char *ip = in+PAD8(128*b);
- unsigned m;
- __m128i sv = _mm_set1_epi32(start),
- cv = _mm_set1_epi32(4);
- BITUNPACK128V32(in, b, out, sv);
- return (unsigned char *)ip;
+ const unsigned char *ip = in+PAD8(128*b); unsigned m; __m128i sv = _mm_set1_epi32(start), cv = _mm_set1_epi32(4); BITUNPACK128V32(in, b, out, sv); return (unsigned char *)ip;
}
#define BITMAX16 16
#define BITMAX32 32
#endif
-//--------------------------------------------------- bitnunpack ------------------------------------------------------------------------------------------------------------------
size_t bitnunpack128v16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out) { uint16_t *op; _BITNUNPACKV( in, n, out, 128, 16, bitunpack128v); }
size_t bitnunpack128v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out) { uint32_t *op; _BITNUNPACKV( in, n, out, 128, 32, bitunpack128v); }
size_t bitnunpack128v64( unsigned char *__restrict in, size_t n, uint64_t *__restrict out) { uint64_t *op; _BITNUNPACKV( in, n, out, 128, 64, bitunpack128v); }
@@ -1394,14 +1221,11 @@ size_t bitns1unpack128v32(unsigned char *__restrict in, size_t n, uint32_t *__re
size_t bitnzunpack128v16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out) { uint16_t *op,start; _BITNDUNPACKV(in, n, out, 128, 16, bitzunpack128v, bitzunpack); }
size_t bitnzunpack128v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out) { uint32_t *op,start; _BITNDUNPACKV(in, n, out, 128, 32, bitzunpack128v, bitzunpack); }
-size_t bitnxunpack128v16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out) { uint16_t *op,start; _BITNDUNPACKV(in, n, out, 128, 16, bitxunpack128v, bitxunpack); }
-size_t bitnxunpack128v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out) { uint32_t *op,start; _BITNDUNPACKV(in, n, out, 128, 32, bitxunpack128v, bitxunpack); }
-
size_t bitnfunpack128v16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out) { uint16_t *op,start; _BITNDUNPACKV(in, n, out, 128, 16, bitfunpack128v, bitfunpack); }
size_t bitnfunpack128v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out) { uint32_t *op,start; _BITNDUNPACKV(in, n, out, 128, 32, bitfunpack128v, bitfunpack); }
#endif
-//#endif
+#endif
#pragma clang diagnostic pop
#pragma GCC pop_options
diff --git a/src/ext/for/bitunpack_.h b/src/ext/for/bitunpack_.h
index 1e70ab40..cebbbe9f 100644
--- a/src/ext/for/bitunpack_.h
+++ b/src/ext/for/bitunpack_.h
@@ -1,6 +1,6 @@
/**
- Copyright (C) powturbo 2013-2023
- SPDX-License-Identifier: GPL v2 License
+ Copyright (C) powturbo 2013-2017
+ GPL v2 License
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -3103,547 +3103,543 @@
BITUNBLK64_64(ip, 31, op, nb,parm); OPI(op, nb,parm); ip += 64*4/sizeof(ip[0]);\
}
-#define BU(_b_,_usize_) unsigned char *in_=in+PAD8(n*_b_),*ip, bin[PAD8(64*_b_)+1]; T3(uint,_usize_,_t) *out_=out+n,bout[64],*op; \
- do { ip=in+PAD8(32*_b_); op = out+32; if(op > out_) { memcpy(bin, in, in_-in); ip = NULL; in = bin; out = bout; } T2(BITUNPACK64_,_b_)(in, out, _b_,start); PREFETCH(in+384,0); in = ip; out = op; \
-} while(out < out_); if(!ip) { op-=32; memcpy(op,bout,(out_-op)*(_usize_/8)); } return in_
-
#ifndef DELTA
#define USIZE 8
-unsigned char *T2(_BITUNPACK_,8_0)(const unsigned char *__restrict in, unsigned n, uint8_t *__restrict out ) { BU(0,8); }
-unsigned char *T2(_BITUNPACK_,8_1)(const unsigned char *__restrict in, unsigned n, uint8_t *__restrict out ) { BU(1,8); }
-unsigned char *T2(_BITUNPACK_,8_2)(const unsigned char *__restrict in, unsigned n, uint8_t *__restrict out ) { BU(2,8); }
-unsigned char *T2(_BITUNPACK_,8_3)(const unsigned char *__restrict in, unsigned n, uint8_t *__restrict out ) { BU(3,8); }
-unsigned char *T2(_BITUNPACK_,8_4)(const unsigned char *__restrict in, unsigned n, uint8_t *__restrict out ) { BU(4,8); }
-unsigned char *T2(_BITUNPACK_,8_5)(const unsigned char *__restrict in, unsigned n, uint8_t *__restrict out ) { BU(5,8); }
-unsigned char *T2(_BITUNPACK_,8_6)(const unsigned char *__restrict in, unsigned n, uint8_t *__restrict out ) { BU(6,8); }
-unsigned char *T2(_BITUNPACK_,8_7)(const unsigned char *__restrict in, unsigned n, uint8_t *__restrict out ) { BU(7,8); }
-unsigned char *T2(_BITUNPACK_,8_8)(const unsigned char *__restrict in, unsigned n, uint8_t *__restrict out ) { BU(8,8); }
-BITUNPACK_F8 T2(_BITUNPACK_,a8)[] = {
- &T2(_BITUNPACK_,8_0),
- &T2(_BITUNPACK_,8_1),
- &T2(_BITUNPACK_,8_2),
- &T2(_BITUNPACK_,8_3),
- &T2(_BITUNPACK_,8_4),
- &T2(_BITUNPACK_,8_5),
- &T2(_BITUNPACK_,8_6),
- &T2(_BITUNPACK_,8_7),
- &T2(_BITUNPACK_,8_8)
+unsigned char *TEMPLATE2(_BITUNPACK_,8_0)(const unsigned char *__restrict in, unsigned n, uint8_t *__restrict out ) { unsigned char *in_=in+PAD8(n*0); const uint8_t *out_ = out+n; do { BITUNPACK64_0( in, out, 0,start); PREFETCH(in+512,0); } while(out<out_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,8_1)(const unsigned char *__restrict in, unsigned n, uint8_t *__restrict out ) { unsigned char *in_=in+PAD8(n*1); do { BITUNPACK64_1( in, out, 1,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,8_2)(const unsigned char *__restrict in, unsigned n, uint8_t *__restrict out ) { unsigned char *in_=in+PAD8(n*2); do { BITUNPACK64_2( in, out, 2,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,8_3)(const unsigned char *__restrict in, unsigned n, uint8_t *__restrict out ) { unsigned char *in_=in+PAD8(n*3); do { BITUNPACK64_3( in, out, 3,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,8_4)(const unsigned char *__restrict in, unsigned n, uint8_t *__restrict out ) { unsigned char *in_=in+PAD8(n*4); do { BITUNPACK64_4( in, out, 4,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,8_5)(const unsigned char *__restrict in, unsigned n, uint8_t *__restrict out ) { unsigned char *in_=in+PAD8(n*5); do { BITUNPACK64_5( in, out, 5,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,8_6)(const unsigned char *__restrict in, unsigned n, uint8_t *__restrict out ) { unsigned char *in_=in+PAD8(n*6); do { BITUNPACK64_6( in, out, 6,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,8_7)(const unsigned char *__restrict in, unsigned n, uint8_t *__restrict out ) { unsigned char *in_=in+PAD8(n*7); do { BITUNPACK64_7( in, out, 7,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,8_8)(const unsigned char *__restrict in, unsigned n, uint8_t *__restrict out ) { unsigned char *in_=in+PAD8(n*8); do { BITUNPACK64_8( in, out, 8,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+BITUNPACK_F8 TEMPLATE2(_BITUNPACK_,a8)[] = {
+ &TEMPLATE2(_BITUNPACK_,8_0),
+ &TEMPLATE2(_BITUNPACK_,8_1),
+ &TEMPLATE2(_BITUNPACK_,8_2),
+ &TEMPLATE2(_BITUNPACK_,8_3),
+ &TEMPLATE2(_BITUNPACK_,8_4),
+ &TEMPLATE2(_BITUNPACK_,8_5),
+ &TEMPLATE2(_BITUNPACK_,8_6),
+ &TEMPLATE2(_BITUNPACK_,8_7),
+ &TEMPLATE2(_BITUNPACK_,8_8)
};
-unsigned char *T2(_BITUNPACK_,8)( const unsigned char *__restrict in, unsigned n, uint8_t *__restrict out , unsigned b) { return T2(_BITUNPACK_,a8)[ b](in, n, out); }
+unsigned char *TEMPLATE2(_BITUNPACK_,8)( const unsigned char *__restrict in, unsigned n, uint8_t *__restrict out , unsigned b) { return TEMPLATE2(_BITUNPACK_,a8)[ b](in, n, out); }
#define USIZE 16
-unsigned char *T2(_BITUNPACK_,16_0 )(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out ) { BU( 0,16); }
-unsigned char *T2(_BITUNPACK_,16_1 )(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out ) { BU( 1,16); }
-unsigned char *T2(_BITUNPACK_,16_2 )(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out ) { BU( 2,16); }
-unsigned char *T2(_BITUNPACK_,16_3 )(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out ) { BU( 3,16); }
-unsigned char *T2(_BITUNPACK_,16_4 )(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out ) { BU( 4,16); }
-unsigned char *T2(_BITUNPACK_,16_5 )(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out ) { BU( 5,16); }
-unsigned char *T2(_BITUNPACK_,16_6 )(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out ) { BU( 6,16); }
-unsigned char *T2(_BITUNPACK_,16_7 )(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out ) { BU( 7,16); }
-unsigned char *T2(_BITUNPACK_,16_8 )(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out ) { BU( 8,16); }
-unsigned char *T2(_BITUNPACK_,16_9 )(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out ) { BU( 9,16); }
-unsigned char *T2(_BITUNPACK_,16_10)(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out ) { BU(10,16); }
-unsigned char *T2(_BITUNPACK_,16_11)(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out ) { BU(11,16); }
-unsigned char *T2(_BITUNPACK_,16_12)(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out ) { BU(12,16); }
-unsigned char *T2(_BITUNPACK_,16_13)(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out ) { BU(13,16); }
-unsigned char *T2(_BITUNPACK_,16_14)(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out ) { BU(14,16); }
-unsigned char *T2(_BITUNPACK_,16_15)(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out ) { BU(15,16); }
-unsigned char *T2(_BITUNPACK_,16_16)(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out ) { BU(16,16); }
-BITUNPACK_F16 T2(_BITUNPACK_,a16)[] = {
- &T2(_BITUNPACK_,16_0),
- &T2(_BITUNPACK_,16_1),
- &T2(_BITUNPACK_,16_2),
- &T2(_BITUNPACK_,16_3),
- &T2(_BITUNPACK_,16_4),
- &T2(_BITUNPACK_,16_5),
- &T2(_BITUNPACK_,16_6),
- &T2(_BITUNPACK_,16_7),
- &T2(_BITUNPACK_,16_8),
- &T2(_BITUNPACK_,16_9),
- &T2(_BITUNPACK_,16_10),
- &T2(_BITUNPACK_,16_11),
- &T2(_BITUNPACK_,16_12),
- &T2(_BITUNPACK_,16_13),
- &T2(_BITUNPACK_,16_14),
- &T2(_BITUNPACK_,16_15),
- &T2(_BITUNPACK_,16_16)
+unsigned char *TEMPLATE2(_BITUNPACK_,16_0)(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out ) { unsigned char *in_=in+PAD8(n*0); const uint16_t *out_ = out+n; do { BITUNPACK64_0( in, out, 0,start); PREFETCH(in+512,0); } while(out<out_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,16_1)(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out ) { unsigned char *in_=in+PAD8(n*1); do { BITUNPACK64_1( in, out, 1,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,16_2)(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out ) { unsigned char *in_=in+PAD8(n*2); do { BITUNPACK64_2( in, out, 2,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,16_3)(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out ) { unsigned char *in_=in+PAD8(n*3); do { BITUNPACK64_3( in, out, 3,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,16_4)(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out ) { unsigned char *in_=in+PAD8(n*4); do { BITUNPACK64_4( in, out, 4,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,16_5)(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out ) { unsigned char *in_=in+PAD8(n*5); do { BITUNPACK64_5( in, out, 5,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,16_6)(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out ) { unsigned char *in_=in+PAD8(n*6); do { BITUNPACK64_6( in, out, 6,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,16_7)(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out ) { unsigned char *in_=in+PAD8(n*7); do { BITUNPACK64_7( in, out, 7,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,16_8)(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out ) { unsigned char *in_=in+PAD8(n*8); do { BITUNPACK64_8( in, out, 8,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,16_9)(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out ) { unsigned char *in_=in+PAD8(n*9); do { BITUNPACK64_9( in, out, 9,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,16_10)(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out ) { unsigned char *in_=in+PAD8(n*10); do { BITUNPACK64_10( in, out, 10,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,16_11)(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out ) { unsigned char *in_=in+PAD8(n*11); do { BITUNPACK64_11( in, out, 11,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,16_12)(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out ) { unsigned char *in_=in+PAD8(n*12); do { BITUNPACK64_12( in, out, 12,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,16_13)(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out ) { unsigned char *in_=in+PAD8(n*13); do { BITUNPACK64_13( in, out, 13,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,16_14)(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out ) { unsigned char *in_=in+PAD8(n*14); do { BITUNPACK64_14( in, out, 14,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,16_15)(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out ) { unsigned char *in_=in+PAD8(n*15); do { BITUNPACK64_15( in, out, 15,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,16_16)(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out ) { unsigned char *in_=in+PAD8(n*16); do { BITUNPACK64_16( in, out, 16,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+BITUNPACK_F16 TEMPLATE2(_BITUNPACK_,a16)[] = {
+ &TEMPLATE2(_BITUNPACK_,16_0),
+ &TEMPLATE2(_BITUNPACK_,16_1),
+ &TEMPLATE2(_BITUNPACK_,16_2),
+ &TEMPLATE2(_BITUNPACK_,16_3),
+ &TEMPLATE2(_BITUNPACK_,16_4),
+ &TEMPLATE2(_BITUNPACK_,16_5),
+ &TEMPLATE2(_BITUNPACK_,16_6),
+ &TEMPLATE2(_BITUNPACK_,16_7),
+ &TEMPLATE2(_BITUNPACK_,16_8),
+ &TEMPLATE2(_BITUNPACK_,16_9),
+ &TEMPLATE2(_BITUNPACK_,16_10),
+ &TEMPLATE2(_BITUNPACK_,16_11),
+ &TEMPLATE2(_BITUNPACK_,16_12),
+ &TEMPLATE2(_BITUNPACK_,16_13),
+ &TEMPLATE2(_BITUNPACK_,16_14),
+ &TEMPLATE2(_BITUNPACK_,16_15),
+ &TEMPLATE2(_BITUNPACK_,16_16)
};
-unsigned char *T2(_BITUNPACK_,16)( const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out , unsigned b) { return T2(_BITUNPACK_,a16)[ b](in, n, out); }
+unsigned char *TEMPLATE2(_BITUNPACK_,16)( const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out , unsigned b) { return TEMPLATE2(_BITUNPACK_,a16)[ b](in, n, out); }
#define USIZE 32
-unsigned char *T2(_BITUNPACK_,32_0 )(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out ) { BU( 0,32); }
-unsigned char *T2(_BITUNPACK_,32_1 )(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out ) { BU( 1,32); }
-unsigned char *T2(_BITUNPACK_,32_2 )(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out ) { BU( 2,32); }
-unsigned char *T2(_BITUNPACK_,32_3 )(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out ) { BU( 3,32); }
-unsigned char *T2(_BITUNPACK_,32_4 )(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out ) { BU( 4,32); }
-unsigned char *T2(_BITUNPACK_,32_5 )(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out ) { BU( 5,32); }
-unsigned char *T2(_BITUNPACK_,32_6 )(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out ) { BU( 6,32); }
-unsigned char *T2(_BITUNPACK_,32_7 )(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out ) { BU( 7,32); }
-unsigned char *T2(_BITUNPACK_,32_8 )(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out ) { BU( 8,32); }
-unsigned char *T2(_BITUNPACK_,32_9 )(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out ) { BU( 9,32); }
-unsigned char *T2(_BITUNPACK_,32_10)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out ) { BU(10,32); }
-unsigned char *T2(_BITUNPACK_,32_11)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out ) { BU(11,32); }
-unsigned char *T2(_BITUNPACK_,32_12)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out ) { BU(12,32); }
-unsigned char *T2(_BITUNPACK_,32_13)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out ) { BU(13,32); }
-unsigned char *T2(_BITUNPACK_,32_14)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out ) { BU(14,32); }
-unsigned char *T2(_BITUNPACK_,32_15)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out ) { BU(15,32); }
-unsigned char *T2(_BITUNPACK_,32_16)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out ) { BU(16,32); }
-unsigned char *T2(_BITUNPACK_,32_17)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out ) { BU(17,32); }
-unsigned char *T2(_BITUNPACK_,32_18)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out ) { BU(18,32); }
-unsigned char *T2(_BITUNPACK_,32_19)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out ) { BU(19,32); }
-unsigned char *T2(_BITUNPACK_,32_20)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out ) { BU(20,32); }
-unsigned char *T2(_BITUNPACK_,32_21)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out ) { BU(21,32); }
-unsigned char *T2(_BITUNPACK_,32_22)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out ) { BU(22,32); }
-unsigned char *T2(_BITUNPACK_,32_23)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out ) { BU(23,32); }
-unsigned char *T2(_BITUNPACK_,32_24)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out ) { BU(24,32); }
-unsigned char *T2(_BITUNPACK_,32_25)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out ) { BU(25,32); }
-unsigned char *T2(_BITUNPACK_,32_26)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out ) { BU(26,32); }
-unsigned char *T2(_BITUNPACK_,32_27)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out ) { BU(27,32); }
-unsigned char *T2(_BITUNPACK_,32_28)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out ) { BU(28,32); }
-unsigned char *T2(_BITUNPACK_,32_29)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out ) { BU(29,32); }
-unsigned char *T2(_BITUNPACK_,32_30)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out ) { BU(30,32); }
-unsigned char *T2(_BITUNPACK_,32_31)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out ) { BU(31,32); }
-unsigned char *T2(_BITUNPACK_,32_32)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out ) { BU(32,32); }
-BITUNPACK_F32 T2(_BITUNPACK_,a32)[] = {
- &T2(_BITUNPACK_,32_0),
- &T2(_BITUNPACK_,32_1),
- &T2(_BITUNPACK_,32_2),
- &T2(_BITUNPACK_,32_3),
- &T2(_BITUNPACK_,32_4),
- &T2(_BITUNPACK_,32_5),
- &T2(_BITUNPACK_,32_6),
- &T2(_BITUNPACK_,32_7),
- &T2(_BITUNPACK_,32_8),
- &T2(_BITUNPACK_,32_9),
- &T2(_BITUNPACK_,32_10),
- &T2(_BITUNPACK_,32_11),
- &T2(_BITUNPACK_,32_12),
- &T2(_BITUNPACK_,32_13),
- &T2(_BITUNPACK_,32_14),
- &T2(_BITUNPACK_,32_15),
- &T2(_BITUNPACK_,32_16),
- &T2(_BITUNPACK_,32_17),
- &T2(_BITUNPACK_,32_18),
- &T2(_BITUNPACK_,32_19),
- &T2(_BITUNPACK_,32_20),
- &T2(_BITUNPACK_,32_21),
- &T2(_BITUNPACK_,32_22),
- &T2(_BITUNPACK_,32_23),
- &T2(_BITUNPACK_,32_24),
- &T2(_BITUNPACK_,32_25),
- &T2(_BITUNPACK_,32_26),
- &T2(_BITUNPACK_,32_27),
- &T2(_BITUNPACK_,32_28),
- &T2(_BITUNPACK_,32_29),
- &T2(_BITUNPACK_,32_30),
- &T2(_BITUNPACK_,32_31),
- &T2(_BITUNPACK_,32_32)
+unsigned char *TEMPLATE2(_BITUNPACK_,32_0)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out ) { unsigned char *in_=in+PAD8(n*0); const uint32_t *out_ = out+n; do { BITUNPACK64_0( in, out, 0,start); PREFETCH(in+512,0); } while(out<out_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_1)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out ) { unsigned char *in_=in+PAD8(n*1); do { BITUNPACK64_1( in, out, 1,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_2)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out ) { unsigned char *in_=in+PAD8(n*2); do { BITUNPACK64_2( in, out, 2,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_3)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out ) { unsigned char *in_=in+PAD8(n*3); do { BITUNPACK64_3( in, out, 3,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_4)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out ) { unsigned char *in_=in+PAD8(n*4); do { BITUNPACK64_4( in, out, 4,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_5)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out ) { unsigned char *in_=in+PAD8(n*5); do { BITUNPACK64_5( in, out, 5,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_6)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out ) { unsigned char *in_=in+PAD8(n*6); do { BITUNPACK64_6( in, out, 6,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_7)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out ) { unsigned char *in_=in+PAD8(n*7); do { BITUNPACK64_7( in, out, 7,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_8)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out ) { unsigned char *in_=in+PAD8(n*8); do { BITUNPACK64_8( in, out, 8,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_9)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out ) { unsigned char *in_=in+PAD8(n*9); do { BITUNPACK64_9( in, out, 9,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_10)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out ) { unsigned char *in_=in+PAD8(n*10); do { BITUNPACK64_10( in, out, 10,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_11)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out ) { unsigned char *in_=in+PAD8(n*11); do { BITUNPACK64_11( in, out, 11,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_12)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out ) { unsigned char *in_=in+PAD8(n*12); do { BITUNPACK64_12( in, out, 12,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_13)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out ) { unsigned char *in_=in+PAD8(n*13); do { BITUNPACK64_13( in, out, 13,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_14)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out ) { unsigned char *in_=in+PAD8(n*14); do { BITUNPACK64_14( in, out, 14,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_15)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out ) { unsigned char *in_=in+PAD8(n*15); do { BITUNPACK64_15( in, out, 15,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_16)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out ) { unsigned char *in_=in+PAD8(n*16); do { BITUNPACK64_16( in, out, 16,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_17)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out ) { unsigned char *in_=in+PAD8(n*17); do { BITUNPACK64_17( in, out, 17,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_18)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out ) { unsigned char *in_=in+PAD8(n*18); do { BITUNPACK64_18( in, out, 18,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_19)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out ) { unsigned char *in_=in+PAD8(n*19); do { BITUNPACK64_19( in, out, 19,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_20)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out ) { unsigned char *in_=in+PAD8(n*20); do { BITUNPACK64_20( in, out, 20,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_21)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out ) { unsigned char *in_=in+PAD8(n*21); do { BITUNPACK64_21( in, out, 21,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_22)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out ) { unsigned char *in_=in+PAD8(n*22); do { BITUNPACK64_22( in, out, 22,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_23)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out ) { unsigned char *in_=in+PAD8(n*23); do { BITUNPACK64_23( in, out, 23,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_24)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out ) { unsigned char *in_=in+PAD8(n*24); do { BITUNPACK64_24( in, out, 24,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_25)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out ) { unsigned char *in_=in+PAD8(n*25); do { BITUNPACK64_25( in, out, 25,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_26)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out ) { unsigned char *in_=in+PAD8(n*26); do { BITUNPACK64_26( in, out, 26,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_27)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out ) { unsigned char *in_=in+PAD8(n*27); do { BITUNPACK64_27( in, out, 27,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_28)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out ) { unsigned char *in_=in+PAD8(n*28); do { BITUNPACK64_28( in, out, 28,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_29)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out ) { unsigned char *in_=in+PAD8(n*29); do { BITUNPACK64_29( in, out, 29,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_30)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out ) { unsigned char *in_=in+PAD8(n*30); do { BITUNPACK64_30( in, out, 30,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_31)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out ) { unsigned char *in_=in+PAD8(n*31); do { BITUNPACK64_31( in, out, 31,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_32)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out ) { unsigned char *in_=in+PAD8(n*32); do { BITUNPACK64_32( in, out, 32,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+BITUNPACK_F32 TEMPLATE2(_BITUNPACK_,a32)[] = {
+ &TEMPLATE2(_BITUNPACK_,32_0),
+ &TEMPLATE2(_BITUNPACK_,32_1),
+ &TEMPLATE2(_BITUNPACK_,32_2),
+ &TEMPLATE2(_BITUNPACK_,32_3),
+ &TEMPLATE2(_BITUNPACK_,32_4),
+ &TEMPLATE2(_BITUNPACK_,32_5),
+ &TEMPLATE2(_BITUNPACK_,32_6),
+ &TEMPLATE2(_BITUNPACK_,32_7),
+ &TEMPLATE2(_BITUNPACK_,32_8),
+ &TEMPLATE2(_BITUNPACK_,32_9),
+ &TEMPLATE2(_BITUNPACK_,32_10),
+ &TEMPLATE2(_BITUNPACK_,32_11),
+ &TEMPLATE2(_BITUNPACK_,32_12),
+ &TEMPLATE2(_BITUNPACK_,32_13),
+ &TEMPLATE2(_BITUNPACK_,32_14),
+ &TEMPLATE2(_BITUNPACK_,32_15),
+ &TEMPLATE2(_BITUNPACK_,32_16),
+ &TEMPLATE2(_BITUNPACK_,32_17),
+ &TEMPLATE2(_BITUNPACK_,32_18),
+ &TEMPLATE2(_BITUNPACK_,32_19),
+ &TEMPLATE2(_BITUNPACK_,32_20),
+ &TEMPLATE2(_BITUNPACK_,32_21),
+ &TEMPLATE2(_BITUNPACK_,32_22),
+ &TEMPLATE2(_BITUNPACK_,32_23),
+ &TEMPLATE2(_BITUNPACK_,32_24),
+ &TEMPLATE2(_BITUNPACK_,32_25),
+ &TEMPLATE2(_BITUNPACK_,32_26),
+ &TEMPLATE2(_BITUNPACK_,32_27),
+ &TEMPLATE2(_BITUNPACK_,32_28),
+ &TEMPLATE2(_BITUNPACK_,32_29),
+ &TEMPLATE2(_BITUNPACK_,32_30),
+ &TEMPLATE2(_BITUNPACK_,32_31),
+ &TEMPLATE2(_BITUNPACK_,32_32)
};
-unsigned char *T2(_BITUNPACK_,32)( const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , unsigned b) { return T2(_BITUNPACK_,a32)[ b](in, n, out); }
+unsigned char *TEMPLATE2(_BITUNPACK_,32)( const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , unsigned b) { return TEMPLATE2(_BITUNPACK_,a32)[ b](in, n, out); }
#define USIZE 64
-unsigned char *T2(_BITUNPACK_,64_0 )(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { BU( 0,64); }
-unsigned char *T2(_BITUNPACK_,64_1 )(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { BU( 1,64); }
-unsigned char *T2(_BITUNPACK_,64_2 )(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { BU( 2,64); }
-unsigned char *T2(_BITUNPACK_,64_3 )(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { BU( 3,64); }
-unsigned char *T2(_BITUNPACK_,64_4 )(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { BU( 4,64); }
-unsigned char *T2(_BITUNPACK_,64_5 )(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { BU( 5,64); }
-unsigned char *T2(_BITUNPACK_,64_6 )(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { BU( 6,64); }
-unsigned char *T2(_BITUNPACK_,64_7 )(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { BU( 7,64); }
-unsigned char *T2(_BITUNPACK_,64_8 )(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { BU( 8,64); }
-unsigned char *T2(_BITUNPACK_,64_9 )(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { BU( 9,64); }
-unsigned char *T2(_BITUNPACK_,64_10)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { BU(10,64); }
-unsigned char *T2(_BITUNPACK_,64_11)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { BU(11,64); }
-unsigned char *T2(_BITUNPACK_,64_12)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { BU(12,64); }
-unsigned char *T2(_BITUNPACK_,64_13)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { BU(13,64); }
-unsigned char *T2(_BITUNPACK_,64_14)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { BU(14,64); }
-unsigned char *T2(_BITUNPACK_,64_15)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { BU(15,64); }
-unsigned char *T2(_BITUNPACK_,64_16)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { BU(16,64); }
-unsigned char *T2(_BITUNPACK_,64_17)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { BU(17,64); }
-unsigned char *T2(_BITUNPACK_,64_18)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { BU(18,64); }
-unsigned char *T2(_BITUNPACK_,64_19)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { BU(19,64); }
-unsigned char *T2(_BITUNPACK_,64_20)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { BU(20,64); }
-unsigned char *T2(_BITUNPACK_,64_21)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { BU(21,64); }
-unsigned char *T2(_BITUNPACK_,64_22)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { BU(22,64); }
-unsigned char *T2(_BITUNPACK_,64_23)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { BU(23,64); }
-unsigned char *T2(_BITUNPACK_,64_24)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { BU(24,64); }
-unsigned char *T2(_BITUNPACK_,64_25)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { BU(25,64); }
-unsigned char *T2(_BITUNPACK_,64_26)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { BU(26,64); }
-unsigned char *T2(_BITUNPACK_,64_27)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { BU(27,64); }
-unsigned char *T2(_BITUNPACK_,64_28)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { BU(28,64); }
-unsigned char *T2(_BITUNPACK_,64_29)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { BU(29,64); }
-unsigned char *T2(_BITUNPACK_,64_30)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { BU(30,64); }
-unsigned char *T2(_BITUNPACK_,64_31)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { BU(31,64); }
-unsigned char *T2(_BITUNPACK_,64_32)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { BU(32,64); }
-unsigned char *T2(_BITUNPACK_,64_33)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { BU(33,64); }
-unsigned char *T2(_BITUNPACK_,64_34)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { BU(34,64); }
-unsigned char *T2(_BITUNPACK_,64_35)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { BU(35,64); }
-unsigned char *T2(_BITUNPACK_,64_36)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { BU(36,64); }
-unsigned char *T2(_BITUNPACK_,64_37)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { BU(37,64); }
-unsigned char *T2(_BITUNPACK_,64_38)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { BU(38,64); }
-unsigned char *T2(_BITUNPACK_,64_39)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { BU(39,64); }
-unsigned char *T2(_BITUNPACK_,64_40)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { BU(40,64); }
-unsigned char *T2(_BITUNPACK_,64_41)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { BU(41,64); }
-unsigned char *T2(_BITUNPACK_,64_42)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { BU(42,64); }
-unsigned char *T2(_BITUNPACK_,64_43)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { BU(43,64); }
-unsigned char *T2(_BITUNPACK_,64_44)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { BU(44,64); }
-unsigned char *T2(_BITUNPACK_,64_45)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { BU(45,64); }
-unsigned char *T2(_BITUNPACK_,64_46)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { BU(46,64); }
-unsigned char *T2(_BITUNPACK_,64_47)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { BU(47,64); }
-unsigned char *T2(_BITUNPACK_,64_48)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { BU(48,64); }
-unsigned char *T2(_BITUNPACK_,64_49)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { BU(49,64); }
-unsigned char *T2(_BITUNPACK_,64_50)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { BU(50,64); }
-unsigned char *T2(_BITUNPACK_,64_51)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { BU(51,64); }
-unsigned char *T2(_BITUNPACK_,64_52)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { BU(52,64); }
-unsigned char *T2(_BITUNPACK_,64_53)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { BU(53,64); }
-unsigned char *T2(_BITUNPACK_,64_54)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { BU(54,64); }
-unsigned char *T2(_BITUNPACK_,64_55)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { BU(55,64); }
-unsigned char *T2(_BITUNPACK_,64_56)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { BU(56,64); }
-unsigned char *T2(_BITUNPACK_,64_57)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { BU(57,64); }
-unsigned char *T2(_BITUNPACK_,64_58)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { BU(58,64); }
-unsigned char *T2(_BITUNPACK_,64_59)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { BU(59,64); }
-unsigned char *T2(_BITUNPACK_,64_60)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { BU(60,64); }
-unsigned char *T2(_BITUNPACK_,64_61)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { BU(61,64); }
-unsigned char *T2(_BITUNPACK_,64_62)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { BU(62,64); }
-unsigned char *T2(_BITUNPACK_,64_63)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { BU(63,64); }
-unsigned char *T2(_BITUNPACK_,64_64)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { BU(64,64); }
-BITUNPACK_F64 T2(_BITUNPACK_,a64)[] = {
- &T2(_BITUNPACK_,64_0),
- &T2(_BITUNPACK_,64_1),
- &T2(_BITUNPACK_,64_2),
- &T2(_BITUNPACK_,64_3),
- &T2(_BITUNPACK_,64_4),
- &T2(_BITUNPACK_,64_5),
- &T2(_BITUNPACK_,64_6),
- &T2(_BITUNPACK_,64_7),
- &T2(_BITUNPACK_,64_8),
- &T2(_BITUNPACK_,64_9),
- &T2(_BITUNPACK_,64_10),
- &T2(_BITUNPACK_,64_11),
- &T2(_BITUNPACK_,64_12),
- &T2(_BITUNPACK_,64_13),
- &T2(_BITUNPACK_,64_14),
- &T2(_BITUNPACK_,64_15),
- &T2(_BITUNPACK_,64_16),
- &T2(_BITUNPACK_,64_17),
- &T2(_BITUNPACK_,64_18),
- &T2(_BITUNPACK_,64_19),
- &T2(_BITUNPACK_,64_20),
- &T2(_BITUNPACK_,64_21),
- &T2(_BITUNPACK_,64_22),
- &T2(_BITUNPACK_,64_23),
- &T2(_BITUNPACK_,64_24),
- &T2(_BITUNPACK_,64_25),
- &T2(_BITUNPACK_,64_26),
- &T2(_BITUNPACK_,64_27),
- &T2(_BITUNPACK_,64_28),
- &T2(_BITUNPACK_,64_29),
- &T2(_BITUNPACK_,64_30),
- &T2(_BITUNPACK_,64_31),
- &T2(_BITUNPACK_,64_32),
- &T2(_BITUNPACK_,64_33),
- &T2(_BITUNPACK_,64_34),
- &T2(_BITUNPACK_,64_35),
- &T2(_BITUNPACK_,64_36),
- &T2(_BITUNPACK_,64_37),
- &T2(_BITUNPACK_,64_38),
- &T2(_BITUNPACK_,64_39),
- &T2(_BITUNPACK_,64_40),
- &T2(_BITUNPACK_,64_41),
- &T2(_BITUNPACK_,64_42),
- &T2(_BITUNPACK_,64_43),
- &T2(_BITUNPACK_,64_44),
- &T2(_BITUNPACK_,64_45),
- &T2(_BITUNPACK_,64_46),
- &T2(_BITUNPACK_,64_47),
- &T2(_BITUNPACK_,64_48),
- &T2(_BITUNPACK_,64_49),
- &T2(_BITUNPACK_,64_50),
- &T2(_BITUNPACK_,64_51),
- &T2(_BITUNPACK_,64_52),
- &T2(_BITUNPACK_,64_53),
- &T2(_BITUNPACK_,64_54),
- &T2(_BITUNPACK_,64_55),
- &T2(_BITUNPACK_,64_56),
- &T2(_BITUNPACK_,64_57),
- &T2(_BITUNPACK_,64_58),
- &T2(_BITUNPACK_,64_59),
- &T2(_BITUNPACK_,64_60),
- &T2(_BITUNPACK_,64_61),
- &T2(_BITUNPACK_,64_62),
- &T2(_BITUNPACK_,64_63),
- &T2(_BITUNPACK_,64_64)
+unsigned char *TEMPLATE2(_BITUNPACK_,64_0)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { unsigned char *in_=in+PAD8(n*0); const uint64_t *out_ = out+n; do { BITUNPACK64_0( in, out, 0,start); PREFETCH(in+512,0); } while(out<out_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_1)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { unsigned char *in_=in+PAD8(n*1); do { BITUNPACK64_1( in, out, 1,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_2)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { unsigned char *in_=in+PAD8(n*2); do { BITUNPACK64_2( in, out, 2,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_3)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { unsigned char *in_=in+PAD8(n*3); do { BITUNPACK64_3( in, out, 3,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_4)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { unsigned char *in_=in+PAD8(n*4); do { BITUNPACK64_4( in, out, 4,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_5)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { unsigned char *in_=in+PAD8(n*5); do { BITUNPACK64_5( in, out, 5,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_6)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { unsigned char *in_=in+PAD8(n*6); do { BITUNPACK64_6( in, out, 6,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_7)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { unsigned char *in_=in+PAD8(n*7); do { BITUNPACK64_7( in, out, 7,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_8)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { unsigned char *in_=in+PAD8(n*8); do { BITUNPACK64_8( in, out, 8,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_9)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { unsigned char *in_=in+PAD8(n*9); do { BITUNPACK64_9( in, out, 9,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_10)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { unsigned char *in_=in+PAD8(n*10); do { BITUNPACK64_10( in, out, 10,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_11)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { unsigned char *in_=in+PAD8(n*11); do { BITUNPACK64_11( in, out, 11,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_12)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { unsigned char *in_=in+PAD8(n*12); do { BITUNPACK64_12( in, out, 12,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_13)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { unsigned char *in_=in+PAD8(n*13); do { BITUNPACK64_13( in, out, 13,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_14)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { unsigned char *in_=in+PAD8(n*14); do { BITUNPACK64_14( in, out, 14,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_15)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { unsigned char *in_=in+PAD8(n*15); do { BITUNPACK64_15( in, out, 15,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_16)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { unsigned char *in_=in+PAD8(n*16); do { BITUNPACK64_16( in, out, 16,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_17)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { unsigned char *in_=in+PAD8(n*17); do { BITUNPACK64_17( in, out, 17,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_18)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { unsigned char *in_=in+PAD8(n*18); do { BITUNPACK64_18( in, out, 18,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_19)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { unsigned char *in_=in+PAD8(n*19); do { BITUNPACK64_19( in, out, 19,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_20)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { unsigned char *in_=in+PAD8(n*20); do { BITUNPACK64_20( in, out, 20,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_21)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { unsigned char *in_=in+PAD8(n*21); do { BITUNPACK64_21( in, out, 21,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_22)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { unsigned char *in_=in+PAD8(n*22); do { BITUNPACK64_22( in, out, 22,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_23)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { unsigned char *in_=in+PAD8(n*23); do { BITUNPACK64_23( in, out, 23,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_24)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { unsigned char *in_=in+PAD8(n*24); do { BITUNPACK64_24( in, out, 24,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_25)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { unsigned char *in_=in+PAD8(n*25); do { BITUNPACK64_25( in, out, 25,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_26)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { unsigned char *in_=in+PAD8(n*26); do { BITUNPACK64_26( in, out, 26,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_27)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { unsigned char *in_=in+PAD8(n*27); do { BITUNPACK64_27( in, out, 27,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_28)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { unsigned char *in_=in+PAD8(n*28); do { BITUNPACK64_28( in, out, 28,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_29)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { unsigned char *in_=in+PAD8(n*29); do { BITUNPACK64_29( in, out, 29,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_30)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { unsigned char *in_=in+PAD8(n*30); do { BITUNPACK64_30( in, out, 30,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_31)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { unsigned char *in_=in+PAD8(n*31); do { BITUNPACK64_31( in, out, 31,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_32)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { unsigned char *in_=in+PAD8(n*32); do { BITUNPACK64_32( in, out, 32,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_33)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { unsigned char *in_=in+PAD8(n*33); do { BITUNPACK64_33( in, out, 33,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_34)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { unsigned char *in_=in+PAD8(n*34); do { BITUNPACK64_34( in, out, 34,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_35)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { unsigned char *in_=in+PAD8(n*35); do { BITUNPACK64_35( in, out, 35,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_36)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { unsigned char *in_=in+PAD8(n*36); do { BITUNPACK64_36( in, out, 36,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_37)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { unsigned char *in_=in+PAD8(n*37); do { BITUNPACK64_37( in, out, 37,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_38)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { unsigned char *in_=in+PAD8(n*38); do { BITUNPACK64_38( in, out, 38,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_39)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { unsigned char *in_=in+PAD8(n*39); do { BITUNPACK64_39( in, out, 39,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_40)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { unsigned char *in_=in+PAD8(n*40); do { BITUNPACK64_40( in, out, 40,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_41)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { unsigned char *in_=in+PAD8(n*41); do { BITUNPACK64_41( in, out, 41,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_42)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { unsigned char *in_=in+PAD8(n*42); do { BITUNPACK64_42( in, out, 42,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_43)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { unsigned char *in_=in+PAD8(n*43); do { BITUNPACK64_43( in, out, 43,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_44)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { unsigned char *in_=in+PAD8(n*44); do { BITUNPACK64_44( in, out, 44,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_45)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { unsigned char *in_=in+PAD8(n*45); do { BITUNPACK64_45( in, out, 45,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_46)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { unsigned char *in_=in+PAD8(n*46); do { BITUNPACK64_46( in, out, 46,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_47)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { unsigned char *in_=in+PAD8(n*47); do { BITUNPACK64_47( in, out, 47,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_48)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { unsigned char *in_=in+PAD8(n*48); do { BITUNPACK64_48( in, out, 48,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_49)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { unsigned char *in_=in+PAD8(n*49); do { BITUNPACK64_49( in, out, 49,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_50)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { unsigned char *in_=in+PAD8(n*50); do { BITUNPACK64_50( in, out, 50,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_51)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { unsigned char *in_=in+PAD8(n*51); do { BITUNPACK64_51( in, out, 51,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_52)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { unsigned char *in_=in+PAD8(n*52); do { BITUNPACK64_52( in, out, 52,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_53)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { unsigned char *in_=in+PAD8(n*53); do { BITUNPACK64_53( in, out, 53,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_54)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { unsigned char *in_=in+PAD8(n*54); do { BITUNPACK64_54( in, out, 54,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_55)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { unsigned char *in_=in+PAD8(n*55); do { BITUNPACK64_55( in, out, 55,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_56)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { unsigned char *in_=in+PAD8(n*56); do { BITUNPACK64_56( in, out, 56,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_57)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { unsigned char *in_=in+PAD8(n*57); do { BITUNPACK64_57( in, out, 57,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_58)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { unsigned char *in_=in+PAD8(n*58); do { BITUNPACK64_58( in, out, 58,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_59)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { unsigned char *in_=in+PAD8(n*59); do { BITUNPACK64_59( in, out, 59,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_60)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { unsigned char *in_=in+PAD8(n*60); do { BITUNPACK64_60( in, out, 60,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_61)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { unsigned char *in_=in+PAD8(n*61); do { BITUNPACK64_61( in, out, 61,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_62)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { unsigned char *in_=in+PAD8(n*62); do { BITUNPACK64_62( in, out, 62,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_63)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { unsigned char *in_=in+PAD8(n*63); do { BITUNPACK64_63( in, out, 63,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_64)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out ) { unsigned char *in_=in+PAD8(n*64); do { BITUNPACK64_64( in, out, 64,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+BITUNPACK_F64 TEMPLATE2(_BITUNPACK_,a64)[] = {
+ &TEMPLATE2(_BITUNPACK_,64_0),
+ &TEMPLATE2(_BITUNPACK_,64_1),
+ &TEMPLATE2(_BITUNPACK_,64_2),
+ &TEMPLATE2(_BITUNPACK_,64_3),
+ &TEMPLATE2(_BITUNPACK_,64_4),
+ &TEMPLATE2(_BITUNPACK_,64_5),
+ &TEMPLATE2(_BITUNPACK_,64_6),
+ &TEMPLATE2(_BITUNPACK_,64_7),
+ &TEMPLATE2(_BITUNPACK_,64_8),
+ &TEMPLATE2(_BITUNPACK_,64_9),
+ &TEMPLATE2(_BITUNPACK_,64_10),
+ &TEMPLATE2(_BITUNPACK_,64_11),
+ &TEMPLATE2(_BITUNPACK_,64_12),
+ &TEMPLATE2(_BITUNPACK_,64_13),
+ &TEMPLATE2(_BITUNPACK_,64_14),
+ &TEMPLATE2(_BITUNPACK_,64_15),
+ &TEMPLATE2(_BITUNPACK_,64_16),
+ &TEMPLATE2(_BITUNPACK_,64_17),
+ &TEMPLATE2(_BITUNPACK_,64_18),
+ &TEMPLATE2(_BITUNPACK_,64_19),
+ &TEMPLATE2(_BITUNPACK_,64_20),
+ &TEMPLATE2(_BITUNPACK_,64_21),
+ &TEMPLATE2(_BITUNPACK_,64_22),
+ &TEMPLATE2(_BITUNPACK_,64_23),
+ &TEMPLATE2(_BITUNPACK_,64_24),
+ &TEMPLATE2(_BITUNPACK_,64_25),
+ &TEMPLATE2(_BITUNPACK_,64_26),
+ &TEMPLATE2(_BITUNPACK_,64_27),
+ &TEMPLATE2(_BITUNPACK_,64_28),
+ &TEMPLATE2(_BITUNPACK_,64_29),
+ &TEMPLATE2(_BITUNPACK_,64_30),
+ &TEMPLATE2(_BITUNPACK_,64_31),
+ &TEMPLATE2(_BITUNPACK_,64_32),
+ &TEMPLATE2(_BITUNPACK_,64_33),
+ &TEMPLATE2(_BITUNPACK_,64_34),
+ &TEMPLATE2(_BITUNPACK_,64_35),
+ &TEMPLATE2(_BITUNPACK_,64_36),
+ &TEMPLATE2(_BITUNPACK_,64_37),
+ &TEMPLATE2(_BITUNPACK_,64_38),
+ &TEMPLATE2(_BITUNPACK_,64_39),
+ &TEMPLATE2(_BITUNPACK_,64_40),
+ &TEMPLATE2(_BITUNPACK_,64_41),
+ &TEMPLATE2(_BITUNPACK_,64_42),
+ &TEMPLATE2(_BITUNPACK_,64_43),
+ &TEMPLATE2(_BITUNPACK_,64_44),
+ &TEMPLATE2(_BITUNPACK_,64_45),
+ &TEMPLATE2(_BITUNPACK_,64_46),
+ &TEMPLATE2(_BITUNPACK_,64_47),
+ &TEMPLATE2(_BITUNPACK_,64_48),
+ &TEMPLATE2(_BITUNPACK_,64_49),
+ &TEMPLATE2(_BITUNPACK_,64_50),
+ &TEMPLATE2(_BITUNPACK_,64_51),
+ &TEMPLATE2(_BITUNPACK_,64_52),
+ &TEMPLATE2(_BITUNPACK_,64_53),
+ &TEMPLATE2(_BITUNPACK_,64_54),
+ &TEMPLATE2(_BITUNPACK_,64_55),
+ &TEMPLATE2(_BITUNPACK_,64_56),
+ &TEMPLATE2(_BITUNPACK_,64_57),
+ &TEMPLATE2(_BITUNPACK_,64_58),
+ &TEMPLATE2(_BITUNPACK_,64_59),
+ &TEMPLATE2(_BITUNPACK_,64_60),
+ &TEMPLATE2(_BITUNPACK_,64_61),
+ &TEMPLATE2(_BITUNPACK_,64_62),
+ &TEMPLATE2(_BITUNPACK_,64_63),
+ &TEMPLATE2(_BITUNPACK_,64_64)
};
-unsigned char *T2(_BITUNPACK_,64)( const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , unsigned b) { return T2(_BITUNPACK_,a64)[ b](in, n, out); }
+unsigned char *TEMPLATE2(_BITUNPACK_,64)( const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , unsigned b) { return TEMPLATE2(_BITUNPACK_,a64)[ b](in, n, out); }
#else
#define USIZE 8
-unsigned char *T2(_BITUNPACK_,8_0)(const unsigned char *__restrict in, unsigned n, uint8_t *__restrict out , uint8_t start ) { BU(0,8); }
-unsigned char *T2(_BITUNPACK_,8_1)(const unsigned char *__restrict in, unsigned n, uint8_t *__restrict out , uint8_t start ) { BU(1,8); }
-unsigned char *T2(_BITUNPACK_,8_2)(const unsigned char *__restrict in, unsigned n, uint8_t *__restrict out , uint8_t start ) { BU(2,8); }
-unsigned char *T2(_BITUNPACK_,8_3)(const unsigned char *__restrict in, unsigned n, uint8_t *__restrict out , uint8_t start ) { BU(3,8); }
-unsigned char *T2(_BITUNPACK_,8_4)(const unsigned char *__restrict in, unsigned n, uint8_t *__restrict out , uint8_t start ) { BU(4,8); }
-unsigned char *T2(_BITUNPACK_,8_5)(const unsigned char *__restrict in, unsigned n, uint8_t *__restrict out , uint8_t start ) { BU(5,8); }
-unsigned char *T2(_BITUNPACK_,8_6)(const unsigned char *__restrict in, unsigned n, uint8_t *__restrict out , uint8_t start ) { BU(6,8); }
-unsigned char *T2(_BITUNPACK_,8_7)(const unsigned char *__restrict in, unsigned n, uint8_t *__restrict out , uint8_t start ) { BU(7,8); }
-unsigned char *T2(_BITUNPACK_,8_8)(const unsigned char *__restrict in, unsigned n, uint8_t *__restrict out , uint8_t start ) { BU(8,8); }
-BITUNPACK_D8 T2(_BITUNPACK_,a8)[] = {
- &T2(_BITUNPACK_,8_0),
- &T2(_BITUNPACK_,8_1),
- &T2(_BITUNPACK_,8_2),
- &T2(_BITUNPACK_,8_3),
- &T2(_BITUNPACK_,8_4),
- &T2(_BITUNPACK_,8_5),
- &T2(_BITUNPACK_,8_6),
- &T2(_BITUNPACK_,8_7),
- &T2(_BITUNPACK_,8_8)
+unsigned char *TEMPLATE2(_BITUNPACK_,8_0)(const unsigned char *__restrict in, unsigned n, uint8_t *__restrict out , uint8_t start ) { unsigned char *in_=in+PAD8(n*0),x=0; const uint8_t *out_ = out+n; do { BITUNPACK64_0( in, out, 0,start); PREFETCH(in+512,0); } while(out<out_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,8_1)(const unsigned char *__restrict in, unsigned n, uint8_t *__restrict out , uint8_t start ) { unsigned char *in_=in+PAD8(n*1),x=0; do { BITUNPACK64_1( in, out, 1,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,8_2)(const unsigned char *__restrict in, unsigned n, uint8_t *__restrict out , uint8_t start ) { unsigned char *in_=in+PAD8(n*2),x=0; do { BITUNPACK64_2( in, out, 2,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,8_3)(const unsigned char *__restrict in, unsigned n, uint8_t *__restrict out , uint8_t start ) { unsigned char *in_=in+PAD8(n*3),x=0; do { BITUNPACK64_3( in, out, 3,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,8_4)(const unsigned char *__restrict in, unsigned n, uint8_t *__restrict out , uint8_t start ) { unsigned char *in_=in+PAD8(n*4),x=0; do { BITUNPACK64_4( in, out, 4,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,8_5)(const unsigned char *__restrict in, unsigned n, uint8_t *__restrict out , uint8_t start ) { unsigned char *in_=in+PAD8(n*5),x=0; do { BITUNPACK64_5( in, out, 5,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,8_6)(const unsigned char *__restrict in, unsigned n, uint8_t *__restrict out , uint8_t start ) { unsigned char *in_=in+PAD8(n*6),x=0; do { BITUNPACK64_6( in, out, 6,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,8_7)(const unsigned char *__restrict in, unsigned n, uint8_t *__restrict out , uint8_t start ) { unsigned char *in_=in+PAD8(n*7),x=0; do { BITUNPACK64_7( in, out, 7,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,8_8)(const unsigned char *__restrict in, unsigned n, uint8_t *__restrict out , uint8_t start ) { unsigned char *in_=in+PAD8(n*8),x=0; do { BITUNPACK64_8( in, out, 8,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+BITUNPACK_D8 TEMPLATE2(_BITUNPACK_,a8)[] = {
+ &TEMPLATE2(_BITUNPACK_,8_0),
+ &TEMPLATE2(_BITUNPACK_,8_1),
+ &TEMPLATE2(_BITUNPACK_,8_2),
+ &TEMPLATE2(_BITUNPACK_,8_3),
+ &TEMPLATE2(_BITUNPACK_,8_4),
+ &TEMPLATE2(_BITUNPACK_,8_5),
+ &TEMPLATE2(_BITUNPACK_,8_6),
+ &TEMPLATE2(_BITUNPACK_,8_7),
+ &TEMPLATE2(_BITUNPACK_,8_8)
};
-unsigned char *T2(_BITUNPACK_,8)( const unsigned char *__restrict in, unsigned n, uint8_t *__restrict out , uint8_t start, unsigned b) { return T2(_BITUNPACK_,a8)[ b](in, n, out, start); }
+unsigned char *TEMPLATE2(_BITUNPACK_,8)( const unsigned char *__restrict in, unsigned n, uint8_t *__restrict out , uint8_t start, unsigned b) { return TEMPLATE2(_BITUNPACK_,a8)[ b](in, n, out, start); }
#define USIZE 16
-unsigned char *T2(_BITUNPACK_,16_0 )(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out , uint16_t start ) { BU( 0,16); }
-unsigned char *T2(_BITUNPACK_,16_1 )(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out , uint16_t start ) { BU( 1,16); }
-unsigned char *T2(_BITUNPACK_,16_2 )(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out , uint16_t start ) { BU( 2,16); }
-unsigned char *T2(_BITUNPACK_,16_3 )(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out , uint16_t start ) { BU( 3,16); }
-unsigned char *T2(_BITUNPACK_,16_4 )(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out , uint16_t start ) { BU( 4,16); }
-unsigned char *T2(_BITUNPACK_,16_5 )(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out , uint16_t start ) { BU( 5,16); }
-unsigned char *T2(_BITUNPACK_,16_6 )(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out , uint16_t start ) { BU( 6,16); }
-unsigned char *T2(_BITUNPACK_,16_7 )(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out , uint16_t start ) { BU( 7,16); }
-unsigned char *T2(_BITUNPACK_,16_8 )(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out , uint16_t start ) { BU( 8,16); }
-unsigned char *T2(_BITUNPACK_,16_9 )(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out , uint16_t start ) { BU( 9,16); }
-unsigned char *T2(_BITUNPACK_,16_10)(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out , uint16_t start ) { BU(10,16); }
-unsigned char *T2(_BITUNPACK_,16_11)(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out , uint16_t start ) { BU(11,16); }
-unsigned char *T2(_BITUNPACK_,16_12)(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out , uint16_t start ) { BU(12,16); }
-unsigned char *T2(_BITUNPACK_,16_13)(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out , uint16_t start ) { BU(13,16); }
-unsigned char *T2(_BITUNPACK_,16_14)(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out , uint16_t start ) { BU(14,16); }
-unsigned char *T2(_BITUNPACK_,16_15)(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out , uint16_t start ) { BU(15,16); }
-unsigned char *T2(_BITUNPACK_,16_16)(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out , uint16_t start ) { BU(16,16); }
-BITUNPACK_D16 T2(_BITUNPACK_,a16)[] = {
- &T2(_BITUNPACK_,16_0),
- &T2(_BITUNPACK_,16_1),
- &T2(_BITUNPACK_,16_2),
- &T2(_BITUNPACK_,16_3),
- &T2(_BITUNPACK_,16_4),
- &T2(_BITUNPACK_,16_5),
- &T2(_BITUNPACK_,16_6),
- &T2(_BITUNPACK_,16_7),
- &T2(_BITUNPACK_,16_8),
- &T2(_BITUNPACK_,16_9),
- &T2(_BITUNPACK_,16_10),
- &T2(_BITUNPACK_,16_11),
- &T2(_BITUNPACK_,16_12),
- &T2(_BITUNPACK_,16_13),
- &T2(_BITUNPACK_,16_14),
- &T2(_BITUNPACK_,16_15),
- &T2(_BITUNPACK_,16_16)
+unsigned char *TEMPLATE2(_BITUNPACK_,16_0)(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out , uint16_t start ) { unsigned char *in_=in+PAD8(n*0),x=0; const uint16_t *out_ = out+n; do { BITUNPACK64_0( in, out, 0,start); PREFETCH(in+512,0); } while(out<out_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,16_1)(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out , uint16_t start ) { unsigned char *in_=in+PAD8(n*1),x=0; do { BITUNPACK64_1( in, out, 1,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,16_2)(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out , uint16_t start ) { unsigned char *in_=in+PAD8(n*2),x=0; do { BITUNPACK64_2( in, out, 2,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,16_3)(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out , uint16_t start ) { unsigned char *in_=in+PAD8(n*3),x=0; do { BITUNPACK64_3( in, out, 3,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,16_4)(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out , uint16_t start ) { unsigned char *in_=in+PAD8(n*4),x=0; do { BITUNPACK64_4( in, out, 4,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,16_5)(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out , uint16_t start ) { unsigned char *in_=in+PAD8(n*5),x=0; do { BITUNPACK64_5( in, out, 5,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,16_6)(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out , uint16_t start ) { unsigned char *in_=in+PAD8(n*6),x=0; do { BITUNPACK64_6( in, out, 6,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,16_7)(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out , uint16_t start ) { unsigned char *in_=in+PAD8(n*7),x=0; do { BITUNPACK64_7( in, out, 7,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,16_8)(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out , uint16_t start ) { unsigned char *in_=in+PAD8(n*8),x=0; do { BITUNPACK64_8( in, out, 8,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,16_9)(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out , uint16_t start ) { unsigned char *in_=in+PAD8(n*9),x=0; do { BITUNPACK64_9( in, out, 9,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,16_10)(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out , uint16_t start ) { unsigned char *in_=in+PAD8(n*10),x=0; do { BITUNPACK64_10( in, out, 10,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,16_11)(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out , uint16_t start ) { unsigned char *in_=in+PAD8(n*11),x=0; do { BITUNPACK64_11( in, out, 11,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,16_12)(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out , uint16_t start ) { unsigned char *in_=in+PAD8(n*12),x=0; do { BITUNPACK64_12( in, out, 12,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,16_13)(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out , uint16_t start ) { unsigned char *in_=in+PAD8(n*13),x=0; do { BITUNPACK64_13( in, out, 13,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,16_14)(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out , uint16_t start ) { unsigned char *in_=in+PAD8(n*14),x=0; do { BITUNPACK64_14( in, out, 14,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,16_15)(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out , uint16_t start ) { unsigned char *in_=in+PAD8(n*15),x=0; do { BITUNPACK64_15( in, out, 15,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,16_16)(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out , uint16_t start ) { unsigned char *in_=in+PAD8(n*16),x=0; do { BITUNPACK64_16( in, out, 16,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+BITUNPACK_D16 TEMPLATE2(_BITUNPACK_,a16)[] = {
+ &TEMPLATE2(_BITUNPACK_,16_0),
+ &TEMPLATE2(_BITUNPACK_,16_1),
+ &TEMPLATE2(_BITUNPACK_,16_2),
+ &TEMPLATE2(_BITUNPACK_,16_3),
+ &TEMPLATE2(_BITUNPACK_,16_4),
+ &TEMPLATE2(_BITUNPACK_,16_5),
+ &TEMPLATE2(_BITUNPACK_,16_6),
+ &TEMPLATE2(_BITUNPACK_,16_7),
+ &TEMPLATE2(_BITUNPACK_,16_8),
+ &TEMPLATE2(_BITUNPACK_,16_9),
+ &TEMPLATE2(_BITUNPACK_,16_10),
+ &TEMPLATE2(_BITUNPACK_,16_11),
+ &TEMPLATE2(_BITUNPACK_,16_12),
+ &TEMPLATE2(_BITUNPACK_,16_13),
+ &TEMPLATE2(_BITUNPACK_,16_14),
+ &TEMPLATE2(_BITUNPACK_,16_15),
+ &TEMPLATE2(_BITUNPACK_,16_16)
};
-unsigned char *T2(_BITUNPACK_,16)( const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out , uint16_t start, unsigned b) { return T2(_BITUNPACK_,a16)[ b](in, n, out, start); }
+unsigned char *TEMPLATE2(_BITUNPACK_,16)( const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out , uint16_t start, unsigned b) { return TEMPLATE2(_BITUNPACK_,a16)[ b](in, n, out, start); }
#define USIZE 32
-unsigned char *T2(_BITUNPACK_,32_0 )(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { BU( 0,32); }
-unsigned char *T2(_BITUNPACK_,32_1 )(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { BU( 1,32); }
-unsigned char *T2(_BITUNPACK_,32_2 )(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { BU( 2,32); }
-unsigned char *T2(_BITUNPACK_,32_3 )(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { BU( 3,32); }
-unsigned char *T2(_BITUNPACK_,32_4 )(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { BU( 4,32); }
-unsigned char *T2(_BITUNPACK_,32_5 )(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { BU( 5,32); }
-unsigned char *T2(_BITUNPACK_,32_6 )(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { BU( 6,32); }
-unsigned char *T2(_BITUNPACK_,32_7 )(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { BU( 7,32); }
-unsigned char *T2(_BITUNPACK_,32_8 )(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { BU( 8,32); }
-unsigned char *T2(_BITUNPACK_,32_9 )(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { BU( 9,32); }
-unsigned char *T2(_BITUNPACK_,32_10)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { BU(10,32); }
-unsigned char *T2(_BITUNPACK_,32_11)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { BU(11,32); }
-unsigned char *T2(_BITUNPACK_,32_12)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { BU(12,32); }
-unsigned char *T2(_BITUNPACK_,32_13)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { BU(13,32); }
-unsigned char *T2(_BITUNPACK_,32_14)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { BU(14,32); }
-unsigned char *T2(_BITUNPACK_,32_15)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { BU(15,32); }
-unsigned char *T2(_BITUNPACK_,32_16)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { BU(16,32); }
-unsigned char *T2(_BITUNPACK_,32_17)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { BU(17,32); }
-unsigned char *T2(_BITUNPACK_,32_18)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { BU(18,32); }
-unsigned char *T2(_BITUNPACK_,32_19)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { BU(19,32); }
-unsigned char *T2(_BITUNPACK_,32_20)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { BU(20,32); }
-unsigned char *T2(_BITUNPACK_,32_21)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { BU(21,32); }
-unsigned char *T2(_BITUNPACK_,32_22)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { BU(22,32); }
-unsigned char *T2(_BITUNPACK_,32_23)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { BU(23,32); }
-unsigned char *T2(_BITUNPACK_,32_24)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { BU(24,32); }
-unsigned char *T2(_BITUNPACK_,32_25)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { BU(25,32); }
-unsigned char *T2(_BITUNPACK_,32_26)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { BU(26,32); }
-unsigned char *T2(_BITUNPACK_,32_27)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { BU(27,32); }
-unsigned char *T2(_BITUNPACK_,32_28)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { BU(28,32); }
-unsigned char *T2(_BITUNPACK_,32_29)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { BU(29,32); }
-unsigned char *T2(_BITUNPACK_,32_30)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { BU(30,32); }
-unsigned char *T2(_BITUNPACK_,32_31)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { BU(31,32); }
-unsigned char *T2(_BITUNPACK_,32_32)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { BU(32,32); }
-BITUNPACK_D32 T2(_BITUNPACK_,a32)[] = {
- &T2(_BITUNPACK_,32_0),
- &T2(_BITUNPACK_,32_1),
- &T2(_BITUNPACK_,32_2),
- &T2(_BITUNPACK_,32_3),
- &T2(_BITUNPACK_,32_4),
- &T2(_BITUNPACK_,32_5),
- &T2(_BITUNPACK_,32_6),
- &T2(_BITUNPACK_,32_7),
- &T2(_BITUNPACK_,32_8),
- &T2(_BITUNPACK_,32_9),
- &T2(_BITUNPACK_,32_10),
- &T2(_BITUNPACK_,32_11),
- &T2(_BITUNPACK_,32_12),
- &T2(_BITUNPACK_,32_13),
- &T2(_BITUNPACK_,32_14),
- &T2(_BITUNPACK_,32_15),
- &T2(_BITUNPACK_,32_16),
- &T2(_BITUNPACK_,32_17),
- &T2(_BITUNPACK_,32_18),
- &T2(_BITUNPACK_,32_19),
- &T2(_BITUNPACK_,32_20),
- &T2(_BITUNPACK_,32_21),
- &T2(_BITUNPACK_,32_22),
- &T2(_BITUNPACK_,32_23),
- &T2(_BITUNPACK_,32_24),
- &T2(_BITUNPACK_,32_25),
- &T2(_BITUNPACK_,32_26),
- &T2(_BITUNPACK_,32_27),
- &T2(_BITUNPACK_,32_28),
- &T2(_BITUNPACK_,32_29),
- &T2(_BITUNPACK_,32_30),
- &T2(_BITUNPACK_,32_31),
- &T2(_BITUNPACK_,32_32)
+unsigned char *TEMPLATE2(_BITUNPACK_,32_0)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { unsigned char *in_=in+PAD8(n*0),x=0; const uint32_t *out_ = out+n; do { BITUNPACK64_0( in, out, 0,start); PREFETCH(in+512,0); } while(out<out_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_1)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { unsigned char *in_=in+PAD8(n*1),x=0; do { BITUNPACK64_1( in, out, 1,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_2)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { unsigned char *in_=in+PAD8(n*2),x=0; do { BITUNPACK64_2( in, out, 2,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_3)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { unsigned char *in_=in+PAD8(n*3),x=0; do { BITUNPACK64_3( in, out, 3,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_4)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { unsigned char *in_=in+PAD8(n*4),x=0; do { BITUNPACK64_4( in, out, 4,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_5)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { unsigned char *in_=in+PAD8(n*5),x=0; do { BITUNPACK64_5( in, out, 5,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_6)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { unsigned char *in_=in+PAD8(n*6),x=0; do { BITUNPACK64_6( in, out, 6,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_7)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { unsigned char *in_=in+PAD8(n*7),x=0; do { BITUNPACK64_7( in, out, 7,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_8)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { unsigned char *in_=in+PAD8(n*8),x=0; do { BITUNPACK64_8( in, out, 8,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_9)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { unsigned char *in_=in+PAD8(n*9),x=0; do { BITUNPACK64_9( in, out, 9,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_10)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { unsigned char *in_=in+PAD8(n*10),x=0; do { BITUNPACK64_10( in, out, 10,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_11)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { unsigned char *in_=in+PAD8(n*11),x=0; do { BITUNPACK64_11( in, out, 11,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_12)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { unsigned char *in_=in+PAD8(n*12),x=0; do { BITUNPACK64_12( in, out, 12,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_13)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { unsigned char *in_=in+PAD8(n*13),x=0; do { BITUNPACK64_13( in, out, 13,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_14)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { unsigned char *in_=in+PAD8(n*14),x=0; do { BITUNPACK64_14( in, out, 14,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_15)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { unsigned char *in_=in+PAD8(n*15),x=0; do { BITUNPACK64_15( in, out, 15,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_16)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { unsigned char *in_=in+PAD8(n*16),x=0; do { BITUNPACK64_16( in, out, 16,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_17)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { unsigned char *in_=in+PAD8(n*17),x=0; do { BITUNPACK64_17( in, out, 17,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_18)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { unsigned char *in_=in+PAD8(n*18),x=0; do { BITUNPACK64_18( in, out, 18,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_19)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { unsigned char *in_=in+PAD8(n*19),x=0; do { BITUNPACK64_19( in, out, 19,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_20)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { unsigned char *in_=in+PAD8(n*20),x=0; do { BITUNPACK64_20( in, out, 20,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_21)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { unsigned char *in_=in+PAD8(n*21),x=0; do { BITUNPACK64_21( in, out, 21,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_22)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { unsigned char *in_=in+PAD8(n*22),x=0; do { BITUNPACK64_22( in, out, 22,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_23)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { unsigned char *in_=in+PAD8(n*23),x=0; do { BITUNPACK64_23( in, out, 23,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_24)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { unsigned char *in_=in+PAD8(n*24),x=0; do { BITUNPACK64_24( in, out, 24,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_25)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { unsigned char *in_=in+PAD8(n*25),x=0; do { BITUNPACK64_25( in, out, 25,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_26)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { unsigned char *in_=in+PAD8(n*26),x=0; do { BITUNPACK64_26( in, out, 26,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_27)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { unsigned char *in_=in+PAD8(n*27),x=0; do { BITUNPACK64_27( in, out, 27,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_28)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { unsigned char *in_=in+PAD8(n*28),x=0; do { BITUNPACK64_28( in, out, 28,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_29)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { unsigned char *in_=in+PAD8(n*29),x=0; do { BITUNPACK64_29( in, out, 29,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_30)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { unsigned char *in_=in+PAD8(n*30),x=0; do { BITUNPACK64_30( in, out, 30,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_31)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { unsigned char *in_=in+PAD8(n*31),x=0; do { BITUNPACK64_31( in, out, 31,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,32_32)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start ) { unsigned char *in_=in+PAD8(n*32),x=0; do { BITUNPACK64_32( in, out, 32,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+BITUNPACK_D32 TEMPLATE2(_BITUNPACK_,a32)[] = {
+ &TEMPLATE2(_BITUNPACK_,32_0),
+ &TEMPLATE2(_BITUNPACK_,32_1),
+ &TEMPLATE2(_BITUNPACK_,32_2),
+ &TEMPLATE2(_BITUNPACK_,32_3),
+ &TEMPLATE2(_BITUNPACK_,32_4),
+ &TEMPLATE2(_BITUNPACK_,32_5),
+ &TEMPLATE2(_BITUNPACK_,32_6),
+ &TEMPLATE2(_BITUNPACK_,32_7),
+ &TEMPLATE2(_BITUNPACK_,32_8),
+ &TEMPLATE2(_BITUNPACK_,32_9),
+ &TEMPLATE2(_BITUNPACK_,32_10),
+ &TEMPLATE2(_BITUNPACK_,32_11),
+ &TEMPLATE2(_BITUNPACK_,32_12),
+ &TEMPLATE2(_BITUNPACK_,32_13),
+ &TEMPLATE2(_BITUNPACK_,32_14),
+ &TEMPLATE2(_BITUNPACK_,32_15),
+ &TEMPLATE2(_BITUNPACK_,32_16),
+ &TEMPLATE2(_BITUNPACK_,32_17),
+ &TEMPLATE2(_BITUNPACK_,32_18),
+ &TEMPLATE2(_BITUNPACK_,32_19),
+ &TEMPLATE2(_BITUNPACK_,32_20),
+ &TEMPLATE2(_BITUNPACK_,32_21),
+ &TEMPLATE2(_BITUNPACK_,32_22),
+ &TEMPLATE2(_BITUNPACK_,32_23),
+ &TEMPLATE2(_BITUNPACK_,32_24),
+ &TEMPLATE2(_BITUNPACK_,32_25),
+ &TEMPLATE2(_BITUNPACK_,32_26),
+ &TEMPLATE2(_BITUNPACK_,32_27),
+ &TEMPLATE2(_BITUNPACK_,32_28),
+ &TEMPLATE2(_BITUNPACK_,32_29),
+ &TEMPLATE2(_BITUNPACK_,32_30),
+ &TEMPLATE2(_BITUNPACK_,32_31),
+ &TEMPLATE2(_BITUNPACK_,32_32)
};
-unsigned char *T2(_BITUNPACK_,32)( const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start, unsigned b) { return T2(_BITUNPACK_,a32)[ b](in, n, out, start); }
+unsigned char *TEMPLATE2(_BITUNPACK_,32)( const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out , uint32_t start, unsigned b) { return TEMPLATE2(_BITUNPACK_,a32)[ b](in, n, out, start); }
#define USIZE 64
-unsigned char *T2(_BITUNPACK_,64_0 )(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU( 0,64); }
-unsigned char *T2(_BITUNPACK_,64_1 )(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU( 1,64); }
-unsigned char *T2(_BITUNPACK_,64_2 )(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU( 2,64); }
-unsigned char *T2(_BITUNPACK_,64_3 )(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU( 3,64); }
-unsigned char *T2(_BITUNPACK_,64_4 )(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU( 4,64); }
-unsigned char *T2(_BITUNPACK_,64_5 )(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU( 5,64); }
-unsigned char *T2(_BITUNPACK_,64_6 )(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU( 6,64); }
-unsigned char *T2(_BITUNPACK_,64_7 )(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU( 7,64); }
-unsigned char *T2(_BITUNPACK_,64_8 )(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU( 8,64); }
-unsigned char *T2(_BITUNPACK_,64_9 )(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU( 9,64); }
-unsigned char *T2(_BITUNPACK_,64_10)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(10,64); }
-unsigned char *T2(_BITUNPACK_,64_11)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(11,64); }
-unsigned char *T2(_BITUNPACK_,64_12)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(12,64); }
-unsigned char *T2(_BITUNPACK_,64_13)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(13,64); }
-unsigned char *T2(_BITUNPACK_,64_14)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(14,64); }
-unsigned char *T2(_BITUNPACK_,64_15)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(15,64); }
-unsigned char *T2(_BITUNPACK_,64_16)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(16,64); }
-unsigned char *T2(_BITUNPACK_,64_17)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(17,64); }
-unsigned char *T2(_BITUNPACK_,64_18)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(18,64); }
-unsigned char *T2(_BITUNPACK_,64_19)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(19,64); }
-unsigned char *T2(_BITUNPACK_,64_20)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(20,64); }
-unsigned char *T2(_BITUNPACK_,64_21)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(21,64); }
-unsigned char *T2(_BITUNPACK_,64_22)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(22,64); }
-unsigned char *T2(_BITUNPACK_,64_23)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(23,64); }
-unsigned char *T2(_BITUNPACK_,64_24)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(24,64); }
-unsigned char *T2(_BITUNPACK_,64_25)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(25,64); }
-unsigned char *T2(_BITUNPACK_,64_26)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(26,64); }
-unsigned char *T2(_BITUNPACK_,64_27)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(27,64); }
-unsigned char *T2(_BITUNPACK_,64_28)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(28,64); }
-unsigned char *T2(_BITUNPACK_,64_29)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(29,64); }
-unsigned char *T2(_BITUNPACK_,64_30)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(30,64); }
-unsigned char *T2(_BITUNPACK_,64_31)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(31,64); }
-unsigned char *T2(_BITUNPACK_,64_32)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(32,64); }
-unsigned char *T2(_BITUNPACK_,64_33)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(33,64); }
-unsigned char *T2(_BITUNPACK_,64_34)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(34,64); }
-unsigned char *T2(_BITUNPACK_,64_35)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(35,64); }
-unsigned char *T2(_BITUNPACK_,64_36)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(36,64); }
-unsigned char *T2(_BITUNPACK_,64_37)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(37,64); }
-unsigned char *T2(_BITUNPACK_,64_38)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(38,64); }
-unsigned char *T2(_BITUNPACK_,64_39)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(39,64); }
-unsigned char *T2(_BITUNPACK_,64_40)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(40,64); }
-unsigned char *T2(_BITUNPACK_,64_41)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(41,64); }
-unsigned char *T2(_BITUNPACK_,64_42)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(42,64); }
-unsigned char *T2(_BITUNPACK_,64_43)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(43,64); }
-unsigned char *T2(_BITUNPACK_,64_44)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(44,64); }
-unsigned char *T2(_BITUNPACK_,64_45)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(45,64); }
-unsigned char *T2(_BITUNPACK_,64_46)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(46,64); }
-unsigned char *T2(_BITUNPACK_,64_47)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(47,64); }
-unsigned char *T2(_BITUNPACK_,64_48)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(48,64); }
-unsigned char *T2(_BITUNPACK_,64_49)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(49,64); }
-unsigned char *T2(_BITUNPACK_,64_50)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(50,64); }
-unsigned char *T2(_BITUNPACK_,64_51)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(51,64); }
-unsigned char *T2(_BITUNPACK_,64_52)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(52,64); }
-unsigned char *T2(_BITUNPACK_,64_53)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(53,64); }
-unsigned char *T2(_BITUNPACK_,64_54)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(54,64); }
-unsigned char *T2(_BITUNPACK_,64_55)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(55,64); }
-unsigned char *T2(_BITUNPACK_,64_56)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(56,64); }
-unsigned char *T2(_BITUNPACK_,64_57)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(57,64); }
-unsigned char *T2(_BITUNPACK_,64_58)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(58,64); }
-unsigned char *T2(_BITUNPACK_,64_59)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(59,64); }
-unsigned char *T2(_BITUNPACK_,64_60)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(60,64); }
-unsigned char *T2(_BITUNPACK_,64_61)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(61,64); }
-unsigned char *T2(_BITUNPACK_,64_62)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(62,64); }
-unsigned char *T2(_BITUNPACK_,64_63)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(63,64); }
-unsigned char *T2(_BITUNPACK_,64_64)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { BU(64,64); }
-BITUNPACK_D64 T2(_BITUNPACK_,a64)[] = {
- &T2(_BITUNPACK_,64_0),
- &T2(_BITUNPACK_,64_1),
- &T2(_BITUNPACK_,64_2),
- &T2(_BITUNPACK_,64_3),
- &T2(_BITUNPACK_,64_4),
- &T2(_BITUNPACK_,64_5),
- &T2(_BITUNPACK_,64_6),
- &T2(_BITUNPACK_,64_7),
- &T2(_BITUNPACK_,64_8),
- &T2(_BITUNPACK_,64_9),
- &T2(_BITUNPACK_,64_10),
- &T2(_BITUNPACK_,64_11),
- &T2(_BITUNPACK_,64_12),
- &T2(_BITUNPACK_,64_13),
- &T2(_BITUNPACK_,64_14),
- &T2(_BITUNPACK_,64_15),
- &T2(_BITUNPACK_,64_16),
- &T2(_BITUNPACK_,64_17),
- &T2(_BITUNPACK_,64_18),
- &T2(_BITUNPACK_,64_19),
- &T2(_BITUNPACK_,64_20),
- &T2(_BITUNPACK_,64_21),
- &T2(_BITUNPACK_,64_22),
- &T2(_BITUNPACK_,64_23),
- &T2(_BITUNPACK_,64_24),
- &T2(_BITUNPACK_,64_25),
- &T2(_BITUNPACK_,64_26),
- &T2(_BITUNPACK_,64_27),
- &T2(_BITUNPACK_,64_28),
- &T2(_BITUNPACK_,64_29),
- &T2(_BITUNPACK_,64_30),
- &T2(_BITUNPACK_,64_31),
- &T2(_BITUNPACK_,64_32),
- &T2(_BITUNPACK_,64_33),
- &T2(_BITUNPACK_,64_34),
- &T2(_BITUNPACK_,64_35),
- &T2(_BITUNPACK_,64_36),
- &T2(_BITUNPACK_,64_37),
- &T2(_BITUNPACK_,64_38),
- &T2(_BITUNPACK_,64_39),
- &T2(_BITUNPACK_,64_40),
- &T2(_BITUNPACK_,64_41),
- &T2(_BITUNPACK_,64_42),
- &T2(_BITUNPACK_,64_43),
- &T2(_BITUNPACK_,64_44),
- &T2(_BITUNPACK_,64_45),
- &T2(_BITUNPACK_,64_46),
- &T2(_BITUNPACK_,64_47),
- &T2(_BITUNPACK_,64_48),
- &T2(_BITUNPACK_,64_49),
- &T2(_BITUNPACK_,64_50),
- &T2(_BITUNPACK_,64_51),
- &T2(_BITUNPACK_,64_52),
- &T2(_BITUNPACK_,64_53),
- &T2(_BITUNPACK_,64_54),
- &T2(_BITUNPACK_,64_55),
- &T2(_BITUNPACK_,64_56),
- &T2(_BITUNPACK_,64_57),
- &T2(_BITUNPACK_,64_58),
- &T2(_BITUNPACK_,64_59),
- &T2(_BITUNPACK_,64_60),
- &T2(_BITUNPACK_,64_61),
- &T2(_BITUNPACK_,64_62),
- &T2(_BITUNPACK_,64_63),
- &T2(_BITUNPACK_,64_64)
+unsigned char *TEMPLATE2(_BITUNPACK_,64_0)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*0),x=0; const uint64_t *out_ = out+n; do { BITUNPACK64_0( in, out, 0,start); PREFETCH(in+512,0); } while(out<out_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_1)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*1),x=0; do { BITUNPACK64_1( in, out, 1,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_2)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*2),x=0; do { BITUNPACK64_2( in, out, 2,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_3)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*3),x=0; do { BITUNPACK64_3( in, out, 3,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_4)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*4),x=0; do { BITUNPACK64_4( in, out, 4,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_5)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*5),x=0; do { BITUNPACK64_5( in, out, 5,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_6)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*6),x=0; do { BITUNPACK64_6( in, out, 6,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_7)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*7),x=0; do { BITUNPACK64_7( in, out, 7,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_8)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*8),x=0; do { BITUNPACK64_8( in, out, 8,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_9)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*9),x=0; do { BITUNPACK64_9( in, out, 9,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_10)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*10),x=0; do { BITUNPACK64_10( in, out, 10,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_11)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*11),x=0; do { BITUNPACK64_11( in, out, 11,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_12)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*12),x=0; do { BITUNPACK64_12( in, out, 12,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_13)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*13),x=0; do { BITUNPACK64_13( in, out, 13,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_14)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*14),x=0; do { BITUNPACK64_14( in, out, 14,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_15)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*15),x=0; do { BITUNPACK64_15( in, out, 15,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_16)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*16),x=0; do { BITUNPACK64_16( in, out, 16,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_17)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*17),x=0; do { BITUNPACK64_17( in, out, 17,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_18)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*18),x=0; do { BITUNPACK64_18( in, out, 18,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_19)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*19),x=0; do { BITUNPACK64_19( in, out, 19,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_20)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*20),x=0; do { BITUNPACK64_20( in, out, 20,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_21)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*21),x=0; do { BITUNPACK64_21( in, out, 21,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_22)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*22),x=0; do { BITUNPACK64_22( in, out, 22,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_23)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*23),x=0; do { BITUNPACK64_23( in, out, 23,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_24)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*24),x=0; do { BITUNPACK64_24( in, out, 24,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_25)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*25),x=0; do { BITUNPACK64_25( in, out, 25,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_26)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*26),x=0; do { BITUNPACK64_26( in, out, 26,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_27)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*27),x=0; do { BITUNPACK64_27( in, out, 27,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_28)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*28),x=0; do { BITUNPACK64_28( in, out, 28,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_29)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*29),x=0; do { BITUNPACK64_29( in, out, 29,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_30)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*30),x=0; do { BITUNPACK64_30( in, out, 30,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_31)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*31),x=0; do { BITUNPACK64_31( in, out, 31,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_32)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*32),x=0; do { BITUNPACK64_32( in, out, 32,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_33)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*33),x=0; do { BITUNPACK64_33( in, out, 33,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_34)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*34),x=0; do { BITUNPACK64_34( in, out, 34,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_35)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*35),x=0; do { BITUNPACK64_35( in, out, 35,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_36)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*36),x=0; do { BITUNPACK64_36( in, out, 36,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_37)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*37),x=0; do { BITUNPACK64_37( in, out, 37,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_38)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*38),x=0; do { BITUNPACK64_38( in, out, 38,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_39)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*39),x=0; do { BITUNPACK64_39( in, out, 39,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_40)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*40),x=0; do { BITUNPACK64_40( in, out, 40,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_41)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*41),x=0; do { BITUNPACK64_41( in, out, 41,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_42)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*42),x=0; do { BITUNPACK64_42( in, out, 42,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_43)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*43),x=0; do { BITUNPACK64_43( in, out, 43,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_44)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*44),x=0; do { BITUNPACK64_44( in, out, 44,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_45)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*45),x=0; do { BITUNPACK64_45( in, out, 45,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_46)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*46),x=0; do { BITUNPACK64_46( in, out, 46,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_47)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*47),x=0; do { BITUNPACK64_47( in, out, 47,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_48)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*48),x=0; do { BITUNPACK64_48( in, out, 48,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_49)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*49),x=0; do { BITUNPACK64_49( in, out, 49,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_50)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*50),x=0; do { BITUNPACK64_50( in, out, 50,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_51)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*51),x=0; do { BITUNPACK64_51( in, out, 51,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_52)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*52),x=0; do { BITUNPACK64_52( in, out, 52,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_53)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*53),x=0; do { BITUNPACK64_53( in, out, 53,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_54)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*54),x=0; do { BITUNPACK64_54( in, out, 54,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_55)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*55),x=0; do { BITUNPACK64_55( in, out, 55,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_56)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*56),x=0; do { BITUNPACK64_56( in, out, 56,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_57)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*57),x=0; do { BITUNPACK64_57( in, out, 57,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_58)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*58),x=0; do { BITUNPACK64_58( in, out, 58,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_59)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*59),x=0; do { BITUNPACK64_59( in, out, 59,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_60)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*60),x=0; do { BITUNPACK64_60( in, out, 60,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_61)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*61),x=0; do { BITUNPACK64_61( in, out, 61,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_62)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*62),x=0; do { BITUNPACK64_62( in, out, 62,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_63)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*63),x=0; do { BITUNPACK64_63( in, out, 63,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+unsigned char *TEMPLATE2(_BITUNPACK_,64_64)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start ) { unsigned char *in_=in+PAD8(n*64),x=0; do { BITUNPACK64_64( in, out, 64,start); PREFETCH(in+512,0); } while(in<in_); return in_; }
+BITUNPACK_D64 TEMPLATE2(_BITUNPACK_,a64)[] = {
+ &TEMPLATE2(_BITUNPACK_,64_0),
+ &TEMPLATE2(_BITUNPACK_,64_1),
+ &TEMPLATE2(_BITUNPACK_,64_2),
+ &TEMPLATE2(_BITUNPACK_,64_3),
+ &TEMPLATE2(_BITUNPACK_,64_4),
+ &TEMPLATE2(_BITUNPACK_,64_5),
+ &TEMPLATE2(_BITUNPACK_,64_6),
+ &TEMPLATE2(_BITUNPACK_,64_7),
+ &TEMPLATE2(_BITUNPACK_,64_8),
+ &TEMPLATE2(_BITUNPACK_,64_9),
+ &TEMPLATE2(_BITUNPACK_,64_10),
+ &TEMPLATE2(_BITUNPACK_,64_11),
+ &TEMPLATE2(_BITUNPACK_,64_12),
+ &TEMPLATE2(_BITUNPACK_,64_13),
+ &TEMPLATE2(_BITUNPACK_,64_14),
+ &TEMPLATE2(_BITUNPACK_,64_15),
+ &TEMPLATE2(_BITUNPACK_,64_16),
+ &TEMPLATE2(_BITUNPACK_,64_17),
+ &TEMPLATE2(_BITUNPACK_,64_18),
+ &TEMPLATE2(_BITUNPACK_,64_19),
+ &TEMPLATE2(_BITUNPACK_,64_20),
+ &TEMPLATE2(_BITUNPACK_,64_21),
+ &TEMPLATE2(_BITUNPACK_,64_22),
+ &TEMPLATE2(_BITUNPACK_,64_23),
+ &TEMPLATE2(_BITUNPACK_,64_24),
+ &TEMPLATE2(_BITUNPACK_,64_25),
+ &TEMPLATE2(_BITUNPACK_,64_26),
+ &TEMPLATE2(_BITUNPACK_,64_27),
+ &TEMPLATE2(_BITUNPACK_,64_28),
+ &TEMPLATE2(_BITUNPACK_,64_29),
+ &TEMPLATE2(_BITUNPACK_,64_30),
+ &TEMPLATE2(_BITUNPACK_,64_31),
+ &TEMPLATE2(_BITUNPACK_,64_32),
+ &TEMPLATE2(_BITUNPACK_,64_33),
+ &TEMPLATE2(_BITUNPACK_,64_34),
+ &TEMPLATE2(_BITUNPACK_,64_35),
+ &TEMPLATE2(_BITUNPACK_,64_36),
+ &TEMPLATE2(_BITUNPACK_,64_37),
+ &TEMPLATE2(_BITUNPACK_,64_38),
+ &TEMPLATE2(_BITUNPACK_,64_39),
+ &TEMPLATE2(_BITUNPACK_,64_40),
+ &TEMPLATE2(_BITUNPACK_,64_41),
+ &TEMPLATE2(_BITUNPACK_,64_42),
+ &TEMPLATE2(_BITUNPACK_,64_43),
+ &TEMPLATE2(_BITUNPACK_,64_44),
+ &TEMPLATE2(_BITUNPACK_,64_45),
+ &TEMPLATE2(_BITUNPACK_,64_46),
+ &TEMPLATE2(_BITUNPACK_,64_47),
+ &TEMPLATE2(_BITUNPACK_,64_48),
+ &TEMPLATE2(_BITUNPACK_,64_49),
+ &TEMPLATE2(_BITUNPACK_,64_50),
+ &TEMPLATE2(_BITUNPACK_,64_51),
+ &TEMPLATE2(_BITUNPACK_,64_52),
+ &TEMPLATE2(_BITUNPACK_,64_53),
+ &TEMPLATE2(_BITUNPACK_,64_54),
+ &TEMPLATE2(_BITUNPACK_,64_55),
+ &TEMPLATE2(_BITUNPACK_,64_56),
+ &TEMPLATE2(_BITUNPACK_,64_57),
+ &TEMPLATE2(_BITUNPACK_,64_58),
+ &TEMPLATE2(_BITUNPACK_,64_59),
+ &TEMPLATE2(_BITUNPACK_,64_60),
+ &TEMPLATE2(_BITUNPACK_,64_61),
+ &TEMPLATE2(_BITUNPACK_,64_62),
+ &TEMPLATE2(_BITUNPACK_,64_63),
+ &TEMPLATE2(_BITUNPACK_,64_64)
};
-unsigned char *T2(_BITUNPACK_,64)( const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start, unsigned b) { return T2(_BITUNPACK_,a64)[ b](in, n, out, start); }
+unsigned char *TEMPLATE2(_BITUNPACK_,64)( const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , uint64_t start, unsigned b) { return TEMPLATE2(_BITUNPACK_,a64)[ b](in, n, out, start); }
#endif
#endif //OPI
diff --git a/src/ext/for/bitutil.c b/src/ext/for/bitutil.c
index 075a5727..5edca0a0 100644
--- a/src/ext/for/bitutil.c
+++ b/src/ext/for/bitutil.c
@@ -1,6 +1,6 @@
/**
- Copyright (C) powturbo 2013-2023
- SPDX-License-Identifier: GPL v2 License
+ Copyright (C) powturbo 2013-2019
+ GPL v2 License
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -21,198 +21,14 @@
- twitter : https://twitter.com/powturbo
- email : powturbo [_AT_] gmail [_DOT_] com
**/
-// "Integer Compression" utility - delta, for, zigzag / Floating point compression
-#pragma warning( disable : 4005)
-#pragma warning( disable : 4090)
-#pragma warning( disable : 4068)
-
+// "Integer Compression" utility - delta, for, zigzag / Floating point compression
#include <math.h> //nan
-#include "include_/conf.h"
-#include "include_/bitutil.h"
-
-#include "include_/bitutil_.h"
-
-#define BT(_i_) { o |= ip[_i_]; x |= ip[_i_] ^ u0; }
-
-#ifdef __AVX2__
-
-uint32_t bit256v32(uint32_t *in, unsigned n, uint32_t *px) {
- uint32_t o = 0,x,u0 = in[0], *ip = in;
- __m256i vb0 = _mm256_set1_epi32(*in),
- vo0 = _mm256_setzero_si256(), vx0 = _mm256_setzero_si256(),
- vo1 = _mm256_setzero_si256(), vx1 = _mm256_setzero_si256();
- for(; ip != in+(n&~(16-1)); ip += 16) { PREFETCH(ip+512,0);
- __m256i v0 = _mm256_loadu_si256((__m256i *) ip);
- __m256i v1 = _mm256_loadu_si256((__m256i *)(ip+8));
- vo0 = _mm256_or_si256(vo0, v0);
- vo1 = _mm256_or_si256(vo1, v1);
- vx0 = _mm256_or_si256(vx0, _mm256_xor_si256(v0, vb0));
- vx1 = _mm256_or_si256(vx1, _mm256_xor_si256(v1, vb0));
- }
- vo0 = _mm256_or_si256(vo0, vo1); o = mm256_hor_epi32(vo0);
- vx0 = _mm256_or_si256(vx0, vx1); x = mm256_hor_epi32(vx0);
- for(; ip != in+n; ip++) BT(0);
- if(px) *px = x;
- return o;
-}
-
-// delta ---------------------------------------------------------------------------------------------------------------
-uint32_t bitd256v32(uint32_t *in, unsigned n, uint32_t *px, uint32_t start) {
- uint32_t o = 0, x, *ip = in, u0 = in[0] - start;
- __m256i vb0 = _mm256_set1_epi32(u0),
- vo0 = _mm256_setzero_si256(), vx0 = _mm256_setzero_si256(),
- vo1 = _mm256_setzero_si256(), vx1 = _mm256_setzero_si256(); __m256i vs = _mm256_set1_epi32(start);
- for(; ip != in+(n&~(16-1)); ip += 16) { PREFETCH(ip+512,0);
- __m256i vi0 = _mm256_loadu_si256((__m256i *) ip);
- __m256i vi1 = _mm256_loadu_si256((__m256i *)(ip+8)); __m256i v0 = mm256_delta_epi32(vi0,vs); vs = vi0;
- __m256i v1 = mm256_delta_epi32(vi1,vs); vs = vi1;
- vo0 = _mm256_or_si256(vo0, v0);
- vo1 = _mm256_or_si256(vo1, v1);
- vx0 = _mm256_or_si256(vx0, _mm256_xor_si256(v0, vb0));
- vx1 = _mm256_or_si256(vx1, _mm256_xor_si256(v1, vb0));
- } start = (unsigned)_mm256_extract_epi32(vs, 7);
- vo0 = _mm256_or_si256(vo0, vo1); o = mm256_hor_epi32(vo0);
- vx0 = _mm256_or_si256(vx0, vx1); x = mm256_hor_epi32(vx0);
-
- for(;ip != in+n; ip++) {
- uint32_t u = *ip - start; start = *ip;
- o |= u;
- x |= u ^ u0;
- }
- if(px) *px = x;
- return o;
-}
-
-void bitddec256v32(uint32_t *in, unsigned n, unsigned start) {
- unsigned *ip = in;
- __m256i vs = _mm256_set1_epi32(start);
- for(; ip != in+(n&~(8-1)); ip += 8) {
- __m256i v = _mm256_loadu_si256((__m256i *)ip);
- vs = mm256_scan_epi32(v,vs);
- _mm256_storeu_si256((__m256i *)ip, vs);
- }
- start = (unsigned)_mm256_extract_epi32(vs, 7);
- while(ip != in+n) {
- *ip = (start += (*ip));
- ip++;
- }
-}
-
-//-- delta 1 --------------------------------------------------------------------------------------------------------------------------------------
-uint32_t bitd1256v32(uint32_t *in, unsigned n, uint32_t *px, uint32_t start) {
- uint32_t o, x, *ip = in, u0 = in[0]-start-1;
- __m256i vb0 = _mm256_set1_epi32(u0),
- vo0 = _mm256_setzero_si256(), vx0 = _mm256_setzero_si256(),
- vo1 = _mm256_setzero_si256(), vx1 = _mm256_setzero_si256(); __m256i vs = _mm256_set1_epi32(start), cv = _mm256_set1_epi32(1);
- for(; ip != in+(n&~(16-1)); ip += 16) { PREFETCH(ip+512,0);
- __m256i vi0 = _mm256_loadu_si256((__m256i *)ip);
- __m256i vi1 = _mm256_loadu_si256((__m256i *)(ip+8)); __m256i v0 = _mm256_sub_epi32(mm256_delta_epi32(vi0,vs),cv); vs = vi0;
- __m256i v1 = _mm256_sub_epi32(mm256_delta_epi32(vi1,vs),cv); vs = vi1;
- vo0 = _mm256_or_si256(vo0, v0);
- vo1 = _mm256_or_si256(vo1, v1);
- vx0 = _mm256_or_si256(vx0, _mm256_xor_si256(v0, vb0));
- vx1 = _mm256_or_si256(vx1, _mm256_xor_si256(v1, vb0));
- } start = (unsigned)_mm256_extract_epi32(vs, 7);
- vo0 = _mm256_or_si256(vo0, vo1); o = mm256_hor_epi32(vo0);
- vx0 = _mm256_or_si256(vx0, vx1); x = mm256_hor_epi32(vx0);
- for(;ip != in+n; ip++) {
- uint32_t u = ip[0] - start-1; start = *ip;
- o |= u;
- x |= u ^ u0;
-}
- if(px) *px = x;
- return o;
-}
-
-void bitd1dec256v32(uint32_t *in, unsigned n, uint32_t start) {
- __m256i vs = _mm256_set1_epi32(start),zv = _mm256_setzero_si256(), cv = _mm256_set_epi32(8,7,6,5,4,3,2,1);
- unsigned *ip = in;
- for(; ip != in+(n&~(8-1)); ip += 8) {
- __m256i v = _mm256_loadu_si256((__m256i *)ip); vs = mm256_scani_epi32(v, vs, cv);
- _mm256_storeu_si256((__m256i *)ip, vs);
- }
- start = (unsigned)_mm256_extract_epi32(vs, 7);
- while(ip != in+n) {
- *ip = (start += (*ip) + 1);
- ip++;
- }
-}
-
-//-- Xor ----------------------------------------------------------------------------------------------------------------------
-uint32_t bitx256v32(unsigned *in, unsigned n, uint32_t *px, unsigned start) {
- uint32_t o = 0, *ip = in;
- __m256i vo0 = _mm256_setzero_si256(),
- vo1 = _mm256_setzero_si256(),
- vs = _mm256_set1_epi32(start);
-
- for(ip = in; ip != in+(n&~(16-1)); ip += 16) { //PREFETCH(ip+512,0);
- __m256i vi0 = _mm256_loadu_si256((__m256i *) ip);
- __m256i vi1 = _mm256_loadu_si256((__m256i *)(ip+8)); __m256i v0 = mm256_xore_epi32(vi0,vs); vs = vi0;
- __m256i v1 = mm256_xore_epi32(vi1,vs); vs = vi1;
- vo0 = _mm256_or_si256(vo0, v0);
- vo1 = _mm256_or_si256(vo1, v1);
- } start = (unsigned)_mm256_extract_epi32(vs, 7);
- vo0 = _mm256_or_si256(vo0, vo1); o = mm256_hor_epi32(vo0);
- for(;ip != in+n; ip++) {
- o |= ip[0] ^ start; start = ip[0];
- }
- if(px) *px = o;
- return o;
-}
-
-//-- zigzag ------------------------------------------------------------------------------------------------------------------------------------------------
-uint32_t bitz256v32(unsigned *in, unsigned n, uint32_t *px, unsigned start) {
- uint32_t o, x, *ip; uint32_t u0 = zigzagenc32((int)in[0] - (int)start);
- __m256i vb0 = _mm256_set1_epi32(u0),
- vo0 = _mm256_setzero_si256(), vx0 = _mm256_setzero_si256(),
- vo1 = _mm256_setzero_si256(), vx1 = _mm256_setzero_si256(),
- vs = _mm256_set1_epi32(start);
-
- for(ip = in; ip != in+(n&~(16-1)); ip += 16) { //PREFETCH(ip+512,0);
- __m256i vi0 = _mm256_loadu_si256((__m256i *) ip);
- __m256i vi1 = _mm256_loadu_si256((__m256i *)(ip+8)); __m256i v0 = mm256_delta_epi32(vi0,vs); vs = vi0; v0 = mm256_zzage_epi32(v0);
- __m256i v1 = mm256_delta_epi32(vi1,vs); vs = vi1; v1 = mm256_zzage_epi32(v1);
- vo0 = _mm256_or_si256(vo0, v0);
- vo1 = _mm256_or_si256(vo1, v1);
- vx0 = _mm256_or_si256(vx0, _mm256_xor_si256(v0, vb0));
- vx1 = _mm256_or_si256(vx1, _mm256_xor_si256(v1, vb0));
- } start = (unsigned)_mm256_extract_epi32(vs, 7);
- vo0 = _mm256_or_si256(vo0, vo1); o = mm256_hor_epi32(vo0);
- vx0 = _mm256_or_si256(vx0, vx1); x = mm256_hor_epi32(vx0);
+#include "conf.h"
+#define BITUTIL_IN
+#include "bitutil.h"
- for(;ip != in+n; ip++) {
- uint32_t u = zigzagenc32((int)ip[0] - (int)start); start = *ip; //((int)(*ip) - (int)start); //i = (i << 1) ^ (i >> 31);
- o |= u;
- x |= u ^ u0;
- }
- if(px) *px = x;
- return o;
-}
-
-/* slower than SSE
-void bitzdec256v32(unsigned *in, unsigned n, unsigned start) {
- __m256i vs = _mm256_set1_epi32(start);
- unsigned *ip = in;
- for(; ip != in+(n&~(16-1)); ip += 16) {
- __m256i iv0 = _mm256_loadu_si256((__m256i *)ip),
- iv1 = _mm256_loadu_si256((__m256i *)(ip+8));
- iv0 = mm256_zzagd_epi32(iv0);
- iv1 = mm256_zzagd_epi32(iv1);
- vs = mm256_scan_epi32(iv0, vs);
- //__m256i _vs = vs;
- _mm256_storeu_si256((__m256i *)ip, vs);
- vs = mm256_scan_epi32(iv1, vs);
- _mm256_storeu_si256((__m256i *)(ip+8), vs);
- }
- start = (unsigned)_mm256_extract_epi32(_mm256_srli_si256(vs,12), 4);
- while(ip != in+n) {
- unsigned z = *ip;
- *ip++ = (start += (z >> 1 ^ -(z & 1)));
- }
-}*/
-
-#else // avx2
//------------ 'or' for bitsize + 'xor' for all duplicate ------------------
+#define BT(_i_) { o |= ip[_i_]; x |= ip[_i_] ^ u0; }
#define BIT(_in_, _n_, _usize_) {\
u0 = _in_[0]; o = x = 0;\
for(ip = _in_; ip != _in_+(_n_&~(4-1)); ip += 4) { BT(0); BT(1); BT(2); BT(3); }\
@@ -223,13 +39,11 @@ uint8_t bit8( uint8_t *in, unsigned n, uint8_t *px) { uint8_t o,x,u0,*ip; BI
uint64_t bit64(uint64_t *in, unsigned n, uint64_t *px) { uint64_t o,x,u0,*ip; BIT(in, n, 64); if(px) *px = x; return o; }
uint16_t bit16(uint16_t *in, unsigned n, uint16_t *px) {
- uint16_t o, x, u0 = in[0], *ip = in;
-
+ uint16_t o, x, u0 = in[0], *ip;
#if defined(__SSE2__) || defined(__ARM_NEON)
- __m128i vo0 = _mm_setzero_si128(), vx0 = _mm_setzero_si128(),
- vo1 = _mm_setzero_si128(), vx1 = _mm_setzero_si128(), vb0 = _mm_set1_epi16(u0);
-
- for(; ip != in+(n&~(16-1)); ip += 16) { PREFETCH(ip+512,0);
+ __m128i vb0 = _mm_set1_epi16(u0), vo0 = _mm_setzero_si128(), vx0 = _mm_setzero_si128(),
+ vo1 = _mm_setzero_si128(), vx1 = _mm_setzero_si128();
+ for(ip = in; ip != in+(n&~(16-1)); ip += 16) { PREFETCH(ip+512,0);
__m128i v0 = _mm_loadu_si128((__m128i *) ip);
__m128i v1 = _mm_loadu_si128((__m128i *)(ip+8));
vo0 = _mm_or_si128( vo0, v0);
@@ -240,22 +54,32 @@ uint16_t bit16(uint16_t *in, unsigned n, uint16_t *px) {
vo0 = _mm_or_si128(vo0, vo1); o = mm_hor_epi16(vo0);
vx0 = _mm_or_si128(vx0, vx1); x = mm_hor_epi16(vx0);
#else
- ip = in; o = x = 0;
+ ip = in; o = x = 0; //BIT( in, n, 16);
#endif
-
for(; ip != in+n; ip++) BT(0);
if(px) *px = x;
return o;
}
uint32_t bit32(uint32_t *in, unsigned n, uint32_t *px) {
- uint32_t o,x,u0 = in[0], *ip = in;
-
- #if defined(__SSE2__) || defined(__ARM_NEON)
- __m128i vo0 = _mm_setzero_si128(), vx0 = _mm_setzero_si128(),
- vo1 = _mm_setzero_si128(), vx1 = _mm_setzero_si128(), vb0 = _mm_set1_epi32(u0);
-
- for(; ip != in+(n&~(8-1)); ip += 8) { PREFETCH(ip+512,0);
+ uint32_t o,x,u0 = in[0], *ip;
+ #ifdef __AVX2__
+ __m256i vb0 = _mm256_set1_epi32(*in), vo0 = _mm256_setzero_si256(), vx0 = _mm256_setzero_si256(),
+ vo1 = _mm256_setzero_si256(), vx1 = _mm256_setzero_si256();
+ for(ip = in; ip != in+(n&~(16-1)); ip += 16) { PREFETCH(ip+512,0);
+ __m256i v0 = _mm256_loadu_si256((__m256i *) ip);
+ __m256i v1 = _mm256_loadu_si256((__m256i *)(ip+8));
+ vo0 = _mm256_or_si256(vo0, v0);
+ vo1 = _mm256_or_si256(vo1, v1);
+ vx0 = _mm256_or_si256(vx0, _mm256_xor_si256(v0, vb0));
+ vx1 = _mm256_or_si256(vx1, _mm256_xor_si256(v1, vb0));
+ }
+ vo0 = _mm256_or_si256(vo0, vo1); o = mm256_hor_epi32(vo0);
+ vx0 = _mm256_or_si256(vx0, vx1); x = mm256_hor_epi32(vx0);
+ #elif defined(__SSE2__) || defined(__ARM_NEON)
+ __m128i vb0 = _mm_set1_epi32(u0), vo0 = _mm_setzero_si128(), vx0 = _mm_setzero_si128(),
+ vo1 = _mm_setzero_si128(), vx1 = _mm_setzero_si128();
+ for(ip = in; ip != in+(n&~(8-1)); ip += 8) { PREFETCH(ip+512,0);
__m128i v0 = _mm_loadu_si128((__m128i *) ip);
__m128i v1 = _mm_loadu_si128((__m128i *)(ip+4));
vo0 = _mm_or_si128(vo0, v0);
@@ -266,9 +90,8 @@ uint32_t bit32(uint32_t *in, unsigned n, uint32_t *px) {
vo0 = _mm_or_si128(vo0, vo1); o = mm_hor_epi32(vo0);
vx0 = _mm_or_si128(vx0, vx1); x = mm_hor_epi32(vx0);
#else
- ip = in; o = x = 0;
+ ip = in; o = x = 0; //BIT( in, n, 32);
#endif
-
for(; ip != in+n; ip++) BT(0);
if(px) *px = x;
return o;
@@ -286,12 +109,12 @@ uint8_t bitd8( uint8_t *in, unsigned n, uint8_t *px, uint8_t start) { uint8
uint64_t bitd64(uint64_t *in, unsigned n, uint64_t *px, uint64_t start) { uint64_t u, u0 = in[0]-start, o, x; BITDE(uint64_t, in, n, 0, o |= u; x |= u^u0); if(px) *px = x; return o; }
uint16_t bitd16(uint16_t *in, unsigned n, uint16_t *px, uint16_t start) {
- uint16_t o, x, *ip = in, u0 = in[0] - start;
+ uint16_t o, x, *ip, u0 = in[0]-start;
#if defined(__SSE2__) || defined(__ARM_NEON)
__m128i vb0 = _mm_set1_epi16(u0),
vo0 = _mm_setzero_si128(), vx0 = _mm_setzero_si128(),
vo1 = _mm_setzero_si128(), vx1 = _mm_setzero_si128(); __m128i vs = _mm_set1_epi16(start);
- for(; ip != in+(n&~(16-1)); ip += 16) { PREFETCH(ip+512,0);
+ for(ip = in; ip != in+(n&~(16-1)); ip += 16) { PREFETCH(ip+512,0);
__m128i vi0 = _mm_loadu_si128((__m128i *) ip);
__m128i vi1 = _mm_loadu_si128((__m128i *)(ip+8)); __m128i v0 = mm_delta_epi16(vi0,vs); vs = vi0;
__m128i v1 = mm_delta_epi16(vi1,vs); vs = vi1;
@@ -305,7 +128,6 @@ uint16_t bitd16(uint16_t *in, unsigned n, uint16_t *px, uint16_t start) {
#else
ip = in; o = x = 0;
#endif
-
for(;ip != in+n; ip++) {
uint16_t u = *ip - start; start = *ip;
o |= u;
@@ -316,13 +138,27 @@ uint16_t bitd16(uint16_t *in, unsigned n, uint16_t *px, uint16_t start) {
}
uint32_t bitd32(uint32_t *in, unsigned n, uint32_t *px, uint32_t start) {
- uint32_t o = 0, x=0, *ip = in, u0 = in[0] - start;
-
- #if defined(__SSE2__) || defined(__ARM_NEON)
+ uint32_t o, x, *ip, u0 = in[0] - start;
+ #ifdef __AVX2__
+ __m256i vb0 = _mm256_set1_epi32(u0),
+ vo0 = _mm256_setzero_si256(), vx0 = _mm256_setzero_si256(),
+ vo1 = _mm256_setzero_si256(), vx1 = _mm256_setzero_si256(); __m256i vs = _mm256_set1_epi32(start);
+ for(ip = in; ip != in+(n&~(16-1)); ip += 16) { PREFETCH(ip+512,0);
+ __m256i vi0 = _mm256_loadu_si256((__m256i *) ip);
+ __m256i vi1 = _mm256_loadu_si256((__m256i *)(ip+8)); __m256i v0 = mm256_delta_epi32(vi0,vs); vs = vi0;
+ __m256i v1 = mm256_delta_epi32(vi1,vs); vs = vi1;
+ vo0 = _mm256_or_si256(vo0, v0);
+ vo1 = _mm256_or_si256(vo1, v1);
+ vx0 = _mm256_or_si256(vx0, _mm256_xor_si256(v0, vb0));
+ vx1 = _mm256_or_si256(vx1, _mm256_xor_si256(v1, vb0));
+ } start = (unsigned)_mm256_extract_epi32(vs, 7);
+ vo0 = _mm256_or_si256(vo0, vo1); o = mm256_hor_epi32(vo0);
+ vx0 = _mm256_or_si256(vx0, vx1); x = mm256_hor_epi32(vx0);
+ #elif defined(__SSE2__) || defined(__ARM_NEON)
__m128i vb0 = _mm_set1_epi32(u0),
vo0 = _mm_setzero_si128(), vx0 = _mm_setzero_si128(),
vo1 = _mm_setzero_si128(), vx1 = _mm_setzero_si128(); __m128i vs = _mm_set1_epi32(start);
- for(; ip != in+(n&~(8-1)); ip += 8) { PREFETCH(ip+512,0);
+ for(ip = in; ip != in+(n&~(8-1)); ip += 8) { PREFETCH(ip+512,0);
__m128i vi0 = _mm_loadu_si128((__m128i *)ip);
__m128i vi1 = _mm_loadu_si128((__m128i *)(ip+4)); __m128i v0 = mm_delta_epi32(vi0,vs); vs = vi0;
__m128i v1 = mm_delta_epi32(vi1,vs); vs = vi1;
@@ -336,7 +172,6 @@ uint32_t bitd32(uint32_t *in, unsigned n, uint32_t *px, uint32_t start) {
#else
ip = in; o = x = 0;
#endif
-
for(;ip != in+n; ip++) {
uint32_t u = *ip - start; start = *ip;
o |= u;
@@ -348,39 +183,48 @@ uint32_t bitd32(uint32_t *in, unsigned n, uint32_t *px, uint32_t start) {
//----- Undelta: In-place prefix sum (min. Delta = 0) -------------------
#define DD(i) _ip[i] = (start += _ip[i] + _md);
-#define BITDD(_t_, _in_, _n_, _md_) { _t_ *_ip; const unsigned _md = _md_;\
+#define BITDD(_t_, _in_, _n_, _md_) { _t_ *_ip; const int _md = _md_;\
for(_ip = _in_; _ip != _in_+(_n_&~(4-1)); _ip += 4) { DD(0); DD(1); DD(2); DD(3); }\
for(;_ip != _in_+_n_; _ip++) DD(0);\
}
-void bitddec8( uint8_t *in, unsigned n, uint8_t start) { BITDD(uint8_t, in, n, 0); }
-void bitddec16(uint16_t *in, unsigned n, uint16_t start) { BITDD(uint16_t, in, n, 0); }
-void bitddec64(uint64_t *in, unsigned n, uint64_t start) { BITDD(uint64_t, in, n, 0); }
-
-void bitddec32(uint32_t *in, unsigned n, unsigned start) {
- #if defined(__SSSE3__) || defined(__ARM_NEON)
+void bitddec8( uint8_t *p, unsigned n, uint8_t start) { BITDD(uint8_t, p, n, 0); }
+void bitddec16(uint16_t *p, unsigned n, uint16_t start) { BITDD(uint16_t, p, n, 0); }
+void bitddec64(uint64_t *p, unsigned n, uint64_t start) { BITDD(uint64_t, p, n, 0); }
+void bitddec32(uint32_t *p, unsigned n, unsigned start) {
+ #ifdef __AVX2__
+ __m256i vs = _mm256_set1_epi32(start);
+ unsigned *ip;
+ for(ip = p; ip != p+(n&~(8-1)); ip += 8) {
+ __m256i v = _mm256_loadu_si256((__m256i *)ip);
+ vs = mm256_scan_epi32(v,vs);
+ _mm256_storeu_si256((__m256i *)ip, vs);
+ }
+ start = (unsigned)_mm256_extract_epi32(vs, 7);
+ while(ip != p+n) {
+ *ip = (start += (*ip));
+ ip++;
+ }
+ #elif defined(__SSE2__) || defined(__ARM_NEON)
__m128i vs = _mm_set1_epi32(start);
- unsigned *ip = in;
- for(; ip != in+(n&~(8-1)); ip += 8) {
- __m128i v0 = _mm_loadu_si128((__m128i *)ip);
- __m128i v1 = _mm_loadu_si128((__m128i *)(ip+4));
- vs = mm_scan_epi32(v0, vs);
+ unsigned *ip;
+ for(ip = p; ip != p+(n&~(4-1)); ip += 4) {
+ __m128i v = _mm_loadu_si128((__m128i *)ip);
+ vs = mm_scan_epi32(v, vs);
_mm_storeu_si128((__m128i *)ip, vs);
- vs = mm_scan_epi32(v1, vs);
- _mm_storeu_si128((__m128i *)(ip+4), vs);
}
start = (unsigned)_mm_cvtsi128_si32(_mm_srli_si128(vs,12));
- while(ip != in+n) {
+ while(ip != p+n) {
*ip = (start += (*ip));
ip++;
}
#else
- BITDD(uint32_t, in, n, 0);
+ BITDD(uint32_t, p, n, 0);
#endif
}
-//----------- Zigzag Delta ----------------------------------------------------------------------------------------------------------------------------------------------------------------
-#define ZDE(i, _usize_) d = (_ip[i]-start)-_md; u = T2(zigzagenc, _usize_)(d - startd); startd = d; start = _ip[i]
+//----------- Zigzag of Delta --------------------------
+#define ZDE(i, _usize_) d = (_ip[i]-start)-_md; u = TEMPLATE2(zigzagenc, _usize_)(d - startd); startd = d; start = _ip[i]
#define BITZDE(_t_, _in_, _n_, _md_, _usize_, _act_) { _t_ *_ip, _md = _md_;\
for(_ip = _in_; _ip != _in_+(_n_&~(4-1)); _ip += 4) { ZDE(0, _usize_);_act_; ZDE(1, _usize_);_act_; ZDE(2, _usize_);_act_; ZDE(3, _usize_);_act_; }\
for(;_ip != _in_+_n_;_ip++) { ZDE(0, _usize_); _act_; }\
@@ -390,35 +234,48 @@ uint8_t bitzz8( uint8_t *in, unsigned n, uint8_t *px, uint8_t start) { uint8
uint16_t bitzz16(uint16_t *in, unsigned n, uint16_t *px, uint16_t start) { uint16_t o=0, x=0,d,startd=0,u; BITZDE(uint16_t, in, n, 1, 16, o |= u; x |= u ^ in[0]); if(px) *px = x; return o; }
uint32_t bitzz32(uint32_t *in, unsigned n, uint32_t *px, uint32_t start) { uint64_t o=0, x=0,d,startd=0,u; BITZDE(uint32_t, in, n, 1, 32, o |= u; x |= u ^ in[0]); if(px) *px = x; return o; }
uint64_t bitzz64(uint64_t *in, unsigned n, uint64_t *px, uint64_t start) { uint64_t o=0, x=0,d,startd=0,u; BITZDE(uint64_t, in, n, 1, 64, o |= u; x |= u ^ in[0]); if(px) *px = x; return o; }
-
uint8_t bitzzenc8( uint8_t *in, unsigned n, uint8_t *out, uint8_t start, uint8_t mindelta) { uint8_t o=0,*op = out,u,d,startd=0; BITZDE(uint8_t, in, n, mindelta, 8,o |= u;*op++ = u); return o;}
uint16_t bitzzenc16(uint16_t *in, unsigned n, uint16_t *out, uint16_t start, uint16_t mindelta) { uint16_t o=0,*op = out,u,d,startd=0; BITZDE(uint16_t, in, n, mindelta, 16,o |= u;*op++ = u); return o;}
uint32_t bitzzenc32(uint32_t *in, unsigned n, uint32_t *out, uint32_t start, uint32_t mindelta) { uint32_t o=0,*op = out,u,d,startd=0; BITZDE(uint32_t, in, n, mindelta, 32,o |= u;*op++ = u); return o;}
uint64_t bitzzenc64(uint64_t *in, unsigned n, uint64_t *out, uint64_t start, uint64_t mindelta) { uint64_t o=0,*op = out,u,d,startd=0; BITZDE(uint64_t, in, n, mindelta, 64,o |= u;*op++ = u); return o;}
#define ZDD(i) u = _ip[i]; d = u - start; _ip[i] = zigzagdec64(u)+(int64_t)startd+_md; startd = d; start = u
-#define BITZDD(_t_, _in_, _n_, _md_) { _t_ *_ip, startd=0,d,u; const unsigned _md = _md_;\
+#define BITZDD(_t_, _in_, _n_, _md_) { _t_ *_ip, startd=0,d,u; const int _md = _md_;\
for(_ip = _in_; _ip != _in_+(_n_&~(4-1)); _ip += 4) { ZDD(0); ZDD(1); ZDD(2); ZDD(3); }\
for(;_ip != _in_+_n_; _ip++) ZDD(0);\
}
-void bitzzdec8( uint8_t *in, unsigned n, uint8_t start) { BITZDD(uint8_t, in, n, 1); }
-void bitzzdec16(uint16_t *in, unsigned n, uint16_t start) { BITZDD(uint16_t, in, n, 1); }
-void bitzzdec64(uint64_t *in, unsigned n, uint64_t start) { BITZDD(uint64_t, in, n, 1); }
-void bitzzdec32(uint32_t *in, unsigned n, uint32_t start) { BITZDD(uint32_t, in, n, 1); }
+void bitzzdec8( uint8_t *p, unsigned n, uint8_t start) { BITZDD(uint8_t, p, n, 1); }
+void bitzzdec16(uint16_t *p, unsigned n, uint16_t start) { BITZDD(uint16_t, p, n, 1); }
+void bitzzdec64(uint64_t *p, unsigned n, uint64_t start) { BITZDD(uint64_t, p, n, 1); }
+void bitzzdec32(uint32_t *p, unsigned n, uint32_t start) { BITZDD(uint32_t, p, n, 1); }
//-----Undelta: In-place prefix sum (min. Delta = 1) -------------------
-uint8_t bitd18( uint8_t *in, unsigned n, uint8_t *px, uint8_t start) { uint8_t o=0,x=0,u; BITDE(uint8_t, in, n, 1, o |= u; x |= u ^ in[0]); if(px) *px = x; return o; }
-uint16_t bitd116(uint16_t *in, unsigned n, uint16_t *px, uint16_t start) { uint16_t o=0,x=0,u; BITDE(uint16_t, in, n, 1, o |= u; x |= u ^ in[0]); if(px) *px = x; return o; }
-uint64_t bitd164(uint64_t *in, unsigned n, uint64_t *px, uint64_t start) { uint64_t o=0,x=0,u; BITDE(uint64_t, in, n, 1, o |= u; x |= u ^ in[0]); if(px) *px = x; return o; }
+uint8_t bitd18( uint8_t *in, unsigned n, uint8_t *px, uint8_t start) { uint8_t o=0,x=0,u,*ip; BITDE(uint8_t, in, n, 1, o |= u; x |= u ^ in[0]); if(px) *px = x; return o; }
+uint16_t bitd116(uint16_t *in, unsigned n, uint16_t *px, uint16_t start) { uint16_t o=0,x=0,u,*ip; BITDE(uint16_t, in, n, 1, o |= u; x |= u ^ in[0]); if(px) *px = x; return o; }
+uint64_t bitd164(uint64_t *in, unsigned n, uint64_t *px, uint64_t start) { uint64_t o=0,x=0,u,*ip; BITDE(uint64_t, in, n, 1, o |= u; x |= u ^ in[0]); if(px) *px = x; return o; }
uint32_t bitd132(uint32_t *in, unsigned n, uint32_t *px, uint32_t start) {
- uint32_t o = 0, x=0, *ip = in, u0 = in[0]-start-1;
-
- #if defined(__SSE2__) || defined(__ARM_NEON)
+ uint32_t o, x, *ip, u0 = in[0]-start-1;
+ #ifdef __AVX2__
+ __m256i vb0 = _mm256_set1_epi32(u0),
+ vo0 = _mm256_setzero_si256(), vx0 = _mm256_setzero_si256(),
+ vo1 = _mm256_setzero_si256(), vx1 = _mm256_setzero_si256(); __m256i vs = _mm256_set1_epi32(start), cv = _mm256_set1_epi32(1);
+ for(ip = in; ip != in+(n&~(16-1)); ip += 16) { PREFETCH(ip+512,0);
+ __m256i vi0 = _mm256_loadu_si256((__m256i *)ip);
+ __m256i vi1 = _mm256_loadu_si256((__m256i *)(ip+8)); __m256i v0 = _mm256_sub_epi32(mm256_delta_epi32(vi0,vs),cv); vs = vi0;
+ __m256i v1 = _mm256_sub_epi32(mm256_delta_epi32(vi1,vs),cv); vs = vi1;
+ vo0 = _mm256_or_si256(vo0, v0);
+ vo1 = _mm256_or_si256(vo1, v1);
+ vx0 = _mm256_or_si256(vx0, _mm256_xor_si256(v0, vb0));
+ vx1 = _mm256_or_si256(vx1, _mm256_xor_si256(v1, vb0));
+ } start = (unsigned)_mm256_extract_epi32(vs, 7);
+ vo0 = _mm256_or_si256(vo0, vo1); o = mm256_hor_epi32(vo0);
+ vx0 = _mm256_or_si256(vx0, vx1); x = mm256_hor_epi32(vx0);
+ #elif defined(__SSE2__) || defined(__ARM_NEON)
__m128i vb0 = _mm_set1_epi32(u0),
vo0 = _mm_setzero_si128(), vx0 = _mm_setzero_si128(),
vo1 = _mm_setzero_si128(), vx1 = _mm_setzero_si128(); __m128i vs = _mm_set1_epi32(start), cv = _mm_set1_epi32(1);
- for(; ip != in+(n&~(8-1)); ip += 8) { PREFETCH(ip+512,0);
+ for(ip = in; ip != in+(n&~(8-1)); ip += 8) { PREFETCH(ip+512,0);
__m128i vi0 = _mm_loadu_si128((__m128i *)ip);
__m128i vi1 = _mm_loadu_si128((__m128i *)(ip+4)); __m128i v0 = _mm_sub_epi32(mm_delta_epi32(vi0,vs),cv); vs = vi0;
__m128i v1 = _mm_sub_epi32(mm_delta_epi32(vi1,vs),cv); vs = vi1;
@@ -432,7 +289,6 @@ uint32_t bitd132(uint32_t *in, unsigned n, uint32_t *px, uint32_t start) {
#else
ip = in; o = x = 0;
#endif
-
for(;ip != in+n; ip++) {
uint32_t u = ip[0] - start-1; start = *ip;
o |= u;
@@ -444,8 +300,8 @@ uint32_t bitd132(uint32_t *in, unsigned n, uint32_t *px, uint32_t start) {
uint16_t bits128v16(uint16_t *in, unsigned n, uint16_t *px, uint16_t start) {
#if defined(__SSE2__) || defined(__ARM_NEON)
- uint16_t *ip = in,b; __m128i bv = _mm_setzero_si128(), vs = _mm_set1_epi16(start), cv = _mm_set1_epi16(8);
- for(; ip != in+(n&~(8-1)); ip += 8) {
+ unsigned *ip,b; __m128i bv = _mm_setzero_si128(), vs = _mm_set1_epi16(start), cv = _mm_set1_epi16(8);
+ for(ip = in; ip != in+(n&~(4-1)); ip += 4) {
__m128i iv = _mm_loadu_si128((__m128i *)ip);
bv = _mm_or_si128(bv,_mm_sub_epi16(SUBI16x8(iv,vs),cv));
vs = iv;
@@ -459,8 +315,8 @@ uint16_t bits128v16(uint16_t *in, unsigned n, uint16_t *px, uint16_t start) {
unsigned bits128v32(uint32_t *in, unsigned n, uint32_t *px, uint32_t start) {
#if defined(__SSE2__) || defined(__ARM_NEON)
- unsigned *ip = in,b; __m128i bv = _mm_setzero_si128(), vs = _mm_set1_epi32(start), cv = _mm_set1_epi32(4);
- for(; ip != in+(n&~(4-1)); ip += 4) {
+ unsigned *ip,b; __m128i bv = _mm_setzero_si128(), vs = _mm_set1_epi32(start), cv = _mm_set1_epi32(4);
+ for(ip = in; ip != in+(n&~(4-1)); ip += 4) {
__m128i iv = _mm_loadu_si128((__m128i *)ip);
bv = _mm_or_si128(bv,_mm_sub_epi32(SUBI32x4(iv,vs),cv));
vs = iv;
@@ -472,26 +328,37 @@ unsigned bits128v32(uint32_t *in, unsigned n, uint32_t *px, uint32_t start) {
#endif
}
-void bitd1dec8( uint8_t *in, unsigned n, uint8_t start) { BITDD(uint8_t, in, n, 1); }
-void bitd1dec16(uint16_t *in, unsigned n, uint16_t start) { BITDD(uint16_t, in, n, 1); }
-void bitd1dec64(uint64_t *in, unsigned n, uint64_t start) { BITDD(uint64_t, in, n, 1); }
-
-void bitd1dec32(uint32_t *in, unsigned n, uint32_t start) {
- #if defined(__SSSE3__) || defined(__ARM_NEON)
+void bitd1dec8( uint8_t *p, unsigned n, uint8_t start) { BITDD(uint8_t, p, n, 1); }
+void bitd1dec16(uint16_t *p, unsigned n, uint16_t start) { BITDD(uint16_t, p, n, 1); }
+void bitd1dec64(uint64_t *p, unsigned n, uint64_t start) { BITDD(uint64_t, p, n, 1); }
+void bitd1dec32(uint32_t *p, unsigned n, uint32_t start) {
+ #ifdef __AVX2__
+ __m256i vs = _mm256_set1_epi32(start),zv = _mm256_setzero_si256(), cv = _mm256_set_epi32(8,7,6,5,4,3,2,1);
+ unsigned *ip;
+ for(ip = p; ip != p+(n&~(8-1)); ip += 8) {
+ __m256i v = _mm256_loadu_si256((__m256i *)ip); vs = mm256_scani_epi32(v, vs, cv);
+ _mm256_storeu_si256((__m256i *)ip, vs);
+ }
+ start = (unsigned)_mm256_extract_epi32(vs, 7);
+ while(ip != p+n) {
+ *ip = (start += (*ip) + 1);
+ ip++;
+ }
+ #elif defined(__SSE2__) || defined(__ARM_NEON)
__m128i vs = _mm_set1_epi32(start), cv = _mm_set_epi32(4,3,2,1);
- unsigned *ip = in;
- for(; ip != in+(n&~(4-1)); ip += 4) {
+ unsigned *ip;
+ for(ip = p; ip != p+(n&~(4-1)); ip += 4) {
__m128i v = _mm_loadu_si128((__m128i *)ip);
vs = mm_scani_epi32(v, vs, cv);
_mm_storeu_si128((__m128i *)ip, vs);
}
start = (unsigned)_mm_cvtsi128_si32(_mm_srli_si128(vs,12));
- while(ip != in+n) {
+ while(ip != p+n) {
*ip = (start += (*ip) + 1);
ip++;
}
#else
- BITDD(uint32_t, in, n, 1);
+ BITDD(uint32_t, p, n, 1);
#endif
}
@@ -508,14 +375,14 @@ uint16_t bitdi16(uint16_t *in, unsigned n, uint16_t *px, uint16_t start) { uint1
uint32_t bitdi32(uint32_t *in, unsigned n, uint32_t *px, uint32_t start) { uint32_t mindelta,u,*_ip; BITDIE(in, n); if(px) *px = 0; return mindelta; }
uint64_t bitdi64(uint64_t *in, unsigned n, uint64_t *px, uint64_t start) { uint64_t mindelta,u,*_ip; BITDIE(in, n); if(px) *px = 0; return mindelta; }
-uint8_t bitdienc8( uint8_t *in, unsigned n, uint8_t *out, uint8_t start, uint8_t mindelta) { uint8_t o=0,x=0,*op = out,u; BITDE(uint8_t, in, n, mindelta, o |= u; x |= u ^ in[0]; *op++ = u); return o; }
-uint16_t bitdienc16(uint16_t *in, unsigned n, uint16_t *out, uint16_t start, uint16_t mindelta) { uint16_t o=0,x=0,*op = out,u; BITDE(uint16_t, in, n, mindelta, o |= u; x |= u ^ in[0]; *op++ = u); return o; }
-uint64_t bitdienc64(uint64_t *in, unsigned n, uint64_t *out, uint64_t start, uint64_t mindelta) { uint64_t o=0,x=0,*op = out,u; BITDE(uint64_t, in, n, mindelta, o |= u; x |= u ^ in[0]; *op++ = u); return o; }
+uint8_t bitdienc8( uint8_t *in, unsigned n, uint8_t *out, uint8_t start, uint8_t mindelta) { uint8_t o=0,x=0,*op = out,u,*ip; BITDE(uint8_t, in, n, mindelta, o |= u; x |= u ^ in[0]; *op++ = u); return o; }
+uint16_t bitdienc16(uint16_t *in, unsigned n, uint16_t *out, uint16_t start, uint16_t mindelta) { uint16_t o=0,x=0,*op = out,u,*ip; BITDE(uint16_t, in, n, mindelta, o |= u; x |= u ^ in[0]; *op++ = u); return o; }
+uint64_t bitdienc64(uint64_t *in, unsigned n, uint64_t *out, uint64_t start, uint64_t mindelta) { uint64_t o=0,x=0,*op = out,u,*ip; BITDE(uint64_t, in, n, mindelta, o |= u; x |= u ^ in[0]; *op++ = u); return o; }
uint32_t bitdienc32(uint32_t *in, unsigned n, uint32_t *out, uint32_t start, uint32_t mindelta) {
#if defined(__SSE2__) || defined(__ARM_NEON)
- unsigned *ip = in,b,*op = out;
+ unsigned *ip,b,*op = out;
__m128i bv = _mm_setzero_si128(), vs = _mm_set1_epi32(start), cv = _mm_set1_epi32(mindelta), dv;
- for(; ip != in+(n&~(4-1)); ip += 4,op += 4) {
+ for(ip = in; ip != in+(n&~(4-1)); ip += 4,op += 4) {
__m128i iv = _mm_loadu_si128((__m128i *)ip);
bv = _mm_or_si128(bv, dv = _mm_sub_epi32(mm_delta_epi32(iv,vs),cv));
vs = iv;
@@ -536,12 +403,12 @@ uint32_t bitdienc32(uint32_t *in, unsigned n, uint32_t *out, uint32_t start, uin
return b;
}
-void bitdidec8( uint8_t *in, unsigned n, uint8_t start, uint8_t mindelta) { BITDD(uint8_t, in, n, mindelta); }
-void bitdidec16( uint16_t *in, unsigned n, uint16_t start, uint16_t mindelta) { BITDD(uint16_t, in, n, mindelta); }
-void bitdidec32( uint32_t *in, unsigned n, uint32_t start, uint32_t mindelta) { BITDD(uint32_t, in, n, mindelta); }
-void bitdidec64( uint64_t *in, unsigned n, uint64_t start, uint64_t mindelta) { BITDD(uint64_t, in, n, mindelta); }
+void bitdidec8( uint8_t *p, unsigned n, uint8_t start, uint8_t mindelta) { BITDD(uint8_t, p, n, mindelta); }
+void bitdidec16( uint16_t *p, unsigned n, uint16_t start, uint16_t mindelta) { BITDD(uint16_t, p, n, mindelta); }
+void bitdidec32( uint32_t *p, unsigned n, uint32_t start, uint32_t mindelta) { BITDD(uint32_t, p, n, mindelta); }
+void bitdidec64( uint64_t *p, unsigned n, uint64_t start, uint64_t mindelta) { BITDD(uint64_t, p, n, mindelta); }
-//------------------- For ---------------------------------------------------------------------------------------------------
+//------------------- For ------------------------------
uint8_t bitf8( uint8_t *in, unsigned n, uint8_t *px, uint8_t start) { if(px) *px = 0; return n?in[n-1] - start :0; }
uint8_t bitf18( uint8_t *in, unsigned n, uint8_t *px, uint8_t start) { if(px) *px = 0; return n?in[n-1] - start - n:0; }
uint16_t bitf16( uint16_t *in, unsigned n, uint16_t *px, uint16_t start) { if(px) *px = 0; return n?in[n-1] - start :0; }
@@ -551,25 +418,24 @@ uint32_t bitf132(uint32_t *in, unsigned n, uint32_t *px, uint32_t start) { if(px
uint64_t bitf64( uint64_t *in, unsigned n, uint64_t *px, uint64_t start) { if(px) *px = 0; return n?in[n-1] - start :0; }
uint64_t bitf164(uint64_t *in, unsigned n, uint64_t *px, uint64_t start) { if(px) *px = 0; return n?in[n-1] - start - n:0; }
-//------------------- Zigzag -------------------------------------------------------------------------------------------------------------------------------------
-#define ZE(i,_it_,_usize_) u = T2(zigzagenc, _usize_)((_it_)_ip[i]-(_it_)start); start = _ip[i]
-#define BITZENC(_ut_, _it_, _usize_, _in_,_n_, _act_) { _ut_ *_ip; x = -1;\
+//------------------- Zigzag ---------------------------
+#define ZE(i,_it_,_usize_) u = TEMPLATE2(zigzagenc, _usize_)((_it_)_ip[i]-(_it_)start); start = _ip[i]
+#define BITZENC(_ut_, _it_, _usize_, _in_,_n_, _act_) { _ut_ *_ip; o = 0; x = -1;\
for(_ip = _in_; _ip != _in_+(_n_&~(4-1)); _ip += 4) { ZE(0,_it_,_usize_);_act_; ZE(1,_it_,_usize_);_act_; ZE(2,_it_,_usize_);_act_; ZE(3,_it_,_usize_);_act_; }\
for(;_ip != _in_+_n_; _ip++) { ZE(0,_it_,_usize_); _act_; }\
}
// 'or' bits for zigzag encoding
-uint8_t bitz8( uint8_t *in, unsigned n, uint8_t *px, uint8_t start) { uint8_t o=0, u,x; BITZENC(uint8_t, int8_t, 8, in, n, o |= x); if(px) *px = 0; return o; }
-uint64_t bitz64(uint64_t *in, unsigned n, uint64_t *px, uint64_t start) { uint64_t o=0, u,x; BITZENC(uint64_t, int64_t,64,in, n, o |= x); if(px) *px = 0; return o; }
+uint8_t bitz8( uint8_t *in, unsigned n, uint8_t *px, uint8_t start) { uint8_t o, u,x; BITZENC(uint8_t, int8_t, 8, in, n, o |= x); if(px) *px = 0; return o; }
+uint64_t bitz64(uint64_t *in, unsigned n, uint64_t *px, uint64_t start) { uint64_t o, u,x; BITZENC(uint64_t, int64_t,64,in, n, o |= x); if(px) *px = 0; return o; }
uint16_t bitz16(uint16_t *in, unsigned n, uint16_t *px, uint16_t start) {
- uint16_t o, x, *ip = in;
- uint32_t u0 = zigzagenc16((int)in[0] - (int)start);
+ uint16_t o, x, *ip; uint32_t u0 = zigzagenc16((int)in[0] - (int)start);
#if defined(__SSE2__) || defined(__ARM_NEON)
__m128i vb0 = _mm_set1_epi16(u0), vo0 = _mm_setzero_si128(), vx0 = _mm_setzero_si128(),
vo1 = _mm_setzero_si128(), vx1 = _mm_setzero_si128(); __m128i vs = _mm_set1_epi16(start);
- for(; ip != in+(n&~(16-1)); ip += 16) { PREFETCH(ip+512,0);
+ for(ip = in; ip != in+(n&~(16-1)); ip += 16) { PREFETCH(ip+512,0);
__m128i vi0 = _mm_loadu_si128((__m128i *) ip);
__m128i vi1 = _mm_loadu_si128((__m128i *)(ip+8)); __m128i v0 = mm_delta_epi16(vi0,vs); vs = vi0; v0 = mm_zzage_epi16(v0);
__m128i v1 = mm_delta_epi16(vi1,vs); vs = vi1; v1 = mm_zzage_epi16(v1);
@@ -594,13 +460,27 @@ uint16_t bitz16(uint16_t *in, unsigned n, uint16_t *px, uint16_t start) {
}
uint32_t bitz32(unsigned *in, unsigned n, uint32_t *px, unsigned start) {
- uint32_t o, x, *ip=in,
- u0 = zigzagenc32((int)in[0] - (int)start);
- #if defined(__SSE2__) || defined(__ARM_NEON)
+ uint32_t o, x, *ip; uint32_t u0 = zigzagenc32((int)in[0] - (int)start);
+ #ifdef __AVX2__
+ __m256i vb0 = _mm256_set1_epi32(u0), vo0 = _mm256_setzero_si256(), vx0 = _mm256_setzero_si256(),
+ vo1 = _mm256_setzero_si256(), vx1 = _mm256_setzero_si256(); __m256i vs = _mm256_set1_epi32(start);
+ for(ip = in; ip != in+(n&~(16-1)); ip += 16) { PREFETCH(ip+512,0);
+ __m256i vi0 = _mm256_loadu_si256((__m256i *) ip);
+ __m256i vi1 = _mm256_loadu_si256((__m256i *)(ip+8)); __m256i v0 = mm256_delta_epi32(vi0,vs); vs = vi0; v0 = mm256_zzage_epi32(v0);
+ __m256i v1 = mm256_delta_epi32(vi1,vs); vs = vi1; v1 = mm256_zzage_epi32(v1);
+ vo0 = _mm256_or_si256(vo0, v0);
+ vo1 = _mm256_or_si256(vo1, v1);
+ vx0 = _mm256_or_si256(vx0, _mm256_xor_si256(v0, vb0));
+ vx1 = _mm256_or_si256(vx1, _mm256_xor_si256(v1, vb0));
+ } start = (unsigned)_mm256_extract_epi32(vs, 7);
+ vo0 = _mm256_or_si256(vo0, vo1); o = mm256_hor_epi32(vo0);
+ vx0 = _mm256_or_si256(vx0, vx1); x = mm256_hor_epi32(vx0);
+
+ #elif defined(__SSE2__) || defined(__ARM_NEON)
__m128i vb0 = _mm_set1_epi32(u0),
vo0 = _mm_setzero_si128(), vx0 = _mm_setzero_si128(),
vo1 = _mm_setzero_si128(), vx1 = _mm_setzero_si128(); __m128i vs = _mm_set1_epi32(start);
- for(; ip != in+(n&~(8-1)); ip += 8) { //PREFETCH(ip+512,0);
+ for(ip = in; ip != in+(n&~(8-1)); ip += 8) { PREFETCH(ip+512,0);
__m128i vi0 = _mm_loadu_si128((__m128i *) ip);
__m128i vi1 = _mm_loadu_si128((__m128i *)(ip+4)); __m128i v0 = mm_delta_epi32(vi0,vs); vs = vi0; v0 = mm_zzage_epi32(v0);
__m128i v1 = mm_delta_epi32(vi1,vs); vs = vi1; v1 = mm_zzage_epi32(v1);
@@ -608,7 +488,7 @@ uint32_t bitz32(unsigned *in, unsigned n, uint32_t *px, unsigned start) {
vo1 = _mm_or_si128(vo1, v1);
vx0 = _mm_or_si128(vx0, _mm_xor_si128(v0, vb0));
vx1 = _mm_or_si128(vx1, _mm_xor_si128(v1, vb0));
- } start = _mm_cvtsi128_si32(_mm_srli_si128(vs,12));
+ } start = mm_cvtsi128_si16(_mm_srli_si128(vs,12));
vo0 = _mm_or_si128(vo0, vo1); o = mm_hor_epi32(vo0);
vx0 = _mm_or_si128(vx0, vx1); x = mm_hor_epi32(vx0);
#else
@@ -623,25 +503,19 @@ uint32_t bitz32(unsigned *in, unsigned n, uint32_t *px, unsigned start) {
return o;
}
-uint8_t bitzenc8( uint8_t *in, unsigned n, uint8_t *out, uint8_t start, uint8_t mindelta) { uint8_t o=0,x,u,*op = out; BITZENC(uint8_t, int8_t, 8,in, n, o |= u; *op++ = u); return o; }
-uint16_t bitzenc16(uint16_t *in, unsigned n, uint16_t *out, uint16_t start, uint16_t mindelta) { uint16_t o=0,x,u,*op = out; BITZENC(uint16_t, int16_t,16,in, n, o |= u; *op++ = u); return o; }
-uint64_t bitzenc64(uint64_t *in, unsigned n, uint64_t *out, uint64_t start, uint64_t mindelta) { uint64_t o=0,x,u,*op = out; BITZENC(uint64_t, int64_t,64,in, n, o |= u; *op++ = u); return o; }
-
+uint8_t bitzenc8( uint8_t *in, unsigned n, uint8_t *out, uint8_t start, uint8_t mindelta) { uint8_t o,x,u,*op = out; BITZENC(uint8_t, int8_t, 8,in, n, o |= u; *op++ = u); return o; }
+uint16_t bitzenc16(uint16_t *in, unsigned n, uint16_t *out, uint16_t start, uint16_t mindelta) { uint16_t o,x,u,*op = out; BITZENC(uint16_t, int16_t,16,in, n, o |= u; *op++ = u); return o; }
+uint64_t bitzenc64(uint64_t *in, unsigned n, uint64_t *out, uint64_t start, uint64_t mindelta) { uint64_t o,x,u,*op = out; BITZENC(uint64_t, int64_t,64,in, n, o |= u; *op++ = u); return o; }
uint32_t bitzenc32(uint32_t *in, unsigned n, uint32_t *out, uint32_t start, uint32_t mindelta) {
#if defined(__SSE2__) || defined(__ARM_NEON)
- unsigned *ip = in,b,*op = out;
- __m128i bv = _mm_setzero_si128(), vs = _mm_set1_epi32(start);
- for(; ip != in+(n&~(8-1)); ip += 8,op += 8) {
- __m128i iv0 = _mm_loadu_si128((__m128i *)ip), dv0;
- __m128i iv1 = _mm_loadu_si128((__m128i *)(ip+4)), dv1;
- dv0 = mm_delta_epi32(iv0,vs); vs = iv0;
- dv0 = mm_zzage_epi32(dv0);
- bv = _mm_or_si128(bv, dv0);
- dv1 = mm_delta_epi32(iv1,vs); vs = iv1;
- dv1 = mm_zzage_epi32(dv1);
- bv = _mm_or_si128(bv, dv1);
- _mm_storeu_si128((__m128i *)op, dv0);
- _mm_storeu_si128((__m128i *)(op+4), dv1);
+ unsigned *ip,b,*op = out;
+ __m128i bv = _mm_setzero_si128(), vs = _mm_set1_epi32(start), dv;
+ for(ip = in; ip != in+(n&~(4-1)); ip += 4,op += 4) {
+ __m128i iv = _mm_loadu_si128((__m128i *)ip);
+ dv = mm_delta_epi32(iv,vs); vs = iv;
+ dv = mm_zzage_epi32(dv);
+ bv = _mm_or_si128(bv, dv);
+ _mm_storeu_si128((__m128i *)op, dv);
}
start = (unsigned)_mm_cvtsi128_si32(_mm_srli_si128(vs,12));
b = mm_hor_epi32(bv);
@@ -653,122 +527,81 @@ uint32_t bitzenc32(uint32_t *in, unsigned n, uint32_t *out, uint32_t start, uint
*op++ = x;
}
#else
- uint32_t b = 0, *op = out,x, u;
+ uint32_t b = 0, *op = out,x;
BITZENC(uint32_t, int32_t, 32,in, n, b |= x; *op++ = x);
#endif
return bsr32(b);
}
-#define ZD(_t_, _usize_, i) { _t_ _z = _ip[i]; _ip[i] = (start += T2(zigzagdec, _usize_)(_z)); }
+#define ZD(_t_, _usize_, i) { _t_ _z = _ip[i]; _ip[i] = (start += TEMPLATE2(zigzagdec, _usize_)(_z)); }
#define BITZDEC(_t_, _usize_, _in_, _n_) { _t_ *_ip;\
for(_ip = _in_; _ip != _in_+(_n_&~(4-1)); _ip += 4) { ZD(_t_, _usize_, 0); ZD(_t_, _usize_, 1); ZD(_t_, _usize_, 2); ZD(_t_, _usize_, 3); }\
for(;_ip != _in_+_n_;_ip++) ZD(_t_, _usize_, 0);\
}
-void bitzdec8( uint8_t *in, unsigned n, uint8_t start) { BITZDEC(uint8_t, 8, in, n); }
-void bitzdec64(uint64_t *in, unsigned n, uint64_t start) { BITZDEC(uint64_t, 64,in, n); }
+void bitzdec8( uint8_t *p, unsigned n, uint8_t start) { BITZDEC(uint8_t, 8, p, n); }
+void bitzdec64(uint64_t *p, unsigned n, uint64_t start) { BITZDEC(uint64_t, 64,p, n); }
-void bitzdec16(uint16_t *in, unsigned n, uint16_t start) {
+void bitzdec16(uint16_t *p, unsigned n, uint16_t start) {
#if defined(__SSSE3__) || defined(__ARM_NEON)
__m128i vs = _mm_set1_epi16(start); //, c1 = _mm_set1_epi32(1), cz = _mm_setzero_si128();
- uint16_t *ip = in;
- for(; ip != in+(n&~(8-1)); ip += 8) {
- __m128i iv = _mm_loadu_si128((__m128i *)ip);
- iv = mm_zzagd_epi16(iv);
+ uint16_t *ip;
+ for(ip = p; ip != p+(n&~(8-1)); ip += 8) {
+ __m128i iv = _mm_loadu_si128((__m128i *)ip);
+ iv = mm_zzagd_epi16(iv);
vs = mm_scan_epi16(iv, vs);
_mm_storeu_si128((__m128i *)ip, vs);
}
start = (uint16_t)_mm_cvtsi128_si32(_mm_srli_si128(vs,14));
- while(ip != in+n) {
+ while(ip != p+n) {
uint16_t z = *ip;
*ip++ = (start += (z >> 1 ^ -(z & 1)));
}
#else
- BITZDEC(uint16_t, 16, in, n);
+ BITZDEC(uint16_t, 16, p, n);
#endif
}
-void bitzdec32(unsigned *in, unsigned n, unsigned start) {
- #if defined(__SSSE3__) || defined(__ARM_NEON)
- __m128i vs = _mm_set1_epi32(start);
- unsigned *ip = in;
- for(; ip != in+(n&~(8-1)); ip += 8) {
- __m128i iv0 = _mm_loadu_si128((__m128i *)ip),
- iv1 = _mm_loadu_si128((__m128i *)(ip+4));
- iv0 = mm_zzagd_epi32(iv0);
- iv1 = mm_zzagd_epi32(iv1);
- vs = mm_scan_epi32(iv0, vs);
- __m128i _vs = vs;
- vs = mm_scan_epi32(iv1, vs);
- _mm_storeu_si128((__m128i *)ip, _vs);
- _mm_storeu_si128((__m128i *)(ip+4), vs);
+void bitzdec32(unsigned *p, unsigned n, unsigned start) {
+ #ifdef __AVX2__
+ __m256i vs = _mm256_set1_epi32(start); //, zv = _mm256_setzero_si256()*/; //, c1 = _mm_set1_epi32(1), cz = _mm_setzero_si128();
+ unsigned *ip;
+ for(ip = p; ip != p+(n&~(8-1)); ip += 8) {
+ __m256i iv = _mm256_loadu_si256((__m256i *)ip);
+ iv = mm256_zzagd_epi32(iv);
+ vs = mm256_scan_epi32(iv,vs);
+ _mm256_storeu_si256((__m256i *)ip, vs);
+ }
+ start = (unsigned)_mm256_extract_epi32(_mm256_srli_si256(vs,12), 4);
+ while(ip != p+n) {
+ unsigned z = *ip;
+ *ip++ = (start += (z >> 1 ^ -(z & 1)));
+ }
+ #elif defined(__SSE2__) || defined(__ARM_NEON)
+ __m128i vs = _mm_set1_epi32(start); //, c1 = _mm_set1_epi32(1), cz = _mm_setzero_si128();
+ unsigned *ip;
+ for(ip = p; ip != p+(n&~(4-1)); ip += 4) {
+ __m128i iv = _mm_loadu_si128((__m128i *)ip);
+ iv = mm_zzagd_epi32(iv);
+ vs = mm_scan_epi32(iv, vs);
+ _mm_storeu_si128((__m128i *)ip, vs);
}
start = (unsigned)_mm_cvtsi128_si32(_mm_srli_si128(vs,12));
- while(ip != in+n) {
+ while(ip != p+n) {
unsigned z = *ip;
*ip++ = (start += zigzagdec32(z));
}
#else
- BITZDEC(uint32_t, 32, in, n);
+ BITZDEC(uint32_t, 32, p, n);
#endif
}
-//----------------------- XOR ------------------------------------------------------------------------------------------------------
+//----------------------- XOR : return max. bits ---------------------------------
#define XE(i) x = _ip[i] ^ start; start = _ip[i]
#define BITXENC(_t_, _in_, _n_, _act_) { _t_ *_ip;\
for(_ip = _in_; _ip != _in_+(_n_&~(4-1)); _ip += 4) { XE(0);_act_; XE(1);_act_; XE(2);_act_; XE(3);_act_; }\
for( ; _ip != _in_+ _n_; _ip++ ) { XE(0);_act_; }\
}
-
-uint8_t bitx8( uint8_t *in, unsigned n, uint8_t *px, uint8_t start) { uint8_t o=0, u=0,x; BITXENC(uint8_t, in, n, o |= x); if(px) *px = 0; return o; }
-uint64_t bitx64(uint64_t *in, unsigned n, uint64_t *px, uint64_t start) { uint64_t o=0, u=0,x; BITXENC(uint64_t, in, n, o |= x); if(px) *px = 0; return o; }
-
-uint16_t bitx16(uint16_t *in, unsigned n, uint16_t *px, uint16_t start) {
- uint16_t o = 0, *ip = in;
-
- #if defined(__SSE2__) || defined(__ARM_NEON)
- __m128i vo0 = _mm_setzero_si128(),
- vo1 = _mm_setzero_si128(),
- vs = _mm_set1_epi16(start);
- for(; ip != in+(n&~(16-1)); ip += 16) { //PREFETCH(ip+512,0);
- __m128i vi0 = _mm_loadu_si128((__m128i *) ip);
- __m128i vi1 = _mm_loadu_si128((__m128i *)(ip+8)); __m128i v0 = mm_xore_epi16(vi0,vs); vs = vi0;
- __m128i v1 = mm_xore_epi16(vi1,vs); vs = vi1;
- vo0 = _mm_or_si128(vo0, v0);
- vo1 = _mm_or_si128(vo1, v1);
- } start = mm_cvtsi128_si16(_mm_srli_si128(vs,14));
- vo0 = _mm_or_si128(vo0, vo1); o = mm_hor_epi16(vo0);
- #endif
- for(;ip != in+n; ip++) {
- o |= ip[0] ^ start; start = ip[0];
- }
- if(px) *px = o;
- return o;
-}
-
-uint32_t bitx32(unsigned *in, unsigned n, uint32_t *px, uint32_t start) {
- uint32_t o = 0, *ip = in;
-
- #if defined(__SSE2__) || defined(__ARM_NEON)
- __m128i vo0 = _mm_setzero_si128(),
- vo1 = _mm_setzero_si128(),
- vs = _mm_set1_epi32(start);
- for(; ip != in+(n&~(8-1)); ip += 8) { //PREFETCH(ip+512,0);
- __m128i vi0 = _mm_loadu_si128((__m128i *) ip);
- __m128i vi1 = _mm_loadu_si128((__m128i *)(ip+4)); __m128i v0 = mm_xore_epi32(vi0,vs); vs = vi0;
- __m128i v1 = mm_xore_epi32(vi1,vs); vs = vi1;
- vo0 = _mm_or_si128(vo0, v0);
- vo1 = _mm_or_si128(vo1, v1);
- } start = _mm_cvtsi128_si32(_mm_srli_si128(vs,12));
- vo0 = _mm_or_si128(vo0, vo1); o = mm_hor_epi32(vo0);
- #endif
- for(;ip != in+n; ip++) {
- o |= ip[0] ^ start; start = ip[0];
- }
- if(px) *px = o;
- return o;
-}
-
uint8_t bitxenc8( uint8_t *in, unsigned n, uint8_t *out, uint8_t start) { uint8_t b = 0,*op = out,x; BITXENC(uint8_t, in, n, b |= x; *op++ = x); return b; }
uint16_t bitxenc16(uint16_t *in, unsigned n, uint16_t *out, uint16_t start) { uint16_t b = 0,*op = out,x; BITXENC(uint16_t, in, n, b |= x; *op++ = x); return b; }
uint32_t bitxenc32(uint32_t *in, unsigned n, uint32_t *out, uint32_t start) { uint32_t b = 0,*op = out,x; BITXENC(uint32_t, in, n, b |= x; *op++ = x); return b; }
@@ -780,50 +613,10 @@ uint64_t bitxenc64(uint64_t *in, unsigned n, uint64_t *out, uint64_t start) { ui
for( ;_ip != _in_+ _n_ ; _ip++ ) XD(0);\
}
-void bitxdec8( uint8_t *in, unsigned n, uint8_t start) { BITXDEC(uint8_t, in, n); }
-void bitxdec64(uint64_t *in, unsigned n, uint64_t start) { BITXDEC(uint64_t, in, n); }
-
-void bitxdec16(uint16_t *in, unsigned n, uint16_t start) {
- #if defined(__SSSE3__) || defined(__ARM_NEON)
- __m128i vs = _mm_set1_epi16(start);
- uint16_t *ip = in;
- for(; ip != in+(n&~(8-1)); ip += 8) {
- __m128i iv = _mm_loadu_si128((__m128i *)ip);
- vs = mm_xord_epi16(iv, vs);
- _mm_storeu_si128((__m128i *)ip, vs);
- }
- start = (uint16_t)_mm_cvtsi128_si32(_mm_srli_si128(vs,14));
- while(ip != in+n) {
- uint16_t z = *ip;
- *ip++ = (start ^= z);
- }
- #else
- BITXDEC(uint16_t, in, n);
- #endif
-}
-
-void bitxdec32(unsigned *in, unsigned n, unsigned start) {
- #if defined(__SSSE3__) || defined(__ARM_NEON)
- __m128i vs = _mm_set1_epi32(start);
- unsigned *ip = in;
- for(; ip != in+(n&~(8-1)); ip += 8) {
- __m128i iv0 = _mm_loadu_si128((__m128i *)ip),
- iv1 = _mm_loadu_si128((__m128i *)(ip+4));
- vs = mm_xord_epi32(iv0, vs);
- __m128i _vs = vs;
- vs = mm_xord_epi32(iv1, vs);
- _mm_storeu_si128((__m128i *)ip, _vs);
- _mm_storeu_si128((__m128i *)(ip+4), vs);
- }
- start = (unsigned)_mm_cvtsi128_si32(_mm_srli_si128(vs,12));
- while(ip != in+n) {
- unsigned z = *ip;
- *ip++ = (start ^= z);
- }
- #else
- BITXDEC(uint32_t, 32, in, n);
- #endif
-}
+void bitxdec8( uint8_t *p, unsigned n, uint8_t start) { BITXDEC(uint8_t, p, n); }
+void bitxdec16(uint16_t *p, unsigned n, uint16_t start) { BITXDEC(uint16_t, p, n); }
+void bitxdec32(uint32_t *p, unsigned n, uint32_t start) { BITXDEC(uint32_t, p, n); }
+void bitxdec64(uint64_t *p, unsigned n, uint64_t start) { BITXDEC(uint64_t, p, n); }
//-------------- For : calc max. bits, min,max value ------------------------
#define FM(i) mi = _ip[i] < mi?_ip[i]:mi; mx = _ip[i] > mx?_ip[i]:mx
@@ -837,125 +630,60 @@ uint16_t bitfm16(uint16_t *in, unsigned n, uint16_t *px, uint16_t *pmin) { uint
uint32_t bitfm32(uint32_t *in, unsigned n, uint32_t *px, uint32_t *pmin) { uint32_t mi,mx; BITFM(uint32_t, in, n); *pmin = mi; if(px) *px = 0; return mx - mi; }
uint64_t bitfm64(uint64_t *in, unsigned n, uint64_t *px, uint64_t *pmin) { uint64_t mi,mx; BITFM(uint64_t, in, n); *pmin = mi; if(px) *px = 0; return mx - mi; }
-//---------------------- any esize ----------------------------------
-void bitxenc(unsigned char *in, unsigned n, unsigned char *out, unsigned esize) {
- switch(esize) {
- case 2 : bitxenc16(in, n/2, out, 0); break;
- case 4 : bitxenc32(in, n/4, out, 0); break;
- case 8 : bitxenc64(in, n/8, out, 0); break;
- default: bitxenc8( in, n/1, out, 0); break;
- }
-}
-
-void bitxdec(unsigned char *in, unsigned n, unsigned esize) {
- switch(esize) {
- case 2 : bitxdec16(in, n/2, 0);break;
- case 4 : bitxdec32(in, n/4, 0);break;
- case 8 : bitxdec64(in, n/8, 0);break;
- default: bitxdec8( in, n/1, 0);break;
- }
-}
-
-void bitzenc(unsigned char *in, unsigned n, unsigned char *out, unsigned esize) {
- switch(esize) {
- case 2 : bitzenc16(in, n/2, out, 0, 0); break;
- case 4 : bitzenc32(in, n/4, out, 0, 0); break;
- case 8 : bitzenc64(in, n/8, out, 0, 0); break;
- default: bitzenc8( in, n/1, out, 0, 0); break;
- }
-}
-
-void bitzdec(unsigned char *in, unsigned n, unsigned esize) {
- switch(esize) {
- case 2 : bitzdec16(in, n/2, 0);break;
- case 4 : bitzdec32(in, n/4, 0);break;
- case 8 : bitzdec64(in, n/8, 0);break;
- default: bitzdec8( in, n/1, 0);break;
- }
-}
-
//----------- Lossy floating point conversion: pad the trailing mantissa bits with zero bits according to the relative error e (ex. 0.00001) ----------
#ifdef USE_FLOAT16
// https://clang.llvm.org/docs/LanguageExtensions.html#half-precision-floating-point
#define ctof16(_cp_) (*(_Float16 *)(_cp_))
-_Float16 _fprazor16(_Float16 d, float e, int lg2e) {
- uint16_t du = ctou16(&d), sign, u;
- int b = (du>>10 & 0x1f) - 15; // mantissa=10 bits, exponent=5bits, bias=15
- _Float16 ed;
- if ((b = 12 - b - lg2e) <= 0)
- return d;
- b = b > 10?10:b;
- sign = du & (1<<15);
- du &= 0x7fff;
- for(d = ctof16(&du), ed = e * d;;) {
- u = du & (~((1u<<(--b))-1)); if(d - ctof16(&u) <= ed) break;
- u = du & (~((1u<<(--b))-1)); if(d - ctof16(&u) <= ed) break;
- }
- u |= sign;
+static inline _Float16 _fppad16(_Float16 d, float e, int lg2e) {
+ uint16_t u, du = ctou16(&d);
+ int b = (du>>10 & 0x1f)-15; // mantissa=10 bits, exponent=5bits, bias=15
+ if ((b = 12 - b - lg2e) <= 0) return d;
+ b = (b > 10) ? 10 : b;
+ do { u = du & (~((1u<<(--b))-1)); } while (fabs((ctof16(&u) - d)/d) > e);
return ctof16(&u);
}
-void fprazor16(_Float16 *in, unsigned n, _Float16 *out, float e) {
- int lg2e = -log(e)/log(2.0); _Float16 *ip;
-
- for (ip = in; ip < in+n; ip++,out++)
- *out = _fprazor16(*ip, e, lg2e);
-}
+void fppad16(_Float16 *in, size_t n, _Float16 *out, float e) { int lg2e = -log(e)/log(2.0); _Float16 *ip; for (ip = in; ip < in+n; ip++,out++) *out = _fppad16(*ip, e, lg2e); }
#endif
-float _fprazor32(float d, float e, int lg2e) {
- uint32_t du = ctou32(&d), sign, u;
- int b = (du>>23 & 0xff) - 0x7e;
- float ed;
-
+//do u = du & (~((1u<<(--b))-1)); while(fabsf((ctof32(&u) - d)/d) > e);
+#define OP(t,s) sign = du & ((t)1<<(s-1)); du &= ~((t)1<<(s-1)); d = TEMPLATE2(ctof,s)(&du);\
+ do u = du & (~(((t)1<<(--b))-1)); while(d - TEMPLATE2(ctof,s)(&u) > e*d);\
+ u |= sign;\
+ return TEMPLATE2(ctof,s)(&u);
+
+static inline float _fppad32(float d, float e, int lg2e) {
+ uint32_t u, du = ctou32(&d), sign;
+ int b = (du>>23 & 0xff)-0x7e;
if((b = 25 - b - lg2e) <= 0)
- return d; AS(!isnan(d), "_fprazor32: isnan");
+ return d;
b = b > 23?23:b;
sign = du & (1<<31);
du &= 0x7fffffffu;
-
- for(d = ctof32(&du), ed = e * d;;) {
- u = du & (~((1u<<(--b))-1)); if(d - ctof32(&u) <= ed) break;
- u = du & (~((1u<<(--b))-1)); if(d - ctof32(&u) <= ed) break;
- u = du & (~((1u<<(--b))-1)); if(d - ctof32(&u) <= ed) break;
- }
+ d = ctof32(&du);
+ do u = du & (~((1u<<(--b))-1)); while(d - ctof32(&u) > e*d);
u |= sign;
return ctof32(&u);
}
-void fprazor32(float *in, unsigned n, float *out, float e) {
- int lg2e = -log(e)/log(2.0);
- float *ip;
- for(ip = in; ip < in+n; ip++,out++)
- *out = _fprazor32(*ip, e, lg2e);
-}
+void fppad32(float *in, size_t n, float *out, float e) { int lg2e = -log(e)/log(2.0); float *ip; for(ip = in; ip < in+n; ip++,out++) *out = _fppad32(*ip, e, lg2e); }
-double _fprazor64(double d, double e, int lg2e) { //if(isnan(d)) return d;
- uint64_t du = ctou64(&d), sign, u;
- int b = (du>>52 & 0x7ff) - 0x3fe;
- double ed;
-
+static inline double _fppad64(double d, double e, int lg2e) { if(isnan(d)) return d;
+ union r { uint64_t u; double d; } u,du; du.d = d; //if((du.u>>52)==0xfff)
+ uint64_t sign;
+ int b = (du.u>>52 & 0x7ff)-0x3fe;
if((b = 54 - b - lg2e) <= 0)
return d;
- b = b > 52?52:b;
- sign = du & (1ull<<63);
- du &= 0x7fffffffffffffffull;
-
- for(d = ctof64(&du), ed = e * d;;) {
- u = du & (~((1ull<<(--b))-1)); if(d - ctof64(&u) <= ed) break;
- u = du & (~((1ull<<(--b))-1)); if(d - ctof64(&u) <= ed) break;
- }
- u |= sign;
+ b = b > 52?52:b;
+ sign = du.u & (1ull<<63); du.u &= 0x7fffffffffffffffull;
+ int _b = b;
+ for(;;) { if((_b -= 8) <= 0) break; u.u = du.u & (~((1ull<<_b)-1)); if(d - u.d <= e*d) break; b = _b; }
+ do u.u = du.u & (~((1ull<<(--b))-1)); while(d - u.d > e*d);
+ u.u |= sign;
return ctof64(&u);
}
-void fprazor64(double *in, unsigned n, double *out, double e) {
- int lg2e = -log(e)/log(2.0);
- double *ip;
-
- for(ip = in; ip < in+n; ip++,out++)
- *out = _fprazor64(*ip, e, lg2e);
-}
-#endif
+void fppad64(double *in, size_t n, double *out, double e) { int lg2e = -log(e)/log(2.0); double *ip; for(ip = in; ip < in+n; ip++,out++) *out = _fppad64(*ip, e, lg2e); }
+
diff --git a/src/ext/for/include_/bitutil_.h b/src/ext/for/bitutil.h
similarity index 55%
rename from src/ext/for/include_/bitutil_.h
rename to src/ext/for/bitutil.h
index 69850433..e311b414 100644
--- a/src/ext/for/include_/bitutil_.h
+++ b/src/ext/for/bitutil.h
@@ -1,5 +1,5 @@
/**
- Copyright (C) powturbo 2013-2023
+ Copyright (C) powturbo 2013-2019
GPL v2 License
This program is free software; you can redistribute it and/or modify
@@ -22,6 +22,8 @@
- email : powturbo [_AT_] gmail [_DOT_] com
**/
// "Integer Compression: max.bits, delta, zigzag, xor"
+
+#ifdef BITUTIL_IN
#ifdef __AVX2__
#include <immintrin.h>
#elif defined(__AVX__)
@@ -46,15 +48,14 @@
#else
#include <stdint.h>
#endif
-#include "../include_/sse_neon.h"
+#include "sse_neon.h"
#ifdef __ARM_NEON
#define PREFETCH(_ip_,_rw_)
#else
-#define PREFETCH(_ip_,_rw_) //__builtin_prefetch(_ip_,_rw_)
+#define PREFETCH(_ip_,_rw_) __builtin_prefetch(_ip_,_rw_)
#endif
-
-//------------------------ zigzag encoding ----------------------------------------------
+//------------------------ zigzag encoding -------------------------------------------------------------
static inline unsigned char zigzagenc8( signed char x) { return x << 1 ^ x >> 7; }
static inline char zigzagdec8( unsigned char x) { return x >> 1 ^ -(x & 1); }
@@ -67,137 +68,128 @@ static inline int zigzagdec32(unsigned x) { return x >> 1 ^ -(x
static inline uint64_t zigzagenc64(int64_t x) { return x << 1 ^ x >> 63; }
static inline int64_t zigzagdec64(uint64_t x) { return x >> 1 ^ -(x & 1); }
+ #if defined(__SSE2__) || defined(__ARM_NEON)
+static ALWAYS_INLINE __m128i mm_zzage_epi16(__m128i v) { return _mm_xor_si128( mm_slli_epi16(v,1), mm_srai_epi16(v,15)); }
+static ALWAYS_INLINE __m128i mm_zzage_epi32(__m128i v) { return _mm_xor_si128( mm_slli_epi32(v,1), mm_srai_epi32(v,31)); }
+//static ALWAYS_INLINE __m128i mm_zzage_epi64(__m128i v) { return _mm_xor_si128( mm_slli_epi64(v,1), _mm_srai_epi64(v,63)); }
+
+static ALWAYS_INLINE __m128i mm_zzagd_epi16(__m128i v) { return _mm_xor_si128( mm_srli_epi16(v,1), mm_srai_epi16( mm_slli_epi16(v,15),15) ); }
+static ALWAYS_INLINE __m128i mm_zzagd_epi32(__m128i v) { return _mm_xor_si128( mm_srli_epi32(v,1), mm_srai_epi32( mm_slli_epi32(v,31),31) ); }
+//static ALWAYS_INLINE __m128i mm_zzagd_epi64(__m128i v) { return _mm_xor_si128(mm_srli_epi64(v,1), _mm_srai_epi64( m_slli_epi64(v,63),63) ); }
+
+ #endif
#ifdef __AVX2__
-#define mm256_srai_epi64_63(v, s) _mm256_srai_epi32(_mm256_shuffle_epi32(v, _MM_SHUFFLE(3, 3, 1, 1)), 31)
-static ALWAYS_INLINE __m256i mm256_zzage_epi32(__m256i v) { return _mm256_xor_si256(_mm256_slli_epi32(v,1), _mm256_srai_epi32( v,31)); }
-static ALWAYS_INLINE __m256i mm256_zzage_epi64(__m256i v) { return _mm256_xor_si256(_mm256_slli_epi64(v,1), mm256_srai_epi64_63(v,63)); }
-static ALWAYS_INLINE __m256i mm256_zzagd_epi32(__m256i v) { return _mm256_xor_si256(_mm256_srli_epi32(v,1), _mm256_srai_epi32( _mm256_slli_epi32(v,31),31) ); }
-static ALWAYS_INLINE __m256i mm256_zzagd_epi64(__m256i v) { return _mm256_xor_si256(_mm256_srli_epi64(v,1), mm256_srai_epi64_63(_mm256_slli_epi64(v,63),63) ); }
+static ALWAYS_INLINE __m256i mm256_zzage_epi32(__m256i v) { return _mm256_xor_si256(_mm256_slli_epi32(v,1), _mm256_srai_epi32(v,31)); }
+static ALWAYS_INLINE __m256i mm256_zzagd_epi32(__m256i v) { return _mm256_xor_si256(_mm256_srli_epi32(v,1), _mm256_srai_epi32(_mm256_slli_epi32(v,31),31) ); }
+ #endif
-//-- AVX2 delta <-> prefix sum (scan) / xor encode <-> xor decode ---------------------------------------------------------------------------------------
+//-------------- AVX2 delta + prefix sum (scan) / xor encode/decode ---------------------------------------------------------------------------------------
+ #ifdef __AVX2__
static ALWAYS_INLINE __m256i mm256_delta_epi32(__m256i v, __m256i sv) { return _mm256_sub_epi32(v, _mm256_alignr_epi8(v, _mm256_permute2f128_si256(sv, v, _MM_SHUFFLE(0, 2, 0, 1)), 12)); }
static ALWAYS_INLINE __m256i mm256_delta_epi64(__m256i v, __m256i sv) { return _mm256_sub_epi64(v, _mm256_alignr_epi8(v, _mm256_permute2f128_si256(sv, v, _MM_SHUFFLE(0, 2, 0, 1)), 8)); }
-
static ALWAYS_INLINE __m256i mm256_xore_epi32( __m256i v, __m256i sv) { return _mm256_xor_si256(v, _mm256_alignr_epi8(v, _mm256_permute2f128_si256(sv, v, _MM_SHUFFLE(0, 2, 0, 1)), 12)); }
static ALWAYS_INLINE __m256i mm256_xore_epi64( __m256i v, __m256i sv) { return _mm256_xor_si256(v, _mm256_alignr_epi8(v, _mm256_permute2f128_si256(sv, v, _MM_SHUFFLE(0, 2, 0, 1)), 8)); }
-#define MM256_HDEC_EPI32(_v_,_sv_,_ho_) {\
- _v_ = _ho_(_v_, _mm256_slli_si256(_v_, 4));\
- _v_ = _ho_(_v_, _mm256_slli_si256(_v_, 8));\
- return _ho_( _mm256_permute2x128_si256( _mm256_shuffle_epi32(_sv_,_MM_SHUFFLE(3, 3, 3, 3)), _sv_, 0x11),\
- _ho_(_v_, _mm256_permute2x128_si256(_mm256_setzero_si256(),_mm256_shuffle_epi32(_v_, _MM_SHUFFLE(3, 3, 3, 3)), 0x20)));\
+static ALWAYS_INLINE __m256i mm256_scan_epi32(__m256i v, __m256i sv) {
+ v = _mm256_add_epi32(v, _mm256_slli_si256(v, 4));
+ v = _mm256_add_epi32(v, _mm256_slli_si256(v, 8));
+ return _mm256_add_epi32( _mm256_permute2x128_si256( _mm256_shuffle_epi32(sv,_MM_SHUFFLE(3, 3, 3, 3)), sv, 0x11),
+ _mm256_add_epi32(v, _mm256_permute2x128_si256(_mm256_setzero_si256(),_mm256_shuffle_epi32(v, _MM_SHUFFLE(3, 3, 3, 3)), 0x20)));
}
-static ALWAYS_INLINE __m256i mm256_scan_epi32(__m256i v, __m256i sv) { MM256_HDEC_EPI32(v,sv,_mm256_add_epi32); }
-static ALWAYS_INLINE __m256i mm256_xord_epi32(__m256i v, __m256i sv) { MM256_HDEC_EPI32(v,sv,_mm256_xor_si256); }
-
-#define MM256_HDEC_EPI64(_v_,_sv_,_ho_) {\
- _v_ = _ho_(_v_, _mm256_alignr_epi8(_v_, _mm256_permute2x128_si256(_v_, _v_, _MM_SHUFFLE(0, 0, 2, 0)), 8));\
- return _ho_(_mm256_permute4x64_epi64(_sv_, _MM_SHUFFLE(3, 3, 3, 3)), _ho_(_mm256_permute2x128_si256(_v_, _v_, _MM_SHUFFLE(0, 0, 2, 0)), _v_) );\
+static ALWAYS_INLINE __m256i mm256_xord_epi32(__m256i v, __m256i sv) {
+ v = _mm256_xor_si256(v, _mm256_slli_si256(v, 4));
+ v = _mm256_xor_si256(v, _mm256_slli_si256(v, 8));
+ return _mm256_xor_si256( _mm256_permute2x128_si256( _mm256_shuffle_epi32(sv,_MM_SHUFFLE(3, 3, 3, 3)), sv, 0x11),
+ _mm256_xor_si256(v, _mm256_permute2x128_si256(_mm256_setzero_si256(),_mm256_shuffle_epi32(v, _MM_SHUFFLE(3, 3, 3, 3)), 0x20)));
}
-static ALWAYS_INLINE __m256i mm256_scan_epi64(__m256i v, __m256i sv) { MM256_HDEC_EPI64(v,sv,_mm256_add_epi64); }
-static ALWAYS_INLINE __m256i mm256_xord_epi64(__m256i v, __m256i sv) { MM256_HDEC_EPI64(v,sv,_mm256_xor_si256); }
-
-static ALWAYS_INLINE __m256i mm256_scani_epi32(__m256i v, __m256i sv, __m256i vi) { return _mm256_add_epi32(mm256_scan_epi32(v, sv), vi); }
-//-- Horizontal OR ---------------------------------------
-static ALWAYS_INLINE unsigned mm256_hor_epi32(__m256i v) {
- v = _mm256_or_si256(v, _mm256_srli_si256(v, 8));
- v = _mm256_or_si256(v, _mm256_srli_si256(v, 4));
- return _mm256_extract_epi32(v,0) | _mm256_extract_epi32(v, 4);
+static ALWAYS_INLINE __m256i mm256_scan_epi64(__m256i v, __m256i sv) {
+ v = _mm256_add_epi64(v, _mm256_alignr_epi8(v, _mm256_permute2x128_si256(v, v, _MM_SHUFFLE(0, 0, 2, 0)), 8));
+ return _mm256_add_epi64(_mm256_permute4x64_epi64(sv, _MM_SHUFFLE(3, 3, 3, 3)), _mm256_add_epi64(_mm256_permute2x128_si256(v, v, _MM_SHUFFLE(0, 0, 2, 0)), v) );
}
-
-static ALWAYS_INLINE uint64_t mm256_hor_epi64(__m256i v) {
- v = _mm256_or_si256(v, _mm256_permute2x128_si256(v, v, _MM_SHUFFLE(2, 0, 0, 1)));
- return _mm256_extract_epi64(v, 1) | _mm256_extract_epi64(v,0);
+static ALWAYS_INLINE __m256i mm256_xord_epi64(__m256i v, __m256i sv) {
+ v = _mm256_xor_si256(v, _mm256_alignr_epi8(v, _mm256_permute2x128_si256(v, v, _MM_SHUFFLE(0, 0, 2, 0)), 8));
+ return _mm256_xor_si256(_mm256_permute4x64_epi64(sv, _MM_SHUFFLE(3, 3, 3, 3)), _mm256_xor_si256(_mm256_permute2x128_si256(v, v, _MM_SHUFFLE(0, 0, 2, 0)), v) );
}
- #endif
-
- #if defined(__SSSE3__) || defined(__ARM_NEON)
-#define mm_srai_epi64_63(_v_, _s_) _mm_srai_epi32(_mm_shuffle_epi32(_v_, _MM_SHUFFLE(3, 3, 1, 1)), 31)
-
-static ALWAYS_INLINE __m128i mm_zzage_epi16(__m128i v) { return _mm_xor_si128( mm_slli_epi16(v,1), mm_srai_epi16( v,15)); }
-static ALWAYS_INLINE __m128i mm_zzage_epi32(__m128i v) { return _mm_xor_si128( mm_slli_epi32(v,1), mm_srai_epi32( v,31)); }
-static ALWAYS_INLINE __m128i mm_zzage_epi64(__m128i v) { return _mm_xor_si128( mm_slli_epi64(v,1), mm_srai_epi64_63(v,63)); }
-static ALWAYS_INLINE __m128i mm_zzagd_epi16(__m128i v) { return _mm_xor_si128( mm_srli_epi16(v,1), mm_srai_epi16( mm_slli_epi16(v,15),15)); }
-static ALWAYS_INLINE __m128i mm_zzagd_epi32(__m128i v) { return _mm_xor_si128( mm_srli_epi32(v,1), mm_srai_epi32( mm_slli_epi32(v,31),31)); }
-static ALWAYS_INLINE __m128i mm_zzagd_epi64(__m128i v) { return _mm_xor_si128( mm_srli_epi64(v,1), mm_srai_epi64_63( mm_slli_epi64(v,63),63)); }
+static ALWAYS_INLINE __m256i mm256_scani_epi32(__m256i v, __m256i sv, __m256i vi) { return _mm256_add_epi32(mm256_scan_epi32(v, sv), vi); }
+ #endif
+ #if defined(__SSSE3__) || defined(__ARM_NEON)
static ALWAYS_INLINE __m128i mm_delta_epi16(__m128i v, __m128i sv) { return _mm_sub_epi16(v, _mm_alignr_epi8(v, sv, 14)); }
static ALWAYS_INLINE __m128i mm_delta_epi32(__m128i v, __m128i sv) { return _mm_sub_epi32(v, _mm_alignr_epi8(v, sv, 12)); }
-static ALWAYS_INLINE __m128i mm_delta_epi64(__m128i v, __m128i sv) { return _mm_sub_epi64(v, _mm_alignr_epi8(v, sv, 8)); }
-
static ALWAYS_INLINE __m128i mm_xore_epi16( __m128i v, __m128i sv) { return _mm_xor_si128(v, _mm_alignr_epi8(v, sv, 14)); }
static ALWAYS_INLINE __m128i mm_xore_epi32( __m128i v, __m128i sv) { return _mm_xor_si128(v, _mm_alignr_epi8(v, sv, 12)); }
-static ALWAYS_INLINE __m128i mm_xore_epi64( __m128i v, __m128i sv) { return _mm_xor_si128(v, _mm_alignr_epi8(v, sv, 8)); }
-#define MM_HDEC_EPI32(_v_,_sv_,_ho_) { \
- _v_ = _ho_(_v_, _mm_slli_si128(_v_, 4)); \
- _v_ = _ho_(mm_shuffle_nnnn_epi32(_sv_, 3), _ho_(_mm_slli_si128(_v_, 8), _v_));\
-}
+#define MM_HDEC_EPI32(_v_,_sv_,_hop_) { _v_ = _hop_(_v_, _mm_slli_si128(_v_, 4)); _v_ = _hop_(mm_shuffle_nnnn_epi32(_sv_, 3), _hop_(_mm_slli_si128(_v_, 8), _v_)); }
static ALWAYS_INLINE __m128i mm_scan_epi32(__m128i v, __m128i sv) { MM_HDEC_EPI32(v,sv,_mm_add_epi32); return v; }
static ALWAYS_INLINE __m128i mm_xord_epi32(__m128i v, __m128i sv) { MM_HDEC_EPI32(v,sv,_mm_xor_si128); return v; }
-#define MM_HDEC_EPI64(_v_,_sv_,_ho_) { \
- _v_ = _ho_(_v_, _mm_slli_si128(_v_, 8)); \
- _v_ = _ho_(_mm_shuffle_epi8(_sv_, _mm_set_epi8(15,14,13,12,11,10,9,8, 15,14,13,12,11,10,9,8)), _v_);\
+#define MM_HDEC_EPI16(_v_,_sv_,_hop_) {\
+ _v_ = _hop_( _v_, _mm_slli_si128(_v_, 2));\
+ _v_ = _hop_( _v_, _mm_slli_si128(_v_, 4));\
+ _v_ = _hop_(_hop_(_v_, _mm_slli_si128(_v_, 8)), _mm_shuffle_epi8(_sv_, _mm_set1_epi16(0x0f0e)));\
}
-static ALWAYS_INLINE __m128i mm_scan_epi64(__m128i v, __m128i sv) { MM_HDEC_EPI64(v,sv,_mm_add_epi64); return v; }
-static ALWAYS_INLINE __m128i mm_xord_epi64(__m128i v, __m128i sv) { MM_HDEC_EPI64(v,sv,_mm_xor_si128); return v; }
-#define MM_HDEC_EPI16(_v_,_sv_,_ho_) {\
- _v_ = _ho_( _v_, _mm_slli_si128(_v_, 2));\
- _v_ = _ho_( _v_, _mm_slli_si128(_v_, 4));\
- _v_ = _ho_(_ho_(_v_, _mm_slli_si128(_v_, 8)), _mm_shuffle_epi8(_sv_, _mm_set1_epi16(0x0f0e)));\
-}
static ALWAYS_INLINE __m128i mm_scan_epi16(__m128i v, __m128i sv) { MM_HDEC_EPI16(v,sv,_mm_add_epi16); return v; }
static ALWAYS_INLINE __m128i mm_xord_epi16(__m128i v, __m128i sv) { MM_HDEC_EPI16(v,sv,_mm_xor_si128); return v; }
-
-#define MM_HDEC_EPI8(_v_,_sv_,_ho_) {\
- _v_ = _ho_( _v_, _mm_slli_si128(_v_, 1));\
- _v_ = _ho_( _v_, _mm_slli_si128(_v_, 2));\
- _v_ = _ho_( _v_, _mm_slli_si128(_v_, 4));\
- _v_ = _ho_(_ho_(_v_, _mm_slli_si128(_v_, 8)), _mm_shuffle_epi8(_sv_, _mm_set1_epi8(0xfe)));/*TODO: test*/\
-}
-static ALWAYS_INLINE __m128i mm_scan_epi8(__m128i v, __m128i sv) { MM_HDEC_EPI8(v,sv,_mm_add_epi8); return v; }
-static ALWAYS_INLINE __m128i mm_xord_epi8(__m128i v, __m128i sv) { MM_HDEC_EPI8(v,sv,_mm_xor_si128); return v; }
-
//-------- scan with vi delta > 0 -----------------------------
static ALWAYS_INLINE __m128i mm_scani_epi16(__m128i v, __m128i sv, __m128i vi) { return _mm_add_epi16(mm_scan_epi16(v, sv), vi); }
static ALWAYS_INLINE __m128i mm_scani_epi32(__m128i v, __m128i sv, __m128i vi) { return _mm_add_epi32(mm_scan_epi32(v, sv), vi); }
-#define MM_HOZ_EPI16(v,_ho_) {\
- v = _ho_(v, _mm_srli_si128(v, 8));\
- v = _ho_(v, _mm_srli_si128(v, 6));\
- v = _ho_(v, _mm_srli_si128(v, 4));\
- v = _ho_(v, _mm_srli_si128(v, 2));\
+ #elif defined(__SSE2__)
+static ALWAYS_INLINE __m128i mm_delta_epi16(__m128i v, __m128i sv) { return _mm_sub_epi16(v, _mm_or_si128(_mm_srli_si128(sv, 14), _mm_slli_si128(v, 2))); }
+static ALWAYS_INLINE __m128i mm_xore_epi16( __m128i v, __m128i sv) { return _mm_xor_si128(v, _mm_or_si128(_mm_srli_si128(sv, 14), _mm_slli_si128(v, 2))); }
+static ALWAYS_INLINE __m128i mm_delta_epi32(__m128i v, __m128i sv) { return _mm_sub_epi32(v, _mm_or_si128(_mm_srli_si128(sv, 12), _mm_slli_si128(v, 4))); }
+static ALWAYS_INLINE __m128i mm_xore_epi32( __m128i v, __m128i sv) { return _mm_xor_si128(v, _mm_or_si128(_mm_srli_si128(sv, 12), _mm_slli_si128(v, 4))); }
+ #endif
+
+#if !defined(_M_X64) && !defined(__x86_64__) && defined(__AVX__)
+#define _mm256_extract_epi64(v, index) ((__int64)((uint64_t)(uint32_t)_mm256_extract_epi32((v), (index) * 2) | (((uint64_t)(uint32_t)_mm256_extract_epi32((v), (index) * 2 + 1)) << 32)))
+#endif
+
+//------------------ Horizontal OR -----------------------------------------------
+ #ifdef __AVX2__
+static ALWAYS_INLINE unsigned mm256_hor_epi32(__m256i v) {
+ v = _mm256_or_si256(v, _mm256_srli_si256(v, 8));
+ v = _mm256_or_si256(v, _mm256_srli_si256(v, 4));
+ return _mm256_extract_epi32(v,0) | _mm256_extract_epi32(v, 4);
+}
+
+static ALWAYS_INLINE uint64_t mm256_hor_epi64(__m256i v) {
+ v = _mm256_or_si256(v, _mm256_permute2x128_si256(v, v, _MM_SHUFFLE(2, 0, 0, 1)));
+ return _mm256_extract_epi64(v, 1) | _mm256_extract_epi64(v,0);
+}
+ #endif
+
+ #if defined(__SSE2__) || defined(__ARM_NEON)
+#define MM_HOZ_EPI16(v,_hop_) {\
+ v = _hop_(v, _mm_srli_si128(v, 8));\
+ v = _hop_(v, _mm_srli_si128(v, 6));\
+ v = _hop_(v, _mm_srli_si128(v, 4));\
+ v = _hop_(v, _mm_srli_si128(v, 2));\
}
-#define MM_HOZ_EPI32(v,_ho_) {\
- v = _ho_(v, _mm_srli_si128(v, 8));\
- v = _ho_(v, _mm_srli_si128(v, 4));\
+#define MM_HOZ_EPI32(v,_hop_) {\
+ v = _hop_(v, _mm_srli_si128(v, 8));\
+ v = _hop_(v, _mm_srli_si128(v, 4));\
}
static ALWAYS_INLINE uint16_t mm_hor_epi16( __m128i v) { MM_HOZ_EPI16(v,_mm_or_si128); return (unsigned short)_mm_cvtsi128_si32(v); }
static ALWAYS_INLINE uint32_t mm_hor_epi32( __m128i v) { MM_HOZ_EPI32(v,_mm_or_si128); return (unsigned )_mm_cvtsi128_si32(v); }
static ALWAYS_INLINE uint64_t mm_hor_epi64( __m128i v) { v = _mm_or_si128( v, _mm_srli_si128(v, 8)); return (uint64_t )_mm_cvtsi128_si64(v); }
-
+ #endif
+
//----------------- sub / add ----------------------------------------------------------
+ #if defined(__SSE2__) || defined(__ARM_NEON)
#define SUBI16x8(_v_, _sv_) _mm_sub_epi16(_v_, _sv_)
#define SUBI32x4(_v_, _sv_) _mm_sub_epi32(_v_, _sv_)
#define ADDI16x8(_v_, _sv_, _vi_) _sv_ = _mm_add_epi16(_mm_add_epi16(_sv_, _vi_),_v_)
#define ADDI32x4(_v_, _sv_, _vi_) _sv_ = _mm_add_epi32(_mm_add_epi32(_sv_, _vi_),_v_)
-//---------------- Convert _mm_cvtsi128_siXX -------------------------------------------
-static ALWAYS_INLINE uint8_t mm_cvtsi128_si8 (__m128i v) { return (uint8_t )_mm_cvtsi128_si32(v); }
-static ALWAYS_INLINE uint16_t mm_cvtsi128_si16(__m128i v) { return (uint16_t)_mm_cvtsi128_si32(v); }
-#define mm_cvtsi128_si32(_v_) _mm_cvtsi128_si32(_v_)
-
- #elif defined(__SSE2__)
-static ALWAYS_INLINE __m128i mm_delta_epi16(__m128i v, __m128i sv) { return _mm_sub_epi16(v, _mm_or_si128(_mm_srli_si128(sv, 14), _mm_slli_si128(v, 2))); }
-static ALWAYS_INLINE __m128i mm_xore_epi16( __m128i v, __m128i sv) { return _mm_xor_si128(v, _mm_or_si128(_mm_srli_si128(sv, 14), _mm_slli_si128(v, 2))); }
-static ALWAYS_INLINE __m128i mm_delta_epi32(__m128i v, __m128i sv) { return _mm_sub_epi32(v, _mm_or_si128(_mm_srli_si128(sv, 12), _mm_slli_si128(v, 4))); }
-static ALWAYS_INLINE __m128i mm_xore_epi32( __m128i v, __m128i sv) { return _mm_xor_si128(v, _mm_or_si128(_mm_srli_si128(sv, 12), _mm_slli_si128(v, 4))); }
+//---------------- Convert mm_cvtsi128_siXX -------------------------------------------
+static ALWAYS_INLINE uint8_t mm_cvtsi128_si8 (__m128i v) { return (uint8_t )_mm_cvtsi128_si32(v); }
+static ALWAYS_INLINE uint16_t mm_cvtsi128_si16(__m128i v) { return (uint16_t)_mm_cvtsi128_si32(v); }
#endif
//--------- memset -----------------------------------------
@@ -251,7 +243,6 @@ static ALWAYS_INLINE __m128i mm_xore_epi32( __m128i v, __m128i sv) { return _mm_
#define BITFORZERO32(_out_, _n_, _start_, _mindelta_) BITFORSET_(_out_, _n_, _start_, _mindelta_)
#define BITZERO32( _out_, _n_, _start_) BITFORSET_(_out_, _n_, _start_, 0)
#endif
-#define BITZERO16( _out_, _n_, _start_) BITFORSET_(_out_, _n_, _start_, 0)
#define DELTR( _in_, _n_, _start_, _mindelta_, _out_) { unsigned _v; for( _v = 0; _v < _n_; _v++) _out_[_v] = _in_[_v] - (_start_) - _v*(_mindelta_) - (_mindelta_); }
#define DELTRB(_in_, _n_, _start_, _mindelta_, _b_, _out_) { unsigned _v; for(_b_=0,_v = 0; _v < _n_; _v++) _out_[_v] = _in_[_v] - (_start_) - _v*(_mindelta_) - (_mindelta_), _b_ |= _out_[_v]; _b_ = bsr32(_b_); }
@@ -348,7 +339,28 @@ static ALWAYS_INLINE __m256i mm256_rbit_epi64(__m256i v) { return mm256_rbit_epi
static ALWAYS_INLINE __m256i mm256_rbit_si128(__m256i v) { return mm256_rbit_epi8(mm256_rev_si128(v)); }
#endif
-// ------------------ bitio general macros ---------------------------
+// ------------------ bitio general macros ---------------------------
+ #ifdef __AVX2__
+ #if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
+#include <intrin.h>
+ #else
+#include <x86intrin.h>
+ #endif
+#define bzhi_u32(_u_, _b_) _bzhi_u32(_u_, _b_)
+
+ #if !(defined(_M_X64) || defined(__amd64__)) && (defined(__i386__) || defined(_M_IX86))
+#define bzhi_u64(_u_, _b_) ((_u_) & ((1ull<<(_b_))-1))
+ #else
+#define bzhi_u64(_u_, _b_) _bzhi_u64(_u_, _b_)
+ #endif
+ #else
+#define bzhi_u64(_u_, _b_) ((_u_) & ((1ull<<(_b_))-1))
+#define bzhi_u32(_u_, _b_) ((_u_) & ((1u <<(_b_))-1))
+ #endif
+
+#define BZHI64(_u_, _b_) (_b_ == 64?0xffffffffffffffffull:((_u_) & ((1ull<<(_b_))-1)))
+#define BZHI32(_u_, _b_) (_b_ == 32? 0xffffffffu :((_u_) & ((1u <<(_b_))-1)))
+
#define bitdef( _bw_,_br_) uint64_t _bw_=0; unsigned _br_=0
#define bitini( _bw_,_br_) _bw_=_br_=0
//-- bitput ---------
@@ -367,9 +379,9 @@ static ALWAYS_INLINE __m256i mm256_rbit_si128(__m256i v) { return mm256_rbit_epi
#define BITPEEK64( _bw_,_br_,_nb_) BZHI64(bitbw(_bw_,_br_), _nb_)
#define BITGET64( _bw_,_br_,_nb_,_x_) _x_ = BITPEEK64(_bw_, _br_, _nb_), bitrmv(_bw_, _br_, _nb_)
-#define bitpeek57( _bw_,_br_,_nb_) bzhi64(bitbw(_bw_,_br_), _nb_)
+#define bitpeek57( _bw_,_br_,_nb_) bzhi_u64(bitbw(_bw_,_br_), _nb_)
#define bitget57( _bw_,_br_,_nb_,_x_) _x_ = bitpeek57(_bw_, _br_, _nb_), bitrmv(_bw_, _br_, _nb_)
-#define bitpeek31( _bw_,_br_,_nb_) bzhi32(bitbw(_bw_,_br_), _nb_)
+#define bitpeek31( _bw_,_br_,_nb_) bzhi_u32(bitbw(_bw_,_br_), _nb_)
#define bitget31( _bw_,_br_,_nb_,_x_) _x_ = bitpeek31(_bw_, _br_, _nb_), bitrmv(_bw_, _br_, _nb_)
//------------------ templates -----------------------------------
#define bitput8( _bw_,_br_,_b_,_x_,_op_) bitput(_bw_,_br_,_b_,_x_)
@@ -381,11 +393,155 @@ static ALWAYS_INLINE __m256i mm256_rbit_si128(__m256i v) { return mm256_rbit_epi
#define bitget16(_bw_,_br_,_b_,_x_,_ip_) bitget31(_bw_,_br_,_b_,_x_)
#define bitget32(_bw_,_br_,_b_,_x_,_ip_) bitget57(_bw_,_br_,_b_,_x_)
#define bitget64(_bw_,_br_,_b_,_x_,_ip_) if((_b_)>45) { unsigned _v; bitget57(_bw_,_br_,(_b_)-32,_x_); bitdnorm(_bw_,_br_,_ip_); BITGET64(_bw_,_br_,32,_v); _x_ = _x_<<32|_v; } else bitget57(_bw_,_br_,_b_,_x_)
+#endif
+
+//---------- max. bit length + transform for sorted/unsorted arrays, delta,delta 1, delta > 1, zigzag, zigzag of delta, xor, FOR,----------------
+#ifdef __cplusplus
+extern "C" {
+#endif
+//------ ORed array, used to determine the maximum bit length of the elements in an unsorted integer array ---------------------
+uint8_t bit8( uint8_t *in, unsigned n, uint8_t *px);
+uint16_t bit16(uint16_t *in, unsigned n, uint16_t *px);
+uint32_t bit32(uint32_t *in, unsigned n, uint32_t *px);
+uint64_t bit64(uint64_t *in, unsigned n, uint64_t *px);
+
+//-------------- delta = 0: Sorted integer array w/ mindelta = 0 ----------------------------------------------
+//-- ORed array, maximum bit length of the non decreasing integer array. out[i] = in[i] - in[i-1]
+uint8_t bitd8( uint8_t *in, unsigned n, uint8_t *px, uint8_t start);
+uint16_t bitd16(uint16_t *in, unsigned n, uint16_t *px, uint16_t start);
+uint32_t bitd32(uint32_t *in, unsigned n, uint32_t *px, uint32_t start);
+uint64_t bitd64(uint64_t *in, unsigned n, uint64_t *px, uint64_t start);
+
+//-- in-place reverse delta 0
+void bitddec8( uint8_t *p, unsigned n, uint8_t start); // non decreasing (out[i] = in[i] - in[i-1])
+void bitddec16( uint16_t *p, unsigned n, uint16_t start);
+void bitddec32( uint32_t *p, unsigned n, uint32_t start);
+void bitddec64( uint64_t *p, unsigned n, uint64_t start);
+
+//-- vectorized fast delta4 one: out[0] = in[4]-in[0], out[1]=in[5]-in[1], out[2]=in[6]-in[2], out[3]=in[7]-in[3],...
+uint16_t bits128v16( uint16_t *in, unsigned n, uint16_t *px, uint16_t start);
+uint32_t bits128v32( uint32_t *in, unsigned n, uint32_t *px, uint32_t start);
+
+//------------- delta = 1: Sorted integer array w/ mindelta = 1 ---------------------------------------------
+//-- get delta maximum bit length of the strictly increasing integer array. out[i] = in[i] - in[i-1] - 1
+uint8_t bitd18( uint8_t *in, unsigned n, uint8_t *px, uint8_t start);
+uint16_t bitd116(uint16_t *in, unsigned n, uint16_t *px, uint16_t start);
+uint32_t bitd132(uint32_t *in, unsigned n, uint32_t *px, uint32_t start);
+uint64_t bitd164(uint64_t *in, unsigned n, uint64_t *px, uint64_t start);
+
+//-- in-place reverse delta one
+void bitd1dec8( uint8_t *p, unsigned n, uint8_t start); // strictly increasing (encoded as out[i] = in[i] - in[i-1] - 1)
+void bitd1dec16( uint16_t *p, unsigned n, uint16_t start);
+void bitd1dec32( uint32_t *p, unsigned n, uint32_t start);
+void bitd1dec64( uint64_t *p, unsigned n, uint64_t start);
+
+//------------- delta > 1: Sorted integer array w/ mindelta > 1 ---------------------------------------------
+//-- ORed array, for max. bit length get min. delta ()
+uint8_t bitdi8( uint8_t *in, unsigned n, uint8_t *px, uint8_t start);
+uint16_t bitdi16( uint16_t *in, unsigned n, uint16_t *px, uint16_t start);
+uint32_t bitdi32( uint32_t *in, unsigned n, uint32_t *px, uint32_t start);
+uint64_t bitdi64( uint64_t *in, unsigned n, uint64_t *px, uint64_t start);
+//-- transform sorted integer array to delta array: out[i] = in[i] - in[i-1] - mindelta
+uint8_t bitdienc8( uint8_t *in, unsigned n, uint8_t *out, uint8_t start, uint8_t mindelta);
+uint16_t bitdienc16(uint16_t *in, unsigned n, uint16_t *out, uint16_t start, uint16_t mindelta);
+uint32_t bitdienc32(uint32_t *in, unsigned n, uint32_t *out, uint32_t start, uint32_t mindelta);
+uint64_t bitdienc64(uint64_t *in, unsigned n, uint64_t *out, uint64_t start, uint64_t mindelta);
+//-- in-place reverse delta
+void bitdidec8( uint8_t *in, unsigned n, uint8_t start, uint8_t mindelta);
+void bitdidec16(uint16_t *in, unsigned n, uint16_t start, uint16_t mindelta);
+void bitdidec32(uint32_t *in, unsigned n, uint32_t start, uint32_t mindelta);
+void bitdidec64(uint64_t *in, unsigned n, uint64_t start, uint64_t mindelta);
+
+//------------- FOR : array bit length: ---------------------------------------------------------------------
+//------ ORed array, for max. bit length of the non decreasing integer array. out[i] = in[i] - start
+uint8_t bitf8( uint8_t *in, unsigned n, uint8_t *px, uint8_t start);
+uint16_t bitf16(uint16_t *in, unsigned n, uint16_t *px, uint16_t start);
+uint32_t bitf32(uint32_t *in, unsigned n, uint32_t *px, uint32_t start);
+uint64_t bitf64(uint64_t *in, unsigned n, uint64_t *px, uint64_t start);
+
+//------ ORed array, for max. bit length of the strictly increasing integer array: out[i] = in[i] - 1 - start
+uint8_t bitf18( uint8_t *in, unsigned n, uint8_t *px, uint8_t start);
+uint16_t bitf116(uint16_t *in, unsigned n, uint16_t *px, uint16_t start);
+uint32_t bitf132(uint32_t *in, unsigned n, uint32_t *px, uint32_t start);
+uint64_t bitf164(uint64_t *in, unsigned n, uint64_t *px, uint64_t start);
+
+//------ ORed array, for max. bit length for unsorted array
+uint8_t bitfm8( uint8_t *in, unsigned n, uint8_t *px, uint8_t *pmin); // unsorted
+uint16_t bitfm16(uint16_t *in, unsigned n, uint16_t *px, uint16_t *pmin);
+uint32_t bitfm32(uint32_t *in, unsigned n, uint32_t *px, uint32_t *pmin);
+uint64_t bitfm64(uint64_t *in, unsigned n, uint64_t *px, uint64_t *pmin);
+
+//------------- Zigzag encoding for unsorted integer lists: out[i] = in[i] - in[i-1] ------------------------
+//-- ORed array, to get maximum zigzag bit length integer array
+uint8_t bitz8( uint8_t *in, unsigned n, uint8_t *px, uint8_t start);
+uint16_t bitz16( uint16_t *in, unsigned n, uint16_t *px, uint16_t start);
+uint32_t bitz32( uint32_t *in, unsigned n, uint32_t *px, uint32_t start);
+uint64_t bitz64( uint64_t *in, unsigned n, uint64_t *px, uint64_t start);
+//-- Zigzag transform
+uint8_t bitzenc8( uint8_t *in, unsigned n, uint8_t *out, uint8_t start, uint8_t mindelta);
+uint16_t bitzenc16(uint16_t *in, unsigned n, uint16_t *out, uint16_t start, uint16_t mindelta);
+uint32_t bitzenc32(uint32_t *in, unsigned n, uint32_t *out, uint32_t start, uint32_t mindelta);
+uint64_t bitzenc64(uint64_t *in, unsigned n, uint64_t *out, uint64_t start, uint64_t mindelta);
+//-- in-place zigzag reverse transform
+void bitzdec8( uint8_t *in, unsigned n, uint8_t start);
+void bitzdec16( uint16_t *in, unsigned n, uint16_t start);
+void bitzdec32( uint32_t *in, unsigned n, uint32_t start);
+void bitzdec64( uint64_t *in, unsigned n, uint64_t start);
+
+//------------- Zigzag of zigzag/delta : unsorted/sorted integer array ----------------------------------------------------
+//-- get delta maximum bit length of the non strictly decreasing integer array. out[i] = in[i] - in[i-1] - 1
+uint8_t bitzz8( uint8_t *in, unsigned n, uint8_t *px, uint8_t start);
+uint16_t bitzz16( uint16_t *in, unsigned n, uint16_t *px, uint16_t start);
+uint32_t bitzz32( uint32_t *in, unsigned n, uint32_t *px, uint32_t start);
+uint64_t bitzz64( uint64_t *in, unsigned n, uint64_t *px, uint64_t start);
+
+uint8_t bitzzenc8( uint8_t *in, unsigned n, uint8_t *out, uint8_t start, uint8_t mindelta);
+uint16_t bitzzenc16(uint16_t *in, unsigned n, uint16_t *out, uint16_t start, uint16_t mindelta);
+uint32_t bitzzenc32(uint32_t *in, unsigned n, uint32_t *out, uint32_t start, uint32_t mindelta);
+uint64_t bitzzenc64(uint64_t *in, unsigned n, uint64_t *out, uint64_t start, uint64_t mindelta);
+
+//-- in-place reverse zigzag of delta (encoded w/ bitdiencNN and parameter mindelta = 1)
+void bitzzdec8( uint8_t *in, unsigned n, uint8_t start); // non strictly decreasing (out[i] = in[i] - in[i-1] - 1)
+void bitzzdec16( uint16_t *in, unsigned n, uint16_t start);
+void bitzzdec32( uint32_t *in, unsigned n, uint32_t start);
+void bitzzdec64( uint64_t *in, unsigned n, uint64_t start);
+
+//------------- XOR encoding for unsorted integer lists: out[i] = in[i] ^ in[i-1] -------------
+//-- ORed array, to get maximum bit length of the XORed integer array
+uint8_t bitx8( uint8_t *in, unsigned n, uint8_t *px, uint8_t start);
+uint16_t bitx16( uint16_t *in, unsigned n, uint16_t *px, uint16_t start);
+uint32_t bitx32( uint32_t *in, unsigned n, uint32_t *px, uint32_t start);
+uint64_t bitx64( uint64_t *in, unsigned n, uint64_t *px, uint64_t start);
+
+//-- XOR transform
+uint8_t bitxenc8( uint8_t *in, unsigned n, uint8_t *out, uint8_t start);
+uint16_t bitxenc16( uint16_t *in, unsigned n, uint16_t *out, uint16_t start);
+uint32_t bitxenc32( uint32_t *in, unsigned n, uint32_t *out, uint32_t start);
+uint64_t bitxenc64( uint64_t *in, unsigned n, uint64_t *out, uint64_t start);
+
+//-- XOR in-place reverse transform
+void bitxdec8( uint8_t *p, unsigned n, uint8_t start);
+void bitxdec16( uint16_t *p, unsigned n, uint16_t start);
+void bitxdec32( uint32_t *p, unsigned n, uint32_t start);
+void bitxdec64( uint64_t *p, unsigned n, uint64_t start);
+
+//------- Lossy floating point transform: pad the trailing mantissa bits with zeros according to the error e (ex. e=0.00001)
+ #ifdef USE_FLOAT16
+void fppad16(_Float16 *in, size_t n, _Float16 *out, float e);
+ #endif
+void fppad32(float *in, size_t n, float *out, float e);
+void fppad64(double *in, size_t n, double *out, double e);
+
+#ifdef __cplusplus
+}
+#endif
//---- Floating point to Integer decomposition ---------------------------------
// seeeeeeee21098765432109876543210 (s:sign, e:exponent, 0-9:mantissa)
+ #ifdef BITUTIL_IN
#define MANTF32 23
#define MANTF64 52
#define BITFENC(_u_, _sgn_, _expo_, _mant_, _mantbits_, _one_) _sgn_ = _u_ >> (sizeof(_u_)*8-1); _expo_ = ((_u_ >> (_mantbits_)) & ( (_one_<<(sizeof(_u_)*8 - 1 - _mantbits_)) -1)); _mant_ = _u_ & ((_one_<<_mantbits_)-1);
#define BITFDEC( _sgn_, _expo_, _mant_, _u_, _mantbits_) _u_ = (_sgn_) << (sizeof(_u_)*8-1) | (_expo_) << _mantbits_ | (_mant_)
+ #endif
diff --git a/src/ext/for/include_/conf.h b/src/ext/for/conf.h
similarity index 50%
rename from src/ext/for/include_/conf.h
rename to src/ext/for/conf.h
index d04bb8cc..be6face4 100644
--- a/src/ext/for/include_/conf.h
+++ b/src/ext/for/conf.h
@@ -1,10 +1,10 @@
/**
- Copyright (C) powturbo 2016-2023
- GPL v3 License
+ Copyright (C) powturbo 2013-2019
+ GPL v2 License
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 3 of the License, or
+ the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
@@ -23,20 +23,8 @@
**/
// conf.h - config & common
-#ifndef CONF_H_
-#define CONF_H_
-#if defined(_MSC_VER) && (_MSC_VER < 1600)
- #if !defined(_STDINT) && !defined(_MSC_STDINT_H_)
-typedef unsigned char uint8_t;
-typedef unsigned short uint16_t;
-typedef unsigned int uint32_t;
-typedef unsigned long long uint64_t;
- #endif
-#else
-#include <stdint.h>
-#endif
-#include <stddef.h>
-
+#ifndef CONF_H
+#define CONF_H
//------------------------- Compiler ------------------------------------------
#if defined(__GNUC__)
#include <stdint.h>
@@ -47,40 +35,30 @@ typedef unsigned long long uint64_t;
#define likely(x) __builtin_expect((x),1)
#define unlikely(x) __builtin_expect((x),0)
-//#define bswap8(x) (x)
- #if __GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ >= 8
-#define bswap16(x) __builtin_bswap16(x)
- #else
-static ALWAYS_INLINE unsigned short bswap16(unsigned short x) { return __builtin_bswap32(x << 16); }
- #endif
-#define bswap32(x) __builtin_bswap32(x)
-#define bswap64(x) __builtin_bswap64(x)
-
#define popcnt32(_x_) __builtin_popcount(_x_)
#define popcnt64(_x_) __builtin_popcountll(_x_)
#if defined(__i386__) || defined(__x86_64__)
-//x,__bsr32: 1:0,2:1,3:1,4:2,5:2,6:2,7:2,8:3,9:3,10:3,11:3,12:3,13:3,14:3,15:3,16:4,17:4,18:4,19:4,20:4,21:4,22:4,23:4,24:4,25:4,26:4,27:4,28:4,29:4,30:4,31:4,32:5,...
-//x, bsr32: 0:0,1:1,2:2,3:2,4:3,5:3,6:3,7:3,8:4,9:4,10:4,11:4,12:4,13:4,14:4,15:4,16:5,17:5,18:5,19:5,20:5,21:5,22:5,23:5,24:5,25:5,26:5,27:5,28:5,29:5,30:5,31:5,32:6,...
-static ALWAYS_INLINE int __bsr32( int x) { asm("bsr %1,%0" : "=r" (x) : "rm" (x) ); return x; }
-static ALWAYS_INLINE int bsr32( int x) { int b = -1; asm("bsrl %1,%0" : "+r" (b) : "rm" (x) ); return b + 1; }
-static ALWAYS_INLINE int bsr64(uint64_t x ) { return x?64 - __builtin_clzll(x):0; }
-static ALWAYS_INLINE int __bsr64(uint64_t x ) { return 63 - __builtin_clzll(x); }
-
-static ALWAYS_INLINE unsigned rol32(unsigned x, int s) { asm ("roll %%cl,%0" :"=r" (x) :"0" (x),"c" (s)); return x; }
-static ALWAYS_INLINE unsigned ror32(unsigned x, int s) { asm ("rorl %%cl,%0" :"=r" (x) :"0" (x),"c" (s)); return x; }
-static ALWAYS_INLINE uint64_t rol64(uint64_t x, int s) { asm ("rolq %%cl,%0" :"=r" (x) :"0" (x),"c" (s)); return x; }
-static ALWAYS_INLINE uint64_t ror64(uint64_t x, int s) { asm ("rorq %%cl,%0" :"=r" (x) :"0" (x),"c" (s)); return x; }
+//x,__bsr32: 1:0,2:1,3:1,4:2,5:2,6:2,7:2,8:3,9:3,10:3,11:3,12:3,13:3,14:3,15:3,16:4,17:4,18:4,19:4,20:4,21:4,22:4,23:4,24:4,25:4,26:4,27:4,28:4,29:4,30:4,31:4,32:5
+// x,bsr32: 0:0,1:1,2:2,3:2,4:3,5:3,6:3,7:3,8:4,9:4,10:4,11:4,12:4,13:4,14:4,15:4,16:5,17:5,18:5,19:5,20:5,21:5,22:5,23:5,24:5,25:5,26:5,27:5,28:5,29:5,30:5,31:5,32:6,
+static inline int __bsr32( int x) { asm("bsr %1,%0" : "=r" (x) : "rm" (x) ); return x; }
+static inline int bsr32( int x) { int b = -1; asm("bsrl %1,%0" : "+r" (b) : "rm" (x) ); return b + 1; }
+static inline int bsr64(uint64_t x ) { return x?64 - __builtin_clzll(x):0; }
+static inline int __bsr64(uint64_t x ) { return 63 - __builtin_clzll(x); }
+
+static inline unsigned rol32(unsigned x, int s) { asm ("roll %%cl,%0" :"=r" (x) :"0" (x),"c" (s)); return x; }
+static inline unsigned ror32(unsigned x, int s) { asm ("rorl %%cl,%0" :"=r" (x) :"0" (x),"c" (s)); return x; }
+static inline uint64_t rol64(uint64_t x, int s) { asm ("rolq %%cl,%0" :"=r" (x) :"0" (x),"c" (s)); return x; }
+static inline uint64_t ror64(uint64_t x, int s) { asm ("rorq %%cl,%0" :"=r" (x) :"0" (x),"c" (s)); return x; }
#else
-static ALWAYS_INLINE int __bsr32(unsigned x ) { return 31 - __builtin_clz( x); }
-static ALWAYS_INLINE int bsr32(int x ) { return x?32 - __builtin_clz( x):0; }
-static ALWAYS_INLINE int bsr64(uint64_t x) { return x?64 - __builtin_clzll(x):0; }
-static ALWAYS_INLINE int __bsr64(uint64_t x ) { return 63 - __builtin_clzll(x); }
-
-static ALWAYS_INLINE unsigned rol32(unsigned x, int s) { return x << s | x >> (32 - s); }
-static ALWAYS_INLINE unsigned ror32(unsigned x, int s) { return x >> s | x << (32 - s); }
-static ALWAYS_INLINE unsigned rol64(unsigned x, int s) { return x << s | x >> (64 - s); }
-static ALWAYS_INLINE unsigned ror64(unsigned x, int s) { return x >> s | x << (64 - s); }
+static inline int __bsr32(unsigned x ) { return 31 - __builtin_clz( x); }
+static inline int bsr32(int x ) { return x?32 - __builtin_clz( x):0; }
+static inline int bsr64(uint64_t x) { return x?64 - __builtin_clzll(x):0; }
+
+static inline unsigned rol32(unsigned x, int s) { return x << s | x >> (32 - s); }
+static inline unsigned ror32(unsigned x, int s) { return x >> s | x << (32 - s); }
+static inline unsigned rol64(unsigned x, int s) { return x << s | x >> (64 - s); }
+static inline unsigned ror64(unsigned x, int s) { return x >> s | x << (64 - s); }
#endif
#define ctz64(_x_) __builtin_ctzll(_x_)
@@ -88,6 +66,15 @@ static ALWAYS_INLINE unsigned ror64(unsigned x, int s) { return x >> s | x << (6
#define clz64(_x_) __builtin_clzll(_x_)
#define clz32(_x_) __builtin_clz(_x_) // 00000000 00000000 00000000 01000000 = 25
+//#define bswap8(x) (x)
+ #if __GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ >= 8
+#define bswap16(x) __builtin_bswap16(x)
+ #else
+static inline unsigned short bswap16(unsigned short x) { return __builtin_bswap32(x << 16); }
+ #endif
+#define bswap32(x) __builtin_bswap32(x)
+#define bswap64(x) __builtin_bswap64(x)
+
#elif _MSC_VER //----------------------------------------------------
#include <windows.h>
#include <intrin.h>
@@ -107,12 +94,12 @@ static ALWAYS_INLINE unsigned ror64(unsigned x, int s) { return x >> s | x << (6
#define likely(x) (x)
#define unlikely(x) (x)
-static ALWAYS_INLINE int __bsr32(unsigned x) { unsigned long z=0; _BitScanReverse(&z, x); return z; }
-static ALWAYS_INLINE int bsr32( unsigned x) { unsigned long z; _BitScanReverse(&z, x); return x?z+1:0; }
-static ALWAYS_INLINE int ctz32( unsigned x) { unsigned long z; _BitScanForward(&z, x); return x?z:32; }
-static ALWAYS_INLINE int clz32( unsigned x) { unsigned long z; _BitScanReverse(&z, x); return x?31-z:32; }
+static inline int __bsr32(unsigned x) { unsigned long z=0; _BitScanReverse(&z, x); return z; }
+static inline int bsr32( unsigned x) { unsigned long z; _BitScanReverse(&z, x); return x?z+1:0; }
+static inline int ctz32( unsigned x) { unsigned long z; _BitScanForward(&z, x); return x?z:32; }
+static inline int clz32( unsigned x) { unsigned long z; _BitScanReverse(&z, x); return x?31-z:32; }
#if !defined(_M_ARM64) && !defined(_M_X64)
-static ALWAYS_INLINE unsigned char _BitScanForward64(unsigned long* ret, uint64_t x) {
+static inline unsigned char _BitScanForward64(unsigned long* ret, uint64_t x) {
unsigned long x0 = (unsigned long)x, top, bottom; _BitScanForward(&top, (unsigned long)(x >> 32)); _BitScanForward(&bottom, x0);
*ret = x0 ? bottom : 32 + top; return x != 0;
}
@@ -121,10 +108,9 @@ static unsigned char _BitScanReverse64(unsigned long* ret, uint64_t x) {
*ret = x1 ? top + 32 : bottom; return x != 0;
}
#endif
-static ALWAYS_INLINE int __bsr64(uint64_t x) { unsigned long z = 0; _BitScanReverse64(&z, x); return z; }
-static ALWAYS_INLINE int bsr64(uint64_t x) { unsigned long z=0; _BitScanReverse64(&z, x); return x?z+1:0; }
-static ALWAYS_INLINE int ctz64(uint64_t x) { unsigned long z; _BitScanForward64(&z, x); return x?z:64; }
-static ALWAYS_INLINE int clz64(uint64_t x) { unsigned long z; _BitScanReverse64(&z, x); return x?63-z:64; }
+static inline int bsr64(uint64_t x) { unsigned long z=0; _BitScanReverse64(&z, x); return x?z+1:0; } // reverse-scan index + 1 for non-zero x, 0 when x is 0
+static inline int ctz64(uint64_t x) { unsigned long z; _BitScanForward64(&z, x); return x?z:64; } // forward-scan index for non-zero x, 64 when x is 0
+static inline int clz64(uint64_t x) { unsigned long z; _BitScanReverse64(&z, x); return x?63-z:64; } // 63 minus the reverse-scan index, 64 when x is 0
#define rol32(x,s) _lrotl(x, s)
#define ror32(x,s) _lrotr(x, s)
@@ -140,47 +126,43 @@ static ALWAYS_INLINE int clz64(uint64_t x) { unsigned long z; _BitScanReverse6
#define popcnt64(x) (popcnt32(x) + popcnt32(x>>32))
#endif
-#define sleep(x) Sleep(x/1000)
-#define fseeko _fseeki64
-#define ftello _ftelli64
-#define strcasecmp _stricmp
-#define strncasecmp _strnicmp
-#define strtoull _strtoui64
-static ALWAYS_INLINE double round(double num) { return (num > 0.0) ? floor(num + 0.5) : ceil(num - 0.5); }
+#define sleep(x) Sleep(x/1000)
+#define fseeko _fseeki64
+#define ftello _ftelli64
+#define strcasecmp _stricmp
+#define strncasecmp _strnicmp
+#define strtoull _strtoui64
+static inline double round(double num) { return (num > 0.0) ? floor(num + 0.5) : ceil(num - 0.5); }
#endif
#define __bsr8(_x_) __bsr32(_x_)
#define __bsr16(_x_) __bsr32(_x_)
-#define bsr8(_x_) bsr32(_x_)
-#define bsr16(_x_) bsr32(_x_)
-#define ctz8(_x_) ctz32((_x_)+(1<< 8))
-#define ctz16(_x_) ctz32((_x_)+(1<<16))
-#define clz8(_x_) (clz32(_x_)-24)
-#define clz16(_x_) (clz32(_x_)-16)
+#define bsr8(_x_) bsr32(_x_)
+#define bsr16(_x_) bsr32(_x_)
+#define ctz8(_x_) ctz32((_x_)|(1u<< 8)) // sentinel bit 8 keeps a zero argument away from ctz32(0); result is 8 for 0
+#define ctz16(_x_) ctz32((_x_)|(1u<<16)) // sentinel bit 16; result is 16 for 0, unchanged for non-zero 16-bit values
+#define clz8(_x_) (clz32(_x_)-24)
+#define clz16(_x_) (clz32(_x_)-16)
-#define popcnt8(x) popcnt32(x)
-#define popcnt16(x) popcnt32(x)
+#define popcnt8(x) popcnt32(x)
+#define popcnt16(x) popcnt32(x)
//--------------- Unaligned memory access -------------------------------------
#ifdef UA_MEMCPY
#include <string.h>
-static ALWAYS_INLINE unsigned short ctou16(const void *cp) { unsigned short x; memcpy(&x, cp, sizeof(x)); return x; } // ua read
-static ALWAYS_INLINE unsigned ctou32(const void *cp) { unsigned x; memcpy(&x, cp, sizeof(x)); return x; }
-static ALWAYS_INLINE unsigned long long ctou64(const void *cp) { unsigned long long x; memcpy(&x, cp, sizeof(x)); return x; }
-static ALWAYS_INLINE size_t ctousz(const void *cp) { size_t x; memcpy(&x, cp, sizeof(x)); return x; }
-static ALWAYS_INLINE float ctof32(const void *cp) { float x; memcpy(&x, cp, sizeof(x)); return x; }
-static ALWAYS_INLINE double ctof64(const void *cp) { double x; memcpy(&x, cp, sizeof(x)); return x; }
-
-static ALWAYS_INLINE void stou16( void *cp, unsigned short x) { memcpy(cp, &x, sizeof(x)); } // ua write
-static ALWAYS_INLINE void stou32( void *cp, unsigned x) { memcpy(cp, &x, sizeof(x)); }
-static ALWAYS_INLINE void stou64( void *cp, unsigned long long x) { memcpy(cp, &x, sizeof(x)); }
-static ALWAYS_INLINE void stousz( void *cp, size_t x) { memcpy(cp, &x, sizeof(x)); }
-static ALWAYS_INLINE void stof32( void *cp, float x) { memcpy(cp, &x, sizeof(x)); }
-static ALWAYS_INLINE void stof64( void *cp, double x) { memcpy(cp, &x, sizeof(x)); }
-
-static ALWAYS_INLINE void ltou32(unsigned *x, const void *cp) { memcpy(x, cp, sizeof(*x)); } // ua read into ptr
-static ALWAYS_INLINE void ltou64(unsigned long long *x, const void *cp) { memcpy(x, cp, sizeof(*x)); }
-
+static inline unsigned short ctou16(const void *cp) { unsigned short x; memcpy(&x, cp, sizeof(x)); return x; } // read helpers: copy sizeof(x) bytes from cp into a local and return it
+static inline unsigned ctou32(const void *cp) { unsigned x; memcpy(&x, cp, sizeof(x)); return x; }
+static inline unsigned long long ctou64(const void *cp) { unsigned long long x; memcpy(&x, cp, sizeof(x)); return x; }
+static inline size_t ctousz(const void *cp) { size_t x; memcpy(&x, cp, sizeof(x)); return x; } // same, for a size_t-wide value
+static inline float ctof32(const void *cp) { float x; memcpy(&x, cp, sizeof(x)); return x; } // float/double variants copy the raw bytes, no numeric conversion
+static inline double ctof64(const void *cp) { double x; memcpy(&x, cp, sizeof(x)); return x; }
+
+static inline void stou16( void *cp, unsigned short x) { memcpy(cp, &x, sizeof(x)); } // write helpers: copy the sizeof(x) bytes of x to cp
+static inline void stou32( void *cp, unsigned x) { memcpy(cp, &x, sizeof(x)); }
+static inline void stou64( void *cp, unsigned long long x) { memcpy(cp, &x, sizeof(x)); }
+static inline void stousz( void *cp, size_t x) { memcpy(cp, &x, sizeof(x)); }
+static inline void stof32( void *cp, float x) { memcpy(cp, &x, sizeof(x)); }
+static inline void stof64( void *cp, double x) { memcpy(cp, &x, sizeof(x)); }
#elif defined(__i386__) || defined(__x86_64__) || \
defined(_M_IX86) || defined(_M_AMD64) || _MSC_VER ||\
defined(__powerpc__) || defined(__s390__) ||\
@@ -192,30 +174,14 @@ static ALWAYS_INLINE void ltou64(unsigned long long *x, const void
#define ctou32(_cp_) (*(unsigned *)(_cp_))
#define ctof32(_cp_) (*(float *)(_cp_))
-#define stou16(_cp_, _x_) (*(unsigned short *)(_cp_) = _x_)
-#define stou32(_cp_, _x_) (*(unsigned *)(_cp_) = _x_)
-#define stof32(_cp_, _x_) (*(float *)(_cp_) = _x_)
-
-#define ltou32(_px_, _cp_) *(_px_) = *(unsigned *)(_cp_)
-
#if defined(__i386__) || defined(__x86_64__) || defined(__powerpc__) || defined(__s390__) || defined(_MSC_VER)
#define ctou64(_cp_) (*(uint64_t *)(_cp_))
#define ctof64(_cp_) (*(double *)(_cp_))
-
-#define stou64(_cp_, _x_) (*(uint64_t *)(_cp_) = _x_)
-#define stof64(_cp_, _x_) (*(double *)(_cp_) = _x_)
-
-#define ltou64(_px_, _cp_) *(_px_) = *(uint64_t *)(_cp_)
-
#elif defined(__ARM_FEATURE_UNALIGNED)
struct _PACKED longu { uint64_t l; };
struct _PACKED doubleu { double d; };
#define ctou64(_cp_) ((struct longu *)(_cp_))->l
#define ctof64(_cp_) ((struct doubleu *)(_cp_))->d
-
-#define stou64(_cp_) ((struct longu *)(_cp_))->l = _x_
-#define stof64(_cp_) ((struct doubleu *)(_cp_))->d = _x_
-#define ltou64(_px_, _cp_) *(_px_) = ((struct longu *)(_cp_))->l
#endif
#elif defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7M__) || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7S__)
@@ -230,15 +196,6 @@ struct _PACKED doubleu { double d; };
#define ctou64(_cp_) ((struct longu *)(_cp_))->l
#define ctof32(_cp_) ((struct floatu *)(_cp_))->f
#define ctof64(_cp_) ((struct doubleu *)(_cp_))->d
-
-#define stou16(_cp_, _x_) ((struct shortu *)(_cp_))->s = _x_
-#define stou32(_cp_, _x_) ((struct unsignedu *)(_cp_))->u = _x_
-#define stou64(_cp_, _x_) ((struct longu *)(_cp_))->l = _x_
-#define stof32(_cp_, _x_) ((struct floatu *)(_cp_))->f = _x_
-#define stof64(_cp_, _x_) ((struct doubleu *)(_cp_))->d = _x_
-
-#define ltou32(_cp_) *(_px_) = ((struct unsignedu *)(_cp_))->u
-#define ltou64(_cp_) *(_px_) = ((struct longu *)(_cp_))->l
#else
#error "unknown cpu"
#endif
@@ -261,16 +218,12 @@ struct _PACKED doubleu { double d; };
#endif
//---------------------misc ---------------------------------------------------
-#define BZMASK64(_b_) (~(~0ull << (_b_)))
-#define BZMASK32(_b_) (~(~0u << (_b_)))
-#define BZMASK16(_b_) BZMASK32(_b_)
-#define BZMASK8( _b_) BZMASK32(_b_)
-
-#define BZHI64(_u_, _b_) ((_u_) & BZMASK64(_b_)) // b Constant
-#define BZHI32(_u_, _b_) ((_u_) & BZMASK32(_b_))
-#define BZHI16(_u_, _b_) BZHI32(_u_, _b_)
-#define BZHI8( _u_, _b_) BZHI32(_u_, _b_)
-#define BEXTR32(x,start,len) (((x) >> (start)) & ((1u << (len)) - 1)) //Bit field extract (with register)
+#define BZHI64F(_u_, _b_) ((_u_) & ((1ull<<(_b_))-1)) // _b_ < 64
+#define BZHI32F(_u_, _b_) ((_u_) & ((1u <<(_b_))-1)) // _b_ < 32
+#define BZHI64( _u_, _b_) ((_b_) == 64?(_u_):((_u_) & ((1ull<<(_b_))-1))) // Constant _b_ in 0..64: keep the low _b_ bits of _u_; 64 keeps all of _u_ and avoids the 1ull<<64 shift
+#define BZHI32( _u_, _b_) ((_b_) == 32?(_u_):((_u_) & ((1u <<(_b_))-1))) // Constant _b_ in 0..32: keep the low _b_ bits of _u_; 32 keeps all of _u_
+#define BZHI16( _u_, _b_) BZHI32(_u_, _b_)
+#define BZHI8( _u_, _b_) BZHI32(_u_, _b_)
#ifdef __AVX2__
#if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
@@ -278,31 +231,26 @@ struct _PACKED doubleu { double d; };
#else
#include <x86intrin.h>
#endif
-#define bzhi32(_u_, _b_) _bzhi_u32(_u_, _b_) // b variable
-#define bextr32(x,start,len) _bextr_u32(x,start,len)
+#define bzhi32(_u_, _b_) _bzhi_u32(_u_, _b_)
#if !(defined(_M_X64) || defined(__amd64__)) && (defined(__i386__) || defined(_M_IX86))
-#define bzhi64(_u_, _b_) BZHI64(_u_, _b_)
+#define bzhi64(_u_, _b_) ((_u_) & ((1ull<<(_b_))-1))
#else
#define bzhi64(_u_, _b_) _bzhi_u64(_u_, _b_)
#endif
#else
-#define bzhi64(_u_, _b_) BZHI64(_u_, _b_)
-#define bzhi32(_u_, _b_) BZHI32(_u_, _b_)
-#define bextr32(x,start,len) (((x) >> (start)) & ((1u << (len)) - 1)) //Bit field extract (with register)
+#define bzhi_u64(_u_, _b_) BZHI64(_u_, _b_)
+#define bzhi_u32(_u_, _b_) BZHI32(_u_, _b_)
#endif
-#define bzhi16(_u_, _b_) bzhi32(_u_, _b_)
-#define bzhi8( _u_, _b_) bzhi32(_u_, _b_)
-
#define SIZE_ROUNDUP(_n_, _a_) (((size_t)(_n_) + (size_t)((_a_) - 1)) & ~(size_t)((_a_) - 1))
#define ALIGN_DOWN(__ptr, __a) ((void *)((uintptr_t)(__ptr) & ~(uintptr_t)((__a) - 1)))
-#define T2_(_x_, _y_) _x_##_y_
-#define T2(_x_, _y_) T2_(_x_,_y_)
+#define TEMPLATE2_(_x_, _y_) _x_##_y_
+#define TEMPLATE2(_x_, _y_) TEMPLATE2_(_x_,_y_)
-#define T3_(_x_,_y_,_z_) _x_##_y_##_z_
-#define T3(_x_,_y_,_z_) T3_(_x_, _y_, _z_)
+#define TEMPLATE3_(_x_,_y_,_z_) _x_##_y_##_z_
+#define TEMPLATE3(_x_,_y_,_z_) TEMPLATE3_(_x_, _y_, _z_)
#define CACHE_LINE_SIZE 64
#define PREFETCH_DISTANCE (CACHE_LINE_SIZE*4)
@@ -314,21 +262,21 @@ struct _PACKED doubleu { double d; };
#ifdef _MSC_VER
#ifdef NDEBUG
#define AS(expr, fmt, ...)
-#define AC(expr, fmt, ...) do { if(!(expr)) { fprintf(stderr, fmt, ##__VA_ARGS__ ); fflush(stderr); exit(-1); } } while(0)
+#define AC(expr, fmt, ...) do { if(!(expr)) { fprintf(stderr, fmt, ##__VA_ARGS__ ); fflush(stderr); abort(); } } while(0)
#define die(fmt, ...) do { fprintf(stderr, fmt, ##__VA_ARGS__ ); fflush(stderr); exit(-1); } while(0)
#else
-#define AS(expr, fmt, ...) do { if(!(expr)) { fflush(stdout);fprintf(stderr, "%s:%s:%d:", __FILE__, __FUNCTION__, __LINE__); fprintf(stderr, fmt, ##__VA_ARGS__ ); fflush(stderr); exit(-1); } } while(0)
-#define AC(expr, fmt, ...) do { if(!(expr)) { fflush(stdout);fprintf(stderr, "%s:%s:%d:", __FILE__, __FUNCTION__, __LINE__); fprintf(stderr, fmt, ##__VA_ARGS__ ); fflush(stderr); exit(-1); } } while(0)
+#define AS(expr, fmt, ...) do { if(!(expr)) { fflush(stdout);fprintf(stderr, "%s:%s:%d:", __FILE__, __FUNCTION__, __LINE__); fprintf(stderr, fmt, ##__VA_ARGS__ ); fflush(stderr); abort(); } } while(0)
+#define AC(expr, fmt, ...) do { if(!(expr)) { fflush(stdout);fprintf(stderr, "%s:%s:%d:", __FILE__, __FUNCTION__, __LINE__); fprintf(stderr, fmt, ##__VA_ARGS__ ); fflush(stderr); abort(); } } while(0)
#define die(fmt, ...) do { fprintf(stderr, "%s:%s:%d:", __FILE__, __FUNCTION__, __LINE__); fprintf(stderr, fmt, ##__VA_ARGS__ ); fflush(stderr); exit(-1); } while(0)
#endif
#else
#ifdef NDEBUG
#define AS(expr, fmt,args...)
-#define AC(expr, fmt,args...) do { if(!(expr)) { fprintf(stderr, fmt, ## args ); fflush(stderr); exit(-1); } } while(0)
+#define AC(expr, fmt,args...) do { if(!(expr)) { fprintf(stderr, fmt, ## args ); fflush(stderr); abort(); } } while(0)
#define die(fmt,args...) do { fprintf(stderr, fmt, ## args ); fflush(stderr); exit(-1); } while(0)
#else
-#define AS(expr, fmt,args...) do { if(!(expr)) { fflush(stdout);fprintf(stderr, "%s:%s:%d:", __FILE__, __FUNCTION__, __LINE__); fprintf(stderr, fmt, ## args ); fflush(stderr); exit(-1); } } while(0)
-#define AC(expr, fmt,args...) do { if(!(expr)) { fflush(stdout);fprintf(stderr, "%s:%s:%d:", __FILE__, __FUNCTION__, __LINE__); fprintf(stderr, fmt, ## args ); fflush(stderr); exit(-1); } } while(0)
+#define AS(expr, fmt,args...) do { if(!(expr)) { fflush(stdout);fprintf(stderr, "%s:%s:%d:", __FILE__, __FUNCTION__, __LINE__); fprintf(stderr, fmt, ## args ); fflush(stderr); abort(); } } while(0)
+#define AC(expr, fmt,args...) do { if(!(expr)) { fflush(stdout);fprintf(stderr, "%s:%s:%d:", __FILE__, __FUNCTION__, __LINE__); fprintf(stderr, fmt, ## args ); fflush(stderr); abort(); } } while(0)
#define die(fmt,args...) do { fprintf(stderr, "%s:%s:%d:", __FILE__, __FUNCTION__, __LINE__); fprintf(stderr, fmt, ## args ); fflush(stderr); exit(-1); } while(0)
#endif
#endif
diff --git a/src/ext/for/eliasfano.c b/src/ext/for/eliasfano.c
deleted file mode 100644
index 730d0919..00000000
--- a/src/ext/for/eliasfano.c
+++ /dev/null
@@ -1,213 +0,0 @@
-/**
- Copyright (C) powturbo 2013-2023
- SPDX-License-Identifier: GPL v2 License
-
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 2 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License along
- with this program; if not, write to the Free Software Foundation, Inc.,
- 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
- - homepage : https://sites.google.com/site/powturbo/
- - github : https://github.com/powturbo
- - twitter : https://twitter.com/powturbo
- - email : powturbo [_AT_] gmail [_DOT_] com
-**/
-
-// eliasfano.c - "Integer Compression" Elias Fano
-#ifndef USIZE
-#include <stdlib.h>
-#include <string.h>
-#include "include_/conf.h"
-#include "include_/bitpack.h"
-#include "include_/bitutil.h"
-#include "include_/eliasfano.h"
-
-#include "include_/bitutil_.h"
-
-#pragma warning( disable : 4005)
-#pragma warning( disable : 4090)
-#pragma warning( disable : 4068)
-
-
-#define PAD8(__x) ( (((__x)+8-1)/8) )
-
- #ifdef __SSE42__
-#include <nmmintrin.h>
-#define bslr32(x) _blsr_u32(x)
-#define bslr64(x) _blsr_u64(x)
- #else
-//static inline unsigned long long blsr(unsigned long long x) { return x & (x - 1); }
-#define blsr32(_x_) ((_x_) & ((_x_) - 1))
-#define blsr64(_x_) ((_x_) & ((_x_) - 1))
- #endif
-#define blsr8(_x_) blsr32(_x_)
-#define blsr16(_x_) blsr32(_x_)
-
-#define EFE(__x,__i,__start) ((__x[__i] - __start)-(__i)*EF_INC)
-
-#define BITPACK bitpack
-#define BITUNPACK bitunpack
-#define EF_INC 1
-#define EFANOENC efano1enc
-#define EFANODEC efano1dec
-
-#define USIZE 32
-#include "eliasfano.c"
-#undef USIZE
-
-/*#define USIZE 16
-#include "eliasfano.c"
-#undef USIZE*/
-
-#undef EF_INC
-#undef EFANOENC
-#undef EFANODEC
-
-//----------
-#define EF_INC 0
-#define EFANOENC efanoenc
-#define EFANODEC efanodec
-
-#define USIZE 32
-#include "eliasfano.c"
-#undef USIZE
-
-#define USIZE 64
-#include "eliasfano.c"
-#undef USIZE
-
-/*#define USIZE 16
-#include "eliasfano.c"
-#undef USIZE*/
-
-#undef BITPACK
-#undef BITUNPACK
-
-#undef EF_INC
-#undef EFANOENC
-#undef EFANODEC
-
-//----------------------
- #if defined(__SSE2__) || defined(__ARM_NEON)
-#define VSIZE 128
-
-#define BITPACK bitpack128v
-#define BITUNPACK bitunpack128v
-#define EF_INC 1
-#define EFANOENC efano1enc128v
-#define EFANODEC efano1dec128v
-
-#define USIZE 32
-#include "eliasfano.c"
-#undef EF_INC
-#undef EFANOENC
-#undef EFANODEC
-
-#define EF_INC 0
-#define EFANOENC efanoenc128v
-#define EFANODEC efanodec128v
-#include "eliasfano.c"
- #endif
-
- #ifdef __AVX2__
-#define VSIZE 256
-#define BITPACK bitpack256v
-#define BITUNPACK bitunpack256v
-#define EF_INC 1
-#define EFANOENC efano1enc256v
-#define EFANODEC efano1dec256v
-#include "eliasfano.c"
-
-#define EF_INC 0
-#define EFANOENC efanoenc256v
-#define EFANODEC efanodec256v
-#include "eliasfano.c"
- #endif
-
-#else //--------------------------------------------- implementation ---------------------------------------------------------------
-#define uint_t T3(uint, USIZE, _t)
-
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Wparentheses"
-
-unsigned char *T2(EFANOENC, USIZE)(uint_t *__restrict in, unsigned n, unsigned char *__restrict out, uint_t start) {
- uint_t *ip, e,x,hl,i;
- unsigned char *op;
- unsigned lb;
- uint_t _pa[1024+64],*pa=_pa;
- if(!n) return out;
- if(n > 1024) pa = malloc(sizeof(pa[0])*(n+64)); if(!pa) die("efanoenc:malloc error size=%d ", n);
- e = EFE(in,n-1,start);
- if(!e) { out[0] = 0; if(pa != _pa) free(pa);return out+1; }
-
- lb = T2(bsr, USIZE)(e/n);
- x = ((uint_t)1 << lb)-1; hl = PAD8((e>>lb)+n);
-
- for(i = 0; i != n&~3;) {
- pa[i] = EFE(in,i,start) & x; ++i;
- pa[i] = EFE(in,i,start) & x; ++i;
- pa[i] = EFE(in,i,start) & x; ++i;
- pa[i] = EFE(in,i,start) & x; ++i;
- }
- while(i < n) pa[i] = EFE(in,i,start) & x, ++i;
- *out = lb+1;
- op = T2(BITPACK,USIZE)(pa, n, out+1, lb);
-
- memset(op, 0, hl);
- for(i = 0; i != n&~3; ) {
- x = i + (EFE(in,i,start) >> lb), op[x >> 3] |= (uint_t)1 << (x & 7); ++i;
- x = i + (EFE(in,i,start) >> lb), op[x >> 3] |= (uint_t)1 << (x & 7); ++i;
- x = i + (EFE(in,i,start) >> lb), op[x >> 3] |= (uint_t)1 << (x & 7); ++i;
- x = i + (EFE(in,i,start) >> lb), op[x >> 3] |= (uint_t)1 << (x & 7); ++i;
- }
- while(i < n) x = i + (EFE(in,i,start) >> lb), op[x >> 3] |= (uint_t)1 << (x & 7),++i;
- if(pa != _pa) free(pa);
- return op+hl;
-}
-
-unsigned char *T2(EFANODEC, USIZE)(unsigned char *__restrict in, unsigned n, uint_t *__restrict out, uint_t start) {
- unsigned char *ip = in;
- uint_t i,j,lb = *ip++;
- uint64_t b,x;
- if(!n)
- return in;
-
- if(!lb) {
- #if (defined(__SSE2__) || defined(__ARM_NEON)) && USIZE == 32
- #if EF_INC == 1
- BITFORZERO32(out, n, start, 1);
- #else
- BITZERO32( out, n, start);
- #endif
- #else
- BITFORSET_(out, n, start, EF_INC);
- #endif
- return ip;
- }
-
- ip = T2(BITUNPACK,USIZE)(ip, n, out, --lb);
- #define EFD(i) if(!b) break; out[i] += ((uint_t)(j+ctz64(b)-i) << lb) + start+i*EF_INC; b = blsr64(b); ++i;
-
- for(i=j=0;; j += sizeof(uint64_t)*8) { //PREFETCH(ip+256,0);
- for(b = ctou64(ip+(j>>3)); ; ) {
- EFD(i); EFD(i); EFD(i); EFD(i);
- if(!b) break; out[i] += ((uint_t)(j+ctz64(b)-i) << lb) + start+i*EF_INC;
- if(unlikely(++i >= n))
- goto e;
- b = blsr64(b);
- }
- }
- e:return ip + PAD8((EFE(out,n-1,start)>>lb)+n);
-}
-
-#pragma clang diagnostic pop
-#endif
diff --git a/src/ext/for/ext/OPT_PFD/main.cpp b/src/ext/for/ext/OPT_PFD/main.cpp
deleted file mode 100644
index 2c0ec066..00000000
--- a/src/ext/for/ext/OPT_PFD/main.cpp
+++ /dev/null
@@ -1,101 +0,0 @@
-/*
- * test for OPT-pfd
- *
- * Author: sding
- *
- *
- */
-
-
-
-#include<iostream>
-#include <stdlib.h>
-#include <stdio.h>
-
-#include "opt_p4.h"
-
-using namespace std;
-
-char PATH[128] = "/usr/home/shuai/dumplist/wordlist_Excite"; // for reading list
-
-int get_list(char *term, unsigned int *doc_id, unsigned int *freq, unsigned int *maxc)
-{
- char fpath[128];
- sprintf(fpath,"%s/%s",PATH,term);
- FILE *fdd = fopen(fpath,"r");
- if(fdd==NULL) return 0;
-
- int nread, npos;
-
- nread = fread(&npos, sizeof(unsigned), 1, fdd);
- npos = 0;
-
- while (nread > 0)
- {
- nread = fread(&doc_id[npos], sizeof(unsigned), 1, fdd);
- if (nread <= 0) break;
- fread(&freq[npos], sizeof(unsigned), 1, fdd);
- npos++;
- }
- fclose(fdd);
-
- int i;
-
- /* fill out the max values */
- for (i = 0; i < npos; i += BS)
- maxc[(i/BS)] = doc_id[i+BS-1];
-
- /* take the gap for doc_id */
- for (i = npos-1; i > 0; i--)
- {
- doc_id[i] -= doc_id[i-1];
- doc_id[i] --;
- }
-
- for (i = 0; i < npos; i++)
- freq[i]--;
- return npos;
-}
-
-int main() // just for testing
-{
- int MAX_NDOC = 25205179;
- unsigned int *docid = new unsigned int[MAX_NDOC];
- unsigned int *docid_check = new unsigned int[MAX_NDOC ];
-
- unsigned int *fre = new unsigned int[MAX_NDOC];
- unsigned int *maxc = new unsigned int[MAX_NDOC/BS];
- unsigned int *aux = new unsigned int[MAX_NDOC];
- unsigned int * all_array = new unsigned int[2048]; // extra array for coding
-
-
- int listSize = get_list("information", docid, fre, maxc);
- cout<<"list size is "<<listSize<<endl;
- for(int i=0;i<listSize; i++)
- docid_check[i] = docid[i];
-
- int cSize = OPT4(docid,listSize,aux);
- cout<<"Compressed size is "<<cSize<<" byte"<<endl;
-
- unsigned int *_ww = aux;
- for (int i = 0; i*BS < listSize; i++)
- {
- /* _ww = detailed_p4_decode(docid_check, _ww, all_array); */ // this is fast
- _ww = detailed_p4_decode(docid_check + BS * i, _ww, all_array); // this is slow
- }
- // check correctness
- for(int i=0;i<listSize;i++)
- {
- if( docid_check[i]!= docid[i] )
- {
- cout<<"Exceptions happen"<<endl;
- exit (1);
- }
- }
- delete []docid;
- delete []docid_check;
- delete []fre;
- delete []maxc;
- delete []aux;
- delete []all_array;
-}
diff --git a/src/ext/for/ext/OPT_PFD/opt_p4.h b/src/ext/for/ext/OPT_PFD/opt_p4.h
deleted file mode 100644
index 24038fa0..00000000
--- a/src/ext/for/ext/OPT_PFD/opt_p4.h
+++ /dev/null
@@ -1,54 +0,0 @@
-#include "pf.h"
-#define BS 128
-//using namespace std;
-//file "OPT_PFD.zip" form: http://jinruhe.com/
-//int dnum[17] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,16,20,32};
-
-void p4_encode(unsigned int *doc_id, int npos, int b,unsigned int *buf , int *size, int *ex_n)
-{
- int i = 0;
- unsigned int *ww = buf;
- detailed_p4_encode(&ww, &(doc_id[i]), b, size,ex_n);
-}
-
-/*
-* when list_size is too small, not good to use this function
-*/
-int OPT4(unsigned int *doc_id,unsigned int list_size,unsigned int *aux)
-{
- int i,j,l;
- for(i=0; i<2*BS; i++)
- {
- doc_id[i+list_size] = 0 ; // pack the input, avoid garbage data in the end
- }
- int size = 0;
- int ex_n = 0;
- int csize = 0; // compressed size in bytes
-
- int chunk_size = 0;
- int b = -1, temp_en = 0;
- int offset = 0;
- for(j=0;j<list_size;j+=BS) // for each chunk
- {
- chunk_size = 999999999;
- b = -1;
- // get the smallest chunk size by trying all b's
- for(l=0;l<16;l++)
- {
- p4_encode(doc_id+j, BS, l, aux+offset, &size, &ex_n);
- if(chunk_size > size * 4) // int bytes
- {
- chunk_size = size *4;
- b = l;
- temp_en = ex_n;
- }
- }
-
- csize += chunk_size;
- //printf("encode:%u\n", b);
- p4_encode(doc_id + j, BS, b, aux + offset, &size, &ex_n);
- offset += size;
- }
-
- return csize;
-}
diff --git a/src/ext/for/ext/OPT_PFD/pf.h b/src/ext/for/ext/OPT_PFD/pf.h
deleted file mode 100644
index 788f8cca..00000000
--- a/src/ext/for/ext/OPT_PFD/pf.h
+++ /dev/null
@@ -1,158 +0,0 @@
-#include "s16head.h"
-#include "unpack.h"
-
-
-#define BS 128
-#define FRAC 0.10
-#define S 16
-#define PCHUNK 128
-
-void pack(unsigned int *v, unsigned int b, unsigned int n, unsigned int *w);
-
-
-int detailed_p4_encode(unsigned int **w, unsigned int* p, int num , int *chunk_size, int * exception_n)
-{
- int i, j, t, s;
-
- unsigned int b = cnum[num];
- int bb_e;
- int bb_p;
- int p_low;
- unsigned int e_n = 0;
- int max_p = 0;
- int max_e = 0;
-
- unsigned int* out = (unsigned*)malloc(sizeof(unsigned)*PCHUNK*2);
- unsigned int* ex = (unsigned*)malloc(sizeof(unsigned)*PCHUNK*2);
- unsigned int* po = (unsigned*)malloc(sizeof(unsigned)*PCHUNK*2);
-
- unsigned int* tp = NULL;
- unsigned int *_pp, *_ww;
-
- if (b == 32)
- {
- (*w)[0] = ((b<<10)) + (0);
- *w +=1;
- for (i = 0; i < PCHUNK ; i++) (*w)[i] = p[i];
- *w += (PCHUNK);
- (*chunk_size) = 1 + BS;
-
- free(out);
- free(ex);
- free(po);
- return 0;
- }
-
- for (i = 0; i < PCHUNK ; i++)
- {
- if ( p[i] >= (1<<b) ) //exception
- {
- p_low = p[i] & ((1<<b)-1);
- out[i] = p_low;
- ex[e_n] = (p[i] >> b);
- po[(e_n++)] = i; //
- }
- else
- out[i] = p[i];
- }
-
- if (1) // force to pass every time
- {
- /*get the gap of position*/
- for(j = e_n-1;j>0;j--)
- {
- po[j] = po[j] - po[j-1] ;
- po[j] --;
- }
-
- s = ((b * PCHUNK)>>5);
- tp = (*w);
- (*w)[0] = ((num<<10))+e_n; // record b and number of exceptions into this value, in the other version we pick this value out and did not count it
- (*w) += 1;
- for (i = 0; i < s; i++) (*w)[i] = 0;
- pack(out, b, PCHUNK , *w);
- *w += s;
-
- unsigned int *all_array = (unsigned*)malloc(sizeof(unsigned)*PCHUNK*4) ;
- for(j=0;j<e_n;j++)
- {
- all_array[j] = po[j];
- all_array[e_n+j] =ex[j];
- }
- for (_pp = all_array, _ww = (*w); _pp < &(all_array[2*e_n]); )
- s16_encode(&_ww, &_pp, &(all_array[2*e_n]) - _pp);
-
- (*chunk_size) = 1 + s + (_ww - (*w)) ;
-
- (*w) += (_ww - (*w)) ;
-
- (*exception_n) = e_n;
-
- free(out);
- free(ex);
- free(po);
- free(all_array);
- return (e_n);
-
- }
-}
-
-
-void pack(unsigned int *v, unsigned int b, unsigned int n, unsigned int *w)
-{
- int i, bp, wp, s;
-
- for (bp = 0, i = 0; i < n; i++, bp += b)
- {
- wp = bp>>5;
- s = 32 - b - (bp & 31);
- if (s >= 0)
- w[wp] |= (v[i]<<s);
- else
- {
- s = -s;
- w[wp] |= (v[i]>>s);
- w[wp+1] = (v[i]<<(32-s));
- }
- }
-}
-
-/*modified p4decode */
-unsigned int *detailed_p4_decode(unsigned int *_p, unsigned int *_w, unsigned int * all_array)
-{
-
- int i, s;
- unsigned int x;
- int flag = _w[0];
- (_w)++;
-
- unsigned int *_ww,*_pp;
- unsigned int b = ((flag>>10) & 31);
- unsigned int e_n = (flag & 1023) ;
-
- (unpack[b])(_p, _w);
-
- b = cnum[b];
- _w += ((b * BS)>>5);
- unsigned int _k = 0;
- unsigned int psum = 0;
- if(e_n != 0 )
- {
- for (_pp = all_array, _ww = (unsigned int *)(_w); _pp < &(all_array[e_n*2]);)
- {
- S16_DECODE(_ww, _pp);
- }
-
- _w += (_ww - _w);
- psum = all_array[0];
-
- for(i=0;i<e_n;i++)
- {
- _p[psum] += (all_array[e_n+i]<<b);
- psum += all_array[ i + 1] + 1;
- }
- }
- return(_w);
-}
-
-
diff --git a/src/ext/for/ext/OPT_PFD/s16head.h b/src/ext/for/ext/OPT_PFD/s16head.h
deleted file mode 100644
index 99ae4ff1..00000000
--- a/src/ext/for/ext/OPT_PFD/s16head.h
+++ /dev/null
@@ -1,251 +0,0 @@
-
-void s16_encode(unsigned int **_w, unsigned int **_p, unsigned int m)
-{
-int cnum[16] = {28, 21, 21, 21, 14, 9, 8, 7, 6, 6, 5, 5, 4, 3, 2, 1};
-int cbits[16][28] = { {1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1},
- {2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0},
- {1,1,1,1,1,1,1,2,2,2,2,2,2,2,1,1,1,1,1,1,1,0,0,0,0,0,0,0},
- {1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,0,0,0,0,0,0,0},
- {2,2,2,2,2,2,2,2,2,2,2,2,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
- {4,3,3,3,3,3,3,3,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
- {3,4,4,4,4,3,3,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
- {4,4,4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
- {5,5,5,5,4,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
- {4,4,5,5,5,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
- {6,6,6,5,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
- {5,5,6,6,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
- {7,7,7,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
- {10,9,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
- {14,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
- {28,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0} };
-
- unsigned int _k, _j, _m, _o;
-
- for (_k = 0; _k < 16; _k++)
- {
- (**_w) = _k<<28;
- _m = (cnum[_k] < m)? cnum[_k]:m;
- for (_j = 0, _o = 0; (_j < _m) && (*((*_p)+_j) < (1<<cbits[_k][_j])); )
- {
- (**_w) += ((*((*_p)+_j))<<_o);
- _o += cbits[_k][_j];
- _j++;
- }
- if (_j == _m)
- {
- (*_p) += _m;
- (*_w)++;
- break;
- }
- }
-}
-
-
-/* more optimized handcoded edition */
-
-#define S16_DECODE(_w, _p) \
-{ \
- _k = (*_w)>>28; \
- switch(_k) \
- { \
- case 0: \
- *_p = (*_w) & 1; _p++; \
- *_p = (*_w>>1) & 1; _p++; \
- *_p = (*_w>>2) & 1; _p++; \
- *_p = (*_w>>3) & 1; _p++; \
- *_p = (*_w>>4) & 1; _p++; \
- *_p = (*_w>>5) & 1; _p++; \
- *_p = (*_w>>6) & 1; _p++; \
- *_p = (*_w>>7) & 1; _p++; \
- *_p = (*_w>>8) & 1; _p++; \
- *_p = (*_w>>9) & 1; _p++; \
- *_p = (*_w>>10) & 1; _p++; \
- *_p = (*_w>>11) & 1; _p++; \
- *_p = (*_w>>12) & 1; _p++; \
- *_p = (*_w>>13) & 1; _p++; \
- *_p = (*_w>>14) & 1; _p++; \
- *_p = (*_w>>15) & 1; _p++; \
- *_p = (*_w>>16) & 1; _p++; \
- *_p = (*_w>>17) & 1; _p++; \
- *_p = (*_w>>18) & 1; _p++; \
- *_p = (*_w>>19) & 1; _p++; \
- *_p = (*_w>>20) & 1; _p++; \
- *_p = (*_w>>21) & 1; _p++; \
- *_p = (*_w>>22) & 1; _p++; \
- *_p = (*_w>>23) & 1; _p++; \
- *_p = (*_w>>24) & 1; _p++; \
- *_p = (*_w>>25) & 1; _p++; \
- *_p = (*_w>>26) & 1; _p++; \
- *_p = (*_w>>27) & 1; _p++; \
- break; \
- case 1: \
- *_p = (*_w) & 3; _p++; \
- *_p = (*_w>>2) & 3; _p++; \
- *_p = (*_w>>4) & 3; _p++; \
- *_p = (*_w>>6) & 3; _p++; \
- *_p = (*_w>>8) & 3; _p++; \
- *_p = (*_w>>10) & 3; _p++; \
- *_p = (*_w>>12) & 3; _p++; \
- *_p = (*_w>>14) & 1; _p++; \
- *_p = (*_w>>15) & 1; _p++; \
- *_p = (*_w>>16) & 1; _p++; \
- *_p = (*_w>>17) & 1; _p++; \
- *_p = (*_w>>18) & 1; _p++; \
- *_p = (*_w>>19) & 1; _p++; \
- *_p = (*_w>>20) & 1; _p++; \
- *_p = (*_w>>21) & 1; _p++; \
- *_p = (*_w>>22) & 1; _p++; \
- *_p = (*_w>>23) & 1; _p++; \
- *_p = (*_w>>24) & 1; _p++; \
- *_p = (*_w>>25) & 1; _p++; \
- *_p = (*_w>>26) & 1; _p++; \
- *_p = (*_w>>27) & 1; _p++; \
- break; \
- case 2: \
- *_p = (*_w) & 1; _p++; \
- *_p = (*_w>>1) & 1; _p++; \
- *_p = (*_w>>2) & 1; _p++; \
- *_p = (*_w>>3) & 1; _p++; \
- *_p = (*_w>>4) & 1; _p++; \
- *_p = (*_w>>5) & 1; _p++; \
- *_p = (*_w>>6) & 1; _p++; \
- *_p = (*_w>>7) & 3; _p++; \
- *_p = (*_w>>9) & 3; _p++; \
- *_p = (*_w>>11) & 3; _p++; \
- *_p = (*_w>>13) & 3; _p++; \
- *_p = (*_w>>15) & 3; _p++; \
- *_p = (*_w>>17) & 3; _p++; \
- *_p = (*_w>>19) & 3; _p++; \
- *_p = (*_w>>21) & 1; _p++; \
- *_p = (*_w>>22) & 1; _p++; \
- *_p = (*_w>>23) & 1; _p++; \
- *_p = (*_w>>24) & 1; _p++; \
- *_p = (*_w>>25) & 1; _p++; \
- *_p = (*_w>>26) & 1; _p++; \
- *_p = (*_w>>27) & 1; _p++; \
- break; \
- case 3: \
- *_p = (*_w) & 1; _p++; \
- *_p = (*_w>>1) & 1; _p++; \
- *_p = (*_w>>2) & 1; _p++; \
- *_p = (*_w>>3) & 1; _p++; \
- *_p = (*_w>>4) & 1; _p++; \
- *_p = (*_w>>5) & 1; _p++; \
- *_p = (*_w>>6) & 1; _p++; \
- *_p = (*_w>>7) & 1; _p++; \
- *_p = (*_w>>8) & 1; _p++; \
- *_p = (*_w>>9) & 1; _p++; \
- *_p = (*_w>>10) & 1; _p++; \
- *_p = (*_w>>11) & 1; _p++; \
- *_p = (*_w>>12) & 1; _p++; \
- *_p = (*_w>>13) & 1; _p++; \
- *_p = (*_w>>14) & 3; _p++; \
- *_p = (*_w>>16) & 3; _p++; \
- *_p = (*_w>>18) & 3; _p++; \
- *_p = (*_w>>20) & 3; _p++; \
- *_p = (*_w>>22) & 3; _p++; \
- *_p = (*_w>>24) & 3; _p++; \
- *_p = (*_w>>26) & 3; _p++; \
- break; \
- case 4: \
- *_p = (*_w) & 3; _p++; \
- *_p = (*_w>>2) & 3; _p++; \
- *_p = (*_w>>4) & 3; _p++; \
- *_p = (*_w>>6) & 3; _p++; \
- *_p = (*_w>>8) & 3; _p++; \
- *_p = (*_w>>10) & 3; _p++; \
- *_p = (*_w>>12) & 3; _p++; \
- *_p = (*_w>>14) & 3; _p++; \
- *_p = (*_w>>16) & 3; _p++; \
- *_p = (*_w>>18) & 3; _p++; \
- *_p = (*_w>>20) & 3; _p++; \
- *_p = (*_w>>22) & 3; _p++; \
- *_p = (*_w>>24) & 3; _p++; \
- *_p = (*_w>>26) & 3; _p++; \
- break; \
- case 5: \
- *_p = (*_w) & 15; _p++; \
- *_p = (*_w>>4) & 7; _p++; \
- *_p = (*_w>>7) & 7; _p++; \
- *_p = (*_w>>10) & 7; _p++; \
- *_p = (*_w>>13) & 7; _p++; \
- *_p = (*_w>>16) & 7; _p++; \
- *_p = (*_w>>19) & 7; _p++; \
- *_p = (*_w>>22) & 7; _p++; \
- *_p = (*_w>>25) & 7; _p++; \
- break; \
- case 6: \
- *_p = (*_w) & 7; _p++; \
- *_p = (*_w>>3) & 15; _p++; \
- *_p = (*_w>>7) & 15; _p++; \
- *_p = (*_w>>11) & 15; _p++; \
- *_p = (*_w>>15) & 15; _p++; \
- *_p = (*_w>>19) & 7; _p++; \
- *_p = (*_w>>22) & 7; _p++; \
- *_p = (*_w>>25) & 7; _p++; \
- break; \
- case 7: \
- *_p = (*_w) & 15; _p++; \
- *_p = (*_w>>4) & 15; _p++; \
- *_p = (*_w>>8) & 15; _p++; \
- *_p = (*_w>>12) & 15; _p++; \
- *_p = (*_w>>16) & 15; _p++; \
- *_p = (*_w>>20) & 15; _p++; \
- *_p = (*_w>>24) & 15; _p++; \
- break; \
- case 8: \
- *_p = (*_w) & 31; _p++; \
- *_p = (*_w>>5) & 31; _p++; \
- *_p = (*_w>>10) & 31; _p++; \
- *_p = (*_w>>15) & 31; _p++; \
- *_p = (*_w>>20) & 15; _p++; \
- *_p = (*_w>>24) & 15; _p++; \
- break; \
- case 9: \
- *_p = (*_w) & 15; _p++; \
- *_p = (*_w>>4) & 15; _p++; \
- *_p = (*_w>>8) & 31; _p++; \
- *_p = (*_w>>13) & 31; _p++; \
- *_p = (*_w>>18) & 31; _p++; \
- *_p = (*_w>>23) & 31; _p++; \
- break; \
- case 10: \
- *_p = (*_w) & 63; _p++; \
- *_p = (*_w>>6) & 63; _p++; \
- *_p = (*_w>>12) & 63; _p++; \
- *_p = (*_w>>18) & 31; _p++; \
- *_p = (*_w>>23) & 31; _p++; \
- break; \
- case 11: \
- *_p = (*_w) & 31; _p++; \
- *_p = (*_w>>5) & 31; _p++; \
- *_p = (*_w>>10) & 63; _p++; \
- *_p = (*_w>>16) & 63; _p++; \
- *_p = (*_w>>22) & 63; _p++; \
- break; \
- case 12: \
- *_p = (*_w) & 127; _p++; \
- *_p = (*_w>>7) & 127; _p++; \
- *_p = (*_w>>14) & 127; _p++; \
- *_p = (*_w>>21) & 127; _p++; \
- break; \
- case 13: \
- *_p = (*_w) & 1023; _p++; \
- *_p = (*_w>>10) & 511; _p++; \
- *_p = (*_w>>19) & 511; _p++; \
- break; \
- case 14: \
- *_p = (*_w) & 16383; _p++; \
- *_p = (*_w>>14) & 16383; _p++; \
- break; \
- case 15: \
- *_p = (*_w) & ((1<<28)-1); _p++; \
- break; \
- }\
- _w++; \
-}
-
-
-
-
-
diff --git a/src/ext/for/ext/OPT_PFD/unpack.h b/src/ext/for/ext/OPT_PFD/unpack.h
deleted file mode 100644
index abb225cd..00000000
--- a/src/ext/for/ext/OPT_PFD/unpack.h
+++ /dev/null
@@ -1,773 +0,0 @@
-
-/*************************************************************/
-/* macros for fast unpacking of integers of fixed bit length */
-/*************************************************************/
-
-#define BS 128
-
-/* supported bit lengths */
-int cnum[17] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,16,20,32};
-
-void unpack0(unsigned int *p, unsigned int *w)
-{
- int i;
-
- for (i = 0; i < BS; i++) p[i] = 0;
-}
-
-
-void unpack1(unsigned int *p, unsigned int *w)
-{
- int i;
-
- for (i = 0; i < BS; i += 32, p += 32, w += 1)
- {
- p[0] = (w[0] >> 31);
- p[1] = (w[0] >> 30) & 1;
- p[2] = (w[0] >> 29) & 1;
- p[3] = (w[0] >> 28) & 1;
- p[4] = (w[0] >> 27) & 1;
- p[5] = (w[0] >> 26) & 1;
- p[6] = (w[0] >> 25) & 1;
- p[7] = (w[0] >> 24) & 1;
- p[8] = (w[0] >> 23) & 1;
- p[9] = (w[0] >> 22) & 1;
- p[10] = (w[0] >> 21) & 1;
- p[11] = (w[0] >> 20) & 1;
- p[12] = (w[0] >> 19) & 1;
- p[13] = (w[0] >> 18) & 1;
- p[14] = (w[0] >> 17) & 1;
- p[15] = (w[0] >> 16) & 1;
- p[16] = (w[0] >> 15) & 1;
- p[17] = (w[0] >> 14) & 1;
- p[18] = (w[0] >> 13) & 1;
- p[19] = (w[0] >> 12) & 1;
- p[20] = (w[0] >> 11) & 1;
- p[21] = (w[0] >> 10) & 1;
- p[22] = (w[0] >> 9) & 1;
- p[23] = (w[0] >> 8) & 1;
- p[24] = (w[0] >> 7) & 1;
- p[25] = (w[0] >> 6) & 1;
- p[26] = (w[0] >> 5) & 1;
- p[27] = (w[0] >> 4) & 1;
- p[28] = (w[0] >> 3) & 1;
- p[29] = (w[0] >> 2) & 1;
- p[30] = (w[0] >> 1) & 1;
- p[31] = (w[0]) & 1;
- }
-}
-
-
-void unpack2(unsigned int *p, unsigned int *w)
-{
- int i;
-
- for (i = 0; i < BS; i += 32, p += 32, w += 2)
- {
- p[0] = (w[0] >> 30);
- p[1] = (w[0] >> 28) & 3;
- p[2] = (w[0] >> 26) & 3;
- p[3] = (w[0] >> 24) & 3;
- p[4] = (w[0] >> 22) & 3;
- p[5] = (w[0] >> 20) & 3;
- p[6] = (w[0] >> 18) & 3;
- p[7] = (w[0] >> 16) & 3;
- p[8] = (w[0] >> 14) & 3;
- p[9] = (w[0] >> 12) & 3;
- p[10] = (w[0] >> 10) & 3;
- p[11] = (w[0] >> 8) & 3;
- p[12] = (w[0] >> 6) & 3;
- p[13] = (w[0] >> 4) & 3;
- p[14] = (w[0] >> 2) & 3;
- p[15] = (w[0]) & 3;
- p[16] = (w[1] >> 30);
- p[17] = (w[1] >> 28) & 3;
- p[18] = (w[1] >> 26) & 3;
- p[19] = (w[1] >> 24) & 3;
- p[20] = (w[1] >> 22) & 3;
- p[21] = (w[1] >> 20) & 3;
- p[22] = (w[1] >> 18) & 3;
- p[23] = (w[1] >> 16) & 3;
- p[24] = (w[1] >> 14) & 3;
- p[25] = (w[1] >> 12) & 3;
- p[26] = (w[1] >> 10) & 3;
- p[27] = (w[1] >> 8) & 3;
- p[28] = (w[1] >> 6) & 3;
- p[29] = (w[1] >> 4) & 3;
- p[30] = (w[1] >> 2) & 3;
- p[31] = (w[1]) & 3;
- }
-}
-
-
-void unpack3(unsigned int *p, unsigned int *w)
-{
- int i;
-
- for (i = 0; i < BS; i += 32, p += 32, w += 3)
- {
- p[0] = (w[0] >> 29);
- p[1] = (w[0] >> 26) & 7;
- p[2] = (w[0] >> 23) & 7;
- p[3] = (w[0] >> 20) & 7;
- p[4] = (w[0] >> 17) & 7;
- p[5] = (w[0] >> 14) & 7;
- p[6] = (w[0] >> 11) & 7;
- p[7] = (w[0] >> 8) & 7;
- p[8] = (w[0] >> 5) & 7;
- p[9] = (w[0] >> 2) & 7;
- p[10] = (w[0] << 1) & 7;
- p[10] |= (w[1] >> 31);
- p[11] = (w[1] >> 28) & 7;
- p[12] = (w[1] >> 25) & 7;
- p[13] = (w[1] >> 22) & 7;
- p[14] = (w[1] >> 19) & 7;
- p[15] = (w[1] >> 16) & 7;
- p[16] = (w[1] >> 13) & 7;
- p[17] = (w[1] >> 10) & 7;
- p[18] = (w[1] >> 7) & 7;
- p[19] = (w[1] >> 4) & 7;
- p[20] = (w[1] >> 1) & 7;
- p[21] = (w[1] << 2) & 7;
- p[21] |= (w[2] >> 30);
- p[22] = (w[2] >> 27) & 7;
- p[23] = (w[2] >> 24) & 7;
- p[24] = (w[2] >> 21) & 7;
- p[25] = (w[2] >> 18) & 7;
- p[26] = (w[2] >> 15) & 7;
- p[27] = (w[2] >> 12) & 7;
- p[28] = (w[2] >> 9) & 7;
- p[29] = (w[2] >> 6) & 7;
- p[30] = (w[2] >> 3) & 7;
- p[31] = (w[2]) & 7;
- }
-}
-
-
-void unpack4(unsigned int *p, unsigned int *w)
-{
- int i;
-
- for (i = 0; i < BS; i += 32, p += 32, w += 4)
- {
- p[0] = (w[0] >> 28);
- p[1] = (w[0] >> 24) & 15;
- p[2] = (w[0] >> 20) & 15;
- p[3] = (w[0] >> 16) & 15;
- p[4] = (w[0] >> 12) & 15;
- p[5] = (w[0] >> 8) & 15;
- p[6] = (w[0] >> 4) & 15;
- p[7] = (w[0]) & 15;
- p[8] = (w[1] >> 28);
- p[9] = (w[1] >> 24) & 15;
- p[10] = (w[1] >> 20) & 15;
- p[11] = (w[1] >> 16) & 15;
- p[12] = (w[1] >> 12) & 15;
- p[13] = (w[1] >> 8) & 15;
- p[14] = (w[1] >> 4) & 15;
- p[15] = (w[1]) & 15;
- p[16] = (w[2] >> 28);
- p[17] = (w[2] >> 24) & 15;
- p[18] = (w[2] >> 20) & 15;
- p[19] = (w[2] >> 16) & 15;
- p[20] = (w[2] >> 12) & 15;
- p[21] = (w[2] >> 8) & 15;
- p[22] = (w[2] >> 4) & 15;
- p[23] = (w[2]) & 15;
- p[24] = (w[3] >> 28);
- p[25] = (w[3] >> 24) & 15;
- p[26] = (w[3] >> 20) & 15;
- p[27] = (w[3] >> 16) & 15;
- p[28] = (w[3] >> 12) & 15;
- p[29] = (w[3] >> 8) & 15;
- p[30] = (w[3] >> 4) & 15;
- p[31] = (w[3]) & 15;
- }
-}
-
-
-void unpack5(unsigned int *p, unsigned int *w)
-{
- int i;
-
- for (i = 0; i < BS; i += 32, p += 32, w += 5)
- {
- p[0] = (w[0] >> 27);
- p[1] = (w[0] >> 22) & 31;
- p[2] = (w[0] >> 17) & 31;
- p[3] = (w[0] >> 12) & 31;
- p[4] = (w[0] >> 7) & 31;
- p[5] = (w[0] >> 2) & 31;
- p[6] = (w[0] << 3) & 31;
- p[6] |= (w[1] >> 29);
- p[7] = (w[1] >> 24) & 31;
- p[8] = (w[1] >> 19) & 31;
- p[9] = (w[1] >> 14) & 31;
- p[10] = (w[1] >> 9) & 31;
- p[11] = (w[1] >> 4) & 31;
- p[12] = (w[1] << 1) & 31;
- p[12] |= (w[2] >> 31);
- p[13] = (w[2] >> 26) & 31;
- p[14] = (w[2] >> 21) & 31;
- p[15] = (w[2] >> 16) & 31;
- p[16] = (w[2] >> 11) & 31;
- p[17] = (w[2] >> 6) & 31;
- p[18] = (w[2] >> 1) & 31;
- p[19] = (w[2] << 4) & 31;
- p[19] |= (w[3] >> 28);
- p[20] = (w[3] >> 23) & 31;
- p[21] = (w[3] >> 18) & 31;
- p[22] = (w[3] >> 13) & 31;
- p[23] = (w[3] >> 8) & 31;
- p[24] = (w[3] >> 3) & 31;
- p[25] = (w[3] << 2) & 31;
- p[25] |= (w[4] >> 30);
- p[26] = (w[4] >> 25) & 31;
- p[27] = (w[4] >> 20) & 31;
- p[28] = (w[4] >> 15) & 31;
- p[29] = (w[4] >> 10) & 31;
- p[30] = (w[4] >> 5) & 31;
- p[31] = (w[4]) & 31;
- }
-}
-
-
-void unpack6(unsigned int *p, unsigned int *w)
-{
- int i;
-
- for (i = 0; i < BS; i += 32, p += 32, w += 6)
- {
- p[0] = (w[0] >> 26);
- p[1] = (w[0] >> 20) & 63;
- p[2] = (w[0] >> 14) & 63;
- p[3] = (w[0] >> 8) & 63;
- p[4] = (w[0] >> 2) & 63;
- p[5] = (w[0] << 4) & 63;
- p[5] |= (w[1] >> 28);
- p[6] = (w[1] >> 22) & 63;
- p[7] = (w[1] >> 16) & 63;
- p[8] = (w[1] >> 10) & 63;
- p[9] = (w[1] >> 4) & 63;
- p[10] = (w[1] << 2) & 63;
- p[10] |= (w[2] >> 30);
- p[11] = (w[2] >> 24) & 63;
- p[12] = (w[2] >> 18) & 63;
- p[13] = (w[2] >> 12) & 63;
- p[14] = (w[2] >> 6) & 63;
- p[15] = (w[2]) & 63;
- p[16] = (w[3] >> 26);
- p[17] = (w[3] >> 20) & 63;
- p[18] = (w[3] >> 14) & 63;
- p[19] = (w[3] >> 8) & 63;
- p[20] = (w[3] >> 2) & 63;
- p[21] = (w[3] << 4) & 63;
- p[21] |= (w[4] >> 28);
- p[22] = (w[4] >> 22) & 63;
- p[23] = (w[4] >> 16) & 63;
- p[24] = (w[4] >> 10) & 63;
- p[25] = (w[4] >> 4) & 63;
- p[26] = (w[4] << 2) & 63;
- p[26] |= (w[5] >> 30);
- p[27] = (w[5] >> 24) & 63;
- p[28] = (w[5] >> 18) & 63;
- p[29] = (w[5] >> 12) & 63;
- p[30] = (w[5] >> 6) & 63;
- p[31] = (w[5]) & 63;
- }
-}
-
-
-void unpack7(unsigned int *p, unsigned int *w)
-{
- int i;
-
- for (i = 0; i < BS; i += 32, p += 32, w += 7)
- {
- p[0] = (w[0] >> 25);
- p[1] = (w[0] >> 18) & 127;
- p[2] = (w[0] >> 11) & 127;
- p[3] = (w[0] >> 4) & 127;
- p[4] = (w[0] << 3) & 127;
- p[4] |= (w[1] >> 29);
- p[5] = (w[1] >> 22) & 127;
- p[6] = (w[1] >> 15) & 127;
- p[7] = (w[1] >> 8) & 127;
- p[8] = (w[1] >> 1) & 127;
- p[9] = (w[1] << 6) & 127;
- p[9] |= (w[2] >> 26);
- p[10] = (w[2] >> 19) & 127;
- p[11] = (w[2] >> 12) & 127;
- p[12] = (w[2] >> 5) & 127;
- p[13] = (w[2] << 2) & 127;
- p[13] |= (w[3] >> 30);
- p[14] = (w[3] >> 23) & 127;
- p[15] = (w[3] >> 16) & 127;
- p[16] = (w[3] >> 9) & 127;
- p[17] = (w[3] >> 2) & 127;
- p[18] = (w[3] << 5) & 127;
- p[18] |= (w[4] >> 27);
- p[19] = (w[4] >> 20) & 127;
- p[20] = (w[4] >> 13) & 127;
- p[21] = (w[4] >> 6) & 127;
- p[22] = (w[4] << 1) & 127;
- p[22] |= (w[5] >> 31);
- p[23] = (w[5] >> 24) & 127;
- p[24] = (w[5] >> 17) & 127;
- p[25] = (w[5] >> 10) & 127;
- p[26] = (w[5] >> 3) & 127;
- p[27] = (w[5] << 4) & 127;
- p[27] |= (w[6] >> 28);
- p[28] = (w[6] >> 21) & 127;
- p[29] = (w[6] >> 14) & 127;
- p[30] = (w[6] >> 7) & 127;
- p[31] = (w[6]) & 127;
- }
-}
-
-
-void unpack8(unsigned int *p, unsigned int *w)
-{
- int i;
-
- for (i = 0; i < BS; i += 32, p += 32, w += 8)
- {
- p[0] = (w[0] >> 24);
- p[1] = (w[0] >> 16) & 255;
- p[2] = (w[0] >> 8) & 255;
- p[3] = (w[0]) & 255;
- p[4] = (w[1] >> 24);
- p[5] = (w[1] >> 16) & 255;
- p[6] = (w[1] >> 8) & 255;
- p[7] = (w[1]) & 255;
- p[8] = (w[2] >> 24);
- p[9] = (w[2] >> 16) & 255;
- p[10] = (w[2] >> 8) & 255;
- p[11] = (w[2]) & 255;
- p[12] = (w[3] >> 24);
- p[13] = (w[3] >> 16) & 255;
- p[14] = (w[3] >> 8) & 255;
- p[15] = (w[3]) & 255;
- p[16] = (w[4] >> 24);
- p[17] = (w[4] >> 16) & 255;
- p[18] = (w[4] >> 8) & 255;
- p[19] = (w[4]) & 255;
- p[20] = (w[5] >> 24);
- p[21] = (w[5] >> 16) & 255;
- p[22] = (w[5] >> 8) & 255;
- p[23] = (w[5]) & 255;
- p[24] = (w[6] >> 24);
- p[25] = (w[6] >> 16) & 255;
- p[26] = (w[6] >> 8) & 255;
- p[27] = (w[6]) & 255;
- p[28] = (w[7] >> 24);
- p[29] = (w[7] >> 16) & 255;
- p[30] = (w[7] >> 8) & 255;
- p[31] = (w[7]) & 255;
- }
-}
-
-
-void unpack9(unsigned int *p, unsigned int *w)
-{
- int i;
-
- for (i = 0; i < BS; i += 32, p += 32, w += 9)
- {
- p[0] = (w[0] >> 23);
- p[1] = (w[0] >> 14) & 511;
- p[2] = (w[0] >> 5) & 511;
- p[3] = (w[0] << 4) & 511;
- p[3] |= (w[1] >> 28);
- p[4] = (w[1] >> 19) & 511;
- p[5] = (w[1] >> 10) & 511;
- p[6] = (w[1] >> 1) & 511;
- p[7] = (w[1] << 8) & 511;
- p[7] |= (w[2] >> 24);
- p[8] = (w[2] >> 15) & 511;
- p[9] = (w[2] >> 6) & 511;
- p[10] = (w[2] << 3) & 511;
- p[10] |= (w[3] >> 29);
- p[11] = (w[3] >> 20) & 511;
- p[12] = (w[3] >> 11) & 511;
- p[13] = (w[3] >> 2) & 511;
- p[14] = (w[3] << 7) & 511;
- p[14] |= (w[4] >> 25);
- p[15] = (w[4] >> 16) & 511;
- p[16] = (w[4] >> 7) & 511;
- p[17] = (w[4] << 2) & 511;
- p[17] |= (w[5] >> 30);
- p[18] = (w[5] >> 21) & 511;
- p[19] = (w[5] >> 12) & 511;
- p[20] = (w[5] >> 3) & 511;
- p[21] = (w[5] << 6) & 511;
- p[21] |= (w[6] >> 26);
- p[22] = (w[6] >> 17) & 511;
- p[23] = (w[6] >> 8) & 511;
- p[24] = (w[6] << 1) & 511;
- p[24] |= (w[7] >> 31);
- p[25] = (w[7] >> 22) & 511;
- p[26] = (w[7] >> 13) & 511;
- p[27] = (w[7] >> 4) & 511;
- p[28] = (w[7] << 5) & 511;
- p[28] |= (w[8] >> 27);
- p[29] = (w[8] >> 18) & 511;
- p[30] = (w[8] >> 9) & 511;
- p[31] = (w[8]) & 511;
- }
-}
-
-
-void unpack10(unsigned int *p, unsigned int *w)
-{
- int i;
-
- for (i = 0; i < BS; i += 32, p += 32, w += 10)
- {
- p[0] = (w[0] >> 22);
- p[1] = (w[0] >> 12) & 1023;
- p[2] = (w[0] >> 2) & 1023;
- p[3] = (w[0] << 8) & 1023;
- p[3] |= (w[1] >> 24);
- p[4] = (w[1] >> 14) & 1023;
- p[5] = (w[1] >> 4) & 1023;
- p[6] = (w[1] << 6) & 1023;
- p[6] |= (w[2] >> 26);
- p[7] = (w[2] >> 16) & 1023;
- p[8] = (w[2] >> 6) & 1023;
- p[9] = (w[2] << 4) & 1023;
- p[9] |= (w[3] >> 28);
- p[10] = (w[3] >> 18) & 1023;
- p[11] = (w[3] >> 8) & 1023;
- p[12] = (w[3] << 2) & 1023;
- p[12] |= (w[4] >> 30);
- p[13] = (w[4] >> 20) & 1023;
- p[14] = (w[4] >> 10) & 1023;
- p[15] = (w[4]) & 1023;
- p[16] = (w[5] >> 22);
- p[17] = (w[5] >> 12) & 1023;
- p[18] = (w[5] >> 2) & 1023;
- p[19] = (w[5] << 8) & 1023;
- p[19] |= (w[6] >> 24);
- p[20] = (w[6] >> 14) & 1023;
- p[21] = (w[6] >> 4) & 1023;
- p[22] = (w[6] << 6) & 1023;
- p[22] |= (w[7] >> 26);
- p[23] = (w[7] >> 16) & 1023;
- p[24] = (w[7] >> 6) & 1023;
- p[25] = (w[7] << 4) & 1023;
- p[25] |= (w[8] >> 28);
- p[26] = (w[8] >> 18) & 1023;
- p[27] = (w[8] >> 8) & 1023;
- p[28] = (w[8] << 2) & 1023;
- p[28] |= (w[9] >> 30);
- p[29] = (w[9] >> 20) & 1023;
- p[30] = (w[9] >> 10) & 1023;
- p[31] = (w[9]) & 1023;
- }
-}
-
-
-void unpack11(unsigned int *p, unsigned int *w)
-{
- int i;
-
- for (i = 0; i < BS; i += 32, p += 32, w += 11)
- {
- p[0] = (w[0] >> 21);
- p[1] = (w[0] >> 10) & 2047;
- p[2] = (w[0] << 1) & 2047;
- p[2] |= (w[1] >> 31);
- p[3] = (w[1] >> 20) & 2047;
- p[4] = (w[1] >> 9) & 2047;
- p[5] = (w[1] << 2) & 2047;
- p[5] |= (w[2] >> 30);
- p[6] = (w[2] >> 19) & 2047;
- p[7] = (w[2] >> 8) & 2047;
- p[8] = (w[2] << 3) & 2047;
- p[8] |= (w[3] >> 29);
- p[9] = (w[3] >> 18) & 2047;
- p[10] = (w[3] >> 7) & 2047;
- p[11] = (w[3] << 4) & 2047;
- p[11] |= (w[4] >> 28);
- p[12] = (w[4] >> 17) & 2047;
- p[13] = (w[4] >> 6) & 2047;
- p[14] = (w[4] << 5) & 2047;
- p[14] |= (w[5] >> 27);
- p[15] = (w[5] >> 16) & 2047;
- p[16] = (w[5] >> 5) & 2047;
- p[17] = (w[5] << 6) & 2047;
- p[17] |= (w[6] >> 26);
- p[18] = (w[6] >> 15) & 2047;
- p[19] = (w[6] >> 4) & 2047;
- p[20] = (w[6] << 7) & 2047;
- p[20] |= (w[7] >> 25);
- p[21] = (w[7] >> 14) & 2047;
- p[22] = (w[7] >> 3) & 2047;
- p[23] = (w[7] << 8) & 2047;
- p[23] |= (w[8] >> 24);
- p[24] = (w[8] >> 13) & 2047;
- p[25] = (w[8] >> 2) & 2047;
- p[26] = (w[8] << 9) & 2047;
- p[26] |= (w[9] >> 23);
- p[27] = (w[9] >> 12) & 2047;
- p[28] = (w[9] >> 1) & 2047;
- p[29] = (w[9] << 10) & 2047;
- p[29] |= (w[10] >> 22);
- p[30] = (w[10] >> 11) & 2047;
- p[31] = (w[10]) & 2047;
- }
-}
-
-
-void unpack12(unsigned int *p, unsigned int *w)
-{
- int i;
-
- for (i = 0; i < BS; i += 32, p += 32, w += 12)
- {
- p[0] = (w[0] >> 20);
- p[1] = (w[0] >> 8) & 4095;
- p[2] = (w[0] << 4) & 4095;
- p[2] |= (w[1] >> 28);
- p[3] = (w[1] >> 16) & 4095;
- p[4] = (w[1] >> 4) & 4095;
- p[5] = (w[1] << 8) & 4095;
- p[5] |= (w[2] >> 24);
- p[6] = (w[2] >> 12) & 4095;
- p[7] = (w[2]) & 4095;
- p[8] = (w[3] >> 20);
- p[9] = (w[3] >> 8) & 4095;
- p[10] = (w[3] << 4) & 4095;
- p[10] |= (w[4] >> 28);
- p[11] = (w[4] >> 16) & 4095;
- p[12] = (w[4] >> 4) & 4095;
- p[13] = (w[4] << 8) & 4095;
- p[13] |= (w[5] >> 24);
- p[14] = (w[5] >> 12) & 4095;
- p[15] = (w[5]) & 4095;
- p[16] = (w[6] >> 20);
- p[17] = (w[6] >> 8) & 4095;
- p[18] = (w[6] << 4) & 4095;
- p[18] |= (w[7] >> 28);
- p[19] = (w[7] >> 16) & 4095;
- p[20] = (w[7] >> 4) & 4095;
- p[21] = (w[7] << 8) & 4095;
- p[21] |= (w[8] >> 24);
- p[22] = (w[8] >> 12) & 4095;
- p[23] = (w[8]) & 4095;
- p[24] = (w[9] >> 20);
- p[25] = (w[9] >> 8) & 4095;
- p[26] = (w[9] << 4) & 4095;
- p[26] |= (w[10] >> 28);
- p[27] = (w[10] >> 16) & 4095;
- p[28] = (w[10] >> 4) & 4095;
- p[29] = (w[10] << 8) & 4095;
- p[29] |= (w[11] >> 24);
- p[30] = (w[11] >> 12) & 4095;
- p[31] = (w[11]) & 4095;
- }
-}
-
-
-void unpack13(unsigned int *p, unsigned int *w)
-{
- int i;
-
- for (i = 0; i < BS; i += 32, p += 32, w += 13)
- {
- p[0] = (w[0] >> 19);
- p[1] = (w[0] >> 6) & 8191;
- p[2] = (w[0] << 7) & 8191;
- p[2] |= (w[1] >> 25);
- p[3] = (w[1] >> 12) & 8191;
- p[4] = (w[1] << 1) & 8191;
- p[4] |= (w[2] >> 31);
- p[5] = (w[2] >> 18) & 8191;
- p[6] = (w[2] >> 5) & 8191;
- p[7] = (w[2] << 8) & 8191;
- p[7] |= (w[3] >> 24);
- p[8] = (w[3] >> 11) & 8191;
- p[9] = (w[3] << 2) & 8191;
- p[9] |= (w[4] >> 30);
- p[10] = (w[4] >> 17) & 8191;
- p[11] = (w[4] >> 4) & 8191;
- p[12] = (w[4] << 9) & 8191;
- p[12] |= (w[5] >> 23);
- p[13] = (w[5] >> 10) & 8191;
- p[14] = (w[5] << 3) & 8191;
- p[14] |= (w[6] >> 29);
- p[15] = (w[6] >> 16) & 8191;
- p[16] = (w[6] >> 3) & 8191;
- p[17] = (w[6] << 10) & 8191;
- p[17] |= (w[7] >> 22);
- p[18] = (w[7] >> 9) & 8191;
- p[19] = (w[7] << 4) & 8191;
- p[19] |= (w[8] >> 28);
- p[20] = (w[8] >> 15) & 8191;
- p[21] = (w[8] >> 2) & 8191;
- p[22] = (w[8] << 11) & 8191;
- p[22] |= (w[9] >> 21);
- p[23] = (w[9] >> 8) & 8191;
- p[24] = (w[9] << 5) & 8191;
- p[24] |= (w[10] >> 27);
- p[25] = (w[10] >> 14) & 8191;
- p[26] = (w[10] >> 1) & 8191;
- p[27] = (w[10] << 12) & 8191;
- p[27] |= (w[11] >> 20);
- p[28] = (w[11] >> 7) & 8191;
- p[29] = (w[11] << 6) & 8191;
- p[29] |= (w[12] >> 26);
- p[30] = (w[12] >> 13) & 8191;
- p[31] = (w[12]) & 8191;
- }
-}
-
-
-void unpack16(unsigned int *p, unsigned int *w)
-{
- int i;
-
- for (i = 0; i < BS; i += 32, p += 32, w += 16)
- {
- p[0] = (w[0] >> 16);
- p[1] = (w[0]) & 65535;
- p[2] = (w[1] >> 16);
- p[3] = (w[1]) & 65535;
- p[4] = (w[2] >> 16);
- p[5] = (w[2]) & 65535;
- p[6] = (w[3] >> 16);
- p[7] = (w[3]) & 65535;
- p[8] = (w[4] >> 16);
- p[9] = (w[4]) & 65535;
- p[10] = (w[5] >> 16);
- p[11] = (w[5]) & 65535;
- p[12] = (w[6] >> 16);
- p[13] = (w[6]) & 65535;
- p[14] = (w[7] >> 16);
- p[15] = (w[7]) & 65535;
- p[16] = (w[8] >> 16);
- p[17] = (w[8]) & 65535;
- p[18] = (w[9] >> 16);
- p[19] = (w[9]) & 65535;
- p[20] = (w[10] >> 16);
- p[21] = (w[10]) & 65535;
- p[22] = (w[11] >> 16);
- p[23] = (w[11]) & 65535;
- p[24] = (w[12] >> 16);
- p[25] = (w[12]) & 65535;
- p[26] = (w[13] >> 16);
- p[27] = (w[13]) & 65535;
- p[28] = (w[14] >> 16);
- p[29] = (w[14]) & 65535;
- p[30] = (w[15] >> 16);
- p[31] = (w[15]) & 65535;
- }
-}
-
-
-void unpack20(unsigned int *p, unsigned int *w)
-{
- int i;
-
- for (i = 0; i < BS; i += 32, p += 32, w += 20)
- {
- p[0] = (w[0] >> 12);
- p[1] = (w[0] << 8) & ((1<<20)-1);
- p[1] |= (w[1] >> 24);
- p[2] = (w[1] >> 4) & ((1<<20)-1);
- p[3] = (w[1] << 16) & ((1<<20)-1);
- p[3] |= (w[2] >> 16);
- p[4] = (w[2] << 4) & ((1<<20)-1);
- p[4] |= (w[3] >> 28);
- p[5] = (w[3] >> 8) & ((1<<20)-1);
- p[6] = (w[3] << 12) & ((1<<20)-1);
- p[6] |= (w[4] >> 20);
- p[7] = (w[4]) & ((1<<20)-1);
- p[8] = (w[5] >> 12);
- p[9] = (w[5] << 8) & ((1<<20)-1);
- p[9] |= (w[6] >> 24);
- p[10] = (w[6] >> 4) & ((1<<20)-1);
- p[11] = (w[6] << 16) & ((1<<20)-1);
- p[11] |= (w[7] >> 16);
- p[12] = (w[7] << 4) & ((1<<20)-1);
- p[12] |= (w[8] >> 28);
- p[13] = (w[8] >> 8) & ((1<<20)-1);
- p[14] = (w[8] << 12) & ((1<<20)-1);
- p[14] |= (w[9] >> 20);
- p[15] = (w[9]) & ((1<<20)-1);
- p[16] = (w[10] >> 12);
- p[17] = (w[10] << 8) & ((1<<20)-1);
- p[17] |= (w[11] >> 24);
- p[18] = (w[11] >> 4) & ((1<<20)-1);
- p[19] = (w[11] << 16) & ((1<<20)-1);
- p[19] |= (w[12] >> 16);
- p[20] = (w[12] << 4) & ((1<<20)-1);
- p[20] |= (w[13] >> 28);
- p[21] = (w[13] >> 8) & ((1<<20)-1);
- p[22] = (w[13] << 12) & ((1<<20)-1);
- p[22] |= (w[14] >> 20);
- p[23] = (w[14]) & ((1<<20)-1);
- p[24] = (w[15] >> 12);
- p[25] = (w[15] << 8) & ((1<<20)-1);
- p[25] |= (w[16] >> 24);
- p[26] = (w[16] >> 4) & ((1<<20)-1);
- p[27] = (w[16] << 16) & ((1<<20)-1);
- p[27] |= (w[17] >> 16);
- p[28] = (w[17] << 4) & ((1<<20)-1);
- p[28] |= (w[18] >> 28);
- p[29] = (w[18] >> 8) & ((1<<20)-1);
- p[30] = (w[18] << 12) & ((1<<20)-1);
- p[30] |= (w[19] >> 20);
- p[31] = (w[19]) & ((1<<20)-1);
- }
-}
-
-
-static void unpack32(unsigned int *p, unsigned int *w)
-{
- int i;
-
- for (i = 0; i < BS; i += 32, p += 32, w += 32)
- {
- p[0] = w[0];
- p[1] = w[1];
- p[2] = w[2];
- p[3] = w[3];
- p[4] = w[4];
- p[5] = w[5];
- p[6] = w[6];
- p[7] = w[7];
- p[8] = w[8];
- p[9] = w[9];
- p[10] = w[10];
- p[11] = w[11];
- p[12] = w[12];
- p[13] = w[13];
- p[14] = w[14];
- p[15] = w[15];
- p[16] = w[16];
- p[17] = w[17];
- p[18] = w[18];
- p[19] = w[19];
- p[20] = w[20];
- p[21] = w[21];
- p[22] = w[22];
- p[23] = w[23];
- p[24] = w[24];
- p[25] = w[25];
- p[26] = w[26];
- p[27] = w[27];
- p[28] = w[28];
- p[29] = w[29];
- p[30] = w[30];
- p[31] = w[31];
- }
-}
-
-
-typedef void (*pf)(unsigned int *p, unsigned int *w);
-pf unpack[17] = {unpack0, unpack1, unpack2, unpack3, unpack4, unpack5,
- unpack6, unpack7, unpack8, unpack9, unpack10, unpack11,
- unpack12, unpack13, unpack16, unpack20, unpack32};
-
diff --git a/src/ext/for/ext/SPDP_10.c b/src/ext/for/ext/SPDP_10.c
deleted file mode 100644
index e8256954..00000000
--- a/src/ext/for/ext/SPDP_10.c
+++ /dev/null
@@ -1,238 +0,0 @@
-/*
-SPDP code: SPDP is a unified compression/decompression algorithm that works
-well on both binary 32-bit single-precision (float) and binary 64-bit double-
-precision (double) floating-point data.
-
-Copyright (c) 2016, Texas State University. All rights reserved.
-
-Redistribution and use in source and binary forms, with or without modification,
-are permitted for academic, research, experimental, or personal use provided
-that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright notice,
- this list of conditions, and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright notice,
- this list of conditions, and the following disclaimer in the documentation
- and/or other materials provided with the distribution.
- * Neither the name of Texas State University nor the names of its
- contributors may be used to endorse or promote products derived from this
- software without specific prior written permission.
-
-For all other uses, please contact the Office for Commercialization and Industry
-Relations at Texas State University <http://www.txstate.edu/ocir/>.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
-ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
-ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-Authors: Martin Burtscher and Steven Claggett
-*/
-
-
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-
-#define MAX_TABLE_SIZE (1 << 18)
-
-typedef unsigned char byte_t;
-typedef unsigned int word_t;
-
-
-static size_t spdp_compress(const byte_t level, const size_t length, byte_t* const buf1, byte_t* const buf2)
-{
- word_t* in = (word_t*)buf1;
- word_t* out = (word_t*)buf2;
- size_t len = length / sizeof(word_t);
-
- word_t prev2 = 0;
- word_t prev1 = 0;
- size_t pos;
- for (pos = 0; pos < len; pos++) {
- word_t curr = in[pos];
- out[pos] = curr - prev2;
- prev2 = prev1;
- prev1 = curr;
- }
-
- for (pos = len * sizeof(word_t); pos < length; pos++) {
- buf2[pos] = buf1[pos];
- }
-
- byte_t prev = 0;
- size_t wpos = 0;
- size_t d;
- for (d = 0; d < 8; d++) {
- size_t rpos;
- for (rpos = d; rpos < length; rpos += 8) {
- byte_t curr = buf2[rpos];
- buf1[wpos] = curr - prev;
- prev = curr;
- wpos++;
- }
- }
-
- size_t predtabsize = 1 << (level + 9);
- if (predtabsize > MAX_TABLE_SIZE) predtabsize = MAX_TABLE_SIZE;
- const size_t predtabsizem1 = predtabsize - 1;
-
- unsigned int lastpos[MAX_TABLE_SIZE];
- memset(lastpos, 0, predtabsize * sizeof(unsigned int));
-
- size_t rpos = 0;
- wpos = 0;
- unsigned int hist = 0;
- while (rpos < length) {
- byte_t val = buf1[rpos];
- unsigned int lpos = lastpos[hist];
- if (lpos >= 6) {
- if ((buf1[lpos - 6] == buf1[rpos - 6]) && (buf1[lpos - 5] == buf1[rpos - 5]) &&
- (buf1[lpos - 4] == buf1[rpos - 4]) && (buf1[lpos - 3] == buf1[rpos - 3]) &&
- (buf1[lpos - 2] == buf1[rpos - 2]) && (buf1[lpos - 1] == buf1[rpos - 1])) {
- byte_t cnt = 0;
- while ((val == buf1[lpos]) && (cnt < 255) && (rpos < (length - 1))) {
- lastpos[hist] = rpos;
- hist = ((hist << 2) ^ val) & predtabsizem1;
- rpos++;
- lpos++;
- cnt++;
- val = buf1[rpos];
- }
- buf2[wpos] = cnt;
- wpos++;
- }
- }
- buf2[wpos] = val;
- wpos++;
- lastpos[hist] = rpos;
- hist = ((hist << 2) ^ val) & predtabsizem1;
- rpos++;
- }
-
- return wpos;
-}
-
-static void spdp_decompress(const byte_t level, const size_t length, byte_t* const buf2, byte_t* const buf1)
-{
- unsigned int predtabsize = 1 << (level + 9);
- if (predtabsize > MAX_TABLE_SIZE) predtabsize = MAX_TABLE_SIZE;
- const unsigned int predtabsizem1 = predtabsize - 1;
-
- unsigned int lastpos[MAX_TABLE_SIZE];
- memset(lastpos, 0, predtabsize * sizeof(unsigned int));
-
- size_t rpos = 0;
- size_t wpos = 0;
- unsigned int hist = 0;
- while (rpos < length) {
- unsigned int lpos = lastpos[hist];
- if (lpos >= 6) {
- if ((buf1[lpos - 6] == buf1[wpos - 6]) && (buf1[lpos - 5] == buf1[wpos - 5]) &&
- (buf1[lpos - 4] == buf1[wpos - 4]) && (buf1[lpos - 3] == buf1[wpos - 3]) &&
- (buf1[lpos - 2] == buf1[wpos - 2]) && (buf1[lpos - 1] == buf1[wpos - 1])) {
- byte_t cnt = buf2[rpos];
- rpos++;
- byte_t j;
- for (j = 0; j < cnt; j++) {
- byte_t val = buf1[wpos] = buf1[lpos];
- lastpos[hist] = wpos;
- hist = ((hist << 2) ^ val) & predtabsizem1;
- wpos++;
- lpos++;
- }
- }
- }
- byte_t val = buf1[wpos] = buf2[rpos];
- lastpos[hist] = wpos;
- hist = ((hist << 2) ^ val) & predtabsizem1;
- wpos++;
- rpos++;
- }
- const size_t usize = wpos;
-
- byte_t val = 0;
- rpos = 0;
- size_t d;
- for (d = 0; d < 8; d++) {
- size_t wpos;
- for (wpos = d; wpos < usize; wpos += 8) {
- val += buf1[rpos];
- buf2[wpos] = val;
- rpos++;
- }
- }
-
- word_t* in = (word_t*)buf2;
- word_t* out = (word_t*)buf1;
- const size_t len = usize / sizeof(word_t);
-
- word_t prev2 = 0;
- word_t prev1 = 0;
- size_t pos;
- for (pos = 0; pos < len; pos++) {
- word_t curr = in[pos] + prev2;
- out[pos] = curr;
- prev2 = prev1;
- prev1 = curr;
- }
- for (pos = len * sizeof(word_t); pos < usize; pos++) {
- buf1[pos] = buf2[pos];
- }
-}
-#ifndef NMAIN
-#define BUFFER_SIZE (1 << 23)
-static byte_t buffer1[BUFFER_SIZE];
-static byte_t buffer2[BUFFER_SIZE * 2 + 9];
-int main(int argc, char *argv[])
-{
- fprintf(stderr, "SPDP Floating-Point Compressor v1.0\n");
- fprintf(stderr, "Copyright (c) 2016 Texas State University\n\n");
-
- if ((argc != 1) && (argc != 2)) {
- fprintf(stderr, "compression usage: %s level < uncompressed_file > compressed_file\n", argv[0]);
- fprintf(stderr, "decompression usage: %s < compressed_file > decompressed_file\n", argv[0]);
- return -1;
- }
-
- if (argc == 2) { // compression
- byte_t level = atoi(argv[1]);
- if (level < 0) level = 0;
- if (level > 9) level = 9;
- fwrite(&level, sizeof(byte_t), 1, stdout);
-
- int length = fread(buffer1, sizeof(byte_t), BUFFER_SIZE, stdin);
- while (length > 0) {
- fwrite(&length, sizeof(int), 1, stdout);
- int csize = compress(level, length, buffer1, buffer2);
- fwrite(&csize, sizeof(int), 1, stdout);
- fwrite(buffer2, sizeof(byte_t), csize, stdout);
- length = fread(buffer1, sizeof(byte_t), BUFFER_SIZE, stdin);
- }
- } else { // decompression
- byte_t level = 10;
- fread(&level, sizeof(byte_t), 1, stdin);
- if ((level < 0) || (level > 9)) {
- fprintf(stderr, "incorrect input file type\n");
- return -2;
- }
-
- int length;
- while (fread(&length, sizeof(int), 1, stdin) > 0) {
- int csize;
- fread(&csize, sizeof(int), 1, stdin);
- fread(buffer2, sizeof(byte_t), csize, stdin);
- decompress(level, csize, buffer2, buffer1);
- fwrite(buffer1, sizeof(byte_t), length, stdout);
- }
- }
-
- return 0;
-}
-#endif
diff --git a/src/ext/for/ext/bg/bg.c b/src/ext/for/ext/bg/bg.c
deleted file mode 100644
index dc5a714b..00000000
--- a/src/ext/for/ext/bg/bg.c
+++ /dev/null
@@ -1,185 +0,0 @@
-#include "bg.h"
-#include <math.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-int versionNumber[4] = {BG_VER_MAJOR,BG_VER_MINOR,BG_VER_BUILD,BG_VER_REVISION};
-//int BG_SIZE_TYPE = 8;
-
-int dataEndianType = LITTLE_ENDIAN_DATA; //*endian type of the data read from disk
-int sysEndianType; //*sysEndianType is actually set automatically.
-
-//the confparams should be separate between compression and decopmression, in case of mutual-affection when calling compression/decompression alternatively
-bg_params *confparams_cpr = NULL; //used for compression
-bg_params *confparams_dec = NULL; //used for decompression
-
-bg_exedata *exe_params = NULL;
-
-int bgMode_libpressio = BITGROOM;
-int errorControlMode_libpressio = BG_NSD;
-int nsd_libpressio = 5;
-int dsd_libpressio = 5;
-
-#if 0
-unsigned char *BG_compress(int dataType, void *data, size_t *outSize, size_t nbEle)
-{
- return BG_compress_args(dataType, data, outSize, confparams_cpr->bgMode, confparams_cpr->errorControlMode, confparams_cpr->NSD, confparams_cpr->DSD, nbEle);
-}
-#endif
-
-unsigned char* BG_compress_args(int dataType, void *data, size_t *outSize, int bgMode, int errorControlMode, int nsd, int dsd, size_t nbEle, unsigned char *data_)
-{
-
- int dataTypeLen = dataType==BG_FLOAT?sizeof(float):sizeof(double);
-
- size_t bufferSize = dataTypeLen*nbEle;
-
- const double bit_per_dcm_dgt_prc=M_LN10/M_LN2; /* 3.32 [frc] Bits per decimal digit of precision */
- //const double dcm_per_bit_dgt_prc=M_LN2/M_LN10; /* 0.301 [frc] Bits per decimal digit of precision */
- const int bit_xpl_nbr_sgn_flt=23; /* [nbr] Bits 0-22 of SP significands are explicit. Bit 23 is implicitly 1. */
- const int bit_xpl_nbr_sgn_dbl=53; /* [nbr] Bits 0-52 of DP significands are explicit. Bit 53 is implicitly 1. */
- //const int ieee_xpn_fst_flt=127; /* [nbr] IEEE "exponent bias" = actual exponent minus stored exponent */
-
- double prc_bnr_xct; /* [nbr] Binary digits of precision, exact */
-
- int bit_xpl_nbr_sgn=int_CEWI; /* [nbr] Number of explicit bits in significand */
- int bit_xpl_nbr_zro; /* [nbr] Number of explicit bits to zero */
-
- long idx;
-
- unsigned int *u32_ptr;
- unsigned int msk_f32_u32_zro;
- unsigned int msk_f32_u32_one;
- //unsigned int msk_f32_u32_hshv;
- unsigned long long *u64_ptr;
- unsigned long long msk_f64_u64_zro;
- unsigned long long msk_f64_u64_one;
- //unsigned long int msk_f64_u64_hshv;
- unsigned short prc_bnr_ceil; /* [nbr] Exact binary digits of precision rounded-up */
- unsigned short prc_bnr_xpl_rqr; /* [nbr] Explicitly represented binary digits required to retain */
-
- if(errorControlMode == BG_NSD && (nsd < 0 || nsd >16))
- {
- printf("Error: wrong nsd input\n");
- return NULL;
- }
-
- /* How many bits to preserve? */
- prc_bnr_xct=nsd*bit_per_dcm_dgt_prc;
- /* Be conservative, round upwards */
- prc_bnr_ceil=(unsigned short)ceil(prc_bnr_xct);
- /* First bit is implicit not explicit but corner cases prevent our taking advantage of this */
- //prc_bnr_xpl_rqr=prc_bnr_ceil-1;
- //prc_bnr_xpl_rqr=prc_bnr_ceil;
- prc_bnr_xpl_rqr=prc_bnr_ceil+1;
-
- //unsigned char* data_ = (unsigned char*)malloc(bufferSize);
- memcpy(data_, data, bufferSize);
-
- if(dataType == BG_DOUBLE) prc_bnr_xpl_rqr++; /* Seems necessary for double-precision ppc=array(1.234567,1.0e-6,$dmn) */
-
- if(!(dataType == BG_FLOAT && prc_bnr_xpl_rqr >= bit_xpl_nbr_sgn_flt) || (dataType == BG_DOUBLE && prc_bnr_xpl_rqr >= bit_xpl_nbr_sgn_dbl)) //required # bits is greater than the full length of bits
- {
- if(dataType==BG_FLOAT)
- {
- bit_xpl_nbr_sgn=bit_xpl_nbr_sgn_flt;
- bit_xpl_nbr_zro=bit_xpl_nbr_sgn-prc_bnr_xpl_rqr;
- if(bit_xpl_nbr_zro > bit_xpl_nbr_sgn-NCO_PPC_BIT_XPL_NBR_MIN)
- {
- printf("Error: bit_xpl_nbr_zro > bit_xpl_nbr_sgn-NCO_PPC_BIT_XPL_NBR_MIN\n");
- return NULL;
- }
-
- u32_ptr = (unsigned int*)data_;
- /* Create mask */
- msk_f32_u32_zro=0u; /* Zero all bits */
- msk_f32_u32_zro=~msk_f32_u32_zro; /* Turn all bits to ones */
- /* Bit Shave mask for AND: Left shift zeros into bits to be rounded, leave ones in untouched bits */
- msk_f32_u32_zro <<= bit_xpl_nbr_zro;
- /* Bit Set mask for OR: Put ones into bits to be set, zeros in untouched bits */
- msk_f32_u32_one=~msk_f32_u32_zro;
- //msk_f32_u32_hshv=msk_f32_u32_one & (msk_f32_u32_zro >> 1); /* Set one bit: the MSB of LSBs */
- switch(bgMode)
- {
- case BITGROOM:
- for(idx=0L;idx<nbEle;idx+=2L) u32_ptr[idx]&=msk_f32_u32_zro;
-
- for(idx=1L;idx<nbEle;idx+=2L)
- if(u32_ptr[idx] != 0U) /* Never quantize upwards floating point values of zero */
- u32_ptr[idx]|=msk_f32_u32_one;
- break;
- case BITSHAVE:
- for(idx=0L;idx<nbEle;idx++) u32_ptr[idx]&=msk_f32_u32_zro;
- break;
- case BITSET:
- for(idx=0L;idx<nbEle;idx++)
- if(u32_ptr[idx] != 0U) /* Never quantize upwards floating point values of zero */
- u32_ptr[idx]|=msk_f32_u32_one;
- break;
- }
- }
- else //BG_DOUBLE
- {
- bit_xpl_nbr_sgn=bit_xpl_nbr_sgn_dbl;
- bit_xpl_nbr_zro=bit_xpl_nbr_sgn-prc_bnr_xpl_rqr;
- if(bit_xpl_nbr_zro > bit_xpl_nbr_sgn-NCO_PPC_BIT_XPL_NBR_MIN)
- {
- printf("Error: bit_xpl_nbr_zro > bit_xpl_nbr_sgn-NCO_PPC_BIT_XPL_NBR_MIN\n");
- return NULL;
- }
-
- u64_ptr=(unsigned long int*)data_;
- /* Create mask */
- msk_f64_u64_zro=0ul; /* Zero all bits */
- msk_f64_u64_zro=~msk_f64_u64_zro; /* Turn all bits to ones */
- /* Bit Shave mask for AND: Left shift zeros into bits to be rounded, leave ones in untouched bits */
- msk_f64_u64_zro <<= bit_xpl_nbr_zro;
- /* Bit Set mask for OR: Put ones into bits to be set, zeros in untouched bits */
- msk_f64_u64_one=~msk_f64_u64_zro;
- //msk_f64_u64_hshv=msk_f64_u64_one & (msk_f64_u64_zro >> 1); /* Set one bit: the MSB of LSBs */
- switch(bgMode)
- {
- case BITGROOM:
- for(idx=0L;idx<nbEle;idx+=2L) u64_ptr[idx]&=msk_f64_u64_zro;
- for(idx=1L;idx<nbEle;idx+=2L)
- if(u64_ptr[idx] != 0UL) /* Never quantize upwards floating point values of zero */
- u64_ptr[idx]|=msk_f64_u64_one;
- break;
- case BITSHAVE:
- for(idx=0L;idx<nbEle;idx++) u64_ptr[idx]&=msk_f64_u64_zro;
- break;
- case BITSET:
- for(idx=0L;idx<nbEle;idx++)
- if(u64_ptr[idx] != 0UL) /* Never quantize upwards floating point values of zero */
- u64_ptr[idx]|=msk_f64_u64_one;
- break;
- }
- }
- }
-
- //perform DEFLATE algorithm by Zlib
- //unsigned char* outBytes = NULL;
- //*outSize = zlib_compress5(data_, bufferSize, &outBytes, 1);
-
- //free(data_);
- return data_; //outBytes;
-
-}
-#if 0
-void *BG_decompress(int dataType, unsigned char *bytes, size_t byteLength, size_t nbEle)
-{
- if(dataType==BG_FLOAT)
- {
- unsigned char* decompressedData;
- zlib_uncompress5(bytes, byteLength, &decompressedData, nbEle*sizeof(float));
- return decompressedData;
- }
- else //BG_DOUBLE
- {
- unsigned char* decompressedData;
- zlib_uncompress5(bytes, byteLength, &decompressedData, nbEle*sizeof(double));
- return decompressedData;
- }
-}
-#endif
diff --git a/src/ext/for/ext/bg/bg.h b/src/ext/for/ext/bg/bg.h
deleted file mode 100644
index 29a64918..00000000
--- a/src/ext/for/ext/bg/bg.h
+++ /dev/null
@@ -1,109 +0,0 @@
-//#include "defines.h"
-//#include <rw.h>
-#include <stdio.h>
-#include "defines.h"
-//#include "callZlib.h"
-#include <stdint.h>
-//#include <time.h>
-//#include <sys/time.h>
-//#include "conf.h"
-
-#ifndef _BG_H
-#define _BG_H
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-# define M_LN10 2.30258509299404568401799145468436421 /* loge(10) */
-# define M_LN2 0.693147180559945309417232121458176568 /* loge(2) */
-#define int_CEWI 0
-#define NCO_PPC_BIT_XPL_NBR_MIN 2
-
-typedef union lint16
-{
- unsigned short usvalue;
- short svalue;
- unsigned char byte[2];
-} lint16;
-
-typedef union lint32
-{
- int ivalue;
- unsigned int uivalue;
- unsigned char byte[4];
-} lint32;
-
-typedef union lint64
-{
- long lvalue;
- unsigned long ulvalue;
- unsigned char byte[8];
-} lint64;
-
-typedef union ldouble
-{
- double value;
- unsigned long lvalue;
- unsigned char byte[8];
-} ldouble;
-
-typedef union lfloat
-{
- float value;
- unsigned int ivalue;
- unsigned char byte[4];
-} lfloat;
-
-typedef struct bg_params
-{
- int dataType;
-
- int sol_ID;// GB
- int zlibMode; //* four options: Z_NO_COMPRESSION, or Z_BEST_SPEED, Z_BEST_COMPRESSION, Z_DEFAULT_COMPRESSION
- int bgMode; //BITGROOM, BITSHAVE or BITSET
- int errorControlMode;
- int NSD;
- int DSD;
-
- float fmin, fmax;
- double dmin, dmax;
-
-} bg_params;
-
-
-typedef struct bg_exedata
-{
- unsigned int BG_SIZE_TYPE; //the length (# bytes) of the size_t in the system at runtime //4 or 8: sizeof(size_t)
-} bg_exedata;
-
-
-extern int versionNumber[4];
-
-//-------------------key global variables--------------
-extern int dataEndianType; //*endian type of the data read from disk
-extern int sysEndianType; //*sysEndianType is actually set automatically.
-
-extern bg_params *confparams_cpr;
-extern bg_params *confparams_dec;
-extern bg_exedata *exe_params;
-
-
-//for libpressio
-extern int bgMode_libpressio;
-extern int errorControlMode_libpressio;
-extern int nsd_libpressio;
-extern int dsd_libpressio;
-
-
-unsigned char *BG_compress(int dataType, void *data, size_t *outSize, size_t nbEle);
-
-unsigned char* BG_compress_args(int dataType, void *data, size_t *outSize, int bgMode, int errorControlMode, int nsd, int dsd, size_t nbEle, unsigned char *data_);
-
-//void *BG_decompress(int dataType, unsigned char *bytes, size_t byteLength, size_t nbEle);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* ----- #ifndef _BG_H ----- */
diff --git a/src/ext/for/ext/bg/defines.h b/src/ext/for/ext/bg/defines.h
deleted file mode 100644
index 2ed6cda7..00000000
--- a/src/ext/for/ext/bg/defines.h
+++ /dev/null
@@ -1,54 +0,0 @@
-/**
- * @file defines.h
- * @author Sheng Di
- * @date July, 2019
- * @brief Header file for the dataCompression.c.
- * (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
- * See COPYRIGHT in top-level directory.
- */
-
-#ifndef _BG_DEFINES_H
-#define _BG_DEFINES_H
-
-#define BG_VER_MAJOR 2
-#define BG_VER_MINOR 1
-#define BG_VER_BUILD 9
-#define BG_VER_REVISION 0
-
-#define BG 105
-#define BITGROOM 0
-#define BITSHAVE 1
-#define BITSET 2
-
-#define BG_FLOAT 0
-#define BG_DOUBLE 1
-#define BG_UINT8 2
-#define BG_INT8 3
-#define BG_UINT16 4
-#define BG_INT16 5
-#define BG_UINT32 6
-#define BG_INT32 7
-#define BG_UINT64 8
-#define BG_INT64 9
-
-#define BG_NSD 0
-#define BG_DSD 1
-
-#define LITTLE_ENDIAN_DATA 0 //refers to the endian type of the data read from the disk
-#define BIG_ENDIAN_DATA 1 //big_endian (ppc, max, etc.) ; little_endian (x86, x64, etc.)
-
-#define LITTLE_ENDIAN_SYSTEM 0 //refers to the endian type of the system
-#define BIG_ENDIAN_SYSTEM 1
-
-#define DynArrayInitLen 1024
-
-//SUCCESS returning status
-#define BG_SCES 0 //successful
-#define BG_NSCS -1 //Not successful
-#define BG_FERR -2 //Failed to open input file
-#define BG_TERR -3 //wrong data type (should be only float or double)
-#define BG_DERR -4 //dimension error
-#define BG_MERR -5 //sz_mode error
-#define BG_BERR -6 //bound-mode error (should be only ABS, REL, ABS_AND_REL, ABS_OR_REL, or PW_REL)
-
-#endif /* _BG_DEFINES_H */
diff --git a/src/ext/for/ext/fastpfor.cc b/src/ext/for/ext/fastpfor.cc
deleted file mode 100644
index 9d7ef55e..00000000
--- a/src/ext/for/ext/fastpfor.cc
+++ /dev/null
@@ -1,121 +0,0 @@
-#if defined(_MSC_VER) && _MSC_VER < 1600
-#include "../vs/stdint.h"
-#else
-#include <stdint.h>
-#endif
-
-#include "fastpfor.h"
-#include "FastPFor/headers/variablebyte.h"
-#include "FastPFor/headers/simple16.h"
-//#include "FastPFor/headers/simple8b_rle.h"
-#include "FastPFor/headers/fastpfor.h"
-
-#include "FastPFor/headers/simdfastpfor.h"
-#include "FastPFor/headers/optpfor.h"
-#include "FastPFor/headers/simdoptpfor.h"
-#include "FastPFor/headers/simdgroupsimple.h"
-#include "FastPFor/headers/compositecodec.h"
-
-#define ctou32(_cp_) (*(unsigned *)(_cp_))
-
-unsigned FastPFore32(const uint32_t *in, unsigned n, unsigned char *out, unsigned outsize) {
- size_t nvalue = outsize/4;
- FastPForLib::FastPFor<4> ic;
- ic.encodeArray((const uint32_t *)in, n & (~127), (uint32_t *)(out+4), nvalue);
- if(n & 127) {
- size_t nvalue2 = outsize/4 - nvalue;
- FastPForLib::VariableByte vc;
- vc.encodeArray((const uint32_t *)(in + (n & (~127))), n & 127, (uint32_t *)(out + 4 + nvalue*4), nvalue2);
- nvalue += nvalue2;
- }
- ctou32(out) = nvalue;
- return 4+nvalue*4;
-}
-
-unsigned FastPFord32(const unsigned char *in, unsigned n, uint32_t *out) {
- size_t nvalue = n;
- FastPForLib::FastPFor<4> ic;
- const uint32_t *ip = ic.decodeArray((const uint32_t *)(in+4), ctou32(in), out, nvalue);
- if(n & 127) {
- nvalue = n - nvalue;
- FastPForLib::VariableByte vc;
- ip = vc.decodeArray(ip, (const uint32_t *)in+1+ctou32(in) - ip, out + (n&(~127)), nvalue);
- }
- return ctou32(ip);
-}
-
-/*unsigned FastPFore64(const uint64_t *in, unsigned n, unsigned char *out, unsigned outsize) {
- size_t nvalue = outsize/8;
- FastPForLib::FastPFor<4> ic;
- ic.encodeArray(in, (size_t)(n & (~127)), (uint32_t *)(out+4), nvalue);
- if(n & 127) {
- size_t nvalue2 = outsize/8 - nvalue;
- FastPForLib::VariableByte vc;
-
- vc.encodeArray((const uint64_t *)(in + (n & (~127))), n & 127, (uint32_t *)(out + 4 + nvalue*4), nvalue2);
- nvalue += nvalue2;
- }
- ctou32(out) = nvalue;
- return 4+nvalue*4;
-}
-
-unsigned FastPFord64(const unsigned char *in, unsigned n, uint64_t *out) {
- size_t nvalue = n;
- FastPForLib::FastPFor<4> ic;
- const uint32_t *ip = ic.decodeArray((const uint32_t *)(in+4), ctou32(in), (uint64_t *)out, nvalue);
- if(n & 127) {
- nvalue = n - nvalue;
- FastPForLib::VariableByte vc;
- ip = vc.decodeArray(ip, (const uint32_t *)in+1+ctou32(in) - ip, out + (n&(~127)), nvalue);
- }
- return ctou32(ip);
-}*/
-
-unsigned FastPFore128v32(const uint32_t *in, unsigned n, unsigned char *out, unsigned outsize) {
- size_t nvalue = outsize/4;
- FastPForLib::SIMDFastPFor<4> ic;
- ic.encodeArray(in, n & (~127), (uint32_t *)(out+4), nvalue);
- if(n & 127) {
- size_t nvalue2 = outsize/4 - nvalue;
- FastPForLib::VariableByte vc; vc.encodeArray((const uint32_t *)(in + (n & (~127))), n & 127, (uint32_t *)(out + 4 + nvalue*4), nvalue2);
- nvalue += nvalue2;
- }
- ctou32(out) = nvalue;
- return 4+nvalue*4;
-}
-
-unsigned FastPFord128v32(const unsigned char *in, unsigned n, uint32_t *out) {
- size_t nvalue = n;
- FastPForLib::SIMDFastPFor<4> ic;
- const uint32_t *ip = ic.decodeArray((const uint32_t *)(in+4), *(uint32_t *)in, out, nvalue);
- if(n & 127) {
- nvalue = n - nvalue;
- FastPForLib::VariableByte vc;
- ip = vc.decodeArray(ip, (const uint32_t *)in+1+ctou32(in) - ip, out + (n&(~127)), nvalue); //return vbdec32((unsigned char *)ip, n & 127, out + mynvalue1);
- }
- return (unsigned char *)ip - (unsigned char *)in;
-}
-
-unsigned OptPFore128v32(const uint32_t *in, unsigned n, unsigned char *out, unsigned outsize) {
- size_t nvalue = outsize/4;
- FastPForLib::SIMDOPTPFor<4> ic; ic.encodeArray((const uint32_t *)in, n & (~127), (uint32_t *)(out+4), nvalue);
- if(n & 127) {
- size_t nvalue2 = outsize/4 - nvalue;
- FastPForLib::VariableByte vc; vc.encodeArray((const uint32_t *)(in + (n & (~127))), n & 127, (uint32_t *)(out + 4 + nvalue*4), nvalue2);
- nvalue += nvalue2;
- }
- ctou32(out) = nvalue;
- return 4+nvalue*4;
-}
-
-unsigned OptPFord128v32(const unsigned char *in, unsigned n, uint32_t *out) {
- size_t nvalue = n;
- FastPForLib::SIMDOPTPFor<4> ic;
- const uint32_t *ip = ic.decodeArray((const uint32_t *)(in+4), ctou32(in), out, nvalue);
- if(n & 127) {
- nvalue = n - nvalue;
- FastPForLib::VariableByte vc;
- ip = vc.decodeArray(ip, (const uint32_t *)in+1+ctou32(in) - ip, out + (n&(~127)), nvalue); //return vbdec32((unsigned char *)ip, n & 127, out + mynvalue1);
- }
- return (unsigned char *)ip-in;
-}
diff --git a/src/ext/for/ext/fastpfor.h b/src/ext/for/ext/fastpfor.h
deleted file mode 100644
index 8bbfdc40..00000000
--- a/src/ext/for/ext/fastpfor.h
+++ /dev/null
@@ -1,20 +0,0 @@
-#if defined(_MSC_VER) && _MSC_VER < 1600
-#include "vs/stdint.h"
-#else
-#include <stdint.h>
-#endif
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-unsigned FastPFore32( const uint32_t *in, unsigned n, unsigned char *out, unsigned outsize);
-unsigned FastPFord32( const unsigned char *in, unsigned n, uint32_t *out);
-
-unsigned FastPFore128v32(const uint32_t *in, unsigned n, unsigned char *out, unsigned outsize);
-unsigned FastPFord128v32(const unsigned char *in, unsigned n, uint32_t *out);
-
-unsigned OptPFore128v32( const uint32_t *in, unsigned n, unsigned char *out, unsigned outsize);
-unsigned OptPFord128v32( const unsigned char *in, unsigned n, uint32_t *out);
-#ifdef __cplusplus
-}
-#endif
diff --git a/src/ext/for/ext/gb.c b/src/ext/for/ext/gb.c
deleted file mode 100644
index 72480692..00000000
--- a/src/ext/for/ext/gb.c
+++ /dev/null
@@ -1,151 +0,0 @@
-// copy from https://github.com/ccr/ccr/tree/master/hdf5_plugins for benchmarking purpose
-# define NC_FLOAT 5
-# define NC_DOUBLE 6
-# define NC_FILL_FLOAT (9.9692099683868690e+36f) /* near 15 * 2^119 */
-# define NC_FILL_DOUBLE (9.9692099683868690e+36)
-
-/* Minimum number of explicit significand bits to preserve when zeroing/bit-masking floating point values
- Codes will preserve at least two explicit bits, IEEE significand representation contains one implicit bit
- Thus preserve a least three bits which is approximately one sigificant decimal digit
- Used in nco_ppc_bitmask() and nco_ppc_bitmask_scl() */
-#define NCO_PPC_BIT_XPL_NBR_MIN 2
-
-/* Pointer union for floating point and bitmask types */
-typedef union{ /* ptr_unn */
- float *fp;
- double *dp;
- unsigned int *ui32p;
- unsigned long long *ui64p;
- void *vp;
-} ptr_unn;
-
-void ccr_gbr /* [fnc] Granular BitRound buffer of float values */
-(const int nsd, /* I [nbr] Number of decimal significant digits to quantize to */
- const int type, /* I [enm] netCDF type of operand */
- const size_t sz, /* I [nbr] Size (in elements) of buffer to quantize */
- const int has_mss_val, /* I [flg] Flag for missing values */
- ptr_unn mss_val, /* I [val] Value of missing value */
- void *op1) /* I/O [frc] Values to quantize */
-{
- const char fnc_nm[] = "ccr_gbr()"; /* [sng] Function name */
-
- /* Prefer constants defined in math.h, however, ...
- 20201002 GCC environments can have hard time defining M_LN10/M_LN2 despite finding math.h */
-#ifndef M_LN10
-# define M_LN10 2.30258509299404568402 /* log_e 10 */
-#endif /* M_LN10 */
-#ifndef M_LN2
-# define M_LN2 0.69314718055994530942 /* log_e 2 */
-#endif /* M_LN2 */
- const double bit_per_dgt=M_LN10/M_LN2; /* 3.32 [frc] Bits per decimal digit of precision = log2(10) */
- const double dgt_per_bit=M_LN2/M_LN10; /* 0.301 [frc] Decimal digits per bit of precision = log10(2) */
-
- const int bit_xpl_nbr_sgn_flt=23; /* [nbr] Bits 0-22 of SP significands are explicit. Bit 23 is implicitly 1. */
- const int bit_xpl_nbr_sgn_dbl=52; /* [nbr] Bits 0-51 of DP significands are explicit. Bit 52 is implicitly 1. */
-
- double mnt; /* [frc] Mantissa, 0.5 <= mnt < 1.0 */
- double mnt_fabs; /* [frc] fabs(mantissa) */
- double mnt_log10_fabs; /* [frc] log10(fabs(mantissa))) */
- double val; /* [frc] Copy of input value to avoid indirection */
-
- double prc_bnr_xct=0.0; /* [nbr] Binary digits of precision, exact */
- double mss_val_cmp_dbl; /* Missing value for comparison to double precision values */
-
- float mss_val_cmp_flt; /* Missing value for comparison to single precision values */
-
- int bit_xpl_nbr_sgn=-1; /* [nbr] Number of explicit bits in significand */
- int bit_xpl_nbr_zro; /* [nbr] Number of explicit bits to zero */
-
- int dgt_nbr; /* [nbr] Number of digits before decimal point */
- int qnt_pwr; /* [nbr] Power of two in quantization mask: qnt_msk = 2^qnt_pwr */
- int xpn_bs2; /* [nbr] Binary exponent xpn_bs2 in val = sign(val) * 2^xpn_bs2 * mnt, 0.5 < mnt <= 1.0 */
-
- size_t idx;
-
- unsigned int *u32_ptr;
- unsigned int msk_f32_u32_zro;
- unsigned int msk_f32_u32_one;
- unsigned int msk_f32_u32_hshv;
- unsigned long long int *u64_ptr;
- unsigned long long int msk_f64_u64_zro;
- unsigned long long int msk_f64_u64_one;
- unsigned long long int msk_f64_u64_hshv;
- unsigned short prc_bnr_ceil=0; /* [nbr] Exact binary digits of precision rounded-up */
- unsigned short prc_bnr_xpl_rqr=0; /* [nbr] Explicitly represented binary digits required to retain */
-
- /* Disallow unreasonable quantization */
- //assert(nsd > 0);
- //assert(nsd <= 16);
-
- if(type == NC_FLOAT && prc_bnr_xpl_rqr >= bit_xpl_nbr_sgn_flt) return;
- if(type == NC_DOUBLE && prc_bnr_xpl_rqr >= bit_xpl_nbr_sgn_dbl) return;
-
- switch(type){
- case NC_FLOAT:
- /* Missing value for comparison is _FillValue (if any) otherwise default NC_FILL_FLOAT/DOUBLE */
- if(has_mss_val) mss_val_cmp_flt=*mss_val.fp; else mss_val_cmp_flt=NC_FILL_FLOAT;
- bit_xpl_nbr_sgn=bit_xpl_nbr_sgn_flt;
- u32_ptr=op1; //.ui32p;
- float *fp = op1;
-
- for(idx=0L;idx<sz;idx++){
- if((val=fp[idx]) != mss_val_cmp_flt && u32_ptr[idx] != 0U){
- mnt=frexp(val,&xpn_bs2); /* DGG19 p. 4102 (8) */
- mnt_fabs=fabs(mnt);
- mnt_log10_fabs=log10(mnt_fabs);
- /* 20211003 Continuous determination of dgt_nbr improves CR by ~10% */
- dgt_nbr=(int)floor(xpn_bs2*dgt_per_bit+mnt_log10_fabs)+1; /* DGG19 p. 4102 (8.67) */
- qnt_pwr=(int)floor(bit_per_dgt*(dgt_nbr-nsd)); /* DGG19 p. 4101 (7) */
- prc_bnr_xpl_rqr= mnt_fabs == 0.0 ? 0 : abs((int)floor(xpn_bs2-bit_per_dgt*mnt_log10_fabs)-qnt_pwr); /* Protect against mnt = -0.0 */
- prc_bnr_xpl_rqr--; /* 20211003 Reduce formula result by 1 bit: Passes all tests, improves CR by ~10% */
-
- bit_xpl_nbr_zro=bit_xpl_nbr_sgn-prc_bnr_xpl_rqr;
- msk_f32_u32_zro=0u; /* Zero all bits */
- msk_f32_u32_zro=~msk_f32_u32_zro; /* Turn all bits to ones */
- /* Bit Shave mask for AND: Left shift zeros into bits to be rounded, leave ones in untouched bits */
- msk_f32_u32_zro <<= bit_xpl_nbr_zro;
- /* Bit Set mask for OR: Put ones into bits to be set, zeros in untouched bits */
- msk_f32_u32_one=~msk_f32_u32_zro;
- msk_f32_u32_hshv=msk_f32_u32_one & (msk_f32_u32_zro >> 1); /* Set one bit: the MSB of LSBs */
- u32_ptr[idx]+=msk_f32_u32_hshv; /* Add 1 to the MSB of LSBs, carry 1 to mantissa or even exponent */
- u32_ptr[idx]&=msk_f32_u32_zro; /* Shave it */
- } /* !mss_val_cmp_flt */
- } /* !idx */
- break; /* !NC_FLOAT */
- case NC_DOUBLE:
- /* Missing value for comparison is _FillValue (if any) otherwise default NC_FILL_FLOAT/DOUBLE */
- if(has_mss_val) mss_val_cmp_dbl=*mss_val.dp; else mss_val_cmp_dbl=NC_FILL_FLOAT;
- bit_xpl_nbr_sgn=bit_xpl_nbr_sgn_dbl;
- u64_ptr=op1;
- double *dp = op1;
-
- for(idx=0L;idx<sz;idx++){
- if((val=dp[idx]) != mss_val_cmp_dbl && u64_ptr[idx] != 0U){
- mnt=frexp(val,&xpn_bs2); /* DGG19 p. 4102 (8) */
- mnt_fabs=fabs(mnt);
- mnt_log10_fabs=log10(mnt_fabs);
- /* 20211003 Continuous determination of dgt_nbr improves CR by ~10% */
- dgt_nbr=(int)floor(xpn_bs2*dgt_per_bit+mnt_log10_fabs)+1; /* DGG19 p. 4102 (8.67) */
- qnt_pwr=(int)floor(bit_per_dgt*(dgt_nbr-nsd)); /* DGG19 p. 4101 (7) */
- prc_bnr_xpl_rqr= mnt_fabs == 0.0 ? 0 : abs((int)floor(xpn_bs2-bit_per_dgt*mnt_log10_fabs)-qnt_pwr); /* Protect against mnt = -0.0 */
- prc_bnr_xpl_rqr--; /* 20211003 Reduce formula result by 1 bit: Passes all tests, improves CR by ~10% */
-
- bit_xpl_nbr_zro=bit_xpl_nbr_sgn-prc_bnr_xpl_rqr;
- msk_f64_u64_zro=0u; /* Zero all bits */
- msk_f64_u64_zro=~msk_f64_u64_zro; /* Turn all bits to ones */
- /* Bit Shave mask for AND: Left shift zeros into bits to be rounded, leave ones in untouched bits */
- msk_f64_u64_zro <<= bit_xpl_nbr_zro;
- /* Bit Set mask for OR: Put ones into bits to be set, zeros in untouched bits */
- msk_f64_u64_one=~msk_f64_u64_zro;
- msk_f64_u64_hshv=msk_f64_u64_one & (msk_f64_u64_zro >> 1); /* Set one bit: the MSB of LSBs */
- u64_ptr[idx]+=msk_f64_u64_hshv; /* Add 1 to the MSB of LSBs, carry 1 to mantissa or even exponent */
- u64_ptr[idx]&=msk_f64_u64_zro; /* Shave it */
- } /* !mss_val_cmp_dbl */
- } /* !idx */
- break; /* !NC_DOUBLE */
- default:
- (void)fprintf(stderr,"ERROR: %s reports datum size = %d B is invalid for %s filter\n",fnc_nm,type,""/*CCR_FLT_NAME*/);
- break;
- } /* !type */
-
-} /* ccr_gbr() */
diff --git a/src/ext/for/ext/gov2.png b/src/ext/for/ext/gov2.png
deleted file mode 100644
index 423dcf38..00000000
Binary files a/src/ext/for/ext/gov2.png and /dev/null differ
diff --git a/src/ext/for/ext/libdroundfast.c b/src/ext/for/ext/libdroundfast.c
deleted file mode 100644
index 740c82f6..00000000
--- a/src/ext/for/ext/libdroundfast.c
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
- * Copyright (c) 2019, CNES.
- *
- * This source code is licensed under MIT-style license (found in the
- * COPYING file in the root directory of this source tree).
- */
-//https://github.com/CNES/Digit_Rounding/blob/master/libdround/src/libdroundfast.c
-
-#include <math.h>
-
-#define LOG2_10 3.321928095 // log2(10)
-#define LOG10_2 0.301029996 // log10(2)
-
-#define SIGN(x) ( (x<0) ? -1 : 1 )
-
-const float TABLE[5][2] = {
- {0.6, -LOG10_2},
- {0.7,-0.221848749},
- {0.8,-0.154901959},
- {0.9,-0.096910013},
- {1.0,-0.045757490},
-};
-
-/*
- * Round the float value keeping nsd significant digits.
- * Fast method that does not uses log10() function.
- */
-double droundFast(double v, int nsd)
-{
- // compute the number of digits before the decimal point of the input floating-point value v
- // The value v is interpreted as v = 10^d + eps = 2^e + m
- // with 0 <= m < 0.5
- int e;
- double m = frexp(v, &e); // return the binary exponent e of the input value v = 2^e + m with 0 <= m < 0.5
-
- // =============
- // --- tabulated method ---
- // tabulate the LOG10(m)
- int i = 0;
- while (TABLE[i][0] < m)
- {
- i++;
- }
- float log10m = TABLE[i][1];
-
- // --- low precision method ---
- // float log10m = -LOG10_2;
- // =============
-
- // convert the binary exponent to a number of digits: d = floor(e*log10(2) + log10(m)) + 1
- int d = (int) floor(e*LOG10_2 + log10m) + 1;
-
- // compute the power of the quantization step: q = 2^p
- int p = (int) floor(LOG2_10 * (d - nsd));
- // compute quantization step: q = 2^p
- double q = ldexp(1, p);
-
- // apply the quantization step depending on the bias
- return SIGN(v) * (floor(fabs(v) / q) + 0.5) * q;
-}
-
diff --git a/src/ext/for/ext/polycom/optp4.c b/src/ext/for/ext/polycom/optp4.c
deleted file mode 100644
index 414985e1..00000000
--- a/src/ext/for/ext/polycom/optp4.c
+++ /dev/null
@@ -1,22 +0,0 @@
-#include "../OPT_PFD/opt_p4.h" // OptPFD
-
-unsigned char *optpfdenc32(unsigned *__restrict in, int n, unsigned *__restrict out) {
- if(n < 128)
- out = vbyteenc(in, n, (unsigned *)out);
- else {
- unsigned tmp[OPTPFDMAX];
- for(i = 0; i < n; i++) tmp[i] = in[i];
- return out += OPT4(tmp, n, (unsigned *)out);
- }
- return out;
-}
-
-unsigned char *optpfddec32(unsigned *__restrict in, int n, unsigned *__restrict out) {
- if(n < 128)
- in = vbytedec(in, n, out);
- else {
- unsigned all_array[OPTPFDMAX];
- return (unsigned char *)detailed_p4_decode(out, (unsigned *)in, all_array);
- }
-}
-
diff --git a/src/ext/for/ext/polycom/optp4.h b/src/ext/for/ext/polycom/optp4.h
deleted file mode 100644
index 651513fe..00000000
--- a/src/ext/for/ext/polycom/optp4.h
+++ /dev/null
@@ -1,11 +0,0 @@
-#ifdef __cplusplus
-extern "C" {
-#endif
-#define OPTPFDMAX 2048
-unsigned char *optpfdenc32(unsigned *__restrict in, int n, unsigned *__restrict out);
-unsigned char *optpfddec32(unsigned *__restrict in, int n, unsigned *__restrict out);
-
-#ifdef __cplusplus
-}
-#endif
-
diff --git a/src/ext/for/ext/polycom/optpfd.c b/src/ext/for/ext/polycom/optpfd.c
deleted file mode 100644
index 84a4ac59..00000000
--- a/src/ext/for/ext/polycom/optpfd.c
+++ /dev/null
@@ -1,26 +0,0 @@
-#include <stdlib.h>
-#include "../OPT_PFD/opt_p4.h" // OptPFD
-
-#include "optpfd.h"
-#include "polyvbyte.h"
-unsigned char *optpfdenc32(unsigned *in, int n, unsigned char *out) {
- if(n < 128)
- out = vbpolyenc(in, n, out);
- else {
- unsigned tmp[OPTPFDMAX],i;
- for(i = 0; i < n; i++) tmp[i] = in[i];
- return out += OPT4(tmp, n, (unsigned *)out);
- }
- return out;
-}
-
-unsigned char *optpfddec32(unsigned char *in, int n, unsigned *out) {
- if(n < 128)
- in = vbpolydec(in, n, out);
- else {
- unsigned all_array[OPTPFDMAX];
- in = (unsigned char *)detailed_p4_decode(out, (unsigned *)in, all_array);
- }
- return in;
-}
-
diff --git a/src/ext/for/ext/polycom/optpfd.h b/src/ext/for/ext/polycom/optpfd.h
deleted file mode 100644
index 9ff838e7..00000000
--- a/src/ext/for/ext/polycom/optpfd.h
+++ /dev/null
@@ -1,11 +0,0 @@
-#ifdef __cplusplus
-extern "C" {
-#endif
-#define OPTPFDMAX 2048
-unsigned char *optpfdenc32(unsigned *in, int n, unsigned char *out);
-unsigned char *optpfddec32(unsigned char *in, int n, unsigned *out);
-
-#ifdef __cplusplus
-}
-#endif
-
diff --git a/src/ext/for/ext/polycom/polyvbyte.c b/src/ext/for/ext/polycom/polyvbyte.c
deleted file mode 100644
index 480180c5..00000000
--- a/src/ext/for/ext/polycom/polyvbyte.c
+++ /dev/null
@@ -1,14 +0,0 @@
-#include "vbyte_poly.h"
-#include "polyvbyte.h"
-
-unsigned char *vbpolyenc(unsigned *in, unsigned n, unsigned char *out) {
- unsigned i;
- for(i = 0; i < n; i++) { unsigned x = in[i]; VBYTE_ENC(out, x); }
- return out;
-}
-unsigned char *vbpolydec(unsigned char *in, unsigned n, unsigned *out) {
- unsigned i;
- for(i = 0; i < n; i++) { unsigned x; VBYTE_DEC(in, x); out[i] = x; }
- return in;
-}
-
diff --git a/src/ext/for/ext/polycom/polyvbyte.h b/src/ext/for/ext/polycom/polyvbyte.h
deleted file mode 100644
index f8b3a998..00000000
--- a/src/ext/for/ext/polycom/polyvbyte.h
+++ /dev/null
@@ -1,10 +0,0 @@
-#ifdef __cplusplus
-extern "C" {
-#endif
-unsigned char *vbpolyenc(unsigned *in, unsigned n, unsigned char *out);
-unsigned char *vbpolydec(unsigned char *in, unsigned n, unsigned *out);
-#ifdef __cplusplus
-}
-#endif
-
-
diff --git a/src/ext/for/ext/polycom/vbyte_poly.h b/src/ext/for/ext/polycom/vbyte_poly.h
deleted file mode 100644
index 3c2668d0..00000000
--- a/src/ext/for/ext/polycom/vbyte_poly.h
+++ /dev/null
@@ -1,46 +0,0 @@
-//
-#define VBYTE_ENC(_v, _n) \
-{\
- unsigned _num; \
- unsigned char _barray[5]; \
- unsigned _i, _started = 0; \
- _num = _n; \
- for (_i = 0; _i < 5; _i++) \
- { \
- _barray[_i] = ((_num%128)<<1); \
- _num = _num/128; \
- } \
- for (_i = 4; _i > 0; _i--) \
- { \
- if ((_barray[_i] != 0) || (_started == 1)) \
- { \
- _started = 1; \
- *_v = _barray[_i]|0x1; \
- _v++; \
- } \
- } \
- *_v = _barray[0]|0x0; \
- _v++; \
-}
-
-#define VBYTE_DEC(_v, _n) \
-{\
- _n = ((*_v>>1)); \
- if ((*_v&0x1) != 0) \
- { \
- _v++; \
- _n = (_n<<7) + ((*_v>>1)); \
- if ((*_v&0x1)!= 0) \
- { \
- _v++; \
- _n = (_n<<7) + ((*_v>>1)); \
- if ((*_v&0x1) != 0) \
- { \
- _v++; \
- _n = (_n<<7) + ((*_v>>1)); \
- }\
- }\
- }\
- _v++; \
-}
-
diff --git a/src/ext/for/ext/rc.c b/src/ext/for/ext/rc.c
deleted file mode 100644
index d7088fac..00000000
--- a/src/ext/for/ext/rc.c
+++ /dev/null
@@ -1,1809 +0,0 @@
-// Copyright (c) 2008, WEST, Polytechnic Institute of NYU.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-// * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-// * Neither the name of WEST, Polytechnic Institute of NYU. nor the names
-// of its contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Author: Torsten Suel, Jiangong Zhang, Jinru He
-//
-// If you have any questions or problems about our codes, please contact:
-// jhe@cis.poly.edu
-//
-//
-
-//#include "rice_coding2.h"
-//#include <stdio.h>
-
-/*rc_rice_coding2() {
- // TODO Auto-generated constructor stub
- cnum[0] = 0;
- cnum[1] = 1;
- cnum[2] = 2;
- cnum[3] = 3;
- cnum[4] = 4;
- cnum[5] = 5;
- cnum[6] = 6;
- cnum[7] = 7;
- cnum[8] = 8;
- cnum[9] = 9;
- cnum[10] = 10;
- cnum[11] = 11;
- cnum[12] = 12;
- cnum[13] = 13;
- cnum[14] = 16;
- cnum[15] = 20;
- cnum[16] = 32;
-}*/
-#define coding_type 3
-#define block_size 128
-
-static int cnum[] = { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,16,20,32 };
-
-/*rc_~rice_coding2() {
- // TODO Auto-generated destructor stub
-}*/
-
-/*int rc_get_type()
-{
- return coding_type;
-}
-
-void rc_set_size(int size)
-{
- this->block_size = size;
-}*/
-//void pack(unsigned int *v, unsigned int b, unsigned int n, unsigned int *w);
-#include "../bitpack.h"
-#include "rc.h"
-
- void setBit(unsigned char *buf, unsigned int *bp, unsigned int val)
- {
- unsigned int bPtr;
- unsigned int w;
-
- bPtr = (*bp)&7;
- if (bPtr == 0) buf[(*bp)>>3] = 0;
- if (val == 1) buf[(*bp)>>3] |= (1<<bPtr);
- (*bp)++;
- }
-
-/**********************
- * w: output buffer
- * buf: input buffer
- * bits: b value;
- * BS: block size
- */
-unsigned char *rc_turbo_rice_encode(unsigned *w, unsigned int **buf, unsigned int bits)
-{
- unsigned int bp;
- unsigned int val;
- unsigned int i;
- int s;
- unsigned int out[block_size];
-
- if (bits > 0)
- {
- s = ((bits * block_size)>>5);
- for (i = 0; i < s; i++) w[i] = 0;
- for (i = 0; i < block_size; i++) out[i] = (*buf)[i] & ((1u<<bits)-1);// MASK[bits];
- //pack(out, bits, block_size, *w);
... 33309 lines suppressed ...
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org