You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@orc.apache.org by om...@apache.org on 2016/08/08 21:19:18 UTC
[2/5] orc git commit: ORC-81 Add lzo and lz4 support for the C++
reader. (omalley)
http://git-wip-us.apache.org/repos/asf/orc/blob/9aba074b/c++/libs/lz4-r131/lib/xxhash.h
----------------------------------------------------------------------
diff --git a/c++/libs/lz4-r131/lib/xxhash.h b/c++/libs/lz4-r131/lib/xxhash.h
new file mode 100644
index 0000000..c60aa61
--- /dev/null
+++ b/c++/libs/lz4-r131/lib/xxhash.h
@@ -0,0 +1,192 @@
+/*
+ xxHash - Extremely Fast Hash algorithm
+ Header File
+ Copyright (C) 2012-2015, Yann Collet.
+
+ BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above
+ copyright notice, this list of conditions and the following disclaimer
+ in the documentation and/or other materials provided with the
+ distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ You can contact the author at :
+ - xxHash source repository : https://github.com/Cyan4973/xxHash
+*/
+
+/* Notice extracted from xxHash homepage :
+
+xxHash is an extremely fast Hash algorithm, running at RAM speed limits.
+It also successfully passes all tests from the SMHasher suite.
+
+Comparison (single thread, Windows Seven 32 bits, using SMHasher on a Core 2 Duo @3GHz)
+
+Name Speed Q.Score Author
+xxHash 5.4 GB/s 10
+CrapWow 3.2 GB/s 2 Andrew
+MurmurHash 3a 2.7 GB/s 10 Austin Appleby
+SpookyHash 2.0 GB/s 10 Bob Jenkins
+SBox 1.4 GB/s 9 Bret Mulvey
+Lookup3 1.2 GB/s 9 Bob Jenkins
+SuperFastHash 1.2 GB/s 1 Paul Hsieh
+CityHash64 1.05 GB/s 10 Pike & Alakuijala
+FNV 0.55 GB/s 5 Fowler, Noll, Vo
+CRC32 0.43 GB/s 9
+MD5-32 0.33 GB/s 10 Ronald L. Rivest
+SHA1-32 0.28 GB/s 10
+
+Q.Score is a measure of quality of the hash function.
+It depends on successfully passing SMHasher test set.
+10 is a perfect score.
+
+A 64-bits version, named XXH64, is available since r35.
+It offers much better speed, but for 64-bits applications only.
+Name Speed on 64 bits Speed on 32 bits
+XXH64 13.8 GB/s 1.9 GB/s
+XXH32 6.8 GB/s 6.0 GB/s
+*/
+
+#pragma once
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+/*****************************
+* Definitions
+*****************************/
+#include <stddef.h> /* size_t */
+typedef enum { XXH_OK=0, XXH_ERROR } XXH_errorcode;
+
+
+/*****************************
+* Namespace Emulation
+*****************************/
+/* Motivations :
+
+If you need to include xxHash into your library,
+but wish to avoid xxHash symbols to be present on your library interface
+in an effort to avoid potential name collision if another library also includes xxHash,
+
+you can use XXH_NAMESPACE, which will automatically prefix any symbol from xxHash
+with the value of XXH_NAMESPACE (so avoid to keep it NULL, and avoid numeric values).
+
+Note that no change is required within the calling program :
+it can still call xxHash functions using their regular name.
+They will be automatically translated by this header.
+*/
+#ifdef XXH_NAMESPACE
+# define XXH_CAT(A,B) A##B
+# define XXH_NAME2(A,B) XXH_CAT(A,B)
+# define XXH32 XXH_NAME2(XXH_NAMESPACE, XXH32)
+# define XXH64 XXH_NAME2(XXH_NAMESPACE, XXH64)
+# define XXH32_createState XXH_NAME2(XXH_NAMESPACE, XXH32_createState)
+# define XXH64_createState XXH_NAME2(XXH_NAMESPACE, XXH64_createState)
+# define XXH32_freeState XXH_NAME2(XXH_NAMESPACE, XXH32_freeState)
+# define XXH64_freeState XXH_NAME2(XXH_NAMESPACE, XXH64_freeState)
+# define XXH32_reset XXH_NAME2(XXH_NAMESPACE, XXH32_reset)
+# define XXH64_reset XXH_NAME2(XXH_NAMESPACE, XXH64_reset)
+# define XXH32_update XXH_NAME2(XXH_NAMESPACE, XXH32_update)
+# define XXH64_update XXH_NAME2(XXH_NAMESPACE, XXH64_update)
+# define XXH32_digest XXH_NAME2(XXH_NAMESPACE, XXH32_digest)
+# define XXH64_digest XXH_NAME2(XXH_NAMESPACE, XXH64_digest)
+#endif
+
+
+/*****************************
+* Simple Hash Functions
+*****************************/
+
+unsigned int XXH32 (const void* input, size_t length, unsigned seed);
+unsigned long long XXH64 (const void* input, size_t length, unsigned long long seed);
+
+/*
+XXH32() :
+ Calculate the 32-bits hash of a sequence of "length" bytes stored at memory address "input".
+ The memory between input & input+length must be valid (allocated and read-accessible).
+ "seed" can be used to alter the result predictably.
+ This function successfully passes all SMHasher tests.
+ Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark) : 5.4 GB/s
+XXH64() :
+ Calculate the 64-bits hash of a sequence of "length" bytes stored at memory address "input".
+ Faster on 64-bits systems. Slower on 32-bits systems.
+*/
+
+
+
+/*****************************
+* Advanced Hash Functions
+*****************************/
+typedef struct { long long ll[ 6]; } XXH32_state_t;
+typedef struct { long long ll[11]; } XXH64_state_t;
+
+/*
+These structures allow static allocation of XXH states.
+States must then be initialized using XXHnn_reset() before first use.
+
+If you prefer dynamic allocation, please refer to functions below.
+*/
+
+XXH32_state_t* XXH32_createState(void);
+XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr);
+
+XXH64_state_t* XXH64_createState(void);
+XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr);
+
+/*
+These functions create and release memory for XXH state.
+States must then be initialized using XXHnn_reset() before first use.
+*/
+
+
+XXH_errorcode XXH32_reset (XXH32_state_t* statePtr, unsigned seed);
+XXH_errorcode XXH32_update (XXH32_state_t* statePtr, const void* input, size_t length);
+unsigned int XXH32_digest (const XXH32_state_t* statePtr);
+
+XXH_errorcode XXH64_reset (XXH64_state_t* statePtr, unsigned long long seed);
+XXH_errorcode XXH64_update (XXH64_state_t* statePtr, const void* input, size_t length);
+unsigned long long XXH64_digest (const XXH64_state_t* statePtr);
+
+/*
+These functions calculate the xxHash of an input provided in multiple smaller packets,
+as opposed to an input provided as a single block.
+
+XXH state space must first be allocated, using either static or dynamic method provided above.
+
+Start a new hash by initializing state with a seed, using XXHnn_reset().
+
+Then, feed the hash state by calling XXHnn_update() as many times as necessary.
+Obviously, input must be valid, meaning allocated and read accessible.
+The function returns an error code, with 0 meaning OK, and any other value meaning there is an error.
+
+Finally, you can produce a hash anytime, by using XXHnn_digest().
+This function returns the final nn-bits hash.
+You can nonetheless continue feeding the hash state with more input,
+and therefore get some new hashes, by calling again XXHnn_digest().
+
+When you are done, don't forget to free XXH state space, using typically XXHnn_freeState().
+*/
+
+
+#if defined (__cplusplus)
+}
+#endif
http://git-wip-us.apache.org/repos/asf/orc/blob/9aba074b/c++/libs/lz4-r131/lz4_Block_format.md
----------------------------------------------------------------------
diff --git a/c++/libs/lz4-r131/lz4_Block_format.md b/c++/libs/lz4-r131/lz4_Block_format.md
new file mode 100644
index 0000000..ea568d8
--- /dev/null
+++ b/c++/libs/lz4-r131/lz4_Block_format.md
@@ -0,0 +1,127 @@
+LZ4 Block Format Description
+============================
+Last revised: 2015-05-07.
+Author : Yann Collet
+
+
+This specification is intended for developers
+willing to produce LZ4-compatible compressed data blocks
+using any programming language.
+
+LZ4 is an LZ77-type compressor with a fixed, byte-oriented encoding.
+There is no entropy encoder back-end nor framing layer.
+The latter is assumed to be handled by other parts of the system (see [LZ4 Frame format]).
+This design is assumed to favor simplicity and speed.
+It helps later on for optimizations, compactness, and features.
+
+This document describes only the block format,
+not how the compressor nor decompressor actually work.
+The correctness of the decompressor should not depend
+on implementation details of the compressor, and vice versa.
+
+[LZ4 Frame format]: LZ4_Frame_format.md
+
+
+
+Compressed block format
+-----------------------
+An LZ4 compressed block is composed of sequences.
+A sequence is a suite of literals (not-compressed bytes),
+followed by a match copy.
+
+Each sequence starts with a token.
+The token is a one byte value, separated into two 4-bits fields.
+Therefore each field ranges from 0 to 15.
+
+
+The first field uses the 4 high-bits of the token.
+It provides the length of literals to follow.
+
+If the field value is 0, then there is no literal.
+If it is 15, then we need to add some more bytes to indicate the full length.
+Each additional byte then represents a value from 0 to 255,
+which is added to the previous value to produce a total length.
+When the byte value is 255, another byte is output.
+There can be any number of bytes following the token. There is no "size limit".
+(Side note : this is why a not-compressible input block is expanded by 0.4%).
+
+Example 1 : A length of 48 will be represented as :
+
+ - 15 : value for the 4-bits High field
+ - 33 : (=48-15) remaining length to reach 48
+
+Example 2 : A length of 280 will be represented as :
+
+ - 15 : value for the 4-bits High field
+ - 255 : following byte is maxed, since 280-15 >= 255
+ - 10 : (=280 - 15 - 255) remaining length to reach 280
+
+Example 3 : A length of 15 will be represented as :
+
+ - 15 : value for the 4-bits High field
+ - 0 : (=15-15) yes, the zero must be output
+
+Following the token and optional length bytes, are the literals themselves.
+They are exactly as numerous as previously decoded (length of literals).
+It's possible that there are zero literals.
+
+
+Following the literals is the match copy operation.
+
+It starts with the offset.
+This is a 2 bytes value, in little endian format
+(the 1st byte is the "low" byte, the 2nd one is the "high" byte).
+
+The offset represents the position of the match to be copied from.
+1 means "current position - 1 byte".
+The maximum offset value is 65535, 65536 cannot be coded.
+Note that 0 is an invalid value, not used.
+
+Then we need to extract the match length.
+For this, we use the second token field, the low 4-bits.
+Value, obviously, ranges from 0 to 15.
+However here, 0 means that the copy operation will be minimal.
+The minimum length of a match, called minmatch, is 4.
+As a consequence, a 0 value means 4 bytes, and a value of 15 means 19+ bytes.
+Similar to literal length, on reaching the highest possible value (15),
+we output additional bytes, one at a time, with values ranging from 0 to 255.
+They are added to total to provide the final match length.
+A 255 value means there is another byte to read and add.
+There is no limit to the number of optional bytes that can be output this way.
+(This points towards a maximum achievable compression ratio of about 250).
+
+With the offset and the matchlength,
+the decoder can now proceed to copy the data from the already decoded buffer.
+On decoding the matchlength, we reach the end of the compressed sequence,
+and therefore start another one.
+
+
+Parsing restrictions
+-----------------------
+There are specific parsing rules to respect in order to remain compatible
+with assumptions made by the decoder :
+
+1. The last 5 bytes are always literals
+2. The last match must start at least 12 bytes before end of block.
+ Consequently, a block with less than 13 bytes cannot be compressed.
+
+These rules are in place to ensure that the decoder
+will never read beyond the input buffer, nor write beyond the output buffer.
+
+Note that the last sequence is also incomplete,
+and stops right after literals.
+
+
+Additional notes
+-----------------------
+There is no assumption nor limits to the way the compressor
+searches and selects matches within the source data block.
+It could be a fast scan, a multi-probe, a full search using BST,
+standard hash chains or MMC, well whatever.
+
+Advanced parsing strategies can also be implemented, such as lazy match,
+or full optimal parsing.
+
+All these trade-offs offer distinctive speed/memory/compression advantages.
+Whatever the method used by the compressor, its result will be decodable
+by any LZ4 decoder if it follows the format specification described above.
http://git-wip-us.apache.org/repos/asf/orc/blob/9aba074b/c++/libs/lz4-r131/lz4_Frame_format.md
----------------------------------------------------------------------
diff --git a/c++/libs/lz4-r131/lz4_Frame_format.md b/c++/libs/lz4-r131/lz4_Frame_format.md
new file mode 100644
index 0000000..73d3cba
--- /dev/null
+++ b/c++/libs/lz4-r131/lz4_Frame_format.md
@@ -0,0 +1,385 @@
+LZ4 Frame Format Description
+============================
+
+### Notices
+
+Copyright (c) 2013-2015 Yann Collet
+
+Permission is granted to copy and distribute this document
+for any purpose and without charge,
+including translations into other languages
+and incorporation into compilations,
+provided that the copyright notice and this notice are preserved,
+and that any substantive changes or deletions from the original
+are clearly marked.
+Distribution of this document is unlimited.
+
+### Version
+
+1.5.1 (31/03/2015)
+
+
+Introduction
+------------
+
+The purpose of this document is to define a lossless compressed data format,
+that is independent of CPU type, operating system,
+file system and character set, suitable for
+File compression, Pipe and streaming compression
+using the [LZ4 algorithm](http://www.lz4.info).
+
+The data can be produced or consumed,
+even for an arbitrarily long sequentially presented input data stream,
+using only an a priori bounded amount of intermediate storage,
+and hence can be used in data communications.
+The format uses the LZ4 compression method,
+and optional [xxHash-32 checksum method](https://github.com/Cyan4973/xxHash),
+for detection of data corruption.
+
+The data format defined by this specification
+does not attempt to allow random access to compressed data.
+
+This specification is intended for use by implementers of software
+to compress data into LZ4 format and/or decompress data from LZ4 format.
+The text of the specification assumes a basic background in programming
+at the level of bits and other primitive data representations.
+
+Unless otherwise indicated below,
+a compliant compressor must produce data sets
+that conform to the specifications presented here.
+It doesn’t need to support all options though.
+
+A compliant decompressor must be able to decompress
+at least one working set of parameters
+that conforms to the specifications presented here.
+It may also ignore checksums.
+Whenever it does not support a specific parameter within the compressed stream,
+it must produce a non-ambiguous error code
+and associated error message explaining which parameter is unsupported.
+
+
+General Structure of LZ4 Frame format
+-------------------------------------
+
+| MagicNb | F. Descriptor | Block | (...) | EndMark | C. Checksum |
+|:-------:|:-------------:| ----- | ----- | ------- | ----------- |
+| 4 bytes | 3-11 bytes | | | 4 bytes | 4 bytes |
+
+__Magic Number__
+
+4 Bytes, Little endian format.
+Value : 0x184D2204
+
+__Frame Descriptor__
+
+3 to 11 Bytes, to be detailed in the next part.
+Most important part of the spec.
+
+__Data Blocks__
+
+To be detailed later on.
+That’s where compressed data is stored.
+
+__EndMark__
+
+The flow of blocks ends when the last data block has a size of “0”.
+The size is expressed as a 32-bits value.
+
+__Content Checksum__
+
+Content Checksum verifies that the full content has been decoded correctly.
+The content checksum is the result
+of [xxh32() hash function](https://github.com/Cyan4973/xxHash)
+digesting the original (decoded) data as input, and a seed of zero.
+Content checksum is only present when its associated flag
+is set in the frame descriptor.
+Content Checksum validates the result,
+that all blocks were fully transmitted in the correct order and without error,
+and also that the encoding/decoding process itself generated no distortion.
+Its usage is recommended.
+
+__Frame Concatenation__
+
+In some circumstances, it may be preferable to append multiple frames,
+for example in order to add new data to an existing compressed file
+without re-framing it.
+
+In such case, each frame has its own set of descriptor flags.
+Each frame is considered independent.
+The only relation between frames is their sequential order.
+
+The ability to decode multiple concatenated frames
+within a single stream or file
+is left outside of this specification.
+As an example, the reference lz4 command line utility behavior is
+to decode all concatenated frames in their sequential order.
+
+
+Frame Descriptor
+----------------
+
+| FLG | BD | (Content Size) | HC |
+| ------- | ------- |:--------------:| ------- |
+| 1 byte | 1 byte | 0 - 8 bytes | 1 byte |
+
+The descriptor uses a minimum of 3 bytes,
+and up to 11 bytes depending on optional parameters.
+
+__FLG byte__
+
+| BitNb | 7-6 | 5 | 4 | 3 | 2 | 1-0 |
+| ------- | ------- | ------- | --------- | ------- | --------- | -------- |
+|FieldName| Version | B.Indep | B.Checksum| C.Size | C.Checksum|*Reserved*|
+
+
+__BD byte__
+
+| BitNb | 7 | 6-5-4 | 3-2-1-0 |
+| ------- | -------- | ------------ | -------- |
+|FieldName|*Reserved*| Block MaxSize|*Reserved*|
+
+In the tables, bit 7 is highest bit, while bit 0 is lowest.
+
+__Version Number__
+
+2-bits field, must be set to “01”.
+Any other value cannot be decoded by this version of the specification.
+Other version numbers will use different flag layouts.
+
+__Block Independence flag__
+
+If this flag is set to “1”, blocks are independent.
+If this flag is set to “0”, each block depends on previous ones
+(up to LZ4 window size, which is 64 KB).
+In such case, it’s necessary to decode all blocks in sequence.
+
+Block dependency improves compression ratio, especially for small blocks.
+On the other hand, it makes direct jumps or multi-threaded decoding impossible.
+
+__Block checksum flag__
+
+If this flag is set, each data block will be followed by a 4-bytes checksum,
+calculated by using the xxHash-32 algorithm on the raw (compressed) data block.
+The intention is to detect data corruption (storage or transmission errors)
+immediately, before decoding.
+Block checksum usage is optional.
+
+__Content Size flag__
+
+If this flag is set, the uncompressed size of data included within the frame
+will be present as an 8 bytes unsigned little endian value, after the flags.
+Content Size usage is optional.
+
+__Content checksum flag__
+
+If this flag is set, a content checksum will be appended after the EndMark.
+
+Recommended value : “1” (content checksum is present)
+
+__Block Maximum Size__
+
+This information is intended to help the decoder allocate memory.
+Size here refers to the original (uncompressed) data size.
+Block Maximum Size is one value among the following table :
+
+| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
+| --- | --- | --- | --- | ----- | ------ | ---- | ---- |
+| N/A | N/A | N/A | N/A | 64 KB | 256 KB | 1 MB | 4 MB |
+
+The decoder may refuse to allocate block sizes above a (system-specific) size.
+Unused values may be used in a future revision of the spec.
+A decoder conformant to the current version of the spec
+is only able to decode blocksizes defined in this spec.
+
+__Reserved bits__
+
+Value of reserved bits **must** be 0 (zero).
+Reserved bits might be used in a future version of the specification,
+typically enabling new optional features.
+If this happens, a decoder respecting the current version of the specification
+shall not be able to decode such a frame.
+
+__Content Size__
+
+This is the original (uncompressed) size.
+This information is optional, and only present if the associated flag is set.
+Content size is provided using unsigned 8 Bytes, for a maximum of 16 Exabytes.
+Format is Little endian.
+This value is informational, typically for display or memory allocation.
+It can be skipped by a decoder, or used to validate content correctness.
+
+__Header Checksum__
+
+One-byte checksum of combined descriptor fields, including optional ones.
+The value is the second byte of xxh32() : ` (xxh32()>>8) & 0xFF `
+using zero as a seed,
+and the full Frame Descriptor as an input
+(including optional fields when they are present).
+A wrong checksum indicates an error in the descriptor.
+Header checksum is informational and can be skipped.
+
+
+Data Blocks
+-----------
+
+| Block Size | data | (Block Checksum) |
+|:----------:| ------ |:----------------:|
+| 4 bytes | | 0 - 4 bytes |
+
+
+__Block Size__
+
+This field uses 4-bytes, format is little-endian.
+
+The highest bit is “1” if data in the block is uncompressed.
+
+The highest bit is “0” if data in the block is compressed by LZ4.
+
+All other bits give the size, in bytes, of the following data block
+(the size does not include the block checksum if present).
+
+Block Size shall never be larger than Block Maximum Size.
+Such a thing could happen for incompressible source data.
+In such case, such a data block shall be passed in uncompressed format.
+
+__Data__
+
+Where the actual data to decode stands.
+It might be compressed or not, depending on previous field indications.
+Uncompressed size of Data can be any size, up to “block maximum size”.
+Note that data block is not necessarily full :
+an arbitrary “flush” may happen anytime. Any block can be “partially filled”.
+
+__Block checksum__
+
+Only present if the associated flag is set.
+This is a 4-bytes checksum value, in little endian format,
+calculated by using the xxHash-32 algorithm on the raw (undecoded) data block,
+and a seed of zero.
+The intention is to detect data corruption (storage or transmission errors)
+before decoding.
+
+Block checksum is cumulative with Content checksum.
+
+
+Skippable Frames
+----------------
+
+| Magic Number | Frame Size | User Data |
+|:------------:|:----------:| --------- |
+| 4 bytes | 4 bytes | |
+
+Skippable frames allow the integration of user-defined data
+into a flow of concatenated frames.
+Its design is pretty straightforward,
+with the sole objective to allow the decoder to quickly skip
+over user-defined data and continue decoding.
+
+For the purpose of facilitating identification,
+it is discouraged to start a flow of concatenated frames with a skippable frame.
+If there is a need to start such a flow with some user data
+encapsulated into a skippable frame,
+it’s recommended to start with a zero-byte LZ4 frame
+followed by a skippable frame.
+This will make it easier for file type identifiers.
+
+
+__Magic Number__
+
+4 Bytes, Little endian format.
+Value : 0x184D2A5X, which means any value from 0x184D2A50 to 0x184D2A5F.
+All 16 values are valid to identify a skippable frame.
+
+__Frame Size__
+
+This is the size, in bytes, of the following User Data
+(without including the magic number nor the size field itself).
+4 Bytes, Little endian format, unsigned 32-bits.
+This means User Data can’t be bigger than (2^32-1) Bytes.
+
+__User Data__
+
+User Data can be anything. Data will just be skipped by the decoder.
+
+
+Legacy frame
+------------
+
+The Legacy frame format was defined in the initial versions of “LZ4Demo”.
+Newer compressors should not use this format anymore, as it is too restrictive.
+
+Main characteristics of the legacy format :
+
+- Fixed block size : 8 MB.
+- All blocks must be completely filled, except the last one.
+- All blocks are always compressed, even when compression is detrimental.
+- The last block is detected either because
+ it is followed by the “EOF” (End of File) mark,
+ or because it is followed by a known Frame Magic Number.
+- No checksum
+- Convention is Little endian
+
+| MagicNb | B.CSize | CData | B.CSize | CData | (...) | EndMark |
+| ------- | ------- | ----- | ------- | ----- | ------- | ------- |
+| 4 bytes | 4 bytes | CSize | 4 bytes | CSize | x times | EOF |
+
+
+__Magic Number__
+
+4 Bytes, Little endian format.
+Value : 0x184C2102
+
+__Block Compressed Size__
+
+This is the size, in bytes, of the following compressed data block.
+4 Bytes, Little endian format.
+
+__Data__
+
+Where the actual compressed data stands.
+Data is always compressed, even when compression is detrimental.
+
+__EndMark__
+
+End of legacy frame is implicit only.
+It must be followed by a standard EOF (End Of File) signal,
+whether it is a file or a stream.
+
+Alternatively, if the frame is followed by a valid Frame Magic Number,
+it is considered completed.
+It makes legacy frames compatible with frame concatenation.
+
+Any other value will be interpreted as a block size,
+and trigger an error if it does not fit within acceptable range.
+
+
+Version changes
+---------------
+
+1.5.1 : changed format to MarkDown compatible
+
+1.5 : removed Dictionary ID from specification
+
+1.4.1 : changed wording from “stream” to “frame”
+
+1.4 : added skippable streams, re-added stream checksum
+
+1.3 : modified header checksum
+
+1.2 : reduced choice of “block size”, to postpone decision on “dynamic size of BlockSize Field”.
+
+1.1 : optional fields are now part of the descriptor
+
+1.0 : changed “block size” specification, adding a compressed/uncompressed flag
+
+0.9 : reduced scale of “block maximum size” table
+
+0.8 : removed : high compression flag
+
+0.7 : removed : stream checksum
+
+0.6 : settled : stream size uses 8 bytes, endian convention is little endian
+
+0.5: added copyright notice
+
+0.4 : changed format to Google Doc compatible OpenDocument
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/orc/blob/9aba074b/c++/src/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/c++/src/CMakeLists.txt b/c++/src/CMakeLists.txt
index 75782f2..9e7cfa6 100644
--- a/c++/src/CMakeLists.txt
+++ b/c++/src/CMakeLists.txt
@@ -113,6 +113,7 @@ include_directories (
${PROTOBUF_INCLUDE_DIRS}
${ZLIB_INCLUDE_DIRS}
${SNAPPY_INCLUDE_DIRS}
+ ${LZ4_INCLUDE_DIRS}
)
add_custom_command(OUTPUT orc_proto.pb.h orc_proto.pb.cc
@@ -132,6 +133,7 @@ add_library (orc STATIC
Compression.cc
Exceptions.cc
Int128.cc
+ LzoDecompressor.cc
MemoryPool.cc
OrcFile.cc
Reader.cc
@@ -149,6 +151,7 @@ target_link_libraries (orc
${PROTOBUF_LIBRARIES}
${ZLIB_LIBRARIES}
${SNAPPY_LIBRARIES}
+ ${LZ4_LIBRARIES}
)
add_dependencies(orc protoc)
http://git-wip-us.apache.org/repos/asf/orc/blob/9aba074b/c++/src/Compression.cc
----------------------------------------------------------------------
diff --git a/c++/src/Compression.cc b/c++/src/Compression.cc
index 9bf8aaa..81cc578 100644
--- a/c++/src/Compression.cc
+++ b/c++/src/Compression.cc
@@ -19,6 +19,8 @@
#include "Adaptor.hh"
#include "Compression.hh"
#include "Exceptions.hh"
+#include "LzoDecompressor.hh"
+#include "lz4.h"
#include <algorithm>
#include <iomanip>
@@ -496,19 +498,27 @@ DIAGNOSTIC_POP
return result.str();
}
- class SnappyDecompressionStream: public SeekableInputStream {
+ class BlockDecompressionStream: public SeekableInputStream {
public:
- SnappyDecompressionStream(std::unique_ptr<SeekableInputStream> inStream,
- size_t blockSize,
- MemoryPool& pool);
+ BlockDecompressionStream(std::unique_ptr<SeekableInputStream> inStream,
+ size_t blockSize,
+ MemoryPool& pool);
- virtual ~SnappyDecompressionStream() {}
+ virtual ~BlockDecompressionStream() {}
virtual bool Next(const void** data, int*size) override;
virtual void BackUp(int count) override;
virtual bool Skip(int count) override;
virtual int64_t ByteCount() const override;
virtual void seek(PositionProvider& position) override;
- virtual std::string getName() const override;
+ virtual std::string getName() const override = 0;
+
+ protected:
+ virtual uint64_t decompress(const char *input, uint64_t length,
+ char *output, size_t maxOutputLength) = 0;
+
+ std::string getStreamName() const {
+ return input->getName();
+ }
private:
void readBuffer(bool failOnEof) {
@@ -516,7 +526,7 @@ DIAGNOSTIC_POP
if (!input->Next(reinterpret_cast<const void**>(&inputBufferPtr),
&length)) {
if (failOnEof) {
- throw ParseError("SnappyDecompressionStream read past EOF");
+ throw ParseError(getName() + "read past EOF");
}
state = DECOMPRESS_EOF;
inputBufferPtr = nullptr;
@@ -581,7 +591,7 @@ DIAGNOSTIC_POP
off_t bytesReturned;
};
- SnappyDecompressionStream::SnappyDecompressionStream
+ BlockDecompressionStream::BlockDecompressionStream
(std::unique_ptr<SeekableInputStream> inStream,
size_t bufferSize,
MemoryPool& _pool
@@ -598,7 +608,7 @@ DIAGNOSTIC_POP
input.reset(inStream.release());
}
- bool SnappyDecompressionStream::Next(const void** data, int*size) {
+ bool BlockDecompressionStream::Next(const void** data, int*size) {
// if the user pushed back, return them the partial buffer
if (outputBufferLength) {
*data = outputBufferPtr;
@@ -645,7 +655,8 @@ DIAGNOSTIC_POP
for (size_t pos = availSize; pos < remainingLength; ) {
readBuffer(true);
size_t avail =
- std::min(static_cast<size_t>(inputBufferPtrEnd - inputBufferPtr),
+ std::min(static_cast<size_t>(inputBufferPtrEnd -
+ inputBufferPtr),
remainingLength - pos);
::memcpy(inputBuffer.data() + pos, inputBufferPtr, avail);
pos += avail;
@@ -653,19 +664,9 @@ DIAGNOSTIC_POP
}
}
- if (!snappy::GetUncompressedLength(compressed, remainingLength,
- &outputBufferLength)) {
- throw ParseError("SnappyDecompressionStream choked on corrupt input");
- }
-
- if (outputBufferLength > outputBuffer.capacity()) {
- throw std::logic_error("uncompressed length exceeds block size");
- }
-
- if (!snappy::RawUncompress(compressed, remainingLength,
- outputBuffer.data())) {
- throw ParseError("SnappyDecompressionStream choked on corrupt input");
- }
+ outputBufferLength = decompress(compressed, remainingLength,
+ outputBuffer.data(),
+ outputBuffer.capacity());
remainingLength = 0;
state = DECOMPRESS_HEADER;
@@ -679,17 +680,16 @@ DIAGNOSTIC_POP
return true;
}
- void SnappyDecompressionStream::BackUp(int count) {
+ void BlockDecompressionStream::BackUp(int count) {
if (outputBufferPtr == nullptr || outputBufferLength != 0) {
- throw std::logic_error("Backup without previous Next in "
- "SnappyDecompressionStream");
+ throw std::logic_error("Backup without previous Next in "+getName());
}
outputBufferPtr -= static_cast<size_t>(count);
outputBufferLength = static_cast<size_t>(count);
bytesReturned -= count;
}
- bool SnappyDecompressionStream::Skip(int count) {
+ bool BlockDecompressionStream::Skip(int count) {
bytesReturned += count;
// this is a stupid implementation for now.
// should skip entire blocks without decompressing
@@ -709,21 +709,126 @@ DIAGNOSTIC_POP
return true;
}
- int64_t SnappyDecompressionStream::ByteCount() const {
+ int64_t BlockDecompressionStream::ByteCount() const {
return bytesReturned;
}
- void SnappyDecompressionStream::seek(PositionProvider& position) {
+ void BlockDecompressionStream::seek(PositionProvider& position) {
input->seek(position);
if (!Skip(static_cast<int>(position.next()))) {
- throw ParseError("Bad skip in SnappyDecompressionStream::seek");
+ throw ParseError("Bad skip in " + getName());
}
}
- std::string SnappyDecompressionStream::getName() const {
- std::ostringstream result;
- result << "snappy(" << input->getName() << ")";
- return result.str();
+ /**
+  * A BlockDecompressionStream whose blocks are decoded with Snappy.
+  */
+ class SnappyDecompressionStream: public BlockDecompressionStream {
+ public:
+   SnappyDecompressionStream(std::unique_ptr<SeekableInputStream> inStream,
+                             size_t blockSize,
+                             MemoryPool& pool)
+       : BlockDecompressionStream(std::move(inStream), blockSize, pool) {
+     // all state lives in the base class
+   }
+
+   // Reports the value of getStreamName() wrapped in "snappy(...)".
+   std::string getName() const override {
+     std::ostringstream name;
+     name << "snappy(" << getStreamName() << ")";
+     return name.str();
+   }
+
+ protected:
+   // Decodes one compressed block; defined out of line below.
+   uint64_t decompress(const char *input, uint64_t length,
+                       char *output, size_t maxOutputLength) override;
+ };
+
+ /**
+  * Decompress one Snappy block.
+  * Throws ParseError when Snappy rejects the input, and std::logic_error
+  * when the length Snappy reports is larger than maxOutputLength.
+  * Returns the uncompressed length reported by Snappy.
+  */
+ uint64_t SnappyDecompressionStream::decompress(const char *input,
+                                                uint64_t length,
+                                                char *output,
+                                                size_t maxOutputLength) {
+   size_t uncompressedSize;
+   const bool haveSize =
+       snappy::GetUncompressedLength(input, length, &uncompressedSize);
+   if (!haveSize) {
+     throw ParseError("SnappyDecompressionStream choked on corrupt input");
+   }
+
+   // refuse to decode anything that would not fit in the caller's buffer
+   if (maxOutputLength < uncompressedSize) {
+     throw std::logic_error("Snappy length exceeds block size");
+   }
+
+   const bool decoded = snappy::RawUncompress(input, length, output);
+   if (!decoded) {
+     throw ParseError("SnappyDecompressionStream choked on corrupt input");
+   }
+   return uncompressedSize;
+ }
+
+ /**
+  * A BlockDecompressionStream whose blocks are decoded with LZO.
+  */
+ class LzoDecompressionStream: public BlockDecompressionStream {
+ public:
+   LzoDecompressionStream(std::unique_ptr<SeekableInputStream> inStream,
+                          size_t blockSize,
+                          MemoryPool& pool)
+       : BlockDecompressionStream(std::move(inStream), blockSize, pool) {
+     // all state lives in the base class
+   }
+
+   // Reports the value of getStreamName() wrapped in "lzo(...)".
+   std::string getName() const override {
+     std::ostringstream name;
+     name << "lzo(" << getStreamName() << ")";
+     return name.str();
+   }
+
+ protected:
+   // Decodes one compressed block; defined out of line below.
+   uint64_t decompress(const char *input, uint64_t length,
+                       char *output, size_t maxOutputLength) override;
+ };
+
+ /**
+  * Decompress one LZO block by handing the [begin, end) ranges of both
+  * buffers to lzoDecompress and returning its result.
+  */
+ uint64_t LzoDecompressionStream::decompress(const char *input,
+                                             uint64_t length,
+                                             char *output,
+                                             size_t maxOutputLength) {
+   const char *inputEnd = input + length;
+   char *outputEnd = output + maxOutputLength;
+   return lzoDecompress(input, inputEnd, output, outputEnd);
+ }
+
+ /**
+  * A BlockDecompressionStream whose blocks are decoded with LZ4.
+  */
+ class Lz4DecompressionStream: public BlockDecompressionStream {
+ public:
+   Lz4DecompressionStream(std::unique_ptr<SeekableInputStream> inStream,
+                          size_t blockSize,
+                          MemoryPool& pool)
+       : BlockDecompressionStream(std::move(inStream), blockSize, pool) {
+     // all state lives in the base class
+   }
+
+   // Reports the value of getStreamName() wrapped in "lz4(...)".
+   std::string getName() const override {
+     std::ostringstream name;
+     name << "lz4(" << getStreamName() << ")";
+     return name.str();
+   }
+
+ protected:
+   // Decodes one compressed block; defined out of line below.
+   uint64_t decompress(const char *input, uint64_t length,
+                       char *output, size_t maxOutputLength) override;
+ };
+
+ /**
+  * Decompress one LZ4 block with LZ4_decompress_safe.
+  * @param input the compressed bytes
+  * @param length the number of compressed bytes
+  * @param output the buffer that receives the decompressed bytes
+  * @param maxOutputLength the capacity of the output buffer
+  * @return the number of bytes written to output
+  * @throws ParseError if LZ4_decompress_safe returns a negative result
+  */
+ uint64_t Lz4DecompressionStream::decompress(const char *input,
+                                             uint64_t length,
+                                             char *output,
+                                             size_t maxOutputLength) {
+   int result = LZ4_decompress_safe(input, output, static_cast<int>(length),
+                                    static_cast<int>(maxOutputLength));
+   if (result < 0) {
+     // Throw by value. "throw new ParseError" would raise a ParseError*,
+     // which no handler for ParseError or std::exception matches and which
+     // leaks the allocated exception object.
+     throw ParseError(getName() + " - failed to decompress");
+   }
+   return static_cast<uint64_t>(result);
 }
std::unique_ptr<SeekableInputStream>
@@ -741,8 +846,16 @@ DIAGNOSTIC_POP
return std::unique_ptr<SeekableInputStream>
(new SnappyDecompressionStream(std::move(input), blockSize, pool));
case CompressionKind_LZO:
- default:
- throw NotImplementedYet("compression codec");
+ return std::unique_ptr<SeekableInputStream>
+ (new LzoDecompressionStream(std::move(input), blockSize, pool));
+ case CompressionKind_LZ4:
+ return std::unique_ptr<SeekableInputStream>
+ (new Lz4DecompressionStream(std::move(input), blockSize, pool));
+ default: {
+ std::ostringstream buffer;
+ buffer << "Unknown compression codec " << kind;
+ throw NotImplementedYet(buffer.str());
+ }
}
}
http://git-wip-us.apache.org/repos/asf/orc/blob/9aba074b/c++/src/LzoDecompressor.cc
----------------------------------------------------------------------
diff --git a/c++/src/LzoDecompressor.cc b/c++/src/LzoDecompressor.cc
new file mode 100644
index 0000000..e86ab3c
--- /dev/null
+++ b/c++/src/LzoDecompressor.cc
@@ -0,0 +1,391 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Adaptor.hh"
+#include "Compression.hh"
+#include "Exceptions.hh"
+
+#include <string>
+
+namespace orc {
+
+ static const int32_t DEC_32_TABLE[] = {4, 1, 2, 1, 4, 4, 4, 4};
+ static const int32_t DEC_64_TABLE[] = {0, 0, 0, -1, 0, 1, 2, 3};
+
+ static const int32_t SIZE_OF_SHORT = 2;
+ static const int32_t SIZE_OF_INT = 4;
+ static const int32_t SIZE_OF_LONG = 8;
+
+ // Format val as hexadecimal digits behind a "0x" prefix.
+ static std::string toHex(uint64_t val) {
+   std::ostringstream formatter;
+   formatter << std::hex;
+   formatter << "0x" << val;
+   return formatter.str();
+ }
+
+ // Format val as a decimal string.
+ static std::string toString(int64_t val) {
+   std::ostringstream formatter;
+   formatter << val;
+   std::string text = formatter.str();
+   return text;
+ }
+
+ /**
+  * ParseError thrown by the LZO decoder. The message always contains the
+  * offset passed by the thrower, optionally preceded by a description.
+  */
+ class MalformedInputException: public ParseError {
+ public:
+   // Message: "MalformedInputException at <off>"
+   MalformedInputException(int64_t off)
+       : ParseError("MalformedInputException at " + toString(off)) {
+   }
+
+   // Message: "MalformedInputException <msg> at <off>"
+   MalformedInputException(int64_t off, const std::string& msg)
+       : ParseError("MalformedInputException " + msg +
+                    " at " + toString(off)) {
+   }
+
+   // A copy carries the same what() text as the original.
+   MalformedInputException(const MalformedInputException& other)
+       : ParseError(other.what()) {
+   }
+
+   virtual ~MalformedInputException();
+ };
+
+ // Defined out of line; there is nothing to release.
+ MalformedInputException::~MalformedInputException() {
+   // PASS
+ }
+
+ /**
+  * Decompress LZO-compressed bytes into the output buffer.
+  *
+  * The input may hold several concatenated LZO blocks; decoding continues
+  * until the input is consumed. Every block has to end with the stop
+  * command 0x11 followed by two zero bytes.
+  *
+  * @param inputAddress the start of the input
+  * @param inputLimit one past the last byte of the input
+  * @param outputAddress the start of the output buffer
+  * @param outputLimit one past the last byte of the output buffer
+  * @return the number of bytes written to the output buffer
+  * @throws MalformedInputException if the input is truncated, contains an
+  *         invalid command, or would read or write outside either buffer
+  */
+ uint64_t lzoDecompress(const char *inputAddress,
+                        const char *inputLimit,
+                        char *outputAddress,
+                        char *outputLimit) {
+   // nothing compresses to nothing
+   if (inputAddress == inputLimit) {
+     return 0;
+   }
+
+   // maximum offset in buffers to which it's safe to write long-at-a-time
+   char * const fastOutputLimit = outputLimit - SIZE_OF_LONG;
+
+   // LZO can concat two blocks together so, decode until the input data is
+   // consumed
+   const char *input = inputAddress;
+   char *output = outputAddress;
+   while (input < inputLimit) {
+     //
+     // Note: For safety some of the code below may stop decoding early or
+     // skip decoding, because input is not available. This makes the code
+     // safe, and since LZO requires an explicit "stop" command, the decoder
+     // will still throw an exception.
+     //
+
+     bool firstCommand = true;
+     uint32_t lastLiteralLength = 0;
+     while (true) {
+       if (input >= inputLimit) {
+         throw MalformedInputException(input - inputAddress);
+       }
+       uint32_t command = *(input++) & 0xFF;
+       if (command == 0x11) {
+         break;
+       }
+
+       // Commands are described using a bit pattern notation:
+       // 0: bit is not set
+       // 1: bit is set
+       // L: part of literal length
+       // P: part of match offset position
+       // M: part of match length
+       // ?: see documentation in command decoder
+
+       int32_t matchLength;
+       int32_t matchOffset;
+       uint32_t literalLength;
+       if ((command & 0xf0) == 0) {
+         if (lastLiteralLength == 0) {
+           // 0b0000_LLLL (0bLLLL_LLLL)*
+
+           // copy offset :: fixed
+           // 0
+           matchOffset = 0;
+
+           // copy length :: fixed
+           // 0
+           matchLength = 0;
+
+           // literal length - 3 :: variable bits :: valid range [4..]
+           // 3 + variableLength(command bits [0..3], 4)
+           literalLength = command & 0xf;
+           if (literalLength == 0) {
+             literalLength = 0xf;
+
+             uint32_t nextByte = 0;
+             while (input < inputLimit &&
+                    (nextByte = *(input++) & 0xFF) == 0) {
+               literalLength += 0xff;
+             }
+             literalLength += nextByte;
+           }
+           literalLength += 3;
+         } else if (lastLiteralLength <= 3) {
+           // 0b0000_PPLL 0bPPPP_PPPP
+
+           // copy length: fixed
+           // 3
+           matchLength = 3;
+
+           // copy offset :: 12 bits :: valid range [2048..3071]
+           // [0..1] from command [2..3]
+           // [2..9] from trailer [0..7]
+           // [10] unset
+           // [11] set
+           if (input >= inputLimit) {
+             throw MalformedInputException(input - inputAddress);
+           }
+           matchOffset = (command & 0xc) >> 2;
+           matchOffset |= (*(input++) & 0xFF) << 2;
+           matchOffset |= 0x800;
+
+           // literal length :: 2 bits :: valid range [0..3]
+           // [0..1] from command [0..1]
+           literalLength = (command & 0x3);
+         } else {
+           // 0b0000_PPLL 0bPPPP_PPPP
+
+           // copy length :: fixed
+           // 2
+           matchLength = 2;
+
+           // copy offset :: 10 bits :: valid range [0..1023]
+           // [0..1] from command [2..3]
+           // [2..9] from trailer [0..7]
+           if (input >= inputLimit) {
+             throw MalformedInputException(input - inputAddress);
+           }
+           matchOffset = (command & 0xc) >> 2;
+           matchOffset |= (*(input++) & 0xFF) << 2;
+
+           // literal length :: 2 bits :: valid range [0..3]
+           // [0..1] from command [0..1]
+           literalLength = (command & 0x3);
+         }
+       } else if (firstCommand) {
+         // first command has special handling when high nibble is set
+         matchLength = 0;
+         matchOffset = 0;
+         literalLength = command - 17;
+       } else if ((command & 0xf0) == 0x10) {
+         // 0b0001_?MMM (0bMMMM_MMMM)* 0bPPPP_PPPP_PPPP_PPLL
+
+         // copy length - 2 :: variable bits :: valid range [3..]
+         // 2 + variableLength(command bits [0..2], 3)
+         matchLength = command & 0x7;
+         if (matchLength == 0) {
+           matchLength = 0x7;
+
+           int32_t nextByte = 0;
+           while (input < inputLimit &&
+                  (nextByte = *(input++) & 0xFF) == 0) {
+             matchLength += 0xff;
+           }
+           matchLength += nextByte;
+         }
+         matchLength += 2;
+
+         // read trailer
+         if (input + SIZE_OF_SHORT > inputLimit) {
+           throw MalformedInputException(input - inputAddress);
+         }
+         // the trailer is stored little-endian; assemble it byte-wise so the
+         // read does not depend on host byte order or pointer alignment
+         uint32_t trailer = static_cast<uint32_t>(
+             (input[0] & 0xFF) | ((input[1] & 0xFF) << 8));
+         input += SIZE_OF_SHORT;
+
+         // copy offset :: 16 bits :: valid range [32767..49151]
+         // [0..13] from trailer [2..15]
+         // [14] if command bit [3] unset
+         // [15] if command bit [3] set
+         matchOffset = trailer >> 2;
+         if ((command & 0x8) == 0) {
+           matchOffset |= 0x4000;
+         } else {
+           matchOffset |= 0x8000;
+         }
+         matchOffset--;
+
+         // literal length :: 2 bits :: valid range [0..3]
+         // [0..1] from trailer [0..1]
+         literalLength = trailer & 0x3;
+       } else if ((command & 0xe0) == 0x20) {
+         // 0b001M_MMMM (0bMMMM_MMMM)* 0bPPPP_PPPP_PPPP_PPLL
+
+         // copy length - 2 :: variable bits :: valid range [3..]
+         // 2 + variableLength(command bits [0..4], 5)
+         matchLength = command & 0x1f;
+         if (matchLength == 0) {
+           matchLength = 0x1f;
+
+           int nextByte = 0;
+           while (input < inputLimit &&
+                  (nextByte = *(input++) & 0xFF) == 0) {
+             matchLength += 0xff;
+           }
+           matchLength += nextByte;
+         }
+         matchLength += 2;
+
+         // read trailer
+         if (input + SIZE_OF_SHORT > inputLimit) {
+           throw MalformedInputException(input - inputAddress);
+         }
+         // little-endian, assembled byte-wise (see the 0x10 case above)
+         int32_t trailer = (input[0] & 0xFF) | ((input[1] & 0xFF) << 8);
+         input += SIZE_OF_SHORT;
+
+         // copy offset :: 14 bits :: valid range [0..16383]
+         // [0..13] from trailer [2..15]
+         matchOffset = trailer >> 2;
+
+         // literal length :: 2 bits :: valid range [0..3]
+         // [0..1] from trailer [0..1]
+         literalLength = trailer & 0x3;
+       } else if ((command & 0xc0) != 0) {
+         // 0bMMMP_PPLL 0bPPPP_PPPP
+
+         // copy length - 1 :: 3 bits :: valid range [1..8]
+         // [0..2] from command [5..7]
+         // add 1
+         matchLength = (command & 0xe0) >> 5;
+         matchLength += 1;
+
+         // copy offset :: 11 bits :: valid range [0..4095]
+         // [0..2] from command [2..4]
+         // [3..10] from trailer [0..7]
+         if (input >= inputLimit) {
+           throw MalformedInputException(input - inputAddress);
+         }
+         matchOffset = (command & 0x1c) >> 2;
+         matchOffset |= (*(input++) & 0xFF) << 3;
+
+         // literal length :: 2 bits :: valid range [0..3]
+         // [0..1] from command [0..1]
+         literalLength = (command & 0x3);
+       } else {
+         throw MalformedInputException(input - inputAddress - 1,
+                                       "Invalid LZO command " +
+                                       toHex(command));
+       }
+       firstCommand = false;
+
+       // copy match
+       if (matchLength != 0) {
+         // lzo encodes match offset minus one
+         matchOffset++;
+
+         char *matchAddress = output - matchOffset;
+         if (matchAddress < outputAddress ||
+             output + matchLength > outputLimit) {
+           throw MalformedInputException(input - inputAddress);
+         }
+         char *matchOutputLimit = output + matchLength;
+
+         if (output > fastOutputLimit) {
+           // slow match copy
+           while (output < matchOutputLimit) {
+             *(output++) = *(matchAddress++);
+           }
+         } else {
+           // copy repeated sequence
+           if (matchOffset < SIZE_OF_LONG) {
+             // 8 bytes apart so that we can copy long-at-a-time below
+             int32_t increment32 = DEC_32_TABLE[matchOffset];
+             int32_t decrement64 = DEC_64_TABLE[matchOffset];
+
+             output[0] = *matchAddress;
+             output[1] = *(matchAddress + 1);
+             output[2] = *(matchAddress + 2);
+             output[3] = *(matchAddress + 3);
+             output += SIZE_OF_INT;
+             matchAddress += increment32;
+
+             *reinterpret_cast<int32_t*>(output) =
+               *reinterpret_cast<int32_t*>(matchAddress);
+             output += SIZE_OF_INT;
+             matchAddress -= decrement64;
+           } else {
+             *reinterpret_cast<int64_t*>(output) =
+               *reinterpret_cast<int64_t*>(matchAddress);
+             matchAddress += SIZE_OF_LONG;
+             output += SIZE_OF_LONG;
+           }
+
+           if (matchOutputLimit >= fastOutputLimit) {
+             if (matchOutputLimit > outputLimit) {
+               throw MalformedInputException(input - inputAddress);
+             }
+
+             while (output < fastOutputLimit) {
+               *reinterpret_cast<int64_t*>(output) =
+                 *reinterpret_cast<int64_t*>(matchAddress);
+               matchAddress += SIZE_OF_LONG;
+               output += SIZE_OF_LONG;
+             }
+
+             while (output < matchOutputLimit) {
+               *(output++) = *(matchAddress++);
+             }
+           } else {
+             while (output < matchOutputLimit) {
+               *reinterpret_cast<int64_t*>(output) =
+                 *reinterpret_cast<int64_t*>(matchAddress);
+               matchAddress += SIZE_OF_LONG;
+               output += SIZE_OF_LONG;
+             }
+           }
+         }
+         output = matchOutputLimit; // correction in case we over-copied
+       }
+
+       // copy literal
+       char *literalOutputLimit = output + literalLength;
+       if (literalOutputLimit > fastOutputLimit ||
+           input + literalLength > inputLimit - SIZE_OF_LONG) {
+         // the precise copy must fit in the output AND be fully available
+         // in the input; without the second test a truncated or corrupt
+         // stream makes memcpy read past inputLimit
+         if (literalOutputLimit > outputLimit ||
+             literalLength > static_cast<uint64_t>(inputLimit - input)) {
+           throw MalformedInputException(input - inputAddress);
+         }
+
+         // slow, precise copy
+         memcpy(output, input, literalLength);
+         input += literalLength;
+         output += literalLength;
+       } else {
+         // fast copy. We may over-copy but there's enough room in input
+         // and output to not overrun them
+         do {
+           *reinterpret_cast<int64_t*>(output) =
+             *reinterpret_cast<const int64_t*>(input);
+           input += SIZE_OF_LONG;
+           output += SIZE_OF_LONG;
+         } while (output < literalOutputLimit);
+         // adjust index if we over-copied
+         input -= (output - literalOutputLimit);
+         output = literalOutputLimit;
+       }
+       lastLiteralLength = literalLength;
+     }
+
+     // The stop command must be followed by two zero bytes. Test the
+     // remaining length first so that a truncated stream is rejected
+     // without reading past inputLimit, and a non-zero trailer is rejected
+     // instead of being skipped silently.
+     if (inputLimit - input < SIZE_OF_SHORT ||
+         input[0] != 0 || input[1] != 0) {
+       throw MalformedInputException(input - inputAddress);
+     }
+     input += SIZE_OF_SHORT;
+   }
+
+   return static_cast<uint64_t>(output - outputAddress);
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/orc/blob/9aba074b/c++/src/LzoDecompressor.hh
----------------------------------------------------------------------
diff --git a/c++/src/LzoDecompressor.hh b/c++/src/LzoDecompressor.hh
new file mode 100644
index 0000000..9de8537
--- /dev/null
+++ b/c++/src/LzoDecompressor.hh
@@ -0,0 +1,42 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ORC_LZO_HH
+#define ORC_LZO_HH
+
+#include "orc/OrcFile.hh"
+
+#include "Adaptor.hh"
+
+namespace orc {
+
+ /**
+ * Decompress the LZO-compressed bytes into the output buffer.
+ * A MalformedInputException (a ParseError) is thrown when the input is
+ * truncated or invalid, or when the data does not fit in the output buffer.
+ * @param inputAddress the start of the input
+ * @param inputLimit one past the last byte of the input
+ * @param outputAddress the start of the output buffer
+ * @param outputLimit one past the last byte of the output buffer
+ * @return the number of bytes written to the output buffer
+ */
+ uint64_t lzoDecompress(const char *inputAddress,
+ const char *inputLimit,
+ char *outputAddress,
+ char *outputLimit);
+}
+
+#endif
http://git-wip-us.apache.org/repos/asf/orc/blob/9aba074b/c++/test/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/c++/test/CMakeLists.txt b/c++/test/CMakeLists.txt
index 6b7fd00..775e15f 100644
--- a/c++/test/CMakeLists.txt
+++ b/c++/test/CMakeLists.txt
@@ -37,10 +37,11 @@ add_executable (orc-test
target_link_libraries (orc-test
orc
- ${PROTOBUF_LIBRARIES}
${GMOCK_LIBRARIES}
- ${ZLIB_LIBRARIES}
+ ${LZ4_LIBRARIES}
+ ${PROTOBUF_LIBRARIES}
${SNAPPY_LIBRARIES}
+ ${ZLIB_LIBRARIES}
)
add_executable (create-test-files
http://git-wip-us.apache.org/repos/asf/orc/blob/9aba074b/c++/test/TestCompression.cc
----------------------------------------------------------------------
diff --git a/c++/test/TestCompression.cc b/c++/test/TestCompression.cc
index 4a79203..efd982a 100644
--- a/c++/test/TestCompression.cc
+++ b/c++/test/TestCompression.cc
@@ -333,13 +333,154 @@ namespace orc {
}
}
- TEST_F(TestCompression, testCreateLzo) {
+ TEST_F(TestCompression, testLzoEmpty) {
const unsigned char buffer[] = {0};
- EXPECT_THROW(createDecompressor(CompressionKind_LZO,
- std::unique_ptr<SeekableInputStream>
- (new SeekableArrayInputStream(buffer, 0)),
- 32768, *getDefaultPool()),
- NotImplementedYet);
+ std::unique_ptr<SeekableInputStream> result =
+ createDecompressor(CompressionKind_LZO,
+ std::unique_ptr<SeekableInputStream>
+ (new SeekableArrayInputStream(buffer, 0)),
+ 32768, *getDefaultPool());
+ EXPECT_EQ("lzo(SeekableArrayInputStream 0 of 0)", result->getName());
+ const void *ptr;
+ int length;
+ ASSERT_TRUE(!result->Next(&ptr, &length));
+ }
+
+ // Decodes a short LZO stream and compares the output byte by byte.
+ TEST_F(TestCompression, testLzoSmall) {
+   const unsigned char buffer[] = { 70, 0, 0,
+                                    48, 88, 88, 88, 88, 97, 98,
+                                    99, 100, 97, 98, 99, 100, 65,
+                                    66, 67, 68, 65, 66, 67, 68,
+                                    119, 120, 121, 122, 119, 122, 121,
+                                    122, 49, 50, 51, 17, 0, 0};
+   std::unique_ptr<SeekableInputStream> compressed
+     (new SeekableArrayInputStream(buffer, ARRAY_SIZE(buffer)));
+   std::unique_ptr<SeekableInputStream> result =
+     createDecompressor(CompressionKind_LZO, std::move(compressed),
+                        128*1024, *getDefaultPool());
+   const void *data;
+   int size;
+   ASSERT_EQ(true, result->Next(&data, &size));
+   const char *expected = "XXXXabcdabcdABCDABCDwxyzwzyz123";
+   ASSERT_EQ(strlen(expected), size);
+   const char *actual = static_cast<const char*>(data);
+   for(uint64_t pos=0; pos < size; ++pos) {
+     ASSERT_EQ(static_cast<const char>(expected[pos]), actual[pos]);
+   }
+   // a second read must report end of stream
+   ASSERT_TRUE(!result->Next(&data, &size));
+ }
+
+ // Decodes a hand-built LZO stream that expands to 100,000 copies of 'a'.
+ TEST_F(TestCompression, testLzoLong) {
+   // set up a framed lzo buffer with 100,000 'a'
+   unsigned char buffer[482];
+   // memset instead of the legacy, non-portable bzero; every byte that is
+   // not assigned below has to be zero
+   memset(buffer, 0, ARRAY_SIZE(buffer));
+   // header
+   buffer[0] = 190;
+   buffer[1] = 3;
+
+   // lzo data
+   buffer[3] = 2;
+   memset(buffer + 4, 97, 5);
+   buffer[9] = 32;
+   buffer[202] = 134;
+   buffer[203] = 16;
+   buffer[206] = 3;
+   memset(buffer + 207, 97, 21);
+   buffer[228] = 32;
+   buffer[421] = 138;
+   buffer[425] = 3;
+   memset(buffer + 426, 97, 21);
+   buffer[447] = 32;
+   buffer[454] = 112;
+   buffer[458] = 2;
+   memset(buffer + 459, 97, 20);
+   buffer[479] = 17;
+   std::unique_ptr<SeekableInputStream> result =
+     createDecompressor(CompressionKind_LZO,
+                        std::unique_ptr<SeekableInputStream>
+                        (new SeekableArrayInputStream(buffer,
+                                                      ARRAY_SIZE(buffer))),
+                        128*1024, *getDefaultPool());
+   const void *ptr;
+   int length;
+   ASSERT_EQ(true, result->Next(&ptr, &length));
+   ASSERT_EQ(100000, length);
+   for(uint64_t i=0; i < length; ++i) {
+     ASSERT_EQ('a', static_cast<const char*>(ptr)[i]);
+   }
+   // a second read must report end of stream
+   ASSERT_TRUE(!result->Next(&ptr, &length));
+ }
+
+ // An LZ4 decompressor over zero input bytes reports its name and yields
+ // no data.
+ TEST_F(TestCompression, testLz4Empty) {
+   const unsigned char buffer[] = {0};
+   std::unique_ptr<SeekableInputStream> emptyInput
+     (new SeekableArrayInputStream(buffer, 0));
+   std::unique_ptr<SeekableInputStream> result =
+     createDecompressor(CompressionKind_LZ4, std::move(emptyInput),
+                        32768, *getDefaultPool());
+   EXPECT_EQ("lz4(SeekableArrayInputStream 0 of 0)", result->getName());
+   const void *data;
+   int size;
+   ASSERT_TRUE(!result->Next(&data, &size));
+ }
+
+ // Decodes a short LZ4 stream and compares the output byte by byte.
+ TEST_F(TestCompression, testLz4Small) {
+   const unsigned char buffer[] = { 60, 0, 0,
+                                    128, 88, 88, 88, 88, 97, 98, 99,
+                                    100, 4, 0, 64, 65, 66, 67, 68,
+                                    4, 0, 176, 119, 120, 121, 122, 119,
+                                    122, 121, 122, 49, 50, 51};
+   std::unique_ptr<SeekableInputStream> compressed
+     (new SeekableArrayInputStream(buffer, ARRAY_SIZE(buffer)));
+   std::unique_ptr<SeekableInputStream> result =
+     createDecompressor(CompressionKind_LZ4, std::move(compressed),
+                        128*1024, *getDefaultPool());
+   const void *data;
+   int size;
+   ASSERT_EQ(true, result->Next(&data, &size));
+   const char *expected = "XXXXabcdabcdABCDABCDwxyzwzyz123";
+   ASSERT_EQ(strlen(expected), size);
+   const char *actual = static_cast<const char*>(data);
+   for(uint64_t pos=0; pos < size; ++pos) {
+     ASSERT_EQ(static_cast<const char>(expected[pos]), actual[pos]);
+   }
+   // a second read must report end of stream
+   ASSERT_TRUE(!result->Next(&data, &size));
+ }
+
+ TEST_F(TestCompression, testLz4Long) {
+ // set up a framed lz4 buffer with 100,000 'a'
+ unsigned char buffer[406];
+ memset(buffer, 255, ARRAY_SIZE(buffer));
+ // header
+ buffer[0] = 38;
+ buffer[1] = 3;
+ buffer[2] = 0;
+
+ // lz4 data
+ buffer[3] = 31;
+ buffer[4] = 97;
+ buffer[5] = 1;
+ buffer[6] = 0;
+ buffer[399] = 15;
+ buffer[400] = 80;
+ memset(buffer + 401, 97, 5);
+
+ std::unique_ptr<SeekableInputStream> result =
+ createDecompressor(CompressionKind_LZ4,
+ std::unique_ptr<SeekableInputStream>
+ (new SeekableArrayInputStream(buffer,
+ ARRAY_SIZE(buffer))),
+ 128*1024, *getDefaultPool());
+ const void *ptr;
+ int length;
+ ASSERT_EQ(true, result->Next(&ptr, &length));
+ ASSERT_EQ(100000, length);
+ for(uint64_t i=0; i < length; ++i) {
+ ASSERT_EQ('a', static_cast<const char*>(ptr)[i]);
+ }
+ ASSERT_TRUE(!result->Next(&ptr, &length));
 }
TEST(Zlib, testCreateZlib) {
http://git-wip-us.apache.org/repos/asf/orc/blob/9aba074b/examples/TestVectorOrcFile.testLz4.orc
----------------------------------------------------------------------
diff --git a/examples/TestVectorOrcFile.testLz4.orc b/examples/TestVectorOrcFile.testLz4.orc
new file mode 100644
index 0000000..dacba8d
Binary files /dev/null and b/examples/TestVectorOrcFile.testLz4.orc differ
http://git-wip-us.apache.org/repos/asf/orc/blob/9aba074b/examples/TestVectorOrcFile.testLzo.orc
----------------------------------------------------------------------
diff --git a/examples/TestVectorOrcFile.testLzo.orc b/examples/TestVectorOrcFile.testLzo.orc
new file mode 100644
index 0000000..2b01fb5
Binary files /dev/null and b/examples/TestVectorOrcFile.testLzo.orc differ
http://git-wip-us.apache.org/repos/asf/orc/blob/9aba074b/examples/expected/TestVectorOrcFile.testLz4.jsn.gz
----------------------------------------------------------------------
diff --git a/examples/expected/TestVectorOrcFile.testLz4.jsn.gz b/examples/expected/TestVectorOrcFile.testLz4.jsn.gz
new file mode 100644
index 0000000..60a846e
Binary files /dev/null and b/examples/expected/TestVectorOrcFile.testLz4.jsn.gz differ
http://git-wip-us.apache.org/repos/asf/orc/blob/9aba074b/examples/expected/TestVectorOrcFile.testLzo.jsn.gz
----------------------------------------------------------------------
diff --git a/examples/expected/TestVectorOrcFile.testLzo.jsn.gz b/examples/expected/TestVectorOrcFile.testLzo.jsn.gz
new file mode 100644
index 0000000..e002379
Binary files /dev/null and b/examples/expected/TestVectorOrcFile.testLzo.jsn.gz differ
http://git-wip-us.apache.org/repos/asf/orc/blob/9aba074b/java/core/src/test/org/apache/orc/TestVectorOrcFile.java
----------------------------------------------------------------------
diff --git a/java/core/src/test/org/apache/orc/TestVectorOrcFile.java b/java/core/src/test/org/apache/orc/TestVectorOrcFile.java
index 31ac1c4..42d1176 100644
--- a/java/core/src/test/org/apache/orc/TestVectorOrcFile.java
+++ b/java/core/src/test/org/apache/orc/TestVectorOrcFile.java
@@ -1698,20 +1698,20 @@ public class TestVectorOrcFile {
@Test
public void testLzo() throws Exception {
TypeDescription schema =
- TypeDescription.fromString("struct<x:bigint,y:double,z:bigint>");
+ TypeDescription.fromString("struct<x:bigint,y:int,z:bigint>");
Writer writer = OrcFile.createWriter(testFilePath,
OrcFile.writerOptions(conf)
.setSchema(schema)
- .stripeSize(1000)
+ .stripeSize(10000)
.compress(CompressionKind.LZO)
- .bufferSize(100));
+ .bufferSize(1000));
VectorizedRowBatch batch = schema.createRowBatch();
Random rand = new Random(69);
batch.size = 1000;
for(int b=0; b < 10; ++b) {
for (int r=0; r < 1000; ++r) {
((LongColumnVector) batch.cols[0]).vector[r] = rand.nextInt();
- ((DoubleColumnVector) batch.cols[1]).vector[r] = rand.nextDouble();
+ ((LongColumnVector) batch.cols[1]).vector[r] = b * 1000 + r;
((LongColumnVector) batch.cols[2]).vector[r] = rand.nextLong();
}
writer.addRowBatch(batch);
@@ -1729,8 +1729,8 @@ public class TestVectorOrcFile {
for(int r=0; r < batch.size; ++r) {
assertEquals(rand.nextInt(),
((LongColumnVector) batch.cols[0]).vector[r]);
- assertEquals(rand.nextDouble(),
- ((DoubleColumnVector) batch.cols[1]).vector[r], 0.00001);
+ assertEquals(b * 1000 + r,
+ ((LongColumnVector) batch.cols[1]).vector[r]);
assertEquals(rand.nextLong(),
((LongColumnVector) batch.cols[2]).vector[r]);
}
@@ -1747,20 +1747,20 @@ public class TestVectorOrcFile {
@Test
public void testLz4() throws Exception {
TypeDescription schema =
- TypeDescription.fromString("struct<x:bigint,y:double,z:bigint>");
+ TypeDescription.fromString("struct<x:bigint,y:int,z:bigint>");
Writer writer = OrcFile.createWriter(testFilePath,
OrcFile.writerOptions(conf)
.setSchema(schema)
- .stripeSize(1000)
+ .stripeSize(10000)
.compress(CompressionKind.LZ4)
- .bufferSize(100));
+ .bufferSize(1000));
VectorizedRowBatch batch = schema.createRowBatch();
Random rand = new Random(3);
batch.size = 1000;
for(int b=0; b < 10; ++b) {
for (int r=0; r < 1000; ++r) {
((LongColumnVector) batch.cols[0]).vector[r] = rand.nextInt();
- ((DoubleColumnVector) batch.cols[1]).vector[r] = rand.nextDouble();
+ ((LongColumnVector) batch.cols[1]).vector[r] = b * 1000 + r;
((LongColumnVector) batch.cols[2]).vector[r] = rand.nextLong();
}
writer.addRowBatch(batch);
@@ -1778,8 +1778,8 @@ public class TestVectorOrcFile {
for(int r=0; r < batch.size; ++r) {
assertEquals(rand.nextInt(),
((LongColumnVector) batch.cols[0]).vector[r]);
- assertEquals(rand.nextDouble(),
- ((DoubleColumnVector) batch.cols[1]).vector[r], 0.00001);
+ assertEquals(b * 1000 + r,
+ ((LongColumnVector) batch.cols[1]).vector[r]);
assertEquals(rand.nextLong(),
((LongColumnVector) batch.cols[2]).vector[r]);
}
http://git-wip-us.apache.org/repos/asf/orc/blob/9aba074b/tools/test/TestMatch.cc
----------------------------------------------------------------------
diff --git a/tools/test/TestMatch.cc b/tools/test/TestMatch.cc
index 40efe53..c7759ae 100644
--- a/tools/test/TestMatch.cc
+++ b/tools/test/TestMatch.cc
@@ -466,6 +466,28 @@ namespace orc {
CompressionKind_ZLIB,
262144,
10000,
+ std::map<std::string, std::string>()),
+ OrcFileDescription("TestVectorOrcFile.testLz4.orc",
+ "TestVectorOrcFile.testLz4.jsn.gz",
+ "struct<x:bigint,y:int,z:bigint>",
+ "0.12",
+ 10000,
+ 120952,
+ 2,
+ CompressionKind_LZ4,
+ 1000,
+ 10000,
+ std::map<std::string, std::string>()),
+ OrcFileDescription("TestVectorOrcFile.testLzo.orc",
+ "TestVectorOrcFile.testLzo.jsn.gz",
+ "struct<x:bigint,y:int,z:bigint>",
+ "0.12",
+ 10000,
+ 120955,
+ 2,
+ CompressionKind_LZO,
+ 1000,
+ 10000,
std::map<std::string, std::string>())
));