You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@parquet.apache.org by ju...@apache.org on 2016/01/29 02:38:31 UTC
parquet-cpp git commit: PARQUET-428: Support INT96 and
FIXED_LEN_BYTE_ARRAY types
Repository: parquet-cpp
Updated Branches:
refs/heads/master 5af54bec2 -> 89f5a5439
PARQUET-428: Support INT96 and FIXED_LEN_BYTE_ARRAY types
This PR adds support for INT96 and FIXED_LEN_BYTE_ARRAY types.
It modifies the examples and DebugPrint to handle these types.
Author: Deepak Majeti <de...@hp.com>
Closes #27 from majetideepak/master and squashes the following commits:
5ba0a03 [Deepak Majeti] PARQUET-428: Support INT96 and FIXED_LEN_BYTE_ARRAY types
Project: http://git-wip-us.apache.org/repos/asf/parquet-cpp/repo
Commit: http://git-wip-us.apache.org/repos/asf/parquet-cpp/commit/89f5a543
Tree: http://git-wip-us.apache.org/repos/asf/parquet-cpp/tree/89f5a543
Diff: http://git-wip-us.apache.org/repos/asf/parquet-cpp/diff/89f5a543
Branch: refs/heads/master
Commit: 89f5a543929b7d2bb3249626fb03cf9130540439
Parents: 5af54be
Author: Deepak Majeti <de...@hp.com>
Authored: Thu Jan 28 17:38:27 2016 -0800
Committer: Julien Le Dem <ju...@dremio.com>
Committed: Thu Jan 28 17:38:27 2016 -0800
----------------------------------------------------------------------
src/parquet/column_reader.cc | 2 ++
src/parquet/column_reader.h | 1 +
src/parquet/encodings/dictionary-encoding.h | 19 ++++++++++++
src/parquet/encodings/plain-encoding.h | 16 ++++++++++
src/parquet/reader.cc | 38 ++++++++++++++++--------
src/parquet/types.h | 38 ++++++++++++++++++++++--
src/parquet/util/compiler-util.h | 21 +++++++++++++
7 files changed, 120 insertions(+), 15 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/89f5a543/src/parquet/column_reader.cc
----------------------------------------------------------------------
diff --git a/src/parquet/column_reader.cc b/src/parquet/column_reader.cc
index 168bc94..4f565cf 100644
--- a/src/parquet/column_reader.cc
+++ b/src/parquet/column_reader.cc
@@ -187,6 +187,8 @@ std::shared_ptr<ColumnReader> ColumnReader::Make(const parquet::ColumnMetaData*
return std::make_shared<DoubleReader>(metadata, element, stream);
case Type::BYTE_ARRAY:
return std::make_shared<ByteArrayReader>(metadata, element, stream);
+ case Type::FIXED_LEN_BYTE_ARRAY:
+ return std::make_shared<FixedLenByteArrayReader>(metadata, element, stream);
default:
ParquetException::NYI("type reader not implemented");
}
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/89f5a543/src/parquet/column_reader.h
----------------------------------------------------------------------
diff --git a/src/parquet/column_reader.h b/src/parquet/column_reader.h
index cd6cc02..00722f5 100644
--- a/src/parquet/column_reader.h
+++ b/src/parquet/column_reader.h
@@ -159,6 +159,7 @@ typedef TypedColumnReader<parquet::Type::INT96> Int96Reader;
typedef TypedColumnReader<parquet::Type::FLOAT> FloatReader;
typedef TypedColumnReader<parquet::Type::DOUBLE> DoubleReader;
typedef TypedColumnReader<parquet::Type::BYTE_ARRAY> ByteArrayReader;
+typedef TypedColumnReader<parquet::Type::FIXED_LEN_BYTE_ARRAY> FixedLenByteArrayReader;
template <int TYPE>
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/89f5a543/src/parquet/encodings/dictionary-encoding.h
----------------------------------------------------------------------
diff --git a/src/parquet/encodings/dictionary-encoding.h b/src/parquet/encodings/dictionary-encoding.h
index a4ef342..9641e7e 100644
--- a/src/parquet/encodings/dictionary-encoding.h
+++ b/src/parquet/encodings/dictionary-encoding.h
@@ -111,6 +111,25 @@ inline void DictionaryDecoder<parquet::Type::BYTE_ARRAY>::Init(
}
}
+// Specialization of Init for FIXED_LEN_BYTE_ARRAY dictionaries.
+//
+// Decodes every value still available from `dictionary` into dictionary_, then
+// copies the bytes of each value into byte_array_data_ and repoints the entry at
+// that copy, so the entries stop referencing the memory handed out by
+// `dictionary`. Every value is schema_->type_length bytes long.
+template <>
+inline void DictionaryDecoder<parquet::Type::FIXED_LEN_BYTE_ARRAY>::Init(
+    Decoder<parquet::Type::FIXED_LEN_BYTE_ARRAY>* dictionary) {
+  int num_dictionary_values = dictionary->values_left();
+  dictionary_.resize(num_dictionary_values);
+  // data() rather than &dictionary_[0]: subscripting an empty vector is undefined,
+  // and values_left() may be 0.
+  // NOTE(review): assumes dictionary_ and byte_array_data_ are std::vector - confirm.
+  dictionary->Decode(dictionary_.data(), num_dictionary_values);
+
+  int fixed_len = schema_->type_length;
+  // Widen before multiplying so a large dictionary cannot overflow int.
+  size_t total_size = static_cast<size_t>(num_dictionary_values) * fixed_len;
+
+  // Size the backing store once, before any pointer into it is taken; growing it
+  // later would invalidate the pointers stored in dictionary_.
+  byte_array_data_.resize(total_size);
+  size_t offset = 0;
+  for (int i = 0; i < num_dictionary_values; ++i) {
+    // data() + offset stays valid even when total_size is 0 (type_length == 0),
+    // where &byte_array_data_[offset] would subscript an empty vector.
+    uint8_t* dest = byte_array_data_.data() + offset;
+    memcpy(dest, dictionary_[i].ptr, fixed_len);
+    dictionary_[i].ptr = dest;
+    offset += fixed_len;
+  }
+}
+
} // namespace parquet_cpp
#endif
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/89f5a543/src/parquet/encodings/plain-encoding.h
----------------------------------------------------------------------
diff --git a/src/parquet/encodings/plain-encoding.h b/src/parquet/encodings/plain-encoding.h
index 4ddf878..dc71e39 100644
--- a/src/parquet/encodings/plain-encoding.h
+++ b/src/parquet/encodings/plain-encoding.h
@@ -74,6 +74,22 @@ inline int PlainDecoder<parquet::Type::BYTE_ARRAY>::Decode(ByteArray* buffer,
return max_values;
}
+// Template specialization for FIXED_LEN_BYTE_ARRAY.
+//
+// Emits up to max_values values, capped at num_values_. No bytes are copied:
+// each buffer[i].ptr points straight into the decoder input at data_, and every
+// value occupies schema_->type_length bytes. Returns the number of values emitted.
+// Calls ParquetException::EofException() when fewer than that many bytes remain
+// (len_ is the remaining byte count).
+template <>
+inline int PlainDecoder<parquet::Type::FIXED_LEN_BYTE_ARRAY>::Decode(FixedLenByteArray* buffer,
+    int max_values) {
+  max_values = std::min(max_values, num_values_);
+  int len = schema_->type_length;
+  // Check the whole request before mutating anything. With the check inside the
+  // loop, a short input left buffer, data_ and len_ partly advanced while
+  // num_values_ was untouched. The product is formed in 64 bits to avoid int
+  // overflow.
+  if (max_values > 0 && len_ < static_cast<int64_t>(max_values) * len) {
+    ParquetException::EofException();
+  }
+  for (int i = 0; i < max_values; ++i) {
+    buffer[i].ptr = data_;
+    data_ += len;
+    len_ -= len;
+  }
+  num_values_ -= max_values;
+  return max_values;
+}
+
template <>
class PlainDecoder<parquet::Type::BOOLEAN> : public Decoder<parquet::Type::BOOLEAN> {
public:
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/89f5a543/src/parquet/reader.cc
----------------------------------------------------------------------
diff --git a/src/parquet/reader.cc b/src/parquet/reader.cc
index 7c727ba..4654f14 100644
--- a/src/parquet/reader.cc
+++ b/src/parquet/reader.cc
@@ -217,6 +217,9 @@ static string parquet_type_to_string(Type::type t) {
case Type::INT64:
return "INT64";
break;
+ case Type::INT96:
+ return "INT96";
+ break;
case Type::FLOAT:
return "FLOAT";
break;
@@ -226,9 +229,6 @@ static string parquet_type_to_string(Type::type t) {
case Type::BYTE_ARRAY:
return "BYTE_ARRAY";
break;
- case Type::INT96:
- return "INT96";
- break;
case Type::FIXED_LEN_BYTE_ARRAY:
return "FIXED_LEN_BYTE_ARRAY";
break;
@@ -239,7 +239,7 @@ static string parquet_type_to_string(Type::type t) {
}
// the fixed initial size is just for an example
-#define COL_WIDTH "17"
+#define COL_WIDTH "20"
void ParquetFileReader::DebugPrint(std::ostream& stream, bool print_values) {
if (!parsed_metadata_) {
@@ -251,10 +251,6 @@ void ParquetFileReader::DebugPrint(std::ostream& stream, bool print_values) {
for (int c = 1; c < metadata_.schema.size(); ++c) {
stream << "Column " << c-1 << ": " << metadata_.schema[c].name << " ("
<< parquet_type_to_string(metadata_.schema[c].type);
- if (metadata_.schema[c].type == Type::INT96 ||
- metadata_.schema[c].type == Type::FIXED_LEN_BYTE_ARRAY) {
- stream << " - not supported";
- }
stream << ")\n";
}
@@ -291,10 +287,6 @@ void ParquetFileReader::DebugPrint(std::ostream& stream, bool print_values) {
printf("%-" COL_WIDTH"s", metadata_.schema[c+1].name.c_str());
- if (col_type == Type::INT96 || col_type == Type::FIXED_LEN_BYTE_ARRAY) {
- continue;
- }
-
// This is OK in this method as long as the RowGroupReader does not get deleted
readers[c] = col_reader;
}
@@ -345,6 +337,16 @@ void ParquetFileReader::DebugPrint(std::ostream& stream, bool print_values) {
}
break;
}
+ case Type::INT96: {
+ Int96 val = reinterpret_cast<Int96Reader*>(readers[c])->NextValue(
+ &def_level[c], &rep_level[c]);
+ if (def_level[c] >= rep_level[c]) {
+ string result = Int96ToString(val);
+ snprintf(buffer, bufsize, "%-" COL_WIDTH"s", result.c_str());
+ stream << buffer;
+ }
+ break;
+ }
case Type::FLOAT: {
float val = reinterpret_cast<FloatReader*>(readers[c])->NextValue(
&def_level[c], &rep_level[c]);
@@ -373,7 +375,17 @@ void ParquetFileReader::DebugPrint(std::ostream& stream, bool print_values) {
}
break;
}
- default:
+ case Type::FIXED_LEN_BYTE_ARRAY: {
+ FixedLenByteArray val = reinterpret_cast<FixedLenByteArrayReader*>(
+ readers[c])->NextValue(&def_level[c], &rep_level[c]);
+ if (def_level[c] >= rep_level[c]) {
+ string result = FixedLenByteArrayToString(val, metadata_.schema[c+1].type_length);
+ snprintf(buffer, bufsize, "%-" COL_WIDTH"s", result.c_str());
+ stream << buffer;
+ }
+ break;
+ }
+ default:
continue;
}
}
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/89f5a543/src/parquet/types.h
----------------------------------------------------------------------
diff --git a/src/parquet/types.h b/src/parquet/types.h
index 37f538a..409e335 100644
--- a/src/parquet/types.h
+++ b/src/parquet/types.h
@@ -22,8 +22,10 @@
#include <cstdint>
#include <cstring>
#include <string>
+#include <sstream>
#include "parquet/thrift/parquet_types.h"
+#include "parquet/util/compiler-util.h"
namespace parquet_cpp {
@@ -32,11 +34,36 @@ struct ByteArray {
const uint8_t* ptr;
};
+// One FIXED_LEN_BYTE_ARRAY value. Only a pointer to the first byte is stored;
+// the length is not part of the struct and is taken from the column schema's
+// type_length by the code that reads the value. The struct has no destructor,
+// so it never frees what ptr refers to.
+struct FixedLenByteArray {
+ const uint8_t* ptr;
+};
+
+// INT96 value held as three 32-bit words. Declared through the packing macros
+// from parquet/util/compiler-util.h; STRUCT_END static_asserts that
+// sizeof(Int96) is exactly 12.
+MANUALLY_ALIGNED_STRUCT(1) Int96 {
+ uint32_t value[3];
+};
+STRUCT_END(Int96, 12);
static inline std::string ByteArrayToString(const ByteArray& a) {
return std::string(reinterpret_cast<const char*>(a.ptr), a.len);
}
+// Formats an Int96 for display: value[0], value[1] and value[2] in decimal,
+// each followed by one space (the result therefore ends with a space).
+static inline std::string Int96ToString(const Int96& a) {
+  std::stringstream text;
+  text << a.value[0] << " ";
+  text << a.value[1] << " ";
+  text << a.value[2] << " ";
+  return text.str();
+}
+
+// Formats the first `len` bytes at a.ptr for display: each byte as a decimal
+// number followed by one space. Returns an empty string when len <= 0.
+static inline std::string FixedLenByteArrayToString(const FixedLenByteArray& a, int len) {
+  std::stringstream text;
+  const uint8_t* cursor = a.ptr;
+  for (int remaining = len; remaining > 0; --remaining, ++cursor) {
+    // Widen first: streaming a uint8_t directly would print it as a character.
+    text << static_cast<uint32_t>(*cursor) << " ";
+  }
+  return text.str();
+}
+
static inline int ByteCompare(const ByteArray& x1, const ByteArray& x2) {
int len = std::min(x1.len, x2.len);
int cmp = memcmp(x1.ptr, x2.ptr, len);
@@ -76,8 +103,7 @@ struct type_traits<parquet::Type::INT64> {
template <>
struct type_traits<parquet::Type::INT96> {
- // TODO
- typedef void* value_type;
+ typedef Int96 value_type;
static constexpr parquet::Type::type parquet_type = parquet::Type::INT96;
static constexpr size_t value_byte_size = 12;
@@ -107,6 +133,14 @@ struct type_traits<parquet::Type::BYTE_ARRAY> {
static constexpr size_t value_byte_size = sizeof(ByteArray);
};
+// Traits for FIXED_LEN_BYTE_ARRAY columns: values are represented in memory as
+// FixedLenByteArray.
+template <>
+struct type_traits<parquet::Type::FIXED_LEN_BYTE_ARRAY> {
+ typedef FixedLenByteArray value_type;
+ static constexpr parquet::Type::type parquet_type = parquet::Type::FIXED_LEN_BYTE_ARRAY;
+
+ // Size of the FixedLenByteArray struct itself, not the number of bytes that
+ // its ptr refers to.
+ static constexpr size_t value_byte_size = sizeof(FixedLenByteArray);
+};
+
} // namespace parquet_cpp
#endif // PARQUET_TYPES_H
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/89f5a543/src/parquet/util/compiler-util.h
----------------------------------------------------------------------
diff --git a/src/parquet/util/compiler-util.h b/src/parquet/util/compiler-util.h
index bd0dbed..9048ba1 100644
--- a/src/parquet/util/compiler-util.h
+++ b/src/parquet/util/compiler-util.h
@@ -36,4 +36,25 @@
#define PREFETCH(addr) __builtin_prefetch(addr)
+// Macros for declaring a struct whose members are laid out without padding.
+// Usage:
+//   MANUALLY_ALIGNED_STRUCT(alignment) Name { ...members... };
+//   STRUCT_END(Name, size);
+// MANUALLY_ALIGNED_STRUCT switches member packing to 1 and opens the struct with
+// the requested alignment; STRUCT_END issues pack() and static_asserts that
+// sizeof(Name) == size.
+// Only MSVC and GCC/Clang are handled; any other compiler stops at the #error
+// below.
+// NOTE(review): pack() is used rather than pack(push)/pack(pop), so a packing
+// value set by an enclosing #pragma pack is presumably not restored - confirm
+// against the compiler documentation before nesting these.
+// Adapted from FlatBuffers:
+//[https://github.com/google/flatbuffers/blob/master/include/flatbuffers/flatbuffers.h#L1355]
+#if defined(_MSC_VER)
+ #define MANUALLY_ALIGNED_STRUCT(alignment) \
+ __pragma(pack(1)); \
+ struct __declspec(align(alignment))
+ #define STRUCT_END(name, size) \
+ __pragma(pack()); \
+ static_assert(sizeof(name) == size, "compiler breaks packing rules")
+#elif defined(__GNUC__) || defined(__clang__)
+ #define MANUALLY_ALIGNED_STRUCT(alignment) \
+ _Pragma("pack(1)") \
+ struct __attribute__((aligned(alignment)))
+ #define STRUCT_END(name, size) \
+ _Pragma("pack()") \
+ static_assert(sizeof(name) == size, "compiler breaks packing rules")
+#else
+ #error Unknown compiler, please define structure alignment macros
+#endif
+
#endif // PARQUET_UTIL_COMPILER_UTIL_H