You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@orc.apache.org by md...@apache.org on 2018/10/17 00:50:49 UTC
[orc] branch master updated: ORC-415: [C++] Fix writing
ColumnStatistics (#319)
This is an automated email from the ASF dual-hosted git repository.
mdeepak pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/orc.git
The following commit(s) were added to refs/heads/master by this push:
new 58e1a33 ORC-415: [C++] Fix writing ColumnStatistics (#319)
58e1a33 is described below
commit 58e1a33024a1f260d5b40ca70402da715026e4ba
Author: Gang Wu <us...@gmail.com>
AuthorDate: Tue Oct 16 17:50:45 2018 -0700
ORC-415: [C++] Fix writing ColumnStatistics (#319)
* ORC-415: [C++] Fix writing ColumnStatistics
* move out of for-loop
* fix misspelling
---
c++/src/ColumnWriter.cc | 83 +++++++++++++++++++++++++++++++++----------------
c++/src/Statistics.hh | 36 ++++++++++++++++++---
c++/src/Writer.cc | 2 +-
3 files changed, 89 insertions(+), 32 deletions(-)
diff --git a/c++/src/ColumnWriter.cc b/c++/src/ColumnWriter.cc
index 8feb077..9b128e3 100644
--- a/c++/src/ColumnWriter.cc
+++ b/c++/src/ColumnWriter.cc
@@ -280,10 +280,10 @@ namespace orc {
}
// update stats
- bool hasNull = false;
if (!structBatch->hasNulls) {
colIndexStatistics->increase(numValues);
} else {
+ bool hasNull = false;
const char* notNull = structBatch->notNull.data() + offset;
for (uint64_t i = 0; i < numValues; ++i) {
if (notNull[i]) {
@@ -292,8 +292,10 @@ namespace orc {
hasNull = true;
}
}
+ if (hasNull) {
+ colIndexStatistics->setHasNull(true);
+ }
}
- colIndexStatistics->setHasNull(hasNull);
}
void StructColumnWriter::flush(std::vector<proto::Stream>& streams) {
@@ -461,7 +463,9 @@ namespace orc {
hasNull = true;
}
}
- intStats->setHasNull(hasNull);
+ if (hasNull) {
+ intStats->setHasNull(true);
+ }
}
void IntegerColumnWriter::flush(std::vector<proto::Stream>& streams) {
@@ -564,7 +568,9 @@ namespace orc {
hasNull = true;
}
}
- intStats->setHasNull(hasNull);
+ if (hasNull) {
+ intStats->setHasNull(true);
+ }
}
void ByteColumnWriter::flush(std::vector<proto::Stream>& streams) {
@@ -666,7 +672,9 @@ namespace orc {
hasNull = true;
}
}
- boolStats->setHasNull(hasNull);
+ if (hasNull) {
+ boolStats->setHasNull(true);
+ }
}
void BooleanColumnWriter::flush(std::vector<proto::Stream>& streams) {
@@ -774,7 +782,6 @@ namespace orc {
size_t bytes = isFloat ? 4 : 8;
char* data = buffer.data();
bool hasNull = false;
-
for (uint64_t i = 0; i < numValues; ++i) {
if (!notNull || notNull[i]) {
if (isFloat) {
@@ -790,7 +797,9 @@ namespace orc {
hasNull = true;
}
}
- doubleStats->setHasNull(hasNull);
+ if (hasNull) {
+ doubleStats->setHasNull(true);
+ }
}
void DoubleColumnWriter::flush(std::vector<proto::Stream>& streams) {
@@ -900,7 +909,9 @@ namespace orc {
hasNull = true;
}
}
- strStats->setHasNull(hasNull);
+ if (hasNull) {
+ strStats->setHasNull(true);
+ }
}
void StringColumnWriter::flush(std::vector<proto::Stream>& streams) {
@@ -1052,8 +1063,8 @@ namespace orc {
if (strStats == nullptr) {
throw InvalidArgument("Failed to cast to StringColumnStatisticsImpl");
}
- bool hasNull = false;
+ bool hasNull = false;
for (uint64_t i = 0; i < numValues; ++i) {
if (!notNull || notNull[i]) {
const char * charData = nullptr;
@@ -1080,7 +1091,9 @@ namespace orc {
}
}
lengthEncoder->add(length, numValues, notNull);
- strStats->setHasNull(hasNull);
+ if (hasNull) {
+ strStats->setHasNull(true);
+ }
}
class VarCharColumnWriter : public StringColumnWriter {
@@ -1120,8 +1133,8 @@ namespace orc {
if (strStats == nullptr) {
throw InvalidArgument("Failed to cast to StringColumnStatisticsImpl");
}
- bool hasNull = false;
+ bool hasNull = false;
for (uint64_t i = 0; i < numValues; ++i) {
if (!notNull || notNull[i]) {
uint64_t itemLength = Utf8Utils::truncateBytesTo(
@@ -1137,7 +1150,9 @@ namespace orc {
}
}
lengthEncoder->add(length, numValues, notNull);
- strStats->setHasNull(hasNull);
+ if (hasNull) {
+ strStats->setHasNull(true);
+ }
}
class BinaryColumnWriter : public StringColumnWriter {
@@ -1187,7 +1202,9 @@ namespace orc {
}
}
lengthEncoder->add(length, numValues, notNull);
- binStats->setHasNull(hasNull);
+ if (hasNull) {
+ binStats->setHasNull(true);
+ }
}
class TimestampColumnWriter : public ColumnWriter {
@@ -1302,7 +1319,9 @@ namespace orc {
hasNull = true;
}
}
- tsStats->setHasNull(hasNull);
+ if (hasNull) {
+ tsStats->setHasNull(true);
+ }
secRleEncoder->add(secs, numValues, notNull);
nanoRleEncoder->add(nanos, numValues, notNull);
@@ -1394,7 +1413,9 @@ namespace orc {
hasNull = true;
}
}
- dateStats->setHasNull(hasNull);
+ if (hasNull) {
+ dateStats->setHasNull(true);
+ }
}
class Decimal64ColumnWriter : public ColumnWriter {
@@ -1471,8 +1492,8 @@ namespace orc {
if (decStats == nullptr) {
throw InvalidArgument("Failed to cast to DecimalColumnStatisticsImpl");
}
- bool hasNull = false;
+ bool hasNull = false;
for (uint64_t i = 0; i < numValues; ++i) {
if (!notNull || notNull[i]) {
int64_t val = zigZag(values[i]);
@@ -1495,10 +1516,11 @@ namespace orc {
hasNull = true;
}
}
+ if (hasNull) {
+ decStats->setHasNull(true);
+ }
std::vector<int64_t> scales(numValues, static_cast<int64_t>(scale));
scaleEncoder->add(scales.data(), numValues, notNull);
-
- decStats->setHasNull(hasNull);
}
void Decimal64ColumnWriter::flush(std::vector<proto::Stream>& streams) {
@@ -1591,10 +1613,10 @@ namespace orc {
if (decStats == nullptr) {
throw InvalidArgument("Failed to cast to DecimalColumnStatisticsImpl");
}
- bool hasNull = false;
// The current encoding of decimal columns stores the integer representation
// of the value as an unbounded length zigzag encoded base 128 varint.
+ bool hasNull = false;
for (uint64_t i = 0; i < numValues; ++i) {
if (!notNull || notNull[i]) {
Int128 val = zigZagInt128(values[i]);
@@ -1616,10 +1638,11 @@ namespace orc {
hasNull = true;
}
}
+ if (hasNull) {
+ decStats->setHasNull(true);
+ }
std::vector<int64_t> scales(numValues, static_cast<int64_t>(scale));
scaleEncoder->add(scales.data(), numValues, notNull);
-
- decStats->setHasNull(hasNull);
}
class ListColumnWriter : public ColumnWriter {
@@ -1719,10 +1742,10 @@ namespace orc {
lengthEncoder->add(offsets, numValues, notNull);
if (enableIndex) {
- bool hasNull = false;
if (!notNull) {
colIndexStatistics->increase(numValues);
} else {
+ bool hasNull = false;
for (uint64_t i = 0; i < numValues; ++i) {
if (notNull[i]) {
colIndexStatistics->increase(1);
@@ -1730,8 +1753,10 @@ namespace orc {
hasNull = true;
}
}
+ if (hasNull) {
+ colIndexStatistics->setHasNull(true);
+ }
}
- colIndexStatistics->setHasNull(hasNull);
}
}
@@ -1923,10 +1948,10 @@ namespace orc {
}
if (enableIndex) {
- bool hasNull = false;
if (!notNull) {
colIndexStatistics->increase(numValues);
} else {
+ bool hasNull = false;
for (uint64_t i = 0; i < numValues; ++i) {
if (notNull[i]) {
colIndexStatistics->increase(1);
@@ -1934,8 +1959,10 @@ namespace orc {
hasNull = true;
}
}
+ if (hasNull) {
+ colIndexStatistics->setHasNull(true);
+ }
}
- colIndexStatistics->setHasNull(hasNull);
}
}
@@ -2153,10 +2180,10 @@ namespace orc {
// update stats
if (enableIndex) {
- bool hasNull = false;
if (!notNull) {
colIndexStatistics->increase(numValues);
} else {
+ bool hasNull = false;
for (uint64_t i = 0; i < numValues; ++i) {
if (notNull[i]) {
colIndexStatistics->increase(1);
@@ -2164,8 +2191,10 @@ namespace orc {
hasNull = true;
}
}
+ if (hasNull) {
+ colIndexStatistics->setHasNull(true);
+ }
}
- colIndexStatistics->setHasNull(hasNull);
}
}
diff --git a/c++/src/Statistics.hh b/c++/src/Statistics.hh
index 8122758..d8cc765 100644
--- a/c++/src/Statistics.hh
+++ b/c++/src/Statistics.hh
@@ -418,6 +418,8 @@ namespace orc {
proto::BucketStatistics* bucketStats = pbStats.mutable_bucketstatistics();
if (_hasCount) {
bucketStats->add_count(_trueCount);
+ } else {
+ bucketStats->clear_count();
}
}
@@ -519,11 +521,14 @@ namespace orc {
pbStats.set_hasnull(_stats.hasNull());
pbStats.set_numberofvalues(_stats.getNumberOfValues());
+ proto::DateStatistics* dateStatistics =
+ pbStats.mutable_datestatistics();
if (_stats.hasMinimum()) {
- proto::DateStatistics* dateStatistics =
- pbStats.mutable_datestatistics();
dateStatistics->set_maximum(_stats.getMaximum());
dateStatistics->set_minimum(_stats.getMinimum());
+ } else {
+ dateStatistics->clear_minimum();
+ dateStatistics->clear_maximum();
}
}
@@ -662,9 +667,14 @@ namespace orc {
if (_stats.hasMinimum()) {
decStats->set_minimum(_stats.getMinimum().toString());
decStats->set_maximum(_stats.getMaximum().toString());
+ } else {
+ decStats->clear_minimum();
+ decStats->clear_maximum();
}
if (_stats.hasSum()) {
decStats->set_sum(_stats.getSum().toString());
+ } else {
+ decStats->clear_sum();
}
}
@@ -836,9 +846,14 @@ namespace orc {
if (_stats.hasMinimum()) {
doubleStats->set_minimum(_stats.getMinimum());
doubleStats->set_maximum(_stats.getMaximum());
+ } else {
+ doubleStats->clear_minimum();
+ doubleStats->clear_maximum();
}
if (_stats.hasSum()) {
doubleStats->set_sum(_stats.getSum());
+ } else {
+ doubleStats->clear_sum();
}
}
@@ -980,9 +995,14 @@ namespace orc {
if (_stats.hasMinimum()) {
intStats->set_minimum(_stats.getMinimum());
intStats->set_maximum(_stats.getMaximum());
+ } else {
+ intStats->clear_minimum();
+ intStats->clear_maximum();
}
if (_stats.hasSum()) {
intStats->set_sum(_stats.getSum());
+ } else {
+ intStats->clear_sum();
}
}
@@ -1148,9 +1168,14 @@ namespace orc {
if (_stats.hasMinimum()) {
strStats->set_minimum(_stats.getMinimum());
strStats->set_maximum(_stats.getMaximum());
+ } else {
+ strStats->clear_minimum();
+ strStats->clear_maximum();
}
if (_stats.hasTotalLength()) {
strStats->set_sum(static_cast<int64_t>(_stats.getTotalLength()));
+ } else {
+ strStats->clear_sum();
}
}
@@ -1267,11 +1292,14 @@ namespace orc {
pbStats.set_hasnull(_stats.hasNull());
pbStats.set_numberofvalues(_stats.getNumberOfValues());
+ proto::TimestampStatistics* tsStats =
+ pbStats.mutable_timestampstatistics();
if (_stats.hasMinimum()) {
- proto::TimestampStatistics* tsStats =
- pbStats.mutable_timestampstatistics();
tsStats->set_minimumutc(_stats.getMinimum());
tsStats->set_maximumutc(_stats.getMaximum());
+ } else {
+ tsStats->clear_minimumutc();
+ tsStats->clear_maximumutc();
}
}
diff --git a/c++/src/Writer.cc b/c++/src/Writer.cc
index fee3318..6218769 100644
--- a/c++/src/Writer.cc
+++ b/c++/src/Writer.cc
@@ -40,7 +40,7 @@ namespace orc {
bool enableIndex;
WriterOptionsPrivate() :
- fileVersion(FileVersion::v_0_12()) { // default to Hive_0_11
+ fileVersion(FileVersion::v_0_12()) { // default to Hive_0_12
stripeSize = 64 * 1024 * 1024; // 64M
compressionBlockSize = 64 * 1024; // 64K
rowIndexStride = 10000;