You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@orc.apache.org by md...@apache.org on 2018/10/17 00:50:49 UTC

[orc] branch master updated: ORC-415: [C++] Fix writing ColumnStatistics (#319)

This is an automated email from the ASF dual-hosted git repository.

mdeepak pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/orc.git


The following commit(s) were added to refs/heads/master by this push:
     new 58e1a33  ORC-415: [C++] Fix writing ColumnStatistics (#319)
58e1a33 is described below

commit 58e1a33024a1f260d5b40ca70402da715026e4ba
Author: Gang Wu <us...@gmail.com>
AuthorDate: Tue Oct 16 17:50:45 2018 -0700

    ORC-415: [C++] Fix writing ColumnStatistics (#319)
    
    * ORC-415: [C++] Fix writing ColumnStatistics
    
    * move out of for-loop
    
    * fix misspelling
---
 c++/src/ColumnWriter.cc | 83 +++++++++++++++++++++++++++++++++----------------
 c++/src/Statistics.hh   | 36 ++++++++++++++++++---
 c++/src/Writer.cc       |  2 +-
 3 files changed, 89 insertions(+), 32 deletions(-)

diff --git a/c++/src/ColumnWriter.cc b/c++/src/ColumnWriter.cc
index 8feb077..9b128e3 100644
--- a/c++/src/ColumnWriter.cc
+++ b/c++/src/ColumnWriter.cc
@@ -280,10 +280,10 @@ namespace orc {
     }
 
     // update stats
-    bool hasNull = false;
     if (!structBatch->hasNulls) {
       colIndexStatistics->increase(numValues);
     } else {
+      bool hasNull = false;
       const char* notNull = structBatch->notNull.data() + offset;
       for (uint64_t i = 0; i < numValues; ++i) {
         if (notNull[i]) {
@@ -292,8 +292,10 @@ namespace orc {
           hasNull = true;
         }
       }
+      if (hasNull) {
+        colIndexStatistics->setHasNull(true);
+      }
     }
-    colIndexStatistics->setHasNull(hasNull);
   }
 
   void StructColumnWriter::flush(std::vector<proto::Stream>& streams) {
@@ -461,7 +463,9 @@ namespace orc {
         hasNull = true;
       }
     }
-    intStats->setHasNull(hasNull);
+    if (hasNull) {
+      intStats->setHasNull(true);
+    }
   }
 
   void IntegerColumnWriter::flush(std::vector<proto::Stream>& streams) {
@@ -564,7 +568,9 @@ namespace orc {
         hasNull = true;
       }
     }
-    intStats->setHasNull(hasNull);
+    if (hasNull) {
+      intStats->setHasNull(true);
+    }
   }
 
   void ByteColumnWriter::flush(std::vector<proto::Stream>& streams) {
@@ -666,7 +672,9 @@ namespace orc {
         hasNull = true;
       }
     }
-    boolStats->setHasNull(hasNull);
+    if (hasNull) {
+      boolStats->setHasNull(true);
+    }
   }
 
   void BooleanColumnWriter::flush(std::vector<proto::Stream>& streams) {
@@ -774,7 +782,6 @@ namespace orc {
     size_t bytes = isFloat ? 4 : 8;
     char* data = buffer.data();
     bool hasNull = false;
-
     for (uint64_t i = 0; i < numValues; ++i) {
       if (!notNull || notNull[i]) {
         if (isFloat) {
@@ -790,7 +797,9 @@ namespace orc {
         hasNull = true;
       }
     }
-    doubleStats->setHasNull(hasNull);
+    if (hasNull) {
+      doubleStats->setHasNull(true);
+    }
   }
 
   void DoubleColumnWriter::flush(std::vector<proto::Stream>& streams) {
@@ -900,7 +909,9 @@ namespace orc {
         hasNull = true;
       }
     }
-    strStats->setHasNull(hasNull);
+    if (hasNull) {
+      strStats->setHasNull(true);
+    }
   }
 
   void StringColumnWriter::flush(std::vector<proto::Stream>& streams) {
@@ -1052,8 +1063,8 @@ namespace orc {
     if (strStats == nullptr) {
       throw InvalidArgument("Failed to cast to StringColumnStatisticsImpl");
     }
-    bool hasNull = false;
 
+    bool hasNull = false;
     for (uint64_t i = 0; i < numValues; ++i) {
       if (!notNull || notNull[i]) {
         const char * charData = nullptr;
@@ -1080,7 +1091,9 @@ namespace orc {
       }
     }
     lengthEncoder->add(length, numValues, notNull);
-    strStats->setHasNull(hasNull);
+    if (hasNull) {
+      strStats->setHasNull(true);
+    }
   }
 
   class VarCharColumnWriter : public StringColumnWriter {
@@ -1120,8 +1133,8 @@ namespace orc {
     if (strStats == nullptr) {
       throw InvalidArgument("Failed to cast to StringColumnStatisticsImpl");
     }
-    bool hasNull = false;
 
+    bool hasNull = false;
     for (uint64_t i = 0; i < numValues; ++i) {
       if (!notNull || notNull[i]) {
         uint64_t itemLength = Utf8Utils::truncateBytesTo(
@@ -1137,7 +1150,9 @@ namespace orc {
       }
     }
     lengthEncoder->add(length, numValues, notNull);
-    strStats->setHasNull(hasNull);
+    if (hasNull) {
+      strStats->setHasNull(true);
+    }
   }
 
   class BinaryColumnWriter : public StringColumnWriter {
@@ -1187,7 +1202,9 @@ namespace orc {
       }
     }
     lengthEncoder->add(length, numValues, notNull);
-    binStats->setHasNull(hasNull);
+    if (hasNull) {
+      binStats->setHasNull(true);
+    }
   }
 
   class TimestampColumnWriter : public ColumnWriter {
@@ -1302,7 +1319,9 @@ namespace orc {
         hasNull = true;
       }
     }
-    tsStats->setHasNull(hasNull);
+    if (hasNull) {
+      tsStats->setHasNull(true);
+    }
 
     secRleEncoder->add(secs, numValues, notNull);
     nanoRleEncoder->add(nanos, numValues, notNull);
@@ -1394,7 +1413,9 @@ namespace orc {
         hasNull = true;
       }
     }
-    dateStats->setHasNull(hasNull);
+    if (hasNull) {
+      dateStats->setHasNull(true);
+    }
   }
 
   class Decimal64ColumnWriter : public ColumnWriter {
@@ -1471,8 +1492,8 @@ namespace orc {
     if (decStats == nullptr) {
       throw InvalidArgument("Failed to cast to DecimalColumnStatisticsImpl");
     }
-    bool hasNull = false;
 
+    bool hasNull = false;
     for (uint64_t i = 0; i < numValues; ++i) {
       if (!notNull || notNull[i]) {
         int64_t val = zigZag(values[i]);
@@ -1495,10 +1516,11 @@ namespace orc {
         hasNull = true;
       }
     }
+    if (hasNull) {
+      decStats->setHasNull(true);
+    }
     std::vector<int64_t> scales(numValues, static_cast<int64_t>(scale));
     scaleEncoder->add(scales.data(), numValues, notNull);
-
-    decStats->setHasNull(hasNull);
   }
 
   void Decimal64ColumnWriter::flush(std::vector<proto::Stream>& streams) {
@@ -1591,10 +1613,10 @@ namespace orc {
     if (decStats == nullptr) {
       throw InvalidArgument("Failed to cast to DecimalColumnStatisticsImpl");
     }
-    bool hasNull = false;
 
     // The current encoding of decimal columns stores the integer representation
     // of the value as an unbounded length zigzag encoded base 128 varint.
+    bool hasNull = false;
     for (uint64_t i = 0; i < numValues; ++i) {
       if (!notNull || notNull[i]) {
         Int128 val = zigZagInt128(values[i]);
@@ -1616,10 +1638,11 @@ namespace orc {
         hasNull = true;
       }
     }
+    if (hasNull) {
+      decStats->setHasNull(true);
+    }
     std::vector<int64_t> scales(numValues, static_cast<int64_t>(scale));
     scaleEncoder->add(scales.data(), numValues, notNull);
-
-    decStats->setHasNull(hasNull);
   }
 
   class ListColumnWriter : public ColumnWriter {
@@ -1719,10 +1742,10 @@ namespace orc {
     lengthEncoder->add(offsets, numValues, notNull);
 
     if (enableIndex) {
-      bool hasNull = false;
       if (!notNull) {
         colIndexStatistics->increase(numValues);
       } else {
+        bool hasNull = false;
         for (uint64_t i = 0; i < numValues; ++i) {
           if (notNull[i]) {
             colIndexStatistics->increase(1);
@@ -1730,8 +1753,10 @@ namespace orc {
             hasNull = true;
           }
         }
+        if (hasNull) {
+          colIndexStatistics->setHasNull(true);
+        }
       }
-      colIndexStatistics->setHasNull(hasNull);
     }
   }
 
@@ -1923,10 +1948,10 @@ namespace orc {
     }
 
     if (enableIndex) {
-      bool hasNull = false;
       if (!notNull) {
         colIndexStatistics->increase(numValues);
       } else {
+        bool hasNull = false;
         for (uint64_t i = 0; i < numValues; ++i) {
           if (notNull[i]) {
             colIndexStatistics->increase(1);
@@ -1934,8 +1959,10 @@ namespace orc {
             hasNull = true;
           }
         }
+        if (hasNull) {
+          colIndexStatistics->setHasNull(true);
+        }
       }
-      colIndexStatistics->setHasNull(hasNull);
     }
   }
 
@@ -2153,10 +2180,10 @@ namespace orc {
 
     // update stats
     if (enableIndex) {
-      bool hasNull = false;
       if (!notNull) {
         colIndexStatistics->increase(numValues);
       } else {
+        bool hasNull = false;
         for (uint64_t i = 0; i < numValues; ++i) {
           if (notNull[i]) {
             colIndexStatistics->increase(1);
@@ -2164,8 +2191,10 @@ namespace orc {
             hasNull = true;
           }
         }
+        if (hasNull) {
+          colIndexStatistics->setHasNull(true);
+        }
       }
-      colIndexStatistics->setHasNull(hasNull);
     }
   }
 
diff --git a/c++/src/Statistics.hh b/c++/src/Statistics.hh
index 8122758..d8cc765 100644
--- a/c++/src/Statistics.hh
+++ b/c++/src/Statistics.hh
@@ -418,6 +418,8 @@ namespace orc {
       proto::BucketStatistics* bucketStats = pbStats.mutable_bucketstatistics();
       if (_hasCount) {
         bucketStats->add_count(_trueCount);
+      } else {
+        bucketStats->clear_count();
       }
     }
 
@@ -519,11 +521,14 @@ namespace orc {
       pbStats.set_hasnull(_stats.hasNull());
       pbStats.set_numberofvalues(_stats.getNumberOfValues());
 
+      proto::DateStatistics* dateStatistics =
+        pbStats.mutable_datestatistics();
       if (_stats.hasMinimum()) {
-        proto::DateStatistics* dateStatistics =
-          pbStats.mutable_datestatistics();
         dateStatistics->set_maximum(_stats.getMaximum());
         dateStatistics->set_minimum(_stats.getMinimum());
+      } else {
+        dateStatistics->clear_minimum();
+        dateStatistics->clear_maximum();
       }
     }
 
@@ -662,9 +667,14 @@ namespace orc {
       if (_stats.hasMinimum()) {
         decStats->set_minimum(_stats.getMinimum().toString());
         decStats->set_maximum(_stats.getMaximum().toString());
+      } else {
+        decStats->clear_minimum();
+        decStats->clear_maximum();
       }
       if (_stats.hasSum()) {
         decStats->set_sum(_stats.getSum().toString());
+      } else {
+        decStats->clear_sum();
       }
     }
 
@@ -836,9 +846,14 @@ namespace orc {
       if (_stats.hasMinimum()) {
         doubleStats->set_minimum(_stats.getMinimum());
         doubleStats->set_maximum(_stats.getMaximum());
+      } else {
+        doubleStats->clear_minimum();
+        doubleStats->clear_maximum();
       }
       if (_stats.hasSum()) {
         doubleStats->set_sum(_stats.getSum());
+      } else {
+        doubleStats->clear_sum();
       }
     }
 
@@ -980,9 +995,14 @@ namespace orc {
       if (_stats.hasMinimum()) {
         intStats->set_minimum(_stats.getMinimum());
         intStats->set_maximum(_stats.getMaximum());
+      } else {
+        intStats->clear_minimum();
+        intStats->clear_maximum();
       }
       if (_stats.hasSum()) {
         intStats->set_sum(_stats.getSum());
+      } else {
+        intStats->clear_sum();
       }
     }
 
@@ -1148,9 +1168,14 @@ namespace orc {
       if (_stats.hasMinimum()) {
         strStats->set_minimum(_stats.getMinimum());
         strStats->set_maximum(_stats.getMaximum());
+      } else {
+        strStats->clear_minimum();
+        strStats->clear_maximum();
       }
       if (_stats.hasTotalLength()) {
         strStats->set_sum(static_cast<int64_t>(_stats.getTotalLength()));
+      } else {
+        strStats->clear_sum();
       }
     }
 
@@ -1267,11 +1292,14 @@ namespace orc {
       pbStats.set_hasnull(_stats.hasNull());
       pbStats.set_numberofvalues(_stats.getNumberOfValues());
 
+      proto::TimestampStatistics* tsStats =
+        pbStats.mutable_timestampstatistics();
       if (_stats.hasMinimum()) {
-        proto::TimestampStatistics* tsStats =
-          pbStats.mutable_timestampstatistics();
         tsStats->set_minimumutc(_stats.getMinimum());
         tsStats->set_maximumutc(_stats.getMaximum());
+      } else {
+        tsStats->clear_minimumutc();
+        tsStats->clear_maximumutc();
       }
     }
 
diff --git a/c++/src/Writer.cc b/c++/src/Writer.cc
index fee3318..6218769 100644
--- a/c++/src/Writer.cc
+++ b/c++/src/Writer.cc
@@ -40,7 +40,7 @@ namespace orc {
     bool enableIndex;
 
     WriterOptionsPrivate() :
-                            fileVersion(FileVersion::v_0_12()) { // default to Hive_0_11
+                            fileVersion(FileVersion::v_0_12()) { // default to Hive_0_12
       stripeSize = 64 * 1024 * 1024; // 64M
       compressionBlockSize = 64 * 1024; // 64K
       rowIndexStride = 10000;