You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@orc.apache.org by om...@apache.org on 2017/05/16 16:39:31 UTC

orc git commit: ORC-185 : [C++] Simplify Statistics Implementation

Repository: orc
Updated Branches:
  refs/heads/master 02055f733 -> 68994174f


ORC-185 : [C++] Simplify Statistics Implementation

Fixes #120

Signed-off-by: Owen O'Malley <om...@apache.org>


Project: http://git-wip-us.apache.org/repos/asf/orc/repo
Commit: http://git-wip-us.apache.org/repos/asf/orc/commit/68994174
Tree: http://git-wip-us.apache.org/repos/asf/orc/tree/68994174
Diff: http://git-wip-us.apache.org/repos/asf/orc/diff/68994174

Branch: refs/heads/master
Commit: 68994174fe254bc581ee0ae1486d023032bfb520
Parents: 02055f7
Author: Deepak Majeti <de...@hpe.com>
Authored: Tue May 2 13:39:13 2017 -0400
Committer: Owen O'Malley <om...@apache.org>
Committed: Tue May 16 09:39:02 2017 -0700

----------------------------------------------------------------------
 c++/include/orc/Statistics.hh         |   6 +
 c++/include/orc/Vector.hh             |   1 +
 c++/src/Statistics.cc                 | 192 ++++++-------
 c++/src/Statistics.hh                 | 445 +++++++++++++++++------------
 c++/src/Vector.cc                     |   4 +
 c++/test/TestStripeIndexStatistics.cc |  12 +-
 c++/test/TestTimestampStatistics.cc   |   4 +-
 7 files changed, 377 insertions(+), 287 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/orc/blob/68994174/c++/include/orc/Statistics.hh
----------------------------------------------------------------------
diff --git a/c++/include/orc/Statistics.hh b/c++/include/orc/Statistics.hh
index d6bc05a..a108d35 100644
--- a/c++/include/orc/Statistics.hh
+++ b/c++/include/orc/Statistics.hh
@@ -40,6 +40,12 @@ namespace orc {
     virtual uint64_t getNumberOfValues() const = 0;
 
     /**
+     * Check whether the column has any null values
+     * @return true if the column has a null value
+     */
+    virtual bool hasNull() const = 0;
+
+    /**
      * print out statistics of column if any
      */
     virtual std::string toString() const = 0;

http://git-wip-us.apache.org/repos/asf/orc/blob/68994174/c++/include/orc/Vector.hh
----------------------------------------------------------------------
diff --git a/c++/include/orc/Vector.hh b/c++/include/orc/Vector.hh
index 8f6a0da..f3f1343 100644
--- a/c++/include/orc/Vector.hh
+++ b/c++/include/orc/Vector.hh
@@ -187,6 +187,7 @@ namespace orc {
   struct Decimal {
     Decimal(const Int128& value, int32_t scale);
     explicit Decimal(const std::string& value);
+    Decimal();
 
     std::string toString() const;
     Int128 value;

http://git-wip-us.apache.org/repos/asf/orc/blob/68994174/c++/src/Statistics.cc
----------------------------------------------------------------------
diff --git a/c++/src/Statistics.cc b/c++/src/Statistics.cc
index 9bbc3f1..f5a3e7b 100644
--- a/c++/src/Statistics.cc
+++ b/c++/src/Statistics.cc
@@ -169,187 +169,175 @@ namespace orc {
 
   ColumnStatisticsImpl::ColumnStatisticsImpl
   (const proto::ColumnStatistics& pb) {
-    valueCount = pb.numberofvalues();
+    _stats.setNumberOfValues(pb.numberofvalues());
+    _stats.setHasNull(pb.hasnull());
   }
 
   BinaryColumnStatisticsImpl::BinaryColumnStatisticsImpl
   (const proto::ColumnStatistics& pb, const StatContext& statContext){
-    valueCount = pb.numberofvalues();
-    if (!pb.has_binarystatistics() || !statContext.correctStats) {
-      _hasTotalLength = false;
-
-      totalLength = 0;
-    }else{
-      _hasTotalLength = pb.binarystatistics().has_sum();
-      totalLength = static_cast<uint64_t>(pb.binarystatistics().sum());
+    _stats.setNumberOfValues(pb.numberofvalues());
+    _stats.setHasNull(pb.hasnull());
+    if (pb.has_binarystatistics() && statContext.correctStats) {
+      _stats.setHasTotalLength(pb.binarystatistics().has_sum());
+      _stats.setTotalLength(
+          static_cast<uint64_t>(pb.binarystatistics().sum()));
     }
   }
 
   BooleanColumnStatisticsImpl::BooleanColumnStatisticsImpl
   (const proto::ColumnStatistics& pb, const StatContext& statContext){
-    valueCount = pb.numberofvalues();
-    if (!pb.has_bucketstatistics() || !statContext.correctStats) {
-      _hasCount = false;
-      trueCount = 0;
-    }else{
-      _hasCount = true;
-      trueCount = pb.bucketstatistics().count(0);
+    _stats.setNumberOfValues(pb.numberofvalues());
+    _stats.setHasNull(pb.hasnull());
+    if (pb.has_bucketstatistics() && statContext.correctStats) {
+      _stats.setHasSum(true);
+      _stats.setSum(pb.bucketstatistics().count(0));
     }
   }
 
   DateColumnStatisticsImpl::DateColumnStatisticsImpl
   (const proto::ColumnStatistics& pb, const StatContext& statContext){
-    valueCount = pb.numberofvalues();
+    _stats.setNumberOfValues(pb.numberofvalues());
+    _stats.setHasNull(pb.hasnull());
     if (!pb.has_datestatistics() || !statContext.correctStats) {
-      _hasMinimum = false;
-      _hasMaximum = false;
-
-      minimum = 0;
-      maximum = 0;
+      // _hasMinimum is false by default
+      // _hasMaximum is false by default
+      _stats.setMinimum(0);
+      _stats.setMaximum(0);
     } else {
-      _hasMinimum = pb.datestatistics().has_minimum();
-      _hasMaximum = pb.datestatistics().has_maximum();
-      minimum = pb.datestatistics().minimum();
-      maximum = pb.datestatistics().maximum();
+      _stats.setHasMinimum(pb.datestatistics().has_minimum());
+      _stats.setHasMaximum(pb.datestatistics().has_maximum());
+      _stats.setMinimum(pb.datestatistics().minimum());
+      _stats.setMaximum(pb.datestatistics().maximum());
     }
   }
 
   DecimalColumnStatisticsImpl::DecimalColumnStatisticsImpl
   (const proto::ColumnStatistics& pb, const StatContext& statContext){
-    valueCount = pb.numberofvalues();
-    if (!pb.has_decimalstatistics() || !statContext.correctStats) {
-      _hasMinimum = false;
-      _hasMaximum = false;
-      _hasSum = false;
-    }else{
+    _stats.setNumberOfValues(pb.numberofvalues());
+    _stats.setHasNull(pb.hasnull());
+    if (pb.has_decimalstatistics() && statContext.correctStats) {
       const proto::DecimalStatistics& stats = pb.decimalstatistics();
-      _hasMinimum = stats.has_minimum();
-      _hasMaximum = stats.has_maximum();
-      _hasSum = stats.has_sum();
+      _stats.setHasMinimum(stats.has_minimum());
+      _stats.setHasMaximum(stats.has_maximum());
+      _stats.setHasSum(stats.has_sum());
 
-      minimum = stats.minimum();
-      maximum = stats.maximum();
-      sum = stats.sum();
+      _stats.setMinimum(Decimal(stats.minimum()));
+      _stats.setMaximum(Decimal(stats.maximum()));
+      _stats.setSum(Decimal(stats.sum()));
     }
   }
 
   DoubleColumnStatisticsImpl::DoubleColumnStatisticsImpl
   (const proto::ColumnStatistics& pb){
-    valueCount = pb.numberofvalues();
+    _stats.setNumberOfValues(pb.numberofvalues());
+    _stats.setHasNull(pb.hasnull());
     if (!pb.has_doublestatistics()) {
-      _hasMinimum = false;
-      _hasMaximum = false;
-      _hasSum = false;
-
-      minimum = 0;
-      maximum = 0;
-      sum = 0;
+      _stats.setMinimum(0);
+      _stats.setMaximum(0);
+      _stats.setSum(0);
     }else{
       const proto::DoubleStatistics& stats = pb.doublestatistics();
-      _hasMinimum = stats.has_minimum();
-      _hasMaximum = stats.has_maximum();
-      _hasSum = stats.has_sum();
+      _stats.setHasMinimum(stats.has_minimum());
+      _stats.setHasMaximum(stats.has_maximum());
+      _stats.setHasSum(stats.has_sum());
 
-      minimum = stats.minimum();
-      maximum = stats.maximum();
-      sum = stats.sum();
+      _stats.setMinimum(stats.minimum());
+      _stats.setMaximum(stats.maximum());
+      _stats.setSum(stats.sum());
     }
   }
 
   IntegerColumnStatisticsImpl::IntegerColumnStatisticsImpl
   (const proto::ColumnStatistics& pb){
-    valueCount = pb.numberofvalues();
+    _stats.setNumberOfValues(pb.numberofvalues());
+    _stats.setHasNull(pb.hasnull());
     if (!pb.has_intstatistics()) {
-      _hasMinimum = false;
-      _hasMaximum = false;
-      _hasSum = false;
-
-      minimum = 0;
-      maximum = 0;
-      sum = 0;
+      _stats.setMinimum(0);
+      _stats.setMaximum(0);
+      _stats.setSum(0);
     }else{
       const proto::IntegerStatistics& stats = pb.intstatistics();
-      _hasMinimum = stats.has_minimum();
-      _hasMaximum = stats.has_maximum();
-      _hasSum = stats.has_sum();
+      _stats.setHasMinimum(stats.has_minimum());
+      _stats.setHasMaximum(stats.has_maximum());
+      _stats.setHasSum(stats.has_sum());
 
-      minimum = stats.minimum();
-      maximum = stats.maximum();
-      sum = stats.sum();
+      _stats.setMinimum(stats.minimum());
+      _stats.setMaximum(stats.maximum());
+      _stats.setSum(stats.sum());
     }
   }
 
   StringColumnStatisticsImpl::StringColumnStatisticsImpl
   (const proto::ColumnStatistics& pb, const StatContext& statContext){
-    valueCount = pb.numberofvalues();
+    _stats.setNumberOfValues(pb.numberofvalues());
+    _stats.setHasNull(pb.hasnull());
     if (!pb.has_stringstatistics() || !statContext.correctStats) {
-      _hasMinimum = false;
-      _hasMaximum = false;
-      _hasTotalLength = false;
-
-      totalLength = 0;
+      _stats.setTotalLength(0);
     }else{
       const proto::StringStatistics& stats = pb.stringstatistics();
-      _hasMinimum = stats.has_minimum();
-      _hasMaximum = stats.has_maximum();
-      _hasTotalLength = stats.has_sum();
+      _stats.setHasMinimum(stats.has_minimum());
+      _stats.setHasMaximum(stats.has_maximum());
+      _stats.setHasTotalLength(stats.has_sum());
 
-      minimum = stats.minimum();
-      maximum = stats.maximum();
-      totalLength = static_cast<uint64_t>(stats.sum());
+      _stats.setMinimum(stats.minimum());
+      _stats.setMaximum(stats.maximum());
+      _stats.setTotalLength(static_cast<uint64_t>(stats.sum()));
     }
   }
 
   TimestampColumnStatisticsImpl::TimestampColumnStatisticsImpl
   (const proto::ColumnStatistics& pb, const StatContext& statContext) {
-    valueCount = pb.numberofvalues();
+    _stats.setNumberOfValues(pb.numberofvalues());
+    _stats.setHasNull(pb.hasnull());
     if (!pb.has_timestampstatistics() || !statContext.correctStats) {
-      _hasMinimum = false;
-      _hasMaximum = false;
-      _hasLowerBound = false;
-      _hasUpperBound = false;
-      minimum = 0;
-      maximum = 0;
-      lowerBound = 0;
-      upperBound = 0;
+      _stats.setMinimum(0);
+      _stats.setMaximum(0);
+      _lowerBound = 0;
+      _upperBound = 0;
     }else{
       const proto::TimestampStatistics& stats = pb.timestampstatistics();
-      _hasMinimum = stats.has_minimumutc() || (stats.has_minimum() && (statContext.writerTimezone != NULL));
-      _hasMaximum = stats.has_maximumutc() || (stats.has_maximum() && (statContext.writerTimezone != NULL));
+      _stats.setHasMinimum(
+                  stats.has_minimumutc() || (stats.has_minimum() && (statContext.writerTimezone != NULL)));
+      _stats.setHasMaximum(
+                  stats.has_maximumutc() || (stats.has_maximum() && (statContext.writerTimezone != NULL)));
       _hasLowerBound = stats.has_minimumutc() || stats.has_minimum();
       _hasUpperBound = stats.has_maximumutc() || stats.has_maximum();
 
       // Timestamp stats are stored in milliseconds
       if (stats.has_minimumutc()) {
-        minimum = stats.minimumutc();
-        lowerBound = minimum;
+        int64_t minimum = stats.minimumutc();
+        _stats.setMinimum(minimum);
+        _lowerBound = minimum;
       } else if (statContext.writerTimezone) {
         int64_t writerTimeSec = stats.minimum() / 1000;
         // multiply the offset by 1000 to convert to millisecond
-        minimum = stats.minimum() + (statContext.writerTimezone->getVariant(writerTimeSec).gmtOffset) * 1000;
-        lowerBound = minimum;
+        int64_t minimum = stats.minimum() + (statContext.writerTimezone->getVariant(writerTimeSec).gmtOffset) * 1000;
+        _stats.setMinimum(minimum);
+        _lowerBound = minimum;
       } else {
-        minimum = 0;
+        _stats.setMinimum(0);
         // subtract 1 day 1 hour (25 hours) in milliseconds to handle unknown TZ and daylight savings
-        lowerBound = stats.minimum() - (25 * SECONDS_PER_HOUR * 1000);
+        _lowerBound = stats.minimum() - (25 * SECONDS_PER_HOUR * 1000);
       }
 
       // Timestamp stats are stored in milliseconds
       if (stats.has_maximumutc()) {
-        maximum = stats.maximumutc();
-        upperBound = maximum;
+        int64_t maximum = stats.maximumutc();
+        _stats.setMaximum(maximum);
+        _upperBound = maximum;
       } else if (statContext.writerTimezone) {
         int64_t writerTimeSec = stats.maximum() / 1000;
         // multiply the offset by 1000 to convert to millisecond
-        maximum = stats.maximum() + (statContext.writerTimezone->getVariant(writerTimeSec).gmtOffset) * 1000;
-        upperBound = maximum;
+        int64_t maximum = stats.maximum() + (statContext.writerTimezone->getVariant(writerTimeSec).gmtOffset) * 1000;
+        _stats.setMaximum(maximum);
+        _upperBound = maximum;
       } else {
-        maximum = 0;
+        _stats.setMaximum(0);
         // add 1 day 1 hour (25 hours) in milliseconds to handle unknown TZ and daylight savings
-        upperBound = stats.maximum() +  (25 * SECONDS_PER_HOUR * 1000);
+        _upperBound = stats.maximum() +  (25 * SECONDS_PER_HOUR * 1000);
       }
       // Add 1 millisecond to account for microsecond precision of values
-      upperBound += 1;
+      _upperBound += 1;
     }
   }
 

http://git-wip-us.apache.org/repos/asf/orc/blob/68994174/c++/src/Statistics.hh
----------------------------------------------------------------------
diff --git a/c++/src/Statistics.hh b/c++/src/Statistics.hh
index 3a5996b..54699ed 100644
--- a/c++/src/Statistics.hh
+++ b/c++/src/Statistics.hh
@@ -41,49 +41,141 @@ namespace orc {
   };
 
 /**
+ * Internal Statistics Implementation
+ */
+
+  template <typename T>
+  class InternalStatisticsImpl {
+  private:
+    bool _hasNull;
+    bool _hasMinimum;
+    bool _hasMaximum;
+    bool _hasSum;
+    bool _hasTotalLength;
+    uint64_t _totalLength;
+    uint64_t _valueCount;
+    T _minimum;
+    T _maximum;
+    T _sum;
+  public:
+    InternalStatisticsImpl() {
+      _hasNull = false;
+      _hasMinimum = false;
+      _hasMaximum = false;
+      _hasSum = false;
+      _hasTotalLength = false;
+      _totalLength = 0;
+      _valueCount = 0;
+    }
+
+    ~InternalStatisticsImpl() {}
+
+    // GET / SET totalLength_
+    bool hasTotalLength() const { return _hasTotalLength; }
+
+    void setHasTotalLength(bool hasTotalLength) { _hasTotalLength = hasTotalLength; }
+
+    uint64_t getTotalLength() const { return _totalLength; }
+
+    void setTotalLength(uint64_t totalLength) { _totalLength = totalLength; }
+
+    // GET / SET sum_
+    bool hasSum() const { return _hasSum; }
+
+    void setHasSum(bool hasSum) { _hasSum = hasSum; }
+
+    T getSum() const { return _sum; }
+
+    void setSum(T sum) { _sum = sum; }
+
+    // GET / SET maximum_
+    bool hasMaximum() const { return _hasMaximum; }
+
+    T getMaximum() const { return _maximum; }
+
+    void setHasMaximum(bool hasMax) { _hasMaximum = hasMax; }
+
+    void setMaximum(T max) { _maximum = max; }
+
+    // GET / SET minimum_
+    bool hasMinimum() const { return _hasMinimum; }
+
+    void setHasMinimum(bool hasMin) { _hasMinimum = hasMin; }
+
+    T getMinimum() const { return _minimum; }
+
+    void setMinimum(T min) { _minimum = min; }
+
+    // GET / SET valueCount_
+    uint64_t getNumberOfValues() const { return _valueCount; }
+
+    void setNumberOfValues(uint64_t numValues) { _valueCount = numValues; }
+
+    // GET / SET hasNullValue_
+    bool hasNull() const { return _hasNull; }
+
+    void setHasNull(bool hasNull) { _hasNull = hasNull; }
+   };
+
+  typedef InternalStatisticsImpl<char> InternalCharStatistics;
+  typedef InternalStatisticsImpl<uint64_t> InternalBooleanStatistics;
+  typedef InternalStatisticsImpl<int64_t> InternalIntegerStatistics;
+  typedef InternalStatisticsImpl<int32_t> InternalDateStatistics;
+  typedef InternalStatisticsImpl<double> InternalDoubleStatistics;
+  typedef InternalStatisticsImpl<Decimal> InternalDecimalStatistics;
+  typedef InternalStatisticsImpl<std::string> InternalStringStatistics;
+
+/**
  * ColumnStatistics Implementation
  */
 
   class ColumnStatisticsImpl: public ColumnStatistics {
   private:
-    uint64_t valueCount;
-
+    InternalCharStatistics _stats;
   public:
     ColumnStatisticsImpl(const proto::ColumnStatistics& stats);
     virtual ~ColumnStatisticsImpl();
 
     uint64_t getNumberOfValues() const override {
-      return valueCount;
+      return _stats.getNumberOfValues();
+    }
+
+    bool hasNull() const override {
+      return _stats.hasNull();
     }
 
     std::string toString() const override {
       std::ostringstream buffer;
-      buffer << "Column has " << valueCount << " values" << std::endl;
+      buffer << "Column has " << getNumberOfValues() << " values"
+             << " and has null value: " << (hasNull() ? "yes" : "no")
+             << std::endl;
       return buffer.str();
     }
   };
 
   class BinaryColumnStatisticsImpl: public BinaryColumnStatistics {
   private:
-    bool _hasTotalLength;
-    uint64_t valueCount;
-    uint64_t totalLength;
-
+    InternalCharStatistics _stats;
   public:
     BinaryColumnStatisticsImpl(const proto::ColumnStatistics& stats,
                                const StatContext& statContext);
     virtual ~BinaryColumnStatisticsImpl();
 
-    bool hasTotalLength() const override {
-      return _hasTotalLength;
-    }
     uint64_t getNumberOfValues() const override {
-      return valueCount;
+      return _stats.getNumberOfValues();
+    }
+
+    bool hasNull() const override {
+      return _stats.hasNull();
+    }
+
+    bool hasTotalLength() const override {
+      return _stats.hasTotalLength();
     }
 
     uint64_t getTotalLength() const override {
-      if(_hasTotalLength){
-        return totalLength;
+      if(hasTotalLength()){
+        return _stats.getTotalLength();
       }else{
         throw ParseError("Total length is not defined.");
       }
@@ -92,9 +184,10 @@ namespace orc {
     std::string toString() const override {
       std::ostringstream buffer;
       buffer << "Data type: Binary" << std::endl
-             << "Values: " << valueCount << std::endl;
-      if(_hasTotalLength){
-        buffer << "Total length: " << totalLength << std::endl;
+             << "Values: " << getNumberOfValues() << std::endl
+             << "Has null: " << (hasNull() ? "yes" : "no") << std::endl;
+      if(hasTotalLength()){
+        buffer << "Total length: " << getTotalLength() << std::endl;
       }else{
         buffer << "Total length: not defined" << std::endl;
       }
@@ -104,33 +197,35 @@ namespace orc {
 
   class BooleanColumnStatisticsImpl: public BooleanColumnStatistics {
   private:
-    bool _hasCount;
-    uint64_t valueCount;
-    uint64_t trueCount;
+    InternalBooleanStatistics _stats;
 
   public:
     BooleanColumnStatisticsImpl(const proto::ColumnStatistics& stats, const StatContext& statContext);
     virtual ~BooleanColumnStatisticsImpl();
 
     bool hasCount() const override {
-      return _hasCount;
+      return _stats.hasSum();
     }
 
     uint64_t getNumberOfValues() const override {
-      return valueCount;
+      return _stats.getNumberOfValues();
+    }
+
+    bool hasNull() const override {
+      return _stats.hasNull();
     }
 
     uint64_t getFalseCount() const override {
-      if(_hasCount){
-        return valueCount - trueCount;
+      if(hasCount()){
+        return getNumberOfValues() - _stats.getSum();
       }else{
         throw ParseError("False count is not defined.");
       }
     }
 
     uint64_t getTrueCount() const override {
-      if(_hasCount){
-        return trueCount;
+      if(hasCount()){
+        return _stats.getSum();
       }else{
         throw ParseError("True count is not defined.");
       }
@@ -139,10 +234,11 @@ namespace orc {
     std::string toString() const override {
       std::ostringstream buffer;
       buffer << "Data type: Boolean" << std::endl
-             << "Values: " << valueCount << std::endl;
-      if(_hasCount){
-        buffer << "(true: " << trueCount << "; false: "
-               << valueCount - trueCount << ")" << std::endl;
+             << "Values: " << getNumberOfValues() << std::endl
+             << "Has null: " << (hasNull() ? "yes" : "no") << std::endl;
+      if(hasCount()){
+        buffer << "(true: " << getTrueCount() << "; false: "
+               << getFalseCount() << ")" << std::endl;
       } else {
         buffer << "(true: not defined; false: not defined)" << std::endl;
         buffer << "True and false count are not defined" << std::endl;
@@ -153,39 +249,38 @@ namespace orc {
 
   class DateColumnStatisticsImpl: public DateColumnStatistics {
   private:
-    bool _hasMinimum;
-    bool _hasMaximum;
-    uint64_t valueCount;
-    int32_t minimum;
-    int32_t maximum;
-
+    InternalDateStatistics _stats; 
   public:
     DateColumnStatisticsImpl(const proto::ColumnStatistics& stats, const StatContext& statContext);
     virtual ~DateColumnStatisticsImpl();
 
     bool hasMinimum() const override {
-      return _hasMinimum;
+      return _stats.hasMinimum();
     }
 
     bool hasMaximum() const override {
-      return _hasMaximum;
+      return _stats.hasMaximum();
     }
 
     uint64_t getNumberOfValues() const override {
-      return valueCount;
+      return _stats.getNumberOfValues();
+    }
+
+    bool hasNull() const override {
+      return _stats.hasNull();
     }
 
     int32_t getMinimum() const override {
-      if(_hasMinimum){
-        return minimum;
+      if(hasMinimum()){
+        return _stats.getMinimum();
       }else{
         throw ParseError("Minimum is not defined.");
       }
     }
 
     int32_t getMaximum() const override {
-      if(_hasMaximum){
-        return maximum;
+      if(hasMaximum()){
+        return _stats.getMaximum();
       }else{
         throw ParseError("Maximum is not defined.");
       }
@@ -194,15 +289,16 @@ namespace orc {
     std::string toString() const override {
       std::ostringstream buffer;
       buffer << "Data type: Date" << std::endl
-             << "Values: " << valueCount << std::endl;
-      if(_hasMinimum){
-        buffer << "Minimum: " << minimum << std::endl;
+             << "Values: " << getNumberOfValues() << std::endl
+             << "Has null: " << (hasNull() ? "yes" : "no") << std::endl;
+      if(hasMinimum()){
+        buffer << "Minimum: " << getMinimum() << std::endl;
       }else{
         buffer << "Minimum: not defined" << std::endl;
       }
 
-      if(_hasMaximum){
-        buffer << "Maximum: " << maximum << std::endl;
+      if(hasMaximum()){
+        buffer << "Maximum: " << getMaximum() << std::endl;
       }else{
         buffer << "Maximum: not defined" << std::endl;
       }
@@ -212,53 +308,51 @@ namespace orc {
 
   class DecimalColumnStatisticsImpl: public DecimalColumnStatistics {
   private:
-    bool _hasMinimum;
-    bool _hasMaximum;
-    bool _hasSum;
-    uint64_t valueCount;
-    std::string minimum;
-    std::string maximum;
-    std::string sum;
+    InternalDecimalStatistics _stats; 
 
   public:
     DecimalColumnStatisticsImpl(const proto::ColumnStatistics& stats, const StatContext& statContext);
     virtual ~DecimalColumnStatisticsImpl();
 
     bool hasMinimum() const override {
-      return _hasMinimum;
+      return _stats.hasMinimum();
     }
 
     bool hasMaximum() const override {
-      return _hasMaximum;
+      return _stats.hasMaximum();
     }
 
     bool hasSum() const override {
-      return _hasSum;
+      return _stats.hasSum();
     }
 
     uint64_t getNumberOfValues() const override {
-      return valueCount;
+      return _stats.getNumberOfValues();
+    }
+
+    bool hasNull() const override {
+      return _stats.hasNull();
     }
 
     Decimal getMinimum() const override {
-      if(_hasMinimum){
-        return Decimal(minimum);
+      if(hasMinimum()){
+        return _stats.getMinimum();
       }else{
         throw ParseError("Minimum is not defined.");
       }
     }
 
     Decimal getMaximum() const override {
-      if(_hasMaximum){
-        return Decimal(maximum);
+      if(hasMaximum()){
+        return _stats.getMaximum();
       }else{
         throw ParseError("Maximum is not defined.");
       }
     }
 
     Decimal getSum() const override {
-      if(_hasSum){
-        return Decimal(sum);
+      if(hasSum()){
+        return _stats.getSum();
       }else{
         throw ParseError("Sum is not defined.");
       }
@@ -267,21 +361,22 @@ namespace orc {
     std::string toString() const override {
       std::ostringstream buffer;
       buffer << "Data type: Decimal" << std::endl
-          << "Values: " << valueCount << std::endl;
-      if(_hasMinimum){
-        buffer << "Minimum: " << minimum << std::endl;
+             << "Values: " << getNumberOfValues() << std::endl
+             << "Has null: " << (hasNull() ? "yes" : "no") << std::endl;
+      if(hasMinimum()){
+        buffer << "Minimum: " << getMinimum().toString() << std::endl;
       }else{
         buffer << "Minimum: not defined" << std::endl;
       }
 
-      if(_hasMaximum){
-        buffer << "Maximum: " << maximum << std::endl;
+      if(hasMaximum()){
+        buffer << "Maximum: " << getMaximum().toString() << std::endl;
       }else{
         buffer << "Maximum: not defined" << std::endl;
       }
 
-      if(_hasSum){
-        buffer << "Sum: " << sum << std::endl;
+      if(hasSum()){
+        buffer << "Sum: " << getSum().toString() << std::endl;
       }else{
         buffer << "Sum: not defined" << std::endl;
       }
@@ -292,53 +387,50 @@ namespace orc {
 
   class DoubleColumnStatisticsImpl: public DoubleColumnStatistics {
   private:
-    bool _hasMinimum;
-    bool _hasMaximum;
-    bool _hasSum;
-    uint64_t valueCount;
-    double minimum;
-    double maximum;
-    double sum;
-
+    InternalDoubleStatistics _stats;
   public:
     DoubleColumnStatisticsImpl(const proto::ColumnStatistics& stats);
     virtual ~DoubleColumnStatisticsImpl();
 
     bool hasMinimum() const override {
-      return _hasMinimum;
+      return _stats.hasMinimum();
     }
 
     bool hasMaximum() const override {
-      return _hasMaximum;
+      return _stats.hasMaximum();
     }
 
     bool hasSum() const override {
-      return _hasSum;
+      return _stats.hasSum();
     }
 
     uint64_t getNumberOfValues() const override {
-      return valueCount;
+      return _stats.getNumberOfValues();
+    }
+
+    bool hasNull() const override {
+      return _stats.hasNull();
     }
 
     double getMinimum() const override {
-      if(_hasMinimum){
-        return minimum;
+      if(hasMinimum()){
+        return _stats.getMinimum();
       }else{
         throw ParseError("Minimum is not defined.");
       }
     }
 
     double getMaximum() const override {
-      if(_hasMaximum){
-        return maximum;
+      if(hasMaximum()){
+        return _stats.getMaximum();
       }else{
         throw ParseError("Maximum is not defined.");
       }
     }
 
     double getSum() const override {
-      if(_hasSum){
-        return sum;
+      if(hasSum()){
+        return _stats.getSum();  // stored sum value; hasSum() is only the presence flag
       }else{
         throw ParseError("Sum is not defined.");
       }
@@ -347,21 +439,22 @@ namespace orc {
     std::string toString() const override {
       std::ostringstream buffer;
       buffer << "Data type: Double" << std::endl
-          << "Values: " << valueCount << std::endl;
-      if(_hasMinimum){
-        buffer << "Minimum: " << minimum << std::endl;
+             << "Values: " << getNumberOfValues() << std::endl
+             << "Has null: " << (hasNull() ? "yes" : "no") << std::endl;
+      if(hasMinimum()){
+        buffer << "Minimum: " << getMinimum() << std::endl;
       }else{
         buffer << "Minimum: not defined" << std::endl;
       }
 
-      if(_hasMaximum){
-        buffer << "Maximum: " << maximum << std::endl;
+      if(hasMaximum()){
+        buffer << "Maximum: " << getMaximum() << std::endl;
       }else{
         buffer << "Maximum: not defined" << std::endl;
       }
 
-      if(_hasSum){
-        buffer << "Sum: " << sum << std::endl;
+      if(hasSum()){
+        buffer << "Sum: " << getSum() << std::endl;
       }else{
         buffer << "Sum: not defined" << std::endl;
       }
@@ -371,53 +464,50 @@ namespace orc {
 
   class IntegerColumnStatisticsImpl: public IntegerColumnStatistics {
   private:
-    bool _hasMinimum;
-    bool _hasMaximum;
-    bool _hasSum;
-    uint64_t valueCount;
-    int64_t minimum;
-    int64_t maximum;
-    int64_t sum;
-
+    InternalIntegerStatistics _stats;
   public:
     IntegerColumnStatisticsImpl(const proto::ColumnStatistics& stats);
     virtual ~IntegerColumnStatisticsImpl();
 
     bool hasMinimum() const override {
-      return _hasMinimum;
+      return _stats.hasMinimum();
     }
 
     bool hasMaximum() const override {
-      return _hasMaximum;
+      return _stats.hasMaximum();
     }
 
     bool hasSum() const override {
-      return _hasSum;
+      return _stats.hasSum();
     }
 
     uint64_t getNumberOfValues() const override {
-      return valueCount;
+      return _stats.getNumberOfValues();
+    }
+
+    bool hasNull() const override {
+      return _stats.hasNull();
     }
 
     int64_t getMinimum() const override {
-      if(_hasMinimum){
-        return minimum;
+      if(hasMinimum()){
+        return _stats.getMinimum();
       }else{
         throw ParseError("Minimum is not defined.");
       }
     }
 
     int64_t getMaximum() const override {
-      if(_hasMaximum){
-        return maximum;
+      if(hasMaximum()){
+        return _stats.getMaximum();
       }else{
         throw ParseError("Maximum is not defined.");
       }
     }
 
     int64_t getSum() const override {
-      if(_hasSum){
-        return sum;
+      if(hasSum()){
+        return _stats.getSum();
       }else{
         throw ParseError("Sum is not defined.");
       }
@@ -426,21 +516,22 @@ namespace orc {
     std::string toString() const override {
       std::ostringstream buffer;
       buffer << "Data type: Integer" << std::endl
-          << "Values: " << valueCount << std::endl;
-      if(_hasMinimum){
-        buffer << "Minimum: " << minimum << std::endl;
+             << "Values: " << getNumberOfValues() << std::endl
+             << "Has null: " << (hasNull() ? "yes" : "no") << std::endl;
+      if(hasMinimum()){
+        buffer << "Minimum: " << getMinimum() << std::endl;
       }else{
         buffer << "Minimum: not defined" << std::endl;
       }
 
-      if(_hasMaximum){
-        buffer << "Maximum: " << maximum << std::endl;
+      if(hasMaximum()){
+        buffer << "Maximum: " << getMaximum() << std::endl;
       }else{
         buffer << "Maximum: not defined" << std::endl;
       }
 
-      if(_hasSum){
-        buffer << "Sum: " << sum << std::endl;
+      if(hasSum()){
+        buffer << "Sum: " << getSum() << std::endl;
       }else{
         buffer << "Sum: not defined" << std::endl;
       }
@@ -450,53 +541,51 @@ namespace orc {
 
   class StringColumnStatisticsImpl: public StringColumnStatistics {
   private:
-    bool _hasMinimum;
-    bool _hasMaximum;
-    bool _hasTotalLength;
-    uint64_t valueCount;
-    std::string minimum;
-    std::string maximum;
-    uint64_t totalLength;
+    InternalStringStatistics _stats;
 
   public:
     StringColumnStatisticsImpl(const proto::ColumnStatistics& stats, const StatContext& statContext);
     virtual ~StringColumnStatisticsImpl();
 
     bool hasMinimum() const override {
-      return _hasMinimum;
+      return _stats.hasMinimum();
     }
 
     bool hasMaximum() const override {
-      return _hasMaximum;
+      return _stats.hasMaximum();
     }
 
     bool hasTotalLength() const override {
-      return _hasTotalLength;
+      return _stats.hasTotalLength();
     }
 
     uint64_t getNumberOfValues() const override {
-      return valueCount;
+      return _stats.getNumberOfValues();
+    }
+
+    bool hasNull() const override {
+      return _stats.hasNull();
     }
 
     std::string getMinimum() const override {
-      if(_hasMinimum){
-        return minimum;
+      if(hasMinimum()){
+        return _stats.getMinimum();
       }else{
         throw ParseError("Minimum is not defined.");
       }
     }
 
     std::string getMaximum() const override {
-      if(_hasMaximum){
-        return maximum;
+      if(hasMaximum()){
+        return _stats.getMaximum();
       }else{
         throw ParseError("Maximum is not defined.");
       }
     }
 
     uint64_t getTotalLength() const override {
-      if(_hasTotalLength){
-        return totalLength;
+      if(hasTotalLength()){
+        return _stats.getTotalLength();
       }else{
         throw ParseError("Total length is not defined.");
       }
@@ -505,21 +594,22 @@ namespace orc {
     std::string toString() const override {
       std::ostringstream buffer;
       buffer << "Data type: String" << std::endl
-          << "Values: " << valueCount << std::endl;
-      if(_hasMinimum){
-        buffer << "Minimum: " << minimum << std::endl;
+             << "Values: " << getNumberOfValues() << std::endl
+             << "Has null: " << (hasNull() ? "yes" : "no") << std::endl;
+      if(hasMinimum()){
+        buffer << "Minimum: " << getMinimum() << std::endl;
       }else{
         buffer << "Minimum is not defined" << std::endl;
       }
 
-      if(_hasMaximum){
-        buffer << "Maximum: " << maximum << std::endl;
+      if(hasMaximum()){
+        buffer << "Maximum: " << getMaximum() << std::endl;
       }else{
         buffer << "Maximum is not defined" << std::endl;
       }
 
-      if(_hasTotalLength){
-        buffer << "Total length: " << totalLength << std::endl;
+      if(hasTotalLength()){
+        buffer << "Total length: " << getTotalLength() << std::endl;
       }else{
         buffer << "Total length is not defined" << std::endl;
       }
@@ -529,15 +619,11 @@ namespace orc {
 
   class TimestampColumnStatisticsImpl: public TimestampColumnStatistics {
   private:
-    bool _hasMinimum;
-    bool _hasMaximum;
-    uint64_t valueCount;
-    int64_t minimum;
-    int64_t maximum;
+    InternalIntegerStatistics _stats;
     bool _hasLowerBound;
     bool _hasUpperBound;
-    int64_t lowerBound;
-    int64_t upperBound;
+    int64_t _lowerBound;
+    int64_t _upperBound;
 
   public:
     TimestampColumnStatisticsImpl(const proto::ColumnStatistics& stats,
@@ -545,28 +631,32 @@ namespace orc {
     virtual ~TimestampColumnStatisticsImpl();
 
     bool hasMinimum() const override {
-      return _hasMinimum;
+      return _stats.hasMinimum();
     }
 
     bool hasMaximum() const override {
-      return _hasMaximum;
+      return _stats.hasMaximum();
     }
 
     uint64_t getNumberOfValues() const override {
-      return valueCount;
+      return _stats.getNumberOfValues();
+    }
+
+    bool hasNull() const override {
+      return _stats.hasNull();
     }
 
     int64_t getMinimum() const override {
-      if(_hasMinimum){
-        return minimum;
+      if(hasMinimum()){
+        return _stats.getMinimum();
       }else{
         throw ParseError("Minimum is not defined.");
       }
     }
 
     int64_t getMaximum() const override {
-      if(_hasMaximum){
-        return maximum;
+      if(hasMaximum()){
+        return _stats.getMaximum();
       }else{
         throw ParseError("Maximum is not defined.");
       }
@@ -579,39 +669,40 @@ namespace orc {
       time_t secs = 0;
 
       buffer << "Data type: Timestamp" << std::endl
-          << "Values: " << valueCount << std::endl;
-      if(_hasMinimum){
-        secs = static_cast<time_t>(minimum/1000);
+             << "Values: " << getNumberOfValues() << std::endl
+             << "Has null: " << (hasNull() ? "yes" : "no") << std::endl;
+      if(hasMinimum()){
+        secs = static_cast<time_t>(getMinimum() / 1000);
         gmtime_r(&secs, &tmValue);
         strftime(timeBuffer, sizeof(timeBuffer), "%Y-%m-%d %H:%M:%S", &tmValue);
-        buffer << "Minimum: " << timeBuffer << "." << (minimum % 1000) << std::endl;
+        buffer << "Minimum: " << timeBuffer << "." << (getMinimum() % 1000) << std::endl;
       }else{
         buffer << "Minimum is not defined" << std::endl;
       }
 
-      if(_hasLowerBound){
-        secs = static_cast<time_t>(lowerBound/1000);
+      if(hasLowerBound()){
+        secs = static_cast<time_t>(getLowerBound() / 1000);
         gmtime_r(&secs, &tmValue);
         strftime(timeBuffer, sizeof(timeBuffer), "%Y-%m-%d %H:%M:%S", &tmValue);
-        buffer << "LowerBound: " << timeBuffer << "." << (lowerBound % 1000) << std::endl;
+        buffer << "LowerBound: " << timeBuffer << "." << (getLowerBound() % 1000) << std::endl;
       }else{
         buffer << "LowerBound is not defined" << std::endl;
       }
 
-      if(_hasMaximum){
-        secs = static_cast<time_t>(maximum/1000);
+      if(hasMaximum()){
+        secs = static_cast<time_t>(getMaximum()/1000);
         gmtime_r(&secs, &tmValue);
         strftime(timeBuffer, sizeof(timeBuffer), "%Y-%m-%d %H:%M:%S", &tmValue);
-        buffer << "Maximum: " << timeBuffer << "." << (maximum % 1000) << std::endl;
+        buffer << "Maximum: " << timeBuffer << "." << (getMaximum() % 1000) << std::endl;
       }else{
         buffer << "Maximum is not defined" << std::endl;
       }
 
-      if(_hasUpperBound){
-        secs = static_cast<time_t>(upperBound/1000);
+      if(hasUpperBound()){
+        secs = static_cast<time_t>(getUpperBound() / 1000);
         gmtime_r(&secs, &tmValue);
         strftime(timeBuffer, sizeof(timeBuffer), "%Y-%m-%d %H:%M:%S", &tmValue);
-        buffer << "UpperBound: " << timeBuffer << "." << (upperBound % 1000) << std::endl;
+        buffer << "UpperBound: " << timeBuffer << "." << (getUpperBound() % 1000) << std::endl;
       }else{
         buffer << "UpperBound is not defined" << std::endl;
       }
@@ -628,16 +719,16 @@ namespace orc {
     }
 
     int64_t getLowerBound() const override {
-      if(_hasLowerBound){
-        return lowerBound;
+      if(hasLowerBound()){
+        return _lowerBound;
       }else{
         throw ParseError("LowerBound is not defined.");
       }
     }
 
     int64_t getUpperBound() const override {
-      if(_hasUpperBound){
-        return upperBound;
+      if(hasUpperBound()){
+        return _upperBound;
       }else{
         throw ParseError("UpperBound is not defined.");
       }

http://git-wip-us.apache.org/repos/asf/orc/blob/68994174/c++/src/Vector.cc
----------------------------------------------------------------------
diff --git a/c++/src/Vector.cc b/c++/src/Vector.cc
index d5df7f7..2c7e2d3 100644
--- a/c++/src/Vector.cc
+++ b/c++/src/Vector.cc
@@ -396,6 +396,10 @@ namespace orc {
     }
   }
 
+  Decimal::Decimal() : value(0), scale(0) {
+    // PASS
+  }
+
   std::string Decimal::toString() const {
     return value.toDecimalString(scale);
   }

http://git-wip-us.apache.org/repos/asf/orc/blob/68994174/c++/test/TestStripeIndexStatistics.cc
----------------------------------------------------------------------
diff --git a/c++/test/TestStripeIndexStatistics.cc b/c++/test/TestStripeIndexStatistics.cc
index 342bafb..e6607d4 100644
--- a/c++/test/TestStripeIndexStatistics.cc
+++ b/c++/test/TestStripeIndexStatistics.cc
@@ -45,19 +45,19 @@ namespace orc {
 
     const orc::IntegerColumnStatistics* intColStats;
     intColStats = reinterpret_cast<const orc::IntegerColumnStatistics*>(stripeStats->getRowIndexStatistics(1, 0));
-    EXPECT_EQ("Data type: Integer\nValues: 2000\nMinimum: 1\nMaximum: 2000\nSum: 2001000\n", intColStats->toString());
+    EXPECT_EQ("Data type: Integer\nValues: 2000\nHas null: no\nMinimum: 1\nMaximum: 2000\nSum: 2001000\n", intColStats->toString());
     intColStats = reinterpret_cast<const orc::IntegerColumnStatistics*>(stripeStats->getRowIndexStatistics(1, 1));
-    EXPECT_EQ("Data type: Integer\nValues: 2000\nMinimum: 2001\nMaximum: 4000\nSum: 6001000\n", intColStats->toString());
+    EXPECT_EQ("Data type: Integer\nValues: 2000\nHas null: no\nMinimum: 2001\nMaximum: 4000\nSum: 6001000\n", intColStats->toString());
     intColStats = reinterpret_cast<const orc::IntegerColumnStatistics*>(stripeStats->getRowIndexStatistics(1, 2));
-    EXPECT_EQ("Data type: Integer\nValues: 2000\nMinimum: 4001\nMaximum: 6000\nSum: 10001000\n", intColStats->toString());
+    EXPECT_EQ("Data type: Integer\nValues: 2000\nHas null: no\nMinimum: 4001\nMaximum: 6000\nSum: 10001000\n", intColStats->toString());
 
     const orc::StringColumnStatistics* stringColStats;
     stringColStats = reinterpret_cast<const orc::StringColumnStatistics*>(stripeStats->getRowIndexStatistics(2, 0));
-    EXPECT_EQ("Data type: String\nValues: 2000\nMinimum: 1000\nMaximum: 9a\nTotal length: 7892\n", stringColStats->toString());
+    EXPECT_EQ("Data type: String\nValues: 2000\nHas null: no\nMinimum: 1000\nMaximum: 9a\nTotal length: 7892\n", stringColStats->toString());
     stringColStats = reinterpret_cast<const orc::StringColumnStatistics*>(stripeStats->getRowIndexStatistics(2, 1));
-    EXPECT_EQ("Data type: String\nValues: 2000\nMinimum: 2001\nMaximum: 4000\nTotal length: 8000\n", stringColStats->toString());
+    EXPECT_EQ("Data type: String\nValues: 2000\nHas null: no\nMinimum: 2001\nMaximum: 4000\nTotal length: 8000\n", stringColStats->toString());
     stringColStats = reinterpret_cast<const orc::StringColumnStatistics*>(stripeStats->getRowIndexStatistics(2, 2));
-    EXPECT_EQ("Data type: String\nValues: 2000\nMinimum: 4001\nMaximum: 6000\nTotal length: 8000\n", stringColStats->toString());
+    EXPECT_EQ("Data type: String\nValues: 2000\nHas null: no\nMinimum: 4001\nMaximum: 6000\nTotal length: 8000\n", stringColStats->toString());
   }
 
 }  // namespace

http://git-wip-us.apache.org/repos/asf/orc/blob/68994174/c++/test/TestTimestampStatistics.cc
----------------------------------------------------------------------
diff --git a/c++/test/TestTimestampStatistics.cc b/c++/test/TestTimestampStatistics.cc
index b5ce2a7..302ef9b 100644
--- a/c++/test/TestTimestampStatistics.cc
+++ b/c++/test/TestTimestampStatistics.cc
@@ -50,11 +50,11 @@ namespace orc {
     EXPECT_FALSE(footerColStats->hasMaximum());
     EXPECT_TRUE(footerColStats->hasLowerBound());
     EXPECT_TRUE(footerColStats->hasUpperBound());
-    EXPECT_EQ("Data type: Timestamp\nValues: 12\nMinimum is not defined\nLowerBound: 1994-12-31 07:00:00.688\nMaximum is not defined\nUpperBound: 2037-01-02 09:00:00.1\n", footerColStats->toString());
+    EXPECT_EQ("Data type: Timestamp\nValues: 12\nHas null: no\nMinimum is not defined\nLowerBound: 1994-12-31 07:00:00.688\nMaximum is not defined\nUpperBound: 2037-01-02 09:00:00.1\n", footerColStats->toString());
 
     EXPECT_TRUE(stripeColStats->hasMinimum());
     EXPECT_TRUE(stripeColStats->hasMaximum());
-    EXPECT_EQ("Data type: Timestamp\nValues: 12\nMinimum: 1995-01-01 00:00:00.688\nLowerBound: 1995-01-01 00:00:00.688\nMaximum: 2037-01-01 00:00:00.0\nUpperBound: 2037-01-01 00:00:00.1\n", stripeColStats->toString());
+    EXPECT_EQ("Data type: Timestamp\nValues: 12\nHas null: no\nMinimum: 1995-01-01 00:00:00.688\nLowerBound: 1995-01-01 00:00:00.688\nMaximum: 2037-01-01 00:00:00.0\nUpperBound: 2037-01-01 00:00:00.1\n", stripeColStats->toString());
   }
 
 }  // namespace