You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@orc.apache.org by om...@apache.org on 2017/05/16 16:39:31 UTC
orc git commit: ORC-185 : [C++] Simplify Statistics Implementation
Repository: orc
Updated Branches:
refs/heads/master 02055f733 -> 68994174f
ORC-185 : [C++] Simplify Statistics Implementation
Fixes #120
Signed-off-by: Owen O'Malley <om...@apache.org>
Project: http://git-wip-us.apache.org/repos/asf/orc/repo
Commit: http://git-wip-us.apache.org/repos/asf/orc/commit/68994174
Tree: http://git-wip-us.apache.org/repos/asf/orc/tree/68994174
Diff: http://git-wip-us.apache.org/repos/asf/orc/diff/68994174
Branch: refs/heads/master
Commit: 68994174fe254bc581ee0ae1486d023032bfb520
Parents: 02055f7
Author: Deepak Majeti <de...@hpe.com>
Authored: Tue May 2 13:39:13 2017 -0400
Committer: Owen O'Malley <om...@apache.org>
Committed: Tue May 16 09:39:02 2017 -0700
----------------------------------------------------------------------
c++/include/orc/Statistics.hh | 6 +
c++/include/orc/Vector.hh | 1 +
c++/src/Statistics.cc | 192 ++++++-------
c++/src/Statistics.hh | 445 +++++++++++++++++------------
c++/src/Vector.cc | 4 +
c++/test/TestStripeIndexStatistics.cc | 12 +-
c++/test/TestTimestampStatistics.cc | 4 +-
7 files changed, 377 insertions(+), 287 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/orc/blob/68994174/c++/include/orc/Statistics.hh
----------------------------------------------------------------------
diff --git a/c++/include/orc/Statistics.hh b/c++/include/orc/Statistics.hh
index d6bc05a..a108d35 100644
--- a/c++/include/orc/Statistics.hh
+++ b/c++/include/orc/Statistics.hh
@@ -40,6 +40,12 @@ namespace orc {
virtual uint64_t getNumberOfValues() const = 0;
/**
+ * Check whether column has null value
+ * @return true if has null value
+ */
+ virtual bool hasNull() const = 0;
+
+ /**
* print out statistics of column if any
*/
virtual std::string toString() const = 0;
http://git-wip-us.apache.org/repos/asf/orc/blob/68994174/c++/include/orc/Vector.hh
----------------------------------------------------------------------
diff --git a/c++/include/orc/Vector.hh b/c++/include/orc/Vector.hh
index 8f6a0da..f3f1343 100644
--- a/c++/include/orc/Vector.hh
+++ b/c++/include/orc/Vector.hh
@@ -187,6 +187,7 @@ namespace orc {
struct Decimal {
Decimal(const Int128& value, int32_t scale);
explicit Decimal(const std::string& value);
+ Decimal();
std::string toString() const;
Int128 value;
http://git-wip-us.apache.org/repos/asf/orc/blob/68994174/c++/src/Statistics.cc
----------------------------------------------------------------------
diff --git a/c++/src/Statistics.cc b/c++/src/Statistics.cc
index 9bbc3f1..f5a3e7b 100644
--- a/c++/src/Statistics.cc
+++ b/c++/src/Statistics.cc
@@ -169,187 +169,175 @@ namespace orc {
ColumnStatisticsImpl::ColumnStatisticsImpl
(const proto::ColumnStatistics& pb) {
- valueCount = pb.numberofvalues();
+ _stats.setNumberOfValues(pb.numberofvalues());
+ _stats.setHasNull(pb.hasnull());
}
BinaryColumnStatisticsImpl::BinaryColumnStatisticsImpl
(const proto::ColumnStatistics& pb, const StatContext& statContext){
- valueCount = pb.numberofvalues();
- if (!pb.has_binarystatistics() || !statContext.correctStats) {
- _hasTotalLength = false;
-
- totalLength = 0;
- }else{
- _hasTotalLength = pb.binarystatistics().has_sum();
- totalLength = static_cast<uint64_t>(pb.binarystatistics().sum());
+ _stats.setNumberOfValues(pb.numberofvalues());
+ _stats.setHasNull(pb.hasnull());
+ if (pb.has_binarystatistics() && statContext.correctStats) {
+ _stats.setHasTotalLength(pb.binarystatistics().has_sum());
+ _stats.setTotalLength(
+ static_cast<uint64_t>(pb.binarystatistics().sum()));
}
}
BooleanColumnStatisticsImpl::BooleanColumnStatisticsImpl
(const proto::ColumnStatistics& pb, const StatContext& statContext){
- valueCount = pb.numberofvalues();
- if (!pb.has_bucketstatistics() || !statContext.correctStats) {
- _hasCount = false;
- trueCount = 0;
- }else{
- _hasCount = true;
- trueCount = pb.bucketstatistics().count(0);
+ _stats.setNumberOfValues(pb.numberofvalues());
+ _stats.setHasNull(pb.hasnull());
+ if (pb.has_bucketstatistics() && statContext.correctStats) {
+ _stats.setHasSum(true);
+ _stats.setSum(pb.bucketstatistics().count(0));
}
}
DateColumnStatisticsImpl::DateColumnStatisticsImpl
(const proto::ColumnStatistics& pb, const StatContext& statContext){
- valueCount = pb.numberofvalues();
+ _stats.setNumberOfValues(pb.numberofvalues());
+ _stats.setHasNull(pb.hasnull());
if (!pb.has_datestatistics() || !statContext.correctStats) {
- _hasMinimum = false;
- _hasMaximum = false;
-
- minimum = 0;
- maximum = 0;
+ // hasMinimum_ is false by default;
+ // hasMaximum_ is false by default;
+ _stats.setMinimum(0);
+ _stats.setMaximum(0);
} else {
- _hasMinimum = pb.datestatistics().has_minimum();
- _hasMaximum = pb.datestatistics().has_maximum();
- minimum = pb.datestatistics().minimum();
- maximum = pb.datestatistics().maximum();
+ _stats.setHasMinimum(pb.datestatistics().has_minimum());
+ _stats.setHasMaximum(pb.datestatistics().has_maximum());
+ _stats.setMinimum(pb.datestatistics().minimum());
+ _stats.setMaximum(pb.datestatistics().maximum());
}
}
DecimalColumnStatisticsImpl::DecimalColumnStatisticsImpl
(const proto::ColumnStatistics& pb, const StatContext& statContext){
- valueCount = pb.numberofvalues();
- if (!pb.has_decimalstatistics() || !statContext.correctStats) {
- _hasMinimum = false;
- _hasMaximum = false;
- _hasSum = false;
- }else{
+ _stats.setNumberOfValues(pb.numberofvalues());
+ _stats.setHasNull(pb.hasnull());
+ if (pb.has_decimalstatistics() && statContext.correctStats) {
const proto::DecimalStatistics& stats = pb.decimalstatistics();
- _hasMinimum = stats.has_minimum();
- _hasMaximum = stats.has_maximum();
- _hasSum = stats.has_sum();
+ _stats.setHasMinimum(stats.has_minimum());
+ _stats.setHasMaximum(stats.has_maximum());
+ _stats.setHasSum(stats.has_sum());
- minimum = stats.minimum();
- maximum = stats.maximum();
- sum = stats.sum();
+ _stats.setMinimum(Decimal(stats.minimum()));
+ _stats.setMaximum(Decimal(stats.maximum()));
+ _stats.setSum(Decimal(stats.sum()));
}
}
DoubleColumnStatisticsImpl::DoubleColumnStatisticsImpl
(const proto::ColumnStatistics& pb){
- valueCount = pb.numberofvalues();
+ _stats.setNumberOfValues(pb.numberofvalues());
+ _stats.setHasNull(pb.hasnull());
if (!pb.has_doublestatistics()) {
- _hasMinimum = false;
- _hasMaximum = false;
- _hasSum = false;
-
- minimum = 0;
- maximum = 0;
- sum = 0;
+ _stats.setMinimum(0);
+ _stats.setMaximum(0);
+ _stats.setSum(0);
}else{
const proto::DoubleStatistics& stats = pb.doublestatistics();
- _hasMinimum = stats.has_minimum();
- _hasMaximum = stats.has_maximum();
- _hasSum = stats.has_sum();
+ _stats.setHasMinimum(stats.has_minimum());
+ _stats.setHasMaximum(stats.has_maximum());
+ _stats.setHasSum(stats.has_sum());
- minimum = stats.minimum();
- maximum = stats.maximum();
- sum = stats.sum();
+ _stats.setMinimum(stats.minimum());
+ _stats.setMaximum(stats.maximum());
+ _stats.setSum(stats.sum());
}
}
IntegerColumnStatisticsImpl::IntegerColumnStatisticsImpl
(const proto::ColumnStatistics& pb){
- valueCount = pb.numberofvalues();
+ _stats.setNumberOfValues(pb.numberofvalues());
+ _stats.setHasNull(pb.hasnull());
if (!pb.has_intstatistics()) {
- _hasMinimum = false;
- _hasMaximum = false;
- _hasSum = false;
-
- minimum = 0;
- maximum = 0;
- sum = 0;
+ _stats.setMinimum(0);
+ _stats.setMaximum(0);
+ _stats.setSum(0);
}else{
const proto::IntegerStatistics& stats = pb.intstatistics();
- _hasMinimum = stats.has_minimum();
- _hasMaximum = stats.has_maximum();
- _hasSum = stats.has_sum();
+ _stats.setHasMinimum(stats.has_minimum());
+ _stats.setHasMaximum(stats.has_maximum());
+ _stats.setHasSum(stats.has_sum());
- minimum = stats.minimum();
- maximum = stats.maximum();
- sum = stats.sum();
+ _stats.setMinimum(stats.minimum());
+ _stats.setMaximum(stats.maximum());
+ _stats.setSum(stats.sum());
}
}
StringColumnStatisticsImpl::StringColumnStatisticsImpl
(const proto::ColumnStatistics& pb, const StatContext& statContext){
- valueCount = pb.numberofvalues();
+ _stats.setNumberOfValues(pb.numberofvalues());
+ _stats.setHasNull(pb.hasnull());
if (!pb.has_stringstatistics() || !statContext.correctStats) {
- _hasMinimum = false;
- _hasMaximum = false;
- _hasTotalLength = false;
-
- totalLength = 0;
+ _stats.setTotalLength(0);
}else{
const proto::StringStatistics& stats = pb.stringstatistics();
- _hasMinimum = stats.has_minimum();
- _hasMaximum = stats.has_maximum();
- _hasTotalLength = stats.has_sum();
+ _stats.setHasMinimum(stats.has_minimum());
+ _stats.setHasMaximum(stats.has_maximum());
+ _stats.setHasTotalLength(stats.has_sum());
- minimum = stats.minimum();
- maximum = stats.maximum();
- totalLength = static_cast<uint64_t>(stats.sum());
+ _stats.setMinimum(stats.minimum());
+ _stats.setMaximum(stats.maximum());
+ _stats.setTotalLength(static_cast<uint64_t>(stats.sum()));
}
}
TimestampColumnStatisticsImpl::TimestampColumnStatisticsImpl
(const proto::ColumnStatistics& pb, const StatContext& statContext) {
- valueCount = pb.numberofvalues();
+ _stats.setNumberOfValues(pb.numberofvalues());
+ _stats.setHasNull(pb.hasnull());
if (!pb.has_timestampstatistics() || !statContext.correctStats) {
- _hasMinimum = false;
- _hasMaximum = false;
- _hasLowerBound = false;
- _hasUpperBound = false;
- minimum = 0;
- maximum = 0;
- lowerBound = 0;
- upperBound = 0;
+ _stats.setMinimum(0);
+ _stats.setMaximum(0);
+ _lowerBound = 0;
+ _upperBound = 0;
}else{
const proto::TimestampStatistics& stats = pb.timestampstatistics();
- _hasMinimum = stats.has_minimumutc() || (stats.has_minimum() && (statContext.writerTimezone != NULL));
- _hasMaximum = stats.has_maximumutc() || (stats.has_maximum() && (statContext.writerTimezone != NULL));
+ _stats.setHasMinimum(
+ stats.has_minimumutc() || (stats.has_minimum() && (statContext.writerTimezone != NULL)));
+ _stats.setHasMaximum(
+ stats.has_maximumutc() || (stats.has_maximum() && (statContext.writerTimezone != NULL)));
_hasLowerBound = stats.has_minimumutc() || stats.has_minimum();
_hasUpperBound = stats.has_maximumutc() || stats.has_maximum();
// Timestamp stats are stored in milliseconds
if (stats.has_minimumutc()) {
- minimum = stats.minimumutc();
- lowerBound = minimum;
+ int64_t minimum = stats.minimumutc();
+ _stats.setMinimum(minimum);
+ _lowerBound = minimum;
} else if (statContext.writerTimezone) {
int64_t writerTimeSec = stats.minimum() / 1000;
// multiply the offset by 1000 to convert to millisecond
- minimum = stats.minimum() + (statContext.writerTimezone->getVariant(writerTimeSec).gmtOffset) * 1000;
- lowerBound = minimum;
+ int64_t minimum = stats.minimum() + (statContext.writerTimezone->getVariant(writerTimeSec).gmtOffset) * 1000;
+ _stats.setMinimum(minimum);
+ _lowerBound = minimum;
} else {
- minimum = 0;
+ _stats.setMinimum(0);
// subtract 1 day 1 hour (25 hours) in milliseconds to handle unknown TZ and daylight savings
- lowerBound = stats.minimum() - (25 * SECONDS_PER_HOUR * 1000);
+ _lowerBound = stats.minimum() - (25 * SECONDS_PER_HOUR * 1000);
}
// Timestamp stats are stored in milliseconds
if (stats.has_maximumutc()) {
- maximum = stats.maximumutc();
- upperBound = maximum;
+ int64_t maximum = stats.maximumutc();
+ _stats.setMaximum(maximum);
+ _upperBound = maximum;
} else if (statContext.writerTimezone) {
int64_t writerTimeSec = stats.maximum() / 1000;
// multiply the offset by 1000 to convert to millisecond
- maximum = stats.maximum() + (statContext.writerTimezone->getVariant(writerTimeSec).gmtOffset) * 1000;
- upperBound = maximum;
+ int64_t maximum = stats.maximum() + (statContext.writerTimezone->getVariant(writerTimeSec).gmtOffset) * 1000;
+ _stats.setMaximum(maximum);
+ _upperBound = maximum;
} else {
- maximum = 0;
+ _stats.setMaximum(0);
// add 1 day 1 hour (25 hours) in milliseconds to handle unknown TZ and daylight savings
- upperBound = stats.maximum() + (25 * SECONDS_PER_HOUR * 1000);
+ _upperBound = stats.maximum() + (25 * SECONDS_PER_HOUR * 1000);
}
// Add 1 millisecond to account for microsecond precision of values
- upperBound += 1;
+ _upperBound += 1;
}
}
http://git-wip-us.apache.org/repos/asf/orc/blob/68994174/c++/src/Statistics.hh
----------------------------------------------------------------------
diff --git a/c++/src/Statistics.hh b/c++/src/Statistics.hh
index 3a5996b..54699ed 100644
--- a/c++/src/Statistics.hh
+++ b/c++/src/Statistics.hh
@@ -41,49 +41,141 @@ namespace orc {
};
/**
+ * Internal Statistics Implementation
+ */
+
+ template <typename T>
+ class InternalStatisticsImpl {
+ private:
+ bool _hasNull;
+ bool _hasMinimum;
+ bool _hasMaximum;
+ bool _hasSum;
+ bool _hasTotalLength;
+ uint64_t _totalLength;
+ uint64_t _valueCount;
+ T _minimum;
+ T _maximum;
+ T _sum;
+ public:
+ InternalStatisticsImpl() {
+ _hasNull = false;
+ _hasMinimum = false;
+ _hasMaximum = false;
+ _hasSum = false;
+ _hasTotalLength = false;
+ _totalLength = 0;
+ _valueCount = 0;
+ }
+
+ ~InternalStatisticsImpl() {}
+
+ // GET / SET totalLength_
+ bool hasTotalLength() const { return _hasTotalLength; }
+
+ void setHasTotalLength(bool hasTotalLength) { _hasTotalLength = hasTotalLength; }
+
+ uint64_t getTotalLength() const { return _totalLength; }
+
+ void setTotalLength(uint64_t totalLength) { _totalLength = totalLength; }
+
+ // GET / SET sum_
+ bool hasSum() const { return _hasSum; }
+
+ void setHasSum(bool hasSum) { _hasSum = hasSum; }
+
+ T getSum() const { return _sum; }
+
+ void setSum(T sum) { _sum = sum; }
+
+ // GET / SET maximum_
+ bool hasMaximum() const { return _hasMaximum; }
+
+ T getMaximum() const { return _maximum; }
+
+ void setHasMaximum(bool hasMax) { _hasMaximum = hasMax; }
+
+ void setMaximum(T max) { _maximum = max; }
+
+ // GET / SET minimum_
+ bool hasMinimum() const { return _hasMinimum; }
+
+ void setHasMinimum(bool hasMin) { _hasMinimum = hasMin; }
+
+ T getMinimum() const { return _minimum; }
+
+ void setMinimum(T min) { _minimum = min; }
+
+ // GET / SET valueCount_
+ uint64_t getNumberOfValues() const { return _valueCount; }
+
+ void setNumberOfValues(uint64_t numValues) { _valueCount = numValues; }
+
+ // GET / SET hasNullValue_
+ bool hasNull() const { return _hasNull; }
+
+ void setHasNull(bool hasNull) { _hasNull = hasNull; }
+ };
+
+ typedef InternalStatisticsImpl<char> InternalCharStatistics;
+ typedef InternalStatisticsImpl<uint64_t> InternalBooleanStatistics;
+ typedef InternalStatisticsImpl<int64_t> InternalIntegerStatistics;
+ typedef InternalStatisticsImpl<int32_t> InternalDateStatistics;
+ typedef InternalStatisticsImpl<double> InternalDoubleStatistics;
+ typedef InternalStatisticsImpl<Decimal> InternalDecimalStatistics;
+ typedef InternalStatisticsImpl<std::string> InternalStringStatistics;
+
+/**
* ColumnStatistics Implementation
*/
class ColumnStatisticsImpl: public ColumnStatistics {
private:
- uint64_t valueCount;
-
+ InternalCharStatistics _stats;
public:
ColumnStatisticsImpl(const proto::ColumnStatistics& stats);
virtual ~ColumnStatisticsImpl();
uint64_t getNumberOfValues() const override {
- return valueCount;
+ return _stats.getNumberOfValues();
+ }
+
+ bool hasNull() const override {
+ return _stats.hasNull();
}
std::string toString() const override {
std::ostringstream buffer;
- buffer << "Column has " << valueCount << " values" << std::endl;
+ buffer << "Column has " << getNumberOfValues() << " values"
+ << " and has null value: " << (hasNull() ? "yes" : "no")
+ << std::endl;
return buffer.str();
}
};
class BinaryColumnStatisticsImpl: public BinaryColumnStatistics {
private:
- bool _hasTotalLength;
- uint64_t valueCount;
- uint64_t totalLength;
-
+ InternalCharStatistics _stats;
public:
BinaryColumnStatisticsImpl(const proto::ColumnStatistics& stats,
const StatContext& statContext);
virtual ~BinaryColumnStatisticsImpl();
- bool hasTotalLength() const override {
- return _hasTotalLength;
- }
uint64_t getNumberOfValues() const override {
- return valueCount;
+ return _stats.getNumberOfValues();
+ }
+
+ bool hasNull() const override {
+ return _stats.hasNull();
+ }
+
+ bool hasTotalLength() const override {
+ return _stats.hasTotalLength();
}
uint64_t getTotalLength() const override {
- if(_hasTotalLength){
- return totalLength;
+ if(hasTotalLength()){
+ return _stats.getTotalLength();
}else{
throw ParseError("Total length is not defined.");
}
@@ -92,9 +184,10 @@ namespace orc {
std::string toString() const override {
std::ostringstream buffer;
buffer << "Data type: Binary" << std::endl
- << "Values: " << valueCount << std::endl;
- if(_hasTotalLength){
- buffer << "Total length: " << totalLength << std::endl;
+ << "Values: " << getNumberOfValues() << std::endl
+ << "Has null: " << (hasNull() ? "yes" : "no") << std::endl;
+ if(hasTotalLength()){
+ buffer << "Total length: " << getTotalLength() << std::endl;
}else{
buffer << "Total length: not defined" << std::endl;
}
@@ -104,33 +197,35 @@ namespace orc {
class BooleanColumnStatisticsImpl: public BooleanColumnStatistics {
private:
- bool _hasCount;
- uint64_t valueCount;
- uint64_t trueCount;
+ InternalBooleanStatistics _stats;
public:
BooleanColumnStatisticsImpl(const proto::ColumnStatistics& stats, const StatContext& statContext);
virtual ~BooleanColumnStatisticsImpl();
bool hasCount() const override {
- return _hasCount;
+ return _stats.hasSum();
}
uint64_t getNumberOfValues() const override {
- return valueCount;
+ return _stats.getNumberOfValues();
+ }
+
+ bool hasNull() const override {
+ return _stats.hasNull();
}
uint64_t getFalseCount() const override {
- if(_hasCount){
- return valueCount - trueCount;
+ if(hasCount()){
+ return getNumberOfValues() - _stats.getSum();
}else{
throw ParseError("False count is not defined.");
}
}
uint64_t getTrueCount() const override {
- if(_hasCount){
- return trueCount;
+ if(hasCount()){
+ return _stats.getSum();
}else{
throw ParseError("True count is not defined.");
}
@@ -139,10 +234,11 @@ namespace orc {
std::string toString() const override {
std::ostringstream buffer;
buffer << "Data type: Boolean" << std::endl
- << "Values: " << valueCount << std::endl;
- if(_hasCount){
- buffer << "(true: " << trueCount << "; false: "
- << valueCount - trueCount << ")" << std::endl;
+ << "Values: " << getNumberOfValues() << std::endl
+ << "Has null: " << (hasNull() ? "yes" : "no") << std::endl;
+ if(hasCount()){
+ buffer << "(true: " << getTrueCount() << "; false: "
+ << getFalseCount() << ")" << std::endl;
} else {
buffer << "(true: not defined; false: not defined)" << std::endl;
buffer << "True and false count are not defined" << std::endl;
@@ -153,39 +249,38 @@ namespace orc {
class DateColumnStatisticsImpl: public DateColumnStatistics {
private:
- bool _hasMinimum;
- bool _hasMaximum;
- uint64_t valueCount;
- int32_t minimum;
- int32_t maximum;
-
+ InternalDateStatistics _stats;
public:
DateColumnStatisticsImpl(const proto::ColumnStatistics& stats, const StatContext& statContext);
virtual ~DateColumnStatisticsImpl();
bool hasMinimum() const override {
- return _hasMinimum;
+ return _stats.hasMinimum();
}
bool hasMaximum() const override {
- return _hasMaximum;
+ return _stats.hasMaximum();
}
uint64_t getNumberOfValues() const override {
- return valueCount;
+ return _stats.getNumberOfValues();
+ }
+
+ bool hasNull() const override {
+ return _stats.hasNull();
}
int32_t getMinimum() const override {
- if(_hasMinimum){
- return minimum;
+ if(hasMinimum()){
+ return _stats.getMinimum();
}else{
throw ParseError("Minimum is not defined.");
}
}
int32_t getMaximum() const override {
- if(_hasMaximum){
- return maximum;
+ if(hasMaximum()){
+ return _stats.getMaximum();
}else{
throw ParseError("Maximum is not defined.");
}
@@ -194,15 +289,16 @@ namespace orc {
std::string toString() const override {
std::ostringstream buffer;
buffer << "Data type: Date" << std::endl
- << "Values: " << valueCount << std::endl;
- if(_hasMinimum){
- buffer << "Minimum: " << minimum << std::endl;
+ << "Values: " << getNumberOfValues() << std::endl
+ << "Has null: " << (hasNull() ? "yes" : "no") << std::endl;
+ if(hasMinimum()){
+ buffer << "Minimum: " << getMinimum() << std::endl;
}else{
buffer << "Minimum: not defined" << std::endl;
}
- if(_hasMaximum){
- buffer << "Maximum: " << maximum << std::endl;
+ if(hasMaximum()){
+ buffer << "Maximum: " << getMaximum() << std::endl;
}else{
buffer << "Maximum: not defined" << std::endl;
}
@@ -212,53 +308,51 @@ namespace orc {
class DecimalColumnStatisticsImpl: public DecimalColumnStatistics {
private:
- bool _hasMinimum;
- bool _hasMaximum;
- bool _hasSum;
- uint64_t valueCount;
- std::string minimum;
- std::string maximum;
- std::string sum;
+ InternalDecimalStatistics _stats;
public:
DecimalColumnStatisticsImpl(const proto::ColumnStatistics& stats, const StatContext& statContext);
virtual ~DecimalColumnStatisticsImpl();
bool hasMinimum() const override {
- return _hasMinimum;
+ return _stats.hasMinimum();
}
bool hasMaximum() const override {
- return _hasMaximum;
+ return _stats.hasMaximum();
}
bool hasSum() const override {
- return _hasSum;
+ return _stats.hasSum();
}
uint64_t getNumberOfValues() const override {
- return valueCount;
+ return _stats.getNumberOfValues();
+ }
+
+ bool hasNull() const override {
+ return _stats.hasNull();
}
Decimal getMinimum() const override {
- if(_hasMinimum){
- return Decimal(minimum);
+ if(hasMinimum()){
+ return _stats.getMinimum();
}else{
throw ParseError("Minimum is not defined.");
}
}
Decimal getMaximum() const override {
- if(_hasMaximum){
- return Decimal(maximum);
+ if(hasMaximum()){
+ return _stats.getMaximum();
}else{
throw ParseError("Maximum is not defined.");
}
}
Decimal getSum() const override {
- if(_hasSum){
- return Decimal(sum);
+ if(hasSum()){
+ return _stats.getSum();
}else{
throw ParseError("Sum is not defined.");
}
@@ -267,21 +361,22 @@ namespace orc {
std::string toString() const override {
std::ostringstream buffer;
buffer << "Data type: Decimal" << std::endl
- << "Values: " << valueCount << std::endl;
- if(_hasMinimum){
- buffer << "Minimum: " << minimum << std::endl;
+ << "Values: " << getNumberOfValues() << std::endl
+ << "Has null: " << (hasNull() ? "yes" : "no") << std::endl;
+ if(hasMinimum()){
+ buffer << "Minimum: " << getMinimum().toString() << std::endl;
}else{
buffer << "Minimum: not defined" << std::endl;
}
- if(_hasMaximum){
- buffer << "Maximum: " << maximum << std::endl;
+ if(hasMaximum()){
+ buffer << "Maximum: " << getMaximum().toString() << std::endl;
}else{
buffer << "Maximum: not defined" << std::endl;
}
- if(_hasSum){
- buffer << "Sum: " << sum << std::endl;
+ if(hasSum()){
+ buffer << "Sum: " << getSum().toString() << std::endl;
}else{
buffer << "Sum: not defined" << std::endl;
}
@@ -292,53 +387,50 @@ namespace orc {
class DoubleColumnStatisticsImpl: public DoubleColumnStatistics {
private:
- bool _hasMinimum;
- bool _hasMaximum;
- bool _hasSum;
- uint64_t valueCount;
- double minimum;
- double maximum;
- double sum;
-
+ InternalDoubleStatistics _stats;
public:
DoubleColumnStatisticsImpl(const proto::ColumnStatistics& stats);
virtual ~DoubleColumnStatisticsImpl();
bool hasMinimum() const override {
- return _hasMinimum;
+ return _stats.hasMinimum();
}
bool hasMaximum() const override {
- return _hasMaximum;
+ return _stats.hasMaximum();
}
bool hasSum() const override {
- return _hasSum;
+ return _stats.hasSum();
}
uint64_t getNumberOfValues() const override {
- return valueCount;
+ return _stats.getNumberOfValues();
+ }
+
+ bool hasNull() const override {
+ return _stats.hasNull();
}
double getMinimum() const override {
- if(_hasMinimum){
- return minimum;
+ if(hasMinimum()){
+ return _stats.getMinimum();
}else{
throw ParseError("Minimum is not defined.");
}
}
double getMaximum() const override {
- if(_hasMaximum){
- return maximum;
+ if(hasMaximum()){
+ return _stats.getMaximum();
}else{
throw ParseError("Maximum is not defined.");
}
}
double getSum() const override {
- if(_hasSum){
- return sum;
+ if(hasSum()){
+ // Return the stored sum itself; hasSum() is only the presence flag and
+ // would be implicitly converted to 1.0 here.
+ return _stats.getSum();
}else{
throw ParseError("Sum is not defined.");
}
@@ -347,21 +439,22 @@ namespace orc {
std::string toString() const override {
std::ostringstream buffer;
buffer << "Data type: Double" << std::endl
- << "Values: " << valueCount << std::endl;
- if(_hasMinimum){
- buffer << "Minimum: " << minimum << std::endl;
+ << "Values: " << getNumberOfValues() << std::endl
+ << "Has null: " << (hasNull() ? "yes" : "no") << std::endl;
+ if(hasMinimum()){
+ buffer << "Minimum: " << getMinimum() << std::endl;
}else{
buffer << "Minimum: not defined" << std::endl;
}
- if(_hasMaximum){
- buffer << "Maximum: " << maximum << std::endl;
+ if(hasMaximum()){
+ buffer << "Maximum: " << getMaximum() << std::endl;
}else{
buffer << "Maximum: not defined" << std::endl;
}
- if(_hasSum){
- buffer << "Sum: " << sum << std::endl;
+ if(hasSum()){
+ buffer << "Sum: " << getSum() << std::endl;
}else{
buffer << "Sum: not defined" << std::endl;
}
@@ -371,53 +464,50 @@ namespace orc {
class IntegerColumnStatisticsImpl: public IntegerColumnStatistics {
private:
- bool _hasMinimum;
- bool _hasMaximum;
- bool _hasSum;
- uint64_t valueCount;
- int64_t minimum;
- int64_t maximum;
- int64_t sum;
-
+ InternalIntegerStatistics _stats;
public:
IntegerColumnStatisticsImpl(const proto::ColumnStatistics& stats);
virtual ~IntegerColumnStatisticsImpl();
bool hasMinimum() const override {
- return _hasMinimum;
+ return _stats.hasMinimum();
}
bool hasMaximum() const override {
- return _hasMaximum;
+ return _stats.hasMaximum();
}
bool hasSum() const override {
- return _hasSum;
+ return _stats.hasSum();
}
uint64_t getNumberOfValues() const override {
- return valueCount;
+ return _stats.getNumberOfValues();
+ }
+
+ bool hasNull() const override {
+ return _stats.hasNull();
}
int64_t getMinimum() const override {
- if(_hasMinimum){
- return minimum;
+ if(hasMinimum()){
+ return _stats.getMinimum();
}else{
throw ParseError("Minimum is not defined.");
}
}
int64_t getMaximum() const override {
- if(_hasMaximum){
- return maximum;
+ if(hasMaximum()){
+ return _stats.getMaximum();
}else{
throw ParseError("Maximum is not defined.");
}
}
int64_t getSum() const override {
- if(_hasSum){
- return sum;
+ if(hasSum()){
+ return _stats.getSum();
}else{
throw ParseError("Sum is not defined.");
}
@@ -426,21 +516,22 @@ namespace orc {
std::string toString() const override {
std::ostringstream buffer;
buffer << "Data type: Integer" << std::endl
- << "Values: " << valueCount << std::endl;
- if(_hasMinimum){
- buffer << "Minimum: " << minimum << std::endl;
+ << "Values: " << getNumberOfValues() << std::endl
+ << "Has null: " << (hasNull() ? "yes" : "no") << std::endl;
+ if(hasMinimum()){
+ buffer << "Minimum: " << getMinimum() << std::endl;
}else{
buffer << "Minimum: not defined" << std::endl;
}
- if(_hasMaximum){
- buffer << "Maximum: " << maximum << std::endl;
+ if(hasMaximum()){
+ buffer << "Maximum: " << getMaximum() << std::endl;
}else{
buffer << "Maximum: not defined" << std::endl;
}
- if(_hasSum){
- buffer << "Sum: " << sum << std::endl;
+ if(hasSum()){
+ buffer << "Sum: " << getSum() << std::endl;
}else{
buffer << "Sum: not defined" << std::endl;
}
@@ -450,53 +541,51 @@ namespace orc {
class StringColumnStatisticsImpl: public StringColumnStatistics {
private:
- bool _hasMinimum;
- bool _hasMaximum;
- bool _hasTotalLength;
- uint64_t valueCount;
- std::string minimum;
- std::string maximum;
- uint64_t totalLength;
+ InternalStringStatistics _stats;
public:
StringColumnStatisticsImpl(const proto::ColumnStatistics& stats, const StatContext& statContext);
virtual ~StringColumnStatisticsImpl();
bool hasMinimum() const override {
- return _hasMinimum;
+ return _stats.hasMinimum();
}
bool hasMaximum() const override {
- return _hasMaximum;
+ return _stats.hasMaximum();
}
bool hasTotalLength() const override {
- return _hasTotalLength;
+ return _stats.hasTotalLength();
}
uint64_t getNumberOfValues() const override {
- return valueCount;
+ return _stats.getNumberOfValues();
+ }
+
+ bool hasNull() const override {
+ return _stats.hasNull();
}
std::string getMinimum() const override {
- if(_hasMinimum){
- return minimum;
+ if(hasMinimum()){
+ return _stats.getMinimum();
}else{
throw ParseError("Minimum is not defined.");
}
}
std::string getMaximum() const override {
- if(_hasMaximum){
- return maximum;
+ if(hasMaximum()){
+ return _stats.getMaximum();
}else{
throw ParseError("Maximum is not defined.");
}
}
uint64_t getTotalLength() const override {
- if(_hasTotalLength){
- return totalLength;
+ if(hasTotalLength()){
+ return _stats.getTotalLength();
}else{
throw ParseError("Total length is not defined.");
}
@@ -505,21 +594,22 @@ namespace orc {
std::string toString() const override {
std::ostringstream buffer;
buffer << "Data type: String" << std::endl
- << "Values: " << valueCount << std::endl;
- if(_hasMinimum){
- buffer << "Minimum: " << minimum << std::endl;
+ << "Values: " << getNumberOfValues() << std::endl
+ << "Has null: " << (hasNull() ? "yes" : "no") << std::endl;
+ if(hasMinimum()){
+ buffer << "Minimum: " << getMinimum() << std::endl;
}else{
buffer << "Minimum is not defined" << std::endl;
}
- if(_hasMaximum){
- buffer << "Maximum: " << maximum << std::endl;
+ if(hasMaximum()){
+ buffer << "Maximum: " << getMaximum() << std::endl;
}else{
buffer << "Maximum is not defined" << std::endl;
}
- if(_hasTotalLength){
- buffer << "Total length: " << totalLength << std::endl;
+ if(hasTotalLength()){
+ buffer << "Total length: " << getTotalLength() << std::endl;
}else{
buffer << "Total length is not defined" << std::endl;
}
@@ -529,15 +619,11 @@ namespace orc {
class TimestampColumnStatisticsImpl: public TimestampColumnStatistics {
private:
- bool _hasMinimum;
- bool _hasMaximum;
- uint64_t valueCount;
- int64_t minimum;
- int64_t maximum;
+ InternalIntegerStatistics _stats;
bool _hasLowerBound;
bool _hasUpperBound;
- int64_t lowerBound;
- int64_t upperBound;
+ int64_t _lowerBound;
+ int64_t _upperBound;
public:
TimestampColumnStatisticsImpl(const proto::ColumnStatistics& stats,
@@ -545,28 +631,32 @@ namespace orc {
virtual ~TimestampColumnStatisticsImpl();
bool hasMinimum() const override {
- return _hasMinimum;
+ return _stats.hasMinimum();
}
bool hasMaximum() const override {
- return _hasMaximum;
+ return _stats.hasMaximum();
}
uint64_t getNumberOfValues() const override {
- return valueCount;
+ return _stats.getNumberOfValues();
+ }
+
+ bool hasNull() const override {
+ return _stats.hasNull();
}
int64_t getMinimum() const override {
- if(_hasMinimum){
- return minimum;
+ if(hasMinimum()){
+ return _stats.getMinimum();
}else{
throw ParseError("Minimum is not defined.");
}
}
int64_t getMaximum() const override {
- if(_hasMaximum){
- return maximum;
+ if(hasMaximum()){
+ return _stats.getMaximum();
}else{
throw ParseError("Maximum is not defined.");
}
@@ -579,39 +669,40 @@ namespace orc {
time_t secs = 0;
buffer << "Data type: Timestamp" << std::endl
- << "Values: " << valueCount << std::endl;
- if(_hasMinimum){
- secs = static_cast<time_t>(minimum/1000);
+ << "Values: " << getNumberOfValues() << std::endl
+ << "Has null: " << (hasNull() ? "yes" : "no") << std::endl;
+ if(hasMinimum()){
+ secs = static_cast<time_t>(getMinimum() / 1000);
gmtime_r(&secs, &tmValue);
strftime(timeBuffer, sizeof(timeBuffer), "%Y-%m-%d %H:%M:%S", &tmValue);
- buffer << "Minimum: " << timeBuffer << "." << (minimum % 1000) << std::endl;
+ buffer << "Minimum: " << timeBuffer << "." << (getMinimum() % 1000) << std::endl;
}else{
buffer << "Minimum is not defined" << std::endl;
}
- if(_hasLowerBound){
- secs = static_cast<time_t>(lowerBound/1000);
+ if(hasLowerBound()){
+ secs = static_cast<time_t>(getLowerBound() / 1000);
gmtime_r(&secs, &tmValue);
strftime(timeBuffer, sizeof(timeBuffer), "%Y-%m-%d %H:%M:%S", &tmValue);
- buffer << "LowerBound: " << timeBuffer << "." << (lowerBound % 1000) << std::endl;
+ buffer << "LowerBound: " << timeBuffer << "." << (getLowerBound() % 1000) << std::endl;
}else{
buffer << "LowerBound is not defined" << std::endl;
}
- if(_hasMaximum){
- secs = static_cast<time_t>(maximum/1000);
+ if(hasMaximum()){
+ secs = static_cast<time_t>(getMaximum()/1000);
gmtime_r(&secs, &tmValue);
strftime(timeBuffer, sizeof(timeBuffer), "%Y-%m-%d %H:%M:%S", &tmValue);
- buffer << "Maximum: " << timeBuffer << "." << (maximum % 1000) << std::endl;
+ buffer << "Maximum: " << timeBuffer << "." << (getMaximum() % 1000) << std::endl;
}else{
buffer << "Maximum is not defined" << std::endl;
}
- if(_hasUpperBound){
- secs = static_cast<time_t>(upperBound/1000);
+ if(hasUpperBound()){
+ secs = static_cast<time_t>(getUpperBound() / 1000);
gmtime_r(&secs, &tmValue);
strftime(timeBuffer, sizeof(timeBuffer), "%Y-%m-%d %H:%M:%S", &tmValue);
- buffer << "UpperBound: " << timeBuffer << "." << (upperBound % 1000) << std::endl;
+ buffer << "UpperBound: " << timeBuffer << "." << (getUpperBound() % 1000) << std::endl;
}else{
buffer << "UpperBound is not defined" << std::endl;
}
@@ -628,16 +719,16 @@ namespace orc {
}
int64_t getLowerBound() const override {
- if(_hasLowerBound){
- return lowerBound;
+ if(hasLowerBound()){
+ return _lowerBound;
}else{
throw ParseError("LowerBound is not defined.");
}
}
int64_t getUpperBound() const override {
- if(_hasUpperBound){
- return upperBound;
+ if(hasUpperBound()){
+ return _upperBound;
}else{
throw ParseError("UpperBound is not defined.");
}
http://git-wip-us.apache.org/repos/asf/orc/blob/68994174/c++/src/Vector.cc
----------------------------------------------------------------------
diff --git a/c++/src/Vector.cc b/c++/src/Vector.cc
index d5df7f7..2c7e2d3 100644
--- a/c++/src/Vector.cc
+++ b/c++/src/Vector.cc
@@ -396,6 +396,10 @@ namespace orc {
}
}
+ Decimal::Decimal() : value(0), scale(0) {
+ // PASS
+ }
+
std::string Decimal::toString() const {
return value.toDecimalString(scale);
}
http://git-wip-us.apache.org/repos/asf/orc/blob/68994174/c++/test/TestStripeIndexStatistics.cc
----------------------------------------------------------------------
diff --git a/c++/test/TestStripeIndexStatistics.cc b/c++/test/TestStripeIndexStatistics.cc
index 342bafb..e6607d4 100644
--- a/c++/test/TestStripeIndexStatistics.cc
+++ b/c++/test/TestStripeIndexStatistics.cc
@@ -45,19 +45,19 @@ namespace orc {
const orc::IntegerColumnStatistics* intColStats;
intColStats = reinterpret_cast<const orc::IntegerColumnStatistics*>(stripeStats->getRowIndexStatistics(1, 0));
- EXPECT_EQ("Data type: Integer\nValues: 2000\nMinimum: 1\nMaximum: 2000\nSum: 2001000\n", intColStats->toString());
+ EXPECT_EQ("Data type: Integer\nValues: 2000\nHas null: no\nMinimum: 1\nMaximum: 2000\nSum: 2001000\n", intColStats->toString());
intColStats = reinterpret_cast<const orc::IntegerColumnStatistics*>(stripeStats->getRowIndexStatistics(1, 1));
- EXPECT_EQ("Data type: Integer\nValues: 2000\nMinimum: 2001\nMaximum: 4000\nSum: 6001000\n", intColStats->toString());
+ EXPECT_EQ("Data type: Integer\nValues: 2000\nHas null: no\nMinimum: 2001\nMaximum: 4000\nSum: 6001000\n", intColStats->toString());
intColStats = reinterpret_cast<const orc::IntegerColumnStatistics*>(stripeStats->getRowIndexStatistics(1, 2));
- EXPECT_EQ("Data type: Integer\nValues: 2000\nMinimum: 4001\nMaximum: 6000\nSum: 10001000\n", intColStats->toString());
+ EXPECT_EQ("Data type: Integer\nValues: 2000\nHas null: no\nMinimum: 4001\nMaximum: 6000\nSum: 10001000\n", intColStats->toString());
const orc::StringColumnStatistics* stringColStats;
stringColStats = reinterpret_cast<const orc::StringColumnStatistics*>(stripeStats->getRowIndexStatistics(2, 0));
- EXPECT_EQ("Data type: String\nValues: 2000\nMinimum: 1000\nMaximum: 9a\nTotal length: 7892\n", stringColStats->toString());
+ EXPECT_EQ("Data type: String\nValues: 2000\nHas null: no\nMinimum: 1000\nMaximum: 9a\nTotal length: 7892\n", stringColStats->toString());
stringColStats = reinterpret_cast<const orc::StringColumnStatistics*>(stripeStats->getRowIndexStatistics(2, 1));
- EXPECT_EQ("Data type: String\nValues: 2000\nMinimum: 2001\nMaximum: 4000\nTotal length: 8000\n", stringColStats->toString());
+ EXPECT_EQ("Data type: String\nValues: 2000\nHas null: no\nMinimum: 2001\nMaximum: 4000\nTotal length: 8000\n", stringColStats->toString());
stringColStats = reinterpret_cast<const orc::StringColumnStatistics*>(stripeStats->getRowIndexStatistics(2, 2));
- EXPECT_EQ("Data type: String\nValues: 2000\nMinimum: 4001\nMaximum: 6000\nTotal length: 8000\n", stringColStats->toString());
+ EXPECT_EQ("Data type: String\nValues: 2000\nHas null: no\nMinimum: 4001\nMaximum: 6000\nTotal length: 8000\n", stringColStats->toString());
}
} // namespace
http://git-wip-us.apache.org/repos/asf/orc/blob/68994174/c++/test/TestTimestampStatistics.cc
----------------------------------------------------------------------
diff --git a/c++/test/TestTimestampStatistics.cc b/c++/test/TestTimestampStatistics.cc
index b5ce2a7..302ef9b 100644
--- a/c++/test/TestTimestampStatistics.cc
+++ b/c++/test/TestTimestampStatistics.cc
@@ -50,11 +50,11 @@ namespace orc {
EXPECT_FALSE(footerColStats->hasMaximum());
EXPECT_TRUE(footerColStats->hasLowerBound());
EXPECT_TRUE(footerColStats->hasUpperBound());
- EXPECT_EQ("Data type: Timestamp\nValues: 12\nMinimum is not defined\nLowerBound: 1994-12-31 07:00:00.688\nMaximum is not defined\nUpperBound: 2037-01-02 09:00:00.1\n", footerColStats->toString());
+ EXPECT_EQ("Data type: Timestamp\nValues: 12\nHas null: no\nMinimum is not defined\nLowerBound: 1994-12-31 07:00:00.688\nMaximum is not defined\nUpperBound: 2037-01-02 09:00:00.1\n", footerColStats->toString());
EXPECT_TRUE(stripeColStats->hasMinimum());
EXPECT_TRUE(stripeColStats->hasMaximum());
- EXPECT_EQ("Data type: Timestamp\nValues: 12\nMinimum: 1995-01-01 00:00:00.688\nLowerBound: 1995-01-01 00:00:00.688\nMaximum: 2037-01-01 00:00:00.0\nUpperBound: 2037-01-01 00:00:00.1\n", stripeColStats->toString());
+ EXPECT_EQ("Data type: Timestamp\nValues: 12\nHas null: no\nMinimum: 1995-01-01 00:00:00.688\nLowerBound: 1995-01-01 00:00:00.688\nMaximum: 2037-01-01 00:00:00.0\nUpperBound: 2037-01-01 00:00:00.1\n", stripeColStats->toString());
}
} // namespace