You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@orc.apache.org by mo...@apache.org on 2019/06/26 20:38:59 UTC

[orc] branch master updated: ORC-398 col stats list map (#395)

This is an automated email from the ASF dual-hosted git repository.

more pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/orc.git


The following commit(s) were added to refs/heads/master by this push:
     new c90d567  ORC-398 col stats list map (#395)
c90d567 is described below

commit c90d5672c569353a71af67ca3673d3f2ac9fbe99
Author: Sandeep Moré <mo...@apache.org>
AuthorDate: Wed Jun 26 16:38:55 2019 -0400

    ORC-398 col stats list map (#395)
    
    Fixes #395
---
 .../org/apache/orc/CollectionColumnStatistics.java |  41 +++++
 .../org/apache/orc/impl/ColumnStatisticsImpl.java  | 183 ++++++++++++++++++++-
 .../java/org/apache/orc/impl/RecordReaderImpl.java |   8 +-
 .../org/apache/orc/impl/writer/ListTreeWriter.java |   3 +
 .../org/apache/orc/impl/writer/MapTreeWriter.java  |   4 +
 .../apache/orc/impl/TestColumnStatisticsImpl.java  |  26 +++
 .../java/org/apache/orc/tools/JsonFileDump.java    |   5 +
 proto/orc_proto.proto                              |   8 +
 8 files changed, 270 insertions(+), 8 deletions(-)

diff --git a/java/core/src/java/org/apache/orc/CollectionColumnStatistics.java b/java/core/src/java/org/apache/orc/CollectionColumnStatistics.java
new file mode 100644
index 0000000..7a18026
--- /dev/null
+++ b/java/core/src/java/org/apache/orc/CollectionColumnStatistics.java
@@ -0,0 +1,41 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.orc;
+
+/**
+ * Statistics for collection columns such as Map and List.
+ */
+public interface CollectionColumnStatistics extends ColumnStatistics {
+  /**
+   * Get the smallest number of children recorded for a single collection value.
+   * @return the minimum children count
+   */
+  long getMinimumChildren();
+
+  /**
+   * Get the largest number of children recorded for a single collection value.
+   * @return the maximum children count
+   */
+  long getMaximumChildren();
+
+  /**
+   * Get the total number of children summed over all recorded collection values.
+   * @return the total number of children
+   */
+  long getTotalChildren();
+}
diff --git a/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java b/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java
index a6a3780..5791737 100644
--- a/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java
+++ b/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java
@@ -25,6 +25,7 @@ import org.apache.hadoop.io.Text;
 import org.apache.hadoop.io.WritableComparator;
 import org.apache.orc.BinaryColumnStatistics;
 import org.apache.orc.BooleanColumnStatistics;
+import org.apache.orc.CollectionColumnStatistics;
 import org.apache.orc.ColumnStatistics;
 import org.apache.orc.DateColumnStatistics;
 import org.apache.orc.DecimalColumnStatistics;
@@ -35,13 +36,8 @@ import org.apache.orc.StringColumnStatistics;
 import org.apache.orc.TimestampColumnStatistics;
 import org.apache.orc.TypeDescription;
 
-import java.nio.ByteBuffer;
-import java.nio.CharBuffer;
-import java.nio.charset.Charset;
-import java.nio.charset.StandardCharsets;
 import java.sql.Date;
 import java.sql.Timestamp;
-import java.util.Arrays;
 import java.util.TimeZone;
 
 public class ColumnStatisticsImpl implements ColumnStatistics {
@@ -66,7 +62,6 @@ public class ColumnStatisticsImpl implements ColumnStatistics {
     if (bytesOnDisk != that.bytesOnDisk) {
       return false;
     }
-
     return true;
   }
 
@@ -170,6 +165,166 @@ public class ColumnStatisticsImpl implements ColumnStatistics {
     }
   }
 
+  /**
+   * Statistics for LIST and MAP columns: the smallest and largest collection
+   * length recorded so far, together with the running total of all lengths.
+   */
+  private static final class CollectionColumnStatisticsImpl extends ColumnStatisticsImpl
+      implements CollectionColumnStatistics {
+
+    protected long minimum = Long.MAX_VALUE;
+    protected long maximum = 0;
+    protected long sum = 0;
+
+    CollectionColumnStatisticsImpl() {
+      super();
+    }
+
+    /**
+     * Restores the statistics from their serialized protobuf form.
+     */
+    CollectionColumnStatisticsImpl(OrcProto.ColumnStatistics stats) {
+      super(stats);
+      final OrcProto.CollectionStatistics serialized = stats.getCollectionStatistics();
+      // Fields missing from the message keep their initial values.
+      if (serialized.hasMinChildren()) {
+        minimum = serialized.getMinChildren();
+      }
+      if (serialized.hasMaxChildren()) {
+        maximum = serialized.getMaxChildren();
+      }
+      if (serialized.hasTotalChildren()) {
+        sum = serialized.getTotalChildren();
+      }
+    }
+
+    /**
+     * Folds one collection length into the minimum, maximum and total.
+     */
+    @Override
+    public void updateCollectionLength(final long length) {
+      minimum = Math.min(minimum, length);
+      maximum = Math.max(maximum, length);
+      sum += length;
+    }
+
+    /**
+     * Clears the base statistics and returns the bounds and total to their initial values.
+     */
+    @Override
+    public void reset() {
+      super.reset();
+      sum = 0;
+      maximum = 0;
+      minimum = Long.MAX_VALUE;
+    }
+
+    /**
+     * Combines another collection statistics object into this one; any other kind
+     * is rejected with IllegalArgumentException when isStatsExists() is true.
+     */
+    @Override
+    public void merge(ColumnStatisticsImpl other) {
+      if (!(other instanceof CollectionColumnStatisticsImpl)) {
+        if (isStatsExists()) {
+          throw new IllegalArgumentException("Incompatible merging of collection column statistics");
+        }
+      } else {
+        final CollectionColumnStatisticsImpl incoming = (CollectionColumnStatisticsImpl) other;
+        // When count is zero, the incoming bounds are adopted as they are.
+        final boolean adopt = count == 0;
+        minimum = adopt ? incoming.minimum : Math.min(minimum, incoming.minimum);
+        maximum = adopt ? incoming.maximum : Math.max(maximum, incoming.maximum);
+        sum += incoming.sum;
+      }
+      super.merge(other);
+    }
+
+    @Override
+    public long getMinimumChildren() {
+      return minimum;
+    }
+
+    @Override
+    public long getMaximumChildren() {
+      return maximum;
+    }
+
+    @Override
+    public long getTotalChildren() {
+      return sum;
+    }
+
+    /**
+     * Appends the child bounds to the base description when count is non-zero;
+     * the total is shown only when it is itself non-zero.
+     */
+    @Override
+    public String toString() {
+      final StringBuilder text = new StringBuilder(super.toString());
+      if (count == 0) {
+        return text.toString();
+      }
+      text.append(" minChildren: ").append(minimum)
+          .append(" maxChildren: ").append(maximum);
+      if (sum != 0) {
+        text.append(" totalChildren: ").append(sum);
+      }
+      return text.toString();
+    }
+
+    /**
+     * Equal when the base statistics and all three child counters match.
+     */
+    @Override
+    public boolean equals(Object o) {
+      if (this == o) {
+        return true;
+      }
+      if (!(o instanceof CollectionColumnStatisticsImpl) || !super.equals(o)) {
+        return false;
+      }
+      final CollectionColumnStatisticsImpl rhs = (CollectionColumnStatisticsImpl) o;
+      return minimum == rhs.minimum
+          && maximum == rhs.maximum
+          && sum == rhs.sum;
+    }
+
+    @Override
+    public int hashCode() {
+      final boolean counted = count != 0;
+      int hash = super.hashCode();
+      hash = 31 * hash + (counted ? (int) (minimum ^ (minimum >>> 32)) : 0);
+      hash = 31 * hash + (counted ? (int) (maximum ^ (maximum >>> 32)) : 0);
+      // Folding a zero total yields zero, so no special case is needed for it.
+      hash = 31 * hash + (int) (sum ^ (sum >>> 32));
+      return hash;
+    }
+
+    /**
+     * Writes the bounds only when count is non-zero and the total only when it
+     * is non-zero; the collection message itself is always attached.
+     */
+    @Override
+    public OrcProto.ColumnStatistics.Builder serialize() {
+      final OrcProto.ColumnStatistics.Builder result = super.serialize();
+      final OrcProto.CollectionStatistics.Builder children =
+          OrcProto.CollectionStatistics.newBuilder();
+      if (count != 0) {
+        children.setMinChildren(minimum);
+        children.setMaxChildren(maximum);
+      }
+      if (sum != 0) {
+        children.setTotalChildren(sum);
+      }
+      result.setCollectionStatistics(children);
+      return result;
+    }
+  }
+
+  /**
+   * Implementation of IntegerColumnStatistics
+   */
   private static final class IntegerStatisticsImpl extends ColumnStatisticsImpl
       implements IntegerColumnStatistics {
 
@@ -266,7 +421,7 @@ public class ColumnStatisticsImpl implements ColumnStatistics {
     public OrcProto.ColumnStatistics.Builder serialize() {
       OrcProto.ColumnStatistics.Builder builder = super.serialize();
       OrcProto.IntegerStatistics.Builder intb =
-        OrcProto.IntegerStatistics.newBuilder();
+          OrcProto.IntegerStatistics.newBuilder();
       if (hasMinimum) {
         intb.setMinimum(minimum);
         intb.setMaximum(maximum);
@@ -1673,6 +1828,15 @@ public class ColumnStatisticsImpl implements ColumnStatistics {
     hasNull = true;
   }
 
+  /**
+   * Record the length of one Map or List value; unsupported unless overridden.
+   * @param value length of collection
+   */
+  public void updateCollectionLength(final long value) {
+    final String reason = "Can't update collection count";
+    throw new UnsupportedOperationException(reason);
+  }
+
   public void updateBoolean(boolean value, int repetitions) {
     throw new UnsupportedOperationException("Can't update boolean");
   }
@@ -1789,6 +1953,9 @@ public class ColumnStatisticsImpl implements ColumnStatistics {
       case INT:
       case LONG:
         return new IntegerStatisticsImpl();
+      case LIST:
+      case MAP:
+        return new CollectionColumnStatisticsImpl();
       case FLOAT:
       case DOUBLE:
         return new DoubleStatisticsImpl();
@@ -1819,6 +1986,8 @@ public class ColumnStatisticsImpl implements ColumnStatistics {
       return new BooleanStatisticsImpl(stats);
     } else if (stats.hasIntStatistics()) {
       return new IntegerStatisticsImpl(stats);
+    } else if (stats.hasCollectionStatistics()) {
+      return new CollectionColumnStatisticsImpl(stats);
     } else if (stats.hasDoubleStatistics()) {
       return new DoubleStatisticsImpl(stats);
     } else if (stats.hasStringStatistics()) {
diff --git a/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java b/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java
index 786f963..c1b9652 100644
--- a/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java
+++ b/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java
@@ -31,6 +31,7 @@ import org.apache.hadoop.hive.serde2.io.DateWritable;
 import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable;
 import org.apache.hadoop.io.Text;
 import org.apache.orc.BooleanColumnStatistics;
+import org.apache.orc.CollectionColumnStatistics;
 import org.apache.orc.ColumnStatistics;
 import org.apache.orc.CompressionCodec;
 import org.apache.orc.DataReader;
@@ -411,7 +412,12 @@ public class RecordReaderImpl implements RecordReader {
       Long min = stats.getMinimum();
       Long max = stats.getMaximum();
       return new ValueRange<>(predicate, min, max, stats.hasNull());
-    } else if (index instanceof DoubleColumnStatistics) {
+    } else if (index instanceof CollectionColumnStatistics) {
+      CollectionColumnStatistics stats = (CollectionColumnStatistics) index;
+      Long min = stats.getMinimumChildren();
+      Long max = stats.getMaximumChildren();
+      return new ValueRange<>(predicate, min, max, stats.hasNull());
+    }else if (index instanceof DoubleColumnStatistics) {
       DoubleColumnStatistics stats = (DoubleColumnStatistics) index;
       Double min = stats.getMinimum();
       Double max = stats.getMaximum();
diff --git a/java/core/src/java/org/apache/orc/impl/writer/ListTreeWriter.java b/java/core/src/java/org/apache/orc/impl/writer/ListTreeWriter.java
index 3cd3ed1..e21bace 100644
--- a/java/core/src/java/org/apache/orc/impl/writer/ListTreeWriter.java
+++ b/java/core/src/java/org/apache/orc/impl/writer/ListTreeWriter.java
@@ -71,6 +71,9 @@ public class ListTreeWriter extends TreeWriterBase {
                          int length) throws IOException {
     super.writeBatch(vector, offset, length);
     ListColumnVector vec = (ListColumnVector) vector;
+    /* update aggregate statistics */
+    indexStatistics.updateCollectionLength(vec.lengths.length);
+
     if (vector.isRepeating) {
       if (vector.noNulls || !vector.isNull[0]) {
         int childOffset = (int) vec.offsets[0];
diff --git a/java/core/src/java/org/apache/orc/impl/writer/MapTreeWriter.java b/java/core/src/java/org/apache/orc/impl/writer/MapTreeWriter.java
index 02191ad..6f8ed2d 100644
--- a/java/core/src/java/org/apache/orc/impl/writer/MapTreeWriter.java
+++ b/java/core/src/java/org/apache/orc/impl/writer/MapTreeWriter.java
@@ -75,6 +75,10 @@ public class MapTreeWriter extends TreeWriterBase {
                          int length) throws IOException {
     super.writeBatch(vector, offset, length);
     MapColumnVector vec = (MapColumnVector) vector;
+
+    /* update aggregate statistics */
+    indexStatistics.updateCollectionLength(vec.lengths.length);
+
     if (vector.isRepeating) {
       if (vector.noNulls || !vector.isNull[0]) {
         int childOffset = (int) vec.offsets[0];
diff --git a/java/core/src/test/org/apache/orc/impl/TestColumnStatisticsImpl.java b/java/core/src/test/org/apache/orc/impl/TestColumnStatisticsImpl.java
index 64d4626..45274b1 100644
--- a/java/core/src/test/org/apache/orc/impl/TestColumnStatisticsImpl.java
+++ b/java/core/src/test/org/apache/orc/impl/TestColumnStatisticsImpl.java
@@ -148,4 +148,30 @@ public class TestColumnStatisticsImpl {
     updateStats1.merge(updateStats2);
     assertEquals(null, stats1.getSum());
   }
+
+  /**
+   * Checks that list statistics serialize the expected child counts: the
+   * minimum and maximum over the recorded values, and the total of all of
+   * them.
+   */
+  @Test
+  public void testCollectionColumnStats() throws Exception {
+    final TypeDescription listType =
+        TypeDescription.createList(TypeDescription.createInt());
+    final ColumnStatisticsImpl listStats = ColumnStatisticsImpl.create(listType);
+
+    // Four values holding 10, 20, 30 and 40 children respectively.
+    for (long children = 10; children <= 40; children += 10) {
+      listStats.increment();
+      listStats.updateCollectionLength(children);
+    }
+
+    final OrcProto.CollectionStatistics serialized =
+        listStats.serialize().getCollectionStatistics();
+
+    // Smallest value, largest value, and 10 + 20 + 30 + 40.
+    assertEquals(10, serialized.getMinChildren());
+    assertEquals(40, serialized.getMaxChildren());
+    assertEquals(100, serialized.getTotalChildren());
+  }
 }
diff --git a/java/tools/src/java/org/apache/orc/tools/JsonFileDump.java b/java/tools/src/java/org/apache/orc/tools/JsonFileDump.java
index 8813bde..db51fed 100644
--- a/java/tools/src/java/org/apache/orc/tools/JsonFileDump.java
+++ b/java/tools/src/java/org/apache/orc/tools/JsonFileDump.java
@@ -24,6 +24,7 @@ import java.util.List;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
+import org.apache.orc.CollectionColumnStatistics;
 import org.apache.orc.CompressionKind;
 import org.apache.orc.OrcFile;
 import org.apache.orc.Reader;
@@ -352,6 +353,10 @@ public class JsonFileDump {
           writer.key("sum").value(((DecimalColumnStatistics) cs).getSum());
         }
         writer.key("type").value(OrcProto.Type.Kind.DECIMAL);
+      } else if (cs instanceof CollectionColumnStatistics) {
+        writer.key("minChildren").value(((CollectionColumnStatistics) cs).getMinimumChildren());
+        writer.key("maxChildren").value(((CollectionColumnStatistics) cs).getMaximumChildren());
+        writer.key("totalChildren").value(((CollectionColumnStatistics) cs).getTotalChildren());
       }
     }
   }
diff --git a/proto/orc_proto.proto b/proto/orc_proto.proto
index dc8f080..f0c66f1 100644
--- a/proto/orc_proto.proto
+++ b/proto/orc_proto.proto
@@ -74,6 +74,13 @@ message BinaryStatistics {
   optional sint64 sum = 1;
 }
 
+// Statistics for list and map columns (minimum, maximum and total child counts)
+message CollectionStatistics {
+  optional uint64 minChildren = 1;
+  optional uint64 maxChildren = 2;
+  optional uint64 totalChildren = 3;
+}
+
 message ColumnStatistics {
   optional uint64 numberOfValues = 1;
   optional IntegerStatistics intStatistics = 2;
@@ -86,6 +93,7 @@ message ColumnStatistics {
   optional TimestampStatistics timestampStatistics = 9;
   optional bool hasNull = 10;
   optional uint64 bytesOnDisk = 11;
+  optional CollectionStatistics collectionStatistics = 12;
 }
 
 message RowIndexEntry {