You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@orc.apache.org by om...@apache.org on 2018/05/03 22:30:37 UTC
orc git commit: ORC-345. Create and use Decimal64StatisticsImpl for
the write path.
Repository: orc
Updated Branches:
refs/heads/master 4cc0968f5 -> c4c7b28f6
ORC-345. Create and use Decimal64StatisticsImpl for the write path.
Fixes #252
Signed-off-by: Owen O'Malley <om...@apache.org>
Project: http://git-wip-us.apache.org/repos/asf/orc/repo
Commit: http://git-wip-us.apache.org/repos/asf/orc/commit/c4c7b28f
Tree: http://git-wip-us.apache.org/repos/asf/orc/tree/c4c7b28f
Diff: http://git-wip-us.apache.org/repos/asf/orc/diff/c4c7b28f
Branch: refs/heads/master
Commit: c4c7b28f6863a9e2b5d1442c1a9e6b4f19c217b6
Parents: 4cc0968
Author: Owen O'Malley <om...@apache.org>
Authored: Wed Apr 18 14:30:35 2018 -0700
Committer: Owen O'Malley <om...@apache.org>
Committed: Thu May 3 15:30:14 2018 -0700
----------------------------------------------------------------------
java/bench/pom.xml | 8 +-
.../java/org/apache/orc/bench/DecimalBench.java | 272 +++++++++++++++++++
.../src/java/org/apache/orc/bench/Driver.java | 4 +
.../org/apache/orc/bench/NullFileSystem.java | 121 +++++++++
.../orc/bench/convert/GenerateVariants.java | 4 +-
.../java/org/apache/orc/StripeStatistics.java | 2 +-
.../java/org/apache/orc/TypeDescription.java | 6 +-
.../apache/orc/impl/ColumnStatisticsImpl.java | 236 +++++++++++++++-
.../java/org/apache/orc/impl/ReaderImpl.java | 5 +-
.../org/apache/orc/impl/RecordReaderImpl.java | 2 +-
.../java/org/apache/orc/impl/WriterImpl.java | 2 +-
.../orc/impl/writer/Decimal64TreeWriter.java | 12 +-
.../orc/impl/writer/DecimalTreeWriter.java | 13 +-
.../apache/orc/impl/writer/TreeWriterBase.java | 5 +-
.../apache/orc/impl/writer/WriterImplV2.java | 2 +-
.../org/apache/orc/TestColumnStatistics.java | 2 +-
.../test/org/apache/orc/TestVectorOrcFile.java | 3 -
.../orc/impl/TestColumnStatisticsImpl.java | 63 +++++
.../apache/orc/impl/TestRecordReaderImpl.java | 51 ++--
.../src/java/org/apache/orc/tools/FileDump.java | 12 +-
.../java/org/apache/orc/tools/JsonFileDump.java | 8 +-
21 files changed, 771 insertions(+), 62 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/orc/blob/c4c7b28f/java/bench/pom.xml
----------------------------------------------------------------------
diff --git a/java/bench/pom.xml b/java/bench/pom.xml
index 148693a..739bcf1 100644
--- a/java/bench/pom.xml
+++ b/java/bench/pom.xml
@@ -39,6 +39,7 @@
<avro.version>1.8.2</avro.version>
<hadoop.version>2.7.3</hadoop.version>
<hive.version>2.3.3</hive.version>
+ <jmh.version>1.20</jmh.version>
<orc.version>1.5.0-SNAPSHOT</orc.version>
<parquet.version>1.9.0</parquet.version>
<storage-api.version>2.5.0</storage-api.version>
@@ -139,7 +140,12 @@
<dependency>
<groupId>org.openjdk.jmh</groupId>
<artifactId>jmh-core</artifactId>
- <version>1.18</version>
+ <version>${jmh.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.openjdk.jmh</groupId>
+ <artifactId>jmh-generator-annprocess</artifactId>
+ <version>${jmh.version}</version>
</dependency>
</dependencies>
http://git-wip-us.apache.org/repos/asf/orc/blob/c4c7b28f/java/bench/src/java/org/apache/orc/bench/DecimalBench.java
----------------------------------------------------------------------
diff --git a/java/bench/src/java/org/apache/orc/bench/DecimalBench.java b/java/bench/src/java/org/apache/orc/bench/DecimalBench.java
new file mode 100644
index 0000000..71a1c33
--- /dev/null
+++ b/java/bench/src/java/org/apache/orc/bench/DecimalBench.java
@@ -0,0 +1,272 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.bench;
+
+import com.google.gson.JsonStreamParser;
+import org.apache.avro.file.DataFileReader;
+import org.apache.avro.generic.GenericDatumReader;
+import org.apache.avro.generic.GenericRecord;
+import org.apache.avro.io.DatumReader;
+import org.apache.avro.mapred.FsInput;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.TrackingLocalFileSystem;
+import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.Decimal64ColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.hadoop.hive.ql.io.parquet.read.DataWritableReadSupport;
+import org.apache.hadoop.hive.ql.io.parquet.read.ParquetRecordReaderWrapper;
+import org.apache.hadoop.io.ArrayWritable;
+import org.apache.hadoop.io.NullWritable;
+import org.apache.hadoop.mapred.FileSplit;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.Reporter;
+import org.apache.orc.CompressionKind;
+import org.apache.orc.OrcFile;
+import org.apache.orc.Reader;
+import org.apache.orc.RecordReader;
+import org.apache.orc.TypeDescription;
+import org.apache.orc.Writer;
+import org.apache.orc.bench.convert.BatchReader;
+import org.apache.orc.bench.convert.GenerateVariants;
+import org.apache.orc.bench.convert.csv.CsvReader;
+import org.apache.parquet.hadoop.ParquetInputFormat;
+import org.openjdk.jmh.annotations.AuxCounters;
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Fork;
+import org.openjdk.jmh.annotations.Level;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import org.openjdk.jmh.annotations.Param;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.TearDown;
+import org.openjdk.jmh.annotations.Warmup;
+import org.openjdk.jmh.infra.Blackhole;
+import org.openjdk.jmh.runner.Runner;
+import org.openjdk.jmh.runner.options.OptionsBuilder;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.net.URI;
+import java.nio.charset.StandardCharsets;
+import java.util.concurrent.TimeUnit;
+
+@BenchmarkMode(Mode.AverageTime)
+@Warmup(iterations=2, time=30, timeUnit = TimeUnit.SECONDS)
+@Measurement(iterations=10, time=30, timeUnit = TimeUnit.SECONDS)
+@State(Scope.Thread)
+@OutputTimeUnit(TimeUnit.MICROSECONDS)
+@Fork(2)
+public class DecimalBench {
+
+ private static final String ROOT_ENVIRONMENT_NAME = "bench.root.dir";
+ private static final Path root;
+ static {
+ String value = System.getProperty(ROOT_ENVIRONMENT_NAME);
+ root = value == null ? null : new Path(value);
+ }
+
+ /**
+ * Abstract out whether we are writing short or long decimals
+ */
+ interface Loader {
+ /**
+ * Load the data from the values array into the ColumnVector.
+ * @param vector the output
+ * @param values the input
+ * @param offset the first input value
+ * @param length the number of values to copy
+ */
+ void loadData(ColumnVector vector, long[] values, int offset, int length);
+ }
+
+ static class Decimal64Loader implements Loader {
+ final int scale;
+ final int precision;
+
+ Decimal64Loader(int precision, int scale) {
+ this.precision = precision;
+ this.scale = scale;
+ }
+
+ @Override
+ public void loadData(ColumnVector vector, long[] values, int offset, int length) {
+ Decimal64ColumnVector v = (Decimal64ColumnVector) vector;
+ v.ensureSize(length, false);
+ v.noNulls = true;
+ for(int p=0; p < length; ++p) {
+ v.vector[p] = values[p + offset];
+ }
+ v.precision = (short) precision;
+ v.scale = (short) scale;
+ }
+ }
+
+ static class DecimalLoader implements Loader {
+ final int scale;
+ final int precision;
+
+ DecimalLoader(int precision, int scale) {
+ this.precision = precision;
+ this.scale = scale;
+ }
+
+ @Override
+ public void loadData(ColumnVector vector, long[] values, int offset, int length) {
+ DecimalColumnVector v = (DecimalColumnVector) vector;
+ v.noNulls = true;
+ for(int p=0; p < length; ++p) {
+ v.vector[p].setFromLongAndScale(values[offset + p], scale);
+ }
+ v.precision = (short) precision;
+ v.scale = (short) scale;
+ }
+ }
+
+ @State(Scope.Thread)
+ public static class OutputState {
+
+ // try both short and long decimals
+ @Param({"8", "19"})
+ public int precision;
+
+ long[] total_amount = new long[1024 * 1024];
+ Configuration conf = new Configuration();
+ FileSystem fs = new NullFileSystem();
+ TypeDescription schema;
+ VectorizedRowBatch batch;
+ Loader loader;
+
+ @Setup
+ public void setup() throws IOException {
+ schema = TypeDescription.createDecimal()
+ .withScale(2)
+ .withPrecision(precision);
+ loader = precision <= 18 ?
+ new Decimal64Loader(precision, 2) :
+ new DecimalLoader(precision, 2);
+ readCsvData(total_amount, root, "total_amount", conf);
+ batch = schema.createRowBatchV2();
+ }
+ }
+
+ @Benchmark
+ public void write(OutputState state) throws Exception {
+ Writer writer = OrcFile.createWriter(new Path("null"),
+ OrcFile.writerOptions(state.conf)
+ .fileSystem(state.fs)
+ .setSchema(state.schema)
+ .compress(CompressionKind.NONE));
+ int r = 0;
+ int batchSize = state.batch.getMaxSize();
+ while (r < state.total_amount.length) {
+ state.batch.size = batchSize;
+ state.loader.loadData(state.batch.cols[0], state.total_amount, r, batchSize);
+ writer.addRowBatch(state.batch);
+ r += batchSize;
+ }
+ writer.close();
+ }
+
+ static void readCsvData(long[] data,
+ Path root,
+ String column,
+ Configuration conf) throws IOException {
+ TypeDescription schema = Utilities.loadSchema("taxi.schema");
+ int row = 0;
+ int batchPosn = 0;
+ BatchReader reader =
+ new GenerateVariants.RecursiveReader(new Path(root, "sources/taxi"), "csv",
+ schema, conf, org.apache.orc.bench.CompressionKind.ZLIB);
+ VectorizedRowBatch batch = schema.createRowBatch();
+ batch.size = 0;
+ TypeDescription columnSchema = schema.findSubtype(column);
+ DecimalColumnVector cv = (DecimalColumnVector) batch.cols[columnSchema.getId() - 1];
+ int scale = columnSchema.getScale();
+ while (row < data.length) {
+ if (batchPosn >= batch.size) {
+ if (!reader.nextBatch(batch)) {
+ throw new IllegalArgumentException("Not enough data");
+ }
+ batchPosn = 0;
+ }
+ data[row++] = cv.vector[batchPosn++].serialize64(scale);
+ }
+ }
+
+ @State(Scope.Thread)
+ public static class InputState {
+
+ // try both DecimalColumnVector and Decimal64ColumnVector
+ @Param({"ORIGINAL", "USE_DECIMAL64"})
+ public TypeDescription.RowBatchVersion version;
+
+ Configuration conf = new Configuration();
+ FileSystem fs;
+ TypeDescription schema;
+ VectorizedRowBatch batch;
+ Path path;
+ boolean[] include;
+ Reader reader;
+ OrcFile.ReaderOptions options;
+
+ @Setup
+ public void setup() throws IOException {
+ fs = FileSystem.getLocal(conf).getRaw();
+ path = new Path(root, "generated/taxi/orc.none");
+ schema = Utilities.loadSchema("taxi.schema");
+ batch = schema.createRowBatch(version, 1024);
+ // only include the columns with decimal values
+ include = new boolean[schema.getMaximumId() + 1];
+ for(TypeDescription child: schema.getChildren()) {
+ if (child.getCategory() == TypeDescription.Category.DECIMAL) {
+ include[child.getId()] = true;
+ }
+ }
+ reader = OrcFile.createReader(path,
+ OrcFile.readerOptions(conf).filesystem(fs));
+ // just read the decimal columns from the first stripe
+ reader.options().include(include).range(0, 1000);
+ }
+ }
+
+ @Benchmark
+ public void read(Blackhole blackhole, InputState state) throws Exception {
+ RecordReader rows = state.reader.rows();
+ while (rows.nextBatch(state.batch)) {
+ blackhole.consume(state.batch);
+ }
+ rows.close();
+ }
+
+ public static void main(String[] args) throws Exception {
+ new Runner(new OptionsBuilder()
+ .include(DecimalBench.class.getSimpleName())
+ .jvmArgs("-server", "-Xms256m", "-Xmx2g",
+ "-D" + ROOT_ENVIRONMENT_NAME + "=" + args[0]).build()
+ ).run();
+ }
+}
http://git-wip-us.apache.org/repos/asf/orc/blob/c4c7b28f/java/bench/src/java/org/apache/orc/bench/Driver.java
----------------------------------------------------------------------
diff --git a/java/bench/src/java/org/apache/orc/bench/Driver.java b/java/bench/src/java/org/apache/orc/bench/Driver.java
index c8f1592..6a86f90 100644
--- a/java/bench/src/java/org/apache/orc/bench/Driver.java
+++ b/java/bench/src/java/org/apache/orc/bench/Driver.java
@@ -47,6 +47,7 @@ public class Driver {
System.err.println(" scan - Scan data variants");
System.err.println(" read-all - Full table scan benchmark");
System.err.println(" read-some - Column projection benchmark");
+ System.err.println(" decimal - Decimal benchmark");
System.exit(1);
}
return result;
@@ -70,6 +71,9 @@ public class Driver {
case "read-some":
ColumnProjectionBenchmark.main(args);
break;
+ case "decimal":
+ DecimalBench.main(args);
+ break;
default:
System.err.println("Unknown command " + command);
System.exit(1);
http://git-wip-us.apache.org/repos/asf/orc/blob/c4c7b28f/java/bench/src/java/org/apache/orc/bench/NullFileSystem.java
----------------------------------------------------------------------
diff --git a/java/bench/src/java/org/apache/orc/bench/NullFileSystem.java b/java/bench/src/java/org/apache/orc/bench/NullFileSystem.java
new file mode 100644
index 0000000..23d19cc
--- /dev/null
+++ b/java/bench/src/java/org/apache/orc/bench/NullFileSystem.java
@@ -0,0 +1,121 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.bench;
+
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.fs.FSDataOutputStream;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.permission.FsPermission;
+import org.apache.hadoop.util.Progressable;
+
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.net.URI;
+import java.net.URISyntaxException;
+
+public class NullFileSystem extends FileSystem {
+ @Override
+ public URI getUri() {
+ try {
+ return new URI("null:///");
+ } catch (URISyntaxException e) {
+ throw new IllegalArgumentException("Bad URL", e);
+ }
+ }
+
+ @Override
+ public FSDataInputStream open(Path path, int i) throws IOException {
+ return new FSDataInputStream(new InputStream() {
+ @Override
+ public int read() throws IOException {
+ return -1;
+ }
+ });
+ }
+
+ static class NullOutput extends OutputStream {
+
+ @Override
+ public void write(int b) {
+ // pass
+ }
+
+ public void write(byte[] buffer, int offset, int length) {
+ // pass
+ }
+ }
+ private static final OutputStream NULL_OUTPUT = new NullOutput();
+
+ @Override
+ public FSDataOutputStream create(Path path,
+ FsPermission fsPermission,
+ boolean b,
+ int i,
+ short i1,
+ long l,
+ Progressable progressable) throws IOException {
+ return new FSDataOutputStream(NULL_OUTPUT);
+ }
+
+ @Override
+ public FSDataOutputStream append(Path path,
+ int i,
+ Progressable progressable) throws IOException {
+ return new FSDataOutputStream(NULL_OUTPUT);
+ }
+
+ @Override
+ public boolean rename(Path path, Path path1) {
+ return false;
+ }
+
+ @Override
+ public boolean delete(Path path, boolean b) {
+ return false;
+ }
+
+ @Override
+ public FileStatus[] listStatus(Path path) {
+ return null;
+ }
+
+ @Override
+ public void setWorkingDirectory(Path path) {
+ // pass
+ }
+
+ @Override
+ public Path getWorkingDirectory() {
+ return null;
+ }
+
+ @Override
+ public boolean mkdirs(Path path, FsPermission fsPermission) throws IOException {
+ return false;
+ }
+
+ @Override
+ public FileStatus getFileStatus(Path path) throws IOException {
+ return null;
+ }
+}
http://git-wip-us.apache.org/repos/asf/orc/blob/c4c7b28f/java/bench/src/java/org/apache/orc/bench/convert/GenerateVariants.java
----------------------------------------------------------------------
diff --git a/java/bench/src/java/org/apache/orc/bench/convert/GenerateVariants.java b/java/bench/src/java/org/apache/orc/bench/convert/GenerateVariants.java
index 7f57468..57cf4c9 100644
--- a/java/bench/src/java/org/apache/orc/bench/convert/GenerateVariants.java
+++ b/java/bench/src/java/org/apache/orc/bench/convert/GenerateVariants.java
@@ -95,7 +95,7 @@ public class GenerateVariants {
}
}
- static class RecursiveReader implements BatchReader {
+ public static class RecursiveReader implements BatchReader {
private final RemoteIterator<LocatedFileStatus> filenames;
private final String format;
private final TypeDescription schema;
@@ -103,7 +103,7 @@ public class GenerateVariants {
private final CompressionKind compress;
private BatchReader current = null;
- RecursiveReader(Path root,
+ public RecursiveReader(Path root,
String format,
TypeDescription schema,
Configuration conf,
http://git-wip-us.apache.org/repos/asf/orc/blob/c4c7b28f/java/core/src/java/org/apache/orc/StripeStatistics.java
----------------------------------------------------------------------
diff --git a/java/core/src/java/org/apache/orc/StripeStatistics.java b/java/core/src/java/org/apache/orc/StripeStatistics.java
index 8fc91cb..d1738ff 100644
--- a/java/core/src/java/org/apache/orc/StripeStatistics.java
+++ b/java/core/src/java/org/apache/orc/StripeStatistics.java
@@ -37,7 +37,7 @@ public class StripeStatistics {
public ColumnStatistics[] getColumnStatistics() {
ColumnStatistics[] result = new ColumnStatistics[cs.size()];
for (int i = 0; i < result.length; ++i) {
- result[i] = ColumnStatisticsImpl.deserialize(cs.get(i));
+ result[i] = ColumnStatisticsImpl.deserialize(null, cs.get(i));
}
return result;
}
http://git-wip-us.apache.org/repos/asf/orc/blob/c4c7b28f/java/core/src/java/org/apache/orc/TypeDescription.java
----------------------------------------------------------------------
diff --git a/java/core/src/java/org/apache/orc/TypeDescription.java b/java/core/src/java/org/apache/orc/TypeDescription.java
index 264adff..50cd022 100644
--- a/java/core/src/java/org/apache/orc/TypeDescription.java
+++ b/java/core/src/java/org/apache/orc/TypeDescription.java
@@ -47,6 +47,8 @@ public class TypeDescription
private static final int DEFAULT_PRECISION = 38;
private static final int DEFAULT_SCALE = 10;
public static final int MAX_DECIMAL64_PRECISION = 18;
+ public static final long MAX_DECIMAL64 = 999_999_999_999_999_999L;
+ public static final long MIN_DECIMAL64 = -MAX_DECIMAL64;
private static final int DEFAULT_LENGTH = 256;
static final Pattern UNQUOTED_NAMES = Pattern.compile("^[a-zA-Z0-9_]+$");
@@ -681,12 +683,12 @@ public class TypeDescription
/**
* Specify the version of the VectorizedRowBatch that the user desires.
*/
- enum RowBatchVersion {
+ public enum RowBatchVersion {
ORIGINAL,
USE_DECIMAL64;
}
- VectorizedRowBatch createRowBatch(RowBatchVersion version, int size) {
+ public VectorizedRowBatch createRowBatch(RowBatchVersion version, int size) {
VectorizedRowBatch result;
if (category == Category.STRUCT) {
result = new VectorizedRowBatch(children.size(), size);
http://git-wip-us.apache.org/repos/asf/orc/blob/c4c7b28f/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java
----------------------------------------------------------------------
diff --git a/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java b/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java
index 0cd69f4..98a4e17 100644
--- a/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java
+++ b/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java
@@ -835,6 +835,13 @@ public class ColumnStatisticsImpl implements ColumnStatistics {
}
@Override
+ public void updateDecimal64(long value, int scale) {
+ HiveDecimalWritable dValue = new HiveDecimalWritable();
+ dValue.setFromLongAndScale(value, scale);
+ updateDecimal(dValue);
+ }
+
+ @Override
public void merge(ColumnStatisticsImpl other) {
if (other instanceof DecimalStatisticsImpl) {
DecimalStatisticsImpl dec = (DecimalStatisticsImpl) other;
@@ -948,6 +955,215 @@ public class ColumnStatisticsImpl implements ColumnStatistics {
}
}
+ private static final class Decimal64StatisticsImpl extends ColumnStatisticsImpl
+ implements DecimalColumnStatistics {
+
+ private final int scale;
+ private long minimum;
+ private long maximum;
+ private boolean hasSum = true;
+ private long sum = 0;
+ private final HiveDecimalWritable scratch = new HiveDecimalWritable();
+
+ Decimal64StatisticsImpl(int scale) {
+ this.scale = scale;
+ }
+
+ Decimal64StatisticsImpl(int scale, OrcProto.ColumnStatistics stats) {
+ super(stats);
+ this.scale = scale;
+ OrcProto.DecimalStatistics dec = stats.getDecimalStatistics();
+ if (dec.hasMaximum()) {
+ maximum = new HiveDecimalWritable(dec.getMaximum()).serialize64(scale);
+ }
+ if (dec.hasMinimum()) {
+ minimum = new HiveDecimalWritable(dec.getMinimum()).serialize64(scale);
+ }
+ if (dec.hasSum()) {
+ hasSum = true;
+ HiveDecimalWritable sumTmp = new HiveDecimalWritable(dec.getSum());
+ if (sumTmp.getHiveDecimal().integerDigitCount() + scale <=
+ TypeDescription.MAX_DECIMAL64_PRECISION) {
+ hasSum = true;
+ sum = sumTmp.serialize64(scale);
+ return;
+ }
+ }
+ hasSum = false;
+ }
+
+ @Override
+ public void reset() {
+ super.reset();
+ minimum = 0;
+ maximum = 0;
+ hasSum = true;
+ sum = 0;
+ }
+
+ @Override
+ public void updateDecimal(HiveDecimalWritable value) {
+ updateDecimal64(value.serialize64(scale), scale);
+ }
+
+ @Override
+ public void updateDecimal64(long value, int valueScale) {
+ // normalize the scale to our desired level
+ while (valueScale != scale) {
+ if (valueScale > scale) {
+ value /= 10;
+ valueScale -= 1;
+ } else {
+ value *= 10;
+ valueScale += 1;
+ }
+ }
+ if (value < TypeDescription.MIN_DECIMAL64 ||
+ value > TypeDescription.MAX_DECIMAL64) {
+ throw new IllegalArgumentException("Out of bounds decimal64 " + value);
+ }
+ if (getNumberOfValues() == 0) {
+ minimum = value;
+ maximum = value;
+ } else if (minimum > value) {
+ minimum = value;
+ } else if (maximum < value) {
+ maximum = value;
+ }
+ if (hasSum) {
+ sum += value;
+ hasSum = sum <= TypeDescription.MAX_DECIMAL64 &&
+ sum >= TypeDescription.MIN_DECIMAL64;
+ }
+ }
+
+ @Override
+ public void merge(ColumnStatisticsImpl other) {
+ if (other instanceof Decimal64StatisticsImpl) {
+ Decimal64StatisticsImpl dec = (Decimal64StatisticsImpl) other;
+ if (getNumberOfValues() == 0) {
+ minimum = dec.minimum;
+ maximum = dec.maximum;
+ sum = dec.sum;
+ } else {
+ if (minimum > dec.minimum) {
+ minimum = dec.minimum;
+ }
+ if (maximum < dec.maximum) {
+ maximum = dec.maximum;
+ }
+ if (hasSum && dec.hasSum) {
+ sum += dec.sum;
+ hasSum = sum <= TypeDescription.MAX_DECIMAL64 &&
+ sum >= TypeDescription.MIN_DECIMAL64;
+ } else {
+ hasSum = false;
+ }
+ }
+ } else {
+ if (getNumberOfValues() != 0) {
+ throw new IllegalArgumentException("Incompatible merging of decimal column statistics");
+ }
+ }
+ super.merge(other);
+ }
+
+ @Override
+ public OrcProto.ColumnStatistics.Builder serialize() {
+ OrcProto.ColumnStatistics.Builder result = super.serialize();
+ OrcProto.DecimalStatistics.Builder dec =
+ OrcProto.DecimalStatistics.newBuilder();
+ if (getNumberOfValues() != 0) {
+ scratch.setFromLongAndScale(minimum, scale);
+ dec.setMinimum(scratch.toString());
+ scratch.setFromLongAndScale(maximum, scale);
+ dec.setMaximum(scratch.toString());
+ }
+ // Check hasSum for overflow.
+ if (hasSum) {
+ scratch.setFromLongAndScale(sum, scale);
+ dec.setSum(scratch.toString());
+ }
+ result.setDecimalStatistics(dec);
+ return result;
+ }
+
+ @Override
+ public HiveDecimal getMinimum() {
+ if (getNumberOfValues() > 0) {
+ scratch.setFromLongAndScale(minimum, scale);
+ return scratch.getHiveDecimal();
+ }
+ return null;
+ }
+
+ @Override
+ public HiveDecimal getMaximum() {
+ if (getNumberOfValues() > 0) {
+ scratch.setFromLongAndScale(maximum, scale);
+ return scratch.getHiveDecimal();
+ }
+ return null;
+ }
+
+ @Override
+ public HiveDecimal getSum() {
+ if (hasSum) {
+ scratch.setFromLongAndScale(sum, scale);
+ return scratch.getHiveDecimal();
+ }
+ return null;
+ }
+
+ @Override
+ public String toString() {
+ StringBuilder buf = new StringBuilder(super.toString());
+ if (getNumberOfValues() != 0) {
+ buf.append(" min: ");
+ buf.append(minimum);
+ buf.append(" max: ");
+ buf.append(maximum);
+ if (hasSum) {
+ buf.append(" sum: ");
+ buf.append(sum);
+ }
+ }
+ return buf.toString();
+ }
+
+ @Override
+ public boolean equals(Object o) {
+ if (this == o) {
+ return true;
+ }
+ if (!(o instanceof Decimal64StatisticsImpl)) {
+ return false;
+ }
+ if (!super.equals(o)) {
+ return false;
+ }
+
+ Decimal64StatisticsImpl that = (Decimal64StatisticsImpl) o;
+
+ if (minimum != that.minimum ||
+ maximum != that.maximum ||
+ hasSum != that.hasSum) {
+ return false;
+ }
+ return !hasSum || (sum == that.sum);
+ }
+
+ @Override
+ public int hashCode() {
+ int result = super.hashCode();
+ boolean hasValues = getNumberOfValues() > 0;
+ result = 31 * result + (hasValues ? (int) minimum : 0);
+ result = 31 * result + (hasValues ? (int) maximum : 0);
+ result = 31 * result + (hasSum ? (int) sum : 0);
+ return result;
+ }
+ }
+
private static final class DateStatisticsImpl extends ColumnStatisticsImpl
implements DateColumnStatistics {
private Integer minimum = null;
@@ -1329,6 +1545,10 @@ public class ColumnStatisticsImpl implements ColumnStatistics {
throw new UnsupportedOperationException("Can't update decimal");
}
+ public void updateDecimal64(long value, int scale) {
+ throw new UnsupportedOperationException("Can't update decimal");
+ }
+
public void updateDate(DateWritable value) {
throw new UnsupportedOperationException("Can't update date");
}
@@ -1415,7 +1635,11 @@ public class ColumnStatisticsImpl implements ColumnStatistics {
case VARCHAR:
return new StringStatisticsImpl();
case DECIMAL:
- return new DecimalStatisticsImpl();
+ if (schema.getPrecision() <= TypeDescription.MAX_DECIMAL64_PRECISION) {
+ return new Decimal64StatisticsImpl(schema.getScale());
+ } else {
+ return new DecimalStatisticsImpl();
+ }
case DATE:
return new DateStatisticsImpl();
case TIMESTAMP:
@@ -1427,7 +1651,8 @@ public class ColumnStatisticsImpl implements ColumnStatistics {
}
}
- public static ColumnStatisticsImpl deserialize(OrcProto.ColumnStatistics stats) {
+ public static ColumnStatisticsImpl deserialize(TypeDescription schema,
+ OrcProto.ColumnStatistics stats) {
if (stats.hasBucketStatistics()) {
return new BooleanStatisticsImpl(stats);
} else if (stats.hasIntStatistics()) {
@@ -1437,7 +1662,12 @@ public class ColumnStatisticsImpl implements ColumnStatistics {
} else if (stats.hasStringStatistics()) {
return new StringStatisticsImpl(stats);
} else if (stats.hasDecimalStatistics()) {
- return new DecimalStatisticsImpl(stats);
+ if (schema != null &&
+ schema.getPrecision() <= TypeDescription.MAX_DECIMAL64_PRECISION) {
+ return new Decimal64StatisticsImpl(schema.getScale(), stats);
+ } else {
+ return new DecimalStatisticsImpl(stats);
+ }
} else if (stats.hasDateStatistics()) {
return new DateStatisticsImpl(stats);
} else if (stats.hasTimestampStatistics()) {
http://git-wip-us.apache.org/repos/asf/orc/blob/c4c7b28f/java/core/src/java/org/apache/orc/impl/ReaderImpl.java
----------------------------------------------------------------------
diff --git a/java/core/src/java/org/apache/orc/impl/ReaderImpl.java b/java/core/src/java/org/apache/orc/impl/ReaderImpl.java
index e8d6be9..feb8160 100644
--- a/java/core/src/java/org/apache/orc/impl/ReaderImpl.java
+++ b/java/core/src/java/org/apache/orc/impl/ReaderImpl.java
@@ -221,14 +221,15 @@ public class ReaderImpl implements Reader {
@Override
public ColumnStatistics[] getStatistics() {
- return deserializeStats(fileStats);
+ return deserializeStats(schema, fileStats);
}
public static ColumnStatistics[] deserializeStats(
+ TypeDescription schema,
List<OrcProto.ColumnStatistics> fileStats) {
ColumnStatistics[] result = new ColumnStatistics[fileStats.size()];
for(int i=0; i < result.length; ++i) {
- result[i] = ColumnStatisticsImpl.deserialize(fileStats.get(i));
+ result[i] = ColumnStatisticsImpl.deserialize(schema, fileStats.get(i));
}
return result;
}
http://git-wip-us.apache.org/repos/asf/orc/blob/c4c7b28f/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java
----------------------------------------------------------------------
diff --git a/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java b/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java
index 53cc761..abb487e 100644
--- a/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java
+++ b/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java
@@ -401,7 +401,7 @@ public class RecordReaderImpl implements RecordReader {
OrcProto.BloomFilter bloomFilter,
OrcFile.WriterVersion writerVersion,
TypeDescription.Category type) {
- ColumnStatistics cs = ColumnStatisticsImpl.deserialize(statsProto);
+ ColumnStatistics cs = ColumnStatisticsImpl.deserialize(null, statsProto);
Object minValue = getMin(cs);
Object maxValue = getMax(cs);
// files written before ORC-135 stores timestamp wrt to local timezone causing issues with PPD.
http://git-wip-us.apache.org/repos/asf/orc/blob/c4c7b28f/java/core/src/java/org/apache/orc/impl/WriterImpl.java
----------------------------------------------------------------------
diff --git a/java/core/src/java/org/apache/orc/impl/WriterImpl.java b/java/core/src/java/org/apache/orc/impl/WriterImpl.java
index c3656d4..7a4a0b9 100644
--- a/java/core/src/java/org/apache/orc/impl/WriterImpl.java
+++ b/java/core/src/java/org/apache/orc/impl/WriterImpl.java
@@ -651,7 +651,7 @@ public class WriterImpl implements WriterInternal, MemoryManager.Callback {
// add the column statistics
writeFileStatistics(builder, treeWriter);
- return ReaderImpl.deserializeStats(builder.getStatisticsList());
+ return ReaderImpl.deserializeStats(schema, builder.getStatisticsList());
}
public CompressionCodec getCompressionCodec() {
http://git-wip-us.apache.org/repos/asf/orc/blob/c4c7b28f/java/core/src/java/org/apache/orc/impl/writer/Decimal64TreeWriter.java
----------------------------------------------------------------------
diff --git a/java/core/src/java/org/apache/orc/impl/writer/Decimal64TreeWriter.java b/java/core/src/java/org/apache/orc/impl/writer/Decimal64TreeWriter.java
index b79e3b2..020d8ff 100644
--- a/java/core/src/java/org/apache/orc/impl/writer/Decimal64TreeWriter.java
+++ b/java/core/src/java/org/apache/orc/impl/writer/Decimal64TreeWriter.java
@@ -58,7 +58,7 @@ public class Decimal64TreeWriter extends TreeWriterBase {
if (vector.noNulls || !vector.isNull[0]) {
HiveDecimalWritable value = vector.vector[0];
long lg = value.serialize64(scale);
- indexStatistics.updateDecimal(value);
+ indexStatistics.updateDecimal64(lg, scale);
if (createBloomFilter) {
bloomFilterUtf8.addLong(lg);
}
@@ -72,7 +72,7 @@ public class Decimal64TreeWriter extends TreeWriterBase {
HiveDecimalWritable value = vector.vector[i + offset];
long lg = value.serialize64(scale);
valueWriter.write(lg);
- indexStatistics.updateDecimal(value);
+ indexStatistics.updateDecimal64(lg, scale);
if (createBloomFilter) {
bloomFilterUtf8.addLong(lg);
}
@@ -86,10 +86,8 @@ public class Decimal64TreeWriter extends TreeWriterBase {
assert(scale == vector.scale);
if (vector.isRepeating) {
if (vector.noNulls || !vector.isNull[0]) {
- HiveDecimalWritable value = vector.getScratchWritable();
long lg = vector.vector[0];
- value.setFromLongAndScale(lg, vector.scale);
- indexStatistics.updateDecimal(value);
+ indexStatistics.updateDecimal64(lg, scale);
if (createBloomFilter) {
bloomFilterUtf8.addLong(lg);
}
@@ -98,13 +96,11 @@ public class Decimal64TreeWriter extends TreeWriterBase {
}
}
} else {
- HiveDecimalWritable value = vector.getScratchWritable();
for (int i = 0; i < length; ++i) {
if (vector.noNulls || !vector.isNull[i + offset]) {
long lg = vector.vector[i + offset];
valueWriter.write(lg);
- value.setFromLongAndScale(lg, vector.scale);
- indexStatistics.updateDecimal(value);
+ indexStatistics.updateDecimal64(lg, scale);
if (createBloomFilter) {
bloomFilterUtf8.addLong(lg);
}
http://git-wip-us.apache.org/repos/asf/orc/blob/c4c7b28f/java/core/src/java/org/apache/orc/impl/writer/DecimalTreeWriter.java
----------------------------------------------------------------------
diff --git a/java/core/src/java/org/apache/orc/impl/writer/DecimalTreeWriter.java b/java/core/src/java/org/apache/orc/impl/writer/DecimalTreeWriter.java
index 9b2f2f0..822042e 100644
--- a/java/core/src/java/org/apache/orc/impl/writer/DecimalTreeWriter.java
+++ b/java/core/src/java/org/apache/orc/impl/writer/DecimalTreeWriter.java
@@ -113,10 +113,10 @@ public class DecimalTreeWriter extends TreeWriterBase {
int length) throws IOException {
if (vector.isRepeating) {
if (vector.noNulls || !vector.isNull[0]) {
- HiveDecimalWritable value = vector.getScratchWritable();
- value.setFromLongAndScale(vector.vector[0], vector.scale);
- indexStatistics.updateDecimal(value);
+ indexStatistics.updateDecimal64(vector.vector[0], vector.scale);
if (createBloomFilter) {
+ HiveDecimalWritable value = vector.getScratchWritable();
+ value.setFromLongAndScale(vector.vector[0], vector.scale);
String str = value.toString(scratchBuffer);
if (bloomFilter != null) {
bloomFilter.addString(str);
@@ -132,11 +132,12 @@ public class DecimalTreeWriter extends TreeWriterBase {
HiveDecimalWritable value = vector.getScratchWritable();
for (int i = 0; i < length; ++i) {
if (vector.noNulls || !vector.isNull[i + offset]) {
- utils.writeVslong(valueStream, vector.vector[i + offset]);
+ long num = vector.vector[i + offset];
+ utils.writeVslong(valueStream, num);
scaleStream.write(vector.scale);
- value.setFromLongAndScale(vector.vector[i + offset], vector.scale);
- indexStatistics.updateDecimal(value);
+ indexStatistics.updateDecimal64(num, vector.scale);
if (createBloomFilter) {
+ value.setFromLongAndScale(num, vector.scale);
String str = value.toString(scratchBuffer);
if (bloomFilter != null) {
bloomFilter.addString(str);
http://git-wip-us.apache.org/repos/asf/orc/blob/c4c7b28f/java/core/src/java/org/apache/orc/impl/writer/TreeWriterBase.java
----------------------------------------------------------------------
diff --git a/java/core/src/java/org/apache/orc/impl/writer/TreeWriterBase.java b/java/core/src/java/org/apache/orc/impl/writer/TreeWriterBase.java
index 74ef3cc..d6145cd 100644
--- a/java/core/src/java/org/apache/orc/impl/writer/TreeWriterBase.java
+++ b/java/core/src/java/org/apache/orc/impl/writer/TreeWriterBase.java
@@ -64,6 +64,7 @@ public abstract class TreeWriterBase implements TreeWriter {
private boolean foundNulls;
private OutStream isPresentOutStream;
private final WriterContext streamFactory;
+ private final TypeDescription schema;
/**
* Create a tree writer.
@@ -76,6 +77,7 @@ public abstract class TreeWriterBase implements TreeWriter {
TypeDescription schema,
WriterContext streamFactory,
boolean nullable) throws IOException {
+ this.schema = schema;
this.streamFactory = streamFactory;
this.isCompressed = streamFactory.isCompressed();
this.id = columnId;
@@ -340,7 +342,8 @@ public abstract class TreeWriterBase implements TreeWriter {
@Override
public void updateFileStatistics(OrcProto.StripeStatistics stats) {
- fileStatistics.merge(ColumnStatisticsImpl.deserialize(stats.getColStats(id)));
+ fileStatistics.merge(ColumnStatisticsImpl.deserialize(schema,
+ stats.getColStats(id)));
}
/**
http://git-wip-us.apache.org/repos/asf/orc/blob/c4c7b28f/java/core/src/java/org/apache/orc/impl/writer/WriterImplV2.java
----------------------------------------------------------------------
diff --git a/java/core/src/java/org/apache/orc/impl/writer/WriterImplV2.java b/java/core/src/java/org/apache/orc/impl/writer/WriterImplV2.java
index ab4fc58..668ecc2 100644
--- a/java/core/src/java/org/apache/orc/impl/writer/WriterImplV2.java
+++ b/java/core/src/java/org/apache/orc/impl/writer/WriterImplV2.java
@@ -593,7 +593,7 @@ public class WriterImplV2 implements WriterInternal, MemoryManager.Callback {
// add the column statistics
writeFileStatistics(builder, treeWriter);
- return ReaderImpl.deserializeStats(builder.getStatisticsList());
+ return ReaderImpl.deserializeStats(schema, builder.getStatisticsList());
}
public CompressionCodec getCompressionCodec() {
http://git-wip-us.apache.org/repos/asf/orc/blob/c4c7b28f/java/core/src/test/org/apache/orc/TestColumnStatistics.java
----------------------------------------------------------------------
diff --git a/java/core/src/test/org/apache/orc/TestColumnStatistics.java b/java/core/src/test/org/apache/orc/TestColumnStatistics.java
index bbc85ea..2045004 100644
--- a/java/core/src/test/org/apache/orc/TestColumnStatistics.java
+++ b/java/core/src/test/org/apache/orc/TestColumnStatistics.java
@@ -209,7 +209,7 @@ public class TestColumnStatistics {
// serialize and read back in with phoenix timezone
OrcProto.ColumnStatistics serial = stats2.serialize().build();
TimeZone.setDefault(TimeZone.getTimeZone("America/Phoenix"));
- ColumnStatisticsImpl stats3 = ColumnStatisticsImpl.deserialize(serial);
+ ColumnStatisticsImpl stats3 = ColumnStatisticsImpl.deserialize(schema, serial);
assertEquals("2000-10-29 01:30:00.0",
((TimestampColumnStatistics) stats3).getMinimum().toString());
assertEquals("2000-10-29 03:30:00.0",
http://git-wip-us.apache.org/repos/asf/orc/blob/c4c7b28f/java/core/src/test/org/apache/orc/TestVectorOrcFile.java
----------------------------------------------------------------------
diff --git a/java/core/src/test/org/apache/orc/TestVectorOrcFile.java b/java/core/src/test/org/apache/orc/TestVectorOrcFile.java
index 424eb96..658c1ce 100644
--- a/java/core/src/test/org/apache/orc/TestVectorOrcFile.java
+++ b/java/core/src/test/org/apache/orc/TestVectorOrcFile.java
@@ -3615,9 +3615,6 @@ public class TestVectorOrcFile {
LongColumnVector ints2 = (LongColumnVector) struct1.fields[0];
BytesColumnVector strs = (BytesColumnVector) struct1.fields[1];
- System.out.println("------------------------------------------------------------------------------------------------------------------------");
- System.out.println(rows.getRowNumber());
-
Assert.assertEquals(1000L, rows.getRowNumber());
Assert.assertEquals(true, rows.nextBatch(batch));
assertEquals(1000, batch.size);
http://git-wip-us.apache.org/repos/asf/orc/blob/c4c7b28f/java/core/src/test/org/apache/orc/impl/TestColumnStatisticsImpl.java
----------------------------------------------------------------------
diff --git a/java/core/src/test/org/apache/orc/impl/TestColumnStatisticsImpl.java b/java/core/src/test/org/apache/orc/impl/TestColumnStatisticsImpl.java
index e53cdda..64d4626 100644
--- a/java/core/src/test/org/apache/orc/impl/TestColumnStatisticsImpl.java
+++ b/java/core/src/test/org/apache/orc/impl/TestColumnStatisticsImpl.java
@@ -22,6 +22,7 @@ import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.serde2.io.DateWritable;
import org.apache.orc.ColumnStatistics;
+import org.apache.orc.DecimalColumnStatistics;
import org.apache.orc.OrcFile;
import org.apache.orc.OrcProto;
import org.apache.orc.Reader;
@@ -85,4 +86,66 @@ public class TestColumnStatisticsImpl {
assertEquals("2037-01-01 00:00:00.0", stats.getMaximum().toString());
TimeZone.setDefault(original);
}
+
+ @Test
+ public void testDecimal64Overflow() throws IOException {
+ TypeDescription schema = TypeDescription.fromString("decimal(18,6)");
+ OrcProto.ColumnStatistics.Builder pb =
+ OrcProto.ColumnStatistics.newBuilder();
+ OrcProto.DecimalStatistics.Builder decimalBuilder =
+ OrcProto.DecimalStatistics.newBuilder();
+ decimalBuilder.setMaximum("1000.0");
+ decimalBuilder.setMinimum("1.010");
+ decimalBuilder.setSum("123456789.123456");
+ pb.setDecimalStatistics(decimalBuilder);
+ pb.setHasNull(false);
+ pb.setNumberOfValues(3);
+
+ // the base case doesn't overflow
+ DecimalColumnStatistics stats1 = (DecimalColumnStatistics)
+ ColumnStatisticsImpl.deserialize(schema, pb.build());
+ ColumnStatisticsImpl updateStats1 = (ColumnStatisticsImpl) stats1;
+ assertEquals("1.01", stats1.getMinimum().toString());
+ assertEquals("1000", stats1.getMaximum().toString());
+ assertEquals("123456789.123456", stats1.getSum().toString());
+ assertEquals(3, stats1.getNumberOfValues());
+
+ // Now set the sum to something that overflows Decimal64.
+ decimalBuilder.setSum("1234567890123.45");
+ pb.setDecimalStatistics(decimalBuilder);
+ DecimalColumnStatistics stats2 = (DecimalColumnStatistics)
+ ColumnStatisticsImpl.deserialize(schema, pb.build());
+ assertEquals(null, stats2.getSum());
+
+ // merge them together
+ updateStats1.merge((ColumnStatisticsImpl) stats2);
+ assertEquals(null, stats1.getSum());
+
+ updateStats1.reset();
+ assertEquals("0", stats1.getSum().toString());
+ updateStats1.increment();
+ updateStats1.updateDecimal64(10000, 6);
+ assertEquals("0.01", stats1.getSum().toString());
+ updateStats1.updateDecimal64(1, 4);
+ assertEquals("0.0101", stats1.getSum().toString());
+ updateStats1.updateDecimal64(TypeDescription.MAX_DECIMAL64, 6);
+ assertEquals(null, stats1.getSum());
+ updateStats1.reset();
+ updateStats1.updateDecimal64(TypeDescription.MAX_DECIMAL64, 6);
+ assertEquals("999999999999.999999", stats1.getSum().toString());
+ updateStats1.updateDecimal64(1, 6);
+ assertEquals(null, stats1.getSum());
+
+ updateStats1.reset();
+ ColumnStatisticsImpl updateStats2 = (ColumnStatisticsImpl) stats2;
+ updateStats2.reset();
+ updateStats1.increment();
+ updateStats2.increment();
+ updateStats1.updateDecimal64(TypeDescription.MAX_DECIMAL64, 6);
+ updateStats2.updateDecimal64(TypeDescription.MAX_DECIMAL64, 6);
+ assertEquals("999999999999.999999", stats1.getSum().toString());
+ assertEquals("999999999999.999999", stats2.getSum().toString());
+ updateStats1.merge(updateStats2);
+ assertEquals(null, stats1.getSum());
+ }
}
http://git-wip-us.apache.org/repos/asf/orc/blob/c4c7b28f/java/core/src/test/org/apache/orc/impl/TestRecordReaderImpl.java
----------------------------------------------------------------------
diff --git a/java/core/src/test/org/apache/orc/impl/TestRecordReaderImpl.java b/java/core/src/test/org/apache/orc/impl/TestRecordReaderImpl.java
index bc85724..ae8bab5 100644
--- a/java/core/src/test/org/apache/orc/impl/TestRecordReaderImpl.java
+++ b/java/core/src/test/org/apache/orc/impl/TestRecordReaderImpl.java
@@ -275,23 +275,26 @@ public class TestRecordReaderImpl {
@Test
public void testGetMin() throws Exception {
assertEquals(10L, RecordReaderImpl.getMin(
- ColumnStatisticsImpl.deserialize(createIntStats(10L, 100L))));
+ ColumnStatisticsImpl.deserialize(null, createIntStats(10L, 100L))));
assertEquals(10.0d, RecordReaderImpl.getMin(ColumnStatisticsImpl.deserialize(
+ null,
OrcProto.ColumnStatistics.newBuilder()
.setDoubleStatistics(OrcProto.DoubleStatistics.newBuilder()
.setMinimum(10.0d).setMaximum(100.0d).build()).build())));
assertEquals(null, RecordReaderImpl.getMin(ColumnStatisticsImpl.deserialize(
+ null,
OrcProto.ColumnStatistics.newBuilder()
.setStringStatistics(OrcProto.StringStatistics.newBuilder().build())
.build())));
assertEquals("a", RecordReaderImpl.getMin(ColumnStatisticsImpl.deserialize(
+ null,
OrcProto.ColumnStatistics.newBuilder()
.setStringStatistics(OrcProto.StringStatistics.newBuilder()
.setMinimum("a").setMaximum("b").build()).build())));
assertEquals("hello", RecordReaderImpl.getMin(ColumnStatisticsImpl
- .deserialize(createStringStats("hello", "world"))));
+ .deserialize(null, createStringStats("hello", "world"))));
assertEquals(HiveDecimal.create("111.1"), RecordReaderImpl.getMin(ColumnStatisticsImpl
- .deserialize(createDecimalStats("111.1", "112.1"))));
+ .deserialize(null, createDecimalStats("111.1", "112.1"))));
}
private static OrcProto.ColumnStatistics createIntStats(Long min,
@@ -379,23 +382,27 @@ public class TestRecordReaderImpl {
@Test
public void testGetMax() throws Exception {
- assertEquals(100L, RecordReaderImpl.getMax(ColumnStatisticsImpl.deserialize(createIntStats(10L, 100L))));
+ assertEquals(100L, RecordReaderImpl.getMax(ColumnStatisticsImpl.deserialize(
+ null, createIntStats(10L, 100L))));
assertEquals(100.0d, RecordReaderImpl.getMax(ColumnStatisticsImpl.deserialize(
+ null,
OrcProto.ColumnStatistics.newBuilder()
.setDoubleStatistics(OrcProto.DoubleStatistics.newBuilder()
.setMinimum(10.0d).setMaximum(100.0d).build()).build())));
assertEquals(null, RecordReaderImpl.getMax(ColumnStatisticsImpl.deserialize(
+ null,
OrcProto.ColumnStatistics.newBuilder()
.setStringStatistics(OrcProto.StringStatistics.newBuilder().build())
.build())));
assertEquals("b", RecordReaderImpl.getMax(ColumnStatisticsImpl.deserialize(
+ null,
OrcProto.ColumnStatistics.newBuilder()
.setStringStatistics(OrcProto.StringStatistics.newBuilder()
.setMinimum("a").setMaximum("b").build()).build())));
assertEquals("world", RecordReaderImpl.getMax(ColumnStatisticsImpl
- .deserialize(createStringStats("hello", "world"))));
+ .deserialize(null, createStringStats("hello", "world"))));
assertEquals(HiveDecimal.create("112.1"), RecordReaderImpl.getMax(ColumnStatisticsImpl
- .deserialize(createDecimalStats("111.1", "112.1"))));
+ .deserialize(null, createDecimalStats("111.1", "112.1"))));
}
static TruthValue evaluateBoolean(OrcProto.ColumnStatistics stats,
@@ -1471,7 +1478,7 @@ public class TestRecordReaderImpl {
for (int i = 20; i < 1000; i++) {
bf.addLong(i);
}
- ColumnStatistics cs = ColumnStatisticsImpl.deserialize(createIntStats(10, 100));
+ ColumnStatistics cs = ColumnStatisticsImpl.deserialize(null, createIntStats(10, 100));
assertEquals(TruthValue.NO, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
bf.addLong(15);
@@ -1486,7 +1493,7 @@ public class TestRecordReaderImpl {
for (int i = 20; i < 1000; i++) {
bf.addLong(i);
}
- ColumnStatistics cs = ColumnStatisticsImpl.deserialize(createIntStats(10, 100));
+ ColumnStatistics cs = ColumnStatisticsImpl.deserialize(null, createIntStats(10, 100));
assertEquals(TruthValue.NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
bf.addLong(15);
@@ -1505,7 +1512,7 @@ public class TestRecordReaderImpl {
for (int i = 20; i < 1000; i++) {
bf.addLong(i);
}
- ColumnStatistics cs = ColumnStatisticsImpl.deserialize(createIntStats(10, 100));
+ ColumnStatistics cs = ColumnStatisticsImpl.deserialize(null, createIntStats(10, 100));
assertEquals(TruthValue.NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
bf.addLong(19);
@@ -1523,7 +1530,7 @@ public class TestRecordReaderImpl {
for (int i = 20; i < 1000; i++) {
bf.addDouble(i);
}
- ColumnStatistics cs = ColumnStatisticsImpl.deserialize(createDoubleStats(10.0, 100.0));
+ ColumnStatistics cs = ColumnStatisticsImpl.deserialize(null, createDoubleStats(10.0, 100.0));
assertEquals(TruthValue.NO, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
bf.addDouble(15.0);
@@ -1538,7 +1545,7 @@ public class TestRecordReaderImpl {
for (int i = 20; i < 1000; i++) {
bf.addDouble(i);
}
- ColumnStatistics cs = ColumnStatisticsImpl.deserialize(createDoubleStats(10.0, 100.0));
+ ColumnStatistics cs = ColumnStatisticsImpl.deserialize(null, createDoubleStats(10.0, 100.0));
assertEquals(TruthValue.NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
bf.addDouble(15.0);
@@ -1557,7 +1564,7 @@ public class TestRecordReaderImpl {
for (int i = 20; i < 1000; i++) {
bf.addDouble(i);
}
- ColumnStatistics cs = ColumnStatisticsImpl.deserialize(createDoubleStats(10.0, 100.0));
+ ColumnStatistics cs = ColumnStatisticsImpl.deserialize(null, createDoubleStats(10.0, 100.0));
assertEquals(TruthValue.NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
bf.addDouble(19.0);
@@ -1575,7 +1582,7 @@ public class TestRecordReaderImpl {
for (int i = 20; i < 1000; i++) {
bf.addString("str_" + i);
}
- ColumnStatistics cs = ColumnStatisticsImpl.deserialize(createStringStats("str_10", "str_200"));
+ ColumnStatistics cs = ColumnStatisticsImpl.deserialize(null, createStringStats("str_10", "str_200"));
assertEquals(TruthValue.NO, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
bf.addString("str_15");
@@ -1590,7 +1597,7 @@ public class TestRecordReaderImpl {
for (int i = 20; i < 1000; i++) {
bf.addString("str_" + i);
}
- ColumnStatistics cs = ColumnStatisticsImpl.deserialize(createStringStats("str_10", "str_200"));
+ ColumnStatistics cs = ColumnStatisticsImpl.deserialize(null, createStringStats("str_10", "str_200"));
assertEquals(TruthValue.NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
bf.addString("str_15");
@@ -1609,7 +1616,7 @@ public class TestRecordReaderImpl {
for (int i = 20; i < 1000; i++) {
bf.addString("str_" + i);
}
- ColumnStatistics cs = ColumnStatisticsImpl.deserialize(createStringStats("str_10", "str_200"));
+ ColumnStatistics cs = ColumnStatisticsImpl.deserialize(null, createStringStats("str_10", "str_200"));
assertEquals(TruthValue.NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
bf.addString("str_19");
@@ -1628,7 +1635,7 @@ public class TestRecordReaderImpl {
for (int i = 20; i < 1000; i++) {
bf.addLong((new DateWritable(i)).getDays());
}
- ColumnStatistics cs = ColumnStatisticsImpl.deserialize(createDateStats(10, 100));
+ ColumnStatistics cs = ColumnStatisticsImpl.deserialize(null, createDateStats(10, 100));
assertEquals(TruthValue.NO, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
bf.addLong((new DateWritable(15)).getDays());
@@ -1644,7 +1651,7 @@ public class TestRecordReaderImpl {
for (int i = 20; i < 1000; i++) {
bf.addLong((new DateWritable(i)).getDays());
}
- ColumnStatistics cs = ColumnStatisticsImpl.deserialize(createDateStats(10, 100));
+ ColumnStatistics cs = ColumnStatisticsImpl.deserialize(null, createDateStats(10, 100));
assertEquals(TruthValue.NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
bf.addLong((new DateWritable(15)).getDays());
@@ -1663,7 +1670,7 @@ public class TestRecordReaderImpl {
for (int i = 20; i < 1000; i++) {
bf.addLong((new DateWritable(i)).getDays());
}
- ColumnStatistics cs = ColumnStatisticsImpl.deserialize(createDateStats(10, 100));
+ ColumnStatistics cs = ColumnStatisticsImpl.deserialize(null, createDateStats(10, 100));
assertEquals(TruthValue.NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
bf.addLong((new DateWritable(19)).getDays());
@@ -1683,7 +1690,7 @@ public class TestRecordReaderImpl {
for (int i = 20; i < 1000; i++) {
bf.addString(HiveDecimal.create(i).toString());
}
- ColumnStatistics cs = ColumnStatisticsImpl.deserialize(createDecimalStats("10", "200"));
+ ColumnStatistics cs = ColumnStatisticsImpl.deserialize(null, createDecimalStats("10", "200"));
assertEquals(TruthValue.NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
bf.addString(HiveDecimal.create(15).toString());
@@ -1702,7 +1709,7 @@ public class TestRecordReaderImpl {
for (int i = 20; i < 1000; i++) {
bf.addString(HiveDecimal.create(i).toString());
}
- ColumnStatistics cs = ColumnStatisticsImpl.deserialize(createDecimalStats("10", "200"));
+ ColumnStatistics cs = ColumnStatisticsImpl.deserialize(null, createDecimalStats("10", "200"));
assertEquals(TruthValue.NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
bf.addString(HiveDecimal.create(19).toString());
@@ -1725,11 +1732,11 @@ public class TestRecordReaderImpl {
for (int i = 20; i < 1000; i++) {
bf.addString(HiveDecimal.create(i).toString());
}
- ColumnStatistics cs = ColumnStatisticsImpl.deserialize(createDecimalStats("10", "200", false));
+ ColumnStatistics cs = ColumnStatisticsImpl.deserialize(null, createDecimalStats("10", "200", false));
// hasNull is false, so bloom filter should return NO
assertEquals(TruthValue.NO, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
- cs = ColumnStatisticsImpl.deserialize(createDecimalStats("10", "200", true));
+ cs = ColumnStatisticsImpl.deserialize(null, createDecimalStats("10", "200", true));
// hasNull is true, so bloom filter should return YES_NO_NULL
assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
http://git-wip-us.apache.org/repos/asf/orc/blob/c4c7b28f/java/tools/src/java/org/apache/orc/tools/FileDump.java
----------------------------------------------------------------------
diff --git a/java/tools/src/java/org/apache/orc/tools/FileDump.java b/java/tools/src/java/org/apache/orc/tools/FileDump.java
index dc89a31..3eae30d 100644
--- a/java/tools/src/java/org/apache/orc/tools/FileDump.java
+++ b/java/tools/src/java/org/apache/orc/tools/FileDump.java
@@ -302,7 +302,7 @@ public final class FileDump {
if (reader == null) {
return;
}
-
+ TypeDescription schema = reader.getSchema();
System.out.println("Structure for " + filename);
System.out.println("File Version: " + reader.getFileVersion().getName() +
" with " + reader.getWriterVersion());
@@ -384,7 +384,8 @@ public final class FileDump {
.readRowIndex(stripeIx, null, null, null, sargColumns);
for (int col : rowIndexCols) {
StringBuilder buf = new StringBuilder();
- String rowIdxString = getFormattedRowIndices(col, indices.getRowGroupIndex());
+ String rowIdxString = getFormattedRowIndices(col,
+ indices.getRowGroupIndex(), schema);
buf.append(rowIdxString);
String bloomFilString = getFormattedBloomFilters(col, indices,
reader.getWriterVersion(),
@@ -662,7 +663,8 @@ public final class FileDump {
}
private static String getFormattedRowIndices(int col,
- OrcProto.RowIndex[] rowGroupIndex) {
+ OrcProto.RowIndex[] rowGroupIndex,
+ TypeDescription schema) {
StringBuilder buf = new StringBuilder();
OrcProto.RowIndex index;
buf.append(" Row group indices for column ").append(col).append(":");
@@ -672,6 +674,7 @@ public final class FileDump {
return buf.toString();
}
+ TypeDescription colSchema = schema.findSubtype(col);
for (int entryIx = 0; entryIx < index.getEntryCount(); ++entryIx) {
buf.append("\n Entry ").append(entryIx).append(": ");
OrcProto.RowIndexEntry entry = index.getEntry(entryIx);
@@ -683,7 +686,8 @@ public final class FileDump {
if (colStats == null) {
buf.append("no stats at ");
} else {
- ColumnStatistics cs = ColumnStatisticsImpl.deserialize(colStats);
+ ColumnStatistics cs =
+ ColumnStatisticsImpl.deserialize(colSchema, colStats);
buf.append(cs.toString());
}
buf.append(" positions: ");
http://git-wip-us.apache.org/repos/asf/orc/blob/c4c7b28f/java/tools/src/java/org/apache/orc/tools/JsonFileDump.java
----------------------------------------------------------------------
diff --git a/java/tools/src/java/org/apache/orc/tools/JsonFileDump.java b/java/tools/src/java/org/apache/orc/tools/JsonFileDump.java
index e5f3b94..c02ff20 100644
--- a/java/tools/src/java/org/apache/orc/tools/JsonFileDump.java
+++ b/java/tools/src/java/org/apache/orc/tools/JsonFileDump.java
@@ -190,7 +190,8 @@ public class JsonFileDump {
for (int col : rowIndexCols) {
writer.object();
writer.key("columnId").value(col);
- writeRowGroupIndexes(writer, col, indices.getRowGroupIndex());
+ writeRowGroupIndexes(writer, col, indices.getRowGroupIndex(),
+ reader.getSchema());
writeBloomFilterIndexes(writer, col, indices,
reader.getWriterVersion(),
reader.getSchema().findSubtype(col).getCategory(),
@@ -398,7 +399,7 @@ public class JsonFileDump {
}
private static void writeRowGroupIndexes(JSONWriter writer, int col,
- OrcProto.RowIndex[] rowGroupIndex)
+ OrcProto.RowIndex[] rowGroupIndex, TypeDescription schema)
throws JSONException {
OrcProto.RowIndex index;
@@ -416,7 +417,8 @@ public class JsonFileDump {
continue;
}
OrcProto.ColumnStatistics colStats = entry.getStatistics();
- writeColumnStatistics(writer, ColumnStatisticsImpl.deserialize(colStats));
+ writeColumnStatistics(writer, ColumnStatisticsImpl.deserialize(
+ schema.findSubtype(col), colStats));
writer.key("positions").array();
for (int posIx = 0; posIx < entry.getPositionsCount(); ++posIx) {
writer.value(entry.getPositions(posIx));