You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@orc.apache.org by om...@apache.org on 2019/12/04 00:12:26 UTC
[orc] branch branch-1.5 updated: ORC-27: Add support for proleptic
Gregorian calendar for better support of dates before 1600AD.
This is an automated email from the ASF dual-hosted git repository.
omalley pushed a commit to branch branch-1.5
in repository https://gitbox.apache.org/repos/asf/orc.git
The following commit(s) were added to refs/heads/branch-1.5 by this push:
new 3b39b27 ORC-27: Add support for proleptic Gregorian calendar for better support of dates before 1600AD.
3b39b27 is described below
commit 3b39b2708ecc2dc7b8ebf264a9edede8f7b70d57
Author: Owen O'Malley <om...@apache.org>
AuthorDate: Tue Nov 26 09:57:16 2019 -0800
ORC-27: Add support for proleptic Gregorian calendar for better support of
dates before 1600AD.
Signed-off-by: Owen O'Malley <om...@apache.org>
---
java/core/src/java/org/apache/orc/OrcConf.java | 11 +-
java/core/src/java/org/apache/orc/OrcFile.java | 43 ++++-
java/core/src/java/org/apache/orc/Reader.java | 5 +
.../src/java/org/apache/orc/StripeStatistics.java | 14 +-
.../src/java/org/apache/orc/TypeDescription.java | 4 +-
.../org/apache/orc/impl/ColumnStatisticsImpl.java | 52 ++++--
.../src/java/org/apache/orc/impl/DateUtils.java | 182 +++++++++++++++++++++
.../src/java/org/apache/orc/impl/ReaderImpl.java | 15 +-
.../java/org/apache/orc/impl/RecordReaderImpl.java | 16 +-
.../org/apache/orc/impl/TreeReaderFactory.java | 47 ++++++
.../src/java/org/apache/orc/impl/WriterImpl.java | 42 ++++-
.../org/apache/orc/impl/writer/DateTreeWriter.java | 9 +
.../orc/impl/writer/TimestampTreeWriter.java | 3 +
.../org/apache/orc/impl/writer/WriterContext.java | 6 +
.../org/apache/orc/TestProlepticConversions.java | 177 ++++++++++++++++++++
.../test/org/apache/orc/TestStringDictionary.java | 5 +
.../org/apache/orc/impl/TestRecordReaderImpl.java | 48 +++++-
java/pom.xml | 2 +-
.../src/java/org/apache/orc/tools/FileDump.java | 12 +-
.../java/org/apache/orc/tools/JsonFileDump.java | 20 +--
.../test/resources/orc-file-dump-bloomfilter.out | 1 +
.../test/resources/orc-file-dump-bloomfilter2.out | 1 +
.../orc-file-dump-dictionary-threshold.out | 1 +
java/tools/src/test/resources/orc-file-dump.json | 1 +
java/tools/src/test/resources/orc-file-dump.out | 1 +
.../tools/src/test/resources/orc-file-has-null.out | 1 +
proto/orc_proto.proto | 11 ++
27 files changed, 688 insertions(+), 42 deletions(-)
diff --git a/java/core/src/java/org/apache/orc/OrcConf.java b/java/core/src/java/org/apache/orc/OrcConf.java
index a6fbad1..6586937 100644
--- a/java/core/src/java/org/apache/orc/OrcConf.java
+++ b/java/core/src/java/org/apache/orc/OrcConf.java
@@ -162,7 +162,16 @@ public enum OrcConf {
"Comma-separated list of columns for which dictionary encoding is to be skipped."),
// some JVM doesn't allow array creation of size Integer.MAX_VALUE, so chunk size is slightly less than max int
ORC_MAX_DISK_RANGE_CHUNK_LIMIT("orc.max.disk.range.chunk.limit", "hive.exec.orc.max.disk.range.chunk.limit",
- Integer.MAX_VALUE - 1024, "When reading stripes >2GB, specify max limit for the chunk size.")
+ Integer.MAX_VALUE - 1024, "When reading stripes >2GB, specify max limit for the chunk size."),
+ PROLEPTIC_GREGORIAN("orc.proleptic.gregorian", "orc.proleptic.gregorian", false,
+ "Should we read and write dates & times using the proleptic Gregorian calendar\n" +
+ "instead of the hybrid Julian Gregorian? Hive before 3.1 and Spark before 3.0\n" +
+ "used hybrid."),
+ PROLEPTIC_GREGORIAN_DEFAULT("orc.proleptic.gregorian.default",
+ "orc.proleptic.gregorian.default", false,
+ "This value controls whether pre-ORC 27 files are using the hybrid or proleptic\n" +
+ "calendar. Only Hive 3.1 and the C++ library wrote using the proleptic, so hybrid\n" +
+ "is the default.")
;
private final String attribute;
diff --git a/java/core/src/java/org/apache/orc/OrcFile.java b/java/core/src/java/org/apache/orc/OrcFile.java
index 62e6260..5b15be2 100644
--- a/java/core/src/java/org/apache/orc/OrcFile.java
+++ b/java/core/src/java/org/apache/orc/OrcFile.java
@@ -278,9 +278,11 @@ public class OrcFile {
// For now keeping this around to avoid complex surgery
private FileMetadata fileMetadata;
private boolean useUTCTimestamp;
+ private boolean useProlepticGregorian;
public ReaderOptions(Configuration conf) {
this.conf = conf;
+ this.useProlepticGregorian = OrcConf.PROLEPTIC_GREGORIAN.getBoolean(conf);
}
public ReaderOptions filesystem(FileSystem fs) {
@@ -298,6 +300,17 @@ public class OrcFile {
return this;
}
+ /**
+ * Should the reader convert dates and times to the proleptic Gregorian
+ * calendar?
+ * @param newValue should it use the proleptic Gregorian calendar?
+ * @return this
+ */
+ public ReaderOptions convertToProlepticGregorian(boolean newValue) {
+ this.useProlepticGregorian = newValue;
+ return this;
+ }
+
public Configuration getConfiguration() {
return conf;
}
@@ -332,6 +345,9 @@ public class OrcFile {
return useUTCTimestamp;
}
+ public boolean getConvertToProlepticGregorian() {
+ return useProlepticGregorian;
+ }
}
public static ReaderOptions readerOptions(Configuration conf) {
@@ -409,6 +425,7 @@ public class OrcFile {
private boolean writeVariableLengthBlocks;
private HadoopShims shims;
private String directEncodingColumns;
+ private boolean useProlepticGregorian;
protected WriterOptions(Properties tableProperties, Configuration conf) {
configuration = conf;
@@ -453,6 +470,7 @@ public class OrcFile {
OrcConf.WRITE_VARIABLE_LENGTH_BLOCKS.getBoolean(tableProperties,conf);
directEncodingColumns = OrcConf.DIRECT_ENCODING_COLUMNS.getString(
tableProperties, conf);
+ useProlepticGregorian = OrcConf.PROLEPTIC_GREGORIAN.getBoolean(conf);
}
/**
@@ -701,6 +719,17 @@ public class OrcFile {
return this;
}
+ /**
+ * Should the writer use the proleptic Gregorian calendar for
+ * times and dates?
+ * @param newValue true if we should use the proleptic calendar
+ * @return this
+ */
+ public WriterOptions setProlepticGregorian(boolean newValue) {
+ this.useProlepticGregorian = newValue;
+ return this;
+ }
+
public boolean getBlockPadding() {
return blockPaddingValue;
}
@@ -804,6 +833,10 @@ public class OrcFile {
public String getDirectEncodingColumns() {
return directEncodingColumns;
}
+
+ public boolean getProlepticGregorian() {
+ return useProlepticGregorian;
+ }
}
/**
@@ -898,6 +931,7 @@ public class OrcFile {
int rowIndexStride,
CompressionKind compression,
Map<String, ByteBuffer> userMetadata,
+ boolean writerUsedProlepticGregorian,
Path path,
Reader reader) {
// now we have to check compatibility
@@ -937,6 +971,10 @@ public class OrcFile {
}
}
}
+ if (writerUsedProlepticGregorian != reader.writerUsedProlepticGregorian()) {
+ LOG.info("Can't merge {} because it uses a different calendar", path);
+ return false;
+ }
return true;
}
@@ -978,6 +1016,7 @@ public class OrcFile {
int rowIndexStride = 0;
List<Path> result = new ArrayList<>(inputFiles.size());
Map<String, ByteBuffer> userMetadata = new HashMap<>();
+ boolean writerUsedProlepticGregorian = false;
for (Path input : inputFiles) {
FileSystem fs = input.getFileSystem(conf);
@@ -994,6 +1033,7 @@ public class OrcFile {
rowIndexStride = reader.getRowIndexStride();
fileVersion = reader.getFileVersion();
writerVersion = reader.getWriterVersion();
+ writerUsedProlepticGregorian = reader.writerUsedProlepticGregorian();
options.bufferSize(bufferSize)
.version(fileVersion)
.writerVersion(writerVersion)
@@ -1006,7 +1046,8 @@ public class OrcFile {
mergeMetadata(userMetadata, reader);
output = createWriter(outputPath, options);
} else if (!readerIsCompatible(schema, fileVersion, writerVersion,
- rowIndexStride, compression, userMetadata, input, reader)) {
+ rowIndexStride, compression, userMetadata,
+ writerUsedProlepticGregorian, input, reader)) {
continue;
} else {
mergeMetadata(userMetadata, reader);
diff --git a/java/core/src/java/org/apache/orc/Reader.java b/java/core/src/java/org/apache/orc/Reader.java
index 6d6e04b..3c7d005 100644
--- a/java/core/src/java/org/apache/orc/Reader.java
+++ b/java/core/src/java/org/apache/orc/Reader.java
@@ -456,4 +456,9 @@ public interface Reader extends Closeable {
* @return Serialized file metadata read from disk for the purposes of caching, etc.
*/
ByteBuffer getSerializedFileFooter();
+
+ /**
+ * Was the file written using the proleptic Gregorian calendar?
+ */
+ boolean writerUsedProlepticGregorian();
}
diff --git a/java/core/src/java/org/apache/orc/StripeStatistics.java b/java/core/src/java/org/apache/orc/StripeStatistics.java
index d1738ff..6fa5764 100644
--- a/java/core/src/java/org/apache/orc/StripeStatistics.java
+++ b/java/core/src/java/org/apache/orc/StripeStatistics.java
@@ -19,14 +19,22 @@
package org.apache.orc;
import org.apache.orc.impl.ColumnStatisticsImpl;
+import org.apache.orc.impl.ReaderImpl;
import java.util.List;
public class StripeStatistics {
private final List<OrcProto.ColumnStatistics> cs;
+ private final ReaderImpl reader;
public StripeStatistics(List<OrcProto.ColumnStatistics> list) {
+ this(list, null);
+ }
+
+ public StripeStatistics(List<OrcProto.ColumnStatistics> list,
+ ReaderImpl reader) {
this.cs = list;
+ this.reader = reader;
}
/**
@@ -37,7 +45,11 @@ public class StripeStatistics {
public ColumnStatistics[] getColumnStatistics() {
ColumnStatistics[] result = new ColumnStatistics[cs.size()];
for (int i = 0; i < result.length; ++i) {
- result[i] = ColumnStatisticsImpl.deserialize(null, cs.get(i));
+ if (reader == null) {
+ result[i] = ColumnStatisticsImpl.deserialize(null, cs.get(i));
+ } else {
+ result[i] = ColumnStatisticsImpl.deserialize(null, cs.get(i), reader);
+ }
}
return result;
}
diff --git a/java/core/src/java/org/apache/orc/TypeDescription.java b/java/core/src/java/org/apache/orc/TypeDescription.java
index 8372207..1e6e056 100644
--- a/java/core/src/java/org/apache/orc/TypeDescription.java
+++ b/java/core/src/java/org/apache/orc/TypeDescription.java
@@ -20,6 +20,7 @@ package org.apache.orc;
import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.DateColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.Decimal64ColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
@@ -634,8 +635,9 @@ public class TypeDescription
case SHORT:
case INT:
case LONG:
- case DATE:
return new LongColumnVector(maxSize);
+ case DATE:
+ return new DateColumnVector(maxSize);
case TIMESTAMP:
return new TimestampColumnVector(maxSize);
case FLOAT:
diff --git a/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java b/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java
index fc77d76..80f69e5 100644
--- a/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java
+++ b/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java
@@ -1174,15 +1174,19 @@ public class ColumnStatisticsImpl implements ColumnStatistics {
DateStatisticsImpl() {
}
- DateStatisticsImpl(OrcProto.ColumnStatistics stats) {
+ DateStatisticsImpl(OrcProto.ColumnStatistics stats,
+ boolean writerUsedProlepticGregorian,
+ boolean convertToProlepticGregorian) {
super(stats);
OrcProto.DateStatistics dateStats = stats.getDateStatistics();
// min,max values serialized/deserialized as int (days since epoch)
if (dateStats.hasMaximum()) {
- maximum = dateStats.getMaximum();
+ maximum = DateUtils.convertDate(dateStats.getMaximum(),
+ writerUsedProlepticGregorian, convertToProlepticGregorian);
}
if (dateStats.hasMinimum()) {
- minimum = dateStats.getMinimum();
+ minimum = DateUtils.convertDate(dateStats.getMinimum(),
+ writerUsedProlepticGregorian, convertToProlepticGregorian);
}
}
@@ -1335,23 +1339,31 @@ public class ColumnStatisticsImpl implements ColumnStatistics {
TimestampStatisticsImpl() {
}
- TimestampStatisticsImpl(OrcProto.ColumnStatistics stats) {
+ TimestampStatisticsImpl(OrcProto.ColumnStatistics stats,
+ boolean writerUsedProlepticGregorian,
+ boolean convertToProlepticGregorian) {
super(stats);
OrcProto.TimestampStatistics timestampStats = stats.getTimestampStatistics();
// min,max values serialized/deserialized as int (milliseconds since epoch)
if (timestampStats.hasMaximum()) {
- maximum = SerializationUtils.convertToUtc(TimeZone.getDefault(),
- timestampStats.getMaximum());
+ maximum = DateUtils.convertTime(
+ SerializationUtils.convertToUtc(TimeZone.getDefault(),
+ timestampStats.getMaximum()),
+ writerUsedProlepticGregorian, convertToProlepticGregorian);
}
if (timestampStats.hasMinimum()) {
- minimum = SerializationUtils.convertToUtc(TimeZone.getDefault(),
- timestampStats.getMinimum());
+ minimum = DateUtils.convertTime(
+ SerializationUtils.convertToUtc(TimeZone.getDefault(),
+ timestampStats.getMinimum()),
+ writerUsedProlepticGregorian, convertToProlepticGregorian);
}
if (timestampStats.hasMaximumUtc()) {
- maximum = timestampStats.getMaximumUtc();
+ maximum = DateUtils.convertTime(timestampStats.getMaximumUtc(),
+ writerUsedProlepticGregorian, convertToProlepticGregorian);
}
if (timestampStats.hasMinimumUtc()) {
- minimum = timestampStats.getMinimumUtc();
+ minimum = DateUtils.convertTime(timestampStats.getMinimumUtc(),
+ writerUsedProlepticGregorian, convertToProlepticGregorian);
}
}
@@ -1665,6 +1677,20 @@ public class ColumnStatisticsImpl implements ColumnStatistics {
public static ColumnStatisticsImpl deserialize(TypeDescription schema,
OrcProto.ColumnStatistics stats) {
+ return deserialize(schema, stats, false, false);
+ }
+
+ public static ColumnStatisticsImpl deserialize(TypeDescription schema,
+ OrcProto.ColumnStatistics stats,
+ ReaderImpl reader) {
+ return deserialize(schema, stats, reader.writerUsedProlepticGregorian(),
+ reader.options.getConvertToProlepticGregorian());
+ }
+
+ public static ColumnStatisticsImpl deserialize(TypeDescription schema,
+ OrcProto.ColumnStatistics stats,
+ boolean writerUsedProlepticGregorian,
+ boolean convertToProlepticGregorian) {
if (stats.hasBucketStatistics()) {
return new BooleanStatisticsImpl(stats);
} else if (stats.hasIntStatistics()) {
@@ -1681,9 +1707,11 @@ public class ColumnStatisticsImpl implements ColumnStatistics {
return new DecimalStatisticsImpl(stats);
}
} else if (stats.hasDateStatistics()) {
- return new DateStatisticsImpl(stats);
+ return new DateStatisticsImpl(stats, writerUsedProlepticGregorian,
+ convertToProlepticGregorian);
} else if (stats.hasTimestampStatistics()) {
- return new TimestampStatisticsImpl(stats);
+ return new TimestampStatisticsImpl(stats, writerUsedProlepticGregorian,
+ convertToProlepticGregorian);
} else if(stats.hasBinaryStatistics()) {
return new BinaryStatisticsImpl(stats);
} else {
diff --git a/java/core/src/java/org/apache/orc/impl/DateUtils.java b/java/core/src/java/org/apache/orc/impl/DateUtils.java
new file mode 100644
index 0000000..8ac574c
--- /dev/null
+++ b/java/core/src/java/org/apache/orc/impl/DateUtils.java
@@ -0,0 +1,182 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.orc.impl;
+
+import java.text.ParseException;
+import java.text.SimpleDateFormat;
+import java.util.Date;
+import java.util.GregorianCalendar;
+import java.util.TimeZone;
+import java.util.concurrent.TimeUnit;
+
+/**
+ * Conversion utilities from the hybrid Julian/Gregorian calendar to/from the
+ * proleptic Gregorian.
+ *
+ * The semantics here are to hold the string representation constant and change
+ * the epoch offset rather than holding the instant in time constant and change
+ * the string representation.
+ *
+ * These utilities are fast for the common case (dates on or after 1582-10-15),
+ * but slow for older dates, which are converted by formatting and re-parsing.
+ */
+public class DateUtils {
+ private static SimpleDateFormat createFormatter(String fmt,
+ GregorianCalendar calendar) {
+ SimpleDateFormat result = new SimpleDateFormat(fmt);
+ result.setCalendar(calendar);
+ return result;
+ }
+
+ private static final String DATE = "yyyy-MM-dd";
+ private static final String TIME = DATE + " HH:mm:ss";
+ private static final TimeZone UTC = TimeZone.getTimeZone("UTC");
+ private static final GregorianCalendar HYBRID = new GregorianCalendar();
+ private static final ThreadLocal<SimpleDateFormat> HYBRID_DATE_FORMAT =
+ ThreadLocal.withInitial(() -> createFormatter(DATE, HYBRID));
+ private static final ThreadLocal<SimpleDateFormat> HYBRID_TIME_FORMAT =
+ ThreadLocal.withInitial(() -> createFormatter(TIME, HYBRID));
+ private static final long SWITCHOVER_MILLIS;
+ private static final long SWITCHOVER_DAYS;
+ private static final GregorianCalendar PROLEPTIC = new GregorianCalendar();
+ private static final ThreadLocal<SimpleDateFormat> PROLEPTIC_DATE_FORMAT =
+ ThreadLocal.withInitial(() -> createFormatter(DATE, PROLEPTIC));
+ private static final ThreadLocal<SimpleDateFormat> PROLEPTIC_TIME_FORMAT =
+ ThreadLocal.withInitial(() -> createFormatter(TIME, PROLEPTIC));
+
+ static {
+ HYBRID.setTimeZone(UTC);
+ PROLEPTIC.setTimeZone(UTC);
+ PROLEPTIC.setGregorianChange(new Date(Long.MIN_VALUE));
+
+ // Get the earliest day on which the two calendars agree with each other.
+ try {
+ SWITCHOVER_MILLIS = HYBRID_DATE_FORMAT.get().parse("1582-10-15").getTime();
+ SWITCHOVER_DAYS = TimeUnit.MILLISECONDS.toDays(SWITCHOVER_MILLIS);
+ } catch (ParseException e) {
+ throw new IllegalArgumentException("Can't parse switch over date", e);
+ }
+ }
+
+ /**
+ * Convert an epoch day from the hybrid Julian/Gregorian calendar to the
+ * proleptic Gregorian.
+ * @param hybrid day of epoch in the hybrid Julian/Gregorian
+ * @return day of epoch in the proleptic Gregorian
+ */
+ public static int convertDateToProleptic(int hybrid) {
+ int proleptic = hybrid;
+ if (hybrid < SWITCHOVER_DAYS) {
+ String dateStr = HYBRID_DATE_FORMAT.get().format(
+ new Date(TimeUnit.DAYS.toMillis(hybrid)));
+ try {
+ proleptic = (int) TimeUnit.MILLISECONDS.toDays(
+ PROLEPTIC_DATE_FORMAT.get().parse(dateStr).getTime());
+ } catch (ParseException e) {
+ throw new IllegalArgumentException("Can't parse " + dateStr, e);
+ }
+ }
+ return proleptic;
+ }
+
+ /**
+ * Convert an epoch day from the proleptic Gregorian calendar to the hybrid
+ * Julian/Gregorian.
+ * @param proleptic day of epoch in the proleptic Gregorian
+ * @return day of epoch in the hybrid Julian/Gregorian
+ */
+ public static int convertDateToHybrid(int proleptic) {
+ int hyrbid = proleptic;
+ if (proleptic < SWITCHOVER_DAYS) {
+ String dateStr = PROLEPTIC_DATE_FORMAT.get().format(
+ new Date(TimeUnit.DAYS.toMillis(proleptic)));
+ try {
+ hyrbid = (int) TimeUnit.MILLISECONDS.toDays(
+ HYBRID_DATE_FORMAT.get().parse(dateStr).getTime());
+ } catch (ParseException e) {
+ throw new IllegalArgumentException("Can't parse " + dateStr, e);
+ }
+ }
+ return hyrbid;
+ }
+
+ public static int convertDate(int original,
+ boolean fromProleptic,
+ boolean toProleptic) {
+ if (fromProleptic != toProleptic) {
+ return toProleptic
+ ? convertDateToProleptic(original)
+ : convertDateToHybrid(original);
+ } else {
+ return original;
+ }
+ }
+
+ public static long convertTime(long original,
+ boolean fromProleptic,
+ boolean toProleptic) {
+ if (fromProleptic != toProleptic) {
+ return toProleptic
+ ? convertTimeToProleptic(original)
+ : convertTimeToHybrid(original);
+ } else {
+ return original;
+ }
+ }
+ /**
+ * Convert epoch millis from the hybrid Julian/Gregorian calendar to the
+ * proleptic Gregorian.
+ * @param hybrid millis of epoch in the hybrid Julian/Gregorian
+ * @return millis of epoch in the proleptic Gregorian
+ */
+ public static long convertTimeToProleptic(long hybrid) {
+ long proleptic = hybrid;
+ if (hybrid < SWITCHOVER_MILLIS) {
+ String dateStr = HYBRID_TIME_FORMAT.get().format(new Date(hybrid));
+ try {
+ proleptic = PROLEPTIC_TIME_FORMAT.get().parse(dateStr).getTime();
+ } catch (ParseException e) {
+ throw new IllegalArgumentException("Can't parse " + dateStr, e);
+ }
+ }
+ return proleptic;
+ }
+
+ /**
+ * Convert epoch millis from the proleptic Gregorian calendar to the hybrid
+ * Julian/Gregorian.
+ * @param proleptic millis of epoch in the proleptic Gregorian
+ * @return millis of epoch in the hybrid Julian/Gregorian
+ */
+ public static long convertTimeToHybrid(long proleptic) {
+ long hybrid = proleptic;
+ if (proleptic < SWITCHOVER_MILLIS) {
+ String dateStr = PROLEPTIC_TIME_FORMAT.get().format(new Date(proleptic));
+ try {
+ hybrid = HYBRID_TIME_FORMAT.get().parse(dateStr).getTime();
+ } catch (ParseException e) {
+ throw new IllegalArgumentException("Can't parse " + dateStr, e);
+ }
+ }
+ return hybrid;
+ }
+
+ private DateUtils() {
+ throw new UnsupportedOperationException();
+ }
+}
diff --git a/java/core/src/java/org/apache/orc/impl/ReaderImpl.java b/java/core/src/java/org/apache/orc/impl/ReaderImpl.java
index 09cfe82..5a86440 100644
--- a/java/core/src/java/org/apache/orc/impl/ReaderImpl.java
+++ b/java/core/src/java/org/apache/orc/impl/ReaderImpl.java
@@ -30,6 +30,7 @@ import java.util.function.Supplier;
import org.apache.hadoop.fs.FileStatus;
import org.apache.orc.CompressionKind;
import org.apache.orc.FileMetadata;
+import org.apache.orc.OrcConf;
import org.apache.orc.OrcFile;
import org.apache.orc.OrcUtils;
import org.apache.orc.Reader;
@@ -228,12 +229,12 @@ public class ReaderImpl implements Reader {
return deserializeStats(schema, fileStats);
}
- public static ColumnStatistics[] deserializeStats(
+ public ColumnStatistics[] deserializeStats(
TypeDescription schema,
List<OrcProto.ColumnStatistics> fileStats) {
ColumnStatistics[] result = new ColumnStatistics[fileStats.size()];
for(int i=0; i < result.length; ++i) {
- result[i] = ColumnStatisticsImpl.deserialize(schema, fileStats.get(i));
+ result[i] = ColumnStatisticsImpl.deserialize(schema, fileStats.get(i), this);
}
return result;
}
@@ -645,6 +646,14 @@ public class ReaderImpl implements Reader {
}
@Override
+ public boolean writerUsedProlepticGregorian() {
+ OrcProto.Footer footer = tail.getFooter();
+ return footer.hasCalendar()
+ ? footer.getCalendar() == OrcProto.CalendarKind.PROLEPTIC_GREGORIAN
+ : OrcConf.PROLEPTIC_GREGORIAN_DEFAULT.getBoolean(conf);
+ }
+
+ @Override
public Options options() {
return new Options(conf);
}
@@ -825,7 +834,7 @@ public class ReaderImpl implements Reader {
}
List<StripeStatistics> result = new ArrayList<>();
for (OrcProto.StripeStatistics ss : stripeStats) {
- result.add(new StripeStatistics(ss.getColStatsList()));
+ result.add(new StripeStatistics(ss.getColStatsList(), this));
}
return result;
}
diff --git a/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java b/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java
index 84e1b93..b07dbb2 100644
--- a/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java
+++ b/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java
@@ -215,7 +215,9 @@ public class RecordReaderImpl implements RecordReader {
rowIndexStride,
evolution,
writerVersion,
- fileReader.useUTCTimestamp);
+ fileReader.useUTCTimestamp,
+ fileReader.writerUsedProlepticGregorian(),
+ fileReader.options.getConvertToProlepticGregorian());
} else {
sargApp = null;
}
@@ -269,7 +271,9 @@ public class RecordReaderImpl implements RecordReader {
.setSchemaEvolution(evolution)
.skipCorrupt(skipCorrupt)
.fileFormat(fileReader.getFileVersion())
- .useUTCTimestamp(fileReader.useUTCTimestamp);
+ .useUTCTimestamp(fileReader.useUTCTimestamp)
+ .setProlepticGregorian(fileReader.writerUsedProlepticGregorian(),
+ fileReader.options.getConvertToProlepticGregorian());
reader = TreeReaderFactory.createTreeReader(evolution.getReaderSchema(),
readerContext);
@@ -887,15 +891,21 @@ public class RecordReaderImpl implements RecordReader {
private SchemaEvolution evolution;
private final long[] exceptionCount;
private final boolean useUTCTimestamp;
+ private final boolean writerUsedProlepticGregorian;
+ private final boolean convertToProlepticGregorian;
public SargApplier(SearchArgument sarg,
long rowIndexStride,
SchemaEvolution evolution,
OrcFile.WriterVersion writerVersion,
- boolean useUTCTimestamp) {
+ boolean useUTCTimestamp,
+ boolean writerUsedProlepticGregorian,
+ boolean convertToProlepticGregorian) {
this.writerVersion = writerVersion;
this.sarg = sarg;
sargLeaves = sarg.getLeaves();
+ this.writerUsedProlepticGregorian = writerUsedProlepticGregorian;
+ this.convertToProlepticGregorian = convertToProlepticGregorian;
filterColumns = mapSargColumnsToOrcInternalColIdx(sargLeaves,
evolution);
this.rowIndexStride = rowIndexStride;
diff --git a/java/core/src/java/org/apache/orc/impl/TreeReaderFactory.java b/java/core/src/java/org/apache/orc/impl/TreeReaderFactory.java
index e8e189a..539a57a 100644
--- a/java/core/src/java/org/apache/orc/impl/TreeReaderFactory.java
+++ b/java/core/src/java/org/apache/orc/impl/TreeReaderFactory.java
@@ -30,6 +30,7 @@ import java.util.TimeZone;
import org.apache.hadoop.hive.common.type.HiveDecimal;
import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.DateColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.Decimal64ColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
@@ -61,6 +62,9 @@ public class TreeReaderFactory {
String getWriterTimezone();
OrcFile.Version getFileFormat();
+ boolean useProlepticGregorian();
+
+ boolean fileUsedProlepticGregorian();
}
public static class ReaderContext implements Context {
@@ -69,6 +73,8 @@ public class TreeReaderFactory {
private boolean useUTCTimestamp = false;
private String writerTimezone;
private OrcFile.Version fileFormat;
+ private boolean useProlepticGregorian;
+ private boolean fileUsedProlepticGregorian;
public ReaderContext setSchemaEvolution(SchemaEvolution evolution) {
this.evolution = evolution;
@@ -95,6 +101,13 @@ public class TreeReaderFactory {
return this;
}
+ public ReaderContext setProlepticGregorian(boolean file,
+ boolean reader) {
+ this.useProlepticGregorian = reader;
+ this.fileUsedProlepticGregorian = file;
+ return this;
+ }
+
@Override
public SchemaEvolution getSchemaEvolution() {
return evolution;
@@ -119,6 +132,16 @@ public class TreeReaderFactory {
public OrcFile.Version getFileFormat() {
return fileFormat;
}
+
+ @Override
+ public boolean useProlepticGregorian() {
+ return useProlepticGregorian;
+ }
+
+ @Override
+ public boolean fileUsedProlepticGregorian() {
+ return fileUsedProlepticGregorian;
+ }
}
public abstract static class TreeReader {
@@ -900,6 +923,8 @@ public class TreeReaderFactory {
private TimeZone writerTimeZone;
private boolean hasSameTZRules;
private ThreadLocal<DateFormat> threadLocalDateFormat;
+ private final boolean useProleptic;
+ private final boolean fileUsesProleptic;
TimestampTreeReader(int columnId, Context context) throws IOException {
this(columnId, null, null, null, null, context);
@@ -936,6 +961,8 @@ public class TreeReaderFactory {
}
base_timestamp = getBaseTimestamp(context.getWriterTimezone());
}
+ fileUsesProleptic = context.fileUsedProlepticGregorian();
+ useProleptic = context.useProlepticGregorian();
}
@Override
@@ -1004,6 +1031,7 @@ public class TreeReaderFactory {
boolean[] isNull,
final int batchSize) throws IOException {
TimestampColumnVector result = (TimestampColumnVector) previousVector;
+ result.changeCalendar(fileUsesProleptic, false);
super.nextVector(previousVector, isNull, batchSize);
result.setIsUTC(context.getUseUTCTimestamp());
@@ -1041,6 +1069,7 @@ public class TreeReaderFactory {
}
}
}
+ result.changeCalendar(useProleptic, true);
}
private static int parseNanos(long serialized) {
@@ -1064,6 +1093,9 @@ public class TreeReaderFactory {
public static class DateTreeReader extends TreeReader {
protected IntegerReader reader = null;
+ private final boolean needsDateColumnVector;
+ private final boolean useProleptic;
+ private final boolean fileUsesProleptic;
DateTreeReader(int columnId, Context context) throws IOException {
this(columnId, null, null, null, context);
@@ -1072,6 +1104,10 @@ public class TreeReaderFactory {
protected DateTreeReader(int columnId, InStream present, InStream data,
OrcProto.ColumnEncoding encoding, Context context) throws IOException {
super(columnId, present, context);
+ useProleptic = context.useProlepticGregorian();
+ fileUsesProleptic = context.fileUsedProlepticGregorian();
+ // if either side is proleptic, we need a DateColumnVector
+ needsDateColumnVector = useProleptic || fileUsesProleptic;
if (data != null && encoding != null) {
checkEncoding(encoding);
reader = createIntegerReader(encoding.getKind(), data, true, context);
@@ -1114,12 +1150,23 @@ public class TreeReaderFactory {
boolean[] isNull,
final int batchSize) throws IOException {
final LongColumnVector result = (LongColumnVector) previousVector;
+ if (needsDateColumnVector) {
+ if (result instanceof DateColumnVector) {
+ ((DateColumnVector) result).changeCalendar(fileUsesProleptic, false);
+ } else {
+ throw new IllegalArgumentException("Can't use LongColumnVector to " +
+ "read proleptic Gregorian dates.");
+ }
+ }
// Read present/isNull stream
super.nextVector(result, isNull, batchSize);
// Read value entries based on isNull entries
reader.nextVector(result, result.vector, batchSize);
+ if (needsDateColumnVector) {
+ ((DateColumnVector) result).changeCalendar(useProleptic, true);
+ }
}
@Override
diff --git a/java/core/src/java/org/apache/orc/impl/WriterImpl.java b/java/core/src/java/org/apache/orc/impl/WriterImpl.java
index c7e5818..7eea7f7 100644
--- a/java/core/src/java/org/apache/orc/impl/WriterImpl.java
+++ b/java/core/src/java/org/apache/orc/impl/WriterImpl.java
@@ -120,12 +120,14 @@ public class WriterImpl implements WriterInternal, MemoryManager.Callback {
private final boolean useUTCTimeZone;
private final double dictionaryKeySizeThreshold;
private final boolean[] directEncodingColumns;
+ private final boolean useProlepticGregorian;
public WriterImpl(FileSystem fs,
Path path,
OrcFile.WriterOptions opts) throws IOException {
this.path = path;
this.conf = opts.getConfiguration();
+ useProlepticGregorian = opts.getProlepticGregorian();
this.callback = opts.getCallback();
this.schema = opts.getSchema();
this.writerVersion = opts.getWriterVersion();
@@ -438,6 +440,11 @@ public class WriterImpl implements WriterInternal, MemoryManager.Callback {
public double getDictionaryKeySizeThreshold(int columnId) {
return directEncodingColumns[columnId] ? 0.0 : dictionaryKeySizeThreshold;
}
+
+ @Override
+ public boolean getProlepticGregorian() {
+ return useProlepticGregorian;
+ }
}
@@ -536,6 +543,11 @@ public class WriterImpl implements WriterInternal, MemoryManager.Callback {
rawDataSize = computeRawDataSize();
// serialize the types
writeTypes(builder, schema);
+ if (hasDateOrTime(schema)) {
+ builder.setCalendar(useProlepticGregorian
+ ? OrcProto.CalendarKind.PROLEPTIC_GREGORIAN
+ : OrcProto.CalendarKind.JULIAN_GREGORIAN);
+ }
// add the stripe information
for(OrcProto.StripeInformation stripe: stripes) {
builder.addStripes(stripe);
@@ -643,8 +655,9 @@ public class WriterImpl implements WriterInternal, MemoryManager.Callback {
@Override
public void appendStripe(byte[] stripe, int offset, int length,
- StripeInformation stripeInfo,
- OrcProto.StripeStatistics stripeStatistics) throws IOException {
+ StripeInformation stripeInfo,
+ OrcProto.StripeStatistics stripeStatistics
+ ) throws IOException {
checkArgument(stripe != null, "Stripe must not be null");
checkArgument(length <= stripe.length,
"Specified length must not be greater specified array length");
@@ -691,7 +704,12 @@ public class WriterImpl implements WriterInternal, MemoryManager.Callback {
// add the column statistics
writeFileStatistics(builder, treeWriter);
- return ReaderImpl.deserializeStats(schema, builder.getStatisticsList());
+ List<OrcProto.ColumnStatistics> fileStats = builder.getStatisticsList();
+ ColumnStatistics[] result = new ColumnStatistics[fileStats.size()];
+ for(int i=0; i < result.length; ++i) {
+ result[i] = ColumnStatisticsImpl.deserialize(schema, fileStats.get(i));
+ }
+ return result;
}
public CompressionCodec getCompressionCodec() {
@@ -712,4 +730,22 @@ public class WriterImpl implements WriterInternal, MemoryManager.Callback {
}
return false;
}
+
+ private static boolean hasDateOrTime(TypeDescription schema) {
+ switch (schema.getCategory()) {
+ case TIMESTAMP:
+ case DATE:
+ return true;
+ default:
+ }
+ List<TypeDescription> children = schema.getChildren();
+ if (children != null) {
+ for(TypeDescription child: children) {
+ if (hasDateOrTime(child)) {
+ return true;
+ }
+ }
+ }
+ return false;
+ }
}
diff --git a/java/core/src/java/org/apache/orc/impl/writer/DateTreeWriter.java b/java/core/src/java/org/apache/orc/impl/writer/DateTreeWriter.java
index 209dd0e..4289b57 100644
--- a/java/core/src/java/org/apache/orc/impl/writer/DateTreeWriter.java
+++ b/java/core/src/java/org/apache/orc/impl/writer/DateTreeWriter.java
@@ -19,6 +19,7 @@
package org.apache.orc.impl.writer;
import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.DateColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
import org.apache.hadoop.hive.ql.util.JavaDataModel;
import org.apache.orc.OrcProto;
@@ -32,6 +33,7 @@ import java.io.IOException;
public class DateTreeWriter extends TreeWriterBase {
private final IntegerWriter writer;
private final boolean isDirectV2;
+ private final boolean useProleptic;
public DateTreeWriter(int columnId,
TypeDescription schema,
@@ -45,6 +47,7 @@ public class DateTreeWriter extends TreeWriterBase {
if (rowIndexPosition != null) {
recordPosition(rowIndexPosition);
}
+ useProleptic = writer.getProlepticGregorian();
}
@Override
@@ -52,6 +55,12 @@ public class DateTreeWriter extends TreeWriterBase {
int length) throws IOException {
super.writeBatch(vector, offset, length);
LongColumnVector vec = (LongColumnVector) vector;
+ if (vector instanceof DateColumnVector) {
+ ((DateColumnVector) vec).changeCalendar(useProleptic, true);
+ } else if (useProleptic) {
+ throw new IllegalArgumentException("Can't use LongColumnVector to write" +
+ " proleptic dates");
+ }
if (vector.isRepeating) {
if (vector.noNulls || !vector.isNull[0]) {
int value = (int) vec.vector[0];
diff --git a/java/core/src/java/org/apache/orc/impl/writer/TimestampTreeWriter.java b/java/core/src/java/org/apache/orc/impl/writer/TimestampTreeWriter.java
index 0f30d07..c7a751a 100644
--- a/java/core/src/java/org/apache/orc/impl/writer/TimestampTreeWriter.java
+++ b/java/core/src/java/org/apache/orc/impl/writer/TimestampTreeWriter.java
@@ -44,6 +44,7 @@ public class TimestampTreeWriter extends TreeWriterBase {
private final TimeZone localTimezone;
private final long baseEpochSecsLocalTz;
private final long baseEpochSecsUTC;
+ private final boolean useProleptic;
public TimestampTreeWriter(int columnId,
TypeDescription schema,
@@ -77,6 +78,7 @@ public class TimestampTreeWriter extends TreeWriterBase {
} catch (ParseException e) {
throw new IOException("Unable to create base timestamp tree writer", e);
}
+ useProleptic = writer.getProlepticGregorian();
}
@Override
@@ -95,6 +97,7 @@ public class TimestampTreeWriter extends TreeWriterBase {
int length) throws IOException {
super.writeBatch(vector, offset, length);
TimestampColumnVector vec = (TimestampColumnVector) vector;
+ vec.changeCalendar(useProleptic, true);
if (vector.isRepeating) {
if (vector.noNulls || !vector.isNull[0]) {
// ignore the bottom three digits from the vec.time field
diff --git a/java/core/src/java/org/apache/orc/impl/writer/WriterContext.java b/java/core/src/java/org/apache/orc/impl/writer/WriterContext.java
index 9ef3dda..0727d30 100644
--- a/java/core/src/java/org/apache/orc/impl/writer/WriterContext.java
+++ b/java/core/src/java/org/apache/orc/impl/writer/WriterContext.java
@@ -105,4 +105,10 @@ public interface WriterContext {
boolean getUseUTCTimestamp();
double getDictionaryKeySizeThreshold(int column);
+
+ /**
+ * Should we write the data using the proleptic Gregorian calendar?
+ * @return true if we should use the proleptic Gregorian calendar
+ */
+ boolean getProlepticGregorian();
}
diff --git a/java/core/src/test/org/apache/orc/TestProlepticConversions.java b/java/core/src/test/org/apache/orc/TestProlepticConversions.java
new file mode 100644
index 0000000..4d18412
--- /dev/null
+++ b/java/core/src/test/org/apache/orc/TestProlepticConversions.java
@@ -0,0 +1,177 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.orc;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.ql.exec.vector.DateColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.junit.Before;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TestName;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import java.io.File;
+import java.text.SimpleDateFormat;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Date;
+import java.util.GregorianCalendar;
+import java.util.List;
+import java.util.TimeZone;
+import java.util.concurrent.TimeUnit;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+@RunWith(Parameterized.class)
+public class TestProlepticConversions {
+
+ @Parameterized.Parameter
+ public boolean writerProlepticGregorian;
+
+ @Parameterized.Parameter(1)
+ public boolean readerProlepticGregorian;
+
+ @Parameterized.Parameters
+ public static Collection<Object[]> getParameters() {
+ List<Object[]> result = new ArrayList<>();
+ final boolean[] BOOLEANS = new boolean[]{false, true};
+ for(Boolean writer: BOOLEANS) {
+ for (Boolean reader: BOOLEANS) {
+ result.add(new Object[]{writer, reader});
+ }
+ }
+ return result;
+ }
+
+ private Path workDir = new Path(System.getProperty("test.tmp.dir",
+ "target" + File.separator + "test" + File.separator + "tmp"));
+
+ private final Configuration conf;
+ private final TimeZone UTC = TimeZone.getTimeZone("UTC");
+ private final GregorianCalendar PROLEPTIC = new GregorianCalendar();
+ private final GregorianCalendar HYBRID = new GregorianCalendar();
+ {
+ conf = new Configuration();
+ PROLEPTIC.setTimeZone(UTC);
+ PROLEPTIC.setGregorianChange(new Date(Long.MIN_VALUE));
+ HYBRID.setTimeZone(UTC);
+ }
+
+ private FileSystem fs;
+ private Path testFilePath;
+
+ @Rule
+ public TestName testCaseName = new TestName();
+
+ @Before
+ public void setupPath() throws Exception {
+ fs = FileSystem.getLocal(conf);
+ testFilePath = new Path(workDir, "TestProlepticConversion." +
+ testCaseName.getMethodName().replaceFirst("\\[[0-9]+]", "") + ".orc");
+ fs.delete(testFilePath, false);
+ }
+
+ private SimpleDateFormat createParser(String format, GregorianCalendar calendar) {
+ SimpleDateFormat result = new SimpleDateFormat(format);
+ result.setCalendar(calendar);
+ return result;
+ }
+
+ @Test
+ public void testReadWrite() throws Exception {
+ TypeDescription schema = TypeDescription.fromString(
+ "struct<d:date,t:timestamp>");
+ try (Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .setSchema(schema)
+ .fileSystem(fs)
+ .setProlepticGregorian(writerProlepticGregorian))) {
+ VectorizedRowBatch batch = schema.createRowBatchV2();
+ batch.size = 1024;
+ DateColumnVector d = (DateColumnVector) batch.cols[0];
+ TimestampColumnVector t = (TimestampColumnVector) batch.cols[1];
+ d.changeCalendar(writerProlepticGregorian, false);
+ t.changeCalendar(writerProlepticGregorian, false);
+ GregorianCalendar cal = writerProlepticGregorian ? PROLEPTIC : HYBRID;
+ SimpleDateFormat dateFormat = createParser("yyyy-MM-dd", cal);
+ SimpleDateFormat timeFormat = createParser("yyyy-MM-dd HH:mm:ss", cal);
+ for(int r=0; r < batch.size; ++r) {
+ d.vector[r] = TimeUnit.MILLISECONDS.toDays(
+ dateFormat.parse(String.format("%04d-01-23", r * 2 + 1)).getTime());
+ Date val = timeFormat.parse(
+ String.format("%04d-03-21 %02d:12:34", 2 * r + 1, r % 24));
+ t.time[r] = val.getTime();
+ t.nanos[r] = 0;
+ }
+ writer.addRowBatch(batch);
+ }
+ try (Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf)
+ .filesystem(fs)
+ .convertToProlepticGregorian(readerProlepticGregorian));
+ RecordReader rows = reader.rows(reader.options())) {
+ assertEquals(writerProlepticGregorian, reader.writerUsedProlepticGregorian());
+ VectorizedRowBatch batch = reader.getSchema().createRowBatchV2();
+ DateColumnVector d = (DateColumnVector) batch.cols[0];
+ TimestampColumnVector t = (TimestampColumnVector) batch.cols[1];
+ GregorianCalendar cal = readerProlepticGregorian ? PROLEPTIC : HYBRID;
+ SimpleDateFormat dateFormat = createParser("yyyy-MM-dd", cal);
+ SimpleDateFormat timeFormat = createParser("yyyy-MM-dd HH:mm:ss", cal);
+
+ // Check the file statistics
+ ColumnStatistics[] colStats = reader.getStatistics();
+ DateColumnStatistics dStats = (DateColumnStatistics) colStats[1];
+ TimestampColumnStatistics tStats = (TimestampColumnStatistics) colStats[2];
+ assertEquals("0001-01-23", dateFormat.format(dStats.getMinimum()));
+ assertEquals("2047-01-23", dateFormat.format(dStats.getMaximum()));
+ assertEquals("0001-03-21 00:12:34", timeFormat.format(tStats.getMinimum()));
+ assertEquals("2047-03-21 15:12:34", timeFormat.format(tStats.getMaximum()));
+
+ // Check the stripe stats
+ List<StripeStatistics> stripeStats = reader.getStripeStatistics();
+ assertEquals(1, stripeStats.size());
+ colStats = stripeStats.get(0).getColumnStatistics();
+ dStats = (DateColumnStatistics) colStats[1];
+ tStats = (TimestampColumnStatistics) colStats[2];
+ assertEquals("0001-01-23", dateFormat.format(dStats.getMinimum()));
+ assertEquals("2047-01-23", dateFormat.format(dStats.getMaximum()));
+ assertEquals("0001-03-21 00:12:34", timeFormat.format(tStats.getMinimum()));
+ assertEquals("2047-03-21 15:12:34", timeFormat.format(tStats.getMaximum()));
+
+ // Check the data
+ assertTrue(rows.nextBatch(batch));
+ assertEquals(1024, batch.size);
+ // Ensure the column vectors are using the right calendar
+ assertEquals(readerProlepticGregorian, d.isUsingProlepticCalendar());
+ assertEquals(readerProlepticGregorian, t.usingProlepticCalendar());
+ for(int r=0; r < batch.size; ++r) {
+ String expectedD = String.format("%04d-01-23", r * 2 + 1);
+ String expectedT = String.format("%04d-03-21 %02d:12:34", 2 * r + 1, r % 24);
+ assertEquals("row " + r, expectedD, dateFormat.format(
+ new Date(TimeUnit.DAYS.toMillis(d.vector[r]))));
+ assertEquals("row " + r, expectedT, timeFormat.format(t.asScratchTimestamp(r)));
+ }
+ }
+ }
+}
diff --git a/java/core/src/test/org/apache/orc/TestStringDictionary.java b/java/core/src/test/org/apache/orc/TestStringDictionary.java
index 203f58e..91fff0b 100644
--- a/java/core/src/test/org/apache/orc/TestStringDictionary.java
+++ b/java/core/src/test/org/apache/orc/TestStringDictionary.java
@@ -249,6 +249,11 @@ public class TestStringDictionary {
public double getDictionaryKeySizeThreshold(int column) {
return OrcConf.DICTIONARY_KEY_SIZE_THRESHOLD.getDouble(conf);
}
+
+ @Override
+ public boolean getProlepticGregorian() {
+ return false;
+ }
}
@Test
diff --git a/java/core/src/test/org/apache/orc/impl/TestRecordReaderImpl.java b/java/core/src/test/org/apache/orc/impl/TestRecordReaderImpl.java
index 72d2211..4c740c2 100644
--- a/java/core/src/test/org/apache/orc/impl/TestRecordReaderImpl.java
+++ b/java/core/src/test/org/apache/orc/impl/TestRecordReaderImpl.java
@@ -2023,7 +2023,7 @@ public class TestRecordReaderImpl {
.end().build();
RecordReaderImpl.SargApplier applier =
new RecordReaderImpl.SargApplier(sarg, 1000, evolution,
- OrcFile.WriterVersion.ORC_135, false);
+ OrcFile.WriterVersion.ORC_135, false, false, false);
OrcProto.StripeInformation stripe =
OrcProto.StripeInformation.newBuilder().setNumberOfRows(4000).build();
OrcProto.RowIndex[] indexes = new OrcProto.RowIndex[3];
@@ -2071,7 +2071,7 @@ public class TestRecordReaderImpl {
.end().build();
RecordReaderImpl.SargApplier applier =
new RecordReaderImpl.SargApplier(sarg, 1000, evolution,
- OrcFile.WriterVersion.ORC_135, false);
+ OrcFile.WriterVersion.ORC_135, false, false, false);
OrcProto.StripeInformation stripe =
OrcProto.StripeInformation.newBuilder().setNumberOfRows(3000).build();
OrcProto.RowIndex[] indexes = new OrcProto.RowIndex[3];
@@ -2103,6 +2103,50 @@ public class TestRecordReaderImpl {
}
@Test
+ public void testPositionalEvolutionAddColumnPPD() throws IOException {
+ Reader.Options opts = new Reader.Options();
+ opts.forcePositionalEvolution(true);
+
+ TypeDescription file = TypeDescription.fromString("struct<x:int>");
+ // new column added on reader side
+ TypeDescription read = TypeDescription.fromString("struct<x:int,y:boolean>");
+ opts.include(includeAll(read));
+
+ SchemaEvolution evo = new SchemaEvolution(file, read, opts);
+
+ SearchArgument sarg = SearchArgumentFactory.newBuilder().startAnd()
+ .equals("y", PredicateLeaf.Type.BOOLEAN, true).end().build();
+
+ RecordReaderImpl.SargApplier applier =
+ new RecordReaderImpl.SargApplier(sarg, 1000, evo,
+ OrcFile.WriterVersion.ORC_135, false, false, false);
+
+ OrcProto.StripeInformation stripe =
+ OrcProto.StripeInformation.newBuilder().setNumberOfRows(2000).build();
+
+ OrcProto.RowIndex[] indexes = new OrcProto.RowIndex[3];
+ indexes[1] = OrcProto.RowIndex.newBuilder() // index for original x column
+ .addEntry(createIndexEntry(0L, 10L))
+ .addEntry(createIndexEntry(100L, 200L))
+ .build();
+ indexes[2] = null; // no-op, just for clarifying that new reader column doesn't have an index
+
+ List<OrcProto.ColumnEncoding> encodings = new ArrayList<>();
+ encodings.add(OrcProto.ColumnEncoding.newBuilder().setKind(OrcProto.ColumnEncoding.Kind.DIRECT).build());
+
+ boolean[] rows = applier.pickRowGroups(new ReaderImpl.StripeInformationImpl(stripe),
+ indexes, null, encodings, null, false);
+ assertEquals(RecordReaderImpl.SargApplier.READ_ALL_RGS, rows); // cannot filter on the new column, so all row groups are selected
+ }
+
+ private boolean[] includeAll(TypeDescription readerType) {
+ int numColumns = readerType.getMaximumId() + 1;
+ boolean[] result = new boolean[numColumns];
+ Arrays.fill(result, true);
+ return result;
+ }
+
+ @Test
public void testSkipDataReaderOpen() throws Exception {
IOException ioe = new IOException("Don't open when there is no stripe");
diff --git a/java/pom.xml b/java/pom.xml
index 5dadc0c..6d1c696 100644
--- a/java/pom.xml
+++ b/java/pom.xml
@@ -70,7 +70,7 @@
<min.hadoop.version>2.2.0</min.hadoop.version>
<hadoop.version>2.7.3</hadoop.version>
- <storage-api.version>2.6.0</storage-api.version>
+ <storage-api.version>2.7.1</storage-api.version>
<zookeeper.version>3.4.6</zookeeper.version>
</properties>
diff --git a/java/tools/src/java/org/apache/orc/tools/FileDump.java b/java/tools/src/java/org/apache/orc/tools/FileDump.java
index 3eae30d..a536f55 100644
--- a/java/tools/src/java/org/apache/orc/tools/FileDump.java
+++ b/java/tools/src/java/org/apache/orc/tools/FileDump.java
@@ -39,6 +39,7 @@ import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.hdfs.DistributedFileSystem;
+import org.apache.orc.impl.ReaderImpl;
import org.apache.orc.util.BloomFilter;
import org.apache.orc.util.BloomFilterIO;
import org.apache.orc.ColumnStatistics;
@@ -47,7 +48,6 @@ import org.apache.orc.OrcFile;
import org.apache.orc.Reader;
import org.apache.orc.TypeDescription;
import org.apache.orc.Writer;
-import org.apache.orc.impl.AcidStats;
import org.apache.orc.impl.ColumnStatisticsImpl;
import org.apache.orc.impl.OrcAcidUtils;
import org.apache.orc.impl.OrcIndex;
@@ -312,6 +312,9 @@ public final class FileDump {
if (reader.getCompressionKind() != CompressionKind.NONE) {
System.out.println("Compression size: " + reader.getCompressionSize());
}
+ System.out.println("Calendar: " + (reader.writerUsedProlepticGregorian()
+ ? "Proleptic Gregorian"
+ : "Julian/Gregorian"));
System.out.println("Type: " + reader.getSchema().toString());
System.out.println("\nStripe Statistics:");
List<StripeStatistics> stripeStats = reader.getStripeStatistics();
@@ -385,7 +388,7 @@ public final class FileDump {
for (int col : rowIndexCols) {
StringBuilder buf = new StringBuilder();
String rowIdxString = getFormattedRowIndices(col,
- indices.getRowGroupIndex(), schema);
+ indices.getRowGroupIndex(), schema, (ReaderImpl) reader);
buf.append(rowIdxString);
String bloomFilString = getFormattedBloomFilters(col, indices,
reader.getWriterVersion(),
@@ -664,7 +667,8 @@ public final class FileDump {
private static String getFormattedRowIndices(int col,
OrcProto.RowIndex[] rowGroupIndex,
- TypeDescription schema) {
+ TypeDescription schema,
+ ReaderImpl reader) {
StringBuilder buf = new StringBuilder();
OrcProto.RowIndex index;
buf.append(" Row group indices for column ").append(col).append(":");
@@ -687,7 +691,7 @@ public final class FileDump {
buf.append("no stats at ");
} else {
ColumnStatistics cs =
- ColumnStatisticsImpl.deserialize(colSchema, colStats);
+ ColumnStatisticsImpl.deserialize(colSchema, colStats, reader);
buf.append(cs.toString());
}
buf.append(" positions: ");
diff --git a/java/tools/src/java/org/apache/orc/tools/JsonFileDump.java b/java/tools/src/java/org/apache/orc/tools/JsonFileDump.java
index c02ff20..e1d6301 100644
--- a/java/tools/src/java/org/apache/orc/tools/JsonFileDump.java
+++ b/java/tools/src/java/org/apache/orc/tools/JsonFileDump.java
@@ -1,4 +1,4 @@
-/**
+/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
@@ -30,6 +30,7 @@ import org.apache.orc.Reader;
import org.apache.orc.TypeDescription;
import org.apache.orc.impl.AcidStats;
import org.apache.orc.impl.OrcAcidUtils;
+import org.apache.orc.impl.ReaderImpl;
import org.apache.orc.impl.RecordReaderImpl;
import org.apache.orc.util.BloomFilter;
import org.codehaus.jettison.json.JSONArray;
@@ -52,16 +53,12 @@ import org.codehaus.jettison.json.JSONException;
import org.codehaus.jettison.json.JSONObject;
import org.codehaus.jettison.json.JSONStringer;
import org.codehaus.jettison.json.JSONWriter;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
/**
* File dump tool with json formatted output.
*/
public class JsonFileDump {
- private static final Logger LOG = LoggerFactory.getLogger(JsonFileDump.class);
-
public static void printJsonMetaData(List<String> files,
Configuration conf,
List<Integer> rowIndexCols, boolean prettyPrint, boolean printTimeZone)
@@ -101,6 +98,9 @@ public class JsonFileDump {
writeSchema(writer, reader.getTypes());
writer.endArray();
+ writer.key("calendar").value(reader.writerUsedProlepticGregorian()
+ ? "proleptic Gregorian"
+ : "Julian/Gregorian");
writer.key("stripeStatistics").array();
List<StripeStatistics> stripeStatistics = reader.getStripeStatistics();
for (int n = 0; n < stripeStatistics.size(); n++) {
@@ -191,7 +191,7 @@ public class JsonFileDump {
writer.object();
writer.key("columnId").value(col);
writeRowGroupIndexes(writer, col, indices.getRowGroupIndex(),
- reader.getSchema());
+ reader.getSchema(), (ReaderImpl) reader);
writeBloomFilterIndexes(writer, col, indices,
reader.getWriterVersion(),
reader.getSchema().findSubtype(col).getCategory(),
@@ -399,9 +399,9 @@ public class JsonFileDump {
}
private static void writeRowGroupIndexes(JSONWriter writer, int col,
- OrcProto.RowIndex[] rowGroupIndex, TypeDescription schema)
- throws JSONException {
-
+ OrcProto.RowIndex[] rowGroupIndex,
+ TypeDescription schema,
+ ReaderImpl reader) throws JSONException {
OrcProto.RowIndex index;
if (rowGroupIndex == null || (col >= rowGroupIndex.length) ||
((index = rowGroupIndex[col]) == null)) {
@@ -418,7 +418,7 @@ public class JsonFileDump {
}
OrcProto.ColumnStatistics colStats = entry.getStatistics();
writeColumnStatistics(writer, ColumnStatisticsImpl.deserialize(
- schema.findSubtype(col), colStats));
+ schema.findSubtype(col), colStats, reader));
writer.key("positions").array();
for (int posIx = 0; posIx < entry.getPositionsCount(); ++posIx) {
writer.value(entry.getPositions(posIx));
diff --git a/java/tools/src/test/resources/orc-file-dump-bloomfilter.out b/java/tools/src/test/resources/orc-file-dump-bloomfilter.out
index a519efe..ddcf385 100644
--- a/java/tools/src/test/resources/orc-file-dump-bloomfilter.out
+++ b/java/tools/src/test/resources/orc-file-dump-bloomfilter.out
@@ -3,6 +3,7 @@ File Version: 0.12 with ORC_517
Rows: 21000
Compression: ZLIB
Compression size: 4096
+Calendar: Julian/Gregorian
Type: struct<i:int,l:bigint,s:string>
Stripe Statistics:
diff --git a/java/tools/src/test/resources/orc-file-dump-bloomfilter2.out b/java/tools/src/test/resources/orc-file-dump-bloomfilter2.out
index 5e1925c..6e55f1e 100644
--- a/java/tools/src/test/resources/orc-file-dump-bloomfilter2.out
+++ b/java/tools/src/test/resources/orc-file-dump-bloomfilter2.out
@@ -3,6 +3,7 @@ File Version: 0.12 with ORC_517
Rows: 21000
Compression: ZLIB
Compression size: 4096
+Calendar: Julian/Gregorian
Type: struct<i:int,l:bigint,s:string>
Stripe Statistics:
diff --git a/java/tools/src/test/resources/orc-file-dump-dictionary-threshold.out b/java/tools/src/test/resources/orc-file-dump-dictionary-threshold.out
index 5befc78..64dcefc 100644
--- a/java/tools/src/test/resources/orc-file-dump-dictionary-threshold.out
+++ b/java/tools/src/test/resources/orc-file-dump-dictionary-threshold.out
@@ -3,6 +3,7 @@ File Version: 0.12 with ORC_517
Rows: 21000
Compression: ZLIB
Compression size: 4096
+Calendar: Julian/Gregorian
Type: struct<i:int,l:bigint,s:string>
Stripe Statistics:
diff --git a/java/tools/src/test/resources/orc-file-dump.json b/java/tools/src/test/resources/orc-file-dump.json
index 4f756c9..0db1ccf 100644
--- a/java/tools/src/test/resources/orc-file-dump.json
+++ b/java/tools/src/test/resources/orc-file-dump.json
@@ -34,6 +34,7 @@
"columnType": "STRING"
}
],
+ "calendar": "Julian\/Gregorian",
"stripeStatistics": [
{
"stripeNumber": 1,
diff --git a/java/tools/src/test/resources/orc-file-dump.out b/java/tools/src/test/resources/orc-file-dump.out
index 6d2a912..f601edd 100644
--- a/java/tools/src/test/resources/orc-file-dump.out
+++ b/java/tools/src/test/resources/orc-file-dump.out
@@ -3,6 +3,7 @@ File Version: 0.12 with ORC_517
Rows: 21000
Compression: ZLIB
Compression size: 4096
+Calendar: Julian/Gregorian
Type: struct<i:int,l:bigint,s:string>
Stripe Statistics:
diff --git a/java/tools/src/test/resources/orc-file-has-null.out b/java/tools/src/test/resources/orc-file-has-null.out
index ee8fb12..70e2081 100644
--- a/java/tools/src/test/resources/orc-file-has-null.out
+++ b/java/tools/src/test/resources/orc-file-has-null.out
@@ -3,6 +3,7 @@ File Version: 0.12 with ORC_517
Rows: 20000
Compression: ZLIB
Compression size: 4096
+Calendar: Julian/Gregorian
Type: struct<bytes1:binary,string1:string>
Stripe Statistics:
diff --git a/proto/orc_proto.proto b/proto/orc_proto.proto
index 27820b4..24a62a4 100644
--- a/proto/orc_proto.proto
+++ b/proto/orc_proto.proto
@@ -194,6 +194,15 @@ message Metadata {
repeated StripeStatistics stripeStats = 1;
}
+enum CalendarKind {
+ UNKNOWN_CALENDAR = 0;
+ // The Java default calendar changes from Julian to Gregorian
+ // in October 1582.
+ JULIAN_GREGORIAN = 1;
+ // A calendar that extends the Gregorian calendar back forever.
+ PROLEPTIC_GREGORIAN = 2;
+}
+
message Footer {
optional uint64 headerLength = 1;
optional uint64 contentLength = 2;
@@ -210,6 +219,8 @@ message Footer {
// 2 = Presto
// 3 = Scritchley Go from https://github.com/scritchley/orc
optional uint32 writer = 9;
+ // encryption is 10
+ optional CalendarKind calendar = 11;
}
enum CompressionKind {