You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@orc.apache.org by om...@apache.org on 2021/01/26 19:34:51 UTC
[orc] branch master updated: ORC-741: Schema Evolution missing
column not handled in filters.
This is an automated email from the ASF dual-hosted git repository.
omalley pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/orc.git
The following commit(s) were added to refs/heads/master by this push:
new 1c41def ORC-741: Schema Evolution missing column not handled in filters.
1c41def is described below
commit 1c41def3de61f833df8515cf4d613ecef3fc6b4b
Author: Pavan Lanka <pl...@apple.com>
AuthorDate: Fri Jan 22 15:05:04 2021 -0800
ORC-741: Schema Evolution missing column not handled in filters.
Fixes #634
Signed-off-by: Owen O'Malley <om...@apache.org>
---
.../java/org/apache/orc/impl/RecordReaderImpl.java | 14 ++---
.../test/org/apache/orc/TestRowFilteringSkip.java | 68 ++++++++++++++++++++++
.../apache/orc/mapred/TestOrcFileEvolution.java | 42 +++++++++++--
3 files changed, 112 insertions(+), 12 deletions(-)
diff --git a/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java b/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java
index 9b6d563..508ca65 100644
--- a/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java
+++ b/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java
@@ -100,7 +100,8 @@ public class RecordReaderImpl implements RecordReader {
*
* @param evolution the mapping from reader to file schema
* @param columnName the fully qualified column name to look for
- * @return the file column number or -1 if the column wasn't found
+ * @return the file column number or -1 if the column wasn't found in the file schema
+ * @throws IllegalArgumentException if the column was not found in the reader schema
*/
static int findColumns(SchemaEvolution evolution,
String columnName) {
@@ -110,10 +111,9 @@ public class RecordReaderImpl implements RecordReader {
TypeDescription fileColumn = evolution.getFileType(readerColumn);
return fileColumn == null ? -1 : fileColumn.getId();
} catch (IllegalArgumentException e) {
- if (LOG.isDebugEnabled()){
- LOG.debug("{}", e.getMessage());
- }
- return -1;
+ throw new IllegalArgumentException("Filter could not find column with name: " +
+ columnName + " on " + evolution.getReaderBaseSchema(),
+ e);
}
}
@@ -231,11 +231,9 @@ public class RecordReaderImpl implements RecordReader {
if (options.getPreFilterColumnNames() != null) {
for (String colName : options.getPreFilterColumnNames()) {
int expandColId = findColumns(evolution, colName);
+ // If the column is not present in the file then this can be ignored from read.
if (expandColId != -1) {
filterColIds.add(expandColId);
- } else {
- throw new IllegalArgumentException("Filter could not find column with name: " +
- colName + " on " + evolution.getReaderBaseSchema());
}
}
LOG.info("Filter Columns: " + filterColIds);
diff --git a/java/core/src/test/org/apache/orc/TestRowFilteringSkip.java b/java/core/src/test/org/apache/orc/TestRowFilteringSkip.java
index 0be05b3..10d5594 100644
--- a/java/core/src/test/org/apache/orc/TestRowFilteringSkip.java
+++ b/java/core/src/test/org/apache/orc/TestRowFilteringSkip.java
@@ -1237,6 +1237,74 @@ public class TestRowFilteringSkip {
}
@Test
+ public void testSchemaEvolutionMissingColumn() throws Exception {
+ // Set the row stride to a multiple of the batch size
+ final int INDEX_STRIDE = 16 * ColumnBatchRows;
+ final int NUM_BATCHES = 10;
+
+ TypeDescription fileSchema = TypeDescription.createStruct()
+ .addField("int1", TypeDescription.createInt())
+ .addField("ts2", TypeDescription.createTimestamp());
+
+ try (Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .setSchema(fileSchema)
+ .rowIndexStride(INDEX_STRIDE))) {
+ VectorizedRowBatch batch = fileSchema.createRowBatchV2();
+ LongColumnVector col1 = (LongColumnVector) batch.cols[0];
+ TimestampColumnVector col2 = (TimestampColumnVector) batch.cols[1];
+ for (int b=0; b < NUM_BATCHES; ++b) {
+ batch.reset();
+ batch.size = ColumnBatchRows;
+ for (int row = 0; row < batch.size; row++) {
+ col1.vector[row] = row;
+ if ((row % 2) == 0)
+ col2.set(row, Timestamp.valueOf((1900+row)+"-04-01 12:34:56.9"));
+ else {
+ col2.isNull[row] = true;
+ col2.set(row, null);
+ }
+ }
+ col1.isRepeating = true;
+ col1.noNulls = false;
+ col2.noNulls = false;
+ writer.addRowBatch(batch);
+ }
+ }
+
+ TypeDescription readSchema = fileSchema
+ .clone()
+ .addField("missing", TypeDescription.createInt());
+ Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf));
+
+ try (RecordReaderImpl rows = (RecordReaderImpl) reader.rows(
+ reader.options()
+ .schema(readSchema)
+ .setRowFilter(new String[]{"missing"}, TestRowFilteringSkip::notNullFilterMissing))) {
+ VectorizedRowBatch batch = readSchema.createRowBatchV2();
+
+ long rowCount = 0;
+ while (rows.nextBatch(batch)) {
+ // All rows are selected as NullTreeReader does not support filters
+ Assert.assertFalse(batch.selectedInUse);
+ rowCount += batch.size;
+ }
+ Assert.assertEquals(reader.getNumberOfRows(), rowCount);
+ }
+ }
+
+ private static void notNullFilterMissing(VectorizedRowBatch batch) {
+ int selIdx = 0;
+ for (int i = 0; i < batch.size; i++) {
+ if (!batch.cols[2].isNull[i]) {
+ batch.selected[selIdx++] = i;
+ }
+ }
+ batch.selectedInUse = true;
+ batch.size = selIdx;
+ }
+
+ @Test
public void testcustomFileTimestampRoundRobbinRowFilterCallback() throws Exception {
testFilePath = new Path(getClass().getClassLoader().
getSystemResource("orc_split_elim.orc").getPath());
diff --git a/java/mapreduce/src/test/org/apache/orc/mapred/TestOrcFileEvolution.java b/java/mapreduce/src/test/org/apache/orc/mapred/TestOrcFileEvolution.java
index 7e907b7..0b5e037 100644
--- a/java/mapreduce/src/test/org/apache/orc/mapred/TestOrcFileEvolution.java
+++ b/java/mapreduce/src/test/org/apache/orc/mapred/TestOrcFileEvolution.java
@@ -30,6 +30,7 @@ import org.apache.hadoop.mapred.Reporter;
import org.apache.orc.*;
import org.apache.orc.TypeDescription.Category;
import org.apache.orc.impl.SchemaEvolution;
+import org.junit.Assert;
import org.junit.Before;
import org.junit.Rule;
import org.junit.Test;
@@ -233,9 +234,22 @@ public class TestOrcFileEvolution {
}
@Test
+ public void testMissingColumnFromReaderSchema() {
+ // Expect failure if the column is missing from the reader schema, as column a that is added
+ // by the SArg is missing from the reader schema
+ Assert.assertThrows("Field a not found in", IllegalArgumentException.class,
+ () -> checkEvolution("struct<b:int,c:string>",
+ "struct<b:int,c:string>",
+ struct(1, "foo"),
+ struct(1, "foo", null),
+ true, true, false));
+
+ }
+
+ @Test
public void testPreHive4243CheckEqual() {
// Expect success on equal schemas
- checkEvolution("struct<_col0:int,_col1:string>",
+ checkEvolutionPosn("struct<_col0:int,_col1:string>",
"struct<_col0:int,_col1:string>",
struct(1, "foo"),
struct(1, "foo", null), false, addSarg, false);
@@ -245,7 +259,7 @@ public class TestOrcFileEvolution {
public void testPreHive4243Check() {
// Expect exception on strict compatibility check
thrown.expectMessage("HIVE-4243");
- checkEvolution("struct<_col0:int,_col1:string>",
+ checkEvolutionPosn("struct<_col0:int,_col1:string>",
"struct<_col0:int,_col1:string,_col2:double>",
struct(1, "foo"),
struct(1, "foo", null), false, addSarg, false);
@@ -253,7 +267,7 @@ public class TestOrcFileEvolution {
@Test
public void testPreHive4243AddColumn() {
- checkEvolution("struct<_col0:int,_col1:string>",
+ checkEvolutionPosn("struct<_col0:int,_col1:string>",
"struct<_col0:int,_col1:string,_col2:double>",
struct(1, "foo"),
struct(1, "foo", null), true, addSarg, false);
@@ -263,7 +277,7 @@ public class TestOrcFileEvolution {
public void testPreHive4243AddColumnMiddle() {
// Expect exception on type mismatch
thrown.expect(SchemaEvolution.IllegalEvolutionException.class);
- checkEvolution("struct<_col0:int,_col1:double>",
+ checkEvolutionPosn("struct<_col0:int,_col1:double>",
"struct<_col0:int,_col1:date,_col2:double>",
struct(1, 1.0),
null, true, addSarg, false);
@@ -312,6 +326,26 @@ public class TestOrcFileEvolution {
false, addSarg, true);
}
+ private void checkEvolutionPosn(String writerType, String readerType,
+ Object inputRow, Object expectedOutput,
+ boolean tolerateSchema, boolean addSarg,
+ boolean positional) {
+ SearchArgument sArg = null;
+ String[] sCols = null;
+ if (addSarg) {
+ sArg = SearchArgumentFactory
+ .newBuilder()
+ .lessThan("_col0", PredicateLeaf.Type.LONG, 10L)
+ .build();
+ sCols = new String[]{null, "_col0", null};
+ }
+
+ checkEvolution(writerType, readerType,
+ inputRow, expectedOutput,
+ tolerateSchema,
+ sArg, sCols, positional);
+ }
+
private void checkEvolution(String writerType, String readerType,
Object inputRow, Object expectedOutput,
boolean tolerateSchema, boolean addSarg,