You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@drill.apache.org by GitBox <gi...@apache.org> on 2018/08/18 14:10:45 UTC

[GitHub] ppadma commented on a change in pull request #1429: DRILL-6676: Add Union, List and Repeated List types to Result Set Loader

ppadma commented on a change in pull request #1429: DRILL-6676: Add Union, List and Repeated List types to Result Set Loader
URL: https://github.com/apache/drill/pull/1429#discussion_r211061558
 
 

 ##########
 File path: exec/java-exec/src/test/java/org/apache/drill/exec/physical/rowSet/impl/TestResultSetLoaderUnions.java
 ##########
 @@ -0,0 +1,647 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.drill.exec.physical.rowSet.impl;
+
+import static org.apache.drill.test.rowSet.RowSetUtilities.listValue;
+import static org.apache.drill.test.rowSet.RowSetUtilities.mapValue;
+import static org.apache.drill.test.rowSet.RowSetUtilities.strArray;
+import static org.apache.drill.test.rowSet.RowSetUtilities.variantArray;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertSame;
+import static org.junit.Assert.assertTrue;
+
+import java.util.Arrays;
+
+import org.apache.drill.common.types.TypeProtos.DataMode;
+import org.apache.drill.common.types.TypeProtos.MajorType;
+import org.apache.drill.common.types.TypeProtos.MinorType;
+import org.apache.drill.common.types.Types;
+import org.apache.drill.exec.physical.rowSet.ResultSetLoader;
+import org.apache.drill.exec.physical.rowSet.RowSetLoader;
+import org.apache.drill.exec.record.MaterializedField;
+import org.apache.drill.exec.record.metadata.ColumnMetadata;
+import org.apache.drill.exec.record.metadata.MetadataUtils;
+import org.apache.drill.exec.record.metadata.TupleMetadata;
+import org.apache.drill.exec.vector.NullableIntVector;
+import org.apache.drill.exec.vector.NullableVarCharVector;
+import org.apache.drill.exec.vector.ValueVector;
+import org.apache.drill.exec.vector.accessor.ArrayWriter;
+import org.apache.drill.exec.vector.accessor.ObjectType;
+import org.apache.drill.exec.vector.accessor.ObjectWriter;
+import org.apache.drill.exec.vector.accessor.ScalarWriter;
+import org.apache.drill.exec.vector.accessor.TupleWriter;
+import org.apache.drill.exec.vector.accessor.ValueType;
+import org.apache.drill.exec.vector.accessor.VariantWriter;
+import org.apache.drill.exec.vector.accessor.writer.EmptyListShim;
+import org.apache.drill.exec.vector.accessor.writer.ListWriterImpl;
+import org.apache.drill.exec.vector.accessor.writer.SimpleListShim;
+import org.apache.drill.exec.vector.accessor.writer.UnionVectorShim;
+import org.apache.drill.exec.vector.accessor.writer.UnionWriterImpl;
+import org.apache.drill.exec.vector.complex.ListVector;
+import org.apache.drill.exec.vector.complex.UnionVector;
+import org.apache.drill.test.SubOperatorTest;
+import org.apache.drill.test.rowSet.RowSet;
+import org.apache.drill.test.rowSet.RowSet.SingleRowSet;
+import org.apache.drill.test.rowSet.RowSetBuilder;
+import org.apache.drill.test.rowSet.RowSetReader;
+import org.apache.drill.test.rowSet.RowSetUtilities;
+import org.apache.drill.test.rowSet.schema.SchemaBuilder;
+import org.junit.Test;
+
+import com.google.common.base.Charsets;
+
+/**
+ * Tests the result set loader support for union vectors. Union vectors
+ * are only lightly supported in Apache Drill and not supported at all
+ * in commercial versions. They have may problems: both in code and in theory.
+ * Most operators do not support them. But, JSON uses them, so they must
+ * be made to work in the result set loader layer.
+ */
+
+public class TestResultSetLoaderUnions extends SubOperatorTest {
+
+  @Test
+  public void testUnionBasics() {
+    final TupleMetadata schema = new SchemaBuilder()
+        .add("id", MinorType.INT)
+        .addUnion("u")
+          .addType(MinorType.VARCHAR)
+          .addMap()
+            .addNullable("a", MinorType.INT)
+            .addNullable("b", MinorType.VARCHAR)
+            .resumeUnion()
+          .resumeSchema()
+        .buildSchema();
+
+    final ResultSetLoaderImpl.ResultSetOptions options = new OptionBuilder()
+        .setSchema(schema)
+        .build();
+    final ResultSetLoader rsLoader = new ResultSetLoaderImpl(fixture.allocator(), options);
+    final RowSetLoader writer = rsLoader.writer();
+
+    // Sanity check of writer structure
+
+    final ObjectWriter wo = writer.column(1);
+    assertEquals(ObjectType.VARIANT, wo.type());
+    final VariantWriter vw = wo.variant();
+
+    assertTrue(vw.hasType(MinorType.VARCHAR));
+    assertNotNull(vw.memberWriter(MinorType.VARCHAR));
+
+    assertTrue(vw.hasType(MinorType.MAP));
+    assertNotNull(vw.memberWriter(MinorType.MAP));
+
+    // Write values
+
+    rsLoader.startBatch();
+    writer
+      .addRow(1, "first")
+      .addRow(2, mapValue(20, "fred"))
+      .addRow(3, null)
+      .addRow(4, mapValue(40, null))
+      .addRow(5, "last");
+
+    // Verify the values.
+    // (Relies on the row set level union tests having passed.)
+
+    final SingleRowSet expected = fixture.rowSetBuilder(schema)
+      .addRow(1, "first")
+      .addRow(2, mapValue(20, "fred"))
+      .addRow(3, null)
+      .addRow(4, mapValue(40, null))
+      .addRow(5, "last")
+      .build();
+
+    RowSetUtilities.verify(expected, fixture.wrap(rsLoader.harvest()));
+  }
+
+  @Test
+  public void testUnionAddTypes() {
+    final ResultSetLoader rsLoader = new ResultSetLoaderImpl(fixture.allocator());
+    final RowSetLoader writer = rsLoader.writer();
+
+    rsLoader.startBatch();
+
+    // First row, (1, "first"), create types as we go.
+
+    writer.start();
+    writer.addColumn(SchemaBuilder.columnSchema("id", MinorType.INT, DataMode.REQUIRED));
+    writer.scalar("id").setInt(1);
+    writer.addColumn(SchemaBuilder.columnSchema("u", MinorType.UNION, DataMode.OPTIONAL));
+    final VariantWriter variant = writer.column("u").variant();
+    variant.member(MinorType.VARCHAR).scalar().setString("first");
+    writer.save();
+
+    // Second row, (2, {20, "fred"}), create types as we go.
+
+    writer.start();
+    writer.scalar("id").setInt(2);
+    final TupleWriter innerMap = variant.member(MinorType.MAP).tuple();
+    innerMap.addColumn(SchemaBuilder.columnSchema("a", MinorType.INT, DataMode.OPTIONAL));
+    innerMap.scalar("a").setInt(20);
+    innerMap.addColumn(SchemaBuilder.columnSchema("b", MinorType.VARCHAR, DataMode.OPTIONAL));
+    innerMap.scalar("b").setString("fred");
+    writer.save();
+
+    // Write remaining rows using convenient methods, using
+    // schema defined above.
+
+    writer
+      .addRow(3, null)
+      .addRow(4, mapValue(40, null))
+      .addRow(5, "last")
+      ;
+
+    // Verify the values.
+    // (Relies on the row set level union tests having passed.)
+
+    final TupleMetadata schema = new SchemaBuilder()
+        .add("id", MinorType.INT)
+        .addUnion("u")
+          .addType(MinorType.VARCHAR)
+          .addMap()
+            .addNullable("a", MinorType.INT)
+            .addNullable("b", MinorType.VARCHAR)
+            .resumeUnion()
+          .resumeSchema()
+        .buildSchema();
+
+    final SingleRowSet expected = fixture.rowSetBuilder(schema)
+      .addRow(1, "first")
+      .addRow(2, mapValue(20, "fred"))
+      .addRow(3, null)
+      .addRow(4, mapValue(40, null))
+      .addRow(5, "last")
+      .build();
+
+    final RowSet result = fixture.wrap(rsLoader.harvest());
+    RowSetUtilities.verify(expected, result);
+  }
+
+  @Test
+  public void testUnionOverflow() {
+    final TupleMetadata schema = new SchemaBuilder()
+        .add("id", MinorType.INT)
+        .addUnion("u")
+          .addType(MinorType.INT)
+          .addType(MinorType.VARCHAR)
+          .resumeSchema()
+        .buildSchema();
+
+    final ResultSetLoaderImpl.ResultSetOptions options = new OptionBuilder()
+        .setRowCountLimit(ValueVector.MAX_ROW_COUNT)
+        .setSchema(schema)
+        .build();
+    final ResultSetLoader rsLoader = new ResultSetLoaderImpl(fixture.allocator(), options);
+    final RowSetLoader writer = rsLoader.writer();
+
+    // Fill the batch with enough data to cause overflow.
+    // Fill even rows with a Varchar, odd rows with an int.
+    // Data must be large enough to cause overflow before 32K rows
+    // (the half that get strings.
+    // 16 MB / 32 K = 512 bytes
+    // Make a bit bigger to overflow early.
+
+    final int strLength = 600;
+    final byte value[] = new byte[strLength - 6];
+    Arrays.fill(value, (byte) 'X');
+    final String strValue = new String(value, Charsets.UTF_8);
+    int count = 0;
+
+    rsLoader.startBatch();
+    while (! writer.isFull()) {
+      if (count % 2 == 0) {
+        writer.addRow(count, String.format("%s%06d", strValue, count));
+      } else {
+        writer.addRow(count, count * 10);
+      }
+      count++;
+    }
+
+    // Number of rows should be driven by vector size.
+    // Our row count should include the overflow row
+
+    final int expectedCount = ValueVector.MAX_BUFFER_SIZE / strLength * 2;
+    assertEquals(expectedCount + 1, count);
+
+    // Loader's row count should include only "visible" rows
+
+    assertEquals(expectedCount, writer.rowCount());
+
+    // Total count should include invisible and look-ahead rows.
+
+    assertEquals(expectedCount + 1, rsLoader.totalRowCount());
+
+    // Result should exclude the overflow row
+
+    RowSet result = fixture.wrap(rsLoader.harvest());
+    assertEquals(expectedCount, result.rowCount());
+
+    // Verify the data.
+
+    RowSetReader reader = result.reader();
+    int readCount = 0;
+    while (reader.next()) {
+      assertEquals(readCount, reader.scalar(0).getInt());
+      if (readCount % 2 == 0) {
+        assertEquals(String.format("%s%06d", strValue, readCount),
+            reader.variant(1).scalar().getString());
+      } else {
+        assertEquals(readCount * 10, reader.variant(1).scalar().getInt());
+      }
+      readCount++;
+    }
+    assertEquals(readCount, result.rowCount());
+    result.clear();
+
+    // Write a few more rows to verify the overflow row.
+
+    rsLoader.startBatch();
+    for (int i = 0; i < 1000; i++) {
+      if (count % 2 == 0) {
+        writer.addRow(count, String.format("%s%06d", strValue, count));
+      } else {
+        writer.addRow(count, count * 10);
+      }
+      count++;
+    }
+
+    result = fixture.wrap(rsLoader.harvest());
+    assertEquals(1001, result.rowCount());
+
+    final int startCount = readCount;
+    reader = result.reader();
+    while (reader.next()) {
+      assertEquals(readCount, reader.scalar(0).getInt());
+      if (readCount % 2 == 0) {
+        assertEquals(String.format("%s%06d", strValue, readCount),
+            reader.variant(1).scalar().getString());
+      } else {
+        assertEquals(readCount * 10, reader.variant(1).scalar().getInt());
+      }
+      readCount++;
+    }
+    assertEquals(readCount - startCount, result.rowCount());
+    result.clear();
+    rsLoader.close();
+  }
+
+  /**
+   * Test for the case of a list defined to contain exactly one type.
+   * Relies on the row set tests to verify that the single type model
+   * works for lists. Here we test that the ResultSetLoader put the
+   * pieces together correctly.
+   */
+
+  @Test
+  public void testSimpleList() {
+
+    // Schema with a list declared with one type, not expandable
+
+    final TupleMetadata schema = new SchemaBuilder()
+        .add("id", MinorType.INT)
+        .addList("list")
+          .addType(MinorType.VARCHAR)
+          .resumeSchema()
+        .buildSchema();
+
+    schema.metadata("list").variantSchema().becomeSimple();
+
+    final ResultSetLoaderImpl.ResultSetOptions options = new OptionBuilder()
+        .setSchema(schema)
+        .build();
+    final ResultSetLoader rsLoader = new ResultSetLoaderImpl(fixture.allocator(), options);
+    final RowSetLoader writer = rsLoader.writer();
+
+    // Sanity check: should be an array of Varchar because we said the
+    // types within the list is not expandable.
+
+    final ArrayWriter arrWriter = writer.array("list");
+    assertEquals(ObjectType.SCALAR, arrWriter.entryType());
+    final ScalarWriter strWriter = arrWriter.scalar();
+    assertEquals(ValueType.STRING, strWriter.valueType());
+
+    // Can write a batch as if this was a repeated Varchar, except
+    // that any value can also be null.
+
+    rsLoader.startBatch();
+    writer
+      .addRow(1, strArray("fred", "barney"))
+      .addRow(2, null)
+      .addRow(3, strArray("wilma", "betty", "pebbles"))
+      ;
+
+    // Verify
+
+    final SingleRowSet expected = fixture.rowSetBuilder(schema)
+        .addRow(1, strArray("fred", "barney"))
+        .addRow(2, null)
+        .addRow(3, strArray("wilma", "betty", "pebbles"))
+        .build();
+
+    RowSetUtilities.verify(expected, fixture.wrap(rsLoader.harvest()));
+  }
+
+  /**
+   * Test a simple list created dynamically at load time.
+   * The list must include a single type member.
+   */
+
+  @Test
+  public void testSimpleListDynamic() {
+
+    final ResultSetLoader rsLoader = new ResultSetLoaderImpl(fixture.allocator());
+    final RowSetLoader writer = rsLoader.writer();
+
+    // Can write a batch as if this was a repeated Varchar, except
+    // that any value can also be null.
+
+    rsLoader.startBatch();
+
+    writer.addColumn(MaterializedField.create("id", Types.required(MinorType.INT)));
+
+    final ColumnMetadata colSchema = MetadataUtils.newVariant("list", DataMode.REPEATED);
+    colSchema.variantSchema().addType(MinorType.VARCHAR);
+    colSchema.variantSchema().becomeSimple();
+    writer.addColumn(colSchema);
+
+    // Sanity check: should be an array of Varchar because we said the
+    // types within the list is not expandable.
+
+    final ArrayWriter arrWriter = writer.array("list");
+    assertEquals(ObjectType.SCALAR, arrWriter.entryType());
+    final ScalarWriter strWriter = arrWriter.scalar();
+    assertEquals(ValueType.STRING, strWriter.valueType());
+
+    writer
+      .addRow(1, strArray("fred", "barney"))
+      .addRow(2, null)
+      .addRow(3, strArray("wilma", "betty", "pebbles"))
+      ;
+
+    // Verify
+
+    final TupleMetadata schema = new SchemaBuilder()
+        .add("id", MinorType.INT)
+        .addList("list")
+          .addType(MinorType.VARCHAR)
+          .resumeSchema()
+        .buildSchema();
+    final SingleRowSet expected = fixture.rowSetBuilder(schema)
+        .addRow(1, strArray("fred", "barney"))
+        .addRow(2, null)
+        .addRow(3, strArray("wilma", "betty", "pebbles"))
+        .build();
+
+    RowSetUtilities.verify(expected, fixture.wrap(rsLoader.harvest()));
+  }
+
+  /**
+   * Try to create a simple (non-expandable) list without
+   * giving a member type. Expected to fail.
+   */
+
+  @Test
+  public void testSimpleListNoTypes() {
+
+    // Schema with a list declared with one type, not expandable
+
+    final TupleMetadata schema = new SchemaBuilder()
+        .add("id", MinorType.INT)
+        .addList("list")
+          .resumeSchema()
+        .buildSchema();
+
+    try {
+      schema.metadata("list").variantSchema().becomeSimple();
+    } catch (final IllegalStateException e) {
+      // expected
+    }
+  }
+
+  /**
+   * Try to create a simple (non-expandable) list while specifying
+   * two types. Expected to fail.
+   */
+
+  @Test
+  public void testSimpleListMultiTypes() {
+
+    // Schema with a list declared with one type, not expandable
+
+    final TupleMetadata schema = new SchemaBuilder()
+        .add("id", MinorType.INT)
+        .addList("list")
+          .addType(MinorType.VARCHAR)
+          .addType(MinorType.INT)
+          .resumeSchema()
+        .buildSchema();
+
+    try {
+      schema.metadata("list").variantSchema().becomeSimple();
+    } catch (final IllegalStateException e) {
+      // expected
+    }
+  }
+
+
+  /**
+   * Test a variant list created dynamically at load time.
+   * The list starts with no type, at which time it can hold
+   * only null values. Then we add a Varchar, and finally an
+   * Int.
+   * <p>
+   * This test is superficial. There are many odd cases to consider.
+   * <ul>
+   * <li>Write nulls to a list with no type. (This test ensures that
+   * adding a (nullable) scalar "does the right thing.")</li>
+   * <li>Add a map to the list. Maps carry no "bits" vector, so null
+   * list entries to that point are lost. (For maps, we could go straight
+   * to a union, with just a map, to preserve the null states. This whole
+   * area is a huge mess...)</li>
+   * <li>Do the type transitions when writing to a row. (The tests here
+   * do the transition between rows.</li>
+   * </ul>
+   *
+   * The reason for the sparse coverage is that Drill barely supports lists
+   * and unions; most code is just plain broken. Our goal here is not to fix
+   * all those problems, just to leave things no more broken than before.
+   */
+
+  @Test
+  public void testVariantListDynamic() {
+
+    final ResultSetLoader rsLoader = new ResultSetLoaderImpl(fixture.allocator());
+    final RowSetLoader writer = rsLoader.writer();
+
+    // Can write a batch as if this was a repeated Varchar, except
+    // that any value can also be null.
+
+    rsLoader.startBatch();
+
+    writer.addColumn(MaterializedField.create("id", Types.required(MinorType.INT)));
+    writer.addColumn(MaterializedField.create("list", Types.optional(MinorType.LIST)));
+
+    // Sanity check: should be an array of variants because we said the
+    // types within the list are expandable (which is the default.)
+
+    final ArrayWriter arrWriter = writer.array("list");
+    assertEquals(ObjectType.VARIANT, arrWriter.entryType());
+    final VariantWriter variant = arrWriter.variant();
+
+    // Don't try this at home: sniff the internal state of the writer.
 
 Review comment:
   Can you check on this comment ? Do you mean to remove it ?

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


With regards,
Apache Git Services