You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@drill.apache.org by GitBox <gi...@apache.org> on 2019/03/27 07:54:25 UTC

[GitHub] [drill] paul-rogers commented on a change in pull request #1711: DRILL-7011: Support schema in scan framework

paul-rogers commented on a change in pull request #1711: DRILL-7011: Support schema in scan framework
URL: https://github.com/apache/drill/pull/1711#discussion_r269437772
 
 

 ##########
 File path: exec/java-exec/src/test/java/org/apache/drill/exec/store/easy/text/compliant/TestCsvWithSchema.java
 ##########
 @@ -82,6 +167,468 @@ public void testSchema() throws Exception {
           .addRow(10, new LocalDate(2019, 3, 20), "it works!", 1234.5D, 20L, "")
           .build();
       RowSetUtilities.verify(expected, actual);
+    } finally {
+      resetV3();
+      resetSchema();
+    }
+  }
+
+
+  /**
+   * Use a schema with explicit projection to get a consistent view
+   * of the table schema, even if columns are missing, rows are ragged,
+   * and column order changes.
+   * <p>
+   * Force the scans to occur in distinct fragments so the order of the
+   * file batches is random.
+   */
+  @Test
+  public void testMultiFileSchema() throws Exception {
+    RowSet expected1 = null;
+    RowSet expected2 = null;
+    try {
+      enableV3(true);
+      enableSchema(true);
+      enableMultiScan();
+      String tablePath = buildTwoFileTable("multiFileSchema", raggedMulti1Contents, reordered2Contents);
+      run(SCHEMA_SQL, tablePath);
+
+      // Wildcard expands to union of schema + table. In this case
+      // all table columns appear in the schema (though not all schema
+      // columns appear in the table.)
+
+      String sql = "SELECT id, `name`, `date`, gender, comment FROM " + tablePath;
+      TupleMetadata expectedSchema = new SchemaBuilder()
+          .add("id", MinorType.INT)
+          .add("name", MinorType.VARCHAR)
+          .addNullable("date", MinorType.DATE)
+          .add("gender", MinorType.VARCHAR)
+          .add("comment", MinorType.VARCHAR)
+          .buildSchema();
+      expected1 = new RowSetBuilder(client.allocator(), expectedSchema)
+          .addRow(1, "arina", new LocalDate(2019, 1, 18), "female", "ABC")
+          .addRow(2, "javan", new LocalDate(2019, 1, 19), "male", "ABC")
+          .addRow(4, "albert", new LocalDate(2019, 5, 4), "", "ABC")
+          .build();
+      expected2 = new RowSetBuilder(client.allocator(), expectedSchema)
+          .addRow(3, "bob", new LocalDate(2001, 1, 16), "NA", "ABC")
+          .build();
+
+      // Loop 10 times so that, as the two reader fragments read the two
+      // files, we end up with (acceptable) races that read the files in
+      // random order.
+
+      for (int i = 0; i < 10; i++) {
+        boolean sawSchema = false;
+        boolean sawFile1 = false;
+        boolean sawFile2 = false;
+        Iterator<DirectRowSet> iter = client.queryBuilder().sql(sql).rowSetIterator();
+        while (iter.hasNext()) {
+          RowSet result = iter.next();
+          if (result.rowCount() == 3) {
+            sawFile1 = true;
+            new RowSetComparison(expected1).verifyAndClear(result);
+          } else if (result.rowCount() == 1) {
+            sawFile2 = true;
+            new RowSetComparison(expected2).verifyAndClear(result);
+          } else {
+            assertEquals(0, result.rowCount());
+            sawSchema = true;
+          }
+        }
+        assertTrue(sawSchema);
+        assertTrue(sawFile1);
+        assertTrue(sawFile2);
+      }
+    } finally {
+      expected1.clear();
+      expected2.clear();
+      client.resetSession(ExecConstants.ENABLE_V3_TEXT_READER_KEY);
+      client.resetSession(ExecConstants.STORE_TABLE_USE_SCHEMA_FILE);
+      client.resetSession(ExecConstants.MIN_READER_WIDTH_KEY);
+    }
+  }
+
+  /**
+   * Test the schema we get in V2 when the table read order is random.
+   * Worst-case: the two files have different column counts and
+   * column orders.
+   * <p>
+   * Though the results are random, we iterate 10 times which, in most runs,
+   * shows the random variation in schemas:
+   * <ul>
+   * <li>Sometimes the first batch has three columns, sometimes four.</li>
+   * <li>Sometimes the column `id` is in position 0, sometimes in position 1
+   * (correlated with the above).</li>
+   * <li>Due to the fact that sometimes the first file (with four columns)
+   * is returned first, sometimes the second file (with three columns) is
+   * returned first.</li>
+   * </ul>
+   */
+  @Test
+  public void testSchemaRaceV2() throws Exception {
+    try {
+      enableV3(false);
+      enableSchema(false);
+      enableMultiScan();
+      String tablePath = buildTwoFileTable("schemaRaceV2", multi1Contents, reordered2Contents);
+      boolean sawFile1First = false;
+      boolean sawFile2First = false;
+      boolean sawFullSchema = false;
+      boolean sawPartialSchema = false;
+      boolean sawIdAsCol0 = false;
+      boolean sawIdAsCol1 = false;
+      String sql = "SELECT * FROM " + tablePath;
+      for (int i = 0; i < 10; i++) {
+        Iterator<DirectRowSet> iter = client.queryBuilder().sql(sql).rowSetIterator();
+        int batchCount = 0;
+        while(iter.hasNext()) {
+          batchCount++;
+          RowSet result = iter.next();
+          TupleMetadata resultSchema = result.schema();
+          if (resultSchema.size() == 4) {
+            sawFullSchema = true;
+          } else {
+            assertEquals(3, resultSchema.size());
+            sawPartialSchema = true;
+          }
+          if (resultSchema.index("id") == 0) {
+            sawIdAsCol0 = true;
+          } else {
+            assertEquals(1, resultSchema.index("id"));
+            sawIdAsCol1 = true;
+          }
+          if (batchCount == 1) {
+            RowSetReader reader = result.reader();
+            assertTrue(reader.next());
+            String id = reader.scalar("id").getString();
+            if (id.equals("1")) {
+              sawFile1First = true;
+            } else {
+              assertEquals("3", id);
+              sawFile2First = true;
+            }
+          }
+          result.clear();
+        }
+      }
+
+      // Outcome is random (which is the key problem). Don't assert on these
+      // because doing so can lead to a flakey test.
+
+      if (!sawFile1First || ! sawFile2First || !sawFullSchema || !sawPartialSchema || !sawIdAsCol0 || !sawIdAsCol1) {
+        System.out.println("Some variations did not occur");
 
Review comment:
   Actually, this is a poor person's "test": it cannot actually fail when some variations don't occur, because asserting on them would make the test flaky. Since this is more of a demo of existing problems, it is not really a test. We'll remove this code when we switch to V3 as the default.

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


With regards,
Apache Git Services