You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@drill.apache.org by dz...@apache.org on 2022/10/21 11:49:17 UTC

[drill] 04/09: DRILL-8280: Cannot ANALYZE files containing non-ASCII column names (#2625)

This is an automated email from the ASF dual-hosted git repository.

dzamo pushed a commit to branch 1.20
in repository https://gitbox.apache.org/repos/asf/drill.git

commit e6f67d254191d51b5a1c5615dbeef92798ef04d2
Author: James Turton <91...@users.noreply.github.com>
AuthorDate: Thu Aug 18 02:12:44 2022 +0800

    DRILL-8280: Cannot ANALYZE files containing non-ASCII column names (#2625)
---
 .../drill/exec/expr/fn/impl/SchemaFunctions.java   |   5 +-
 .../drill/exec/sql/TestMetastoreCommands.java      | 193 ++++++++++++---------
 2 files changed, 116 insertions(+), 82 deletions(-)

diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/SchemaFunctions.java b/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/SchemaFunctions.java
index 7bce81be08..a71447eb4a 100644
--- a/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/SchemaFunctions.java
+++ b/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/SchemaFunctions.java
@@ -151,8 +151,9 @@ public class SchemaFunctions {
       }
 
       org.apache.drill.exec.record.metadata.TupleMetadata currentSchema =
-          org.apache.drill.exec.expr.fn.impl.SchemaFunctions.getTupleMetadata(
-              org.apache.drill.common.util.DrillStringUtils.toBinaryString(input.buffer, input.start, input.end));
+        org.apache.drill.exec.expr.fn.impl.SchemaFunctions.getTupleMetadata(
+          org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.toStringFromUTF8(input.start, input.end, input.buffer));
+
       if (schemaHolder.obj == null) {
         schemaHolder.obj = currentSchema;
         return;
diff --git a/exec/java-exec/src/test/java/org/apache/drill/exec/sql/TestMetastoreCommands.java b/exec/java-exec/src/test/java/org/apache/drill/exec/sql/TestMetastoreCommands.java
index 75f90868e4..7f42403c9f 100644
--- a/exec/java-exec/src/test/java/org/apache/drill/exec/sql/TestMetastoreCommands.java
+++ b/exec/java-exec/src/test/java/org/apache/drill/exec/sql/TestMetastoreCommands.java
@@ -64,10 +64,12 @@ import java.nio.file.Paths;
 import java.time.Instant;
 import java.time.LocalDateTime;
 import java.time.ZoneId;
+import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collections;
 import java.util.HashMap;
 import java.util.HashSet;
+import java.util.LinkedHashMap;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
@@ -100,85 +102,87 @@ public class TestMetastoreCommands extends ClusterTest {
       .build();
 
   public static final Map<SchemaPath, ColumnStatistics<?>> TABLE_COLUMN_STATISTICS =
-      ImmutableMap.<SchemaPath, ColumnStatistics<?>>builder()
-      .put(SchemaPath.getSimplePath("o_shippriority"),
-          getColumnStatistics(0, 0, 120L, TypeProtos.MinorType.INT))
-      .put(SchemaPath.getSimplePath("o_orderstatus"),
-          getColumnStatistics("F", "P", 120L, TypeProtos.MinorType.VARCHAR))
-      .put(SchemaPath.getSimplePath("o_orderpriority"),
-          getColumnStatistics("1-URGENT", "5-LOW", 120L, TypeProtos.MinorType.VARCHAR))
-      .put(SchemaPath.getSimplePath("o_orderkey"),
-          getColumnStatistics(1, 1319, 120L, TypeProtos.MinorType.INT))
-      .put(SchemaPath.getSimplePath("o_clerk"),
-          getColumnStatistics("Clerk#000000004", "Clerk#000000995", 120L, TypeProtos.MinorType.VARCHAR))
-      .put(SchemaPath.getSimplePath("o_totalprice"),
-          getColumnStatistics(3266.69, 350110.21, 120L, TypeProtos.MinorType.FLOAT8))
-      .put(SchemaPath.getSimplePath("o_comment"),
+    new LinkedHashMap<SchemaPath, ColumnStatistics<?>>()
+    {{
+      put(SchemaPath.getSimplePath("o_shippriority"),
+          getColumnStatistics(0, 0, 120L, TypeProtos.MinorType.INT));
+      put(SchemaPath.getSimplePath("o_orderstatus"),
+          getColumnStatistics("F", "P", 120L, TypeProtos.MinorType.VARCHAR));
+      put(SchemaPath.getSimplePath("o_orderpriority"),
+          getColumnStatistics("1-URGENT", "5-LOW", 120L, TypeProtos.MinorType.VARCHAR));
+      put(SchemaPath.getSimplePath("o_orderkey"),
+          getColumnStatistics(1, 1319, 120L, TypeProtos.MinorType.INT));
+      put(SchemaPath.getSimplePath("o_clerk"),
+          getColumnStatistics("Clerk#000000004", "Clerk#000000995", 120L, TypeProtos.MinorType.VARCHAR));
+      put(SchemaPath.getSimplePath("o_totalprice"),
+          getColumnStatistics(3266.69, 350110.21, 120L, TypeProtos.MinorType.FLOAT8));
+      put(SchemaPath.getSimplePath("o_comment"),
           getColumnStatistics(" about the final platelets. dependen",
-              "zzle. carefully enticing deposits nag furio", 120L, TypeProtos.MinorType.VARCHAR))
-      .put(SchemaPath.getSimplePath("o_custkey"),
-          getColumnStatistics(25, 1498, 120L, TypeProtos.MinorType.INT))
-      .put(SchemaPath.getSimplePath("dir0"),
-          getColumnStatistics("1994", "1996", 120L, TypeProtos.MinorType.VARCHAR))
-      .put(SchemaPath.getSimplePath("dir1"),
-          getColumnStatistics("Q1", "Q4", 120L, TypeProtos.MinorType.VARCHAR))
-      .put(SchemaPath.getSimplePath("o_orderdate"),
-          getColumnStatistics(757382400000L, 850953600000L, 120L, TypeProtos.MinorType.DATE))
-      .build();
+              "zzle. carefully enticing deposits nag furio", 120L, TypeProtos.MinorType.VARCHAR));
+      put(SchemaPath.getSimplePath("o_custkey"),
+          getColumnStatistics(25, 1498, 120L, TypeProtos.MinorType.INT));
+      put(SchemaPath.getSimplePath("dir0"),
+          getColumnStatistics("1994", "1996", 120L, TypeProtos.MinorType.VARCHAR));
+      put(SchemaPath.getSimplePath("dir1"),
+          getColumnStatistics("Q1", "Q4", 120L, TypeProtos.MinorType.VARCHAR));
+      put(SchemaPath.getSimplePath("o_orderdate"),
+          getColumnStatistics(757382400000L, 850953600000L, 120L, TypeProtos.MinorType.DATE));
+    }};
 
   public static final Map<SchemaPath, ColumnStatistics<?>> DIR0_1994_SEGMENT_COLUMN_STATISTICS =
-      ImmutableMap.<SchemaPath, ColumnStatistics<?>>builder()
-      .put(SchemaPath.getSimplePath("o_shippriority"),
-          getColumnStatistics(0, 0, 40L, TypeProtos.MinorType.INT))
-      .put(SchemaPath.getSimplePath("o_orderstatus"),
-          getColumnStatistics("F", "F", 40L, TypeProtos.MinorType.VARCHAR))
-      .put(SchemaPath.getSimplePath("o_orderpriority"),
-          getColumnStatistics("1-URGENT", "5-LOW", 40L, TypeProtos.MinorType.VARCHAR))
-      .put(SchemaPath.getSimplePath("o_orderkey"),
-          getColumnStatistics(5, 1031, 40L, TypeProtos.MinorType.INT))
-      .put(SchemaPath.getSimplePath("o_clerk"),
-          getColumnStatistics("Clerk#000000004", "Clerk#000000973", 40L, TypeProtos.MinorType.VARCHAR))
-      .put(SchemaPath.getSimplePath("o_totalprice"),
-          getColumnStatistics(3266.69, 350110.21, 40L, TypeProtos.MinorType.FLOAT8))
-      .put(SchemaPath.getSimplePath("o_comment"),
+    new LinkedHashMap<SchemaPath, ColumnStatistics<?>>()
+    {{
+        put(SchemaPath.getSimplePath("o_shippriority"),
+          getColumnStatistics(0, 0, 40L, TypeProtos.MinorType.INT));
+        put(SchemaPath.getSimplePath("o_orderstatus"),
+          getColumnStatistics("F", "F", 40L, TypeProtos.MinorType.VARCHAR));
+        put(SchemaPath.getSimplePath("o_orderpriority"),
+          getColumnStatistics("1-URGENT", "5-LOW", 40L, TypeProtos.MinorType.VARCHAR));
+        put(SchemaPath.getSimplePath("o_orderkey"),
+          getColumnStatistics(5, 1031, 40L, TypeProtos.MinorType.INT));
+        put(SchemaPath.getSimplePath("o_clerk"),
+          getColumnStatistics("Clerk#000000004", "Clerk#000000973", 40L, TypeProtos.MinorType.VARCHAR));
+        put(SchemaPath.getSimplePath("o_totalprice"),
+          getColumnStatistics(3266.69, 350110.21, 40L, TypeProtos.MinorType.FLOAT8));
+        put(SchemaPath.getSimplePath("o_comment"),
           getColumnStatistics(" accounts nag slyly. ironic, ironic accounts wake blithel",
-              "yly final requests over the furiously regula", 40L, TypeProtos.MinorType.VARCHAR))
-      .put(SchemaPath.getSimplePath("o_custkey"),
-          getColumnStatistics(25, 1469, 40L, TypeProtos.MinorType.INT))
-      .put(SchemaPath.getSimplePath("dir0"),
-          getColumnStatistics("1994", "1994", 40L, TypeProtos.MinorType.VARCHAR))
-      .put(SchemaPath.getSimplePath("dir1"),
-          getColumnStatistics("Q1", "Q4", 40L, TypeProtos.MinorType.VARCHAR))
-      .put(SchemaPath.getSimplePath("o_orderdate"),
-          getColumnStatistics(757382400000L, 788140800000L, 40L, TypeProtos.MinorType.DATE))
-      .build();
+            "yly final requests over the furiously regula", 40L, TypeProtos.MinorType.VARCHAR));
+        put(SchemaPath.getSimplePath("o_custkey"),
+          getColumnStatistics(25, 1469, 40L, TypeProtos.MinorType.INT));
+        put(SchemaPath.getSimplePath("dir0"),
+          getColumnStatistics("1994", "1994", 40L, TypeProtos.MinorType.VARCHAR));
+        put(SchemaPath.getSimplePath("dir1"),
+          getColumnStatistics("Q1", "Q4", 40L, TypeProtos.MinorType.VARCHAR));
+        put(SchemaPath.getSimplePath("o_orderdate"),
+          getColumnStatistics(757382400000L, 788140800000L, 40L, TypeProtos.MinorType.DATE));
+    }};
 
   public static final Map<SchemaPath, ColumnStatistics<?>> DIR0_1994_Q1_SEGMENT_COLUMN_STATISTICS =
-      ImmutableMap.<SchemaPath, ColumnStatistics<?>>builder()
-      .put(SchemaPath.getSimplePath("o_shippriority"),
-          getColumnStatistics(0, 0, 10L, TypeProtos.MinorType.INT))
-      .put(SchemaPath.getSimplePath("o_orderstatus"),
-          getColumnStatistics("F", "F", 10L, TypeProtos.MinorType.VARCHAR))
-      .put(SchemaPath.getSimplePath("o_orderpriority"),
-          getColumnStatistics("1-URGENT", "5-LOW", 10L, TypeProtos.MinorType.VARCHAR))
-      .put(SchemaPath.getSimplePath("o_orderkey"),
-          getColumnStatistics(66, 833, 10L, TypeProtos.MinorType.INT))
-      .put(SchemaPath.getSimplePath("o_clerk"),
-          getColumnStatistics("Clerk#000000062", "Clerk#000000973", 10L, TypeProtos.MinorType.VARCHAR))
-      .put(SchemaPath.getSimplePath("o_totalprice"),
-          getColumnStatistics(3266.69, 132531.73, 10L, TypeProtos.MinorType.FLOAT8))
-      .put(SchemaPath.getSimplePath("o_comment"),
+    new LinkedHashMap<SchemaPath, ColumnStatistics<?>>() {{
+        put(SchemaPath.getSimplePath("o_shippriority"),
+          getColumnStatistics(0, 0, 10L, TypeProtos.MinorType.INT));
+        put(SchemaPath.getSimplePath("o_orderstatus"),
+          getColumnStatistics("F", "F", 10L, TypeProtos.MinorType.VARCHAR));
+        put(SchemaPath.getSimplePath("o_orderpriority"),
+          getColumnStatistics("1-URGENT", "5-LOW", 10L, TypeProtos.MinorType.VARCHAR));
+        put(SchemaPath.getSimplePath("o_orderkey"),
+          getColumnStatistics(66, 833, 10L, TypeProtos.MinorType.INT));
+        put(SchemaPath.getSimplePath("o_clerk"),
+          getColumnStatistics("Clerk#000000062", "Clerk#000000973", 10L, TypeProtos.MinorType.VARCHAR));
+        put(SchemaPath.getSimplePath("o_totalprice"),
+          getColumnStatistics(3266.69, 132531.73, 10L, TypeProtos.MinorType.FLOAT8));
+        put(SchemaPath.getSimplePath("o_comment"),
           getColumnStatistics(" special pinto beans use quickly furiously even depende",
-              "y pending requests integrate", 10L, TypeProtos.MinorType.VARCHAR))
-      .put(SchemaPath.getSimplePath("o_custkey"),
-          getColumnStatistics(392, 1411, 10L, TypeProtos.MinorType.INT))
-      .put(SchemaPath.getSimplePath("dir0"),
-          getColumnStatistics("1994", "1994", 10L, TypeProtos.MinorType.VARCHAR))
-      .put(SchemaPath.getSimplePath("dir1"),
-          getColumnStatistics("Q1", "Q1", 10L, TypeProtos.MinorType.VARCHAR))
-      .put(SchemaPath.getSimplePath("o_orderdate"),
-          getColumnStatistics(757382400000L, 764640000000L, 10L, TypeProtos.MinorType.DATE))
-      .build();
+            "y pending requests integrate", 10L, TypeProtos.MinorType.VARCHAR));
+        put(SchemaPath.getSimplePath("o_custkey"),
+          getColumnStatistics(392, 1411, 10L, TypeProtos.MinorType.INT));
+        put(SchemaPath.getSimplePath("dir0"),
+          getColumnStatistics("1994", "1994", 10L, TypeProtos.MinorType.VARCHAR));
+        put(SchemaPath.getSimplePath("dir1"),
+          getColumnStatistics("Q1", "Q1", 10L, TypeProtos.MinorType.VARCHAR));
+        put(SchemaPath.getSimplePath("o_orderdate"),
+          getColumnStatistics(757382400000L, 764640000000L, 10L, TypeProtos.MinorType.DATE));
+    }};
 
   public static final MetadataInfo TABLE_META_INFO = MetadataInfo.builder()
       .type(MetadataType.TABLE)
@@ -3564,15 +3568,44 @@ public class TestMetastoreCommands extends ClusterTest {
     }
   }
 
-  public static <T> ColumnStatistics<T> getColumnStatistics(T minValue, T maxValue,
-      long rowCount, TypeProtos.MinorType minorType) {
+  @Test // DRILL-8280
+  public void testNonAsciiColumnName() throws Exception {
+    String tableName = "utf8_col_name";
+    String colName = "Käse";
+
+    run("create table dfs.tmp.%s as select 'Cheddar' as `%s`", tableName, colName);
+    try {
+      testBuilder()
+        .sqlQuery("analyze table dfs.tmp.`%s` refresh metadata", tableName)
+        .unOrdered()
+        .baselineColumns("ok", "summary")
+        .baselineValues(true, String.format("Collected / refreshed metadata for table [dfs.tmp.%s]", tableName))
+        .go();
+      String query = "select column_name from information_schema.`columns` where table_name='%s' and column_name='%s'";
+
+      testBuilder()
+        .sqlQuery(query, tableName, colName)
+        .unOrdered()
+        .baselineColumns("column_name")
+        .baselineValues(colName)
+        .go();
+    } finally {
+      run("analyze table dfs.tmp.`%s` drop metadata if exists", tableName);
+      run("drop table if exists dfs.tmp.`%s`", tableName);
+    }
+  }
+
+
+  public static <T> ColumnStatistics<T> getColumnStatistics(T minValue, T maxValue, long rowCount,
+                                                            TypeProtos.MinorType minorType) {
     return new ColumnStatistics<>(
-        Arrays.asList(
-            new StatisticsHolder<>(minValue, ColumnStatisticsKind.MIN_VALUE),
-            new StatisticsHolder<>(maxValue, ColumnStatisticsKind.MAX_VALUE),
-            new StatisticsHolder<>(rowCount, TableStatisticsKind.ROW_COUNT),
-            new StatisticsHolder<>(rowCount, ColumnStatisticsKind.NON_NULL_VALUES_COUNT),
-            new StatisticsHolder<>(0L, ColumnStatisticsKind.NULLS_COUNT)),
+      new ArrayList() {{
+          add(new StatisticsHolder<>(minValue, ColumnStatisticsKind.MIN_VALUE));
+          add(new StatisticsHolder<>(maxValue, ColumnStatisticsKind.MAX_VALUE));
+          add(new StatisticsHolder<>(rowCount, TableStatisticsKind.ROW_COUNT));
+          add(new StatisticsHolder<>(rowCount, ColumnStatisticsKind.NON_NULL_VALUES_COUNT));
+          add(new StatisticsHolder<>(0L, ColumnStatisticsKind.NULLS_COUNT));
+        }},
         minorType);
   }