Posted to commits@drill.apache.org by cg...@apache.org on 2021/02/17 15:08:00 UTC

[drill] branch master updated: DRILL-7861: Add Function to Obtain Inner Map Schemata

This is an automated email from the ASF dual-hosted git repository.

cgivre pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/drill.git


The following commit(s) were added to refs/heads/master by this push:
     new 39c4cc8  DRILL-7861: Add Function to Obtain Inner Map Schemata
39c4cc8 is described below

commit 39c4cc8482cabd36ada4e50effe68e0738ddbd67
Author: Charles S. Givre <cg...@apache.org>
AuthorDate: Wed Feb 17 10:07:51 2021 -0500

    DRILL-7861: Add Function to Obtain Inner Map Schemata
    
    * Initial commit
    
    * Addressed Review Comments
    
    * Added to unit tests
---
 contrib/udfs/README.md                             |  37 +++++++
 .../drill/exec/udfs/ComplexSchemaFunctions.java    |  69 ++++++++++++
 .../apache/drill/exec/udfs/ComplexSchemaUtils.java |  68 ++++++++++++
 .../exec/udfs/TestComplexSchemaFunctions.java      | 119 +++++++++++++++++++++
 .../udfs/src/test/resources/json/nestedSchema.json |  21 ++++
 5 files changed, 314 insertions(+)

diff --git a/contrib/udfs/README.md b/contrib/udfs/README.md
index 3ef761c..ae65e1d 100644
--- a/contrib/udfs/README.md
+++ b/contrib/udfs/README.md
@@ -272,3 +272,40 @@ The function can also be called with an optional field as an argument. IE:
 SELECT parse_user_agent( `user_agent`, 'AgentName` ) as AgentName ...
 ```
 which will just return the requested field. If the user agent string is empty, all fields will have the value of `Hacker`.  
+
+## Map Schema Function
+This function allows you to drill down into the schema of maps. The REST API and JDBC interfaces report only `MAP` or `LIST` as the type of a map column; they do not expose the
+schema of the inner map. The function `getMapSchema(<MAP>)` returns a `MAP` of the field names and their data types.
+
+### Example Usage
+
+Given the JSON record shown below, the following query returns the schema of the `record` map:
+```bash
+apache drill> SELECT getMapSchema(record) AS schema FROM dfs.test.`schema_test.json`;
++----------------------------------------------------------------------------------+
+|                                      schema                                      |
++----------------------------------------------------------------------------------+
+| {"int_field":"BIGINT","double_field":"FLOAT8","string_field":"VARCHAR","int_list":"REPEATED_BIGINT","double_list":"REPEATED_FLOAT8","map":"MAP"} |
++----------------------------------------------------------------------------------+
+1 row selected (0.298 seconds)
+```
+
+```json
+{
+  "record" : {
+    "int_field": 1,
+    "double_field": 2.0,
+    "string_field": "My string",
+    "int_list": [1,2,3],
+    "double_list": [1.0,2.0,3.0],
+    "map": {
+      "nested_int_field" : 5,
+      "nested_double_field": 5.0,
+      "nested_string_field": "5.0"
+    }
+  },
+  "single_field": 10
+}
+```
+
+The function returns an empty map if the field is `null` or is not a map.
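+
+The function can also drill into nested maps. As a sketch based on the unit tests added in this commit (and assuming the same `schema_test.json` file as above), the following query returns the schema of the inner `map` field:
+
+```sql
+SELECT getMapSchema(t1.record.map) AS schema
+FROM dfs.test.`schema_test.json` AS t1
+```
+
+which should yield `{"nested_int_field":"BIGINT","nested_double_field":"FLOAT8","nested_string_field":"VARCHAR"}`.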
diff --git a/contrib/udfs/src/main/java/org/apache/drill/exec/udfs/ComplexSchemaFunctions.java b/contrib/udfs/src/main/java/org/apache/drill/exec/udfs/ComplexSchemaFunctions.java
new file mode 100644
index 0000000..50fef82
--- /dev/null
+++ b/contrib/udfs/src/main/java/org/apache/drill/exec/udfs/ComplexSchemaFunctions.java
@@ -0,0 +1,69 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.drill.exec.udfs;
+
+import io.netty.buffer.DrillBuf;
+import org.apache.drill.exec.expr.DrillSimpleFunc;
+import org.apache.drill.exec.expr.annotations.FunctionTemplate;
+import org.apache.drill.exec.expr.annotations.FunctionTemplate.NullHandling;
+import org.apache.drill.exec.expr.annotations.Output;
+import org.apache.drill.exec.expr.annotations.Param;
+import org.apache.drill.exec.vector.complex.reader.FieldReader;
+import org.apache.drill.exec.vector.complex.writer.BaseWriter;
+
+import javax.inject.Inject;
+
+public class ComplexSchemaFunctions {
+
+  /**
+   * This function exists to help the user understand the inner schemata of maps.
+   * It is NOT recursive (yet).
+   */
+  @FunctionTemplate(names = {"get_map_schema", "getMapSchema"},
+    scope = FunctionTemplate.FunctionScope.SIMPLE,
+    nulls = NullHandling.INTERNAL)
+  public static class GetMapSchemaFunction implements DrillSimpleFunc {
+
+    @Param
+    FieldReader reader;
+
+    @Output
+    BaseWriter.ComplexWriter outWriter;
+
+    @Inject
+    DrillBuf outBuffer;
+
+    @Override
+    public void setup() {
+      // Nothing to see here...
+    }
+
+    @Override
+    public void eval() {
+      if (reader.isSet()) {
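+        // Delegate to the helper, which writes each field name and its data type into the output map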
+        org.apache.drill.exec.udfs.ComplexSchemaUtils.getFields(reader, outWriter, outBuffer);
+      } else {
+        org.apache.drill.exec.vector.complex.writer.BaseWriter.MapWriter queryMapWriter = outWriter.rootAsMap();
+        // Return empty map
+        queryMapWriter.start();
+        queryMapWriter.end();
+      }
+    }
+  }
+}
diff --git a/contrib/udfs/src/main/java/org/apache/drill/exec/udfs/ComplexSchemaUtils.java b/contrib/udfs/src/main/java/org/apache/drill/exec/udfs/ComplexSchemaUtils.java
new file mode 100644
index 0000000..7f027aa
--- /dev/null
+++ b/contrib/udfs/src/main/java/org/apache/drill/exec/udfs/ComplexSchemaUtils.java
@@ -0,0 +1,68 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.drill.exec.udfs;
+
+import io.netty.buffer.DrillBuf;
+import org.apache.drill.common.types.TypeProtos.DataMode;
+import org.apache.drill.common.types.TypeProtos.MinorType;
+import org.apache.drill.exec.expr.holders.VarCharHolder;
+import org.apache.drill.exec.vector.complex.reader.FieldReader;
+import org.apache.drill.exec.vector.complex.writer.BaseWriter;
+
+import java.util.Iterator;
+
+public class ComplexSchemaUtils {
+
+  public static void getFields(FieldReader reader, BaseWriter.ComplexWriter outWriter, DrillBuf buffer) {
+
+    BaseWriter.MapWriter queryMapWriter = outWriter.rootAsMap();
+
+    if (reader.getType().getMinorType() != MinorType.MAP) {
+      // If the field is not a map, write an empty map and stop; without this return the
+      // code below would attempt to iterate the fields of a non-map reader
+      queryMapWriter.start();
+      queryMapWriter.end();
+      return;
+    }
+
+    Iterator<String> fieldIterator = reader.iterator();
+    queryMapWriter.start();
+
+    while (fieldIterator.hasNext()) {
+      String fieldName = fieldIterator.next();
+      FieldReader fieldReader = reader.reader(fieldName);
+      String dataType = fieldReader.getType().getMinorType().toString();
+
+      DataMode dataMode = fieldReader.getType().getMode();
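+      // Arrays report a REPEATED data mode; prefix the type name so, for example, a BIGINT array is reported as REPEATED_BIGINT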
+      if (dataMode == DataMode.REPEATED) {
+        dataType = dataMode + "_" + dataType;
+      }
+
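+      // Copy the type name into the buffer passed in from the UDF and expose it to the map writer through a VarCharHolder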
+      VarCharHolder rowHolder = new VarCharHolder();
+      byte[] rowStringBytes = dataType.getBytes();
+      buffer.reallocIfNeeded(rowStringBytes.length);
+      buffer.setBytes(0, rowStringBytes);
+
+      rowHolder.start = 0;
+      rowHolder.end = rowStringBytes.length;
+      rowHolder.buffer = buffer;
+
+      queryMapWriter.varChar(fieldName).write(rowHolder);
+    }
+    queryMapWriter.end();
+  }
+}
diff --git a/contrib/udfs/src/test/java/org/apache/drill/exec/udfs/TestComplexSchemaFunctions.java b/contrib/udfs/src/test/java/org/apache/drill/exec/udfs/TestComplexSchemaFunctions.java
new file mode 100644
index 0000000..bf453ef
--- /dev/null
+++ b/contrib/udfs/src/test/java/org/apache/drill/exec/udfs/TestComplexSchemaFunctions.java
@@ -0,0 +1,119 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.drill.exec.udfs;
+
+import org.apache.drill.categories.SqlFunctionTest;
+import org.apache.drill.categories.UnlikelyTest;
+import org.apache.drill.common.types.TypeProtos.MinorType;
+import org.apache.drill.exec.physical.rowSet.RowSet;
+import org.apache.drill.exec.record.metadata.SchemaBuilder;
+import org.apache.drill.exec.record.metadata.TupleMetadata;
+import org.apache.drill.exec.rpc.RpcException;
+import org.apache.drill.test.ClusterFixture;
+import org.apache.drill.test.ClusterFixtureBuilder;
+import org.apache.drill.test.ClusterTest;
+import org.apache.drill.test.QueryBuilder;
+import org.apache.drill.test.rowSet.RowSetComparison;
+import org.junit.BeforeClass;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+
+import static org.apache.drill.test.rowSet.RowSetUtilities.mapArray;
+import static org.apache.drill.test.rowSet.RowSetUtilities.strArray;
+import static org.junit.Assert.assertEquals;
+
+@Category({UnlikelyTest.class, SqlFunctionTest.class})
+public class TestComplexSchemaFunctions extends ClusterTest {
+
+  @BeforeClass
+  public static void setup() throws Exception {
+    ClusterFixtureBuilder builder = ClusterFixture.builder(dirTestWatcher);
+    startCluster(builder);
+  }
+
+  @Test
+  public void testMapSchemaFunction() throws RpcException {
+    String sql = "SELECT getMapSchema(record) AS schema FROM cp.`json/nestedSchema.json`";
+
+    QueryBuilder q = client.queryBuilder().sql(sql);
+    RowSet results = q.rowSet();
+    assertEquals(1, results.rowCount());
+
+    TupleMetadata expectedSchema = new SchemaBuilder()
+      .addMap("schema")
+          .addNullable("int_field", MinorType.VARCHAR)
+          .addNullable("double_field", MinorType.VARCHAR)
+          .addNullable("string_field", MinorType.VARCHAR)
+          .addNullable("boolean_field", MinorType.VARCHAR)
+          .addNullable("int_list", MinorType.VARCHAR)
+          .addNullable("double_list", MinorType.VARCHAR)
+          .addNullable("boolean_list", MinorType.VARCHAR)
+          .addNullable("map", MinorType.VARCHAR)
+          .addNullable("repeated_map", MinorType.VARCHAR)
+        .resumeSchema()
+      .build();
+
+    RowSet expected = client.rowSetBuilder(expectedSchema)
+      .addRow((Object)strArray("BIGINT", "FLOAT8", "VARCHAR", "BIT", "REPEATED_BIGINT", "REPEATED_FLOAT8", "REPEATED_BIT", "MAP", "REPEATED_MAP"))
+      .build();
+
+    new RowSetComparison(expected).verifyAndClearAll(results);
+  }
+
+  @Test
+  public void testMapSchemaFunctionWithInnerMap() throws RpcException {
+    String sql = "SELECT getMapSchema(t1.record.map) AS schema FROM cp.`json/nestedSchema.json` AS t1";
+
+    QueryBuilder q = client.queryBuilder().sql(sql);
+    RowSet results = q.rowSet();
+    assertEquals(1, results.rowCount());
+
+    TupleMetadata expectedSchema = new SchemaBuilder()
+      .addMap("schema")
+          .addNullable("nested_int_field", MinorType.VARCHAR)
+          .addNullable("nested_double_field", MinorType.VARCHAR)
+          .addNullable("nested_string_field", MinorType.VARCHAR)
+        .resumeSchema()
+      .build();
+
+    RowSet expected = client.rowSetBuilder(expectedSchema)
+      .addRow((Object)strArray("BIGINT", "FLOAT8", "VARCHAR"))
+      .build();
+
+    new RowSetComparison(expected).verifyAndClearAll(results);
+  }
+
+  @Test
+  public void testMapSchemaFunctionWithNull() throws RpcException {
+    String sql = "SELECT getMapSchema(null) AS schema FROM cp.`json/nestedSchema.json` AS t1";
+
+    QueryBuilder q = client.queryBuilder().sql(sql);
+    RowSet results = q.rowSet();
+
+    TupleMetadata expectedSchema = new SchemaBuilder()
+      .add("schema", MinorType.MAP)
+      .build();
+
+    RowSet expected = client.rowSetBuilder(expectedSchema)
+      .addRow((Object) mapArray())
+      .build();
+
+    new RowSetComparison(expected).verifyAndClearAll(results);
+  }
+}
diff --git a/contrib/udfs/src/test/resources/json/nestedSchema.json b/contrib/udfs/src/test/resources/json/nestedSchema.json
new file mode 100644
index 0000000..22abcc2
--- /dev/null
+++ b/contrib/udfs/src/test/resources/json/nestedSchema.json
@@ -0,0 +1,21 @@
+{
+  "record" : {
+    "int_field": 1,
+    "double_field": 2.0,
+    "string_field": "My string",
+    "boolean_field": true,
+    "int_list": [1,2,3],
+    "double_list": [1.0,2.0,3.0],
+    "boolean_list": [true, false, true],
+    "map": {
+      "nested_int_field" : 5,
+      "nested_double_field": 5.0,
+      "nested_string_field": "5.0"
+    },
+    "repeated_map": [
+      { "a" : 1 },
+      { "b" : "abc" }
+    ]
+  },
+  "single_field": 10
+}
\ No newline at end of file