You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@drill.apache.org by cg...@apache.org on 2021/11/03 12:09:13 UTC

[drill] branch master updated: DRILL-8020: Add JSON Configuration Options to HTTP Rest Plugin (#2348)

This is an automated email from the ASF dual-hosted git repository.

cgivre pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/drill.git


The following commit(s) were added to refs/heads/master by this push:
     new 9ff0c5b  DRILL-8020: Add JSON Configuration Options to HTTP Rest Plugin (#2348)
9ff0c5b is described below

commit 9ff0c5bebcaed8066e19c199d9932b136632e2c0
Author: Charles S. Givre <cg...@apache.org>
AuthorDate: Wed Nov 3 08:09:03 2021 -0400

    DRILL-8020: Add JSON Configuration Options to HTTP Rest Plugin (#2348)
    
    * DRILL-8020: Add JSON Configuration Options to HTTP Rest Plugin
    
    * Addressed review comments
    
    * Addressed Review Comments
    
    * Changed to Boolean
    
    * Removed unused import
---
 contrib/storage-http/README.md                     | 23 +++++++
 .../drill/exec/store/http/HttpApiConfig.java       |  5 ++
 .../drill/exec/store/http/HttpBatchReader.java     | 18 +++--
 .../drill/exec/store/http/HttpJsonOptions.java     | 79 ++++++++++++++++++++++
 .../drill/exec/store/http/TestHttpPlugin.java      | 40 +++++++++++
 .../src/test/resources/data/response2.json         | 10 +++
 6 files changed, 171 insertions(+), 4 deletions(-)

diff --git a/contrib/storage-http/README.md b/contrib/storage-http/README.md
index 77ffc35..a287f9d 100644
--- a/contrib/storage-http/README.md
+++ b/contrib/storage-http/README.md
@@ -244,6 +244,29 @@ The REST plugin accepts three different types of input: `json`, `csv` and `xml`.
 configuration option called `xmlDataLevel` which reduces the level of unneeded nesting found in XML files.  You can find more information in the documentation for Drill's XML 
 format plugin. 
 
+#### JSON Configuration
+Drill has a collection of JSON configuration options that control how Drill interprets JSON files.  These are set at the global level; however, the HTTP plugin 
+allows you to configure these options individually per connection and override the Drill defaults.  The options are:
+
+* `allowNanInf`:  Configures the connection to interpret `NaN` and `Inf` values
+* `allTextMode`:  By default, Drill attempts to infer data types from JSON data. If the data is malformed, Drill may throw schema change exceptions. If your data is 
+  inconsistent, you can enable `allTextMode`; when it is `true`, Drill reads all JSON values as strings rather than trying to infer the data type. 
+* `readNumbersAsDouble`:  By default, Drill attempts to distinguish between integers, floating point numbers and strings.  One challenge is that when data is inconsistent, Drill may 
+  throw schema change exceptions. In addition to `allTextMode`, you can make Drill less sensitive by setting `readNumbersAsDouble` to `true`, which causes Drill to read all 
+  numeric fields in JSON data as the `double` data type rather than trying to distinguish between ints and doubles.
+* `enableEscapeAnyChar`:  Allows a user to escape any character with a backslash (`\`).
+
+All of these can be set by adding the `jsonOptions` to your connection configuration as shown below:
+
+```json
+
+"jsonOptions": {
+  "allTextMode": true, 
+  "readNumbersAsDouble": true
+}
+
+```
+
 #### Authorization
 
 `authType`: If your API requires authentication, specify the authentication
diff --git a/contrib/storage-http/src/main/java/org/apache/drill/exec/store/http/HttpApiConfig.java b/contrib/storage-http/src/main/java/org/apache/drill/exec/store/http/HttpApiConfig.java
index 75f85f5..cca2767 100644
--- a/contrib/storage-http/src/main/java/org/apache/drill/exec/store/http/HttpApiConfig.java
+++ b/contrib/storage-http/src/main/java/org/apache/drill/exec/store/http/HttpApiConfig.java
@@ -103,6 +103,10 @@ public class HttpApiConfig {
   @JsonProperty
   private final boolean errorOn400;
 
+  // Enables the user to configure JSON options at the connection level rather than globally.
+  @JsonProperty
+  private final HttpJsonOptions jsonOptions;
+
   @JsonInclude
   @JsonProperty
   private final boolean verifySSLCert;
@@ -127,6 +131,7 @@ public class HttpApiConfig {
     this.method = StringUtils.isEmpty(builder.method)
         ? HttpMethod.GET.toString() : builder.method.trim().toUpperCase();
     this.url = builder.url;
+    this.jsonOptions = builder.jsonOptions;
 
     HttpMethod httpMethod = HttpMethod.valueOf(this.method);
     // Get the request method.  Only accept GET and POST requests.  Anything else will default to GET.
diff --git a/contrib/storage-http/src/main/java/org/apache/drill/exec/store/http/HttpBatchReader.java b/contrib/storage-http/src/main/java/org/apache/drill/exec/store/http/HttpBatchReader.java
index 921210c..59d14af 100644
--- a/contrib/storage-http/src/main/java/org/apache/drill/exec/store/http/HttpBatchReader.java
+++ b/contrib/storage-http/src/main/java/org/apache/drill/exec/store/http/HttpBatchReader.java
@@ -32,11 +32,14 @@ import org.apache.drill.exec.physical.impl.scan.framework.SchemaNegotiator;
 import org.apache.drill.exec.physical.resultSet.ResultSetLoader;
 import org.apache.drill.exec.store.easy.json.loader.JsonLoader;
 import org.apache.drill.exec.store.easy.json.loader.JsonLoaderImpl.JsonLoaderBuilder;
+import org.apache.drill.exec.store.easy.json.loader.JsonLoaderOptions;
 import org.apache.drill.exec.store.http.util.HttpProxyConfig;
 import org.apache.drill.exec.store.http.util.HttpProxyConfig.ProxyBuilder;
 import org.apache.drill.exec.store.http.util.SimpleHttp;
 import org.apache.drill.exec.store.security.UsernamePasswordCredentials;
 import org.apache.drill.exec.store.ImplicitColumnUtils.ImplicitColumns;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 import java.io.File;
 import java.io.InputStream;
@@ -47,6 +50,7 @@ public class HttpBatchReader implements ManagedReader<SchemaNegotiator> {
 
   private static final String[] STRING_METADATA_FIELDS = {"_response_message", "_response_protocol", "_response_url"};
   private static final String RESPONSE_CODE_FIELD = "_response_code";
+  private static final Logger logger = LoggerFactory.getLogger(HttpBatchReader.class);
 
   private final HttpSubScan subScan;
   private final int maxRecords;
@@ -100,14 +104,20 @@ public class HttpBatchReader implements ManagedReader<SchemaNegotiator> {
     populateImplicitFieldMap(http);
 
     try {
-      jsonLoader = new JsonLoaderBuilder()
+      JsonLoaderBuilder jsonBuilder = new JsonLoaderBuilder()
           .implicitFields(implicitColumns)
           .resultSetLoader(loader)
-          .standardOptions(negotiator.queryOptions())
           .dataPath(subScan.tableSpec().connectionConfig().dataPath())
           .errorContext(errorContext)
-          .fromStream(inStream)
-          .build();
+          .fromStream(inStream);
+
+      if (subScan.tableSpec().connectionConfig().jsonOptions() != null) {
+        JsonLoaderOptions jsonOptions = subScan.tableSpec().connectionConfig().jsonOptions().getJsonOptions(negotiator.queryOptions());
+        jsonBuilder.options(jsonOptions);
+      } else {
+        jsonBuilder.standardOptions(negotiator.queryOptions());
+      }
+      jsonLoader = jsonBuilder.build();
     } catch (Throwable t) {
 
       // Paranoia: ensure stream is closed if anything goes wrong.
diff --git a/contrib/storage-http/src/main/java/org/apache/drill/exec/store/http/HttpJsonOptions.java b/contrib/storage-http/src/main/java/org/apache/drill/exec/store/http/HttpJsonOptions.java
new file mode 100644
index 0000000..4928c66
--- /dev/null
+++ b/contrib/storage-http/src/main/java/org/apache/drill/exec/store/http/HttpJsonOptions.java
@@ -0,0 +1,79 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.drill.exec.store.http;
+
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonInclude;
+import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
+import lombok.Builder;
+import lombok.EqualsAndHashCode;
+import lombok.Getter;
+import lombok.ToString;
+import lombok.experimental.Accessors;
+import lombok.extern.slf4j.Slf4j;
+import org.apache.drill.exec.server.options.OptionSet;
+import org.apache.drill.exec.store.easy.json.loader.JsonLoaderOptions;
+
+@Slf4j
+@Builder
+@Getter
+@Accessors(fluent = true)
+@EqualsAndHashCode
+@ToString
+@JsonInclude(JsonInclude.Include.NON_DEFAULT)
+@JsonDeserialize(builder = HttpJsonOptions.HttpJsonOptionsBuilder.class)
+public class HttpJsonOptions {
+
+  @JsonInclude
+  private final Boolean allowNanInf;
+
+  @JsonInclude
+  private final Boolean allTextMode;
+
+  @JsonInclude
+  private final Boolean readNumbersAsDouble;
+
+  @JsonInclude
+  private final Boolean enableEscapeAnyChar;
+
+  @JsonIgnore
+  public JsonLoaderOptions getJsonOptions(OptionSet optionSet) {
+
+    JsonLoaderOptions options = new JsonLoaderOptions(optionSet);
+
+    if (allowNanInf != null) {
+      options.allowNanInf = allowNanInf;
+    }
+
+    if (allTextMode != null) {
+      options.allTextMode = allTextMode;
+    }
+
+    if (readNumbersAsDouble != null) {
+      options.readNumbersAsDouble = readNumbersAsDouble;
+    }
+
+    if (enableEscapeAnyChar != null) {
+      options.enableEscapeAnyChar = enableEscapeAnyChar;
+    }
+
+    return options;
+  }
+
+}
diff --git a/contrib/storage-http/src/test/java/org/apache/drill/exec/store/http/TestHttpPlugin.java b/contrib/storage-http/src/test/java/org/apache/drill/exec/store/http/TestHttpPlugin.java
index 0375c99..0aa2c05 100644
--- a/contrib/storage-http/src/test/java/org/apache/drill/exec/store/http/TestHttpPlugin.java
+++ b/contrib/storage-http/src/test/java/org/apache/drill/exec/store/http/TestHttpPlugin.java
@@ -23,6 +23,7 @@ import okhttp3.mockwebserver.MockWebServer;
 import okhttp3.mockwebserver.RecordedRequest;
 import org.apache.drill.common.exceptions.UserException;
 import org.apache.drill.common.types.TypeProtos;
+import org.apache.drill.common.types.TypeProtos.DataMode;
 import org.apache.drill.common.types.TypeProtos.MinorType;
 import org.apache.drill.common.util.DrillFileUtils;
 import org.apache.drill.exec.physical.rowSet.RowSet;
@@ -71,6 +72,7 @@ public class TestHttpPlugin extends ClusterTest {
   private static String TEST_JSON_RESPONSE;
   private static String TEST_CSV_RESPONSE;
   private static String TEST_XML_RESPONSE;
+  private static String TEST_JSON_RESPONSE_WITH_DATATYPES;
 
   @BeforeClass
   public static void setup() throws Exception {
@@ -79,6 +81,7 @@ public class TestHttpPlugin extends ClusterTest {
     TEST_JSON_RESPONSE = Files.asCharSource(DrillFileUtils.getResourceAsFile("/data/response.json"), Charsets.UTF_8).read();
     TEST_CSV_RESPONSE = Files.asCharSource(DrillFileUtils.getResourceAsFile("/data/response.csv"), Charsets.UTF_8).read();
     TEST_XML_RESPONSE = Files.asCharSource(DrillFileUtils.getResourceAsFile("/data/response.xml"), Charsets.UTF_8).read();
+    TEST_JSON_RESPONSE_WITH_DATATYPES = Files.asCharSource(DrillFileUtils.getResourceAsFile("/data/response2.json"), Charsets.UTF_8).read();
 
     dirTestWatcher.copyResourceToRoot(Paths.get("data/"));
     makeLiveConfig();
@@ -230,6 +233,16 @@ public class TestHttpPlugin extends ClusterTest {
       .requireTail(false)
       .build();
 
+    HttpApiConfig mockTableWithJsonOptions = HttpApiConfig.builder()
+      .url("http://localhost:8091/json")
+      .method("GET")
+      .headers(headers)
+      .requireTail(false)
+      .jsonOptions(HttpJsonOptions.builder()
+        .allTextMode(true)
+        .build()
+      )
+      .build();
 
     Map<String, HttpApiConfig> configs = new HashMap<>();
     configs.put("sunrise", mockSchema);
@@ -240,6 +253,7 @@ public class TestHttpPlugin extends ClusterTest {
     configs.put("github", mockGithubWithParam);
     configs.put("github2", mockGithubWithDuplicateParam);
     configs.put("github3", mockGithubWithParamInQuery);
+    configs.put("mockJsonAllText", mockTableWithJsonOptions);
 
     HttpStoragePluginConfig mockStorageConfigWithWorkspace =
         new HttpStoragePluginConfig(false, configs, 2, "", 80, "", "", "", PlainCredentialsProvider.EMPTY_CREDENTIALS_PROVIDER);
@@ -414,6 +428,32 @@ public class TestHttpPlugin extends ClusterTest {
   }
 
   @Test
+  public void simpleTestWithJsonConfig() throws Exception {
+    String sql = "SELECT * FROM local.mockJsonAllText";
+
+    try (MockWebServer server = startServer()) {
+      server.enqueue(new MockResponse().setResponseCode(200).setBody(TEST_JSON_RESPONSE_WITH_DATATYPES));
+      RowSet results = client.queryBuilder().sql(sql).rowSet();
+
+      TupleMetadata expectedSchema = new SchemaBuilder()
+        .add("col_1", MinorType.VARCHAR, DataMode.OPTIONAL)
+        .add("col_2", MinorType.VARCHAR, DataMode.OPTIONAL)
+        .add("col_3", MinorType.VARCHAR, DataMode.OPTIONAL)
+        .build();
+
+      RowSet expected = new RowSetBuilder(client.allocator(), expectedSchema)
+        .addRow("1.0", "2", "3.0")
+        .addRow("4.0", "5", "6.0")
+        .build();
+
+      RowSetUtilities.verify(expected, results);
+    } catch (Exception e) {
+      System.out.println(e.getMessage());
+      fail();
+    }
+  }
+
+  @Test
   public void simpleTestWithMockServerWithURLParams() throws Exception {
     String sql = "SELECT _response_url FROM local.github\n" +
         "WHERE `org` = 'apache'";
diff --git a/contrib/storage-http/src/test/resources/data/response2.json b/contrib/storage-http/src/test/resources/data/response2.json
new file mode 100644
index 0000000..467a458
--- /dev/null
+++ b/contrib/storage-http/src/test/resources/data/response2.json
@@ -0,0 +1,10 @@
+[
+  {
+    "col_1": 1.0,
+    "col_2":  2,
+    "col_3": "3.0"
+  },{
+  "col_1": 4.0,
+  "col_2":  5,
+  "col_3": "6.0"
+}]