You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@drill.apache.org by cg...@apache.org on 2022/09/28 18:20:10 UTC

[drill] branch master updated: DRILL-8320: Prevent Infinite Pagination for Index Paginator (#2660)

This is an automated email from the ASF dual-hosted git repository.

cgivre pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/drill.git


The following commit(s) were added to refs/heads/master by this push:
     new 9551372cd5 DRILL-8320: Prevent Infinite Pagination for Index Paginator (#2660)
9551372cd5 is described below

commit 9551372cd5678ff4ab7d40425b51ae485e202881
Author: Charles S. Givre <cg...@apache.org>
AuthorDate: Wed Sep 28 14:20:02 2022 -0400

    DRILL-8320: Prevent Infinite Pagination for Index Paginator (#2660)
    
    * DRILL-8320: Prevent Infinite Pagination for Index Paginator
---
 contrib/storage-http/.gitignore                    |  1 +
 .../drill/exec/store/http/HttpBatchReader.java     | 28 ++++++++++-
 .../drill/exec/store/http/HttpPaginatorConfig.java |  7 +--
 .../exec/store/http/HttpScanBatchCreator.java      |  6 +--
 .../exec/store/http/paginator/IndexPaginator.java  | 22 +++++++--
 .../drill/exec/store/http/TestPagination.java      | 54 +++++++++++++++++++++-
 .../resources/data/nested_pagination_fields.json   | 17 +++++++
 .../resources/data/nested_pagination_fields2.json  | 14 ++++++
 .../easy/json/parser/JsonStructureParser.java      |  1 -
 9 files changed, 131 insertions(+), 19 deletions(-)

diff --git a/contrib/storage-http/.gitignore b/contrib/storage-http/.gitignore
new file mode 100644
index 0000000000..710368b187
--- /dev/null
+++ b/contrib/storage-http/.gitignore
@@ -0,0 +1 @@
+./src/test/resources/logback-test.xml
diff --git a/contrib/storage-http/src/main/java/org/apache/drill/exec/store/http/HttpBatchReader.java b/contrib/storage-http/src/main/java/org/apache/drill/exec/store/http/HttpBatchReader.java
index e2bfee9114..6f312b0e9b 100644
--- a/contrib/storage-http/src/main/java/org/apache/drill/exec/store/http/HttpBatchReader.java
+++ b/contrib/storage-http/src/main/java/org/apache/drill/exec/store/http/HttpBatchReader.java
@@ -111,6 +111,8 @@ public class HttpBatchReader implements ManagedReader<SchemaNegotiator> {
     };
     negotiator.setErrorContext(errorContext);
 
+    logger.debug("Executing request with url: {}", url);
+
     // Http client setup
     SimpleHttp http = SimpleHttp.builder()
       .scanDefn(subScan)
@@ -359,13 +361,32 @@ public class HttpBatchReader implements ManagedReader<SchemaNegotiator> {
       if (StringUtils.isNotEmpty(indexPaginator.getNextPageParam())) {
         indexPaginator.setNextPageValue(paginationFields.get(indexPaginator.getNextPageParam()).toString());
       }
+    } else {
+      // This covers the case of keyset/index pagination where there isn't a boolean parameter to indicate whether there are more results.
+      // In this case, we will interpret the absence of the pagination field, receiving the same value, or a null value as the marker to stop pagination.
+      if ( (!paginationFields.containsKey(indexPaginator.getIndexParam())) ||
+        paginationFields.get(indexPaginator.getIndexParam()) == null
+      ) {
+        // End pagination
+        paginator.notifyPartialPage();
+      } else {
+        // Otherwise, check to see if the field is present but empty, or contains the value from the last page.
+        // This will prevent runaway pagination calls.
+        String indexParameter = paginationFields.get(indexPaginator.getIndexParam()).toString();
+        // Empty value or the last value is the same as the current one.
+        if (StringUtils.isEmpty(indexParameter) || (StringUtils.equals(indexParameter, indexPaginator.getLastIndexValue()))) {
+          paginator.notifyPartialPage();
+        } else {
+          // Whew!  We made it... get the next page.
+          indexPaginator.setIndexValue(indexParameter);
+        }
+      }
     }
   }
 
   @Override
   public boolean next() {
     boolean result = jsonLoader.readBatch();
-
     // This code implements the index/keyset pagination.  This pagination method
     // uses a value returned in the current result set as the starting point for the
     // next page.  Some APIs will have a boolean parameter to indicate that there are
@@ -373,9 +394,12 @@ public class HttpBatchReader implements ManagedReader<SchemaNegotiator> {
     // it is necessary to grab these values from the returned data.
     if (paginator != null && paginator.getMode() == PaginatorMethod.INDEX) {
       // First check to see if the limit has been reached.  If so, mark the end of pagination.
-      if (maxRecords > 0 && (resultSetLoader.totalRowCount() > maxRecords)) {
+      long totalRowCount = resultSetLoader.totalRowCount();
+      if (maxRecords > 0 && totalRowCount >= maxRecords) {
         // End Pagination
         paginator.notifyPartialPage();
+        // Returning false here because if there is a partial page, we want the reader to stop and no further batches to be created.
+        return false;
       } else {
         populateIndexPaginator();
       }
diff --git a/contrib/storage-http/src/main/java/org/apache/drill/exec/store/http/HttpPaginatorConfig.java b/contrib/storage-http/src/main/java/org/apache/drill/exec/store/http/HttpPaginatorConfig.java
index ad474f9bac..e6b226f8c6 100644
--- a/contrib/storage-http/src/main/java/org/apache/drill/exec/store/http/HttpPaginatorConfig.java
+++ b/contrib/storage-http/src/main/java/org/apache/drill/exec/store/http/HttpPaginatorConfig.java
@@ -120,12 +120,7 @@ public class HttpPaginatorConfig {
         break;
       case INDEX:
         // Either the nextPageParam OR the indexParam must be populated
-        if (StringUtils.isEmpty(this.hasMoreParam)) {
-          throw UserException
-            .validationError()
-            .message("Invalid paginator configuration.  For INDEX pagination, the hasMoreParam must be defined.")
-            .build(logger);
-        } else if ((StringUtils.isEmpty(this.nextPageParam) && StringUtils.isNotEmpty(this.indexParam)) &&
+        if ((StringUtils.isEmpty(this.nextPageParam) && StringUtils.isNotEmpty(this.indexParam)) &&
           (StringUtils.isNotEmpty(this.nextPageParam) && StringUtils.isEmpty(this.indexParam))) {
           throw UserException
             .validationError()
diff --git a/contrib/storage-http/src/main/java/org/apache/drill/exec/store/http/HttpScanBatchCreator.java b/contrib/storage-http/src/main/java/org/apache/drill/exec/store/http/HttpScanBatchCreator.java
index edc7dee1f9..3de783f7da 100644
--- a/contrib/storage-http/src/main/java/org/apache/drill/exec/store/http/HttpScanBatchCreator.java
+++ b/contrib/storage-http/src/main/java/org/apache/drill/exec/store/http/HttpScanBatchCreator.java
@@ -86,9 +86,6 @@ public class HttpScanBatchCreator implements BatchCreator<HttpSubScan> {
     ReaderFactory readerFactory = new HttpReaderFactory(subScan);
     builder.setReaderFactory(readerFactory);
     builder.nullType(Types.optional(MinorType.VARCHAR));
-
-    // TODO Add page size limit here to ScanFramework Builder
-
     return builder;
   }
 
@@ -139,8 +136,9 @@ public class HttpScanBatchCreator implements BatchCreator<HttpSubScan> {
           paginatorConfig.pageSizeParam());
       } else if (paginatorConfig.getMethodType() == PaginatorMethod.INDEX) {
         paginator = new IndexPaginator(urlBuilder,
+          0,  // Page size not used for Index/Keyset pagination
           subScan.maxRecords(),
-          0, paginatorConfig.hasMoreParam(),
+          paginatorConfig.hasMoreParam(),
           paginatorConfig.indexParam(),
           paginatorConfig.nextPageParam());
       }
diff --git a/contrib/storage-http/src/main/java/org/apache/drill/exec/store/http/paginator/IndexPaginator.java b/contrib/storage-http/src/main/java/org/apache/drill/exec/store/http/paginator/IndexPaginator.java
index 22aa531f54..67bc89c4a6 100644
--- a/contrib/storage-http/src/main/java/org/apache/drill/exec/store/http/paginator/IndexPaginator.java
+++ b/contrib/storage-http/src/main/java/org/apache/drill/exec/store/http/paginator/IndexPaginator.java
@@ -20,19 +20,25 @@ package org.apache.drill.exec.store.http.paginator;
 
 import okhttp3.HttpUrl.Builder;
 import org.apache.commons.lang3.StringUtils;
+import org.apache.drill.common.exceptions.UserException;
 import org.apache.drill.exec.store.http.HttpPaginatorConfig.PaginatorMethod;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 import java.io.UnsupportedEncodingException;
 import java.net.URLEncoder;
+import java.nio.charset.StandardCharsets;
 import java.util.NoSuchElementException;
 
 public class IndexPaginator extends Paginator {
 
+  private static final Logger logger = LoggerFactory.getLogger(IndexPaginator.class);
   private final String hasMoreParam;
   private final String indexParam;
   private final String nextPageParam;
 
   private String indexValue;
+  private String lastIndexValue;
   private Boolean hasMoreValue;
   private String nextPageValue;
   private int pageCount;
@@ -75,9 +81,14 @@ public class IndexPaginator extends Paginator {
   }
 
   public void setNextPageValue(String nextPageValue) {
+    this.lastIndexValue = this.nextPageValue;
     this.nextPageValue = nextPageValue;
   }
 
+  public String getLastIndexValue() {
+    return lastIndexValue;
+  }
+
   public boolean isFirstPage() {
     return pageCount < 1;
   }
@@ -94,13 +105,14 @@ public class IndexPaginator extends Paginator {
       throw new NoSuchElementException();
     }
 
-    if (StringUtils.isNotEmpty(nextPageValue)) {
-      // TODO figure this out...
-    } else if (StringUtils.isNotEmpty(indexValue)) {
+    if (StringUtils.isNotEmpty(indexValue)) {
       try {
-        indexValue = URLEncoder.encode(indexValue, "UTF-8");
+        indexValue = URLEncoder.encode(indexValue, StandardCharsets.UTF_8.name());
       } catch (UnsupportedEncodingException e) {
-        // Do nothing.
+        // Should never happen
+        throw UserException.internalError()
+          .message(e.getMessage())
+          .build(logger);
       }
       builder.removeAllEncodedQueryParameters(indexParam);
       builder.addQueryParameter(indexParam, indexValue);
diff --git a/contrib/storage-http/src/test/java/org/apache/drill/exec/store/http/TestPagination.java b/contrib/storage-http/src/test/java/org/apache/drill/exec/store/http/TestPagination.java
index 328a447cb4..d04f230f3d 100644
--- a/contrib/storage-http/src/test/java/org/apache/drill/exec/store/http/TestPagination.java
+++ b/contrib/storage-http/src/test/java/org/apache/drill/exec/store/http/TestPagination.java
@@ -65,7 +65,8 @@ public class TestPagination extends ClusterTest {
   private static String TEST_JSON_INDEX_PAGE2;
   private static String TEST_JSON_INDEX_PAGE3;
   private static String TEST_JSON_INDEX_PAGE4;
-
+  private static String TEST_JSON_NESTED_INDEX;
+  private static String TEST_JSON_NESTED_INDEX2;
   private static String TEST_XML_PAGE1;
   private static String TEST_XML_PAGE2;
   private static String TEST_XML_PAGE3;
@@ -89,6 +90,9 @@ public class TestPagination extends ClusterTest {
     TEST_JSON_INDEX_PAGE3 = Files.asCharSource(DrillFileUtils.getResourceAsFile("/data/index_response3.json"), Charsets.UTF_8).read();
     TEST_JSON_INDEX_PAGE4 = Files.asCharSource(DrillFileUtils.getResourceAsFile("/data/index_response4.json"), Charsets.UTF_8).read();
 
+    TEST_JSON_NESTED_INDEX = Files.asCharSource(DrillFileUtils.getResourceAsFile("/data/nested_pagination_fields.json"), Charsets.UTF_8).read();
+    TEST_JSON_NESTED_INDEX2 = Files.asCharSource(DrillFileUtils.getResourceAsFile("/data/nested_pagination_fields2.json"), Charsets.UTF_8).read();
+
     TEST_XML_PAGE1 = Files.asCharSource(DrillFileUtils.getResourceAsFile("/data/response_1.xml"), Charsets.UTF_8).read();
     TEST_XML_PAGE2 = Files.asCharSource(DrillFileUtils.getResourceAsFile("/data/response_2.xml"), Charsets.UTF_8).read();
     TEST_XML_PAGE3 = Files.asCharSource(DrillFileUtils.getResourceAsFile("/data/response_3.xml"), Charsets.UTF_8).read();
@@ -167,6 +171,31 @@ public class TestPagination extends ClusterTest {
       .inputType("json")
       .build();
 
+    HttpPaginatorConfig nestedIndexPaginator = HttpPaginatorConfig.builder()
+      .indexParam("after")
+      .method("index")
+      .build();
+
+    HttpApiConfig mockJsonConfigWitNestedKeyset = HttpApiConfig.builder()
+      .url("http://localhost:8092/json")
+      .method("get")
+      .headers(headers)
+      .requireTail(false)
+      .paginator(nestedIndexPaginator)
+      .inputType("json")
+      .build();
+
+    HttpApiConfig mockJsonConfigWitNestedKeysetAndDataPath = HttpApiConfig.builder()
+      .url("http://localhost:8092/json")
+      .method("get")
+      .headers(headers)
+      .dataPath("results")
+      .requireTail(false)
+      .paginator(nestedIndexPaginator)
+      .inputType("json")
+      .build();
+
+
     HttpApiConfig mockJsonConfigWithKeysetAndDataPath = HttpApiConfig.builder()
       .url("http://localhost:8092/json")
       .method("get")
@@ -232,6 +261,8 @@ public class TestPagination extends ClusterTest {
     configs.put("csv_paginator", mockCsvConfigWithPaginator);
     configs.put("json_index", mockJsonConfigWithKeyset);
     configs.put("json_index_datapath", mockJsonConfigWithKeysetAndDataPath);
+    configs.put("nested_keyset", mockJsonConfigWitNestedKeyset);
+    configs.put("nested_keyset_and_datapath", mockJsonConfigWitNestedKeysetAndDataPath);
     configs.put("json_paginator", mockJsonConfigWithPaginator);
     configs.put("xml_paginator", mockXmlConfigWithPaginator);
     configs.put("xml_paginator_url_params", mockXmlConfigWithPaginatorAndUrlParams);
@@ -348,6 +379,27 @@ public class TestPagination extends ClusterTest {
       assertEquals(4, count);
     }
   }
+  @Test
+  public void jsonQueryWithoutHasMore() throws Exception {
+    String sql = "SELECT * FROM `local`.`nested_keyset` LIMIT 4";
+    try (MockWebServer server = startServer()) {
+
+      server.enqueue(new MockResponse().setResponseCode(200).setBody(TEST_JSON_NESTED_INDEX));
+      server.enqueue(new MockResponse().setResponseCode(200).setBody(TEST_JSON_NESTED_INDEX2));
+
+      List<QueryDataBatch> results = client.queryBuilder()
+        .sql(sql)
+        .results();
+
+      int count = 0;
+      for(QueryDataBatch b : results){
+        count += b.getHeader().getRowCount();
+        b.release();
+      }
+      assertEquals(2, results.size());
+      assertEquals(2, count);
+    }
+  }
 
   @Test
   public void simpleJSONPaginatorQueryWithoutLimit() throws Exception {
diff --git a/contrib/storage-http/src/test/resources/data/nested_pagination_fields.json b/contrib/storage-http/src/test/resources/data/nested_pagination_fields.json
new file mode 100644
index 0000000000..3ad3845cd4
--- /dev/null
+++ b/contrib/storage-http/src/test/resources/data/nested_pagination_fields.json
@@ -0,0 +1,17 @@
+{
+  "results": [
+    {
+      "id": "1",
+      "properties": {
+        "company": "HubSpot"
+      },
+      "archived": false
+    }
+  ],
+  "paging": {
+    "next": {
+      "after": "2",
+      "link": "https://api.hubapi.com/crm/v3/objects/contacts?includeAssociations=true&properties=record_id&properties=company&properties=createdate&properties=email&properties=firstname&properties=lastmodifieddate&properties=lastname&properties=phone&properties=website&limit=1&after=2"
+    }
+  }
+}
diff --git a/contrib/storage-http/src/test/resources/data/nested_pagination_fields2.json b/contrib/storage-http/src/test/resources/data/nested_pagination_fields2.json
new file mode 100644
index 0000000000..68f68d2f73
--- /dev/null
+++ b/contrib/storage-http/src/test/resources/data/nested_pagination_fields2.json
@@ -0,0 +1,14 @@
+{
+  "results": [
+    {
+      "id": "2",
+      "properties": {
+        "company": "HubSpot2"
+      },
+      "archived": false
+    }
+  ],
+  "paging": {
+
+  }
+}
diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/store/easy/json/parser/JsonStructureParser.java b/exec/java-exec/src/main/java/org/apache/drill/exec/store/easy/json/parser/JsonStructureParser.java
index 03fa51b695..4d33403070 100644
--- a/exec/java-exec/src/main/java/org/apache/drill/exec/store/easy/json/parser/JsonStructureParser.java
+++ b/exec/java-exec/src/main/java/org/apache/drill/exec/store/easy/json/parser/JsonStructureParser.java
@@ -264,7 +264,6 @@ public class JsonStructureParser {
     }
     while (true) {
       try {
-        // System.out.println(tokenizer.stringValue());
         return rootState.parseRoot(tokenizer);
       } catch (RecoverableJsonException e) {
         if (! recover()) {