You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@asterixdb.apache.org by mb...@apache.org on 2020/07/19 22:10:46 UTC

[asterixdb] 03/07: [ASTERIXDB-2757] Properly handle include/exclude pattern translation

This is an automated email from the ASF dual-hosted git repository.

mblow pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/asterixdb.git

commit 8a64bf87cea9113a6b5ed0715c5e8d6ad8ff56c4
Author: Hussain Towaileb <Hu...@Gmail.com>
AuthorDate: Wed Jul 8 23:04:52 2020 +0300

    [ASTERIXDB-2757] Properly handle include/exclude pattern translation
    
    - user model changes: no
    - storage format changes: no
    - interface changes: no
    
    Details:
    - Handles properly handling the translation of a wildcard
      pattern to a regex to avoid potential failure in the query.
    - Added and updated some test cases.
    
    Change-Id: I661c1fad9b2c2692fa231e48f00eb5cf9d79ad5e
    Reviewed-on: https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/7143
    Tested-by: Jenkins <je...@fulliautomatix.ics.uci.edu>
    Integration-Tests: Jenkins <je...@fulliautomatix.ics.uci.edu>
    Reviewed-by: Ali Alsuliman <al...@gmail.com>
---
 .../include-exclude/include-11/test.000.ddl.sqlpp  |  39 +++++++
 .../include-11/test.001.query.sqlpp                |  21 ++++
 .../include-exclude/include-11/test.099.ddl.sqlpp  |  20 ++++
 .../include-exclude/include-12/test.000.ddl.sqlpp  |  41 +++++++
 .../include-12/test.001.query.sqlpp                |  21 ++++
 .../include-exclude/include-12/test.099.ddl.sqlpp  |  20 ++++
 .../s3/include-exclude/include-10/result.001.adm   |   1 +
 .../s3/include-exclude/include-11/result.001.adm   |   1 +
 .../s3/include-exclude/include-12/result.001.adm   |   1 +
 .../runtimets/testsuite_external_dataset.xml       |  11 +-
 .../record/reader/aws/AwsS3InputStreamFactory.java |   4 +-
 .../asterix/external/util/ExternalDataUtils.java   | 119 ++++++++++++---------
 .../evaluators/functions/StringEvaluatorUtils.java |   4 +-
 13 files changed, 247 insertions(+), 56 deletions(-)

diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/aws/s3/include-exclude/include-11/test.000.ddl.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/aws/s3/include-exclude/include-11/test.000.ddl.sqlpp
new file mode 100644
index 0000000..e4cf69b
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/aws/s3/include-exclude/include-11/test.000.ddl.sqlpp
@@ -0,0 +1,39 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+drop dataverse test if exists;
+create dataverse test;
+use test;
+
+drop type test if exists;
+create type test as open {
+};
+
+drop dataset test if exists;
+create external dataset test(test) using S3 (
+("accessKeyId"="dummyAccessKey"),
+("secretAccessKey"="dummySecretKey"),
+("region"="us-west-2"),
+("serviceEndpoint"="http://localhost:8001"),
+("container"="include-exclude"),
+("definition"="data/mixed/"),
+("format"="csv"),
+("header"=false),
+("include"="*.[a-c][a-z][a-z**||\\\\&&--~~]")
+);
\ No newline at end of file
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/aws/s3/include-exclude/include-11/test.001.query.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/aws/s3/include-exclude/include-11/test.001.query.sqlpp
new file mode 100644
index 0000000..3306d5c
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/aws/s3/include-exclude/include-11/test.001.query.sqlpp
@@ -0,0 +1,21 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+use test;
+select count(*) as `count` from test;
\ No newline at end of file
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/aws/s3/include-exclude/include-11/test.099.ddl.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/aws/s3/include-exclude/include-11/test.099.ddl.sqlpp
new file mode 100644
index 0000000..548e632
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/aws/s3/include-exclude/include-11/test.099.ddl.sqlpp
@@ -0,0 +1,20 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+drop dataverse test if exists;
\ No newline at end of file
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/aws/s3/include-exclude/include-12/test.000.ddl.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/aws/s3/include-exclude/include-12/test.000.ddl.sqlpp
new file mode 100644
index 0000000..47fbaef
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/aws/s3/include-exclude/include-12/test.000.ddl.sqlpp
@@ -0,0 +1,41 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+ // This test case matches nothing for "include", but has extreme cases and complicated pattern, expected is to not fail
+
+drop dataverse test if exists;
+create dataverse test;
+use test;
+
+drop type test if exists;
+create type test as open {
+};
+
+drop dataset test if exists;
+create external dataset test(test) using S3 (
+("accessKeyId"="dummyAccessKey"),
+("secretAccessKey"="dummySecretKey"),
+("region"="us-west-2"),
+("serviceEndpoint"="http://localhost:8001"),
+("container"="include-exclude"),
+("definition"="data/mixed/"),
+("format"="csv"),
+("header"=false),
+("include"="[][!][^]]]]*[![*a-zA--&&^$||0-9B$\\*&&]*&&[^a-b||0--9][[[")
+);
\ No newline at end of file
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/aws/s3/include-exclude/include-12/test.001.query.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/aws/s3/include-exclude/include-12/test.001.query.sqlpp
new file mode 100644
index 0000000..3306d5c
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/aws/s3/include-exclude/include-12/test.001.query.sqlpp
@@ -0,0 +1,21 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+use test;
+select count(*) as `count` from test;
\ No newline at end of file
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/aws/s3/include-exclude/include-12/test.099.ddl.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/aws/s3/include-exclude/include-12/test.099.ddl.sqlpp
new file mode 100644
index 0000000..548e632
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/aws/s3/include-exclude/include-12/test.099.ddl.sqlpp
@@ -0,0 +1,20 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+drop dataverse test if exists;
\ No newline at end of file
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/results/external-dataset/aws/s3/include-exclude/include-10/result.001.adm b/asterixdb/asterix-app/src/test/resources/runtimets/results/external-dataset/aws/s3/include-exclude/include-10/result.001.adm
new file mode 100644
index 0000000..c1a0ea2
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/results/external-dataset/aws/s3/include-exclude/include-10/result.001.adm
@@ -0,0 +1 @@
+{ "count": 0 }
\ No newline at end of file
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/results/external-dataset/aws/s3/include-exclude/include-11/result.001.adm b/asterixdb/asterix-app/src/test/resources/runtimets/results/external-dataset/aws/s3/include-exclude/include-11/result.001.adm
new file mode 100644
index 0000000..95204aa
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/results/external-dataset/aws/s3/include-exclude/include-11/result.001.adm
@@ -0,0 +1 @@
+{ "count": 6 }
\ No newline at end of file
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/results/external-dataset/aws/s3/include-exclude/include-12/result.001.adm b/asterixdb/asterix-app/src/test/resources/runtimets/results/external-dataset/aws/s3/include-exclude/include-12/result.001.adm
new file mode 100644
index 0000000..c1a0ea2
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/results/external-dataset/aws/s3/include-exclude/include-12/result.001.adm
@@ -0,0 +1 @@
+{ "count": 0 }
\ No newline at end of file
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_external_dataset.xml b/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_external_dataset.xml
index ad17afe..3a4f03e 100644
--- a/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_external_dataset.xml
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_external_dataset.xml
@@ -232,7 +232,16 @@
     <test-case FilePath="external-dataset">
       <compilation-unit name="aws/s3/include-exclude/include-10">
         <output-dir compare="Text">aws/s3/include-exclude/include-10</output-dir>
-        <expected-error>Invalid pattern *[abc][.*</expected-error>
+      </compilation-unit>
+    </test-case>
+    <test-case FilePath="external-dataset">
+      <compilation-unit name="aws/s3/include-exclude/include-11">
+        <output-dir compare="Text">aws/s3/include-exclude/include-11</output-dir>
+      </compilation-unit>
+    </test-case>
+    <test-case FilePath="external-dataset">
+      <compilation-unit name="aws/s3/include-exclude/include-12">
+        <output-dir compare="Text">aws/s3/include-exclude/include-12</output-dir>
       </compilation-unit>
     </test-case>
   </test-group>
diff --git a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/aws/AwsS3InputStreamFactory.java b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/aws/AwsS3InputStreamFactory.java
index 67a7c93..f3a36ff 100644
--- a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/aws/AwsS3InputStreamFactory.java
+++ b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/aws/AwsS3InputStreamFactory.java
@@ -103,10 +103,10 @@ public class AwsS3InputStreamFactory implements IInputStreamFactory {
             for (Map.Entry<String, String> entry : configuration.entrySet()) {
                 if (entry.getKey().startsWith(KEY_INCLUDE)) {
                     pattern = entry.getValue();
-                    includeMatchers.add(Pattern.compile(ExternalDataUtils.wildcardToRegex(pattern)).matcher(""));
+                    includeMatchers.add(Pattern.compile(ExternalDataUtils.patternToRegex(pattern)).matcher(""));
                 } else if (entry.getKey().startsWith(KEY_EXCLUDE)) {
                     pattern = entry.getValue();
-                    excludeMatchers.add(Pattern.compile(ExternalDataUtils.wildcardToRegex(pattern)).matcher(""));
+                    excludeMatchers.add(Pattern.compile(ExternalDataUtils.patternToRegex(pattern)).matcher(""));
                 }
             }
         } catch (PatternSyntaxException ex) {
diff --git a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataUtils.java b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataUtils.java
index 1300ac3..fc31286 100644
--- a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataUtils.java
+++ b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataUtils.java
@@ -23,10 +23,12 @@ import static org.apache.asterix.external.util.ExternalDataConstants.KEY_ESCAPE;
 import static org.apache.asterix.external.util.ExternalDataConstants.KEY_QUOTE;
 import static org.apache.asterix.external.util.ExternalDataConstants.KEY_RECORD_END;
 import static org.apache.asterix.external.util.ExternalDataConstants.KEY_RECORD_START;
+import static org.apache.asterix.runtime.evaluators.functions.StringEvaluatorUtils.RESERVED_REGEX_CHARS;
 
 import java.net.URI;
 import java.net.URISyntaxException;
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.EnumMap;
 import java.util.List;
 import java.util.Map;
@@ -498,76 +500,91 @@ public class ExternalDataUtils {
     /**
      * Converts the wildcard to proper regex
      *
-     * @param wildcard wildcard pattern to convert
+     * @param pattern wildcard pattern to convert
      *
      * @return regex expression
      */
-    public static String wildcardToRegex(String wildcard) {
-        StringBuilder builder = new StringBuilder(wildcard.length());
-        builder.append('^');
+    public static String patternToRegex(String pattern) {
+        int charPosition = 0;
+        int patternLength = pattern.length();
+        StringBuilder stuffBuilder = new StringBuilder();
+        StringBuilder result = new StringBuilder();
 
-        // This keeps an eye on the presence inside or outside a sequence, everything inside a sequence is a literal
-        // e.g ("*" ===> ".*" while "[*]" ===> "[\*]"
-        boolean outsideBracketSequence = true;
+        while (charPosition < patternLength) {
+            char c = pattern.charAt(charPosition);
+            charPosition++;
 
-        for (int i = 0; i < wildcard.length(); i++) {
-            char c = wildcard.charAt(i);
             switch (c) {
                 case '*':
-                    builder.append(outsideBracketSequence ? "." : "\\").append(c);
+                    result.append(".*");
                     break;
                 case '?':
-                    builder.append(outsideBracketSequence ? "." : "\\?");
+                    result.append(".");
                     break;
                 case '[':
-                    if (outsideBracketSequence) {
-                        outsideBracketSequence = false;
-                        builder.append(c);
-                        if (i + 1 < wildcard.length()) {
-                            if (wildcard.charAt(i + 1) == '!') {
-                                i++;
-                                builder.append('^');
-                            }
-                        }
-                    } else {
-                        // escape the open bracket "[" if we are already inside a bracket sequence
-                        builder.append("\\").append(c);
+                    int closingBracketPosition = charPosition;
+                    if (closingBracketPosition < patternLength && pattern.charAt(closingBracketPosition) == '!') {
+                        closingBracketPosition++;
                     }
-                    break;
-                case ']':
-                    if (outsideBracketSequence) {
-                        // escape if we are outside bracket sequence
-                        builder.append("\\").append(c);
+
+                    // 2 cases can happen here:
+                    // 1- Empty character class [] which is invalid for java, so treat ] as literal and find another
+                    // closing bracket, if no closing bracket is found, the whole thing is a literal
+                    // 2- Negated empty class [!] converted to [^] which is invalid for java, so treat ] as literal and
+                    // find another closing bracket, if no closing bracket is found, the whole thing is a literal
+                    if (closingBracketPosition < patternLength && pattern.charAt(closingBracketPosition) == ']') {
+                        closingBracketPosition++;
+                    }
+
+                    // No [] and [!] cases, search for the closing bracket
+                    while (closingBracketPosition < patternLength && pattern.charAt(closingBracketPosition) != ']') {
+                        closingBracketPosition++;
+                    }
+
+                    // No closing bracket found (or [] or [!]), escape the opening bracket, treat it as literals
+                    if (closingBracketPosition >= patternLength) {
+                        result.append("\\[");
                     } else {
-                        // Inside bracket, close it and mark as outside bracket
-                        outsideBracketSequence = true;
-                        builder.append(c);
+                        // Found closing bracket, get the stuff in between the found the character class ("[" and "]")
+                        String stuff = pattern.substring(charPosition, closingBracketPosition);
+
+                        stuffBuilder.setLength(0);
+                        int stuffCharPos = 0;
+
+                        // If first character in the character class is "!" then convert it to "^"
+                        if (stuff.charAt(0) == '!') {
+                            stuffBuilder.append('^');
+                            stuffCharPos++; // ignore first character when escaping metacharacters next step
+                        }
+
+                        for (; stuffCharPos < stuff.length(); stuffCharPos++) {
+                            char stuffChar = stuff.charAt(stuffCharPos);
+                            if (stuffChar != '-' && Arrays.binarySearch(RESERVED_REGEX_CHARS, stuffChar) >= 0) {
+                                stuffBuilder.append("\\");
+                            }
+                            stuffBuilder.append(stuffChar);
+                        }
+
+                        String stuffEscaped = stuffBuilder.toString();
+
+                        // Escape the set operations
+                        stuffEscaped = stuffEscaped.replace("&&", "\\&\\&").replace("~~", "\\~\\~")
+                                .replace("||", "\\|\\|").replace("--", "\\-\\-");
+
+                        result.append("[").append(stuffEscaped).append("]");
+                        charPosition = closingBracketPosition + 1;
                     }
                     break;
-                // escape special regexp-characters
-                case '(':
-                case ')':
-                case '$':
-                case '^':
-                case '.':
-                case '{':
-                case '}':
-                case '|':
-                case '+':
-                case '=':
-                case '<':
-                case '>':
-                case '!':
-                case '\\':
-                    builder.append("\\").append(c);
-                    break;
                 default:
-                    builder.append(c);
+                    if (Arrays.binarySearch(RESERVED_REGEX_CHARS, c) >= 0) {
+                        result.append("\\");
+                    }
+                    result.append(c);
                     break;
             }
         }
-        builder.append('$');
-        return builder.toString();
+
+        return result.toString();
     }
 
     public static class AwsS3 {
diff --git a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringEvaluatorUtils.java b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringEvaluatorUtils.java
index 81ce832..492d64c 100644
--- a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringEvaluatorUtils.java
+++ b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringEvaluatorUtils.java
@@ -60,8 +60,8 @@ public final class StringEvaluatorUtils {
         return destString;
     }
 
-    static final char[] RESERVED_REGEX_CHARS = new char[] { '\\', '(', ')', '[', ']', '{', '}', '.', '^', '$', '*', '|',
-            '+', '?', '<', '>', '-', '=', '!' };
+    public static final char[] RESERVED_REGEX_CHARS = new char[] { '\\', '(', ')', '[', ']', '{', '}', '.', '^', '$',
+            '*', '|', '+', '?', '<', '>', '-', '=', '!' };
 
     static {
         Arrays.sort(RESERVED_REGEX_CHARS);