You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@asterixdb.apache.org by mb...@apache.org on 2020/05/14 20:26:08 UTC

[asterixdb] 15/26: [ASTERIXDB-2720][EXT] Support arrays of objects for external dataset with json format

This is an automated email from the ASF dual-hosted git repository.

mblow pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/asterixdb.git

commit 5c0ff279b52c4867e8dcb67ba0aff3737079ea65
Author: Ali Alsuliman <al...@gmail.com>
AuthorDate: Tue May 5 15:36:19 2020 -0700

    [ASTERIXDB-2720][EXT] Support arrays of objects for external dataset with json format
    
    - user model changes: no
    - storage format changes: no
    - interface changes: no
    
    Details:
    Currently, the json format that the external dataset supports is
    an extended version of jsonl. The format is consecutive json objects
    (normally one object per line). Arrays of objects should
    be supported as valid values as well.
    
    Change-Id: I59462c274ec3186cf35ba4ce222a9ffc4056e00a
    Reviewed-on: https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/6063
    Integration-Tests: Jenkins <je...@fulliautomatix.ics.uci.edu>
    Tested-by: Jenkins <je...@fulliautomatix.ics.uci.edu>
    Reviewed-by: Ali Alsuliman <al...@gmail.com>
    Reviewed-by: Dmitry Lychagin <dm...@couchbase.com>
---
 .../jsonl/multi-lines-with-arrays/5-records.json   |  20 ++++
 .../multi-lines-with-nested-objects/5-records.json |  33 ++++++
 .../data/jsonl/multi-lines/20-records.json         |  33 ++++++
 .../data/jsonl/single-line/20-records.json         |  20 ++++
 .../aws/AwsS3ExternalDatasetTest.java              |  34 ++++--
 .../aws/s3/jsonl/query-dataset.000.s3bucket.sqlpp  |  20 ++++
 .../aws/s3/jsonl/query-dataset.001.ddl.sqlpp       |  36 +++++++
 .../aws/s3/jsonl/query-dataset.002.query.sqlpp     |  23 +++++
 .../aws/s3/jsonl/query-dataset.099.ddl.sqlpp       |  20 ++++
 .../aws/s3/jsonl/external_dataset.002.adm          |  50 +++++++++
 .../testsuite_external_dataset_one_partition.xml   |   5 +
 .../asterix/common/exceptions/ErrorCode.java       |   6 +-
 .../reader/stream/QuotedLineRecordReader.java      |   2 +-
 .../reader/stream/SemiStructuredRecordReader.java  | 115 ++++++++++++---------
 .../external/util/ExternalDataConstants.java       |   3 +
 .../asterix/external/util/ExternalDataUtils.java   |  77 +++++++-------
 16 files changed, 394 insertions(+), 103 deletions(-)

diff --git a/asterixdb/asterix-app/data/jsonl/multi-lines-with-arrays/5-records.json b/asterixdb/asterix-app/data/jsonl/multi-lines-with-arrays/5-records.json
new file mode 100644
index 0000000..5bd8cb6
--- /dev/null
+++ b/asterixdb/asterix-app/data/jsonl/multi-lines-with-arrays/5-records.json
@@ -0,0 +1,20 @@
+[]
+{"id": 21, "year": null, "quarter": null, "review": "good", "array":  [1, 2, 3]}
+
+
+{"id": 22, "year": null, "quarter": null, "review": "good", "array":  [1, [1, 2], [1]]}
+{"id": 23, "year": 2018, "quarter": null, "review": "good", "array":  [1,
+2, 3]}
+
+{"id": 24,
+  "year": 2018,
+  "quarter": null,
+  "review": "bad",
+  "array": [
+    1,
+    2,
+    3
+  ]
+}
+{"id": 25,
+  "year": 2018, "quarter": 1, "review": "good", "array":  [1,   2,       3]}
\ No newline at end of file
diff --git a/asterixdb/asterix-app/data/jsonl/multi-lines-with-nested-objects/5-records.json b/asterixdb/asterix-app/data/jsonl/multi-lines-with-nested-objects/5-records.json
new file mode 100644
index 0000000..063e30a
--- /dev/null
+++ b/asterixdb/asterix-app/data/jsonl/multi-lines-with-nested-objects/5-records.json
@@ -0,0 +1,33 @@
+{"id": 26, "year": null, "quarter": null, "review": "good", "array":  [1, 2, 3], "nested":  { "id" :  1}}
+
+[]
+[    ]
+[{"id": 27, "year": null, "quarter": null, "review": "good", "array":  [1, [1, 2], [1]], "nested":  { "id" : 1}, "nested2": [ {"id":1 }]},
+{"id":28,"year":2018,"quarter":null,"review":"good","array":[1,2,3,{"nested":{"array":[{"nested":{"array":[1,2]}}]}}]}]
+
+{
+  "id": 29,
+  "year": 2018,
+  "quarter": null,
+  "review": "bad",
+  "array": [
+    1,
+    2,
+    3,
+    {
+      "nested1": {
+        "id": 1,
+        "nested2": {
+          "id": 2,
+          "nested3": [
+            {
+              "nested4": null
+            }
+          ]
+        }
+      }
+    }
+  ]
+}
+{"id": 30,
+  "year": 2018, "quarter": 1, "review": "good", "array":  [1,   2,       3, {"nested": { "array": [1, 2]}}]}
\ No newline at end of file
diff --git a/asterixdb/asterix-app/data/jsonl/multi-lines/20-records.json b/asterixdb/asterix-app/data/jsonl/multi-lines/20-records.json
new file mode 100644
index 0000000..69aa775
--- /dev/null
+++ b/asterixdb/asterix-app/data/jsonl/multi-lines/20-records.json
@@ -0,0 +1,33 @@
+[{"id": 1, "year": null, "quarter": null, "review": "good"},
+
+
+{"id": 2, "year": null, "quarter": null, "review": "good"}   ,
+{"id": 3, "year": 2018, "quarter": null, "review": "good"}]
+
+{"id": 4,
+  "year": 2018,
+  "quarter": null,
+  "review": "bad"
+}
+{"id": 5,
+  "year": 2018, "quarter": 1, "review": "good"}
+{"id": 6, "year": 2018, "quarter": 1, "review": "bad"
+}
+{"id": 7, "year": 2018, "quarter": 2, "review": "good"}
+{"id": 8, "year": 2018, "quarter": 2, "review": "bad"}
+[{"id": 9, "year": 2019, "quarter": null,
+
+  "review": "good"},
+{"id": 10, "year": 2019,
+
+  "quarter": null,
+  "review": "bad"}    ,
+{"id": 11, "year": 2019, "quarter": 1, "review": "good"}]
+[{"id": 12, "year": 2019, "quarter": 1, "review": "bad"}]
+{"id": 13, "year": 2019, "quarter": 2, "review": "good"}
+{"id": 14, "year": 2019, "quarter": 2, "review": "bad"}
+{"id": 15, "year": 2019, "quarter": null, "review": "good"}
+{"id": 16, "year": 2019, "quarter": null, "review": "bad"}
+{"id": 17, "year": 2019, "quarter": 1, "review": "good"}
+[{"id": 18, "year": 2019, "quarter": 1, "review": "bad"}, {"id": 19, "year": 2019, "quarter": 2, "review": "good"},
+{"id": 20, "year": 2019, "quarter": 2, "review": "bad"}]
\ No newline at end of file
diff --git a/asterixdb/asterix-app/data/jsonl/single-line/20-records.json b/asterixdb/asterix-app/data/jsonl/single-line/20-records.json
new file mode 100644
index 0000000..bf86095
--- /dev/null
+++ b/asterixdb/asterix-app/data/jsonl/single-line/20-records.json
@@ -0,0 +1,20 @@
+[{"id": 31, "year": null, "quarter": null, "review": "good"},
+{"id": 32, "year": null, "quarter": null, "review": "good"},
+{"id": 33, "year": 2018, "quarter": null, "review": "good"},
+{"id": 34, "year": 2018, "quarter": null, "review": "bad"},
+{"id": 35, "year": 2018, "quarter": 1, "review": "good"},
+{"id": 36, "year": 2018, "quarter": 1, "review": "bad"},
+{"id": 37, "year": 2018, "quarter": 2, "review": "good"},
+{"id": 38, "year": 2018, "quarter": 2, "review": "bad"},
+{"id": 39, "year": 2019, "quarter": null, "review": "good"},
+{"id": 40, "year": 2019, "quarter": null, "review": "bad"},
+{"id": 41, "year": 2019, "quarter": 1, "review": "good"},
+{"id": 42, "year": 2019, "quarter": 1, "review": "bad"},
+{"id": 43, "year": 2019, "quarter": 2, "review": "good"},
+{"id": 44, "year": 2019, "quarter": 2, "review": "bad"},
+{"id": 45, "year": 2019, "quarter": null, "review": "good"},
+{"id": 46, "year": 2019, "quarter": null, "review": "bad"},
+{"id": 47, "year": 2019, "quarter": 1, "review": "good"},
+{"id": 48, "year": 2019, "quarter": 1, "review": "bad"},
+{"id": 49, "year": 2019, "quarter": 2, "review": "good"},
+{"id": 50, "year": 2019, "quarter": 2, "review": "bad"}]
\ No newline at end of file
diff --git a/asterixdb/asterix-app/src/test/java/org/apache/asterix/test/external_dataset/aws/AwsS3ExternalDatasetTest.java b/asterixdb/asterix-app/src/test/java/org/apache/asterix/test/external_dataset/aws/AwsS3ExternalDatasetTest.java
index 5e2d2de..b5866e9 100644
--- a/asterixdb/asterix-app/src/test/java/org/apache/asterix/test/external_dataset/aws/AwsS3ExternalDatasetTest.java
+++ b/asterixdb/asterix-app/src/test/java/org/apache/asterix/test/external_dataset/aws/AwsS3ExternalDatasetTest.java
@@ -365,7 +365,7 @@ public class AwsS3ExternalDatasetTest {
             String[] lines;
             switch (ctx.getType()) {
                 case "s3bucket":
-                    // <bucket_name> <def_name> <file1,file2,file3>
+                    // <bucket_name> <def_name> <sub-path:src_file1,sub-path:src_file2,sub-path:src_file3>
                     lines = TestExecutor.stripAllComments(statement).trim().split("\n");
                     String lastLine = lines[lines.length - 1];
                     String[] command = lastLine.trim().split(" ");
@@ -386,7 +386,7 @@ public class AwsS3ExternalDatasetTest {
         String definitionPath = definition + (definition.endsWith("/") ? "" : "/");
         String[] fileSplits = files.split(",");
 
-        LOGGER.info("Dropping bucket");
+        LOGGER.info("Dropping bucket " + bucketName);
         try {
             client.deleteBucket(DELETE_BUCKET_BUILDER.bucket(bucketName).build());
         } catch (NoSuchBucketException e) {
@@ -397,13 +397,31 @@ public class AwsS3ExternalDatasetTest {
         LOGGER.info("Uploading to bucket " + bucketName + " definition " + definitionPath);
         fileNames.clear();
         for (int i = 0; i < fileSplits.length; i++) {
-            String fileName = FilenameUtils.getName(fileSplits[i]);
-            while (fileNames.contains(fileName)) {
-                fileName = (i + 1) + fileName;
+            String[] s3pathAndSourceFile = fileSplits[i].split(":");
+            int size = s3pathAndSourceFile.length;
+            String path;
+            String sourceFilePath;
+            String sourceFileName;
+            if (size == 1) {
+                // case: playground json-data/reviews SOURCE_FILE1,SOURCE_FILE2
+                path = definitionPath;
+                sourceFilePath = s3pathAndSourceFile[0];
+                sourceFileName = FilenameUtils.getName(s3pathAndSourceFile[0]);
+            } else {
+                // case: playground json-data/reviews level1/sub-level:SOURCE_FILE1,level2/sub-level:SOURCE_FILE2
+                path = definitionPath + s3pathAndSourceFile[0] + (s3pathAndSourceFile[0].endsWith("/") ? "" : "/");
+                sourceFilePath = s3pathAndSourceFile[1];
+                sourceFileName = FilenameUtils.getName(s3pathAndSourceFile[1]);
+            }
+
+            String keyPath = path + sourceFileName;
+            int k = 1;
+            while (fileNames.contains(keyPath)) {
+                keyPath = path + (k++) + sourceFileName;
             }
-            fileNames.add(fileName);
-            client.putObject(PUT_OBJECT_BUILDER.bucket(bucketName).key(definitionPath + fileName).build(),
-                    RequestBody.fromFile(Paths.get(fileSplits[i])));
+            fileNames.add(keyPath);
+            client.putObject(PUT_OBJECT_BUILDER.bucket(bucketName).key(keyPath).build(),
+                    RequestBody.fromFile(Paths.get(sourceFilePath)));
         }
         LOGGER.info("Done creating bucket with data");
     }
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/aws/s3/jsonl/query-dataset.000.s3bucket.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/aws/s3/jsonl/query-dataset.000.s3bucket.sqlpp
new file mode 100644
index 0000000..5def284
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/aws/s3/jsonl/query-dataset.000.s3bucket.sqlpp
@@ -0,0 +1,20 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+// create S3 bucket with data
+playground data_dir level1a:data/jsonl/multi-lines/20-records.json,level1a/level2a:data/jsonl/multi-lines-with-arrays/5-records.json,level1b:data/jsonl/multi-lines-with-nested-objects/5-records.json,level1b/level2b:data/jsonl/single-line/20-records.json
\ No newline at end of file
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/aws/s3/jsonl/query-dataset.001.ddl.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/aws/s3/jsonl/query-dataset.001.ddl.sqlpp
new file mode 100644
index 0000000..6b2e0cd
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/aws/s3/jsonl/query-dataset.001.ddl.sqlpp
@@ -0,0 +1,36 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+DROP DATAVERSE test IF EXISTS;
+CREATE DATAVERSE test;
+USE test;
+
+DROP TYPE t1 IF EXISTS;
+CREATE TYPE t1 AS OPEN {};
+
+DROP DATASET ds1 IF EXISTS;
+CREATE EXTERNAL DATASET ds1(t1) USING S3 (
+("accessKey"="dummyAccessKey"),
+("secretKey"="dummySecretKey"),
+("region"="us-west-2"),
+("serviceEndpoint"="http://localhost:8001"),
+("container"="playground"),
+("definition"="data_dir"),
+("format"="json")
+);
\ No newline at end of file
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/aws/s3/jsonl/query-dataset.002.query.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/aws/s3/jsonl/query-dataset.002.query.sqlpp
new file mode 100644
index 0000000..87490e6
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/aws/s3/jsonl/query-dataset.002.query.sqlpp
@@ -0,0 +1,23 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+// requesttype=application/json
+// param max-warnings:json=100
+USE test;
+
+FROM ds1 v SELECT VALUE v ORDER BY v.id ASC;
\ No newline at end of file
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/aws/s3/jsonl/query-dataset.099.ddl.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/aws/s3/jsonl/query-dataset.099.ddl.sqlpp
new file mode 100644
index 0000000..36b2bab
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/aws/s3/jsonl/query-dataset.099.ddl.sqlpp
@@ -0,0 +1,20 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+DROP DATAVERSE test IF EXISTS;
\ No newline at end of file
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/results/external-dataset/aws/s3/jsonl/external_dataset.002.adm b/asterixdb/asterix-app/src/test/resources/runtimets/results/external-dataset/aws/s3/jsonl/external_dataset.002.adm
new file mode 100644
index 0000000..bf2b0e1
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/results/external-dataset/aws/s3/jsonl/external_dataset.002.adm
@@ -0,0 +1,50 @@
+{ "id": 1, "year": null, "quarter": null, "review": "good" }
+{ "id": 2, "year": null, "quarter": null, "review": "good" }
+{ "id": 3, "year": 2018, "quarter": null, "review": "good" }
+{ "id": 4, "year": 2018, "quarter": null, "review": "bad" }
+{ "id": 5, "year": 2018, "quarter": 1, "review": "good" }
+{ "id": 6, "year": 2018, "quarter": 1, "review": "bad" }
+{ "id": 7, "year": 2018, "quarter": 2, "review": "good" }
+{ "id": 8, "year": 2018, "quarter": 2, "review": "bad" }
+{ "id": 9, "year": 2019, "quarter": null, "review": "good" }
+{ "id": 10, "year": 2019, "quarter": null, "review": "bad" }
+{ "id": 11, "year": 2019, "quarter": 1, "review": "good" }
+{ "id": 12, "year": 2019, "quarter": 1, "review": "bad" }
+{ "id": 13, "year": 2019, "quarter": 2, "review": "good" }
+{ "id": 14, "year": 2019, "quarter": 2, "review": "bad" }
+{ "id": 15, "year": 2019, "quarter": null, "review": "good" }
+{ "id": 16, "year": 2019, "quarter": null, "review": "bad" }
+{ "id": 17, "year": 2019, "quarter": 1, "review": "good" }
+{ "id": 18, "year": 2019, "quarter": 1, "review": "bad" }
+{ "id": 19, "year": 2019, "quarter": 2, "review": "good" }
+{ "id": 20, "year": 2019, "quarter": 2, "review": "bad" }
+{ "id": 21, "year": null, "quarter": null, "review": "good", "array": [ 1, 2, 3 ] }
+{ "id": 22, "year": null, "quarter": null, "review": "good", "array": [ 1, [ 1, 2 ], [ 1 ] ] }
+{ "id": 23, "year": 2018, "quarter": null, "review": "good", "array": [ 1, 2, 3 ] }
+{ "id": 24, "year": 2018, "quarter": null, "review": "bad", "array": [ 1, 2, 3 ] }
+{ "id": 25, "year": 2018, "quarter": 1, "review": "good", "array": [ 1, 2, 3 ] }
+{ "id": 26, "year": null, "quarter": null, "review": "good", "array": [ 1, 2, 3 ], "nested": { "id": 1 } }
+{ "id": 27, "year": null, "quarter": null, "review": "good", "array": [ 1, [ 1, 2 ], [ 1 ] ], "nested": { "id": 1 }, "nested2": [ { "id": 1 } ] }
+{ "id": 28, "year": 2018, "quarter": null, "review": "good", "array": [ 1, 2, 3, { "nested": { "array": [ { "nested": { "array": [ 1, 2 ] } } ] } } ] }
+{ "id": 29, "year": 2018, "quarter": null, "review": "bad", "array": [ 1, 2, 3, { "nested1": { "id": 1, "nested2": { "id": 2, "nested3": [ { "nested4": null } ] } } } ] }
+{ "id": 30, "year": 2018, "quarter": 1, "review": "good", "array": [ 1, 2, 3, { "nested": { "array": [ 1, 2 ] } } ] }
+{ "id": 31, "year": null, "quarter": null, "review": "good" }
+{ "id": 32, "year": null, "quarter": null, "review": "good" }
+{ "id": 33, "year": 2018, "quarter": null, "review": "good" }
+{ "id": 34, "year": 2018, "quarter": null, "review": "bad" }
+{ "id": 35, "year": 2018, "quarter": 1, "review": "good" }
+{ "id": 36, "year": 2018, "quarter": 1, "review": "bad" }
+{ "id": 37, "year": 2018, "quarter": 2, "review": "good" }
+{ "id": 38, "year": 2018, "quarter": 2, "review": "bad" }
+{ "id": 39, "year": 2019, "quarter": null, "review": "good" }
+{ "id": 40, "year": 2019, "quarter": null, "review": "bad" }
+{ "id": 41, "year": 2019, "quarter": 1, "review": "good" }
+{ "id": 42, "year": 2019, "quarter": 1, "review": "bad" }
+{ "id": 43, "year": 2019, "quarter": 2, "review": "good" }
+{ "id": 44, "year": 2019, "quarter": 2, "review": "bad" }
+{ "id": 45, "year": 2019, "quarter": null, "review": "good" }
+{ "id": 46, "year": 2019, "quarter": null, "review": "bad" }
+{ "id": 47, "year": 2019, "quarter": 1, "review": "good" }
+{ "id": 48, "year": 2019, "quarter": 1, "review": "bad" }
+{ "id": 49, "year": 2019, "quarter": 2, "review": "good" }
+{ "id": 50, "year": 2019, "quarter": 2, "review": "bad" }
\ No newline at end of file
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_external_dataset_one_partition.xml b/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_external_dataset_one_partition.xml
index e194c86..0597d8f 100644
--- a/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_external_dataset_one_partition.xml
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_external_dataset_one_partition.xml
@@ -73,5 +73,10 @@
         <expected-warn>Parsing error at data_dir/no_h_missing_fields.tsv record 2 field 3: some fields are missing</expected-warn>
       </compilation-unit>
     </test-case>
+    <test-case FilePath="external-dataset" check-warnings="true">
+      <compilation-unit name="aws/s3/jsonl">
+        <output-dir compare="Text">aws/s3/jsonl</output-dir>
+      </compilation-unit>
+    </test-case>
   </test-group>
 </test-suite>
diff --git a/asterixdb/asterix-common/src/main/java/org/apache/asterix/common/exceptions/ErrorCode.java b/asterixdb/asterix-common/src/main/java/org/apache/asterix/common/exceptions/ErrorCode.java
index cfe4646..6ec0ea2 100644
--- a/asterixdb/asterix-common/src/main/java/org/apache/asterix/common/exceptions/ErrorCode.java
+++ b/asterixdb/asterix-common/src/main/java/org/apache/asterix/common/exceptions/ErrorCode.java
@@ -240,9 +240,9 @@ public class ErrorCode {
     public static final int LIBRARY_JAVA_FUNCTION_HELPER_OBJ_TYPE_NOT_SUPPORTED = 3046;
     public static final int LIBRARY_EXTERNAL_FUNCTION_UNSUPPORTED_NAME = 3047;
     public static final int OPERATORS_FEED_META_OPERATOR_DESCRIPTOR_INVALID_RUNTIME = 3048;
-    public static final int PARSER_FACTORY_DELIMITED_DATA_PARSER_FACTORY_NOT_VALID_DELIMITER = 3049;
-    public static final int PARSER_INVALID_CHAR_LENGTH = 3050;
-    public static final int PARSER_FACTORY_DELIMITED_DATA_PARSER_FACTORY_QUOTE_DELIMITER_MISMATCH = 3051;
+    public static final int INVALID_DELIMITER = 3049;
+    public static final int INVALID_CHAR_LENGTH = 3050;
+    public static final int QUOTE_DELIMITER_MISMATCH = 3051;
     public static final int INDEXING_EXTERNAL_FILE_INDEX_ACCESSOR_UNABLE_TO_FIND_FILE_INDEX = 3052;
     public static final int PARSER_ADM_DATA_PARSER_FIELD_NOT_NULL = 3053;
     public static final int PARSER_ADM_DATA_PARSER_TYPE_MISMATCH = 3054;
diff --git a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/stream/QuotedLineRecordReader.java b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/stream/QuotedLineRecordReader.java
index e6e8ae0..81b8e41 100644
--- a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/stream/QuotedLineRecordReader.java
+++ b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/stream/QuotedLineRecordReader.java
@@ -52,7 +52,7 @@ public class QuotedLineRecordReader extends LineRecordReader {
         super.configure(ctx, inputStream, config);
         this.warnings = ctx.getWarningCollector();
         String quoteString = config.get(ExternalDataConstants.KEY_QUOTE);
-        ExternalDataUtils.validateQuote(quoteString);
+        ExternalDataUtils.validateChar(quoteString, ExternalDataConstants.KEY_QUOTE);
         this.quote = quoteString.charAt(0);
         this.escape = ExternalDataUtils.validateGetEscape(config);
     }
diff --git a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/stream/SemiStructuredRecordReader.java b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/stream/SemiStructuredRecordReader.java
index fa4a4a5..1fb5b25 100644
--- a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/stream/SemiStructuredRecordReader.java
+++ b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/stream/SemiStructuredRecordReader.java
@@ -18,6 +18,15 @@
  */
 package org.apache.asterix.external.input.record.reader.stream;
 
+import static org.apache.asterix.external.util.ExternalDataConstants.CLOSING_BRACKET;
+import static org.apache.asterix.external.util.ExternalDataConstants.COMMA;
+import static org.apache.asterix.external.util.ExternalDataConstants.CR;
+import static org.apache.asterix.external.util.ExternalDataConstants.KEY_RECORD_END;
+import static org.apache.asterix.external.util.ExternalDataConstants.LF;
+import static org.apache.asterix.external.util.ExternalDataConstants.OPEN_BRACKET;
+import static org.apache.asterix.external.util.ExternalDataConstants.SPACE;
+import static org.apache.asterix.external.util.ExternalDataConstants.TAB;
+
 import java.io.IOException;
 import java.util.Arrays;
 import java.util.Collections;
@@ -25,21 +34,32 @@ import java.util.List;
 import java.util.Map;
 
 import org.apache.asterix.common.exceptions.ErrorCode;
-import org.apache.asterix.common.exceptions.ExceptionUtils;
 import org.apache.asterix.common.exceptions.RuntimeDataException;
 import org.apache.asterix.external.api.AsterixInputStream;
 import org.apache.asterix.external.util.ExternalDataConstants;
+import org.apache.asterix.external.util.ExternalDataUtils;
 import org.apache.hyracks.api.context.IHyracksTaskContext;
 import org.apache.hyracks.api.exceptions.HyracksDataException;
 
 public class SemiStructuredRecordReader extends StreamRecordReader {
 
+    private enum State {
+        TOP_LEVEL, // valid chars at this state: '{' or '[' to start a new record or array of records
+        ARRAY, // valid chars at this state: '{' or ']' to start the first nested record or close the array
+        NESTED_OBJECT, // valid chars at this state: ',' or ']' to expect another nested record or close the array
+        AFTER_COMMA // valid chars at this state: '{' to start a new nested record
+    }
+
     private int depth;
     private boolean prevCharEscape;
     private boolean inString;
     private char recordStart;
     private char recordEnd;
+    private boolean hasStarted;
+    private boolean hasFinished;
     private int recordNumber = 0;
+    private State state = State.TOP_LEVEL;
+
     private static final List<String> recordReaderFormats = Collections.unmodifiableList(
             Arrays.asList(ExternalDataConstants.FORMAT_ADM, ExternalDataConstants.FORMAT_JSON_LOWER_CASE,
                     ExternalDataConstants.FORMAT_JSON_UPPER_CASE, ExternalDataConstants.FORMAT_SEMISTRUCTURED));
@@ -49,32 +69,26 @@ public class SemiStructuredRecordReader extends StreamRecordReader {
     public void configure(IHyracksTaskContext ctx, AsterixInputStream stream, Map<String, String> config)
             throws HyracksDataException {
         super.configure(stream, config);
-        String recStartString = config.get(ExternalDataConstants.KEY_RECORD_START);
-        String recEndString = config.get(ExternalDataConstants.KEY_RECORD_END);
+        stream.setNotificationHandler(this);
         // set record opening char
-        if (recStartString != null) {
-            if (recStartString.length() != 1) {
-                throw new HyracksDataException(
-                        ExceptionUtils.incorrectParameterMessage(ExternalDataConstants.KEY_RECORD_START,
-                                ExternalDataConstants.PARAMETER_OF_SIZE_ONE, recStartString));
-            }
-            recordStart = recStartString.charAt(0);
-        } else {
-            recordStart = ExternalDataConstants.DEFAULT_RECORD_START;
-        }
+        recordStart = ExternalDataUtils.validateGetRecordStart(config);
         // set record ending char
-        if (recEndString != null) {
-            if (recEndString.length() != 1) {
-                throw new HyracksDataException(
-                        ExceptionUtils.incorrectParameterMessage(ExternalDataConstants.KEY_RECORD_END,
-                                ExternalDataConstants.PARAMETER_OF_SIZE_ONE, recEndString));
-            }
-            recordEnd = recEndString.charAt(0);
-        } else {
-            recordEnd = ExternalDataConstants.DEFAULT_RECORD_END;
+        recordEnd = ExternalDataUtils.validateGetRecordEnd(config);
+        if (recordStart == recordEnd) {
+            throw new RuntimeDataException(ErrorCode.INVALID_REQ_PARAM_VAL, KEY_RECORD_END, recordEnd);
         }
     }
 
+    @Override
+    public void notifyNewSource() {
+        if (hasStarted) {
+            // TODO(ali): WARN
+        }
+        recordNumber = 0;
+        state = State.TOP_LEVEL;
+        resetForNewRecord();
+    }
+
     public int getRecordNumber() {
         return recordNumber;
     }
@@ -84,12 +98,7 @@ public class SemiStructuredRecordReader extends StreamRecordReader {
         if (done) {
             return false;
         }
-        record.reset();
-        boolean hasStarted = false;
-        boolean hasFinished = false;
-        prevCharEscape = false;
-        inString = false;
-        depth = 0;
+        resetForNewRecord();
         do {
             int startPosn = bufferPosn; // starting from where we left off the last time
             if (bufferPosn >= bufferLength) {
@@ -102,16 +111,30 @@ public class SemiStructuredRecordReader extends StreamRecordReader {
             }
             if (!hasStarted) {
                 for (; bufferPosn < bufferLength; ++bufferPosn) { // search for record begin
-                    if (inputBuffer[bufferPosn] == recordStart) {
+                    char c = inputBuffer[bufferPosn];
+                    if (c == SPACE || c == TAB || c == LF || c == CR) {
+                        continue;
+                    }
+                    if (c == recordStart && state != State.NESTED_OBJECT) {
+                        // '{' is allowed at the top level, after '[' and after ','
+                        if (state == State.ARRAY || state == State.AFTER_COMMA) {
+                            state = State.NESTED_OBJECT;
+                        }
                         startPosn = bufferPosn;
                         hasStarted = true;
                         depth = 1;
-                        ++bufferPosn; // at next invocation proceed from following byte
+                        ++bufferPosn;
                         break;
-                    } else if (inputBuffer[bufferPosn] != ExternalDataConstants.SPACE
-                            && inputBuffer[bufferPosn] != ExternalDataConstants.TAB
-                            && inputBuffer[bufferPosn] != ExternalDataConstants.LF
-                            && inputBuffer[bufferPosn] != ExternalDataConstants.CR) {
+                    } else if (c == OPEN_BRACKET && state == State.TOP_LEVEL) {
+                        // '[' is allowed at the top level only
+                        state = State.ARRAY;
+                    } else if (c == CLOSING_BRACKET && (state == State.ARRAY || state == State.NESTED_OBJECT)) {
+                        // ']' is allowed after '[' and after capturing a record in an array
+                        state = State.TOP_LEVEL;
+                    } else if (c == COMMA && state == State.NESTED_OBJECT) {
+                        // ',' is allowed after capturing a record in an array
+                        state = State.AFTER_COMMA;
+                    } else {
                         // corrupted file. clear the buffer and stop reading
                         reader.reset();
                         bufferPosn = bufferLength = 0;
@@ -120,17 +143,13 @@ public class SemiStructuredRecordReader extends StreamRecordReader {
                 }
             }
             if (hasStarted) {
-                for (; bufferPosn < bufferLength; ++bufferPosn) { // search for record begin
+                for (; bufferPosn < bufferLength; ++bufferPosn) {
                     if (inString) {
                         // we are in a string, we only care about the string end
                         if (inputBuffer[bufferPosn] == ExternalDataConstants.QUOTE && !prevCharEscape) {
                             inString = false;
                         }
-                        if (prevCharEscape) {
-                            prevCharEscape = false;
-                        } else {
-                            prevCharEscape = inputBuffer[bufferPosn] == ExternalDataConstants.ESCAPE;
-                        }
+                        prevCharEscape = inputBuffer[bufferPosn] == ExternalDataConstants.ESCAPE && !prevCharEscape;
                     } else {
                         if (inputBuffer[bufferPosn] == ExternalDataConstants.QUOTE) {
                             inString = true;
@@ -174,14 +193,12 @@ public class SemiStructuredRecordReader extends StreamRecordReader {
         return REQUIRED_CONFIGS;
     }
 
-    @Override
-    public boolean stop() {
-        try {
-            reader.stop();
-        } catch (Exception e) {
-            e.printStackTrace();
-            return false;
-        }
-        return true;
+    private void resetForNewRecord() { // resets the record and all per-record scan state
+        record.reset();
+        hasStarted = false; // no record-start character has been seen yet
+        hasFinished = false;
+        prevCharEscape = false; // escape and string tracking restart with every record
+        inString = false;
+        depth = 0; // the scan loop sets this to 1 when it finds the record start
     }
 }
diff --git a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataConstants.java b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataConstants.java
index ce95435..9390d59 100644
--- a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataConstants.java
+++ b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataConstants.java
@@ -235,6 +235,9 @@ public class ExternalDataConstants {
     public static final char CR = '\r';
     public static final char DEFAULT_RECORD_START = '{';
     public static final char DEFAULT_RECORD_END = '}';
+    public static final char OPEN_BRACKET = '['; // opens a JSON array
+    public static final char CLOSING_BRACKET = ']'; // closes a JSON array
+    public static final char COMMA = ','; // separates the elements of a JSON array
 
     /**
      * Constant byte characters
diff --git a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataUtils.java b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataUtils.java
index 409b69b..8fb17bc 100644
--- a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataUtils.java
+++ b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataUtils.java
@@ -18,6 +18,11 @@
  */
 package org.apache.asterix.external.util;
 
+import static org.apache.asterix.external.util.ExternalDataConstants.KEY_DELIMITER;
+import static org.apache.asterix.external.util.ExternalDataConstants.KEY_ESCAPE;
+import static org.apache.asterix.external.util.ExternalDataConstants.KEY_QUOTE;
+import static org.apache.asterix.external.util.ExternalDataConstants.KEY_RECORD_END;
+import static org.apache.asterix.external.util.ExternalDataConstants.KEY_RECORD_START;
 import static org.apache.asterix.external.util.ExternalDataConstants.KEY_REDACT_WARNINGS;
 
 import java.util.EnumMap;
@@ -61,34 +66,27 @@ public class ExternalDataUtils {
 
     // Get a delimiter from the given configuration
     public static char validateGetDelimiter(Map<String, String> configuration) throws HyracksDataException {
-        String delimiterValue = configuration.get(ExternalDataConstants.KEY_DELIMITER);
-        if (delimiterValue == null) {
-            return ExternalDataConstants.DEFAULT_DELIMITER.charAt(0);
-        }
-        validateDelimiter(delimiterValue);
-        return delimiterValue.charAt(0);
+        return validateCharOrDefault(configuration, KEY_DELIMITER, ExternalDataConstants.DEFAULT_DELIMITER.charAt(0));
     }
 
     // Get a quote from the given configuration when the delimiter is given
     // Need to pass delimiter to check whether they share the same character
     public static char validateGetQuote(Map<String, String> configuration, char delimiter) throws HyracksDataException {
-        String quoteValue = configuration.get(ExternalDataConstants.KEY_QUOTE);
-        if (quoteValue == null) {
-            return ExternalDataConstants.DEFAULT_QUOTE.charAt(0);
-        }
-        validateQuote(quoteValue);
-        char quote = quoteValue.charAt(0);
+        char quote = validateCharOrDefault(configuration, KEY_QUOTE, ExternalDataConstants.DEFAULT_QUOTE.charAt(0));
         validateDelimiterAndQuote(delimiter, quote);
         return quote;
     }
 
     public static char validateGetEscape(Map<String, String> configuration) throws HyracksDataException {
-        String escapeValue = configuration.get(ExternalDataConstants.KEY_ESCAPE);
-        if (escapeValue == null) {
-            return ExternalDataConstants.ESCAPE;
-        }
-        validateEscape(escapeValue);
-        return escapeValue.charAt(0);
+        return validateCharOrDefault(configuration, KEY_ESCAPE, ExternalDataConstants.ESCAPE);
+    }
+
+    public static char validateGetRecordStart(Map<String, String> configuration) throws HyracksDataException {
+        return validateCharOrDefault(configuration, KEY_RECORD_START, ExternalDataConstants.DEFAULT_RECORD_START);
+    }
+
+    public static char validateGetRecordEnd(Map<String, String> configuration) throws HyracksDataException {
+        return validateCharOrDefault(configuration, KEY_RECORD_END, ExternalDataConstants.DEFAULT_RECORD_END);
     }
 
     public static void validateDataParserParameters(Map<String, String> configuration) throws AsterixException {
@@ -329,13 +327,13 @@ public class ExternalDataUtils {
         if (format != null) {
             // default quote, escape character for quote and fields delimiter for csv and tsv format
             if (format.equals(ExternalDataConstants.FORMAT_CSV)) {
-                configuration.putIfAbsent(ExternalDataConstants.KEY_DELIMITER, ExternalDataConstants.DEFAULT_DELIMITER);
-                configuration.putIfAbsent(ExternalDataConstants.KEY_QUOTE, ExternalDataConstants.DEFAULT_QUOTE);
-                configuration.putIfAbsent(ExternalDataConstants.KEY_ESCAPE, ExternalDataConstants.DEFAULT_QUOTE);
+                configuration.putIfAbsent(KEY_DELIMITER, ExternalDataConstants.DEFAULT_DELIMITER);
+                configuration.putIfAbsent(KEY_QUOTE, ExternalDataConstants.DEFAULT_QUOTE);
+                configuration.putIfAbsent(KEY_ESCAPE, ExternalDataConstants.DEFAULT_QUOTE);
             } else if (format.equals(ExternalDataConstants.FORMAT_TSV)) {
-                configuration.putIfAbsent(ExternalDataConstants.KEY_DELIMITER, ExternalDataConstants.TAB_STR);
-                configuration.putIfAbsent(ExternalDataConstants.KEY_QUOTE, ExternalDataConstants.NULL_STR);
-                configuration.putIfAbsent(ExternalDataConstants.KEY_ESCAPE, ExternalDataConstants.NULL_STR);
+                configuration.putIfAbsent(KEY_DELIMITER, ExternalDataConstants.TAB_STR);
+                configuration.putIfAbsent(KEY_QUOTE, ExternalDataConstants.NULL_STR);
+                configuration.putIfAbsent(KEY_ESCAPE, ExternalDataConstants.NULL_STR);
             }
         }
     }
@@ -411,30 +409,25 @@ public class ExternalDataUtils {
         return value.equals(ExternalDataConstants.TRUE) || value.equals(ExternalDataConstants.FALSE);
     }
 
-    private static void validateDelimiter(String delimiter) throws RuntimeDataException {
-        if (delimiter.length() != 1) {
-            throw new RuntimeDataException(ErrorCode.PARSER_FACTORY_DELIMITED_DATA_PARSER_FACTORY_NOT_VALID_DELIMITER,
-                    delimiter);
-        }
-    }
-
-    public static void validateQuote(String quote) throws RuntimeDataException {
-        if (quote.length() != 1) {
-            throw new RuntimeDataException(ErrorCode.PARSER_INVALID_CHAR_LENGTH, quote,
-                    ExternalDataConstants.KEY_QUOTE);
+    private static void validateDelimiterAndQuote(char delimiter, char quote) throws RuntimeDataException {
+        if (quote == delimiter) { // the quote character must differ from the delimiter
+            throw new RuntimeDataException(ErrorCode.QUOTE_DELIMITER_MISMATCH, quote, delimiter);
         }
     }
 
-    private static void validateEscape(String esc) throws RuntimeDataException {
-        if (esc.length() != 1) {
-            throw new RuntimeDataException(ErrorCode.PARSER_INVALID_CHAR_LENGTH, esc, ExternalDataConstants.KEY_ESCAPE);
+    private static char validateCharOrDefault(Map<String, String> configuration, String key, char defaultValue)
+            throws HyracksDataException { // defaultValue if key is absent, else the configured char
+        String value = configuration.get(key);
+        if (value == null) {
+            return defaultValue;
         }
+        validateChar(value, key); // throws unless the value is exactly one character long
+        return value.charAt(0);
     }
 
-    private static void validateDelimiterAndQuote(char delimiter, char quote) throws RuntimeDataException {
-        if (quote == delimiter) {
-            throw new RuntimeDataException(
-                    ErrorCode.PARSER_FACTORY_DELIMITED_DATA_PARSER_FACTORY_QUOTE_DELIMITER_MISMATCH, quote, delimiter);
+    public static void validateChar(String parameterValue, String parameterName) throws RuntimeDataException {
+        if (parameterValue.length() != 1) { // only single-character values are accepted
+            throw new RuntimeDataException(ErrorCode.INVALID_CHAR_LENGTH, parameterValue, parameterName);
         }
     }
 }