You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@asterixdb.apache.org by al...@apache.org on 2023/02/27 20:29:12 UTC

[asterixdb] 12/16: [ASTERIXDB-3117][EXT] Allow specifying a subpath for EXTERNAL datasets queries

This is an automated email from the ASF dual-hosted git repository.

alsuliman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/asterixdb.git

commit ba91b1cdec9e8ed9fa8c5e5cc8502c987abe2bad
Author: Ali Alsuliman <al...@gmail.com>
AuthorDate: Wed Feb 22 21:33:55 2023 -0800

    [ASTERIXDB-3117][EXT] Allow specifying a subpath for EXTERNAL datasets queries
    
    - user model changes: no
    - storage format changes: no
    - interface changes: yes
    
    Details:
    This patch allows users to specify a subpath in queries that scan
    external data sources such as S3.
    
    - add "subpath" hint to allow specifying a subpath to be used in
      conjunction with the "definition" of external datasets.
    - capture the "subpath" hint for each term in the FROM clause.
    - pass the hint to the properties of IDataSource and expose them.
    - Op Isomorphism: compare IDataSource properties only for EXTERNAL datasets.
    
    Change-Id: I3d1ec61fd8aa3275260c1aed6d00fa3c7b408351
    Reviewed-on: https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/17395
    Integration-Tests: Jenkins <je...@fulliautomatix.ics.uci.edu>
    Tested-by: Jenkins <je...@fulliautomatix.ics.uci.edu>
    Reviewed-by: Ali Alsuliman <al...@gmail.com>
    Reviewed-by: Murtadha Hubail <mh...@apache.org>
---
 .../SqlppExpressionToPlanTranslator.java           | 11 ++++
 .../asterix/app/function/DatasetRewriter.java      | 13 +++++
 .../json/json/external_dataset.000.ddl.sqlpp       | 23 ++++++++
 .../json/json/external_dataset.008.query.sqlpp     | 24 ++-------
 .../json/json/external_dataset.009.query.sqlpp     | 24 ++-------
 .../json/json/external_dataset.010.query.sqlpp     | 24 ++-------
 .../json/json/external_dataset.011.query.sqlpp     | 26 ++-------
 .../json/json/external_dataset.012.query.sqlpp     | 26 ++-------
 .../json/json/external_dataset.013.query.sqlpp     | 26 ++-------
 .../json/json/external_dataset.014.query.sqlpp     | 26 ++-------
 .../common/json/json/external_dataset.008.adm      | 25 +++++++++
 .../common/json/json/external_dataset.009.adm      | 25 +++++++++
 .../common/json/json/external_dataset.010.adm      |  1 +
 .../common/json/json/external_dataset.011.plan     | 48 +++++++++++++++++
 .../common/json/json/external_dataset.012.adm      |  1 +
 .../common/json/json/external_dataset.013.plan     | 62 ++++++++++++++++++++++
 .../common/json/json/external_dataset.014.adm      |  1 +
 .../annotations/ExternalSubpathAnnotation.java     | 26 ++++-----
 .../external/util/ExternalDataConstants.java       |  1 +
 .../asterix/external/util/ExternalDataUtils.java   | 22 +++++++-
 .../asterix/lang/sqlpp/parser/SqlppHint.java       |  1 +
 .../visitor/VariableCheckAndRewriteVisitor.java    |  1 +
 .../asterix-lang-sqlpp/src/main/javacc/SQLPP.jj    | 23 +++++++-
 .../metadata/declared/DatasetDataSource.java       | 20 ++++++-
 .../core/algebra/metadata/IDataSource.java         |  8 +++
 .../visitors/IsomorphismOperatorVisitor.java       |  7 +++
 26 files changed, 327 insertions(+), 168 deletions(-)

diff --git a/asterixdb/asterix-algebra/src/main/java/org/apache/asterix/translator/SqlppExpressionToPlanTranslator.java b/asterixdb/asterix-algebra/src/main/java/org/apache/asterix/translator/SqlppExpressionToPlanTranslator.java
index 3c4537f98d..ddabaa0934 100644
--- a/asterixdb/asterix-algebra/src/main/java/org/apache/asterix/translator/SqlppExpressionToPlanTranslator.java
+++ b/asterixdb/asterix-algebra/src/main/java/org/apache/asterix/translator/SqlppExpressionToPlanTranslator.java
@@ -18,6 +18,8 @@
  */
 package org.apache.asterix.translator;
 
+import static org.apache.asterix.external.util.ExternalDataConstants.SUBPATH;
+
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collections;
@@ -28,6 +30,7 @@ import java.util.Set;
 import java.util.function.Predicate;
 
 import org.apache.asterix.algebra.base.ILangExpressionToPlanTranslator;
+import org.apache.asterix.common.annotations.ExternalSubpathAnnotation;
 import org.apache.asterix.common.exceptions.CompilationException;
 import org.apache.asterix.common.exceptions.ErrorCode;
 import org.apache.asterix.common.functions.FunctionSignature;
@@ -325,6 +328,10 @@ public class SqlppExpressionToPlanTranslator extends LangExpressionToPlanTransla
         } else {
             unnestOp = new UnnestOperator(fromVar, new MutableObject<>(pUnnestExpr.first));
         }
+        ExternalSubpathAnnotation hint = ((AbstractExpression) fromExpr).findHint(ExternalSubpathAnnotation.class);
+        if (hint != null) {
+            unnestOp.getAnnotations().put(SUBPATH, hint.getSubPath());
+        }
         unnestOp.getInputs().add(pUnnestExpr.second);
         unnestOp.setSourceLocation(sourceLoc);
 
@@ -576,6 +583,10 @@ public class SqlppExpressionToPlanTranslator extends LangExpressionToPlanTransla
                             outerUnnestMissingValue)
                     : new UnnestOperator(rightVar, new MutableObject<>(pUnnestExpr.first));
         }
+        ExternalSubpathAnnotation hint = ((AbstractExpression) rightExpr).findHint(ExternalSubpathAnnotation.class);
+        if (hint != null) {
+            unnestOp.getAnnotations().put(SUBPATH, hint.getSubPath());
+        }
         unnestOp.getInputs().add(pUnnestExpr.second);
         unnestOp.setSourceLocation(binaryCorrelate.getRightVariable().getSourceLocation());
         return new Pair<>(unnestOp, rightVar);
diff --git a/asterixdb/asterix-app/src/main/java/org/apache/asterix/app/function/DatasetRewriter.java b/asterixdb/asterix-app/src/main/java/org/apache/asterix/app/function/DatasetRewriter.java
index b1c2f0a956..68edc0ddf6 100644
--- a/asterixdb/asterix-app/src/main/java/org/apache/asterix/app/function/DatasetRewriter.java
+++ b/asterixdb/asterix-app/src/main/java/org/apache/asterix/app/function/DatasetRewriter.java
@@ -21,10 +21,14 @@ package org.apache.asterix.app.function;
 import static org.apache.asterix.common.api.IIdentifierMapper.Modifier.PLURAL;
 import static org.apache.asterix.common.api.IIdentifierMapper.Modifier.SINGULAR;
 import static org.apache.asterix.common.utils.IdentifierUtil.dataset;
+import static org.apache.asterix.external.util.ExternalDataConstants.SUBPATH;
 
+import java.io.Serializable;
 import java.util.ArrayList;
 import java.util.List;
+import java.util.Map;
 
+import org.apache.asterix.common.config.DatasetConfig;
 import org.apache.asterix.common.exceptions.CompilationException;
 import org.apache.asterix.common.exceptions.ErrorCode;
 import org.apache.asterix.common.metadata.DatasetFullyQualifiedName;
@@ -102,6 +106,15 @@ public class DatasetRewriter implements IFunctionToDataSourceRewriter, IResultTy
         }
         DataSourceScanOperator scan = new DataSourceScanOperator(variables, dataSource);
         scan.setSourceLocation(unnest.getSourceLocation());
+        if (dataset.getDatasetType() == DatasetConfig.DatasetType.EXTERNAL) {
+            Map<String, Object> unnestAnnotations = unnest.getAnnotations();
+            scan.getAnnotations().putAll(unnestAnnotations);
+            Map<String, Serializable> dataSourceProperties = dataSource.getProperties();
+            Object externalSubpath = unnestAnnotations.get(SUBPATH);
+            if (externalSubpath instanceof String) {
+                dataSourceProperties.put(SUBPATH, (String) externalSubpath);
+            }
+        }
         List<Mutable<ILogicalOperator>> scanInpList = scan.getInputs();
         scanInpList.addAll(unnest.getInputs());
         opRef.setValue(scan);
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/json/json/external_dataset.000.ddl.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/json/json/external_dataset.000.ddl.sqlpp
index f203da0101..1538bf9052 100644
--- a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/json/json/external_dataset.000.ddl.sqlpp
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/json/json/external_dataset.000.ddl.sqlpp
@@ -63,4 +63,27 @@ CREATE EXTERNAL DATASET test5(test) USING %adapter% (
 ("container"="playground"),
 ("definition"="json-data/single-line/json-array-of-objects"),
 ("format"="json")
+);
+
+drop dataset test6 if exists;
+CREATE EXTERNAL DATASET test6(test) USING %adapter% (
+%template%,
+("container"="playground"),
+("definition"="json-data/reviews"),
+("format"="json")
+);
+
+drop dataset test7 if exists;
+CREATE EXTERNAL DATASET test7(test) USING %adapter% (
+%template%,
+("container"="playground"),
+("format"="json")
+);
+
+drop dataset test8 if exists;
+CREATE EXTERNAL DATASET test8(test) USING %adapter% (
+%template%,
+("container"="playground"),
+("definition"="json-data"),
+("format"="json")
 );
\ No newline at end of file
diff --git a/hyracks-fullstack/algebricks/algebricks-core/src/main/java/org/apache/hyracks/algebricks/core/algebra/metadata/IDataSource.java b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/json/json/external_dataset.008.query.sqlpp
similarity index 52%
copy from hyracks-fullstack/algebricks/algebricks-core/src/main/java/org/apache/hyracks/algebricks/core/algebra/metadata/IDataSource.java
copy to asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/json/json/external_dataset.008.query.sqlpp
index e15b699b2b..1b265f1dce 100644
--- a/hyracks-fullstack/algebricks/algebricks-core/src/main/java/org/apache/hyracks/algebricks/core/algebra/metadata/IDataSource.java
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/json/json/external_dataset.008.query.sqlpp
@@ -16,25 +16,7 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-package org.apache.hyracks.algebricks.core.algebra.metadata;
 
-import java.util.List;
-
-import org.apache.hyracks.algebricks.core.algebra.base.LogicalVariable;
-import org.apache.hyracks.algebricks.core.algebra.properties.FunctionalDependency;
-import org.apache.hyracks.algebricks.core.algebra.properties.INodeDomain;
-
-public interface IDataSource<T> {
-    public T getId();
-
-    public Object[] getSchemaTypes();
-
-    public IDataSourcePropertiesProvider getPropertiesProvider();
-
-    public void computeFDs(List<LogicalVariable> scanVariables, List<FunctionalDependency> fdList);
-
-    // https://issues.apache.org/jira/browse/ASTERIXDB-1619
-    public boolean isScanAccessPathALeaf();
-
-    public INodeDomain getDomain();
-}
+use test;
+// testing subpath hint. the result should be = scanning test3
+select value test6 from /*+ subpath /multi-lines-with-arrays/json */ test6 order by id;
\ No newline at end of file
diff --git a/hyracks-fullstack/algebricks/algebricks-core/src/main/java/org/apache/hyracks/algebricks/core/algebra/metadata/IDataSource.java b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/json/json/external_dataset.009.query.sqlpp
similarity index 52%
copy from hyracks-fullstack/algebricks/algebricks-core/src/main/java/org/apache/hyracks/algebricks/core/algebra/metadata/IDataSource.java
copy to asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/json/json/external_dataset.009.query.sqlpp
index e15b699b2b..9356366ab7 100644
--- a/hyracks-fullstack/algebricks/algebricks-core/src/main/java/org/apache/hyracks/algebricks/core/algebra/metadata/IDataSource.java
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/json/json/external_dataset.009.query.sqlpp
@@ -16,25 +16,7 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-package org.apache.hyracks.algebricks.core.algebra.metadata;
 
-import java.util.List;
-
-import org.apache.hyracks.algebricks.core.algebra.base.LogicalVariable;
-import org.apache.hyracks.algebricks.core.algebra.properties.FunctionalDependency;
-import org.apache.hyracks.algebricks.core.algebra.properties.INodeDomain;
-
-public interface IDataSource<T> {
-    public T getId();
-
-    public Object[] getSchemaTypes();
-
-    public IDataSourcePropertiesProvider getPropertiesProvider();
-
-    public void computeFDs(List<LogicalVariable> scanVariables, List<FunctionalDependency> fdList);
-
-    // https://issues.apache.org/jira/browse/ASTERIXDB-1619
-    public boolean isScanAccessPathALeaf();
-
-    public INodeDomain getDomain();
-}
+use test;
+// testing subpath hint. the result should be = scanning test4
+select value test7 from /*+ subpath json-data/reviews/multi-lines-with-nested-objects/json */ test7 order by id;
\ No newline at end of file
diff --git a/hyracks-fullstack/algebricks/algebricks-core/src/main/java/org/apache/hyracks/algebricks/core/algebra/metadata/IDataSource.java b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/json/json/external_dataset.010.query.sqlpp
similarity index 52%
copy from hyracks-fullstack/algebricks/algebricks-core/src/main/java/org/apache/hyracks/algebricks/core/algebra/metadata/IDataSource.java
copy to asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/json/json/external_dataset.010.query.sqlpp
index e15b699b2b..8425714602 100644
--- a/hyracks-fullstack/algebricks/algebricks-core/src/main/java/org/apache/hyracks/algebricks/core/algebra/metadata/IDataSource.java
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/json/json/external_dataset.010.query.sqlpp
@@ -16,25 +16,7 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-package org.apache.hyracks.algebricks.core.algebra.metadata;
 
-import java.util.List;
-
-import org.apache.hyracks.algebricks.core.algebra.base.LogicalVariable;
-import org.apache.hyracks.algebricks.core.algebra.properties.FunctionalDependency;
-import org.apache.hyracks.algebricks.core.algebra.properties.INodeDomain;
-
-public interface IDataSource<T> {
-    public T getId();
-
-    public Object[] getSchemaTypes();
-
-    public IDataSourcePropertiesProvider getPropertiesProvider();
-
-    public void computeFDs(List<LogicalVariable> scanVariables, List<FunctionalDependency> fdList);
-
-    // https://issues.apache.org/jira/browse/ASTERIXDB-1619
-    public boolean isScanAccessPathALeaf();
-
-    public INodeDomain getDomain();
-}
+use test;
+// testing subpath hint. the result should be = scanning test5
+select value count(*) from /*+ subpath single-line/json-array-of-objects */ test8;
\ No newline at end of file
diff --git a/hyracks-fullstack/algebricks/algebricks-core/src/main/java/org/apache/hyracks/algebricks/core/algebra/metadata/IDataSource.java b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/json/json/external_dataset.011.query.sqlpp
similarity index 52%
copy from hyracks-fullstack/algebricks/algebricks-core/src/main/java/org/apache/hyracks/algebricks/core/algebra/metadata/IDataSource.java
copy to asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/json/json/external_dataset.011.query.sqlpp
index e15b699b2b..d0f08aa748 100644
--- a/hyracks-fullstack/algebricks/algebricks-core/src/main/java/org/apache/hyracks/algebricks/core/algebra/metadata/IDataSource.java
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/json/json/external_dataset.011.query.sqlpp
@@ -16,25 +16,9 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-package org.apache.hyracks.algebricks.core.algebra.metadata;
 
-import java.util.List;
-
-import org.apache.hyracks.algebricks.core.algebra.base.LogicalVariable;
-import org.apache.hyracks.algebricks.core.algebra.properties.FunctionalDependency;
-import org.apache.hyracks.algebricks.core.algebra.properties.INodeDomain;
-
-public interface IDataSource<T> {
-    public T getId();
-
-    public Object[] getSchemaTypes();
-
-    public IDataSourcePropertiesProvider getPropertiesProvider();
-
-    public void computeFDs(List<LogicalVariable> scanVariables, List<FunctionalDependency> fdList);
-
-    // https://issues.apache.org/jira/browse/ASTERIXDB-1619
-    public boolean isScanAccessPathALeaf();
-
-    public INodeDomain getDomain();
-}
+use test;
+// testing that test6 a and test6 b are two different data sources and hence no replicate should exist in the plan
+explain select count(a.quarter) as cnt
+from /*+ subpath /multi-lines/json */ test6 a
+join test6 b on a.quarter = b.quarter;
\ No newline at end of file
diff --git a/hyracks-fullstack/algebricks/algebricks-core/src/main/java/org/apache/hyracks/algebricks/core/algebra/metadata/IDataSource.java b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/json/json/external_dataset.012.query.sqlpp
similarity index 52%
copy from hyracks-fullstack/algebricks/algebricks-core/src/main/java/org/apache/hyracks/algebricks/core/algebra/metadata/IDataSource.java
copy to asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/json/json/external_dataset.012.query.sqlpp
index e15b699b2b..3a675ef117 100644
--- a/hyracks-fullstack/algebricks/algebricks-core/src/main/java/org/apache/hyracks/algebricks/core/algebra/metadata/IDataSource.java
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/json/json/external_dataset.012.query.sqlpp
@@ -16,25 +16,9 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-package org.apache.hyracks.algebricks.core.algebra.metadata;
 
-import java.util.List;
-
-import org.apache.hyracks.algebricks.core.algebra.base.LogicalVariable;
-import org.apache.hyracks.algebricks.core.algebra.properties.FunctionalDependency;
-import org.apache.hyracks.algebricks.core.algebra.properties.INodeDomain;
-
-public interface IDataSource<T> {
-    public T getId();
-
-    public Object[] getSchemaTypes();
-
-    public IDataSourcePropertiesProvider getPropertiesProvider();
-
-    public void computeFDs(List<LogicalVariable> scanVariables, List<FunctionalDependency> fdList);
-
-    // https://issues.apache.org/jira/browse/ASTERIXDB-1619
-    public boolean isScanAccessPathALeaf();
-
-    public INodeDomain getDomain();
-}
+use test;
+// testing that test6 a and test6 b are two different data sources and hence no replicate should exist in the plan
+select count(a.quarter) as cnt
+from /*+ subpath /multi-lines/json */ test6 a
+join test6 b on a.quarter = b.quarter;
\ No newline at end of file
diff --git a/hyracks-fullstack/algebricks/algebricks-core/src/main/java/org/apache/hyracks/algebricks/core/algebra/metadata/IDataSource.java b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/json/json/external_dataset.013.query.sqlpp
similarity index 52%
copy from hyracks-fullstack/algebricks/algebricks-core/src/main/java/org/apache/hyracks/algebricks/core/algebra/metadata/IDataSource.java
copy to asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/json/json/external_dataset.013.query.sqlpp
index e15b699b2b..e886adb898 100644
--- a/hyracks-fullstack/algebricks/algebricks-core/src/main/java/org/apache/hyracks/algebricks/core/algebra/metadata/IDataSource.java
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/json/json/external_dataset.013.query.sqlpp
@@ -16,25 +16,9 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-package org.apache.hyracks.algebricks.core.algebra.metadata;
 
-import java.util.List;
-
-import org.apache.hyracks.algebricks.core.algebra.base.LogicalVariable;
-import org.apache.hyracks.algebricks.core.algebra.properties.FunctionalDependency;
-import org.apache.hyracks.algebricks.core.algebra.properties.INodeDomain;
-
-public interface IDataSource<T> {
-    public T getId();
-
-    public Object[] getSchemaTypes();
-
-    public IDataSourcePropertiesProvider getPropertiesProvider();
-
-    public void computeFDs(List<LogicalVariable> scanVariables, List<FunctionalDependency> fdList);
-
-    // https://issues.apache.org/jira/browse/ASTERIXDB-1619
-    public boolean isScanAccessPathALeaf();
-
-    public INodeDomain getDomain();
-}
+use test;
+// testing that test6 a and test6 b are the same data sources and hence replicate should exist in the plan
+explain select count(a.quarter) as cnt
+from /*+ subpath /multi-lines/json */ test6 a
+join /*+ subpath /multi-lines/json */ test6 b on a.quarter = b.quarter;
\ No newline at end of file
diff --git a/hyracks-fullstack/algebricks/algebricks-core/src/main/java/org/apache/hyracks/algebricks/core/algebra/metadata/IDataSource.java b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/json/json/external_dataset.014.query.sqlpp
similarity index 52%
copy from hyracks-fullstack/algebricks/algebricks-core/src/main/java/org/apache/hyracks/algebricks/core/algebra/metadata/IDataSource.java
copy to asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/json/json/external_dataset.014.query.sqlpp
index e15b699b2b..5b39d4f239 100644
--- a/hyracks-fullstack/algebricks/algebricks-core/src/main/java/org/apache/hyracks/algebricks/core/algebra/metadata/IDataSource.java
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/json/json/external_dataset.014.query.sqlpp
@@ -16,25 +16,9 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-package org.apache.hyracks.algebricks.core.algebra.metadata;
 
-import java.util.List;
-
-import org.apache.hyracks.algebricks.core.algebra.base.LogicalVariable;
-import org.apache.hyracks.algebricks.core.algebra.properties.FunctionalDependency;
-import org.apache.hyracks.algebricks.core.algebra.properties.INodeDomain;
-
-public interface IDataSource<T> {
-    public T getId();
-
-    public Object[] getSchemaTypes();
-
-    public IDataSourcePropertiesProvider getPropertiesProvider();
-
-    public void computeFDs(List<LogicalVariable> scanVariables, List<FunctionalDependency> fdList);
-
-    // https://issues.apache.org/jira/browse/ASTERIXDB-1619
-    public boolean isScanAccessPathALeaf();
-
-    public INodeDomain getDomain();
-}
+use test;
+// testing that test6 a and test6 b are the same data sources and hence replicate should exist in the plan
+select count(a.quarter) as cnt
+from /*+ subpath /multi-lines/json */ test6 a
+join /*+ subpath /multi-lines/json */ test6 b on a.quarter = b.quarter;
\ No newline at end of file
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/results/external-dataset/common/json/json/external_dataset.008.adm b/asterixdb/asterix-app/src/test/resources/runtimets/results/external-dataset/common/json/json/external_dataset.008.adm
new file mode 100644
index 0000000000..7660e7e5e6
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/results/external-dataset/common/json/json/external_dataset.008.adm
@@ -0,0 +1,25 @@
+{ "id": 1, "year": null, "quarter": null, "review": "good", "array": [ 1, 2, 3 ] }
+{ "id": 1, "year": null, "quarter": null, "review": "good", "array": [ 1, 2, 3 ] }
+{ "id": 1, "year": null, "quarter": null, "review": "good", "array": [ 1, 2, 3 ] }
+{ "id": 1, "year": null, "quarter": null, "review": "good", "array": [ 1, 2, 3 ] }
+{ "id": 1, "year": null, "quarter": null, "review": "good", "array": [ 1, 2, 3 ] }
+{ "id": 2, "year": null, "quarter": null, "review": "good", "array": [ 1, [ 1, 2 ], [ 1 ] ] }
+{ "id": 2, "year": null, "quarter": null, "review": "good", "array": [ 1, [ 1, 2 ], [ 1 ] ] }
+{ "id": 2, "year": null, "quarter": null, "review": "good", "array": [ 1, [ 1, 2 ], [ 1 ] ] }
+{ "id": 2, "year": null, "quarter": null, "review": "good", "array": [ 1, [ 1, 2 ], [ 1 ] ] }
+{ "id": 2, "year": null, "quarter": null, "review": "good", "array": [ 1, [ 1, 2 ], [ 1 ] ] }
+{ "id": 3, "year": 2018, "quarter": null, "review": "good", "array": [ 1, 2, 3 ] }
+{ "id": 3, "year": 2018, "quarter": null, "review": "good", "array": [ 1, 2, 3 ] }
+{ "id": 3, "year": 2018, "quarter": null, "review": "good", "array": [ 1, 2, 3 ] }
+{ "id": 3, "year": 2018, "quarter": null, "review": "good", "array": [ 1, 2, 3 ] }
+{ "id": 3, "year": 2018, "quarter": null, "review": "good", "array": [ 1, 2, 3 ] }
+{ "id": 4, "year": 2018, "quarter": null, "review": "bad", "array": [ 1, 2, 3 ] }
+{ "id": 4, "year": 2018, "quarter": null, "review": "bad", "array": [ 1, 2, 3 ] }
+{ "id": 4, "year": 2018, "quarter": null, "review": "bad", "array": [ 1, 2, 3 ] }
+{ "id": 4, "year": 2018, "quarter": null, "review": "bad", "array": [ 1, 2, 3 ] }
+{ "id": 4, "year": 2018, "quarter": null, "review": "bad", "array": [ 1, 2, 3 ] }
+{ "id": 5, "year": 2018, "quarter": 1, "review": "good", "array": [ 1, 2, 3 ] }
+{ "id": 5, "year": 2018, "quarter": 1, "review": "good", "array": [ 1, 2, 3 ] }
+{ "id": 5, "year": 2018, "quarter": 1, "review": "good", "array": [ 1, 2, 3 ] }
+{ "id": 5, "year": 2018, "quarter": 1, "review": "good", "array": [ 1, 2, 3 ] }
+{ "id": 5, "year": 2018, "quarter": 1, "review": "good", "array": [ 1, 2, 3 ] }
\ No newline at end of file
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/results/external-dataset/common/json/json/external_dataset.009.adm b/asterixdb/asterix-app/src/test/resources/runtimets/results/external-dataset/common/json/json/external_dataset.009.adm
new file mode 100644
index 0000000000..764398637a
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/results/external-dataset/common/json/json/external_dataset.009.adm
@@ -0,0 +1,25 @@
+{ "id": 1, "year": null, "quarter": null, "review": "good", "array": [ 1, 2, 3 ], "nested": { "id": 1 } }
+{ "id": 1, "year": null, "quarter": null, "review": "good", "array": [ 1, 2, 3 ], "nested": { "id": 1 } }
+{ "id": 1, "year": null, "quarter": null, "review": "good", "array": [ 1, 2, 3 ], "nested": { "id": 1 } }
+{ "id": 1, "year": null, "quarter": null, "review": "good", "array": [ 1, 2, 3 ], "nested": { "id": 1 } }
+{ "id": 1, "year": null, "quarter": null, "review": "good", "array": [ 1, 2, 3 ], "nested": { "id": 1 } }
+{ "id": 2, "year": null, "quarter": null, "review": "good", "array": [ 1, [ 1, 2 ], [ 1 ] ], "nested": { "id": 1 }, "nested2": [ { "id": 1 } ] }
+{ "id": 2, "year": null, "quarter": null, "review": "good", "array": [ 1, [ 1, 2 ], [ 1 ] ], "nested": { "id": 1 }, "nested2": [ { "id": 1 } ] }
+{ "id": 2, "year": null, "quarter": null, "review": "good", "array": [ 1, [ 1, 2 ], [ 1 ] ], "nested": { "id": 1 }, "nested2": [ { "id": 1 } ] }
+{ "id": 2, "year": null, "quarter": null, "review": "good", "array": [ 1, [ 1, 2 ], [ 1 ] ], "nested": { "id": 1 }, "nested2": [ { "id": 1 } ] }
+{ "id": 2, "year": null, "quarter": null, "review": "good", "array": [ 1, [ 1, 2 ], [ 1 ] ], "nested": { "id": 1 }, "nested2": [ { "id": 1 } ] }
+{ "id": 3, "year": 2018, "quarter": null, "review": "good", "array": [ 1, 2, 3, { "nested": { "array": [ { "nested": { "array": [ 1, 2 ] } } ] } } ] }
+{ "id": 3, "year": 2018, "quarter": null, "review": "good", "array": [ 1, 2, 3, { "nested": { "array": [ { "nested": { "array": [ 1, 2 ] } } ] } } ] }
+{ "id": 3, "year": 2018, "quarter": null, "review": "good", "array": [ 1, 2, 3, { "nested": { "array": [ { "nested": { "array": [ 1, 2 ] } } ] } } ] }
+{ "id": 3, "year": 2018, "quarter": null, "review": "good", "array": [ 1, 2, 3, { "nested": { "array": [ { "nested": { "array": [ 1, 2 ] } } ] } } ] }
+{ "id": 3, "year": 2018, "quarter": null, "review": "good", "array": [ 1, 2, 3, { "nested": { "array": [ { "nested": { "array": [ 1, 2 ] } } ] } } ] }
+{ "id": 4, "year": 2018, "quarter": null, "review": "bad", "array": [ 1, 2, 3, { "nested1": { "id": 1, "nested2": { "id": 2, "nested3": [ { "nested4": null } ] } } } ] }
+{ "id": 4, "year": 2018, "quarter": null, "review": "bad", "array": [ 1, 2, 3, { "nested1": { "id": 1, "nested2": { "id": 2, "nested3": [ { "nested4": null } ] } } } ] }
+{ "id": 4, "year": 2018, "quarter": null, "review": "bad", "array": [ 1, 2, 3, { "nested1": { "id": 1, "nested2": { "id": 2, "nested3": [ { "nested4": null } ] } } } ] }
+{ "id": 4, "year": 2018, "quarter": null, "review": "bad", "array": [ 1, 2, 3, { "nested1": { "id": 1, "nested2": { "id": 2, "nested3": [ { "nested4": null } ] } } } ] }
+{ "id": 4, "year": 2018, "quarter": null, "review": "bad", "array": [ 1, 2, 3, { "nested1": { "id": 1, "nested2": { "id": 2, "nested3": [ { "nested4": null } ] } } } ] }
+{ "id": 5, "year": 2018, "quarter": 1, "review": "good", "array": [ 1, 2, 3, { "nested": { "array": [ 1, 2 ] } } ] }
+{ "id": 5, "year": 2018, "quarter": 1, "review": "good", "array": [ 1, 2, 3, { "nested": { "array": [ 1, 2 ] } } ] }
+{ "id": 5, "year": 2018, "quarter": 1, "review": "good", "array": [ 1, 2, 3, { "nested": { "array": [ 1, 2 ] } } ] }
+{ "id": 5, "year": 2018, "quarter": 1, "review": "good", "array": [ 1, 2, 3, { "nested": { "array": [ 1, 2 ] } } ] }
+{ "id": 5, "year": 2018, "quarter": 1, "review": "good", "array": [ 1, 2, 3, { "nested": { "array": [ 1, 2 ] } } ] }
\ No newline at end of file
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/results/external-dataset/common/json/json/external_dataset.010.adm b/asterixdb/asterix-app/src/test/resources/runtimets/results/external-dataset/common/json/json/external_dataset.010.adm
new file mode 100644
index 0000000000..86babba1b3
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/results/external-dataset/common/json/json/external_dataset.010.adm
@@ -0,0 +1 @@
+50128
\ No newline at end of file
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/results/external-dataset/common/json/json/external_dataset.011.plan b/asterixdb/asterix-app/src/test/resources/runtimets/results/external-dataset/common/json/json/external_dataset.011.plan
new file mode 100644
index 0000000000..6ec7195319
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/results/external-dataset/common/json/json/external_dataset.011.plan
@@ -0,0 +1,48 @@
+distribute result [$$48] [cardinality: 1000000.0, op-cost: 0.0, total-cost: 6000000.0]
+-- DISTRIBUTE_RESULT  |UNPARTITIONED|
+  exchange [cardinality: 1000000.0, op-cost: 0.0, total-cost: 6000000.0]
+  -- ONE_TO_ONE_EXCHANGE  |UNPARTITIONED|
+    project ([$$48]) [cardinality: 1000000.0, op-cost: 0.0, total-cost: 6000000.0]
+    -- STREAM_PROJECT  |UNPARTITIONED|
+      assign [$$48] <- [{"cnt": $$51}] [cardinality: 1000000.0, op-cost: 0.0, total-cost: 6000000.0]
+      -- ASSIGN  |UNPARTITIONED|
+        aggregate [$$51] <- [agg-sql-sum($$53)] [cardinality: 1000000.0, op-cost: 0.0, total-cost: 6000000.0]
+        -- AGGREGATE  |UNPARTITIONED|
+          exchange [cardinality: 1000000.0, op-cost: 0.0, total-cost: 6000000.0]
+          -- RANDOM_MERGE_EXCHANGE  |PARTITIONED|
+            aggregate [$$53] <- [agg-sql-count($$49)] [cardinality: 1000000.0, op-cost: 0.0, total-cost: 6000000.0]
+            -- AGGREGATE  |PARTITIONED|
+              project ([$$49]) [cardinality: 1000000.0, op-cost: 0.0, total-cost: 6000000.0]
+              -- STREAM_PROJECT  |PARTITIONED|
+                exchange [cardinality: 1000000.0, op-cost: 0.0, total-cost: 6000000.0]
+                -- ONE_TO_ONE_EXCHANGE  |PARTITIONED|
+                  join (eq($$49, $$50)) [cardinality: 1000000.0, op-cost: 2000000.0, total-cost: 6000000.0]
+                  -- HYBRID_HASH_JOIN [$$49][$$50]  |PARTITIONED|
+                    exchange [cardinality: 1000000.0, op-cost: 1000000.0, total-cost: 2000000.0]
+                    -- HASH_PARTITION_EXCHANGE [$$49]  |PARTITIONED|
+                      project ([$$49]) [cardinality: 1000000.0, op-cost: 0.0, total-cost: 1000000.0]
+                      -- STREAM_PROJECT  |PARTITIONED|
+                        assign [$$49] <- [$$a.getField("quarter")] [cardinality: 1000000.0, op-cost: 0.0, total-cost: 1000000.0]
+                        -- ASSIGN  |PARTITIONED|
+                          exchange [cardinality: 1000000.0, op-cost: 1000000.0, total-cost: 2000000.0]
+                          -- ONE_TO_ONE_EXCHANGE  |PARTITIONED|
+                            data-scan []<-[$$a] <- test.test6 [cardinality: 1000000.0, op-cost: 1000000.0, total-cost: 1000000.0]
+                            -- DATASOURCE_SCAN  |PARTITIONED|
+                              exchange [cardinality: 0.0, op-cost: 0.0, total-cost: 0.0]
+                              -- ONE_TO_ONE_EXCHANGE  |PARTITIONED|
+                                empty-tuple-source [cardinality: 0.0, op-cost: 0.0, total-cost: 0.0]
+                                -- EMPTY_TUPLE_SOURCE  |PARTITIONED|
+                    exchange [cardinality: 1000000.0, op-cost: 1000000.0, total-cost: 2000000.0]
+                    -- HASH_PARTITION_EXCHANGE [$$50]  |PARTITIONED|
+                      project ([$$50]) [cardinality: 1000000.0, op-cost: 0.0, total-cost: 1000000.0]
+                      -- STREAM_PROJECT  |PARTITIONED|
+                        assign [$$50] <- [$$b.getField("quarter")] [cardinality: 1000000.0, op-cost: 0.0, total-cost: 1000000.0]
+                        -- ASSIGN  |PARTITIONED|
+                          exchange [cardinality: 1000000.0, op-cost: 1000000.0, total-cost: 2000000.0]
+                          -- ONE_TO_ONE_EXCHANGE  |PARTITIONED|
+                            data-scan []<-[$$b] <- test.test6 [cardinality: 1000000.0, op-cost: 1000000.0, total-cost: 1000000.0]
+                            -- DATASOURCE_SCAN  |PARTITIONED|
+                              exchange [cardinality: 0.0, op-cost: 0.0, total-cost: 0.0]
+                              -- ONE_TO_ONE_EXCHANGE  |PARTITIONED|
+                                empty-tuple-source [cardinality: 0.0, op-cost: 0.0, total-cost: 0.0]
+                                -- EMPTY_TUPLE_SOURCE  |PARTITIONED|
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/results/external-dataset/common/json/json/external_dataset.012.adm b/asterixdb/asterix-app/src/test/resources/runtimets/results/external-dataset/common/json/json/external_dataset.012.adm
new file mode 100644
index 0000000000..6cd35abbe2
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/results/external-dataset/common/json/json/external_dataset.012.adm
@@ -0,0 +1 @@
+{ "cnt": 15600 }
\ No newline at end of file
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/results/external-dataset/common/json/json/external_dataset.013.plan b/asterixdb/asterix-app/src/test/resources/runtimets/results/external-dataset/common/json/json/external_dataset.013.plan
new file mode 100644
index 0000000000..22d5bd44f5
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/results/external-dataset/common/json/json/external_dataset.013.plan
@@ -0,0 +1,62 @@
+distribute result [$$47] [cardinality: 1000000.0, op-cost: 0.0, total-cost: 6000000.0]
+-- DISTRIBUTE_RESULT  |UNPARTITIONED|
+  exchange [cardinality: 1000000.0, op-cost: 0.0, total-cost: 6000000.0]
+  -- ONE_TO_ONE_EXCHANGE  |UNPARTITIONED|
+    project ([$$47]) [cardinality: 1000000.0, op-cost: 0.0, total-cost: 6000000.0]
+    -- STREAM_PROJECT  |UNPARTITIONED|
+      assign [$$47] <- [{"cnt": $$50}] [cardinality: 1000000.0, op-cost: 0.0, total-cost: 6000000.0]
+      -- ASSIGN  |UNPARTITIONED|
+        aggregate [$$50] <- [agg-sql-sum($$52)] [cardinality: 1000000.0, op-cost: 0.0, total-cost: 6000000.0]
+        -- AGGREGATE  |UNPARTITIONED|
+          exchange [cardinality: 1000000.0, op-cost: 0.0, total-cost: 6000000.0]
+          -- RANDOM_MERGE_EXCHANGE  |PARTITIONED|
+            aggregate [$$52] <- [agg-sql-count($$48)] [cardinality: 1000000.0, op-cost: 0.0, total-cost: 6000000.0]
+            -- AGGREGATE  |PARTITIONED|
+              project ([$$48]) [cardinality: 1000000.0, op-cost: 0.0, total-cost: 6000000.0]
+              -- STREAM_PROJECT  |PARTITIONED|
+                exchange [cardinality: 1000000.0, op-cost: 0.0, total-cost: 6000000.0]
+                -- ONE_TO_ONE_EXCHANGE  |PARTITIONED|
+                  join (eq($$48, $$49)) [cardinality: 1000000.0, op-cost: 2000000.0, total-cost: 6000000.0]
+                  -- HYBRID_HASH_JOIN [$$48][$$49]  |PARTITIONED|
+                    exchange [cardinality: 1000000.0, op-cost: 1000000.0, total-cost: 2000000.0]
+                    -- ONE_TO_ONE_EXCHANGE  |PARTITIONED|
+                      project ([$$48]) [cardinality: 1000000.0, op-cost: 0.0, total-cost: 1000000.0]
+                      -- STREAM_PROJECT  |PARTITIONED|
+                        assign [$$48] <- [$$49] [cardinality: 1000000.0, op-cost: 0.0, total-cost: 1000000.0]
+                        -- ASSIGN  |PARTITIONED|
+                          exchange [cardinality: 1000000.0, op-cost: 1000000.0, total-cost: 2000000.0]
+                          -- ONE_TO_ONE_EXCHANGE  |PARTITIONED|
+                            replicate [cardinality: 1000000.0, op-cost: 0.0, total-cost: 1000000.0]
+                            -- REPLICATE  |PARTITIONED|
+                              exchange [cardinality: 1000000.0, op-cost: 1000000.0, total-cost: 2000000.0]
+                              -- HASH_PARTITION_EXCHANGE [$$49]  |PARTITIONED|
+                                project ([$$49]) [cardinality: 1000000.0, op-cost: 0.0, total-cost: 1000000.0]
+                                -- STREAM_PROJECT  |PARTITIONED|
+                                  assign [$$49] <- [$$b.getField("quarter")] [cardinality: 1000000.0, op-cost: 0.0, total-cost: 1000000.0]
+                                  -- ASSIGN  |PARTITIONED|
+                                    exchange [cardinality: 1000000.0, op-cost: 1000000.0, total-cost: 2000000.0]
+                                    -- ONE_TO_ONE_EXCHANGE  |PARTITIONED|
+                                      data-scan []<-[$$b] <- test.test6 [cardinality: 1000000.0, op-cost: 1000000.0, total-cost: 1000000.0]
+                                      -- DATASOURCE_SCAN  |PARTITIONED|
+                                        exchange [cardinality: 0.0, op-cost: 0.0, total-cost: 0.0]
+                                        -- ONE_TO_ONE_EXCHANGE  |PARTITIONED|
+                                          empty-tuple-source [cardinality: 0.0, op-cost: 0.0, total-cost: 0.0]
+                                          -- EMPTY_TUPLE_SOURCE  |PARTITIONED|
+                    exchange [cardinality: 1000000.0, op-cost: 1000000.0, total-cost: 2000000.0]
+                    -- ONE_TO_ONE_EXCHANGE  |PARTITIONED|
+                      replicate [cardinality: 1000000.0, op-cost: 0.0, total-cost: 1000000.0]
+                      -- REPLICATE  |PARTITIONED|
+                        exchange [cardinality: 1000000.0, op-cost: 1000000.0, total-cost: 2000000.0]
+                        -- HASH_PARTITION_EXCHANGE [$$49]  |PARTITIONED|
+                          project ([$$49]) [cardinality: 1000000.0, op-cost: 0.0, total-cost: 1000000.0]
+                          -- STREAM_PROJECT  |PARTITIONED|
+                            assign [$$49] <- [$$b.getField("quarter")] [cardinality: 1000000.0, op-cost: 0.0, total-cost: 1000000.0]
+                            -- ASSIGN  |PARTITIONED|
+                              exchange [cardinality: 1000000.0, op-cost: 1000000.0, total-cost: 2000000.0]
+                              -- ONE_TO_ONE_EXCHANGE  |PARTITIONED|
+                                data-scan []<-[$$b] <- test.test6 [cardinality: 1000000.0, op-cost: 1000000.0, total-cost: 1000000.0]
+                                -- DATASOURCE_SCAN  |PARTITIONED|
+                                  exchange [cardinality: 0.0, op-cost: 0.0, total-cost: 0.0]
+                                  -- ONE_TO_ONE_EXCHANGE  |PARTITIONED|
+                                    empty-tuple-source [cardinality: 0.0, op-cost: 0.0, total-cost: 0.0]
+                                    -- EMPTY_TUPLE_SOURCE  |PARTITIONED|
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/results/external-dataset/common/json/json/external_dataset.014.adm b/asterixdb/asterix-app/src/test/resources/runtimets/results/external-dataset/common/json/json/external_dataset.014.adm
new file mode 100644
index 0000000000..8006c78129
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/results/external-dataset/common/json/json/external_dataset.014.adm
@@ -0,0 +1 @@
+{ "cnt": 1800 }
\ No newline at end of file
diff --git a/hyracks-fullstack/algebricks/algebricks-core/src/main/java/org/apache/hyracks/algebricks/core/algebra/metadata/IDataSource.java b/asterixdb/asterix-common/src/main/java/org/apache/asterix/common/annotations/ExternalSubpathAnnotation.java
similarity index 53%
copy from hyracks-fullstack/algebricks/algebricks-core/src/main/java/org/apache/hyracks/algebricks/core/algebra/metadata/IDataSource.java
copy to asterixdb/asterix-common/src/main/java/org/apache/asterix/common/annotations/ExternalSubpathAnnotation.java
index e15b699b2b..e3e5ea3d30 100644
--- a/hyracks-fullstack/algebricks/algebricks-core/src/main/java/org/apache/hyracks/algebricks/core/algebra/metadata/IDataSource.java
+++ b/asterixdb/asterix-common/src/main/java/org/apache/asterix/common/annotations/ExternalSubpathAnnotation.java
@@ -16,25 +16,31 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-package org.apache.hyracks.algebricks.core.algebra.metadata;
+package org.apache.asterix.common.annotations;
 
-import java.util.List;
+import org.apache.hyracks.algebricks.core.algebra.expressions.IExpressionAnnotation;
 
-import org.apache.hyracks.algebricks.core.algebra.base.LogicalVariable;
-import org.apache.hyracks.algebricks.core.algebra.properties.FunctionalDependency;
-import org.apache.hyracks.algebricks.core.algebra.properties.INodeDomain;
+/**
+ * Expression annotation that carries the sub-path text given in a {@code subpath} query hint.
+ */
+public final class ExternalSubpathAnnotation implements IExpressionAnnotation {
 
-public interface IDataSource<T> {
-    public T getId();
+    /** Sub-path as normalized by the constructor; never {@code null}. */
+    private final String subPath;
 
-    public Object[] getSchemaTypes();
+    /**
+     * @param subPath raw hint parameter; {@code null} is stored as the empty string, any other value is trimmed
+     */
+    public ExternalSubpathAnnotation(String subPath) {
+        if (subPath == null) {
+            this.subPath = "";
+        } else {
+            this.subPath = subPath.trim();
+        }
+    }
 
-    public IDataSourcePropertiesProvider getPropertiesProvider();
-
-    public void computeFDs(List<LogicalVariable> scanVariables, List<FunctionalDependency> fdList);
-
-    // https://issues.apache.org/jira/browse/ASTERIXDB-1619
-    public boolean isScanAccessPathALeaf();
-
-    public INodeDomain getDomain();
+    /** @return the normalized sub-path (possibly empty) */
+    public String getSubPath() {
+        return subPath;
+    }
 }
diff --git a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataConstants.java b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataConstants.java
index 2097b4bcff..050a080622 100644
--- a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataConstants.java
+++ b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataConstants.java
@@ -297,6 +297,7 @@ public class ExternalDataConstants {
 
     public static final String DEFINITION_FIELD_NAME = "definition";
     public static final String CONTAINER_NAME_FIELD_NAME = "container";
+    public static final String SUBPATH = "subpath";
 
     public static class ParquetOptions {
         private ParquetOptions() {
diff --git a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataUtils.java b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataUtils.java
index ba07629b05..29e04e9904 100644
--- a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataUtils.java
+++ b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataUtils.java
@@ -738,10 +738,41 @@ public class ExternalDataUtils {
 
+    /**
+     * Builds the path prefix from the {@code definition} and {@code subpath} entries of the configuration.
+     * When both are present they are concatenated, adding a "/" between them when neither side supplies one
+     * and dropping one when both do. When only the sub-path is present, one leading "/" is removed from it.
+     *
+     * @param configuration external source configuration
+     * @param appendSlash whether a non-empty result must end with "/"
+     * @return the prefix, or the empty string when neither entry yields a path
+     */
     public static String getPrefix(Map<String, String> configuration, boolean appendSlash) {
         String definition = configuration.get(ExternalDataConstants.DEFINITION_FIELD_NAME);
-        if (definition != null && !definition.isEmpty()) {
+        String subPath = configuration.get(ExternalDataConstants.SUBPATH);
+        boolean hasDefinition = definition != null && !definition.isEmpty();
+        boolean hasSubPath = subPath != null && !subPath.isEmpty();
+        if (hasDefinition && !hasSubPath) {
             return appendSlash ? definition + (!definition.endsWith("/") ? "/" : "") : definition;
         }
-        return "";
+        String fullPath = "";
+        if (hasSubPath) {
+            if (!hasDefinition) {
+                fullPath = subPath.startsWith("/") ? subPath.substring(1) : subPath;
+            } else {
+                // concatenate definition + subPath:
+                if (definition.endsWith("/") && subPath.startsWith("/")) {
+                    subPath = subPath.substring(1);
+                } else if (!definition.endsWith("/") && !subPath.startsWith("/")) {
+                    definition = definition + "/";
+                }
+                fullPath = definition + subPath;
+            }
+            // a sub-path of just "/" with no definition leaves an empty path; keep it empty (like the
+            // no-definition, no-sub-path case) instead of turning it into "/"
+            if (appendSlash && !fullPath.isEmpty() && !fullPath.endsWith("/")) {
+                fullPath = fullPath + "/";
+            }
+        }
+        return fullPath;
     }
 
     /**
diff --git a/asterixdb/asterix-lang-sqlpp/src/main/java/org/apache/asterix/lang/sqlpp/parser/SqlppHint.java b/asterixdb/asterix-lang-sqlpp/src/main/java/org/apache/asterix/lang/sqlpp/parser/SqlppHint.java
index f565fcd494..da3504d448 100644
--- a/asterixdb/asterix-lang-sqlpp/src/main/java/org/apache/asterix/lang/sqlpp/parser/SqlppHint.java
+++ b/asterixdb/asterix-lang-sqlpp/src/main/java/org/apache/asterix/lang/sqlpp/parser/SqlppHint.java
@@ -49,6 +49,7 @@ public enum SqlppHint {
     GEN_FIELDS_HINT("gen-fields"),
     SINGLE_DATASET_PREDICATE_SELECTIVITY_HINT("selectivity"),
     JOIN_PREDICATE_PRODUCTIVITY_HINT("productivity"),
+    SUBPATH_HINT("subpath"),
 
     // data generator hints
     DGEN_HINT("dgen");
diff --git a/asterixdb/asterix-lang-sqlpp/src/main/java/org/apache/asterix/lang/sqlpp/rewrites/visitor/VariableCheckAndRewriteVisitor.java b/asterixdb/asterix-lang-sqlpp/src/main/java/org/apache/asterix/lang/sqlpp/rewrites/visitor/VariableCheckAndRewriteVisitor.java
index 4b0caca142..8fbad4fba3 100644
--- a/asterixdb/asterix-lang-sqlpp/src/main/java/org/apache/asterix/lang/sqlpp/rewrites/visitor/VariableCheckAndRewriteVisitor.java
+++ b/asterixdb/asterix-lang-sqlpp/src/main/java/org/apache/asterix/lang/sqlpp/rewrites/visitor/VariableCheckAndRewriteVisitor.java
@@ -203,6 +203,7 @@ public class VariableCheckAndRewriteVisitor extends AbstractSqlppExpressionScopi
             argList.add(new LiteralExpr(new StringLiteral(datasetName)));
         }
         CallExpr callExpr = new CallExpr(new FunctionSignature(BuiltinFunctions.DATASET), argList);
+        callExpr.addHints(varExpr.getHints());
         callExpr.setSourceLocation(sourceLoc);
         return callExpr;
     }
diff --git a/asterixdb/asterix-lang-sqlpp/src/main/javacc/SQLPP.jj b/asterixdb/asterix-lang-sqlpp/src/main/javacc/SQLPP.jj
index 759fe9043c..e03a3e31bb 100644
--- a/asterixdb/asterix-lang-sqlpp/src/main/javacc/SQLPP.jj
+++ b/asterixdb/asterix-lang-sqlpp/src/main/javacc/SQLPP.jj
@@ -52,6 +52,7 @@ import org.apache.asterix.common.annotations.AutoDataGen;
 import org.apache.asterix.common.annotations.DateBetweenYearsDataGen;
 import org.apache.asterix.common.annotations.DatetimeAddRandHoursDataGen;
 import org.apache.asterix.common.annotations.DatetimeBetweenYearsDataGen;
+import org.apache.asterix.common.annotations.ExternalSubpathAnnotation;
 import org.apache.asterix.common.annotations.FieldIntervalDataGen;
 import org.apache.asterix.common.annotations.FieldValFileDataGen;
 import org.apache.asterix.common.annotations.FieldValFileSameIndexDataGen;
@@ -4982,7 +4983,16 @@ FromTerm FromTerm() throws ParseException :
   List<AbstractBinaryCorrelateClause> correlateClauses = new ArrayList<AbstractBinaryCorrelateClause>();
 }
 {
-  leftExpr = Expression() ((<AS>)? leftVar = Variable())? (<AT> posVar = Variable())?
+  leftExpr = Expression()
+  {
+    if (leftExpr.getKind() == Expression.Kind.VARIABLE_EXPRESSION) {
+      Token hintToken = fetchHint(token, SqlppHint.SUBPATH_HINT);
+      if (hintToken != null) {
+        String subPath = hintToken.hintParams;
+        ((VariableExpr) leftExpr).addHint(new ExternalSubpathAnnotation(subPath));
+      }
+    }
+  } ((<AS>)? leftVar = Variable())? (<AT> posVar = Variable())?
   (
      (
       correlateClause = JoinOrUnnestClause(JoinType.INNER, UnnestType.INNER)
@@ -5053,7 +5063,16 @@ Triple<Expression, VariableExpr, VariableExpr> JoinClauseRightInput() throws Par
     VariableExpr posVar = null;
 }
 {
-  rightExpr = Expression() ((<AS>)? rightVar = Variable())? (<AT> posVar = Variable())?
+  rightExpr = Expression()
+  {
+    if (rightExpr.getKind() == Expression.Kind.VARIABLE_EXPRESSION) {
+        Token hintToken = fetchHint(token, SqlppHint.SUBPATH_HINT);
+        if (hintToken != null) {
+          String subPath = hintToken.hintParams;
+          ((VariableExpr) rightExpr).addHint(new ExternalSubpathAnnotation(subPath));
+        }
+    }
+  } ((<AS>)? rightVar = Variable())? (<AT> posVar = Variable())?
   {
     if (rightVar == null) {
       rightVar = ExpressionToVariableUtil.getGeneratedVariable(rightExpr, true);
diff --git a/asterixdb/asterix-metadata/src/main/java/org/apache/asterix/metadata/declared/DatasetDataSource.java b/asterixdb/asterix-metadata/src/main/java/org/apache/asterix/metadata/declared/DatasetDataSource.java
index 661b954813..66ea5a7313 100644
--- a/asterixdb/asterix-metadata/src/main/java/org/apache/asterix/metadata/declared/DatasetDataSource.java
+++ b/asterixdb/asterix-metadata/src/main/java/org/apache/asterix/metadata/declared/DatasetDataSource.java
@@ -19,8 +19,10 @@
 package org.apache.asterix.metadata.declared;
 
 import static org.apache.asterix.external.util.ExternalDataConstants.KEY_EXTERNAL_SCAN_BUFFER_SIZE;
+import static org.apache.asterix.external.util.ExternalDataConstants.SUBPATH;
 
 import java.io.IOException;
+import java.io.Serializable;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
@@ -120,7 +122,8 @@ public class DatasetDataSource extends DataSource {
             IProjectionInfo<?> projectionInfo) throws AlgebricksException {
         switch (dataset.getDatasetType()) {
             case EXTERNAL:
-                Dataset externalDataset = ((DatasetDataSource) dataSource).getDataset();
+                DatasetDataSource externalDataSource = (DatasetDataSource) dataSource;
+                Dataset externalDataset = externalDataSource.getDataset();
                 String itemTypeName = externalDataset.getItemTypeName();
                 IAType itemType = MetadataManager.INSTANCE.getDatatype(metadataProvider.getMetadataTxnContext(),
                         externalDataset.getItemTypeDataverseName(), itemTypeName).getDatatype();
@@ -129,6 +132,7 @@ public class DatasetDataSource extends DataSource {
                 PhysicalOptimizationConfig physicalOptimizationConfig = context.getPhysicalOptimizationConfig();
                 int externalScanBufferSize = physicalOptimizationConfig.getExternalScanBufferSize();
                 Map<String, String> properties = addExternalProjectionInfo(projectionInfo, edd.getProperties());
+                properties = addSubPath(externalDataSource.getProperties(), properties);
                 properties.put(KEY_EXTERNAL_SCAN_BUFFER_SIZE, String.valueOf(externalScanBufferSize));
                 ITypedAdapterFactory adapterFactory = metadataProvider.getConfiguredAdapterFactory(externalDataset,
                         edd.getAdapter(), properties, (ARecordType) itemType, null, context.getWarningCollector());
@@ -168,6 +172,20 @@ public class DatasetDataSource extends DataSource {
         return propertiesCopy;
     }
 
+    /**
+     * Returns {@code properties} extended with the {@code subpath} entry of {@code dataSourceProps}. The input
+     * map is returned as-is when that entry is not a String; otherwise a copy holding the entry is returned.
+     */
+    private Map<String, String> addSubPath(Map<String, Serializable> dataSourceProps, Map<String, String> properties) {
+        Serializable subPath = dataSourceProps.get(SUBPATH);
+        if (subPath instanceof String) {
+            Map<String, String> withSubPath = new HashMap<>(properties);
+            withSubPath.put(SUBPATH, (String) subPath);
+            return withSubPath;
+        }
+        return properties;
+    }
+
     private int[] createFilterIndexes(List<LogicalVariable> filterVars, IOperatorSchema opSchema) {
         if (filterVars != null && !filterVars.isEmpty()) {
             final int size = filterVars.size();
@@ -185,4 +203,11 @@ public class DatasetDataSource extends DataSource {
         return dataset.getDatasetType() == DatasetType.EXTERNAL;
     }
 
+    /**
+     * Reports {@code true} only when the underlying dataset is of type EXTERNAL.
+     */
+    @Override
+    public boolean compareProperties() {
+        return DatasetType.EXTERNAL == dataset.getDatasetType();
+    }
 }
diff --git a/hyracks-fullstack/algebricks/algebricks-core/src/main/java/org/apache/hyracks/algebricks/core/algebra/metadata/IDataSource.java b/hyracks-fullstack/algebricks/algebricks-core/src/main/java/org/apache/hyracks/algebricks/core/algebra/metadata/IDataSource.java
index e15b699b2b..8b4e56d3fc 100644
--- a/hyracks-fullstack/algebricks/algebricks-core/src/main/java/org/apache/hyracks/algebricks/core/algebra/metadata/IDataSource.java
+++ b/hyracks-fullstack/algebricks/algebricks-core/src/main/java/org/apache/hyracks/algebricks/core/algebra/metadata/IDataSource.java
@@ -18,7 +18,9 @@
  */
 package org.apache.hyracks.algebricks.core.algebra.metadata;
 
+import java.io.Serializable;
 import java.util.List;
+import java.util.Map;
 
 import org.apache.hyracks.algebricks.core.algebra.base.LogicalVariable;
 import org.apache.hyracks.algebricks.core.algebra.properties.FunctionalDependency;
@@ -37,4 +39,10 @@ public interface IDataSource<T> {
     public boolean isScanAccessPathALeaf();
 
     public INodeDomain getDomain();
+
+    public Map<String, Serializable> getProperties();
+
+    default boolean compareProperties() {
+        return false;
+    }
 }
diff --git a/hyracks-fullstack/algebricks/algebricks-core/src/main/java/org/apache/hyracks/algebricks/core/algebra/operators/logical/visitors/IsomorphismOperatorVisitor.java b/hyracks-fullstack/algebricks/algebricks-core/src/main/java/org/apache/hyracks/algebricks/core/algebra/operators/logical/visitors/IsomorphismOperatorVisitor.java
index b44607513d..9e2e87c197 100644
--- a/hyracks-fullstack/algebricks/algebricks-core/src/main/java/org/apache/hyracks/algebricks/core/algebra/operators/logical/visitors/IsomorphismOperatorVisitor.java
+++ b/hyracks-fullstack/algebricks/algebricks-core/src/main/java/org/apache/hyracks/algebricks/core/algebra/operators/logical/visitors/IsomorphismOperatorVisitor.java
@@ -34,6 +34,7 @@ import org.apache.hyracks.algebricks.core.algebra.base.ILogicalOperator;
 import org.apache.hyracks.algebricks.core.algebra.base.ILogicalPlan;
 import org.apache.hyracks.algebricks.core.algebra.base.LogicalOperatorTag;
 import org.apache.hyracks.algebricks.core.algebra.base.LogicalVariable;
+import org.apache.hyracks.algebricks.core.algebra.metadata.IDataSource;
 import org.apache.hyracks.algebricks.core.algebra.operators.logical.AbstractLogicalOperator;
 import org.apache.hyracks.algebricks.core.algebra.operators.logical.AggregateOperator;
 import org.apache.hyracks.algebricks.core.algebra.operators.logical.AssignOperator;
@@ -476,6 +477,12 @@ public class IsomorphismOperatorVisitor implements ILogicalOperatorVisitor<Boole
         if (!isomorphic) {
             return Boolean.FALSE;
         }
+        IDataSource<?> dataSource = op.getDataSource();
+        IDataSource<?> argDataSource = argScan.getDataSource();
+        if (dataSource.compareProperties() && argDataSource.compareProperties()
+                && !Objects.equals(dataSource.getProperties(), argDataSource.getProperties())) {
+            return Boolean.FALSE;
+        }
         DataSourceScanOperator scanOpArg = (DataSourceScanOperator) copyAndSubstituteVar(op, arg);
         ILogicalExpression opCondition = op.getSelectCondition() != null ? op.getSelectCondition().getValue() : null;
         ILogicalExpression argCondition =