You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@atlas.apache.org by sa...@apache.org on 2021/06/03 16:33:52 UTC

[atlas] branch master updated: ATLAS-4324: FS entity created for load data inpath is created as shell entity

This is an automated email from the ASF dual-hosted git repository.

sarath pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/atlas.git


The following commit(s) were added to refs/heads/master by this push:
     new a9e528b  ATLAS-4324: FS entity created for load data inpath is created as shell entity
a9e528b is described below

commit a9e528bca9277707a1847d110c33288188a8fff5
Author: Radhika Kundam <rk...@cloudera.com>
AuthorDate: Thu Jun 3 00:10:12 2021 -0700

    ATLAS-4324: FS entity created for load data inpath is created as shell entity
    
    Signed-off-by: Sarath Subramanian <sa...@apache.org>
---
 .../java/org/apache/atlas/hive/hook/HiveHook.java  |  1 +
 .../atlas/hive/hook/utils/HiveDDLEntityFilter.java | 47 +++++++++++---
 .../hive/hook/utils/ActiveEntityFilterTest.java    | 34 +++++++++-
 .../src/test/resources/json/hs2-create-db-v2.json  | 24 +++++++
 ...eate-db-with-no-pathentities-to-retain-v2.json} |  0
 ...2-create-db-with-no-pathentities-to-retain.json | 73 ++++++++++++++++++++++
 .../test/resources/json/hs2-create-table-v2.json   | 17 ++++-
 .../test/resources/json/hs2-load-inpath-v2.json    | 18 ++++++
 ...d-inpath-with-no-pathentities-to-retain-v2.json |  4 ++
 ...load-inpath-with-no-pathentities-to-retain.json | 18 ++++++
 .../src/test/resources/json/hs2-load-inpath.json   | 18 ++++++
 11 files changed, 240 insertions(+), 14 deletions(-)

diff --git a/addons/hive-bridge/src/main/java/org/apache/atlas/hive/hook/HiveHook.java b/addons/hive-bridge/src/main/java/org/apache/atlas/hive/hook/HiveHook.java
index cf918ef..94ef225 100644
--- a/addons/hive-bridge/src/main/java/org/apache/atlas/hive/hook/HiveHook.java
+++ b/addons/hive-bridge/src/main/java/org/apache/atlas/hive/hook/HiveHook.java
@@ -70,6 +70,7 @@ public class HiveHook extends AtlasHook implements ExecuteWithHookContext {
     public static final String HOOK_HIVE_TABLE_PRUNE_PATTERN                             = CONF_PREFIX + "hive_table.prune.pattern";
     public static final String HOOK_HIVE_TABLE_CACHE_SIZE                                = CONF_PREFIX + "hive_table.cache.size";
     public static final String HOOK_HIVE_IGNORE_DDL_OPERATIONS                           = CONF_PREFIX + "hs2.ignore.ddl.operations";
+    public static final String HOOK_HIVE_FILTER_ENTITY_TYPES_TO_RETAIN                   = CONF_PREFIX + "hs2.filter.entity.types.to.retain";
     public static final String DEFAULT_HOST_NAME = "localhost";
 
     private static final Map<String, HiveOperation> OPERATION_MAP = new HashMap<>();
diff --git a/addons/hive-bridge/src/main/java/org/apache/atlas/hive/hook/utils/HiveDDLEntityFilter.java b/addons/hive-bridge/src/main/java/org/apache/atlas/hive/hook/utils/HiveDDLEntityFilter.java
index 2b39e81..9163c47 100644
--- a/addons/hive-bridge/src/main/java/org/apache/atlas/hive/hook/utils/HiveDDLEntityFilter.java
+++ b/addons/hive-bridge/src/main/java/org/apache/atlas/hive/hook/utils/HiveDDLEntityFilter.java
@@ -18,15 +18,20 @@
 package org.apache.atlas.hive.hook.utils;
 
 import com.google.common.annotations.VisibleForTesting;
+import org.apache.atlas.ApplicationProperties;
+import org.apache.atlas.hive.hook.HiveHook;
 import org.apache.atlas.hive.hook.events.BaseHiveEvent;
 import org.apache.atlas.model.instance.AtlasEntity;
 import org.apache.atlas.model.instance.AtlasObjectId;
-import org.apache.atlas.model.instance.AtlasRelatedObjectId;
 import org.apache.atlas.model.notification.HookNotification;
+import org.apache.atlas.utils.AtlasPathExtractorUtil;
 import org.apache.commons.collections.CollectionUtils;
 import org.apache.commons.collections.MapUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.HashSet;
 import java.util.LinkedHashMap;
 import java.util.List;
@@ -36,14 +41,38 @@ import java.util.function.Function;
 import java.util.stream.Collectors;
 
 public class HiveDDLEntityFilter implements EntityFilter {
+    private static final Logger LOG = LoggerFactory.getLogger(HiveDDLEntityFilter.class);
+
+    private static final Set<String> defaultPathTypes = new HashSet<String>() {{
+        add(AtlasPathExtractorUtil.HDFS_TYPE_PATH);
+        add(AtlasPathExtractorUtil.ADLS_GEN2_DIRECTORY);
+        add(AtlasPathExtractorUtil.GCS_VIRTUAL_DIR);
+        add(AtlasPathExtractorUtil.AWS_S3_V2_PSEUDO_DIR);
+        add(AtlasPathExtractorUtil.AWS_S3_PSEUDO_DIR);
+    }};
+
     private static final Set<String> typesToRetain = new HashSet<String>() {{
         add(BaseHiveEvent.HIVE_TYPE_PROCESS);
         add(BaseHiveEvent.HIVE_TYPE_PROCESS_EXECUTION);
         add(BaseHiveEvent.HIVE_TYPE_COLUMN_LINEAGE);
         add(BaseHiveEvent.HIVE_DB_DDL);
         add(BaseHiveEvent.HIVE_TABLE_DDL);
+        addAll(defaultPathTypes);
+        addAll(getConfiguredTypesToRetain());
     }};
 
+    private static List<String> getConfiguredTypesToRetain() {
+        String[]        configuredTypesToRetain = null;
+
+        try {
+            configuredTypesToRetain = ApplicationProperties.get().getStringArray(HiveHook.HOOK_HIVE_FILTER_ENTITY_TYPES_TO_RETAIN);
+        } catch (Exception e) {
+            LOG.error("Failed to load application properties", e);
+        }
+
+        return configuredTypesToRetain != null ? Arrays.asList(configuredTypesToRetain) : new ArrayList<>();
+    }
+
     public List<HookNotification> apply(List<HookNotification> incoming) {
         if (CollectionUtils.isEmpty(incoming)) {
             return incoming;
@@ -164,15 +193,11 @@ public class HiveDDLEntityFilter implements EntityFilter {
     }
 
     private static void inferObjectTypeResetGuid(Object o) {
-        if (o instanceof AtlasRelatedObjectId) {
-            AtlasRelatedObjectId oid = (AtlasRelatedObjectId) o;
-            if (oid.getUniqueAttributes() != null) {
-                oid.setGuid(null);
-            }
-        }
         if (o instanceof AtlasObjectId) {
-            AtlasObjectId oid = (AtlasObjectId) o;
-            if (oid.getUniqueAttributes() != null) {
+            AtlasObjectId oid      = (AtlasObjectId) o;
+            String        typeName = oid.getTypeName();
+
+            if (oid.getUniqueAttributes() != null && !defaultPathTypes.contains(typeName)) {
                 oid.setGuid(null);
             }
         } else {
@@ -181,7 +206,9 @@ public class HiveDDLEntityFilter implements EntityFilter {
                 return;
             }
 
-            if (hm.containsKey(BaseHiveEvent.ATTRIBUTE_UNIQUE_ATTRIBUTES)) {
+            String typeName = hm.containsKey(AtlasObjectId.KEY_TYPENAME) ? (String) hm.get(AtlasObjectId.KEY_TYPENAME) : null;
+
+            if (hm.containsKey(BaseHiveEvent.ATTRIBUTE_UNIQUE_ATTRIBUTES) && !defaultPathTypes.contains(typeName)) {
                 hm.put(BaseHiveEvent.ATTRIBUTE_GUID, null);
             }
         }
diff --git a/addons/hive-bridge/src/test/java/org/apache/atlas/hive/hook/utils/ActiveEntityFilterTest.java b/addons/hive-bridge/src/test/java/org/apache/atlas/hive/hook/utils/ActiveEntityFilterTest.java
index 6a7af5e..4dde1dc 100644
--- a/addons/hive-bridge/src/test/java/org/apache/atlas/hive/hook/utils/ActiveEntityFilterTest.java
+++ b/addons/hive-bridge/src/test/java/org/apache/atlas/hive/hook/utils/ActiveEntityFilterTest.java
@@ -35,6 +35,7 @@ import java.util.Map;
 import java.util.Set;
 
 import static org.testng.Assert.assertEquals;
+import static org.testng.Assert.assertNotEquals;
 import static org.testng.Assert.assertNotNull;
 import static org.testng.Assert.assertTrue;
 
@@ -55,6 +56,9 @@ public class ActiveEntityFilterTest {
         assertMessageFromFile("hs2-alter-view");
         assertMessageFromFile("hs2-drop-table");
         assertAtlasEntitiesWithExtInfoFromFile("hs2-create-process");
+        assertAtlasEntitiesWithExtInfoFromFile("hs2-load-inpath");
+        assertAtlasEntitiesWithExtInfoFromFile("hs2-create-db-with-no-pathentities-to-retain", false);
+        assertAtlasEntitiesWithExtInfoFromFile("hs2-load-inpath-with-no-pathentities-to-retain", false);
     }
 
     private void assertMessageFromFile(String msgFile) throws IOException {
@@ -135,20 +139,31 @@ public class ActiveEntityFilterTest {
     }
 
     private void assertAtlasEntitiesWithExtInfoFromFile(String entityFile) throws IOException {
+        assertAtlasEntitiesWithExtInfoFromFile(entityFile, true);
+    }
+
+    private void assertAtlasEntitiesWithExtInfoFromFile(String entityFile, boolean retainPathEntities) throws IOException {
         AtlasEntity.AtlasEntitiesWithExtInfo incoming = TestResourceFileUtils.readObjectFromJson("", entityFile, AtlasEntity.AtlasEntitiesWithExtInfo.class);
         AtlasEntity.AtlasEntitiesWithExtInfo expected = TestResourceFileUtils.readObjectFromJson("", entityFile + FILE_SUFFIX, AtlasEntity.AtlasEntitiesWithExtInfo.class);
 
         HiveDDLEntityFilter hiveLineageEntityFilter = new HiveDDLEntityFilter();
         AtlasEntity.AtlasEntitiesWithExtInfo actual = hiveLineageEntityFilter.apply(incoming);
-        assertAtlasEntitiesWithExtInfo(actual, expected);
+
+        if (retainPathEntities) {
+            assertAtlasEntitiesWithExtInfo(actual, expected);
+        } else {
+            assertAtlasEntitiesWithNoPathEntitiesToRetain(actual, expected);
+        }
     }
 
     private void assertAtlasEntitiesWithExtInfo(AtlasEntity.AtlasEntitiesWithExtInfo actual, AtlasEntity.AtlasEntitiesWithExtInfo expected) {
         assertNotNull(actual);
         assertNotNull(expected);
 
-        assertEquals(actual.getEntities().size(), expected.getEntities().size());
-        assertEntity(actual.getEntities(), expected.getEntities());
+        if (expected.getEntities() != null && actual.getEntities() != null) {
+            assertEquals(actual.getEntities().size(), expected.getEntities().size());
+            assertEntity(actual.getEntities(), expected.getEntities());
+        }
 
         assertEquals(MapUtils.isEmpty(actual.getReferredEntities()), MapUtils.isEmpty(expected.getReferredEntities()));
         if (expected.getReferredEntities() != null && actual.getReferredEntities() != null) {
@@ -156,6 +171,19 @@ public class ActiveEntityFilterTest {
         }
     }
 
+    private void assertAtlasEntitiesWithNoPathEntitiesToRetain(AtlasEntity.AtlasEntitiesWithExtInfo actual, AtlasEntity.AtlasEntitiesWithExtInfo expected) {
+        assertNotNull(actual);
+        assertNotNull(expected);
+
+        if (expected.getEntities() != null && actual.getEntities() != null) {
+            assertNotEquals(actual.getEntities().size(), expected.getEntities().size());
+        }
+
+        if (expected.getReferredEntities() != null && actual.getReferredEntities() != null) {
+            assertNotEquals(actual.getReferredEntities().size(), expected.getReferredEntities().size());
+        }
+    }
+
     private void assertEntity(Map<String, AtlasEntity> actual, Map<String, AtlasEntity> expected) {
         assertEquals(actual.size(), expected.size());
     }
diff --git a/addons/hive-bridge/src/test/resources/json/hs2-create-db-v2.json b/addons/hive-bridge/src/test/resources/json/hs2-create-db-v2.json
index 42553b5..881ee10 100644
--- a/addons/hive-bridge/src/test/resources/json/hs2-create-db-v2.json
+++ b/addons/hive-bridge/src/test/resources/json/hs2-create-db-v2.json
@@ -25,6 +25,30 @@
         }
       },
       "proxy": false
+    },
+    {
+      "typeName": "hdfs_path",
+      "attributes": {
+        "path": "hdfs://ve0126.halxg.cloudera.com:8020/warehouse/tablespace/external/hive/cadb02.db",
+        "qualifiedName": "hdfs://ve0126.halxg.cloudera.com:8020/warehouse/tablespace/external/hive/cadb02.db@cm",
+        "clusterName": "cm",
+        "name": "/warehouse/tablespace/external/hive/cadb02.db"
+      },
+      "guid": "-14529329955589450",
+      "isIncomplete": false,
+      "provenanceType": 0,
+      "version": 0,
+      "relationshipAttributes": {
+        "hiveDb": {
+          "guid": "-14529329955589448",
+          "typeName": "hive_db",
+          "uniqueAttributes": {
+            "qualifiedName": "cadb02@cm"
+          },
+          "relationshipType": "hive_db_location"
+        }
+      },
+      "proxy": false
     }
   ]
 }
diff --git a/addons/hive-bridge/src/test/resources/json/hs2-create-db-v2.json b/addons/hive-bridge/src/test/resources/json/hs2-create-db-with-no-pathentities-to-retain-v2.json
similarity index 100%
copy from addons/hive-bridge/src/test/resources/json/hs2-create-db-v2.json
copy to addons/hive-bridge/src/test/resources/json/hs2-create-db-with-no-pathentities-to-retain-v2.json
diff --git a/addons/hive-bridge/src/test/resources/json/hs2-create-db-with-no-pathentities-to-retain.json b/addons/hive-bridge/src/test/resources/json/hs2-create-db-with-no-pathentities-to-retain.json
new file mode 100644
index 0000000..a5b810f
--- /dev/null
+++ b/addons/hive-bridge/src/test/resources/json/hs2-create-db-with-no-pathentities-to-retain.json
@@ -0,0 +1,73 @@
+{
+  "referredEntities": {},
+  "entities": [
+    {
+      "typeName": "hive_db",
+      "attributes": {
+        "owner": "hive",
+        "ownerType": "USER",
+        "managedLocation": null,
+        "qualifiedName": "cadb02@cm",
+        "clusterName": "cm",
+        "name": "cadb02",
+        "location": "hdfs://ve0126.halxg.cloudera.com:8020/warehouse/tablespace/external/hive/cadb02.db",
+        "parameters": {}
+      },
+      "guid": "-14529329955589448",
+      "isIncomplete": false,
+      "provenanceType": 0,
+      "version": 0,
+      "proxy": false
+    },
+    {
+      "typeName": "hive_db_ddl",
+      "attributes": {
+        "serviceType": "hive",
+        "qualifiedName": "cadb02@cm:1616450673617",
+        "execTime": 1616450673617,
+        "queryText": "create database cadb02",
+        "name": "create database cadb02",
+        "userName": "hive"
+      },
+      "guid": "-14529329955589449",
+      "isIncomplete": false,
+      "provenanceType": 0,
+      "version": 0,
+      "relationshipAttributes": {
+        "db": {
+          "guid": "-14529329955589448",
+          "typeName": "hive_db",
+          "uniqueAttributes": {
+            "qualifiedName": "cadb02@cm"
+          },
+          "relationshipType": "hive_db_ddl_queries"
+        }
+      },
+      "proxy": false
+    },
+    {
+      "typeName": "hdfs_path",
+      "attributes": {
+        "path": "hdfs://ve0126.halxg.cloudera.com:8020/warehouse/tablespace/external/hive/cadb02.db",
+        "qualifiedName": "hdfs://ve0126.halxg.cloudera.com:8020/warehouse/tablespace/external/hive/cadb02.db@cm",
+        "clusterName": "cm",
+        "name": "/warehouse/tablespace/external/hive/cadb02.db"
+      },
+      "guid": "-14529329955589450",
+      "isIncomplete": false,
+      "provenanceType": 0,
+      "version": 0,
+      "relationshipAttributes": {
+        "hiveDb": {
+          "guid": "-14529329955589448",
+          "typeName": "hive_db",
+          "uniqueAttributes": {
+            "qualifiedName": "cadb02@cm"
+          },
+          "relationshipType": "hive_db_location"
+        }
+      },
+      "proxy": false
+    }
+  ]
+}
diff --git a/addons/hive-bridge/src/test/resources/json/hs2-create-table-v2.json b/addons/hive-bridge/src/test/resources/json/hs2-create-table-v2.json
index b67f2ff..801918e 100644
--- a/addons/hive-bridge/src/test/resources/json/hs2-create-table-v2.json
+++ b/addons/hive-bridge/src/test/resources/json/hs2-create-table-v2.json
@@ -1,5 +1,20 @@
 {
-  "referredEntities": {},
+  "referredEntities": {
+    "-14529329955589455": {
+      "typeName": "hdfs_path",
+      "attributes": {
+        "path": "hdfs://ve0126.halxg.cloudera.com:8020/tmp/external/hh6.csv",
+        "qualifiedName": "hdfs://ve0126.halxg.cloudera.com:8020/tmp/external/hh6.csv@cm",
+        "clusterName": "cm",
+        "name": "/tmp/external/hh6.csv"
+      },
+      "guid": "-14529329955589455",
+      "isIncomplete": false,
+      "provenanceType": 0,
+      "version": 0,
+      "proxy": false
+    }
+  },
   "entities": [
     {
       "typeName": "hive_process",
diff --git a/addons/hive-bridge/src/test/resources/json/hs2-load-inpath-v2.json b/addons/hive-bridge/src/test/resources/json/hs2-load-inpath-v2.json
new file mode 100644
index 0000000..dd31aa0
--- /dev/null
+++ b/addons/hive-bridge/src/test/resources/json/hs2-load-inpath-v2.json
@@ -0,0 +1,18 @@
+{
+  "referredEntities": {
+    "-14529329955589455": {
+      "typeName": "hdfs_path",
+      "attributes": {
+        "path": "hdfs://ve0126.halxg.cloudera.com:8020/tmp/external/hh6.csv",
+        "qualifiedName": "hdfs://ve0126.halxg.cloudera.com:8020/tmp/external/hh6.csv@cm",
+        "clusterName": "cm",
+        "name": "/tmp/external/hh6.csv"
+      },
+      "guid": "-14529329955589455",
+      "isIncomplete": false,
+      "provenanceType": 0,
+      "version": 0,
+      "proxy": false
+    }
+  }
+}
\ No newline at end of file
diff --git a/addons/hive-bridge/src/test/resources/json/hs2-load-inpath-with-no-pathentities-to-retain-v2.json b/addons/hive-bridge/src/test/resources/json/hs2-load-inpath-with-no-pathentities-to-retain-v2.json
new file mode 100644
index 0000000..7f90d19
--- /dev/null
+++ b/addons/hive-bridge/src/test/resources/json/hs2-load-inpath-with-no-pathentities-to-retain-v2.json
@@ -0,0 +1,4 @@
+{
+  "referredEntities": {
+  }
+}
\ No newline at end of file
diff --git a/addons/hive-bridge/src/test/resources/json/hs2-load-inpath-with-no-pathentities-to-retain.json b/addons/hive-bridge/src/test/resources/json/hs2-load-inpath-with-no-pathentities-to-retain.json
new file mode 100644
index 0000000..dd31aa0
--- /dev/null
+++ b/addons/hive-bridge/src/test/resources/json/hs2-load-inpath-with-no-pathentities-to-retain.json
@@ -0,0 +1,18 @@
+{
+  "referredEntities": {
+    "-14529329955589455": {
+      "typeName": "hdfs_path",
+      "attributes": {
+        "path": "hdfs://ve0126.halxg.cloudera.com:8020/tmp/external/hh6.csv",
+        "qualifiedName": "hdfs://ve0126.halxg.cloudera.com:8020/tmp/external/hh6.csv@cm",
+        "clusterName": "cm",
+        "name": "/tmp/external/hh6.csv"
+      },
+      "guid": "-14529329955589455",
+      "isIncomplete": false,
+      "provenanceType": 0,
+      "version": 0,
+      "proxy": false
+    }
+  }
+}
\ No newline at end of file
diff --git a/addons/hive-bridge/src/test/resources/json/hs2-load-inpath.json b/addons/hive-bridge/src/test/resources/json/hs2-load-inpath.json
new file mode 100644
index 0000000..dd31aa0
--- /dev/null
+++ b/addons/hive-bridge/src/test/resources/json/hs2-load-inpath.json
@@ -0,0 +1,18 @@
+{
+  "referredEntities": {
+    "-14529329955589455": {
+      "typeName": "hdfs_path",
+      "attributes": {
+        "path": "hdfs://ve0126.halxg.cloudera.com:8020/tmp/external/hh6.csv",
+        "qualifiedName": "hdfs://ve0126.halxg.cloudera.com:8020/tmp/external/hh6.csv@cm",
+        "clusterName": "cm",
+        "name": "/tmp/external/hh6.csv"
+      },
+      "guid": "-14529329955589455",
+      "isIncomplete": false,
+      "provenanceType": 0,
+      "version": 0,
+      "proxy": false
+    }
+  }
+}
\ No newline at end of file