You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@gobblin.apache.org by ab...@apache.org on 2018/03/21 08:30:28 UTC

[10/50] incubator-gobblin git commit: [GOBBLIN-397] Create a new dataset version selection policy for filtering dataset versions that have "hidden" paths.

[GOBBLIN-397] Create a new dataset version selection policy for filtering dataset versions that have "hidden" paths.

Closes #2271 from sv2000/gobblin-397


Project: http://git-wip-us.apache.org/repos/asf/incubator-gobblin/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-gobblin/commit/ff13dde1
Tree: http://git-wip-us.apache.org/repos/asf/incubator-gobblin/tree/ff13dde1
Diff: http://git-wip-us.apache.org/repos/asf/incubator-gobblin/diff/ff13dde1

Branch: refs/heads/0.12.0
Commit: ff13dde1c88d21048494cf79fdf2319c488b81c5
Parents: 161bef0
Author: suvasude <su...@linkedin.biz>
Authored: Wed Jan 31 14:00:49 2018 -0800
Committer: Hung Tran <hu...@linkedin.com>
Committed: Wed Jan 31 14:00:49 2018 -0800

----------------------------------------------------------------------
 .../policy/HiddenFilterSelectionPolicy.java     | 90 ++++++++++++++++++++
 .../policy/HiddenFilterSelectionPolicyTest.java | 77 +++++++++++++++++
 2 files changed, 167 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-gobblin/blob/ff13dde1/gobblin-data-management/src/main/java/org/apache/gobblin/data/management/policy/HiddenFilterSelectionPolicy.java
----------------------------------------------------------------------
diff --git a/gobblin-data-management/src/main/java/org/apache/gobblin/data/management/policy/HiddenFilterSelectionPolicy.java b/gobblin-data-management/src/main/java/org/apache/gobblin/data/management/policy/HiddenFilterSelectionPolicy.java
new file mode 100644
index 0000000..1c515ae
--- /dev/null
+++ b/gobblin-data-management/src/main/java/org/apache/gobblin/data/management/policy/HiddenFilterSelectionPolicy.java
@@ -0,0 +1,90 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.gobblin.data.management.policy;
+
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.hadoop.fs.Path;
+
+import com.google.common.base.Predicate;
+import com.google.common.collect.Collections2;
+import com.google.common.collect.Lists;
+import com.typesafe.config.Config;
+
+import org.apache.gobblin.data.management.version.FileSystemDatasetVersion;
+import org.apache.gobblin.util.ConfigUtils;
+
+
+/**
+ * Selects the dataset versions with no "hidden" path, i.e. no path whose own name or any ancestor's name starts
+ * with a prefix configured under {@link #HIDDEN_FILTER_HIDDEN_FILE_PREFIX_KEY} (default: "." and "_").
+ */
+public class HiddenFilterSelectionPolicy implements VersionSelectionPolicy<FileSystemDatasetVersion> {
+  public static final String HIDDEN_FILTER_HIDDEN_FILE_PREFIX_KEY = "selection.hiddenFilter.hiddenFilePrefix";
+  private static final String[] DEFAULT_HIDDEN_FILE_PREFIXES = {".", "_"};
+  private final List<String> hiddenFilePrefixes;
+
+  /** @param config supplies the hidden-file prefixes under {@link #HIDDEN_FILTER_HIDDEN_FILE_PREFIX_KEY}, if set. */
+  public HiddenFilterSelectionPolicy(Config config) {
+    this.hiddenFilePrefixes = config.hasPath(HIDDEN_FILTER_HIDDEN_FILE_PREFIX_KEY)
+        ? ConfigUtils.getStringList(config, HIDDEN_FILTER_HIDDEN_FILE_PREFIX_KEY)
+        : Arrays.asList(DEFAULT_HIDDEN_FILE_PREFIXES);
+  }
+
+  @Override
+  public Class<? extends FileSystemDatasetVersion> versionClass() {
+    return FileSystemDatasetVersion.class;
+  }
+
+  /** Returns true if the name of {@code path} or of any of its ancestors starts with a hidden-file prefix. */
+  private boolean isPathHidden(Path path) {
+    for (Path current = path; current != null; current = current.getParent()) {
+      String name = current.getName();
+      for (String prefix : this.hiddenFilePrefixes) {
+        if (name.startsWith(prefix)) {
+          return true;
+        }
+      }
+    }
+    return false;
+  }
+
+  /** Builds the predicate that accepts a version only when none of its paths is hidden. */
+  private Predicate<FileSystemDatasetVersion> getSelectionPredicate() {
+    return new Predicate<FileSystemDatasetVersion>() {
+      @Override
+      public boolean apply(FileSystemDatasetVersion version) {
+        for (Path path : version.getPaths()) {
+          if (isPathHidden(Path.getPathWithoutSchemeAndAuthority(path))) {
+            return false;
+          }
+        }
+        return true;
+      }
+    };
+  }
+
+  /** Returns a new list of the versions in {@code allVersions} that have no hidden path. */
+  @Override
+  public Collection<FileSystemDatasetVersion> listSelectedVersions(List<FileSystemDatasetVersion> allVersions) {
+    return Lists.newArrayList(Collections2.filter(allVersions, getSelectionPredicate()));
+  }
+}

http://git-wip-us.apache.org/repos/asf/incubator-gobblin/blob/ff13dde1/gobblin-data-management/src/test/java/org/apache/gobblin/data/management/policy/HiddenFilterSelectionPolicyTest.java
----------------------------------------------------------------------
diff --git a/gobblin-data-management/src/test/java/org/apache/gobblin/data/management/policy/HiddenFilterSelectionPolicyTest.java b/gobblin-data-management/src/test/java/org/apache/gobblin/data/management/policy/HiddenFilterSelectionPolicyTest.java
new file mode 100644
index 0000000..5c08b7c
--- /dev/null
+++ b/gobblin-data-management/src/test/java/org/apache/gobblin/data/management/policy/HiddenFilterSelectionPolicyTest.java
@@ -0,0 +1,77 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.gobblin.data.management.policy;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.hadoop.fs.Path;
+import org.joda.time.DateTime;
+import org.testng.Assert;
+import org.testng.annotations.Test;
+
+import com.google.common.collect.ImmutableMap;
+import com.typesafe.config.Config;
+import com.typesafe.config.ConfigFactory;
+
+import org.apache.gobblin.data.management.version.FileSystemDatasetVersion;
+import org.apache.gobblin.data.management.version.TimestampedDatasetVersion;
+
+
+public class HiddenFilterSelectionPolicyTest {
+  /**
+   * Versions under ".temp" or "_temp" must be filtered out for list-valued and string-valued ("_,.") prefixes.
+   */
+  @Test
+  public void testListSelectedVersions() throws Exception {
+    Path visible1 = new Path("/data/dataset/versions/version1");
+    Path visible2 = new Path("/data/dataset/versions/version2");
+    Path dotHidden = new Path("/data/dataset/.temp/tmpPath");
+    Path underscoreHidden = new Path("/data/dataset/_temp/tmpPath");
+
+    Set<String> expectedPaths = new HashSet<>();
+    expectedPaths.add(visible1.toString());
+    expectedPaths.add(visible2.toString());
+
+    List<FileSystemDatasetVersion> allVersions = new ArrayList<>();
+    for (Path versionPath : Arrays.asList(visible1, visible2, dotHidden, underscoreHidden)) {
+      allVersions.add(new TimestampedDatasetVersion(new DateTime(), versionPath));
+    }
+
+    // The same prefixes expressed two ways: as a list value and as a comma-separated string.
+    String prefixKey = HiddenFilterSelectionPolicy.HIDDEN_FILTER_HIDDEN_FILE_PREFIX_KEY;
+    List<Config> configs = new ArrayList<>();
+    configs.add(ConfigFactory.parseMap(ImmutableMap.of(prefixKey, Arrays.asList("_", "."))));
+    configs.add(ConfigFactory.parseMap(ImmutableMap.of(prefixKey, "_,.")));
+
+    for (Config config : configs) {
+      Collection<FileSystemDatasetVersion> selected =
+          new HiddenFilterSelectionPolicy(config).listSelectedVersions(allVersions);
+      Assert.assertEquals(selected.size(), 2);
+      for (FileSystemDatasetVersion version : selected) {
+        for (Path selectedPath : version.getPaths()) {
+          Assert.assertTrue(expectedPaths.contains(selectedPath.toString()));
+        }
+      }
+    }
+  }
+}
\ No newline at end of file