You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@gobblin.apache.org by ap...@apache.org on 2021/08/11 18:07:54 UTC

[gobblin] branch master updated: [GOBBLIN-1513] fixed the construction of regex filters (#3361)

This is an automated email from the ASF dual-hosted git repository.

aplex pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/gobblin.git


The following commit(s) were added to refs/heads/master by this push:
     new cdd63ed  [GOBBLIN-1513] fixed the construction of regex filters (#3361)
cdd63ed is described below

commit cdd63edc2cb67d0393a666cc493ffc14a7173f96
Author: Arjun Singh Bora <ab...@linkedin.com>
AuthorDate: Wed Aug 11 23:37:50 2021 +0530

    [GOBBLIN-1513] fixed the construction of regex filters (#3361)
    
    Path filters are supposed to be configured through the key "gobblin.dataset.path.filter.class", but instantiatePathFilter only calls the no-argument constructor of the path filter class. Right now only HiddenFilter can be created with a no-argument constructor.
    This PR adds a Properties-based constructor to RegexPathFilter so that it can be used with the config "gobblin.dataset.path.filter.class".
---
 .../data/management/dataset/DatasetUtils.java      | 12 +++---
 .../data/management/dataset/PathFilterTest.java    | 47 ++++++++++++++++++++++
 .../org/apache/gobblin/util/PropertiesUtils.java   | 22 ++++++++++
 .../gobblin/util/filters/RegexPathFilter.java      |  6 +++
 .../apache/gobblin/util/PropertiesUtilsTest.java   | 27 +++++++++++++
 5 files changed, 107 insertions(+), 7 deletions(-)

diff --git a/gobblin-data-management/src/main/java/org/apache/gobblin/data/management/dataset/DatasetUtils.java b/gobblin-data-management/src/main/java/org/apache/gobblin/data/management/dataset/DatasetUtils.java
index 97dd2d9..16386c2 100644
--- a/gobblin-data-management/src/main/java/org/apache/gobblin/data/management/dataset/DatasetUtils.java
+++ b/gobblin-data-management/src/main/java/org/apache/gobblin/data/management/dataset/DatasetUtils.java
@@ -33,6 +33,7 @@ import org.apache.gobblin.dataset.IterableDatasetFinderImpl;
 import org.apache.gobblin.data.management.copy.CopyableFile;
 import org.apache.gobblin.data.management.copy.CopyableFileFilter;
 import org.apache.gobblin.dataset.DatasetsFinder;
+import org.apache.gobblin.util.PropertiesUtils;
 import org.apache.gobblin.util.reflection.GobblinConstructorUtils;
 
 
@@ -43,7 +44,7 @@ public class DatasetUtils {
 
   public static final String CONFIGURATION_KEY_PREFIX = "gobblin.dataset.";
   public static final String DATASET_PROFILE_CLASS_KEY = CONFIGURATION_KEY_PREFIX + "profile.class";
-  private static final String PATH_FILTER_KEY = CONFIGURATION_KEY_PREFIX + "path.filter.class";
+  public static final String PATH_FILTER_KEY = CONFIGURATION_KEY_PREFIX + "path.filter.class";
   private static final String COPYABLE_FILE_FILTER_KEY = CONFIGURATION_KEY_PREFIX + "copyable.file.filter.class";
 
   private static final PathFilter ACCEPT_ALL_PATH_FILTER = new PathFilter() {
@@ -114,12 +115,9 @@ public class DatasetUtils {
 
     try {
       Class<?> pathFilterClass = Class.forName(props.getProperty(PATH_FILTER_KEY));
-      return (PathFilter) pathFilterClass.newInstance();
-    } catch (ClassNotFoundException exception) {
-      throw new RuntimeException(exception);
-    } catch (InstantiationException exception) {
-      throw new RuntimeException(exception);
-    } catch (IllegalAccessException exception) {
+      return (PathFilter) GobblinConstructorUtils.invokeLongestConstructor(pathFilterClass,
+          PropertiesUtils.extractPropertiesWithPrefixAfterRemovingPrefix(props, CONFIGURATION_KEY_PREFIX));
+    } catch (ReflectiveOperationException exception) {
       throw new RuntimeException(exception);
     }
   }
diff --git a/gobblin-data-management/src/test/java/org/apache/gobblin/data/management/dataset/PathFilterTest.java b/gobblin-data-management/src/test/java/org/apache/gobblin/data/management/dataset/PathFilterTest.java
new file mode 100644
index 0000000..ec72647
--- /dev/null
+++ b/gobblin-data-management/src/test/java/org/apache/gobblin/data/management/dataset/PathFilterTest.java
@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.gobblin.data.management.dataset;
+
+import java.util.Properties;
+
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.PathFilter;
+import org.testng.Assert;
+import org.testng.annotations.Test;
+
+import org.apache.gobblin.util.filters.RegexPathFilter;
+
+
+public class PathFilterTest {
+  /** Verifies that a RegexPathFilter configured purely through props accepts and rejects the expected paths. */
+  @Test
+  public void testRegexFilter() {
+    Path unmatchedPath = new Path(".abc");
+    Path matchedPath1 = new Path("abc");
+    Path matchedPath2 = new Path("a.bc");
+    Properties props = new Properties();
+    props.setProperty(DatasetUtils.PATH_FILTER_KEY, RegexPathFilter.class.getName());
+    props.setProperty(DatasetUtils.CONFIGURATION_KEY_PREFIX + RegexPathFilter.REGEX, "^[^.].*"); // match everything that does not start with a dot
+    // Build the filter from configuration only, via DatasetUtils rather than a direct constructor call.
+    PathFilter includeFilter = DatasetUtils.instantiatePathFilter(props);
+    // The dot-prefixed path must be rejected; the other two paths must be accepted.
+    Assert.assertFalse(includeFilter.accept(unmatchedPath));
+    Assert.assertTrue(includeFilter.accept(matchedPath1));
+    Assert.assertTrue(includeFilter.accept(matchedPath2));
+  }
+}
diff --git a/gobblin-utility/src/main/java/org/apache/gobblin/util/PropertiesUtils.java b/gobblin-utility/src/main/java/org/apache/gobblin/util/PropertiesUtils.java
index 527e250..bb53404 100644
--- a/gobblin-utility/src/main/java/org/apache/gobblin/util/PropertiesUtils.java
+++ b/gobblin-utility/src/main/java/org/apache/gobblin/util/PropertiesUtils.java
@@ -131,6 +131,28 @@ public class PropertiesUtils {
     return extractedProperties;
   }
 
+  /**
+   * Copy every entry whose key starts with <code>prefix</code> from the given {@link Properties} into a new
+   * {@link Properties} instance, storing each one under its key with the leading <code>prefix</code> removed.
+   *
+   * @param properties the given {@link Properties} instance; must not be null
+   * @param prefix of keys to be extracted; must not be null
+   * @return a new {@link Properties} instance holding the matching entries under their prefix-stripped keys
+   */
+  public static Properties extractPropertiesWithPrefixAfterRemovingPrefix(Properties properties, String prefix) {
+    Preconditions.checkNotNull(properties);
+    Preconditions.checkNotNull(prefix);
+
+    Properties extractedProperties = new Properties();
+    for (Map.Entry<Object, Object> entry : properties.entrySet()) {
+      if (StringUtils.startsWith(entry.getKey().toString(), prefix)) {
+        extractedProperties.put(entry.getKey().toString().substring(prefix.length()), entry.getValue());
+      }
+    }
+
+    return extractedProperties;
+  }
+
   public static String serialize(Properties properties) throws IOException {
     StringWriter outputWriter = new StringWriter();
     properties.store(outputWriter, "");
diff --git a/gobblin-utility/src/main/java/org/apache/gobblin/util/filters/RegexPathFilter.java b/gobblin-utility/src/main/java/org/apache/gobblin/util/filters/RegexPathFilter.java
index 619d8d5..cf25122 100644
--- a/gobblin-utility/src/main/java/org/apache/gobblin/util/filters/RegexPathFilter.java
+++ b/gobblin-utility/src/main/java/org/apache/gobblin/util/filters/RegexPathFilter.java
@@ -17,6 +17,7 @@
 
 package org.apache.gobblin.util.filters;
 
+import java.util.Properties;
 import java.util.regex.Pattern;
 
 import org.apache.hadoop.fs.Path;
@@ -34,6 +35,11 @@ public class RegexPathFilter implements PathFilter {
 
   private final Pattern regex;
   private final boolean include;
+  public static final String REGEX = "path.filter.regex";
+  /** Builds the filter from the {@link #REGEX} property of props. NOTE(review): an unset key passes null on - confirm. */
+  public RegexPathFilter(Properties props) {
+    this(props.getProperty(REGEX));
+  }
 
   public RegexPathFilter(String regex) {
     this(regex, true);
diff --git a/gobblin-utility/src/test/java/org/apache/gobblin/util/PropertiesUtilsTest.java b/gobblin-utility/src/test/java/org/apache/gobblin/util/PropertiesUtilsTest.java
index ae2ae80..f4470a8 100644
--- a/gobblin-utility/src/test/java/org/apache/gobblin/util/PropertiesUtilsTest.java
+++ b/gobblin-utility/src/test/java/org/apache/gobblin/util/PropertiesUtilsTest.java
@@ -55,6 +55,33 @@ public class PropertiesUtilsTest {
   }
 
   @Test
+  public void testExtractPropertiesWithPrefixAfterRemovingPrefix() {
+    // Entries under two prefixes: each extraction must return only its own entries, with the prefix stripped.
+    Properties properties = new Properties();
+    properties.setProperty("k1.kk1", "v1");
+    properties.setProperty("k1.kk2", "v2");
+    properties.setProperty("k2.kk", "v3");
+
+    // First prefix
+    Properties extractedPropertiesK1 = PropertiesUtils.extractPropertiesWithPrefixAfterRemovingPrefix(properties, "k1.");
+    Assert.assertEquals(extractedPropertiesK1.getProperty("kk1"), "v1");
+    Assert.assertEquals(extractedPropertiesK1.getProperty("kk2"), "v2");
+    Assert.assertFalse(extractedPropertiesK1.containsKey("k2.kk"));
+
+    // Second prefix
+    Properties extractedPropertiesK2 = PropertiesUtils.extractPropertiesWithPrefixAfterRemovingPrefix(properties, "k2");
+    Assert.assertFalse(extractedPropertiesK2.containsKey("k1.kk1"));
+    Assert.assertFalse(extractedPropertiesK2.containsKey("k1.kk2"));
+    Assert.assertEquals(extractedPropertiesK2.getProperty(".kk"), "v3");
+
+    // Missing prefix
+    Properties extractedPropertiesK3 = PropertiesUtils.extractPropertiesWithPrefixAfterRemovingPrefix(properties, "k3");
+    Assert.assertFalse(extractedPropertiesK3.containsKey("k1.kk1"));
+    Assert.assertFalse(extractedPropertiesK3.containsKey("k1.kk2"));
+    Assert.assertFalse(extractedPropertiesK3.containsKey("k2.kk"));
+  }
+
+  @Test
   public void testGetStringList() {
     Properties properties = new Properties();
     properties.put("key", "1,2, 3");