You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@accumulo.apache.org by jm...@apache.org on 2023/02/14 13:14:17 UTC

[accumulo] branch 2.1 updated: Update validation of options in Sampler code (#3193)

This is an automated email from the ASF dual-hosted git repository.

jmark99 pushed a commit to branch 2.1
in repository https://gitbox.apache.org/repos/asf/accumulo.git


The following commit(s) were added to refs/heads/2.1 by this push:
     new d15493b275 Update validation of options in Sampler code (#3193)
d15493b275 is described below

commit d15493b275cbd38aee1275957f30ef9c0724b927
Author: Mark Owens <jm...@apache.org>
AuthorDate: Tue Feb 14 08:14:10 2023 -0500

    Update validation of options in Sampler code (#3193)
    
    Discovered a bug in AbstractHashSampler and RowColumnSampler classes
    while examining the Sample example. The final portion of the example was
    failing to return any data. Found that the AbstractHashSampler would produce and
    error any time an any additional option were provided. It would only handle the
    required hasher and modulus options correctly.
    
    Conversely, the RowColumnSampler would fail when presented with the
    require hasher and modulus options.
    
    Updated such that the RowColumnSampler will ignore the require options,
    allowing the AbstractHashSampler to handle them, and only validate options
    specific to the RowColumnSampler.
    
    The AbstractHashSampler was updated to ignore any non-required option.
    
    Comments were added to indicate that the AbstractHashSampler validates
    required options while call sub-classes should handle any additional Sampler
    specific options.
---
 .../core/client/sample/AbstractHashSampler.java    | 37 +++++-----
 .../core/client/sample/RowColumnSampler.java       |  5 +-
 .../apache/accumulo/test/shell/ShellServerIT.java  | 79 +++++++++++++++++++++-
 3 files changed, 101 insertions(+), 20 deletions(-)

diff --git a/core/src/main/java/org/apache/accumulo/core/client/sample/AbstractHashSampler.java b/core/src/main/java/org/apache/accumulo/core/client/sample/AbstractHashSampler.java
index f4a2abd1a8..072f6cae97 100644
--- a/core/src/main/java/org/apache/accumulo/core/client/sample/AbstractHashSampler.java
+++ b/core/src/main/java/org/apache/accumulo/core/client/sample/AbstractHashSampler.java
@@ -57,8 +57,12 @@ public abstract class AbstractHashSampler implements Sampler {
   private HashFunction hashFunction;
   private int modulus;
 
-  private static final Set<String> VALID_OPTIONS = Set.of("hasher", "modulus");
-  private static final Set<String> VALID_VALUES_HASHER = Set.of("murmur3_32", "md5", "sha1");
+  private static final String HASHER_PROP_NAME = "hasher";
+  private static final String MODULUS_PROP_NAME = "modulus";
+
+  protected static final Set<String> REQUIRED_SAMPLER_OPTIONS =
+      Set.of(HASHER_PROP_NAME, MODULUS_PROP_NAME);
+  private static final Set<String> VALID_HASHERS = Set.of("murmur3_32", "md5", "sha1");
 
   /**
    * Subclasses with options should override this method to validate subclass options while also
@@ -66,17 +70,16 @@ public abstract class AbstractHashSampler implements Sampler {
    */
   @Override
   public void validateOptions(Map<String,String> config) {
+    // Validate required properties, HASHER_PROP_NAME and MODULUS_PROP_NAME
+    // Any additional options are validated in calling subclass
     for (Map.Entry<String,String> entry : config.entrySet()) {
-      checkArgument(VALID_OPTIONS.contains(entry.getKey()), "Unknown option: %s", entry.getKey());
-
-      if (entry.getKey().equals("hasher")) {
-        checkArgument(VALID_VALUES_HASHER.contains(entry.getValue()),
-            "Unknown value for hasher: %s", entry.getValue());
+      if (entry.getKey().equals(HASHER_PROP_NAME)) {
+        checkArgument(VALID_HASHERS.contains(entry.getValue()), "Unknown value for %s: %s",
+            HASHER_PROP_NAME, entry.getValue());
       }
-
-      if (entry.getKey().equals("modulus")) {
-        checkArgument(Integer.parseInt(entry.getValue()) > 0,
-            "Improper Integer value for modulus: %s", entry.getValue());
+      if (entry.getKey().equals(MODULUS_PROP_NAME)) {
+        checkArgument(Integer.parseInt(entry.getValue()) > 0, "Improper Integer value for %s: %s",
+            MODULUS_PROP_NAME, entry.getValue());
       }
     }
   }
@@ -89,7 +92,7 @@ public abstract class AbstractHashSampler implements Sampler {
    */
   @Deprecated(since = "2.1.0")
   protected boolean isValidOption(String option) {
-    return VALID_OPTIONS.contains(option);
+    return REQUIRED_SAMPLER_OPTIONS.contains(option);
   }
 
   /**
@@ -99,11 +102,11 @@ public abstract class AbstractHashSampler implements Sampler {
       justification = "these hashes don't protect any secrets, just used for binning")
   @Override
   public void init(SamplerConfiguration config) {
-    String hasherOpt = config.getOptions().get("hasher");
-    String modulusOpt = config.getOptions().get("modulus");
+    String hasherOpt = config.getOptions().get(HASHER_PROP_NAME);
+    String modulusOpt = config.getOptions().get(MODULUS_PROP_NAME);
 
-    requireNonNull(hasherOpt, "Hasher not specified");
-    requireNonNull(modulusOpt, "Modulus not specified");
+    requireNonNull(hasherOpt, HASHER_PROP_NAME + " not specified");
+    requireNonNull(modulusOpt, MODULUS_PROP_NAME + " not specified");
 
     switch (hasherOpt) {
       case "murmur3_32":
@@ -120,7 +123,7 @@ public abstract class AbstractHashSampler implements Sampler {
         hashFunction = deprecatedSha1;
         break;
       default:
-        throw new IllegalArgumentException("Unknown hasher " + hasherOpt);
+        throw new IllegalArgumentException("Unknown " + HASHER_PROP_NAME + ": " + hasherOpt);
     }
 
     modulus = Integer.parseInt(modulusOpt);
diff --git a/core/src/main/java/org/apache/accumulo/core/client/sample/RowColumnSampler.java b/core/src/main/java/org/apache/accumulo/core/client/sample/RowColumnSampler.java
index dbc2ca1e67..01f1f489a1 100644
--- a/core/src/main/java/org/apache/accumulo/core/client/sample/RowColumnSampler.java
+++ b/core/src/main/java/org/apache/accumulo/core/client/sample/RowColumnSampler.java
@@ -84,9 +84,12 @@ public class RowColumnSampler extends AbstractHashSampler {
 
   @Override
   public void validateOptions(Map<String,String> config) {
+    // The required base options are validated in the super class.
     super.validateOptions(config);
+    // RowColumnSampler specific options are validated here.
     for (String option : config.keySet()) {
-      checkArgument(VALID_OPTIONS.contains(option), "Unknown option : %s", option);
+      checkArgument(VALID_OPTIONS.contains(option) || REQUIRED_SAMPLER_OPTIONS.contains(option),
+          "Unknown option : %s", option);
     }
   }
 
diff --git a/test/src/main/java/org/apache/accumulo/test/shell/ShellServerIT.java b/test/src/main/java/org/apache/accumulo/test/shell/ShellServerIT.java
index a29dba82ab..d24363a169 100644
--- a/test/src/main/java/org/apache/accumulo/test/shell/ShellServerIT.java
+++ b/test/src/main/java/org/apache/accumulo/test/shell/ShellServerIT.java
@@ -53,6 +53,7 @@ import org.apache.accumulo.core.client.AccumuloClient;
 import org.apache.accumulo.core.client.IteratorSetting;
 import org.apache.accumulo.core.client.Scanner;
 import org.apache.accumulo.core.client.admin.TableOperations;
+import org.apache.accumulo.core.client.sample.RowColumnSampler;
 import org.apache.accumulo.core.client.sample.RowSampler;
 import org.apache.accumulo.core.client.security.tokens.AuthenticationToken;
 import org.apache.accumulo.core.client.security.tokens.KerberosToken;
@@ -901,7 +902,7 @@ public class ShellServerIT extends SharedMiniClusterBase {
   }
 
   @Test
-  public void testScanScample() throws Exception {
+  public void testScanSample() throws Exception {
     final String table = getUniqueNames(1)[0];
 
     // compact
@@ -929,7 +930,7 @@ public class ShellServerIT extends SharedMiniClusterBase {
     ts.exec("scan --sample", true, "accumulo", false);
     ts.exec("grep --sample acc", true, "accumulo", false);
 
-    // create table where table sample config differs from whats in file
+    // Create table where table sample config differs from what's in file
     String clone2 = table + "_clone_2";
     ts.exec("clonetable -s"
         + " table.sampler.opt.hasher=murmur3_32,table.sampler.opt.modulus=2,table.sampler="
@@ -950,6 +951,80 @@ public class ShellServerIT extends SharedMiniClusterBase {
     ts.exec("grep --sample 89", true, "8934", false);
   }
 
+  // Feb 2023 - the sample example utilizing RowColumnSampler was failing. Analysis found that
+  // the AbstractHashSampler was throwing an exception when additional options were utilized,
+  // i.e., options beyond the required base 'hasher' and 'modulus' options. Additionally, the
+  // RowColumnSampler threw an exception when the base options were parsed.
+  // This test exercises a sampler that utilizes additional options and verifies options are parsed
+  // successfully.
+  @Test
+  public void testScanSampleOptions() throws Exception {
+    final String table = getUniqueNames(1)[0];
+
+    ts.exec("createtable " + table);
+
+    ts.exec("insert 9255 doc content 'abcde'");
+    ts.exec("insert 9255 doc url file://foo.txt");
+    ts.exec("insert 8934 doc content 'accumulo scales'");
+    ts.exec("insert 8934 doc url file://accumulo_notes.txt");
+    ts.exec("insert 5454 image size 2024,800");
+    ts.exec("insert 7000 image metadata '2023/01/02 12:34:43'");
+    ts.exec("insert 7000 image uri file://image1.jpg");
+    ts.exec("insert 2317 doc content 'milk, eggs, bread, parmigiano-reggiano'");
+    ts.exec("insert 2317 doc url file://groceries/9.txt");
+    ts.exec("insert 3900 doc content 'EC2 ate my homework'");
+    ts.exec("insert 3900 doc url file://final_project.txt");
+
+    // Verify sampler with more than just base options parses correctly
+    String clone1 = table + "_clone_1";
+    ts.exec("clonetable -s"
+        + " table.sampler.opt.hasher=murmur3_32,table.sampler.opt.modulus=3,table.sampler.opt.qualifier="
+        + "true,table.sampler=" + RowColumnSampler.class.getName() + " " + table + " " + clone1);
+    ts.exec("compact -t " + clone1 + " -w --sf-no-sample");
+    ts.exec("table " + clone1);
+    ts.exec("scan --sample", true);
+    for (String expected : Arrays.asList("groceries", "final_project", "accumulo_notes",
+        "foo.txt")) {
+      ts.exec("scan --sample", true, expected, true);
+      ts.exec("grep --sample " + expected.substring(0, 2), true, expected, true);
+    }
+    for (String notExpected : Arrays.asList("bread", "homework", "scales", "abcde", "1024", "2023",
+        "image1")) {
+      ts.exec("scan --sample", true, notExpected, false);
+      ts.exec("grep --sample " + notExpected.substring(0, 2), true, notExpected, false);
+    }
+
+    // Verify failure to provide a base option results in empty sample result
+    String clone2 = table + "_clone_2";
+    ts.exec("clonetable -s table.sampler.opt.hasher=murmur3_32,table.sampler.opt.qualifier="
+        + "true,table.sampler=" + RowColumnSampler.class.getName() + " " + table + " " + clone2);
+    ts.exec("compact -t " + clone2 + " -w --sf-no-sample");
+    ts.exec("table " + clone2, true);
+    ts.exec("config -t " + clone2 + " -f sampler", true);
+    ts.exec("scan --sample", true);
+    // None of the expected rows/values should be returned from scan
+    for (String notExpected : Arrays.asList("2317", "3900", "5454", "7000", "8934", "9255")) {
+      ts.exec("scan --sample", true, notExpected, false);
+      ts.exec("grep --sample " + notExpected.substring(0, 3), true, notExpected, false);
+    }
+
+    // Verify providing an invalid option results in empty sample result
+    String clone3 = table + "_clone_3";
+    ts.exec("clonetable -s "
+        + "table.sampler.opt.hasher=murmur3_32,table.sampler.opt.modulus=5,table.sampler.opt.qualifier="
+        + "true,table.sampler.opt.badprop=42,table.sampler=" + RowColumnSampler.class.getName()
+        + " " + table + " " + clone3);
+    ts.exec("compact -t " + clone3 + " -w --sf-no-sample");
+    ts.exec("table " + clone3, true);
+    ts.exec("scan --sample", true);
+    // None of the expected rows/values should be returned from scan
+    for (String expected : Arrays.asList("2317", "3900", "5454", "7000", "8934", "9255")) {
+      ts.exec("scan --sample", true, expected, false);
+      ts.exec("grep --sample " + expected.substring(0, 3), true, expected, false);
+    }
+
+  }
+
   @Test
   public void constraint() throws Exception {
     final String table = getUniqueNames(1)[0];