You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datafu.apache.org by mh...@apache.org on 2018/01/02 18:33:25 UTC

[1/2] incubator-datafu git commit: DATAFU-47 UDF for Murmur3, SipHash-2-4 (and other) Hash functions.

Repository: incubator-datafu
Updated Branches:
  refs/heads/master b2b3beb23 -> c39313da6


DATAFU-47 UDF for Murmur3, SipHash-2-4 (and other) Hash functions.

Signed-off-by: Matthew Hayes <mh...@apache.org>


Project: http://git-wip-us.apache.org/repos/asf/incubator-datafu/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-datafu/commit/5262f075
Tree: http://git-wip-us.apache.org/repos/asf/incubator-datafu/tree/5262f075
Diff: http://git-wip-us.apache.org/repos/asf/incubator-datafu/diff/5262f075

Branch: refs/heads/master
Commit: 5262f075159215aff7ab3d1be89d27779f5dbf30
Parents: b2b3beb
Author: Philip (flip) Kromer <fl...@infochimps.org>
Authored: Tue Dec 5 11:55:52 2017 +0200
Committer: Matthew Hayes <mh...@apache.org>
Committed: Tue Jan 2 10:11:49 2018 -0800

----------------------------------------------------------------------
 .../src/main/java/datafu/pig/hash/Hasher.java   | 220 +++++++++++++++++++
 .../main/java/datafu/pig/hash/HasherRand.java   | 119 ++++++++++
 .../java/datafu/test/pig/hash/HashTests.java    | 183 +++++++++++++++
 .../test/pig/hash/HasherRandForTesting.java     |  35 +++
 4 files changed, 557 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/5262f075/datafu-pig/src/main/java/datafu/pig/hash/Hasher.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/hash/Hasher.java b/datafu-pig/src/main/java/datafu/pig/hash/Hasher.java
new file mode 100644
index 0000000..36accc8
--- /dev/null
+++ b/datafu-pig/src/main/java/datafu/pig/hash/Hasher.java
@@ -0,0 +1,220 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package datafu.pig.hash;
+
+import java.math.BigInteger;
+
+import com.google.common.hash.Hashing;
+import com.google.common.hash.HashFunction;
+
+import datafu.pig.util.SimpleEvalFunc;
+
+/**
+ *
+ * Computes a hash value of a string and outputs it in hex.
+ *
+ * The default constructor produces a fast 32-bit non-cryptographic-strength
+ * hash ('murmur3-32') that is reasonable if all you need is good
+ * mixing. Additional constructors are available to choose alternative hash
+ * functions:
+ *
+ * <ul>
+ * <li><code>'murmur3-32', [optional seed]</code> or <code>'murmur3-128',
+ *   [optional seed]</code>: Returns a <a
+ *   href="https://code.google.com/p/smhasher/">murmur3 hash</a> of the given
+ *   length. Murmur3 is fast and has exceptionally good statistical
+ *   properties; it's a good choice if all you need is good mixing of the
+ *   inputs. It is <em>not</em> cryptographically secure; that is, given an
+ *   output value from murmur3, there are efficient algorithms to find an input
+ *   yielding the same output value. Supply the seed as a string of exactly
+ *   8 hexadecimal digits representing a 32-bit unsigned number (no <code>0x</code> prefix).
+ *   Examples: <code>datafu.pig.hash.Hasher('murmur3-32', '56789abc');</code> or <code>datafu.pig.hash.Hasher('murmur3-128', 'b98b9e85');</code>.</li>
+ * <li><code>'sip24', [optional seed]</code>: Returns a <a href="https://131002.net/siphash/">64-bit
+ *   SipHash-2-4</a> hash. SipHash is competitive in performance with Murmur3,
+ *   and is simpler and faster than the cryptographic algorithms below. When
+ *   used with a seed, it can be considered cryptographically secure: given
+ *   the output from a sip24 instance but not the seed used, we cannot
+ *   efficiently craft a message yielding the same output from that instance. To
+ *   supply a seed, pass in a 32-character string representing the seed in
+ *   hexadecimal. If none is given, k = '00010203&hellip;0e0f' is used.</li>
+ * <li><code>'adler32'</code>: Returns an Adler-32 checksum (32 hash bits) by delegating to Java's Adler32 Checksum.</li>
+ * <li><code>'crc32'</code>:   Returns a CRC-32 checksum (32 hash bits) by delegating to Java's CRC32 Checksum.</li>
+ * <li><code>'md5'</code>:     Returns an MD5 hash (128 hash bits) using Java's MD5 MessageDigest.</li>
+ * <li><code>'sha1'</code>:    Returns a SHA-1 hash (160 hash bits) using Java's SHA-1 MessageDigest.</li>
+ * <li><code>'sha256'</code>:  Returns a SHA-256 hash (256 hash bits) using Java's SHA-256 MessageDigest.</li>
+ * <li><code>'sha512'</code>:  Returns a SHA-512 hash (512 hash bits) using Java's SHA-512 MessageDigest.</li>
+ * <li><code>'good-{integer number of bits}'</code>: Returns a general-purpose,
+ *   <i>non-cryptographic-strength</i>, streaming hash function that produces
+ *   hash codes of length at least minimumBits. Users without specific
+ *   compatibility requirements and who do not persist the hash codes are
+ *   encouraged to choose this hash function. (Cryptographers, like dieticians
+ *   and fashionistas, occasionally realize that We've Been Doing it Wrong
+ *   This Whole Time. Using 'good-*' lets you track What the Experts From
+ *   (Milan|NIH|IEEE) Say To (Wear|Eat|Hash With) this Fall.) Expect values
+ *   returned by this hasher to change run-to-run.</li>
+ * </ul>
+ *
+ */
+public class Hasher extends SimpleEvalFunc<String>
+{
+  /** The hash function applied by {@link #call(String)}; assigned by the makeHashFunc methods. */
+  protected     HashFunction hash_func = null;
+
+  private static final String HASH_NAMES = "'murmur3-32' (with optional seed); 'murmur3-128' (with optional seed); 'sip24' (with optional seed); 'crc32', 'adler32', 'md5'; 'sha1'; 'sha256'; 'sha512';  or 'good-{number of bits}'.";
+  protected static final String SEEDED_HASH_NAMES = "'murmur3-32' (with optional seed); 'murmur3-128' (with optional seed); 'sip24' (with optional seed)";
+
+  /**
+   * Charset used to turn input strings into bytes. It is fixed so that the
+   * hash of a given string does not vary with the JVM's default charset.
+   */
+  private static final java.nio.charset.Charset INPUT_CHARSET = java.nio.charset.Charset.forName("UTF-8");
+
+  /**
+   * Generates hash values according to murmur3-32, a non-cryptographic-strength
+   * hash function with good mixing. Equivalent to <code>Hasher("murmur3-32")</code>.
+   *
+   * @throws IllegalArgumentException declared for parity with the other constructors
+   * @throws RuntimeException         declared for parity with the other constructors
+   */
+  public Hasher() throws IllegalArgumentException, RuntimeException
+  {
+    this("murmur3-32");
+  }
+
+  /**
+   * Generates hash values according to the hash function given by algorithm.
+   *
+   * See the Hasher class docs for a list of algorithms and guidance on selection.
+   *
+   * @param  algorithm name of the hash function, e.g. 'murmur3-32', 'sha256' or 'good-64'
+   * @throws IllegalArgumentException if algorithm does not name a known hash function
+   * @throws RuntimeException         if the hash function was already set on this instance
+   * @see    Hasher#makeHashFunc(String algorithm)
+   *
+   */
+  public Hasher(String algorithm) throws IllegalArgumentException, RuntimeException
+  {
+    makeHashFunc(algorithm);
+  }
+
+  /**
+   * Generates hash values according to the hash function given by algorithm,
+   * with initial seed given by the seed.
+   *
+   * See the Hasher class docs for a list of algorithms and guidance on selection.
+   *
+   * @param  algorithm one of 'murmur3-32', 'murmur3-128' or 'sip24'
+   * @param  seed      the seed as hexadecimal digits: 8 for the murmur3 variants, 32 for sip24
+   * @throws IllegalArgumentException if the algorithm takes no seed or the seed is malformed
+   * @throws RuntimeException         declared for backward compatibility
+   * @see    Hasher#makeHashFunc(String algorithm, String seed)
+   *
+   */
+  public Hasher(String algorithm, String seed) throws IllegalArgumentException, RuntimeException
+  {
+    makeHashFunc(algorithm, seed);
+  }
+
+  /**
+   * Sets hash_func to the unseeded HashFunction named by algorithm.
+   *
+   * See the Hasher class docs for a list of algorithms and guidance on selection.
+   *
+   * @param  algorithm name of the hash function; 'good-N' requests at least N bits
+   * @throws IllegalArgumentException if algorithm is not recognized (this includes the
+   *         NumberFormatException raised when the N in 'good-N' is not an integer)
+   * @throws RuntimeException         if hash_func has already been set
+   */
+  private void makeHashFunc(String algorithm) throws IllegalArgumentException, RuntimeException
+  {
+    if (hash_func != null) { throw new RuntimeException("The hash function should only be set once per instance"); }
+
+    if      (algorithm.startsWith("good-")) {
+      // everything after the "good-" prefix is the requested minimum number of bits
+      int bits = Integer.parseInt(algorithm.substring(5));
+      hash_func = Hashing.goodFastHash(bits);
+    }
+    else if (algorithm.equals("murmur3-32")) { hash_func = Hashing.murmur3_32();  }
+    else if (algorithm.equals("murmur3-128")){ hash_func = Hashing.murmur3_128(); }
+    else if (algorithm.equals("sip24"))      { hash_func = Hashing.sipHash24();   }
+    else if (algorithm.equals("sha1"))       { hash_func = Hashing.sha1();        }
+    else if (algorithm.equals("sha256"))     { hash_func = Hashing.sha256();      }
+    else if (algorithm.equals("sha512"))     { hash_func = Hashing.sha512();      }
+    else if (algorithm.equals("md5"))        { hash_func = Hashing.md5();         }
+    else if (algorithm.equals("adler32"))    { hash_func = Hashing.adler32();     }
+    else if (algorithm.equals("crc32"))      { hash_func = Hashing.crc32();       }
+    else { throw new IllegalArgumentException("No hash function found for algorithm "+algorithm+". Allowed values include "+HASH_NAMES); }
+  }
+
+  /**
+   * Sets hash_func to the HashFunction named by algorithm, with initial seed
+   * given by the seed. Unlike the unseeded variant, this replaces any hash
+   * function that was set earlier.
+   *
+   * See the Hasher class docs for a list of algorithms and guidance on selection.
+   *
+   * The seed is interpreted as follows:
+   *
+   * <ul>
+   * <li>With algorithm 'murmur3-32' or 'murmur3-128', supply a 32-bit
+   *   number as a string of exactly 8 hexadecimal digits.</li>
+   * <li>With algorithm 'sip24' supply a 128-bit number as a string of exactly
+   *   32 hexadecimal digits; the first 16 digits become k0, the last 16 k1.</li>
+   * </ul>
+   *
+   * @param  algorithm one of 'murmur3-32', 'murmur3-128' or 'sip24'
+   * @param  seed      the seed in hexadecimal, without sign or prefix
+   * @throws IllegalArgumentException if algorithm takes no seed, or if seed is null,
+   *         of the wrong length, or contains a character that is not a hex digit
+   * @throws RuntimeException         declared for backward compatibility
+   */
+  protected void makeHashFunc(String algorithm, String seed) throws IllegalArgumentException, RuntimeException
+  {
+    if (algorithm.equals("murmur3-32") || algorithm.equals("murmur3-128")) {
+      if (!isUnsignedHex(seed, 8)) { throw new IllegalArgumentException("Seed for "+algorithm+" must be an 8-character string representing a 32-bit unsigned number in hexadecimal."); }
+      int seedint = Hasher.intFromHex(seed);
+      hash_func = algorithm.equals("murmur3-32") ? Hashing.murmur3_32(seedint) : Hashing.murmur3_128(seedint);
+    }
+    else if (algorithm.equals("sip24")){
+      if (!isUnsignedHex(seed, 32)) { throw new IllegalArgumentException("Seed for "+algorithm+" must be a 32-character string representing a 128-bit unsigned number in hexadecimal."); }
+      long k0 = Hasher.longFromHex(seed.substring( 0,16));
+      long k1 = Hasher.longFromHex(seed.substring(16,32));
+      hash_func = Hashing.sipHash24(k0, k1);
+    }
+    else { throw new IllegalArgumentException("No hash function found for algorithm "+algorithm+" with a seed. Allowed values include "+SEEDED_HASH_NAMES); }
+  }
+
+  /**
+   * Tells whether str consists of exactly the given number of ASCII
+   * hexadecimal digits. A length check alone is not enough: BigInteger also
+   * parses a leading '+' or '-', which would let a signed value such as
+   * "-1234567" pass as an 8-character seed.
+   */
+  private static boolean isUnsignedHex(String str, int digits)
+  {
+    if (str == null || str.length() != digits) { return false; }
+    for (int i = 0; i < digits; i++) {
+      char c = str.charAt(i);
+      boolean is_hex = (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F');
+      if (!is_hex) { return false; }
+    }
+    return true;
+  }
+
+  /**
+   * Convert the given unsigned hex string to a long value. Only the low-order
+   * 64 bits are kept (this is what BigInteger.longValue does), so values higher
+   * than Long.MAX_VALUE become negative and digits beyond the last 16 are dropped.
+   * For example, f0e0d0c0b0a09080 becomes -1089641583808049024L.
+   *
+   * @param  hex_str the number in hexadecimal
+   * @return the low-order 64 bits of the number
+   * @throws NumberFormatException if hex_str is not a valid hexadecimal number
+   */
+  public static long longFromHex(String hex_str) {
+    return ( new BigInteger(hex_str, 16) ).longValue();
+  }
+
+  /**
+   * Convert the given unsigned hex string to an int value. Only the low-order
+   * 32 bits are kept (this is what BigInteger.intValue does), so values higher
+   * than Integer.MAX_VALUE become negative and digits beyond the last 8 are dropped.
+   * For example, f1e1d1c1 becomes -236858943.
+   *
+   * @param  hex_str the number in hexadecimal
+   * @return the low-order 32 bits of the number
+   * @throws NumberFormatException if hex_str is not a valid hexadecimal number
+   */
+  public static int intFromHex(String hex_str) {
+    return ( new BigInteger(hex_str, 16) ).intValue();
+  }
+
+  /**
+   * Hashes the UTF-8 bytes of val with the configured hash function.
+   *
+   * @param  val the string to hash; may be null
+   * @return the hash as rendered by the hash code's toString(), or null when val is null
+   */
+  public String call(String val)
+  {
+    // a null field yields a null hash rather than failing the whole job
+    if (val == null) { return null; }
+    return hash_func.hashBytes(val.getBytes(INPUT_CHARSET)).toString();
+  }
+}

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/5262f075/datafu-pig/src/main/java/datafu/pig/hash/HasherRand.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/hash/HasherRand.java b/datafu-pig/src/main/java/datafu/pig/hash/HasherRand.java
new file mode 100644
index 0000000..19993a1
--- /dev/null
+++ b/datafu-pig/src/main/java/datafu/pig/hash/HasherRand.java
@@ -0,0 +1,119 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package datafu.pig.hash;
+
+import java.util.Random;
+
+import datafu.pig.hash.Hasher;
+import com.google.common.hash.HashFunction;
+
+import org.apache.pig.impl.logicalLayer.schema.Schema;
+
+/**
+ *
+ * Computes a hash value of a string using a randomly generated see and
+ * outputs it in hex.
+ *
+ * This class should only be used for hashing algorithms that accept a seed
+ * (murmur3-32, murmur3-128 and sip24).
+ * 
+ * It allows you to generate a well-mixed sequence of values, unpredictable
+ * for every run, without relying on the random number generator for each
+ * record. The seed is generated by the front end (i.e. when you launch your
+ * script) and so is identical for every task.
+ * 
+ * See also @Hasher
+ */
+public class HasherRand extends Hasher
+{
+  protected       HashFunction hash_func;
+  protected final String       algorithm;
+
+ /**
+  * Generates hash values according to murmur3-32, a non-cryptographic-strength
+  * hash function with good mixing.
+  * 
+  * @param  algorithm
+  * @throws IllegalArgumentException, RuntimeException
+  * @see    Hasher(String alg)
+  */
+ public HasherRand() throws IllegalArgumentException, RuntimeException
+ {
+   this("murmur3-32");
+ }
+
+  /**
+   * Creates a hasher for the given algorithm. No seed is chosen here: the seed
+   * is generated in onReady() and the seeded hash function is built from it
+   * on the first call().
+   *
+   * @param  alg the algorithm name; onReady() accepts 'murmur3-32', 'murmur3-128' and 'sip24'
+   * @see    Hasher#Hasher(String)
+   */
+  public HasherRand(String alg) throws IllegalArgumentException, RuntimeException
+  {
+    this.algorithm = alg;
+  }
+
+  /**
+   * Hashes val with the seeded hash function, building that function from the
+   * "rand_seed" instance property the first time it is needed.
+   *
+   * @param  val the single string to hash
+   * @return val, hashed according to the algorithm specified at instantiation
+   */
+  @Override
+  public String call(String val)
+  {
+    if (hash_func == null) {
+      // memoize the hash func
+      String rand_seed = (String)getInstanceProperties().get("rand_seed");
+      super.makeHashFunc(algorithm, rand_seed);
+      // makeHashFunc assigns the parent's field, which this class's hash_func
+      // shadows. Record the result here too; otherwise the null check above
+      // never becomes false and the function is rebuilt for every record.
+      hash_func = super.hash_func;
+    }
+    return super.call(val);
+  }
+
+  /*
+   * Generate a seed exactly once on the front end, so all workers get same value
+   */
+  @Override
+  protected void onReady(Schema in_schema, Schema out_schema) {
+    String rand_seed;
+    Random rg = getRandomGenerator();
+
+    if      (algorithm.equals("murmur3-32")) {
+      int rand_int = rg.nextInt();
+      rand_seed = String.format("%08x", rand_int);
+    }
+    else if (algorithm.equals("murmur3-128")){
+      int rand_int = rg.nextInt();
+      rand_seed = String.format("%08x", rand_int);
+    }
+    else if (algorithm.equals("sip24")) {
+      long rand_k0  = rg.nextLong();
+      long rand_k1  = rg.nextLong();
+      rand_seed = String.format("%016x%016x", rand_k0, rand_k1);
+    }
+    else { throw new IllegalArgumentException("No hash function found for algorithm "+algorithm+" with a seed. Allowed values include "+SEEDED_HASH_NAMES); }
+
+    getInstanceProperties().put("rand_seed", rand_seed);
+
+    super.onReady(in_schema, out_schema);
+  }
+
+  /**
+   * Supplies the generator that onReady() draws the seed from. It is a
+   * separate, overridable method so tests can inject a constant seed.
+   *
+   * @return a new, default-seeded Random
+   */
+  protected Random getRandomGenerator() {
+    Random generator = new Random();
+    return generator;
+  }
+}

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/5262f075/datafu-pig/src/test/java/datafu/test/pig/hash/HashTests.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/test/java/datafu/test/pig/hash/HashTests.java b/datafu-pig/src/test/java/datafu/test/pig/hash/HashTests.java
index 7ff8fb9..3d9cfa0 100644
--- a/datafu-pig/src/test/java/datafu/test/pig/hash/HashTests.java
+++ b/datafu-pig/src/test/java/datafu/test/pig/hash/HashTests.java
@@ -21,6 +21,7 @@ package datafu.test.pig.hash;
 
 import org.adrianwalker.multilinestring.Multiline;
 import org.apache.pig.pigunit.PigTest;
+import org.junit.Assert;
 import org.testng.annotations.Test;
 
 import datafu.test.pig.PigTests;
@@ -159,4 +160,186 @@ public class HashTests extends PigTests
                  "(y5QTmoufMkPmiomOxr2bPQ==)",
                  "(AAAI5Uh7Orrnvoih1LrVcw==)");
   }
+  
+
+  /**
+
+  define DefaultH    datafu.pig.hash.Hasher();
+  define MurmurH32   datafu.pig.hash.Hasher('murmur3-32');
+  define MurmurH32A  datafu.pig.hash.Hasher('murmur3-32', '00000000');
+  define MurmurH32B  datafu.pig.hash.Hasher('murmur3-32', 'b98b9e85');
+  define MurmurH128  datafu.pig.hash.Hasher('murmur3-128');
+  define MurmurH128A datafu.pig.hash.Hasher('murmur3-128', '00000000');
+  define MurmurH128B datafu.pig.hash.Hasher('murmur3-128', 'b98b9e85');
+  --
+  define Sip24H      datafu.pig.hash.Hasher('sip24');
+  define Sip24HA     datafu.pig.hash.Hasher('sip24', '000102030405060708090a0b0c0d0e0f');
+  define Sip24HB     datafu.pig.hash.Hasher('sip24', 'b98b9e856508b355f068d792e1c251c8');
+  --
+  define SHA1H       datafu.pig.hash.Hasher('sha1');
+  define SHA256H     datafu.pig.hash.Hasher('sha256');
+  define SHA512H     datafu.pig.hash.Hasher('sha512');
+  --
+  define MD5H        datafu.pig.hash.Hasher('md5');
+  define CRC32       datafu.pig.hash.Hasher('crc32');
+  define Adler32     datafu.pig.hash.Hasher('adler32');
+
+  data_in = LOAD 'input' as (val:chararray);
+
+  most_hashes = FOREACH data_in GENERATE
+    (MurmurH32A(val)  == DefaultH(val)   ? 'y' : 'n'),
+    (MurmurH32A(val)  == MurmurH32(val)  ? 'y' : 'n'),
+    (MurmurH128A(val) == MurmurH128(val) ? 'y' : 'n'),
+    MurmurH32A(val),  MurmurH32B(val),
+    MurmurH128A(val), MurmurH128B(val),
+    Sip24HA(val),     Sip24HB(val),
+    SHA1H(val),       SHA256H(val),     SHA512H(val),
+    MD5H(val),        CRC32(val),       Adler32(val)
+    ;
+
+  STORE most_hashes INTO 'output';
+   */
+  @Multiline private String hasherTest;
+
+  @Test
+  public void hasherTest() throws Exception
+  {
+    // Runs the hasherTest Pig script over four input lines and compares every output tuple.
+    PigTest test = createPigTestFromString(hasherTest);
+
+    // NOTE(review): for the last input the expected MD5 column below starts with "00000";
+    // presumably it was picked to show that leading zeros are kept in the hex output - confirm.
+    writeLinesToFile("input",
+                     "Of all who give and receive gifts, ",
+                     "such as they are wisest. ",
+                     "Everywhere they are wisest. They are the magi.",
+                     "has_lo_md5_1065433");
+
+    test.runScript();
+
+    // One tuple per input line. Columns follow the script's GENERATE clause: three y/n
+    // equality flags, then murmur3-32 (seed 00000000, seed b98b9e85), murmur3-128 (same two
+    // seeds), sip24 (two keys), sha1, sha256, sha512, md5, crc32 and adler32.
+    assertOutput(test, "most_hashes",
+        "(y,y,y,5dd3faff,858e0486,70874abeba0d07a0848bde717968362e,89002495272c9240f228863c9d229253,ad146f93da1f3cbe,f23760b1a2ac49dd,ce2df99fc66f35f8d00abbb3e58a73dcdc46857d,a6114a2f7bad5016cd11b0f74eab219bf629a41184041bb0e3dce16463a39674,73a7c303812965c115f9190ffdff843bf79289e643dc57d1d35865462fe6059daac2cd9209f50547ce63a902dc10f659aa0a4786e338a6e69062d3eeeccee252,b4b0d2cb8da053680556da959b611614,163c13a8,270c9ada)",
+        "(y,y,y,f292b84a,07f6261e,cf6c44df722abd72ccc7c485bd259ea3,f9e77d8ad1061430160b0d3a0db7d290,23506548545203c0,2a9323fe502784b0,3db8d76faa5ae6b700db8f083e3162f1ff723edd,156acc8aa21b5110140bb5201245bdfaff99cab38f7a030dc6af8e198687e789,42ae14b43e2e4d75166dec84137c1c157c97a2fbf35e380f4f3b015a0af01a3f868b8ea1566a9b7564fbaef490b4e25614823e811ab43339c14a6d2c2fd0f5d0,8d0e66419d96d4a677f69758a7cf17cf,79710b14,e708e576)",
+        "(y,y,y,820c3879,70b0a439,db96091e79c70a61dd328f92a17a657e,45b006d1363e9667081e7667330ff970,b345eac42e6551ad,d9ed1275d9d17639,c9f8b9ec0c9f92a898c81972304ea221ee3b87d3,6b6c1d0e17aa96a8d9dd616e62dc5c00147ba5167bdbaf204b3b3d2a424040fa,30d738452fbd9caaef06f6c7920a02a73797eb7644a361bdf53d154e4f9b2a8fc6a80dc8d3de09706191c76bd87666584fb0150b3c0e8e9a70bf318320771ae3,506a07d334a7fa034550d839671f17f2,348d70a4,a510ed94)",
+        "(y,y,y,d3af58ed,134a4b46,301d20747aca51c7b73d4ceaf622b7fa,4d0702752ff1a376475aa12ef72ea832,9081ffb69dfeafc6,82106544171ed2a7,3d08148fe48f0e00486667833fd7b8bdc63412cf,60019a95b6e67b47d7527ff814eeba1001261f54ce14684a14c3ab6f716bc934,7d4f10c90ec6ca2112ae2f92c34e0d8ff557ed3fabdcef6b4b99af72194e6a60f9df311558f6556c04ba220b5c402c4dbb6268158762c6aa91e4e0a6ef13f8ec,000008e5487b3abae7be88a1d4bad573,5c7ff6ae,a1058d3d)");
+  }
+
+  /**
+
+
+  define Murmur32H_R_T    datafu.test.pig.hash.HasherRandForTesting('murmur3-32');
+  define Murmur128H_R_T   datafu.test.pig.hash.HasherRandForTesting('murmur3-128');
+  define Sip24H_R_T       datafu.test.pig.hash.HasherRandForTesting('sip24');
+
+  define Murmur32H_R_T_2  datafu.test.pig.hash.HasherRandForTesting('murmur3-32'); -- should give same val as R_T1 because we fix the seed
+  define Murmur32H_R_1    datafu.pig.hash.HasherRand('murmur3-32');
+  define Murmur32H_R_2    datafu.pig.hash.HasherRand('murmur3-32'); -- should not give same val because it's random
+
+  data_in = LOAD 'input' as (val:chararray);
+
+  rand_hashes = FOREACH data_in GENERATE
+    (Murmur32H_R_T(val) == Murmur32H_R_T_2(val) ? 'y' : 'n'),
+    (Murmur32H_R_T(val) != Murmur32H_R_1(val)   ? 'y' : 'n'),
+    (Murmur32H_R_1(val) != Murmur32H_R_2(val)   ? 'y' : 'n'),
+    Murmur32H_R_T(val),
+    Murmur128H_R_T(val),
+    Sip24H_R_T(val)
+    ;
+
+  STORE rand_hashes INTO 'output';
+
+   */
+  @Multiline private String hasherRandTest;
+
+  @Test
+  public void hasherRandTest() throws Exception
+  {
+    // Runs the hasherRandTest Pig script, which mixes the fixed-seed test subclass
+    // (HasherRandForTesting) with the really random HasherRand.
+    PigTest test = createPigTestFromString(hasherRandTest);
+
+    writeLinesToFile("input",
+                     "Of all who give and receive gifts, ",
+                     "such as they are wisest. ",
+                     "Everywhere they are wisest. They are the magi.");
+
+    test.runScript();
+
+    // Columns: three y/n flags, then the fixed-seed murmur3-32, murmur3-128 and sip24 hashes.
+    // NOTE(review): the second and third flags compare against randomly seeded hashers, so they
+    // presumably can fail on a rare seed or hash coincidence - confirm this is acceptable.
+    assertOutput(test, "rand_hashes",
+        "(y,y,y,858e0486,89002495272c9240f228863c9d229253,f23760b1a2ac49dd)",
+        "(y,y,y,07f6261e,f9e77d8ad1061430160b0d3a0db7d290,2a9323fe502784b0)",
+        "(y,y,y,70b0a439,45b006d1363e9667081e7667330ff970,d9ed1275d9d17639)");
+  }
+
+  /**
+
+
+  define GoodH       datafu.pig.hash.Hasher('good-32');
+  define BetterH     datafu.pig.hash.Hasher('good-127');
+
+  data_in = LOAD 'input' as (val:chararray);
+
+  vals = FOREACH data_in GENERATE
+    GoodH(val) AS h_32,       BetterH(val) AS h_65;
+
+  -- Seed value changes run-to-run, but we can at least ensure the bitness is sound
+  good_hashes = FOREACH vals GENERATE
+    (SIZE(h_32)*4  >= 32 ? 'y' : 'n') AS got_at_least_32_bits,
+    (SIZE(h_65)*4  >= 65 ? 'y' : 'n') AS got_at_least_65_bits;
+
+  STORE good_hashes INTO 'output';
+   */
+  @Multiline private String hasherGoodTest;
+
+  @Test
+  public void hasherGoodTest() throws Exception
+  {
+    // Runs the hasherGoodTest Pig script for the 'good-N' hashers. Their values are not
+    // pinned; the script only reports whether each hex string is long enough.
+    PigTest test = createPigTestFromString(hasherGoodTest);
+
+    writeLinesToFile("input",
+                     "Of all who give and receive gifts, ",
+                     "such as they are wisest. ",
+                     "Everywhere they are wisest. They are the magi.");
+
+    test.runScript();
+
+    // One (y,y) tuple per input line: both length checks in the script must hold.
+    assertOutput(test, "good_hashes",
+      "(y,y)",
+      "(y,y)",
+      "(y,y)");
+  }
+
+  /*
+    Check that seed conversion is correct on your platform: a hex seed must
+    survive the round trip through longFromHex / intFromHex unchanged, and must
+    map to the same signed value the author got.
+  */
+  @Test
+  public void hasherSeedConversionTest() throws Exception
+  {
+    // 64-bit seed
+    String hex_64    = "f0e0d0c0b0a09080";
+    long   parsed_64 = datafu.pig.hash.Hasher.longFromHex(hex_64);
+    String back_64   = Long.toHexString(parsed_64);
+
+    // bits in match bits out
+    Assert.assertEquals(hex_64, back_64);
+    // your bits equal author's bits
+    Assert.assertEquals(-1089641583808049024L, parsed_64);
+
+    // 32-bit seed
+    String hex_32    = "f1e1d1c1";
+    int    parsed_32 = datafu.pig.hash.Hasher.intFromHex(hex_32);
+    String back_32   = Integer.toHexString(parsed_32);
+
+    // bits in match bits out
+    Assert.assertEquals(hex_32, back_32);
+    // your bits equal author's bits
+    Assert.assertEquals(-236858943, parsed_32);
+
+  }
+
+  /*
+    Check that the random seeds we test with are stable: a Random seeded with
+    69 must keep producing the values that the expected hashes were computed from.
+  */
+  @Test
+  public void hasherRandGenerationTest() throws Exception
+  {
+    // first int drawn from a fresh generator
+    java.util.Random int_source = new java.util.Random(69);
+    String first_int = Integer.toHexString(int_source.nextInt());
+    Assert.assertEquals("b98b9e85", first_int);
+
+    // first two longs drawn from another fresh generator
+    java.util.Random long_source = new java.util.Random(69);
+    String first_long  = Long.toHexString(long_source.nextLong());
+    Assert.assertEquals("b98b9e856508b355", first_long);
+    String second_long = Long.toHexString(long_source.nextLong());
+    Assert.assertEquals("f068d792e1c251c8", second_long);
+  }
+  
 }

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/5262f075/datafu-pig/src/test/java/datafu/test/pig/hash/HasherRandForTesting.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/test/java/datafu/test/pig/hash/HasherRandForTesting.java b/datafu-pig/src/test/java/datafu/test/pig/hash/HasherRandForTesting.java
new file mode 100644
index 0000000..575ade4
--- /dev/null
+++ b/datafu-pig/src/test/java/datafu/test/pig/hash/HasherRandForTesting.java
@@ -0,0 +1,35 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package datafu.test.pig.hash;
+
+/**
+ * class with a fixed random seed for testing purposes
+ */
+public class HasherRandForTesting extends datafu.pig.hash.HasherRand
+{
+  public HasherRandForTesting(String alg) {
+    super(alg);
+  }
+  /* always yield the same seed. */
+  @Override
+  protected java.util.Random getRandomGenerator() {
+    return new java.util.Random(69);
+  }
+}


[2/2] incubator-datafu git commit: Fix typo in docs

Posted by mh...@apache.org.
Fix typo in docs


Project: http://git-wip-us.apache.org/repos/asf/incubator-datafu/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-datafu/commit/c39313da
Tree: http://git-wip-us.apache.org/repos/asf/incubator-datafu/tree/c39313da
Diff: http://git-wip-us.apache.org/repos/asf/incubator-datafu/diff/c39313da

Branch: refs/heads/master
Commit: c39313da6d9aa49808c0c93004eb6455c0808dff
Parents: 5262f07
Author: Matthew Hayes <mh...@apache.org>
Authored: Tue Jan 2 10:32:28 2018 -0800
Committer: Matthew Hayes <mh...@apache.org>
Committed: Tue Jan 2 10:32:28 2018 -0800

----------------------------------------------------------------------
 datafu-pig/src/main/java/datafu/pig/hash/HasherRand.java | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/c39313da/datafu-pig/src/main/java/datafu/pig/hash/HasherRand.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/hash/HasherRand.java b/datafu-pig/src/main/java/datafu/pig/hash/HasherRand.java
index 19993a1..c5be5cd 100644
--- a/datafu-pig/src/main/java/datafu/pig/hash/HasherRand.java
+++ b/datafu-pig/src/main/java/datafu/pig/hash/HasherRand.java
@@ -28,17 +28,17 @@ import org.apache.pig.impl.logicalLayer.schema.Schema;
 
 /**
  *
- * Computes a hash value of a string using a randomly generated see and
+ * Computes a hash value of a string using a randomly generated seed and
  * outputs it in hex.
  *
  * This class should only be used for hashing algorithms that accept a seed
  * (murmur3-32, murmur3-128 and sip24).
- * 
+ *
  * It allows you to generate a well-mixed sequence of values, unpredictable
  * for every run, without relying on the random number generator for each
  * record. The seed is generated by the front end (i.e. when you launch your
  * script) and so is identical for every task.
- * 
+ *
  * See also @Hasher
  */
 public class HasherRand extends Hasher
@@ -49,7 +49,7 @@ public class HasherRand extends Hasher
  /**
   * Generates hash values according to murmur3-32, a non-cryptographic-strength
   * hash function with good mixing.
-  * 
+  *
   * @param  algorithm
   * @throws IllegalArgumentException, RuntimeException
   * @see    Hasher(String alg)