You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@crunch.apache.org by rs...@apache.org on 2012/10/10 18:32:08 UTC

[1/3] git commit: CRUNCH-75: Added BloomFilters in crunch-contrib

Updated Branches:
  refs/heads/master 3eb2d3f3b -> ad90b151d


CRUNCH-75: Added BloomFilters in crunch-contrib


Project: http://git-wip-us.apache.org/repos/asf/incubator-crunch/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-crunch/commit/ad90b151
Tree: http://git-wip-us.apache.org/repos/asf/incubator-crunch/tree/ad90b151
Diff: http://git-wip-us.apache.org/repos/asf/incubator-crunch/diff/ad90b151

Branch: refs/heads/master
Commit: ad90b151d9114cebe5382f678170e4a5bee1c119
Parents: 3eb2d3f
Author: Rahul Sharma <rs...@apache.org>
Authored: Wed Oct 10 22:00:07 2012 +0530
Committer: Rahul Sharma <rs...@apache.org>
Committed: Wed Oct 10 22:00:07 2012 +0530

----------------------------------------------------------------------
 crunch-contrib/pom.xml                             |   63 +
 .../crunch/contrib/bloomfilter/BloomFiltersIT.java |   61 +
 crunch-contrib/src/it/resources/shakes.txt         | 3667 +++++++++++++++
 .../contrib/bloomfilter/BloomFilterFactory.java    |  109 +
 .../crunch/contrib/bloomfilter/BloomFilterFn.java  |   68 +
 .../crunch/contrib/bloomfilter/package-info.java   |   24 +
 .../org/apache/crunch/contrib/package-info.java    |   25 +
 crunch-dist/pom.xml                                |    4 +
 pom.xml                                            |    7 +
 9 files changed, 4028 insertions(+), 0 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-crunch/blob/ad90b151/crunch-contrib/pom.xml
----------------------------------------------------------------------
diff --git a/crunch-contrib/pom.xml b/crunch-contrib/pom.xml
new file mode 100644
index 0000000..ef0c4ff
--- /dev/null
+++ b/crunch-contrib/pom.xml
@@ -0,0 +1,63 @@
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor 
+  license agreements. See the NOTICE file distributed with this work for additional 
+  information regarding copyright ownership. The ASF licenses this file to 
+  you under the Apache License, Version 2.0 (the "License"); you may not use 
+  this file except in compliance with the License. You may obtain a copy of 
+  the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required 
+  by applicable law or agreed to in writing, software distributed under the 
+  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 
+  OF ANY KIND, either express or implied. See the License for the specific 
+  language governing permissions and limitations under the License. -->
+<project
+  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"  xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+  <modelVersion>4.0.0</modelVersion>
+  
+  <parent>
+    <groupId>org.apache.crunch</groupId>
+    <artifactId>crunch-parent</artifactId>
+    <version>0.4.0-incubating-SNAPSHOT</version>
+  </parent>
+  
+  <artifactId>crunch-contrib</artifactId>
+  <name>Apache Crunch Contrib</name>
+  
+  <dependencies>
+  
+    <dependency>
+      <groupId>org.apache.crunch</groupId>
+      <artifactId>crunch</artifactId>
+    </dependency>
+    
+    <dependency>
+      <groupId>org.apache.crunch</groupId>
+      <artifactId>crunch-test</artifactId>
+      <scope>test</scope>
+    </dependency>
+    
+     <dependency>
+      <groupId>commons-httpclient</groupId>
+      <artifactId>commons-httpclient</artifactId>
+      <scope>test</scope> <!-- only needed for LocalJobRunner -->
+    </dependency>
+    
+    <dependency>
+      <groupId>org.apache.hadoop</groupId>
+      <artifactId>hadoop-client</artifactId>
+      <scope>provided</scope>
+    </dependency>
+  
+  </dependencies>
+  
+  <build>
+    <plugins>
+      <plugin>
+        <groupId>org.codehaus.mojo</groupId>
+        <artifactId>build-helper-maven-plugin</artifactId>
+      </plugin>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-failsafe-plugin</artifactId>
+      </plugin>
+     </plugins>
+   </build>   
+</project>

http://git-wip-us.apache.org/repos/asf/incubator-crunch/blob/ad90b151/crunch-contrib/src/it/java/org/apache/crunch/contrib/bloomfilter/BloomFiltersIT.java
----------------------------------------------------------------------
diff --git a/crunch-contrib/src/it/java/org/apache/crunch/contrib/bloomfilter/BloomFiltersIT.java b/crunch-contrib/src/it/java/org/apache/crunch/contrib/bloomfilter/BloomFiltersIT.java
new file mode 100644
index 0000000..d91e07f
--- /dev/null
+++ b/crunch-contrib/src/it/java/org/apache/crunch/contrib/bloomfilter/BloomFiltersIT.java
@@ -0,0 +1,61 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.contrib.bloomfilter;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+import java.io.IOException;
+import java.io.Serializable;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.commons.lang.StringUtils;
+import org.apache.crunch.test.CrunchTestSupport;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.util.bloom.BloomFilter;
+import org.apache.hadoop.util.bloom.Key;
+import org.junit.Test;
+
+public class BloomFiltersIT extends CrunchTestSupport implements Serializable {
+
+  @Test
+  public void testFilterCreation() throws IOException {
+    String inputPath = tempDir.copyResourceFileName("shakes.txt");
+    BloomFilterFn<String> filterFn = new BloomFilterFn<String>() {
+      @Override
+      public Collection<Key> generateKeys(String input) {
+        List<String> parts = Arrays.asList(StringUtils.split(input, " "));
+        Collection<Key> keys = new HashSet<Key>();
+        for (String stringpart : parts) {
+          keys.add(new Key(stringpart.getBytes()));
+        }
+        return keys;
+      }
+    };
+    Map<String, BloomFilter> filterValues = BloomFilterFactory.createFilter(new Path(inputPath), filterFn).getValue();
+    assertEquals(1, filterValues.size());
+    BloomFilter filter = filterValues.get("shakes.txt");
+    assertTrue(filter.membershipTest(new Key("Mcbeth".getBytes())));
+    assertTrue(filter.membershipTest(new Key("apples".getBytes())));
+  }
+
+}