You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@crunch.apache.org by rs...@apache.org on 2012/10/10 18:32:08 UTC
[1/3] git commit: CRUNCH-75: Added BloomFilters in crunch-contrib
Updated Branches:
refs/heads/master 3eb2d3f3b -> ad90b151d
CRUNCH-75: Added BloomFilters in crunch-contrib
Project: http://git-wip-us.apache.org/repos/asf/incubator-crunch/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-crunch/commit/ad90b151
Tree: http://git-wip-us.apache.org/repos/asf/incubator-crunch/tree/ad90b151
Diff: http://git-wip-us.apache.org/repos/asf/incubator-crunch/diff/ad90b151
Branch: refs/heads/master
Commit: ad90b151d9114cebe5382f678170e4a5bee1c119
Parents: 3eb2d3f
Author: Rahul Sharma <rs...@apache.org>
Authored: Wed Oct 10 22:00:07 2012 +0530
Committer: Rahul Sharma <rs...@apache.org>
Committed: Wed Oct 10 22:00:07 2012 +0530
----------------------------------------------------------------------
crunch-contrib/pom.xml | 63 +
.../crunch/contrib/bloomfilter/BloomFiltersIT.java | 61 +
crunch-contrib/src/it/resources/shakes.txt | 3667 +++++++++++++++
.../contrib/bloomfilter/BloomFilterFactory.java | 109 +
.../crunch/contrib/bloomfilter/BloomFilterFn.java | 68 +
.../crunch/contrib/bloomfilter/package-info.java | 24 +
.../org/apache/crunch/contrib/package-info.java | 25 +
crunch-dist/pom.xml | 4 +
pom.xml | 7 +
9 files changed, 4028 insertions(+), 0 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-crunch/blob/ad90b151/crunch-contrib/pom.xml
----------------------------------------------------------------------
diff --git a/crunch-contrib/pom.xml b/crunch-contrib/pom.xml
new file mode 100644
index 0000000..ef0c4ff
--- /dev/null
+++ b/crunch-contrib/pom.xml
@@ -0,0 +1,63 @@
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
+ license agreements. See the NOTICE file distributed with this work for additional
+ information regarding copyright ownership. The ASF licenses this file to
+ you under the Apache License, Version 2.0 (the "License"); you may not use
+ this file except in compliance with the License. You may obtain a copy of
+ the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
+ by applicable law or agreed to in writing, software distributed under the
+ License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
+ OF ANY KIND, either express or implied. See the License for the specific
+ language governing permissions and limitations under the License. -->
+<project
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd" xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent> <!-- this module declares no versions itself; dependency and plugin versions must come from crunch-parent -->
+ <groupId>org.apache.crunch</groupId>
+ <artifactId>crunch-parent</artifactId>
+ <version>0.4.0-incubating-SNAPSHOT</version>
+ </parent>
+
+ <artifactId>crunch-contrib</artifactId>
+ <name>Apache Crunch Contrib</name>
+
+ <dependencies>
+
+ <dependency>
+ <groupId>org.apache.crunch</groupId>
+ <artifactId>crunch</artifactId>
+ </dependency>
+
+ <dependency>
+ <groupId>org.apache.crunch</groupId>
+ <artifactId>crunch-test</artifactId>
+ <scope>test</scope>
+ </dependency>
+
+ <dependency>
+ <groupId>commons-httpclient</groupId>
+ <artifactId>commons-httpclient</artifactId>
+ <scope>test</scope> <!-- only needed for LocalJobRunner -->
+ </dependency>
+
+ <dependency>
+ <groupId>org.apache.hadoop</groupId>
+ <artifactId>hadoop-client</artifactId>
+ <scope>provided</scope> <!-- provided scope: available at compile time, not packaged with this module -->
+ </dependency>
+
+ </dependencies>
+
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.codehaus.mojo</groupId>
+ <artifactId>build-helper-maven-plugin</artifactId> <!-- NOTE(review): no configuration here; presumably configured in the parent pom (e.g. to add the src/it sources) - confirm -->
+ </plugin>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-failsafe-plugin</artifactId> <!-- NOTE(review): presumably what runs the *IT integration tests; its configuration is not in this file - confirm -->
+ </plugin>
+ </plugins>
+ </build>
+</project>
http://git-wip-us.apache.org/repos/asf/incubator-crunch/blob/ad90b151/crunch-contrib/src/it/java/org/apache/crunch/contrib/bloomfilter/BloomFiltersIT.java
----------------------------------------------------------------------
diff --git a/crunch-contrib/src/it/java/org/apache/crunch/contrib/bloomfilter/BloomFiltersIT.java b/crunch-contrib/src/it/java/org/apache/crunch/contrib/bloomfilter/BloomFiltersIT.java
new file mode 100644
index 0000000..d91e07f
--- /dev/null
+++ b/crunch-contrib/src/it/java/org/apache/crunch/contrib/bloomfilter/BloomFiltersIT.java
@@ -0,0 +1,61 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.contrib.bloomfilter;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+import java.io.IOException;
+import java.io.Serializable;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.commons.lang.StringUtils;
+import org.apache.crunch.test.CrunchTestSupport;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.util.bloom.BloomFilter;
+import org.apache.hadoop.util.bloom.Key;
+import org.junit.Test;
+
+public class BloomFiltersIT extends CrunchTestSupport implements Serializable { // integration test: builds a Bloom filter from shakes.txt and probes it for two words
+ // NOTE(review): Serializable is presumably required because the anonymous BloomFilterFn below holds a reference to this test instance - confirm.
+ @Test
+ public void testFilterCreation() throws IOException {
+ String inputPath = tempDir.copyResourceFileName("shakes.txt"); // path string for the shakes.txt test resource, obtained via the tempDir helper
+ BloomFilterFn<String> filterFn = new BloomFilterFn<String>() {
+ @Override
+ public Collection<Key> generateKeys(String input) { // emits one Key per space-separated token of the input string
+ List<String> parts = Arrays.asList(StringUtils.split(input, " "));
+ Collection<Key> keys = new HashSet<Key>();
+ for (String stringpart : parts) {
+ keys.add(new Key(stringpart.getBytes())); // NOTE(review): getBytes() uses the platform default charset; the probes in the assertions below do the same, so both sides stay consistent
+ }
+ return keys;
+ }
+ };
+ Map<String, BloomFilter> filterValues = BloomFilterFactory.createFilter(new Path(inputPath), filterFn).getValue(); // the assertions below expect exactly one entry, keyed "shakes.txt"
+ assertEquals(1, filterValues.size());
+ BloomFilter filter = filterValues.get("shakes.txt");
+ assertTrue(filter.membershipTest(new Key("Mcbeth".getBytes()))); // NOTE(review): "Mcbeth" is presumably a token spelled this way in shakes.txt - confirm; a Bloom filter can return false positives, so a passing assertTrue alone does not prove the word is present
+ assertTrue(filter.membershipTest(new Key("apples".getBytes())));
+ }
+
+}