You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pig.apache.org by ga...@apache.org on 2008/10/13 18:20:49 UTC

svn commit: r704151 - in /incubator/pig/trunk: ./ contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/util/apachelogparser/ contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/evaluation/util/apachelogparser/

Author: gates
Date: Mon Oct 13 09:20:49 2008
New Revision: 704151

URL: http://svn.apache.org/viewvc?rev=704151&view=rev
Log:
 PIG-487: Added HostExtractor, a piggybank eval func that, given a URL, determines the host.

Added:
    incubator/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/util/apachelogparser/HostExtractor.java
    incubator/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/evaluation/util/apachelogparser/TestHostExtractor.java
Modified:
    incubator/pig/trunk/CHANGES.txt

Modified: incubator/pig/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/incubator/pig/trunk/CHANGES.txt?rev=704151&r1=704150&r2=704151&view=diff
==============================================================================
--- incubator/pig/trunk/CHANGES.txt (original)
+++ incubator/pig/trunk/CHANGES.txt Mon Oct 13 09:20:49 2008
@@ -364,3 +364,6 @@
     PIG-486: Added SearchEngineExtractor, a piggybank eval func that
 	recognizes a set of the most common search engines in a URL and extracts
 	the name of the search engine (spackest via gates).
+
+    PIG-487: Added HostExtractor, a piggybank eval func that, given a URL,
+	determines the host (spackest via gates).

Added: incubator/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/util/apachelogparser/HostExtractor.java
URL: http://svn.apache.org/viewvc/incubator/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/util/apachelogparser/HostExtractor.java?rev=704151&view=auto
==============================================================================
--- incubator/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/util/apachelogparser/HostExtractor.java (added)
+++ incubator/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/util/apachelogparser/HostExtractor.java Mon Oct 13 09:20:49 2008
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the
+ * NOTICE file distributed with this work for additional information regarding copyright ownership. The ASF
+ * licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file
+ * except in compliance with the License. You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software distributed under the License is
+ * distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and limitations under the License.
+ */
+
+/*
+ * HostExtractor takes a url and returns the host. For example,
+ * 
+ * http://sports.espn.go.com/mlb/recap?gameId=281009122
+ * 
+ * leads to
+ * 
+ * sports.espn.go.com
+ * 
+ * Pig latin usage looks like
+ * 
+ * host = FOREACH row GENERATE
+ * org.apache.pig.piggybank.evaluation.util.apachelogparser.HostExtractor(referer);
+ */
+
+package org.apache.pig.piggybank.evaluation.util.apachelogparser;
+
+
+import java.net.URL;
+
+import org.apache.pig.EvalFunc;
+import org.apache.pig.data.DataAtom;
+import org.apache.pig.data.Tuple;
+
+public class HostExtractor extends EvalFunc<DataAtom> {
+    /** Stores the lower-cased host of the URL in field 0 into output; does nothing if it is unparseable. */
+    @Override
+    public void exec(Tuple input, DataAtom output) {
+        String string = input.getAtomField(0).strval();
+        if (string == null)
+            return;
+        String host = null;
+        try {
+            // Fixed locale: the default one may lower-case 'I' to a dotless i (e.g. Turkish).
+            host = new URL(string).getHost().toLowerCase(java.util.Locale.ENGLISH);
+        } catch (Exception ignored) { // best effort: an unparseable URL simply yields no host
+        }
+        if (host != null)
+            output.setValue(host);
+    }
+}

Added: incubator/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/evaluation/util/apachelogparser/TestHostExtractor.java
URL: http://svn.apache.org/viewvc/incubator/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/evaluation/util/apachelogparser/TestHostExtractor.java?rev=704151&view=auto
==============================================================================
--- incubator/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/evaluation/util/apachelogparser/TestHostExtractor.java (added)
+++ incubator/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/evaluation/util/apachelogparser/TestHostExtractor.java Mon Oct 13 09:20:49 2008
@@ -0,0 +1,63 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the
+ * NOTICE file distributed with this work for additional information regarding copyright ownership. The ASF
+ * licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file
+ * except in compliance with the License. You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software distributed under the License is
+ * distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and limitations under the License.
+ */
+
+package org.apache.pig.piggybank.test.evaluation.util.apachelogparser;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+
+import junit.framework.TestCase;
+
+import org.apache.pig.data.DataAtom;
+import org.apache.pig.data.Datum;
+import org.apache.pig.data.Tuple;
+import org.apache.pig.piggybank.evaluation.util.apachelogparser.HostExtractor;
+import org.apache.pig.piggybank.evaluation.util.apachelogparser.SearchEngineExtractor;
+import org.junit.Test;
+
+public class TestHostExtractor extends TestCase {
+    // Maps each input string to the host exec should produce; null means the output must stay empty.
+    private static HashMap<String, String> tests = new HashMap<String, String>();
+    static {
+        tests.put("http://sports.espn.go.com/mlb/recap?gameId=281009122", "sports.espn.go.com");
+        tests.put("http://www.google.com/search?hl=en&safe=active&rls=GGLG,GGLG:2005-24,GGLG:en&q=purpose+of+life&btnG=Search", "www.google.com");
+        tests.put("http://search.msn.com/results.aspx?q=a+simple+test&geovar=56&FORM=REDIR", "search.msn.com");
+        tests.put("http://www.altavista.com/web/results?itag=ody&q=a+simple+test&kgs=1&kls=0", "www.altavista.com");
+        tests.put("dud", null);
+    }
+    /** Checks that the class under test (HostExtractor) can be constructed with no arguments. */
+    @Test
+    public void testInstantiation() {
+        assertNotNull(new HostExtractor());
+    }
+    /** Runs every entry of the table through exec and compares the output with the expected host. */
+    @Test
+    public void testTests() {
+        HostExtractor hostExtractor = new HostExtractor();
+        int testCount = 0;
+        for (String key : tests.keySet()) {
+            String expected = tests.get(key);
+            // Wrap the input string in a single-field tuple, which is what exec reads from.
+            ArrayList<Datum> input = new ArrayList<Datum>();
+            input.add(new DataAtom(key));
+            DataAtom output = new DataAtom();
+            hostExtractor.exec(new Tuple(input), output);
+            if (expected == null)
+                assertEquals(0, output.toString().length());
+            else
+                assertEquals(expected, output.toString());
+            testCount++;
+        }
+        assertEquals(tests.size(), testCount);
+    }
+}