You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pig.apache.org by ga...@apache.org on 2008/10/13 18:20:49 UTC
svn commit: r704151 - in /incubator/pig/trunk: ./
contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/util/apachelogparser/
contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/evaluation/util/apachelogparser/
Author: gates
Date: Mon Oct 13 09:20:49 2008
New Revision: 704151
URL: http://svn.apache.org/viewvc?rev=704151&view=rev
Log:
PIG-487: Added HostExtractor, a piggybank eval func that, given a URL, determines the host.
Added:
incubator/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/util/apachelogparser/HostExtractor.java
incubator/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/evaluation/util/apachelogparser/TestHostExtractor.java
Modified:
incubator/pig/trunk/CHANGES.txt
Modified: incubator/pig/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/incubator/pig/trunk/CHANGES.txt?rev=704151&r1=704150&r2=704151&view=diff
==============================================================================
--- incubator/pig/trunk/CHANGES.txt (original)
+++ incubator/pig/trunk/CHANGES.txt Mon Oct 13 09:20:49 2008
@@ -364,3 +364,6 @@
PIG-486: Added SearchEngineExtractor, a piggybank eval func that
recognizes a set of the most common search engines in a URL and extracts
the name of the search engine (spackest via gates).
+
+ PIG-487: Added HostExtractor, a piggybank eval func that, given a URL,
+ determines the host (spackest via gates).
Added: incubator/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/util/apachelogparser/HostExtractor.java
URL: http://svn.apache.org/viewvc/incubator/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/util/apachelogparser/HostExtractor.java?rev=704151&view=auto
==============================================================================
--- incubator/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/util/apachelogparser/HostExtractor.java (added)
+++ incubator/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/util/apachelogparser/HostExtractor.java Mon Oct 13 09:20:49 2008
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the
+ * NOTICE file distributed with this work for additional information regarding copyright ownership. The ASF
+ * licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file
+ * except in compliance with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License is
+ * distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and limitations under the License.
+ */
+
+/*
+ * HostExtractor takes a URL and returns its host in lower case. For example,
+ *
+ * http://sports.espn.go.com/mlb/recap?gameId=281009122
+ *
+ * leads to
+ *
+ * sports.espn.go.com
+ *
+ * Pig latin usage looks like
+ *
+ * host = FOREACH row GENERATE
+ * org.apache.pig.piggybank.evaluation.util.apachelogparser.HostExtractor(referer);
+ */
+
+package org.apache.pig.piggybank.evaluation.util.apachelogparser;
+
+
+import java.net.URL;
+
+import org.apache.pig.EvalFunc;
+import org.apache.pig.data.DataAtom;
+import org.apache.pig.data.Tuple;
+
+/** Pig eval function that sets the output atom to the lower-cased host of the URL held in input field 0. */
+public class HostExtractor extends EvalFunc<DataAtom> {
+    /** Leaves {@code output} untouched when the field's string value is null or is not a parseable URL. */
+    @Override
+    public void exec(Tuple input, DataAtom output) {
+        String string = input.getAtomField(0).strval();
+        if (string == null) {
+            return;
+        }
+        try {
+            // Fixed locale: with a Turkish default locale, toLowerCase() would turn 'I' into a dotless i.
+            output.setValue(new URL(string).getHost().toLowerCase(java.util.Locale.ENGLISH));
+        } catch (java.net.MalformedURLException ignored) {
+            // Deliberate best effort: a value that is not a URL yields no host instead of failing the job.
+        }
+    }
+}
Added: incubator/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/evaluation/util/apachelogparser/TestHostExtractor.java
URL: http://svn.apache.org/viewvc/incubator/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/evaluation/util/apachelogparser/TestHostExtractor.java?rev=704151&view=auto
==============================================================================
--- incubator/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/evaluation/util/apachelogparser/TestHostExtractor.java (added)
+++ incubator/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/evaluation/util/apachelogparser/TestHostExtractor.java Mon Oct 13 09:20:49 2008
@@ -0,0 +1,63 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the
+ * NOTICE file distributed with this work for additional information regarding copyright ownership. The ASF
+ * licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file
+ * except in compliance with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License is
+ * distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and limitations under the License.
+ */
+
+package org.apache.pig.piggybank.test.evaluation.util.apachelogparser;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+
+import junit.framework.TestCase;
+
+import org.apache.pig.data.DataAtom;
+import org.apache.pig.data.Datum;
+import org.apache.pig.data.Tuple;
+import org.apache.pig.piggybank.evaluation.util.apachelogparser.HostExtractor;
+import org.apache.pig.piggybank.evaluation.util.apachelogparser.SearchEngineExtractor;
+import org.junit.Test;
+
+/** Tests HostExtractor against URLs with known hosts plus one input that is not a URL. */
+public class TestHostExtractor extends TestCase {
+    /** Maps each input string to its expected host; a null value means no output is expected. */
+    private static final HashMap<String, String> tests = new HashMap<String, String>();
+    static {
+        tests.put("http://sports.espn.go.com/mlb/recap?gameId=281009122", "sports.espn.go.com");
+        tests.put("http://www.google.com/search?hl=en&safe=active&rls=GGLG,GGLG:2005-24,GGLG:en&q=purpose+of+life&btnG=Search", "www.google.com");
+        tests.put("http://search.msn.com/results.aspx?q=a+simple+test&geovar=56&FORM=REDIR", "search.msn.com");
+        tests.put("http://www.altavista.com/web/results?itag=ody&q=a+simple+test&kgs=1&kls=0", "www.altavista.com");
+        tests.put("dud", null);
+    }
+
+    /** Checks that the class under test (HostExtractor, not a sibling extractor) can be constructed. */
+    @Test
+    public void testInstantiation() {
+        assertNotNull(new HostExtractor());
+    }
+
+    /** Runs every table entry through exec and compares the output atom's text with the expectation. */
+    @Test
+    public void testTests() {
+        HostExtractor hostExtractor = new HostExtractor();
+        int testCount = 0;
+        for (String key : tests.keySet()) {
+            String expected = tests.get(key);
+            ArrayList<Datum> input = new ArrayList<Datum>();
+            input.add(new DataAtom(key));
+            DataAtom output = new DataAtom();
+            hostExtractor.exec(new Tuple(input), output);
+            // "No output" is checked as empty text; the input is the message so a failure names its URL.
+            assertEquals(key, expected == null ? "" : expected, output.toString());
+            testCount++;
+        }
+        assertEquals(tests.size(), testCount);
+    }
+}