You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pig.apache.org by ga...@apache.org on 2008/10/13 18:26:21 UTC

svn commit: r704154 - in /incubator/pig/trunk: ./ contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/util/apachelogparser/ contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/evaluation/util/apachelogparser/

Author: gates
Date: Mon Oct 13 09:26:21 2008
New Revision: 704154

URL: http://svn.apache.org/viewvc?rev=704154&view=rev
Log:
 PIG-488: Added SearchTermExtractor, a piggybank eval func that, for many search engines, recognizes the search term in the URL and returns it to the caller.

Added:
    incubator/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/util/apachelogparser/SearchTermExtractor.java
    incubator/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/evaluation/util/apachelogparser/TestSearchTermExtractor.java
Modified:
    incubator/pig/trunk/CHANGES.txt

Modified: incubator/pig/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/incubator/pig/trunk/CHANGES.txt?rev=704154&r1=704153&r2=704154&view=diff
==============================================================================
--- incubator/pig/trunk/CHANGES.txt (original)
+++ incubator/pig/trunk/CHANGES.txt Mon Oct 13 09:26:21 2008
@@ -367,3 +367,7 @@
 
     PIG-487: Added HostExtractor, a piggybank eval func that, given a URL,
 	determines the host (spackest via gates).
+
+	PIG-488: Added SearchTermExtractor, a piggybank eval func that, for many
+	search engines, recognizes the search term in the URL and returns it to the
+	caller (spackest via gates).

Added: incubator/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/util/apachelogparser/SearchTermExtractor.java
URL: http://svn.apache.org/viewvc/incubator/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/util/apachelogparser/SearchTermExtractor.java?rev=704154&view=auto
==============================================================================
--- incubator/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/util/apachelogparser/SearchTermExtractor.java (added)
+++ incubator/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/util/apachelogparser/SearchTermExtractor.java Mon Oct 13 09:26:21 2008
@@ -0,0 +1,191 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the
+ * NOTICE file distributed with this work for additional information regarding copyright ownership. The ASF
+ * licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file
+ * except in compliance with the License. You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software distributed under the License is
+ * distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and limitations under the License.
+ */
+
+package org.apache.pig.piggybank.evaluation.util.apachelogparser;
+
+import java.io.UnsupportedEncodingException;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.net.URLDecoder;
+import java.util.HashMap;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.pig.EvalFunc;
+import org.apache.pig.data.DataAtom;
+import org.apache.pig.data.Tuple;
+
+
+/**
+ * SearchTermExtractor takes a url string and extracts the search terms. For example, given
+ * 
+ * http://www.google.com/search?hl=en&safe=active&rls=GGLG,GGLG:2005-24,GGLG:en&q=purpose+of+life&btnG=Search
+ * 
+ * then
+ * 
+ * purpose of life
+ * 
+ * would be extracted.
+ * 
+ * From pig latin, usage looks something like
+ * 
+ * searchTerm = FOREACH row GENERATE
+ * org.apache.pig.piggybank.evaluation.util.apachelogparser.SearchTermExtractor(referer);
+ * 
+ * Supported search engines include alltheweb.com, altavista.com, aolsearch.aol.com, arianna.libero.it,
+ * as.starware.com, ask.com, blogs.icerocket.com, blueyonder.co.uk, busca.orange.es, buscador.lycos.es,
+ * buscador.terra.es, buscar.ozu.es, categorico.it, cerca.lycos.it, cuil.com, excite.it, godado.com,
+ * godado.it, gps.virgin.net, hotbot.com, ilmotore.com, it.altavista.com, ithaki.net, libero.it, lycos.es,
+ * lycos.it, mamma.com, megasearching.net, mirago.co.uk, netscape.com, ozu.es, ricerca.alice.it,
+ * search.aol.co.uk, search.bbc.co.uk, search.conduit.com, search.icq.com, search.live.com,
+ * search.lycos.co.uk, search.lycos.com, search.msn.co.uk, search.msn.com, search.myway.com,
+ * search.mywebsearch.com, search.ntlworld.com, search.orange.co.uk, search.sweetim.com,
+ * search.virginmedia.com, simpatico.ws, soso.com, suche.fireball.de, suche.web.de, terra.es, tesco.net,
+ * thespider.it, tiscali.co.uk, uk.altavista.com, uk.ask.com
+ * 
+ * Thanks to Spiros Denaxas for his URI::ParseSearchString, which is the basis for the lookups.
+ */
+public class SearchTermExtractor extends EvalFunc<DataAtom> {
+    // A Pattern is immutable and safe to share; a Matcher is not, so exec() creates its own.
+    private static final Pattern TERM_PATTERN =
+        Pattern.compile("\\b(?:q|buscar|key|qry|qs|query|s|searchfor|su|w)=([^&]+)");
+    private static final Pattern P_TERM_PATTERN = Pattern.compile("\\bp=([^&]+)");
+
+    /** URL-decodes a raw term as UTF-8; returns it unchanged when it cannot be decoded. */
+    private String myDecode(String string) {
+        try {
+            return URLDecoder.decode(string, "UTF-8");
+        } catch (UnsupportedEncodingException e) {
+            e.printStackTrace();
+        } catch (IllegalArgumentException e) {
+            // malformed %-escape in the URL: fall through and hand back the raw text
+        }
+        return string;
+    }
+
+    /** Hosts whose search term is carried in the query string; only the keys are used. */
+    private static final HashMap<String, Boolean> HOSTS = new HashMap<String, Boolean>();
+    static {
+        HOSTS.put("alltheweb.com", true);
+        HOSTS.put("altavista.com", true);
+        HOSTS.put("aolsearch.aol.com", true);
+        HOSTS.put("arianna.libero.it", true);
+        HOSTS.put("as.starware.com", true);
+        HOSTS.put("ask.com", true);
+        HOSTS.put("blogs.icerocket.com", true);
+        HOSTS.put("blueyonder.co.uk", true);
+        HOSTS.put("busca.orange.es", true);
+        HOSTS.put("buscador.lycos.es", true);
+        HOSTS.put("buscador.terra.es", true);
+        HOSTS.put("buscar.ozu.es", true);
+        HOSTS.put("categorico.it", true);
+        HOSTS.put("cerca.lycos.it", true);
+        HOSTS.put("cuil.com", true);
+        HOSTS.put("excite.it", true);
+        HOSTS.put("godado.com", true);
+        HOSTS.put("godado.it", true);
+        HOSTS.put("gps.virgin.net", true);
+        HOSTS.put("hotbot.com", true);
+        HOSTS.put("ilmotore.com", true);
+        HOSTS.put("it.altavista.com", true);
+        HOSTS.put("ithaki.net", true);
+        HOSTS.put("libero.it", true);
+        HOSTS.put("lycos.es", true);
+        HOSTS.put("lycos.it", true);
+        HOSTS.put("mamma.com", true);
+        HOSTS.put("megasearching.net", true);
+        HOSTS.put("mirago.co.uk", true);
+        HOSTS.put("netscape.com", true);
+        HOSTS.put("ozu.es", true);
+        HOSTS.put("ricerca.alice.it", true);
+        HOSTS.put("search.aol.co.uk", true);
+        HOSTS.put("search.bbc.co.uk", true);
+        HOSTS.put("search.conduit.com", true);
+        HOSTS.put("search.icq.com", true);
+        HOSTS.put("search.live.com", true);
+        HOSTS.put("search.lycos.co.uk", true);
+        HOSTS.put("search.lycos.com", true);
+        HOSTS.put("search.msn.co.uk", true);
+        HOSTS.put("search.msn.com", true);
+        HOSTS.put("search.myway.com", true);
+        HOSTS.put("search.mywebsearch.com", true);
+        HOSTS.put("search.ntlworld.com", true);
+        HOSTS.put("search.orange.co.uk", true);
+        HOSTS.put("search.sweetim.com", true);
+        HOSTS.put("search.virginmedia.com", true);
+        HOSTS.put("simpatico.ws", true);
+        HOSTS.put("soso.com", true);
+        HOSTS.put("suche.fireball.de", true);
+        HOSTS.put("suche.web.de", true);
+        HOSTS.put("terra.es", true);
+        HOSTS.put("tesco.net", true);
+        HOSTS.put("thespider.it", true);
+        HOSTS.put("tiscali.co.uk", true);
+        HOSTS.put("uk.altavista.com", true);
+        HOSTS.put("uk.ask.com", true);
+    }
+
+    /**
+     * Extracts the search terms from the URL in field 0 of input and stores them in output.
+     * output is left untouched when the URL is null or malformed, the host is not a known
+     * search engine, or no search term can be found.
+     */
+    @Override
+    public void exec(Tuple input, DataAtom output) {
+        String url = input.getAtomField(0).strval();
+        if (url == null)
+            return;
+
+        URL urlObject;
+        try {
+            urlObject = new URL(url);
+        } catch (MalformedURLException e) {
+            e.printStackTrace();
+            return;
+        }
+
+        String host = urlObject.getHost();
+        if (host == null)
+            return;
+
+        // lowercase before stripping "www." so that e.g. WWW.GOOGLE.COM is recognized too
+        host = host.toLowerCase().replaceFirst("^www\\.", "");
+
+        if (HOSTS.containsKey(host) || host.contains("google.co") || host.contains("search.yahoo")) {
+            // getQuery() returns null for a URL without a query string
+            String queryString = urlObject.getQuery();
+            if (queryString == null)
+                return;
+
+            // p= is only a fallback: at least once, a p= comes before a q= when p= isn't
+            // tied to the search terms
+            Matcher matcher = TERM_PATTERN.matcher(queryString);
+            if (!matcher.find()) {
+                matcher = P_TERM_PATTERN.matcher(queryString);
+                if (!matcher.find())
+                    return;
+            }
+            output.setValue(myDecode(matcher.group(1)));
+            return;
+        }
+
+        // feedster and technorati carry the terms in the path: /search/<terms>
+        if (host.endsWith("feedster.com") || host.endsWith("technorati.com")) {
+            String path = urlObject.getPath();
+            if (path == null)
+                return;
+
+            output.setValue(myDecode(path.replaceFirst("^/search/", "")));
+        }
+    }
+}

Added: incubator/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/evaluation/util/apachelogparser/TestSearchTermExtractor.java
URL: http://svn.apache.org/viewvc/incubator/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/evaluation/util/apachelogparser/TestSearchTermExtractor.java?rev=704154&view=auto
==============================================================================
--- incubator/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/evaluation/util/apachelogparser/TestSearchTermExtractor.java (added)
+++ incubator/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/evaluation/util/apachelogparser/TestSearchTermExtractor.java Mon Oct 13 09:26:21 2008
@@ -0,0 +1,153 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the
+ * NOTICE file distributed with this work for additional information regarding copyright ownership. The ASF
+ * licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file
+ * except in compliance with the License. You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software distributed under the License is
+ * distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and limitations under the License.
+ */
+
+package org.apache.pig.piggybank.test.evaluation.util.apachelogparser;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+
+import junit.framework.TestCase;
+
+import org.apache.pig.data.DataAtom;
+import org.apache.pig.data.Datum;
+import org.apache.pig.data.Tuple;
+import org.apache.pig.piggybank.evaluation.util.apachelogparser.SearchEngineExtractor;
+import org.apache.pig.piggybank.evaluation.util.apachelogparser.SearchTermExtractor;
+import org.junit.Test;
+
+/** Checks SearchTermExtractor against referer URLs from the search engines it supports. */
+public class TestSearchTermExtractor extends TestCase {
+    // maps a URL to the search terms expected from it; null means nothing should be extracted
+    private static HashMap<String, String> tests = new HashMap<String, String>();
+    static {
+        tests.put("http://www.google.com/search?hl=en&q=a+simple+test&btnG=Google+Search", "a simple test");
+        tests.put("http://www.google.co.uk/search?hl=en&q=a+simple+test&btnG=Google+Search&meta=", "a simple test");
+        tests.put("http://www.google.co.jp/search?hl=ja&q=a+simple+test&btnG=Google+%E6%A4%9C%E7%B4%A2&lr=", "a simple test");
+        tests.put("http://search.msn.co.uk/results.aspx?q=a+simple+test&geovar=56&FORM=REDIR", "a simple test");
+        tests.put("http://search.msn.com/results.aspx?q=a+simple+test&geovar=56&FORM=REDIR", "a simple test");
+        tests.put("http://www.altavista.com/web/results?itag=ody&q=a+simple+test&kgs=1&kls=0", "a simple test");
+        tests.put("http://uk.altavista.com/web/results?itag=ody&q=a+simple+test&kgs=1&kls=0", "a simple test");
+        tests.put("http://www.blueyonder.co.uk/blueyonder/searches/search.jsp?q=a+simple+test&cr=&sitesearch=&x=0&y=0", "a simple test");
+        tests.put("http://www.alltheweb.com/search?cat=web&cs=iso88591&q=a+simple+test&rys=0&itag=crv&_sb_lang=pref", "a simple test");
+        tests.put("http://search.lycos.com/?query=a+simple+test&x=0&y=0", "a simple test");
+        tests.put("http://search.lycos.co.uk/cgi-bin/pursuit?query=a+simple+test&enc=utf-8&cat=slim_loc&sc=blue", "a simple test");
+        tests.put("http://www.hotbot.com/index.php?query=a+simple+test&ps=&loc=searchbox&tab=web&mode=search&currProv=msn", "a simple test");
+        tests.put("http://search.yahoo.com/search?p=a+simple+test&fr=FP-tab-web-t400&toggle=1&cop=&ei=UTF-8", "a simple test");
+        tests.put("http://uk.search.yahoo.com/search?p=a+simple+test&fr=FP-tab-web-t340&ei=UTF-8&meta=vc%3D", "a simple test");
+        tests.put("http://uk.ask.com/web?q=a+simple+test&qsrc=0&o=0&l=dir&dm=all", "a simple test");
+        tests.put("http://www.mirago.co.uk/scripts/qhandler.aspx?qry=a+simple+test&x=0&y=0", "a simple test");
+        tests.put("http://www.netscape.com/search/?s=a+simple+test", "a simple test");
+        tests.put("http://search.aol.co.uk/web?invocationType=ns_uk&query=a%20simple%20test", "a simple test");
+        tests.put("http://www.tiscali.co.uk/search/results.php?section=&from=&query=a+simple+test", "a simple test");
+        tests.put("http://www.mamma.com/Mamma?utfout=1&qtype=0&query=a+simple+test&Submit=%C2%A0%C2%A0Search%C2%A0%C2%A0", "a simple test");
+        tests.put("http://blogs.icerocket.com/search?q=a+simple+test", "a simple test");
+        tests.put("http://blogsearch.google.com/blogsearch?hl=en&ie=UTF-8&q=a+simple+test&btnG=Search+Blogs", "a simple test");
+        tests.put("http://suche.fireball.de/cgi-bin/pursuit?query=a+simple+test&x=0&y=0&cat=fb_loc&enc=utf-8", "a simple test");
+        tests.put("http://suche.web.de/search/web/?allparams=&smode=&su=a+simple+test&webRb=de", "a simple test");
+        tests.put("http://www.technorati.com/search/a%20simple%20test", "a simple test");
+        tests.put("http://www.feedster.com/search/a%20simple%20test", "a simple test");
+        tests.put("http://www.tesco.net/google/searchresults.asp?q=a+simple+test&cr=", "a simple test");
+        tests.put("http://gps.virgin.net/search/sitesearch?submit.x=1&start=0&format=1&num=10&restrict=site&sitefilter=site%2Fsite_filter.hts&siteresults=site%2Fsite_results.hts&sitescorethreshold=28&q=a+simple+test&scope=UK&x=0&y=0",
+            "a simple test");
+        tests.put("http://search.bbc.co.uk/cgi-bin/search/results.pl?tab=web&go=homepage&q=a+simple+test&Search.x=0&Search.y=0&Search=Search&scope=all",
+            "a simple test");
+        tests.put("http://search.live.com/results.aspx?q=a+simple+test&mkt=en-us&FORM=LVSP&go.x=0&go.y=0&go=Search", "a simple test");
+        tests.put("http://search.mywebsearch.com/mywebsearch/AJmain.jhtml?searchfor=a+simple+test", "a simple test");
+        tests.put("http://www.megasearching.net/m/search.aspx?s=a+simple+test&mkt=&orig=1", "a simple test");
+        tests.put("http://www.blueyonder.co.uk/blueyonder/searches/search.jsp?q=a+simple+test&cr=&sitesearch=&x=0&y=0", "a simple test");
+        tests.put("http://search.ntlworld.com/ntlworld/search.php?q=a+simple+test&cr=&x=0&y=0", "a simple test");
+        tests.put("http://search.orange.co.uk/all?p=_searchbox&pt=resultgo&brand=ouk&tab=web&q=a+simple+test", "a simple test");
+        tests.put("http://search.virginmedia.com/results/index.php?channel=other&q=a+simple+test&cr=&x=0&y=0", "a simple test");
+        tests.put("http://as.starware.com/dp/search?src_id=305&product=unknown&qry=a+simple+test&z=Find+It", "a simple test");
+        tests.put("http://aolsearch.aol.com/aol/search?invocationType=topsearchbox.webhome&query=a+simple+test", "a simple test");
+        tests.put("http://www.ask.com/web?q=a+simple+test&qsrc=0&o=0&l=dir", "a simple test");
+        tests.put("http://buscador.terra.es/Default.aspx?source=Search&ca=s&query=a%20simple%20test", "a simple test");
+        tests.put("http://busca.orange.es/search?origen=home&destino=web&buscar=a+simple+test", "a simple test");
+        tests.put("http://search.sweetim.com/search.asp?ln=en&q=a%20simple%20test", "a simple test");
+        tests.put("http://search.conduit.com/Results.aspx?q=a+simple+test&hl=en&SelfSearch=1&SearchSourceOrigin=1&ctid=WEBSITE", "a simple test");
+        tests.put("http://buscar.ozu.es/index.php?etq=web&q=a+simple+test", "a simple test");
+        tests.put("http://buscador.lycos.es/cgi-bin/pursuit?query=a+simple+test&websearchCat=loc&cat=loc&SITE=de&enc=utf-8&ref=sboxlink", "a simple test");
+        tests.put("http://search.icq.com/search/results.php?q=a+simple+test&ch_id=st&search_mode=web", "a simple test");
+        tests.put("http://search.yahoo.co.jp/search?ei=UTF-8&fr=sfp_as&p=a+simple+test&meta=vc%3D", "a simple test");
+        tests.put("http://www.soso.com/q?pid=s.idx&w=a+simple+test", "a simple test");
+        tests.put("http://search.myway.com/search/AJmain.jhtml?searchfor=a+simple+test", "a simple test");
+        tests.put("http://www.ilmotore.com/newsearch/?query=a+simple+test&where=web", "a simple test");
+        tests.put("http://www.ithaki.net/ricerca.cgi?where=italia&query=a+simple+test", "a simple test");
+        tests.put("http://ricerca.alice.it/ricerca?f=hpn&qs=a+simple+test", "a simple test");
+        tests.put("http://it.search.yahoo.com/search?p=a+simple+test&fr=yfp-t-501&ei=UTF-8&rd=r1", "a simple test");
+        tests.put("http://www.excite.it/search/web/results?l=&q=a+simple+test", "a simple test");
+        tests.put("http://it.altavista.com/web/results?itag=ody&q=a+simple+test&kgs=1&kls=0", "a simple test");
+        tests.put("http://cerca.lycos.it/cgi-bin/pursuit?query=a+simple+test&cat=web", "a simple test");
+        tests.put("http://arianna.libero.it/search/abin/integrata.cgi?query=a+simple+test&regione=8&x=0&y=0", "a simple test");
+        tests.put("http://www.thespider.it/dir/index.php?q=a+simple+test&search-btn.x=0&search-btn.y=0", "a simple test");
+        tests.put("http://godado.it/engine.php?l=it&key=a+simple+test&x=0&y=0", "a simple test");
+        tests.put("http://www.simpatico.ws/cgi-bin/links/search.cgi?query=a+simple+test&Vai=Go", "a simple test");
+        tests.put("http://www.categorico.it/ricerca.html?domains=Categorico.it&q=a+simple+test&sa=Cerca+con+Google&sitesearch=&client=pub-0499722654836507&forid=1&channel=7983145815&ie=ISO-8859-1&oe=ISO-8859-1&cof=GALT%3A%23008000%3BGL%3A1%3BDIV%3A%23336699%3BVLC%3A663399%3BAH%3Acenter%3BBGC%3AFFFFFF%3BLBGC%3A336699%3BALC%3A0000FF%3BLC%3A0000FF%3BT%3A000000%3BGFNT%3A0000FF%3BGIMP%3A0000FF%3BFORID%3A11&hl=it",
+            "a simple test");
+        tests.put("http://www.cuil.com/search?q=a+simple+test", "a simple test");
+        tests.put("http://www.google.com/search?hl=en&lr=&q=a+more%21+complex_+search%24&btnG=Search", "a more! complex_ search$");
+        tests.put("http://www.google.co.uk/search?hl=en&q=a+more%21+complex_+search%24&btnG=Google+Search&meta=", "a more! complex_ search$");
+        tests.put("http://www.google.co.jp/search?hl=ja&q=a+more%21+complex_+search%24&btnG=Google+%E6%A4%9C%E7%B4%A2&lr=", "a more! complex_ search$");
+        tests.put("http://search.msn.com/results.aspx?q=a+more%21+complex_+search%24&FORM=QBHP", "a more! complex_ search$");
+        tests.put("http://search.msn.co.uk/results.aspx?q=a+more%21+complex_+search%24&FORM=MSNH&srch_type=0&cp=65001", "a more! complex_ search$");
+        tests.put("http://www.altavista.com/web/results?itag=ody&q=a+more%21+complex_+search%24&kgs=1&kls=0", "a more! complex_ search$");
+        tests.put("http://uk.altavista.com/web/results?itag=ody&q=a+more%21+complex_+search%24&kgs=1&kls=0", "a more! complex_ search$");
+        tests.put("http://www.blueyonder.co.uk/blueyonder/searches/search.jsp?q=a+more%21+complex_+search%24&cr=&sitesearch=&x=0&y=0",
+            "a more! complex_ search$");
+        tests.put("http://www.alltheweb.com/search?cat=web&cs=iso88591&q=a+more%21+complex_+search%24&rys=0&itag=crv&_sb_lang=pref", "a more! complex_ search$");
+        tests.put("http://search.lycos.com/?query=a+more%21+complex_+search%24&x=0&y=0", "a more! complex_ search$");
+        tests.put("http://search.lycos.co.uk/cgi-bin/pursuit?query=a+more%21+complex_+search%24&enc=utf-8&cat=slim_loc&sc=blue", "a more! complex_ search$");
+        tests.put("http://www.hotbot.com/index.php?query=a+more%21+complex_+search%24&ps=&loc=searchbox&tab=web&mode=search&currProv=msn",
+            "a more! complex_ search$");
+        tests.put("http://search.yahoo.com/search?p=a+more%21+complex_+search%24&fr=FP-tab-web-t400&toggle=1&cop=&ei=UTF-8", "a more! complex_ search$");
+        tests.put("http://uk.search.yahoo.com/search?p=a+more%21+complex_+search%24&fr=FP-tab-web-t340&ei=UTF-8&meta=vc%3D", "a more! complex_ search$");
+        tests.put("http://uk.ask.com/web?q=a+more%21+complex_+search%24&qsrc=0&o=0&l=dir&dm=all", "a more! complex_ search$");
+        tests.put("http://www.mirago.co.uk/scripts/qhandler.aspx?qry=a+more%21+complex_+search%24&x=0&y=0", "a more! complex_ search$");
+        tests.put("http://www.netscape.com/search/?s=a+more%21+complex_+search%24", "a more! complex_ search$");
+        tests.put("http://search.aol.co.uk/web?query=a+more%21+complex_+search%24&x=0&y=0&isinit=true&restrict=wholeweb", "a more! complex_ search$");
+        tests.put("http://www.tiscali.co.uk/search/results.php?section=&from=&query=a+more%21+complex_+search%24", "a more! complex_ search$");
+        tests.put("http://www.mamma.com/Mamma?utfout=1&qtype=0&query=a+more%21+complex_+search%24&Submit=%C2%A0%C2%A0Search%C2%A0%C2%A0",
+            "a more! complex_ search$");
+        tests.put("dud", null);
+    }
+
+    /** The class under test must be constructible with its no-arg constructor. */
+    @Test
+    public void testInstantiation() {
+        assertNotNull(new SearchTermExtractor());
+    }
+
+    /** Runs every URL in {@link #tests} through exec and compares the extracted terms. */
+    @Test
+    public void testTests() {
+        SearchTermExtractor searchTermExtractor = new SearchTermExtractor();
+        int testCount = 0;
+        for (String key : tests.keySet()) {
+            String expected = tests.get(key);
+
+            ArrayList<Datum> input = new ArrayList<Datum>();
+            input.add(new DataAtom(key));
+
+            DataAtom output = new DataAtom();
+            searchTermExtractor.exec(new Tuple(input), output);
+            if (expected == null) {
+                // nothing should be extracted, so the fresh atom must still be empty
+                assertEquals(0, output.strval().length());
+            } else {
+                assertEquals(expected, output.toString());
+            }
+            testCount++;
+        }
+        assertEquals(tests.size(), testCount);
+    }
+}