You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by je...@apache.org on 2006/03/30 00:09:08 UTC

svn commit: r389901 - in /lucene/nutch/trunk: ./ src/plugin/lib-regex-filter/src/java/org/apache/nutch/net/ src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/ src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/ src/plugin...

Author: jerome
Date: Wed Mar 29 14:09:03 2006
New Revision: 389901

URL: http://svn.apache.org/viewcvs?rev=389901&view=rev
Log:
Refactor some plugins packages:
* urlfilter-prefix package moved from org.apache.nutch.net to org.apache.nutch.urlfilter.prefix
* urlfilter-automaton package moved from org.apache.nutch.net to org.apache.nutch.urlfilter.automaton
* urlfilter-regex package moved from org.apache.nutch.net to org.apache.nutch.urlfilter.regex
* lib-regex-filter package moved from org.apache.nutch.net to org.apache.nutch.urlfilter.api
* ontology package moved from org.apache.nutch.ontology to org.apache.nutch.ontology.jena

Added:
    lucene/nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/
    lucene/nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/
    lucene/nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexRule.java   (with props)
    lucene/nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java   (with props)
    lucene/nutch/trunk/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/
    lucene/nutch/trunk/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/
    lucene/nutch/trunk/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java   (with props)
    lucene/nutch/trunk/src/plugin/ontology/src/java/org/apache/nutch/ontology/jena/
    lucene/nutch/trunk/src/plugin/ontology/src/java/org/apache/nutch/ontology/jena/OntologyImpl.java   (with props)
    lucene/nutch/trunk/src/plugin/ontology/src/java/org/apache/nutch/ontology/jena/OwlParser.java   (with props)
    lucene/nutch/trunk/src/plugin/ontology/src/java/org/apache/nutch/ontology/jena/Parser.java   (with props)
    lucene/nutch/trunk/src/plugin/ontology/src/test/org/apache/nutch/ontology/jena/
    lucene/nutch/trunk/src/plugin/ontology/src/test/org/apache/nutch/ontology/jena/TestOntology.java   (with props)
    lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/
    lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/automaton/
    lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/automaton/AutomatonURLFilter.java   (with props)
    lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/automaton/package.html   (with props)
    lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/test/org/apache/nutch/urlfilter/
    lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/test/org/apache/nutch/urlfilter/automaton/
    lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/test/org/apache/nutch/urlfilter/automaton/TestAutomatonURLFilter.java   (with props)
    lucene/nutch/trunk/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/
    lucene/nutch/trunk/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/
    lucene/nutch/trunk/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java   (with props)
    lucene/nutch/trunk/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/package.html   (with props)
    lucene/nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/
    lucene/nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/
    lucene/nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java   (with props)
    lucene/nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/package.html   (with props)
    lucene/nutch/trunk/src/plugin/urlfilter-regex/src/test/org/apache/nutch/urlfilter/
    lucene/nutch/trunk/src/plugin/urlfilter-regex/src/test/org/apache/nutch/urlfilter/regex/
    lucene/nutch/trunk/src/plugin/urlfilter-regex/src/test/org/apache/nutch/urlfilter/regex/TestRegexURLFilter.java   (with props)
Removed:
    lucene/nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/net/
    lucene/nutch/trunk/src/plugin/lib-regex-filter/src/test/org/apache/nutch/net/
    lucene/nutch/trunk/src/plugin/ontology/src/java/org/apache/nutch/ontology/OntologyImpl.java
    lucene/nutch/trunk/src/plugin/ontology/src/java/org/apache/nutch/ontology/OwlParser.java
    lucene/nutch/trunk/src/plugin/ontology/src/java/org/apache/nutch/ontology/Parser.java
    lucene/nutch/trunk/src/plugin/ontology/src/test/org/apache/nutch/ontology/TestOntology.java
    lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/net/
    lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/test/org/apache/nutch/net/
    lucene/nutch/trunk/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/net/
    lucene/nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/net/
    lucene/nutch/trunk/src/plugin/urlfilter-regex/src/test/org/apache/nutch/net/
Modified:
    lucene/nutch/trunk/default.properties
    lucene/nutch/trunk/src/plugin/ontology/plugin.xml
    lucene/nutch/trunk/src/plugin/urlfilter-automaton/plugin.xml
    lucene/nutch/trunk/src/plugin/urlfilter-prefix/plugin.xml
    lucene/nutch/trunk/src/plugin/urlfilter-regex/plugin.xml

Modified: lucene/nutch/trunk/default.properties
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/default.properties?rev=389901&r1=389900&r2=389901&view=diff
==============================================================================
--- lucene/nutch/trunk/default.properties (original)
+++ lucene/nutch/trunk/default.properties Wed Mar 29 14:09:03 2006
@@ -61,13 +61,13 @@
 plugin.js=org.apache.nutch.parse.js*
 plugin.language=org.apache.nutch.analysis.lang*
 plugin.libhttp=org.apache.nutch.protocol.http.api*
+plugin.liburlfilter=org.apache.nutch.urlfilter.api*
 plugin.more=org.apache.nutch.indexer.more*:org.apache.nutch.searcher.more*
 plugin.mp3=org.apache.nutch.parse.mp3*
 plugin.msexcel=org.apache.nutch.parse.msexcel*
 plugin.mspowerpoint=org.apache.nutch.parse.mspowerpoint*
 plugin.msword=org.apache.nutch.parse.msword*
-# Unfortunately, ontology on core and plugin uses the same package:
-# plugin.ontology=org.apache.nutch.ontology*
+plugin.ontology.jena=org.apache.nutch.ontology.jena*
 plugin.parsems=org.apache.nutch.parse.ms*
 plugin.pdf=org.apache.nutch.parse.pdf*
 plugin.reltag=org.apache.nutch.microformats.reltag*
@@ -77,6 +77,9 @@
 plugin.swf=org.apache.nutch.parse.swf*
 plugin.text=org.apache.nutch.parse.text*
 plugin.url=org.apache.nutch.searcher.url*
+plugin.urlfilter.automaton=org.apache.nutch.urlfilter.automaton*
+plugin.urlfilter.prefix=org.apache.nutch.urlfilter.prefix*
+plugin.urlfilter.regex=org.apache.nutch.urlfilter.regex*
 plugin.zip=org.apache.nutch.parse.zip*
 
 plugins.packages=\
@@ -92,11 +95,13 @@
    ${plugin.js}:\
    ${plugin.language}:\
    ${plugin.libhttp}:\
+   ${plugin.liburlfilter}:\
    ${plugin.more}:\
    ${plugin.mp3}:\
    ${plugin.msexcel}:\
    ${plugin.mspowerpoint}:\
    ${plugin.msword}:\
+   ${plugin.ontology.jena}:\
    ${plugin.parsems}:\
    ${plugin.pdf}:\
    ${plugin.reltag}:\
@@ -106,4 +111,7 @@
    ${plugin.swf}:\
    ${plugin.text}:\
    ${plugin.url}:\
+   ${plugin.urlfilter.automaton}:\
+   ${plugin.urlfilter.prefix}:\
+   ${plugin.urlfilter.regex}:\
    ${plugin.zip}

Added: lucene/nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexRule.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexRule.java?rev=389901&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexRule.java (added)
+++ lucene/nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexRule.java Wed Mar 29 14:09:03 2006
@@ -0,0 +1,63 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.urlfilter.api;
+
+import org.apache.nutch.net.*;
+
+
+/**
+ * A generic regular expression rule.
+ *
+ * @author Jérôme Charron
+ */
+public abstract class RegexRule {
+
+  private boolean sign;
+  private String regex;
+
+  /**
+   * Constructs a new regular expression rule.
+   *
+   * @param sign specifies if this rule must filter-in or filter-out.
+   *        A <code>true</code> value means that any url matching this rule
+   *        must be accepted, a <code>false</code> value means that any url
+   *        matching this rule must be rejected.
+   * @param regex is the regular expression used for matching (see
+   *        {@link #match(String)} method).
+   */
+  protected RegexRule(boolean sign, String regex) {
+    this.sign = sign;
+    this.regex = regex;
+  }
+
+  /**
+   * Return if this rule is used for filtering-in or out.
+   *
+   * @return <code>true</code> if any url matching this rule must be accepted,
+   *         otherwise <code>false</code>.
+   */
+  protected boolean accept() { return sign; }
+  
+  /**
+   * Checks if a url matches this rule.
+   * @param url is the url to check.
+   * @return <code>true</code> if the specified url matches this rule,
+   *         otherwise <code>false</code>.
+   */
+  protected abstract boolean match(String url);
+
+}
+

Propchange: lucene/nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexRule.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java?rev=389901&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java (added)
+++ lucene/nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java Wed Mar 29 14:09:03 2006
@@ -0,0 +1,217 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.urlfilter.api;
+
+// JDK imports
+import java.io.Reader;
+import java.io.FileReader;
+import java.io.BufferedReader;
+import java.io.InputStreamReader;
+import java.io.IOException;
+import java.util.List;
+import java.util.ArrayList;
+import java.util.logging.Logger;
+
+// Hadoop imports
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.util.LogFormatter;
+import org.apache.nutch.net.*;
+
+
+/**
+ * Generic {@link org.apache.nutch.net.URLFilter URL filter} based on
+ * regular expressions.
+ *
+ * <p>The regular expressions rules are expressed in a file. The file of rules
+ * is provided by each implementation using the
+ * {@link #getRulesFile(Configuration)} method.</p>
+ * 
+ * <p>The format of this file is made of many rules (one per line):<br/>
+ * <code>
+ * [+-]&lt;regex&gt;
+ * </code><br/>
+ * where plus (<code>+</code>) means go ahead and index it and minus
+ * (<code>-</code>) means no.</p>
+ *
+ * @author J&eacute;r&ocirc;me Charron
+ */
+public abstract class RegexURLFilterBase implements URLFilter {
+
+  /** My logger */
+  private final static Logger LOG =
+    LogFormatter.getLogger(RegexURLFilterBase.class.getName());
+
+  /** An array of applicable rules */
+  private RegexRule[] rules;
+
+  /** The current configuration */
+  private Configuration conf;
+
+
+  /**
+   * Constructs a new empty RegexURLFilterBase
+   */
+  public RegexURLFilterBase() { }
+
+  /**
+   * Constructs a new RegexURLFilterBase and initializes it with a file of rules.
+   * @param filename is the name of rules file.
+   */
+  public RegexURLFilterBase(String filename)
+    throws IOException, IllegalArgumentException {
+    this(new FileReader(filename));
+  }
+
+  /**
+   * Constructs a new RegexURLFilterBase and initializes it with a Reader of rules.
+   * @param reader is a reader of rules.
+   */
+  protected RegexURLFilterBase(Reader reader)
+    throws IOException, IllegalArgumentException {
+    rules = readRulesFile(reader);
+  }
+  
+  /**
+   * Creates a new {@link RegexRule}.
+   * @param sign of the regular expression.
+   *        A <code>true</code> value means that any URL matching this rule
+   *        must be included, whereas a <code>false</code>
+   *        value means that any URL matching this rule must be excluded.
+   * @param regex is the regular expression associated to this rule.
+   */
+  protected abstract RegexRule createRule(boolean sign, String regex);
+  
+  /**
+   * Returns the name of the file of rules to use for
+   * a particular implementation.
+   * @param conf is the current configuration.
+   * @return the name of the file of rules to use.
+   */
+  protected abstract String getRulesFile(Configuration conf);
+  
+  
+  /* -------------------------- *
+   * <implementation:URLFilter> *
+   * -------------------------- */
+  
+  // Inherited Javadoc. First matching rule decides; no match (or no rules loaded) rejects.
+  public synchronized String filter(String url) {
+    for (int i=0; rules!=null && i<rules.length; i++) {
+      if (rules[i].match(url)) {
+        return rules[i].accept() ? url : null;
+      }
+    }
+    return null;
+  }
+
+  /* --------------------------- *
+   * </implementation:URLFilter> *
+   * --------------------------- */
+  
+  
+  /* ----------------------------- *
+   * <implementation:Configurable> *
+   * ----------------------------- */
+  
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+    String file = getRulesFile(conf);
+    Reader reader = conf.getConfResourceAsReader(file);
+    if (reader == null) {
+      LOG.severe("Can't find resource: " + file);
+    } else {
+      try {
+        rules = readRulesFile(reader);
+      } catch (IOException e) {
+        LOG.severe(e.getMessage());
+        //TODO mb@media-style.com: throw Exception? Because broken api.
+        throw new RuntimeException(e.getMessage(), e);
+      }
+    }
+  }
+
+  public Configuration getConf() {
+    return this.conf;
+  }
+  
+  /* ------------------------------ *
+   * </implementation:Configurable> *
+   * ------------------------------ */
+  
+
+  /**
+   * Read the specified file of rules.
+   * @param reader is a reader of regular expressions rules.
+   * @return the corresponding {@link RegexRule rules}.
+   */
+  private RegexRule[] readRulesFile(Reader reader)
+    throws IOException, IllegalArgumentException {
+
+    BufferedReader in = new BufferedReader(reader);
+    List rules = new ArrayList();
+    String line;
+       
+    while((line=in.readLine())!=null) {
+      if (line.length() == 0) {
+        continue;
+      }
+      char first=line.charAt(0);
+      boolean sign=false;
+      switch (first) {
+      case '+' : 
+        sign=true;
+        break;
+      case '-' :
+        sign=false;
+        break;
+      case ' ' : case '\n' : case '#' :           // skip blank & comment lines
+        continue;
+      default :
+        throw new IOException("Invalid first character: "+line);
+      }
+
+      String regex = line.substring(1);
+      LOG.fine("Adding rule [" + regex + "]");
+      RegexRule rule = createRule(sign, regex);
+      rules.add(rule);
+    }
+    return (RegexRule[]) rules.toArray(new RegexRule[rules.size()]);
+  }
+
+  /**
+   * Filter the standard input using a RegexURLFilterBase.
+   * @param filter is the RegexURLFilterBase to use for filtering the
+   *        standard input.
+   * @param args some optional parameters (not used).
+   */
+  public static void main(RegexURLFilterBase filter, String args[])
+    throws IOException, IllegalArgumentException {
+
+    BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
+    String line;
+    while((line=in.readLine())!=null) {
+      String out = filter.filter(line);
+      if (out!=null) {
+        System.out.print("+");
+        System.out.println(out);
+      } else {
+        System.out.print("-");
+        System.out.println(line);
+      }
+    }
+  }
+
+}

Propchange: lucene/nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/nutch/trunk/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java?rev=389901&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java (added)
+++ lucene/nutch/trunk/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java Wed Mar 29 14:09:03 2006
@@ -0,0 +1,142 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.urlfilter.api;
+
+// JDK imports
+import java.io.BufferedReader;
+import java.io.FileReader;
+import java.io.IOException;
+import java.io.Reader;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.logging.Logger;
+
+// JUnit imports
+import junit.framework.TestCase;
+
+// Hadoop imports
+import org.apache.hadoop.util.LogFormatter;
+
+// Nutch imports
+import org.apache.nutch.net.URLFilter;
+
+
+/**
+ * JUnit based test of class <code>RegexURLFilterBase</code>.
+ *
+ * @author J&eacute;r&ocirc;me Charron
+ */
+public abstract class RegexURLFilterBaseTest extends TestCase {
+  
+  /** My logger */
+  protected static final Logger LOG =
+    LogFormatter.getLogger(RegexURLFilterBaseTest.class.getName());  
+
+  private final static String SEPARATOR = System.getProperty("file.separator");  
+  private final static String SAMPLES = System.getProperty("test.data", ".");
+  
+  public RegexURLFilterBaseTest(String testName) {
+    super(testName);
+  }
+  
+  protected abstract URLFilter getURLFilter(Reader rules);
+
+  protected void bench(int loops, String file) {
+    try {
+      bench(loops,
+            new FileReader(SAMPLES + SEPARATOR + file + ".rules"),
+            new FileReader(SAMPLES + SEPARATOR + file + ".urls"));
+    } catch (Exception e) {
+      fail(e.toString());
+    }
+  }
+  
+  protected void bench(int loops, Reader rules, Reader urls) {
+    long start = System.currentTimeMillis();
+    try {
+      URLFilter filter = getURLFilter(rules);
+      FilteredURL[] expected = readURLFile(urls);
+      for (int i=0; i<loops; i++) {
+        test(filter, expected);
+      }
+    } catch (Exception e) {
+      fail(e.toString());
+    }
+    LOG.info("bench time (" + loops + ") " +
+             (System.currentTimeMillis()-start) + "ms");
+  }
+  
+  protected void test(String file) {
+    try {
+      test(new FileReader(SAMPLES + SEPARATOR + file + ".rules"),
+           new FileReader(SAMPLES + SEPARATOR + file + ".urls"));
+    } catch (Exception e) {
+      fail(e.toString());
+    }
+  }
+  
+  protected void test(Reader rules, Reader urls) {
+    try {
+      test(getURLFilter(rules), readURLFile(urls));
+    } catch (Exception e) {
+      fail(e.toString());
+    }
+  }
+  
+  protected void test(URLFilter filter, FilteredURL[] expected) {
+    for (int i=0; i<expected.length; i++) {
+      String result = filter.filter(expected[i].url);
+      if (result != null) {
+        assertTrue(expected[i].url, expected[i].sign);
+      } else {
+        assertFalse(expected[i].url, expected[i].sign);
+      }
+    }
+  }
+  
+  private static FilteredURL[] readURLFile(Reader reader) throws IOException {
+    BufferedReader in = new BufferedReader(reader);
+    List list = new ArrayList();
+    String line;
+    while((line=in.readLine()) != null) {
+      if (line.length() != 0) {
+        list.add(new FilteredURL(line));
+      }
+    }
+    return (FilteredURL[]) list.toArray(new FilteredURL[list.size()]);
+  }
+    
+  private static class FilteredURL {
+  
+    boolean sign;
+    String url;
+
+    FilteredURL(String line) {
+      switch (line.charAt(0)) {
+      case '+' : 
+        sign = true;
+        break;
+      case '-' :
+        sign = false;
+        break;
+      default :
+        // Simply ignore...
+      }
+      url = line.substring(1);
+    }
+  }
+  
+}

Propchange: lucene/nutch/trunk/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: lucene/nutch/trunk/src/plugin/ontology/plugin.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/ontology/plugin.xml?rev=389901&r1=389900&r2=389901&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/ontology/plugin.xml (original)
+++ lucene/nutch/trunk/src/plugin/ontology/plugin.xml Wed Mar 29 14:09:03 2006
@@ -24,13 +24,13 @@
 
    <!-- attribute "point" is the plugin interface class -->
    <!-- seems kinda redundant to have to define the point here too -->   
-   <extension id="org.apache.nutch.ontology.OntologyImpl"
+   <extension id="org.apache.nutch.ontology.jena"
               name="Ontology Model Loader"
               point="org.apache.nutch.ontology.Ontology">
 
       <!-- define all the classes that implement the point defined above -->
-      <implementation id="org.apache.nutch.ontology.OntologyImpl"
-                      class="org.apache.nutch.ontology.OntologyImpl"
+      <implementation id="org.apache.nutch.ontology.jena.OntologyImpl"
+                      class="org.apache.nutch.ontology.jena.OntologyImpl"
                       pathSuffix=""/>
 
    </extension>

Added: lucene/nutch/trunk/src/plugin/ontology/src/java/org/apache/nutch/ontology/jena/OntologyImpl.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/ontology/src/java/org/apache/nutch/ontology/jena/OntologyImpl.java?rev=389901&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/ontology/src/java/org/apache/nutch/ontology/jena/OntologyImpl.java (added)
+++ lucene/nutch/trunk/src/plugin/ontology/src/java/org/apache/nutch/ontology/jena/OntologyImpl.java Wed Mar 29 14:09:03 2006
@@ -0,0 +1,360 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.ontology.jena;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.util.LogFormatter;
+import org.apache.nutch.ontology.*;
+import org.apache.nutch.util.NutchConfiguration;
+
+import com.hp.hpl.jena.ontology.Individual;
+import com.hp.hpl.jena.ontology.OntClass;
+import com.hp.hpl.jena.ontology.OntModel;
+import com.hp.hpl.jena.ontology.OntModelSpec;
+import com.hp.hpl.jena.ontology.OntResource;
+import com.hp.hpl.jena.ontology.Restriction;
+import com.hp.hpl.jena.rdf.model.Literal;
+import com.hp.hpl.jena.rdf.model.Resource;
+import com.hp.hpl.jena.rdf.model.ModelFactory;
+import com.hp.hpl.jena.shared.PrefixMapping;
+
+import java.util.Map;
+import java.util.HashMap;
+import java.util.Hashtable;
+import java.util.Iterator;
+import java.util.List;
+import java.util.LinkedList;
+
+import java.util.logging.Logger;
+
+import java.io.PrintStream;
+
+/**
+ * This class wraps a model
+ * built from a list of ontologies,
+ * using HP's Jena.
+ *
+ * @author michael j pan
+ */
+public class OntologyImpl implements org.apache.nutch.ontology.Ontology {
+  public static final Logger LOG =
+    LogFormatter.getLogger("org.apache.nutch.ontology.Ontology");
+
+  public final static String DELIMITER_SEARCHTERM = " ";
+
+  private static Hashtable searchTerms = new Hashtable();
+  private static Parser parser;
+
+  //private static Object ontologyModel;
+  private static OntModel ontologyModel;
+
+  private static Ontology ontology = null;
+
+  private static Map m_anonIDs = new HashMap();
+  private static int m_anonCount = 0;
+
+  public OntologyImpl() {
+    //only initialize all the static variables
+    //if first time called to this ontology constructor
+    if (ontology == null) {
+      LOG.info( "creating new ontology");
+      parser = new OwlParser();
+      ontology = this;
+    }
+
+    if (ontologyModel == null)
+      ontologyModel =
+        ModelFactory.createOntologyModel(OntModelSpec.OWL_MEM, null);
+        //ModelFactory.createOntologyModel();
+  }
+
+  public static Ontology getInstance () {
+    if (ontology == null) {
+      //ontology = new org.apache.nutch.ontology.Ontology();
+      ontology = new org.apache.nutch.ontology.jena.OntologyImpl();
+    }
+    return ontology;
+  }
+
+  public void load (String[] urls) {
+    for (int i=0; i<urls.length; i++) {
+      String url = urls[i].trim();
+      if (!url.equals(""))
+        load(ontologyModel, url);
+    }
+    parser.parse(ontologyModel);
+  }
+
+  private void load (Object m, String url) {
+    try {
+      LOG.info( "reading "+url);
+      ((OntModel)m).read(url);
+    } catch (Exception e) {
+      LOG.severe("failed on attempting to read ontology "+url);
+      LOG.severe(e.getMessage());
+      StackTraceElement[] traces = e.getStackTrace();
+      for (int i=0; i<traces.length; i++) {
+        LOG.severe(traces[i].toString());
+      }
+    }
+  }
+
+  public static Parser getParser() {
+    if (parser == null) {
+      parser = new OwlParser();
+    }
+    return parser;
+  }
+
+  public static OntModel getModel() {
+    return (OntModel)ontologyModel;
+  }
+
+  // not yet implemented
+  //public void merge (org.apache.nutch.ontology.Ontology o) {
+  //}
+
+  /**
+   * retrieve all subclasses of entity(ies) hashed to searchTerm
+   */
+  public Iterator subclasses (String entitySearchTerm) {
+    Map classMap = retrieve(entitySearchTerm);
+    Map subclasses = new HashMap();
+  
+    Iterator iter = classMap.keySet().iterator();
+    while (iter.hasNext()) {
+      //OntClass resource = (OntClass) iter.next();
+      OntResource resource = (OntResource) iter.next();
+  
+      if (resource instanceof OntClass) {
+        //get subclasses
+        for (Iterator i=((OntClass)resource).listSubClasses(); i.hasNext();) {
+          OntResource subclass = (OntResource) i.next();
+          for (Iterator j=subclass.listLabels(null); j.hasNext();) {
+            Literal l = (Literal) j.next();
+            subclasses.put(l.toString(), "1");
+          }
+        }
+        //get individuals
+        for (Iterator i=((OntClass)resource).listInstances(); i.hasNext();) {
+          OntResource subclass = (OntResource) i.next();
+          for (Iterator j=subclass.listLabels(null); j.hasNext();) {
+            Literal l = (Literal) j.next();
+            subclasses.put(l.toString(), "1");
+          }
+        }
+      } else if (resource instanceof Individual) {
+        for (Iterator i=resource.listSameAs(); i.hasNext();) {  
+          OntResource subclass = (OntResource) i.next();
+          for (Iterator j=subclass.listLabels(null); j.hasNext();) {
+            Literal l = (Literal) j.next();
+            subclasses.put(l.toString(), "1");
+          }    
+        }
+      }
+    }
+
+    return subclasses.keySet().iterator();
+  }
+
+  /**
+   * retrieves synonyms from wordnet via sweet's web interface
+   */
+  public Iterator synonyms (String queryKeyPhrase) {
+    //need to have a html quote method instead
+    queryKeyPhrase = queryKeyPhrase.replaceAll("\\s+", "\\+");
+
+    Map classMap = retrieve(queryKeyPhrase);
+
+    Map synonyms = new HashMap();
+
+    Iterator iter = classMap.keySet().iterator();
+    while (iter.hasNext()) {
+      OntResource resource = (OntResource) iter.next();
+
+      //listLabels
+      for (Iterator i=resource.listLabels(null); i.hasNext();) {
+        Literal l = (Literal) i.next();
+        synonyms.put(l.toString(), "1");
+      }
+    
+      if (resource instanceof Individual) {
+      //get all individuals same as this one
+        for (Iterator i=resource.listSameAs(); i.hasNext();) {
+          Individual individual = (Individual) i.next();
+          //add labels
+          for (Iterator j =individual.listLabels(null); j.hasNext();) {
+            Literal l = (Literal) j.next();  // read from the label iterator j, not the outer i
+            synonyms.put(l.toString(), "1");
+          }
+        }
+      } else if (resource instanceof OntClass) {
+        //list equivalent classes
+        for (Iterator i=((OntClass)resource).listEquivalentClasses();
+          i.hasNext();) {
+          OntClass equivClass = (OntClass) i.next();
+          //add labels
+          for (Iterator j=equivClass.listLabels(null); j.hasNext();) {
+            Literal l = (Literal) j.next();
+            synonyms.put(l.toString(), "1");
+          }
+        }
+      }
+    }
+
+    return synonyms.keySet().iterator();
+  }
+
+  public static void addSearchTerm(String label, OntResource resource) {
+    Map m = retrieve(label);
+    if (m == null) {
+      m=new HashMap();
+    }
+    m.put(resource, "1");
+    searchTerms.put(label.toLowerCase(), m);
+  }
+
+  /**
+   * Looks up the resources registered under a label (case-insensitively,
+   * by lower-casing the label).
+   *
+   * @param label the label to look up
+   * @return the registered map, or a new empty map that is NOT stored in
+   *         the search terms when nothing is registered; never null
+   */
+  public static Map retrieve(String label) {
+    Map registered = (Map) searchTerms.get(label.toLowerCase());
+    if (registered != null) {
+      return registered;
+    }
+    return new HashMap();
+  }
+
+  /**
+   * Prints a class description followed by, recursively, its direct
+   * subclasses and then its instances with their labels.
+   *
+   * @param out    stream to print to
+   * @param cls    class to render
+   * @param occurs classes currently being expanded; a class already in this
+   *               list is printed but not expanded again
+   * @param depth  nesting depth, used for indenting the class description
+   */
+  protected static void renderHierarchy( PrintStream out, OntClass cls,
+            List occurs, int depth ) {
+    renderClassDescription( out, cls, depth );
+    out.println();
+
+    // nothing to expand: not viewable as a class, or already on the path
+    if (!cls.canAs( OntClass.class ) || occurs.contains( cls )) {
+      return;
+    }
+
+    // direct subclasses first
+    Iterator subClasses = cls.listSubClasses( true );
+    while (subClasses.hasNext()) {
+      OntClass child = (OntClass) subClasses.next();
+
+      // cls stays on the occurs list only while its child is rendered
+      occurs.add( cls );
+      renderHierarchy( out, child, occurs, depth + 1 );
+      occurs.remove( cls );
+    }
+
+    // then the instances, one per line, each followed by its labels
+    Iterator instances = cls.listInstances();
+    while (instances.hasNext()) {
+      Individual instance = (Individual) instances.next();
+      renderURI(out, instance.getModel(), instance.getURI());
+      out.print(" [");
+      Iterator labels = instance.listLabels(null);
+      while (labels.hasNext()) {
+        Literal label = (Literal) labels.next();
+        out.print(label.getString()+", ");
+      }
+      out.print("] ");
+      out.println();
+    }
+  }
+
+  /**
+   * Prints a one-line description of a class, indented by its depth.
+   * Restrictions and anonymous classes are delegated to their own
+   * renderers; a named class is printed as its local name followed by its
+   * labels in square brackets.
+   *
+   * @param out   stream to print to
+   * @param c     class to describe
+   * @param depth number of spaces to indent
+   */
+  public static void renderClassDescription( PrintStream out,
+    OntClass c, int depth ) {
+    indent( out, depth );
+
+    if (c.isRestriction()) {
+      Restriction restriction = (Restriction) c.as( Restriction.class );
+      renderRestriction( out, restriction );
+      return;
+    }
+
+    if (c.isAnon()) {
+      renderAnonymous( out, c, "class" );
+      return;
+    }
+
+    // named class: the local name is printed rather than the prefixed URI
+    out.print( "Class " );
+    out.print( c.getLocalName() );
+
+    out.print( " [" );
+    Iterator labels = c.listLabels(null);
+    while (labels.hasNext()) {
+      Literal label = (Literal) labels.next();
+      out.print( label.getString()+", " );
+    }
+    out.print( "] ");
+  }
+  
+  /**
+   * Prints a restriction (by prefixed URI, or as an anonymous node)
+   * followed by the prefixed URI of the property it restricts.
+   *
+   * @param out stream to print to
+   * @param r   restriction to render
+   */
+  protected static void renderRestriction( PrintStream out, Restriction r ) {
+    if (r.isAnon()) {
+      renderAnonymous( out, r, "restriction" );
+    } else {
+      out.print( "Restriction " );
+      renderURI( out, r.getModel(), r.getURI() );
+    }
+
+    out.print( " on property " );
+    String propertyUri = r.getOnProperty().getURI();
+    renderURI( out, r.getModel(), propertyUri );
+  }
+
+  /**
+   * Prints a URI after passing it through the prefix mapping's
+   * {@code usePrefix}.
+   *
+   * @param out      stream to print to
+   * @param prefixes prefix mapping applied to the URI
+   * @param uri      URI to print
+   */
+  protected static void renderURI( PrintStream out,
+    PrefixMapping prefixes, String uri ) {
+    String rendered = prefixes.usePrefix( uri );
+    out.print( rendered );
+  }
+  
+  /**
+   * Prints an anonymous resource under a generated identifier of the form
+   * "a-N". The identifier is created on first use and remembered per
+   * anonymous id, so the same node always prints the same identifier.
+   *
+   * @param out  stream to print to
+   * @param anon anonymous resource to render
+   * @param name kind of node, inserted into the printed text
+   */
+  protected static void renderAnonymous( PrintStream out,
+    Resource anon, String name ) {
+    Object nodeId = anon.getId();
+    String label = (String) m_anonIDs.get( nodeId );
+    if (label == null) {
+      // first time this node is seen: allocate the next sequential id
+      label = "a-" + m_anonCount++;
+      m_anonIDs.put( nodeId, label );
+    }
+
+    out.print( "Anonymous ");
+    out.print( name );
+    out.print( " with ID " );
+    out.print( label );
+  }
+  
+  /**
+   * Prints {@code depth} single spaces; prints nothing when depth is
+   * zero or negative.
+   *
+   * @param out   stream to print to
+   * @param depth number of spaces to print
+   */
+  protected static void indent( PrintStream out, int depth ) {
+    int remaining = depth;
+    while (remaining > 0) {
+      out.print( " " );
+      remaining--;
+    }
+  }
+
+  /**
+   * Command-line driver: loads the ontologies listed in the
+   * "extension.ontology.urls" configuration property, prints the class
+   * hierarchy to standard output, then prints the subclasses of "Season".
+   *
+   * @param args unused
+   * @throws Exception if creating or loading the ontology fails
+   */
+  public static void main( String[] args ) throws Exception {
+
+    Configuration conf = NutchConfiguration.create();
+    Ontology ontology = new OntologyFactory(conf).getOntology();
+
+    String urls = conf.get("extension.ontology.urls");
+    boolean noUrls = (urls == null) || (urls.trim().length() == 0);
+    if (noUrls) {
+      LOG.severe("No ontology url found.");
+      return;
+    }
+
+    // the property holds a whitespace-separated list of urls
+    String[] urlList = urls.split("\\s+");
+    ontology.load(urlList);
+    LOG.info( "created new ontology");
+
+    // dump the hierarchy below every root class
+    Iterator roots = getParser().rootClasses( getModel() );
+    while (roots.hasNext()) {
+      OntClass root = (OntClass) roots.next();
+      renderHierarchy(System.out, root, new LinkedList(), 0);
+    }
+
+    String[] terms = { "Season" };
+
+    for (int t = 0; t < terms.length; t++) {
+      for (Iterator subs = ontology.subclasses(terms[t]); subs.hasNext(); ) {
+        System.out.println("subclass >> "+(String)subs.next());
+      }
+    }
+  }
+}

Propchange: lucene/nutch/trunk/src/plugin/ontology/src/java/org/apache/nutch/ontology/jena/OntologyImpl.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/nutch/trunk/src/plugin/ontology/src/java/org/apache/nutch/ontology/jena/OwlParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/ontology/src/java/org/apache/nutch/ontology/jena/OwlParser.java?rev=389901&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/ontology/src/java/org/apache/nutch/ontology/jena/OwlParser.java (added)
+++ lucene/nutch/trunk/src/plugin/ontology/src/java/org/apache/nutch/ontology/jena/OwlParser.java Wed Mar 29 14:09:03 2006
@@ -0,0 +1,146 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.ontology.jena;
+
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+//import org.apache.hadoop.util.LogFormatter;
+
+import com.hp.hpl.jena.ontology.OntClass;
+import com.hp.hpl.jena.ontology.OntModel;
+import com.hp.hpl.jena.ontology.Individual;
+import com.hp.hpl.jena.rdf.model.Literal;
+import org.apache.nutch.ontology.*;
+
+/**
+ * Implementation of {@link Parser} for w3c's OWL files. Walks the class
+ * hierarchy of a Jena {@link OntModel} and registers the labels of every
+ * named class and of its instances as search terms in
+ * {@link OntologyImpl}.
+ *
+ * @author michael j pan
+ */
+public class OwlParser implements Parser {
+
+  /**
+   * A lower-case letter or digit directly followed by an upper-case
+   * letter, i.e. the place where a camel-cased id gets a space inserted.
+   * Compiled once because the pattern never changes.
+   */
+  private static final Pattern CAMEL_CASE_BOUNDARY =
+    Pattern.compile("([a-z0-9])([A-Z])");
+
+  public OwlParser () {
+  }
+
+  /**
+   * Parses an owl ontology model using jena: every non-anonymous root
+   * class (see {@link #rootClasses(OntModel)}) is processed together with
+   * its subclasses and instances.
+   *
+   * @param m the ontology model to parse
+   */
+  public void parse(OntModel m) {
+    for (Iterator i = rootClasses( m );  i.hasNext();  ) {
+      OntClass c = (OntClass) i.next();
+
+      //dont deal with anonymous classes
+      if (c.isAnon()) {
+        continue;
+      }
+
+      parseClass( c,  new ArrayList(), 0 );
+    }
+  }
+
+  /**
+   * Registers the labels of a class as search terms, then recurses into
+   * its direct subclasses and registers the labels of its instances.
+   * A class without any label first gets one derived from its local name
+   * via {@link #rdfidToLabel(String)}.
+   *
+   * @param cls    the class to process; anonymous classes are ignored
+   * @param occurs classes currently being expanded; a class already in
+   *               this list is not expanded again
+   * @param depth  current nesting depth (only passed along on recursion)
+   */
+  protected  void parseClass( OntClass cls, List occurs, int depth ) {
+    //dont deal with anonymous classes
+    if (cls.isAnon()) {
+      return;
+    }
+
+    //add cls to Ontology searchterms
+    //list labels
+    Iterator labelIter = cls.listLabels(null);
+    //if has no labels
+    if (!labelIter.hasNext()) {
+        //add rdf:ID as a label
+        cls.addLabel(rdfidToLabel(cls.getLocalName()), null);
+    }
+    //reset the label iterator, so a label added above is seen as well
+    labelIter = cls.listLabels(null);
+
+    while(labelIter.hasNext()) {
+      Literal l = (Literal) labelIter.next();
+      OntologyImpl.addSearchTerm(l.toString(), cls);
+    }
+
+    // recurse to the next level down
+    if (cls.canAs( OntClass.class )  &&  !occurs.contains( cls )) {
+      //list subclasses
+      for (Iterator i = cls.listSubClasses( true );  i.hasNext(); ) {
+        OntClass sub = (OntClass) i.next();
+
+        // we push this expression on the occurs list before we recurse
+        occurs.add( cls );
+        parseClass(sub, occurs, depth+1);
+        occurs.remove( cls );
+      }
+
+      //list instances
+      for (Iterator i=cls.listInstances(); i.hasNext(); ) {
+        //add search terms for each instance
+
+        //list labels
+        Individual individual = (Individual) i.next();
+        for (Iterator j=individual.listLabels(null); j.hasNext();) {
+          Literal l = (Literal) j.next();
+          OntologyImpl.addSearchTerm(l.toString(), individual);
+        }
+      }
+    }
+  }
+
+  /**
+   * Lists the root classes of a model: the non-anonymous classes that
+   * either have Thing as a direct super-class or declare no super-class
+   * at all.
+   *
+   * @param m the ontology model to inspect
+   * @return an iterator over the root {@link OntClass} objects
+   */
+  public Iterator rootClasses( OntModel m ) {
+    List roots = new ArrayList();
+
+    for (Iterator i = m.listClasses();  i.hasNext(); ) {
+      OntClass c = (OntClass) i.next();
+
+      try {
+      // too confusing to list all the restrictions as root classes
+        if (c.isAnon()) {
+          continue;
+        }
+
+        if (c.hasSuperClass( m.getProfile().THING(), true ) ) {
+          // this class is directly descended from Thing
+          roots.add( c );
+        } else if (c.getCardinality( m.getProfile().SUB_CLASS_OF() ) == 0 ) {
+          // this class has no super-classes
+          // (can occur if we're not using the reasoner)
+          roots.add( c );
+        }
+      } catch (Exception e) {
+        // a class that cannot be inspected is reported and skipped so the
+        // remaining classes are still listed
+        //e.printStackTrace();
+        System.out.println(e.getMessage());
+      }
+    }
+
+    return roots.iterator();
+  }
+
+  /**
+   * Turns a camel-cased rdf:ID into a label by inserting a space wherever
+   * a lower-case letter or digit is directly followed by an upper-case
+   * letter, e.g. "leapYear" becomes "leap Year".
+   *
+   * @param idString the id to convert; must not be null
+   * @return the id with spaces inserted at the camel-case boundaries
+   */
+  public String rdfidToLabel (String idString) {
+    // one pass over the input; $1 and $2 are the characters on either
+    // side of the boundary
+    return CAMEL_CASE_BOUNDARY.matcher(idString).replaceAll("$1 $2");
+  }
+
+}

Propchange: lucene/nutch/trunk/src/plugin/ontology/src/java/org/apache/nutch/ontology/jena/OwlParser.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/nutch/trunk/src/plugin/ontology/src/java/org/apache/nutch/ontology/jena/Parser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/ontology/src/java/org/apache/nutch/ontology/jena/Parser.java?rev=389901&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/ontology/src/java/org/apache/nutch/ontology/jena/Parser.java (added)
+++ lucene/nutch/trunk/src/plugin/ontology/src/java/org/apache/nutch/ontology/jena/Parser.java Wed Mar 29 14:09:03 2006
@@ -0,0 +1,32 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.ontology.jena;
+
+import com.hp.hpl.jena.ontology.OntModel;
+
+import java.util.Iterator;
+import org.apache.nutch.ontology.*;
+
+/**
+ * Contract for ontology parsers working on a Jena {@link OntModel}.
+ *
+ * @author michael j pan
+ */
+public interface Parser {
+
+  /**
+   * Parses the given ontology model.
+   *
+   * @param m the model to parse
+   */
+  void parse(OntModel m);
+
+  /**
+   * Lists the root classes of the given ontology model.
+   *
+   * @param m the model to inspect
+   * @return an iterator over the root classes
+   */
+  Iterator rootClasses(OntModel m);
+}

Propchange: lucene/nutch/trunk/src/plugin/ontology/src/java/org/apache/nutch/ontology/jena/Parser.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/nutch/trunk/src/plugin/ontology/src/test/org/apache/nutch/ontology/jena/TestOntology.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/ontology/src/test/org/apache/nutch/ontology/jena/TestOntology.java?rev=389901&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/ontology/src/test/org/apache/nutch/ontology/jena/TestOntology.java (added)
+++ lucene/nutch/trunk/src/plugin/ontology/src/test/org/apache/nutch/ontology/jena/TestOntology.java Wed Mar 29 14:09:03 2006
@@ -0,0 +1,97 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.ontology.jena;
+
+import org.apache.nutch.ontology.*;
+import org.apache.nutch.protocol.ProtocolException;
+
+import org.apache.nutch.parse.ParseException;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.util.NutchConfiguration;
+
+import junit.framework.TestCase;
+
+import java.util.Iterator;
+import java.util.List;
+import java.util.LinkedList;
+
+import java.lang.Exception;
+
+/**
+ * Unit tests for Ontology: loads the sample ontology files and checks
+ * that the expected subclasses of a known class are reported.
+ *
+ * @author michael j pan
+ */
+public class TestOntology extends TestCase {
+
+  private String fileSeparator = System.getProperty("file.separator");
+  // This system property is defined in ./src/plugin/build-plugin.xml
+  private String sampleDir = System.getProperty("test.data",".");
+  // Make sure sample files are copied to "test.data" as specified in
+  // ./src/plugin/ontology/build.xml during plugin compilation.
+  // Check ./src/plugin/ontology/sample/README.txt for what they are.
+  private String[] sampleFiles = {"time.owl"};
+
+  // shared by all test instances, created lazily in testIt()
+  private static Ontology ontology;
+  private Configuration conf;
+  public TestOntology(String name) {
+    super(name);
+  }
+
+  protected void setUp() {
+      this.conf = NutchConfiguration.create();
+  }
+
+  protected void tearDown() {}
+
+  /**
+   * Loads every sample file and asserts that "Season" has the subclasses
+   * Spring, Summer, Fall and Winter.
+   */
+  public void testIt() throws ProtocolException, ParseException, Exception {
+    String className = "Season";
+    String[] subclassNames =
+      new String[] {"Spring", "Summer", "Fall", "Winter"};
+
+    if (ontology==null) {
+      try {
+        ontology = new OntologyFactory(this.conf).getOntology();
+      } catch (Exception e) {
+        // keep the original failure as the cause so it shows in the report
+        throw new Exception("Failed to instantiate ontology", e);
+      }
+    }
+
+    //foreach sample file
+    for (int i=0; i<sampleFiles.length; i++) {
+      //construct the url
+      String urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
+
+      ontology.load(new String[] {urlString});
+
+      List subclassList = new LinkedList();
+
+      Iterator iter = ontology.subclasses(className);
+      while (iter.hasNext()) {
+        String subclassLabel = (String) iter.next();
+        System.out.println(subclassLabel);
+        subclassList.add(subclassLabel);
+      }
+
+      for (int j=0; j<subclassNames.length; j++) {
+        assertTrue("Subclass " + subclassNames[j] + " of " + className
+            + " not found in " + urlString,
+            subclassList.contains(subclassNames[j]));
+      }
+    }
+
+  }
+
+}

Propchange: lucene/nutch/trunk/src/plugin/ontology/src/test/org/apache/nutch/ontology/jena/TestOntology.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: lucene/nutch/trunk/src/plugin/urlfilter-automaton/plugin.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/urlfilter-automaton/plugin.xml?rev=389901&r1=389900&r2=389901&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/urlfilter-automaton/plugin.xml (original)
+++ lucene/nutch/trunk/src/plugin/urlfilter-automaton/plugin.xml Wed Mar 29 14:09:03 2006
@@ -21,7 +21,7 @@
               name="Nutch Automaton URL Filter"
               point="org.apache.nutch.net.URLFilter">
       <implementation id="AutomatonURLFilter"
-                      class="org.apache.nutch.net.AutomatonURLFilter"/>
+                      class="org.apache.nutch.urlfilter.automaton.AutomatonURLFilter"/>
    </extension>
 
 </plugin>

Added: lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/automaton/AutomatonURLFilter.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/automaton/AutomatonURLFilter.java?rev=389901&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/automaton/AutomatonURLFilter.java (added)
+++ lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/automaton/AutomatonURLFilter.java Wed Mar 29 14:09:03 2006
@@ -0,0 +1,97 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.urlfilter.automaton;
+
+// JDK imports
+import java.io.Reader;
+import java.io.IOException;
+import java.util.regex.PatternSyntaxException;
+
+// Hadoop imports
+import org.apache.hadoop.conf.Configuration;
+
+// Automaton imports
+import dk.brics.automaton.RegExp;
+import dk.brics.automaton.RunAutomaton;
+import org.apache.nutch.net.*;
+import org.apache.nutch.urlfilter.api.RegexRule;
+import org.apache.nutch.urlfilter.api.RegexURLFilterBase;
+
+
+/**
+ * RegexURLFilterBase implementation based on the
+ * <a href="http://www.brics.dk/automaton/">dk.brics.automaton</a>
+ * Finite-State Automata for Java<sup>TM</sup>.
+ *
+ * @author J&eacute;r&ocirc;me Charron
+ * @see <a href="http://www.brics.dk/automaton/">dk.brics.automaton</a>
+ */
+public class AutomatonURLFilter extends RegexURLFilterBase {
+
+  public AutomatonURLFilter() {
+    super();
+  }
+
+  /**
+   * Creates a filter from the rules in the named file.
+   *
+   * @param filename name of the rules file, passed to the base class
+   */
+  public AutomatonURLFilter(String filename)
+    throws IOException, PatternSyntaxException {
+    super(filename);
+  }
+
+  /**
+   * Creates a filter from rules supplied by a reader (package-private,
+   * used by the unit tests).
+   *
+   * @param reader source of the rules, passed to the base class
+   */
+  AutomatonURLFilter(Reader reader)
+    throws IOException, IllegalArgumentException {
+    super(reader);
+  }
+
+
+  /* ----------------------------------- *
+   * <implementation:RegexURLFilterBase> *
+   * ----------------------------------- */
+
+  // Inherited Javadoc
+  protected String getRulesFile(Configuration conf) {
+    return conf.get("urlfilter.automaton.file");
+  }
+
+  // Inherited Javadoc
+  protected RegexRule createRule(boolean sign, String regex) {
+    return new Rule(sign, regex);
+  }
+
+  /* ------------------------------------ *
+   * </implementation:RegexURLFilterBase> *
+   * ------------------------------------ */
+
+
+  public static void main(String args[]) throws IOException {
+    main(new AutomatonURLFilter(), args);
+  }
+
+
+  /**
+   * A rule whose regular expression is compiled into a run-time automaton.
+   * Static because it never touches the enclosing filter, so it should not
+   * carry a hidden reference to it.
+   */
+  private static class Rule extends RegexRule {
+
+    private RunAutomaton automaton;
+
+    Rule(boolean sign, String regex) {
+      super(sign, regex);
+      automaton = new RunAutomaton(new RegExp(regex, RegExp.ALL).toAutomaton());
+    }
+
+    /** Returns the result of running the automaton on the url. */
+    protected boolean match(String url) {
+      return automaton.run(url);
+    }
+  }
+
+}

Propchange: lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/automaton/AutomatonURLFilter.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/automaton/package.html
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/automaton/package.html?rev=389901&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/automaton/package.html (added)
+++ lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/automaton/package.html Wed Mar 29 14:09:03 2006
@@ -0,0 +1,9 @@
+<html>
+<body>
+<p>
+A url filter plugin based on
+<a href="http://www.brics.dk/automaton/">dk.brics.automaton</a> Finite-State
+Automata for Java<sup>TM</sup>.
+</p>
+</body>
+</html>

Propchange: lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/automaton/package.html
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/test/org/apache/nutch/urlfilter/automaton/TestAutomatonURLFilter.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/test/org/apache/nutch/urlfilter/automaton/TestAutomatonURLFilter.java?rev=389901&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/test/org/apache/nutch/urlfilter/automaton/TestAutomatonURLFilter.java (added)
+++ lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/test/org/apache/nutch/urlfilter/automaton/TestAutomatonURLFilter.java Wed Mar 29 14:09:03 2006
@@ -0,0 +1,70 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.urlfilter.automaton;
+
+// JDK imports
+import java.io.IOException;
+import java.io.Reader;
+
+// JUnit imports
+import junit.framework.Test;
+import junit.framework.TestSuite;
+import junit.textui.TestRunner;
+import org.apache.nutch.net.*;
+
+// Nutch imports
+import org.apache.nutch.urlfilter.api.RegexURLFilterBaseTest;
+
+
+/**
+ * JUnit based test of class <code>AutomatonURLFilter</code>.
+ *
+ * @author J&eacute;r&ocirc;me Charron
+ */
+public class TestAutomatonURLFilter extends RegexURLFilterBaseTest {
+  
+  public TestAutomatonURLFilter(String testName) {
+    super(testName);
+  }
+  
+  public static Test suite() {
+    return new TestSuite(TestAutomatonURLFilter.class);
+  }
+  
+  public static void main(String[] args) {
+    TestRunner.run(suite());
+  }
+
+  protected URLFilter getURLFilter(Reader rules) {
+    try {
+      return new AutomatonURLFilter(rules);
+    } catch (IOException e) {
+      fail(e.toString());
+      return null;
+    }
+  }
+  
+  public void test() {
+    test("WholeWebCrawling");
+    test("IntranetCrawling");
+    bench(50, "Benchmarks");
+    bench(100, "Benchmarks");
+    bench(200, "Benchmarks");
+    bench(400, "Benchmarks");
+    bench(800, "Benchmarks");
+  }
+
+}

Propchange: lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/test/org/apache/nutch/urlfilter/automaton/TestAutomatonURLFilter.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: lucene/nutch/trunk/src/plugin/urlfilter-prefix/plugin.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/urlfilter-prefix/plugin.xml?rev=389901&r1=389900&r2=389901&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/urlfilter-prefix/plugin.xml (original)
+++ lucene/nutch/trunk/src/plugin/urlfilter-prefix/plugin.xml Wed Mar 29 14:09:03 2006
@@ -19,7 +19,7 @@
               name="Nutch Prefix URL Filter"
               point="org.apache.nutch.net.URLFilter">
       <implementation id="PrefixURLFilter"
-                      class="org.apache.nutch.net.PrefixURLFilter"/>
+                      class="org.apache.nutch.urlfilter.prefix.PrefixURLFilter"/>
       <!-- by default, attribute "file" is undefined, to keep classic behavior.
       <implementation id="PrefixURLFilter"
                       class="org.apache.nutch.net.PrefixURLFilter"

Added: lucene/nutch/trunk/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java?rev=389901&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java (added)
+++ lucene/nutch/trunk/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java Wed Mar 29 14:09:03 2006
@@ -0,0 +1,164 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// $Id: PrefixURLFilter.java,v 1.2 2005/02/07 19:10:37 cutting Exp $
+
+package org.apache.nutch.urlfilter.prefix;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.util.LogFormatter;
+import org.apache.nutch.net.*;
+
+import org.apache.nutch.util.PrefixStringMatcher;
+import org.apache.nutch.util.TrieStringMatcher;
+
+import org.apache.nutch.plugin.Extension;
+import org.apache.nutch.plugin.PluginRepository;
+
+import java.io.Reader;
+import java.io.FileReader;
+import java.io.BufferedReader;
+import java.io.InputStreamReader;
+import java.io.IOException;
+
+import java.util.List;
+import java.util.ArrayList;
+import java.util.logging.Logger;
+
+/**
+ * Filters URLs based on a file of URL prefixes. The file is named by
+ * (1) property "urlfilter.prefix.file" in ./conf/nutch-default.xml, and
+ * (2) attribute "file" in plugin.xml of this plugin
+ * Attribute "file" has higher precedence if defined.
+ *
+ * <p>The format of this file is one URL prefix per line. Empty lines and
+ * lines starting with a space or '#' are ignored.</p>
+ */
+public class PrefixURLFilter implements URLFilter {
+
+  private static final Logger LOG =
+    LogFormatter.getLogger(PrefixURLFilter.class.getName());
+
+  // read in attribute "file" of this plugin.
+  private static String attributeFile = null;
+
+  private TrieStringMatcher trie;
+
+  private Configuration conf;
+
+  /**
+   * Creates a filter without any prefix. Until {@link #setConf} loads the
+   * configured prefixes, {@link #filter} rejects every url.
+   */
+  public PrefixURLFilter() throws IOException {
+    // start from an empty matcher so filter() cannot hit a null trie
+    trie = new PrefixStringMatcher(new String[0]);
+  }
+
+  /**
+   * Creates a filter from the prefixes listed in the named file.
+   *
+   * @param filename path of the prefix file
+   * @throws IOException if the file cannot be opened or read
+   */
+  public PrefixURLFilter(String filename) throws IOException {
+    trie = readConfigurationFile(new FileReader(filename));
+  }
+
+  /**
+   * Accepts a url only if one of the configured prefixes matches it.
+   *
+   * @param url the url to check
+   * @return the url itself when a prefix matches, null otherwise
+   */
+  public String filter(String url) {
+    if (trie.shortestMatch(url) == null) {
+      return null;
+    }
+    return url;
+  }
+
+  /**
+   * Reads url prefixes, one per line, and builds a matcher from them.
+   * The reader is always closed before this method returns.
+   *
+   * @param reader source of the prefix list
+   * @return a matcher over all prefixes read
+   * @throws IOException if reading fails
+   */
+  private TrieStringMatcher readConfigurationFile(Reader reader)
+    throws IOException {
+
+    BufferedReader in=new BufferedReader(reader);
+    List urlprefixes = new ArrayList();
+
+    try {
+      String line;
+      while((line=in.readLine())!=null) {
+        if (line.length() == 0)
+          continue;
+
+        char first=line.charAt(0);
+        switch (first) {
+        case ' ' : case '\n' : case '#' :         // skip blank & comment lines
+          continue;
+        default :
+          urlprefixes.add(line);
+        }
+      }
+    } finally {
+      // release the underlying file / resource stream in every case
+      in.close();
+    }
+
+    return new PrefixStringMatcher(urlprefixes);
+  }
+
+  /**
+   * Reads urls from standard input and echoes those accepted by the
+   * filter. The optional first argument names the prefix file; without it
+   * the filter has no prefixes and nothing is echoed.
+   */
+  public static void main(String args[])
+    throws IOException {
+
+    PrefixURLFilter filter;
+    if (args.length >= 1)
+      filter = new PrefixURLFilter(args[0]);
+    else
+      filter = new PrefixURLFilter();
+
+    BufferedReader in=new BufferedReader(new InputStreamReader(System.in));
+    String line;
+    while((line=in.readLine())!=null) {
+      String out=filter.filter(line);
+      if(out!=null) {
+        System.out.println(out);
+      }
+    }
+  }
+
+  /**
+   * Stores the configuration and (re)loads the prefix list. The file name
+   * comes from attribute "file" of this plugin's extension when defined,
+   * otherwise from property "urlfilter.prefix.file". When no file name is
+   * available or the resource cannot be found, the filter ends up with no
+   * prefixes.
+   *
+   * @param conf the configuration to use
+   * @throws RuntimeException if the prefix file cannot be read
+   */
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+
+    // the field is static: forget what an earlier call may have found so
+    // a stale attribute cannot override this configuration
+    attributeFile = null;
+
+    String pluginName = "urlfilter-prefix";
+    Extension[] extensions = PluginRepository.get(conf).getExtensionPoint(
+        URLFilter.class.getName()).getExtensions();
+    for (int i = 0; i < extensions.length; i++) {
+      Extension extension = extensions[i];
+      if (extension.getDescriptor().getPluginId().equals(pluginName)) {
+        attributeFile = extension.getAttribute("file");
+        break;
+      }
+    }
+    if (attributeFile != null && attributeFile.trim().equals(""))
+      attributeFile = null;
+    if (attributeFile != null) {
+      LOG.info("Attribute \"file\" is defined for plugin " + pluginName
+          + " as " + attributeFile);
+    } else {
+      // LOG.warning("Attribute \"file\" is not defined in plugin.xml for
+      // plugin "+pluginName);
+    }
+
+    String file = conf.get("urlfilter.prefix.file");
+    // attribute "file" takes precedence if defined
+    if (attributeFile != null)
+      file = attributeFile;
+
+    // without any file name there is nothing to look up
+    Reader reader = (file == null) ? null : conf.getConfResourceAsReader(file);
+
+    if (reader == null) {
+      trie = new PrefixStringMatcher(new String[0]);
+    } else {
+      try {
+        trie = readConfigurationFile(reader);
+      } catch (IOException e) {
+        LOG.severe(e.getMessage());
+        // TODO mb@media-style.com: throw Exception? Because broken api.
+        throw new RuntimeException(e.getMessage(), e);
+      }
+    }
+  }
+
+  public Configuration getConf() {
+    return this.conf;
+  }
+
+}

Propchange: lucene/nutch/trunk/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/nutch/trunk/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/package.html
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/package.html?rev=389901&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/package.html (added)
+++ lucene/nutch/trunk/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/package.html Wed Mar 29 14:09:03 2006
@@ -0,0 +1,5 @@
+<html>
+<body>
+<p>A url filter plugin.</p><p></p>
+</body>
+</html>

Propchange: lucene/nutch/trunk/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/package.html
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: lucene/nutch/trunk/src/plugin/urlfilter-regex/plugin.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/urlfilter-regex/plugin.xml?rev=389901&r1=389900&r2=389901&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/urlfilter-regex/plugin.xml (original)
+++ lucene/nutch/trunk/src/plugin/urlfilter-regex/plugin.xml Wed Mar 29 14:09:03 2006
@@ -20,7 +20,7 @@
               name="Nutch Regex URL Filter"
               point="org.apache.nutch.net.URLFilter">
       <implementation id="RegexURLFilter"
-                      class="org.apache.nutch.net.RegexURLFilter"/>
+                      class="org.apache.nutch.urlfilter.regex.RegexURLFilter"/>
       <!-- by default, attribute "file" is undefined, to keep classic behavior.
       <implementation id="RegexURLFilter"
                       class="org.apache.nutch.net.RegexURLFilter"

Added: lucene/nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java?rev=389901&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java (added)
+++ lucene/nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java Wed Mar 29 14:09:03 2006
@@ -0,0 +1,90 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.urlfilter.regex;
+
+// JDK imports
+import java.io.Reader;
+import java.io.IOException;
+import java.util.regex.Pattern;
+import java.util.regex.PatternSyntaxException;
+
+// Hadoop imports
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.net.*;
+import org.apache.nutch.urlfilter.api.RegexRule;
+import org.apache.nutch.urlfilter.api.RegexURLFilterBase;
+
+
+/**
+ * Filters URLs based on a file of regular expressions using the
+ * {@link java.util.regex Java Regex implementation}.
+ *
+ * <p>The rules file is looked up through the
+ * <code>urlfilter.regex.file</code> configuration property, and every rule
+ * is compiled into a {@link Pattern} once, when the rule is created.</p>
+ */
+public class RegexURLFilter extends RegexURLFilterBase {
+
+  /**
+   * Creates a filter using the no-argument constructor of the base class.
+   */
+  public RegexURLFilter() {
+    super();
+  }
+
+  /**
+   * Creates a filter from a file name, which is handed unchanged to the
+   * base class constructor.
+   *
+   * @param filename forwarded to the base class (presumably the name of
+   *        the rules file to load).
+   * @throws IOException propagated from the base class constructor.
+   * @throws PatternSyntaxException if a regular expression cannot be
+   *         compiled.
+   */
+  public RegexURLFilter(String filename)
+    throws IOException, PatternSyntaxException {
+    super(filename);
+  }
+
+  /**
+   * Creates a filter from a reader, which is handed unchanged to the base
+   * class constructor. Package-private: it is used by the tests living in
+   * this package.
+   *
+   * @param reader forwarded to the base class (presumably the source of
+   *        the rules).
+   * @throws IOException propagated from the base class constructor.
+   * @throws IllegalArgumentException declared for callers; note that
+   *         {@link PatternSyntaxException} is a subclass of it.
+   */
+  RegexURLFilter(Reader reader)
+    throws IOException, IllegalArgumentException {
+    super(reader);
+  }
+
+  
+  /* ----------------------------------- *
+   * <implementation:RegexURLFilterBase> *
+   * ----------------------------------- */
+  
+  /**
+   * Returns the value of the <code>urlfilter.regex.file</code> property.
+   *
+   * @param conf the configuration to read the property from.
+   * @return the configured value, exactly as returned by
+   *         <code>conf.get</code>.
+   */
+  protected String getRulesFile(Configuration conf) {
+    return conf.get("urlfilter.regex.file");
+  }
+
+  /**
+   * Builds a rule backed by <code>java.util.regex</code>.
+   *
+   * @param sign the sign of the rule, passed through to {@link RegexRule}.
+   * @param regex the regular expression of the rule.
+   * @return a new rule whose pattern is already compiled.
+   * @throws PatternSyntaxException if <code>regex</code> is not a valid
+   *         regular expression.
+   */
+  protected RegexRule createRule(boolean sign, String regex) {
+    return new Rule(sign, regex);
+  }
+  
+  /* ------------------------------------ *
+   * </implementation:RegexURLFilterBase> *
+   * ------------------------------------ */
+
+  
+  /**
+   * Command line entry point: hands a fresh <code>RegexURLFilter</code>
+   * and the arguments to the inherited two-argument <code>main</code>.
+   *
+   * @param args the command line arguments, forwarded as is.
+   * @throws IOException propagated from the inherited <code>main</code>.
+   */
+  public static void main(String args[]) throws IOException {
+    main(new RegexURLFilter(), args);
+  }
+
+
+  /**
+   * A rule holding a pre-compiled {@link Pattern}.
+   *
+   * <p>Declared <code>static</code> because it never uses the enclosing
+   * filter: a non-static inner class would make every rule carry a hidden
+   * reference to the <code>RegexURLFilter</code> that created it.</p>
+   */
+  private static class Rule extends RegexRule {
+    
+    /** Compiled form of the rule's regular expression; never reassigned. */
+    private final Pattern pattern;
+    
+    /**
+     * @param sign the sign of the rule, passed through to the superclass.
+     * @param regex the regular expression, compiled here once.
+     * @throws PatternSyntaxException if <code>regex</code> is invalid.
+     */
+    Rule(boolean sign, String regex) {
+      super(sign, regex);
+      pattern = Pattern.compile(regex);
+    }
+
+    /**
+     * Tells whether the pattern occurs anywhere in the url.
+     *
+     * @param url the url to check.
+     * @return <code>true</code> if some part of <code>url</code> matches.
+     */
+    protected boolean match(String url) {
+      // find() rather than matches(): the expression does not have to
+      // cover the whole url unless it is explicitly anchored.
+      return pattern.matcher(url).find();
+    }
+  }
+  
+}

Propchange: lucene/nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/package.html
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/package.html?rev=389901&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/package.html (added)
+++ lucene/nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/package.html Wed Mar 29 14:09:03 2006
@@ -0,0 +1,5 @@
+<html>
+<body>
+<p>A URL filter plugin that filters URLs using Java regular expressions (<code>java.util.regex</code>).</p>
+</body>
+</html>

Propchange: lucene/nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/package.html
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/nutch/trunk/src/plugin/urlfilter-regex/src/test/org/apache/nutch/urlfilter/regex/TestRegexURLFilter.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/urlfilter-regex/src/test/org/apache/nutch/urlfilter/regex/TestRegexURLFilter.java?rev=389901&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/urlfilter-regex/src/test/org/apache/nutch/urlfilter/regex/TestRegexURLFilter.java (added)
+++ lucene/nutch/trunk/src/plugin/urlfilter-regex/src/test/org/apache/nutch/urlfilter/regex/TestRegexURLFilter.java Wed Mar 29 14:09:03 2006
@@ -0,0 +1,70 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.urlfilter.regex;
+
+// JDK imports
+import java.io.IOException;
+import java.io.Reader;
+
+// JUnit imports
+import junit.framework.Test;
+import junit.framework.TestSuite;
+import junit.textui.TestRunner;
+import org.apache.nutch.net.*;
+
+// Nutch imports
+import org.apache.nutch.urlfilter.api.RegexURLFilterBaseTest;
+
+
+/**
+ * JUnit based test of class <code>RegexURLFilter</code>.
+ *
+ * @author J&eacute;r&ocirc;me Charron
+ */
+public class TestRegexURLFilter extends RegexURLFilterBaseTest {
+  
+  public TestRegexURLFilter(String testName) {
+    super(testName);
+  }
+  
+  public static Test suite() {
+    return new TestSuite(TestRegexURLFilter.class);
+  }
+  
+  public static void main(String[] args) {
+    TestRunner.run(suite());
+  }
+
+  protected URLFilter getURLFilter(Reader rules) {
+    try {
+      return new RegexURLFilter(rules);
+    } catch (IOException e) {
+      fail(e.toString());
+      return null;
+    }
+  }
+  
+  public void test() {
+    test("WholeWebCrawling");
+    test("IntranetCrawling");
+    bench(50, "Benchmarks");
+    bench(100, "Benchmarks");
+    bench(200, "Benchmarks");
+    bench(400, "Benchmarks");
+    bench(800, "Benchmarks");
+  }
+
+}

Propchange: lucene/nutch/trunk/src/plugin/urlfilter-regex/src/test/org/apache/nutch/urlfilter/regex/TestRegexURLFilter.java
------------------------------------------------------------------------------
    svn:eol-style = native