You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by je...@apache.org on 2006/03/30 00:09:08 UTC
svn commit: r389901 - in /lucene/nutch/trunk: ./
src/plugin/lib-regex-filter/src/java/org/apache/nutch/net/
src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/
src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/
src/plugin...
Author: jerome
Date: Wed Mar 29 14:09:03 2006
New Revision: 389901
URL: http://svn.apache.org/viewcvs?rev=389901&view=rev
Log:
Refactor some plugins packages:
* urlfilter-prefix package moved from org.apache.nutch.net to org.apache.nutch.urlfilter.prefix
* urlfilter-automaton package moved from org.apache.nutch.net to org.apache.nutch.urlfilter.automaton
* urlfilter-regex package moved from org.apache.nutch.net to org.apache.nutch.urlfilter.regex
* lib-regex-filter package moved from org.apache.nutch.net to org.apache.nutch.urlfilter.api
* ontology package moved from org.apache.nutch.ontology to org.apache.nutch.ontology.jena
Added:
lucene/nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/
lucene/nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/
lucene/nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexRule.java (with props)
lucene/nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java (with props)
lucene/nutch/trunk/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/
lucene/nutch/trunk/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/
lucene/nutch/trunk/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java (with props)
lucene/nutch/trunk/src/plugin/ontology/src/java/org/apache/nutch/ontology/jena/
lucene/nutch/trunk/src/plugin/ontology/src/java/org/apache/nutch/ontology/jena/OntologyImpl.java (with props)
lucene/nutch/trunk/src/plugin/ontology/src/java/org/apache/nutch/ontology/jena/OwlParser.java (with props)
lucene/nutch/trunk/src/plugin/ontology/src/java/org/apache/nutch/ontology/jena/Parser.java (with props)
lucene/nutch/trunk/src/plugin/ontology/src/test/org/apache/nutch/ontology/jena/
lucene/nutch/trunk/src/plugin/ontology/src/test/org/apache/nutch/ontology/jena/TestOntology.java (with props)
lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/
lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/automaton/
lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/automaton/AutomatonURLFilter.java (with props)
lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/automaton/package.html (with props)
lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/test/org/apache/nutch/urlfilter/
lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/test/org/apache/nutch/urlfilter/automaton/
lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/test/org/apache/nutch/urlfilter/automaton/TestAutomatonURLFilter.java (with props)
lucene/nutch/trunk/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/
lucene/nutch/trunk/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/
lucene/nutch/trunk/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java (with props)
lucene/nutch/trunk/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/package.html (with props)
lucene/nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/
lucene/nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/
lucene/nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java (with props)
lucene/nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/package.html (with props)
lucene/nutch/trunk/src/plugin/urlfilter-regex/src/test/org/apache/nutch/urlfilter/
lucene/nutch/trunk/src/plugin/urlfilter-regex/src/test/org/apache/nutch/urlfilter/regex/
lucene/nutch/trunk/src/plugin/urlfilter-regex/src/test/org/apache/nutch/urlfilter/regex/TestRegexURLFilter.java (with props)
Removed:
lucene/nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/net/
lucene/nutch/trunk/src/plugin/lib-regex-filter/src/test/org/apache/nutch/net/
lucene/nutch/trunk/src/plugin/ontology/src/java/org/apache/nutch/ontology/OntologyImpl.java
lucene/nutch/trunk/src/plugin/ontology/src/java/org/apache/nutch/ontology/OwlParser.java
lucene/nutch/trunk/src/plugin/ontology/src/java/org/apache/nutch/ontology/Parser.java
lucene/nutch/trunk/src/plugin/ontology/src/test/org/apache/nutch/ontology/TestOntology.java
lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/net/
lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/test/org/apache/nutch/net/
lucene/nutch/trunk/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/net/
lucene/nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/net/
lucene/nutch/trunk/src/plugin/urlfilter-regex/src/test/org/apache/nutch/net/
Modified:
lucene/nutch/trunk/default.properties
lucene/nutch/trunk/src/plugin/ontology/plugin.xml
lucene/nutch/trunk/src/plugin/urlfilter-automaton/plugin.xml
lucene/nutch/trunk/src/plugin/urlfilter-prefix/plugin.xml
lucene/nutch/trunk/src/plugin/urlfilter-regex/plugin.xml
Modified: lucene/nutch/trunk/default.properties
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/default.properties?rev=389901&r1=389900&r2=389901&view=diff
==============================================================================
--- lucene/nutch/trunk/default.properties (original)
+++ lucene/nutch/trunk/default.properties Wed Mar 29 14:09:03 2006
@@ -61,13 +61,13 @@
plugin.js=org.apache.nutch.parse.js*
plugin.language=org.apache.nutch.analysis.lang*
plugin.libhttp=org.apache.nutch.protocol.http.api*
+plugin.liburlfilter=org.apache.nutch.urlfilter.api*
plugin.more=org.apache.nutch.indexer.more*:org.apache.nutch.searcher.more*
plugin.mp3=org.apache.nutch.parse.mp3*
plugin.msexcel=org.apache.nutch.parse.msexcel*
plugin.mspowerpoint=org.apache.nutch.parse.mspowerpoint*
plugin.msword=org.apache.nutch.parse.msword*
-# Unfortunately, ontology on core and plugin uses the same package:
-# plugin.ontology=org.apache.nutch.ontology*
+plugin.ontology.jena=org.apache.nutch.ontology.jena*
plugin.parsems=org.apache.nutch.parse.ms*
plugin.pdf=org.apache.nutch.parse.pdf*
plugin.reltag=org.apache.nutch.microformats.reltag*
@@ -77,6 +77,9 @@
plugin.swf=org.apache.nutch.parse.swf*
plugin.text=org.apache.nutch.parse.text*
plugin.url=org.apache.nutch.searcher.url*
+plugin.urlfilter.automaton=org.apache.nutch.urlfilter.automaton*
+plugin.urlfilter.prefix=org.apache.nutch.urlfilter.prefix*
+plugin.urlfilter.regex=org.apache.nutch.urlfilter.regex*
plugin.zip=org.apache.nutch.parse.zip*
plugins.packages=\
@@ -92,11 +95,13 @@
${plugin.js}:\
${plugin.language}:\
${plugin.libhttp}:\
+ ${plugin.liburlfilter}:\
${plugin.more}:\
${plugin.mp3}:\
${plugin.msexcel}:\
${plugin.mspowerpoint}:\
${plugin.msword}:\
+ ${plugin.ontology.jena}:\
${plugin.parsems}:\
${plugin.pdf}:\
${plugin.reltag}:\
@@ -106,4 +111,7 @@
${plugin.swf}:\
${plugin.text}:\
${plugin.url}:\
+ ${plugin.urlfilter.automaton}:\
+ ${plugin.urlfilter.prefix}:\
+ ${plugin.urlfilter.regex}:\
${plugin.zip}
Added: lucene/nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexRule.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexRule.java?rev=389901&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexRule.java (added)
+++ lucene/nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexRule.java Wed Mar 29 14:09:03 2006
@@ -0,0 +1,63 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.urlfilter.api;
+
+import org.apache.nutch.net.*;
+
+
+/**
+ * A generic regular expression rule.
+ *
+ * @author Jérôme Charron
+ */
+public abstract class RegexRule {
+
+  private final boolean sign;  // true: matching urls are accepted
+  private final String regex;  // expression source; not read by this class
+
+  /**
+   * Constructs a new regular expression rule.
+   *
+   * @param sign specifies if this rule must filter-in or filter-out.
+   *        A <code>true</code> value means that any url matching this rule
+   *        must be accepted, a <code>false</code> value means that any url
+   *        matching this rule must be rejected.
+   * @param regex is the regular expression used for matching (see
+   *        {@link #match(String)} method).
+   */
+  protected RegexRule(boolean sign, String regex) {
+    this.sign = sign;
+    this.regex = regex;
+  }
+
+  /**
+   * Tells whether this rule is used for filtering-in or filtering-out.
+   *
+   * @return <code>true</code> if any url matching this rule must be accepted,
+   *         otherwise <code>false</code>.
+   */
+  protected boolean accept() { return sign; }
+
+  /**
+   * Checks if a url matches this rule.
+   * @param url is the url to check.
+   * @return <code>true</code> if the specified url matches this rule,
+   *         otherwise <code>false</code>.
+   */
+  protected abstract boolean match(String url);
+
+}
+
Propchange: lucene/nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexRule.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java?rev=389901&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java (added)
+++ lucene/nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java Wed Mar 29 14:09:03 2006
@@ -0,0 +1,217 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.urlfilter.api;
+
+// JDK imports
+import java.io.Reader;
+import java.io.FileReader;
+import java.io.BufferedReader;
+import java.io.InputStreamReader;
+import java.io.IOException;
+import java.util.List;
+import java.util.ArrayList;
+import java.util.logging.Logger;
+
+// Hadoop imports
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.util.LogFormatter;
+import org.apache.nutch.net.*;
+
+
+/**
+ * Generic {@link org.apache.nutch.net.URLFilter URL filter} based on
+ * regular expressions.
+ *
+ * <p>The regular expressions rules are expressed in a file. The file of rules
+ * is provided by each implementation using the
+ * {@link #getRulesFile(Configuration)} method.</p>
+ *
+ * <p>The file contains one rule per line, each in the following format:<br/>
+ * <code>
+ * [+-]&lt;regex&gt;
+ * </code><br/>
+ * where plus (<code>+</code>) means go ahead and index it and minus
+ * (<code>-</code>) means no.</p>
+ *
+ * @author Jérôme Charron
+ */
+public abstract class RegexURLFilterBase implements URLFilter {
+
+ /** My logger */
+ private final static Logger LOG =
+ LogFormatter.getLogger(RegexURLFilterBase.class.getName());
+
+ /** An array of applicable rules */
+ private RegexRule[] rules;
+
+ /** The current configuration */
+ private Configuration conf;
+
+
+ /**
+ * Constructs a new empty RegexURLFilterBase
+ */
+ public RegexURLFilterBase() { }
+
+ /**
+ * Constructs a new RegexURLFilter and init it with a file of rules.
+ * @param filename is the name of rules file.
+ */
+ public RegexURLFilterBase(String filename)
+ throws IOException, IllegalArgumentException {
+ this(new FileReader(filename));
+ }
+
+ /**
+ * Constructs a new RegexURLFilter and init it with a Reader of rules.
+ * @param reader is a reader of rules.
+ */
+ protected RegexURLFilterBase(Reader reader)
+ throws IOException, IllegalArgumentException {
+ rules = readRulesFile(reader);
+ }
+
+ /**
+ * Creates a new {@link RegexRule}.
+ * @param sign of the regular expression.
+ * A <code>true</code> value means that any URL matching this rule
+ * must be included, whereas a <code>false</code>
+ * value means that any URL matching this rule must be excluded.
+ * @param regex is the regular expression associated to this rule.
+ */
+ protected abstract RegexRule createRule(boolean sign, String regex);
+
+ /**
+ * Returns the name of the file of rules to use for
+ * a particular implementation.
+ * @param conf is the current configuration.
+ * @return the name of the file of rules to use.
+ */
+ protected abstract String getRulesFile(Configuration conf);
+
+
+ /* -------------------------- *
+ * <implementation:URLFilter> *
+ * -------------------------- */
+
+ // Inherited Javadoc
+ public synchronized String filter(String url) {
+ for (int i=0; i<rules.length; i++) {
+ if (rules[i].match(url)) {
+ return rules[i].accept() ? url : null;
+ }
+ };
+ return null;
+ }
+
+ /* --------------------------- *
+ * </implementation:URLFilter> *
+ * --------------------------- */
+
+
+ /* ----------------------------- *
+ * <implementation:Configurable> *
+ * ----------------------------- */
+
+ /**
+  * Stores the configuration and loads the rules from the resource named by
+  * {@link #getRulesFile(Configuration)}.
+  *
+  * <p>If the resource cannot be found, a severe message is logged and the
+  * current rules are left untouched.</p>
+  *
+  * @param conf is the configuration to use.
+  * @throws RuntimeException if the rules resource cannot be read.
+  */
+ public void setConf(Configuration conf) {
+   this.conf = conf;
+   String file = getRulesFile(conf);
+   Reader reader = conf.getConfResourceAsReader(file);
+   if (reader == null) {
+     LOG.severe("Can't find resource: " + file);
+     return;
+   }
+   try {
+     rules = readRulesFile(reader);
+   } catch (IOException e) {
+     LOG.severe(e.getMessage());
+     //TODO mb@media-style.com: throw Exception? Because broken api.
+     throw new RuntimeException(e.getMessage(), e);
+   } finally {
+     // The reader was obtained here, so it is released here on every path.
+     try {
+       reader.close();
+     } catch (IOException ignored) {
+       // Nothing useful can be done if closing fails.
+     }
+   }
+ }
+
+ public Configuration getConf() {
+ return this.conf;
+ }
+
+ /* ------------------------------ *
+ * </implementation:Configurable> *
+ * ------------------------------ */
+
+
+ /**
+ * Reads rules from the specified reader, one rule per line.
+ *
+ * <p>Empty lines and lines starting with a space or <code>#</code> are
+ * skipped. Every other line must start with <code>+</code> or
+ * <code>-</code>; the rest of the line is handed as the regular expression
+ * to {@link #createRule(boolean, String)}. The reader is not closed by
+ * this method.</p>
+ *
+ * @param reader is a reader of regular expressions rules.
+ * @return the corresponding {@link RegexRule rules}, in reading order.
+ * @throws IOException if reading fails or if a line starts with an
+ * unexpected character.
+ * @throws IllegalArgumentException presumably raised by
+ * {@link #createRule(boolean, String)} implementations for an invalid
+ * expression (to be confirmed against the implementations).
+ */
+ private RegexRule[] readRulesFile(Reader reader)
+ throws IOException, IllegalArgumentException {
+
+ BufferedReader in = new BufferedReader(reader);
+ List rules = new ArrayList();
+ String line;
+
+ while((line=in.readLine())!=null) {
+ if (line.length() == 0) {
+ continue;
+ }
+ // The first character selects the kind of line.
+ char first=line.charAt(0);
+ boolean sign=false;
+ switch (first) {
+ case '+' :
+ sign=true;
+ break;
+ case '-' :
+ sign=false;
+ break;
+ case ' ' : case '\n' : case '#' : // skip blank & comment lines
+ continue;
+ default :
+ throw new IOException("Invalid first character: "+line);
+ }
+
+ // Everything after the sign character is the expression itself.
+ String regex = line.substring(1);
+ LOG.fine("Adding rule [" + regex + "]");
+ RegexRule rule = createRule(sign, regex);
+ rules.add(rule);
+ }
+ return (RegexRule[]) rules.toArray(new RegexRule[rules.size()]);
+ }
+
+ /**
+ * Filter the standard input using a RegexURLFilterBase.
+ * @param filter is the RegexURLFilterBase to use for filtering the
+ * standard input.
+ * @param args some optional parameters (not used).
+ */
+ public static void main(RegexURLFilterBase filter, String args[])
+ throws IOException, IllegalArgumentException {
+
+ BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
+ String line;
+ while((line=in.readLine())!=null) {
+ String out = filter.filter(line);
+ if (out!=null) {
+ System.out.print("+");
+ System.out.println(out);
+ } else {
+ System.out.print("-");
+ System.out.println(line);
+ }
+ }
+ }
+
+}
Propchange: lucene/nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/nutch/trunk/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java?rev=389901&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java (added)
+++ lucene/nutch/trunk/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java Wed Mar 29 14:09:03 2006
@@ -0,0 +1,142 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.urlfilter.api;
+
+// JDK imports
+import java.io.BufferedReader;
+import java.io.FileReader;
+import java.io.IOException;
+import java.io.Reader;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.logging.Logger;
+
+// JUnit imports
+import junit.framework.TestCase;
+
+// Hadoop imports
+import org.apache.hadoop.util.LogFormatter;
+
+// Nutch imports
+import org.apache.nutch.net.URLFilter;
+
+
+/**
+ * JUnit based test of class <code>RegexURLFilterBase</code>.
+ *
+ * @author Jérôme Charron
+ */
+public abstract class RegexURLFilterBaseTest extends TestCase {
+
+ /** My logger */
+ protected static final Logger LOG =
+ LogFormatter.getLogger(RegexURLFilterBaseTest.class.getName());
+
+ private final static String SEPARATOR = System.getProperty("file.separator");
+ private final static String SAMPLES = System.getProperty("test.data", ".");
+
+ public RegexURLFilterBaseTest(String testName) {
+ super(testName);
+ }
+
+ protected abstract URLFilter getURLFilter(Reader rules);
+
+ protected void bench(int loops, String file) {
+ try {
+ bench(loops,
+ new FileReader(SAMPLES + SEPARATOR + file + ".rules"),
+ new FileReader(SAMPLES + SEPARATOR + file + ".urls"));
+ } catch (Exception e) {
+ fail(e.toString());
+ }
+ }
+
+ protected void bench(int loops, Reader rules, Reader urls) {
+ long start = System.currentTimeMillis();
+ try {
+ URLFilter filter = getURLFilter(rules);
+ FilteredURL[] expected = readURLFile(urls);
+ for (int i=0; i<loops; i++) {
+ test(filter, expected);
+ }
+ } catch (Exception e) {
+ fail(e.toString());
+ }
+ LOG.info("bench time (" + loops + ") " +
+ (System.currentTimeMillis()-start) + "ms");
+ }
+
+ protected void test(String file) {
+ try {
+ test(new FileReader(SAMPLES + SEPARATOR + file + ".rules"),
+ new FileReader(SAMPLES + SEPARATOR + file + ".urls"));
+ } catch (Exception e) {
+ fail(e.toString());
+ }
+ }
+
+ protected void test(Reader rules, Reader urls) {
+ try {
+ test(getURLFilter(rules), readURLFile(urls));
+ } catch (Exception e) {
+ fail(e.toString());
+ }
+ }
+
+ /**
+  * Checks that the filter accepts exactly the urls expected to be accepted.
+  * The url itself is used as the assertion message.
+  */
+ protected void test(URLFilter filter, FilteredURL[] expected) {
+   for (int idx = 0; idx < expected.length; idx++) {
+     FilteredURL sample = expected[idx];
+     boolean accepted = (filter.filter(sample.url) != null);
+     if (!accepted) {
+       assertFalse(sample.url, sample.sign);
+       continue;
+     }
+     assertTrue(sample.url, sample.sign);
+   }
+ }
+
+ /** Reads one expected result per non-empty line of the reader. */
+ private static FilteredURL[] readURLFile(Reader reader) throws IOException {
+   BufferedReader lines = new BufferedReader(reader);
+   List samples = new ArrayList();
+   for (String row = lines.readLine(); row != null; row = lines.readLine()) {
+     if (row.length() == 0) {
+       continue;
+     }
+     samples.add(new FilteredURL(row));
+   }
+   FilteredURL[] result = new FilteredURL[samples.size()];
+   return (FilteredURL[]) samples.toArray(result);
+ }
+
+ /** A url together with the verdict the filter is expected to give. */
+ private static class FilteredURL {
+
+   boolean sign;
+   String url;
+
+   FilteredURL(String line) {
+     // Only a leading '+' marks an accepted url; '-' and any other
+     // character leave the expectation at "rejected".
+     sign = (line.charAt(0) == '+');
+     // The first character is always dropped from the url.
+     url = line.substring(1);
+   }
+ }
+
+}
Propchange: lucene/nutch/trunk/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: lucene/nutch/trunk/src/plugin/ontology/plugin.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/ontology/plugin.xml?rev=389901&r1=389900&r2=389901&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/ontology/plugin.xml (original)
+++ lucene/nutch/trunk/src/plugin/ontology/plugin.xml Wed Mar 29 14:09:03 2006
@@ -24,13 +24,13 @@
<!-- attribute "point" is the plugin interface class -->
<!-- seems kinda redundant to have to define the point here too -->
- <extension id="org.apache.nutch.ontology.OntologyImpl"
+ <extension id="org.apache.nutch.ontology.jena"
name="Ontology Model Loader"
point="org.apache.nutch.ontology.Ontology">
<!-- define all the classes that implement the point defined above -->
- <implementation id="org.apache.nutch.ontology.OntologyImpl"
- class="org.apache.nutch.ontology.OntologyImpl"
+ <implementation id="org.apache.nutch.ontology.jena.OntologyImpl"
+ class="org.apache.nutch.ontology.jena.OntologyImpl"
pathSuffix=""/>
</extension>
Added: lucene/nutch/trunk/src/plugin/ontology/src/java/org/apache/nutch/ontology/jena/OntologyImpl.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/ontology/src/java/org/apache/nutch/ontology/jena/OntologyImpl.java?rev=389901&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/ontology/src/java/org/apache/nutch/ontology/jena/OntologyImpl.java (added)
+++ lucene/nutch/trunk/src/plugin/ontology/src/java/org/apache/nutch/ontology/jena/OntologyImpl.java Wed Mar 29 14:09:03 2006
@@ -0,0 +1,360 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.ontology.jena;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.util.LogFormatter;
+import org.apache.nutch.ontology.*;
+import org.apache.nutch.util.NutchConfiguration;
+
+import com.hp.hpl.jena.ontology.Individual;
+import com.hp.hpl.jena.ontology.OntClass;
+import com.hp.hpl.jena.ontology.OntModel;
+import com.hp.hpl.jena.ontology.OntModelSpec;
+import com.hp.hpl.jena.ontology.OntResource;
+import com.hp.hpl.jena.ontology.Restriction;
+import com.hp.hpl.jena.rdf.model.Literal;
+import com.hp.hpl.jena.rdf.model.Resource;
+import com.hp.hpl.jena.rdf.model.ModelFactory;
+import com.hp.hpl.jena.shared.PrefixMapping;
+
+import java.util.Map;
+import java.util.HashMap;
+import java.util.Hashtable;
+import java.util.Iterator;
+import java.util.List;
+import java.util.LinkedList;
+
+import java.util.logging.Logger;
+
+import java.io.PrintStream;
+
+/**
+ * This class wraps a model
+ * built from a list of ontologies;
+ * it uses HP's Jena.
+ *
+ * @author michael j pan
+ */
+public class OntologyImpl implements org.apache.nutch.ontology.Ontology {
+ public static final Logger LOG =
+ LogFormatter.getLogger("org.apache.nutch.ontology.Ontology");
+
+ public final static String DELIMITER_SEARCHTERM = " ";
+
+ private static Hashtable searchTerms = new Hashtable();
+ private static Parser parser;
+
+ //private static Object ontologyModel;
+ private static OntModel ontologyModel;
+
+ private static Ontology ontology = null;
+
+ private static Map m_anonIDs = new HashMap();
+ private static int m_anonCount = 0;
+
+ public OntologyImpl() {
+ //only initialize all the static variables
+ //if first time called to this ontology constructor
+ if (ontology == null) {
+ LOG.info( "creating new ontology");
+ parser = new OwlParser();
+ ontology = this;
+ }
+
+ if (ontologyModel == null)
+ ontologyModel =
+ ModelFactory.createOntologyModel(OntModelSpec.OWL_MEM, null);
+ //ModelFactory.createOntologyModel();
+ }
+
+ public static Ontology getInstance () {
+ if (ontology == null) {
+ //ontology = new org.apache.nutch.ontology.Ontology();
+ ontology = new org.apache.nutch.ontology.jena.OntologyImpl();
+ }
+ return ontology;
+ }
+
+ public void load (String[] urls) {
+ for (int i=0; i<urls.length; i++) {
+ String url = urls[i].trim();
+ if (!url.equals(""))
+ load(ontologyModel, url);
+ }
+ parser.parse(ontologyModel);
+ }
+
+ /**
+  * Reads one ontology url into the given model. Any failure is logged
+  * (message and stack frames) and otherwise ignored.
+  */
+ private void load (Object m, String url) {
+   try {
+     LOG.info( "reading "+url);
+     OntModel model = (OntModel) m;
+     model.read(url);
+   } catch (Exception failure) {
+     LOG.severe("failed on attempting to read ontology "+url);
+     LOG.severe(failure.getMessage());
+     StackTraceElement[] frames = failure.getStackTrace();
+     int idx = 0;
+     while (idx < frames.length) {
+       LOG.severe(frames[idx++].toString());
+     }
+   }
+ }
+
+ public static Parser getParser() {
+ if (parser == null) {
+ parser = new OwlParser();
+ }
+ return parser;
+ }
+
+ public static OntModel getModel() {
+ return (OntModel)ontologyModel;
+ }
+
+ // not yet implemented
+ //public void merge (org.apache.nutch.ontology.Ontology o) {
+ //}
+
+ /**
+  * Retrieves the labels related to the entity(ies) hashed to the search
+  * term: for a class, the labels of its subclasses and of its instances;
+  * for an individual, the labels of its "same as" resources.
+  *
+  * @param entitySearchTerm the search term, looked up through
+  *        {@link #retrieve(String)}.
+  * @return an iterator over the distinct labels found.
+  */
+ public Iterator subclasses (String entitySearchTerm) {
+   Map classMap = retrieve(entitySearchTerm);
+   Map subclasses = new HashMap();
+
+   for (Iterator iter = classMap.keySet().iterator(); iter.hasNext();) {
+     OntResource resource = (OntResource) iter.next();
+
+     if (resource instanceof OntClass) {
+       OntClass cls = (OntClass) resource;
+       //get subclasses
+       collectLabels(cls.listSubClasses(), subclasses);
+       //get individuals
+       collectLabels(cls.listInstances(), subclasses);
+     } else if (resource instanceof Individual) {
+       collectLabels(resource.listSameAs(), subclasses);
+     }
+   }
+
+   return subclasses.keySet().iterator();
+ }
+
+ /** Adds each label of each resource of the iterator as a key of the map. */
+ private static void collectLabels(Iterator resources, Map labels) {
+   while (resources.hasNext()) {
+     OntResource related = (OntResource) resources.next();
+     for (Iterator j = related.listLabels(null); j.hasNext();) {
+       Literal label = (Literal) j.next();
+       labels.put(label.toString(), "1");
+     }
+   }
+ }
+
+ /**
+  * Retrieves the synonyms of a key phrase: the labels of the resource(s)
+  * hashed to the phrase, plus the labels of their "same as" resources
+  * (for individuals) or of their equivalent classes (for classes).
+  *
+  * @param queryKeyPhrase the phrase to look up; runs of whitespace are
+  *        replaced by <code>+</code> before the lookup.
+  * @return an iterator over the distinct labels found.
+  */
+ public Iterator synonyms (String queryKeyPhrase) {
+   //need to have a html quote method instead
+   queryKeyPhrase = queryKeyPhrase.replaceAll("\\s+", "\\+");
+
+   Map classMap = retrieve(queryKeyPhrase);
+
+   Map synonyms = new HashMap();
+
+   Iterator iter = classMap.keySet().iterator();
+   while (iter.hasNext()) {
+     OntResource resource = (OntResource) iter.next();
+
+     //listLabels
+     for (Iterator i=resource.listLabels(null); i.hasNext();) {
+       Literal l = (Literal) i.next();
+       synonyms.put(l.toString(), "1");
+     }
+
+     if (resource instanceof Individual) {
+       //get all individuals same as this one
+       for (Iterator i=resource.listSameAs(); i.hasNext();) {
+         // Only listLabels is needed, so the OntResource view is enough
+         // (same cast as in subclasses()).
+         OntResource same = (OntResource) i.next();
+         //add labels
+         for (Iterator j=same.listLabels(null); j.hasNext();) {
+           // Labels come from the inner iterator j; reading them from
+           // the outer iterator i skipped resources and broke the cast.
+           Literal l = (Literal) j.next();
+           synonyms.put(l.toString(), "1");
+         }
+       }
+     } else if (resource instanceof OntClass) {
+       //list equivalent classes
+       for (Iterator i=((OntClass)resource).listEquivalentClasses();
+            i.hasNext();) {
+         OntClass equivClass = (OntClass) i.next();
+         //add labels
+         for (Iterator j=equivClass.listLabels(null); j.hasNext();) {
+           Literal l = (Literal) j.next();
+           synonyms.put(l.toString(), "1");
+         }
+       }
+     }
+   }
+
+   return synonyms.keySet().iterator();
+ }
+
+ /**
+  * Registers a resource under a label. Labels are stored lower-cased.
+  */
+ public static void addSearchTerm(String label, OntResource resource) {
+   Map resources = retrieve(label);
+   if (resources == null) {
+     resources = new HashMap();
+   }
+   resources.put(resource, "1");
+   String key = label.toLowerCase();
+   searchTerms.put(key, resources);
+ }
+
+ /**
+  * Looks up the resources registered under a label (lower-cased first).
+  *
+  * @return the registered map, or a new empty map when nothing is
+  *         registered under the label.
+  */
+ public static Map retrieve(String label) {
+   Map known = (Map) searchTerms.get(label.toLowerCase());
+   return (known != null) ? known : new HashMap();
+ }
+
+ /**
+ * Prints the description of a class on its own line, then, unless the
+ * class is already on the occurs list, recursively prints its subclasses
+ * one level deeper and finally its instances with their labels.
+ *
+ * @param out is the stream to print to.
+ * @param cls is the class to render.
+ * @param occurs lists the classes currently being expanded; it is used to
+ * stop the recursion when a class is met again below itself.
+ * @param depth is the nesting level used for indenting the description.
+ */
+ protected static void renderHierarchy( PrintStream out, OntClass cls,
+ List occurs, int depth ) {
+ renderClassDescription( out, cls, depth );
+ out.println();
+
+ // recurse to the next level down
+ if (cls.canAs( OntClass.class ) && !occurs.contains( cls )) {
+ // NOTE(review): the 'true' flag presumably restricts the listing to
+ // direct subclasses -- confirm against the Jena API.
+ for (Iterator i = cls.listSubClasses( true ); i.hasNext(); ) {
+ OntClass sub = (OntClass) i.next();
+
+ // we push this expression on the occurs list before we recurse
+ occurs.add( cls );
+ renderHierarchy( out, sub, occurs, depth + 1 );
+ occurs.remove( cls );
+ }
+ // Instances are printed as "uri [label, label, ] ", one per line.
+ for (Iterator i=cls.listInstances(); i.hasNext(); ) {
+ Individual individual = (Individual) i.next();
+ renderURI(out, individual.getModel(), individual.getURI());
+ out.print(" [");
+ for (Iterator j=individual.listLabels(null); j.hasNext();) {
+ out.print(((Literal)j.next()).getString()+", ");
+ }
+ out.print("] ");
+ out.println();
+ }
+ }
+ }
+
+ public static void renderClassDescription( PrintStream out,
+ OntClass c, int depth ) {
+ indent( out, depth );
+
+ if (c.isRestriction()) {
+ renderRestriction( out, (Restriction) c.as( Restriction.class ) );
+ } else {
+ if (!c.isAnon()) {
+ out.print( "Class " );
+ //renderURI( out, c.getModel(), c.getURI() );
+
+ out.print (c.getLocalName());
+
+ out.print( " [" );
+ for (Iterator i=c.listLabels(null); i.hasNext(); ) {
+ out.print(((Literal)i.next()).getString()+", ");
+ }
+ out.print( "] ");
+ } else {
+ renderAnonymous( out, c, "class" );
+ }
+ }
+ }
+
+ protected static void renderRestriction( PrintStream out, Restriction r ) {
+ if (!r.isAnon()) {
+ out.print( "Restriction " );
+ renderURI( out, r.getModel(), r.getURI() );
+ } else {
+ renderAnonymous( out, r, "restriction" );
+ }
+
+ out.print( " on property " );
+ renderURI( out, r.getModel(), r.getOnProperty().getURI() );
+ }
+
+ protected static void renderURI( PrintStream out,
+ PrefixMapping prefixes, String uri ) {
+ out.print( prefixes.usePrefix( uri ) );
+ }
+
+ /**
+  * Prints an anonymous resource using a generated identifier of the form
+  * <code>a-N</code>; the identifier is created on first use and reused
+  * afterwards for the same resource id.
+  */
+ protected static void renderAnonymous( PrintStream out,
+     Resource anon, String name ) {
+   String anonID = (String) m_anonIDs.get( anon.getId() );
+   if (anonID == null) {
+     anonID = "a-" + m_anonCount;
+     m_anonCount++;
+     m_anonIDs.put( anon.getId(), anonID );
+   }
+
+   out.print( "Anonymous " + name + " with ID " + anonID );
+ }
+
+ /** Prints the indentation unit <code>depth</code> times. */
+ protected static void indent( PrintStream out, int depth ) {
+   int remaining = depth;
+   while (remaining > 0) {
+     out.print( " " );
+     remaining--;
+   }
+ }
+
+ public static void main( String[] args ) throws Exception {
+
+ Configuration conf = NutchConfiguration.create();
+ Ontology ontology = new OntologyFactory(conf).getOntology();
+
+ String urls = conf.get("extension.ontology.urls");
+ if (urls==null || urls.trim().equals("")) {
+ LOG.severe("No ontology url found.");
+ return;
+ }
+ ontology.load(urls.split("\\s+"));
+ LOG.info( "created new ontology");
+
+ for (Iterator i = getParser().rootClasses( getModel() );
+ i.hasNext(); ) {
+
+ //print class
+ OntClass c = (OntClass) i.next();
+
+ renderHierarchy(System.out, c, new LinkedList(), 0);
+ }
+
+ String[] terms =
+ new String[] { "Season" };
+
+ for (int i=0; i<terms.length; i++) {
+ Iterator iter = ontology.subclasses(terms[i]);
+ while (iter.hasNext()) {
+ System.out.println("subclass >> "+(String)iter.next());
+ }
+ }
+ }
+}
Propchange: lucene/nutch/trunk/src/plugin/ontology/src/java/org/apache/nutch/ontology/jena/OntologyImpl.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/nutch/trunk/src/plugin/ontology/src/java/org/apache/nutch/ontology/jena/OwlParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/ontology/src/java/org/apache/nutch/ontology/jena/OwlParser.java?rev=389901&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/ontology/src/java/org/apache/nutch/ontology/jena/OwlParser.java (added)
+++ lucene/nutch/trunk/src/plugin/ontology/src/java/org/apache/nutch/ontology/jena/OwlParser.java Wed Mar 29 14:09:03 2006
@@ -0,0 +1,146 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.ontology.jena;
+
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+//import org.apache.hadoop.util.LogFormatter;
+
+import com.hp.hpl.jena.ontology.OntClass;
+import com.hp.hpl.jena.ontology.OntModel;
+import com.hp.hpl.jena.ontology.Individual;
+import com.hp.hpl.jena.rdf.model.Literal;
+import org.apache.nutch.ontology.*;
+
+/**
+ * implementation of parser for w3c's OWL files
+ *
+ * @author michael j pan
+ */
+ public class OwlParser implements Parser {
+
+   /**
+    * Matches a lower-case letter or digit immediately followed by an
+    * upper-case letter, i.e. the word boundaries inside a camel-case rdf:ID.
+    * Compiled once because the expression never changes.
+    */
+   private static final Pattern WORD_BOUNDARY =
+     Pattern.compile("([a-z0-9])([A-Z])");
+
+   public OwlParser () {
+   }
+
+   /**
+    * parse owl ontology files using jena: every named root class of the
+    * model is handed to {@link #parseClass}.
+    *
+    * @param m the ontology model to parse
+    */
+   public void parse(OntModel m) {
+     for (Iterator i = rootClasses( m ); i.hasNext(); ) {
+       OntClass c = (OntClass) i.next();
+
+       //dont deal with anonymous classes
+       if (c.isAnon()) {
+         continue;
+       }
+
+       parseClass( c, new ArrayList(), 0 );
+     }
+   }
+
+   /**
+    * Registers the labels of <code>cls</code> as search terms, then does the
+    * same for its subclasses (recursively) and for the labels of its
+    * instances. A class without any label first receives one derived from
+    * its local name, which adds a label to the class itself.
+    *
+    * @param cls the class to process; anonymous classes are ignored
+    * @param occurs classes currently being expanded further up the
+    *        recursion, used to avoid walking a subclass cycle forever
+    * @param depth recursion depth, only passed along to nested calls
+    */
+   protected void parseClass( OntClass cls, List occurs, int depth ) {
+     //dont deal with anonymous classes
+     if (cls.isAnon()) {
+       return;
+     }
+
+     //add cls to Ontology searchterms
+     //list labels
+     Iterator labelIter = cls.listLabels(null);
+     //if has no labels
+     if (!labelIter.hasNext()) {
+       //add rdf:ID as a label
+       cls.addLabel(rdfidToLabel(cls.getLocalName()), null);
+     }
+     //reset the label iterator
+     labelIter = cls.listLabels(null);
+
+     while(labelIter.hasNext()) {
+       Literal l = (Literal) labelIter.next();
+       OntologyImpl.addSearchTerm(l.toString(), cls);
+     }
+
+     // recurse to the next level down
+     if (cls.canAs( OntClass.class ) && !occurs.contains( cls )) {
+       // we push this expression on the occurs list before we recurse;
+       // one push/pop around the whole loop is enough because the entry is
+       // the same for every subclass
+       occurs.add( cls );
+       //list subclasses
+       for (Iterator i = cls.listSubClasses( true ); i.hasNext(); ) {
+         OntClass sub = (OntClass) i.next();
+         parseClass(sub, occurs, depth+1);
+       }
+       occurs.remove( cls );
+
+       //list instances
+       for (Iterator i=cls.listInstances(); i.hasNext(); ) {
+         //add search terms for each instance
+
+         //list labels
+         Individual individual = (Individual) i.next();
+         for (Iterator j=individual.listLabels(null); j.hasNext();) {
+           Literal l = (Literal) j.next();
+           OntologyImpl.addSearchTerm(l.toString(), individual);
+         }
+       }
+     }
+   }
+
+   /**
+    * Collects the named classes of <code>m</code> that are either direct
+    * subclasses of the profile's THING class or have no sub-class-of
+    * statement at all. A class whose inspection throws is reported on
+    * standard error and left out.
+    *
+    * @param m the ontology model to inspect
+    * @return an iterator over the root classes found
+    */
+   public Iterator rootClasses( OntModel m ) {
+     List roots = new ArrayList();
+
+     for (Iterator i = m.listClasses(); i.hasNext(); ) {
+       OntClass c = (OntClass) i.next();
+
+       try {
+         // too confusing to list all the restrictions as root classes
+         if (c.isAnon()) {
+           continue;
+         }
+
+         if (c.hasSuperClass( m.getProfile().THING(), true ) ) {
+           // this class is directly descended from Thing
+           roots.add( c );
+         } else if (c.getCardinality( m.getProfile().SUB_CLASS_OF() ) == 0 ) {
+           // this class has no super-classes
+           // (can occur if we're not using the reasoner)
+           roots.add( c );
+         }
+       } catch (Exception e) {
+         // best effort: keep scanning the remaining classes, but report the
+         // problem on stderr so it does not mix with regular output
+         System.err.println("OwlParser.rootClasses: " + e.getMessage());
+       }
+     }
+
+     return roots.iterator();
+   }
+
+   /**
+    * Turns a camel-case rdf:ID into a label by inserting a space wherever a
+    * lower-case letter or digit is directly followed by an upper-case
+    * letter, e.g. "LeapYear" becomes "Leap Year".
+    *
+    * @param idString the identifier to convert, must not be null
+    * @return the identifier with spaces inserted at the word boundaries
+    */
+   public String rdfidToLabel (String idString) {
+     // single pass; gives the same result as replacing each matched pair
+     // one after the other because matches of this pattern never overlap
+     return WORD_BOUNDARY.matcher(idString).replaceAll("$1 $2");
+   }
+
+ }
Propchange: lucene/nutch/trunk/src/plugin/ontology/src/java/org/apache/nutch/ontology/jena/OwlParser.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/nutch/trunk/src/plugin/ontology/src/java/org/apache/nutch/ontology/jena/Parser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/ontology/src/java/org/apache/nutch/ontology/jena/Parser.java?rev=389901&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/ontology/src/java/org/apache/nutch/ontology/jena/Parser.java (added)
+++ lucene/nutch/trunk/src/plugin/ontology/src/java/org/apache/nutch/ontology/jena/Parser.java Wed Mar 29 14:09:03 2006
@@ -0,0 +1,32 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.ontology.jena;
+
+import com.hp.hpl.jena.ontology.OntModel;
+
+import java.util.Iterator;
+import org.apache.nutch.ontology.*;
+
+/**
+ * interface for the parser
+ *
+ * @author michael j pan
+ */
+ public interface Parser {
+
+   /**
+    * Processes the given ontology model.
+    *
+    * @param m the ontology model to parse
+    */
+   void parse(OntModel m);
+
+   /**
+    * Returns the root classes of the given ontology model.
+    *
+    * @param m the ontology model to inspect
+    * @return an iterator over the root classes
+    */
+   Iterator rootClasses(OntModel m);
+ }
Propchange: lucene/nutch/trunk/src/plugin/ontology/src/java/org/apache/nutch/ontology/jena/Parser.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/nutch/trunk/src/plugin/ontology/src/test/org/apache/nutch/ontology/jena/TestOntology.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/ontology/src/test/org/apache/nutch/ontology/jena/TestOntology.java?rev=389901&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/ontology/src/test/org/apache/nutch/ontology/jena/TestOntology.java (added)
+++ lucene/nutch/trunk/src/plugin/ontology/src/test/org/apache/nutch/ontology/jena/TestOntology.java Wed Mar 29 14:09:03 2006
@@ -0,0 +1,97 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.ontology.jena;
+
+import org.apache.nutch.ontology.*;
+import org.apache.nutch.protocol.ProtocolException;
+
+import org.apache.nutch.parse.ParseException;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.util.NutchConfiguration;
+
+import junit.framework.TestCase;
+
+import java.util.Iterator;
+import java.util.List;
+import java.util.LinkedList;
+
+import java.lang.Exception;
+
+/**
+ * Unit tests for Ontology
+ *
+ * @author michael j pan
+ */
+ public class TestOntology extends TestCase {
+
+   private String fileSeparator = System.getProperty("file.separator");
+   // This system property is defined in ./src/plugin/build-plugin.xml
+   private String sampleDir = System.getProperty("test.data",".");
+   // Make sure sample files are copied to "test.data" as specified in
+   // ./src/plugin/ontology/build.xml during plugin compilation.
+   // Check ./src/plugin/ontology/sample/README.txt for what they are.
+   private String[] sampleFiles = {"time.owl"};
+
+   // created on first use and shared by all instances of this test
+   private static Ontology ontology;
+   private Configuration conf;
+
+   public TestOntology(String name) {
+     super(name);
+   }
+
+   protected void setUp() {
+     this.conf = NutchConfiguration.create();
+   }
+
+   protected void tearDown() {}
+
+   /**
+    * Loads every sample ontology file and checks that the subclasses
+    * reported for "Season" include Spring, Summer, Fall and Winter.
+    */
+   public void testIt() throws ProtocolException, ParseException, Exception {
+     String className = "Season";
+     String[] subclassNames =
+       new String[] {"Spring", "Summer", "Fall", "Winter"};
+
+     if (ontology==null) {
+       try {
+         ontology = new OntologyFactory(this.conf).getOntology();
+       } catch (Exception e) {
+         // keep the original exception as the cause so the test report
+         // shows why the ontology could not be created
+         throw new Exception("Failed to instantiate ontology", e);
+       }
+     }
+
+     //foreach sample file
+     for (int i=0; i<sampleFiles.length; i++) {
+       //construct the url
+       String urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
+
+       ontology.load(new String[] {urlString});
+
+       List subclassList = new LinkedList();
+
+       Iterator iter = ontology.subclasses(className);
+       while (iter.hasNext()) {
+         String subclassLabel = (String) iter.next();
+         System.out.println(subclassLabel);
+         subclassList.add(subclassLabel);
+       }
+
+       for (int j=0; j<subclassNames.length; j++) {
+         // name the missing subclass so a failure is self-explanatory
+         assertTrue("subclass " + subclassNames[j] + " of " + className
+                    + " not found in " + urlString,
+                    subclassList.contains(subclassNames[j]));
+       }
+     }
+
+   }
+
+ }
Propchange: lucene/nutch/trunk/src/plugin/ontology/src/test/org/apache/nutch/ontology/jena/TestOntology.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: lucene/nutch/trunk/src/plugin/urlfilter-automaton/plugin.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/urlfilter-automaton/plugin.xml?rev=389901&r1=389900&r2=389901&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/urlfilter-automaton/plugin.xml (original)
+++ lucene/nutch/trunk/src/plugin/urlfilter-automaton/plugin.xml Wed Mar 29 14:09:03 2006
@@ -21,7 +21,7 @@
name="Nutch Automaton URL Filter"
point="org.apache.nutch.net.URLFilter">
<implementation id="AutomatonURLFilter"
- class="org.apache.nutch.net.AutomatonURLFilter"/>
+ class="org.apache.nutch.urlfilter.automaton.AutomatonURLFilter"/>
</extension>
</plugin>
Added: lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/automaton/AutomatonURLFilter.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/automaton/AutomatonURLFilter.java?rev=389901&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/automaton/AutomatonURLFilter.java (added)
+++ lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/automaton/AutomatonURLFilter.java Wed Mar 29 14:09:03 2006
@@ -0,0 +1,97 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.urlfilter.automaton;
+
+// JDK imports
+import java.io.Reader;
+import java.io.IOException;
+import java.util.regex.PatternSyntaxException;
+
+// Hadoop imports
+import org.apache.hadoop.conf.Configuration;
+
+// Automaton imports
+import dk.brics.automaton.RegExp;
+import dk.brics.automaton.RunAutomaton;
+import org.apache.nutch.net.*;
+import org.apache.nutch.urlfilter.api.RegexRule;
+import org.apache.nutch.urlfilter.api.RegexURLFilterBase;
+
+
+/**
+ * RegexURLFilterBase implementation based on the
+ * <a href="http://www.brics.dk/automaton/">dk.brics.automaton</a>
+ * Finite-State Automata for Java<sup>TM</sup>.
+ *
+ * @author Jérôme Charron
+ * @see <a href="http://www.brics.dk/automaton/">dk.brics.automaton</a>
+ */
+ public class AutomatonURLFilter extends RegexURLFilterBase {
+
+   public AutomatonURLFilter() {
+     super();
+   }
+
+   public AutomatonURLFilter(String filename)
+     throws IOException, PatternSyntaxException {
+     super(filename);
+   }
+
+   AutomatonURLFilter(Reader reader)
+     throws IOException, IllegalArgumentException {
+     super(reader);
+   }
+
+
+   /* ----------------------------------- *
+    * <implementation:RegexURLFilterBase> *
+    * ----------------------------------- */
+
+   // Inherited Javadoc
+   protected String getRulesFile(Configuration conf) {
+     return conf.get("urlfilter.automaton.file");
+   }
+
+   // Inherited Javadoc
+   protected RegexRule createRule(boolean sign, String regex) {
+     return new Rule(sign, regex);
+   }
+
+   /* ------------------------------------ *
+    * </implementation:RegexURLFilterBase> *
+    * ------------------------------------ */
+
+
+   public static void main(String args[]) throws IOException {
+     main(new AutomatonURLFilter(), args);
+   }
+
+
+   /**
+    * Rule backed by a dk.brics automaton built from the regular expression.
+    * Declared static because it never touches the enclosing filter, so it
+    * does not need a hidden reference to it.
+    */
+   private static class Rule extends RegexRule {
+
+     // built once in the constructor and never replaced
+     private final RunAutomaton automaton;
+
+     Rule(boolean sign, String regex) {
+       super(sign, regex);
+       automaton = new RunAutomaton(new RegExp(regex, RegExp.ALL).toAutomaton());
+     }
+
+     protected boolean match(String url) {
+       return automaton.run(url);
+     }
+   }
+
+ }
Propchange: lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/automaton/AutomatonURLFilter.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/automaton/package.html
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/automaton/package.html?rev=389901&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/automaton/package.html (added)
+++ lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/automaton/package.html Wed Mar 29 14:09:03 2006
@@ -0,0 +1,9 @@
+<html>
+<body>
+<p>
+A url filter plugin based on
+<a href="http://www.brics.dk/automaton/">dk.brics.automaton</a> Finite-State
+Automata for Java<sup>TM</sup>.
+</p>
+</body>
+</html>
Propchange: lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/automaton/package.html
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/test/org/apache/nutch/urlfilter/automaton/TestAutomatonURLFilter.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/test/org/apache/nutch/urlfilter/automaton/TestAutomatonURLFilter.java?rev=389901&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/test/org/apache/nutch/urlfilter/automaton/TestAutomatonURLFilter.java (added)
+++ lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/test/org/apache/nutch/urlfilter/automaton/TestAutomatonURLFilter.java Wed Mar 29 14:09:03 2006
@@ -0,0 +1,70 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.urlfilter.automaton;
+
+// JDK imports
+import java.io.IOException;
+import java.io.Reader;
+
+// JUnit imports
+import junit.framework.Test;
+import junit.framework.TestSuite;
+import junit.textui.TestRunner;
+import org.apache.nutch.net.*;
+
+// Nutch imports
+import org.apache.nutch.urlfilter.api.RegexURLFilterBaseTest;
+
+
+/**
+ * JUnit based test of class <code>AutomatonURLFilter</code>.
+ *
+ * @author Jérôme Charron
+ */
+public class TestAutomatonURLFilter extends RegexURLFilterBaseTest {
+
+ public TestAutomatonURLFilter(String testName) {
+ super(testName);
+ }
+
+ public static Test suite() {
+ return new TestSuite(TestAutomatonURLFilter.class);
+ }
+
+ public static void main(String[] args) {
+ TestRunner.run(suite());
+ }
+
+ protected URLFilter getURLFilter(Reader rules) {
+ try {
+ return new AutomatonURLFilter(rules);
+ } catch (IOException e) {
+ fail(e.toString());
+ return null;
+ }
+ }
+
+ public void test() {
+ test("WholeWebCrawling");
+ test("IntranetCrawling");
+ bench(50, "Benchmarks");
+ bench(100, "Benchmarks");
+ bench(200, "Benchmarks");
+ bench(400, "Benchmarks");
+ bench(800, "Benchmarks");
+ }
+
+}
Propchange: lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/test/org/apache/nutch/urlfilter/automaton/TestAutomatonURLFilter.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: lucene/nutch/trunk/src/plugin/urlfilter-prefix/plugin.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/urlfilter-prefix/plugin.xml?rev=389901&r1=389900&r2=389901&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/urlfilter-prefix/plugin.xml (original)
+++ lucene/nutch/trunk/src/plugin/urlfilter-prefix/plugin.xml Wed Mar 29 14:09:03 2006
@@ -19,7 +19,7 @@
name="Nutch Prefix URL Filter"
point="org.apache.nutch.net.URLFilter">
<implementation id="PrefixURLFilter"
- class="org.apache.nutch.net.PrefixURLFilter"/>
+ class="org.apache.nutch.urlfilter.prefix.PrefixURLFilter"/>
<!-- by default, attribute "file" is undefined, to keep classic behavior.
<implementation id="PrefixURLFilter"
class="org.apache.nutch.net.PrefixURLFilter"
Added: lucene/nutch/trunk/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java?rev=389901&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java (added)
+++ lucene/nutch/trunk/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java Wed Mar 29 14:09:03 2006
@@ -0,0 +1,164 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// $Id: PrefixURLFilter.java,v 1.2 2005/02/07 19:10:37 cutting Exp $
+
+package org.apache.nutch.urlfilter.prefix;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.util.LogFormatter;
+import org.apache.nutch.net.*;
+
+import org.apache.nutch.util.PrefixStringMatcher;
+import org.apache.nutch.util.TrieStringMatcher;
+
+import org.apache.nutch.plugin.Extension;
+import org.apache.nutch.plugin.PluginRepository;
+
+import java.io.Reader;
+import java.io.FileReader;
+import java.io.BufferedReader;
+import java.io.InputStreamReader;
+import java.io.IOException;
+
+import java.util.List;
+import java.util.ArrayList;
+import java.util.logging.Logger;
+
+/**
+ * Filters URLs based on a file of URL prefixes. The file is named by
+ * (1) property "urlfilter.prefix.file" in ./conf/nutch-default.xml, and
+ * (2) attribute "file" in plugin.xml of this plugin
+ * Attribute "file" has higher precedence if defined.
+ *
+ * <p>The format of this file is one URL prefix per line.</p>
+ */
+ public class PrefixURLFilter implements URLFilter {
+
+   private static final Logger LOG =
+     LogFormatter.getLogger(PrefixURLFilter.class.getName());
+
+   // read in attribute "file" of this plugin.
+   private static String attributeFile = null;
+
+   // prefixes to match against; stays null until a rules file has been
+   // read by the filename constructor or by setConf
+   private TrieStringMatcher trie;
+
+   private Configuration conf;
+
+   /**
+    * Creates an unconfigured filter; {@link #setConf} must be called before
+    * the filter is used.
+    */
+   public PrefixURLFilter() throws IOException {
+
+   }
+
+   /**
+    * Creates a filter from a file holding one URL prefix per line.
+    *
+    * @param filename path of the prefix file
+    * @throws IOException if the file cannot be opened or read
+    */
+   public PrefixURLFilter(String filename) throws IOException {
+     trie = readConfigurationFile(new FileReader(filename));
+   }
+
+   /**
+    * @param url the URL to check
+    * @return <code>url</code> itself if it starts with one of the configured
+    *         prefixes, <code>null</code> otherwise
+    */
+   public String filter(String url) {
+     if (trie.shortestMatch(url) == null)
+       return null;
+     else
+       return url;
+   }
+
+   /**
+    * Reads the URL prefixes, one per line, skipping empty lines and lines
+    * starting with a blank or '#'. The reader is closed before returning,
+    * also when reading fails.
+    *
+    * @param reader source of the prefix list; closed by this method
+    * @return a matcher over the prefixes read
+    * @throws IOException if reading or closing the reader fails
+    */
+   private TrieStringMatcher readConfigurationFile(Reader reader)
+     throws IOException {
+
+     BufferedReader in=new BufferedReader(reader);
+     List urlprefixes = new ArrayList();
+
+     try {
+       String line;
+       while((line=in.readLine())!=null) {
+         if (line.length() == 0)
+           continue;
+
+         char first=line.charAt(0);
+         switch (first) {
+         case ' ' : case '\n' : case '#' :    // skip blank & comment lines
+           continue;
+         default :
+           urlprefixes.add(line);
+         }
+       }
+     } finally {
+       // closing the wrapper also closes the underlying reader
+       in.close();
+     }
+
+     return new PrefixStringMatcher(urlprefixes);
+   }
+
+   /**
+    * Filters the URLs read from standard input, one per line, and prints
+    * the accepted ones. The prefixes come from the file named by the first
+    * argument or, without arguments, from the default Nutch configuration.
+    */
+   public static void main(String args[])
+     throws IOException {
+
+     PrefixURLFilter filter;
+     if (args.length >= 1) {
+       filter = new PrefixURLFilter(args[0]);
+     } else {
+       filter = new PrefixURLFilter();
+       // the no-arg constructor loads no prefixes; without this call the
+       // first filter() below would fail with a NullPointerException
+       filter.setConf(org.apache.nutch.util.NutchConfiguration.create());
+     }
+
+     BufferedReader in=new BufferedReader(new InputStreamReader(System.in));
+     String line;
+     while((line=in.readLine())!=null) {
+       String out=filter.filter(line);
+       if(out!=null) {
+         System.out.println(out);
+       }
+     }
+   }
+
+   /**
+    * Stores the configuration and (re)loads the prefix list. The rules file
+    * is named by attribute "file" of this plugin's extension if defined,
+    * otherwise by property "urlfilter.prefix.file". When no rules file is
+    * named or found the filter matches nothing.
+    *
+    * @param conf the configuration to read from
+    * @throws RuntimeException if the rules file cannot be read
+    */
+   public void setConf(Configuration conf) {
+     this.conf = conf;
+
+     String pluginName = "urlfilter-prefix";
+     // the field is static: forget the value found by an earlier call so
+     // that it cannot leak into this configuration
+     attributeFile = null;
+     Extension[] extensions = PluginRepository.get(conf).getExtensionPoint(
+         URLFilter.class.getName()).getExtensions();
+     for (int i = 0; i < extensions.length; i++) {
+       Extension extension = extensions[i];
+       if (extension.getDescriptor().getPluginId().equals(pluginName)) {
+         attributeFile = extension.getAttribute("file");
+         break;
+       }
+     }
+     if (attributeFile != null && attributeFile.trim().equals(""))
+       attributeFile = null;
+     if (attributeFile != null) {
+       LOG.info("Attribute \"file\" is defined for plugin " + pluginName
+           + " as " + attributeFile);
+     }
+
+     String file = conf.get("urlfilter.prefix.file");
+     // attribute "file" takes precedence if defined
+     if (attributeFile != null)
+       file = attributeFile;
+     // no rules file named at all: do not ask the configuration for a null
+     // resource, fall through to the empty matcher instead
+     Reader reader = (file == null) ? null : conf.getConfResourceAsReader(file);
+
+     if (reader == null) {
+       trie = new PrefixStringMatcher(new String[0]);
+     } else {
+       try {
+         trie = readConfigurationFile(reader);
+       } catch (IOException e) {
+         LOG.severe(e.getMessage());
+         // TODO mb@media-style.com: throw Exception? Because broken api.
+         throw new RuntimeException(e.getMessage(), e);
+       }
+     }
+   }
+
+   public Configuration getConf() {
+     return this.conf;
+   }
+
+ }
Propchange: lucene/nutch/trunk/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/nutch/trunk/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/package.html
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/package.html?rev=389901&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/package.html (added)
+++ lucene/nutch/trunk/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/package.html Wed Mar 29 14:09:03 2006
@@ -0,0 +1,5 @@
+<html>
+<body>
+<p>A URL filter plugin that keeps only the URLs starting with one of the prefixes listed in a configuration file.</p>
+</body>
+</html>
Propchange: lucene/nutch/trunk/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/package.html
------------------------------------------------------------------------------
svn:eol-style = native
Modified: lucene/nutch/trunk/src/plugin/urlfilter-regex/plugin.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/urlfilter-regex/plugin.xml?rev=389901&r1=389900&r2=389901&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/urlfilter-regex/plugin.xml (original)
+++ lucene/nutch/trunk/src/plugin/urlfilter-regex/plugin.xml Wed Mar 29 14:09:03 2006
@@ -20,7 +20,7 @@
name="Nutch Regex URL Filter"
point="org.apache.nutch.net.URLFilter">
<implementation id="RegexURLFilter"
- class="org.apache.nutch.net.RegexURLFilter"/>
+ class="org.apache.nutch.urlfilter.regex.RegexURLFilter"/>
<!-- by default, attribute "file" is undefined, to keep classic behavior.
<implementation id="RegexURLFilter"
class="org.apache.nutch.net.RegexURLFilter"
Added: lucene/nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java?rev=389901&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java (added)
+++ lucene/nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java Wed Mar 29 14:09:03 2006
@@ -0,0 +1,90 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.urlfilter.regex;
+
+// JDK imports
+import java.io.Reader;
+import java.io.IOException;
+import java.util.regex.Pattern;
+import java.util.regex.PatternSyntaxException;
+
+// Hadoop imports
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.net.*;
+import org.apache.nutch.urlfilter.api.RegexRule;
+import org.apache.nutch.urlfilter.api.RegexURLFilterBase;
+
+
+/**
+ * Filters URLs based on a file of regular expressions using the
+ * {@link java.util.regex Java Regex implementation}.
+ */
+ public class RegexURLFilter extends RegexURLFilterBase {
+
+   public RegexURLFilter() {
+     super();
+   }
+
+   public RegexURLFilter(String filename)
+     throws IOException, PatternSyntaxException {
+     super(filename);
+   }
+
+   RegexURLFilter(Reader reader)
+     throws IOException, IllegalArgumentException {
+     super(reader);
+   }
+
+
+   /* ----------------------------------- *
+    * <implementation:RegexURLFilterBase> *
+    * ----------------------------------- */
+
+   // Inherited Javadoc
+   protected String getRulesFile(Configuration conf) {
+     return conf.get("urlfilter.regex.file");
+   }
+
+   // Inherited Javadoc
+   protected RegexRule createRule(boolean sign, String regex) {
+     return new Rule(sign, regex);
+   }
+
+   /* ------------------------------------ *
+    * </implementation:RegexURLFilterBase> *
+    * ------------------------------------ */
+
+
+   public static void main(String args[]) throws IOException {
+     main(new RegexURLFilter(), args);
+   }
+
+
+   /**
+    * Rule backed by a compiled {@link Pattern}; a URL matches when the
+    * pattern is found anywhere in it. Declared static because it never
+    * touches the enclosing filter, so it does not need a hidden reference
+    * to it.
+    */
+   private static class Rule extends RegexRule {
+
+     // compiled once in the constructor and never replaced
+     private final Pattern pattern;
+
+     Rule(boolean sign, String regex) {
+       super(sign, regex);
+       pattern = Pattern.compile(regex);
+     }
+
+     protected boolean match(String url) {
+       return pattern.matcher(url).find();
+     }
+   }
+
+ }
Propchange: lucene/nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/package.html
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/package.html?rev=389901&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/package.html (added)
+++ lucene/nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/package.html Wed Mar 29 14:09:03 2006
@@ -0,0 +1,5 @@
+<html>
+<body>
+<p>A URL filter plugin that filters URLs against a file of regular expressions, using the java.util.regex implementation.</p>
+</body>
+</html>
Propchange: lucene/nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/package.html
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/nutch/trunk/src/plugin/urlfilter-regex/src/test/org/apache/nutch/urlfilter/regex/TestRegexURLFilter.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/urlfilter-regex/src/test/org/apache/nutch/urlfilter/regex/TestRegexURLFilter.java?rev=389901&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/urlfilter-regex/src/test/org/apache/nutch/urlfilter/regex/TestRegexURLFilter.java (added)
+++ lucene/nutch/trunk/src/plugin/urlfilter-regex/src/test/org/apache/nutch/urlfilter/regex/TestRegexURLFilter.java Wed Mar 29 14:09:03 2006
@@ -0,0 +1,70 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.urlfilter.regex;
+
+// JDK imports
+import java.io.IOException;
+import java.io.Reader;
+
+// JUnit imports
+import junit.framework.Test;
+import junit.framework.TestSuite;
+import junit.textui.TestRunner;
+import org.apache.nutch.net.*;
+
+// Nutch imports
+import org.apache.nutch.urlfilter.api.RegexURLFilterBaseTest;
+
+
+/**
+ * JUnit based test of class <code>RegexURLFilter</code>.
+ *
+ * @author Jérôme Charron
+ */
+public class TestRegexURLFilter extends RegexURLFilterBaseTest {
+ /** Creates the test case; testName is passed straight to the superclass constructor. */
+ public TestRegexURLFilter(String testName) {
+ super(testName);
+ }
+ /** Returns a TestSuite built from this class. */
+ public static Test suite() {
+ return new TestSuite(TestRegexURLFilter.class);
+ }
+ /** Command-line entry point: runs suite() through TestRunner. */
+ public static void main(String[] args) {
+ TestRunner.run(suite());
+ }
+ /** Builds a RegexURLFilter from rules; an IOException is reported through fail(). */
+ protected URLFilter getURLFilter(Reader rules) {
+ try {
+ return new RegexURLFilter(rules);
+ } catch (IOException e) {
+ fail(e.toString());
+ return null; // NOTE(review): presumably unreachable because fail() throws; kept so the method compiles
+ }
+ }
+ /** Calls the inherited test(String) for two names, then bench(int, String) with 50, 100, 200, 400 and 800. */
+ public void test() {
+ test("WholeWebCrawling");
+ test("IntranetCrawling");
+ bench(50, "Benchmarks");
+ bench(100, "Benchmarks");
+ bench(200, "Benchmarks");
+ bench(400, "Benchmarks");
+ bench(800, "Benchmarks");
+ }
+
+}
Propchange: lucene/nutch/trunk/src/plugin/urlfilter-regex/src/test/org/apache/nutch/urlfilter/regex/TestRegexURLFilter.java
------------------------------------------------------------------------------
svn:eol-style = native