You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by je...@apache.org on 2006/03/21 23:24:19 UTC
svn commit: r387647 - in /lucene/nutch/trunk/src/plugin/lib-regex-filter: ./
sample/ src/ src/java/ src/java/org/ src/java/org/apache/
src/java/org/apache/nutch/ src/java/org/apache/nutch/net/ src/test/
src/test/org/ src/test/org/apache/ src/test/org/a...
Author: jerome
Date: Tue Mar 21 14:24:16 2006
New Revision: 387647
URL: http://svn.apache.org/viewcvs?rev=387647&view=rev
Log:
Add a mini framework plugin for regex url filter plugins.
Added:
lucene/nutch/trunk/src/plugin/lib-regex-filter/
lucene/nutch/trunk/src/plugin/lib-regex-filter/build.xml (with props)
lucene/nutch/trunk/src/plugin/lib-regex-filter/plugin.xml (with props)
lucene/nutch/trunk/src/plugin/lib-regex-filter/sample/
lucene/nutch/trunk/src/plugin/lib-regex-filter/src/
lucene/nutch/trunk/src/plugin/lib-regex-filter/src/java/
lucene/nutch/trunk/src/plugin/lib-regex-filter/src/java/org/
lucene/nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/
lucene/nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/
lucene/nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/net/
lucene/nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/net/RegexRule.java (with props)
lucene/nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/net/RegexURLFilterBase.java (with props)
lucene/nutch/trunk/src/plugin/lib-regex-filter/src/test/
lucene/nutch/trunk/src/plugin/lib-regex-filter/src/test/org/
lucene/nutch/trunk/src/plugin/lib-regex-filter/src/test/org/apache/
lucene/nutch/trunk/src/plugin/lib-regex-filter/src/test/org/apache/nutch/
lucene/nutch/trunk/src/plugin/lib-regex-filter/src/test/org/apache/nutch/net/
lucene/nutch/trunk/src/plugin/lib-regex-filter/src/test/org/apache/nutch/net/RegexURLFilterBaseTest.java (with props)
Added: lucene/nutch/trunk/src/plugin/lib-regex-filter/build.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/lib-regex-filter/build.xml?rev=387647&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/lib-regex-filter/build.xml (added)
+++ lucene/nutch/trunk/src/plugin/lib-regex-filter/build.xml Tue Mar 21 14:24:16 2006
@@ -0,0 +1,7 @@
+<?xml version="1.0"?>
+
+<project name="lib-regex-filter" default="jar-core">
+
+ <import file="../build-plugin.xml"/>
+
+</project>
Propchange: lucene/nutch/trunk/src/plugin/lib-regex-filter/build.xml
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/nutch/trunk/src/plugin/lib-regex-filter/plugin.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/lib-regex-filter/plugin.xml?rev=387647&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/lib-regex-filter/plugin.xml (added)
+++ lucene/nutch/trunk/src/plugin/lib-regex-filter/plugin.xml Tue Mar 21 14:24:16 2006
@@ -0,0 +1,17 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ ! A common framework for RegExp based URL filters
+ !-->
+<plugin
+ id="lib-regex-filter"
+ name="Regex URL Filter Framework"
+ version="1.0"
+ provider-name="org.apache.nutch">
+
+ <runtime>
+ <library name="lib-regex-filter.jar">
+ <export name="*"/>
+ </library>
+ </runtime>
+
+</plugin>
Propchange: lucene/nutch/trunk/src/plugin/lib-regex-filter/plugin.xml
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/net/RegexRule.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/net/RegexRule.java?rev=387647&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/net/RegexRule.java (added)
+++ lucene/nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/net/RegexRule.java Tue Mar 21 14:24:16 2006
@@ -0,0 +1,61 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.net;
+
+
+/**
+ * A generic regular expression rule.
+ *
+ * @author Jérôme Charron
+ */
+public abstract class RegexRule {
+
+  /** <code>true</code> to accept matching urls, <code>false</code> to reject them. */
+  private final boolean sign;
+
+  /** The regular expression text this rule was built from. */
+  private final String regex;
+
+  /**
+   * Constructs a new regular expression rule.
+   *
+   * @param sign specifies if this rule must filter-in or filter-out.
+   *        A <code>true</code> value means that any url matching this rule
+   *        must be accepted, a <code>false</code> value means that any url
+   *        matching this rule must be rejected.
+   * @param regex is the regular expression used for matching (see
+   *        {@link #match(String)} method).
+   */
+  protected RegexRule(boolean sign, String regex) {
+    this.sign = sign;
+    this.regex = regex;
+  }
+
+  /**
+   * Tells whether this rule is used for filtering-in or filtering-out.
+   *
+   * @return <code>true</code> if any url matching this rule must be accepted,
+   *         otherwise <code>false</code>.
+   */
+  protected boolean accept() {
+    return sign;
+  }
+
+  /**
+   * Checks if a url matches this rule.
+   *
+   * @param url is the url to check.
+   * @return <code>true</code> if the specified url matches this rule,
+   *         otherwise <code>false</code>.
+   */
+  protected abstract boolean match(String url);
+
+}
+
Propchange: lucene/nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/net/RegexRule.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/net/RegexURLFilterBase.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/net/RegexURLFilterBase.java?rev=387647&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/net/RegexURLFilterBase.java (added)
+++ lucene/nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/net/RegexURLFilterBase.java Tue Mar 21 14:24:16 2006
@@ -0,0 +1,216 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.net;
+
+// JDK imports
+import java.io.Reader;
+import java.io.FileReader;
+import java.io.BufferedReader;
+import java.io.InputStreamReader;
+import java.io.IOException;
+import java.util.List;
+import java.util.ArrayList;
+import java.util.logging.Logger;
+
+// Hadoop imports
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.util.LogFormatter;
+
+
+/**
+ * Generic {@link org.apache.nutch.net.URLFilter URL filter} based on
+ * regular expressions.
+ *
+ * <p>The regular expressions rules are expressed in a file. The file of rules
+ * is provided by each implementation using the
+ * {@link #getRulesFile(Configuration)} method.</p>
+ *
+ * <p>The file contains one rule per line, in the following format:<br/>
+ * <code>
+ * [+-]&lt;regex&gt;
+ * </code><br/>
+ * where plus (<code>+</code>) means go ahead and index it and minus
+ * (<code>-</code>) means no.</p>
+ *
+ * @author Jérôme Charron
+ */
+public abstract class RegexURLFilterBase implements URLFilter {
+
+ /** Logger shared by all instances of this class. */
+ private final static Logger LOG =
+ LogFormatter.getLogger(RegexURLFilterBase.class.getName());
+
+ /**
+ * The applicable rules, in the order in which they are evaluated.
+ * Assigned only by the Reader-based constructor and by setConf.
+ */
+ private RegexRule[] rules;
+
+ /** The configuration last passed to setConf. */
+ private Configuration conf;
+
+
+ /**
+ * Constructs a new RegexURLFilterBase with no rules loaded; the rules are
+ * read later, when a configuration is set on this filter.
+ */
+ public RegexURLFilterBase() { }
+
+ /**
+  * Constructs a new filter and initializes it from a file of rules.
+  * The file is closed again once its rules have been read.
+  *
+  * @param filename is the name of rules file.
+  * @throws IOException if the file cannot be opened or read, or if one of
+  *         its lines starts with a character that is not allowed.
+  */
+ public RegexURLFilterBase(String filename)
+     throws IOException, IllegalArgumentException {
+   Reader in = new FileReader(filename);
+   try {
+     // Same work as the Reader-based constructor, but the reader is kept
+     // in a local so that it can be released afterwards.
+     rules = readRulesFile(in);
+   } finally {
+     in.close();
+   }
+ }
+
+ /**
+  * Builds a filter whose rules are parsed from <code>reader</code>.
+  * {@link #createRule(boolean, String)} is invoked for each rule while this
+  * constructor is still running.
+  *
+  * @param reader is a reader of rules.
+  */
+ protected RegexURLFilterBase(Reader reader)
+     throws IOException, IllegalArgumentException {
+   this.rules = readRulesFile(reader);
+ }
+
+ /**
+ * Creates the {@link RegexRule} for one line of the rules file.
+ * @param sign of the regular expression.
+ * A <code>true</code> value means that any URL matching this rule
+ * must be included, whereas a <code>false</code>
+ * value means that any URL matching this rule must be excluded.
+ * @param regex is the regular expression associated to this rule; it is
+ * the text of the rule line that follows the leading sign character.
+ * @return the rule to evaluate for this line.
+ */
+ protected abstract RegexRule createRule(boolean sign, String regex);
+
+ /**
+ * Returns the name of the file of rules to use for
+ * a particular implementation. The name is resolved as a resource of the
+ * given configuration when the configuration is set on this filter.
+ * @param conf is the current configuration.
+ * @return the name of the file of rules to use.
+ */
+ protected abstract String getRulesFile(Configuration conf);
+
+
+ /* -------------------------- *
+ * <implementation:URLFilter> *
+ * -------------------------- */
+
+ // Inherited Javadoc
+ public synchronized String filter(String url) {
+   if (rules == null) {
+     // No rules were loaded (no configuration set yet, or the rules
+     // resource was missing): behave like an empty rule set instead of
+     // failing with a NullPointerException.
+     return null;
+   }
+   // Rules are tried in order; the first one that matches decides.
+   for (int i = 0; i < rules.length; i++) {
+     if (rules[i].match(url)) {
+       return rules[i].accept() ? url : null;
+     }
+   }
+   // No rule matched: the url is rejected.
+   return null;
+ }
+
+ /* --------------------------- *
+ * </implementation:URLFilter> *
+ * --------------------------- */
+
+
+ /* ----------------------------- *
+ * <implementation:Configurable> *
+ * ----------------------------- */
+
+ /**
+  * Stores the configuration and loads the rules from the resource named by
+  * {@link #getRulesFile(Configuration)}. When that resource cannot be found,
+  * the problem is logged and the rules are left as they were.
+  *
+  * @param conf is the configuration to use.
+  * @throws RuntimeException wrapping the IOException raised while reading
+  *         the rules.
+  */
+ public void setConf(Configuration conf) {
+   this.conf = conf;
+   String file = getRulesFile(conf);
+   Reader reader = conf.getConfResourceAsReader(file);
+   if (reader == null) {
+     LOG.severe("Can't find resource: " + file);
+     return;
+   }
+   try {
+     rules = readRulesFile(reader);
+   } catch (IOException e) {
+     LOG.severe(e.getMessage());
+     //TODO mb@media-style.com: throw Exception? Because broken api.
+     throw new RuntimeException(e.getMessage(), e);
+   } finally {
+     // The reader was opened here, so release it here whatever happened.
+     try {
+       reader.close();
+     } catch (IOException closeFailure) {
+       LOG.warning("Can't close resource: " + file);
+     }
+   }
+ }
+
+ // Returns the configuration most recently handed to setConf().
+ public Configuration getConf() {
+   return conf;
+ }
+
+ /* ------------------------------ *
+ * </implementation:Configurable> *
+ * ------------------------------ */
+
+
+ /**
+  * Reads regular expression rules, one per line, from the given reader.
+  *
+  * <p>Empty lines and lines starting with a space or <code>#</code> are
+  * skipped. Every other line must start with <code>+</code> or
+  * <code>-</code>; the rest of the line is handed to
+  * {@link #createRule(boolean, String)}. The reader is read to its end but
+  * is not closed here.</p>
+  *
+  * @param reader is a reader of regular expressions rules.
+  * @return the corresponding {@link RegexRule rules}, in file order.
+  * @throws IOException if reading fails, or if a line starts with any
+  *         other character.
+  * @throws IllegalArgumentException if {@link #createRule(boolean, String)}
+  *         throws one.
+  */
+ private RegexRule[] readRulesFile(Reader reader)
+     throws IOException, IllegalArgumentException {
+
+   BufferedReader in = new BufferedReader(reader);
+   List parsed = new ArrayList();
+   String line;
+
+   while ((line = in.readLine()) != null) {
+     if (line.length() == 0) {
+       continue;
+     }
+     // readLine() strips line terminators, so a line can never start with
+     // '\n'; only the space and '#' markers need to be skipped.
+     char first = line.charAt(0);
+     if (first == ' ' || first == '#') {
+       continue;
+     }
+     if (first != '+' && first != '-') {
+       throw new IOException("Invalid first character: "+line);
+     }
+
+     boolean sign = (first == '+');
+     String regex = line.substring(1);
+     LOG.fine("Adding rule [" + regex + "]");
+     parsed.add(createRule(sign, regex));
+   }
+   return (RegexRule[]) parsed.toArray(new RegexRule[parsed.size()]);
+ }
+
+ /**
+  * Runs every line of the standard input through a filter and echoes it on
+  * the standard output, prefixed with <code>+</code> when the filter
+  * returned a url and with <code>-</code> when it returned
+  * <code>null</code>.
+  *
+  * @param filter is the RegexURLFilterBase to use for filtering the
+  *        standard input.
+  * @param args some optional parameters (not used).
+  */
+ public static void main(RegexURLFilterBase filter, String args[])
+     throws IOException, IllegalArgumentException {
+
+   BufferedReader stdin = new BufferedReader(new InputStreamReader(System.in));
+   for (String url = stdin.readLine(); url != null; url = stdin.readLine()) {
+     String accepted = filter.filter(url);
+     if (accepted == null) {
+       System.out.println("-" + url);
+     } else {
+       System.out.println("+" + accepted);
+     }
+   }
+ }
+
+}
Propchange: lucene/nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/net/RegexURLFilterBase.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/nutch/trunk/src/plugin/lib-regex-filter/src/test/org/apache/nutch/net/RegexURLFilterBaseTest.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/lib-regex-filter/src/test/org/apache/nutch/net/RegexURLFilterBaseTest.java?rev=387647&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/lib-regex-filter/src/test/org/apache/nutch/net/RegexURLFilterBaseTest.java (added)
+++ lucene/nutch/trunk/src/plugin/lib-regex-filter/src/test/org/apache/nutch/net/RegexURLFilterBaseTest.java Tue Mar 21 14:24:16 2006
@@ -0,0 +1,139 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.net;
+
+// JDK imports
+import java.io.BufferedReader;
+import java.io.FileReader;
+import java.io.IOException;
+import java.io.Reader;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.logging.Logger;
+
+// JUnit imports
+import junit.framework.TestCase;
+
+// Hadoop imports
+import org.apache.hadoop.util.LogFormatter;
+
+
+/**
+ * JUnit based test of class <code>RegexURLFilterBase</code>.
+ *
+ * @author Jérôme Charron
+ */
+public abstract class RegexURLFilterBaseTest extends TestCase {
+
+ /** Logger of this test; the bench timings are reported through it. */
+ protected static final Logger LOG =
+ LogFormatter.getLogger(RegexURLFilterBaseTest.class.getName());
+
+ /** Value of the "file.separator" system property, used to build sample paths. */
+ private final static String SEPARATOR = System.getProperty("file.separator");
+ /** Directory of the sample files: the "test.data" system property, or "." when unset. */
+ private final static String SAMPLES = System.getProperty("test.data", ".");
+
+ /**
+ * Constructs a test case with the given name.
+ * @param testName is passed unchanged to the TestCase constructor.
+ */
+ public RegexURLFilterBaseTest(String testName) {
+ super(testName);
+ }
+
+ /**
+ * Creates the filter to exercise.
+ * @param rules is a reader over the rules to give to the filter; the bench
+ * and test helpers of this class pass the content of a ".rules" sample file.
+ * @return the filter whose answers are checked against the expected ones.
+ */
+ protected abstract URLFilter getURLFilter(Reader rules);
+
+ /**
+  * Benches the filter with the sample files <code>file</code>.rules and
+  * <code>file</code>.urls of the samples directory, closing both files
+  * afterwards. Any exception fails the test.
+  *
+  * @param loops is the number of times the urls are checked.
+  * @param file is the common base name of the two sample files.
+  */
+ protected void bench(int loops, String file) {
+   String base = SAMPLES + SEPARATOR + file;
+   Reader rules = null;
+   Reader urls = null;
+   try {
+     rules = new FileReader(base + ".rules");
+     urls = new FileReader(base + ".urls");
+     bench(loops, rules, urls);
+   } catch (Exception e) {
+     fail(e.toString());
+   } finally {
+     // Best-effort cleanup: a close failure must not hide the test outcome.
+     if (rules != null) {
+       try {
+         rules.close();
+       } catch (IOException ignored) {
+         // nothing useful can be done here
+       }
+     }
+     if (urls != null) {
+       try {
+         urls.close();
+       } catch (IOException ignored) {
+         // nothing useful can be done here
+       }
+     }
+   }
+ }
+
+ protected void bench(int loops, Reader rules, Reader urls) {
+ long start = System.currentTimeMillis();
+ try {
+ URLFilter filter = getURLFilter(rules);
+ FilteredURL[] expected = readURLFile(urls);
+ for (int i=0; i<loops; i++) {
+ test(filter, expected);
+ }
+ } catch (Exception e) {
+ fail(e.toString());
+ }
+ LOG.info("bench time (" + loops + ") " +
+ (System.currentTimeMillis()-start) + "ms");
+ }
+
+ /**
+  * Tests the filter with the sample files <code>file</code>.rules and
+  * <code>file</code>.urls of the samples directory, closing both files
+  * afterwards. Any exception fails the test.
+  *
+  * @param file is the common base name of the two sample files.
+  */
+ protected void test(String file) {
+   String base = SAMPLES + SEPARATOR + file;
+   Reader rules = null;
+   Reader urls = null;
+   try {
+     rules = new FileReader(base + ".rules");
+     urls = new FileReader(base + ".urls");
+     test(rules, urls);
+   } catch (Exception e) {
+     fail(e.toString());
+   } finally {
+     // Best-effort cleanup: a close failure must not hide the test outcome.
+     if (rules != null) {
+       try {
+         rules.close();
+       } catch (IOException ignored) {
+         // nothing useful can be done here
+       }
+     }
+     if (urls != null) {
+       try {
+         urls.close();
+       } catch (IOException ignored) {
+         // nothing useful can be done here
+       }
+     }
+   }
+ }
+
+ /**
+  * Builds the filter from <code>rules</code>, reads the expectations from
+  * <code>urls</code> and checks the former against the latter. Any
+  * exception fails the test.
+  */
+ protected void test(Reader rules, Reader urls) {
+   try {
+     URLFilter filter = getURLFilter(rules);
+     FilteredURL[] expected = readURLFile(urls);
+     test(filter, expected);
+   } catch (Exception e) {
+     fail(e.toString());
+   }
+ }
+
+ /**
+  * Asserts, for every expected entry, that the filter keeps the url exactly
+  * when the entry is marked as accepted. The url is used as the assertion
+  * message.
+  */
+ protected void test(URLFilter filter, FilteredURL[] expected) {
+   for (int i = 0; i < expected.length; i++) {
+     FilteredURL sample = expected[i];
+     boolean accepted = (filter.filter(sample.url) != null);
+     if (accepted) {
+       assertTrue(sample.url, sample.sign);
+     } else {
+       assertFalse(sample.url, sample.sign);
+     }
+   }
+ }
+
+ /**
+  * Reads the expected results, one per non-empty line, from the given
+  * reader. The reader is read to its end and is not closed here.
+  */
+ private static FilteredURL[] readURLFile(Reader reader) throws IOException {
+   List samples = new ArrayList();
+   BufferedReader in = new BufferedReader(reader);
+   for (String line = in.readLine(); line != null; line = in.readLine()) {
+     if (line.length() == 0) {
+       continue;
+     }
+     samples.add(new FilteredURL(line));
+   }
+   FilteredURL[] result = new FilteredURL[samples.size()];
+   samples.toArray(result);
+   return result;
+ }
+
+ /**
+  * One expected result: a url (the line without its first character) and
+  * whether the filter is expected to accept it.
+  */
+ private static class FilteredURL {
+
+   boolean sign;
+   String url;
+
+   FilteredURL(String line) {
+     // Only '+' marks an accepted url; '-' and, deliberately, any other
+     // first character leave the url marked as rejected.
+     sign = (line.charAt(0) == '+');
+     url = line.substring(1);
+   }
+ }
+
+}
Propchange: lucene/nutch/trunk/src/plugin/lib-regex-filter/src/test/org/apache/nutch/net/RegexURLFilterBaseTest.java
------------------------------------------------------------------------------
svn:eol-style = native