You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by je...@apache.org on 2006/03/21 23:24:19 UTC

svn commit: r387647 - in /lucene/nutch/trunk/src/plugin/lib-regex-filter: ./ sample/ src/ src/java/ src/java/org/ src/java/org/apache/ src/java/org/apache/nutch/ src/java/org/apache/nutch/net/ src/test/ src/test/org/ src/test/org/apache/ src/test/org/a...

Author: jerome
Date: Tue Mar 21 14:24:16 2006
New Revision: 387647

URL: http://svn.apache.org/viewcvs?rev=387647&view=rev
Log:
Add a mini framework plugin for regex url filter plugins.

Added:
    lucene/nutch/trunk/src/plugin/lib-regex-filter/
    lucene/nutch/trunk/src/plugin/lib-regex-filter/build.xml   (with props)
    lucene/nutch/trunk/src/plugin/lib-regex-filter/plugin.xml   (with props)
    lucene/nutch/trunk/src/plugin/lib-regex-filter/sample/
    lucene/nutch/trunk/src/plugin/lib-regex-filter/src/
    lucene/nutch/trunk/src/plugin/lib-regex-filter/src/java/
    lucene/nutch/trunk/src/plugin/lib-regex-filter/src/java/org/
    lucene/nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/
    lucene/nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/
    lucene/nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/net/
    lucene/nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/net/RegexRule.java   (with props)
    lucene/nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/net/RegexURLFilterBase.java   (with props)
    lucene/nutch/trunk/src/plugin/lib-regex-filter/src/test/
    lucene/nutch/trunk/src/plugin/lib-regex-filter/src/test/org/
    lucene/nutch/trunk/src/plugin/lib-regex-filter/src/test/org/apache/
    lucene/nutch/trunk/src/plugin/lib-regex-filter/src/test/org/apache/nutch/
    lucene/nutch/trunk/src/plugin/lib-regex-filter/src/test/org/apache/nutch/net/
    lucene/nutch/trunk/src/plugin/lib-regex-filter/src/test/org/apache/nutch/net/RegexURLFilterBaseTest.java   (with props)

Added: lucene/nutch/trunk/src/plugin/lib-regex-filter/build.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/lib-regex-filter/build.xml?rev=387647&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/lib-regex-filter/build.xml (added)
+++ lucene/nutch/trunk/src/plugin/lib-regex-filter/build.xml Tue Mar 21 14:24:16 2006
@@ -0,0 +1,7 @@
+<?xml version="1.0"?>
+
+<project name="lib-regex-filter" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+</project>

Propchange: lucene/nutch/trunk/src/plugin/lib-regex-filter/build.xml
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/nutch/trunk/src/plugin/lib-regex-filter/plugin.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/lib-regex-filter/plugin.xml?rev=387647&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/lib-regex-filter/plugin.xml (added)
+++ lucene/nutch/trunk/src/plugin/lib-regex-filter/plugin.xml Tue Mar 21 14:24:16 2006
@@ -0,0 +1,17 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ ! A common framework for RegExp based URL filters
+ !-->
+<plugin
+   id="lib-regex-filter"
+   name="Regex URL Filter Framework"
+   version="1.0"
+   provider-name="org.apache.nutch">
+
+   <runtime>
+     <library name="lib-regex-filter.jar">
+        <export name="*"/>
+     </library>
+   </runtime>
+
+</plugin>

Propchange: lucene/nutch/trunk/src/plugin/lib-regex-filter/plugin.xml
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/net/RegexRule.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/net/RegexRule.java?rev=387647&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/net/RegexRule.java (added)
+++ lucene/nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/net/RegexRule.java Tue Mar 21 14:24:16 2006
@@ -0,0 +1,61 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.net;
+
+
+/**
+ * A generic regular expression rule.
+ *
+ * @author J&eacute;r&ocirc;me Charron
+ */
+public abstract class RegexRule {
+
+  /** Whether urls matching this rule are accepted or rejected. */
+  private final boolean sign;
+  /** The regular expression given at construction time. */
+  private final String regex;
+
+  /**
+   * Constructs a new regular expression rule.
+   * @param sign specifies if this rule must filter-in or filter-out.
+   *        A <code>true</code> value means that any url matching this rule
+   *        must be accepted, a <code>false</code> value means that any url
+   *        matching this rule must be rejected.
+   * @param regex is the regular expression used for matching (see
+   *        {@link #match(String)} method).
+   */
+  protected RegexRule(boolean sign, String regex) {
+    this.sign = sign;
+    this.regex = regex;
+  }
+
+  /**
+   * Tells whether this rule is used for filtering-in or filtering-out.
+   * @return <code>true</code> if any url matching this rule must be accepted,
+   *         otherwise <code>false</code>.
+   */
+  protected boolean accept() { return sign; }
+
+  /**
+   * Checks if a url matches this rule.
+   * @param url is the url to check.
+   * @return <code>true</code> if the specified url matches this rule,
+   *         otherwise <code>false</code>.
+   */
+  protected abstract boolean match(String url);
+
+}
+

Propchange: lucene/nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/net/RegexRule.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/net/RegexURLFilterBase.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/net/RegexURLFilterBase.java?rev=387647&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/net/RegexURLFilterBase.java (added)
+++ lucene/nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/net/RegexURLFilterBase.java Tue Mar 21 14:24:16 2006
@@ -0,0 +1,216 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.net;
+
+// JDK imports
+import java.io.Reader;
+import java.io.FileReader;
+import java.io.BufferedReader;
+import java.io.InputStreamReader;
+import java.io.IOException;
+import java.util.List;
+import java.util.ArrayList;
+import java.util.logging.Logger;
+
+// Hadoop imports
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.util.LogFormatter;
+
+
+/**
+ * Generic {@link org.apache.nutch.net.URLFilter URL filter} based on
+ * regular expressions.
+ *
+ * <p>The regular expressions rules are expressed in a file. The file of rules
+ * is provided by each implementation using the
+ * {@link #getRulesFile(Configuration)} method.</p>
+ * 
+ * <p>The format of this file is made of many rules (one per line):<br/>
+ * <code>
+ * [+-]&lt;regex&gt;
+ * </code><br/>
+ * where plus (<code>+</code>) means go ahead and index it and minus
+ * (<code>-</code>) means no.</p>
+ *
+ * @author J&eacute;r&ocirc;me Charron
+ */
+public abstract class RegexURLFilterBase implements URLFilter {
+
+  /** My logger */
+  private final static Logger LOG =
+    LogFormatter.getLogger(RegexURLFilterBase.class.getName());
+
+  /** An array of applicable rules */
+  private RegexRule[] rules;
+
+  /** The current configuration */
+  private Configuration conf;
+
+
+  /**
+   * Constructs a new empty RegexURLFilterBase
+   */
+  public RegexURLFilterBase() { }
+
+  /**
+   * Constructs a new RegexURLFilterBase and initializes it with a file of rules.
+   * @param filename is the name of rules file.
+   */
+  public RegexURLFilterBase(String filename)
+    throws IOException, IllegalArgumentException {
+    this(new FileReader(filename));
+  }
+
+  /**
+   * Constructs a new RegexURLFilterBase and initializes it with a Reader of rules.
+   * @param reader is a reader of rules.
+   */
+  protected RegexURLFilterBase(Reader reader)
+    throws IOException, IllegalArgumentException {
+    rules = readRulesFile(reader);
+  }
+  
+  /**
+   * Creates a new {@link RegexRule}.
+   * @param sign of the regular expression.
+   *        A <code>true</code> value means that any URL matching this rule
+   *        must be included, whereas a <code>false</code>
+   *        value means that any URL matching this rule must be excluded.
+   * @param regex is the regular expression associated to this rule.
+   */
+  protected abstract RegexRule createRule(boolean sign, String regex);
+  
+  /**
+   * Returns the name of the file of rules to use for
+   * a particular implementation.
+   * @param conf is the current configuration.
+   * @return the name of the file of rules to use.
+   */
+  protected abstract String getRulesFile(Configuration conf);
+  
+  
+  /* -------------------------- *
+   * <implementation:URLFilter> *
+   * -------------------------- */
+  
+  // Inherited Javadoc. Rejects every url when no rules were loaded (rules
+  // stays null when setConf(Configuration) cannot find the rules resource).
+  public synchronized String filter(String url) {
+    if (rules == null) { return null; }
+    for (int i=0; i<rules.length; i++) {
+      if (rules[i].match(url)) { return rules[i].accept() ? url : null; }
+    }
+    return null;
+  }
+
+  /* --------------------------- *
+   * </implementation:URLFilter> *
+   * --------------------------- */
+  
+  
+  /* ----------------------------- *
+   * <implementation:Configurable> *
+   * ----------------------------- */
+  
+  /** Stores conf, then loads the rules resource; a missing one is only logged. */
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+    final String rulesFile = getRulesFile(conf);
+    final Reader rulesReader = conf.getConfResourceAsReader(rulesFile);
+    if (rulesReader == null) {
+      LOG.severe("Can't find resource: " + rulesFile);
+      return;
+    }
+    try {
+      rules = readRulesFile(rulesReader);
+    } catch (IOException ioe) {
+      LOG.severe(ioe.getMessage());
+      throw new RuntimeException(ioe.getMessage(), ioe);   // wrapped: no checked exception declared here
+    }
+  }
+
+  public Configuration getConf() {   // the configuration last passed to setConf, if any
+    return this.conf;
+  }
+  
+  /* ------------------------------ *
+   * </implementation:Configurable> *
+   * ------------------------------ */
+  
+
+  /**
+   * Reads the rules supplied by the specified reader, one rule per line.
+   * Empty lines and lines starting with a space or <code>#</code> are
+   * skipped. The reader is closed once it has been consumed.
+   * @param reader is a reader of regular expressions rules.
+   * @return the corresponding {@link RegexRule rules}, in reading order.
+   * @throws IOException if the rules cannot be read, or if a line starts
+   *         with anything but <code>+</code>, <code>-</code>, space or #.
+   */
+  private RegexRule[] readRulesFile(Reader reader)
+    throws IOException, IllegalArgumentException {
+
+    BufferedReader in = new BufferedReader(reader);
+    List rules = new ArrayList();
+    try {
+      String line;
+      while ((line = in.readLine()) != null) {
+        // readLine() strips the line terminator: a blank line is empty
+        if (line.length() == 0) { continue; }
+        boolean sign = false;
+        switch (line.charAt(0)) {
+        case '+' : sign = true;  break;
+        case '-' : sign = false; break;
+        case ' ' : case '#' :                 // skip blank & comment lines
+          continue;
+        default :
+          throw new IOException("Invalid first character: "+line);
+        }
+
+        String regex = line.substring(1);
+        LOG.fine("Adding rule [" + regex + "]");
+        rules.add(createRule(sign, regex));
+      }
+    } finally {
+      in.close();   // also closes the reader handed in by the caller
+    }
+    return (RegexRule[]) rules.toArray(new RegexRule[rules.size()]);
+  }
+
+  /**
+   * Filters the standard input using a RegexURLFilterBase.
+   *
+   * <p>Each line read is written back to the standard output, prefixed by
+   * <code>+</code> when the filter returns a url for it, or by
+   * <code>-</code> when the filter returns <code>null</code>.</p>
+   * @param filter is the RegexURLFilterBase to use for filtering the
+   *        standard input.
+   * @param args some optional parameters (not used).
+   */
+  public static void main(RegexURLFilterBase filter, String args[])
+    throws IOException, IllegalArgumentException {
+
+    BufferedReader stdin = new BufferedReader(new InputStreamReader(System.in));
+    for (String url = stdin.readLine(); url != null; url = stdin.readLine()) {
+      String kept = filter.filter(url);
+      boolean accepted = (kept != null);
+      // accepted urls are echoed as returned by the filter, others as read
+      System.out.print(accepted ? "+" : "-");
+      System.out.println(accepted ? kept : url);
+    }
+  }
+
+}

Propchange: lucene/nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/net/RegexURLFilterBase.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/nutch/trunk/src/plugin/lib-regex-filter/src/test/org/apache/nutch/net/RegexURLFilterBaseTest.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/lib-regex-filter/src/test/org/apache/nutch/net/RegexURLFilterBaseTest.java?rev=387647&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/lib-regex-filter/src/test/org/apache/nutch/net/RegexURLFilterBaseTest.java (added)
+++ lucene/nutch/trunk/src/plugin/lib-regex-filter/src/test/org/apache/nutch/net/RegexURLFilterBaseTest.java Tue Mar 21 14:24:16 2006
@@ -0,0 +1,139 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.net;
+
+// JDK imports
+import java.io.BufferedReader;
+import java.io.FileReader;
+import java.io.IOException;
+import java.io.Reader;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.logging.Logger;
+
+// JUnit imports
+import junit.framework.TestCase;
+
+// Hadoop imports
+import org.apache.hadoop.util.LogFormatter;
+
+
+/**
+ * JUnit based test of class <code>RegexURLFilterBase</code>.
+ *
+ * @author J&eacute;r&ocirc;me Charron
+ */
+public abstract class RegexURLFilterBaseTest extends TestCase {
+  
+  /** My logger */
+  protected static final Logger LOG =
+    LogFormatter.getLogger(RegexURLFilterBaseTest.class.getName());  
+
+  private final static String SEPARATOR = System.getProperty("file.separator");  
+  private final static String SAMPLES = System.getProperty("test.data", ".");
+  
+  public RegexURLFilterBaseTest(String testName) {
+    super(testName);
+  }
+  
+  protected abstract URLFilter getURLFilter(Reader rules);
+
+  protected void bench(int loops, String file) {   // file: base name of the sample pair
+    try {
+      bench(loops,
+            new FileReader(SAMPLES + SEPARATOR + file + ".rules"),   // rules to load
+            new FileReader(SAMPLES + SEPARATOR + file + ".urls"));   // expected results
+    } catch (Exception e) {
+      fail(e.toString());   // e.g. a sample file that cannot be opened
+    }
+  }
+  
+  protected void bench(int loops, Reader rules, Reader urls) {
+    long start = System.currentTimeMillis();   // timing includes reading both inputs
+    try {
+      URLFilter filter = getURLFilter(rules);
+      FilteredURL[] expected = readURLFile(urls);
+      for (int i=0; i<loops; i++) {   // replays the same expectations loops times
+        test(filter, expected);
+      }
+    } catch (Exception e) {
+      fail(e.toString());   // any exception is reported as a test failure
+    }
+    LOG.info("bench time (" + loops + ") " +
+             (System.currentTimeMillis()-start) + "ms");   // elapsed time since start
+  }
+  
+  protected void test(String file) {   // file: base name of the sample pair
+    try {
+      test(new FileReader(SAMPLES + SEPARATOR + file + ".rules"),   // rules to load
+           new FileReader(SAMPLES + SEPARATOR + file + ".urls"));   // expected results
+    } catch (Exception e) {
+      fail(e.toString());   // e.g. a sample file that cannot be opened
+    }
+  }
+  
+  protected void test(Reader rules, Reader urls) {
+    try {
+      test(getURLFilter(rules), readURLFile(urls));   // build the filter, then check each url
+    } catch (Exception e) {
+      fail(e.toString());   // any exception is reported as a test failure
+    }
+  }
+  
+  /** Asserts that the filter lets through exactly the urls whose sign is true. */
+  protected void test(URLFilter filter, FilteredURL[] expected) {
+    for (int idx = 0; idx < expected.length; idx++) {
+      FilteredURL sample = expected[idx];
+      boolean accepted = (filter.filter(sample.url) != null);
+      // the url doubles as the assertion message
+      if (accepted) { assertTrue(sample.url, sample.sign); }
+      else          { assertFalse(sample.url, sample.sign); }
+    }
+  }
+  
+  /** Reads the non-empty lines of reader as expected results, then closes it. */
+  private static FilteredURL[] readURLFile(Reader reader) throws IOException {
+    BufferedReader in = new BufferedReader(reader);
+    List list = new ArrayList();
+    try {
+      for (String line = in.readLine(); line != null; line = in.readLine()) {
+        if (line.length() != 0) { list.add(new FilteredURL(line)); }
+      }
+    } finally { in.close(); }   // the callers never close the readers they open
+    return (FilteredURL[]) list.toArray(new FilteredURL[list.size()]);
+  }
+    
+  /**
+   * One line of a sample <code>.urls</code> file: an expected outcome
+   * followed by the url it applies to.
+   */
+  private static class FilteredURL {
+    /** <code>true</code> only when the line starts with <code>+</code>. */
+    boolean sign;
+    /** The line without its first character. */
+    String url;
+
+    /**
+     * @param line a non-empty line; any first character other than
+     *        <code>+</code> leaves the sign to <code>false</code>.
+     */
+    FilteredURL(String line) {
+      sign = (line.charAt(0) == '+');
+      url = line.substring(1);
+    }
+  }
+  
+}

Propchange: lucene/nutch/trunk/src/plugin/lib-regex-filter/src/test/org/apache/nutch/net/RegexURLFilterBaseTest.java
------------------------------------------------------------------------------
    svn:eol-style = native