You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by si...@apache.org on 2007/03/11 22:18:24 UTC
svn commit: r517015 - in /lucene/nutch/trunk: ./ lib/ src/java/org/apache/nutch/parse/ src/plugin/ src/plugin/index-more/src/java/org/apache/nutch/indexer/more/ src/plugin/index-more/src/test/ src/plugin/ontology/lib/ src/plugin/parse-js/src/java/org/a...

Author: siren
Date: Sun Mar 11 14:18:23 2007
New Revision: 517015

URL: http://svn.apache.org/viewvc?view=rev&rev=517015
Log:
merging 517012:516728 excluding changes made by dennis



Added:
    lucene/nutch/trunk/lib/commons-logging-api-1.0.4.jar
      - copied unchanged from r516728, lucene/nutch/trunk/lib/commons-logging-api-1.0.4.jar
    lucene/nutch/trunk/lib/jakarta-oro-2.0.7.jar
      - copied unchanged from r516728, lucene/nutch/trunk/lib/jakarta-oro-2.0.7.jar
    lucene/nutch/trunk/src/plugin/ontology/lib/commons-logging-1.0.3.LICENSE.txt
      - copied unchanged from r516728, lucene/nutch/trunk/src/plugin/ontology/lib/commons-logging-1.0.3.LICENSE.txt
    lucene/nutch/trunk/src/plugin/ontology/lib/commons-logging-1.0.3.jar
      - copied unchanged from r516728, lucene/nutch/trunk/src/plugin/ontology/lib/commons-logging-1.0.3.jar
    lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/package.html
      - copied unchanged from r516728, lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/package.html
Removed:
    lucene/nutch/trunk/src/plugin/index-more/src/test/
    lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/package.html
    lucene/nutch/trunk/src/plugin/parse-js/src/test/
Modified:
    lucene/nutch/trunk/CHANGES.txt
    lucene/nutch/trunk/build.xml
    lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java
    lucene/nutch/trunk/src/plugin/build.xml
    lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
    lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java
    lucene/nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
    lucene/nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=517015&r1=517014&r2=517015
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Sun Mar 11 14:18:23 2007
@@ -158,18 +158,11 @@
 53. NUTCH-384 - Protocol-file plugin does not allow the parse plugins
     framework to operate properly (Heiko Dietze via mattmann)
 
-54. Change OutlinkExtractor to use Regular Expressions from JRE (siren)
-
-55. NUTCH-233 - Wrong regular expression hangs reduce process forever (Stefan
+54. NUTCH-233 - Wrong regular expression hangs reduce process forever (Stefan
     Groschupf via kubes)
     
-56. NUTCH-436 - Incorrect handling of relative paths when the embedded URL 
-	path is empty (kubes)
-	
-57. Replace oro with jre regular expressions in plugins, remove oro from
-    dependencies (siren)
-
-58. Remove redundant commons logging jars (siren)
+55. NUTCH-436 - Incorrect handling of relative paths when the embedded URL 
+    path is empty (kubes)
 	
 Release 0.8 - 2006-07-25
 

Modified: lucene/nutch/trunk/build.xml
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/build.xml?view=diff&rev=517015&r1=517014&r2=517015
==============================================================================
--- lucene/nutch/trunk/build.xml (original)
+++ lucene/nutch/trunk/build.xml Sun Mar 11 14:18:23 2007
@@ -148,20 +148,8 @@
     <jar jarfile="${build.dir}/${final.name}.job">
       <zipfileset dir="${build.classes}"/>
       <zipfileset dir="${conf.dir}" excludes="*.template"/>
-      <zipfileset dir="${lib.dir}" prefix="lib" includes="**/*.jar">
-        <exclude name="hadoop-*.jar"/>
-        <exclude name="servlet-*.jar"/>
-        <exclude name="junit*.jar"/>
-        <exclude name="jetty-*.jar"/>
-        <exclude name="pmd-ext/*"/>
-        <exclude name="jetty-ext/*"/>
-        <exclude name="jets3t.jar"/>
-        <exclude name="taglib*.jar"/>
-        <exclude name="commons-cli*.jar"/>
-        <exclude name="xerces-*.jar"/>
-        <exclude name="commons-logging-1.0.4.jar"/>
-        <exclude name="log4j-1.2.13.jar"/>
-      </zipfileset>
+      <zipfileset dir="${lib.dir}" prefix="lib"
+                  includes="**/*.jar" excludes="hadoop-*.jar"/>
       <zipfileset dir="${build.plugins}" prefix="plugins"/>
     </jar>
   </target>

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java?view=diff&rev=517015&r1=517014&r2=517015
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java Sun Mar 11 14:18:23 2007
@@ -1,4 +1,4 @@
-/*
+/**
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,21 +14,28 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+
 package org.apache.nutch.parse;
 
 import java.net.MalformedURLException;
 import java.util.ArrayList;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
+import java.util.List;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 
 import org.apache.hadoop.conf.Configuration;
+import org.apache.oro.text.regex.MatchResult;
+import org.apache.oro.text.regex.Pattern;
+import org.apache.oro.text.regex.PatternCompiler;
+import org.apache.oro.text.regex.PatternMatcher;
+import org.apache.oro.text.regex.PatternMatcherInput;
+import org.apache.oro.text.regex.Perl5Compiler;
+import org.apache.oro.text.regex.Perl5Matcher;
 
 /**
- * Extractor to extract {@link org.apache.nutch.parse.Outlink}s / URLs from
- * plain text using Regular Expressions.
+ * Extractor to extract {@link org.apache.nutch.parse.Outlink}s 
+ * / URLs from plain text using Regular Expressions.
  * 
  * @see <a
  *      href="http://wiki.java.net/bin/view/Javapedia/RegularExpressions">Comparison
@@ -37,14 +44,12 @@
  *      </a>
  * 
  * @author Stephan Strittmatter - http://www.sybit.de
- *
+ * @version 1.0
  * @since 0.7
  */
 public class OutlinkExtractor {
   private static final Log LOG = LogFactory.getLog(OutlinkExtractor.class);
 
-  private static final Outlink[] NO_LINKS = new Outlink[0];
-
   /**
    * Regex pattern to get URLs within a plain text.
    * 
@@ -52,63 +57,190 @@
    *      href="http://www.truerwords.net/articles/ut/urlactivation.html">http://www.truerwords.net/articles/ut/urlactivation.html
    *      </a>
    */
-  private static final String URL_PATTERN = "([A-Za-z][A-Za-z0-9+.-]{1,120}:[A-Za-z0-9/](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2}){1,333}(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*,;/?:@&~=%-]{0,1000}))?)";
-
-  static final Pattern urlPattern = Pattern.compile(URL_PATTERN);
+  private static final String URL_PATTERN = 
+    "([A-Za-z][A-Za-z0-9+.-]{1,120}:[A-Za-z0-9/](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2}){1,333}(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*,;/?:@&~=%-]{0,1000}))?)";
 
   /**
-   * Extracts outlinks from a plain text.
-   * </p>
-   * @param plainText
+   * Extracts <code>Outlink</code> from given plain text.
+   * Applying this method to non-plain-text can result in extremely lengthy
+   * runtimes for parasitic cases (postscript is a known example).
+   * @param plainText  the plain text from which URLs should be extracted.
    * 
-   * @return Array of <code>Outlink</code> s within found in plainText
+   * @return Array of <code>Outlink</code>s found in plainText
    */
-  public static Outlink[] getOutlinks(final String plainText, Configuration conf){
-    return getOutlinks(plainText, null, conf);
+  public static Outlink[] getOutlinks(final String plainText, Configuration conf) {
+    return OutlinkExtractor.getOutlinks(plainText, "", conf);
   }
 
-  
   /**
-   * Extracts outlinks from a plain text.
-   * </p>
-   * @param plainText text to extract urls from
+   * Extracts <code>Outlink</code> from given plain text and adds anchor
+   * to the extracted <code>Outlink</code>s
+   * 
+   * @param plainText the plain text from which URLs should be extracted.
+   * @param anchor    the anchor of the url
    * 
-   * @return Array of <code>Outlink</code> s found in plainText
+   * @return Array of <code>Outlink</code>s found in plainText
    */
-  public static Outlink[] getOutlinks(final String plainText, String anchor,
-      Configuration conf) {
-    
-    if(plainText == null){
-      return NO_LINKS;
-    }
-
-    final ArrayList<Outlink> outlinks = new ArrayList<Outlink>();
-    Outlink[] retval;
-    Outlink link;
-
-    Matcher m = urlPattern.matcher(plainText);
-    while (m.find()) {
-
-      try {
-        link = new Outlink(m.toMatchResult().group(), anchor, conf);
-        outlinks.add(link);
-      } catch (MalformedURLException ex) {
-        // if it is a malformed URL we just throw it away and continue with
-        // extraction.
-        if (LOG.isDebugEnabled()) {
-          LOG.debug("extracted malformed url:" + m.toMatchResult().group(), ex);
+  public static Outlink[] getOutlinks(final String plainText, String anchor, Configuration conf) {
+    long start = System.currentTimeMillis();
+    final List outlinks = new ArrayList();
+
+    try {
+      final PatternCompiler cp = new Perl5Compiler();
+      final Pattern pattern = cp.compile(URL_PATTERN,
+          Perl5Compiler.CASE_INSENSITIVE_MASK | Perl5Compiler.READ_ONLY_MASK
+              | Perl5Compiler.MULTILINE_MASK);
+      final PatternMatcher matcher = new Perl5Matcher();
+
+      final PatternMatcherInput input = new PatternMatcherInput(plainText);
+
+      MatchResult result;
+      String url;
+
+      //loop the matches
+      while (matcher.contains(input, pattern)) {
+        // if this is taking too long, stop matching
+        //   (SHOULD really check cpu time used so that heavily loaded systems
+        //   do not unnecessarily hit this limit.)
+        if (System.currentTimeMillis() - start >= 60000L) {
+          if (LOG.isWarnEnabled()) {
+            LOG.warn("Time limit exceeded for getOutLinks");
+          }
+          break;
+        }
+        result = matcher.getMatch();
+        url = result.group(0);
+        try {
+          Outlink outlink = new Outlink(url, anchor, conf);
+          outlinks.add(new Outlink(url, anchor, conf));
+        } catch (MalformedURLException mue) {
+          LOG.warn("Invalid url: '" + url + "', skipping.");
         }
       }
-
+    } catch (Exception ex) {
+      // if the matcher fails (perhaps a malformed URL) we just log it and move on
+      if (LOG.isErrorEnabled()) { LOG.error("getOutlinks", ex); }
     }
 
-    if (outlinks.size() > 0) {
-      retval = outlinks.toArray(new Outlink[outlinks.size()]);
+    final Outlink[] retval;
+
+    //create array of the Outlinks
+    if (outlinks != null && outlinks.size() > 0) {
+      retval = (Outlink[]) outlinks.toArray(new Outlink[0]);
     } else {
-      retval = NO_LINKS;
+      retval = new Outlink[0];
     }
 
     return retval;
   }
+  
 
+  /**
+   * Extracts outlinks from a plain text. <br />
+   * This Method takes the Jakarta Regexp API.
+   * 
+   * @param plainText
+   * 
+   * @return Array of <code>Outlink</code> s within found in plainText
+   * @deprecated only for tests
+   */
+  private Outlink[] getOutlinksJakartaRegexpImpl(final String plainText) {
+
+    throw new UnsupportedOperationException(
+        "Implementation commented out. Please uncomment to use it.");
+
+    // final List outlinks = new ArrayList();
+    // String url;
+    // Outlink link;
+    //
+    // RE re = new RE(URL_PATTERN);
+    //
+    // int pos = 0;
+    //
+    // while (re.match(plainText, pos)) {
+    //
+    // url = re.getParen(0);
+    //
+    // if (LOG.isTraceEnabled()) {
+    //   LOG.trace("Extracted url: " + url);
+    // }
+    //
+    // try {
+    //
+    // link = new Outlink(url, null);
+    // outlinks.add(link);
+    //
+    // } catch (MalformedURLException ex) {
+    // // if it is a malformed URL we just throw it away and continue with
+    // // extraction.
+    // if (LOG.isErrorEnabled()) { LOG.error("getOutlinks", ex); }
+    // }
+    //
+    // pos = re.getParenEnd(0);
+    // }
+    //
+    // final Outlink[] retval;
+    //
+    // if (pos > 0) {
+    // retval = (Outlink[]) outlinks.toArray(new Outlink[0]);
+    // } else {
+    // retval = new Outlink[0];
+    // }
+    //
+    // return retval;
+
+  }
+
+  /**
+   * Extracts outlinks from a plain text.
+   * </p>
+   * This Method takes the JDK5 Regexp API.
+   * 
+   * @param plainText
+   * 
+   * @return Array of <code>Outlink</code> s within found in plainText
+   * @deprecated only for tests
+   */
+  private Outlink[] getOutlinksJDK5Impl(final String plainText) {
+
+    throw new UnsupportedOperationException(
+        "Implementation commented out. Please uncomment to use it.");
+
+    // final List outlinks = new ArrayList();
+    // String url;
+    // Outlink link;
+    //
+    // final Pattern urlPattern = Pattern.compile(URL_PATTERN);
+    // final RE re = new RE(urlPattern);
+    //
+    // int pos = 0;
+    //
+    // while (re.match(plainText, pos)) {
+    //
+    // url = re.getParen(0);
+    //
+    // try {
+    //
+    // link = new Outlink(url, null);
+    // outlinks.add(link);
+    // } catch (MalformedURLException ex) {
+    // // if it is a malformed URL we just throw it away and continue with
+    // // extraction.
+    // if (LOG.isErrorEnabled()) { LOG.error("getOutlinks", ex); }
+    // }
+    //
+    // pos = re.getParenEnd(0);
+    // }
+    //
+    // final Outlink[] retval;
+    //
+    // if (pos > 0) {
+    // retval = (Outlink[]) outlinks.toArray(new Outlink[0]);
+    // } else {
+    // retval = new Outlink[0];
+    // }
+    //
+    // return retval;
+  }
+ 
 }

Modified: lucene/nutch/trunk/src/plugin/build.xml
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/build.xml?view=diff&rev=517015&r1=517014&r2=517015
==============================================================================
--- lucene/nutch/trunk/src/plugin/build.xml (original)
+++ lucene/nutch/trunk/src/plugin/build.xml Sun Mar 11 14:18:23 2007
@@ -83,13 +83,11 @@
   <target name="test">
     <parallel threadCount="2">
      <ant dir="creativecommons" target="test"/>
-     <ant dir="index-more" target="test"/>
      <ant dir="languageidentifier" target="test"/>
      <ant dir="lib-http" target="test"/>
      <ant dir="ontology" target="test"/>
      <!--ant dir="parse-ext" target="test"/-->
      <ant dir="parse-html" target="test"/>
-     <ant dir="parse-js" target="test"/>
      <!-- <ant dir="parse-mp3" target="test"/> -->
      <ant dir="parse-msexcel" target="test"/>
      <ant dir="parse-mspowerpoint" target="test"/>

Modified: lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java?view=diff&rev=517015&r1=517014&r2=517015
==============================================================================
--- lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java (original)
+++ lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java Sun Mar 11 14:18:23 2007
@@ -16,6 +16,14 @@
  */
 package org.apache.nutch.indexer.more;
 
+
+import org.apache.oro.text.regex.Perl5Compiler;
+import org.apache.oro.text.regex.Perl5Matcher;
+import org.apache.oro.text.regex.Perl5Pattern;
+import org.apache.oro.text.regex.PatternMatcher;
+import org.apache.oro.text.regex.MatchResult;
+import org.apache.oro.text.regex.MalformedPatternException;
+
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 
@@ -48,8 +56,6 @@
 
 import java.util.Date;
 import java.util.TimeZone;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
 
 import org.apache.commons.lang.time.DateUtils;
 
@@ -238,15 +244,21 @@
   // Patterns used to extract filename from possible non-standard
   // HTTP header "Content-Disposition". Typically it looks like:
   // Content-Disposition: inline; filename="foo.ppt"
+  private PatternMatcher matcher = new Perl5Matcher();
 
   private Configuration conf;
-  static Pattern patterns[] = new Pattern[2];
+  static Perl5Pattern patterns[] = {null, null};
   static {
+    Perl5Compiler compiler = new Perl5Compiler();
+    try {
       // order here is important
       patterns[0] =
-        Pattern.compile("\\bfilename=['\"](.+)['\"]");
+        (Perl5Pattern) compiler.compile("\\bfilename=['\"](.+)['\"]");
       patterns[1] =
-        Pattern.compile("\\bfilename=(\\S+)\\b");
+        (Perl5Pattern) compiler.compile("\\bfilename=(\\S+)\\b");
+    } catch (MalformedPatternException e) {
+      // just ignore
+    }
   }
 
   private Document resetTitle(Document doc, ParseData data, String url) {
@@ -254,28 +266,16 @@
     if (contentDisposition == null)
       return doc;
 
-    String filename = getFileName(contentDisposition);
-
-    if (filename != null) {
-      doc.add(new Field("title", filename, Field.Store.YES, Field.Index.NO));
-    }
-
-    return doc;
-  }
-  
-  String getFileName(String value) {
-
-    String filename = null;
-
-    for (int i = 0; i < patterns.length; i++) {
-      Matcher matcher = patterns[i].matcher(value);
-      if(matcher.find()) {
-        filename = matcher.group(1);
+    MatchResult result;
+    for (int i=0; i<patterns.length; i++) {
+      if (matcher.contains(contentDisposition,patterns[i])) {
+        result = matcher.getMatch();
+        doc.add(new Field("title", result.group(1), Field.Store.YES, Field.Index.NO));
         break;
       }
     }
-    return filename;
 
+    return doc;
   }
 
   public void setConf(Configuration conf) {

Modified: lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java?view=diff&rev=517015&r1=517014&r2=517015
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java Sun Mar 11 14:18:23 2007
@@ -25,8 +25,6 @@
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
@@ -42,6 +40,13 @@
 import org.apache.nutch.protocol.Content;
 import org.apache.nutch.util.NutchConfiguration;
 import org.apache.hadoop.conf.Configuration;
+import org.apache.oro.text.regex.MatchResult;
+import org.apache.oro.text.regex.Pattern;
+import org.apache.oro.text.regex.PatternCompiler;
+import org.apache.oro.text.regex.PatternMatcher;
+import org.apache.oro.text.regex.PatternMatcherInput;
+import org.apache.oro.text.regex.Perl5Compiler;
+import org.apache.oro.text.regex.Perl5Matcher;
 import org.w3c.dom.DocumentFragment;
 import org.w3c.dom.Element;
 import org.w3c.dom.NamedNodeMap;
@@ -49,24 +54,11 @@
 import org.w3c.dom.NodeList;
 
 /**
- * <p>
- * This class is a heuristic link extractor for JavaScript files and code
- * snippets. The general idea of a two-pass regex matching comes from Heritrix.
- * Parts of the code come from OutlinkExtractor.java by Stephan Strittmatter.
- * </p>
- * 
- * <p>
- * This Filter extracts javascript from following locations:
- * </p>
- * <li>from inside &lt;script> tags</li>
- * <li>from html 4.0 events like Window: onload,onunload, Form:
- * onchange,onsubmit,onreset,onselect,onblur,onfocus Keyboard:
- * onkeydown,onkeypress,onkeyup Mouse:
- * onclick,ondbclick,onmousedown,onmouseout,onmousover,onmouseup
- * </li>
- * <li>a href starting with literal "javascript"</li>
- * 
- * 
+ * This class is a heuristic link extractor for JavaScript files and
+ * code snippets. The general idea of a two-pass regex matching comes from
+ * Heritrix. Parts of the code come from OutlinkExtractor.java
+ * by Stephan Strittmatter.
+ *
  * @author Andrzej Bialecki &lt;ab@getopt.org&gt;
  */
 public class JSParseFilter implements HtmlParseFilter, Parser {
@@ -105,7 +97,6 @@
         Node lNode = n.getAttributes().getNamedItem("language");
         if (lNode == null) lang = "javascript";
         else lang = lNode.getNodeValue();
-        //XXX lang is not checked??
         StringBuffer script = new StringBuffer();
         NodeList nn = n.getChildNodes();
         if (nn.getLength() > 0) {
@@ -113,9 +104,9 @@
             if (i > 0) script.append('\n');
             script.append(nn.item(i).getNodeValue());
           }
-          if (LOG.isDebugEnabled()) {
-            LOG.info("script: language=" + lang + ", text: " + script.toString());
-          }
+          // if (LOG.isInfoEnabled()) {
+          //   LOG.info("script: language=" + lang + ", text: " + script.toString());
+          // }
           Outlink[] links = getJSLinks(script.toString(), "", base);
           if (links != null && links.length > 0) outlinks.addAll(Arrays.asList(links));
           // no other children of interest here, go one level up.
@@ -184,7 +175,7 @@
   /**
    *  This method extracts URLs from literals embedded in JavaScript.
    */
-  Outlink[] getJSLinks(String plainText, String anchor, String base) {
+  private Outlink[] getJSLinks(String plainText, String anchor, String base) {
 
     final List outlinks = new ArrayList();
     URL baseURL = null;
@@ -196,27 +187,30 @@
     }
 
     try {
-      final Pattern stringPattern = Pattern.compile(STRING_PATTERN,
-          Pattern.CASE_INSENSITIVE | Pattern.MULTILINE);
-      final Pattern urlPattern = Pattern.compile(URI_PATTERN,
-              Pattern.CASE_INSENSITIVE | Pattern.MULTILINE);
-      
-      final Matcher quoted = stringPattern.matcher(plainText);
+      final PatternCompiler cp = new Perl5Compiler();
+      final Pattern pattern = cp.compile(STRING_PATTERN,
+          Perl5Compiler.CASE_INSENSITIVE_MASK | Perl5Compiler.READ_ONLY_MASK
+              | Perl5Compiler.MULTILINE_MASK);
+      final Pattern pattern1 = cp.compile(URI_PATTERN,
+              Perl5Compiler.CASE_INSENSITIVE_MASK | Perl5Compiler.READ_ONLY_MASK
+                  | Perl5Compiler.MULTILINE_MASK);
+      final PatternMatcher matcher = new Perl5Matcher();
+
+      final PatternMatcher matcher1 = new Perl5Matcher();
+      final PatternMatcherInput input = new PatternMatcherInput(plainText);
 
+      MatchResult result;
       String url;
 
       //loop the matches
-      while (quoted.find()) {
-        String quotedString = quoted.group(2);
-        Matcher urls = urlPattern.matcher(quotedString);
-        
-        if (!urls.find()) {
+      while (matcher.contains(input, pattern)) {
+        result = matcher.getMatch();
+        url = result.group(2);
+        PatternMatcherInput input1 = new PatternMatcherInput(url);
+        if (!matcher1.matches(input1, pattern1)) {
           //if (LOG.isTraceEnabled()) { LOG.trace(" - invalid '" + url + "'"); }
           continue;
         }
-
-        url = urls.group();
-        
         if (url.startsWith("www.")) {
             url = "http://" + url;
         } else {

Modified: lucene/nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java?view=diff&rev=517015&r1=517014&r2=517015
==============================================================================
--- lucene/nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java (original)
+++ lucene/nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java Sun Mar 11 14:18:23 2007
@@ -1,4 +1,4 @@
-/*
+/**
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,62 +14,60 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+
 package org.apache.nutch.net.urlnormalizer.basic;
 
 import java.net.URL;
 import java.net.MalformedURLException;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
 
+// Commons Logging imports
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 
+// Nutch imports
 import org.apache.nutch.net.URLNormalizer;
+import org.apache.nutch.util.LogUtil;
 
 import org.apache.hadoop.conf.Configuration;
+import org.apache.oro.text.regex.*;
 
-/**
- * Converts URLs to a normal form.
- * <p>
- * All substitutions will be done step by step, to ensure that certain
- * constellations will be normalized, too.
- * </p>
- * <p>
- * For example: "/aa/bb/../../cc/../foo.html will be normalized in the following
- * manner: "/aa/bb/../../cc/../foo.html" "/aa/../cc/../foo.html"
- * "/cc/../foo.html" "/foo.html".
- * </p>
- * <p>
- * The normalization also takes care of leading "/../", which will be replaced
- * by "/", because this is a rather a sign of bad webserver configuration than
- * of a wanted link. For example, urls like "http://www.foo.com/../" should
- * return a http 404 error instead of redirecting to "http://www.foo.com".
- * </p>
- */
+/** Converts URLs to a normal form . */
 public class BasicURLNormalizer implements URLNormalizer {
     public static final Log LOG = LogFactory.getLog(BasicURLNormalizer.class);
 
-    /**
-     * This pattern tries to find spots like "/xx/../" in the url, which could
-     * be replaced by "/" xx consists of chars, different then "/" (slash) and
-     * needs to have at least one char different from ".".
-     */
-    private static final Pattern RELATIVE_PATH_PATTERN = Pattern.compile("(/[^/]*[^/.]{1}[^/]*/\\.\\./)");
-
-    private static final String RELATIVE_PATH_SUBSTITUTION="/";
-    
-    /**
-     * This pattern tries to find spots like leading "/../" in the url, which
-     * could be replaced by "/".
-     */
-    private static final Pattern LEADING_RELATIVE_PATH_PATTERN = Pattern.compile("^(/\\.\\./)+");
-
-    private static final String LEADING_RELATIVE_PATH_SUBSTITUTION="/";
+    private Perl5Compiler compiler = new Perl5Compiler();
+    private ThreadLocal matchers = new ThreadLocal() {
+        protected synchronized Object initialValue() {
+          return new Perl5Matcher();
+        }
+      };
+    private Rule relativePathRule = null;
+    private Rule leadingRelativePathRule = null;
 
     private Configuration conf;
 
-
     public BasicURLNormalizer() {
+      try {
+        // this pattern tries to find spots like "/xx/../" in the url, which
+        // could be replaced by "/". xx consists of chars different than "/"
+        // (slash) and needs to have at least one char different from "."
+        relativePathRule = new Rule();
+        relativePathRule.pattern = (Perl5Pattern)
+          compiler.compile("(/[^/]*[^/.]{1}[^/]*/\\.\\./)",
+                           Perl5Compiler.READ_ONLY_MASK);
+        relativePathRule.substitution = new Perl5Substitution("/");
+
+        // this pattern tries to find spots like leading "/../" in the url,
+        // which could be replaced by "/"
+        leadingRelativePathRule = new Rule();
+        leadingRelativePathRule.pattern = (Perl5Pattern)
+          compiler.compile("^(/\\.\\./)+", Perl5Compiler.READ_ONLY_MASK);
+        leadingRelativePathRule.substitution = new Perl5Substitution("/");
+
+      } catch (MalformedPatternException e) {
+        e.printStackTrace(LogUtil.getWarnStream(LOG));
+        throw new RuntimeException(e);
+      }
     }
 
     public String normalize(String urlString, String scope)
@@ -131,25 +129,56 @@
         return urlString;
     }
 
-  private String substituteUnnecessaryRelativePaths(String file) {
-    String fileWorkCopy = file;
-    int oldLen = file.length();
-    int newLen = oldLen - 1;
-    Matcher m;
-    
-    while (oldLen != newLen) {
-      oldLen = fileWorkCopy.length();
-      m = RELATIVE_PATH_PATTERN.matcher(fileWorkCopy);
-      // substitue first occurence of "/xx/../" by "/"
-      fileWorkCopy = m.replaceFirst(RELATIVE_PATH_SUBSTITUTION);
-      m = LEADING_RELATIVE_PATH_PATTERN.matcher(fileWorkCopy);
-      // remove leading "/../"
-      fileWorkCopy = m.replaceFirst(LEADING_RELATIVE_PATH_SUBSTITUTION);
-      newLen = fileWorkCopy.length();
+    private String substituteUnnecessaryRelativePaths(String file) {
+        String fileWorkCopy = file;
+        int oldLen = file.length();
+        int newLen = oldLen - 1;
+
+        // All substitutions will be done step by step, to ensure that certain
+        // constellations will be normalized, too
+        //
+        // For example: "/aa/bb/../../cc/../foo.html will be normalized in the
+        // following manner:
+        //   "/aa/bb/../../cc/../foo.html"
+        //   "/aa/../cc/../foo.html"
+        //   "/cc/../foo.html"
+        //   "/foo.html"
+        //
+        // The normalization also takes care of leading "/../", which will be
+        // replaced by "/", because this is rather a sign of bad webserver
+        // configuration than of a wanted link.  For example, urls like
+        // "http://www.foo.com/../" should return a http 404 error instead of
+        // redirecting to "http://www.foo.com".
+        //
+        Perl5Matcher matcher = (Perl5Matcher)matchers.get();
+
+        while (oldLen != newLen) {
+            // substitute first occurrence of "/xx/../" by "/"
+            oldLen = fileWorkCopy.length();
+            fileWorkCopy = Util.substitute
+              (matcher, relativePathRule.pattern,
+               relativePathRule.substitution, fileWorkCopy, 1);
+
+            // remove leading "/../"
+            fileWorkCopy = Util.substitute
+              (matcher, leadingRelativePathRule.pattern,
+               leadingRelativePathRule.substitution, fileWorkCopy, 1);
+            newLen = fileWorkCopy.length();
+        }
+
+        return fileWorkCopy;
+    }
+
+
+    /**
+     * Class which holds a compiled pattern and its corresponding substitution
+     * string.
+     */
+    private static class Rule {
+        public Perl5Pattern pattern;
+        public Perl5Substitution substitution;
     }
 
-    return fileWorkCopy;
-  }
 
   public void setConf(Configuration conf) {
     this.conf = conf;
@@ -160,3 +189,4 @@
   }
 
 }
+

Modified: lucene/nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java?view=diff&rev=517015&r1=517014&r2=517015
==============================================================================
--- lucene/nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java (original)
+++ lucene/nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java Sun Mar 11 14:18:23 2007
@@ -1,4 +1,4 @@
-/*
+/**
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,6 +14,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+
 package org.apache.nutch.net.urlnormalizer.regex;
 
 import java.net.URL;
@@ -27,7 +28,6 @@
 import java.util.List;
 import java.util.ArrayList;
 import java.util.Iterator;
-import java.util.regex.Pattern;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
@@ -40,6 +40,7 @@
 
 import javax.xml.parsers.*;
 import org.w3c.dom.*;
+import org.apache.oro.text.regex.*;
 
 /**
  * Allows users to do regex substitutions on all/any URLs that are encountered,
@@ -64,14 +65,16 @@
    * string.
    */
   private static class Rule {
-    public Pattern pattern;
+    public Perl5Pattern pattern;
 
     public String substitution;
   }
 
-  private HashMap<String, List<Rule>> scopedRules;
+  private HashMap scopedRules;
   
-  private static final List<Rule> EMPTY_RULES = Collections.EMPTY_LIST;
+  private static final List EMPTY_RULES = Collections.EMPTY_LIST;
+
+  private PatternMatcher matcher = new Perl5Matcher();
 
   /**
    * The default constructor which is called from UrlNormalizerFactory
@@ -90,9 +93,9 @@
    * configuration files for it.
    */
   public RegexURLNormalizer(Configuration conf, String filename)
-          throws IOException {
+          throws IOException, MalformedPatternException {
     super(conf);
-    List<Rule> rules = readConfigurationFile(filename);
+    List rules = readConfigurationFile(filename);
     if (rules != null)
       scopedRules.put(URLNormalizers.SCOPE_DEFAULT, rules);
   }
@@ -103,9 +106,9 @@
     // the default constructor was called
     if (this.scopedRules == null) {
       String filename = getConf().get("urlnormalizer.regex.file");
-      scopedRules = new HashMap<String, List<Rule>>();
+      scopedRules = new HashMap();
       URL url = getConf().getResource(filename);
-      List<Rule> rules = null;
+      List rules = null;
       if (url == null) {
         LOG.warn("Can't load the default config file! " + filename);
         rules = EMPTY_RULES;
@@ -123,7 +126,7 @@
 
   // used in JUnit test.
   void setConfiguration(InputStream is, String scope) {
-    List<Rule> rules = readConfiguration(is);
+    List rules = readConfiguration(is);
     scopedRules.put(scope, rules);
     LOG.debug("Set config for scope '" + scope + "': " + rules.size() + " rules.");
   }
@@ -133,7 +136,7 @@
    * patterns. It accepts a string url as input and returns the altered string.
    */
   public synchronized String regexNormalize(String urlString, String scope) {
-    List<Rule> curRules = scopedRules.get(scope);
+    List curRules = (List)scopedRules.get(scope);
     if (curRules == null) {
       // try to populate
       String configFile = getConf().get("urlnormalizer.regex.file." + scope);
@@ -144,6 +147,7 @@
           LOG.warn("Can't load resource for config file: " + configFile);
         } else {
           try {
+            InputStream is = resource.openStream();
             curRules = readConfiguration(resource.openStream());
             scopedRules.put(scope, curRules);
           } catch (Exception e) {
@@ -158,11 +162,14 @@
     }
     if (curRules == EMPTY_RULES || curRules == null) {
       // use global rules
-      curRules = scopedRules.get(URLNormalizers.SCOPE_DEFAULT);
+      curRules = (List)scopedRules.get(URLNormalizers.SCOPE_DEFAULT);
     }
-    
-    for (Rule rule: curRules) {
-      urlString = rule.pattern.matcher(urlString).replaceAll(rule.substitution);
+    Iterator i = curRules.iterator();
+    while (i.hasNext()) {
+      Rule r = (Rule) i.next();
+      urlString = Util.substitute(matcher, r.pattern, new Perl5Substitution(
+              r.substitution), urlString, Util.SUBSTITUTE_ALL); // actual
+                                                                // substitution
     }
     return urlString;
   }
@@ -173,7 +180,7 @@
   }
 
   /** Reads the configuration file and populates a List of Rules. */
-  private List<Rule> readConfigurationFile(String filename) {
+  private List readConfigurationFile(String filename) {
     if (LOG.isInfoEnabled()) {
       LOG.info("loading " + filename);
     }
@@ -186,8 +193,9 @@
     }
   }
   
-  private List<Rule> readConfiguration(InputStream is) {
-    List<Rule> rules = new ArrayList<Rule>();
+  private List readConfiguration(InputStream is) {
+    Perl5Compiler compiler = new Perl5Compiler();
+    List rules = new ArrayList();
     try {
 
       // borrowed heavily from code in Configuration.java
@@ -225,7 +233,7 @@
         }
         if (patternValue != null && subValue != null) {
           Rule rule = new Rule();
-          rule.pattern = Pattern.compile(patternValue);
+          rule.pattern = (Perl5Pattern) compiler.compile(patternValue);
           rule.substitution = subValue;
           rules.add(rule);
         }
@@ -241,14 +249,15 @@
   }
 
   /** Spits out patterns and substitutions that are in the configuration file. */
-  public static void main(String args[]) throws IOException {
+  public static void main(String args[]) throws MalformedPatternException,
+          IOException {
     RegexURLNormalizer normalizer = new RegexURLNormalizer();
     normalizer.setConf(NutchConfiguration.create());
     Iterator i = ((List)normalizer.scopedRules.get(URLNormalizers.SCOPE_DEFAULT)).iterator();
     System.out.println("* Rules for 'DEFAULT' scope:");
     while (i.hasNext()) {
       Rule r = (Rule) i.next();
-      System.out.print("  " + r.pattern.pattern() + " -> ");
+      System.out.print("  " + r.pattern.getPattern() + " -> ");
       System.out.println(r.substitution);
     }
     // load the scope
@@ -264,7 +273,7 @@
         i = ((List)normalizer.scopedRules.get(scope)).iterator();
         while (i.hasNext()) {
           Rule r = (Rule) i.next();
-          System.out.print("  " + r.pattern.pattern() + " -> ");
+          System.out.print("  " + r.pattern.getPattern() + " -> ");
           System.out.println(r.substitution);
         }
       }