You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by jn...@apache.org on 2015/04/22 11:55:24 UTC

svn commit: r1675305 - in /nutch/trunk: ./ src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/ src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/

Author: jnioche
Date: Wed Apr 22 09:55:23 2015
New Revision: 1675305

URL: http://svn.apache.org/r1675305
Log:
NUTCH-1990 Use URI.normalize() in BasicURLNormalizer (snagel, jnioche)

Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
    nutch/trunk/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1675305&r1=1675304&r2=1675305&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Wed Apr 22 09:55:23 2015
@@ -2,6 +2,8 @@ Nutch Change Log
  
 Nutch Current Development 1.10-SNAPSHOT
 
+* NUTCH-1990 Use URI.normalize() in BasicURLNormalizer (snagel, jnioche)
+
 * NUTCH-1973 Job Administration end point for the REST service (Sujen Shah via mattmann)
 
 * NUTCH-1697 SegmentMerger to implement Tool (markus, snagel)

Modified: nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java?rev=1675305&r1=1675304&r2=1675305&view=diff
==============================================================================
--- nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java (original)
+++ nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java Wed Apr 22 09:55:23 2015
@@ -17,20 +17,21 @@
 
 package org.apache.nutch.net.urlnormalizer.basic;
 
-import java.net.URL;
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
 import java.net.MalformedURLException;
+import java.net.URISyntaxException;
+import java.net.URL;
+import java.util.regex.Pattern;
 
-// Slf4j Logging imports
+import org.apache.hadoop.conf.Configured;
+import org.apache.nutch.net.URLNormalizer;
+import org.apache.nutch.net.URLNormalizers;
+import org.apache.nutch.util.NutchConfiguration;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-// Nutch imports
-import org.apache.nutch.net.URLNormalizer;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.conf.Configured;
-import org.apache.oro.text.regex.*;
-
 /**
  * Converts URLs to a normal form:
  * <ul>
@@ -42,57 +43,12 @@ public class BasicURLNormalizer extends
   public static final Logger LOG = LoggerFactory
       .getLogger(BasicURLNormalizer.class);
 
-  private Perl5Compiler compiler = new Perl5Compiler();
-  private ThreadLocal<Perl5Matcher> matchers = new ThreadLocal<Perl5Matcher>() {
-    protected Perl5Matcher initialValue() {
-      return new Perl5Matcher();
-    }
-  };
-  private final Rule relativePathRule;
-  private final Rule leadingRelativePathRule;
-  private final Rule currentPathRule;
-  private final Rule adjacentSlashRule;
-
-  private final static java.util.regex.Pattern hasNormalizablePattern = java.util.regex.Pattern
-      .compile("/\\.?\\.?/");
-
-  private Configuration conf;
-
-  public BasicURLNormalizer() {
-    try {
-      // this pattern tries to find spots like "/xx/../" in the url, which
-      // could be replaced by "/" xx consists of chars, different then "/"
-      // (slash) and needs to have at least one char different from "."
-      relativePathRule = new Rule();
-      relativePathRule.pattern = (Perl5Pattern) compiler.compile(
-          "(/[^/]*[^/.]{1}[^/]*/\\.\\./)", Perl5Compiler.READ_ONLY_MASK);
-      relativePathRule.substitution = new Perl5Substitution("/");
-
-      // this pattern tries to find spots like leading "/../" in the url,
-      // which could be replaced by "/"
-      leadingRelativePathRule = new Rule();
-      leadingRelativePathRule.pattern = (Perl5Pattern) compiler.compile(
-          "^(/\\.\\./)+", Perl5Compiler.READ_ONLY_MASK);
-      leadingRelativePathRule.substitution = new Perl5Substitution("/");
-
-      // this pattern tries to find spots like "/./" in the url,
-      // which could be replaced by "/"
-      currentPathRule = new Rule();
-      currentPathRule.pattern = (Perl5Pattern) compiler.compile("(/\\./)",
-          Perl5Compiler.READ_ONLY_MASK);
-      currentPathRule.substitution = new Perl5Substitution("/");
-
-      // this pattern tries to find spots like "xx//yy" in the url,
-      // which could be replaced by a "/"
-      adjacentSlashRule = new Rule();
-      adjacentSlashRule.pattern = (Perl5Pattern) compiler.compile("/{2,}",
-          Perl5Compiler.READ_ONLY_MASK);
-      adjacentSlashRule.substitution = new Perl5Substitution("/");
-
-    } catch (MalformedPatternException e) {
-      throw new RuntimeException(e);
-    }
-  }
+  /**
+   * Pattern to detect whether a URL path may need normalization, i.e. whether
+   * it contains one of "/.", "./", "/..", "../" or "//".
+   */
+  private final static Pattern hasNormalizablePathPattern = Pattern
+      .compile("/[./]|[.]/");
 
   public String normalize(String urlString, String scope)
       throws MalformedURLException {
@@ -138,9 +94,8 @@ public class BasicURLNormalizer extends
         changed = true;
       }
 
-      // check for unnecessary use of "/../"
-      String file2 = substituteUnnecessaryRelativePaths(file);
-
+      // check for unnecessary use of "/../", "/./", and "//"
+      String file2 = getFileWithNormalizedPath(url);
       if (!file.equals(file2)) {
         changed = true;
         file = file2;
@@ -154,72 +109,58 @@ public class BasicURLNormalizer extends
     return urlString;
   }
 
-  private String substituteUnnecessaryRelativePaths(String file) {
-
-    if (!hasNormalizablePattern.matcher(file).find())
-      return file;
-
-    String fileWorkCopy = file;
-    int oldLen = file.length();
-    int newLen = oldLen - 1;
-
-    // All substitutions will be done step by step, to ensure that certain
-    // constellations will be normalized, too
-    //
-    // For example: "/aa/bb/../../cc/../foo.html will be normalized in the
-    // following manner:
-    // "/aa/bb/../../cc/../foo.html"
-    // "/aa/../cc/../foo.html"
-    // "/cc/../foo.html"
-    // "/foo.html"
-    //
-    // The normalization also takes care of leading "/../", which will be
-    // replaced by "/", because this is a rather a sign of bad webserver
-    // configuration than of a wanted link. For example, urls like
-    // "http://www.foo.com/../" should return a http 404 error instead of
-    // redirecting to "http://www.foo.com".
-    //
-    Perl5Matcher matcher = (Perl5Matcher) matchers.get();
-
-    while (oldLen != newLen) {
-      // substitue first occurence of "/xx/../" by "/"
-      oldLen = fileWorkCopy.length();
-      fileWorkCopy = Util.substitute(matcher, relativePathRule.pattern,
-          relativePathRule.substitution, fileWorkCopy, 1);
-
-      // remove leading "/../"
-      fileWorkCopy = Util.substitute(matcher, leadingRelativePathRule.pattern,
-          leadingRelativePathRule.substitution, fileWorkCopy, 1);
-
-      // remove unnecessary "/./"
-      fileWorkCopy = Util.substitute(matcher, currentPathRule.pattern,
-          currentPathRule.substitution, fileWorkCopy, 1);
-
-      // collapse adjacent slashes with "/"
-      fileWorkCopy = Util.substitute(matcher, adjacentSlashRule.pattern,
-          adjacentSlashRule.substitution, fileWorkCopy, 1);
+  private String getFileWithNormalizedPath(URL url)
+      throws MalformedURLException {
+    String file;
 
-      newLen = fileWorkCopy.length();
+    if (hasNormalizablePathPattern.matcher(url.getPath()).find()) {
+      // only normalize the path if there is something to normalize
+      // to avoid needless work
+      try {
+        file = url.toURI().normalize().toURL().getFile();
+        // URI.normalize() keeps leading ".." segments (e.g. "/../a"), so
+        // strip them here; see http://tools.ietf.org/html/rfc3986#section-5.2.4
+        int start = 0;
+        while (file.startsWith("/../", start)) {
+          start += 3;
+        }
+        if (start > 0) {
+          file = file.substring(start);
+        }
+      } catch (URISyntaxException e) {
+        file = url.getFile();
+      }
+    } else {
+      file = url.getFile();
     }
 
-    return fileWorkCopy;
-  }
-
-  /**
-   * Class which holds a compiled pattern and its corresponding substition
-   * string.
-   */
-  private static class Rule {
-    public Perl5Pattern pattern;
-    public Perl5Substitution substitution;
-  }
+    // if path is empty return a single slash
+    if (file.isEmpty()) {
+      file = "/";
+    }
 
-  public void setConf(Configuration conf) {
-    this.conf = conf;
+    return file;
   }
 
-  public Configuration getConf() {
-    return this.conf;
+  public static void main(String args[]) throws IOException {
+    BasicURLNormalizer normalizer = new BasicURLNormalizer();
+    normalizer.setConf(NutchConfiguration.create());
+    String scope = URLNormalizers.SCOPE_DEFAULT;
+    if (args.length >= 1) {
+      scope = args[0];
+      System.out.println("Scope: " + scope);
+    }
+    String line, normUrl;
+    BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
+    while ((line = in.readLine()) != null) {
+      try {
+        normUrl = normalizer.normalize(line, scope);
+        System.out.println(normUrl);
+      } catch (MalformedURLException e) {
+        System.out.println("failed: " + line);
+      }
+    }
+    System.exit(0);
   }
 
 }

Modified: nutch/trunk/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java?rev=1675305&r1=1675304&r2=1675305&view=diff
==============================================================================
--- nutch/trunk/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java (original)
+++ nutch/trunk/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java Wed Apr 22 09:55:23 2015
@@ -65,7 +65,7 @@ public class TestBasicURLNormalizer {
     normalizeTest("http://foo.com/aa/./foo.html", "http://foo.com/aa/foo.html");
     normalizeTest("http://foo.com/aa/../", "http://foo.com/");
     normalizeTest("http://foo.com/aa/bb/../", "http://foo.com/aa/");
-    normalizeTest("http://foo.com/aa/..", "http://foo.com/aa/..");
+    normalizeTest("http://foo.com/aa/..", "http://foo.com/");
     normalizeTest("http://foo.com/aa/bb/cc/../../foo.html",
         "http://foo.com/aa/foo.html");
     normalizeTest("http://foo.com/aa/bb/../cc/dd/../ee/foo.html",
@@ -93,10 +93,12 @@ public class TestBasicURLNormalizer {
         "http://foo.com/aa/bb/foo.html");
     normalizeTest("http://foo.com////aa////bb////foo.html",
         "http://foo.com/aa/bb/foo.html");
+    normalizeTest("http://foo.com/aa?referer=http://bar.com",
+        "http://foo.com/aa?referer=http://bar.com");
   }
 
   private void normalizeTest(String weird, String normal) throws Exception {
-    Assert.assertEquals(normal,
+    Assert.assertEquals("normalizing: " + weird, normal,
         normalizer.normalize(weird, URLNormalizers.SCOPE_DEFAULT));
   }