You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by sn...@apache.org on 2015/04/22 23:05:13 UTC
svn commit: r1675499 - in /nutch/branches/2.x: ./
src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/
src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/
Author: snagel
Date: Wed Apr 22 21:05:13 2015
New Revision: 1675499
URL: http://svn.apache.org/r1675499
Log:
NUTCH-1990 Use URI.normalise() in BasicURLNormalizer
Modified:
nutch/branches/2.x/CHANGES.txt
nutch/branches/2.x/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
nutch/branches/2.x/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
Modified: nutch/branches/2.x/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1675499&r1=1675498&r2=1675499&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Wed Apr 22 21:05:13 2015
@@ -2,6 +2,8 @@ Nutch Change Log
Current Development 2.4-SNAPSHOT
+* NUTCH-1990 Use URI.normalise() in BasicURLNormalizer (snagel, jnioche)
+
* NUTCH-1981 Upgrade to icu4j 55.1 (Marko Asplund via snagel)
* NUTCH-1944 Index HTML raw content (meabed via mattmann)
Modified: nutch/branches/2.x/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java?rev=1675499&r1=1675498&r2=1675499&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java (original)
+++ nutch/branches/2.x/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java Wed Apr 22 21:05:13 2015
@@ -17,8 +17,19 @@
package org.apache.nutch.net.urlnormalizer.basic;
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.net.URISyntaxException;
import java.net.URL;
import java.net.MalformedURLException;
+import java.util.regex.Pattern;
+
+
+
+
+
+
// Commons Logging imports
import org.slf4j.Logger;
@@ -26,10 +37,10 @@ import org.slf4j.LoggerFactory;
// Nutch imports
import org.apache.nutch.net.URLNormalizer;
-
+import org.apache.nutch.net.URLNormalizers;
+import org.apache.nutch.util.NutchConfiguration;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
-import org.apache.oro.text.regex.*;
/**
* Converts URLs to a normal form:
@@ -42,54 +53,12 @@ public class BasicURLNormalizer extends
public static final Logger LOG = LoggerFactory
.getLogger(BasicURLNormalizer.class);
- private Perl5Compiler compiler = new Perl5Compiler();
- private ThreadLocal<Perl5Matcher> matchers = new ThreadLocal<Perl5Matcher>() {
- protected Perl5Matcher initialValue() {
- return new Perl5Matcher();
- }
- };
- private final Rule relativePathRule;
- private final Rule leadingRelativePathRule;
- private final Rule currentPathRule;
- private final Rule adjacentSlashRule;
-
- private Configuration conf;
-
- public BasicURLNormalizer() {
- try {
- // this pattern tries to find spots like "/xx/../" in the url, which
- // could be replaced by "/" xx consists of chars, different then "/"
- // (slash) and needs to have at least one char different from "."
- relativePathRule = new Rule();
- relativePathRule.pattern = (Perl5Pattern) compiler.compile(
- "(/[^/]*[^/.]{1}[^/]*/\\.\\./)", Perl5Compiler.READ_ONLY_MASK);
- relativePathRule.substitution = new Perl5Substitution("/");
-
- // this pattern tries to find spots like leading "/../" in the url,
- // which could be replaced by "/"
- leadingRelativePathRule = new Rule();
- leadingRelativePathRule.pattern = (Perl5Pattern) compiler.compile(
- "^(/\\.\\./)+", Perl5Compiler.READ_ONLY_MASK);
- leadingRelativePathRule.substitution = new Perl5Substitution("/");
-
- // this pattern tries to find spots like "/./" in the url,
- // which could be replaced by "/"
- currentPathRule = new Rule();
- currentPathRule.pattern = (Perl5Pattern) compiler.compile("(/\\./)",
- Perl5Compiler.READ_ONLY_MASK);
- currentPathRule.substitution = new Perl5Substitution("/");
-
- // this pattern tries to find spots like "xx//yy" in the url,
- // which could be replaced by a "/"
- adjacentSlashRule = new Rule();
- adjacentSlashRule.pattern = (Perl5Pattern) compiler.compile("/{2,}",
- Perl5Compiler.READ_ONLY_MASK);
- adjacentSlashRule.substitution = new Perl5Substitution("/");
-
- } catch (MalformedPatternException e) {
- throw new RuntimeException(e);
- }
- }
+ /**
+ * Pattern to detect whether a URL path could be normalized. Contains one of
+ * /. or ./ /.. or ../ //
+ */
+ private final static Pattern hasNormalizablePathPattern = Pattern
+ .compile("/[./]|[.]/");
public String normalize(String urlString, String scope)
throws MalformedURLException {
@@ -135,9 +104,8 @@ public class BasicURLNormalizer extends
changed = true;
}
- // check for unnecessary use of "/../"
- String file2 = substituteUnnecessaryRelativePaths(file);
-
+ // check for unnecessary use of "/../", "/./", and "//"
+ String file2 = getFileWithNormalizedPath(url);
if (!file.equals(file2)) {
changed = true;
file = file2;
@@ -151,59 +119,58 @@ public class BasicURLNormalizer extends
return urlString;
}
- private String substituteUnnecessaryRelativePaths(String file) {
- String fileWorkCopy = file;
- int oldLen = file.length();
- int newLen = oldLen - 1;
-
- // All substitutions will be done step by step, to ensure that certain
- // constellations will be normalized, too
- //
- // For example: "/aa/bb/../../cc/../foo.html will be normalized in the
- // following manner:
- // "/aa/bb/../../cc/../foo.html"
- // "/aa/../cc/../foo.html"
- // "/cc/../foo.html"
- // "/foo.html"
- //
- // The normalization also takes care of leading "/../", which will be
- // replaced by "/", because this is a rather a sign of bad webserver
- // configuration than of a wanted link. For example, urls like
- // "http://www.foo.com/../" should return a http 404 error instead of
- // redirecting to "http://www.foo.com".
- //
- Perl5Matcher matcher = matchers.get();
-
- while (oldLen != newLen) {
- // substitue first occurence of "/xx/../" by "/"
- oldLen = fileWorkCopy.length();
- fileWorkCopy = Util.substitute(matcher, relativePathRule.pattern,
- relativePathRule.substitution, fileWorkCopy, 1);
-
- // remove leading "/../"
- fileWorkCopy = Util.substitute(matcher, leadingRelativePathRule.pattern,
- leadingRelativePathRule.substitution, fileWorkCopy, 1);
-
- // remove unnecessary "/./"
- fileWorkCopy = Util.substitute(matcher, currentPathRule.pattern,
- currentPathRule.substitution, fileWorkCopy, 1);
-
- // collapse adjacent slashes with "/"
- fileWorkCopy = Util.substitute(matcher, adjacentSlashRule.pattern,
- adjacentSlashRule.substitution, fileWorkCopy, 1);
+ private String getFileWithNormalizedPath(URL url)
+ throws MalformedURLException {
+ String file;
+
+ if (hasNormalizablePathPattern.matcher(url.getPath()).find()) {
+ // only normalize the path if there is something to normalize
+ // to avoid needless work
+ try {
+ file = url.toURI().normalize().toURL().getFile();
+ // URI.normalize() does not normalize leading dot segments,
+ // see also http://tools.ietf.org/html/rfc3986#section-5.2.4
+ int start = 0;
+ while (file.startsWith("/../", start)) {
+ start += 3;
+ }
+ if (start > 0) {
+ file = file.substring(start);
+ }
+ } catch (URISyntaxException e) {
+ file = url.getFile();
+ }
+ } else {
+ file = url.getFile();
+ }
- newLen = fileWorkCopy.length();
+ // if path is empty return a single slash
+ if (file.isEmpty()) {
+ file = "/";
}
- return fileWorkCopy;
+ return file;
}
- /**
- * Class which holds a compiled pattern and its corresponding substition
- * string.
- */
- private static class Rule {
- public Perl5Pattern pattern;
- public Perl5Substitution substitution;
+ public static void main(String args[]) throws IOException {
+ BasicURLNormalizer normalizer = new BasicURLNormalizer();
+ normalizer.setConf(NutchConfiguration.create());
+ String scope = URLNormalizers.SCOPE_DEFAULT;
+ if (args.length >= 1) {
+ scope = args[0];
+ System.out.println("Scope: " + scope);
+ }
+ String line, normUrl;
+ BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
+ while ((line = in.readLine()) != null) {
+ try {
+ normUrl = normalizer.normalize(line, scope);
+ System.out.println(normUrl);
+ } catch (MalformedURLException e) {
+ System.out.println("failed: " + line);
+ }
+ }
+ System.exit(0);
}
-}
+
+}
\ No newline at end of file
Modified: nutch/branches/2.x/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java?rev=1675499&r1=1675498&r2=1675499&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java (original)
+++ nutch/branches/2.x/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java Wed Apr 22 21:05:13 2015
@@ -67,7 +67,7 @@ public class TestBasicURLNormalizer {
normalizeTest("http://foo.com/aa/./foo.html", "http://foo.com/aa/foo.html");
normalizeTest("http://foo.com/aa/../", "http://foo.com/");
normalizeTest("http://foo.com/aa/bb/../", "http://foo.com/aa/");
- normalizeTest("http://foo.com/aa/..", "http://foo.com/aa/..");
+ normalizeTest("http://foo.com/aa/..", "http://foo.com/");
normalizeTest("http://foo.com/aa/bb/cc/../../foo.html",
"http://foo.com/aa/foo.html");
normalizeTest("http://foo.com/aa/bb/../cc/dd/../ee/foo.html",
@@ -95,10 +95,12 @@ public class TestBasicURLNormalizer {
"http://foo.com/aa/bb/foo.html");
normalizeTest("http://foo.com////aa////bb////foo.html",
"http://foo.com/aa/bb/foo.html");
+ normalizeTest("http://foo.com/aa?referer=http://bar.com",
+ "http://foo.com/aa?referer=http://bar.com");
}
private void normalizeTest(String weird, String normal) throws Exception {
- assertEquals(normal,
+ assertEquals("normalizing: " + weird, normal,
normalizer.normalize(weird, URLNormalizers.SCOPE_DEFAULT));
}