You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by jn...@apache.org on 2015/04/22 11:55:24 UTC
svn commit: r1675305 - in /nutch/trunk: ./
src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/
src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/
Author: jnioche
Date: Wed Apr 22 09:55:23 2015
New Revision: 1675305
URL: http://svn.apache.org/r1675305
Log:
NUTCH-1990 Use URI.normalize() in BasicURLNormalizer (snagel, jnioche)
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
nutch/trunk/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1675305&r1=1675304&r2=1675305&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Wed Apr 22 09:55:23 2015
@@ -2,6 +2,8 @@ Nutch Change Log
Nutch Current Development 1.10-SNAPSHOT
+* NUTCH-1990 Use URI.normalize() in BasicURLNormalizer (snagel, jnioche)
+
* NUTCH-1973 Job Administration end point for the REST service (Sujen Shah via mattmann)
* NUTCH-1697 SegmentMerger to implement Tool (markus, snagel)
Modified: nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java?rev=1675305&r1=1675304&r2=1675305&view=diff
==============================================================================
--- nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java (original)
+++ nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java Wed Apr 22 09:55:23 2015
@@ -17,20 +17,21 @@
package org.apache.nutch.net.urlnormalizer.basic;
-import java.net.URL;
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
import java.net.MalformedURLException;
+import java.net.URISyntaxException;
+import java.net.URL;
+import java.util.regex.Pattern;
-// Slf4j Logging imports
+import org.apache.hadoop.conf.Configured;
+import org.apache.nutch.net.URLNormalizer;
+import org.apache.nutch.net.URLNormalizers;
+import org.apache.nutch.util.NutchConfiguration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-// Nutch imports
-import org.apache.nutch.net.URLNormalizer;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.conf.Configured;
-import org.apache.oro.text.regex.*;
-
/**
* Converts URLs to a normal form:
* <ul>
@@ -42,57 +43,12 @@ public class BasicURLNormalizer extends
public static final Logger LOG = LoggerFactory
.getLogger(BasicURLNormalizer.class);
- private Perl5Compiler compiler = new Perl5Compiler();
- private ThreadLocal<Perl5Matcher> matchers = new ThreadLocal<Perl5Matcher>() {
- protected Perl5Matcher initialValue() {
- return new Perl5Matcher();
- }
- };
- private final Rule relativePathRule;
- private final Rule leadingRelativePathRule;
- private final Rule currentPathRule;
- private final Rule adjacentSlashRule;
-
- private final static java.util.regex.Pattern hasNormalizablePattern = java.util.regex.Pattern
- .compile("/\\.?\\.?/");
-
- private Configuration conf;
-
- public BasicURLNormalizer() {
- try {
- // this pattern tries to find spots like "/xx/../" in the url, which
- // could be replaced by "/" xx consists of chars, different then "/"
- // (slash) and needs to have at least one char different from "."
- relativePathRule = new Rule();
- relativePathRule.pattern = (Perl5Pattern) compiler.compile(
- "(/[^/]*[^/.]{1}[^/]*/\\.\\./)", Perl5Compiler.READ_ONLY_MASK);
- relativePathRule.substitution = new Perl5Substitution("/");
-
- // this pattern tries to find spots like leading "/../" in the url,
- // which could be replaced by "/"
- leadingRelativePathRule = new Rule();
- leadingRelativePathRule.pattern = (Perl5Pattern) compiler.compile(
- "^(/\\.\\./)+", Perl5Compiler.READ_ONLY_MASK);
- leadingRelativePathRule.substitution = new Perl5Substitution("/");
-
- // this pattern tries to find spots like "/./" in the url,
- // which could be replaced by "/"
- currentPathRule = new Rule();
- currentPathRule.pattern = (Perl5Pattern) compiler.compile("(/\\./)",
- Perl5Compiler.READ_ONLY_MASK);
- currentPathRule.substitution = new Perl5Substitution("/");
-
- // this pattern tries to find spots like "xx//yy" in the url,
- // which could be replaced by a "/"
- adjacentSlashRule = new Rule();
- adjacentSlashRule.pattern = (Perl5Pattern) compiler.compile("/{2,}",
- Perl5Compiler.READ_ONLY_MASK);
- adjacentSlashRule.substitution = new Perl5Substitution("/");
-
- } catch (MalformedPatternException e) {
- throw new RuntimeException(e);
- }
- }
+ /**
+ * Pattern to detect whether a URL path could be normalized. Matches if the
+ * path contains one of the sequences "/.", "./", "/..", "../" or "//".
+ */
+ private final static Pattern hasNormalizablePathPattern = Pattern
+ .compile("/[./]|[.]/");
public String normalize(String urlString, String scope)
throws MalformedURLException {
@@ -138,9 +94,8 @@ public class BasicURLNormalizer extends
changed = true;
}
- // check for unnecessary use of "/../"
- String file2 = substituteUnnecessaryRelativePaths(file);
-
+ // check for unnecessary use of "/../", "/./", and "//"
+ String file2 = getFileWithNormalizedPath(url);
if (!file.equals(file2)) {
changed = true;
file = file2;
@@ -154,72 +109,58 @@ public class BasicURLNormalizer extends
return urlString;
}
- private String substituteUnnecessaryRelativePaths(String file) {
-
- if (!hasNormalizablePattern.matcher(file).find())
- return file;
-
- String fileWorkCopy = file;
- int oldLen = file.length();
- int newLen = oldLen - 1;
-
- // All substitutions will be done step by step, to ensure that certain
- // constellations will be normalized, too
- //
- // For example: "/aa/bb/../../cc/../foo.html will be normalized in the
- // following manner:
- // "/aa/bb/../../cc/../foo.html"
- // "/aa/../cc/../foo.html"
- // "/cc/../foo.html"
- // "/foo.html"
- //
- // The normalization also takes care of leading "/../", which will be
- // replaced by "/", because this is a rather a sign of bad webserver
- // configuration than of a wanted link. For example, urls like
- // "http://www.foo.com/../" should return a http 404 error instead of
- // redirecting to "http://www.foo.com".
- //
- Perl5Matcher matcher = (Perl5Matcher) matchers.get();
-
- while (oldLen != newLen) {
- // substitue first occurence of "/xx/../" by "/"
- oldLen = fileWorkCopy.length();
- fileWorkCopy = Util.substitute(matcher, relativePathRule.pattern,
- relativePathRule.substitution, fileWorkCopy, 1);
-
- // remove leading "/../"
- fileWorkCopy = Util.substitute(matcher, leadingRelativePathRule.pattern,
- leadingRelativePathRule.substitution, fileWorkCopy, 1);
-
- // remove unnecessary "/./"
- fileWorkCopy = Util.substitute(matcher, currentPathRule.pattern,
- currentPathRule.substitution, fileWorkCopy, 1);
-
- // collapse adjacent slashes with "/"
- fileWorkCopy = Util.substitute(matcher, adjacentSlashRule.pattern,
- adjacentSlashRule.substitution, fileWorkCopy, 1);
+ private String getFileWithNormalizedPath(URL url)
+ throws MalformedURLException {
+ String file;
- newLen = fileWorkCopy.length();
+ if (hasNormalizablePathPattern.matcher(url.getPath()).find()) {
+ // only normalize the path if there is something to normalize
+ // to avoid needless work
+ try {
+ file = url.toURI().normalize().toURL().getFile();
+ // URI.normalize() does not normalize leading dot segments,
+ // see also http://tools.ietf.org/html/rfc3986#section-5.2.4
+ int start = 0;
+ while (file.startsWith("/../", start)) {
+ start += 3;
+ }
+ if (start > 0) {
+ file = file.substring(start);
+ }
+ } catch (URISyntaxException e) {
+ file = url.getFile();
+ }
+ } else {
+ file = url.getFile();
}
- return fileWorkCopy;
- }
-
- /**
- * Class which holds a compiled pattern and its corresponding substition
- * string.
- */
- private static class Rule {
- public Perl5Pattern pattern;
- public Perl5Substitution substitution;
- }
+ // if path is empty return a single slash
+ if (file.isEmpty()) {
+ file = "/";
+ }
- public void setConf(Configuration conf) {
- this.conf = conf;
+ return file;
}
- public Configuration getConf() {
- return this.conf;
+ public static void main(String args[]) throws IOException {
+ BasicURLNormalizer normalizer = new BasicURLNormalizer();
+ normalizer.setConf(NutchConfiguration.create());
+ String scope = URLNormalizers.SCOPE_DEFAULT;
+ if (args.length >= 1) {
+ scope = args[0];
+ System.out.println("Scope: " + scope);
+ }
+ String line, normUrl;
+ BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
+ while ((line = in.readLine()) != null) {
+ try {
+ normUrl = normalizer.normalize(line, scope);
+ System.out.println(normUrl);
+ } catch (MalformedURLException e) {
+ System.out.println("failed: " + line);
+ }
+ }
+ System.exit(0);
}
}
Modified: nutch/trunk/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java?rev=1675305&r1=1675304&r2=1675305&view=diff
==============================================================================
--- nutch/trunk/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java (original)
+++ nutch/trunk/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java Wed Apr 22 09:55:23 2015
@@ -65,7 +65,7 @@ public class TestBasicURLNormalizer {
normalizeTest("http://foo.com/aa/./foo.html", "http://foo.com/aa/foo.html");
normalizeTest("http://foo.com/aa/../", "http://foo.com/");
normalizeTest("http://foo.com/aa/bb/../", "http://foo.com/aa/");
- normalizeTest("http://foo.com/aa/..", "http://foo.com/aa/..");
+ normalizeTest("http://foo.com/aa/..", "http://foo.com/");
normalizeTest("http://foo.com/aa/bb/cc/../../foo.html",
"http://foo.com/aa/foo.html");
normalizeTest("http://foo.com/aa/bb/../cc/dd/../ee/foo.html",
@@ -93,10 +93,12 @@ public class TestBasicURLNormalizer {
"http://foo.com/aa/bb/foo.html");
normalizeTest("http://foo.com////aa////bb////foo.html",
"http://foo.com/aa/bb/foo.html");
+ normalizeTest("http://foo.com/aa?referer=http://bar.com",
+ "http://foo.com/aa?referer=http://bar.com");
}
private void normalizeTest(String weird, String normal) throws Exception {
- Assert.assertEquals(normal,
+ Assert.assertEquals("normalizing: " + weird, normal,
normalizer.normalize(weird, URLNormalizers.SCOPE_DEFAULT));
}