You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by jn...@apache.org on 2011/01/05 17:50:57 UTC
svn commit: r1055520 - in /nutch/trunk: ./
src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/
src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/
Author: jnioche
Date: Wed Jan 5 16:50:57 2011
New Revision: 1055520
URL: http://svn.apache.org/viewvc?rev=1055520&view=rev
Log:
NUTCH-935 basicurlnormalizer removes unnecessary /./ in URLs
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
nutch/trunk/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1055520&r1=1055519&r2=1055520&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Wed Jan 5 16:50:57 2011
@@ -2,6 +2,8 @@ Nutch Change Log
Release 2.0 - Current Development
+* NUTCH-935 basicurlnormalizer removes unnecessary /./ in URLs
+
* NUTCH-912 MoreIndexingFilter does not parse docx and xlsx date formats (Markus Jelsma, jnioche)
* NUTCH-936 LanguageIdentifier should not set empty lang field on NutchDocument (Markus Jelsma via jnioche)
Modified: nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java?rev=1055520&r1=1055519&r2=1055520&view=diff
==============================================================================
--- nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java (original)
+++ nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java Wed Jan 5 16:50:57 2011
@@ -43,6 +43,7 @@ public class BasicURLNormalizer implemen
};
private Rule relativePathRule = null;
private Rule leadingRelativePathRule = null;
+ private Rule currentPathRule = null;
private Rule adjacentSlashRule = null;
private Configuration conf;
@@ -65,6 +66,13 @@ public class BasicURLNormalizer implemen
compiler.compile("^(/\\.\\./)+", Perl5Compiler.READ_ONLY_MASK);
leadingRelativePathRule.substitution = new Perl5Substitution("/");
+ // this pattern tries to find spots like "/./" in the url,
+ // which could be replaced by "/"
+ currentPathRule = new Rule();
+ currentPathRule.pattern = (Perl5Pattern)
+ compiler.compile("(/\\./)", Perl5Compiler.READ_ONLY_MASK);
+ currentPathRule.substitution = new Perl5Substitution("/");
+
// this pattern tries to find spots like "xx//yy" in the url,
// which could be replaced by a "/"
adjacentSlashRule = new Rule();
@@ -171,6 +179,11 @@ public class BasicURLNormalizer implemen
fileWorkCopy = Util.substitute
(matcher, leadingRelativePathRule.pattern,
leadingRelativePathRule.substitution, fileWorkCopy, 1);
+
+ // remove unnecessary "/./"
+ fileWorkCopy = Util.substitute
+ (matcher, currentPathRule.pattern,
+ currentPathRule.substitution, fileWorkCopy, 1);
// collapse adjacent slashes with "/"
Modified: nutch/trunk/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java?rev=1055520&r1=1055519&r2=1055520&view=diff
==============================================================================
--- nutch/trunk/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java (original)
+++ nutch/trunk/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java Wed Jan 5 16:50:57 2011
@@ -60,6 +60,9 @@ public class TestBasicURLNormalizer exte
// normalizeTest("http://foo.com/%66oo.html", "http://foo.com/foo.html");
// check that unnecessary "../" are removed
+
+ normalizeTest("http://foo.com/aa/./foo.html",
+ "http://foo.com/aa/foo.html" );
normalizeTest("http://foo.com/aa/../",
"http://foo.com/" );
normalizeTest("http://foo.com/aa/bb/../",
@@ -112,4 +115,5 @@ public class TestBasicURLNormalizer exte
-}
+
+}
\ No newline at end of file