You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by jn...@apache.org on 2011/01/05 17:50:57 UTC

svn commit: r1055520 - in /nutch/trunk: ./ src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/ src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/

Author: jnioche
Date: Wed Jan  5 16:50:57 2011
New Revision: 1055520

URL: http://svn.apache.org/viewvc?rev=1055520&view=rev
Log:
NUTCH-935 basicurlnormalizer removes unnecessary /./ in URLs

Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
    nutch/trunk/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1055520&r1=1055519&r2=1055520&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Wed Jan  5 16:50:57 2011
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Release 2.0 - Current Development
 
+* NUTCH-935 basicurlnormalizer removes unnecessary /./ in URLs
+
 * NUTCH-912 MoreIndexingFilter does not parse docx and xlsx date formats (Markus Jelsma, jnioche)
 
 * NUTCH-936 LanguageIdentifier should not set empty lang field on NutchDocument (Markus Jelsma via jnioche)

Modified: nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java?rev=1055520&r1=1055519&r2=1055520&view=diff
==============================================================================
--- nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java (original)
+++ nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java Wed Jan  5 16:50:57 2011
@@ -43,6 +43,7 @@ public class BasicURLNormalizer implemen
       };
     private Rule relativePathRule = null;
     private Rule leadingRelativePathRule = null;
+    private Rule currentPathRule = null;
     private Rule adjacentSlashRule = null;
 
     private Configuration conf;
@@ -65,6 +66,13 @@ public class BasicURLNormalizer implemen
           compiler.compile("^(/\\.\\./)+", Perl5Compiler.READ_ONLY_MASK);
         leadingRelativePathRule.substitution = new Perl5Substitution("/");
 
+        // this pattern tries to find spots like "/./" in the url,
+        // which could be replaced by "/"
+        currentPathRule = new Rule();
+        currentPathRule.pattern = (Perl5Pattern)
+          compiler.compile("(/\\./)", Perl5Compiler.READ_ONLY_MASK);
+        currentPathRule.substitution = new Perl5Substitution("/");
+
         // this pattern tries to find spots like "xx//yy" in the url,
         // which could be replaced by a "/"
         adjacentSlashRule = new Rule();
@@ -171,6 +179,11 @@ public class BasicURLNormalizer implemen
             fileWorkCopy = Util.substitute
               (matcher, leadingRelativePathRule.pattern,
                leadingRelativePathRule.substitution, fileWorkCopy, 1);
+
+            // remove unnecessary "/./"
+            fileWorkCopy = Util.substitute
+            (matcher, currentPathRule.pattern,
+            		currentPathRule.substitution, fileWorkCopy, 1);
             
             
             // collapse adjacent slashes with "/"

Modified: nutch/trunk/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java?rev=1055520&r1=1055519&r2=1055520&view=diff
==============================================================================
--- nutch/trunk/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java (original)
+++ nutch/trunk/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java Wed Jan  5 16:50:57 2011
@@ -60,6 +60,9 @@ public class TestBasicURLNormalizer exte
     //     normalizeTest("http://foo.com/%66oo.html", "http://foo.com/foo.html");
 
     // check that unnecessary "../" are removed
+    // (as well as unnecessary "/./" segments)
+    normalizeTest("http://foo.com/aa/./foo.html",
+                  "http://foo.com/aa/foo.html" );
     normalizeTest("http://foo.com/aa/../",
                   "http://foo.com/" );
     normalizeTest("http://foo.com/aa/bb/../",
@@ -112,4 +115,5 @@ public class TestBasicURLNormalizer exte
 
 
 
-}
+
+}
\ No newline at end of file