You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by sn...@apache.org on 2014/11/04 22:07:42 UTC

svn commit: r1636736 - in /nutch: branches/2.x/ branches/2.x/conf/ branches/2.x/src/java/org/apache/nutch/util/ branches/2.x/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/ branches/2.x/src/plugin/urlnormalizer-regex/sample/ branches/...

Author: snagel
Date: Tue Nov  4 21:07:41 2014
New Revision: 1636736

URL: http://svn.apache.org/r1636736
Log:
NUTCH-1483 (including NUTCH-1879, NUTCH-1880, NUTCH-1885) fix errors related to protocol-file

Modified:
    nutch/branches/2.x/CHANGES.txt
    nutch/branches/2.x/conf/nutch-default.xml
    nutch/branches/2.x/conf/regex-normalize.xml.template
    nutch/branches/2.x/src/java/org/apache/nutch/util/URLUtil.java
    nutch/branches/2.x/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
    nutch/branches/2.x/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.test
    nutch/branches/2.x/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.xml
    nutch/branches/2.x/src/test/org/apache/nutch/util/TestURLUtil.java
    nutch/trunk/CHANGES.txt
    nutch/trunk/conf/nutch-default.xml
    nutch/trunk/conf/regex-normalize.xml.template
    nutch/trunk/src/java/org/apache/nutch/util/URLUtil.java
    nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
    nutch/trunk/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.test
    nutch/trunk/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.xml
    nutch/trunk/src/test/org/apache/nutch/util/TestURLUtil.java

Modified: nutch/branches/2.x/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1636736&r1=1636735&r2=1636736&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Tue Nov  4 21:07:41 2014
@@ -2,6 +2,14 @@ Nutch Change Log
 
 Current Development 2.3-SNAPSHOT
 
+* NUTCH-1483 Can't crawl filesystem with protocol-file plugin (Rogério Pereira Araújo, Mengying Wang, snagel)
+
+* NUTCH-1885 Protocol-file should treat symbolic links as redirects (Mengying Wang, snagel)
+
+* NUTCH-1880 URLUtil should not add additional slashes for file URLs (snagel)
+
+* NUTCH-1879 Regex URL normalizer should remove multiple slashes after file: protocol (snagel)
+
 * NUTCH-1820 remove field "orig" which duplicates "id" (lewismc, snagel)
 
 * NUTCH-1843 Upgrade to Gora 0.5 (talat, lewismc, Kiril Menshikov, drazzib)

Modified: nutch/branches/2.x/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/conf/nutch-default.xml?rev=1636736&r1=1636735&r2=1636736&view=diff
==============================================================================
--- nutch/branches/2.x/conf/nutch-default.xml (original)
+++ nutch/branches/2.x/conf/nutch-default.xml Tue Nov  4 21:07:41 2014
@@ -46,6 +46,19 @@
 </property>
 
 <property>
+  <name>file.crawl.redirect_noncanonical</name>
+  <value>true</value>
+  <description>
+    If true, protocol-file treats non-canonical file names as
+    redirects and does not canonicalize file names internally. A file
+    name containing symbolic links as path elements is then not
+    resolved and &quot;fetched&quot; but recorded as redirect with the
+    canonical name (all links on path are resolved) as redirect
+    target.
+  </description>
+</property>
+
+<property>
   <name>file.content.ignored</name>
   <value>true</value>
   <description>If true, no file content will be saved during fetch.

Modified: nutch/branches/2.x/conf/regex-normalize.xml.template
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/conf/regex-normalize.xml.template?rev=1636736&r1=1636735&r2=1636736&view=diff
==============================================================================
--- nutch/branches/2.x/conf/regex-normalize.xml.template (original)
+++ nutch/branches/2.x/conf/regex-normalize.xml.template Tue Nov  4 21:07:41 2014
@@ -63,7 +63,15 @@
   <substitution></substitution>
 </regex>
 
-<!-- removes duplicate slashes -->
+<!-- normalize file:/// protocol prefix: -->
+<!--  keep one single slash (NUTCH-1483) -->
+<regex>
+  <pattern>^file://+</pattern>
+  <substitution>file:/</substitution>
+</regex>
+
+<!-- removes duplicate slashes but -->
+<!-- * allow 2 slashes after colon ':' (indicating protocol) -->
 <regex>
   <pattern>(?&lt;!:)/{2,}</pattern>
   <substitution>/</substitution>

Modified: nutch/branches/2.x/src/java/org/apache/nutch/util/URLUtil.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/util/URLUtil.java?rev=1636736&r1=1636735&r2=1636736&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/util/URLUtil.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/util/URLUtil.java Tue Nov  4 21:07:41 2014
@@ -383,9 +383,15 @@ public class URLUtil {
   public static String toASCII(String url) {
     try {
       URL u = new URL(url);
+      String host = u.getHost();
+      if (host == null || host.isEmpty()) {
+        // no host name => no punycoded domain name
+        // also do not add additional slashes for file: URLs (NUTCH-1880)
+        return url;
+      }
       URI p = new URI(u.getProtocol(),
         u.getUserInfo(),
-        IDN.toASCII(u.getHost()),
+        IDN.toASCII(host),
         u.getPort(),
         u.getPath(),
         u.getQuery(),
@@ -401,6 +407,12 @@ public class URLUtil {
   public static String toUNICODE(String url) {
     try {
       URL u = new URL(url);
+      String host = u.getHost();
+      if (host == null || host.isEmpty()) {
+        // no host name => no punycoded domain name
+        // also do not add additional slashes for file: URLs (NUTCH-1880)
+        return url;
+      }
       StringBuilder sb = new StringBuilder();
       sb.append(u.getProtocol());
       sb.append("://");
@@ -408,7 +420,7 @@ public class URLUtil {
         sb.append(u.getUserInfo());
         sb.append('@');
       }
-      sb.append(IDN.toUnicode(u.getHost()));
+      sb.append(IDN.toUnicode(host));
       if (u.getPort() != -1) {
         sb.append(':');
         sb.append(u.getPort());

Modified: nutch/branches/2.x/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java?rev=1636736&r1=1636735&r2=1636736&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java (original)
+++ nutch/branches/2.x/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java Tue Nov  4 21:07:41 2014
@@ -54,6 +54,12 @@ public class File implements Protocol {
   
   boolean crawlParents;
 
+  /**
+   * If true, return a redirect for symbolic links and do not resolve the links
+   * internally.
+   */
+  boolean symlinksAsRedirects = true;
+
   private Configuration conf;
 
   // constructor
@@ -66,6 +72,8 @@ public class File implements Protocol {
     this.conf = conf;
     this.maxContentLength = conf.getInt("file.content.limit", 64 * 1024);
     this.crawlParents = conf.getBoolean("file.crawl.parent", true);
+    this.symlinksAsRedirects = conf.getBoolean(
+        "file.crawl.redirect_noncanonical", true);
   }
   
   /**
@@ -116,13 +124,20 @@ public class File implements Protocol {
           return new ProtocolOutput(response.toContent(), ProtocolStatusUtils.STATUS_NOTFOUND);
 
         } else if (code >= 300 && code < 400) { // handle redirect
-          if (redirects == MAX_REDIRECTS)
-            throw new FileException("Too many redirects: " + url);
           u = new URL(response.getHeader("Location"));
-          redirects++;
           if (LOG.isTraceEnabled()) {
             LOG.trace("redirect to " + u);
           }
+          if (symlinksAsRedirects) {
+            return new ProtocolOutput(response.toContent(),
+                ProtocolStatusUtils.makeStatus(ProtocolStatusUtils.MOVED, u));
+          } else if (redirects == MAX_REDIRECTS) {
+            LOG.trace("Too many redirects: {}", url);
+            return new ProtocolOutput(response.toContent(),
+                ProtocolStatusUtils.makeStatus(
+                    ProtocolStatusUtils.REDIR_EXCEEDED, u));
+          }
+          redirects++;
 
         } else { // convert to exception
           throw new FileError(code);
@@ -174,14 +189,26 @@ public class File implements Protocol {
     if (maxContentLength != Integer.MIN_VALUE) // set maxContentLength
       file.setMaxContentLength(maxContentLength);
 
-    Content content = file.getProtocolOutput(urlString, WebPage.newBuilder().build())
-        .getContent();
-
+    ProtocolOutput output = file.getProtocolOutput(urlString, WebPage
+        .newBuilder().build());
+    Content content = output.getContent();
+
+    System.err.println("URL: " + content.getUrl());
+    ProtocolStatus status = output.getStatus();
+    String protocolMessage = ProtocolStatusUtils.getMessage(status);
+    System.err.println("Status: "
+        + ProtocolStatusUtils.getName(status.getCode())
+        + (protocolMessage == null ? "" : ": " + protocolMessage));
     System.out.println("Content-Type: " + content.getContentType());
     System.out.println("Content-Length: "
         + content.getMetadata().get(Response.CONTENT_LENGTH));
     System.out.println("Last-Modified: "
         + content.getMetadata().get(Response.LAST_MODIFIED));
+    String redirectLocation = content.getMetadata().get("Location");
+    if (redirectLocation != null) {
+      System.err.println("Location: " + redirectLocation);
+    }
+
     if (dumpContent) {
       System.out.print(new String(content.getContent()));
     }

Modified: nutch/branches/2.x/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.test
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.test?rev=1636736&r1=1636735&r2=1636736&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.test (original)
+++ nutch/branches/2.x/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.test Tue Nov  4 21:07:41 2014
@@ -75,3 +75,10 @@ http://www.foo.com/foo.html?&x=y http://
 http://www.foo.com/foo.html?x=y&&&z=a http://www.foo.com/foo.html?x=y&z=a
 http://www.foo.com/foo.html? http://www.foo.com/foo.html
 
+# remove double slashes but keep 2 slashes after protocol
+http://www.foo.com//path//foo.html http://www.foo.com/path/foo.html
+https://www.foo.com//path//foo.html https://www.foo.com/path/foo.html
+
+# normalize file: protocol prefix (keep one slash)
+file:///path//foo.html file:/path/foo.html
+file:/path//foo.html file:/path/foo.html

Modified: nutch/branches/2.x/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.xml
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.xml?rev=1636736&r1=1636735&r2=1636736&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.xml (original)
+++ nutch/branches/2.x/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.xml Tue Nov  4 21:07:41 2014
@@ -48,5 +48,19 @@
   <substitution></substitution>
 </regex>
 
+<!-- normalize file:/// protocol prefix: -->
+<!--  keep one single slash (NUTCH-1483) -->
+<regex>
+  <pattern>^file://+</pattern>
+  <substitution>file:/</substitution>
+</regex>
+
+<!-- removes duplicate slashes but -->
+<!-- * allow 2 slashes after colon ':' (indicating protocol) -->
+<regex>
+  <pattern>(?&lt;!:)/{2,}</pattern>
+  <substitution>/</substitution>
+</regex>
+
 </regex-normalize>
 

Modified: nutch/branches/2.x/src/test/org/apache/nutch/util/TestURLUtil.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/test/org/apache/nutch/util/TestURLUtil.java?rev=1636736&r1=1636735&r2=1636736&view=diff
==============================================================================
--- nutch/branches/2.x/src/test/org/apache/nutch/util/TestURLUtil.java (original)
+++ nutch/branches/2.x/src/test/org/apache/nutch/util/TestURLUtil.java Tue Nov  4 21:07:41 2014
@@ -277,4 +277,11 @@ public class TestURLUtil {
         URLUtil.toASCII("http://www.medizin.uni-tübingen.de:8080/search.php?q=abc#p1")); 
   }
 
+  @Test
+  public void testFileProtocol() throws Exception {
+    // keep one single slash (NUTCH-1880)
+    assertEquals("file:/path/file.html", URLUtil.toASCII("file:/path/file.html"));
+    assertEquals("file:/path/file.html", URLUtil.toUNICODE("file:/path/file.html"));
+  }
+
 }

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1636736&r1=1636735&r2=1636736&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Tue Nov  4 21:07:41 2014
@@ -2,6 +2,14 @@ Nutch Change Log
 
 Nutch Current Development 1.10-SNAPSHOT
 
+* NUTCH-1483 Can't crawl filesystem with protocol-file plugin (Rogério Pereira Araújo, Mengying Wang, snagel)
+
+* NUTCH-1885 Protocol-file should treat symbolic links as redirects (Mengying Wang, snagel)
+
+* NUTCH-1880 URLUtil should not add additional slashes for file URLs (snagel)
+
+* NUTCH-1879 Regex URL normalizer should remove multiple slashes after file: protocol (snagel)
+
 * NUTCH-1883 bin/crawl: use function to run bin/nutch and check exit value (snagel)
 
 * NUTCH-1865 Enable use of SNAPSHOT's with Nutch Ivy dependency management (lewismc)

Modified: nutch/trunk/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1636736&r1=1636735&r2=1636736&view=diff
==============================================================================
--- nutch/trunk/conf/nutch-default.xml (original)
+++ nutch/trunk/conf/nutch-default.xml Tue Nov  4 21:07:41 2014
@@ -56,6 +56,19 @@
 </property>
 
 <property>
+  <name>file.crawl.redirect_noncanonical</name>
+  <value>true</value>
+  <description>
+    If true, protocol-file treats non-canonical file names as
+    redirects and does not canonicalize file names internally. A file
+    name containing symbolic links as path elements is then not
+    resolved and &quot;fetched&quot; but recorded as redirect with the
+    canonical name (all links on path are resolved) as redirect
+    target.
+  </description>
+</property>
+
+<property>
   <name>file.content.ignored</name>
   <value>true</value>
   <description>If true, no file content will be saved during fetch.

Modified: nutch/trunk/conf/regex-normalize.xml.template
URL: http://svn.apache.org/viewvc/nutch/trunk/conf/regex-normalize.xml.template?rev=1636736&r1=1636735&r2=1636736&view=diff
==============================================================================
--- nutch/trunk/conf/regex-normalize.xml.template (original)
+++ nutch/trunk/conf/regex-normalize.xml.template Tue Nov  4 21:07:41 2014
@@ -63,7 +63,15 @@
   <substitution></substitution>
 </regex>
 
-<!-- removes duplicate slashes -->
+<!-- normalize file:/// protocol prefix: -->
+<!--  keep one single slash (NUTCH-1483) -->
+<regex>
+  <pattern>^file://+</pattern>
+  <substitution>file:/</substitution>
+</regex>
+
+<!-- removes duplicate slashes but -->
+<!-- * allow 2 slashes after colon ':' (indicating protocol) -->
 <regex>
   <pattern>(?&lt;!:)/{2,}</pattern>
   <substitution>/</substitution>

Modified: nutch/trunk/src/java/org/apache/nutch/util/URLUtil.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/URLUtil.java?rev=1636736&r1=1636735&r2=1636736&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/util/URLUtil.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/util/URLUtil.java Tue Nov  4 21:07:41 2014
@@ -425,9 +425,15 @@ public class URLUtil {
   public static String toASCII(String url) {
     try {
       URL u = new URL(url);
+      String host = u.getHost();
+      if (host == null || host.isEmpty()) {
+        // no host name => no punycoded domain name
+        // also do not add additional slashes for file: URLs (NUTCH-1880)
+        return url;
+      }
       URI p = new URI(u.getProtocol(),
         u.getUserInfo(),
-        IDN.toASCII(u.getHost()),
+        IDN.toASCII(host),
         u.getPort(),
         u.getPath(),
         u.getQuery(),
@@ -443,6 +449,12 @@ public class URLUtil {
   public static String toUNICODE(String url) {
     try {
       URL u = new URL(url);
+      String host = u.getHost();
+      if (host == null || host.isEmpty()) {
+        // no host name => no punycoded domain name
+        // also do not add additional slashes for file: URLs (NUTCH-1880)
+        return url;
+      }
       StringBuilder sb = new StringBuilder();
       sb.append(u.getProtocol());
       sb.append("://");
@@ -450,7 +462,7 @@ public class URLUtil {
         sb.append(u.getUserInfo());
         sb.append('@');
       }
-      sb.append(IDN.toUnicode(u.getHost()));
+      sb.append(IDN.toUnicode(host));
       if (u.getPort() != -1) {
         sb.append(':');
         sb.append(u.getPort());

Modified: nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java?rev=1636736&r1=1636735&r2=1636736&view=diff
==============================================================================
--- nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java (original)
+++ nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java Tue Nov  4 21:07:41 2014
@@ -53,6 +53,12 @@ public class File implements Protocol {
   int maxContentLength;
   boolean crawlParents;
 
+  /**
+   * If true, return a redirect for symbolic links and do not resolve the links
+   * internally.
+   */
+  boolean symlinksAsRedirects = true;
+
   private Configuration conf;
 
   public File() {}
@@ -64,6 +70,8 @@ public class File implements Protocol {
     this.conf = conf;
     this.maxContentLength = conf.getInt("file.content.limit", 64 * 1024);
     this.crawlParents = conf.getBoolean("file.crawl.parent", true);
+    this.symlinksAsRedirects = conf.getBoolean(
+        "file.crawl.redirect_noncanonical", true);
   }
 
   /**
@@ -115,13 +123,19 @@ public class File implements Protocol {
           return new ProtocolOutput(response.toContent(), ProtocolStatus.STATUS_NOTFOUND);
 
         } else if (code >= 300 && code < 400) {     // handle redirect
-          if (redirects == MAX_REDIRECTS)
-            throw new FileException("Too many redirects: " + url);
           u = new URL(response.getHeader("Location"));
-          redirects++;                
           if (LOG.isTraceEnabled()) {
             LOG.trace("redirect to " + u); 
           }
+          if (symlinksAsRedirects) {
+            return new ProtocolOutput(response.toContent(), new ProtocolStatus(
+                ProtocolStatus.MOVED, u));
+          } else if (redirects == MAX_REDIRECTS) {
+            LOG.trace("Too many redirects: {}", url);
+            return new ProtocolOutput(response.toContent(), new ProtocolStatus(
+                ProtocolStatus.REDIR_EXCEEDED, u));
+          }
+          redirects++;
   
         } else {                                    // convert to exception
           throw new FileError(code);
@@ -172,13 +186,21 @@ public class File implements Protocol {
     // set log level
     //LOG.setLevel(Level.parse((new String(logLevel)).toUpperCase()));
 
-    Content content = file.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
+    ProtocolOutput output = file.getProtocolOutput(new Text(urlString), new CrawlDatum());
+    Content content = output.getContent();
 
+    System.err.println("URL: " + content.getUrl());
+    System.err.println("Status: " + output.getStatus());
     System.err.println("Content-Type: " + content.getContentType());
     System.err.println("Content-Length: " +
                        content.getMetadata().get(Response.CONTENT_LENGTH));
     System.err.println("Last-Modified: " +
                        content.getMetadata().get(Response.LAST_MODIFIED));
+    String redirectLocation = content.getMetadata().get("Location");
+    if (redirectLocation != null) {
+      System.err.println("Location: " + redirectLocation);
+    }
+
     if (dumpContent) {
       System.out.print(new String(content.getContent()));
     }

Modified: nutch/trunk/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.test
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.test?rev=1636736&r1=1636735&r2=1636736&view=diff
==============================================================================
--- nutch/trunk/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.test (original)
+++ nutch/trunk/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.test Tue Nov  4 21:07:41 2014
@@ -75,3 +75,10 @@ http://www.foo.com/foo.html?&x=y http://
 http://www.foo.com/foo.html?x=y&&&z=a http://www.foo.com/foo.html?x=y&z=a
 http://www.foo.com/foo.html? http://www.foo.com/foo.html
 
+# remove double slashes but keep 2 slashes after protocol
+http://www.foo.com//path//foo.html http://www.foo.com/path/foo.html
+https://www.foo.com//path//foo.html https://www.foo.com/path/foo.html
+
+# normalize file: protocol prefix (keep one slash)
+file:///path//foo.html file:/path/foo.html
+file:/path//foo.html file:/path/foo.html

Modified: nutch/trunk/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.xml?rev=1636736&r1=1636735&r2=1636736&view=diff
==============================================================================
--- nutch/trunk/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.xml (original)
+++ nutch/trunk/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.xml Tue Nov  4 21:07:41 2014
@@ -48,5 +48,19 @@
   <substitution></substitution>
 </regex>
 
+<!-- normalize file:/// protocol prefix: -->
+<!--  keep one single slash (NUTCH-1483) -->
+<regex>
+  <pattern>^file://+</pattern>
+  <substitution>file:/</substitution>
+</regex>
+
+<!-- removes duplicate slashes but -->
+<!-- * allow 2 slashes after colon ':' (indicating protocol) -->
+<regex>
+  <pattern>(?&lt;!:)/{2,}</pattern>
+  <substitution>/</substitution>
+</regex>
+
 </regex-normalize>
 

Modified: nutch/trunk/src/test/org/apache/nutch/util/TestURLUtil.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/test/org/apache/nutch/util/TestURLUtil.java?rev=1636736&r1=1636735&r2=1636736&view=diff
==============================================================================
--- nutch/trunk/src/test/org/apache/nutch/util/TestURLUtil.java (original)
+++ nutch/trunk/src/test/org/apache/nutch/util/TestURLUtil.java Tue Nov  4 21:07:41 2014
@@ -275,4 +275,11 @@ public class TestURLUtil {
         URLUtil.toASCII("http://www.medizin.uni-tübingen.de:8080/search.php?q=abc#p1")); 
   }
 
+  @Test
+  public void testFileProtocol() throws Exception {
+    // keep one single slash (NUTCH-1880)
+    Assert.assertEquals("file:/path/file.html", URLUtil.toASCII("file:/path/file.html"));
+    Assert.assertEquals("file:/path/file.html", URLUtil.toUNICODE("file:/path/file.html"));
+  }
+
 }